/*
 * IR - Lightweight JIT Compilation Framework
 * (CFG - Control Flow Graph)
 * Copyright (C) 2022 Zend by Perforce.
 * Authors: Dmitry Stogov <dmitry@php.net>
 */

#include "ir.h"
#include "ir_private.h"

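/* Merge two basic blocks connected through an END->BEGIN pair: both nodes are
 * turned into NOPs and the control predecessor of END is linked directly to
 * the successor of BEGIN. Returns the reference of that successor. */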
static ir_ref _ir_merge_blocks(ir_ctx *ctx, ir_ref end, ir_ref begin)
{
	ir_ref prev, next;
	ir_use_list *use_list;
	ir_ref n, *p;

	IR_ASSERT(ctx->ir_base[begin].op == IR_BEGIN);
	IR_ASSERT(ctx->ir_base[end].op == IR_END);
	IR_ASSERT(ctx->ir_base[begin].op1 == end);
	IR_ASSERT(ctx->use_lists[end].count == 1);

	prev = ctx->ir_base[end].op1;

	use_list = &ctx->use_lists[begin];
	IR_ASSERT(use_list->count == 1);
	next = ctx->use_edges[use_list->refs];

	/* remove BEGIN and END */
	ctx->ir_base[begin].op = IR_NOP;
	ctx->ir_base[begin].op1 = IR_UNUSED;
	ctx->use_lists[begin].count = 0;
	ctx->ir_base[end].op = IR_NOP;
	ctx->ir_base[end].op1 = IR_UNUSED;
	ctx->use_lists[end].count = 0;

	/* connect their predecessor and successor */
	ctx->ir_base[next].op1 = prev;
	use_list = &ctx->use_lists[prev];
	n = use_list->count;
	for (p = &ctx->use_edges[use_list->refs]; n > 0; p++, n--) {
		if (*p == end) {
			*p = next;
		}
	}

	return next;
}

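/* Push all control-flow successors of node "ref" onto the worklist. The
 * single- and two-successor cases are unrolled, as they are the most common. */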
IR_ALWAYS_INLINE void _ir_add_successors(const ir_ctx *ctx, ir_ref ref, ir_worklist *worklist)
{
	ir_use_list *use_list = &ctx->use_lists[ref];
	ir_ref *p, use, n = use_list->count;

	if (n < 2) {
		if (n == 1) {
			use = ctx->use_edges[use_list->refs];
			IR_ASSERT(ir_op_flags[ctx->ir_base[use].op] & IR_OP_FLAG_CONTROL);
			ir_worklist_push(worklist, use);
		}
	} else {
		p = &ctx->use_edges[use_list->refs];
		if (n == 2) {
			use = *p;
			IR_ASSERT(ir_op_flags[ctx->ir_base[use].op] & IR_OP_FLAG_CONTROL);
			ir_worklist_push(worklist, use);
			use = *(p + 1);
			IR_ASSERT(ir_op_flags[ctx->ir_base[use].op] & IR_OP_FLAG_CONTROL);
			ir_worklist_push(worklist, use);
		} else {
			for (; n > 0; p++, n--) {
				use = *p;
				IR_ASSERT(ir_op_flags[ctx->ir_base[use].op] & IR_OP_FLAG_CONTROL);
				ir_worklist_push(worklist, use);
			}
		}
	}
}

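/* Push all control-flow predecessors of "insn" onto the worklist: every input
 * of a MERGE/LOOP_BEGIN node, or just op1 of any other node except START. */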
IR_ALWAYS_INLINE void _ir_add_predecessors(const ir_insn *insn, ir_worklist *worklist)
{
	ir_ref n, ref;
	const ir_ref *p;

	if (insn->op == IR_MERGE || insn->op == IR_LOOP_BEGIN) {
		n = insn->inputs_count;
		for (p = insn->ops + 1; n > 0; p++, n--) {
			ref = *p;
			IR_ASSERT(ref);
			ir_worklist_push(worklist, ref);
		}
	} else if (insn->op != IR_START) {
		if (EXPECTED(insn->op1)) {
			ir_worklist_push(worklist, insn->op1);
		}
	}
}

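/* Build the CFG for the IR graph: basic-block boundaries are discovered by a
 * backward DFS from the "stop" nodes (a forward DFS afterwards picks up blocks
 * that are only reachable through infinite loops), then the block array and
 * the successor/predecessor edge lists are filled in.
 *
 * The CFG passes in this file build on each other and are typically run in
 * this order (a sketch; the exact pass ordering depends on the driver):
 *
 *     ir_build_cfg(ctx);             // block boundaries and edges
 *     ir_build_dominators_tree(ctx); // idom/dom_depth/dom_child
 *     ir_find_loops(ctx);            // loop headers and loop depths
 *     ir_schedule_blocks(ctx);       // block layout
 */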
int ir_build_cfg(ir_ctx *ctx)
{
	ir_ref n, *p, ref, start, end, next;
	uint32_t b;
	ir_insn *insn;
	ir_worklist worklist;
	uint32_t bb_init_flags;
	uint32_t count, bb_count = 0;
	uint32_t edges_count = 0;
	ir_block *blocks, *bb;
	uint32_t *_blocks, *edges;
	ir_use_list *use_list;
	uint32_t len = ir_bitset_len(ctx->insns_count);
	ir_bitset bb_starts = ir_mem_calloc(len * 2, IR_BITSET_BITS / 8);
	ir_bitset bb_leaks = bb_starts + len;
	_blocks = ir_mem_calloc(ctx->insns_count, sizeof(uint32_t));
	ir_worklist_init(&worklist, ctx->insns_count);

	/* First, try a backward DFS starting from the "stop" nodes */

	/* Add all "stop" nodes */
	ref = ctx->ir_base[1].op1;
	while (ref) {
		ir_worklist_push(&worklist, ref);
		ref = ctx->ir_base[ref].op3;
	}

	while (ir_worklist_len(&worklist)) {
		ref = ir_worklist_pop(&worklist);
		insn = &ctx->ir_base[ref];

		IR_ASSERT(IR_IS_BB_END(insn->op));
		/* Remember BB end */
		end = ref;
		/* Some successors of IF and SWITCH nodes may be unreachable by the backward DFS */
		use_list = &ctx->use_lists[end];
		n = use_list->count;
		if (n > 1) {
			for (p = &ctx->use_edges[use_list->refs]; n > 0; p++, n--) {
				/* Remember possibly unreachable successors */
				ir_bitset_incl(bb_leaks, *p);
			}
		}
		/* Skip control nodes until BB start */
		ref = insn->op1;
		while (1) {
			insn = &ctx->ir_base[ref];
			if (IR_IS_BB_START(insn->op)) {
				if (insn->op == IR_BEGIN
				 && (ctx->flags & IR_OPT_CFG)
				 && ctx->ir_base[insn->op1].op == IR_END
				 && ctx->use_lists[ref].count == 1) {
					ref = _ir_merge_blocks(ctx, insn->op1, ref);
					ref = ctx->ir_base[ref].op1;
					continue;
				}
				break;
			}
			ref = insn->op1; /* follow connected control nodes until BB start */
		}
		/* Mark BB Start */
		bb_count++;
		_blocks[ref] = end;
		ir_bitset_incl(bb_starts, ref);
		/* Add predecessors */
		_ir_add_predecessors(insn, &worklist);
	}

	/* The backward DFS may miss branches that end in an infinite loop. */
	/* Try a forward DFS (in most cases all nodes have already been processed). */

	/* The START node may be unreachable from the "stop" nodes */
	ir_bitset_incl(bb_leaks, 1);

	/* Add the unprocessed START node and successors of IF and SWITCH nodes */
	IR_BITSET_FOREACH_DIFFERENCE(bb_leaks, bb_starts, len, start) {
		ir_worklist_push(&worklist, start);
	} IR_BITSET_FOREACH_END();

	if (ir_worklist_len(&worklist)) {
		ir_bitset_union(worklist.visited, bb_starts, len);
		do {
			ref = ir_worklist_pop(&worklist);
			insn = &ctx->ir_base[ref];

			IR_ASSERT(IR_IS_BB_START(insn->op));
			/* Remember BB start */
			start = ref;
			/* Skip control nodes until BB end */
			while (1) {
				use_list = &ctx->use_lists[ref];
				n = use_list->count;
				next = IR_UNUSED;
				for (p = &ctx->use_edges[use_list->refs]; n > 0; p++, n--) {
					next = *p;
					insn = &ctx->ir_base[next];
					if ((ir_op_flags[insn->op] & IR_OP_FLAG_CONTROL) && insn->op1 == ref) {
						break;
					}
				}
				IR_ASSERT(next != IR_UNUSED);
				ref = next;
next_successor:
				if (IR_IS_BB_END(insn->op)) {
					if (insn->op == IR_END && (ctx->flags & IR_OPT_CFG)) {
						use_list = &ctx->use_lists[ref];
						IR_ASSERT(use_list->count == 1);
						next = ctx->use_edges[use_list->refs];

						if (ctx->ir_base[next].op == IR_BEGIN
						 && ctx->use_lists[next].count == 1) {
							ref = _ir_merge_blocks(ctx, ref, next);
							insn = &ctx->ir_base[ref];
							goto next_successor;
						}
					}
					break;
				}
			}
			/* Mark BB Start */
			bb_count++;
			_blocks[start] = ref;
			ir_bitset_incl(bb_starts, start);
			/* Add successors */
			_ir_add_successors(ctx, ref, &worklist);
		} while (ir_worklist_len(&worklist));
	}

	IR_ASSERT(bb_count > 0);

	/* Create the array of basic blocks and count the successor/predecessor edges for each BB */
	blocks = ir_mem_malloc((bb_count + 1) * sizeof(ir_block));
	b = 1;
	bb = blocks + 1;
	count = 0;
	/* SCCP has already removed UNREACHABLE blocks, otherwise all blocks are initially marked as UNREACHABLE */
	bb_init_flags = (ctx->flags2 & IR_SCCP_DONE) ? 0 : IR_BB_UNREACHABLE;
	IR_BITSET_FOREACH(bb_starts, len, start) {
		end = _blocks[start];
		_blocks[start] = b;
		_blocks[end] = b;
		insn = &ctx->ir_base[start];
		IR_ASSERT(IR_IS_BB_START(insn->op));
		IR_ASSERT(end > start);
		bb->start = start;
		bb->end = end;
		bb->successors = count;
		count += ctx->use_lists[end].count;
		bb->successors_count = 0;
		bb->predecessors = count;
		bb->dom_parent = 0;
		bb->dom_depth = 0;
		bb->dom_child = 0;
		bb->dom_next_child = 0;
		bb->loop_header = 0;
		bb->loop_depth = 0;
		if (insn->op == IR_START) {
			bb->flags = IR_BB_START;
			bb->predecessors_count = 0;
		} else {
			bb->flags = bb_init_flags;
			if (insn->op == IR_MERGE || insn->op == IR_LOOP_BEGIN) {
				n = insn->inputs_count;
				bb->predecessors_count = n;
				edges_count += n;
				count += n;
			} else if (EXPECTED(insn->op1)) {
				if (insn->op == IR_ENTRY) {
					bb->flags |= IR_BB_ENTRY;
					ctx->entries_count++;
				}
				bb->predecessors_count = 1;
				edges_count++;
				count++;
			} else {
				IR_ASSERT(insn->op == IR_BEGIN); /* start of unreachable block */
				bb->predecessors_count = 0;
			}
		}
		b++;
		bb++;
	} IR_BITSET_FOREACH_END();
	IR_ASSERT(count == edges_count * 2);
	ir_mem_free(bb_starts);

	/* Create the array of successor/predecessor control edges */
	edges = ir_mem_malloc(edges_count * 2 * sizeof(uint32_t));
	bb = blocks + 1;
	for (b = 1; b <= bb_count; b++, bb++) {
		insn = &ctx->ir_base[bb->start];
		if (bb->predecessors_count > 1) {
			uint32_t *q = edges + bb->predecessors;
			n = insn->inputs_count;
			for (p = insn->ops + 1; n > 0; p++, q++, n--) {
				ref = *p;
				IR_ASSERT(ref);
				ir_ref pred_b = _blocks[ref];
				ir_block *pred_bb = &blocks[pred_b];
				*q = pred_b;
				edges[pred_bb->successors + pred_bb->successors_count++] = b;
			}
		} else if (bb->predecessors_count == 1) {
			ref = insn->op1;
			IR_ASSERT(ref);
			IR_ASSERT(IR_OPND_KIND(ir_op_flags[insn->op], 1) == IR_OPND_CONTROL);
			ir_ref pred_b = _blocks[ref];
			ir_block *pred_bb = &blocks[pred_b];
			edges[bb->predecessors] = pred_b;
			edges[pred_bb->successors + pred_bb->successors_count++] = b;
		}
	}

	ctx->cfg_blocks_count = bb_count;
	ctx->cfg_edges_count = edges_count * 2;
	ctx->cfg_blocks = blocks;
	ctx->cfg_edges = edges;
	ctx->cfg_map = _blocks;

	if (!(ctx->flags2 & IR_SCCP_DONE)) {
		uint32_t reachable_count = 0;

		/* Mark reachable blocks */
		ir_worklist_clear(&worklist);
		ir_worklist_push(&worklist, 1);
		while (ir_worklist_len(&worklist) != 0) {
			uint32_t *p;

			reachable_count++;
			b = ir_worklist_pop(&worklist);
			bb = &blocks[b];
			bb->flags &= ~IR_BB_UNREACHABLE;
			n = bb->successors_count;
			if (n > 1) {
				for (p = edges + bb->successors; n > 0; p++, n--) {
					ir_worklist_push(&worklist, *p);
				}
			} else if (n == 1) {
				ir_worklist_push(&worklist, edges[bb->successors]);
			}
		}
		if (reachable_count != ctx->cfg_blocks_count) {
			ir_remove_unreachable_blocks(ctx);
		}
	}

	ir_worklist_free(&worklist);

	return 1;
}

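/* Remove the edge coming from block "from" from the predecessor list of "bb",
 * compacting the remaining entries in place. */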
static void ir_remove_predecessor(ir_ctx *ctx, ir_block *bb, uint32_t from)
{
	uint32_t i, *p, *q, n = 0;

	p = q = &ctx->cfg_edges[bb->predecessors];
	for (i = 0; i < bb->predecessors_count; i++, p++) {
		if (*p != from) {
			if (p != q) {
				*q = *p;
			}
			q++;
			n++;
		}
	}
	IR_ASSERT(n != bb->predecessors_count);
	bb->predecessors_count = n;
}

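/* Remove all occurrences of "ref" from the use list of "from", compacting the
 * remaining entries in place. */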
static void ir_remove_from_use_list(ir_ctx *ctx, ir_ref from, ir_ref ref)
{
	ir_ref j, n, *p, *q, use;
	ir_use_list *use_list = &ctx->use_lists[from];
	ir_ref skip = 0;

	n = use_list->count;
	for (j = 0, p = q = &ctx->use_edges[use_list->refs]; j < n; j++, p++) {
		use = *p;
		if (use == ref) {
			skip++;
		} else {
			if (p != q) {
				*q = use;
			}
			q++;
		}
	}
	use_list->count -= skip;
}

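/* Remove the control input "from" from a MERGE/LOOP_BEGIN node and update the
 * dependent PHIs accordingly. If only one input remains, the MERGE is degraded
 * to a BEGIN and its PHIs become COPYs. */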
static void ir_remove_merge_input(ir_ctx *ctx, ir_ref merge, ir_ref from)
{
	ir_ref i, j, n, k, *p, use;
	ir_insn *use_insn;
	ir_use_list *use_list;
	ir_bitset life_inputs;
	ir_insn *insn = &ctx->ir_base[merge];

	IR_ASSERT(insn->op == IR_MERGE || insn->op == IR_LOOP_BEGIN);
	n = insn->inputs_count;
	i = 1;
	life_inputs = ir_bitset_malloc(n + 1);
	for (j = 1; j <= n; j++) {
		ir_ref input = ir_insn_op(insn, j);

		if (input != from) {
			if (i != j) {
				ir_insn_set_op(insn, i, input);
			}
			ir_bitset_incl(life_inputs, j);
			i++;
		}
	}
	i--;
	if (i == 1) {
		insn->op = IR_BEGIN;
		insn->inputs_count = 0;
		use_list = &ctx->use_lists[merge];
		if (use_list->count > 1) {
			for (k = 0, p = &ctx->use_edges[use_list->refs]; k < use_list->count; k++, p++) {
				use = *p;
				use_insn = &ctx->ir_base[use];
				if (use_insn->op == IR_PHI) {
					/* Convert PHI to COPY */
					i = 2;
					for (j = 2; j <= n + 1; j++) { /* PHI data operand j corresponds to MERGE input j - 1 */
						ir_ref input = ir_insn_op(use_insn, j);

						if (ir_bitset_in(life_inputs, j - 1)) {
							use_insn->op1 = input;
						} else if (input > 0) {
							ir_remove_from_use_list(ctx, input, use);
						}
					}
					use_insn->op = IR_COPY;
					use_insn->op2 = IR_UNUSED;
					use_insn->op3 = IR_UNUSED;
					ir_remove_from_use_list(ctx, merge, use);
				}
			}
		}
	} else {
		insn->inputs_count = i;

		n++;
		use_list = &ctx->use_lists[merge];
		if (use_list->count > 1) {
			for (k = 0, p = &ctx->use_edges[use_list->refs]; k < use_list->count; k++, p++) {
				use = *p;
				use_insn = &ctx->ir_base[use];
				if (use_insn->op == IR_PHI) {
					i = 2;
					for (j = 2; j <= n; j++) {
						ir_ref input = ir_insn_op(use_insn, j);

						if (ir_bitset_in(life_inputs, j - 1)) {
							IR_ASSERT(input);
							if (i != j) {
								ir_insn_set_op(use_insn, i, input);
							}
							i++;
						} else if (input > 0) {
							ir_remove_from_use_list(ctx, input, use);
						}
					}
				}
			}
		}
	}
	ir_mem_free(life_inputs);
	ir_remove_from_use_list(ctx, from, merge);
}

/* A CFG constructed after the SCCP pass has no unreachable BBs, otherwise they have to be removed */
int ir_remove_unreachable_blocks(ir_ctx *ctx)
{
	uint32_t b, *p, i;
	uint32_t unreachable_count = 0;
	uint32_t bb_count = ctx->cfg_blocks_count;
	ir_block *bb = ctx->cfg_blocks + 1;

	for (b = 1; b <= bb_count; b++, bb++) {
		if (bb->flags & IR_BB_UNREACHABLE) {
#if 0
			if (!unreachable_count) {
				ir_dump_cfg(ctx, stderr);
			}
#endif
			if (bb->successors_count) {
				for (i = 0, p = &ctx->cfg_edges[bb->successors]; i < bb->successors_count; i++, p++) {
					ir_block *succ_bb = &ctx->cfg_blocks[*p];

					if (!(succ_bb->flags & IR_BB_UNREACHABLE)) {
						ir_remove_predecessor(ctx, succ_bb, b);
						ir_remove_merge_input(ctx, succ_bb->start, bb->end);
					}
				}
			} else {
				ir_ref prev, ref = bb->end;
				ir_insn *insn = &ctx->ir_base[ref];

				IR_ASSERT(ir_op_flags[insn->op] & IR_OP_FLAG_TERMINATOR);
				/* remove from the terminators list */
				prev = ctx->ir_base[1].op1;
				if (prev == ref) {
					ctx->ir_base[1].op1 = insn->op3;
				} else {
					while (prev) {
						if (ctx->ir_base[prev].op3 == ref) {
							ctx->ir_base[prev].op3 = insn->op3;
							break;
						}
						prev = ctx->ir_base[prev].op3;
					}
				}
			}
			ctx->cfg_map[bb->start] = 0;
			ctx->cfg_map[bb->end] = 0;
			unreachable_count++;
		}
	}

	if (unreachable_count) {
		ir_block *dst_bb;
		uint32_t n = 1;
		uint32_t *edges;

		dst_bb = bb = ctx->cfg_blocks + 1;
		for (b = 1; b <= bb_count; b++, bb++) {
			if (!(bb->flags & IR_BB_UNREACHABLE)) {
				if (dst_bb != bb) {
					memcpy(dst_bb, bb, sizeof(ir_block));
					ctx->cfg_map[dst_bb->start] = n;
					ctx->cfg_map[dst_bb->end] = n;
				}
				dst_bb->successors_count = 0;
				dst_bb++;
				n++;
			}
		}
		ctx->cfg_blocks_count = bb_count = n - 1;

		/* Rebuild successor/predecessor control edges */
		edges = ctx->cfg_edges;
		bb = ctx->cfg_blocks + 1;
		for (b = 1; b <= bb_count; b++, bb++) {
			ir_insn *insn = &ctx->ir_base[bb->start];
			ir_ref *p, ref;

			n = bb->predecessors_count;
			if (n > 1) {
				uint32_t *q = edges + bb->predecessors;

				IR_ASSERT(n == insn->inputs_count);
				for (p = insn->ops + 1; n > 0; p++, q++, n--) {
					ref = *p;
					IR_ASSERT(ref);
					ir_ref pred_b = ctx->cfg_map[ref];
					ir_block *pred_bb = &ctx->cfg_blocks[pred_b];
					*q = pred_b;
					edges[pred_bb->successors + pred_bb->successors_count++] = b;
				}
			} else if (n == 1) {
				ref = insn->op1;
				IR_ASSERT(ref);
				IR_ASSERT(IR_OPND_KIND(ir_op_flags[insn->op], 1) == IR_OPND_CONTROL);
				ir_ref pred_b = ctx->cfg_map[ref];
				ir_block *pred_bb = &ctx->cfg_blocks[pred_b];
				edges[bb->predecessors] = pred_b;
				edges[pred_bb->successors + pred_bb->successors_count++] = b;
			}
		}
	}

	return 1;
}

#if 0
static void compute_postnum(const ir_ctx *ctx, uint32_t *cur, uint32_t b)
{
	uint32_t i, *p;
	ir_block *bb = &ctx->cfg_blocks[b];

	if (bb->postnum != 0) {
		return;
	}

	if (bb->successors_count) {
		bb->postnum = -1; /* marker for "currently visiting" */
		p = ctx->cfg_edges + bb->successors;
		i = bb->successors_count;
		do {
			compute_postnum(ctx, cur, *p);
			p++;
		} while (--i);
	}
	bb->postnum = (*cur)++;
}

/* Computes the dominator tree using the algorithm from "A Simple, Fast Dominance Algorithm" by
 * Cooper, Harvey and Kennedy. */
int ir_build_dominators_tree(ir_ctx *ctx)
{
	uint32_t blocks_count, b, postnum;
	ir_block *blocks, *bb;
	uint32_t *edges;
	bool changed;

	ctx->flags2 &= ~IR_NO_LOOPS;

	postnum = 1;
	compute_postnum(ctx, &postnum, 1);

	/* Find immediate dominators */
	blocks = ctx->cfg_blocks;
	edges = ctx->cfg_edges;
	blocks_count = ctx->cfg_blocks_count;
	blocks[1].idom = 1;
	do {
		changed = 0;
		/* Iterating in Reverse Post Order */
		for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) {
			IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
			if (bb->predecessors_count == 1) {
				uint32_t pred_b = edges[bb->predecessors];

				IR_ASSERT(blocks[pred_b].idom > 0);
				if (bb->idom != pred_b) {
					bb->idom = pred_b;
					changed = 1;
				}
			} else if (bb->predecessors_count) {
				uint32_t idom = 0;
				uint32_t k = bb->predecessors_count;
				uint32_t *p = edges + bb->predecessors;

				do {
					uint32_t pred_b = *p;
					ir_block *pred_bb = &blocks[pred_b];
					ir_block *idom_bb;

					if (pred_bb->idom > 0) {
						idom = pred_b;
						idom_bb = &blocks[idom];

						while (--k > 0) {
							pred_b = *(++p);
							pred_bb = &blocks[pred_b];
							if (pred_bb->idom > 0) {
								while (idom != pred_b) {
									while (pred_bb->postnum < idom_bb->postnum) {
										pred_b = pred_bb->idom;
										pred_bb = &blocks[pred_b];
									}
									while (idom_bb->postnum < pred_bb->postnum) {
										idom = idom_bb->idom;
										idom_bb = &blocks[idom];
									}
								}
							}
						}

						if (bb->idom != idom) {
							bb->idom = idom;
							changed = 1;
						}
						break;
					}
					p++;
				} while (--k > 0);
			}
		}
	} while (changed);
	blocks[1].idom = 0;
	blocks[1].dom_depth = 0;

	/* Construct the dominator tree */
	for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) {
		IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
		if (bb->idom > 0) {
			ir_block *idom_bb = &blocks[bb->idom];

			bb->dom_depth = idom_bb->dom_depth + 1;
			/* Sort by block number to traverse children in pre-order */
			if (idom_bb->dom_child == 0) {
				idom_bb->dom_child = b;
			} else if (b < idom_bb->dom_child) {
				bb->dom_next_child = idom_bb->dom_child;
				idom_bb->dom_child = b;
			} else {
				int child = idom_bb->dom_child;
				ir_block *child_bb = &blocks[child];

				while (child_bb->dom_next_child > 0 && b > child_bb->dom_next_child) {
					child = child_bb->dom_next_child;
					child_bb = &blocks[child];
				}
				bb->dom_next_child = child_bb->dom_next_child;
				child_bb->dom_next_child = b;
			}
		}
	}

	return 1;
}
#else
/* A single-pass modification of "A Simple, Fast Dominance Algorithm" by
 * Cooper, Harvey and Kennedy, that relies on the IR block ordering */
int ir_build_dominators_tree(ir_ctx *ctx)
{
	uint32_t blocks_count, b;
	ir_block *blocks, *bb;
	uint32_t *edges;

	ctx->flags2 |= IR_NO_LOOPS;

	/* Find immediate dominators */
	blocks = ctx->cfg_blocks;
	edges = ctx->cfg_edges;
	blocks_count = ctx->cfg_blocks_count;
	blocks[1].idom = 1;
	blocks[1].dom_depth = 0;

	/* Iterating in Reverse Post Order */
	for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) {
		IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
		IR_ASSERT(bb->predecessors_count > 0);
		uint32_t k = bb->predecessors_count;
		uint32_t *p = edges + bb->predecessors;
		uint32_t idom = *p;
		ir_block *idom_bb;

		if (UNEXPECTED(idom > b)) {
			/* In rare cases, LOOP_BEGIN.op1 may be a back-edge. Skip back-edges. */
			ctx->flags2 &= ~IR_NO_LOOPS;
			while (1) {
				k--;
				p++;
				idom = *p;
				if (idom < b) {
					break;
				}
				IR_ASSERT(k > 0);
			}
		}
		IR_ASSERT(blocks[idom].idom > 0);

		while (--k > 0) {
			uint32_t pred_b = *(++p);

			if (pred_b < b) {
				IR_ASSERT(blocks[pred_b].idom > 0);
				while (idom != pred_b) {
					while (pred_b > idom) {
						pred_b = blocks[pred_b].idom;
					}
					while (idom > pred_b) {
						idom = blocks[idom].idom;
					}
				}
			} else {
				ctx->flags2 &= ~IR_NO_LOOPS;
			}
		}
		bb->idom = idom;
		idom_bb = &blocks[idom];

		bb->dom_depth = idom_bb->dom_depth + 1;
		/* Sort by block number to traverse children in pre-order */
		if (idom_bb->dom_child == 0) {
			idom_bb->dom_child = b;
		} else if (b < idom_bb->dom_child) {
			bb->dom_next_child = idom_bb->dom_child;
			idom_bb->dom_child = b;
		} else {
			int child = idom_bb->dom_child;
			ir_block *child_bb = &blocks[child];

			while (child_bb->dom_next_child > 0 && b > child_bb->dom_next_child) {
				child = child_bb->dom_next_child;
				child_bb = &blocks[child];
			}
			bb->dom_next_child = child_bb->dom_next_child;
			child_bb->dom_next_child = b;
		}
	}

	blocks[1].idom = 0;

	return 1;
}
#endif

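/* Check whether block "b1" dominates block "b2" by walking up the dominator
 * tree from "b2" to the depth of "b1". */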
static bool ir_dominates(const ir_block *blocks, uint32_t b1, uint32_t b2)
{
	uint32_t b1_depth = blocks[b1].dom_depth;
	const ir_block *bb2 = &blocks[b2];

	while (bb2->dom_depth > b1_depth) {
		b2 = bb2->dom_parent;
		bb2 = &blocks[b2];
	}
	return b1 == b2;
}

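/* Identify natural loops using DJ graphs (Sreedhar, Gao and Lee): a DFS over
 * the dominator tree extended with join edges assigns entry/exit times that
 * answer ancestor queries; blocks are then processed from the deepest
 * dominator-tree level upwards, and each back-join edge whose target
 * dominates its source marks a loop header; finally, loop depths are
 * propagated to the loop members. */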
int ir_find_loops(ir_ctx *ctx)
{
	uint32_t i, j, n, count;
	uint32_t *entry_times, *exit_times, *sorted_blocks, time = 1;
	ir_block *blocks = ctx->cfg_blocks;
	uint32_t *edges = ctx->cfg_edges;
	ir_worklist work;

	if (ctx->flags2 & IR_NO_LOOPS) {
		return 1;
	}

	/* We don't materialize the DJ spanning tree explicitly, as we are only interested in ancestor
	 * queries. These are implemented by checking entry/exit times of the DFS search. */
	ir_worklist_init(&work, ctx->cfg_blocks_count + 1);
	entry_times = ir_mem_malloc((ctx->cfg_blocks_count + 1) * 3 * sizeof(uint32_t));
	exit_times = entry_times + ctx->cfg_blocks_count + 1;
	sorted_blocks = exit_times + ctx->cfg_blocks_count + 1;

	memset(entry_times, 0, (ctx->cfg_blocks_count + 1) * sizeof(uint32_t));

	ir_worklist_push(&work, 1);
	while (ir_worklist_len(&work)) {
		ir_block *bb;
		int child;

next:
		i = ir_worklist_peek(&work);
		if (!entry_times[i]) {
			entry_times[i] = time++;
		}

		/* Visit blocks immediately dominated by i. */
		bb = &blocks[i];
		for (child = bb->dom_child; child > 0; child = blocks[child].dom_next_child) {
			if (ir_worklist_push(&work, child)) {
				goto next;
			}
		}

		/* Visit join edges. */
		if (bb->successors_count) {
			uint32_t *p = edges + bb->successors;
			for (j = 0; j < bb->successors_count; j++, p++) {
				uint32_t succ = *p;

				if (blocks[succ].idom == i) {
					continue;
				} else if (ir_worklist_push(&work, succ)) {
					goto next;
				}
			}
		}
		exit_times[i] = time++;
		ir_worklist_pop(&work);
	}

	/* Sort blocks by dominator-tree level, which is the opposite of the order in which we want to process them */
	sorted_blocks[1] = 1;
	j = 1;
	n = 2;
	while (j != n) {
		i = j;
		j = n;
		for (; i < j; i++) {
			int child;
			for (child = blocks[sorted_blocks[i]].dom_child; child > 0; child = blocks[child].dom_next_child) {
				sorted_blocks[n++] = child;
			}
		}
	}
	count = n;

	/* Identify loops. See Sreedhar et al, "Identifying Loops Using DJ Graphs". */
	while (n > 1) {
		i = sorted_blocks[--n];
		ir_block *bb = &blocks[i];

		if (bb->predecessors_count > 1) {
			bool irreducible = 0;
			uint32_t *p = &edges[bb->predecessors];

			j = bb->predecessors_count;
			do {
				uint32_t pred = *p;

				/* A join edge is one for which the predecessor does not
				   immediately dominate the successor. */
				if (bb->idom != pred) {
					/* In a loop back-edge (back-join edge), the successor dominates
					   the predecessor. */
					if (ir_dominates(blocks, i, pred)) {
						if (!ir_worklist_len(&work)) {
							ir_bitset_clear(work.visited, ir_bitset_len(ir_worklist_capasity(&work)));
						}
						blocks[pred].loop_header = 0; /* support for merged loops */
						ir_worklist_push(&work, pred);
					} else {
						/* Otherwise it's a cross-join edge. See if it's a branch
						   to an ancestor on the DJ spanning tree. */
						if (entry_times[pred] > entry_times[i] && exit_times[pred] < exit_times[i]) {
							irreducible = 1;
						}
					}
				}
				p++;
			} while (--j);

			if (UNEXPECTED(irreducible)) {
				/* TODO: Support for irreducible loops ??? */
				bb->flags |= IR_BB_IRREDUCIBLE_LOOP;
				ctx->flags2 |= IR_IRREDUCIBLE_CFG;
				while (ir_worklist_len(&work)) {
					ir_worklist_pop(&work);
				}
			} else if (ir_worklist_len(&work)) {
				bb->flags |= IR_BB_LOOP_HEADER;
				ctx->flags2 |= IR_CFG_HAS_LOOPS;
				bb->loop_depth = 1;
				while (ir_worklist_len(&work)) {
					j = ir_worklist_pop(&work);
					while (blocks[j].loop_header > 0) {
						j = blocks[j].loop_header;
					}
					if (j != i) {
						ir_block *bb = &blocks[j];
						if (bb->idom == 0 && j != 1) {
							/* Ignore blocks that are unreachable or only abnormally reachable. */
							continue;
						}
						bb->loop_header = i;
						if (bb->predecessors_count) {
							uint32_t *p = &edges[bb->predecessors];
							j = bb->predecessors_count;
							do {
								ir_worklist_push(&work, *p);
								p++;
							} while (--j);
						}
					}
				}
			}
		}
	}

	if (ctx->flags2 & IR_CFG_HAS_LOOPS) {
		for (n = 1; n < count; n++) {
			i = sorted_blocks[n];
			ir_block *bb = &blocks[i];
			if (bb->loop_header > 0) {
				ir_block *loop = &blocks[bb->loop_header];
				uint32_t loop_depth = loop->loop_depth;

				if (bb->flags & IR_BB_LOOP_HEADER) {
					loop_depth++;
				}
				bb->loop_depth = loop_depth;
				if (bb->flags & (IR_BB_ENTRY|IR_BB_LOOP_WITH_ENTRY)) {
					loop->flags |= IR_BB_LOOP_WITH_ENTRY;
				}
			}
		}
	}

	ir_mem_free(entry_times);
	ir_worklist_free(&work);

	return 1;
}

/* A variation of the "Top-down Positioning" algorithm described by
 * Karl Pettis and Robert C. Hansen in "Profile Guided Code Positioning"
 *
 * TODO: Switch to the "Bottom-up Positioning" algorithm
 */
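/* Greedily grow "traces": pick an unscheduled block, then keep appending its
 * most promising unscheduled successor (deeper loop nesting wins first, then
 * higher branch probability), so that hot paths become fall-through code. */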
int ir_schedule_blocks(ir_ctx *ctx)
{
	ir_bitqueue blocks;
	uint32_t b, best_successor, j, last_non_empty;
	ir_block *bb, *best_successor_bb;
	ir_insn *insn;
	uint32_t *list, *map;
	uint32_t count = 0;
	bool reorder = 0;

	ir_bitqueue_init(&blocks, ctx->cfg_blocks_count + 1);
	blocks.pos = 0;
	list = ir_mem_malloc(sizeof(uint32_t) * (ctx->cfg_blocks_count + 1) * 2);
	map = list + (ctx->cfg_blocks_count + 1);
	for (b = 1; b <= ctx->cfg_blocks_count; b++) {
		ir_bitset_incl(blocks.set, b);
	}

	while ((b = ir_bitqueue_pop(&blocks)) != (uint32_t)-1) {
		bb = &ctx->cfg_blocks[b];
		/* Start trace */
		last_non_empty = 0;
		do {
			if (UNEXPECTED(bb->flags & IR_BB_PREV_EMPTY_ENTRY) && ir_bitqueue_in(&blocks, b - 1)) {
				/* Schedule the previous empty ENTRY block before this one */
				uint32_t predecessor = b - 1;

				ir_bitqueue_del(&blocks, predecessor);
				count++;
				list[count] = predecessor;
				map[predecessor] = count;
				if (predecessor != count) {
					reorder = 1;
				}
			}
			count++;
			list[count] = b;
			map[b] = count;
			if (b != count) {
				reorder = 1;
			}
			if (!(bb->flags & IR_BB_EMPTY)) {
				last_non_empty = b;
			}
			best_successor_bb = NULL;
			if (bb->successors_count == 1) {
				best_successor = ctx->cfg_edges[bb->successors];
				if (ir_bitqueue_in(&blocks, best_successor)) {
					best_successor_bb = &ctx->cfg_blocks[best_successor];
				}
			} else if (bb->successors_count > 1) {
				uint32_t prob, best_successor_prob;
				uint32_t *p, successor;
				ir_block *successor_bb;

				for (b = 0, p = &ctx->cfg_edges[bb->successors]; b < bb->successors_count; b++, p++) {
					successor = *p;
					if (ir_bitqueue_in(&blocks, successor)) {
						successor_bb = &ctx->cfg_blocks[successor];
						insn = &ctx->ir_base[successor_bb->start];
						if (insn->op == IR_IF_TRUE || insn->op == IR_IF_FALSE) {
							prob = insn->op2;
							if (!prob) {
								prob = 100 / bb->successors_count;
								if (!(successor_bb->flags & IR_BB_EMPTY)) {
									prob++;
								}
							}
						} else if (insn->op == IR_CASE_DEFAULT) {
							prob = insn->op2;
							if (!prob) {
								prob = 100 / bb->successors_count;
							}
						} else if (insn->op == IR_CASE_VAL) {
							prob = insn->op3;
							if (!prob) {
								prob = 100 / bb->successors_count;
							}
						} else if (insn->op == IR_ENTRY) {
							if ((ctx->flags & IR_MERGE_EMPTY_ENTRIES) && (successor_bb->flags & IR_BB_EMPTY)) {
								prob = 99; /* prefer empty ENTRY block to go first */
							} else {
								prob = 1;
							}
						} else {
							prob = 100 / bb->successors_count;
						}
						if (!best_successor_bb
						 || successor_bb->loop_depth > best_successor_bb->loop_depth
						 || prob > best_successor_prob) {
							best_successor = successor;
							best_successor_bb = successor_bb;
							best_successor_prob = prob;
						}
					}
				}
			}
			if (!best_successor_bb) {
				/* Try to continue the trace using the other successor of the last IF */
				if ((bb->flags & IR_BB_EMPTY) && last_non_empty) {
					bb = &ctx->cfg_blocks[last_non_empty];
					if (bb->successors_count == 2 && ctx->ir_base[bb->end].op == IR_IF) {
						b = ctx->cfg_edges[bb->successors];

						if (!ir_bitqueue_in(&blocks, b)) {
							b = ctx->cfg_edges[bb->successors + 1];
						}
						if (ir_bitqueue_in(&blocks, b)) {
							bb = &ctx->cfg_blocks[b];
							ir_bitqueue_del(&blocks, b);
							continue;
						}
					}
				}
				/* End trace */
				break;
			}
			b = best_successor;
			bb = best_successor_bb;
			ir_bitqueue_del(&blocks, b);
		} while (1);
	}

	if (reorder) {
		ir_block *cfg_blocks = ir_mem_malloc(sizeof(ir_block) * (ctx->cfg_blocks_count + 1));

		/* zero the unused block 0 of the new array */
		memset(cfg_blocks, 0, sizeof(ir_block));
		for (b = 1, bb = cfg_blocks + 1; b <= count; b++, bb++) {
			*bb = ctx->cfg_blocks[list[b]];
			if (bb->dom_parent > 0) {
				bb->dom_parent = map[bb->dom_parent];
			}
			if (bb->dom_child > 0) {
				bb->dom_child = map[bb->dom_child];
			}
			if (bb->dom_next_child > 0) {
				bb->dom_next_child = map[bb->dom_next_child];
			}
			if (bb->loop_header > 0) {
				bb->loop_header = map[bb->loop_header];
			}
		}
		for (j = 0; j < ctx->cfg_edges_count; j++) {
			if (ctx->cfg_edges[j] > 0) {
				ctx->cfg_edges[j] = map[ctx->cfg_edges[j]];
			}
		}
		ir_mem_free(ctx->cfg_blocks);
		ctx->cfg_blocks = cfg_blocks;

		if (ctx->osr_entry_loads) {
			ir_list *list = (ir_list*)ctx->osr_entry_loads;
			uint32_t pos = 0, count;

			while (1) {
				b = ir_list_at(list, pos);
				if (b == 0) {
					break;
				}
				ir_list_set(list, pos, map[b]);
				pos++;
				count = ir_list_at(list, pos);
				pos += count + 1;
			}
		}

		if (ctx->cfg_map) {
			ir_ref i;

			for (i = IR_UNUSED + 1; i < ctx->insns_count; i++) {
				ctx->cfg_map[i] = map[ctx->cfg_map[i]];
			}
		}
	}

	ir_mem_free(list);
	ir_bitqueue_free(&blocks);

	return 1;
}

/* JMP target optimisation */
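/* Follow the successor chain through empty blocks (that are neither START nor
 * ENTRY blocks), so a JMP may directly target the first non-empty block. */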
uint32_t ir_skip_empty_target_blocks(const ir_ctx *ctx, uint32_t b)
{
	ir_block *bb;

	while (1) {
		bb = &ctx->cfg_blocks[b];

		if ((bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) == IR_BB_EMPTY) {
			b = ctx->cfg_edges[bb->successors];
		} else {
			break;
		}
	}
	return b;
}

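/* Like ir_skip_empty_target_blocks(), but skips forward in block order;
 * returns 0 when the end of the block list is reached. */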
uint32_t ir_skip_empty_next_blocks(const ir_ctx *ctx, uint32_t b)
{
	ir_block *bb;

	while (1) {
		if (b > ctx->cfg_blocks_count) {
			return 0;
		}

		bb = &ctx->cfg_blocks[b];

		if ((bb->flags & (IR_BB_START|IR_BB_EMPTY)) == IR_BB_EMPTY) {
			b++;
		} else {
			break;
		}
	}
	return b;
}

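/* Compute the effective targets of an IF: the blocks reached on the true and
 * false edges (with empty blocks skipped) and the next block in code order
 * (0 if the IF ends the last block). */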
void ir_get_true_false_blocks(const ir_ctx *ctx, uint32_t b, uint32_t *true_block, uint32_t *false_block, uint32_t *next_block)
{
	ir_block *bb;
	uint32_t *p, use_block;

	*true_block = 0;
	*false_block = 0;
	bb = &ctx->cfg_blocks[b];
	IR_ASSERT(ctx->ir_base[bb->end].op == IR_IF);
	IR_ASSERT(bb->successors_count == 2);
	p = &ctx->cfg_edges[bb->successors];
	use_block = *p;
	if (ctx->ir_base[ctx->cfg_blocks[use_block].start].op == IR_IF_TRUE) {
		*true_block = ir_skip_empty_target_blocks(ctx, use_block);
		use_block = *(p + 1);
		IR_ASSERT(ctx->ir_base[ctx->cfg_blocks[use_block].start].op == IR_IF_FALSE);
		*false_block = ir_skip_empty_target_blocks(ctx, use_block);
	} else {
		IR_ASSERT(ctx->ir_base[ctx->cfg_blocks[use_block].start].op == IR_IF_FALSE);
		*false_block = ir_skip_empty_target_blocks(ctx, use_block);
		use_block = *(p + 1);
		IR_ASSERT(ctx->ir_base[ctx->cfg_blocks[use_block].start].op == IR_IF_TRUE);
		*true_block = ir_skip_empty_target_blocks(ctx, use_block);
	}
	IR_ASSERT(*true_block && *false_block);
	*next_block = b == ctx->cfg_blocks_count ? 0 : ir_skip_empty_next_blocks(ctx, b + 1);
}