1 /*
2 * IR - Lightweight JIT Compilation Framework
3 * (CFG - Control Flow Graph)
4 * Copyright (C) 2022 Zend by Perforce.
5 * Authors: Dmitry Stogov <dmitry@php.net>
6 */
7
8 #include "ir.h"
9 #include "ir_private.h"
10
_ir_add_successors(const ir_ctx * ctx,ir_ref ref,ir_worklist * worklist)11 IR_ALWAYS_INLINE void _ir_add_successors(const ir_ctx *ctx, ir_ref ref, ir_worklist *worklist)
12 {
13 ir_use_list *use_list = &ctx->use_lists[ref];
14 ir_ref *p, use, n = use_list->count;
15
16 if (n < 2) {
17 if (n == 1) {
18 use = ctx->use_edges[use_list->refs];
19 IR_ASSERT(ir_op_flags[ctx->ir_base[use].op] & IR_OP_FLAG_CONTROL);
20 ir_worklist_push(worklist, use);
21 }
22 } else {
23 p = &ctx->use_edges[use_list->refs];
24 if (n == 2) {
25 use = *p;
26 IR_ASSERT(ir_op_flags[ctx->ir_base[use].op] & IR_OP_FLAG_CONTROL);
27 ir_worklist_push(worklist, use);
28 use = *(p + 1);
29 IR_ASSERT(ir_op_flags[ctx->ir_base[use].op] & IR_OP_FLAG_CONTROL);
30 ir_worklist_push(worklist, use);
31 } else {
32 for (; n > 0; p++, n--) {
33 use = *p;
34 IR_ASSERT(ir_op_flags[ctx->ir_base[use].op] & IR_OP_FLAG_CONTROL);
35 ir_worklist_push(worklist, use);
36 }
37 }
38 }
39 }
40
_ir_add_predecessors(const ir_insn * insn,ir_worklist * worklist)41 IR_ALWAYS_INLINE void _ir_add_predecessors(const ir_insn *insn, ir_worklist *worklist)
42 {
43 ir_ref n, ref;
44 const ir_ref *p;
45
46 if (insn->op == IR_MERGE || insn->op == IR_LOOP_BEGIN) {
47 n = insn->inputs_count;
48 for (p = insn->ops + 1; n > 0; p++, n--) {
49 ref = *p;
50 IR_ASSERT(ref);
51 ir_worklist_push(worklist, ref);
52 }
53 } else if (insn->op != IR_START) {
54 if (EXPECTED(insn->op1)) {
55 ir_worklist_push(worklist, insn->op1);
56 }
57 }
58 }
59
/* Build the control flow graph for the IR in "ctx".
 *
 * Steps:
 *   1. Backward DFS from all "stop" nodes (linked via op3 from ir_base[1].op1)
 *      finds basic-block boundaries.
 *   2. A forward DFS catches blocks the backward walk cannot reach
 *      (e.g. branches ending in infinite loops, IF/SWITCH successors).
 *   3. The block array and successor/predecessor edge array are built.
 *   4. Unless SCCP already proved reachability (IR_CFG_REACHABLE),
 *      reachable blocks are marked and unreachable ones removed.
 *
 * On success ctx->cfg_blocks, ctx->cfg_edges and ctx->cfg_map are populated
 * and 1 is returned. */
int ir_build_cfg(ir_ctx *ctx)
{
	ir_ref n, *p, ref, start, end, next;
	uint32_t b;
	ir_insn *insn;
	ir_worklist worklist;
	uint32_t bb_init_flags;
	uint32_t count, bb_count = 0;
	uint32_t edges_count = 0;
	ir_block *blocks, *bb;
	uint32_t *_blocks, *edges;
	ir_use_list *use_list;
	uint32_t len = ir_bitset_len(ctx->insns_count);
	ir_bitset bb_starts = ir_mem_calloc(len * 2, IR_BITSET_BITS / 8);
	ir_bitset bb_leaks = bb_starts + len;
	_blocks = ir_mem_calloc(ctx->insns_count, sizeof(uint32_t));
	ir_worklist_init(&worklist, ctx->insns_count);

	/* First try to perform a backward DFS search starting from "stop" nodes */

	/* Add all "stop" nodes (they are chained through op3) */
	ref = ctx->ir_base[1].op1;
	while (ref) {
		ir_worklist_push(&worklist, ref);
		ref = ctx->ir_base[ref].op3;
	}

	while (ir_worklist_len(&worklist)) {
		ref = ir_worklist_pop(&worklist);
		insn = &ctx->ir_base[ref];

		if (insn->op == IR_NOP) {
			continue;
		}
		IR_ASSERT(IR_IS_BB_END(insn->op));
		/* Remember BB end */
		end = ref;
		/* Some successors of IF and SWITCH nodes may be inaccessible by backward DFS */
		use_list = &ctx->use_lists[end];
		n = use_list->count;
		if (n > 1) {
			for (p = &ctx->use_edges[use_list->refs]; n > 0; p++, n--) {
				/* Remember possibly inaccessible successors */
				ir_bitset_incl(bb_leaks, *p);
			}
		}
		/* Skip control nodes until BB start */
		ref = insn->op1;
		while (1) {
			insn = &ctx->ir_base[ref];
			if (IR_IS_BB_START(insn->op)) {
				break;
			}
			ref = insn->op1; /* follow the chain of control nodes until BB start */
		}
		/* Mark BB Start */
		bb_count++;
		_blocks[ref] = end; /* temporarily map BB start -> BB end */
		ir_bitset_incl(bb_starts, ref);
		/* Add predecessors */
		_ir_add_predecessors(insn, &worklist);
	}

	/* The backward DFS may miss branches that end in infinite loops. */
	/* Try a forward DFS (in most cases all nodes are already processed). */

	/* The START node may be inaccessible from "stop" nodes */
	ir_bitset_incl(bb_leaks, 1);

	/* Add the not yet processed START and successors of IF and SWITCH */
	IR_BITSET_FOREACH_DIFFERENCE(bb_leaks, bb_starts, len, start) {
		ir_worklist_push(&worklist, start);
	} IR_BITSET_FOREACH_END();

	if (ir_worklist_len(&worklist)) {
		ir_bitset_union(worklist.visited, bb_starts, len);
		do {
			ref = ir_worklist_pop(&worklist);
			insn = &ctx->ir_base[ref];

			if (insn->op == IR_NOP) {
				continue;
			}
			IR_ASSERT(IR_IS_BB_START(insn->op));
			/* Remember BB start */
			start = ref;
			/* Skip control nodes until BB end */
			while (1) {
				use_list = &ctx->use_lists[ref];
				n = use_list->count;
				next = IR_UNUSED;
				/* Find the single control user continuing this block */
				for (p = &ctx->use_edges[use_list->refs]; n > 0; p++, n--) {
					next = *p;
					insn = &ctx->ir_base[next];
					if ((ir_op_flags[insn->op] & IR_OP_FLAG_CONTROL) && insn->op1 == ref) {
						break;
					}
				}
				IR_ASSERT(next != IR_UNUSED);
				ref = next;
				if (IR_IS_BB_END(insn->op)) {
					break;
				}
			}
			/* Mark BB Start */
			bb_count++;
			_blocks[start] = ref;
			ir_bitset_incl(bb_starts, start);
			/* Add successors */
			_ir_add_successors(ctx, ref, &worklist);
		} while (ir_worklist_len(&worklist));
	}

	IR_ASSERT(bb_count > 0);

	/* Create array of basic blocks and count successor/predecessor edges for each BB */
	blocks = ir_mem_malloc((bb_count + 1) * sizeof(ir_block));
	b = 1;
	bb = blocks + 1;
	count = 0;
	/* SCCP already removed UNREACHABLE blocks, otherwise all blocks are marked as UNREACHABLE first */
	bb_init_flags = (ctx->flags2 & IR_CFG_REACHABLE) ? 0 : IR_BB_UNREACHABLE;
	IR_BITSET_FOREACH(bb_starts, len, start) {
		insn = &ctx->ir_base[start];
		if (insn->op == IR_NOP) {
			_blocks[start] = 0;
			continue;
		}
		end = _blocks[start];
		/* From now on _blocks maps instruction -> block number */
		_blocks[start] = b;
		_blocks[end] = b;
		IR_ASSERT(IR_IS_BB_START(insn->op));
		IR_ASSERT(end > start);
		bb->start = start;
		bb->end = end;
		bb->successors = count;
		count += ctx->use_lists[end].count;
		bb->successors_count = 0;
		bb->predecessors = count;
		bb->dom_parent = 0;
		bb->dom_depth = 0;
		bb->dom_child = 0;
		bb->dom_next_child = 0;
		bb->loop_header = 0;
		bb->loop_depth = 0;
		if (insn->op == IR_START) {
			bb->flags = IR_BB_START;
			bb->predecessors_count = 0;
		} else {
			bb->flags = bb_init_flags;
			if (insn->op == IR_MERGE || insn->op == IR_LOOP_BEGIN) {
				n = insn->inputs_count;
				bb->predecessors_count = n;
				edges_count += n;
				count += n;
			} else if (EXPECTED(insn->op1)) {
				if (insn->op == IR_ENTRY) {
					bb->flags |= IR_BB_ENTRY;
					ctx->entries_count++;
				}
				bb->predecessors_count = 1;
				edges_count++;
				count++;
			} else {
				IR_ASSERT(insn->op == IR_BEGIN); /* start of unreachable block */
				bb->predecessors_count = 0;
			}
		}
		b++;
		bb++;
	} IR_BITSET_FOREACH_END();
	bb_count = b - 1;
	IR_ASSERT(count == edges_count * 2);
	ir_mem_free(bb_starts);

	/* Create an array of successor/predecessor control edges */
	edges = ir_mem_malloc(edges_count * 2 * sizeof(uint32_t));
	bb = blocks + 1;
	for (b = 1; b <= bb_count; b++, bb++) {
		insn = &ctx->ir_base[bb->start];
		if (bb->predecessors_count > 1) {
			uint32_t *q = edges + bb->predecessors;
			n = insn->inputs_count;
			for (p = insn->ops + 1; n > 0; p++, q++, n--) {
				ref = *p;
				IR_ASSERT(ref);
				ir_ref pred_b = _blocks[ref];
				ir_block *pred_bb = &blocks[pred_b];
				*q = pred_b;
				edges[pred_bb->successors + pred_bb->successors_count++] = b;
			}
		} else if (bb->predecessors_count == 1) {
			ref = insn->op1;
			IR_ASSERT(ref);
			IR_ASSERT(IR_OPND_KIND(ir_op_flags[insn->op], 1) == IR_OPND_CONTROL);
			ir_ref pred_b = _blocks[ref];
			ir_block *pred_bb = &blocks[pred_b];
			edges[bb->predecessors] = pred_b;
			edges[pred_bb->successors + pred_bb->successors_count++] = b;
		}
	}

	ctx->cfg_blocks_count = bb_count;
	ctx->cfg_edges_count = edges_count * 2;
	ctx->cfg_blocks = blocks;
	ctx->cfg_edges = edges;
	ctx->cfg_map = _blocks;

	if (!(ctx->flags2 & IR_CFG_REACHABLE)) {
		uint32_t reachable_count = 0;

		/* Mark reachable blocks by a forward DFS over the new CFG edges */
		ir_worklist_clear(&worklist);
		ir_worklist_push(&worklist, 1);
		while (ir_worklist_len(&worklist) != 0) {
			uint32_t *p;

			reachable_count++;
			b = ir_worklist_pop(&worklist);
			bb = &blocks[b];
			bb->flags &= ~IR_BB_UNREACHABLE;
			n = bb->successors_count;
			if (n > 1) {
				for (p = edges + bb->successors; n > 0; p++, n--) {
					ir_worklist_push(&worklist, *p);
				}
			} else if (n == 1) {
				ir_worklist_push(&worklist, edges[bb->successors]);
			}
		}
		if (reachable_count != ctx->cfg_blocks_count) {
			ir_remove_unreachable_blocks(ctx);
		}
	}

	ir_worklist_free(&worklist);

	return 1;
}
299
/* Remove block "from" from the predecessor edge list of "bb", compacting
 * the remaining edges in place. At least one edge must match "from". */
static void ir_remove_predecessor(ir_ctx *ctx, ir_block *bb, uint32_t from)
{
	uint32_t *edge = &ctx->cfg_edges[bb->predecessors];
	uint32_t *out = edge;
	uint32_t kept = 0;
	uint32_t i;

	for (i = 0; i < bb->predecessors_count; i++, edge++) {
		if (*edge != from) {
			if (out != edge) {
				*out = *edge;
			}
			out++;
			kept++;
		}
	}
	IR_ASSERT(kept != bb->predecessors_count);
	bb->predecessors_count = kept;
}
317
/* Remove the control input "from" from a MERGE/LOOP_BEGIN node "merge" and
 * fix up all dependent PHI nodes.
 *
 * Surviving MERGE inputs are compacted in place. If only one input remains,
 * the MERGE degenerates into a BEGIN and every dependent PHI becomes a COPY
 * of its single surviving operand. Otherwise the PHI operand lists are
 * compacted to match the surviving inputs. Use-lists of dropped operands
 * and of "from" itself are updated accordingly. */
static void ir_remove_merge_input(ir_ctx *ctx, ir_ref merge, ir_ref from)
{
	ir_ref i, j, n, k, *p, use;
	ir_insn *use_insn;
	ir_use_list *use_list;
	ir_bitset life_inputs;
	ir_insn *insn = &ctx->ir_base[merge];

	IR_ASSERT(insn->op == IR_MERGE || insn->op == IR_LOOP_BEGIN);
	n = insn->inputs_count;
	i = 1;
	life_inputs = ir_bitset_malloc(n + 1);
	for (j = 1; j <= n; j++) {
		ir_ref input = ir_insn_op(insn, j);

		if (input != from) {
			/* Keep this input; shift it down if an earlier one was dropped */
			if (i != j) {
				ir_insn_set_op(insn, i, input);
			}
			/* Bit j marks merge input j as surviving */
			ir_bitset_incl(life_inputs, j);
			i++;
		}
	}
	i--; /* i is now the number of surviving inputs */
	if (i == 1) {
		/* Only one input left: degrade the MERGE to a plain BEGIN */
		insn->op = IR_BEGIN;
		insn->inputs_count = 1;
		use_list = &ctx->use_lists[merge];
		if (use_list->count > 1) {
			for (k = 0, p = &ctx->use_edges[use_list->refs]; k < use_list->count; k++, p++) {
				use = *p;
				use_insn = &ctx->ir_base[use];
				if (use_insn->op == IR_PHI) {
					/* Convert PHI to COPY */
					i = 2;
					for (j = 2; j <= n; j++) {
						ir_ref input = ir_insn_op(use_insn, j);

						/* PHI operand j corresponds to merge input j-1
						 * (PHI op1 references the MERGE itself) */
						if (ir_bitset_in(life_inputs, j - 1)) {
							use_insn->op1 = ir_insn_op(use_insn, j);
						} else if (input > 0) {
							ir_use_list_remove_all(ctx, input, use);
						}
					}
					use_insn->op = IR_COPY;
					use_insn->op2 = IR_UNUSED;
					use_insn->op3 = IR_UNUSED;
					ir_use_list_remove_all(ctx, merge, use);
				}
			}
		}
	} else {
		insn->inputs_count = i;

		/* PHIs have one operand more than the MERGE, so extend the range */
		n++;
		use_list = &ctx->use_lists[merge];
		if (use_list->count > 1) {
			for (k = 0, p = &ctx->use_edges[use_list->refs]; k < use_list->count; k++, p++) {
				use = *p;
				use_insn = &ctx->ir_base[use];
				if (use_insn->op == IR_PHI) {
					/* Compact PHI operands to match the surviving inputs */
					i = 2;
					for (j = 2; j <= n; j++) {
						ir_ref input = ir_insn_op(use_insn, j);

						if (ir_bitset_in(life_inputs, j - 1)) {
							IR_ASSERT(input);
							if (i != j) {
								ir_insn_set_op(use_insn, i, input);
							}
							i++;
						} else if (input > 0) {
							ir_use_list_remove_all(ctx, input, use);
						}
					}
				}
			}
		}
	}
	ir_mem_free(life_inputs);
	ir_use_list_remove_all(ctx, from, merge);
}
400
/* A CFG constructed after the SCCP pass has no unreachable BBs; otherwise they must be removed */
/* Remove all basic blocks marked IR_BB_UNREACHABLE from the CFG:
 *   - detach them from reachable successors (predecessor lists and
 *     MERGE inputs), or unlink their terminators from the terminators
 *     list when they have no successors;
 *   - compact the block array and renumber ctx->cfg_map;
 *   - rebuild all successor/predecessor control edges.
 * Always returns 1. */
int ir_remove_unreachable_blocks(ir_ctx *ctx)
{
	uint32_t b, *p, i;
	uint32_t unreachable_count = 0;
	uint32_t bb_count = ctx->cfg_blocks_count;
	ir_block *bb = ctx->cfg_blocks + 1;

	for (b = 1; b <= bb_count; b++, bb++) {
		if (bb->flags & IR_BB_UNREACHABLE) {
#if 0
			do {if (!unreachable_count) ir_dump_cfg(ctx, stderr);} while(0);
#endif
			if (bb->successors_count) {
				/* Detach this block from every reachable successor */
				for (i = 0, p = &ctx->cfg_edges[bb->successors]; i < bb->successors_count; i++, p++) {
					ir_block *succ_bb = &ctx->cfg_blocks[*p];

					if (!(succ_bb->flags & IR_BB_UNREACHABLE)) {
						ir_remove_predecessor(ctx, succ_bb, b);
						ir_remove_merge_input(ctx, succ_bb->start, bb->end);
					}
				}
			} else {
				ir_ref prev, ref = bb->end;
				ir_insn *insn = &ctx->ir_base[ref];

				IR_ASSERT(ir_op_flags[insn->op] & IR_OP_FLAG_TERMINATOR);
				/* remove from terminators list (chained through op3 from ir_base[1].op1) */
				prev = ctx->ir_base[1].op1;
				if (prev == ref) {
					ctx->ir_base[1].op1 = insn->op3;
				} else {
					while (prev) {
						if (ctx->ir_base[prev].op3 == ref) {
							ctx->ir_base[prev].op3 = insn->op3;
							break;
						}
						prev = ctx->ir_base[prev].op3;
					}
				}
			}
			ctx->cfg_map[bb->start] = 0;
			ctx->cfg_map[bb->end] = 0;
			unreachable_count++;
		}
	}

	if (unreachable_count) {
		ir_block *dst_bb;
		uint32_t n = 1;
		uint32_t *edges;

		/* Compact the block array, renumbering the surviving blocks */
		dst_bb = bb = ctx->cfg_blocks + 1;
		for (b = 1; b <= bb_count; b++, bb++) {
			if (!(bb->flags & IR_BB_UNREACHABLE)) {
				if (dst_bb != bb) {
					memcpy(dst_bb, bb, sizeof(ir_block));
					ctx->cfg_map[dst_bb->start] = n;
					ctx->cfg_map[dst_bb->end] = n;
				}
				dst_bb->successors_count = 0; /* recomputed below */
				dst_bb++;
				n++;
			}
		}
		ctx->cfg_blocks_count = bb_count = n - 1;

		/* Rebuild successor/predecessors control edges */
		edges = ctx->cfg_edges;
		bb = ctx->cfg_blocks + 1;
		for (b = 1; b <= bb_count; b++, bb++) {
			ir_insn *insn = &ctx->ir_base[bb->start];
			ir_ref *p, ref;

			n = bb->predecessors_count;
			if (n > 1) {
				uint32_t *q = edges + bb->predecessors;

				IR_ASSERT(n == insn->inputs_count);
				for (p = insn->ops + 1; n > 0; p++, q++, n--) {
					ref = *p;
					IR_ASSERT(ref);
					ir_ref pred_b = ctx->cfg_map[ref];
					ir_block *pred_bb = &ctx->cfg_blocks[pred_b];
					*q = pred_b;
					edges[pred_bb->successors + pred_bb->successors_count++] = b;
				}
			} else if (n == 1) {
				ref = insn->op1;
				IR_ASSERT(ref);
				IR_ASSERT(IR_OPND_KIND(ir_op_flags[insn->op], 1) == IR_OPND_CONTROL);
				ir_ref pred_b = ctx->cfg_map[ref];
				ir_block *pred_bb = &ctx->cfg_blocks[pred_b];
				edges[bb->predecessors] = pred_b;
				edges[pred_bb->successors + pred_bb->successors_count++] = b;
			}
		}
	}

	return 1;
}
502
503 #if 0
/* (Disabled reference code, see the enclosing #if 0.)
 * Recursively assign DFS post-order numbers to blocks reachable from "b".
 * "cur" holds the next number to hand out. */
static void compute_postnum(const ir_ctx *ctx, uint32_t *cur, uint32_t b)
{
	uint32_t i, *p;
	ir_block *bb = &ctx->cfg_blocks[b];

	if (bb->postnum != 0) {
		/* already numbered, or currently on the DFS stack */
		return;
	}

	if (bb->successors_count) {
		bb->postnum = -1; /* Marker for "currently visiting" */
		p = ctx->cfg_edges + bb->successors;
		i = bb->successors_count;
		do {
			compute_postnum(ctx, cur, *p);
			p++;
		} while (--i);
	}
	bb->postnum = (*cur)++;
}
524
/* Computes dominator tree using algorithm from "A Simple, Fast Dominance Algorithm" by
 * Cooper, Harvey and Kennedy.
 * (Disabled reference implementation kept for documentation purposes; the
 * active single-pass variant lives in the #else branch below.) */
int ir_build_dominators_tree(ir_ctx *ctx)
{
	uint32_t blocks_count, b, postnum;
	ir_block *blocks, *bb;
	uint32_t *edges;
	bool changed;

	ctx->flags2 &= ~IR_NO_LOOPS;

	postnum = 1;
	compute_postnum(ctx, &postnum, 1);

	/* Find immediate dominators */
	blocks = ctx->cfg_blocks;
	edges = ctx->cfg_edges;
	blocks_count = ctx->cfg_blocks_count;
	blocks[1].idom = 1;
	do {
		changed = 0;
		/* Iterating in Reverse Post Order */
		for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) {
			IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
			if (bb->predecessors_count == 1) {
				uint32_t pred_b = edges[bb->predecessors];

				if (blocks[pred_b].idom <= 0) {
					//IR_ASSERT("Wrong blocks order: BB is before its single predecessor");
				} else if (bb->idom != pred_b) {
					bb->idom = pred_b;
					changed = 1;
				}
			} else if (bb->predecessors_count) {
				uint32_t idom = 0;
				uint32_t k = bb->predecessors_count;
				uint32_t *p = edges + bb->predecessors;

				do {
					uint32_t pred_b = *p;
					ir_block *pred_bb = &blocks[pred_b];
					ir_block *idom_bb;

					/* Start from the first predecessor with a known idom */
					if (pred_bb->idom > 0) {
						idom = pred_b;
						idom_bb = &blocks[idom];

						/* Intersect with the remaining processed predecessors */
						while (--k > 0) {
							pred_b = *(++p);
							pred_bb = &blocks[pred_b];
							if (pred_bb->idom > 0) {
								while (idom != pred_b) {
									while (pred_bb->postnum < idom_bb->postnum) {
										pred_b = pred_bb->idom;
										pred_bb = &blocks[pred_b];
									}
									while (idom_bb->postnum < pred_bb->postnum) {
										idom = idom_bb->idom;
										idom_bb = &blocks[idom];
									}
								}
							}
						}

						if (bb->idom != idom) {
							bb->idom = idom;
							changed = 1;
						}
						break;
					}
					p++;
				} while (--k > 0);
			}
		}
	} while (changed);
	blocks[1].idom = 0;
	blocks[1].dom_depth = 0;

	/* Construct dominators tree */
	for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) {
		IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
		if (bb->idom > 0) {
			ir_block *idom_bb = &blocks[bb->idom];

			bb->dom_depth = idom_bb->dom_depth + 1;
			/* Sort by block number to traverse children in pre-order */
			if (idom_bb->dom_child == 0) {
				idom_bb->dom_child = b;
			} else if (b < idom_bb->dom_child) {
				bb->dom_next_child = idom_bb->dom_child;
				idom_bb->dom_child = b;
			} else {
				int child = idom_bb->dom_child;
				ir_block *child_bb = &blocks[child];

				while (child_bb->dom_next_child > 0 && b > child_bb->dom_next_child) {
					child = child_bb->dom_next_child;
					child_bb = &blocks[child];
				}
				bb->dom_next_child = child_bb->dom_next_child;
				child_bb->dom_next_child = b;
			}
		}
	}

	return 1;
}
632 #else
/* A single-pass modification of "A Simple, Fast Dominance Algorithm" by
 * Cooper, Harvey and Kennedy, that relies on IR block ordering.
 * It may fall back to the general slow fixed-point algorithm. */
636 static int ir_build_dominators_tree_iterative(ir_ctx *ctx);
/* Compute the dominator tree in a single RPO pass (single-pass variant of
 * Cooper/Harvey/Kennedy that relies on the IR block ordering).
 *
 * Back-edges encountered on the way are collected in "worklist" and later
 * checked to lead to loop headers; when that check fails, the function
 * falls back to the general iterative fixed-point algorithm.
 * Also maintains the IR_NO_LOOPS flag in ctx->flags2.
 * Returns 1 on success. */
int ir_build_dominators_tree(ir_ctx *ctx)
{
	uint32_t blocks_count, b;
	ir_block *blocks, *bb;
	uint32_t *edges;
	ir_list worklist;

	ir_list_init(&worklist, ctx->cfg_blocks_count / 2);

	ctx->flags2 |= IR_NO_LOOPS;

	/* Find immediate dominators */
	blocks = ctx->cfg_blocks;
	edges = ctx->cfg_edges;
	blocks_count = ctx->cfg_blocks_count;
	blocks[1].idom = 1;
	blocks[1].dom_depth = 0;

	/* Iterating in Reverse Post Order */
	for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) {
		IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
		IR_ASSERT(bb->predecessors_count > 0);
		uint32_t k = bb->predecessors_count;
		uint32_t *p = edges + bb->predecessors;
		uint32_t idom = *p;
		ir_block *idom_bb;

		if (UNEXPECTED(idom > b)) {
			/* In rare cases, LOOP_BEGIN.op1 may be a back-edge. Skip back-edges. */
			ctx->flags2 &= ~IR_NO_LOOPS;
			IR_ASSERT(k > 1 && "Wrong blocks order: BB is before its single predecessor");
			ir_list_push(&worklist, idom);
			while (1) {
				k--;
				p++;
				idom = *p;
				if (idom < b) {
					break;
				}
				IR_ASSERT(k > 0);
				ir_list_push(&worklist, idom);
			}
		}
		IR_ASSERT(blocks[idom].idom > 0);
		IR_ASSERT(k != 0);

		while (--k > 0) {
			uint32_t pred_b = *(++p);

			if (pred_b < b) {
				/* Forward edge: intersect the two dominator chains
				 * (block numbers stand in for postorder numbers) */
				IR_ASSERT(blocks[pred_b].idom > 0);
				while (idom != pred_b) {
					while (pred_b > idom) {
						pred_b = blocks[pred_b].idom;
					}
					while (idom > pred_b) {
						idom = blocks[idom].idom;
					}
				}
			} else {
				/* Back-edge: remember it for the loop-header check below */
				ctx->flags2 &= ~IR_NO_LOOPS;
				IR_ASSERT(bb->predecessors_count > 1);
				ir_list_push(&worklist, pred_b);
			}
		}
		bb->idom = idom;
		idom_bb = &blocks[idom];

		bb->dom_depth = idom_bb->dom_depth + 1;
		/* Sort by block number to traverse children in pre-order */
		if (idom_bb->dom_child == 0) {
			idom_bb->dom_child = b;
		} else if (b < idom_bb->dom_child) {
			bb->dom_next_child = idom_bb->dom_child;
			idom_bb->dom_child = b;
		} else {
			int child = idom_bb->dom_child;
			ir_block *child_bb = &blocks[child];

			while (child_bb->dom_next_child > 0 && b > child_bb->dom_next_child) {
				child = child_bb->dom_next_child;
				child_bb = &blocks[child];
			}
			bb->dom_next_child = child_bb->dom_next_child;
			child_bb->dom_next_child = b;
		}
	}

	blocks[1].idom = 0;

	if (ir_list_len(&worklist) != 0) {
		uint32_t dom_depth;
		uint32_t succ_b;
		bool complete = 1;

		/* Check if all the back-edges lead to the loop headers */
		do {
			b = ir_list_pop(&worklist);
			bb = &blocks[b];
			succ_b = ctx->cfg_edges[bb->successors];
			if (bb->successors_count != 1) {
				/* LOOP_END/END may be linked with the following ENTRY by a fake edge */
				IR_ASSERT(bb->successors_count == 2);
				if (blocks[succ_b].flags & IR_BB_ENTRY) {
					succ_b = ctx->cfg_edges[bb->successors + 1];
				} else {
					IR_ASSERT(blocks[ctx->cfg_edges[bb->successors + 1]].flags & IR_BB_ENTRY);
				}
			}
			dom_depth = blocks[succ_b].dom_depth;
			/* Walk up the dominator tree to the successor's depth */
			while (bb->dom_depth > dom_depth) {
				b = bb->dom_parent;
				bb = &blocks[b];
			}
			if (UNEXPECTED(b != succ_b)) {
				complete = 0;
				break;
			}
		} while (ir_list_len(&worklist) != 0);

		if (UNEXPECTED(!complete)) {
			ir_list_free(&worklist);
			return ir_build_dominators_tree_iterative(ctx);
		}
	}

	ir_list_free(&worklist);

	return 1;
}
767
/* Fallback: compute immediate dominators with the classic iterative
 * fixed-point algorithm (Cooper/Harvey/Kennedy). Used when the block
 * ordering violates the assumptions of the single-pass variant above.
 * Already-found idoms are kept as the initial approximation.
 * Returns 1 on success. */
static int ir_build_dominators_tree_iterative(ir_ctx *ctx)
{
	bool changed;
	uint32_t blocks_count, b;
	ir_block *blocks, *bb;
	uint32_t *edges;

	/* Find immediate dominators */
	blocks = ctx->cfg_blocks;
	edges = ctx->cfg_edges;
	blocks_count = ctx->cfg_blocks_count;

	/* Clear the dominators tree, but keep already found dominators */
	for (b = 0, bb = &blocks[0]; b <= blocks_count; b++, bb++) {
		bb->dom_depth = 0;
		bb->dom_child = 0;
		bb->dom_next_child = 0;
	}

	/* Find immediate dominators by iterative fixed-point algorithm */
	blocks[1].idom = 1;
	do {
		changed = 0;

		for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) {
			IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
			IR_ASSERT(bb->predecessors_count > 0);
			uint32_t k = bb->predecessors_count;
			uint32_t *p = edges + bb->predecessors;
			uint32_t idom = *p;

			if (blocks[idom].idom == 0) {
				/* Pick the first predecessor that already has an idom */
				while (1) {
					k--;
					p++;
					idom = *p;
					if (blocks[idom].idom > 0) {
						break;
					}
					IR_ASSERT(k > 0);
				}
			}
			IR_ASSERT(k != 0);
			while (--k > 0) {
				uint32_t pred_b = *(++p);

				if (blocks[pred_b].idom > 0) {
					/* Intersect the two dominator chains
					 * (block numbers stand in for postorder numbers) */
					while (idom != pred_b) {
						while (pred_b > idom) {
							pred_b = blocks[pred_b].idom;
						}
						while (idom > pred_b) {
							idom = blocks[idom].idom;
						}
					}
				}
			}
			if (bb->idom != idom) {
				bb->idom = idom;
				changed = 1;
			}
		}
	} while (changed);

	/* Build dominators tree */
	blocks[1].idom = 0;
	blocks[1].dom_depth = 0;
	for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) {
		uint32_t idom = bb->idom;
		ir_block *idom_bb = &blocks[idom];

		bb->dom_depth = idom_bb->dom_depth + 1;
		/* Sort by block number to traverse children in pre-order */
		if (idom_bb->dom_child == 0) {
			idom_bb->dom_child = b;
		} else if (b < idom_bb->dom_child) {
			bb->dom_next_child = idom_bb->dom_child;
			idom_bb->dom_child = b;
		} else {
			int child = idom_bb->dom_child;
			ir_block *child_bb = &blocks[child];

			while (child_bb->dom_next_child > 0 && b > child_bb->dom_next_child) {
				child = child_bb->dom_next_child;
				child_bb = &blocks[child];
			}
			bb->dom_next_child = child_bb->dom_next_child;
			child_bb->dom_next_child = b;
		}
	}

	return 1;
}
862 #endif
863
ir_dominates(const ir_block * blocks,uint32_t b1,uint32_t b2)864 static bool ir_dominates(const ir_block *blocks, uint32_t b1, uint32_t b2)
865 {
866 uint32_t b1_depth = blocks[b1].dom_depth;
867 const ir_block *bb2 = &blocks[b2];
868
869 while (bb2->dom_depth > b1_depth) {
870 b2 = bb2->dom_parent;
871 bb2 = &blocks[b2];
872 }
873 return b1 == b2;
874 }
875
/* Identify loops, loop headers and loop depths using DJ graphs
 * (Sreedhar et al, "Identifying Loops Using DJ Graphs").
 * Requires a valid dominator tree. Sets IR_BB_LOOP_HEADER /
 * IR_BB_IRREDUCIBLE_LOOP block flags and the IR_CFG_HAS_LOOPS /
 * IR_IRREDUCIBLE_CFG context flags. Returns 1. */
int ir_find_loops(ir_ctx *ctx)
{
	uint32_t i, j, n, count;
	uint32_t *entry_times, *exit_times, *sorted_blocks, time = 1;
	ir_block *blocks = ctx->cfg_blocks;
	uint32_t *edges = ctx->cfg_edges;
	ir_worklist work;

	if (ctx->flags2 & IR_NO_LOOPS) {
		return 1;
	}

	/* We don't materialize the DJ spanning tree explicitly, as we are only interested in ancestor
	 * queries. These are implemented by checking entry/exit times of the DFS search. */
	ir_worklist_init(&work, ctx->cfg_blocks_count + 1);
	entry_times = ir_mem_malloc((ctx->cfg_blocks_count + 1) * 3 * sizeof(uint32_t));
	exit_times = entry_times + ctx->cfg_blocks_count + 1;
	sorted_blocks = exit_times + ctx->cfg_blocks_count + 1;

	memset(entry_times, 0, (ctx->cfg_blocks_count + 1) * sizeof(uint32_t));

	/* Iterative DFS over the DJ graph (dominator-tree children + join edges) */
	ir_worklist_push(&work, 1);
	while (ir_worklist_len(&work)) {
		ir_block *bb;
		int child;

next:
		i = ir_worklist_peek(&work);
		if (!entry_times[i]) {
			entry_times[i] = time++;
		}

		/* Visit blocks immediately dominated by i. */
		bb = &blocks[i];
		for (child = bb->dom_child; child > 0; child = blocks[child].dom_next_child) {
			if (ir_worklist_push(&work, child)) {
				goto next;
			}
		}

		/* Visit join edges. */
		if (bb->successors_count) {
			uint32_t *p = edges + bb->successors;
			for (j = 0; j < bb->successors_count; j++,p++) {
				uint32_t succ = *p;

				if (blocks[succ].idom == i) {
					continue;
				} else if (ir_worklist_push(&work, succ)) {
					goto next;
				}
			}
		}
		exit_times[i] = time++;
		ir_worklist_pop(&work);
	}

	/* Sort blocks by level, which is the opposite order in which we want to process them */
	sorted_blocks[1] = 1;
	j = 1;
	n = 2;
	while (j != n) {
		/* BFS over the dominator tree: each pass appends the next level */
		i = j;
		j = n;
		for (; i < j; i++) {
			int child;
			for (child = blocks[sorted_blocks[i]].dom_child; child > 0; child = blocks[child].dom_next_child) {
				sorted_blocks[n++] = child;
			}
		}
	}
	count = n;

	/* Identify loops. See Sreedhar et al, "Identifying Loops Using DJ Graphs". */
	while (n > 1) {
		i = sorted_blocks[--n];
		ir_block *bb = &blocks[i];

		if (bb->predecessors_count > 1) {
			bool irreducible = 0;
			uint32_t *p = &edges[bb->predecessors];

			j = bb->predecessors_count;
			do {
				uint32_t pred = *p;

				/* A join edge is one for which the predecessor does not
				   immediately dominate the successor. */
				if (bb->idom != pred) {
					/* In a loop back-edge (back-join edge), the successor dominates
					   the predecessor. */
					if (ir_dominates(blocks, i, pred)) {
						if (!ir_worklist_len(&work)) {
							/* reuse the worklist; reset its visited set */
							ir_bitset_clear(work.visited, ir_bitset_len(ir_worklist_capasity(&work)));
						}
						blocks[pred].loop_header = 0; /* support for merged loops */
						ir_worklist_push(&work, pred);
					} else {
						/* Otherwise it's a cross-join edge. See if it's a branch
						   to an ancestor on the DJ spanning tree. */
						if (entry_times[pred] > entry_times[i] && exit_times[pred] < exit_times[i]) {
							irreducible = 1;
						}
					}
				}
				p++;
			} while (--j);

			if (UNEXPECTED(irreducible)) {
				// TODO: Support for irreducible loops ???
				bb->flags |= IR_BB_IRREDUCIBLE_LOOP;
				ctx->flags2 |= IR_IRREDUCIBLE_CFG;
				while (ir_worklist_len(&work)) {
					ir_worklist_pop(&work);
				}
			} else if (ir_worklist_len(&work)) {
				/* i heads a natural loop: flood backwards from the back-edge
				 * sources, assigning loop_header = i to every member */
				bb->flags |= IR_BB_LOOP_HEADER;
				ctx->flags2 |= IR_CFG_HAS_LOOPS;
				bb->loop_depth = 1;
				while (ir_worklist_len(&work)) {
					j = ir_worklist_pop(&work);
					while (blocks[j].loop_header > 0) {
						j = blocks[j].loop_header; /* collapse to outermost known header */
					}
					if (j != i) {
						ir_block *bb = &blocks[j];
						if (bb->idom == 0 && j != 1) {
							/* Ignore blocks that are unreachable or only abnormally reachable. */
							continue;
						}
						bb->loop_header = i;
						if (bb->predecessors_count) {
							uint32_t *p = &edges[bb->predecessors];
							j = bb->predecessors_count;
							do {
								ir_worklist_push(&work, *p);
								p++;
							} while (--j);
						}
					}
				}
			}
		}
	}

	if (ctx->flags2 & IR_CFG_HAS_LOOPS) {
		/* Propagate loop depths top-down (sorted_blocks is in level order) */
		for (n = 1; n < count; n++) {
			i = sorted_blocks[n];
			ir_block *bb = &blocks[i];
			if (bb->loop_header > 0) {
				ir_block *loop = &blocks[bb->loop_header];
				uint32_t loop_depth = loop->loop_depth;

				if (bb->flags & IR_BB_LOOP_HEADER) {
					loop_depth++;
				}
				bb->loop_depth = loop_depth;
				if (bb->flags & (IR_BB_ENTRY|IR_BB_LOOP_WITH_ENTRY)) {
					loop->flags |= IR_BB_LOOP_WITH_ENTRY;
				}
			}
		}
	}

	ir_mem_free(entry_times);
	ir_worklist_free(&work);

	return 1;
}
1045
_ir_skip_empty_blocks(const ir_ctx * ctx,uint32_t b)1046 static uint32_t _ir_skip_empty_blocks(const ir_ctx *ctx, uint32_t b)
1047 {
1048 while (1) {
1049 ir_block *bb = &ctx->cfg_blocks[b];
1050
1051 if ((bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) == IR_BB_EMPTY) {
1052 IR_ASSERT(bb->successors_count == 1);
1053 b = ctx->cfg_edges[bb->successors];
1054 } else {
1055 return b;
1056 }
1057 }
1058 }
1059
1060 /* A variation of "Bottom-up Positioning" algorithm described by
1061 * Karl Pettis and Robert C. Hansen "Profile Guided Code Positioning"
1062 */
/* A CFG edge annotated with its estimated execution frequency */
typedef struct _ir_edge_info {
	uint32_t from;
	uint32_t to;
	float    freq;
} ir_edge_info;

/* A node of a chain of basic blocks (Pettis-Hansen block positioning).
 * "head" points towards the chain representative (with path compression),
 * "next"/"prev" link chain members, and "tail" (shared storage with
 * "prev") is maintained on the chain head. */
typedef struct _ir_chain {
	uint32_t head;
	uint32_t next;
	union {
		uint32_t prev;
		uint32_t tail;
	};
} ir_chain;

/* qsort() comparator: order edges by decreasing frequency. */
static int ir_edge_info_cmp(const void *b1, const void *b2)
{
	const ir_edge_info *e1 = (const ir_edge_info*)b1;
	const ir_edge_info *e2 = (const ir_edge_info*)b2;

	if (e1->freq != e2->freq) {
		return e1->freq < e2->freq ? 1 : -1;
	}
	/* In case of equal frequencies, try to avoid penalization of one of the "equal" paths by
	 * preferring the first RPO successor (in conditional branches) and the last RPO predecessor
	 * (in merge points).
	 *
	 * See "Static Basic Block Reordering Heuristics for Implicit Control Flow in Baseline JITs"
	 * Polito Guillermo, Ducasse Stephane, and Tesone Pablo (2021)
	 */
	if (e1->from != e2->from) {
		/* Explicit comparison instead of "e2->from - e1->from": uint32_t
		 * subtraction wraps and its conversion to int yields the wrong
		 * sign for differences larger than INT_MAX. */
		return e2->from > e1->from ? 1 : -1;
	} else {
		return e1->to > e2->to ? 1 : (e1->to < e2->to ? -1 : 0);
	}
}
1099
ir_chain_head_path_compress(ir_chain * chains,uint32_t src,uint32_t head)1100 static IR_NEVER_INLINE uint32_t ir_chain_head_path_compress(ir_chain *chains, uint32_t src, uint32_t head)
1101 {
1102 IR_ASSERT(head != 0);
1103 do {
1104 head = chains[head].head;
1105 } while (chains[head].head != head);
1106 do {
1107 uint32_t next = chains[src].head;
1108 chains[src].head = head;
1109 src = next;
1110 } while (chains[src].head != head);
1111 return head;
1112 }
1113
ir_chain_head(ir_chain * chains,uint32_t src)1114 IR_ALWAYS_INLINE uint32_t ir_chain_head(ir_chain *chains, uint32_t src)
1115 {
1116 uint32_t head = chains[src].head;
1117 if (chains[head].head == head) {
1118 return head;
1119 } else {
1120 return ir_chain_head_path_compress(chains, src, head);
1121 }
1122 }
1123
/* Concatenate chain "dst" after chain "src": link src's tail to dst and
 * dst's tail back to src, making "src" the representative of the merged
 * chain. NOTE: the statement order matters because "prev" and "tail"
 * share storage in the union. */
static void ir_join_chains(ir_chain *chains, uint32_t src, uint32_t dst)
{
	uint32_t tail_of_dst = chains[dst].tail;
	uint32_t tail_of_src = chains[src].tail;

	chains[tail_of_dst].next = src;
	chains[dst].prev = tail_of_src;
	chains[tail_of_src].next = dst;
	chains[src].tail = tail_of_dst;
	chains[dst].head = src;
}
1135
/* Insert the chain headed by "c" immediately before node "before" in
 * before's chain. "c" must be a chain head. If "before" was itself a
 * chain head, "c" becomes the new head; otherwise "c" adopts before's
 * head. NOTE: the statement order matters because "prev" and "tail"
 * share storage in the union. */
static void ir_insert_chain_before(ir_chain *chains, uint32_t c, uint32_t before)
{
	/* Renamed from "this": a reserved keyword in C++, which breaks
	 * compiling/including this file as C++ */
	ir_chain *node = &chains[c];
	ir_chain *next = &chains[before];

	IR_ASSERT(chains[c].head == c);
	if (chains[before].head != before) {
		/* "before" is in the middle of a chain: adopt its head */
		node->head = next->head;
	} else {
		/* "before" was a chain head: "c" becomes the new head */
		next->head = c;
	}
	node->next = before;
	node->prev = next->prev;
	next->prev = c;
	chains[node->prev].next = c;
}
1152
1153 #ifndef IR_DEBUG_BB_SCHEDULE_GRAPH
1154 # ifdef IR_DEBUG
1155 # define IR_DEBUG_BB_SCHEDULE_GRAPH 1
1156 # else
1157 # define IR_DEBUG_BB_SCHEDULE_GRAPH 0
1158 # endif
1159 #endif
1160
1161 #if IR_DEBUG_BB_SCHEDULE_GRAPH
/* Debug helper: dump the CFG with estimated BB frequencies and the current
 * chain assignment as a Graphviz "dot" graph to stderr.  Each chain gets its
 * own fill color (the 12-color "set312" scheme, reused cyclically); chain
 * heads are drawn with a thick border and empty blocks are dotted. */
static void ir_dump_cfg_freq_graph(ir_ctx *ctx, float *bb_freq, uint32_t edges_count, ir_edge_info *edges, ir_chain *chains)
{
	uint32_t b, i;
	ir_block *bb;
	uint8_t c, *colors;
	bool is_head, is_empty;
	uint32_t max_colors = 12;

	/* debug-only code, so stack allocation via alloca() is tolerated here */
	colors = alloca(sizeof(uint8_t) * (ctx->cfg_blocks_count + 1));
	memset(colors, 0, sizeof(uint8_t) * (ctx->cfg_blocks_count + 1));
	/* assign a color to every chain root */
	i = 0;
	for (b = 1; b < ctx->cfg_blocks_count + 1; b++) {
		if (chains[b].head == b) {
			colors[b] = (i % max_colors) + 1;
			i++;
		}
	}

	fprintf(stderr, "digraph {\n");
	fprintf(stderr, "\trankdir=TB;\n");
	for (b = 1; b <= ctx->cfg_blocks_count; b++) {
		bb = &ctx->cfg_blocks[b];
		/* blocks with head == 0 were already moved to the end of the
		 * schedule and get no color */
		c = (chains[b].head) ? colors[ir_chain_head(chains, b)] : 0;
		is_head = chains[b].head == b;
		is_empty = (bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) == IR_BB_EMPTY;
		if (c) {
			fprintf(stderr, "\tBB%d [label=\"BB%d: (%d),%0.3f\\n%s\\n%s\",colorscheme=set312,fillcolor=%d%s%s]\n", b, b,
				bb->loop_depth, bb_freq[b],
				ir_op_name[ctx->ir_base[bb->start].op], ir_op_name[ctx->ir_base[bb->end].op],
				c,
				is_head ? ",penwidth=3" : "",
				is_empty ? ",style=\"dotted,filled\"" : ",style=\"filled\"");
		} else {
			fprintf(stderr, "\tBB%d [label=\"BB%d: (%d),%0.3f\\n%s\\n%s\"%s%s]\n", b, b,
				bb->loop_depth, bb_freq[b],
				ir_op_name[ctx->ir_base[bb->start].op], ir_op_name[ctx->ir_base[bb->end].op],
				is_head ? ",penwidth=3" : "",
				is_empty ? ",style=\"dotted\"" : "");
		}
	}
	fprintf(stderr, "\n");

	/* emit the frequency-annotated edges */
	for (i = 0; i < edges_count; i++) {
		fprintf(stderr, "\tBB%d -> BB%d [label=\"%0.3f\"]\n", edges[i].from, edges[i].to, edges[i].freq);
	}
	fprintf(stderr, "}\n");
}
1209 #endif
1210
1211 #ifdef IR_DEBUG
/* Debug helper: print every collected CFG edge with its estimated
 * frequency to stderr.  ("ctx" is unused, kept for interface symmetry
 * with the other dump helpers.) */
static void ir_dump_edges(ir_ctx *ctx, uint32_t edges_count, ir_edge_info *edges)
{
	const ir_edge_info *e = edges;
	const ir_edge_info *end = edges + edges_count;

	fprintf(stderr, "Edges:\n");
	for (; e < end; e++) {
		fprintf(stderr, "\tBB%d -> BB%d %0.3f\n", e->from, e->to, e->freq);
	}
}
1221
/* Debug helper: print every chain (starting from its root and following
 * the ".next" links up to the tail) to stderr. */
static void ir_dump_chains(ir_ctx *ctx, ir_chain *chains)
{
	uint32_t root, cur;

	fprintf(stderr, "Chains:\n");
	for (root = 1; root <= ctx->cfg_blocks_count; root++) {
		if (chains[root].head != root) {
			continue; /* only walk chains from their roots */
		}
		cur = root;
		fprintf(stderr, "(BB%d", cur);
		while (cur != chains[root].tail) {
			cur = chains[cur].next;
			fprintf(stderr, ", BB%d", cur);
		}
		fprintf(stderr, ")\n");
	}
}
1240 #endif
1241
/* Bottom-up block placement (the "Pettis-Hansen"-style algorithm referenced
 * in ir_schedule_blocks): estimate BB and edge frequencies, greedily join
 * blocks connected by the most frequent edges into chains, and lay the
 * chains out into ctx->cfg_schedule.  Empty and provably-cold blocks are
 * pushed to the very end of the schedule.  Always returns 1. */
static int ir_schedule_blocks_bottom_up(ir_ctx *ctx)
{
	uint32_t max_edges_count = ctx->cfg_edges_count / 2;
	uint32_t edges_count = 0;
	uint32_t b, i, loop_depth;
	float *bb_freq, freq;
	ir_block *bb;
	ir_edge_info *edges, *e;
	ir_chain *chains;
	ir_bitqueue worklist;
	ir_bitset visited;
	uint32_t *schedule_end, count;

	/* The final schedule: slots [1..count] are filled in step 8; empty/cold
	 * blocks are appended from the back through "schedule_end" as found. */
	ctx->cfg_schedule = ir_mem_malloc(sizeof(uint32_t) * (ctx->cfg_blocks_count + 2));
	schedule_end = ctx->cfg_schedule + ctx->cfg_blocks_count;

	/* 1. Create initial chains for each BB */
	chains = ir_mem_malloc(sizeof(ir_chain) * (ctx->cfg_blocks_count + 1));
	chains[0].head = 0;
	chains[0].next = 0;
	chains[0].prev = 0;
	for (b = 1; b <= ctx->cfg_blocks_count; b++) {
		chains[b].head = b;
		chains[b].next = b;
		chains[b].prev = b;
	}

	/* 2. Collect information about BBs and EDGEs frequencies */
	edges = ir_mem_malloc(sizeof(ir_edge_info) * max_edges_count);
	bb_freq = ir_mem_calloc(ctx->cfg_blocks_count + 1, sizeof(float));
	bb_freq[1] = 1.0f;
	visited = ir_bitset_malloc(ctx->cfg_blocks_count + 1);
	ir_bitqueue_init(&worklist, ctx->cfg_blocks_count + 1);
	ir_bitqueue_add(&worklist, 1);
	while ((b = ir_bitqueue_pop(&worklist)) != (uint32_t)-1) {
restart:
		bb = &ctx->cfg_blocks[b];
		if (bb->predecessors_count) {
			uint32_t n = bb->predecessors_count;
			uint32_t *p = ctx->cfg_edges + bb->predecessors;
			for (; n > 0; p++, n--) {
				uint32_t predecessor = *p;
				/* Basic Blocks are ordered in a way that usual predecessors ids are less than successors.
				 * So we may compare blocks ids (predecessor < b) instead of a more expensive check for back edge
				 * (b != predecessor && ctx->cfg_blocks[predecessor].loop_header != b)
				 */
				if (predecessor < b) {
					if (!ir_bitset_in(visited, predecessor)) {
						/* process the unvisited forward predecessor first so its
						 * frequency contribution is accumulated before "b" */
						b = predecessor;
						ir_bitqueue_del(&worklist, b);
						goto restart;
					}
				} else if (b != predecessor && ctx->cfg_blocks[predecessor].loop_header != b) {
					/* NOTE(review): looks like leftover debug code - the condition
					 * duplicates the assertion below, so in release builds (where
					 * IR_ASSERT is compiled out) this dumps the whole CFG to
					 * stderr and then continues; consider removing. */
					ir_dump_cfg(ctx, stderr);
					IR_ASSERT(b == predecessor || ctx->cfg_blocks[predecessor].loop_header == b);
				}
			}
		}

		ir_bitset_incl(visited, b);

		if ((bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) == IR_BB_EMPTY) {
			uint32_t successor = ctx->cfg_edges[bb->successors];

			/* move empty blocks to the end */
			IR_ASSERT(chains[b].head == b);
			chains[b].head = 0;
			*schedule_end = b;
			schedule_end--;

			if (successor > b) {
				/* propagate the frequency through the skipped empty block */
				bb_freq[successor] += bb_freq[b];
				b = successor;
				ir_bitqueue_del(&worklist, b);
				goto restart;
			} else {
				continue;
			}
		}

		loop_depth = bb->loop_depth;
		if (bb->flags & IR_BB_LOOP_HEADER) {
			// TODO: Estimate the loop iterations count
			bb_freq[b] *= 10;
		}

		if (bb->successors_count) {
			uint32_t n = bb->successors_count;
			uint32_t *p = ctx->cfg_edges + bb->successors;

			if (n == 1) {
				/* unconditional flow: successor inherits the full frequency */
				uint32_t successor = *p;

				IR_ASSERT(edges_count < max_edges_count);
				freq = bb_freq[b];
				if (successor > b) {
					IR_ASSERT(!ir_bitset_in(visited, successor));
					bb_freq[successor] += freq;
					ir_bitqueue_add(&worklist, successor);
				}
				/* record the edge to the first non-empty target */
				successor = _ir_skip_empty_blocks(ctx, successor);
				edges[edges_count].from = b;
				edges[edges_count].to = successor;
				edges[edges_count].freq = freq;
				edges_count++;
			} else if (n == 2 && ctx->ir_base[bb->end].op == IR_IF) {
				uint32_t successor1 = *p;
				ir_block *successor1_bb = &ctx->cfg_blocks[successor1];
				ir_insn *insn1 = &ctx->ir_base[successor1_bb->start];
				uint32_t successor2 = *(p + 1);
				ir_block *successor2_bb = &ctx->cfg_blocks[successor2];
				ir_insn *insn2 = &ctx->ir_base[successor2_bb->start];
				int prob1, prob2, probN = 100;

				/* Derive the two branch probabilities (out of probN):
				 * explicit hints stored in op2 of the successors' start
				 * instructions win; otherwise prefer staying at the same
				 * loop depth, then prefer the non-empty target. */
				if (insn1->op2) {
					prob1 = insn1->op2;
					if (insn2->op2) {
						prob2 = insn2->op2;
						probN = prob1 + prob2;
					} else {
						if (prob1 > 100) {
							prob1 = 100;
						}
						prob2 = 100 - prob1;
					}

				} else if (insn2->op2) {
					prob2 = insn2->op2;
					if (prob2 > 100) {
						prob2 = 100;
					}
					prob1 = 100 - prob2;
				} else if (successor1_bb->loop_depth >= loop_depth
						&& successor2_bb->loop_depth < loop_depth) {
					prob1 = 90;
					prob2 = 10;
				} else if (successor1_bb->loop_depth < loop_depth
						&& successor2_bb->loop_depth >= loop_depth) {
					prob1 = 10;
					prob2 = 90;
				} else if (successor2_bb->flags & IR_BB_EMPTY) {
					prob1 = 51;
					prob2 = 49;
				} else if (successor1_bb->flags & IR_BB_EMPTY) {
					prob1 = 49;
					prob2 = 51;
				} else {
					prob1 = prob2 = 50;
				}
				do {
					freq = bb_freq[b] * (float)prob1 / (float)probN;
					if (successor1 > b) {
						IR_ASSERT(!ir_bitset_in(visited, successor1));
						bb_freq[successor1] += freq;
						if (successor1_bb->successors_count == 0 && insn1->op2 == 1) {
							/* move cold block without successors to the end */
							IR_ASSERT(chains[successor1].head == successor1);
							chains[successor1].head = 0;
							*schedule_end = successor1;
							schedule_end--;
							break;
						} else {
							ir_bitqueue_add(&worklist, successor1);
						}
					}
					/* try to join edges early to reduce number of edges and the cost of their sorting */
					if (prob1 > prob2
					 && (successor1_bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) != IR_BB_EMPTY) {
						uint32_t src = chains[b].next;
						IR_ASSERT(chains[src].head == src);
						IR_ASSERT(src == ir_chain_head(chains, b));
						IR_ASSERT(chains[successor1].head == successor1);
						ir_join_chains(chains, src, successor1);
						/* in debug-graph mode keep the edge so it can be drawn */
						if (!IR_DEBUG_BB_SCHEDULE_GRAPH) break;
					}
					successor1 = _ir_skip_empty_blocks(ctx, successor1);
					IR_ASSERT(edges_count < max_edges_count);
					edges[edges_count].from = b;
					edges[edges_count].to = successor1;
					edges[edges_count].freq = freq;
					edges_count++;
				} while (0);
				do {
					freq = bb_freq[b] * (float)prob2 / (float)probN;
					if (successor2 > b) {
						IR_ASSERT(!ir_bitset_in(visited, successor2));
						bb_freq[successor2] += freq;
						if (successor2_bb->successors_count == 0 && insn2->op2 == 1) {
							/* move cold block without successors to the end */
							IR_ASSERT(chains[successor2].head == successor2);
							chains[successor2].head = 0;
							*schedule_end = successor2;
							schedule_end--;
							break;
						} else {
							ir_bitqueue_add(&worklist, successor2);
						}
					}
					if (prob2 > prob1
					 && (successor2_bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) != IR_BB_EMPTY) {
						uint32_t src = chains[b].next;
						IR_ASSERT(chains[src].head == src);
						IR_ASSERT(src == ir_chain_head(chains, b));
						IR_ASSERT(chains[successor2].head == successor2);
						ir_join_chains(chains, src, successor2);
						/* in debug-graph mode keep the edge so it can be drawn */
						if (!IR_DEBUG_BB_SCHEDULE_GRAPH) break;
					}
					successor2 = _ir_skip_empty_blocks(ctx, successor2);
					IR_ASSERT(edges_count < max_edges_count);
					edges[edges_count].from = b;
					edges[edges_count].to = successor2;
					edges[edges_count].freq = freq;
					edges_count++;
				} while (0);
			} else {
				/* multi-way branch (e.g. SWITCH): probabilities from the
				 * CASE_VAL/CASE_DEFAULT hints, else split evenly */
				int prob;

				for (; n > 0; p++, n--) {
					uint32_t successor = *p;
					ir_block *successor_bb = &ctx->cfg_blocks[successor];
					ir_insn *insn = &ctx->ir_base[successor_bb->start];

					if (insn->op == IR_CASE_DEFAULT) {
						prob = insn->op2;
						if (!prob) {
							prob = 100 / bb->successors_count;
						}
					} else if (insn->op == IR_CASE_VAL) {
						prob = insn->op3;
						if (!prob) {
							prob = 100 / bb->successors_count;
						}
					} else if (insn->op == IR_ENTRY) {
						if ((ctx->flags & IR_MERGE_EMPTY_ENTRIES) && (successor_bb->flags & IR_BB_EMPTY)) {
							prob = 99; /* prefer empty ENTRY block to go first */
						} else {
							prob = 1;
						}
					} else {
						prob = 100 / bb->successors_count;
					}
					freq = bb_freq[b] * (float)prob / 100.0f;
					if (successor > b) {
						IR_ASSERT(!ir_bitset_in(visited, successor));
						bb_freq[successor] += freq;
						ir_bitqueue_add(&worklist, successor);
					}
					successor = _ir_skip_empty_blocks(ctx, successor);
					IR_ASSERT(edges_count < max_edges_count);
					edges[edges_count].from = b;
					edges[edges_count].to = successor;
					edges[edges_count].freq = freq;
					edges_count++;
				}
			}
		}
	}
	ir_bitqueue_free(&worklist);
	ir_mem_free(visited);

	/* 3. Sort EDGEs according to their frequencies */
	qsort(edges, edges_count, sizeof(ir_edge_info), ir_edge_info_cmp);

#ifdef IR_DEBUG
	if (ctx->flags & IR_DEBUG_BB_SCHEDULE) {
		ir_dump_edges(ctx, edges_count, edges);
	}
#endif

	/* 4. Process EDGEs in the decreasing frequency order and join the connected chains */
	for (e = edges, i = edges_count; i > 0; e++, i--) {
		uint32_t dst = chains[e->to].head;
		if (dst == e->to) {
			uint32_t src = chains[e->from].next;
			if (chains[src].head == src) {
				IR_ASSERT(src == ir_chain_head(chains, e->from) && chains[src].tail == e->from);
				if (src != dst) {
					ir_join_chains(chains, src, dst);
				} else if (ctx->cfg_blocks[e->from].successors_count < 2) {
					/* Try to rotate loop chain to finish it with a conditional branch */
					uint32_t tail = e->from;
					uint32_t prev = src;
					uint32_t next = chains[prev].next;
					uint32_t best = 0;

					while (prev != tail) {
						if (ctx->ir_base[ctx->cfg_blocks[prev].end].op == IR_IF) {
							if (ctx->ir_base[ctx->cfg_blocks[prev].start].op == IR_LOOP_BEGIN
							 && ctx->cfg_blocks[prev].loop_depth > 1) {
								best = next;
								break;
							} else if (!best || bb_freq[next] < bb_freq[best]) {
								/* Find the successor of IF with the least frequency */
								best = next;
							}
						}
						prev = next;
						next = chains[next].next;
					}
					if (best) {
						/* change the head of the chain */
						chains[src].head = best;
						chains[best].head = best;
					}
				}
#if !IR_DEBUG_BB_SCHEDULE_GRAPH
				e->from = 0; /* reset "from" to avoid check on step #7 */
#endif
			}
		}
	}

#if IR_DEBUG_BB_SCHEDULE_GRAPH
	if (ctx->flags & IR_DEBUG_BB_SCHEDULE) {
		ir_dump_cfg_freq_graph(ctx, bb_freq, edges_count, edges, chains);
	}
#endif

	ir_mem_free(bb_freq);

#ifdef IR_DEBUG
	if (ctx->flags & IR_DEBUG_BB_SCHEDULE) {
		ir_dump_chains(ctx, chains);
	}
#endif

	/* 5. Merge empty entry blocks */
	if ((ctx->flags & IR_MERGE_EMPTY_ENTRIES) && ctx->entries_count) {
		for (i = 0; i < ctx->entries_count; i++) {
			b = ctx->entries[i];
			IR_ASSERT(ctx->cfg_blocks[b].flags & IR_BB_ENTRY);
			if (b && chains[b].head == b && chains[b].tail == b) {
				bb = &ctx->cfg_blocks[b];
				if (bb->flags & IR_BB_EMPTY) {
					uint32_t successor;

					/* walk past the run of empty blocks to the first real one */
					do {
						IR_ASSERT(bb->successors_count == 1);
						successor = ctx->cfg_edges[bb->successors];
						bb = &ctx->cfg_blocks[successor];
					} while ((bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) == IR_BB_EMPTY);
					IR_ASSERT(chains[successor].head);
					ir_insert_chain_before(chains, b, successor);
				}
			}
		}

#ifdef IR_DEBUG
		if (ctx->flags & IR_DEBUG_BB_SCHEDULE) {
			ir_dump_chains(ctx, chains);
		}
#endif
	}

	/* 6. Align loop headers */
	for (b = 1; b <= ctx->cfg_blocks_count; b++) {
		if (chains[b].head == b) {
			bb = &ctx->cfg_blocks[b];
			if (bb->loop_depth) {
				if ((bb->flags & IR_BB_LOOP_HEADER) || ir_chain_head(chains, bb->loop_header) == b) {
					bb->flags |= IR_BB_ALIGN_LOOP;
				}
			}
		}
	}

	/* 7. Group chains according to the most frequent edge between them */
	// TODO: Try to find a better heuristic
	for (e = edges, i = edges_count; i > 0; e++, i--) {
#if !IR_DEBUG_BB_SCHEDULE_GRAPH
		if (!e->from) continue;
#endif
		uint32_t src = ir_chain_head(chains, e->from);
		uint32_t dst = ir_chain_head(chains, e->to);
		if (src != dst) {
			if (dst == 1) {
				/* keep the chain holding the entry block first */
				ir_join_chains(chains, dst, src);
			} else {
				ir_join_chains(chains, src, dst);
			}
		}
	}

#ifdef IR_DEBUG
	if (ctx->flags & IR_DEBUG_BB_SCHEDULE) {
		ir_dump_chains(ctx, chains);
	}
#endif

	/* 8. Form a final BB order */
	count = 0;
	for (b = 1; b <= ctx->cfg_blocks_count; b++) {
		if (chains[b].head == b) {
			uint32_t tail = chains[b].tail;
			uint32_t i = b; /* note: intentionally shadows the outer "i" */
			while (1) {
				count++;
				ctx->cfg_schedule[count] = i;
				if (i == tail) break;
				i = chains[i].next;
			}
		}
	}

	/* the chained blocks and the blocks moved to the end must meet exactly */
	IR_ASSERT(ctx->cfg_schedule + count == schedule_end);
	ctx->cfg_schedule[ctx->cfg_blocks_count + 1] = 0;

	ir_mem_free(edges);
	ir_mem_free(chains);

	return 1;
}
1654
/* A variation of "Top-down Positioning" algorithm described by
 * Karl Pettis and Robert C. Hansen "Profile Guided Code Positioning"
 *
 * Greedily grows traces: starting from the lowest unscheduled block, keep
 * appending the "best" unscheduled successor (deeper loop first, then the
 * highest probability hint) until none is available.  Empty blocks are
 * pushed to the end of the schedule.  Always returns 1.
 */
static int ir_schedule_blocks_top_down(ir_ctx *ctx)
{
	ir_bitqueue blocks;
	uint32_t b, best_successor, last_non_empty;
	ir_block *bb, *best_successor_bb;
	ir_insn *insn;
	uint32_t *list, *schedule_end;
	uint32_t count = 0;

	ir_bitqueue_init(&blocks, ctx->cfg_blocks_count + 1);
	blocks.pos = 0; /* make the queue start popping from the lowest block id */
	/* list[1..count] holds the scheduled order; empty blocks are appended
	 * from the back through "schedule_end" */
	list = ir_mem_malloc(sizeof(uint32_t) * (ctx->cfg_blocks_count + 2));
	list[ctx->cfg_blocks_count + 1] = 0;
	schedule_end = list + ctx->cfg_blocks_count;
	for (b = 1; b <= ctx->cfg_blocks_count; b++) {
		ir_bitset_incl(blocks.set, b);
	}

	while ((b = ir_bitqueue_pop(&blocks)) != (uint32_t)-1) {
		bb = &ctx->cfg_blocks[b];
		/* Start trace */
		last_non_empty = 0;
		do {
			if (UNEXPECTED(bb->flags & IR_BB_PREV_EMPTY_ENTRY) && ir_bitqueue_in(&blocks, b - 1)) {
				/* Schedule the previous empty ENTRY block before this one */
				uint32_t predecessor = b - 1;

				ir_bitqueue_del(&blocks, predecessor);
				count++;
				list[count] = predecessor;
			}
			if ((bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) == IR_BB_EMPTY) {
				/* move empty blocks to the end */
				*schedule_end = b;
				schedule_end--;
			} else {
				count++;
				list[count] = b;
				last_non_empty = b;
			}
			best_successor_bb = NULL;
			if (bb->successors_count == 1) {
				best_successor = ctx->cfg_edges[bb->successors];
				if (ir_bitqueue_in(&blocks, best_successor)) {
					best_successor_bb = &ctx->cfg_blocks[best_successor];
				}
			} else if (bb->successors_count > 1) {
				uint32_t prob, best_successor_prob;
				uint32_t *p, successor;
				ir_block *successor_bb;

				/* NOTE: "b" is reused as the loop counter here; it is
				 * reassigned before its next use as a block id below */
				for (b = 0, p = &ctx->cfg_edges[bb->successors]; b < bb->successors_count; b++, p++) {
					successor = *p;
					if (ir_bitqueue_in(&blocks, successor)) {
						successor_bb = &ctx->cfg_blocks[successor];
						insn = &ctx->ir_base[successor_bb->start];
						/* probability hint from the successor's start insn,
						 * else split evenly between the successors */
						if (insn->op == IR_IF_TRUE || insn->op == IR_IF_FALSE) {
							prob = insn->op2;
							if (!prob) {
								prob = 100 / bb->successors_count;
								if (!(successor_bb->flags & IR_BB_EMPTY)) {
									prob++; /* tie-break towards the non-empty side */
								}
							}
						} else if (insn->op == IR_CASE_DEFAULT) {
							prob = insn->op2;
							if (!prob) {
								prob = 100 / bb->successors_count;
							}
						} else if (insn->op == IR_CASE_VAL) {
							prob = insn->op3;
							if (!prob) {
								prob = 100 / bb->successors_count;
							}
						} else if (insn->op == IR_ENTRY) {
							if ((ctx->flags & IR_MERGE_EMPTY_ENTRIES) && (successor_bb->flags & IR_BB_EMPTY)) {
								prob = 99; /* prefer empty ENTRY block to go first */
							} else {
								prob = 1;
							}
						} else {
							prob = 100 / bb->successors_count;
						}
						/* deeper loop nesting beats probability; the short-circuit
						 * guarantees best_successor_prob is never read before it
						 * is first assigned */
						if (!best_successor_bb
						 || successor_bb->loop_depth > best_successor_bb->loop_depth
						 || prob > best_successor_prob) {
							best_successor = successor;
							best_successor_bb = successor_bb;
							best_successor_prob = prob;
						}
					}
				}
			}
			if (!best_successor_bb) {
				/* Try to continue trace using the other successor of the last IF */
				if ((bb->flags & IR_BB_EMPTY) && last_non_empty) {
					bb = &ctx->cfg_blocks[last_non_empty];
					if (bb->successors_count == 2 && ctx->ir_base[bb->end].op == IR_IF) {
						b = ctx->cfg_edges[bb->successors];

						if (!ir_bitqueue_in(&blocks, b)) {
							b = ctx->cfg_edges[bb->successors + 1];
						}
						if (ir_bitqueue_in(&blocks, b)) {
							bb = &ctx->cfg_blocks[b];
							ir_bitqueue_del(&blocks, b);
							continue;
						}
					}
				}
				/* End trace */
				break;
			}
			b = best_successor;
			bb = best_successor_bb;
			ir_bitqueue_del(&blocks, b);
		} while (1);
	}

	/* the traces and the empty blocks appended from the back must meet exactly */
	IR_ASSERT(list + count == schedule_end);
	ctx->cfg_schedule = list;
	ir_bitqueue_free(&blocks);

	return 1;
}
1783
/* Entry point of basic-block scheduling: compute ctx->cfg_schedule (the
 * block emission order).  Trivial CFGs (<= 2 blocks) are left in natural
 * order.  Always returns 1. */
int ir_schedule_blocks(ir_ctx *ctx)
{
	ir_ref ref;

	if (ctx->cfg_blocks_count <= 2) {
		return 1;
	}

	/* Mark "stop" blocks terminated with UNREACHABLE as "unexpected" */
	/* (walk the terminator list linked through op3, starting at the START
	 * instruction's op1; TAILCALL-produced UNREACHABLEs are left alone) */
	ref = ctx->ir_base[1].op1;
	while (ref) {
		ir_insn *insn = &ctx->ir_base[ref];

		if (insn->op == IR_UNREACHABLE && ctx->ir_base[insn->op1].op != IR_TAILCALL) {
			ir_block *bb = &ctx->cfg_blocks[ctx->cfg_map[ref]];
			uint32_t n = bb->predecessors_count;

			if (n == 1) {
				/* give the branch leading here the minimal probability hint (1)
				 * unless an explicit hint is already present */
				ir_insn *start_insn = &ctx->ir_base[bb->start];
				if (start_insn->op == IR_IF_TRUE
				 || start_insn->op == IR_IF_FALSE
				 || start_insn->op == IR_CASE_DEFAULT) {
					if (!start_insn->op2) start_insn->op2 = 1;
				} else if (start_insn->op == IR_CASE_VAL) {
					if (!start_insn->op3) start_insn->op3 = 1;
				}
			} else if (n > 1) {
				/* several predecessors merge into the UNREACHABLE block -
				 * mark each single-predecessor branch leading to it */
				uint32_t *p = &ctx->cfg_edges[bb->predecessors];

				for (; n > 0; p++, n--) {
					bb = &ctx->cfg_blocks[*p];
					if (bb->predecessors_count == 1) {
						ir_insn *start_insn = &ctx->ir_base[bb->start];
						if (start_insn->op == IR_IF_TRUE
						 || start_insn->op == IR_IF_FALSE
						 || start_insn->op == IR_CASE_DEFAULT) {
							if (!start_insn->op2) start_insn->op2 = 1;
						} else if (start_insn->op == IR_CASE_VAL) {
							if (!start_insn->op3) start_insn->op3 = 1;
						}
					}
				}
			}
		}
		ref = insn->op3;
	}

	/* The bottom-up Pettis-Hansen algorithm is expensive - O(n^3),
	 * use it only for relatively small functions.
	 *
	 * TODO: make the choice between top-down and bottom-up algorithm configurable
	 */
	if (UNEXPECTED(ctx->flags2 & IR_IRREDUCIBLE_CFG) || ctx->cfg_blocks_count > 256) {
		return ir_schedule_blocks_top_down(ctx);
	} else {
		return ir_schedule_blocks_bottom_up(ctx);
	}
}
1842
1843 /* JMP target optimisation */
/* Public wrapper around _ir_skip_empty_blocks(): given a jump target block
 * "b", return the block the jump may be redirected to instead (presumably
 * the first transitively reachable non-empty block - see
 * _ir_skip_empty_blocks in ir_private.h for the exact semantics). */
uint32_t ir_skip_empty_target_blocks(const ir_ctx *ctx, uint32_t b)
{
	return _ir_skip_empty_blocks(ctx, b);
}
1848
ir_next_block(const ir_ctx * ctx,uint32_t b)1849 uint32_t ir_next_block(const ir_ctx *ctx, uint32_t b)
1850 {
1851 ir_block *bb;
1852
1853 if (ctx->cfg_schedule) {
1854 uint32_t ret = ctx->cfg_schedule[++b];
1855
1856 /* Check for empty ENTRY block */
1857 while (ret && ((ctx->cfg_blocks[ret].flags & (IR_BB_START|IR_BB_EMPTY)) == IR_BB_EMPTY)) {
1858 ret = ctx->cfg_schedule[++b];
1859 }
1860 return ret;
1861 }
1862
1863 b++;
1864 while (1) {
1865 if (b > ctx->cfg_blocks_count) {
1866 return 0;
1867 }
1868
1869 bb = &ctx->cfg_blocks[b];
1870
1871 if ((bb->flags & (IR_BB_START|IR_BB_EMPTY)) == IR_BB_EMPTY) {
1872 b++;
1873 } else {
1874 break;
1875 }
1876 }
1877 return b;
1878 }
1879
ir_get_true_false_blocks(const ir_ctx * ctx,uint32_t b,uint32_t * true_block,uint32_t * false_block)1880 void ir_get_true_false_blocks(const ir_ctx *ctx, uint32_t b, uint32_t *true_block, uint32_t *false_block)
1881 {
1882 ir_block *bb;
1883 uint32_t *p, use_block;
1884
1885 *true_block = 0;
1886 *false_block = 0;
1887 bb = &ctx->cfg_blocks[b];
1888 IR_ASSERT(ctx->ir_base[bb->end].op == IR_IF);
1889 IR_ASSERT(bb->successors_count == 2);
1890 p = &ctx->cfg_edges[bb->successors];
1891 use_block = *p;
1892 if (ctx->ir_base[ctx->cfg_blocks[use_block].start].op == IR_IF_TRUE) {
1893 *true_block = ir_skip_empty_target_blocks(ctx, use_block);
1894 use_block = *(p+1);
1895 IR_ASSERT(ctx->ir_base[ctx->cfg_blocks[use_block].start].op == IR_IF_FALSE);
1896 *false_block = ir_skip_empty_target_blocks(ctx, use_block);
1897 } else {
1898 IR_ASSERT(ctx->ir_base[ctx->cfg_blocks[use_block].start].op == IR_IF_FALSE);
1899 *false_block = ir_skip_empty_target_blocks(ctx, use_block);
1900 use_block = *(p+1);
1901 IR_ASSERT(ctx->ir_base[ctx->cfg_blocks[use_block].start].op == IR_IF_TRUE);
1902 *true_block = ir_skip_empty_target_blocks(ctx, use_block);
1903 }
1904 IR_ASSERT(*true_block && *false_block);
1905 }
1906