/*
 * IR - Lightweight JIT Compilation Framework
 * (CFG - Control Flow Graph)
 * Copyright (C) 2022 Zend by Perforce.
 * Authors: Dmitry Stogov <dmitry@php.net>
 */

#include "ir.h"
#include "ir_private.h"

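/*
 * A rough sketch of the block model this file assumes (the IR_IS_BB_START /
 * IR_IS_BB_END predicates in ir_private.h are authoritative): a basic block
 * starts at a control node such as START, ENTRY, BEGIN, MERGE, LOOP_BEGIN,
 * IF_TRUE/IF_FALSE or CASE_*, and ends at a node such as END, LOOP_END, IF,
 * SWITCH, RETURN or UNREACHABLE. For example, a simple diamond
 *
 *         START
 *           |
 *           IF
 *          /  \
 *    IF_TRUE  IF_FALSE
 *       |        |
 *      END      END
 *          \  /
 *         MERGE
 *           |
 *         RETURN
 *
 * yields four basic blocks: {START..IF}, {IF_TRUE..END}, {IF_FALSE..END}
 * and {MERGE..RETURN}.
 */
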
IR_ALWAYS_INLINE void _ir_add_successors(const ir_ctx *ctx, ir_ref ref, ir_worklist *worklist)
{
	ir_use_list *use_list = &ctx->use_lists[ref];
	ir_ref *p, use, n = use_list->count;

	if (n < 2) {
		if (n == 1) {
			use = ctx->use_edges[use_list->refs];
			IR_ASSERT(ir_op_flags[ctx->ir_base[use].op] & IR_OP_FLAG_CONTROL);
			ir_worklist_push(worklist, use);
		}
	} else {
		p = &ctx->use_edges[use_list->refs];
		if (n == 2) {
			use = *p;
			IR_ASSERT(ir_op_flags[ctx->ir_base[use].op] & IR_OP_FLAG_CONTROL);
			ir_worklist_push(worklist, use);
			use = *(p + 1);
			IR_ASSERT(ir_op_flags[ctx->ir_base[use].op] & IR_OP_FLAG_CONTROL);
			ir_worklist_push(worklist, use);
		} else {
			for (; n > 0; p++, n--) {
				use = *p;
				IR_ASSERT(ir_op_flags[ctx->ir_base[use].op] & IR_OP_FLAG_CONTROL);
				ir_worklist_push(worklist, use);
			}
		}
	}
}

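/* The predecessors of a block start are the inputs of a MERGE/LOOP_BEGIN
 * node, or the single control input (op1) of any other start node; START
 * has none. E.g. for MERGE(end1, end2) both end1 and end2 are pushed onto
 * the worklist (an informal summary of the code below). */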
IR_ALWAYS_INLINE void _ir_add_predecessors(const ir_insn *insn, ir_worklist *worklist)
{
	ir_ref n, ref;
	const ir_ref *p;

	if (insn->op == IR_MERGE || insn->op == IR_LOOP_BEGIN) {
		n = insn->inputs_count;
		for (p = insn->ops + 1; n > 0; p++, n--) {
			ref = *p;
			IR_ASSERT(ref);
			ir_worklist_push(worklist, ref);
		}
	} else if (insn->op != IR_START) {
		if (EXPECTED(insn->op1)) {
			ir_worklist_push(worklist, insn->op1);
		}
	}
}

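/*
 * An informal sketch of the construction below (the code is authoritative):
 *
 *   1. Backward DFS from the "stop" nodes (the terminator list hanging off
 *      ir_base[1].op1): pop a BB end, walk the op1 links up to the BB start,
 *      record the start->end pair in _blocks[], then push the predecessors.
 *   2. Forward DFS from the remembered "leaks": the START node and those
 *      IF/SWITCH successors that the backward walk could not reach (e.g.
 *      branches that end in an infinite loop).
 *   3. Number the discovered blocks and fill the blocks[] array.
 *   4. Fill the shared successor/predecessor edge array.
 */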
int ir_build_cfg(ir_ctx *ctx)
{
	ir_ref n, *p, ref, start, end, next;
	uint32_t b;
	ir_insn *insn;
	ir_worklist worklist;
	uint32_t bb_init_flags;
	uint32_t count, bb_count = 0;
	uint32_t edges_count = 0;
	ir_block *blocks, *bb;
	uint32_t *_blocks, *edges;
	ir_use_list *use_list;
	uint32_t len = ir_bitset_len(ctx->insns_count);
	ir_bitset bb_starts = ir_mem_calloc(len * 2, IR_BITSET_BITS / 8);
	ir_bitset bb_leaks = bb_starts + len;
	_blocks = ir_mem_calloc(ctx->insns_count, sizeof(uint32_t));
	ir_worklist_init(&worklist, ctx->insns_count);

	/* First, try a backward DFS starting from the "stop" nodes */

	/* Add all "stop" nodes */
	ref = ctx->ir_base[1].op1;
	while (ref) {
		ir_worklist_push(&worklist, ref);
		ref = ctx->ir_base[ref].op3;
	}

	while (ir_worklist_len(&worklist)) {
		ref = ir_worklist_pop(&worklist);
		insn = &ctx->ir_base[ref];

		if (insn->op == IR_NOP) {
			continue;
		}
		IR_ASSERT(IR_IS_BB_END(insn->op));
		/* Remember the BB end */
		end = ref;
		/* Some successors of IF and SWITCH nodes may be inaccessible to the backward DFS */
		use_list = &ctx->use_lists[end];
		n = use_list->count;
		if (n > 1 || (n == 1 && (ir_op_flags[insn->op] & IR_OP_FLAG_TERMINATOR) != 0)) {
			for (p = &ctx->use_edges[use_list->refs]; n > 0; p++, n--) {
				/* Remember possibly inaccessible successors */
				ir_bitset_incl(bb_leaks, *p);
			}
		}
		/* Skip control nodes until the BB start */
		ref = insn->op1;
		while (1) {
			insn = &ctx->ir_base[ref];
			if (IR_IS_BB_START(insn->op)) {
				break;
			}
			ref = insn->op1; /* follow connected control nodes until the BB start */
		}
		/* Mark the BB start */
		bb_count++;
		_blocks[ref] = end;
		ir_bitset_incl(bb_starts, ref);
		/* Add predecessors */
		_ir_add_predecessors(insn, &worklist);
	}

	/* The backward DFS may miss branches that end in infinite loops. */
	/* Try a forward DFS (in most cases all nodes have already been processed). */

	/* The START node may be inaccessible from the "stop" nodes */
	ir_bitset_incl(bb_leaks, 1);

	/* Add the unprocessed START and the successors of IF and SWITCH nodes */
	IR_BITSET_FOREACH_DIFFERENCE(bb_leaks, bb_starts, len, start) {
		ir_worklist_push(&worklist, start);
	} IR_BITSET_FOREACH_END();

	if (ir_worklist_len(&worklist)) {
		ir_bitset_union(worklist.visited, bb_starts, len);
		do {
			ref = ir_worklist_pop(&worklist);
			insn = &ctx->ir_base[ref];

			if (insn->op == IR_NOP) {
				continue;
			}
			IR_ASSERT(IR_IS_BB_START(insn->op));
			/* Remember the BB start */
			start = ref;
			/* Skip control nodes until the BB end */
			while (1) {
				use_list = &ctx->use_lists[ref];
				n = use_list->count;
				next = IR_UNUSED;
				for (p = &ctx->use_edges[use_list->refs]; n > 0; p++, n--) {
					next = *p;
					insn = &ctx->ir_base[next];
					if ((ir_op_flags[insn->op] & IR_OP_FLAG_CONTROL) && insn->op1 == ref) {
						break;
					}
				}
				IR_ASSERT(next != IR_UNUSED);
				ref = next;
				if (IR_IS_BB_END(insn->op)) {
					break;
				}
			}
			/* Mark the BB start */
			bb_count++;
			_blocks[start] = ref;
			ir_bitset_incl(bb_starts, start);
			/* Add successors */
			_ir_add_successors(ctx, ref, &worklist);
		} while (ir_worklist_len(&worklist));
	}

	IR_ASSERT(bb_count > 0);

	/* Create the array of basic blocks and count successor/predecessor edges for each BB */
	blocks = ir_mem_malloc((bb_count + 1) * sizeof(ir_block));
	b = 1;
	bb = blocks + 1;
	count = 0;
	/* SCCP has already removed UNREACHABLE blocks, otherwise all blocks are marked as UNREACHABLE first */
	bb_init_flags = (ctx->flags2 & IR_CFG_REACHABLE) ? 0 : IR_BB_UNREACHABLE;
	IR_BITSET_FOREACH(bb_starts, len, start) {
		insn = &ctx->ir_base[start];
		if (insn->op == IR_NOP) {
			_blocks[start] = 0;
			continue;
		}
		end = _blocks[start];
		_blocks[start] = b;
		_blocks[end] = b;
		IR_ASSERT(IR_IS_BB_START(insn->op));
		IR_ASSERT(end > start);
		bb->start = start;
		bb->end = end;
		bb->successors = count;
		count += ctx->use_lists[end].count;
		bb->successors_count = 0;
		bb->predecessors = count;
		bb->dom_parent = 0;
		bb->dom_depth = 0;
		bb->dom_child = 0;
		bb->dom_next_child = 0;
		bb->loop_header = 0;
		bb->loop_depth = 0;
		if (insn->op == IR_START) {
			bb->flags = IR_BB_START;
			bb->predecessors_count = 0;
		} else {
			bb->flags = bb_init_flags;
			if (insn->op == IR_MERGE || insn->op == IR_LOOP_BEGIN) {
				n = insn->inputs_count;
				bb->predecessors_count = n;
				edges_count += n;
				count += n;
			} else if (EXPECTED(insn->op1)) {
				if (insn->op == IR_ENTRY) {
					bb->flags |= IR_BB_ENTRY;
					ctx->entries_count++;
				}
				bb->predecessors_count = 1;
				edges_count++;
				count++;
			} else {
				IR_ASSERT(insn->op == IR_BEGIN); /* start of an unreachable block */
				bb->predecessors_count = 0;
			}
		}
		b++;
		bb++;
	} IR_BITSET_FOREACH_END();
	bb_count = b - 1;
	IR_ASSERT(count == edges_count * 2);
	ir_mem_free(bb_starts);

	/* Create the array of successor/predecessor control edges */
	edges = ir_mem_malloc(edges_count * 2 * sizeof(uint32_t));
	bb = blocks + 1;
	for (b = 1; b <= bb_count; b++, bb++) {
		insn = &ctx->ir_base[bb->start];
		if (bb->predecessors_count > 1) {
			uint32_t *q = edges + bb->predecessors;
			n = insn->inputs_count;
			for (p = insn->ops + 1; n > 0; p++, q++, n--) {
				ref = *p;
				IR_ASSERT(ref);
				ir_ref pred_b = _blocks[ref];
				ir_block *pred_bb = &blocks[pred_b];
				IR_ASSERT(pred_b > 0);
				*q = pred_b;
				edges[pred_bb->successors + pred_bb->successors_count++] = b;
			}
		} else if (bb->predecessors_count == 1) {
			ref = insn->op1;
			IR_ASSERT(ref);
			IR_ASSERT(IR_OPND_KIND(ir_op_flags[insn->op], 1) == IR_OPND_CONTROL);
			ir_ref pred_b = _blocks[ref];
			ir_block *pred_bb = &blocks[pred_b];
			edges[bb->predecessors] = pred_b;
			edges[pred_bb->successors + pred_bb->successors_count++] = b;
		}
	}

	ctx->cfg_blocks_count = bb_count;
	ctx->cfg_edges_count = edges_count * 2;
	ctx->cfg_blocks = blocks;
	ctx->cfg_edges = edges;
	ctx->cfg_map = _blocks;

	if (!(ctx->flags2 & IR_CFG_REACHABLE)) {
		uint32_t reachable_count = 0;

		/* Mark reachable blocks */
		ir_worklist_clear(&worklist);
		ir_worklist_push(&worklist, 1);
		while (ir_worklist_len(&worklist) != 0) {
			uint32_t *p;

			reachable_count++;
			b = ir_worklist_pop(&worklist);
			bb = &blocks[b];
			bb->flags &= ~IR_BB_UNREACHABLE;
			n = bb->successors_count;
			if (n > 1) {
				for (p = edges + bb->successors; n > 0; p++, n--) {
					ir_worklist_push(&worklist, *p);
				}
			} else if (n == 1) {
				ir_worklist_push(&worklist, edges[bb->successors]);
			}
		}
		if (reachable_count != ctx->cfg_blocks_count) {
			ir_remove_unreachable_blocks(ctx);
		}
	}

	ir_worklist_free(&worklist);

	return 1;
}

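/* E.g. with the predecessor list {2, 5, 7} and from == 5, the list is
 * compacted in place to {2, 7} and predecessors_count drops to 2 (a sketch;
 * the caller keeps the paired MERGE inputs in sync via
 * ir_remove_merge_input() below). */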
static void ir_remove_predecessor(ir_ctx *ctx, ir_block *bb, uint32_t from)
{
	uint32_t i, *p, *q, n = 0;

	p = q = &ctx->cfg_edges[bb->predecessors];
	for (i = 0; i < bb->predecessors_count; i++, p++) {
		if (*p != from) {
			if (p != q) {
				*q = *p;
			}
			q++;
			n++;
		}
	}
	IR_ASSERT(n != bb->predecessors_count);
	bb->predecessors_count = n;
}

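/* An illustrative sketch: removing the input "from" of MERGE(a, from, c)
 * compacts it to MERGE(a, c), and every dependent PHI(merge, x, y, z) is
 * compacted to PHI(merge, x, z) accordingly. If only one input survives,
 * the MERGE degenerates into a BEGIN and each PHI becomes a COPY of its
 * single remaining operand. */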
static void ir_remove_merge_input(ir_ctx *ctx, ir_ref merge, ir_ref from)
{
	ir_ref i, j, n, k, *p, use;
	ir_insn *use_insn;
	ir_use_list *use_list;
	ir_bitset life_inputs;
	ir_insn *insn = &ctx->ir_base[merge];

	IR_ASSERT(insn->op == IR_MERGE || insn->op == IR_LOOP_BEGIN);
	n = insn->inputs_count;
	i = 1;
	life_inputs = ir_bitset_malloc(n + 1);
	for (j = 1; j <= n; j++) {
		ir_ref input = ir_insn_op(insn, j);

		if (input != from) {
			if (i != j) {
				ir_insn_set_op(insn, i, input);
			}
			ir_bitset_incl(life_inputs, j);
			i++;
		}
	}
	i--;
	if (i == 1) {
		insn->op = IR_BEGIN;
		insn->inputs_count = 1;
		use_list = &ctx->use_lists[merge];
		if (use_list->count > 1) {
			for (k = 0, p = &ctx->use_edges[use_list->refs]; k < use_list->count; k++, p++) {
				use = *p;
				use_insn = &ctx->ir_base[use];
				if (use_insn->op == IR_PHI) {
					/* Convert PHI to COPY */
					i = 2;
					for (j = 2; j <= n; j++) {
						ir_ref input = ir_insn_op(use_insn, j);

						if (ir_bitset_in(life_inputs, j - 1)) {
							use_insn->op1 = ir_insn_op(use_insn, j);
						} else if (input > 0) {
							ir_use_list_remove_all(ctx, input, use);
						}
					}
					use_insn->op = IR_COPY;
					use_insn->op2 = IR_UNUSED;
					use_insn->op3 = IR_UNUSED;
					ir_use_list_remove_all(ctx, merge, use);
				}
			}
		}
	} else {
		insn->inputs_count = i;

		n++;
		use_list = &ctx->use_lists[merge];
		if (use_list->count > 1) {
			for (k = 0, p = &ctx->use_edges[use_list->refs]; k < use_list->count; k++, p++) {
				use = *p;
				use_insn = &ctx->ir_base[use];
				if (use_insn->op == IR_PHI) {
					i = 2;
					for (j = 2; j <= n; j++) {
						ir_ref input = ir_insn_op(use_insn, j);

						if (ir_bitset_in(life_inputs, j - 1)) {
							IR_ASSERT(input);
							if (i != j) {
								ir_insn_set_op(use_insn, i, input);
							}
							i++;
						} else if (input > 0) {
							ir_use_list_remove_all(ctx, input, use);
						}
					}
				}
			}
		}
	}
	ir_mem_free(life_inputs);
	ir_use_list_remove_all(ctx, from, merge);
}


/* A CFG constructed after the SCCP pass has no unreachable BBs; otherwise they must be removed */
int ir_remove_unreachable_blocks(ir_ctx *ctx)
{
	uint32_t b, *p, i;
	uint32_t unreachable_count = 0;
	uint32_t bb_count = ctx->cfg_blocks_count;
	ir_block *bb = ctx->cfg_blocks + 1;

	for (b = 1; b <= bb_count; b++, bb++) {
		if (bb->flags & IR_BB_UNREACHABLE) {
#if 0
			do {if (!unreachable_count) ir_dump_cfg(ctx, stderr);} while(0);
#endif
			if (bb->successors_count) {
				for (i = 0, p = &ctx->cfg_edges[bb->successors]; i < bb->successors_count; i++, p++) {
					ir_block *succ_bb = &ctx->cfg_blocks[*p];

					if (!(succ_bb->flags & IR_BB_UNREACHABLE)) {
						ir_remove_predecessor(ctx, succ_bb, b);
						ir_remove_merge_input(ctx, succ_bb->start, bb->end);
					}
				}
			} else {
				ir_ref prev, ref = bb->end;
				ir_insn *insn = &ctx->ir_base[ref];

				IR_ASSERT(ir_op_flags[insn->op] & IR_OP_FLAG_TERMINATOR);
				/* remove it from the terminators list */
				prev = ctx->ir_base[1].op1;
				if (prev == ref) {
					ctx->ir_base[1].op1 = insn->op3;
				} else {
					while (prev) {
						if (ctx->ir_base[prev].op3 == ref) {
							ctx->ir_base[prev].op3 = insn->op3;
							break;
						}
						prev = ctx->ir_base[prev].op3;
					}
				}
			}
			ctx->cfg_map[bb->start] = 0;
			ctx->cfg_map[bb->end] = 0;
			unreachable_count++;
		}
	}

	if (unreachable_count) {
		ir_block *dst_bb;
		uint32_t n = 1;
		uint32_t *edges;

		dst_bb = bb = ctx->cfg_blocks + 1;
		for (b = 1; b <= bb_count; b++, bb++) {
			if (!(bb->flags & IR_BB_UNREACHABLE)) {
				if (dst_bb != bb) {
					memcpy(dst_bb, bb, sizeof(ir_block));
					ctx->cfg_map[dst_bb->start] = n;
					ctx->cfg_map[dst_bb->end] = n;
				}
				dst_bb->successors_count = 0;
				dst_bb++;
				n++;
			}
		}
		ctx->cfg_blocks_count = bb_count = n - 1;

		/* Rebuild successor/predecessor control edges */
		edges = ctx->cfg_edges;
		bb = ctx->cfg_blocks + 1;
		for (b = 1; b <= bb_count; b++, bb++) {
			ir_insn *insn = &ctx->ir_base[bb->start];
			ir_ref *p, ref;

			n = bb->predecessors_count;
			if (n > 1) {
				uint32_t *q = edges + bb->predecessors;

				IR_ASSERT(n == insn->inputs_count);
				for (p = insn->ops + 1; n > 0; p++, q++, n--) {
					ref = *p;
					IR_ASSERT(ref);
					ir_ref pred_b = ctx->cfg_map[ref];
					ir_block *pred_bb = &ctx->cfg_blocks[pred_b];
					*q = pred_b;
					edges[pred_bb->successors + pred_bb->successors_count++] = b;
				}
			} else if (n == 1) {
				ref = insn->op1;
				IR_ASSERT(ref);
				IR_ASSERT(IR_OPND_KIND(ir_op_flags[insn->op], 1) == IR_OPND_CONTROL);
				ir_ref pred_b = ctx->cfg_map[ref];
				ir_block *pred_bb = &ctx->cfg_blocks[pred_b];
				edges[bb->predecessors] = pred_b;
				edges[pred_bb->successors + pred_bb->successors_count++] = b;
			}
		}
	}

	return 1;
}

#if 0
static void compute_postnum(const ir_ctx *ctx, uint32_t *cur, uint32_t b)
{
	uint32_t i, *p;
	ir_block *bb = &ctx->cfg_blocks[b];

	if (bb->postnum != 0) {
		return;
	}

	if (bb->successors_count) {
		bb->postnum = -1; /* Marker for "currently visiting" */
		p = ctx->cfg_edges + bb->successors;
		i = bb->successors_count;
		do {
			compute_postnum(ctx, cur, *p);
			p++;
		} while (--i);
	}
	bb->postnum = (*cur)++;
}

/* Computes the dominator tree using the algorithm from "A Simple, Fast Dominance Algorithm" by
 * Cooper, Harvey and Kennedy. */
int ir_build_dominators_tree(ir_ctx *ctx)
{
	uint32_t blocks_count, b, postnum;
	ir_block *blocks, *bb;
	uint32_t *edges;
	bool changed;

	ctx->flags2 &= ~IR_NO_LOOPS;

	postnum = 1;
	compute_postnum(ctx, &postnum, 1);

	/* Find immediate dominators */
	blocks = ctx->cfg_blocks;
	edges = ctx->cfg_edges;
	blocks_count = ctx->cfg_blocks_count;
	blocks[1].idom = 1;
	do {
		changed = 0;
		/* Iterating in Reverse Post Order */
		for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) {
			IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
			if (bb->predecessors_count == 1) {
				uint32_t pred_b = edges[bb->predecessors];

				if (blocks[pred_b].idom <= 0) {
					//IR_ASSERT("Wrong blocks order: BB is before its single predecessor");
				} else if (bb->idom != pred_b) {
					bb->idom = pred_b;
					changed = 1;
				}
			} else if (bb->predecessors_count) {
				uint32_t idom = 0;
				uint32_t k = bb->predecessors_count;
				uint32_t *p = edges + bb->predecessors;

				do {
					uint32_t pred_b = *p;
					ir_block *pred_bb = &blocks[pred_b];
					ir_block *idom_bb;

					if (pred_bb->idom > 0) {
						idom = pred_b;
						idom_bb = &blocks[idom];

						while (--k > 0) {
							pred_b = *(++p);
							pred_bb = &blocks[pred_b];
							if (pred_bb->idom > 0) {
								while (idom != pred_b) {
									while (pred_bb->postnum < idom_bb->postnum) {
										pred_b = pred_bb->idom;
										pred_bb = &blocks[pred_b];
									}
									while (idom_bb->postnum < pred_bb->postnum) {
										idom = idom_bb->idom;
										idom_bb = &blocks[idom];
									}
								}
							}
						}

						if (bb->idom != idom) {
							bb->idom = idom;
							changed = 1;
						}
						break;
					}
					p++;
				} while (--k > 0);
			}
		}
	} while (changed);
	blocks[1].idom = 0;
	blocks[1].dom_depth = 0;

	/* Construct the dominator tree */
	for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) {
		IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
		if (bb->idom > 0) {
			ir_block *idom_bb = &blocks[bb->idom];

			bb->dom_depth = idom_bb->dom_depth + 1;
			/* Sort by block number to traverse children in pre-order */
			if (idom_bb->dom_child == 0) {
				idom_bb->dom_child = b;
			} else if (b < idom_bb->dom_child) {
				bb->dom_next_child = idom_bb->dom_child;
				idom_bb->dom_child = b;
			} else {
				int child = idom_bb->dom_child;
				ir_block *child_bb = &blocks[child];

				while (child_bb->dom_next_child > 0 && b > child_bb->dom_next_child) {
					child = child_bb->dom_next_child;
					child_bb = &blocks[child];
				}
				bb->dom_next_child = child_bb->dom_next_child;
				child_bb->dom_next_child = b;
			}
		}
	}

	return 1;
}
#else
/* A single-pass modification of "A Simple, Fast Dominance Algorithm" by
 * Cooper, Harvey and Kennedy that relies on the IR block ordering.
 * It may fall back to the general, slow fixed-point algorithm. */
static int ir_build_dominators_tree_iterative(ir_ctx *ctx);
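/*
 * The blocks are numbered in reverse post order, so (back-edges aside) a
 * dominator always has a smaller number than the blocks it dominates, and
 * the Cooper/Harvey/Kennedy "intersect" can walk idom links guided by block
 * numbers alone. A sketch of one intersection as performed below:
 *
 *     while (idom != pred_b) {
 *         while (pred_b > idom) pred_b = blocks[pred_b].idom;
 *         while (idom > pred_b) idom = blocks[idom].idom;
 *     }
 *
 * Back-edge predecessors (pred_b >= b) cannot be intersected on this single
 * pass; they are queued and later verified to point at proper loop headers,
 * with ir_build_dominators_tree_iterative() as the fallback.
 */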
int ir_build_dominators_tree(ir_ctx *ctx)
{
	uint32_t blocks_count, b;
	ir_block *blocks, *bb;
	uint32_t *edges;
	ir_list worklist;

	ir_list_init(&worklist, ctx->cfg_blocks_count / 2);

	ctx->flags2 |= IR_NO_LOOPS;

	/* Find immediate dominators */
	blocks = ctx->cfg_blocks;
	edges = ctx->cfg_edges;
	blocks_count = ctx->cfg_blocks_count;
	blocks[1].idom = 1;
	blocks[1].dom_depth = 0;

	/* Iterating in Reverse Post Order */
	for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) {
		IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
		IR_ASSERT(bb->predecessors_count > 0);
		uint32_t k = bb->predecessors_count;
		uint32_t *p = edges + bb->predecessors;
		uint32_t idom = *p;
		ir_block *idom_bb;

		if (UNEXPECTED(idom >= b)) {
			/* In rare cases, LOOP_BEGIN.op1 may be a back-edge. Skip back-edges. */
			ctx->flags2 &= ~IR_NO_LOOPS;
			IR_ASSERT(k > 1 && "Wrong blocks order: BB is before its single predecessor");
			ir_list_push(&worklist, idom);
			while (1) {
				k--;
				p++;
				idom = *p;
				if (idom < b) {
					break;
				}
				IR_ASSERT(k > 0);
				ir_list_push(&worklist, idom);
			}
		}
		IR_ASSERT(blocks[idom].idom > 0);
		IR_ASSERT(k != 0);

		while (--k > 0) {
			uint32_t pred_b = *(++p);

			if (pred_b < b) {
				IR_ASSERT(blocks[pred_b].idom > 0);
				while (idom != pred_b) {
					while (pred_b > idom) {
						pred_b = blocks[pred_b].idom;
					}
					while (idom > pred_b) {
						idom = blocks[idom].idom;
					}
				}
			} else {
				ctx->flags2 &= ~IR_NO_LOOPS;
				IR_ASSERT(bb->predecessors_count > 1);
				ir_list_push(&worklist, pred_b);
			}
		}
		bb->idom = idom;
		idom_bb = &blocks[idom];

		bb->dom_depth = idom_bb->dom_depth + 1;
		/* Sort by block number to traverse children in pre-order */
		if (idom_bb->dom_child == 0) {
			idom_bb->dom_child = b;
		} else if (b < idom_bb->dom_child) {
			bb->dom_next_child = idom_bb->dom_child;
			idom_bb->dom_child = b;
		} else {
			int child = idom_bb->dom_child;
			ir_block *child_bb = &blocks[child];

			while (child_bb->dom_next_child > 0 && b > child_bb->dom_next_child) {
				child = child_bb->dom_next_child;
				child_bb = &blocks[child];
			}
			bb->dom_next_child = child_bb->dom_next_child;
			child_bb->dom_next_child = b;
		}
	}

	blocks[1].idom = 0;

	if (ir_list_len(&worklist) != 0) {
		uint32_t dom_depth;
		uint32_t succ_b;
		bool complete = 1;

		/* Check if all the back-edges lead to loop headers */
		do {
			b = ir_list_pop(&worklist);
			bb = &blocks[b];
			succ_b = ctx->cfg_edges[bb->successors];
			if (bb->successors_count != 1) {
				/* LOOP_END/END may be linked with the following ENTRY by a fake edge */
				IR_ASSERT(bb->successors_count == 2);
				if (blocks[succ_b].flags & IR_BB_ENTRY) {
					succ_b = ctx->cfg_edges[bb->successors + 1];
				} else {
					IR_ASSERT(blocks[ctx->cfg_edges[bb->successors + 1]].flags & IR_BB_ENTRY);
				}
			}
			dom_depth = blocks[succ_b].dom_depth;
			while (bb->dom_depth > dom_depth) {
				b = bb->dom_parent;
				bb = &blocks[b];
			}
			if (UNEXPECTED(b != succ_b)) {
				complete = 0;
				break;
			}
		} while (ir_list_len(&worklist) != 0);

		if (UNEXPECTED(!complete)) {
			ir_list_free(&worklist);
			return ir_build_dominators_tree_iterative(ctx);
		}
	}

	ir_list_free(&worklist);

	return 1;
}

static int ir_build_dominators_tree_iterative(ir_ctx *ctx)
{
	bool changed;
	uint32_t blocks_count, b;
	ir_block *blocks, *bb;
	uint32_t *edges;

	/* Find immediate dominators */
	blocks = ctx->cfg_blocks;
	edges = ctx->cfg_edges;
	blocks_count = ctx->cfg_blocks_count;

	/* Clear the dominator tree, but keep already found dominators */
	for (b = 0, bb = &blocks[0]; b <= blocks_count; b++, bb++) {
		bb->dom_depth = 0;
		bb->dom_child = 0;
		bb->dom_next_child = 0;
	}

	/* Find immediate dominators by the iterative fixed-point algorithm */
	blocks[1].idom = 1;
	do {
		changed = 0;

		for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) {
			IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
			IR_ASSERT(bb->predecessors_count > 0);
			uint32_t k = bb->predecessors_count;
			uint32_t *p = edges + bb->predecessors;
			uint32_t idom = *p;

			if (blocks[idom].idom == 0) {
				while (1) {
					k--;
					p++;
					idom = *p;
					if (blocks[idom].idom > 0) {
						break;
					}
					IR_ASSERT(k > 0);
				}
			}
			IR_ASSERT(k != 0);
			while (--k > 0) {
				uint32_t pred_b = *(++p);

				if (blocks[pred_b].idom > 0) {
					while (idom != pred_b) {
						while (pred_b > idom) {
							pred_b = blocks[pred_b].idom;
						}
						while (idom > pred_b) {
							idom = blocks[idom].idom;
						}
					}
				}
			}
			if (bb->idom != idom) {
				bb->idom = idom;
				changed = 1;
			}
		}
	} while (changed);

	/* Build the dominator tree */
	blocks[1].idom = 0;
	blocks[1].dom_depth = 0;
	for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) {
		uint32_t idom = bb->idom;
		ir_block *idom_bb = &blocks[idom];

		bb->dom_depth = idom_bb->dom_depth + 1;
		/* Sort by block number to traverse children in pre-order */
		if (idom_bb->dom_child == 0) {
			idom_bb->dom_child = b;
		} else if (b < idom_bb->dom_child) {
			bb->dom_next_child = idom_bb->dom_child;
			idom_bb->dom_child = b;
		} else {
			int child = idom_bb->dom_child;
			ir_block *child_bb = &blocks[child];

			while (child_bb->dom_next_child > 0 && b > child_bb->dom_next_child) {
				child = child_bb->dom_next_child;
				child_bb = &blocks[child];
			}
			bb->dom_next_child = child_bb->dom_next_child;
			child_bb->dom_next_child = b;
		}
	}

	return 1;
}
#endif

static bool ir_dominates(const ir_block *blocks, uint32_t b1, uint32_t b2)
{
	uint32_t b1_depth = blocks[b1].dom_depth;
	const ir_block *bb2 = &blocks[b2];

	while (bb2->dom_depth > b1_depth) {
		b2 = bb2->dom_parent;
		bb2 = &blocks[b2];
	}
	return b1 == b2;
}

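/*
 * Loop identification works on the DJ graph: each CFG edge is classified
 * against the dominator tree into a D edge (idom -> child) or a J ("join")
 * edge. A back-join edge whose target dominates its source identifies a
 * natural loop; a cross-join edge into a DFS ancestor of the spanning tree
 * signals an irreducible region. The entry/exit timestamps computed below
 * replace explicit ancestor queries (an informal summary; see Sreedhar et
 * al. cited below).
 */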
int ir_find_loops(ir_ctx *ctx)
{
	uint32_t i, j, n, count;
	uint32_t *entry_times, *exit_times, *sorted_blocks, time = 1;
	ir_block *blocks = ctx->cfg_blocks;
	uint32_t *edges = ctx->cfg_edges;
	ir_worklist work;

	if (ctx->flags2 & IR_NO_LOOPS) {
		return 1;
	}

	/* We don't materialize the DJ spanning tree explicitly, as we are only interested in ancestor
	 * queries. These are implemented by checking entry/exit times of the DFS search. */
	ir_worklist_init(&work, ctx->cfg_blocks_count + 1);
	entry_times = ir_mem_malloc((ctx->cfg_blocks_count + 1) * 3 * sizeof(uint32_t));
	exit_times = entry_times + ctx->cfg_blocks_count + 1;
	sorted_blocks = exit_times + ctx->cfg_blocks_count + 1;

	memset(entry_times, 0, (ctx->cfg_blocks_count + 1) * sizeof(uint32_t));

	ir_worklist_push(&work, 1);
	while (ir_worklist_len(&work)) {
		ir_block *bb;
		int child;

next:
		i = ir_worklist_peek(&work);
		if (!entry_times[i]) {
			entry_times[i] = time++;
		}

		/* Visit blocks immediately dominated by i. */
		bb = &blocks[i];
		for (child = bb->dom_child; child > 0; child = blocks[child].dom_next_child) {
			if (ir_worklist_push(&work, child)) {
				goto next;
			}
		}

		/* Visit join edges. */
		if (bb->successors_count) {
			uint32_t *p = edges + bb->successors;
			for (j = 0; j < bb->successors_count; j++, p++) {
				uint32_t succ = *p;

				if (blocks[succ].idom == i) {
					continue;
				} else if (ir_worklist_push(&work, succ)) {
					goto next;
				}
			}
		}
		exit_times[i] = time++;
		ir_worklist_pop(&work);
	}

	/* Sort blocks by level, which is the opposite of the order in which we want to process them */
	sorted_blocks[1] = 1;
	j = 1;
	n = 2;
	while (j != n) {
		i = j;
		j = n;
		for (; i < j; i++) {
			int child;
			for (child = blocks[sorted_blocks[i]].dom_child; child > 0; child = blocks[child].dom_next_child) {
				sorted_blocks[n++] = child;
			}
		}
	}
	count = n;

	/* Identify loops. See Sreedhar et al, "Identifying Loops Using DJ Graphs". */
	while (n > 1) {
		i = sorted_blocks[--n];
		ir_block *bb = &blocks[i];

		if (bb->predecessors_count > 1) {
			bool irreducible = 0;
			uint32_t *p = &edges[bb->predecessors];

			j = bb->predecessors_count;
			do {
				uint32_t pred = *p;

				/* A join edge is one for which the predecessor does not
				   immediately dominate the successor. */
				if (bb->idom != pred) {
					/* In a loop back-edge (back-join edge), the successor dominates
					   the predecessor. */
					if (ir_dominates(blocks, i, pred)) {
						if (!ir_worklist_len(&work)) {
							ir_bitset_clear(work.visited, ir_bitset_len(ir_worklist_capasity(&work)));
						}
						blocks[pred].loop_header = 0; /* support for merged loops */
						ir_worklist_push(&work, pred);
					} else {
						/* Otherwise it's a cross-join edge. See if it's a branch
						   to an ancestor on the DJ spanning tree. */
						if (entry_times[pred] > entry_times[i] && exit_times[pred] < exit_times[i]) {
							irreducible = 1;
						}
					}
				}
				p++;
			} while (--j);

			if (UNEXPECTED(irreducible)) {
				// TODO: Support for irreducible loops ???
				bb->flags |= IR_BB_IRREDUCIBLE_LOOP;
				ctx->flags2 |= IR_IRREDUCIBLE_CFG;
				while (ir_worklist_len(&work)) {
					ir_worklist_pop(&work);
				}
			} else if (ir_worklist_len(&work)) {
				bb->flags |= IR_BB_LOOP_HEADER;
				ctx->flags2 |= IR_CFG_HAS_LOOPS;
				bb->loop_depth = 1;
				while (ir_worklist_len(&work)) {
					j = ir_worklist_pop(&work);
					while (blocks[j].loop_header > 0) {
						j = blocks[j].loop_header;
					}
					if (j != i) {
						ir_block *bb = &blocks[j];
						if (bb->idom == 0 && j != 1) {
							/* Ignore blocks that are unreachable or only abnormally reachable. */
							continue;
						}
						bb->loop_header = i;
						if (bb->predecessors_count) {
							uint32_t *p = &edges[bb->predecessors];
							j = bb->predecessors_count;
							do {
								ir_worklist_push(&work, *p);
								p++;
							} while (--j);
						}
					}
				}
			}
		}
	}

	if (ctx->flags2 & IR_CFG_HAS_LOOPS) {
		for (n = 1; n < count; n++) {
			i = sorted_blocks[n];
			ir_block *bb = &blocks[i];
			if (bb->loop_header > 0) {
				ir_block *loop = &blocks[bb->loop_header];
				uint32_t loop_depth = loop->loop_depth;

				if (bb->flags & IR_BB_LOOP_HEADER) {
					loop_depth++;
				}
				bb->loop_depth = loop_depth;
				if (bb->flags & (IR_BB_ENTRY|IR_BB_LOOP_WITH_ENTRY)) {
					loop->flags |= IR_BB_LOOP_WITH_ENTRY;
					if (loop_depth > 1) {
						/* Set the IR_BB_LOOP_WITH_ENTRY flag for all enclosing loops */
						bb = &blocks[loop->loop_header];
						while (1) {
							if (bb->flags & IR_BB_LOOP_WITH_ENTRY) {
								break;
							}
							bb->flags |= IR_BB_LOOP_WITH_ENTRY;
							if (bb->loop_depth == 1) {
								break;
							}
							bb = &blocks[bb->loop_header]; /* walk up to the next enclosing loop */
						}
					}
				}
			}
		}
	}

	ir_mem_free(entry_times);
	ir_worklist_free(&work);

	return 1;
}


static uint32_t _ir_skip_empty_blocks(const ir_ctx *ctx, uint32_t b)
{
	while (1) {
		ir_block *bb = &ctx->cfg_blocks[b];

		if ((bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) == IR_BB_EMPTY) {
			IR_ASSERT(bb->successors_count == 1);
			b = ctx->cfg_edges[bb->successors];
		} else {
			return b;
		}
	}
}

/* A variation of the "Bottom-up Positioning" algorithm described by
 * Karl Pettis and Robert C. Hansen in "Profile Guided Code Positioning"
 */
typedef struct _ir_edge_info {
	uint32_t from;
	uint32_t to;
	float    freq;
} ir_edge_info;

typedef struct _ir_chain {
	uint32_t head;
	uint32_t next;
	union {
		uint32_t prev;
		uint32_t tail;
	};
} ir_chain;
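
/*
 * A chain is a circular list of blocks that will be emitted sequentially.
 * chains[b].head names the chain a block belongs to (maintained lazily with
 * path compression, see ir_chain_head_path_compress()); for a chain head
 * the union holds the cached "tail", for other members it holds "prev".
 * Because the list is circular, the tail's "next" points back at the head.
 * E.g. the chain (BB1, BB4, BB2) is stored as head(1) = 1, tail(1) = 2,
 * next(1) = 4, next(4) = 2, next(2) = 1 (a sketch of the invariant kept by
 * ir_join_chains() and ir_insert_chain_before() below).
 */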

static int ir_edge_info_cmp(const void *b1, const void *b2)
{
	ir_edge_info *e1 = (ir_edge_info*)b1;
	ir_edge_info *e2 = (ir_edge_info*)b2;

	if (e1->freq != e2->freq) {
		return e1->freq < e2->freq ? 1 : -1;
	}
	/* In case of equal frequencies, try to avoid penalization of one of the "equal" paths by
	 * preferring the first RPO successor (in conditional branches) and the last RPO predecessor
	 * (in merge points).
	 *
	 * See "Static Basic Block Reordering Heuristics for Implicit Control Flow in Baseline JITs"
	 * Polito Guillermo, Ducasse Stephane, and Tesone Pablo (2021)
	 */
	if (e1->from != e2->from) {
		return e2->from - e1->from;
	} else {
		return e1->to - e2->to;
	}
}

static IR_NEVER_INLINE uint32_t ir_chain_head_path_compress(ir_chain *chains, uint32_t src, uint32_t head)
{
	IR_ASSERT(head != 0);
	do {
		head = chains[head].head;
	} while (chains[head].head != head);
	do {
		uint32_t next = chains[src].head;
		chains[src].head = head;
		src = next;
	} while (chains[src].head != head);
	return head;
}

IR_ALWAYS_INLINE uint32_t ir_chain_head(ir_chain *chains, uint32_t src)
{
	uint32_t head = chains[src].head;
	if (chains[head].head == head) {
		return head;
	} else {
		return ir_chain_head_path_compress(chains, src, head);
	}
}

static void ir_join_chains(ir_chain *chains, uint32_t src, uint32_t dst)
{
	uint32_t dst_tail = chains[dst].tail;
	uint32_t src_tail = chains[src].tail;

	chains[dst_tail].next = src;
	chains[dst].prev = src_tail;
	chains[src_tail].next = dst;
	chains[src].tail = dst_tail;
	chains[dst].head = src;
}
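
/* E.g. joining src = (BB1, BB3) and dst = (BB2, BB5) appends dst after src,
 * producing the single circular chain (BB1, BB3, BB2, BB5): next(3) = 2,
 * next(5) = 1, head(2) = 1 and tail(1) = 5, while stale "head" fields of
 * inner members are fixed up lazily by ir_chain_head() (a sketch). */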

static void ir_insert_chain_before(ir_chain *chains, uint32_t c, uint32_t before)
{
	ir_chain *this = &chains[c];
	ir_chain *next = &chains[before];

	IR_ASSERT(chains[c].head == c);
	if (chains[before].head != before) {
		this->head = next->head;
	} else {
		next->head = c;
	}
	this->next = before;
	this->prev = next->prev;
	next->prev = c;
	chains[this->prev].next = c;
}

#ifndef IR_DEBUG_BB_SCHEDULE_GRAPH
# ifdef IR_DEBUG
#  define IR_DEBUG_BB_SCHEDULE_GRAPH 1
# else
#  define IR_DEBUG_BB_SCHEDULE_GRAPH 0
# endif
#endif

#if IR_DEBUG_BB_SCHEDULE_GRAPH
static void ir_dump_cfg_freq_graph(ir_ctx *ctx, float *bb_freq, uint32_t edges_count, ir_edge_info *edges, ir_chain *chains)
{
	uint32_t b, i;
	ir_block *bb;
	uint8_t c, *colors;
	bool is_head, is_empty;
	uint32_t max_colors = 12;

	colors = alloca(sizeof(uint8_t) * (ctx->cfg_blocks_count + 1));
	memset(colors, 0, sizeof(uint8_t) * (ctx->cfg_blocks_count + 1));
	i = 0;
	for (b = 1; b < ctx->cfg_blocks_count + 1; b++) {
		if (chains[b].head == b) {
			colors[b] = (i % max_colors) + 1;
			i++;
		}
	}

	fprintf(stderr, "digraph {\n");
	fprintf(stderr, "\trankdir=TB;\n");
	for (b = 1; b <= ctx->cfg_blocks_count; b++) {
		bb = &ctx->cfg_blocks[b];
		c = (chains[b].head) ? colors[ir_chain_head(chains, b)] : 0;
		is_head = chains[b].head == b;
		is_empty = (bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) == IR_BB_EMPTY;
		if (c) {
			fprintf(stderr, "\tBB%d [label=\"BB%d: (%d),%0.3f\\n%s\\n%s\",colorscheme=set312,fillcolor=%d%s%s]\n", b, b,
				bb->loop_depth, bb_freq[b],
				ir_op_name[ctx->ir_base[bb->start].op], ir_op_name[ctx->ir_base[bb->end].op],
				c,
				is_head ? ",penwidth=3" : "",
				is_empty ? ",style=\"dotted,filled\"" : ",style=\"filled\"");
		} else {
			fprintf(stderr, "\tBB%d [label=\"BB%d: (%d),%0.3f\\n%s\\n%s\"%s%s]\n", b, b,
				bb->loop_depth, bb_freq[b],
				ir_op_name[ctx->ir_base[bb->start].op], ir_op_name[ctx->ir_base[bb->end].op],
				is_head ? ",penwidth=3" : "",
				is_empty ? ",style=\"dotted\"" : "");
		}
	}
	fprintf(stderr, "\n");

	for (i = 0; i < edges_count; i++) {
		fprintf(stderr, "\tBB%d -> BB%d [label=\"%0.3f\"]\n", edges[i].from, edges[i].to, edges[i].freq);
	}
	fprintf(stderr, "}\n");
}
#endif

#ifdef IR_DEBUG
static void ir_dump_edges(ir_ctx *ctx, uint32_t edges_count, ir_edge_info *edges)
{
	uint32_t i;

	fprintf(stderr, "Edges:\n");
	for (i = 0; i < edges_count; i++) {
		fprintf(stderr, "\tBB%d -> BB%d %0.3f\n", edges[i].from, edges[i].to, edges[i].freq);
	}
}

static void ir_dump_chains(ir_ctx *ctx, ir_chain *chains)
{
	uint32_t b, tail, i;

	fprintf(stderr, "Chains:\n");
	for (b = 1; b < ctx->cfg_blocks_count + 1; b++) {
		if (chains[b].head == b) {
			tail = chains[b].tail;
			i = b;
			fprintf(stderr, "(BB%d", i);
			while (i != tail) {
				i = chains[i].next;
				fprintf(stderr, ", BB%d", i);
			}
			fprintf(stderr, ")\n");
		}
	}
}
#endif

static int ir_schedule_blocks_bottom_up(ir_ctx *ctx)
{
	uint32_t max_edges_count = ctx->cfg_edges_count / 2;
	uint32_t edges_count = 0;
	uint32_t b, i, loop_depth;
	float *bb_freq, freq;
	ir_block *bb;
	ir_edge_info *edges, *e;
	ir_chain *chains;
	ir_bitqueue worklist;
	ir_bitset visited;
	uint32_t *schedule_end, count;

	ctx->cfg_schedule = ir_mem_malloc(sizeof(uint32_t) * (ctx->cfg_blocks_count + 2));
	schedule_end = ctx->cfg_schedule + ctx->cfg_blocks_count;

	/* 1. Create initial chains for each BB */
	chains = ir_mem_malloc(sizeof(ir_chain) * (ctx->cfg_blocks_count + 1));
	chains[0].head = 0;
	chains[0].next = 0;
	chains[0].prev = 0;
	for (b = 1; b <= ctx->cfg_blocks_count; b++) {
		chains[b].head = b;
		chains[b].next = b;
		chains[b].prev = b;
	}

	/* 2. Collect information about BB and edge frequencies */
	edges = ir_mem_malloc(sizeof(ir_edge_info) * max_edges_count);
	bb_freq = ir_mem_calloc(ctx->cfg_blocks_count + 1, sizeof(float));
	bb_freq[1] = 1.0f;
	visited = ir_bitset_malloc(ctx->cfg_blocks_count + 1);
	ir_bitqueue_init(&worklist, ctx->cfg_blocks_count + 1);
	ir_bitqueue_add(&worklist, 1);
	while ((b = ir_bitqueue_pop(&worklist)) != (uint32_t)-1) {
restart:
		bb = &ctx->cfg_blocks[b];
		if (bb->predecessors_count) {
			uint32_t n = bb->predecessors_count;
			uint32_t *p = ctx->cfg_edges + bb->predecessors;
			for (; n > 0; p++, n--) {
				uint32_t predecessor = *p;
				/* Basic blocks are ordered so that a predecessor's id is usually less than
				 * the id of its successor. So we may compare block ids (predecessor < b)
				 * instead of the more expensive back-edge check
				 * (b != predecessor && ctx->cfg_blocks[predecessor].loop_header != b)
				 */
				if (predecessor < b) {
					if (!ir_bitset_in(visited, predecessor)) {
						b = predecessor;
						ir_bitqueue_del(&worklist, b);
						goto restart;
					}
				} else if (b != predecessor && ctx->cfg_blocks[predecessor].loop_header != b) {
					ir_dump_cfg(ctx, stderr);
					IR_ASSERT(b == predecessor || ctx->cfg_blocks[predecessor].loop_header == b);
				}
			}
		}

		ir_bitset_incl(visited, b);

		if ((bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) == IR_BB_EMPTY) {
			uint32_t successor = ctx->cfg_edges[bb->successors];

			/* move empty blocks to the end */
			IR_ASSERT(chains[b].head == b);
			chains[b].head = 0;
			*schedule_end = b;
			schedule_end--;

			if (successor > b) {
				bb_freq[successor] += bb_freq[b];
				b = successor;
				ir_bitqueue_del(&worklist, b);
				goto restart;
			} else {
				continue;
			}
		}

		loop_depth = bb->loop_depth;
		if (bb->flags & IR_BB_LOOP_HEADER) {
			// TODO: Estimate the loop iterations count
			bb_freq[b] *= 10;
		}

		if (bb->successors_count) {
			uint32_t n = bb->successors_count;
			uint32_t *p = ctx->cfg_edges + bb->successors;

			if (n == 1) {
				uint32_t successor = *p;

				IR_ASSERT(edges_count < max_edges_count);
				freq = bb_freq[b];
				if (successor > b) {
					IR_ASSERT(!ir_bitset_in(visited, successor));
					bb_freq[successor] += freq;
					ir_bitqueue_add(&worklist, successor);
				}
				successor = _ir_skip_empty_blocks(ctx, successor);
				edges[edges_count].from = b;
				edges[edges_count].to = successor;
				edges[edges_count].freq = freq;
				edges_count++;
			} else if (n == 2 && ctx->ir_base[bb->end].op == IR_IF) {
				uint32_t successor1 = *p;
				ir_block *successor1_bb = &ctx->cfg_blocks[successor1];
				ir_insn *insn1 = &ctx->ir_base[successor1_bb->start];
				uint32_t successor2 = *(p + 1);
				ir_block *successor2_bb = &ctx->cfg_blocks[successor2];
				ir_insn *insn2 = &ctx->ir_base[successor2_bb->start];
				int prob1, prob2, probN = 100;

				if (insn1->op2) {
					prob1 = insn1->op2;
					if (insn2->op2) {
						prob2 = insn2->op2;
						probN = prob1 + prob2;
					} else {
						if (prob1 > 100) {
							prob1 = 100;
						}
						prob2 = 100 - prob1;
					}
				} else if (insn2->op2) {
					prob2 = insn2->op2;
					if (prob2 > 100) {
						prob2 = 100;
					}
					prob1 = 100 - prob2;
				} else if (successor1_bb->loop_depth >= loop_depth
						&& successor2_bb->loop_depth < loop_depth) {
					prob1 = 90;
					prob2 = 10;
				} else if (successor1_bb->loop_depth < loop_depth
						&& successor2_bb->loop_depth >= loop_depth) {
					prob1 = 10;
					prob2 = 90;
				} else if (successor2_bb->flags & IR_BB_EMPTY) {
					prob1 = 51;
					prob2 = 49;
				} else if (successor1_bb->flags & IR_BB_EMPTY) {
					prob1 = 49;
					prob2 = 51;
				} else {
					prob1 = prob2 = 50;
				}
				do {
					freq = bb_freq[b] * (float)prob1 / (float)probN;
					if (successor1 > b) {
						IR_ASSERT(!ir_bitset_in(visited, successor1));
						bb_freq[successor1] += freq;
						if (successor1_bb->successors_count == 0 && insn1->op2 == 1) {
							/* move a cold block without successors to the end */
							IR_ASSERT(chains[successor1].head == successor1);
							chains[successor1].head = 0;
							*schedule_end = successor1;
							schedule_end--;
							break;
						} else {
							ir_bitqueue_add(&worklist, successor1);
						}
					}
					/* try to join edges early to reduce the number of edges and the cost of their sorting */
					if (prob1 > prob2
					 && (successor1_bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) != IR_BB_EMPTY) {
						uint32_t src = chains[b].next;
						IR_ASSERT(chains[src].head == src);
						IR_ASSERT(src == ir_chain_head(chains, b));
						IR_ASSERT(chains[successor1].head == successor1);
						ir_join_chains(chains, src, successor1);
						if (!IR_DEBUG_BB_SCHEDULE_GRAPH) break;
					}
					successor1 = _ir_skip_empty_blocks(ctx, successor1);
					IR_ASSERT(edges_count < max_edges_count);
					edges[edges_count].from = b;
					edges[edges_count].to = successor1;
					edges[edges_count].freq = freq;
					edges_count++;
				} while (0);
				do {
					freq = bb_freq[b] * (float)prob2 / (float)probN;
					if (successor2 > b) {
						IR_ASSERT(!ir_bitset_in(visited, successor2));
						bb_freq[successor2] += freq;
						if (successor2_bb->successors_count == 0 && insn2->op2 == 1) {
							/* move a cold block without successors to the end */
							IR_ASSERT(chains[successor2].head == successor2);
							chains[successor2].head = 0;
							*schedule_end = successor2;
							schedule_end--;
							break;
						} else {
							ir_bitqueue_add(&worklist, successor2);
						}
					}
					if (prob2 > prob1
					 && (successor2_bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) != IR_BB_EMPTY) {
						uint32_t src = chains[b].next;
						IR_ASSERT(chains[src].head == src);
						IR_ASSERT(src == ir_chain_head(chains, b));
						IR_ASSERT(chains[successor2].head == successor2);
						ir_join_chains(chains, src, successor2);
						if (!IR_DEBUG_BB_SCHEDULE_GRAPH) break;
					}
					successor2 = _ir_skip_empty_blocks(ctx, successor2);
					IR_ASSERT(edges_count < max_edges_count);
					edges[edges_count].from = b;
					edges[edges_count].to = successor2;
					edges[edges_count].freq = freq;
					edges_count++;
				} while (0);
			} else {
				int prob;

				for (; n > 0; p++, n--) {
					uint32_t successor = *p;
					ir_block *successor_bb = &ctx->cfg_blocks[successor];
					ir_insn *insn = &ctx->ir_base[successor_bb->start];

					if (insn->op == IR_CASE_DEFAULT) {
						prob = insn->op2;
						if (!prob) {
							prob = 100 / bb->successors_count;
						}
					} else if (insn->op == IR_CASE_VAL) {
						prob = insn->op3;
						if (!prob) {
							prob = 100 / bb->successors_count;
						}
					} else if (insn->op == IR_ENTRY) {
						if ((ctx->flags & IR_MERGE_EMPTY_ENTRIES) && (successor_bb->flags & IR_BB_EMPTY)) {
							prob = 99; /* prefer an empty ENTRY block to go first */
						} else {
							prob = 1;
						}
					} else {
						prob = 100 / bb->successors_count;
					}
					freq = bb_freq[b] * (float)prob / 100.0f;
					if (successor > b) {
						IR_ASSERT(!ir_bitset_in(visited, successor));
						bb_freq[successor] += freq;
						ir_bitqueue_add(&worklist, successor);
					}
					successor = _ir_skip_empty_blocks(ctx, successor);
					IR_ASSERT(edges_count < max_edges_count);
					edges[edges_count].from = b;
					edges[edges_count].to = successor;
					edges[edges_count].freq = freq;
					edges_count++;
				}
			}
		}
	}
	ir_bitqueue_free(&worklist);
	ir_mem_free(visited);

	/* 3. Sort edges according to their frequencies */
	qsort(edges, edges_count, sizeof(ir_edge_info), ir_edge_info_cmp);

#ifdef IR_DEBUG
	if (ctx->flags & IR_DEBUG_BB_SCHEDULE) {
		ir_dump_edges(ctx, edges_count, edges);
	}
#endif

	/* 4. Process edges in decreasing frequency order and join the connected chains */
	for (e = edges, i = edges_count; i > 0; e++, i--) {
		uint32_t dst = chains[e->to].head;
		if (dst == e->to) {
			uint32_t src = chains[e->from].next;
			if (chains[src].head == src) {
				IR_ASSERT(src == ir_chain_head(chains, e->from) && chains[src].tail == e->from);
				if (src != dst) {
					ir_join_chains(chains, src, dst);
				} else if (ctx->cfg_blocks[e->from].successors_count < 2) {
					/* Try to rotate the loop chain to finish it with a conditional branch */
					uint32_t tail = e->from;
					uint32_t prev = src;
					uint32_t next = chains[prev].next;
					uint32_t best = 0;

					while (prev != tail) {
						if (ctx->ir_base[ctx->cfg_blocks[prev].end].op == IR_IF) {
							if (ctx->ir_base[ctx->cfg_blocks[prev].start].op == IR_LOOP_BEGIN
							 && ctx->cfg_blocks[prev].loop_depth > 1) {
								best = next;
								break;
							} else if (!best || bb_freq[next] < bb_freq[best]) {
								/* Find the successor of IF with the least frequency */
								best = next;
							}
						}
						prev = next;
						next = chains[next].next;
					}
					if (best) {
						/* change the head of the chain */
						chains[src].head = best;
						chains[best].head = best;
					}
				}
#if !IR_DEBUG_BB_SCHEDULE_GRAPH
				e->from = 0; /* reset "from" to avoid the check at step #7 */
#endif
			}
		}
	}

#if IR_DEBUG_BB_SCHEDULE_GRAPH
	if (ctx->flags & IR_DEBUG_BB_SCHEDULE) {
		ir_dump_cfg_freq_graph(ctx, bb_freq, edges_count, edges, chains);
	}
#endif

	ir_mem_free(bb_freq);

#ifdef IR_DEBUG
	if (ctx->flags & IR_DEBUG_BB_SCHEDULE) {
		ir_dump_chains(ctx, chains);
	}
#endif

	/* 5. Merge empty entry blocks */
	if ((ctx->flags & IR_MERGE_EMPTY_ENTRIES) && ctx->entries_count) {
		for (i = 0; i < ctx->entries_count; i++) {
			b = ctx->entries[i];
			IR_ASSERT(ctx->cfg_blocks[b].flags & IR_BB_ENTRY);
			if (b && chains[b].head == b && chains[b].tail == b) {
				bb = &ctx->cfg_blocks[b];
				if (bb->flags & IR_BB_EMPTY) {
					uint32_t successor;

					do {
						IR_ASSERT(bb->successors_count == 1);
						successor = ctx->cfg_edges[bb->successors];
						bb = &ctx->cfg_blocks[successor];
					} while ((bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) == IR_BB_EMPTY);
					IR_ASSERT(chains[successor].head);
					ir_insert_chain_before(chains, b, successor);
				}
			}
		}

#ifdef IR_DEBUG
		if (ctx->flags & IR_DEBUG_BB_SCHEDULE) {
			ir_dump_chains(ctx, chains);
		}
#endif
	}

	/* 6. Align loop headers */
	for (b = 1; b <= ctx->cfg_blocks_count; b++) {
		if (chains[b].head == b) {
			bb = &ctx->cfg_blocks[b];
			if (bb->loop_depth) {
				if ((bb->flags & IR_BB_LOOP_HEADER) || ir_chain_head(chains, bb->loop_header) == b) {
					bb->flags |= IR_BB_ALIGN_LOOP;
				}
			}
		}
	}

	/* 7. Group chains according to the most frequent edge between them */
	// TODO: Try to find a better heuristic
	for (e = edges, i = edges_count; i > 0; e++, i--) {
#if !IR_DEBUG_BB_SCHEDULE_GRAPH
		if (!e->from) continue;
#endif
		uint32_t src = ir_chain_head(chains, e->from);
		uint32_t dst = ir_chain_head(chains, e->to);
		if (src != dst) {
			if (dst == 1) {
				ir_join_chains(chains, dst, src);
			} else {
				ir_join_chains(chains, src, dst);
			}
		}
	}

#ifdef IR_DEBUG
	if (ctx->flags & IR_DEBUG_BB_SCHEDULE) {
		ir_dump_chains(ctx, chains);
	}
#endif

	/* 8. Form the final BB order */
	count = 0;
	for (b = 1; b <= ctx->cfg_blocks_count; b++) {
		if (chains[b].head == b) {
			uint32_t tail = chains[b].tail;
			uint32_t i = b;
			while (1) {
				count++;
				ctx->cfg_schedule[count] = i;
				if (i == tail) break;
				i = chains[i].next;
			}
		}
	}

	IR_ASSERT(ctx->cfg_schedule + count == schedule_end);
	ctx->cfg_schedule[ctx->cfg_blocks_count + 1] = 0;

	ir_mem_free(edges);
	ir_mem_free(chains);

	return 1;
}

/* A variation of the "Top-down Positioning" algorithm described by
 * Karl Pettis and Robert C. Hansen in "Profile Guided Code Positioning"
 */
static int ir_schedule_blocks_top_down(ir_ctx *ctx)
{
	ir_bitqueue blocks;
	uint32_t b, best_successor, last_non_empty;
	ir_block *bb, *best_successor_bb;
	ir_insn *insn;
	uint32_t *list, *schedule_end;
	uint32_t count = 0;

	ir_bitqueue_init(&blocks, ctx->cfg_blocks_count + 1);
	blocks.pos = 0;
	list = ir_mem_malloc(sizeof(uint32_t) * (ctx->cfg_blocks_count + 2));
	list[ctx->cfg_blocks_count + 1] = 0;
	schedule_end = list + ctx->cfg_blocks_count;
	for (b = 1; b <= ctx->cfg_blocks_count; b++) {
		ir_bitset_incl(blocks.set, b);
	}

	while ((b = ir_bitqueue_pop(&blocks)) != (uint32_t)-1) {
		bb = &ctx->cfg_blocks[b];
		/* Start a trace */
		last_non_empty = 0;
		do {
			if (UNEXPECTED(bb->flags & IR_BB_PREV_EMPTY_ENTRY) && ir_bitqueue_in(&blocks, b - 1)) {
				/* Schedule the previous empty ENTRY block before this one */
				uint32_t predecessor = b - 1;

				ir_bitqueue_del(&blocks, predecessor);
				count++;
				list[count] = predecessor;
			}
			if ((bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) == IR_BB_EMPTY) {
				/* move empty blocks to the end */
				*schedule_end = b;
				schedule_end--;
			} else {
				count++;
				list[count] = b;
				last_non_empty = b;
			}
			best_successor_bb = NULL;
			if (bb->successors_count == 1) {
				best_successor = ctx->cfg_edges[bb->successors];
				if (ir_bitqueue_in(&blocks, best_successor)) {
					best_successor_bb = &ctx->cfg_blocks[best_successor];
				}
			} else if (bb->successors_count > 1) {
				uint32_t prob, best_successor_prob;
				uint32_t *p, successor;
				ir_block *successor_bb;

				for (b = 0, p = &ctx->cfg_edges[bb->successors]; b < bb->successors_count; b++, p++) {
					successor = *p;
					if (ir_bitqueue_in(&blocks, successor)) {
						successor_bb = &ctx->cfg_blocks[successor];
						insn = &ctx->ir_base[successor_bb->start];
						if (insn->op == IR_IF_TRUE || insn->op == IR_IF_FALSE) {
							prob = insn->op2;
							if (!prob) {
								prob = 100 / bb->successors_count;
								if (!(successor_bb->flags & IR_BB_EMPTY)) {
									prob++;
								}
							}
						} else if (insn->op == IR_CASE_DEFAULT) {
							prob = insn->op2;
							if (!prob) {
								prob = 100 / bb->successors_count;
							}
						} else if (insn->op == IR_CASE_VAL) {
							prob = insn->op3;
							if (!prob) {
								prob = 100 / bb->successors_count;
							}
						} else if (insn->op == IR_ENTRY) {
							if ((ctx->flags & IR_MERGE_EMPTY_ENTRIES) && (successor_bb->flags & IR_BB_EMPTY)) {
								prob = 99; /* prefer an empty ENTRY block to go first */
							} else {
								prob = 1;
							}
						} else {
							prob = 100 / bb->successors_count;
						}
						if (!best_successor_bb
						 || successor_bb->loop_depth > best_successor_bb->loop_depth
						 || prob > best_successor_prob) {
							best_successor = successor;
							best_successor_bb = successor_bb;
							best_successor_prob = prob;
						}
					}
				}
			}
			if (!best_successor_bb) {
				/* Try to continue the trace through the other successor of the last IF */
				if ((bb->flags & IR_BB_EMPTY) && last_non_empty) {
					bb = &ctx->cfg_blocks[last_non_empty];
					if (bb->successors_count == 2 && ctx->ir_base[bb->end].op == IR_IF) {
						b = ctx->cfg_edges[bb->successors];

						if (!ir_bitqueue_in(&blocks, b)) {
							b = ctx->cfg_edges[bb->successors + 1];
						}
						if (ir_bitqueue_in(&blocks, b)) {
							bb = &ctx->cfg_blocks[b];
							ir_bitqueue_del(&blocks, b);
							continue;
						}
					}
				}
				/* End the trace */
				break;
			}
			b = best_successor;
			bb = best_successor_bb;
			ir_bitqueue_del(&blocks, b);
		} while (1);
	}

	IR_ASSERT(list + count == schedule_end);
	ctx->cfg_schedule = list;
	ir_bitqueue_free(&blocks);

	return 1;
}

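/*
 * The entry point of BB scheduling. Branches that lead to UNREACHABLE
 * blocks (except those reached through a tail call) get a probability of 1
 * if none was recorded, so that both layout algorithms above and below
 * treat them as cold and move them out of the hot trace (a summary of the
 * loop below).
 */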
int ir_schedule_blocks(ir_ctx *ctx)
{
	ir_ref ref;

	if (ctx->cfg_blocks_count <= 2) {
		return 1;
	}

	/* Mark "stop" blocks terminated with UNREACHABLE as "unexpected" */
	ref = ctx->ir_base[1].op1;
	while (ref) {
		ir_insn *insn = &ctx->ir_base[ref];

		if (insn->op == IR_UNREACHABLE && ctx->ir_base[insn->op1].op != IR_TAILCALL) {
			ir_block *bb = &ctx->cfg_blocks[ctx->cfg_map[ref]];
			uint32_t n = bb->predecessors_count;

			if (n == 1) {
				ir_insn *start_insn = &ctx->ir_base[bb->start];
				if (start_insn->op == IR_IF_TRUE
				 || start_insn->op == IR_IF_FALSE
				 || start_insn->op == IR_CASE_DEFAULT) {
					if (!start_insn->op2) start_insn->op2 = 1;
				} else if (start_insn->op == IR_CASE_VAL) {
					if (!start_insn->op3) start_insn->op3 = 1;
				}
			} else if (n > 1) {
				uint32_t *p = &ctx->cfg_edges[bb->predecessors];

				for (; n > 0; p++, n--) {
					bb = &ctx->cfg_blocks[*p];
					if (bb->predecessors_count == 1) {
						ir_insn *start_insn = &ctx->ir_base[bb->start];
						if (start_insn->op == IR_IF_TRUE
						 || start_insn->op == IR_IF_FALSE
						 || start_insn->op == IR_CASE_DEFAULT) {
							if (!start_insn->op2) start_insn->op2 = 1;
						} else if (start_insn->op == IR_CASE_VAL) {
							if (!start_insn->op3) start_insn->op3 = 1;
						}
					}
				}
			}
		}
		ref = insn->op3;
	}

	/* The bottom-up Pettis-Hansen algorithm is expensive - O(n^3),
	 * use it only for relatively small functions.
	 *
	 * TODO: make the choice between the top-down and the bottom-up algorithm configurable
	 */
	if (UNEXPECTED(ctx->flags2 & IR_IRREDUCIBLE_CFG) || ctx->cfg_blocks_count > 256) {
		return ir_schedule_blocks_top_down(ctx);
	} else {
		return ir_schedule_blocks_bottom_up(ctx);
	}
}


/* JMP target optimisation */
uint32_t ir_skip_empty_target_blocks(const ir_ctx *ctx, uint32_t b)
{
	return _ir_skip_empty_blocks(ctx, b);
}

uint32_t ir_next_block(const ir_ctx *ctx, uint32_t b)
{
	ir_block *bb;

	if (ctx->cfg_schedule) {
		uint32_t ret = ctx->cfg_schedule[++b];

		/* Check for an empty ENTRY block */
		while (ret && ((ctx->cfg_blocks[ret].flags & (IR_BB_START|IR_BB_EMPTY)) == IR_BB_EMPTY)) {
			ret = ctx->cfg_schedule[++b];
		}
		return ret;
	}

	b++;
	while (1) {
		if (b > ctx->cfg_blocks_count) {
			return 0;
		}

		bb = &ctx->cfg_blocks[b];

		if ((bb->flags & (IR_BB_START|IR_BB_EMPTY)) == IR_BB_EMPTY) {
			b++;
		} else {
			break;
		}
	}
	return b;
}

void ir_get_true_false_blocks(const ir_ctx *ctx, uint32_t b, uint32_t *true_block, uint32_t *false_block)
{
	ir_block *bb;
	uint32_t *p, use_block;

	*true_block = 0;
	*false_block = 0;
	bb = &ctx->cfg_blocks[b];
	IR_ASSERT(ctx->ir_base[bb->end].op == IR_IF);
	IR_ASSERT(bb->successors_count == 2);
	p = &ctx->cfg_edges[bb->successors];
	use_block = *p;
	if (ctx->ir_base[ctx->cfg_blocks[use_block].start].op == IR_IF_TRUE) {
		*true_block = ir_skip_empty_target_blocks(ctx, use_block);
		use_block = *(p + 1);
		IR_ASSERT(ctx->ir_base[ctx->cfg_blocks[use_block].start].op == IR_IF_FALSE);
		*false_block = ir_skip_empty_target_blocks(ctx, use_block);
	} else {
		IR_ASSERT(ctx->ir_base[ctx->cfg_blocks[use_block].start].op == IR_IF_FALSE);
		*false_block = ir_skip_empty_target_blocks(ctx, use_block);
		use_block = *(p + 1);
		IR_ASSERT(ctx->ir_base[ctx->cfg_blocks[use_block].start].op == IR_IF_TRUE);
		*true_block = ir_skip_empty_target_blocks(ctx, use_block);
	}
	IR_ASSERT(*true_block && *false_block);
}