xref: /php-src/ext/opcache/jit/ir/ir_cfg.c (revision 9cb48c8f)
1 /*
2  * IR - Lightweight JIT Compilation Framework
3  * (CFG - Control Flow Graph)
4  * Copyright (C) 2022 Zend by Perforce.
5  * Authors: Dmitry Stogov <dmitry@php.net>
6  */
7 
8 #include "ir.h"
9 #include "ir_private.h"
10 
11 IR_ALWAYS_INLINE void _ir_add_successors(const ir_ctx *ctx, ir_ref ref, ir_worklist *worklist)
12 {
13 	ir_use_list *use_list = &ctx->use_lists[ref];
14 	ir_ref *p, use, n = use_list->count;
15 
16 	if (n < 2) {
17 		if (n == 1) {
18 			use = ctx->use_edges[use_list->refs];
19 			IR_ASSERT(ir_op_flags[ctx->ir_base[use].op] & IR_OP_FLAG_CONTROL);
20 			ir_worklist_push(worklist, use);
21 		}
22 	} else {
23 		p = &ctx->use_edges[use_list->refs];
24 		if (n == 2) {
25 			use = *p;
26 			IR_ASSERT(ir_op_flags[ctx->ir_base[use].op] & IR_OP_FLAG_CONTROL);
27 			ir_worklist_push(worklist, use);
28 			use = *(p + 1);
29 			IR_ASSERT(ir_op_flags[ctx->ir_base[use].op] & IR_OP_FLAG_CONTROL);
30 			ir_worklist_push(worklist, use);
31 		} else {
32 			for (; n > 0; p++, n--) {
33 				use = *p;
34 				IR_ASSERT(ir_op_flags[ctx->ir_base[use].op] & IR_OP_FLAG_CONTROL);
35 				ir_worklist_push(worklist, use);
36 			}
37 		}
38 	}
39 }
40 
41 IR_ALWAYS_INLINE void _ir_add_predecessors(const ir_insn *insn, ir_worklist *worklist)
42 {
43 	ir_ref n, ref;
44 	const ir_ref *p;
45 
46 	if (insn->op == IR_MERGE || insn->op == IR_LOOP_BEGIN) {
47 		n = insn->inputs_count;
48 		for (p = insn->ops + 1; n > 0; p++, n--) {
49 			ref = *p;
50 			IR_ASSERT(ref);
51 			ir_worklist_push(worklist, ref);
52 		}
53 	} else if (insn->op != IR_START) {
54 		if (EXPECTED(insn->op1)) {
55 			ir_worklist_push(worklist, insn->op1);
56 		}
57 	}
58 }
59 
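/* Build the CFG in two passes (a summary of the code below): a backward DFS
 * from the "stop" nodes marks basic-block ends and starts, then a forward DFS
 * from the remaining "leak" nodes (the START node and IF/SWITCH successors the
 * backward pass could not reach) picks up blocks that only lead into infinite
 * loops.  Finally the ir_block array and the successor/predecessor edge lists
 * are materialized, and unreachable blocks are removed unless SCCP already
 * guaranteed reachability (IR_CFG_REACHABLE). */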
60 int ir_build_cfg(ir_ctx *ctx)
61 {
62 	ir_ref n, *p, ref, start, end, next;
63 	uint32_t b;
64 	ir_insn *insn;
65 	ir_worklist worklist;
66 	uint32_t bb_init_flags;
67 	uint32_t count, bb_count = 0;
68 	uint32_t edges_count = 0;
69 	ir_block *blocks, *bb;
70 	uint32_t *_blocks, *edges;
71 	ir_use_list *use_list;
72 	uint32_t len = ir_bitset_len(ctx->insns_count);
73 	ir_bitset bb_starts = ir_mem_calloc(len * 2, IR_BITSET_BITS / 8);
74 	ir_bitset bb_leaks = bb_starts + len;
75 	_blocks = ir_mem_calloc(ctx->insns_count, sizeof(uint32_t));
76 	ir_worklist_init(&worklist, ctx->insns_count);
77 
78 	/* First, try a backward DFS starting from the "stop" nodes */
79 
80 	/* Add all "stop" nodes */
81 	ref = ctx->ir_base[1].op1;
82 	while (ref) {
83 		ir_worklist_push(&worklist, ref);
84 		ref = ctx->ir_base[ref].op3;
85 	}
86 
87 	while (ir_worklist_len(&worklist)) {
88 		ref = ir_worklist_pop(&worklist);
89 		insn = &ctx->ir_base[ref];
90 
91 		if (insn->op == IR_NOP) {
92 			continue;
93 		}
94 		IR_ASSERT(IR_IS_BB_END(insn->op));
95 		/* Remember BB end */
96 		end = ref;
97 		/* Some successors of IF and SWITCH nodes may be inaccessible by backward DFS */
98 		use_list = &ctx->use_lists[end];
99 		n = use_list->count;
100 		if (n > 1 || (n == 1 && (ir_op_flags[insn->op] & IR_OP_FLAG_TERMINATOR) != 0)) {
101 			for (p = &ctx->use_edges[use_list->refs]; n > 0; p++, n--) {
102 				/* Remember possible inaccessible successors */
103 				ir_bitset_incl(bb_leaks, *p);
104 			}
105 		}
106 		/* Skip control nodes until BB start */
107 		ref = insn->op1;
108 		while (1) {
109 			insn = &ctx->ir_base[ref];
110 			if (IR_IS_BB_START(insn->op)) {
111 				break;
112 			}
113 			ref = insn->op1; // follow connected control blocks until BB start
114 		}
115 		/* Mark BB Start */
116 		bb_count++;
117 		_blocks[ref] = end;
118 		ir_bitset_incl(bb_starts, ref);
119 		/* Add predecessors */
120 		_ir_add_predecessors(insn, &worklist);
121 	}
122 
123 	/* The backward DFS may miss some branches that end in infinite loops. */
124 	/* Try a forward DFS (in most cases all nodes are already processed). */
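	/* Illustrative example (hypothetical IR): if one arm of an IF ends in an
	 * infinite loop,
	 *
	 *     START -> IF -+-> IF_TRUE  -> ... -> RETURN -> STOP
	 *                  +-> IF_FALSE -> LOOP_BEGIN -> ... -> LOOP_END -> (back to LOOP_BEGIN)
	 *
	 * only the RETURN/STOP arm is reachable by walking op1 links back from the
	 * terminator list, so the looping arm is only discovered by the forward
	 * pass below, which starts from the IF successors recorded in bb_leaks. */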
125 
126 	/* START node may be inaccessible from "stop" nodes */
127 	ir_bitset_incl(bb_leaks, 1);
128 
129 	/* Add the unprocessed START node and IF/SWITCH successors */
130 	IR_BITSET_FOREACH_DIFFERENCE(bb_leaks, bb_starts, len, start) {
131 		ir_worklist_push(&worklist, start);
132 	} IR_BITSET_FOREACH_END();
133 
134 	if (ir_worklist_len(&worklist)) {
135 		ir_bitset_union(worklist.visited, bb_starts, len);
136 		do {
137 			ref = ir_worklist_pop(&worklist);
138 			insn = &ctx->ir_base[ref];
139 
140 			if (insn->op == IR_NOP) {
141 				continue;
142 			}
143 			IR_ASSERT(IR_IS_BB_START(insn->op));
144 			/* Remember BB start */
145 			start = ref;
146 			/* Skip control nodes until BB end */
147 			while (1) {
148 				use_list = &ctx->use_lists[ref];
149 				n = use_list->count;
150 				next = IR_UNUSED;
151 				for (p = &ctx->use_edges[use_list->refs]; n > 0; p++, n--) {
152 					next = *p;
153 					insn = &ctx->ir_base[next];
154 					if ((ir_op_flags[insn->op] & IR_OP_FLAG_CONTROL) && insn->op1 == ref) {
155 						break;
156 					}
157 				}
158 				IR_ASSERT(next != IR_UNUSED);
159 				ref = next;
160 				if (IR_IS_BB_END(insn->op)) {
161 					break;
162 				}
163 			}
164 			/* Mark BB Start */
165 			bb_count++;
166 			_blocks[start] = ref;
167 			ir_bitset_incl(bb_starts, start);
168 			/* Add successors */
169 			_ir_add_successors(ctx, ref, &worklist);
170 		} while (ir_worklist_len(&worklist));
171 	}
172 
173 	IR_ASSERT(bb_count > 0);
174 
175 	/* Create array of basic blocks and count successor/predecessors edges for each BB */
176 	blocks = ir_mem_malloc((bb_count + 1) * sizeof(ir_block));
177 	b = 1;
178 	bb = blocks + 1;
179 	count = 0;
180 	/* SCCP already removed UNREACHABLE blocks; otherwise all blocks are marked as UNREACHABLE first */
181 	bb_init_flags = (ctx->flags2 & IR_CFG_REACHABLE) ? 0 : IR_BB_UNREACHABLE;
182 	IR_BITSET_FOREACH(bb_starts, len, start) {
183 		insn = &ctx->ir_base[start];
184 		if (insn->op == IR_NOP) {
185 			_blocks[start] = 0;
186 			continue;
187 		}
188 		end = _blocks[start];
189 		_blocks[start] = b;
190 		_blocks[end] = b;
191 		IR_ASSERT(IR_IS_BB_START(insn->op));
192 		IR_ASSERT(end > start);
193 		bb->start = start;
194 		bb->end = end;
195 		bb->successors = count;
196 		count += ctx->use_lists[end].count;
197 		bb->successors_count = 0;
198 		bb->predecessors = count;
199 		bb->dom_parent = 0;
200 		bb->dom_depth = 0;
201 		bb->dom_child = 0;
202 		bb->dom_next_child = 0;
203 		bb->loop_header = 0;
204 		bb->loop_depth = 0;
205 		if (insn->op == IR_START) {
206 			bb->flags = IR_BB_START;
207 			bb->predecessors_count = 0;
208 		} else {
209 			bb->flags = bb_init_flags;
210 			if (insn->op == IR_MERGE || insn->op == IR_LOOP_BEGIN) {
211 				n = insn->inputs_count;
212 				bb->predecessors_count = n;
213 				edges_count += n;
214 				count += n;
215 			} else if (EXPECTED(insn->op1)) {
216 				if (insn->op == IR_ENTRY) {
217 					bb->flags |= IR_BB_ENTRY;
218 					ctx->entries_count++;
219 				}
220 				bb->predecessors_count = 1;
221 				edges_count++;
222 				count++;
223 			} else {
224 				IR_ASSERT(insn->op == IR_BEGIN); /* start of unreachable block */
225 				bb->predecessors_count = 0;
226 			}
227 		}
228 		b++;
229 		bb++;
230 	} IR_BITSET_FOREACH_END();
231 	bb_count = b - 1;
232 	IR_ASSERT(count == edges_count * 2);
233 	ir_mem_free(bb_starts);
234 
235 	/* Create an array of successor/predecessors control edges */
236 	edges = ir_mem_malloc(edges_count * 2 * sizeof(uint32_t));
237 	bb = blocks + 1;
238 	for (b = 1; b <= bb_count; b++, bb++) {
239 		insn = &ctx->ir_base[bb->start];
240 		if (bb->predecessors_count > 1) {
241 			uint32_t *q = edges + bb->predecessors;
242 			n = insn->inputs_count;
243 			for (p = insn->ops + 1; n > 0; p++, q++, n--) {
244 				ref = *p;
245 				IR_ASSERT(ref);
246 				ir_ref pred_b = _blocks[ref];
247 				ir_block *pred_bb = &blocks[pred_b];
248 				IR_ASSERT(pred_b > 0);
249 				*q = pred_b;
250 				edges[pred_bb->successors + pred_bb->successors_count++] = b;
251 			}
252 		} else if (bb->predecessors_count == 1) {
253 			ref = insn->op1;
254 			IR_ASSERT(ref);
255 			IR_ASSERT(IR_OPND_KIND(ir_op_flags[insn->op], 1) == IR_OPND_CONTROL);
256 			ir_ref pred_b = _blocks[ref];
257 			ir_block *pred_bb = &blocks[pred_b];
258 			edges[bb->predecessors] = pred_b;
259 			edges[pred_bb->successors + pred_bb->successors_count++] = b;
260 		}
261 	}
262 
263 	ctx->cfg_blocks_count = bb_count;
264 	ctx->cfg_edges_count = edges_count * 2;
265 	ctx->cfg_blocks = blocks;
266 	ctx->cfg_edges = edges;
267 	ctx->cfg_map = _blocks;
268 
269 	if (!(ctx->flags2 & IR_CFG_REACHABLE)) {
270 		uint32_t reachable_count = 0;
271 
272 		/* Mark reachable blocks */
273 		ir_worklist_clear(&worklist);
274 		ir_worklist_push(&worklist, 1);
275 		while (ir_worklist_len(&worklist) != 0) {
276 			uint32_t *p;
277 
278 			reachable_count++;
279 			b = ir_worklist_pop(&worklist);
280 			bb = &blocks[b];
281 			bb->flags &= ~IR_BB_UNREACHABLE;
282 			n = bb->successors_count;
283 			if (n > 1) {
284 				for (p = edges + bb->successors; n > 0; p++, n--) {
285 					ir_worklist_push(&worklist, *p);
286 				}
287 			} else if (n == 1) {
288 				ir_worklist_push(&worklist, edges[bb->successors]);
289 			}
290 		}
291 		if (reachable_count != ctx->cfg_blocks_count) {
292 			ir_remove_unreachable_blocks(ctx);
293 		}
294 	}
295 
296 	ir_worklist_free(&worklist);
297 
298 	return 1;
299 }
300 
301 static void ir_remove_predecessor(ir_ctx *ctx, ir_block *bb, uint32_t from)
302 {
303 	uint32_t i, *p, *q, n = 0;
304 
305 	p = q = &ctx->cfg_edges[bb->predecessors];
306 	for (i = 0; i < bb->predecessors_count; i++, p++) {
307 		if (*p != from) {
308 			if (p != q) {
309 				*q = *p;
310 			}
311 			q++;
312 			n++;
313 		}
314 	}
315 	IR_ASSERT(n != bb->predecessors_count);
316 	bb->predecessors_count = n;
317 }
318 
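/* Remove the control edge coming from "from" into the MERGE/LOOP_BEGIN node
 * "merge" and compact the inputs of the dependent PHIs to match.  If only one
 * input remains, the MERGE degenerates into a BEGIN and each PHI is turned
 * into a COPY of its surviving operand. */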
319 static void ir_remove_merge_input(ir_ctx *ctx, ir_ref merge, ir_ref from)
320 {
321 	ir_ref i, j, n, k, *p, use;
322 	ir_insn *use_insn;
323 	ir_use_list *use_list;
324 	ir_bitset life_inputs;
325 	ir_insn *insn = &ctx->ir_base[merge];
326 
327 	IR_ASSERT(insn->op == IR_MERGE || insn->op == IR_LOOP_BEGIN);
328 	n = insn->inputs_count;
329 	i = 1;
330 	life_inputs = ir_bitset_malloc(n + 1);
331 	for (j = 1; j <= n; j++) {
332 		ir_ref input = ir_insn_op(insn, j);
333 
334 		if (input != from) {
335 			if (i != j) {
336 				ir_insn_set_op(insn, i, input);
337 			}
338 			ir_bitset_incl(life_inputs, j);
339 			i++;
340 		}
341 	}
342 	i--;
343 	if (i == 1) {
344 		insn->op = IR_BEGIN;
345 		insn->inputs_count = 1;
346 		use_list = &ctx->use_lists[merge];
347 		if (use_list->count > 1) {
348 			for (k = 0, p = &ctx->use_edges[use_list->refs]; k < use_list->count; k++, p++) {
349 				use = *p;
350 				use_insn = &ctx->ir_base[use];
351 				if (use_insn->op == IR_PHI) {
352 					/* Convert PHI to COPY */
353 					i = 2;
354 					for (j = 2; j <= n; j++) {
355 						ir_ref input = ir_insn_op(use_insn, j);
356 
357 						if (ir_bitset_in(life_inputs, j - 1)) {
358 							use_insn->op1 = ir_insn_op(use_insn, j);
359 						} else if (input > 0) {
360 							ir_use_list_remove_all(ctx, input, use);
361 						}
362 					}
363 					use_insn->op = IR_COPY;
364 					use_insn->op2 = IR_UNUSED;
365 					use_insn->op3 = IR_UNUSED;
366 					ir_use_list_remove_all(ctx, merge, use);
367 				}
368 			}
369 		}
370 	} else {
371 		insn->inputs_count = i;
372 
373 		n++;
374 		use_list = &ctx->use_lists[merge];
375 		if (use_list->count > 1) {
376 			for (k = 0, p = &ctx->use_edges[use_list->refs]; k < use_list->count; k++, p++) {
377 				use = *p;
378 				use_insn = &ctx->ir_base[use];
379 				if (use_insn->op == IR_PHI) {
380 					i = 2;
381 					for (j = 2; j <= n; j++) {
382 						ir_ref input = ir_insn_op(use_insn, j);
383 
384 						if (ir_bitset_in(life_inputs, j - 1)) {
385 							IR_ASSERT(input);
386 							if (i != j) {
387 								ir_insn_set_op(use_insn, i, input);
388 							}
389 							i++;
390 						} else if (input > 0) {
391 							ir_use_list_remove_all(ctx, input, use);
392 						}
393 					}
394 				}
395 			}
396 		}
397 	}
398 	ir_mem_free(life_inputs);
399 	ir_use_list_remove_all(ctx, from, merge);
400 }
401 
402 /* A CFG constructed after the SCCP pass has no unreachable BBs; otherwise they have to be removed */
403 int ir_remove_unreachable_blocks(ir_ctx *ctx)
404 {
405 	uint32_t b, *p, i;
406 	uint32_t unreachable_count = 0;
407 	uint32_t bb_count = ctx->cfg_blocks_count;
408 	ir_block *bb = ctx->cfg_blocks + 1;
409 
410 	for (b = 1; b <= bb_count; b++, bb++) {
411 		if (bb->flags & IR_BB_UNREACHABLE) {
412 #if 0
413 			do {if (!unreachable_count) ir_dump_cfg(ctx, stderr);} while(0);
414 #endif
415 			if (bb->successors_count) {
416 				for (i = 0, p = &ctx->cfg_edges[bb->successors]; i < bb->successors_count; i++, p++) {
417 					ir_block *succ_bb = &ctx->cfg_blocks[*p];
418 
419 					if (!(succ_bb->flags & IR_BB_UNREACHABLE)) {
420 						ir_remove_predecessor(ctx, succ_bb, b);
421 						ir_remove_merge_input(ctx, succ_bb->start, bb->end);
422 					}
423 				}
424 			} else {
425 				ir_ref prev, ref = bb->end;
426 				ir_insn *insn = &ctx->ir_base[ref];
427 
428 				IR_ASSERT(ir_op_flags[insn->op] & IR_OP_FLAG_TERMINATOR);
429 				/* remove from terminators list */
430 				prev = ctx->ir_base[1].op1;
431 				if (prev == ref) {
432 					ctx->ir_base[1].op1 = insn->op3;
433 				} else {
434 					while (prev) {
435 						if (ctx->ir_base[prev].op3 == ref) {
436 							ctx->ir_base[prev].op3 = insn->op3;
437 							break;
438 						}
439 						prev = ctx->ir_base[prev].op3;
440 					}
441 				}
442 			}
443 			ctx->cfg_map[bb->start] = 0;
444 			ctx->cfg_map[bb->end] = 0;
445 			unreachable_count++;
446 		}
447 	}
448 
449 	if (unreachable_count) {
450 		ir_block *dst_bb;
451 		uint32_t n = 1;
452 		uint32_t *edges;
453 
454 		dst_bb = bb = ctx->cfg_blocks + 1;
455 		for (b = 1; b <= bb_count; b++, bb++) {
456 			if (!(bb->flags & IR_BB_UNREACHABLE)) {
457 				if (dst_bb != bb) {
458 					memcpy(dst_bb, bb, sizeof(ir_block));
459 					ctx->cfg_map[dst_bb->start] = n;
460 					ctx->cfg_map[dst_bb->end] = n;
461 				}
462 				dst_bb->successors_count = 0;
463 				dst_bb++;
464 				n++;
465 			}
466 		}
467 		ctx->cfg_blocks_count = bb_count = n - 1;
468 
469 		/* Rebuild successor/predecessors control edges */
470 		edges = ctx->cfg_edges;
471 		bb = ctx->cfg_blocks + 1;
472 		for (b = 1; b <= bb_count; b++, bb++) {
473 			ir_insn *insn = &ctx->ir_base[bb->start];
474 			ir_ref *p, ref;
475 
476 			n = bb->predecessors_count;
477 			if (n > 1) {
478 				uint32_t *q = edges + bb->predecessors;
479 
480 				IR_ASSERT(n == insn->inputs_count);
481 				for (p = insn->ops + 1; n > 0; p++, q++, n--) {
482 					ref = *p;
483 					IR_ASSERT(ref);
484 					ir_ref pred_b = ctx->cfg_map[ref];
485 					ir_block *pred_bb = &ctx->cfg_blocks[pred_b];
486 					*q = pred_b;
487 					edges[pred_bb->successors + pred_bb->successors_count++] = b;
488 				}
489 			} else if (n == 1) {
490 				ref = insn->op1;
491 				IR_ASSERT(ref);
492 				IR_ASSERT(IR_OPND_KIND(ir_op_flags[insn->op], 1) == IR_OPND_CONTROL);
493 				ir_ref pred_b = ctx->cfg_map[ref];
494 				ir_block *pred_bb = &ctx->cfg_blocks[pred_b];
495 				edges[bb->predecessors] = pred_b;
496 				edges[pred_bb->successors + pred_bb->successors_count++] = b;
497 			}
498 		}
499 	}
500 
501 	return 1;
502 }
503 
504 #if 0
505 static void compute_postnum(const ir_ctx *ctx, uint32_t *cur, uint32_t b)
506 {
507 	uint32_t i, *p;
508 	ir_block *bb = &ctx->cfg_blocks[b];
509 
510 	if (bb->postnum != 0) {
511 		return;
512 	}
513 
514 	if (bb->successors_count) {
515 		bb->postnum = -1; /* Marker for "currently visiting" */
516 		p = ctx->cfg_edges + bb->successors;
517 		i = bb->successors_count;
518 		do {
519 			compute_postnum(ctx, cur, *p);
520 			p++;
521 		} while (--i);
522 	}
523 	bb->postnum = (*cur)++;
524 }
525 
526 /* Computes dominator tree using algorithm from "A Simple, Fast Dominance Algorithm" by
527  * Cooper, Harvey and Kennedy. */
528 int ir_build_dominators_tree(ir_ctx *ctx)
529 {
530 	uint32_t blocks_count, b, postnum;
531 	ir_block *blocks, *bb;
532 	uint32_t *edges;
533 	bool changed;
534 
535 	ctx->flags2 &= ~IR_NO_LOOPS;
536 
537 	postnum = 1;
538 	compute_postnum(ctx, &postnum, 1);
539 
540 	/* Find immediate dominators */
541 	blocks = ctx->cfg_blocks;
542 	edges  = ctx->cfg_edges;
543 	blocks_count = ctx->cfg_blocks_count;
544 	blocks[1].idom = 1;
545 	do {
546 		changed = 0;
547 		/* Iterating in Reverse Post Order */
548 		for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) {
549 			IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
550 			if (bb->predecessors_count == 1) {
551 				uint32_t pred_b = edges[bb->predecessors];
552 
553 				if (blocks[pred_b].idom <= 0) {
554 					//IR_ASSERT("Wrong blocks order: BB is before its single predecessor");
555 				} else if (bb->idom != pred_b) {
556 					bb->idom = pred_b;
557 					changed = 1;
558 				}
559 			} else if (bb->predecessors_count) {
560 				uint32_t idom = 0;
561 				uint32_t k = bb->predecessors_count;
562 				uint32_t *p = edges + bb->predecessors;
563 
564 				do {
565 					uint32_t pred_b = *p;
566 					ir_block *pred_bb = &blocks[pred_b];
567 					ir_block *idom_bb;
568 
569 					if (pred_bb->idom > 0) {
570 						idom = pred_b;
571 						idom_bb = &blocks[idom];
572 
573 						while (--k > 0) {
574 							pred_b = *(++p);
575 							pred_bb = &blocks[pred_b];
576 							if (pred_bb->idom > 0) {
577 								while (idom != pred_b) {
578 									while (pred_bb->postnum < idom_bb->postnum) {
579 										pred_b = pred_bb->idom;
580 										pred_bb = &blocks[pred_b];
581 									}
582 									while (idom_bb->postnum < pred_bb->postnum) {
583 										idom = idom_bb->idom;
584 										idom_bb = &blocks[idom];
585 									}
586 								}
587 							}
588 						}
589 
590 						if (bb->idom != idom) {
591 							bb->idom = idom;
592 							changed = 1;
593 						}
594 						break;
595 					}
596 					p++;
597 				} while (--k > 0);
598 			}
599 		}
600 	} while (changed);
601 	blocks[1].idom = 0;
602 	blocks[1].dom_depth = 0;
603 
604 	/* Construct dominators tree */
605 	for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) {
606 		IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
607 		if (bb->idom > 0) {
608 			ir_block *idom_bb = &blocks[bb->idom];
609 
610 			bb->dom_depth = idom_bb->dom_depth + 1;
611 			/* Sort by block number to traverse children in pre-order */
612 			if (idom_bb->dom_child == 0) {
613 				idom_bb->dom_child = b;
614 			} else if (b < idom_bb->dom_child) {
615 				bb->dom_next_child = idom_bb->dom_child;
616 				idom_bb->dom_child = b;
617 			} else {
618 				int child = idom_bb->dom_child;
619 				ir_block *child_bb = &blocks[child];
620 
621 				while (child_bb->dom_next_child > 0 && b > child_bb->dom_next_child) {
622 					child = child_bb->dom_next_child;
623 					child_bb = &blocks[child];
624 				}
625 				bb->dom_next_child = child_bb->dom_next_child;
626 				child_bb->dom_next_child = b;
627 			}
628 		}
629 	}
630 
631 	return 1;
632 }
633 #else
634 /* A single pass modification of "A Simple, Fast Dominance Algorithm" by
635  * Cooper, Harvey and Kennedy, that relies on IR block ordering.
636  * It may fall back to the general slow fixed-point algorithm.  */
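/* A minimal sketch (not referenced by the build) of the underlying "intersect"
 * step, written against the block numbering used here: dominators always have
 * smaller RPO numbers, so walking idom links moves toward smaller ids. */
#if 0
static uint32_t ir_idom_intersect_sketch(const ir_block *blocks, uint32_t b1, uint32_t b2)
{
	while (b1 != b2) {
		while (b1 > b2) {
			b1 = blocks[b1].idom; /* climb from the deeper block */
		}
		while (b2 > b1) {
			b2 = blocks[b2].idom;
		}
	}
	return b1; /* the nearest common dominator */
}
#endif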
637 static int ir_build_dominators_tree_iterative(ir_ctx *ctx);
638 int ir_build_dominators_tree(ir_ctx *ctx)
639 {
640 	uint32_t blocks_count, b;
641 	ir_block *blocks, *bb;
642 	uint32_t *edges;
643 	ir_list worklist;
644 
645 	ir_list_init(&worklist, ctx->cfg_blocks_count / 2);
646 
647 	ctx->flags2 |= IR_NO_LOOPS;
648 
649 	/* Find immediate dominators */
650 	blocks = ctx->cfg_blocks;
651 	edges  = ctx->cfg_edges;
652 	blocks_count = ctx->cfg_blocks_count;
653 	blocks[1].idom = 1;
654 	blocks[1].dom_depth = 0;
655 
656 	/* Iterating in Reverse Post Order */
657 	for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) {
658 		IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
659 		IR_ASSERT(bb->predecessors_count > 0);
660 		uint32_t k = bb->predecessors_count;
661 		uint32_t *p = edges + bb->predecessors;
662 		uint32_t idom = *p;
663 		ir_block *idom_bb;
664 
665 		if (UNEXPECTED(idom >= b)) {
666 			/* In rare cases, LOOP_BEGIN.op1 may be a back-edge. Skip back-edges. */
667 			ctx->flags2 &= ~IR_NO_LOOPS;
668 			IR_ASSERT(k > 1 && "Wrong blocks order: BB is before its single predecessor");
669 			ir_list_push(&worklist, idom);
670 			while (1) {
671 				k--;
672 				p++;
673 				idom = *p;
674 				if (idom < b) {
675 					break;
676 				}
677 				IR_ASSERT(k > 0);
678 				ir_list_push(&worklist, idom);
679 			}
680 		}
681 		IR_ASSERT(blocks[idom].idom > 0);
682 		IR_ASSERT(k != 0);
683 
684 		while (--k > 0) {
685 			uint32_t pred_b = *(++p);
686 
687 			if (pred_b < b) {
688 				IR_ASSERT(blocks[pred_b].idom > 0);
689 				while (idom != pred_b) {
690 					while (pred_b > idom) {
691 						pred_b = blocks[pred_b].idom;
692 					}
693 					while (idom > pred_b) {
694 						idom = blocks[idom].idom;
695 					}
696 				}
697 			} else {
698 				ctx->flags2 &= ~IR_NO_LOOPS;
699 				IR_ASSERT(bb->predecessors_count > 1);
700 				ir_list_push(&worklist, pred_b);
701 			}
702 		}
703 		bb->idom = idom;
704 		idom_bb = &blocks[idom];
705 
706 		bb->dom_depth = idom_bb->dom_depth + 1;
707 		/* Sort by block number to traverse children in pre-order */
708 		if (idom_bb->dom_child == 0) {
709 			idom_bb->dom_child = b;
710 		} else if (b < idom_bb->dom_child) {
711 			bb->dom_next_child = idom_bb->dom_child;
712 			idom_bb->dom_child = b;
713 		} else {
714 			int child = idom_bb->dom_child;
715 			ir_block *child_bb = &blocks[child];
716 
717 			while (child_bb->dom_next_child > 0 && b > child_bb->dom_next_child) {
718 				child = child_bb->dom_next_child;
719 				child_bb = &blocks[child];
720 			}
721 			bb->dom_next_child = child_bb->dom_next_child;
722 			child_bb->dom_next_child = b;
723 		}
724 	}
725 
726 	blocks[1].idom = 0;
727 
728 	if (ir_list_len(&worklist) != 0) {
729 		uint32_t dom_depth;
730 		uint32_t succ_b;
731 		bool complete = 1;
732 
733 		/* Check if all the back-edges lead to the loop headers */
734 		do {
735 			b = ir_list_pop(&worklist);
736 			bb = &blocks[b];
737 			succ_b = ctx->cfg_edges[bb->successors];
738 			if (bb->successors_count != 1) {
739 				/* LOOP_END/END may be linked with the following ENTRY by a fake edge */
740 				IR_ASSERT(bb->successors_count == 2);
741 				if (blocks[succ_b].flags & IR_BB_ENTRY) {
742 					succ_b = ctx->cfg_edges[bb->successors + 1];
743 				} else {
744 					IR_ASSERT(blocks[ctx->cfg_edges[bb->successors + 1]].flags & IR_BB_ENTRY);
745 				}
746 			}
747 			dom_depth = blocks[succ_b].dom_depth;
748 			while (bb->dom_depth > dom_depth) {
749 				b = bb->dom_parent;
750 				bb = &blocks[b];
751 			}
752 			if (UNEXPECTED(b != succ_b)) {
753 				complete = 0;
754 				break;
755 			}
756 		} while (ir_list_len(&worklist) != 0);
757 
758 		if (UNEXPECTED(!complete)) {
759 			ir_list_free(&worklist);
760 			return ir_build_dominators_tree_iterative(ctx);
761 		}
762 	}
763 
764 	ir_list_free(&worklist);
765 
766 	return 1;
767 }
768 
769 static int ir_build_dominators_tree_iterative(ir_ctx *ctx)
770 {
771 	bool changed;
772 	uint32_t blocks_count, b;
773 	ir_block *blocks, *bb;
774 	uint32_t *edges;
775 
776 	/* Find immediate dominators */
777 	blocks = ctx->cfg_blocks;
778 	edges  = ctx->cfg_edges;
779 	blocks_count = ctx->cfg_blocks_count;
780 
781 	/* Clear the dominators tree, but keep already found dominators */
782 	for (b = 0, bb = &blocks[0]; b <= blocks_count; b++, bb++) {
783 		bb->dom_depth = 0;
784 		bb->dom_child = 0;
785 		bb->dom_next_child = 0;
786 	}
787 
788 	/* Find immediate dominators by iterative fixed-point algorithm */
789 	blocks[1].idom = 1;
790 	do {
791 		changed = 0;
792 
793 		for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) {
794 			IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
795 			IR_ASSERT(bb->predecessors_count > 0);
796 			uint32_t k = bb->predecessors_count;
797 			uint32_t *p = edges + bb->predecessors;
798 			uint32_t idom = *p;
799 
800 			if (blocks[idom].idom == 0) {
801 				while (1) {
802 					k--;
803 					p++;
804 					idom = *p;
805 					if (blocks[idom].idom > 0) {
806 						break;
807 					}
808 					IR_ASSERT(k > 0);
809 				}
810 			}
811 			IR_ASSERT(k != 0);
812 			while (--k > 0) {
813 				uint32_t pred_b = *(++p);
814 
815 				if (blocks[pred_b].idom > 0) {
816 					IR_ASSERT(blocks[pred_b].idom > 0);
817 					while (idom != pred_b) {
818 						while (pred_b > idom) {
819 							pred_b = blocks[pred_b].idom;
820 						}
821 						while (idom > pred_b) {
822 							idom = blocks[idom].idom;
823 						}
824 					}
825 				}
826 			}
827 			if (bb->idom != idom) {
828 				bb->idom = idom;
829 				changed = 1;
830 			}
831 		}
832 	} while (changed);
833 
834 	/* Build dominators tree */
835 	blocks[1].idom = 0;
836 	blocks[1].dom_depth = 0;
837 	for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) {
838 		uint32_t idom = bb->idom;
839 		ir_block *idom_bb = &blocks[idom];
840 
841 		bb->dom_depth = idom_bb->dom_depth + 1;
842 		/* Sort by block number to traverse children in pre-order */
843 		if (idom_bb->dom_child == 0) {
844 			idom_bb->dom_child = b;
845 		} else if (b < idom_bb->dom_child) {
846 			bb->dom_next_child = idom_bb->dom_child;
847 			idom_bb->dom_child = b;
848 		} else {
849 			int child = idom_bb->dom_child;
850 			ir_block *child_bb = &blocks[child];
851 
852 			while (child_bb->dom_next_child > 0 && b > child_bb->dom_next_child) {
853 				child = child_bb->dom_next_child;
854 				child_bb = &blocks[child];
855 			}
856 			bb->dom_next_child = child_bb->dom_next_child;
857 			child_bb->dom_next_child = b;
858 		}
859 	}
860 
861 	return 1;
862 }
863 #endif
864 
865 static bool ir_dominates(const ir_block *blocks, uint32_t b1, uint32_t b2)
866 {
867 	uint32_t b1_depth = blocks[b1].dom_depth;
868 	const ir_block *bb2 = &blocks[b2];
869 
870 	while (bb2->dom_depth > b1_depth) {
871 		b2 = bb2->dom_parent;
872 		bb2 = &blocks[b2];
873 	}
874 	return b1 == b2;
875 }
876 
877 int ir_find_loops(ir_ctx *ctx)
878 {
879 	uint32_t i, j, n, count;
880 	uint32_t *entry_times, *exit_times, *sorted_blocks, time = 1;
881 	ir_block *blocks = ctx->cfg_blocks;
882 	uint32_t *edges = ctx->cfg_edges;
883 	ir_worklist work;
884 
885 	if (ctx->flags2 & IR_NO_LOOPS) {
886 		return 1;
887 	}
888 
889 	/* We don't materialize the DJ spanning tree explicitly, as we are only interested in ancestor
890 	 * queries. These are implemented by checking entry/exit times of the DFS search. */
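	/* For reference: with these timestamps, "a is an ancestor of d in the DFS
	 * spanning tree" is equivalent to
	 *     entry_times[a] < entry_times[d] && exit_times[a] > exit_times[d].
	 * The cross-join test further below uses this to ask whether a predecessor
	 * is a descendant of the block being examined. */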
891 	ir_worklist_init(&work, ctx->cfg_blocks_count + 1);
892 	entry_times = ir_mem_malloc((ctx->cfg_blocks_count + 1) * 3 * sizeof(uint32_t));
893 	exit_times = entry_times + ctx->cfg_blocks_count + 1;
894 	sorted_blocks = exit_times + ctx->cfg_blocks_count + 1;
895 
896 	memset(entry_times, 0, (ctx->cfg_blocks_count + 1) * sizeof(uint32_t));
897 
898 	ir_worklist_push(&work, 1);
899 	while (ir_worklist_len(&work)) {
900 		ir_block *bb;
901 		int child;
902 
903 next:
904 		i = ir_worklist_peek(&work);
905 		if (!entry_times[i]) {
906 			entry_times[i] = time++;
907 		}
908 
909 		/* Visit blocks immediately dominated by i. */
910 		bb = &blocks[i];
911 		for (child = bb->dom_child; child > 0; child = blocks[child].dom_next_child) {
912 			if (ir_worklist_push(&work, child)) {
913 				goto next;
914 			}
915 		}
916 
917 		/* Visit join edges. */
918 		if (bb->successors_count) {
919 			uint32_t *p = edges + bb->successors;
920 			for (j = 0; j < bb->successors_count; j++,p++) {
921 				uint32_t succ = *p;
922 
923 				if (blocks[succ].idom == i) {
924 					continue;
925 				} else if (ir_worklist_push(&work, succ)) {
926 					goto next;
927 				}
928 			}
929 		}
930 		exit_times[i] = time++;
931 		ir_worklist_pop(&work);
932 	}
933 
934 	/* Sort blocks by level, which is the opposite of the order in which we want to process them */
935 	sorted_blocks[1] = 1;
936 	j = 1;
937 	n = 2;
938 	while (j != n) {
939 		i = j;
940 		j = n;
941 		for (; i < j; i++) {
942 			int child;
943 			for (child = blocks[sorted_blocks[i]].dom_child; child > 0; child = blocks[child].dom_next_child) {
944 				sorted_blocks[n++] = child;
945 			}
946 		}
947 	}
948 	count = n;
949 
950 	/* Identify loops. See Sreedhar et al, "Identifying Loops Using DJ Graphs". */
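	/* Worked example (informal): in a CFG 1 -> 2 -> 3 -> 2, block 2 dominates
	 * block 3, so the edge 3 -> 2 is a back-join edge; block 2 becomes a loop
	 * header and the loop body is collected by walking predecessors from 3
	 * until the header is reached.  A join edge whose source is a DFS
	 * descendant of a non-dominating target marks an irreducible region. */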
951 	while (n > 1) {
952 		i = sorted_blocks[--n];
953 		ir_block *bb = &blocks[i];
954 
955 		if (bb->predecessors_count > 1) {
956 			bool irreducible = 0;
957 			uint32_t *p = &edges[bb->predecessors];
958 
959 			j = bb->predecessors_count;
960 			do {
961 				uint32_t pred = *p;
962 
963 				/* A join edge is one for which the predecessor does not
964 				   immediately dominate the successor.  */
965 				if (bb->idom != pred) {
966 					/* In a loop back-edge (back-join edge), the successor dominates
967 					   the predecessor.  */
968 					if (ir_dominates(blocks, i, pred)) {
969 						if (!ir_worklist_len(&work)) {
970 							ir_bitset_clear(work.visited, ir_bitset_len(ir_worklist_capasity(&work)));
971 						}
972 						blocks[pred].loop_header = 0; /* support for merged loops */
973 						ir_worklist_push(&work, pred);
974 					} else {
975 						/* Otherwise it's a cross-join edge.  See if it's a branch
976 						   to an ancestor on the DJ spanning tree.  */
977 						if (entry_times[pred] > entry_times[i] && exit_times[pred] < exit_times[i]) {
978 							irreducible = 1;
979 						}
980 					}
981 				}
982 				p++;
983 			} while (--j);
984 
985 			if (UNEXPECTED(irreducible)) {
986 				// TODO: Support for irreducible loops ???
987 				bb->flags |= IR_BB_IRREDUCIBLE_LOOP;
988 				ctx->flags2 |= IR_IRREDUCIBLE_CFG;
989 				while (ir_worklist_len(&work)) {
990 					ir_worklist_pop(&work);
991 				}
992 			} else if (ir_worklist_len(&work)) {
993 				bb->flags |= IR_BB_LOOP_HEADER;
994 				ctx->flags2 |= IR_CFG_HAS_LOOPS;
995 				bb->loop_depth = 1;
996 				while (ir_worklist_len(&work)) {
997 					j = ir_worklist_pop(&work);
998 					while (blocks[j].loop_header > 0) {
999 						j = blocks[j].loop_header;
1000 					}
1001 					if (j != i) {
1002 						ir_block *bb = &blocks[j];
1003 						if (bb->idom == 0 && j != 1) {
1004 							/* Ignore blocks that are unreachable or only abnormally reachable. */
1005 							continue;
1006 						}
1007 						bb->loop_header = i;
1008 						if (bb->predecessors_count) {
1009 							uint32_t *p = &edges[bb->predecessors];
1010 							j = bb->predecessors_count;
1011 							do {
1012 								ir_worklist_push(&work, *p);
1013 								p++;
1014 							} while (--j);
1015 						}
1016 					}
1017 				}
1018 			}
1019 		}
1020 	}
1021 
1022 	if (ctx->flags2 & IR_CFG_HAS_LOOPS) {
1023 		for (n = 1; n < count; n++) {
1024 			i = sorted_blocks[n];
1025 			ir_block *bb = &blocks[i];
1026 			if (bb->loop_header > 0) {
1027 				ir_block *loop = &blocks[bb->loop_header];
1028 				uint32_t loop_depth = loop->loop_depth;
1029 
1030 				if (bb->flags & IR_BB_LOOP_HEADER) {
1031 					loop_depth++;
1032 				}
1033 				bb->loop_depth = loop_depth;
1034 				if (bb->flags & (IR_BB_ENTRY|IR_BB_LOOP_WITH_ENTRY)) {
1035 					loop->flags |= IR_BB_LOOP_WITH_ENTRY;
1036 					if (loop_depth > 1) {
1037 						/* Set IR_BB_LOOP_WITH_ENTRY flag for all the enclosing loops */
1038 						bb = &blocks[loop->loop_header];
1039 						while (1) {
1040 							if (bb->flags & IR_BB_LOOP_WITH_ENTRY) {
1041 								break;
1042 							}
1043 							bb->flags |= IR_BB_LOOP_WITH_ENTRY;
1044 							if (bb->loop_depth == 1) {
1045 								break;
1046 							}
1047 							bb = &blocks[loop->loop_header];
1048 						}
1049 					}
1050 				}
1051 			}
1052 		}
1053 	}
1054 
1055 	ir_mem_free(entry_times);
1056 	ir_worklist_free(&work);
1057 
1058 	return 1;
1059 }
1060 
1061 static uint32_t _ir_skip_empty_blocks(const ir_ctx *ctx, uint32_t b)
1062 {
1063 	while (1) {
1064 		ir_block *bb = &ctx->cfg_blocks[b];
1065 
1066 		if ((bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) == IR_BB_EMPTY) {
1067 			IR_ASSERT(bb->successors_count == 1);
1068 			b = ctx->cfg_edges[bb->successors];
1069 		} else {
1070 			return b;
1071 		}
1072 	}
1073 }
1074 
1075 /* A variation of "Bottom-up Positioning" algorithm described by
1076  * Karl Pettis and Robert C. Hansen "Profile Guided Code Positioning"
1077  */
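/* Rough outline, as implemented below: every block starts as a single-block
 * chain, edge frequencies are estimated from branch probabilities and loop
 * depth, the edges are sorted by decreasing frequency, and the chains connected
 * by the hottest edges are concatenated first.  E.g. for an IF in block A with
 * A->B (0.9), A->C (0.1), B->D, C->D, the hot path forms the chain (A, B, D)
 * and the cold block C is only attached when the chains are grouped later. */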
1078 typedef struct _ir_edge_info {
1079 	uint32_t from;
1080 	uint32_t to;
1081 	float    freq;
1082 } ir_edge_info;
1083 
1084 typedef struct _ir_chain {
1085 	uint32_t head;
1086 	uint32_t next;
1087 	union {
1088 		uint32_t prev;
1089 		uint32_t tail;
1090 	};
1091 } ir_chain;
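/* Chain representation used below: "head" acts as a union-find parent link
 * (path-compressed by ir_chain_head()), "next"/"prev" keep the blocks of a
 * chain in circular doubly-linked order, and "tail" is only meaningful for a
 * chain head.  A block with chains[b].head == 0 has been moved directly to the
 * end of the schedule. */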
1092 
1093 static int ir_edge_info_cmp(const void *b1, const void *b2)
1094 {
1095 	ir_edge_info *e1 = (ir_edge_info*)b1;
1096 	ir_edge_info *e2 = (ir_edge_info*)b2;
1097 
1098 	if (e1->freq != e2->freq) {
1099 		return e1->freq < e2->freq ? 1 : -1;
1100 	}
1101 	/* In case of equal frequencies, try to avoid penalizing one of the "equal" paths by
1102 	 * preferring the first RPO successor (in conditional branches) and the last RPO predecessor
1103 	 * (in merge points).
1104 	 *
1105 	 * See "Static Basic Block Reordering Heuristics for Implicit Control Flow in Baseline JITs"
1106 	 * Polito Guillermo, Ducasse Stephane, and Tesone Pablo (2021)
1107 	 */
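	/* E.g. with equal frequencies, (BB3->BB4) sorts before (BB3->BB5), and
	 * (BB8->BB9) sorts before (BB6->BB9): a branch keeps its first RPO
	 * successor adjacent and a merge keeps its last RPO predecessor adjacent. */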
1108 	if (e1->from != e2->from) {
1109 		return e2->from - e1->from;
1110 	} else {
1111 		return e1->to - e2->to;
1112 	}
1113 }
1114 
1115 static IR_NEVER_INLINE uint32_t ir_chain_head_path_compress(ir_chain *chains, uint32_t src, uint32_t head)
1116 {
1117 	IR_ASSERT(head != 0);
1118 	do {
1119 		head = chains[head].head;
1120 	} while (chains[head].head != head);
1121 	do {
1122 		uint32_t next = chains[src].head;
1123 		chains[src].head = head;
1124 		src = next;
1125 	} while (chains[src].head != head);
1126 	return head;
1127 }
1128 
1129 IR_ALWAYS_INLINE uint32_t ir_chain_head(ir_chain *chains, uint32_t src)
1130 {
1131 	uint32_t head = chains[src].head;
1132 	if (chains[head].head == head) {
1133 		return head;
1134 	} else {
1135 		return ir_chain_head_path_compress(chains, src, head);
1136 	}
1137 }
1138 
1139 static void ir_join_chains(ir_chain *chains, uint32_t src, uint32_t dst)
1140 {
1141 	uint32_t dst_tail = chains[dst].tail;
1142 	uint32_t src_tail = chains[src].tail;
1143 
1144 	chains[dst_tail].next = src;
1145 	chains[dst].prev = src_tail;
1146 	chains[src_tail].next = dst;
1147 	chains[src].tail = dst_tail;
1148 	chains[dst].head = src;
1149 }
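/* E.g. joining src = (1, 2) with dst = (5, 6) yields the chain (1, 2, 5, 6):
 * block 1 stays the head, block 6 becomes the tail, and chains[5].head is
 * redirected to block 1. */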
1150 
1151 static void ir_insert_chain_before(ir_chain *chains, uint32_t c, uint32_t before)
1152 {
1153 	ir_chain *this = &chains[c];
1154 	ir_chain *next = &chains[before];
1155 
1156 	IR_ASSERT(chains[c].head == c);
1157 	if (chains[before].head != before) {
1158 		this->head = next->head;
1159 	} else {
1160 		next->head = c;
1161 	}
1162 	this->next = before;
1163 	this->prev = next->prev;
1164 	next->prev = c;
1165 	chains[this->prev].next = c;
1166 }
1167 
1168 #ifndef IR_DEBUG_BB_SCHEDULE_GRAPH
1169 # ifdef IR_DEBUG
1170 #  define IR_DEBUG_BB_SCHEDULE_GRAPH 1
1171 # else
1172 #  define IR_DEBUG_BB_SCHEDULE_GRAPH 0
1173 # endif
1174 #endif
1175 
1176 #if IR_DEBUG_BB_SCHEDULE_GRAPH
1177 static void ir_dump_cfg_freq_graph(ir_ctx *ctx, float *bb_freq, uint32_t edges_count, ir_edge_info *edges, ir_chain *chains)
1178 {
1179 	uint32_t b, i;
1180 	ir_block *bb;
1181 	uint8_t c, *colors;
1182 	bool is_head, is_empty;
1183 	uint32_t max_colors = 12;
1184 
1185 	colors = alloca(sizeof(uint8_t) * (ctx->cfg_blocks_count + 1));
1186 	memset(colors, 0, sizeof(uint8_t) * (ctx->cfg_blocks_count + 1));
1187 	i = 0;
1188 	for (b = 1; b < ctx->cfg_blocks_count + 1; b++) {
1189 		if (chains[b].head == b) {
1190 			colors[b] = (i % max_colors) + 1;
1191 			i++;
1192 		}
1193 	}
1194 
1195 	fprintf(stderr, "digraph {\n");
1196 	fprintf(stderr, "\trankdir=TB;\n");
1197 	for (b = 1; b <= ctx->cfg_blocks_count; b++) {
1198 		bb = &ctx->cfg_blocks[b];
1199 		c = (chains[b].head) ? colors[ir_chain_head(chains, b)] : 0;
1200 		is_head = chains[b].head == b;
1201 		is_empty = (bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) == IR_BB_EMPTY;
1202 		if (c) {
1203 			fprintf(stderr, "\tBB%d [label=\"BB%d: (%d),%0.3f\\n%s\\n%s\",colorscheme=set312,fillcolor=%d%s%s]\n", b, b,
1204 				bb->loop_depth, bb_freq[b],
1205 				ir_op_name[ctx->ir_base[bb->start].op], ir_op_name[ctx->ir_base[bb->end].op],
1206 				c,
1207 				is_head ? ",penwidth=3" : "",
1208 				is_empty ? ",style=\"dotted,filled\"" : ",style=\"filled\"");
1209 		} else {
1210 			fprintf(stderr, "\tBB%d [label=\"BB%d: (%d),%0.3f\\n%s\\n%s\"%s%s]\n", b, b,
1211 				bb->loop_depth, bb_freq[b],
1212 				ir_op_name[ctx->ir_base[bb->start].op], ir_op_name[ctx->ir_base[bb->end].op],
1213 				is_head ? ",penwidth=3" : "",
1214 				is_empty ? ",style=\"dotted\"" : "");
1215 		}
1216 	}
1217 	fprintf(stderr, "\n");
1218 
1219 	for (i = 0; i < edges_count; i++) {
1220 		fprintf(stderr, "\tBB%d -> BB%d [label=\"%0.3f\"]\n", edges[i].from, edges[i].to, edges[i].freq);
1221 	}
1222 	fprintf(stderr, "}\n");
1223 }
1224 #endif
1225 
1226 #ifdef IR_DEBUG
1227 static void ir_dump_edges(ir_ctx *ctx, uint32_t edges_count, ir_edge_info *edges)
1228 {
1229 	uint32_t i;
1230 
1231 	fprintf(stderr, "Edges:\n");
1232 	for (i = 0; i < edges_count; i++) {
1233 		fprintf(stderr, "\tBB%d -> BB%d %0.3f\n", edges[i].from, edges[i].to, edges[i].freq);
1234 	}
1235 }
1236 
1237 static void ir_dump_chains(ir_ctx *ctx, ir_chain *chains)
1238 {
1239 	uint32_t b, tail, i;
1240 
1241 	fprintf(stderr, "Chains:\n");
1242 	for (b = 1; b < ctx->cfg_blocks_count + 1; b++) {
1243 		if (chains[b].head == b) {
1244 			tail = chains[b].tail;
1245 			i = b;
1246 			fprintf(stderr, "(BB%d", i);
1247 			while (i != tail) {
1248 				i = chains[i].next;
1249 				fprintf(stderr, ", BB%d", i);
1250 			}
1251 			fprintf(stderr, ")\n");
1252 		}
1253 	}
1254 }
1255 #endif
1256 
1257 static int ir_schedule_blocks_bottom_up(ir_ctx *ctx)
1258 {
1259 	uint32_t max_edges_count = ctx->cfg_edges_count / 2;
1260 	uint32_t edges_count = 0;
1261 	uint32_t b, i, loop_depth;
1262 	float *bb_freq, freq;
1263 	ir_block *bb;
1264 	ir_edge_info *edges, *e;
1265 	ir_chain *chains;
1266 	ir_bitqueue worklist;
1267 	ir_bitset visited;
1268 	uint32_t *schedule_end, count;
1269 
1270 	ctx->cfg_schedule = ir_mem_malloc(sizeof(uint32_t) * (ctx->cfg_blocks_count + 2));
1271 	schedule_end = ctx->cfg_schedule + ctx->cfg_blocks_count;
1272 
1273 	/* 1. Create initial chains for each BB */
1274 	chains = ir_mem_malloc(sizeof(ir_chain) * (ctx->cfg_blocks_count + 1));
1275 	chains[0].head = 0;
1276 	chains[0].next = 0;
1277 	chains[0].prev = 0;
1278 	for (b = 1; b <= ctx->cfg_blocks_count; b++) {
1279 		chains[b].head = b;
1280 		chains[b].next = b;
1281 		chains[b].prev = b;
1282 	}
1283 
1284 	/* 2. Collect information about BBs and EDGEs frequencies */
1285 	edges = ir_mem_malloc(sizeof(ir_edge_info) * max_edges_count);
1286 	bb_freq = ir_mem_calloc(ctx->cfg_blocks_count + 1, sizeof(float));
1287 	bb_freq[1] = 1.0f;
1288 	visited = ir_bitset_malloc(ctx->cfg_blocks_count + 1);
1289 	ir_bitqueue_init(&worklist, ctx->cfg_blocks_count + 1);
1290 	ir_bitqueue_add(&worklist, 1);
1291 	while ((b = ir_bitqueue_pop(&worklist)) != (uint32_t)-1) {
1292 restart:
1293 		bb = &ctx->cfg_blocks[b];
1294 		if (bb->predecessors_count) {
1295 			uint32_t n = bb->predecessors_count;
1296 			uint32_t *p = ctx->cfg_edges + bb->predecessors;
1297 			for (; n > 0; p++, n--) {
1298 				uint32_t predecessor = *p;
1299 				/* Basic blocks are ordered so that predecessor ids are usually less than successor ids.
1300 				 * So we may compare block ids (predecessor < b) instead of the more expensive back-edge check
1301 				 * (b != predecessor && ctx->cfg_blocks[predecessor].loop_header != b)
1302 				 */
1303 				if (predecessor < b) {
1304 					if (!ir_bitset_in(visited, predecessor)) {
1305 						b = predecessor;
1306 						ir_bitqueue_del(&worklist, b);
1307 						goto restart;
1308 					}
1309 				} else if (b != predecessor && ctx->cfg_blocks[predecessor].loop_header != b) {
1310 					ir_dump_cfg(ctx, stderr);
1311 					IR_ASSERT(b == predecessor || ctx->cfg_blocks[predecessor].loop_header == b);
1312 				}
1313 			}
1314 		}
1315 
1316 		ir_bitset_incl(visited, b);
1317 
1318 		if ((bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) == IR_BB_EMPTY) {
1319 			uint32_t successor = ctx->cfg_edges[bb->successors];
1320 
1321 			/* move empty blocks to the end */
1322 			IR_ASSERT(chains[b].head == b);
1323 			chains[b].head = 0;
1324 			*schedule_end = b;
1325 			schedule_end--;
1326 
1327 			if (successor > b) {
1328 				bb_freq[successor] += bb_freq[b];
1329 				b = successor;
1330 				ir_bitqueue_del(&worklist, b);
1331 				goto restart;
1332 			} else {
1333 				continue;
1334 			}
1335 		}
1336 
1337 		loop_depth = bb->loop_depth;
1338 		if (bb->flags & IR_BB_LOOP_HEADER) {
1339 			// TODO: Estimate the loop iterations count
1340 			bb_freq[b] *= 10;
1341 		}
1342 
1343 		if (bb->successors_count) {
1344 			uint32_t n = bb->successors_count;
1345 			uint32_t *p = ctx->cfg_edges + bb->successors;
1346 
1347 			if (n == 1) {
1348 				uint32_t successor = *p;
1349 
1350 				IR_ASSERT(edges_count < max_edges_count);
1351 				freq = bb_freq[b];
1352 				if (successor > b) {
1353 					IR_ASSERT(!ir_bitset_in(visited, successor));
1354 					bb_freq[successor] += freq;
1355 					ir_bitqueue_add(&worklist, successor);
1356 				}
1357 				successor = _ir_skip_empty_blocks(ctx, successor);
1358 				edges[edges_count].from = b;
1359 				edges[edges_count].to = successor;
1360 				edges[edges_count].freq = freq;
1361 				edges_count++;
1362 			} else if (n == 2 && ctx->ir_base[bb->end].op == IR_IF) {
1363 				uint32_t successor1 = *p;
1364 				ir_block *successor1_bb = &ctx->cfg_blocks[successor1];
1365 				ir_insn *insn1 = &ctx->ir_base[successor1_bb->start];
1366 				uint32_t successor2 = *(p + 1);
1367 				ir_block *successor2_bb = &ctx->cfg_blocks[successor2];
1368 				ir_insn *insn2 = &ctx->ir_base[successor2_bb->start];
1369 				int prob1, prob2, probN = 100;
1370 
1371 				if (insn1->op2) {
1372 					prob1 = insn1->op2;
1373 					if (insn2->op2) {
1374 						prob2 = insn2->op2;
1375 						probN = prob1 + prob2;
1376 					} else {
1377 						if (prob1 > 100) {
1378 							prob1 = 100;
1379 						}
1380 						prob2 = 100 - prob1;
1381 					}
1382 
1383 				} else if (insn2->op2) {
1384 					prob2 = insn2->op2;
1385 					if (prob2 > 100) {
1386 						prob2 = 100;
1387 					}
1388 					prob1 = 100 - prob2;
1389 				} else if (successor1_bb->loop_depth >= loop_depth
1390 						&& successor2_bb->loop_depth < loop_depth) {
1391 					prob1 = 90;
1392 					prob2 = 10;
1393 				} else if (successor1_bb->loop_depth < loop_depth
1394 						&& successor2_bb->loop_depth >= loop_depth) {
1395 					prob1 = 10;
1396 					prob2 = 90;
1397 				} else if (successor2_bb->flags & IR_BB_EMPTY) {
1398 					prob1 = 51;
1399 					prob2 = 49;
1400 				} else if (successor1_bb->flags & IR_BB_EMPTY) {
1401 					prob1 = 49;
1402 					prob2 = 51;
1403 				} else {
1404 					prob1 = prob2 = 50;
1405 				}
1406 				do {
1407 					freq = bb_freq[b] * (float)prob1 / (float)probN;
1408 					if (successor1 > b) {
1409 						IR_ASSERT(!ir_bitset_in(visited, successor1));
1410 						bb_freq[successor1] += freq;
1411 						if (successor1_bb->successors_count == 0 && insn1->op2 == 1) {
1412 							/* move cold block without successors to the end */
1413 							IR_ASSERT(chains[successor1].head == successor1);
1414 							chains[successor1].head = 0;
1415 							*schedule_end = successor1;
1416 							schedule_end--;
1417 							break;
1418 						} else {
1419 							ir_bitqueue_add(&worklist, successor1);
1420 						}
1421 					}
1422 					/* try to join edges early to reduce the number of edges and the cost of sorting them */
1423 					if (prob1 > prob2
1424 					 && (successor1_bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) != IR_BB_EMPTY) {
1425 						uint32_t src = chains[b].next;
1426 						IR_ASSERT(chains[src].head == src);
1427 						IR_ASSERT(src == ir_chain_head(chains, b));
1428 						IR_ASSERT(chains[successor1].head == successor1);
1429 						ir_join_chains(chains, src, successor1);
1430 						if (!IR_DEBUG_BB_SCHEDULE_GRAPH) break;
1431 					}
1432 					successor1 = _ir_skip_empty_blocks(ctx, successor1);
1433 					IR_ASSERT(edges_count < max_edges_count);
1434 					edges[edges_count].from = b;
1435 					edges[edges_count].to = successor1;
1436 					edges[edges_count].freq = freq;
1437 					edges_count++;
1438 				} while (0);
1439 				do {
1440 					freq = bb_freq[b] * (float)prob2 / (float)probN;
1441 					if (successor2 > b) {
1442 						IR_ASSERT(!ir_bitset_in(visited, successor2));
1443 						bb_freq[successor2] += freq;
1444 						if (successor2_bb->successors_count == 0 && insn2->op2 == 1) {
1445 							/* move cold block without successors to the end */
1446 							IR_ASSERT(chains[successor2].head == successor2);
1447 							chains[successor2].head = 0;
1448 							*schedule_end = successor2;
1449 							schedule_end--;
1450 							break;
1451 						} else {
1452 							ir_bitqueue_add(&worklist, successor2);
1453 						}
1454 					}
1455 					if (prob2 > prob1
1456 					 && (successor2_bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) != IR_BB_EMPTY) {
1457 						uint32_t src = chains[b].next;
1458 						IR_ASSERT(chains[src].head == src);
1459 						IR_ASSERT(src == ir_chain_head(chains, b));
1460 						IR_ASSERT(chains[successor2].head == successor2);
1461 						ir_join_chains(chains, src, successor2);
1462 						if (!IR_DEBUG_BB_SCHEDULE_GRAPH) break;
1463 					}
1464 					successor2 = _ir_skip_empty_blocks(ctx, successor2);
1465 					IR_ASSERT(edges_count < max_edges_count);
1466 					edges[edges_count].from = b;
1467 					edges[edges_count].to = successor2;
1468 					edges[edges_count].freq = freq;
1469 					edges_count++;
1470 				} while (0);
1471 			} else {
1472 				int prob;
1473 
1474 				for (; n > 0; p++, n--) {
1475 					uint32_t successor = *p;
1476 					ir_block *successor_bb = &ctx->cfg_blocks[successor];
1477 					ir_insn *insn = &ctx->ir_base[successor_bb->start];
1478 
1479 					if (insn->op == IR_CASE_DEFAULT) {
1480 						prob = insn->op2;
1481 						if (!prob) {
1482 							prob = 100 / bb->successors_count;
1483 						}
1484 					} else if (insn->op == IR_CASE_VAL) {
1485 						prob = insn->op3;
1486 						if (!prob) {
1487 							prob = 100 / bb->successors_count;
1488 						}
1489 					} else if (insn->op == IR_ENTRY) {
1490 						if ((ctx->flags & IR_MERGE_EMPTY_ENTRIES) && (successor_bb->flags & IR_BB_EMPTY)) {
1491 							prob = 99; /* prefer empty ENTRY block to go first */
1492 						} else {
1493 							prob = 1;
1494 						}
1495 					} else {
1496 						prob = 100 / bb->successors_count;
1497 					}
1498 					freq = bb_freq[b] * (float)prob / 100.0f;
1499 					if (successor > b) {
1500 						IR_ASSERT(!ir_bitset_in(visited, successor));
1501 						bb_freq[successor] += freq;
1502 						ir_bitqueue_add(&worklist, successor);
1503 					}
1504 					successor = _ir_skip_empty_blocks(ctx, successor);
1505 					IR_ASSERT(edges_count < max_edges_count);
1506 					edges[edges_count].from = b;
1507 					edges[edges_count].to = successor;
1508 					edges[edges_count].freq = freq;
1509 					edges_count++;
1510 				}
1511 			}
1512 		}
1513 	}
1514 	ir_bitqueue_free(&worklist);
1515 	ir_mem_free(visited);
1516 
1517 	/* 3. Sort EDGEs according to their frequencies */
1518 	qsort(edges, edges_count, sizeof(ir_edge_info), ir_edge_info_cmp);
1519 
1520 #ifdef IR_DEBUG
1521 	if (ctx->flags & IR_DEBUG_BB_SCHEDULE) {
1522 		ir_dump_edges(ctx, edges_count, edges);
1523 	}
1524 #endif
1525 
1526 	/* 4. Process EDGEs in the decreasing frequency order and join the connected chains */
1527 	for (e = edges, i = edges_count; i > 0; e++, i--) {
1528 		uint32_t dst = chains[e->to].head;
1529 		if (dst == e->to) {
1530 			uint32_t src = chains[e->from].next;
1531 			if (chains[src].head == src) {
1532 				IR_ASSERT(src == ir_chain_head(chains, e->from) && chains[src].tail == e->from);
1533 				if (src != dst) {
1534 					ir_join_chains(chains, src, dst);
1535 				} else if (ctx->cfg_blocks[e->from].successors_count < 2) {
1536 					/* Try to rotate the loop chain to finish it with a conditional branch */
1537 					uint32_t tail = e->from;
1538 					uint32_t prev = src;
1539 					uint32_t next = chains[prev].next;
1540 					uint32_t best = 0;
1541 
1542 					while (prev != tail) {
1543 						if (ctx->ir_base[ctx->cfg_blocks[prev].end].op == IR_IF) {
1544 							if (ctx->ir_base[ctx->cfg_blocks[prev].start].op == IR_LOOP_BEGIN
1545 							 && ctx->cfg_blocks[prev].loop_depth > 1) {
1546 								best = next;
1547 								break;
1548 							} else if (!best || bb_freq[next] < bb_freq[best]) {
1549 								/* Find the successor of IF with the least frequency */
1550 								best = next;
1551 							}
1552 						}
1553 						prev = next;
1554 						next = chains[next].next;
1555 					}
1556 					if (best) {
1557 						/* change the head of the chain */
1558 						chains[src].head = best;
1559 						chains[best].head = best;
1560 					}
1561 				}
1562 #if !IR_DEBUG_BB_SCHEDULE_GRAPH
1563 				e->from = 0; /* reset "from" so that step #7 skips this edge */
1564 #endif
1565 			}
1566 		}
1567 	}
1568 
1569 #if IR_DEBUG_BB_SCHEDULE_GRAPH
1570 	if (ctx->flags & IR_DEBUG_BB_SCHEDULE) {
1571 		ir_dump_cfg_freq_graph(ctx, bb_freq, edges_count, edges, chains);
1572 	}
1573 #endif
1574 
1575 	ir_mem_free(bb_freq);
1576 
1577 #ifdef IR_DEBUG
1578 	if (ctx->flags & IR_DEBUG_BB_SCHEDULE) {
1579 		ir_dump_chains(ctx, chains);
1580 	}
1581 #endif
1582 
1583 	/* 5. Merge empty entry blocks */
1584 	if ((ctx->flags & IR_MERGE_EMPTY_ENTRIES) && ctx->entries_count) {
1585 		for (i = 0; i < ctx->entries_count; i++) {
1586 			b = ctx->entries[i];
1587 			IR_ASSERT(ctx->cfg_blocks[b].flags & IR_BB_ENTRY);
1588 			if (b && chains[b].head == b && chains[b].tail == b) {
1589 				bb = &ctx->cfg_blocks[b];
1590 				if (bb->flags & IR_BB_EMPTY) {
1591 					uint32_t successor;
1592 
1593 					do {
1594 						IR_ASSERT(bb->successors_count == 1);
1595 						successor = ctx->cfg_edges[bb->successors];
1596 						bb = &ctx->cfg_blocks[successor];
1597 					} while ((bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) == IR_BB_EMPTY);
1598 					IR_ASSERT(chains[successor].head);
1599 					ir_insert_chain_before(chains, b, successor);
1600 				}
1601 			}
1602 		}
1603 
1604 #ifdef IR_DEBUG
1605 		if (ctx->flags & IR_DEBUG_BB_SCHEDULE) {
1606 			ir_dump_chains(ctx, chains);
1607 		}
1608 #endif
1609 	}
1610 
1611 	/* 6. Align loop headers */
1612 	for (b = 1; b <= ctx->cfg_blocks_count; b++) {
1613 		if (chains[b].head == b) {
1614 			bb = &ctx->cfg_blocks[b];
1615 			if (bb->loop_depth) {
1616 				if ((bb->flags & IR_BB_LOOP_HEADER) || ir_chain_head(chains, bb->loop_header) == b) {
1617 					bb->flags |= IR_BB_ALIGN_LOOP;
1618 				}
1619 			}
1620 		}
1621 	}
1622 
1623 	/* 7. Group chains according to the most frequent edge between them */
1624 	// TODO: Try to find a better heuristic
1625 	for (e = edges, i = edges_count; i > 0; e++, i--) {
1626 #if !IR_DEBUG_BB_SCHEDULE_GRAPH
1627 		if (!e->from) continue;
1628 #endif
1629 		uint32_t src = ir_chain_head(chains, e->from);
1630 		uint32_t dst = ir_chain_head(chains, e->to);
1631 		if (src != dst) {
1632 			if (dst == 1) {
1633 				ir_join_chains(chains, dst, src);
1634 			} else {
1635 				ir_join_chains(chains, src, dst);
1636 			}
1637 		}
1638 	}
1639 
1640 #ifdef IR_DEBUG
1641 	if (ctx->flags & IR_DEBUG_BB_SCHEDULE) {
1642 		ir_dump_chains(ctx, chains);
1643 	}
1644 #endif
1645 
1646 	/* 8. Form a final BB order */
1647 	count = 0;
1648 	for (b = 1; b <= ctx->cfg_blocks_count; b++) {
1649 		if (chains[b].head == b) {
1650 			uint32_t tail = chains[b].tail;
1651 			uint32_t i = b;
1652 			while (1) {
1653 				count++;
1654 				ctx->cfg_schedule[count] = i;
1655 				if (i == tail) break;
1656 				i = chains[i].next;
1657 			}
1658 		}
1659 	}
1660 
1661 	IR_ASSERT(ctx->cfg_schedule + count == schedule_end);
1662 	ctx->cfg_schedule[ctx->cfg_blocks_count + 1] = 0;
1663 
1664 	ir_mem_free(edges);
1665 	ir_mem_free(chains);
1666 
1667 	return 1;
1668 }
1669 
1670 /* A variation of "Top-down Positioning" algorithm described by
1671  * Karl Pettis and Robert C. Hansen "Profile Guided Code Positioning"
1672  */
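/* Rough outline, as implemented below: repeatedly start a trace from an
 * unscheduled block and greedily extend it through unscheduled successors,
 * preferring deeper loop nesting and higher branch probability; empty blocks
 * are pushed to the end of the schedule. */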
1673 static int ir_schedule_blocks_top_down(ir_ctx *ctx)
1674 {
1675 	ir_bitqueue blocks;
1676 	uint32_t b, best_successor, last_non_empty;
1677 	ir_block *bb, *best_successor_bb;
1678 	ir_insn *insn;
1679 	uint32_t *list, *schedule_end;
1680 	uint32_t count = 0;
1681 
1682 	ir_bitqueue_init(&blocks, ctx->cfg_blocks_count + 1);
1683 	blocks.pos = 0;
1684 	list = ir_mem_malloc(sizeof(uint32_t) * (ctx->cfg_blocks_count + 2));
1685 	list[ctx->cfg_blocks_count + 1] = 0;
1686 	schedule_end = list + ctx->cfg_blocks_count;
1687 	for (b = 1; b <= ctx->cfg_blocks_count; b++) {
1688 		ir_bitset_incl(blocks.set, b);
1689 	}
1690 
1691 	while ((b = ir_bitqueue_pop(&blocks)) != (uint32_t)-1) {
1692 		bb = &ctx->cfg_blocks[b];
1693 		/* Start trace */
1694 		last_non_empty = 0;
1695 		do {
1696 			if (UNEXPECTED(bb->flags & IR_BB_PREV_EMPTY_ENTRY) && ir_bitqueue_in(&blocks, b - 1)) {
1697 				/* Schedule the previous empty ENTRY block before this one */
1698 				uint32_t predecessor = b - 1;
1699 
1700 				ir_bitqueue_del(&blocks, predecessor);
1701 				count++;
1702 				list[count] = predecessor;
1703 			}
1704 			if ((bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) == IR_BB_EMPTY) {
1705 				/* move empty blocks to the end */
1706 				*schedule_end = b;
1707 				schedule_end--;
1708 			} else {
1709 				count++;
1710 				list[count] = b;
1711 				last_non_empty = b;
1712 			}
1713 			best_successor_bb = NULL;
1714 			if (bb->successors_count == 1) {
1715 				best_successor = ctx->cfg_edges[bb->successors];
1716 				if (ir_bitqueue_in(&blocks, best_successor)) {
1717 					best_successor_bb = &ctx->cfg_blocks[best_successor];
1718 				}
1719 			} else if (bb->successors_count > 1) {
1720 				uint32_t prob, best_successor_prob;
1721 				uint32_t *p, successor;
1722 				ir_block *successor_bb;
1723 
1724 				for (b = 0, p = &ctx->cfg_edges[bb->successors]; b < bb->successors_count; b++, p++) {
1725 					successor = *p;
1726 					if (ir_bitqueue_in(&blocks, successor)) {
1727 						successor_bb = &ctx->cfg_blocks[successor];
1728 						insn = &ctx->ir_base[successor_bb->start];
1729 						if (insn->op == IR_IF_TRUE || insn->op == IR_IF_FALSE) {
1730 							prob = insn->op2;
1731 							if (!prob) {
1732 								prob = 100 / bb->successors_count;
1733 								if (!(successor_bb->flags & IR_BB_EMPTY)) {
1734 									prob++;
1735 								}
1736 							}
1737 						} else if (insn->op == IR_CASE_DEFAULT) {
1738 							prob = insn->op2;
1739 							if (!prob) {
1740 								prob = 100 / bb->successors_count;
1741 							}
1742 						} else if (insn->op == IR_CASE_VAL) {
1743 							prob = insn->op3;
1744 							if (!prob) {
1745 								prob = 100 / bb->successors_count;
1746 							}
1747 						} else if (insn->op == IR_ENTRY) {
1748 							if ((ctx->flags & IR_MERGE_EMPTY_ENTRIES) && (successor_bb->flags & IR_BB_EMPTY)) {
1749 								prob = 99; /* prefer empty ENTRY block to go first */
1750 							} else {
1751 								prob = 1;
1752 							}
1753 						} else {
1754 							prob = 100 / bb->successors_count;
1755 						}
1756 						if (!best_successor_bb
1757 						 || successor_bb->loop_depth > best_successor_bb->loop_depth
1758 						 || prob > best_successor_prob) {
1759 							best_successor = successor;
1760 							best_successor_bb = successor_bb;
1761 							best_successor_prob = prob;
1762 						}
1763 					}
1764 				}
1765 			}
1766 			if (!best_successor_bb) {
1767 				/* Try to continue trace using the other successor of the last IF */
1768 				if ((bb->flags & IR_BB_EMPTY) && last_non_empty) {
1769 					bb = &ctx->cfg_blocks[last_non_empty];
1770 					if (bb->successors_count == 2 && ctx->ir_base[bb->end].op == IR_IF) {
1771 						b = ctx->cfg_edges[bb->successors];
1772 
1773 						if (!ir_bitqueue_in(&blocks, b)) {
1774 							b = ctx->cfg_edges[bb->successors + 1];
1775 						}
1776 						if (ir_bitqueue_in(&blocks, b)) {
1777 							bb = &ctx->cfg_blocks[b];
1778 							ir_bitqueue_del(&blocks, b);
1779 							continue;
1780 						}
1781 					}
1782 				}
1783 				/* End trace */
1784 				break;
1785 			}
1786 			b = best_successor;
1787 			bb = best_successor_bb;
1788 			ir_bitqueue_del(&blocks, b);
1789 		} while (1);
1790 	}
1791 
1792 	IR_ASSERT(list + count == schedule_end);
1793 	ctx->cfg_schedule = list;
1794 	ir_bitqueue_free(&blocks);
1795 
1796 	return 1;
1797 }
1798 
1799 int ir_schedule_blocks(ir_ctx *ctx)
1800 {
1801 	ir_ref ref;
1802 
1803 	if (ctx->cfg_blocks_count <= 2) {
1804 		return 1;
1805 	}
1806 
1807 	/* Mark "stop" blocks terminated with UNREACHABLE as "unexpected" */
1808 	ref = ctx->ir_base[1].op1;
1809 	while (ref) {
1810 		ir_insn *insn = &ctx->ir_base[ref];
1811 
1812 		if (insn->op == IR_UNREACHABLE && ctx->ir_base[insn->op1].op != IR_TAILCALL) {
1813 			ir_block *bb = &ctx->cfg_blocks[ctx->cfg_map[ref]];
1814 			uint32_t n = bb->predecessors_count;
1815 
1816 			if (n == 1) {
1817 				ir_insn *start_insn = &ctx->ir_base[bb->start];
1818 				if (start_insn->op == IR_IF_TRUE
1819 				 || start_insn->op == IR_IF_FALSE
1820 				 || start_insn->op == IR_CASE_DEFAULT) {
1821 					if (!start_insn->op2) start_insn->op2 = 1;
1822 				} else if (start_insn->op == IR_CASE_VAL) {
1823 					if (!start_insn->op3) start_insn->op3 = 1;
1824 				}
1825 			} else if (n > 1) {
1826 				uint32_t *p = &ctx->cfg_edges[bb->predecessors];
1827 
1828 				for (; n > 0; p++, n--) {
1829 					bb = &ctx->cfg_blocks[*p];
1830 					if (bb->predecessors_count == 1) {
1831 						ir_insn *start_insn = &ctx->ir_base[bb->start];
1832 						if (start_insn->op == IR_IF_TRUE
1833 						 || start_insn->op == IR_IF_FALSE
1834 						 || start_insn->op == IR_CASE_DEFAULT) {
1835 							if (!start_insn->op2) start_insn->op2 = 1;
1836 						} else if (start_insn->op == IR_CASE_VAL) {
1837 							if (!start_insn->op3) start_insn->op3 = 1;
1838 						}
1839 					}
1840 				}
1841 			}
1842 		}
1843 		ref = insn->op3;
1844 	}
1845 
1846 	/* The bottom-up Pettis-Hansen algorithm is expensive - O(n^3) - so it is
1847 	 * used only for relatively small functions.
1848 	 *
1849 	 * TODO: make the choice between top-down and bottom-up algorithm configurable
1850 	 */
1851 	if (UNEXPECTED(ctx->flags2 & IR_IRREDUCIBLE_CFG) || ctx->cfg_blocks_count > 256) {
1852 		return ir_schedule_blocks_top_down(ctx);
1853 	} else {
1854 		return ir_schedule_blocks_bottom_up(ctx);
1855 	}
1856 }
1857 
1858 /* JMP target optimisation */
1859 uint32_t ir_skip_empty_target_blocks(const ir_ctx *ctx, uint32_t b)
1860 {
1861 	return _ir_skip_empty_blocks(ctx, b);
1862 }
1863 
1864 uint32_t ir_next_block(const ir_ctx *ctx, uint32_t b)
1865 {
1866 	ir_block *bb;
1867 
1868 	if (ctx->cfg_schedule) {
1869 		uint32_t ret = ctx->cfg_schedule[++b];
1870 
1871 		/* Check for empty ENTRY block */
1872 		while (ret && ((ctx->cfg_blocks[ret].flags & (IR_BB_START|IR_BB_EMPTY)) == IR_BB_EMPTY)) {
1873 			ret = ctx->cfg_schedule[++b];
1874 		}
1875 		return ret;
1876 	}
1877 
1878 	b++;
1879 	while (1) {
1880 		if (b > ctx->cfg_blocks_count) {
1881 			return 0;
1882 		}
1883 
1884 		bb = &ctx->cfg_blocks[b];
1885 
1886 		if ((bb->flags & (IR_BB_START|IR_BB_EMPTY)) == IR_BB_EMPTY) {
1887 			b++;
1888 		} else {
1889 			break;
1890 		}
1891 	}
1892 	return b;
1893 }
1894 
1895 void ir_get_true_false_blocks(const ir_ctx *ctx, uint32_t b, uint32_t *true_block, uint32_t *false_block)
1896 {
1897 	ir_block *bb;
1898 	uint32_t *p, use_block;
1899 
1900 	*true_block = 0;
1901 	*false_block = 0;
1902 	bb = &ctx->cfg_blocks[b];
1903 	IR_ASSERT(ctx->ir_base[bb->end].op == IR_IF);
1904 	IR_ASSERT(bb->successors_count == 2);
1905 	p = &ctx->cfg_edges[bb->successors];
1906 	use_block = *p;
1907 	if (ctx->ir_base[ctx->cfg_blocks[use_block].start].op == IR_IF_TRUE) {
1908 		*true_block = ir_skip_empty_target_blocks(ctx, use_block);
1909 		use_block = *(p+1);
1910 		IR_ASSERT(ctx->ir_base[ctx->cfg_blocks[use_block].start].op == IR_IF_FALSE);
1911 		*false_block = ir_skip_empty_target_blocks(ctx, use_block);
1912 	} else {
1913 		IR_ASSERT(ctx->ir_base[ctx->cfg_blocks[use_block].start].op == IR_IF_FALSE);
1914 		*false_block = ir_skip_empty_target_blocks(ctx, use_block);
1915 		use_block = *(p+1);
1916 		IR_ASSERT(ctx->ir_base[ctx->cfg_blocks[use_block].start].op == IR_IF_TRUE);
1917 		*true_block = ir_skip_empty_target_blocks(ctx, use_block);
1918 	}
1919 	IR_ASSERT(*true_block && *false_block);
1920 }
1921