xref: /php-src/ext/opcache/jit/ir/ir_cfg.c (revision bb21d195)
1 /*
2  * IR - Lightweight JIT Compilation Framework
3  * (CFG - Control Flow Graph)
4  * Copyright (C) 2022 Zend by Perforce.
5  * Authors: Dmitry Stogov <dmitry@php.net>
6  */
7 
8 #include "ir.h"
9 #include "ir_private.h"
10 
11 IR_ALWAYS_INLINE void _ir_add_successors(const ir_ctx *ctx, ir_ref ref, ir_worklist *worklist)
12 {
13 	ir_use_list *use_list = &ctx->use_lists[ref];
14 	ir_ref *p, use, n = use_list->count;
15 
16 	if (n < 2) {
17 		if (n == 1) {
18 			use = ctx->use_edges[use_list->refs];
19 			IR_ASSERT(ir_op_flags[ctx->ir_base[use].op] & IR_OP_FLAG_CONTROL);
20 			ir_worklist_push(worklist, use);
21 		}
22 	} else {
23 		p = &ctx->use_edges[use_list->refs];
24 		if (n == 2) {
25 			use = *p;
26 			IR_ASSERT(ir_op_flags[ctx->ir_base[use].op] & IR_OP_FLAG_CONTROL);
27 			ir_worklist_push(worklist, use);
28 			use = *(p + 1);
29 			IR_ASSERT(ir_op_flags[ctx->ir_base[use].op] & IR_OP_FLAG_CONTROL);
30 			ir_worklist_push(worklist, use);
31 		} else {
32 			for (; n > 0; p++, n--) {
33 				use = *p;
34 				IR_ASSERT(ir_op_flags[ctx->ir_base[use].op] & IR_OP_FLAG_CONTROL);
35 				ir_worklist_push(worklist, use);
36 			}
37 		}
38 	}
39 }
40 
41 IR_ALWAYS_INLINE void _ir_add_predecessors(const ir_insn *insn, ir_worklist *worklist)
42 {
43 	ir_ref n, ref;
44 	const ir_ref *p;
45 
46 	if (insn->op == IR_MERGE || insn->op == IR_LOOP_BEGIN) {
47 		n = insn->inputs_count;
48 		for (p = insn->ops + 1; n > 0; p++, n--) {
49 			ref = *p;
50 			IR_ASSERT(ref);
51 			ir_worklist_push(worklist, ref);
52 		}
53 	} else if (insn->op != IR_START) {
54 		if (EXPECTED(insn->op1)) {
55 			ir_worklist_push(worklist, insn->op1);
56 		}
57 	}
58 }
59 
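/*
 * CFG construction overview (a summary of the function below):
 *  1. Walk the terminator list (ctx->ir_base[1].op1, linked through op3) and do a
 *     backward DFS over the control edges.  Every terminator marks a BB end;
 *     following the op1 links up to a BB-start node gives the matching BB start.
 *  2. Successors of IF/SWITCH nodes that the backward walk could not reach (e.g.
 *     branches ending in infinite loops) are collected in "bb_leaks" and handled
 *     by a complementary forward DFS.
 *  3. The ctx->cfg_blocks/ctx->cfg_edges arrays are then filled in and, unless
 *     SCCP already guaranteed reachability (IR_CFG_REACHABLE), unreachable blocks
 *     are removed.
 */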
60 int ir_build_cfg(ir_ctx *ctx)
61 {
62 	ir_ref n, *p, ref, start, end, next;
63 	uint32_t b;
64 	ir_insn *insn;
65 	ir_worklist worklist;
66 	uint32_t bb_init_flags;
67 	uint32_t count, bb_count = 0;
68 	uint32_t edges_count = 0;
69 	ir_block *blocks, *bb;
70 	uint32_t *_blocks, *edges;
71 	ir_use_list *use_list;
72 	uint32_t len = ir_bitset_len(ctx->insns_count);
73 	ir_bitset bb_starts = ir_mem_calloc(len * 2, IR_BITSET_BITS / 8);
74 	ir_bitset bb_leaks = bb_starts + len;
75 	_blocks = ir_mem_calloc(ctx->insns_count, sizeof(uint32_t));
76 	ir_worklist_init(&worklist, ctx->insns_count);
77 
78 	/* First try a backward DFS starting from the "stop" nodes */
79 
80 	/* Add all "stop" nodes */
81 	ref = ctx->ir_base[1].op1;
82 	while (ref) {
83 		ir_worklist_push(&worklist, ref);
84 		ref = ctx->ir_base[ref].op3;
85 	}
86 
87 	while (ir_worklist_len(&worklist)) {
88 		ref = ir_worklist_pop(&worklist);
89 		insn = &ctx->ir_base[ref];
90 
91 		if (insn->op == IR_NOP) {
92 			continue;
93 		}
94 		IR_ASSERT(IR_IS_BB_END(insn->op));
95 		/* Remember BB end */
96 		end = ref;
97 		/* Some successors of IF and SWITCH nodes may be inaccessible by backward DFS */
98 		use_list = &ctx->use_lists[end];
99 		n = use_list->count;
100 		if (n > 1) {
101 			for (p = &ctx->use_edges[use_list->refs]; n > 0; p++, n--) {
102 				/* Remember possibly inaccessible successors */
103 				ir_bitset_incl(bb_leaks, *p);
104 			}
105 		}
106 		/* Skip control nodes until BB start */
107 		ref = insn->op1;
108 		while (1) {
109 			insn = &ctx->ir_base[ref];
110 			if (IR_IS_BB_START(insn->op)) {
111 				break;
112 			}
113 			ref = insn->op1; // follow the chain of control nodes until the BB start
114 		}
115 		/* Mark BB Start */
116 		bb_count++;
117 		_blocks[ref] = end;
118 		ir_bitset_incl(bb_starts, ref);
119 		/* Add predecessors */
120 		_ir_add_predecessors(insn, &worklist);
121 	}
122 
123 	/* The backward DFS may miss branches that end in infinite loops.  */
124 	/* Try a forward DFS (in most cases all nodes are already processed). */
125 
126 	/* START node may be inaccessible from "stop" nodes */
127 	ir_bitset_incl(bb_leaks, 1);
128 
129 	/* Add the unprocessed START node and the successors of IF and SWITCH nodes */
130 	IR_BITSET_FOREACH_DIFFERENCE(bb_leaks, bb_starts, len, start) {
131 		ir_worklist_push(&worklist, start);
132 	} IR_BITSET_FOREACH_END();
133 
134 	if (ir_worklist_len(&worklist)) {
135 		ir_bitset_union(worklist.visited, bb_starts, len);
136 		do {
137 			ref = ir_worklist_pop(&worklist);
138 			insn = &ctx->ir_base[ref];
139 
140 			if (insn->op == IR_NOP) {
141 				continue;
142 			}
143 			IR_ASSERT(IR_IS_BB_START(insn->op));
144 			/* Remember BB start */
145 			start = ref;
146 			/* Skip control nodes until BB end */
147 			while (1) {
148 				use_list = &ctx->use_lists[ref];
149 				n = use_list->count;
150 				next = IR_UNUSED;
151 				for (p = &ctx->use_edges[use_list->refs]; n > 0; p++, n--) {
152 					next = *p;
153 					insn = &ctx->ir_base[next];
154 					if ((ir_op_flags[insn->op] & IR_OP_FLAG_CONTROL) && insn->op1 == ref) {
155 						break;
156 					}
157 				}
158 				IR_ASSERT(next != IR_UNUSED);
159 				ref = next;
160 				if (IR_IS_BB_END(insn->op)) {
161 					break;
162 				}
163 			}
164 			/* Mark BB Start */
165 			bb_count++;
166 			_blocks[start] = ref;
167 			ir_bitset_incl(bb_starts, start);
168 			/* Add successors */
169 			_ir_add_successors(ctx, ref, &worklist);
170 		} while (ir_worklist_len(&worklist));
171 	}
172 
173 	IR_ASSERT(bb_count > 0);
174 
175 	/* Create the array of basic blocks and count successor/predecessor edges for each BB */
176 	blocks = ir_mem_malloc((bb_count + 1) * sizeof(ir_block));
177 	b = 1;
178 	bb = blocks + 1;
179 	count = 0;
180 	/* SCCP already removed UNREACHABLE blocks; otherwise all blocks are initially marked as UNREACHABLE */
181 	bb_init_flags = (ctx->flags2 & IR_CFG_REACHABLE) ? 0 : IR_BB_UNREACHABLE;
182 	IR_BITSET_FOREACH(bb_starts, len, start) {
183 		insn = &ctx->ir_base[start];
184 		if (insn->op == IR_NOP) {
185 			_blocks[start] = 0;
186 			continue;
187 		}
188 		end = _blocks[start];
189 		_blocks[start] = b;
190 		_blocks[end] = b;
191 		IR_ASSERT(IR_IS_BB_START(insn->op));
192 		IR_ASSERT(end > start);
193 		bb->start = start;
194 		bb->end = end;
195 		bb->successors = count;
196 		count += ctx->use_lists[end].count;
197 		bb->successors_count = 0;
198 		bb->predecessors = count;
199 		bb->dom_parent = 0;
200 		bb->dom_depth = 0;
201 		bb->dom_child = 0;
202 		bb->dom_next_child = 0;
203 		bb->loop_header = 0;
204 		bb->loop_depth = 0;
205 		if (insn->op == IR_START) {
206 			bb->flags = IR_BB_START;
207 			bb->predecessors_count = 0;
208 		} else {
209 			bb->flags = bb_init_flags;
210 			if (insn->op == IR_MERGE || insn->op == IR_LOOP_BEGIN) {
211 				n = insn->inputs_count;
212 				bb->predecessors_count = n;
213 				edges_count += n;
214 				count += n;
215 			} else if (EXPECTED(insn->op1)) {
216 				if (insn->op == IR_ENTRY) {
217 					bb->flags |= IR_BB_ENTRY;
218 					ctx->entries_count++;
219 				}
220 				bb->predecessors_count = 1;
221 				edges_count++;
222 				count++;
223 			} else {
224 				IR_ASSERT(insn->op == IR_BEGIN); /* start of unreachable block */
225 				bb->predecessors_count = 0;
226 			}
227 		}
228 		b++;
229 		bb++;
230 	} IR_BITSET_FOREACH_END();
231 	bb_count = b - 1;
232 	IR_ASSERT(count == edges_count * 2);
233 	ir_mem_free(bb_starts);
234 
235 	/* Create the array of successor/predecessor control edges */
236 	edges = ir_mem_malloc(edges_count * 2 * sizeof(uint32_t));
237 	bb = blocks + 1;
238 	for (b = 1; b <= bb_count; b++, bb++) {
239 		insn = &ctx->ir_base[bb->start];
240 		if (bb->predecessors_count > 1) {
241 			uint32_t *q = edges + bb->predecessors;
242 			n = insn->inputs_count;
243 			for (p = insn->ops + 1; n > 0; p++, q++, n--) {
244 				ref = *p;
245 				IR_ASSERT(ref);
246 				ir_ref pred_b = _blocks[ref];
247 				ir_block *pred_bb = &blocks[pred_b];
248 				*q = pred_b;
249 				edges[pred_bb->successors + pred_bb->successors_count++] = b;
250 			}
251 		} else if (bb->predecessors_count == 1) {
252 			ref = insn->op1;
253 			IR_ASSERT(ref);
254 			IR_ASSERT(IR_OPND_KIND(ir_op_flags[insn->op], 1) == IR_OPND_CONTROL);
255 			ir_ref pred_b = _blocks[ref];
256 			ir_block *pred_bb = &blocks[pred_b];
257 			edges[bb->predecessors] = pred_b;
258 			edges[pred_bb->successors + pred_bb->successors_count++] = b;
259 		}
260 	}
261 
262 	ctx->cfg_blocks_count = bb_count;
263 	ctx->cfg_edges_count = edges_count * 2;
264 	ctx->cfg_blocks = blocks;
265 	ctx->cfg_edges = edges;
266 	ctx->cfg_map = _blocks;
267 
268 	if (!(ctx->flags2 & IR_CFG_REACHABLE)) {
269 		uint32_t reachable_count = 0;
270 
271 		/* Mark reachable blocks */
272 		ir_worklist_clear(&worklist);
273 		ir_worklist_push(&worklist, 1);
274 		while (ir_worklist_len(&worklist) != 0) {
275 			uint32_t *p;
276 
277 			reachable_count++;
278 			b = ir_worklist_pop(&worklist);
279 			bb = &blocks[b];
280 			bb->flags &= ~IR_BB_UNREACHABLE;
281 			n = bb->successors_count;
282 			if (n > 1) {
283 				for (p = edges + bb->successors; n > 0; p++, n--) {
284 					ir_worklist_push(&worklist, *p);
285 				}
286 			} else if (n == 1) {
287 				ir_worklist_push(&worklist, edges[bb->successors]);
288 			}
289 		}
290 		if (reachable_count != ctx->cfg_blocks_count) {
291 			ir_remove_unreachable_blocks(ctx);
292 		}
293 	}
294 
295 	ir_worklist_free(&worklist);
296 
297 	return 1;
298 }
299 
300 static void ir_remove_predecessor(ir_ctx *ctx, ir_block *bb, uint32_t from)
301 {
302 	uint32_t i, *p, *q, n = 0;
303 
304 	p = q = &ctx->cfg_edges[bb->predecessors];
305 	for (i = 0; i < bb->predecessors_count; i++, p++) {
306 		if (*p != from) {
307 			if (p != q) {
308 				*q = *p;
309 			}
310 			q++;
311 			n++;
312 		}
313 	}
314 	IR_ASSERT(n != bb->predecessors_count);
315 	bb->predecessors_count = n;
316 }
317 
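/*
 * Removes the control input "from" from a MERGE/LOOP_BEGIN node and keeps the
 * dependent PHIs consistent: the surviving inputs are compacted and the matching
 * PHI operands are shifted accordingly; if only a single input remains, the MERGE
 * is degraded to a plain BEGIN and every dependent PHI becomes a COPY of its one
 * remaining operand.
 */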
318 static void ir_remove_merge_input(ir_ctx *ctx, ir_ref merge, ir_ref from)
319 {
320 	ir_ref i, j, n, k, *p, use;
321 	ir_insn *use_insn;
322 	ir_use_list *use_list;
323 	ir_bitset life_inputs;
324 	ir_insn *insn = &ctx->ir_base[merge];
325 
326 	IR_ASSERT(insn->op == IR_MERGE || insn->op == IR_LOOP_BEGIN);
327 	n = insn->inputs_count;
328 	i = 1;
329 	life_inputs = ir_bitset_malloc(n + 1);
330 	for (j = 1; j <= n; j++) {
331 		ir_ref input = ir_insn_op(insn, j);
332 
333 		if (input != from) {
334 			if (i != j) {
335 				ir_insn_set_op(insn, i, input);
336 			}
337 			ir_bitset_incl(life_inputs, j);
338 			i++;
339 		}
340 	}
341 	i--;
342 	if (i == 1) {
343 		insn->op = IR_BEGIN;
344 		insn->inputs_count = 1;
345 		use_list = &ctx->use_lists[merge];
346 		if (use_list->count > 1) {
347 			for (k = 0, p = &ctx->use_edges[use_list->refs]; k < use_list->count; k++, p++) {
348 				use = *p;
349 				use_insn = &ctx->ir_base[use];
350 				if (use_insn->op == IR_PHI) {
351 					/* Convert PHI to COPY */
352 					i = 2;
353 					for (j = 2; j <= n; j++) {
354 						ir_ref input = ir_insn_op(use_insn, j);
355 
356 						if (ir_bitset_in(life_inputs, j - 1)) {
357 							use_insn->op1 = ir_insn_op(use_insn, j);
358 						} else if (input > 0) {
359 							ir_use_list_remove_all(ctx, input, use);
360 						}
361 					}
362 					use_insn->op = IR_COPY;
363 					use_insn->op2 = IR_UNUSED;
364 					use_insn->op3 = IR_UNUSED;
365 					ir_use_list_remove_all(ctx, merge, use);
366 				}
367 			}
368 		}
369 	} else {
370 		insn->inputs_count = i;
371 
372 		n++;
373 		use_list = &ctx->use_lists[merge];
374 		if (use_list->count > 1) {
375 			for (k = 0, p = &ctx->use_edges[use_list->refs]; k < use_list->count; k++, p++) {
376 				use = *p;
377 				use_insn = &ctx->ir_base[use];
378 				if (use_insn->op == IR_PHI) {
379 					i = 2;
380 					for (j = 2; j <= n; j++) {
381 						ir_ref input = ir_insn_op(use_insn, j);
382 
383 						if (ir_bitset_in(life_inputs, j - 1)) {
384 							IR_ASSERT(input);
385 							if (i != j) {
386 								ir_insn_set_op(use_insn, i, input);
387 							}
388 							i++;
389 						} else if (input > 0) {
390 							ir_use_list_remove_all(ctx, input, use);
391 						}
392 					}
393 				}
394 			}
395 		}
396 	}
397 	ir_mem_free(life_inputs);
398 	ir_use_list_remove_all(ctx, from, merge);
399 }
400 
401 /* A CFG constructed after the SCCP pass has no unreachable BBs; otherwise they have to be removed here */
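/*
 * For every block marked IR_BB_UNREACHABLE this pass detaches it from its reachable
 * successors (their predecessor edge lists and MERGE inputs are fixed up through
 * ir_remove_predecessor()/ir_remove_merge_input()), or unlinks its terminator from
 * the terminators list when it has no successors, and clears its cfg_map entries.
 * The surviving blocks are then compacted and the successor/predecessor edges are
 * rebuilt from the renumbered cfg_map.
 */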
402 int ir_remove_unreachable_blocks(ir_ctx *ctx)
403 {
404 	uint32_t b, *p, i;
405 	uint32_t unreachable_count = 0;
406 	uint32_t bb_count = ctx->cfg_blocks_count;
407 	ir_block *bb = ctx->cfg_blocks + 1;
408 
409 	for (b = 1; b <= bb_count; b++, bb++) {
410 		if (bb->flags & IR_BB_UNREACHABLE) {
411 #if 0
412 			do {if (!unreachable_count) ir_dump_cfg(ctx, stderr);} while(0);
413 #endif
414 			if (bb->successors_count) {
415 				for (i = 0, p = &ctx->cfg_edges[bb->successors]; i < bb->successors_count; i++, p++) {
416 					ir_block *succ_bb = &ctx->cfg_blocks[*p];
417 
418 					if (!(succ_bb->flags & IR_BB_UNREACHABLE)) {
419 						ir_remove_predecessor(ctx, succ_bb, b);
420 						ir_remove_merge_input(ctx, succ_bb->start, bb->end);
421 					}
422 				}
423 			} else {
424 				ir_ref prev, ref = bb->end;
425 				ir_insn *insn = &ctx->ir_base[ref];
426 
427 				IR_ASSERT(ir_op_flags[insn->op] & IR_OP_FLAG_TERMINATOR);
428 				/* remove from terminators list */
429 				prev = ctx->ir_base[1].op1;
430 				if (prev == ref) {
431 					ctx->ir_base[1].op1 = insn->op3;
432 				} else {
433 					while (prev) {
434 						if (ctx->ir_base[prev].op3 == ref) {
435 							ctx->ir_base[prev].op3 = insn->op3;
436 							break;
437 						}
438 						prev = ctx->ir_base[prev].op3;
439 					}
440 				}
441 			}
442 			ctx->cfg_map[bb->start] = 0;
443 			ctx->cfg_map[bb->end] = 0;
444 			unreachable_count++;
445 		}
446 	}
447 
448 	if (unreachable_count) {
449 		ir_block *dst_bb;
450 		uint32_t n = 1;
451 		uint32_t *edges;
452 
453 		dst_bb = bb = ctx->cfg_blocks + 1;
454 		for (b = 1; b <= bb_count; b++, bb++) {
455 			if (!(bb->flags & IR_BB_UNREACHABLE)) {
456 				if (dst_bb != bb) {
457 					memcpy(dst_bb, bb, sizeof(ir_block));
458 					ctx->cfg_map[dst_bb->start] = n;
459 					ctx->cfg_map[dst_bb->end] = n;
460 				}
461 				dst_bb->successors_count = 0;
462 				dst_bb++;
463 				n++;
464 			}
465 		}
466 		ctx->cfg_blocks_count = bb_count = n - 1;
467 
468 		/* Rebuild the successor/predecessor control edges */
469 		edges = ctx->cfg_edges;
470 		bb = ctx->cfg_blocks + 1;
471 		for (b = 1; b <= bb_count; b++, bb++) {
472 			ir_insn *insn = &ctx->ir_base[bb->start];
473 			ir_ref *p, ref;
474 
475 			n = bb->predecessors_count;
476 			if (n > 1) {
477 				uint32_t *q = edges + bb->predecessors;
478 
479 				IR_ASSERT(n == insn->inputs_count);
480 				for (p = insn->ops + 1; n > 0; p++, q++, n--) {
481 					ref = *p;
482 					IR_ASSERT(ref);
483 					ir_ref pred_b = ctx->cfg_map[ref];
484 					ir_block *pred_bb = &ctx->cfg_blocks[pred_b];
485 					*q = pred_b;
486 					edges[pred_bb->successors + pred_bb->successors_count++] = b;
487 				}
488 			} else if (n == 1) {
489 				ref = insn->op1;
490 				IR_ASSERT(ref);
491 				IR_ASSERT(IR_OPND_KIND(ir_op_flags[insn->op], 1) == IR_OPND_CONTROL);
492 				ir_ref pred_b = ctx->cfg_map[ref];
493 				ir_block *pred_bb = &ctx->cfg_blocks[pred_b];
494 				edges[bb->predecessors] = pred_b;
495 				edges[pred_bb->successors + pred_bb->successors_count++] = b;
496 			}
497 		}
498 	}
499 
500 	return 1;
501 }
502 
503 #if 0
504 static void compute_postnum(const ir_ctx *ctx, uint32_t *cur, uint32_t b)
505 {
506 	uint32_t i, *p;
507 	ir_block *bb = &ctx->cfg_blocks[b];
508 
509 	if (bb->postnum != 0) {
510 		return;
511 	}
512 
513 	if (bb->successors_count) {
514 		bb->postnum = -1; /* Marker for "currently visiting" */
515 		p = ctx->cfg_edges + bb->successors;
516 		i = bb->successors_count;
517 		do {
518 			compute_postnum(ctx, cur, *p);
519 			p++;
520 		} while (--i);
521 	}
522 	bb->postnum = (*cur)++;
523 }
524 
525 /* Computes dominator tree using algorithm from "A Simple, Fast Dominance Algorithm" by
526  * Cooper, Harvey and Kennedy. */
527 int ir_build_dominators_tree(ir_ctx *ctx)
528 {
529 	uint32_t blocks_count, b, postnum;
530 	ir_block *blocks, *bb;
531 	uint32_t *edges;
532 	bool changed;
533 
534 	ctx->flags2 &= ~IR_NO_LOOPS;
535 
536 	postnum = 1;
537 	compute_postnum(ctx, &postnum, 1);
538 
539 	/* Find immediate dominators */
540 	blocks = ctx->cfg_blocks;
541 	edges  = ctx->cfg_edges;
542 	blocks_count = ctx->cfg_blocks_count;
543 	blocks[1].idom = 1;
544 	do {
545 		changed = 0;
546 		/* Iterating in Reverse Post Order */
547 		for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) {
548 			IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
549 			if (bb->predecessors_count == 1) {
550 				uint32_t pred_b = edges[bb->predecessors];
551 
552 				if (blocks[pred_b].idom <= 0) {
553 					//IR_ASSERT("Wrong blocks order: BB is before its single predecessor");
554 				} else if (bb->idom != pred_b) {
555 					bb->idom = pred_b;
556 					changed = 1;
557 				}
558 			} else if (bb->predecessors_count) {
559 				uint32_t idom = 0;
560 				uint32_t k = bb->predecessors_count;
561 				uint32_t *p = edges + bb->predecessors;
562 
563 				do {
564 					uint32_t pred_b = *p;
565 					ir_block *pred_bb = &blocks[pred_b];
566 					ir_block *idom_bb;
567 
568 					if (pred_bb->idom > 0) {
569 						idom = pred_b;
570 						idom_bb = &blocks[idom];
571 
572 						while (--k > 0) {
573 							pred_b = *(++p);
574 							pred_bb = &blocks[pred_b];
575 							if (pred_bb->idom > 0) {
576 								while (idom != pred_b) {
577 									while (pred_bb->postnum < idom_bb->postnum) {
578 										pred_b = pred_bb->idom;
579 										pred_bb = &blocks[pred_b];
580 									}
581 									while (idom_bb->postnum < pred_bb->postnum) {
582 										idom = idom_bb->idom;
583 										idom_bb = &blocks[idom];
584 									}
585 								}
586 							}
587 						}
588 
589 						if (bb->idom != idom) {
590 							bb->idom = idom;
591 							changed = 1;
592 						}
593 						break;
594 					}
595 					p++;
596 				} while (--k > 0);
597 			}
598 		}
599 	} while (changed);
600 	blocks[1].idom = 0;
601 	blocks[1].dom_depth = 0;
602 
603 	/* Construct dominators tree */
604 	for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) {
605 		IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
606 		if (bb->idom > 0) {
607 			ir_block *idom_bb = &blocks[bb->idom];
608 
609 			bb->dom_depth = idom_bb->dom_depth + 1;
610 			/* Sort by block number to traverse children in pre-order */
611 			if (idom_bb->dom_child == 0) {
612 				idom_bb->dom_child = b;
613 			} else if (b < idom_bb->dom_child) {
614 				bb->dom_next_child = idom_bb->dom_child;
615 				idom_bb->dom_child = b;
616 			} else {
617 				int child = idom_bb->dom_child;
618 				ir_block *child_bb = &blocks[child];
619 
620 				while (child_bb->dom_next_child > 0 && b > child_bb->dom_next_child) {
621 					child = child_bb->dom_next_child;
622 					child_bb = &blocks[child];
623 				}
624 				bb->dom_next_child = child_bb->dom_next_child;
625 				child_bb->dom_next_child = b;
626 			}
627 		}
628 	}
629 
630 	return 1;
631 }
632 #else
633 /* A single-pass modification of "A Simple, Fast Dominance Algorithm" by
634  * Cooper, Harvey and Kennedy that relies on the IR block ordering.
635  * It may fall back to the general, slower fixed-point algorithm.  */
636 static int ir_build_dominators_tree_iterative(ir_ctx *ctx);
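/*
 * The core of the Cooper/Harvey/Kennedy scheme is an "intersect" step that walks
 * two candidate dominators up the idom chain until they meet.  Because blocks are
 * numbered in reverse post order (a processed immediate dominator always has a
 * smaller number than the block it dominates), it boils down to the following
 * sketch; "intersect" is an illustrative helper, the functions below inline these
 * loops directly:
 *
 *     static uint32_t intersect(const ir_block *blocks, uint32_t b1, uint32_t b2)
 *     {
 *         while (b1 != b2) {
 *             while (b1 > b2) b1 = blocks[b1].idom;
 *             while (b2 > b1) b2 = blocks[b2].idom;
 *         }
 *         return b1;
 *     }
 */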
637 int ir_build_dominators_tree(ir_ctx *ctx)
638 {
639 	uint32_t blocks_count, b;
640 	ir_block *blocks, *bb;
641 	uint32_t *edges;
642 	ir_list worklist;
643 
644 	ir_list_init(&worklist, ctx->cfg_blocks_count / 2);
645 
646 	ctx->flags2 |= IR_NO_LOOPS;
647 
648 	/* Find immediate dominators */
649 	blocks = ctx->cfg_blocks;
650 	edges  = ctx->cfg_edges;
651 	blocks_count = ctx->cfg_blocks_count;
652 	blocks[1].idom = 1;
653 	blocks[1].dom_depth = 0;
654 
655 	/* Iterating in Reverse Post Order */
656 	for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) {
657 		IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
658 		IR_ASSERT(bb->predecessors_count > 0);
659 		uint32_t k = bb->predecessors_count;
660 		uint32_t *p = edges + bb->predecessors;
661 		uint32_t idom = *p;
662 		ir_block *idom_bb;
663 
664 		if (UNEXPECTED(idom > b)) {
665 			/* In rare cases, LOOP_BEGIN.op1 may be a back-edge. Skip back-edges. */
666 			ctx->flags2 &= ~IR_NO_LOOPS;
667 			IR_ASSERT(k > 1 && "Wrong blocks order: BB is before its single predecessor");
668 			ir_list_push(&worklist, idom);
669 			while (1) {
670 				k--;
671 				p++;
672 				idom = *p;
673 				if (idom < b) {
674 					break;
675 				}
676 				IR_ASSERT(k > 0);
677 				ir_list_push(&worklist, idom);
678 			}
679 		}
680 		IR_ASSERT(blocks[idom].idom > 0);
681 		IR_ASSERT(k != 0);
682 
683 		while (--k > 0) {
684 			uint32_t pred_b = *(++p);
685 
686 			if (pred_b < b) {
687 				IR_ASSERT(blocks[pred_b].idom > 0);
688 				while (idom != pred_b) {
689 					while (pred_b > idom) {
690 						pred_b = blocks[pred_b].idom;
691 					}
692 					while (idom > pred_b) {
693 						idom = blocks[idom].idom;
694 					}
695 				}
696 			} else {
697 				ctx->flags2 &= ~IR_NO_LOOPS;
698 				IR_ASSERT(bb->predecessors_count > 1);
699 				ir_list_push(&worklist, pred_b);
700 			}
701 		}
702 		bb->idom = idom;
703 		idom_bb = &blocks[idom];
704 
705 		bb->dom_depth = idom_bb->dom_depth + 1;
706 		/* Sort by block number to traverse children in pre-order */
707 		if (idom_bb->dom_child == 0) {
708 			idom_bb->dom_child = b;
709 		} else if (b < idom_bb->dom_child) {
710 			bb->dom_next_child = idom_bb->dom_child;
711 			idom_bb->dom_child = b;
712 		} else {
713 			int child = idom_bb->dom_child;
714 			ir_block *child_bb = &blocks[child];
715 
716 			while (child_bb->dom_next_child > 0 && b > child_bb->dom_next_child) {
717 				child = child_bb->dom_next_child;
718 				child_bb = &blocks[child];
719 			}
720 			bb->dom_next_child = child_bb->dom_next_child;
721 			child_bb->dom_next_child = b;
722 		}
723 	}
724 
725 	blocks[1].idom = 0;
726 
727 	if (ir_list_len(&worklist) != 0) {
728 		uint32_t dom_depth;
729 		uint32_t succ_b;
730 		bool complete = 1;
731 
732 		/* Check if all the back-edges lead to the loop headers */
733 		do {
734 			b = ir_list_pop(&worklist);
735 			bb = &blocks[b];
736 			succ_b = ctx->cfg_edges[bb->successors];
737 			if (bb->successors_count != 1) {
738 				/* LOOP_END/END may be linked with the following ENTRY by a fake edge */
739 				IR_ASSERT(bb->successors_count == 2);
740 				if (blocks[succ_b].flags & IR_BB_ENTRY) {
741 					succ_b = ctx->cfg_edges[bb->successors + 1];
742 				} else {
743 					IR_ASSERT(blocks[ctx->cfg_edges[bb->successors + 1]].flags & IR_BB_ENTRY);
744 				}
745 			}
746 			dom_depth = blocks[succ_b].dom_depth;
747 			while (bb->dom_depth > dom_depth) {
748 				b = bb->dom_parent;
749 				bb = &blocks[b];
750 			}
751 			if (UNEXPECTED(b != succ_b)) {
752 				complete = 0;
753 				break;
754 			}
755 		} while (ir_list_len(&worklist) != 0);
756 
757 		if (UNEXPECTED(!complete)) {
758 			ir_list_free(&worklist);
759 			return ir_build_dominators_tree_iterative(ctx);
760 		}
761 	}
762 
763 	ir_list_free(&worklist);
764 
765 	return 1;
766 }
767 
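/*
 * Fallback for the single-pass walk above: when a back-edge does not lead to an
 * already identified loop header, the dominator tree links are cleared (the idoms
 * found so far are kept as an initial guess) and the classical fixed-point variant
 * of the algorithm is iterated until no immediate dominator changes.
 */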
768 static int ir_build_dominators_tree_iterative(ir_ctx *ctx)
769 {
770 	bool changed;
771 	uint32_t blocks_count, b;
772 	ir_block *blocks, *bb;
773 	uint32_t *edges;
774 
775 	/* Find immediate dominators */
776 	blocks = ctx->cfg_blocks;
777 	edges  = ctx->cfg_edges;
778 	blocks_count = ctx->cfg_blocks_count;
779 
780 	/* Clear the dominators tree, but keep already found dominators */
781 	for (b = 0, bb = &blocks[0]; b <= blocks_count; b++, bb++) {
782 		bb->dom_depth = 0;
783 		bb->dom_child = 0;
784 		bb->dom_next_child = 0;
785 	}
786 
787 	/* Find immediate dominators by iterative fixed-point algorithm */
788 	blocks[1].idom = 1;
789 	do {
790 		changed = 0;
791 
792 		for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) {
793 			IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
794 			IR_ASSERT(bb->predecessors_count > 0);
795 			uint32_t k = bb->predecessors_count;
796 			uint32_t *p = edges + bb->predecessors;
797 			uint32_t idom = *p;
798 
799 			if (blocks[idom].idom == 0) {
800 				while (1) {
801 					k--;
802 					p++;
803 					idom = *p;
804 					if (blocks[idom].idom > 0) {
805 						break;
806 					}
807 					IR_ASSERT(k > 0);
808 				}
809 			}
810 			IR_ASSERT(k != 0);
811 			while (--k > 0) {
812 				uint32_t pred_b = *(++p);
813 
814 				if (blocks[pred_b].idom > 0) {
815 					IR_ASSERT(blocks[pred_b].idom > 0);
816 					while (idom != pred_b) {
817 						while (pred_b > idom) {
818 							pred_b = blocks[pred_b].idom;
819 						}
820 						while (idom > pred_b) {
821 							idom = blocks[idom].idom;
822 						}
823 					}
824 				}
825 			}
826 			if (bb->idom != idom) {
827 				bb->idom = idom;
828 				changed = 1;
829 			}
830 		}
831 	} while (changed);
832 
833 	/* Build dominators tree */
834 	blocks[1].idom = 0;
835 	blocks[1].dom_depth = 0;
836 	for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) {
837 		uint32_t idom = bb->idom;
838 		ir_block *idom_bb = &blocks[idom];
839 
840 		bb->dom_depth = idom_bb->dom_depth + 1;
841 		/* Sort by block number to traverse children in pre-order */
842 		if (idom_bb->dom_child == 0) {
843 			idom_bb->dom_child = b;
844 		} else if (b < idom_bb->dom_child) {
845 			bb->dom_next_child = idom_bb->dom_child;
846 			idom_bb->dom_child = b;
847 		} else {
848 			int child = idom_bb->dom_child;
849 			ir_block *child_bb = &blocks[child];
850 
851 			while (child_bb->dom_next_child > 0 && b > child_bb->dom_next_child) {
852 				child = child_bb->dom_next_child;
853 				child_bb = &blocks[child];
854 			}
855 			bb->dom_next_child = child_bb->dom_next_child;
856 			child_bb->dom_next_child = b;
857 		}
858 	}
859 
860 	return 1;
861 }
862 #endif
863 
864 static bool ir_dominates(const ir_block *blocks, uint32_t b1, uint32_t b2)
865 {
866 	uint32_t b1_depth = blocks[b1].dom_depth;
867 	const ir_block *bb2 = &blocks[b2];
868 
869 	while (bb2->dom_depth > b1_depth) {
870 		b2 = bb2->dom_parent;
871 		bb2 = &blocks[b2];
872 	}
873 	return b1 == b2;
874 }
875 
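/*
 * Loop discovery below follows the DJ-graph approach:
 *  - a DFS over the DJ graph (dominator-tree edges plus join edges) records
 *    entry/exit times, so "is x an ancestor of y in the spanning tree" reduces to
 *    comparing those times;
 *  - blocks are processed from the deepest dominator-tree level upwards;
 *  - a join edge pred->b where b dominates pred is a back-join edge: b becomes a
 *    loop header and the loop body is collected by walking predecessors back to b;
 *  - a join edge coming from a spanning-tree descendant that b does not dominate
 *    marks an irreducible region (IR_BB_IRREDUCIBLE_LOOP/IR_IRREDUCIBLE_CFG).
 */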
876 int ir_find_loops(ir_ctx *ctx)
877 {
878 	uint32_t i, j, n, count;
879 	uint32_t *entry_times, *exit_times, *sorted_blocks, time = 1;
880 	ir_block *blocks = ctx->cfg_blocks;
881 	uint32_t *edges = ctx->cfg_edges;
882 	ir_worklist work;
883 
884 	if (ctx->flags2 & IR_NO_LOOPS) {
885 		return 1;
886 	}
887 
888 	/* We don't materialize the DJ spanning tree explicitly, as we are only interested in ancestor
889 	 * queries. These are implemented by checking entry/exit times of the DFS search. */
890 	ir_worklist_init(&work, ctx->cfg_blocks_count + 1);
891 	entry_times = ir_mem_malloc((ctx->cfg_blocks_count + 1) * 3 * sizeof(uint32_t));
892 	exit_times = entry_times + ctx->cfg_blocks_count + 1;
893 	sorted_blocks = exit_times + ctx->cfg_blocks_count + 1;
894 
895 	memset(entry_times, 0, (ctx->cfg_blocks_count + 1) * sizeof(uint32_t));
896 
897 	ir_worklist_push(&work, 1);
898 	while (ir_worklist_len(&work)) {
899 		ir_block *bb;
900 		int child;
901 
902 next:
903 		i = ir_worklist_peek(&work);
904 		if (!entry_times[i]) {
905 			entry_times[i] = time++;
906 		}
907 
908 		/* Visit blocks immediately dominated by i. */
909 		bb = &blocks[i];
910 		for (child = bb->dom_child; child > 0; child = blocks[child].dom_next_child) {
911 			if (ir_worklist_push(&work, child)) {
912 				goto next;
913 			}
914 		}
915 
916 		/* Visit join edges. */
917 		if (bb->successors_count) {
918 			uint32_t *p = edges + bb->successors;
919 			for (j = 0; j < bb->successors_count; j++,p++) {
920 				uint32_t succ = *p;
921 
922 				if (blocks[succ].idom == i) {
923 					continue;
924 				} else if (ir_worklist_push(&work, succ)) {
925 					goto next;
926 				}
927 			}
928 		}
929 		exit_times[i] = time++;
930 		ir_worklist_pop(&work);
931 	}
932 
933 	/* Sort blocks by level, which is the opposite of the order in which we want to process them */
934 	sorted_blocks[1] = 1;
935 	j = 1;
936 	n = 2;
937 	while (j != n) {
938 		i = j;
939 		j = n;
940 		for (; i < j; i++) {
941 			int child;
942 			for (child = blocks[sorted_blocks[i]].dom_child; child > 0; child = blocks[child].dom_next_child) {
943 				sorted_blocks[n++] = child;
944 			}
945 		}
946 	}
947 	count = n;
948 
949 	/* Identify loops. See Sreedhar et al, "Identifying Loops Using DJ Graphs". */
950 	while (n > 1) {
951 		i = sorted_blocks[--n];
952 		ir_block *bb = &blocks[i];
953 
954 		if (bb->predecessors_count > 1) {
955 			bool irreducible = 0;
956 			uint32_t *p = &edges[bb->predecessors];
957 
958 			j = bb->predecessors_count;
959 			do {
960 				uint32_t pred = *p;
961 
962 				/* A join edge is one for which the predecessor does not
963 				   immediately dominate the successor.  */
964 				if (bb->idom != pred) {
965 					/* In a loop back-edge (back-join edge), the successor dominates
966 					   the predecessor.  */
967 					if (ir_dominates(blocks, i, pred)) {
968 						if (!ir_worklist_len(&work)) {
969 							ir_bitset_clear(work.visited, ir_bitset_len(ir_worklist_capasity(&work)));
970 						}
971 						blocks[pred].loop_header = 0; /* support for merged loops */
972 						ir_worklist_push(&work, pred);
973 					} else {
974 						/* Otherwise it's a cross-join edge.  See if it's a branch
975 						   to an ancestor on the DJ spanning tree.  */
976 						if (entry_times[pred] > entry_times[i] && exit_times[pred] < exit_times[i]) {
977 							irreducible = 1;
978 						}
979 					}
980 				}
981 				p++;
982 			} while (--j);
983 
984 			if (UNEXPECTED(irreducible)) {
985 				// TODO: Support for irreducible loops ???
986 				bb->flags |= IR_BB_IRREDUCIBLE_LOOP;
987 				ctx->flags2 |= IR_IRREDUCIBLE_CFG;
988 				while (ir_worklist_len(&work)) {
989 					ir_worklist_pop(&work);
990 				}
991 			} else if (ir_worklist_len(&work)) {
992 				bb->flags |= IR_BB_LOOP_HEADER;
993 				ctx->flags2 |= IR_CFG_HAS_LOOPS;
994 				bb->loop_depth = 1;
995 				while (ir_worklist_len(&work)) {
996 					j = ir_worklist_pop(&work);
997 					while (blocks[j].loop_header > 0) {
998 						j = blocks[j].loop_header;
999 					}
1000 					if (j != i) {
1001 						ir_block *bb = &blocks[j];
1002 						if (bb->idom == 0 && j != 1) {
1003 							/* Ignore blocks that are unreachable or only abnormally reachable. */
1004 							continue;
1005 						}
1006 						bb->loop_header = i;
1007 						if (bb->predecessors_count) {
1008 							uint32_t *p = &edges[bb->predecessors];
1009 							j = bb->predecessors_count;
1010 							do {
1011 								ir_worklist_push(&work, *p);
1012 								p++;
1013 							} while (--j);
1014 						}
1015 					}
1016 				}
1017 			}
1018 		}
1019 	}
1020 
1021 	if (ctx->flags2 & IR_CFG_HAS_LOOPS) {
1022 		for (n = 1; n < count; n++) {
1023 			i = sorted_blocks[n];
1024 			ir_block *bb = &blocks[i];
1025 			if (bb->loop_header > 0) {
1026 				ir_block *loop = &blocks[bb->loop_header];
1027 				uint32_t loop_depth = loop->loop_depth;
1028 
1029 				if (bb->flags & IR_BB_LOOP_HEADER) {
1030 					loop_depth++;
1031 				}
1032 				bb->loop_depth = loop_depth;
1033 				if (bb->flags & (IR_BB_ENTRY|IR_BB_LOOP_WITH_ENTRY)) {
1034 					loop->flags |= IR_BB_LOOP_WITH_ENTRY;
1035 				}
1036 			}
1037 		}
1038 	}
1039 
1040 	ir_mem_free(entry_times);
1041 	ir_worklist_free(&work);
1042 
1043 	return 1;
1044 }
1045 
1046 static uint32_t _ir_skip_empty_blocks(const ir_ctx *ctx, uint32_t b)
1047 {
1048 	while (1) {
1049 		ir_block *bb = &ctx->cfg_blocks[b];
1050 
1051 		if ((bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) == IR_BB_EMPTY) {
1052 			IR_ASSERT(bb->successors_count == 1);
1053 			b = ctx->cfg_edges[bb->successors];
1054 		} else {
1055 			return b;
1056 		}
1057 	}
1058 }
1059 
1060 /* A variation of the "Bottom-up Positioning" algorithm described by
1061  * Karl Pettis and Robert C. Hansen in "Profile Guided Code Positioning"
1062  */
1063 typedef struct _ir_edge_info {
1064 	uint32_t from;
1065 	uint32_t to;
1066 	float    freq;
1067 } ir_edge_info;
1068 
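/*
 * Chain representation used by the bottom-up scheduler: every block starts as a
 * single-element chain.  "head" names the chain a block belongs to (head == b for
 * the first block of a chain, head == 0 once the block has been moved to the end
 * of the schedule); ir_chain_head() resolves stale heads with path compression.
 * "next"/"prev" link the blocks of a chain into a circular list, and the head
 * element reuses "prev" to store the chain's tail.
 */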
1069 typedef struct _ir_chain {
1070 	uint32_t head;
1071 	uint32_t next;
1072 	union {
1073 		uint32_t prev;
1074 		uint32_t tail;
1075 	};
1076 } ir_chain;
1077 
1078 static int ir_edge_info_cmp(const void *b1, const void *b2)
1079 {
1080 	ir_edge_info *e1 = (ir_edge_info*)b1;
1081 	ir_edge_info *e2 = (ir_edge_info*)b2;
1082 
1083 	if (e1->freq != e2->freq) {
1084 		return e1->freq < e2->freq ? 1 : -1;
1085 	}
1086 	/* In case of equal frequencies, try to avoid penalization of one of the "equal" paths by
1087 	 * preferring the first RPO successor (in conditional branches) and the last RPO predecessor
1088 	 * (in merge points).
1089 	 *
1090 	 * See "Static Basic Block Reordering Heuristics for Implicit Control Flow in Baseline JITs"
1091 	 * Polito Guillermo, Ducasse Stephane, and Tesone Pablo (2021)
1092 	 */
1093 	if (e1->from != e2->from) {
1094 		return e2->from - e1->from;
1095 	} else {
1096 		return e1->to - e2->to;
1097 	}
1098 }
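/* With the comparator above qsort() produces edges in decreasing frequency order;
 * among edges of equal frequency the one with the larger "from" block sorts first
 * and, for the same source, the one with the smaller "to" block sorts first, which
 * implements the tie-breaking rule described in the comment. */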
1099 
1100 static IR_NEVER_INLINE uint32_t ir_chain_head_path_compress(ir_chain *chains, uint32_t src, uint32_t head)
1101 {
1102 	IR_ASSERT(head != 0);
1103 	do {
1104 		head = chains[head].head;
1105 	} while (chains[head].head != head);
1106 	do {
1107 		uint32_t next = chains[src].head;
1108 		chains[src].head = head;
1109 		src = next;
1110 	} while (chains[src].head != head);
1111 	return head;
1112 }
1113 
1114 IR_ALWAYS_INLINE uint32_t ir_chain_head(ir_chain *chains, uint32_t src)
1115 {
1116 	uint32_t head = chains[src].head;
1117 	if (chains[head].head == head) {
1118 		return head;
1119 	} else {
1120 		return ir_chain_head_path_compress(chains, src, head);
1121 	}
1122 }
1123 
1124 static void ir_join_chains(ir_chain *chains, uint32_t src, uint32_t dst)
1125 {
1126 	uint32_t dst_tail = chains[dst].tail;
1127 	uint32_t src_tail = chains[src].tail;
1128 
1129 	chains[dst_tail].next = src;
1130 	chains[dst].prev = src_tail;
1131 	chains[src_tail].next = dst;
1132 	chains[src].tail = dst_tail;
1133 	chains[dst].head = src;
1134 }
1135 
1136 static void ir_insert_chain_before(ir_chain *chains, uint32_t c, uint32_t before)
1137 {
1138 	ir_chain *this = &chains[c];
1139 	ir_chain *next = &chains[before];
1140 
1141 	IR_ASSERT(chains[c].head == c);
1142 	if (chains[before].head != before) {
1143 		this->head = next->head;
1144 	} else {
1145 		next->head = c;
1146 	}
1147 	this->next = before;
1148 	this->prev = next->prev;
1149 	next->prev = c;
1150 	chains[this->prev].next = c;
1151 }
1152 
1153 #ifndef IR_DEBUG_BB_SCHEDULE_GRAPH
1154 # ifdef IR_DEBUG
1155 #  define IR_DEBUG_BB_SCHEDULE_GRAPH 1
1156 # else
1157 #  define IR_DEBUG_BB_SCHEDULE_GRAPH 0
1158 # endif
1159 #endif
1160 
1161 #if IR_DEBUG_BB_SCHEDULE_GRAPH
1162 static void ir_dump_cfg_freq_graph(ir_ctx *ctx, float *bb_freq, uint32_t edges_count, ir_edge_info *edges, ir_chain *chains)
1163 {
1164 	uint32_t b, i;
1165 	ir_block *bb;
1166 	uint8_t c, *colors;
1167 	bool is_head, is_empty;
1168 	uint32_t max_colors = 12;
1169 
1170 	colors = alloca(sizeof(uint8_t) * (ctx->cfg_blocks_count + 1));
1171 	memset(colors, 0, sizeof(uint8_t) * (ctx->cfg_blocks_count + 1));
1172 	i = 0;
1173 	for (b = 1; b < ctx->cfg_blocks_count + 1; b++) {
1174 		if (chains[b].head == b) {
1175 			colors[b] = (i % max_colors) + 1;
1176 			i++;
1177 		}
1178 	}
1179 
1180 	fprintf(stderr, "digraph {\n");
1181 	fprintf(stderr, "\trankdir=TB;\n");
1182 	for (b = 1; b <= ctx->cfg_blocks_count; b++) {
1183 		bb = &ctx->cfg_blocks[b];
1184 		c = (chains[b].head) ? colors[ir_chain_head(chains, b)] : 0;
1185 		is_head = chains[b].head == b;
1186 		is_empty = (bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) == IR_BB_EMPTY;
1187 		if (c) {
1188 			fprintf(stderr, "\tBB%d [label=\"BB%d: (%d),%0.3f\\n%s\\n%s\",colorscheme=set312,fillcolor=%d%s%s]\n", b, b,
1189 				bb->loop_depth, bb_freq[b],
1190 				ir_op_name[ctx->ir_base[bb->start].op], ir_op_name[ctx->ir_base[bb->end].op],
1191 				c,
1192 				is_head ? ",penwidth=3" : "",
1193 				is_empty ? ",style=\"dotted,filled\"" : ",style=\"filled\"");
1194 		} else {
1195 			fprintf(stderr, "\tBB%d [label=\"BB%d: (%d),%0.3f\\n%s\\n%s\"%s%s]\n", b, b,
1196 				bb->loop_depth, bb_freq[b],
1197 				ir_op_name[ctx->ir_base[bb->start].op], ir_op_name[ctx->ir_base[bb->end].op],
1198 				is_head ? ",penwidth=3" : "",
1199 				is_empty ? ",style=\"dotted\"" : "");
1200 		}
1201 	}
1202 	fprintf(stderr, "\n");
1203 
1204 	for (i = 0; i < edges_count; i++) {
1205 		fprintf(stderr, "\tBB%d -> BB%d [label=\"%0.3f\"]\n", edges[i].from, edges[i].to, edges[i].freq);
1206 	}
1207 	fprintf(stderr, "}\n");
1208 }
1209 #endif
1210 
1211 #ifdef IR_DEBUG
1212 static void ir_dump_edges(ir_ctx *ctx, uint32_t edges_count, ir_edge_info *edges)
1213 {
1214 	uint32_t i;
1215 
1216 	fprintf(stderr, "Edges:\n");
1217 	for (i = 0; i < edges_count; i++) {
1218 		fprintf(stderr, "\tBB%d -> BB%d %0.3f\n", edges[i].from, edges[i].to, edges[i].freq);
1219 	}
1220 }
1221 
1222 static void ir_dump_chains(ir_ctx *ctx, ir_chain *chains)
1223 {
1224 	uint32_t b, tail, i;
1225 
1226 	fprintf(stderr, "Chains:\n");
1227 	for (b = 1; b < ctx->cfg_blocks_count + 1; b++) {
1228 		if (chains[b].head == b) {
1229 			tail = chains[b].tail;
1230 			i = b;
1231 			fprintf(stderr, "(BB%d", i);
1232 			while (i != tail) {
1233 				i = chains[i].next;
1234 				fprintf(stderr, ", BB%d", i);
1235 			}
1236 			fprintf(stderr, ")\n");
1237 		}
1238 	}
1239 }
1240 #endif
1241 
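/*
 * Bottom-up scheduling, in outline:
 *  - estimate block and edge frequencies starting from bb_freq[1] = 1.0 (loop
 *    headers are boosted by a factor of 10; IF/SWITCH edge probabilities come from
 *    the op2/op3 hints of the IF_TRUE/IF_FALSE/CASE_* successors, with loop-depth
 *    and empty-block heuristics as a fallback);
 *  - sort the edges by decreasing frequency and greedily join the chains of the
 *    source and destination blocks, so the hottest fall-through paths stay adjacent
 *    (e.g. for an IF with a 90/10 split the 90% successor is chained right after
 *    its predecessor);
 *  - empty and provably cold blocks are pushed to the end of the schedule, empty
 *    ENTRY blocks are re-inserted in front of their real successors, and the
 *    resulting chains are concatenated into ctx->cfg_schedule.
 */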
1242 static int ir_schedule_blocks_bottom_up(ir_ctx *ctx)
1243 {
1244 	uint32_t max_edges_count = ctx->cfg_edges_count / 2;
1245 	uint32_t edges_count = 0;
1246 	uint32_t b, i, loop_depth;
1247 	float *bb_freq, freq;
1248 	ir_block *bb;
1249 	ir_edge_info *edges, *e;
1250 	ir_chain *chains;
1251 	ir_bitqueue worklist;
1252 	ir_bitset visited;
1253 	uint32_t *schedule_end, count;
1254 
1255 	ctx->cfg_schedule = ir_mem_malloc(sizeof(uint32_t) * (ctx->cfg_blocks_count + 2));
1256 	schedule_end = ctx->cfg_schedule + ctx->cfg_blocks_count;
1257 
1258 	/* 1. Create initial chains for each BB */
1259 	chains = ir_mem_malloc(sizeof(ir_chain) * (ctx->cfg_blocks_count + 1));
1260 	chains[0].head = 0;
1261 	chains[0].next = 0;
1262 	chains[0].prev = 0;
1263 	for (b = 1; b <= ctx->cfg_blocks_count; b++) {
1264 		chains[b].head = b;
1265 		chains[b].next = b;
1266 		chains[b].prev = b;
1267 	}
1268 
1269 	/* 2. Collect information about BBs and EDGEs frequencies */
1270 	edges = ir_mem_malloc(sizeof(ir_edge_info) * max_edges_count);
1271 	bb_freq = ir_mem_calloc(ctx->cfg_blocks_count + 1, sizeof(float));
1272 	bb_freq[1] = 1.0f;
1273 	visited = ir_bitset_malloc(ctx->cfg_blocks_count + 1);
1274 	ir_bitqueue_init(&worklist, ctx->cfg_blocks_count + 1);
1275 	ir_bitqueue_add(&worklist, 1);
1276 	while ((b = ir_bitqueue_pop(&worklist)) != (uint32_t)-1) {
1277 restart:
1278 		bb = &ctx->cfg_blocks[b];
1279 		if (bb->predecessors_count) {
1280 			uint32_t n = bb->predecessors_count;
1281 			uint32_t *p = ctx->cfg_edges + bb->predecessors;
1282 			for (; n > 0; p++, n--) {
1283 				uint32_t predecessor = *p;
1284 				/* Basic blocks are ordered so that a predecessor's id is usually less than its successor's id.
1285 				 * So we may compare block ids (predecessor < b) instead of the more expensive back-edge check
1286 				 * (b != predecessor && ctx->cfg_blocks[predecessor].loop_header != b)
1287 				 */
1288 				if (predecessor < b) {
1289 					if (!ir_bitset_in(visited, predecessor)) {
1290 						b = predecessor;
1291 						ir_bitqueue_del(&worklist, b);
1292 						goto restart;
1293 					}
1294 				} else if (b != predecessor && ctx->cfg_blocks[predecessor].loop_header != b) {
1295 					ir_dump_cfg(ctx, stderr);
1296 					IR_ASSERT(b == predecessor || ctx->cfg_blocks[predecessor].loop_header == b);
1297 				}
1298 			}
1299 		}
1300 
1301 		ir_bitset_incl(visited, b);
1302 
1303 		if ((bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) == IR_BB_EMPTY) {
1304 			uint32_t successor = ctx->cfg_edges[bb->successors];
1305 
1306 			/* move empty blocks to the end */
1307 			IR_ASSERT(chains[b].head == b);
1308 			chains[b].head = 0;
1309 			*schedule_end = b;
1310 			schedule_end--;
1311 
1312 			if (successor > b) {
1313 				bb_freq[successor] += bb_freq[b];
1314 				b = successor;
1315 				ir_bitqueue_del(&worklist, b);
1316 				goto restart;
1317 			} else {
1318 				continue;
1319 			}
1320 		}
1321 
1322 		loop_depth = bb->loop_depth;
1323 		if (bb->flags & IR_BB_LOOP_HEADER) {
1324 			// TODO: Estimate the loop iterations count
1325 			bb_freq[b] *= 10;
1326 		}
1327 
1328 		if (bb->successors_count) {
1329 			uint32_t n = bb->successors_count;
1330 			uint32_t *p = ctx->cfg_edges + bb->successors;
1331 
1332 			if (n == 1) {
1333 				uint32_t successor = *p;
1334 
1335 				IR_ASSERT(edges_count < max_edges_count);
1336 				freq = bb_freq[b];
1337 				if (successor > b) {
1338 					IR_ASSERT(!ir_bitset_in(visited, successor));
1339 					bb_freq[successor] += freq;
1340 					ir_bitqueue_add(&worklist, successor);
1341 				}
1342 				successor = _ir_skip_empty_blocks(ctx, successor);
1343 				edges[edges_count].from = b;
1344 				edges[edges_count].to = successor;
1345 				edges[edges_count].freq = freq;
1346 				edges_count++;
1347 			} else if (n == 2 && ctx->ir_base[bb->end].op == IR_IF) {
1348 				uint32_t successor1 = *p;
1349 				ir_block *successor1_bb = &ctx->cfg_blocks[successor1];
1350 				ir_insn *insn1 = &ctx->ir_base[successor1_bb->start];
1351 				uint32_t successor2 = *(p + 1);
1352 				ir_block *successor2_bb = &ctx->cfg_blocks[successor2];
1353 				ir_insn *insn2 = &ctx->ir_base[successor2_bb->start];
1354 				int prob1, prob2, probN = 100;
1355 
1356 				if (insn1->op2) {
1357 					prob1 = insn1->op2;
1358 					if (insn2->op2) {
1359 						prob2 = insn2->op2;
1360 						probN = prob1 + prob2;
1361 					} else {
1362 						if (prob1 > 100) {
1363 							prob1 = 100;
1364 						}
1365 						prob2 = 100 - prob1;
1366 					}
1367 
1368 				} else if (insn2->op2) {
1369 					prob2 = insn2->op2;
1370 					if (prob2 > 100) {
1371 						prob2 = 100;
1372 					}
1373 					prob1 = 100 - prob2;
1374 				} else if (successor1_bb->loop_depth >= loop_depth
1375 						&& successor2_bb->loop_depth < loop_depth) {
1376 					prob1 = 90;
1377 					prob2 = 10;
1378 				} else if (successor1_bb->loop_depth < loop_depth
1379 						&& successor2_bb->loop_depth >= loop_depth) {
1380 					prob1 = 10;
1381 					prob2 = 90;
1382 				} else if (successor2_bb->flags & IR_BB_EMPTY) {
1383 					prob1 = 51;
1384 					prob2 = 49;
1385 				} else if (successor1_bb->flags & IR_BB_EMPTY) {
1386 					prob1 = 49;
1387 					prob2 = 51;
1388 				} else {
1389 					prob1 = prob2 = 50;
1390 				}
1391 				do {
1392 					freq = bb_freq[b] * (float)prob1 / (float)probN;
1393 					if (successor1 > b) {
1394 						IR_ASSERT(!ir_bitset_in(visited, successor1));
1395 						bb_freq[successor1] += freq;
1396 						if (successor1_bb->successors_count == 0 && insn1->op2 == 1) {
1397 							/* move cold block without successors to the end */
1398 							IR_ASSERT(chains[successor1].head == successor1);
1399 							chains[successor1].head = 0;
1400 							*schedule_end = successor1;
1401 							schedule_end--;
1402 							break;
1403 						} else {
1404 							ir_bitqueue_add(&worklist, successor1);
1405 						}
1406 					}
1407 					/* try to join edges early to reduce number of edges and the cost of their sorting */
1408 					if (prob1 > prob2
1409 					 && (successor1_bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) != IR_BB_EMPTY) {
1410 						uint32_t src = chains[b].next;
1411 						IR_ASSERT(chains[src].head == src);
1412 						IR_ASSERT(src == ir_chain_head(chains, b));
1413 						IR_ASSERT(chains[successor1].head == successor1);
1414 						ir_join_chains(chains, src, successor1);
1415 						if (!IR_DEBUG_BB_SCHEDULE_GRAPH) break;
1416 					}
1417 					successor1 = _ir_skip_empty_blocks(ctx, successor1);
1418 					IR_ASSERT(edges_count < max_edges_count);
1419 					edges[edges_count].from = b;
1420 					edges[edges_count].to = successor1;
1421 					edges[edges_count].freq = freq;
1422 					edges_count++;
1423 				} while (0);
1424 				do {
1425 					freq = bb_freq[b] * (float)prob2 / (float)probN;
1426 					if (successor2 > b) {
1427 						IR_ASSERT(!ir_bitset_in(visited, successor2));
1428 						bb_freq[successor2] += freq;
1429 						if (successor2_bb->successors_count == 0 && insn2->op2 == 1) {
1430 							/* move cold block without successors to the end */
1431 							IR_ASSERT(chains[successor2].head == successor2);
1432 							chains[successor2].head = 0;
1433 							*schedule_end = successor2;
1434 							schedule_end--;
1435 							break;
1436 						} else {
1437 							ir_bitqueue_add(&worklist, successor2);
1438 						}
1439 					}
1440 					if (prob2 > prob1
1441 					 && (successor2_bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) != IR_BB_EMPTY) {
1442 						uint32_t src = chains[b].next;
1443 						IR_ASSERT(chains[src].head == src);
1444 						IR_ASSERT(src == ir_chain_head(chains, b));
1445 						IR_ASSERT(chains[successor2].head == successor2);
1446 						ir_join_chains(chains, src, successor2);
1447 						if (!IR_DEBUG_BB_SCHEDULE_GRAPH) break;
1448 					}
1449 					successor2 = _ir_skip_empty_blocks(ctx, successor2);
1450 					IR_ASSERT(edges_count < max_edges_count);
1451 					edges[edges_count].from = b;
1452 					edges[edges_count].to = successor2;
1453 					edges[edges_count].freq = freq;
1454 					edges_count++;
1455 				} while (0);
1456 			} else {
1457 				int prob;
1458 
1459 				for (; n > 0; p++, n--) {
1460 					uint32_t successor = *p;
1461 					ir_block *successor_bb = &ctx->cfg_blocks[successor];
1462 					ir_insn *insn = &ctx->ir_base[successor_bb->start];
1463 
1464 					if (insn->op == IR_CASE_DEFAULT) {
1465 						prob = insn->op2;
1466 						if (!prob) {
1467 							prob = 100 / bb->successors_count;
1468 						}
1469 					} else if (insn->op == IR_CASE_VAL) {
1470 						prob = insn->op3;
1471 						if (!prob) {
1472 							prob = 100 / bb->successors_count;
1473 						}
1474 					} else if (insn->op == IR_ENTRY) {
1475 						if ((ctx->flags & IR_MERGE_EMPTY_ENTRIES) && (successor_bb->flags & IR_BB_EMPTY)) {
1476 							prob = 99; /* prefer empty ENTRY block to go first */
1477 						} else {
1478 							prob = 1;
1479 						}
1480 					} else {
1481 						prob = 100 / bb->successors_count;
1482 					}
1483 					freq = bb_freq[b] * (float)prob / 100.0f;
1484 					if (successor > b) {
1485 						IR_ASSERT(!ir_bitset_in(visited, successor));
1486 						bb_freq[successor] += freq;
1487 						ir_bitqueue_add(&worklist, successor);
1488 					}
1489 					successor = _ir_skip_empty_blocks(ctx, successor);
1490 					IR_ASSERT(edges_count < max_edges_count);
1491 					edges[edges_count].from = b;
1492 					edges[edges_count].to = successor;
1493 					edges[edges_count].freq = freq;
1494 					edges_count++;
1495 				}
1496 			}
1497 		}
1498 	}
1499 	ir_bitqueue_free(&worklist);
1500 	ir_mem_free(visited);
1501 
1502 	/* 3. Sort EDGEs according to their frequencies */
1503 	qsort(edges, edges_count, sizeof(ir_edge_info), ir_edge_info_cmp);
1504 
1505 #ifdef IR_DEBUG
1506 	if (ctx->flags & IR_DEBUG_BB_SCHEDULE) {
1507 		ir_dump_edges(ctx, edges_count, edges);
1508 	}
1509 #endif
1510 
1511 	/* 4. Process EDGEs in decreasing frequency order and join the connected chains */
1512 	for (e = edges, i = edges_count; i > 0; e++, i--) {
1513 		uint32_t dst = chains[e->to].head;
1514 		if (dst == e->to) {
1515 			uint32_t src = chains[e->from].next;
1516 			if (chains[src].head == src) {
1517 				IR_ASSERT(src == ir_chain_head(chains, e->from) && chains[src].tail == e->from);
1518 				if (src != dst) {
1519 					ir_join_chains(chains, src, dst);
1520 				} else if (ctx->cfg_blocks[e->from].successors_count < 2) {
1521 					/* Try to rotate the loop chain so it ends with a conditional branch */
1522 					uint32_t tail = e->from;
1523 					uint32_t prev = src;
1524 					uint32_t next = chains[prev].next;
1525 					uint32_t best = 0;
1526 
1527 					while (prev != tail) {
1528 						if (ctx->ir_base[ctx->cfg_blocks[prev].end].op == IR_IF) {
1529 							if (ctx->ir_base[ctx->cfg_blocks[prev].start].op == IR_LOOP_BEGIN
1530 							 && ctx->cfg_blocks[prev].loop_depth > 1) {
1531 								best = next;
1532 								break;
1533 							} else if (!best || bb_freq[next] < bb_freq[best]) {
1534 								/* Find the successor of IF with the least frequency */
1535 								best = next;
1536 							}
1537 						}
1538 						prev = next;
1539 						next = chains[next].next;
1540 					}
1541 					if (best) {
1542 						/* change the head of the chain */
1543 						chains[src].head = best;
1544 						chains[best].head = best;
1545 					}
1546 				}
1547 #if !IR_DEBUG_BB_SCHEDULE_GRAPH
1548 				e->from = 0; /* reset "from" so this edge is skipped when grouping chains in step 7 */
1549 #endif
1550 			}
1551 		}
1552 	}
1553 
1554 #if IR_DEBUG_BB_SCHEDULE_GRAPH
1555 	if (ctx->flags & IR_DEBUG_BB_SCHEDULE) {
1556 		ir_dump_cfg_freq_graph(ctx, bb_freq, edges_count, edges, chains);
1557 	}
1558 #endif
1559 
1560 	ir_mem_free(bb_freq);
1561 
1562 #ifdef IR_DEBUG
1563 	if (ctx->flags & IR_DEBUG_BB_SCHEDULE) {
1564 		ir_dump_chains(ctx, chains);
1565 	}
1566 #endif
1567 
1568 	/* 5. Merge empty entry blocks */
1569 	if ((ctx->flags & IR_MERGE_EMPTY_ENTRIES) && ctx->entries_count) {
1570 		for (i = 0; i < ctx->entries_count; i++) {
1571 			b = ctx->entries[i];
1572 			IR_ASSERT(ctx->cfg_blocks[b].flags & IR_BB_ENTRY);
1573 			if (b && chains[b].head == b && chains[b].tail == b) {
1574 				bb = &ctx->cfg_blocks[b];
1575 				if (bb->flags & IR_BB_EMPTY) {
1576 					uint32_t successor;
1577 
1578 					do {
1579 						IR_ASSERT(bb->successors_count == 1);
1580 						successor = ctx->cfg_edges[bb->successors];
1581 						bb = &ctx->cfg_blocks[successor];
1582 					} while ((bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) == IR_BB_EMPTY);
1583 					IR_ASSERT(chains[successor].head);
1584 					ir_insert_chain_before(chains, b, successor);
1585 				}
1586 			}
1587 		}
1588 
1589 #ifdef IR_DEBUG
1590 		if (ctx->flags & IR_DEBUG_BB_SCHEDULE) {
1591 			ir_dump_chains(ctx, chains);
1592 		}
1593 #endif
1594 	}
1595 
1596 	/* 6. Align loop headers */
1597 	for (b = 1; b <= ctx->cfg_blocks_count; b++) {
1598 		if (chains[b].head == b) {
1599 			bb = &ctx->cfg_blocks[b];
1600 			if (bb->loop_depth) {
1601 				if ((bb->flags & IR_BB_LOOP_HEADER) || ir_chain_head(chains, bb->loop_header) == b) {
1602 					bb->flags |= IR_BB_ALIGN_LOOP;
1603 				}
1604 			}
1605 		}
1606 	}
1607 
1608 	/* 7. Group chains according to the most frequent edge between them */
1609 	// TODO: Try to find a better heuristic
1610 	for (e = edges, i = edges_count; i > 0; e++, i--) {
1611 #if !IR_DEBUG_BB_SCHEDULE_GRAPH
1612 		if (!e->from) continue;
1613 #endif
1614 		uint32_t src = ir_chain_head(chains, e->from);
1615 		uint32_t dst = ir_chain_head(chains, e->to);
1616 		if (src != dst) {
1617 			if (dst == 1) {
1618 				ir_join_chains(chains, dst, src);
1619 			} else {
1620 				ir_join_chains(chains, src, dst);
1621 			}
1622 		}
1623 	}
1624 
1625 #ifdef IR_DEBUG
1626 	if (ctx->flags & IR_DEBUG_BB_SCHEDULE) {
1627 		ir_dump_chains(ctx, chains);
1628 	}
1629 #endif
1630 
1631 	/* 8. Form the final BB order */
1632 	count = 0;
1633 	for (b = 1; b <= ctx->cfg_blocks_count; b++) {
1634 		if (chains[b].head == b) {
1635 			uint32_t tail = chains[b].tail;
1636 			uint32_t i = b;
1637 			while (1) {
1638 				count++;
1639 				ctx->cfg_schedule[count] = i;
1640 				if (i == tail) break;
1641 				i = chains[i].next;
1642 			}
1643 		}
1644 	}
1645 
1646 	IR_ASSERT(ctx->cfg_schedule + count == schedule_end);
1647 	ctx->cfg_schedule[ctx->cfg_blocks_count + 1] = 0;
1648 
1649 	ir_mem_free(edges);
1650 	ir_mem_free(chains);
1651 
1652 	return 1;
1653 }
1654 
1655 /* A variation of the "Top-down Positioning" algorithm described by
1656  * Karl Pettis and Robert C. Hansen in "Profile Guided Code Positioning"
1657  */
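/*
 * Blocks are emitted as traces: pop the lowest-numbered unscheduled block, then
 * greedily follow its most promising unscheduled successor (deeper loop nesting
 * and explicit probability hints win), deferring empty blocks to the end of the
 * schedule, until no suitable successor remains and a new trace is started.
 */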
1658 static int ir_schedule_blocks_top_down(ir_ctx *ctx)
1659 {
1660 	ir_bitqueue blocks;
1661 	uint32_t b, best_successor, last_non_empty;
1662 	ir_block *bb, *best_successor_bb;
1663 	ir_insn *insn;
1664 	uint32_t *list, *schedule_end;
1665 	uint32_t count = 0;
1666 
1667 	ir_bitqueue_init(&blocks, ctx->cfg_blocks_count + 1);
1668 	blocks.pos = 0;
1669 	list = ir_mem_malloc(sizeof(uint32_t) * (ctx->cfg_blocks_count + 2));
1670 	list[ctx->cfg_blocks_count + 1] = 0;
1671 	schedule_end = list + ctx->cfg_blocks_count;
1672 	for (b = 1; b <= ctx->cfg_blocks_count; b++) {
1673 		ir_bitset_incl(blocks.set, b);
1674 	}
1675 
1676 	while ((b = ir_bitqueue_pop(&blocks)) != (uint32_t)-1) {
1677 		bb = &ctx->cfg_blocks[b];
1678 		/* Start trace */
1679 		last_non_empty = 0;
1680 		do {
1681 			if (UNEXPECTED(bb->flags & IR_BB_PREV_EMPTY_ENTRY) && ir_bitqueue_in(&blocks, b - 1)) {
1682 				/* Schedule the previous empty ENTRY block before this one */
1683 				uint32_t predecessor = b - 1;
1684 
1685 				ir_bitqueue_del(&blocks, predecessor);
1686 				count++;
1687 				list[count] = predecessor;
1688 			}
1689 			if ((bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) == IR_BB_EMPTY) {
1690 				/* move empty blocks to the end */
1691 				*schedule_end = b;
1692 				schedule_end--;
1693 			} else {
1694 				count++;
1695 				list[count] = b;
1696 				last_non_empty = b;
1697 			}
1698 			best_successor_bb = NULL;
1699 			if (bb->successors_count == 1) {
1700 				best_successor = ctx->cfg_edges[bb->successors];
1701 				if (ir_bitqueue_in(&blocks, best_successor)) {
1702 					best_successor_bb = &ctx->cfg_blocks[best_successor];
1703 				}
1704 			} else if (bb->successors_count > 1) {
1705 				uint32_t prob, best_successor_prob;
1706 				uint32_t *p, successor;
1707 				ir_block *successor_bb;
1708 
1709 				for (b = 0, p = &ctx->cfg_edges[bb->successors]; b < bb->successors_count; b++, p++) {
1710 					successor = *p;
1711 					if (ir_bitqueue_in(&blocks, successor)) {
1712 						successor_bb = &ctx->cfg_blocks[successor];
1713 						insn = &ctx->ir_base[successor_bb->start];
1714 						if (insn->op == IR_IF_TRUE || insn->op == IR_IF_FALSE) {
1715 							prob = insn->op2;
1716 							if (!prob) {
1717 								prob = 100 / bb->successors_count;
1718 								if (!(successor_bb->flags & IR_BB_EMPTY)) {
1719 									prob++;
1720 								}
1721 							}
1722 						} else if (insn->op == IR_CASE_DEFAULT) {
1723 							prob = insn->op2;
1724 							if (!prob) {
1725 								prob = 100 / bb->successors_count;
1726 							}
1727 						} else if (insn->op == IR_CASE_VAL) {
1728 							prob = insn->op3;
1729 							if (!prob) {
1730 								prob = 100 / bb->successors_count;
1731 							}
1732 						} else if (insn->op == IR_ENTRY) {
1733 							if ((ctx->flags & IR_MERGE_EMPTY_ENTRIES) && (successor_bb->flags & IR_BB_EMPTY)) {
1734 								prob = 99; /* prefer empty ENTRY block to go first */
1735 							} else {
1736 								prob = 1;
1737 							}
1738 						} else {
1739 							prob = 100 / bb->successors_count;
1740 						}
1741 						if (!best_successor_bb
1742 						 || successor_bb->loop_depth > best_successor_bb->loop_depth
1743 						 || prob > best_successor_prob) {
1744 							best_successor = successor;
1745 							best_successor_bb = successor_bb;
1746 							best_successor_prob = prob;
1747 						}
1748 					}
1749 				}
1750 			}
1751 			if (!best_successor_bb) {
1752 				/* Try to continue trace using the other successor of the last IF */
1753 				if ((bb->flags & IR_BB_EMPTY) && last_non_empty) {
1754 					bb = &ctx->cfg_blocks[last_non_empty];
1755 					if (bb->successors_count == 2 && ctx->ir_base[bb->end].op == IR_IF) {
1756 						b = ctx->cfg_edges[bb->successors];
1757 
1758 						if (!ir_bitqueue_in(&blocks, b)) {
1759 							b = ctx->cfg_edges[bb->successors + 1];
1760 						}
1761 						if (ir_bitqueue_in(&blocks, b)) {
1762 							bb = &ctx->cfg_blocks[b];
1763 							ir_bitqueue_del(&blocks, b);
1764 							continue;
1765 						}
1766 					}
1767 				}
1768 				/* End trace */
1769 				break;
1770 			}
1771 			b = best_successor;
1772 			bb = best_successor_bb;
1773 			ir_bitqueue_del(&blocks, b);
1774 		} while (1);
1775 	}
1776 
1777 	IR_ASSERT(list + count == schedule_end);
1778 	ctx->cfg_schedule = list;
1779 	ir_bitqueue_free(&blocks);
1780 
1781 	return 1;
1782 }
1783 
1784 int ir_schedule_blocks(ir_ctx *ctx)
1785 {
1786 	ir_ref ref;
1787 
1788 	if (ctx->cfg_blocks_count <= 2) {
1789 		return 1;
1790 	}
1791 
1792 	/* Mark "stop" blocks terminated with UNREACHABLE as "unexpected" */
1793 	ref = ctx->ir_base[1].op1;
1794 	while (ref) {
1795 		ir_insn *insn = &ctx->ir_base[ref];
1796 
1797 		if (insn->op == IR_UNREACHABLE && ctx->ir_base[insn->op1].op != IR_TAILCALL) {
1798 			ir_block *bb = &ctx->cfg_blocks[ctx->cfg_map[ref]];
1799 			uint32_t n = bb->predecessors_count;
1800 
1801 			if (n == 1) {
1802 				ir_insn *start_insn = &ctx->ir_base[bb->start];
1803 				if (start_insn->op == IR_IF_TRUE
1804 				 || start_insn->op == IR_IF_FALSE
1805 				 || start_insn->op == IR_CASE_DEFAULT) {
1806 					if (!start_insn->op2) start_insn->op2 = 1;
1807 				} else if (start_insn->op == IR_CASE_VAL) {
1808 					if (!start_insn->op3) start_insn->op3 = 1;
1809 				}
1810 			} else if (n > 1) {
1811 				uint32_t *p = &ctx->cfg_edges[bb->predecessors];
1812 
1813 				for (; n > 0; p++, n--) {
1814 					bb = &ctx->cfg_blocks[*p];
1815 					if (bb->predecessors_count == 1) {
1816 						ir_insn *start_insn = &ctx->ir_base[bb->start];
1817 						if (start_insn->op == IR_IF_TRUE
1818 						 || start_insn->op == IR_IF_FALSE
1819 						 || start_insn->op == IR_CASE_DEFAULT) {
1820 							if (!start_insn->op2) start_insn->op2 = 1;
1821 						} else if (start_insn->op == IR_CASE_VAL) {
1822 							if (!start_insn->op3) start_insn->op3 = 1;
1823 						}
1824 					}
1825 				}
1826 			}
1827 		}
1828 		ref = insn->op3;
1829 	}
1830 
1831 	/* The bottom-up Pettis-Hansen algorithm is expensive - O(n^3),
1832 	 * use it only for relatively small functions.
1833 	 *
1834 	 * TODO: make the choice between top-down and bottom-up algorithm configurable
1835 	 */
1836 	if (UNEXPECTED(ctx->flags2 & IR_IRREDUCIBLE_CFG) || ctx->cfg_blocks_count > 256) {
1837 		return ir_schedule_blocks_top_down(ctx);
1838 	} else {
1839 		return ir_schedule_blocks_bottom_up(ctx);
1840 	}
1841 }
1842 
1843 /* JMP target optimisation */
1844 uint32_t ir_skip_empty_target_blocks(const ir_ctx *ctx, uint32_t b)
1845 {
1846 	return _ir_skip_empty_blocks(ctx, b);
1847 }
1848 
1849 uint32_t ir_next_block(const ir_ctx *ctx, uint32_t b)
1850 {
1851 	ir_block *bb;
1852 
1853 	if (ctx->cfg_schedule) {
1854 		uint32_t ret = ctx->cfg_schedule[++b];
1855 
1856 		/* Check for empty ENTRY block */
1857 		while (ret && ((ctx->cfg_blocks[ret].flags & (IR_BB_START|IR_BB_EMPTY)) == IR_BB_EMPTY)) {
1858 			ret = ctx->cfg_schedule[++b];
1859 		}
1860 		return ret;
1861 	}
1862 
1863 	b++;
1864 	while (1) {
1865 		if (b > ctx->cfg_blocks_count) {
1866 			return 0;
1867 		}
1868 
1869 		bb = &ctx->cfg_blocks[b];
1870 
1871 		if ((bb->flags & (IR_BB_START|IR_BB_EMPTY)) == IR_BB_EMPTY) {
1872 			b++;
1873 		} else {
1874 			break;
1875 		}
1876 	}
1877 	return b;
1878 }
1879 
1880 void ir_get_true_false_blocks(const ir_ctx *ctx, uint32_t b, uint32_t *true_block, uint32_t *false_block)
1881 {
1882 	ir_block *bb;
1883 	uint32_t *p, use_block;
1884 
1885 	*true_block = 0;
1886 	*false_block = 0;
1887 	bb = &ctx->cfg_blocks[b];
1888 	IR_ASSERT(ctx->ir_base[bb->end].op == IR_IF);
1889 	IR_ASSERT(bb->successors_count == 2);
1890 	p = &ctx->cfg_edges[bb->successors];
1891 	use_block = *p;
1892 	if (ctx->ir_base[ctx->cfg_blocks[use_block].start].op == IR_IF_TRUE) {
1893 		*true_block = ir_skip_empty_target_blocks(ctx, use_block);
1894 		use_block = *(p+1);
1895 		IR_ASSERT(ctx->ir_base[ctx->cfg_blocks[use_block].start].op == IR_IF_FALSE);
1896 		*false_block = ir_skip_empty_target_blocks(ctx, use_block);
1897 	} else {
1898 		IR_ASSERT(ctx->ir_base[ctx->cfg_blocks[use_block].start].op == IR_IF_FALSE);
1899 		*false_block = ir_skip_empty_target_blocks(ctx, use_block);
1900 		use_block = *(p+1);
1901 		IR_ASSERT(ctx->ir_base[ctx->cfg_blocks[use_block].start].op == IR_IF_TRUE);
1902 		*true_block = ir_skip_empty_target_blocks(ctx, use_block);
1903 	}
1904 	IR_ASSERT(*true_block && *false_block);
1905 }
1906