xref: /php-src/ext/opcache/jit/ir/ir_cfg.c (revision 8e4363de)
1 /*
2  * IR - Lightweight JIT Compilation Framework
3  * (CFG - Control Flow Graph)
4  * Copyright (C) 2022 Zend by Perforce.
5  * Authors: Dmitry Stogov <dmitry@php.net>
6  */
7 
8 #include "ir.h"
9 #include "ir_private.h"
10 
11 IR_ALWAYS_INLINE void _ir_add_successors(const ir_ctx *ctx, ir_ref ref, ir_worklist *worklist)
12 {
13 	ir_use_list *use_list = &ctx->use_lists[ref];
14 	ir_ref *p, use, n = use_list->count;
15 
16 	if (n < 2) {
17 		if (n == 1) {
18 			use = ctx->use_edges[use_list->refs];
19 			IR_ASSERT(ir_op_flags[ctx->ir_base[use].op] & IR_OP_FLAG_CONTROL);
20 			ir_worklist_push(worklist, use);
21 		}
22 	} else {
23 		p = &ctx->use_edges[use_list->refs];
24 		if (n == 2) {
25 			use = *p;
26 			IR_ASSERT(ir_op_flags[ctx->ir_base[use].op] & IR_OP_FLAG_CONTROL);
27 			ir_worklist_push(worklist, use);
28 			use = *(p + 1);
29 			IR_ASSERT(ir_op_flags[ctx->ir_base[use].op] & IR_OP_FLAG_CONTROL);
30 			ir_worklist_push(worklist, use);
31 		} else {
32 			for (; n > 0; p++, n--) {
33 				use = *p;
34 				IR_ASSERT(ir_op_flags[ctx->ir_base[use].op] & IR_OP_FLAG_CONTROL);
35 				ir_worklist_push(worklist, use);
36 			}
37 		}
38 	}
39 }
40 
41 IR_ALWAYS_INLINE void _ir_add_predecessors(const ir_insn *insn, ir_worklist *worklist)
42 {
43 	ir_ref n, ref;
44 	const ir_ref *p;
45 
46 	if (insn->op == IR_MERGE || insn->op == IR_LOOP_BEGIN) {
47 		n = insn->inputs_count;
48 		for (p = insn->ops + 1; n > 0; p++, n--) {
49 			ref = *p;
50 			IR_ASSERT(ref);
51 			ir_worklist_push(worklist, ref);
52 		}
53 	} else if (insn->op != IR_START) {
54 		if (EXPECTED(insn->op1)) {
55 			ir_worklist_push(worklist, insn->op1);
56 		}
57 	}
58 }
59 
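/*
 * Overview (rough sketch): ir_build_cfg() discovers basic blocks in two
 * phases.  A backward DFS first walks from the "stop" nodes (terminators
 * such as RETURN or UNREACHABLE) towards START, recording each BB end and
 * its matching BB start.  A forward DFS then picks up whatever the backward
 * walk could not reach: branches that spin in infinite loops and IF/SWITCH
 * successors remembered in "bb_leaks".  For example, in
 *
 *     START -> IF -+-> IF_TRUE  -> ... -> RETURN
 *                  +-> IF_FALSE -> LOOP_BEGIN <-> LOOP_END   (infinite loop)
 *
 * the backward phase only finds the blocks on the RETURN path; the forward
 * phase discovers the IF_FALSE/LOOP blocks starting from the leaked IF
 * successor.
 */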
60 int ir_build_cfg(ir_ctx *ctx)
61 {
62 	ir_ref n, *p, ref, start, end, next;
63 	uint32_t b;
64 	ir_insn *insn;
65 	ir_worklist worklist;
66 	uint32_t bb_init_flags;
67 	uint32_t count, bb_count = 0;
68 	uint32_t edges_count = 0;
69 	ir_block *blocks, *bb;
70 	uint32_t *_blocks, *edges;
71 	ir_use_list *use_list;
72 	uint32_t len = ir_bitset_len(ctx->insns_count);
73 	ir_bitset bb_starts = ir_mem_calloc(len * 2, IR_BITSET_BITS / 8);
74 	ir_bitset bb_leaks = bb_starts + len;
75 	_blocks = ir_mem_calloc(ctx->insns_count, sizeof(uint32_t));
76 	ir_worklist_init(&worklist, ctx->insns_count);
77 
78 	/* First, try a backward DFS starting from the "stop" nodes */
79 
80 	/* Add all "stop" nodes */
81 	ref = ctx->ir_base[1].op1;
82 	while (ref) {
83 		ir_worklist_push(&worklist, ref);
84 		ref = ctx->ir_base[ref].op3;
85 	}
86 
87 	while (ir_worklist_len(&worklist)) {
88 		ref = ir_worklist_pop(&worklist);
89 		insn = &ctx->ir_base[ref];
90 
91 		if (insn->op == IR_NOP) {
92 			continue;
93 		}
94 		IR_ASSERT(IR_IS_BB_END(insn->op));
95 		/* Remember BB end */
96 		end = ref;
97 		/* Some successors of IF and SWITCH nodes may be inaccessible by backward DFS */
98 		use_list = &ctx->use_lists[end];
99 		n = use_list->count;
100 		if (n > 1) {
101 			for (p = &ctx->use_edges[use_list->refs]; n > 0; p++, n--) {
102 				/* Remember possibly inaccessible successors */
103 				ir_bitset_incl(bb_leaks, *p);
104 			}
105 		}
106 		/* Skip control nodes until the BB start */
107 		ref = insn->op1;
108 		while (1) {
109 			insn = &ctx->ir_base[ref];
110 			if (IR_IS_BB_START(insn->op)) {
111 				break;
112 			}
113 			ref = insn->op1; // follow connected control nodes until the BB start
114 		}
115 		/* Mark BB Start */
116 		bb_count++;
117 		_blocks[ref] = end;
118 		ir_bitset_incl(bb_starts, ref);
119 		/* Add predecessors */
120 		_ir_add_predecessors(insn, &worklist);
121 	}
122 
123 	/* The backward DFS may miss branches that end in infinite loops. */
124 	/* Try a forward DFS (in most cases all nodes have already been processed). */
125 
126 	/* START node may be inaccessible from "stop" nodes */
127 	ir_bitset_incl(bb_leaks, 1);
128 
129 	/* Add the unprocessed START node and the successors of IF and SWITCH nodes */
130 	IR_BITSET_FOREACH_DIFFERENCE(bb_leaks, bb_starts, len, start) {
131 		ir_worklist_push(&worklist, start);
132 	} IR_BITSET_FOREACH_END();
133 
134 	if (ir_worklist_len(&worklist)) {
135 		ir_bitset_union(worklist.visited, bb_starts, len);
136 		do {
137 			ref = ir_worklist_pop(&worklist);
138 			insn = &ctx->ir_base[ref];
139 
140 			if (insn->op == IR_NOP) {
141 				continue;
142 			}
143 			IR_ASSERT(IR_IS_BB_START(insn->op));
144 			/* Remember BB start */
145 			start = ref;
146 			/* Skip control nodes until the BB end */
147 			while (1) {
148 				use_list = &ctx->use_lists[ref];
149 				n = use_list->count;
150 				next = IR_UNUSED;
151 				for (p = &ctx->use_edges[use_list->refs]; n > 0; p++, n--) {
152 					next = *p;
153 					insn = &ctx->ir_base[next];
154 					if ((ir_op_flags[insn->op] & IR_OP_FLAG_CONTROL) && insn->op1 == ref) {
155 						break;
156 					}
157 				}
158 				IR_ASSERT(next != IR_UNUSED);
159 				ref = next;
160 				if (IR_IS_BB_END(insn->op)) {
161 					break;
162 				}
163 			}
164 			/* Mark BB Start */
165 			bb_count++;
166 			_blocks[start] = ref;
167 			ir_bitset_incl(bb_starts, start);
168 			/* Add successors */
169 			_ir_add_successors(ctx, ref, &worklist);
170 		} while (ir_worklist_len(&worklist));
171 	}
172 
173 	IR_ASSERT(bb_count > 0);
174 
175 	/* Create the array of basic blocks and count successor/predecessor edges for each BB */
176 	blocks = ir_mem_malloc((bb_count + 1) * sizeof(ir_block));
177 	b = 1;
178 	bb = blocks + 1;
179 	count = 0;
180 	/* SCCP has already removed UNREACHABLE blocks; otherwise all blocks are initially marked as UNREACHABLE */
181 	bb_init_flags = (ctx->flags2 & IR_CFG_REACHABLE) ? 0 : IR_BB_UNREACHABLE;
182 	IR_BITSET_FOREACH(bb_starts, len, start) {
183 		insn = &ctx->ir_base[start];
184 		if (insn->op == IR_NOP) {
185 			_blocks[start] = 0;
186 			continue;
187 		}
188 		end = _blocks[start];
189 		_blocks[start] = b;
190 		_blocks[end] = b;
191 		IR_ASSERT(IR_IS_BB_START(insn->op));
192 		IR_ASSERT(end > start);
193 		bb->start = start;
194 		bb->end = end;
195 		bb->successors = count;
196 		count += ctx->use_lists[end].count;
197 		bb->successors_count = 0;
198 		bb->predecessors = count;
199 		bb->dom_parent = 0;
200 		bb->dom_depth = 0;
201 		bb->dom_child = 0;
202 		bb->dom_next_child = 0;
203 		bb->loop_header = 0;
204 		bb->loop_depth = 0;
205 		if (insn->op == IR_START) {
206 			bb->flags = IR_BB_START;
207 			bb->predecessors_count = 0;
208 		} else {
209 			bb->flags = bb_init_flags;
210 			if (insn->op == IR_MERGE || insn->op == IR_LOOP_BEGIN) {
211 				n = insn->inputs_count;
212 				bb->predecessors_count = n;
213 				edges_count += n;
214 				count += n;
215 			} else if (EXPECTED(insn->op1)) {
216 				if (insn->op == IR_ENTRY) {
217 					bb->flags |= IR_BB_ENTRY;
218 					ctx->entries_count++;
219 				}
220 				bb->predecessors_count = 1;
221 				edges_count++;
222 				count++;
223 			} else {
224 				IR_ASSERT(insn->op == IR_BEGIN); /* start of unreachable block */
225 				bb->predecessors_count = 0;
226 			}
227 		}
228 		b++;
229 		bb++;
230 	} IR_BITSET_FOREACH_END();
231 	bb_count = b - 1;
232 	IR_ASSERT(count == edges_count * 2);
233 	ir_mem_free(bb_starts);
234 
235 	/* Create the array of successor/predecessor control edges */
236 	edges = ir_mem_malloc(edges_count * 2 * sizeof(uint32_t));
237 	bb = blocks + 1;
238 	for (b = 1; b <= bb_count; b++, bb++) {
239 		insn = &ctx->ir_base[bb->start];
240 		if (bb->predecessors_count > 1) {
241 			uint32_t *q = edges + bb->predecessors;
242 			n = insn->inputs_count;
243 			for (p = insn->ops + 1; n > 0; p++, q++, n--) {
244 				ref = *p;
245 				IR_ASSERT(ref);
246 				ir_ref pred_b = _blocks[ref];
247 				ir_block *pred_bb = &blocks[pred_b];
248 				*q = pred_b;
249 				edges[pred_bb->successors + pred_bb->successors_count++] = b;
250 			}
251 		} else if (bb->predecessors_count == 1) {
252 			ref = insn->op1;
253 			IR_ASSERT(ref);
254 			IR_ASSERT(IR_OPND_KIND(ir_op_flags[insn->op], 1) == IR_OPND_CONTROL);
255 			ir_ref pred_b = _blocks[ref];
256 			ir_block *pred_bb = &blocks[pred_b];
257 			edges[bb->predecessors] = pred_b;
258 			edges[pred_bb->successors + pred_bb->successors_count++] = b;
259 		}
260 	}
261 
262 	ctx->cfg_blocks_count = bb_count;
263 	ctx->cfg_edges_count = edges_count * 2;
264 	ctx->cfg_blocks = blocks;
265 	ctx->cfg_edges = edges;
266 	ctx->cfg_map = _blocks;
267 
268 	if (!(ctx->flags2 & IR_CFG_REACHABLE)) {
269 		uint32_t reachable_count = 0;
270 
271 		/* Mark reachable blocks */
272 		ir_worklist_clear(&worklist);
273 		ir_worklist_push(&worklist, 1);
274 		while (ir_worklist_len(&worklist) != 0) {
275 			uint32_t *p;
276 
277 			reachable_count++;
278 			b = ir_worklist_pop(&worklist);
279 			bb = &blocks[b];
280 			bb->flags &= ~IR_BB_UNREACHABLE;
281 			n = bb->successors_count;
282 			if (n > 1) {
283 				for (p = edges + bb->successors; n > 0; p++, n--) {
284 					ir_worklist_push(&worklist, *p);
285 				}
286 			} else if (n == 1) {
287 				ir_worklist_push(&worklist, edges[bb->successors]);
288 			}
289 		}
290 		if (reachable_count != ctx->cfg_blocks_count) {
291 			ir_remove_unreachable_blocks(ctx);
292 		}
293 	}
294 
295 	ir_worklist_free(&worklist);
296 
297 	return 1;
298 }
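
/*
 * Typical usage (illustrative only; assumes the standard pass order and uses
 * only functions defined in this file):
 *
 *     if (!ir_build_cfg(ctx))              return 0;  // blocks and edges
 *     if (!ir_build_dominators_tree(ctx))  return 0;  // idom / dominator tree
 *     if (!ir_find_loops(ctx))             return 0;  // loop headers, depth
 *     if (!ir_schedule_blocks(ctx))        return 0;  // final block layout
 */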
299 
300 static void ir_remove_predecessor(ir_ctx *ctx, ir_block *bb, uint32_t from)
301 {
302 	uint32_t i, *p, *q, n = 0;
303 
304 	p = q = &ctx->cfg_edges[bb->predecessors];
305 	for (i = 0; i < bb->predecessors_count; i++, p++) {
306 		if (*p != from) {
307 			if (p != q) {
308 				*q = *p;
309 			}
310 			q++;
311 			n++;
312 		}
313 	}
314 	IR_ASSERT(n != bb->predecessors_count);
315 	bb->predecessors_count = n;
316 }
317 
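/*
 * Illustration: ir_remove_merge_input() drops the control input "from" of a
 * MERGE (or LOOP_BEGIN) node and compacts the inputs of every dependent PHI
 * accordingly.  When only one input remains, the MERGE degenerates into a
 * plain BEGIN and each PHI into a COPY of its surviving operand, roughly:
 *
 *     before:  m = MERGE(a, b);  x = PHI(m, v1, v2);
 *     after removing input "b":
 *              m = BEGIN(a);     x = COPY(v1);
 */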
318 static void ir_remove_merge_input(ir_ctx *ctx, ir_ref merge, ir_ref from)
319 {
320 	ir_ref i, j, n, k, *p, use;
321 	ir_insn *use_insn;
322 	ir_use_list *use_list;
323 	ir_bitset life_inputs;
324 	ir_insn *insn = &ctx->ir_base[merge];
325 
326 	IR_ASSERT(insn->op == IR_MERGE || insn->op == IR_LOOP_BEGIN);
327 	n = insn->inputs_count;
328 	i = 1;
329 	life_inputs = ir_bitset_malloc(n + 1);
330 	for (j = 1; j <= n; j++) {
331 		ir_ref input = ir_insn_op(insn, j);
332 
333 		if (input != from) {
334 			if (i != j) {
335 				ir_insn_set_op(insn, i, input);
336 			}
337 			ir_bitset_incl(life_inputs, j);
338 			i++;
339 		}
340 	}
341 	i--;
342 	if (i == 1) {
343 		insn->op = IR_BEGIN;
344 		insn->inputs_count = 1;
345 		use_list = &ctx->use_lists[merge];
346 		if (use_list->count > 1) {
347 			for (k = 0, p = &ctx->use_edges[use_list->refs]; k < use_list->count; k++, p++) {
348 				use = *p;
349 				use_insn = &ctx->ir_base[use];
350 				if (use_insn->op == IR_PHI) {
351 					/* Convert PHI to COPY */
352 					i = 2;
353 					for (j = 2; j <= n; j++) {
354 						ir_ref input = ir_insn_op(use_insn, j);
355 
356 						if (ir_bitset_in(life_inputs, j - 1)) {
357 							use_insn->op1 = ir_insn_op(use_insn, j);
358 						} else if (input > 0) {
359 							ir_use_list_remove_all(ctx, input, use);
360 						}
361 					}
362 					use_insn->op = IR_COPY;
363 					use_insn->op2 = IR_UNUSED;
364 					use_insn->op3 = IR_UNUSED;
365 					ir_use_list_remove_all(ctx, merge, use);
366 				}
367 			}
368 		}
369 	} else {
370 		insn->inputs_count = i;
371 
372 		n++;
373 		use_list = &ctx->use_lists[merge];
374 		if (use_list->count > 1) {
375 			for (k = 0, p = &ctx->use_edges[use_list->refs]; k < use_list->count; k++, p++) {
376 				use = *p;
377 				use_insn = &ctx->ir_base[use];
378 				if (use_insn->op == IR_PHI) {
379 					i = 2;
380 					for (j = 2; j <= n; j++) {
381 						ir_ref input = ir_insn_op(use_insn, j);
382 
383 						if (ir_bitset_in(life_inputs, j - 1)) {
384 							IR_ASSERT(input);
385 							if (i != j) {
386 								ir_insn_set_op(use_insn, i, input);
387 							}
388 							i++;
389 						} else if (input > 0) {
390 							ir_use_list_remove_all(ctx, input, use);
391 						}
392 					}
393 				}
394 			}
395 		}
396 	}
397 	ir_mem_free(life_inputs);
398 	ir_use_list_remove_all(ctx, from, merge);
399 }
400 
401 /* A CFG constructed after the SCCP pass has no unreachable BBs; otherwise they should be removed */
402 int ir_remove_unreachable_blocks(ir_ctx *ctx)
403 {
404 	uint32_t b, *p, i;
405 	uint32_t unreachable_count = 0;
406 	uint32_t bb_count = ctx->cfg_blocks_count;
407 	ir_block *bb = ctx->cfg_blocks + 1;
408 
409 	for (b = 1; b <= bb_count; b++, bb++) {
410 		if (bb->flags & IR_BB_UNREACHABLE) {
411 #if 0
412 			do {if (!unreachable_count) ir_dump_cfg(ctx, stderr);} while(0);
413 #endif
414 			if (bb->successors_count) {
415 				for (i = 0, p = &ctx->cfg_edges[bb->successors]; i < bb->successors_count; i++, p++) {
416 					ir_block *succ_bb = &ctx->cfg_blocks[*p];
417 
418 					if (!(succ_bb->flags & IR_BB_UNREACHABLE)) {
419 						ir_remove_predecessor(ctx, succ_bb, b);
420 						ir_remove_merge_input(ctx, succ_bb->start, bb->end);
421 					}
422 				}
423 			} else {
424 				ir_ref prev, ref = bb->end;
425 				ir_insn *insn = &ctx->ir_base[ref];
426 
427 				IR_ASSERT(ir_op_flags[insn->op] & IR_OP_FLAG_TERMINATOR);
428 				/* remove from terminators list */
429 				prev = ctx->ir_base[1].op1;
430 				if (prev == ref) {
431 					ctx->ir_base[1].op1 = insn->op3;
432 				} else {
433 					while (prev) {
434 						if (ctx->ir_base[prev].op3 == ref) {
435 							ctx->ir_base[prev].op3 = insn->op3;
436 							break;
437 						}
438 						prev = ctx->ir_base[prev].op3;
439 					}
440 				}
441 			}
442 			ctx->cfg_map[bb->start] = 0;
443 			ctx->cfg_map[bb->end] = 0;
444 			unreachable_count++;
445 		}
446 	}
447 
448 	if (unreachable_count) {
449 		ir_block *dst_bb;
450 		uint32_t n = 1;
451 		uint32_t *edges;
452 
453 		dst_bb = bb = ctx->cfg_blocks + 1;
454 		for (b = 1; b <= bb_count; b++, bb++) {
455 			if (!(bb->flags & IR_BB_UNREACHABLE)) {
456 				if (dst_bb != bb) {
457 					memcpy(dst_bb, bb, sizeof(ir_block));
458 					ctx->cfg_map[dst_bb->start] = n;
459 					ctx->cfg_map[dst_bb->end] = n;
460 				}
461 				dst_bb->successors_count = 0;
462 				dst_bb++;
463 				n++;
464 			}
465 		}
466 		ctx->cfg_blocks_count = bb_count = n - 1;
467 
468 		/* Rebuild successor/predecessor control edges */
469 		edges = ctx->cfg_edges;
470 		bb = ctx->cfg_blocks + 1;
471 		for (b = 1; b <= bb_count; b++, bb++) {
472 			ir_insn *insn = &ctx->ir_base[bb->start];
473 			ir_ref *p, ref;
474 
475 			n = bb->predecessors_count;
476 			if (n > 1) {
477 				uint32_t *q = edges + bb->predecessors;
478 
479 				IR_ASSERT(n == insn->inputs_count);
480 				for (p = insn->ops + 1; n > 0; p++, q++, n--) {
481 					ref = *p;
482 					IR_ASSERT(ref);
483 					ir_ref pred_b = ctx->cfg_map[ref];
484 					ir_block *pred_bb = &ctx->cfg_blocks[pred_b];
485 					*q = pred_b;
486 					edges[pred_bb->successors + pred_bb->successors_count++] = b;
487 				}
488 			} else if (n == 1) {
489 				ref = insn->op1;
490 				IR_ASSERT(ref);
491 				IR_ASSERT(IR_OPND_KIND(ir_op_flags[insn->op], 1) == IR_OPND_CONTROL);
492 				ir_ref pred_b = ctx->cfg_map[ref];
493 				ir_block *pred_bb = &ctx->cfg_blocks[pred_b];
494 				edges[bb->predecessors] = pred_b;
495 				edges[pred_bb->successors + pred_bb->successors_count++] = b;
496 			}
497 		}
498 	}
499 
500 	return 1;
501 }
502 
503 #if 0
504 static void compute_postnum(const ir_ctx *ctx, uint32_t *cur, uint32_t b)
505 {
506 	uint32_t i, *p;
507 	ir_block *bb = &ctx->cfg_blocks[b];
508 
509 	if (bb->postnum != 0) {
510 		return;
511 	}
512 
513 	if (bb->successors_count) {
514 		bb->postnum = -1; /* Marker for "currently visiting" */
515 		p = ctx->cfg_edges + bb->successors;
516 		i = bb->successors_count;
517 		do {
518 			compute_postnum(ctx, cur, *p);
519 			p++;
520 		} while (--i);
521 	}
522 	bb->postnum = (*cur)++;
523 }
524 
525 /* Computes dominator tree using algorithm from "A Simple, Fast Dominance Algorithm" by
526  * Cooper, Harvey and Kennedy. */
527 int ir_build_dominators_tree(ir_ctx *ctx)
528 {
529 	uint32_t blocks_count, b, postnum;
530 	ir_block *blocks, *bb;
531 	uint32_t *edges;
532 	bool changed;
533 
534 	ctx->flags2 &= ~IR_NO_LOOPS;
535 
536 	postnum = 1;
537 	compute_postnum(ctx, &postnum, 1);
538 
539 	/* Find immediate dominators */
540 	blocks = ctx->cfg_blocks;
541 	edges  = ctx->cfg_edges;
542 	blocks_count = ctx->cfg_blocks_count;
543 	blocks[1].idom = 1;
544 	do {
545 		changed = 0;
546 		/* Iterating in Reverse Post Order */
547 		for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) {
548 			IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
549 			if (bb->predecessors_count == 1) {
550 				uint32_t pred_b = edges[bb->predecessors];
551 
552 				if (blocks[pred_b].idom <= 0) {
553 					//IR_ASSERT("Wrong blocks order: BB is before its single predecessor");
554 				} else if (bb->idom != pred_b) {
555 					bb->idom = pred_b;
556 					changed = 1;
557 				}
558 			} else if (bb->predecessors_count) {
559 				uint32_t idom = 0;
560 				uint32_t k = bb->predecessors_count;
561 				uint32_t *p = edges + bb->predecessors;
562 
563 				do {
564 					uint32_t pred_b = *p;
565 					ir_block *pred_bb = &blocks[pred_b];
566 					ir_block *idom_bb;
567 
568 					if (pred_bb->idom > 0) {
569 						idom = pred_b;
570 						idom_bb = &blocks[idom];
571 
572 						while (--k > 0) {
573 							pred_b = *(++p);
574 							pred_bb = &blocks[pred_b];
575 							if (pred_bb->idom > 0) {
576 								while (idom != pred_b) {
577 									while (pred_bb->postnum < idom_bb->postnum) {
578 										pred_b = pred_bb->idom;
579 										pred_bb = &blocks[pred_b];
580 									}
581 									while (idom_bb->postnum < pred_bb->postnum) {
582 										idom = idom_bb->idom;
583 										idom_bb = &blocks[idom];
584 									}
585 								}
586 							}
587 						}
588 
589 						if (bb->idom != idom) {
590 							bb->idom = idom;
591 							changed = 1;
592 						}
593 						break;
594 					}
595 					p++;
596 				} while (--k > 0);
597 			}
598 		}
599 	} while (changed);
600 	blocks[1].idom = 0;
601 	blocks[1].dom_depth = 0;
602 
603 	/* Construct dominators tree */
604 	for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) {
605 		IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
606 		if (bb->idom > 0) {
607 			ir_block *idom_bb = &blocks[bb->idom];
608 
609 			bb->dom_depth = idom_bb->dom_depth + 1;
610 			/* Sort by block number to traverse children in pre-order */
611 			if (idom_bb->dom_child == 0) {
612 				idom_bb->dom_child = b;
613 			} else if (b < idom_bb->dom_child) {
614 				bb->dom_next_child = idom_bb->dom_child;
615 				idom_bb->dom_child = b;
616 			} else {
617 				int child = idom_bb->dom_child;
618 				ir_block *child_bb = &blocks[child];
619 
620 				while (child_bb->dom_next_child > 0 && b > child_bb->dom_next_child) {
621 					child = child_bb->dom_next_child;
622 					child_bb = &blocks[child];
623 				}
624 				bb->dom_next_child = child_bb->dom_next_child;
625 				child_bb->dom_next_child = b;
626 			}
627 		}
628 	}
629 
630 	return 1;
631 }
632 #else
633 /* A single pass modification of "A Simple, Fast Dominance Algorithm" by
634  * Cooper, Harvey and Kennedy, that relies on the IR block ordering.
635  * It may fall back to the general (slow) fixed-point algorithm. */
636 static int ir_build_dominators_tree_iterative(ir_ctx *ctx);
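/*
 * Worked example (for illustration): because blocks are visited in RPO and
 * predecessors usually have smaller ids, the idom of a block can be computed
 * in one pass by intersecting the already known idoms of its processed
 * predecessors, walking both candidates up the idom chain until they meet.
 * For
 *
 *     BB1 -> BB2 -> BB4
 *     BB1 -> BB3 -> BB4
 *
 * intersecting the predecessors of BB4 (BB2 and BB3, both with idom BB1)
 * converges on BB1, so idom(BB4) = BB1.  Back-edge predecessors (id > b) are
 * postponed to the worklist and may trigger the iterative fallback below.
 */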
637 int ir_build_dominators_tree(ir_ctx *ctx)
638 {
639 	uint32_t blocks_count, b;
640 	ir_block *blocks, *bb;
641 	uint32_t *edges;
642 	ir_list worklist;
643 
644 	ir_list_init(&worklist, ctx->cfg_blocks_count / 2);
645 
646 	ctx->flags2 |= IR_NO_LOOPS;
647 
648 	/* Find immediate dominators */
649 	blocks = ctx->cfg_blocks;
650 	edges  = ctx->cfg_edges;
651 	blocks_count = ctx->cfg_blocks_count;
652 	blocks[1].idom = 1;
653 	blocks[1].dom_depth = 0;
654 
655 	/* Iterating in Reverse Post Order */
656 	for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) {
657 		IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
658 		IR_ASSERT(bb->predecessors_count > 0);
659 		uint32_t k = bb->predecessors_count;
660 		uint32_t *p = edges + bb->predecessors;
661 		uint32_t idom = *p;
662 		ir_block *idom_bb;
663 
664 		if (UNEXPECTED(idom > b)) {
665 			/* In rare cases, LOOP_BEGIN.op1 may be a back-edge. Skip back-edges. */
666 			ctx->flags2 &= ~IR_NO_LOOPS;
667 			IR_ASSERT(k > 1 && "Wrong blocks order: BB is before its single predecessor");
668 			ir_list_push(&worklist, idom);
669 			while (1) {
670 				k--;
671 				p++;
672 				idom = *p;
673 				if (idom < b) {
674 					break;
675 				}
676 				IR_ASSERT(k > 0);
677 				ir_list_push(&worklist, idom);
678 			}
679 		}
680 		IR_ASSERT(blocks[idom].idom > 0);
681 		IR_ASSERT(k != 0);
682 
683 		while (--k > 0) {
684 			uint32_t pred_b = *(++p);
685 
686 			if (pred_b < b) {
687 				IR_ASSERT(blocks[pred_b].idom > 0);
688 				while (idom != pred_b) {
689 					while (pred_b > idom) {
690 						pred_b = blocks[pred_b].idom;
691 					}
692 					while (idom > pred_b) {
693 						idom = blocks[idom].idom;
694 					}
695 				}
696 			} else {
697 				ctx->flags2 &= ~IR_NO_LOOPS;
698 				IR_ASSERT(bb->predecessors_count > 1);
699 				ir_list_push(&worklist, pred_b);
700 			}
701 		}
702 		bb->idom = idom;
703 		idom_bb = &blocks[idom];
704 
705 		bb->dom_depth = idom_bb->dom_depth + 1;
706 		/* Sort by block number to traverse children in pre-order */
707 		if (idom_bb->dom_child == 0) {
708 			idom_bb->dom_child = b;
709 		} else if (b < idom_bb->dom_child) {
710 			bb->dom_next_child = idom_bb->dom_child;
711 			idom_bb->dom_child = b;
712 		} else {
713 			int child = idom_bb->dom_child;
714 			ir_block *child_bb = &blocks[child];
715 
716 			while (child_bb->dom_next_child > 0 && b > child_bb->dom_next_child) {
717 				child = child_bb->dom_next_child;
718 				child_bb = &blocks[child];
719 			}
720 			bb->dom_next_child = child_bb->dom_next_child;
721 			child_bb->dom_next_child = b;
722 		}
723 	}
724 
725 	blocks[1].idom = 0;
726 
727 	if (ir_list_len(&worklist) != 0) {
728 		uint32_t dom_depth;
729 		uint32_t succ_b;
730 		bool complete = 1;
731 
732 		/* Check if all the back-edges lead to the loop headers */
733 		do {
734 			b = ir_list_pop(&worklist);
735 			bb = &blocks[b];
736 			succ_b = ctx->cfg_edges[bb->successors];
737 			if (bb->successors_count != 1) {
738 				/* LOOP_END/END may be linked with the following ENTRY by a fake edge */
739 				IR_ASSERT(bb->successors_count == 2);
740 				if (blocks[succ_b].flags & IR_BB_ENTRY) {
741 					succ_b = ctx->cfg_edges[bb->successors + 1];
742 				} else {
743 					IR_ASSERT(blocks[ctx->cfg_edges[bb->successors + 1]].flags & IR_BB_ENTRY);
744 				}
745 			}
746 			dom_depth = blocks[succ_b].dom_depth;
747 			while (bb->dom_depth > dom_depth) {
748 				b = bb->dom_parent;
749 				bb = &blocks[b];
750 			}
751 			if (UNEXPECTED(b != succ_b)) {
752 				complete = 0;
753 				break;
754 			}
755 		} while (ir_list_len(&worklist) != 0);
756 
757 		if (UNEXPECTED(!complete)) {
758 			ir_list_free(&worklist);
759 			return ir_build_dominators_tree_iterative(ctx);
760 		}
761 	}
762 
763 	ir_list_free(&worklist);
764 
765 	return 1;
766 }
767 
768 static int ir_build_dominators_tree_iterative(ir_ctx *ctx)
769 {
770 	bool changed;
771 	uint32_t blocks_count, b;
772 	ir_block *blocks, *bb;
773 	uint32_t *edges;
774 
775 	/* Find immediate dominators */
776 	blocks = ctx->cfg_blocks;
777 	edges  = ctx->cfg_edges;
778 	blocks_count = ctx->cfg_blocks_count;
779 
780 	/* Clear the dominators tree, but keep already found dominators */
781 	for (b = 0, bb = &blocks[0]; b <= blocks_count; b++, bb++) {
782 		bb->dom_depth = 0;
783 		bb->dom_child = 0;
784 		bb->dom_next_child = 0;
785 	}
786 
787 	/* Find immediate dominators by iterative fixed-point algorithm */
788 	blocks[1].idom = 1;
789 	do {
790 		changed = 0;
791 
792 		for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) {
793 			IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
794 			IR_ASSERT(bb->predecessors_count > 0);
795 			uint32_t k = bb->predecessors_count;
796 			uint32_t *p = edges + bb->predecessors;
797 			uint32_t idom = *p;
798 
799 			if (blocks[idom].idom == 0) {
800 				while (1) {
801 					k--;
802 					p++;
803 					idom = *p;
804 					if (blocks[idom].idom > 0) {
805 						break;
806 					}
807 					IR_ASSERT(k > 0);
808 				}
809 			}
810 			IR_ASSERT(k != 0);
811 			while (--k > 0) {
812 				uint32_t pred_b = *(++p);
813 
814 				if (blocks[pred_b].idom > 0) {
815 					IR_ASSERT(blocks[pred_b].idom > 0);
816 					while (idom != pred_b) {
817 						while (pred_b > idom) {
818 							pred_b = blocks[pred_b].idom;
819 						}
820 						while (idom > pred_b) {
821 							idom = blocks[idom].idom;
822 						}
823 					}
824 				}
825 			}
826 			if (bb->idom != idom) {
827 				bb->idom = idom;
828 				changed = 1;
829 			}
830 		}
831 	} while (changed);
832 
833 	/* Build dominators tree */
834 	blocks[1].idom = 0;
835 	blocks[1].dom_depth = 0;
836 	for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) {
837 		uint32_t idom = bb->idom;
838 		ir_block *idom_bb = &blocks[idom];
839 
840 		bb->dom_depth = idom_bb->dom_depth + 1;
841 		/* Sort by block number to traverse children in pre-order */
842 		if (idom_bb->dom_child == 0) {
843 			idom_bb->dom_child = b;
844 		} else if (b < idom_bb->dom_child) {
845 			bb->dom_next_child = idom_bb->dom_child;
846 			idom_bb->dom_child = b;
847 		} else {
848 			int child = idom_bb->dom_child;
849 			ir_block *child_bb = &blocks[child];
850 
851 			while (child_bb->dom_next_child > 0 && b > child_bb->dom_next_child) {
852 				child = child_bb->dom_next_child;
853 				child_bb = &blocks[child];
854 			}
855 			bb->dom_next_child = child_bb->dom_next_child;
856 			child_bb->dom_next_child = b;
857 		}
858 	}
859 
860 	return 1;
861 }
862 #endif
863 
864 static bool ir_dominates(const ir_block *blocks, uint32_t b1, uint32_t b2)
865 {
866 	uint32_t b1_depth = blocks[b1].dom_depth;
867 	const ir_block *bb2 = &blocks[b2];
868 
869 	while (bb2->dom_depth > b1_depth) {
870 		b2 = bb2->dom_parent;
871 		bb2 = &blocks[b2];
872 	}
873 	return b1 == b2;
874 }
875 
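/*
 * Sketch of the loop identification below (Sreedhar et al., DJ graphs):
 * blocks are processed from the deepest dominator-tree level upwards.  A
 * predecessor edge pred -> b that is not a dominator-tree edge is a join
 * edge; when b dominates pred it is a back-join edge, so b is a loop header
 * and the loop body is collected by walking predecessors backwards from
 * pred.  E.g. for
 *
 *     BB1 -> BB2 (header) -> BB3 (latch) -> BB2
 *
 * BB2 dominates BB3, so the edge BB3 -> BB2 marks BB2 as IR_BB_LOOP_HEADER
 * and sets BB3->loop_header = BB2.  A cross-join edge coming from a
 * DJ-spanning-tree descendant marks the loop as irreducible instead.
 */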
876 int ir_find_loops(ir_ctx *ctx)
877 {
878 	uint32_t i, j, n, count;
879 	uint32_t *entry_times, *exit_times, *sorted_blocks, time = 1;
880 	ir_block *blocks = ctx->cfg_blocks;
881 	uint32_t *edges = ctx->cfg_edges;
882 	ir_worklist work;
883 
884 	if (ctx->flags2 & IR_NO_LOOPS) {
885 		return 1;
886 	}
887 
888 	/* We don't materialize the DJ spanning tree explicitly, as we are only interested in ancestor
889 	 * queries. These are implemented by checking entry/exit times of the DFS search. */
890 	ir_worklist_init(&work, ctx->cfg_blocks_count + 1);
891 	entry_times = ir_mem_malloc((ctx->cfg_blocks_count + 1) * 3 * sizeof(uint32_t));
892 	exit_times = entry_times + ctx->cfg_blocks_count + 1;
893 	sorted_blocks = exit_times + ctx->cfg_blocks_count + 1;
894 
895 	memset(entry_times, 0, (ctx->cfg_blocks_count + 1) * sizeof(uint32_t));
896 
897 	ir_worklist_push(&work, 1);
898 	while (ir_worklist_len(&work)) {
899 		ir_block *bb;
900 		int child;
901 
902 next:
903 		i = ir_worklist_peek(&work);
904 		if (!entry_times[i]) {
905 			entry_times[i] = time++;
906 		}
907 
908 		/* Visit blocks immediately dominated by i. */
909 		bb = &blocks[i];
910 		for (child = bb->dom_child; child > 0; child = blocks[child].dom_next_child) {
911 			if (ir_worklist_push(&work, child)) {
912 				goto next;
913 			}
914 		}
915 
916 		/* Visit join edges. */
917 		if (bb->successors_count) {
918 			uint32_t *p = edges + bb->successors;
919 			for (j = 0; j < bb->successors_count; j++,p++) {
920 				uint32_t succ = *p;
921 
922 				if (blocks[succ].idom == i) {
923 					continue;
924 				} else if (ir_worklist_push(&work, succ)) {
925 					goto next;
926 				}
927 			}
928 		}
929 		exit_times[i] = time++;
930 		ir_worklist_pop(&work);
931 	}
932 
933 	/* Sort blocks by level, which is the opposite of the order in which we want to process them */
934 	sorted_blocks[1] = 1;
935 	j = 1;
936 	n = 2;
937 	while (j != n) {
938 		i = j;
939 		j = n;
940 		for (; i < j; i++) {
941 			int child;
942 			for (child = blocks[sorted_blocks[i]].dom_child; child > 0; child = blocks[child].dom_next_child) {
943 				sorted_blocks[n++] = child;
944 			}
945 		}
946 	}
947 	count = n;
948 
949 	/* Identify loops. See Sreedhar et al, "Identifying Loops Using DJ Graphs". */
950 	while (n > 1) {
951 		i = sorted_blocks[--n];
952 		ir_block *bb = &blocks[i];
953 
954 		if (bb->predecessors_count > 1) {
955 			bool irreducible = 0;
956 			uint32_t *p = &edges[bb->predecessors];
957 
958 			j = bb->predecessors_count;
959 			do {
960 				uint32_t pred = *p;
961 
962 				/* A join edge is one for which the predecessor does not
963 				   immediately dominate the successor.  */
964 				if (bb->idom != pred) {
965 					/* In a loop back-edge (back-join edge), the successor dominates
966 					   the predecessor.  */
967 					if (ir_dominates(blocks, i, pred)) {
968 						if (!ir_worklist_len(&work)) {
969 							ir_bitset_clear(work.visited, ir_bitset_len(ir_worklist_capasity(&work)));
970 						}
971 						blocks[pred].loop_header = 0; /* support for merged loops */
972 						ir_worklist_push(&work, pred);
973 					} else {
974 						/* Otherwise it's a cross-join edge.  See if it's a branch
975 						   to an ancestor on the DJ spanning tree.  */
976 						if (entry_times[pred] > entry_times[i] && exit_times[pred] < exit_times[i]) {
977 							irreducible = 1;
978 						}
979 					}
980 				}
981 				p++;
982 			} while (--j);
983 
984 			if (UNEXPECTED(irreducible)) {
985 				// TODO: Support for irreducible loops ???
986 				bb->flags |= IR_BB_IRREDUCIBLE_LOOP;
987 				ctx->flags2 |= IR_IRREDUCIBLE_CFG;
988 				while (ir_worklist_len(&work)) {
989 					ir_worklist_pop(&work);
990 				}
991 			} else if (ir_worklist_len(&work)) {
992 				bb->flags |= IR_BB_LOOP_HEADER;
993 				ctx->flags2 |= IR_CFG_HAS_LOOPS;
994 				bb->loop_depth = 1;
995 				while (ir_worklist_len(&work)) {
996 					j = ir_worklist_pop(&work);
997 					while (blocks[j].loop_header > 0) {
998 						j = blocks[j].loop_header;
999 					}
1000 					if (j != i) {
1001 						ir_block *bb = &blocks[j];
1002 						if (bb->idom == 0 && j != 1) {
1003 							/* Ignore blocks that are unreachable or only abnormally reachable. */
1004 							continue;
1005 						}
1006 						bb->loop_header = i;
1007 						if (bb->predecessors_count) {
1008 							uint32_t *p = &edges[bb->predecessors];
1009 							j = bb->predecessors_count;
1010 							do {
1011 								ir_worklist_push(&work, *p);
1012 								p++;
1013 							} while (--j);
1014 						}
1015 					}
1016 				}
1017 			}
1018 		}
1019 	}
1020 
1021 	if (ctx->flags2 & IR_CFG_HAS_LOOPS) {
1022 		for (n = 1; n < count; n++) {
1023 			i = sorted_blocks[n];
1024 			ir_block *bb = &blocks[i];
1025 			if (bb->loop_header > 0) {
1026 				ir_block *loop = &blocks[bb->loop_header];
1027 				uint32_t loop_depth = loop->loop_depth;
1028 
1029 				if (bb->flags & IR_BB_LOOP_HEADER) {
1030 					loop_depth++;
1031 				}
1032 				bb->loop_depth = loop_depth;
1033 				if (bb->flags & (IR_BB_ENTRY|IR_BB_LOOP_WITH_ENTRY)) {
1034 					loop->flags |= IR_BB_LOOP_WITH_ENTRY;
1035 				}
1036 			}
1037 		}
1038 	}
1039 
1040 	ir_mem_free(entry_times);
1041 	ir_worklist_free(&work);
1042 
1043 	return 1;
1044 }
1045 
1046 static uint32_t _ir_skip_empty_blocks(const ir_ctx *ctx, uint32_t b)
1047 {
1048 	while (1) {
1049 		ir_block *bb = &ctx->cfg_blocks[b];
1050 
1051 		if ((bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) == IR_BB_EMPTY) {
1052 			IR_ASSERT(bb->successors_count == 1);
1053 			b = ctx->cfg_edges[bb->successors];
1054 		} else {
1055 			return b;
1056 		}
1057 	}
1058 }
1059 
1060 /* A variation of the "Bottom-up Positioning" algorithm described by
1061  * Karl Pettis and Robert C. Hansen in "Profile Guided Code Positioning"
1062  */
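/*
 * In short (rough summary of the numbered steps in
 * ir_schedule_blocks_bottom_up() below): every block starts as a chain of
 * its own, edge frequencies are estimated from branch probabilities and loop
 * depth, edges are sorted by decreasing frequency, and the chains connected
 * by the hottest remaining edge are concatenated, so that frequently taken
 * branches become fall-through paths in the final layout.
 */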
1063 typedef struct _ir_edge_info {
1064 	uint32_t from;
1065 	uint32_t to;
1066 	float    freq;
1067 } ir_edge_info;
1068 
1069 typedef struct _ir_chain {
1070 	uint32_t head;
1071 	uint32_t next;
1072 	union {
1073 		uint32_t prev;
1074 		uint32_t tail;
1075 	};
1076 } ir_chain;
1077 
1078 static int ir_edge_info_cmp(const void *b1, const void *b2)
1079 {
1080 	ir_edge_info *e1 = (ir_edge_info*)b1;
1081 	ir_edge_info *e2 = (ir_edge_info*)b2;
1082 
1083 	if (e1->freq != e2->freq) {
1084 		return e1->freq < e2->freq ? 1 : -1;
1085 	}
1086 	/* In case of equal frequencies, try to avoid penalizing one of the "equal" paths by
1087 	 * preferring the first RPO successor (in conditional branches) and the last RPO predecessor
1088 	 * (in merge points).
1089 	 *
1090 	 * See "Static Basic Block Reordering Heuristics for Implicit Control Flow in Baseline JITs"
1091 	 * Polito Guillermo, Ducasse Stephane, and Tesone Pablo (2021)
1092 	 */
1093 	if (e1->from != e2->from) {
1094 		return e2->from - e1->from;
1095 	} else {
1096 		return e1->to - e2->to;
1097 	}
1098 }
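
/*
 * Example ordering produced by qsort() with this comparator (illustrative
 * frequencies): edges are sorted by decreasing frequency,
 *
 *     {2->4, 0.9}  {1->2, 0.5}  {1->3, 0.5}  {3->4, 0.1}
 *
 * and, among equally frequent edges, the larger "from" block comes first
 * and, for the same "from", the smaller "to" block comes first.
 */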
1099 
1100 static IR_NEVER_INLINE uint32_t ir_chain_head_path_compress(ir_chain *chains, uint32_t src, uint32_t head)
1101 {
1102 	IR_ASSERT(head != 0);
1103 	do {
1104 		head = chains[head].head;
1105 	} while (chains[head].head != head);
1106 	do {
1107 		uint32_t next = chains[src].head;
1108 		chains[src].head = head;
1109 		src = next;
1110 	} while (chains[src].head != head);
1111 	return head;
1112 }
1113 
1114 IR_ALWAYS_INLINE uint32_t ir_chain_head(ir_chain *chains, uint32_t src)
1115 {
1116 	uint32_t head = chains[src].head;
1117 	if (chains[head].head == head) {
1118 		return head;
1119 	} else {
1120 		return ir_chain_head_path_compress(chains, src, head);
1121 	}
1122 }
1123 
1124 static void ir_join_chains(ir_chain *chains, uint32_t src, uint32_t dst)
1125 {
1126 	uint32_t dst_tail = chains[dst].tail;
1127 	uint32_t src_tail = chains[src].tail;
1128 
1129 	chains[dst_tail].next = src;
1130 	chains[dst].prev = src_tail;
1131 	chains[src_tail].next = dst;
1132 	chains[src].tail = dst_tail;
1133 	chains[dst].head = src;
1134 }
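
/*
 * Chain representation (informal): chains[x].head points, possibly through
 * several hops compressed by ir_chain_head(), to the first block of the
 * chain; the head element stores the last block in chains[head].tail, and
 * chains[x].next links the blocks in layout order (the tail links back to
 * the head).  Joining (BB5, BB7) onto (BB2, BB3) with
 * ir_join_chains(chains, 2, 5) yields (BB2, BB3, BB5, BB7) with head 2 and
 * tail 7.
 */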
1135 
1136 static void ir_insert_chain_before(ir_chain *chains, uint32_t c, uint32_t before)
1137 {
1138 	ir_chain *this = &chains[c];
1139 	ir_chain *next = &chains[before];
1140 
1141 	IR_ASSERT(chains[c].head == c);
1142 	if (chains[before].head != before) {
1143 		this->head = next->head;
1144 	} else {
1145 		next->head = c;
1146 	}
1147 	this->next = before;
1148 	this->prev = next->prev;
1149 	next->prev = c;
1150 	chains[this->prev].next = c;
1151 }
1152 
1153 #ifndef IR_DEBUG_BB_SCHEDULE_GRAPH
1154 # define IR_DEBUG_BB_SCHEDULE_GRAPH 0
1155 #endif
1156 #ifndef IR_DEBUG_BB_SCHEDULE_EDGES
1157 # define IR_DEBUG_BB_SCHEDULE_EDGES 0
1158 #endif
1159 #ifndef IR_DEBUG_BB_SCHEDULE_CHAINS
1160 # define IR_DEBUG_BB_SCHEDULE_CHAINS 0
1161 #endif
1162 
1163 #if IR_DEBUG_BB_SCHEDULE_GRAPH
1164 static void ir_dump_cfg_freq_graph(ir_ctx *ctx, float *bb_freq, uint32_t edges_count, ir_edge_info *edges, ir_chain *chains)
1165 {
1166 	uint32_t b, i;
1167 	ir_block *bb;
1168 	uint8_t c, *colors;
1169 	bool is_head, is_empty;
1170 	uint32_t max_colors = 12;
1171 
1172 	colors = alloca(sizeof(uint8_t) * (ctx->cfg_blocks_count + 1));
1173 	memset(colors, 0, sizeof(uint8_t) * (ctx->cfg_blocks_count + 1));
1174 	i = 0;
1175 	for (b = 1; b < ctx->cfg_blocks_count + 1; b++) {
1176 		if (chains[b].head == b) {
1177 			colors[b] = (i % max_colors) + 1;
1178 			i++;
1179 		}
1180 	}
1181 
1182 	fprintf(stderr, "digraph {\n");
1183 	fprintf(stderr, "\trankdir=TB;\n");
1184 	for (b = 1; b <= ctx->cfg_blocks_count; b++) {
1185 		bb = &ctx->cfg_blocks[b];
1186 		c = (chains[b].head) ? colors[ir_chain_head(chains, b)] : 0;
1187 		is_head = chains[b].head == b;
1188 		is_empty = (bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) == IR_BB_EMPTY;
1189 		if (c) {
1190 			fprintf(stderr, "\tBB%d [label=\"BB%d: (%d),%0.3f\\n%s\\n%s\",colorscheme=set312,fillcolor=%d%s%s]\n", b, b,
1191 				bb->loop_depth, bb_freq[b],
1192 				ir_op_name[ctx->ir_base[bb->start].op], ir_op_name[ctx->ir_base[bb->end].op],
1193 				c,
1194 				is_head ? ",penwidth=3" : "",
1195 				is_empty ? ",style=\"dotted,filled\"" : ",style=\"filled\"");
1196 		} else {
1197 			fprintf(stderr, "\tBB%d [label=\"BB%d: (%d),%0.3f\\n%s\\n%s\"%s%s]\n", b, b,
1198 				bb->loop_depth, bb_freq[b],
1199 				ir_op_name[ctx->ir_base[bb->start].op], ir_op_name[ctx->ir_base[bb->end].op],
1200 				is_head ? ",penwidth=3" : "",
1201 				is_empty ? ",style=\"dotted\"" : "");
1202 		}
1203 	}
1204 	fprintf(stderr, "\n");
1205 
1206 	for (i = 0; i < edges_count; i++) {
1207 		fprintf(stderr, "\tBB%d -> BB%d [label=\"%0.3f\"]\n", edges[i].from, edges[i].to, edges[i].freq);
1208 	}
1209 	fprintf(stderr, "}\n");
1210 }
1211 #endif
1212 
1213 #if IR_DEBUG_BB_SCHEDULE_EDGES
1214 static void ir_dump_edges(ir_ctx *ctx, uint32_t edges_count, ir_edge_info *edges)
1215 {
1216 	uint32_t i;
1217 
1218 	fprintf(stderr, "Edges:\n");
1219 	for (i = 0; i < edges_count; i++) {
1220 		fprintf(stderr, "\tBB%d -> BB%d [label=\"%0.3f\"]\n", edges[i].from, edges[i].to, edges[i].freq);
1221 	}
1222 	fprintf(stderr, "}\n");
1223 }
1224 #endif
1225 
1226 #if IR_DEBUG_BB_SCHEDULE_CHAINS
1227 static void ir_dump_chains(ir_ctx *ctx, ir_chain *chains)
1228 {
1229 	uint32_t b, tail, i;
1230 
1231 	fprintf(stderr, "Chains:\n");
1232 	for (b = 1; b < ctx->cfg_blocks_count + 1; b++) {
1233 		if (chains[b].head == b) {
1234 			tail = chains[b].tail;
1235 			i = b;
1236 			fprintf(stderr, "(BB%d", i);
1237 			while (i != tail) {
1238 				i = chains[i].next;
1239 				fprintf(stderr, ", BB%d", i);
1240 			}
1241 			fprintf(stderr, ")\n");
1242 		}
1243 	}
1244 }
1245 #endif
1246 
1247 static int ir_schedule_blocks_bottom_up(ir_ctx *ctx)
1248 {
1249 	uint32_t max_edges_count = ctx->cfg_edges_count / 2;
1250 	uint32_t edges_count = 0;
1251 	uint32_t b, i, loop_depth;
1252 	float *bb_freq, freq;
1253 	ir_block *bb;
1254 	ir_edge_info *edges, *e;
1255 	ir_chain *chains;
1256 	ir_bitqueue worklist;
1257 	ir_bitset visited;
1258 	uint32_t *schedule_end, count;
1259 
1260 	ctx->cfg_schedule = ir_mem_malloc(sizeof(uint32_t) * (ctx->cfg_blocks_count + 2));
1261 	schedule_end = ctx->cfg_schedule + ctx->cfg_blocks_count;
1262 
1263 	/* 1. Create initial chains for each BB */
1264 	chains = ir_mem_malloc(sizeof(ir_chain) * (ctx->cfg_blocks_count + 1));
1265 	chains[0].head = 0;
1266 	chains[0].next = 0;
1267 	chains[0].prev = 0;
1268 	for (b = 1; b <= ctx->cfg_blocks_count; b++) {
1269 		chains[b].head = b;
1270 		chains[b].next = b;
1271 		chains[b].prev = b;
1272 	}
1273 
1274 	/* 2. Collect information about BB and EDGE frequencies */
1275 	edges = ir_mem_malloc(sizeof(ir_edge_info) * max_edges_count);
1276 	bb_freq = ir_mem_calloc(ctx->cfg_blocks_count + 1, sizeof(float));
1277 	bb_freq[1] = 1.0f;
1278 	visited = ir_bitset_malloc(ctx->cfg_blocks_count + 1);
1279 	ir_bitqueue_init(&worklist, ctx->cfg_blocks_count + 1);
1280 	ir_bitqueue_add(&worklist, 1);
1281 	while ((b = ir_bitqueue_pop(&worklist)) != (uint32_t)-1) {
1282 restart:
1283 		bb = &ctx->cfg_blocks[b];
1284 		if (bb->predecessors_count) {
1285 			uint32_t n = bb->predecessors_count;
1286 			uint32_t *p = ctx->cfg_edges + bb->predecessors;
1287 			for (; n > 0; p++, n--) {
1288 				uint32_t predecessor = *p;
1289 				/* Basic blocks are ordered so that, in the usual case, predecessor ids are less than successor ids.
1290 				 * So we may compare block ids (predecessor < b) instead of the more expensive check for a back edge
1291 				 * (b != predecessor && ctx->cfg_blocks[predecessor].loop_header != b)
1292 				 */
1293 				if (predecessor < b) {
1294 					if (!ir_bitset_in(visited, predecessor)) {
1295 						b = predecessor;
1296 						ir_bitqueue_del(&worklist, b);
1297 						goto restart;
1298 					}
1299 				} else if (b != predecessor && ctx->cfg_blocks[predecessor].loop_header != b) {
1300 					ir_dump_cfg(ctx, stderr);
1301 					IR_ASSERT(b == predecessor || ctx->cfg_blocks[predecessor].loop_header == b);
1302 				}
1303 			}
1304 		}
1305 
1306 		ir_bitset_incl(visited, b);
1307 
1308 		if ((bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) == IR_BB_EMPTY) {
1309 			uint32_t successor = ctx->cfg_edges[bb->successors];
1310 
1311 			/* move empty blocks to the end */
1312 			IR_ASSERT(chains[b].head == b);
1313 			chains[b].head = 0;
1314 			*schedule_end = b;
1315 			schedule_end--;
1316 
1317 			if (successor > b) {
1318 				bb_freq[successor] += bb_freq[b];
1319 				b = successor;
1320 				ir_bitqueue_del(&worklist, b);
1321 				goto restart;
1322 			} else {
1323 				continue;
1324 			}
1325 		}
1326 
1327 		loop_depth = bb->loop_depth;
1328 		if (bb->flags & IR_BB_LOOP_HEADER) {
1329 			// TODO: Estimate the loop iterations count
1330 			bb_freq[b] *= 10;
1331 		}
1332 
1333 		if (bb->successors_count) {
1334 			uint32_t n = bb->successors_count;
1335 			uint32_t *p = ctx->cfg_edges + bb->successors;
1336 
1337 			if (n == 1) {
1338 				uint32_t successor = *p;
1339 
1340 				IR_ASSERT(edges_count < max_edges_count);
1341 				freq = bb_freq[b];
1342 				if (successor > b) {
1343 					IR_ASSERT(!ir_bitset_in(visited, successor));
1344 					bb_freq[successor] += freq;
1345 					ir_bitqueue_add(&worklist, successor);
1346 				}
1347 				successor = _ir_skip_empty_blocks(ctx, successor);
1348 				edges[edges_count].from = b;
1349 				edges[edges_count].to = successor;
1350 				edges[edges_count].freq = freq;
1351 				edges_count++;
1352 			} else if (n == 2 && ctx->ir_base[bb->end].op == IR_IF) {
1353 				uint32_t successor1 = *p;
1354 				ir_block *successor1_bb = &ctx->cfg_blocks[successor1];
1355 				ir_insn *insn1 = &ctx->ir_base[successor1_bb->start];
1356 				uint32_t successor2 = *(p + 1);
1357 				ir_block *successor2_bb = &ctx->cfg_blocks[successor2];
1358 				ir_insn *insn2 = &ctx->ir_base[successor2_bb->start];
1359 				int prob1, prob2, probN = 100;
1360 
1361 				if (insn1->op2) {
1362 					prob1 = insn1->op2;
1363 					if (insn2->op2) {
1364 						prob2 = insn2->op2;
1365 						probN = prob1 + prob2;
1366 					} else {
1367 						if (prob1 > 100) {
1368 							prob1 = 100;
1369 						}
1370 						prob2 = 100 - prob1;
1371 					}
1372 
1373 				} else if (insn2->op2) {
1374 					prob2 = insn2->op2;
1375 					if (prob2 > 100) {
1376 						prob2 = 100;
1377 					}
1378 					prob1 = 100 - prob2;
1379 				} else if (successor1_bb->loop_depth >= loop_depth
1380 						&& successor2_bb->loop_depth < loop_depth) {
1381 					prob1 = 90;
1382 					prob2 = 10;
1383 				} else if (successor1_bb->loop_depth < loop_depth
1384 						&& successor2_bb->loop_depth >= loop_depth) {
1385 					prob1 = 10;
1386 					prob2 = 90;
1387 				} else if (successor2_bb->flags & IR_BB_EMPTY) {
1388 					prob1 = 51;
1389 					prob2 = 49;
1390 				} else if (successor1_bb->flags & IR_BB_EMPTY) {
1391 					prob1 = 49;
1392 					prob2 = 51;
1393 				} else {
1394 					prob1 = prob2 = 50;
1395 				}
1396 				do {
1397 					freq = bb_freq[b] * (float)prob1 / (float)probN;
1398 					if (successor1 > b) {
1399 						IR_ASSERT(!ir_bitset_in(visited, successor1));
1400 						bb_freq[successor1] += freq;
1401 						if (successor1_bb->successors_count == 0 && insn1->op2 == 1) {
1402 							/* move cold block without successors to the end */
1403 							IR_ASSERT(chains[successor1].head == successor1);
1404 							chains[successor1].head = 0;
1405 							*schedule_end = successor1;
1406 							schedule_end--;
1407 							break;
1408 						} else {
1409 							ir_bitqueue_add(&worklist, successor1);
1410 						}
1411 					}
1412 					/* try to join edges early to reduce the number of edges and the cost of sorting them */
1413 					if (prob1 > prob2
1414 					 && (successor1_bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) != IR_BB_EMPTY) {
1415 						uint32_t src = chains[b].next;
1416 						IR_ASSERT(chains[src].head == src);
1417 						IR_ASSERT(src == ir_chain_head(chains, b));
1418 						IR_ASSERT(chains[successor1].head == successor1);
1419 						ir_join_chains(chains, src, successor1);
1420 						if (!IR_DEBUG_BB_SCHEDULE_GRAPH) break;
1421 					}
1422 					successor1 = _ir_skip_empty_blocks(ctx, successor1);
1423 					IR_ASSERT(edges_count < max_edges_count);
1424 					edges[edges_count].from = b;
1425 					edges[edges_count].to = successor1;
1426 					edges[edges_count].freq = freq;
1427 					edges_count++;
1428 				} while (0);
1429 				do {
1430 					freq = bb_freq[b] * (float)prob2 / (float)probN;
1431 					if (successor2 > b) {
1432 						IR_ASSERT(!ir_bitset_in(visited, successor2));
1433 						bb_freq[successor2] += freq;
1434 						if (successor2_bb->successors_count == 0 && insn2->op2 == 1) {
1435 							/* move cold block without successors to the end */
1436 							IR_ASSERT(chains[successor2].head == successor2);
1437 							chains[successor2].head = 0;
1438 							*schedule_end = successor2;
1439 							schedule_end--;
1440 							break;
1441 						} else {
1442 							ir_bitqueue_add(&worklist, successor2);
1443 						}
1444 					}
1445 					if (prob2 > prob1
1446 					 && (successor2_bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) != IR_BB_EMPTY) {
1447 						uint32_t src = chains[b].next;
1448 						IR_ASSERT(chains[src].head == src);
1449 						IR_ASSERT(src == ir_chain_head(chains, b));
1450 						IR_ASSERT(chains[successor2].head == successor2);
1451 						ir_join_chains(chains, src, successor2);
1452 						if (!IR_DEBUG_BB_SCHEDULE_GRAPH) break;
1453 					}
1454 					successor2 = _ir_skip_empty_blocks(ctx, successor2);
1455 					IR_ASSERT(edges_count < max_edges_count);
1456 					edges[edges_count].from = b;
1457 					edges[edges_count].to = successor2;
1458 					edges[edges_count].freq = freq;
1459 					edges_count++;
1460 				} while (0);
1461 			} else {
1462 				int prob;
1463 
1464 				for (; n > 0; p++, n--) {
1465 					uint32_t successor = *p;
1466 					ir_block *successor_bb = &ctx->cfg_blocks[successor];
1467 					ir_insn *insn = &ctx->ir_base[successor_bb->start];
1468 
1469 					if (insn->op == IR_CASE_DEFAULT) {
1470 						prob = insn->op2;
1471 						if (!prob) {
1472 							prob = 100 / bb->successors_count;
1473 						}
1474 					} else if (insn->op == IR_CASE_VAL) {
1475 						prob = insn->op3;
1476 						if (!prob) {
1477 							prob = 100 / bb->successors_count;
1478 						}
1479 					} else if (insn->op == IR_ENTRY) {
1480 						if ((ctx->flags & IR_MERGE_EMPTY_ENTRIES) && (successor_bb->flags & IR_BB_EMPTY)) {
1481 							prob = 99; /* prefer empty ENTRY block to go first */
1482 						} else {
1483 							prob = 1;
1484 						}
1485 					} else {
1486 						prob = 100 / bb->successors_count;
1487 					}
1488 					freq = bb_freq[b] * (float)prob / 100.0f;
1489 					if (successor > b) {
1490 						IR_ASSERT(!ir_bitset_in(visited, successor));
1491 						bb_freq[successor] += freq;
1492 						ir_bitqueue_add(&worklist, successor);
1493 					}
1494 					successor = _ir_skip_empty_blocks(ctx, successor);
1495 					IR_ASSERT(edges_count < max_edges_count);
1496 					edges[edges_count].from = b;
1497 					edges[edges_count].to = successor;
1498 					edges[edges_count].freq = freq;
1499 					edges_count++;
1500 				}
1501 			}
1502 		}
1503 	}
1504 	ir_bitqueue_free(&worklist);
1505 	ir_mem_free(visited);
1506 
1507 	/* 3. Sort EDGEs according to their frequencies */
1508 	qsort(edges, edges_count, sizeof(ir_edge_info), ir_edge_info_cmp);
1509 
1510 #if IR_DEBUG_BB_SCHEDULE_EDGES
1511 	ir_dump_edges(ctx, edges_count, edges);
1512 #endif
1513 
1514 	/* 4. Process EDGEs in decreasing frequency order and join the connected chains */
1515 	for (e = edges, i = edges_count; i > 0; e++, i--) {
1516 		uint32_t dst = chains[e->to].head;
1517 		if (dst == e->to) {
1518 			uint32_t src = chains[e->from].next;
1519 			if (chains[src].head == src) {
1520 				IR_ASSERT(src == ir_chain_head(chains, e->from) && chains[src].tail == e->from);
1521 				if (src != dst) {
1522 					ir_join_chains(chains, src, dst);
1523 				} else if (ctx->cfg_blocks[e->from].successors_count < 2) {
1524 					/* Try to rotate the loop chain so that it ends with a conditional branch */
1525 					uint32_t tail = e->from;
1526 					uint32_t prev = src;
1527 					uint32_t next = chains[prev].next;
1528 					uint32_t best = 0;
1529 
1530 					while (prev != tail) {
1531 						if (ctx->ir_base[ctx->cfg_blocks[prev].end].op == IR_IF) {
1532 							if (ctx->ir_base[ctx->cfg_blocks[prev].start].op == IR_LOOP_BEGIN
1533 							 && ctx->cfg_blocks[prev].loop_depth > 1) {
1534 								best = next;
1535 								break;
1536 							} else if (!best || bb_freq[next] < bb_freq[best]) {
1537 								/* Find the successor of IF with the least frequency */
1538 								best = next;
1539 							}
1540 						}
1541 						prev = next;
1542 						next = chains[next].next;
1543 					}
1544 					if (best) {
1545 						/* change the head of the chain */
1546 						chains[src].head = best;
1547 						chains[best].head = best;
1548 					}
1549 				}
1550 #if !IR_DEBUG_BB_SCHEDULE_GRAPH
1551 				e->from = 0; /* reset "from" to avoid the check in step #7 */
1552 #endif
1553 			}
1554 		}
1555 	}
1556 
1557 #if IR_DEBUG_BB_SCHEDULE_GRAPH
1558 	ir_dump_cfg_freq_graph(ctx, bb_freq, edges_count, edges, chains);
1559 #endif
1560 
1561 	ir_mem_free(bb_freq);
1562 
1563 #if IR_DEBUG_BB_SCHEDULE_CHAINS
1564 	ir_dump_chains(ctx, chains);
1565 #endif
1566 
1567 	/* 5. Merge empty entry blocks */
1568 	if ((ctx->flags & IR_MERGE_EMPTY_ENTRIES) && ctx->entries_count) {
1569 		for (i = 0; i < ctx->entries_count; i++) {
1570 			b = ctx->entries[i];
1571 			IR_ASSERT(ctx->cfg_blocks[b].flags & IR_BB_ENTRY);
1572 			if (b && chains[b].head == b && chains[b].tail == b) {
1573 				bb = &ctx->cfg_blocks[b];
1574 				if (bb->flags & IR_BB_EMPTY) {
1575 					uint32_t successor;
1576 
1577 					do {
1578 						IR_ASSERT(bb->successors_count == 1);
1579 						successor = ctx->cfg_edges[bb->successors];
1580 						bb = &ctx->cfg_blocks[successor];
1581 					} while ((bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) == IR_BB_EMPTY);
1582 					IR_ASSERT(chains[successor].head);
1583 					ir_insert_chain_before(chains, b, successor);
1584 				}
1585 			}
1586 		}
1587 
1588 #if IR_DEBUG_BB_SCHEDULE_CHAINS
1589 		ir_dump_chains(ctx, chains);
1590 #endif
1591 	}
1592 
1593 	/* 6. Align loop headers */
1594 	for (b = 1; b <= ctx->cfg_blocks_count; b++) {
1595 		if (chains[b].head == b) {
1596 			bb = &ctx->cfg_blocks[b];
1597 			if (bb->loop_depth) {
1598 				if ((bb->flags & IR_BB_LOOP_HEADER) || ir_chain_head(chains, bb->loop_header) == b) {
1599 					bb->flags |= IR_BB_ALIGN_LOOP;
1600 				}
1601 			}
1602 		}
1603 	}
1604 
1605 	/* 7. Group chains according to the most frequent edge between them */
1606 	// TODO: Try to find a better heuristic
1607 	for (e = edges, i = edges_count; i > 0; e++, i--) {
1608 #if !IR_DEBUG_BB_SCHEDULE_GRAPH
1609 		if (!e->from) continue;
1610 #endif
1611 		uint32_t src = ir_chain_head(chains, e->from);
1612 		uint32_t dst = ir_chain_head(chains, e->to);
1613 		if (src != dst) {
1614 			if (dst == 1) {
1615 				ir_join_chains(chains, dst, src);
1616 			} else {
1617 				ir_join_chains(chains, src, dst);
1618 			}
1619 		}
1620 	}
1621 
1622 #if IR_DEBUG_BB_SCHEDULE_CHAINS
1623 	ir_dump_chains(ctx, chains);
1624 #endif
1625 
1626 	/* 8. Form the final BB order */
1627 	count = 0;
1628 	for (b = 1; b <= ctx->cfg_blocks_count; b++) {
1629 		if (chains[b].head == b) {
1630 			uint32_t tail = chains[b].tail;
1631 			uint32_t i = b;
1632 			while (1) {
1633 				count++;
1634 				ctx->cfg_schedule[count] = i;
1635 				if (i == tail) break;
1636 				i = chains[i].next;
1637 			}
1638 		}
1639 	}
1640 
1641 	IR_ASSERT(ctx->cfg_schedule + count == schedule_end);
1642 	ctx->cfg_schedule[ctx->cfg_blocks_count + 1] = 0;
1643 
1644 	ir_mem_free(edges);
1645 	ir_mem_free(chains);
1646 
1647 	return 1;
1648 }
1649 
1650 /* A variation of the "Top-down Positioning" algorithm described by
1651  * Karl Pettis and Robert C. Hansen in "Profile Guided Code Positioning"
1652  */
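/*
 * Sketch: the top-down scheduler greedily grows traces.  It repeatedly takes
 * the lowest unscheduled block, then keeps appending the "best" unscheduled
 * successor (deepest loop first, then highest branch probability) until no
 * candidate remains, pushing empty blocks to the end of the schedule.  It is
 * the cheaper fallback used by ir_schedule_blocks() for large or irreducible
 * CFGs.
 */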
1653 static int ir_schedule_blocks_top_down(ir_ctx *ctx)
1654 {
1655 	ir_bitqueue blocks;
1656 	uint32_t b, best_successor, last_non_empty;
1657 	ir_block *bb, *best_successor_bb;
1658 	ir_insn *insn;
1659 	uint32_t *list, *schedule_end;
1660 	uint32_t count = 0;
1661 
1662 	ir_bitqueue_init(&blocks, ctx->cfg_blocks_count + 1);
1663 	blocks.pos = 0;
1664 	list = ir_mem_malloc(sizeof(uint32_t) * (ctx->cfg_blocks_count + 2));
1665 	list[ctx->cfg_blocks_count + 1] = 0;
1666 	schedule_end = list + ctx->cfg_blocks_count;
1667 	for (b = 1; b <= ctx->cfg_blocks_count; b++) {
1668 		ir_bitset_incl(blocks.set, b);
1669 	}
1670 
1671 	while ((b = ir_bitqueue_pop(&blocks)) != (uint32_t)-1) {
1672 		bb = &ctx->cfg_blocks[b];
1673 		/* Start trace */
1674 		last_non_empty = 0;
1675 		do {
1676 			if (UNEXPECTED(bb->flags & IR_BB_PREV_EMPTY_ENTRY) && ir_bitqueue_in(&blocks, b - 1)) {
1677 				/* Schedule the previous empty ENTRY block before this one */
1678 				uint32_t predecessor = b - 1;
1679 
1680 				ir_bitqueue_del(&blocks, predecessor);
1681 				count++;
1682 				list[count] = predecessor;
1683 			}
1684 			if ((bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) == IR_BB_EMPTY) {
1685 				/* move empty blocks to the end */
1686 				*schedule_end = b;
1687 				schedule_end--;
1688 			} else {
1689 				count++;
1690 				list[count] = b;
1691 				last_non_empty = b;
1692 			}
1693 			best_successor_bb = NULL;
1694 			if (bb->successors_count == 1) {
1695 				best_successor = ctx->cfg_edges[bb->successors];
1696 				if (ir_bitqueue_in(&blocks, best_successor)) {
1697 					best_successor_bb = &ctx->cfg_blocks[best_successor];
1698 				}
1699 			} else if (bb->successors_count > 1) {
1700 				uint32_t prob, best_successor_prob;
1701 				uint32_t *p, successor;
1702 				ir_block *successor_bb;
1703 
1704 				for (b = 0, p = &ctx->cfg_edges[bb->successors]; b < bb->successors_count; b++, p++) {
1705 					successor = *p;
1706 					if (ir_bitqueue_in(&blocks, successor)) {
1707 						successor_bb = &ctx->cfg_blocks[successor];
1708 						insn = &ctx->ir_base[successor_bb->start];
1709 						if (insn->op == IR_IF_TRUE || insn->op == IR_IF_FALSE) {
1710 							prob = insn->op2;
1711 							if (!prob) {
1712 								prob = 100 / bb->successors_count;
1713 								if (!(successor_bb->flags & IR_BB_EMPTY)) {
1714 									prob++;
1715 								}
1716 							}
1717 						} else if (insn->op == IR_CASE_DEFAULT) {
1718 							prob = insn->op2;
1719 							if (!prob) {
1720 								prob = 100 / bb->successors_count;
1721 							}
1722 						} else if (insn->op == IR_CASE_VAL) {
1723 							prob = insn->op3;
1724 							if (!prob) {
1725 								prob = 100 / bb->successors_count;
1726 							}
1727 						} else if (insn->op == IR_ENTRY) {
1728 							if ((ctx->flags & IR_MERGE_EMPTY_ENTRIES) && (successor_bb->flags & IR_BB_EMPTY)) {
1729 								prob = 99; /* prefer empty ENTRY block to go first */
1730 							} else {
1731 								prob = 1;
1732 							}
1733 						} else {
1734 							prob = 100 / bb->successors_count;
1735 						}
1736 						if (!best_successor_bb
1737 						 || successor_bb->loop_depth > best_successor_bb->loop_depth
1738 						 || prob > best_successor_prob) {
1739 							best_successor = successor;
1740 							best_successor_bb = successor_bb;
1741 							best_successor_prob = prob;
1742 						}
1743 					}
1744 				}
1745 			}
1746 			if (!best_successor_bb) {
1747 				/* Try to continue trace using the other successor of the last IF */
1748 				if ((bb->flags & IR_BB_EMPTY) && last_non_empty) {
1749 					bb = &ctx->cfg_blocks[last_non_empty];
1750 					if (bb->successors_count == 2 && ctx->ir_base[bb->end].op == IR_IF) {
1751 						b = ctx->cfg_edges[bb->successors];
1752 
1753 						if (!ir_bitqueue_in(&blocks, b)) {
1754 							b = ctx->cfg_edges[bb->successors + 1];
1755 						}
1756 						if (ir_bitqueue_in(&blocks, b)) {
1757 							bb = &ctx->cfg_blocks[b];
1758 							ir_bitqueue_del(&blocks, b);
1759 							continue;
1760 						}
1761 					}
1762 				}
1763 				/* End trace */
1764 				break;
1765 			}
1766 			b = best_successor;
1767 			bb = best_successor_bb;
1768 			ir_bitqueue_del(&blocks, b);
1769 		} while (1);
1770 	}
1771 
1772 	IR_ASSERT(list + count == schedule_end);
1773 	ctx->cfg_schedule = list;
1774 	ir_bitqueue_free(&blocks);
1775 
1776 	return 1;
1777 }
1778 
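/*
 * Summary: ir_schedule_blocks() first marks branches that lead straight to
 * an UNREACHABLE terminator (except after TAILCALL) with a minimal
 * probability of 1 (unless a probability is already set), so both schedulers
 * treat them as cold, and then chooses the bottom-up scheduler for small
 * reducible CFGs and the top-down scheduler otherwise.
 */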
1779 int ir_schedule_blocks(ir_ctx *ctx)
1780 {
1781 	ir_ref ref;
1782 
1783 	if (ctx->cfg_blocks_count <= 2) {
1784 		return 1;
1785 	}
1786 
1787 	/* Mark "stop" blocks terminated with UNREACHABLE as "unexpected" */
1788 	ref = ctx->ir_base[1].op1;
1789 	while (ref) {
1790 		ir_insn *insn = &ctx->ir_base[ref];
1791 
1792 		if (insn->op == IR_UNREACHABLE && ctx->ir_base[insn->op1].op != IR_TAILCALL) {
1793 			ir_block *bb = &ctx->cfg_blocks[ctx->cfg_map[ref]];
1794 			uint32_t n = bb->predecessors_count;
1795 
1796 			if (n == 1) {
1797 				ir_insn *start_insn = &ctx->ir_base[bb->start];
1798 				if (start_insn->op == IR_IF_TRUE
1799 				 || start_insn->op == IR_IF_FALSE
1800 				 || start_insn->op == IR_CASE_DEFAULT) {
1801 					if (!start_insn->op2) start_insn->op2 = 1;
1802 				} else if (start_insn->op == IR_CASE_VAL) {
1803 					if (!start_insn->op3) start_insn->op3 = 1;
1804 				}
1805 			} else if (n > 1) {
1806 				uint32_t *p = &ctx->cfg_edges[bb->predecessors];
1807 
1808 				for (; n > 0; p++, n--) {
1809 					bb = &ctx->cfg_blocks[*p];
1810 					if (bb->predecessors_count == 1) {
1811 						ir_insn *start_insn = &ctx->ir_base[bb->start];
1812 						if (start_insn->op == IR_IF_TRUE
1813 						 || start_insn->op == IR_IF_FALSE
1814 						 || start_insn->op == IR_CASE_DEFAULT) {
1815 							if (!start_insn->op2) start_insn->op2 = 1;
1816 						} else if (start_insn->op == IR_CASE_VAL) {
1817 							if (!start_insn->op3) start_insn->op3 = 1;
1818 						}
1819 					}
1820 				}
1821 			}
1822 		}
1823 		ref = insn->op3;
1824 	}
1825 
1826 	/* The bottom-up Pettis-Hansen algorithm is expensive - O(n^3),
1827 	 * use it only for relatively small functions.
1828 	 *
1829 	 * TODO: make the choice between top-down and bottom-up algorithm configurable
1830 	 */
1831 	if (UNEXPECTED(ctx->flags2 & IR_IRREDUCIBLE_CFG) || ctx->cfg_blocks_count > 256) {
1832 		return ir_schedule_blocks_top_down(ctx);
1833 	} else {
1834 		return ir_schedule_blocks_bottom_up(ctx);
1835 	}
1836 }
1837 
1838 /* JMP target optimisation */
1839 uint32_t ir_skip_empty_target_blocks(const ir_ctx *ctx, uint32_t b)
1840 {
1841 	return _ir_skip_empty_blocks(ctx, b);
1842 }
1843 
1844 uint32_t ir_next_block(const ir_ctx *ctx, uint32_t b)
1845 {
1846 	ir_block *bb;
1847 
1848 	if (ctx->cfg_schedule) {
1849 		uint32_t ret = ctx->cfg_schedule[++b];
1850 
1851 		/* Check for empty ENTRY block */
1852 		while (ret && ((ctx->cfg_blocks[ret].flags & (IR_BB_START|IR_BB_EMPTY)) == IR_BB_EMPTY)) {
1853 			ret = ctx->cfg_schedule[++b];
1854 		}
1855 		return ret;
1856 	}
1857 
1858 	b++;
1859 	while (1) {
1860 		if (b > ctx->cfg_blocks_count) {
1861 			return 0;
1862 		}
1863 
1864 		bb = &ctx->cfg_blocks[b];
1865 
1866 		if ((bb->flags & (IR_BB_START|IR_BB_EMPTY)) == IR_BB_EMPTY) {
1867 			b++;
1868 		} else {
1869 			break;
1870 		}
1871 	}
1872 	return b;
1873 }
1874 
1875 void ir_get_true_false_blocks(const ir_ctx *ctx, uint32_t b, uint32_t *true_block, uint32_t *false_block)
1876 {
1877 	ir_block *bb;
1878 	uint32_t *p, use_block;
1879 
1880 	*true_block = 0;
1881 	*false_block = 0;
1882 	bb = &ctx->cfg_blocks[b];
1883 	IR_ASSERT(ctx->ir_base[bb->end].op == IR_IF);
1884 	IR_ASSERT(bb->successors_count == 2);
1885 	p = &ctx->cfg_edges[bb->successors];
1886 	use_block = *p;
1887 	if (ctx->ir_base[ctx->cfg_blocks[use_block].start].op == IR_IF_TRUE) {
1888 		*true_block = ir_skip_empty_target_blocks(ctx, use_block);
1889 		use_block = *(p+1);
1890 		IR_ASSERT(ctx->ir_base[ctx->cfg_blocks[use_block].start].op == IR_IF_FALSE);
1891 		*false_block = ir_skip_empty_target_blocks(ctx, use_block);
1892 	} else {
1893 		IR_ASSERT(ctx->ir_base[ctx->cfg_blocks[use_block].start].op == IR_IF_FALSE);
1894 		*false_block = ir_skip_empty_target_blocks(ctx, use_block);
1895 		use_block = *(p+1);
1896 		IR_ASSERT(ctx->ir_base[ctx->cfg_blocks[use_block].start].op == IR_IF_TRUE);
1897 		*true_block = ir_skip_empty_target_blocks(ctx, use_block);
1898 	}
1899 	IR_ASSERT(*true_block && *false_block);
1900 }
1901