/*
 * IR - Lightweight JIT Compilation Framework
 * (CFG - Control Flow Graph)
 * Copyright (C) 2022 Zend by Perforce.
 * Authors: Dmitry Stogov <dmitry@php.net>
 */

#include "ir.h"
#include "ir_private.h"

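/* Merge two adjacent basic blocks by removing an END/BEGIN pair that has no other
 * users, reconnecting the predecessor of END directly to the successor of BEGIN.
 * Returns the node that now follows the merged region. */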
static ir_ref _ir_merge_blocks(ir_ctx *ctx, ir_ref end, ir_ref begin)
{
	ir_ref prev, next;
	ir_use_list *use_list;
	ir_ref n, *p;

	IR_ASSERT(ctx->ir_base[begin].op == IR_BEGIN);
	IR_ASSERT(ctx->ir_base[end].op == IR_END);
	IR_ASSERT(ctx->ir_base[begin].op1 == end);
	IR_ASSERT(ctx->use_lists[end].count == 1);

	prev = ctx->ir_base[end].op1;

	use_list = &ctx->use_lists[begin];
	IR_ASSERT(use_list->count == 1);
	next = ctx->use_edges[use_list->refs];

	/* remove BEGIN and END */
	ctx->ir_base[begin].op = IR_NOP;
	ctx->ir_base[begin].op1 = IR_UNUSED;
	ctx->use_lists[begin].count = 0;
	ctx->ir_base[end].op = IR_NOP;
	ctx->ir_base[end].op1 = IR_UNUSED;
	ctx->use_lists[end].count = 0;

	/* connect their predecessor and successor */
	ctx->ir_base[next].op1 = prev;
	use_list = &ctx->use_lists[prev];
	n = use_list->count;
	for (p = &ctx->use_edges[use_list->refs]; n > 0; p++, n--) {
		if (*p == end) {
			*p = next;
		}
	}

	return next;
}

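/* Push all control-flow successors of the node "ref" onto the worklist.
 * Every use is asserted to be a control node; the common one- and two-successor
 * cases are handled without the generic loop. */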
IR_ALWAYS_INLINE void _ir_add_successors(const ir_ctx *ctx, ir_ref ref, ir_worklist *worklist)
{
	ir_use_list *use_list = &ctx->use_lists[ref];
	ir_ref *p, use, n = use_list->count;

	if (n < 2) {
		if (n == 1) {
			use = ctx->use_edges[use_list->refs];
			IR_ASSERT(ir_op_flags[ctx->ir_base[use].op] & IR_OP_FLAG_CONTROL);
			ir_worklist_push(worklist, use);
		}
	} else {
		p = &ctx->use_edges[use_list->refs];
		if (n == 2) {
			use = *p;
			IR_ASSERT(ir_op_flags[ctx->ir_base[use].op] & IR_OP_FLAG_CONTROL);
			ir_worklist_push(worklist, use);
			use = *(p + 1);
			IR_ASSERT(ir_op_flags[ctx->ir_base[use].op] & IR_OP_FLAG_CONTROL);
			ir_worklist_push(worklist, use);
		} else {
			for (; n > 0; p++, n--) {
				use = *p;
				IR_ASSERT(ir_op_flags[ctx->ir_base[use].op] & IR_OP_FLAG_CONTROL);
				ir_worklist_push(worklist, use);
			}
		}
	}
}

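/* Push all control-flow predecessors of a block-starting instruction onto the
 * worklist: MERGE/LOOP_BEGIN nodes have several control inputs, START has none,
 * and any other block start has a single control predecessor in op1. */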
IR_ALWAYS_INLINE void _ir_add_predecessors(const ir_insn *insn, ir_worklist *worklist)
{
	ir_ref n, ref;
	const ir_ref *p;

	if (insn->op == IR_MERGE || insn->op == IR_LOOP_BEGIN) {
		n = insn->inputs_count;
		for (p = insn->ops + 1; n > 0; p++, n--) {
			ref = *p;
			IR_ASSERT(ref);
			ir_worklist_push(worklist, ref);
		}
	} else if (insn->op != IR_START) {
		if (EXPECTED(insn->op1)) {
			ir_worklist_push(worklist, insn->op1);
		}
	}
}

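/* Build the CFG for the current IR in three steps:
 *   1. A backward DFS from the "stop" nodes (terminator list rooted at ir_base[1].op1)
 *      discovers basic block boundaries, merging trivial END/BEGIN pairs when
 *      IR_OPT_CFG is set.
 *   2. A forward DFS from the START node and from IF/SWITCH successors picks up
 *      blocks the backward walk could not reach (e.g. branches ending in infinite loops).
 *   3. The discovered blocks are materialized into ctx->cfg_blocks, the
 *      successor/predecessor edge lists into ctx->cfg_edges, and the
 *      instruction-to-block map into ctx->cfg_map.
 * If SCCP has not been run, blocks that remain unreachable are removed at the end. */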
int ir_build_cfg(ir_ctx *ctx)
{
	ir_ref n, *p, ref, start, end, next;
	uint32_t b;
	ir_insn *insn;
	ir_worklist worklist;
	uint32_t bb_init_flags;
	uint32_t count, bb_count = 0;
	uint32_t edges_count = 0;
	ir_block *blocks, *bb;
	uint32_t *_blocks, *edges;
	ir_use_list *use_list;
	uint32_t len = ir_bitset_len(ctx->insns_count);
	ir_bitset bb_starts = ir_mem_calloc(len * 2, IR_BITSET_BITS / 8);
	ir_bitset bb_leaks = bb_starts + len;
	_blocks = ir_mem_calloc(ctx->insns_count, sizeof(uint32_t));
	ir_worklist_init(&worklist, ctx->insns_count);

	/* First try to perform backward DFS search starting from "stop" nodes */

	/* Add all "stop" nodes */
	ref = ctx->ir_base[1].op1;
	while (ref) {
		ir_worklist_push(&worklist, ref);
		ref = ctx->ir_base[ref].op3;
	}

	while (ir_worklist_len(&worklist)) {
		ref = ir_worklist_pop(&worklist);
		insn = &ctx->ir_base[ref];

		IR_ASSERT(IR_IS_BB_END(insn->op));
		/* Remember BB end */
		end = ref;
		/* Some successors of IF and SWITCH nodes may be inaccessible by backward DFS */
		use_list = &ctx->use_lists[end];
		n = use_list->count;
		if (n > 1) {
			for (p = &ctx->use_edges[use_list->refs]; n > 0; p++, n--) {
				/* Remember possibly inaccessible successors */
				ir_bitset_incl(bb_leaks, *p);
			}
		}
		/* Skip control nodes until BB start */
		ref = insn->op1;
		while (1) {
			insn = &ctx->ir_base[ref];
			if (IR_IS_BB_START(insn->op)) {
				if (insn->op == IR_BEGIN
				 && (ctx->flags & IR_OPT_CFG)
				 && ctx->ir_base[insn->op1].op == IR_END
				 && ctx->use_lists[ref].count == 1) {
					ref = _ir_merge_blocks(ctx, insn->op1, ref);
					ref = ctx->ir_base[ref].op1;
					continue;
				}
				break;
			}
			ref = insn->op1; /* follow connected control nodes until the BB start */
		}
		/* Mark BB Start */
		bb_count++;
		_blocks[ref] = end;
		ir_bitset_incl(bb_starts, ref);
		/* Add predecessors */
		_ir_add_predecessors(insn, &worklist);
	}

	/* The backward DFS may miss some branches that end in infinite loops. */
	/* Try a forward DFS (in most cases all nodes are already processed). */

	/* START node may be inaccessible from "stop" nodes */
	ir_bitset_incl(bb_leaks, 1);

	/* Add the not yet processed START node and successors of IF and SWITCH */
	IR_BITSET_FOREACH_DIFFERENCE(bb_leaks, bb_starts, len, start) {
		ir_worklist_push(&worklist, start);
	} IR_BITSET_FOREACH_END();

	if (ir_worklist_len(&worklist)) {
		ir_bitset_union(worklist.visited, bb_starts, len);
		do {
			ref = ir_worklist_pop(&worklist);
			insn = &ctx->ir_base[ref];

			IR_ASSERT(IR_IS_BB_START(insn->op));
			/* Remember BB start */
			start = ref;
			/* Skip control nodes until BB end */
			while (1) {
				use_list = &ctx->use_lists[ref];
				n = use_list->count;
				next = IR_UNUSED;
				for (p = &ctx->use_edges[use_list->refs]; n > 0; p++, n--) {
					next = *p;
					insn = &ctx->ir_base[next];
					if ((ir_op_flags[insn->op] & IR_OP_FLAG_CONTROL) && insn->op1 == ref) {
						break;
					}
				}
				IR_ASSERT(next != IR_UNUSED);
				ref = next;
next_successor:
				if (IR_IS_BB_END(insn->op)) {
					if (insn->op == IR_END && (ctx->flags & IR_OPT_CFG)) {
						use_list = &ctx->use_lists[ref];
						IR_ASSERT(use_list->count == 1);
						next = ctx->use_edges[use_list->refs];

						if (ctx->ir_base[next].op == IR_BEGIN
						 && ctx->use_lists[next].count == 1) {
							ref = _ir_merge_blocks(ctx, ref, next);
							insn = &ctx->ir_base[ref];
							goto next_successor;
						}
					}
					break;
				}
			}
			/* Mark BB Start */
			bb_count++;
			_blocks[start] = ref;
			ir_bitset_incl(bb_starts, start);
			/* Add successors */
			_ir_add_successors(ctx, ref, &worklist);
		} while (ir_worklist_len(&worklist));
	}

	IR_ASSERT(bb_count > 0);

	/* Create array of basic blocks and count successor/predecessor edges for each BB */
	blocks = ir_mem_malloc((bb_count + 1) * sizeof(ir_block));
	b = 1;
	bb = blocks + 1;
	count = 0;
	/* SCCP already removed UNREACHABLE blocks, otherwise all blocks are marked as UNREACHABLE first */
	bb_init_flags = (ctx->flags2 & IR_SCCP_DONE) ? 0 : IR_BB_UNREACHABLE;
	IR_BITSET_FOREACH(bb_starts, len, start) {
		end = _blocks[start];
		_blocks[start] = b;
		_blocks[end] = b;
		insn = &ctx->ir_base[start];
		IR_ASSERT(IR_IS_BB_START(insn->op));
		IR_ASSERT(end > start);
		bb->start = start;
		bb->end = end;
		bb->successors = count;
		count += ctx->use_lists[end].count;
		bb->successors_count = 0;
		bb->predecessors = count;
		bb->dom_parent = 0;
		bb->dom_depth = 0;
		bb->dom_child = 0;
		bb->dom_next_child = 0;
		bb->loop_header = 0;
		bb->loop_depth = 0;
		if (insn->op == IR_START) {
			bb->flags = IR_BB_START;
			bb->predecessors_count = 0;
		} else {
			bb->flags = bb_init_flags;
			if (insn->op == IR_MERGE || insn->op == IR_LOOP_BEGIN) {
				n = insn->inputs_count;
				bb->predecessors_count = n;
				edges_count += n;
				count += n;
			} else if (EXPECTED(insn->op1)) {
				if (insn->op == IR_ENTRY) {
					bb->flags |= IR_BB_ENTRY;
					ctx->entries_count++;
				}
				bb->predecessors_count = 1;
				edges_count++;
				count++;
			} else {
				IR_ASSERT(insn->op == IR_BEGIN); /* start of unreachable block */
				bb->predecessors_count = 0;
			}
		}
		b++;
		bb++;
	} IR_BITSET_FOREACH_END();
	IR_ASSERT(count == edges_count * 2);
	ir_mem_free(bb_starts);

	/* Create an array of successor/predecessor control edges */
	edges = ir_mem_malloc(edges_count * 2 * sizeof(uint32_t));
	bb = blocks + 1;
	for (b = 1; b <= bb_count; b++, bb++) {
		insn = &ctx->ir_base[bb->start];
		if (bb->predecessors_count > 1) {
			uint32_t *q = edges + bb->predecessors;
			n = insn->inputs_count;
			for (p = insn->ops + 1; n > 0; p++, q++, n--) {
				ref = *p;
				IR_ASSERT(ref);
				ir_ref pred_b = _blocks[ref];
				ir_block *pred_bb = &blocks[pred_b];
				*q = pred_b;
				edges[pred_bb->successors + pred_bb->successors_count++] = b;
			}
		} else if (bb->predecessors_count == 1) {
			ref = insn->op1;
			IR_ASSERT(ref);
			IR_ASSERT(IR_OPND_KIND(ir_op_flags[insn->op], 1) == IR_OPND_CONTROL);
			ir_ref pred_b = _blocks[ref];
			ir_block *pred_bb = &blocks[pred_b];
			edges[bb->predecessors] = pred_b;
			edges[pred_bb->successors + pred_bb->successors_count++] = b;
		}
	}

	ctx->cfg_blocks_count = bb_count;
	ctx->cfg_edges_count = edges_count * 2;
	ctx->cfg_blocks = blocks;
	ctx->cfg_edges = edges;
	ctx->cfg_map = _blocks;

	if (!(ctx->flags2 & IR_SCCP_DONE)) {
		uint32_t reachable_count = 0;

		/* Mark reachable blocks */
		ir_worklist_clear(&worklist);
		ir_worklist_push(&worklist, 1);
		while (ir_worklist_len(&worklist) != 0) {
			uint32_t *p;

			reachable_count++;
			b = ir_worklist_pop(&worklist);
			bb = &blocks[b];
			bb->flags &= ~IR_BB_UNREACHABLE;
			n = bb->successors_count;
			if (n > 1) {
				for (p = edges + bb->successors; n > 0; p++, n--) {
					ir_worklist_push(&worklist, *p);
				}
			} else if (n == 1) {
				ir_worklist_push(&worklist, edges[bb->successors]);
			}
		}
		if (reachable_count != ctx->cfg_blocks_count) {
			ir_remove_unreachable_blocks(ctx);
		}
	}

	ir_worklist_free(&worklist);

	return 1;
}

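/* Remove the edge coming from block "from" out of the predecessor list of "bb",
 * compacting the remaining predecessors in place. */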
static void ir_remove_predecessor(ir_ctx *ctx, ir_block *bb, uint32_t from)
{
	uint32_t i, *p, *q, n = 0;

	p = q = &ctx->cfg_edges[bb->predecessors];
	for (i = 0; i < bb->predecessors_count; i++, p++) {
		if (*p != from) {
			if (p != q) {
				*q = *p;
			}
			q++;
			n++;
		}
	}
	IR_ASSERT(n != bb->predecessors_count);
	bb->predecessors_count = n;
}

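/* Remove every occurrence of "ref" from the use list of node "from" and shrink
 * the list accordingly. */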
static void ir_remove_from_use_list(ir_ctx *ctx, ir_ref from, ir_ref ref)
{
	ir_ref j, n, *p, *q, use;
	ir_use_list *use_list = &ctx->use_lists[from];
	ir_ref skip = 0;

	n = use_list->count;
	for (j = 0, p = q = &ctx->use_edges[use_list->refs]; j < n; j++, p++) {
		use = *p;
		if (use == ref) {
			skip++;
		} else {
			if (p != q) {
				*q = use;
			}
			q++;
		}
	}
	use_list->count -= skip;
}

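/* Remove the control input "from" of a MERGE/LOOP_BEGIN node and update the PHIs
 * that use it. If only one input remains, the node is degraded to a BEGIN and its
 * PHIs become COPYs; otherwise the surviving inputs and the matching PHI operands
 * are compacted in place. */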
static void ir_remove_merge_input(ir_ctx *ctx, ir_ref merge, ir_ref from)
{
	ir_ref i, j, n, k, *p, use;
	ir_insn *use_insn;
	ir_use_list *use_list;
	ir_bitset life_inputs;
	ir_insn *insn = &ctx->ir_base[merge];

	IR_ASSERT(insn->op == IR_MERGE || insn->op == IR_LOOP_BEGIN);
	n = insn->inputs_count;
	i = 1;
	life_inputs = ir_bitset_malloc(n + 1);
	for (j = 1; j <= n; j++) {
		ir_ref input = ir_insn_op(insn, j);

		if (input != from) {
			if (i != j) {
				ir_insn_set_op(insn, i, input);
			}
			ir_bitset_incl(life_inputs, j);
			i++;
		}
	}
	i--;
	if (i == 1) {
		insn->op = IR_BEGIN;
		insn->inputs_count = 0;
		use_list = &ctx->use_lists[merge];
		if (use_list->count > 1) {
			for (k = 0, p = &ctx->use_edges[use_list->refs]; k < use_list->count; k++, p++) {
				use = *p;
				use_insn = &ctx->ir_base[use];
				if (use_insn->op == IR_PHI) {
					/* Convert PHI to COPY */
					i = 2;
					for (j = 2; j <= n; j++) {
						ir_ref input = ir_insn_op(use_insn, j);

						if (ir_bitset_in(life_inputs, j - 1)) {
							use_insn->op1 = ir_insn_op(use_insn, j);
						} else if (input > 0) {
							ir_remove_from_use_list(ctx, input, use);
						}
					}
					use_insn->op = IR_COPY;
					use_insn->op2 = IR_UNUSED;
					use_insn->op3 = IR_UNUSED;
					ir_remove_from_use_list(ctx, merge, use);
				}
			}
		}
	} else {
		insn->inputs_count = i;

		n++;
		use_list = &ctx->use_lists[merge];
		if (use_list->count > 1) {
			for (k = 0, p = &ctx->use_edges[use_list->refs]; k < use_list->count; k++, p++) {
				use = *p;
				use_insn = &ctx->ir_base[use];
				if (use_insn->op == IR_PHI) {
					i = 2;
					for (j = 2; j <= n; j++) {
						ir_ref input = ir_insn_op(use_insn, j);

						if (ir_bitset_in(life_inputs, j - 1)) {
							IR_ASSERT(input);
							if (i != j) {
								ir_insn_set_op(use_insn, i, input);
							}
							i++;
						} else if (input > 0) {
							ir_remove_from_use_list(ctx, input, use);
						}
					}
				}
			}
		}
	}
	ir_mem_free(life_inputs);
	ir_remove_from_use_list(ctx, from, merge);
}

/* A CFG constructed after the SCCP pass has no unreachable BBs; otherwise they have to be removed */
int ir_remove_unreachable_blocks(ir_ctx *ctx)
{
	uint32_t b, *p, i;
	uint32_t unreachable_count = 0;
	uint32_t bb_count = ctx->cfg_blocks_count;
	ir_block *bb = ctx->cfg_blocks + 1;

	for (b = 1; b <= bb_count; b++, bb++) {
		if (bb->flags & IR_BB_UNREACHABLE) {
#if 0
			do {if (!unreachable_count) ir_dump_cfg(ctx, stderr);} while(0);
#endif
			if (bb->successors_count) {
				for (i = 0, p = &ctx->cfg_edges[bb->successors]; i < bb->successors_count; i++, p++) {
					ir_block *succ_bb = &ctx->cfg_blocks[*p];

					if (!(succ_bb->flags & IR_BB_UNREACHABLE)) {
						ir_remove_predecessor(ctx, succ_bb, b);
						ir_remove_merge_input(ctx, succ_bb->start, bb->end);
					}
				}
			} else {
				ir_ref prev, ref = bb->end;
				ir_insn *insn = &ctx->ir_base[ref];

				IR_ASSERT(ir_op_flags[insn->op] & IR_OP_FLAG_TERMINATOR);
				/* remove from terminators list */
				prev = ctx->ir_base[1].op1;
				if (prev == ref) {
					ctx->ir_base[1].op1 = insn->op3;
				} else {
					while (prev) {
						if (ctx->ir_base[prev].op3 == ref) {
							ctx->ir_base[prev].op3 = insn->op3;
							break;
						}
						prev = ctx->ir_base[prev].op3;
					}
				}
			}
			ctx->cfg_map[bb->start] = 0;
			ctx->cfg_map[bb->end] = 0;
			unreachable_count++;
		}
	}

	if (unreachable_count) {
		ir_block *dst_bb;
		uint32_t n = 1;
		uint32_t *edges;

		dst_bb = bb = ctx->cfg_blocks + 1;
		for (b = 1; b <= bb_count; b++, bb++) {
			if (!(bb->flags & IR_BB_UNREACHABLE)) {
				if (dst_bb != bb) {
					memcpy(dst_bb, bb, sizeof(ir_block));
					ctx->cfg_map[dst_bb->start] = n;
					ctx->cfg_map[dst_bb->end] = n;
				}
				dst_bb->successors_count = 0;
				dst_bb++;
				n++;
			}
		}
		ctx->cfg_blocks_count = bb_count = n - 1;

		/* Rebuild successor/predecessor control edges */
		edges = ctx->cfg_edges;
		bb = ctx->cfg_blocks + 1;
		for (b = 1; b <= bb_count; b++, bb++) {
			ir_insn *insn = &ctx->ir_base[bb->start];
			ir_ref *p, ref;

			n = bb->predecessors_count;
			if (n > 1) {
				uint32_t *q = edges + bb->predecessors;

				IR_ASSERT(n == insn->inputs_count);
				for (p = insn->ops + 1; n > 0; p++, q++, n--) {
					ref = *p;
					IR_ASSERT(ref);
					ir_ref pred_b = ctx->cfg_map[ref];
					ir_block *pred_bb = &ctx->cfg_blocks[pred_b];
					*q = pred_b;
					edges[pred_bb->successors + pred_bb->successors_count++] = b;
				}
			} else if (n == 1) {
				ref = insn->op1;
				IR_ASSERT(ref);
				IR_ASSERT(IR_OPND_KIND(ir_op_flags[insn->op], 1) == IR_OPND_CONTROL);
				ir_ref pred_b = ctx->cfg_map[ref];
				ir_block *pred_bb = &ctx->cfg_blocks[pred_b];
				edges[bb->predecessors] = pred_b;
				edges[pred_bb->successors + pred_bb->successors_count++] = b;
			}
		}
	}

	return 1;
}

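/* The disabled branch below keeps the original iterative algorithm from the paper,
 * including explicit post-order numbering; the branch after #else is the
 * single-pass variant that is actually compiled. */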
#if 0
static void compute_postnum(const ir_ctx *ctx, uint32_t *cur, uint32_t b)
{
	uint32_t i, *p;
	ir_block *bb = &ctx->cfg_blocks[b];

	if (bb->postnum != 0) {
		return;
	}

	if (bb->successors_count) {
		bb->postnum = -1; /* Marker for "currently visiting" */
		p = ctx->cfg_edges + bb->successors;
		i = bb->successors_count;
		do {
			compute_postnum(ctx, cur, *p);
			p++;
		} while (--i);
	}
	bb->postnum = (*cur)++;
}

/* Computes dominator tree using algorithm from "A Simple, Fast Dominance Algorithm" by
 * Cooper, Harvey and Kennedy. */
int ir_build_dominators_tree(ir_ctx *ctx)
{
	uint32_t blocks_count, b, postnum;
	ir_block *blocks, *bb;
	uint32_t *edges;
	bool changed;

	ctx->flags2 &= ~IR_NO_LOOPS;

	postnum = 1;
	compute_postnum(ctx, &postnum, 1);

	/* Find immediate dominators */
	blocks = ctx->cfg_blocks;
	edges  = ctx->cfg_edges;
	blocks_count = ctx->cfg_blocks_count;
	blocks[1].idom = 1;
	do {
		changed = 0;
		/* Iterating in Reverse Post Order */
		for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) {
			IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
			if (bb->predecessors_count == 1) {
				uint32_t pred_b = edges[bb->predecessors];

				IR_ASSERT(blocks[pred_b].idom > 0);
				if (bb->idom != pred_b) {
					bb->idom = pred_b;
					changed = 1;
				}
			} else if (bb->predecessors_count) {
				uint32_t idom = 0;
				uint32_t k = bb->predecessors_count;
				uint32_t *p = edges + bb->predecessors;

				do {
					uint32_t pred_b = *p;
					ir_block *pred_bb = &blocks[pred_b];
					ir_block *idom_bb;

					if (pred_bb->idom > 0) {
						idom = pred_b;
						idom_bb = &blocks[idom];

						while (--k > 0) {
							pred_b = *(++p);
							pred_bb = &blocks[pred_b];
							if (pred_bb->idom > 0) {
								while (idom != pred_b) {
									while (pred_bb->postnum < idom_bb->postnum) {
										pred_b = pred_bb->idom;
										pred_bb = &blocks[pred_b];
									}
									while (idom_bb->postnum < pred_bb->postnum) {
										idom = idom_bb->idom;
										idom_bb = &blocks[idom];
									}
								}
							}
						}

						if (bb->idom != idom) {
							bb->idom = idom;
							changed = 1;
						}
						break;
					}
					p++;
				} while (--k > 0);
			}
		}
	} while (changed);
	blocks[1].idom = 0;
	blocks[1].dom_depth = 0;

	/* Construct dominators tree */
	for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) {
		IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
		if (bb->idom > 0) {
			ir_block *idom_bb = &blocks[bb->idom];

			bb->dom_depth = idom_bb->dom_depth + 1;
			/* Sort by block number to traverse children in pre-order */
			if (idom_bb->dom_child == 0) {
				idom_bb->dom_child = b;
			} else if (b < idom_bb->dom_child) {
				bb->dom_next_child = idom_bb->dom_child;
				idom_bb->dom_child = b;
			} else {
				int child = idom_bb->dom_child;
				ir_block *child_bb = &blocks[child];

				while (child_bb->dom_next_child > 0 && b > child_bb->dom_next_child) {
					child = child_bb->dom_next_child;
					child_bb = &blocks[child];
				}
				bb->dom_next_child = child_bb->dom_next_child;
				child_bb->dom_next_child = b;
			}
		}
	}

	return 1;
}
#else
/* A single-pass modification of "A Simple, Fast Dominance Algorithm" by
 * Cooper, Harvey and Kennedy, that relies on the IR block ordering */
int ir_build_dominators_tree(ir_ctx *ctx)
{
	uint32_t blocks_count, b;
	ir_block *blocks, *bb;
	uint32_t *edges;

	ctx->flags2 |= IR_NO_LOOPS;

	/* Find immediate dominators */
	blocks = ctx->cfg_blocks;
	edges  = ctx->cfg_edges;
	blocks_count = ctx->cfg_blocks_count;
	blocks[1].idom = 1;
	blocks[1].dom_depth = 0;

	/* Iterating in Reverse Post Order */
	for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) {
		IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
		IR_ASSERT(bb->predecessors_count > 0);
		uint32_t k = bb->predecessors_count;
		uint32_t *p = edges + bb->predecessors;
		uint32_t idom = *p;
		ir_block *idom_bb;

		if (UNEXPECTED(idom > b)) {
			/* In rare cases, LOOP_BEGIN.op1 may be a back-edge. Skip back-edges. */
			ctx->flags2 &= ~IR_NO_LOOPS;
			while (1) {
				k--;
				p++;
				idom = *p;
				if (idom < b) {
					break;
				}
				IR_ASSERT(k > 0);
			}
		}
		IR_ASSERT(blocks[idom].idom > 0);

		while (--k > 0) {
			uint32_t pred_b = *(++p);

			if (pred_b < b) {
				IR_ASSERT(blocks[pred_b].idom > 0);
				while (idom != pred_b) {
					while (pred_b > idom) {
						pred_b = blocks[pred_b].idom;
					}
					while (idom > pred_b) {
						idom = blocks[idom].idom;
					}
				}
			} else {
				ctx->flags2 &= ~IR_NO_LOOPS;
			}
		}
		bb->idom = idom;
		idom_bb = &blocks[idom];

		bb->dom_depth = idom_bb->dom_depth + 1;
		/* Sort by block number to traverse children in pre-order */
		if (idom_bb->dom_child == 0) {
			idom_bb->dom_child = b;
		} else if (b < idom_bb->dom_child) {
			bb->dom_next_child = idom_bb->dom_child;
			idom_bb->dom_child = b;
		} else {
			int child = idom_bb->dom_child;
			ir_block *child_bb = &blocks[child];

			while (child_bb->dom_next_child > 0 && b > child_bb->dom_next_child) {
				child = child_bb->dom_next_child;
				child_bb = &blocks[child];
			}
			bb->dom_next_child = child_bb->dom_next_child;
			child_bb->dom_next_child = b;
		}
	}

	blocks[1].idom = 0;

	return 1;
}
#endif

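/* Check whether block b1 dominates block b2 by walking up the dominator tree
 * from b2 until the depth of b1 is reached. */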
static bool ir_dominates(const ir_block *blocks, uint32_t b1, uint32_t b2)
{
	uint32_t b1_depth = blocks[b1].dom_depth;
	const ir_block *bb2 = &blocks[b2];

	while (bb2->dom_depth > b1_depth) {
		b2 = bb2->dom_parent;
		bb2 = &blocks[b2];
	}
	return b1 == b2;
}

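/* Detect natural loops and set loop headers and loop depths.
 * The approach follows Sreedhar et al, "Identifying Loops Using DJ Graphs":
 * a DFS over the dominator tree plus join edges yields entry/exit times used for
 * ancestor queries, back-join edges identify loop headers, and a backward walk
 * from each back-edge collects the loop body. Irreducible loops are only flagged
 * (IR_BB_IRREDUCIBLE_LOOP / IR_IRREDUCIBLE_CFG), not further structured. */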
int ir_find_loops(ir_ctx *ctx)
{
	uint32_t i, j, n, count;
	uint32_t *entry_times, *exit_times, *sorted_blocks, time = 1;
	ir_block *blocks = ctx->cfg_blocks;
	uint32_t *edges = ctx->cfg_edges;
	ir_worklist work;

	if (ctx->flags2 & IR_NO_LOOPS) {
		return 1;
	}

	/* We don't materialize the DJ spanning tree explicitly, as we are only interested in ancestor
	 * queries. These are implemented by checking entry/exit times of the DFS. */
	ir_worklist_init(&work, ctx->cfg_blocks_count + 1);
	entry_times = ir_mem_malloc((ctx->cfg_blocks_count + 1) * 3 * sizeof(uint32_t));
	exit_times = entry_times + ctx->cfg_blocks_count + 1;
	sorted_blocks = exit_times + ctx->cfg_blocks_count + 1;

	memset(entry_times, 0, (ctx->cfg_blocks_count + 1) * sizeof(uint32_t));

	ir_worklist_push(&work, 1);
	while (ir_worklist_len(&work)) {
		ir_block *bb;
		int child;

next:
		i = ir_worklist_peek(&work);
		if (!entry_times[i]) {
			entry_times[i] = time++;
		}

		/* Visit blocks immediately dominated by i. */
		bb = &blocks[i];
		for (child = bb->dom_child; child > 0; child = blocks[child].dom_next_child) {
			if (ir_worklist_push(&work, child)) {
				goto next;
			}
		}

		/* Visit join edges. */
		if (bb->successors_count) {
			uint32_t *p = edges + bb->successors;
			for (j = 0; j < bb->successors_count; j++, p++) {
				uint32_t succ = *p;

				if (blocks[succ].idom == i) {
					continue;
				} else if (ir_worklist_push(&work, succ)) {
					goto next;
				}
			}
		}
		exit_times[i] = time++;
		ir_worklist_pop(&work);
	}

	/* Sort blocks by level, which is the opposite of the order in which we want to process them */
	sorted_blocks[1] = 1;
	j = 1;
	n = 2;
	while (j != n) {
		i = j;
		j = n;
		for (; i < j; i++) {
			int child;
			for (child = blocks[sorted_blocks[i]].dom_child; child > 0; child = blocks[child].dom_next_child) {
				sorted_blocks[n++] = child;
			}
		}
	}
	count = n;

	/* Identify loops. See Sreedhar et al, "Identifying Loops Using DJ Graphs". */
	while (n > 1) {
		i = sorted_blocks[--n];
		ir_block *bb = &blocks[i];

		if (bb->predecessors_count > 1) {
			bool irreducible = 0;
			uint32_t *p = &edges[bb->predecessors];

			j = bb->predecessors_count;
			do {
				uint32_t pred = *p;

				/* A join edge is one for which the predecessor does not
				   immediately dominate the successor.  */
				if (bb->idom != pred) {
					/* In a loop back-edge (back-join edge), the successor dominates
					   the predecessor.  */
					if (ir_dominates(blocks, i, pred)) {
						if (!ir_worklist_len(&work)) {
							ir_bitset_clear(work.visited, ir_bitset_len(ir_worklist_capasity(&work)));
						}
						blocks[pred].loop_header = 0; /* support for merged loops */
						ir_worklist_push(&work, pred);
					} else {
						/* Otherwise it's a cross-join edge.  See if it's a branch
						   to an ancestor on the DJ spanning tree.  */
						if (entry_times[pred] > entry_times[i] && exit_times[pred] < exit_times[i]) {
							irreducible = 1;
						}
					}
				}
				p++;
			} while (--j);

			if (UNEXPECTED(irreducible)) {
				// TODO: Support for irreducible loops ???
				bb->flags |= IR_BB_IRREDUCIBLE_LOOP;
				ctx->flags2 |= IR_IRREDUCIBLE_CFG;
				while (ir_worklist_len(&work)) {
					ir_worklist_pop(&work);
				}
			} else if (ir_worklist_len(&work)) {
				bb->flags |= IR_BB_LOOP_HEADER;
				ctx->flags2 |= IR_CFG_HAS_LOOPS;
				bb->loop_depth = 1;
				while (ir_worklist_len(&work)) {
					j = ir_worklist_pop(&work);
					while (blocks[j].loop_header > 0) {
						j = blocks[j].loop_header;
					}
					if (j != i) {
						ir_block *bb = &blocks[j];
						if (bb->idom == 0 && j != 1) {
							/* Ignore blocks that are unreachable or only abnormally reachable. */
							continue;
						}
						bb->loop_header = i;
						if (bb->predecessors_count) {
							uint32_t *p = &edges[bb->predecessors];
							j = bb->predecessors_count;
							do {
								ir_worklist_push(&work, *p);
								p++;
							} while (--j);
						}
					}
				}
			}
		}
	}

	if (ctx->flags2 & IR_CFG_HAS_LOOPS) {
		for (n = 1; n < count; n++) {
			i = sorted_blocks[n];
			ir_block *bb = &blocks[i];
			if (bb->loop_header > 0) {
				ir_block *loop = &blocks[bb->loop_header];
				uint32_t loop_depth = loop->loop_depth;

				if (bb->flags & IR_BB_LOOP_HEADER) {
					loop_depth++;
				}
				bb->loop_depth = loop_depth;
				if (bb->flags & (IR_BB_ENTRY|IR_BB_LOOP_WITH_ENTRY)) {
					loop->flags |= IR_BB_LOOP_WITH_ENTRY;
				}
			}
		}
	}

	ir_mem_free(entry_times);
	ir_worklist_free(&work);

	return 1;
}

/* A variation of the "Top-down Positioning" algorithm described by
 * Karl Pettis and Robert C. Hansen in "Profile Guided Code Positioning"
 *
 * TODO: Switch to "Bottom-up Positioning" algorithm
 */
int ir_schedule_blocks(ir_ctx *ctx)
{
	ir_bitqueue blocks;
	uint32_t b, best_successor, j, last_non_empty;
	ir_block *bb, *best_successor_bb;
	ir_insn *insn;
	uint32_t *list, *map;
	uint32_t count = 0;
	bool reorder = 0;

	ir_bitqueue_init(&blocks, ctx->cfg_blocks_count + 1);
	blocks.pos = 0;
	list = ir_mem_malloc(sizeof(uint32_t) * (ctx->cfg_blocks_count + 1) * 2);
	map = list + (ctx->cfg_blocks_count + 1);
	for (b = 1; b <= ctx->cfg_blocks_count; b++) {
		ir_bitset_incl(blocks.set, b);
	}

	while ((b = ir_bitqueue_pop(&blocks)) != (uint32_t)-1) {
		bb = &ctx->cfg_blocks[b];
		/* Start trace */
		last_non_empty = 0;
		do {
			if (UNEXPECTED(bb->flags & IR_BB_PREV_EMPTY_ENTRY) && ir_bitqueue_in(&blocks, b - 1)) {
				/* Schedule the previous empty ENTRY block before this one */
				uint32_t predecessor = b - 1;

				ir_bitqueue_del(&blocks, predecessor);
				count++;
				list[count] = predecessor;
				map[predecessor] = count;
				if (predecessor != count) {
					reorder = 1;
				}
			}
			count++;
			list[count] = b;
			map[b] = count;
			if (b != count) {
				reorder = 1;
			}
			if (!(bb->flags & IR_BB_EMPTY)) {
				last_non_empty = b;
			}
			best_successor_bb = NULL;
			if (bb->successors_count == 1) {
				best_successor = ctx->cfg_edges[bb->successors];
				if (ir_bitqueue_in(&blocks, best_successor)) {
					best_successor_bb = &ctx->cfg_blocks[best_successor];
				}
			} else if (bb->successors_count > 1) {
				uint32_t prob, best_successor_prob;
				uint32_t *p, successor;
				ir_block *successor_bb;

				for (b = 0, p = &ctx->cfg_edges[bb->successors]; b < bb->successors_count; b++, p++) {
					successor = *p;
					if (ir_bitqueue_in(&blocks, successor)) {
						successor_bb = &ctx->cfg_blocks[successor];
						insn = &ctx->ir_base[successor_bb->start];
						if (insn->op == IR_IF_TRUE || insn->op == IR_IF_FALSE) {
							prob = insn->op2;
							if (!prob) {
								prob = 100 / bb->successors_count;
								if (!(successor_bb->flags & IR_BB_EMPTY)) {
									prob++;
								}
							}
						} else if (insn->op == IR_CASE_DEFAULT) {
							prob = insn->op2;
							if (!prob) {
								prob = 100 / bb->successors_count;
							}
						} else if (insn->op == IR_CASE_VAL) {
							prob = insn->op3;
							if (!prob) {
								prob = 100 / bb->successors_count;
							}
						} else if (insn->op == IR_ENTRY) {
							if ((ctx->flags & IR_MERGE_EMPTY_ENTRIES) && (successor_bb->flags & IR_BB_EMPTY)) {
								prob = 99; /* prefer empty ENTRY block to go first */
							} else {
								prob = 1;
							}
						} else {
							prob = 100 / bb->successors_count;
						}
						if (!best_successor_bb
						 || successor_bb->loop_depth > best_successor_bb->loop_depth
						 || prob > best_successor_prob) {
							best_successor = successor;
							best_successor_bb = successor_bb;
							best_successor_prob = prob;
						}
					}
				}
			}
			if (!best_successor_bb) {
				/* Try to continue the trace via the other successor of the last IF */
				if ((bb->flags & IR_BB_EMPTY) && last_non_empty) {
					bb = &ctx->cfg_blocks[last_non_empty];
					if (bb->successors_count == 2 && ctx->ir_base[bb->end].op == IR_IF) {
						b = ctx->cfg_edges[bb->successors];

						if (!ir_bitqueue_in(&blocks, b)) {
							b = ctx->cfg_edges[bb->successors + 1];
						}
						if (ir_bitqueue_in(&blocks, b)) {
							bb = &ctx->cfg_blocks[b];
							ir_bitqueue_del(&blocks, b);
							continue;
						}
					}
				}
				/* End trace */
				break;
			}
			b = best_successor;
			bb = best_successor_bb;
			ir_bitqueue_del(&blocks, b);
		} while (1);
	}

	if (reorder) {
		ir_block *cfg_blocks = ir_mem_malloc(sizeof(ir_block) * (ctx->cfg_blocks_count + 1));

		memset(cfg_blocks, 0, sizeof(ir_block));
		for (b = 1, bb = cfg_blocks + 1; b <= count; b++, bb++) {
			*bb = ctx->cfg_blocks[list[b]];
			if (bb->dom_parent > 0) {
				bb->dom_parent = map[bb->dom_parent];
			}
			if (bb->dom_child > 0) {
				bb->dom_child = map[bb->dom_child];
			}
			if (bb->dom_next_child > 0) {
				bb->dom_next_child = map[bb->dom_next_child];
			}
			if (bb->loop_header > 0) {
				bb->loop_header = map[bb->loop_header];
			}
		}
		for (j = 0; j < ctx->cfg_edges_count; j++) {
			if (ctx->cfg_edges[j] > 0) {
				ctx->cfg_edges[j] = map[ctx->cfg_edges[j]];
			}
		}
		ir_mem_free(ctx->cfg_blocks);
		ctx->cfg_blocks = cfg_blocks;

		if (ctx->osr_entry_loads) {
			ir_list *list = (ir_list*)ctx->osr_entry_loads;
			uint32_t pos = 0, count;

			while (1) {
				b = ir_list_at(list, pos);
				if (b == 0) {
					break;
				}
				ir_list_set(list, pos, map[b]);
				pos++;
				count = ir_list_at(list, pos);
				pos += count + 1;
			}
		}

		if (ctx->cfg_map) {
			ir_ref i;

			for (i = IR_UNUSED + 1; i < ctx->insns_count; i++) {
				ctx->cfg_map[i] = map[ctx->cfg_map[i]];
			}
		}
	}

	ir_mem_free(list);
	ir_bitqueue_free(&blocks);

	return 1;
}

/* JMP target optimisation */
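/* Follow chains of empty blocks (not ENTRY, not a start block) from "b" and return
 * the first real jump target, so branches can be emitted directly to it. */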
uint32_t ir_skip_empty_target_blocks(const ir_ctx *ctx, uint32_t b)
{
	ir_block *bb;

	while (1) {
		bb = &ctx->cfg_blocks[b];

		if ((bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) == IR_BB_EMPTY) {
			b = ctx->cfg_edges[bb->successors];
		} else {
			break;
		}
	}
	return b;
}

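/* Like ir_skip_empty_target_blocks(), but walks forward over consecutive block
 * numbers instead of following edges; returns 0 when it runs past the last block. */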
uint32_t ir_skip_empty_next_blocks(const ir_ctx *ctx, uint32_t b)
{
	ir_block *bb;

	while (1) {
		if (b > ctx->cfg_blocks_count) {
			return 0;
		}

		bb = &ctx->cfg_blocks[b];

		if ((bb->flags & (IR_BB_START|IR_BB_EMPTY)) == IR_BB_EMPTY) {
			b++;
		} else {
			break;
		}
	}
	return b;
}

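/* For the IF that ends block "b", return the (non-empty) targets of the true and
 * false branches, and the next non-empty block that follows "b" in the scheduled
 * order (0 if "b" is the last block). */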
void ir_get_true_false_blocks(const ir_ctx *ctx, uint32_t b, uint32_t *true_block, uint32_t *false_block, uint32_t *next_block)
{
	ir_block *bb;
	uint32_t *p, use_block;

	*true_block = 0;
	*false_block = 0;
	bb = &ctx->cfg_blocks[b];
	IR_ASSERT(ctx->ir_base[bb->end].op == IR_IF);
	IR_ASSERT(bb->successors_count == 2);
	p = &ctx->cfg_edges[bb->successors];
	use_block = *p;
	if (ctx->ir_base[ctx->cfg_blocks[use_block].start].op == IR_IF_TRUE) {
		*true_block = ir_skip_empty_target_blocks(ctx, use_block);
		use_block = *(p + 1);
		IR_ASSERT(ctx->ir_base[ctx->cfg_blocks[use_block].start].op == IR_IF_FALSE);
		*false_block = ir_skip_empty_target_blocks(ctx, use_block);
	} else {
		IR_ASSERT(ctx->ir_base[ctx->cfg_blocks[use_block].start].op == IR_IF_FALSE);
		*false_block = ir_skip_empty_target_blocks(ctx, use_block);
		use_block = *(p + 1);
		IR_ASSERT(ctx->ir_base[ctx->cfg_blocks[use_block].start].op == IR_IF_TRUE);
		*true_block = ir_skip_empty_target_blocks(ctx, use_block);
	}
	IR_ASSERT(*true_block && *false_block);
	*next_block = b == ctx->cfg_blocks_count ? 0 : ir_skip_empty_next_blocks(ctx, b + 1);
}
1220