xref: /PHP-8.2/ext/opcache/jit/ir/ir_emit.c (revision cea64f7a)
/*
 * IR - Lightweight JIT Compilation Framework
 * (Native code generator based on DynAsm)
 * Copyright (C) 2022 Zend by Perforce.
 * Authors: Dmitry Stogov <dmitry@php.net>
 */

#include "ir.h"

#if defined(IR_TARGET_X86) || defined(IR_TARGET_X64)
# include "ir_x86.h"
#elif defined(IR_TARGET_AARCH64)
# include "ir_aarch64.h"
#else
# error "Unknown IR target"
#endif

#include "ir_private.h"
#ifndef _WIN32
# include <dlfcn.h>
#else
# define WIN32_LEAN_AND_MEAN
# include <windows.h>
# include <psapi.h>
#endif

#if defined(__linux__) || defined(__sun)
# include <alloca.h>
#endif

#define DASM_M_GROW(ctx, t, p, sz, need) \
  do { \
    size_t _sz = (sz), _need = (need); \
    if (_sz < _need) { \
      if (_sz < 16) _sz = 16; \
      while (_sz < _need) _sz += _sz; \
      (p) = (t *)ir_mem_realloc((p), _sz); \
      (sz) = _sz; \
    } \
  } while(0)
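/* Illustrative note (not part of the original source): DASM_M_GROW doubles the
 * buffer until it can hold `need` bytes, keeping reallocation cost amortized
 * constant. E.g. with sz == 24 and need == 100 the size grows 24 -> 48 -> 96
 * -> 192, the buffer is reallocated once to 192 bytes, and 192 is written
 * back into `sz`. */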

#define DASM_M_FREE(ctx, p, sz) ir_mem_free(p)

#if IR_DEBUG
# define DASM_CHECKS
#endif

typedef struct _ir_copy {
	ir_type type;
	ir_reg  from;
	ir_reg  to;
} ir_copy;

typedef struct _ir_dessa_copy {
	ir_type type;
	int32_t from; /* negative - constant ref, [0..IR_REG_NUM) - CPU reg, [IR_REG_NUM...) - virtual reg */
	int32_t to;   /* [0..IR_REG_NUM) - CPU reg, [IR_REG_NUM...) - virtual reg */
} ir_dessa_copy;
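/* Illustrative example (assuming a hypothetical IR_REG_NUM of 32): a copy
 * {IR_I32, .from = -5, .to = 3} loads constant ref 5 into CPU register 3,
 * {.from = 40, .to = 3} loads virtual register 8 (40 - 32) from its spill
 * slot, and {.from = 3, .to = 40} stores register 3 back to that slot. */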

#if IR_REG_INT_ARGS
static const int8_t _ir_int_reg_params[IR_REG_INT_ARGS];
#else
static const int8_t *_ir_int_reg_params;
#endif
#if IR_REG_FP_ARGS
static const int8_t _ir_fp_reg_params[IR_REG_FP_ARGS];
#else
static const int8_t *_ir_fp_reg_params;
#endif

static const ir_proto_t *ir_call_proto(const ir_ctx *ctx, ir_insn *insn)
{
	if (IR_IS_CONST_REF(insn->op2)) {
		const ir_insn *func = &ctx->ir_base[insn->op2];

		if (func->op == IR_FUNC || func->op == IR_FUNC_ADDR) {
			if (func->proto) {
				return (const ir_proto_t *)ir_get_str(ctx, func->proto);
			}
		}
	} else if (ctx->ir_base[insn->op2].op == IR_PROTO) {
		return (const ir_proto_t *)ir_get_str(ctx, ctx->ir_base[insn->op2].op2);
	}
	return NULL;
}
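/* Illustrative note (not part of the original source): a prototype is found
 * in two shapes - a direct call through a constant IR_FUNC/IR_FUNC_ADDR
 * operand that carries its own proto string, or an indirect call whose
 * address operand is an IR_PROTO instruction wrapping the proto in op2.
 * Anything else (e.g. a computed IR_ADDR) yields NULL, and callers fall back
 * to default calling-convention assumptions. */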

#ifdef IR_HAVE_FASTCALL
static const int8_t _ir_int_fc_reg_params[IR_REG_INT_FCARGS];
static const int8_t *_ir_fp_fc_reg_params;

bool ir_is_fastcall(const ir_ctx *ctx, const ir_insn *insn)
{
	if (sizeof(void*) == 4) {
		if (IR_IS_CONST_REF(insn->op2)) {
			const ir_insn *func = &ctx->ir_base[insn->op2];

			if (func->op == IR_FUNC || func->op == IR_FUNC_ADDR) {
				if (func->proto) {
					const ir_proto_t *proto = (const ir_proto_t *)ir_get_str(ctx, func->proto);

					return (proto->flags & IR_FASTCALL_FUNC) != 0;
				}
			}
		} else if (ctx->ir_base[insn->op2].op == IR_PROTO) {
			const ir_proto_t *proto = (const ir_proto_t *)ir_get_str(ctx, ctx->ir_base[insn->op2].op2);

			return (proto->flags & IR_FASTCALL_FUNC) != 0;
		}
		return 0;
	}
	return 0;
}
#else
bool ir_is_fastcall(const ir_ctx *ctx, const ir_insn *insn)
{
	return 0;
}
#endif

bool ir_is_vararg(const ir_ctx *ctx, ir_insn *insn)
{
	const ir_proto_t *proto = ir_call_proto(ctx, insn);

	if (proto) {
		return (proto->flags & IR_VARARG_FUNC) != 0;
	}
	return 0;
}

IR_ALWAYS_INLINE uint32_t ir_rule(const ir_ctx *ctx, ir_ref ref)
{
	IR_ASSERT(!IR_IS_CONST_REF(ref));
	return ctx->rules[ref];
}

IR_ALWAYS_INLINE bool ir_in_same_block(ir_ctx *ctx, ir_ref ref)
{
	return ref > ctx->bb_start;
}


static ir_reg ir_get_param_reg(const ir_ctx *ctx, ir_ref ref)
{
	ir_use_list *use_list = &ctx->use_lists[1];
	int i;
	ir_ref use, *p;
	ir_insn *insn;
	int int_param = 0;
	int fp_param = 0;
	int int_reg_params_count = IR_REG_INT_ARGS;
	int fp_reg_params_count = IR_REG_FP_ARGS;
	const int8_t *int_reg_params = _ir_int_reg_params;
	const int8_t *fp_reg_params = _ir_fp_reg_params;

#ifdef IR_HAVE_FASTCALL
	if (sizeof(void*) == 4 && (ctx->flags & IR_FASTCALL_FUNC)) {
		int_reg_params_count = IR_REG_INT_FCARGS;
		fp_reg_params_count = IR_REG_FP_FCARGS;
		int_reg_params = _ir_int_fc_reg_params;
		fp_reg_params = _ir_fp_fc_reg_params;
	}
#endif

	for (i = 0, p = &ctx->use_edges[use_list->refs]; i < use_list->count; i++, p++) {
		use = *p;
		insn = &ctx->ir_base[use];
		if (insn->op == IR_PARAM) {
			if (IR_IS_TYPE_INT(insn->type)) {
				if (use == ref) {
					if (int_param < int_reg_params_count) {
						return int_reg_params[int_param];
					} else {
						return IR_REG_NONE;
					}
				}
				int_param++;
#ifdef _WIN64
				/* The WIN64 calling convention uses a common counter for int and fp registers */
				fp_param++;
#endif
			} else {
				IR_ASSERT(IR_IS_TYPE_FP(insn->type));
				if (use == ref) {
					if (fp_param < fp_reg_params_count) {
						return fp_reg_params[fp_param];
					} else {
						return IR_REG_NONE;
					}
				}
				fp_param++;
#ifdef _WIN64
				/* The WIN64 calling convention uses a common counter for int and fp registers */
				int_param++;
#endif
			}
		}
	}
	return IR_REG_NONE;
}
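/* Worked example (illustrative, assuming x86-64): for a SysV function
 * f(int a, double b, int c) the independent counters yield a -> RDI,
 * b -> XMM0, c -> RSI. Under the WIN64 convention both counters advance
 * together, so the same signature maps a -> RCX, b -> XMM1, c -> R8.
 * A parameter that exhausts its register file gets IR_REG_NONE and is
 * passed on the stack. */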

static int ir_get_args_regs(const ir_ctx *ctx, const ir_insn *insn, int8_t *regs)
{
	int j, n;
	ir_type type;
	int int_param = 0;
	int fp_param = 0;
	int count = 0;
	int int_reg_params_count = IR_REG_INT_ARGS;
	int fp_reg_params_count = IR_REG_FP_ARGS;
	const int8_t *int_reg_params = _ir_int_reg_params;
	const int8_t *fp_reg_params = _ir_fp_reg_params;

#ifdef IR_HAVE_FASTCALL
	if (sizeof(void*) == 4 && ir_is_fastcall(ctx, insn)) {
		int_reg_params_count = IR_REG_INT_FCARGS;
		fp_reg_params_count = IR_REG_FP_FCARGS;
		int_reg_params = _ir_int_fc_reg_params;
		fp_reg_params = _ir_fp_fc_reg_params;
	}
#endif

	n = insn->inputs_count;
	n = IR_MIN(n, IR_MAX_REG_ARGS + 2);
	for (j = 3; j <= n; j++) {
		type = ctx->ir_base[ir_insn_op(insn, j)].type;
		if (IR_IS_TYPE_INT(type)) {
			if (int_param < int_reg_params_count) {
				regs[j] = int_reg_params[int_param];
				count = j + 1;
			} else {
				regs[j] = IR_REG_NONE;
			}
			int_param++;
#ifdef _WIN64
			/* The WIN64 calling convention uses a common counter for int and fp registers */
			fp_param++;
#endif
		} else {
			IR_ASSERT(IR_IS_TYPE_FP(type));
			if (fp_param < fp_reg_params_count) {
				regs[j] = fp_reg_params[fp_param];
				count = j + 1;
			} else {
				regs[j] = IR_REG_NONE;
			}
			fp_param++;
#ifdef _WIN64
			/* The WIN64 calling convention uses a common counter for int and fp registers */
			int_param++;
#endif
		}
	}
	return count;
}

static bool ir_is_same_mem_var(const ir_ctx *ctx, ir_ref r1, int32_t offset)
{
	ir_live_interval *ival1;
	int32_t o1;

	if (IR_IS_CONST_REF(r1)) {
		return 0;
	}

	IR_ASSERT(ctx->vregs[r1]);
	ival1 = ctx->live_intervals[ctx->vregs[r1]];
	IR_ASSERT(ival1);
	o1 = ival1->stack_spill_pos;
	IR_ASSERT(o1 != -1);
	return o1 == offset;
}

void *ir_resolve_sym_name(const char *name)
{
	void *handle = NULL;
	void *addr;

#ifndef _WIN32
# ifdef RTLD_DEFAULT
	handle = RTLD_DEFAULT;
# endif
	addr = dlsym(handle, name);
#else
	HMODULE mods[256];
	DWORD cbNeeded;
	uint32_t i = 0;

	addr = NULL;

	EnumProcessModules(GetCurrentProcess(), mods, sizeof(mods), &cbNeeded);

	while(i < (cbNeeded / sizeof(HMODULE))) {
		addr = GetProcAddress(mods[i], name);
		if (addr) {
			return addr;
		}
		i++;
	}
#endif
	return addr;
}
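/* Usage sketch (illustrative): symbol addresses are resolved against the
 * running process, e.g.
 *
 *   void *p = ir_resolve_sym_name("printf");
 *
 * On POSIX this is a single dlsym() lookup (using RTLD_DEFAULT when the
 * platform provides it); on Windows there is no equivalent pseudo-handle,
 * so the loop above probes every loaded module (up to 256) with
 * GetProcAddress() until one of them exports the name. */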

#ifdef IR_SNAPSHOT_HANDLER_DCL
	IR_SNAPSHOT_HANDLER_DCL();
#endif

#if defined(IR_TARGET_X86) || defined(IR_TARGET_X64)
static void* ir_sym_addr(ir_ctx *ctx, const ir_insn *addr_insn)
{
	const char *name = ir_get_str(ctx, addr_insn->val.name);
	void *addr = (ctx->loader && ctx->loader->resolve_sym_name) ?
		ctx->loader->resolve_sym_name(ctx->loader, name, 0) :
		ir_resolve_sym_name(name);

	return addr;
}
#endif

static void* ir_sym_val(ir_ctx *ctx, const ir_insn *addr_insn)
{
	const char *name = ir_get_str(ctx, addr_insn->val.name);
	void *addr = (ctx->loader && ctx->loader->resolve_sym_name) ?
		ctx->loader->resolve_sym_name(ctx->loader, name, addr_insn->op == IR_FUNC) :
		ir_resolve_sym_name(name);

	IR_ASSERT(addr);
	return addr;
}

static void *ir_call_addr(ir_ctx *ctx, ir_insn *insn, ir_insn *addr_insn)
{
	void *addr;

	IR_ASSERT(addr_insn->type == IR_ADDR);
	if (addr_insn->op == IR_FUNC) {
		addr = ir_sym_val(ctx, addr_insn);
	} else {
		IR_ASSERT(addr_insn->op == IR_ADDR || addr_insn->op == IR_FUNC_ADDR);
		addr = (void*)addr_insn->val.addr;
	}
	return addr;
}

static void *ir_jmp_addr(ir_ctx *ctx, ir_insn *insn, ir_insn *addr_insn)
{
	void *addr = ir_call_addr(ctx, insn, addr_insn);

#ifdef IR_SNAPSHOT_HANDLER
	if (ctx->ir_base[insn->op1].op == IR_SNAPSHOT) {
		addr = IR_SNAPSHOT_HANDLER(ctx, insn->op1, &ctx->ir_base[insn->op1], addr);
	}
#endif
	return addr;
}

static int8_t ir_get_fused_reg(ir_ctx *ctx, ir_ref root, ir_ref ref_and_op)
{
	if (ctx->fused_regs) {
		char key[10];
		ir_ref val;

		memcpy(key, &root, sizeof(ir_ref));
		memcpy(key + 4, &ref_and_op, sizeof(ir_ref));

		val = ir_strtab_find(ctx->fused_regs, key, 8);
		if (val) {
			return val;
		}
	}
	return ((int8_t*)ctx->regs)[ref_and_op];
}
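/* Illustrative note (not part of the original source): the strtab key is the
 * raw 8-byte pair (root, ref_and_op) - two 32-bit ir_refs stored back to back
 * - so a register override recorded for a fused operand can be looked up per
 * (root instruction, operand) pair. When no override exists, the ordinary
 * per-instruction assignment from ctx->regs is returned. */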

#if defined(__GNUC__)
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Warray-bounds"
# pragma GCC diagnostic ignored "-Wimplicit-fallthrough"
#endif

#if defined(IR_TARGET_X86) || defined(IR_TARGET_X64)
# include "dynasm/dasm_proto.h"
# include "dynasm/dasm_x86.h"
#elif defined(IR_TARGET_AARCH64)
# include "dynasm/dasm_proto.h"
static int ir_add_veneer(dasm_State *Dst, void *buffer, uint32_t ins, int *b, uint32_t *cp, ptrdiff_t offset);
# define DASM_ADD_VENEER ir_add_veneer
# include "dynasm/dasm_arm64.h"
#else
# error "Unknown IR target"
#endif

#if defined(__GNUC__)
# pragma GCC diagnostic pop
#endif

/* Forward Declarations */
static void ir_emit_osr_entry_loads(ir_ctx *ctx, int b, ir_block *bb);
static int ir_parallel_copy(ir_ctx *ctx, ir_copy *copies, int count, ir_reg tmp_reg, ir_reg tmp_fp_reg);
static void ir_emit_dessa_moves(ir_ctx *ctx, int b, ir_block *bb);

typedef struct _ir_common_backend_data {
	ir_reg_alloc_data  ra_data;
	uint32_t           dessa_from_block;
	dasm_State        *dasm_state;
	ir_bitset          emit_constants;
} ir_common_backend_data;

static int ir_const_label(ir_ctx *ctx, ir_ref ref)
{
	ir_common_backend_data *data = ctx->data;
	int label = ctx->cfg_blocks_count - ref;

	IR_ASSERT(IR_IS_CONST_REF(ref));
	ir_bitset_incl(data->emit_constants, -ref);
	return label;
}

#if defined(IR_TARGET_X86) || defined(IR_TARGET_X64)
# include "ir_emit_x86.h"
#elif defined(IR_TARGET_AARCH64)
# include "ir_emit_aarch64.h"
#else
# error "Unknown IR target"
#endif

static IR_NEVER_INLINE void ir_emit_osr_entry_loads(ir_ctx *ctx, int b, ir_block *bb)
{
	ir_list *list = (ir_list*)ctx->osr_entry_loads;
	int pos = 0, count, i;
	ir_ref ref;

	IR_ASSERT(ctx->binding);
	IR_ASSERT(list);
	while (1) {
		i = ir_list_at(list, pos);
		if (b == i) {
			break;
		}
		IR_ASSERT(i != 0); /* end marker */
		pos++;
		count = ir_list_at(list, pos);
		pos += count + 1;
	}
	pos++;
	count = ir_list_at(list, pos);
	pos++;

	for (i = 0; i < count; i++, pos++) {
		ref = ir_list_at(list, pos);
		IR_ASSERT(ref >= 0 && ctx->vregs[ref] && ctx->live_intervals[ctx->vregs[ref]]);
		if (!(ctx->live_intervals[ctx->vregs[ref]]->flags & IR_LIVE_INTERVAL_SPILLED)) {
			/* not spilled */
			ir_reg reg = ctx->live_intervals[ctx->vregs[ref]]->reg;
			ir_type type = ctx->ir_base[ref].type;
			int32_t offset = -ir_binding_find(ctx, ref);

			IR_ASSERT(offset > 0);
			ir_emit_load_mem(ctx, type, reg, IR_MEM_BO(ctx->spill_base, offset));
		} else {
			IR_ASSERT(ctx->live_intervals[ctx->vregs[ref]]->flags & IR_LIVE_INTERVAL_SPILL_SPECIAL);
		}
	}
}

/*
 * Parallel copy sequentialization algorithm
 *
 * The implementation is based on Algorithm 1 described in
 * "Revisiting Out-of-SSA Translation for Correctness, Code Quality and Efficiency",
 * Benoit Boissinot, Alain Darte, Fabrice Rastello, Benoit Dupont de Dinechin, Christophe Guillon.
 * 2009 International Symposium on Code Generation and Optimization, Seattle, WA, USA, 2009,
 * pp. 114-125, doi: 10.1109/CGO.2009.19.
 */
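/* Worked example (illustrative, not part of the original source): for the
 * parallel copy { r3 := r1, r1 := r2, r2 := r1 } no temporary is needed:
 * the tree move r3 := r1 is emitted first, and loc[] then redirects the
 * pending r2 := r1 to read r1's saved value from r3, giving
 *   mov r3, r1 ; mov r1, r2 ; mov r2, r3
 * A closed cycle such as { r1 := r2, r2 := r1 } is instead broken with a
 * native swap when IR_HAVE_SWAP_INT/IR_HAVE_SWAP_FP is available, or
 * rotated through the temporary register:
 *   mov tmp, r1 ; mov r1, r2 ; mov r2, tmp
 */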
static int ir_parallel_copy(ir_ctx *ctx, ir_copy *copies, int count, ir_reg tmp_reg, ir_reg tmp_fp_reg)
{
	int i;
	int8_t *pred, *loc, *types;
	ir_reg to, from;
	ir_type type;
	ir_regset todo, ready, srcs;

	if (count == 1) {
		to = copies[0].to;
		from = copies[0].from;
		IR_ASSERT(from != to);
		type = copies[0].type;
		if (IR_IS_TYPE_INT(type)) {
			ir_emit_mov(ctx, type, to, from);
		} else {
			ir_emit_fp_mov(ctx, type, to, from);
		}
		return 1;
	}

	loc = alloca(IR_REG_NUM * 3 * sizeof(int8_t));
	pred = loc + IR_REG_NUM;
	types = pred + IR_REG_NUM;
	todo = IR_REGSET_EMPTY;
	srcs = IR_REGSET_EMPTY;

	for (i = 0; i < count; i++) {
		from = copies[i].from;
		to = copies[i].to;
		IR_ASSERT(from != to);
		IR_REGSET_INCL(srcs, from);
		loc[from] = from;
		pred[to] = from;
		types[from] = copies[i].type;
		IR_ASSERT(!IR_REGSET_IN(todo, to));
		IR_REGSET_INCL(todo, to);
	}

	ready = IR_REGSET_DIFFERENCE(todo, srcs);

	if (ready == todo) {
		for (i = 0; i < count; i++) {
			from = copies[i].from;
			to = copies[i].to;
			IR_ASSERT(from != to);
			type = copies[i].type;
			if (IR_IS_TYPE_INT(type)) {
				ir_emit_mov(ctx, type, to, from);
			} else {
				ir_emit_fp_mov(ctx, type, to, from);
			}
		}
		return 1;
	}

	/* temporary registers must not be among the destinations */
	IR_ASSERT(tmp_reg == IR_REG_NONE || !IR_REGSET_IN(todo, tmp_reg));
	IR_ASSERT(tmp_fp_reg == IR_REG_NONE || !IR_REGSET_IN(todo, tmp_fp_reg));

	/* first we resolve all "windmill blades" - trees (this doesn't require temporary registers) */
	while (ready != IR_REGSET_EMPTY) {
		ir_reg r;

		to = ir_regset_pop_first(&ready);
		from = pred[to];
		r = loc[from];
		type = types[from];
		if (IR_IS_TYPE_INT(type)) {
			ir_emit_mov_ext(ctx, type, to, r);
		} else {
			ir_emit_fp_mov(ctx, type, to, r);
		}
		IR_REGSET_EXCL(todo, to);
		loc[from] = to;
		if (from == r && IR_REGSET_IN(todo, from)) {
			IR_REGSET_INCL(ready, from);
		}
	}
	if (todo == IR_REGSET_EMPTY) {
		return 1;
	}

	/* at this point the sources that are the same as temporaries are already moved */
	IR_ASSERT(tmp_reg == IR_REG_NONE || !IR_REGSET_IN(srcs, tmp_reg) || pred[loc[tmp_reg]] == tmp_reg);
	IR_ASSERT(tmp_fp_reg == IR_REG_NONE || !IR_REGSET_IN(srcs, tmp_fp_reg) || pred[loc[tmp_fp_reg]] == tmp_fp_reg);

	/* now we resolve all "windmill axles" - cycles (this requires temporary registers) */
	while (todo != IR_REGSET_EMPTY) {
		to = ir_regset_pop_first(&todo);
		from = pred[to];
		IR_ASSERT(to != loc[from]);
		type = types[from];
		if (IR_IS_TYPE_INT(type)) {
#ifdef IR_HAVE_SWAP_INT
			if (pred[from] == to) {
				ir_emit_swap(ctx, type, to, from);
				IR_REGSET_EXCL(todo, from);
				loc[to] = from;
				loc[from] = to;
				continue;
			}
#endif
			IR_ASSERT(tmp_reg != IR_REG_NONE);
			IR_ASSERT(tmp_reg >= IR_REG_GP_FIRST && tmp_reg <= IR_REG_GP_LAST);
			ir_emit_mov(ctx, type, tmp_reg, to);
			loc[to] = tmp_reg;
		} else {
#ifdef IR_HAVE_SWAP_FP
			if (pred[from] == to) {
				ir_emit_swap_fp(ctx, type, to, from);
				IR_REGSET_EXCL(todo, from);
				loc[to] = from;
				loc[from] = to;
				continue;
			}
#endif
			IR_ASSERT(tmp_fp_reg != IR_REG_NONE);
			IR_ASSERT(tmp_fp_reg >= IR_REG_FP_FIRST && tmp_fp_reg <= IR_REG_FP_LAST);
			ir_emit_fp_mov(ctx, type, tmp_fp_reg, to);
			loc[to] = tmp_fp_reg;
		}
		while (1) {
			ir_reg r;

			from = pred[to];
			r = loc[from];
			type = types[from];
			if (IR_IS_TYPE_INT(type)) {
				ir_emit_mov_ext(ctx, type, to, r);
			} else {
				ir_emit_fp_mov(ctx, type, to, r);
			}
			IR_REGSET_EXCL(todo, to);
			loc[from] = to;
			if (from == r && IR_REGSET_IN(todo, from)) {
				to = from;
			} else {
				break;
			}
		}
	}

	return 1;
}

static void ir_emit_dessa_move(ir_ctx *ctx, ir_type type, ir_ref to, ir_ref from, ir_reg tmp_reg, ir_reg tmp_fp_reg)
{
	ir_mem mem_from, mem_to;

	IR_ASSERT(from != to);
	if (to < IR_REG_NUM) {
		if (IR_IS_CONST_REF(from)) {
			ir_emit_load(ctx, type, to, from);
		} else if (from < IR_REG_NUM) {
			if (IR_IS_TYPE_INT(type)) {
				ir_emit_mov(ctx, type, to, from);
			} else {
				ir_emit_fp_mov(ctx, type, to, from);
			}
		} else {
			mem_from = ir_vreg_spill_slot(ctx, from - IR_REG_NUM);
			ir_emit_load_mem(ctx, type, to, mem_from);
		}
	} else {
		mem_to = ir_vreg_spill_slot(ctx, to - IR_REG_NUM);
		if (IR_IS_CONST_REF(from)) {
#if defined(IR_TARGET_X86) || defined(IR_TARGET_X64)
			if (IR_IS_TYPE_INT(type)
			 && !IR_IS_SYM_CONST(ctx->ir_base[from].op)
			 && (ir_type_size[type] != 8 || IR_IS_SIGNED_32BIT(ctx->ir_base[from].val.i64))) {
				ir_emit_store_mem_imm(ctx, type, mem_to, ctx->ir_base[from].val.i32);
				return;
			}
#endif
			ir_reg tmp = IR_IS_TYPE_INT(type) ? tmp_reg : tmp_fp_reg;
			IR_ASSERT(tmp != IR_REG_NONE);
			ir_emit_load(ctx, type, tmp, from);
			ir_emit_store_mem(ctx, type, mem_to, tmp);
		} else if (from < IR_REG_NUM) {
			ir_emit_store_mem(ctx, type, mem_to, from);
		} else {
			mem_from = ir_vreg_spill_slot(ctx, from - IR_REG_NUM);
			IR_ASSERT(IR_MEM_VAL(mem_to) != IR_MEM_VAL(mem_from));
			ir_reg tmp = IR_IS_TYPE_INT(type) ? tmp_reg : tmp_fp_reg;
			IR_ASSERT(tmp != IR_REG_NONE);
			ir_emit_load_mem(ctx, type, tmp, mem_from);
			ir_emit_store_mem(ctx, type, mem_to, tmp);
		}
	}
}
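/* Illustrative note (not part of the original source): `to` and `from` use
 * the ir_dessa_copy encoding - a value below IR_REG_NUM is a physical
 * register, anything else is IR_REG_NUM + vreg and is accessed through its
 * spill slot. Memory-to-memory and most constant-to-memory moves bounce
 * through tmp_reg/tmp_fp_reg; small integer constants on x86 are the
 * exception and are stored to memory as immediates directly. */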

IR_ALWAYS_INLINE void ir_dessa_resolve_cycle(ir_ctx *ctx, int32_t *pred, int32_t *loc, ir_bitset todo, ir_type type, int32_t to, ir_reg tmp_reg, ir_reg tmp_fp_reg)
{
	ir_reg from;
	ir_mem tmp_spill_slot;

	IR_MEM_VAL(tmp_spill_slot) = 0;
	IR_ASSERT(!IR_IS_CONST_REF(to));
	from = pred[to];
	IR_ASSERT(!IR_IS_CONST_REF(from));
	IR_ASSERT(from != to);
	IR_ASSERT(loc[from] == from);

	if (IR_IS_TYPE_INT(type)) {
#ifdef IR_HAVE_SWAP_INT
		if (pred[from] == to && to < IR_REG_NUM && from < IR_REG_NUM) {
			/* a simple cycle of 2 elements */
			ir_emit_swap(ctx, type, to, from);
			ir_bitset_excl(todo, from);
			ir_bitset_excl(todo, to);
			loc[to] = from;
			loc[from] = to;
			return;
		}
#endif
		IR_ASSERT(tmp_reg != IR_REG_NONE);
		IR_ASSERT(tmp_reg >= IR_REG_GP_FIRST && tmp_reg <= IR_REG_GP_LAST);
		loc[to] = tmp_reg;
		if (to < IR_REG_NUM) {
			ir_emit_mov(ctx, type, tmp_reg, to);
		} else {
			ir_emit_load_mem_int(ctx, type, tmp_reg, ir_vreg_spill_slot(ctx, to - IR_REG_NUM));
		}
	} else {
#ifdef IR_HAVE_SWAP_FP
		if (pred[from] == to && to < IR_REG_NUM && from < IR_REG_NUM) {
			/* a simple cycle of 2 elements */
			ir_emit_swap_fp(ctx, type, to, from);
			IR_REGSET_EXCL(todo, from);
			IR_REGSET_EXCL(todo, to);
			loc[to] = from;
			loc[from] = to;
			return;
		}
#endif
		IR_ASSERT(tmp_fp_reg != IR_REG_NONE);
		IR_ASSERT(tmp_fp_reg >= IR_REG_FP_FIRST && tmp_fp_reg <= IR_REG_FP_LAST);
		loc[to] = tmp_fp_reg;
		if (to < IR_REG_NUM) {
			ir_emit_fp_mov(ctx, type, tmp_fp_reg, to);
		} else {
			ir_emit_load_mem_fp(ctx, type, tmp_fp_reg, ir_vreg_spill_slot(ctx, to - IR_REG_NUM));
		}
	}

	while (1) {
		int32_t r;

		from = pred[to];
		r = loc[from];

		if (from == r && ir_bitset_in(todo, from)) {
			/* A memory-to-memory move inside an isolated or "blocked" cycle requires an additional temporary register */
			if (to >= IR_REG_NUM && r >= IR_REG_NUM) {
				ir_reg tmp = IR_IS_TYPE_INT(type) ? tmp_reg : tmp_fp_reg;

				if (!IR_MEM_VAL(tmp_spill_slot)) {
					/* Free a register, saving it in a temporary spill slot */
					tmp_spill_slot = IR_MEM_BO(IR_REG_STACK_POINTER, -16);
					ir_emit_store_mem(ctx, type, tmp_spill_slot, tmp);
				}
				ir_emit_dessa_move(ctx, type, to, r, tmp_reg, tmp_fp_reg);
			} else {
				ir_emit_dessa_move(ctx, type, to, r, IR_REG_NONE, IR_REG_NONE);
			}
			ir_bitset_excl(todo, to);
			loc[from] = to;
			to = from;
		} else {
			break;
		}
	}
	if (IR_MEM_VAL(tmp_spill_slot)) {
		ir_emit_load_mem(ctx, type, IR_IS_TYPE_INT(type) ? tmp_reg : tmp_fp_reg, tmp_spill_slot);
	}
	ir_emit_dessa_move(ctx, type, to, loc[from], IR_REG_NONE, IR_REG_NONE);
	ir_bitset_excl(todo, to);
	loc[from] = to;
}
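/* Illustrative note (not part of the original source): a cycle that lives
 * entirely in memory already occupies the temporary register with a cycle
 * element, so when a memory-to-memory move is met inside such a cycle the
 * temporary is first parked in scratch space just below the stack pointer
 * (SP - 16) and reloaded once the cycle has been unwound. */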

static int ir_dessa_parallel_copy(ir_ctx *ctx, ir_dessa_copy *copies, int count, ir_reg tmp_reg, ir_reg tmp_fp_reg)
{
	int i;
	int32_t *pred, *loc, to, from;
	int8_t *types;
	ir_type type;
	uint32_t len;
	ir_bitset todo, ready, srcs, visited;

	if (count == 1) {
		to = copies[0].to;
		from = copies[0].from;
		IR_ASSERT(from != to);
		type = copies[0].type;
		ir_emit_dessa_move(ctx, type, to, from, tmp_reg, tmp_fp_reg);
		return 1;
	}

	len = IR_REG_NUM + ctx->vregs_count + 1;
	todo = ir_bitset_malloc(len);
	srcs = ir_bitset_malloc(len);
	loc = ir_mem_malloc(len * 2 * sizeof(int32_t) + len * sizeof(int8_t));
	pred = loc + len;
	types = (int8_t*)(pred + len);

	for (i = 0; i < count; i++) {
		from = copies[i].from;
		to = copies[i].to;
		IR_ASSERT(from != to);
		if (!IR_IS_CONST_REF(from)) {
			ir_bitset_incl(srcs, from);
			loc[from] = from;
		}
		pred[to] = from;
		types[to] = copies[i].type;
		IR_ASSERT(!ir_bitset_in(todo, to));
		ir_bitset_incl(todo, to);
	}

	/* temporary registers must not be among the sources */
	IR_ASSERT(tmp_reg == IR_REG_NONE || !ir_bitset_in(srcs, tmp_reg));
	IR_ASSERT(tmp_fp_reg == IR_REG_NONE || !ir_bitset_in(srcs, tmp_fp_reg));

	/* first we resolve all "windmill blades" - trees that don't set the temporary registers */
	ready = ir_bitset_malloc(len);
	ir_bitset_copy(ready, todo, ir_bitset_len(len));
	ir_bitset_difference(ready, srcs, ir_bitset_len(len));
	if (tmp_reg != IR_REG_NONE) {
		ir_bitset_excl(ready, tmp_reg);
	}
	if (tmp_fp_reg != IR_REG_NONE) {
		ir_bitset_excl(ready, tmp_fp_reg);
	}
	while ((to = ir_bitset_pop_first(ready, ir_bitset_len(len))) >= 0) {
		ir_bitset_excl(todo, to);
		type = types[to];
		from = pred[to];
		if (IR_IS_CONST_REF(from)) {
			ir_emit_dessa_move(ctx, type, to, from, tmp_reg, tmp_fp_reg);
		} else {
			int32_t r = loc[from];
			ir_emit_dessa_move(ctx, type, to, r, tmp_reg, tmp_fp_reg);
			loc[from] = to;
			if (from == r && ir_bitset_in(todo, from) && from != tmp_reg && from != tmp_fp_reg) {
				ir_bitset_incl(ready, from);
			}
		}
	}

	/* then we resolve all "windmill axles" - cycles (this requires temporary registers) */
	visited = ir_bitset_malloc(len);
	ir_bitset_copy(ready, todo, ir_bitset_len(len));
	ir_bitset_intersection(ready, srcs, ir_bitset_len(len));
	while ((to = ir_bitset_first(ready, ir_bitset_len(len))) >= 0) {
		ir_bitset_clear(visited, ir_bitset_len(len));
		ir_bitset_incl(visited, to);
		to = pred[to];
		while (!IR_IS_CONST_REF(to) && ir_bitset_in(ready, to)) {
			to = pred[to];
			if (!IR_IS_CONST_REF(to) && ir_bitset_in(visited, to)) {
				/* We found a cycle. Resolve it. */
				ir_bitset_incl(visited, to);
				type = types[to];
				ir_dessa_resolve_cycle(ctx, pred, loc, todo, type, to, tmp_reg, tmp_fp_reg);
				break;
			}
			ir_bitset_incl(visited, to);
		}
		ir_bitset_difference(ready, visited, ir_bitset_len(len));
	}

	/* finally we resolve the remaining "windmill blades" - trees that set the temporary registers */
	ir_bitset_copy(ready, todo, ir_bitset_len(len));
	ir_bitset_difference(ready, srcs, ir_bitset_len(len));
	while ((to = ir_bitset_pop_first(ready, ir_bitset_len(len))) >= 0) {
		ir_bitset_excl(todo, to);
		type = types[to];
		from = pred[to];
		if (IR_IS_CONST_REF(from)) {
			ir_emit_dessa_move(ctx, type, to, from, tmp_reg, tmp_fp_reg);
		} else {
			int32_t r = loc[from];
			ir_emit_dessa_move(ctx, type, to, r, tmp_reg, tmp_fp_reg);
			loc[from] = to;
			if (from == r && ir_bitset_in(todo, from)) {
				ir_bitset_incl(ready, from);
			}
		}
	}

	IR_ASSERT(ir_bitset_empty(todo, ir_bitset_len(len)));

	ir_mem_free(visited);
	ir_mem_free(ready);
	ir_mem_free(loc);
	ir_mem_free(srcs);
	ir_mem_free(todo);
	return 1;
}

static void ir_emit_dessa_moves(ir_ctx *ctx, int b, ir_block *bb)
{
	uint32_t succ, k, n = 0;
	ir_block *succ_bb;
	ir_use_list *use_list;
	ir_ref i, *p;
	ir_dessa_copy *copies;
	ir_reg tmp_reg = ctx->regs[bb->end][0];
	ir_reg tmp_fp_reg = ctx->regs[bb->end][1];

	IR_ASSERT(bb->successors_count == 1);
	succ = ctx->cfg_edges[bb->successors];
	succ_bb = &ctx->cfg_blocks[succ];
	IR_ASSERT(succ_bb->predecessors_count > 1);
	use_list = &ctx->use_lists[succ_bb->start];
	k = ir_phi_input_number(ctx, succ_bb, b);

	copies = alloca(use_list->count * sizeof(ir_dessa_copy));

	for (i = 0, p = &ctx->use_edges[use_list->refs]; i < use_list->count; i++, p++) {
		ir_ref ref = *p;
		ir_insn *insn = &ctx->ir_base[ref];

		if (insn->op == IR_PHI) {
			ir_ref input = ir_insn_op(insn, k);
			ir_reg src = ir_get_alocated_reg(ctx, ref, k);
			ir_reg dst = ctx->regs[ref][0];
			ir_ref from, to;

			IR_ASSERT(dst == IR_REG_NONE || !IR_REG_SPILLED(dst));
			if (IR_IS_CONST_REF(input)) {
				from = input;
			} else {
				from = (src != IR_REG_NONE && !IR_REG_SPILLED(src)) ?
					(ir_ref)src : (ir_ref)(IR_REG_NUM + ctx->vregs[input]);
			}
			to = (dst != IR_REG_NONE) ?
				(ir_ref)dst : (ir_ref)(IR_REG_NUM + ctx->vregs[ref]);
			if (to != from) {
				if (to >= IR_REG_NUM
				 && from >= IR_REG_NUM
				 && IR_MEM_VAL(ir_vreg_spill_slot(ctx, from - IR_REG_NUM)) ==
						IR_MEM_VAL(ir_vreg_spill_slot(ctx, to - IR_REG_NUM))) {
					/* It's possible that different virtual registers share the same special spill slot */
					// TODO: See ext/opcache/tests/jit/gh11917.phpt failure on Linux 32-bit
					continue;
				}
				copies[n].type = insn->type;
				copies[n].from = from;
				copies[n].to = to;
				n++;
			}
		}
	}

	if (n > 0) {
		ir_dessa_parallel_copy(ctx, copies, n, tmp_reg, tmp_fp_reg);
	}
}

int ir_match(ir_ctx *ctx)
{
	uint32_t b;
	ir_ref start, ref, *prev_ref;
	ir_block *bb;
	ir_insn *insn;
	uint32_t entries_count = 0;

	ctx->rules = ir_mem_calloc(ctx->insns_count, sizeof(uint32_t));

	prev_ref = ctx->prev_ref;
	if (!prev_ref) {
		ir_build_prev_refs(ctx);
		prev_ref = ctx->prev_ref;
	}

	if (ctx->entries_count) {
		ctx->entries = ir_mem_malloc(ctx->entries_count * sizeof(ir_ref));
	}

	for (b = ctx->cfg_blocks_count, bb = ctx->cfg_blocks + b; b > 0; b--, bb--) {
		IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
		start = bb->start;
		if (UNEXPECTED(bb->flags & IR_BB_ENTRY)) {
			IR_ASSERT(entries_count < ctx->entries_count);
			insn = &ctx->ir_base[start];
			IR_ASSERT(insn->op == IR_ENTRY);
			insn->op3 = entries_count;
			ctx->entries[entries_count] = b;
			entries_count++;
		}
		ctx->rules[start] = IR_SKIPPED | IR_NOP;
		ref = bb->end;
		if (bb->successors_count == 1) {
			insn = &ctx->ir_base[ref];
			if (insn->op == IR_END || insn->op == IR_LOOP_END) {
				ctx->rules[ref] = insn->op;
				ref = prev_ref[ref];
				if (ref == start && ctx->cfg_edges[bb->successors] != b) {
					if (EXPECTED(!(bb->flags & IR_BB_ENTRY))) {
						bb->flags |= IR_BB_EMPTY;
					} else if (ctx->flags & IR_MERGE_EMPTY_ENTRIES) {
						bb->flags |= IR_BB_EMPTY;
						if (ctx->cfg_edges[bb->successors] == b + 1) {
							(bb + 1)->flags |= IR_BB_PREV_EMPTY_ENTRY;
						}
					}
					continue;
				}
			}
		}

		ctx->bb_start = start; /* bb_start is used by the matcher to avoid fusion of insns from different blocks */

		while (ref != start) {
			uint32_t rule = ctx->rules[ref];

			if (!rule) {
				ctx->rules[ref] = rule = ir_match_insn(ctx, ref);
			}
			ir_match_insn2(ctx, ref, rule);
			ref = prev_ref[ref];
		}
	}

	if (ctx->entries_count) {
		ctx->entries_count = entries_count;
		if (!entries_count) {
			ir_mem_free(ctx->entries);
			ctx->entries = NULL;
		}
	}

	return 1;
}
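/* Illustrative note (not part of the original source): blocks are visited
 * from last to first and, within a block, instructions are matched bottom-up
 * through prev_ref. This order lets ir_match_insn() fuse an operand (e.g. an
 * address computation) into its already-matched user and pre-set its rule as
 * fused/skipped, so the `if (!rule)` test above never matches it again on
 * its own. */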

int32_t ir_get_spill_slot_offset(ir_ctx *ctx, ir_ref ref)
{
	int32_t offset;

	IR_ASSERT(ref >= 0 && ctx->vregs[ref] && ctx->live_intervals[ctx->vregs[ref]]);
	offset = ctx->live_intervals[ctx->vregs[ref]]->stack_spill_pos;
	IR_ASSERT(offset != -1);
	return IR_SPILL_POS_TO_OFFSET(offset);
}
1019