xref: /php-src/ext/opcache/jit/ir/ir_emit.c (revision 7e2831f0)
/*
 * IR - Lightweight JIT Compilation Framework
 * (Native code generator based on DynAsm)
 * Copyright (C) 2022 Zend by Perforce.
 * Authors: Dmitry Stogov <dmitry@php.net>
 */

#include "ir.h"

#if defined(IR_TARGET_X86) || defined(IR_TARGET_X64)
# include "ir_x86.h"
#elif defined(IR_TARGET_AARCH64)
# include "ir_aarch64.h"
#else
# error "Unknown IR target"
#endif

#include "ir_private.h"
#ifndef _WIN32
# include <dlfcn.h>
#else
# define WIN32_LEAN_AND_MEAN
# include <windows.h>
# include <psapi.h>
#endif

#if defined(__linux__) || defined(__sun)
# include <alloca.h>
#endif

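/* DynAsm memory-management glue: grow a buffer through the IR allocator by
 * doubling its size (16 bytes minimum) until it covers the requested need. */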
#define DASM_M_GROW(ctx, t, p, sz, need) \
  do { \
    size_t _sz = (sz), _need = (need); \
    if (_sz < _need) { \
      if (_sz < 16) _sz = 16; \
      while (_sz < _need) _sz += _sz; \
      (p) = (t *)ir_mem_realloc((p), _sz); \
      (sz) = _sz; \
    } \
  } while(0)

#define DASM_M_FREE(ctx, p, sz) ir_mem_free(p)

#if IR_DEBUG
# define DASM_CHECKS
#endif

typedef struct _ir_copy {
	ir_type type;
	ir_reg  from;
	ir_reg  to;
} ir_copy;

typedef struct _ir_dessa_copy {
	ir_type type;
	int32_t from; /* negative - constant ref, [0..IR_REG_NUM) - CPU reg, [IR_REG_NUM...) - virtual reg */
	int32_t to;   /* [0..IR_REG_NUM) - CPU reg, [IR_REG_NUM...) - virtual reg */
} ir_dessa_copy;

#if IR_REG_INT_ARGS
static const int8_t _ir_int_reg_params[IR_REG_INT_ARGS];
#else
static const int8_t *_ir_int_reg_params;
#endif
#if IR_REG_FP_ARGS
static const int8_t _ir_fp_reg_params[IR_REG_FP_ARGS];
#else
static const int8_t *_ir_fp_reg_params;
#endif

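/* Return the known prototype of a call target: either the prototype attached
 * to a FUNC/FUNC_ADDR constant, or an explicit PROTO operand; NULL if none. */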
static const ir_proto_t *ir_call_proto(const ir_ctx *ctx, ir_insn *insn)
{
	if (IR_IS_CONST_REF(insn->op2)) {
		const ir_insn *func = &ctx->ir_base[insn->op2];

		if (func->op == IR_FUNC || func->op == IR_FUNC_ADDR) {
			if (func->proto) {
				return (const ir_proto_t *)ir_get_str(ctx, func->proto);
			}
		}
	} else if (ctx->ir_base[insn->op2].op == IR_PROTO) {
		return (const ir_proto_t *)ir_get_str(ctx, ctx->ir_base[insn->op2].op2);
	}
	return NULL;
}

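/* The fastcall convention only exists on 32-bit x86; on all other targets
 * ir_is_fastcall() is constant false. */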
#ifdef IR_HAVE_FASTCALL
static const int8_t _ir_int_fc_reg_params[IR_REG_INT_FCARGS];
static const int8_t *_ir_fp_fc_reg_params;

bool ir_is_fastcall(const ir_ctx *ctx, const ir_insn *insn)
{
	if (sizeof(void*) == 4) {
		if (IR_IS_CONST_REF(insn->op2)) {
			const ir_insn *func = &ctx->ir_base[insn->op2];

			if (func->op == IR_FUNC || func->op == IR_FUNC_ADDR) {
				if (func->proto) {
					const ir_proto_t *proto = (const ir_proto_t *)ir_get_str(ctx, func->proto);

					return (proto->flags & IR_FASTCALL_FUNC) != 0;
				}
			}
		} else if (ctx->ir_base[insn->op2].op == IR_PROTO) {
			const ir_proto_t *proto = (const ir_proto_t *)ir_get_str(ctx, ctx->ir_base[insn->op2].op2);

			return (proto->flags & IR_FASTCALL_FUNC) != 0;
		}
		return 0;
	}
	return 0;
}
#else
bool ir_is_fastcall(const ir_ctx *ctx, const ir_insn *insn)
{
	return 0;
}
#endif

bool ir_is_vararg(const ir_ctx *ctx, ir_insn *insn)
{
	const ir_proto_t *proto = ir_call_proto(ctx, insn);

	if (proto) {
		return (proto->flags & IR_VARARG_FUNC) != 0;
	}
	return 0;
}

IR_ALWAYS_INLINE uint32_t ir_rule(const ir_ctx *ctx, ir_ref ref)
{
	IR_ASSERT(!IR_IS_CONST_REF(ref));
	return ctx->rules[ref];
}

IR_ALWAYS_INLINE bool ir_in_same_block(ir_ctx *ctx, ir_ref ref)
{
	return ref > ctx->bb_start;
}


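/* Return the CPU register in which the given PARAM arrives according to the
 * target calling convention, or IR_REG_NONE if it is passed on the stack.
 * The parameter position is recovered by walking the uses of ref #1 (the
 * function start) in order. */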
static ir_reg ir_get_param_reg(const ir_ctx *ctx, ir_ref ref)
{
	ir_use_list *use_list = &ctx->use_lists[1];
	int i;
	ir_ref use, *p;
	ir_insn *insn;
	int int_param = 0;
	int fp_param = 0;
	int int_reg_params_count = IR_REG_INT_ARGS;
	int fp_reg_params_count = IR_REG_FP_ARGS;
	const int8_t *int_reg_params = _ir_int_reg_params;
	const int8_t *fp_reg_params = _ir_fp_reg_params;

#ifdef IR_HAVE_FASTCALL
	if (sizeof(void*) == 4 && (ctx->flags & IR_FASTCALL_FUNC)) {
		int_reg_params_count = IR_REG_INT_FCARGS;
		fp_reg_params_count = IR_REG_FP_FCARGS;
		int_reg_params = _ir_int_fc_reg_params;
		fp_reg_params = _ir_fp_fc_reg_params;
	}
#endif

	for (i = 0, p = &ctx->use_edges[use_list->refs]; i < use_list->count; i++, p++) {
		use = *p;
		insn = &ctx->ir_base[use];
		if (insn->op == IR_PARAM) {
			if (IR_IS_TYPE_INT(insn->type)) {
				if (use == ref) {
					if (int_param < int_reg_params_count) {
						return int_reg_params[int_param];
					} else {
						return IR_REG_NONE;
					}
				}
				int_param++;
#ifdef _WIN64
				/* the WIN64 calling convention uses a common counter for int and fp registers */
				fp_param++;
#endif
			} else {
				IR_ASSERT(IR_IS_TYPE_FP(insn->type));
				if (use == ref) {
					if (fp_param < fp_reg_params_count) {
						return fp_reg_params[fp_param];
					} else {
						return IR_REG_NONE;
					}
				}
				fp_param++;
#ifdef _WIN64
				/* the WIN64 calling convention uses a common counter for int and fp registers */
				int_param++;
#endif
			}
		}
	}
	return IR_REG_NONE;
}

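/* Fill regs[3..] with the register used to pass each argument of a call
 * instruction (IR_REG_NONE for stack arguments); arguments start at operand 3
 * because operands 1 and 2 are the control input and the callee. Returns one
 * more than the operand index of the last register-passed argument (0 if
 * every argument goes on the stack). */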
static int ir_get_args_regs(const ir_ctx *ctx, const ir_insn *insn, int8_t *regs)
{
	int j, n;
	ir_type type;
	int int_param = 0;
	int fp_param = 0;
	int count = 0;
	int int_reg_params_count = IR_REG_INT_ARGS;
	int fp_reg_params_count = IR_REG_FP_ARGS;
	const int8_t *int_reg_params = _ir_int_reg_params;
	const int8_t *fp_reg_params = _ir_fp_reg_params;

#ifdef IR_HAVE_FASTCALL
	if (sizeof(void*) == 4 && ir_is_fastcall(ctx, insn)) {
		int_reg_params_count = IR_REG_INT_FCARGS;
		fp_reg_params_count = IR_REG_FP_FCARGS;
		int_reg_params = _ir_int_fc_reg_params;
		fp_reg_params = _ir_fp_fc_reg_params;
	}
#endif

	n = insn->inputs_count;
	n = IR_MIN(n, IR_MAX_REG_ARGS + 2);
	for (j = 3; j <= n; j++) {
		type = ctx->ir_base[ir_insn_op(insn, j)].type;
		if (IR_IS_TYPE_INT(type)) {
			if (int_param < int_reg_params_count) {
				regs[j] = int_reg_params[int_param];
				count = j + 1;
			} else {
				regs[j] = IR_REG_NONE;
			}
			int_param++;
#ifdef _WIN64
			/* the WIN64 calling convention uses a common counter for int and fp registers */
			fp_param++;
#endif
		} else {
			IR_ASSERT(IR_IS_TYPE_FP(type));
			if (fp_param < fp_reg_params_count) {
				regs[j] = fp_reg_params[fp_param];
				count = j + 1;
			} else {
				regs[j] = IR_REG_NONE;
			}
			fp_param++;
#ifdef _WIN64
			/* the WIN64 calling convention uses a common counter for int and fp registers */
			int_param++;
#endif
		}
	}
	return count;
}

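/* Check whether the spill slot assigned to r1 sits exactly at the given
 * stack offset. */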
static bool ir_is_same_mem_var(const ir_ctx *ctx, ir_ref r1, int32_t offset)
{
	ir_live_interval *ival1;
	int32_t o1;

	if (IR_IS_CONST_REF(r1)) {
		return 0;
	}

	IR_ASSERT(ctx->vregs[r1]);
	ival1 = ctx->live_intervals[ctx->vregs[r1]];
	IR_ASSERT(ival1);
	o1 = ival1->stack_spill_pos;
	IR_ASSERT(o1 != -1);
	return o1 == offset;
}

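/* Resolve a symbol by name in the whole process: dlsym() with RTLD_DEFAULT
 * (when available) on POSIX, a scan over all loaded modules on Windows. */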
void *ir_resolve_sym_name(const char *name)
{
	void *handle = NULL;
	void *addr;

#ifndef _WIN32
# ifdef RTLD_DEFAULT
	handle = RTLD_DEFAULT;
# endif
	addr = dlsym(handle, name);
#else
	HMODULE mods[256];
	DWORD cbNeeded;
	uint32_t i = 0;

	addr = NULL;

	EnumProcessModules(GetCurrentProcess(), mods, sizeof(mods), &cbNeeded);

	while (i < (cbNeeded / sizeof(HMODULE))) {
		addr = GetProcAddress(mods[i], name);
		if (addr) {
			return addr;
		}
		i++;
	}
#endif
	return addr;
}

#ifdef IR_SNAPSHOT_HANDLER_DCL
	IR_SNAPSHOT_HANDLER_DCL();
#endif

#if defined(IR_TARGET_X86) || defined(IR_TARGET_X64)
static void* ir_sym_addr(ir_ctx *ctx, const ir_insn *addr_insn)
{
	const char *name = ir_get_str(ctx, addr_insn->val.name);
	void *addr = (ctx->loader && ctx->loader->resolve_sym_name) ?
		ctx->loader->resolve_sym_name(ctx->loader, name, 0) :
		ir_resolve_sym_name(name);

	return addr;
}
#endif

static void* ir_sym_val(ir_ctx *ctx, const ir_insn *addr_insn)
{
	const char *name = ir_get_str(ctx, addr_insn->val.name);
	void *addr = (ctx->loader && ctx->loader->resolve_sym_name) ?
		ctx->loader->resolve_sym_name(ctx->loader, name, addr_insn->op == IR_FUNC) :
		ir_resolve_sym_name(name);

	IR_ASSERT(addr);
	return addr;
}

static void *ir_call_addr(ir_ctx *ctx, ir_insn *insn, ir_insn *addr_insn)
{
	void *addr;

	IR_ASSERT(addr_insn->type == IR_ADDR);
	if (addr_insn->op == IR_FUNC) {
		addr = ir_sym_val(ctx, addr_insn);
	} else {
		IR_ASSERT(addr_insn->op == IR_ADDR || addr_insn->op == IR_FUNC_ADDR);
		addr = (void*)addr_insn->val.addr;
	}
	return addr;
}

static void *ir_jmp_addr(ir_ctx *ctx, ir_insn *insn, ir_insn *addr_insn)
{
	void *addr = ir_call_addr(ctx, insn, addr_insn);

#ifdef IR_SNAPSHOT_HANDLER
	if (ctx->ir_base[insn->op1].op == IR_SNAPSHOT) {
		addr = IR_SNAPSHOT_HANDLER(ctx, insn->op1, &ctx->ir_base[insn->op1], addr);
	}
#endif
	return addr;
}

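/* Look up the register assigned to an operand of a fused instruction: the
 * (root, ref_and_op) pair is used as an 8-byte key into the ctx->fused_regs
 * string table, falling back to the plain per-instruction register map. */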
static int8_t ir_get_fused_reg(ir_ctx *ctx, ir_ref root, ir_ref ref_and_op)
{
	if (ctx->fused_regs) {
		char key[10];
		ir_ref val;

		memcpy(key, &root, sizeof(ir_ref));
		memcpy(key + 4, &ref_and_op, sizeof(ir_ref));

		val = ir_strtab_find(ctx->fused_regs, key, 8);
		if (val) {
			return val;
		}
	}
	return ((int8_t*)ctx->regs)[ref_and_op];
}

#if defined(__GNUC__)
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Warray-bounds"
# pragma GCC diagnostic ignored "-Wimplicit-fallthrough"
#endif

#if defined(IR_TARGET_X86) || defined(IR_TARGET_X64)
# include "dynasm/dasm_proto.h"
# include "dynasm/dasm_x86.h"
#elif defined(IR_TARGET_AARCH64)
# include "dynasm/dasm_proto.h"
static int ir_add_veneer(dasm_State *Dst, void *buffer, uint32_t ins, int *b, uint32_t *cp, ptrdiff_t offset);
# define DASM_ADD_VENEER ir_add_veneer
# include "dynasm/dasm_arm64.h"
#else
# error "Unknown IR target"
#endif

#if defined(__GNUC__)
# pragma GCC diagnostic pop
#endif

/* Forward Declarations */
static void ir_emit_osr_entry_loads(ir_ctx *ctx, int b, ir_block *bb);
static int ir_parallel_copy(ir_ctx *ctx, ir_copy *copies, int count, ir_reg tmp_reg, ir_reg tmp_fp_reg);
static void ir_emit_dessa_moves(ir_ctx *ctx, int b, ir_block *bb);

typedef struct _ir_common_backend_data {
	ir_reg_alloc_data  ra_data;
	uint32_t           dessa_from_block;
	dasm_State        *dasm_state;
	ir_bitset          emit_constants;
} ir_common_backend_data;

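/* Return the DynAsm label of a constant and mark the constant for emission
 * into the generated code; labels above cfg_blocks_count are reserved for
 * constants. */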
static int ir_const_label(ir_ctx *ctx, ir_ref ref)
{
	ir_common_backend_data *data = ctx->data;
	int label = ctx->cfg_blocks_count - ref;

	IR_ASSERT(IR_IS_CONST_REF(ref));
	ir_bitset_incl(data->emit_constants, -ref);
	return label;
}

#if defined(IR_TARGET_X86) || defined(IR_TARGET_X64)
# include "ir_emit_x86.h"
#elif defined(IR_TARGET_AARCH64)
# include "ir_emit_aarch64.h"
#else
# error "Unknown IR target"
#endif

static IR_NEVER_INLINE void ir_emit_osr_entry_loads(ir_ctx *ctx, int b, ir_block *bb)
{
	ir_list *list = (ir_list*)ctx->osr_entry_loads;
	int pos = 0, count, i;
	ir_ref ref;

	IR_ASSERT(ctx->binding);
	IR_ASSERT(list);
	while (1) {
		i = ir_list_at(list, pos);
		if (b == i) {
			break;
		}
		IR_ASSERT(i != 0); /* end marker */
		pos++;
		count = ir_list_at(list, pos);
		pos += count + 1;
	}
	pos++;
	count = ir_list_at(list, pos);
	pos++;

	for (i = 0; i < count; i++, pos++) {
		ref = ir_list_at(list, pos);
		IR_ASSERT(ref >= 0 && ctx->vregs[ref] && ctx->live_intervals[ctx->vregs[ref]]);
		if (!(ctx->live_intervals[ctx->vregs[ref]]->flags & IR_LIVE_INTERVAL_SPILLED)) {
			/* not spilled */
			ir_reg reg = ctx->live_intervals[ctx->vregs[ref]]->reg;
			ir_type type = ctx->ir_base[ref].type;
			int32_t offset = -ir_binding_find(ctx, ref);

			IR_ASSERT(offset > 0);
			ir_emit_load_mem(ctx, type, reg, IR_MEM_BO(ctx->spill_base, offset));
		} else {
			IR_ASSERT(ctx->live_intervals[ctx->vregs[ref]]->flags & IR_LIVE_INTERVAL_SPILL_SPECIAL);
		}
	}
}

/*
 * Parallel copy sequentialization algorithm
 *
 * The implementation is based on Algorithm 1 described in
 * "Revisiting Out-of-SSA Translation for Correctness, Code Quality and Efficiency",
 * Benoit Boissinot, Alain Darte, Fabrice Rastello, Benoit Dupont de Dinechin, Christophe Guillon.
 * 2009 International Symposium on Code Generation and Optimization, Seattle, WA, USA, 2009,
 * pp. 114-125, doi: 10.1109/CGO.2009.19.
 */
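/*
 * For example, the parallel copy {r0->r1, r1->r2, r2->r0, r3->r4} is
 * sequentialized as the tree move r4 = r3 first (a "windmill blade"),
 * followed by the cycle r0->r1->r2->r0 (a "windmill axle"), which is broken
 * with the temporary register (when the cycle is entered at r0):
 * tmp = r0; r0 = r2; r2 = r1; r1 = tmp.
 */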
static int ir_parallel_copy(ir_ctx *ctx, ir_copy *copies, int count, ir_reg tmp_reg, ir_reg tmp_fp_reg)
{
	int i;
	int8_t *pred, *loc, *types;
	ir_reg to, from;
	ir_type type;
	ir_regset todo, ready, srcs;

	if (count == 1) {
		to = copies[0].to;
		from = copies[0].from;
		IR_ASSERT(from != to);
		type = copies[0].type;
		if (IR_IS_TYPE_INT(type)) {
			ir_emit_mov(ctx, type, to, from);
		} else {
			ir_emit_fp_mov(ctx, type, to, from);
		}
		return 1;
	}

	loc = alloca(IR_REG_NUM * 3 * sizeof(int8_t));
	pred = loc + IR_REG_NUM;
	types = pred + IR_REG_NUM;
	todo = IR_REGSET_EMPTY;
	srcs = IR_REGSET_EMPTY;

	for (i = 0; i < count; i++) {
		from = copies[i].from;
		to = copies[i].to;
		IR_ASSERT(from != to);
		IR_REGSET_INCL(srcs, from);
		loc[from] = from;
		pred[to] = from;
		types[from] = copies[i].type;
		IR_ASSERT(!IR_REGSET_IN(todo, to));
		IR_REGSET_INCL(todo, to);
	}

	ready = IR_REGSET_DIFFERENCE(todo, srcs);

	if (ready == todo) {
		for (i = 0; i < count; i++) {
			from = copies[i].from;
			to = copies[i].to;
			IR_ASSERT(from != to);
			type = copies[i].type;
			if (IR_IS_TYPE_INT(type)) {
				ir_emit_mov(ctx, type, to, from);
			} else {
				ir_emit_fp_mov(ctx, type, to, from);
			}
		}
		return 1;
	}

	/* temporary registers can't be the same as any of the destinations */
	IR_ASSERT(tmp_reg == IR_REG_NONE || !IR_REGSET_IN(todo, tmp_reg));
	IR_ASSERT(tmp_fp_reg == IR_REG_NONE || !IR_REGSET_IN(todo, tmp_fp_reg));

	/* first we resolve all "windmill blades" - trees (this doesn't require temporary registers) */
	while (ready != IR_REGSET_EMPTY) {
		ir_reg r;

		to = ir_regset_pop_first(&ready);
		from = pred[to];
		r = loc[from];
		type = types[from];
		if (IR_IS_TYPE_INT(type)) {
			ir_emit_mov_ext(ctx, type, to, r);
		} else {
			ir_emit_fp_mov(ctx, type, to, r);
		}
		IR_REGSET_EXCL(todo, to);
		loc[from] = to;
		if (from == r && IR_REGSET_IN(todo, from)) {
			IR_REGSET_INCL(ready, from);
		}
	}
	if (todo == IR_REGSET_EMPTY) {
		return 1;
	}

	/* at this point the sources that are the same as the temporaries have already been moved */
	IR_ASSERT(tmp_reg == IR_REG_NONE || !IR_REGSET_IN(srcs, tmp_reg) || pred[loc[tmp_reg]] == tmp_reg);
	IR_ASSERT(tmp_fp_reg == IR_REG_NONE || !IR_REGSET_IN(srcs, tmp_fp_reg) || pred[loc[tmp_fp_reg]] == tmp_fp_reg);

	/* now we resolve all "windmill axles" - cycles (this requires temporary registers) */
	while (todo != IR_REGSET_EMPTY) {
		to = ir_regset_pop_first(&todo);
		from = pred[to];
		IR_ASSERT(to != loc[from]);
		type = types[from];
		if (IR_IS_TYPE_INT(type)) {
#ifdef IR_HAVE_SWAP_INT
			if (pred[from] == to) {
				ir_emit_swap(ctx, type, to, from);
				IR_REGSET_EXCL(todo, from);
				loc[to] = from;
				loc[from] = to;
				continue;
			}
#endif
			IR_ASSERT(tmp_reg != IR_REG_NONE);
			IR_ASSERT(tmp_reg >= IR_REG_GP_FIRST && tmp_reg <= IR_REG_GP_LAST);
			ir_emit_mov(ctx, type, tmp_reg, to);
			loc[to] = tmp_reg;
		} else {
#ifdef IR_HAVE_SWAP_FP
			if (pred[from] == to) {
				ir_emit_swap_fp(ctx, type, to, from);
				IR_REGSET_EXCL(todo, from);
				loc[to] = from;
				loc[from] = to;
				continue;
			}
#endif
			IR_ASSERT(tmp_fp_reg != IR_REG_NONE);
			IR_ASSERT(tmp_fp_reg >= IR_REG_FP_FIRST && tmp_fp_reg <= IR_REG_FP_LAST);
			ir_emit_fp_mov(ctx, type, tmp_fp_reg, to);
			loc[to] = tmp_fp_reg;
		}
		while (1) {
			ir_reg r;

			from = pred[to];
			r = loc[from];
			type = types[from];
			if (IR_IS_TYPE_INT(type)) {
				ir_emit_mov_ext(ctx, type, to, r);
			} else {
				ir_emit_fp_mov(ctx, type, to, r);
			}
			IR_REGSET_EXCL(todo, to);
			loc[from] = to;
			if (from == r && IR_REGSET_IN(todo, from)) {
				to = from;
			} else {
				break;
			}
		}
	}

	return 1;
}

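/* Emit one data-flow resolution move. `to`/`from` use the ir_dessa_copy
 * encoding: negative values are constants (or, beyond consts_count, local
 * variable addresses), [0..IR_REG_NUM) are CPU registers, and values from
 * IR_REG_NUM up denote spilled virtual registers. */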
static void ir_emit_dessa_move(ir_ctx *ctx, ir_type type, ir_ref to, ir_ref from, ir_reg tmp_reg, ir_reg tmp_fp_reg)
{
	ir_mem mem_from, mem_to;

	IR_ASSERT(from != to);
	if (to < IR_REG_NUM) {
		if (IR_IS_CONST_REF(from)) {
			if (-from < ctx->consts_count) {
				/* constant reference */
				ir_emit_load(ctx, type, to, from);
			} else {
				/* local variable address */
				ir_load_local_addr(ctx, to, -from - ctx->consts_count);
			}
		} else if (from < IR_REG_NUM) {
			if (IR_IS_TYPE_INT(type)) {
				ir_emit_mov(ctx, type, to, from);
			} else {
				ir_emit_fp_mov(ctx, type, to, from);
			}
		} else {
			mem_from = ir_vreg_spill_slot(ctx, from - IR_REG_NUM);
			ir_emit_load_mem(ctx, type, to, mem_from);
		}
	} else {
		mem_to = ir_vreg_spill_slot(ctx, to - IR_REG_NUM);
		if (IR_IS_CONST_REF(from)) {
			if (-from < ctx->consts_count) {
				/* constant reference */
#if defined(IR_TARGET_X86) || defined(IR_TARGET_X64)
				if (IR_IS_TYPE_INT(type)
				 && !IR_IS_SYM_CONST(ctx->ir_base[from].op)
				 && (ir_type_size[type] != 8 || IR_IS_SIGNED_32BIT(ctx->ir_base[from].val.i64))) {
					ir_emit_store_mem_imm(ctx, type, mem_to, ctx->ir_base[from].val.i32);
					return;
				}
#endif
				ir_reg tmp = IR_IS_TYPE_INT(type) ? tmp_reg : tmp_fp_reg;
				IR_ASSERT(tmp != IR_REG_NONE);
				ir_emit_load(ctx, type, tmp, from);
				ir_emit_store_mem(ctx, type, mem_to, tmp);
			} else {
				/* local variable address */
				IR_ASSERT(IR_IS_TYPE_INT(type));
				IR_ASSERT(tmp_reg != IR_REG_NONE);
				ir_load_local_addr(ctx, tmp_reg, -from - ctx->consts_count);
				ir_emit_store_mem(ctx, type, mem_to, tmp_reg);
			}
		} else if (from < IR_REG_NUM) {
			ir_emit_store_mem(ctx, type, mem_to, from);
		} else {
			mem_from = ir_vreg_spill_slot(ctx, from - IR_REG_NUM);
			IR_ASSERT(IR_MEM_VAL(mem_to) != IR_MEM_VAL(mem_from));
			ir_reg tmp = IR_IS_TYPE_INT(type) ? tmp_reg : tmp_fp_reg;
			IR_ASSERT(tmp != IR_REG_NONE);
			ir_emit_load_mem(ctx, type, tmp, mem_from);
			ir_emit_store_mem(ctx, type, mem_to, tmp);
		}
	}
}

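/* Break one copy cycle that includes the element `to`: save one value in a
 * temporary register (or swap two registers directly when the target supports
 * it), then shift the remaining values backwards along the cycle. */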
IR_ALWAYS_INLINE void ir_dessa_resolve_cycle(ir_ctx *ctx, int32_t *pred, int32_t *loc, int8_t *types, ir_bitset todo, int32_t to, ir_reg tmp_reg, ir_reg tmp_fp_reg)
{
	ir_ref from;
	ir_mem tmp_spill_slot;
	ir_type type;

	IR_MEM_VAL(tmp_spill_slot) = 0;
	IR_ASSERT(!IR_IS_CONST_REF(to));
	from = pred[to];
	type = types[from];
	IR_ASSERT(!IR_IS_CONST_REF(from));
	IR_ASSERT(from != to);
	IR_ASSERT(loc[from] == from);

	if (IR_IS_TYPE_INT(type)) {
#ifdef IR_HAVE_SWAP_INT
		if (pred[from] == to && to < IR_REG_NUM && from < IR_REG_NUM) {
			/* a simple cycle of 2 elements */
			if (ir_type_size[types[to]] > ir_type_size[type]) {
				type = types[to];
			}
			ir_emit_swap(ctx, type, to, from);
			ir_bitset_excl(todo, from);
			ir_bitset_excl(todo, to);
			loc[to] = from;
			loc[from] = to;
			return;
		}
#endif
		IR_ASSERT(tmp_reg != IR_REG_NONE);
		IR_ASSERT(tmp_reg >= IR_REG_GP_FIRST && tmp_reg <= IR_REG_GP_LAST);
		loc[to] = tmp_reg;
		if (to < IR_REG_NUM) {
			ir_emit_mov(ctx, type, tmp_reg, to);
		} else {
			ir_emit_load_mem_int(ctx, type, tmp_reg, ir_vreg_spill_slot(ctx, to - IR_REG_NUM));
		}
	} else {
#ifdef IR_HAVE_SWAP_FP
		if (pred[from] == to && to < IR_REG_NUM && from < IR_REG_NUM && types[to] == type) {
			/* a simple cycle of 2 elements */
			ir_emit_swap_fp(ctx, type, to, from);
			ir_bitset_excl(todo, from);
			ir_bitset_excl(todo, to);
			loc[to] = from;
			loc[from] = to;
			return;
		}
#endif
		IR_ASSERT(tmp_fp_reg != IR_REG_NONE);
		IR_ASSERT(tmp_fp_reg >= IR_REG_FP_FIRST && tmp_fp_reg <= IR_REG_FP_LAST);
		loc[to] = tmp_fp_reg;
		if (to < IR_REG_NUM) {
			ir_emit_fp_mov(ctx, type, tmp_fp_reg, to);
		} else {
			ir_emit_load_mem_fp(ctx, type, tmp_fp_reg, ir_vreg_spill_slot(ctx, to - IR_REG_NUM));
		}
	}

	while (1) {
		int32_t r;

		from = pred[to];
		r = loc[from];
		type = types[to];

		if (from == r && ir_bitset_in(todo, from)) {
			/* Memory to memory move inside an isolated or "blocked" cycle requires an additional temporary register */
			if (to >= IR_REG_NUM && r >= IR_REG_NUM) {
				ir_reg tmp = IR_IS_TYPE_INT(type) ? tmp_reg : tmp_fp_reg;

				if (!IR_MEM_VAL(tmp_spill_slot)) {
					/* Free a register, saving it in a temporary spill slot */
					tmp_spill_slot = IR_MEM_BO(IR_REG_STACK_POINTER, -16);
					ir_emit_store_mem(ctx, type, tmp_spill_slot, tmp);
				}
				ir_emit_dessa_move(ctx, type, to, r, tmp_reg, tmp_fp_reg);
			} else {
				ir_emit_dessa_move(ctx, type, to, r, IR_REG_NONE, IR_REG_NONE);
			}
			ir_bitset_excl(todo, to);
			loc[from] = to;
			to = from;
		} else {
			break;
		}
	}

	type = types[to];
	if (IR_MEM_VAL(tmp_spill_slot)) {
		ir_emit_load_mem(ctx, type, IR_IS_TYPE_INT(type) ? tmp_reg : tmp_fp_reg, tmp_spill_slot);
	}
	ir_emit_dessa_move(ctx, type, to, loc[from], IR_REG_NONE, IR_REG_NONE);
	ir_bitset_excl(todo, to);
	loc[from] = to;
}

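/* Sequentialize the parallel copies produced by SSA deconstruction. Unlike
 * ir_parallel_copy(), the operands may also be constants and spilled virtual
 * registers, so the worklists are bitsets rather than register sets. */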
static int ir_dessa_parallel_copy(ir_ctx *ctx, ir_dessa_copy *copies, int count, ir_reg tmp_reg, ir_reg tmp_fp_reg)
{
	int i;
	int32_t *pred, *loc, to, from;
	int8_t *types;
	ir_type type;
	uint32_t len;
	ir_bitset todo, ready, srcs, visited;

	if (count == 1) {
		to = copies[0].to;
		from = copies[0].from;
		IR_ASSERT(from != to);
		type = copies[0].type;
		ir_emit_dessa_move(ctx, type, to, from, tmp_reg, tmp_fp_reg);
		return 1;
	}

	len = IR_REG_NUM + ctx->vregs_count + 1;
	todo = ir_bitset_malloc(len);
	srcs = ir_bitset_malloc(len);
	loc = ir_mem_malloc(len * 2 * sizeof(int32_t) + len * sizeof(int8_t));
	pred = loc + len;
	types = (int8_t*)(pred + len);

	for (i = 0; i < count; i++) {
		from = copies[i].from;
		to = copies[i].to;
		IR_ASSERT(from != to);
		if (!IR_IS_CONST_REF(from)) {
			ir_bitset_incl(srcs, from);
			loc[from] = from;
		}
		pred[to] = from;
		types[to] = copies[i].type;
		IR_ASSERT(!ir_bitset_in(todo, to));
		ir_bitset_incl(todo, to);
	}

	/* temporary registers can't be the same as any of the sources */
	IR_ASSERT(tmp_reg == IR_REG_NONE || !ir_bitset_in(srcs, tmp_reg));
	IR_ASSERT(tmp_fp_reg == IR_REG_NONE || !ir_bitset_in(srcs, tmp_fp_reg));

	/* first we resolve all "windmill blades" - trees that don't set the temporary registers */
	ready = ir_bitset_malloc(len);
	ir_bitset_copy(ready, todo, ir_bitset_len(len));
	ir_bitset_difference(ready, srcs, ir_bitset_len(len));
	if (tmp_reg != IR_REG_NONE) {
		ir_bitset_excl(ready, tmp_reg);
	}
	if (tmp_fp_reg != IR_REG_NONE) {
		ir_bitset_excl(ready, tmp_fp_reg);
	}
	while ((to = ir_bitset_pop_first(ready, ir_bitset_len(len))) >= 0) {
		ir_bitset_excl(todo, to);
		type = types[to];
		from = pred[to];
		if (IR_IS_CONST_REF(from)) {
			ir_emit_dessa_move(ctx, type, to, from, tmp_reg, tmp_fp_reg);
		} else {
			int32_t r = loc[from];
			ir_emit_dessa_move(ctx, type, to, r, tmp_reg, tmp_fp_reg);
			loc[from] = to;
			if (from == r && ir_bitset_in(todo, from) && from != tmp_reg && from != tmp_fp_reg) {
				ir_bitset_incl(ready, from);
			}
		}
	}

	/* then we resolve all "windmill axles" - cycles (this requires temporary registers) */
	visited = ir_bitset_malloc(len);
	ir_bitset_copy(ready, todo, ir_bitset_len(len));
	ir_bitset_intersection(ready, srcs, ir_bitset_len(len));
	while ((to = ir_bitset_first(ready, ir_bitset_len(len))) >= 0) {
		ir_bitset_clear(visited, ir_bitset_len(len));
		ir_bitset_incl(visited, to);
		to = pred[to];
		while (!IR_IS_CONST_REF(to) && ir_bitset_in(ready, to)) {
			to = pred[to];
			if (IR_IS_CONST_REF(to)) {
				break;
			} else if (ir_bitset_in(visited, to)) {
				/* We found a cycle. Resolve it. */
				ir_bitset_incl(visited, to);
				ir_dessa_resolve_cycle(ctx, pred, loc, types, todo, to, tmp_reg, tmp_fp_reg);
				break;
			}
			ir_bitset_incl(visited, to);
		}
		ir_bitset_difference(ready, visited, ir_bitset_len(len));
	}

	/* finally we resolve the remaining "windmill blades" - trees that set the temporary registers */
	ir_bitset_copy(ready, todo, ir_bitset_len(len));
	ir_bitset_difference(ready, srcs, ir_bitset_len(len));
	while ((to = ir_bitset_pop_first(ready, ir_bitset_len(len))) >= 0) {
		ir_bitset_excl(todo, to);
		type = types[to];
		from = pred[to];
		if (IR_IS_CONST_REF(from)) {
			ir_emit_dessa_move(ctx, type, to, from, tmp_reg, tmp_fp_reg);
		} else {
			int32_t r = loc[from];
			ir_emit_dessa_move(ctx, type, to, r, tmp_reg, tmp_fp_reg);
			loc[from] = to;
			if (from == r && ir_bitset_in(todo, from)) {
				ir_bitset_incl(ready, from);
			}
		}
	}

	IR_ASSERT(ir_bitset_empty(todo, ir_bitset_len(len)));

	ir_mem_free(visited);
	ir_mem_free(ready);
	ir_mem_free(loc);
	ir_mem_free(srcs);
	ir_mem_free(todo);
	return 1;
}

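/* Emit the SSA deconstruction moves on the edge from block b to its single
 * successor: for every PHI in the successor, copy the input coming from b
 * into the location of the PHI result. */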
static void ir_emit_dessa_moves(ir_ctx *ctx, int b, ir_block *bb)
{
	uint32_t succ, k, n = 0;
	ir_block *succ_bb;
	ir_use_list *use_list;
	ir_ref i, *p;
	ir_dessa_copy *copies;
	ir_reg tmp_reg = ctx->regs[bb->end][0];
	ir_reg tmp_fp_reg = ctx->regs[bb->end][1];

	IR_ASSERT(bb->successors_count == 1);
	succ = ctx->cfg_edges[bb->successors];
	succ_bb = &ctx->cfg_blocks[succ];
	IR_ASSERT(succ_bb->predecessors_count > 1);
	use_list = &ctx->use_lists[succ_bb->start];
	k = ir_phi_input_number(ctx, succ_bb, b);

	copies = alloca(use_list->count * sizeof(ir_dessa_copy));

	for (i = 0, p = &ctx->use_edges[use_list->refs]; i < use_list->count; i++, p++) {
		ir_ref ref = *p;
		ir_insn *insn = &ctx->ir_base[ref];

		if (insn->op == IR_PHI) {
			ir_ref input = ir_insn_op(insn, k);
			ir_reg src = ir_get_alocated_reg(ctx, ref, k);
			ir_reg dst = ctx->regs[ref][0];
			ir_ref from, to;

			IR_ASSERT(dst == IR_REG_NONE || !IR_REG_SPILLED(dst));
			if (IR_IS_CONST_REF(input)) {
				from = input;
			} else if (ir_rule(ctx, input) == IR_STATIC_ALLOCA) {
				/* encode local variable address */
				from = -(ctx->consts_count + input);
			} else {
				from = (src != IR_REG_NONE && !IR_REG_SPILLED(src)) ?
					(ir_ref)src : (ir_ref)(IR_REG_NUM + ctx->vregs[input]);
			}
			to = (dst != IR_REG_NONE) ?
				(ir_ref)dst : (ir_ref)(IR_REG_NUM + ctx->vregs[ref]);
			if (to != from) {
				if (to >= IR_REG_NUM
				 && from >= IR_REG_NUM
				 && IR_MEM_VAL(ir_vreg_spill_slot(ctx, from - IR_REG_NUM)) ==
						IR_MEM_VAL(ir_vreg_spill_slot(ctx, to - IR_REG_NUM))) {
					/* It's possible that different virtual registers share the same special spill slot */
					// TODO: See ext/opcache/tests/jit/gh11917.phpt failure on Linux 32-bit
					continue;
				}
				copies[n].type = insn->type;
				copies[n].from = from;
				copies[n].to = to;
				n++;
			}
		}
	}

	if (n > 0) {
		ir_dessa_parallel_copy(ctx, copies, n, tmp_reg, tmp_fp_reg);
	}
}

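/* Instruction selection pass: assign a matching rule to every instruction,
 * walking blocks (and instructions within a block) backwards so that
 * operands can be fused into their users before being visited themselves. */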
int ir_match(ir_ctx *ctx)
{
	uint32_t b;
	ir_ref start, ref, *prev_ref;
	ir_block *bb;
	ir_insn *insn;
	uint32_t entries_count = 0;

	ctx->rules = ir_mem_calloc(ctx->insns_count, sizeof(uint32_t));

	prev_ref = ctx->prev_ref;
	if (!prev_ref) {
		ir_build_prev_refs(ctx);
		prev_ref = ctx->prev_ref;
	}

	if (ctx->entries_count) {
		ctx->entries = ir_mem_malloc(ctx->entries_count * sizeof(ir_ref));
	}

	for (b = ctx->cfg_blocks_count, bb = ctx->cfg_blocks + b; b > 0; b--, bb--) {
		IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
		start = bb->start;
		if (UNEXPECTED(bb->flags & IR_BB_ENTRY)) {
			IR_ASSERT(entries_count < ctx->entries_count);
			insn = &ctx->ir_base[start];
			IR_ASSERT(insn->op == IR_ENTRY);
			insn->op3 = entries_count;
			ctx->entries[entries_count] = b;
			entries_count++;
		}
		ctx->rules[start] = IR_SKIPPED | IR_NOP;
		ref = bb->end;
		if (bb->successors_count == 1) {
			insn = &ctx->ir_base[ref];
			if (insn->op == IR_END || insn->op == IR_LOOP_END) {
				ctx->rules[ref] = insn->op;
				ref = prev_ref[ref];
				if (ref == start && ctx->cfg_edges[bb->successors] != b) {
					if (EXPECTED(!(bb->flags & IR_BB_ENTRY))) {
						bb->flags |= IR_BB_EMPTY;
					} else if (ctx->flags & IR_MERGE_EMPTY_ENTRIES) {
						bb->flags |= IR_BB_EMPTY;
						if (ctx->cfg_edges[bb->successors] == b + 1) {
							(bb + 1)->flags |= IR_BB_PREV_EMPTY_ENTRY;
						}
					}
					continue;
				}
			}
		}

		ctx->bb_start = start; /* bb_start is used by the matcher to avoid fusion of insns from different blocks */

		while (ref != start) {
			uint32_t rule = ctx->rules[ref];

			if (!rule) {
				ctx->rules[ref] = rule = ir_match_insn(ctx, ref);
			}
			ir_match_insn2(ctx, ref, rule);
			ref = prev_ref[ref];
		}
	}

	if (ctx->entries_count) {
		ctx->entries_count = entries_count;
		if (!entries_count) {
			ir_mem_free(ctx->entries);
			ctx->entries = NULL;
		}
	}

	return 1;
}

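/* Return the stack frame offset of the spill slot assigned to the virtual
 * register of `ref`. */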
int32_t ir_get_spill_slot_offset(ir_ctx *ctx, ir_ref ref)
{
	int32_t offset;

	IR_ASSERT(ref >= 0 && ctx->vregs[ref] && ctx->live_intervals[ctx->vregs[ref]]);
	offset = ctx->live_intervals[ctx->vregs[ref]]->stack_spill_pos;
	IR_ASSERT(offset != -1);
	return IR_SPILL_POS_TO_OFFSET(offset);
}
1046