xref: /php-src/ext/opcache/jit/ir/ir_emit.c (revision 5708df71)
/*
 * IR - Lightweight JIT Compilation Framework
 * (Native code generator based on DynAsm)
 * Copyright (C) 2022 Zend by Perforce.
 * Authors: Dmitry Stogov <dmitry@php.net>
 */

#include "ir.h"

#if defined(IR_TARGET_X86) || defined(IR_TARGET_X64)
# include "ir_x86.h"
#elif defined(IR_TARGET_AARCH64)
# include "ir_aarch64.h"
#else
# error "Unknown IR target"
#endif

#include "ir_private.h"
#ifndef _WIN32
# include <dlfcn.h>
#else
# define WIN32_LEAN_AND_MEAN
# include <windows.h>
# include <psapi.h>
#endif

#if defined(__linux__) || defined(__sun)
# include <alloca.h>
#endif

#define DASM_M_GROW(ctx, t, p, sz, need) \
  do { \
    size_t _sz = (sz), _need = (need); \
    if (_sz < _need) { \
      if (_sz < 16) _sz = 16; \
      while (_sz < _need) _sz += _sz; \
      (p) = (t *)ir_mem_realloc((p), _sz); \
      (sz) = _sz; \
    } \
  } while(0)
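/* Illustrative example for the macro above (editor's note, not part of the
 * original source): growing a 24-byte DynAsm buffer to hold 100 bytes doubles
 * the size 24 -> 48 -> 96 -> 192, so a single ir_mem_realloc() to 192 bytes
 * satisfies the request with amortized O(1) growth. */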

#define DASM_M_FREE(ctx, p, sz) ir_mem_free(p)

#if IR_DEBUG
# define DASM_CHECKS
#endif

typedef struct _ir_copy {
	ir_type type;
	ir_reg  from;
	ir_reg  to;
} ir_copy;

typedef struct _ir_dessa_copy {
	ir_type type;
	int32_t from; /* negative - constant ref, [0..IR_REG_NUM) - CPU reg, [IR_REG_NUM...) - virtual reg */
	int32_t to;   /* [0..IR_REG_NUM) - CPU reg, [IR_REG_NUM...) - virtual reg */
} ir_dessa_copy;
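/* Encoding example for the struct above (editor's note; assumes a
 * hypothetical IR_REG_NUM of 32): from == -5 refers to the constant
 * ctx->ir_base[-5], from == 3 refers to CPU register 3, and from == 40
 * refers to virtual register 8 (40 - IR_REG_NUM), i.e. a value living in
 * a spill slot. */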

#if IR_REG_INT_ARGS
static const int8_t _ir_int_reg_params[IR_REG_INT_ARGS];
#else
static const int8_t *_ir_int_reg_params;
#endif
#if IR_REG_FP_ARGS
static const int8_t _ir_fp_reg_params[IR_REG_FP_ARGS];
#else
static const int8_t *_ir_fp_reg_params;
#endif

static const ir_proto_t *ir_call_proto(const ir_ctx *ctx, ir_insn *insn)
{
	if (IR_IS_CONST_REF(insn->op2)) {
		const ir_insn *func = &ctx->ir_base[insn->op2];

		if (func->op == IR_FUNC || func->op == IR_FUNC_ADDR) {
			if (func->proto) {
				return (const ir_proto_t *)ir_get_str(ctx, func->proto);
			}
		}
	} else if (ctx->ir_base[insn->op2].op == IR_PROTO) {
		return (const ir_proto_t *)ir_get_str(ctx, ctx->ir_base[insn->op2].op2);
	}
	return NULL;
}

#ifdef IR_HAVE_FASTCALL
static const int8_t _ir_int_fc_reg_params[IR_REG_INT_FCARGS];
static const int8_t *_ir_fp_fc_reg_params;

bool ir_is_fastcall(const ir_ctx *ctx, const ir_insn *insn)
{
	if (sizeof(void*) == 4) {
		if (IR_IS_CONST_REF(insn->op2)) {
			const ir_insn *func = &ctx->ir_base[insn->op2];

			if (func->op == IR_FUNC || func->op == IR_FUNC_ADDR) {
				if (func->proto) {
					const ir_proto_t *proto = (const ir_proto_t *)ir_get_str(ctx, func->proto);

					return (proto->flags & IR_FASTCALL_FUNC) != 0;
				}
			}
		} else if (ctx->ir_base[insn->op2].op == IR_PROTO) {
			const ir_proto_t *proto = (const ir_proto_t *)ir_get_str(ctx, ctx->ir_base[insn->op2].op2);

			return (proto->flags & IR_FASTCALL_FUNC) != 0;
		}
		return 0;
	}
	return 0;
}
#else
bool ir_is_fastcall(const ir_ctx *ctx, const ir_insn *insn)
{
	return 0;
}
#endif

bool ir_is_vararg(const ir_ctx *ctx, ir_insn *insn)
{
	const ir_proto_t *proto = ir_call_proto(ctx, insn);

	if (proto) {
		return (proto->flags & IR_VARARG_FUNC) != 0;
	}
	return 0;
}

IR_ALWAYS_INLINE uint32_t ir_rule(const ir_ctx *ctx, ir_ref ref)
{
	IR_ASSERT(!IR_IS_CONST_REF(ref));
	return ctx->rules[ref];
}

IR_ALWAYS_INLINE bool ir_in_same_block(ir_ctx *ctx, ir_ref ref)
{
	return ref > ctx->bb_start;
}


static ir_reg ir_get_param_reg(const ir_ctx *ctx, ir_ref ref)
{
	ir_use_list *use_list = &ctx->use_lists[1];
	int i;
	ir_ref use, *p;
	ir_insn *insn;
	int int_param = 0;
	int fp_param = 0;
	int int_reg_params_count = IR_REG_INT_ARGS;
	int fp_reg_params_count = IR_REG_FP_ARGS;
	const int8_t *int_reg_params = _ir_int_reg_params;
	const int8_t *fp_reg_params = _ir_fp_reg_params;

#ifdef IR_HAVE_FASTCALL
	if (sizeof(void*) == 4 && (ctx->flags & IR_FASTCALL_FUNC)) {
		int_reg_params_count = IR_REG_INT_FCARGS;
		fp_reg_params_count = IR_REG_FP_FCARGS;
		int_reg_params = _ir_int_fc_reg_params;
		fp_reg_params = _ir_fp_fc_reg_params;
	}
#endif

	for (i = 0, p = &ctx->use_edges[use_list->refs]; i < use_list->count; i++, p++) {
		use = *p;
		insn = &ctx->ir_base[use];
		if (insn->op == IR_PARAM) {
			if (IR_IS_TYPE_INT(insn->type)) {
				if (use == ref) {
					if (int_param < int_reg_params_count) {
						return int_reg_params[int_param];
					} else {
						return IR_REG_NONE;
					}
				}
				int_param++;
#ifdef _WIN64
				/* The WIN64 calling convention uses a common counter for int and fp registers */
				fp_param++;
#endif
			} else {
				IR_ASSERT(IR_IS_TYPE_FP(insn->type));
				if (use == ref) {
					if (fp_param < fp_reg_params_count) {
						return fp_reg_params[fp_param];
					} else {
						return IR_REG_NONE;
					}
				}
				fp_param++;
#ifdef _WIN64
				/* The WIN64 calling convention uses a common counter for int and fp registers */
				int_param++;
#endif
			}
		}
	}
	return IR_REG_NONE;
}
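/* Example for ir_get_param_reg() above (editor's note; assumes the x86-64
 * System V convention): for int f(int a, double b, int c), 'a' maps to the
 * 1st integer argument register and 'c' to the 2nd, while 'b' maps to the
 * 1st FP argument register. Under WIN64 the counters are shared, so 'c'
 * would instead map to the 3rd integer argument register. */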

static int ir_get_args_regs(const ir_ctx *ctx, const ir_insn *insn, int8_t *regs)
{
	int j, n;
	ir_type type;
	int int_param = 0;
	int fp_param = 0;
	int count = 0;
	int int_reg_params_count = IR_REG_INT_ARGS;
	int fp_reg_params_count = IR_REG_FP_ARGS;
	const int8_t *int_reg_params = _ir_int_reg_params;
	const int8_t *fp_reg_params = _ir_fp_reg_params;

#ifdef IR_HAVE_FASTCALL
	if (sizeof(void*) == 4 && ir_is_fastcall(ctx, insn)) {
		int_reg_params_count = IR_REG_INT_FCARGS;
		fp_reg_params_count = IR_REG_FP_FCARGS;
		int_reg_params = _ir_int_fc_reg_params;
		fp_reg_params = _ir_fp_fc_reg_params;
	}
#endif

	n = insn->inputs_count;
	n = IR_MIN(n, IR_MAX_REG_ARGS + 2);
	for (j = 3; j <= n; j++) {
		type = ctx->ir_base[ir_insn_op(insn, j)].type;
		if (IR_IS_TYPE_INT(type)) {
			if (int_param < int_reg_params_count) {
				regs[j] = int_reg_params[int_param];
				count = j + 1;
			} else {
				regs[j] = IR_REG_NONE;
			}
			int_param++;
#ifdef _WIN64
			/* The WIN64 calling convention uses a common counter for int and fp registers */
			fp_param++;
#endif
		} else {
			IR_ASSERT(IR_IS_TYPE_FP(type));
			if (fp_param < fp_reg_params_count) {
				regs[j] = fp_reg_params[fp_param];
				count = j + 1;
			} else {
				regs[j] = IR_REG_NONE;
			}
			fp_param++;
#ifdef _WIN64
			/* The WIN64 calling convention uses a common counter for int and fp registers */
			int_param++;
#endif
		}
	}
	return count;
}

static bool ir_is_same_mem_var(const ir_ctx *ctx, ir_ref r1, int32_t offset)
{
	ir_live_interval *ival1;
	int32_t o1;

	if (IR_IS_CONST_REF(r1)) {
		return 0;
	}

	IR_ASSERT(ctx->vregs[r1]);
	ival1 = ctx->live_intervals[ctx->vregs[r1]];
	IR_ASSERT(ival1);
	o1 = ival1->stack_spill_pos;
	IR_ASSERT(o1 != -1);
	return o1 == offset;
}

void *ir_resolve_sym_name(const char *name)
{
	void *handle = NULL;
	void *addr;

#ifndef _WIN32
# ifdef RTLD_DEFAULT
	handle = RTLD_DEFAULT;
# endif
	addr = dlsym(handle, name);
#else
	HMODULE mods[256];
	DWORD cbNeeded;
	uint32_t i = 0;

	addr = NULL;

	EnumProcessModules(GetCurrentProcess(), mods, sizeof(mods), &cbNeeded);

	while (i < (cbNeeded / sizeof(HMODULE))) {
		addr = GetProcAddress(mods[i], name);
		if (addr) {
			return addr;
		}
		i++;
	}
#endif
	return addr;
}
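/* Usage sketch for ir_resolve_sym_name() above (editor's note):
 *     void *p = ir_resolve_sym_name("printf");
 * resolves the symbol through the modules already loaded into the process:
 * dlsym() with RTLD_DEFAULT on POSIX, EnumProcessModules()/GetProcAddress()
 * on Windows. NULL is returned when no loaded module exports the name. */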

#ifdef IR_SNAPSHOT_HANDLER_DCL
	IR_SNAPSHOT_HANDLER_DCL();
#endif

#if defined(IR_TARGET_X86) || defined(IR_TARGET_X64)
static void* ir_sym_addr(ir_ctx *ctx, const ir_insn *addr_insn)
{
	const char *name = ir_get_str(ctx, addr_insn->val.name);
	void *addr = (ctx->loader && ctx->loader->resolve_sym_name) ?
		ctx->loader->resolve_sym_name(ctx->loader, name, 0) :
		ir_resolve_sym_name(name);

	return addr;
}
#endif

static void* ir_sym_val(ir_ctx *ctx, const ir_insn *addr_insn)
{
	const char *name = ir_get_str(ctx, addr_insn->val.name);
	void *addr = (ctx->loader && ctx->loader->resolve_sym_name) ?
		ctx->loader->resolve_sym_name(ctx->loader, name, addr_insn->op == IR_FUNC) :
		ir_resolve_sym_name(name);

	IR_ASSERT(addr);
	return addr;
}

static void *ir_call_addr(ir_ctx *ctx, ir_insn *insn, ir_insn *addr_insn)
{
	void *addr;

	IR_ASSERT(addr_insn->type == IR_ADDR);
	if (addr_insn->op == IR_FUNC) {
		addr = ir_sym_val(ctx, addr_insn);
	} else {
		IR_ASSERT(addr_insn->op == IR_ADDR || addr_insn->op == IR_FUNC_ADDR);
		addr = (void*)addr_insn->val.addr;
	}
	return addr;
}

static void *ir_jmp_addr(ir_ctx *ctx, ir_insn *insn, ir_insn *addr_insn)
{
	void *addr = ir_call_addr(ctx, insn, addr_insn);

#ifdef IR_SNAPSHOT_HANDLER
	if (ctx->ir_base[insn->op1].op == IR_SNAPSHOT) {
		addr = IR_SNAPSHOT_HANDLER(ctx, insn->op1, &ctx->ir_base[insn->op1], addr);
	}
#endif
	return addr;
}
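/* Editor's note (assumption) on the function below: the 8-byte strtab key
 * packs two 32-bit ir_refs - the fusion root and, presumably, a combined
 * ref/operand-slot index - so the lookup relies on sizeof(ir_ref) == 4. */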
static int8_t ir_get_fused_reg(ir_ctx *ctx, ir_ref root, ir_ref ref_and_op)
{
	if (ctx->fused_regs) {
		char key[10];
		ir_ref val;

		memcpy(key, &root, sizeof(ir_ref));
		memcpy(key + 4, &ref_and_op, sizeof(ir_ref));

		val = ir_strtab_find(ctx->fused_regs, key, 8);
		if (val) {
			return val;
		}
	}
	return ((int8_t*)ctx->regs)[ref_and_op];
}

#if defined(__GNUC__)
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Warray-bounds"
# pragma GCC diagnostic ignored "-Wimplicit-fallthrough"
#endif

#if defined(IR_TARGET_X86) || defined(IR_TARGET_X64)
# include "dynasm/dasm_proto.h"
# include "dynasm/dasm_x86.h"
#elif defined(IR_TARGET_AARCH64)
# include "dynasm/dasm_proto.h"
static int ir_add_veneer(dasm_State *Dst, void *buffer, uint32_t ins, int *b, uint32_t *cp, ptrdiff_t offset);
# define DASM_ADD_VENEER ir_add_veneer
# include "dynasm/dasm_arm64.h"
#else
# error "Unknown IR target"
#endif

#if defined(__GNUC__)
# pragma GCC diagnostic pop
#endif

/* Forward Declarations */
static void ir_emit_osr_entry_loads(ir_ctx *ctx, int b, ir_block *bb);
static int ir_parallel_copy(ir_ctx *ctx, ir_copy *copies, int count, ir_reg tmp_reg, ir_reg tmp_fp_reg);
static void ir_emit_dessa_moves(ir_ctx *ctx, int b, ir_block *bb);

typedef struct _ir_common_backend_data {
	ir_reg_alloc_data  ra_data;
	uint32_t           dessa_from_block;
	dasm_State        *dasm_state;
	ir_bitset          emit_constants;
} ir_common_backend_data;
static int ir_const_label(ir_ctx *ctx, ir_ref ref)
{
	ir_common_backend_data *data = ctx->data;
	int label = ctx->cfg_blocks_count - ref;

	IR_ASSERT(IR_IS_CONST_REF(ref));
	ir_bitset_incl(data->emit_constants, -ref);
	return label;
}
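/* Example for ir_const_label() above (editor's note): constant references
 * are negative, so constants get labels above the block labels; with
 * cfg_blocks_count == 10, the constant ref -3 receives label 13 and is
 * queued in emit_constants for emission after the code. */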

#if defined(IR_TARGET_X86) || defined(IR_TARGET_X64)
# include "ir_emit_x86.h"
#elif defined(IR_TARGET_AARCH64)
# include "ir_emit_aarch64.h"
#else
# error "Unknown IR target"
#endif

static IR_NEVER_INLINE void ir_emit_osr_entry_loads(ir_ctx *ctx, int b, ir_block *bb)
{
	ir_list *list = (ir_list*)ctx->osr_entry_loads;
	int pos = 0, count, i;
	ir_ref ref;

	IR_ASSERT(ctx->binding);
	IR_ASSERT(list);
	while (1) {
		i = ir_list_at(list, pos);
		if (b == i) {
			break;
		}
		IR_ASSERT(i != 0); /* end marker */
		pos++;
		count = ir_list_at(list, pos);
		pos += count + 1;
	}
	pos++;
	count = ir_list_at(list, pos);
	pos++;

	for (i = 0; i < count; i++, pos++) {
		ref = ir_list_at(list, pos);
		IR_ASSERT(ref >= 0 && ctx->vregs[ref] && ctx->live_intervals[ctx->vregs[ref]]);
		if (!(ctx->live_intervals[ctx->vregs[ref]]->flags & IR_LIVE_INTERVAL_SPILLED)) {
			/* not spilled */
			ir_reg reg = ctx->live_intervals[ctx->vregs[ref]]->reg;
			ir_type type = ctx->ir_base[ref].type;
			int32_t offset = -ir_binding_find(ctx, ref);

			IR_ASSERT(offset > 0);
			ir_emit_load_mem(ctx, type, reg, IR_MEM_BO(ctx->spill_base, offset));
		} else {
			IR_ASSERT(ctx->live_intervals[ctx->vregs[ref]]->flags & IR_LIVE_INTERVAL_SPILL_SPECIAL);
		}
	}
}

/*
 * Parallel copy sequentialization algorithm
 *
 * The implementation is based on Algorithm 1 described in
 * "Revisiting Out-of-SSA Translation for Correctness, Code Quality and Efficiency",
 * Benoit Boissinot, Alain Darte, Fabrice Rastello, Benoit Dupont de Dinechin, Christophe Guillon.
 * 2009 International Symposium on Code Generation and Optimization, Seattle, WA, USA, 2009,
 * pp. 114-125, doi: 10.1109/CGO.2009.19.
 */
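/* Worked example (editor's note): for the parallel copy
 * {r2<-r1, r3<-r2, r1<-r3, r5<-r4}, the tree move r5<-r4 is emitted first
 * (r5 is no copy's source), then the remaining 3-cycle is broken with the
 * temporary register; one valid sequence (the exact order depends on
 * register numbering) is: tmp<-r2; r2<-r1; r1<-r3; r3<-tmp. */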
static int ir_parallel_copy(ir_ctx *ctx, ir_copy *copies, int count, ir_reg tmp_reg, ir_reg tmp_fp_reg)
{
	int i;
	int8_t *pred, *loc, *types;
	ir_reg to, from;
	ir_type type;
	ir_regset todo, ready, srcs;

	if (count == 1) {
		to = copies[0].to;
		from = copies[0].from;
		IR_ASSERT(from != to);
		type = copies[0].type;
		if (IR_IS_TYPE_INT(type)) {
			ir_emit_mov(ctx, type, to, from);
		} else {
			ir_emit_fp_mov(ctx, type, to, from);
		}
		return 1;
	}

	loc = alloca(IR_REG_NUM * 3 * sizeof(int8_t));
	pred = loc + IR_REG_NUM;
	types = pred + IR_REG_NUM;
	todo = IR_REGSET_EMPTY;
	srcs = IR_REGSET_EMPTY;

	for (i = 0; i < count; i++) {
		from = copies[i].from;
		to = copies[i].to;
		IR_ASSERT(from != to);
		IR_REGSET_INCL(srcs, from);
		loc[from] = from;
		pred[to] = from;
		types[from] = copies[i].type;
		IR_ASSERT(!IR_REGSET_IN(todo, to));
		IR_REGSET_INCL(todo, to);
	}

	ready = IR_REGSET_DIFFERENCE(todo, srcs);

	if (ready == todo) {
		for (i = 0; i < count; i++) {
			from = copies[i].from;
			to = copies[i].to;
			IR_ASSERT(from != to);
			type = copies[i].type;
			if (IR_IS_TYPE_INT(type)) {
				ir_emit_mov(ctx, type, to, from);
			} else {
				ir_emit_fp_mov(ctx, type, to, from);
			}
		}
		return 1;
	}

	/* temporary registers can't be the same as any of the destinations */
	IR_ASSERT(tmp_reg == IR_REG_NONE || !IR_REGSET_IN(todo, tmp_reg));
	IR_ASSERT(tmp_fp_reg == IR_REG_NONE || !IR_REGSET_IN(todo, tmp_fp_reg));

	/* first we resolve all "windmill blades" - trees (this doesn't require temporary registers) */
	while (ready != IR_REGSET_EMPTY) {
		ir_reg r;

		to = ir_regset_pop_first(&ready);
		from = pred[to];
		r = loc[from];
		type = types[from];
		if (IR_IS_TYPE_INT(type)) {
			ir_emit_mov_ext(ctx, type, to, r);
		} else {
			ir_emit_fp_mov(ctx, type, to, r);
		}
		IR_REGSET_EXCL(todo, to);
		loc[from] = to;
		if (from == r && IR_REGSET_IN(todo, from)) {
			IR_REGSET_INCL(ready, from);
		}
	}
	if (todo == IR_REGSET_EMPTY) {
		return 1;
	}

	/* at this point the sources that are the same as the temporaries have already been moved */
	IR_ASSERT(tmp_reg == IR_REG_NONE || !IR_REGSET_IN(srcs, tmp_reg) || pred[loc[tmp_reg]] == tmp_reg);
	IR_ASSERT(tmp_fp_reg == IR_REG_NONE || !IR_REGSET_IN(srcs, tmp_fp_reg) || pred[loc[tmp_fp_reg]] == tmp_fp_reg);

	/* now we resolve all "windmill axles" - cycles (this requires temporary registers) */
	while (todo != IR_REGSET_EMPTY) {
		to = ir_regset_pop_first(&todo);
		from = pred[to];
		IR_ASSERT(to != loc[from]);
		type = types[from];
		if (IR_IS_TYPE_INT(type)) {
#ifdef IR_HAVE_SWAP_INT
			if (pred[from] == to) {
				ir_emit_swap(ctx, type, to, from);
				IR_REGSET_EXCL(todo, from);
				loc[to] = from;
				loc[from] = to;
				continue;
			}
#endif
			IR_ASSERT(tmp_reg != IR_REG_NONE);
			IR_ASSERT(tmp_reg >= IR_REG_GP_FIRST && tmp_reg <= IR_REG_GP_LAST);
			ir_emit_mov(ctx, type, tmp_reg, to);
			loc[to] = tmp_reg;
		} else {
#ifdef IR_HAVE_SWAP_FP
			if (pred[from] == to) {
				ir_emit_swap_fp(ctx, type, to, from);
				IR_REGSET_EXCL(todo, from);
				loc[to] = from;
				loc[from] = to;
				continue;
			}
#endif
			IR_ASSERT(tmp_fp_reg != IR_REG_NONE);
			IR_ASSERT(tmp_fp_reg >= IR_REG_FP_FIRST && tmp_fp_reg <= IR_REG_FP_LAST);
			ir_emit_fp_mov(ctx, type, tmp_fp_reg, to);
			loc[to] = tmp_fp_reg;
		}
		while (1) {
			ir_reg r;

			from = pred[to];
			r = loc[from];
			type = types[from];
			if (IR_IS_TYPE_INT(type)) {
				ir_emit_mov_ext(ctx, type, to, r);
			} else {
				ir_emit_fp_mov(ctx, type, to, r);
			}
			IR_REGSET_EXCL(todo, to);
			loc[from] = to;
			if (from == r && IR_REGSET_IN(todo, from)) {
				to = from;
			} else {
				break;
			}
		}
	}

	return 1;
}

static void ir_emit_dessa_move(ir_ctx *ctx, ir_type type, ir_ref to, ir_ref from, ir_reg tmp_reg, ir_reg tmp_fp_reg)
{
	ir_mem mem_from, mem_to;

	IR_ASSERT(from != to);
	if (to < IR_REG_NUM) {
		if (IR_IS_CONST_REF(from)) {
			if (-from < ctx->consts_count) {
				/* constant reference */
				ir_emit_load(ctx, type, to, from);
			} else {
				/* local variable address */
				ir_load_local_addr(ctx, to, -from - ctx->consts_count);
			}
		} else if (from < IR_REG_NUM) {
			if (IR_IS_TYPE_INT(type)) {
				ir_emit_mov(ctx, type, to, from);
			} else {
				ir_emit_fp_mov(ctx, type, to, from);
			}
		} else {
			mem_from = ir_vreg_spill_slot(ctx, from - IR_REG_NUM);
			ir_emit_load_mem(ctx, type, to, mem_from);
		}
	} else {
		mem_to = ir_vreg_spill_slot(ctx, to - IR_REG_NUM);
		if (IR_IS_CONST_REF(from)) {
			if (-from < ctx->consts_count) {
				/* constant reference */
#if defined(IR_TARGET_X86) || defined(IR_TARGET_X64)
				if (IR_IS_TYPE_INT(type)
				 && !IR_IS_SYM_CONST(ctx->ir_base[from].op)
				 && (ir_type_size[type] != 8 || IR_IS_SIGNED_32BIT(ctx->ir_base[from].val.i64))) {
					ir_emit_store_mem_imm(ctx, type, mem_to, ctx->ir_base[from].val.i32);
					return;
				}
#endif
				ir_reg tmp = IR_IS_TYPE_INT(type) ? tmp_reg : tmp_fp_reg;
				IR_ASSERT(tmp != IR_REG_NONE);
				ir_emit_load(ctx, type, tmp, from);
				ir_emit_store_mem(ctx, type, mem_to, tmp);
			} else {
				/* local variable address */
				IR_ASSERT(IR_IS_TYPE_INT(type));
				IR_ASSERT(tmp_reg != IR_REG_NONE);
				ir_load_local_addr(ctx, tmp_reg, -from - ctx->consts_count);
				ir_emit_store_mem(ctx, type, mem_to, tmp_reg);
			}
		} else if (from < IR_REG_NUM) {
			ir_emit_store_mem(ctx, type, mem_to, from);
		} else {
			mem_from = ir_vreg_spill_slot(ctx, from - IR_REG_NUM);
			IR_ASSERT(IR_MEM_VAL(mem_to) != IR_MEM_VAL(mem_from));
			ir_reg tmp = IR_IS_TYPE_INT(type) ? tmp_reg : tmp_fp_reg;
			IR_ASSERT(tmp != IR_REG_NONE);
			ir_emit_load_mem(ctx, type, tmp, mem_from);
			ir_emit_store_mem(ctx, type, mem_to, tmp);
		}
	}
}

IR_ALWAYS_INLINE void ir_dessa_resolve_cycle(ir_ctx *ctx, int32_t *pred, int32_t *loc, int8_t *types, ir_bitset todo, int32_t to, ir_reg tmp_reg, ir_reg tmp_fp_reg)
{
	ir_ref from;
	ir_mem tmp_spill_slot;
	ir_type type;

	IR_MEM_VAL(tmp_spill_slot) = 0;
	IR_ASSERT(!IR_IS_CONST_REF(to));
	from = pred[to];
	type = types[from];
	IR_ASSERT(!IR_IS_CONST_REF(from));
	IR_ASSERT(from != to);
	IR_ASSERT(loc[from] == from);

	if (IR_IS_TYPE_INT(type)) {
#ifdef IR_HAVE_SWAP_INT
		if (pred[from] == to && to < IR_REG_NUM && from < IR_REG_NUM) {
			/* a simple cycle of 2 elements */
			ir_emit_swap(ctx, type, to, from);
			ir_bitset_excl(todo, from);
			ir_bitset_excl(todo, to);
			loc[to] = from;
			loc[from] = to;
			return;
		}
#endif
		IR_ASSERT(tmp_reg != IR_REG_NONE);
		IR_ASSERT(tmp_reg >= IR_REG_GP_FIRST && tmp_reg <= IR_REG_GP_LAST);
		loc[to] = tmp_reg;
		if (to < IR_REG_NUM) {
			ir_emit_mov(ctx, type, tmp_reg, to);
		} else {
			ir_emit_load_mem_int(ctx, type, tmp_reg, ir_vreg_spill_slot(ctx, to - IR_REG_NUM));
		}
	} else {
#ifdef IR_HAVE_SWAP_FP
		if (pred[from] == to && to < IR_REG_NUM && from < IR_REG_NUM) {
			/* a simple cycle of 2 elements */
			ir_emit_swap_fp(ctx, type, to, from);
			ir_bitset_excl(todo, from);
			ir_bitset_excl(todo, to);
			loc[to] = from;
			loc[from] = to;
			return;
		}
#endif
		IR_ASSERT(tmp_fp_reg != IR_REG_NONE);
		IR_ASSERT(tmp_fp_reg >= IR_REG_FP_FIRST && tmp_fp_reg <= IR_REG_FP_LAST);
		loc[to] = tmp_fp_reg;
		if (to < IR_REG_NUM) {
			ir_emit_fp_mov(ctx, type, tmp_fp_reg, to);
		} else {
			ir_emit_load_mem_fp(ctx, type, tmp_fp_reg, ir_vreg_spill_slot(ctx, to - IR_REG_NUM));
		}
	}

	while (1) {
		int32_t r;

		from = pred[to];
		r = loc[from];
		type = types[to];

		if (from == r && ir_bitset_in(todo, from)) {
			/* A memory-to-memory move inside an isolated or "blocked" cycle requires an additional temporary register */
			if (to >= IR_REG_NUM && r >= IR_REG_NUM) {
				ir_reg tmp = IR_IS_TYPE_INT(type) ? tmp_reg : tmp_fp_reg;

				if (!IR_MEM_VAL(tmp_spill_slot)) {
					/* Free a register, saving it in a temporary spill slot */
					tmp_spill_slot = IR_MEM_BO(IR_REG_STACK_POINTER, -16);
					ir_emit_store_mem(ctx, type, tmp_spill_slot, tmp);
				}
				ir_emit_dessa_move(ctx, type, to, r, tmp_reg, tmp_fp_reg);
			} else {
				ir_emit_dessa_move(ctx, type, to, r, IR_REG_NONE, IR_REG_NONE);
			}
			ir_bitset_excl(todo, to);
			loc[from] = to;
			to = from;
		} else {
			break;
		}
	}

	type = types[to];
	if (IR_MEM_VAL(tmp_spill_slot)) {
		ir_emit_load_mem(ctx, type, IR_IS_TYPE_INT(type) ? tmp_reg : tmp_fp_reg, tmp_spill_slot);
	}
	ir_emit_dessa_move(ctx, type, to, loc[from], IR_REG_NONE, IR_REG_NONE);
	ir_bitset_excl(todo, to);
	loc[from] = to;
}

static int ir_dessa_parallel_copy(ir_ctx *ctx, ir_dessa_copy *copies, int count, ir_reg tmp_reg, ir_reg tmp_fp_reg)
{
	int i;
	int32_t *pred, *loc, to, from;
	int8_t *types;
	ir_type type;
	uint32_t len;
	ir_bitset todo, ready, srcs, visited;

	if (count == 1) {
		to = copies[0].to;
		from = copies[0].from;
		IR_ASSERT(from != to);
		type = copies[0].type;
		ir_emit_dessa_move(ctx, type, to, from, tmp_reg, tmp_fp_reg);
		return 1;
	}

	len = IR_REG_NUM + ctx->vregs_count + 1;
	todo = ir_bitset_malloc(len);
	srcs = ir_bitset_malloc(len);
	loc = ir_mem_malloc(len * 2 * sizeof(int32_t) + len * sizeof(int8_t));
	pred = loc + len;
	types = (int8_t*)(pred + len);

	for (i = 0; i < count; i++) {
		from = copies[i].from;
		to = copies[i].to;
		IR_ASSERT(from != to);
		if (!IR_IS_CONST_REF(from)) {
			ir_bitset_incl(srcs, from);
			loc[from] = from;
		}
		pred[to] = from;
		types[to] = copies[i].type;
		IR_ASSERT(!ir_bitset_in(todo, to));
		ir_bitset_incl(todo, to);
	}

	/* temporary registers can't be the same as any of the sources */
	IR_ASSERT(tmp_reg == IR_REG_NONE || !ir_bitset_in(srcs, tmp_reg));
	IR_ASSERT(tmp_fp_reg == IR_REG_NONE || !ir_bitset_in(srcs, tmp_fp_reg));

	/* first we resolve all "windmill blades" - trees that don't target the temporary registers */
	ready = ir_bitset_malloc(len);
	ir_bitset_copy(ready, todo, ir_bitset_len(len));
	ir_bitset_difference(ready, srcs, ir_bitset_len(len));
	if (tmp_reg != IR_REG_NONE) {
		ir_bitset_excl(ready, tmp_reg);
	}
	if (tmp_fp_reg != IR_REG_NONE) {
		ir_bitset_excl(ready, tmp_fp_reg);
	}
	while ((to = ir_bitset_pop_first(ready, ir_bitset_len(len))) >= 0) {
		ir_bitset_excl(todo, to);
		type = types[to];
		from = pred[to];
		if (IR_IS_CONST_REF(from)) {
			ir_emit_dessa_move(ctx, type, to, from, tmp_reg, tmp_fp_reg);
		} else {
			int32_t r = loc[from];
			ir_emit_dessa_move(ctx, type, to, r, tmp_reg, tmp_fp_reg);
			loc[from] = to;
			if (from == r && ir_bitset_in(todo, from) && from != tmp_reg && from != tmp_fp_reg) {
				ir_bitset_incl(ready, from);
			}
		}
	}

	/* then we resolve all "windmill axles" - cycles (this requires temporary registers) */
	visited = ir_bitset_malloc(len);
	ir_bitset_copy(ready, todo, ir_bitset_len(len));
	ir_bitset_intersection(ready, srcs, ir_bitset_len(len));
	while ((to = ir_bitset_first(ready, ir_bitset_len(len))) >= 0) {
		ir_bitset_clear(visited, ir_bitset_len(len));
		ir_bitset_incl(visited, to);
		to = pred[to];
		while (!IR_IS_CONST_REF(to) && ir_bitset_in(ready, to)) {
			to = pred[to];
			if (IR_IS_CONST_REF(to)) {
				break;
			} else if (ir_bitset_in(visited, to)) {
				/* We found a cycle. Resolve it. */
				ir_bitset_incl(visited, to);
				ir_dessa_resolve_cycle(ctx, pred, loc, types, todo, to, tmp_reg, tmp_fp_reg);
				break;
			}
			ir_bitset_incl(visited, to);
		}
		ir_bitset_difference(ready, visited, ir_bitset_len(len));
	}

	/* finally we resolve the remaining "windmill blades" - trees that do target the temporary registers */
	ir_bitset_copy(ready, todo, ir_bitset_len(len));
	ir_bitset_difference(ready, srcs, ir_bitset_len(len));
	while ((to = ir_bitset_pop_first(ready, ir_bitset_len(len))) >= 0) {
		ir_bitset_excl(todo, to);
		type = types[to];
		from = pred[to];
		if (IR_IS_CONST_REF(from)) {
			ir_emit_dessa_move(ctx, type, to, from, tmp_reg, tmp_fp_reg);
		} else {
			int32_t r = loc[from];
			ir_emit_dessa_move(ctx, type, to, r, tmp_reg, tmp_fp_reg);
			loc[from] = to;
			if (from == r && ir_bitset_in(todo, from)) {
				ir_bitset_incl(ready, from);
			}
		}
	}

	IR_ASSERT(ir_bitset_empty(todo, ir_bitset_len(len)));

	ir_mem_free(visited);
	ir_mem_free(ready);
	ir_mem_free(loc);
	ir_mem_free(srcs);
	ir_mem_free(todo);
	return 1;
}

static void ir_emit_dessa_moves(ir_ctx *ctx, int b, ir_block *bb)
{
	uint32_t succ, k, n = 0;
	ir_block *succ_bb;
	ir_use_list *use_list;
	ir_ref i, *p;
	ir_dessa_copy *copies;
	ir_reg tmp_reg = ctx->regs[bb->end][0];
	ir_reg tmp_fp_reg = ctx->regs[bb->end][1];

	IR_ASSERT(bb->successors_count == 1);
	succ = ctx->cfg_edges[bb->successors];
	succ_bb = &ctx->cfg_blocks[succ];
	IR_ASSERT(succ_bb->predecessors_count > 1);
	use_list = &ctx->use_lists[succ_bb->start];
	k = ir_phi_input_number(ctx, succ_bb, b);

	copies = alloca(use_list->count * sizeof(ir_dessa_copy));

	for (i = 0, p = &ctx->use_edges[use_list->refs]; i < use_list->count; i++, p++) {
		ir_ref ref = *p;
		ir_insn *insn = &ctx->ir_base[ref];

		if (insn->op == IR_PHI) {
			ir_ref input = ir_insn_op(insn, k);
			ir_reg src = ir_get_alocated_reg(ctx, ref, k);
			ir_reg dst = ctx->regs[ref][0];
			ir_ref from, to;

			IR_ASSERT(dst == IR_REG_NONE || !IR_REG_SPILLED(dst));
			if (IR_IS_CONST_REF(input)) {
				from = input;
			} else if (ir_rule(ctx, input) == IR_STATIC_ALLOCA) {
				/* encode local variable address */
				from = -(ctx->consts_count + input);
			} else {
				from = (src != IR_REG_NONE && !IR_REG_SPILLED(src)) ?
					(ir_ref)src : (ir_ref)(IR_REG_NUM + ctx->vregs[input]);
			}
			to = (dst != IR_REG_NONE) ?
				(ir_ref)dst : (ir_ref)(IR_REG_NUM + ctx->vregs[ref]);
			if (to != from) {
				if (to >= IR_REG_NUM
				 && from >= IR_REG_NUM
				 && IR_MEM_VAL(ir_vreg_spill_slot(ctx, from - IR_REG_NUM)) ==
						IR_MEM_VAL(ir_vreg_spill_slot(ctx, to - IR_REG_NUM))) {
					/* It's possible that different virtual registers share the same special spill slot */
					// TODO: See ext/opcache/tests/jit/gh11917.phpt failure on Linux 32-bit
					continue;
				}
				copies[n].type = insn->type;
				copies[n].from = from;
				copies[n].to = to;
				n++;
			}
		}
	}

	if (n > 0) {
		ir_dessa_parallel_copy(ctx, copies, n, tmp_reg, tmp_fp_reg);
	}
}

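/* Editor's note (summary of the function below): ir_match() is the
 * instruction-selection pass. It walks the CFG blocks backwards, assigns a
 * matching rule id to every instruction in ctx->rules (possibly fusing
 * operands from the same block, see ctx->bb_start), records ENTRY blocks in
 * ctx->entries, and marks trivially empty blocks with IR_BB_EMPTY. */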
int ir_match(ir_ctx *ctx)
{
	uint32_t b;
	ir_ref start, ref, *prev_ref;
	ir_block *bb;
	ir_insn *insn;
	uint32_t entries_count = 0;

	ctx->rules = ir_mem_calloc(ctx->insns_count, sizeof(uint32_t));

	prev_ref = ctx->prev_ref;
	if (!prev_ref) {
		ir_build_prev_refs(ctx);
		prev_ref = ctx->prev_ref;
	}

	if (ctx->entries_count) {
		ctx->entries = ir_mem_malloc(ctx->entries_count * sizeof(ir_ref));
	}

	for (b = ctx->cfg_blocks_count, bb = ctx->cfg_blocks + b; b > 0; b--, bb--) {
		IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
		start = bb->start;
		if (UNEXPECTED(bb->flags & IR_BB_ENTRY)) {
			IR_ASSERT(entries_count < ctx->entries_count);
			insn = &ctx->ir_base[start];
			IR_ASSERT(insn->op == IR_ENTRY);
			insn->op3 = entries_count;
			ctx->entries[entries_count] = b;
			entries_count++;
		}
		ctx->rules[start] = IR_SKIPPED | IR_NOP;
		ref = bb->end;
		if (bb->successors_count == 1) {
			insn = &ctx->ir_base[ref];
			if (insn->op == IR_END || insn->op == IR_LOOP_END) {
				ctx->rules[ref] = insn->op;
				ref = prev_ref[ref];
				if (ref == start && ctx->cfg_edges[bb->successors] != b) {
					if (EXPECTED(!(bb->flags & IR_BB_ENTRY))) {
						bb->flags |= IR_BB_EMPTY;
					} else if (ctx->flags & IR_MERGE_EMPTY_ENTRIES) {
						bb->flags |= IR_BB_EMPTY;
						if (ctx->cfg_edges[bb->successors] == b + 1) {
							(bb + 1)->flags |= IR_BB_PREV_EMPTY_ENTRY;
						}
					}
					continue;
				}
			}
		}

		ctx->bb_start = start; /* bb_start is used by the matcher to avoid fusion of insns from different blocks */

		while (ref != start) {
			uint32_t rule = ctx->rules[ref];

			if (!rule) {
				ctx->rules[ref] = rule = ir_match_insn(ctx, ref);
			}
			ir_match_insn2(ctx, ref, rule);
			ref = prev_ref[ref];
		}
	}

	if (ctx->entries_count) {
		ctx->entries_count = entries_count;
		if (!entries_count) {
			ir_mem_free(ctx->entries);
			ctx->entries = NULL;
		}
	}

	return 1;
}

int32_t ir_get_spill_slot_offset(ir_ctx *ctx, ir_ref ref)
{
	int32_t offset;

	IR_ASSERT(ref >= 0 && ctx->vregs[ref] && ctx->live_intervals[ctx->vregs[ref]]);
	offset = ctx->live_intervals[ctx->vregs[ref]]->stack_spill_pos;
	IR_ASSERT(offset != -1);
	return IR_SPILL_POS_TO_OFFSET(offset);
}