/*
 * IR - Lightweight JIT Compilation Framework
 * (x86/x86_64 native code generator based on DynAsm)
 * Copyright (C) 2022 Zend by Perforce.
 * Authors: Dmitry Stogov <dmitry@php.net>
 */

|.if X64
|.arch x64
|.else
|.arch x86
|.endif

|.actionlist dasm_actions
|.globals ir_lb
|.section code, cold_code, rodata, jmp_table
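/*
 * A note on the DynASM syntax used throughout this file: lines starting with
 * "|" are assembler templates and directives, while lines starting with "||"
 * inside ".macro" bodies are passed through as C code at each expansion site.
 * ".actionlist" names the generated action-list array, ".globals" the global
 * label enum (prefixed with "ir_lb"), and ".section" declares the sections
 * that code and data are emitted into.
 */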

#ifdef IR_DEBUG
typedef struct _ir_mem {uint64_t v;} ir_mem;

# define IR_MEM_VAL(loc)            ((loc).v)
#else
typedef uint64_t ir_mem;

# define IR_MEM_VAL(loc)            (loc)
#endif

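/*
 * An ir_mem value packs a complete x86 memory operand into 64 bits: the
 * signed 32-bit displacement occupies bits 0..31, the base register bits
 * 32..39, the index register bits 40..47 and the scale bits 48..55.
 * For example, IR_MEM_BO(IR_REG_RAX, 8) describes [rax+8].
 */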
#define IR_MEM_OFFSET(loc)          ((int32_t)(IR_MEM_VAL(loc) & 0xffffffff))
#define IR_MEM_BASE(loc)            ((ir_reg)((IR_MEM_VAL(loc) >> 32) & 0xff))
#define IR_MEM_INDEX(loc)           ((ir_reg)((IR_MEM_VAL(loc) >> 40) & 0xff))
#define IR_MEM_SCALE(loc)           ((int32_t)((IR_MEM_VAL(loc) >> 48) & 0xff))

#define IR_MEM_O(addr)            IR_MEM(IR_REG_NONE, addr, IR_REG_NONE, 1)
#define IR_MEM_B(base)            IR_MEM(base, 0, IR_REG_NONE, 1)
#define IR_MEM_BO(base, offset)   IR_MEM(base, offset, IR_REG_NONE, 1)

IR_ALWAYS_INLINE ir_mem IR_MEM(ir_reg base, int32_t offset, ir_reg index, int32_t scale)
{
	ir_mem mem;
	IR_ASSERT(base == IR_REG_NONE || (base >= IR_REG_GP_FIRST && base <= IR_REG_GP_LAST));
	IR_ASSERT(index == IR_REG_NONE || (index >= IR_REG_GP_FIRST && index <= IR_REG_GP_LAST));
	IR_ASSERT(scale == 1 || scale == 2 || scale == 4 || scale == 8);
#ifdef IR_DEBUG
	mem.v =
#else
	mem =
#endif
		((uint64_t)(uint32_t)offset |
		((uint64_t)(uint8_t)base << 32) |
		((uint64_t)(uint8_t)index << 40) |
		((uint64_t)(uint8_t)scale << 48));
	return mem;
}

#define IR_IS_SIGNED_32BIT(val)     ((((intptr_t)(val)) <= 0x7fffffff) && (((intptr_t)(val)) >= (-2147483647 - 1)))
#define IR_IS_SIGNED_NEG_32BIT(val) ((((intptr_t)(val)) <= 0x7fffffff) && (((intptr_t)(val)) >= -2147483647))
#define IR_IS_UNSIGNED_32BIT(val)   (((uintptr_t)(val)) <= 0xffffffff)
#define IR_IS_32BIT(type, val)      (IR_IS_TYPE_SIGNED(type) ? IR_IS_SIGNED_32BIT((val).i64) : IR_IS_UNSIGNED_32BIT((val).u64))
#define IR_IS_FP_ZERO(insn)         ((insn.type == IR_DOUBLE) ? (insn.val.u64 == 0) : (insn.val.u32 == 0))
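/*
 * True when "addr" is within a signed 32-bit displacement of both ends of the
 * code buffer, i.e. reachable with a rip-relative disp32 from any instruction
 * emitted into that buffer.
 */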
#define IR_MAY_USE_32BIT_ADDR(code_buffer, addr) \
	((code_buffer) && \
	IR_IS_SIGNED_32BIT((char*)(addr) - (char*)(code_buffer)->start) && \
	IR_IS_SIGNED_32BIT((char*)(addr) - ((char*)(code_buffer)->end)))

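/*
 * Translates a spill-slot position into a stack-frame offset: relative to the
 * frame pointer when one is used, otherwise relative to the stack pointer,
 * skipping the area reserved for outgoing call arguments.
 */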
#define IR_SPILL_POS_TO_OFFSET(offset) \
	((ctx->flags & IR_USE_FRAME_POINTER) ? \
		((offset) - (ctx->stack_frame_size - ctx->stack_frame_alignment)) : \
		((offset) + ctx->call_stack_size))

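/*
 * DynASM must know the exact shape of a memory operand when a template is
 * expanded, so the ASM_EXPAND_*_MEM helpers below decode the packed ir_mem at
 * run time and branch to a dedicated template for every base/index/scale
 * combination.  The ASM_EXPAND_TYPE_* helpers do the same for operand sizes,
 * with the qword forms available only on x64.
 */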
|.macro ASM_EXPAND_OP_MEM, MACRO, op, type, op1
||	do {
||		int32_t offset = IR_MEM_OFFSET(op1);
||		int32_t base = IR_MEM_BASE(op1);
||		int32_t index = IR_MEM_INDEX(op1);
||		int32_t scale = IR_MEM_SCALE(op1);
||		if (index == IR_REG_NONE) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, [offset]
||			} else {
|				MACRO op, type, [Ra(base)+offset]
||			}
||		} else if (scale == 8) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, [Ra(index)*8+offset]
||			} else {
|				MACRO op, type, [Ra(base)+Ra(index)*8+offset]
||			}
||		} else if (scale == 4) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, [Ra(index)*4+offset]
||			} else {
|				MACRO op, type, [Ra(base)+Ra(index)*4+offset]
||			}
||		} else if (scale == 2) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, [Ra(index)*2+offset]
||			} else {
|				MACRO op, type, [Ra(base)+Ra(index)*2+offset]
||			}
||		} else {
||			IR_ASSERT(scale == 1);
||			if (base == IR_REG_NONE) {
|				MACRO op, type, [Ra(index)+offset]
||			} else {
|				MACRO op, type, [Ra(base)+Ra(index)+offset]
||			}
||		}
||	} while (0);
|.endmacro

|.macro ASM_EXPAND_OP1_MEM, MACRO, op, type, op1, op2
||	do {
||		int32_t offset = IR_MEM_OFFSET(op1);
||		int32_t base = IR_MEM_BASE(op1);
||		int32_t index = IR_MEM_INDEX(op1);
||		int32_t scale = IR_MEM_SCALE(op1);
||		if (index == IR_REG_NONE) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, [offset], op2
||			} else {
|				MACRO op, type, [Ra(base)+offset], op2
||			}
||		} else if (scale == 8) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, [Ra(index)*8+offset], op2
||			} else {
|				MACRO op, type, [Ra(base)+Ra(index)*8+offset], op2
||			}
||		} else if (scale == 4) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, [Ra(index)*4+offset], op2
||			} else {
|				MACRO op, type, [Ra(base)+Ra(index)*4+offset], op2
||			}
||		} else if (scale == 2) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, [Ra(index)*2+offset], op2
||			} else {
|				MACRO op, type, [Ra(base)+Ra(index)*2+offset], op2
||			}
||		} else {
||			IR_ASSERT(scale == 1);
||			if (base == IR_REG_NONE) {
|				MACRO op, type, [Ra(index)+offset], op2
||			} else {
|				MACRO op, type, [Ra(base)+Ra(index)+offset], op2
||			}
||		}
||	} while (0);
|.endmacro

|.macro ASM_EXPAND_OP2_MEM, MACRO, op, type, op1, op2
||	do {
||		int32_t offset = IR_MEM_OFFSET(op2);
||		int32_t base = IR_MEM_BASE(op2);
||		int32_t index = IR_MEM_INDEX(op2);
||		int32_t scale = IR_MEM_SCALE(op2);
||		if (index == IR_REG_NONE) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, [offset]
||			} else {
|				MACRO op, type, op1, [Ra(base)+offset]
||			}
||		} else if (scale == 8) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, [Ra(index)*8+offset]
||			} else {
|				MACRO op, type, op1, [Ra(base)+Ra(index)*8+offset]
||			}
||		} else if (scale == 4) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, [Ra(index)*4+offset]
||			} else {
|				MACRO op, type, op1, [Ra(base)+Ra(index)*4+offset]
||			}
||		} else if (scale == 2) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, [Ra(index)*2+offset]
||			} else {
|				MACRO op, type, op1, [Ra(base)+Ra(index)*2+offset]
||			}
||		} else {
||			IR_ASSERT(scale == 1);
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, [Ra(index)+offset]
||			} else {
|				MACRO op, type, op1, [Ra(base)+Ra(index)+offset]
||			}
||		}
||	} while (0);
|.endmacro

|.macro ASM_EXPAND_OP2_MEM_3, MACRO, op, type, op1, op2, op3
||	do {
||		int32_t offset = IR_MEM_OFFSET(op2);
||		int32_t base = IR_MEM_BASE(op2);
||		int32_t index = IR_MEM_INDEX(op2);
||		int32_t scale = IR_MEM_SCALE(op2);
||		if (index == IR_REG_NONE) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, [offset], op3
||			} else {
|				MACRO op, type, op1, [Ra(base)+offset], op3
||			}
||		} else if (scale == 8) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, [Ra(index)*8+offset], op3
||			} else {
|				MACRO op, type, op1, [Ra(base)+Ra(index)*8+offset], op3
||			}
||		} else if (scale == 4) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, [Ra(index)*4+offset], op3
||			} else {
|				MACRO op, type, op1, [Ra(base)+Ra(index)*4+offset], op3
||			}
||		} else if (scale == 2) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, [Ra(index)*2+offset], op3
||			} else {
|				MACRO op, type, op1, [Ra(base)+Ra(index)*2+offset], op3
||			}
||		} else {
||			IR_ASSERT(scale == 1);
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, [Ra(index)+offset], op3
||			} else {
|				MACRO op, type, op1, [Ra(base)+Ra(index)+offset], op3
||			}
||		}
||	} while (0);
|.endmacro

|.macro ASM_EXPAND_OP3_MEM, MACRO, op, type, op1, op2, op3
||	do {
||		int32_t offset = IR_MEM_OFFSET(op3);
||		int32_t base = IR_MEM_BASE(op3);
||		int32_t index = IR_MEM_INDEX(op3);
||		int32_t scale = IR_MEM_SCALE(op3);
||		if (index == IR_REG_NONE) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, op2, [offset]
||			} else {
|				MACRO op, type, op1, op2, [Ra(base)+offset]
||			}
||		} else if (scale == 8) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, op2, [Ra(index)*8+offset]
||			} else {
|				MACRO op, type, op1, op2, [Ra(base)+Ra(index)*8+offset]
||			}
||		} else if (scale == 4) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, op2, [Ra(index)*4+offset]
||			} else {
|				MACRO op, type, op1, op2, [Ra(base)+Ra(index)*4+offset]
||			}
||		} else if (scale == 2) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, op2, [Ra(index)*2+offset]
||			} else {
|				MACRO op, type, op1, op2, [Ra(base)+Ra(index)*2+offset]
||			}
||		} else {
||			IR_ASSERT(scale == 1);
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, op2, [Ra(index)+offset]
||			} else {
|				MACRO op, type, op1, op2, [Ra(base)+Ra(index)+offset]
||			}
||		}
||	} while (0);
|.endmacro

|.macro ASM_EXPAND_TYPE_MEM, op, type, op1
||	switch (ir_type_size[type]) {
||		default:
||			IR_ASSERT(0);
||		case 1:
|			op byte op1
||			break;
||		case 2:
|			op word op1
||			break;
||		case 4:
|			op dword op1
||			break;
|.if X64
||		case 8:
|			op qword op1
||			break;
|.endif
||	}
|.endmacro

|.macro ASM_EXPAND_TYPE_MEM_REG, op, type, op1, op2
||	switch (ir_type_size[type]) {
||		default:
||			IR_ASSERT(0);
||		case 1:
|			op byte op1, Rb(op2)
||			break;
||		case 2:
|			op word op1, Rw(op2)
||			break;
||		case 4:
|			op dword op1, Rd(op2)
||			break;
|.if X64
||		case 8:
|			op qword op1, Rq(op2)
||			break;
|.endif
||	}
|.endmacro

|.macro ASM_EXPAND_TYPE_MEM_TXT, op, type, op1, op2
||	switch (ir_type_size[type]) {
||		default:
||			IR_ASSERT(0);
||		case 1:
|			op byte op1, op2
||			break;
||		case 2:
|			op word op1, op2
||			break;
||		case 4:
|			op dword op1, op2
||			break;
|.if X64
||		case 8:
|			op qword op1, op2
||			break;
|.endif
||	}
|.endmacro

|.macro ASM_EXPAND_TYPE_MEM_IMM, op, type, op1, op2
||	switch (ir_type_size[type]) {
||		default:
||			IR_ASSERT(0);
||		case 1:
|			op byte op1, (op2 & 0xff)
||			break;
||		case 2:
|			op word op1, (op2 & 0xffff)
||			break;
||		case 4:
|			op dword op1, op2
||			break;
|.if X64
||		case 8:
|			op qword op1, op2
||			break;
|.endif
||	}
|.endmacro

|.macro ASM_EXPAND_TYPE_REG_MEM, op, type, op1, op2
||	switch (ir_type_size[type]) {
||		default:
||			IR_ASSERT(0);
||		case 1:
|			op Rb(op1), byte op2
||			break;
||		case 2:
|			op Rw(op1), word op2
||			break;
||		case 4:
|			op Rd(op1), dword op2
||			break;
|.if X64
||		case 8:
|			op Rq(op1), qword op2
||			break;
|.endif
||	}
|.endmacro

|.macro ASM_TMEM_OP, op, type, op1
||	do {
||		int32_t offset = IR_MEM_OFFSET(op1);
||		int32_t base = IR_MEM_BASE(op1);
||		int32_t index = IR_MEM_INDEX(op1);
||		int32_t scale = IR_MEM_SCALE(op1);
||		if (index == IR_REG_NONE) {
||			if (base == IR_REG_NONE) {
|				op type [offset]
||			} else {
|				op type [Ra(base)+offset]
||			}
||		} else if (scale == 8) {
||			if (base == IR_REG_NONE) {
|				op type [Ra(index)*8+offset]
||			} else {
|				op type [Ra(base)+Ra(index)*8+offset]
||			}
||		} else if (scale == 4) {
||			if (base == IR_REG_NONE) {
|				op type [Ra(index)*4+offset]
||			} else {
|				op type [Ra(base)+Ra(index)*4+offset]
||			}
||		} else if (scale == 2) {
||			if (base == IR_REG_NONE) {
|				op type [Ra(index)*2+offset]
||			} else {
|				op type [Ra(base)+Ra(index)*2+offset]
||			}
||		} else {
||			IR_ASSERT(scale == 1);
||			if (base == IR_REG_NONE) {
|				op type [Ra(index)+offset]
||			} else {
|				op type [Ra(base)+Ra(index)+offset]
||			}
||		}
||	} while (0);
|.endmacro

|.macro ASM_TXT_TMEM_OP, op, op1, type, op2
||	do {
||		int32_t offset = IR_MEM_OFFSET(op2);
||		int32_t base = IR_MEM_BASE(op2);
||		int32_t index = IR_MEM_INDEX(op2);
||		int32_t scale = IR_MEM_SCALE(op2);
||		if (index == IR_REG_NONE) {
||			if (base == IR_REG_NONE) {
|				op op1, type [offset]
||			} else {
|				op op1, type [Ra(base)+offset]
||			}
||		} else if (scale == 8) {
||			if (base == IR_REG_NONE) {
|				op op1, type [Ra(index)*8+offset]
||			} else {
|				op op1, type [Ra(base)+Ra(index)*8+offset]
||			}
||		} else if (scale == 4) {
||			if (base == IR_REG_NONE) {
|				op op1, type [Ra(index)*4+offset]
||			} else {
|				op op1, type [Ra(base)+Ra(index)*4+offset]
||			}
||		} else if (scale == 2) {
||			if (base == IR_REG_NONE) {
|				op op1, type [Ra(index)*2+offset]
||			} else {
|				op op1, type [Ra(base)+Ra(index)*2+offset]
||			}
||		} else {
||			IR_ASSERT(scale == 1);
||			if (base == IR_REG_NONE) {
|				op op1, type [Ra(index)+offset]
||			} else {
|				op op1, type [Ra(base)+Ra(index)+offset]
||			}
||		}
||	} while (0);
|.endmacro

|.macro ASM_TMEM_TXT_OP, op, type, op1, op2
||	do {
||		int32_t offset = IR_MEM_OFFSET(op1);
||		int32_t base = IR_MEM_BASE(op1);
||		int32_t index = IR_MEM_INDEX(op1);
||		int32_t scale = IR_MEM_SCALE(op1);
||		if (index == IR_REG_NONE) {
||			if (base == IR_REG_NONE) {
|				op type [offset], op2
||			} else {
|				op type [Ra(base)+offset], op2
||			}
||		} else if (scale == 8) {
||			if (base == IR_REG_NONE) {
|				op type [Ra(index)*8+offset], op2
||			} else {
|				op type [Ra(base)+Ra(index)*8+offset], op2
||			}
||		} else if (scale == 4) {
||			if (base == IR_REG_NONE) {
|				op type [Ra(index)*4+offset], op2
||			} else {
|				op type [Ra(base)+Ra(index)*4+offset], op2
||			}
||		} else if (scale == 2) {
||			if (base == IR_REG_NONE) {
|				op type [Ra(index)*2+offset], op2
||			} else {
|				op type [Ra(base)+Ra(index)*2+offset], op2
||			}
||		} else {
||			IR_ASSERT(scale == 1);
||			if (base == IR_REG_NONE) {
|				op type [Ra(index)+offset], op2
||			} else {
|				op type [Ra(base)+Ra(index)+offset], op2
||			}
||		}
||	} while (0);
|.endmacro

|.macro ASM_TXT_TXT_TMEM_OP, op, op1, op2, type, op3
||	do {
||		int32_t offset = IR_MEM_OFFSET(op3);
||		int32_t base = IR_MEM_BASE(op3);
||		int32_t index = IR_MEM_INDEX(op3);
||		int32_t scale = IR_MEM_SCALE(op3);
||		if (index == IR_REG_NONE) {
||			if (base == IR_REG_NONE) {
|				op op1, op2, type [offset]
||			} else {
|				op op1, op2, type [Ra(base)+offset]
||			}
||		} else if (scale == 8) {
||			if (base == IR_REG_NONE) {
|				op op1, op2, type [Ra(index)*8+offset]
||			} else {
|				op op1, op2, type [Ra(base)+Ra(index)*8+offset]
||			}
||		} else if (scale == 4) {
||			if (base == IR_REG_NONE) {
|				op op1, op2, type [Ra(index)*4+offset]
||			} else {
|				op op1, op2, type [Ra(base)+Ra(index)*4+offset]
||			}
||		} else if (scale == 2) {
||			if (base == IR_REG_NONE) {
|				op op1, op2, type [Ra(index)*2+offset]
||			} else {
|				op op1, op2, type [Ra(base)+Ra(index)*2+offset]
||			}
||		} else {
||			IR_ASSERT(scale == 1);
||			if (base == IR_REG_NONE) {
|				op op1, op2, type [Ra(index)+offset]
||			} else {
|				op op1, op2, type [Ra(base)+Ra(index)+offset]
||			}
||		}
||	} while (0);
|.endmacro

|.macro ASM_REG_OP, op, type, op1
||	switch (ir_type_size[type]) {
||		default:
||			IR_ASSERT(0);
||		case 1:
|			op Rb(op1)
||			break;
||		case 2:
|			op Rw(op1)
||			break;
||		case 4:
|			op Rd(op1)
||			break;
|.if X64
||		case 8:
|			op Rq(op1)
||			break;
|.endif
||	}
|.endmacro

|.macro ASM_MEM_OP, op, type, op1
|	ASM_EXPAND_OP_MEM ASM_EXPAND_TYPE_MEM, op, type, op1
|.endmacro

|.macro ASM_REG_REG_OP, op, type, op1, op2
||	switch (ir_type_size[type]) {
||		default:
||			IR_ASSERT(0);
||		case 1:
|			op Rb(op1), Rb(op2)
||			break;
||		case 2:
|			op Rw(op1), Rw(op2)
||			break;
||		case 4:
|			op Rd(op1), Rd(op2)
||			break;
|.if X64
||		case 8:
|			op Rq(op1), Rq(op2)
||			break;
|.endif
||	}
|.endmacro

|.macro ASM_REG_REG_OP2, op, type, op1, op2
||	switch (ir_type_size[type]) {
||		default:
||			IR_ASSERT(0);
||		case 1:
||		case 2:
|			op Rw(op1), Rw(op2)
||			break;
||		case 4:
|			op Rd(op1), Rd(op2)
||			break;
|.if X64
||		case 8:
|			op Rq(op1), Rq(op2)
||			break;
|.endif
||	}
|.endmacro

|.macro ASM_REG_TXT_OP, op, type, op1, op2
||	switch (ir_type_size[type]) {
||		default:
||			IR_ASSERT(0);
||		case 1:
|			op Rb(op1), op2
||			break;
||		case 2:
|			op Rw(op1), op2
||			break;
||		case 4:
|			op Rd(op1), op2
||			break;
|.if X64
||		case 8:
|			op Rq(op1), op2
||			break;
|.endif
||	}
|.endmacro

|.macro ASM_REG_IMM_OP, op, type, op1, op2
||	switch (ir_type_size[type]) {
||		default:
||			IR_ASSERT(0);
||		case 1:
|			op Rb(op1), (op2 & 0xff)
||			break;
||		case 2:
|			op Rw(op1), (op2 & 0xffff)
||			break;
||		case 4:
|			op Rd(op1), op2
||			break;
|.if X64
||		case 8:
|			op Rq(op1), op2
||			break;
|.endif
||	}
|.endmacro

|.macro ASM_MEM_REG_OP, op, type, op1, op2
|	ASM_EXPAND_OP1_MEM ASM_EXPAND_TYPE_MEM_REG, op, type, op1, op2
|.endmacro

|.macro ASM_MEM_TXT_OP, op, type, op1, op2
|	ASM_EXPAND_OP1_MEM ASM_EXPAND_TYPE_MEM_TXT, op, type, op1, op2
|.endmacro

|.macro ASM_MEM_IMM_OP, op, type, op1, op2
|	ASM_EXPAND_OP1_MEM ASM_EXPAND_TYPE_MEM_IMM, op, type, op1, op2
|.endmacro

|.macro ASM_REG_MEM_OP, op, type, op1, op2
|	ASM_EXPAND_OP2_MEM ASM_REG_TXT_OP, op, type, op1, op2
|.endmacro

|.macro ASM_REG_REG_MUL, op, type, op1, op2
||	switch (ir_type_size[type]) {
||		default:
||			IR_ASSERT(0);
||		case 2:
|			op Rw(op1), Rw(op2)
||			break;
||		case 4:
|			op Rd(op1), Rd(op2)
||			break;
|.if X64
||		case 8:
|			op Rq(op1), Rq(op2)
||			break;
|.endif
||	}
|.endmacro

|.macro ASM_REG_IMM_MUL, op, type, op1, op2
||	switch (ir_type_size[type]) {
||		default:
||			IR_ASSERT(0);
||		case 2:
|			op Rw(op1), op2
||			break;
||		case 4:
|			op Rd(op1), op2
||			break;
|.if X64
||		case 8:
|			op Rq(op1), op2
||			break;
|.endif
||	}
|.endmacro

|.macro ASM_REG_TXT_MUL, op, type, op1, op2
||	switch (ir_type_size[type]) {
||		default:
||			IR_ASSERT(0);
||		case 2:
|			op Rw(op1), op2
||			break;
||		case 4:
|			op Rd(op1), op2
||			break;
|.if X64
||		case 8:
|			op Rq(op1), op2
||			break;
|.endif
||	}
|.endmacro

|.macro ASM_REG_MEM_MUL, op, type, op1, op2
|	ASM_EXPAND_OP2_MEM ASM_REG_TXT_MUL, op, type, op1, op2
|.endmacro

|.macro ASM_REG_TXT_TXT_MUL, op, type, op1, op2, op3
||	switch (ir_type_size[type]) {
||		default:
||			IR_ASSERT(0);
||		case 2:
|			op Rw(op1), op2, op3
||			break;
||		case 4:
|			op Rd(op1), op2, op3
||			break;
|.if X64
||		case 8:
|			op Rq(op1), op2, op3
||			break;
|.endif
||	}
|.endmacro

|.macro ASM_REG_MEM_TXT_MUL, op, type, op1, op2, op3
|	ASM_EXPAND_OP2_MEM_3 ASM_REG_TXT_TXT_MUL, imul, type, op1, op2, op3
|.endmacro

|.macro ASM_SSE2_REG_REG_OP, op, type, op1, op2
||	if (type == IR_DOUBLE) {
|		op..d xmm(op1-IR_REG_FP_FIRST), xmm(op2-IR_REG_FP_FIRST)
||	} else {
||		IR_ASSERT(type == IR_FLOAT);
|		op..s xmm(op1-IR_REG_FP_FIRST), xmm(op2-IR_REG_FP_FIRST)
||	}
|.endmacro

|.macro ASM_SSE2_REG_TXT_OP, op, type, op1, op2
||	if (type == IR_DOUBLE) {
|		op..d xmm(op1-IR_REG_FP_FIRST), qword op2
||	} else {
||		IR_ASSERT(type == IR_FLOAT);
|		op..s xmm(op1-IR_REG_FP_FIRST), dword op2
||	}
|.endmacro

|.macro ASM_SSE2_REG_MEM_OP, op, type, op1, op2
|	ASM_EXPAND_OP2_MEM ASM_SSE2_REG_TXT_OP, op, type, op1, op2
|.endmacro

|.macro ASM_AVX_REG_REG_REG_OP, op, type, op1, op2, op3
||	if (type == IR_DOUBLE) {
|		op..d xmm(op1-IR_REG_FP_FIRST), xmm(op2-IR_REG_FP_FIRST), xmm(op3-IR_REG_FP_FIRST)
||	} else {
||		IR_ASSERT(type == IR_FLOAT);
|		op..s xmm(op1-IR_REG_FP_FIRST), xmm(op2-IR_REG_FP_FIRST), xmm(op3-IR_REG_FP_FIRST)
||	}
|.endmacro

|.macro ASM_AVX_REG_REG_TXT_OP, op, type, op1, op2, op3
||	if (type == IR_DOUBLE) {
|		op..d xmm(op1-IR_REG_FP_FIRST), xmm(op2-IR_REG_FP_FIRST), qword op3
||	} else {
||		IR_ASSERT(type == IR_FLOAT);
|		op..s xmm(op1-IR_REG_FP_FIRST), xmm(op2-IR_REG_FP_FIRST), dword op3
||	}
|.endmacro

|.macro ASM_AVX_REG_REG_MEM_OP, op, type, op1, op2, op3
|	ASM_EXPAND_OP3_MEM ASM_AVX_REG_REG_TXT_OP, op, type, op1, op2, op3
|.endmacro

|.macro ASM_FP_REG_REG_OP, op, type, op1, op2
||	if (ctx->mflags & IR_X86_AVX) {
|		ASM_SSE2_REG_REG_OP v..op, type, op1, op2
||	} else {
|		ASM_SSE2_REG_REG_OP op, type, op1, op2
||	}
|.endmacro

|.macro ASM_FP_TXT_REG_OP, op, type, dst, src
||	if (type == IR_DOUBLE) {
||		if (ctx->mflags & IR_X86_AVX) {
|			v..op..d qword dst, xmm(src-IR_REG_FP_FIRST)
||		} else {
|			op..d qword dst, xmm(src-IR_REG_FP_FIRST)
||		}
||	} else {
||		IR_ASSERT(type == IR_FLOAT);
||		if (ctx->mflags & IR_X86_AVX) {
|			v..op..s dword dst, xmm(src-IR_REG_FP_FIRST)
||		} else {
|			op..s dword dst, xmm(src-IR_REG_FP_FIRST)
||		}
||	}
|.endmacro

|.macro ASM_FP_MEM_REG_OP, op, type, op1, op2
|	ASM_EXPAND_OP1_MEM ASM_FP_TXT_REG_OP, op, type, op1, op2
|.endmacro

|.macro ASM_FP_REG_TXT_OP, op, type, op1, op2
||	if (ctx->mflags & IR_X86_AVX) {
|		ASM_SSE2_REG_TXT_OP v..op, type, op1, op2
||	} else {
|		ASM_SSE2_REG_TXT_OP op, type, op1, op2
||	}
|.endmacro

|.macro ASM_FP_REG_MEM_OP, op, type, op1, op2
||	if (ctx->mflags & IR_X86_AVX) {
|		ASM_SSE2_REG_MEM_OP v..op, type, op1, op2
||	} else {
|		ASM_SSE2_REG_MEM_OP op, type, op1, op2
||	}
|.endmacro

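/*
 * Per-compilation state of the x86 backend: the DynASM assembler state,
 * constants that still have to be emitted into the rodata section, labels of
 * the rodata/jump-table sections, and flags recording which FP constants
 * (sign and abs masks, zero) have been referenced and must be materialized.
 */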
typedef struct _ir_backend_data {
	ir_reg_alloc_data  ra_data;
	uint32_t           dessa_from_block;
	dasm_State        *dasm_state;
	ir_bitset          emit_constants;
	int                rodata_label, jmp_table_label;
	bool               double_neg_const;
	bool               float_neg_const;
	bool               double_abs_const;
	bool               float_abs_const;
	bool               double_zero_const;
} ir_backend_data;

#define IR_GP_REG_NAME(code, name64, name32, name16, name8, name8h) \
	#name64,
#define IR_GP_REG_NAME32(code, name64, name32, name16, name8, name8h) \
	#name32,
#define IR_GP_REG_NAME16(code, name64, name32, name16, name8, name8h) \
	#name16,
#define IR_GP_REG_NAME8(code, name64, name32, name16, name8, name8h) \
	#name8,
#define IR_FP_REG_NAME(code, name) \
	#name,

static const char *_ir_reg_name[IR_REG_NUM] = {
	IR_GP_REGS(IR_GP_REG_NAME)
	IR_FP_REGS(IR_FP_REG_NAME)
};

static const char *_ir_reg_name32[IR_REG_NUM] = {
	IR_GP_REGS(IR_GP_REG_NAME32)
};

static const char *_ir_reg_name16[IR_REG_NUM] = {
	IR_GP_REGS(IR_GP_REG_NAME16)
};

static const char *_ir_reg_name8[IR_REG_NUM] = {
	IR_GP_REGS(IR_GP_REG_NAME8)
};

/* Calling Convention */
#ifdef _WIN64

static const int8_t _ir_int_reg_params[IR_REG_INT_ARGS] = {
	IR_REG_INT_ARG1,
	IR_REG_INT_ARG2,
	IR_REG_INT_ARG3,
	IR_REG_INT_ARG4,
};

static const int8_t _ir_fp_reg_params[IR_REG_FP_ARGS] = {
	IR_REG_FP_ARG1,
	IR_REG_FP_ARG2,
	IR_REG_FP_ARG3,
	IR_REG_FP_ARG4,
};

#elif defined(IR_TARGET_X64)

static const int8_t _ir_int_reg_params[IR_REG_INT_ARGS] = {
	IR_REG_INT_ARG1,
	IR_REG_INT_ARG2,
	IR_REG_INT_ARG3,
	IR_REG_INT_ARG4,
	IR_REG_INT_ARG5,
	IR_REG_INT_ARG6,
};

static const int8_t _ir_fp_reg_params[IR_REG_FP_ARGS] = {
	IR_REG_FP_ARG1,
	IR_REG_FP_ARG2,
	IR_REG_FP_ARG3,
	IR_REG_FP_ARG4,
	IR_REG_FP_ARG5,
	IR_REG_FP_ARG6,
	IR_REG_FP_ARG7,
	IR_REG_FP_ARG8,
};

#else

static const int8_t *_ir_int_reg_params = NULL;
static const int8_t *_ir_fp_reg_params = NULL;
static const int8_t _ir_int_fc_reg_params[IR_REG_INT_FCARGS] = {
	IR_REG_INT_FCARG1,
	IR_REG_INT_FCARG2,
};
static const int8_t *_ir_fp_fc_reg_params = NULL;

#endif

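/*
 * Returns a printable register name sized according to "type": a 4-byte
 * integer type, for instance, selects the 32-bit name table ("eax" rather
 * than "rax").  For IR_VOID the full-width GP name (or the xmm name for FP
 * registers) is used.
 */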
const char *ir_reg_name(int8_t reg, ir_type type)
{
	if (reg >= IR_REG_NUM) {
		if (reg == IR_REG_SCRATCH) {
			return "SCRATCH";
		} else {
			IR_ASSERT(reg == IR_REG_ALL);
			return "ALL";
		}
	}
	IR_ASSERT(reg >= 0 && reg < IR_REG_NUM);
	if (type == IR_VOID) {
		type = (reg < IR_REG_FP_FIRST) ? IR_ADDR : IR_DOUBLE;
	}
	if (IR_IS_TYPE_FP(type) || ir_type_size[type] == 8) {
		return _ir_reg_name[reg];
	} else if (ir_type_size[type] == 4) {
		return _ir_reg_name32[reg];
	} else if (ir_type_size[type] == 2) {
		return _ir_reg_name16[reg];
	} else {
		IR_ASSERT(ir_type_size[type] == 1);
		return _ir_reg_name8[reg];
	}
}

#define IR_RULES(_)        \
	_(CMP_INT)             \
	_(CMP_FP)              \
	_(MUL_INT)             \
	_(DIV_INT)             \
	_(MOD_INT)             \
	_(TEST_INT)            \
	_(SETCC_INT)           \
	_(TESTCC_INT)          \
	_(LEA_OB)              \
	_(LEA_SI)              \
	_(LEA_SIB)             \
	_(LEA_IB)              \
	_(LEA_SI_O)            \
	_(LEA_SIB_O)           \
	_(LEA_IB_O)            \
	_(LEA_I_OB)            \
	_(LEA_OB_I)            \
	_(LEA_OB_SI)           \
	_(LEA_SI_OB)           \
	_(LEA_B_SI)            \
	_(LEA_SI_B)            \
	_(INC)                 \
	_(DEC)                 \
	_(MUL_PWR2)            \
	_(DIV_PWR2)            \
	_(MOD_PWR2)            \
	_(SDIV_PWR2)           \
	_(SMOD_PWR2)           \
	_(BOOL_NOT_INT)        \
	_(ABS_INT)             \
	_(OP_INT)              \
	_(OP_FP)               \
	_(IMUL3)               \
	_(BINOP_INT)           \
	_(BINOP_SSE2)          \
	_(BINOP_AVX)           \
	_(SHIFT)               \
	_(SHIFT_CONST)         \
	_(COPY_INT)            \
	_(COPY_FP)             \
	_(CMP_AND_BRANCH_INT)  \
	_(CMP_AND_BRANCH_FP)   \
	_(TEST_AND_BRANCH_INT) \
	_(JCC_INT)             \
	_(GUARD_CMP_INT)       \
	_(GUARD_CMP_FP)        \
	_(GUARD_TEST_INT)      \
	_(GUARD_JCC_INT)       \
	_(GUARD_OVERFLOW)      \
	_(OVERFLOW_AND_BRANCH) \
	_(MIN_MAX_INT)         \
	_(MEM_OP_INT)          \
	_(MEM_INC)             \
	_(MEM_DEC)             \
	_(MEM_MUL_PWR2)        \
	_(MEM_DIV_PWR2)        \
	_(MEM_MOD_PWR2)        \
	_(MEM_BINOP_INT)       \
	_(MEM_SHIFT)           \
	_(MEM_SHIFT_CONST)     \
	_(REG_BINOP_INT)       \
	_(VSTORE_INT)          \
	_(VSTORE_FP)           \
	_(LOAD_INT)            \
	_(LOAD_FP)             \
	_(STORE_INT)           \
	_(STORE_FP)            \
	_(IF_INT)              \
	_(RETURN_VOID)         \
	_(RETURN_INT)          \
	_(RETURN_FP)           \
	_(BIT_COUNT)           \

#define IR_RULE_ENUM(name) IR_ ## name,

enum _ir_rule {
	IR_FIRST_RULE = IR_LAST_OP,
	IR_RULES(IR_RULE_ENUM)
	IR_LAST_RULE
};
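/*
 * Rule numbers continue the opcode numbering (IR_FIRST_RULE == IR_LAST_OP),
 * so an entry of ctx->rules[] may hold either a plain opcode or a
 * backend-specific rule.  The upper bits can carry the IR_FUSED/IR_SIMPLE
 * modifiers, which are masked off with IR_RULE_MASK before dispatching.
 */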

#define IR_RULE_NAME(name)  #name,
const char *ir_rule_name[IR_LAST_OP] = {
	NULL,
	IR_RULES(IR_RULE_NAME)
	NULL
};

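/*
 * A constant address may be folded into an addressing mode only if it fits
 * into a sign-extended 32-bit displacement; on 32-bit targets every address
 * trivially qualifies.
 */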
static bool ir_may_fuse_addr(ir_ctx *ctx, const ir_insn *addr_insn)
{
	if (sizeof(void*) == 4) {
		return 1;
	} else if (IR_IS_SYM_CONST(addr_insn->op)) {
		void *addr = ir_sym_addr(ctx, addr_insn);

		if (!addr) {
			return 0;
		}
		return IR_IS_SIGNED_32BIT((int64_t)(intptr_t)addr);
	} else {
		return IR_IS_SIGNED_32BIT(addr_insn->val.i64);
	}
}

static bool ir_may_fuse_imm(ir_ctx *ctx, const ir_insn *val_insn)
{
	if (val_insn->type == IR_ADDR) {
		if (sizeof(void*) == 4) {
			return 1;
		} else if (IR_IS_SYM_CONST(val_insn->op)) {
			void *addr = ir_sym_addr(ctx, val_insn);

			if (!addr) {
				return 0;
			}
			return IR_IS_SIGNED_32BIT((intptr_t)addr);
		} else {
			return IR_IS_SIGNED_32BIT(val_insn->val.i64);
		}
	} else {
		return (ir_type_size[val_insn->type] <= 4 || IR_IS_SIGNED_32BIT(val_insn->val.i64));
	}
}

/* register allocation */
static int ir_add_const_tmp_reg(ir_ctx *ctx, ir_ref ref, uint32_t num, int n, ir_target_constraints *constraints)
{
	IR_ASSERT(IR_IS_CONST_REF(ref));
	const ir_insn *val_insn = &ctx->ir_base[ref];

	if (!ir_may_fuse_imm(ctx, val_insn)) {
		constraints->tmp_regs[n] = IR_TMP_REG(num, val_insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
		n++;
	}
	return n;
}

int ir_get_target_constraints(ir_ctx *ctx, ir_ref ref, ir_target_constraints *constraints)
{
	uint32_t rule = ir_rule(ctx, ref);
	const ir_insn *insn;
	int n = 0;
	int flags = IR_USE_MUST_BE_IN_REG | IR_OP1_MUST_BE_IN_REG | IR_OP2_MUST_BE_IN_REG | IR_OP3_MUST_BE_IN_REG;

	constraints->def_reg = IR_REG_NONE;
	constraints->hints_count = 0;
	switch (rule & IR_RULE_MASK) {
		case IR_BINOP_INT:
			insn = &ctx->ir_base[ref];
			if (rule & IR_FUSED) {
				if (ctx->ir_base[insn->op1].op == IR_RLOAD) {
					flags = IR_OP1_MUST_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG;
				} else {
					flags = IR_OP2_MUST_BE_IN_REG;
				}
			} else {
				flags = IR_DEF_REUSES_OP1_REG | IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG;
			}
			if (IR_IS_CONST_REF(insn->op2) && insn->op1 != insn->op2) {
				n = ir_add_const_tmp_reg(ctx, insn->op2, 2, n, constraints);
			}
			break;
		case IR_IMUL3:
			flags = IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG;
			break;
		case IR_SHIFT:
			if (rule & IR_FUSED) {
				flags = IR_OP2_MUST_BE_IN_REG;
			} else {
				flags = IR_DEF_REUSES_OP1_REG | IR_DEF_CONFLICTS_WITH_INPUT_REGS | IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG;
			}
			constraints->hints[1] = IR_REG_NONE;
			constraints->hints[2] = IR_REG_RCX;
			constraints->hints_count = 3;
			constraints->tmp_regs[0] = IR_SCRATCH_REG(IR_REG_RCX, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
			n = 1;
			break;
		case IR_MUL_INT:
			/* %rax - used as input and result */
			constraints->def_reg = IR_REG_RAX;
			constraints->hints[1] = IR_REG_RAX;
			constraints->hints_count = 2;
			flags = IR_USE_SHOULD_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG;
			constraints->tmp_regs[0] = IR_SCRATCH_REG(IR_REG_RDX, IR_USE_SUB_REF, IR_DEF_SUB_REF);
			constraints->tmp_regs[1] = IR_SCRATCH_REG(IR_REG_RAX, IR_LOAD_SUB_REF, IR_SAVE_SUB_REF);
			n = 2;
			break;
		case IR_DIV_INT:
			/* %rax - used as input and result */
			constraints->def_reg = IR_REG_RAX;
			constraints->hints[1] = IR_REG_RAX;
			constraints->hints_count = 2;
			flags = IR_USE_SHOULD_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG;
			constraints->tmp_regs[0] = IR_SCRATCH_REG(IR_REG_RDX, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
			constraints->tmp_regs[1] = IR_SCRATCH_REG(IR_REG_RAX, IR_LOAD_SUB_REF, IR_SAVE_SUB_REF);
			n = 2;
			goto op2_const;
		case IR_MOD_INT:
			constraints->def_reg = IR_REG_RDX;
			constraints->hints[1] = IR_REG_RAX;
			constraints->hints_count = 2;
			flags = IR_USE_SHOULD_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG;
			constraints->tmp_regs[0] = IR_SCRATCH_REG(IR_REG_RAX, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
			constraints->tmp_regs[1] = IR_SCRATCH_REG(IR_REG_RDX, IR_LOAD_SUB_REF, IR_SAVE_SUB_REF);
			n = 2;
			goto op2_const;
		case IR_MIN_MAX_INT:
			flags = IR_DEF_REUSES_OP1_REG | IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG | IR_OP2_MUST_BE_IN_REG;
op2_const:
			insn = &ctx->ir_base[ref];
			if (IR_IS_CONST_REF(insn->op2) && insn->op1 != insn->op2) {
				constraints->tmp_regs[n] = IR_TMP_REG(2, insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n++;
			}
			break;
		case IR_CMP_INT:
		case IR_TEST_INT:
			insn = &ctx->ir_base[ref];
			flags = IR_USE_MUST_BE_IN_REG | IR_OP1_MUST_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG;
			if (IR_IS_CONST_REF(insn->op1)) {
				const ir_insn *val_insn = &ctx->ir_base[insn->op1];
				constraints->tmp_regs[0] = IR_TMP_REG(1, val_insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n = 1;
			} else if (ir_rule(ctx, insn->op1) & IR_FUSED) {
				flags = IR_USE_MUST_BE_IN_REG | IR_OP2_MUST_BE_IN_REG;
			}
			if (IR_IS_CONST_REF(insn->op2) && insn->op1 != insn->op2) {
				flags = IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG;
				n = ir_add_const_tmp_reg(ctx, insn->op2, 2, n, constraints);
			}
			break;
		case IR_CMP_FP:
			insn = &ctx->ir_base[ref];
			if (!(rule & IR_FUSED)) {
				constraints->tmp_regs[0] = IR_TMP_REG(3, IR_BOOL, IR_DEF_SUB_REF, IR_SAVE_SUB_REF);
				n = 1;
			}
			flags = IR_USE_MUST_BE_IN_REG | IR_OP1_MUST_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG;
			if (IR_IS_CONST_REF(insn->op1)) {
				const ir_insn *val_insn = &ctx->ir_base[insn->op1];
				constraints->tmp_regs[n] = IR_TMP_REG(1, val_insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n++;
			}
			break;
		case IR_BINOP_AVX:
			flags = IR_USE_MUST_BE_IN_REG | IR_OP1_MUST_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG;
			insn = &ctx->ir_base[ref];
			if (IR_IS_CONST_REF(insn->op1)) {
				constraints->tmp_regs[0] = IR_TMP_REG(1, insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n = 1;
			}
			break;
		case IR_VSTORE_INT:
			flags = IR_OP3_MUST_BE_IN_REG;
			insn = &ctx->ir_base[ref];
			if (IR_IS_CONST_REF(insn->op3)) {
				n = ir_add_const_tmp_reg(ctx, insn->op3, 3, n, constraints);
			}
			break;
		case IR_STORE_INT:
			flags = IR_OP2_MUST_BE_IN_REG | IR_OP3_MUST_BE_IN_REG;
			insn = &ctx->ir_base[ref];
			if (IR_IS_CONST_REF(insn->op2)) {
				n = ir_add_const_tmp_reg(ctx, insn->op2, 2, n, constraints);
			}
			if (IR_IS_CONST_REF(insn->op3)) {
				n = ir_add_const_tmp_reg(ctx, insn->op3, 3, n, constraints);
			}
			break;
		case IR_VSTORE_FP:
			flags = IR_OP3_MUST_BE_IN_REG;
			insn = &ctx->ir_base[ref];
			if (IR_IS_CONST_REF(insn->op3)) {
				insn = &ctx->ir_base[insn->op3];
				constraints->tmp_regs[0] = IR_TMP_REG(3, insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n = 1;
			}
			break;
		case IR_LOAD_FP:
		case IR_LOAD_INT:
		case IR_MEM_OP_INT:
		case IR_MEM_INC:
		case IR_MEM_DEC:
		case IR_MEM_MUL_PWR2:
		case IR_MEM_DIV_PWR2:
		case IR_MEM_MOD_PWR2:
		case IR_MEM_BINOP_INT:
		case IR_MEM_SHIFT:
		case IR_MEM_SHIFT_CONST:
			flags = IR_USE_MUST_BE_IN_REG | IR_OP2_MUST_BE_IN_REG;
			insn = &ctx->ir_base[ref];
			if (IR_IS_CONST_REF(insn->op2)) {
				n = ir_add_const_tmp_reg(ctx, insn->op2, 2, n, constraints);
			}
			break;
		case IR_STORE_FP:
			flags = IR_OP2_MUST_BE_IN_REG | IR_OP3_MUST_BE_IN_REG;
			insn = &ctx->ir_base[ref];
			if (IR_IS_CONST_REF(insn->op2)) {
				n = ir_add_const_tmp_reg(ctx, insn->op2, 2, n, constraints);
			}
			if (IR_IS_CONST_REF(insn->op3)) {
				insn = &ctx->ir_base[insn->op3];
				constraints->tmp_regs[n] = IR_TMP_REG(3, insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n++;
			}
			break;
		case IR_SWITCH:
			flags = IR_OP2_MUST_BE_IN_REG;
			insn = &ctx->ir_base[ref];
			if (IR_IS_CONST_REF(insn->op2)) {
				insn = &ctx->ir_base[insn->op2];
				constraints->tmp_regs[0] = IR_TMP_REG(2, insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n = 1;
			}
			if (sizeof(void*) == 8) {
				constraints->tmp_regs[n] = IR_TMP_REG(3, IR_ADDR, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n++;
			}
			break;
		case IR_CALL:
			insn = &ctx->ir_base[ref];
			if (IR_IS_TYPE_INT(insn->type)) {
				constraints->def_reg = IR_REG_INT_RET1;
#ifdef IR_REG_FP_RET1
			} else {
				constraints->def_reg = IR_REG_FP_RET1;
#endif
			}
			constraints->tmp_regs[0] = IR_SCRATCH_REG(IR_REG_SCRATCH, IR_USE_SUB_REF, IR_DEF_SUB_REF);
			n = 1;
			IR_FALLTHROUGH;
		case IR_TAILCALL:
			insn = &ctx->ir_base[ref];
			if (insn->inputs_count > 2) {
				constraints->hints[2] = IR_REG_NONE;
				constraints->hints_count = ir_get_args_regs(ctx, insn, constraints->hints);
				if (!IR_IS_CONST_REF(insn->op2)) {
					constraints->tmp_regs[n] = IR_TMP_REG(1, IR_ADDR, IR_LOAD_SUB_REF, IR_USE_SUB_REF);
					n++;
				}
			}
			flags = IR_USE_SHOULD_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG | IR_OP3_SHOULD_BE_IN_REG;
			break;
		case IR_BINOP_SSE2:
			flags = IR_DEF_REUSES_OP1_REG | IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG;
			break;
		case IR_SHIFT_CONST:
		case IR_INC:
		case IR_DEC:
		case IR_MUL_PWR2:
		case IR_DIV_PWR2:
		case IR_OP_INT:
		case IR_OP_FP:
			flags = IR_DEF_REUSES_OP1_REG | IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG;
			break;
		case IR_MOD_PWR2:
			flags = IR_DEF_REUSES_OP1_REG | IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG;
			insn = &ctx->ir_base[ref];
			if (ir_type_size[insn->type] == 8) {
				int64_t offset = ctx->ir_base[insn->op2].val.u64 - 1;
				if (!IR_IS_SIGNED_32BIT(offset)) {
					constraints->tmp_regs[n] = IR_TMP_REG(2, insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
					n++;
				}
			}
			break;
		case IR_SMOD_PWR2:
			flags = IR_DEF_REUSES_OP1_REG | IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG;
			insn = &ctx->ir_base[ref];
			if (ir_type_size[insn->type] == 8) {
				int64_t offset = ctx->ir_base[insn->op2].val.u64 - 1;
				if (!IR_IS_SIGNED_32BIT(offset)) {
					constraints->tmp_regs[n] = IR_TMP_REG(2, insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
					n++;
				}
			}
			constraints->tmp_regs[n] = IR_TMP_REG(3, insn->type, IR_USE_SUB_REF, IR_SAVE_SUB_REF);
			n++;
			break;
		case IR_SDIV_PWR2:
			flags = IR_DEF_CONFLICTS_WITH_INPUT_REGS | IR_USE_MUST_BE_IN_REG | IR_OP1_MUST_BE_IN_REG;
			insn = &ctx->ir_base[ref];
			if (ir_type_size[insn->type] == 8) {
				int64_t offset = ctx->ir_base[insn->op2].val.u64 - 1;
				if (!IR_IS_SIGNED_32BIT(offset)) {
					constraints->tmp_regs[n] = IR_TMP_REG(2, insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
					n++;
				}
			}
			break;
		case IR_BIT_COUNT:
			insn = &ctx->ir_base[ref];
			if (ir_type_size[insn->type] == 1) {
				flags = IR_USE_MUST_BE_IN_REG | IR_OP1_MUST_BE_IN_REG;
			} else {
				flags = IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG;
			}
			if (IR_IS_CONST_REF(insn->op1)) {
				constraints->tmp_regs[0] = IR_TMP_REG(1, insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n = 1;
			}
			break;
		case IR_CTPOP:
			flags = IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG;
			insn = &ctx->ir_base[ref];
			constraints->tmp_regs[0] = IR_TMP_REG(2, insn->type, IR_USE_SUB_REF, IR_SAVE_SUB_REF);
			n = 1;
			if (ir_type_size[insn->type] == 8) {
				constraints->tmp_regs[1] = IR_TMP_REG(3, insn->type, IR_USE_SUB_REF, IR_SAVE_SUB_REF);
				n = 2;
			}
			break;
		case IR_COPY_INT:
		case IR_COPY_FP:
		case IR_SEXT:
		case IR_ZEXT:
		case IR_TRUNC:
		case IR_BITCAST:
		case IR_PROTO:
			flags = IR_DEF_REUSES_OP1_REG | IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG;
			break;
		case IR_ABS_INT:
			flags = IR_DEF_CONFLICTS_WITH_INPUT_REGS | IR_USE_MUST_BE_IN_REG | IR_OP1_MUST_BE_IN_REG;
			break;
		case IR_PARAM:
			constraints->def_reg = ir_get_param_reg(ctx, ref);
			flags = 0;
			break;
		case IR_PI:
		case IR_PHI:
			flags = IR_USE_SHOULD_BE_IN_REG;
			break;
		case IR_RLOAD:
			constraints->def_reg = ctx->ir_base[ref].op2;
			flags = IR_USE_SHOULD_BE_IN_REG;
			break;
		case IR_EXITCALL:
			flags = IR_USE_MUST_BE_IN_REG;
			constraints->def_reg = IR_REG_INT_RET1;
			break;
		case IR_IF_INT:
		case IR_GUARD:
		case IR_GUARD_NOT:
			flags = IR_OP2_SHOULD_BE_IN_REG;
			break;
		case IR_IJMP:
			flags = IR_OP2_SHOULD_BE_IN_REG;
			break;
		case IR_RSTORE:
			flags = IR_OP3_SHOULD_BE_IN_REG;
			break;
		case IR_RETURN_INT:
			flags = IR_OP2_SHOULD_BE_IN_REG;
			constraints->hints[2] = IR_REG_INT_RET1;
			constraints->hints_count = 3;
			break;
		case IR_RETURN_FP:
#ifdef IR_REG_FP_RET1
			flags = IR_OP2_SHOULD_BE_IN_REG;
			constraints->hints[2] = IR_REG_FP_RET1;
			constraints->hints_count = 3;
#endif
			break;
		case IR_SNAPSHOT:
			flags = 0;
			break;
		case IR_VA_START:
			flags = IR_OP1_MUST_BE_IN_REG;
			constraints->tmp_regs[0] = IR_TMP_REG(3, IR_ADDR, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
			n = 1;
			break;
		case IR_VA_ARG:
			flags = IR_USE_MUST_BE_IN_REG | IR_OP1_MUST_BE_IN_REG;
			constraints->tmp_regs[0] = IR_TMP_REG(3, IR_ADDR, IR_LOAD_SUB_REF, IR_SAVE_SUB_REF);
			n = 1;
			break;
	}
	constraints->tmps_count = n;

	return flags;
}

/* instruction selection */
static uint32_t ir_match_insn(ir_ctx *ctx, ir_ref ref);
static bool ir_match_try_fuse_load(ir_ctx *ctx, ir_ref ref, ir_ref root);

static void ir_swap_ops(ir_insn *insn)
{
	ir_ref tmp = insn->op1;
	insn->op1 = insn->op2;
	insn->op2 = tmp;
}

static bool ir_match_try_revert_lea_to_add(ir_ctx *ctx, ir_ref ref)
{
	ir_insn *insn = &ctx->ir_base[ref];

	/* TODO: This optimization makes sense only if the other operand is killed */
	if (insn->op1 == insn->op2) {
		/* pass */
	} else if (ir_match_try_fuse_load(ctx, insn->op2, ref)) {
		ctx->rules[ref] = IR_BINOP_INT;
		return 1;
	} else if (ir_match_try_fuse_load(ctx, insn->op1, ref)) {
		/* swap for better load fusion */
		ir_swap_ops(insn);
		ctx->rules[ref] = IR_BINOP_INT;
		return 1;
	}
	return 0;
}

static void ir_match_fuse_addr(ir_ctx *ctx, ir_ref addr_ref)
{
	if (!IR_IS_CONST_REF(addr_ref)) {
		uint32_t rule = ctx->rules[addr_ref];

		if (!rule) {
			ctx->rules[addr_ref] = rule = ir_match_insn(ctx, addr_ref);
		}
		if (rule >= IR_LEA_OB && rule <= IR_LEA_SI_B) {
			ir_use_list *use_list;
			ir_ref j;

			if (rule == IR_LEA_IB && ir_match_try_revert_lea_to_add(ctx, addr_ref)) {
				return;
			}

			use_list = &ctx->use_lists[addr_ref];
			j = use_list->count;
			if (j > 1) {
				/* check if address is used only in LOAD and STORE */
				ir_ref *p = &ctx->use_edges[use_list->refs];

				do {
					ir_insn *insn = &ctx->ir_base[*p];
					if (insn->op != IR_LOAD && (insn->op != IR_STORE || insn->op3 == addr_ref)) {
						return;
					}
					p++;
				} while (--j);
			}
			ctx->rules[addr_ref] = IR_FUSED | IR_SIMPLE | rule;
		}
	}
}

/* A naive check if there is a STORE or CALL between this LOAD and the fusion root */
static bool ir_match_has_mem_deps(ir_ctx *ctx, ir_ref ref, ir_ref root)
{
	if (ref + 1 != root) {
		ir_ref pos = ctx->prev_ref[root];

		do {
			ir_insn *insn = &ctx->ir_base[pos];

			if (insn->op == IR_STORE) {
				// TODO: check if LOAD and STORE addresses may alias
				return 1;
			} else if (insn->op == IR_CALL) {
				return 1;
			}
			pos = ctx->prev_ref[pos];
		} while (ref != pos);
	}
	return 0;
}

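/*
 * A LOAD may be fused into its consumer only when it stays in the same basic
 * block, has exactly one data use (a use count of 2, the second use being the
 * control successor), and no STORE or CALL that might clobber memory sits
 * between the LOAD and the fusion root.
 */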
static void ir_match_fuse_load(ir_ctx *ctx, ir_ref ref, ir_ref root)
{
	if (ir_in_same_block(ctx, ref)
	 && ctx->ir_base[ref].op == IR_LOAD) {
		if (ctx->use_lists[ref].count == 2
		 && !ir_match_has_mem_deps(ctx, ref, root)) {
			ir_ref addr_ref = ctx->ir_base[ref].op2;
			ir_insn *addr_insn = &ctx->ir_base[addr_ref];

			if (IR_IS_CONST_REF(addr_ref)) {
				if (ir_may_fuse_addr(ctx, addr_insn)) {
					ctx->rules[ref] = IR_FUSED | IR_SIMPLE | IR_LOAD;
					return;
				}
			} else {
				ctx->rules[ref] = IR_FUSED | IR_SIMPLE | IR_LOAD;
				ir_match_fuse_addr(ctx, addr_ref);
				return;
			}
		}
	}
}

static bool ir_match_try_fuse_load(ir_ctx *ctx, ir_ref ref, ir_ref root)
{
	ir_insn *insn = &ctx->ir_base[ref];

	if (ir_in_same_block(ctx, ref)
	 && insn->op == IR_LOAD) {
		if (ctx->use_lists[ref].count == 2
		 && !ir_match_has_mem_deps(ctx, ref, root)) {
			ir_ref addr_ref = ctx->ir_base[ref].op2;
			ir_insn *addr_insn = &ctx->ir_base[addr_ref];

			if (IR_IS_CONST_REF(addr_ref)) {
				if (ir_may_fuse_addr(ctx, addr_insn)) {
					ctx->rules[ref] = IR_FUSED | IR_SIMPLE | IR_LOAD;
					return 1;
				}
			} else {
				ctx->rules[ref] = IR_FUSED | IR_SIMPLE | IR_LOAD;
				ir_match_fuse_addr(ctx, addr_ref);
				return 1;
			}
		}
	} else if (insn->op == IR_PARAM) {
		if (ctx->use_lists[ref].count == 1
		 && ir_get_param_reg(ctx, ref) == IR_REG_NONE) {
			return 1;
		}
	} else if (ctx->ir_base[ref].op == IR_VLOAD) {
		return 1;
	}
	return 0;
}

static void ir_match_fuse_load_commutative_int(ir_ctx *ctx, ir_insn *insn, ir_ref root)
{
	if (IR_IS_CONST_REF(insn->op2)
	 && ir_may_fuse_imm(ctx, &ctx->ir_base[insn->op2])) {
		return;
	} else if (ir_match_try_fuse_load(ctx, insn->op2, root)) {
		return;
	} else if (ir_match_try_fuse_load(ctx, insn->op1, root)) {
		ir_swap_ops(insn);
	}
}

static void ir_match_fuse_load_commutative_fp(ir_ctx *ctx, ir_insn *insn, ir_ref root)
{
	if (!IR_IS_CONST_REF(insn->op2)
	 && !ir_match_try_fuse_load(ctx, insn->op2, root)
	 && (IR_IS_CONST_REF(insn->op1) || ir_match_try_fuse_load(ctx, insn->op1, root))) {
		ir_swap_ops(insn);
	}
}

static void ir_match_fuse_load_cmp_int(ir_ctx *ctx, ir_insn *insn, ir_ref root)
{
	if (IR_IS_CONST_REF(insn->op2)
	 && ir_may_fuse_imm(ctx, &ctx->ir_base[insn->op2])) {
		ir_match_fuse_load(ctx, insn->op1, root);
	} else if (!ir_match_try_fuse_load(ctx, insn->op2, root)
	 && ir_match_try_fuse_load(ctx, insn->op1, root)) {
		ir_swap_ops(insn);
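		/* the operands were swapped, so mirror the predicate:
		 * LT<->GT, GE<->LE (likewise for the unsigned variants);
		 * EQ and NE are symmetric and need no fixup */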
1609		if (insn->op != IR_EQ && insn->op != IR_NE) {
1610			insn->op ^= 3;
1611		}
1612	}
1613}
1614
1615static void ir_match_fuse_load_test_int(ir_ctx *ctx, ir_insn *insn, ir_ref root)
1616{
1617	if (IR_IS_CONST_REF(insn->op2)
1618	 && ir_may_fuse_imm(ctx, &ctx->ir_base[insn->op2])) {
1619		ir_match_fuse_load(ctx, insn->op1, root);
1620	} else if (!ir_match_try_fuse_load(ctx, insn->op2, root)
1621	 && ir_match_try_fuse_load(ctx, insn->op1, root)) {
1622		ir_swap_ops(insn);
1623	}
1624}
1625
1626static void ir_match_fuse_load_cmp_fp(ir_ctx *ctx, ir_insn *insn, ir_ref root)
1627{
1628	if (insn->op != IR_EQ && insn->op != IR_NE) {
1629		if (insn->op == IR_LT || insn->op == IR_LE) {
1630			/* swap operands to avoid P flag check */
1631			ir_swap_ops(insn);
1632			insn->op ^= 3;
1633		}
1634		ir_match_fuse_load(ctx, insn->op2, root);
1635	} else if (IR_IS_CONST_REF(insn->op2) && !IR_IS_FP_ZERO(ctx->ir_base[insn->op2])) {
1636		/* pass */
1637	} else if (ir_match_try_fuse_load(ctx, insn->op2, root)) {
1638		/* pass */
1639	} else if ((IR_IS_CONST_REF(insn->op1) && !IR_IS_FP_ZERO(ctx->ir_base[insn->op1])) || ir_match_try_fuse_load(ctx, insn->op1, root)) {
1640		ir_swap_ops(insn);
1641		if (insn->op != IR_EQ && insn->op != IR_NE) {
1642			insn->op ^= 3;
1643		}
1644	}
1645}
1646
1647static void ir_match_fuse_load_cmp_fp_br(ir_ctx *ctx, ir_insn *insn, ir_ref root, bool direct)
1648{
1649	if (direct) {
1650		if (insn->op == IR_LT || insn->op == IR_LE) {
1651			/* swap operands to avoid P flag check */
1652			ir_swap_ops(insn);
1653			insn->op ^= 3;
1654		}
1655	} else {
1656		if (insn->op == IR_GT || insn->op == IR_GE) {
1657			/* swap operands to avoid P flag check */
1658			ir_swap_ops(insn);
1659			insn->op ^= 3;
1660		}
1661	}
1662	if (IR_IS_CONST_REF(insn->op2) && !IR_IS_FP_ZERO(ctx->ir_base[insn->op2])) {
1663		/* pass */
1664	} else if (ir_match_try_fuse_load(ctx, insn->op2, root)) {
1665		/* pass */
1666	} else if ((IR_IS_CONST_REF(insn->op1) && !IR_IS_FP_ZERO(ctx->ir_base[insn->op1])) || ir_match_try_fuse_load(ctx, insn->op1, root)) {
1667		ir_swap_ops(insn);
1668		if (insn->op != IR_EQ && insn->op != IR_NE) {
1669			insn->op ^= 3;
1670		}
1671	}
1672}
1673
1674static uint32_t ir_match_insn(ir_ctx *ctx, ir_ref ref)
1675{
1676	ir_insn *op2_insn;
1677	ir_insn *insn = &ctx->ir_base[ref];
1678	uint32_t store_rule;
1679	ir_op load_op;
1680
1681	switch (insn->op) {
1682		case IR_EQ:
1683		case IR_NE:
1684		case IR_LT:
1685		case IR_GE:
1686		case IR_LE:
1687		case IR_GT:
1688		case IR_ULT:
1689		case IR_UGE:
1690		case IR_ULE:
1691		case IR_UGT:
1692			if (IR_IS_TYPE_INT(ctx->ir_base[insn->op1].type)) {
1693				if (IR_IS_CONST_REF(insn->op2)
1694				 && !IR_IS_SYM_CONST(ctx->ir_base[insn->op2].op)
1695				 && ctx->ir_base[insn->op2].val.i64 == 0
1696				 && insn->op1 == ref - 1) { /* previous instruction */
1697					ir_insn *op1_insn = &ctx->ir_base[insn->op1];
1698
1699					if (op1_insn->op == IR_AND && ctx->use_lists[insn->op1].count == 1) {
1700						/* v = AND(_, _); CMP(v, 0) => SKIP_TEST; TEST */
1701						ir_match_fuse_load_test_int(ctx, op1_insn, ref);
1702						ctx->rules[insn->op1] = IR_FUSED | IR_TEST_INT;
1703						return IR_TESTCC_INT;
1704					} else if ((op1_insn->op == IR_OR || op1_insn->op == IR_AND || op1_insn->op == IR_XOR) ||
1705							/* GT(ADD(_, _), 0) can't be optimized because ADD may overflow */
1706							((op1_insn->op == IR_ADD || op1_insn->op == IR_SUB) &&
1707								(insn->op == IR_EQ || insn->op == IR_NE))) {
1708						/* v = BINOP(_, _); CMP(v, 0) => BINOP; SETCC */
1709						if (ir_op_flags[op1_insn->op] & IR_OP_FLAG_COMMUTATIVE) {
1710							ir_match_fuse_load_commutative_int(ctx, op1_insn, ref);
1711						} else {
1712							ir_match_fuse_load(ctx, op1_insn->op2, ref);
1713						}
1714						ctx->rules[insn->op1] = IR_BINOP_INT;
1715						return IR_SETCC_INT;
1716					}
1717				}
1718				ir_match_fuse_load_cmp_int(ctx, insn, ref);
1719				return IR_CMP_INT;
1720			} else {
1721				ir_match_fuse_load_cmp_fp(ctx, insn, ref);
1722				return IR_CMP_FP;
1723			}
1724			break;
1725		case IR_ADD:
1726		case IR_SUB:
1727			if (IR_IS_TYPE_INT(insn->type)) {
1728				if ((ctx->flags & IR_OPT_CODEGEN) && IR_IS_CONST_REF(insn->op2)) {
1729					op2_insn = &ctx->ir_base[insn->op2];
1730					if (IR_IS_CONST_REF(insn->op1)) {
1731						// const
1732						// TODO: add support for sym+offset ???
1733					} else if (IR_IS_SYM_CONST(op2_insn->op)) {
1734						if (insn->op == IR_ADD && ir_may_fuse_addr(ctx, op2_insn)) {
1735							goto lea;
1736						}
1737						/* pass */
1738					} else if (op2_insn->val.i64 == 0) {
1739						return IR_COPY_INT;
1740					} else if ((ir_type_size[insn->type] >= 4 && insn->op == IR_ADD && IR_IS_SIGNED_32BIT(op2_insn->val.i64)) ||
1741							(ir_type_size[insn->type] >= 4 && insn->op == IR_SUB && IR_IS_SIGNED_NEG_32BIT(op2_insn->val.i64))) {
1742lea:
1743						if (ir_in_same_block(ctx, insn->op1) && ctx->use_lists[insn->op1].count == 1) {
1744							uint32_t rule = ctx->rules[insn->op1];
1745
1746							if (!rule) {
1747								ctx->rules[insn->op1] = rule = ir_match_insn(ctx, insn->op1);
1748							}
1749							if (rule == IR_LEA_SI) {
1750								/* z = MUL(Y, 2|4|8) ... ADD(z, imm32) => SKIP ... LEA [Y*2|4|8+im32] */
1751								ctx->rules[insn->op1] = IR_FUSED | IR_SIMPLE | IR_LEA_SI;
1752								return IR_LEA_SI_O;
1753							} else if (rule == IR_LEA_SIB) {
1754								/* z = ADD(X, MUL(Y, 2|4|8)) ... ADD(z, imm32) => SKIP ... LEA [X+Y*2|4|8+im32] */
1755								ctx->rules[insn->op1] = IR_FUSED | IR_SIMPLE | IR_LEA_SIB;
1756								return IR_LEA_SIB_O;
1757							} else if (rule == IR_LEA_IB) {
1758								/* z = ADD(X, Y) ... ADD(z, imm32) => SKIP ... LEA [X+Y+im32] */
1759								ctx->rules[insn->op1] = IR_FUSED | IR_SIMPLE | IR_LEA_IB;
1760								return IR_LEA_IB_O;
1761							}
1762						}
1763						/* ADD(X, imm32) => LEA [X+imm32] */
1764						return IR_LEA_OB;
1765					} else if (op2_insn->val.i64 == 1 || op2_insn->val.i64 == -1) {
1766						if (insn->op == IR_ADD) {
1767							if (op2_insn->val.i64 == 1) {
1768								/* ADD(_, 1) => INC */
1769								return IR_INC;
1770						    } else {
1771								/* ADD(_, -1) => DEC */
1772								return IR_DEC;
1773						    }
1774						} else {
1775							if (op2_insn->val.i64 == 1) {
1776								/* SUB(_, 1) => DEC */
1777								return IR_DEC;
1778						    } else {
1779								/* SUB(_, -1) => INC */
1780								return IR_INC;
1781						    }
1782						}
1783					}
1784				} else if ((ctx->flags & IR_OPT_CODEGEN) && insn->op == IR_ADD && ir_type_size[insn->type] >= 4) {
1785					if (insn->op1 != insn->op2) {
1786						if (ir_in_same_block(ctx, insn->op1) && ctx->use_lists[insn->op1].count == 1) {
1787							uint32_t rule =ctx->rules[insn->op1];
1788							if (!rule) {
1789								ctx->rules[insn->op1] = rule = ir_match_insn(ctx, insn->op1);
1790							}
1791							if (rule == IR_LEA_OB) {
1792								ctx->rules[insn->op1] = IR_FUSED | IR_SIMPLE | IR_LEA_OB;
1793								if (ir_in_same_block(ctx, insn->op2) && ctx->use_lists[insn->op2].count == 1) {
1794									rule = ctx->rules[insn->op2];
1795									if (!rule) {
1796										ctx->rules[insn->op2] = rule = ir_match_insn(ctx, insn->op2);
1797									}
1798									if (rule == IR_LEA_SI) {
1799										/* x = ADD(X, imm32) ... y = MUL(Y, 2|4|8) ... ADD(x, y) => SKIP ... SKIP ... LEA */
1800										ctx->rules[insn->op2] = IR_FUSED | IR_SIMPLE | IR_LEA_SI;
1801										return IR_LEA_OB_SI;
1802									}
1803								}
1804								/* x = ADD(X, imm32) ... ADD(x, Y) => SKIP ... LEA */
1805								return IR_LEA_OB_I;
1806							} else if (rule == IR_LEA_SI) {
1807								ctx->rules[insn->op1] = IR_FUSED | IR_SIMPLE | IR_LEA_SI;
1808								if (ir_in_same_block(ctx, insn->op2) && ctx->use_lists[insn->op2].count == 1) {
1809									rule = ctx->rules[insn->op2];
1810									if (!rule) {
1811										ctx->rules[insn->op2] = rule = ir_match_insn(ctx, insn->op2);
1812									}
1813									if (rule == IR_LEA_OB) {
1814										/* x = ADD(X, imm32) ... y = MUL(Y, 2|4|8) ... ADD(y, x) => SKIP ... SKIP ... LEA */
1815										ctx->rules[insn->op2] = IR_FUSED | IR_SIMPLE | IR_LEA_OB;
1816										return IR_LEA_SI_OB;
1817									}
1818								}
1819								/* x = MUL(X, 2|4|8) ... ADD(x, Y) => SKIP ... LEA */
1820								return IR_LEA_SI_B;
1821							}
1822						}
1823						if (ir_in_same_block(ctx, insn->op2) && ctx->use_lists[insn->op2].count == 1) {
1824							uint32_t rule = ctx->rules[insn->op2];
1825							if (!rule) {
1826								ctx->rules[insn->op2] = rule = ir_match_insn(ctx, insn->op2);
1827							}
1828							if (rule == IR_LEA_OB) {
1829								ctx->rules[insn->op2] = IR_FUSED | IR_SIMPLE | IR_LEA_OB;
1830								/* x = ADD(X, imm32) ... ADD(Y, x) => SKIP ... LEA */
1831								return IR_LEA_I_OB;
1832							} else if (rule == IR_LEA_SI) {
1833								ctx->rules[insn->op2] = IR_FUSED | IR_SIMPLE | IR_LEA_SI;
1834								/* x = MUL(X, 2|4|8) ... ADD(Y, x) => SKIP ... LEA */
1835								return IR_LEA_B_SI;
1836							}
1837						}
1838					}
1839					/* ADD(X, Y) => LEA [X + Y] */
1840					return IR_LEA_IB;
1841				}
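			/*
			 * Illustrative sketch (not matcher code, register choices arbitrary):
			 * for an IR chain
			 *   x = ADD(a, 8) ... y = MUL(b, 4) ... z = ADD(x, y)
			 * the rules above fuse all three nodes into IR_LEA_OB_SI, so the
			 * emitter can produce a single
			 *   lea rdx, [rax+rcx*4+8]
			 * instead of an add, a shift and another add.
			 */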
1842binop_int:
1843				if (ir_op_flags[insn->op] & IR_OP_FLAG_COMMUTATIVE) {
1844					ir_match_fuse_load_commutative_int(ctx, insn, ref);
1845				} else {
1846					ir_match_fuse_load(ctx, insn->op2, ref);
1847				}
1848				return IR_BINOP_INT;
1849			} else {
1850binop_fp:
1851				if (ir_op_flags[insn->op] & IR_OP_FLAG_COMMUTATIVE) {
1852					ir_match_fuse_load_commutative_fp(ctx, insn, ref);
1853				} else {
1854					ir_match_fuse_load(ctx, insn->op2, ref);
1855				}
1856				if (ctx->mflags & IR_X86_AVX) {
1857					return IR_BINOP_AVX;
1858				} else {
1859					return IR_BINOP_SSE2;
1860				}
1861			}
1862			break;
1863		case IR_MUL:
1864			if (IR_IS_TYPE_INT(insn->type)) {
1865				if ((ctx->flags & IR_OPT_CODEGEN) && IR_IS_CONST_REF(insn->op2)) {
1866					op2_insn = &ctx->ir_base[insn->op2];
1867					if (IR_IS_SYM_CONST(op2_insn->op)) {
1868						/* pass */
1869					} else if (IR_IS_CONST_REF(insn->op1)) {
1870						// const
1871					} else if (op2_insn->val.u64 == 0) {
1872						// 0
1873					} else if (op2_insn->val.u64 == 1) {
1874						return IR_COPY_INT;
1875					} else if (ir_type_size[insn->type] >= 4 &&
1876							(op2_insn->val.u64 == 2 || op2_insn->val.u64 == 4 || op2_insn->val.u64 == 8)) {
1877						/* MUL(X, 2|4|8) => LEA [X*2|4|8] */
1878						return IR_LEA_SI;
1879					} else if (ir_type_size[insn->type] >= 4 &&
1880							(op2_insn->val.u64 == 3 || op2_insn->val.u64 == 5 || op2_insn->val.u64 == 9)) {
1881						/* MUL(X, 3|5|9) => LEA [X+X*2|4|8] */
1882						return IR_LEA_SIB;
1883					} else if (IR_IS_POWER_OF_TWO(op2_insn->val.u64)) {
1884						/* MUL(X, PWR2) => SHL */
1885						return IR_MUL_PWR2;
1886					} else if (IR_IS_TYPE_SIGNED(insn->type)
1887					 && ir_type_size[insn->type] != 1
1888					 && IR_IS_SIGNED_32BIT(op2_insn->val.i64)
1889					 && !IR_IS_CONST_REF(insn->op1)) {
1890						/* MUL(_, imm32) => IMUL */
1891						ir_match_fuse_load(ctx, insn->op1, ref);
1892						return IR_IMUL3;
1893					}
1894				}
1895				/* Prefer IMUL over MUL because it's more flexible and uses fewer registers ??? */
1896//				if (IR_IS_TYPE_SIGNED(insn->type) && ir_type_size[insn->type] != 1) {
1897				if (ir_type_size[insn->type] != 1) {
1898					goto binop_int;
1899				}
1900				ir_match_fuse_load(ctx, insn->op2, ref);
1901				return IR_MUL_INT;
1902			} else {
1903				goto binop_fp;
1904			}
1905			break;
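			/*
			 * Illustrative sketch of the strength reduction above (assuming a
			 * 32-bit operand already in eax):
			 *   MUL(x, 8)  => lea eax, [eax*8]        (IR_LEA_SI)
			 *   MUL(x, 9)  => lea eax, [eax+eax*8]    (IR_LEA_SIB)
			 *   MUL(x, 16) => shl eax, 4              (IR_MUL_PWR2)
			 * Other signed imm32 multipliers fall back to IR_IMUL3
			 * (imul eax, eax, imm32).
			 */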
1906		case IR_ADD_OV:
1907		case IR_SUB_OV:
1908			IR_ASSERT(IR_IS_TYPE_INT(insn->type));
1909			goto binop_int;
1910		case IR_MUL_OV:
1911			IR_ASSERT(IR_IS_TYPE_INT(insn->type));
1912			if (IR_IS_TYPE_SIGNED(insn->type) && ir_type_size[insn->type] != 1) {
1913				if ((ctx->flags & IR_OPT_CODEGEN) && IR_IS_CONST_REF(insn->op2)) {
1914					op2_insn = &ctx->ir_base[insn->op2];
1915					if (!IR_IS_SYM_CONST(op2_insn->op)
1916					 && IR_IS_SIGNED_32BIT(op2_insn->val.i64)
1917					 && !IR_IS_CONST_REF(insn->op1)) {
1918						/* MUL(_, imm32) => IMUL */
1919						ir_match_fuse_load(ctx, insn->op1, ref);
1920						return IR_IMUL3;
1921					}
1922				}
1923				goto binop_int;
1924			}
1925			ir_match_fuse_load(ctx, insn->op2, ref);
1926			return IR_MUL_INT;
1927		case IR_DIV:
1928			if (IR_IS_TYPE_INT(insn->type)) {
1929				if ((ctx->flags & IR_OPT_CODEGEN) && IR_IS_CONST_REF(insn->op2)) {
1930					op2_insn = &ctx->ir_base[insn->op2];
1931					if (IR_IS_SYM_CONST(op2_insn->op)) {
1932						/* pass */
1933					} else if (IR_IS_CONST_REF(insn->op1)) {
1934						// const
1935					} else if (op2_insn->val.u64 == 1) {
1936						return IR_COPY_INT;
1937					} else if (IR_IS_POWER_OF_TWO(op2_insn->val.u64)) {
1938						/* DIV(X, PWR2) => SHR */
1939						if (IR_IS_TYPE_UNSIGNED(insn->type)) {
1940							return IR_DIV_PWR2;
1941						} else {
1942							return IR_SDIV_PWR2;
1943						}
1944					}
1945				}
1946				ir_match_fuse_load(ctx, insn->op2, ref);
1947				return IR_DIV_INT;
1948			} else {
1949				goto binop_fp;
1950			}
1951			break;
1952		case IR_MOD:
1953			if ((ctx->flags & IR_OPT_CODEGEN) && IR_IS_CONST_REF(insn->op2)) {
1954				op2_insn = &ctx->ir_base[insn->op2];
1955				if (IR_IS_SYM_CONST(op2_insn->op)) {
1956					/* pass */
1957				} else if (IR_IS_CONST_REF(insn->op1)) {
1958					// const
1959				} else if (IR_IS_POWER_OF_TWO(op2_insn->val.u64)) {
1960					/* MOD(X, PWR2) => AND */
1961					if (IR_IS_TYPE_UNSIGNED(insn->type)) {
1962						return IR_MOD_PWR2;
1963					} else {
1964						return IR_SMOD_PWR2;
1965					}
1966				}
1967			}
1968			ir_match_fuse_load(ctx, insn->op2, ref);
1969			return IR_MOD_INT;
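			/*
			 * Illustrative sketch: for power-of-two divisors the matcher avoids
			 * div/idiv entirely, e.g. with an unsigned 32-bit value in eax
			 *   DIV(x, 8) => shr eax, 3    (IR_DIV_PWR2)
			 *   MOD(x, 8) => and eax, 7    (IR_MOD_PWR2)
			 * The signed rules (IR_SDIV_PWR2/IR_SMOD_PWR2) need extra bias code,
			 * because an arithmetic shift alone would round toward negative
			 * infinity rather than toward zero.
			 */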
1970		case IR_BSWAP:
1971		case IR_NOT:
1972			if (insn->type == IR_BOOL) {
1973				IR_ASSERT(IR_IS_TYPE_INT(ctx->ir_base[insn->op1].type)); // TODO: IR_BOOL_NOT_FP
1974				return IR_BOOL_NOT_INT;
1975			} else {
1976				IR_ASSERT(IR_IS_TYPE_INT(insn->type));
1977				return IR_OP_INT;
1978			}
1979			break;
1980		case IR_NEG:
1981			if (IR_IS_TYPE_INT(insn->type)) {
1982				return IR_OP_INT;
1983			} else {
1984				return IR_OP_FP;
1985			}
1986		case IR_ABS:
1987			if (IR_IS_TYPE_INT(insn->type)) {
1988				return IR_ABS_INT; // movl %edi, %eax; negl %eax; cmovs %edi, %eax
1989			} else {
1990				return IR_OP_FP;
1991			}
1992		case IR_OR:
1993			if ((ctx->flags & IR_OPT_CODEGEN) && IR_IS_CONST_REF(insn->op2)) {
1994				op2_insn = &ctx->ir_base[insn->op2];
1995				if (IR_IS_SYM_CONST(op2_insn->op)) {
1996					/* pass */
1997				} else if (IR_IS_CONST_REF(insn->op1)) {
1998					// const
1999				} else if (op2_insn->val.i64 == 0) {
2000					return IR_COPY_INT;
2001				} else if (op2_insn->val.i64 == -1) {
2002					// -1
2003				}
2004			}
2005			goto binop_int;
2006		case IR_AND:
2007			if ((ctx->flags & IR_OPT_CODEGEN) && IR_IS_CONST_REF(insn->op2)) {
2008				op2_insn = &ctx->ir_base[insn->op2];
2009				if (IR_IS_SYM_CONST(op2_insn->op)) {
2010					/* pass */
2011				} else if (IR_IS_CONST_REF(insn->op1)) {
2012					// const
2013				} else if (op2_insn->val.i64 == 0) {
2014					// 0
2015				} else if (op2_insn->val.i64 == -1) {
2016					return IR_COPY_INT;
2017				}
2018			}
2019			goto binop_int;
2020		case IR_XOR:
2021			if ((ctx->flags & IR_OPT_CODEGEN) && IR_IS_CONST_REF(insn->op2)) {
2022				op2_insn = &ctx->ir_base[insn->op2];
2023				if (IR_IS_SYM_CONST(op2_insn->op)) {
2024					/* pass */
2025				} else if (IR_IS_CONST_REF(insn->op1)) {
2026					// const
2027				}
2028			}
2029			goto binop_int;
2030		case IR_SHL:
2031			if (IR_IS_CONST_REF(insn->op2)) {
2032				if (ctx->flags & IR_OPT_CODEGEN) {
2033					op2_insn = &ctx->ir_base[insn->op2];
2034					if (IR_IS_SYM_CONST(op2_insn->op)) {
2035						/* pass */
2036					} else if (IR_IS_CONST_REF(insn->op1)) {
2037						// const
2038					} else if (op2_insn->val.u64 == 0) {
2039						return IR_COPY_INT;
2040					} else if (ir_type_size[insn->type] >= 4) {
2041						if (op2_insn->val.u64 == 1) {
2042							// lea [op1*2]
2043						} else if (op2_insn->val.u64 == 2) {
2044							// lea [op1*4]
2045						} else if (op2_insn->val.u64 == 3) {
2046							// lea [op1*8]
2047						}
2048					}
2049				}
2050				return IR_SHIFT_CONST;
2051			}
2052			return IR_SHIFT;
2053		case IR_SHR:
2054		case IR_SAR:
2055		case IR_ROL:
2056		case IR_ROR:
2057			if (IR_IS_CONST_REF(insn->op2)) {
2058				if (ctx->flags & IR_OPT_CODEGEN) {
2059					op2_insn = &ctx->ir_base[insn->op2];
2060					if (IR_IS_SYM_CONST(op2_insn->op)) {
2061						/* pass */
2062					} else if (IR_IS_CONST_REF(insn->op1)) {
2063						// const
2064					} else if (op2_insn->val.u64 == 0) {
2065						return IR_COPY_INT;
2066					}
2067				}
2068				return IR_SHIFT_CONST;
2069			}
2070			return IR_SHIFT;
2071		case IR_MIN:
2072		case IR_MAX:
2073			if (IR_IS_TYPE_INT(insn->type)) {
2074				return IR_MIN_MAX_INT;
2075			} else {
2076				goto binop_fp;
2077			}
2078			break;
2079//		case IR_COND:
2080		case IR_COPY:
2081			if (IR_IS_TYPE_INT(insn->type)) {
2082				return IR_COPY_INT;
2083			} else {
2084				return IR_COPY_FP;
2085			}
2086			break;
2087		case IR_CALL:
2088			ctx->flags2 |= IR_HAS_CALLS;
2089#ifndef IR_REG_FP_RET1
2090			if (IR_IS_TYPE_FP(insn->type)) {
2091				ctx->flags2 |= IR_HAS_FP_RET_SLOT;
2092			}
2093#endif
2094			IR_FALLTHROUGH;
2095		case IR_TAILCALL:
2096		case IR_IJMP:
2097			ir_match_fuse_load(ctx, insn->op2, ref);
2098			return insn->op;
2099		case IR_VAR:
2100			return IR_SKIPPED | IR_VAR;
2101		case IR_PARAM:
2102			return ctx->use_lists[ref].count > 0 ? IR_PARAM : IR_SKIPPED | IR_PARAM;
2103		case IR_ALLOCA:
2104			/* alloca() may be used only in functions */
2105			if (ctx->flags & IR_FUNCTION) {
2106				ctx->flags |= IR_USE_FRAME_POINTER;
2107				ctx->flags2 |= IR_HAS_ALLOCA;
2108			}
2109			return IR_ALLOCA;
2110		case IR_VSTORE:
2111			if (IR_IS_TYPE_INT(ctx->ir_base[insn->op3].type)) {
2112				store_rule = IR_VSTORE_INT;
2113				load_op = IR_VLOAD;
2114store_int:
2115				if ((ctx->flags & IR_OPT_CODEGEN)
2116				 && ir_in_same_block(ctx, insn->op3)
2117				 && (ctx->use_lists[insn->op3].count == 1 ||
2118				     (ctx->use_lists[insn->op3].count == 2
2119				   && (ctx->ir_base[insn->op3].op == IR_ADD_OV ||
2120				       ctx->ir_base[insn->op3].op == IR_SUB_OV)))) {
2121					ir_insn *op_insn = &ctx->ir_base[insn->op3];
2122					uint32_t rule = ctx->rules[insn->op3];
2123
2124					if (!rule) {
2125						ctx->rules[insn->op3] = rule = ir_match_insn(ctx, insn->op3);
2126					}
2127					if ((rule == IR_BINOP_INT && op_insn->op != IR_MUL) || rule == IR_LEA_OB || rule == IR_LEA_IB) {
2128						if (insn->op1 == op_insn->op1
2129						 && ctx->ir_base[op_insn->op1].op == load_op
2130						 && ctx->ir_base[op_insn->op1].op2 == insn->op2
2131						 && ctx->use_lists[op_insn->op1].count == 2) {
2132							/* l = LOAD(_, a) ... v = BINOP(l, _) ... STORE(l, a, v) => SKIP ... SKIP_MEM_BINOP ... MEM_BINOP */
2133							ctx->rules[insn->op3] = IR_FUSED | IR_BINOP_INT;
2134							ctx->rules[op_insn->op1] = IR_SKIPPED | load_op;
2135							if (!IR_IS_CONST_REF(op_insn->op2)
2136							 && ctx->rules[op_insn->op2] == (IR_FUSED|IR_SIMPLE|IR_LOAD)) {
2137								ctx->rules[op_insn->op2] = IR_LOAD_INT;
2138							}
2139							return IR_MEM_BINOP_INT;
2140						} else if ((ir_op_flags[op_insn->op] & IR_OP_FLAG_COMMUTATIVE)
2141						 && insn->op1 == op_insn->op2
2142						 && ctx->ir_base[op_insn->op2].op == load_op
2143						 && ctx->ir_base[op_insn->op2].op2 == insn->op2
2144						 && ctx->use_lists[op_insn->op2].count == 2) {
2145							/* l = LOAD(_, a) ... v = BINOP(_, l) ... STORE(l, a, v) => SKIP ... SKIP_MEM_BINOP ... MEM_BINOP */
2146							ir_swap_ops(op_insn);
2147							ctx->rules[insn->op3] = IR_FUSED | IR_BINOP_INT;
2148							ctx->rules[op_insn->op1] = IR_SKIPPED | load_op;
2149							return IR_MEM_BINOP_INT;
2150						}
2151					} else if (rule == IR_INC) {
2152						if (insn->op1 == op_insn->op1
2153						 && ctx->ir_base[op_insn->op1].op == load_op
2154						 && ctx->ir_base[op_insn->op1].op2 == insn->op2
2155						 && ctx->use_lists[op_insn->op1].count == 2) {
2156							/* l = LOAD(_, a) ... v = INC(l) ... STORE(l, a, v) => SKIP ... SKIP ... MEM_INC */
2157							ctx->rules[insn->op3] = IR_SKIPPED | IR_INC;
2158							ctx->rules[op_insn->op1] = IR_SKIPPED | load_op;
2159							return IR_MEM_INC;
2160						}
2161					} else if (rule == IR_DEC) {
2162						if (insn->op1 == op_insn->op1
2163						 && ctx->ir_base[op_insn->op1].op == load_op
2164						 && ctx->ir_base[op_insn->op1].op2 == insn->op2
2165						 && ctx->use_lists[op_insn->op1].count == 2) {
2166							/* l = LOAD(_, a) ... v = DEC(l) ... STORE(l, a, v) => SKIP ... SKIP ... MEM_DEC */
2167							ctx->rules[insn->op3] = IR_SKIPPED | IR_DEC;
2168							ctx->rules[op_insn->op1] = IR_SKIPPED | load_op;
2169							return IR_MEM_DEC;
2170						}
2171					} else if (rule == IR_MUL_PWR2) {
2172						if (insn->op1 == op_insn->op1
2173						 && ctx->ir_base[op_insn->op1].op == load_op
2174						 && ctx->ir_base[op_insn->op1].op2 == insn->op2
2175						 && ctx->use_lists[op_insn->op1].count == 2) {
2176							/* l = LOAD(_, a) ... v = MUL_PWR2(l) ... STORE(l, a, v) => SKIP ... SKIP ... MEM_MUL_PWR2 */
2177							ctx->rules[insn->op3] = IR_SKIPPED | IR_MUL_PWR2;
2178							ctx->rules[op_insn->op1] = IR_SKIPPED | load_op;
2179							return IR_MEM_MUL_PWR2;
2180						}
2181					} else if (rule == IR_DIV_PWR2) {
2182						if (insn->op1 == op_insn->op1
2183						 && ctx->ir_base[op_insn->op1].op == load_op
2184						 && ctx->ir_base[op_insn->op1].op2 == insn->op2
2185						 && ctx->use_lists[op_insn->op1].count == 2) {
2186							/* l = LOAD(_, a) ... v = DIV_PWR2(l) ... STORE(l, a, v) => SKIP ... SKIP ... MEM_DIV_PWR2 */
2187							ctx->rules[insn->op3] = IR_SKIPPED | IR_DIV_PWR2;
2188							ctx->rules[op_insn->op1] = IR_SKIPPED | load_op;
2189							return IR_MEM_DIV_PWR2;
2190						}
2191					} else if (rule == IR_MOD_PWR2) {
2192						if (insn->op1 == op_insn->op1
2193						 && ctx->ir_base[op_insn->op1].op == load_op
2194						 && ctx->ir_base[op_insn->op1].op2 == insn->op2
2195						 && ctx->use_lists[op_insn->op1].count == 2) {
2196							/* l = LOAD(_, a) ... v = MOD_PWR2(l) ... STORE(l, a, v) => SKIP ... SKIP ... MEM_MOD_PWR2 */
2197							ctx->rules[insn->op3] = IR_SKIPPED | IR_MOD_PWR2;
2198							ctx->rules[op_insn->op1] = IR_SKIPPED | load_op;
2199							return IR_MEM_MOD_PWR2;
2200						}
2201					} else if (rule == IR_SHIFT) {
2202						if (insn->op1 == op_insn->op1
2203						 && ctx->ir_base[op_insn->op1].op == load_op
2204						 && ctx->ir_base[op_insn->op1].op2 == insn->op2
2205						 && ctx->use_lists[op_insn->op1].count == 2) {
2206							/* l = LOAD(_, a) ... v = SHIFT(l, _) ... STORE(l, a, v) => SKIP ... SKIP_SHIFT ... MEM_SHIFT */
2207							ctx->rules[insn->op3] = IR_FUSED | IR_SHIFT;
2208							ctx->rules[op_insn->op1] = IR_SKIPPED | load_op;
2209							return IR_MEM_SHIFT;
2210						}
2211					} else if (rule == IR_SHIFT_CONST) {
2212						if (insn->op1 == op_insn->op1
2213						 && ctx->ir_base[op_insn->op1].op == load_op
2214						 && ctx->ir_base[op_insn->op1].op2 == insn->op2
2215						 && ctx->use_lists[op_insn->op1].count == 2) {
2216							/* l = LOAD(_, a) ... v = SHIFT(l, CONST) ... STORE(l, a, v) => SKIP ... SKIP ... MEM_SHIFT_CONST */
2217							ctx->rules[insn->op3] = IR_SKIPPED | IR_SHIFT_CONST;
2218							ctx->rules[op_insn->op1] = IR_SKIPPED | load_op;
2219							return IR_MEM_SHIFT_CONST;
2220						}
2221					} else if (rule == IR_OP_INT && op_insn->op != IR_BSWAP) {
2222						if (insn->op1 == op_insn->op1
2223						 && ctx->ir_base[op_insn->op1].op == load_op
2224						 && ctx->ir_base[op_insn->op1].op2 == insn->op2
2225						 && ctx->use_lists[op_insn->op1].count == 2) {
2226							/* l = LOAD(_, a) ... v = OP(l) ... STORE(l, a, v) => SKIP ... SKIP ... MEM_OP */
2227							ctx->rules[insn->op3] = IR_SKIPPED | IR_OP_INT;
2228							ctx->rules[op_insn->op1] = IR_SKIPPED | load_op;
2229							return IR_MEM_OP_INT;
2230						}
2231					}
2232				}
2233				return store_rule;
2234			} else {
2235				return IR_VSTORE_FP;
2236			}
2237			break;
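			/*
			 * Illustrative sketch: the store_int fusion above turns a
			 * load/modify/store of the same location into one read-modify-write
			 * instruction, e.g. (slot address arbitrary)
			 *   l = VLOAD(var) ... v = ADD(l, 2) ... VSTORE(var, v)
			 * becomes a single
			 *   add dword [rbp-8], 2
			 * and INC/DEC, shifts, NEG/NOT etc. get the analogous memory forms.
			 */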
2238		case IR_LOAD:
2239			ir_match_fuse_addr(ctx, insn->op2);
2240			if (IR_IS_TYPE_INT(insn->type)) {
2241				return IR_LOAD_INT;
2242			} else {
2243				return IR_LOAD_FP;
2244			}
2245			break;
2246		case IR_STORE:
2247			ir_match_fuse_addr(ctx, insn->op2);
2248			if (IR_IS_TYPE_INT(ctx->ir_base[insn->op3].type)) {
2249				store_rule = IR_STORE_INT;
2250				load_op = IR_LOAD;
2251				goto store_int;
2252			} else {
2253				return IR_STORE_FP;
2254			}
2255			break;
2256		case IR_RLOAD:
2257			if (IR_REGSET_IN(IR_REGSET_UNION((ir_regset)ctx->fixed_regset, IR_REGSET_FIXED), insn->op2)) {
2258				return IR_SKIPPED | IR_RLOAD;
2259			}
2260			return IR_RLOAD;
2261		case IR_RSTORE:
2262			if (IR_IS_TYPE_INT(ctx->ir_base[insn->op2].type)) {
2263				if ((ctx->flags & IR_OPT_CODEGEN)
2264				 && ir_in_same_block(ctx, insn->op2)
2265				 && ctx->use_lists[insn->op2].count == 1
2266				 && IR_IS_TYPE_INT(ctx->ir_base[insn->op2].type)) {
2267					ir_insn *op_insn = &ctx->ir_base[insn->op2];
2268
2269					if (op_insn->op == IR_ADD ||
2270					    op_insn->op == IR_SUB ||
2271//					    op_insn->op == IR_MUL ||
2272					    op_insn->op == IR_OR  ||
2273					    op_insn->op == IR_AND ||
2274					    op_insn->op == IR_XOR) {
2275						if (insn->op1 == op_insn->op1
2276						 && ctx->ir_base[op_insn->op1].op == IR_RLOAD
2277						 && ctx->ir_base[op_insn->op1].op2 == insn->op3
2278						 && ctx->use_lists[op_insn->op1].count == 2) {
2279							/* l = RLOAD(r) ... v = BINOP(l, _) ... RSTORE(l, r, v) => SKIP ... SKIP_REG_BINOP ... REG_BINOP */
2280							ctx->rules[insn->op2] = IR_FUSED | IR_BINOP_INT;
2281							ctx->rules[op_insn->op1] = IR_SKIPPED | IR_RLOAD;
2282							return IR_REG_BINOP_INT;
2283						} else if ((ir_op_flags[op_insn->op] & IR_OP_FLAG_COMMUTATIVE)
2284						 && insn->op1 == op_insn->op2
2285						 && ctx->ir_base[op_insn->op2].op == IR_RLOAD
2286						 && ctx->ir_base[op_insn->op2].op2 == insn->op3
2287						 && ctx->use_lists[op_insn->op2].count == 2) {
2288							/* l = RLOAD(r) ... v = BINOP(x, l) ... RSTORE(l, r, v) => SKIP ... SKIP_REG_BINOP ... REG_BINOP */
2289							ir_swap_ops(op_insn);
2290							ctx->rules[insn->op2] = IR_FUSED | IR_BINOP_INT;
2291							ctx->rules[op_insn->op1] = IR_SKIPPED | IR_RLOAD;
2292							return IR_REG_BINOP_INT;
2293						}
2294					}
2295				}
2296			}
2297			ir_match_fuse_load(ctx, insn->op2, ref);
2298			return IR_RSTORE;
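			/*
			 * Illustrative sketch (register choice arbitrary): the RSTORE
			 * fusion above lets a chain such as
			 *   l = RLOAD(r) ... v = ADD(l, _) ... RSTORE(l, r, v)
			 * update the fixed register in place with a single
			 *   add r14, rax
			 * instead of copying it through a virtual register.
			 */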
2299		case IR_START:
2300		case IR_BEGIN:
2301		case IR_IF_TRUE:
2302		case IR_IF_FALSE:
2303		case IR_CASE_VAL:
2304		case IR_CASE_DEFAULT:
2305		case IR_MERGE:
2306		case IR_LOOP_BEGIN:
2307		case IR_UNREACHABLE:
2308			return IR_SKIPPED | insn->op;
2309		case IR_RETURN:
2310			if (!insn->op2) {
2311				return IR_RETURN_VOID;
2312			} else if (IR_IS_TYPE_INT(ctx->ir_base[insn->op2].type)) {
2313				return IR_RETURN_INT;
2314			} else {
2315				return IR_RETURN_FP;
2316			}
2317		case IR_IF:
2318			if (ir_in_same_block(ctx, insn->op2) && ctx->use_lists[insn->op2].count == 1) {
2319				op2_insn = &ctx->ir_base[insn->op2];
2320				if (op2_insn->op >= IR_EQ && op2_insn->op <= IR_UGT) {
2321					if (IR_IS_TYPE_INT(ctx->ir_base[op2_insn->op1].type)) {
2322						if (IR_IS_CONST_REF(op2_insn->op2)
2323						 && !IR_IS_SYM_CONST(ctx->ir_base[op2_insn->op2].op)
2324						 && ctx->ir_base[op2_insn->op2].val.i64 == 0
2325						 && op2_insn->op1 == insn->op2 - 1) { /* previous instruction */
2326							ir_insn *op1_insn = &ctx->ir_base[op2_insn->op1];
2327
2328							if (op1_insn->op == IR_AND && ctx->use_lists[op2_insn->op1].count == 1) {
2329								/* v = AND(_, _); c = CMP(v, 0) ... IF(c) => SKIP_TEST; SKIP ... TEST_AND_BRANCH */
2330								ir_match_fuse_load_test_int(ctx, op1_insn, ref);
2331								ctx->rules[op2_insn->op1] = IR_FUSED | IR_TEST_INT;
2332								ctx->rules[insn->op2] = IR_FUSED | IR_SIMPLE | IR_NOP;
2333								return IR_TEST_AND_BRANCH_INT;
2334							} else if ((op1_insn->op == IR_OR || op1_insn->op == IR_AND || op1_insn->op == IR_XOR) ||
2335									/* GT(ADD(_, _), 0) can't be optimized because ADD may overflow */
2336									((op1_insn->op == IR_ADD || op1_insn->op == IR_SUB) &&
2337										(op2_insn->op == IR_EQ || op2_insn->op == IR_NE))) {
2338								/* v = BINOP(_, _); c = CMP(v, 0) ... IF(c) => BINOP; SKIP_CMP ... JCC */
2339								if (ir_op_flags[op1_insn->op] & IR_OP_FLAG_COMMUTATIVE) {
2340									ir_match_fuse_load_commutative_int(ctx, op1_insn, ref);
2341								} else {
2342									ir_match_fuse_load(ctx, op1_insn->op2, ref);
2343								}
2344								ctx->rules[op2_insn->op1] = IR_BINOP_INT;
2345								ctx->rules[insn->op2] = IR_FUSED | IR_CMP_INT;
2346								return IR_JCC_INT;
2347							}
2348						}
2349						/* c = CMP(_, _) ... IF(c) => SKIP_CMP ... CMP_AND_BRANCH */
2350						ir_match_fuse_load_cmp_int(ctx, op2_insn, ref);
2351						ctx->rules[insn->op2] = IR_FUSED | IR_CMP_INT;
2352						return IR_CMP_AND_BRANCH_INT;
2353					} else {
2354						/* c = CMP(_, _) ... IF(c) => SKIP_CMP ... CMP_AND_BRANCH */
2355						ir_match_fuse_load_cmp_fp_br(ctx, op2_insn, ref, 1);
2356						ctx->rules[insn->op2] = IR_FUSED | IR_CMP_FP;
2357						return IR_CMP_AND_BRANCH_FP;
2358					}
2359				} else if (op2_insn->op == IR_AND) {
2360					/* c = AND(_, _) ... IF(c) => SKIP_TEST ... TEST_AND_BRANCH */
2361					ir_match_fuse_load_test_int(ctx, op2_insn, ref);
2362					ctx->rules[insn->op2] = IR_FUSED | IR_TEST_INT;
2363					return IR_TEST_AND_BRANCH_INT;
2364				} else if (op2_insn->op == IR_OVERFLOW) {
2365					/* c = OVERFLOW(_) ... IF(c) => SKIP_OVERFLOW ... OVERFLOW_AND_BRANCH */
2366					ctx->rules[insn->op2] = IR_FUSED | IR_SIMPLE | IR_OVERFLOW;
2367					return IR_OVERFLOW_AND_BRANCH;
2368				}
2369			}
2370			if (IR_IS_TYPE_INT(ctx->ir_base[insn->op2].type)) {
2371				if (insn->op2 == ref - 1 /* previous instruction */
2372				 && ir_in_same_block(ctx, insn->op2)) {
2373					op2_insn = &ctx->ir_base[insn->op2];
2374					if (op2_insn->op == IR_ADD ||
2375					    op2_insn->op == IR_SUB ||
2376//					    op2_insn->op == IR_MUL ||
2377					    op2_insn->op == IR_OR  ||
2378					    op2_insn->op == IR_AND ||
2379					    op2_insn->op == IR_XOR) {
2380
2381						/* v = BINOP(_, _); IF(v) => BINOP; JCC */
2382						if (ir_op_flags[op2_insn->op] & IR_OP_FLAG_COMMUTATIVE) {
2383							ir_match_fuse_load_commutative_int(ctx, op2_insn, ref);
2384						} else {
2385							ir_match_fuse_load(ctx, op2_insn->op2, ref);
2386						}
2387						ctx->rules[insn->op2] = IR_BINOP_INT;
2388						return IR_JCC_INT;
2389					}
2390				} else if ((ctx->flags & IR_OPT_CODEGEN)
2391				 && insn->op1 == ref - 1 /* previous instruction */
2392				 && insn->op2 == ref - 2 /* previous instruction */
2393				 && ir_in_same_block(ctx, insn->op2)
2394				 && ctx->use_lists[insn->op2].count == 2
2395				 && IR_IS_TYPE_INT(ctx->ir_base[insn->op2].type)) {
2396					ir_insn *store_insn = &ctx->ir_base[insn->op1];
2397
2398					if (store_insn->op == IR_STORE && store_insn->op3 == insn->op2) {
2399						ir_insn *op_insn = &ctx->ir_base[insn->op2];
2400
2401						if (op_insn->op == IR_ADD ||
2402						    op_insn->op == IR_SUB ||
2403//						    op_insn->op == IR_MUL ||
2404						    op_insn->op == IR_OR  ||
2405						    op_insn->op == IR_AND ||
2406						    op_insn->op == IR_XOR) {
2407							if (ctx->ir_base[op_insn->op1].op == IR_LOAD
2408							 && ctx->ir_base[op_insn->op1].op2 == store_insn->op2) {
2409								if (ir_in_same_block(ctx, op_insn->op1)
2410								 && ctx->use_lists[op_insn->op1].count == 2
2411								 && store_insn->op1 == op_insn->op1) {
2412									/* v = MEM_BINOP(_, _); IF(v) => MEM_BINOP; JCC */
2413									ctx->rules[insn->op2] = IR_FUSED | IR_BINOP_INT;
2414									ctx->rules[op_insn->op1] = IR_SKIPPED | IR_LOAD;
2415									ir_match_fuse_addr(ctx, store_insn->op2);
2416									ctx->rules[insn->op1] = IR_MEM_BINOP_INT;
2417									return IR_JCC_INT;
2418								}
2419							} else if ((ir_op_flags[op_insn->op] & IR_OP_FLAG_COMMUTATIVE)
2420							 && ctx->ir_base[op_insn->op2].op == IR_LOAD
2421							 && ctx->ir_base[op_insn->op2].op2 == store_insn->op2) {
2422								if (ir_in_same_block(ctx, op_insn->op2)
2423								 && ctx->use_lists[op_insn->op2].count == 2
2424								 && store_insn->op1 == op_insn->op2) {
2425									/* v = MEM_BINOP(_, _); IF(v) => MEM_BINOP; JCC */
2426									ir_swap_ops(op_insn);
2427									ctx->rules[insn->op2] = IR_FUSED | IR_BINOP_INT;
2428									ctx->rules[op_insn->op1] = IR_SKIPPED | IR_LOAD;
2429									ir_match_fuse_addr(ctx, store_insn->op2);
2430									ctx->rules[insn->op1] = IR_MEM_BINOP_INT;
2431									return IR_JCC_INT;
2432								}
2433							}
2434						}
2435					}
2436				}
2437				ir_match_fuse_load(ctx, insn->op2, ref);
2438				return IR_IF_INT;
2439			} else {
2440				IR_ASSERT(0 && "NIY IR_IF_FP");
2441				break;
2442			}
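			/*
			 * Illustrative sketch: fusing the comparison into the branch means
			 *   c = LT(a, b) ... IF(c)
			 * is emitted as
			 *   cmp eax, ecx
			 *   jl =>bb_true
			 * rather than materializing the boolean first (setl/test/jnz).
			 */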
2443		case IR_GUARD:
2444		case IR_GUARD_NOT:
2445			if (ir_in_same_block(ctx, insn->op2) && ctx->use_lists[insn->op2].count == 1) {
2446				op2_insn = &ctx->ir_base[insn->op2];
2447				if (op2_insn->op >= IR_EQ && op2_insn->op <= IR_UGT
2448					// TODO: register allocator may clobber operands of CMP before they are used in the GUARD_CMP
2449				 && (insn->op2 == ref - 1 ||
2450				     (insn->op2 == ctx->prev_ref[ref] - 1
2451				   && ctx->ir_base[ctx->prev_ref[ref]].op == IR_SNAPSHOT))) {
2452					if (IR_IS_TYPE_INT(ctx->ir_base[op2_insn->op1].type)) {
2453						if (IR_IS_CONST_REF(op2_insn->op2)
2454						 && !IR_IS_SYM_CONST(ctx->ir_base[op2_insn->op2].op)
2455						 && ctx->ir_base[op2_insn->op2].val.i64 == 0) {
2456							if (op2_insn->op1 == insn->op2 - 1) { /* previous instruction */
2457								ir_insn *op1_insn = &ctx->ir_base[op2_insn->op1];
2458
2459								if ((op1_insn->op == IR_OR || op1_insn->op == IR_AND || op1_insn->op == IR_XOR) ||
2460										/* GT(ADD(_, _), 0) can't be optimized because ADD may overflow */
2461										((op1_insn->op == IR_ADD || op1_insn->op == IR_SUB) &&
2462											(op2_insn->op == IR_EQ || op2_insn->op == IR_NE))) {
2463									if (ir_op_flags[op1_insn->op] & IR_OP_FLAG_COMMUTATIVE) {
2464										ir_match_fuse_load_commutative_int(ctx, op1_insn, ref);
2465									} else {
2466										ir_match_fuse_load(ctx, op1_insn->op2, ref);
2467									}
2468									/* v = BINOP(_, _); c = CMP(v, 0) ... IF(c) => BINOP; SKIP_CMP ... GUARD_JCC */
2469									ctx->rules[op2_insn->op1] = IR_BINOP_INT;
2470									ctx->rules[insn->op2] = IR_FUSED | IR_CMP_INT;
2471									return IR_GUARD_JCC_INT;
2472								}
2473							} else if ((ctx->flags & IR_OPT_CODEGEN)
2474							 && op2_insn->op1 == insn->op2 - 2 /* before previous instruction */
2475							 && ir_in_same_block(ctx, op2_insn->op1)
2476							 && ctx->use_lists[op2_insn->op1].count == 2) {
2477								ir_insn *store_insn = &ctx->ir_base[insn->op2 - 1];
2478
2479								if (store_insn->op == IR_STORE && store_insn->op3 == op2_insn->op1) {
2480									ir_insn *op_insn = &ctx->ir_base[op2_insn->op1];
2481
2482									if ((op_insn->op == IR_OR || op_insn->op == IR_AND || op_insn->op == IR_XOR) ||
2483											/* GT(ADD(_, _), 0) can't be optimized because ADD may overflow */
2484											((op_insn->op == IR_ADD || op_insn->op == IR_SUB) &&
2485												(op2_insn->op == IR_EQ || op2_insn->op == IR_NE))) {
2486										if (ctx->ir_base[op_insn->op1].op == IR_LOAD
2487										 && ctx->ir_base[op_insn->op1].op2 == store_insn->op2) {
2488											if (ir_in_same_block(ctx, op_insn->op1)
2489											 && ctx->use_lists[op_insn->op1].count == 2
2490											 && store_insn->op1 == op_insn->op1) {
2491												/* v = MEM_BINOP(_, _); GUARD(v) => MEM_BINOP; GUARD_JCC */
2492												ctx->rules[op2_insn->op1] = IR_FUSED | IR_BINOP_INT;
2493												ctx->rules[op_insn->op1] = IR_SKIPPED | IR_LOAD;
2494												ir_match_fuse_addr(ctx, store_insn->op2);
2495												ctx->rules[insn->op2 - 1] = IR_MEM_BINOP_INT;
2496												ctx->rules[insn->op2] = IR_SKIPPED | IR_NOP;
2497												return IR_GUARD_JCC_INT;
2498											}
2499										} else if ((ir_op_flags[op_insn->op] & IR_OP_FLAG_COMMUTATIVE)
2500										 && ctx->ir_base[op_insn->op2].op == IR_LOAD
2501										 && ctx->ir_base[op_insn->op2].op2 == store_insn->op2) {
2502											if (ir_in_same_block(ctx, op_insn->op2)
2503											 && ctx->use_lists[op_insn->op2].count == 2
2504											 && store_insn->op1 == op_insn->op2) {
2505												/* v = MEM_BINOP(_, _); GUARD(v) => MEM_BINOP; GUARD_JCC */
2506												ir_swap_ops(op_insn);
2507												ctx->rules[op2_insn->op1] = IR_FUSED | IR_BINOP_INT;
2508												ctx->rules[op_insn->op1] = IR_SKIPPED | IR_LOAD;
2509												ir_match_fuse_addr(ctx, store_insn->op2);
2510												ctx->rules[insn->op2 - 1] = IR_MEM_BINOP_INT;
2511												ctx->rules[insn->op2] = IR_SKIPPED | IR_NOP;
2512												return IR_GUARD_JCC_INT;
2513											}
2514										}
2515									}
2516								}
2517							}
2518						}
2519						/* c = CMP(_, _) ... GUARD(c) => SKIP_CMP ... GUARD_CMP */
2520						ir_match_fuse_load_cmp_int(ctx, op2_insn, ref);
2521						ctx->rules[insn->op2] = IR_FUSED | IR_CMP_INT;
2522						return IR_GUARD_CMP_INT;
2523					} else {
2524						/* c = CMP(_, _) ... GUARD(c) => SKIP_CMP ... GUARD_CMP */
2525						ir_match_fuse_load_cmp_fp_br(ctx, op2_insn, ref, insn->op == IR_GUARD_NOT);
2526						ctx->rules[insn->op2] = IR_FUSED | IR_CMP_FP;
2527						return IR_GUARD_CMP_FP;
2528					}
2529				} else if (op2_insn->op == IR_AND) { // TODO: OR, XOR, etc.
2530					/* c = AND(_, _) ... GUARD(c) => SKIP_TEST ... GUARD_TEST */
2531					ir_match_fuse_load_test_int(ctx, op2_insn, ref);
2532					ctx->rules[insn->op2] = IR_FUSED | IR_TEST_INT;
2533					return IR_GUARD_TEST_INT;
2534				} else if (op2_insn->op == IR_OVERFLOW) {
2535					/* c = OVERFLOW(_) ... GUARD(c) => SKIP_OVERFLOW ... GUARD_OVERFLOW */
2536					ctx->rules[insn->op2] = IR_FUSED | IR_SIMPLE | IR_OVERFLOW;
2537					return IR_GUARD_OVERFLOW;
2538				}
2539			}
2540			ir_match_fuse_load(ctx, insn->op2, ref);
2541			return insn->op;
2542		case IR_INT2FP:
2543			if (ir_type_size[ctx->ir_base[insn->op1].type] > (IR_IS_TYPE_SIGNED(ctx->ir_base[insn->op1].type) ? 2 : 4)) {
2544				ir_match_fuse_load(ctx, insn->op1, ref);
2545			}
2546			return insn->op;
2547		case IR_SEXT:
2548		case IR_ZEXT:
2549		case IR_BITCAST:
2550		case IR_FP2INT:
2551		case IR_FP2FP:
2552		case IR_PROTO:
2553			ir_match_fuse_load(ctx, insn->op1, ref);
2554			return insn->op;
2555		case IR_CTLZ:
2556		case IR_CTTZ:
2557			ir_match_fuse_load(ctx, insn->op1, ref);
2558			return IR_BIT_COUNT;
2559		case IR_CTPOP:
2560			ir_match_fuse_load(ctx, insn->op1, ref);
2561			return (ctx->mflags & IR_X86_BMI1) ? IR_BIT_COUNT : IR_CTPOP;
2562		case IR_VA_START:
2563			ctx->flags2 |= IR_HAS_VA_START;
2564			if ((ctx->ir_base[insn->op2].op == IR_ALLOCA) || (ctx->ir_base[insn->op2].op == IR_VADDR)) {
2565				ir_use_list *use_list = &ctx->use_lists[insn->op2];
2566				ir_ref *p, n = use_list->count;
2567				for (p = &ctx->use_edges[use_list->refs]; n > 0; p++, n--) {
2568					ir_insn *use_insn = &ctx->ir_base[*p];
2569					if (use_insn->op == IR_VA_START || use_insn->op == IR_VA_END) {
2570					} else if (use_insn->op == IR_VA_COPY) {
2571						if (use_insn->op3 == insn->op2) {
2572							ctx->flags2 |= IR_HAS_VA_COPY;
2573						}
2574					} else if (use_insn->op == IR_VA_ARG) {
2575						if (use_insn->op2 == insn->op2) {
2576							if (IR_IS_TYPE_INT(use_insn->type)) {
2577								ctx->flags2 |= IR_HAS_VA_ARG_GP;
2578							} else {
2579								IR_ASSERT(IR_IS_TYPE_FP(use_insn->type));
2580								ctx->flags2 |= IR_HAS_VA_ARG_FP;
2581							}
2582						}
2583					} else if (*p > ref) {
2584						/* direct va_list access */
2585						ctx->flags2 |= IR_HAS_VA_ARG_GP|IR_HAS_VA_ARG_FP;
2586					}
2587				}
2588			}
2589			return IR_VA_START;
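			/*
			 * Illustrative sketch: for code equivalent to
			 *   va_start(ap, n); x = va_arg(ap, int);
			 * the use-list scan above finds only an integer IR_VA_ARG, so just
			 * IR_HAS_VA_ARG_GP is set and the prologue may skip spilling the
			 * FP argument registers into the register-save area.
			 */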
2590		case IR_VA_END:
2591			return IR_SKIPPED | IR_NOP;
2592		case IR_VADDR:
2593			if (ctx->use_lists[ref].count > 0) {
2594				ir_use_list *use_list = &ctx->use_lists[ref];
2595				ir_ref *p, n = use_list->count;
2596
2597				for (p = &ctx->use_edges[use_list->refs]; n > 0; p++, n--) {
2598					if (ctx->ir_base[*p].op != IR_VA_END) {
2599						return IR_VADDR;
2600					}
2601				}
2602			}
2603			return IR_SKIPPED | IR_NOP;
2604		default:
2605			break;
2606	}
2607
2608	return insn->op;
2609}
2610
2611static void ir_match_insn2(ir_ctx *ctx, ir_ref ref, uint32_t rule)
2612{
2613	if (rule == IR_LEA_IB) {
2614		ir_match_try_revert_lea_to_add(ctx, ref);
2615	}
2616}
2617
2618/* code generation */
2619static int32_t ir_ref_spill_slot_offset(ir_ctx *ctx, ir_ref ref, ir_reg *reg)
2620{
2621	int32_t offset;
2622
2623	IR_ASSERT(ref >= 0 && ctx->vregs[ref] && ctx->live_intervals[ctx->vregs[ref]]);
2624	offset = ctx->live_intervals[ctx->vregs[ref]]->stack_spill_pos;
2625	IR_ASSERT(offset != -1);
2626	if (ctx->live_intervals[ctx->vregs[ref]]->flags & IR_LIVE_INTERVAL_SPILL_SPECIAL) {
2627		IR_ASSERT(ctx->spill_base != IR_REG_NONE);
2628		*reg = ctx->spill_base;
2629		return offset;
2630	}
2631	*reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
2632	return IR_SPILL_POS_TO_OFFSET(offset);
2633}
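/*
 * Example (a sketch with made-up numbers): with stack_frame_size == 32,
 * no alignment padding and IR_USE_FRAME_POINTER set, a spill position of 8
 * maps to offset 8 - 32 == -24, i.e. [rbp-24]; without a frame pointer and
 * with call_stack_size == 16, the same slot becomes [rsp+8+16] == [rsp+24].
 */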
2634
2635static ir_mem ir_vreg_spill_slot(ir_ctx *ctx, ir_ref v)
2636{
2637	int32_t offset;
2638	ir_reg base;
2639
2640	IR_ASSERT(v > 0 && v <= ctx->vregs_count && ctx->live_intervals[v]);
2641	offset = ctx->live_intervals[v]->stack_spill_pos;
2642	IR_ASSERT(offset != -1);
2643	if (ctx->live_intervals[v]->flags & IR_LIVE_INTERVAL_SPILL_SPECIAL) {
2644		IR_ASSERT(ctx->spill_base != IR_REG_NONE);
2645		return IR_MEM_BO(ctx->spill_base, offset);
2646	}
2647	base = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
2648	offset = IR_SPILL_POS_TO_OFFSET(offset);
2649	return IR_MEM_BO(base, offset);
2650}
2651
2652static ir_mem ir_ref_spill_slot(ir_ctx *ctx, ir_ref ref)
2653{
2654	IR_ASSERT(!IR_IS_CONST_REF(ref));
2655	return ir_vreg_spill_slot(ctx, ctx->vregs[ref]);
2656}
2657
2658static bool ir_is_same_spill_slot(ir_ctx *ctx, ir_ref ref, ir_mem mem)
2659{
2660	ir_mem m = ir_ref_spill_slot(ctx, ref);
2661	return IR_MEM_VAL(m) == IR_MEM_VAL(mem);
2662}
2663
2664static ir_mem ir_var_spill_slot(ir_ctx *ctx, ir_ref ref)
2665{
2666	ir_insn *var_insn = &ctx->ir_base[ref];
2667	ir_reg reg;
2668
2669	IR_ASSERT(var_insn->op == IR_VAR);
2670	reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
2671	return IR_MEM_BO(reg, IR_SPILL_POS_TO_OFFSET(var_insn->op3));
2672}
2673
2674static bool ir_may_avoid_spill_load(ir_ctx *ctx, ir_ref ref, ir_ref use)
2675{
2676	ir_live_interval *ival;
2677
2678	IR_ASSERT(ctx->vregs[ref] && ctx->live_intervals[ctx->vregs[ref]]);
2679	ival = ctx->live_intervals[ctx->vregs[ref]];
2680	while (ival) {
2681		ir_use_pos *use_pos = ival->use_pos;
2682		while (use_pos) {
2683			if (IR_LIVE_POS_TO_REF(use_pos->pos) == use) {
2684				return !use_pos->next || use_pos->next->op_num == 0;
2685			}
2686			use_pos = use_pos->next;
2687		}
2688		ival = ival->next;
2689	}
2690	return 0;
2691}
2692
2693static void ir_emit_load_imm_int(ir_ctx *ctx, ir_type type, ir_reg reg, int64_t val)
2694{
2695	ir_backend_data *data = ctx->data;
2696	dasm_State **Dst = &data->dasm_state;
2697
2698	IR_ASSERT(IR_IS_TYPE_INT(type));
2699	if (val == 0) {
2700		|	ASM_REG_REG_OP xor, type, reg, reg
2701	} else if (ir_type_size[type] == 8) {
2702		IR_ASSERT(sizeof(void*) == 8);
2703|.if X64
2704		if (IR_IS_UNSIGNED_32BIT(val)) {
2705			|	mov Rd(reg), (uint32_t)val // zero extended load
2706		} else if (IR_IS_SIGNED_32BIT(val)) {
2707			|	mov Rq(reg), (int32_t)val // sign extended load
2708//		} else if (type == IR_ADDR && IR_MAY_USE_32BIT_ADDR(ctx->code_buffer, (intptr_t)val)) {
2709//			|	lea Ra(reg), [&val]
2710		} else {
2711			|	mov64 Ra(reg), val
2712		}
2713|.endif
2714	} else {
2715		|	ASM_REG_IMM_OP mov, type, reg, (int32_t)val // sign extended load
2716	}
2717}
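/*
 * Illustrative sketch of the constant-loading strategy above (64-bit
 * register rax, values made up):
 *   val == 0          -> xor eax, eax         ; shortest encoding
 *   val == 0x12345678 -> mov eax, 0x12345678  ; upper half implicitly zeroed
 *   val == -2         -> mov rax, -2          ; sign-extended imm32
 *   otherwise         -> movabs rax, imm64    ; full 64-bit immediate
 */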
2718
2719static void ir_emit_load_mem_int(ir_ctx *ctx, ir_type type, ir_reg reg, ir_mem mem)
2720{
2721	ir_backend_data *data = ctx->data;
2722	dasm_State **Dst = &data->dasm_state;
2723
2724	|	ASM_REG_MEM_OP mov, type, reg, mem
2725}
2726
2727static void ir_emit_load_imm_fp(ir_ctx *ctx, ir_type type, ir_reg reg, ir_ref src)
2728{
2729	ir_backend_data *data = ctx->data;
2730	dasm_State **Dst = &data->dasm_state;
2731	ir_insn *insn = &ctx->ir_base[src];
2732	int label;
2733
2734	if (type == IR_FLOAT && insn->val.u32 == 0) {
2735		if (ctx->mflags & IR_X86_AVX) {
2736			|	vxorps xmm(reg-IR_REG_FP_FIRST), xmm(reg-IR_REG_FP_FIRST), xmm(reg-IR_REG_FP_FIRST)
2737		} else {
2738			|	xorps xmm(reg-IR_REG_FP_FIRST), xmm(reg-IR_REG_FP_FIRST)
2739		}
2740	} else if (type == IR_DOUBLE && insn->val.u64 == 0) {
2741		if (ctx->mflags & IR_X86_AVX) {
2742			|	vxorpd xmm(reg-IR_REG_FP_FIRST), xmm(reg-IR_REG_FP_FIRST), xmm(reg-IR_REG_FP_FIRST)
2743		} else {
2744			|	xorpd xmm(reg-IR_REG_FP_FIRST), xmm(reg-IR_REG_FP_FIRST)
2745		}
2746	} else {
2747		label = ir_const_label(ctx, src);
2748		|	ASM_FP_REG_TXT_OP movs, type, reg, [=>label]
2749	}
2750}
2751
2752static void ir_emit_load_mem_fp(ir_ctx *ctx, ir_type type, ir_reg reg, ir_mem mem)
2753{
2754	ir_backend_data *data = ctx->data;
2755	dasm_State **Dst = &data->dasm_state;
2756
2757	|	ASM_FP_REG_MEM_OP movs, type, reg, mem
2758}
2759
2760static void ir_emit_load_mem(ir_ctx *ctx, ir_type type, ir_reg reg, ir_mem mem)
2761{
2762	if (IR_IS_TYPE_INT(type)) {
2763		ir_emit_load_mem_int(ctx, type, reg, mem);
2764	} else {
2765		ir_emit_load_mem_fp(ctx, type, reg, mem);
2766	}
2767}
2768
2769static void ir_emit_load(ir_ctx *ctx, ir_type type, ir_reg reg, ir_ref src)
2770{
2771	if (IR_IS_CONST_REF(src)) {
2772		if (IR_IS_TYPE_INT(type)) {
2773			ir_insn *insn = &ctx->ir_base[src];
2774
2775			if (insn->op == IR_SYM || insn->op == IR_FUNC) {
2776				void *addr = ir_sym_val(ctx, insn);
2777				ir_emit_load_imm_int(ctx, type, reg, (intptr_t)addr);
2778			} else if (insn->op == IR_STR) {
2779				ir_backend_data *data = ctx->data;
2780				dasm_State **Dst = &data->dasm_state;
2781				int label = ir_const_label(ctx, src);
2782
2783				|	lea Ra(reg), aword [=>label]
2784			} else {
2785				ir_emit_load_imm_int(ctx, type, reg, insn->val.i64);
2786			}
2787		} else {
2788			ir_emit_load_imm_fp(ctx, type, reg, src);
2789		}
2790	} else {
2791		ir_emit_load_mem(ctx, type, reg, ir_ref_spill_slot(ctx, src));
2792	}
2793}
2794
2795static void ir_emit_store_mem_int(ir_ctx *ctx, ir_type type, ir_mem mem, ir_reg reg)
2796{
2797	ir_backend_data *data = ctx->data;
2798	dasm_State **Dst = &data->dasm_state;
2799
2800	|	ASM_MEM_REG_OP mov, type, mem, reg
2801}
2802
2803static void ir_emit_store_mem_fp(ir_ctx *ctx, ir_type type, ir_mem mem, ir_reg reg)
2804{
2805	ir_backend_data *data = ctx->data;
2806	dasm_State **Dst = &data->dasm_state;
2807
2808	|	ASM_FP_MEM_REG_OP movs, type, mem, reg
2809}
2810
2811static void ir_emit_store_mem_imm(ir_ctx *ctx, ir_type type, ir_mem mem, int32_t imm)
2812{
2813	ir_backend_data *data = ctx->data;
2814	dasm_State **Dst = &data->dasm_state;
2815
2816	|	ASM_MEM_IMM_OP mov, type, mem, imm
2817}
2818
2819static void ir_emit_store_mem_int_const(ir_ctx *ctx, ir_type type, ir_mem mem, ir_ref src, ir_reg tmp_reg, bool is_arg)
2820{
2821	ir_backend_data *data = ctx->data;
2822	dasm_State **Dst = &data->dasm_state;
2823	ir_insn *val_insn = &ctx->ir_base[src];
2824
2825	IR_ASSERT(IR_IS_CONST_REF(src));
2826	if (val_insn->op == IR_STR) {
2827		int label = ir_const_label(ctx, src);
2828
2829		IR_ASSERT(tmp_reg != IR_REG_NONE);
2830|.if X64
2831		|	lea Ra(tmp_reg), aword [=>label]
2832||		ir_emit_store_mem_int(ctx, type, mem, tmp_reg);
2833|.else
2834		|	ASM_TMEM_TXT_OP mov, aword, mem, =>label
2835|.endif
2836	} else {
2837		int64_t val = val_insn->val.i64;
2838
2839		if (val_insn->op == IR_FUNC || val_insn->op == IR_SYM) {
2840			val = (int64_t)(intptr_t)ir_sym_val(ctx, val_insn);
2841		}
2842
2843		if (sizeof(void*) == 4 || IR_IS_SIGNED_32BIT(val)) {
2844			if (is_arg && ir_type_size[type] < 4) {
2845				type = IR_U32;
2846			}
2847			ir_emit_store_mem_imm(ctx, type, mem, val);
2848		} else {
2849			IR_ASSERT(tmp_reg != IR_REG_NONE);
2850			ir_emit_load_imm_int(ctx, type, tmp_reg, val);
2851			ir_emit_store_mem_int(ctx, type, mem, tmp_reg);
2852		}
2853	}
2854}
2855
2856static void ir_emit_store_mem_fp_const(ir_ctx *ctx, ir_type type, ir_mem mem, ir_ref src, ir_reg tmp_reg, ir_reg tmp_fp_reg)
2857{
2858	ir_val *val = &ctx->ir_base[src].val;
2859
2860	if (type == IR_FLOAT) {
2861		ir_emit_store_mem_imm(ctx, IR_U32, mem, val->i32);
2862	} else if (sizeof(void*) == 8 && val->i64 == 0) {
2863		ir_emit_store_mem_imm(ctx, IR_U64, mem, 0);
2864	} else if (sizeof(void*) == 8 && tmp_reg != IR_REG_NONE) {
2865		ir_emit_load_imm_int(ctx, IR_U64, tmp_reg, val->i64);
2866		ir_emit_store_mem_int(ctx, IR_U64, mem, tmp_reg);
2867	} else {
2868		ir_emit_load(ctx, type, tmp_fp_reg, src);
2869		ir_emit_store_mem_fp(ctx, IR_DOUBLE, mem, tmp_fp_reg);
2870	}
2871}
2872
2873static void ir_emit_store_mem(ir_ctx *ctx, ir_type type, ir_mem mem, ir_reg reg)
2874{
2875	if (IR_IS_TYPE_INT(type)) {
2876		ir_emit_store_mem_int(ctx, type, mem, reg);
2877	} else {
2878		ir_emit_store_mem_fp(ctx, type, mem, reg);
2879	}
2880}
2881
2882static void ir_emit_store(ir_ctx *ctx, ir_type type, ir_ref dst, ir_reg reg)
2883{
2884	IR_ASSERT(dst >= 0);
2885	ir_emit_store_mem(ctx, type, ir_ref_spill_slot(ctx, dst), reg);
2886}
2887
2888static void ir_emit_mov(ir_ctx *ctx, ir_type type, ir_reg dst, ir_reg src)
2889{
2890	ir_backend_data *data = ctx->data;
2891	dasm_State **Dst = &data->dasm_state;
2892
2893	|	ASM_REG_REG_OP mov, type, dst, src
2894}
2895
2896#define IR_HAVE_SWAP_INT
2897
2898static void ir_emit_swap(ir_ctx *ctx, ir_type type, ir_reg dst, ir_reg src)
2899{
2900	ir_backend_data *data = ctx->data;
2901	dasm_State **Dst = &data->dasm_state;
2902
2903	|	ASM_REG_REG_OP xchg, type, dst, src
2904}
2905
2906static void ir_emit_mov_ext(ir_ctx *ctx, ir_type type, ir_reg dst, ir_reg src)
2907{
2908	ir_backend_data *data = ctx->data;
2909	dasm_State **Dst = &data->dasm_state;
2910
2911	if (ir_type_size[type] > 2) {
2912		|	ASM_REG_REG_OP mov, type, dst, src
2913	} else if (ir_type_size[type] == 2) {
2914		if (IR_IS_TYPE_SIGNED(type)) {
2915			|	movsx Rd(dst), Rw(src)
2916		} else {
2917			|	movzx Rd(dst), Rw(src)
2918		}
2919	} else /* if (ir_type_size[type] == 1) */ {
2920		if (IR_IS_TYPE_SIGNED(type)) {
2921			|	movsx Rd(dst), Rb(src)
2922		} else {
2923			|	movzx Rd(dst), Rb(src)
2924		}
2925	}
2926}
2927
2928static void ir_emit_fp_mov(ir_ctx *ctx, ir_type type, ir_reg dst, ir_reg src)
2929{
2930	ir_backend_data *data = ctx->data;
2931	dasm_State **Dst = &data->dasm_state;
2932
2933	|	ASM_FP_REG_REG_OP movap, type, dst, src
2934}
2935
2936static ir_mem ir_fuse_addr_const(ir_ctx *ctx, ir_ref ref)
2937{
2938	ir_mem mem;
2939	ir_insn *addr_insn = &ctx->ir_base[ref];
2940
2941	IR_ASSERT(IR_IS_CONST_REF(ref));
2942	if (IR_IS_SYM_CONST(addr_insn->op)) {
2943		void *addr = ir_sym_val(ctx, addr_insn);
2944		IR_ASSERT(sizeof(void*) == 4 || IR_IS_SIGNED_32BIT((intptr_t)addr));
2945		mem = IR_MEM_O((int32_t)(intptr_t)addr);
2946	} else {
2947		IR_ASSERT(sizeof(void*) == 4 || IR_IS_SIGNED_32BIT(addr_insn->val.i64));
2948		mem = IR_MEM_O(addr_insn->val.i32);
2949	}
2950	return mem;
2951}
2952
2953static ir_mem ir_fuse_addr(ir_ctx *ctx, ir_ref root, ir_ref ref)
2954{
2955	uint32_t rule = ctx->rules[ref];
2956	ir_insn *insn = &ctx->ir_base[ref];
2957	ir_insn *op1_insn, *op2_insn, *offset_insn;
2958	ir_ref base_reg_ref, index_reg_ref;
2959	ir_reg base_reg, index_reg;
2960	int32_t offset, scale;
2961
2962	IR_ASSERT((rule & IR_RULE_MASK) >= IR_LEA_OB && (rule & IR_RULE_MASK) <= IR_LEA_SI_B);
2963	switch (rule & IR_RULE_MASK) {
2964		default:
2965			IR_ASSERT(0);
2966		case IR_LEA_OB:
2967			offset_insn = insn;
2968			base_reg_ref = ref * sizeof(ir_ref) + 1;
2969			index_reg_ref = IR_UNUSED;
2970			scale = 1;
2971			break;
2972		case IR_LEA_SI:
2973			scale = ctx->ir_base[insn->op2].val.i32;
2974			index_reg_ref = ref * sizeof(ir_ref) + 1;
2975			base_reg_ref = IR_UNUSED;
2976			offset_insn = NULL;
2977			break;
2978		case IR_LEA_SIB:
2979			base_reg_ref = index_reg_ref = ref * sizeof(ir_ref) + 1;
2980			scale = ctx->ir_base[insn->op2].val.i32 - 1;
2981			offset_insn = NULL;
2982			break;
2983		case IR_LEA_IB:
2984			base_reg_ref = ref * sizeof(ir_ref) + 1;
2985			index_reg_ref = ref * sizeof(ir_ref) + 2;
2986			offset_insn = NULL;
2987			scale = 1;
2988			break;
2989		case IR_LEA_OB_I:
2990			base_reg_ref = insn->op1 * sizeof(ir_ref) + 1;
2991			index_reg_ref = ref * sizeof(ir_ref) + 2;
2992			op1_insn = &ctx->ir_base[insn->op1];
2993			offset_insn = op1_insn;
2994			scale = 1;
2995			break;
2996		case IR_LEA_I_OB:
2997			base_reg_ref = ref * sizeof(ir_ref) + 1;
2998			index_reg_ref = insn->op2 * sizeof(ir_ref) + 1;
2999			op2_insn = &ctx->ir_base[insn->op2];
3000			offset_insn = op2_insn;
3001			scale = 1;
3002			break;
3003		case IR_LEA_SI_O:
3004			index_reg_ref = insn->op1 * sizeof(ir_ref) + 1;
3005			op1_insn = &ctx->ir_base[insn->op1];
3006			scale = ctx->ir_base[op1_insn->op2].val.i32;
3007			offset_insn = insn;
3008			base_reg_ref = IR_UNUSED;
3009			break;
3010		case IR_LEA_SIB_O:
3011			base_reg_ref = index_reg_ref = insn->op1 * sizeof(ir_ref) + 1;
3012			op1_insn = &ctx->ir_base[insn->op1];
3013			scale = ctx->ir_base[op1_insn->op2].val.i32 - 1;
3014			offset_insn = insn;
3015			break;
3016		case IR_LEA_IB_O:
3017			base_reg_ref = insn->op1 * sizeof(ir_ref) + 1;
3018			index_reg_ref = insn->op1 * sizeof(ir_ref) + 2;
3019			offset_insn = insn;
3020			scale = 1;
3021			break;
3022		case IR_LEA_OB_SI:
3023			base_reg_ref = insn->op1 * sizeof(ir_ref) + 1;
3024			index_reg_ref = insn->op2 * sizeof(ir_ref) + 1;
3025			op1_insn = &ctx->ir_base[insn->op1];
3026			offset_insn = op1_insn;
3027			op2_insn = &ctx->ir_base[insn->op2];
3028			scale = ctx->ir_base[op2_insn->op2].val.i32;
3029			break;
3030		case IR_LEA_SI_OB:
3031			index_reg_ref = insn->op1 * sizeof(ir_ref) + 1;
3032			base_reg_ref = insn->op2 * sizeof(ir_ref) + 1;
3033			op1_insn = &ctx->ir_base[insn->op1];
3034			scale = ctx->ir_base[op1_insn->op2].val.i32;
3035			op2_insn = &ctx->ir_base[insn->op2];
3036			offset_insn = op2_insn;
3037			break;
3038		case IR_LEA_B_SI:
3039			base_reg_ref = ref * sizeof(ir_ref) + 1;
3040			index_reg_ref = insn->op2 * sizeof(ir_ref) + 1;
3041			op2_insn = &ctx->ir_base[insn->op2];
3042			scale = ctx->ir_base[op2_insn->op2].val.i32;
3043			offset_insn = NULL;
3044			break;
3045		case IR_LEA_SI_B:
3046			index_reg_ref = insn->op1 * sizeof(ir_ref) + 1;
3047			base_reg_ref = ref * sizeof(ir_ref) + 2;
3048			op1_insn = &ctx->ir_base[insn->op1];
3049			scale = ctx->ir_base[op1_insn->op2].val.i32;
3050			offset_insn = NULL;
3051			break;
3052	}
3053
3054	offset = 0;
3055	if (offset_insn) {
3056		ir_insn *addr_insn = &ctx->ir_base[offset_insn->op2];
3057
3058		if (IR_IS_SYM_CONST(addr_insn->op)) {
3059			void *addr = ir_sym_val(ctx, addr_insn);
3060			IR_ASSERT(sizeof(void*) != 8 || IR_IS_SIGNED_32BIT((intptr_t)addr));
3061			offset = (int64_t)(intptr_t)(addr);
3062		} else {
3063			offset = addr_insn->val.i32;
3064			if (offset_insn->op == IR_SUB) {
3065				offset = -offset;
3066			}
3067		}
3068	}
3069
3070	base_reg = IR_REG_NONE;
3071	if (base_reg_ref) {
3072		if (UNEXPECTED(ctx->rules[base_reg_ref / sizeof(ir_ref)] & IR_FUSED_REG)) {
3073			base_reg = ir_get_fused_reg(ctx, root, base_reg_ref);
3074		} else {
3075			base_reg = ((int8_t*)ctx->regs)[base_reg_ref];
3076		}
3077		IR_ASSERT(base_reg != IR_REG_NONE);
3078		if (IR_REG_SPILLED(base_reg)) {
3079			base_reg = IR_REG_NUM(base_reg);
3080			ir_emit_load(ctx, insn->type, base_reg, ((ir_ref*)ctx->ir_base)[base_reg_ref]);
3081		}
3082	}
3083
3084	index_reg = IR_REG_NONE;
3085	if (index_reg_ref) {
3086		if (base_reg_ref
3087			&& ((ir_ref*)ctx->ir_base)[index_reg_ref]
3088				== ((ir_ref*)ctx->ir_base)[base_reg_ref]) {
3089			index_reg = base_reg;
3090		} else {
3091			if (UNEXPECTED(ctx->rules[index_reg_ref / sizeof(ir_ref)] & IR_FUSED_REG)) {
3092				index_reg = ir_get_fused_reg(ctx, root, index_reg_ref);
3093			} else {
3094				index_reg = ((int8_t*)ctx->regs)[index_reg_ref];
3095			}
3096			IR_ASSERT(index_reg != IR_REG_NONE);
3097			if (IR_REG_SPILLED(index_reg)) {
3098				index_reg = IR_REG_NUM(index_reg);
3099				ir_emit_load(ctx, insn->type, index_reg, ((ir_ref*)ctx->ir_base)[index_reg_ref]);
3100			}
3101		}
3102	}
3103
3104	return IR_MEM(base_reg, offset, index_reg, scale);
3105}
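/*
 * Example (a sketch): for an IR_LEA_OB_SI root ADD(ADD(b, 40), MUL(i, 8))
 * the switch above takes the base register from the inner ADD, the index
 * register from the MUL, offset 40 and scale 8, so the returned ir_mem
 * describes the operand [Rb+Ri*8+40].
 */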
3106
3107static ir_mem ir_fuse_mem(ir_ctx *ctx, ir_ref root, ir_ref ref, ir_insn *mem_insn, ir_reg reg)
3108{
3109	if (reg != IR_REG_NONE) {
3110		if (IR_REG_SPILLED(reg) || IR_IS_CONST_REF(mem_insn->op2)) {
3111			reg = IR_REG_NUM(reg);
3112			ir_emit_load(ctx, IR_ADDR, reg, mem_insn->op2);
3113		}
3114		return IR_MEM_B(reg);
3115	} else if (IR_IS_CONST_REF(mem_insn->op2)) {
3116		return ir_fuse_addr_const(ctx, mem_insn->op2);
3117	} else {
3118		return ir_fuse_addr(ctx, root, mem_insn->op2);
3119	}
3120}
3121
3122static ir_mem ir_fuse_load(ir_ctx *ctx, ir_ref root, ir_ref ref)
3123{
3124	ir_insn *load_insn = &ctx->ir_base[ref];
3125	ir_reg reg;
3126
3127	IR_ASSERT(load_insn->op == IR_LOAD);
3128	if (UNEXPECTED(ctx->rules[ref] & IR_FUSED_REG)) {
3129		reg = ir_get_fused_reg(ctx, root, ref * sizeof(ir_ref) + 2);
3130	} else {
3131		reg = ctx->regs[ref][2];
3132	}
3133	return ir_fuse_mem(ctx, root, ref, load_insn, reg);
3134}
3135
3136static int32_t ir_fuse_imm(ir_ctx *ctx, ir_ref ref)
3137{
3138	ir_insn *val_insn = &ctx->ir_base[ref];
3139
3140	IR_ASSERT(IR_IS_CONST_REF(ref));
3141	if (IR_IS_SYM_CONST(val_insn->op)) {
3142		void *addr = ir_sym_val(ctx, val_insn);
3143		IR_ASSERT(IR_IS_SIGNED_32BIT((intptr_t)addr));
3144		return (int32_t)(intptr_t)addr;
3145	} else {
3146		IR_ASSERT(IR_IS_SIGNED_32BIT(val_insn->val.i32));
3147		return val_insn->val.i32;
3148	}
3149}
3150
3151static void ir_emit_prologue(ir_ctx *ctx)
3152{
3153	ir_backend_data *data = ctx->data;
3154	dasm_State **Dst = &data->dasm_state;
3155
3156	if (ctx->flags & IR_USE_FRAME_POINTER) {
3157		|	push Ra(IR_REG_RBP)
3158		|	mov Ra(IR_REG_RBP), Ra(IR_REG_RSP)
3159	}
3160	if (ctx->stack_frame_size + ctx->call_stack_size) {
3161		if (ctx->fixed_stack_red_zone) {
3162			IR_ASSERT(ctx->stack_frame_size + ctx->call_stack_size <= ctx->fixed_stack_red_zone);
3163		} else {
3164			|	sub Ra(IR_REG_RSP), (ctx->stack_frame_size + ctx->call_stack_size)
3165		}
3166	}
3167	if (ctx->used_preserved_regs) {
3168		ir_reg fp;
3169		int offset;
3170		uint32_t i;
3171		ir_regset used_preserved_regs = (ir_regset)ctx->used_preserved_regs;
3172
3173		if (ctx->flags & IR_USE_FRAME_POINTER) {
3174			fp = IR_REG_FRAME_POINTER;
3175			offset = 0;
3176		} else {
3177			fp = IR_REG_STACK_POINTER;
3178			offset = ctx->stack_frame_size + ctx->call_stack_size;
3179		}
3180		for (i = 0; i < IR_REG_NUM; i++) {
3181			if (IR_REGSET_IN(used_preserved_regs, i)) {
3182				if (i < IR_REG_FP_FIRST) {
3183					offset -= sizeof(void*);
3184					|	mov aword [Ra(fp)+offset], Ra(i)
3185				} else {
3186					offset -= sizeof(void*);
3187					if (ctx->mflags & IR_X86_AVX) {
3188						|	vmovsd qword [Ra(fp)+offset], xmm(i-IR_REG_FP_FIRST)
3189					} else {
3190						|	movsd qword [Ra(fp)+offset], xmm(i-IR_REG_FP_FIRST)
3191					}
3192				}
3193			}
3194		}
3195	}
3196	if ((ctx->flags & IR_VARARG_FUNC) && (ctx->flags2 & IR_HAS_VA_START)) {
3197#if defined(_WIN64)
3198		ir_reg fp;
3199		int offset;
3200
3201		if (ctx->flags & IR_USE_FRAME_POINTER) {
3202			fp = IR_REG_FRAME_POINTER;
3203			offset = sizeof(void*) * 2;
3204		} else {
3205			fp = IR_REG_STACK_POINTER;
3206			offset = ctx->stack_frame_size + ctx->call_stack_size + sizeof(void*);
3207		}
3208		|	mov [Ra(fp)+offset], Ra(IR_REG_INT_ARG1)
3209		|	mov [Ra(fp)+offset+8], Ra(IR_REG_INT_ARG2)
3210		|	mov [Ra(fp)+offset+16], Ra(IR_REG_INT_ARG3)
3211		|	mov [Ra(fp)+offset+24], Ra(IR_REG_INT_ARG4)
3212#elif defined(IR_TARGET_X64)
3213|.if X64
3214		const int8_t *int_reg_params = _ir_int_reg_params;
3215		const int8_t *fp_reg_params = _ir_fp_reg_params;
3216		uint32_t i;
3217		ir_reg fp;
3218		int offset;
3219
3220		if (ctx->flags & IR_USE_FRAME_POINTER) {
3221			fp = IR_REG_FRAME_POINTER;
3222
3223			offset = -(ctx->stack_frame_size - ctx->stack_frame_alignment - ctx->locals_area_size);
3224		} else {
3225			fp = IR_REG_STACK_POINTER;
3226			offset = ctx->locals_area_size + ctx->call_stack_size;
3227		}
3228
3229		if ((ctx->flags2 & (IR_HAS_VA_ARG_GP|IR_HAS_VA_COPY)) && ctx->gp_reg_params < IR_REG_INT_ARGS) {
3230			/* skip named args */
3231			offset += sizeof(void*) * ctx->gp_reg_params;
3232			for (i = ctx->gp_reg_params; i < IR_REG_INT_ARGS; i++) {
3233				|	mov qword [Ra(fp)+offset], Rq(int_reg_params[i])
3234				offset += sizeof(void*);
3235			}
3236		}
3237		if ((ctx->flags2 & (IR_HAS_VA_ARG_FP|IR_HAS_VA_COPY)) && ctx->fp_reg_params < IR_REG_FP_ARGS) {
3238			|	test al, al
3239			|	je	>1
3240			/* skip named args */
3241			offset += 16 * ctx->fp_reg_params;
3242			for (i = ctx->fp_reg_params; i < IR_REG_FP_ARGS; i++) {
3243				|	movaps [Ra(fp)+offset], xmm(fp_reg_params[i]-IR_REG_FP_FIRST)
3244				offset += 16;
3245			}
3246			|1:
3247		}
3248|.endif
3249#endif
3250	}
3251}
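/*
 * Illustrative sketch: for a function using a frame pointer, a 0x20-byte
 * frame and rbx in used_preserved_regs, the code above emits roughly
 *   push rbp
 *   mov  rbp, rsp
 *   sub  rsp, 0x20
 *   mov  [rbp-8], rbx
 * (the varargs register-save area is filled in only when va_start is used).
 */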
3252
3253static void ir_emit_epilogue(ir_ctx *ctx)
3254{
3255	ir_backend_data *data = ctx->data;
3256	dasm_State **Dst = &data->dasm_state;
3257
3258	if (ctx->used_preserved_regs) {
3259		int offset;
3260		uint32_t i;
3261		ir_regset used_preserved_regs = (ir_regset)ctx->used_preserved_regs;
3262
3263		if (ctx->flags & IR_USE_FRAME_POINTER) {
3264			offset = 0;
3265		} else {
3266			offset = ctx->stack_frame_size + ctx->call_stack_size;
3267		}
3268		for (i = 0; i < IR_REG_NUM; i++) {
3269			if (IR_REGSET_IN(used_preserved_regs, i)) {
3270				if (i < IR_REG_FP_FIRST) {
3271					ir_reg fp = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
3272
3273					offset -= sizeof(void*);
3274					|	mov Ra(i), aword [Ra(fp)+offset]
3275				} else {
3276					ir_reg fp = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
3277
3278					offset -= sizeof(void*);
3279					if (ctx->mflags & IR_X86_AVX) {
3280						|	vmovsd xmm(i-IR_REG_FP_FIRST), qword [Ra(fp)+offset]
3281					} else {
3282						|	movsd xmm(i-IR_REG_FP_FIRST), qword [Ra(fp)+offset]
3283					}
3284				}
3285			}
3286		}
3287	}
3288
3289	if (ctx->flags & IR_USE_FRAME_POINTER) {
3290		|	mov Ra(IR_REG_RSP), Ra(IR_REG_RBP)
3291		|	pop Ra(IR_REG_RBP)
3292	} else if (ctx->stack_frame_size + ctx->call_stack_size) {
3293		if (ctx->fixed_stack_red_zone) {
3294			IR_ASSERT(ctx->stack_frame_size + ctx->call_stack_size <= ctx->fixed_stack_red_zone);
3295		} else {
3296			|	add Ra(IR_REG_RSP), (ctx->stack_frame_size + ctx->call_stack_size)
3297		}
3298	}
3299}
3300
3301static void ir_emit_binop_int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
3302{
3303	ir_backend_data *data = ctx->data;
3304	dasm_State **Dst = &data->dasm_state;
3305	ir_type type = insn->type;
3306	ir_ref op1 = insn->op1;
3307	ir_ref op2 = insn->op2;
3308	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
3309	ir_reg op1_reg = ctx->regs[def][1];
3310	ir_reg op2_reg = ctx->regs[def][2];
3311
3312	IR_ASSERT(def_reg != IR_REG_NONE);
3313
3314	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
3315		op1_reg = IR_REG_NUM(op1_reg);
3316		ir_emit_load(ctx, type, op1_reg, op1);
3317	}
3318	if (def_reg != op1_reg) {
3319		if (op1_reg != IR_REG_NONE) {
3320			ir_emit_mov(ctx, type, def_reg, op1_reg);
3321		} else {
3322			ir_emit_load(ctx, type, def_reg, op1);
3323		}
3324		if (op1 == op2) {
3325			op2_reg = def_reg;
3326		}
3327	}
3328
3329	if (op2_reg != IR_REG_NONE) {
3330		if (IR_REG_SPILLED(op2_reg) || IR_IS_CONST_REF(op2)) {
3331			op2_reg = IR_REG_NUM(op2_reg);
3332			if (op1 != op2) {
3333				ir_emit_load(ctx, type, op2_reg, op2);
3334			}
3335		}
3336		switch (insn->op) {
3337			default:
3338				IR_ASSERT(0 && "NIY binary op");
3339			case IR_ADD:
3340			case IR_ADD_OV:
3341				|	ASM_REG_REG_OP add, type, def_reg, op2_reg
3342				break;
3343			case IR_SUB:
3344			case IR_SUB_OV:
3345				|	ASM_REG_REG_OP sub, type, def_reg, op2_reg
3346				break;
3347			case IR_MUL:
3348			case IR_MUL_OV:
3349				|	ASM_REG_REG_MUL imul, type, def_reg, op2_reg
3350				break;
3351			case IR_OR:
3352				|	ASM_REG_REG_OP or, type, def_reg, op2_reg
3353				break;
3354			case IR_AND:
3355				|	ASM_REG_REG_OP and, type, def_reg, op2_reg
3356				break;
3357			case IR_XOR:
3358				|	ASM_REG_REG_OP xor, type, def_reg, op2_reg
3359				break;
3360		}
3361	} else if (IR_IS_CONST_REF(op2)) {
3362		int32_t val = ir_fuse_imm(ctx, op2);
3363
3364		switch (insn->op) {
3365			default:
3366				IR_ASSERT(0 && "NIY binary op");
3367			case IR_ADD:
3368			case IR_ADD_OV:
3369				|	ASM_REG_IMM_OP add, type, def_reg, val
3370				break;
3371			case IR_SUB:
3372			case IR_SUB_OV:
3373				|	ASM_REG_IMM_OP sub, type, def_reg, val
3374				break;
3375			case IR_MUL:
3376			case IR_MUL_OV:
3377				|	ASM_REG_IMM_MUL imul, type, def_reg, val
3378				break;
3379			case IR_OR:
3380				|	ASM_REG_IMM_OP or, type, def_reg, val
3381				break;
3382			case IR_AND:
3383				|	ASM_REG_IMM_OP and, type, def_reg, val
3384				break;
3385			case IR_XOR:
3386				|	ASM_REG_IMM_OP xor, type, def_reg, val
3387				break;
3388		}
3389	} else {
3390		ir_mem mem;
3391
3392		if (ir_rule(ctx, op2) & IR_FUSED) {
3393			mem = ir_fuse_load(ctx, def, op2);
3394		} else {
3395			mem = ir_ref_spill_slot(ctx, op2);
3396		}
3397		switch (insn->op) {
3398			default:
3399				IR_ASSERT(0 && "NIY binary op");
3400			case IR_ADD:
3401			case IR_ADD_OV:
3402				|	ASM_REG_MEM_OP add, type, def_reg, mem
3403				break;
3404			case IR_SUB:
3405			case IR_SUB_OV:
3406				|	ASM_REG_MEM_OP sub, type, def_reg, mem
3407				break;
3408			case IR_MUL:
3409			case IR_MUL_OV:
3410				|	ASM_REG_MEM_MUL imul, type, def_reg, mem
3411				break;
3412			case IR_OR:
3413				|	ASM_REG_MEM_OP or, type, def_reg, mem
3414				break;
3415			case IR_AND:
3416				|	ASM_REG_MEM_OP and, type, def_reg, mem
3417				break;
3418			case IR_XOR:
3419				|	ASM_REG_MEM_OP xor, type, def_reg, mem
3420				break;
3421		}
3422	}
3423	if (IR_REG_SPILLED(ctx->regs[def][0])) {
3424		ir_emit_store(ctx, type, def, def_reg);
3425	}
3426}
3427
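/* Three-operand multiply by an immediate ("imul r, r/m, imm"), which avoids
 * the preparatory mov required by the destructive two-operand form; the
 * source may be a register or a fused memory operand. */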
3428static void ir_emit_imul3(ir_ctx *ctx, ir_ref def, ir_insn *insn)
3429{
3430	ir_backend_data *data = ctx->data;
3431	dasm_State **Dst = &data->dasm_state;
3432	ir_type type = insn->type;
3433	ir_ref op1 = insn->op1;
3434	ir_ref op2 = insn->op2;
3435	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
3436	ir_reg op1_reg = ctx->regs[def][1];
3437	int32_t val = ir_fuse_imm(ctx, op2);
3438
3439	IR_ASSERT(def_reg != IR_REG_NONE);
3440	IR_ASSERT(!IR_IS_CONST_REF(op1));
3441
3442	if (op1_reg != IR_REG_NONE) {
3443		if (IR_REG_SPILLED(op1_reg)) {
3444			op1_reg = IR_REG_NUM(op1_reg);
3445			ir_emit_load(ctx, type, op1_reg, op1);
3446		}
3447		switch (ir_type_size[type]) {
3448			default:
3449				IR_ASSERT(0);
3450			case 2:
3451				|	imul Rw(def_reg), Rw(op1_reg), val
3452				break;
3453			case 4:
3454				|	imul Rd(def_reg), Rd(op1_reg), val
3455				break;
3456|.if X64
3457||			case 8:
3458|				imul Rq(def_reg), Rq(op1_reg), val
3459||				break;
3460|.endif
3461		}
3462	} else {
3463		ir_mem mem;
3464
3465		if (ir_rule(ctx, op1) & IR_FUSED) {
3466			mem = ir_fuse_load(ctx, def, op1);
3467		} else {
3468			mem = ir_ref_spill_slot(ctx, op1);
3469		}
3470		|	ASM_REG_MEM_TXT_MUL imul, type, def_reg, mem, val
3471	}
3472	if (IR_REG_SPILLED(ctx->regs[def][0])) {
3473		ir_emit_store(ctx, type, def, def_reg);
3474	}
3475}
3476
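/* Branchless integer MIN/MAX: def = op1, then "cmp def, op2" followed by a
 * cmov picked by operation and signedness (cmovg/cmovl for signed,
 * cmova/cmovb for unsigned). */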
3477static void ir_emit_min_max_int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
3478{
3479	ir_backend_data *data = ctx->data;
3480	dasm_State **Dst = &data->dasm_state;
3481	ir_type type = insn->type;
3482	ir_ref op1 = insn->op1;
3483	ir_ref op2 = insn->op2;
3484	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
3485	ir_reg op1_reg = ctx->regs[def][1];
3486	ir_reg op2_reg = ctx->regs[def][2];
3487
3488	IR_ASSERT(def_reg != IR_REG_NONE && op2_reg != IR_REG_NONE);
3489
3490	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
3491		op1_reg = IR_REG_NUM(op1_reg);
3492		ir_emit_load(ctx, type, op1_reg, op1);
3493	}
3494	if (def_reg != op1_reg) {
3495		if (op1_reg != IR_REG_NONE) {
3496			ir_emit_mov(ctx, type, def_reg, op1_reg);
3497		} else {
3498			ir_emit_load(ctx, type, def_reg, op1);
3499		}
3500	}
3501
3502	if (IR_REG_SPILLED(op2_reg) || IR_IS_CONST_REF(op2)) {
3503		op2_reg = IR_REG_NUM(op2_reg);
3504		if (op1 != op2) {
3505			ir_emit_load(ctx, type, op2_reg, op2);
3506		}
3507	}
3508
3509	if (op1 == op2) {
3510		return;
3511	}
3512
3513	|	ASM_REG_REG_OP cmp, type, def_reg, op2_reg
3514	if (insn->op == IR_MIN) {
3515		if (IR_IS_TYPE_SIGNED(type)) {
3516			|	ASM_REG_REG_OP2 cmovg, type, def_reg, op2_reg
3517		} else {
3518			|	ASM_REG_REG_OP2 cmova, type, def_reg, op2_reg
3519		}
3520	} else {
3521		IR_ASSERT(insn->op == IR_MAX);
3522		if (IR_IS_TYPE_SIGNED(type)) {
3523			|	ASM_REG_REG_OP2 cmovl, type, def_reg, op2_reg
3524		} else {
3525			|	ASM_REG_REG_OP2 cmovb, type, def_reg, op2_reg
3526		}
3527	}
3528
3529	if (IR_REG_SPILLED(ctx->regs[def][0])) {
3530		ir_emit_store(ctx, type, def, def_reg);
3531	}
3532}
3533
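/* Materialize the overflow of the preceding ADD_OV/SUB_OV/MUL_OV as a
 * boolean: signed overflow is in OF (seto), unsigned in CF (setc). */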
3534static void ir_emit_overflow(ir_ctx *ctx, ir_ref def, ir_insn *insn)
3535{
3536	ir_backend_data *data = ctx->data;
3537	dasm_State **Dst = &data->dasm_state;
3538	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
3539	ir_type type = ctx->ir_base[insn->op1].type;
3540
3541	IR_ASSERT(def_reg != IR_REG_NONE);
3542	IR_ASSERT(IR_IS_TYPE_INT(type));
3543	if (IR_IS_TYPE_SIGNED(type)) {
3544		|	seto Rb(def_reg)
3545	} else {
3546		|	setc Rb(def_reg)
3547	}
3548	if (IR_REG_SPILLED(ctx->regs[def][0])) {
3549		ir_emit_store(ctx, insn->type, def, def_reg);
3550	}
3551}
3552
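/* Branch directly on the overflow/carry flag instead of materializing a
 * boolean.  When the true target is the fall-through block the condition is
 * reversed (jo <-> jno, jc <-> jnc) so a single jump suffices. */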
3553static void ir_emit_overflow_and_branch(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn)
3554{
3555	ir_backend_data *data = ctx->data;
3556	dasm_State **Dst = &data->dasm_state;
3557	ir_insn *overflow_insn = &ctx->ir_base[insn->op2];
3558	ir_type type = ctx->ir_base[overflow_insn->op1].type;
3559	uint32_t true_block, false_block, next_block;
3560	bool reverse = 0;
3561
3562	ir_get_true_false_blocks(ctx, b, &true_block, &false_block, &next_block);
3563	if (true_block == next_block) {
3564		reverse = 1;
3565		true_block = false_block;
3566		false_block = 0;
3567	} else if (false_block == next_block) {
3568		false_block = 0;
3569	}
3570
3571	if (IR_IS_TYPE_SIGNED(type)) {
3572		if (reverse) {
3573			|	jno =>true_block
3574		} else {
3575			|	jo =>true_block
3576		}
3577	} else {
3578		if (reverse) {
3579			|	jnc =>true_block
3580		} else {
3581			|	jc =>true_block
3582		}
3583	}
3584	if (false_block) {
3585		|	jmp =>false_block
3586	}
3587}
3588
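/* Fused read-modify-write binop: a STORE/VSTORE of a binop result back to
 * the location it was loaded from becomes a single "op [mem], reg/imm". */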
3589static void ir_emit_mem_binop_int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
3590{
3591	ir_backend_data *data = ctx->data;
3592	dasm_State **Dst = &data->dasm_state;
3593	ir_insn *op_insn = &ctx->ir_base[insn->op3];
3594	ir_type type = op_insn->type;
3595	ir_ref op2 = op_insn->op2;
3596	ir_reg op2_reg = ctx->regs[insn->op3][2];
3597	ir_mem mem;
3598
3599	if (insn->op == IR_STORE) {
3600		mem = ir_fuse_mem(ctx, def, def, insn, ctx->regs[def][2]);
3601	} else {
3602		IR_ASSERT(insn->op == IR_VSTORE);
3603		mem = ir_var_spill_slot(ctx, insn->op2);
3604	}
3605
3606	if (op2_reg == IR_REG_NONE) {
3607		int32_t val = ir_fuse_imm(ctx, op2);
3608
3609		switch (op_insn->op) {
3610			default:
3611				IR_ASSERT(0 && "NIY binary op");
3612			case IR_ADD:
3613			case IR_ADD_OV:
3614				|	ASM_MEM_IMM_OP add, type, mem, val
3615				break;
3616			case IR_SUB:
3617			case IR_SUB_OV:
3618				|	ASM_MEM_IMM_OP sub, type, mem, val
3619				break;
3620			case IR_OR:
3621				|	ASM_MEM_IMM_OP or, type, mem, val
3622				break;
3623			case IR_AND:
3624				|	ASM_MEM_IMM_OP and, type, mem, val
3625				break;
3626			case IR_XOR:
3627				|	ASM_MEM_IMM_OP xor, type, mem, val
3628				break;
3629		}
3630	} else {
3631		if (IR_REG_SPILLED(op2_reg) || IR_IS_CONST_REF(op2)) {
3632			op2_reg = IR_REG_NUM(op2_reg);
3633			ir_emit_load(ctx, type, op2_reg, op2);
3634		}
3635		switch (op_insn->op) {
3636			default:
3637				IR_ASSERT(0 && "NIY binary op");
3638			case IR_ADD:
3639			case IR_ADD_OV:
3640				|	ASM_MEM_REG_OP add, type, mem, op2_reg
3641				break;
3642			case IR_SUB:
3643			case IR_SUB_OV:
3644				|	ASM_MEM_REG_OP sub, type, mem, op2_reg
3645				break;
3646			case IR_OR:
3647				|	ASM_MEM_REG_OP or, type, mem, op2_reg
3648				break;
3649			case IR_AND:
3650				|	ASM_MEM_REG_OP and, type, mem, op2_reg
3651				break;
3652			case IR_XOR:
3653				|	ASM_MEM_REG_OP xor, type, mem, op2_reg
3654				break;
3655		}
3656	}
3657}
3658
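/* RSTORE counterpart of the above: the binop is applied directly to the
 * target machine register named by insn->op3. */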
3659static void ir_emit_reg_binop_int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
3660{
3661	ir_backend_data *data = ctx->data;
3662	dasm_State **Dst = &data->dasm_state;
3663	ir_insn *op_insn = &ctx->ir_base[insn->op2];
3664	ir_type type = op_insn->type;
3665	ir_ref op2 = op_insn->op2;
3666	ir_reg op2_reg = ctx->regs[insn->op2][2];
3667	ir_reg reg;
3668
3669	IR_ASSERT(insn->op == IR_RSTORE);
3670	reg = insn->op3;
3671
3672	if (op2_reg == IR_REG_NONE) {
3673		int32_t val = ir_fuse_imm(ctx, op2);
3674
3675		switch (op_insn->op) {
3676			default:
3677				IR_ASSERT(0 && "NIY binary op");
3678			case IR_ADD:
3679				|	ASM_REG_IMM_OP add, type, reg, val
3680				break;
3681			case IR_SUB:
3682				|	ASM_REG_IMM_OP sub, type, reg, val
3683				break;
3684			case IR_OR:
3685				|	ASM_REG_IMM_OP or, type, reg, val
3686				break;
3687			case IR_AND:
3688				|	ASM_REG_IMM_OP and, type, reg, val
3689				break;
3690			case IR_XOR:
3691				|	ASM_REG_IMM_OP xor, type, reg, val
3692				break;
3693		}
3694	} else {
3695		if (IR_REG_SPILLED(op2_reg) || IR_IS_CONST_REF(op2)) {
3696			op2_reg = IR_REG_NUM(op2_reg);
3697			ir_emit_load(ctx, type, op2_reg, op2);
3698		}
3699		switch (op_insn->op) {
3700			default:
3701				IR_ASSERT(0 && "NIY binary op");
3702			case IR_ADD:
3703				|	ASM_REG_REG_OP add, type, reg, op2_reg
3704				break;
3705			case IR_SUB:
3706				|	ASM_REG_REG_OP sub, type, reg, op2_reg
3707				break;
3708			case IR_OR:
3709				|	ASM_REG_REG_OP or, type, reg, op2_reg
3710				break;
3711			case IR_AND:
3712				|	ASM_REG_REG_OP and, type, reg, op2_reg
3713				break;
3714			case IR_XOR:
3715				|	ASM_REG_REG_OP xor, type, reg, op2_reg
3716				break;
3717		}
3718	}
3719}
3720
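/* Strength-reduce MUL/DIV/MOD by a power-of-two constant (unsigned for
 * DIV/MOD; the signed cases use the helpers below):
 *   x * 2^n -> shl n ("add r, r" for n == 1), x / 2^n -> shr n,
 *   x % 2^n -> and (2^n - 1), e.g. x * 8 -> x << 3 and x % 8 -> x & 7.
 * A 64-bit mask that does not fit in an imm32 is staged in a scratch
 * register first. */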
3721static void ir_emit_mul_div_mod_pwr2(ir_ctx *ctx, ir_ref def, ir_insn *insn)
3722{
3723	ir_backend_data *data = ctx->data;
3724	dasm_State **Dst = &data->dasm_state;
3725	ir_type type = insn->type;
3726	ir_ref op1 = insn->op1;
3727	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
3728	ir_reg op1_reg = ctx->regs[def][1];
3729
3730	IR_ASSERT(IR_IS_CONST_REF(insn->op2));
3731	IR_ASSERT(!IR_IS_SYM_CONST(ctx->ir_base[insn->op2].op));
3732	IR_ASSERT(def_reg != IR_REG_NONE);
3733
3734	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
3735		op1_reg = IR_REG_NUM(op1_reg);
3736		ir_emit_load(ctx, type, op1_reg, op1);
3737	}
3738	if (def_reg != op1_reg) {
3739		if (op1_reg != IR_REG_NONE) {
3740			ir_emit_mov(ctx, type, def_reg, op1_reg);
3741		} else {
3742			ir_emit_load(ctx, type, def_reg, op1);
3743		}
3744	}
3745	if (insn->op == IR_MUL) {
3746		uint32_t shift = IR_LOG2(ctx->ir_base[insn->op2].val.u64);
3747
3748		if (shift == 1) {
3749			|	ASM_REG_REG_OP add, type, def_reg, def_reg
3750		} else {
3751			|	ASM_REG_IMM_OP shl, type, def_reg, shift
3752		}
3753	} else if (insn->op == IR_DIV) {
3754		uint32_t shift = IR_LOG2(ctx->ir_base[insn->op2].val.u64);
3755
3756		|	ASM_REG_IMM_OP shr, type, def_reg, shift
3757	} else {
3758		IR_ASSERT(insn->op == IR_MOD);
3759		uint64_t mask = ctx->ir_base[insn->op2].val.u64 - 1;
3760
3761|.if X64
3762||		if (ir_type_size[type] == 8 && ctx->regs[def][2] != IR_REG_NONE) {
3763||			ir_reg op2_reg = ctx->regs[def][2];
3764||
3765||			ir_emit_load_imm_int(ctx, type, op2_reg, mask);
3766			|	ASM_REG_REG_OP and, type, def_reg, op2_reg
3767||		} else {
3768|.endif
3769			|	ASM_REG_IMM_OP and, type, def_reg, mask
3770|.if X64
3771||		}
3772|.endif
3773	}
3774	if (IR_REG_SPILLED(ctx->regs[def][0])) {
3775		ir_emit_store(ctx, type, def, def_reg);
3776	}
3777}
3778
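/* Signed division by 2^n.  An arithmetic shift alone would round toward
 * negative infinity, so negative dividends are biased by (2^n - 1) first:
 *   def = (op1 < 0) ? op1 + (2^n - 1) : op1;  def >>= n (sar)
 * For n == 1 the bias is just the sign bit (logical shift right by w-1);
 * otherwise lea adds the bias unconditionally and cmovns restores the
 * unbiased value for non-negative inputs. */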
3779static void ir_emit_sdiv_pwr2(ir_ctx *ctx, ir_ref def, ir_insn *insn)
3780{
3781	ir_backend_data *data = ctx->data;
3782	dasm_State **Dst = &data->dasm_state;
3783	ir_type type = insn->type;
3784	ir_ref op1 = insn->op1;
3785	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
3786	ir_reg op1_reg = ctx->regs[def][1];
3787	uint32_t shift = IR_LOG2(ctx->ir_base[insn->op2].val.u64);
3788	int64_t offset = ctx->ir_base[insn->op2].val.u64 - 1;
3789
3790	IR_ASSERT(shift != 0);
3791	IR_ASSERT(IR_IS_CONST_REF(insn->op2));
3792	IR_ASSERT(!IR_IS_SYM_CONST(ctx->ir_base[insn->op2].op));
3793	IR_ASSERT(op1_reg != IR_REG_NONE && def_reg != IR_REG_NONE && op1_reg != def_reg);
3794
3795	if (IR_REG_SPILLED(op1_reg) || IR_IS_CONST_REF(op1)) {
3796		op1_reg = IR_REG_NUM(op1_reg);
3797		ir_emit_load(ctx, type, op1_reg, op1);
3798	}
3799
3800	if (shift == 1) {
3801|.if X64
3802||		if (ir_type_size[type] == 8) {
3803			|	mov Rq(def_reg), Rq(op1_reg)
3804			|	ASM_REG_IMM_OP shr, type, def_reg, 63
3805			|	add Rq(def_reg), Rq(op1_reg)
3806||		} else {
3807|.endif
3808			|	mov Rd(def_reg), Rd(op1_reg)
3809			|	ASM_REG_IMM_OP shr, type, def_reg, (ir_type_size[type]*8-1)
3810			|	add Rd(def_reg), Rd(op1_reg)
3811|.if X64
3812||		}
3813|.endif
3814	} else {
3815|.if X64
3816||		if (ir_type_size[type] == 8) {
3817||			ir_reg op2_reg = ctx->regs[def][2];
3818||
3819||			if (op2_reg != IR_REG_NONE) {
3820||				ir_emit_load_imm_int(ctx, type, op2_reg, offset);
3821				|	lea Rq(def_reg), [Rq(op1_reg)+Rq(op2_reg)]
3822||			} else {
3823				|	lea Rq(def_reg), [Rq(op1_reg)+(int32_t)offset]
3824||			}
3825||		} else {
3826|.endif
3827			|	lea Rd(def_reg), [Rd(op1_reg)+(int32_t)offset]
3828|.if X64
3829||		}
3830|.endif
3831		|	ASM_REG_REG_OP test, type, op1_reg, op1_reg
3832		|	ASM_REG_REG_OP2 cmovns, type, def_reg, op1_reg
3833	}
3834	|	ASM_REG_IMM_OP sar, type, def_reg, shift
3835
3836	if (IR_REG_SPILLED(ctx->regs[def][0])) {
3837		ir_emit_store(ctx, type, def, def_reg);
3838	}
3839}
3840
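/* Signed modulo by 2^n via the same bias, without division:
 *   t    = op1 >> (w-1);     arithmetic shift: all ones iff op1 < 0
 *   t  >>= w - n;            logical shift: low n bits set iff op1 < 0
 *   def  = ((op1 + t) & (2^n - 1)) - t;
 * e.g. for op1 = -5, n = 2: t = 3 and def = ((-5 + 3) & 3) - 3 = -1. */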
3841static void ir_emit_smod_pwr2(ir_ctx *ctx, ir_ref def, ir_insn *insn)
3842{
3843	ir_backend_data *data = ctx->data;
3844	dasm_State **Dst = &data->dasm_state;
3845	ir_type type = insn->type;
3846	ir_ref op1 = insn->op1;
3847	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
3848	ir_reg op1_reg = ctx->regs[def][1];
3849	ir_reg tmp_reg = ctx->regs[def][3];
3850	uint32_t shift = IR_LOG2(ctx->ir_base[insn->op2].val.u64);
3851	uint64_t mask = ctx->ir_base[insn->op2].val.u64 - 1;
3852
3853	IR_ASSERT(shift != 0);
3854	IR_ASSERT(IR_IS_CONST_REF(insn->op2));
3855	IR_ASSERT(!IR_IS_SYM_CONST(ctx->ir_base[insn->op2].op));
3856	IR_ASSERT(def_reg != IR_REG_NONE && tmp_reg != IR_REG_NONE && def_reg != tmp_reg);
3857
3858	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
3859		op1_reg = IR_REG_NUM(op1_reg);
3860		ir_emit_load(ctx, type, op1_reg, op1);
3861	}
3862	if (def_reg != op1_reg) {
3863		if (op1_reg != IR_REG_NONE) {
3864			ir_emit_mov(ctx, type, def_reg, op1_reg);
3865		} else {
3866			ir_emit_load(ctx, type, def_reg, op1);
3867		}
3868	}
3869	if (tmp_reg != op1_reg) {
3870		ir_emit_mov(ctx, type, tmp_reg, def_reg);
3871	}
3872
3874	if (shift == 1) {
3875		|	ASM_REG_IMM_OP shr, type, tmp_reg, (ir_type_size[type]*8-1)
3876	} else {
3877		|	ASM_REG_IMM_OP sar, type, tmp_reg, (ir_type_size[type]*8-1)
3878		|	ASM_REG_IMM_OP shr, type, tmp_reg, (ir_type_size[type]*8-shift)
3879	}
3880	|	ASM_REG_REG_OP add, type, def_reg, tmp_reg
3881
3882|.if X64
3883||	if (ir_type_size[type] == 8 && ctx->regs[def][2] != IR_REG_NONE) {
3884||		ir_reg op2_reg = ctx->regs[def][2];
3885||
3886||		ir_emit_load_imm_int(ctx, type, op2_reg, mask);
3887		|	ASM_REG_REG_OP and, type, def_reg, op2_reg
3888||	} else {
3889|.endif
3890		|	ASM_REG_IMM_OP and, type, def_reg, mask
3891|.if X64
3892||	}
3893|.endif
3894
3895	|	ASM_REG_REG_OP sub, type, def_reg, tmp_reg
3896
3897	if (IR_REG_SPILLED(ctx->regs[def][0])) {
3898		ir_emit_store(ctx, type, def, def_reg);
3899	}
3900}
3901
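/* Memory (RMW) variant of the power-of-two strength reduction: shl/shr/and
 * applied directly to [mem]. */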
3902static void ir_emit_mem_mul_div_mod_pwr2(ir_ctx *ctx, ir_ref def, ir_insn *insn)
3903{
3904	ir_backend_data *data = ctx->data;
3905	dasm_State **Dst = &data->dasm_state;
3906	ir_insn *op_insn = &ctx->ir_base[insn->op3];
3907	ir_type type = op_insn->type;
3908	ir_mem mem;
3909
3910	IR_ASSERT(IR_IS_CONST_REF(op_insn->op2));
3911	IR_ASSERT(!IR_IS_SYM_CONST(ctx->ir_base[op_insn->op2].op));
3912
3913	if (insn->op == IR_STORE) {
3914		mem = ir_fuse_mem(ctx, def, def, insn, ctx->regs[def][2]);
3915	} else {
3916		IR_ASSERT(insn->op == IR_VSTORE);
3917		mem = ir_var_spill_slot(ctx, insn->op2);
3918	}
3919
3920	if (op_insn->op == IR_MUL) {
3921		uint32_t shift = IR_LOG2(ctx->ir_base[op_insn->op2].val.u64);
3922		|	ASM_MEM_IMM_OP shl, type, mem, shift
3923	} else if (op_insn->op == IR_DIV) {
3924		uint32_t shift = IR_LOG2(ctx->ir_base[op_insn->op2].val.u64);
3925		|	ASM_MEM_IMM_OP shr, type, mem, shift
3926	} else {
3927		IR_ASSERT(op_insn->op == IR_MOD);
3928		uint64_t mask = ctx->ir_base[op_insn->op2].val.u64 - 1;
3929		IR_ASSERT(IR_IS_UNSIGNED_32BIT(mask));
3930		|	ASM_MEM_IMM_OP and, type, mem, mask
3931	}
3932}
3933
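/* Variable shifts and rotates.  x86 takes the count only in CL, so op2 is
 * funneled into RCX (the register constraints guarantee def_reg != RCX), and
 * op1 is rescued into def_reg first if it happens to live in RCX. */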
3934static void ir_emit_shift(ir_ctx *ctx, ir_ref def, ir_insn *insn)
3935{
3936	ir_backend_data *data = ctx->data;
3937	dasm_State **Dst = &data->dasm_state;
3938	ir_type type = insn->type;
3939	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
3940	ir_reg op1_reg = ctx->regs[def][1];
3941	ir_reg op2_reg = ctx->regs[def][2];
3942
3943	IR_ASSERT(def_reg != IR_REG_NONE && def_reg != IR_REG_RCX);
3944	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
3945		op1_reg = IR_REG_NUM(op1_reg);
3946		ir_emit_load(ctx, type, op1_reg, insn->op1);
3947	}
3948	if (op2_reg != IR_REG_NONE && IR_REG_SPILLED(op2_reg)) {
3949		op2_reg = IR_REG_NUM(op2_reg);
3950		ir_emit_load(ctx, type, op2_reg, insn->op2);
3951	}
3952	if (op2_reg != IR_REG_RCX) {
3953		if (op1_reg == IR_REG_RCX) {
3954			ir_emit_mov(ctx, type, def_reg, op1_reg);
3955			op1_reg = def_reg;
3956		}
3957		if (op2_reg != IR_REG_NONE) {
3958			ir_emit_mov(ctx, type, IR_REG_RCX, op2_reg);
3959		} else {
3960			ir_emit_load(ctx, type, IR_REG_RCX, insn->op2);
3961		}
3962	}
3963	if (def_reg != op1_reg) {
3964		if (op1_reg != IR_REG_NONE) {
3965			ir_emit_mov(ctx, type, def_reg, op1_reg);
3966		} else {
3967			ir_emit_load(ctx, type, def_reg, insn->op1);
3968		}
3969	}
3970	switch (insn->op) {
3971		default:
3972			IR_ASSERT(0);
3973		case IR_SHL:
3974			|	ASM_REG_TXT_OP shl, insn->type, def_reg, cl
3975			break;
3976		case IR_SHR:
3977			|	ASM_REG_TXT_OP shr, insn->type, def_reg, cl
3978			break;
3979		case IR_SAR:
3980			|	ASM_REG_TXT_OP sar, insn->type, def_reg, cl
3981			break;
3982		case IR_ROL:
3983			|	ASM_REG_TXT_OP rol, insn->type, def_reg, cl
3984			break;
3985		case IR_ROR:
3986			|	ASM_REG_TXT_OP ror, insn->type, def_reg, cl
3987			break;
3988	}
3989	if (IR_REG_SPILLED(ctx->regs[def][0])) {
3990		ir_emit_store(ctx, type, def, def_reg);
3991	}
3992}
3993
3994static void ir_emit_mem_shift(ir_ctx *ctx, ir_ref def, ir_insn *insn)
3995{
3996	ir_backend_data *data = ctx->data;
3997	dasm_State **Dst = &data->dasm_state;
3998	ir_insn *op_insn = &ctx->ir_base[insn->op3];
3999	ir_type type = op_insn->type;
4000	ir_ref op2 = op_insn->op2;
4001	ir_reg op2_reg = ctx->regs[insn->op3][2];
4002	ir_mem mem;
4003
4004	if (insn->op == IR_STORE) {
4005		mem = ir_fuse_mem(ctx, def, def, insn, ctx->regs[def][2]);
4006	} else {
4007		IR_ASSERT(insn->op == IR_VSTORE);
4008		mem = ir_var_spill_slot(ctx, insn->op2);
4009	}
4010
4011	if (op2_reg != IR_REG_NONE && IR_REG_SPILLED(op2_reg)) {
4012		op2_reg = IR_REG_NUM(op2_reg);
4013		ir_emit_load(ctx, type, op2_reg, op2);
4014	}
4015	if (op2_reg != IR_REG_RCX) {
4016		if (op2_reg != IR_REG_NONE) {
4017			ir_emit_mov(ctx, type, IR_REG_RCX, op2_reg);
4018		} else {
4019			ir_emit_load(ctx, type, IR_REG_RCX, op2);
4020		}
4021	}
4022	switch (op_insn->op) {
4023		default:
4024			IR_ASSERT(0);
4025		case IR_SHL:
4026			|	ASM_MEM_TXT_OP shl, type, mem, cl
4027			break;
4028		case IR_SHR:
4029			|	ASM_MEM_TXT_OP shr, type, mem, cl
4030			break;
4031		case IR_SAR:
4032			|	ASM_MEM_TXT_OP sar, type, mem, cl
4033			break;
4034		case IR_ROL:
4035			|	ASM_MEM_TXT_OP rol, type, mem, cl
4036			break;
4037		case IR_ROR:
4038			|	ASM_MEM_TXT_OP ror, type, mem, cl
4039			break;
4040	}
4041}
4042
4043static void ir_emit_shift_const(ir_ctx *ctx, ir_ref def, ir_insn *insn)
4044{
4045	ir_backend_data *data = ctx->data;
4046	dasm_State **Dst = &data->dasm_state;
4047	int32_t shift;
4048	ir_type type = insn->type;
4049	ir_ref op1 = insn->op1;
4050	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
4051	ir_reg op1_reg = ctx->regs[def][1];
4052
4053	IR_ASSERT(!IR_IS_SYM_CONST(ctx->ir_base[insn->op2].op));
4054	IR_ASSERT(IR_IS_SIGNED_32BIT(ctx->ir_base[insn->op2].val.i64));
4055	shift = ctx->ir_base[insn->op2].val.i32;
4056	IR_ASSERT(def_reg != IR_REG_NONE);
4057
4058	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
4059		op1_reg = IR_REG_NUM(op1_reg);
4060		ir_emit_load(ctx, type, op1_reg, op1);
4061	}
4062	if (def_reg != op1_reg) {
4063		if (op1_reg != IR_REG_NONE) {
4064			ir_emit_mov(ctx, type, def_reg, op1_reg);
4065		} else {
4066			ir_emit_load(ctx, type, def_reg, op1);
4067		}
4068	}
4069	switch (insn->op) {
4070		default:
4071			IR_ASSERT(0);
4072		case IR_SHL:
4073			|	ASM_REG_IMM_OP shl, insn->type, def_reg, shift
4074			break;
4075		case IR_SHR:
4076			|	ASM_REG_IMM_OP shr, insn->type, def_reg, shift
4077			break;
4078		case IR_SAR:
4079			|	ASM_REG_IMM_OP sar, insn->type, def_reg, shift
4080			break;
4081		case IR_ROL:
4082			|	ASM_REG_IMM_OP rol, insn->type, def_reg, shift
4083			break;
4084		case IR_ROR:
4085			|	ASM_REG_IMM_OP ror, insn->type, def_reg, shift
4086			break;
4087	}
4088	if (IR_REG_SPILLED(ctx->regs[def][0])) {
4089		ir_emit_store(ctx, type, def, def_reg);
4090	}
4091}
4092
4093static void ir_emit_mem_shift_const(ir_ctx *ctx, ir_ref def, ir_insn *insn)
4094{
4095	ir_backend_data *data = ctx->data;
4096	dasm_State **Dst = &data->dasm_state;
4097	ir_insn *op_insn = &ctx->ir_base[insn->op3];
4098	ir_type type = op_insn->type;
4099	int32_t shift;
4100	ir_mem mem;
4101
4102	IR_ASSERT(IR_IS_CONST_REF(op_insn->op2));
4103	IR_ASSERT(!IR_IS_SYM_CONST(ctx->ir_base[op_insn->op2].op));
4104	IR_ASSERT(IR_IS_SIGNED_32BIT(ctx->ir_base[op_insn->op2].val.i64));
4105	shift = ctx->ir_base[op_insn->op2].val.i32;
4106	if (insn->op == IR_STORE) {
4107		mem = ir_fuse_mem(ctx, def, def, insn, ctx->regs[def][2]);
4108	} else {
4109		IR_ASSERT(insn->op == IR_VSTORE);
4110		mem = ir_var_spill_slot(ctx, insn->op2);
4111	}
4112
4113	switch (op_insn->op) {
4114		default:
4115			IR_ASSERT(0);
4116		case IR_SHL:
4117			|	ASM_MEM_IMM_OP shl, type, mem, shift
4118			break;
4119		case IR_SHR:
4120			|	ASM_MEM_IMM_OP shr, type, mem, shift
4121			break;
4122		case IR_SAR:
4123			|	ASM_MEM_IMM_OP sar, type, mem, shift
4124			break;
4125		case IR_ROL:
4126			|	ASM_MEM_IMM_OP rol, type, mem, shift
4127			break;
4128		case IR_ROR:
4129			|	ASM_MEM_IMM_OP ror, type, mem, shift
4130			break;
4131	}
4132}
4133
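/* Single-operand integer ops: INC/DEC (rules fused by the instruction
 * selector), NOT, NEG and BSWAP (only 32- and 64-bit operands reach the
 * BSWAP case). */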
4134static void ir_emit_op_int(ir_ctx *ctx, ir_ref def, ir_insn *insn, uint32_t rule)
4135{
4136	ir_backend_data *data = ctx->data;
4137	dasm_State **Dst = &data->dasm_state;
4138	ir_type type = insn->type;
4139	ir_ref op1 = insn->op1;
4140	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
4141	ir_reg op1_reg = ctx->regs[def][1];
4142
4143	IR_ASSERT(def_reg != IR_REG_NONE);
4144
4145	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
4146		op1_reg = IR_REG_NUM(op1_reg);
4147		ir_emit_load(ctx, type, op1_reg, op1);
4148	}
4149	if (def_reg != op1_reg) {
4150		if (op1_reg != IR_REG_NONE) {
4151			ir_emit_mov(ctx, type, def_reg, op1_reg);
4152		} else {
4153			ir_emit_load(ctx, type, def_reg, op1);
4154		}
4155	}
4156	if (rule == IR_INC) {
4157		|	ASM_REG_OP inc, insn->type, def_reg
4158	} else if (rule == IR_DEC) {
4159		|	ASM_REG_OP dec, insn->type, def_reg
4160	} else if (insn->op == IR_NOT) {
4161		|	ASM_REG_OP not, insn->type, def_reg
4162	} else if (insn->op == IR_NEG) {
4163		|	ASM_REG_OP neg, insn->type, def_reg
4164	} else {
4165		IR_ASSERT(insn->op == IR_BSWAP);
4166		switch (ir_type_size[insn->type]) {
4167			default:
4168				IR_ASSERT(0);
4169			case 4:
4170				|	bswap Rd(def_reg)
4171				break;
4172			case 8:
4173				IR_ASSERT(sizeof(void*) == 8);
4174|.if X64
4175				|	bswap Rq(def_reg)
4176|.endif
4177				break;
4178		}
4179	}
4180	if (IR_REG_SPILLED(ctx->regs[def][0])) {
4181		ir_emit_store(ctx, type, def, def_reg);
4182	}
4183}
4184
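/* CTLZ/CTTZ/CTPOP for register or memory operands.  With BMI1/POPCNT the
 * direct lzcnt/tzcnt/popcnt encodings are used.  Otherwise CTLZ falls back
 * to bsr plus "xor (w-1)" -- bsr yields the index of the highest set bit,
 * and idx ^ (w-1) == (w-1) - idx for idx < w -- and CTTZ falls back to bsf.
 * Byte operands are zero-extended and routed through the 32-bit path. */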
4185static void ir_emit_bit_count(ir_ctx *ctx, ir_ref def, ir_insn *insn)
4186{
4187	ir_backend_data *data = ctx->data;
4188	dasm_State **Dst = &data->dasm_state;
4189	ir_type type = insn->type;
4190	ir_ref op1 = insn->op1;
4191	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
4192	ir_reg op1_reg = ctx->regs[def][1];
4193
4194	IR_ASSERT(def_reg != IR_REG_NONE);
4195
4196	if (op1_reg != IR_REG_NONE) {
4197		if (IR_REG_SPILLED(op1_reg) || IR_IS_CONST_REF(op1)) {
4198			op1_reg = IR_REG_NUM(op1_reg);
4199			ir_emit_load(ctx, type, op1_reg, op1);
4200		}
4201		switch (ir_type_size[insn->type]) {
4202			default:
4203				IR_ASSERT(0);
4204			case 2:
4205				if (insn->op == IR_CTLZ) {
4206					if (ctx->mflags & IR_X86_BMI1) {
4207						|	lzcnt Rw(def_reg), Rw(op1_reg)
4208					} else {
4209						|	bsr Rw(def_reg), Rw(op1_reg)
4210						|	xor Rw(def_reg), 0xf
4211					}
4212				} else if (insn->op == IR_CTTZ) {
4213					if (ctx->mflags & IR_X86_BMI1) {
4214						|	tzcnt Rw(def_reg), Rw(op1_reg)
4215					} else {
4216						|	bsf Rw(def_reg), Rw(op1_reg)
4217					}
4218				} else {
4219					IR_ASSERT(insn->op == IR_CTPOP);
4220					|	popcnt Rw(def_reg), Rw(op1_reg)
4221				}
4222				break;
4223			case 1:
4224				|	movzx Rd(op1_reg), Rb(op1_reg)
4225				if (insn->op == IR_CTLZ) {
4226					if (ctx->mflags & IR_X86_BMI1) {
4227						|	lzcnt Rd(def_reg), Rd(op1_reg)
4228						|	sub Rd(def_reg), 24
4229					} else {
4230						|	bsr Rd(def_reg), Rd(op1_reg)
4231						|	xor Rw(def_reg), 0x7
4232					}
4233					break;
4234				}
4235				IR_FALLTHROUGH;
4236			case 4:
4237				if (insn->op == IR_CTLZ) {
4238					if (ctx->mflags & IR_X86_BMI1) {
4239						|	lzcnt Rd(def_reg), Rd(op1_reg)
4240					} else {
4241						|	bsr Rd(def_reg), Rd(op1_reg)
4242						|	xor Rw(def_reg), 0x1f
4243					}
4244				} else if (insn->op == IR_CTTZ) {
4245					if (ctx->mflags & IR_X86_BMI1) {
4246						|	tzcnt Rd(def_reg), Rd(op1_reg)
4247					} else {
4248						|	bsf Rd(def_reg), Rd(op1_reg)
4249					}
4250				} else {
4251					IR_ASSERT(insn->op == IR_CTPOP);
4252					|	popcnt Rd(def_reg), Rd(op1_reg)
4253				}
4254				break;
4255|.if X64
4256			case 8:
4257				if (insn->op == IR_CTLZ) {
4258					if (ctx->mflags & IR_X86_BMI1) {
4259						|	lzcnt Rq(def_reg), Rq(op1_reg)
4260					} else {
4261						|	bsr Rq(def_reg), Rq(op1_reg)
4262						|	xor Rw(def_reg), 0x3f
4263					}
4264				} else if (insn->op == IR_CTTZ) {
4265					if (ctx->mflags & IR_X86_BMI1) {
4266						|	tzcnt Rq(def_reg), Rq(op1_reg)
4267					} else {
4268						|	bsf Rq(def_reg), Rq(op1_reg)
4269					}
4270				} else {
4271					IR_ASSERT(insn->op == IR_CTPOP);
4272					|	popcnt Rq(def_reg), Rq(op1_reg)
4273				}
4274				break;
4275|.endif
4276		}
4277	} else {
4278		ir_mem mem;
4279
4280		if (ir_rule(ctx, op1) & IR_FUSED) {
4281			mem = ir_fuse_load(ctx, def, op1);
4282		} else {
4283			mem = ir_ref_spill_slot(ctx, op1);
4284		}
4285		switch (ir_type_size[insn->type]) {
4286			default:
4287				IR_ASSERT(0);
4288			case 2:
4289				if (insn->op == IR_CTLZ) {
4290					if (ctx->mflags & IR_X86_BMI1) {
4291						|	ASM_TXT_TMEM_OP lzcnt, Rw(def_reg), word, mem
4292					} else {
4293						|	ASM_TXT_TMEM_OP bsr, Rw(def_reg), word, mem
4294						|	xor Rw(def_reg), 0xf
4295					}
4296				} else if (insn->op == IR_CTTZ) {
4297					if (ctx->mflags & IR_X86_BMI1) {
4298						|	ASM_TXT_TMEM_OP tzcnt, Rw(def_reg), word, mem
4299					} else {
4300						|	ASM_TXT_TMEM_OP bsf, Rw(def_reg), word, mem
4301					}
4302				} else {
4303					|	ASM_TXT_TMEM_OP popcnt, Rw(def_reg), word, mem
4304				}
4305				break;
4306			case 4:
4307				if (insn->op == IR_CTLZ) {
4308					if (ctx->mflags & IR_X86_BMI1) {
4309						|	ASM_TXT_TMEM_OP lzcnt, Rd(def_reg), dword, mem
4310					} else {
4311						|	ASM_TXT_TMEM_OP bsr, Rd(def_reg), dword, mem
4312						|	xor Rw(def_reg), 0x1f
4313					}
4314				} else if (insn->op == IR_CTTZ) {
4315					if (ctx->mflags & IR_X86_BMI1) {
4316						|	ASM_TXT_TMEM_OP tzcnt, Rd(def_reg), dword, mem
4317					} else {
4318						|	ASM_TXT_TMEM_OP bsf, Rd(def_reg), dword, mem
4319					}
4320				} else {
4321					|	ASM_TXT_TMEM_OP popcnt, Rd(def_reg), dword, mem
4322				}
4323				break;
4324|.if X64
4325			case 8:
4326				if (insn->op == IR_CTLZ) {
4327					if (ctx->mflags & IR_X86_BMI1) {
4328						|	ASM_TXT_TMEM_OP lzcnt, Rq(def_reg), qword, mem
4329					} else {
4330						|	ASM_TXT_TMEM_OP bsr, Rq(def_reg), qword, mem
4331						|	xor Rw(def_reg), 0x3f
4332					}
4333				} else if (insn->op == IR_CTTZ) {
4334					if (ctx->mflags & IR_X86_BMI1) {
4335						|	ASM_TXT_TMEM_OP tzcnt, Rq(def_reg), qword, mem
4336					} else {
4337						|	ASM_TXT_TMEM_OP bsf, Rq(def_reg), qword, mem
4338					}
4339				} else {
4340					|	ASM_TXT_TMEM_OP popcnt, Rq(def_reg), qword, mem
4341				}
4342				break;
4343|.endif
4344		}
4345	}
4346
4347	if (IR_REG_SPILLED(ctx->regs[def][0])) {
4348		ir_emit_store(ctx, type, def, def_reg);
4349	}
4350}
4351
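/* Fallback CTPOP without the popcnt instruction: the classic SWAR reduction
 * (sum adjacent bits, then pairs, then nibbles, then bytes):
 *   x -= (x >> 1) & 0x55...;
 *   x  = (x & 0x33...) + ((x >> 2) & 0x33...);
 *   x  = (x + (x >> 4)) & 0x0f...;
 *   x  = (x * 0x0101...) >> (w - 8);
 * The 1- and 2-byte cases fold the final bytes with shifts instead of the
 * multiply, and the 64-bit masks are staged in const_reg via mov64 because
 * they do not fit in an imm32. */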
4352static void ir_emit_ctpop(ir_ctx *ctx, ir_ref def, ir_insn *insn)
4353{
4354	ir_backend_data *data = ctx->data;
4355	dasm_State **Dst = &data->dasm_state;
4356	ir_type type = insn->type;
4357	ir_ref op1 = insn->op1;
4358	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
4359	ir_reg op1_reg = ctx->regs[def][1];
4360	ir_reg tmp_reg = ctx->regs[def][2];
4361|.if X64
4362||	ir_reg const_reg = ctx->regs[def][3];
4363|.endif
4364
4365	IR_ASSERT(def_reg != IR_REG_NONE && tmp_reg != IR_REG_NONE);
4366	if (IR_IS_CONST_REF(op1) || op1_reg == IR_REG_NONE) {
4367		ir_emit_load(ctx, type, def_reg, op1);
4368		if (ir_type_size[insn->type] == 1) {
4369			|	movzx Rd(def_reg), Rb(def_reg)
4370		} else if (ir_type_size[insn->type] == 2) {
4371			|	movzx Rd(def_reg), Rw(def_reg)
4372		}
4373	} else {
4374		if (IR_REG_SPILLED(op1_reg)) {
4375			op1_reg = IR_REG_NUM(op1_reg);
4376			ir_emit_load(ctx, type, op1_reg, op1);
4377		}
4378		switch (ir_type_size[insn->type]) {
4379			default:
4380				IR_ASSERT(0);
4381			case 1:
4382				|	movzx Rd(def_reg), Rb(op1_reg)
4383				break;
4384			case 2:
4385				|	movzx Rd(def_reg), Rw(op1_reg)
4386				break;
4387			case 4:
4388				|	mov Rd(def_reg), Rd(op1_reg)
4389				break;
4390|.if X64
4391||			case 8:
4392				|	mov Rq(def_reg), Rq(op1_reg)
4393||				break;
4394|.endif
4395		}
4396	}
4397	switch (ir_type_size[insn->type]) {
4398		default:
4399			IR_ASSERT(0);
4400		case 1:
4401			|	mov Rd(tmp_reg), Rd(def_reg)
4402			|	shr Rd(def_reg), 1
4403			|	and Rd(def_reg), 0x55
4404			|	sub Rd(tmp_reg), Rd(def_reg)
4405			|	mov Rd(def_reg), Rd(tmp_reg)
4406			|	and Rd(def_reg), 0x33
4407			|	shr Rd(tmp_reg), 2
4408			|	and Rd(tmp_reg), 0x33
4409			|	add Rd(tmp_reg), Rd(def_reg)
4410			|	mov Rd(def_reg), Rd(tmp_reg)
4411			|	shr Rd(def_reg), 4
4412			|	add Rd(def_reg), Rd(tmp_reg)
4413			|	and Rd(def_reg), 0x0f
4414			break;
4415		case 2:
4416			|	mov Rd(tmp_reg), Rd(def_reg)
4417			|	shr Rd(def_reg), 1
4418			|	and Rd(def_reg), 0x5555
4419			|	sub Rd(tmp_reg), Rd(def_reg)
4420			|	mov Rd(def_reg), Rd(tmp_reg)
4421			|	and Rd(def_reg), 0x3333
4422			|	shr Rd(tmp_reg), 2
4423			|	and Rd(tmp_reg), 0x3333
4424			|	add Rd(tmp_reg), Rd(def_reg)
4425			|	mov Rd(def_reg), Rd(tmp_reg)
4426			|	shr Rd(def_reg), 4
4427			|	add Rd(def_reg), Rd(tmp_reg)
4428			|	and Rd(def_reg), 0x0f0f
4429			|	mov Rd(tmp_reg), Rd(def_reg)
4430			|	shr Rd(tmp_reg), 8
4431			|	and Rd(def_reg), 0x0f
4432			|	add Rd(def_reg), Rd(tmp_reg)
4433			break;
4434		case 4:
4435			|	mov Rd(tmp_reg), Rd(def_reg)
4436			|	shr Rd(def_reg), 1
4437			|	and Rd(def_reg), 0x55555555
4438			|	sub Rd(tmp_reg), Rd(def_reg)
4439			|	mov Rd(def_reg), Rd(tmp_reg)
4440			|	and Rd(def_reg), 0x33333333
4441			|	shr Rd(tmp_reg), 2
4442			|	and Rd(tmp_reg), 0x33333333
4443			|	add Rd(tmp_reg), Rd(def_reg)
4444			|	mov Rd(def_reg), Rd(tmp_reg)
4445			|	shr Rd(def_reg), 4
4446			|	add Rd(def_reg), Rd(tmp_reg)
4447			|	and Rd(def_reg), 0x0f0f0f0f
4448			|	imul Rd(def_reg), 0x01010101
4449			|	shr Rd(def_reg), 24
4450			break;
4451|.if X64
4452||		case 8:
4453||			IR_ASSERT(const_reg != IR_REG_NONE);
4454			|	mov Rq(tmp_reg), Rq(def_reg)
4455			|	shr Rq(def_reg), 1
4456			|	mov64 Rq(const_reg), 0x5555555555555555
4457			|	and Rq(def_reg), Rq(const_reg)
4458			|	sub Rq(tmp_reg), Rq(def_reg)
4459			|	mov Rq(def_reg), Rq(tmp_reg)
4460			|	mov64 Rq(const_reg), 0x3333333333333333
4461			|	and Rq(def_reg), Rq(const_reg)
4462			|	shr Rq(tmp_reg), 2
4463			|	and Rq(tmp_reg), Rq(const_reg)
4464			|	add Rq(tmp_reg), Rq(def_reg)
4465			|	mov Rq(def_reg), Rq(tmp_reg)
4466			|	shr Rq(def_reg), 4
4467			|	add Rq(def_reg), Rq(tmp_reg)
4468			|	mov64 Rq(const_reg), 0x0f0f0f0f0f0f0f0f
4469			|	and Rq(def_reg), Rq(const_reg)
4470			|	mov64 Rq(const_reg), 0x0101010101010101
4471			|	imul Rq(def_reg), Rq(const_reg)
4472			|	shr Rq(def_reg), 56
4473||			break;
4474|.endif
4475	}
4476
4477	if (IR_REG_SPILLED(ctx->regs[def][0])) {
4478		ir_emit_store(ctx, type, def, def_reg);
4479	}
4480}
4481
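/* Memory (RMW) variant of the single-operand ops: inc/dec/not/neg on [mem]. */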
4482static void ir_emit_mem_op_int(ir_ctx *ctx, ir_ref def, ir_insn *insn, uint32_t rule)
4483{
4484	ir_backend_data *data = ctx->data;
4485	dasm_State **Dst = &data->dasm_state;
4486	ir_insn *op_insn = &ctx->ir_base[insn->op3];
4487	ir_type type = op_insn->type;
4488	ir_mem mem;
4489
4490	if (insn->op == IR_STORE) {
4491		mem = ir_fuse_mem(ctx, def, def, insn, ctx->regs[def][2]);
4492	} else {
4493		IR_ASSERT(insn->op == IR_VSTORE);
4494		mem = ir_var_spill_slot(ctx, insn->op2);
4495	}
4496
4497	if (rule == IR_MEM_INC) {
4498		|	ASM_MEM_OP inc, type, mem
4499	} else if (rule == IR_MEM_DEC) {
4500		|	ASM_MEM_OP dec, type, mem
4501	} else if (op_insn->op == IR_NOT) {
4502		|	ASM_MEM_OP not, type, mem
4503	} else {
4504		IR_ASSERT(op_insn->op == IR_NEG);
4505		|	ASM_MEM_OP neg, type, mem
4506	}
4507}
4508
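/* Branchless integer abs: def = -op1, then cmovs takes op1 back when the
 * negation came out negative (i.e. op1 was positive), leaving |op1|. */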
4509static void ir_emit_abs_int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
4510{
4511	ir_backend_data *data = ctx->data;
4512	dasm_State **Dst = &data->dasm_state;
4513	ir_type type = insn->type;
4514	ir_ref op1 = insn->op1;
4515	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
4516	ir_reg op1_reg = ctx->regs[def][1];
4517
4518	IR_ASSERT(def_reg != IR_REG_NONE && op1_reg != IR_REG_NONE);
4519
4520	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
4521		op1_reg = IR_REG_NUM(op1_reg);
4522		ir_emit_load(ctx, type, op1_reg, op1);
4523	}
4524
4525	IR_ASSERT(def_reg != op1_reg);
4526
4527	ir_emit_mov(ctx, insn->type, def_reg, op1_reg);
4528	|	ASM_REG_OP neg, insn->type, def_reg
4529	|	ASM_REG_REG_OP2 cmovs, type, def_reg, op1_reg
4530	if (IR_REG_SPILLED(ctx->regs[def][0])) {
4531		ir_emit_store(ctx, type, def, def_reg);
4532	}
4533}
4534
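/* Boolean NOT of an integer: set the flags with "test reg, reg" (or
 * "cmp [mem], 0" for a spilled operand) and materialize the result with
 * sete. */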
4535static void ir_emit_bool_not_int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
4536{
4537	ir_backend_data *data = ctx->data;
4538	dasm_State **Dst = &data->dasm_state;
4539	ir_type type = ctx->ir_base[insn->op1].type;
4540	ir_ref op1 = insn->op1;
4541	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
4542	ir_reg op1_reg = ctx->regs[def][1];
4543
4544	IR_ASSERT(def_reg != IR_REG_NONE);
4545
4546	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
4547		op1_reg = IR_REG_NUM(op1_reg);
4548		ir_emit_load(ctx, type, op1_reg, op1);
4549	}
4550
4551	if (op1_reg != IR_REG_NONE) {
4552		|	ASM_REG_REG_OP test, type, op1_reg, op1_reg
4553	} else {
4554		ir_mem mem = ir_ref_spill_slot(ctx, op1);
4555
4556		|	ASM_MEM_IMM_OP cmp, type, mem, 0
4557	}
4558	|	sete Rb(def_reg)
4559
4560	if (IR_REG_SPILLED(ctx->regs[def][0])) {
4561		ir_emit_store(ctx, type, def, def_reg);
4562	}
4563}
4564
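/* Widening multiply and divide through the fixed RAX/RDX pair: op1 is forced
 * into RAX; mul/imul leave the product in RDX:RAX, while div/idiv take the
 * dividend from RDX:RAX (sign-extended with cqo/cdq/cwd/movsx for idiv,
 * zeroed with xor/movzx for div) and return the quotient in RAX and the
 * remainder in RDX -- or in AH for byte operands. */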
4565static void ir_emit_mul_div_mod(ir_ctx *ctx, ir_ref def, ir_insn *insn)
4566{
4567	ir_backend_data *data = ctx->data;
4568	dasm_State **Dst = &data->dasm_state;
4569	ir_type type = insn->type;
4570	ir_ref op1 = insn->op1;
4571	ir_ref op2 = insn->op2;
4572	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
4573	ir_reg op1_reg = ctx->regs[def][1];
4574	ir_reg op2_reg = ctx->regs[def][2];
4575	ir_mem mem;
4576
4577	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
4578		op1_reg = IR_REG_NUM(op1_reg);
4579		ir_emit_load(ctx, type, op1_reg, op1);
4580	}
4581	if (op1_reg != IR_REG_RAX) {
4582		if (op1_reg != IR_REG_NONE) {
4583			ir_emit_mov(ctx, type, IR_REG_RAX, op1_reg);
4584		} else {
4585			ir_emit_load(ctx, type, IR_REG_RAX, op1);
4586		}
4587	}
4588	if (op2_reg == IR_REG_NONE && op1 == op2) {
4589		op2_reg = IR_REG_RAX;
4590	} else if (IR_IS_CONST_REF(op2)) {
4591		if (insn->op == IR_MUL || insn->op == IR_MUL_OV) {
4592			op2_reg = IR_REG_RDX;
4593		} else {
4594			IR_ASSERT(op2_reg != IR_REG_NONE);
4595		}
4596		ir_emit_load(ctx, type, op2_reg, op2);
4597	}
4598	if (insn->op == IR_MUL || insn->op == IR_MUL_OV) {
4599		if (IR_IS_TYPE_SIGNED(insn->type)) {
4600			if (op2_reg != IR_REG_NONE) {
4601				if (IR_REG_SPILLED(op2_reg)) {
4602					op2_reg = IR_REG_NUM(op2_reg);
4603					ir_emit_load(ctx, type, op2_reg, op2);
4604				}
4605				|	ASM_REG_OP imul, type, op2_reg
4606			} else {
4607				if (ir_rule(ctx, op2) & IR_FUSED) {
4608					mem = ir_fuse_load(ctx, def, op2);
4609				} else {
4610					mem = ir_ref_spill_slot(ctx, op2);
4611				}
4612				|	ASM_MEM_OP imul, type, mem
4613			}
4614		} else {
4615			if (op2_reg != IR_REG_NONE) {
4616				if (IR_REG_SPILLED(op2_reg)) {
4617					op2_reg = IR_REG_NUM(op2_reg);
4618					ir_emit_load(ctx, type, op2_reg, op2);
4619				}
4620				|	ASM_REG_OP mul, type, op2_reg
4621			} else {
4622				if (ir_rule(ctx, op2) & IR_FUSED) {
4623					mem = ir_fuse_load(ctx, def, op2);
4624				} else {
4625					mem = ir_ref_spill_slot(ctx, op2);
4626				}
4627				|	ASM_MEM_OP mul, type, mem
4628			}
4629		}
4630	} else {
4631		if (IR_IS_TYPE_SIGNED(type)) {
4632			if (ir_type_size[type] == 8) {
4633				|	cqo
4634			} else if (ir_type_size[type] == 4) {
4635				|	cdq
4636			} else if (ir_type_size[type] == 2) {
4637				|	cwd
4638			} else {
4639				|	movsx ax, al
4640			}
4641			if (op2_reg != IR_REG_NONE) {
4642				if (IR_REG_SPILLED(op2_reg)) {
4643					op2_reg = IR_REG_NUM(op2_reg);
4644					ir_emit_load(ctx, type, op2_reg, op2);
4645				}
4646				|	ASM_REG_OP idiv, type, op2_reg
4647			} else {
4648				if (ir_rule(ctx, op2) & IR_FUSED) {
4649					mem = ir_fuse_load(ctx, def, op2);
4650				} else {
4651					mem = ir_ref_spill_slot(ctx, op2);
4652				}
4653				|	ASM_MEM_OP idiv, type, mem
4654			}
4655		} else {
4656			if (ir_type_size[type] == 1) {
4657				|	movzx ax, al
4658			} else {
4659				|	ASM_REG_REG_OP xor, type, IR_REG_RDX, IR_REG_RDX
4660			}
4661			if (op2_reg != IR_REG_NONE) {
4662				if (IR_REG_SPILLED(op2_reg)) {
4663					op2_reg = IR_REG_NUM(op2_reg);
4664					ir_emit_load(ctx, type, op2_reg, op2);
4665				}
4666				|	ASM_REG_OP div, type, op2_reg
4667			} else {
4668				if (ir_rule(ctx, op2) & IR_FUSED) {
4669					mem = ir_fuse_load(ctx, def, op2);
4670				} else {
4671					mem = ir_ref_spill_slot(ctx, op2);
4672				}
4673				|	ASM_MEM_OP div, type, mem
4674			}
4675		}
4676	}
4677
4678	if (insn->op == IR_MUL || insn->op == IR_MUL_OV || insn->op == IR_DIV) {
4679		if (def_reg != IR_REG_NONE) {
4680			if (def_reg != IR_REG_RAX) {
4681				ir_emit_mov(ctx, type, def_reg, IR_REG_RAX);
4682			}
4683			if (IR_REG_SPILLED(ctx->regs[def][0])) {
4684				ir_emit_store(ctx, type, def, def_reg);
4685			}
4686		} else {
4687			ir_emit_store(ctx, type, def, IR_REG_RAX);
4688		}
4689	} else {
4690		IR_ASSERT(insn->op == IR_MOD);
4691		if (ir_type_size[type] == 1) {
4692			if (def_reg != IR_REG_NONE) {
4693				|	mov al, ah
4694				if (def_reg != IR_REG_RAX) {
4695					|	mov Rb(def_reg), al
4696				}
4697				if (IR_REG_SPILLED(ctx->regs[def][0])) {
4698					ir_emit_store(ctx, type, def, def_reg);
4699				}
4700			} else {
4701				ir_reg fp;
4702				int32_t offset = ir_ref_spill_slot_offset(ctx, def, &fp);
4703
4704				/* the 8-bit remainder is returned in AH; store it directly to def's spill slot */
4705				|	mov byte [Ra(fp)+offset], ah
4706			}
4707		} else {
4708			if (def_reg != IR_REG_NONE) {
4709				if (def_reg != IR_REG_RDX) {
4710					ir_emit_mov(ctx, type, def_reg, IR_REG_RDX);
4711				}
4712				if (IR_REG_SPILLED(ctx->regs[def][0])) {
4713					ir_emit_store(ctx, type, def, def_reg);
4714				}
4715			} else {
4716				ir_emit_store(ctx, type, def, IR_REG_RDX);
4717			}
4718		}
4719	}
4720}
4721
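/* Switch the assembler to the .rodata section, allocating its dynamic label
 * on first use; callers switch back with |.code. */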
4722static void ir_rodata(ir_ctx *ctx)
4723{
4724	ir_backend_data *data = ctx->data;
4725	dasm_State **Dst = &data->dasm_state;
4726
4727	|.rodata
4728	if (!data->rodata_label) {
4729		int label = data->rodata_label = ctx->cfg_blocks_count + ctx->consts_count + 2;
4730		|=>label:
4731	}
4732}
4733
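/* FP NEG/ABS by sign-bit twiddling: xorps/xorpd flips the IEEE sign bit and
 * andps/andpd clears it, using 16-byte-aligned masks emitted once into
 * .rodata (sign bit only for NEG, everything but the sign bit for ABS). */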
4734static void ir_emit_op_fp(ir_ctx *ctx, ir_ref def, ir_insn *insn)
4735{
4736	ir_backend_data *data = ctx->data;
4737	dasm_State **Dst = &data->dasm_state;
4738	ir_type type = insn->type;
4739	ir_ref op1 = insn->op1;
4740	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
4741	ir_reg op1_reg = ctx->regs[def][1];
4742
4743	IR_ASSERT(def_reg != IR_REG_NONE);
4744
4745	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
4746		op1_reg = IR_REG_NUM(op1_reg);
4747		ir_emit_load(ctx, type, op1_reg, op1);
4748	}
4749	if (def_reg != op1_reg) {
4750		if (op1_reg != IR_REG_NONE) {
4751			ir_emit_fp_mov(ctx, type, def_reg, op1_reg);
4752		} else {
4753			ir_emit_load(ctx, type, def_reg, op1);
4754		}
4755	}
4756	if (insn->op == IR_NEG) {
4757		if (insn->type == IR_DOUBLE) {
4758			if (!data->double_neg_const) {
4759				data->double_neg_const = 1;
4760				ir_rodata(ctx);
4761				|.align 16
4762				|->double_neg_const:
4763				|.dword 0, 0x80000000, 0, 0
4764				|.code
4765			}
4766			if (ctx->mflags & IR_X86_AVX) {
4767				|	vxorpd xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), [->double_neg_const]
4768			} else {
4769				|	xorpd xmm(def_reg-IR_REG_FP_FIRST), [->double_neg_const]
4770			}
4771		} else {
4772			IR_ASSERT(insn->type == IR_FLOAT);
4773			if (!data->float_neg_const) {
4774				data->float_neg_const = 1;
4775				ir_rodata(ctx);
4776				|.align 16
4777				|->float_neg_const:
4778				|.dword 0x80000000, 0, 0, 0
4779				|.code
4780			}
4781			if (ctx->mflags & IR_X86_AVX) {
4782				|	vxorps xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), [->float_neg_const]
4783			} else {
4784				|	xorps xmm(def_reg-IR_REG_FP_FIRST), [->float_neg_const]
4785			}
4786		}
4787	} else {
4788		IR_ASSERT(insn->op == IR_ABS);
4789		if (insn->type == IR_DOUBLE) {
4790			if (!data->double_abs_const) {
4791				data->double_abs_const = 1;
4792				ir_rodata(ctx);
4793				|.align 16
4794				|->double_abs_const:
4795				|.dword 0xffffffff, 0x7fffffff, 0, 0
4796				|.code
4797			}
4798			if (ctx->mflags & IR_X86_AVX) {
4799				|	vandpd xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), [->double_abs_const]
4800			} else {
4801				|	andpd xmm(def_reg-IR_REG_FP_FIRST), [->double_abs_const]
4802			}
4803		} else {
4804			IR_ASSERT(insn->type == IR_FLOAT);
4805			if (!data->float_abs_const) {
4806				data->float_abs_const = 1;
4807				ir_rodata(ctx);
4808				|.align 16
4809				|->float_abs_const:
4810				|.dword 0x7fffffff, 0, 0, 0
4811				|.code
4812			}
4813			if (ctx->mflags & IR_X86_AVX) {
4814				|	vandps xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), [->float_abs_const]
4815			} else {
4816				|	andps xmm(def_reg-IR_REG_FP_FIRST), [->float_abs_const]
4817			}
4818		}
4819	}
4820	if (IR_REG_SPILLED(ctx->regs[def][0])) {
4821		ir_emit_store(ctx, insn->type, def, def_reg);
4822	}
4823}
4824
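/* SSE2 FP binop: the two-operand destructive form, so op1 is first placed in
 * def_reg and op2 is consumed from a register, a .rodata constant, or a
 * fused/spilled memory operand. */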
4825static void ir_emit_binop_sse2(ir_ctx *ctx, ir_ref def, ir_insn *insn)
4826{
4827	ir_backend_data *data = ctx->data;
4828	dasm_State **Dst = &data->dasm_state;
4829	ir_type type = insn->type;
4830	ir_ref op1 = insn->op1;
4831	ir_ref op2 = insn->op2;
4832	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
4833	ir_reg op1_reg = ctx->regs[def][1];
4834	ir_reg op2_reg = ctx->regs[def][2];
4835
4836	IR_ASSERT(def_reg != IR_REG_NONE);
4837
4838	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
4839		op1_reg = IR_REG_NUM(op1_reg);
4840		ir_emit_load(ctx, type, op1_reg, op1);
4841	}
4842	if (def_reg != op1_reg) {
4843		if (op1_reg != IR_REG_NONE) {
4844			ir_emit_fp_mov(ctx, type, def_reg, op1_reg);
4845		} else {
4846			ir_emit_load(ctx, type, def_reg, op1);
4847		}
4848		if (op1 == op2) {
4849			op2_reg = def_reg;
4850		}
4851	}
4852	if (op2_reg != IR_REG_NONE) {
4853		if (IR_REG_SPILLED(op2_reg) || IR_IS_CONST_REF(op2)) {
4854			op2_reg = IR_REG_NUM(op2_reg);
4855			if (op1 != op2) {
4856				ir_emit_load(ctx, type, op2_reg, op2);
4857			}
4858		}
4859		switch (insn->op) {
4860			default:
4861				IR_ASSERT(0 && "NIY binary op");
4862			case IR_ADD:
4863				|	ASM_SSE2_REG_REG_OP adds, type, def_reg, op2_reg
4864				break;
4865			case IR_SUB:
4866				|	ASM_SSE2_REG_REG_OP subs, type, def_reg, op2_reg
4867				break;
4868			case IR_MUL:
4869				|	ASM_SSE2_REG_REG_OP muls, type, def_reg, op2_reg
4870				break;
4871			case IR_DIV:
4872				|	ASM_SSE2_REG_REG_OP divs, type, def_reg, op2_reg
4873				break;
4874			case IR_MIN:
4875				|	ASM_SSE2_REG_REG_OP mins, type, def_reg, op2_reg
4876				break;
4877			case IR_MAX:
4878				|	ASM_SSE2_REG_REG_OP maxs, type, def_reg, op2_reg
4879				break;
4880		}
4881	} else if (IR_IS_CONST_REF(op2)) {
4882		int label = ir_const_label(ctx, op2);
4883
4884		switch (insn->op) {
4885			default:
4886				IR_ASSERT(0 && "NIY binary op");
4887			case IR_ADD:
4888				|	ASM_SSE2_REG_TXT_OP adds, type, def_reg, [=>label]
4889				break;
4890			case IR_SUB:
4891				|	ASM_SSE2_REG_TXT_OP subs, type, def_reg, [=>label]
4892				break;
4893			case IR_MUL:
4894				|	ASM_SSE2_REG_TXT_OP muls, type, def_reg, [=>label]
4895				break;
4896			case IR_DIV:
4897				|	ASM_SSE2_REG_TXT_OP divs, type, def_reg, [=>label]
4898				break;
4899			case IR_MIN:
4900				|	ASM_SSE2_REG_TXT_OP mins, type, def_reg, [=>label]
4901				break;
4902			case IR_MAX:
4903				|	ASM_SSE2_REG_TXT_OP maxs, type, def_reg, [=>label]
4904				break;
4905		}
4906	} else {
4907		ir_mem mem;
4908
4909		if (ir_rule(ctx, op2) & IR_FUSED) {
4910			mem = ir_fuse_load(ctx, def, op2);
4911		} else {
4912			mem = ir_ref_spill_slot(ctx, op2);
4913		}
4914		switch (insn->op) {
4915			default:
4916				IR_ASSERT(0 && "NIY binary op");
4917			case IR_ADD:
4918				|	ASM_SSE2_REG_MEM_OP adds, type, def_reg, mem
4919				break;
4920			case IR_SUB:
4921				|	ASM_SSE2_REG_MEM_OP subs, type, def_reg, mem
4922				break;
4923			case IR_MUL:
4924				|	ASM_SSE2_REG_MEM_OP muls, type, def_reg, mem
4925				break;
4926			case IR_DIV:
4927				|	ASM_SSE2_REG_MEM_OP divs, type, def_reg, mem
4928				break;
4929			case IR_MIN:
4930				|	ASM_SSE2_REG_MEM_OP mins, type, def_reg, mem
4931				break;
4932			case IR_MAX:
4933				|	ASM_SSE2_REG_MEM_OP maxs, type, def_reg, mem
4934				break;
4935		}
4936	}
4937	if (IR_REG_SPILLED(ctx->regs[def][0])) {
4938		ir_emit_store(ctx, insn->type, def, def_reg);
4939	}
4940}
4941
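/* AVX variant of the FP binop emitter: the three-operand VEX forms
 * (vaddss/vaddsd etc.) write to an independent destination, so no
 * preparatory mov of op1 into def_reg is needed, unlike the SSE2 path. */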
4942static void ir_emit_binop_avx(ir_ctx *ctx, ir_ref def, ir_insn *insn)
4943{
4944	ir_backend_data *data = ctx->data;
4945	dasm_State **Dst = &data->dasm_state;
4946	ir_type type = insn->type;
4947	ir_ref op1 = insn->op1;
4948	ir_ref op2 = insn->op2;
4949	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
4950	ir_reg op1_reg = ctx->regs[def][1];
4951	ir_reg op2_reg = ctx->regs[def][2];
4952
4953	IR_ASSERT(def_reg != IR_REG_NONE && op1_reg != IR_REG_NONE);
4954
4955	if (IR_REG_SPILLED(op1_reg) || IR_IS_CONST_REF(op1)) {
4956		op1_reg = IR_REG_NUM(op1_reg);
4957		ir_emit_load(ctx, type, op1_reg, op1);
4958	}
4959	if (op2_reg != IR_REG_NONE) {
4960		if (IR_REG_SPILLED(op2_reg) || IR_IS_CONST_REF(op2)) {
4961			op2_reg = IR_REG_NUM(op2_reg);
4962			if (op1 != op2) {
4963				ir_emit_load(ctx, type, op2_reg, op2);
4964			}
4965		}
4966		switch (insn->op) {
4967			default:
4968				IR_ASSERT(0 && "NIY binary op");
4969			case IR_ADD:
4970				|	ASM_AVX_REG_REG_REG_OP vadds, type, def_reg, op1_reg, op2_reg
4971				break;
4972			case IR_SUB:
4973				|	ASM_AVX_REG_REG_REG_OP vsubs, type, def_reg, op1_reg, op2_reg
4974				break;
4975			case IR_MUL:
4976				|	ASM_AVX_REG_REG_REG_OP vmuls, type, def_reg, op1_reg, op2_reg
4977				break;
4978			case IR_DIV:
4979				|	ASM_AVX_REG_REG_REG_OP vdivs, type, def_reg, op1_reg, op2_reg
4980				break;
4981			case IR_MIN:
4982				|	ASM_AVX_REG_REG_REG_OP vmins, type, def_reg, op1_reg, op2_reg
4983				break;
4984			case IR_MAX:
4985				|	ASM_AVX_REG_REG_REG_OP vmaxs, type, def_reg, op1_reg, op2_reg
4986				break;
4987		}
4988	} else if (IR_IS_CONST_REF(op2)) {
4989		int label = ir_const_label(ctx, op2);
4990
4991		switch (insn->op) {
4992			default:
4993				IR_ASSERT(0 && "NIY binary op");
4994			case IR_ADD:
4995				|	ASM_AVX_REG_REG_TXT_OP vadds, type, def_reg, op1_reg, [=>label]
4996				break;
4997			case IR_SUB:
4998				|	ASM_AVX_REG_REG_TXT_OP vsubs, type, def_reg, op1_reg, [=>label]
4999				break;
5000			case IR_MUL:
5001				|	ASM_AVX_REG_REG_TXT_OP vmuls, type, def_reg, op1_reg, [=>label]
5002				break;
5003			case IR_DIV:
5004				|	ASM_AVX_REG_REG_TXT_OP vdivs, type, def_reg, op1_reg, [=>label]
5005				break;
5006			case IR_MIN:
5007				|	ASM_AVX_REG_REG_TXT_OP vmins, type, def_reg, op1_reg, [=>label]
5008				break;
5009			case IR_MAX:
5010				|	ASM_AVX_REG_REG_TXT_OP vmaxs, type, def_reg, op1_reg, [=>label]
5011				break;
5012		}
5013	} else {
5014		ir_mem mem;
5015
5016		if (ir_rule(ctx, op2) & IR_FUSED) {
5017			mem = ir_fuse_load(ctx, def, op2);
5018		} else {
5019			mem = ir_ref_spill_slot(ctx, op2);
5020		}
5021		switch (insn->op) {
5022			default:
5023				IR_ASSERT(0 && "NIY binary op");
5024			case IR_ADD:
5025				|	ASM_AVX_REG_REG_MEM_OP vadds, type, def_reg, op1_reg, mem
5026				break;
5027			case IR_SUB:
5028				|	ASM_AVX_REG_REG_MEM_OP vsubs, type, def_reg, op1_reg, mem
5029				break;
5030			case IR_MUL:
5031				|	ASM_AVX_REG_REG_MEM_OP vmuls, type, def_reg, op1_reg, mem
5032				break;
5033			case IR_DIV:
5034				|	ASM_AVX_REG_REG_MEM_OP vdivs, type, def_reg, op1_reg, mem
5035				break;
5036			case IR_MIN:
5037				|	ASM_AVX_REG_REG_MEM_OP vmins, type, def_reg, op1_reg, mem
5038				break;
5039			case IR_MAX:
5040				|	ASM_AVX_REG_REG_MEM_OP vmaxs, type, def_reg, op1_reg, mem
5041				break;
5042		}
5043	}
5044	if (IR_REG_SPILLED(ctx->regs[def][0])) {
5045		ir_emit_store(ctx, insn->type, def, def_reg);
5046	}
5047}
5048
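/* Emit the flag-setting compare for an integer comparison, choosing among
 * the reg/reg, reg/imm, reg/mem and mem/reg "cmp" forms; comparison against
 * constant 0 is strength-reduced to "test reg, reg". */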
5049static void ir_emit_cmp_int_common(ir_ctx *ctx, ir_type type, ir_ref root, ir_insn *insn, ir_reg op1_reg, ir_ref op1, ir_reg op2_reg, ir_ref op2)
5050{
5051	ir_backend_data *data = ctx->data;
5052	dasm_State **Dst = &data->dasm_state;
5053
5054	if (op1_reg != IR_REG_NONE) {
5055		if (op2_reg != IR_REG_NONE) {
5056			|	ASM_REG_REG_OP cmp, type, op1_reg, op2_reg
5057		} else if (IR_IS_CONST_REF(op2) && !IR_IS_SYM_CONST(ctx->ir_base[op2].op) && ctx->ir_base[op2].val.u64 == 0) {
5058			|	ASM_REG_REG_OP test, type, op1_reg, op1_reg
5059		} else if (IR_IS_CONST_REF(op2)) {
5060			int32_t val = ir_fuse_imm(ctx, op2);
5061			|	ASM_REG_IMM_OP cmp, type, op1_reg, val
5062		} else {
5063			ir_mem mem;
5064
5065			if (ir_rule(ctx, op2) & IR_FUSED) {
5066				mem = ir_fuse_load(ctx, root, op2);
5067			} else {
5068				mem = ir_ref_spill_slot(ctx, op2);
5069			}
5070			|	ASM_REG_MEM_OP cmp, type, op1_reg, mem
5071		}
5072	} else if (IR_IS_CONST_REF(insn->op1)) {
5073		IR_ASSERT(0);
5074	} else {
5075		ir_mem mem;
5076
5077		if (ir_rule(ctx, insn->op1) & IR_FUSED) {
5078			mem = ir_fuse_load(ctx, root, insn->op1);
5079		} else {
5080			mem = ir_ref_spill_slot(ctx, insn->op1);
5081		}
5082		if (op2_reg != IR_REG_NONE) {
5083			|	ASM_MEM_REG_OP cmp, type, mem, op2_reg
5084		} else {
5085			IR_ASSERT(!IR_IS_CONST_REF(op1));
5086			int32_t val = ir_fuse_imm(ctx, op2);
5087			|	ASM_MEM_IMM_OP cmp, type, mem, val
5088		}
5089	}
5090}
5091
5092static void _ir_emit_setcc_int(ir_ctx *ctx, uint8_t op, ir_reg def_reg)
5093{
5094	ir_backend_data *data = ctx->data;
5095	dasm_State **Dst = &data->dasm_state;
5096
5097	switch (op) {
5098		default:
5099			IR_ASSERT(0 && "NIY comparison op");
5100		case IR_EQ:
5101			|	sete Rb(def_reg)
5102			break;
5103		case IR_NE:
5104			|	setne Rb(def_reg)
5105			break;
5106		case IR_LT:
5107			|	setl Rb(def_reg)
5108			break;
5109		case IR_GE:
5110			|	setge Rb(def_reg)
5111			break;
5112		case IR_LE:
5113			|	setle Rb(def_reg)
5114			break;
5115		case IR_GT:
5116			|	setg Rb(def_reg)
5117			break;
5118		case IR_ULT:
5119			|	setb Rb(def_reg)
5120			break;
5121		case IR_UGE:
5122			|	setae Rb(def_reg)
5123			break;
5124		case IR_ULE:
5125			|	setbe Rb(def_reg)
5126			break;
5127		case IR_UGT:
5128			|	seta Rb(def_reg)
5129			break;
5130	}
5131}
5132
5133static void ir_emit_cmp_int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
5134{
5135	ir_backend_data *data = ctx->data;
5136	dasm_State **Dst = &data->dasm_state;
5137	ir_type type = ctx->ir_base[insn->op1].type;
5138	ir_op op = insn->op;
5139	ir_ref op1 = insn->op1;
5140	ir_ref op2 = insn->op2;
5141	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
5142	ir_reg op1_reg = ctx->regs[def][1];
5143	ir_reg op2_reg = ctx->regs[def][2];
5144
5145	IR_ASSERT(def_reg != IR_REG_NONE);
5146	if (op1_reg != IR_REG_NONE && (IR_IS_CONST_REF(op1) || IR_REG_SPILLED(op1_reg))) {
5147		op1_reg = IR_REG_NUM(op1_reg);
5148		ir_emit_load(ctx, type, op1_reg, op1);
5149	}
5150	if (op2_reg != IR_REG_NONE && (IR_IS_CONST_REF(op2) || IR_REG_SPILLED(op2_reg))) {
5151		op2_reg = IR_REG_NUM(op2_reg);
5152		if (op1 != op2) {
5153			ir_emit_load(ctx, type, op2_reg, op2);
5154		}
5155	}
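	/* Unsigned comparisons against constant 0 partially fold: x < 0u is
	 * always false, x >= 0u is always true, x <= 0u becomes x == 0 and
	 * x > 0u becomes x != 0. */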
5156	if (IR_IS_CONST_REF(op2) && !IR_IS_SYM_CONST(ctx->ir_base[op2].op) && ctx->ir_base[op2].val.u64 == 0) {
5157		if (op == IR_ULT) {
5158			/* always false */
5159			|	xor Ra(def_reg), Ra(def_reg)
5160			if (IR_REG_SPILLED(ctx->regs[def][0])) {
5161				ir_emit_store(ctx, insn->type, def, def_reg);
5162			}
5163			return;
5164		} else if (op == IR_UGE) {
5165			/* always true */
5166			|	ASM_REG_IMM_OP mov, insn->type, def_reg, 1
5167			if (IR_REG_SPILLED(ctx->regs[def][0])) {
5168				ir_emit_store(ctx, insn->type, def, def_reg);
5169			}
5170			return;
5171		} else if (op == IR_ULE) {
5172			op = IR_EQ;
5173		} else if (op == IR_UGT) {
5174			op = IR_NE;
5175		}
5176	}
5177	ir_emit_cmp_int_common(ctx, type, def, insn, op1_reg, op1, op2_reg, op2);
5178	_ir_emit_setcc_int(ctx, op, def_reg);
5179	if (IR_REG_SPILLED(ctx->regs[def][0])) {
5180		ir_emit_store(ctx, insn->type, def, def_reg);
5181	}
5182}
5183
5184static void ir_emit_test_int_common(ir_ctx *ctx, ir_ref root, ir_ref ref, ir_op op)
5185{
5186	ir_backend_data *data = ctx->data;
5187	dasm_State **Dst = &data->dasm_state;
5188	ir_insn *binop_insn = &ctx->ir_base[ref];
5189	ir_type type = binop_insn->type;
5190	ir_ref op1 = binop_insn->op1;
5191	ir_ref op2 = binop_insn->op2;
5192	ir_reg op1_reg = ctx->regs[ref][1];
5193	ir_reg op2_reg = ctx->regs[ref][2];
5194
5195	IR_ASSERT(binop_insn->op == IR_AND);
5196	if (op1_reg != IR_REG_NONE) {
5197		if (IR_REG_SPILLED(op1_reg)) {
5198			op1_reg = IR_REG_NUM(op1_reg);
5199			ir_emit_load(ctx, type, op1_reg, op1);
5200		}
5201		if (op2_reg != IR_REG_NONE) {
5202			if (IR_REG_SPILLED(op2_reg) || IR_IS_CONST_REF(op2)) {
5203				op2_reg = IR_REG_NUM(op2_reg);
5204				if (op1 != op2) {
5205					ir_emit_load(ctx, type, op2_reg, op2);
5206				}
5207			}
5208			|	ASM_REG_REG_OP test, type, op1_reg, op2_reg
5209		} else if (IR_IS_CONST_REF(op2)) {
5210			int32_t val = ir_fuse_imm(ctx, op2);
5211
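			/* For EQ/NE, testing against the masks 0xff, 0xff00, 0xffff or -1 is
			 * equivalent to a shorter "test" on the corresponding sub-register
			 * (al/ah/ax/eax); ah/bh/ch/dh exist only for RAX..RDX (IR_REG_R3). */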
5212			if ((op == IR_EQ || op == IR_NE) && val == 0xff && (sizeof(void*) == 8 || op1_reg <= IR_REG_R3)) {
5213				|	test Rb(op1_reg), Rb(op1_reg)
5214			} else if ((op == IR_EQ || op == IR_NE) && val == 0xff00 && op1_reg <= IR_REG_R3) {
5215				if (op1_reg == IR_REG_RAX) {
5216					|	test ah, ah
5217				} else if (op1_reg == IR_REG_RBX) {
5218					|	test bh, bh
5219				} else if (op1_reg == IR_REG_RCX) {
5220					|	test ch, ch
5221				} else if (op1_reg == IR_REG_RDX) {
5222					|	test dh, dh
5223				} else {
5224					IR_ASSERT(0);
5225				}
5226			} else if ((op == IR_EQ || op == IR_NE) && val == 0xffff) {
5227				|	test Rw(op1_reg), Rw(op1_reg)
5228			} else if ((op == IR_EQ || op == IR_NE) && val == -1) {
5229				|	test Rd(op1_reg), Rd(op1_reg)
5230			} else {
5231				|	ASM_REG_IMM_OP test, type, op1_reg, val
5232			}
5233		} else {
5234			ir_mem mem;
5235
5236			if (ir_rule(ctx, op2) & IR_FUSED) {
5237				mem = ir_fuse_load(ctx, root, op2);
5238			} else {
5239				mem = ir_ref_spill_slot(ctx, op2);
5240			}
5241			|	ASM_REG_MEM_OP test, type, op1_reg, mem
5242		}
5243	} else if (IR_IS_CONST_REF(op1)) {
5244		IR_ASSERT(0);
5245	} else {
5246		ir_mem mem;
5247
5248		if (ir_rule(ctx, op1) & IR_FUSED) {
5249			mem = ir_fuse_load(ctx, root, op1);
5250		} else {
5251			mem = ir_ref_spill_slot(ctx, op1);
5252		}
5253		if (op2_reg != IR_REG_NONE) {
5254			if (IR_REG_SPILLED(op2_reg) || IR_IS_CONST_REF(op2)) {
5255				op2_reg = IR_REG_NUM(op2_reg);
5256				if (op1 != op2) {
5257					ir_emit_load(ctx, type, op2_reg, op2);
5258				}
5259			}
5260			|	ASM_MEM_REG_OP test, type, mem, op2_reg
5261		} else {
5262			IR_ASSERT(!IR_IS_CONST_REF(op1));
5263			int32_t val = ir_fuse_imm(ctx, op2);
5264			|	ASM_MEM_IMM_OP test, type, mem, val
5265		}
5266	}
5267}
5268
5269static void ir_emit_testcc_int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
5270{
5271	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
5272
5273	IR_ASSERT(def_reg != IR_REG_NONE);
5274	ir_emit_test_int_common(ctx, def, insn->op1, insn->op);
5275	_ir_emit_setcc_int(ctx, insn->op, def_reg);
5276	if (IR_REG_SPILLED(ctx->regs[def][0])) {
5277		ir_emit_store(ctx, insn->type, def, def_reg);
5278	}
5279}
5280
5281static void ir_emit_setcc_int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
5282{
5283	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
5284
5285	IR_ASSERT(def_reg != IR_REG_NONE);
5286	_ir_emit_setcc_int(ctx, insn->op, def_reg);
5287	if (IR_REG_SPILLED(ctx->regs[def][0])) {
5288		ir_emit_store(ctx, insn->type, def, def_reg);
5289	}
5290}
5291
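/* Emit UCOMISS/UCOMISD for a floating-point comparison. EQ and NE are
 * symmetric, so their operands may be swapped to get the register operand on
 * the left when only op2 happens to be in a register. Returns the comparison
 * op for the subsequent setcc/jcc. */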
5292static ir_op ir_emit_cmp_fp_common(ir_ctx *ctx, ir_ref root, ir_ref cmp_ref, ir_insn *cmp_insn)
5293{
5294	ir_backend_data *data = ctx->data;
5295	dasm_State **Dst = &data->dasm_state;
5296	ir_type type = ctx->ir_base[cmp_insn->op1].type;
5297	ir_op op = cmp_insn->op;
5298	ir_ref op1, op2;
5299	ir_reg op1_reg, op2_reg;
5300
5301	op1 = cmp_insn->op1;
5302	op2 = cmp_insn->op2;
5303	op1_reg = ctx->regs[cmp_ref][1];
5304	op2_reg = ctx->regs[cmp_ref][2];
5305
5306	if (op1_reg == IR_REG_NONE && op2_reg != IR_REG_NONE && (op == IR_EQ || op == IR_NE)) {
5307		ir_ref tmp;
5308		ir_reg tmp_reg;
5309
5310		tmp = op1;
5311		op1 = op2;
5312		op2 = tmp;
5313		tmp_reg = op1_reg;
5314		op1_reg = op2_reg;
5315		op2_reg = tmp_reg;
5316	}
5317
5318
5319	IR_ASSERT(op1_reg != IR_REG_NONE);
5320	if (IR_REG_SPILLED(op1_reg) || IR_IS_CONST_REF(op1)) {
5321		op1_reg = IR_REG_NUM(op1_reg);
5322		ir_emit_load(ctx, type, op1_reg, op1);
5323	}
5324	if (op2_reg != IR_REG_NONE) {
5325		if (IR_REG_SPILLED(op2_reg) || IR_IS_CONST_REF(op2)) {
5326			op2_reg = IR_REG_NUM(op2_reg);
5327			if (op1 != op2) {
5328				ir_emit_load(ctx, type, op2_reg, op2);
5329			}
5330		}
5331		|	ASM_FP_REG_REG_OP ucomis, type, op1_reg, op2_reg
5332	} else if (IR_IS_CONST_REF(op2)) {
5333		int label = ir_const_label(ctx, op2);
5334
5335		|	ASM_FP_REG_TXT_OP ucomis, type, op1_reg, [=>label]
5336	} else {
5337		ir_mem mem;
5338
5339		if (ir_rule(ctx, op2) & IR_FUSED) {
5340			mem = ir_fuse_load(ctx, root, op2);
5341		} else {
5342			mem = ir_ref_spill_slot(ctx, op2);
5343		}
5344		|	ASM_FP_REG_MEM_OP ucomis, type, op1_reg, mem
5345	}
5346	return op;
5347}
5348
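/* Materialize an FP comparison result as 0/1. After UCOMIS the parity flag is
 * set when the operands are unordered (NaN), so the cases below combine a
 * setcc with a cmov on PF/ZF/CF to give NaN operands the IEEE-correct result
 * (e.g. EQ must be false and NE true when either operand is NaN). */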
5349static void ir_emit_cmp_fp(ir_ctx *ctx, ir_ref def, ir_insn *insn)
5350{
5351	ir_backend_data *data = ctx->data;
5352	dasm_State **Dst = &data->dasm_state;
5353	ir_op op = ir_emit_cmp_fp_common(ctx, def, def, insn);
5354	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
5355	ir_reg tmp_reg = ctx->regs[def][3];
5356
5357	IR_ASSERT(def_reg != IR_REG_NONE);
5358	switch (op) {
5359		default:
5360			IR_ASSERT(0 && "NIY binary op");
5361		case IR_EQ:
5362			|	setnp Rb(def_reg)
5363			|	mov Rd(tmp_reg), 0
5364			|	cmovne Rd(def_reg), Rd(tmp_reg)
5365			break;
5366		case IR_NE:
5367			|	setp Rb(def_reg)
5368			|	mov Rd(tmp_reg), 1
5369			|	cmovne Rd(def_reg), Rd(tmp_reg)
5370			break;
5371		case IR_LT:
5372			|	setnp Rb(def_reg)
5373			|	mov Rd(tmp_reg), 0
5374			|	cmovae Rd(def_reg), Rd(tmp_reg)
5375			break;
5376		case IR_GE:
5377			|	setae Rb(def_reg)
5378			break;
5379		case IR_LE:
5380			|	setnp Rb(def_reg)
5381			|	mov Rd(tmp_reg), 0
5382			|	cmova Rd(def_reg), Rd(tmp_reg)
5383			break;
5384		case IR_GT:
5385			|	seta Rb(def_reg)
5386			break;
5387		case IR_ULT:
5388			|	setb Rb(def_reg)
5389			break;
5390		case IR_UGE:
5391			|	setp Rb(def_reg)
5392			|	mov Rd(tmp_reg), 1
5393			|	cmovae Rd(def_reg), Rd(tmp_reg)
5394			break;
5395		case IR_ULE:
5396			|	setbe Rb(def_reg)
5397			break;
5398		case IR_UGT:
5399			|	setp Rb(def_reg)
5400			|	mov Rd(tmp_reg), 1
5401			|	cmova Rd(def_reg), Rd(tmp_reg)
5402			break;
5403	}
5404	if (IR_REG_SPILLED(ctx->regs[def][0])) {
5405		ir_emit_store(ctx, insn->type, def, def_reg);
5406	}
5407}
5408
5409static void ir_emit_jmp_true(ir_ctx *ctx, uint32_t b, ir_ref def)
5410{
5411	uint32_t true_block, false_block, next_block;
5412	ir_backend_data *data = ctx->data;
5413	dasm_State **Dst = &data->dasm_state;
5414
5415	ir_get_true_false_blocks(ctx, b, &true_block, &false_block, &next_block);
5416	if (true_block != next_block) {
5417		|	jmp =>true_block
5418	}
5419}
5420
5421static void ir_emit_jmp_false(ir_ctx *ctx, uint32_t b, ir_ref def)
5422{
5423	uint32_t true_block, false_block, next_block;
5424	ir_backend_data *data = ctx->data;
5425	dasm_State **Dst = &data->dasm_state;
5426
5427	ir_get_true_false_blocks(ctx, b, &true_block, &false_block, &next_block);
5428	if (false_block != next_block) {
5429		|	jmp =>false_block
5430	}
5431}
5432
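/* Emit the conditional branch for a compare. When the true block is the
 * fall-through, the condition is inverted (the IR condition codes are laid
 * out so that XOR 1, or XOR 5 for the remaining FP conditions, yields the
 * negation) and the targets are swapped. FP branches add an extra jp so that
 * unordered (NaN) results take the correct edge. */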
5433static void ir_emit_jcc(ir_ctx *ctx, uint8_t op, uint32_t b, ir_ref def, ir_insn *insn, bool int_cmp)
5434{
5435	uint32_t true_block, false_block, next_block;
5436	ir_backend_data *data = ctx->data;
5437	dasm_State **Dst = &data->dasm_state;
5438
5439	ir_get_true_false_blocks(ctx, b, &true_block, &false_block, &next_block);
5440	if (true_block == next_block) {
5441		/* swap to avoid unconditional JMP */
5442		if (int_cmp || op == IR_EQ || op == IR_NE) {
5443			op ^= 1; // reverse
5444		} else {
5445			op ^= 5; // reverse
5446		}
5447		true_block = false_block;
5448		false_block = 0;
5449	} else if (false_block == next_block) {
5450		false_block = 0;
5451	}
5452
5453	if (int_cmp) {
5454		switch (op) {
5455			default:
5456				IR_ASSERT(0 && "NIY binary op");
5457			case IR_EQ:
5458				|	je =>true_block
5459				break;
5460			case IR_NE:
5461				|	jne =>true_block
5462				break;
5463			case IR_LT:
5464				|	jl =>true_block
5465				break;
5466			case IR_GE:
5467				|	jge =>true_block
5468				break;
5469			case IR_LE:
5470				|	jle =>true_block
5471				break;
5472			case IR_GT:
5473				|	jg =>true_block
5474				break;
5475			case IR_ULT:
5476				|	jb =>true_block
5477				break;
5478			case IR_UGE:
5479				|	jae =>true_block
5480				break;
5481			case IR_ULE:
5482				|	jbe =>true_block
5483				break;
5484			case IR_UGT:
5485				|	ja =>true_block
5486				break;
5487		}
5488	} else {
5489		switch (op) {
5490			default:
5491				IR_ASSERT(0 && "NIY binary op");
5492			case IR_EQ:
5493				if (!false_block) {
5494					|	jp >1
5495					|	je =>true_block
5496					|1:
5497				} else {
5498					|	jp =>false_block
5499					|	je =>true_block
5500				}
5501				break;
5502			case IR_NE:
5503				|	jne =>true_block
5504				|	jp =>true_block
5505				break;
5506			case IR_LT:
5507				if (!false_block) {
5508					|	jp >1
5509					|	jb =>true_block
5510					|1:
5511				} else {
5512					|	jp =>false_block
5513					|	jb =>true_block
5514				}
5515				break;
5516			case IR_GE:
5517				|	jae =>true_block
5518				break;
5519			case IR_LE:
5520				if (!false_block) {
5521					|	jp >1
5522					|	jbe =>true_block
5523					|1:
5524				} else {
5525					|	jp =>false_block
5526					|	jbe =>true_block
5527				}
5528				break;
5529			case IR_GT:
5530				|	ja =>true_block
5531				break;
5532			case IR_ULT:
5533				|	jb =>true_block
5534				break;
5535			case IR_UGE:
5536				|	jp =>true_block
5537				|	jae =>true_block
5538				break;
5539			case IR_ULE:
5540				|	jbe =>true_block
5541				break;
5542			case IR_UGT:
5543				|	jp =>true_block
5544				|	ja =>true_block
5545				break;
5546		}
5547	}
5548	if (false_block) {
5549		|	jmp =>false_block
5550	}
5551}
5552
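/* Fused integer compare-and-branch. Unsigned comparisons against constant
 * zero are strength-reduced first (ULT is always false, UGE always true, ULE
 * becomes EQ, UGT becomes NE). If the previous block ended with a comparison
 * of the same operands, the flags are still valid and the CMP is skipped. */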
5553static void ir_emit_cmp_and_branch_int(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn)
5554{
5555	ir_insn *cmp_insn = &ctx->ir_base[insn->op2];
5556	ir_op op = cmp_insn->op;
5557	ir_type type = ctx->ir_base[cmp_insn->op1].type;
5558	ir_ref op1 = cmp_insn->op1;
5559	ir_ref op2 = cmp_insn->op2;
5560	ir_reg op1_reg = ctx->regs[insn->op2][1];
5561	ir_reg op2_reg = ctx->regs[insn->op2][2];
5562
5563	if (op1_reg != IR_REG_NONE && (IR_IS_CONST_REF(op1) || IR_REG_SPILLED(op1_reg))) {
5564		op1_reg = IR_REG_NUM(op1_reg);
5565		ir_emit_load(ctx, type, op1_reg, op1);
5566	}
5567	if (op2_reg != IR_REG_NONE && (IR_IS_CONST_REF(op2) || IR_REG_SPILLED(op2_reg))) {
5568		op2_reg = IR_REG_NUM(op2_reg);
5569		if (op1 != op2) {
5570			ir_emit_load(ctx, type, op2_reg, op2);
5571		}
5572	}
5573	if (IR_IS_CONST_REF(op2) && !IR_IS_SYM_CONST(ctx->ir_base[op2].op) && ctx->ir_base[op2].val.u64 == 0) {
5574		if (op == IR_ULT) {
5575			/* always false */
5576			ir_emit_jmp_false(ctx, b, def);
5577			return;
5578		} else if (op == IR_UGE) {
5579			/* always true */
5580			ir_emit_jmp_true(ctx, b, def);
5581			return;
5582		} else if (op == IR_ULE) {
5583			op = IR_EQ;
5584		} else if (op == IR_UGT) {
5585			op = IR_NE;
5586		}
5587	}
5588
5589	bool same_comparison = 0;
5590	ir_insn *prev_insn = &ctx->ir_base[insn->op1];
5591	if (prev_insn->op == IR_IF_TRUE || prev_insn->op == IR_IF_FALSE) {
5592		if (ir_rule(ctx, prev_insn->op1) == IR_CMP_AND_BRANCH_INT) {
5593			prev_insn = &ctx->ir_base[prev_insn->op1];
5594			prev_insn = &ctx->ir_base[prev_insn->op2];
5595			if (prev_insn->op1 == cmp_insn->op1 && prev_insn->op2 == cmp_insn->op2) {
5596				same_comparison = true;
5597			}
5598		}
5599	}
5600	if (!same_comparison) {
5601		ir_emit_cmp_int_common(ctx, type, def, cmp_insn, op1_reg, op1, op2_reg, op2);
5602	}
5603	ir_emit_jcc(ctx, op, b, def, insn, 1);
5604}
5605
5606static void ir_emit_test_and_branch_int(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn)
5607{
5608	ir_ref op2 = insn->op2;
5609	ir_op op = ctx->ir_base[op2].op;
5610
5611	if (op >= IR_EQ && op <= IR_UGT) {
5612		op2 = ctx->ir_base[op2].op1;
5613	} else {
5614		IR_ASSERT(op == IR_AND);
5615		op = IR_NE;
5616	}
5617
5618	ir_emit_test_int_common(ctx, def, op2, op);
5619	ir_emit_jcc(ctx, op, b, def, insn, 1);
5620}
5621
5622static void ir_emit_cmp_and_branch_fp(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn)
5623{
5624	ir_op op = ir_emit_cmp_fp_common(ctx, def, insn->op2, &ctx->ir_base[insn->op2]);
5625	ir_emit_jcc(ctx, op, b, def, insn, 0);
5626}
5627
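/* Branch on a plain integer value: TEST reg,reg when the value is in a
 * register, CMP mem,0 otherwise. A constant condition degenerates into an
 * unconditional jump (or nothing, if the target is the fall-through block). */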
5628static void ir_emit_if_int(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn)
5629{
5630	ir_type type = ctx->ir_base[insn->op2].type;
5631	ir_reg op2_reg = ctx->regs[def][2];
5632	ir_backend_data *data = ctx->data;
5633	dasm_State **Dst = &data->dasm_state;
5634
5635	if (op2_reg != IR_REG_NONE) {
5636		if (IR_REG_SPILLED(op2_reg)) {
5637			op2_reg = IR_REG_NUM(op2_reg);
5638			ir_emit_load(ctx, type, op2_reg, insn->op2);
5639		}
5640		|	ASM_REG_REG_OP test, type, op2_reg, op2_reg
5641	} else if (IR_IS_CONST_REF(insn->op2)) {
5642		uint32_t true_block, false_block, next_block;
5643
5644		ir_get_true_false_blocks(ctx, b, &true_block, &false_block, &next_block);
5645		if (ir_const_is_true(&ctx->ir_base[insn->op2])) {
5646			if (true_block != next_block) {
5647				|	jmp =>true_block
5648			}
5649		} else {
5650			if (false_block != next_block) {
5651				|	jmp =>false_block
5652			}
5653		}
5654		return;
5655	} else {
5656		ir_mem mem;
5657
5658		if (ir_rule(ctx, insn->op2) & IR_FUSED) {
5659			mem = ir_fuse_load(ctx, def, insn->op2);
5660		} else {
5661			mem = ir_ref_spill_slot(ctx, insn->op2);
5662		}
5663		|	ASM_MEM_IMM_OP cmp, type, mem, 0
5664	}
5665	ir_emit_jcc(ctx, IR_NE, b, def, insn, 1);
5666}
5667
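/* Lower COND (op1 ? op2 : op3) with an explicit compare-against-zero and
 * branches. The FP zero test reuses a 16-byte aligned zero constant in
 * rodata; the jp makes a NaN condition count as true, consistent with
 * NaN != 0. */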
5668static void ir_emit_cond(ir_ctx *ctx, ir_ref def, ir_insn *insn)
5669{
5670	ir_backend_data *data = ctx->data;
5671	dasm_State **Dst = &data->dasm_state;
5672	ir_type type = insn->type;
5673	ir_ref op1 = insn->op1;
5674	ir_ref op2 = insn->op2;
5675	ir_ref op3 = insn->op3;
5676	ir_type op1_type = ctx->ir_base[op1].type;
5677	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
5678	ir_reg op1_reg = ctx->regs[def][1];
5679	ir_reg op2_reg = ctx->regs[def][2];
5680	ir_reg op3_reg = ctx->regs[def][3];
5681
5682	IR_ASSERT(def_reg != IR_REG_NONE);
5683
5684	if (op2_reg != IR_REG_NONE && (IR_REG_SPILLED(op2_reg) || IR_IS_CONST_REF(op2))) {
5685		op2_reg = IR_REG_NUM(op2_reg);
5686		ir_emit_load(ctx, type, op2_reg, op2);
5687		if (op1 == op2) {
5688			op1_reg = op2_reg;
5689		}
5690		if (op3 == op2) {
5691			op3_reg = op2_reg;
5692		}
5693	}
5694	if (op3_reg != IR_REG_NONE && op3 != op2 && (IR_REG_SPILLED(op3_reg) || IR_IS_CONST_REF(op3))) {
5695		op3_reg = IR_REG_NUM(op3_reg);
5696		ir_emit_load(ctx, type, op3_reg, op3);
5697		if (op1 == op3) {
5698			op1_reg = op3_reg;
5699		}
5700	}
5701	if (op1_reg != IR_REG_NONE && op1 != op2 && op1 != op3 && (IR_REG_SPILLED(op1_reg) || IR_IS_CONST_REF(op1))) {
5702		op1_reg = IR_REG_NUM(op1_reg);
5703		ir_emit_load(ctx, op1_type, op1_reg, op1);
5704	}
5705
5706	if (IR_IS_TYPE_INT(op1_type)) {
5707		if (op1_reg != IR_REG_NONE) {
5708			|	ASM_REG_REG_OP test, op1_type, op1_reg, op1_reg
5709		} else {
5710			ir_mem mem = ir_ref_spill_slot(ctx, op1);
5711
5712			|	ASM_MEM_IMM_OP cmp, op1_type, mem, 0
5713		}
5714		|	je >2
5715	} else {
5716		if (!data->double_zero_const) {
5717			data->double_zero_const = 1;
5718			ir_rodata(ctx);
5719			|.align 16
5720			|->double_zero_const:
5721			|.dword 0, 0
5722			|.code
5723		}
5724		|	ASM_FP_REG_TXT_OP ucomis, op1_type, op1_reg, [->double_zero_const]
5725		|	jp >1
5726		|	je >2
5727		|1:
5728	}
5729
5730	if (op2_reg != IR_REG_NONE) {
5731		if (def_reg != op2_reg) {
5732			if (IR_IS_TYPE_INT(type)) {
5733				ir_emit_mov(ctx, type, def_reg, op2_reg);
5734			} else {
5735				ir_emit_fp_mov(ctx, type, def_reg, op2_reg);
5736			}
5737		}
5738	} else if (IR_IS_CONST_REF(op2) || !(ir_rule(ctx, op2) & IR_FUSED)) {
5739		ir_emit_load(ctx, type, def_reg, op2);
5740	} else {
5741		ir_emit_load_mem(ctx, type, def_reg, ir_fuse_load(ctx, def, op2));
5742	}
5743	|	jmp >3
5744	|2:
5745	if (op3_reg != IR_REG_NONE) {
5746		if (def_reg != op3_reg) {
5747			if (IR_IS_TYPE_INT(type)) {
5748				ir_emit_mov(ctx, type, def_reg, op3_reg);
5749			} else {
5750				ir_emit_fp_mov(ctx, type, def_reg, op3_reg);
5751			}
5752		}
5753	} else if (IR_IS_CONST_REF(op3) || !(ir_rule(ctx, op3) & IR_FUSED)) {
5754		ir_emit_load(ctx, type, def_reg, op3);
5755	} else {
5756		ir_emit_load_mem(ctx, type, def_reg, ir_fuse_load(ctx, def, op3));
5757	}
5758	|3:
5759
5760	if (IR_REG_SPILLED(ctx->regs[def][0])) {
5761		ir_emit_store(ctx, type, def, def_reg);
5762	}
5763}
5764
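/* Emit the epilogue and return. On 32-bit fastcall the callee pops its own
 * stack arguments, so "ret imm16" is used instead of a plain ret. */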
5765static void ir_emit_return_void(ir_ctx *ctx)
5766{
5767	ir_backend_data *data = ctx->data;
5768	dasm_State **Dst = &data->dasm_state;
5769
5770	ir_emit_epilogue(ctx);
5771
5772#ifdef IR_TARGET_X86
5773	if (sizeof(void*) == 4 && (ctx->flags & IR_FASTCALL_FUNC) && ctx->param_stack_size) {
5774		|	ret ctx->param_stack_size
5775		return;
5776	}
5777#endif
5778
5779	|	ret
5780}
5781
5782static void ir_emit_return_int(ir_ctx *ctx, ir_ref ref, ir_insn *insn)
5783{
5784	ir_reg op2_reg = ctx->regs[ref][2];
5785
5786	if (op2_reg != IR_REG_INT_RET1) {
5787		ir_type type = ctx->ir_base[insn->op2].type;
5788
5789		if (op2_reg != IR_REG_NONE && !IR_REG_SPILLED(op2_reg)) {
5790			ir_emit_mov(ctx, type, IR_REG_INT_RET1, op2_reg);
5791		} else {
5792			ir_emit_load(ctx, type, IR_REG_INT_RET1, insn->op2);
5793		}
5794	}
5795	ir_emit_return_void(ctx);
5796}
5797
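/* Return a floating-point value. With an SSE return register the value is
 * simply moved there; otherwise (32-bit x86 default ABI) the value must be
 * returned in x87 st(0), so it is either fld'ed straight from its spill slot
 * or first stored to the reserved ret_slot and then fld'ed. */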
5798static void ir_emit_return_fp(ir_ctx *ctx, ir_ref ref, ir_insn *insn)
5799{
5800	ir_reg op2_reg = ctx->regs[ref][2];
5801	ir_type type = ctx->ir_base[insn->op2].type;
5802
5803#ifdef IR_REG_FP_RET1
5804	if (op2_reg != IR_REG_FP_RET1) {
5805		if (op2_reg != IR_REG_NONE && !IR_REG_SPILLED(op2_reg)) {
5806			ir_emit_fp_mov(ctx, type, IR_REG_FP_RET1, op2_reg);
5807		} else {
5808			ir_emit_load(ctx, type, IR_REG_FP_RET1, insn->op2);
5809		}
5810	}
5811#else
5812	ir_backend_data *data = ctx->data;
5813	dasm_State **Dst = &data->dasm_state;
5814
5815	if (op2_reg == IR_REG_NONE || IR_REG_SPILLED(op2_reg)) {
5816		ir_reg fp;
5817		int32_t offset = ir_ref_spill_slot_offset(ctx, insn->op2, &fp);
5818
5819		if (type == IR_DOUBLE) {
5820			|	fld qword [Ra(fp)+offset]
5821		} else {
5822			IR_ASSERT(type == IR_FLOAT);
5823			|	fld dword [Ra(fp)+offset]
5824		}
5825	} else {
5826		int32_t offset = ctx->ret_slot;
5827		ir_reg fp;
5828
5829		IR_ASSERT(offset != -1);
5830		offset = IR_SPILL_POS_TO_OFFSET(offset);
5831		fp = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
5832		ir_emit_store_mem_fp(ctx, type, IR_MEM_BO(fp, offset), op2_reg);
5833		if (type == IR_DOUBLE) {
5834			|	fld qword [Ra(fp)+offset]
5835		} else {
5836			IR_ASSERT(type == IR_FLOAT);
5837			|	fld dword [Ra(fp)+offset]
5838		}
5839	}
5840#endif
5841	ir_emit_return_void(ctx);
5842}
5843
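/* Sign extension: movsx for 8/16-bit sources, movsxd for 32->64 bit. The
 * 64-bit destination forms are only reachable on x86_64, hence the |.if X64
 * guards around the Rq variants. */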
5844static void ir_emit_sext(ir_ctx *ctx, ir_ref def, ir_insn *insn)
5845{
5846	ir_type dst_type = insn->type;
5847	ir_type src_type = ctx->ir_base[insn->op1].type;
5848	ir_backend_data *data = ctx->data;
5849	dasm_State **Dst = &data->dasm_state;
5850	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
5851	ir_reg op1_reg = ctx->regs[def][1];
5852
5853	IR_ASSERT(IR_IS_TYPE_INT(src_type));
5854	IR_ASSERT(IR_IS_TYPE_INT(dst_type));
5855	IR_ASSERT(ir_type_size[dst_type] > ir_type_size[src_type]);
5856	IR_ASSERT(def_reg != IR_REG_NONE);
5857
5858	if (op1_reg != IR_REG_NONE) {
5859		if (IR_REG_SPILLED(op1_reg)) {
5860			op1_reg = IR_REG_NUM(op1_reg);
5861			ir_emit_load(ctx, src_type, op1_reg, insn->op1);
5862		}
5863		if (ir_type_size[src_type] == 1) {
5864			if (ir_type_size[dst_type] == 2) {
5865				|	movsx Rw(def_reg), Rb(op1_reg)
5866			} else if (ir_type_size[dst_type] == 4) {
5867				|	movsx Rd(def_reg), Rb(op1_reg)
5868			} else {
5869				IR_ASSERT(ir_type_size[dst_type] == 8);
5870				IR_ASSERT(sizeof(void*) == 8);
5871|.if X64
5872				|	movsx Rq(def_reg), Rb(op1_reg)
5873|.endif
5874			}
5875		} else if (ir_type_size[src_type] == 2) {
5876			if (ir_type_size[dst_type] == 4) {
5877				|	movsx Rd(def_reg), Rw(op1_reg)
5878			} else {
5879				IR_ASSERT(ir_type_size[dst_type] == 8);
5880				IR_ASSERT(sizeof(void*) == 8);
5881|.if X64
5882				|	movsx Rq(def_reg), Rw(op1_reg)
5883|.endif
5884			}
5885		} else {
5886			IR_ASSERT(ir_type_size[src_type] == 4);
5887			IR_ASSERT(ir_type_size[dst_type] == 8);
5888			IR_ASSERT(sizeof(void*) == 8);
5889|.if X64
5890			|	movsxd Rq(def_reg), Rd(op1_reg)
5891|.endif
5892		}
5893	} else if (IR_IS_CONST_REF(insn->op1)) {
5894		IR_ASSERT(0);
5895	} else {
5896		ir_mem mem;
5897
5898		if (ir_rule(ctx, insn->op1) & IR_FUSED) {
5899			mem = ir_fuse_load(ctx, def, insn->op1);
5900		} else {
5901			mem = ir_ref_spill_slot(ctx, insn->op1);
5902		}
5903
5904		if (ir_type_size[src_type] == 1) {
5905			if (ir_type_size[dst_type] == 2) {
5906				|	ASM_TXT_TMEM_OP movsx, Rw(def_reg), byte, mem
5907			} else if (ir_type_size[dst_type] == 4) {
5908				|	ASM_TXT_TMEM_OP movsx, Rd(def_reg), byte, mem
5909			} else {
5910				IR_ASSERT(ir_type_size[dst_type] == 8);
5911				IR_ASSERT(sizeof(void*) == 8);
5912|.if X64
5913				|	ASM_TXT_TMEM_OP movsx, Rq(def_reg), byte, mem
5914|.endif
5915			}
5916		} else if (ir_type_size[src_type] == 2) {
5917			if (ir_type_size[dst_type] == 4) {
5918				|	ASM_TXT_TMEM_OP movsx, Rd(def_reg), word, mem
5919			} else {
5920				IR_ASSERT(ir_type_size[dst_type] == 8);
5921				IR_ASSERT(sizeof(void*) == 8);
5922|.if X64
5923				|	ASM_TXT_TMEM_OP movsx, Rq(def_reg), word, mem
5924|.endif
5925			}
5926		} else {
5927			IR_ASSERT(ir_type_size[src_type] == 4);
5928			IR_ASSERT(ir_type_size[dst_type] == 8);
5929			IR_ASSERT(sizeof(void*) == 8);
5930|.if X64
5931			|	ASM_TXT_TMEM_OP movsxd, Rq(def_reg), dword, mem
5932|.endif
5933		}
5934	}
5935	if (IR_REG_SPILLED(ctx->regs[def][0])) {
5936		ir_emit_store(ctx, dst_type, def, def_reg);
5937	}
5938}
5939
5940static void ir_emit_zext(ir_ctx *ctx, ir_ref def, ir_insn *insn)
5941{
5942	ir_type dst_type = insn->type;
5943	ir_type src_type = ctx->ir_base[insn->op1].type;
5944	ir_backend_data *data = ctx->data;
5945	dasm_State **Dst = &data->dasm_state;
5946	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
5947	ir_reg op1_reg = ctx->regs[def][1];
5948
5949	IR_ASSERT(IR_IS_TYPE_INT(src_type));
5950	IR_ASSERT(IR_IS_TYPE_INT(dst_type));
5951	IR_ASSERT(ir_type_size[dst_type] > ir_type_size[src_type]);
5952	IR_ASSERT(def_reg != IR_REG_NONE);
5953
5954	if (op1_reg != IR_REG_NONE) {
5955		if (IR_REG_SPILLED(op1_reg)) {
5956			op1_reg = IR_REG_NUM(op1_reg);
5957			ir_emit_load(ctx, src_type, op1_reg, insn->op1);
5958		}
5959		if (ir_type_size[src_type] == 1) {
5960			if (ir_type_size[dst_type] == 2) {
5961				|	movzx Rw(def_reg), Rb(op1_reg)
5962			} else if (ir_type_size[dst_type] == 4) {
5963				|	movzx Rd(def_reg), Rb(op1_reg)
5964			} else {
5965				IR_ASSERT(ir_type_size[dst_type] == 8);
5966				IR_ASSERT(sizeof(void*) == 8);
5967|.if X64
5968				|	movzx Rq(def_reg), Rb(op1_reg)
5969|.endif
5970			}
5971		} else if (ir_type_size[src_type] == 2) {
5972			if (ir_type_size[dst_type] == 4) {
5973				|	movzx Rd(def_reg), Rw(op1_reg)
5974			} else {
5975				IR_ASSERT(ir_type_size[dst_type] == 8);
5976				IR_ASSERT(sizeof(void*) == 8);
5977|.if X64
5978				|	movzx Rq(def_reg), Rw(op1_reg)
5979|.endif
5980			}
5981		} else {
5982			IR_ASSERT(ir_type_size[src_type] == 4);
5983			IR_ASSERT(ir_type_size[dst_type] == 8);
5984			IR_ASSERT(sizeof(void*) == 8);
5985|.if X64
5986			/* Avoid zero extension to the same register: skipping the mov relies on the producer of the 32-bit value having already cleared the upper half. This may not always be safe??? */
5987			if (op1_reg != def_reg) {
5988				|	mov Rd(def_reg), Rd(op1_reg)
5989			}
5990|.endif
5991		}
5992	} else if (IR_IS_CONST_REF(insn->op1)) {
5993		IR_ASSERT(0);
5994	} else {
5995		ir_mem mem;
5996
5997		if (ir_rule(ctx, insn->op1) & IR_FUSED) {
5998			mem = ir_fuse_load(ctx, def, insn->op1);
5999		} else {
6000			mem = ir_ref_spill_slot(ctx, insn->op1);
6001		}
6002
6003		if (ir_type_size[src_type] == 1) {
6004			if (ir_type_size[dst_type] == 2) {
6005				|	ASM_TXT_TMEM_OP movzx, Rw(def_reg), byte, mem
6006			} else if (ir_type_size[dst_type] == 4) {
6007				|	ASM_TXT_TMEM_OP movzx, Rd(def_reg), byte, mem
6008			} else {
6009				IR_ASSERT(ir_type_size[dst_type] == 8);
6010				IR_ASSERT(sizeof(void*) == 8);
6011|.if X64
6012				|	ASM_TXT_TMEM_OP movzx, Rq(def_reg), byte, mem
6013|.endif
6014			}
6015		} else if (ir_type_size[src_type] == 2) {
6016			if (ir_type_size[dst_type] == 4) {
6017				|	ASM_TXT_TMEM_OP movzx, Rd(def_reg), word, mem
6018			} else {
6019				IR_ASSERT(ir_type_size[dst_type] == 8);
6020				IR_ASSERT(sizeof(void*) == 8);
6021|.if X64
6022				|	ASM_TXT_TMEM_OP movzx, Rq(def_reg), word, mem
6023|.endif
6024			}
6025		} else {
6026			IR_ASSERT(ir_type_size[src_type] == 4);
6027			IR_ASSERT(ir_type_size[dst_type] == 8);
6028|.if X64
6029			|	ASM_TXT_TMEM_OP mov, Rd(def_reg), dword, mem
6030|.endif
6031		}
6032	}
6033	if (IR_REG_SPILLED(ctx->regs[def][0])) {
6034		ir_emit_store(ctx, dst_type, def, def_reg);
6035	}
6036}
6037
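/* Integer truncation is just a move of the low part of the value (or nothing
 * at all when source and destination share a register); the high bits are
 * simply ignored by the narrower consumers. */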
6038static void ir_emit_trunc(ir_ctx *ctx, ir_ref def, ir_insn *insn)
6039{
6040	ir_type dst_type = insn->type;
6041	ir_type src_type = ctx->ir_base[insn->op1].type;
6042	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
6043	ir_reg op1_reg = ctx->regs[def][1];
6044
6045	IR_ASSERT(IR_IS_TYPE_INT(src_type));
6046	IR_ASSERT(IR_IS_TYPE_INT(dst_type));
6047	IR_ASSERT(ir_type_size[dst_type] < ir_type_size[src_type]);
6048	IR_ASSERT(def_reg != IR_REG_NONE);
6049	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
6050		op1_reg = IR_REG_NUM(op1_reg);
6051		ir_emit_load(ctx, src_type, op1_reg, insn->op1);
6052	}
6053	if (op1_reg != IR_REG_NONE) {
6054		if (op1_reg != def_reg) {
6055			ir_emit_mov(ctx, dst_type, def_reg, op1_reg);
6056		}
6057	} else {
6058		ir_emit_load(ctx, dst_type, def_reg, insn->op1);
6059	}
6060	if (IR_REG_SPILLED(ctx->regs[def][0])) {
6061		ir_emit_store(ctx, dst_type, def, def_reg);
6062	}
6063}
6064
6065static void ir_emit_bitcast(ir_ctx *ctx, ir_ref def, ir_insn *insn)
6066{
6067	ir_type dst_type = insn->type;
6068	ir_type src_type = ctx->ir_base[insn->op1].type;
6069	ir_backend_data *data = ctx->data;
6070	dasm_State **Dst = &data->dasm_state;
6071	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
6072	ir_reg op1_reg = ctx->regs[def][1];
6073
6074	IR_ASSERT(ir_type_size[dst_type] == ir_type_size[src_type]);
6075	IR_ASSERT(def_reg != IR_REG_NONE);
6076	if (IR_IS_TYPE_INT(src_type) && IR_IS_TYPE_INT(dst_type)) {
6077		if (!IR_IS_CONST_REF(insn->op1) && (ir_rule(ctx, insn->op1) & IR_FUSED)) {
6078			ir_emit_load_mem_int(ctx, dst_type, def_reg, ir_fuse_load(ctx, def, insn->op1));
6079		} else if (op1_reg != IR_REG_NONE) {
6080			if (IR_REG_SPILLED(op1_reg)) {
6081				op1_reg = IR_REG_NUM(op1_reg);
6082				ir_emit_load(ctx, src_type, op1_reg, insn->op1);
6083			}
6084			if (op1_reg != def_reg) {
6085				ir_emit_mov(ctx, dst_type, def_reg, op1_reg);
6086			}
6087		} else {
6088			ir_emit_load(ctx, dst_type, def_reg, insn->op1);
6089		}
6090	} else if (IR_IS_TYPE_FP(src_type) && IR_IS_TYPE_FP(dst_type)) {
6091		if (!IR_IS_CONST_REF(insn->op1) && (ir_rule(ctx, insn->op1) & IR_FUSED)) {
6092			ir_mem mem = ir_fuse_load(ctx, def, insn->op1);
6093			ir_emit_load_mem_fp(ctx, dst_type, def_reg, mem);
6094		} else if (op1_reg != IR_REG_NONE) {
6095			if (IR_REG_SPILLED(op1_reg)) {
6096				op1_reg = IR_REG_NUM(op1_reg);
6097				ir_emit_load(ctx, src_type, op1_reg, insn->op1);
6098			}
6099			if (op1_reg != def_reg) {
6100				ir_emit_fp_mov(ctx, dst_type, def_reg, op1_reg);
6101			}
6102		} else {
6103			ir_emit_load(ctx, dst_type, def_reg, insn->op1);
6104		}
6105	} else if (IR_IS_TYPE_FP(src_type)) {
6106		IR_ASSERT(IR_IS_TYPE_INT(dst_type));
6107		if (op1_reg != IR_REG_NONE) {
6108			if (IR_REG_SPILLED(op1_reg)) {
6109				op1_reg = IR_REG_NUM(op1_reg);
6110				ir_emit_load(ctx, src_type, op1_reg, insn->op1);
6111			}
6112			if (src_type == IR_DOUBLE) {
6113				IR_ASSERT(sizeof(void*) == 8);
6114|.if X64
6115				if (ctx->mflags & IR_X86_AVX) {
6116					|	vmovd Rq(def_reg), xmm(op1_reg-IR_REG_FP_FIRST)
6117				} else {
6118					|	movd Rq(def_reg), xmm(op1_reg-IR_REG_FP_FIRST)
6119				}
6120|.endif
6121			} else {
6122				IR_ASSERT(src_type == IR_FLOAT);
6123				if (ctx->mflags & IR_X86_AVX) {
6124					|	vmovd Rd(def_reg), xmm(op1_reg-IR_REG_FP_FIRST)
6125				} else {
6126					|	movd Rd(def_reg), xmm(op1_reg-IR_REG_FP_FIRST)
6127				}
6128			}
6129		} else if (IR_IS_CONST_REF(insn->op1)) {
6130			ir_insn *_insn = &ctx->ir_base[insn->op1];
6131			IR_ASSERT(!IR_IS_SYM_CONST(_insn->op));
6132			if (src_type == IR_DOUBLE) {
6133				IR_ASSERT(sizeof(void*) == 8);
6134|.if X64
6135				|	mov64 Rq(def_reg), _insn->val.i64
6136|.endif
6137			} else {
6138				IR_ASSERT(src_type == IR_FLOAT);
6139				|	mov Rd(def_reg), _insn->val.i32
6140			}
6141		} else {
6142			ir_mem mem;
6143
6144			if (ir_rule(ctx, insn->op1) & IR_FUSED) {
6145				mem = ir_fuse_load(ctx, def, insn->op1);
6146			} else {
6147				mem = ir_ref_spill_slot(ctx, insn->op1);
6148			}
6149
6150			if (src_type == IR_DOUBLE) {
6151				IR_ASSERT(sizeof(void*) == 8);
6152|.if X64
6153				|	ASM_TXT_TMEM_OP mov, Rq(def_reg), qword, mem
6154|.endif
6155			} else {
6156				IR_ASSERT(src_type == IR_FLOAT);
6157				|	ASM_TXT_TMEM_OP mov, Rd(def_reg), dword, mem
6158			}
6159		}
6160	} else if (IR_IS_TYPE_FP(dst_type)) {
6161		IR_ASSERT(IR_IS_TYPE_INT(src_type));
6162		if (op1_reg != IR_REG_NONE) {
6163			if (IR_REG_SPILLED(op1_reg)) {
6164				op1_reg = IR_REG_NUM(op1_reg);
6165				ir_emit_load(ctx, src_type, op1_reg, insn->op1);
6166			}
6167			if (dst_type == IR_DOUBLE) {
6168				IR_ASSERT(sizeof(void*) == 8);
6169|.if X64
6170				if (ctx->mflags & IR_X86_AVX) {
6171					|	vmovd xmm(def_reg-IR_REG_FP_FIRST), Rq(op1_reg)
6172				} else {
6173					|	movd xmm(def_reg-IR_REG_FP_FIRST), Rq(op1_reg)
6174				}
6175|.endif
6176			} else {
6177				IR_ASSERT(dst_type == IR_FLOAT);
6178				if (ctx->mflags & IR_X86_AVX) {
6179					|	vmovd xmm(def_reg-IR_REG_FP_FIRST), Rd(op1_reg)
6180				} else {
6181					|	movd xmm(def_reg-IR_REG_FP_FIRST), Rd(op1_reg)
6182				}
6183			}
6184		} else if (IR_IS_CONST_REF(insn->op1)) {
6185			int label = ir_const_label(ctx, insn->op1);
6186
6187			|	ASM_FP_REG_TXT_OP movs, dst_type, def_reg, [=>label]
6188		} else {
6189			ir_mem mem;
6190
6191			if (ir_rule(ctx, insn->op1) & IR_FUSED) {
6192				mem = ir_fuse_load(ctx, def, insn->op1);
6193			} else {
6194				mem = ir_ref_spill_slot(ctx, insn->op1);
6195			}
6196
6197			|	ASM_FP_REG_MEM_OP movs, dst_type, def_reg, mem
6198		}
6199	}
6200	if (IR_REG_SPILLED(ctx->regs[def][0])) {
6201		ir_emit_store(ctx, dst_type, def, def_reg);
6202	}
6203}
6204
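/* Integer-to-FP conversion. CVTSI2SS/SD accepts only 32/64-bit sources, so
 * narrower values are first widened with movsx/movzx; an unsigned 32-bit
 * value goes through the 64-bit signed convert, relying on the implicit zero
 * extension of 32-bit writes on x86_64. The preceding pxor/vxorps clears the
 * destination XMM register, avoiding a false dependency on its previous value
 * (the convert writes only the low element). */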
6205static void ir_emit_int2fp(ir_ctx *ctx, ir_ref def, ir_insn *insn)
6206{
6207	ir_type dst_type = insn->type;
6208	ir_type src_type = ctx->ir_base[insn->op1].type;
6209	ir_backend_data *data = ctx->data;
6210	dasm_State **Dst = &data->dasm_state;
6211	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
6212	ir_reg op1_reg = ctx->regs[def][1];
6213
6214	IR_ASSERT(IR_IS_TYPE_INT(src_type));
6215	IR_ASSERT(IR_IS_TYPE_FP(dst_type));
6216	IR_ASSERT(def_reg != IR_REG_NONE);
6217	if (op1_reg != IR_REG_NONE) {
6218		bool src64 = 0;
6219
6220		if (IR_REG_SPILLED(op1_reg) || IR_IS_CONST_REF(insn->op1)) {
6221			op1_reg = IR_REG_NUM(op1_reg);
6222			ir_emit_load(ctx, src_type, op1_reg, insn->op1);
6223		}
6224		if (IR_IS_TYPE_SIGNED(src_type)) {
6225			if (ir_type_size[src_type] < 4) {
6226|.if X64
6227||				if (ir_type_size[src_type] == 1) {
6228					| movsx Rq(op1_reg), Rb(op1_reg)
6229||				} else {
6230					| movsx Rq(op1_reg), Rw(op1_reg)
6231||				}
6232||				src64 = 1;
6233|.else
6234||				if (ir_type_size[src_type] == 1) {
6235					| movsx Rd(op1_reg), Rb(op1_reg)
6236||				} else {
6237					| movsx Rd(op1_reg), Rw(op1_reg)
6238||				}
6239|.endif
6240			} else if (ir_type_size[src_type] > 4) {
6241				src64 = 1;
6242			}
6243		} else {
6244			if (ir_type_size[src_type] < 8) {
6245|.if X64
6246||				if (ir_type_size[src_type] == 1) {
6247					| movzx Rq(op1_reg), Rb(op1_reg)
6248||				} else if (ir_type_size[src_type] == 2) {
6249					| movzx Rq(op1_reg), Rw(op1_reg)
6250||				}
6251||				src64 = 1;
6252|.else
6253||				if (ir_type_size[src_type] == 1) {
6254					| movzx Rd(op1_reg), Rb(op1_reg)
6255||				} else if (ir_type_size[src_type] == 2) {
6256					| movzx Rd(op1_reg), Rw(op1_reg)
6257||				}
6258|.endif
6259			} else {
6260				// TODO: uint64_t -> double
6261				src64 = 1;
6262			}
6263		}
6264		if (!src64) {
6265			if (dst_type == IR_DOUBLE) {
6266				if (ctx->mflags & IR_X86_AVX) {
6267					|	vxorps xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
6268					|	vcvtsi2sd xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), Rd(op1_reg)
6269				} else {
6270					|	pxor xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
6271					|	cvtsi2sd xmm(def_reg-IR_REG_FP_FIRST), Rd(op1_reg)
6272				}
6273			} else {
6274				IR_ASSERT(dst_type == IR_FLOAT);
6275				if (ctx->mflags & IR_X86_AVX) {
6276					|	vxorps xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
6277					|	vcvtsi2ss xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), Rd(op1_reg)
6278				} else {
6279					|	pxor xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
6280					|	cvtsi2ss xmm(def_reg-IR_REG_FP_FIRST), Rd(op1_reg)
6281				}
6282			}
6283		} else {
6284			IR_ASSERT(sizeof(void*) == 8);
6285|.if X64
6286			if (dst_type == IR_DOUBLE) {
6287				if (ctx->mflags & IR_X86_AVX) {
6288					|	vxorps xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
6289					|	vcvtsi2sd xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), Rq(op1_reg)
6290				} else {
6291					|	pxor xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
6292					|	cvtsi2sd xmm(def_reg-IR_REG_FP_FIRST), Rq(op1_reg)
6293				}
6294			} else {
6295				IR_ASSERT(dst_type == IR_FLOAT);
6296				if (ctx->mflags & IR_X86_AVX) {
6297					|	vxorps xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
6298					|	vcvtsi2ss xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), Rq(op1_reg)
6299				} else {
6300					|	pxor xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
6301					|	cvtsi2ss xmm(def_reg-IR_REG_FP_FIRST), Rq(op1_reg)
6302				}
6303			}
6304|.endif
6305		}
6306	} else {
6307		ir_mem mem;
6308		bool src64 = ir_type_size[src_type] == 8;
6309
6310		if (ir_rule(ctx, insn->op1) & IR_FUSED) {
6311			mem = ir_fuse_load(ctx, def, insn->op1);
6312		} else {
6313			mem = ir_ref_spill_slot(ctx, insn->op1);
6314		}
6315
6316		if (!src64) {
6317			if (dst_type == IR_DOUBLE) {
6318				if (ctx->mflags & IR_X86_AVX) {
6319					|	vxorps xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
6320					|	ASM_TXT_TXT_TMEM_OP vcvtsi2sd, xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), dword, mem
6321				} else {
6322					|	pxor xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
6323					|	ASM_TXT_TMEM_OP cvtsi2sd, xmm(def_reg-IR_REG_FP_FIRST), dword, mem
6324				}
6325			} else {
6326				IR_ASSERT(dst_type == IR_FLOAT);
6327				if (ctx->mflags & IR_X86_AVX) {
6328					|	vxorps xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
6329					|	ASM_TXT_TXT_TMEM_OP vcvtsi2ss, xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), dword, mem
6330				} else {
6331					|	pxor xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
6332					|	ASM_TXT_TMEM_OP cvtsi2ss, xmm(def_reg-IR_REG_FP_FIRST), dword, mem
6333				}
6334			}
6335		} else {
6336			IR_ASSERT(sizeof(void*) == 8);
6337|.if X64
6338			if (dst_type == IR_DOUBLE) {
6339				if (ctx->mflags & IR_X86_AVX) {
6340					|	vxorps xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
6341					|	ASM_TXT_TXT_TMEM_OP vcvtsi2sd, xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), qword, mem
6342				} else {
6343					|	pxor xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
6344					|	ASM_TXT_TMEM_OP cvtsi2sd, xmm(def_reg-IR_REG_FP_FIRST), qword, mem
6345				}
6346			} else {
6347				IR_ASSERT(dst_type == IR_FLOAT);
6348				if (ctx->mflags & IR_X86_AVX) {
6349					|	vxorps xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
6350					|	ASM_TXT_TXT_TMEM_OP vcvtsi2ss, xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), qword, mem
6351				} else {
6352					|	pxor xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
6353					|	ASM_TXT_TMEM_OP cvtsi2ss, xmm(def_reg-IR_REG_FP_FIRST), qword, mem
6354				}
6355			}
6356|.endif
6357		}
6358	}
6359	if (IR_REG_SPILLED(ctx->regs[def][0])) {
6360		ir_emit_store(ctx, dst_type, def, def_reg);
6361	}
6362}
6363
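/* FP-to-integer conversion. Note that CVTSS2SI/CVTSD2SI round according to
 * MXCSR (round-to-nearest by default) rather than truncating. dst64 selects
 * the 64-bit register form, also for unsigned 32-bit destinations, so that
 * values above INT32_MAX survive the signed conversion. */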
6364static void ir_emit_fp2int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
6365{
6366	ir_type dst_type = insn->type;
6367	ir_type src_type = ctx->ir_base[insn->op1].type;
6368	ir_backend_data *data = ctx->data;
6369	dasm_State **Dst = &data->dasm_state;
6370	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
6371	ir_reg op1_reg = ctx->regs[def][1];
6372	bool dst64 = 0;
6373
6374	IR_ASSERT(IR_IS_TYPE_FP(src_type));
6375	IR_ASSERT(IR_IS_TYPE_INT(dst_type));
6376	IR_ASSERT(def_reg != IR_REG_NONE);
6377	if (IR_IS_TYPE_SIGNED(dst_type) ? ir_type_size[dst_type] == 8 : ir_type_size[dst_type] >= 4) {
6378		// TODO: we might need to truncate the 32/64-bit integer result
6379		dst64 = 1;
6380	}
6381	if (op1_reg != IR_REG_NONE) {
6382		if (IR_REG_SPILLED(op1_reg)) {
6383			op1_reg = IR_REG_NUM(op1_reg);
6384			ir_emit_load(ctx, src_type, op1_reg, insn->op1);
6385		}
6386		if (!dst64) {
6387			if (src_type == IR_DOUBLE) {
6388				if (ctx->mflags & IR_X86_AVX) {
6389					|	vcvtsd2si Rd(def_reg), xmm(op1_reg-IR_REG_FP_FIRST)
6390				} else {
6391					|	cvtsd2si Rd(def_reg), xmm(op1_reg-IR_REG_FP_FIRST)
6392				}
6393			} else {
6394				IR_ASSERT(src_type == IR_FLOAT);
6395				if (ctx->mflags & IR_X86_AVX) {
6396					|	vcvtss2si Rd(def_reg), xmm(op1_reg-IR_REG_FP_FIRST)
6397				} else {
6398					|	cvtss2si Rd(def_reg), xmm(op1_reg-IR_REG_FP_FIRST)
6399				}
6400			}
6401		} else {
6402			IR_ASSERT(sizeof(void*) == 8);
6403|.if X64
6404			if (src_type == IR_DOUBLE) {
6405				if (ctx->mflags & IR_X86_AVX) {
6406					|	vcvtsd2si Rq(def_reg), xmm(op1_reg-IR_REG_FP_FIRST)
6407				} else {
6408					|	cvtsd2si Rq(def_reg), xmm(op1_reg-IR_REG_FP_FIRST)
6409				}
6410			} else {
6411				IR_ASSERT(src_type == IR_FLOAT);
6412				if (ctx->mflags & IR_X86_AVX) {
6413					|	vcvtss2si Rq(def_reg), xmm(op1_reg-IR_REG_FP_FIRST)
6414				} else {
6415					|	cvtss2si Rq(def_reg), xmm(op1_reg-IR_REG_FP_FIRST)
6416				}
6417			}
6418|.endif
6419		}
6420	} else if (IR_IS_CONST_REF(insn->op1)) {
6421		int label = ir_const_label(ctx, insn->op1);
6422
6423		if (!dst64) {
6424			if (src_type == IR_DOUBLE) {
6425				if (ctx->mflags & IR_X86_AVX) {
6426					|	vcvtsd2si Rd(def_reg), qword [=>label]
6427				} else {
6428					|	cvtsd2si Rd(def_reg), qword [=>label]
6429				}
6430			} else {
6431				IR_ASSERT(src_type == IR_FLOAT);
6432				if (ctx->mflags & IR_X86_AVX) {
6433					|	vcvtss2si Rd(def_reg), dword [=>label]
6434				} else {
6435					|	cvtss2si Rd(def_reg), dword [=>label]
6436				}
6437			}
6438		} else {
6439			IR_ASSERT(sizeof(void*) == 8);
6440|.if X64
6441			if (src_type == IR_DOUBLE) {
6442				if (ctx->mflags & IR_X86_AVX) {
6443					|	vcvtsd2si Rq(def_reg), qword [=>label]
6444				} else {
6445					|	cvtsd2si Rq(def_reg), qword [=>label]
6446				}
6447			} else {
6448				IR_ASSERT(src_type == IR_FLOAT);
6449				if (ctx->mflags & IR_X86_AVX) {
6450					|	vcvtss2si Rq(def_reg), dword [=>label]
6451				} else {
6452					|	cvtss2si Rq(def_reg), dword [=>label]
6453				}
6454			}
6455|.endif
6456		}
6457	} else {
6458		ir_mem mem;
6459
6460		if (ir_rule(ctx, insn->op1) & IR_FUSED) {
6461			mem = ir_fuse_load(ctx, def, insn->op1);
6462		} else {
6463			mem = ir_ref_spill_slot(ctx, insn->op1);
6464		}
6465
6466		if (!dst64) {
6467			if (src_type == IR_DOUBLE) {
6468				if (ctx->mflags & IR_X86_AVX) {
6469					|	ASM_TXT_TMEM_OP vcvtsd2si, Rd(def_reg), qword, mem
6470				} else {
6471					|	ASM_TXT_TMEM_OP cvtsd2si, Rd(def_reg), qword, mem
6472				}
6473			} else {
6474				IR_ASSERT(src_type == IR_FLOAT);
6475				if (ctx->mflags & IR_X86_AVX) {
6476					|	ASM_TXT_TMEM_OP vcvtss2si, Rd(def_reg), dword, mem
6477				} else {
6478					|	ASM_TXT_TMEM_OP cvtss2si, Rd(def_reg), dword, mem
6479				}
6480			}
6481		} else {
6482			IR_ASSERT(sizeof(void*) == 8);
6483|.if X64
6484			if (src_type == IR_DOUBLE) {
6485				if (ctx->mflags & IR_X86_AVX) {
6486					|	ASM_TXT_TMEM_OP vcvtsd2si, Rq(def_reg), qword, mem
6487				} else {
6488					|	ASM_TXT_TMEM_OP cvtsd2si, Rq(def_reg), qword, mem
6489				}
6490			} else {
6491				IR_ASSERT(src_type == IR_FLOAT);
6492				if (ctx->mflags & IR_X86_AVX) {
6493					|	ASM_TXT_TMEM_OP vcvtss2si, Rq(def_reg), dword, mem
6494				} else {
6495					|	ASM_TXT_TMEM_OP cvtss2si, Rq(def_reg), dword, mem
6496				}
6497			}
6498|.endif
6499		}
6500	}
6501	if (IR_REG_SPILLED(ctx->regs[def][0])) {
6502		ir_emit_store(ctx, dst_type, def, def_reg);
6503	}
6504}
6505
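/* float <-> double conversion via CVTSS2SD/CVTSD2SS; a same-type "conversion"
 * degenerates into a plain register move. */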
6506static void ir_emit_fp2fp(ir_ctx *ctx, ir_ref def, ir_insn *insn)
6507{
6508	ir_type dst_type = insn->type;
6509	ir_type src_type = ctx->ir_base[insn->op1].type;
6510	ir_backend_data *data = ctx->data;
6511	dasm_State **Dst = &data->dasm_state;
6512	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
6513	ir_reg op1_reg = ctx->regs[def][1];
6514
6515	IR_ASSERT(IR_IS_TYPE_FP(src_type));
6516	IR_ASSERT(IR_IS_TYPE_FP(dst_type));
6517	IR_ASSERT(def_reg != IR_REG_NONE);
6518	if (op1_reg != IR_REG_NONE) {
6519		if (IR_REG_SPILLED(op1_reg)) {
6520			op1_reg = IR_REG_NUM(op1_reg);
6521			ir_emit_load(ctx, src_type, op1_reg, insn->op1);
6522		}
6523		if (src_type == dst_type) {
6524			if (op1_reg != def_reg) {
6525				ir_emit_fp_mov(ctx, dst_type, def_reg, op1_reg);
6526			}
6527		} else if (src_type == IR_DOUBLE) {
6528			if (ctx->mflags & IR_X86_AVX) {
6529				|	vcvtsd2ss xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), xmm(op1_reg-IR_REG_FP_FIRST)
6530			} else {
6531				|	cvtsd2ss xmm(def_reg-IR_REG_FP_FIRST), xmm(op1_reg-IR_REG_FP_FIRST)
6532			}
6533		} else {
6534			IR_ASSERT(src_type == IR_FLOAT);
6535			if (ctx->mflags & IR_X86_AVX) {
6536				|	vcvtss2sd xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), xmm(op1_reg-IR_REG_FP_FIRST)
6537			} else {
6538				|	cvtss2sd xmm(def_reg-IR_REG_FP_FIRST), xmm(op1_reg-IR_REG_FP_FIRST)
6539			}
6540		}
6541	} else if (IR_IS_CONST_REF(insn->op1)) {
6542		int label = ir_const_label(ctx, insn->op1);
6543
6544		if (src_type == IR_DOUBLE) {
6545			if (ctx->mflags & IR_X86_AVX) {
6546				|	vcvtsd2ss xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), qword [=>label]
6547			} else {
6548				|	cvtsd2ss xmm(def_reg-IR_REG_FP_FIRST), qword [=>label]
6549			}
6550		} else {
6551			IR_ASSERT(src_type == IR_FLOAT);
6552			if (ctx->mflags & IR_X86_AVX) {
6553				|	vcvtss2sd xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), dword [=>label]
6554			} else {
6555				|	cvtss2sd xmm(def_reg-IR_REG_FP_FIRST), dword [=>label]
6556			}
6557		}
6558	} else {
6559		ir_mem mem;
6560
6561		if (ir_rule(ctx, insn->op1) & IR_FUSED) {
6562			mem = ir_fuse_load(ctx, def, insn->op1);
6563		} else {
6564			mem = ir_ref_spill_slot(ctx, insn->op1);
6565		}
6566
6567		if (src_type == IR_DOUBLE) {
6568			if (ctx->mflags & IR_X86_AVX) {
6569				|	ASM_TXT_TXT_TMEM_OP vcvtsd2ss, xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), qword, mem
6570			} else {
6571				|	ASM_TXT_TMEM_OP cvtsd2ss, xmm(def_reg-IR_REG_FP_FIRST), qword, mem
6572			}
6573		} else {
6574			IR_ASSERT(src_type == IR_FLOAT);
6575			if (ctx->mflags & IR_X86_AVX) {
6576				|	ASM_TXT_TXT_TMEM_OP vcvtss2sd, xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), dword, mem
6577			} else {
6578				|	ASM_TXT_TMEM_OP cvtss2sd, xmm(def_reg-IR_REG_FP_FIRST), dword, mem
6579			}
6580		}
6581	}
6582	if (IR_REG_SPILLED(ctx->regs[def][0])) {
6583		ir_emit_store(ctx, dst_type, def, def_reg);
6584	}
6585}
6586
6587static void ir_emit_copy_int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
6588{
6589	ir_type type = insn->type;
6590	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
6591	ir_reg op1_reg = ctx->regs[def][1];
6592
6593	IR_ASSERT(def_reg != IR_REG_NONE || op1_reg != IR_REG_NONE);
6594	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
6595		op1_reg = IR_REG_NUM(op1_reg);
6596		ir_emit_load(ctx, type, op1_reg, insn->op1);
6597	}
6598	if (def_reg == op1_reg) {
6599		/* same reg */
6600	} else if (def_reg != IR_REG_NONE && op1_reg != IR_REG_NONE) {
6601		ir_emit_mov(ctx, type, def_reg, op1_reg);
6602	} else if (def_reg != IR_REG_NONE) {
6603		ir_emit_load(ctx, type, def_reg, insn->op1);
6604	} else if (op1_reg != IR_REG_NONE) {
6605		ir_emit_store(ctx, type, def, op1_reg);
6606	} else {
6607		IR_ASSERT(0);
6608	}
6609	if (def_reg != IR_REG_NONE && IR_REG_SPILLED(ctx->regs[def][0])) {
6610		ir_emit_store(ctx, type, def, def_reg);
6611	}
6612}
6613
6614static void ir_emit_copy_fp(ir_ctx *ctx, ir_ref def, ir_insn *insn)
6615{
6616	ir_type type = insn->type;
6617	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
6618	ir_reg op1_reg = ctx->regs[def][1];
6619
6620	IR_ASSERT(def_reg != IR_REG_NONE || op1_reg != IR_REG_NONE);
6621	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
6622		op1_reg = IR_REG_NUM(op1_reg);
6623		ir_emit_load(ctx, type, op1_reg, insn->op1);
6624	}
6625	if (def_reg == op1_reg) {
6626		/* same reg */
6627	} else if (def_reg != IR_REG_NONE && op1_reg != IR_REG_NONE) {
6628		ir_emit_fp_mov(ctx, type, def_reg, op1_reg);
6629	} else if (def_reg != IR_REG_NONE) {
6630		ir_emit_load(ctx, type, def_reg, insn->op1);
6631	} else if (op1_reg != IR_REG_NONE) {
6632		ir_emit_store(ctx, type, def, op1_reg);
6633	} else {
6634		IR_ASSERT(0);
6635	}
6636	if (def_reg != IR_REG_NONE && IR_REG_SPILLED(ctx->regs[def][0])) {
6637		ir_emit_store(ctx, type, def, def_reg);
6638	}
6639}
6640
6641static void ir_emit_vaddr(ir_ctx *ctx, ir_ref def, ir_insn *insn)
6642{
6643	ir_backend_data *data = ctx->data;
6644	dasm_State **Dst = &data->dasm_state;
6645	ir_type type = insn->type;
6646	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
6647	ir_mem mem;
6648	int32_t offset;
6649	ir_reg fp;
6650
6651	IR_ASSERT(def_reg != IR_REG_NONE);
6652	mem = ir_var_spill_slot(ctx, insn->op1);
6653	fp = IR_MEM_BASE(mem);
6654	offset = IR_MEM_OFFSET(mem);
6655	|	lea Ra(def_reg), aword [Ra(fp)+offset]
6656	if (IR_REG_SPILLED(ctx->regs[def][0])) {
6657		ir_emit_store(ctx, type, def, def_reg);
6658	}
6659}
6660
6661static void ir_emit_vload(ir_ctx *ctx, ir_ref def, ir_insn *insn)
6662{
6663	ir_insn *var_insn = &ctx->ir_base[insn->op2];
6664	ir_type type = insn->type;
6665	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
6666	ir_reg fp;
6667	ir_mem mem;
6668
6669	IR_ASSERT(var_insn->op == IR_VAR);
6670	fp = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
6671	mem = IR_MEM_BO(fp, IR_SPILL_POS_TO_OFFSET(var_insn->op3));
6672	if (def_reg == IR_REG_NONE && ir_is_same_mem_var(ctx, def, var_insn->op3)) {
6673		return; // fake load
6674	}
6675	IR_ASSERT(def_reg != IR_REG_NONE);
6676
6677	ir_emit_load_mem(ctx, type, def_reg, mem);
6678	if (IR_REG_SPILLED(ctx->regs[def][0])) {
6679		ir_emit_store(ctx, type, def, def_reg);
6680	}
6681}
6682
6683static void ir_emit_vstore_int(ir_ctx *ctx, ir_ref ref, ir_insn *insn)
6684{
6685	ir_insn *var_insn = &ctx->ir_base[insn->op2];
6686	ir_insn *val_insn = &ctx->ir_base[insn->op3];
6687	ir_type type = val_insn->type;
6688	ir_reg op3_reg = ctx->regs[ref][3];
6689	ir_reg fp;
6690	ir_mem mem;
6691
6692	IR_ASSERT(var_insn->op == IR_VAR);
6693	fp = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
6694	mem = IR_MEM_BO(fp, IR_SPILL_POS_TO_OFFSET(var_insn->op3));
6695	if ((op3_reg == IR_REG_NONE || IR_REG_SPILLED(op3_reg))
6696	 && !IR_IS_CONST_REF(insn->op3) && ir_is_same_mem_var(ctx, insn->op3, var_insn->op3)) {
6697		return; // fake store
6698	}
6699	if (IR_IS_CONST_REF(insn->op3)) {
6700		ir_emit_store_mem_int_const(ctx, type, mem, insn->op3, op3_reg, 0);
6701	} else {
6702		IR_ASSERT(op3_reg != IR_REG_NONE);
6703		if (IR_REG_SPILLED(op3_reg)) {
6704			op3_reg = IR_REG_NUM(op3_reg);
6705			ir_emit_load(ctx, type, op3_reg, insn->op3);
6706		}
6707		ir_emit_store_mem_int(ctx, type, mem, op3_reg);
6708	}
6709}
6710
6711static void ir_emit_vstore_fp(ir_ctx *ctx, ir_ref ref, ir_insn *insn)
6712{
6713	ir_insn *var_insn = &ctx->ir_base[insn->op2];
6714	ir_type type = ctx->ir_base[insn->op3].type;
6715	ir_reg op3_reg = ctx->regs[ref][3];
6716	ir_reg fp;
6717	ir_mem mem;
6718
6719	IR_ASSERT(var_insn->op == IR_VAR);
6720	fp = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
6721	mem = IR_MEM_BO(fp, IR_SPILL_POS_TO_OFFSET(var_insn->op3));
6722	if ((op3_reg == IR_REG_NONE || IR_REG_SPILLED(op3_reg))
6723	 && !IR_IS_CONST_REF(insn->op3) && ir_is_same_mem_var(ctx, insn->op3, var_insn->op3)) {
6724		return; // fake store
6725	}
6726	if (IR_IS_CONST_REF(insn->op3)) {
6727		ir_emit_store_mem_fp_const(ctx, type, mem, insn->op3, IR_REG_NONE, op3_reg);
6728	} else {
6729		IR_ASSERT(op3_reg != IR_REG_NONE);
6730		if (IR_REG_SPILLED(op3_reg)) {
6731			op3_reg = IR_REG_NUM(op3_reg);
6732			ir_emit_load(ctx, type, op3_reg, insn->op3);
6733		}
6734		ir_emit_store_mem_fp(ctx, type, mem, op3_reg);
6735	}
6736}
6737
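/* Integer LOAD. A load whose single remaining use is the control link is dead
 * and emits nothing. Fused address expressions are folded into the addressing
 * mode, and a load that targets its own spill slot is elided when the
 * register is not reused afterwards. */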
6738static void ir_emit_load_int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
6739{
6740	ir_type type = insn->type;
6741	ir_reg op2_reg = ctx->regs[def][2];
6742	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
6743	ir_mem mem;
6744
6745	if (ctx->use_lists[def].count == 1) {
6746		/* dead load */
6747		return;
6748	}
6749	IR_ASSERT(def_reg != IR_REG_NONE);
6750	if (op2_reg != IR_REG_NONE) {
6751		if (IR_REG_SPILLED(op2_reg) || IR_IS_CONST_REF(insn->op2)) {
6752			op2_reg = IR_REG_NUM(op2_reg);
6753			IR_ASSERT(ctx->ir_base[insn->op2].type == IR_ADDR);
6754			ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2);
6755		}
6756		mem = IR_MEM_B(op2_reg);
6757	} else if (IR_IS_CONST_REF(insn->op2)) {
6758		mem = ir_fuse_addr_const(ctx, insn->op2);
6759	} else {
6760		IR_ASSERT(ir_rule(ctx, insn->op2) & IR_FUSED);
6761		mem = ir_fuse_addr(ctx, def, insn->op2);
6762		if (IR_REG_SPILLED(ctx->regs[def][0]) && ir_is_same_spill_slot(ctx, def, mem)) {
6763			if (!ir_may_avoid_spill_load(ctx, def, def)) {
6764				ir_emit_load_mem_int(ctx, type, def_reg, mem);
6765			}
6766			/* avoid load to the same location (valid only when register is not reused) */
6767			return;
6768		}
6769	}
6770
6771	ir_emit_load_mem_int(ctx, type, def_reg, mem);
6772	if (IR_REG_SPILLED(ctx->regs[def][0])) {
6773		ir_emit_store(ctx, type, def, def_reg);
6774	}
6775}
6776
6777static void ir_emit_load_fp(ir_ctx *ctx, ir_ref def, ir_insn *insn)
6778{
6779	ir_type type = insn->type;
6780	ir_reg op2_reg = ctx->regs[def][2];
6781	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
6782	ir_mem mem;
6783
6784	if (ctx->use_lists[def].count == 1) {
6785		/* dead load */
6786		return;
6787	}
6788	IR_ASSERT(def_reg != IR_REG_NONE);
6789	if (op2_reg != IR_REG_NONE) {
6790		if (IR_REG_SPILLED(op2_reg) || IR_IS_CONST_REF(insn->op2)) {
6791			op2_reg = IR_REG_NUM(op2_reg);
6792			IR_ASSERT(ctx->ir_base[insn->op2].type == IR_ADDR);
6793			ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2);
6794		}
6795		mem = IR_MEM_B(op2_reg);
6796	} else if (IR_IS_CONST_REF(insn->op2)) {
6797		mem = ir_fuse_addr_const(ctx, insn->op2);
6798	} else {
6799		IR_ASSERT(ir_rule(ctx, insn->op2) & IR_FUSED);
6800		mem = ir_fuse_addr(ctx, def, insn->op2);
6801		if (IR_REG_SPILLED(ctx->regs[def][0]) && ir_is_same_spill_slot(ctx, def, mem)) {
6802			if (!ir_may_avoid_spill_load(ctx, def, def)) {
6803				ir_emit_load_mem_fp(ctx, type, def_reg, mem);
6804			}
6805			/* avoid load to the same location (valid only when register is not reused) */
6806			return;
6807		}
6808	}
6809
6810	ir_emit_load_mem_fp(ctx, type, def_reg, mem);
6811	if (IR_REG_SPILLED(ctx->regs[def][0])) {
6812		ir_emit_store(ctx, type, def, def_reg);
6813	}
6814}
6815
6816static void ir_emit_store_int(ir_ctx *ctx, ir_ref ref, ir_insn *insn)
6817{
6818	ir_insn *val_insn = &ctx->ir_base[insn->op3];
6819	ir_type type = val_insn->type;
6820	ir_reg op2_reg = ctx->regs[ref][2];
6821	ir_reg op3_reg = ctx->regs[ref][3];
6822	ir_mem mem;
6823
6824	if (op2_reg != IR_REG_NONE) {
6825		if (IR_REG_SPILLED(op2_reg) || IR_IS_CONST_REF(insn->op2)) {
6826			op2_reg = IR_REG_NUM(op2_reg);
6827			IR_ASSERT(ctx->ir_base[insn->op2].type == IR_ADDR);
6828			ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2);
6829		}
6830		mem = IR_MEM_B(op2_reg);
6831	} else if (IR_IS_CONST_REF(insn->op2)) {
6832		mem = ir_fuse_addr_const(ctx, insn->op2);
6833	} else {
6834		IR_ASSERT(ir_rule(ctx, insn->op2) & IR_FUSED);
6835		mem = ir_fuse_addr(ctx, ref, insn->op2);
6836		if (!IR_IS_CONST_REF(insn->op3) && IR_REG_SPILLED(op3_reg) && ir_is_same_spill_slot(ctx, insn->op3, mem)) {
6837			if (!ir_may_avoid_spill_load(ctx, insn->op3, ref)) {
6838				op3_reg = IR_REG_NUM(op3_reg);
6839				ir_emit_load(ctx, type, op3_reg, insn->op3);
6840			}
6841			/* avoid store to the same location */
6842			return;
6843		}
6844	}
6845
6846	if (IR_IS_CONST_REF(insn->op3)) {
6847		ir_emit_store_mem_int_const(ctx, type, mem, insn->op3, op3_reg, 0);
6848	} else {
6849		IR_ASSERT(op3_reg != IR_REG_NONE);
6850		if (IR_REG_SPILLED(op3_reg)) {
6851			op3_reg = IR_REG_NUM(op3_reg);
6852			ir_emit_load(ctx, type, op3_reg, insn->op3);
6853		}
6854		ir_emit_store_mem_int(ctx, type, mem, op3_reg);
6855	}
6856}
6857
6858static void ir_emit_store_fp(ir_ctx *ctx, ir_ref ref, ir_insn *insn)
6859{
6860	ir_type type = ctx->ir_base[insn->op3].type;
6861	ir_reg op2_reg = ctx->regs[ref][2];
6862	ir_reg op3_reg = ctx->regs[ref][3];
6863	ir_mem mem;
6864
6865	IR_ASSERT(op3_reg != IR_REG_NONE);
6866	if (op2_reg != IR_REG_NONE) {
6867		if (IR_REG_SPILLED(op2_reg) || IR_IS_CONST_REF(insn->op2)) {
6868			op2_reg = IR_REG_NUM(op2_reg);
6869			IR_ASSERT(ctx->ir_base[insn->op2].type == IR_ADDR);
6870			ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2);
6871		}
6872		mem = IR_MEM_B(op2_reg);
6873	} else if (IR_IS_CONST_REF(insn->op2)) {
6874		mem = ir_fuse_addr_const(ctx, insn->op2);
6875	} else {
6876		IR_ASSERT(ir_rule(ctx, insn->op2) & IR_FUSED);
6877		mem = ir_fuse_addr(ctx, ref, insn->op2);
6878		if (!IR_IS_CONST_REF(insn->op3) && IR_REG_SPILLED(op3_reg) && ir_is_same_spill_slot(ctx, insn->op3, mem)) {
6879			if (!ir_may_avoid_spill_load(ctx, insn->op3, ref)) {
6880				op3_reg = IR_REG_NUM(op3_reg);
6881				ir_emit_load(ctx, type, op3_reg, insn->op3);
6882			}
6883			/* avoid store to the same location */
6884			return;
6885		}
6886	}
6887
6888	if (IR_IS_CONST_REF(insn->op3)) {
6889		ir_emit_store_mem_fp_const(ctx, type, mem, insn->op3, IR_REG_NONE, op3_reg);
6890	} else {
6891		IR_ASSERT(op3_reg != IR_REG_NONE);
6892		if (IR_REG_SPILLED(op3_reg)) {
6893			op3_reg = IR_REG_NUM(op3_reg);
6894			ir_emit_load(ctx, type, op3_reg, insn->op3);
6895		}
6896		ir_emit_store_mem_fp(ctx, type, mem, op3_reg);
6897	}
6898}
6899
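/* RLOAD reads a value that is already live in a fixed machine register
 * (insn->op2). For reserved/fixed registers only a spill store may be needed;
 * otherwise the value is copied into the allocated register, with op3
 * indicating that an up-to-date copy already exists in memory. */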
6900static void ir_emit_rload(ir_ctx *ctx, ir_ref def, ir_insn *insn)
6901{
6902	ir_reg src_reg = insn->op2;
6903	ir_type type = insn->type;
6904
6905	if (IR_REGSET_IN(IR_REGSET_UNION((ir_regset)ctx->fixed_regset, IR_REGSET_FIXED), src_reg)) {
6906		if (ctx->vregs[def]
6907		 && ctx->live_intervals[ctx->vregs[def]]
6908		 && ctx->live_intervals[ctx->vregs[def]]->stack_spill_pos != -1) {
6909			ir_emit_store(ctx, type, def, src_reg);
6910		}
6911	} else {
6912		ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
6913
6914		if (def_reg == IR_REG_NONE) {
6915			/* op3 is used as a flag that the value is already stored in memory.
6916			 * If op3 is set, we don't have to store the value again when spilling.
6917			 */
6918			if (!insn->op3 || !ir_is_same_spill_slot(ctx, def, IR_MEM_BO(ctx->spill_base, insn->op3))) {
6919				ir_emit_store(ctx, type, def, src_reg);
6920			}
6921		} else {
6922			if (src_reg != def_reg) {
6923				if (IR_IS_TYPE_INT(type)) {
6924					ir_emit_mov(ctx, type, def_reg, src_reg);
6925				} else {
6926					IR_ASSERT(IR_IS_TYPE_FP(type));
6927					ir_emit_fp_mov(ctx, type, def_reg, src_reg);
6928				}
6929			}
6930			if (IR_REG_SPILLED(ctx->regs[def][0])
6931			 && (!insn->op3 || !ir_is_same_spill_slot(ctx, def, IR_MEM_BO(ctx->spill_base, insn->op3)))) {
6932				ir_emit_store(ctx, type, def, def_reg);
6933			}
6934		}
6935	}
6936}
6937
6938static void ir_emit_rstore(ir_ctx *ctx, ir_ref ref, ir_insn *insn)
6939{
6940	ir_type type = ctx->ir_base[insn->op2].type;
6941	ir_reg op2_reg = ctx->regs[ref][2];
6942	ir_reg dst_reg = insn->op3;
6943
6944	if (!IR_IS_CONST_REF(insn->op2) && (ir_rule(ctx, insn->op2) & IR_FUSED)) {
6945		ir_emit_load_mem(ctx, type, dst_reg, ir_fuse_load(ctx, ref, insn->op2));
6946	} else if (op2_reg != IR_REG_NONE) {
6947		if (IR_REG_SPILLED(op2_reg)) {
6948			op2_reg = IR_REG_NUM(op2_reg);
6949			ir_emit_load(ctx, type, op2_reg, insn->op2);
6950		}
6951		if (op2_reg != dst_reg) {
6952			if (IR_IS_TYPE_INT(type)) {
6953				ir_emit_mov(ctx, type, dst_reg, op2_reg);
6954			} else {
6955				IR_ASSERT(IR_IS_TYPE_FP(type));
6956				ir_emit_fp_mov(ctx, type, dst_reg, op2_reg);
6957			}
6958		}
6959	} else {
6960		ir_emit_load(ctx, type, dst_reg, insn->op2);
6961	}
6962}
6963
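/* Dynamic stack allocation. A constant size is rounded up to 16 bytes when
 * the function contains calls (to keep the ABI stack alignment), 8 bytes
 * otherwise, and subtracted from RSP directly. A variable size requires the
 * frame pointer and is aligned with the add/and pair before the subtraction;
 * the result is the new stack pointer. */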
6964static void ir_emit_alloca(ir_ctx *ctx, ir_ref def, ir_insn *insn)
6965{
6966	ir_backend_data *data = ctx->data;
6967	dasm_State **Dst = &data->dasm_state;
6968	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
6969
6970	if (IR_IS_CONST_REF(insn->op2)) {
6971		ir_insn *val = &ctx->ir_base[insn->op2];
6972		int32_t size = val->val.i32;
6973
6974		IR_ASSERT(IR_IS_TYPE_INT(val->type));
6975		IR_ASSERT(!IR_IS_SYM_CONST(val->op));
6976		IR_ASSERT(IR_IS_TYPE_UNSIGNED(val->type) || val->val.i64 >= 0);
6977		IR_ASSERT(IR_IS_SIGNED_32BIT(val->val.i64));
6978
6979		if (ctx->flags2 & IR_HAS_CALLS) {
6980			/* Stack must be 16 byte aligned */
6981			size = IR_ALIGNED_SIZE(size, 16);
6982		} else {
6983			size = IR_ALIGNED_SIZE(size, 8);
6984		}
6985		|	ASM_REG_IMM_OP sub, IR_ADDR, IR_REG_RSP, size
6986		if (!(ctx->flags & IR_USE_FRAME_POINTER)) {
6987			ctx->call_stack_size += size;
6988		}
6989	} else {
6990		int32_t alignment = (ctx->flags2 & IR_HAS_CALLS) ? 16 : 8;
6991		ir_reg op2_reg = ctx->regs[def][2];
6992		ir_type type = ctx->ir_base[insn->op2].type;
6993
6994		IR_ASSERT(ctx->flags & IR_FUNCTION);
6995		IR_ASSERT(ctx->flags & IR_USE_FRAME_POINTER);
6996		IR_ASSERT(def_reg != IR_REG_NONE);
6997		if (op2_reg != IR_REG_NONE && IR_REG_SPILLED(op2_reg)) {
6998			op2_reg = IR_REG_NUM(op2_reg);
6999			ir_emit_load(ctx, type, op2_reg, insn->op2);
7000		}
7001		if (def_reg != op2_reg) {
7002			if (op2_reg != IR_REG_NONE) {
7003				ir_emit_mov(ctx, type, def_reg, op2_reg);
7004			} else {
7005				ir_emit_load(ctx, type, def_reg, insn->op2);
7006			}
7007		}
7008
7009		|	ASM_REG_IMM_OP add, IR_ADDR, def_reg, (alignment-1)
7010		|	ASM_REG_IMM_OP and, IR_ADDR, def_reg, ~(alignment-1)
7011		|	ASM_REG_REG_OP sub, IR_ADDR, IR_REG_RSP, def_reg
7012	}
7013	if (def_reg != IR_REG_NONE) {
7014		|	mov Ra(def_reg), Ra(IR_REG_RSP)
7015		if (IR_REG_SPILLED(ctx->regs[def][0])) {
7016			ir_emit_store(ctx, insn->type, def, def_reg);
7017		}
7018	} else {
7019		ir_emit_store(ctx, IR_ADDR, def, IR_REG_STACK_POINTER);
7020	}
7021}
7022
7023static void ir_emit_afree(ir_ctx *ctx, ir_ref def, ir_insn *insn)
7024{
7025	ir_backend_data *data = ctx->data;
7026	dasm_State **Dst = &data->dasm_state;
7027
7028	if (IR_IS_CONST_REF(insn->op2)) {
7029		ir_insn *val = &ctx->ir_base[insn->op2];
7030		int32_t size = val->val.i32;
7031
7032		IR_ASSERT(IR_IS_TYPE_INT(val->type));
7033		IR_ASSERT(!IR_IS_SYM_CONST(val->op));
7034		IR_ASSERT(IR_IS_TYPE_UNSIGNED(val->type) || val->val.i64 > 0);
7035		IR_ASSERT(IR_IS_SIGNED_32BIT(val->val.i64));
7036
7037		if (ctx->flags2 & IR_HAS_CALLS) {
7038		/* Stack must be 16-byte aligned */
7039			size = IR_ALIGNED_SIZE(size, 16);
7040		} else {
7041			size = IR_ALIGNED_SIZE(size, 8);
7042		}
7043		|	ASM_REG_IMM_OP add, IR_ADDR, IR_REG_RSP, size
7044		if (!(ctx->flags & IR_USE_FRAME_POINTER)) {
7045			ctx->call_stack_size -= size;
7046		}
7047	} else {
7048//		int32_t alignment = (ctx->flags2 & IR_HAS_CALLS) ? 16 : 8;
7049		ir_reg op2_reg = ctx->regs[def][2];
7050		ir_type type = ctx->ir_base[insn->op2].type;
7051
7052		IR_ASSERT(ctx->flags & IR_FUNCTION);
7053		if (op2_reg != IR_REG_NONE && IR_REG_SPILLED(op2_reg)) {
7054			op2_reg = IR_REG_NUM(op2_reg);
7055			ir_emit_load(ctx, type, op2_reg, insn->op2);
7056		}
7057
7058		// TODO: alignment ???
7059
7060		|	ASM_REG_REG_OP add, IR_ADDR, IR_REG_RSP, op2_reg
7061	}
7062}
7063
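/* FRAME_ADDR: produce the address of the current frame: the frame pointer
 * itself, or RSP plus the static frame and call-area sizes. */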
7064static void ir_emit_frame_addr(ir_ctx *ctx, ir_ref def)
7065{
7066	ir_backend_data *data = ctx->data;
7067	dasm_State **Dst = &data->dasm_state;
7068	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
7069
7070	if (ctx->flags & IR_USE_FRAME_POINTER) {
7071		|	mov Ra(def_reg), Ra(IR_REG_RBP)
7072	} else {
7073		|	lea Ra(def_reg), [Ra(IR_REG_RSP)+(ctx->stack_frame_size + ctx->call_stack_size)]
7074	}
7075	if (IR_REG_SPILLED(ctx->regs[def][0])) {
7076		ir_emit_store(ctx, IR_ADDR, def, def_reg);
7077	}
7078}
7079
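/* VA_START: initialize a va_list.  On Win64 and x86 a va_list is a single
 * pointer into the on-stack argument area.  On System V x86_64 it is a
 * structure; the offsetof() references below assume the standard SysV AMD64
 * layout, sketched here for orientation (the authoritative ir_va_list
 * definition lives elsewhere in IR):
 *
 *     typedef struct _ir_va_list {
 *         uint32_t gp_offset;          // offset of the next GP arg in reg_save_area
 *         uint32_t fp_offset;          // offset of the next FP arg in reg_save_area
 *         void    *overflow_arg_area;  // next stack-passed argument
 *         void    *reg_save_area;      // spilled GP/XMM argument registers
 *     } ir_va_list;
 */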
7080static void ir_emit_va_start(ir_ctx *ctx, ir_ref def, ir_insn *insn)
7081{
7082#if defined(_WIN64) || defined(IR_TARGET_X86)
7083	ir_backend_data *data = ctx->data;
7084	dasm_State **Dst = &data->dasm_state;
7085	ir_reg fp;
7086	int arg_area_offset;
7087	ir_reg op2_reg = ctx->regs[def][2];
7088	ir_reg tmp_reg = ctx->regs[def][3];
7089
7090	IR_ASSERT(tmp_reg != IR_REG_NONE);
7091	if (op2_reg != IR_REG_NONE && IR_REG_SPILLED(op2_reg)) {
7092		op2_reg = IR_REG_NUM(op2_reg);
7093		ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2);
7094	}
7095
7096	if (ctx->flags & IR_USE_FRAME_POINTER) {
7097		fp = IR_REG_FRAME_POINTER;
7098		arg_area_offset = sizeof(void*) * 2 + ctx->param_stack_size;
7099	} else {
7100		fp = IR_REG_STACK_POINTER;
7101		arg_area_offset = ctx->stack_frame_size + ctx->call_stack_size + sizeof(void*) + ctx->param_stack_size;
7102	}
7103	|	lea Ra(tmp_reg), aword [Ra(fp)+arg_area_offset]
7104	if (op2_reg != IR_REG_NONE) {
7105		|	mov aword [Ra(op2_reg)], Ra(tmp_reg)
7106	} else {
7107		int32_t offset = ir_ref_spill_slot_offset(ctx, insn->op2, &op2_reg);
7108
7109		|	mov aword [Ra(op2_reg)+offset], Ra(tmp_reg)
7110	}
7111#elif defined(IR_TARGET_X64)
7112|.if X64
7113	ir_backend_data *data = ctx->data;
7114	dasm_State **Dst = &data->dasm_state;
7115	ir_reg fp;
7116	int reg_save_area_offset;
7117	int overflow_arg_area_offset;
7118	ir_reg op2_reg = ctx->regs[def][2];
7119	ir_reg tmp_reg = ctx->regs[def][3];
7120	bool have_reg_save_area = 0;
7121
7122	IR_ASSERT(op2_reg != IR_REG_NONE && tmp_reg != IR_REG_NONE);
7123	if (IR_REG_SPILLED(op2_reg)) {
7124		op2_reg = IR_REG_NUM(op2_reg);
7125		ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2);
7126	}
7127
7128	if (ctx->flags & IR_USE_FRAME_POINTER) {
7129		fp = IR_REG_FRAME_POINTER;
7130		reg_save_area_offset = -(ctx->stack_frame_size - ctx->stack_frame_alignment - ctx->locals_area_size);
7131		overflow_arg_area_offset = sizeof(void*) * 2 + ctx->param_stack_size;
7132	} else {
7133		fp = IR_REG_STACK_POINTER;
7134		reg_save_area_offset = ctx->locals_area_size + ctx->call_stack_size;
7135		overflow_arg_area_offset = ctx->stack_frame_size + ctx->call_stack_size + sizeof(void*) + ctx->param_stack_size;
7136	}
7137
7138	if ((ctx->flags2 & (IR_HAS_VA_ARG_GP|IR_HAS_VA_COPY)) && ctx->gp_reg_params < IR_REG_INT_ARGS) {
7139		|	lea Ra(tmp_reg), aword [Ra(fp)+reg_save_area_offset]
7140		have_reg_save_area = 1;
7141		/* Set va_list.gp_offset */
7142		|	mov dword [Ra(op2_reg)+offsetof(ir_va_list, gp_offset)], sizeof(void*) * ctx->gp_reg_params
7143	} else {
7144		reg_save_area_offset -= sizeof(void*) * IR_REG_INT_ARGS;
7145		/* Set va_list.gp_offset */
7146		|	mov dword [Ra(op2_reg)+offsetof(ir_va_list, gp_offset)], sizeof(void*) * IR_REG_INT_ARGS
7147	}
7148	if ((ctx->flags2 & (IR_HAS_VA_ARG_FP|IR_HAS_VA_COPY)) && ctx->fp_reg_params < IR_REG_FP_ARGS) {
7149		if (!have_reg_save_area) {
7150			|	lea Ra(tmp_reg), aword [Ra(fp)+reg_save_area_offset]
7151			have_reg_save_area = 1;
7152		}
7153		/* Set va_list.fp_offset */
7154		|	mov dword [Ra(op2_reg)+offsetof(ir_va_list, fp_offset)], sizeof(void*) * IR_REG_INT_ARGS + 16 * ctx->fp_reg_params
7155	} else {
7156		/* Set va_list.fp_offset */
7157		|	mov dword [Ra(op2_reg)+offsetof(ir_va_list, fp_offset)], sizeof(void*) * IR_REG_INT_ARGS + 16 * IR_REG_FP_ARGS
7158	}
7159	if (have_reg_save_area) {
7160		/* Set va_list.reg_save_area */
7161		|	mov qword [Ra(op2_reg)+offsetof(ir_va_list, reg_save_area)], Ra(tmp_reg)
7162	}
7163	|	lea Ra(tmp_reg), aword [Ra(fp)+overflow_arg_area_offset]
7164	/* Set va_list.overflow_arg_area */
7165	|	mov qword [Ra(op2_reg)+offsetof(ir_va_list, overflow_arg_area)], Ra(tmp_reg)
7166|.endif
7167#else
7168	IR_ASSERT(0 && "NIY va_start");
7169#endif
7170}
7171
7172static void ir_emit_va_copy(ir_ctx *ctx, ir_ref def, ir_insn *insn)
7173{
7174	IR_ASSERT(0 && "NIY va_copy");
7175}
7176
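/* VA_ARG: fetch the next variadic argument.  On System V x86_64 this emits the
 * usual two-way split: take the value from the register save area while
 * gp_offset/fp_offset is still in range, otherwise from overflow_arg_area,
 * advancing the corresponding cursor in either case. */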
7177static void ir_emit_va_arg(ir_ctx *ctx, ir_ref def, ir_insn *insn)
7178{
7179#if defined(_WIN64) || defined(IR_TARGET_X86)
7180	ir_backend_data *data = ctx->data;
7181	dasm_State **Dst = &data->dasm_state;
7182	ir_type type = insn->type;
7183	ir_reg def_reg = ctx->regs[def][0];
7184	ir_reg op2_reg = ctx->regs[def][2];
7185	ir_reg tmp_reg = ctx->regs[def][3];
7186
7187	IR_ASSERT(def_reg != IR_REG_NONE && tmp_reg != IR_REG_NONE);
7188	if (op2_reg != IR_REG_NONE && IR_REG_SPILLED(op2_reg)) {
7189		op2_reg = IR_REG_NUM(op2_reg);
7190		ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2);
7191	}
7192	|	mov Ra(tmp_reg), aword [Ra(op2_reg)]
7193	ir_emit_load_mem(ctx, type, def_reg, IR_MEM_B(tmp_reg));
7194	|	add Ra(tmp_reg), IR_MAX(ir_type_size[type], sizeof(void*))
7195	if (op2_reg != IR_REG_NONE) {
7196		|	mov aword [Ra(op2_reg)], Ra(tmp_reg)
7197	} else {
7198		int32_t offset = ir_ref_spill_slot_offset(ctx, insn->op2, &op2_reg);
7199
7200		|	mov aword [Ra(op2_reg)+offset], Ra(tmp_reg)
7201	}
7202	if (IR_REG_SPILLED(ctx->regs[def][0])) {
7203		ir_emit_store(ctx, type, def, def_reg);
7204	}
7205#elif defined(IR_TARGET_X64)
7206|.if X64
7207	ir_backend_data *data = ctx->data;
7208	dasm_State **Dst = &data->dasm_state;
7209	ir_type type = insn->type;
7210	ir_reg def_reg = ctx->regs[def][0];
7211	ir_reg op2_reg = ctx->regs[def][2];
7212	ir_reg tmp_reg = ctx->regs[def][3];
7213
7214	IR_ASSERT(def_reg != IR_REG_NONE && op2_reg != IR_REG_NONE && tmp_reg != IR_REG_NONE);
7215	if (IR_REG_SPILLED(op2_reg)) {
7216		op2_reg = IR_REG_NUM(op2_reg);
7217		ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2);
7218	}
7219	if (IR_IS_TYPE_INT(type)) {
7220		|	mov Rd(tmp_reg), dword [Ra(op2_reg)+offsetof(ir_va_list, gp_offset)]
7221		|	cmp Rd(tmp_reg), sizeof(void*)*IR_REG_INT_ARGS
7222		|	jge >1
7223		|	add Rd(tmp_reg), sizeof(void*)
7224		|	mov dword [Ra(op2_reg)+offsetof(ir_va_list, gp_offset)], Rd(tmp_reg)
7225		|	add Ra(tmp_reg), aword [Ra(op2_reg)+offsetof(ir_va_list, reg_save_area)]
7226		|	jmp >2
7227		|1:
7228		|	mov Ra(tmp_reg), aword [Ra(op2_reg)+offsetof(ir_va_list, overflow_arg_area)]
7229		|	add Ra(tmp_reg), sizeof(void*)
7230		|	mov aword [Ra(op2_reg)+offsetof(ir_va_list, overflow_arg_area)], Ra(tmp_reg)
7231		|2:
7232		|	mov Ra(def_reg), aword [Ra(tmp_reg)-sizeof(void*)]
7233	} else {
7234		|	mov Rd(tmp_reg), dword [Ra(op2_reg)+offsetof(ir_va_list, fp_offset)]
7235		|	cmp Rd(tmp_reg), sizeof(void*) * IR_REG_INT_ARGS + 16 * IR_REG_FP_ARGS
7236		|	jge >1
7237		|	add Rd(tmp_reg), 16
7238		|	mov dword [Ra(op2_reg)+offsetof(ir_va_list, fp_offset)], Rd(tmp_reg)
7239		|	add Ra(tmp_reg), aword [Ra(op2_reg)+offsetof(ir_va_list, reg_save_area)]
7240		ir_emit_load_mem_fp(ctx, type, def_reg, IR_MEM_BO(tmp_reg, -16));
7241		|	jmp >2
7242		|1:
7243		|	mov Ra(tmp_reg), aword [Ra(op2_reg)+offsetof(ir_va_list, overflow_arg_area)]
7244		ir_emit_load_mem_fp(ctx, type, def_reg, IR_MEM_BO(tmp_reg, 0));
7245		|	add Ra(tmp_reg), 8
7246		|	mov aword [Ra(op2_reg)+offsetof(ir_va_list, overflow_arg_area)], Ra(tmp_reg)
7247		|2:
7248	}
7249	if (IR_REG_SPILLED(ctx->regs[def][0])) {
7250		ir_emit_store(ctx, type, def, def_reg);
7251	}
7252|.endif
7253#else
7254	IR_ASSERT(0 && "NIY va_arg");
7255#endif
7256}
7257
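/* SWITCH: scan the CASE_VAL successors to find the [min, max] value range,
 * then emit either a jump table or a linear compare-and-branch chain,
 * depending on how densely the range is populated. */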
7258static void ir_emit_switch(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn)
7259{
7260	ir_backend_data *data = ctx->data;
7261	dasm_State **Dst = &data->dasm_state;
7262	ir_type type;
7263	ir_block *bb;
7264	ir_insn *use_insn, *val;
7265	uint32_t n, *p, use_block;
7266	int i;
7267	int label, default_label = 0;
7268	int count = 0;
7269	ir_val min, max;
7270	int64_t offset;
7271	ir_reg op2_reg = ctx->regs[def][2];
7272|.if X64
7273||	ir_reg tmp_reg = ctx->regs[def][3];
7274|.endif
7275
7276	type = ctx->ir_base[insn->op2].type;
7277	if (IR_IS_TYPE_SIGNED(type)) {
7278		min.u64 = 0x7fffffffffffffff;
7279		max.u64 = 0x8000000000000000;
7280	} else {
7281		min.u64 = 0xffffffffffffffff;
7282		max.u64 = 0x0;
7283	}
7284
7285	bb = &ctx->cfg_blocks[b];
7286	p = &ctx->cfg_edges[bb->successors];
7287	for (n = bb->successors_count; n != 0; p++, n--) {
7288		use_block = *p;
7289		use_insn = &ctx->ir_base[ctx->cfg_blocks[use_block].start];
7290		if (use_insn->op == IR_CASE_VAL) {
7291			val = &ctx->ir_base[use_insn->op2];
7292			IR_ASSERT(!IR_IS_SYM_CONST(val->op));
7293			if (IR_IS_TYPE_SIGNED(type)) {
7294				IR_ASSERT(IR_IS_TYPE_SIGNED(val->type));
7295				min.i64 = IR_MIN(min.i64, val->val.i64);
7296				max.i64 = IR_MAX(max.i64, val->val.i64);
7297			} else {
7298				IR_ASSERT(!IR_IS_TYPE_SIGNED(val->type));
7299				min.u64 = (int64_t)IR_MIN(min.u64, val->val.u64);
7300				max.u64 = (int64_t)IR_MAX(max.u64, val->val.u64);
7301			}
7302			count++;
7303		} else {
7304			IR_ASSERT(use_insn->op == IR_CASE_DEFAULT);
7305			default_label = ir_skip_empty_target_blocks(ctx, use_block);
7306		}
7307	}
7308
7309	IR_ASSERT(op2_reg != IR_REG_NONE);
7310|.if X64
7311||	IR_ASSERT(tmp_reg != IR_REG_NONE || sizeof(void*) != 8);
7312|.endif
7313	if (IR_REG_SPILLED(op2_reg)) {
7314		op2_reg = IR_REG_NUM(op2_reg);
7315		ir_emit_load(ctx, type, op2_reg, insn->op2);
7316	} else if (IR_IS_CONST_REF(insn->op2)) {
7317		ir_emit_load(ctx, type, op2_reg, insn->op2);
7318	}
7319
7320	/* Generate a table jmp when the case values are dense enough (range < count * 8), or a sequence of compare-and-branch instructions otherwise */
7321	if ((max.i64-min.i64) < count * 8) {
7322		int *labels = ir_mem_malloc(sizeof(int) * (size_t)(max.i64 - min.i64 + 1));
7323
7324		for (i = 0; i <= (max.i64 - min.i64); i++) {
7325			labels[i] = default_label;
7326		}
7327		p = &ctx->cfg_edges[bb->successors];
7328		for (n = bb->successors_count; n != 0; p++, n--) {
7329			use_block = *p;
7330			use_insn = &ctx->ir_base[ctx->cfg_blocks[use_block].start];
7331			if (use_insn->op == IR_CASE_VAL) {
7332				val = &ctx->ir_base[use_insn->op2];
7333				IR_ASSERT(!IR_IS_SYM_CONST(val->op));
7334				label = ir_skip_empty_target_blocks(ctx, use_block);
7335				labels[val->val.i64 - min.i64] = label;
7336			}
7337		}
7338
7339		if (IR_IS_32BIT(type, max)) {
7340			|	ASM_REG_IMM_OP cmp, type, op2_reg, max.i32
7341		} else {
7342			IR_ASSERT(ir_type_size[type] == 8);
7343			IR_ASSERT(sizeof(void*) == 8);
7344|.if X64
7345			|	mov64 Rq(tmp_reg), max.i64
7346			|	cmp Rq(op2_reg), Rq(tmp_reg)
7347|.endif
7348		}
7349		if (IR_IS_TYPE_SIGNED(type)) {
7350			|	jg =>default_label
7351		} else {
7352			|	ja =>default_label
7353		}
7354
7355		if (IR_IS_32BIT(type, min)) {
7356			offset = -min.i64 * sizeof(void*);
7357			if (IR_IS_SIGNED_32BIT(offset)) {
7358				|	ASM_REG_IMM_OP cmp, type, op2_reg, min.i32
7359			} else {
7360				|	ASM_REG_IMM_OP sub, type, op2_reg, min.i32 // TODO: reg clobbering
7361				offset = 0;
7362			}
7363		} else {
7364			IR_ASSERT(sizeof(void*) == 8);
7365|.if X64
7366			|	mov64 Rq(tmp_reg), min.i64
7367			|	ASM_REG_REG_OP sub, type, op2_reg, tmp_reg // TODO: reg clobbering
7368			offset = 0;
7369|.endif
7370		}
7371		if (IR_IS_TYPE_SIGNED(type)) {
7372			|	jl =>default_label
7373		} else {
7374			|	jb =>default_label
7375		}
7376		if (sizeof(void*) == 8) {
7377|.if X64
7378			switch (ir_type_size[type]) {
7379				default:
7380					IR_ASSERT(0);
7381				case 1:
7382					if (IR_IS_TYPE_SIGNED(type)) {
7383						|	movsx Ra(op2_reg), Rb(op2_reg)
7384					} else {
7385						|	movzx Ra(op2_reg), Rb(op2_reg)
7386					}
7387					break;
7388				case 2:
7389					if (IR_IS_TYPE_SIGNED(type)) {
7390						|	movsx Ra(op2_reg), Rw(op2_reg)
7391					} else {
7392						|	movzx Ra(op2_reg), Rw(op2_reg)
7393					}
7394					break;
7395				case 4:
7396					if (IR_IS_TYPE_SIGNED(type)) {
7397						|	movsxd Ra(op2_reg), Rd(op2_reg)
7398					} else {
7399						|	mov Rd(op2_reg), Rd(op2_reg)
7400					}
7401					break;
7402				case 8:
7403					break;
7404			}
7405			|	lea Ra(tmp_reg), aword [>1]
7406			|	jmp aword [Ra(tmp_reg)+Ra(op2_reg)*8+(int32_t)offset]
7407|.endif
7408		} else {
7409|.if not X64
7410			switch (ir_type_size[type]) {
7411				default:
7412					IR_ASSERT(0 && "Unsupported type size");
7413				case 1:
7414					if (IR_IS_TYPE_SIGNED(type)) {
7415						|	movsx Ra(op2_reg), Rb(op2_reg)
7416					} else {
7417						|	movzx Ra(op2_reg), Rb(op2_reg)
7418					}
7419					break;
7420				case 2:
7421					if (IR_IS_TYPE_SIGNED(type)) {
7422						|	movsx Ra(op2_reg), Rw(op2_reg)
7423					} else {
7424						|	movzx Ra(op2_reg), Rw(op2_reg)
7425					}
7426					break;
7427				case 4:
7428					break;
7429			}
7430			|//	jmp aword [Ra(op2_reg)*4+(int32_t)offset+>1]
7431			|	lea Ra(op2_reg), aword [Ra(op2_reg)*4+(int32_t)offset] // TODO: reg clobbering
7432			|	jmp aword [Ra(op2_reg)+>1]
7433|.endif
7434		}
7435		|.jmp_table
7436		if (!data->jmp_table_label) {
7437			data->jmp_table_label = ctx->cfg_blocks_count + ctx->consts_count + 3;
7438			|=>data->jmp_table_label:
7439		}
7440		|.align aword
7441		|1:
7442		for (i = 0; i <= (max.i64 - min.i64); i++) {
7443			int b = labels[i];
7444			ir_block *bb = &ctx->cfg_blocks[b];
7445			ir_insn *insn = &ctx->ir_base[bb->end];
7446
7447			if (insn->op == IR_IJMP && IR_IS_CONST_REF(insn->op2)) {
7448				ir_ref prev = ctx->prev_ref[bb->end];
7449				if (prev != bb->start && ctx->ir_base[prev].op == IR_SNAPSHOT) {
7450					prev = ctx->prev_ref[prev];
7451				}
7452				if (prev == bb->start) {
7453					void *addr = ir_jmp_addr(ctx, insn, &ctx->ir_base[insn->op2]);
7454
7455					|	.aword &addr
7456					if (ctx->ir_base[bb->start].op != IR_CASE_DEFAULT) {
7457						bb->flags |= IR_BB_EMPTY;
7458					}
7459					continue;
7460				}
7461			}
7462			|	.aword =>b
7463		}
7464		|.code
7465		ir_mem_free(labels);
7466	} else {
7467		p = &ctx->cfg_edges[bb->successors];
7468		for (n = bb->successors_count; n != 0; p++, n--) {
7469			use_block = *p;
7470			use_insn = &ctx->ir_base[ctx->cfg_blocks[use_block].start];
7471			if (use_insn->op == IR_CASE_VAL) {
7472				val = &ctx->ir_base[use_insn->op2];
7473				IR_ASSERT(!IR_IS_SYM_CONST(val->op));
7474				label = ir_skip_empty_target_blocks(ctx, use_block);
7475				if (IR_IS_32BIT(type, val->val)) {
7476					|	ASM_REG_IMM_OP cmp, type, op2_reg, val->val.i32
7477				} else {
7478					IR_ASSERT(sizeof(void*) == 8);
7479|.if X64
7480					|	mov64 Ra(tmp_reg), val->val.i64
7481					|	ASM_REG_REG_OP cmp, type, op2_reg, tmp_reg
7482|.endif
7483				}
7484				|	je =>label
7485			}
7486		}
7487		if (default_label) {
7488			|	jmp =>default_label
7489		}
7490	}
7491}
7492
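/* Compute how many stack bytes a call needs for arguments that do not fit in
 * parameter registers, plus the Win64 shadow area when applicable. */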
7493static int32_t ir_call_used_stack(ir_ctx *ctx, ir_insn *insn)
7494{
7495	int j, n;
7496	ir_type type;
7497	int int_param = 0;
7498	int fp_param = 0;
7499	int int_reg_params_count = IR_REG_INT_ARGS;
7500	int fp_reg_params_count = IR_REG_FP_ARGS;
7501	int32_t used_stack = 0;
7502
7503#ifdef IR_HAVE_FASTCALL
7504	if (sizeof(void*) == 4 && ir_is_fastcall(ctx, insn)) {
7505		int_reg_params_count = IR_REG_INT_FCARGS;
7506		fp_reg_params_count = IR_REG_FP_FCARGS;
7507	}
7508#endif
7509
7510	n = insn->inputs_count;
7511	for (j = 3; j <= n; j++) {
7512		type = ctx->ir_base[ir_insn_op(insn, j)].type;
7513		if (IR_IS_TYPE_INT(type)) {
7514			if (int_param >= int_reg_params_count) {
7515				used_stack += IR_MAX(sizeof(void*), ir_type_size[type]);
7516			}
7517			int_param++;
7518#ifdef _WIN64
7519			/* The WIN64 calling convention uses a common counter for int and fp registers */
7520			fp_param++;
7521#endif
7522		} else {
7523			IR_ASSERT(IR_IS_TYPE_FP(type));
7524			if (fp_param >= fp_reg_params_count) {
7525				used_stack += IR_MAX(sizeof(void*), ir_type_size[type]);
7526			}
7527			fp_param++;
7528#ifdef _WIN64
7529			/* The WIN64 calling convention uses a common counter for int and fp registers */
7530			int_param++;
7531#endif
7532		}
7533	}
7534
7535	/* Reserve "home space" (a.k.a. "shadow store") for register arguments, as required by the Windows x64 ABI */
7536	used_stack += IR_SHADOW_ARGS;
7537
7538	return used_stack;
7539}
7540
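/* Marshal the call arguments in three passes so that nothing is clobbered:
 * (1) store register-resident values destined for the stack and record the
 * required REG->REG moves, (2) execute those moves as one parallel copy,
 * (3) load the remaining constants and spilled values.  Returns the amount of
 * stack used, so the caller can release it after the call. */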
7541static int32_t ir_emit_arguments(ir_ctx *ctx, ir_ref def, ir_insn *insn, ir_reg tmp_reg)
7542{
7543	ir_backend_data *data = ctx->data;
7544	dasm_State **Dst = &data->dasm_state;
7545	int j, n;
7546	ir_ref arg;
7547	ir_insn *arg_insn;
7548	uint8_t type;
7549	ir_reg src_reg, dst_reg;
7550	int int_param = 0;
7551	int fp_param = 0;
7552	int count = 0;
7553	int int_reg_params_count = IR_REG_INT_ARGS;
7554	int fp_reg_params_count = IR_REG_FP_ARGS;
7555	const int8_t *int_reg_params = _ir_int_reg_params;
7556	const int8_t *fp_reg_params = _ir_fp_reg_params;
7557	int32_t used_stack, stack_offset = IR_SHADOW_ARGS;
7558	ir_copy *copies;
7559	bool do_pass3 = 0;
7560	/* For temporaries we may use any scratch registers except for registers used for parameters */
7561	ir_reg tmp_fp_reg = IR_REG_FP_LAST; /* Temporary register for FP loads and swap */
7562
7563	n = insn->inputs_count;
7564	if (n < 3) {
7565		return 0;
7566	}
7567
7568	if (tmp_reg == IR_REG_NONE) {
7569		tmp_reg = IR_REG_RAX;
7570	}
7571
7572#ifdef IR_HAVE_FASTCALL
7573	if (sizeof(void*) == 4 && ir_is_fastcall(ctx, insn)) {
7574		int_reg_params_count = IR_REG_INT_FCARGS;
7575		fp_reg_params_count = IR_REG_FP_FCARGS;
7576		int_reg_params = _ir_int_fc_reg_params;
7577		fp_reg_params = _ir_fp_fc_reg_params;
7578	}
7579#endif
7580
7581	if (insn->op == IR_CALL
7582	 && (ctx->flags & IR_PREALLOCATED_STACK)
7583#ifdef IR_HAVE_FASTCALL
7584	 && !ir_is_fastcall(ctx, insn) /* fast call functions restore stack pointer */
7585#endif
7586	) {
7587		// TODO: support for preallocated stack
7588		used_stack = 0;
7589	} else {
7590		used_stack = ir_call_used_stack(ctx, insn);
7591		if (IR_SHADOW_ARGS
7592		 && insn->op == IR_TAILCALL
7593		 && used_stack == IR_SHADOW_ARGS) {
7594			used_stack = 0;
7595		}
7596		if (ctx->fixed_call_stack_size
7597		 && used_stack <= ctx->fixed_call_stack_size
7598#ifdef IR_HAVE_FASTCALL
7599		 && !ir_is_fastcall(ctx, insn) /* fast call functions restore stack pointer */
7600#endif
7601		) {
7602			used_stack = 0;
7603		} else {
7604			/* Stack must be 16-byte aligned */
7605			int32_t aligned_stack = IR_ALIGNED_SIZE(used_stack, 16);
7606			ctx->call_stack_size += aligned_stack;
7607			if (aligned_stack) {
7608				|	sub Ra(IR_REG_RSP), aligned_stack
7609			}
7610		}
7611	}
7612
7613	/* 1. Move register-resident arguments that must be passed on the stack,
7614	 *    and collect the REG->REG moves for arguments passed in registers */
7615	copies = ir_mem_malloc((n - 2) * sizeof(ir_copy));
7616	for (j = 3; j <= n; j++) {
7617		arg = ir_insn_op(insn, j);
7618		src_reg = ir_get_alocated_reg(ctx, def, j);
7619		arg_insn = &ctx->ir_base[arg];
7620		type = arg_insn->type;
7621		if (IR_IS_TYPE_INT(type)) {
7622			if (int_param < int_reg_params_count) {
7623				dst_reg = int_reg_params[int_param];
7624			} else {
7625				dst_reg = IR_REG_NONE; /* pass argument through stack */
7626			}
7627			int_param++;
7628#ifdef _WIN64
7629			/* The WIN64 calling convention uses a common counter for int and fp registers */
7630			fp_param++;
7631#endif
7632		} else {
7633			IR_ASSERT(IR_IS_TYPE_FP(type));
7634			if (fp_param < fp_reg_params_count) {
7635				dst_reg = fp_reg_params[fp_param];
7636			} else {
7637				dst_reg = IR_REG_NONE; /* pass argument through stack */
7638			}
7639			fp_param++;
7640#ifdef _WIN64
7641			/* The WIN64 calling convention uses a common counter for int and fp registers */
7642			int_param++;
7643#endif
7644		}
7645		if (dst_reg != IR_REG_NONE) {
7646			if (IR_IS_CONST_REF(arg) || src_reg == IR_REG_NONE) {
7647				/* delay CONST->REG and MEM->REG moves to third pass */
7648				do_pass3 = 1;
7649			} else {
7650				if (IR_REG_SPILLED(src_reg)) {
7651					src_reg = IR_REG_NUM(src_reg);
7652					ir_emit_load(ctx, type, src_reg, arg);
7653				}
7654				if (src_reg != dst_reg) {
7655					/* delay REG->REG moves to second pass */
7656					copies[count].type = type;
7657					copies[count].from = src_reg;
7658					copies[count].to = dst_reg;
7659					count++;
7660				}
7661			}
7662		} else {
7663			/* Pass register arguments to stack (REG->MEM moves) */
7664			if (!IR_IS_CONST_REF(arg) && src_reg != IR_REG_NONE && !IR_REG_SPILLED(src_reg)) {
7665				ir_emit_store_mem(ctx, type, IR_MEM_BO(IR_REG_STACK_POINTER, stack_offset), src_reg);
7666			} else {
7667				do_pass3 = 1;
7668			}
7669			stack_offset += IR_MAX(sizeof(void*), ir_type_size[type]);
7670		}
7671	}
7672
7673	/* 2. move all arguments that should be passed from one register to another (REG->REG movs) */
7674	if (count) {
7675		ir_parallel_copy(ctx, copies, count, tmp_reg, tmp_fp_reg);
7676	}
7677	ir_mem_free(copies);
7678
7679	/* 3. move the remaining memory and immediate values */
7680	if (do_pass3) {
7681		stack_offset = IR_SHADOW_ARGS;
7682		int_param = 0;
7683		fp_param = 0;
7684		for (j = 3; j <= n; j++) {
7685			arg = ir_insn_op(insn, j);
7686			src_reg = ir_get_alocated_reg(ctx, def, j);
7687			arg_insn = &ctx->ir_base[arg];
7688			type = arg_insn->type;
7689			if (IR_IS_TYPE_INT(type)) {
7690				if (int_param < int_reg_params_count) {
7691					dst_reg = int_reg_params[int_param];
7692				} else {
7693					dst_reg = IR_REG_NONE; /* argument already passed through stack */
7694				}
7695				int_param++;
7696#ifdef _WIN64
7697				/* The WIN64 calling convention uses a common counter for int and fp registers */
7698				fp_param++;
7699#endif
7700			} else {
7701				IR_ASSERT(IR_IS_TYPE_FP(type));
7702				if (fp_param < fp_reg_params_count) {
7703					dst_reg = fp_reg_params[fp_param];
7704				} else {
7705					dst_reg = IR_REG_NONE; /* argument already passed through stack */
7706				}
7707				fp_param++;
7708#ifdef _WIN64
7709				/* The WIN64 calling convention uses a common counter for int and fp registers */
7710				int_param++;
7711#endif
7712			}
7713			if (dst_reg != IR_REG_NONE) {
7714				if (IR_IS_CONST_REF(arg) || src_reg == IR_REG_NONE) {
7715					if (IR_IS_TYPE_INT(type)) {
7716						if (IR_IS_CONST_REF(arg)) {
7717							if (type == IR_I8 || type == IR_I16) {
7718								type = IR_I32;
7719							} else if (type == IR_U8 || type == IR_U16) {
7720								type = IR_U32;
7721							}
7722							ir_emit_load(ctx, type, dst_reg, arg);
7723						} else {
7724							ir_mem mem = ir_ref_spill_slot(ctx, arg);
7725
7726							if (ir_type_size[type] > 2) {
7727								ir_emit_load_mem_int(ctx, type, dst_reg, mem);
7728							} else if (ir_type_size[type] == 2) {
7729								if (type == IR_I16) {
7730									|	ASM_TXT_TMEM_OP movsx, Rd(dst_reg), word, mem
7731								} else {
7732									|	ASM_TXT_TMEM_OP movzx, Rd(dst_reg), word, mem
7733								}
7734							} else {
7735								IR_ASSERT(ir_type_size[type] == 1);
7736								if (type == IR_I8) {
7737									|	ASM_TXT_TMEM_OP movsx, Rd(dst_reg), byte, mem
7738								} else {
7739									|	ASM_TXT_TMEM_OP movzx, Rd(dst_reg), byte, mem
7740								}
7741							}
7742						}
7743					} else {
7744						ir_emit_load(ctx, type, dst_reg, arg);
7745					}
7746				}
7747			} else {
7748				ir_mem mem = IR_MEM_BO(IR_REG_STACK_POINTER, stack_offset);
7749
7750				if (IR_IS_TYPE_INT(type)) {
7751					if (IR_IS_CONST_REF(arg)) {
7752						ir_emit_store_mem_int_const(ctx, type, mem, arg, tmp_reg, 1);
7753					} else if (src_reg == IR_REG_NONE) {
7754						IR_ASSERT(tmp_reg != IR_REG_NONE);
7755						ir_emit_load(ctx, type, tmp_reg, arg);
7756						ir_emit_store_mem_int(ctx, type, mem, tmp_reg);
7757					} else if (IR_REG_SPILLED(src_reg)) {
7758						src_reg = IR_REG_NUM(src_reg);
7759						ir_emit_load(ctx, type, src_reg, arg);
7760						ir_emit_store_mem_int(ctx, type, mem, src_reg);
7761					}
7762				} else {
7763					if (IR_IS_CONST_REF(arg)) {
7764						ir_emit_store_mem_fp_const(ctx, type, mem, arg, tmp_reg, tmp_fp_reg);
7765					} else if (src_reg == IR_REG_NONE) {
7766						IR_ASSERT(tmp_fp_reg != IR_REG_NONE);
7767						ir_emit_load(ctx, type, tmp_fp_reg, arg);
7768						ir_emit_store_mem_fp(ctx, IR_DOUBLE, mem, tmp_fp_reg);
7769					} else if (IR_REG_SPILLED(src_reg)) {
7770						src_reg = IR_REG_NUM(src_reg);
7771						ir_emit_load(ctx, type, src_reg, arg);
7772						ir_emit_store_mem_fp(ctx, type, mem, src_reg);
7773					}
7774				}
7775				stack_offset += IR_MAX(sizeof(void*), ir_type_size[type]);
7776			}
7777		}
7778	}
7779
7780#ifdef _WIN64
7781	/* The WIN64 calling convention requires duplicating parameters passed in FP registers into the corresponding GP registers */
7782	if (ir_is_vararg(ctx, insn)) {
7783		n = IR_MIN(n, IR_MAX_REG_ARGS + 2);
7784		for (j = 3; j <= n; j++) {
7785			arg = ir_insn_op(insn, j);
7786			arg_insn = &ctx->ir_base[arg];
7787			type = arg_insn->type;
7788			if (IR_IS_TYPE_FP(type)) {
7789				src_reg = fp_reg_params[j-3];
7790				dst_reg = int_reg_params[j-3];
7791|.if X64
7792				if (ctx->mflags & IR_X86_AVX) {
7793					|	vmovd Rq(dst_reg), xmm(src_reg-IR_REG_FP_FIRST)
7794				} else {
7795					|	movd Rq(dst_reg), xmm(src_reg-IR_REG_FP_FIRST)
7796				}
7797|.endif
7798			}
7799		}
7800	}
7801#endif
7802#ifdef IR_REG_VARARG_FP_REGS
7803	/* set hidden argument to specify the number of vector registers used */
7804	if (ir_is_vararg(ctx, insn)) {
7805		fp_param = IR_MIN(fp_param, fp_reg_params_count);
7806		|	mov Rd(IR_REG_VARARG_FP_REGS), fp_param
7807	}
7808#endif
7809
7810	return used_stack;
7811}
7812
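/* Emit the call itself (direct, via register, or via memory), release any
 * stack space taken by the arguments, and move the return value into the
 * location assigned to the instruction's result. */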
7813static void ir_emit_call_ex(ir_ctx *ctx, ir_ref def, ir_insn *insn, int32_t used_stack)
7814{
7815	ir_backend_data *data = ctx->data;
7816	dasm_State **Dst = &data->dasm_state;
7817	ir_reg def_reg;
7818
7819	if (IR_IS_CONST_REF(insn->op2)) {
7820		void *addr = ir_call_addr(ctx, insn, &ctx->ir_base[insn->op2]);
7821
7822		if (sizeof(void*) == 4 || IR_MAY_USE_32BIT_ADDR(ctx->code_buffer, addr)) {
7823			|	call aword &addr
7824		} else {
7825|.if X64
7826||			ir_reg tmp_reg = IR_REG_RAX;
7827
7828#ifdef IR_REG_VARARG_FP_REGS
7829||			if (ir_is_vararg(ctx, insn)) {
7830||				tmp_reg = IR_REG_R11;
7831||			}
7832#endif
7833||			if (IR_IS_SIGNED_32BIT(addr)) {
7834				|	mov Rq(tmp_reg), ((ptrdiff_t)addr)    // 0x48 0xc7 0xc0 <imm-32-bit>
7835||			} else {
7836				|	mov64 Rq(tmp_reg), ((ptrdiff_t)addr)  // 0x48 0xb8 <imm-64-bit>
7837||			}
7838			|	call Rq(tmp_reg)
7839|.endif
7840		}
7841	} else {
7842		ir_reg op2_reg = ctx->regs[def][2];
7843
7844		if (op2_reg != IR_REG_NONE) {
7845			if (IR_REG_SPILLED(op2_reg)) {
7846				op2_reg = IR_REG_NUM(op2_reg);
7847				ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2);
7848			}
7849			|	call Ra(op2_reg)
7850		} else {
7851			ir_mem mem;
7852
7853			if (ir_rule(ctx, insn->op2) & IR_FUSED) {
7854				mem = ir_fuse_load(ctx, def, insn->op2);
7855			} else {
7856				mem = ir_ref_spill_slot(ctx, insn->op2);
7857			}
7858
7859			|	ASM_TMEM_OP call, aword, mem
7860		}
7861	}
7862
7863	if (used_stack) {
7864		int32_t aligned_stack = IR_ALIGNED_SIZE(used_stack, 16);
7865
7866		ctx->call_stack_size -= aligned_stack;
7867		if (ir_is_fastcall(ctx, insn)) {
7868			aligned_stack -= used_stack;
7869			if (aligned_stack) {
7870				|	add Ra(IR_REG_RSP), aligned_stack
7871			}
7872		} else {
7873			|	add Ra(IR_REG_RSP), aligned_stack
7874		}
7875	}
7876
7877	if (insn->type != IR_VOID) {
7878		if (IR_IS_TYPE_INT(insn->type)) {
7879			def_reg = IR_REG_NUM(ctx->regs[def][0]);
7880			if (def_reg != IR_REG_NONE) {
7881				if (def_reg != IR_REG_INT_RET1) {
7882					ir_emit_mov(ctx, insn->type, def_reg, IR_REG_INT_RET1);
7883				}
7884				if (IR_REG_SPILLED(ctx->regs[def][0])) {
7885					ir_emit_store(ctx, insn->type, def, def_reg);
7886				}
7887			} else if (ctx->use_lists[def].count > 1) {
7888				ir_emit_store(ctx, insn->type, def, IR_REG_INT_RET1);
7889			}
7890		} else {
7891			IR_ASSERT(IR_IS_TYPE_FP(insn->type));
7892			def_reg = IR_REG_NUM(ctx->regs[def][0]);
7893#ifdef IR_REG_FP_RET1
7894			if (def_reg != IR_REG_NONE) {
7895				if (def_reg != IR_REG_FP_RET1) {
7896					ir_emit_fp_mov(ctx, insn->type, def_reg, IR_REG_FP_RET1);
7897				}
7898				if (IR_REG_SPILLED(ctx->regs[def][0])) {
7899					ir_emit_store(ctx, insn->type, def, def_reg);
7900				}
7901			} else if (ctx->use_lists[def].count > 1) {
7902				ir_emit_store(ctx, insn->type, def, IR_REG_FP_RET1);
7903			}
7904#else
7905			if (ctx->use_lists[def].count > 1) {
7906				int32_t offset;
7907				ir_reg fp;
7908
7909				if (def_reg == IR_REG_NONE) {
7910					offset = ir_ref_spill_slot_offset(ctx, def, &fp);
7911					if (insn->type == IR_DOUBLE) {
7912						|	fstp qword [Ra(fp)+offset]
7913					} else {
7914						IR_ASSERT(insn->type == IR_FLOAT);
7915						|	fstp dword [Ra(fp)+offset]
7916					}
7917				} else {
7918					offset = ctx->ret_slot;
7919					IR_ASSERT(offset != -1);
7920					offset = IR_SPILL_POS_TO_OFFSET(offset);
7921					fp = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
7922					if (insn->type == IR_DOUBLE) {
7923						|	fstp qword [Ra(fp)+offset]
7924					} else {
7925						IR_ASSERT(insn->type == IR_FLOAT);
7926						|	fstp dword [Ra(fp)+offset]
7927					}
7928					ir_emit_load_mem_fp(ctx, insn->type, def_reg, IR_MEM_BO(fp, offset));
7929					if (IR_REG_SPILLED(ctx->regs[def][0])) {
7930						ir_emit_store(ctx, insn->type, def, def_reg);
7931					}
7932				}
7933			}
7934#endif
7935		}
7936	}
7937}
7938
7939static void ir_emit_call(ir_ctx *ctx, ir_ref def, ir_insn *insn)
7940{
7941	int32_t used_stack = ir_emit_arguments(ctx, def, insn, ctx->regs[def][1]);
7942	ir_emit_call_ex(ctx, def, insn, used_stack);
7943}
7944
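/* TAILCALL: when the callee takes stack-passed arguments we fall back to a
 * regular call followed by a return; otherwise the frame is torn down and we
 * jump straight to the target. */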
7945static void ir_emit_tailcall(ir_ctx *ctx, ir_ref def, ir_insn *insn)
7946{
7947	ir_backend_data *data = ctx->data;
7948	dasm_State **Dst = &data->dasm_state;
7949	int32_t used_stack = ir_emit_arguments(ctx, def, insn, ctx->regs[def][1]);
7950
7951	if (used_stack != 0) {
7952		ir_emit_call_ex(ctx, def, insn, used_stack);
7953		ir_emit_return_void(ctx);
7954		return;
7955	}
7956
7957	ir_emit_epilogue(ctx);
7958
7959	if (IR_IS_CONST_REF(insn->op2)) {
7960		void *addr = ir_call_addr(ctx, insn, &ctx->ir_base[insn->op2]);
7961
7962		if (sizeof(void*) == 4 || IR_MAY_USE_32BIT_ADDR(ctx->code_buffer, addr)) {
7963			|	jmp aword &addr
7964		} else {
7965|.if X64
7966||			ir_reg tmp_reg = IR_REG_RAX;
7967
7968#ifdef IR_REG_VARARG_FP_REGS
7969||			if (ir_is_vararg(ctx, insn)) {
7970||				tmp_reg = IR_REG_R11;
7971||			}
7972#endif
7973||			if (IR_IS_SIGNED_32BIT(addr)) {
7974				|	mov Rq(tmp_reg), ((ptrdiff_t)addr)    // 0x48 0xc7 0xc0 <imm-32-bit>
7975||			} else {
7976				|	mov64 Rq(tmp_reg), ((ptrdiff_t)addr)  // 0x48 0xb8 <imm-64-bit>
7977||			}
7978			|	jmp Rq(tmp_reg)
7979|.endif
7980		}
7981	} else {
7982		ir_reg op2_reg = ctx->regs[def][2];
7983
7984		if (op2_reg != IR_REG_NONE) {
7985			if (IR_REG_SPILLED(op2_reg)) {
7986				op2_reg = IR_REG_NUM(op2_reg);
7987				ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2);
7988			}
7989			|	jmp Ra(op2_reg)
7990		} else {
7991			ir_mem mem;
7992
7993			if (ir_rule(ctx, insn->op2) & IR_FUSED) {
7994				mem = ir_fuse_load(ctx, def, insn->op2);
7995			} else {
7996				mem = ir_ref_spill_slot(ctx, insn->op2);
7997			}
7998			|	ASM_TMEM_OP jmp, aword, mem
7999		}
8000	}
8001}
8002
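/* IJMP: indirect jump through a constant address, a fused load, a register,
 * or a spill slot. */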
8003static void ir_emit_ijmp(ir_ctx *ctx, ir_ref def, ir_insn *insn)
8004{
8005	ir_backend_data *data = ctx->data;
8006	dasm_State **Dst = &data->dasm_state;
8007	ir_reg op2_reg = ctx->regs[def][2];
8008
8009	if (IR_IS_CONST_REF(insn->op2)) {
8010		void *addr = ir_jmp_addr(ctx, insn, &ctx->ir_base[insn->op2]);
8011
8012		if (sizeof(void*) == 4 || IR_MAY_USE_32BIT_ADDR(ctx->code_buffer, addr)) {
8013			|	jmp aword &addr
8014		} else {
8015|.if X64
8016			if (IR_IS_SIGNED_32BIT(addr)) {
8017				|	mov rax, ((ptrdiff_t)addr)    // 0x48 0xc7 0xc0 <imm-32-bit>
8018			} else {
8019				|	mov64 rax, ((ptrdiff_t)addr)  // 0x48 0xb8 <imm-64-bit>
8020			}
8021			|	jmp rax
8022|.endif
8023		}
8024	} else if (ir_rule(ctx, insn->op2) & IR_FUSED) {
8025		ir_mem mem = ir_fuse_load(ctx, def, insn->op2);
8026		|	ASM_TMEM_OP jmp, aword, mem
8027	} else if (op2_reg != IR_REG_NONE) {
8028		if (IR_REG_SPILLED(op2_reg)) {
8029			op2_reg = IR_REG_NUM(op2_reg);
8030			ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2);
8031		}
8032		|	jmp Ra(op2_reg)
8033	} else {
8034		ir_mem mem = ir_ref_spill_slot(ctx, insn->op2);
8035
8036		|	ASM_TMEM_OP jmp, aword, mem
8037	}
8038}
8039
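/* Emit the conditional branch of a guard to the side-exit address "addr".
 * When the guard is immediately followed by an unconditional (or constant
 * indirect) jump, the condition is inverted so that the guard branches to the
 * regular successor and falls through into a single "jmp addr"; the function
 * returns 1 in that case to tell the caller the jump was consumed.  FP
 * comparisons also test PF, because an unordered (NaN) result sets the parity
 * flag. */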
8040static bool ir_emit_guard_jcc(ir_ctx *ctx, uint32_t b, ir_ref def, uint8_t op, void *addr, bool int_cmp)
8041{
8042	ir_backend_data *data = ctx->data;
8043	dasm_State **Dst = &data->dasm_state;
8044	ir_insn *next_insn = &ctx->ir_base[def + 1];
8045
8046	if (next_insn->op == IR_END || next_insn->op == IR_LOOP_END) {
8047		ir_block *bb = &ctx->cfg_blocks[b];
8048		uint32_t target;
8049
8050		if (!(bb->flags & IR_BB_DESSA_MOVES)) {
8051			target = ctx->cfg_edges[bb->successors];
8052			if (UNEXPECTED(bb->successors_count == 2)) {
8053				if (ctx->cfg_blocks[target].flags & IR_BB_ENTRY) {
8054					target = ctx->cfg_edges[bb->successors + 1];
8055				} else {
8056					IR_ASSERT(ctx->cfg_blocks[ctx->cfg_edges[bb->successors + 1]].flags & IR_BB_ENTRY);
8057				}
8058			} else {
8059				IR_ASSERT(bb->successors_count == 1);
8060			}
8061			target = ir_skip_empty_target_blocks(ctx, target);
8062			if (b == ctx->cfg_blocks_count || target != ir_skip_empty_next_blocks(ctx, b + 1)) {
8063				if (int_cmp) {
8064					switch (op) {
8065						default:
8066							IR_ASSERT(0 && "NIY binary op");
8067						case IR_EQ:
8068							|	jne =>target
8069							break;
8070						case IR_NE:
8071							|	je =>target
8072							break;
8073						case IR_LT:
8074							|	jge =>target
8075							break;
8076						case IR_GE:
8077							|	jl =>target
8078							break;
8079						case IR_LE:
8080							|	jg =>target
8081							break;
8082						case IR_GT:
8083							|	jle =>target
8084							break;
8085						case IR_ULT:
8086							|	jae =>target
8087							break;
8088						case IR_UGE:
8089							|	jb =>target
8090							break;
8091						case IR_ULE:
8092							|	ja =>target
8093							break;
8094						case IR_UGT:
8095							|	jbe =>target
8096							break;
8097					}
8098				} else {
8099					switch (op) {
8100						default:
8101							IR_ASSERT(0 && "NIY binary op");
8102						case IR_EQ:
8103							|	jne =>target
8104							|	jp =>target
8105							break;
8106						case IR_NE:
8107							|	jp &addr
8108							|	je =>target
8109							break;
8110						case IR_LT:
8111							|	jae =>target
8112							break;
8113						case IR_GE:
8114							|	jp &addr
8115							|	jb =>target
8116							break;
8117						case IR_LE:
8118							|	ja =>target
8119							break;
8120						case IR_GT:
8121							|	jp &addr
8122							|	jbe =>target
8123							break;
8124					}
8125				}
8126				|	jmp &addr
8127				return 1;
8128			}
8129		}
8130	} else if (next_insn->op == IR_IJMP && IR_IS_CONST_REF(next_insn->op2)) {
8131		void *target_addr = ir_jmp_addr(ctx, next_insn, &ctx->ir_base[next_insn->op2]);
8132
8133		if (sizeof(void*) == 4 || IR_MAY_USE_32BIT_ADDR(ctx->code_buffer, target_addr)) {
8134			if (int_cmp) {
8135				switch (op) {
8136					default:
8137						IR_ASSERT(0 && "NIY binary op");
8138					case IR_EQ:
8139						|	jne &target_addr
8140						break;
8141					case IR_NE:
8142						|	je &target_addr
8143						break;
8144					case IR_LT:
8145						|	jge &target_addr
8146						break;
8147					case IR_GE:
8148						|	jl &target_addr
8149						break;
8150					case IR_LE:
8151						|	jg &target_addr
8152						break;
8153					case IR_GT:
8154						|	jle &target_addr
8155						break;
8156					case IR_ULT:
8157						|	jae &target_addr
8158						break;
8159					case IR_UGE:
8160						|	jb &target_addr
8161						break;
8162					case IR_ULE:
8163						|	ja &target_addr
8164						break;
8165					case IR_UGT:
8166						|	jbe &target_addr
8167						break;
8168				}
8169			} else {
8170				switch (op) {
8171					default:
8172						IR_ASSERT(0 && "NIY binary op");
8173					case IR_EQ:
8174						|	jne &target_addr
8175						|	jp &target_addr
8176						break;
8177					case IR_NE:
8178						|	jp &addr
8179						|	je &target_addr
8180						break;
8181					case IR_LT:
8182						|	jae &target_addr
8183						break;
8184					case IR_GE:
8185						|	jp &addr
8186						|	jb &target_addr
8187						break;
8188					case IR_LE:
8189						|	ja &target_addr
8190						break;
8191					case IR_GT:
8192						|	jp &addr
8193						|	jbe &target_addr
8194						break;
8195				}
8196			}
8197			|	jmp &addr
8198			return 1;
8199		}
8200	}
8201
8202	if (int_cmp) {
8203		switch (op) {
8204			default:
8205				IR_ASSERT(0 && "NIY binary op");
8206			case IR_EQ:
8207				|	je &addr
8208				break;
8209			case IR_NE:
8210				|	jne &addr
8211				break;
8212			case IR_LT:
8213				|	jl &addr
8214				break;
8215			case IR_GE:
8216				|	jge &addr
8217				break;
8218			case IR_LE:
8219				|	jle &addr
8220				break;
8221			case IR_GT:
8222				|	jg &addr
8223				break;
8224			case IR_ULT:
8225				|	jb &addr
8226				break;
8227			case IR_UGE:
8228				|	jae &addr
8229				break;
8230			case IR_ULE:
8231				|	jbe &addr
8232				break;
8233			case IR_UGT:
8234				|	ja &addr
8235				break;
8236		}
8237	} else {
8238		switch (op) {
8239			default:
8240				IR_ASSERT(0 && "NIY binary op");
8241			case IR_EQ:
8242				|	jp >1
8243				|	je &addr
8244				|1:
8245				break;
8246			case IR_NE:
8247				|	jne &addr
8248				|	jp &addr
8249				break;
8250			case IR_LT:
8251				|	jp >1
8252				|	jb &addr
8253				|1:
8254				break;
8255			case IR_GE:
8256				|	jae &addr
8257				break;
8258			case IR_LE:
8259				|	jp >1
8260				|	jbe &addr
8261				|1:
8262				break;
8263			case IR_GT:
8264				|	ja &addr
8265				break;
8266//			case IR_ULT: fprintf(stderr, "\tjb .LL%d\n", true_block); break;
8267//			case IR_UGE: fprintf(stderr, "\tjae .LL%d\n", true_block); break;
8268//			case IR_ULE: fprintf(stderr, "\tjbe .LL%d\n", true_block); break;
8269//			case IR_UGT: fprintf(stderr, "\tja .LL%d\n", true_block); break;
8270		}
8271	}
8272	return 0;
8273}
8274
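/* GUARD/GUARD_NOT on a plain boolean value: compare it against zero and
 * branch to the deoptimization address when the guard fails. */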
8275static bool ir_emit_guard(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn)
8276{
8277	ir_backend_data *data = ctx->data;
8278	dasm_State **Dst = &data->dasm_state;
8279	ir_reg op2_reg = ctx->regs[def][2];
8280	ir_type type = ctx->ir_base[insn->op2].type;
8281	void *addr;
8282
8283	IR_ASSERT(IR_IS_TYPE_INT(type));
8284	if (IR_IS_CONST_REF(insn->op2)) {
8285		bool is_true = ir_ref_is_true(ctx, insn->op2);
8286
8287		if ((insn->op == IR_GUARD && !is_true) || (insn->op == IR_GUARD_NOT && is_true)) {
8288			addr = ir_jmp_addr(ctx, insn, &ctx->ir_base[insn->op3]);
8289			if (sizeof(void*) == 4 || IR_MAY_USE_32BIT_ADDR(ctx->code_buffer, addr)) {
8290				|	jmp aword &addr
8291			} else {
8292|.if X64
8293				if (IR_IS_SIGNED_32BIT(addr)) {
8294					|	mov rax, ((ptrdiff_t)addr)    // 0x48 0xc7 0xc0 <imm-32-bit>
8295				} else {
8296					|	mov64 rax, ((ptrdiff_t)addr)  // 0x48 0xb8 <imm-64-bit>
8297				}
8298				|	jmp aword [rax]
8299|.endif
8300			}
8301		}
8302		return 0;
8303	}
8304
8305	if (op2_reg != IR_REG_NONE) {
8306		if (IR_REG_SPILLED(op2_reg)) {
8307			op2_reg = IR_REG_NUM(op2_reg);
8308			ir_emit_load(ctx, type, op2_reg, insn->op2);
8309		}
8310		|	ASM_REG_REG_OP test, type, op2_reg, op2_reg
8311	} else {
8312		ir_mem mem;
8313
8314		if (ir_rule(ctx, insn->op2) & IR_FUSED) {
8315			mem = ir_fuse_load(ctx, def, insn->op2);
8316		} else {
8317			mem = ir_ref_spill_slot(ctx, insn->op2);
8318		}
8319		|	ASM_MEM_IMM_OP cmp, type, mem, 0
8320	}
8321
8322	addr = ir_jmp_addr(ctx, insn, &ctx->ir_base[insn->op3]);
8323	if (sizeof(void*) == 4 || IR_MAY_USE_32BIT_ADDR(ctx->code_buffer, addr)) {
8324		ir_op op;
8325
8326		if (insn->op == IR_GUARD) {
8327			op = IR_EQ;
8328		} else {
8329			op = IR_NE;
8330		}
8331		return ir_emit_guard_jcc(ctx, b, def, op, addr, 1);
8332	} else {
8333|.if X64
8334		if (insn->op == IR_GUARD) {
8335			|	je >1
8336		} else {
8337			|	jne >1
8338		}
8339		|.cold_code
8340		|1:
8341		if (IR_IS_SIGNED_32BIT(addr)) {
8342			|	mov rax, ((ptrdiff_t)addr)    // 0x48 0xc7 0xc0 <imm-32-bit>
8343		} else {
8344			|	mov64 rax, ((ptrdiff_t)addr)  // 0x48 0xb8 <imm-64-bit>
8345		}
8346		|	jmp aword [rax]
8347		|.code
8348|.endif
8349		return 0;
8350	}
8351}
8352
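/* GUARD on an integer comparison: fuse the CMP with the guard's branch.
 * Unsigned comparisons against zero are strength-reduced first (x < 0 is
 * always false, x >= 0 is always true, x <= 0 becomes x == 0 and x > 0
 * becomes x != 0). */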
8353static bool ir_emit_guard_cmp_int(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn)
8354{
8355	ir_backend_data *data = ctx->data;
8356	dasm_State **Dst = &data->dasm_state;
8357	ir_insn *cmp_insn = &ctx->ir_base[insn->op2];
8358	ir_op op = cmp_insn->op;
8359	ir_type type = ctx->ir_base[cmp_insn->op1].type;
8360	ir_ref op1 = cmp_insn->op1;
8361	ir_ref op2 = cmp_insn->op2;
8362	ir_reg op1_reg = ctx->regs[insn->op2][1];
8363	ir_reg op2_reg = ctx->regs[insn->op2][2];
8364	void *addr;
8365
8366	if (op1_reg != IR_REG_NONE && (IR_IS_CONST_REF(op1) || IR_REG_SPILLED(op1_reg))) {
8367		op1_reg = IR_REG_NUM(op1_reg);
8368		ir_emit_load(ctx, type, op1_reg, op1);
8369	}
8370	if (op2_reg != IR_REG_NONE && (IR_IS_CONST_REF(op2) || IR_REG_SPILLED(op2_reg))) {
8371		op2_reg = IR_REG_NUM(op2_reg);
8372		if (op1 != op2) {
8373			ir_emit_load(ctx, type, op2_reg, op2);
8374		}
8375	}
8376
8377	addr = ir_jmp_addr(ctx, insn, &ctx->ir_base[insn->op3]);
8378	if (IR_IS_CONST_REF(op2) && !IR_IS_SYM_CONST(ctx->ir_base[op2].op) && ctx->ir_base[op2].val.u64 == 0) {
8379		if (op == IR_ULT) {
8380			/* always false */
8381			if (sizeof(void*) == 4 || IR_MAY_USE_32BIT_ADDR(ctx->code_buffer, addr)) {
8382				|	jmp aword &addr
8383			} else {
8384|.if X64
8385				if (IR_IS_SIGNED_32BIT(addr)) {
8386					|	mov rax, ((ptrdiff_t)addr)    // 0x48 0xc7 0xc0 <imm-32-bit>
8387				} else {
8388					|	mov64 rax, ((ptrdiff_t)addr)  // 0x48 0xb8 <imm-64-bit>
8389				}
8390				|	jmp aword [rax]
8391|.endif
8392			}
8393			return 0;
8394		} else if (op == IR_UGE) {
8395			/* always true */
8396			return 0;
8397		} else if (op == IR_ULE) {
8398			op = IR_EQ;
8399		} else if (op == IR_UGT) {
8400			op = IR_NE;
8401		}
8402	}
8403	ir_emit_cmp_int_common(ctx, type, def, cmp_insn, op1_reg, op1, op2_reg, op2);
8404
8405	if (insn->op == IR_GUARD) {
8406		op ^= 1; // reverse
8407	}
8408
8409	return ir_emit_guard_jcc(ctx, b, def, op, addr, 1);
8410}
8411
8412static bool ir_emit_guard_cmp_fp(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn)
8413{
8414	ir_op op = ir_emit_cmp_fp_common(ctx, def, insn->op2, &ctx->ir_base[insn->op2]);
8415	void *addr = ir_jmp_addr(ctx, insn, &ctx->ir_base[insn->op3]);
8416
8417	if (insn->op == IR_GUARD) {
8418		op ^= 1; // reverse
8419	}
8420	return ir_emit_guard_jcc(ctx, b, def, op, addr, 0);
8421}
8422
8423static bool ir_emit_guard_test_int(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn)
8424{
8425	void *addr = ir_jmp_addr(ctx, insn, &ctx->ir_base[insn->op3]);
8426	ir_op op = (insn->op == IR_GUARD) ? IR_EQ : IR_NE;
8427
8428	ir_emit_test_int_common(ctx, def, insn->op2, op);
8429	return ir_emit_guard_jcc(ctx, b, def, op, addr, 1);
8430}
8431
8432static bool ir_emit_guard_jcc_int(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn)
8433{
8434	void *addr = ir_jmp_addr(ctx, insn, &ctx->ir_base[insn->op3]);
8435	ir_op op = ctx->ir_base[insn->op2].op;
8436
8437	if (insn->op == IR_GUARD) {
8438		op ^= 1; // reverse
8439	}
8440	return ir_emit_guard_jcc(ctx, b, def, op, addr, 1);
8441}
8442
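/* GUARD on an OVERFLOW check: signed overflow is reported through OF (jo/jno),
 * unsigned overflow through the carry flag (jc/jnc). */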
8443static bool ir_emit_guard_overflow(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn)
8444{
8445	ir_backend_data *data = ctx->data;
8446	dasm_State **Dst = &data->dasm_state;
8447	ir_type type;
8448	void *addr = ir_jmp_addr(ctx, insn, &ctx->ir_base[insn->op3]);
8449
8450	type = ctx->ir_base[ctx->ir_base[insn->op2].op1].type;
8451
8452	IR_ASSERT(IR_IS_TYPE_INT(type));
8453	if (IR_IS_TYPE_SIGNED(type)) {
8454		if (insn->op == IR_GUARD) {
8455			|	jno &addr
8456		} else {
8457			|	jo &addr
8458		}
8459	} else {
8460		if (insn->op == IR_GUARD) {
8461			|	jnc &addr
8462		} else {
8463			|	jc &addr
8464		}
8465	}
8466	return 0;
8467}
8468
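/* Lower a fused address computation to LEA, degrading to a plain ADD when the
 * destination register already holds the base or the index and no offset or
 * scale remains. */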
8469static void ir_emit_lea(ir_ctx *ctx, ir_ref def, ir_type type)
8470{
8471	ir_backend_data *data = ctx->data;
8472	dasm_State **Dst = &data->dasm_state;
8473	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
8474	ir_mem mem = ir_fuse_addr(ctx, def, def);
8475
8476	IR_ASSERT(def_reg != IR_REG_NONE);
8477	if (ir_type_size[type] == 4) {
8478		if (IR_MEM_BASE(mem) == def_reg
8479		 && IR_MEM_OFFSET(mem) == 0
8480		 && IR_MEM_SCALE(mem) == 1
8481		 && IR_MEM_INDEX(mem) != IR_REG_NONE) {
8482			ir_reg reg = IR_MEM_INDEX(mem);
8483			|	add Rd(def_reg), Rd(reg)
8484		} else if (IR_MEM_INDEX(mem) == def_reg
8485		 && IR_MEM_OFFSET(mem) == 0
8486		 && IR_MEM_SCALE(mem) == 1
8487		 && IR_MEM_BASE(mem) != IR_REG_NONE) {
8488			ir_reg reg = IR_MEM_BASE(mem);
8489			|	add Rd(def_reg), Rd(reg)
8490		} else {
8491			|	ASM_TXT_TMEM_OP lea, Rd(def_reg), dword, mem
8492		}
8493	} else {
8494		if (IR_MEM_BASE(mem) == def_reg
8495		 && IR_MEM_OFFSET(mem) == 0
8496		 && IR_MEM_SCALE(mem) == 1
8497		 && IR_MEM_INDEX(mem) != IR_REG_NONE) {
8498			ir_reg reg = IR_MEM_INDEX(mem);
8499			|	add Ra(def_reg), Ra(reg)
8500		} else if (IR_MEM_INDEX(mem) == def_reg
8501		 && IR_MEM_OFFSET(mem) == 0
8502		 && IR_MEM_SCALE(mem) == 1
8503		 && IR_MEM_BASE(mem) != IR_REG_NONE) {
8504			ir_reg reg = IR_MEM_BASE(mem);
8505			|	add Ra(def_reg), Ra(reg)
8506		} else {
8507			|	ASM_TXT_TMEM_OP lea, Ra(def_reg), aword, mem
8508		}
8509	}
8510	if (IR_REG_SPILLED(ctx->regs[def][0])) {
8511		ir_emit_store(ctx, type, def, def_reg);
8512	}
8513}
8514
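/* TLS: load a thread-local address.  The segment register (FS or GS) and the
 * offsets within the thread block are platform-specific; op2/op3 carry the
 * offsets chosen by the front-end. */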
8515static void ir_emit_tls(ir_ctx *ctx, ir_ref def, ir_insn *insn)
8516{
8517	ir_backend_data *data = ctx->data;
8518	dasm_State **Dst = &data->dasm_state;
8519	ir_reg reg = IR_REG_NUM(ctx->regs[def][0]);
8520
8521	if (ctx->use_lists[def].count == 1) {
8522		/* dead load */
8523		return;
8524	}
8525
8526|.if X64WIN
8527|	gs
8528|	mov Ra(reg), aword [0x58]
8529|	mov Ra(reg), aword [Ra(reg)+insn->op2]
8530|	mov Ra(reg), aword [Ra(reg)+insn->op3]
8531|.elif WIN
8532|	fs
8533|	mov Ra(reg), aword [0x2c]
8534|	mov Ra(reg), aword [Ra(reg)+insn->op2]
8535|	mov Ra(reg), aword [Ra(reg)+insn->op3]
8536|.elif X64APPLE
8537|	gs
8538||	if (insn->op3 == IR_NULL) {
8539|		mov Ra(reg), aword [insn->op2]
8540||	} else {
8541|		mov Ra(reg), aword [insn->op2]
8542|		mov Ra(reg), aword [Ra(reg)+insn->op3]
8543||	}
8544|.elif X64
8545|	fs
8546||	if (insn->op3 == IR_NULL) {
8547|		mov Ra(reg), aword [insn->op2]
8548||	} else {
8549|		mov Ra(reg), [0x8]
8550|		mov Ra(reg), aword [Ra(reg)+insn->op2]
8551|		mov Ra(reg), aword [Ra(reg)+insn->op3]
8552||	}
8553|.else
8554|	gs
8555||	if (insn->op3 == IR_NULL) {
8556|		mov Ra(reg), aword [insn->op2]
8557||	} else {
8558|		mov Ra(reg), [0x4]
8559|		mov Ra(reg), aword [Ra(reg)+insn->op2]
8560|		mov Ra(reg), aword [Ra(reg)+insn->op3]
8561||	}
8562|	.endif
8563	if (IR_REG_SPILLED(ctx->regs[def][0])) {
8564		ir_emit_store(ctx, IR_ADDR, def, reg);
8565	}
8566}
8567
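/* EXITCALL: save the complete GP and SSE register state on the stack and call
 * the exit handler with two arguments: the value pushed by the exit stub
 * (typically the exit number) and a pointer to the saved register block, so
 * the runtime can reconstruct the guest state during deoptimization. */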
8568static void ir_emit_exitcall(ir_ctx *ctx, ir_ref def, ir_insn *insn)
8569{
8570	ir_backend_data *data = ctx->data;
8571	dasm_State **Dst = &data->dasm_state;
8572	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
8573
8574	IR_ASSERT(def_reg != IR_REG_NONE);
8575
8576	|.if X64
8577	|	sub rsp, 16*8+16*8+8 /* CPU regs + SSE regs */
8578	|	mov aword [rsp+0*8], rax
8579	|	mov aword [rsp+1*8], rcx
8580	|	mov aword [rsp+2*8], rdx
8581	|	mov aword [rsp+3*8], rbx
8582	|	mov aword [rsp+5*8], rbp
8583	|	mov aword [rsp+6*8], rsi
8584	|	mov aword [rsp+7*8], rdi
8585	|	mov aword [rsp+8*8], r8
8586	|	mov aword [rsp+9*8], r9
8587	|	mov aword [rsp+10*8], r10
8588	|	mov aword [rsp+11*8], r11
8589	|	mov aword [rsp+12*8], r12
8590	|	mov aword [rsp+13*8], r13
8591	|	mov aword [rsp+14*8], r14
8592	|	mov aword [rsp+15*8], r15
8593	|	movsd qword [rsp+16*8+0*8], xmm0
8594	|	movsd qword [rsp+16*8+1*8], xmm1
8595	|	movsd qword [rsp+16*8+2*8], xmm2
8596	|	movsd qword [rsp+16*8+3*8], xmm3
8597	|	movsd qword [rsp+16*8+4*8], xmm4
8598	|	movsd qword [rsp+16*8+5*8], xmm5
8599	|	movsd qword [rsp+16*8+6*8], xmm6
8600	|	movsd qword [rsp+16*8+7*8], xmm7
8601	|	movsd qword [rsp+16*8+8*8], xmm8
8602	|	movsd qword [rsp+16*8+9*8], xmm9
8603	|	movsd qword [rsp+16*8+10*8], xmm10
8604	|	movsd qword [rsp+16*8+11*8], xmm11
8605	|	movsd qword [rsp+16*8+12*8], xmm12
8606	|	movsd qword [rsp+16*8+13*8], xmm13
8607	|	movsd qword [rsp+16*8+14*8], xmm14
8608	|	movsd qword [rsp+16*8+15*8], xmm15
8609	|
8610	|	mov Ra(IR_REG_INT_ARG2), rsp
8611	|	lea Ra(IR_REG_INT_ARG1), [rsp+16*8+16*8+16]
8612	|	mov aword [rsp+4*8], Ra(IR_REG_INT_ARG1)
8613	|	mov Ra(IR_REG_INT_ARG1), [rsp+16*8+16*8+8]
8614	|.if X64WIN
8615	|	sub rsp, 32 /* shadow space */
8616	|.endif
8617	|.else
8618	|	sub esp, 8*4+8*8+12 /* CPU regs + SSE regs */
8619	|	mov aword [esp+0*4], eax
8620	|	mov aword [esp+1*4], ecx
8621	|	mov aword [esp+2*4], edx
8622	|	mov aword [esp+3*4], ebx
8623	|	mov aword [esp+5*4], ebp
8624	|	mov aword [esp+6*4], esi
8625	|	mov aword [esp+7*4], edi
8626	|	movsd qword [esp+8*4+0*8], xmm0
8627	|	movsd qword [esp+8*4+1*8], xmm1
8628	|	movsd qword [esp+8*4+2*8], xmm2
8629	|	movsd qword [esp+8*4+3*8], xmm3
8630	|	movsd qword [esp+8*4+4*8], xmm4
8631	|	movsd qword [esp+8*4+5*8], xmm5
8632	|	movsd qword [esp+8*4+6*8], xmm6
8633	|	movsd qword [esp+8*4+7*8], xmm7
8634	|
8635	|	mov Ra(IR_REG_INT_FCARG2), esp
8636	|	lea Ra(IR_REG_INT_FCARG1), [esp+8*4+8*8+16]
8637	|	mov aword [esp+4*4], Ra(IR_REG_INT_FCARG1)
8638	|	mov Ra(IR_REG_INT_FCARG1), [esp+8*4+8*8+12]
8639	|.endif
8640
8641	if (IR_IS_CONST_REF(insn->op2)) {
8642		void *addr = ir_call_addr(ctx, insn, &ctx->ir_base[insn->op2]);
8643
8644		if (sizeof(void*) == 4 || IR_MAY_USE_32BIT_ADDR(ctx->code_buffer, addr)) {
8645			|	call aword &addr
8646		} else {
8647|.if X64
8648			if (IR_IS_SIGNED_32BIT(addr)) {
8649				|	mov rax, ((ptrdiff_t)addr)    // 0x48 0xc7 0xc0 <imm-32-bit>
8650			} else {
8651				|	mov64 rax, ((ptrdiff_t)addr)  // 0x48 0xb8 <imm-64-bit>
8652			}
8653			|	call rax
8654|.endif
8655		}
8656	} else {
8657		IR_ASSERT(0);
8658	}
8659
8660	//  restore SP
8661	|.if X64WIN
8662	|	add rsp, 32+16*8+16*8+16 /* shadow space + CPU regs + SSE regs */
8663	|.elif X64
8664	|	add rsp, 16*8+16*8+16 /* CPU regs + SSE regs */
8665	|.else
8666	|	add esp, 8*4+8*8+16 /* CPU regs + SSE regs */
8667	|.endif
8668
8669	if (def_reg != IR_REG_INT_RET1) {
8670		ir_emit_mov(ctx, insn->type, def_reg, IR_REG_INT_RET1);
8671	}
8672	if (IR_REG_SPILLED(ctx->regs[def][0])) {
8673		ir_emit_store(ctx, insn->type, def, def_reg);
8674	}
8675}
8676
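/* Move one incoming parameter from its ABI location (an argument register or a
 * stack slot at "offset" from the frame/stack pointer) into its allocated
 * register or spill slot. */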
8677static void ir_emit_param_move(ir_ctx *ctx, uint8_t type, ir_reg from_reg, ir_reg to_reg, ir_ref to, int32_t offset)
8678{
8679	ir_reg fp = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
8680
8681	IR_ASSERT(from_reg != IR_REG_NONE || to_reg != IR_REG_NONE);
8682
8683	if (IR_IS_TYPE_INT(type)) {
8684		if (from_reg != IR_REG_NONE) {
8685			if (to_reg != IR_REG_NONE) {
8686				ir_emit_mov(ctx, type, to_reg, from_reg);
8687			} else {
8688				ir_emit_store(ctx, type, to, from_reg);
8689			}
8690		} else {
8691			ir_emit_load_mem_int(ctx, type, to_reg, IR_MEM_BO(fp, offset));
8692		}
8693	} else {
8694		if (from_reg != IR_REG_NONE) {
8695			if (to_reg != IR_REG_NONE) {
8696				ir_emit_fp_mov(ctx, type, to_reg, from_reg);
8697			} else {
8698				ir_emit_store(ctx, type, to, from_reg);
8699			}
8700		} else {
8701			ir_emit_load_mem_fp(ctx, type, to_reg, IR_MEM_BO(fp, offset));
8702		}
8703	}
8704}
8705
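/* Walk the PARAM instructions of the start block and move every live incoming
 * parameter from its calling-convention location to wherever the allocator
 * placed it. */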
8706static void ir_emit_load_params(ir_ctx *ctx)
8707{
8708	ir_use_list *use_list = &ctx->use_lists[1];
8709	ir_insn *insn;
8710	ir_ref i, n, *p, use;
8711	int int_param_num = 0;
8712	int fp_param_num = 0;
8713	ir_reg src_reg;
8714	ir_reg dst_reg;
8715	// TODO: Calling convention specific
8716	int int_reg_params_count = IR_REG_INT_ARGS;
8717	int fp_reg_params_count = IR_REG_FP_ARGS;
8718	const int8_t *int_reg_params = _ir_int_reg_params;
8719	const int8_t *fp_reg_params = _ir_fp_reg_params;
8720	int32_t stack_offset = 0;
8721
8722#ifdef IR_TARGET_X86
8723	if (sizeof(void*) == 4 && (ctx->flags & IR_FASTCALL_FUNC)) {
8724		int_reg_params_count = IR_REG_INT_FCARGS;
8725		fp_reg_params_count = IR_REG_FP_FCARGS;
8726		int_reg_params = _ir_int_fc_reg_params;
8727		fp_reg_params = _ir_fp_fc_reg_params;
8728	}
8729#endif
8730
8731	if (ctx->flags & IR_USE_FRAME_POINTER) {
8732		stack_offset = sizeof(void*) * 2; /* skip old frame pointer and return address */
8733	} else {
8734		stack_offset = sizeof(void*) + ctx->stack_frame_size + ctx->call_stack_size; /* skip return address */
8735	}
8736	n = use_list->count;
8737	for (i = 0, p = &ctx->use_edges[use_list->refs]; i < n; i++, p++) {
8738		use = *p;
8739		insn = &ctx->ir_base[use];
8740		if (insn->op == IR_PARAM) {
8741			if (IR_IS_TYPE_INT(insn->type)) {
8742				if (int_param_num < int_reg_params_count) {
8743					src_reg = int_reg_params[int_param_num];
8744				} else {
8745					src_reg = IR_REG_NONE;
8746				}
8747				int_param_num++;
8748#ifdef _WIN64
8749				/* The WIN64 calling convention uses a common counter for int and fp registers */
8750				fp_param_num++;
8751#endif
8752			} else {
8753				if (fp_param_num < fp_reg_params_count) {
8754					src_reg = fp_reg_params[fp_param_num];
8755				} else {
8756					src_reg = IR_REG_NONE;
8757				}
8758				fp_param_num++;
8759#ifdef _WIN64
8760				/* The WIN64 calling convention uses a common counter for int and fp registers */
8761				int_param_num++;
8762#endif
8763			}
8764			if (ctx->vregs[use]) {
8765				dst_reg = IR_REG_NUM(ctx->regs[use][0]);
8766				IR_ASSERT(src_reg != IR_REG_NONE || dst_reg != IR_REG_NONE ||
8767					stack_offset == ctx->live_intervals[ctx->vregs[use]]->stack_spill_pos +
8768						((ctx->flags & IR_USE_FRAME_POINTER) ? -ctx->stack_frame_size : ctx->call_stack_size));
8769				if (src_reg != dst_reg) {
8770					ir_emit_param_move(ctx, insn->type, src_reg, dst_reg, use, stack_offset);
8771				}
8772				if (dst_reg != IR_REG_NONE && IR_REG_SPILLED(ctx->regs[use][0])) {
8773					ir_emit_store(ctx, insn->type, use, dst_reg);
8774				}
8775			}
8776			if (src_reg == IR_REG_NONE) {
8777				if (sizeof(void*) == 8) {
8778					stack_offset += sizeof(void*);
8779				} else {
8780					stack_offset += IR_MAX(sizeof(void*), ir_type_size[insn->type]);
8781				}
8782			}
8783		}
8784	}
8785}
8786
8787static ir_reg ir_get_free_reg(ir_type type, ir_regset available)
8788{
8789	if (IR_IS_TYPE_INT(type)) {
8790		available = IR_REGSET_INTERSECTION(available, IR_REGSET_GP);
8791	} else {
8792		IR_ASSERT(IR_IS_TYPE_FP(type));
8793		available = IR_REGSET_INTERSECTION(available, IR_REGSET_FP);
8794	}
8795	IR_ASSERT(!IR_REGSET_IS_EMPTY(available));
8796	return IR_REGSET_FIRST(available);
8797}
8798
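/* Callback for the SSA-destruction pass: reserve default temporary registers
 * (RAX for integer moves, XMM0 for FP moves) on the block-ending instruction,
 * for resolving cyclic or memory-to-memory parallel copies. */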
8799static int ir_fix_dessa_tmps(ir_ctx *ctx, uint8_t type, ir_ref from, ir_ref to)
8800{
8801	ir_backend_data *data = ctx->data;
8802	ir_ref ref = ctx->cfg_blocks[data->dessa_from_block].end;
8803
8804	if (to == 0) {
8805		if (IR_IS_TYPE_INT(type)) {
8806			if (ctx->regs[ref][0] == IR_REG_NONE) {
8807				ctx->regs[ref][0] = IR_REG_RAX;
8808			}
8809		} else {
8810			IR_ASSERT(IR_IS_TYPE_FP(type));
8811			if (ctx->regs[ref][1] == IR_REG_NONE) {
8812				ctx->regs[ref][1] = IR_REG_XMM0;
8813			}
8814		}
8815	} else if (from != 0) {
8816		if (IR_IS_TYPE_INT(type)) {
8817			if (ctx->regs[ref][0] == IR_REG_NONE) {
8818				ctx->regs[ref][0] = IR_REG_RAX;
8819			}
8820		} else {
8821			IR_ASSERT(IR_IS_TYPE_FP(type));
8822			if (ctx->regs[ref][1] == IR_REG_NONE) {
8823				ctx->regs[ref][1] = IR_REG_XMM0;
8824			}
8825		}
8826	}
8827	return 1;
8828}
8829
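/* Bind memory-resident parameters to the caller-allocated argument slots, so
 * they need no separate spill slot or entry move, and record the register and
 * stack parameter counts that va_start relies upon. */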
8830static void ir_fix_param_spills(ir_ctx *ctx)
8831{
8832	ir_use_list *use_list = &ctx->use_lists[1];
8833	ir_insn *insn;
8834	ir_ref i, n, *p, use;
8835	int int_param_num = 0;
8836	int fp_param_num = 0;
8837	ir_reg src_reg;
8838	// TODO: Calling convention specific
8839	int int_reg_params_count = IR_REG_INT_ARGS;
8840	int fp_reg_params_count = IR_REG_FP_ARGS;
8841	const int8_t *int_reg_params = _ir_int_reg_params;
8842	const int8_t *fp_reg_params = _ir_fp_reg_params;
8843	int32_t stack_start = 0;
8844	int32_t stack_offset = 0;
8845
8846#ifdef IR_TARGET_X86
8847	if (sizeof(void*) == 4 && (ctx->flags & IR_FASTCALL_FUNC)) {
8848		int_reg_params_count = IR_REG_INT_FCARGS;
8849		fp_reg_params_count = IR_REG_FP_FCARGS;
8850		int_reg_params = _ir_int_fc_reg_params;
8851		fp_reg_params = _ir_fp_fc_reg_params;
8852	}
8853#endif
8854
8855	if (ctx->flags & IR_USE_FRAME_POINTER) {
8856		/* skip old frame pointer and return address */
8857		stack_start = sizeof(void*) * 2 + (ctx->stack_frame_size - ctx->stack_frame_alignment);
8858	} else {
8859		 /* skip return address */
8860		/* skip return address */
8861	}
8862	n = use_list->count;
8863	for (i = 0, p = &ctx->use_edges[use_list->refs]; i < n; i++, p++) {
8864		use = *p;
8865		insn = &ctx->ir_base[use];
8866		if (insn->op == IR_PARAM) {
8867			if (IR_IS_TYPE_INT(insn->type)) {
8868				if (int_param_num < int_reg_params_count) {
8869					src_reg = int_reg_params[int_param_num];
8870				} else {
8871					src_reg = IR_REG_NONE;
8872				}
8873				int_param_num++;
8874#ifdef _WIN64
8875				/* WIN64 calling convention uses a common counter for int and fp registers */
8876				fp_param_num++;
8877#endif
8878			} else {
8879				if (fp_param_num < fp_reg_params_count) {
8880					src_reg = fp_reg_params[fp_param_num];
8881				} else {
8882					src_reg = IR_REG_NONE;
8883				}
8884				fp_param_num++;
8885#ifdef _WIN64
8886				/* WIN64 calling convention uses a common counter for int and fp registers */
8887				int_param_num++;
8888#endif
8889			}
8890			if (src_reg == IR_REG_NONE) {
8891				if (ctx->vregs[use]) {
8892					ir_live_interval *ival = ctx->live_intervals[ctx->vregs[use]];
8893					if ((ival->flags & IR_LIVE_INTERVAL_MEM_PARAM)
8894					 && ival->stack_spill_pos == -1
8895					 && (ival->next || ival->reg == IR_REG_NONE)) {
8896						ival->stack_spill_pos = stack_start + stack_offset;
8897					}
8898				}
8899				if (sizeof(void*) == 8) {
8900					stack_offset += sizeof(void*);
8901				} else {
8902					stack_offset += IR_MAX(sizeof(void*), ir_type_size[insn->type]);
8903				}
8904			}
8905		}
8906	}
8907
8908#ifdef _WIN64
8909	/* WIN64 reserves shadow space (home area) for the register arguments */
8910	stack_offset += IR_MIN(int_param_num, int_reg_params_count) * sizeof(void*);
8911#endif
8912	ctx->gp_reg_params = IR_MIN(int_param_num, int_reg_params_count);
8913	ctx->fp_reg_params = IR_MIN(fp_param_num, fp_reg_params_count);
8914	ctx->param_stack_size = stack_offset;
8915}
8916
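/* Simplified spilling used when the full register allocator is skipped:
 * every virtual register gets its own spill slot, and per-instruction
 * register constraints are satisfied from the scratch register set. */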
8917static void ir_allocate_unique_spill_slots(ir_ctx *ctx)
8918{
8919	uint32_t b;
8920	ir_block *bb;
8921	ir_insn *insn;
8922	ir_ref i, n, j, *p;
8923	uint32_t *rule, insn_flags;
8924	ir_backend_data *data = ctx->data;
8925	ir_regset available = 0;
8926	ir_target_constraints constraints;
8927	uint32_t def_flags;
8928	ir_reg reg;
8929
8930#ifndef IR_REG_FP_RET1
8931	if (ctx->flags2 & IR_HAS_FP_RET_SLOT) {
8932		ctx->ret_slot = ir_allocate_spill_slot(ctx, IR_DOUBLE, &data->ra_data);
8933	} else if (ctx->ret_type == IR_FLOAT || ctx->ret_type == IR_DOUBLE) {
8934		ctx->ret_slot = ir_allocate_spill_slot(ctx, ctx->ret_type, &data->ra_data);
8935	} else {
8936		ctx->ret_slot = -1;
8937	}
8938#endif
8939
8940	ctx->regs = ir_mem_malloc(sizeof(ir_regs) * ctx->insns_count);
8941	memset(ctx->regs, IR_REG_NONE, sizeof(ir_regs) * ctx->insns_count);
8942
8943	/* vregs + tmp + fixed + SCRATCH + ALL */
8944	ctx->live_intervals = ir_mem_calloc(ctx->vregs_count + 1 + IR_REG_NUM + 2, sizeof(ir_live_interval*));
8945
8946	if (!ctx->arena) {
8947		ctx->arena = ir_arena_create(16 * 1024);
8948	}
8949
8950	for (b = 1, bb = ctx->cfg_blocks + b; b <= ctx->cfg_blocks_count; b++, bb++) {
8951		IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
8952		for (i = bb->start, insn = ctx->ir_base + i, rule = ctx->rules + i; i <= bb->end;) {
8953			switch (ctx->rules ? *rule : insn->op) {
8954				case IR_START:
8955				case IR_BEGIN:
8956				case IR_END:
8957				case IR_IF_TRUE:
8958				case IR_IF_FALSE:
8959				case IR_CASE_VAL:
8960				case IR_CASE_DEFAULT:
8961				case IR_MERGE:
8962				case IR_LOOP_BEGIN:
8963				case IR_LOOP_END:
8964					break;
8965#ifndef IR_REG_FP_RET1
8966				case IR_CALL:
8967					if (ctx->ret_slot == -1 && (insn->type == IR_FLOAT || insn->type == IR_DOUBLE)) {
8968						ctx->ret_slot = ir_allocate_spill_slot(ctx, IR_DOUBLE, &data->ra_data);
8969					}
8970#endif
8971					IR_FALLTHROUGH;
8972				default:
8973					def_flags = ir_get_target_constraints(ctx, i, &constraints);
8974					if (ctx->rules
8975					 && *rule != IR_CMP_AND_BRANCH_INT
8976					 && *rule != IR_CMP_AND_BRANCH_FP
8977					 && *rule != IR_TEST_AND_BRANCH_INT
8978					 && *rule != IR_GUARD_CMP_INT
8979					 && *rule != IR_GUARD_CMP_FP) {
8980						available = IR_REGSET_SCRATCH;
8981					}
8982					if (ctx->vregs[i]) {
8983						reg = constraints.def_reg;
8984						if (reg != IR_REG_NONE && IR_REGSET_IN(available, reg)) {
8985							IR_REGSET_EXCL(available, reg);
8986							ctx->regs[i][0] = reg | IR_REG_SPILL_STORE;
8987						} else if (def_flags & IR_USE_MUST_BE_IN_REG) {
8988							if (insn->op == IR_VLOAD
8989							 && ctx->live_intervals[ctx->vregs[i]]
8990							 && ctx->live_intervals[ctx->vregs[i]]->stack_spill_pos != -1) {
8991								/* pass */
8992							} else if (insn->op != IR_PARAM) {
8993								reg = ir_get_free_reg(insn->type, available);
8994								IR_REGSET_EXCL(available, reg);
8995								ctx->regs[i][0] = reg | IR_REG_SPILL_STORE;
8996							}
8997						}
8998						if (!ctx->live_intervals[ctx->vregs[i]]) {
8999							ir_live_interval *ival = ir_arena_alloc(&ctx->arena, sizeof(ir_live_interval));
9000							memset(ival, 0, sizeof(ir_live_interval));
9001							ctx->live_intervals[ctx->vregs[i]] = ival;
9002							ival->type = insn->type;
9003							ival->reg = IR_REG_NONE;
9004							ival->vreg = ctx->vregs[i];
9005							ival->stack_spill_pos = -1;
9006							if (insn->op == IR_PARAM && reg == IR_REG_NONE) {
9007								ival->flags |= IR_LIVE_INTERVAL_MEM_PARAM;
9008							} else {
9009								ival->stack_spill_pos = ir_allocate_spill_slot(ctx, ival->type, &data->ra_data);
9010							}
9011						} else if (insn->op == IR_PARAM) {
9012							IR_ASSERT(0 && "unexpected PARAM");
9013							return;
9014						}
9015					} else if (insn->op == IR_VAR) {
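						/* A VAR gets a spill slot of its own; VLOAD/VSTORE
						 * users without a live interval share that slot. */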
9016						ir_use_list *use_list = &ctx->use_lists[i];
9017						ir_ref n = use_list->count;
9018
9019						if (n > 0) {
9020							int32_t stack_spill_pos = insn->op3 = ir_allocate_spill_slot(ctx, insn->type, &data->ra_data);
9021							ir_ref i, *p, use;
9022							ir_insn *use_insn;
9023
9024							for (i = 0, p = &ctx->use_edges[use_list->refs]; i < n; i++, p++) {
9025								use = *p;
9026								use_insn = &ctx->ir_base[use];
9027								if (use_insn->op == IR_VLOAD) {
9028									if (ctx->vregs[use]
9029									 && !ctx->live_intervals[ctx->vregs[use]]) {
9030										ir_live_interval *ival = ir_arena_alloc(&ctx->arena, sizeof(ir_live_interval));
9031										memset(ival, 0, sizeof(ir_live_interval));
9032										ctx->live_intervals[ctx->vregs[use]] = ival;
9033										ival->type = insn->type;
9034										ival->reg = IR_REG_NONE;
9035										ival->vreg = ctx->vregs[use];
9036										ival->stack_spill_pos = stack_spill_pos;
9037									}
9038								} else if (use_insn->op == IR_VSTORE) {
9039									if (!IR_IS_CONST_REF(use_insn->op3)
9040									 && ctx->vregs[use_insn->op3]
9041									 && !ctx->live_intervals[ctx->vregs[use_insn->op3]]) {
9042										ir_live_interval *ival = ir_arena_alloc(&ctx->arena, sizeof(ir_live_interval));
9043										memset(ival, 0, sizeof(ir_live_interval));
9044										ctx->live_intervals[ctx->vregs[use_insn->op3]] = ival;
9045										ival->type = insn->type;
9046										ival->reg = IR_REG_NONE;
9047										ival->vreg = ctx->vregs[use_insn->op3];
9048										ival->stack_spill_pos = stack_spill_pos;
9049									}
9050								}
9051							}
9052						}
9053					}
9054
9055					insn_flags = ir_op_flags[insn->op];
9056					n = constraints.tmps_count;
9057					if (n) {
9058						do {
9059							n--;
9060							if (constraints.tmp_regs[n].type) {
9061								ir_reg reg = ir_get_free_reg(constraints.tmp_regs[n].type, available);
9062								IR_REGSET_EXCL(available, reg);
9063								ctx->regs[i][constraints.tmp_regs[n].num] = reg;
9064							} else if (constraints.tmp_regs[n].reg == IR_REG_SCRATCH) {
9065								available = IR_REGSET_DIFFERENCE(available, IR_REGSET_SCRATCH);
9066							} else {
9067								IR_REGSET_EXCL(available, constraints.tmp_regs[n].reg);
9068							}
9069						} while (n);
9070					}
9071					n = insn->inputs_count;
9072					for (j = 1, p = insn->ops + 1; j <= n; j++, p++) {
9073						ir_ref input = *p;
9074						if (IR_OPND_KIND(insn_flags, j) == IR_OPND_DATA && input > 0 && ctx->vregs[input]) {
9075							if ((def_flags & IR_DEF_REUSES_OP1_REG) && j == 1) {
9076								ir_reg reg = IR_REG_NUM(ctx->regs[i][0]);
9077								ctx->regs[i][1] = reg | IR_REG_SPILL_LOAD;
9078							} else {
9079								uint8_t use_flags = IR_USE_FLAGS(def_flags, j);
9080								ir_reg reg = (j < constraints.hints_count) ? constraints.hints[j] : IR_REG_NONE;
9081
9082								if (reg != IR_REG_NONE && IR_REGSET_IN(available, reg)) {
9083									IR_REGSET_EXCL(available, reg);
9084									ctx->regs[i][j] = reg | IR_REG_SPILL_LOAD;
9085								} else if (j > 1 && input == insn->op1 && ctx->regs[i][1] != IR_REG_NONE) {
9086									ctx->regs[i][j] = ctx->regs[i][1];
9087								} else if (use_flags & IR_USE_MUST_BE_IN_REG) {
9088									reg = ir_get_free_reg(ctx->ir_base[input].type, available);
9089									IR_REGSET_EXCL(available, reg);
9090									ctx->regs[i][j] = reg | IR_REG_SPILL_LOAD;
9091								}
9092							}
9093						}
9094					}
9095					break;
9096			}
9097			n = ir_insn_len(insn);
9098			i += n;
9099			insn += n;
9100			rule += n;
9101		}
9102		if (bb->flags & IR_BB_DESSA_MOVES) {
9103			data->dessa_from_block = b;
9104			ir_gen_dessa_moves(ctx, b, ir_fix_dessa_tmps);
9105		}
9106	}
9107
9108	ctx->used_preserved_regs = ctx->fixed_save_regset;
9109	ctx->flags |= IR_NO_STACK_COMBINE;
9110	ir_fix_stack_frame(ctx);
9111}
9112
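/* Compute the peak stack size used for outgoing call arguments, so that the
 * prologue can reserve it once instead of adjusting the stack pointer
 * around each call. */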
9113static void ir_preallocate_call_stack(ir_ctx *ctx)
9114{
9115	int call_stack_size, peak_call_stack_size = 0;
9116	ir_ref i, n;
9117	ir_insn *insn;
9118
9119	for (i = 1, insn = ctx->ir_base + 1; i < ctx->insns_count;) {
9120		if (insn->op == IR_CALL) {
9121			call_stack_size = ir_call_used_stack(ctx, insn);
9122			if (call_stack_size > peak_call_stack_size
9123#ifdef IR_HAVE_FASTCALL
9124			 && !ir_is_fastcall(ctx, insn) /* fast call functions restore stack pointer */
9125#endif
9126			) {
9127				peak_call_stack_size = call_stack_size;
9128			}
9129		}
9130		n = ir_insn_len(insn);
9131		i += n;
9132		insn += n;
9133	}
9134	if (peak_call_stack_size) {
9135		ctx->call_stack_size = peak_call_stack_size;
9136		ctx->flags |= IR_PREALLOCATED_STACK;
9137	}
9138}
9139
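/* Finalize the stack frame layout: reserve space for the preserved
 * registers (and for the register save area required by va_start() on
 * non-Windows x86_64), then pad the frame so that the stack stays 16-byte
 * aligned across calls. */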
9140void ir_fix_stack_frame(ir_ctx *ctx)
9141{
9142	uint32_t additional_size = 0;
9143
9144	ctx->locals_area_size = ctx->stack_frame_size;
9145
9146	if (ctx->used_preserved_regs) {
9147		ir_regset used_preserved_regs = (ir_regset)ctx->used_preserved_regs;
9148		ir_reg reg;
9149		(void) reg;
9150
9151		IR_REGSET_FOREACH(used_preserved_regs, reg) {
9152			additional_size += sizeof(void*);
9153		} IR_REGSET_FOREACH_END();
9154	}
9155
9156#if defined(IR_TARGET_X64) && !defined(_WIN64)
9157	if ((ctx->flags & IR_VARARG_FUNC) && (ctx->flags2 & IR_HAS_VA_START)) {
9158		if ((ctx->flags2 & (IR_HAS_VA_ARG_GP|IR_HAS_VA_COPY)) && ctx->gp_reg_params < IR_REG_INT_ARGS) {
9159			additional_size += sizeof(void*) * IR_REG_INT_ARGS;
9160		}
9161		if ((ctx->flags2 & (IR_HAS_VA_ARG_FP|IR_HAS_VA_COPY)) && ctx->fp_reg_params < IR_REG_FP_ARGS) {
9162			additional_size += 16 * IR_REG_FP_ARGS;
9163		}
9164	}
9165#endif
9166
9167	ctx->stack_frame_size = IR_ALIGNED_SIZE(ctx->stack_frame_size, sizeof(void*));
9168	ctx->stack_frame_size += additional_size;
9169	ctx->stack_frame_alignment = 0;
9170	ctx->call_stack_size = 0;
9171
9172	if (ctx->flags2 & IR_HAS_CALLS) {
9173		/* The stack must be 16-byte aligned */
9174		if (!(ctx->flags & IR_FUNCTION)) {
9175			while (IR_ALIGNED_SIZE(ctx->stack_frame_size, 16) != ctx->stack_frame_size) {
9176				ctx->stack_frame_size += sizeof(void*);
9177				ctx->stack_frame_alignment += sizeof(void*);
9178			}
9179		} else if (ctx->flags & IR_USE_FRAME_POINTER) {
9180			while (IR_ALIGNED_SIZE(ctx->stack_frame_size + sizeof(void*) * 2, 16) != ctx->stack_frame_size + sizeof(void*) * 2) {
9181				ctx->stack_frame_size += sizeof(void*);
9182				ctx->stack_frame_alignment += sizeof(void*);
9183			}
9184		} else {
9185			if (!(ctx->flags & IR_NO_STACK_COMBINE)) {
9186				ir_preallocate_call_stack(ctx);
9187			}
9188			while (IR_ALIGNED_SIZE(ctx->stack_frame_size + ctx->call_stack_size + sizeof(void*), 16) !=
9189					ctx->stack_frame_size + ctx->call_stack_size + sizeof(void*)) {
9190				ctx->stack_frame_size += sizeof(void*);
9191				ctx->stack_frame_alignment += sizeof(void*);
9192			}
9193		}
9194	}
9195
9196	ir_fix_param_spills(ctx);
9197}
9198
9199static void* dasm_labels[ir_lb_MAX];
9200
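/* Main code generation pass: walk the scheduled blocks, dispatch every
 * matched rule to its emitter, flush the collected FP/string constants
 * into .rodata, then link and encode the DynASM program into executable
 * memory. The pc-labels are laid out as: block labels (1..blocks_count),
 * constant labels, the rodata label, the jmp_table label, and one label
 * per entry point. */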
9201void *ir_emit_code(ir_ctx *ctx, size_t *size_ptr)
9202{
9203	uint32_t b, n, target;
9204	ir_block *bb;
9205	ir_ref i;
9206	ir_insn *insn;
9207	uint32_t *rule;
9208	ir_backend_data data;
9209	dasm_State **Dst;
9210	int ret;
9211	void *entry;
9212	size_t size;
9213
9214	data.ra_data.unused_slot_4 = 0;
9215	data.ra_data.unused_slot_2 = 0;
9216	data.ra_data.unused_slot_1 = 0;
9217	data.ra_data.handled = NULL;
9218	data.rodata_label = 0;
9219	data.jmp_table_label = 0;
9220	data.double_neg_const = 0;
9221	data.float_neg_const = 0;
9222	data.double_abs_const = 0;
9223	data.float_abs_const = 0;
9224	data.double_zero_const = 0;
9225	ctx->data = &data;
9226
9227	if (!ctx->live_intervals) {
9228		ctx->stack_frame_size = 0;
9229		ctx->stack_frame_alignment = 0;
9230		ctx->call_stack_size = 0;
9231		ctx->used_preserved_regs = 0;
9232		ir_allocate_unique_spill_slots(ctx);
9233	}
9234
9235	if (ctx->fixed_stack_frame_size != -1) {
9236		if (ctx->fixed_stack_red_zone) {
9237			IR_ASSERT(ctx->fixed_stack_red_zone == ctx->fixed_stack_frame_size + ctx->fixed_call_stack_size);
9238		}
9239		if (ctx->stack_frame_size > ctx->fixed_stack_frame_size) {
9240			// TODO: report error to caller
9241#ifdef IR_DEBUG_MESSAGES
9242			fprintf(stderr, "IR Compilation Aborted: ctx->stack_frame_size > ctx->fixed_stack_frame_size at %s:%d\n",
9243				__FILE__, __LINE__);
9244#endif
9245			ctx->data = NULL;
9246			ctx->status = IR_ERROR_FIXED_STACK_FRAME_OVERFLOW;
9247			return NULL;
9248		}
9249		ctx->stack_frame_size = ctx->fixed_stack_frame_size;
9250		ctx->call_stack_size = ctx->fixed_call_stack_size;
9251		ctx->stack_frame_alignment = 0;
9252	}
9253
9254	Dst = &data.dasm_state;
9255	data.dasm_state = NULL;
9256	dasm_init(&data.dasm_state, DASM_MAXSECTION);
9257	dasm_setupglobal(&data.dasm_state, dasm_labels, ir_lb_MAX);
9258	dasm_setup(&data.dasm_state, dasm_actions);
9259	/* labels for each block + for each constant + rodata label + jmp_table label + for each entry */
9260	dasm_growpc(&data.dasm_state, ctx->cfg_blocks_count + 1 + ctx->consts_count + 1 + 1 + 1 + ctx->entries_count);
9261	data.emit_constants = ir_bitset_malloc(ctx->consts_count);
9262
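	/* Intel CET: mark the function start as a valid indirect branch target */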
9263	if ((ctx->flags & IR_GEN_ENDBR) && (ctx->flags & IR_START_BR_TARGET)) {
9264		|.if X64
9265		|	endbr64
9266		|.else
9267		|	endbr32
9268		|.endif
9269	}
9270
9271	if (!(ctx->flags & IR_SKIP_PROLOGUE)) {
9272		ir_emit_prologue(ctx);
9273	}
9274	if (ctx->flags & IR_FUNCTION) {
9275		ir_emit_load_params(ctx);
9276	}
9277
9278	for (b = 1, bb = ctx->cfg_blocks + b; b <= ctx->cfg_blocks_count; b++, bb++) {
9279		IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
9280		if ((bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) == IR_BB_EMPTY) {
9281			continue;
9282		}
9283		|=>b:
9284
9285		i = bb->start;
9286		insn = ctx->ir_base + i;
9287		if (bb->flags & IR_BB_ENTRY) {
9288			uint32_t label = ctx->cfg_blocks_count + ctx->consts_count + 4 + insn->op3;
9289
9290			|=>label:
9291			if ((ctx->flags & IR_GEN_ENDBR) && (ctx->flags & IR_ENTRY_BR_TARGET)) {
9292				|.if X64
9293				|	endbr64
9294				|.else
9295				|	endbr32
9296				|.endif
9297			}
9298			ir_emit_prologue(ctx);
9299			ctx->entries[insn->op3] = i;
9300		}
9301
9302		/* skip first instruction */
9303		n = ir_insn_len(insn);
9304		i += n;
9305		insn += n;
9306		rule = ctx->rules + i;
9307
9308		while (i <= bb->end) {
9309			if (!((*rule) & (IR_FUSED|IR_SKIPPED)))
9310			switch (*rule) {
9311				case IR_VAR:
9312				case IR_PARAM:
9313				case IR_PI:
9314				case IR_PHI:
9315				case IR_SNAPSHOT:
9316				case IR_VA_END:
9317					break;
9318				case IR_LEA_OB:
9319				case IR_LEA_SI:
9320				case IR_LEA_SIB:
9321				case IR_LEA_IB:
9322				case IR_LEA_OB_I:
9323				case IR_LEA_I_OB:
9324				case IR_LEA_SI_O:
9325				case IR_LEA_SIB_O:
9326				case IR_LEA_IB_O:
9327				case IR_LEA_OB_SI:
9328				case IR_LEA_SI_OB:
9329				case IR_LEA_B_SI:
9330				case IR_LEA_SI_B:
9331					ir_emit_lea(ctx, i, insn->type);
9332					break;
9333				case IR_MUL_PWR2:
9334				case IR_DIV_PWR2:
9335				case IR_MOD_PWR2:
9336					ir_emit_mul_div_mod_pwr2(ctx, i, insn);
9337					break;
9338				case IR_SDIV_PWR2:
9339					ir_emit_sdiv_pwr2(ctx, i, insn);
9340					break;
9341				case IR_SMOD_PWR2:
9342					ir_emit_smod_pwr2(ctx, i, insn);
9343					break;
9344				case IR_SHIFT:
9345					ir_emit_shift(ctx, i, insn);
9346					break;
9347				case IR_SHIFT_CONST:
9348					ir_emit_shift_const(ctx, i, insn);
9349					break;
9350				case IR_BIT_COUNT:
9351					ir_emit_bit_count(ctx, i, insn);
9352					break;
9353				case IR_CTPOP:
9354					ir_emit_ctpop(ctx, i, insn);
9355					break;
9356				case IR_INC:
9357				case IR_DEC:
9358				case IR_OP_INT:
9359					ir_emit_op_int(ctx, i, insn, *rule);
9360					break;
9361				case IR_ABS_INT:
9362					ir_emit_abs_int(ctx, i, insn);
9363					break;
9364				case IR_BOOL_NOT_INT:
9365					ir_emit_bool_not_int(ctx, i, insn);
9366					break;
9367				case IR_OP_FP:
9368					ir_emit_op_fp(ctx, i, insn);
9369					break;
9370				case IR_IMUL3:
9371					ir_emit_imul3(ctx, i, insn);
9372					break;
9373				case IR_BINOP_INT:
9374					ir_emit_binop_int(ctx, i, insn);
9375					break;
9376				case IR_BINOP_SSE2:
9377					ir_emit_binop_sse2(ctx, i, insn);
9378					break;
9379				case IR_BINOP_AVX:
9380					ir_emit_binop_avx(ctx, i, insn);
9381					break;
9382				case IR_MUL_INT:
9383				case IR_DIV_INT:
9384				case IR_MOD_INT:
9385					ir_emit_mul_div_mod(ctx, i, insn);
9386					break;
9387				case IR_CMP_INT:
9388					ir_emit_cmp_int(ctx, i, insn);
9389					break;
9390				case IR_TESTCC_INT:
9391					ir_emit_testcc_int(ctx, i, insn);
9392					break;
9393				case IR_SETCC_INT:
9394					ir_emit_setcc_int(ctx, i, insn);
9395					break;
9396				case IR_CMP_FP:
9397					ir_emit_cmp_fp(ctx, i, insn);
9398					break;
9399				case IR_SEXT:
9400					ir_emit_sext(ctx, i, insn);
9401					break;
9402				case IR_ZEXT:
9403					ir_emit_zext(ctx, i, insn);
9404					break;
9405				case IR_TRUNC:
9406					ir_emit_trunc(ctx, i, insn);
9407					break;
9408				case IR_BITCAST:
9409				case IR_PROTO:
9410					ir_emit_bitcast(ctx, i, insn);
9411					break;
9412				case IR_INT2FP:
9413					ir_emit_int2fp(ctx, i, insn);
9414					break;
9415				case IR_FP2INT:
9416					ir_emit_fp2int(ctx, i, insn);
9417					break;
9418				case IR_FP2FP:
9419					ir_emit_fp2fp(ctx, i, insn);
9420					break;
9421				case IR_COPY_INT:
9422					ir_emit_copy_int(ctx, i, insn);
9423					break;
9424				case IR_COPY_FP:
9425					ir_emit_copy_fp(ctx, i, insn);
9426					break;
9427				case IR_CMP_AND_BRANCH_INT:
9428					ir_emit_cmp_and_branch_int(ctx, b, i, insn);
9429					break;
9430				case IR_CMP_AND_BRANCH_FP:
9431					ir_emit_cmp_and_branch_fp(ctx, b, i, insn);
9432					break;
9433				case IR_TEST_AND_BRANCH_INT:
9434					ir_emit_test_and_branch_int(ctx, b, i, insn);
9435					break;
9436				case IR_JCC_INT:
9437					{
9438						ir_op op = ctx->ir_base[insn->op2].op;
9439
9440						if (op == IR_ADD ||
9441						    op == IR_SUB ||
9442//						    op == IR_MUL ||
9443						    op == IR_OR  ||
9444						    op == IR_AND ||
9445						    op == IR_XOR) {
9446							op = IR_NE;
9447						} else {
9448							IR_ASSERT(op >= IR_EQ && op <= IR_UGT);
9449						}
9450						ir_emit_jcc(ctx, op, b, i, insn, 1);
9451					}
9452					break;
9453				case IR_GUARD_CMP_INT:
9454					if (ir_emit_guard_cmp_int(ctx, b, i, insn)) {
9455						goto next_block;
9456					}
9457					break;
9458				case IR_GUARD_CMP_FP:
9459					if (ir_emit_guard_cmp_fp(ctx, b, i, insn)) {
9460						goto next_block;
9461					}
9462					break;
9463				case IR_GUARD_TEST_INT:
9464					if (ir_emit_guard_test_int(ctx, b, i, insn)) {
9465						goto next_block;
9466					}
9467					break;
9468				case IR_GUARD_JCC_INT:
9469					if (ir_emit_guard_jcc_int(ctx, b, i, insn)) {
9470						goto next_block;
9471					}
9472					break;
9473				case IR_IF_INT:
9474					ir_emit_if_int(ctx, b, i, insn);
9475					break;
9476				case IR_COND:
9477					ir_emit_cond(ctx, i, insn);
9478					break;
9479				case IR_SWITCH:
9480					ir_emit_switch(ctx, b, i, insn);
9481					break;
9482				case IR_MIN_MAX_INT:
9483					ir_emit_min_max_int(ctx, i, insn);
9484					break;
9485				case IR_OVERFLOW:
9486					ir_emit_overflow(ctx, i, insn);
9487					break;
9488				case IR_OVERFLOW_AND_BRANCH:
9489					ir_emit_overflow_and_branch(ctx, b, i, insn);
9490					break;
9491				case IR_END:
9492				case IR_LOOP_END:
9493					if (bb->flags & IR_BB_OSR_ENTRY_LOADS) {
9494						ir_emit_osr_entry_loads(ctx, b, bb);
9495					}
9496					if (bb->flags & IR_BB_DESSA_MOVES) {
9497						ir_emit_dessa_moves(ctx, b, bb);
9498					}
9499					do {
9500						ir_ref succ = ctx->cfg_edges[bb->successors];
9501
9502						if (UNEXPECTED(bb->successors_count == 2)) {
9503							if (ctx->cfg_blocks[succ].flags & IR_BB_ENTRY) {
9504								succ = ctx->cfg_edges[bb->successors + 1];
9505							} else {
9506								IR_ASSERT(ctx->cfg_blocks[ctx->cfg_edges[bb->successors + 1]].flags & IR_BB_ENTRY);
9507							}
9508						} else {
9509							IR_ASSERT(bb->successors_count == 1);
9510						}
9511						target = ir_skip_empty_target_blocks(ctx, succ);
9512						if (b == ctx->cfg_blocks_count || target != ir_skip_empty_next_blocks(ctx, b + 1)) {
9513							|	jmp =>target
9514						}
9515					} while (0);
9516					break;
9517				case IR_RETURN_VOID:
9518					ir_emit_return_void(ctx);
9519					break;
9520				case IR_RETURN_INT:
9521					ir_emit_return_int(ctx, i, insn);
9522					break;
9523				case IR_RETURN_FP:
9524					ir_emit_return_fp(ctx, i, insn);
9525					break;
9526				case IR_CALL:
9527					ir_emit_call(ctx, i, insn);
9528					break;
9529				case IR_TAILCALL:
9530					ir_emit_tailcall(ctx, i, insn);
9531					break;
9532				case IR_IJMP:
9533					ir_emit_ijmp(ctx, i, insn);
9534					break;
9535				case IR_MEM_OP_INT:
9536				case IR_MEM_INC:
9537				case IR_MEM_DEC:
9538					ir_emit_mem_op_int(ctx, i, insn, *rule);
9539					break;
9540				case IR_MEM_BINOP_INT:
9541					ir_emit_mem_binop_int(ctx, i, insn);
9542					break;
9543				case IR_MEM_MUL_PWR2:
9544				case IR_MEM_DIV_PWR2:
9545				case IR_MEM_MOD_PWR2:
9546					ir_emit_mem_mul_div_mod_pwr2(ctx, i, insn);
9547					break;
9548				case IR_MEM_SHIFT:
9549					ir_emit_mem_shift(ctx, i, insn);
9550					break;
9551				case IR_MEM_SHIFT_CONST:
9552					ir_emit_mem_shift_const(ctx, i, insn);
9553					break;
9554				case IR_REG_BINOP_INT:
9555					ir_emit_reg_binop_int(ctx, i, insn);
9556					break;
9557				case IR_VADDR:
9558					ir_emit_vaddr(ctx, i, insn);
9559					break;
9560				case IR_VLOAD:
9561					ir_emit_vload(ctx, i, insn);
9562					break;
9563				case IR_VSTORE_INT:
9564					ir_emit_vstore_int(ctx, i, insn);
9565					break;
9566				case IR_VSTORE_FP:
9567					ir_emit_vstore_fp(ctx, i, insn);
9568					break;
9569				case IR_RLOAD:
9570					ir_emit_rload(ctx, i, insn);
9571					break;
9572				case IR_RSTORE:
9573					ir_emit_rstore(ctx, i, insn);
9574					break;
9575				case IR_LOAD_INT:
9576					ir_emit_load_int(ctx, i, insn);
9577					break;
9578				case IR_LOAD_FP:
9579					ir_emit_load_fp(ctx, i, insn);
9580					break;
9581				case IR_STORE_INT:
9582					ir_emit_store_int(ctx, i, insn);
9583					break;
9584				case IR_STORE_FP:
9585					ir_emit_store_fp(ctx, i, insn);
9586					break;
9587				case IR_ALLOCA:
9588					ir_emit_alloca(ctx, i, insn);
9589					break;
9590				case IR_VA_START:
9591					ir_emit_va_start(ctx, i, insn);
9592					break;
9593				case IR_VA_COPY:
9594					ir_emit_va_copy(ctx, i, insn);
9595					break;
9596				case IR_VA_ARG:
9597					ir_emit_va_arg(ctx, i, insn);
9598					break;
9599				case IR_AFREE:
9600					ir_emit_afree(ctx, i, insn);
9601					break;
9602				case IR_FRAME_ADDR:
9603					ir_emit_frame_addr(ctx, i);
9604					break;
9605				case IR_EXITCALL:
9606					ir_emit_exitcall(ctx, i, insn);
9607					break;
9608				case IR_GUARD:
9609				case IR_GUARD_NOT:
9610					if (ir_emit_guard(ctx, b, i, insn)) {
9611						goto next_block;
9612					}
9613					break;
9614				case IR_GUARD_OVERFLOW:
9615					if (ir_emit_guard_overflow(ctx, b, i, insn)) {
9616						goto next_block;
9617					}
9618					break;
9619				case IR_TLS:
9620					ir_emit_tls(ctx, i, insn);
9621					break;
9622				case IR_TRAP:
9623					|	int3
9624					break;
9625				default:
9626					IR_ASSERT(0 && "NIY rule/instruction");
9627					ir_mem_free(data.emit_constants);
9628					dasm_free(&data.dasm_state);
9629					ctx->data = NULL;
9630					ctx->status = IR_ERROR_UNSUPPORTED_CODE_RULE;
9631					return NULL;
9632			}
9633			n = ir_insn_len(insn);
9634			i += n;
9635			insn += n;
9636			rule += n;
9637		}
9638next_block:;
9639	}
9640
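	/* emit the deferred floating-point and string constants into .rodata */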
9641	if (data.rodata_label) {
9642		|.rodata
9643	}
9644	IR_BITSET_FOREACH(data.emit_constants, ir_bitset_len(ctx->consts_count), i) {
9645		insn = &ctx->ir_base[-i];
9646		if (IR_IS_TYPE_FP(insn->type)) {
9647			int label = ctx->cfg_blocks_count + i;
9648
9649			if (!data.rodata_label) {
9650				data.rodata_label = ctx->cfg_blocks_count + ctx->consts_count + 2;
9651
9652				|.rodata
9653				|=>data.rodata_label:
9654			}
9655			if (insn->type == IR_DOUBLE) {
9656				|.align 8
9657				|=>label:
9658				|.dword insn->val.u32, insn->val.u32_hi
9659			} else {
9660				IR_ASSERT(insn->type == IR_FLOAT);
9661				|.align 4
9662				|=>label:
9663				|.dword insn->val.u32
9664			}
9665		} else if (insn->op == IR_STR) {
9666			int label = ctx->cfg_blocks_count + i;
9667			const char *str = ir_get_str(ctx, insn->val.str);
9668			int i = 0;
9669
9670			if (!data.rodata_label) {
9671				data.rodata_label = ctx->cfg_blocks_count + ctx->consts_count + 2;
9672
9673				|.rodata
9674				|=>data.rodata_label:
9675			}
9676			|.align 8
9677			|=>label:
9678			while (str[i]) {
9679				char c = str[i];
9680
9681				|.byte c
9682				i++;
9683			}
9684			|.byte 0
9685
9686		} else {
9687			IR_ASSERT(0);
9688		}
9689	} IR_BITSET_FOREACH_END();
9690	if (data.rodata_label) {
9691		|.code
9692	}
9693	ir_mem_free(data.emit_constants);
9694
9695	if (ctx->status) {
9696		dasm_free(&data.dasm_state);
9697		ctx->data = NULL;
9698		return NULL;
9699	}
9700
9701	ret = dasm_link(&data.dasm_state, size_ptr);
9702	if (ret != DASM_S_OK) {
9703		IR_ASSERT(0);
9704		dasm_free(&data.dasm_state);
9705		ctx->data = NULL;
9706		ctx->status = IR_ERROR_LINK;
9707		return NULL;
9708	}
9709	size = *size_ptr;
9710
9711	if (ctx->code_buffer) {
9712		entry = ctx->code_buffer->pos;
9713		entry = (void*)IR_ALIGNED_SIZE(((size_t)(entry)), 16);
9714		if (size > (size_t)((char*)ctx->code_buffer->end - (char*)entry)) {
9715			ctx->data = NULL;
9716			ctx->status = IR_ERROR_CODE_MEM_OVERFLOW;
9717			return NULL;
9718		}
9719		ctx->code_buffer->pos = (char*)entry + size;
9720	} else {
9721		entry = ir_mem_mmap(size);
9722		if (!entry) {
9723			dasm_free(&data.dasm_state);
9724			ctx->data = NULL;
9725			ctx->status = IR_ERROR_CODE_MEM_OVERFLOW;
9726			return NULL;
9727		}
9728		ir_mem_unprotect(entry, size);
9729	}
9730
9731	ret = dasm_encode(&data.dasm_state, entry);
9732	if (ret != DASM_S_OK) {
9733		IR_ASSERT(0);
9734		dasm_free(&data.dasm_state);
9735		if (ctx->code_buffer) {
9736			if (ctx->code_buffer->pos == (char*)entry + size) {
9737				/* rollback */
9738				ctx->code_buffer->pos = (char*)entry - size;
9739			}
9740		} else {
9741			ir_mem_unmap(entry, size);
9742		}
9743		ctx->data = NULL;
9744		ctx->status = IR_ERROR_ENCODE;
9745		return NULL;
9746	}
9747
9748	if (data.jmp_table_label) {
9749		uint32_t offset = dasm_getpclabel(&data.dasm_state, data.jmp_table_label);
9750		ctx->jmp_table_offset = offset;
9751	} else {
9752		ctx->jmp_table_offset = 0;
9753	}
9754	if (data.rodata_label) {
9755		uint32_t offset = dasm_getpclabel(&data.dasm_state, data.rodata_label);
9756		ctx->rodata_offset = offset;
9757	} else {
9758		ctx->rodata_offset = 0;
9759	}
9760
9761	if (ctx->entries_count) {
9762		/* Translate each entry's pc-label into a code offset stored in op3 */
9763		i = ctx->entries_count;
9764		do {
9765			ir_insn *insn = &ctx->ir_base[ctx->entries[--i]];
9766			uint32_t offset = dasm_getpclabel(&data.dasm_state, ctx->cfg_blocks_count + ctx->consts_count + 4 + insn->op3);
9767			insn->op3 = offset;
9768		} while (i != 0);
9769	}
9770
9771	dasm_free(&data.dasm_state);
9772
9773	ir_mem_flush(entry, size);
9774
9775#if defined(__GNUC__)
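	/* if requested, demote the freshly written code from the local cache,
	 * one 64-byte line at a time (CLDEMOTE is hand-encoded as 0F 1C /0) */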
9776	if ((ctx->flags & IR_GEN_CACHE_DEMOTE) && (ctx->mflags & IR_X86_CLDEMOTE)) {
9777		uintptr_t start = (uintptr_t)entry;
9778		uintptr_t p = (uintptr_t)start & ~0x3F;
9779
9780		do {
9781			/* _cldemote(p); */
9782			asm volatile(".byte 0x0f, 0x1c, 0x06" :: "S" (p));
9783			p += 64;
9784		} while (p < start + size);
9785	}
9786#endif
9787
9788	if (!ctx->code_buffer) {
9789		ir_mem_protect(entry, size);
9790	}
9791
9792	ctx->data = NULL;
9793	return entry;
9794}
9795
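/* Emit a group of numbered side-exit stubs: every stub pushes its index
 * within the group and jumps to a common tail that adds first_exit_point to
 * the pushed value and jumps to the shared exit_addr handler, which can then
 * recover the absolute exit-point number from the stack. */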
9796const void *ir_emit_exitgroup(uint32_t first_exit_point, uint32_t exit_points_per_group, const void *exit_addr, ir_code_buffer *code_buffer, size_t *size_ptr)
9797{
9798	void *entry;
9799	size_t size;
9800	uint32_t i;
9801	dasm_State **Dst, *dasm_state;
9802	int ret;
9803
9804	IR_ASSERT(code_buffer);
9805	IR_ASSERT(sizeof(void*) == 4 || IR_MAY_USE_32BIT_ADDR(code_buffer, exit_addr));
9806
9807	Dst = &dasm_state;
9808	dasm_state = NULL;
9809	dasm_init(&dasm_state, DASM_MAXSECTION);
9810	dasm_setupglobal(&dasm_state, dasm_labels, ir_lb_MAX);
9811	dasm_setup(&dasm_state, dasm_actions);
9812
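	/* Each stub is 4 bytes: "push byte i" (2 bytes) plus a short jmp
	 * (2 bytes); the hand-encoded 0xEB displacement skips the remaining
	 * stubs to reach the common tail after the loop. */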
9813	for (i = 0; i < exit_points_per_group - 1; i++) {
9814		|	push byte i
9815		|	.byte 0xeb, (4*(exit_points_per_group-i)-6) // jmp >1
9816	}
9817	|	push byte i
9818	|// 1:
9819	|	add aword [r4], first_exit_point
9820	|	jmp aword &exit_addr
9821
9822	ret = dasm_link(&dasm_state, &size);
9823	if (ret != DASM_S_OK) {
9824		IR_ASSERT(0);
9825		dasm_free(&dasm_state);
9826		return NULL;
9827	}
9828
9829	entry = code_buffer->pos;
9830	entry = (void*)IR_ALIGNED_SIZE(((size_t)(entry)), 16);
9831	if (size > (size_t)((char*)code_buffer->end - (char*)entry)) {
9832		return NULL;
9833	}
9834	code_buffer->pos = (char*)entry + size;
9835
9836	ret = dasm_encode(&dasm_state, entry);
9837	if (ret != DASM_S_OK) {
9838		IR_ASSERT(0);
9839		dasm_free(&dasm_state);
9840		if (code_buffer->pos == (char*)entry + size) {
9841			/* rollback */
9842			code_buffer->pos = (char*)entry - size;
9843		}
9844		return NULL;
9845	}
9846
9847	dasm_free(&dasm_state);
9848
9849	ir_mem_flush(entry, size);
9850
9851	*size_ptr = size;
9852	return entry;
9853}
9854
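/* On x86_64 a jump target may be out of the +/-2GB rel32 range of the code
 * buffer; such targets have to be reached through a thunk. */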
9855bool ir_needs_thunk(ir_code_buffer *code_buffer, void *addr)
9856{
9857	return sizeof(void*) == 8 && !IR_MAY_USE_32BIT_ADDR(code_buffer, addr);
9858}
9859
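/* Emit a minimal jump thunk: on x64 an indirect jmp through an 8-byte
 * address slot placed right after the instruction; on x86 a direct rel32
 * jmp. ir_fix_thunk() relies on this exact layout when patching. */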
9860void *ir_emit_thunk(ir_code_buffer *code_buffer, void *addr, size_t *size_ptr)
9861{
9862	void *entry;
9863	size_t size;
9864	dasm_State **Dst, *dasm_state;
9865	int ret;
9866
9867	Dst = &dasm_state;
9868	dasm_state = NULL;
9869	dasm_init(&dasm_state, DASM_MAXSECTION);
9870	dasm_setupglobal(&dasm_state, dasm_labels, ir_lb_MAX);
9871	dasm_setup(&dasm_state, dasm_actions);
9872
9873	|.code
9874	|.if X64
9875	|	jmp aword [>1]
9876	|1:
9877	|	.aword &addr
9878	|.else
9879	|	jmp &addr
9880	|.endif
9881
9882	ret = dasm_link(&dasm_state, &size);
9883	if (ret != DASM_S_OK) {
9884		IR_ASSERT(0);
9885		dasm_free(&dasm_state);
9886		return NULL;
9887	}
9888
9889	if (size > (size_t)((char*)code_buffer->end - (char*)code_buffer->pos)) {
9890		dasm_free(&dasm_state);
9891		return NULL;
9892	}
9893
9894	entry = code_buffer->pos;
9895	ret = dasm_encode(&dasm_state, entry);
9896	if (ret != DASM_S_OK) {
9897		dasm_free(&dasm_state);
9898		return NULL;
9899	}
9900
9901	*size_ptr = size;
9902	code_buffer->pos = (char*)code_buffer->pos + size;
9903
9904	dasm_free(&dasm_state);
9905	ir_mem_flush(entry, size);
9906
9907	return entry;
9908}
9909
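/* Retarget a thunk emitted by ir_emit_thunk(): on x64 rewrite the 8-byte
 * address slot referenced by the "jmp [rip+disp32]" (FF 25) instruction;
 * on x86 recompute the rel32 displacement of the direct "jmp" (E9). */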
9910void ir_fix_thunk(void *thunk_entry, void *addr)
9911{
9912	unsigned char *code = thunk_entry;
9913	void **addr_ptr;
9914
9915	if (sizeof(void*) == 8) {
9916		int32_t *offset_ptr;
9917
9918		IR_ASSERT(code[0] == 0xff && code[1] == 0x25);
9919		offset_ptr = (int32_t*)(code + 2);
9920		addr_ptr = (void**)(code + 6 + *offset_ptr);
9921		*addr_ptr = addr;
9922	} else {
9923		IR_ASSERT(code[0] == 0xe9);
9924		addr_ptr = (void**)(code + 1);
9925		*addr_ptr = (void*)((unsigned char*)addr - (code + 5));
9926	}
9927}
9928