/*
 * IR - Lightweight JIT Compilation Framework
 * (x86/x86_64 native code generator based on DynAsm)
 * Copyright (C) 2022 Zend by Perforce.
 * Authors: Dmitry Stogov <dmitry@php.net>
 */

|.if X64
|.arch x64
|.else
|.arch x86
|.endif

|.actionlist dasm_actions
|.globals ir_lb
|.section code, cold_code, rodata, jmp_table
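
/* "code" holds the regular (hot) path, "cold_code" collects out-of-line slow
 * paths and side exits, "rodata" holds emitted constants and "jmp_table"
 * holds the jump tables generated for SWITCH. */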

|.define IR_LOOP_ALIGNMENT, 16

#ifdef IR_DEBUG
typedef struct _ir_mem {uint64_t v;} ir_mem;

# define IR_MEM_VAL(loc)            ((loc).v)
#else
typedef uint64_t ir_mem;

# define IR_MEM_VAL(loc)            (loc)
#endif

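/* ir_mem packs a complete x86 addressing mode into 64 bits:
 * bits 0..31 hold the signed 32-bit displacement, bits 32..39 the base
 * register, bits 40..47 the index register and bits 48..55 the scale
 * (1, 2, 4 or 8). */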
#define IR_MEM_OFFSET(loc)          ((int32_t)(IR_MEM_VAL(loc) & 0xffffffff))
#define IR_MEM_BASE(loc)            ((ir_reg)((IR_MEM_VAL(loc) >> 32) & 0xff))
#define IR_MEM_INDEX(loc)           ((ir_reg)((IR_MEM_VAL(loc) >> 40) & 0xff))
#define IR_MEM_SCALE(loc)           ((int32_t)((IR_MEM_VAL(loc) >> 48) & 0xff))

#define IR_MEM_O(addr)            IR_MEM(IR_REG_NONE, addr, IR_REG_NONE, 1)
#define IR_MEM_B(base)            IR_MEM(base, 0, IR_REG_NONE, 1)
#define IR_MEM_BO(base, offset)   IR_MEM(base, offset, IR_REG_NONE, 1)

IR_ALWAYS_INLINE ir_mem IR_MEM(ir_reg base, int32_t offset, ir_reg index, int32_t scale)
{
	ir_mem mem;
	IR_ASSERT(base == IR_REG_NONE || (base >= IR_REG_GP_FIRST && base <= IR_REG_GP_LAST));
	IR_ASSERT(index == IR_REG_NONE || (index >= IR_REG_GP_FIRST && index <= IR_REG_GP_LAST));
	IR_ASSERT(scale == 1 || scale == 2 || scale == 4 || scale == 8);
#ifdef IR_DEBUG
	mem.v =
#else
	mem =
#endif
		((uint64_t)(uint32_t)offset |
		((uint64_t)(uint8_t)base << 32) |
		((uint64_t)(uint8_t)index << 40) |
		((uint64_t)(uint8_t)scale << 48));
	return mem;
}

#define IR_IS_SIGNED_32BIT(val)     ((((intptr_t)(val)) <= 0x7fffffff) && (((intptr_t)(val)) >= (-2147483647 - 1)))
#define IR_IS_SIGNED_NEG_32BIT(val) ((((intptr_t)(val)) <= 0x7fffffff) && (((intptr_t)(val)) >= -2147483647))
#define IR_IS_UNSIGNED_32BIT(val)   (((uintptr_t)(val)) <= 0xffffffff)
#define IR_IS_32BIT(type, val)      (IR_IS_TYPE_SIGNED(type) ? IR_IS_SIGNED_32BIT((val).i64) : IR_IS_UNSIGNED_32BIT((val).u64))
#define IR_IS_FP_ZERO(insn)         ((insn.type == IR_DOUBLE) ? (insn.val.u64 == 0) : (insn.val.u32 == 0))
#define IR_MAY_USE_32BIT_ADDR(code_buffer, addr) \
	((code_buffer) && \
	IR_IS_SIGNED_32BIT((char*)(addr) - (char*)(code_buffer)->start) && \
	IR_IS_SIGNED_32BIT((char*)(addr) - ((char*)(code_buffer)->end)))
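/* IR_MAY_USE_32BIT_ADDR: a signed 32-bit displacement can reach "addr" from
 * any instruction in the code buffer only if the distance from both ends of
 * the buffer fits into 32 bits. */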

#define IR_SPILL_POS_TO_OFFSET(offset) \
	((ctx->flags & IR_USE_FRAME_POINTER) ? \
		((offset) - (ctx->stack_frame_size - ctx->stack_frame_alignment)) : \
		((offset) + ctx->call_stack_size))
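/* IR_SPILL_POS_TO_OFFSET: with a frame pointer, spill slots are addressed at
 * negative offsets from it; otherwise they sit above the outgoing
 * call-argument area, at positive offsets from the stack pointer. */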

|.macro ASM_EXPAND_OP_MEM, MACRO, op, type, op1
||	do {
||		int32_t offset = IR_MEM_OFFSET(op1);
||		int32_t base = IR_MEM_BASE(op1);
||		int32_t index = IR_MEM_INDEX(op1);
||		int32_t scale = IR_MEM_SCALE(op1);
||  	if (index == IR_REG_NONE) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, [offset]
||			} else {
|				MACRO op, type, [Ra(base)+offset]
||			}
||		} else if (scale == 8) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, [Ra(index)*8+offset]
||			} else {
|				MACRO op, type, [Ra(base)+Ra(index)*8+offset]
||			}
||		} else if (scale == 4) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, [Ra(index)*4+offset]
||			} else {
|				MACRO op, type, [Ra(base)+Ra(index)*4+offset]
||			}
||		} else if (scale == 2) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, [Ra(index)*2+offset]
||			} else {
|				MACRO op, type, [Ra(base)+Ra(index)*2+offset]
||			}
||		} else {
||			IR_ASSERT(scale == 1);
||			if (base == IR_REG_NONE) {
|				MACRO op, type, [Ra(index)+offset]
||			} else {
|				MACRO op, type, [Ra(base)+Ra(index)+offset]
||			}
||		}
||	} while (0);
|.endmacro
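
/* In the macros above and below, lines prefixed with "||" expand to plain C
 * code while "|" lines emit DynAsm assembler templates. DynAsm needs each
 * base/index/scale combination spelled out literally, so the C-level if/else
 * chain dispatches over the ir_mem fields at runtime and picks the matching
 * template. */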

|.macro ASM_EXPAND_OP1_MEM, MACRO, op, type, op1, op2
||	do {
||		int32_t offset = IR_MEM_OFFSET(op1);
||		int32_t base = IR_MEM_BASE(op1);
||		int32_t index = IR_MEM_INDEX(op1);
||		int32_t scale = IR_MEM_SCALE(op1);
||  	if (index == IR_REG_NONE) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, [offset], op2
||			} else {
|				MACRO op, type, [Ra(base)+offset], op2
||			}
||		} else if (scale == 8) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, [Ra(index)*8+offset], op2
||			} else {
|				MACRO op, type, [Ra(base)+Ra(index)*8+offset], op2
||			}
||		} else if (scale == 4) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, [Ra(index)*4+offset], op2
||			} else {
|				MACRO op, type, [Ra(base)+Ra(index)*4+offset], op2
||			}
||		} else if (scale == 2) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, [Ra(index)*2+offset], op2
||			} else {
|				MACRO op, type, [Ra(base)+Ra(index)*2+offset], op2
||			}
||		} else {
||			IR_ASSERT(scale == 1);
||			if (base == IR_REG_NONE) {
|				MACRO op, type, [Ra(index)+offset], op2
||			} else {
|				MACRO op, type, [Ra(base)+Ra(index)+offset], op2
||			}
||		}
||	} while (0);
|.endmacro

|.macro ASM_EXPAND_OP2_MEM, MACRO, op, type, op1, op2
||	do {
||		int32_t offset = IR_MEM_OFFSET(op2);
||		int32_t base = IR_MEM_BASE(op2);
||		int32_t index = IR_MEM_INDEX(op2);
||		int32_t scale = IR_MEM_SCALE(op2);
||  	if (index == IR_REG_NONE) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, [offset]
||			} else {
|				MACRO op, type, op1, [Ra(base)+offset]
||			}
||		} else if (scale == 8) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, [Ra(index)*8+offset]
||			} else {
|				MACRO op, type, op1, [Ra(base)+Ra(index)*8+offset]
||			}
||		} else if (scale == 4) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, [Ra(index)*4+offset]
||			} else {
|				MACRO op, type, op1, [Ra(base)+Ra(index)*4+offset]
||			}
||		} else if (scale == 2) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, [Ra(index)*2+offset]
||			} else {
|				MACRO op, type, op1, [Ra(base)+Ra(index)*2+offset]
||			}
||		} else {
||			IR_ASSERT(scale == 1);
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, [Ra(index)+offset]
||			} else {
|				MACRO op, type, op1, [Ra(base)+Ra(index)+offset]
||			}
||		}
||	} while (0);
|.endmacro

|.macro ASM_EXPAND_OP2_MEM_3, MACRO, op, type, op1, op2, op3
||	do {
||		int32_t offset = IR_MEM_OFFSET(op2);
||		int32_t base = IR_MEM_BASE(op2);
||		int32_t index = IR_MEM_INDEX(op2);
||		int32_t scale = IR_MEM_SCALE(op2);
||  	if (index == IR_REG_NONE) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, [offset], op3
||			} else {
|				MACRO op, type, op1, [Ra(base)+offset], op3
||			}
||		} else if (scale == 8) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, [Ra(index)*8+offset], op3
||			} else {
|				MACRO op, type, op1, [Ra(base)+Ra(index)*8+offset], op3
||			}
||		} else if (scale == 4) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, [Ra(index)*4+offset], op3
||			} else {
|				MACRO op, type, op1, [Ra(base)+Ra(index)*4+offset], op3
||			}
||		} else if (scale == 2) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, [Ra(index)*2+offset], op3
||			} else {
|				MACRO op, type, op1, [Ra(base)+Ra(index)*2+offset], op3
||			}
||		} else {
||			IR_ASSERT(scale == 1);
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, [Ra(index)+offset], op3
||			} else {
|				MACRO op, type, op1, [Ra(base)+Ra(index)+offset], op3
||			}
||		}
||	} while (0);
|.endmacro

|.macro ASM_EXPAND_OP3_MEM, MACRO, op, type, op1, op2, op3
||	do {
||		int32_t offset = IR_MEM_OFFSET(op3);
||		int32_t base = IR_MEM_BASE(op3);
||		int32_t index = IR_MEM_INDEX(op3);
||		int32_t scale = IR_MEM_SCALE(op3);
||  	if (index == IR_REG_NONE) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, op2, [offset]
||			} else {
|				MACRO op, type, op1, op2, [Ra(base)+offset]
||			}
||		} else if (scale == 8) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, op2, [Ra(index)*8+offset]
||			} else {
|				MACRO op, type, op1, op2, [Ra(base)+Ra(index)*8+offset]
||			}
||		} else if (scale == 4) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, op2, [Ra(index)*4+offset]
||			} else {
|				MACRO op, type, op1, op2, [Ra(base)+Ra(index)*4+offset]
||			}
||		} else if (scale == 2) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, op2, [Ra(index)*2+offset]
||			} else {
|				MACRO op, type, op1, op2, [Ra(base)+Ra(index)*2+offset]
||			}
||		} else {
||			IR_ASSERT(scale == 1);
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, op2, [Ra(index)+offset]
||			} else {
|				MACRO op, type, op1, op2, [Ra(base)+Ra(index)+offset]
||			}
||		}
||	} while (0);
|.endmacro

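/* The ASM_EXPAND_TYPE_* helpers select the operand-size keyword (byte/word/
 * dword/qword) or the matching sub-register (Rb/Rw/Rd/Rq) from ir_type_size[];
 * the 8-byte cases exist only on x64. */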
|.macro ASM_EXPAND_TYPE_MEM, op, type, op1
||	switch (ir_type_size[type]) {
|| 		default:
||			IR_ASSERT(0);
||		case 1:
|			op byte op1
||			break;
||		case 2:
|			op word op1
|| 			break;
||		case 4:
|			op dword op1
|| 			break;
|.if X64
||		case 8:
|			op qword op1
|| 			break;
|.endif
||	}
|.endmacro

|.macro ASM_EXPAND_TYPE_MEM_REG, op, type, op1, op2
||	switch (ir_type_size[type]) {
|| 		default:
||			IR_ASSERT(0);
||		case 1:
|			op byte op1, Rb(op2)
||			break;
||		case 2:
|			op word op1, Rw(op2)
|| 			break;
||		case 4:
|			op dword op1, Rd(op2)
|| 			break;
|.if X64
||		case 8:
|			op qword op1, Rq(op2)
|| 			break;
|.endif
||	}
|.endmacro

|.macro ASM_EXPAND_TYPE_MEM_TXT, op, type, op1, op2
||	switch (ir_type_size[type]) {
|| 		default:
||			IR_ASSERT(0);
||		case 1:
|			op byte op1, op2
||			break;
||		case 2:
|			op word op1, op2
|| 			break;
||		case 4:
|			op dword op1, op2
|| 			break;
|.if X64
||		case 8:
|			op qword op1, op2
|| 			break;
|.endif
||	}
|.endmacro

|.macro ASM_EXPAND_TYPE_MEM_IMM, op, type, op1, op2
||	switch (ir_type_size[type]) {
|| 		default:
||			IR_ASSERT(0);
||		case 1:
|			op byte op1, (op2 & 0xff)
||			break;
||		case 2:
|			op word op1, (op2 & 0xffff)
|| 			break;
||		case 4:
|			op dword op1, op2
|| 			break;
|.if X64
||		case 8:
|			op qword op1, op2
|| 			break;
|.endif
||	}
|.endmacro

|.macro ASM_EXPAND_TYPE_REG_MEM, op, type, op1, op2
||	switch (ir_type_size[type]) {
|| 		default:
||			IR_ASSERT(0);
||		case 1:
|			op Rb(op1), byte op2
||			break;
||		case 2:
|			op Rw(op1), word op2
|| 			break;
||		case 4:
|			op Rd(op1), dword op2
|| 			break;
|.if X64
||		case 8:
|			op Rq(op1), qword op2
|| 			break;
|.endif
||	}
|.endmacro

|.macro ASM_TMEM_OP, op, type, op1
||	do {
||		int32_t offset = IR_MEM_OFFSET(op1);
||		int32_t base = IR_MEM_BASE(op1);
||		int32_t index = IR_MEM_INDEX(op1);
||		int32_t scale = IR_MEM_SCALE(op1);
||  	if (index == IR_REG_NONE) {
||			if (base == IR_REG_NONE) {
|				op type [offset]
||			} else {
|				op type [Ra(base)+offset]
||			}
||		} else if (scale == 8) {
||			if (base == IR_REG_NONE) {
|				op type [Ra(index)*8+offset]
||			} else {
|				op type [Ra(base)+Ra(index)*8+offset]
||			}
||		} else if (scale == 4) {
||			if (base == IR_REG_NONE) {
|				op type [Ra(index)*4+offset]
||			} else {
|				op type [Ra(base)+Ra(index)*4+offset]
||			}
||		} else if (scale == 2) {
||			if (base == IR_REG_NONE) {
|				op type [Ra(index)*2+offset]
||			} else {
|				op type [Ra(base)+Ra(index)*2+offset]
||			}
||		} else {
||			IR_ASSERT(scale == 1);
||			if (base == IR_REG_NONE) {
|				op type [Ra(index)+offset]
||			} else {
|				op type [Ra(base)+Ra(index)+offset]
||			}
||		}
||	} while (0);
|.endmacro

|.macro ASM_TXT_TMEM_OP, op, op1, type, op2
||	do {
||		int32_t offset = IR_MEM_OFFSET(op2);
||		int32_t base = IR_MEM_BASE(op2);
||		int32_t index = IR_MEM_INDEX(op2);
||		int32_t scale = IR_MEM_SCALE(op2);
||  	if (index == IR_REG_NONE) {
||			if (base == IR_REG_NONE) {
|				op op1, type [offset]
||			} else {
|				op op1, type [Ra(base)+offset]
||			}
||		} else if (scale == 8) {
||			if (base == IR_REG_NONE) {
|				op op1, type [Ra(index)*8+offset]
||			} else {
|				op op1, type [Ra(base)+Ra(index)*8+offset]
||			}
||		} else if (scale == 4) {
||			if (base == IR_REG_NONE) {
|				op op1, type [Ra(index)*4+offset]
||			} else {
|				op op1, type [Ra(base)+Ra(index)*4+offset]
||			}
||		} else if (scale == 2) {
||			if (base == IR_REG_NONE) {
|				op op1, type [Ra(index)*2+offset]
||			} else {
|				op op1, type [Ra(base)+Ra(index)*2+offset]
||			}
||		} else {
||			IR_ASSERT(scale == 1);
||			if (base == IR_REG_NONE) {
|				op op1, type [Ra(index)+offset]
||			} else {
|				op op1, type [Ra(base)+Ra(index)+offset]
||			}
||		}
||	} while (0);
|.endmacro

|.macro ASM_TMEM_TXT_OP, op, type, op1, op2
||	do {
||		int32_t offset = IR_MEM_OFFSET(op1);
||		int32_t base = IR_MEM_BASE(op1);
||		int32_t index = IR_MEM_INDEX(op1);
||		int32_t scale = IR_MEM_SCALE(op1);
||  	if (index == IR_REG_NONE) {
||			if (base == IR_REG_NONE) {
|				op type [offset], op2
||			} else {
|				op type [Ra(base)+offset], op2
||			}
||		} else if (scale == 8) {
||			if (base == IR_REG_NONE) {
|				op type [Ra(index)*8+offset], op2
||			} else {
|				op type [Ra(base)+Ra(index)*8+offset], op2
||			}
||		} else if (scale == 4) {
||			if (base == IR_REG_NONE) {
|				op type [Ra(index)*4+offset], op2
||			} else {
|				op type [Ra(base)+Ra(index)*4+offset], op2
||			}
||		} else if (scale == 2) {
||			if (base == IR_REG_NONE) {
|				op type [Ra(index)*2+offset], op2
||			} else {
|				op type [Ra(base)+Ra(index)*2+offset], op2
||			}
||		} else {
||			IR_ASSERT(scale == 1);
||			if (base == IR_REG_NONE) {
|				op type [Ra(index)+offset], op2
||			} else {
|				op type [Ra(base)+Ra(index)+offset], op2
||			}
||		}
||	} while (0);
|.endmacro

|.macro ASM_TXT_TXT_TMEM_OP, op, op1, op2, type, op3
||	do {
||		int32_t offset = IR_MEM_OFFSET(op3);
||		int32_t base = IR_MEM_BASE(op3);
||		int32_t index = IR_MEM_INDEX(op3);
||		int32_t scale = IR_MEM_SCALE(op3);
||  	if (index == IR_REG_NONE) {
||			if (base == IR_REG_NONE) {
|				op op1, op2, type [offset]
||			} else {
|				op op1, op2, type [Ra(base)+offset]
||			}
||		} else if (scale == 8) {
||			if (base == IR_REG_NONE) {
|				op op1, op2, type [Ra(index)*8+offset]
||			} else {
|				op op1, op2, type [Ra(base)+Ra(index)*8+offset]
||			}
||		} else if (scale == 4) {
||			if (base == IR_REG_NONE) {
|				op op1, op2, type [Ra(index)*4+offset]
||			} else {
|				op op1, op2, type [Ra(base)+Ra(index)*4+offset]
||			}
||		} else if (scale == 2) {
||			if (base == IR_REG_NONE) {
|				op op1, op2, type [Ra(index)*2+offset]
||			} else {
|				op op1, op2, type [Ra(base)+Ra(index)*2+offset]
||			}
||		} else {
||			IR_ASSERT(scale == 1);
||			if (base == IR_REG_NONE) {
|				op op1, op2, type [Ra(index)+offset]
||			} else {
|				op op1, op2, type [Ra(base)+Ra(index)+offset]
||			}
||		}
||	} while (0);
|.endmacro

|.macro ASM_REG_OP, op, type, op1
||	switch (ir_type_size[type]) {
|| 		default:
||			IR_ASSERT(0);
||		case 1:
|			op Rb(op1)
||			break;
||		case 2:
|			op Rw(op1)
|| 			break;
||		case 4:
|			op Rd(op1)
|| 			break;
|.if X64
||		case 8:
|			op Rq(op1)
|| 			break;
|.endif
||	}
|.endmacro

|.macro ASM_MEM_OP, op, type, op1
|	ASM_EXPAND_OP_MEM ASM_EXPAND_TYPE_MEM, op, type, op1
|.endmacro

|.macro ASM_REG_REG_OP, op, type, op1, op2
||	switch (ir_type_size[type]) {
|| 		default:
||			IR_ASSERT(0);
||		case 1:
|			op Rb(op1), Rb(op2)
||			break;
||		case 2:
|			op Rw(op1), Rw(op2)
|| 			break;
||		case 4:
|			op Rd(op1), Rd(op2)
|| 			break;
|.if X64
||		case 8:
|			op Rq(op1), Rq(op2)
|| 			break;
|.endif
||	}
|.endmacro

|.macro ASM_REG_REG_OP2, op, type, op1, op2
||	switch (ir_type_size[type]) {
|| 		default:
||			IR_ASSERT(0);
||		case 1:
||		case 2:
|			op Rw(op1), Rw(op2)
|| 			break;
||		case 4:
|			op Rd(op1), Rd(op2)
|| 			break;
|.if X64
||		case 8:
|			op Rq(op1), Rq(op2)
|| 			break;
|.endif
||	}
|.endmacro

|.macro ASM_REG_TXT_OP, op, type, op1, op2
||	switch (ir_type_size[type]) {
|| 		default:
||			IR_ASSERT(0);
||		case 1:
|			op Rb(op1), op2
||			break;
||		case 2:
|			op Rw(op1), op2
|| 			break;
||		case 4:
|			op Rd(op1), op2
|| 			break;
|.if X64
||		case 8:
|			op Rq(op1), op2
|| 			break;
|.endif
||	}
|.endmacro

|.macro ASM_REG_IMM_OP, op, type, op1, op2
||	switch (ir_type_size[type]) {
|| 		default:
||			IR_ASSERT(0);
||		case 1:
|			op Rb(op1), (op2 & 0xff)
||			break;
||		case 2:
|			op Rw(op1), (op2 & 0xffff)
|| 			break;
||		case 4:
|			op Rd(op1), op2
|| 			break;
|.if X64
||		case 8:
|			op Rq(op1), op2
|| 			break;
|.endif
||	}
|.endmacro

|.macro ASM_MEM_REG_OP, op, type, op1, op2
|	ASM_EXPAND_OP1_MEM ASM_EXPAND_TYPE_MEM_REG, op, type, op1, op2
|.endmacro

|.macro ASM_MEM_TXT_OP, op, type, op1, op2
|	ASM_EXPAND_OP1_MEM ASM_EXPAND_TYPE_MEM_TXT, op, type, op1, op2
|.endmacro

|.macro ASM_MEM_IMM_OP, op, type, op1, op2
|	ASM_EXPAND_OP1_MEM ASM_EXPAND_TYPE_MEM_IMM, op, type, op1, op2
|.endmacro

|.macro ASM_REG_MEM_OP, op, type, op1, op2
|	ASM_EXPAND_OP2_MEM ASM_REG_TXT_OP, op, type, op1, op2
|.endmacro

|.macro ASM_REG_REG_MUL, op, type, op1, op2
||	switch (ir_type_size[type]) {
|| 		default:
||			IR_ASSERT(0);
||		case 2:
|			op Rw(op1), Rw(op2)
|| 			break;
||		case 4:
|			op Rd(op1), Rd(op2)
|| 			break;
|.if X64
||		case 8:
|			op Rq(op1), Rq(op2)
|| 			break;
|.endif
||	}
|.endmacro

|.macro ASM_REG_IMM_MUL, op, type, op1, op2
||	switch (ir_type_size[type]) {
|| 		default:
||			IR_ASSERT(0);
||		case 2:
|			op Rw(op1), op2
|| 			break;
||		case 4:
|			op Rd(op1), op2
|| 			break;
|.if X64
||		case 8:
|			op Rq(op1), op2
|| 			break;
|.endif
||	}
|.endmacro

|.macro ASM_REG_TXT_MUL, op, type, op1, op2
||	switch (ir_type_size[type]) {
|| 		default:
||			IR_ASSERT(0);
||		case 2:
|			op Rw(op1), op2
|| 			break;
||		case 4:
|			op Rd(op1), op2
|| 			break;
|.if X64
||		case 8:
|			op Rq(op1), op2
|| 			break;
|.endif
||	}
|.endmacro

|.macro ASM_REG_MEM_MUL, op, type, op1, op2
|	ASM_EXPAND_OP2_MEM ASM_REG_TXT_MUL, op, type, op1, op2
|.endmacro

|.macro ASM_REG_TXT_TXT_MUL, op, type, op1, op2, op3
||	switch (ir_type_size[type]) {
|| 		default:
||			IR_ASSERT(0);
||		case 2:
|			op Rw(op1), op2, op3
|| 			break;
||		case 4:
|			op Rd(op1), op2, op3
|| 			break;
|.if X64
||		case 8:
|			op Rq(op1), op2, op3
|| 			break;
|.endif
||	}
|.endmacro

|.macro ASM_REG_MEM_TXT_MUL, op, type, op1, op2, op3
|	ASM_EXPAND_OP2_MEM_3 ASM_REG_TXT_TXT_MUL, imul, type, op1, op2, op3
|.endmacro

|.macro ASM_SSE2_REG_REG_OP, op, type, op1, op2
||	if (type == IR_DOUBLE) {
|		op..d xmm(op1-IR_REG_FP_FIRST), xmm(op2-IR_REG_FP_FIRST)
||	} else {
||		IR_ASSERT(type == IR_FLOAT);
|		op..s xmm(op1-IR_REG_FP_FIRST), xmm(op2-IR_REG_FP_FIRST)
||	}
|.endmacro

|.macro ASM_SSE2_REG_TXT_OP, op, type, op1, op2
||	if (type == IR_DOUBLE) {
|		op..d xmm(op1-IR_REG_FP_FIRST), qword op2
||	} else {
||		IR_ASSERT(type == IR_FLOAT);
|		op..s xmm(op1-IR_REG_FP_FIRST), dword op2
||	}
|.endmacro

|.macro ASM_SSE2_REG_MEM_OP, op, type, op1, op2
|	ASM_EXPAND_OP2_MEM ASM_SSE2_REG_TXT_OP, op, type, op1, op2
|.endmacro

|.macro ASM_AVX_REG_REG_REG_OP, op, type, op1, op2, op3
||	if (type == IR_DOUBLE) {
|		op..d xmm(op1-IR_REG_FP_FIRST), xmm(op2-IR_REG_FP_FIRST), xmm(op3-IR_REG_FP_FIRST)
||	} else {
||		IR_ASSERT(type == IR_FLOAT);
|		op..s xmm(op1-IR_REG_FP_FIRST), xmm(op2-IR_REG_FP_FIRST), xmm(op3-IR_REG_FP_FIRST)
||	}
|.endmacro

|.macro ASM_AVX_REG_REG_TXT_OP, op, type, op1, op2, op3
||	if (type == IR_DOUBLE) {
|		op..d xmm(op1-IR_REG_FP_FIRST), xmm(op2-IR_REG_FP_FIRST), qword op3
||	} else {
||		IR_ASSERT(type == IR_FLOAT);
|		op..s xmm(op1-IR_REG_FP_FIRST), xmm(op2-IR_REG_FP_FIRST), dword op3
||	}
|.endmacro

|.macro ASM_AVX_REG_REG_MEM_OP, op, type, op1, op2, op3
|	ASM_EXPAND_OP3_MEM ASM_AVX_REG_REG_TXT_OP, op, type, op1, op2, op3
|.endmacro

|.macro ASM_FP_REG_REG_OP, op, type, op1, op2
||	if (ctx->mflags & IR_X86_AVX) {
|		ASM_SSE2_REG_REG_OP v..op, type, op1, op2
||	} else {
|		ASM_SSE2_REG_REG_OP op, type, op1, op2
||	}
|.endmacro

|.macro ASM_FP_TXT_REG_OP, op, type, dst, src
||	if (type == IR_DOUBLE) {
||		if (ctx->mflags & IR_X86_AVX) {
|			v..op..d qword dst, xmm(src-IR_REG_FP_FIRST)
||		} else {
|			op..d qword dst, xmm(src-IR_REG_FP_FIRST)
||		}
||	} else {
||		IR_ASSERT(type == IR_FLOAT);
||		if (ctx->mflags & IR_X86_AVX) {
|			v..op..s dword dst, xmm(src-IR_REG_FP_FIRST)
||		} else {
|			op..s dword dst, xmm(src-IR_REG_FP_FIRST)
||		}
||	}
|.endmacro

|.macro ASM_FP_MEM_REG_OP, op, type, op1, op2
|	ASM_EXPAND_OP1_MEM ASM_FP_TXT_REG_OP, op, type, op1, op2
|.endmacro

|.macro ASM_FP_REG_TXT_OP, op, type, op1, op2
||	if (ctx->mflags & IR_X86_AVX) {
|		ASM_SSE2_REG_TXT_OP v..op, type, op1, op2
||	} else {
|		ASM_SSE2_REG_TXT_OP op, type, op1, op2
||	}
|.endmacro

|.macro ASM_FP_REG_MEM_OP, op, type, op1, op2
||	if (ctx->mflags & IR_X86_AVX) {
|		ASM_SSE2_REG_MEM_OP v..op, type, op1, op2
||	} else {
|		ASM_SSE2_REG_MEM_OP op, type, op1, op2
||	}
|.endmacro

|.macro ASM_SSE2_REG_REG_TXT_OP, op, type, op1, op2, op3
||	if (type == IR_DOUBLE) {
|		op..d xmm(op1-IR_REG_FP_FIRST), xmm(op2-IR_REG_FP_FIRST), op3
||	} else {
||		IR_ASSERT(type == IR_FLOAT);
|		op..s xmm(op1-IR_REG_FP_FIRST), xmm(op2-IR_REG_FP_FIRST), op3
||	}
|.endmacro

|.macro ASM_SSE2_REG_REG_REG_TXT_OP, op, type, op1, op2, op3, op4
||	if (type == IR_DOUBLE) {
|		op..d xmm(op1-IR_REG_FP_FIRST), xmm(op2-IR_REG_FP_FIRST), xmm(op3-IR_REG_FP_FIRST), op4
||	} else {
||		IR_ASSERT(type == IR_FLOAT);
|		op..s xmm(op1-IR_REG_FP_FIRST), xmm(op2-IR_REG_FP_FIRST), xmm(op3-IR_REG_FP_FIRST), op4
||	}
|.endmacro

|.macro ASM_FP_REG_REG_TXT_OP, op, type, op1, op2, op3
||	if (ctx->mflags & IR_X86_AVX) {
|		ASM_SSE2_REG_REG_REG_TXT_OP v..op, type, op1, op2, op3
||	} else {
|		ASM_SSE2_REG_REG_TXT_OP op, type, op1, op2, op3
||	}
|.endmacro

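/* Per-compilation backend state. The *_const flags record which FP constants
 * (negation/abs sign masks and zero) are needed, so they can be emitted once
 * into the rodata section. */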
typedef struct _ir_backend_data {
	ir_reg_alloc_data  ra_data;
	uint32_t           dessa_from_block;
	dasm_State        *dasm_state;
	ir_bitset          emit_constants;
	int                rodata_label, jmp_table_label;
	bool               double_neg_const;
	bool               float_neg_const;
	bool               double_abs_const;
	bool               float_abs_const;
	bool               double_zero_const;
} ir_backend_data;

#define IR_GP_REG_NAME(code, name64, name32, name16, name8, name8h) \
	#name64,
#define IR_GP_REG_NAME32(code, name64, name32, name16, name8, name8h) \
	#name32,
#define IR_GP_REG_NAME16(code, name64, name32, name16, name8, name8h) \
	#name16,
#define IR_GP_REG_NAME8(code, name64, name32, name16, name8, name8h) \
	#name8,
#define IR_FP_REG_NAME(code, name) \
	#name,

static const char *_ir_reg_name[IR_REG_NUM] = {
	IR_GP_REGS(IR_GP_REG_NAME)
	IR_FP_REGS(IR_FP_REG_NAME)
};

static const char *_ir_reg_name32[IR_REG_NUM] = {
	IR_GP_REGS(IR_GP_REG_NAME32)
};

static const char *_ir_reg_name16[IR_REG_NUM] = {
	IR_GP_REGS(IR_GP_REG_NAME16)
};

static const char *_ir_reg_name8[IR_REG_NUM] = {
	IR_GP_REGS(IR_GP_REG_NAME8)
};

/* Calling Convention */
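/* Windows x64 passes the first four integer and the first four FP arguments
 * in registers; the System V x86_64 ABI uses six integer and eight SSE
 * registers. On 32-bit x86 only fastcall passes (two) integer arguments in
 * registers. */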
#ifdef _WIN64

static const int8_t _ir_int_reg_params[IR_REG_INT_ARGS] = {
	IR_REG_INT_ARG1,
	IR_REG_INT_ARG2,
	IR_REG_INT_ARG3,
	IR_REG_INT_ARG4,
};

static const int8_t _ir_fp_reg_params[IR_REG_FP_ARGS] = {
	IR_REG_FP_ARG1,
	IR_REG_FP_ARG2,
	IR_REG_FP_ARG3,
	IR_REG_FP_ARG4,
};

#elif defined(IR_TARGET_X64)

static const int8_t _ir_int_reg_params[IR_REG_INT_ARGS] = {
	IR_REG_INT_ARG1,
	IR_REG_INT_ARG2,
	IR_REG_INT_ARG3,
	IR_REG_INT_ARG4,
	IR_REG_INT_ARG5,
	IR_REG_INT_ARG6,
};

static const int8_t _ir_fp_reg_params[IR_REG_FP_ARGS] = {
	IR_REG_FP_ARG1,
	IR_REG_FP_ARG2,
	IR_REG_FP_ARG3,
	IR_REG_FP_ARG4,
	IR_REG_FP_ARG5,
	IR_REG_FP_ARG6,
	IR_REG_FP_ARG7,
	IR_REG_FP_ARG8,
};

#else

static const int8_t *_ir_int_reg_params = NULL;
static const int8_t *_ir_fp_reg_params = NULL;
static const int8_t _ir_int_fc_reg_params[IR_REG_INT_FCARGS] = {
	IR_REG_INT_FCARG1,
	IR_REG_INT_FCARG2,
};
static const int8_t *_ir_fp_fc_reg_params = NULL;

#endif

const char *ir_reg_name(int8_t reg, ir_type type)
{
	if (reg >= IR_REG_NUM) {
		if (reg == IR_REG_SCRATCH) {
			return "SCRATCH";
		} else {
			IR_ASSERT(reg == IR_REG_ALL);
			return "ALL";
		}
	}
	IR_ASSERT(reg >= 0 && reg < IR_REG_NUM);
	if (type == IR_VOID) {
		type = (reg < IR_REG_FP_FIRST) ? IR_ADDR : IR_DOUBLE;
	}
	if (IR_IS_TYPE_FP(type) || ir_type_size[type] == 8) {
		return _ir_reg_name[reg];
	} else if (ir_type_size[type] == 4) {
		return _ir_reg_name32[reg];
	} else if (ir_type_size[type] == 2) {
		return _ir_reg_name16[reg];
	} else {
		IR_ASSERT(ir_type_size[type] == 1);
		return _ir_reg_name8[reg];
	}
}

#define IR_RULES(_)        \
	_(CMP_INT)             \
	_(CMP_FP)              \
	_(MUL_INT)             \
	_(DIV_INT)             \
	_(MOD_INT)             \
	_(TEST_INT)            \
	_(SETCC_INT)           \
	_(TESTCC_INT)          \
	_(LEA_OB)              \
	_(LEA_SI)              \
	_(LEA_SIB)             \
	_(LEA_IB)              \
	_(LEA_SI_O)            \
	_(LEA_SIB_O)           \
	_(LEA_IB_O)            \
	_(LEA_I_OB)            \
	_(LEA_OB_I)            \
	_(LEA_OB_SI)           \
	_(LEA_SI_OB)           \
	_(LEA_B_SI)            \
	_(LEA_SI_B)            \
	_(INC)                 \
	_(DEC)                 \
	_(MUL_PWR2)            \
	_(DIV_PWR2)            \
	_(MOD_PWR2)            \
	_(SDIV_PWR2)           \
	_(SMOD_PWR2)           \
	_(BOOL_NOT_INT)        \
	_(ABS_INT)             \
	_(OP_INT)              \
	_(OP_FP)               \
	_(IMUL3)               \
	_(BINOP_INT)           \
	_(BINOP_SSE2)          \
	_(BINOP_AVX)           \
	_(SHIFT)               \
	_(SHIFT_CONST)         \
	_(COPY_INT)            \
	_(COPY_FP)             \
	_(CMP_AND_STORE_INT)   \
	_(CMP_AND_BRANCH_INT)  \
	_(CMP_AND_BRANCH_FP)   \
	_(TEST_AND_BRANCH_INT) \
	_(JCC_INT)             \
	_(COND_CMP_INT)        \
	_(COND_CMP_FP)         \
	_(GUARD_CMP_INT)       \
	_(GUARD_CMP_FP)        \
	_(GUARD_TEST_INT)      \
	_(GUARD_JCC_INT)       \
	_(GUARD_OVERFLOW)      \
	_(OVERFLOW_AND_BRANCH) \
	_(MIN_MAX_INT)         \
	_(MEM_OP_INT)          \
	_(MEM_INC)             \
	_(MEM_DEC)             \
	_(MEM_MUL_PWR2)        \
	_(MEM_DIV_PWR2)        \
	_(MEM_MOD_PWR2)        \
	_(MEM_BINOP_INT)       \
	_(MEM_SHIFT)           \
	_(MEM_SHIFT_CONST)     \
	_(REG_BINOP_INT)       \
	_(VSTORE_INT)          \
	_(VSTORE_FP)           \
	_(LOAD_INT)            \
	_(LOAD_FP)             \
	_(STORE_INT)           \
	_(STORE_FP)            \
	_(IF_INT)              \
	_(RETURN_VOID)         \
	_(RETURN_INT)          \
	_(RETURN_FP)           \
	_(BIT_COUNT)           \
	_(SSE_SQRT)            \
	_(SSE_RINT)            \
	_(SSE_FLOOR)           \
	_(SSE_CEIL)            \
	_(SSE_TRUNC)           \
	_(SSE_NEARBYINT)       \

#define IR_RULE_ENUM(name) IR_ ## name,

#define IR_STATIC_ALLOCA   (IR_SKIPPED | IR_FUSED | IR_SIMPLE | IR_ALLOCA)

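/* Backend rules continue the opcode numbering after IR_LAST_OP, so an entry
 * in ctx->rules[] can hold either a generic IR opcode or a backend-specific
 * rule without the two ranges colliding. */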
enum _ir_rule {
	IR_FIRST_RULE = IR_LAST_OP,
	IR_RULES(IR_RULE_ENUM)
	IR_LAST_RULE
};

#define IR_RULE_NAME(name)  #name,
const char *ir_rule_name[IR_LAST_OP] = {
	NULL,
	IR_RULES(IR_RULE_NAME)
	NULL
};

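/* A constant address can be folded into an addressing mode only when it fits
 * into a sign-extended 32-bit displacement (always the case on 32-bit
 * targets). */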
static bool ir_may_fuse_addr(ir_ctx *ctx, const ir_insn *addr_insn)
{
	if (sizeof(void*) == 4) {
		return 1;
	} else if (IR_IS_SYM_CONST(addr_insn->op)) {
		void *addr = ir_sym_addr(ctx, addr_insn);

		if (!addr) {
			return 0;
		}
		return IR_IS_SIGNED_32BIT((int64_t)(intptr_t)addr);
	} else {
		return IR_IS_SIGNED_32BIT(addr_insn->val.i64);
	}
}

static bool ir_may_fuse_imm(ir_ctx *ctx, const ir_insn *val_insn)
{
	if (val_insn->type == IR_ADDR) {
		if (sizeof(void*) == 4) {
			return 1;
		} else if (IR_IS_SYM_CONST(val_insn->op)) {
			void *addr = ir_sym_addr(ctx, val_insn);

			if (!addr) {
				return 0;
			}
			return IR_IS_SIGNED_32BIT((intptr_t)addr);
		} else {
			return IR_IS_SIGNED_32BIT(val_insn->val.i64);
		}
	} else {
		return (ir_type_size[val_insn->type] <= 4 || IR_IS_SIGNED_32BIT(val_insn->val.i64));
	}
}

/* register allocation */
static int ir_add_const_tmp_reg(ir_ctx *ctx, ir_ref ref, uint32_t num, int n, ir_target_constraints *constraints)
{
	IR_ASSERT(IR_IS_CONST_REF(ref));
	const ir_insn *val_insn = &ctx->ir_base[ref];

	if (!ir_may_fuse_imm(ctx, val_insn)) {
		constraints->tmp_regs[n] = IR_TMP_REG(num, val_insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
		n++;
	}
	return n;
}
int ir_get_target_constraints(ir_ctx *ctx, ir_ref ref, ir_target_constraints *constraints)
{
	uint32_t rule = ir_rule(ctx, ref);
	const ir_insn *insn;
	int n = 0;
	int flags = IR_USE_MUST_BE_IN_REG | IR_OP1_MUST_BE_IN_REG | IR_OP2_MUST_BE_IN_REG | IR_OP3_MUST_BE_IN_REG;

	constraints->def_reg = IR_REG_NONE;
	constraints->hints_count = 0;
	switch (rule & IR_RULE_MASK) {
		case IR_BINOP_INT:
			insn = &ctx->ir_base[ref];
			if (rule & IR_FUSED) {
				if (ctx->ir_base[insn->op1].op == IR_RLOAD) {
					flags = IR_OP1_MUST_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG;
				} else {
					flags = IR_OP2_MUST_BE_IN_REG;
				}
			} else {
				flags = IR_DEF_REUSES_OP1_REG | IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG;
			}
			if (IR_IS_CONST_REF(insn->op2) && insn->op1 != insn->op2) {
				n = ir_add_const_tmp_reg(ctx, insn->op2, 2, n, constraints);
			} else if (ir_rule(ctx, insn->op2) == IR_STATIC_ALLOCA) {
				constraints->tmp_regs[n] = IR_TMP_REG(2, IR_ADDR, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n++;
			}
			break;
		case IR_IMUL3:
			flags = IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG;
			break;
		case IR_SHIFT:
			if (rule & IR_FUSED) {
				flags = IR_OP2_MUST_BE_IN_REG;
			} else {
				flags = IR_DEF_REUSES_OP1_REG | IR_DEF_CONFLICTS_WITH_INPUT_REGS | IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG;
			}
			constraints->hints[1] = IR_REG_NONE;
			constraints->hints[2] = IR_REG_RCX;
			constraints->hints_count = 3;
			constraints->tmp_regs[0] = IR_SCRATCH_REG(IR_REG_RCX, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
			n = 1;
			break;
		case IR_MUL_INT:
			/* %rax - used as input and result */
			constraints->def_reg = IR_REG_RAX;
			constraints->hints[1] = IR_REG_RAX;
			constraints->hints_count = 2;
			flags = IR_USE_SHOULD_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG;
			constraints->tmp_regs[0] = IR_SCRATCH_REG(IR_REG_RDX, IR_USE_SUB_REF, IR_DEF_SUB_REF);
			constraints->tmp_regs[1] = IR_SCRATCH_REG(IR_REG_RAX, IR_LOAD_SUB_REF, IR_SAVE_SUB_REF);
			n = 2;
			break;
		case IR_DIV_INT:
			/* %rax - used as input and result */
			constraints->def_reg = IR_REG_RAX;
			constraints->hints[1] = IR_REG_RAX;
			constraints->hints_count = 2;
			flags = IR_USE_SHOULD_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG;
			constraints->tmp_regs[0] = IR_SCRATCH_REG(IR_REG_RDX, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
			constraints->tmp_regs[1] = IR_SCRATCH_REG(IR_REG_RAX, IR_LOAD_SUB_REF, IR_SAVE_SUB_REF);
			n = 2;
			goto op2_const;
		case IR_MOD_INT:
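			/* x86 div/idiv leave the quotient in %rax and the remainder in %rdx */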
			constraints->def_reg = IR_REG_RDX;
			constraints->hints[1] = IR_REG_RAX;
			constraints->hints_count = 2;
			flags = IR_USE_SHOULD_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG;
			constraints->tmp_regs[0] = IR_SCRATCH_REG(IR_REG_RAX, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
			constraints->tmp_regs[1] = IR_SCRATCH_REG(IR_REG_RDX, IR_LOAD_SUB_REF, IR_SAVE_SUB_REF);
			n = 2;
			goto op2_const;
		case IR_MIN_MAX_INT:
			flags = IR_DEF_REUSES_OP1_REG | IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG | IR_OP2_MUST_BE_IN_REG;
op2_const:
			insn = &ctx->ir_base[ref];
			if (IR_IS_CONST_REF(insn->op2) && insn->op1 != insn->op2) {
				constraints->tmp_regs[n] = IR_TMP_REG(2, insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n++;
			}
			break;
		case IR_CMP_INT:
		case IR_TEST_INT:
			insn = &ctx->ir_base[ref];
			flags = IR_USE_MUST_BE_IN_REG | IR_OP1_MUST_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG;
			if (IR_IS_CONST_REF(insn->op1)) {
				const ir_insn *val_insn = &ctx->ir_base[insn->op1];
				constraints->tmp_regs[0] = IR_TMP_REG(1, val_insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n = 1;
			} else if (ir_rule(ctx, insn->op1) == IR_STATIC_ALLOCA) {
				constraints->tmp_regs[0] = IR_TMP_REG(1, IR_ADDR, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n = 1;
			} else if (ir_rule(ctx, insn->op1) & IR_FUSED) {
				flags = IR_USE_MUST_BE_IN_REG | IR_OP2_MUST_BE_IN_REG;
			}
			if (IR_IS_CONST_REF(insn->op2) && insn->op1 != insn->op2) {
				flags = IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG;
				n = ir_add_const_tmp_reg(ctx, insn->op2, 2, n, constraints);
			} else if (ir_rule(ctx, insn->op2) == IR_STATIC_ALLOCA) {
				constraints->tmp_regs[n] = IR_TMP_REG(2, IR_ADDR, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n++;
			}
			break;
		case IR_CMP_FP:
			insn = &ctx->ir_base[ref];
			if (!(rule & IR_FUSED)) {
				constraints->tmp_regs[0] = IR_TMP_REG(3, IR_BOOL, IR_DEF_SUB_REF, IR_SAVE_SUB_REF);
				n = 1;
			}
			flags = IR_USE_MUST_BE_IN_REG | IR_OP1_MUST_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG;
			if (IR_IS_CONST_REF(insn->op1)) {
				const ir_insn *val_insn = &ctx->ir_base[insn->op1];
				constraints->tmp_regs[n] = IR_TMP_REG(1, val_insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n++;
			}
			break;
		case IR_BINOP_AVX:
			flags = IR_USE_MUST_BE_IN_REG | IR_OP1_MUST_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG;
			insn = &ctx->ir_base[ref];
			if (IR_IS_CONST_REF(insn->op1)) {
				constraints->tmp_regs[0] = IR_TMP_REG(1, insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n = 1;
			}
			break;
		case IR_COND:
			insn = &ctx->ir_base[ref];
			if (!IR_IS_TYPE_INT(ctx->ir_base[insn->op1].type)) {
				flags = IR_USE_MUST_BE_IN_REG | IR_OP1_MUST_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG | IR_OP3_SHOULD_BE_IN_REG;
				break;
			}
			IR_FALLTHROUGH;
		case IR_COND_CMP_INT:
			insn = &ctx->ir_base[ref];
			if (IR_IS_TYPE_INT(insn->type)) {
				if (IR_IS_CONST_REF(insn->op3) || ir_rule(ctx, insn->op3) == IR_STATIC_ALLOCA) {
					flags = IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG | IR_OP2_MUST_BE_IN_REG | IR_OP3_MUST_BE_IN_REG;
					constraints->tmp_regs[0] = IR_TMP_REG(3, insn->type, IR_LOAD_SUB_REF, IR_SAVE_SUB_REF);
					n = 1;
				} else if (IR_IS_CONST_REF(insn->op2) || ir_rule(ctx, insn->op2) == IR_STATIC_ALLOCA) {
					flags = IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG | IR_OP2_MUST_BE_IN_REG | IR_OP3_MUST_BE_IN_REG;
					constraints->tmp_regs[0] = IR_TMP_REG(2, insn->type, IR_LOAD_SUB_REF, IR_SAVE_SUB_REF);
					n = 1;
				} else {
					flags = IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG | IR_OP2_MUST_BE_IN_REG | IR_OP3_MUST_BE_IN_REG;
				}
			} else {
				flags = IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG | IR_OP3_SHOULD_BE_IN_REG;
			}
			break;
		case IR_COND_CMP_FP:
			flags = IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG | IR_OP3_SHOULD_BE_IN_REG;
			break;
		case IR_VSTORE_INT:
			flags = IR_OP3_MUST_BE_IN_REG;
			insn = &ctx->ir_base[ref];
			if (IR_IS_CONST_REF(insn->op3)) {
				n = ir_add_const_tmp_reg(ctx, insn->op3, 3, n, constraints);
			} else if (ir_rule(ctx, insn->op3) == IR_STATIC_ALLOCA) {
				constraints->tmp_regs[n] = IR_TMP_REG(3, IR_ADDR, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n++;
			}
			break;
		case IR_STORE_INT:
			flags = IR_OP2_MUST_BE_IN_REG | IR_OP3_MUST_BE_IN_REG;
			insn = &ctx->ir_base[ref];
			if (IR_IS_CONST_REF(insn->op2)) {
				n = ir_add_const_tmp_reg(ctx, insn->op2, 2, n, constraints);
			}
			if (IR_IS_CONST_REF(insn->op3)) {
				n = ir_add_const_tmp_reg(ctx, insn->op3, 3, n, constraints);
			} else if (ir_rule(ctx, insn->op3) == IR_STATIC_ALLOCA) {
				constraints->tmp_regs[n] = IR_TMP_REG(3, IR_ADDR, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n++;
			}
			break;
		case IR_VSTORE_FP:
			flags = IR_OP3_MUST_BE_IN_REG;
			insn = &ctx->ir_base[ref];
			if (IR_IS_CONST_REF(insn->op3)) {
				insn = &ctx->ir_base[insn->op3];
				constraints->tmp_regs[0] = IR_TMP_REG(3, insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n = 1;
			}
			break;
		case IR_LOAD_FP:
		case IR_LOAD_INT:
		case IR_MEM_OP_INT:
		case IR_MEM_INC:
		case IR_MEM_DEC:
		case IR_MEM_MUL_PWR2:
		case IR_MEM_DIV_PWR2:
		case IR_MEM_MOD_PWR2:
		case IR_MEM_BINOP_INT:
		case IR_MEM_SHIFT:
		case IR_MEM_SHIFT_CONST:
		case IR_CMP_AND_STORE_INT:
			flags = IR_USE_MUST_BE_IN_REG | IR_OP2_MUST_BE_IN_REG;
			insn = &ctx->ir_base[ref];
			if (IR_IS_CONST_REF(insn->op2)) {
				n = ir_add_const_tmp_reg(ctx, insn->op2, 2, n, constraints);
			}
			break;
		case IR_STORE_FP:
			flags = IR_OP2_MUST_BE_IN_REG | IR_OP3_MUST_BE_IN_REG;
			insn = &ctx->ir_base[ref];
			if (IR_IS_CONST_REF(insn->op2)) {
				n = ir_add_const_tmp_reg(ctx, insn->op2, 2, n, constraints);
			}
			if (IR_IS_CONST_REF(insn->op3)) {
				insn = &ctx->ir_base[insn->op3];
				constraints->tmp_regs[n] = IR_TMP_REG(3, insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n++;
			}
			break;
		case IR_SWITCH:
			flags = IR_OP2_MUST_BE_IN_REG;
			insn = &ctx->ir_base[ref];
			if (IR_IS_CONST_REF(insn->op2)) {
				insn = &ctx->ir_base[insn->op2];
				constraints->tmp_regs[0] = IR_TMP_REG(2, insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n = 1;
			}
			/* we need a temporary register in case the MIN CASE value is not zero or some CASE value can't fit into 32 bits */
			constraints->tmp_regs[n] = IR_TMP_REG(3, IR_ADDR, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
			n++;
			break;
		case IR_CALL:
			insn = &ctx->ir_base[ref];
			if (IR_IS_TYPE_INT(insn->type)) {
				constraints->def_reg = IR_REG_INT_RET1;
#ifdef IR_REG_FP_RET1
			} else {
				constraints->def_reg = IR_REG_FP_RET1;
#endif
			}
			constraints->tmp_regs[0] = IR_SCRATCH_REG(IR_REG_SCRATCH, IR_USE_SUB_REF, IR_DEF_SUB_REF);
			n = 1;
			IR_FALLTHROUGH;
		case IR_TAILCALL:
			insn = &ctx->ir_base[ref];
			if (insn->inputs_count > 2) {
				constraints->hints[2] = IR_REG_NONE;
				constraints->hints_count = ir_get_args_regs(ctx, insn, constraints->hints);
				if (!IR_IS_CONST_REF(insn->op2)) {
					constraints->tmp_regs[n] = IR_TMP_REG(1, IR_ADDR, IR_LOAD_SUB_REF, IR_USE_SUB_REF);
					n++;
				}
			}
			flags = IR_USE_SHOULD_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG | IR_OP3_SHOULD_BE_IN_REG;
			break;
		case IR_BINOP_SSE2:
			flags = IR_DEF_REUSES_OP1_REG | IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG;
			break;
		case IR_SHIFT_CONST:
		case IR_INC:
		case IR_DEC:
		case IR_MUL_PWR2:
		case IR_DIV_PWR2:
		case IR_OP_INT:
		case IR_OP_FP:
			flags = IR_DEF_REUSES_OP1_REG | IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG;
			break;
		case IR_MOD_PWR2:
			flags = IR_DEF_REUSES_OP1_REG | IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG;
			insn = &ctx->ir_base[ref];
			if (ir_type_size[insn->type] == 8) {
				int64_t offset = ctx->ir_base[insn->op2].val.u64 - 1;
				if (!IR_IS_SIGNED_32BIT(offset)) {
					constraints->tmp_regs[n] = IR_TMP_REG(2, insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
					n++;
				}
			}
			break;
		case IR_SMOD_PWR2:
			flags = IR_DEF_REUSES_OP1_REG | IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG;
			insn = &ctx->ir_base[ref];
			if (ir_type_size[insn->type] == 8) {
				int64_t offset = ctx->ir_base[insn->op2].val.u64 - 1;
				if (!IR_IS_SIGNED_32BIT(offset)) {
					constraints->tmp_regs[n] = IR_TMP_REG(2, insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
					n++;
				}
			}
			constraints->tmp_regs[n] = IR_TMP_REG(3, insn->type, IR_USE_SUB_REF, IR_SAVE_SUB_REF);
			n++;
			break;
		case IR_SDIV_PWR2:
			flags = IR_DEF_CONFLICTS_WITH_INPUT_REGS | IR_USE_MUST_BE_IN_REG | IR_OP1_MUST_BE_IN_REG;
			insn = &ctx->ir_base[ref];
			if (ir_type_size[insn->type] == 8) {
				int64_t offset = ctx->ir_base[insn->op2].val.u64 - 1;
				if (!IR_IS_SIGNED_32BIT(offset)) {
					constraints->tmp_regs[n] = IR_TMP_REG(2, insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
					n++;
				}
			}
			break;
		case IR_BIT_COUNT:
			insn = &ctx->ir_base[ref];
			if (ir_type_size[insn->type] == 1) {
				flags = IR_USE_MUST_BE_IN_REG | IR_OP1_MUST_BE_IN_REG;
			} else {
				flags = IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG;
			}
			if (IR_IS_CONST_REF(insn->op1)) {
				constraints->tmp_regs[0] = IR_TMP_REG(1, insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n = 1;
			}
			break;
		case IR_CTPOP:
			flags = IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG;
			insn = &ctx->ir_base[ref];
			constraints->tmp_regs[0] = IR_TMP_REG(2, insn->type, IR_USE_SUB_REF, IR_SAVE_SUB_REF);
			n = 1;
			if (ir_type_size[insn->type] == 8) {
				constraints->tmp_regs[1] = IR_TMP_REG(3, insn->type, IR_USE_SUB_REF, IR_SAVE_SUB_REF);
				n = 2;
			}
			break;
		case IR_COPY_INT:
		case IR_COPY_FP:
		case IR_SEXT:
		case IR_ZEXT:
		case IR_TRUNC:
		case IR_BITCAST:
		case IR_PROTO:
		case IR_FP2FP:
			flags = IR_DEF_REUSES_OP1_REG | IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG;
			break;
		case IR_ABS_INT:
			flags = IR_DEF_CONFLICTS_WITH_INPUT_REGS | IR_USE_MUST_BE_IN_REG | IR_OP1_MUST_BE_IN_REG;
			break;
		case IR_PARAM:
			constraints->def_reg = ir_get_param_reg(ctx, ref);
			flags = 0;
			break;
		case IR_PI:
		case IR_PHI:
			flags = IR_USE_SHOULD_BE_IN_REG;
			break;
		case IR_RLOAD:
			constraints->def_reg = ctx->ir_base[ref].op2;
			flags = IR_USE_SHOULD_BE_IN_REG;
			break;
		case IR_EXITCALL:
			flags = IR_USE_MUST_BE_IN_REG;
			constraints->def_reg = IR_REG_INT_RET1;
			break;
		case IR_IF_INT:
		case IR_GUARD:
		case IR_GUARD_NOT:
			flags = IR_OP2_SHOULD_BE_IN_REG;
			break;
		case IR_IJMP:
			flags = IR_OP2_SHOULD_BE_IN_REG;
			break;
		case IR_RSTORE:
			flags = IR_OP3_SHOULD_BE_IN_REG;
			break;
		case IR_RETURN_INT:
			flags = IR_OP2_SHOULD_BE_IN_REG;
			constraints->hints[2] = IR_REG_INT_RET1;
			constraints->hints_count = 3;
			break;
		case IR_RETURN_FP:
#ifdef IR_REG_FP_RET1
			flags = IR_OP2_SHOULD_BE_IN_REG;
			constraints->hints[2] = IR_REG_FP_RET1;
			constraints->hints_count = 3;
#endif
			break;
		case IR_SNAPSHOT:
			flags = 0;
			break;
		case IR_VA_START:
			flags = IR_OP2_MUST_BE_IN_REG;
			constraints->tmp_regs[0] = IR_TMP_REG(3, IR_ADDR, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
			n = 1;
			break;
		case IR_VA_ARG:
			flags = IR_USE_MUST_BE_IN_REG | IR_OP2_MUST_BE_IN_REG;
			constraints->tmp_regs[0] = IR_TMP_REG(3, IR_ADDR, IR_LOAD_SUB_REF, IR_SAVE_SUB_REF);
			n = 1;
			break;
		case IR_VA_COPY:
			flags = IR_OP2_MUST_BE_IN_REG | IR_OP3_MUST_BE_IN_REG;
			constraints->tmp_regs[0] = IR_TMP_REG(1, IR_ADDR, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
			n = 1;
			break;
	}
	constraints->tmps_count = n;

	return flags;
}

/* instruction selection */
static uint32_t ir_match_insn(ir_ctx *ctx, ir_ref ref);
static bool ir_match_try_fuse_load(ir_ctx *ctx, ir_ref ref, ir_ref root);

static void ir_swap_ops(ir_insn *insn)
{
	SWAP_REFS(insn->op1, insn->op2);
}

static bool ir_match_try_revert_lea_to_add(ir_ctx *ctx, ir_ref ref)
{
	ir_insn *insn = &ctx->ir_base[ref];

	/* TODO: This optimization makes sense only if the other operand is killed */
	if (insn->op1 == insn->op2) {
		/* pass */
	} else if (ir_match_try_fuse_load(ctx, insn->op2, ref)) {
		ctx->rules[ref] = IR_BINOP_INT | IR_MAY_SWAP;
		return 1;
	} else if (ir_match_try_fuse_load(ctx, insn->op1, ref)) {
		/* swap for better load fusion */
		ir_swap_ops(insn);
		ctx->rules[ref] = IR_BINOP_INT | IR_MAY_SWAP;
		return 1;
	}
	return 0;
}

static void ir_match_fuse_addr(ir_ctx *ctx, ir_ref addr_ref)
{
	if (!IR_IS_CONST_REF(addr_ref)) {
		uint32_t rule = ctx->rules[addr_ref];

		if (!rule) {
			ctx->rules[addr_ref] = rule = ir_match_insn(ctx, addr_ref);
		}
		if (rule >= IR_LEA_OB && rule <= IR_LEA_SI_B) {
			ir_use_list *use_list;
			ir_ref j;

			if (rule == IR_LEA_IB && ir_match_try_revert_lea_to_add(ctx, addr_ref)) {
				return;
			}

			use_list = &ctx->use_lists[addr_ref];
			j = use_list->count;
			if (j > 1) {
				/* check if address is used only in LOAD and STORE */
				ir_ref *p = &ctx->use_edges[use_list->refs];

				do {
					ir_insn *insn = &ctx->ir_base[*p];
					if (insn->op != IR_LOAD && (insn->op != IR_STORE || insn->op3 == addr_ref)) {
						return;
					}
					p++;
				} while (--j);
			}
			ctx->rules[addr_ref] = IR_FUSED | IR_SIMPLE | rule;
		}
	}
}

static bool ir_match_may_fuse_SI(ir_ctx *ctx, ir_ref ref, ir_ref use)
{
	ir_insn *op2_insn, *insn = &ctx->ir_base[use];

	if (insn->op == IR_ADD) {
		if (insn->op1 == ref) {
			if (IR_IS_CONST_REF(insn->op2)) {
				op2_insn = &ctx->ir_base[insn->op2];
				if (IR_IS_SYM_CONST(op2_insn->op)) {
					if (ir_may_fuse_addr(ctx, op2_insn)) {
						return 1; // LEA_SI_O
					}
				} else if (IR_IS_SIGNED_32BIT(op2_insn->val.i64)) {
					return 1; // LEA_SI_O
				}
			} else if (insn->op2 != ref) {
				return 1; // LEA_SI_B or LEA_SI_OB
			}
		} else if (insn->op2 == ref && insn->op1 != insn->op2) {
			return 1; // LEA_B_SI or LEA_OB_SI
		}
	}
	return 0;
}

static bool ir_match_fuse_addr_all_useges(ir_ctx *ctx, ir_ref ref)
{
	uint32_t rule = ctx->rules[ref];
	ir_use_list *use_list;
	ir_ref n, *p, use;

	if (rule == (IR_FUSED | IR_SIMPLE | IR_LEA_SI)) {
		return 1;
	} else if (!rule) {
		ir_insn *insn = &ctx->ir_base[ref];

		IR_ASSERT(IR_IS_TYPE_INT(insn->type) && ir_type_size[insn->type] >= 4);
		if (insn->op == IR_MUL
		 && IR_IS_CONST_REF(insn->op2)) {
			insn = &ctx->ir_base[insn->op2];
			if (!IR_IS_SYM_CONST(insn->op)
			 && (insn->val.u64 == 2 || insn->val.u64 == 4 || insn->val.u64 == 8)) {
				ctx->rules[ref] = IR_LEA_SI;

				use_list = &ctx->use_lists[ref];
				n = use_list->count;
				IR_ASSERT(n > 1);
				p = &ctx->use_edges[use_list->refs];
				for (; n > 0; p++, n--) {
					use = *p;
					if (!ir_match_may_fuse_SI(ctx, ref, use)) {
						return 0;
					}
				}

				return 1;
			}
		}
	}

	return 0;
}

/* A naive check if there is a STORE or CALL between this LOAD and the fusion root */
static bool ir_match_has_mem_deps(ir_ctx *ctx, ir_ref ref, ir_ref root)
{
	if (ref + 1 != root) {
		ir_ref pos = ctx->prev_ref[root];

		do {
			ir_insn *insn = &ctx->ir_base[pos];

			if (insn->op == IR_STORE) {
				// TODO: check if LOAD and STORE addresses may alias
				return 1;
			} else if (insn->op == IR_CALL) {
				return 1;
			}
			pos = ctx->prev_ref[pos];
		} while (ref != pos);
	}
	return 0;
}

static void ir_match_fuse_load(ir_ctx *ctx, ir_ref ref, ir_ref root)
{
	if (ir_in_same_block(ctx, ref)
	 && ctx->ir_base[ref].op == IR_LOAD) {
		if (ctx->use_lists[ref].count == 2
		 && !ir_match_has_mem_deps(ctx, ref, root)) {
			ir_ref addr_ref = ctx->ir_base[ref].op2;
			ir_insn *addr_insn = &ctx->ir_base[addr_ref];

			if (IR_IS_CONST_REF(addr_ref)) {
				if (ir_may_fuse_addr(ctx, addr_insn)) {
					ctx->rules[ref] = IR_FUSED | IR_SIMPLE | IR_LOAD;
					return;
				}
			} else {
				ctx->rules[ref] = IR_FUSED | IR_SIMPLE | IR_LOAD;
				ir_match_fuse_addr(ctx, addr_ref);
				return;
			}
		}
	}
}

static bool ir_match_try_fuse_load(ir_ctx *ctx, ir_ref ref, ir_ref root)
{
	ir_insn *insn = &ctx->ir_base[ref];

	if (ir_in_same_block(ctx, ref)
	 && insn->op == IR_LOAD) {
		if (ctx->use_lists[ref].count == 2
		 && !ir_match_has_mem_deps(ctx, ref, root)) {
			ir_ref addr_ref = ctx->ir_base[ref].op2;
			ir_insn *addr_insn = &ctx->ir_base[addr_ref];

			if (IR_IS_CONST_REF(addr_ref)) {
				if (ir_may_fuse_addr(ctx, addr_insn)) {
					ctx->rules[ref] = IR_FUSED | IR_SIMPLE | IR_LOAD;
					return 1;
				}
			} else {
				ctx->rules[ref] = IR_FUSED | IR_SIMPLE | IR_LOAD;
				ir_match_fuse_addr(ctx, addr_ref);
				return 1;
			}
		}
	} else if (insn->op == IR_PARAM) {
		if (ctx->use_lists[ref].count == 1
		 && ir_get_param_reg(ctx, ref) == IR_REG_NONE) {
			return 1;
		}
	} else if (ctx->ir_base[ref].op == IR_VLOAD) {
		return 1;
	}
	return 0;
}

static void ir_match_fuse_load_commutative_int(ir_ctx *ctx, ir_insn *insn, ir_ref root)
{
	if (IR_IS_CONST_REF(insn->op2)
	 && ir_may_fuse_imm(ctx, &ctx->ir_base[insn->op2])) {
		return;
	} else if (ir_match_try_fuse_load(ctx, insn->op2, root)) {
		return;
	} else if (ir_match_try_fuse_load(ctx, insn->op1, root)) {
		ir_swap_ops(insn);
	}
}

static void ir_match_fuse_load_commutative_fp(ir_ctx *ctx, ir_insn *insn, ir_ref root)
{
	if (!IR_IS_CONST_REF(insn->op2)
	 && !ir_match_try_fuse_load(ctx, insn->op2, root)
	 && (IR_IS_CONST_REF(insn->op1) || ir_match_try_fuse_load(ctx, insn->op1, root))) {
		ir_swap_ops(insn);
	}
}

static void ir_match_fuse_load_cmp_int(ir_ctx *ctx, ir_insn *insn, ir_ref root)
{
	if (IR_IS_CONST_REF(insn->op2)
	 && ir_may_fuse_imm(ctx, &ctx->ir_base[insn->op2])) {
		ir_match_fuse_load(ctx, insn->op1, root);
	} else if (!ir_match_try_fuse_load(ctx, insn->op2, root)
	 && ir_match_try_fuse_load(ctx, insn->op1, root)) {
		ir_swap_ops(insn);
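		/* after swapping the operands, mirror the predicate: XOR 3 maps
		 * LT<->GT and LE<->GE (the opcode encoding groups each comparison
		 * quartet in an aligned block of four; EQ/NE are symmetric) */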
		if (insn->op != IR_EQ && insn->op != IR_NE) {
			insn->op ^= 3;
		}
	}
}

static void ir_match_fuse_load_test_int(ir_ctx *ctx, ir_insn *insn, ir_ref root)
{
	if (IR_IS_CONST_REF(insn->op2)
	 && ir_may_fuse_imm(ctx, &ctx->ir_base[insn->op2])) {
		ir_match_fuse_load(ctx, insn->op1, root);
	} else if (!ir_match_try_fuse_load(ctx, insn->op2, root)
	 && ir_match_try_fuse_load(ctx, insn->op1, root)) {
		ir_swap_ops(insn);
	}
}

static void ir_match_fuse_load_cmp_fp(ir_ctx *ctx, ir_insn *insn, ir_ref root)
{
	if (insn->op != IR_EQ && insn->op != IR_NE) {
		if (insn->op == IR_LT || insn->op == IR_LE) {
			/* swap operands to avoid P flag check */
			ir_swap_ops(insn);
			insn->op ^= 3;
		}
		ir_match_fuse_load(ctx, insn->op2, root);
	} else if (IR_IS_CONST_REF(insn->op2) && !IR_IS_FP_ZERO(ctx->ir_base[insn->op2])) {
		/* pass */
	} else if (ir_match_try_fuse_load(ctx, insn->op2, root)) {
		/* pass */
	} else if ((IR_IS_CONST_REF(insn->op1) && !IR_IS_FP_ZERO(ctx->ir_base[insn->op1])) || ir_match_try_fuse_load(ctx, insn->op1, root)) {
		ir_swap_ops(insn);
		if (insn->op != IR_EQ && insn->op != IR_NE) {
			insn->op ^= 3;
		}
	}
}

static void ir_match_fuse_load_cmp_fp_br(ir_ctx *ctx, ir_insn *insn, ir_ref root, bool direct)
{
	if (direct) {
		if (insn->op == IR_LT || insn->op == IR_LE) {
			/* swap operands to avoid P flag check */
			ir_swap_ops(insn);
			insn->op ^= 3;
		}
	} else {
		if (insn->op == IR_GT || insn->op == IR_GE) {
			/* swap operands to avoid P flag check */
			ir_swap_ops(insn);
			insn->op ^= 3;
		}
	}
	if (IR_IS_CONST_REF(insn->op2) && !IR_IS_FP_ZERO(ctx->ir_base[insn->op2])) {
		/* pass */
	} else if (ir_match_try_fuse_load(ctx, insn->op2, root)) {
		/* pass */
	} else if ((IR_IS_CONST_REF(insn->op1) && !IR_IS_FP_ZERO(ctx->ir_base[insn->op1])) || ir_match_try_fuse_load(ctx, insn->op1, root)) {
		ir_swap_ops(insn);
		if (insn->op != IR_EQ && insn->op != IR_NE) {
			insn->op ^= 3;
		}
	}
}

#define STR_EQUAL(name, name_len, str) (name_len == strlen(str) && memcmp(name, str, strlen(str)) == 0)

#define IR_IS_FP_FUNC_1(proto, _type)  (proto->params_count == 1 && \
                                        proto->param_types[0] == _type && \
                                        proto->ret_type == _type)

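/* Recognize libc math builtins by name and prototype and map them onto the
 * dedicated SSE rules (SQRTSD/SQRTSS for sqrt; the rounding variants are
 * presumably lowered via SSE4.1 ROUNDSD/ROUNDSS with the matching
 * rounding-mode immediate). */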
1829static uint32_t ir_match_builtin_call(ir_ctx *ctx, const ir_insn *func)
1830{
1831	const ir_proto_t *proto = (const ir_proto_t *)ir_get_str(ctx, func->proto);
1832
1833	if (proto->flags & IR_BUILTIN_FUNC) {
1834		size_t name_len;
1835		const char *name = ir_get_strl(ctx, func->val.name, &name_len);
1836
1837		if (STR_EQUAL(name, name_len, "sqrt")) {
1838			if (IR_IS_FP_FUNC_1(proto, IR_DOUBLE)) {
1839				return IR_SSE_SQRT;
1840			}
1841		} else if (STR_EQUAL(name, name_len, "sqrtf")) {
1842			if (IR_IS_FP_FUNC_1(proto, IR_FLOAT)) {
1843				return IR_SSE_SQRT;
1844			}
1845		} else if (STR_EQUAL(name, name_len, "rint")) {
1846			if (IR_IS_FP_FUNC_1(proto, IR_DOUBLE)) {
1847				return IR_SSE_RINT;
1848			}
1849		} else if (STR_EQUAL(name, name_len, "rintf")) {
1850			if (IR_IS_FP_FUNC_1(proto, IR_FLOAT)) {
1851				return IR_SSE_RINT;
1852			}
1853		} else if (STR_EQUAL(name, name_len, "floor")) {
1854			if (IR_IS_FP_FUNC_1(proto, IR_DOUBLE)) {
1855				return IR_SSE_FLOOR;
1856			}
1857		} else if (STR_EQUAL(name, name_len, "floorf")) {
1858			if (IR_IS_FP_FUNC_1(proto, IR_FLOAT)) {
1859				return IR_SSE_FLOOR;
1860			}
1861		} else if (STR_EQUAL(name, name_len, "ceil")) {
1862			if (IR_IS_FP_FUNC_1(proto, IR_DOUBLE)) {
1863				return IR_SSE_CEIL;
1864			}
1865		} else if (STR_EQUAL(name, name_len, "ceilf")) {
1866			if (IR_IS_FP_FUNC_1(proto, IR_FLOAT)) {
1867				return IR_SSE_CEIL;
1868			}
1869		} else if (STR_EQUAL(name, name_len, "trunc")) {
1870			if (IR_IS_FP_FUNC_1(proto, IR_DOUBLE)) {
1871				return IR_SSE_TRUNC;
1872			}
1873		} else if (STR_EQUAL(name, name_len, "truncf")) {
1874			if (IR_IS_FP_FUNC_1(proto, IR_FLOAT)) {
1875				return IR_SSE_TRUNC;
1876			}
1877		} else if (STR_EQUAL(name, name_len, "nearbyint")) {
1878			if (IR_IS_FP_FUNC_1(proto, IR_DOUBLE)) {
1879				return IR_SSE_NEARBYINT;
1880			}
1881		} else if (STR_EQUAL(name, name_len, "nearbyintf")) {
1882			if (IR_IS_FP_FUNC_1(proto, IR_FLOAT)) {
1883				return IR_SSE_NEARBYINT;
1884			}
1885		}
1886	}
1887
1888	return 0;
1889}
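
/*
 * Example (illustrative): for a call site whose prototype is marked with
 * IR_BUILTIN_FUNC, e.g.
 *
 *   double sqrt(double);   ...   x = CALL(sqrt, y);
 *
 * this returns IR_SSE_SQRT, so the later emission rules can lower the call
 * to a single SQRTSD/SQRTSS instead of a full call sequence (and likewise
 * the rint/floor/ceil/trunc/nearbyint families can be lowered to SSE
 * rounding instructions with the corresponding rounding mode).
 */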
1890
1891static uint32_t ir_match_insn(ir_ctx *ctx, ir_ref ref)
1892{
1893	ir_insn *op2_insn;
1894	ir_insn *insn = &ctx->ir_base[ref];
1895	uint32_t store_rule;
1896	ir_op load_op;
1897
1898	switch (insn->op) {
1899		case IR_EQ:
1900		case IR_NE:
1901		case IR_LT:
1902		case IR_GE:
1903		case IR_LE:
1904		case IR_GT:
1905		case IR_ULT:
1906		case IR_UGE:
1907		case IR_ULE:
1908		case IR_UGT:
1909			if (IR_IS_TYPE_INT(ctx->ir_base[insn->op1].type)) {
1910				if (IR_IS_CONST_REF(insn->op2)
1911				 && !IR_IS_SYM_CONST(ctx->ir_base[insn->op2].op)
1912				 && ctx->ir_base[insn->op2].val.i64 == 0
1913				 && insn->op1 == ref - 1) { /* previous instruction */
1914					ir_insn *op1_insn = &ctx->ir_base[insn->op1];
1915
1916					if (op1_insn->op == IR_AND && ctx->use_lists[insn->op1].count == 1) {
1917						/* v = AND(_, _); CMP(v, 0) => SKIP_TEST; TEST */
1918						ir_match_fuse_load_test_int(ctx, op1_insn, ref);
1919						ctx->rules[insn->op1] = IR_FUSED | IR_TEST_INT;
1920						return IR_TESTCC_INT;
1921					} else if ((op1_insn->op == IR_OR || op1_insn->op == IR_AND || op1_insn->op == IR_XOR) ||
1922							/* GT(ADD(_, _), 0) can't be optimized because ADD may overflow */
1923							((op1_insn->op == IR_ADD || op1_insn->op == IR_SUB) &&
1924								(insn->op == IR_EQ || insn->op == IR_NE))) {
1925						/* v = BINOP(_, _); CMP(v, 0) => BINOP; SETCC */
1926						if (ir_op_flags[op1_insn->op] & IR_OP_FLAG_COMMUTATIVE) {
1927							ir_match_fuse_load_commutative_int(ctx, op1_insn, ref);
1928							ctx->rules[insn->op1] = IR_BINOP_INT | IR_MAY_SWAP;
1929						} else {
1930							ir_match_fuse_load(ctx, op1_insn->op2, ref);
1931							ctx->rules[insn->op1] = IR_BINOP_INT;
1932						}
1933						return IR_SETCC_INT;
1934					}
1935				}
1936				ir_match_fuse_load_cmp_int(ctx, insn, ref);
1937				return IR_CMP_INT;
1938			} else {
1939				ir_match_fuse_load_cmp_fp(ctx, insn, ref);
1940				return IR_CMP_FP;
1941			}
1942			break;
1943		case IR_ADD:
1944		case IR_SUB:
1945			if (IR_IS_TYPE_INT(insn->type)) {
1946				if ((ctx->flags & IR_OPT_CODEGEN) && IR_IS_CONST_REF(insn->op2)) {
1947					op2_insn = &ctx->ir_base[insn->op2];
1948					if (IR_IS_CONST_REF(insn->op1)) {
1949						// const
1950						// TODO: add support for sym+offset ???
1951					} else if (IR_IS_SYM_CONST(op2_insn->op)) {
1952						if (insn->op == IR_ADD && ir_may_fuse_addr(ctx, op2_insn)) {
1953							goto lea;
1954						}
1955						/* pass */
1956					} else if (op2_insn->val.i64 == 0) {
1957						// return IR_COPY_INT;
1958					} else if ((ir_type_size[insn->type] >= 4 && insn->op == IR_ADD && IR_IS_SIGNED_32BIT(op2_insn->val.i64)) ||
1959							(ir_type_size[insn->type] >= 4 && insn->op == IR_SUB && IR_IS_SIGNED_NEG_32BIT(op2_insn->val.i64))) {
1960lea:
1961						if (ctx->use_lists[insn->op1].count == 1 || ir_match_fuse_addr_all_useges(ctx, insn->op1)) {
1962							uint32_t rule = ctx->rules[insn->op1];
1963
1964							if (!rule) {
1965								ctx->rules[insn->op1] = rule = ir_match_insn(ctx, insn->op1);
1966							}
1967							if (rule == IR_LEA_SI || rule == (IR_FUSED | IR_SIMPLE | IR_LEA_SI)) {
1968								/* z = MUL(Y, 2|4|8) ... ADD(z, imm32) => SKIP ... LEA [Y*2|4|8+im32] */
1969								ctx->rules[insn->op1] = IR_FUSED | IR_SIMPLE | IR_LEA_SI;
1970								return IR_LEA_SI_O;
1971							} else if (rule == IR_LEA_SIB) {
1972								/* z = ADD(X, MUL(Y, 2|4|8)) ... ADD(z, imm32) => SKIP ... LEA [X+Y*2|4|8+im32] */
1973								ctx->rules[insn->op1] = IR_FUSED | IR_SIMPLE | IR_LEA_SIB;
1974								return IR_LEA_SIB_O;
1975							} else if (rule == IR_LEA_IB) {
1976								/* z = ADD(X, Y) ... ADD(z, imm32) => SKIP ... LEA [X+Y+im32] */
1977								ctx->rules[insn->op1] = IR_FUSED | IR_SIMPLE | IR_LEA_IB;
1978								return IR_LEA_IB_O;
1979							}
1980						}
1981						/* ADD(X, imm32) => LEA [X+imm32] */
1982						return IR_LEA_OB;
1983					} else if (op2_insn->val.i64 == 1 || op2_insn->val.i64 == -1) {
1984						if (insn->op == IR_ADD) {
1985							if (op2_insn->val.i64 == 1) {
1986								/* ADD(_, 1) => INC */
1987								return IR_INC;
1988							} else {
1989								/* ADD(_, -1) => DEC */
1990								return IR_DEC;
1991							}
1992						} else {
1993							if (op2_insn->val.i64 == 1) {
1994								/* SUB(_, 1) => DEC */
1995								return IR_DEC;
1996							} else {
1997								/* SUB(_, -1) => INC */
1998								return IR_INC;
1999							}
2000						}
2001					}
2002				} else if ((ctx->flags & IR_OPT_CODEGEN) && insn->op == IR_ADD && ir_type_size[insn->type] >= 4) {
2003					if (insn->op1 != insn->op2) {
2004						if (ctx->use_lists[insn->op1].count == 1 || ir_match_fuse_addr_all_useges(ctx, insn->op1)) {
2005							uint32_t rule = ctx->rules[insn->op1];
2006							if (!rule) {
2007								ctx->rules[insn->op1] = rule = ir_match_insn(ctx, insn->op1);
2008							}
2009							if (rule == IR_LEA_OB) {
2010								ctx->rules[insn->op1] = IR_FUSED | IR_SIMPLE | IR_LEA_OB;
2011								if (ctx->use_lists[insn->op2].count == 1 || ir_match_fuse_addr_all_useges(ctx, insn->op2)) {
2012									rule = ctx->rules[insn->op2];
2013									if (!rule) {
2014										ctx->rules[insn->op2] = rule = ir_match_insn(ctx, insn->op2);
2015									}
2016									if (rule == IR_LEA_SI || rule == (IR_FUSED | IR_SIMPLE | IR_LEA_SI)) {
2017										/* x = ADD(X, imm32) ... y = MUL(Y, 2|4|8) ... ADD(x, y) => SKIP ... SKIP ... LEA */
2018										ctx->rules[insn->op2] = IR_FUSED | IR_SIMPLE | IR_LEA_SI;
2019										return IR_LEA_OB_SI;
2020									}
2021								}
2022								/* x = ADD(X, imm32) ... ADD(x, Y) => SKIP ... LEA */
2023								return IR_LEA_OB_I;
2024							} else if (rule == IR_LEA_SI || rule == (IR_FUSED | IR_SIMPLE | IR_LEA_SI)) {
2025								ctx->rules[insn->op1] = IR_FUSED | IR_SIMPLE | IR_LEA_SI;
2026								if (ctx->use_lists[insn->op2].count == 1) {
2027									rule = ctx->rules[insn->op2];
2028									if (!rule) {
2029										ctx->rules[insn->op2] = rule = ir_match_insn(ctx, insn->op2);
2030									}
2031									if (rule == IR_LEA_OB) {
2032										/* x = ADD(X, imm32) ... y = MUL(Y, 2|4|8) ... ADD(y, x) => SKIP ... SKIP ... LEA */
2033										ctx->rules[insn->op2] = IR_FUSED | IR_SIMPLE | IR_LEA_OB;
2034										return IR_LEA_SI_OB;
2035									}
2036								}
2037								/* x = MUL(X, 2|4|8) ... ADD(x, Y) => SKIP ... LEA */
2038								return IR_LEA_SI_B;
2039							}
2040						}
2041						if (ctx->use_lists[insn->op2].count == 1 || ir_match_fuse_addr_all_useges(ctx, insn->op2)) {
2042							uint32_t rule = ctx->rules[insn->op2];
2043							if (!rule) {
2044								ctx->rules[insn->op2] = rule = ir_match_insn(ctx, insn->op2);
2045							}
2046							if (rule == IR_LEA_OB) {
2047								ctx->rules[insn->op2] = IR_FUSED | IR_SIMPLE | IR_LEA_OB;
2048								/* x = ADD(X, imm32) ... ADD(Y, x) => SKIP ... LEA */
2049								return IR_LEA_I_OB;
2050							} else if (rule == IR_LEA_SI || rule == (IR_FUSED | IR_SIMPLE | IR_LEA_SI)) {
2051								ctx->rules[insn->op2] = IR_FUSED | IR_SIMPLE | IR_LEA_SI;
2052								/* x = MUL(X, 2|4|8) ... ADD(Y, x) => SKIP ... LEA */
2053								return IR_LEA_B_SI;
2054							}
2055						}
2056					}
2057					/* ADD(X, Y) => LEA [X + Y] */
2058					return IR_LEA_IB;
2059				}
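				/*
				 * Illustrative example of the ADD chain fusion above
				 * (register names are hypothetical): a chain that fits the
				 * x86 base + index*scale + disp32 addressing form collapses
				 * into a single LEA, e.g.
				 *
				 *   t = MUL(y, 8); r = ADD(ADD(x, 16), t)
				 *
				 * can be emitted as
				 *
				 *   lea rax, [rdi+rsi*8+16]
				 *
				 * which, unlike ADD, also leaves the flags untouched.
				 */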
2060binop_int:
2061				if (ir_op_flags[insn->op] & IR_OP_FLAG_COMMUTATIVE) {
2062					ir_match_fuse_load_commutative_int(ctx, insn, ref);
2063					return IR_BINOP_INT | IR_MAY_SWAP;
2064				} else {
2065					ir_match_fuse_load(ctx, insn->op2, ref);
2066					return IR_BINOP_INT;
2067				}
2068			} else {
2069binop_fp:
2070				if (ir_op_flags[insn->op] & IR_OP_FLAG_COMMUTATIVE) {
2071					ir_match_fuse_load_commutative_fp(ctx, insn, ref);
2072					if (ctx->mflags & IR_X86_AVX) {
2073						return IR_BINOP_AVX;
2074					} else {
2075						return IR_BINOP_SSE2 | IR_MAY_SWAP;
2076					}
2077				} else {
2078					ir_match_fuse_load(ctx, insn->op2, ref);
2079					if (ctx->mflags & IR_X86_AVX) {
2080						return IR_BINOP_AVX;
2081					} else {
2082						return IR_BINOP_SSE2;
2083					}
2084				}
2085			}
2086			break;
2087		case IR_MUL:
2088			if (IR_IS_TYPE_INT(insn->type)) {
2089				if ((ctx->flags & IR_OPT_CODEGEN) && IR_IS_CONST_REF(insn->op2)) {
2090					op2_insn = &ctx->ir_base[insn->op2];
2091					if (IR_IS_SYM_CONST(op2_insn->op)) {
2092						/* pass */
2093					} else if (IR_IS_CONST_REF(insn->op1)) {
2094						// const
2095					} else if (op2_insn->val.u64 == 0) {
2096						// 0
2097					} else if (op2_insn->val.u64 == 1) {
2098						// return IR_COPY_INT;
2099					} else if (ir_type_size[insn->type] >= 4 &&
2100							(op2_insn->val.u64 == 2 || op2_insn->val.u64 == 4 || op2_insn->val.u64 == 8)) {
2101						/* MUL(X, 2|4|8) => LEA [X*2|4|8] */
2102						return IR_LEA_SI;
2103					} else if (ir_type_size[insn->type] >= 4 &&
2104							(op2_insn->val.u64 == 3 || op2_insn->val.u64 == 5 || op2_insn->val.u64 == 9)) {
2105						/* MUL(X, 3|5|9) => LEA [X+X*2|4|8] */
2106						return IR_LEA_SIB;
2107					} else if (IR_IS_POWER_OF_TWO(op2_insn->val.u64)) {
2108						/* MUL(X, PWR2) => SHL */
2109						return IR_MUL_PWR2;
2110					} else if (IR_IS_TYPE_SIGNED(insn->type)
2111					 && ir_type_size[insn->type] != 1
2112					 && IR_IS_SIGNED_32BIT(op2_insn->val.i64)
2113					 && !IR_IS_CONST_REF(insn->op1)) {
2114						/* MUL(_, imm32) => IMUL */
2115						ir_match_fuse_load(ctx, insn->op1, ref);
2116						return IR_IMUL3;
2117					}
2118				}
2119				/* Prefer IMUL over MUL because it's more flexible and uses fewer registers ??? */
2120//				if (IR_IS_TYPE_SIGNED(insn->type) && ir_type_size[insn->type] != 1) {
2121				if (ir_type_size[insn->type] != 1) {
2122					goto binop_int;
2123				}
2124				ir_match_fuse_load(ctx, insn->op2, ref);
2125				return IR_MUL_INT;
2126			} else {
2127				goto binop_fp;
2128			}
2129			break;
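		/*
		 * Illustrative examples of the IR_MUL strength reduction above:
		 * MUL(x, 8) matches IR_LEA_SI ("lea r, [x*8]"), MUL(x, 9) matches
		 * IR_LEA_SIB ("lea r, [x+x*8]"), other powers of two become a SHL
		 * via IR_MUL_PWR2, and a general signed imm32 becomes IR_IMUL3
		 * ("imul r, src, imm32"), which, unlike the one-operand MUL, does
		 * not tie up the RDX:RAX register pair.
		 */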
2130		case IR_ADD_OV:
2131		case IR_SUB_OV:
2132			IR_ASSERT(IR_IS_TYPE_INT(insn->type));
2133			goto binop_int;
2134		case IR_MUL_OV:
2135			IR_ASSERT(IR_IS_TYPE_INT(insn->type));
2136			if (IR_IS_TYPE_SIGNED(insn->type) && ir_type_size[insn->type] != 1) {
2137				if ((ctx->flags & IR_OPT_CODEGEN) && IR_IS_CONST_REF(insn->op2)) {
2138					op2_insn = &ctx->ir_base[insn->op2];
2139					if (!IR_IS_SYM_CONST(op2_insn->op)
2140					 && IR_IS_SIGNED_32BIT(op2_insn->val.i64)
2141					 && !IR_IS_CONST_REF(insn->op1)) {
2142						/* MUL(_, imm32) => IMUL */
2143						ir_match_fuse_load(ctx, insn->op1, ref);
2144						return IR_IMUL3;
2145					}
2146				}
2147				goto binop_int;
2148			}
2149			ir_match_fuse_load(ctx, insn->op2, ref);
2150			return IR_MUL_INT;
2151		case IR_DIV:
2152			if (IR_IS_TYPE_INT(insn->type)) {
2153				if ((ctx->flags & IR_OPT_CODEGEN) && IR_IS_CONST_REF(insn->op2)) {
2154					op2_insn = &ctx->ir_base[insn->op2];
2155					if (IR_IS_SYM_CONST(op2_insn->op)) {
2156						/* pass */
2157					} else if (IR_IS_CONST_REF(insn->op1)) {
2158						// const
2159					} else if (op2_insn->val.u64 == 1) {
2160						// return IR_COPY_INT;
2161					} else if (IR_IS_POWER_OF_TWO(op2_insn->val.u64)) {
2162						/* DIV(X, PWR2) => SHR */
2163						if (IR_IS_TYPE_UNSIGNED(insn->type)) {
2164							return IR_DIV_PWR2;
2165						} else {
2166							return IR_SDIV_PWR2;
2167						}
2168					}
2169				}
2170				ir_match_fuse_load(ctx, insn->op2, ref);
2171				return IR_DIV_INT;
2172			} else {
2173				goto binop_fp;
2174			}
2175			break;
2176		case IR_MOD:
2177			if ((ctx->flags & IR_OPT_CODEGEN) && IR_IS_CONST_REF(insn->op2)) {
2178				op2_insn = &ctx->ir_base[insn->op2];
2179				if (IR_IS_SYM_CONST(op2_insn->op)) {
2180					/* pass */
2181				} else if (IR_IS_CONST_REF(insn->op1)) {
2182					// const
2183				} else if (IR_IS_POWER_OF_TWO(op2_insn->val.u64)) {
2184					/* MOD(X, PWR2) => AND */
2185					if (IR_IS_TYPE_UNSIGNED(insn->type)) {
2186						return IR_MOD_PWR2;
2187					} else {
2188						return IR_SMOD_PWR2;
2189					}
2190				}
2191			}
2192			ir_match_fuse_load(ctx, insn->op2, ref);
2193			return IR_MOD_INT;
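		/*
		 * Note on MOD_PWR2 vs SMOD_PWR2 above: for unsigned operands
		 * x % 2^n is simply "x & (2^n - 1)" (IR_MOD_PWR2).  For signed
		 * operands that AND is wrong for negative x, because C requires
		 * the remainder to take the sign of the dividend, so IR_SMOD_PWR2
		 * stands for a longer sign-correcting sequence.
		 */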
2194		case IR_BSWAP:
2195		case IR_NOT:
2196			if (insn->type == IR_BOOL) {
2197				IR_ASSERT(IR_IS_TYPE_INT(ctx->ir_base[insn->op1].type)); // TODO: IR_BOOL_NOT_FP
2198				return IR_BOOL_NOT_INT;
2199			} else {
2200				IR_ASSERT(IR_IS_TYPE_INT(insn->type));
2201				return IR_OP_INT;
2202			}
2203			break;
2204		case IR_NEG:
2205			if (IR_IS_TYPE_INT(insn->type)) {
2206				return IR_OP_INT;
2207			} else {
2208				return IR_OP_FP;
2209			}
2210		case IR_ABS:
2211			if (IR_IS_TYPE_INT(insn->type)) {
2212				return IR_ABS_INT; // movl %edi, %eax; negl %eax; cmovs %edi, %eax
2213			} else {
2214				return IR_OP_FP;
2215			}
2216		case IR_OR:
2217			if ((ctx->flags & IR_OPT_CODEGEN) && IR_IS_CONST_REF(insn->op2)) {
2218				op2_insn = &ctx->ir_base[insn->op2];
2219				if (IR_IS_SYM_CONST(op2_insn->op)) {
2220					/* pass */
2221				} else if (IR_IS_CONST_REF(insn->op1)) {
2222					// const
2223				} else if (op2_insn->val.i64 == 0) {
2224					// return IR_COPY_INT;
2225				} else if (op2_insn->val.i64 == -1) {
2226					// -1
2227				}
2228			}
2229			goto binop_int;
2230		case IR_AND:
2231			if ((ctx->flags & IR_OPT_CODEGEN) && IR_IS_CONST_REF(insn->op2)) {
2232				op2_insn = &ctx->ir_base[insn->op2];
2233				if (IR_IS_SYM_CONST(op2_insn->op)) {
2234					/* pass */
2235				} else if (IR_IS_CONST_REF(insn->op1)) {
2236					// const
2237				} else if (op2_insn->val.i64 == 0) {
2238					// 0
2239				} else if (op2_insn->val.i64 == -1) {
2240					// return IR_COPY_INT;
2241				}
2242			}
2243			goto binop_int;
2244		case IR_XOR:
2245			if ((ctx->flags & IR_OPT_CODEGEN) && IR_IS_CONST_REF(insn->op2)) {
2246				op2_insn = &ctx->ir_base[insn->op2];
2247				if (IR_IS_SYM_CONST(op2_insn->op)) {
2248					/* pass */
2249				} else if (IR_IS_CONST_REF(insn->op1)) {
2250					// const
2251				}
2252			}
2253			goto binop_int;
2254		case IR_SHL:
2255			if (IR_IS_CONST_REF(insn->op2)) {
2256				if (ctx->flags & IR_OPT_CODEGEN) {
2257					op2_insn = &ctx->ir_base[insn->op2];
2258					if (IR_IS_SYM_CONST(op2_insn->op)) {
2259						/* pass */
2260					} else if (IR_IS_CONST_REF(insn->op1)) {
2261						// const
2262					} else if (op2_insn->val.u64 == 0) {
2263						// return IR_COPY_INT;
2264					} else if (ir_type_size[insn->type] >= 4) {
2265						if (op2_insn->val.u64 == 1) {
2266							// lea [op1*2]
2267						} else if (op2_insn->val.u64 == 2) {
2268							// lea [op1*4]
2269						} else if (op2_insn->val.u64 == 3) {
2270							// lea [op1*8]
2271						}
2272					}
2273				}
2274				return IR_SHIFT_CONST;
2275			}
2276			return IR_SHIFT;
2277		case IR_SHR:
2278		case IR_SAR:
2279		case IR_ROL:
2280		case IR_ROR:
2281			if (IR_IS_CONST_REF(insn->op2)) {
2282				if (ctx->flags & IR_OPT_CODEGEN) {
2283					op2_insn = &ctx->ir_base[insn->op2];
2284					if (IR_IS_SYM_CONST(op2_insn->op)) {
2285						/* pass */
2286					} else if (IR_IS_CONST_REF(insn->op1)) {
2287						// const
2288					} else if (op2_insn->val.u64 == 0) {
2289						// return IR_COPY_INT;
2290					}
2291				}
2292				return IR_SHIFT_CONST;
2293			}
2294			return IR_SHIFT;
2295		case IR_MIN:
2296		case IR_MAX:
2297			if (IR_IS_TYPE_INT(insn->type)) {
2298				return IR_MIN_MAX_INT | IR_MAY_SWAP;
2299			} else {
2300				goto binop_fp;
2301			}
2302			break;
2303		case IR_COPY:
2304			if (IR_IS_TYPE_INT(insn->type)) {
2305				return IR_COPY_INT | IR_MAY_REUSE;
2306			} else {
2307				return IR_COPY_FP | IR_MAY_REUSE;
2308			}
2309			break;
2310		case IR_CALL:
2311			if (IR_IS_CONST_REF(insn->op2)) {
2312				const ir_insn *func = &ctx->ir_base[insn->op2];
2313
2314				if (func->op == IR_FUNC && func->proto) {
2315					uint32_t rule = ir_match_builtin_call(ctx, func);
2316
2317					if (rule) {
2318						return rule;
2319					}
2320				}
2321			}
2322			ctx->flags2 |= IR_HAS_CALLS | IR_16B_FRAME_ALIGNMENT;
2323#ifndef IR_REG_FP_RET1
2324			if (IR_IS_TYPE_FP(insn->type)) {
2325				ctx->flags2 |= IR_HAS_FP_RET_SLOT;
2326			}
2327#endif
2328			IR_FALLTHROUGH;
2329		case IR_TAILCALL:
2330		case IR_IJMP:
2331			ir_match_fuse_load(ctx, insn->op2, ref);
2332			return insn->op;
2333		case IR_VAR:
2334			return IR_SKIPPED | IR_VAR;
2335		case IR_PARAM:
2336			return ctx->use_lists[ref].count > 0 ? IR_PARAM : IR_SKIPPED | IR_PARAM;
2337		case IR_ALLOCA:
2338			/* alloca() may be used only in functions */
2339			if (ctx->flags & IR_FUNCTION) {
2340				if (IR_IS_CONST_REF(insn->op2) && ctx->cfg_map[ref] == 1) {
2341					ir_insn *val = &ctx->ir_base[insn->op2];
2342
2343					if (!IR_IS_SYM_CONST(val->op)) {
2344						return IR_STATIC_ALLOCA;
2345					}
2346				}
2347				ctx->flags |= IR_USE_FRAME_POINTER;
2348				ctx->flags2 |= IR_HAS_ALLOCA | IR_16B_FRAME_ALIGNMENT;
2349			}
2350			return IR_ALLOCA;
2351		case IR_VSTORE:
2352			if (IR_IS_TYPE_INT(ctx->ir_base[insn->op3].type)) {
2353				store_rule = IR_VSTORE_INT;
2354				load_op = IR_VLOAD;
2355store_int:
2356				if ((ctx->flags & IR_OPT_CODEGEN)
2357				 && ir_in_same_block(ctx, insn->op3)
2358				 && (ctx->use_lists[insn->op3].count == 1 ||
2359				     (ctx->use_lists[insn->op3].count == 2
2360				   && (ctx->ir_base[insn->op3].op == IR_ADD_OV ||
2361				       ctx->ir_base[insn->op3].op == IR_SUB_OV)))) {
2362					ir_insn *op_insn = &ctx->ir_base[insn->op3];
2363					uint32_t rule = ctx->rules[insn->op3];
2364
2365					if (!rule) {
2366						ctx->rules[insn->op3] = rule = ir_match_insn(ctx, insn->op3);
2367					}
2368					if (((rule & IR_RULE_MASK) == IR_BINOP_INT && op_insn->op != IR_MUL) || rule == IR_LEA_OB || rule == IR_LEA_IB) {
2369						if (insn->op1 == op_insn->op1
2370						 && ctx->ir_base[op_insn->op1].op == load_op
2371						 && ctx->ir_base[op_insn->op1].op2 == insn->op2
2372						 && ctx->use_lists[op_insn->op1].count == 2) {
2373							/* l = LOAD(_, a) ... v = BINOP(l, _) ... STORE(l, a, v) => SKIP ... SKIP_MEM_BINOP ... MEM_BINOP */
2374							ctx->rules[insn->op3] = IR_FUSED | IR_BINOP_INT;
2375							ctx->rules[op_insn->op1] = IR_SKIPPED | load_op;
2376							if (!IR_IS_CONST_REF(op_insn->op2)
2377							 && ctx->rules[op_insn->op2] == (IR_FUSED|IR_SIMPLE|IR_LOAD)) {
2378								ctx->rules[op_insn->op2] = IR_LOAD_INT;
2379							}
2380							return IR_MEM_BINOP_INT;
2381						} else if ((ir_op_flags[op_insn->op] & IR_OP_FLAG_COMMUTATIVE)
2382						 && insn->op1 == op_insn->op2
2383						 && ctx->ir_base[op_insn->op2].op == load_op
2384						 && ctx->ir_base[op_insn->op2].op2 == insn->op2
2385						 && ctx->use_lists[op_insn->op2].count == 2) {
2386							/* l = LOAD(_, a) ... v = BINOP(_, l) ... STORE(l, a, v) => SKIP ... SKIP_MEM_BINOP ... MEM_BINOP */
2387							ir_swap_ops(op_insn);
2388							ctx->rules[insn->op3] = IR_FUSED | IR_BINOP_INT;
2389							ctx->rules[op_insn->op1] = IR_SKIPPED | load_op;
2390							return IR_MEM_BINOP_INT;
2391						}
2392					} else if (rule == IR_INC) {
2393						if (insn->op1 == op_insn->op1
2394						 && ctx->ir_base[op_insn->op1].op == load_op
2395						 && ctx->ir_base[op_insn->op1].op2 == insn->op2
2396						 && ctx->use_lists[op_insn->op1].count == 2) {
2397							/* l = LOAD(_, a) ... v = INC(l) ... STORE(l, a, v) => SKIP ... SKIP ... MEM_INC */
2398							ctx->rules[insn->op3] = IR_SKIPPED | IR_INC;
2399							ctx->rules[op_insn->op1] = IR_SKIPPED | load_op;
2400							return IR_MEM_INC;
2401						}
2402					} else if (rule == IR_DEC) {
2403						if (insn->op1 == op_insn->op1
2404						 && ctx->ir_base[op_insn->op1].op == load_op
2405						 && ctx->ir_base[op_insn->op1].op2 == insn->op2
2406						 && ctx->use_lists[op_insn->op1].count == 2) {
2407							/* l = LOAD(_, a) ... v = DEC(l) ... STORE(l, a, v) => SKIP ... SKIP ... MEM_DEC */
2408							ctx->rules[insn->op3] = IR_SKIPPED | IR_DEC;
2409							ctx->rules[op_insn->op1] = IR_SKIPPED | load_op;
2410							return IR_MEM_DEC;
2411						}
2412					} else if (rule == IR_MUL_PWR2) {
2413						if (insn->op1 == op_insn->op1
2414						 && ctx->ir_base[op_insn->op1].op == load_op
2415						 && ctx->ir_base[op_insn->op1].op2 == insn->op2
2416						 && ctx->use_lists[op_insn->op1].count == 2) {
2417							/* l = LOAD(_, a) ... v = MUL_PWR2(l) ... STORE(l, a, v) => SKIP ... SKIP ... MEM_MUL_PWR2 */
2418							ctx->rules[insn->op3] = IR_SKIPPED | IR_MUL_PWR2;
2419							ctx->rules[op_insn->op1] = IR_SKIPPED | load_op;
2420							return IR_MEM_MUL_PWR2;
2421						}
2422					} else if (rule == IR_DIV_PWR2) {
2423						if (insn->op1 == op_insn->op1
2424						 && ctx->ir_base[op_insn->op1].op == load_op
2425						 && ctx->ir_base[op_insn->op1].op2 == insn->op2
2426						 && ctx->use_lists[op_insn->op1].count == 2) {
2427							/* l = LOAD(_, a) ... v = DIV_PWR2(l) ... STORE(l, a, v) => SKIP ... SKIP ... MEM_DIV_PWR2 */
2428							ctx->rules[insn->op3] = IR_SKIPPED | IR_DIV_PWR2;
2429							ctx->rules[op_insn->op1] = IR_SKIPPED | load_op;
2430							return IR_MEM_DIV_PWR2;
2431						}
2432					} else if (rule == IR_MOD_PWR2) {
2433						if (insn->op1 == op_insn->op1
2434						 && ctx->ir_base[op_insn->op1].op == load_op
2435						 && ctx->ir_base[op_insn->op1].op2 == insn->op2
2436						 && ctx->use_lists[op_insn->op1].count == 2) {
2437							/* l = LOAD(_, a) ... v = MOD_PWR2(l) ... STORE(l, a, v) => SKIP ... SKIP ... MEM_MOD_PWR2 */
2438							ctx->rules[insn->op3] = IR_SKIPPED | IR_MOD_PWR2;
2439							ctx->rules[op_insn->op1] = IR_SKIPPED | load_op;
2440							return IR_MEM_MOD_PWR2;
2441						}
2442					} else if (rule == IR_SHIFT) {
2443						if (insn->op1 == op_insn->op1
2444						 && ctx->ir_base[op_insn->op1].op == load_op
2445						 && ctx->ir_base[op_insn->op1].op2 == insn->op2
2446						 && ctx->use_lists[op_insn->op1].count == 2) {
2447							/* l = LOAD(_, a) ... v = SHIFT(l, _) ... STORE(l, a, v) => SKIP ... SKIP_SHIFT ... MEM_SHIFT */
2448							ctx->rules[insn->op3] = IR_FUSED | IR_SHIFT;
2449							ctx->rules[op_insn->op1] = IR_SKIPPED | load_op;
2450							return IR_MEM_SHIFT;
2451						}
2452					} else if (rule == IR_SHIFT_CONST) {
2453						if (insn->op1 == op_insn->op1
2454						 && ctx->ir_base[op_insn->op1].op == load_op
2455						 && ctx->ir_base[op_insn->op1].op2 == insn->op2
2456						 && ctx->use_lists[op_insn->op1].count == 2) {
2457							/* l = LOAD(_, a) ... v = SHIFT(l, CONST) ... STORE(l, a, v) => SKIP ... SKIP ... MEM_SHIFT_CONST */
2458							ctx->rules[insn->op3] = IR_SKIPPED | IR_SHIFT_CONST;
2459							ctx->rules[op_insn->op1] = IR_SKIPPED | load_op;
2460							return IR_MEM_SHIFT_CONST;
2461						}
2462					} else if (rule == IR_OP_INT && op_insn->op != IR_BSWAP) {
2463						if (insn->op1 == op_insn->op1
2464						 && ctx->ir_base[op_insn->op1].op == load_op
2465						 && ctx->ir_base[op_insn->op1].op2 == insn->op2
2466						 && ctx->use_lists[op_insn->op1].count == 2) {
2467							/* l = LOAD(_, a) ... v = OP(l) ... STORE(l, a, v) => SKIP ... SKIP ... MEM_OP */
2468							ctx->rules[insn->op3] = IR_SKIPPED | IR_OP_INT;
2469							ctx->rules[op_insn->op1] = IR_SKIPPED | load_op;
2470							return IR_MEM_OP_INT;
2471						}
2472					} else if (rule == IR_CMP_INT && load_op == IR_LOAD) {
2473						/* c = CMP(_, _) ... STORE(c) => SKIP_CMP ... CMP_AND_STORE_INT */
2474						ctx->rules[insn->op3] = IR_FUSED | IR_CMP_INT;
2475						return IR_CMP_AND_STORE_INT;
2476					}
2477				}
2478				return store_rule;
2479			} else {
2480				return IR_VSTORE_FP;
2481			}
2482			break;
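		/*
		 * Illustrative effect of the store_int fusion above: a value that
		 * is loaded, modified by a single-use instruction and stored back
		 * to the same address is matched into one read-modify-write memory
		 * instruction, e.g.
		 *
		 *   l = VLOAD(a); v = ADD(l, 1); VSTORE(a, v)
		 *
		 * becomes IR_MEM_INC ("inc [a]"), and the BINOP variants become
		 * "add/sub/and/or/xor [a], src".  The use-count checks above ensure
		 * that no other instruction observes the intermediate values.
		 */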
2483		case IR_LOAD:
2484			ir_match_fuse_addr(ctx, insn->op2);
2485			if (IR_IS_TYPE_INT(insn->type)) {
2486				return IR_LOAD_INT;
2487			} else {
2488				return IR_LOAD_FP;
2489			}
2490			break;
2491		case IR_STORE:
2492			ir_match_fuse_addr(ctx, insn->op2);
2493			if (IR_IS_TYPE_INT(ctx->ir_base[insn->op3].type)) {
2494				store_rule = IR_STORE_INT;
2495				load_op = IR_LOAD;
2496				goto store_int;
2497			} else {
2498				return IR_STORE_FP;
2499			}
2500			break;
2501		case IR_RLOAD:
2502			if (IR_REGSET_IN(IR_REGSET_UNION((ir_regset)ctx->fixed_regset, IR_REGSET_FIXED), insn->op2)) {
2503				return IR_SKIPPED | IR_RLOAD;
2504			}
2505			return IR_RLOAD;
2506		case IR_RSTORE:
2507			if (IR_IS_TYPE_INT(ctx->ir_base[insn->op2].type)) {
2508				if ((ctx->flags & IR_OPT_CODEGEN)
2509				 && ir_in_same_block(ctx, insn->op2)
2510				 && ctx->use_lists[insn->op2].count == 1
2511				 && IR_IS_TYPE_INT(ctx->ir_base[insn->op2].type)) {
2512					ir_insn *op_insn = &ctx->ir_base[insn->op2];
2513
2514					if (op_insn->op == IR_ADD ||
2515				        op_insn->op == IR_SUB ||
2516//				        op_insn->op == IR_MUL ||
2517				        op_insn->op == IR_OR  ||
2518				        op_insn->op == IR_AND ||
2519				        op_insn->op == IR_XOR) {
2520						if (insn->op1 == op_insn->op1
2521						 && ctx->ir_base[op_insn->op1].op == IR_RLOAD
2522						 && ctx->ir_base[op_insn->op1].op2 == insn->op3
2523						 && ctx->use_lists[op_insn->op1].count == 2) {
2524							/* l = RLOAD(r) ... v = BINOP(l, _) ... RSTORE(l, r, v) => SKIP ... SKIP_REG_BINOP ... REG_BINOP */
2525							ctx->rules[insn->op2] = IR_FUSED | IR_BINOP_INT;
2526							ctx->rules[op_insn->op1] = IR_SKIPPED | IR_RLOAD;
2527							return IR_REG_BINOP_INT;
2528						} else if ((ir_op_flags[op_insn->op] & IR_OP_FLAG_COMMUTATIVE)
2529						 && insn->op1 == op_insn->op2
2530						 && ctx->ir_base[op_insn->op2].op == IR_RLOAD
2531						 && ctx->ir_base[op_insn->op2].op2 == insn->op3
2532						 && ctx->use_lists[op_insn->op2].count == 2) {
2533							/* l = RLOAD(r) ... v = BINOP(x, l) ... RSTORE(l, r, v) => SKIP ... SKIP_REG_BINOP ... REG_BINOP */
2534							ir_swap_ops(op_insn);
2535							ctx->rules[insn->op2] = IR_FUSED | IR_BINOP_INT;
2536							ctx->rules[op_insn->op1] = IR_SKIPPED | IR_RLOAD;
2537							return IR_REG_BINOP_INT;
2538						}
2539					}
2540				}
2541			}
2542			ir_match_fuse_load(ctx, insn->op2, ref);
2543			return IR_RSTORE;
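		/*
		 * The RSTORE fusion above is the register analogue of store_int:
		 * l = RLOAD(r); v = BINOP(l, x); RSTORE(r, v) is matched into
		 * IR_REG_BINOP_INT so the operation is applied directly to the
		 * fixed register, e.g. "add r14, rax" (register names are
		 * illustrative).
		 */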
2544		case IR_START:
2545		case IR_BEGIN:
2546		case IR_IF_TRUE:
2547		case IR_IF_FALSE:
2548		case IR_CASE_VAL:
2549		case IR_CASE_DEFAULT:
2550		case IR_MERGE:
2551		case IR_LOOP_BEGIN:
2552		case IR_UNREACHABLE:
2553			return IR_SKIPPED | insn->op;
2554		case IR_RETURN:
2555			if (!insn->op2) {
2556				return IR_RETURN_VOID;
2557			} else if (IR_IS_TYPE_INT(ctx->ir_base[insn->op2].type)) {
2558				return IR_RETURN_INT;
2559			} else {
2560				return IR_RETURN_FP;
2561			}
2562		case IR_IF:
2563			if (!IR_IS_CONST_REF(insn->op2) && ctx->use_lists[insn->op2].count == 1) {
2564				op2_insn = &ctx->ir_base[insn->op2];
2565				if (op2_insn->op >= IR_EQ && op2_insn->op <= IR_UGT) {
2566					if (IR_IS_TYPE_INT(ctx->ir_base[op2_insn->op1].type)) {
2567						if (IR_IS_CONST_REF(op2_insn->op2)
2568						 && !IR_IS_SYM_CONST(ctx->ir_base[op2_insn->op2].op)
2569						 && ctx->ir_base[op2_insn->op2].val.i64 == 0
2570						 && op2_insn->op1 == insn->op2 - 1) { /* previous instruction */
2571							ir_insn *op1_insn = &ctx->ir_base[op2_insn->op1];
2572
2573							if (op1_insn->op == IR_AND && ctx->use_lists[op2_insn->op1].count == 1) {
2574								/* v = AND(_, _); c = CMP(v, 0) ... IF(c) => SKIP_TEST; SKIP ... TEST_AND_BRANCH */
2575								ir_match_fuse_load_test_int(ctx, op1_insn, ref);
2576								ctx->rules[op2_insn->op1] = IR_FUSED | IR_TEST_INT;
2577								ctx->rules[insn->op2] = IR_FUSED | IR_SIMPLE | IR_NOP;
2578								return IR_TEST_AND_BRANCH_INT;
2579							} else if (insn->op2 == ref - 1 && /* previous instruction */
2580									((op1_insn->op == IR_OR || op1_insn->op == IR_AND || op1_insn->op == IR_XOR) ||
2581										/* GT(ADD(_, _), 0) can't be optimized because ADD may overflow */
2582										((op1_insn->op == IR_ADD || op1_insn->op == IR_SUB) &&
2583											(op2_insn->op == IR_EQ || op2_insn->op == IR_NE)))) {
2584								/* v = BINOP(_, _); c = CMP(v, 0) ... IF(c) => BINOP; SKIP_CMP ... JCC */
2585								if (ir_op_flags[op1_insn->op] & IR_OP_FLAG_COMMUTATIVE) {
2586									ir_match_fuse_load_commutative_int(ctx, op1_insn, ref);
2587									ctx->rules[op2_insn->op1] = IR_BINOP_INT | IR_MAY_SWAP;
2588								} else {
2589									ir_match_fuse_load(ctx, op1_insn->op2, ref);
2590									ctx->rules[op2_insn->op1] = IR_BINOP_INT;
2591								}
2592								ctx->rules[insn->op2] = IR_FUSED | IR_CMP_INT;
2593								return IR_JCC_INT;
2594							}
2595						}
2596						/* c = CMP(_, _) ... IF(c) => SKIP_CMP ... CMP_AND_BRANCH */
2597						ir_match_fuse_load_cmp_int(ctx, op2_insn, ref);
2598						ctx->rules[insn->op2] = IR_FUSED | IR_CMP_INT;
2599						return IR_CMP_AND_BRANCH_INT;
2600					} else {
2601						/* c = CMP(_, _) ... IF(c) => SKIP_CMP ... CMP_AND_BRANCH */
2602						ir_match_fuse_load_cmp_fp_br(ctx, op2_insn, ref, 1);
2603						ctx->rules[insn->op2] = IR_FUSED | IR_CMP_FP;
2604						return IR_CMP_AND_BRANCH_FP;
2605					}
2606				} else if (op2_insn->op == IR_AND) {
2607					/* c = AND(_, _) ... IF(c) => SKIP_TEST ... TEST_AND_BRANCH */
2608					ir_match_fuse_load_test_int(ctx, op2_insn, ref);
2609					ctx->rules[insn->op2] = IR_FUSED | IR_TEST_INT;
2610					return IR_TEST_AND_BRANCH_INT;
2611				} else if (op2_insn->op == IR_OVERFLOW && ir_in_same_block(ctx, insn->op2)) {
2612					/* c = OVERFLOW(_) ... IF(c) => SKIP_OVERFLOW ... OVERFLOW_AND_BRANCH */
2613					ctx->rules[insn->op2] = IR_FUSED | IR_SIMPLE | IR_OVERFLOW;
2614					return IR_OVERFLOW_AND_BRANCH;
2615				}
2616			}
2617			if (IR_IS_TYPE_INT(ctx->ir_base[insn->op2].type)) {
2618				if (insn->op2 == ref - 1) { /* previous instruction */
2619					op2_insn = &ctx->ir_base[insn->op2];
2620					if (op2_insn->op == IR_ADD ||
2621					    op2_insn->op == IR_SUB ||
2622//					    op2_insn->op == IR_MUL ||
2623					    op2_insn->op == IR_OR  ||
2624					    op2_insn->op == IR_AND ||
2625					    op2_insn->op == IR_XOR) {
2626
2627						/* v = BINOP(_, _); IF(v) => BINOP; JCC */
2628						if (ir_op_flags[op2_insn->op] & IR_OP_FLAG_COMMUTATIVE) {
2629							ir_match_fuse_load_commutative_int(ctx, op2_insn, ref);
2630							ctx->rules[insn->op2] = IR_BINOP_INT | IR_MAY_SWAP;
2631						} else {
2632							ir_match_fuse_load(ctx, op2_insn->op2, ref);
2633							ctx->rules[insn->op2] = IR_BINOP_INT;
2634						}
2635						return IR_JCC_INT;
2636					}
2637				} else if ((ctx->flags & IR_OPT_CODEGEN)
2638				 && insn->op1 == ref - 1 /* previous instruction */
2639				 && insn->op2 == ref - 2 /* previous instruction */
2640				 && ctx->use_lists[insn->op2].count == 2
2641				 && IR_IS_TYPE_INT(ctx->ir_base[insn->op2].type)) {
2642					ir_insn *store_insn = &ctx->ir_base[insn->op1];
2643
2644					if (store_insn->op == IR_STORE && store_insn->op3 == insn->op2) {
2645						ir_insn *op_insn = &ctx->ir_base[insn->op2];
2646
2647						if (op_insn->op == IR_ADD ||
2648						    op_insn->op == IR_SUB ||
2649//						    op_insn->op == IR_MUL ||
2650						    op_insn->op == IR_OR  ||
2651						    op_insn->op == IR_AND ||
2652						    op_insn->op == IR_XOR) {
2653							if (ctx->ir_base[op_insn->op1].op == IR_LOAD
2654							 && ctx->ir_base[op_insn->op1].op2 == store_insn->op2) {
2655								if (ir_in_same_block(ctx, op_insn->op1)
2656								 && ctx->use_lists[op_insn->op1].count == 2
2657								 && store_insn->op1 == op_insn->op1) {
2658									/* v = MEM_BINOP(_, _); IF(v) => MEM_BINOP; JCC */
2659									ctx->rules[insn->op2] = IR_FUSED | IR_BINOP_INT;
2660									ctx->rules[op_insn->op1] = IR_SKIPPED | IR_LOAD;
2661									ir_match_fuse_addr(ctx, store_insn->op2);
2662									ctx->rules[insn->op1] = IR_MEM_BINOP_INT;
2663									return IR_JCC_INT;
2664								}
2665							} else if ((ir_op_flags[op_insn->op] & IR_OP_FLAG_COMMUTATIVE)
2666							 && ctx->ir_base[op_insn->op2].op == IR_LOAD
2667							 && ctx->ir_base[op_insn->op2].op2 == store_insn->op2) {
2668								if (ir_in_same_block(ctx, op_insn->op2)
2669								 && ctx->use_lists[op_insn->op2].count == 2
2670								 && store_insn->op1 == op_insn->op2) {
2671									/* v = MEM_BINOP(_, _); IF(v) => MEM_BINOP; JCC */
2672									ir_swap_ops(op_insn);
2673									ctx->rules[insn->op2] = IR_FUSED | IR_BINOP_INT;
2674									ctx->rules[op_insn->op1] = IR_SKIPPED | IR_LOAD;
2675									ir_match_fuse_addr(ctx, store_insn->op2);
2676									ctx->rules[insn->op1] = IR_MEM_BINOP_INT;
2677									return IR_JCC_INT;
2678								}
2679							}
2680						}
2681					}
2682				}
2683				ir_match_fuse_load(ctx, insn->op2, ref);
2684				return IR_IF_INT;
2685			} else {
2686				IR_ASSERT(0 && "NIY IR_IF_FP");
2687				break;
2688			}
2689		case IR_COND:
2690			if (!IR_IS_CONST_REF(insn->op1) && ctx->use_lists[insn->op1].count == 1) {
2691				ir_insn *op1_insn = &ctx->ir_base[insn->op1];
2692
2693				if (op1_insn->op >= IR_EQ && op1_insn->op <= IR_UGT) {
2694					if (IR_IS_TYPE_INT(ctx->ir_base[op1_insn->op1].type)) {
2695						ir_match_fuse_load_cmp_int(ctx, op1_insn, ref);
2696						ctx->rules[insn->op1] = IR_FUSED | IR_CMP_INT;
2697						return IR_COND_CMP_INT;
2698					} else {
2699						ir_match_fuse_load_cmp_fp_br(ctx, op1_insn, ref, 1);
2700						ctx->rules[insn->op1] = IR_FUSED | IR_CMP_FP;
2701						return IR_COND_CMP_FP;
2702					}
2703				}
2704			}
2705			return IR_COND;
2706		case IR_GUARD:
2707		case IR_GUARD_NOT:
2708			if (!IR_IS_CONST_REF(insn->op2) && ctx->use_lists[insn->op2].count == 1) {
2709				op2_insn = &ctx->ir_base[insn->op2];
2710				if (op2_insn->op >= IR_EQ && op2_insn->op <= IR_UGT
2711					// TODO: register allocator may clobber operands of CMP before they are used in the GUARD_CMP
2712				 && (insn->op2 == ref - 1 ||
2713				     (insn->op2 == ctx->prev_ref[ref] - 1
2714				   && ctx->ir_base[ctx->prev_ref[ref]].op == IR_SNAPSHOT))) {
2715					if (IR_IS_TYPE_INT(ctx->ir_base[op2_insn->op1].type)) {
2716						if (IR_IS_CONST_REF(op2_insn->op2)
2717						 && !IR_IS_SYM_CONST(ctx->ir_base[op2_insn->op2].op)
2718						 && ctx->ir_base[op2_insn->op2].val.i64 == 0) {
2719							if (op2_insn->op1 == insn->op2 - 1) { /* previous instruction */
2720								ir_insn *op1_insn = &ctx->ir_base[op2_insn->op1];
2721
2722								if ((op1_insn->op == IR_OR || op1_insn->op == IR_AND || op1_insn->op == IR_XOR) ||
2723										/* GT(ADD(_, _), 0) can't be optimized because ADD may overflow */
2724										((op1_insn->op == IR_ADD || op1_insn->op == IR_SUB) &&
2725											(op2_insn->op == IR_EQ || op2_insn->op == IR_NE))) {
2726									if (ir_op_flags[op1_insn->op] & IR_OP_FLAG_COMMUTATIVE) {
2727										ir_match_fuse_load_commutative_int(ctx, op1_insn, ref);
2728										ctx->rules[op2_insn->op1] = IR_BINOP_INT | IR_MAY_SWAP;
2729									} else {
2730										ir_match_fuse_load(ctx, op1_insn->op2, ref);
2731										ctx->rules[op2_insn->op1] = IR_BINOP_INT;
2732									}
2733									/* v = BINOP(_, _); c = CMP(v, 0) ... GUARD(c) => BINOP; SKIP_CMP ... GUARD_JCC */
2734									ctx->rules[insn->op2] = IR_FUSED | IR_CMP_INT;
2735									return IR_GUARD_JCC_INT;
2736								}
2737							} else if ((ctx->flags & IR_OPT_CODEGEN)
2738							 && op2_insn->op1 == insn->op2 - 2 /* before previous instruction */
2739							 && ir_in_same_block(ctx, op2_insn->op1)
2740							 && ctx->use_lists[op2_insn->op1].count == 2) {
2741								ir_insn *store_insn = &ctx->ir_base[insn->op2 - 1];
2742
2743								if (store_insn->op == IR_STORE && store_insn->op3 == op2_insn->op1) {
2744									ir_insn *op_insn = &ctx->ir_base[op2_insn->op1];
2745
2746									if ((op_insn->op == IR_OR || op_insn->op == IR_AND || op_insn->op == IR_XOR) ||
2747											/* GT(ADD(_, _), 0) can't be optimized because ADD may overflow */
2748											((op_insn->op == IR_ADD || op_insn->op == IR_SUB) &&
2749												(op2_insn->op == IR_EQ || op2_insn->op == IR_NE))) {
2750										if (ctx->ir_base[op_insn->op1].op == IR_LOAD
2751										 && ctx->ir_base[op_insn->op1].op2 == store_insn->op2) {
2752											if (ir_in_same_block(ctx, op_insn->op1)
2753											 && ctx->use_lists[op_insn->op1].count == 2
2754											 && store_insn->op1 == op_insn->op1) {
2755												/* v = MEM_BINOP(_, _); GUARD(v) => MEM_BINOP; GUARD_JCC */
2756												ctx->rules[op2_insn->op1] = IR_FUSED | IR_BINOP_INT;
2757												ctx->rules[op_insn->op1] = IR_SKIPPED | IR_LOAD;
2758												ir_match_fuse_addr(ctx, store_insn->op2);
2759												ctx->rules[insn->op2 - 1] = IR_MEM_BINOP_INT;
2760												ctx->rules[insn->op2] = IR_SKIPPED | IR_NOP;
2761												return IR_GUARD_JCC_INT;
2762											}
2763										} else if ((ir_op_flags[op_insn->op] & IR_OP_FLAG_COMMUTATIVE)
2764										 && ctx->ir_base[op_insn->op2].op == IR_LOAD
2765										 && ctx->ir_base[op_insn->op2].op2 == store_insn->op2) {
2766											if (ir_in_same_block(ctx, op_insn->op2)
2767											 && ctx->use_lists[op_insn->op2].count == 2
2768											 && store_insn->op1 == op_insn->op2) {
2769												/* v = MEM_BINOP(_, _); GUARD(v) => MEM_BINOP; GUARD_JCC */
2770												ir_swap_ops(op_insn);
2771												ctx->rules[op2_insn->op1] = IR_FUSED | IR_BINOP_INT;
2772												ctx->rules[op_insn->op1] = IR_SKIPPED | IR_LOAD;
2773												ir_match_fuse_addr(ctx, store_insn->op2);
2774												ctx->rules[insn->op2 - 1] = IR_MEM_BINOP_INT;
2775												ctx->rules[insn->op2] = IR_SKIPPED | IR_NOP;
2776												return IR_GUARD_JCC_INT;
2777											}
2778										}
2779									}
2780								}
2781							}
2782						}
2783						/* c = CMP(_, _) ... GUARD(c) => SKIP_CMP ... GUARD_CMP */
2784						ir_match_fuse_load_cmp_int(ctx, op2_insn, ref);
2785						ctx->rules[insn->op2] = IR_FUSED | IR_CMP_INT;
2786						return IR_GUARD_CMP_INT;
2787					} else {
2788						/* c = CMP(_, _) ... GUARD(c) => SKIP_CMP ... GUARD_CMP */
2789						ir_match_fuse_load_cmp_fp_br(ctx, op2_insn, ref, insn->op == IR_GUARD_NOT);
2790						ctx->rules[insn->op2] = IR_FUSED | IR_CMP_FP;
2791						return IR_GUARD_CMP_FP;
2792					}
2793				} else if (op2_insn->op == IR_AND) { // TODO: OR, XOR, etc.
2794					/* c = AND(_, _) ... GUARD(c) => SKIP_TEST ... GUARD_TEST */
2795					ir_match_fuse_load_test_int(ctx, op2_insn, ref);
2796					ctx->rules[insn->op2] = IR_FUSED | IR_TEST_INT;
2797					return IR_GUARD_TEST_INT;
2798				} else if (op2_insn->op == IR_OVERFLOW && ir_in_same_block(ctx, insn->op2)) {
2799					/* c = OVERFLOW(_) ... GUARD(c) => SKIP_OVERFLOW ... GUARD_OVERFLOW */
2800					ctx->rules[insn->op2] = IR_FUSED | IR_SIMPLE | IR_OVERFLOW;
2801					return IR_GUARD_OVERFLOW;
2802				}
2803			}
2804			ir_match_fuse_load(ctx, insn->op2, ref);
2805			return insn->op;
2806		case IR_INT2FP:
2807			if (ir_type_size[ctx->ir_base[insn->op1].type] > (IR_IS_TYPE_SIGNED(ctx->ir_base[insn->op1].type) ? 2 : 4)) {
2808				ir_match_fuse_load(ctx, insn->op1, ref);
2809			}
2810			return insn->op;
2811		case IR_SEXT:
2812		case IR_ZEXT:
2813		case IR_FP2INT:
2814		case IR_FP2FP:
2815			ir_match_fuse_load(ctx, insn->op1, ref);
2816			return insn->op;
2817		case IR_TRUNC:
2818		case IR_PROTO:
2819			ir_match_fuse_load(ctx, insn->op1, ref);
2820			return insn->op | IR_MAY_REUSE;
2821		case IR_BITCAST:
2822			ir_match_fuse_load(ctx, insn->op1, ref);
2823			if (IR_IS_TYPE_INT(insn->type) && IR_IS_TYPE_INT(ctx->ir_base[insn->op1].type)) {
2824				return insn->op | IR_MAY_REUSE;
2825			} else {
2826				return insn->op;
2827			}
2828		case IR_CTLZ:
2829		case IR_CTTZ:
2830			ir_match_fuse_load(ctx, insn->op1, ref);
2831			return IR_BIT_COUNT;
2832		case IR_CTPOP:
2833			ir_match_fuse_load(ctx, insn->op1, ref);
2834			return (ctx->mflags & IR_X86_BMI1) ? IR_BIT_COUNT : IR_CTPOP;
2835		case IR_VA_START:
2836			ctx->flags2 |= IR_HAS_VA_START;
2837			if ((ctx->ir_base[insn->op2].op == IR_ALLOCA) || (ctx->ir_base[insn->op2].op == IR_VADDR)) {
2838				ir_use_list *use_list = &ctx->use_lists[insn->op2];
2839				ir_ref *p, n = use_list->count;
2840				for (p = &ctx->use_edges[use_list->refs]; n > 0; p++, n--) {
2841					ir_insn *use_insn = &ctx->ir_base[*p];
2842					if (use_insn->op == IR_VA_START || use_insn->op == IR_VA_END) {
						/* pass */
2843					} else if (use_insn->op == IR_VA_COPY) {
2844						if (use_insn->op3 == insn->op2) {
2845							ctx->flags2 |= IR_HAS_VA_COPY;
2846						}
2847					} else if (use_insn->op == IR_VA_ARG) {
2848						if (use_insn->op2 == insn->op2) {
2849							if (IR_IS_TYPE_INT(use_insn->type)) {
2850								ctx->flags2 |= IR_HAS_VA_ARG_GP;
2851							} else {
2852								IR_ASSERT(IR_IS_TYPE_FP(use_insn->type));
2853								ctx->flags2 |= IR_HAS_VA_ARG_FP;
2854							}
2855						}
2856					} else if (*p > ref) {
2857						/* direct va_list access */
2858						ctx->flags2 |= IR_HAS_VA_ARG_GP|IR_HAS_VA_ARG_FP;
2859					}
2860				}
2861			}
2862			return IR_VA_START;
2863		case IR_VA_END:
2864			return IR_SKIPPED | IR_NOP;
2865		case IR_VADDR:
2866			if (ctx->use_lists[ref].count > 0) {
2867				ir_use_list *use_list = &ctx->use_lists[ref];
2868				ir_ref *p, n = use_list->count;
2869
2870				for (p = &ctx->use_edges[use_list->refs]; n > 0; p++, n--) {
2871					if (ctx->ir_base[*p].op != IR_VA_END) {
2872						return IR_STATIC_ALLOCA;
2873					}
2874				}
2875			}
2876			return IR_SKIPPED | IR_NOP;
2877		default:
2878			break;
2879	}
2880
2881	return insn->op;
2882}
2883
2884static void ir_match_insn2(ir_ctx *ctx, ir_ref ref, uint32_t rule)
2885{
2886	if (rule == IR_LEA_IB) {
2887		ir_match_try_revert_lea_to_add(ctx, ref);
2888	}
2889}
2890
2891/* code generation */
2892static int32_t ir_ref_spill_slot_offset(ir_ctx *ctx, ir_ref ref, ir_reg *reg)
2893{
2894	int32_t offset;
2895
2896	IR_ASSERT(ref >= 0 && ctx->vregs[ref] && ctx->live_intervals[ctx->vregs[ref]]);
2897	offset = ctx->live_intervals[ctx->vregs[ref]]->stack_spill_pos;
2898	IR_ASSERT(offset != -1);
2899	if (ctx->live_intervals[ctx->vregs[ref]]->flags & IR_LIVE_INTERVAL_SPILL_SPECIAL) {
2900		IR_ASSERT(ctx->spill_base != IR_REG_NONE);
2901		*reg = ctx->spill_base;
2902		return offset;
2903	}
2904	*reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
2905	return IR_SPILL_POS_TO_OFFSET(offset);
2906}
2907
2908static ir_mem ir_vreg_spill_slot(ir_ctx *ctx, ir_ref v)
2909{
2910	int32_t offset;
2911	ir_reg base;
2912
2913	IR_ASSERT(v > 0 && v <= ctx->vregs_count && ctx->live_intervals[v]);
2914	offset = ctx->live_intervals[v]->stack_spill_pos;
2915	IR_ASSERT(offset != -1);
2916	if (ctx->live_intervals[v]->flags & IR_LIVE_INTERVAL_SPILL_SPECIAL) {
2917		IR_ASSERT(ctx->spill_base != IR_REG_NONE);
2918		return IR_MEM_BO(ctx->spill_base, offset);
2919	}
2920	base = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
2921	offset = IR_SPILL_POS_TO_OFFSET(offset);
2922	return IR_MEM_BO(base, offset);
2923}
2924
2925static ir_mem ir_ref_spill_slot(ir_ctx *ctx, ir_ref ref)
2926{
2927	IR_ASSERT(!IR_IS_CONST_REF(ref));
2928	return ir_vreg_spill_slot(ctx, ctx->vregs[ref]);
2929}
2930
2931static bool ir_is_same_spill_slot(ir_ctx *ctx, ir_ref ref, ir_mem mem)
2932{
2933	ir_mem m = ir_ref_spill_slot(ctx, ref);
2934	return IR_MEM_VAL(m) == IR_MEM_VAL(mem);
2935}
2936
2937static ir_mem ir_var_spill_slot(ir_ctx *ctx, ir_ref ref)
2938{
2939	ir_insn *var_insn = &ctx->ir_base[ref];
2940	ir_reg reg;
2941
2942	IR_ASSERT(var_insn->op == IR_VAR);
2943	reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
2944	return IR_MEM_BO(reg, IR_SPILL_POS_TO_OFFSET(var_insn->op3));
2945}
2946
2947static bool ir_may_avoid_spill_load(ir_ctx *ctx, ir_ref ref, ir_ref use)
2948{
2949	ir_live_interval *ival;
2950
2951	IR_ASSERT(ctx->vregs[ref] && ctx->live_intervals[ctx->vregs[ref]]);
2952	ival = ctx->live_intervals[ctx->vregs[ref]];
2953	while (ival) {
2954		ir_use_pos *use_pos = ival->use_pos;
2955		while (use_pos) {
2956			if (IR_LIVE_POS_TO_REF(use_pos->pos) == use) {
2957				return !use_pos->next || use_pos->next->op_num == 0;
2958			}
2959			use_pos = use_pos->next;
2960		}
2961		ival = ival->next;
2962	}
2963	return 0;
2964}
2965
2966static void ir_emit_mov_imm_int(ir_ctx *ctx, ir_type type, ir_reg reg, int64_t val)
2967{
2968	ir_backend_data *data = ctx->data;
2969	dasm_State **Dst = &data->dasm_state;
2970
2971	if (ir_type_size[type] == 8) {
2972		IR_ASSERT(sizeof(void*) == 8);
2973|.if X64
2974		if (IR_IS_UNSIGNED_32BIT(val)) {
2975			|	mov Rd(reg), (uint32_t)val // zero extended load
2976		} else if (IR_IS_SIGNED_32BIT(val)) {
2977			|	mov Rq(reg), (int32_t)val // sign extended load
2978		} else if (type == IR_ADDR && IR_MAY_USE_32BIT_ADDR(ctx->code_buffer, (intptr_t)val)) {
2979			|	lea Ra(reg), [&val]
2980		} else {
2981			|	mov64 Ra(reg), val
2982		}
2983|.endif
2984	} else {
2985		|	ASM_REG_IMM_OP mov, type, reg, (int32_t)val // sign extended load
2986	}
2987}
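
/*
 * The immediate forms above follow the x86-64 encoding rules: writing a
 * 32-bit register ("mov Rd(reg), imm32") implicitly zero-extends into the
 * full 64-bit register, "mov Rq(reg), imm32" sign-extends, and only values
 * that fit neither form need the 10-byte mov64 (MOVABS).  The RIP-relative
 * LEA is preferred for addresses within +/- 2GB of the code buffer because
 * it is shorter than MOVABS.
 */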
2988
2989static void ir_emit_load_imm_int(ir_ctx *ctx, ir_type type, ir_reg reg, int64_t val)
2990{
2991	ir_backend_data *data = ctx->data;
2992	dasm_State **Dst = &data->dasm_state;
2993
2994	IR_ASSERT(IR_IS_TYPE_INT(type));
2995	if (val == 0) {
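		/* "xor reg, reg" is the canonical zeroing idiom: it has a shorter
		 * encoding than "mov reg, 0" and is recognized by modern CPUs as a
		 * dependency-breaking instruction */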
2996		|	ASM_REG_REG_OP xor, type, reg, reg
2997	} else {
2998		ir_emit_mov_imm_int(ctx, type, reg, val);
2999	}
3000}
3001
3002static void ir_emit_load_mem_int(ir_ctx *ctx, ir_type type, ir_reg reg, ir_mem mem)
3003{
3004	ir_backend_data *data = ctx->data;
3005	dasm_State **Dst = &data->dasm_state;
3006
3007	|	ASM_REG_MEM_OP mov, type, reg, mem
3008}
3009
3010static void ir_emit_load_imm_fp(ir_ctx *ctx, ir_type type, ir_reg reg, ir_ref src)
3011{
3012	ir_backend_data *data = ctx->data;
3013	dasm_State **Dst = &data->dasm_state;
3014	ir_insn *insn = &ctx->ir_base[src];
3015	int label;
3016
3017	if (type == IR_FLOAT && insn->val.u32 == 0) {
3018		if (ctx->mflags & IR_X86_AVX) {
3019			|	vxorps xmm(reg-IR_REG_FP_FIRST), xmm(reg-IR_REG_FP_FIRST), xmm(reg-IR_REG_FP_FIRST)
3020		} else {
3021			|	xorps xmm(reg-IR_REG_FP_FIRST), xmm(reg-IR_REG_FP_FIRST)
3022		}
3023	} else if (type == IR_DOUBLE && insn->val.u64 == 0) {
3024		if (ctx->mflags & IR_X86_AVX) {
3025			|	vxorpd xmm(reg-IR_REG_FP_FIRST), xmm(reg-IR_REG_FP_FIRST), xmm(reg-IR_REG_FP_FIRST)
3026		} else {
3027			|	xorpd xmm(reg-IR_REG_FP_FIRST), xmm(reg-IR_REG_FP_FIRST)
3028		}
3029	} else {
3030		label = ir_const_label(ctx, src);
3031		|	ASM_FP_REG_TXT_OP movs, type, reg, [=>label]
3032	}
3033}
3034
3035static void ir_emit_load_mem_fp(ir_ctx *ctx, ir_type type, ir_reg reg, ir_mem mem)
3036{
3037	ir_backend_data *data = ctx->data;
3038	dasm_State **Dst = &data->dasm_state;
3039
3040	|	ASM_FP_REG_MEM_OP movs, type, reg, mem
3041}
3042
3043static void ir_emit_load_mem(ir_ctx *ctx, ir_type type, ir_reg reg, ir_mem mem)
3044{
3045	if (IR_IS_TYPE_INT(type)) {
3046		ir_emit_load_mem_int(ctx, type, reg, mem);
3047	} else {
3048		ir_emit_load_mem_fp(ctx, type, reg, mem);
3049	}
3050}
3051
3052static void ir_load_local_addr(ir_ctx *ctx, ir_reg reg, ir_ref src)
3053{
3054	ir_backend_data *data = ctx->data;
3055	dasm_State **Dst = &data->dasm_state;
3056	ir_reg base = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
3057	int32_t offset = IR_SPILL_POS_TO_OFFSET(ctx->ir_base[src].op3);
3058
3059	IR_ASSERT(ir_rule(ctx, src) == IR_STATIC_ALLOCA);
3060	if (offset == 0) {
3061		| mov Ra(reg), Ra(base)
3062	} else {
3063		| lea Ra(reg), [Ra(base)+offset]
3064	}
3065}
3066
3067static void ir_emit_load(ir_ctx *ctx, ir_type type, ir_reg reg, ir_ref src)
3068{
3069	if (IR_IS_CONST_REF(src)) {
3070		if (IR_IS_TYPE_INT(type)) {
3071			ir_insn *insn = &ctx->ir_base[src];
3072
3073			if (insn->op == IR_SYM || insn->op == IR_FUNC) {
3074				void *addr = ir_sym_val(ctx, insn);
3075				ir_emit_load_imm_int(ctx, type, reg, (intptr_t)addr);
3076			} else if (insn->op == IR_STR) {
3077				ir_backend_data *data = ctx->data;
3078				dasm_State **Dst = &data->dasm_state;
3079				int label = ir_const_label(ctx, src);
3080
3081				|	lea Ra(reg), aword [=>label]
3082			} else {
3083				ir_emit_load_imm_int(ctx, type, reg, insn->val.i64);
3084			}
3085		} else {
3086			ir_emit_load_imm_fp(ctx, type, reg, src);
3087		}
3088	} else if (ctx->vregs[src]) {
3089		ir_emit_load_mem(ctx, type, reg, ir_ref_spill_slot(ctx, src));
3090	} else {
3091		ir_load_local_addr(ctx, reg, src);
3092	}
3093}
3094
3095static void ir_emit_store_mem_int(ir_ctx *ctx, ir_type type, ir_mem mem, ir_reg reg)
3096{
3097	ir_backend_data *data = ctx->data;
3098	dasm_State **Dst = &data->dasm_state;
3099
3100	|	ASM_MEM_REG_OP mov, type, mem, reg
3101}
3102
3103static void ir_emit_store_mem_fp(ir_ctx *ctx, ir_type type, ir_mem mem, ir_reg reg)
3104{
3105	ir_backend_data *data = ctx->data;
3106	dasm_State **Dst = &data->dasm_state;
3107
3108	|	ASM_FP_MEM_REG_OP movs, type, mem, reg
3109}
3110
3111static void ir_emit_store_mem_imm(ir_ctx *ctx, ir_type type, ir_mem mem, int32_t imm)
3112{
3113	ir_backend_data *data = ctx->data;
3114	dasm_State **Dst = &data->dasm_state;
3115
3116	|	ASM_MEM_IMM_OP mov, type, mem, imm
3117}
3118
3119static void ir_emit_store_mem_int_const(ir_ctx *ctx, ir_type type, ir_mem mem, ir_ref src, ir_reg tmp_reg, bool is_arg)
3120{
3121	ir_backend_data *data = ctx->data;
3122	dasm_State **Dst = &data->dasm_state;
3123	ir_insn *val_insn = &ctx->ir_base[src];
3124
3125	IR_ASSERT(IR_IS_CONST_REF(src));
3126	if (val_insn->op == IR_STR) {
3127		int label = ir_const_label(ctx, src);
3128
3129		IR_ASSERT(tmp_reg != IR_REG_NONE);
3130|.if X64
3131		|	lea Ra(tmp_reg), aword [=>label]
3132||		ir_emit_store_mem_int(ctx, type, mem, tmp_reg);
3133|.else
3134		|	ASM_TMEM_TXT_OP mov, aword, mem, =>label
3135|.endif
3136	} else {
3137		int64_t val = val_insn->val.i64;
3138
3139		if (val_insn->op == IR_FUNC || val_insn->op == IR_SYM) {
3140			val = (int64_t)(intptr_t)ir_sym_val(ctx, val_insn);
3141		}
3142
3143		if (sizeof(void*) == 4 || IR_IS_SIGNED_32BIT(val)) {
3144			if (is_arg && ir_type_size[type] < 4) {
3145				type = IR_U32;
3146			}
3147			ir_emit_store_mem_imm(ctx, type, mem, val);
3148		} else {
3149			IR_ASSERT(tmp_reg != IR_REG_NONE);
3150			tmp_reg = IR_REG_NUM(tmp_reg);
3151			ir_emit_load_imm_int(ctx, type, tmp_reg, val);
3152			ir_emit_store_mem_int(ctx, type, mem, tmp_reg);
3153		}
3154	}
3155}
3156
3157static void ir_emit_store_mem_fp_const(ir_ctx *ctx, ir_type type, ir_mem mem, ir_ref src, ir_reg tmp_reg, ir_reg tmp_fp_reg)
3158{
3159	ir_val *val = &ctx->ir_base[src].val;
3160
3161	if (type == IR_FLOAT) {
3162		ir_emit_store_mem_imm(ctx, IR_U32, mem, val->i32);
3163	} else if (sizeof(void*) == 8 && val->i64 == 0) {
3164		ir_emit_store_mem_imm(ctx, IR_U64, mem, 0);
3165	} else if (sizeof(void*) == 8 && tmp_reg != IR_REG_NONE) {
3166		ir_emit_load_imm_int(ctx, IR_U64, tmp_reg, val->i64);
3167		ir_emit_store_mem_int(ctx, IR_U64, mem, tmp_reg);
3168	} else {
3169		tmp_fp_reg = IR_REG_NUM(tmp_fp_reg);
3170		ir_emit_load(ctx, type, tmp_fp_reg, src);
3171		ir_emit_store_mem_fp(ctx, IR_DOUBLE, mem, tmp_fp_reg);
3172	}
3173}
3174
3175static void ir_emit_store_mem(ir_ctx *ctx, ir_type type, ir_mem mem, ir_reg reg)
3176{
3177	if (IR_IS_TYPE_INT(type)) {
3178		ir_emit_store_mem_int(ctx, type, mem, reg);
3179	} else {
3180		ir_emit_store_mem_fp(ctx, type, mem, reg);
3181	}
3182}
3183
3184static void ir_emit_store(ir_ctx *ctx, ir_type type, ir_ref dst, ir_reg reg)
3185{
3186	IR_ASSERT(dst >= 0);
3187	ir_emit_store_mem(ctx, type, ir_ref_spill_slot(ctx, dst), reg);
3188}
3189
3190static void ir_emit_mov(ir_ctx *ctx, ir_type type, ir_reg dst, ir_reg src)
3191{
3192	ir_backend_data *data = ctx->data;
3193	dasm_State **Dst = &data->dasm_state;
3194
3195	|	ASM_REG_REG_OP mov, type, dst, src
3196}
3197
3198#define IR_HAVE_SWAP_INT
3199
3200static void ir_emit_swap(ir_ctx *ctx, ir_type type, ir_reg dst, ir_reg src)
3201{
3202	ir_backend_data *data = ctx->data;
3203	dasm_State **Dst = &data->dasm_state;
3204
3205	|	ASM_REG_REG_OP xchg, type, dst, src
3206}
3207
3208static void ir_emit_mov_ext(ir_ctx *ctx, ir_type type, ir_reg dst, ir_reg src)
3209{
3210	ir_backend_data *data = ctx->data;
3211	dasm_State **Dst = &data->dasm_state;
3212
3213	if (ir_type_size[type] > 2) {
3214		|	ASM_REG_REG_OP mov, type, dst, src
3215	} else if (ir_type_size[type] == 2) {
3216		if (IR_IS_TYPE_SIGNED(type)) {
3217			|	movsx Rd(dst), Rw(src)
3218		} else {
3219			|	movzx Rd(dst), Rw(src)
3220		}
3221	} else /* if (ir_type_size[type] == 1) */ {
3222		if (IR_IS_TYPE_SIGNED(type)) {
3223			|	movsx Rd(dst), Rb(src)
3224		} else {
3225			|	movzx Rd(dst), Rb(src)
3226		}
3227	}
3228}
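
/*
 * ir_emit_mov_ext (above) widens sub-dword moves with MOVSX/MOVZX so the
 * destination always holds a full 32-bit value; a plain 8- or 16-bit "mov"
 * would leave the upper destination bits unchanged.  Types of 4 bytes and
 * more need no extension, since writing a 32-bit register already zeroes
 * bits 63..32 on x86-64.
 */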
3229
3230static void ir_emit_fp_mov(ir_ctx *ctx, ir_type type, ir_reg dst, ir_reg src)
3231{
3232	ir_backend_data *data = ctx->data;
3233	dasm_State **Dst = &data->dasm_state;
3234
3235	|	ASM_FP_REG_REG_OP movap, type, dst, src
3236}
3237
3238static ir_mem ir_fuse_addr_const(ir_ctx *ctx, ir_ref ref)
3239{
3240	ir_mem mem;
3241	ir_insn *addr_insn = &ctx->ir_base[ref];
3242
3243	IR_ASSERT(IR_IS_CONST_REF(ref));
3244	if (IR_IS_SYM_CONST(addr_insn->op)) {
3245		void *addr = ir_sym_val(ctx, addr_insn);
3246		IR_ASSERT(sizeof(void*) == 4 || IR_IS_SIGNED_32BIT((intptr_t)addr));
3247		mem = IR_MEM_O((int32_t)(intptr_t)addr);
3248	} else {
3249		IR_ASSERT(sizeof(void*) == 4 || IR_IS_SIGNED_32BIT(addr_insn->val.i64));
3250		mem = IR_MEM_O(addr_insn->val.i32);
3251	}
3252	return mem;
3253}
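
/*
 * The assertions above encode an x86-64 limitation: an absolute address can
 * be fused into an instruction only as a signed 32-bit displacement, since
 * the ModRM/SIB encodings provide at most a disp32 field.  Addresses
 * outside that range must be materialized in a register first.
 */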
3254
3255static ir_mem ir_fuse_addr(ir_ctx *ctx, ir_ref root, ir_ref ref)
3256{
3257	uint32_t rule = ctx->rules[ref];
3258	ir_insn *insn = &ctx->ir_base[ref];
3259	ir_insn *op1_insn, *op2_insn, *offset_insn;
3260	ir_ref base_reg_ref, index_reg_ref;
3261	ir_reg base_reg = IR_REG_NONE, index_reg;
3262	int32_t offset = 0, scale;
3263
3264	IR_ASSERT(((rule & IR_RULE_MASK) >= IR_LEA_OB &&
3265			(rule & IR_RULE_MASK) <= IR_LEA_SI_B) ||
3266		rule == IR_STATIC_ALLOCA);
3267	switch (rule & IR_RULE_MASK) {
3268		default:
3269			IR_ASSERT(0);
3270		case IR_LEA_OB:
3271			offset_insn = insn;
3272			if (ir_rule(ctx, insn->op1) == IR_STATIC_ALLOCA) {
3273				offset = IR_SPILL_POS_TO_OFFSET(ctx->ir_base[insn->op1].op3);
3274				base_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
3275				base_reg_ref = IR_UNUSED;
3276			} else {
3277				base_reg_ref = ref * sizeof(ir_ref) + 1;
3278			}
3279			index_reg_ref = IR_UNUSED;
3280			scale = 1;
3281			break;
3282		case IR_LEA_SI:
3283			scale = ctx->ir_base[insn->op2].val.i32;
3284			index_reg_ref = ref * sizeof(ir_ref) + 1;
3285			base_reg_ref = IR_UNUSED;
3286			offset_insn = NULL;
3287			break;
3288		case IR_LEA_SIB:
3289			base_reg_ref = index_reg_ref = ref * sizeof(ir_ref) + 1;
3290			scale = ctx->ir_base[insn->op2].val.i32 - 1;
3291			offset_insn = NULL;
3292			break;
3293		case IR_LEA_IB:
3294			if (ir_rule(ctx, insn->op1) == IR_STATIC_ALLOCA) {
3295				offset = IR_SPILL_POS_TO_OFFSET(ctx->ir_base[insn->op1].op3);
3296				base_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
3297				base_reg_ref = IR_UNUSED;
3298				index_reg_ref = ref * sizeof(ir_ref) + 2;
3299			} else if (ir_rule(ctx, insn->op2) == IR_STATIC_ALLOCA) {
3300				offset = IR_SPILL_POS_TO_OFFSET(ctx->ir_base[insn->op2].op3);
3301				base_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
3302				base_reg_ref = IR_UNUSED;
3303				index_reg_ref = ref * sizeof(ir_ref) + 1;
3304			} else {
3305				base_reg_ref = ref * sizeof(ir_ref) + 1;
3306				index_reg_ref = ref * sizeof(ir_ref) + 2;
3307			}
3308			offset_insn = NULL;
3309			scale = 1;
3310			break;
3311		case IR_LEA_OB_I:
3312			op1_insn = &ctx->ir_base[insn->op1];
3313			offset_insn = op1_insn;
3314			scale = 1;
3315			if (ir_rule(ctx, insn->op2) == IR_STATIC_ALLOCA) {
3316				offset = IR_SPILL_POS_TO_OFFSET(ctx->ir_base[insn->op2].op3);
3317				base_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
3318				base_reg_ref = IR_UNUSED;
3319				index_reg_ref = insn->op1 * sizeof(ir_ref) + 1;
3320			} else if (ir_rule(ctx, op1_insn->op1) == IR_STATIC_ALLOCA) {
3321				offset = IR_SPILL_POS_TO_OFFSET(ctx->ir_base[op1_insn->op1].op3);
3322				base_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
3323				base_reg_ref = IR_UNUSED;
3324				index_reg_ref = ref * sizeof(ir_ref) + 2;
3325			} else {
3326				base_reg_ref = insn->op1 * sizeof(ir_ref) + 1;
3327				index_reg_ref = ref * sizeof(ir_ref) + 2;
3328			}
3329			break;
3330		case IR_LEA_I_OB:
3331			op2_insn = &ctx->ir_base[insn->op2];
3332			offset_insn = op2_insn;
3333			scale = 1;
3334			if (ir_rule(ctx, insn->op1) == IR_STATIC_ALLOCA) {
3335				offset = IR_SPILL_POS_TO_OFFSET(ctx->ir_base[insn->op1].op3);
3336				base_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
3337				base_reg_ref = IR_UNUSED;
3338				index_reg_ref = insn->op2 * sizeof(ir_ref) + 1;
3339			} else if (ir_rule(ctx, op2_insn->op1) == IR_STATIC_ALLOCA) {
3340				offset = IR_SPILL_POS_TO_OFFSET(ctx->ir_base[op2_insn->op1].op3);
3341				base_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
3342				base_reg_ref = IR_UNUSED;
3343				index_reg_ref = ref * sizeof(ir_ref) + 1;
3344			} else {
3345				base_reg_ref = ref * sizeof(ir_ref) + 1;
3346				index_reg_ref = insn->op2 * sizeof(ir_ref) + 1;
3347			}
3348			break;
3349		case IR_LEA_SI_O:
3350			index_reg_ref = insn->op1 * sizeof(ir_ref) + 1;
3351			op1_insn = &ctx->ir_base[insn->op1];
3352			scale = ctx->ir_base[op1_insn->op2].val.i32;
3353			offset_insn = insn;
3354			base_reg_ref = IR_UNUSED;
3355			break;
3356		case IR_LEA_SIB_O:
3357			base_reg_ref = index_reg_ref = insn->op1 * sizeof(ir_ref) + 1;
3358			op1_insn = &ctx->ir_base[insn->op1];
3359			scale = ctx->ir_base[op1_insn->op2].val.i32 - 1;
3360			offset_insn = insn;
3361			break;
3362		case IR_LEA_IB_O:
3363			base_reg_ref = insn->op1 * sizeof(ir_ref) + 1;
3364			index_reg_ref = insn->op1 * sizeof(ir_ref) + 2;
3365			offset_insn = insn;
3366			scale = 1;
3367			break;
3368		case IR_LEA_OB_SI:
3369			index_reg_ref = insn->op2 * sizeof(ir_ref) + 1;
3370			op1_insn = &ctx->ir_base[insn->op1];
3371			offset_insn = op1_insn;
3372			op2_insn = &ctx->ir_base[insn->op2];
3373			scale = ctx->ir_base[op2_insn->op2].val.i32;
3374			if (ir_rule(ctx, op1_insn->op1) == IR_STATIC_ALLOCA) {
3375				offset = IR_SPILL_POS_TO_OFFSET(ctx->ir_base[op1_insn->op1].op3);
3376				base_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
3377				base_reg_ref = IR_UNUSED;
3378			} else {
3379				base_reg_ref = insn->op1 * sizeof(ir_ref) + 1;
3380			}
3381			break;
3382		case IR_LEA_SI_OB:
3383			index_reg_ref = insn->op1 * sizeof(ir_ref) + 1;
3384			op1_insn = &ctx->ir_base[insn->op1];
3385			scale = ctx->ir_base[op1_insn->op2].val.i32;
3386			op2_insn = &ctx->ir_base[insn->op2];
3387			offset_insn = op2_insn;
3388			if (ir_rule(ctx, op2_insn->op1) == IR_STATIC_ALLOCA) {
3389				offset = IR_SPILL_POS_TO_OFFSET(ctx->ir_base[op2_insn->op1].op3);
3390				base_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
3391				base_reg_ref = IR_UNUSED;
3392			} else {
3393				base_reg_ref = insn->op2 * sizeof(ir_ref) + 1;
3394			}
3395			break;
3396		case IR_LEA_B_SI:
3397			if (ir_rule(ctx, insn->op1) == IR_STATIC_ALLOCA) {
3398				offset = IR_SPILL_POS_TO_OFFSET(ctx->ir_base[insn->op1].op3);
3399				base_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
3400				base_reg_ref = IR_UNUSED;
3401			} else {
3402				base_reg_ref = ref * sizeof(ir_ref) + 1;
3403			}
3404			index_reg_ref = insn->op2 * sizeof(ir_ref) + 1;
3405			op2_insn = &ctx->ir_base[insn->op2];
3406			scale = ctx->ir_base[op2_insn->op2].val.i32;
3407			offset_insn = NULL;
3408			break;
3409		case IR_LEA_SI_B:
3410			index_reg_ref = insn->op1 * sizeof(ir_ref) + 1;
3411			if (ir_rule(ctx, insn->op2) == IR_STATIC_ALLOCA) {
3412				offset = IR_SPILL_POS_TO_OFFSET(ctx->ir_base[insn->op2].op3);
3413				base_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
3414				base_reg_ref = IR_UNUSED;
3415			} else {
3416				base_reg_ref = ref * sizeof(ir_ref) + 2;
3417			}
3418			op1_insn = &ctx->ir_base[insn->op1];
3419			scale = ctx->ir_base[op1_insn->op2].val.i32;
3420			offset_insn = NULL;
3421			break;
3422		case IR_STATIC_ALLOCA:
3423			offset = IR_SPILL_POS_TO_OFFSET(insn->op3);
3424			base_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
3425			base_reg_ref = index_reg_ref = IR_UNUSED;
3426			scale = 1;
3427			offset_insn = NULL;
3428			break;
3429	}
3430
3431	if (offset_insn) {
3432		ir_insn *addr_insn = &ctx->ir_base[offset_insn->op2];
3433
3434		if (IR_IS_SYM_CONST(addr_insn->op)) {
3435			void *addr = ir_sym_val(ctx, addr_insn);
3436			IR_ASSERT(sizeof(void*) != 8 || IR_IS_SIGNED_32BIT((intptr_t)addr));
3437			offset += (int64_t)(intptr_t)(addr);
3438		} else {
3439			if (offset_insn->op == IR_SUB) {
3440				offset -= addr_insn->val.i32;
3441			} else {
3442				offset += addr_insn->val.i32;
3443			}
3444		}
3445	}
3446
3447	if (base_reg_ref) {
3448		if (UNEXPECTED(ctx->rules[base_reg_ref / sizeof(ir_ref)] & IR_FUSED_REG)) {
3449			base_reg = ir_get_fused_reg(ctx, root, base_reg_ref);
3450		} else {
3451			base_reg = ((int8_t*)ctx->regs)[base_reg_ref];
3452		}
3453		IR_ASSERT(base_reg != IR_REG_NONE);
3454		if (IR_REG_SPILLED(base_reg)) {
3455			base_reg = IR_REG_NUM(base_reg);
3456			ir_emit_load(ctx, insn->type, base_reg, ((ir_ref*)ctx->ir_base)[base_reg_ref]);
3457		}
3458	}
3459
3460	index_reg = IR_REG_NONE;
3461	if (index_reg_ref) {
3462		if (base_reg_ref
3463			&& ((ir_ref*)ctx->ir_base)[index_reg_ref]
3464				== ((ir_ref*)ctx->ir_base)[base_reg_ref]) {
3465			index_reg = base_reg;
3466		} else {
3467			if (UNEXPECTED(ctx->rules[index_reg_ref / sizeof(ir_ref)] & IR_FUSED_REG)) {
3468				index_reg = ir_get_fused_reg(ctx, root, index_reg_ref);
3469			} else {
3470				index_reg = ((int8_t*)ctx->regs)[index_reg_ref];
3471			}
3472			IR_ASSERT(index_reg != IR_REG_NONE);
3473			if (IR_REG_SPILLED(index_reg)) {
3474				index_reg = IR_REG_NUM(index_reg);
3475				ir_emit_load(ctx, insn->type, index_reg, ((ir_ref*)ctx->ir_base)[index_reg_ref]);
3476			}
3477		}
3478	}
3479
3480	return IR_MEM(base_reg, offset, index_reg, scale);
3481}
3482
3483static ir_mem ir_fuse_mem(ir_ctx *ctx, ir_ref root, ir_ref ref, ir_insn *mem_insn, ir_reg reg)
3484{
3485	if (reg != IR_REG_NONE) {
3486		if (IR_REG_SPILLED(reg)) {
3487			reg = IR_REG_NUM(reg);
3488			ir_emit_load(ctx, IR_ADDR, reg, mem_insn->op2);
3489		}
3490		return IR_MEM_B(reg);
3491	} else if (IR_IS_CONST_REF(mem_insn->op2)) {
3492		return ir_fuse_addr_const(ctx, mem_insn->op2);
3493	} else {
3494		return ir_fuse_addr(ctx, root, mem_insn->op2);
3495	}
3496}
3497
3498static ir_mem ir_fuse_load(ir_ctx *ctx, ir_ref root, ir_ref ref)
3499{
3500	ir_insn *load_insn = &ctx->ir_base[ref];
3501	ir_reg reg;
3502
3503	IR_ASSERT(load_insn->op == IR_LOAD);
3504	if (UNEXPECTED(ctx->rules[ref] & IR_FUSED_REG)) {
3505		reg = ir_get_fused_reg(ctx, root, ref * sizeof(ir_ref) + 2);
3506	} else {
3507		reg = ctx->regs[ref][2];
3508	}
3509	return ir_fuse_mem(ctx, root, ref, load_insn, reg);
3510}
3511
3512static int32_t ir_fuse_imm(ir_ctx *ctx, ir_ref ref)
3513{
3514	ir_insn *val_insn = &ctx->ir_base[ref];
3515
3516	IR_ASSERT(IR_IS_CONST_REF(ref));
3517	if (IR_IS_SYM_CONST(val_insn->op)) {
3518		void *addr = ir_sym_val(ctx, val_insn);
3519		IR_ASSERT(IR_IS_SIGNED_32BIT((intptr_t)addr));
3520		return (int32_t)(intptr_t)addr;
3521	} else {
3522		IR_ASSERT(IR_IS_SIGNED_32BIT(val_insn->val.i32));
3523		return val_insn->val.i32;
3524	}
3525}
3526
3527static void ir_emit_load_ex(ir_ctx *ctx, ir_type type, ir_reg reg, ir_ref src, ir_ref root)
3528{
3529	if (IR_IS_CONST_REF(src)) {
3530		if (IR_IS_TYPE_INT(type)) {
3531			ir_insn *insn = &ctx->ir_base[src];
3532
3533			if (insn->op == IR_SYM || insn->op == IR_FUNC) {
3534				void *addr = ir_sym_val(ctx, insn);
3535				ir_emit_load_imm_int(ctx, type, reg, (intptr_t)addr);
3536			} else if (insn->op == IR_STR) {
3537				ir_backend_data *data = ctx->data;
3538				dasm_State **Dst = &data->dasm_state;
3539				int label = ir_const_label(ctx, src);
3540
3541				|	lea Ra(reg), aword [=>label]
3542			} else {
3543				ir_emit_load_imm_int(ctx, type, reg, insn->val.i64);
3544			}
3545		} else {
3546			ir_emit_load_imm_fp(ctx, type, reg, src);
3547		}
3548	} else if (ir_rule(ctx, src) == IR_STATIC_ALLOCA) {
3549		ir_load_local_addr(ctx, reg, src);
3550	} else {
3551		ir_mem mem;
3552
3553		if (ir_rule(ctx, src) & IR_FUSED) {
3554			mem = ir_fuse_load(ctx, root, src);
3555		} else {
3556			mem = ir_ref_spill_slot(ctx, src);
3557		}
3558		ir_emit_load_mem(ctx, type, reg, mem);
3559	}
3560}
3561
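/*
 * Standard frame layout: optionally establish RBP, push the used preserved
 * GP registers, adjust RSP by the remaining frame size, then spill the used
 * preserved XMM registers (there is no "push" for XMM registers, so they are
 * stored relative to the frame or stack pointer).
 */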
3562static void ir_emit_prologue(ir_ctx *ctx)
3563{
3564	ir_backend_data *data = ctx->data;
3565	dasm_State **Dst = &data->dasm_state;
3566	int offset = ctx->stack_frame_size + ctx->call_stack_size;
3567
3568	if (ctx->flags & IR_USE_FRAME_POINTER) {
3569		|	push Ra(IR_REG_RBP)
3570		|	mov Ra(IR_REG_RBP), Ra(IR_REG_RSP)
3571	}
3572	if (IR_REGSET_INTERSECTION((ir_regset)ctx->used_preserved_regs, IR_REGSET_GP)) {
3573		int i;
3574		ir_regset used_preserved_regs = IR_REGSET_INTERSECTION((ir_regset)ctx->used_preserved_regs, IR_REGSET_GP);
3575
3576		for (i = IR_REG_GP_FIRST; i <= IR_REG_GP_LAST; i++) {
3577			if (IR_REGSET_IN(used_preserved_regs, i)) {
3578				offset -= sizeof(void*);
3579				|	push Ra(i)
3580			}
3581		}
3582	}
3583	if (ctx->stack_frame_size + ctx->call_stack_size) {
3584		if (ctx->fixed_stack_red_zone) {
3585			IR_ASSERT(ctx->stack_frame_size + ctx->call_stack_size <= ctx->fixed_stack_red_zone);
3586		} else if (offset) {
3587			|	sub Ra(IR_REG_RSP), offset
3588		}
3589	}
3590	if (IR_REGSET_INTERSECTION((ir_regset)ctx->used_preserved_regs, IR_REGSET_FP)) {
3591		ir_reg fp;
3592		int i;
3593		ir_regset used_preserved_regs = IR_REGSET_INTERSECTION((ir_regset)ctx->used_preserved_regs, IR_REGSET_FP);
3594
3595		if (ctx->flags & IR_USE_FRAME_POINTER) {
3596			fp = IR_REG_FRAME_POINTER;
3597			offset -= ctx->stack_frame_size + ctx->call_stack_size;
3598		} else {
3599			fp = IR_REG_STACK_POINTER;
3600		}
3601		for (i = IR_REG_FP_FIRST; i <= IR_REG_FP_LAST; i++) {
3602			if (IR_REGSET_IN(used_preserved_regs, i)) {
3603				offset -= sizeof(void*);
3604				if (ctx->mflags & IR_X86_AVX) {
3605					|	vmovsd qword [Ra(fp)+offset], xmm(i-IR_REG_FP_FIRST)
3606				} else {
3607					|	movsd qword [Ra(fp)+offset], xmm(i-IR_REG_FP_FIRST)
3608				}
3609			}
3610		}
3611	}
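	/*
	 * Variadic functions that execute va_start need a register save area.
	 * On Win64 the four GP argument registers are stored to the shadow
	 * space; on System V x86_64 the unnamed GP and XMM argument registers
	 * are stored to the reg-save area, and AL (set by the caller to the
	 * number of vector registers actually used) lets us skip the XMM
	 * stores when it is zero ("test al, al; je >1").
	 */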
3612	if ((ctx->flags & IR_VARARG_FUNC) && (ctx->flags2 & IR_HAS_VA_START)) {
3613#if defined(_WIN64)
3614		ir_reg fp;
3615		int offset;
3616
3617		if (ctx->flags & IR_USE_FRAME_POINTER) {
3618			fp = IR_REG_FRAME_POINTER;
3619			offset = sizeof(void*) * 2;
3620		} else {
3621			fp = IR_REG_STACK_POINTER;
3622			offset = ctx->stack_frame_size + ctx->call_stack_size + sizeof(void*);
3623		}
3624		|	mov [Ra(fp)+offset], Ra(IR_REG_INT_ARG1)
3625		|	mov [Ra(fp)+offset+8], Ra(IR_REG_INT_ARG2)
3626		|	mov [Ra(fp)+offset+16], Ra(IR_REG_INT_ARG3)
3627		|	mov [Ra(fp)+offset+24], Ra(IR_REG_INT_ARG4)
3628#elif defined(IR_TARGET_X64)
3629|.if X64
3630		const int8_t *int_reg_params = _ir_int_reg_params;
3631		const int8_t *fp_reg_params = _ir_fp_reg_params;
3632		uint32_t i;
3633		ir_reg fp;
3634		int offset;
3635
3636		if (ctx->flags & IR_USE_FRAME_POINTER) {
3637			fp = IR_REG_FRAME_POINTER;
3638
3639			offset = -(ctx->stack_frame_size - ctx->stack_frame_alignment - ctx->locals_area_size);
3640		} else {
3641			fp = IR_REG_STACK_POINTER;
3642			offset = ctx->locals_area_size + ctx->call_stack_size;
3643		}
3644
3645		if ((ctx->flags2 & (IR_HAS_VA_ARG_GP|IR_HAS_VA_COPY)) && ctx->gp_reg_params < IR_REG_INT_ARGS) {
3646			/* skip named args */
3647			offset += sizeof(void*) * ctx->gp_reg_params;
3648			for (i = ctx->gp_reg_params; i < IR_REG_INT_ARGS; i++) {
3649				|	mov qword [Ra(fp)+offset], Rq(int_reg_params[i])
3650				offset += sizeof(void*);
3651			}
3652		}
3653		if ((ctx->flags2 & (IR_HAS_VA_ARG_FP|IR_HAS_VA_COPY)) && ctx->fp_reg_params < IR_REG_FP_ARGS) {
3654			|	test al, al
3655			|	je	>1
3656			/* skip named args */
3657			offset += 16 * ctx->fp_reg_params;
3658			for (i = ctx->fp_reg_params; i < IR_REG_FP_ARGS; i++) {
3659				|	movaps [Ra(fp)+offset], xmm(fp_reg_params[i]-IR_REG_FP_FIRST)
3660				offset += 16;
3661			}
3662			|1:
3663		}
3664|.endif
3665#endif
3666	}
3667}
3668
3669static void ir_emit_epilogue(ir_ctx *ctx)
3670{
3671	ir_backend_data *data = ctx->data;
3672	dasm_State **Dst = &data->dasm_state;
3673
3674	if (IR_REGSET_INTERSECTION((ir_regset)ctx->used_preserved_regs, IR_REGSET_FP)) {
3675		int i;
3676		int offset;
3677		ir_reg fp;
3678		ir_regset used_preserved_regs = (ir_regset)ctx->used_preserved_regs;
3679
3680		if (ctx->flags & IR_USE_FRAME_POINTER) {
3681			fp = IR_REG_FRAME_POINTER;
3682			offset = 0;
3683		} else {
3684			fp = IR_REG_STACK_POINTER;
3685			offset = ctx->stack_frame_size + ctx->call_stack_size;
3686		}
3687		for (i = 0; i < IR_REG_NUM; i++) {
3688			if (IR_REGSET_IN(used_preserved_regs, i)) {
3689				if (i < IR_REG_FP_FIRST) {
3690					offset -= sizeof(void*);
3691				} else {
3692					offset -= sizeof(void*);
3693					if (ctx->mflags & IR_X86_AVX) {
3694						|	vmovsd xmm(i-IR_REG_FP_FIRST), qword [Ra(fp)+offset]
3695					} else {
3696						|	movsd xmm(i-IR_REG_FP_FIRST), qword [Ra(fp)+offset]
3697					}
3698				}
3699			}
3700		}
3701	}
3702
3703	if (IR_REGSET_INTERSECTION((ir_regset)ctx->used_preserved_regs, IR_REGSET_GP)) {
3704		int i;
3705		ir_regset used_preserved_regs = IR_REGSET_INTERSECTION((ir_regset)ctx->used_preserved_regs, IR_REGSET_GP);
3706		int offset;
3707
3708		if (ctx->flags & IR_USE_FRAME_POINTER) {
3709			offset = 0;
3710		} else {
3711			offset = ctx->stack_frame_size + ctx->call_stack_size;
3712		}
3713		for (i = IR_REG_GP_LAST; i >= IR_REG_GP_FIRST; i--) {
3714			if (IR_REGSET_IN(used_preserved_regs, i)) {
3715				offset -= sizeof(void*);
3716			}
3717		}
3723		if (ctx->flags & IR_USE_FRAME_POINTER) {
3724			|	lea Ra(IR_REG_RSP), [Ra(IR_REG_RBP)+offset]
3725		} else if (offset) {
3726			|	add Ra(IR_REG_RSP), offset
3727		}
3728		for (i = IR_REG_GP_LAST; i >= IR_REG_GP_FIRST; i--) {
3729			if (IR_REGSET_IN(used_preserved_regs, i)) {
3730				|	pop Ra(i)
3731			}
3732		}
3733		if (ctx->flags & IR_USE_FRAME_POINTER) {
3734			|	pop Ra(IR_REG_RBP)
3735		}
3736	} else if (ctx->flags & IR_USE_FRAME_POINTER) {
3737		|	mov Ra(IR_REG_RSP), Ra(IR_REG_RBP)
3738		|	pop Ra(IR_REG_RBP)
3739	} else if (ctx->stack_frame_size + ctx->call_stack_size) {
3740		if (ctx->fixed_stack_red_zone) {
3741			IR_ASSERT(ctx->stack_frame_size + ctx->call_stack_size <= ctx->fixed_stack_red_zone);
3742		} else {
3743			|	add Ra(IR_REG_RSP), (ctx->stack_frame_size + ctx->call_stack_size)
3744		}
3745	}
3746}
3747
3748static void ir_emit_binop_int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
3749{
3750	ir_backend_data *data = ctx->data;
3751	dasm_State **Dst = &data->dasm_state;
3752	ir_type type = insn->type;
3753	ir_ref op1 = insn->op1;
3754	ir_ref op2 = insn->op2;
3755	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
3756	ir_reg op1_reg = ctx->regs[def][1];
3757	ir_reg op2_reg = ctx->regs[def][2];
3758
3759	IR_ASSERT(def_reg != IR_REG_NONE);
3760
3761	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
3762		op1_reg = IR_REG_NUM(op1_reg);
3763		ir_emit_load(ctx, type, op1_reg, op1);
3764	}
3765	if (def_reg != op1_reg) {
3766		if (op1_reg != IR_REG_NONE) {
3767			ir_emit_mov(ctx, type, def_reg, op1_reg);
3768		} else {
3769			ir_emit_load(ctx, type, def_reg, op1);
3770		}
3771		if (op1 == op2) {
3772			op2_reg = def_reg;
3773		}
3774	}
3775
3776	if (op2_reg != IR_REG_NONE) {
3777		if (IR_REG_SPILLED(op2_reg)) {
3778			op2_reg = IR_REG_NUM(op2_reg);
3779			if (op1 != op2) {
3780				ir_emit_load(ctx, type, op2_reg, op2);
3781			}
3782		}
3783		switch (insn->op) {
3784			default:
3785				IR_ASSERT(0 && "NIY binary op");
3786			case IR_ADD:
3787			case IR_ADD_OV:
3788				|	ASM_REG_REG_OP add, type, def_reg, op2_reg
3789				break;
3790			case IR_SUB:
3791			case IR_SUB_OV:
3792				|	ASM_REG_REG_OP sub, type, def_reg, op2_reg
3793				break;
3794			case IR_MUL:
3795			case IR_MUL_OV:
3796				|	ASM_REG_REG_MUL imul, type, def_reg, op2_reg
3797				break;
3798			case IR_OR:
3799				|	ASM_REG_REG_OP or, type, def_reg, op2_reg
3800				break;
3801			case IR_AND:
3802				|	ASM_REG_REG_OP and, type, def_reg, op2_reg
3803				break;
3804			case IR_XOR:
3805				|	ASM_REG_REG_OP xor, type, def_reg, op2_reg
3806				break;
3807		}
3808	} else if (IR_IS_CONST_REF(op2)) {
3809		int32_t val = ir_fuse_imm(ctx, op2);
3810
3811		switch (insn->op) {
3812			default:
3813				IR_ASSERT(0 && "NIY binary op");
3814			case IR_ADD:
3815			case IR_ADD_OV:
3816				|	ASM_REG_IMM_OP add, type, def_reg, val
3817				break;
3818			case IR_SUB:
3819			case IR_SUB_OV:
3820				|	ASM_REG_IMM_OP sub, type, def_reg, val
3821				break;
3822			case IR_MUL:
3823			case IR_MUL_OV:
3824				|	ASM_REG_IMM_MUL imul, type, def_reg, val
3825				break;
3826			case IR_OR:
3827				|	ASM_REG_IMM_OP or, type, def_reg, val
3828				break;
3829			case IR_AND:
3830				|	ASM_REG_IMM_OP and, type, def_reg, val
3831				break;
3832			case IR_XOR:
3833				|	ASM_REG_IMM_OP xor, type, def_reg, val
3834				break;
3835		}
3836	} else {
3837		ir_mem mem;
3838
3839		if (ir_rule(ctx, op2) & IR_FUSED) {
3840			mem = ir_fuse_load(ctx, def, op2);
3841		} else {
3842			mem = ir_ref_spill_slot(ctx, op2);
3843		}
3844		switch (insn->op) {
3845			default:
3846				IR_ASSERT(0 && "NIY binary op");
3847			case IR_ADD:
3848			case IR_ADD_OV:
3849				|	ASM_REG_MEM_OP add, type, def_reg, mem
3850				break;
3851			case IR_SUB:
3852			case IR_SUB_OV:
3853				|	ASM_REG_MEM_OP sub, type, def_reg, mem
3854				break;
3855			case IR_MUL:
3856			case IR_MUL_OV:
3857				|	ASM_REG_MEM_MUL imul, type, def_reg, mem
3858				break;
3859			case IR_OR:
3860				|	ASM_REG_MEM_OP or, type, def_reg, mem
3861				break;
3862			case IR_AND:
3863				|	ASM_REG_MEM_OP and, type, def_reg, mem
3864				break;
3865			case IR_XOR:
3866				|	ASM_REG_MEM_OP xor, type, def_reg, mem
3867				break;
3868		}
3869	}
3870	if (IR_REG_SPILLED(ctx->regs[def][0])) {
3871		ir_emit_store(ctx, type, def, def_reg);
3872	}
3873}
3874
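/*
 * Multiplication by an immediate via the three-operand "imul r, r/m, imm"
 * form, which allows the destination to differ from the source.
 */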
3875static void ir_emit_imul3(ir_ctx *ctx, ir_ref def, ir_insn *insn)
3876{
3877	ir_backend_data *data = ctx->data;
3878	dasm_State **Dst = &data->dasm_state;
3879	ir_type type = insn->type;
3880	ir_ref op1 = insn->op1;
3881	ir_ref op2 = insn->op2;
3882	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
3883	ir_reg op1_reg = ctx->regs[def][1];
3884	int32_t val = ir_fuse_imm(ctx, op2);
3885
3886	IR_ASSERT(def_reg != IR_REG_NONE);
3887	IR_ASSERT(!IR_IS_CONST_REF(op1));
3888
3889	if (op1_reg != IR_REG_NONE) {
3890		if (IR_REG_SPILLED(op1_reg)) {
3891			op1_reg = IR_REG_NUM(op1_reg);
3892			ir_emit_load(ctx, type, op1_reg, op1);
3893		}
3894		switch (ir_type_size[type]) {
3895			default:
3896				IR_ASSERT(0);
3897			case 2:
3898				|	imul Rw(def_reg), Rw(op1_reg), val
3899				break;
3900			case 4:
3901				|	imul Rd(def_reg), Rd(op1_reg), val
3902				break;
3903|.if X64
3904||			case 8:
3905|				imul Rq(def_reg), Rq(op1_reg), val
3906||				break;
3907|.endif
3908		}
3909	} else {
3910		ir_mem mem;
3911
3912		if (ir_rule(ctx, op1) & IR_FUSED) {
3913			mem = ir_fuse_load(ctx, def, op1);
3914		} else {
3915			mem = ir_ref_spill_slot(ctx, op1);
3916		}
3917		|	ASM_REG_MEM_TXT_MUL imul, type, def_reg, mem, val
3918	}
3919	if (IR_REG_SPILLED(ctx->regs[def][0])) {
3920		ir_emit_store(ctx, type, def, def_reg);
3921	}
3922}
3923
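/*
 * Branchless integer min/max: compare, then conditionally replace the result
 * with op2 (cmovg/cmova for MIN, cmovl/cmovb for MAX, picking the signed or
 * unsigned condition from the type).
 */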
3924static void ir_emit_min_max_int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
3925{
3926	ir_backend_data *data = ctx->data;
3927	dasm_State **Dst = &data->dasm_state;
3928	ir_type type = insn->type;
3929	ir_ref op1 = insn->op1;
3930	ir_ref op2 = insn->op2;
3931	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
3932	ir_reg op1_reg = ctx->regs[def][1];
3933	ir_reg op2_reg = ctx->regs[def][2];
3934
3935	IR_ASSERT(def_reg != IR_REG_NONE && op2_reg != IR_REG_NONE);
3936
3937	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
3938		op1_reg = IR_REG_NUM(op1_reg);
3939		ir_emit_load(ctx, type, op1_reg, op1);
3940	}
3941	if (def_reg != op1_reg) {
3942		if (op1_reg != IR_REG_NONE) {
3943			ir_emit_mov(ctx, type, def_reg, op1_reg);
3944		} else {
3945			ir_emit_load(ctx, type, def_reg, op1);
3946		}
3947	}
3948
3949	if (IR_REG_SPILLED(op2_reg)) {
3950		op2_reg = IR_REG_NUM(op2_reg);
3951		if (op1 != op2) {
3952			ir_emit_load(ctx, type, op2_reg, op2);
3953		}
3954	}
3955
3956	if (op1 == op2) {
3957		return;
3958	}
3959
3960	|	ASM_REG_REG_OP cmp, type, def_reg, op2_reg
3961	if (insn->op == IR_MIN) {
3962		if (IR_IS_TYPE_SIGNED(type)) {
3963			|	ASM_REG_REG_OP2 cmovg, type, def_reg, op2_reg
3964		} else {
3965			|	ASM_REG_REG_OP2 cmova, type, def_reg, op2_reg
3966		}
3967	} else {
3968		IR_ASSERT(insn->op == IR_MAX);
3969		if (IR_IS_TYPE_SIGNED(type)) {
3970			|	ASM_REG_REG_OP2 cmovl, type, def_reg, op2_reg
3971		} else {
3972			|	ASM_REG_REG_OP2 cmovb, type, def_reg, op2_reg
3973		}
3974	}
3975
3976	if (IR_REG_SPILLED(ctx->regs[def][0])) {
3977		ir_emit_store(ctx, type, def, def_reg);
3978	}
3979}
3980
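/*
 * Overflow tests rely on the flags left by the preceding arithmetic
 * instruction: signed overflow sets OF (seto/jo), unsigned overflow sets CF
 * (setc/jc). The branch variant below inverts the condition when the "true"
 * target is the fall-through block.
 */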
3981static void ir_emit_overflow(ir_ctx *ctx, ir_ref def, ir_insn *insn)
3982{
3983	ir_backend_data *data = ctx->data;
3984	dasm_State **Dst = &data->dasm_state;
3985	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
3986	ir_type type = ctx->ir_base[insn->op1].type;
3987
3988	IR_ASSERT(def_reg != IR_REG_NONE);
3989	IR_ASSERT(IR_IS_TYPE_INT(type));
3990	if (IR_IS_TYPE_SIGNED(type)) {
3991		|	seto Rb(def_reg)
3992	} else {
3993		|	setc Rb(def_reg)
3994	}
3995	if (IR_REG_SPILLED(ctx->regs[def][0])) {
3996		ir_emit_store(ctx, insn->type, def, def_reg);
3997	}
3998}
3999
4000static void ir_emit_overflow_and_branch(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn, uint32_t next_block)
4001{
4002	ir_backend_data *data = ctx->data;
4003	dasm_State **Dst = &data->dasm_state;
4004	ir_insn *overflow_insn = &ctx->ir_base[insn->op2];
4005	ir_type type = ctx->ir_base[overflow_insn->op1].type;
4006	uint32_t true_block, false_block;
4007	bool reverse = 0;
4008
4009	ir_get_true_false_blocks(ctx, b, &true_block, &false_block);
4010	if (true_block == next_block) {
4011		reverse = 1;
4012		true_block = false_block;
4013		false_block = 0;
4014	} else if (false_block == next_block) {
4015		false_block = 0;
4016	}
4017
4018	if (IR_IS_TYPE_SIGNED(type)) {
4019		if (reverse) {
4020			|	jno =>true_block
4021		} else {
4022			|	jo =>true_block
4023		}
4024	} else {
4025		if (reverse) {
4026			|	jnc =>true_block
4027		} else {
4028			|	jc =>true_block
4029		}
4030	}
4031	if (false_block) {
4032		|	jmp =>false_block
4033	}
4034}
4035
4036static void ir_emit_mem_binop_int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
4037{
4038	ir_backend_data *data = ctx->data;
4039	dasm_State **Dst = &data->dasm_state;
4040	ir_insn *op_insn = &ctx->ir_base[insn->op3];
4041	ir_type type = op_insn->type;
4042	ir_ref op2 = op_insn->op2;
4043	ir_reg op2_reg = ctx->regs[insn->op3][2];
4044	ir_mem mem;
4045
4046	if (insn->op == IR_STORE) {
4047		mem = ir_fuse_mem(ctx, def, def, insn, ctx->regs[def][2]);
4048	} else {
4049		IR_ASSERT(insn->op == IR_VSTORE);
4050		mem = ir_var_spill_slot(ctx, insn->op2);
4051	}
4052
4053	if (op2_reg == IR_REG_NONE) {
4054		int32_t val = ir_fuse_imm(ctx, op2);
4055
4056		switch (op_insn->op) {
4057			default:
4058				IR_ASSERT(0 && "NIY binary op");
4059			case IR_ADD:
4060			case IR_ADD_OV:
4061				|	ASM_MEM_IMM_OP add, type, mem, val
4062				break;
4063			case IR_SUB:
4064			case IR_SUB_OV:
4065				|	ASM_MEM_IMM_OP sub, type, mem, val
4066				break;
4067			case IR_OR:
4068				|	ASM_MEM_IMM_OP or, type, mem, val
4069				break;
4070			case IR_AND:
4071				|	ASM_MEM_IMM_OP and, type, mem, val
4072				break;
4073			case IR_XOR:
4074				|	ASM_MEM_IMM_OP xor, type, mem, val
4075				break;
4076		}
4077	} else {
4078		if (IR_REG_SPILLED(op2_reg)) {
4079			op2_reg = IR_REG_NUM(op2_reg);
4080			ir_emit_load(ctx, type, op2_reg, op2);
4081		}
4082		switch (op_insn->op) {
4083			default:
4084				IR_ASSERT(0 && "NIY binary op");
4085			case IR_ADD:
4086			case IR_ADD_OV:
4087				|	ASM_MEM_REG_OP add, type, mem, op2_reg
4088				break;
4089			case IR_SUB:
4090			case IR_SUB_OV:
4091				|	ASM_MEM_REG_OP sub, type, mem, op2_reg
4092				break;
4093			case IR_OR:
4094				|	ASM_MEM_REG_OP or, type, mem, op2_reg
4095				break;
4096			case IR_AND:
4097				|	ASM_MEM_REG_OP and, type, mem, op2_reg
4098				break;
4099			case IR_XOR:
4100				|	ASM_MEM_REG_OP xor, type, mem, op2_reg
4101				break;
4102		}
4103	}
4104}
4105
4106static void ir_emit_reg_binop_int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
4107{
4108	ir_backend_data *data = ctx->data;
4109	dasm_State **Dst = &data->dasm_state;
4110	ir_insn *op_insn = &ctx->ir_base[insn->op2];
4111	ir_type type = op_insn->type;
4112	ir_ref op2 = op_insn->op2;
4113	ir_reg op2_reg = ctx->regs[insn->op2][2];
4114	ir_reg reg;
4115
4116	IR_ASSERT(insn->op == IR_RSTORE);
4117	reg = insn->op3;
4118
4119	if (op2_reg == IR_REG_NONE) {
4120		int32_t val = ir_fuse_imm(ctx, op2);
4121
4122		switch (op_insn->op) {
4123			default:
4124				IR_ASSERT(0 && "NIY binary op");
4125			case IR_ADD:
4126				|	ASM_REG_IMM_OP add, type, reg, val
4127				break;
4128			case IR_SUB:
4129				|	ASM_REG_IMM_OP sub, type, reg, val
4130				break;
4131			case IR_OR:
4132				|	ASM_REG_IMM_OP or, type, reg, val
4133				break;
4134			case IR_AND:
4135				|	ASM_REG_IMM_OP and, type, reg, val
4136				break;
4137			case IR_XOR:
4138				|	ASM_REG_IMM_OP xor, type, reg, val
4139				break;
4140		}
4141	} else {
4142		if (IR_REG_SPILLED(op2_reg)) {
4143			op2_reg = IR_REG_NUM(op2_reg);
4144			ir_emit_load(ctx, type, op2_reg, op2);
4145		}
4146		switch (op_insn->op) {
4147			default:
4148				IR_ASSERT(0 && "NIY binary op");
4149			case IR_ADD:
4150				|	ASM_REG_REG_OP add, type, reg, op2_reg
4151				break;
4152			case IR_SUB:
4153				|	ASM_REG_REG_OP sub, type, reg, op2_reg
4154				break;
4155			case IR_OR:
4156				|	ASM_REG_REG_OP or, type, reg, op2_reg
4157				break;
4158			case IR_AND:
4159				|	ASM_REG_REG_OP and, type, reg, op2_reg
4160				break;
4161			case IR_XOR:
4162				|	ASM_REG_REG_OP xor, type, reg, op2_reg
4163				break;
4164		}
4165	}
4166}
4167
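/*
 * Strength-reduced MUL/DIV/MOD with a power-of-two constant, roughly:
 *   x * 2^k  ->  x << k   (x + x when k == 1)
 *   x / 2^k  ->  x >> k   (logical shift; unsigned only)
 *   x % 2^k  ->  x & (2^k - 1)
 * Signed division and modulo take the biased paths in the functions below.
 */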
4168static void ir_emit_mul_div_mod_pwr2(ir_ctx *ctx, ir_ref def, ir_insn *insn)
4169{
4170	ir_backend_data *data = ctx->data;
4171	dasm_State **Dst = &data->dasm_state;
4172	ir_type type = insn->type;
4173	ir_ref op1 = insn->op1;
4174	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
4175	ir_reg op1_reg = ctx->regs[def][1];
4176
4177	IR_ASSERT(IR_IS_CONST_REF(insn->op2));
4178	IR_ASSERT(!IR_IS_SYM_CONST(ctx->ir_base[insn->op2].op));
4179	IR_ASSERT(def_reg != IR_REG_NONE);
4180
4181	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
4182		op1_reg = IR_REG_NUM(op1_reg);
4183		ir_emit_load(ctx, type, op1_reg, op1);
4184	}
4185	if (def_reg != op1_reg) {
4186		if (op1_reg != IR_REG_NONE) {
4187			ir_emit_mov(ctx, type, def_reg, op1_reg);
4188		} else {
4189			ir_emit_load(ctx, type, def_reg, op1);
4190		}
4191	}
4192	if (insn->op == IR_MUL) {
4193		uint32_t shift = IR_LOG2(ctx->ir_base[insn->op2].val.u64);
4194
4195		if (shift == 1) {
4196			|	ASM_REG_REG_OP add, type, def_reg, def_reg
4197		} else {
4198			|	ASM_REG_IMM_OP shl, type, def_reg, shift
4199		}
4200	} else if (insn->op == IR_DIV) {
4201		uint32_t shift = IR_LOG2(ctx->ir_base[insn->op2].val.u64);
4202
4203		|	ASM_REG_IMM_OP shr, type, def_reg, shift
4204	} else {
4205		IR_ASSERT(insn->op == IR_MOD);
4206		uint64_t mask = ctx->ir_base[insn->op2].val.u64 - 1;
4207
4208|.if X64
4209||		if (ir_type_size[type] == 8 && ctx->regs[def][2] != IR_REG_NONE) {
4210||			ir_reg op2_reg = ctx->regs[def][2];
4211||
4212||			op2_reg = IR_REG_NUM(op2_reg);
4213||			ir_emit_load_imm_int(ctx, type, op2_reg, mask);
4214			|	ASM_REG_REG_OP and, type, def_reg, op2_reg
4215||		} else {
4216|.endif
4217			|	ASM_REG_IMM_OP and, type, def_reg, mask
4218|.if X64
4219||		}
4220|.endif
4221	}
4222	if (IR_REG_SPILLED(ctx->regs[def][0])) {
4223		ir_emit_store(ctx, type, def, def_reg);
4224	}
4225}
4226
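/*
 * Signed division by 2^k: an arithmetic shift alone rounds toward negative
 * infinity, so a bias of (2^k - 1) is added for negative inputs to get
 * truncation toward zero. Roughly equivalent C for k > 1:
 *   d = x + (2^k - 1);
 *   if (x >= 0) d = x;        // test + cmovns
 *   q = d >> k;               // arithmetic shift
 * For k == 1 the bias is just the sign bit: q = (x + (x >>> (n-1))) >> 1
 * (n = type width in bits, >>> = logical shift).
 */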
4227static void ir_emit_sdiv_pwr2(ir_ctx *ctx, ir_ref def, ir_insn *insn)
4228{
4229	ir_backend_data *data = ctx->data;
4230	dasm_State **Dst = &data->dasm_state;
4231	ir_type type = insn->type;
4232	ir_ref op1 = insn->op1;
4233	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
4234	ir_reg op1_reg = ctx->regs[def][1];
4235	uint32_t shift = IR_LOG2(ctx->ir_base[insn->op2].val.u64);
4236	int64_t offset = ctx->ir_base[insn->op2].val.u64 - 1;
4237
4238	IR_ASSERT(shift != 0);
4239	IR_ASSERT(IR_IS_CONST_REF(insn->op2));
4240	IR_ASSERT(!IR_IS_SYM_CONST(ctx->ir_base[insn->op2].op));
4241	IR_ASSERT(op1_reg != IR_REG_NONE && def_reg != IR_REG_NONE && op1_reg != def_reg);
4242
4243	if (IR_REG_SPILLED(op1_reg)) {
4244		op1_reg = IR_REG_NUM(op1_reg);
4245		ir_emit_load(ctx, type, op1_reg, op1);
4246	}
4247
4248	if (shift == 1) {
4249|.if X64
4250||		if (ir_type_size[type] == 8) {
4251			|	mov Rq(def_reg), Rq(op1_reg)
4252			|	ASM_REG_IMM_OP shr, type, def_reg, 63
4253			|	add Rq(def_reg), Rq(op1_reg)
4254||		} else {
4255|.endif
4256			|	mov Rd(def_reg), Rd(op1_reg)
4257			|	ASM_REG_IMM_OP shr, type, def_reg, (ir_type_size[type]*8-1)
4258			|	add Rd(def_reg), Rd(op1_reg)
4259|.if X64
4260||		}
4261|.endif
4262	} else {
4263|.if X64
4264||		if (ir_type_size[type] == 8) {
4265||			ir_reg op2_reg = ctx->regs[def][2];
4266||
4267||			if (op2_reg != IR_REG_NONE) {
4268||				op2_reg = IR_REG_NUM(op2_reg);
4269||				ir_emit_load_imm_int(ctx, type, op2_reg, offset);
4270				|	lea Rq(def_reg), [Rq(op1_reg)+Rq(op2_reg)]
4271||			} else {
4272				|	lea Rq(def_reg), [Rq(op1_reg)+(int32_t)offset]
4273||			}
4274||		} else {
4275|.endif
4276			|	lea Rd(def_reg), [Rd(op1_reg)+(int32_t)offset]
4277|.if X64
4278||		}
4279|.endif
4280		|	ASM_REG_REG_OP test, type, op1_reg, op1_reg
4281		|	ASM_REG_REG_OP2 cmovns, type, def_reg, op1_reg
4282	}
4283	|	ASM_REG_IMM_OP sar, type, def_reg, shift
4284
4285	if (IR_REG_SPILLED(ctx->regs[def][0])) {
4286		ir_emit_store(ctx, type, def, def_reg);
4287	}
4288}
4289
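/*
 * Signed modulo by 2^k with truncated-division semantics: the same bias as
 * in ir_emit_sdiv_pwr2, applied around the mask. Roughly equivalent C:
 *   t = (x >> (n - 1)) >>> (n - k);   // (2^k - 1) if x < 0, else 0
 *   r = ((x + t) & (2^k - 1)) - t;
 * (n = type width in bits, >>> = logical shift).
 */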
4290static void ir_emit_smod_pwr2(ir_ctx *ctx, ir_ref def, ir_insn *insn)
4291{
4292	ir_backend_data *data = ctx->data;
4293	dasm_State **Dst = &data->dasm_state;
4294	ir_type type = insn->type;
4295	ir_ref op1 = insn->op1;
4296	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
4297	ir_reg op1_reg = ctx->regs[def][1];
4298	ir_reg tmp_reg = ctx->regs[def][3];
4299	uint32_t shift = IR_LOG2(ctx->ir_base[insn->op2].val.u64);
4300	uint64_t mask = ctx->ir_base[insn->op2].val.u64 - 1;
4301
4302	IR_ASSERT(shift != 0);
4303	IR_ASSERT(IR_IS_CONST_REF(insn->op2));
4304	IR_ASSERT(!IR_IS_SYM_CONST(ctx->ir_base[insn->op2].op));
4305	IR_ASSERT(def_reg != IR_REG_NONE && tmp_reg != IR_REG_NONE && def_reg != tmp_reg);
4306
4307	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
4308		op1_reg = IR_REG_NUM(op1_reg);
4309		ir_emit_load(ctx, type, op1_reg, op1);
4310	}
4311	if (def_reg != op1_reg) {
4312		if (op1_reg != IR_REG_NONE) {
4313			ir_emit_mov(ctx, type, def_reg, op1_reg);
4314		} else {
4315			ir_emit_load(ctx, type, def_reg, op1);
4316		}
4317	}
4318	if (tmp_reg != op1_reg) {
4319		ir_emit_mov(ctx, type, tmp_reg, def_reg);
4320	}
4321
4323	if (shift == 1) {
4324		|	ASM_REG_IMM_OP shr, type, tmp_reg, (ir_type_size[type]*8-1)
4325	} else {
4326		|	ASM_REG_IMM_OP sar, type, tmp_reg, (ir_type_size[type]*8-1)
4327		|	ASM_REG_IMM_OP shr, type, tmp_reg, (ir_type_size[type]*8-shift)
4328	}
4329	|	ASM_REG_REG_OP add, type, def_reg, tmp_reg
4330
4331|.if X64
4332||	if (ir_type_size[type] == 8 && ctx->regs[def][2] != IR_REG_NONE) {
4333||		ir_reg op2_reg = ctx->regs[def][2];
4334||
4335||		op2_reg = IR_REG_NUM(op2_reg);
4336||		ir_emit_load_imm_int(ctx, type, op2_reg, mask);
4337		|	ASM_REG_REG_OP and, type, def_reg, op2_reg
4338||	} else {
4339|.endif
4340		|	ASM_REG_IMM_OP and, type, def_reg, mask
4341|.if X64
4342||	}
4343|.endif
4344
4345	|	ASM_REG_REG_OP sub, type, def_reg, tmp_reg
4346
4347	if (IR_REG_SPILLED(ctx->regs[def][0])) {
4348		ir_emit_store(ctx, type, def, def_reg);
4349	}
4350}
4351
4352static void ir_emit_mem_mul_div_mod_pwr2(ir_ctx *ctx, ir_ref def, ir_insn *insn)
4353{
4354	ir_backend_data *data = ctx->data;
4355	dasm_State **Dst = &data->dasm_state;
4356	ir_insn *op_insn = &ctx->ir_base[insn->op3];
4357	ir_type type = op_insn->type;
4358	ir_mem mem;
4359
4360	IR_ASSERT(IR_IS_CONST_REF(op_insn->op2));
4361	IR_ASSERT(!IR_IS_SYM_CONST(ctx->ir_base[op_insn->op2].op));
4362
4363	if (insn->op == IR_STORE) {
4364		mem = ir_fuse_mem(ctx, def, def, insn, ctx->regs[def][2]);
4365	} else {
4366		IR_ASSERT(insn->op == IR_VSTORE);
4367		mem = ir_var_spill_slot(ctx, insn->op2);
4368	}
4369
4370	if (op_insn->op == IR_MUL) {
4371		uint32_t shift = IR_LOG2(ctx->ir_base[op_insn->op2].val.u64);
4372		|	ASM_MEM_IMM_OP shl, type, mem, shift
4373	} else if (op_insn->op == IR_DIV) {
4374		uint32_t shift = IR_LOG2(ctx->ir_base[op_insn->op2].val.u64);
4375		|	ASM_MEM_IMM_OP shr, type, mem, shift
4376	} else {
4377		IR_ASSERT(op_insn->op == IR_MOD);
4378		uint64_t mask = ctx->ir_base[op_insn->op2].val.u64 - 1;
4379		IR_ASSERT(IR_IS_UNSIGNED_32BIT(mask));
4380		|	ASM_MEM_IMM_OP and, type, mem, mask
4381	}
4382}
4383
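/*
 * Variable shifts and rotates require the count in CL on x86, so op2 is
 * funneled into RCX first; the register allocator guarantees def_reg is not
 * RCX (see the assert below).
 */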
4384static void ir_emit_shift(ir_ctx *ctx, ir_ref def, ir_insn *insn)
4385{
4386	ir_backend_data *data = ctx->data;
4387	dasm_State **Dst = &data->dasm_state;
4388	ir_type type = insn->type;
4389	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
4390	ir_reg op1_reg = ctx->regs[def][1];
4391	ir_reg op2_reg = ctx->regs[def][2];
4392
4393	IR_ASSERT(def_reg != IR_REG_NONE && def_reg != IR_REG_RCX);
4394	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
4395		op1_reg = IR_REG_NUM(op1_reg);
4396		ir_emit_load(ctx, type, op1_reg, insn->op1);
4397	}
4398	if (op2_reg != IR_REG_NONE && IR_REG_SPILLED(op2_reg)) {
4399		op2_reg = IR_REG_NUM(op2_reg);
4400		ir_emit_load(ctx, type, op2_reg, insn->op2);
4401	}
4402	if (op2_reg != IR_REG_RCX) {
4403		if (op1_reg == IR_REG_RCX) {
4404			ir_emit_mov(ctx, type, def_reg, op1_reg);
4405			op1_reg = def_reg;
4406		}
4407		if (op2_reg != IR_REG_NONE) {
4408			ir_emit_mov(ctx, type, IR_REG_RCX, op2_reg);
4409		} else {
4410			ir_emit_load(ctx, type, IR_REG_RCX, insn->op2);
4411		}
4412	}
4413	if (def_reg != op1_reg) {
4414		if (op1_reg != IR_REG_NONE) {
4415			ir_emit_mov(ctx, type, def_reg, op1_reg);
4416		} else {
4417			ir_emit_load(ctx, type, def_reg, insn->op1);
4418		}
4419	}
4420	switch (insn->op) {
4421		default:
4422			IR_ASSERT(0);
4423		case IR_SHL:
4424			|	ASM_REG_TXT_OP shl, insn->type, def_reg, cl
4425			break;
4426		case IR_SHR:
4427			|	ASM_REG_TXT_OP shr, insn->type, def_reg, cl
4428			break;
4429		case IR_SAR:
4430			|	ASM_REG_TXT_OP sar, insn->type, def_reg, cl
4431			break;
4432		case IR_ROL:
4433			|	ASM_REG_TXT_OP rol, insn->type, def_reg, cl
4434			break;
4435		case IR_ROR:
4436			|	ASM_REG_TXT_OP ror, insn->type, def_reg, cl
4437			break;
4438	}
4439	if (IR_REG_SPILLED(ctx->regs[def][0])) {
4440		ir_emit_store(ctx, type, def, def_reg);
4441	}
4442}
4443
4444static void ir_emit_mem_shift(ir_ctx *ctx, ir_ref def, ir_insn *insn)
4445{
4446	ir_backend_data *data = ctx->data;
4447	dasm_State **Dst = &data->dasm_state;
4448	ir_insn *op_insn = &ctx->ir_base[insn->op3];
4449	ir_type type = op_insn->type;
4450	ir_ref op2 = op_insn->op2;
4451	ir_reg op2_reg = ctx->regs[insn->op3][2];
4452	ir_mem mem;
4453
4454	if (insn->op == IR_STORE) {
4455		mem = ir_fuse_mem(ctx, def, def, insn, ctx->regs[def][2]);
4456	} else {
4457		IR_ASSERT(insn->op == IR_VSTORE);
4458		mem = ir_var_spill_slot(ctx, insn->op2);
4459	}
4460
4461	if (op2_reg != IR_REG_NONE && IR_REG_SPILLED(op2_reg)) {
4462		op2_reg = IR_REG_NUM(op2_reg);
4463		ir_emit_load(ctx, type, op2_reg, op2);
4464	}
4465	if (op2_reg != IR_REG_RCX) {
4466		if (op2_reg != IR_REG_NONE) {
4467			ir_emit_mov(ctx, type, IR_REG_RCX, op2_reg);
4468		} else {
4469			ir_emit_load(ctx, type, IR_REG_RCX, op2);
4470		}
4471	}
4472	switch (op_insn->op) {
4473		default:
4474			IR_ASSERT(0);
4475		case IR_SHL:
4476			|	ASM_MEM_TXT_OP shl, type, mem, cl
4477			break;
4478		case IR_SHR:
4479			|	ASM_MEM_TXT_OP shr, type, mem, cl
4480			break;
4481		case IR_SAR:
4482			|	ASM_MEM_TXT_OP sar, type, mem, cl
4483			break;
4484		case IR_ROL:
4485			|	ASM_MEM_TXT_OP rol, type, mem, cl
4486			break;
4487		case IR_ROR:
4488			|	ASM_MEM_TXT_OP ror, type, mem, cl
4489			break;
4490	}
4491}
4492
4493static void ir_emit_shift_const(ir_ctx *ctx, ir_ref def, ir_insn *insn)
4494{
4495	ir_backend_data *data = ctx->data;
4496	dasm_State **Dst = &data->dasm_state;
4497	int32_t shift;
4498	ir_type type = insn->type;
4499	ir_ref op1 = insn->op1;
4500	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
4501	ir_reg op1_reg = ctx->regs[def][1];
4502
4503	IR_ASSERT(!IR_IS_SYM_CONST(ctx->ir_base[insn->op2].op));
4504	IR_ASSERT(IR_IS_SIGNED_32BIT(ctx->ir_base[insn->op2].val.i64));
4505	shift = ctx->ir_base[insn->op2].val.i32;
4506	IR_ASSERT(def_reg != IR_REG_NONE);
4507
4508	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
4509		op1_reg = IR_REG_NUM(op1_reg);
4510		ir_emit_load(ctx, type, op1_reg, op1);
4511	}
4512	if (def_reg != op1_reg) {
4513		if (op1_reg != IR_REG_NONE) {
4514			ir_emit_mov(ctx, type, def_reg, op1_reg);
4515		} else {
4516			ir_emit_load(ctx, type, def_reg, op1);
4517		}
4518	}
4519	switch (insn->op) {
4520		default:
4521			IR_ASSERT(0);
4522		case IR_SHL:
4523			|	ASM_REG_IMM_OP shl, insn->type, def_reg, shift
4524			break;
4525		case IR_SHR:
4526			|	ASM_REG_IMM_OP shr, insn->type, def_reg, shift
4527			break;
4528		case IR_SAR:
4529			|	ASM_REG_IMM_OP sar, insn->type, def_reg, shift
4530			break;
4531		case IR_ROL:
4532			|	ASM_REG_IMM_OP rol, insn->type, def_reg, shift
4533			break;
4534		case IR_ROR:
4535			|	ASM_REG_IMM_OP ror, insn->type, def_reg, shift
4536			break;
4537	}
4538	if (IR_REG_SPILLED(ctx->regs[def][0])) {
4539		ir_emit_store(ctx, type, def, def_reg);
4540	}
4541}
4542
4543static void ir_emit_mem_shift_const(ir_ctx *ctx, ir_ref def, ir_insn *insn)
4544{
4545	ir_backend_data *data = ctx->data;
4546	dasm_State **Dst = &data->dasm_state;
4547	ir_insn *op_insn = &ctx->ir_base[insn->op3];
4548	ir_type type = op_insn->type;
4549	int32_t shift;
4550	ir_mem mem;
4551
4552	IR_ASSERT(IR_IS_CONST_REF(op_insn->op2));
4553	IR_ASSERT(!IR_IS_SYM_CONST(ctx->ir_base[op_insn->op2].op));
4554	IR_ASSERT(IR_IS_SIGNED_32BIT(ctx->ir_base[op_insn->op2].val.i64));
4555	shift = ctx->ir_base[op_insn->op2].val.i32;
4556	if (insn->op == IR_STORE) {
4557		mem = ir_fuse_mem(ctx, def, def, insn, ctx->regs[def][2]);
4558	} else {
4559		IR_ASSERT(insn->op == IR_VSTORE);
4560		mem = ir_var_spill_slot(ctx, insn->op2);
4561	}
4562
4563	switch (op_insn->op) {
4564		default:
4565			IR_ASSERT(0);
4566		case IR_SHL:
4567			|	ASM_MEM_IMM_OP shl, type, mem, shift
4568			break;
4569		case IR_SHR:
4570			|	ASM_MEM_IMM_OP shr, type, mem, shift
4571			break;
4572		case IR_SAR:
4573			|	ASM_MEM_IMM_OP sar, type, mem, shift
4574			break;
4575		case IR_ROL:
4576			|	ASM_MEM_IMM_OP rol, type, mem, shift
4577			break;
4578		case IR_ROR:
4579			|	ASM_MEM_IMM_OP ror, type, mem, shift
4580			break;
4581	}
4582}
4583
4584static void ir_emit_op_int(ir_ctx *ctx, ir_ref def, ir_insn *insn, uint32_t rule)
4585{
4586	ir_backend_data *data = ctx->data;
4587	dasm_State **Dst = &data->dasm_state;
4588	ir_type type = insn->type;
4589	ir_ref op1 = insn->op1;
4590	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
4591	ir_reg op1_reg = ctx->regs[def][1];
4592
4593	IR_ASSERT(def_reg != IR_REG_NONE);
4594
4595	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
4596		op1_reg = IR_REG_NUM(op1_reg);
4597		ir_emit_load(ctx, type, op1_reg, op1);
4598	}
4599	if (def_reg != op1_reg) {
4600		if (op1_reg != IR_REG_NONE) {
4601			ir_emit_mov(ctx, type, def_reg, op1_reg);
4602		} else {
4603			ir_emit_load(ctx, type, def_reg, op1);
4604		}
4605	}
4606	if (rule == IR_INC) {
4607		|	ASM_REG_OP inc, insn->type, def_reg
4608	} else if (rule == IR_DEC) {
4609		|	ASM_REG_OP dec, insn->type, def_reg
4610	} else if (insn->op == IR_NOT) {
4611		|	ASM_REG_OP not, insn->type, def_reg
4612	} else if (insn->op == IR_NEG) {
4613		|	ASM_REG_OP neg, insn->type, def_reg
4614	} else {
4615		IR_ASSERT(insn->op == IR_BSWAP);
4616		switch (ir_type_size[insn->type]) {
4617			default:
4618				IR_ASSERT(0);
4619			case 4:
4620				|	bswap Rd(def_reg)
4621				break;
4622			case 8:
4623				IR_ASSERT(sizeof(void*) == 8);
4624|.if X64
4625				|	bswap Rq(def_reg)
4626|.endif
4627				break;
4628		}
4629	}
4630	if (IR_REG_SPILLED(ctx->regs[def][0])) {
4631		ir_emit_store(ctx, type, def, def_reg);
4632	}
4633}
4634
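/*
 * CTLZ/CTTZ/CTPOP. With BMI1 support, lzcnt/tzcnt are used directly, and
 * popcnt is emitted unconditionally here (the SWAR fallback lives in
 * ir_emit_ctpop below). Otherwise bsr/bsf serve as fallbacks: bsr returns
 * the index of the highest set bit, so clz = (width-1) - index, computed as
 * "xor width-1". Note that bsr/bsf leave the destination undefined for a
 * zero input, so the fallback assumes a non-zero operand.
 */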
4635static void ir_emit_bit_count(ir_ctx *ctx, ir_ref def, ir_insn *insn)
4636{
4637	ir_backend_data *data = ctx->data;
4638	dasm_State **Dst = &data->dasm_state;
4639	ir_type type = insn->type;
4640	ir_ref op1 = insn->op1;
4641	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
4642	ir_reg op1_reg = ctx->regs[def][1];
4643
4644	IR_ASSERT(def_reg != IR_REG_NONE);
4645
4646	if (op1_reg != IR_REG_NONE) {
4647		if (IR_REG_SPILLED(op1_reg)) {
4648			op1_reg = IR_REG_NUM(op1_reg);
4649			ir_emit_load(ctx, type, op1_reg, op1);
4650		}
4651		switch (ir_type_size[insn->type]) {
4652			default:
4653				IR_ASSERT(0);
4654			case 2:
4655				if (insn->op == IR_CTLZ) {
4656					if (ctx->mflags & IR_X86_BMI1) {
4657						|	lzcnt Rw(def_reg), Rw(op1_reg)
4658					} else {
4659						|	bsr Rw(def_reg), Rw(op1_reg)
4660						|	xor Rw(def_reg), 0xf
4661					}
4662				} else if (insn->op == IR_CTTZ) {
4663					if (ctx->mflags & IR_X86_BMI1) {
4664						|	tzcnt Rw(def_reg), Rw(op1_reg)
4665					} else {
4666						|	bsf Rw(def_reg), Rw(op1_reg)
4667					}
4668				} else {
4669					IR_ASSERT(insn->op == IR_CTPOP);
4670					|	popcnt Rw(def_reg), Rw(op1_reg)
4671				}
4672				break;
4673			case 1:
4674				|	movzx Rd(op1_reg), Rb(op1_reg)
4675				if (insn->op == IR_CTLZ) {
4676					if (ctx->mflags & IR_X86_BMI1) {
4677						|	lzcnt Rd(def_reg), Rd(op1_reg)
4678						|	sub Rd(def_reg), 24
4679					} else {
4680						|	bsr Rd(def_reg), Rd(op1_reg)
4681						|	xor Rw(def_reg), 0x7
4682					}
4683					break;
4684				}
4685				IR_FALLTHROUGH;
4686			case 4:
4687				if (insn->op == IR_CTLZ) {
4688					if (ctx->mflags & IR_X86_BMI1) {
4689						|	lzcnt Rd(def_reg), Rd(op1_reg)
4690					} else {
4691						|	bsr Rd(def_reg), Rd(op1_reg)
4692						|	xor Rw(def_reg), 0x1f
4693					}
4694				} else if (insn->op == IR_CTTZ) {
4695					if (ctx->mflags & IR_X86_BMI1) {
4696						|	tzcnt Rd(def_reg), Rd(op1_reg)
4697					} else {
4698						|	bsf Rd(def_reg), Rd(op1_reg)
4699					}
4700				} else {
4701					IR_ASSERT(insn->op == IR_CTPOP);
4702					|	popcnt Rd(def_reg), Rd(op1_reg)
4703				}
4704				break;
4705|.if X64
4706			case 8:
4707				if (insn->op == IR_CTLZ) {
4708					if (ctx->mflags & IR_X86_BMI1) {
4709						|	lzcnt Rq(def_reg), Rq(op1_reg)
4710					} else {
4711						|	bsr Rq(def_reg), Rq(op1_reg)
4712						|	xor Rw(def_reg), 0x3f
4713					}
4714				} else if (insn->op == IR_CTTZ) {
4715					if (ctx->mflags & IR_X86_BMI1) {
4716						|	tzcnt Rq(def_reg), Rq(op1_reg)
4717					} else {
4718						|	bsf Rq(def_reg), Rq(op1_reg)
4719					}
4720				} else {
4721					IR_ASSERT(insn->op == IR_CTPOP);
4722					|	popcnt Rq(def_reg), Rq(op1_reg)
4723				}
4724				break;
4725|.endif
4726		}
4727	} else {
4728		ir_mem mem;
4729
4730		if (ir_rule(ctx, op1) & IR_FUSED) {
4731			mem = ir_fuse_load(ctx, def, op1);
4732		} else {
4733			mem = ir_ref_spill_slot(ctx, op1);
4734		}
4735		switch (ir_type_size[insn->type]) {
4736			default:
4737				IR_ASSERT(0);
4738			case 2:
4739				if (insn->op == IR_CTLZ) {
4740					if (ctx->mflags & IR_X86_BMI1) {
4741						|	ASM_TXT_TMEM_OP lzcnt, Rw(def_reg), word, mem
4742					} else {
4743						|	ASM_TXT_TMEM_OP bsr, Rw(def_reg), word, mem
4744						|	xor Rw(def_reg), 0xf
4745					}
4746				} else if (insn->op == IR_CTTZ) {
4747					if (ctx->mflags & IR_X86_BMI1) {
4748						|	ASM_TXT_TMEM_OP tzcnt, Rw(def_reg), word, mem
4749					} else {
4750						|	ASM_TXT_TMEM_OP bsf, Rw(def_reg), word, mem
4751					}
4752				} else {
4753					|	ASM_TXT_TMEM_OP popcnt, Rw(def_reg), word, mem
4754				}
4755				break;
4756			case 4:
4757				if (insn->op == IR_CTLZ) {
4758					if (ctx->mflags & IR_X86_BMI1) {
4759						|	ASM_TXT_TMEM_OP lzcnt, Rd(def_reg), dword, mem
4760					} else {
4761						|	ASM_TXT_TMEM_OP bsr, Rd(def_reg), dword, mem
4762						|	xor Rw(def_reg), 0x1f
4763					}
4764				} else if (insn->op == IR_CTTZ) {
4765					if (ctx->mflags & IR_X86_BMI1) {
4766						|	ASM_TXT_TMEM_OP tzcnt, Rd(def_reg), dword, mem
4767					} else {
4768						|	ASM_TXT_TMEM_OP bsf, Rd(def_reg), dword, mem
4769					}
4770				} else {
4771					|	ASM_TXT_TMEM_OP popcnt, Rd(def_reg), dword, mem
4772				}
4773				break;
4774|.if X64
4775			case 8:
4776				if (insn->op == IR_CTLZ) {
4777					if (ctx->mflags & IR_X86_BMI1) {
4778						|	ASM_TXT_TMEM_OP lzcnt, Rq(def_reg), qword, mem
4779					} else {
4780						|	ASM_TXT_TMEM_OP bsr, Rq(def_reg), qword, mem
4781						|	xor Rw(def_reg), 0x3f
4782					}
4783				} else if (insn->op == IR_CTTZ) {
4784					if (ctx->mflags & IR_X86_BMI1) {
4785						|	ASM_TXT_TMEM_OP tzcnt, Rq(def_reg), qword, mem
4786					} else {
4787						|	ASM_TXT_TMEM_OP bsf, Rq(def_reg), qword, mem
4788					}
4789				} else {
4790					|	ASM_TXT_TMEM_OP popcnt, Rq(def_reg), qword, mem
4791				}
4792				break;
4793|.endif
4794		}
4795	}
4796
4797	if (IR_REG_SPILLED(ctx->regs[def][0])) {
4798		ir_emit_store(ctx, type, def, def_reg);
4799	}
4800}
4801
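/*
 * Portable population count for targets without the popcnt instruction,
 * using the classic SWAR bit-slicing reduction. Roughly equivalent C for
 * the 32-bit case:
 *   x -= (x >> 1) & 0x55555555;                      // 2-bit sums
 *   x  = (x & 0x33333333) + ((x >> 2) & 0x33333333); // 4-bit sums
 *   x  = (x + (x >> 4)) & 0x0f0f0f0f;                // 8-bit sums
 *   x  = (x * 0x01010101) >> 24;                     // add the four bytes
 * The 64-bit variant materializes the wide masks with mov64 first.
 */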
4802static void ir_emit_ctpop(ir_ctx *ctx, ir_ref def, ir_insn *insn)
4803{
4804	ir_backend_data *data = ctx->data;
4805	dasm_State **Dst = &data->dasm_state;
4806	ir_type type = insn->type;
4807	ir_ref op1 = insn->op1;
4808	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
4809	ir_reg op1_reg = ctx->regs[def][1];
4810	ir_reg tmp_reg = ctx->regs[def][2];
4811|.if X64
4812||	ir_reg const_reg = ctx->regs[def][3];
4813|.endif
4814
4815	IR_ASSERT(def_reg != IR_REG_NONE && tmp_reg != IR_REG_NONE);
4816	if (op1_reg == IR_REG_NONE) {
4817		ir_emit_load(ctx, type, def_reg, op1);
4818		if (ir_type_size[insn->type] == 1) {
4819			|	movzx Rd(def_reg), Rb(def_reg)
4820		} else if (ir_type_size[insn->type] == 2) {
4821			|	movzx Rd(def_reg), Rw(def_reg)
4822		}
4823	} else {
4824		if (IR_REG_SPILLED(op1_reg)) {
4825			op1_reg = IR_REG_NUM(op1_reg);
4826			ir_emit_load(ctx, type, op1_reg, op1);
4827		}
4828		switch (ir_type_size[insn->type]) {
4829			default:
4830				IR_ASSERT(0);
4831			case 1:
4832				|	movzx Rd(def_reg), Rb(op1_reg)
4833				break;
4834			case 2:
4835				|	movzx Rd(def_reg), Rw(op1_reg)
4836				break;
4837			case 4:
4838				|	mov Rd(def_reg), Rd(op1_reg)
4839				break;
4840|.if X64
4841||			case 8:
4842				|	mov Rq(def_reg), Rq(op1_reg)
4843||				break;
4844|.endif
4845		}
4846	}
4847	switch (ir_type_size[insn->type]) {
4848		default:
4849			IR_ASSERT(0);
4850		case 1:
4851			|	mov Rd(tmp_reg), Rd(def_reg)
4852			|	shr Rd(def_reg), 1
4853			|	and Rd(def_reg), 0x55
4854			|	sub Rd(tmp_reg), Rd(def_reg)
4855			|	mov Rd(def_reg), Rd(tmp_reg)
4856			|	and Rd(def_reg), 0x33
4857			|	shr Rd(tmp_reg), 2
4858			|	and Rd(tmp_reg), 0x33
4859			|	add Rd(tmp_reg), Rd(def_reg)
4860			|	mov Rd(def_reg), Rd(tmp_reg)
4861			|	shr Rd(def_reg), 4
4862			|	add Rd(def_reg), Rd(tmp_reg)
4863			|	and Rd(def_reg), 0x0f
4864			break;
4865		case 2:
4866			|	mov Rd(tmp_reg), Rd(def_reg)
4867			|	shr Rd(def_reg), 1
4868			|	and Rd(def_reg), 0x5555
4869			|	sub Rd(tmp_reg), Rd(def_reg)
4870			|	mov Rd(def_reg), Rd(tmp_reg)
4871			|	and Rd(def_reg), 0x3333
4872			|	shr Rd(tmp_reg), 2
4873			|	and Rd(tmp_reg), 0x3333
4874			|	add Rd(tmp_reg), Rd(def_reg)
4875			|	mov Rd(def_reg), Rd(tmp_reg)
4876			|	shr Rd(def_reg), 4
4877			|	add Rd(def_reg), Rd(tmp_reg)
4878			|	and Rd(def_reg), 0x0f0f
4879			|	mov	Rd(tmp_reg), Rd(def_reg)
4880			|	shr Rd(tmp_reg), 8
4881			|	and Rd(def_reg), 0x0f
4882			|	add Rd(def_reg), Rd(tmp_reg)
4883			break;
4884		case 4:
4885			|	mov Rd(tmp_reg), Rd(def_reg)
4886			|	shr Rd(def_reg), 1
4887			|	and Rd(def_reg), 0x55555555
4888			|	sub Rd(tmp_reg), Rd(def_reg)
4889			|	mov Rd(def_reg), Rd(tmp_reg)
4890			|	and Rd(def_reg), 0x33333333
4891			|	shr Rd(tmp_reg), 2
4892			|	and Rd(tmp_reg), 0x33333333
4893			|	add Rd(tmp_reg), Rd(def_reg)
4894			|	mov Rd(def_reg), Rd(tmp_reg)
4895			|	shr Rd(def_reg), 4
4896			|	add Rd(def_reg), Rd(tmp_reg)
4897			|	and Rd(def_reg), 0x0f0f0f0f
4898			|	imul Rd(def_reg), 0x01010101
4899			|	shr Rd(def_reg), 24
4900			break;
4901|.if X64
4902||		case 8:
4903||			IR_ASSERT(const_reg != IR_REG_NONE);
4904			|	mov Rq(tmp_reg), Rq(def_reg)
4905			|	shr Rq(def_reg), 1
4906			|	mov64 Rq(const_reg), 0x5555555555555555
4907			|	and Rq(def_reg), Rq(const_reg)
4908			|	sub Rq(tmp_reg), Rq(def_reg)
4909			|	mov Rq(def_reg), Rq(tmp_reg)
4910			|	mov64 Rq(const_reg), 0x3333333333333333
4911			|	and Rq(def_reg), Rq(const_reg)
4912			|	shr Rq(tmp_reg), 2
4913			|	and Rq(tmp_reg), Rq(const_reg)
4914			|	add Rq(tmp_reg), Rq(def_reg)
4915			|	mov Rq(def_reg), Rq(tmp_reg)
4916			|	shr Rq(def_reg), 4
4917			|	add Rq(def_reg), Rq(tmp_reg)
4918			|	mov64 Rq(const_reg), 0x0f0f0f0f0f0f0f0f
4919			|	and Rq(def_reg), Rq(const_reg)
4920			|	mov64 Rq(const_reg), 0x0101010101010101
4921			|	imul Rq(def_reg), Rq(const_reg)
4922			|	shr Rq(def_reg), 56
4923||			break;
4924|.endif
4925	}
4926
4927	if (IR_REG_SPILLED(ctx->regs[def][0])) {
4928		ir_emit_store(ctx, type, def, def_reg);
4929	}
4930}
4931
4932static void ir_emit_mem_op_int(ir_ctx *ctx, ir_ref def, ir_insn *insn, uint32_t rule)
4933{
4934	ir_backend_data *data = ctx->data;
4935	dasm_State **Dst = &data->dasm_state;
4936	ir_insn *op_insn = &ctx->ir_base[insn->op3];
4937	ir_type type = op_insn->type;
4938	ir_mem mem;
4939
4940	if (insn->op == IR_STORE) {
4941		mem = ir_fuse_mem(ctx, def, def, insn, ctx->regs[def][2]);
4942	} else {
4943		IR_ASSERT(insn->op == IR_VSTORE);
4944		mem = ir_var_spill_slot(ctx, insn->op2);
4945	}
4946
4947	if (rule == IR_MEM_INC) {
4948		|	ASM_MEM_OP inc, type, mem
4949	} else if (rule == IR_MEM_DEC) {
4950		|	ASM_MEM_OP dec, type, mem
4951	} else if (op_insn->op == IR_NOT) {
4952		|	ASM_MEM_OP not, type, mem
4953	} else {
4954		IR_ASSERT(op_insn->op == IR_NEG);
4955		|	ASM_MEM_OP neg, type, mem
4956	}
4957}
4958
4959static void ir_emit_abs_int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
4960{
4961	ir_backend_data *data = ctx->data;
4962	dasm_State **Dst = &data->dasm_state;
4963	ir_type type = insn->type;
4964	ir_ref op1 = insn->op1;
4965	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
4966	ir_reg op1_reg = ctx->regs[def][1];
4967
4968	IR_ASSERT(def_reg != IR_REG_NONE && op1_reg != IR_REG_NONE);
4969
4970	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
4971		op1_reg = IR_REG_NUM(op1_reg);
4972		ir_emit_load(ctx, type, op1_reg, op1);
4973	}
4974
4975	IR_ASSERT(def_reg != op1_reg);
4976
4977	ir_emit_mov(ctx, insn->type, def_reg, op1_reg);
4978	|	ASM_REG_OP neg, insn->type, def_reg
4979	|	ASM_REG_REG_OP2 cmovs, type, def_reg, op1_reg
4980	if (IR_REG_SPILLED(ctx->regs[def][0])) {
4981		ir_emit_store(ctx, type, def, def_reg);
4982	}
4983}
4984
4985static void ir_emit_bool_not_int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
4986{
4987	ir_backend_data *data = ctx->data;
4988	dasm_State **Dst = &data->dasm_state;
4989	ir_type type = ctx->ir_base[insn->op1].type;
4990	ir_ref op1 = insn->op1;
4991	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
4992	ir_reg op1_reg = ctx->regs[def][1];
4993
4994	IR_ASSERT(def_reg != IR_REG_NONE);
4995
4996	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
4997		op1_reg = IR_REG_NUM(op1_reg);
4998		ir_emit_load(ctx, type, op1_reg, op1);
4999	}
5000
5001	if (op1_reg != IR_REG_NONE) {
5002		|	ASM_REG_REG_OP test, type, op1_reg, op1_reg
5003	} else {
5004		ir_mem mem = ir_ref_spill_slot(ctx, op1);
5005
5006		|	ASM_MEM_IMM_OP cmp, type, mem, 0
5007	}
5008	|	sete Rb(def_reg)
5009
5010	if (IR_REG_SPILLED(ctx->regs[def][0])) {
5011		ir_emit_store(ctx, type, def, def_reg);
5012	}
5013}
5014
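/*
 * Full-width mul/div/mod use the fixed x86 register convention: the
 * dividend (or multiplicand) lives in RDX:RAX, idiv/div leave the quotient
 * in RAX and the remainder in RDX (AH for 8-bit operands), and signed
 * division sign-extends RAX first via cwd/cdq/cqo.
 */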
5015static void ir_emit_mul_div_mod(ir_ctx *ctx, ir_ref def, ir_insn *insn)
5016{
5017	ir_backend_data *data = ctx->data;
5018	dasm_State **Dst = &data->dasm_state;
5019	ir_type type = insn->type;
5020	ir_ref op1 = insn->op1;
5021	ir_ref op2 = insn->op2;
5022	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
5023	ir_reg op1_reg = ctx->regs[def][1];
5024	ir_reg op2_reg = ctx->regs[def][2];
5025	ir_mem mem;
5026
5027	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
5028		op1_reg = IR_REG_NUM(op1_reg);
5029		ir_emit_load(ctx, type, op1_reg, op1);
5030	}
5031	if (op1_reg != IR_REG_RAX) {
5032		if (op1_reg != IR_REG_NONE) {
5033			ir_emit_mov(ctx, type, IR_REG_RAX, op1_reg);
5034		} else {
5035			ir_emit_load(ctx, type, IR_REG_RAX, op1);
5036		}
5037	}
5038	if (op2_reg == IR_REG_NONE && op1 == op2) {
5039		op2_reg = IR_REG_RAX;
5040	} else if (op2_reg != IR_REG_NONE) {
5041		if (IR_REG_SPILLED(op2_reg)) {
5042			op2_reg = IR_REG_NUM(op2_reg);
5043			ir_emit_load(ctx, type, op2_reg, op2);
5044		}
5045	} else if (IR_IS_CONST_REF(op2)
5046	 && (insn->op == IR_MUL || insn->op == IR_MUL_OV)) {
5047		op2_reg = IR_REG_RDX;
5048		ir_emit_load(ctx, type, op2_reg, op2);
5049	}
5050	if (insn->op == IR_MUL || insn->op == IR_MUL_OV) {
5051		if (IR_IS_TYPE_SIGNED(insn->type)) {
5052			if (op2_reg != IR_REG_NONE) {
5053				|	ASM_REG_OP imul, type, op2_reg
5054			} else {
5055				if (ir_rule(ctx, op2) & IR_FUSED) {
5056					mem = ir_fuse_load(ctx, def, op2);
5057				} else {
5058					mem = ir_ref_spill_slot(ctx, op2);
5059				}
5060				|	ASM_MEM_OP imul, type, mem
5061			}
5062		} else {
5063			if (op2_reg != IR_REG_NONE) {
5064				|	ASM_REG_OP mul, type, op2_reg
5065			} else {
5066				if (ir_rule(ctx, op2) & IR_FUSED) {
5067					mem = ir_fuse_load(ctx, def, op2);
5068				} else {
5069					mem = ir_ref_spill_slot(ctx, op2);
5070				}
5071				|	ASM_MEM_OP mul, type, mem
5072			}
5073		}
5074	} else {
5075		if (IR_IS_TYPE_SIGNED(type)) {
5076			if (ir_type_size[type] == 8) {
5077				|	cqo
5078			} else if (ir_type_size[type] == 4) {
5079				|	cdq
5080			} else if (ir_type_size[type] == 2) {
5081				|	cwd
5082			} else {
5083				|	movsx ax, al
5084			}
5085			if (op2_reg != IR_REG_NONE) {
5086				|	ASM_REG_OP idiv, type, op2_reg
5087			} else {
5088				if (ir_rule(ctx, op2) & IR_FUSED) {
5089					mem = ir_fuse_load(ctx, def, op2);
5090				} else {
5091					mem = ir_ref_spill_slot(ctx, op2);
5092				}
5093				|	ASM_MEM_OP idiv, type, mem
5094			}
5095		} else {
5096			if (ir_type_size[type] == 1) {
5097				|	movzx ax, al
5098			} else {
5099				|	ASM_REG_REG_OP xor, type, IR_REG_RDX, IR_REG_RDX
5100			}
5101			if (op2_reg != IR_REG_NONE) {
5102				|	ASM_REG_OP div, type, op2_reg
5103			} else {
5104				if (ir_rule(ctx, op2) & IR_FUSED) {
5105					mem = ir_fuse_load(ctx, def, op2);
5106				} else {
5107					mem = ir_ref_spill_slot(ctx, op2);
5108				}
5109				|	ASM_MEM_OP div, type, mem
5110			}
5111		}
5112	}
5113
5114	if (insn->op == IR_MUL || insn->op == IR_MUL_OV || insn->op == IR_DIV) {
5115		if (def_reg != IR_REG_NONE) {
5116			if (def_reg != IR_REG_RAX) {
5117				ir_emit_mov(ctx, type, def_reg, IR_REG_RAX);
5118			}
5119			if (IR_REG_SPILLED(ctx->regs[def][0])) {
5120				ir_emit_store(ctx, type, def, def_reg);
5121			}
5122		} else {
5123			ir_emit_store(ctx, type, def, IR_REG_RAX);
5124		}
5125	} else {
5126		IR_ASSERT(insn->op == IR_MOD);
5127		if (ir_type_size[type] == 1) {
5128			if (def_reg != IR_REG_NONE) {
5129				|	mov al, ah
5130				if (def_reg != IR_REG_RAX) {
5131					|	mov Rb(def_reg), al
5132				}
5133				if (IR_REG_SPILLED(ctx->regs[def][0])) {
5134					ir_emit_store(ctx, type, def, def_reg);
5135				}
5136			} else {
5137				ir_reg fp;
5138				int32_t offset = ir_ref_spill_slot_offset(ctx, def, &fp);
5139
5140				/* 8-bit MOD leaves the remainder in AH; store it straight into the spill slot */
5141			|	mov byte [Ra(fp)+offset], ah
5142			}
5143		} else {
5144			if (def_reg != IR_REG_NONE) {
5145				if (def_reg != IR_REG_RDX) {
5146					ir_emit_mov(ctx, type, def_reg, IR_REG_RDX);
5147				}
5148				if (IR_REG_SPILLED(ctx->regs[def][0])) {
5149					ir_emit_store(ctx, type, def, def_reg);
5150				}
5151			} else {
5152				ir_emit_store(ctx, type, def, IR_REG_RDX);
5153			}
5154		}
5155	}
5156}
5157
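/* Switch the DynAsm output to the .rodata section, defining its label on
 * first use; callers emit their constants and then switch back with "|.code". */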
5158static void ir_rodata(ir_ctx *ctx)
5159{
5160	ir_backend_data *data = ctx->data;
5161	dasm_State **Dst = &data->dasm_state;
5162
5163	|.rodata
5164	if (!data->rodata_label) {
5165		int label = data->rodata_label = ctx->cfg_blocks_count + ctx->consts_count + 2;
5166		|=>label:
5167	}
5168}
5169
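/* FP NEG/ABS are implemented by flipping or clearing the sign bit with a
 * 16-byte mask (XORPS/ANDPS or their AVX/double forms); the mask constants
 * are emitted into .rodata on first use. */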
5170static void ir_emit_op_fp(ir_ctx *ctx, ir_ref def, ir_insn *insn)
5171{
5172	ir_backend_data *data = ctx->data;
5173	dasm_State **Dst = &data->dasm_state;
5174	ir_type type = insn->type;
5175	ir_ref op1 = insn->op1;
5176	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
5177	ir_reg op1_reg = ctx->regs[def][1];
5178
5179	IR_ASSERT(def_reg != IR_REG_NONE);
5180
5181	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
5182		op1_reg = IR_REG_NUM(op1_reg);
5183		ir_emit_load(ctx, type, op1_reg, op1);
5184	}
5185	if (def_reg != op1_reg) {
5186		if (op1_reg != IR_REG_NONE) {
5187			ir_emit_fp_mov(ctx, type, def_reg, op1_reg);
5188		} else {
5189			ir_emit_load(ctx, type, def_reg, op1);
5190		}
5191	}
5192	if (insn->op == IR_NEG) {
5193		if (insn->type == IR_DOUBLE) {
5194			if (!data->double_neg_const) {
5195				data->double_neg_const = 1;
5196				ir_rodata(ctx);
5197				|.align 16
5198				|->double_neg_const:
5199				|.dword 0, 0x80000000, 0, 0
5200				|.code
5201			}
5202			if (ctx->mflags & IR_X86_AVX) {
5203				|	vxorpd xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), [->double_neg_const]
5204			} else {
5205				|	xorpd xmm(def_reg-IR_REG_FP_FIRST), [->double_neg_const]
5206			}
5207		} else {
5208			IR_ASSERT(insn->type == IR_FLOAT);
5209			if (!data->float_neg_const) {
5210				data->float_neg_const = 1;
5211				ir_rodata(ctx);
5212				|.align 16
5213				|->float_neg_const:
5214				|.dword 0x80000000, 0, 0, 0
5215				|.code
5216			}
5217			if (ctx->mflags & IR_X86_AVX) {
5218				|	vxorps xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), [->float_neg_const]
5219			} else {
5220				|	xorps xmm(def_reg-IR_REG_FP_FIRST), [->float_neg_const]
5221			}
5222		}
5223	} else {
5224		IR_ASSERT(insn->op == IR_ABS);
5225		if (insn->type == IR_DOUBLE) {
5226			if (!data->double_abs_const) {
5227				data->double_abs_const = 1;
5228				ir_rodata(ctx);
5229				|.align 16
5230				|->double_abs_const:
5231				|.dword 0xffffffff, 0x7fffffff, 0, 0
5232				|.code
5233			}
5234			if (ctx->mflags & IR_X86_AVX) {
5235				|	vandpd xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), [->double_abs_const]
5236			} else {
5237				|	andpd xmm(def_reg-IR_REG_FP_FIRST), [->double_abs_const]
5238			}
5239		} else {
5240			IR_ASSERT(insn->type == IR_FLOAT);
5241			if (!data->float_abs_const) {
5242				data->float_abs_const = 1;
5243				ir_rodata(ctx);
5244				|.align 16
5245				|->float_abs_const:
5246				|.dword 0x7fffffff, 0, 0, 0
5247				|.code
5248			}
5249			if (ctx->mflags & IR_X86_AVX) {
5250				|	vandps xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), [->float_abs_const]
5251			} else {
5252				|	andps xmm(def_reg-IR_REG_FP_FIRST), [->float_abs_const]
5253			}
5254		}
5255	}
5256	if (IR_REG_SPILLED(ctx->regs[def][0])) {
5257		ir_emit_store(ctx, insn->type, def, def_reg);
5258	}
5259}
5260
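/* Two-operand SSE2 arithmetic: op1 is first brought into the result register,
 * then op2 is applied from a register, a constant label or memory. */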
5261static void ir_emit_binop_sse2(ir_ctx *ctx, ir_ref def, ir_insn *insn)
5262{
5263	ir_backend_data *data = ctx->data;
5264	dasm_State **Dst = &data->dasm_state;
5265	ir_type type = insn->type;
5266	ir_ref op1 = insn->op1;
5267	ir_ref op2 = insn->op2;
5268	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
5269	ir_reg op1_reg = ctx->regs[def][1];
5270	ir_reg op2_reg = ctx->regs[def][2];
5271
5272	IR_ASSERT(def_reg != IR_REG_NONE);
5273
5274	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
5275		op1_reg = IR_REG_NUM(op1_reg);
5276		ir_emit_load(ctx, type, op1_reg, op1);
5277	}
5278	if (def_reg != op1_reg) {
5279		if (op1_reg != IR_REG_NONE) {
5280			ir_emit_fp_mov(ctx, type, def_reg, op1_reg);
5281		} else {
5282			ir_emit_load(ctx, type, def_reg, op1);
5283		}
5284		if (op1 == op2) {
5285			op2_reg = def_reg;
5286		}
5287	}
5288	if (op2_reg != IR_REG_NONE) {
5289		if (IR_REG_SPILLED(op2_reg)) {
5290			op2_reg = IR_REG_NUM(op2_reg);
5291			if (op1 != op2) {
5292				ir_emit_load(ctx, type, op2_reg, op2);
5293			}
5294		}
5295		switch (insn->op) {
5296			default:
5297				IR_ASSERT(0 && "NIY binary op");
5298			case IR_ADD:
5299				|	ASM_SSE2_REG_REG_OP adds, type, def_reg, op2_reg
5300				break;
5301			case IR_SUB:
5302				|	ASM_SSE2_REG_REG_OP subs, type, def_reg, op2_reg
5303				break;
5304			case IR_MUL:
5305				|	ASM_SSE2_REG_REG_OP muls, type, def_reg, op2_reg
5306				break;
5307			case IR_DIV:
5308				|	ASM_SSE2_REG_REG_OP divs, type, def_reg, op2_reg
5309				break;
5310			case IR_MIN:
5311				|	ASM_SSE2_REG_REG_OP mins, type, def_reg, op2_reg
5312				break;
5313			case IR_MAX:
5314				|	ASM_SSE2_REG_REG_OP maxs, type, def_reg, op2_reg
5315				break;
5316		}
5317	} else if (IR_IS_CONST_REF(op2)) {
5318		int label = ir_const_label(ctx, op2);
5319
5320		switch (insn->op) {
5321			default:
5322				IR_ASSERT(0 && "NIY binary op");
5323			case IR_ADD:
5324				|	ASM_SSE2_REG_TXT_OP adds, type, def_reg, [=>label]
5325				break;
5326			case IR_SUB:
5327				|	ASM_SSE2_REG_TXT_OP subs, type, def_reg, [=>label]
5328				break;
5329			case IR_MUL:
5330				|	ASM_SSE2_REG_TXT_OP muls, type, def_reg, [=>label]
5331				break;
5332			case IR_DIV:
5333				|	ASM_SSE2_REG_TXT_OP divs, type, def_reg, [=>label]
5334				break;
5335			case IR_MIN:
5336				|	ASM_SSE2_REG_TXT_OP mins, type, def_reg, [=>label]
5337				break;
5338			case IR_MAX:
5339				|	ASM_SSE2_REG_TXT_OP maxs, type, def_reg, [=>label]
5340				break;
5341		}
5342	} else {
5343		ir_mem mem;
5344
5345		if (ir_rule(ctx, op2) & IR_FUSED) {
5346			mem = ir_fuse_load(ctx, def, op2);
5347		} else {
5348			mem = ir_ref_spill_slot(ctx, op2);
5349		}
5350		switch (insn->op) {
5351			default:
5352				IR_ASSERT(0 && "NIY binary op");
5353			case IR_ADD:
5354				|	ASM_SSE2_REG_MEM_OP adds, type, def_reg, mem
5355				break;
5356			case IR_SUB:
5357				|	ASM_SSE2_REG_MEM_OP subs, type, def_reg, mem
5358				break;
5359			case IR_MUL:
5360				|	ASM_SSE2_REG_MEM_OP muls, type, def_reg, mem
5361				break;
5362			case IR_DIV:
5363				|	ASM_SSE2_REG_MEM_OP divs, type, def_reg, mem
5364				break;
5365			case IR_MIN:
5366				|	ASM_SSE2_REG_MEM_OP mins, type, def_reg, mem
5367				break;
5368			case IR_MAX:
5369				|	ASM_SSE2_REG_MEM_OP maxs, type, def_reg, mem
5370				break;
5371		}
5372	}
5373	if (IR_REG_SPILLED(ctx->regs[def][0])) {
5374		ir_emit_store(ctx, insn->type, def, def_reg);
5375	}
5376}
5377
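/* Three-operand AVX arithmetic: the destination may differ from both sources,
 * so no preparatory move of op1 is needed. */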
5378static void ir_emit_binop_avx(ir_ctx *ctx, ir_ref def, ir_insn *insn)
5379{
5380	ir_backend_data *data = ctx->data;
5381	dasm_State **Dst = &data->dasm_state;
5382	ir_type type = insn->type;
5383	ir_ref op1 = insn->op1;
5384	ir_ref op2 = insn->op2;
5385	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
5386	ir_reg op1_reg = ctx->regs[def][1];
5387	ir_reg op2_reg = ctx->regs[def][2];
5388
5389	IR_ASSERT(def_reg != IR_REG_NONE && op1_reg != IR_REG_NONE);
5390
5391	if (IR_REG_SPILLED(op1_reg)) {
5392		op1_reg = IR_REG_NUM(op1_reg);
5393		ir_emit_load(ctx, type, op1_reg, op1);
5394	}
5395	if (op2_reg != IR_REG_NONE) {
5396		if (IR_REG_SPILLED(op2_reg)) {
5397			op2_reg = IR_REG_NUM(op2_reg);
5398			if (op1 != op2) {
5399				ir_emit_load(ctx, type, op2_reg, op2);
5400			}
5401		}
5402		switch (insn->op) {
5403			default:
5404				IR_ASSERT(0 && "NIY binary op");
5405			case IR_ADD:
5406				|	ASM_AVX_REG_REG_REG_OP vadds, type, def_reg, op1_reg, op2_reg
5407				break;
5408			case IR_SUB:
5409				|	ASM_AVX_REG_REG_REG_OP vsubs, type, def_reg, op1_reg, op2_reg
5410				break;
5411			case IR_MUL:
5412				|	ASM_AVX_REG_REG_REG_OP vmuls, type, def_reg, op1_reg, op2_reg
5413				break;
5414			case IR_DIV:
5415				|	ASM_AVX_REG_REG_REG_OP vdivs, type, def_reg, op1_reg, op2_reg
5416				break;
5417			case IR_MIN:
5418				|	ASM_AVX_REG_REG_REG_OP vmins, type, def_reg, op1_reg, op2_reg
5419				break;
5420			case IR_MAX:
5421				|	ASM_AVX_REG_REG_REG_OP vmaxs, type, def_reg, op1_reg, op2_reg
5422				break;
5423		}
5424	} else if (IR_IS_CONST_REF(op2)) {
5425		int label = ir_const_label(ctx, op2);
5426
5427		switch (insn->op) {
5428			default:
5429				IR_ASSERT(0 && "NIY binary op");
5430			case IR_ADD:
5431				|	ASM_AVX_REG_REG_TXT_OP vadds, type, def_reg, op1_reg, [=>label]
5432				break;
5433			case IR_SUB:
5434				|	ASM_AVX_REG_REG_TXT_OP vsubs, type, def_reg, op1_reg, [=>label]
5435				break;
5436			case IR_MUL:
5437				|	ASM_AVX_REG_REG_TXT_OP vmuls, type, def_reg, op1_reg, [=>label]
5438				break;
5439			case IR_DIV:
5440				|	ASM_AVX_REG_REG_TXT_OP vdivs, type, def_reg, op1_reg, [=>label]
5441				break;
5442			case IR_MIN:
5443				|	ASM_AVX_REG_REG_TXT_OP vmins, type, def_reg, op1_reg, [=>label]
5444				break;
5445			case IR_MAX:
5446				|	ASM_AVX_REG_REG_TXT_OP vmaxs, type, def_reg, op1_reg, [=>label]
5447				break;
5448		}
5449	} else {
5450		ir_mem mem;
5451
5452		if (ir_rule(ctx, op2) & IR_FUSED) {
5453			mem = ir_fuse_load(ctx, def, op2);
5454		} else {
5455			mem = ir_ref_spill_slot(ctx, op2);
5456		}
5457		switch (insn->op) {
5458			default:
5459				IR_ASSERT(0 && "NIY binary op");
5460			case IR_ADD:
5461				|	ASM_AVX_REG_REG_MEM_OP vadds, type, def_reg, op1_reg, mem
5462				break;
5463			case IR_SUB:
5464				|	ASM_AVX_REG_REG_MEM_OP vsubs, type, def_reg, op1_reg, mem
5465				break;
5466			case IR_MUL:
5467				|	ASM_AVX_REG_REG_MEM_OP vmuls, type, def_reg, op1_reg, mem
5468				break;
5469			case IR_DIV:
5470				|	ASM_AVX_REG_REG_MEM_OP vdivs, type, def_reg, op1_reg, mem
5471				break;
5472			case IR_MIN:
5473				|	ASM_AVX_REG_REG_MEM_OP vmins, type, def_reg, op1_reg, mem
5474				break;
5475			case IR_MAX:
5476				|	ASM_AVX_REG_REG_MEM_OP vmaxs, type, def_reg, op1_reg, mem
5477				break;
5478		}
5479	}
5480	if (IR_REG_SPILLED(ctx->regs[def][0])) {
5481		ir_emit_store(ctx, insn->type, def, def_reg);
5482	}
5483}
5484
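/* Emit the integer comparison itself. A comparison against the constant zero
 * is strength-reduced to "test reg, reg"; either operand may come from a
 * fused load or a spill slot. */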
5485static void ir_emit_cmp_int_common(ir_ctx *ctx, ir_type type, ir_ref root, ir_insn *insn, ir_reg op1_reg, ir_ref op1, ir_reg op2_reg, ir_ref op2)
5486{
5487	ir_backend_data *data = ctx->data;
5488	dasm_State **Dst = &data->dasm_state;
5489
5490	if (op1_reg != IR_REG_NONE) {
5491		if (op2_reg != IR_REG_NONE) {
5492			|	ASM_REG_REG_OP cmp, type, op1_reg, op2_reg
5493		} else if (IR_IS_CONST_REF(op2) && !IR_IS_SYM_CONST(ctx->ir_base[op2].op) && ctx->ir_base[op2].val.u64 == 0) {
5494			|	ASM_REG_REG_OP test, type, op1_reg, op1_reg
5495		} else if (IR_IS_CONST_REF(op2)) {
5496			int32_t val = ir_fuse_imm(ctx, op2);
5497			|	ASM_REG_IMM_OP cmp, type, op1_reg, val
5498		} else {
5499			ir_mem mem;
5500
5501			if (ir_rule(ctx, op2) & IR_FUSED) {
5502				mem = ir_fuse_load(ctx, root, op2);
5503			} else {
5504				mem = ir_ref_spill_slot(ctx, op2);
5505			}
5506			|	ASM_REG_MEM_OP cmp, type, op1_reg, mem
5507		}
5508	} else if (IR_IS_CONST_REF(op1)) {
5509		IR_ASSERT(0);
5510	} else {
5511		ir_mem mem;
5512
5513		if (ir_rule(ctx, op1) & IR_FUSED) {
5514			mem = ir_fuse_load(ctx, root, op1);
5515		} else {
5516			mem = ir_ref_spill_slot(ctx, op1);
5517		}
5518		if (op2_reg != IR_REG_NONE) {
5519			|	ASM_MEM_REG_OP cmp, type, mem, op2_reg
5520		} else {
5521			int32_t val = ir_fuse_imm(ctx, op2);
5522			|	ASM_MEM_IMM_OP cmp, type, mem, val
5523		}
5524	}
5525}
5526
5527static void ir_emit_cmp_int_common2(ir_ctx *ctx, ir_ref root, ir_ref ref, ir_insn *cmp_insn)
5528{
5529	ir_type type = ctx->ir_base[cmp_insn->op1].type;
5530	ir_ref op1 = cmp_insn->op1;
5531	ir_ref op2 = cmp_insn->op2;
5532	ir_reg op1_reg = ctx->regs[ref][1];
5533	ir_reg op2_reg = ctx->regs[ref][2];
5534
5535	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
5536		op1_reg = IR_REG_NUM(op1_reg);
5537		ir_emit_load(ctx, type, op1_reg, op1);
5538	}
5539	if (op2_reg != IR_REG_NONE && IR_REG_SPILLED(op2_reg)) {
5540		op2_reg = IR_REG_NUM(op2_reg);
5541		if (op1 != op2) {
5542			ir_emit_load(ctx, type, op2_reg, op2);
5543		}
5544	}
5545
5546	ir_emit_cmp_int_common(ctx, type, root, cmp_insn, op1_reg, op1, op2_reg, op2);
5547}
5548
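/* Materialize the current flags as a 0/1 byte with SETcc; signed predicates
 * map to l/ge/le/g, unsigned ones to b/ae/be/a. */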
5549static void _ir_emit_setcc_int(ir_ctx *ctx, uint8_t op, ir_reg def_reg)
5550{
5551	ir_backend_data *data = ctx->data;
5552	dasm_State **Dst = &data->dasm_state;
5553
5554	switch (op) {
5555		default:
5556			IR_ASSERT(0 && "NIY binary op");
5557		case IR_EQ:
5558			|	sete Rb(def_reg)
5559			break;
5560		case IR_NE:
5561			|	setne Rb(def_reg)
5562			break;
5563		case IR_LT:
5564			|	setl Rb(def_reg)
5565			break;
5566		case IR_GE:
5567			|	setge Rb(def_reg)
5568			break;
5569		case IR_LE:
5570			|	setle Rb(def_reg)
5571			break;
5572		case IR_GT:
5573			|	setg Rb(def_reg)
5574			break;
5575		case IR_ULT:
5576			|	setb Rb(def_reg)
5577			break;
5578		case IR_UGE:
5579			|	setae Rb(def_reg)
5580			break;
5581		case IR_ULE:
5582			|	setbe Rb(def_reg)
5583			break;
5584		case IR_UGT:
5585			|	seta Rb(def_reg)
5586			break;
5587	}
5588}
5589
5590static void _ir_emit_setcc_int_mem(ir_ctx *ctx, uint8_t op, ir_mem mem)
5591{
5592	ir_backend_data *data = ctx->data;
5593	dasm_State **Dst = &data->dasm_state;
5594
5596	switch (op) {
5597		default:
5598			IR_ASSERT(0 && "NIY binary op");
5599		case IR_EQ:
5600			|	ASM_TMEM_OP sete, byte, mem
5601			break;
5602		case IR_NE:
5603			|	ASM_TMEM_OP setne, byte, mem
5604			break;
5605		case IR_LT:
5606			|	ASM_TMEM_OP setl, byte, mem
5607			break;
5608		case IR_GE:
5609			|	ASM_TMEM_OP setge, byte, mem
5610			break;
5611		case IR_LE:
5612			|	ASM_TMEM_OP setle, byte, mem
5613			break;
5614		case IR_GT:
5615			|	ASM_TMEM_OP setg, byte, mem
5616			break;
5617		case IR_ULT:
5618			|	ASM_TMEM_OP setb, byte, mem
5619			break;
5620		case IR_UGE:
5621			|	ASM_TMEM_OP setae, byte, mem
5622			break;
5623		case IR_ULE:
5624			|	ASM_TMEM_OP setbe, byte, mem
5625			break;
5626		case IR_UGT:
5627			|	ASM_TMEM_OP seta, byte, mem
5628			break;
5629	}
5630}
5631
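/* Integer comparison producing a value. Unsigned comparisons against zero are
 * folded first: ULT is always false, UGE is always true, and ULE/UGT degrade
 * to EQ/NE. */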
5632static void ir_emit_cmp_int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
5633{
5634	ir_backend_data *data = ctx->data;
5635	dasm_State **Dst = &data->dasm_state;
5636	ir_type type = ctx->ir_base[insn->op1].type;
5637	ir_op op = insn->op;
5638	ir_ref op1 = insn->op1;
5639	ir_ref op2 = insn->op2;
5640	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
5641	ir_reg op1_reg = ctx->regs[def][1];
5642	ir_reg op2_reg = ctx->regs[def][2];
5643
5644	IR_ASSERT(def_reg != IR_REG_NONE);
5645	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
5646		op1_reg = IR_REG_NUM(op1_reg);
5647		ir_emit_load(ctx, type, op1_reg, op1);
5648	}
5649	if (op2_reg != IR_REG_NONE && IR_REG_SPILLED(op2_reg)) {
5650		op2_reg = IR_REG_NUM(op2_reg);
5651		if (op1 != op2) {
5652			ir_emit_load(ctx, type, op2_reg, op2);
5653		}
5654	}
5655	if (IR_IS_CONST_REF(op2) && !IR_IS_SYM_CONST(ctx->ir_base[op2].op) && ctx->ir_base[op2].val.u64 == 0) {
5656		if (op == IR_ULT) {
5657			/* always false */
5658			|	xor Ra(def_reg), Ra(def_reg)
5659			if (IR_REG_SPILLED(ctx->regs[def][0])) {
5660				ir_emit_store(ctx, insn->type, def, def_reg);
5661			}
5662			return;
5663		} else if (op == IR_UGE) {
5664			/* always true */
5665			|	ASM_REG_IMM_OP mov, insn->type, def_reg, 1
5666			if (IR_REG_SPILLED(ctx->regs[def][0])) {
5667				ir_emit_store(ctx, insn->type, def, def_reg);
5668			}
5669			return;
5670		} else if (op == IR_ULE) {
5671			op = IR_EQ;
5672		} else if (op == IR_UGT) {
5673			op = IR_NE;
5674		}
5675	}
5676	ir_emit_cmp_int_common(ctx, type, def, insn, op1_reg, op1, op2_reg, op2);
5677	_ir_emit_setcc_int(ctx, op, def_reg);
5678	if (IR_REG_SPILLED(ctx->regs[def][0])) {
5679		ir_emit_store(ctx, insn->type, def, def_reg);
5680	}
5681}
5682
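/* Emit TEST for a fused AND feeding a condition. For EQ/NE, masks of 0xff,
 * 0xff00, 0xffff and -1 are narrowed to byte/word/dword register self-tests
 * (the 0xff00 case uses ah/bh/ch/dh and is therefore limited to RAX..RDX). */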
5683static void ir_emit_test_int_common(ir_ctx *ctx, ir_ref root, ir_ref ref, ir_op op)
5684{
5685	ir_backend_data *data = ctx->data;
5686	dasm_State **Dst = &data->dasm_state;
5687	ir_insn *binop_insn = &ctx->ir_base[ref];
5688	ir_type type = binop_insn->type;
5689	ir_ref op1 = binop_insn->op1;
5690	ir_ref op2 = binop_insn->op2;
5691	ir_reg op1_reg = ctx->regs[ref][1];
5692	ir_reg op2_reg = ctx->regs[ref][2];
5693
5694	IR_ASSERT(binop_insn->op == IR_AND);
5695	if (op1_reg != IR_REG_NONE) {
5696		if (IR_REG_SPILLED(op1_reg)) {
5697			op1_reg = IR_REG_NUM(op1_reg);
5698			ir_emit_load(ctx, type, op1_reg, op1);
5699		}
5700		if (op2_reg != IR_REG_NONE) {
5701			if (IR_REG_SPILLED(op2_reg)) {
5702				op2_reg = IR_REG_NUM(op2_reg);
5703				if (op1 != op2) {
5704					ir_emit_load(ctx, type, op2_reg, op2);
5705				}
5706			}
5707			|	ASM_REG_REG_OP test, type, op1_reg, op2_reg
5708		} else if (IR_IS_CONST_REF(op2)) {
5709			int32_t val = ir_fuse_imm(ctx, op2);
5710
5711			if ((op == IR_EQ || op == IR_NE) && val == 0xff && (sizeof(void*) == 8 || op1_reg <= IR_REG_R3)) {
5712				|	test Rb(op1_reg), Rb(op1_reg)
5713			} else if ((op == IR_EQ || op == IR_NE) && val == 0xff00 && op1_reg <= IR_REG_R3) {
5714				if (op1_reg == IR_REG_RAX) {
5715					|	test ah, ah
5716				} else if (op1_reg == IR_REG_RBX) {
5717					|	test bh, bh
5718				} else if (op1_reg == IR_REG_RCX) {
5719					|	test ch, ch
5720				} else if (op1_reg == IR_REG_RDX) {
5721					|	test dh, dh
5722				} else {
5723					IR_ASSERT(0);
5724				}
5725			} else if ((op == IR_EQ || op == IR_NE) && val == 0xffff) {
5726				|	test Rw(op1_reg), Rw(op1_reg)
5727			} else if ((op == IR_EQ || op == IR_NE) && val == -1) {
5728				|	test Rd(op1_reg), Rd(op1_reg)
5729			} else {
5730				|	ASM_REG_IMM_OP test, type, op1_reg, val
5731			}
5732		} else {
5733			ir_mem mem;
5734
5735			if (ir_rule(ctx, op2) & IR_FUSED) {
5736				mem = ir_fuse_load(ctx, root, op2);
5737			} else {
5738				mem = ir_ref_spill_slot(ctx, op2);
5739			}
5740			|	ASM_REG_MEM_OP test, type, op1_reg, mem
5741		}
5742	} else if (IR_IS_CONST_REF(op1)) {
5743		IR_ASSERT(0);
5744	} else {
5745		ir_mem mem;
5746
5747		if (ir_rule(ctx, op1) & IR_FUSED) {
5748			mem = ir_fuse_load(ctx, root, op1);
5749		} else {
5750			mem = ir_ref_spill_slot(ctx, op1);
5751		}
5752		if (op2_reg != IR_REG_NONE) {
5753			if (IR_REG_SPILLED(op2_reg)) {
5754				op2_reg = IR_REG_NUM(op2_reg);
5755				if (op1 != op2) {
5756					ir_emit_load(ctx, type, op2_reg, op2);
5757				}
5758			}
5759			|	ASM_MEM_REG_OP test, type, mem, op2_reg
5760		} else {
5761			IR_ASSERT(!IR_IS_CONST_REF(op1));
5762			int32_t val = ir_fuse_imm(ctx, op2);
5763			|	ASM_MEM_IMM_OP test, type, mem, val
5764		}
5765	}
5766}
5767
5768static void ir_emit_testcc_int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
5769{
5770	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
5771
5772	IR_ASSERT(def_reg != IR_REG_NONE);
5773	ir_emit_test_int_common(ctx, def, insn->op1, insn->op);
5774	_ir_emit_setcc_int(ctx, insn->op, def_reg);
5775	if (IR_REG_SPILLED(ctx->regs[def][0])) {
5776		ir_emit_store(ctx, insn->type, def, def_reg);
5777	}
5778}
5779
5780static void ir_emit_setcc_int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
5781{
5782	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
5783
5784	IR_ASSERT(def_reg != IR_REG_NONE);
5785	_ir_emit_setcc_int(ctx, insn->op, def_reg);
5786	if (IR_REG_SPILLED(ctx->regs[def][0])) {
5787		ir_emit_store(ctx, insn->type, def, def_reg);
5788	}
5789}
5790
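/* Emit UCOMISS/UCOMISD and return the predicate to be tested. For the
 * symmetric EQ/NE the operands may be swapped so that op1 ends up in a
 * register; unordered results (PF set by NaN) are left to the callers. */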
5791static ir_op ir_emit_cmp_fp_common(ir_ctx *ctx, ir_ref root, ir_ref cmp_ref, ir_insn *cmp_insn)
5792{
5793	ir_backend_data *data = ctx->data;
5794	dasm_State **Dst = &data->dasm_state;
5795	ir_type type = ctx->ir_base[cmp_insn->op1].type;
5796	ir_op op = cmp_insn->op;
5797	ir_ref op1, op2;
5798	ir_reg op1_reg, op2_reg;
5799
5800	op1 = cmp_insn->op1;
5801	op2 = cmp_insn->op2;
5802	op1_reg = ctx->regs[cmp_ref][1];
5803	op2_reg = ctx->regs[cmp_ref][2];
5804
5805	if (op1_reg == IR_REG_NONE && op2_reg != IR_REG_NONE && (op == IR_EQ || op == IR_NE)) {
5806		ir_reg tmp_reg;
5807
5808		SWAP_REFS(op1, op2);
5809		tmp_reg = op1_reg;
5810		op1_reg = op2_reg;
5811		op2_reg = tmp_reg;
5812	}
5813
5815	IR_ASSERT(op1_reg != IR_REG_NONE);
5816	if (IR_REG_SPILLED(op1_reg)) {
5817		op1_reg = IR_REG_NUM(op1_reg);
5818		ir_emit_load(ctx, type, op1_reg, op1);
5819	}
5820	if (op2_reg != IR_REG_NONE) {
5821		if (IR_REG_SPILLED(op2_reg)) {
5822			op2_reg = IR_REG_NUM(op2_reg);
5823			if (op1 != op2) {
5824				ir_emit_load(ctx, type, op2_reg, op2);
5825			}
5826		}
5827		|	ASM_FP_REG_REG_OP ucomis, type, op1_reg, op2_reg
5828	} else if (IR_IS_CONST_REF(op2)) {
5829		int label = ir_const_label(ctx, op2);
5830
5831		|	ASM_FP_REG_TXT_OP ucomis, type, op1_reg, [=>label]
5832	} else {
5833		ir_mem mem;
5834
5835		if (ir_rule(ctx, op2) & IR_FUSED) {
5836			mem = ir_fuse_load(ctx, root, op2);
5837		} else {
5838			mem = ir_ref_spill_slot(ctx, op2);
5839		}
5840		|	ASM_FP_REG_MEM_OP ucomis, type, op1_reg, mem
5841	}
5842	return op;
5843}
5844
5845static void ir_emit_cmp_fp(ir_ctx *ctx, ir_ref def, ir_insn *insn)
5846{
5847	ir_backend_data *data = ctx->data;
5848	dasm_State **Dst = &data->dasm_state;
5849	ir_op op = ir_emit_cmp_fp_common(ctx, def, def, insn);
5850	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
5851	ir_reg tmp_reg = ctx->regs[def][3];
5852
5853	IR_ASSERT(def_reg != IR_REG_NONE);
5854	switch (op) {
5855		default:
5856			IR_ASSERT(0 && "NIY binary op");
5857		case IR_EQ:
5858			|	setnp Rb(def_reg)
5859			|	mov Rd(tmp_reg), 0
5860			|	cmovne Rd(def_reg), Rd(tmp_reg)
5861			break;
5862		case IR_NE:
5863			|	setp Rb(def_reg)
5864			|	mov Rd(tmp_reg), 1
5865			|	cmovne Rd(def_reg), Rd(tmp_reg)
5866			break;
5867		case IR_LT:
5868			|	setnp Rb(def_reg)
5869			|	mov Rd(tmp_reg), 0
5870			|	cmovae Rd(def_reg), Rd(tmp_reg)
5871			break;
5872		case IR_GE:
5873			|	setae Rb(def_reg)
5874			break;
5875		case IR_LE:
5876			|	setnp Rb(def_reg)
5877			|	mov Rd(tmp_reg), 0
5878			|	cmova Rd(def_reg), Rd(tmp_reg)
5879			break;
5880		case IR_GT:
5881			|	seta Rb(def_reg)
5882			break;
5883		case IR_ULT:
5884			|	setb Rb(def_reg)
5885			break;
5886		case IR_UGE:
5887			|	setp Rb(def_reg)
5888			|	mov Rd(tmp_reg), 1
5889			|	cmovae Rd(def_reg), Rd(tmp_reg)
5890			break;
5891		case IR_ULE:
5892			|	setbe Rb(def_reg)
5893			break;
5894		case IR_UGT:
5895			|	setp Rb(def_reg)
5896			|	mov Rd(tmp_reg), 1
5897			|	cmova Rd(def_reg), Rd(tmp_reg)
5898			break;
5899	}
5900	if (IR_REG_SPILLED(ctx->regs[def][0])) {
5901		ir_emit_store(ctx, insn->type, def, def_reg);
5902	}
5903}
5904
5905static void ir_emit_jmp_true(ir_ctx *ctx, uint32_t b, ir_ref def, uint32_t next_block)
5906{
5907	uint32_t true_block, false_block;
5908	ir_backend_data *data = ctx->data;
5909	dasm_State **Dst = &data->dasm_state;
5910
5911	ir_get_true_false_blocks(ctx, b, &true_block, &false_block);
5912	if (true_block != next_block) {
5913		|	jmp =>true_block
5914	}
5915}
5916
5917static void ir_emit_jmp_false(ir_ctx *ctx, uint32_t b, ir_ref def, uint32_t next_block)
5918{
5919	uint32_t true_block, false_block;
5920	ir_backend_data *data = ctx->data;
5921	dasm_State **Dst = &data->dasm_state;
5922
5923	ir_get_true_false_blocks(ctx, b, &true_block, &false_block);
5924	if (false_block != next_block) {
5925		|	jmp =>false_block
5926	}
5927}
5928
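/* Emit the conditional branch(es) for a comparison. If the true target is the
 * fall-through block, the predicate is negated (op ^= 1, or op ^= 5 for the
 * parity-sensitive FP predicates, relying on the pairing of the IR opcode
 * numbering) so that a single Jcc suffices; the FP cases add JP tests to send
 * unordered (NaN) results to the proper target. */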
5929static void ir_emit_jcc(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn, uint32_t next_block, uint8_t op, bool int_cmp)
5930{
5931	uint32_t true_block, false_block;
5932	ir_backend_data *data = ctx->data;
5933	dasm_State **Dst = &data->dasm_state;
5934
5935	ir_get_true_false_blocks(ctx, b, &true_block, &false_block);
5936	if (true_block == next_block) {
5937		/* swap to avoid unconditional JMP */
5938		if (int_cmp || op == IR_EQ || op == IR_NE) {
5939			op ^= 1; // reverse
5940		} else {
5941			op ^= 5; // reverse
5942		}
5943		true_block = false_block;
5944		false_block = 0;
5945	} else if (false_block == next_block) {
5946		false_block = 0;
5947	}
5948
5949	if (int_cmp) {
5950		switch (op) {
5951			default:
5952				IR_ASSERT(0 && "NIY binary op");
5953			case IR_EQ:
5954				|	je =>true_block
5955				break;
5956			case IR_NE:
5957				|	jne =>true_block
5958				break;
5959			case IR_LT:
5960				|	jl =>true_block
5961				break;
5962			case IR_GE:
5963				|	jge =>true_block
5964				break;
5965			case IR_LE:
5966				|	jle =>true_block
5967				break;
5968			case IR_GT:
5969				|	jg =>true_block
5970				break;
5971			case IR_ULT:
5972				|	jb =>true_block
5973				break;
5974			case IR_UGE:
5975				|	jae =>true_block
5976				break;
5977			case IR_ULE:
5978				|	jbe =>true_block
5979				break;
5980			case IR_UGT:
5981				|	ja =>true_block
5982				break;
5983		}
5984	} else {
5985		switch (op) {
5986			default:
5987				IR_ASSERT(0 && "NIY binary op");
5988			case IR_EQ:
5989				if (!false_block) {
5990					|	jp >1
5991					|	je =>true_block
5992					|1:
5993				} else {
5994					|	jp =>false_block
5995					|	je =>true_block
5996				}
5997				break;
5998			case IR_NE:
5999				|	jne =>true_block
6000				|	jp =>true_block
6001				break;
6002			case IR_LT:
6003				if (!false_block) {
6004					|	jp >1
6005					|	jb =>true_block
6006					|1:
6007				} else {
6008					|	jp =>false_block
6009					|	jb =>true_block
6010				}
6011				break;
6012			case IR_GE:
6013				|	jae =>true_block
6014				break;
6015			case IR_LE:
6016				if (!false_block) {
6017					|	jp >1
6018					|	jbe =>true_block
6019					|1:
6020				} else {
6021					|	jp =>false_block
6022					|	jbe =>true_block
6023				}
6024				break;
6025			case IR_GT:
6026				|	ja =>true_block
6027				break;
6028			case IR_ULT:
6029				|	jb =>true_block
6030				break;
6031			case IR_UGE:
6032				|	jp =>true_block
6033				|	jae =>true_block
6034				break;
6035			case IR_ULE:
6036				|	jbe =>true_block
6037				break;
6038			case IR_UGT:
6039				|	jp =>true_block
6040				|	ja =>true_block
6041				break;
6042		}
6043	}
6044	if (false_block) {
6045		|	jmp =>false_block
6046	}
6047}
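/* Fused compare-and-branch. If the guarding IF follows another
 * CMP_AND_BRANCH_INT over the very same operands, the flags are still valid
 * and the CMP is skipped. */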
6048
6049static void ir_emit_cmp_and_branch_int(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn, uint32_t next_block)
6050{
6051	ir_insn *cmp_insn = &ctx->ir_base[insn->op2];
6052	ir_op op = cmp_insn->op;
6053	ir_type type = ctx->ir_base[cmp_insn->op1].type;
6054	ir_ref op1 = cmp_insn->op1;
6055	ir_ref op2 = cmp_insn->op2;
6056	ir_reg op1_reg = ctx->regs[insn->op2][1];
6057	ir_reg op2_reg = ctx->regs[insn->op2][2];
6058
6059	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
6060		op1_reg = IR_REG_NUM(op1_reg);
6061		ir_emit_load(ctx, type, op1_reg, op1);
6062	}
6063	if (op2_reg != IR_REG_NONE && IR_REG_SPILLED(op2_reg)) {
6064		op2_reg = IR_REG_NUM(op2_reg);
6065		if (op1 != op2) {
6066			ir_emit_load(ctx, type, op2_reg, op2);
6067		}
6068	}
6069	if (IR_IS_CONST_REF(op2) && !IR_IS_SYM_CONST(ctx->ir_base[op2].op) && ctx->ir_base[op2].val.u64 == 0) {
6070		if (op == IR_ULT) {
6071			/* always false */
6072			ir_emit_jmp_false(ctx, b, def, next_block);
6073			return;
6074		} else if (op == IR_UGE) {
6075			/* always true */
6076			ir_emit_jmp_true(ctx, b, def, next_block);
6077			return;
6078		} else if (op == IR_ULE) {
6079			op = IR_EQ;
6080		} else if (op == IR_UGT) {
6081			op = IR_NE;
6082		}
6083	}
6084
6085	bool same_comparison = false;
6086	ir_insn *prev_insn = &ctx->ir_base[insn->op1];
6087	if (prev_insn->op == IR_IF_TRUE || prev_insn->op == IR_IF_FALSE) {
6088		if (ir_rule(ctx, prev_insn->op1) == IR_CMP_AND_BRANCH_INT) {
6089			prev_insn = &ctx->ir_base[prev_insn->op1];
6090			prev_insn = &ctx->ir_base[prev_insn->op2];
6091			if (prev_insn->op1 == cmp_insn->op1 && prev_insn->op2 == cmp_insn->op2) {
6092				same_comparison = true;
6093			}
6094		}
6095	}
6096	if (!same_comparison) {
6097		ir_emit_cmp_int_common(ctx, type, def, cmp_insn, op1_reg, op1, op2_reg, op2);
6098	}
6099	ir_emit_jcc(ctx, b, def, insn, next_block, op, 1);
6100}
6101
6102static void ir_emit_test_and_branch_int(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn, uint32_t next_block)
6103{
6104	ir_ref op2 = insn->op2;
6105	ir_op op = ctx->ir_base[op2].op;
6106
6107	if (op >= IR_EQ && op <= IR_UGT) {
6108		op2 = ctx->ir_base[op2].op1;
6109	} else {
6110		IR_ASSERT(op == IR_AND);
6111		op = IR_NE;
6112	}
6113
6114	ir_emit_test_int_common(ctx, def, op2, op);
6115	ir_emit_jcc(ctx, b, def, insn, next_block, op, 1);
6116}
6117
6118static void ir_emit_cmp_and_branch_fp(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn, uint32_t next_block)
6119{
6120	ir_op op = ir_emit_cmp_fp_common(ctx, def, insn->op2, &ctx->ir_base[insn->op2]);
6121	ir_emit_jcc(ctx, b, def, insn, next_block, op, 0);
6122}
6123
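/* Branch on a plain integer value: constant conditions fold to an
 * unconditional jump, the address of a static alloca is known to be non-zero,
 * and anything else is tested against zero and branched on with JNE. */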
6124static void ir_emit_if_int(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn, uint32_t next_block)
6125{
6126	ir_type type = ctx->ir_base[insn->op2].type;
6127	ir_reg op2_reg = ctx->regs[def][2];
6128	ir_backend_data *data = ctx->data;
6129	dasm_State **Dst = &data->dasm_state;
6130
6131	if (op2_reg != IR_REG_NONE) {
6132		if (IR_REG_SPILLED(op2_reg)) {
6133			op2_reg = IR_REG_NUM(op2_reg);
6134			ir_emit_load(ctx, type, op2_reg, insn->op2);
6135		}
6136		|	ASM_REG_REG_OP test, type, op2_reg, op2_reg
6137	} else if (IR_IS_CONST_REF(insn->op2)) {
6138		uint32_t true_block, false_block;
6139
6140		ir_get_true_false_blocks(ctx, b, &true_block, &false_block);
6141		if (ir_const_is_true(&ctx->ir_base[insn->op2])) {
6142			if (true_block != next_block) {
6143				|	jmp =>true_block
6144			}
6145		} else {
6146			if (false_block != next_block) {
6147				|	jmp =>false_block
6148			}
6149		}
6150		return;
6151	} else if (ir_rule(ctx, insn->op2) == IR_STATIC_ALLOCA) {
6152		uint32_t true_block, false_block;
6153
6154		ir_get_true_false_blocks(ctx, b, &true_block, &false_block);
6155		if (true_block != next_block) {
6156			|	jmp =>true_block
6157		}
6158		return;
6159	} else {
6160		ir_mem mem;
6161
6162		if (ir_rule(ctx, insn->op2) & IR_FUSED) {
6163			mem = ir_fuse_load(ctx, def, insn->op2);
6164		} else {
6165			mem = ir_ref_spill_slot(ctx, insn->op2);
6166		}
6167		|	ASM_MEM_IMM_OP cmp, type, mem, 0
6168	}
6169	ir_emit_jcc(ctx, b, def, insn, next_block, IR_NE, 1);
6170}
6171
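/* COND (op1 ? op2 : op3). For integer results a CMOVcc is used where
 * possible; constant arms are loaded with MOV rather than XOR+MOV so the
 * flags of the preceding TEST/UCOMIS survive. FP conditions and FP results
 * fall back to explicit compare-and-branch sequences. */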
6172static void ir_emit_cond(ir_ctx *ctx, ir_ref def, ir_insn *insn)
6173{
6174	ir_backend_data *data = ctx->data;
6175	dasm_State **Dst = &data->dasm_state;
6176	ir_type type = insn->type;
6177	ir_ref op1 = insn->op1;
6178	ir_ref op2 = insn->op2;
6179	ir_ref op3 = insn->op3;
6180	ir_type op1_type = ctx->ir_base[op1].type;
6181	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
6182	ir_reg op1_reg = ctx->regs[def][1];
6183	ir_reg op2_reg = ctx->regs[def][2];
6184	ir_reg op3_reg = ctx->regs[def][3];
6185
6186	IR_ASSERT(def_reg != IR_REG_NONE);
6187
6188	if (op2_reg != IR_REG_NONE && IR_REG_SPILLED(op2_reg)) {
6189		op2_reg = IR_REG_NUM(op2_reg);
6190		ir_emit_load(ctx, type, op2_reg, op2);
6191		if (op1 == op2) {
6192			op1_reg = op2_reg;
6193		}
6194		if (op3 == op2) {
6195			op3_reg = op2_reg;
6196		}
6197	}
6198	if (op3_reg != IR_REG_NONE && op3 != op2 && IR_REG_SPILLED(op3_reg)) {
6199		op3_reg = IR_REG_NUM(op3_reg);
6200		ir_emit_load(ctx, type, op3_reg, op3);
6201		if (op1 == op3) {
6202			op1_reg = op3_reg;
6203		}
6204	}
6205	if (op1_reg != IR_REG_NONE && op1 != op2 && op1 != op3 && IR_REG_SPILLED(op1_reg)) {
6206		op1_reg = IR_REG_NUM(op1_reg);
6207		ir_emit_load(ctx, op1_type, op1_reg, op1);
6208	}
6209
6210	if (IR_IS_TYPE_INT(op1_type)) {
6211		if (op1_reg != IR_REG_NONE) {
6212			|	ASM_REG_REG_OP test, op1_type, op1_reg, op1_reg
6213		} else {
6214			ir_mem mem = ir_ref_spill_slot(ctx, op1);
6215
6216			|	ASM_MEM_IMM_OP cmp, op1_type, mem, 0
6217		}
6218		if (IR_IS_TYPE_INT(type)) {
6219			IR_ASSERT(op2_reg != IR_REG_NONE || op3_reg != IR_REG_NONE);
6220			if (op3_reg != IR_REG_NONE) {
6221				if (op3_reg == def_reg) {
6222					IR_ASSERT(op2_reg != IR_REG_NONE);
6223					|	ASM_REG_REG_OP2 cmovne, type, def_reg, op2_reg
6224				} else {
6225					if (op2_reg != IR_REG_NONE) {
6226						if (def_reg != op2_reg) {
6227							if (IR_IS_TYPE_INT(type)) {
6228								ir_emit_mov(ctx, type, def_reg, op2_reg);
6229							} else {
6230								ir_emit_fp_mov(ctx, type, def_reg, op2_reg);
6231							}
6232						}
6233					} else if (IR_IS_CONST_REF(op2) && !IR_IS_SYM_CONST(ctx->ir_base[op2].op)) {
6234					/* load the constant with MOV, not XOR, to avoid clobbering the flags */
6235						ir_emit_mov_imm_int(ctx, type, def_reg, ctx->ir_base[op2].val.i64);
6236					} else {
6237						ir_emit_load_ex(ctx, type, def_reg, op2, def);
6238					}
6239					|	ASM_REG_REG_OP2 cmove, type, def_reg, op3_reg
6240				}
6241			} else {
6242				IR_ASSERT(op2_reg != IR_REG_NONE && op2_reg != def_reg);
6243				if (IR_IS_CONST_REF(op3) && !IR_IS_SYM_CONST(ctx->ir_base[op3].op)) {
6244				/* load the constant with MOV, not XOR, to avoid clobbering the flags */
6245					ir_emit_mov_imm_int(ctx, type, def_reg, ctx->ir_base[op3].val.i64);
6246				} else {
6247					ir_emit_load_ex(ctx, type, def_reg, op3, def);
6248				}
6249				|	ASM_REG_REG_OP2 cmovne, type, def_reg, op2_reg
6250			}
6251
6252			if (IR_REG_SPILLED(ctx->regs[def][0])) {
6253				ir_emit_store(ctx, type, def, def_reg);
6254			}
6255			return;
6256		}
6257		|	je >2
6258	} else {
6259		if (!data->double_zero_const) {
6260			data->double_zero_const = 1;
6261			ir_rodata(ctx);
6262			|.align 16
6263			|->double_zero_const:
6264			|.dword 0, 0
6265			|.code
6266		}
6267		|	ASM_FP_REG_TXT_OP ucomis, op1_type, op1_reg, [->double_zero_const]
6268		|	jp >1
6269		|	je >2
6270		|1:
6271	}
6272
6273	if (op2_reg != IR_REG_NONE) {
6274		if (def_reg != op2_reg) {
6275			if (IR_IS_TYPE_INT(type)) {
6276				ir_emit_mov(ctx, type, def_reg, op2_reg);
6277			} else {
6278				ir_emit_fp_mov(ctx, type, def_reg, op2_reg);
6279			}
6280		}
6281	} else {
6282		ir_emit_load_ex(ctx, type, def_reg, op2, def);
6283	}
6284	|	jmp >3
6285	|2:
6286	if (op3_reg != IR_REG_NONE) {
6287		if (def_reg != op3_reg) {
6288			if (IR_IS_TYPE_INT(type)) {
6289				ir_emit_mov(ctx, type, def_reg, op3_reg);
6290			} else {
6291				ir_emit_fp_mov(ctx, type, def_reg, op3_reg);
6292			}
6293		}
6294	} else {
6295		ir_emit_load_ex(ctx, type, def_reg, op3, def);
6296	}
6297	|3:
6298
6299	if (IR_REG_SPILLED(ctx->regs[def][0])) {
6300		ir_emit_store(ctx, type, def, def_reg);
6301	}
6302}
6303
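/* COND fused with an integer comparison: the result register is preloaded
 * with one arm and the other arm is moved in by a CMOVcc on the complementary
 * condition; op ^= 1 compensates when the arms have to be exchanged. */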
6304static void ir_emit_cond_cmp_int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
6305{
6306	ir_backend_data *data = ctx->data;
6307	dasm_State **Dst = &data->dasm_state;
6308	ir_type type = insn->type;
6309	ir_ref op2 = insn->op2;
6310	ir_ref op3 = insn->op3;
6311	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
6312	ir_reg op2_reg = ctx->regs[def][2];
6313	ir_reg op3_reg = ctx->regs[def][3];
6314	ir_op op;
6315
6316	if (op2_reg != IR_REG_NONE && IR_REG_SPILLED(op2_reg)) {
6317		op2_reg = IR_REG_NUM(op2_reg);
6318		ir_emit_load(ctx, type, op2_reg, op2);
6319		if (op3 == op2) {
6320			op3_reg = op2_reg;
6321		}
6322	}
6323	if (op3_reg != IR_REG_NONE && op3 != op2 && IR_REG_SPILLED(op3_reg)) {
6324		op3_reg = IR_REG_NUM(op3_reg);
6325		ir_emit_load(ctx, type, op3_reg, op3);
6326	}
6327
6328	ir_emit_cmp_int_common2(ctx, def, insn->op1, &ctx->ir_base[insn->op1]);
6329	op = ctx->ir_base[insn->op1].op;
6330
6331	if (IR_IS_TYPE_INT(type)) {
6332		if (op3_reg != IR_REG_NONE) {
6333			if (op3_reg == def_reg) {
6334				IR_ASSERT(op2_reg != IR_REG_NONE);
6335				op3_reg = op2_reg;
6336				op ^= 1; // reverse
6337			} else {
6338				if (op2_reg != IR_REG_NONE) {
6339					if (def_reg != op2_reg) {
6340						/* type is known to be integer on this path */
6341						ir_emit_mov(ctx, type, def_reg, op2_reg);
6345					}
6346				} else if (IR_IS_CONST_REF(op2) && !IR_IS_SYM_CONST(ctx->ir_base[op2].op)) {
6347					/* load the constant with MOV, not XOR, to avoid clobbering the flags */
6348					ir_emit_mov_imm_int(ctx, type, def_reg, ctx->ir_base[op2].val.i64);
6349				} else {
6350					ir_emit_load_ex(ctx, type, def_reg, op2, def);
6351				}
6352			}
6353		} else {
6354			IR_ASSERT(op2_reg != IR_REG_NONE && op2_reg != def_reg);
6355			if (IR_IS_CONST_REF(op3) && !IR_IS_SYM_CONST(ctx->ir_base[op3].op)) {
6356				/* load the constant with MOV, not XOR, to avoid clobbering the flags */
6357				ir_emit_mov_imm_int(ctx, type, def_reg, ctx->ir_base[op3].val.i64);
6358			} else {
6359				ir_emit_load_ex(ctx, type, def_reg, op3, def);
6360			}
6361			op3_reg = op2_reg;
6362			op ^= 1; // reverse
6363		}
6364
6365		switch (op) {
6366			default:
6367				IR_ASSERT(0 && "NIY binary op");
6368			case IR_EQ:
6369				|	ASM_REG_REG_OP2 cmovne, type, def_reg, op3_reg
6370				break;
6371			case IR_NE:
6372				|	ASM_REG_REG_OP2 cmove, type, def_reg, op3_reg
6373				break;
6374			case IR_LT:
6375				|	ASM_REG_REG_OP2 cmovge, type, def_reg, op3_reg
6376				break;
6377			case IR_GE:
6378				|	ASM_REG_REG_OP2 cmovl, type, def_reg, op3_reg
6379				break;
6380			case IR_LE:
6381				|	ASM_REG_REG_OP2 cmovg, type, def_reg, op3_reg
6382				break;
6383			case IR_GT:
6384				|	ASM_REG_REG_OP2 cmovle, type, def_reg, op3_reg
6385				break;
6386			case IR_ULT:
6387				|	ASM_REG_REG_OP2 cmovae, type, def_reg, op3_reg
6388				break;
6389			case IR_UGE:
6390				|	ASM_REG_REG_OP2 cmovb, type, def_reg, op3_reg
6391				break;
6392			case IR_ULE:
6393				|	ASM_REG_REG_OP2 cmova, type, def_reg, op3_reg
6394				break;
6395			case IR_UGT:
6396				|	ASM_REG_REG_OP2 cmovbe, type, def_reg, op3_reg
6397				break;
6398		}
6399	} else {
6400		switch (op) {
6401			default:
6402				IR_ASSERT(0 && "NIY binary op");
6403			case IR_EQ:
6404				|	jne >2
6405				break;
6406			case IR_NE:
6407				|	je >2
6408				break;
6409			case IR_LT:
6410				|	jge >2
6411				break;
6412			case IR_GE:
6413				|	jl >2
6414				break;
6415			case IR_LE:
6416				|	jg >2
6417				break;
6418			case IR_GT:
6419				|	jle >2
6420				break;
6421			case IR_ULT:
6422				|	jae >2
6423				break;
6424			case IR_UGE:
6425				|	jb >2
6426				break;
6427			case IR_ULE:
6428				|	ja >2
6429				break;
6430			case IR_UGT:
6431				|	jbe >2
6432				break;
6433		}
6434		|1:
6435
6436		if (op2_reg != IR_REG_NONE) {
6437			if (def_reg != op2_reg) {
6438				if (IR_IS_TYPE_INT(type)) {
6439					ir_emit_mov(ctx, type, def_reg, op2_reg);
6440				} else {
6441					ir_emit_fp_mov(ctx, type, def_reg, op2_reg);
6442				}
6443			}
6444		} else {
6445			ir_emit_load_ex(ctx, type, def_reg, op2, def);
6446		}
6447		|	jmp >3
6448		|2:
6449		if (op3_reg != IR_REG_NONE) {
6450			if (def_reg != op3_reg) {
6451				if (IR_IS_TYPE_INT(type)) {
6452					ir_emit_mov(ctx, type, def_reg, op3_reg);
6453				} else {
6454					ir_emit_fp_mov(ctx, type, def_reg, op3_reg);
6455				}
6456			}
6457		} else {
6458			ir_emit_load_ex(ctx, type, def_reg, op3, def);
6459		}
6460		|3:
6461	}
6462
6463	if (IR_REG_SPILLED(ctx->regs[def][0])) {
6464		ir_emit_store(ctx, type, def, def_reg);
6465	}
6466}
6467
6468static void ir_emit_cond_cmp_fp(ir_ctx *ctx, ir_ref def, ir_insn *insn)
6469{
6470	ir_backend_data *data = ctx->data;
6471	dasm_State **Dst = &data->dasm_state;
6472	ir_type type = insn->type;
6473	ir_ref op2 = insn->op2;
6474	ir_ref op3 = insn->op3;
6475	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
6476	ir_reg op2_reg = ctx->regs[def][2];
6477	ir_reg op3_reg = ctx->regs[def][3];
6478	ir_op op;
6479
6480	if (op2_reg != IR_REG_NONE && IR_REG_SPILLED(op2_reg)) {
6481		op2_reg = IR_REG_NUM(op2_reg);
6482		ir_emit_load(ctx, type, op2_reg, op2);
6483		if (op3 == op2) {
6484			op3_reg = op2_reg;
6485		}
6486	}
6487	if (op3_reg != IR_REG_NONE && op3 != op2 && IR_REG_SPILLED(op3_reg)) {
6488		op3_reg = IR_REG_NUM(op3_reg);
6489		ir_emit_load(ctx, type, op3_reg, op3);
6490	}
6491
6492	op = ir_emit_cmp_fp_common(ctx, def, insn->op1, &ctx->ir_base[insn->op1]);
6493
6494	switch (op) {
6495		default:
6496			IR_ASSERT(0 && "NIY binary op");
6497		case IR_EQ:
6498			|	jne >2
6499			|	jp >2
6500			break;
6501		case IR_NE:
6502			|	jp >1
6503			|	je >2
6504			break;
6505		case IR_LT:
6506			|	jp >2
6507			|	jae >2
6508			break;
6509		case IR_GE:
6510			|	jb >2
6511			break;
6512		case IR_LE:
6513			|	jp >2
6514			|	ja >2
6515			break;
6516		case IR_GT:
6517			|	jbe >2
6518			break;
6519		case IR_ULT:
6520			|	jae >2
6521			break;
6522		case IR_UGE:
6523			|	jp >1
6524			|	jb >2
6525			break;
6526		case IR_ULE:
6527			|	ja >2
6528			break;
6529		case IR_UGT:
6530			|	jp >1
6531			|	jbe >2
6532			break;
6533	}
6534	|1:
6535
6536	if (op2_reg != IR_REG_NONE) {
6537		if (def_reg != op2_reg) {
6538			if (IR_IS_TYPE_INT(type)) {
6539				ir_emit_mov(ctx, type, def_reg, op2_reg);
6540			} else {
6541				ir_emit_fp_mov(ctx, type, def_reg, op2_reg);
6542			}
6543		}
6544	} else {
6545		ir_emit_load_ex(ctx, type, def_reg, op2, def);
6546	}
6547	|	jmp >3
6548	|2:
6549	if (op3_reg != IR_REG_NONE) {
6550		if (def_reg != op3_reg) {
6551			if (IR_IS_TYPE_INT(type)) {
6552				ir_emit_mov(ctx, type, def_reg, op3_reg);
6553			} else {
6554				ir_emit_fp_mov(ctx, type, def_reg, op3_reg);
6555			}
6556		}
6557	} else {
6558		ir_emit_load_ex(ctx, type, def_reg, op3, def);
6559	}
6560	|3:
6561
6562	if (IR_REG_SPILLED(ctx->regs[def][0])) {
6563		ir_emit_store(ctx, type, def, def_reg);
6564	}
6565}
6566
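/* Emit the epilogue and RET; 32-bit fastcall callees pop their own stack
 * arguments with "ret imm16". */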
6567static void ir_emit_return_void(ir_ctx *ctx)
6568{
6569	ir_backend_data *data = ctx->data;
6570	dasm_State **Dst = &data->dasm_state;
6571
6572	ir_emit_epilogue(ctx);
6573
6574#ifdef IR_TARGET_X86
6575	if (sizeof(void*) == 4 && (ctx->flags & IR_FASTCALL_FUNC) && ctx->param_stack_size) {
6576		|	ret ctx->param_stack_size
6577		return;
6578	}
6579#endif
6580
6581	|	ret
6582}
6583
6584static void ir_emit_return_int(ir_ctx *ctx, ir_ref ref, ir_insn *insn)
6585{
6586	ir_reg op2_reg = ctx->regs[ref][2];
6587
6588	if (op2_reg != IR_REG_INT_RET1) {
6589		ir_type type = ctx->ir_base[insn->op2].type;
6590
6591		if (op2_reg != IR_REG_NONE && !IR_REG_SPILLED(op2_reg)) {
6592			ir_emit_mov(ctx, type, IR_REG_INT_RET1, op2_reg);
6593		} else {
6594			ir_emit_load(ctx, type, IR_REG_INT_RET1, insn->op2);
6595		}
6596	}
6597	ir_emit_return_void(ctx);
6598}
6599
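/* Return an FP value: move it into the SSE return register or, on targets
 * without one (x87 ABIs), (re)load it onto the x87 stack with FLD. */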
6600static void ir_emit_return_fp(ir_ctx *ctx, ir_ref ref, ir_insn *insn)
6601{
6602	ir_reg op2_reg = ctx->regs[ref][2];
6603	ir_type type = ctx->ir_base[insn->op2].type;
6604
6605#ifdef IR_REG_FP_RET1
6606	if (op2_reg != IR_REG_FP_RET1) {
6607		if (op2_reg != IR_REG_NONE && !IR_REG_SPILLED(op2_reg)) {
6608			ir_emit_fp_mov(ctx, type, IR_REG_FP_RET1, op2_reg);
6609		} else {
6610			ir_emit_load(ctx, type, IR_REG_FP_RET1, insn->op2);
6611		}
6612	}
6613#else
6614	ir_backend_data *data = ctx->data;
6615	dasm_State **Dst = &data->dasm_state;
6616
6617	if (op2_reg == IR_REG_NONE || IR_REG_SPILLED(op2_reg)) {
6618		ir_reg fp;
6619		int32_t offset = ir_ref_spill_slot_offset(ctx, insn->op2, &fp);
6620
6621		if (type == IR_DOUBLE) {
6622			|	fld qword [Ra(fp)+offset]
6623		} else {
6624			IR_ASSERT(type == IR_FLOAT);
6625			|	fld dword [Ra(fp)+offset]
6626		}
6627	} else {
6628		int32_t offset = ctx->ret_slot;
6629		ir_reg fp;
6630
6631		IR_ASSERT(offset != -1);
6632		offset = IR_SPILL_POS_TO_OFFSET(offset);
6633		fp = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
6634		ir_emit_store_mem_fp(ctx, type, IR_MEM_BO(fp, offset), op2_reg);
6635		if (type == IR_DOUBLE) {
6636			|	fld qword [Ra(fp)+offset]
6637		} else {
6638			IR_ASSERT(type == IR_FLOAT);
6639			|	fld dword [Ra(fp)+offset]
6640		}
6641	}
6642#endif
6643	ir_emit_return_void(ctx);
6644}
6645
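/* SEXT: widen with MOVSX/MOVSXD from a register, a fused load or a spill
 * slot. */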
6646static void ir_emit_sext(ir_ctx *ctx, ir_ref def, ir_insn *insn)
6647{
6648	ir_type dst_type = insn->type;
6649	ir_type src_type = ctx->ir_base[insn->op1].type;
6650	ir_backend_data *data = ctx->data;
6651	dasm_State **Dst = &data->dasm_state;
6652	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
6653	ir_reg op1_reg = ctx->regs[def][1];
6654
6655	IR_ASSERT(IR_IS_TYPE_INT(src_type));
6656	IR_ASSERT(IR_IS_TYPE_INT(dst_type));
6657	IR_ASSERT(ir_type_size[dst_type] > ir_type_size[src_type]);
6658	IR_ASSERT(def_reg != IR_REG_NONE);
6659
6660	if (op1_reg != IR_REG_NONE) {
6661		if (IR_REG_SPILLED(op1_reg)) {
6662			op1_reg = IR_REG_NUM(op1_reg);
6663			ir_emit_load(ctx, src_type, op1_reg, insn->op1);
6664		}
6665		if (ir_type_size[src_type] == 1) {
6666			if (ir_type_size[dst_type] == 2) {
6667				|	movsx Rw(def_reg), Rb(op1_reg)
6668			} else if (ir_type_size[dst_type] == 4) {
6669				|	movsx Rd(def_reg), Rb(op1_reg)
6670			} else {
6671				IR_ASSERT(ir_type_size[dst_type] == 8);
6672				IR_ASSERT(sizeof(void*) == 8);
6673|.if X64
6674				|	movsx Rq(def_reg), Rb(op1_reg)
6675|.endif
6676			}
6677		} else if (ir_type_size[src_type] == 2) {
6678			if (ir_type_size[dst_type] == 4) {
6679				|	movsx Rd(def_reg), Rw(op1_reg)
6680			} else {
6681				IR_ASSERT(ir_type_size[dst_type] == 8);
6682				IR_ASSERT(sizeof(void*) == 8);
6683|.if X64
6684				|	movsx Rq(def_reg), Rw(op1_reg)
6685|.endif
6686			}
6687		} else {
6688			IR_ASSERT(ir_type_size[src_type] == 4);
6689			IR_ASSERT(ir_type_size[dst_type] == 8);
6690			IR_ASSERT(sizeof(void*) == 8);
6691|.if X64
6692			|	movsxd Rq(def_reg), Rd(op1_reg)
6693|.endif
6694		}
6695	} else if (IR_IS_CONST_REF(insn->op1)) {
6696		IR_ASSERT(0);
6697	} else {
6698		ir_mem mem;
6699
6700		if (ir_rule(ctx, insn->op1) & IR_FUSED) {
6701			mem = ir_fuse_load(ctx, def, insn->op1);
6702		} else {
6703			mem = ir_ref_spill_slot(ctx, insn->op1);
6704		}
6705
6706		if (ir_type_size[src_type] == 1) {
6707			if (ir_type_size[dst_type] == 2) {
6708				|	ASM_TXT_TMEM_OP movsx, Rw(def_reg), byte, mem
6709			} else if (ir_type_size[dst_type] == 4) {
6710				|	ASM_TXT_TMEM_OP movsx, Rd(def_reg), byte, mem
6711			} else {
6712				IR_ASSERT(ir_type_size[dst_type] == 8);
6713				IR_ASSERT(sizeof(void*) == 8);
6714|.if X64
6715				|	ASM_TXT_TMEM_OP movsx, Rq(def_reg), byte, mem
6716|.endif
6717			}
6718		} else if (ir_type_size[src_type] == 2) {
6719			if (ir_type_size[dst_type] == 4) {
6720				|	ASM_TXT_TMEM_OP movsx, Rd(def_reg), word, mem
6721			} else {
6722				IR_ASSERT(ir_type_size[dst_type] == 8);
6723				IR_ASSERT(sizeof(void*) == 8);
6724|.if X64
6725				|	ASM_TXT_TMEM_OP movsx, Rq(def_reg), word, mem
6726|.endif
6727			}
6728		} else {
6729			IR_ASSERT(ir_type_size[src_type] == 4);
6730			IR_ASSERT(ir_type_size[dst_type] == 8);
6731			IR_ASSERT(sizeof(void*) == 8);
6732|.if X64
6733			|	ASM_TXT_TMEM_OP movsxd, Rq(def_reg), dword, mem
6734|.endif
6735		}
6736	}
6737	if (IR_REG_SPILLED(ctx->regs[def][0])) {
6738		ir_emit_store(ctx, dst_type, def, def_reg);
6739	}
6740}
6741
6742static void ir_emit_zext(ir_ctx *ctx, ir_ref def, ir_insn *insn)
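/* ZEXT: widen with MOVZX; the 32->64-bit case is a plain 32-bit MOV, whose
 * implicit zero extension makes an explicit instruction unnecessary. */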
6743{
6744	ir_type dst_type = insn->type;
6745	ir_type src_type = ctx->ir_base[insn->op1].type;
6746	ir_backend_data *data = ctx->data;
6747	dasm_State **Dst = &data->dasm_state;
6748	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
6749	ir_reg op1_reg = ctx->regs[def][1];
6750
6751	IR_ASSERT(IR_IS_TYPE_INT(src_type));
6752	IR_ASSERT(IR_IS_TYPE_INT(dst_type));
6753	IR_ASSERT(ir_type_size[dst_type] > ir_type_size[src_type]);
6754	IR_ASSERT(def_reg != IR_REG_NONE);
6755
6756	if (op1_reg != IR_REG_NONE) {
6757		if (IR_REG_SPILLED(op1_reg)) {
6758			op1_reg = IR_REG_NUM(op1_reg);
6759			ir_emit_load(ctx, src_type, op1_reg, insn->op1);
6760		}
6761		if (ir_type_size[src_type] == 1) {
6762			if (ir_type_size[dst_type] == 2) {
6763				|	movzx Rw(def_reg), Rb(op1_reg)
6764			} else if (ir_type_size[dst_type] == 4) {
6765				|	movzx Rd(def_reg), Rb(op1_reg)
6766			} else {
6767				IR_ASSERT(ir_type_size[dst_type] == 8);
6768				IR_ASSERT(sizeof(void*) == 8);
6769|.if X64
6770				|	movzx Rq(def_reg), Rb(op1_reg)
6771|.endif
6772			}
6773		} else if (ir_type_size[src_type] == 2) {
6774			if (ir_type_size[dst_type] == 4) {
6775				|	movzx Rd(def_reg), Rw(op1_reg)
6776			} else {
6777				IR_ASSERT(ir_type_size[dst_type] == 8);
6778				IR_ASSERT(sizeof(void*) == 8);
6779|.if X64
6780				|	movzx Rq(def_reg), Rw(op1_reg)
6781|.endif
6782			}
6783		} else {
6784			IR_ASSERT(ir_type_size[src_type] == 4);
6785			IR_ASSERT(ir_type_size[dst_type] == 8);
6786			IR_ASSERT(sizeof(void*) == 8);
6787|.if X64
6788			/* Avoid a zero-extending move to the same register: the upper 32 bits are assumed to be already clear, which may not always be safe */
6789			if (op1_reg != def_reg) {
6790				|	mov Rd(def_reg), Rd(op1_reg)
6791			}
6792|.endif
6793		}
6794	} else if (IR_IS_CONST_REF(insn->op1)) {
6795		IR_ASSERT(0);
6796	} else {
6797		ir_mem mem;
6798
6799		if (ir_rule(ctx, insn->op1) & IR_FUSED) {
6800			mem = ir_fuse_load(ctx, def, insn->op1);
6801		} else {
6802			mem = ir_ref_spill_slot(ctx, insn->op1);
6803		}
6804
6805		if (ir_type_size[src_type] == 1) {
6806			if (ir_type_size[dst_type] == 2) {
6807				|	ASM_TXT_TMEM_OP movzx, Rw(def_reg), byte, mem
6808			} else if (ir_type_size[dst_type] == 4) {
6809				|	ASM_TXT_TMEM_OP movzx, Rd(def_reg), byte, mem
6810			} else {
6811				IR_ASSERT(ir_type_size[dst_type] == 8);
6812				IR_ASSERT(sizeof(void*) == 8);
6813|.if X64
6814				|	ASM_TXT_TMEM_OP movzx, Rq(def_reg), byte, mem
6815|.endif
6816			}
6817		} else if (ir_type_size[src_type] == 2) {
6818			if (ir_type_size[dst_type] == 4) {
6819				|	ASM_TXT_TMEM_OP movzx, Rd(def_reg), word, mem
6820			} else {
6821				IR_ASSERT(ir_type_size[dst_type] == 8);
6822				IR_ASSERT(sizeof(void*) == 8);
6823|.if X64
6824				|	ASM_TXT_TMEM_OP movzx, Rq(def_reg), word, mem
6825|.endif
6826			}
6827		} else {
6828			IR_ASSERT(ir_type_size[src_type] == 4);
6829			IR_ASSERT(ir_type_size[dst_type] == 8);
6830|.if X64
6831			|	ASM_TXT_TMEM_OP mov, Rd(def_reg), dword, mem
6832|.endif
6833		}
6834	}
6835	if (IR_REG_SPILLED(ctx->regs[def][0])) {
6836		ir_emit_store(ctx, dst_type, def, def_reg);
6837	}
6838}
6839
6840static void ir_emit_trunc(ir_ctx *ctx, ir_ref def, ir_insn *insn)
6841{
6842	ir_type dst_type = insn->type;
6843	ir_type src_type = ctx->ir_base[insn->op1].type;
6844	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
6845	ir_reg op1_reg = ctx->regs[def][1];
6846
6847	IR_ASSERT(IR_IS_TYPE_INT(src_type));
6848	IR_ASSERT(IR_IS_TYPE_INT(dst_type));
6849	IR_ASSERT(ir_type_size[dst_type] < ir_type_size[src_type]);
6850	IR_ASSERT(def_reg != IR_REG_NONE);
6851	if (op1_reg != IR_REG_NONE) {
6852		if (IR_REG_SPILLED(op1_reg)) {
6853			op1_reg = IR_REG_NUM(op1_reg);
6854			ir_emit_load(ctx, src_type, op1_reg, insn->op1);
6855		}
6856		if (op1_reg != def_reg) {
6857			ir_emit_mov(ctx, dst_type, def_reg, op1_reg);
6858		}
6859	} else {
6860		ir_emit_load_ex(ctx, dst_type, def_reg, insn->op1, def);
6861	}
6862	if (IR_REG_SPILLED(ctx->regs[def][0])) {
6863		ir_emit_store(ctx, dst_type, def, def_reg);
6864	}
6865}
6866
6867static void ir_emit_bitcast(ir_ctx *ctx, ir_ref def, ir_insn *insn)
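/* BITCAST between same-sized types: GP<->GP and XMM<->XMM are ordinary moves,
 * while int<->FP reinterpretation goes through MOVD/MOVQ or, for constants,
 * through an immediate or a .rodata load. */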
6868{
6869	ir_type dst_type = insn->type;
6870	ir_type src_type = ctx->ir_base[insn->op1].type;
6871	ir_backend_data *data = ctx->data;
6872	dasm_State **Dst = &data->dasm_state;
6873	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
6874	ir_reg op1_reg = ctx->regs[def][1];
6875
6876	IR_ASSERT(ir_type_size[dst_type] == ir_type_size[src_type]);
6877	IR_ASSERT(def_reg != IR_REG_NONE);
6878	if (IR_IS_TYPE_INT(src_type) && IR_IS_TYPE_INT(dst_type)) {
6879		if (op1_reg != IR_REG_NONE) {
6880			if (IR_REG_SPILLED(op1_reg)) {
6881				op1_reg = IR_REG_NUM(op1_reg);
6882				ir_emit_load(ctx, src_type, op1_reg, insn->op1);
6883			}
6884			if (op1_reg != def_reg) {
6885				ir_emit_mov(ctx, dst_type, def_reg, op1_reg);
6886			}
6887		} else {
6888			ir_emit_load_ex(ctx, dst_type, def_reg, insn->op1, def);
6889		}
6890	} else if (IR_IS_TYPE_FP(src_type) && IR_IS_TYPE_FP(dst_type)) {
6891		if (op1_reg != IR_REG_NONE) {
6892			if (IR_REG_SPILLED(op1_reg)) {
6893				op1_reg = IR_REG_NUM(op1_reg);
6894				ir_emit_load(ctx, src_type, op1_reg, insn->op1);
6895			}
6896			if (op1_reg != def_reg) {
6897				ir_emit_fp_mov(ctx, dst_type, def_reg, op1_reg);
6898			}
6899		} else {
6900			ir_emit_load_ex(ctx, dst_type, def_reg, insn->op1, def);
6901		}
6902	} else if (IR_IS_TYPE_FP(src_type)) {
6903		IR_ASSERT(IR_IS_TYPE_INT(dst_type));
6904		if (op1_reg != IR_REG_NONE) {
6905			if (IR_REG_SPILLED(op1_reg)) {
6906				op1_reg = IR_REG_NUM(op1_reg);
6907				ir_emit_load(ctx, src_type, op1_reg, insn->op1);
6908			}
6909			if (src_type == IR_DOUBLE) {
6910				IR_ASSERT(sizeof(void*) == 8);
6911|.if X64
6912				if (ctx->mflags & IR_X86_AVX) {
6913					|	vmovd Rq(def_reg), xmm(op1_reg-IR_REG_FP_FIRST)
6914				} else {
6915					|	movd Rq(def_reg), xmm(op1_reg-IR_REG_FP_FIRST)
6916				}
6917|.endif
6918			} else {
6919				IR_ASSERT(src_type == IR_FLOAT);
6920				if (ctx->mflags & IR_X86_AVX) {
6921					|	vmovd Rd(def_reg), xmm(op1_reg-IR_REG_FP_FIRST)
6922				} else {
6923					|	movd Rd(def_reg), xmm(op1_reg-IR_REG_FP_FIRST)
6924				}
6925			}
6926		} else if (IR_IS_CONST_REF(insn->op1)) {
6927			ir_insn *_insn = &ctx->ir_base[insn->op1];
6928			IR_ASSERT(!IR_IS_SYM_CONST(_insn->op));
6929			if (src_type == IR_DOUBLE) {
6930				IR_ASSERT(sizeof(void*) == 8);
6931|.if X64
6932				|	mov64 Rq(def_reg), _insn->val.i64
6933|.endif
6934			} else {
6935				IR_ASSERT(src_type == IR_FLOAT);
6936				|	mov Rd(def_reg), _insn->val.i32
6937			}
6938		} else {
6939			ir_mem mem;
6940
6941			if (ir_rule(ctx, insn->op1) & IR_FUSED) {
6942				mem = ir_fuse_load(ctx, def, insn->op1);
6943			} else {
6944				mem = ir_ref_spill_slot(ctx, insn->op1);
6945			}
6946
6947			if (src_type == IR_DOUBLE) {
6948				IR_ASSERT(sizeof(void*) == 8);
6949|.if X64
6950				|	ASM_TXT_TMEM_OP mov, Rq(def_reg), qword, mem
6951|.endif
6952			} else {
6953				IR_ASSERT(src_type == IR_FLOAT);
6954				|	ASM_TXT_TMEM_OP mov, Rd(def_reg), dword, mem
6955			}
6956		}
6957	} else if (IR_IS_TYPE_FP(dst_type)) {
6958		IR_ASSERT(IR_IS_TYPE_INT(src_type));
6959		if (op1_reg != IR_REG_NONE) {
6960			if (IR_REG_SPILLED(op1_reg)) {
6961				op1_reg = IR_REG_NUM(op1_reg);
6962				ir_emit_load(ctx, src_type, op1_reg, insn->op1);
6963			}
6964			if (dst_type == IR_DOUBLE) {
6965				IR_ASSERT(sizeof(void*) == 8);
6966|.if X64
6967				if (ctx->mflags & IR_X86_AVX) {
6968					|	vmovd xmm(def_reg-IR_REG_FP_FIRST), Rq(op1_reg)
6969				} else {
6970					|	movd xmm(def_reg-IR_REG_FP_FIRST), Rq(op1_reg)
6971				}
6972|.endif
6973			} else {
6974				IR_ASSERT(dst_type == IR_FLOAT);
6975				if (ctx->mflags & IR_X86_AVX) {
6976					|	vmovd xmm(def_reg-IR_REG_FP_FIRST), Rd(op1_reg)
6977				} else {
6978					|	movd xmm(def_reg-IR_REG_FP_FIRST), Rd(op1_reg)
6979				}
6980			}
6981		} else if (IR_IS_CONST_REF(insn->op1)) {
6982			int label = ir_const_label(ctx, insn->op1);
6983
6984			|	ASM_FP_REG_TXT_OP movs, dst_type, def_reg, [=>label]
6985		} else {
6986			ir_mem mem;
6987
6988			if (ir_rule(ctx, insn->op1) & IR_FUSED) {
6989				mem = ir_fuse_load(ctx, def, insn->op1);
6990			} else {
6991				mem = ir_ref_spill_slot(ctx, insn->op1);
6992			}
6993
6994			|	ASM_FP_REG_MEM_OP movs, dst_type, def_reg, mem
6995		}
6996	}
6997	if (IR_REG_SPILLED(ctx->regs[def][0])) {
6998		ir_emit_store(ctx, dst_type, def, def_reg);
6999	}
7000}
7001
7002static void ir_emit_int2fp(ir_ctx *ctx, ir_ref def, ir_insn *insn)
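/* INT2FP via CVTSI2SS/CVTSI2SD. Sub-32-bit sources are widened first, and the
 * destination XMM register is cleared with PXOR/VXORPS beforehand, which also
 * breaks the false dependency on its previous contents. Unsigned 64-bit
 * sources still take the plain signed conversion (see the TODO below). */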
7003{
7004	ir_type dst_type = insn->type;
7005	ir_type src_type = ctx->ir_base[insn->op1].type;
7006	ir_backend_data *data = ctx->data;
7007	dasm_State **Dst = &data->dasm_state;
7008	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
7009	ir_reg op1_reg = ctx->regs[def][1];
7010
7011	IR_ASSERT(IR_IS_TYPE_INT(src_type));
7012	IR_ASSERT(IR_IS_TYPE_FP(dst_type));
7013	IR_ASSERT(def_reg != IR_REG_NONE);
7014	if (op1_reg != IR_REG_NONE) {
7015		bool src64 = 0;
7016
7017		if (IR_REG_SPILLED(op1_reg)) {
7018			op1_reg = IR_REG_NUM(op1_reg);
7019			ir_emit_load(ctx, src_type, op1_reg, insn->op1);
7020		}
7021		if (IR_IS_TYPE_SIGNED(src_type)) {
7022			if (ir_type_size[src_type] < 4) {
7023|.if X64
7024||				if (ir_type_size[src_type] == 1) {
7025					| movsx Rq(op1_reg), Rb(op1_reg)
7026||				} else {
7027					| movsx Rq(op1_reg), Rw(op1_reg)
7028||				}
7029||				src64 = 1;
7030|.else
7031||				if (ir_type_size[src_type] == 1) {
7032					| movsx Rd(op1_reg), Rb(op1_reg)
7033||				} else {
7034					| movsx Rd(op1_reg), Rw(op1_reg)
7035||				}
7036|.endif
7037			} else if (ir_type_size[src_type] > 4) {
7038				src64 = 1;
7039			}
7040		} else {
7041			if (ir_type_size[src_type] < 8) {
7042|.if X64
7043||				if (ir_type_size[src_type] == 1) {
7044					| movzx Rq(op1_reg), Rb(op1_reg)
7045||				} else if (ir_type_size[src_type] == 2) {
7046					| movzx Rq(op1_reg), Rw(op1_reg)
7047||				}
7048||				src64 = 1;
7049|.else
7050||				if (ir_type_size[src_type] == 1) {
7051					| movzx Rd(op1_reg), Rb(op1_reg)
7052||				} else if (ir_type_size[src_type] == 2) {
7053					| movzx Rd(op1_reg), Rw(op1_reg)
7054||				}
7055|.endif
7056			} else {
7057				// TODO: uint64_t -> double
7058				src64 = 1;
7059			}
7060		}
7061		if (!src64) {
7062			if (dst_type == IR_DOUBLE) {
7063				if (ctx->mflags & IR_X86_AVX) {
7064					|	vxorps xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
7065					|	vcvtsi2sd xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), Rd(op1_reg)
7066				} else {
7067					|	pxor xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
7068					|	cvtsi2sd xmm(def_reg-IR_REG_FP_FIRST), Rd(op1_reg)
7069				}
7070			} else {
7071				IR_ASSERT(dst_type == IR_FLOAT);
7072				if (ctx->mflags & IR_X86_AVX) {
7073					|	vxorps xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
7074					|	vcvtsi2ss xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), Rd(op1_reg)
7075				} else {
7076					|	pxor xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
7077					|	cvtsi2ss xmm(def_reg-IR_REG_FP_FIRST), Rd(op1_reg)
7078				}
7079			}
7080		} else {
7081			IR_ASSERT(sizeof(void*) == 8);
7082|.if X64
7083			if (dst_type == IR_DOUBLE) {
7084				if (ctx->mflags & IR_X86_AVX) {
7085					|	vxorps xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
7086					|	vcvtsi2sd xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), Rq(op1_reg)
7087				} else {
7088					|	pxor xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
7089					|	cvtsi2sd xmm(def_reg-IR_REG_FP_FIRST), Rq(op1_reg)
7090				}
7091			} else {
7092				IR_ASSERT(dst_type == IR_FLOAT);
7093				if (ctx->mflags & IR_X86_AVX) {
7094					|	vxorps xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
7095					|	vcvtsi2ss xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), Rq(op1_reg)
7096				} else {
7097					|	pxor xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
7098					|	cvtsi2ss xmm(def_reg-IR_REG_FP_FIRST), Rq(op1_reg)
7099				}
7100			}
7101|.endif
7102		}
7103	} else {
7104		ir_mem mem;
7105		bool src64 = ir_type_size[src_type] == 8;
7106
7107		if (ir_rule(ctx, insn->op1) & IR_FUSED) {
7108			mem = ir_fuse_load(ctx, def, insn->op1);
7109		} else {
7110			mem = ir_ref_spill_slot(ctx, insn->op1);
7111		}
7112
7113		if (!src64) {
7114			if (dst_type == IR_DOUBLE) {
7115				if (ctx->mflags & IR_X86_AVX) {
7116					|	vxorps xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
7117					|	ASM_TXT_TXT_TMEM_OP vcvtsi2sd, xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), dword, mem
7118				} else {
7119					|	pxor xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
7120					|	ASM_TXT_TMEM_OP cvtsi2sd, xmm(def_reg-IR_REG_FP_FIRST), dword, mem
7121				}
7122			} else {
7123				IR_ASSERT(dst_type == IR_FLOAT);
7124				if (ctx->mflags & IR_X86_AVX) {
7125					|	vxorps xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
7126					|	ASM_TXT_TXT_TMEM_OP vcvtsi2ss, xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), dword, mem
7127				} else {
7128					|	pxor xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
7129					|	ASM_TXT_TMEM_OP cvtsi2ss, xmm(def_reg-IR_REG_FP_FIRST), dword, mem
7130				}
7131			}
7132		} else {
7133			IR_ASSERT(sizeof(void*) == 8);
7134|.if X64
7135			if (dst_type == IR_DOUBLE) {
7136				if (ctx->mflags & IR_X86_AVX) {
7137					|	vxorps xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
7138					|	ASM_TXT_TXT_TMEM_OP vcvtsi2sd, xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), qword, mem
7139				} else {
7140					|	pxor xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
7141					|	ASM_TXT_TMEM_OP cvtsi2sd, xmm(def_reg-IR_REG_FP_FIRST), qword, mem
7142				}
7143			} else {
7144				IR_ASSERT(dst_type == IR_FLOAT);
7145				if (ctx->mflags & IR_X86_AVX) {
7146					|	vxorps xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
7147					|	ASM_TXT_TXT_TMEM_OP vcvtsi2ss, xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), qword, mem
7148				} else {
7149					|	pxor xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
7150					|	ASM_TXT_TMEM_OP cvtsi2ss, xmm(def_reg-IR_REG_FP_FIRST), qword, mem
7151				}
7152			}
7153|.endif
7154		}
7155	}
7156	if (IR_REG_SPILLED(ctx->regs[def][0])) {
7157		ir_emit_store(ctx, dst_type, def, def_reg);
7158	}
7159}
7160
7161static void ir_emit_fp2int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
7162{
7163	ir_type dst_type = insn->type;
7164	ir_type src_type = ctx->ir_base[insn->op1].type;
7165	ir_backend_data *data = ctx->data;
7166	dasm_State **Dst = &data->dasm_state;
7167	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
7168	ir_reg op1_reg = ctx->regs[def][1];
7169	bool dst64 = 0;
7170
7171	IR_ASSERT(IR_IS_TYPE_FP(src_type));
7172	IR_ASSERT(IR_IS_TYPE_INT(dst_type));
7173	IR_ASSERT(def_reg != IR_REG_NONE);
7174	if (IR_IS_TYPE_SIGNED(dst_type) ? ir_type_size[dst_type] == 8 : ir_type_size[dst_type] >= 4) {
7175		// TODO: we might need to perform truncation from 32/64 bit integer
7176		dst64 = 1;
7177	}
7178	if (op1_reg != IR_REG_NONE) {
7179		if (IR_REG_SPILLED(op1_reg)) {
7180			op1_reg = IR_REG_NUM(op1_reg);
7181			ir_emit_load(ctx, src_type, op1_reg, insn->op1);
7182		}
7183		if (!dst64) {
7184			if (src_type == IR_DOUBLE) {
7185				if (ctx->mflags & IR_X86_AVX) {
7186					|	vcvttsd2si Rd(def_reg), xmm(op1_reg-IR_REG_FP_FIRST)
7187				} else {
7188					|	cvttsd2si Rd(def_reg), xmm(op1_reg-IR_REG_FP_FIRST)
7189				}
7190			} else {
7191				IR_ASSERT(src_type == IR_FLOAT);
7192				if (ctx->mflags & IR_X86_AVX) {
7193					|	vcvttss2si Rd(def_reg), xmm(op1_reg-IR_REG_FP_FIRST)
7194				} else {
7195					|	cvttss2si Rd(def_reg), xmm(op1_reg-IR_REG_FP_FIRST)
7196				}
7197			}
7198		} else {
7199			IR_ASSERT(sizeof(void*) == 8);
7200|.if X64
7201			if (src_type == IR_DOUBLE) {
7202				if (ctx->mflags & IR_X86_AVX) {
7203					|	vcvttsd2si Rq(def_reg), xmm(op1_reg-IR_REG_FP_FIRST)
7204				} else {
7205					|	cvttsd2si Rq(def_reg), xmm(op1_reg-IR_REG_FP_FIRST)
7206				}
7207			} else {
7208				IR_ASSERT(src_type == IR_FLOAT);
7209				if (ctx->mflags & IR_X86_AVX) {
7210					|	vcvttss2si Rq(def_reg), xmm(op1_reg-IR_REG_FP_FIRST)
7211				} else {
7212					|	cvttss2si Rq(def_reg), xmm(op1_reg-IR_REG_FP_FIRST)
7213				}
7214			}
7215|.endif
7216		}
7217	} else if (IR_IS_CONST_REF(insn->op1)) {
7218		int label = ir_const_label(ctx, insn->op1);
7219
7220		if (!dst64) {
7221			if (src_type == IR_DOUBLE) {
7222				if (ctx->mflags & IR_X86_AVX) {
7223					|	vcvttsd2si Rd(def_reg), qword [=>label]
7224				} else {
7225					|	cvttsd2si Rd(def_reg), qword [=>label]
7226				}
7227			} else {
7228				IR_ASSERT(src_type == IR_FLOAT);
7229				if (ctx->mflags & IR_X86_AVX) {
7230					|	vcvttss2si Rd(def_reg), dword [=>label]
7231				} else {
7232					|	cvttss2si Rd(def_reg), dword [=>label]
7233				}
7234			}
7235		} else {
7236			IR_ASSERT(sizeof(void*) == 8);
7237|.if X64
7238			if (src_type == IR_DOUBLE) {
7239				if (ctx->mflags & IR_X86_AVX) {
7240					|	vcvttsd2si Rq(def_reg), qword [=>label]
7241				} else {
7242					|	cvttsd2si Rq(def_reg), qword [=>label]
7243				}
7244			} else {
7245				IR_ASSERT(src_type == IR_FLOAT);
7246				if (ctx->mflags & IR_X86_AVX) {
7247					|	vcvttss2si Rq(def_reg), dword [=>label]
7248				} else {
7249					|	cvttss2si Rq(def_reg), dword [=>label]
7250				}
7251			}
7252|.endif
7253		}
7254	} else {
7255		ir_mem mem;
7256
7257		if (ir_rule(ctx, insn->op1) & IR_FUSED) {
7258			mem = ir_fuse_load(ctx, def, insn->op1);
7259		} else {
7260			mem = ir_ref_spill_slot(ctx, insn->op1);
7261		}
7262
7263		if (!dst64) {
7264			if (src_type == IR_DOUBLE) {
7265				if (ctx->mflags & IR_X86_AVX) {
7266					|	ASM_TXT_TMEM_OP vcvttsd2si, Rd(def_reg), qword, mem
7267				} else {
7268					|	ASM_TXT_TMEM_OP cvttsd2si, Rd(def_reg), qword, mem
7269				}
7270			} else {
7271				IR_ASSERT(src_type == IR_FLOAT);
7272				if (ctx->mflags & IR_X86_AVX) {
7273					|	ASM_TXT_TMEM_OP vcvttss2si, Rd(def_reg), dword, mem
7274				} else {
7275					|	ASM_TXT_TMEM_OP cvttss2si, Rd(def_reg), dword, mem
7276				}
7277			}
7278		} else {
7279			IR_ASSERT(sizeof(void*) == 8);
7280|.if X64
7281			if (src_type == IR_DOUBLE) {
7282				if (ctx->mflags & IR_X86_AVX) {
7283					|	ASM_TXT_TMEM_OP vcvttsd2si, Rq(def_reg), qword, mem
7284				} else {
7285					|	ASM_TXT_TMEM_OP cvttsd2si, Rq(def_reg), qword, mem
7286				}
7287			} else {
7288				IR_ASSERT(src_type == IR_FLOAT);
7289				if (ctx->mflags & IR_X86_AVX) {
7290					|	ASM_TXT_TMEM_OP vcvttss2si, Rq(def_reg), dword, mem
7291				} else {
7292					|	ASM_TXT_TMEM_OP cvttss2si, Rq(def_reg), dword, mem
7293				}
7294			}
7295|.endif
7296		}
7297	}
7298	if (IR_REG_SPILLED(ctx->regs[def][0])) {
7299		ir_emit_store(ctx, dst_type, def, def_reg);
7300	}
7301}
7302
7303static void ir_emit_fp2fp(ir_ctx *ctx, ir_ref def, ir_insn *insn)
7304{
7305	ir_type dst_type = insn->type;
7306	ir_type src_type = ctx->ir_base[insn->op1].type;
7307	ir_backend_data *data = ctx->data;
7308	dasm_State **Dst = &data->dasm_state;
7309	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
7310	ir_reg op1_reg = ctx->regs[def][1];
7311
7312	IR_ASSERT(IR_IS_TYPE_FP(src_type));
7313	IR_ASSERT(IR_IS_TYPE_FP(dst_type));
7314	IR_ASSERT(def_reg != IR_REG_NONE);
7315	if (op1_reg != IR_REG_NONE) {
7316		if (IR_REG_SPILLED(op1_reg)) {
7317			op1_reg = IR_REG_NUM(op1_reg);
7318			ir_emit_load(ctx, src_type, op1_reg, insn->op1);
7319		}
7320		if (src_type == dst_type) {
7321			if (op1_reg != def_reg) {
7322				ir_emit_fp_mov(ctx, dst_type, def_reg, op1_reg);
7323			}
7324		} else if (src_type == IR_DOUBLE) {
7325			if (ctx->mflags & IR_X86_AVX) {
7326				|	vcvtsd2ss xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), xmm(op1_reg-IR_REG_FP_FIRST)
7327			} else {
7328				|	cvtsd2ss xmm(def_reg-IR_REG_FP_FIRST), xmm(op1_reg-IR_REG_FP_FIRST)
7329			}
7330		} else {
7331			IR_ASSERT(src_type == IR_FLOAT);
7332			if (ctx->mflags & IR_X86_AVX) {
7333				|	vcvtss2sd xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), xmm(op1_reg-IR_REG_FP_FIRST)
7334			} else {
7335				|	cvtss2sd xmm(def_reg-IR_REG_FP_FIRST), xmm(op1_reg-IR_REG_FP_FIRST)
7336			}
7337		}
7338	} else if (IR_IS_CONST_REF(insn->op1)) {
7339		int label = ir_const_label(ctx, insn->op1);
7340
7341		if (src_type == IR_DOUBLE) {
7342			if (ctx->mflags & IR_X86_AVX) {
7343				|	vcvtsd2ss xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), qword [=>label]
7344			} else {
7345				|	cvtsd2ss xmm(def_reg-IR_REG_FP_FIRST), qword [=>label]
7346			}
7347		} else {
7348			IR_ASSERT(src_type == IR_FLOAT);
7349			if (ctx->mflags & IR_X86_AVX) {
7350				|	vcvtss2sd xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), dword [=>label]
7351			} else {
7352				|	cvtss2sd xmm(def_reg-IR_REG_FP_FIRST), dword [=>label]
7353			}
7354		}
7355	} else {
7356		ir_mem mem;
7357
7358		if (ir_rule(ctx, insn->op1) & IR_FUSED) {
7359			mem = ir_fuse_load(ctx, def, insn->op1);
7360		} else {
7361			mem = ir_ref_spill_slot(ctx, insn->op1);
7362		}
7363
7364		if (src_type == IR_DOUBLE) {
7365			if (ctx->mflags & IR_X86_AVX) {
7366				|	ASM_TXT_TXT_TMEM_OP vcvtsd2ss, xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), qword, mem
7367			} else {
7368				|	ASM_TXT_TMEM_OP cvtsd2ss, xmm(def_reg-IR_REG_FP_FIRST), qword, mem
7369			}
7370		} else {
7371			IR_ASSERT(src_type == IR_FLOAT);
7372			if (ctx->mflags & IR_X86_AVX) {
7373				|	ASM_TXT_TXT_TMEM_OP vcvtss2sd, xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), dword, mem
7374			} else {
7375				|	ASM_TXT_TMEM_OP cvtss2sd, xmm(def_reg-IR_REG_FP_FIRST), dword, mem
7376			}
7377		}
7378	}
7379	if (IR_REG_SPILLED(ctx->regs[def][0])) {
7380		ir_emit_store(ctx, dst_type, def, def_reg);
7381	}
7382}
7383
7384static void ir_emit_copy_int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
7385{
	ir_type type = insn->type;
7387	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
7388	ir_reg op1_reg = ctx->regs[def][1];
7389
7390	IR_ASSERT(def_reg != IR_REG_NONE || op1_reg != IR_REG_NONE);
7391	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
7392		op1_reg = IR_REG_NUM(op1_reg);
7393		ir_emit_load(ctx, type, op1_reg, insn->op1);
7394	}
7395	if (def_reg == op1_reg) {
7396		/* same reg */
7397	} else if (def_reg != IR_REG_NONE && op1_reg != IR_REG_NONE) {
7398		ir_emit_mov(ctx, type, def_reg, op1_reg);
7399	} else if (def_reg != IR_REG_NONE) {
7400		ir_emit_load(ctx, type, def_reg, insn->op1);
7401	} else if (op1_reg != IR_REG_NONE) {
7402		ir_emit_store(ctx, type, def, op1_reg);
7403	} else {
7404		IR_ASSERT(0);
7405	}
7406	if (def_reg != IR_REG_NONE && IR_REG_SPILLED(ctx->regs[def][0])) {
7407		ir_emit_store(ctx, type, def, def_reg);
7408	}
7409}
7410
7411static void ir_emit_copy_fp(ir_ctx *ctx, ir_ref def, ir_insn *insn)
7412{
7413	ir_type type = insn->type;
7414	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
7415	ir_reg op1_reg = ctx->regs[def][1];
7416
7417	IR_ASSERT(def_reg != IR_REG_NONE || op1_reg != IR_REG_NONE);
7418	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
7419		op1_reg = IR_REG_NUM(op1_reg);
7420		ir_emit_load(ctx, type, op1_reg, insn->op1);
7421	}
7422	if (def_reg == op1_reg) {
7423		/* same reg */
7424	} else if (def_reg != IR_REG_NONE && op1_reg != IR_REG_NONE) {
7425		ir_emit_fp_mov(ctx, type, def_reg, op1_reg);
7426	} else if (def_reg != IR_REG_NONE) {
7427		ir_emit_load(ctx, type, def_reg, insn->op1);
7428	} else if (op1_reg != IR_REG_NONE) {
7429		ir_emit_store(ctx, type, def, op1_reg);
7430	} else {
7431		IR_ASSERT(0);
7432	}
7433	if (def_reg != IR_REG_NONE && IR_REG_SPILLED(ctx->regs[def][0])) {
7434		ir_emit_store(ctx, type, def, def_reg);
7435	}
7436}
7437
7438static void ir_emit_vaddr(ir_ctx *ctx, ir_ref def, ir_insn *insn)
7439{
7440	ir_backend_data *data = ctx->data;
7441	dasm_State **Dst = &data->dasm_state;
	ir_type type = insn->type;
7443	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
7444	ir_mem mem;
7445	int32_t offset;
7446	ir_reg fp;
7447
7448	IR_ASSERT(def_reg != IR_REG_NONE);
7449	mem = ir_var_spill_slot(ctx, insn->op1);
7450	fp = IR_MEM_BASE(mem);
7451	offset = IR_MEM_OFFSET(mem);
7452	|	lea Ra(def_reg), aword [Ra(fp)+offset]
7453	if (IR_REG_SPILLED(ctx->regs[def][0])) {
7454		ir_emit_store(ctx, type, def, def_reg);
7455	}
7456}
7457
7458static void ir_emit_vload(ir_ctx *ctx, ir_ref def, ir_insn *insn)
7459{
7460	ir_insn *var_insn = &ctx->ir_base[insn->op2];
	ir_type type = insn->type;
7462	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
7463	ir_reg fp;
7464	ir_mem mem;
7465
7466	IR_ASSERT(var_insn->op == IR_VAR);
7467	fp = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
7468	mem = IR_MEM_BO(fp, IR_SPILL_POS_TO_OFFSET(var_insn->op3));
7469	if (def_reg == IR_REG_NONE && ir_is_same_mem_var(ctx, def, var_insn->op3)) {
7470		return; // fake load
7471	}
7472	IR_ASSERT(def_reg != IR_REG_NONE);
7473
7474	ir_emit_load_mem(ctx, type, def_reg, mem);
7475	if (IR_REG_SPILLED(ctx->regs[def][0])) {
7476		ir_emit_store(ctx, type, def, def_reg);
7477	}
7478}
7479
7480static void ir_emit_vstore_int(ir_ctx *ctx, ir_ref ref, ir_insn *insn)
7481{
7482	ir_insn *var_insn = &ctx->ir_base[insn->op2];
7483	ir_insn *val_insn = &ctx->ir_base[insn->op3];
	ir_type type = val_insn->type;
7485	ir_reg op3_reg = ctx->regs[ref][3];
7486	ir_reg fp;
7487	ir_mem mem;
7488
7489	IR_ASSERT(var_insn->op == IR_VAR);
7490	fp = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
7491	mem = IR_MEM_BO(fp, IR_SPILL_POS_TO_OFFSET(var_insn->op3));
7492	if ((op3_reg == IR_REG_NONE || IR_REG_SPILLED(op3_reg))
7493	 && !IR_IS_CONST_REF(insn->op3)
7494	 && ir_rule(ctx, insn->op3) != IR_STATIC_ALLOCA
7495	 && ir_is_same_mem_var(ctx, insn->op3, var_insn->op3)) {
7496		return; // fake store
7497	}
7498	if (IR_IS_CONST_REF(insn->op3)) {
7499		ir_emit_store_mem_int_const(ctx, type, mem, insn->op3, op3_reg, 0);
7500	} else {
7501		IR_ASSERT(op3_reg != IR_REG_NONE);
7502		if (IR_REG_SPILLED(op3_reg)) {
7503			op3_reg = IR_REG_NUM(op3_reg);
7504			ir_emit_load(ctx, type, op3_reg, insn->op3);
7505		}
7506		ir_emit_store_mem_int(ctx, type, mem, op3_reg);
7507	}
7508}
7509
7510static void ir_emit_vstore_fp(ir_ctx *ctx, ir_ref ref, ir_insn *insn)
7511{
7512	ir_insn *var_insn = &ctx->ir_base[insn->op2];
	ir_type type = ctx->ir_base[insn->op3].type;
7514	ir_reg op3_reg = ctx->regs[ref][3];
7515	ir_reg fp;
7516	ir_mem mem;
7517
7518	IR_ASSERT(var_insn->op == IR_VAR);
7519	fp = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
7520	mem = IR_MEM_BO(fp, IR_SPILL_POS_TO_OFFSET(var_insn->op3));
7521	if ((op3_reg == IR_REG_NONE || IR_REG_SPILLED(op3_reg))
7522	 && !IR_IS_CONST_REF(insn->op3)
7523	 && ir_rule(ctx, insn->op3) != IR_STATIC_ALLOCA
7524	 && ir_is_same_mem_var(ctx, insn->op3, var_insn->op3)) {
7525		return; // fake store
7526	}
7527	if (IR_IS_CONST_REF(insn->op3)) {
7528		ir_emit_store_mem_fp_const(ctx, type, mem, insn->op3, IR_REG_NONE, op3_reg);
7529	} else {
7530		IR_ASSERT(op3_reg != IR_REG_NONE);
7531		if (IR_REG_SPILLED(op3_reg)) {
7532			op3_reg = IR_REG_NUM(op3_reg);
7533			ir_emit_load(ctx, type, op3_reg, insn->op3);
7534		}
7535		ir_emit_store_mem_fp(ctx, type, mem, op3_reg);
7536	}
7537}
7538
7539static void ir_emit_load_int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
7540{
	ir_type type = insn->type;
7542	ir_reg op2_reg = ctx->regs[def][2];
7543	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
7544	ir_mem mem;
7545
7546	if (ctx->use_lists[def].count == 1) {
7547		/* dead load */
7548		return;
7549	}
7550	IR_ASSERT(def_reg != IR_REG_NONE);
7551	if (op2_reg != IR_REG_NONE) {
7552		if (IR_REG_SPILLED(op2_reg)) {
7553			op2_reg = IR_REG_NUM(op2_reg);
7554			IR_ASSERT(ctx->ir_base[insn->op2].type == IR_ADDR);
7555			ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2);
7556		}
7557		mem = IR_MEM_B(op2_reg);
7558	} else if (IR_IS_CONST_REF(insn->op2)) {
7559		mem = ir_fuse_addr_const(ctx, insn->op2);
7560	} else {
7561		IR_ASSERT(ir_rule(ctx, insn->op2) & IR_FUSED);
7562		mem = ir_fuse_addr(ctx, def, insn->op2);
7563		if (IR_REG_SPILLED(ctx->regs[def][0]) && ir_is_same_spill_slot(ctx, def, mem)) {
7564			if (!ir_may_avoid_spill_load(ctx, def, def)) {
7565				ir_emit_load_mem_int(ctx, type, def_reg, mem);
7566			}
7567			/* avoid load to the same location (valid only when register is not reused) */
7568			return;
7569		}
7570	}
7571
7572	ir_emit_load_mem_int(ctx, type, def_reg, mem);
7573	if (IR_REG_SPILLED(ctx->regs[def][0])) {
7574		ir_emit_store(ctx, type, def, def_reg);
7575	}
7576}
7577
7578static void ir_emit_load_fp(ir_ctx *ctx, ir_ref def, ir_insn *insn)
7579{
	ir_type type = insn->type;
7581	ir_reg op2_reg = ctx->regs[def][2];
7582	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
7583	ir_mem mem;
7584
7585	if (ctx->use_lists[def].count == 1) {
7586		/* dead load */
7587		return;
7588	}
7589	IR_ASSERT(def_reg != IR_REG_NONE);
7590	if (op2_reg != IR_REG_NONE) {
7591		if (IR_REG_SPILLED(op2_reg)) {
7592			op2_reg = IR_REG_NUM(op2_reg);
7593			IR_ASSERT(ctx->ir_base[insn->op2].type == IR_ADDR);
7594			ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2);
7595		}
7596		mem = IR_MEM_B(op2_reg);
7597	} else if (IR_IS_CONST_REF(insn->op2)) {
7598		mem = ir_fuse_addr_const(ctx, insn->op2);
7599	} else {
7600		IR_ASSERT(ir_rule(ctx, insn->op2) & IR_FUSED);
7601		mem = ir_fuse_addr(ctx, def, insn->op2);
7602		if (IR_REG_SPILLED(ctx->regs[def][0]) && ir_is_same_spill_slot(ctx, def, mem)) {
7603			if (!ir_may_avoid_spill_load(ctx, def, def)) {
7604				ir_emit_load_mem_fp(ctx, type, def_reg, mem);
7605			}
7606			/* avoid load to the same location (valid only when register is not reused) */
7607			return;
7608		}
7609	}
7610
7611	ir_emit_load_mem_fp(ctx, type, def_reg, mem);
7612	if (IR_REG_SPILLED(ctx->regs[def][0])) {
7613		ir_emit_store(ctx, type, def, def_reg);
7614	}
7615}
7616
7617static void ir_emit_store_int(ir_ctx *ctx, ir_ref ref, ir_insn *insn)
7618{
7619	ir_insn *val_insn = &ctx->ir_base[insn->op3];
	ir_type type = val_insn->type;
7621	ir_reg op2_reg = ctx->regs[ref][2];
7622	ir_reg op3_reg = ctx->regs[ref][3];
7623	ir_mem mem;
7624
7625	if (op2_reg != IR_REG_NONE) {
7626		if (IR_REG_SPILLED(op2_reg)) {
7627			op2_reg = IR_REG_NUM(op2_reg);
7628			IR_ASSERT(ctx->ir_base[insn->op2].type == IR_ADDR);
7629			ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2);
7630		}
7631		mem = IR_MEM_B(op2_reg);
7632	} else if (IR_IS_CONST_REF(insn->op2)) {
7633		mem = ir_fuse_addr_const(ctx, insn->op2);
7634	} else {
7635		IR_ASSERT(ir_rule(ctx, insn->op2) & IR_FUSED);
7636		mem = ir_fuse_addr(ctx, ref, insn->op2);
7637		if (!IR_IS_CONST_REF(insn->op3)
7638		 && IR_REG_SPILLED(op3_reg)
7639		 && ir_rule(ctx, insn->op3) != IR_STATIC_ALLOCA
7640		 && ir_is_same_spill_slot(ctx, insn->op3, mem)) {
7641			if (!ir_may_avoid_spill_load(ctx, insn->op3, ref)) {
7642				op3_reg = IR_REG_NUM(op3_reg);
7643				ir_emit_load(ctx, type, op3_reg, insn->op3);
7644			}
7645			/* avoid store to the same location */
7646			return;
7647		}
7648	}
7649
7650	if (IR_IS_CONST_REF(insn->op3)) {
7651		ir_emit_store_mem_int_const(ctx, type, mem, insn->op3, op3_reg, 0);
7652	} else {
7653		IR_ASSERT(op3_reg != IR_REG_NONE);
7654		if (IR_REG_SPILLED(op3_reg)) {
7655			op3_reg = IR_REG_NUM(op3_reg);
7656			ir_emit_load(ctx, type, op3_reg, insn->op3);
7657		}
7658		ir_emit_store_mem_int(ctx, type, mem, op3_reg);
7659	}
7660}
7661
7662static void ir_emit_cmp_and_store_int(ir_ctx *ctx, ir_ref ref, ir_insn *insn)
7663{
7664	ir_reg addr_reg = ctx->regs[ref][2];
7665	ir_mem mem;
7666	ir_insn *cmp_insn = &ctx->ir_base[insn->op3];
7667	ir_op op = cmp_insn->op;
7668	ir_type type = ctx->ir_base[cmp_insn->op1].type;
7669	ir_ref op1 = cmp_insn->op1;
7670	ir_ref op2 = cmp_insn->op2;
7671	ir_reg op1_reg = ctx->regs[insn->op3][1];
7672	ir_reg op2_reg = ctx->regs[insn->op3][2];
7673
7674	if (addr_reg != IR_REG_NONE) {
7675		if (IR_REG_SPILLED(addr_reg)) {
7676			addr_reg = IR_REG_NUM(addr_reg);
7677			IR_ASSERT(ctx->ir_base[insn->op2].type == IR_ADDR);
7678			ir_emit_load(ctx, IR_ADDR, addr_reg, insn->op2);
7679		}
7680		mem = IR_MEM_B(addr_reg);
7681	} else if (IR_IS_CONST_REF(insn->op2)) {
7682		mem = ir_fuse_addr_const(ctx, insn->op2);
7683	} else {
7684		IR_ASSERT(ir_rule(ctx, insn->op2) & IR_FUSED);
7685		mem = ir_fuse_addr(ctx, ref, insn->op2);
7686	}
7687
7688	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
7689		op1_reg = IR_REG_NUM(op1_reg);
7690		ir_emit_load(ctx, type, op1_reg, op1);
7691	}
7692	if (op2_reg != IR_REG_NONE && IR_REG_SPILLED(op2_reg)) {
7693		op2_reg = IR_REG_NUM(op2_reg);
7694		if (op1 != op2) {
7695			ir_emit_load(ctx, type, op2_reg, op2);
7696		}
7697	}
7698
7699	ir_emit_cmp_int_common(ctx, type, ref, cmp_insn, op1_reg, op1, op2_reg, op2);
7700	_ir_emit_setcc_int_mem(ctx, op, mem);
7701}
7702
7703static void ir_emit_store_fp(ir_ctx *ctx, ir_ref ref, ir_insn *insn)
7704{
	ir_type type = ctx->ir_base[insn->op3].type;
7706	ir_reg op2_reg = ctx->regs[ref][2];
7707	ir_reg op3_reg = ctx->regs[ref][3];
7708	ir_mem mem;
7709
7710	IR_ASSERT(op3_reg != IR_REG_NONE);
7711	if (op2_reg != IR_REG_NONE) {
7712		if (IR_REG_SPILLED(op2_reg)) {
7713			op2_reg = IR_REG_NUM(op2_reg);
7714			IR_ASSERT(ctx->ir_base[insn->op2].type == IR_ADDR);
7715			ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2);
7716		}
7717		mem = IR_MEM_B(op2_reg);
7718	} else if (IR_IS_CONST_REF(insn->op2)) {
7719		mem = ir_fuse_addr_const(ctx, insn->op2);
7720	} else {
7721		IR_ASSERT(ir_rule(ctx, insn->op2) & IR_FUSED);
7722		mem = ir_fuse_addr(ctx, ref, insn->op2);
7723		if (!IR_IS_CONST_REF(insn->op3)
7724		 && IR_REG_SPILLED(op3_reg)
7725		 && ir_rule(ctx, insn->op3) != IR_STATIC_ALLOCA
7726		 && ir_is_same_spill_slot(ctx, insn->op3, mem)) {
7727			if (!ir_may_avoid_spill_load(ctx, insn->op3, ref)) {
7728				op3_reg = IR_REG_NUM(op3_reg);
7729				ir_emit_load(ctx, type, op3_reg, insn->op3);
7730			}
7731			/* avoid store to the same location */
7732			return;
7733		}
7734	}
7735
7736	if (IR_IS_CONST_REF(insn->op3)) {
7737		ir_emit_store_mem_fp_const(ctx, type, mem, insn->op3, IR_REG_NONE, op3_reg);
7738	} else {
7739		IR_ASSERT(op3_reg != IR_REG_NONE);
7740		if (IR_REG_SPILLED(op3_reg)) {
7741			op3_reg = IR_REG_NUM(op3_reg);
7742			ir_emit_load(ctx, type, op3_reg, insn->op3);
7743		}
7744		ir_emit_store_mem_fp(ctx, type, mem, op3_reg);
7745	}
7746}
7747
7748static void ir_emit_rload(ir_ctx *ctx, ir_ref def, ir_insn *insn)
7749{
7750	ir_reg src_reg = insn->op2;
7751	ir_type type = insn->type;
7752
7753	if (IR_REGSET_IN(IR_REGSET_UNION((ir_regset)ctx->fixed_regset, IR_REGSET_FIXED), src_reg)) {
7754		if (ctx->vregs[def]
7755		 && ctx->live_intervals[ctx->vregs[def]]
7756		 && ctx->live_intervals[ctx->vregs[def]]->stack_spill_pos != -1) {
7757			ir_emit_store(ctx, type, def, src_reg);
7758		}
7759	} else {
7760		ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
7761
7762		if (def_reg == IR_REG_NONE) {
			/* op3 is used as a flag indicating that the value is already stored
			 * in memory, so when op3 is set we don't have to store it again
			 * (in case of spilling). */
7766			if (!insn->op3 || !ir_is_same_spill_slot(ctx, def, IR_MEM_BO(ctx->spill_base, insn->op3))) {
7767				ir_emit_store(ctx, type, def, src_reg);
7768			}
7769		} else {
7770			if (src_reg != def_reg) {
7771				if (IR_IS_TYPE_INT(type)) {
7772					ir_emit_mov(ctx, type, def_reg, src_reg);
7773				} else {
7774					IR_ASSERT(IR_IS_TYPE_FP(type));
7775					ir_emit_fp_mov(ctx, type, def_reg, src_reg);
7776				}
7777			}
7778			if (IR_REG_SPILLED(ctx->regs[def][0])
			 && (!insn->op3 || !ir_is_same_spill_slot(ctx, def, IR_MEM_BO(ctx->spill_base, insn->op3)))) {
7780				ir_emit_store(ctx, type, def, def_reg);
7781			}
7782		}
7783	}
7784}
7785
7786static void ir_emit_rstore(ir_ctx *ctx, ir_ref ref, ir_insn *insn)
7787{
	ir_type type = ctx->ir_base[insn->op2].type;
7789	ir_reg op2_reg = ctx->regs[ref][2];
7790	ir_reg dst_reg = insn->op3;
7791
7792	if (op2_reg != IR_REG_NONE) {
7793		if (IR_REG_SPILLED(op2_reg)) {
7794			op2_reg = IR_REG_NUM(op2_reg);
7795			ir_emit_load(ctx, type, op2_reg, insn->op2);
7796		}
7797		if (op2_reg != dst_reg) {
7798			if (IR_IS_TYPE_INT(type)) {
7799				ir_emit_mov(ctx, type, dst_reg, op2_reg);
7800			} else {
7801				IR_ASSERT(IR_IS_TYPE_FP(type));
7802				ir_emit_fp_mov(ctx, type, dst_reg, op2_reg);
7803			}
7804		}
7805	} else {
7806		ir_emit_load_ex(ctx, type, dst_reg, insn->op2, ref);
7807	}
7808}
7809
7810static void ir_emit_alloca(ir_ctx *ctx, ir_ref def, ir_insn *insn)
7811{
7812	ir_backend_data *data = ctx->data;
7813	dasm_State **Dst = &data->dasm_state;
7814	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
7815
7816	if (ctx->use_lists[def].count == 1) {
7817		/* dead alloca */
7818		return;
7819	}
7820	if (IR_IS_CONST_REF(insn->op2)) {
7821		ir_insn *val = &ctx->ir_base[insn->op2];
7822		int32_t size = val->val.i32;
7823
7824		IR_ASSERT(IR_IS_TYPE_INT(val->type));
7825		IR_ASSERT(!IR_IS_SYM_CONST(val->op));
7826		IR_ASSERT(IR_IS_TYPE_UNSIGNED(val->type) || val->val.i64 >= 0);
7827		IR_ASSERT(IR_IS_SIGNED_32BIT(val->val.i64));
7828
		/* The stack must be 16-byte aligned */
7830		size = IR_ALIGNED_SIZE(size, 16);
7831		|	ASM_REG_IMM_OP sub, IR_ADDR, IR_REG_RSP, size
7832		if (!(ctx->flags & IR_USE_FRAME_POINTER)) {
7833			ctx->call_stack_size += size;
7834		}
7835	} else {
7836		int32_t alignment = 16;
7837		ir_reg op2_reg = ctx->regs[def][2];
7838		ir_type type = ctx->ir_base[insn->op2].type;
7839
7840		IR_ASSERT(ctx->flags & IR_FUNCTION);
7841		IR_ASSERT(ctx->flags & IR_USE_FRAME_POINTER);
7842		IR_ASSERT(def_reg != IR_REG_NONE);
7843		if (op2_reg != IR_REG_NONE && IR_REG_SPILLED(op2_reg)) {
7844			op2_reg = IR_REG_NUM(op2_reg);
7845			ir_emit_load(ctx, type, op2_reg, insn->op2);
7846		}
7847		if (def_reg != op2_reg) {
7848			if (op2_reg != IR_REG_NONE) {
7849				ir_emit_mov(ctx, type, def_reg, op2_reg);
7850			} else {
7851				ir_emit_load(ctx, type, def_reg, insn->op2);
7852			}
7853		}
7854
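		/* Round the dynamic size up to a multiple of 16, i.e. (size + 15) & ~15
		 * (e.g. 24 -> 32), matching the IR_ALIGNED_SIZE() rounding used for the
		 * constant-size case above, then grow the stack by the result. */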
7855		|	ASM_REG_IMM_OP add, IR_ADDR, def_reg, (alignment-1)
7856		|	ASM_REG_IMM_OP and, IR_ADDR, def_reg, ~(alignment-1)
7857		|	ASM_REG_REG_OP sub, IR_ADDR, IR_REG_RSP, def_reg
7858	}
7859	if (def_reg != IR_REG_NONE) {
7860		|	mov Ra(def_reg), Ra(IR_REG_RSP)
7861		if (IR_REG_SPILLED(ctx->regs[def][0])) {
7862			ir_emit_store(ctx, insn->type, def, def_reg);
7863		}
7864	} else {
7865		ir_emit_store(ctx, IR_ADDR, def, IR_REG_STACK_POINTER);
7866	}
7867}
7868
7869static void ir_emit_afree(ir_ctx *ctx, ir_ref def, ir_insn *insn)
7870{
7871	ir_backend_data *data = ctx->data;
7872	dasm_State **Dst = &data->dasm_state;
7873
7874	if (IR_IS_CONST_REF(insn->op2)) {
7875		ir_insn *val = &ctx->ir_base[insn->op2];
7876		int32_t size = val->val.i32;
7877
7878		IR_ASSERT(IR_IS_TYPE_INT(val->type));
7879		IR_ASSERT(!IR_IS_SYM_CONST(val->op));
7880		IR_ASSERT(IR_IS_TYPE_UNSIGNED(val->type) || val->val.i64 > 0);
7881		IR_ASSERT(IR_IS_SIGNED_32BIT(val->val.i64));
7882
		/* The stack must be 16-byte aligned */
7884		size = IR_ALIGNED_SIZE(size, 16);
7885		|	ASM_REG_IMM_OP add, IR_ADDR, IR_REG_RSP, size
7886		if (!(ctx->flags & IR_USE_FRAME_POINTER)) {
7887			ctx->call_stack_size -= size;
7888		}
7889	} else {
7890//		int32_t alignment = 16;
7891		ir_reg op2_reg = ctx->regs[def][2];
7892		ir_type type = ctx->ir_base[insn->op2].type;
7893
7894		IR_ASSERT(ctx->flags & IR_FUNCTION);
7895		if (op2_reg != IR_REG_NONE && IR_REG_SPILLED(op2_reg)) {
7896			op2_reg = IR_REG_NUM(op2_reg);
7897			ir_emit_load(ctx, type, op2_reg, insn->op2);
7898		}
7899
7900		// TODO: alignment ???
7901
7902		|	ASM_REG_REG_OP add, IR_ADDR, IR_REG_RSP, op2_reg
7903	}
7904}
7905
7906static void ir_emit_block_begin(ir_ctx *ctx, ir_ref def, ir_insn *insn)
7907{
7908	ir_backend_data *data = ctx->data;
7909	dasm_State **Dst = &data->dasm_state;
7910	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
7911
7912	|	mov Ra(def_reg), Ra(IR_REG_RSP)
7913
7914	if (IR_REG_SPILLED(ctx->regs[def][0])) {
7915		ir_emit_store(ctx, IR_ADDR, def, def_reg);
7916	}
7917}
7918
7919static void ir_emit_block_end(ir_ctx *ctx, ir_ref def, ir_insn *insn)
7920{
7921	ir_backend_data *data = ctx->data;
7922	dasm_State **Dst = &data->dasm_state;
7923	ir_reg op2_reg = ctx->regs[def][2];
7924
7925	IR_ASSERT(op2_reg != IR_REG_NONE);
7926	if (IR_REG_SPILLED(op2_reg)) {
7927		op2_reg = IR_REG_NUM(op2_reg);
7928		ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2);
7929	}
7930
7931	|	mov Ra(IR_REG_RSP), Ra(op2_reg)
7932}
7933
7934static void ir_emit_frame_addr(ir_ctx *ctx, ir_ref def)
7935{
7936	ir_backend_data *data = ctx->data;
7937	dasm_State **Dst = &data->dasm_state;
7938	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
7939
7940	if (ctx->flags & IR_USE_FRAME_POINTER) {
7941		|	mov Ra(def_reg), Ra(IR_REG_RBP)
7942	} else {
7943		|	lea Ra(def_reg), [Ra(IR_REG_RSP)+(ctx->stack_frame_size + ctx->call_stack_size)]
7944	}
7945	if (IR_REG_SPILLED(ctx->regs[def][0])) {
7946		ir_emit_store(ctx, IR_ADDR, def, def_reg);
7947	}
7948}
7949
7950static void ir_emit_va_start(ir_ctx *ctx, ir_ref def, ir_insn *insn)
7951{
7952#if defined(_WIN64) || defined(IR_TARGET_X86)
7953	ir_backend_data *data = ctx->data;
7954	dasm_State **Dst = &data->dasm_state;
7955	ir_reg fp;
7956	int arg_area_offset;
7957	ir_reg op2_reg = ctx->regs[def][2];
7958	ir_reg tmp_reg = ctx->regs[def][3];
7959	int32_t offset;
7960
7961	IR_ASSERT(tmp_reg != IR_REG_NONE);
7962	if (op2_reg != IR_REG_NONE) {
7963		if (IR_REG_SPILLED(op2_reg)) {
7964			op2_reg = IR_REG_NUM(op2_reg);
7965			ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2);
7966		}
7967		offset = 0;
7968	} else {
7969		IR_ASSERT(ir_rule(ctx, insn->op2) == IR_STATIC_ALLOCA);
7970		op2_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
7971		offset = IR_SPILL_POS_TO_OFFSET(ctx->ir_base[insn->op2].op3);
7972	}
7973
7974	if (ctx->flags & IR_USE_FRAME_POINTER) {
7975		fp = IR_REG_FRAME_POINTER;
7976		arg_area_offset = sizeof(void*) * 2 + ctx->param_stack_size;
7977	} else {
7978		fp = IR_REG_STACK_POINTER;
7979		arg_area_offset = ctx->stack_frame_size + ctx->call_stack_size + sizeof(void*) + ctx->param_stack_size;
7980	}
7981	|	lea Ra(tmp_reg), aword [Ra(fp)+arg_area_offset]
7982	|	mov aword [Ra(op2_reg)+offset], Ra(tmp_reg)
7983#elif defined(IR_TARGET_X64)
7984|.if X64
7985	ir_backend_data *data = ctx->data;
7986	dasm_State **Dst = &data->dasm_state;
7987	ir_reg fp;
7988	int reg_save_area_offset;
7989	int overflow_arg_area_offset;
7990	ir_reg op2_reg = ctx->regs[def][2];
7991	ir_reg tmp_reg = ctx->regs[def][3];
7992	bool have_reg_save_area = 0;
7993	int32_t offset;
7994
7995	IR_ASSERT(tmp_reg != IR_REG_NONE);
7996	if (op2_reg != IR_REG_NONE) {
7997		if (IR_REG_SPILLED(op2_reg)) {
7998			op2_reg = IR_REG_NUM(op2_reg);
7999			ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2);
8000		}
8001		offset = 0;
8002	} else {
8003		IR_ASSERT(ir_rule(ctx, insn->op2) == IR_STATIC_ALLOCA);
8004		op2_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
8005		offset = IR_SPILL_POS_TO_OFFSET(ctx->ir_base[insn->op2].op3);
8006	}
8007
8008	if (ctx->flags & IR_USE_FRAME_POINTER) {
8009		fp = IR_REG_FRAME_POINTER;
8010		reg_save_area_offset = -(ctx->stack_frame_size - ctx->stack_frame_alignment - ctx->locals_area_size);
8011		overflow_arg_area_offset = sizeof(void*) * 2 + ctx->param_stack_size;
8012	} else {
8013		fp = IR_REG_STACK_POINTER;
8014		reg_save_area_offset = ctx->locals_area_size + ctx->call_stack_size;
8015		overflow_arg_area_offset = ctx->stack_frame_size + ctx->call_stack_size + sizeof(void*) + ctx->param_stack_size;
8016	}
8017
8018	if ((ctx->flags2 & (IR_HAS_VA_ARG_GP|IR_HAS_VA_COPY)) && ctx->gp_reg_params < IR_REG_INT_ARGS) {
8019		|	lea Ra(tmp_reg), aword [Ra(fp)+reg_save_area_offset]
8020		have_reg_save_area = 1;
8021		/* Set va_list.gp_offset */
8022		|	mov dword [Ra(op2_reg)+(offset+offsetof(ir_va_list, gp_offset))], sizeof(void*) * ctx->gp_reg_params
8023	} else {
8024		reg_save_area_offset -= sizeof(void*) * IR_REG_INT_ARGS;
8025		/* Set va_list.gp_offset */
8026		|	mov dword [Ra(op2_reg)+(offset+offsetof(ir_va_list, gp_offset))], sizeof(void*) * IR_REG_INT_ARGS
8027	}
8028	if ((ctx->flags2 & (IR_HAS_VA_ARG_FP|IR_HAS_VA_COPY)) && ctx->fp_reg_params < IR_REG_FP_ARGS) {
8029		if (!have_reg_save_area) {
8030			|	lea Ra(tmp_reg), aword [Ra(fp)+reg_save_area_offset]
8031			have_reg_save_area = 1;
8032		}
8033		/* Set va_list.fp_offset */
8034		|	mov dword [Ra(op2_reg)+(offset+offsetof(ir_va_list, fp_offset))], sizeof(void*) * IR_REG_INT_ARGS + 16 * ctx->fp_reg_params
8035	} else {
8036		/* Set va_list.fp_offset */
8037		|	mov dword [Ra(op2_reg)+(offset+offsetof(ir_va_list, fp_offset))], sizeof(void*) * IR_REG_INT_ARGS + 16 * IR_REG_FP_ARGS
8038	}
8039	if (have_reg_save_area) {
8040		/* Set va_list.reg_save_area */
8041		|	mov qword [Ra(op2_reg)+(offset+offsetof(ir_va_list, reg_save_area))], Ra(tmp_reg)
8042	}
8043	|	lea Ra(tmp_reg), aword [Ra(fp)+overflow_arg_area_offset]
8044	/* Set va_list.overflow_arg_area */
8045	|	mov qword [Ra(op2_reg)+(offset+offsetof(ir_va_list, overflow_arg_area))], Ra(tmp_reg)
8046|.endif
8047#else
8048	IR_ASSERT(0 && "NIY va_start");
8049#endif
8050}
8051
8052static void ir_emit_va_copy(ir_ctx *ctx, ir_ref def, ir_insn *insn)
8053{
8054#if defined(_WIN64) || defined(IR_TARGET_X86)
8055	ir_backend_data *data = ctx->data;
8056	dasm_State **Dst = &data->dasm_state;
8057	ir_reg tmp_reg = ctx->regs[def][1];
8058	ir_reg op2_reg = ctx->regs[def][2];
8059	ir_reg op3_reg = ctx->regs[def][3];
8060	int32_t op2_offset, op3_offset;
8061
8062	IR_ASSERT(tmp_reg != IR_REG_NONE);
8063	if (op2_reg != IR_REG_NONE) {
8064		if (IR_REG_SPILLED(op2_reg)) {
8065			op2_reg = IR_REG_NUM(op2_reg);
8066			ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2);
8067		}
8068		op2_offset = 0;
8069	} else {
8070		IR_ASSERT(ir_rule(ctx, insn->op2) == IR_STATIC_ALLOCA);
8071		op2_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
8072		op2_offset = IR_SPILL_POS_TO_OFFSET(ctx->ir_base[insn->op2].op3);
8073	}
8074	if (op3_reg != IR_REG_NONE) {
8075		if (IR_REG_SPILLED(op3_reg)) {
8076			op3_reg = IR_REG_NUM(op3_reg);
8077			ir_emit_load(ctx, IR_ADDR, op3_reg, insn->op3);
8078		}
8079		op3_offset = 0;
8080	} else {
8081		IR_ASSERT(ir_rule(ctx, insn->op3) == IR_STATIC_ALLOCA);
8082		op3_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
8083		op3_offset = IR_SPILL_POS_TO_OFFSET(ctx->ir_base[insn->op3].op3);
8084	}
8085	|	mov Ra(tmp_reg), aword [Ra(op3_reg)+op3_offset]
8086	|	mov aword [Ra(op2_reg)+op2_offset], Ra(tmp_reg)
8087#elif defined(IR_TARGET_X64)
8088|.if X64
8089	ir_backend_data *data = ctx->data;
8090	dasm_State **Dst = &data->dasm_state;
8091	ir_reg tmp_reg = ctx->regs[def][1];
8092	ir_reg op2_reg = ctx->regs[def][2];
8093	ir_reg op3_reg = ctx->regs[def][3];
8094	int32_t op2_offset, op3_offset;
8095
8096	IR_ASSERT(tmp_reg != IR_REG_NONE);
8097	if (op2_reg != IR_REG_NONE) {
8098		if (IR_REG_SPILLED(op2_reg)) {
8099			op2_reg = IR_REG_NUM(op2_reg);
8100			ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2);
8101		}
8102		op2_offset = 0;
8103	} else {
8104		IR_ASSERT(ir_rule(ctx, insn->op2) == IR_STATIC_ALLOCA);
8105		op2_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
8106		op2_offset = IR_SPILL_POS_TO_OFFSET(ctx->ir_base[insn->op2].op3);
8107	}
8108	if (op3_reg != IR_REG_NONE) {
8109		if (IR_REG_SPILLED(op3_reg)) {
8110			op3_reg = IR_REG_NUM(op3_reg);
8111			ir_emit_load(ctx, IR_ADDR, op3_reg, insn->op3);
8112		}
8113		op3_offset = 0;
8114	} else {
8115		IR_ASSERT(ir_rule(ctx, insn->op3) == IR_STATIC_ALLOCA);
8116		op3_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
8117		op3_offset = IR_SPILL_POS_TO_OFFSET(ctx->ir_base[insn->op3].op3);
8118	}
8119	|	mov Rd(tmp_reg), dword [Ra(op3_reg)+(op3_offset+offsetof(ir_va_list, gp_offset))]
8120	|	mov dword [Ra(op2_reg)+(op2_offset+offsetof(ir_va_list, gp_offset))], Rd(tmp_reg)
8121	|	mov Rd(tmp_reg), dword [Ra(op3_reg)+(op3_offset+offsetof(ir_va_list, fp_offset))]
	|	mov dword [Ra(op2_reg)+(op2_offset+offsetof(ir_va_list, fp_offset))], Rd(tmp_reg)
8123	|	mov Ra(tmp_reg), aword [Ra(op3_reg)+(op3_offset+offsetof(ir_va_list, overflow_arg_area))]
8124	|	mov aword [Ra(op2_reg)+(op2_offset+offsetof(ir_va_list, overflow_arg_area))], Ra(tmp_reg)
8125	|	mov Ra(tmp_reg), aword [Ra(op3_reg)+(op3_offset+offsetof(ir_va_list, reg_save_area))]
8126	|	mov aword [Ra(op2_reg)+(op2_offset+offsetof(ir_va_list, reg_save_area))], Ra(tmp_reg)
8127|.endif
8128#else
8129	IR_ASSERT(0 && "NIY va_copy");
8130#endif
8131}
8132
8133static void ir_emit_va_arg(ir_ctx *ctx, ir_ref def, ir_insn *insn)
8134{
8135#if defined(_WIN64) || defined(IR_TARGET_X86)
8136	ir_backend_data *data = ctx->data;
8137	dasm_State **Dst = &data->dasm_state;
8138	ir_type type = insn->type;
8139	ir_reg def_reg = ctx->regs[def][0];
8140	ir_reg op2_reg = ctx->regs[def][2];
8141	ir_reg tmp_reg = ctx->regs[def][3];
8142	int32_t offset;
8143
8144	IR_ASSERT(def_reg != IR_REG_NONE && tmp_reg != IR_REG_NONE);
8145	if (op2_reg != IR_REG_NONE) {
8146		if (IR_REG_SPILLED(op2_reg)) {
8147			op2_reg = IR_REG_NUM(op2_reg);
8148			ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2);
8149		}
8150		offset = 0;
8151	} else {
8152		IR_ASSERT(ir_rule(ctx, insn->op2) == IR_STATIC_ALLOCA);
8153		op2_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
8154		offset = IR_SPILL_POS_TO_OFFSET(ctx->ir_base[insn->op2].op3);
8155	}
8156	|	mov Ra(tmp_reg), aword [Ra(op2_reg)+offset]
8157	ir_emit_load_mem(ctx, type, def_reg, IR_MEM_B(tmp_reg));
8158	|	add Ra(tmp_reg), IR_MAX(ir_type_size[type], sizeof(void*))
8159	|	mov aword [Ra(op2_reg)+offset], Ra(tmp_reg)
8160	if (IR_REG_SPILLED(ctx->regs[def][0])) {
8161		ir_emit_store(ctx, type, def, def_reg);
8162	}
8163#elif defined(IR_TARGET_X64)
8164|.if X64
8165	ir_backend_data *data = ctx->data;
8166	dasm_State **Dst = &data->dasm_state;
8167	ir_type type = insn->type;
8168	ir_reg def_reg = ctx->regs[def][0];
8169	ir_reg op2_reg = ctx->regs[def][2];
8170	ir_reg tmp_reg = ctx->regs[def][3];
8171	int32_t offset;
8172
	IR_ASSERT(def_reg != IR_REG_NONE && tmp_reg != IR_REG_NONE);
8174	if (op2_reg != IR_REG_NONE) {
8175		if (IR_REG_SPILLED(op2_reg)) {
8176			op2_reg = IR_REG_NUM(op2_reg);
8177			ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2);
8178		}
8179		offset = 0;
8180	} else {
8181		IR_ASSERT(ir_rule(ctx, insn->op2) == IR_STATIC_ALLOCA);
8182		op2_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
8183		offset = IR_SPILL_POS_TO_OFFSET(ctx->ir_base[insn->op2].op3);
8184	}
8185	if (IR_IS_TYPE_INT(type)) {
8186		|	mov Rd(tmp_reg), dword [Ra(op2_reg)+(offset+offsetof(ir_va_list, gp_offset))]
8187		|	cmp Rd(tmp_reg), sizeof(void*)*IR_REG_INT_ARGS
8188		|	jge >1
8189		|	add Rd(tmp_reg), sizeof(void*)
8190		|	mov dword [Ra(op2_reg)+(offset+offsetof(ir_va_list, gp_offset))], Rd(tmp_reg)
8191		|	add Ra(tmp_reg), aword [Ra(op2_reg)+(offset+offsetof(ir_va_list, reg_save_area))]
8192		|	jmp >2
8193		|1:
8194		|	mov Ra(tmp_reg), aword [Ra(op2_reg)+(offset+offsetof(ir_va_list, overflow_arg_area))]
8195		|	add Ra(tmp_reg), sizeof(void*)
8196		|	mov aword [Ra(op2_reg)+(offset+offsetof(ir_va_list, overflow_arg_area))], Ra(tmp_reg)
8197		|2:
8198		|	mov Ra(def_reg), aword [Ra(tmp_reg)-sizeof(void*)]
8199	} else {
8200		|	mov Rd(tmp_reg), dword [Ra(op2_reg)+(offset+offsetof(ir_va_list, fp_offset))]
8201		|	cmp Rd(tmp_reg), sizeof(void*) * IR_REG_INT_ARGS + 16 * IR_REG_FP_ARGS
8202		|	jge >1
8203		|	add Rd(tmp_reg), 16
8204		|	mov dword [Ra(op2_reg)+(offset+offsetof(ir_va_list, fp_offset))], Rd(tmp_reg)
8205		|	add Ra(tmp_reg), aword [Ra(op2_reg)+(offset+offsetof(ir_va_list, reg_save_area))]
8206		ir_emit_load_mem_fp(ctx, type, def_reg, IR_MEM_BO(tmp_reg, -16));
8207		|	jmp >2
8208		|1:
8209		|	mov Ra(tmp_reg), aword [Ra(op2_reg)+(offset+offsetof(ir_va_list, overflow_arg_area))]
8210		ir_emit_load_mem_fp(ctx, type, def_reg, IR_MEM_BO(tmp_reg, 0));
8211		|	add Ra(tmp_reg), 8
8212		|	mov aword [Ra(op2_reg)+(offset+offsetof(ir_va_list, overflow_arg_area))], Ra(tmp_reg)
8213		|2:
8214	}
8215	if (IR_REG_SPILLED(ctx->regs[def][0])) {
8216		ir_emit_store(ctx, type, def, def_reg);
8217	}
8218|.endif
8219#else
8220	IR_ASSERT(0 && "NIY va_arg");
8221#endif
8222}
8223
8224static void ir_emit_switch(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn)
8225{
8226	ir_backend_data *data = ctx->data;
8227	dasm_State **Dst = &data->dasm_state;
8228	ir_type type;
8229	ir_block *bb;
8230	ir_insn *use_insn, *val;
8231	uint32_t n, *p, use_block;
8232	int i;
8233	int label, default_label = 0;
8234	int count = 0;
8235	ir_val min, max;
8236	ir_reg op2_reg = ctx->regs[def][2];
8237	ir_reg tmp_reg = ctx->regs[def][3];
8238
8239	type = ctx->ir_base[insn->op2].type;
8240	IR_ASSERT(tmp_reg != IR_REG_NONE);
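	/* Start min/max at the extreme opposite values (INT64_MAX/INT64_MIN for
	 * the signed case, UINT64_MAX/0 for the unsigned one) so that the first
	 * IR_CASE_VAL seen always narrows them. */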
8241	if (IR_IS_TYPE_SIGNED(type)) {
8242		min.u64 = 0x7fffffffffffffff;
8243		max.u64 = 0x8000000000000000;
8244	} else {
8245		min.u64 = 0xffffffffffffffff;
8246		max.u64 = 0x0;
8247	}
8248
8249	bb = &ctx->cfg_blocks[b];
8250	p = &ctx->cfg_edges[bb->successors];
8251	for (n = bb->successors_count; n != 0; p++, n--) {
8252		use_block = *p;
8253		use_insn = &ctx->ir_base[ctx->cfg_blocks[use_block].start];
8254		if (use_insn->op == IR_CASE_VAL) {
8255			val = &ctx->ir_base[use_insn->op2];
8256			IR_ASSERT(!IR_IS_SYM_CONST(val->op));
8257			if (IR_IS_TYPE_SIGNED(type)) {
8258				IR_ASSERT(IR_IS_TYPE_SIGNED(val->type));
8259				min.i64 = IR_MIN(min.i64, val->val.i64);
8260				max.i64 = IR_MAX(max.i64, val->val.i64);
8261			} else {
8262				IR_ASSERT(!IR_IS_TYPE_SIGNED(val->type));
8263				min.u64 = (int64_t)IR_MIN(min.u64, val->val.u64);
8264				max.u64 = (int64_t)IR_MAX(max.u64, val->val.u64);
8265			}
8266			count++;
8267		} else {
8268			IR_ASSERT(use_insn->op == IR_CASE_DEFAULT);
8269			default_label = ir_skip_empty_target_blocks(ctx, use_block);
8270		}
8271	}
8272
8273	IR_ASSERT(op2_reg != IR_REG_NONE);
8274	if (IR_REG_SPILLED(op2_reg)) {
8275		op2_reg = IR_REG_NUM(op2_reg);
8276		ir_emit_load(ctx, type, op2_reg, insn->op2);
8277	}
8278
	/* Generate a table jmp or a sequence of compares and conditional jumps */
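	/* The table form is used only when there are more than two cases and the
	 * value range is dense enough: (max - min) < count * 8 bounds the table at
	 * eight slots per real case.  E.g. for cases {10, 12, 17}: count = 3,
	 * range = 7 < 24, so an 8-entry table is emitted. */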
8280	if (count > 2 && (max.i64-min.i64) < count * 8) {
8281		int *labels = ir_mem_malloc(sizeof(int) * (size_t)(max.i64 - min.i64 + 1));
8282
8283		for (i = 0; i <= (max.i64 - min.i64); i++) {
8284			labels[i] = default_label;
8285		}
8286		p = &ctx->cfg_edges[bb->successors];
8287		for (n = bb->successors_count; n != 0; p++, n--) {
8288			use_block = *p;
8289			use_insn = &ctx->ir_base[ctx->cfg_blocks[use_block].start];
8290			if (use_insn->op == IR_CASE_VAL) {
8291				val = &ctx->ir_base[use_insn->op2];
8292				IR_ASSERT(!IR_IS_SYM_CONST(val->op));
8293				label = ir_skip_empty_target_blocks(ctx, use_block);
8294				labels[val->val.i64 - min.i64] = label;
8295			}
8296		}
8297
8298		switch (ir_type_size[type]) {
8299			default:
8300				IR_ASSERT(0 && "Unsupported type size");
8301			case 1:
8302				if (IR_IS_TYPE_SIGNED(type)) {
8303					|	movsx Ra(op2_reg), Rb(op2_reg)
8304				} else {
8305					|	movzx Ra(op2_reg), Rb(op2_reg)
8306				}
8307				break;
8308			case 2:
8309				if (IR_IS_TYPE_SIGNED(type)) {
8310					|	movsx Ra(op2_reg), Rw(op2_reg)
8311				} else {
8312					|	movzx Ra(op2_reg), Rw(op2_reg)
8313				}
8314				break;
8315			case 4:
8316|.if X64
8317				if (IR_IS_TYPE_SIGNED(type)) {
8318					|	movsxd Ra(op2_reg), Rd(op2_reg)
8319				} else {
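					/* a 32-bit mov implicitly zero-extends to the full 64-bit register */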
8320					|	mov Rd(op2_reg), Rd(op2_reg)
8321				}
8322				break;
8323||			case 8:
8324|.endif
8325				break;
8326		}
8327
8328		if (min.i64 != 0) {
8329			int64_t offset = -min.i64;
8330
8331			if (IR_IS_SIGNED_32BIT(offset)) {
8332				|	lea Ra(tmp_reg), [Ra(op2_reg)+(int32_t)offset]
8333			} else {
8334				IR_ASSERT(sizeof(void*) == 8);
8335|.if X64
8336				|	mov64 Rq(tmp_reg), offset
8337				|	add Ra(tmp_reg), Ra(op2_reg)
8338|.endif
8339			}
8340			if (default_label) {
8341				offset = max.i64 - min.i64;
8342
8343				IR_ASSERT(IR_IS_SIGNED_32BIT(offset));
8344				|	cmp Ra(tmp_reg), (int32_t)offset
8345				|	ja =>default_label
8346			}
8347|.if X64
8348			if (ctx->code_buffer
8349			 && IR_IS_SIGNED_32BIT((char*)ctx->code_buffer->start)
8350			 && IR_IS_SIGNED_32BIT((char*)ctx->code_buffer->end)) {
8351				|	jmp aword [Ra(tmp_reg)*8+>1]
8352			} else {
8353				int64_t offset = -min.i64;
8354
8355				IR_ASSERT(IR_IS_SIGNED_32BIT(offset));
8356				offset *= 8;
8357				IR_ASSERT(IR_IS_SIGNED_32BIT(offset));
8358				|	lea Ra(tmp_reg), aword [>1]
8359				|	jmp aword [Ra(tmp_reg)+Ra(op2_reg)*8+offset]
8360			}
8361|.else
8362			|	jmp aword [Ra(tmp_reg)*4+>1]
8363|.endif
8364		} else {
8365			if (default_label) {
8366				int64_t offset = max.i64;
8367
8368				IR_ASSERT(IR_IS_SIGNED_32BIT(offset));
8369				|	cmp Ra(op2_reg), (int32_t)offset
8370				|	ja =>default_label
8371			}
8372|.if X64
8373			if (ctx->code_buffer
8374			 && IR_IS_SIGNED_32BIT((char*)ctx->code_buffer->start)
8375			 && IR_IS_SIGNED_32BIT((char*)ctx->code_buffer->end)) {
8376				|	jmp aword [Ra(op2_reg)*8+>1]
8377			} else {
8378				|	lea Ra(tmp_reg), aword [>1]
8379				|	jmp aword [Ra(tmp_reg)+Ra(op2_reg)*8]
8380			}
8381|.else
8382			|	jmp aword [Ra(op2_reg)*4+>1]
8383|.endif
8384		}
8385
8386		|.jmp_table
8387		if (!data->jmp_table_label) {
8388			data->jmp_table_label = ctx->cfg_blocks_count + ctx->consts_count + 3;
8389			|=>data->jmp_table_label:
8390		}
8391		|.align aword
8392		|1:
8393		for (i = 0; i <= (max.i64 - min.i64); i++) {
8394			int b = labels[i];
8395			if (b) {
8396				ir_block *bb = &ctx->cfg_blocks[b];
8397				ir_insn *insn = &ctx->ir_base[bb->end];
8398
8399				if (insn->op == IR_IJMP && IR_IS_CONST_REF(insn->op2)) {
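					/* The target block merely IJMPs to a constant address: put that
					 * address into the table directly (skipping a SNAPSHOT, if any)
					 * and mark the block empty, saving an intermediate jmp. */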
8400					ir_ref prev = ctx->prev_ref[bb->end];
8401					if (prev != bb->start && ctx->ir_base[prev].op == IR_SNAPSHOT) {
8402						prev = ctx->prev_ref[prev];
8403					}
8404					if (prev == bb->start) {
8405						void *addr = ir_jmp_addr(ctx, insn, &ctx->ir_base[insn->op2]);
8406
8407						|	.aword &addr
8408						if (ctx->ir_base[bb->start].op != IR_CASE_DEFAULT) {
8409							bb->flags |= IR_BB_EMPTY;
8410						}
8411						continue;
8412					}
8413				}
8414				|	.aword =>b
8415			} else {
8416				|	.aword 0
8417			}
8418		}
8419		|.code
8420		ir_mem_free(labels);
8421	} else {
8422		p = &ctx->cfg_edges[bb->successors];
8423		for (n = bb->successors_count; n != 0; p++, n--) {
8424			use_block = *p;
8425			use_insn = &ctx->ir_base[ctx->cfg_blocks[use_block].start];
8426			if (use_insn->op == IR_CASE_VAL) {
8427				val = &ctx->ir_base[use_insn->op2];
8428				IR_ASSERT(!IR_IS_SYM_CONST(val->op));
8429				label = ir_skip_empty_target_blocks(ctx, use_block);
8430				if (IR_IS_32BIT(type, val->val)) {
8431					|	ASM_REG_IMM_OP cmp, type, op2_reg, val->val.i32
8432				} else {
8433					IR_ASSERT(sizeof(void*) == 8);
8434|.if X64
8435					|	mov64 Ra(tmp_reg), val->val.i64
8436					|	ASM_REG_REG_OP cmp, type, op2_reg, tmp_reg
8437|.endif
8438				}
8439				|	je =>label
8440			}
8441		}
8442		if (default_label) {
8443			|	jmp =>default_label
8444		}
8445	}
8446}
8447
8448static int32_t ir_call_used_stack(ir_ctx *ctx, ir_insn *insn)
8449{
8450	int j, n;
8451	ir_type type;
8452	int int_param = 0;
8453	int fp_param = 0;
8454	int int_reg_params_count = IR_REG_INT_ARGS;
8455	int fp_reg_params_count = IR_REG_FP_ARGS;
8456	int32_t used_stack = 0;
8457
8458#ifdef IR_HAVE_FASTCALL
8459	if (sizeof(void*) == 4 && ir_is_fastcall(ctx, insn)) {
8460		int_reg_params_count = IR_REG_INT_FCARGS;
8461		fp_reg_params_count = IR_REG_FP_FCARGS;
8462	}
8463#endif
8464
8465	n = insn->inputs_count;
8466	for (j = 3; j <= n; j++) {
8467		type = ctx->ir_base[ir_insn_op(insn, j)].type;
8468		if (IR_IS_TYPE_INT(type)) {
8469			if (int_param >= int_reg_params_count) {
8470				used_stack += IR_MAX(sizeof(void*), ir_type_size[type]);
8471			}
8472			int_param++;
8473#ifdef _WIN64
			/* The WIN64 calling convention uses a common counter for int and fp registers */
8475			fp_param++;
8476#endif
8477		} else {
8478			IR_ASSERT(IR_IS_TYPE_FP(type));
8479			if (fp_param >= fp_reg_params_count) {
8480				used_stack += IR_MAX(sizeof(void*), ir_type_size[type]);
8481			}
8482			fp_param++;
8483#ifdef _WIN64
			/* The WIN64 calling convention uses a common counter for int and fp registers */
8485			int_param++;
8486#endif
8487		}
8488	}
8489
8490	/* Reserved "home space" or "shadow store" for register arguments (used in Windows64 ABI) */
8491	used_stack += IR_SHADOW_ARGS;
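	/* On Windows x64, IR_SHADOW_ARGS is the 32-byte home area (four 8-byte
	 * slots, reserved even when fewer than four arguments are passed); on
	 * other targets it is 0 and this addition is a no-op. */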
8492
8493	return used_stack;
8494}
8495
8496static int32_t ir_emit_arguments(ir_ctx *ctx, ir_ref def, ir_insn *insn, ir_reg tmp_reg)
8497{
8498	ir_backend_data *data = ctx->data;
8499	dasm_State **Dst = &data->dasm_state;
8500	int j, n;
8501	ir_ref arg;
8502	ir_insn *arg_insn;
8503	uint8_t type;
8504	ir_reg src_reg, dst_reg;
8505	int int_param = 0;
8506	int fp_param = 0;
8507	int count = 0;
8508	int int_reg_params_count = IR_REG_INT_ARGS;
8509	int fp_reg_params_count = IR_REG_FP_ARGS;
8510	const int8_t *int_reg_params = _ir_int_reg_params;
8511	const int8_t *fp_reg_params = _ir_fp_reg_params;
8512	int32_t used_stack, stack_offset = IR_SHADOW_ARGS;
8513	ir_copy *copies;
8514	bool do_pass3 = 0;
8515	/* For temporaries we may use any scratch registers except for registers used for parameters */
8516	ir_reg tmp_fp_reg = IR_REG_FP_LAST; /* Temporary register for FP loads and swap */
8517
8518	n = insn->inputs_count;
8519	if (n < 3) {
8520		return 0;
8521	}
8522
8523	if (tmp_reg == IR_REG_NONE) {
8524		tmp_reg = IR_REG_RAX;
8525	}
8526
8527#ifdef IR_HAVE_FASTCALL
8528	if (sizeof(void*) == 4 && ir_is_fastcall(ctx, insn)) {
8529		int_reg_params_count = IR_REG_INT_FCARGS;
8530		fp_reg_params_count = IR_REG_FP_FCARGS;
8531		int_reg_params = _ir_int_fc_reg_params;
8532		fp_reg_params = _ir_fp_fc_reg_params;
8533	}
8534#endif
8535
8536	if (insn->op == IR_CALL
8537	 && (ctx->flags & IR_PREALLOCATED_STACK)
8538#ifdef IR_HAVE_FASTCALL
8539	 && !ir_is_fastcall(ctx, insn) /* fast call functions restore stack pointer */
8540#endif
8541	) {
8542		// TODO: support for preallocated stack
8543		used_stack = 0;
8544	} else {
8545		used_stack = ir_call_used_stack(ctx, insn);
8546		if (IR_SHADOW_ARGS
8547		 && insn->op == IR_TAILCALL
8548		 && used_stack == IR_SHADOW_ARGS) {
8549			used_stack = 0;
8550		}
8551		if (ctx->fixed_call_stack_size
8552		 && used_stack <= ctx->fixed_call_stack_size
8553#ifdef IR_HAVE_FASTCALL
8554		 && !ir_is_fastcall(ctx, insn) /* fast call functions restore stack pointer */
8555#endif
8556		) {
8557			used_stack = 0;
8558		} else {
			/* The stack must be 16-byte aligned */
8560			int32_t aligned_stack = IR_ALIGNED_SIZE(used_stack, 16);
8561			ctx->call_stack_size += aligned_stack;
8562			if (aligned_stack) {
8563				|	sub Ra(IR_REG_RSP), aligned_stack
8564			}
8565		}
8566	}
8567
8568	/* 1. move all register arguments that should be passed through stack
8569	 *    and collect arguments that should be passed through registers */
8570	copies = ir_mem_malloc((n - 2) * sizeof(ir_copy));
8571	for (j = 3; j <= n; j++) {
8572		arg = ir_insn_op(insn, j);
8573		src_reg = ir_get_alocated_reg(ctx, def, j);
8574		arg_insn = &ctx->ir_base[arg];
8575		type = arg_insn->type;
8576		if (IR_IS_TYPE_INT(type)) {
8577			if (int_param < int_reg_params_count) {
8578				dst_reg = int_reg_params[int_param];
8579			} else {
8580				dst_reg = IR_REG_NONE; /* pass argument through stack */
8581			}
8582			int_param++;
8583#ifdef _WIN64
			/* The WIN64 calling convention uses a common counter for int and fp registers */
8585			fp_param++;
8586#endif
8587		} else {
8588			IR_ASSERT(IR_IS_TYPE_FP(type));
8589			if (fp_param < fp_reg_params_count) {
8590				dst_reg = fp_reg_params[fp_param];
8591			} else {
8592				dst_reg = IR_REG_NONE; /* pass argument through stack */
8593			}
8594			fp_param++;
8595#ifdef _WIN64
			/* The WIN64 calling convention uses a common counter for int and fp registers */
8597			int_param++;
8598#endif
8599		}
8600		if (dst_reg != IR_REG_NONE) {
8601			if (src_reg == IR_REG_NONE) {
8602				/* delay CONST->REG and MEM->REG moves to third pass */
8603				do_pass3 = 1;
8604			} else {
8605				if (IR_REG_SPILLED(src_reg)) {
8606					src_reg = IR_REG_NUM(src_reg);
8607					ir_emit_load(ctx, type, src_reg, arg);
8608				}
8609				if (src_reg != dst_reg) {
8610					/* delay REG->REG moves to second pass */
8611					copies[count].type = type;
8612					copies[count].from = src_reg;
8613					copies[count].to = dst_reg;
8614					count++;
8615				}
8616			}
8617		} else {
8618			/* Pass register arguments to stack (REG->MEM moves) */
8619			if (!IR_IS_CONST_REF(arg) && src_reg != IR_REG_NONE && !IR_REG_SPILLED(src_reg)) {
8620				ir_emit_store_mem(ctx, type, IR_MEM_BO(IR_REG_STACK_POINTER, stack_offset), src_reg);
8621			} else {
8622				do_pass3 = 1;
8623			}
8624			stack_offset += IR_MAX(sizeof(void*), ir_type_size[type]);
8625		}
8626	}
8627
	/* 2. move all arguments that should be passed from one register to another (REG->REG moves) */
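	/* A parallel copy resolves inter-move dependencies: e.g. the pair
	 * {rdi -> rsi, rsi -> rdi} would clobber a source if emitted naively, so
	 * cycles are broken through tmp_reg / tmp_fp_reg. */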
8629	if (count) {
8630		ir_parallel_copy(ctx, copies, count, tmp_reg, tmp_fp_reg);
8631	}
8632	ir_mem_free(copies);
8633
8634	/* 3. move the remaining memory and immediate values */
8635	if (do_pass3) {
8636		stack_offset = IR_SHADOW_ARGS;
8637		int_param = 0;
8638		fp_param = 0;
8639		for (j = 3; j <= n; j++) {
8640			arg = ir_insn_op(insn, j);
8641			src_reg = ir_get_alocated_reg(ctx, def, j);
8642			arg_insn = &ctx->ir_base[arg];
8643			type = arg_insn->type;
8644			if (IR_IS_TYPE_INT(type)) {
8645				if (int_param < int_reg_params_count) {
8646					dst_reg = int_reg_params[int_param];
8647				} else {
8648					dst_reg = IR_REG_NONE; /* argument already passed through stack */
8649				}
8650				int_param++;
8651#ifdef _WIN64
				/* The WIN64 calling convention uses a common counter for int and fp registers */
8653				fp_param++;
8654#endif
8655			} else {
8656				IR_ASSERT(IR_IS_TYPE_FP(type));
8657				if (fp_param < fp_reg_params_count) {
8658					dst_reg = fp_reg_params[fp_param];
8659				} else {
8660					dst_reg = IR_REG_NONE; /* argument already passed through stack */
8661				}
8662				fp_param++;
8663#ifdef _WIN64
				/* The WIN64 calling convention uses a common counter for int and fp registers */
8665				int_param++;
8666#endif
8667			}
8668			if (dst_reg != IR_REG_NONE) {
8669				if (src_reg == IR_REG_NONE) {
8670					if (IR_IS_TYPE_INT(type)) {
8671						if (IR_IS_CONST_REF(arg)) {
8672							if (type == IR_I8 || type == IR_I16) {
8673								type = IR_I32;
8674							} else if (type == IR_U8 || type == IR_U16) {
8675								type = IR_U32;
8676							}
8677							ir_emit_load(ctx, type, dst_reg, arg);
8678						} else if (ctx->vregs[arg]) {
8679							ir_mem mem = ir_ref_spill_slot(ctx, arg);
8680
8681							if (ir_type_size[type] > 2) {
8682								ir_emit_load_mem_int(ctx, type, dst_reg, mem);
8683							} else if (ir_type_size[type] == 2) {
8684								if (type == IR_I16) {
8685									|	ASM_TXT_TMEM_OP movsx, Rd(dst_reg), word, mem
8686								} else {
8687									|	ASM_TXT_TMEM_OP movzx, Rd(dst_reg), word, mem
8688								}
8689							} else {
8690								IR_ASSERT(ir_type_size[type] == 1);
8691								if (type == IR_I8) {
8692									|	ASM_TXT_TMEM_OP movsx, Rd(dst_reg), byte, mem
8693								} else {
8694									|	ASM_TXT_TMEM_OP movzx, Rd(dst_reg), byte, mem
8695								}
8696							}
8697						} else {
8698							ir_load_local_addr(ctx, dst_reg, arg);
8699						}
8700					} else {
8701						ir_emit_load(ctx, type, dst_reg, arg);
8702					}
8703				}
8704			} else {
8705				ir_mem mem = IR_MEM_BO(IR_REG_STACK_POINTER, stack_offset);
8706
8707				if (IR_IS_TYPE_INT(type)) {
8708					if (IR_IS_CONST_REF(arg)) {
8709						ir_emit_store_mem_int_const(ctx, type, mem, arg, tmp_reg, 1);
8710					} else if (src_reg == IR_REG_NONE) {
8711						IR_ASSERT(tmp_reg != IR_REG_NONE);
8712						ir_emit_load(ctx, type, tmp_reg, arg);
8713						ir_emit_store_mem_int(ctx, type, mem, tmp_reg);
8714					} else if (IR_REG_SPILLED(src_reg)) {
8715						src_reg = IR_REG_NUM(src_reg);
8716						ir_emit_load(ctx, type, src_reg, arg);
8717						ir_emit_store_mem_int(ctx, type, mem, src_reg);
8718					}
8719				} else {
8720					if (IR_IS_CONST_REF(arg)) {
8721						ir_emit_store_mem_fp_const(ctx, type, mem, arg, tmp_reg, tmp_fp_reg);
8722					} else if (src_reg == IR_REG_NONE) {
8723						IR_ASSERT(tmp_fp_reg != IR_REG_NONE);
8724						ir_emit_load(ctx, type, tmp_fp_reg, arg);
8725						ir_emit_store_mem_fp(ctx, IR_DOUBLE, mem, tmp_fp_reg);
8726					} else if (IR_REG_SPILLED(src_reg)) {
8727						src_reg = IR_REG_NUM(src_reg);
8728						ir_emit_load(ctx, type, src_reg, arg);
8729						ir_emit_store_mem_fp(ctx, type, mem, src_reg);
8730					}
8731				}
8732				stack_offset += IR_MAX(sizeof(void*), ir_type_size[type]);
8733			}
8734		}
8735	}
8736
8737#ifdef _WIN64
8738	/* the WIN64 calling convention requires duplicating parameters passed in FP registers into GP ones */
8739	if (ir_is_vararg(ctx, insn)) {
8740		n = IR_MIN(n, IR_MAX_REG_ARGS + 2);
8741		for (j = 3; j <= n; j++) {
8742			arg = ir_insn_op(insn, j);
8743			arg_insn = &ctx->ir_base[arg];
8744			type = arg_insn->type;
8745			if (IR_IS_TYPE_FP(type)) {
8746				src_reg = fp_reg_params[j-3];
8747				dst_reg = int_reg_params[j-3];
8748|.if X64
8749				if (ctx->mflags & IR_X86_AVX) {
8750					|	vmovd Rq(dst_reg), xmm(src_reg-IR_REG_FP_FIRST)
8751				} else {
8752					|	movd Rq(dst_reg), xmm(src_reg-IR_REG_FP_FIRST)
8753				}
8754|.endif
8755			}
8756		}
8757	}
8758#endif
8759#ifdef IR_REG_VARARG_FP_REGS
8760	/* set hidden argument to specify the number of vector registers used */
8761	if (ir_is_vararg(ctx, insn)) {
8762		fp_param = IR_MIN(fp_param, fp_reg_params_count);
8763		|	mov Rd(IR_REG_VARARG_FP_REGS), fp_param
8764	}
8765#endif
8766
8767	return used_stack;
8768}
8769
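/* Emit the CALL itself: directly, when the constant target is within 32-bit
 * range of the code buffer; through a temporary register (RAX, or R11 when
 * RAX already carries the SysV vararg vector-register count) for far 64-bit
 * targets; or through a register/memory operand for indirect targets. Then
 * release the stack space reserved for the arguments (for fastcall targets
 * the callee pops its own arguments, so only the alignment padding is
 * released) and move the return value into the register or spill slot
 * assigned to the result. */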
8770static void ir_emit_call_ex(ir_ctx *ctx, ir_ref def, ir_insn *insn, int32_t used_stack)
8771{
8772	ir_backend_data *data = ctx->data;
8773	dasm_State **Dst = &data->dasm_state;
8774	ir_reg def_reg;
8775
8776	if (IR_IS_CONST_REF(insn->op2)) {
8777		void *addr = ir_call_addr(ctx, insn, &ctx->ir_base[insn->op2]);
8778
8779		if (sizeof(void*) == 4 || IR_MAY_USE_32BIT_ADDR(ctx->code_buffer, addr)) {
8780			|	call aword &addr
8781		} else {
8782|.if X64
8783||			ir_reg tmp_reg = IR_REG_RAX;
8784
8785#ifdef IR_REG_VARARG_FP_REGS
8786||			if (ir_is_vararg(ctx, insn)) {
8787||				tmp_reg = IR_REG_R11;
8788||			}
8789#endif
8790||			if (IR_IS_SIGNED_32BIT(addr)) {
8791				|	mov Rq(tmp_reg), ((ptrdiff_t)addr)    // 0x48 0xc7 0xc0 <imm-32-bit>
8792||			} else {
8793				|	mov64 Rq(tmp_reg), ((ptrdiff_t)addr)  // 0x48 0xb8 <imm-64-bit>
8794||			}
8795			|	call Rq(tmp_reg)
8796|.endif
8797		}
8798	} else {
8799		ir_reg op2_reg = ctx->regs[def][2];
8800
8801		if (op2_reg != IR_REG_NONE) {
8802			if (IR_REG_SPILLED(op2_reg)) {
8803				op2_reg = IR_REG_NUM(op2_reg);
8804				ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2);
8805			}
8806			|	call Ra(op2_reg)
8807		} else {
8808			ir_mem mem;
8809
8810			if (ir_rule(ctx, insn->op2) & IR_FUSED) {
8811				mem = ir_fuse_load(ctx, def, insn->op2);
8812			} else {
8813				mem = ir_ref_spill_slot(ctx, insn->op2);
8814			}
8815
8816			|	ASM_TMEM_OP call, aword, mem
8817		}
8818	}
8819
8820	if (used_stack) {
8821		int32_t aligned_stack = IR_ALIGNED_SIZE(used_stack, 16);
8822
8823		ctx->call_stack_size -= aligned_stack;
8824		if (ir_is_fastcall(ctx, insn)) {
8825			aligned_stack -= used_stack;
8826			if (aligned_stack) {
8827				|	add Ra(IR_REG_RSP), aligned_stack
8828			}
8829		} else {
8830			|	add Ra(IR_REG_RSP), aligned_stack
8831		}
8832	}
8833
8834	if (insn->type != IR_VOID) {
8835		if (IR_IS_TYPE_INT(insn->type)) {
8836			def_reg = IR_REG_NUM(ctx->regs[def][0]);
8837			if (def_reg != IR_REG_NONE) {
8838				if (def_reg != IR_REG_INT_RET1) {
8839					ir_emit_mov(ctx, insn->type, def_reg, IR_REG_INT_RET1);
8840				}
8841				if (IR_REG_SPILLED(ctx->regs[def][0])) {
8842					ir_emit_store(ctx, insn->type, def, def_reg);
8843				}
8844			} else if (ctx->use_lists[def].count > 1) {
8845				ir_emit_store(ctx, insn->type, def, IR_REG_INT_RET1);
8846			}
8847		} else {
8848			IR_ASSERT(IR_IS_TYPE_FP(insn->type));
8849			def_reg = IR_REG_NUM(ctx->regs[def][0]);
8850#ifdef IR_REG_FP_RET1
8851			if (def_reg != IR_REG_NONE) {
8852				if (def_reg != IR_REG_FP_RET1) {
8853					ir_emit_fp_mov(ctx, insn->type, def_reg, IR_REG_FP_RET1);
8854				}
8855				if (IR_REG_SPILLED(ctx->regs[def][0])) {
8856					ir_emit_store(ctx, insn->type, def, def_reg);
8857				}
8858			} else if (ctx->use_lists[def].count > 1) {
8859				ir_emit_store(ctx, insn->type, def, IR_REG_FP_RET1);
8860			}
8861#else
8862			if (ctx->use_lists[def].count > 1) {
8863				int32_t offset;
8864				ir_reg fp;
8865
8866				if (def_reg == IR_REG_NONE) {
8867					offset = ir_ref_spill_slot_offset(ctx, def, &fp);
8868					if (insn->type == IR_DOUBLE) {
8869						|	fstp qword [Ra(fp)+offset]
8870					} else {
8871						IR_ASSERT(insn->type == IR_FLOAT);
8872						|	fstp dword [Ra(fp)+offset]
8873					}
8874				} else {
8875					offset = ctx->ret_slot;
8876					IR_ASSERT(offset != -1);
8877					offset = IR_SPILL_POS_TO_OFFSET(offset);
8878					fp = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
8879					if (insn->type == IR_DOUBLE) {
8880						|	fstp qword [Ra(fp)+offset]
8881					} else {
8882						IR_ASSERT(insn->type == IR_FLOAT);
8883						|	fstp dword [Ra(fp)+offset]
8884					}
8885					ir_emit_load_mem_fp(ctx, insn->type, def_reg, IR_MEM_BO(fp, offset));
8886					if (IR_REG_SPILLED(ctx->regs[def][0])) {
8887						ir_emit_store(ctx, insn->type, def, def_reg);
8888					}
8889				}
8890			}
8891#endif
8892		}
8893	}
8894}
8895
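/* Emit a regular call: pass the arguments, then the call itself. */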
8896static void ir_emit_call(ir_ctx *ctx, ir_ref def, ir_insn *insn)
8897{
8898	int32_t used_stack = ir_emit_arguments(ctx, def, insn, ctx->regs[def][1]);
8899	ir_emit_call_ex(ctx, def, insn, used_stack);
8900}
8901
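/* Emit a tail call. If the callee takes stack arguments, fall back to a
 * regular call followed by a return; otherwise tear down the frame and jump
 * straight to the target. */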
8902static void ir_emit_tailcall(ir_ctx *ctx, ir_ref def, ir_insn *insn)
8903{
8904	ir_backend_data *data = ctx->data;
8905	dasm_State **Dst = &data->dasm_state;
8906	int32_t used_stack = ir_emit_arguments(ctx, def, insn, ctx->regs[def][1]);
8907
8908	if (used_stack != 0) {
8909		ir_emit_call_ex(ctx, def, insn, used_stack);
8910		ir_emit_return_void(ctx);
8911		return;
8912	}
8913
8914	ir_emit_epilogue(ctx);
8915
8916	if (IR_IS_CONST_REF(insn->op2)) {
8917		void *addr = ir_call_addr(ctx, insn, &ctx->ir_base[insn->op2]);
8918
8919		if (sizeof(void*) == 4 || IR_MAY_USE_32BIT_ADDR(ctx->code_buffer, addr)) {
8920			|	jmp aword &addr
8921		} else {
8922|.if X64
8923||			ir_reg tmp_reg = IR_REG_RAX;
8924
8925#ifdef IR_REG_VARARG_FP_REGS
8926||			if (ir_is_vararg(ctx, insn)) {
8927||				tmp_reg = IR_REG_R11;
8928||			}
8929#endif
8930||			if (IR_IS_SIGNED_32BIT(addr)) {
8931				|	mov Rq(tmp_reg), ((ptrdiff_t)addr)    // 0x48 0xc7 0xc0 <imm-32-bit>
8932||			} else {
8933				|	mov64 Rq(tmp_reg), ((ptrdiff_t)addr)  // 0x48 0xb8 <imm-64-bit>
8934||			}
8935			|	jmp Rq(tmp_reg)
8936|.endif
8937		}
8938	} else {
8939		ir_reg op2_reg = ctx->regs[def][2];
8940
8941		if (op2_reg != IR_REG_NONE) {
8942			if (IR_REG_SPILLED(op2_reg)) {
8943				op2_reg = IR_REG_NUM(op2_reg);
8944				ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2);
8945			}
8946			|	jmp Ra(op2_reg)
8947		} else {
8948			ir_mem mem;
8949
8950			if (ir_rule(ctx, insn->op2) & IR_FUSED) {
8951				mem = ir_fuse_load(ctx, def, insn->op2);
8952			} else {
8953				mem = ir_ref_spill_slot(ctx, insn->op2);
8954			}
8955			|	ASM_TMEM_OP jmp, aword, mem
8956		}
8957	}
8958}
8959
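/* Emit an indirect jump through op2: a constant address, a fused memory
 * operand, a register, or a spill slot. */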
8960static void ir_emit_ijmp(ir_ctx *ctx, ir_ref def, ir_insn *insn)
8961{
8962	ir_backend_data *data = ctx->data;
8963	dasm_State **Dst = &data->dasm_state;
8964	ir_reg op2_reg = ctx->regs[def][2];
8965
8966	if (IR_IS_CONST_REF(insn->op2)) {
8967		void *addr = ir_jmp_addr(ctx, insn, &ctx->ir_base[insn->op2]);
8968
8969		if (sizeof(void*) == 4 || IR_MAY_USE_32BIT_ADDR(ctx->code_buffer, addr)) {
8970			|	jmp aword &addr
8971		} else {
8972|.if X64
8973			if (IR_IS_SIGNED_32BIT(addr)) {
8974				|	mov rax, ((ptrdiff_t)addr)    // 0x48 0xc7 0xc0 <imm-32-bit>
8975			} else {
8976				|	mov64 rax, ((ptrdiff_t)addr)  // 0x48 0xb8 <imm-64-bit>
8977			}
8978			|	jmp rax
8979|.endif
8980		}
8981	} else if (ir_rule(ctx, insn->op2) & IR_FUSED) {
8982		ir_mem mem = ir_fuse_load(ctx, def, insn->op2);
8983		|	ASM_TMEM_OP jmp, aword, mem
8984	} else if (op2_reg != IR_REG_NONE) {
8985		if (IR_REG_SPILLED(op2_reg)) {
8986			op2_reg = IR_REG_NUM(op2_reg);
8987			ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2);
8988		}
8989		|	jmp Ra(op2_reg)
8990	} else {
8991		ir_mem mem = ir_ref_spill_slot(ctx, insn->op2);
8992
8993		|	ASM_TMEM_OP jmp, aword, mem
8994	}
8995}
8996
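/* Emit the conditional branch of a guard to the side-exit address "addr".
 * When the guard is followed by an END/LOOP_END or a constant IJMP, the
 * condition is inverted and the guard is merged with the subsequent jump
 * into a conditional branch plus "jmp &addr"; in that case 1 is returned to
 * tell the caller that the rest of the block is already handled. For FP
 * comparisons the parity flag (set for unordered operands, i.e. NaN) is
 * taken into account as well. */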
8997static bool ir_emit_guard_jcc(ir_ctx *ctx, uint32_t b, ir_ref def, uint32_t next_block, uint8_t op, void *addr, bool int_cmp)
8998{
8999	ir_backend_data *data = ctx->data;
9000	dasm_State **Dst = &data->dasm_state;
9001	ir_insn *next_insn = &ctx->ir_base[def + 1];
9002
9003	if (next_insn->op == IR_END || next_insn->op == IR_LOOP_END) {
9004		ir_block *bb = &ctx->cfg_blocks[b];
9005		uint32_t target;
9006
9007		if (!(bb->flags & IR_BB_DESSA_MOVES)) {
9008			target = ctx->cfg_edges[bb->successors];
9009			if (UNEXPECTED(bb->successors_count == 2)) {
9010				if (ctx->cfg_blocks[target].flags & IR_BB_ENTRY) {
9011					target = ctx->cfg_edges[bb->successors + 1];
9012				} else {
9013					IR_ASSERT(ctx->cfg_blocks[ctx->cfg_edges[bb->successors + 1]].flags & IR_BB_ENTRY);
9014				}
9015			} else {
9016				IR_ASSERT(bb->successors_count == 1);
9017			}
9018			target = ir_skip_empty_target_blocks(ctx, target);
9019			if (target != next_block) {
9020				if (int_cmp) {
9021					switch (op) {
9022						default:
9023							IR_ASSERT(0 && "NIY binary op");
9024						case IR_EQ:
9025							|	jne =>target
9026							break;
9027						case IR_NE:
9028							|	je =>target
9029							break;
9030						case IR_LT:
9031							|	jge =>target
9032							break;
9033						case IR_GE:
9034							|	jl =>target
9035							break;
9036						case IR_LE:
9037							|	jg =>target
9038							break;
9039						case IR_GT:
9040							|	jle =>target
9041							break;
9042						case IR_ULT:
9043							|	jae =>target
9044							break;
9045						case IR_UGE:
9046							|	jb =>target
9047							break;
9048						case IR_ULE:
9049							|	ja =>target
9050							break;
9051						case IR_UGT:
9052							|	jbe =>target
9053							break;
9054					}
9055				} else {
9056					switch (op) {
9057						default:
9058							IR_ASSERT(0 && "NIY binary op");
9059						case IR_EQ:
9060							|	jne =>target
9061							|	jp =>target
9062							break;
9063						case IR_NE:
9064							|	jp &addr
9065							|	je =>target
9066							break;
9067						case IR_LT:
9068							|	jae =>target
9069							break;
9070						case IR_GE:
9071							|	jp &addr
9072							|	jb =>target
9073							break;
9074						case IR_LE:
9075							|	ja =>target
9076							break;
9077						case IR_GT:
9078							|	jp &addr
9079							|	jbe =>target
9080							break;
9081					}
9082				}
9083				|	jmp &addr
9084				return 1;
9085			}
9086		}
9087	} else if (next_insn->op == IR_IJMP && IR_IS_CONST_REF(next_insn->op2)) {
9088		void *target_addr = ir_jmp_addr(ctx, next_insn, &ctx->ir_base[next_insn->op2]);
9089
9090		if (sizeof(void*) == 4 || IR_MAY_USE_32BIT_ADDR(ctx->code_buffer, target_addr)) {
9091			if (int_cmp) {
9092				switch (op) {
9093					default:
9094						IR_ASSERT(0 && "NIY binary op");
9095					case IR_EQ:
9096						|	jne &target_addr
9097						break;
9098					case IR_NE:
9099						|	je &target_addr
9100						break;
9101					case IR_LT:
9102						|	jge &target_addr
9103						break;
9104					case IR_GE:
9105						|	jl &target_addr
9106						break;
9107					case IR_LE:
9108						|	jg &target_addr
9109						break;
9110					case IR_GT:
9111						|	jle &target_addr
9112						break;
9113					case IR_ULT:
9114						|	jae &target_addr
9115						break;
9116					case IR_UGE:
9117						|	jb &target_addr
9118						break;
9119					case IR_ULE:
9120						|	ja &target_addr
9121						break;
9122					case IR_UGT:
9123						|	jbe &target_addr
9124						break;
9125				}
9126			} else {
9127				switch (op) {
9128					default:
9129						IR_ASSERT(0 && "NIY binary op");
9130					case IR_EQ:
9131						|	jne &target_addr
9132						|	jp &target_addr
9133						break;
9134					case IR_NE:
9135						|	jp &addr
9136						|	je &target_addr
9137						break;
9138					case IR_LT:
9139						|	jae &target_addr
9140						break;
9141					case IR_GE:
9142						|	jp &addr
9143						|	jb &target_addr
9144						break;
9145					case IR_LE:
9146						|	ja &target_addr
9147						break;
9148					case IR_GT:
9149						|	jp &addr
9150						|	jbe &target_addr
9151						break;
9152				}
9153			}
9154			|	jmp &addr
9155			return 1;
9156		}
9157	}
9158
9159	if (int_cmp) {
9160		switch (op) {
9161			default:
9162				IR_ASSERT(0 && "NIY binary op");
9163			case IR_EQ:
9164				|	je &addr
9165				break;
9166			case IR_NE:
9167				|	jne &addr
9168				break;
9169			case IR_LT:
9170				|	jl &addr
9171				break;
9172			case IR_GE:
9173				|	jge &addr
9174				break;
9175			case IR_LE:
9176				|	jle &addr
9177				break;
9178			case IR_GT:
9179				|	jg &addr
9180				break;
9181			case IR_ULT:
9182				|	jb &addr
9183				break;
9184			case IR_UGE:
9185				|	jae &addr
9186				break;
9187			case IR_ULE:
9188				|	jbe &addr
9189				break;
9190			case IR_UGT:
9191				|	ja &addr
9192				break;
9193		}
9194	} else {
9195		switch (op) {
9196			default:
9197				IR_ASSERT(0 && "NIY binary op");
9198			case IR_EQ:
9199				|	jp >1
9200				|	je &addr
9201				|1:
9202				break;
9203			case IR_NE:
9204				|	jne &addr
9205				|	jp &addr
9206				break;
9207			case IR_LT:
9208				|	jp >1
9209				|	jb &addr
9210				|1:
9211				break;
9212			case IR_GE:
9213				|	jae &addr
9214				break;
9215			case IR_LE:
9216				|	jp >1
9217				|	jbe &addr
9218				|1:
9219				break;
9220			case IR_GT:
9221				|	ja &addr
9222				break;
9223//			case IR_ULT: fprintf(stderr, "\tjb .LL%d\n", true_block); break;
9224//			case IR_UGE: fprintf(stderr, "\tjae .LL%d\n", true_block); break;
9225//			case IR_ULE: fprintf(stderr, "\tjbe .LL%d\n", true_block); break;
9226//			case IR_UGT: fprintf(stderr, "\tja .LL%d\n", true_block); break;
9227		}
9228	}
9229	return 0;
9230}
9231
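/* Emit a GUARD/GUARD_NOT on a plain integer value. A constant condition
 * either always passes (no code) or jumps unconditionally to the exit;
 * otherwise the value is compared against zero and ir_emit_guard_jcc()
 * emits the branch. */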
9232static bool ir_emit_guard(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn, uint32_t next_block)
9233{
9234	ir_backend_data *data = ctx->data;
9235	dasm_State **Dst = &data->dasm_state;
9236	ir_reg op2_reg = ctx->regs[def][2];
9237	ir_type type = ctx->ir_base[insn->op2].type;
9238	void *addr;
9239
9240	IR_ASSERT(IR_IS_TYPE_INT(type));
9241	if (IR_IS_CONST_REF(insn->op2)) {
9242		bool is_true = ir_ref_is_true(ctx, insn->op2);
9243
9244		if ((insn->op == IR_GUARD && !is_true) || (insn->op == IR_GUARD_NOT && is_true)) {
9245			addr = ir_jmp_addr(ctx, insn, &ctx->ir_base[insn->op3]);
9246			if (sizeof(void*) == 4 || IR_MAY_USE_32BIT_ADDR(ctx->code_buffer, addr)) {
9247				|	jmp aword &addr
9248			} else {
9249|.if X64
9250				if (IR_IS_SIGNED_32BIT(addr)) {
9251					|	mov rax, ((ptrdiff_t)addr)    // 0x48 0xc7 0xc0 <imm-32-bit>
9252				} else {
9253					|	mov64 rax, ((ptrdiff_t)addr)  // 0x48 0xb8 <imm-64-bit>
9254				}
9255				|	jmp aword [rax]
9256|.endif
9257			}
9258		}
9259		return 0;
9260	}
9261
9262	if (op2_reg != IR_REG_NONE) {
9263		if (IR_REG_SPILLED(op2_reg)) {
9264			op2_reg = IR_REG_NUM(op2_reg);
9265			ir_emit_load(ctx, type, op2_reg, insn->op2);
9266		}
9267		|	ASM_REG_REG_OP test, type, op2_reg, op2_reg
9268	} else {
9269		ir_mem mem;
9270
9271		if (ir_rule(ctx, insn->op2) & IR_FUSED) {
9272			mem = ir_fuse_load(ctx, def, insn->op2);
9273		} else {
9274			mem = ir_ref_spill_slot(ctx, insn->op2);
9275		}
9276		|	ASM_MEM_IMM_OP cmp, type, mem, 0
9277	}
9278
9279	addr = ir_jmp_addr(ctx, insn, &ctx->ir_base[insn->op3]);
9280	if (sizeof(void*) == 4 || IR_MAY_USE_32BIT_ADDR(ctx->code_buffer, addr)) {
9281		ir_op op;
9282
9283		if (insn->op == IR_GUARD) {
9284			op = IR_EQ;
9285		} else {
9286			op = IR_NE;
9287		}
9288		return ir_emit_guard_jcc(ctx, b, def, next_block, op, addr, 1);
9289	} else {
9290|.if X64
9291		if (insn->op == IR_GUARD) {
9292			|	je >1
9293		} else {
9294			|	jne >1
9295		}
9296		|.cold_code
9297		|1:
9298		if (IR_IS_SIGNED_32BIT(addr)) {
9299			|	mov rax, ((ptrdiff_t)addr)    // 0x48 0xc7 0xc0 <imm-32-bit>
9300		} else {
9301			|	mov64 rax, ((ptrdiff_t)addr)  // 0x48 0xb8 <imm-64-bit>
9302		}
9303		|	jmp aword [rax]
9304		|.code
9305|.endif
9306		return 0;
9307	}
9308}
9309
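/* Emit a guard fused with an integer comparison. Unsigned comparisons
 * against the constant 0 are strength-reduced first: ULT is always false
 * (unconditional side exit), UGE is always true (no code), ULE becomes EQ
 * and UGT becomes NE. For IR_GUARD the condition is inverted, because the
 * guard must jump to the exit when the condition fails. */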
9310static bool ir_emit_guard_cmp_int(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn, uint32_t next_block)
9311{
9312	ir_backend_data *data = ctx->data;
9313	dasm_State **Dst = &data->dasm_state;
9314	ir_insn *cmp_insn = &ctx->ir_base[insn->op2];
9315	ir_op op = cmp_insn->op;
9316	ir_type type = ctx->ir_base[cmp_insn->op1].type;
9317	ir_ref op1 = cmp_insn->op1;
9318	ir_ref op2 = cmp_insn->op2;
9319	ir_reg op1_reg = ctx->regs[insn->op2][1];
9320	ir_reg op2_reg = ctx->regs[insn->op2][2];
9321	void *addr;
9322
9323	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
9324		op1_reg = IR_REG_NUM(op1_reg);
9325		ir_emit_load(ctx, type, op1_reg, op1);
9326	}
9327	if (op2_reg != IR_REG_NONE && IR_REG_SPILLED(op2_reg)) {
9328		op2_reg = IR_REG_NUM(op2_reg);
9329		if (op1 != op2) {
9330			ir_emit_load(ctx, type, op2_reg, op2);
9331		}
9332	}
9333
9334	addr = ir_jmp_addr(ctx, insn, &ctx->ir_base[insn->op3]);
9335	if (IR_IS_CONST_REF(op2) && !IR_IS_SYM_CONST(ctx->ir_base[op2].op) && ctx->ir_base[op2].val.u64 == 0) {
9336		if (op == IR_ULT) {
9337			/* always false */
9338			if (sizeof(void*) == 4 || IR_MAY_USE_32BIT_ADDR(ctx->code_buffer, addr)) {
9339				|	jmp aword &addr
9340			} else {
9341|.if X64
9342				if (IR_IS_SIGNED_32BIT(addr)) {
9343					|	mov rax, ((ptrdiff_t)addr)    // 0x48 0xc7 0xc0 <imm-32-bit>
9344				} else {
9345					|	mov64 rax, ((ptrdiff_t)addr)  // 0x48 0xb8 <imm-64-bit>
9346				}
9347				|	jmp aword [rax]
9348|.endif
9349			}
9350			return 0;
9351		} else if (op == IR_UGE) {
9352			/* always true */
9353			return 0;
9354		} else if (op == IR_ULE) {
9355			op = IR_EQ;
9356		} else if (op == IR_UGT) {
9357			op = IR_NE;
9358		}
9359	}
9360	ir_emit_cmp_int_common(ctx, type, def, cmp_insn, op1_reg, op1, op2_reg, op2);
9361
9362	if (insn->op == IR_GUARD) {
9363		op ^= 1; // reverse
9364	}
9365
9366	return ir_emit_guard_jcc(ctx, b, def, next_block, op, addr, 1);
9367}
9368
9369static bool ir_emit_guard_cmp_fp(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn, uint32_t next_block)
9370{
9371	ir_op op = ir_emit_cmp_fp_common(ctx, def, insn->op2, &ctx->ir_base[insn->op2]);
9372	void *addr = ir_jmp_addr(ctx, insn, &ctx->ir_base[insn->op3]);
9373
9374	if (insn->op == IR_GUARD) {
9375		op ^= 1; // reverse
9376	}
9377	return ir_emit_guard_jcc(ctx, b, def, next_block, op, addr, 0);
9378}
9379
9380static bool ir_emit_guard_test_int(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn, uint32_t next_block)
9381{
9382	void *addr = ir_jmp_addr(ctx, insn, &ctx->ir_base[insn->op3]);
9383	ir_op op = (insn->op == IR_GUARD) ? IR_EQ : IR_NE;
9384
9385	ir_emit_test_int_common(ctx, def, insn->op2, op);
9386	return ir_emit_guard_jcc(ctx, b, def, next_block, op, addr, 1);
9387}
9388
9389static bool ir_emit_guard_jcc_int(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn, uint32_t next_block)
9390{
9391	void *addr = ir_jmp_addr(ctx, insn, &ctx->ir_base[insn->op3]);
9392	ir_op op = ctx->ir_base[insn->op2].op;
9393
9394	if (insn->op == IR_GUARD) {
9395		op ^= 1; // reverse
9396	}
9397	return ir_emit_guard_jcc(ctx, b, def, next_block, op, addr, 1);
9398}
9399
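/* Emit a guard on the flags left by the preceding arithmetic instruction:
 * the overflow flag for signed types, the carry flag for unsigned ones. */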
9400static bool ir_emit_guard_overflow(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn)
9401{
9402	ir_backend_data *data = ctx->data;
9403	dasm_State **Dst = &data->dasm_state;
9404	ir_type type;
9405	void *addr = ir_jmp_addr(ctx, insn, &ctx->ir_base[insn->op3]);
9406
9407	type = ctx->ir_base[ctx->ir_base[insn->op2].op1].type;
9408
9409	IR_ASSERT(IR_IS_TYPE_INT(type));
9410	if (IR_IS_TYPE_SIGNED(type)) {
9411		if (insn->op == IR_GUARD) {
9412			|	jno &addr
9413		} else {
9414			|	jo &addr
9415		}
9416	} else {
9417		if (insn->op == IR_GUARD) {
9418			|	jnc &addr
9419		} else {
9420			|	jc &addr
9421		}
9422	}
9423	return 0;
9424}
9425
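/* Emit a fused address computation. When the address is just def_reg plus
 * one other register (zero offset, scale 1), a shorter ADD is used instead
 * of LEA. */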
9426static void ir_emit_lea(ir_ctx *ctx, ir_ref def, ir_type type)
9427{
9428	ir_backend_data *data = ctx->data;
9429	dasm_State **Dst = &data->dasm_state;
9430	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
9431	ir_mem mem = ir_fuse_addr(ctx, def, def);
9432
9433	IR_ASSERT(def_reg != IR_REG_NONE);
9434	if (ir_type_size[type] == 4) {
9435		if (IR_MEM_BASE(mem) == def_reg
9436		 && IR_MEM_OFFSET(mem) == 0
9437		 && IR_MEM_SCALE(mem) == 1
9438		 && IR_MEM_INDEX(mem) != IR_REG_NONE) {
9439			ir_reg reg = IR_MEM_INDEX(mem);
9440			|	add Rd(def_reg), Rd(reg)
9441		} else if (IR_MEM_INDEX(mem) == def_reg
9442		 && IR_MEM_OFFSET(mem) == 0
9443		 && IR_MEM_SCALE(mem) == 1
9444		 && IR_MEM_BASE(mem) != IR_REG_NONE) {
9445			ir_reg reg = IR_MEM_BASE(mem);
9446			|	add Rd(def_reg), Rd(reg)
9447		} else {
9448			|	ASM_TXT_TMEM_OP lea, Rd(def_reg), dword, mem
9449		}
9450	} else {
9451		if (IR_MEM_BASE(mem) == def_reg
9452		 && IR_MEM_OFFSET(mem) == 0
9453		 && IR_MEM_SCALE(mem) == 1
9454		 && IR_MEM_INDEX(mem) != IR_REG_NONE) {
9455			ir_reg reg = IR_MEM_INDEX(mem);
9456			|	add Ra(def_reg), Ra(reg)
9457		} else if (IR_MEM_INDEX(mem) == def_reg
9458		 && IR_MEM_OFFSET(mem) == 0
9459		 && IR_MEM_SCALE(mem) == 1
9460		 && IR_MEM_BASE(mem) != IR_REG_NONE) {
9461			ir_reg reg = IR_MEM_BASE(mem);
9462			|	add Ra(def_reg), Ra(reg)
9463		} else {
9464			|	ASM_TXT_TMEM_OP lea, Ra(def_reg), aword, mem
9465		}
9466	}
9467	if (IR_REG_SPILLED(ctx->regs[def][0])) {
9468		ir_emit_store(ctx, type, def, def_reg);
9469	}
9470}
9471
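/* Load a thread-local address using the platform-specific segment register
 * (GS or FS) and thread-control-block layout. */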
9472static void ir_emit_tls(ir_ctx *ctx, ir_ref def, ir_insn *insn)
9473{
9474	ir_backend_data *data = ctx->data;
9475	dasm_State **Dst = &data->dasm_state;
9476	ir_reg reg = IR_REG_NUM(ctx->regs[def][0]);
9477
9478	if (ctx->use_lists[def].count == 1) {
9479		/* dead load */
9480		return;
9481	}
9482
9483|.if X64WIN
9484|	gs
9485|	mov Ra(reg), aword [0x58]
9486|	mov Ra(reg), aword [Ra(reg)+insn->op2]
9487|	mov Ra(reg), aword [Ra(reg)+insn->op3]
9488|.elif WIN
9489|	fs
9490|	mov Ra(reg), aword [0x2c]
9491|	mov Ra(reg), aword [Ra(reg)+insn->op2]
9492|	mov Ra(reg), aword [Ra(reg)+insn->op3]
9493|.elif X64APPLE
9494|	gs
9495||	if (insn->op3 == IR_NULL) {
9496|		mov Ra(reg), aword [insn->op2]
9497||	} else {
9498|		mov Ra(reg), aword [insn->op2]
9499|		mov Ra(reg), aword [Ra(reg)+insn->op3]
9500||	}
9501|.elif X64
9502|	fs
9503||	if (insn->op3 == IR_NULL) {
9504|		mov Ra(reg), aword [insn->op2]
9505||	} else {
9506|		mov Ra(reg), [0x8]
9507|		mov Ra(reg), aword [Ra(reg)+insn->op2]
9508|		mov Ra(reg), aword [Ra(reg)+insn->op3]
9509||	}
9510|.else
9511|	gs
9512||	if (insn->op3 == IR_NULL) {
9513|		mov Ra(reg), aword [insn->op2]
9514||	} else {
9515|		mov Ra(reg), [0x4]
9516|		mov Ra(reg), aword [Ra(reg)+insn->op2]
9517|		mov Ra(reg), aword [Ra(reg)+insn->op3]
9518||	}
9519|.endif
9520	if (IR_REG_SPILLED(ctx->regs[def][0])) {
9521		ir_emit_store(ctx, IR_ADDR, def, reg);
9522	}
9523}
9524
9525static void ir_emit_sse_sqrt(ir_ctx *ctx, ir_ref def, ir_insn *insn)
9526{
9527	ir_backend_data *data = ctx->data;
9528	dasm_State **Dst = &data->dasm_state;
9529	ir_reg op3_reg = ctx->regs[def][3];
9530	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
9531
9532	IR_ASSERT(IR_IS_TYPE_FP(insn->type));
9533	IR_ASSERT(def_reg != IR_REG_NONE && op3_reg != IR_REG_NONE);
9534
9535	if (IR_REG_SPILLED(op3_reg)) {
9536		op3_reg = IR_REG_NUM(op3_reg);
9537		ir_emit_load(ctx, IR_ADDR, op3_reg, insn->op3);
9538	}
9539
9540	|	ASM_FP_REG_REG_OP sqrts, insn->type, def_reg, op3_reg
9541
9542	if (IR_REG_SPILLED(ctx->regs[def][0])) {
9543		ir_emit_store(ctx, insn->type, def, def_reg);
9544	}
9545}
9546
9547static void ir_emit_sse_round(ir_ctx *ctx, ir_ref def, ir_insn *insn, int round_op)
9548{
9549	ir_backend_data *data = ctx->data;
9550	dasm_State **Dst = &data->dasm_state;
9551	ir_reg op3_reg = ctx->regs[def][3];
9552	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
9553
9554	IR_ASSERT(IR_IS_TYPE_FP(insn->type));
9555	IR_ASSERT(def_reg != IR_REG_NONE && op3_reg != IR_REG_NONE);
9556
9557	if (IR_REG_SPILLED(op3_reg)) {
9558		op3_reg = IR_REG_NUM(op3_reg);
9559		ir_emit_load(ctx, IR_ADDR, op3_reg, insn->op3);
9560	}
9561
9562	if (ctx->mflags & IR_X86_AVX) {
9563		|	ASM_SSE2_REG_REG_REG_TXT_OP vrounds, insn->type, def_reg, def_reg, op3_reg, round_op
9564	} else {
9565		|	ASM_SSE2_REG_REG_TXT_OP rounds, insn->type, def_reg, op3_reg, round_op
9566	}
9567
9568	if (IR_REG_SPILLED(ctx->regs[def][0])) {
9569		ir_emit_store(ctx, insn->type, def, def_reg);
9570	}
9571}
9572
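/* Emit the common side-exit handler call: save all GP and SSE registers
 * into a temporary stack area (slot 4 receives the original stack pointer),
 * pass a pointer to that area as the second argument and the word that was
 * on top of the stack at entry (presumably the exit number pushed by the
 * exit trampoline) as the first one, call the handler, then restore the
 * stack pointer and move the handler's return value into the result
 * register. */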
9573static void ir_emit_exitcall(ir_ctx *ctx, ir_ref def, ir_insn *insn)
9574{
9575	ir_backend_data *data = ctx->data;
9576	dasm_State **Dst = &data->dasm_state;
9577	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
9578
9579	IR_ASSERT(def_reg != IR_REG_NONE);
9580
9581	|.if X64
9582	|	sub rsp, 16*8+16*8+8 /* CPU regs + SSE regs */
9583	|	mov aword [rsp+0*8], rax
9584	|	mov aword [rsp+1*8], rcx
9585	|	mov aword [rsp+2*8], rdx
9586	|	mov aword [rsp+3*8], rbx
9587	|	mov aword [rsp+5*8], rbp
9588	|	mov aword [rsp+6*8], rsi
9589	|	mov aword [rsp+7*8], rdi
9590	|	mov aword [rsp+8*8], r8
9591	|	mov aword [rsp+9*8], r9
9592	|	mov aword [rsp+10*8], r10
9593	|	mov aword [rsp+11*8], r11
9594	|	mov aword [rsp+12*8], r12
9595	|	mov aword [rsp+13*8], r13
9596	|	mov aword [rsp+14*8], r14
9597	|	mov aword [rsp+15*8], r15
9598	|	movsd qword [rsp+16*8+0*8], xmm0
9599	|	movsd qword [rsp+16*8+1*8], xmm1
9600	|	movsd qword [rsp+16*8+2*8], xmm2
9601	|	movsd qword [rsp+16*8+3*8], xmm3
9602	|	movsd qword [rsp+16*8+4*8], xmm4
9603	|	movsd qword [rsp+16*8+5*8], xmm5
9604	|	movsd qword [rsp+16*8+6*8], xmm6
9605	|	movsd qword [rsp+16*8+7*8], xmm7
9606	|	movsd qword [rsp+16*8+8*8], xmm8
9607	|	movsd qword [rsp+16*8+9*8], xmm9
9608	|	movsd qword [rsp+16*8+10*8], xmm10
9609	|	movsd qword [rsp+16*8+11*8], xmm11
9610	|	movsd qword [rsp+16*8+12*8], xmm12
9611	|	movsd qword [rsp+16*8+13*8], xmm13
9612	|	movsd qword [rsp+16*8+14*8], xmm14
9613	|	movsd qword [rsp+16*8+15*8], xmm15
9614	|
9615	|	mov Ra(IR_REG_INT_ARG2), rsp
9616	|	lea Ra(IR_REG_INT_ARG1), [rsp+16*8+16*8+16]
9617	|	mov aword [rsp+4*8], Ra(IR_REG_INT_ARG1)
9618	|	mov Ra(IR_REG_INT_ARG1), [rsp+16*8+16*8+8]
9619	|.if X64WIN
9620	|	sub rsp, 32 /* shadow space */
9621	|.endif
9622	|.else
9623	|	sub esp, 8*4+8*8+12 /* CPU regs + SSE regs */
9624	|	mov aword [esp+0*4], eax
9625	|	mov aword [esp+1*4], ecx
9626	|	mov aword [esp+2*4], edx
9627	|	mov aword [esp+3*4], ebx
9628	|	mov aword [esp+5*4], ebp
9629	|	mov aword [esp+6*4], esi
9630	|	mov aword [esp+7*4], edi
9631	|	movsd qword [esp+8*4+0*8], xmm0
9632	|	movsd qword [esp+8*4+1*8], xmm1
9633	|	movsd qword [esp+8*4+2*8], xmm2
9634	|	movsd qword [esp+8*4+3*8], xmm3
9635	|	movsd qword [esp+8*4+4*8], xmm4
9636	|	movsd qword [esp+8*4+5*8], xmm5
9637	|	movsd qword [esp+8*4+6*8], xmm6
9638	|	movsd qword [esp+8*4+7*8], xmm7
9639	|
9640	|	mov Ra(IR_REG_INT_FCARG2), esp
9641	|	lea Ra(IR_REG_INT_FCARG1), [esp+8*4+8*8+16]
9642	|	mov aword [esp+4*4], Ra(IR_REG_INT_FCARG1)
9643	|	mov Ra(IR_REG_INT_FCARG1), [esp+8*4+8*8+12]
9644	|.endif
9645
9646	if (IR_IS_CONST_REF(insn->op2)) {
9647		void *addr = ir_call_addr(ctx, insn, &ctx->ir_base[insn->op2]);
9648
9649		if (sizeof(void*) == 4 || IR_MAY_USE_32BIT_ADDR(ctx->code_buffer, addr)) {
9650			|	call aword &addr
9651		} else {
9652|.if X64
9653			if (IR_IS_SIGNED_32BIT(addr)) {
9654				|	mov rax, ((ptrdiff_t)addr)    // 0x48 0xc7 0xc0 <imm-32-bit>
9655			} else {
9656				|	mov64 rax, ((ptrdiff_t)addr)  // 0x48 0xb8 <imm-64-bit>
9657			}
9658			|	call rax
9659|.endif
9660		}
9661	} else {
9662		IR_ASSERT(0);
9663	}
9664
9665	// restore SP
9666	|.if X64WIN
9667	|	add rsp, 32+16*8+16*8+16 /* shadow space + CPU regs + SSE regs */
9668	|.elif X64
9669	|	add rsp, 16*8+16*8+16 /* CPU regs + SSE regs */
9670	|.else
9671	|	add esp, 8*4+8*8+16 /* CPU regs + SSE regs */
9672	|.endif
9673
9674	if (def_reg != IR_REG_INT_RET1) {
9675		ir_emit_mov(ctx, insn->type, def_reg, IR_REG_INT_RET1);
9676	}
9677	if (IR_REG_SPILLED(ctx->regs[def][0])) {
9678		ir_emit_store(ctx, insn->type, def, def_reg);
9679	}
9680}
9681
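/* Move a single incoming parameter from its source (an argument register or
 * a caller stack slot at "offset") to its destination (a register or a
 * spill slot). */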
9682static void ir_emit_param_move(ir_ctx *ctx, uint8_t type, ir_reg from_reg, ir_reg to_reg, ir_ref to, int32_t offset)
9683{
9684	ir_reg fp = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
9685
9686	IR_ASSERT(from_reg != IR_REG_NONE || to_reg != IR_REG_NONE);
9687
9688	if (IR_IS_TYPE_INT(type)) {
9689		if (from_reg != IR_REG_NONE) {
9690			if (to_reg != IR_REG_NONE) {
9691				ir_emit_mov(ctx, type, to_reg, from_reg);
9692			} else {
9693				ir_emit_store(ctx, type, to, from_reg);
9694			}
9695		} else {
9696			ir_emit_load_mem_int(ctx, type, to_reg, IR_MEM_BO(fp, offset));
9697		}
9698	} else {
9699		if (from_reg != IR_REG_NONE) {
9700			if (to_reg != IR_REG_NONE) {
9701				ir_emit_fp_mov(ctx, type, to_reg, from_reg);
9702			} else {
9703				ir_emit_store(ctx, type, to, from_reg);
9704			}
9705		} else {
9706			ir_emit_load_mem_fp(ctx, type, to_reg, IR_MEM_BO(fp, offset));
9707		}
9708	}
9709}
9710
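/* At function entry, move every PARAM from its incoming location (argument
 * register or caller stack slot) into the register and/or spill slot
 * assigned to it. */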
9711static void ir_emit_load_params(ir_ctx *ctx)
9712{
9713	ir_use_list *use_list = &ctx->use_lists[1];
9714	ir_insn *insn;
9715	ir_ref i, n, *p, use;
9716	int int_param_num = 0;
9717	int fp_param_num = 0;
9718	ir_reg src_reg;
9719	ir_reg dst_reg;
9720	// TODO: Calling convention specific
9721	int int_reg_params_count = IR_REG_INT_ARGS;
9722	int fp_reg_params_count = IR_REG_FP_ARGS;
9723	const int8_t *int_reg_params = _ir_int_reg_params;
9724	const int8_t *fp_reg_params = _ir_fp_reg_params;
9725	int32_t stack_offset = 0;
9726
9727#ifdef IR_TARGET_X86
9728	if (sizeof(void*) == 4 && (ctx->flags & IR_FASTCALL_FUNC)) {
9729		int_reg_params_count = IR_REG_INT_FCARGS;
9730		fp_reg_params_count = IR_REG_FP_FCARGS;
9731		int_reg_params = _ir_int_fc_reg_params;
9732		fp_reg_params = _ir_fp_fc_reg_params;
9733	}
9734#endif
9735
9736	if (ctx->flags & IR_USE_FRAME_POINTER) {
9737		stack_offset = sizeof(void*) * 2; /* skip old frame pointer and return address */
9738	} else {
9739		stack_offset = sizeof(void*) + ctx->stack_frame_size + ctx->call_stack_size; /* skip return address */
9740	}
9741	n = use_list->count;
9742	for (i = 0, p = &ctx->use_edges[use_list->refs]; i < n; i++, p++) {
9743		use = *p;
9744		insn = &ctx->ir_base[use];
9745		if (insn->op == IR_PARAM) {
9746			if (IR_IS_TYPE_INT(insn->type)) {
9747				if (int_param_num < int_reg_params_count) {
9748					src_reg = int_reg_params[int_param_num];
9749				} else {
9750					src_reg = IR_REG_NONE;
9751				}
9752				int_param_num++;
9753#ifdef _WIN64
9754				/* the WIN64 calling convention uses a common counter for int and fp registers */
9755				fp_param_num++;
9756#endif
9757			} else {
9758				if (fp_param_num < fp_reg_params_count) {
9759					src_reg = fp_reg_params[fp_param_num];
9760				} else {
9761					src_reg = IR_REG_NONE;
9762				}
9763				fp_param_num++;
9764#ifdef _WIN64
9765				/* the WIN64 calling convention uses a common counter for int and fp registers */
9766				int_param_num++;
9767#endif
9768			}
9769			if (ctx->vregs[use]) {
9770				dst_reg = IR_REG_NUM(ctx->regs[use][0]);
9771				IR_ASSERT(src_reg != IR_REG_NONE || dst_reg != IR_REG_NONE ||
9772					stack_offset == ctx->live_intervals[ctx->vregs[use]]->stack_spill_pos +
9773						((ctx->flags & IR_USE_FRAME_POINTER) ?
9774							-(ctx->stack_frame_size - ctx->stack_frame_alignment) :
9775							ctx->call_stack_size));
9776				if (src_reg != dst_reg) {
9777					ir_emit_param_move(ctx, insn->type, src_reg, dst_reg, use, stack_offset);
9778				}
9779				if (dst_reg != IR_REG_NONE && IR_REG_SPILLED(ctx->regs[use][0])) {
9780					ir_emit_store(ctx, insn->type, use, dst_reg);
9781				}
9782			}
9783			if (src_reg == IR_REG_NONE) {
9784				if (sizeof(void*) == 8) {
9785					stack_offset += sizeof(void*);
9786				} else {
9787					stack_offset += IR_MAX(sizeof(void*), ir_type_size[insn->type]);
9788				}
9789			}
9790		}
9791	}
9792}
9793
9794static ir_reg ir_get_free_reg(ir_type type, ir_regset available)
9795{
9796	if (IR_IS_TYPE_INT(type)) {
9797		available = IR_REGSET_INTERSECTION(available, IR_REGSET_GP);
9798	} else {
9799		IR_ASSERT(IR_IS_TYPE_FP(type));
9800		available = IR_REGSET_INTERSECTION(available, IR_REGSET_FP);
9801	}
9802	IR_ASSERT(!IR_REGSET_IS_EMPTY(available));
9803	return IR_REGSET_FIRST(available);
9804}
9805
9806static int ir_fix_dessa_tmps(ir_ctx *ctx, uint8_t type, ir_ref from, ir_ref to)
9807{
9808	ir_backend_data *data = ctx->data;
9809	ir_ref ref = ctx->cfg_blocks[data->dessa_from_block].end;
9810
9811	if (to == 0) {
9812		if (IR_IS_TYPE_INT(type)) {
9813			if (ctx->regs[ref][0] == IR_REG_NONE) {
9814				ctx->regs[ref][0] = IR_REG_RAX;
9815			}
9816		} else {
9817			IR_ASSERT(IR_IS_TYPE_FP(type));
9818			if (ctx->regs[ref][1] == IR_REG_NONE) {
9819				ctx->regs[ref][1] = IR_REG_XMM0;
9820			}
9821		}
9822	} else if (from != 0) {
9823		if (IR_IS_TYPE_INT(type)) {
9824			if (ctx->regs[ref][0] == IR_REG_NONE) {
9825				ctx->regs[ref][0] = IR_REG_RAX;
9826			}
9827		} else {
9828			IR_ASSERT(IR_IS_TYPE_FP(type));
9829			if (ctx->regs[ref][1] == IR_REG_NONE) {
9830				ctx->regs[ref][1] = IR_REG_XMM0;
9831			}
9832		}
9833	}
9834	return 1;
9835}
9836
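/* Assign stack-passed parameters that stay in memory a spill position
 * inside the caller's argument area, so they can be used in place without
 * copying, and record the register-parameter counts and the total parameter
 * stack size. */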
9837static void ir_fix_param_spills(ir_ctx *ctx)
9838{
9839	ir_use_list *use_list = &ctx->use_lists[1];
9840	ir_insn *insn;
9841	ir_ref i, n, *p, use;
9842	int int_param_num = 0;
9843	int fp_param_num = 0;
9844	ir_reg src_reg;
9845	// TODO: Calling convention specific
9846	int int_reg_params_count = IR_REG_INT_ARGS;
9847	int fp_reg_params_count = IR_REG_FP_ARGS;
9848	const int8_t *int_reg_params = _ir_int_reg_params;
9849	const int8_t *fp_reg_params = _ir_fp_reg_params;
9850	int32_t stack_start = 0;
9851	int32_t stack_offset = 0;
9852
9853#ifdef IR_TARGET_X86
9854	if (sizeof(void*) == 4 && (ctx->flags & IR_FASTCALL_FUNC)) {
9855		int_reg_params_count = IR_REG_INT_FCARGS;
9856		fp_reg_params_count = IR_REG_FP_FCARGS;
9857		int_reg_params = _ir_int_fc_reg_params;
9858		fp_reg_params = _ir_fp_fc_reg_params;
9859	}
9860#endif
9861
9862	if (ctx->flags & IR_USE_FRAME_POINTER) {
9863		/* skip old frame pointer and return address */
9864		stack_start = sizeof(void*) * 2 + (ctx->stack_frame_size - ctx->stack_frame_alignment);
9865	} else {
9866		/* skip return address */
9867		stack_start = sizeof(void*) + ctx->stack_frame_size;
9868	}
9869	n = use_list->count;
9870	for (i = 0, p = &ctx->use_edges[use_list->refs]; i < n; i++, p++) {
9871		use = *p;
9872		insn = &ctx->ir_base[use];
9873		if (insn->op == IR_PARAM) {
9874			if (IR_IS_TYPE_INT(insn->type)) {
9875				if (int_param_num < int_reg_params_count) {
9876					src_reg = int_reg_params[int_param_num];
9877				} else {
9878					src_reg = IR_REG_NONE;
9879				}
9880				int_param_num++;
9881#ifdef _WIN64
9882				/* the WIN64 calling convention uses a common counter for int and fp registers */
9883				fp_param_num++;
9884#endif
9885			} else {
9886				if (fp_param_num < fp_reg_params_count) {
9887					src_reg = fp_reg_params[fp_param_num];
9888				} else {
9889					src_reg = IR_REG_NONE;
9890				}
9891				fp_param_num++;
9892#ifdef _WIN64
9893				/* the WIN64 calling convention uses a common counter for int and fp registers */
9894				int_param_num++;
9895#endif
9896			}
9897			if (src_reg == IR_REG_NONE) {
9898				if (ctx->vregs[use]) {
9899					ir_live_interval *ival = ctx->live_intervals[ctx->vregs[use]];
9900					if ((ival->flags & IR_LIVE_INTERVAL_MEM_PARAM)
9901					 && ival->stack_spill_pos == -1
9902					 && (ival->next || ival->reg == IR_REG_NONE)) {
9903						ival->stack_spill_pos = stack_start + stack_offset;
9904					}
9905				}
9906				if (sizeof(void*) == 8) {
9907					stack_offset += sizeof(void*);
9908				} else {
9909					stack_offset += IR_MAX(sizeof(void*), ir_type_size[insn->type]);
9910				}
9911			}
9912		}
9913	}
9914
9915#ifdef _WIN64
9916	/* WIN64 uses a shadow area for register parameters */
9917	stack_offset += IR_MIN(int_param_num, int_reg_params_count) * sizeof(void*);
9918#endif
9919	ctx->gp_reg_params = IR_MIN(int_param_num, int_reg_params_count);
9920	ctx->fp_reg_params = IR_MIN(fp_param_num, fp_reg_params_count);
9921	ctx->param_stack_size = stack_offset;
9922}
9923
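/* Fallback used when no global register allocation has been performed:
 * give every virtual register its own spill slot and satisfy the
 * per-instruction register constraints with scratch registers. */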
9924static void ir_allocate_unique_spill_slots(ir_ctx *ctx)
9925{
9926	uint32_t b;
9927	ir_block *bb;
9928	ir_insn *insn;
9929	ir_ref i, n, j, *p;
9930	uint32_t *rule, insn_flags;
9931	ir_backend_data *data = ctx->data;
9932	ir_regset available = 0;
9933	ir_target_constraints constraints;
9934	uint32_t def_flags;
9935	ir_reg reg;
9936
9937#ifndef IR_REG_FP_RET1
9938	if (ctx->flags2 & IR_HAS_FP_RET_SLOT) {
9939		ctx->ret_slot = ir_allocate_spill_slot(ctx, IR_DOUBLE, &data->ra_data);
9940	} else if (ctx->ret_type == IR_FLOAT || ctx->ret_type == IR_DOUBLE) {
9941		ctx->ret_slot = ir_allocate_spill_slot(ctx, ctx->ret_type, &data->ra_data);
9942	} else {
9943		ctx->ret_slot = -1;
9944	}
9945#endif
9946
9947	ctx->regs = ir_mem_malloc(sizeof(ir_regs) * ctx->insns_count);
9948	memset(ctx->regs, IR_REG_NONE, sizeof(ir_regs) * ctx->insns_count);
9949
9950	/* vregs + tmp + fixed + SCRATCH + ALL */
9951	ctx->live_intervals = ir_mem_calloc(ctx->vregs_count + 1 + IR_REG_NUM + 2, sizeof(ir_live_interval*));
9952
9953	if (!ctx->arena) {
9954		ctx->arena = ir_arena_create(16 * 1024);
9955	}
9956
9957	for (b = 1, bb = ctx->cfg_blocks + b; b <= ctx->cfg_blocks_count; b++, bb++) {
9958		IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
9959		for (i = bb->start, insn = ctx->ir_base + i, rule = ctx->rules + i; i <= bb->end;) {
9960			switch (ctx->rules ? *rule : insn->op) {
9961				case IR_START:
9962				case IR_BEGIN:
9963				case IR_END:
9964				case IR_IF_TRUE:
9965				case IR_IF_FALSE:
9966				case IR_CASE_VAL:
9967				case IR_CASE_DEFAULT:
9968				case IR_MERGE:
9969				case IR_LOOP_BEGIN:
9970				case IR_LOOP_END:
9971					break;
9972#ifndef IR_REG_FP_RET1
9973				case IR_CALL:
9974					if (ctx->ret_slot == -1 && (insn->type == IR_FLOAT || insn->type == IR_DOUBLE)) {
9975						ctx->ret_slot = ir_allocate_spill_slot(ctx, IR_DOUBLE, &data->ra_data);
9976					}
9977#endif
9978					IR_FALLTHROUGH;
9979				default:
9980					def_flags = ir_get_target_constraints(ctx, i, &constraints);
9981					if (ctx->rules
9982					 && *rule != IR_CMP_AND_BRANCH_INT
9983					 && *rule != IR_CMP_AND_BRANCH_FP
9984					 && *rule != IR_TEST_AND_BRANCH_INT
9985					 && *rule != IR_GUARD_CMP_INT
9986					 && *rule != IR_GUARD_CMP_FP) {
9987						available = IR_REGSET_SCRATCH;
9988					}
9989					if (ctx->vregs[i]) {
9990						reg = constraints.def_reg;
9991						if (reg != IR_REG_NONE && IR_REGSET_IN(available, reg)) {
9992							IR_REGSET_EXCL(available, reg);
9993							ctx->regs[i][0] = reg | IR_REG_SPILL_STORE;
9994						} else if (def_flags & IR_USE_MUST_BE_IN_REG) {
9995							if (insn->op == IR_VLOAD
9996							 && ctx->live_intervals[ctx->vregs[i]]
9997							 && ctx->live_intervals[ctx->vregs[i]]->stack_spill_pos != -1) {
9998								/* pass */
9999							} else if (insn->op != IR_PARAM) {
10000								reg = ir_get_free_reg(insn->type, available);
10001								IR_REGSET_EXCL(available, reg);
10002								ctx->regs[i][0] = reg | IR_REG_SPILL_STORE;
10003							}
10004						}
10005						if (!ctx->live_intervals[ctx->vregs[i]]) {
10006							ir_live_interval *ival = ir_arena_alloc(&ctx->arena, sizeof(ir_live_interval));
10007							memset(ival, 0, sizeof(ir_live_interval));
10008							ctx->live_intervals[ctx->vregs[i]] = ival;
10009							ival->type = insn->type;
10010							ival->reg = IR_REG_NONE;
10011							ival->vreg = ctx->vregs[i];
10012							ival->stack_spill_pos = -1;
10013							if (insn->op == IR_PARAM && reg == IR_REG_NONE) {
10014								ival->flags |= IR_LIVE_INTERVAL_MEM_PARAM;
10015							} else {
10016								ival->stack_spill_pos = ir_allocate_spill_slot(ctx, ival->type, &data->ra_data);
10017							}
10018						} else if (insn->op == IR_PARAM) {
10019							IR_ASSERT(0 && "unexpected PARAM");
10020							return;
10021						}
10022					} else if (insn->op == IR_VAR) {
10023						ir_use_list *use_list = &ctx->use_lists[i];
10024						ir_ref n = use_list->count;
10025
10026						if (n > 0) {
10027							int32_t stack_spill_pos = insn->op3 = ir_allocate_spill_slot(ctx, insn->type, &data->ra_data);
10028							ir_ref i, *p, use;
10029							ir_insn *use_insn;
10030
10031							for (i = 0, p = &ctx->use_edges[use_list->refs]; i < n; i++, p++) {
10032								use = *p;
10033								use_insn = &ctx->ir_base[use];
10034								if (use_insn->op == IR_VLOAD) {
10035									if (ctx->vregs[use]
10036									 && !ctx->live_intervals[ctx->vregs[use]]) {
10037										ir_live_interval *ival = ir_arena_alloc(&ctx->arena, sizeof(ir_live_interval));
10038										memset(ival, 0, sizeof(ir_live_interval));
10039										ctx->live_intervals[ctx->vregs[use]] = ival;
10040										ival->type = insn->type;
10041										ival->reg = IR_REG_NONE;
10042										ival->vreg = ctx->vregs[use];
10043										ival->stack_spill_pos = stack_spill_pos;
10044									}
10045								} else if (use_insn->op == IR_VSTORE) {
10046									if (!IR_IS_CONST_REF(use_insn->op3)
10047									 && ctx->vregs[use_insn->op3]
10048									 && !ctx->live_intervals[ctx->vregs[use_insn->op3]]) {
10049										ir_live_interval *ival = ir_arena_alloc(&ctx->arena, sizeof(ir_live_interval));
10050										memset(ival, 0, sizeof(ir_live_interval));
10051										ctx->live_intervals[ctx->vregs[use_insn->op3]] = ival;
10052										ival->type = insn->type;
10053										ival->reg = IR_REG_NONE;
10054										ival->vreg = ctx->vregs[use_insn->op3];
10055										ival->stack_spill_pos = stack_spill_pos;
10056									}
10057								}
10058							}
10059						}
10060					}
10061
10062					insn_flags = ir_op_flags[insn->op];
10063					n = constraints.tmps_count;
10064					if (n) {
10065						do {
10066							n--;
10067							if (constraints.tmp_regs[n].type) {
10068								ir_reg reg = ir_get_free_reg(constraints.tmp_regs[n].type, available);
10069								ir_ref *ops = insn->ops;
10070								IR_REGSET_EXCL(available, reg);
10071								if (constraints.tmp_regs[n].num > 0
10072								 && IR_IS_CONST_REF(ops[constraints.tmp_regs[n].num])) {
10073									/* rematerialization */
10074									reg |= IR_REG_SPILL_LOAD;
10075								}
10076								ctx->regs[i][constraints.tmp_regs[n].num] = reg;
10077							} else if (constraints.tmp_regs[n].reg == IR_REG_SCRATCH) {
10078								available = IR_REGSET_DIFFERENCE(available, IR_REGSET_SCRATCH);
10079							} else {
10080								IR_REGSET_EXCL(available, constraints.tmp_regs[n].reg);
10081							}
10082						} while (n);
10083					}
10084					n = insn->inputs_count;
10085					for (j = 1, p = insn->ops + 1; j <= n; j++, p++) {
10086						ir_ref input = *p;
10087						if (IR_OPND_KIND(insn_flags, j) == IR_OPND_DATA && input > 0 && ctx->vregs[input]) {
10088							if ((def_flags & IR_DEF_REUSES_OP1_REG) && j == 1) {
10089								ir_reg reg = IR_REG_NUM(ctx->regs[i][0]);
10090								ctx->regs[i][1] = reg | IR_REG_SPILL_LOAD;
10091							} else {
10092								uint8_t use_flags = IR_USE_FLAGS(def_flags, j);
10093								ir_reg reg = (j < constraints.hints_count) ? constraints.hints[j] : IR_REG_NONE;
10094
10095								if (reg != IR_REG_NONE && IR_REGSET_IN(available, reg)) {
10096									IR_REGSET_EXCL(available, reg);
10097									ctx->regs[i][j] = reg | IR_REG_SPILL_LOAD;
10098								} else if (j > 1 && input == insn->op1 && ctx->regs[i][1] != IR_REG_NONE) {
10099									ctx->regs[i][j] = ctx->regs[i][1];
10100								} else if (use_flags & IR_USE_MUST_BE_IN_REG) {
10101									reg = ir_get_free_reg(ctx->ir_base[input].type, available);
10102									IR_REGSET_EXCL(available, reg);
10103									ctx->regs[i][j] = reg | IR_REG_SPILL_LOAD;
10104								}
10105							}
10106						}
10107					}
10108					break;
10109			}
10110			n = ir_insn_len(insn);
10111			i += n;
10112			insn += n;
10113			rule += n;
10114		}
10115		if (bb->flags & IR_BB_DESSA_MOVES) {
10116			data->dessa_from_block = b;
10117			ir_gen_dessa_moves(ctx, b, ir_fix_dessa_tmps);
10118		}
10119	}
10120
10121	ctx->used_preserved_regs = ctx->fixed_save_regset;
10122	ctx->flags |= IR_NO_STACK_COMBINE;
10123	ir_fix_stack_frame(ctx);
10124}
10125
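/* Find the largest outgoing argument area among all calls and reserve it
 * once in the prologue, instead of adjusting the stack pointer around every
 * call. */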
10126static void ir_preallocate_call_stack(ir_ctx *ctx)
10127{
10128	int call_stack_size, peak_call_stack_size = 0;
10129	ir_ref i, n;
10130	ir_insn *insn;
10131
10132	for (i = 1, insn = ctx->ir_base + 1; i < ctx->insns_count;) {
10133		if (insn->op == IR_CALL) {
10134			call_stack_size = ir_call_used_stack(ctx, insn);
10135			if (call_stack_size > peak_call_stack_size
10136#ifdef IR_HAVE_FASTCALL
10137			 && !ir_is_fastcall(ctx, insn) /* fast call functions restore stack pointer */
10138#endif
10139			) {
10140				peak_call_stack_size = call_stack_size;
10141			}
10142		}
10143		n = ir_insn_len(insn);
10144		i += n;
10145		insn += n;
10146	}
10147	if (peak_call_stack_size) {
10148		ctx->call_stack_size = peak_call_stack_size;
10149		ctx->flags |= IR_PREALLOCATED_STACK;
10150	}
10151}
10152
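/* Finalize the stack frame layout: reserve space for the va_arg register
 * save area and for preserved registers, enforce 16-byte alignment where
 * required (optionally preallocating the outgoing call stack), and fix the
 * parameter spill positions. */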
10153void ir_fix_stack_frame(ir_ctx *ctx)
10154{
10155	uint32_t additional_size = 0;
10156
10157	ctx->locals_area_size = ctx->stack_frame_size;
10158
10159#if defined(IR_TARGET_X64) && !defined(_WIN64)
10160	if ((ctx->flags & IR_VARARG_FUNC) && (ctx->flags2 & IR_HAS_VA_START)) {
10161		ctx->flags2 |= IR_16B_FRAME_ALIGNMENT;
10162		ctx->stack_frame_size = IR_ALIGNED_SIZE(ctx->stack_frame_size, 16);
10163		ctx->locals_area_size = ctx->stack_frame_size;
10164		if ((ctx->flags2 & (IR_HAS_VA_ARG_GP|IR_HAS_VA_COPY)) && ctx->gp_reg_params < IR_REG_INT_ARGS) {
10165			additional_size += sizeof(void*) * IR_REG_INT_ARGS;
10166		}
10167		if ((ctx->flags2 & (IR_HAS_VA_ARG_FP|IR_HAS_VA_COPY)) && ctx->fp_reg_params < IR_REG_FP_ARGS) {
10168			additional_size += 16 * IR_REG_FP_ARGS;
10169		}
10170	}
10171#endif
10172
10173	if (ctx->used_preserved_regs) {
10174		ir_regset used_preserved_regs = (ir_regset)ctx->used_preserved_regs;
10175		ir_reg reg;
10176		(void) reg;
10177
10178		IR_REGSET_FOREACH(used_preserved_regs, reg) {
10179			additional_size += sizeof(void*);
10180		} IR_REGSET_FOREACH_END();
10181	}
10182
10183	ctx->stack_frame_size = IR_ALIGNED_SIZE(ctx->stack_frame_size, sizeof(void*));
10184	ctx->stack_frame_size += additional_size;
10185	ctx->stack_frame_alignment = 0;
10186	ctx->call_stack_size = 0;
10187
10188	if (ctx->flags2 & IR_16B_FRAME_ALIGNMENT) {
10189		/* Stack must be 16 byte aligned */
10190		if (!(ctx->flags & IR_FUNCTION)) {
10191			while (IR_ALIGNED_SIZE(ctx->stack_frame_size, 16) != ctx->stack_frame_size) {
10192				ctx->stack_frame_size += sizeof(void*);
10193				ctx->stack_frame_alignment += sizeof(void*);
10194			}
10195		} else if (ctx->flags & IR_USE_FRAME_POINTER) {
10196			while (IR_ALIGNED_SIZE(ctx->stack_frame_size + sizeof(void*) * 2, 16) != ctx->stack_frame_size + sizeof(void*) * 2) {
10197				ctx->stack_frame_size += sizeof(void*);
10198				ctx->stack_frame_alignment += sizeof(void*);
10199			}
10200		} else {
10201			if (!(ctx->flags & IR_NO_STACK_COMBINE)) {
10202				ir_preallocate_call_stack(ctx);
10203			}
10204			while (IR_ALIGNED_SIZE(ctx->stack_frame_size + ctx->call_stack_size + sizeof(void*), 16) !=
10205					ctx->stack_frame_size + ctx->call_stack_size + sizeof(void*)) {
10206				ctx->stack_frame_size += sizeof(void*);
10207				ctx->stack_frame_alignment += sizeof(void*);
10208			}
10209		}
10210	}
10211
10212	ir_fix_param_spills(ctx);
10213}
10214
10215static void* dasm_labels[ir_lb_MAX];
10216
10217static uint32_t _ir_next_block(ir_ctx *ctx, uint32_t _b)
10218{
10219	uint32_t b = ctx->cfg_schedule[++_b];
10220
10221	/* skip empty (non-START) blocks */
10222	while (b && ((ctx->cfg_blocks[b].flags & (IR_BB_START|IR_BB_EMPTY)) == IR_BB_EMPTY)) {
10223		b = ctx->cfg_schedule[++_b];
10224	}
10225	return b;
10226}
10227
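/* The main code-generation pass: set up DynAsm, emit the prologue and the
 * parameter moves, then walk the basic blocks in schedule order and
 * dispatch every matched rule to its emitter. */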
10228void *ir_emit_code(ir_ctx *ctx, size_t *size_ptr)
10229{
10230	uint32_t _b, b, n, target;
10231	ir_block *bb;
10232	ir_ref i;
10233	ir_insn *insn;
10234	uint32_t *rule;
10235	ir_backend_data data;
10236	dasm_State **Dst;
10237	int ret;
10238	void *entry;
10239	size_t size;
10240
10241	data.ra_data.unused_slot_4 = 0;
10242	data.ra_data.unused_slot_2 = 0;
10243	data.ra_data.unused_slot_1 = 0;
10244	data.ra_data.handled = NULL;
10245	data.rodata_label = 0;
10246	data.jmp_table_label = 0;
10247	data.double_neg_const = 0;
10248	data.float_neg_const = 0;
10249	data.double_abs_const = 0;
10250	data.float_abs_const = 0;
10251	data.double_zero_const = 0;
10252	ctx->data = &data;
10253
10254	if (!ctx->live_intervals) {
10255		ctx->stack_frame_size = 0;
10256		ctx->stack_frame_alignment = 0;
10257		ctx->call_stack_size = 0;
10258		ctx->used_preserved_regs = 0;
10259		ir_allocate_unique_spill_slots(ctx);
10260	}
10261
10262	if (ctx->fixed_stack_frame_size != -1) {
10263		if (ctx->fixed_stack_red_zone) {
10264			IR_ASSERT(ctx->fixed_stack_red_zone == ctx->fixed_stack_frame_size + ctx->fixed_call_stack_size);
10265		}
10266		if (ctx->stack_frame_size > ctx->fixed_stack_frame_size) {
10267			// TODO: report error to caller
10268#ifdef IR_DEBUG_MESSAGES
10269			fprintf(stderr, "IR Compilation Aborted: ctx->stack_frame_size > ctx->fixed_stack_frame_size at %s:%d\n",
10270				__FILE__, __LINE__);
10271#endif
10272			ctx->data = NULL;
10273			ctx->status = IR_ERROR_FIXED_STACK_FRAME_OVERFLOW;
10274			return NULL;
10275		}
10276		ctx->stack_frame_size = ctx->fixed_stack_frame_size;
10277		ctx->call_stack_size = ctx->fixed_call_stack_size;
10278		ctx->stack_frame_alignment = 0;
10279	}
10280
10281	Dst = &data.dasm_state;
10282	data.dasm_state = NULL;
10283	dasm_init(&data.dasm_state, DASM_MAXSECTION);
10284	dasm_setupglobal(&data.dasm_state, dasm_labels, ir_lb_MAX);
10285	dasm_setup(&data.dasm_state, dasm_actions);
10286	/* labels for each block + for each constant + rodata label + jmp_table label + for each entry */
10287	dasm_growpc(&data.dasm_state, ctx->cfg_blocks_count + 1 + ctx->consts_count + 1 + 1 + 1 + ctx->entries_count);
10288	data.emit_constants = ir_bitset_malloc(ctx->consts_count);
10289
10290	if ((ctx->flags & IR_GEN_ENDBR) && (ctx->flags & IR_START_BR_TARGET)) {
10291		|.if X64
10292		|	endbr64
10293		|.else
10294		|	endbr32
10295		|.endif
10296	}
10297
10298	if (!(ctx->flags & IR_SKIP_PROLOGUE)) {
10299		ir_emit_prologue(ctx);
10300	}
10301	if (ctx->flags & IR_FUNCTION) {
10302		ir_emit_load_params(ctx);
10303	}
10304
10305	if (UNEXPECTED(!ctx->cfg_schedule)) {
10306		uint32_t *list = ctx->cfg_schedule = ir_mem_malloc(sizeof(uint32_t) * (ctx->cfg_blocks_count + 2));
10307		for (b = 0; b <= ctx->cfg_blocks_count; b++) {
10308			list[b] = b;
10309		}
10310		list[ctx->cfg_blocks_count + 1] = 0;
10311	}
10312
10313	for (_b = 1; _b <= ctx->cfg_blocks_count; _b++) {
10314		b = ctx->cfg_schedule[_b];
10315		bb = &ctx->cfg_blocks[b];
10316		IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
10317		if ((bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) == IR_BB_EMPTY) {
10318			continue;
10319		}
10320		if (bb->flags & IR_BB_ALIGN_LOOP) {
10321			|	.align IR_LOOP_ALIGNMENT
10322		}
10323		|=>b:
10324
10325		i = bb->start;
10326		insn = ctx->ir_base + i;
10327		if (bb->flags & IR_BB_ENTRY) {
10328			uint32_t label = ctx->cfg_blocks_count + ctx->consts_count + 4 + insn->op3;
10329
10330			|=>label:
10331			if ((ctx->flags & IR_GEN_ENDBR) && (ctx->flags & IR_ENTRY_BR_TARGET)) {
10332				|.if X64
10333				|	endbr64
10334				|.else
10335				|	endbr32
10336				|.endif
10337			}
10338			ir_emit_prologue(ctx);
10339			ctx->entries[insn->op3] = i;
10340		}
10341
10342		/* skip first instruction */
10343		n = ir_insn_len(insn);
10344		i += n;
10345		insn += n;
10346		rule = ctx->rules + i;
10347
10348		while (i <= bb->end) {
10349			if (!((*rule) & (IR_FUSED|IR_SKIPPED)))
10350			switch ((*rule) & IR_RULE_MASK) {
10351				case IR_VAR:
10352				case IR_PARAM:
10353				case IR_PI:
10354				case IR_PHI:
10355				case IR_SNAPSHOT:
10356				case IR_VA_END:
10357					break;
10358				case IR_LEA_OB:
10359				case IR_LEA_SI:
10360				case IR_LEA_SIB:
10361				case IR_LEA_IB:
10362				case IR_LEA_OB_I:
10363				case IR_LEA_I_OB:
10364				case IR_LEA_SI_O:
10365				case IR_LEA_SIB_O:
10366				case IR_LEA_IB_O:
10367				case IR_LEA_OB_SI:
10368				case IR_LEA_SI_OB:
10369				case IR_LEA_B_SI:
10370				case IR_LEA_SI_B:
10371					ir_emit_lea(ctx, i, insn->type);
10372					break;
10373				case IR_MUL_PWR2:
10374				case IR_DIV_PWR2:
10375				case IR_MOD_PWR2:
10376					ir_emit_mul_div_mod_pwr2(ctx, i, insn);
10377					break;
10378				case IR_SDIV_PWR2:
10379					ir_emit_sdiv_pwr2(ctx, i, insn);
10380					break;
10381				case IR_SMOD_PWR2:
10382					ir_emit_smod_pwr2(ctx, i, insn);
10383					break;
10384				case IR_SHIFT:
10385					ir_emit_shift(ctx, i, insn);
10386					break;
10387				case IR_SHIFT_CONST:
10388					ir_emit_shift_const(ctx, i, insn);
10389					break;
10390				case IR_BIT_COUNT:
10391					ir_emit_bit_count(ctx, i, insn);
10392					break;
10393				case IR_CTPOP:
10394					ir_emit_ctpop(ctx, i, insn);
10395					break;
10396				case IR_INC:
10397				case IR_DEC:
10398				case IR_OP_INT:
10399					ir_emit_op_int(ctx, i, insn, *rule);
10400					break;
10401				case IR_ABS_INT:
10402					ir_emit_abs_int(ctx, i, insn);
10403					break;
10404				case IR_BOOL_NOT_INT:
10405					ir_emit_bool_not_int(ctx, i, insn);
10406					break;
10407				case IR_OP_FP:
10408					ir_emit_op_fp(ctx, i, insn);
10409					break;
10410				case IR_IMUL3:
10411					ir_emit_imul3(ctx, i, insn);
10412					break;
10413				case IR_BINOP_INT:
10414					ir_emit_binop_int(ctx, i, insn);
10415					break;
10416				case IR_BINOP_SSE2:
10417					ir_emit_binop_sse2(ctx, i, insn);
10418					break;
10419				case IR_BINOP_AVX:
10420					ir_emit_binop_avx(ctx, i, insn);
10421					break;
10422				case IR_MUL_INT:
10423				case IR_DIV_INT:
10424				case IR_MOD_INT:
10425					ir_emit_mul_div_mod(ctx, i, insn);
10426					break;
10427				case IR_CMP_INT:
10428					ir_emit_cmp_int(ctx, i, insn);
10429					break;
10430				case IR_TESTCC_INT:
10431					ir_emit_testcc_int(ctx, i, insn);
10432					break;
10433				case IR_SETCC_INT:
10434					ir_emit_setcc_int(ctx, i, insn);
10435					break;
10436				case IR_CMP_FP:
10437					ir_emit_cmp_fp(ctx, i, insn);
10438					break;
10439				case IR_SEXT:
10440					ir_emit_sext(ctx, i, insn);
10441					break;
10442				case IR_ZEXT:
10443					ir_emit_zext(ctx, i, insn);
10444					break;
10445				case IR_TRUNC:
10446					ir_emit_trunc(ctx, i, insn);
10447					break;
10448				case IR_BITCAST:
10449				case IR_PROTO:
10450					ir_emit_bitcast(ctx, i, insn);
10451					break;
10452				case IR_INT2FP:
10453					ir_emit_int2fp(ctx, i, insn);
10454					break;
10455				case IR_FP2INT:
10456					ir_emit_fp2int(ctx, i, insn);
10457					break;
10458				case IR_FP2FP:
10459					ir_emit_fp2fp(ctx, i, insn);
10460					break;
10461				case IR_COPY_INT:
10462					ir_emit_copy_int(ctx, i, insn);
10463					break;
10464				case IR_COPY_FP:
10465					ir_emit_copy_fp(ctx, i, insn);
10466					break;
10467				case IR_CMP_AND_STORE_INT:
10468					ir_emit_cmp_and_store_int(ctx, i, insn);
10469					break;
10470				case IR_CMP_AND_BRANCH_INT:
10471					ir_emit_cmp_and_branch_int(ctx, b, i, insn, _ir_next_block(ctx, _b));
10472					break;
10473				case IR_CMP_AND_BRANCH_FP:
10474					ir_emit_cmp_and_branch_fp(ctx, b, i, insn, _ir_next_block(ctx, _b));
10475					break;
10476				case IR_TEST_AND_BRANCH_INT:
10477					ir_emit_test_and_branch_int(ctx, b, i, insn, _ir_next_block(ctx, _b));
10478					break;
10479				case IR_JCC_INT:
10480					{
10481						ir_op op = ctx->ir_base[insn->op2].op;
10482
						if (op == IR_ADD ||
						    op == IR_SUB ||
//						    op == IR_MUL ||
						    op == IR_OR  ||
						    op == IR_AND ||
						    op == IR_XOR) {
							op = IR_NE;
						} else {
							IR_ASSERT(op >= IR_EQ && op <= IR_UGT);
						}
						ir_emit_jcc(ctx, b, i, insn, _ir_next_block(ctx, _b), op, 1);
					}
					break;
				case IR_GUARD_CMP_INT:
					if (ir_emit_guard_cmp_int(ctx, b, i, insn, _ir_next_block(ctx, _b))) {
						goto next_block;
					}
					break;
				case IR_GUARD_CMP_FP:
					if (ir_emit_guard_cmp_fp(ctx, b, i, insn, _ir_next_block(ctx, _b))) {
						goto next_block;
					}
					break;
				case IR_GUARD_TEST_INT:
					if (ir_emit_guard_test_int(ctx, b, i, insn, _ir_next_block(ctx, _b))) {
						goto next_block;
					}
					break;
				case IR_GUARD_JCC_INT:
					if (ir_emit_guard_jcc_int(ctx, b, i, insn, _ir_next_block(ctx, _b))) {
						goto next_block;
					}
					break;
				case IR_IF_INT:
					ir_emit_if_int(ctx, b, i, insn, _ir_next_block(ctx, _b));
					break;
				case IR_COND:
					ir_emit_cond(ctx, i, insn);
					break;
				case IR_COND_CMP_INT:
					ir_emit_cond_cmp_int(ctx, i, insn);
					break;
				case IR_COND_CMP_FP:
					ir_emit_cond_cmp_fp(ctx, i, insn);
					break;
				case IR_SWITCH:
					ir_emit_switch(ctx, b, i, insn);
					break;
				case IR_MIN_MAX_INT:
					ir_emit_min_max_int(ctx, i, insn);
					break;
				case IR_OVERFLOW:
					ir_emit_overflow(ctx, i, insn);
					break;
				case IR_OVERFLOW_AND_BRANCH:
					ir_emit_overflow_and_branch(ctx, b, i, insn, _ir_next_block(ctx, _b));
					break;
				case IR_END:
				case IR_LOOP_END:
					if (bb->flags & IR_BB_OSR_ENTRY_LOADS) {
						ir_emit_osr_entry_loads(ctx, b, bb);
					}
					if (bb->flags & IR_BB_DESSA_MOVES) {
						ir_emit_dessa_moves(ctx, b, bb);
					}
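					/* Select the fall-through successor. A block may have a second
					 * successor that is an OSR ENTRY block entered from outside the
					 * generated code; skip it here. */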
					do {
						ir_ref succ = ctx->cfg_edges[bb->successors];

						if (UNEXPECTED(bb->successors_count == 2)) {
							if (ctx->cfg_blocks[succ].flags & IR_BB_ENTRY) {
								succ = ctx->cfg_edges[bb->successors + 1];
							} else {
								IR_ASSERT(ctx->cfg_blocks[ctx->cfg_edges[bb->successors + 1]].flags & IR_BB_ENTRY);
							}
						} else {
							IR_ASSERT(bb->successors_count == 1);
						}
						target = ir_skip_empty_target_blocks(ctx, succ);
						if (target != _ir_next_block(ctx, _b)) {
							|	jmp =>target
						}
					} while (0);
					break;
				case IR_RETURN_VOID:
					ir_emit_return_void(ctx);
					break;
				case IR_RETURN_INT:
					ir_emit_return_int(ctx, i, insn);
					break;
				case IR_RETURN_FP:
					ir_emit_return_fp(ctx, i, insn);
					break;
				case IR_CALL:
					ir_emit_call(ctx, i, insn);
					break;
				case IR_TAILCALL:
					ir_emit_tailcall(ctx, i, insn);
					break;
				case IR_IJMP:
					ir_emit_ijmp(ctx, i, insn);
					break;
				case IR_MEM_OP_INT:
				case IR_MEM_INC:
				case IR_MEM_DEC:
					ir_emit_mem_op_int(ctx, i, insn, *rule);
					break;
				case IR_MEM_BINOP_INT:
					ir_emit_mem_binop_int(ctx, i, insn);
					break;
				case IR_MEM_MUL_PWR2:
				case IR_MEM_DIV_PWR2:
				case IR_MEM_MOD_PWR2:
					ir_emit_mem_mul_div_mod_pwr2(ctx, i, insn);
					break;
				case IR_MEM_SHIFT:
					ir_emit_mem_shift(ctx, i, insn);
					break;
				case IR_MEM_SHIFT_CONST:
					ir_emit_mem_shift_const(ctx, i, insn);
					break;
				case IR_REG_BINOP_INT:
					ir_emit_reg_binop_int(ctx, i, insn);
					break;
				case IR_VADDR:
					ir_emit_vaddr(ctx, i, insn);
					break;
				case IR_VLOAD:
					ir_emit_vload(ctx, i, insn);
					break;
				case IR_VSTORE_INT:
					ir_emit_vstore_int(ctx, i, insn);
					break;
				case IR_VSTORE_FP:
					ir_emit_vstore_fp(ctx, i, insn);
					break;
				case IR_RLOAD:
					ir_emit_rload(ctx, i, insn);
					break;
				case IR_RSTORE:
					ir_emit_rstore(ctx, i, insn);
					break;
				case IR_LOAD_INT:
					ir_emit_load_int(ctx, i, insn);
					break;
				case IR_LOAD_FP:
					ir_emit_load_fp(ctx, i, insn);
					break;
				case IR_STORE_INT:
					ir_emit_store_int(ctx, i, insn);
					break;
				case IR_STORE_FP:
					ir_emit_store_fp(ctx, i, insn);
					break;
				case IR_ALLOCA:
					ir_emit_alloca(ctx, i, insn);
					break;
				case IR_VA_START:
					ir_emit_va_start(ctx, i, insn);
					break;
				case IR_VA_COPY:
					ir_emit_va_copy(ctx, i, insn);
					break;
				case IR_VA_ARG:
					ir_emit_va_arg(ctx, i, insn);
					break;
				case IR_AFREE:
					ir_emit_afree(ctx, i, insn);
					break;
				case IR_BLOCK_BEGIN:
					ir_emit_block_begin(ctx, i, insn);
					break;
				case IR_BLOCK_END:
					ir_emit_block_end(ctx, i, insn);
					break;
				case IR_FRAME_ADDR:
					ir_emit_frame_addr(ctx, i);
					break;
				case IR_EXITCALL:
					ir_emit_exitcall(ctx, i, insn);
					break;
				case IR_GUARD:
				case IR_GUARD_NOT:
					if (ir_emit_guard(ctx, b, i, insn, _ir_next_block(ctx, _b))) {
						goto next_block;
					}
					break;
				case IR_GUARD_OVERFLOW:
					if (ir_emit_guard_overflow(ctx, b, i, insn)) {
						goto next_block;
					}
					break;
				case IR_SSE_SQRT:
					ir_emit_sse_sqrt(ctx, i, insn);
					break;
				case IR_SSE_RINT:
					ir_emit_sse_round(ctx, i, insn, 4);  /* imm8: round using MXCSR mode */
					break;
				case IR_SSE_FLOOR:
					ir_emit_sse_round(ctx, i, insn, 9);  /* imm8: round down, suppress inexact */
					break;
				case IR_SSE_CEIL:
					ir_emit_sse_round(ctx, i, insn, 10); /* imm8: round up, suppress inexact */
					break;
				case IR_SSE_TRUNC:
					ir_emit_sse_round(ctx, i, insn, 11); /* imm8: round toward zero, suppress inexact */
					break;
				case IR_SSE_NEARBYINT:
					ir_emit_sse_round(ctx, i, insn, 12); /* imm8: round using MXCSR mode, suppress inexact */
					break;
				case IR_TLS:
					ir_emit_tls(ctx, i, insn);
					break;
				case IR_TRAP:
					|	int3
					break;
				default:
					IR_ASSERT(0 && "NIY rule/instruction");
					ir_mem_free(data.emit_constants);
					dasm_free(&data.dasm_state);
					ctx->data = NULL;
					ctx->status = IR_ERROR_UNSUPPORTED_CODE_RULE;
					return NULL;
			}
			n = ir_insn_len(insn);
			i += n;
			insn += n;
			rule += n;
		}
next_block:;
	}

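	/* Emit the FP and string constants collected during code generation into
	 * the .rodata section. Constant i gets the label (cfg_blocks_count + i)
	 * that the emitted code references. */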
	if (data.rodata_label) {
		|.rodata
	}
	IR_BITSET_FOREACH(data.emit_constants, ir_bitset_len(ctx->consts_count), i) {
		insn = &ctx->ir_base[-i];
		if (IR_IS_TYPE_FP(insn->type)) {
			int label = ctx->cfg_blocks_count + i;

			if (!data.rodata_label) {
				data.rodata_label = ctx->cfg_blocks_count + ctx->consts_count + 2;

				|.rodata
				|=>data.rodata_label:
			}
			if (insn->type == IR_DOUBLE) {
				|.align 8
				|=>label:
				|.dword insn->val.u32, insn->val.u32_hi
			} else {
				IR_ASSERT(insn->type == IR_FLOAT);
				|.align 4
				|=>label:
				|.dword insn->val.u32
			}
		} else if (insn->op == IR_STR) {
			int label = ctx->cfg_blocks_count + i;
			const char *str = ir_get_str(ctx, insn->val.str);
			int j = 0;

			if (!data.rodata_label) {
				data.rodata_label = ctx->cfg_blocks_count + ctx->consts_count + 2;

				|.rodata
				|=>data.rodata_label:
			}
			|.align 8
			|=>label:
			while (str[j]) {
				char c = str[j];

				|.byte c
				j++;
			}
			}
			|.byte 0

		} else {
			IR_ASSERT(0);
		}
	} IR_BITSET_FOREACH_END();
	if (data.rodata_label) {
		|.code
	}
	ir_mem_free(data.emit_constants);

	if (ctx->status) {
		dasm_free(&data.dasm_state);
		ctx->data = NULL;
		return NULL;
	}

	ret = dasm_link(&data.dasm_state, size_ptr);
	if (ret != DASM_S_OK) {
		IR_ASSERT(0);
		dasm_free(&data.dasm_state);
		ctx->data = NULL;
		ctx->status = IR_ERROR_LINK;
		return NULL;
	}
	size = *size_ptr;

	if (ctx->code_buffer) {
		entry = ctx->code_buffer->pos;
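		/* Align the function entry point in the code buffer to 16 bytes. */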
		entry = (void*)IR_ALIGNED_SIZE(((size_t)(entry)), 16);
		if (size > (size_t)((char*)ctx->code_buffer->end - (char*)entry)) {
			dasm_free(&data.dasm_state);
			ctx->data = NULL;
			ctx->status = IR_ERROR_CODE_MEM_OVERFLOW;
			return NULL;
		}
		ctx->code_buffer->pos = (char*)entry + size;
	} else {
		entry = ir_mem_mmap(size);
		if (!entry) {
			dasm_free(&data.dasm_state);
			ctx->data = NULL;
			ctx->status = IR_ERROR_CODE_MEM_OVERFLOW;
			return NULL;
		}
		ir_mem_unprotect(entry, size);
	}

	ret = dasm_encode(&data.dasm_state, entry);
	if (ret != DASM_S_OK) {
		IR_ASSERT(0);
		dasm_free(&data.dasm_state);
		if (ctx->code_buffer) {
			if (ctx->code_buffer->pos == (char*)entry + size) {
				/* rollback the space reserved in the code buffer */
				ctx->code_buffer->pos = (char*)entry;
			}
		} else {
			ir_mem_unmap(entry, size);
		}
		ctx->data = NULL;
		ctx->status = IR_ERROR_ENCODE;
		return NULL;
	}

	if (data.jmp_table_label) {
		uint32_t offset = dasm_getpclabel(&data.dasm_state, data.jmp_table_label);
		ctx->jmp_table_offset = offset;
	} else {
		ctx->jmp_table_offset = 0;
	}
	if (data.rodata_label) {
		uint32_t offset = dasm_getpclabel(&data.dasm_state, data.rodata_label);
		ctx->rodata_offset = offset;
	} else {
		ctx->rodata_offset = 0;
	}

	if (ctx->entries_count) {
		/* For each ENTRY, replace op3 (the entry number used to form its dasm
		 * label) with the entry point's byte offset in the generated code. */
		i = ctx->entries_count;
		do {
			ir_insn *insn = &ctx->ir_base[ctx->entries[--i]];
			uint32_t offset = dasm_getpclabel(&data.dasm_state, ctx->cfg_blocks_count + ctx->consts_count + 4 + insn->op3);
			insn->op3 = offset;
		} while (i != 0);
	}

	dasm_free(&data.dasm_state);

	ir_mem_flush(entry, size);

#if defined(__GNUC__)
	if ((ctx->flags & IR_GEN_CACHE_DEMOTE) && (ctx->mflags & IR_X86_CLDEMOTE)) {
		uintptr_t start = (uintptr_t)entry;
		uintptr_t p = start & ~(uintptr_t)0x3F;

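		/* CLDEMOTE works on 64-byte cache lines: demote every line that
		 * overlaps the emitted code. */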
		do {
			/* _cldemote(p); emitted as raw bytes (cldemote [rsi]); the "S"
			 * constraint places p in rsi/esi */
			asm volatile(".byte 0x0f, 0x1c, 0x06" :: "S" (p));
			p += 64;
		} while (p < start + size);
	}
#endif

	if (!ctx->code_buffer) {
		ir_mem_protect(entry, size);
	}

	ctx->data = NULL;
	return entry;
}

const void *ir_emit_exitgroup(uint32_t first_exit_point, uint32_t exit_points_per_group, const void *exit_addr, ir_code_buffer *code_buffer, size_t *size_ptr)
{
	void *entry;
	size_t size;
	uint32_t i;
	dasm_State **Dst, *dasm_state;
	int ret;

	IR_ASSERT(code_buffer);
	IR_ASSERT(sizeof(void*) == 4 || IR_MAY_USE_32BIT_ADDR(code_buffer, exit_addr));

	Dst = &dasm_state;
	dasm_state = NULL;
	dasm_init(&dasm_state, DASM_MAXSECTION);
	dasm_setupglobal(&dasm_state, dasm_labels, ir_lb_MAX);
	dasm_setup(&dasm_state, dasm_actions);

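	/* Each exit point is a 4-byte stub: "push byte i" (2 bytes) followed by a
	 * short jmp (2 bytes) to the common tail, so the rel8 displacement of stub
	 * i is 4*(exit_points_per_group-i)-6. The last stub falls through to the
	 * tail, which biases the pushed index by first_exit_point and jumps to
	 * exit_addr. */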
	for (i = 0; i < exit_points_per_group - 1; i++) {
		|	push byte i
		|	.byte 0xeb, (4*(exit_points_per_group-i)-6) // jmp >1
	}
	|	push byte i
	|// 1:
	|	add aword [r4], first_exit_point
	|	jmp aword &exit_addr

	ret = dasm_link(&dasm_state, &size);
	if (ret != DASM_S_OK) {
		IR_ASSERT(0);
		dasm_free(&dasm_state);
		return NULL;
	}

	entry = code_buffer->pos;
	entry = (void*)IR_ALIGNED_SIZE(((size_t)(entry)), 16);
	if (size > (size_t)((char*)code_buffer->end - (char*)entry)) {
		dasm_free(&dasm_state);
		return NULL;
	}
	code_buffer->pos = (char*)entry + size;

	ret = dasm_encode(&dasm_state, entry);
	if (ret != DASM_S_OK) {
		IR_ASSERT(0);
		dasm_free(&dasm_state);
		if (code_buffer->pos == (char*)entry + size) {
			/* rollback the space reserved in the code buffer */
			code_buffer->pos = (char*)entry;
		}
		return NULL;
	}

	dasm_free(&dasm_state);

	ir_mem_flush(entry, size);

	*size_ptr = size;
	return entry;
}

bool ir_needs_thunk(ir_code_buffer *code_buffer, void *addr)
{
	return sizeof(void*) == 8 && !IR_MAY_USE_32BIT_ADDR(code_buffer, addr);
}

void *ir_emit_thunk(ir_code_buffer *code_buffer, void *addr, size_t *size_ptr)
{
	void *entry;
	size_t size;
	dasm_State **Dst, *dasm_state;
	int ret;

	Dst = &dasm_state;
	dasm_state = NULL;
	dasm_init(&dasm_state, DASM_MAXSECTION);
	dasm_setupglobal(&dasm_state, dasm_labels, ir_lb_MAX);
	dasm_setup(&dasm_state, dasm_actions);

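	/* On x86_64 the thunk is an indirect jmp through an inline 8-byte address
	 * slot placed right after the instruction, so it can reach any target; on
	 * x86 a direct rel32 jmp is always sufficient. */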
	|.code
	|.if X64
	|	jmp aword [>1]
	|1:
	|	.aword &addr
	|.else
	|	jmp &addr
	|.endif

	ret = dasm_link(&dasm_state, &size);
	if (ret != DASM_S_OK) {
		IR_ASSERT(0);
		dasm_free(&dasm_state);
		return NULL;
	}

	if (size > (size_t)((char*)code_buffer->end - (char*)code_buffer->pos)) {
		dasm_free(&dasm_state);
		return NULL;
	}

	entry = code_buffer->pos;
	ret = dasm_encode(&dasm_state, entry);
	if (ret != DASM_S_OK) {
		dasm_free(&dasm_state);
		return NULL;
	}

	*size_ptr = size;
	code_buffer->pos = (char*)code_buffer->pos + size;

	dasm_free(&dasm_state);
	ir_mem_flush(entry, size);

	return entry;
}

void ir_fix_thunk(void *thunk_entry, void *addr)
{
	unsigned char *code = thunk_entry;

	if (sizeof(void*) == 8 && !IR_IS_SIGNED_32BIT(((unsigned char*)addr - (code + 5)))) {
		int32_t *offset_ptr;
		void **addr_ptr;

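		/* Far target on x86_64: patch the 8-byte address slot of the
		 * "jmp aword [rip+disp32]" thunk (FF 25 <disp32>); the slot is located
		 * at code + 6 + disp32, counting from the end of the 6-byte jmp. */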
		IR_ASSERT(code[0] == 0xff && code[1] == 0x25);
		offset_ptr = (int32_t*)(code + 2);
		addr_ptr = (void**)(code + 6 + *offset_ptr);
		*addr_ptr = addr;
	} else {
		int32_t *addr_ptr;

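		/* The target is reachable with rel32: overwrite the thunk with a
		 * direct "jmp rel32" (E9 <rel32>), relative to the end of the 5-byte
		 * instruction. */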
		code[0] = 0xe9;
		addr_ptr = (int32_t*)(code + 1);
		*addr_ptr = (int32_t)(intptr_t)(void*)((unsigned char*)addr - (code + 5));
	}
}