xref: /php-src/ext/opcache/jit/ir/ir_x86.dasc (revision 8e4363de)
/*
 * IR - Lightweight JIT Compilation Framework
 * (x86/x86_64 native code generator based on DynAsm)
 * Copyright (C) 2022 Zend by Perforce.
 * Authors: Dmitry Stogov <dmitry@php.net>
 */

|.if X64
|.arch x64
|.else
|.arch x86
|.endif

|.actionlist dasm_actions
|.globals ir_lb
|.section code, cold_code, rodata, jmp_table

|.define IR_LOOP_ALIGNMENT, 16

#ifdef IR_DEBUG
typedef struct _ir_mem {uint64_t v;} ir_mem;

# define IR_MEM_VAL(loc)            ((loc).v)
#else
typedef uint64_t ir_mem;

# define IR_MEM_VAL(loc)            (loc)
#endif

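/*
 * An ir_mem value packs a complete x86 addressing mode into 64 bits:
 * bits 0..31 hold the signed displacement, bits 32..39 the base register,
 * bits 40..47 the index register and bits 48..55 the scale (1, 2, 4 or 8).
 */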
#define IR_MEM_OFFSET(loc)          ((int32_t)(IR_MEM_VAL(loc) & 0xffffffff))
#define IR_MEM_BASE(loc)            ((ir_reg)((IR_MEM_VAL(loc) >> 32) & 0xff))
#define IR_MEM_INDEX(loc)           ((ir_reg)((IR_MEM_VAL(loc) >> 40) & 0xff))
#define IR_MEM_SCALE(loc)           ((int32_t)((IR_MEM_VAL(loc) >> 48) & 0xff))

#define IR_MEM_O(addr)            IR_MEM(IR_REG_NONE, addr, IR_REG_NONE, 1)
#define IR_MEM_B(base)            IR_MEM(base, 0, IR_REG_NONE, 1)
#define IR_MEM_BO(base, offset)   IR_MEM(base, offset, IR_REG_NONE, 1)

IR_ALWAYS_INLINE ir_mem IR_MEM(ir_reg base, int32_t offset, ir_reg index, int32_t scale)
{
	ir_mem mem;
	IR_ASSERT(base == IR_REG_NONE || (base >= IR_REG_GP_FIRST && base <= IR_REG_GP_LAST));
	IR_ASSERT(index == IR_REG_NONE || (index >= IR_REG_GP_FIRST && index <= IR_REG_GP_LAST));
	IR_ASSERT(scale == 1 || scale == 2 || scale == 4 || scale == 8);
#ifdef IR_DEBUG
	mem.v =
#else
	mem =
#endif
		((uint64_t)(uint32_t)offset |
		((uint64_t)(uint8_t)base << 32) |
		((uint64_t)(uint8_t)index << 40) |
		((uint64_t)(uint8_t)scale << 48));
	return mem;
}
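/* e.g. IR_MEM_BO(IR_REG_RAX, 8) encodes [rax+8], and
 * IR_MEM(IR_REG_RAX, 16, IR_REG_RCX, 4) encodes [rax+rcx*4+16] */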

#define IR_IS_SIGNED_32BIT(val)     ((((intptr_t)(val)) <= 0x7fffffff) && (((intptr_t)(val)) >= (-2147483647 - 1)))
#define IR_IS_SIGNED_NEG_32BIT(val) ((((intptr_t)(val)) <= 0x7fffffff) && (((intptr_t)(val)) >= -2147483647))
#define IR_IS_UNSIGNED_32BIT(val)   (((uintptr_t)(val)) <= 0xffffffff)
#define IR_IS_32BIT(type, val)      (IR_IS_TYPE_SIGNED(type) ? IR_IS_SIGNED_32BIT((val).i64) : IR_IS_UNSIGNED_32BIT((val).u64))
#define IR_IS_FP_ZERO(insn)         ((insn.type == IR_DOUBLE) ? (insn.val.u64 == 0) : (insn.val.u32 == 0))
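/* An address may be referenced through a 32-bit displacement only if it is
 * reachable within +/-2GB from any position in the code buffer, hence the
 * check against both the start and the end of the buffer. */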
#define IR_MAY_USE_32BIT_ADDR(code_buffer, addr) \
	((code_buffer) && \
	IR_IS_SIGNED_32BIT((char*)(addr) - (char*)(code_buffer)->start) && \
	IR_IS_SIGNED_32BIT((char*)(addr) - ((char*)(code_buffer)->end)))

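/* With a frame pointer, spill slots are addressed by negative offsets from
 * the frame base; otherwise they are addressed relative to the stack
 * pointer, above the area reserved for outgoing call arguments. */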
#define IR_SPILL_POS_TO_OFFSET(offset) \
	((ctx->flags & IR_USE_FRAME_POINTER) ? \
		((offset) - (ctx->stack_frame_size - ctx->stack_frame_alignment)) : \
		((offset) + ctx->call_stack_size))

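/*
 * In the ASM_* macros below, "||" lines are emitted as C code while "|"
 * lines are DynAsm instruction templates. DynAsm encodes each template's
 * addressing mode at pre-processing time, so every base/index/scale
 * combination needs its own template, selected by the run-time C branches.
 */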
|.macro ASM_EXPAND_OP_MEM, MACRO, op, type, op1
||	do {
||		int32_t offset = IR_MEM_OFFSET(op1);
||		int32_t base = IR_MEM_BASE(op1);
||		int32_t index = IR_MEM_INDEX(op1);
||		int32_t scale = IR_MEM_SCALE(op1);
||		if (index == IR_REG_NONE) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, [offset]
||			} else {
|				MACRO op, type, [Ra(base)+offset]
||			}
||		} else if (scale == 8) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, [Ra(index)*8+offset]
||			} else {
|				MACRO op, type, [Ra(base)+Ra(index)*8+offset]
||			}
||		} else if (scale == 4) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, [Ra(index)*4+offset]
||			} else {
|				MACRO op, type, [Ra(base)+Ra(index)*4+offset]
||			}
||		} else if (scale == 2) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, [Ra(index)*2+offset]
||			} else {
|				MACRO op, type, [Ra(base)+Ra(index)*2+offset]
||			}
||		} else {
||			IR_ASSERT(scale == 1);
||			if (base == IR_REG_NONE) {
|				MACRO op, type, [Ra(index)+offset]
||			} else {
|				MACRO op, type, [Ra(base)+Ra(index)+offset]
||			}
||		}
||	} while (0);
|.endmacro

|.macro ASM_EXPAND_OP1_MEM, MACRO, op, type, op1, op2
||	do {
||		int32_t offset = IR_MEM_OFFSET(op1);
||		int32_t base = IR_MEM_BASE(op1);
||		int32_t index = IR_MEM_INDEX(op1);
||		int32_t scale = IR_MEM_SCALE(op1);
||		if (index == IR_REG_NONE) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, [offset], op2
||			} else {
|				MACRO op, type, [Ra(base)+offset], op2
||			}
||		} else if (scale == 8) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, [Ra(index)*8+offset], op2
||			} else {
|				MACRO op, type, [Ra(base)+Ra(index)*8+offset], op2
||			}
||		} else if (scale == 4) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, [Ra(index)*4+offset], op2
||			} else {
|				MACRO op, type, [Ra(base)+Ra(index)*4+offset], op2
||			}
||		} else if (scale == 2) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, [Ra(index)*2+offset], op2
||			} else {
|				MACRO op, type, [Ra(base)+Ra(index)*2+offset], op2
||			}
||		} else {
||			IR_ASSERT(scale == 1);
||			if (base == IR_REG_NONE) {
|				MACRO op, type, [Ra(index)+offset], op2
||			} else {
|				MACRO op, type, [Ra(base)+Ra(index)+offset], op2
||			}
||		}
||	} while (0);
|.endmacro

|.macro ASM_EXPAND_OP2_MEM, MACRO, op, type, op1, op2
||	do {
||		int32_t offset = IR_MEM_OFFSET(op2);
||		int32_t base = IR_MEM_BASE(op2);
||		int32_t index = IR_MEM_INDEX(op2);
||		int32_t scale = IR_MEM_SCALE(op2);
||		if (index == IR_REG_NONE) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, [offset]
||			} else {
|				MACRO op, type, op1, [Ra(base)+offset]
||			}
||		} else if (scale == 8) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, [Ra(index)*8+offset]
||			} else {
|				MACRO op, type, op1, [Ra(base)+Ra(index)*8+offset]
||			}
||		} else if (scale == 4) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, [Ra(index)*4+offset]
||			} else {
|				MACRO op, type, op1, [Ra(base)+Ra(index)*4+offset]
||			}
||		} else if (scale == 2) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, [Ra(index)*2+offset]
||			} else {
|				MACRO op, type, op1, [Ra(base)+Ra(index)*2+offset]
||			}
||		} else {
||			IR_ASSERT(scale == 1);
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, [Ra(index)+offset]
||			} else {
|				MACRO op, type, op1, [Ra(base)+Ra(index)+offset]
||			}
||		}
||	} while (0);
|.endmacro

|.macro ASM_EXPAND_OP2_MEM_3, MACRO, op, type, op1, op2, op3
||	do {
||		int32_t offset = IR_MEM_OFFSET(op2);
||		int32_t base = IR_MEM_BASE(op2);
||		int32_t index = IR_MEM_INDEX(op2);
||		int32_t scale = IR_MEM_SCALE(op2);
||		if (index == IR_REG_NONE) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, [offset], op3
||			} else {
|				MACRO op, type, op1, [Ra(base)+offset], op3
||			}
||		} else if (scale == 8) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, [Ra(index)*8+offset], op3
||			} else {
|				MACRO op, type, op1, [Ra(base)+Ra(index)*8+offset], op3
||			}
||		} else if (scale == 4) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, [Ra(index)*4+offset], op3
||			} else {
|				MACRO op, type, op1, [Ra(base)+Ra(index)*4+offset], op3
||			}
||		} else if (scale == 2) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, [Ra(index)*2+offset], op3
||			} else {
|				MACRO op, type, op1, [Ra(base)+Ra(index)*2+offset], op3
||			}
||		} else {
||			IR_ASSERT(scale == 1);
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, [Ra(index)+offset], op3
||			} else {
|				MACRO op, type, op1, [Ra(base)+Ra(index)+offset], op3
||			}
||		}
||	} while (0);
|.endmacro

|.macro ASM_EXPAND_OP3_MEM, MACRO, op, type, op1, op2, op3
||	do {
||		int32_t offset = IR_MEM_OFFSET(op3);
||		int32_t base = IR_MEM_BASE(op3);
||		int32_t index = IR_MEM_INDEX(op3);
||		int32_t scale = IR_MEM_SCALE(op3);
||		if (index == IR_REG_NONE) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, op2, [offset]
||			} else {
|				MACRO op, type, op1, op2, [Ra(base)+offset]
||			}
||		} else if (scale == 8) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, op2, [Ra(index)*8+offset]
||			} else {
|				MACRO op, type, op1, op2, [Ra(base)+Ra(index)*8+offset]
||			}
||		} else if (scale == 4) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, op2, [Ra(index)*4+offset]
||			} else {
|				MACRO op, type, op1, op2, [Ra(base)+Ra(index)*4+offset]
||			}
||		} else if (scale == 2) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, op2, [Ra(index)*2+offset]
||			} else {
|				MACRO op, type, op1, op2, [Ra(base)+Ra(index)*2+offset]
||			}
||		} else {
||			IR_ASSERT(scale == 1);
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, op2, [Ra(index)+offset]
||			} else {
|				MACRO op, type, op1, op2, [Ra(base)+Ra(index)+offset]
||			}
||		}
||	} while (0);
|.endmacro

|.macro ASM_EXPAND_TYPE_MEM, op, type, op1
||	switch (ir_type_size[type]) {
||		default:
||			IR_ASSERT(0);
||		case 1:
|			op byte op1
||			break;
||		case 2:
|			op word op1
||			break;
||		case 4:
|			op dword op1
||			break;
|.if X64
||		case 8:
|			op qword op1
||			break;
|.endif
||	}
|.endmacro

|.macro ASM_EXPAND_TYPE_MEM_REG, op, type, op1, op2
||	switch (ir_type_size[type]) {
||		default:
||			IR_ASSERT(0);
||		case 1:
|			op byte op1, Rb(op2)
||			break;
||		case 2:
|			op word op1, Rw(op2)
||			break;
||		case 4:
|			op dword op1, Rd(op2)
||			break;
|.if X64
||		case 8:
|			op qword op1, Rq(op2)
||			break;
|.endif
||	}
|.endmacro

|.macro ASM_EXPAND_TYPE_MEM_TXT, op, type, op1, op2
||	switch (ir_type_size[type]) {
||		default:
||			IR_ASSERT(0);
||		case 1:
|			op byte op1, op2
||			break;
||		case 2:
|			op word op1, op2
||			break;
||		case 4:
|			op dword op1, op2
||			break;
|.if X64
||		case 8:
|			op qword op1, op2
||			break;
|.endif
||	}
|.endmacro

|.macro ASM_EXPAND_TYPE_MEM_IMM, op, type, op1, op2
||	switch (ir_type_size[type]) {
||		default:
||			IR_ASSERT(0);
||		case 1:
|			op byte op1, (op2 & 0xff)
||			break;
||		case 2:
|			op word op1, (op2 & 0xffff)
||			break;
||		case 4:
|			op dword op1, op2
||			break;
|.if X64
||		case 8:
|			op qword op1, op2
||			break;
|.endif
||	}
|.endmacro

|.macro ASM_EXPAND_TYPE_REG_MEM, op, type, op1, op2
||	switch (ir_type_size[type]) {
||		default:
||			IR_ASSERT(0);
||		case 1:
|			op Rb(op1), byte op2
||			break;
||		case 2:
|			op Rw(op1), word op2
||			break;
||		case 4:
|			op Rd(op1), dword op2
||			break;
|.if X64
||		case 8:
|			op Rq(op1), qword op2
||			break;
|.endif
||	}
|.endmacro

|.macro ASM_TMEM_OP, op, type, op1
||	do {
||		int32_t offset = IR_MEM_OFFSET(op1);
||		int32_t base = IR_MEM_BASE(op1);
||		int32_t index = IR_MEM_INDEX(op1);
||		int32_t scale = IR_MEM_SCALE(op1);
||		if (index == IR_REG_NONE) {
||			if (base == IR_REG_NONE) {
|				op type [offset]
||			} else {
|				op type [Ra(base)+offset]
||			}
||		} else if (scale == 8) {
||			if (base == IR_REG_NONE) {
|				op type [Ra(index)*8+offset]
||			} else {
|				op type [Ra(base)+Ra(index)*8+offset]
||			}
||		} else if (scale == 4) {
||			if (base == IR_REG_NONE) {
|				op type [Ra(index)*4+offset]
||			} else {
|				op type [Ra(base)+Ra(index)*4+offset]
||			}
||		} else if (scale == 2) {
||			if (base == IR_REG_NONE) {
|				op type [Ra(index)*2+offset]
||			} else {
|				op type [Ra(base)+Ra(index)*2+offset]
||			}
||		} else {
||			IR_ASSERT(scale == 1);
||			if (base == IR_REG_NONE) {
|				op type [Ra(index)+offset]
||			} else {
|				op type [Ra(base)+Ra(index)+offset]
||			}
||		}
||	} while (0);
|.endmacro

|.macro ASM_TXT_TMEM_OP, op, op1, type, op2
||	do {
||		int32_t offset = IR_MEM_OFFSET(op2);
||		int32_t base = IR_MEM_BASE(op2);
||		int32_t index = IR_MEM_INDEX(op2);
||		int32_t scale = IR_MEM_SCALE(op2);
||		if (index == IR_REG_NONE) {
||			if (base == IR_REG_NONE) {
|				op op1, type [offset]
||			} else {
|				op op1, type [Ra(base)+offset]
||			}
||		} else if (scale == 8) {
||			if (base == IR_REG_NONE) {
|				op op1, type [Ra(index)*8+offset]
||			} else {
|				op op1, type [Ra(base)+Ra(index)*8+offset]
||			}
||		} else if (scale == 4) {
||			if (base == IR_REG_NONE) {
|				op op1, type [Ra(index)*4+offset]
||			} else {
|				op op1, type [Ra(base)+Ra(index)*4+offset]
||			}
||		} else if (scale == 2) {
||			if (base == IR_REG_NONE) {
|				op op1, type [Ra(index)*2+offset]
||			} else {
|				op op1, type [Ra(base)+Ra(index)*2+offset]
||			}
||		} else {
||			IR_ASSERT(scale == 1);
||			if (base == IR_REG_NONE) {
|				op op1, type [Ra(index)+offset]
||			} else {
|				op op1, type [Ra(base)+Ra(index)+offset]
||			}
||		}
||	} while (0);
|.endmacro

|.macro ASM_TMEM_TXT_OP, op, type, op1, op2
||	do {
||		int32_t offset = IR_MEM_OFFSET(op1);
||		int32_t base = IR_MEM_BASE(op1);
||		int32_t index = IR_MEM_INDEX(op1);
||		int32_t scale = IR_MEM_SCALE(op1);
||		if (index == IR_REG_NONE) {
||			if (base == IR_REG_NONE) {
|				op type [offset], op2
||			} else {
|				op type [Ra(base)+offset], op2
||			}
||		} else if (scale == 8) {
||			if (base == IR_REG_NONE) {
|				op type [Ra(index)*8+offset], op2
||			} else {
|				op type [Ra(base)+Ra(index)*8+offset], op2
||			}
||		} else if (scale == 4) {
||			if (base == IR_REG_NONE) {
|				op type [Ra(index)*4+offset], op2
||			} else {
|				op type [Ra(base)+Ra(index)*4+offset], op2
||			}
||		} else if (scale == 2) {
||			if (base == IR_REG_NONE) {
|				op type [Ra(index)*2+offset], op2
||			} else {
|				op type [Ra(base)+Ra(index)*2+offset], op2
||			}
||		} else {
||			IR_ASSERT(scale == 1);
||			if (base == IR_REG_NONE) {
|				op type [Ra(index)+offset], op2
||			} else {
|				op type [Ra(base)+Ra(index)+offset], op2
||			}
||		}
||	} while (0);
|.endmacro

|.macro ASM_TXT_TXT_TMEM_OP, op, op1, op2, type, op3
||	do {
||		int32_t offset = IR_MEM_OFFSET(op3);
||		int32_t base = IR_MEM_BASE(op3);
||		int32_t index = IR_MEM_INDEX(op3);
||		int32_t scale = IR_MEM_SCALE(op3);
||		if (index == IR_REG_NONE) {
||			if (base == IR_REG_NONE) {
|				op op1, op2, type [offset]
||			} else {
|				op op1, op2, type [Ra(base)+offset]
||			}
||		} else if (scale == 8) {
||			if (base == IR_REG_NONE) {
|				op op1, op2, type [Ra(index)*8+offset]
||			} else {
|				op op1, op2, type [Ra(base)+Ra(index)*8+offset]
||			}
||		} else if (scale == 4) {
||			if (base == IR_REG_NONE) {
|				op op1, op2, type [Ra(index)*4+offset]
||			} else {
|				op op1, op2, type [Ra(base)+Ra(index)*4+offset]
||			}
||		} else if (scale == 2) {
||			if (base == IR_REG_NONE) {
|				op op1, op2, type [Ra(index)*2+offset]
||			} else {
|				op op1, op2, type [Ra(base)+Ra(index)*2+offset]
||			}
||		} else {
||			IR_ASSERT(scale == 1);
||			if (base == IR_REG_NONE) {
|				op op1, op2, type [Ra(index)+offset]
||			} else {
|				op op1, op2, type [Ra(base)+Ra(index)+offset]
||			}
||		}
||	} while (0);
|.endmacro

|.macro ASM_REG_OP, op, type, op1
||	switch (ir_type_size[type]) {
||		default:
||			IR_ASSERT(0);
||		case 1:
|			op Rb(op1)
||			break;
||		case 2:
|			op Rw(op1)
||			break;
||		case 4:
|			op Rd(op1)
||			break;
|.if X64
||		case 8:
|			op Rq(op1)
||			break;
|.endif
||	}
|.endmacro

|.macro ASM_MEM_OP, op, type, op1
|	ASM_EXPAND_OP_MEM ASM_EXPAND_TYPE_MEM, op, type, op1
|.endmacro

|.macro ASM_REG_REG_OP, op, type, op1, op2
||	switch (ir_type_size[type]) {
||		default:
||			IR_ASSERT(0);
||		case 1:
|			op Rb(op1), Rb(op2)
||			break;
||		case 2:
|			op Rw(op1), Rw(op2)
||			break;
||		case 4:
|			op Rd(op1), Rd(op2)
||			break;
|.if X64
||		case 8:
|			op Rq(op1), Rq(op2)
||			break;
|.endif
||	}
|.endmacro

|.macro ASM_REG_REG_OP2, op, type, op1, op2
||	switch (ir_type_size[type]) {
||		default:
||			IR_ASSERT(0);
||		case 1:
||		case 2:
|			op Rw(op1), Rw(op2)
||			break;
||		case 4:
|			op Rd(op1), Rd(op2)
||			break;
|.if X64
||		case 8:
|			op Rq(op1), Rq(op2)
||			break;
|.endif
||	}
|.endmacro

|.macro ASM_REG_TXT_OP, op, type, op1, op2
||	switch (ir_type_size[type]) {
||		default:
||			IR_ASSERT(0);
||		case 1:
|			op Rb(op1), op2
||			break;
||		case 2:
|			op Rw(op1), op2
||			break;
||		case 4:
|			op Rd(op1), op2
||			break;
|.if X64
||		case 8:
|			op Rq(op1), op2
||			break;
|.endif
||	}
|.endmacro

|.macro ASM_REG_IMM_OP, op, type, op1, op2
||	switch (ir_type_size[type]) {
||		default:
||			IR_ASSERT(0);
||		case 1:
|			op Rb(op1), (op2 & 0xff)
||			break;
||		case 2:
|			op Rw(op1), (op2 & 0xffff)
||			break;
||		case 4:
|			op Rd(op1), op2
||			break;
|.if X64
||		case 8:
|			op Rq(op1), op2
||			break;
|.endif
||	}
|.endmacro

|.macro ASM_MEM_REG_OP, op, type, op1, op2
|	ASM_EXPAND_OP1_MEM ASM_EXPAND_TYPE_MEM_REG, op, type, op1, op2
|.endmacro

|.macro ASM_MEM_TXT_OP, op, type, op1, op2
|	ASM_EXPAND_OP1_MEM ASM_EXPAND_TYPE_MEM_TXT, op, type, op1, op2
|.endmacro

|.macro ASM_MEM_IMM_OP, op, type, op1, op2
|	ASM_EXPAND_OP1_MEM ASM_EXPAND_TYPE_MEM_IMM, op, type, op1, op2
|.endmacro

|.macro ASM_REG_MEM_OP, op, type, op1, op2
|	ASM_EXPAND_OP2_MEM ASM_REG_TXT_OP, op, type, op1, op2
|.endmacro

|.macro ASM_REG_REG_MUL, op, type, op1, op2
||	switch (ir_type_size[type]) {
||		default:
||			IR_ASSERT(0);
||		case 2:
|			op Rw(op1), Rw(op2)
||			break;
||		case 4:
|			op Rd(op1), Rd(op2)
||			break;
|.if X64
||		case 8:
|			op Rq(op1), Rq(op2)
||			break;
|.endif
||	}
|.endmacro

|.macro ASM_REG_IMM_MUL, op, type, op1, op2
||	switch (ir_type_size[type]) {
||		default:
||			IR_ASSERT(0);
||		case 2:
|			op Rw(op1), op2
||			break;
||		case 4:
|			op Rd(op1), op2
||			break;
|.if X64
||		case 8:
|			op Rq(op1), op2
||			break;
|.endif
||	}
|.endmacro

|.macro ASM_REG_TXT_MUL, op, type, op1, op2
||	switch (ir_type_size[type]) {
||		default:
||			IR_ASSERT(0);
||		case 2:
|			op Rw(op1), op2
||			break;
||		case 4:
|			op Rd(op1), op2
||			break;
|.if X64
||		case 8:
|			op Rq(op1), op2
||			break;
|.endif
||	}
|.endmacro

|.macro ASM_REG_MEM_MUL, op, type, op1, op2
|	ASM_EXPAND_OP2_MEM ASM_REG_TXT_MUL, op, type, op1, op2
|.endmacro

|.macro ASM_REG_TXT_TXT_MUL, op, type, op1, op2, op3
||	switch (ir_type_size[type]) {
||		default:
||			IR_ASSERT(0);
||		case 2:
|			op Rw(op1), op2, op3
||			break;
||		case 4:
|			op Rd(op1), op2, op3
||			break;
|.if X64
||		case 8:
|			op Rq(op1), op2, op3
||			break;
|.endif
||	}
|.endmacro

|.macro ASM_REG_MEM_TXT_MUL, op, type, op1, op2, op3
|	ASM_EXPAND_OP2_MEM_3 ASM_REG_TXT_TXT_MUL, op, type, op1, op2, op3
|.endmacro

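/*
 * DynAsm's ".." splices macro arguments into instruction names: with
 * op=adds, "op..d" expands to addsd and "op..s" to addss, while the
 * "v..op" form used below selects the AVX encodings (e.g. vaddsd).
 */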
|.macro ASM_SSE2_REG_REG_OP, op, type, op1, op2
||	if (type == IR_DOUBLE) {
|		op..d xmm(op1-IR_REG_FP_FIRST), xmm(op2-IR_REG_FP_FIRST)
||	} else {
||		IR_ASSERT(type == IR_FLOAT);
|		op..s xmm(op1-IR_REG_FP_FIRST), xmm(op2-IR_REG_FP_FIRST)
||	}
|.endmacro

|.macro ASM_SSE2_REG_TXT_OP, op, type, op1, op2
||	if (type == IR_DOUBLE) {
|		op..d xmm(op1-IR_REG_FP_FIRST), qword op2
||	} else {
||		IR_ASSERT(type == IR_FLOAT);
|		op..s xmm(op1-IR_REG_FP_FIRST), dword op2
||	}
|.endmacro

|.macro ASM_SSE2_REG_MEM_OP, op, type, op1, op2
|	ASM_EXPAND_OP2_MEM ASM_SSE2_REG_TXT_OP, op, type, op1, op2
|.endmacro

|.macro ASM_AVX_REG_REG_REG_OP, op, type, op1, op2, op3
||	if (type == IR_DOUBLE) {
|		op..d xmm(op1-IR_REG_FP_FIRST), xmm(op2-IR_REG_FP_FIRST), xmm(op3-IR_REG_FP_FIRST)
||	} else {
||		IR_ASSERT(type == IR_FLOAT);
|		op..s xmm(op1-IR_REG_FP_FIRST), xmm(op2-IR_REG_FP_FIRST), xmm(op3-IR_REG_FP_FIRST)
||	}
|.endmacro

|.macro ASM_AVX_REG_REG_TXT_OP, op, type, op1, op2, op3
||	if (type == IR_DOUBLE) {
|		op..d xmm(op1-IR_REG_FP_FIRST), xmm(op2-IR_REG_FP_FIRST), qword op3
||	} else {
||		IR_ASSERT(type == IR_FLOAT);
|		op..s xmm(op1-IR_REG_FP_FIRST), xmm(op2-IR_REG_FP_FIRST), dword op3
||	}
|.endmacro

|.macro ASM_AVX_REG_REG_MEM_OP, op, type, op1, op2, op3
|	ASM_EXPAND_OP3_MEM ASM_AVX_REG_REG_TXT_OP, op, type, op1, op2, op3
|.endmacro

|.macro ASM_FP_REG_REG_OP, op, type, op1, op2
||	if (ctx->mflags & IR_X86_AVX) {
|		ASM_SSE2_REG_REG_OP v..op, type, op1, op2
||	} else {
|		ASM_SSE2_REG_REG_OP op, type, op1, op2
||	}
|.endmacro

|.macro ASM_FP_TXT_REG_OP, op, type, dst, src
||	if (type == IR_DOUBLE) {
||		if (ctx->mflags & IR_X86_AVX) {
|			v..op..d qword dst, xmm(src-IR_REG_FP_FIRST)
||		} else {
|			op..d qword dst, xmm(src-IR_REG_FP_FIRST)
||		}
||	} else {
||		IR_ASSERT(type == IR_FLOAT);
||		if (ctx->mflags & IR_X86_AVX) {
|			v..op..s dword dst, xmm(src-IR_REG_FP_FIRST)
||		} else {
|			op..s dword dst, xmm(src-IR_REG_FP_FIRST)
||		}
||	}
|.endmacro

|.macro ASM_FP_MEM_REG_OP, op, type, op1, op2
|	ASM_EXPAND_OP1_MEM ASM_FP_TXT_REG_OP, op, type, op1, op2
|.endmacro

|.macro ASM_FP_REG_TXT_OP, op, type, op1, op2
||	if (ctx->mflags & IR_X86_AVX) {
|		ASM_SSE2_REG_TXT_OP v..op, type, op1, op2
||	} else {
|		ASM_SSE2_REG_TXT_OP op, type, op1, op2
||	}
|.endmacro

|.macro ASM_FP_REG_MEM_OP, op, type, op1, op2
||	if (ctx->mflags & IR_X86_AVX) {
|		ASM_SSE2_REG_MEM_OP v..op, type, op1, op2
||	} else {
|		ASM_SSE2_REG_MEM_OP op, type, op1, op2
||	}
|.endmacro

|.macro ASM_SSE2_REG_REG_TXT_OP, op, type, op1, op2, op3
||	if (type == IR_DOUBLE) {
|		op..d xmm(op1-IR_REG_FP_FIRST), xmm(op2-IR_REG_FP_FIRST), op3
||	} else {
||		IR_ASSERT(type == IR_FLOAT);
|		op..s xmm(op1-IR_REG_FP_FIRST), xmm(op2-IR_REG_FP_FIRST), op3
||	}
|.endmacro

|.macro ASM_SSE2_REG_REG_REG_TXT_OP, op, type, op1, op2, op3, op4
||	if (type == IR_DOUBLE) {
|		op..d xmm(op1-IR_REG_FP_FIRST), xmm(op2-IR_REG_FP_FIRST), xmm(op3-IR_REG_FP_FIRST), op4
||	} else {
||		IR_ASSERT(type == IR_FLOAT);
|		op..s xmm(op1-IR_REG_FP_FIRST), xmm(op2-IR_REG_FP_FIRST), xmm(op3-IR_REG_FP_FIRST), op4
||	}
|.endmacro

|.macro ASM_FP_REG_REG_TXT_OP, op, type, op1, op2, op3
||	if (ctx->mflags & IR_X86_AVX) {
|		ASM_SSE2_REG_REG_REG_TXT_OP v..op, type, op1, op2, op3
||	} else {
|		ASM_SSE2_REG_REG_TXT_OP op, type, op1, op2, op3
||	}
|.endmacro

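/* The *_const flags below record which FP constants (negation and abs
 * masks, zero) were referenced during code generation and must be emitted
 * into the rodata section. */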
typedef struct _ir_backend_data {
	ir_reg_alloc_data  ra_data;
	uint32_t           dessa_from_block;
	dasm_State        *dasm_state;
	ir_bitset          emit_constants;
	int                rodata_label, jmp_table_label;
	bool               double_neg_const;
	bool               float_neg_const;
	bool               double_abs_const;
	bool               float_abs_const;
	bool               double_zero_const;
} ir_backend_data;

#define IR_GP_REG_NAME(code, name64, name32, name16, name8, name8h) \
	#name64,
#define IR_GP_REG_NAME32(code, name64, name32, name16, name8, name8h) \
	#name32,
#define IR_GP_REG_NAME16(code, name64, name32, name16, name8, name8h) \
	#name16,
#define IR_GP_REG_NAME8(code, name64, name32, name16, name8, name8h) \
	#name8,
#define IR_FP_REG_NAME(code, name) \
	#name,

static const char *_ir_reg_name[IR_REG_NUM] = {
	IR_GP_REGS(IR_GP_REG_NAME)
	IR_FP_REGS(IR_FP_REG_NAME)
};

static const char *_ir_reg_name32[IR_REG_NUM] = {
	IR_GP_REGS(IR_GP_REG_NAME32)
};

static const char *_ir_reg_name16[IR_REG_NUM] = {
	IR_GP_REGS(IR_GP_REG_NAME16)
};

static const char *_ir_reg_name8[IR_REG_NUM] = {
	IR_GP_REGS(IR_GP_REG_NAME8)
};

/* Calling Convention */
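/*
 * Win64 passes the first four arguments (integer or FP) in registers.
 * The SysV x86_64 ABI uses six integer and eight SSE argument registers.
 * 32-bit code passes arguments on the stack, except for the two integer
 * fastcall registers (IR_REG_INT_FCARG1/2).
 */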
#ifdef _WIN64

static const int8_t _ir_int_reg_params[IR_REG_INT_ARGS] = {
	IR_REG_INT_ARG1,
	IR_REG_INT_ARG2,
	IR_REG_INT_ARG3,
	IR_REG_INT_ARG4,
};

static const int8_t _ir_fp_reg_params[IR_REG_FP_ARGS] = {
	IR_REG_FP_ARG1,
	IR_REG_FP_ARG2,
	IR_REG_FP_ARG3,
	IR_REG_FP_ARG4,
};

#elif defined(IR_TARGET_X64)

static const int8_t _ir_int_reg_params[IR_REG_INT_ARGS] = {
	IR_REG_INT_ARG1,
	IR_REG_INT_ARG2,
	IR_REG_INT_ARG3,
	IR_REG_INT_ARG4,
	IR_REG_INT_ARG5,
	IR_REG_INT_ARG6,
};

static const int8_t _ir_fp_reg_params[IR_REG_FP_ARGS] = {
	IR_REG_FP_ARG1,
	IR_REG_FP_ARG2,
	IR_REG_FP_ARG3,
	IR_REG_FP_ARG4,
	IR_REG_FP_ARG5,
	IR_REG_FP_ARG6,
	IR_REG_FP_ARG7,
	IR_REG_FP_ARG8,
};

#else

static const int8_t *_ir_int_reg_params = NULL;
static const int8_t *_ir_fp_reg_params = NULL;
static const int8_t _ir_int_fc_reg_params[IR_REG_INT_FCARGS] = {
	IR_REG_INT_FCARG1,
	IR_REG_INT_FCARG2,
};
static const int8_t *_ir_fp_fc_reg_params = NULL;

#endif

const char *ir_reg_name(int8_t reg, ir_type type)
{
	if (reg >= IR_REG_NUM) {
		if (reg == IR_REG_SCRATCH) {
			return "SCRATCH";
		} else {
			IR_ASSERT(reg == IR_REG_ALL);
			return "ALL";
		}
	}
	IR_ASSERT(reg >= 0 && reg < IR_REG_NUM);
	if (type == IR_VOID) {
		type = (reg < IR_REG_FP_FIRST) ? IR_ADDR : IR_DOUBLE;
	}
	if (IR_IS_TYPE_FP(type) || ir_type_size[type] == 8) {
		return _ir_reg_name[reg];
	} else if (ir_type_size[type] == 4) {
		return _ir_reg_name32[reg];
	} else if (ir_type_size[type] == 2) {
		return _ir_reg_name16[reg];
	} else {
		IR_ASSERT(ir_type_size[type] == 1);
		return _ir_reg_name8[reg];
	}
}

#define IR_RULES(_)        \
	_(CMP_INT)             \
	_(CMP_FP)              \
	_(MUL_INT)             \
	_(DIV_INT)             \
	_(MOD_INT)             \
	_(TEST_INT)            \
	_(SETCC_INT)           \
	_(TESTCC_INT)          \
	_(LEA_OB)              \
	_(LEA_SI)              \
	_(LEA_SIB)             \
	_(LEA_IB)              \
	_(LEA_SI_O)            \
	_(LEA_SIB_O)           \
	_(LEA_IB_O)            \
	_(LEA_I_OB)            \
	_(LEA_OB_I)            \
	_(LEA_OB_SI)           \
	_(LEA_SI_OB)           \
	_(LEA_B_SI)            \
	_(LEA_SI_B)            \
	_(INC)                 \
	_(DEC)                 \
	_(MUL_PWR2)            \
	_(DIV_PWR2)            \
	_(MOD_PWR2)            \
	_(SDIV_PWR2)           \
	_(SMOD_PWR2)           \
	_(BOOL_NOT_INT)        \
	_(ABS_INT)             \
	_(OP_INT)              \
	_(OP_FP)               \
	_(IMUL3)               \
	_(BINOP_INT)           \
	_(BINOP_SSE2)          \
	_(BINOP_AVX)           \
	_(SHIFT)               \
	_(SHIFT_CONST)         \
	_(COPY_INT)            \
	_(COPY_FP)             \
	_(CMP_AND_STORE_INT)   \
	_(CMP_AND_BRANCH_INT)  \
	_(CMP_AND_BRANCH_FP)   \
	_(TEST_AND_BRANCH_INT) \
	_(JCC_INT)             \
	_(COND_CMP_INT)        \
	_(COND_CMP_FP)         \
	_(GUARD_CMP_INT)       \
	_(GUARD_CMP_FP)        \
	_(GUARD_TEST_INT)      \
	_(GUARD_JCC_INT)       \
	_(GUARD_OVERFLOW)      \
	_(OVERFLOW_AND_BRANCH) \
	_(MIN_MAX_INT)         \
	_(MEM_OP_INT)          \
	_(MEM_INC)             \
	_(MEM_DEC)             \
	_(MEM_MUL_PWR2)        \
	_(MEM_DIV_PWR2)        \
	_(MEM_MOD_PWR2)        \
	_(MEM_BINOP_INT)       \
	_(MEM_SHIFT)           \
	_(MEM_SHIFT_CONST)     \
	_(REG_BINOP_INT)       \
	_(VSTORE_INT)          \
	_(VSTORE_FP)           \
	_(LOAD_INT)            \
	_(LOAD_FP)             \
	_(STORE_INT)           \
	_(STORE_FP)            \
	_(IF_INT)              \
	_(RETURN_VOID)         \
	_(RETURN_INT)          \
	_(RETURN_FP)           \
	_(BIT_COUNT)           \
	_(SSE_SQRT)            \
	_(SSE_RINT)            \
	_(SSE_FLOOR)           \
	_(SSE_CEIL)            \
	_(SSE_TRUNC)           \
	_(SSE_NEARBYINT)       \

#define IR_RULE_ENUM(name) IR_ ## name,

#define IR_STATIC_ALLOCA   (IR_SKIPPED | IR_FUSED | IR_SIMPLE | IR_ALLOCA)

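/* Rule numbers continue the IR opcode enumeration (IR_FIRST_RULE starts at
 * IR_LAST_OP), so a single value in ctx->rules[] can denote either a plain
 * opcode or a backend-specific matching rule. */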
enum _ir_rule {
	IR_FIRST_RULE = IR_LAST_OP,
	IR_RULES(IR_RULE_ENUM)
	IR_LAST_RULE
};

#define IR_RULE_NAME(name)  #name,
const char *ir_rule_name[IR_LAST_OP] = {
	NULL,
	IR_RULES(IR_RULE_NAME)
	NULL
};

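/* On 32-bit targets any constant address fits into an instruction
 * immediate; on x86_64 it may be fused only when it fits into the signed
 * 32-bit displacement/immediate field. */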
static bool ir_may_fuse_addr(ir_ctx *ctx, const ir_insn *addr_insn)
{
	if (sizeof(void*) == 4) {
		return 1;
	} else if (IR_IS_SYM_CONST(addr_insn->op)) {
		void *addr = ir_sym_addr(ctx, addr_insn);

		if (!addr) {
			return 0;
		}
		return IR_IS_SIGNED_32BIT((int64_t)(intptr_t)addr);
	} else {
		return IR_IS_SIGNED_32BIT(addr_insn->val.i64);
	}
}

static bool ir_may_fuse_imm(ir_ctx *ctx, const ir_insn *val_insn)
{
	if (val_insn->type == IR_ADDR) {
		if (sizeof(void*) == 4) {
			return 1;
		} else if (IR_IS_SYM_CONST(val_insn->op)) {
			void *addr = ir_sym_addr(ctx, val_insn);

			if (!addr) {
				return 0;
			}
			return IR_IS_SIGNED_32BIT((intptr_t)addr);
		} else {
			return IR_IS_SIGNED_32BIT(val_insn->val.i64);
		}
	} else {
		return (ir_type_size[val_insn->type] <= 4 || IR_IS_SIGNED_32BIT(val_insn->val.i64));
	}
}

/* register allocation */
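/* The constraints describe, for a matched instruction, the fixed result
 * register (def_reg), register hints for the operands and the temporary or
 * scratch registers together with the sub-reference range where each one
 * must be available. */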
static int ir_add_const_tmp_reg(ir_ctx *ctx, ir_ref ref, uint32_t num, int n, ir_target_constraints *constraints)
{
	IR_ASSERT(IR_IS_CONST_REF(ref));
	const ir_insn *val_insn = &ctx->ir_base[ref];

	if (!ir_may_fuse_imm(ctx, val_insn)) {
		constraints->tmp_regs[n] = IR_TMP_REG(num, val_insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
		n++;
	}
	return n;
}

int ir_get_target_constraints(ir_ctx *ctx, ir_ref ref, ir_target_constraints *constraints)
{
	uint32_t rule = ir_rule(ctx, ref);
	const ir_insn *insn;
	int n = 0;
	int flags = IR_USE_MUST_BE_IN_REG | IR_OP1_MUST_BE_IN_REG | IR_OP2_MUST_BE_IN_REG | IR_OP3_MUST_BE_IN_REG;

	constraints->def_reg = IR_REG_NONE;
	constraints->hints_count = 0;
	switch (rule & IR_RULE_MASK) {
		case IR_BINOP_INT:
			insn = &ctx->ir_base[ref];
			if (rule & IR_FUSED) {
				if (ctx->ir_base[insn->op1].op == IR_RLOAD) {
					flags = IR_OP1_MUST_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG;
				} else {
					flags = IR_OP2_MUST_BE_IN_REG;
				}
			} else {
				flags = IR_DEF_REUSES_OP1_REG | IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG;
			}
			if (IR_IS_CONST_REF(insn->op2) && insn->op1 != insn->op2) {
				n = ir_add_const_tmp_reg(ctx, insn->op2, 2, n, constraints);
			} else if (ir_rule(ctx, insn->op2) == IR_STATIC_ALLOCA) {
				constraints->tmp_regs[n] = IR_TMP_REG(2, IR_ADDR, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n++;
			}
			break;
		case IR_IMUL3:
			flags = IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG;
			break;
		case IR_SHIFT:
			if (rule & IR_FUSED) {
				flags = IR_OP2_MUST_BE_IN_REG;
			} else {
				flags = IR_DEF_REUSES_OP1_REG | IR_DEF_CONFLICTS_WITH_INPUT_REGS | IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG;
			}
			constraints->hints[1] = IR_REG_NONE;
			constraints->hints[2] = IR_REG_RCX;
			constraints->hints_count = 3;
			constraints->tmp_regs[0] = IR_SCRATCH_REG(IR_REG_RCX, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
			n = 1;
			break;
		case IR_MUL_INT:
			/* %rax - used as input and result */
			constraints->def_reg = IR_REG_RAX;
			constraints->hints[1] = IR_REG_RAX;
			constraints->hints_count = 2;
			flags = IR_USE_SHOULD_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG;
			constraints->tmp_regs[0] = IR_SCRATCH_REG(IR_REG_RDX, IR_USE_SUB_REF, IR_DEF_SUB_REF);
			constraints->tmp_regs[1] = IR_SCRATCH_REG(IR_REG_RAX, IR_LOAD_SUB_REF, IR_SAVE_SUB_REF);
			n = 2;
			break;
		case IR_DIV_INT:
			/* %rax - used as input and result */
			constraints->def_reg = IR_REG_RAX;
			constraints->hints[1] = IR_REG_RAX;
			constraints->hints_count = 2;
			flags = IR_USE_SHOULD_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG;
			constraints->tmp_regs[0] = IR_SCRATCH_REG(IR_REG_RDX, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
			constraints->tmp_regs[1] = IR_SCRATCH_REG(IR_REG_RAX, IR_LOAD_SUB_REF, IR_SAVE_SUB_REF);
			n = 2;
			goto op2_const;
		case IR_MOD_INT:
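			/* x86 idiv expects the dividend in %rdx:%rax and leaves the
			 * quotient in %rax and the remainder in %rdx */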
			constraints->def_reg = IR_REG_RDX;
			constraints->hints[1] = IR_REG_RAX;
			constraints->hints_count = 2;
			flags = IR_USE_SHOULD_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG;
			constraints->tmp_regs[0] = IR_SCRATCH_REG(IR_REG_RAX, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
			constraints->tmp_regs[1] = IR_SCRATCH_REG(IR_REG_RDX, IR_LOAD_SUB_REF, IR_SAVE_SUB_REF);
			n = 2;
			goto op2_const;
		case IR_MIN_MAX_INT:
			flags = IR_DEF_REUSES_OP1_REG | IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG | IR_OP2_MUST_BE_IN_REG;
op2_const:
			insn = &ctx->ir_base[ref];
			if (IR_IS_CONST_REF(insn->op2) && insn->op1 != insn->op2) {
				constraints->tmp_regs[n] = IR_TMP_REG(2, insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n++;
			}
			break;
		case IR_CMP_INT:
		case IR_TEST_INT:
			insn = &ctx->ir_base[ref];
			flags = IR_USE_MUST_BE_IN_REG | IR_OP1_MUST_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG;
			if (IR_IS_CONST_REF(insn->op1)) {
				const ir_insn *val_insn = &ctx->ir_base[insn->op1];
				constraints->tmp_regs[0] = IR_TMP_REG(1, val_insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n = 1;
			} else if (ir_rule(ctx, insn->op1) == IR_STATIC_ALLOCA) {
				constraints->tmp_regs[0] = IR_TMP_REG(1, IR_ADDR, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n = 1;
			} else if (ir_rule(ctx, insn->op1) & IR_FUSED) {
				flags = IR_USE_MUST_BE_IN_REG | IR_OP2_MUST_BE_IN_REG;
			}
			if (IR_IS_CONST_REF(insn->op2) && insn->op1 != insn->op2) {
				flags = IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG;
				n = ir_add_const_tmp_reg(ctx, insn->op2, 2, n, constraints);
			} else if (ir_rule(ctx, insn->op2) == IR_STATIC_ALLOCA) {
				constraints->tmp_regs[n] = IR_TMP_REG(2, IR_ADDR, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n++;
			}
			break;
		case IR_CMP_FP:
			insn = &ctx->ir_base[ref];
			if (!(rule & IR_FUSED)) {
				constraints->tmp_regs[0] = IR_TMP_REG(3, IR_BOOL, IR_DEF_SUB_REF, IR_SAVE_SUB_REF);
				n = 1;
			}
			flags = IR_USE_MUST_BE_IN_REG | IR_OP1_MUST_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG;
			if (IR_IS_CONST_REF(insn->op1)) {
				const ir_insn *val_insn = &ctx->ir_base[insn->op1];
				constraints->tmp_regs[n] = IR_TMP_REG(1, val_insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n++;
			}
			break;
		case IR_BINOP_AVX:
			flags = IR_USE_MUST_BE_IN_REG | IR_OP1_MUST_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG;
			insn = &ctx->ir_base[ref];
			if (IR_IS_CONST_REF(insn->op1)) {
				constraints->tmp_regs[0] = IR_TMP_REG(1, insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n = 1;
			}
			break;
		case IR_COND:
			insn = &ctx->ir_base[ref];
			if (!IR_IS_TYPE_INT(ctx->ir_base[insn->op1].type)) {
				flags = IR_USE_MUST_BE_IN_REG | IR_OP1_MUST_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG | IR_OP3_SHOULD_BE_IN_REG;
				break;
			}
			IR_FALLTHROUGH;
		case IR_COND_CMP_INT:
			insn = &ctx->ir_base[ref];
			if (IR_IS_TYPE_INT(insn->type)) {
				if (IR_IS_CONST_REF(insn->op3)) {
					flags = IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG | IR_OP2_MUST_BE_IN_REG | IR_OP3_MUST_BE_IN_REG;
					constraints->tmp_regs[0] = IR_TMP_REG(3, insn->type, IR_LOAD_SUB_REF, IR_SAVE_SUB_REF);
					n = 1;
				} else if (IR_IS_CONST_REF(insn->op2)) {
					flags = IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG | IR_OP2_MUST_BE_IN_REG | IR_OP3_MUST_BE_IN_REG;
					constraints->tmp_regs[0] = IR_TMP_REG(2, insn->type, IR_LOAD_SUB_REF, IR_SAVE_SUB_REF);
					n = 1;
				} else {
					flags = IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG | IR_OP2_MUST_BE_IN_REG | IR_OP3_MUST_BE_IN_REG;
				}
			} else {
				flags = IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG | IR_OP3_SHOULD_BE_IN_REG;
			}
			break;
		case IR_COND_CMP_FP:
			flags = IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG | IR_OP3_SHOULD_BE_IN_REG;
			break;
		case IR_VSTORE_INT:
			flags = IR_OP3_MUST_BE_IN_REG;
			insn = &ctx->ir_base[ref];
			if (IR_IS_CONST_REF(insn->op3)) {
				n = ir_add_const_tmp_reg(ctx, insn->op3, 3, n, constraints);
			} else if (ir_rule(ctx, insn->op3) == IR_STATIC_ALLOCA) {
				constraints->tmp_regs[n] = IR_TMP_REG(3, IR_ADDR, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n++;
			}
			break;
		case IR_STORE_INT:
			flags = IR_OP2_MUST_BE_IN_REG | IR_OP3_MUST_BE_IN_REG;
			insn = &ctx->ir_base[ref];
			if (IR_IS_CONST_REF(insn->op2)) {
				n = ir_add_const_tmp_reg(ctx, insn->op2, 2, n, constraints);
			}
			if (IR_IS_CONST_REF(insn->op3)) {
				n = ir_add_const_tmp_reg(ctx, insn->op3, 3, n, constraints);
			} else if (ir_rule(ctx, insn->op3) == IR_STATIC_ALLOCA) {
				constraints->tmp_regs[n] = IR_TMP_REG(3, IR_ADDR, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n++;
			}
			break;
		case IR_VSTORE_FP:
			flags = IR_OP3_MUST_BE_IN_REG;
			insn = &ctx->ir_base[ref];
			if (IR_IS_CONST_REF(insn->op3)) {
				insn = &ctx->ir_base[insn->op3];
				constraints->tmp_regs[0] = IR_TMP_REG(3, insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n = 1;
			}
			break;
		case IR_LOAD_FP:
		case IR_LOAD_INT:
		case IR_MEM_OP_INT:
		case IR_MEM_INC:
		case IR_MEM_DEC:
		case IR_MEM_MUL_PWR2:
		case IR_MEM_DIV_PWR2:
		case IR_MEM_MOD_PWR2:
		case IR_MEM_BINOP_INT:
		case IR_MEM_SHIFT:
		case IR_MEM_SHIFT_CONST:
		case IR_CMP_AND_STORE_INT:
			flags = IR_USE_MUST_BE_IN_REG | IR_OP2_MUST_BE_IN_REG;
			insn = &ctx->ir_base[ref];
			if (IR_IS_CONST_REF(insn->op2)) {
				n = ir_add_const_tmp_reg(ctx, insn->op2, 2, n, constraints);
			}
			break;
		case IR_STORE_FP:
			flags = IR_OP2_MUST_BE_IN_REG | IR_OP3_MUST_BE_IN_REG;
			insn = &ctx->ir_base[ref];
			if (IR_IS_CONST_REF(insn->op2)) {
				n = ir_add_const_tmp_reg(ctx, insn->op2, 2, n, constraints);
			}
			if (IR_IS_CONST_REF(insn->op3)) {
				insn = &ctx->ir_base[insn->op3];
				constraints->tmp_regs[n] = IR_TMP_REG(3, insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n++;
			}
			break;
		case IR_SWITCH:
			flags = IR_OP2_MUST_BE_IN_REG;
			insn = &ctx->ir_base[ref];
			if (IR_IS_CONST_REF(insn->op2)) {
				insn = &ctx->ir_base[insn->op2];
				constraints->tmp_regs[0] = IR_TMP_REG(2, insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n = 1;
			}
			/* we need a temporary register in case the MIN CASE value is not zero or some CASE value can't fit into 32 bits */
			constraints->tmp_regs[n] = IR_TMP_REG(3, IR_ADDR, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
			n++;
			break;
		case IR_CALL:
			insn = &ctx->ir_base[ref];
			if (IR_IS_TYPE_INT(insn->type)) {
				constraints->def_reg = IR_REG_INT_RET1;
#ifdef IR_REG_FP_RET1
			} else {
				constraints->def_reg = IR_REG_FP_RET1;
#endif
			}
			constraints->tmp_regs[0] = IR_SCRATCH_REG(IR_REG_SCRATCH, IR_USE_SUB_REF, IR_DEF_SUB_REF);
			n = 1;
			IR_FALLTHROUGH;
		case IR_TAILCALL:
			insn = &ctx->ir_base[ref];
			if (insn->inputs_count > 2) {
				constraints->hints[2] = IR_REG_NONE;
				constraints->hints_count = ir_get_args_regs(ctx, insn, constraints->hints);
				if (!IR_IS_CONST_REF(insn->op2)) {
					constraints->tmp_regs[n] = IR_TMP_REG(1, IR_ADDR, IR_LOAD_SUB_REF, IR_USE_SUB_REF);
					n++;
				}
			}
			flags = IR_USE_SHOULD_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG | IR_OP3_SHOULD_BE_IN_REG;
			break;
		case IR_BINOP_SSE2:
			flags = IR_DEF_REUSES_OP1_REG | IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG;
			break;
		case IR_SHIFT_CONST:
		case IR_INC:
		case IR_DEC:
		case IR_MUL_PWR2:
		case IR_DIV_PWR2:
		case IR_OP_INT:
		case IR_OP_FP:
			flags = IR_DEF_REUSES_OP1_REG | IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG;
			break;
		case IR_MOD_PWR2:
			flags = IR_DEF_REUSES_OP1_REG | IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG;
			insn = &ctx->ir_base[ref];
			if (ir_type_size[insn->type] == 8) {
				int64_t offset = ctx->ir_base[insn->op2].val.u64 - 1;
				if (!IR_IS_SIGNED_32BIT(offset)) {
					constraints->tmp_regs[n] = IR_TMP_REG(2, insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
					n++;
				}
			}
			break;
		case IR_SMOD_PWR2:
			flags = IR_DEF_REUSES_OP1_REG | IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG;
			insn = &ctx->ir_base[ref];
			if (ir_type_size[insn->type] == 8) {
				int64_t offset = ctx->ir_base[insn->op2].val.u64 - 1;
				if (!IR_IS_SIGNED_32BIT(offset)) {
					constraints->tmp_regs[n] = IR_TMP_REG(2, insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
					n++;
				}
			}
			constraints->tmp_regs[n] = IR_TMP_REG(3, insn->type, IR_USE_SUB_REF, IR_SAVE_SUB_REF);
			n++;
			break;
		case IR_SDIV_PWR2:
			flags = IR_DEF_CONFLICTS_WITH_INPUT_REGS | IR_USE_MUST_BE_IN_REG | IR_OP1_MUST_BE_IN_REG;
			insn = &ctx->ir_base[ref];
			if (ir_type_size[insn->type] == 8) {
				int64_t offset = ctx->ir_base[insn->op2].val.u64 - 1;
				if (!IR_IS_SIGNED_32BIT(offset)) {
					constraints->tmp_regs[n] = IR_TMP_REG(2, insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
					n++;
				}
			}
			break;
		case IR_BIT_COUNT:
			insn = &ctx->ir_base[ref];
			if (ir_type_size[insn->type] == 1) {
				flags = IR_USE_MUST_BE_IN_REG | IR_OP1_MUST_BE_IN_REG;
			} else {
				flags = IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG;
			}
			if (IR_IS_CONST_REF(insn->op1)) {
				constraints->tmp_regs[0] = IR_TMP_REG(1, insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n = 1;
			}
			break;
		case IR_CTPOP:
			flags = IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG;
			insn = &ctx->ir_base[ref];
			constraints->tmp_regs[0] = IR_TMP_REG(2, insn->type, IR_USE_SUB_REF, IR_SAVE_SUB_REF);
			n = 1;
			if (ir_type_size[insn->type] == 8) {
				constraints->tmp_regs[1] = IR_TMP_REG(3, insn->type, IR_USE_SUB_REF, IR_SAVE_SUB_REF);
				n = 2;
			}
			break;
		case IR_COPY_INT:
		case IR_COPY_FP:
		case IR_SEXT:
		case IR_ZEXT:
		case IR_TRUNC:
		case IR_BITCAST:
		case IR_PROTO:
		case IR_FP2FP:
			flags = IR_DEF_REUSES_OP1_REG | IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG;
			break;
		case IR_ABS_INT:
			flags = IR_DEF_CONFLICTS_WITH_INPUT_REGS | IR_USE_MUST_BE_IN_REG | IR_OP1_MUST_BE_IN_REG;
			break;
		case IR_PARAM:
			constraints->def_reg = ir_get_param_reg(ctx, ref);
			flags = 0;
			break;
		case IR_PI:
		case IR_PHI:
			flags = IR_USE_SHOULD_BE_IN_REG;
			break;
		case IR_RLOAD:
			constraints->def_reg = ctx->ir_base[ref].op2;
			flags = IR_USE_SHOULD_BE_IN_REG;
			break;
		case IR_EXITCALL:
			flags = IR_USE_MUST_BE_IN_REG;
			constraints->def_reg = IR_REG_INT_RET1;
			break;
		case IR_IF_INT:
		case IR_GUARD:
		case IR_GUARD_NOT:
			flags = IR_OP2_SHOULD_BE_IN_REG;
			break;
		case IR_IJMP:
			flags = IR_OP2_SHOULD_BE_IN_REG;
			break;
		case IR_RSTORE:
			flags = IR_OP3_SHOULD_BE_IN_REG;
			break;
		case IR_RETURN_INT:
			flags = IR_OP2_SHOULD_BE_IN_REG;
			constraints->hints[2] = IR_REG_INT_RET1;
			constraints->hints_count = 3;
			break;
		case IR_RETURN_FP:
#ifdef IR_REG_FP_RET1
			flags = IR_OP2_SHOULD_BE_IN_REG;
			constraints->hints[2] = IR_REG_FP_RET1;
			constraints->hints_count = 3;
#endif
			break;
		case IR_SNAPSHOT:
			flags = 0;
			break;
		case IR_VA_START:
			flags = IR_OP2_MUST_BE_IN_REG;
			constraints->tmp_regs[0] = IR_TMP_REG(3, IR_ADDR, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
			n = 1;
			break;
		case IR_VA_ARG:
			flags = IR_USE_MUST_BE_IN_REG | IR_OP2_MUST_BE_IN_REG;
			constraints->tmp_regs[0] = IR_TMP_REG(3, IR_ADDR, IR_LOAD_SUB_REF, IR_SAVE_SUB_REF);
			n = 1;
			break;
		case IR_VA_COPY:
			flags = IR_OP2_MUST_BE_IN_REG | IR_OP3_MUST_BE_IN_REG;
			constraints->tmp_regs[0] = IR_TMP_REG(1, IR_ADDR, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
			n = 1;
			break;
	}
	constraints->tmps_count = n;

	return flags;
}

/* instruction selection */
static uint32_t ir_match_insn(ir_ctx *ctx, ir_ref ref);
static bool ir_match_try_fuse_load(ir_ctx *ctx, ir_ref ref, ir_ref root);

static void ir_swap_ops(ir_insn *insn)
{
	SWAP_REFS(insn->op1, insn->op2);
}

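/* An ADD matched as LEA [X+Y] can be reverted to a plain ADD when one of
 * its operands is a fusible memory load: ADD accepts a memory source
 * operand, while LEA does not. */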
static bool ir_match_try_revert_lea_to_add(ir_ctx *ctx, ir_ref ref)
{
	ir_insn *insn = &ctx->ir_base[ref];

	/* TODO: This optimization makes sense only if the other operand is killed */
	if (insn->op1 == insn->op2) {
		/* pass */
	} else if (ir_match_try_fuse_load(ctx, insn->op2, ref)) {
		ctx->rules[ref] = IR_BINOP_INT | IR_MAY_SWAP;
		return 1;
	} else if (ir_match_try_fuse_load(ctx, insn->op1, ref)) {
		/* swap for better load fusion */
		ir_swap_ops(insn);
		ctx->rules[ref] = IR_BINOP_INT | IR_MAY_SWAP;
		return 1;
	}
	return 0;
}

static void ir_match_fuse_addr(ir_ctx *ctx, ir_ref addr_ref)
{
	if (!IR_IS_CONST_REF(addr_ref)) {
		uint32_t rule = ctx->rules[addr_ref];

		if (!rule) {
			ctx->rules[addr_ref] = rule = ir_match_insn(ctx, addr_ref);
		}
		if (rule >= IR_LEA_OB && rule <= IR_LEA_SI_B) {
			ir_use_list *use_list;
			ir_ref j;

			if (rule == IR_LEA_IB && ir_match_try_revert_lea_to_add(ctx, addr_ref)) {
				return;
			}

			use_list = &ctx->use_lists[addr_ref];
			j = use_list->count;
			if (j > 1) {
				/* check if address is used only in LOAD and STORE */
				ir_ref *p = &ctx->use_edges[use_list->refs];

				do {
					ir_insn *insn = &ctx->ir_base[*p];
					if (insn->op != IR_LOAD && (insn->op != IR_STORE || insn->op3 == addr_ref)) {
						return;
					}
					p++;
				} while (--j);
			}
			ctx->rules[addr_ref] = IR_FUSED | IR_SIMPLE | rule;
		}
	}
}

/* A naive check if there is a STORE or CALL between this LOAD and the fusion root */
static bool ir_match_has_mem_deps(ir_ctx *ctx, ir_ref ref, ir_ref root)
{
	if (ref + 1 != root) {
		ir_ref pos = ctx->prev_ref[root];

		do {
			ir_insn *insn = &ctx->ir_base[pos];

			if (insn->op == IR_STORE) {
				// TODO: check if LOAD and STORE addresses may alias
				return 1;
			} else if (insn->op == IR_CALL) {
				return 1;
			}
			pos = ctx->prev_ref[pos];
		} while (ref != pos);
	}
	return 0;
}

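/* A LOAD may be fused only when the fusion root is its sole data use: the
 * use list holds the next node of the control chain plus the data
 * consumers, so a use count of 2 means exactly one data use. */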
static void ir_match_fuse_load(ir_ctx *ctx, ir_ref ref, ir_ref root)
{
	if (ir_in_same_block(ctx, ref)
	 && ctx->ir_base[ref].op == IR_LOAD) {
		if (ctx->use_lists[ref].count == 2
		 && !ir_match_has_mem_deps(ctx, ref, root)) {
			ir_ref addr_ref = ctx->ir_base[ref].op2;
			ir_insn *addr_insn = &ctx->ir_base[addr_ref];

			if (IR_IS_CONST_REF(addr_ref)) {
				if (ir_may_fuse_addr(ctx, addr_insn)) {
					ctx->rules[ref] = IR_FUSED | IR_SIMPLE | IR_LOAD;
					return;
				}
			} else {
				ctx->rules[ref] = IR_FUSED | IR_SIMPLE | IR_LOAD;
				ir_match_fuse_addr(ctx, addr_ref);
				return;
			}
		}
	}
}

static bool ir_match_try_fuse_load(ir_ctx *ctx, ir_ref ref, ir_ref root)
{
	ir_insn *insn = &ctx->ir_base[ref];

	if (ir_in_same_block(ctx, ref)
	 && insn->op == IR_LOAD) {
		if (ctx->use_lists[ref].count == 2
		 && !ir_match_has_mem_deps(ctx, ref, root)) {
			ir_ref addr_ref = ctx->ir_base[ref].op2;
			ir_insn *addr_insn = &ctx->ir_base[addr_ref];

			if (IR_IS_CONST_REF(addr_ref)) {
				if (ir_may_fuse_addr(ctx, addr_insn)) {
					ctx->rules[ref] = IR_FUSED | IR_SIMPLE | IR_LOAD;
					return 1;
				}
			} else {
				ctx->rules[ref] = IR_FUSED | IR_SIMPLE | IR_LOAD;
				ir_match_fuse_addr(ctx, addr_ref);
				return 1;
			}
		}
	} else if (insn->op == IR_PARAM) {
		if (ctx->use_lists[ref].count == 1
		 && ir_get_param_reg(ctx, ref) == IR_REG_NONE) {
			return 1;
		}
	} else if (ctx->ir_base[ref].op == IR_VLOAD) {
		return 1;
	}
	return 0;
}

static void ir_match_fuse_load_commutative_int(ir_ctx *ctx, ir_insn *insn, ir_ref root)
{
	if (IR_IS_CONST_REF(insn->op2)
	 && ir_may_fuse_imm(ctx, &ctx->ir_base[insn->op2])) {
		return;
	} else if (ir_match_try_fuse_load(ctx, insn->op2, root)) {
		return;
	} else if (ir_match_try_fuse_load(ctx, insn->op1, root)) {
		ir_swap_ops(insn);
	}
}

static void ir_match_fuse_load_commutative_fp(ir_ctx *ctx, ir_insn *insn, ir_ref root)
{
	if (!IR_IS_CONST_REF(insn->op2)
	 && !ir_match_try_fuse_load(ctx, insn->op2, root)
	 && (IR_IS_CONST_REF(insn->op1) || ir_match_try_fuse_load(ctx, insn->op1, root))) {
		ir_swap_ops(insn);
	}
}

static void ir_match_fuse_load_cmp_int(ir_ctx *ctx, ir_insn *insn, ir_ref root)
{
	if (IR_IS_CONST_REF(insn->op2)
	 && ir_may_fuse_imm(ctx, &ctx->ir_base[insn->op2])) {
		ir_match_fuse_load(ctx, insn->op1, root);
	} else if (!ir_match_try_fuse_load(ctx, insn->op2, root)
	 && ir_match_try_fuse_load(ctx, insn->op1, root)) {
		ir_swap_ops(insn);
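		/* After the swap the relation must be reversed. Assuming the IR
		 * comparison opcodes sit in 4-aligned groups ordered LT, GE, LE, GT
		 * (and likewise for the unsigned variants), XOR 3 maps each
		 * comparison to its operand-swapped converse (LT<->GT, LE<->GE);
		 * EQ and NE are symmetric and need no adjustment. */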
		if (insn->op != IR_EQ && insn->op != IR_NE) {
			insn->op ^= 3;
		}
	}
}

static void ir_match_fuse_load_test_int(ir_ctx *ctx, ir_insn *insn, ir_ref root)
{
	if (IR_IS_CONST_REF(insn->op2)
	 && ir_may_fuse_imm(ctx, &ctx->ir_base[insn->op2])) {
		ir_match_fuse_load(ctx, insn->op1, root);
	} else if (!ir_match_try_fuse_load(ctx, insn->op2, root)
	 && ir_match_try_fuse_load(ctx, insn->op1, root)) {
		ir_swap_ops(insn);
	}
}

static void ir_match_fuse_load_cmp_fp(ir_ctx *ctx, ir_insn *insn, ir_ref root)
{
	if (insn->op != IR_EQ && insn->op != IR_NE) {
		if (insn->op == IR_LT || insn->op == IR_LE) {
			/* swap operands to avoid P flag check */
			ir_swap_ops(insn);
			insn->op ^= 3;
		}
		ir_match_fuse_load(ctx, insn->op2, root);
	} else if (IR_IS_CONST_REF(insn->op2) && !IR_IS_FP_ZERO(ctx->ir_base[insn->op2])) {
		/* pass */
	} else if (ir_match_try_fuse_load(ctx, insn->op2, root)) {
		/* pass */
	} else if ((IR_IS_CONST_REF(insn->op1) && !IR_IS_FP_ZERO(ctx->ir_base[insn->op1])) || ir_match_try_fuse_load(ctx, insn->op1, root)) {
		ir_swap_ops(insn);
		if (insn->op != IR_EQ && insn->op != IR_NE) {
			insn->op ^= 3;
		}
	}
}

static void ir_match_fuse_load_cmp_fp_br(ir_ctx *ctx, ir_insn *insn, ir_ref root, bool direct)
{
	if (direct) {
		if (insn->op == IR_LT || insn->op == IR_LE) {
			/* swap operands to avoid P flag check */
			ir_swap_ops(insn);
			insn->op ^= 3;
		}
	} else {
		if (insn->op == IR_GT || insn->op == IR_GE) {
			/* swap operands to avoid P flag check */
			ir_swap_ops(insn);
			insn->op ^= 3;
		}
	}
	if (IR_IS_CONST_REF(insn->op2) && !IR_IS_FP_ZERO(ctx->ir_base[insn->op2])) {
		/* pass */
	} else if (ir_match_try_fuse_load(ctx, insn->op2, root)) {
		/* pass */
	} else if ((IR_IS_CONST_REF(insn->op1) && !IR_IS_FP_ZERO(ctx->ir_base[insn->op1])) || ir_match_try_fuse_load(ctx, insn->op1, root)) {
		ir_swap_ops(insn);
		if (insn->op != IR_EQ && insn->op != IR_NE) {
			insn->op ^= 3;
		}
	}
}

#define STR_EQUAL(name, name_len, str) (name_len == strlen(str) && memcmp(name, str, strlen(str)) == 0)

#define IR_IS_FP_FUNC_1(proto, _type)  (proto->params_count == 1 && \
                                        proto->param_types[0] == _type && \
                                        proto->ret_type == _type)

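/* Recognize libm sqrt/rounding builtins with a matching one-argument
 * float/double prototype and map them onto dedicated SSE rules, so they
 * can be emitted inline instead of being called. */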
static uint32_t ir_match_builtin_call(ir_ctx *ctx, const ir_insn *func)
{
	const ir_proto_t *proto = (const ir_proto_t *)ir_get_str(ctx, func->proto);

	if (proto->flags & IR_BUILTIN_FUNC) {
		size_t name_len;
		const char *name = ir_get_strl(ctx, func->val.name, &name_len);

		if (STR_EQUAL(name, name_len, "sqrt")) {
			if (IR_IS_FP_FUNC_1(proto, IR_DOUBLE)) {
				return IR_SSE_SQRT;
			}
		} else if (STR_EQUAL(name, name_len, "sqrtf")) {
			if (IR_IS_FP_FUNC_1(proto, IR_FLOAT)) {
				return IR_SSE_SQRT;
			}
		} else if (STR_EQUAL(name, name_len, "rint")) {
			if (IR_IS_FP_FUNC_1(proto, IR_DOUBLE)) {
				return IR_SSE_RINT;
			}
		} else if (STR_EQUAL(name, name_len, "rintf")) {
			if (IR_IS_FP_FUNC_1(proto, IR_FLOAT)) {
				return IR_SSE_RINT;
			}
		} else if (STR_EQUAL(name, name_len, "floor")) {
			if (IR_IS_FP_FUNC_1(proto, IR_DOUBLE)) {
				return IR_SSE_FLOOR;
			}
		} else if (STR_EQUAL(name, name_len, "floorf")) {
			if (IR_IS_FP_FUNC_1(proto, IR_FLOAT)) {
				return IR_SSE_FLOOR;
			}
		} else if (STR_EQUAL(name, name_len, "ceil")) {
			if (IR_IS_FP_FUNC_1(proto, IR_DOUBLE)) {
				return IR_SSE_CEIL;
			}
		} else if (STR_EQUAL(name, name_len, "ceilf")) {
			if (IR_IS_FP_FUNC_1(proto, IR_FLOAT)) {
				return IR_SSE_CEIL;
			}
		} else if (STR_EQUAL(name, name_len, "trunc")) {
			if (IR_IS_FP_FUNC_1(proto, IR_DOUBLE)) {
				return IR_SSE_TRUNC;
			}
		} else if (STR_EQUAL(name, name_len, "truncf")) {
			if (IR_IS_FP_FUNC_1(proto, IR_FLOAT)) {
				return IR_SSE_TRUNC;
			}
		} else if (STR_EQUAL(name, name_len, "nearbyint")) {
			if (IR_IS_FP_FUNC_1(proto, IR_DOUBLE)) {
				return IR_SSE_NEARBYINT;
			}
		} else if (STR_EQUAL(name, name_len, "nearbyintf")) {
			if (IR_IS_FP_FUNC_1(proto, IR_FLOAT)) {
				return IR_SSE_NEARBYINT;
			}
		}
	}

	return 0;
}
1827
1828static uint32_t ir_match_insn(ir_ctx *ctx, ir_ref ref)
1829{
1830	ir_insn *op2_insn;
1831	ir_insn *insn = &ctx->ir_base[ref];
1832	uint32_t store_rule;
1833	ir_op load_op;
1834
1835	switch (insn->op) {
1836		case IR_EQ:
1837		case IR_NE:
1838		case IR_LT:
1839		case IR_GE:
1840		case IR_LE:
1841		case IR_GT:
1842		case IR_ULT:
1843		case IR_UGE:
1844		case IR_ULE:
1845		case IR_UGT:
1846			if (IR_IS_TYPE_INT(ctx->ir_base[insn->op1].type)) {
1847				if (IR_IS_CONST_REF(insn->op2)
1848				 && !IR_IS_SYM_CONST(ctx->ir_base[insn->op2].op)
1849				 && ctx->ir_base[insn->op2].val.i64 == 0
1850				 && insn->op1 == ref - 1) { /* previous instruction */
1851					ir_insn *op1_insn = &ctx->ir_base[insn->op1];
1852
1853					if (op1_insn->op == IR_AND && ctx->use_lists[insn->op1].count == 1) {
1854						/* v = AND(_, _); CMP(v, 0) => SKIP_TEST; TEST */
1855						ir_match_fuse_load_test_int(ctx, op1_insn, ref);
1856						ctx->rules[insn->op1] = IR_FUSED | IR_TEST_INT;
1857						return IR_TESTCC_INT;
1858					} else if ((op1_insn->op == IR_OR || op1_insn->op == IR_AND || op1_insn->op == IR_XOR) ||
1859							/* GT(ADD(_, _), 0) can't be optimized because ADD may overflow */
1860							((op1_insn->op == IR_ADD || op1_insn->op == IR_SUB) &&
1861								(insn->op == IR_EQ || insn->op == IR_NE))) {
1862						/* v = BINOP(_, _); CMP(v, 0) => BINOP; SETCC */
1863						if (ir_op_flags[op1_insn->op] & IR_OP_FLAG_COMMUTATIVE) {
1864							ir_match_fuse_load_commutative_int(ctx, op1_insn, ref);
1865							ctx->rules[insn->op1] = IR_BINOP_INT | IR_MAY_SWAP;
1866						} else {
1867							ir_match_fuse_load(ctx, op1_insn->op2, ref);
1868							ctx->rules[insn->op1] = IR_BINOP_INT;
1869						}
1870						return IR_SETCC_INT;
1871					}
1872				}
1873				ir_match_fuse_load_cmp_int(ctx, insn, ref);
1874				return IR_CMP_INT;
1875			} else {
1876				ir_match_fuse_load_cmp_fp(ctx, insn, ref);
1877				return IR_CMP_FP;
1878			}
1879			break;
1880		case IR_ADD:
1881		case IR_SUB:
1882			if (IR_IS_TYPE_INT(insn->type)) {
1883				if ((ctx->flags & IR_OPT_CODEGEN) && IR_IS_CONST_REF(insn->op2)) {
1884					op2_insn = &ctx->ir_base[insn->op2];
1885					if (IR_IS_CONST_REF(insn->op1)) {
1886						// const
1887						// TODO: add support for sym+offset ???
1888					} else if (IR_IS_SYM_CONST(op2_insn->op)) {
1889						if (insn->op == IR_ADD && ir_may_fuse_addr(ctx, op2_insn)) {
1890							goto lea;
1891						}
1892						/* pass */
1893					} else if (op2_insn->val.i64 == 0) {
1894						// return IR_COPY_INT;
1895					} else if ((ir_type_size[insn->type] >= 4 && insn->op == IR_ADD && IR_IS_SIGNED_32BIT(op2_insn->val.i64)) ||
1896							(ir_type_size[insn->type] >= 4 && insn->op == IR_SUB && IR_IS_SIGNED_NEG_32BIT(op2_insn->val.i64))) {
1897lea:
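						/* LEA does the addition in the address-generation unit: it has a
						   non-destructive three-operand form and leaves EFLAGS untouched */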
1898						if (ctx->use_lists[insn->op1].count == 1) {
1899							uint32_t rule = ctx->rules[insn->op1];
1900
1901							if (!rule) {
1902								ctx->rules[insn->op1] = rule = ir_match_insn(ctx, insn->op1);
1903							}
1904							if (rule == IR_LEA_SI) {
1905							/* z = MUL(Y, 2|4|8) ... ADD(z, imm32) => SKIP ... LEA [Y*2|4|8+imm32] */
1906								ctx->rules[insn->op1] = IR_FUSED | IR_SIMPLE | IR_LEA_SI;
1907								return IR_LEA_SI_O;
1908							} else if (rule == IR_LEA_SIB) {
1909							/* z = ADD(X, MUL(Y, 2|4|8)) ... ADD(z, imm32) => SKIP ... LEA [X+Y*2|4|8+imm32] */
1910								ctx->rules[insn->op1] = IR_FUSED | IR_SIMPLE | IR_LEA_SIB;
1911								return IR_LEA_SIB_O;
1912							} else if (rule == IR_LEA_IB) {
1913							/* z = ADD(X, Y) ... ADD(z, imm32) => SKIP ... LEA [X+Y+imm32] */
1914								ctx->rules[insn->op1] = IR_FUSED | IR_SIMPLE | IR_LEA_IB;
1915								return IR_LEA_IB_O;
1916							}
1917						}
1918						/* ADD(X, imm32) => LEA [X+imm32] */
1919						return IR_LEA_OB;
1920					} else if (op2_insn->val.i64 == 1 || op2_insn->val.i64 == -1) {
1921						if (insn->op == IR_ADD) {
1922							if (op2_insn->val.i64 == 1) {
1923								/* ADD(_, 1) => INC */
1924								return IR_INC;
1925							} else {
1926								/* ADD(_, -1) => DEC */
1927								return IR_DEC;
1928							}
1929						} else {
1930							if (op2_insn->val.i64 == 1) {
1931								/* SUB(_, 1) => DEC */
1932								return IR_DEC;
1933							} else {
1934								/* SUB(_, -1) => INC */
1935								return IR_INC;
1936							}
1937						}
1938					}
1939				} else if ((ctx->flags & IR_OPT_CODEGEN) && insn->op == IR_ADD && ir_type_size[insn->type] >= 4) {
1940					if (insn->op1 != insn->op2) {
1941						if (ctx->use_lists[insn->op1].count == 1) {
1942							uint32_t rule = ctx->rules[insn->op1];
1943							if (!rule) {
1944								ctx->rules[insn->op1] = rule = ir_match_insn(ctx, insn->op1);
1945							}
1946							if (rule == IR_LEA_OB) {
1947								ctx->rules[insn->op1] = IR_FUSED | IR_SIMPLE | IR_LEA_OB;
1948								if (ctx->use_lists[insn->op2].count == 1) {
1949									rule = ctx->rules[insn->op2];
1950									if (!rule) {
1951										ctx->rules[insn->op2] = rule = ir_match_insn(ctx, insn->op2);
1952									}
1953									if (rule == IR_LEA_SI) {
1954										/* x = ADD(X, imm32) ... y = MUL(Y, 2|4|8) ... ADD(x, y) => SKIP ... SKIP ... LEA */
1955										ctx->rules[insn->op2] = IR_FUSED | IR_SIMPLE | IR_LEA_SI;
1956										return IR_LEA_OB_SI;
1957									}
1958								}
1959								/* x = ADD(X, imm32) ... ADD(x, Y) => SKIP ... LEA */
1960								return IR_LEA_OB_I;
1961							} else if (rule == IR_LEA_SI) {
1962								ctx->rules[insn->op1] = IR_FUSED | IR_SIMPLE | IR_LEA_SI;
1963								if (ctx->use_lists[insn->op2].count == 1) {
1964									rule = ctx->rules[insn->op2];
1965									if (!rule) {
1966										ctx->rules[insn->op2] = rule = ir_match_insn(ctx, insn->op2);
1967									}
1968									if (rule == IR_LEA_OB) {
1969										/* x = ADD(X, imm32) ... y = MUL(Y, 2|4|8) ... ADD(y, x) => SKIP ... SKIP ... LEA */
1970										ctx->rules[insn->op2] = IR_FUSED | IR_SIMPLE | IR_LEA_OB;
1971										return IR_LEA_SI_OB;
1972									}
1973								}
1974								/* x = MUL(X, 2|4|8) ... ADD(x, Y) => SKIP ... LEA */
1975								return IR_LEA_SI_B;
1976							}
1977						}
1978						if (ctx->use_lists[insn->op2].count == 1) {
1979							uint32_t rule = ctx->rules[insn->op2];
1980							if (!rule) {
1981								ctx->rules[insn->op2] = rule = ir_match_insn(ctx, insn->op2);
1982							}
1983							if (rule == IR_LEA_OB) {
1984								ctx->rules[insn->op2] = IR_FUSED | IR_SIMPLE | IR_LEA_OB;
1985								/* x = ADD(X, imm32) ... ADD(Y, x) => SKIP ... LEA */
1986								return IR_LEA_I_OB;
1987							} else if (rule == IR_LEA_SI) {
1988								ctx->rules[insn->op2] = IR_FUSED | IR_SIMPLE | IR_LEA_SI;
1989								/* x = MUL(X, 2|4|8) ... ADD(Y, x) => SKIP ... LEA */
1990								return IR_LEA_B_SI;
1991							}
1992						}
1993					}
1994					/* ADD(X, Y) => LEA [X + Y] */
1995					return IR_LEA_IB;
1996				}
1997binop_int:
1998				if (ir_op_flags[insn->op] & IR_OP_FLAG_COMMUTATIVE) {
1999					ir_match_fuse_load_commutative_int(ctx, insn, ref);
2000					return IR_BINOP_INT | IR_MAY_SWAP;
2001				} else {
2002					ir_match_fuse_load(ctx, insn->op2, ref);
2003					return IR_BINOP_INT;
2004				}
2005			} else {
2006binop_fp:
2007				if (ir_op_flags[insn->op] & IR_OP_FLAG_COMMUTATIVE) {
2008					ir_match_fuse_load_commutative_fp(ctx, insn, ref);
2009					if (ctx->mflags & IR_X86_AVX) {
2010						return IR_BINOP_AVX;
2011					} else {
2012						return IR_BINOP_SSE2 | IR_MAY_SWAP;
2013					}
2014				} else {
2015					ir_match_fuse_load(ctx, insn->op2, ref);
2016					if (ctx->mflags & IR_X86_AVX) {
2017						return IR_BINOP_AVX;
2018					} else {
2019						return IR_BINOP_SSE2;
2020					}
2021				}
2022			}
2023			break;
2024		case IR_MUL:
2025			if (IR_IS_TYPE_INT(insn->type)) {
2026				if ((ctx->flags & IR_OPT_CODEGEN) && IR_IS_CONST_REF(insn->op2)) {
2027					op2_insn = &ctx->ir_base[insn->op2];
2028					if (IR_IS_SYM_CONST(op2_insn->op)) {
2029						/* pass */
2030					} else if (IR_IS_CONST_REF(insn->op1)) {
2031						// const
2032					} else if (op2_insn->val.u64 == 0) {
2033						// 0
2034					} else if (op2_insn->val.u64 == 1) {
2035						// return IR_COPY_INT;
2036					} else if (ir_type_size[insn->type] >= 4 &&
2037							(op2_insn->val.u64 == 2 || op2_insn->val.u64 == 4 || op2_insn->val.u64 == 8)) {
2038						/* MUL(X, 2|4|8) => LEA [X*2|4|8] */
2039						return IR_LEA_SI;
2040					} else if (ir_type_size[insn->type] >= 4 &&
2041							(op2_insn->val.u64 == 3 || op2_insn->val.u64 == 5 || op2_insn->val.u64 == 9)) {
2042						/* MUL(X, 3|5|9) => LEA [X+X*2|4|8] */
2043						return IR_LEA_SIB;
2044					} else if (IR_IS_POWER_OF_TWO(op2_insn->val.u64)) {
2045						/* MUL(X, PWR2) => SHL */
2046						return IR_MUL_PWR2;
2047					} else if (IR_IS_TYPE_SIGNED(insn->type)
2048					 && ir_type_size[insn->type] != 1
2049					 && IR_IS_SIGNED_32BIT(op2_insn->val.i64)
2050					 && !IR_IS_CONST_REF(insn->op1)) {
2051						/* MUL(_, imm32) => IMUL */
2052						ir_match_fuse_load(ctx, insn->op1, ref);
2053						return IR_IMUL3;
2054					}
2055				}
2056				/* Prefer IMUL over MUL because it's more flexible and uses fewer registers ??? */
2057//				if (IR_IS_TYPE_SIGNED(insn->type) && ir_type_size[insn->type] != 1) {
2058				if (ir_type_size[insn->type] != 1) {
2059					goto binop_int;
2060				}
2061				ir_match_fuse_load(ctx, insn->op2, ref);
2062				return IR_MUL_INT;
2063			} else {
2064				goto binop_fp;
2065			}
2066			break;
2067		case IR_ADD_OV:
2068		case IR_SUB_OV:
2069			IR_ASSERT(IR_IS_TYPE_INT(insn->type));
2070			goto binop_int;
2071		case IR_MUL_OV:
2072			IR_ASSERT(IR_IS_TYPE_INT(insn->type));
2073			if (IR_IS_TYPE_SIGNED(insn->type) && ir_type_size[insn->type] != 1) {
2074				if ((ctx->flags & IR_OPT_CODEGEN) && IR_IS_CONST_REF(insn->op2)) {
2075					op2_insn = &ctx->ir_base[insn->op2];
2076					if (!IR_IS_SYM_CONST(op2_insn->op)
2077					 && IR_IS_SIGNED_32BIT(op2_insn->val.i64)
2078					 && !IR_IS_CONST_REF(insn->op1)) {
2079						/* MUL(_, imm32) => IMUL */
2080						ir_match_fuse_load(ctx, insn->op1, ref);
2081						return IR_IMUL3;
2082					}
2083				}
2084				goto binop_int;
2085			}
2086			ir_match_fuse_load(ctx, insn->op2, ref);
2087			return IR_MUL_INT;
2088		case IR_DIV:
2089			if (IR_IS_TYPE_INT(insn->type)) {
2090				if ((ctx->flags & IR_OPT_CODEGEN) && IR_IS_CONST_REF(insn->op2)) {
2091					op2_insn = &ctx->ir_base[insn->op2];
2092					if (IR_IS_SYM_CONST(op2_insn->op)) {
2093						/* pass */
2094					} else if (IR_IS_CONST_REF(insn->op1)) {
2095						// const
2096					} else if (op2_insn->val.u64 == 1) {
2097						// return IR_COPY_INT;
2098					} else if (IR_IS_POWER_OF_TWO(op2_insn->val.u64)) {
2099						/* DIV(X, PWR2) => SHR */
2100						if (IR_IS_TYPE_UNSIGNED(insn->type)) {
2101							return IR_DIV_PWR2;
2102						} else {
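							/* a bare SAR rounds toward -infinity; the signed variant is
							   expected to bias negative dividends by (divisor - 1) first,
							   matching C's round-toward-zero division */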
2103							return IR_SDIV_PWR2;
2104						}
2105					}
2106				}
2107				ir_match_fuse_load(ctx, insn->op2, ref);
2108				return IR_DIV_INT;
2109			} else {
2110				goto binop_fp;
2111			}
2112			break;
2113		case IR_MOD:
2114			if ((ctx->flags & IR_OPT_CODEGEN) && IR_IS_CONST_REF(insn->op2)) {
2115				op2_insn = &ctx->ir_base[insn->op2];
2116				if (IR_IS_SYM_CONST(op2_insn->op)) {
2117					/* pass */
2118				} else if (IR_IS_CONST_REF(insn->op1)) {
2119					// const
2120				} else if (IR_IS_POWER_OF_TWO(op2_insn->val.u64)) {
2121					/* MOD(X, PWR2) => AND */
2122					if (IR_IS_TYPE_UNSIGNED(insn->type)) {
2123						return IR_MOD_PWR2;
2124					} else {
2125						return IR_SMOD_PWR2;
2126					}
2127				}
2128			}
2129			ir_match_fuse_load(ctx, insn->op2, ref);
2130			return IR_MOD_INT;
2131		case IR_BSWAP:
2132		case IR_NOT:
2133			if (insn->type == IR_BOOL) {
2134				IR_ASSERT(IR_IS_TYPE_INT(ctx->ir_base[insn->op1].type)); // TODO: IR_BOOL_NOT_FP
2135				return IR_BOOL_NOT_INT;
2136			} else {
2137				IR_ASSERT(IR_IS_TYPE_INT(insn->type));
2138				return IR_OP_INT;
2139			}
2140			break;
2141		case IR_NEG:
2142			if (IR_IS_TYPE_INT(insn->type)) {
2143				return IR_OP_INT;
2144			} else {
2145				return IR_OP_FP;
2146			}
2147		case IR_ABS:
2148			if (IR_IS_TYPE_INT(insn->type)) {
2149				return IR_ABS_INT; // movl %edi, %eax; negl %eax; cmovs %edi, %eax
2150			} else {
2151				return IR_OP_FP;
2152			}
2153		case IR_OR:
2154			if ((ctx->flags & IR_OPT_CODEGEN) && IR_IS_CONST_REF(insn->op2)) {
2155				op2_insn = &ctx->ir_base[insn->op2];
2156				if (IR_IS_SYM_CONST(op2_insn->op)) {
2157					/* pass */
2158				} else if (IR_IS_CONST_REF(insn->op1)) {
2159					// const
2160				} else if (op2_insn->val.i64 == 0) {
2161					// return IR_COPY_INT;
2162				} else if (op2_insn->val.i64 == -1) {
2163					// -1
2164				}
2165			}
2166			goto binop_int;
2167		case IR_AND:
2168			if ((ctx->flags & IR_OPT_CODEGEN) && IR_IS_CONST_REF(insn->op2)) {
2169				op2_insn = &ctx->ir_base[insn->op2];
2170				if (IR_IS_SYM_CONST(op2_insn->op)) {
2171					/* pass */
2172				} else if (IR_IS_CONST_REF(insn->op1)) {
2173					// const
2174				} else if (op2_insn->val.i64 == 0) {
2175					// 0
2176				} else if (op2_insn->val.i64 == -1) {
2177					// return IR_COPY_INT;
2178				}
2179			}
2180			goto binop_int;
2181		case IR_XOR:
2182			if ((ctx->flags & IR_OPT_CODEGEN) && IR_IS_CONST_REF(insn->op2)) {
2183				op2_insn = &ctx->ir_base[insn->op2];
2184				if (IR_IS_SYM_CONST(op2_insn->op)) {
2185					/* pass */
2186				} else if (IR_IS_CONST_REF(insn->op1)) {
2187					// const
2188				}
2189			}
2190			goto binop_int;
2191		case IR_SHL:
2192			if (IR_IS_CONST_REF(insn->op2)) {
2193				if (ctx->flags & IR_OPT_CODEGEN) {
2194					op2_insn = &ctx->ir_base[insn->op2];
2195					if (IR_IS_SYM_CONST(op2_insn->op)) {
2196						/* pass */
2197					} else if (IR_IS_CONST_REF(insn->op1)) {
2198						// const
2199					} else if (op2_insn->val.u64 == 0) {
2200						// return IR_COPY_INT;
2201					} else if (ir_type_size[insn->type] >= 4) {
2202						if (op2_insn->val.u64 == 1) {
2203							// lea [op1*2]
2204						} else if (op2_insn->val.u64 == 2) {
2205							// lea [op1*4]
2206						} else if (op2_insn->val.u64 == 3) {
2207							// lea [op1*8]
2208						}
2209					}
2210				}
2211				return IR_SHIFT_CONST;
2212			}
2213			return IR_SHIFT;
2214		case IR_SHR:
2215		case IR_SAR:
2216		case IR_ROL:
2217		case IR_ROR:
2218			if (IR_IS_CONST_REF(insn->op2)) {
2219				if (ctx->flags & IR_OPT_CODEGEN) {
2220					op2_insn = &ctx->ir_base[insn->op2];
2221					if (IR_IS_SYM_CONST(op2_insn->op)) {
2222						/* pass */
2223					} else if (IR_IS_CONST_REF(insn->op1)) {
2224						// const
2225					} else if (op2_insn->val.u64 == 0) {
2226						// return IR_COPY_INT;
2227					}
2228				}
2229				return IR_SHIFT_CONST;
2230			}
2231			return IR_SHIFT;
2232		case IR_MIN:
2233		case IR_MAX:
2234			if (IR_IS_TYPE_INT(insn->type)) {
2235				return IR_MIN_MAX_INT | IR_MAY_SWAP;
2236			} else {
2237				goto binop_fp;
2238			}
2239			break;
2240		case IR_COPY:
2241			if (IR_IS_TYPE_INT(insn->type)) {
2242				return IR_COPY_INT | IR_MAY_REUSE;
2243			} else {
2244				return IR_COPY_FP | IR_MAY_REUSE;
2245			}
2246			break;
2247		case IR_CALL:
2248			if (IR_IS_CONST_REF(insn->op2)) {
2249				const ir_insn *func = &ctx->ir_base[insn->op2];
2250
2251				if (func->op == IR_FUNC && func->proto) {
2252					uint32_t rule = ir_match_builtin_call(ctx, func);
2253
2254					if (rule) {
2255						return rule;
2256					}
2257				}
2258			}
2259			ctx->flags2 |= IR_HAS_CALLS | IR_16B_FRAME_ALIGNMENT;
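			/* the common x86/x86_64 calling conventions require the stack to be
			   16-byte aligned at call sites */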
2260#ifndef IR_REG_FP_RET1
2261			if (IR_IS_TYPE_FP(insn->type)) {
2262				ctx->flags2 |= IR_HAS_FP_RET_SLOT;
2263			}
2264#endif
2265			IR_FALLTHROUGH;
2266		case IR_TAILCALL:
2267		case IR_IJMP:
2268			ir_match_fuse_load(ctx, insn->op2, ref);
2269			return insn->op;
2270		case IR_VAR:
2271			return IR_SKIPPED | IR_VAR;
2272		case IR_PARAM:
2273			return ctx->use_lists[ref].count > 0 ? IR_PARAM : IR_SKIPPED | IR_PARAM;
2274		case IR_ALLOCA:
2275			/* alloca() may be used only in functions */
2276			if (ctx->flags & IR_FUNCTION) {
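				/* a constant-size ALLOCA in the entry block (cfg_map[ref] == 1,
				   presumably block 1) can be folded into the fixed stack frame;
				   anything else forces a frame pointer and dynamic 16-byte
				   alignment */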
2277				if (IR_IS_CONST_REF(insn->op2) && ctx->cfg_map[ref] == 1) {
2278					ir_insn *val = &ctx->ir_base[insn->op2];
2279
2280					if (!IR_IS_SYM_CONST(val->op)) {
2281						return IR_STATIC_ALLOCA;
2282					}
2283				}
2284				ctx->flags |= IR_USE_FRAME_POINTER;
2285				ctx->flags2 |= IR_HAS_ALLOCA | IR_16B_FRAME_ALIGNMENT;
2286			}
2287			return IR_ALLOCA;
2288		case IR_VSTORE:
2289			if (IR_IS_TYPE_INT(ctx->ir_base[insn->op3].type)) {
2290				store_rule = IR_VSTORE_INT;
2291				load_op = IR_VLOAD;
2292store_int:
2293				if ((ctx->flags & IR_OPT_CODEGEN)
2294				 && ir_in_same_block(ctx, insn->op3)
2295				 && (ctx->use_lists[insn->op3].count == 1 ||
2296				     (ctx->use_lists[insn->op3].count == 2
2297				   && (ctx->ir_base[insn->op3].op == IR_ADD_OV ||
2298				       ctx->ir_base[insn->op3].op == IR_SUB_OV)))) {
2299					ir_insn *op_insn = &ctx->ir_base[insn->op3];
2300					uint32_t rule = ctx->rules[insn->op3];
2301
2302					if (!rule) {
2303						ctx->rules[insn->op3] = rule = ir_match_insn(ctx, insn->op3);
2304					}
2305					if (((rule & IR_RULE_MASK) == IR_BINOP_INT && op_insn->op != IR_MUL) || rule == IR_LEA_OB || rule == IR_LEA_IB) {
2306						if (insn->op1 == op_insn->op1
2307						 && ctx->ir_base[op_insn->op1].op == load_op
2308						 && ctx->ir_base[op_insn->op1].op2 == insn->op2
2309						 && ctx->use_lists[op_insn->op1].count == 2) {
2310							/* l = LOAD(_, a) ... v = BINOP(l, _) ... STORE(l, a, v) => SKIP ... SKIP_MEM_BINOP ... MEM_BINOP */
2311							ctx->rules[insn->op3] = IR_FUSED | IR_BINOP_INT;
2312							ctx->rules[op_insn->op1] = IR_SKIPPED | load_op;
2313							if (!IR_IS_CONST_REF(op_insn->op2)
2314							 && ctx->rules[op_insn->op2] == (IR_FUSED|IR_SIMPLE|IR_LOAD)) {
2315								ctx->rules[op_insn->op2] = IR_LOAD_INT;
2316							}
2317							return IR_MEM_BINOP_INT;
2318						} else if ((ir_op_flags[op_insn->op] & IR_OP_FLAG_COMMUTATIVE)
2319						 && insn->op1 == op_insn->op2
2320						 && ctx->ir_base[op_insn->op2].op == load_op
2321						 && ctx->ir_base[op_insn->op2].op2 == insn->op2
2322						 && ctx->use_lists[op_insn->op2].count == 2) {
2323							/* l = LOAD(_, a) ... v = BINOP(_, l) ... STORE(l, a, v) => SKIP ... SKIP_MEM_BINOP ... MEM_BINOP */
2324							ir_swap_ops(op_insn);
2325							ctx->rules[insn->op3] = IR_FUSED | IR_BINOP_INT;
2326							ctx->rules[op_insn->op1] = IR_SKIPPED | load_op;
2327							return IR_MEM_BINOP_INT;
2328						}
2329					} else if (rule == IR_INC) {
2330						if (insn->op1 == op_insn->op1
2331						 && ctx->ir_base[op_insn->op1].op == load_op
2332						 && ctx->ir_base[op_insn->op1].op2 == insn->op2
2333						 && ctx->use_lists[op_insn->op1].count == 2) {
2334							/* l = LOAD(_, a) ... v = INC(l) ... STORE(l, a, v) => SKIP ... SKIP ... MEM_INC */
2335							ctx->rules[insn->op3] = IR_SKIPPED | IR_INC;
2336							ctx->rules[op_insn->op1] = IR_SKIPPED | load_op;
2337							return IR_MEM_INC;
2338						}
2339					} else if (rule == IR_DEC) {
2340						if (insn->op1 == op_insn->op1
2341						 && ctx->ir_base[op_insn->op1].op == load_op
2342						 && ctx->ir_base[op_insn->op1].op2 == insn->op2
2343						 && ctx->use_lists[op_insn->op1].count == 2) {
2344							/* l = LOAD(_, a) ... v = DEC(l) ... STORE(l, a, v) => SKIP ... SKIP ... MEM_DEC */
2345							ctx->rules[insn->op3] = IR_SKIPPED | IR_DEC;
2346							ctx->rules[op_insn->op1] = IR_SKIPPED | load_op;
2347							return IR_MEM_DEC;
2348						}
2349					} else if (rule == IR_MUL_PWR2) {
2350						if (insn->op1 == op_insn->op1
2351						 && ctx->ir_base[op_insn->op1].op == load_op
2352						 && ctx->ir_base[op_insn->op1].op2 == insn->op2
2353						 && ctx->use_lists[op_insn->op1].count == 2) {
2354							/* l = LOAD(_, a) ... v = MUL_PWR2(l) ... STORE(l, a, v) => SKIP ... SKIP ... MEM_MUL_PWR2 */
2355							ctx->rules[insn->op3] = IR_SKIPPED | IR_MUL_PWR2;
2356							ctx->rules[op_insn->op1] = IR_SKIPPED | load_op;
2357							return IR_MEM_MUL_PWR2;
2358						}
2359					} else if (rule == IR_DIV_PWR2) {
2360						if (insn->op1 == op_insn->op1
2361						 && ctx->ir_base[op_insn->op1].op == load_op
2362						 && ctx->ir_base[op_insn->op1].op2 == insn->op2
2363						 && ctx->use_lists[op_insn->op1].count == 2) {
2364							/* l = LOAD(_, a) ... v = DIV_PWR2(l) ... STORE(l, a, v) => SKIP ... SKIP ... MEM_DIV_PWR2 */
2365							ctx->rules[insn->op3] = IR_SKIPPED | IR_DIV_PWR2;
2366							ctx->rules[op_insn->op1] = IR_SKIPPED | load_op;
2367							return IR_MEM_DIV_PWR2;
2368						}
2369					} else if (rule == IR_MOD_PWR2) {
2370						if (insn->op1 == op_insn->op1
2371						 && ctx->ir_base[op_insn->op1].op == load_op
2372						 && ctx->ir_base[op_insn->op1].op2 == insn->op2
2373						 && ctx->use_lists[op_insn->op1].count == 2) {
2374							/* l = LOAD(_, a) ... v = MOD_PWR2(l) ... STORE(l, a, v) => SKIP ... SKIP ... MEM_MOD_PWR2 */
2375							ctx->rules[insn->op3] = IR_SKIPPED | IR_MOD_PWR2;
2376							ctx->rules[op_insn->op1] = IR_SKIPPED | load_op;
2377							return IR_MEM_MOD_PWR2;
2378						}
2379					} else if (rule == IR_SHIFT) {
2380						if (insn->op1 == op_insn->op1
2381						 && ctx->ir_base[op_insn->op1].op == load_op
2382						 && ctx->ir_base[op_insn->op1].op2 == insn->op2
2383						 && ctx->use_lists[op_insn->op1].count == 2) {
2384							/* l = LOAD(_, a) ... v = SHIFT(l, _) ... STORE(l, a, v) => SKIP ... SKIP_SHIFT ... MEM_SHIFT */
2385							ctx->rules[insn->op3] = IR_FUSED | IR_SHIFT;
2386							ctx->rules[op_insn->op1] = IR_SKIPPED | load_op;
2387							return IR_MEM_SHIFT;
2388						}
2389					} else if (rule == IR_SHIFT_CONST) {
2390						if (insn->op1 == op_insn->op1
2391						 && ctx->ir_base[op_insn->op1].op == load_op
2392						 && ctx->ir_base[op_insn->op1].op2 == insn->op2
2393						 && ctx->use_lists[op_insn->op1].count == 2) {
2394							/* l = LOAD(_, a) ... v = SHIFT(l, CONST) ... STORE(l, a, v) => SKIP ... SKIP ... MEM_SHIFT_CONST */
2395							ctx->rules[insn->op3] = IR_SKIPPED | IR_SHIFT_CONST;
2396							ctx->rules[op_insn->op1] = IR_SKIPPED | load_op;
2397							return IR_MEM_SHIFT_CONST;
2398						}
2399					} else if (rule == IR_OP_INT && op_insn->op != IR_BSWAP) {
2400						if (insn->op1 == op_insn->op1
2401						 && ctx->ir_base[op_insn->op1].op == load_op
2402						 && ctx->ir_base[op_insn->op1].op2 == insn->op2
2403						 && ctx->use_lists[op_insn->op1].count == 2) {
2404							/* l = LOAD(_, a) ... v = OP(l) ... STORE(l, a, v) => SKIP ... SKIP ... MEM_OP */
2405							ctx->rules[insn->op3] = IR_SKIPPED | IR_OP_INT;
2406							ctx->rules[op_insn->op1] = IR_SKIPPED | load_op;
2407							return IR_MEM_OP_INT;
2408						}
2409					} else if (rule == IR_CMP_INT && load_op == IR_LOAD) {
2410						/* c = CMP(_, _) ... STORE(c) => SKIP_CMP ... CMP_AND_STORE_INT */
2411						ctx->rules[insn->op3] = IR_FUSED | IR_CMP_INT;
2412						return IR_CMP_AND_STORE_INT;
2413					}
2414				}
2415				return store_rule;
2416			} else {
2417				return IR_VSTORE_FP;
2418			}
2419			break;
2420		case IR_LOAD:
2421			ir_match_fuse_addr(ctx, insn->op2);
2422			if (IR_IS_TYPE_INT(insn->type)) {
2423				return IR_LOAD_INT;
2424			} else {
2425				return IR_LOAD_FP;
2426			}
2427			break;
2428		case IR_STORE:
2429			ir_match_fuse_addr(ctx, insn->op2);
2430			if (IR_IS_TYPE_INT(ctx->ir_base[insn->op3].type)) {
2431				store_rule = IR_STORE_INT;
2432				load_op = IR_LOAD;
2433				goto store_int;
2434			} else {
2435				return IR_STORE_FP;
2436			}
2437			break;
2438		case IR_RLOAD:
2439			if (IR_REGSET_IN(IR_REGSET_UNION((ir_regset)ctx->fixed_regset, IR_REGSET_FIXED), insn->op2)) {
2440				return IR_SKIPPED | IR_RLOAD;
2441			}
2442			return IR_RLOAD;
2443		case IR_RSTORE:
2444			if (IR_IS_TYPE_INT(ctx->ir_base[insn->op2].type)) {
2445				if ((ctx->flags & IR_OPT_CODEGEN)
2446				 && ir_in_same_block(ctx, insn->op2)
2447			 && ctx->use_lists[insn->op2].count == 1) {
2449					ir_insn *op_insn = &ctx->ir_base[insn->op2];
2450
2451					if (op_insn->op == IR_ADD ||
2452				        op_insn->op == IR_SUB ||
2453//				        op_insn->op == IR_MUL ||
2454				        op_insn->op == IR_OR  ||
2455				        op_insn->op == IR_AND ||
2456				        op_insn->op == IR_XOR) {
2457						if (insn->op1 == op_insn->op1
2458						 && ctx->ir_base[op_insn->op1].op == IR_RLOAD
2459						 && ctx->ir_base[op_insn->op1].op2 == insn->op3
2460						 && ctx->use_lists[op_insn->op1].count == 2) {
2461							/* l = RLOAD(r) ... v = BINOP(l, _) ... RSTORE(l, r, v) => SKIP ... SKIP_REG_BINOP ... REG_BINOP */
2462							ctx->rules[insn->op2] = IR_FUSED | IR_BINOP_INT;
2463							ctx->rules[op_insn->op1] = IR_SKIPPED | IR_RLOAD;
2464							return IR_REG_BINOP_INT;
2465						} else if ((ir_op_flags[op_insn->op] & IR_OP_FLAG_COMMUTATIVE)
2466						 && insn->op1 == op_insn->op2
2467						 && ctx->ir_base[op_insn->op2].op == IR_RLOAD
2468						 && ctx->ir_base[op_insn->op2].op2 == insn->op3
2469						 && ctx->use_lists[op_insn->op2].count == 2) {
2470							/* l = RLOAD(r) ... v = BINOP(x, l) ... RSTORE(l, r, v) => SKIP ... SKIP_REG_BINOP ... REG_BINOP */
2471							ir_swap_ops(op_insn);
2472							ctx->rules[insn->op2] = IR_FUSED | IR_BINOP_INT;
2473							ctx->rules[op_insn->op1] = IR_SKIPPED | IR_RLOAD;
2474							return IR_REG_BINOP_INT;
2475						}
2476					}
2477				}
2478			}
2479			ir_match_fuse_load(ctx, insn->op2, ref);
2480			return IR_RSTORE;
2481		case IR_START:
2482		case IR_BEGIN:
2483		case IR_IF_TRUE:
2484		case IR_IF_FALSE:
2485		case IR_CASE_VAL:
2486		case IR_CASE_DEFAULT:
2487		case IR_MERGE:
2488		case IR_LOOP_BEGIN:
2489		case IR_UNREACHABLE:
2490			return IR_SKIPPED | insn->op;
2491		case IR_RETURN:
2492			if (!insn->op2) {
2493				return IR_RETURN_VOID;
2494			} else if (IR_IS_TYPE_INT(ctx->ir_base[insn->op2].type)) {
2495				return IR_RETURN_INT;
2496			} else {
2497				return IR_RETURN_FP;
2498			}
2499		case IR_IF:
2500			if (ir_in_same_block(ctx, insn->op2) && ctx->use_lists[insn->op2].count == 1) {
2501				op2_insn = &ctx->ir_base[insn->op2];
2502				if (op2_insn->op >= IR_EQ && op2_insn->op <= IR_UGT) {
2503					if (IR_IS_TYPE_INT(ctx->ir_base[op2_insn->op1].type)) {
2504						if (IR_IS_CONST_REF(op2_insn->op2)
2505						 && !IR_IS_SYM_CONST(ctx->ir_base[op2_insn->op2].op)
2506						 && ctx->ir_base[op2_insn->op2].val.i64 == 0
2507						 && op2_insn->op1 == insn->op2 - 1) { /* previous instruction */
2508							ir_insn *op1_insn = &ctx->ir_base[op2_insn->op1];
2509
2510							if (op1_insn->op == IR_AND && ctx->use_lists[op2_insn->op1].count == 1) {
2511								/* v = AND(_, _); c = CMP(v, 0) ... IF(c) => SKIP_TEST; SKIP ... TEST_AND_BRANCH */
2512								ir_match_fuse_load_test_int(ctx, op1_insn, ref);
2513								ctx->rules[op2_insn->op1] = IR_FUSED | IR_TEST_INT;
2514								ctx->rules[insn->op2] = IR_FUSED | IR_SIMPLE | IR_NOP;
2515								return IR_TEST_AND_BRANCH_INT;
2516							} else if (insn->op2 == ref - 1 && /* previous instruction */
2517									((op1_insn->op == IR_OR || op1_insn->op == IR_AND || op1_insn->op == IR_XOR) ||
2518										/* GT(ADD(_, _), 0) can't be optimized because ADD may overflow */
2519										((op1_insn->op == IR_ADD || op1_insn->op == IR_SUB) &&
2520											(op2_insn->op == IR_EQ || op2_insn->op == IR_NE)))) {
2521								/* v = BINOP(_, _); c = CMP(v, 0) ... IF(c) => BINOP; SKIP_CMP ... JCC */
2522								if (ir_op_flags[op1_insn->op] & IR_OP_FLAG_COMMUTATIVE) {
2523									ir_match_fuse_load_commutative_int(ctx, op1_insn, ref);
2524									ctx->rules[op2_insn->op1] = IR_BINOP_INT | IR_MAY_SWAP;
2525								} else {
2526									ir_match_fuse_load(ctx, op1_insn->op2, ref);
2527									ctx->rules[op2_insn->op1] = IR_BINOP_INT;
2528								}
2529								ctx->rules[insn->op2] = IR_FUSED | IR_CMP_INT;
2530								return IR_JCC_INT;
2531							}
2532						}
2533						/* c = CMP(_, _) ... IF(c) => SKIP_CMP ... CMP_AND_BRANCH */
2534						ir_match_fuse_load_cmp_int(ctx, op2_insn, ref);
2535						ctx->rules[insn->op2] = IR_FUSED | IR_CMP_INT;
2536						return IR_CMP_AND_BRANCH_INT;
2537					} else {
2538						/* c = CMP(_, _) ... IF(c) => SKIP_CMP ... CMP_AND_BRANCH */
2539						ir_match_fuse_load_cmp_fp_br(ctx, op2_insn, ref, 1);
2540						ctx->rules[insn->op2] = IR_FUSED | IR_CMP_FP;
2541						return IR_CMP_AND_BRANCH_FP;
2542					}
2543				} else if (op2_insn->op == IR_AND) {
2544					/* c = AND(_, _) ... IF(c) => SKIP_TEST ... TEST_AND_BRANCH */
2545					ir_match_fuse_load_test_int(ctx, op2_insn, ref);
2546					ctx->rules[insn->op2] = IR_FUSED | IR_TEST_INT;
2547					return IR_TEST_AND_BRANCH_INT;
2548				} else if (op2_insn->op == IR_OVERFLOW) {
2549					/* c = OVERFLOW(_) ... IF(c) => SKIP_OVERFLOW ... OVERFLOW_AND_BRANCH */
2550					ctx->rules[insn->op2] = IR_FUSED | IR_SIMPLE | IR_OVERFLOW;
2551					return IR_OVERFLOW_AND_BRANCH;
2552				}
2553			}
2554			if (IR_IS_TYPE_INT(ctx->ir_base[insn->op2].type)) {
2555				if (insn->op2 == ref - 1 /* previous instruction */
2556				 && ir_in_same_block(ctx, insn->op2)) {
2557					op2_insn = &ctx->ir_base[insn->op2];
2558					if (op2_insn->op == IR_ADD ||
2559					    op2_insn->op == IR_SUB ||
2560//					    op2_insn->op == IR_MUL ||
2561					    op2_insn->op == IR_OR  ||
2562					    op2_insn->op == IR_AND ||
2563					    op2_insn->op == IR_XOR) {
2564
2565						/* v = BINOP(_, _); IF(v) => BINOP; JCC */
2566						if (ir_op_flags[op2_insn->op] & IR_OP_FLAG_COMMUTATIVE) {
2567							ir_match_fuse_load_commutative_int(ctx, op2_insn, ref);
2568							ctx->rules[insn->op2] = IR_BINOP_INT | IR_MAY_SWAP;
2569						} else {
2570							ir_match_fuse_load(ctx, op2_insn->op2, ref);
2571							ctx->rules[insn->op2] = IR_BINOP_INT;
2572						}
2573						return IR_JCC_INT;
2574					}
2575				} else if ((ctx->flags & IR_OPT_CODEGEN)
2576				 && insn->op1 == ref - 1 /* previous instruction */
2577				 && insn->op2 == ref - 2 /* previous instruction */
2578				 && ir_in_same_block(ctx, insn->op2)
2579				 && ctx->use_lists[insn->op2].count == 2) {
2581					ir_insn *store_insn = &ctx->ir_base[insn->op1];
2582
2583					if (store_insn->op == IR_STORE && store_insn->op3 == insn->op2) {
2584						ir_insn *op_insn = &ctx->ir_base[insn->op2];
2585
2586						if (op_insn->op == IR_ADD ||
2587						    op_insn->op == IR_SUB ||
2588//						    op_insn->op == IR_MUL ||
2589						    op_insn->op == IR_OR  ||
2590						    op_insn->op == IR_AND ||
2591						    op_insn->op == IR_XOR) {
2592							if (ctx->ir_base[op_insn->op1].op == IR_LOAD
2593							 && ctx->ir_base[op_insn->op1].op2 == store_insn->op2) {
2594								if (ir_in_same_block(ctx, op_insn->op1)
2595								 && ctx->use_lists[op_insn->op1].count == 2
2596								 && store_insn->op1 == op_insn->op1) {
2597									/* v = MEM_BINOP(_, _); IF(v) => MEM_BINOP; JCC */
2598									ctx->rules[insn->op2] = IR_FUSED | IR_BINOP_INT;
2599									ctx->rules[op_insn->op1] = IR_SKIPPED | IR_LOAD;
2600									ir_match_fuse_addr(ctx, store_insn->op2);
2601									ctx->rules[insn->op1] = IR_MEM_BINOP_INT;
2602									return IR_JCC_INT;
2603								}
2604							} else if ((ir_op_flags[op_insn->op] & IR_OP_FLAG_COMMUTATIVE)
2605							 && ctx->ir_base[op_insn->op2].op == IR_LOAD
2606							 && ctx->ir_base[op_insn->op2].op2 == store_insn->op2) {
2607								if (ir_in_same_block(ctx, op_insn->op2)
2608								 && ctx->use_lists[op_insn->op2].count == 2
2609								 && store_insn->op1 == op_insn->op2) {
2610									/* v = MEM_BINOP(_, _); IF(v) => MEM_BINOP; JCC */
2611									ir_swap_ops(op_insn);
2612									ctx->rules[insn->op2] = IR_FUSED | IR_BINOP_INT;
2613									ctx->rules[op_insn->op1] = IR_SKIPPED | IR_LOAD;
2614									ir_match_fuse_addr(ctx, store_insn->op2);
2615									ctx->rules[insn->op1] = IR_MEM_BINOP_INT;
2616									return IR_JCC_INT;
2617								}
2618							}
2619						}
2620					}
2621				}
2622				ir_match_fuse_load(ctx, insn->op2, ref);
2623				return IR_IF_INT;
2624			} else {
2625				IR_ASSERT(0 && "NIY IR_IF_FP");
2626				break;
2627			}
2628		case IR_COND:
2629			if (ir_in_same_block(ctx, insn->op1) && ctx->use_lists[insn->op1].count == 1) {
2630				ir_insn *op1_insn = &ctx->ir_base[insn->op1];
2631
2632				if (op1_insn->op >= IR_EQ && op1_insn->op <= IR_UGT) {
2633					if (IR_IS_TYPE_INT(ctx->ir_base[op1_insn->op1].type)) {
2634						ir_match_fuse_load_cmp_int(ctx, op1_insn, ref);
2635						ctx->rules[insn->op1] = IR_FUSED | IR_CMP_INT;
2636						return IR_COND_CMP_INT;
2637					} else {
2638						ir_match_fuse_load_cmp_fp_br(ctx, op1_insn, ref, 1);
2639						ctx->rules[insn->op1] = IR_FUSED | IR_CMP_FP;
2640						return IR_COND_CMP_FP;
2641					}
2642				}
2643			}
2644			return IR_COND;
2645		case IR_GUARD:
2646		case IR_GUARD_NOT:
2647			if (ir_in_same_block(ctx, insn->op2) && ctx->use_lists[insn->op2].count == 1) {
2648				op2_insn = &ctx->ir_base[insn->op2];
2649				if (op2_insn->op >= IR_EQ && op2_insn->op <= IR_UGT
2650					// TODO: register allocator may clobber operands of CMP before they are used in the GUARD_CMP
2651				 && (insn->op2 == ref - 1 ||
2652				     (insn->op2 == ctx->prev_ref[ref] - 1
2653				   && ctx->ir_base[ctx->prev_ref[ref]].op == IR_SNAPSHOT))) {
2654					if (IR_IS_TYPE_INT(ctx->ir_base[op2_insn->op1].type)) {
2655						if (IR_IS_CONST_REF(op2_insn->op2)
2656						 && !IR_IS_SYM_CONST(ctx->ir_base[op2_insn->op2].op)
2657						 && ctx->ir_base[op2_insn->op2].val.i64 == 0) {
2658							if (op2_insn->op1 == insn->op2 - 1) { /* previous instruction */
2659								ir_insn *op1_insn = &ctx->ir_base[op2_insn->op1];
2660
2661								if ((op1_insn->op == IR_OR || op1_insn->op == IR_AND || op1_insn->op == IR_XOR) ||
2662										/* GT(ADD(_, _), 0) can't be optimized because ADD may overflow */
2663										((op1_insn->op == IR_ADD || op1_insn->op == IR_SUB) &&
2664											(op2_insn->op == IR_EQ || op2_insn->op == IR_NE))) {
2665									if (ir_op_flags[op1_insn->op] & IR_OP_FLAG_COMMUTATIVE) {
2666										ir_match_fuse_load_commutative_int(ctx, op1_insn, ref);
2667										ctx->rules[op2_insn->op1] = IR_BINOP_INT | IR_MAY_SWAP;
2668									} else {
2669										ir_match_fuse_load(ctx, op1_insn->op2, ref);
2670										ctx->rules[op2_insn->op1] = IR_BINOP_INT;
2671									}
2672									/* v = BINOP(_, _); c = CMP(v, 0) ... IF(c) => BINOP; SKIP_CMP ... GUARD_JCC */
2673									ctx->rules[insn->op2] = IR_FUSED | IR_CMP_INT;
2674									return IR_GUARD_JCC_INT;
2675								}
2676							} else if ((ctx->flags & IR_OPT_CODEGEN)
2677							 && op2_insn->op1 == insn->op2 - 2 /* before previous instruction */
2678							 && ir_in_same_block(ctx, op2_insn->op1)
2679							 && ctx->use_lists[op2_insn->op1].count == 2) {
2680								ir_insn *store_insn = &ctx->ir_base[insn->op2 - 1];
2681
2682								if (store_insn->op == IR_STORE && store_insn->op3 == op2_insn->op1) {
2683									ir_insn *op_insn = &ctx->ir_base[op2_insn->op1];
2684
2685									if ((op_insn->op == IR_OR || op_insn->op == IR_AND || op_insn->op == IR_XOR) ||
2686											/* GT(ADD(_, _), 0) can't be optimized because ADD may overflow */
2687											((op_insn->op == IR_ADD || op_insn->op == IR_SUB) &&
2688												(op2_insn->op == IR_EQ || op2_insn->op == IR_NE))) {
2689										if (ctx->ir_base[op_insn->op1].op == IR_LOAD
2690										 && ctx->ir_base[op_insn->op1].op2 == store_insn->op2) {
2691											if (ir_in_same_block(ctx, op_insn->op1)
2692											 && ctx->use_lists[op_insn->op1].count == 2
2693											 && store_insn->op1 == op_insn->op1) {
2694												/* v = MEM_BINOP(_, _); IF(v) => MEM_BINOP; GUARD_JCC */
2695												ctx->rules[op2_insn->op1] = IR_FUSED | IR_BINOP_INT;
2696												ctx->rules[op_insn->op1] = IR_SKIPPED | IR_LOAD;
2697												ir_match_fuse_addr(ctx, store_insn->op2);
2698												ctx->rules[insn->op2 - 1] = IR_MEM_BINOP_INT;
2699												ctx->rules[insn->op2] = IR_SKIPPED | IR_NOP;
2700												return IR_GUARD_JCC_INT;
2701											}
2702										} else if ((ir_op_flags[op_insn->op] & IR_OP_FLAG_COMMUTATIVE)
2703										 && ctx->ir_base[op_insn->op2].op == IR_LOAD
2704										 && ctx->ir_base[op_insn->op2].op2 == store_insn->op2) {
2705											if (ir_in_same_block(ctx, op_insn->op2)
2706											 && ctx->use_lists[op_insn->op2].count == 2
2707											 && store_insn->op1 == op_insn->op2) {
2708												/* v = MEM_BINOP(_, _); IF(v) => MEM_BINOP; JCC */
2709												ir_swap_ops(op_insn);
2710												ctx->rules[op2_insn->op1] = IR_FUSED | IR_BINOP_INT;
2711												ctx->rules[op_insn->op1] = IR_SKIPPED | IR_LOAD;
2712												ir_match_fuse_addr(ctx, store_insn->op2);
2713												ctx->rules[insn->op2 - 1] = IR_MEM_BINOP_INT;
2714												ctx->rules[insn->op2] = IR_SKIPPED | IR_NOP;
2715												return IR_GUARD_JCC_INT;
2716											}
2717										}
2718									}
2719								}
2720							}
2721						}
2722						/* c = CMP(_, _) ... GUARD(c) => SKIP_CMP ... GUARD_CMP */
2723						ir_match_fuse_load_cmp_int(ctx, op2_insn, ref);
2724						ctx->rules[insn->op2] = IR_FUSED | IR_CMP_INT;
2725						return IR_GUARD_CMP_INT;
2726					} else {
2727						/* c = CMP(_, _) ... GUARD(c) => SKIP_CMP ... GUARD_CMP */
2728						ir_match_fuse_load_cmp_fp_br(ctx, op2_insn, ref, insn->op == IR_GUARD_NOT);
2729						ctx->rules[insn->op2] = IR_FUSED | IR_CMP_FP;
2730						return IR_GUARD_CMP_FP;
2731					}
2732				} else if (op2_insn->op == IR_AND) { // TODO: OR, XOR, etc.
2733					/* c = AND(_, _) ... GUARD(c) => SKIP_TEST ... GUARD_TEST */
2734					ir_match_fuse_load_test_int(ctx, op2_insn, ref);
2735					ctx->rules[insn->op2] = IR_FUSED | IR_TEST_INT;
2736					return IR_GUARD_TEST_INT;
2737				} else if (op2_insn->op == IR_OVERFLOW) {
2738					/* c = OVERFLOW(_) ... GUARD(c) => SKIP_OVERFLOW ... GUARD_OVERFLOW */
2739					ctx->rules[insn->op2] = IR_FUSED | IR_SIMPLE | IR_OVERFLOW;
2740					return IR_GUARD_OVERFLOW;
2741				}
2742			}
2743			ir_match_fuse_load(ctx, insn->op2, ref);
2744			return insn->op;
2745		case IR_INT2FP:
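			/* CVTSI2SS/CVTSI2SD consume a 32/64-bit signed source directly (the
			   unsigned 64-bit path can presumably read memory as well), so the
			   load is fused only for those; narrower types are widened in a
			   register first */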
2746			if (ir_type_size[ctx->ir_base[insn->op1].type] > (IR_IS_TYPE_SIGNED(ctx->ir_base[insn->op1].type) ? 2 : 4)) {
2747				ir_match_fuse_load(ctx, insn->op1, ref);
2748			}
2749			return insn->op;
2750		case IR_SEXT:
2751		case IR_ZEXT:
2752		case IR_FP2INT:
2753		case IR_FP2FP:
2754			ir_match_fuse_load(ctx, insn->op1, ref);
2755			return insn->op;
2756		case IR_TRUNC:
2757		case IR_PROTO:
2758			ir_match_fuse_load(ctx, insn->op1, ref);
2759			return insn->op | IR_MAY_REUSE;
2760		case IR_BITCAST:
2761			ir_match_fuse_load(ctx, insn->op1, ref);
2762			if (IR_IS_TYPE_INT(insn->type) && IR_IS_TYPE_INT(ctx->ir_base[insn->op1].type)) {
2763				return insn->op | IR_MAY_REUSE;
2764			} else {
2765				return insn->op;
2766			}
2767		case IR_CTLZ:
2768		case IR_CTTZ:
2769			ir_match_fuse_load(ctx, insn->op1, ref);
2770			return IR_BIT_COUNT;
2771		case IR_CTPOP:
2772			ir_match_fuse_load(ctx, insn->op1, ref);
2773			return (ctx->mflags & IR_X86_BMI1) ? IR_BIT_COUNT : IR_CTPOP;
2774		case IR_VA_START:
2775			ctx->flags2 |= IR_HAS_VA_START;
2776			if ((ctx->ir_base[insn->op2].op == IR_ALLOCA) || (ctx->ir_base[insn->op2].op == IR_VADDR)) {
2777				ir_use_list *use_list = &ctx->use_lists[insn->op2];
2778				ir_ref *p, n = use_list->count;
2779				for (p = &ctx->use_edges[use_list->refs]; n > 0; p++, n--) {
2780					ir_insn *use_insn = &ctx->ir_base[*p];
2781					if (use_insn->op == IR_VA_START || use_insn->op == IR_VA_END) {
2782					} else if (use_insn->op == IR_VA_COPY) {
2783						if (use_insn->op3 == insn->op2) {
2784							ctx->flags2 |= IR_HAS_VA_COPY;
2785						}
2786					} else if (use_insn->op == IR_VA_ARG) {
2787						if (use_insn->op2 == insn->op2) {
2788							if (IR_IS_TYPE_INT(use_insn->type)) {
2789								ctx->flags2 |= IR_HAS_VA_ARG_GP;
2790							} else {
2791								IR_ASSERT(IR_IS_TYPE_FP(use_insn->type));
2792								ctx->flags2 |= IR_HAS_VA_ARG_FP;
2793							}
2794						}
2795					} else if (*p > ref) {
2796						/* direct va_list access */
2797						ctx->flags2 |= IR_HAS_VA_ARG_GP|IR_HAS_VA_ARG_FP;
2798					}
2799				}
2800			}
2801			return IR_VA_START;
2802		case IR_VA_END:
2803			return IR_SKIPPED | IR_NOP;
2804		case IR_VADDR:
2805			if (ctx->use_lists[ref].count > 0) {
2806				ir_use_list *use_list = &ctx->use_lists[ref];
2807				ir_ref *p, n = use_list->count;
2808
2809				for (p = &ctx->use_edges[use_list->refs]; n > 0; p++, n--) {
2810					if (ctx->ir_base[*p].op != IR_VA_END) {
2811						return IR_STATIC_ALLOCA;
2812					}
2813				}
2814			}
2815			return IR_SKIPPED | IR_NOP;
2816		default:
2817			break;
2818	}
2819
2820	return insn->op;
2821}
2822
2823static void ir_match_insn2(ir_ctx *ctx, ir_ref ref, uint32_t rule)
2824{
2825	if (rule == IR_LEA_IB) {
2826		ir_match_try_revert_lea_to_add(ctx, ref);
2827	}
2828}
2829
2830/* code generation */
2831static int32_t ir_ref_spill_slot_offset(ir_ctx *ctx, ir_ref ref, ir_reg *reg)
2832{
2833	int32_t offset;
2834
2835	IR_ASSERT(ref >= 0 && ctx->vregs[ref] && ctx->live_intervals[ctx->vregs[ref]]);
2836	offset = ctx->live_intervals[ctx->vregs[ref]]->stack_spill_pos;
2837	IR_ASSERT(offset != -1);
2838	if (ctx->live_intervals[ctx->vregs[ref]]->flags & IR_LIVE_INTERVAL_SPILL_SPECIAL) {
2839		IR_ASSERT(ctx->spill_base != IR_REG_NONE);
2840		*reg = ctx->spill_base;
2841		return offset;
2842	}
2843	*reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
2844	return IR_SPILL_POS_TO_OFFSET(offset);
2845}
2846
2847static ir_mem ir_vreg_spill_slot(ir_ctx *ctx, ir_ref v)
2848{
2849	int32_t offset;
2850	ir_reg base;
2851
2852	IR_ASSERT(v > 0 && v <= ctx->vregs_count && ctx->live_intervals[v]);
2853	offset = ctx->live_intervals[v]->stack_spill_pos;
2854	IR_ASSERT(offset != -1);
2855	if (ctx->live_intervals[v]->flags & IR_LIVE_INTERVAL_SPILL_SPECIAL) {
2856		IR_ASSERT(ctx->spill_base != IR_REG_NONE);
2857		return IR_MEM_BO(ctx->spill_base, offset);
2858	}
2859	base = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
2860	offset = IR_SPILL_POS_TO_OFFSET(offset);
2861	return IR_MEM_BO(base, offset);
2862}
2863
2864static ir_mem ir_ref_spill_slot(ir_ctx *ctx, ir_ref ref)
2865{
2866	IR_ASSERT(!IR_IS_CONST_REF(ref));
2867	return ir_vreg_spill_slot(ctx, ctx->vregs[ref]);
2868}
2869
2870static bool ir_is_same_spill_slot(ir_ctx *ctx, ir_ref ref, ir_mem mem)
2871{
2872	ir_mem m = ir_ref_spill_slot(ctx, ref);
2873	return IR_MEM_VAL(m) == IR_MEM_VAL(mem);
2874}
2875
2876static ir_mem ir_var_spill_slot(ir_ctx *ctx, ir_ref ref)
2877{
2878	ir_insn *var_insn = &ctx->ir_base[ref];
2879	ir_reg reg;
2880
2881	IR_ASSERT(var_insn->op == IR_VAR);
2882	reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
2883	return IR_MEM_BO(reg, IR_SPILL_POS_TO_OFFSET(var_insn->op3));
2884}
2885
2886static bool ir_may_avoid_spill_load(ir_ctx *ctx, ir_ref ref, ir_ref use)
2887{
2888	ir_live_interval *ival;
2889
2890	IR_ASSERT(ctx->vregs[ref] && ctx->live_intervals[ctx->vregs[ref]]);
2891	ival = ctx->live_intervals[ctx->vregs[ref]];
2892	while (ival) {
2893		ir_use_pos *use_pos = ival->use_pos;
2894		while (use_pos) {
2895			if (IR_LIVE_POS_TO_REF(use_pos->pos) == use) {
2896				return !use_pos->next || use_pos->next->op_num == 0;
2897			}
2898			use_pos = use_pos->next;
2899		}
2900		ival = ival->next;
2901	}
2902	return 0;
2903}
2904
2905static void ir_emit_mov_imm_int(ir_ctx *ctx, ir_type type, ir_reg reg, int64_t val)
2906{
2907	ir_backend_data *data = ctx->data;
2908	dasm_State **Dst = &data->dasm_state;
2909
2910	if (ir_type_size[type] == 8) {
2911		IR_ASSERT(sizeof(void*) == 8);
2912|.if X64
2913		if (IR_IS_UNSIGNED_32BIT(val)) {
2914			|	mov Rd(reg), (uint32_t)val // zero extended load
2915		} else if (IR_IS_SIGNED_32BIT(val)) {
2916			|	mov Rq(reg), (int32_t)val // sign extended load
2917		} else if (type == IR_ADDR && IR_MAY_USE_32BIT_ADDR(ctx->code_buffer, (intptr_t)val)) {
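			/* within +/-2GB of the code buffer a RIP-relative LEA is shorter than
			   the 10-byte mov64 */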
2918			|	lea Ra(reg), [&val]
2919		} else {
2920			|	mov64 Ra(reg), val
2921		}
2922|.endif
2923	} else {
2924		|	ASM_REG_IMM_OP mov, type, reg, (int32_t)val // sign extended load
2925	}
2926}
2927
2928static void ir_emit_load_imm_int(ir_ctx *ctx, ir_type type, ir_reg reg, int64_t val)
2929{
2930	ir_backend_data *data = ctx->data;
2931	dasm_State **Dst = &data->dasm_state;
2932
2933	IR_ASSERT(IR_IS_TYPE_INT(type));
2934	if (val == 0) {
2935		|	ASM_REG_REG_OP xor, type, reg, reg
2936	} else {
2937		ir_emit_mov_imm_int(ctx, type, reg, val);
2938	}
2939}
2940
2941static void ir_emit_load_mem_int(ir_ctx *ctx, ir_type type, ir_reg reg, ir_mem mem)
2942{
2943	ir_backend_data *data = ctx->data;
2944	dasm_State **Dst = &data->dasm_state;
2945
2946	|	ASM_REG_MEM_OP mov, type, reg, mem
2947}
2948
2949static void ir_emit_load_imm_fp(ir_ctx *ctx, ir_type type, ir_reg reg, ir_ref src)
2950{
2951	ir_backend_data *data = ctx->data;
2952	dasm_State **Dst = &data->dasm_state;
2953	ir_insn *insn = &ctx->ir_base[src];
2954	int label;
2955
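	/* +0.0 is materialized with XORPS/XORPD: shorter than loading the
	   constant from memory, and a recognized dependency-breaking idiom */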
2956	if (type == IR_FLOAT && insn->val.u32 == 0) {
2957		if (ctx->mflags & IR_X86_AVX) {
2958			|	vxorps xmm(reg-IR_REG_FP_FIRST), xmm(reg-IR_REG_FP_FIRST), xmm(reg-IR_REG_FP_FIRST)
2959		} else {
2960			|	xorps xmm(reg-IR_REG_FP_FIRST), xmm(reg-IR_REG_FP_FIRST)
2961		}
2962	} else if (type == IR_DOUBLE && insn->val.u64 == 0) {
2963		if (ctx->mflags & IR_X86_AVX) {
2964			|	vxorpd xmm(reg-IR_REG_FP_FIRST), xmm(reg-IR_REG_FP_FIRST), xmm(reg-IR_REG_FP_FIRST)
2965		} else {
2966			|	xorpd xmm(reg-IR_REG_FP_FIRST), xmm(reg-IR_REG_FP_FIRST)
2967		}
2968	} else {
2969		label = ir_const_label(ctx, src);
2970		|	ASM_FP_REG_TXT_OP movs, type, reg, [=>label]
2971	}
2972}
2973
2974static void ir_emit_load_mem_fp(ir_ctx *ctx, ir_type type, ir_reg reg, ir_mem mem)
2975{
2976	ir_backend_data *data = ctx->data;
2977	dasm_State **Dst = &data->dasm_state;
2978
2979	|	ASM_FP_REG_MEM_OP movs, type, reg, mem
2980}
2981
2982static void ir_emit_load_mem(ir_ctx *ctx, ir_type type, ir_reg reg, ir_mem mem)
2983{
2984	if (IR_IS_TYPE_INT(type)) {
2985		ir_emit_load_mem_int(ctx, type, reg, mem);
2986	} else {
2987		ir_emit_load_mem_fp(ctx, type, reg, mem);
2988	}
2989}
2990
2991static void ir_load_local_addr(ir_ctx *ctx, ir_reg reg, ir_ref src)
2992{
2993	ir_backend_data *data = ctx->data;
2994	dasm_State **Dst = &data->dasm_state;
2995	ir_reg base = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
2996	int32_t offset = IR_SPILL_POS_TO_OFFSET(ctx->ir_base[src].op3);
2997
2998	IR_ASSERT(ir_rule(ctx, src) == IR_STATIC_ALLOCA);
2999	if (offset == 0) {
3000		| mov Ra(reg), Ra(base)
3001	} else {
3002		| lea Ra(reg), [Ra(base)+offset]
3003	}
3004}
3005
3006static void ir_emit_load(ir_ctx *ctx, ir_type type, ir_reg reg, ir_ref src)
3007{
3008	if (IR_IS_CONST_REF(src)) {
3009		if (IR_IS_TYPE_INT(type)) {
3010			ir_insn *insn = &ctx->ir_base[src];
3011
3012			if (insn->op == IR_SYM || insn->op == IR_FUNC) {
3013				void *addr = ir_sym_val(ctx, insn);
3014				ir_emit_load_imm_int(ctx, type, reg, (intptr_t)addr);
3015			} else if (insn->op == IR_STR) {
3016				ir_backend_data *data = ctx->data;
3017				dasm_State **Dst = &data->dasm_state;
3018				int label = ir_const_label(ctx, src);
3019
3020				|	lea Ra(reg), aword [=>label]
3021			} else {
3022				ir_emit_load_imm_int(ctx, type, reg, insn->val.i64);
3023			}
3024		} else {
3025			ir_emit_load_imm_fp(ctx, type, reg, src);
3026		}
3027	} else if (ctx->vregs[src]) {
3028		ir_emit_load_mem(ctx, type, reg, ir_ref_spill_slot(ctx, src));
3029	} else {
3030		ir_load_local_addr(ctx, reg, src);
3031	}
3032}
3033
3034static void ir_emit_store_mem_int(ir_ctx *ctx, ir_type type, ir_mem mem, ir_reg reg)
3035{
3036	ir_backend_data *data = ctx->data;
3037	dasm_State **Dst = &data->dasm_state;
3038
3039	|	ASM_MEM_REG_OP mov, type, mem, reg
3040}
3041
3042static void ir_emit_store_mem_fp(ir_ctx *ctx, ir_type type, ir_mem mem, ir_reg reg)
3043{
3044	ir_backend_data *data = ctx->data;
3045	dasm_State **Dst = &data->dasm_state;
3046
3047	|	ASM_FP_MEM_REG_OP movs, type, mem, reg
3048}
3049
3050static void ir_emit_store_mem_imm(ir_ctx *ctx, ir_type type, ir_mem mem, int32_t imm)
3051{
3052	ir_backend_data *data = ctx->data;
3053	dasm_State **Dst = &data->dasm_state;
3054
3055	|	ASM_MEM_IMM_OP mov, type, mem, imm
3056}
3057
3058static void ir_emit_store_mem_int_const(ir_ctx *ctx, ir_type type, ir_mem mem, ir_ref src, ir_reg tmp_reg, bool is_arg)
3059{
3060	ir_backend_data *data = ctx->data;
3061	dasm_State **Dst = &data->dasm_state;
3062	ir_insn *val_insn = &ctx->ir_base[src];
3063
3064	IR_ASSERT(IR_IS_CONST_REF(src));
3065	if (val_insn->op == IR_STR) {
3066		int label = ir_const_label(ctx, src);
3067
3068		IR_ASSERT(tmp_reg != IR_REG_NONE);
3069|.if X64
3070		|	lea Ra(tmp_reg), aword [=>label]
3071||		ir_emit_store_mem_int(ctx, type, mem, tmp_reg);
3072|.else
3073		|	ASM_TMEM_TXT_OP mov, aword, mem, =>label
3074|.endif
3075	} else {
3076		int64_t val = val_insn->val.i64;
3077
3078		if (val_insn->op == IR_FUNC || val_insn->op == IR_SYM) {
3079			val = (int64_t)(intptr_t)ir_sym_val(ctx, val_insn);
3080		}
3081
3082		if (sizeof(void*) == 4 || IR_IS_SIGNED_32BIT(val)) {
3083			if (is_arg && ir_type_size[type] < 4) {
3084				type = IR_U32;
3085			}
3086			ir_emit_store_mem_imm(ctx, type, mem, val);
3087		} else {
3088			IR_ASSERT(tmp_reg != IR_REG_NONE);
3089			tmp_reg = IR_REG_NUM(tmp_reg);
3090			ir_emit_load_imm_int(ctx, type, tmp_reg, val);
3091			ir_emit_store_mem_int(ctx, type, mem, tmp_reg);
3092		}
3093	}
3094}
3095
3096static void ir_emit_store_mem_fp_const(ir_ctx *ctx, ir_type type, ir_mem mem, ir_ref src, ir_reg tmp_reg, ir_reg tmp_fp_reg)
3097{
3098	ir_val *val = &ctx->ir_base[src].val;
3099
3100	if (type == IR_FLOAT) {
3101		ir_emit_store_mem_imm(ctx, IR_U32, mem, val->i32);
3102	} else if (sizeof(void*) == 8 && val->i64 == 0) {
3103		ir_emit_store_mem_imm(ctx, IR_U64, mem, 0);
3104	} else if (sizeof(void*) == 8 && tmp_reg != IR_REG_NONE) {
3105		ir_emit_load_imm_int(ctx, IR_U64, tmp_reg, val->i64);
3106		ir_emit_store_mem_int(ctx, IR_U64, mem, tmp_reg);
3107	} else {
3108		tmp_fp_reg = IR_REG_NUM(tmp_fp_reg);
3109		ir_emit_load(ctx, type, tmp_fp_reg, src);
3110		ir_emit_store_mem_fp(ctx, IR_DOUBLE, mem, tmp_fp_reg);
3111	}
3112}
3113
3114static void ir_emit_store_mem(ir_ctx *ctx, ir_type type, ir_mem mem, ir_reg reg)
3115{
3116	if (IR_IS_TYPE_INT(type)) {
3117		ir_emit_store_mem_int(ctx, type, mem, reg);
3118	} else {
3119		ir_emit_store_mem_fp(ctx, type, mem, reg);
3120	}
3121}
3122
3123static void ir_emit_store(ir_ctx *ctx, ir_type type, ir_ref dst, ir_reg reg)
3124{
3125	IR_ASSERT(dst >= 0);
3126	ir_emit_store_mem(ctx, type, ir_ref_spill_slot(ctx, dst), reg);
3127}
3128
3129static void ir_emit_mov(ir_ctx *ctx, ir_type type, ir_reg dst, ir_reg src)
3130{
3131	ir_backend_data *data = ctx->data;
3132	dasm_State **Dst = &data->dasm_state;
3133
3134	|	ASM_REG_REG_OP mov, type, dst, src
3135}
3136
3137#define IR_HAVE_SWAP_INT
3138
3139static void ir_emit_swap(ir_ctx *ctx, ir_type type, ir_reg dst, ir_reg src)
3140{
3141	ir_backend_data *data = ctx->data;
3142	dasm_State **Dst = &data->dasm_state;
3143
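	/* XCHG swaps two GP registers in place, so swapped register assignments
	   can be resolved without a scratch register (hence IR_HAVE_SWAP_INT) */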
3144	|	ASM_REG_REG_OP xchg, type, dst, src
3145}
3146
3147static void ir_emit_mov_ext(ir_ctx *ctx, ir_type type, ir_reg dst, ir_reg src)
3148{
3149	ir_backend_data *data = ctx->data;
3150	dasm_State **Dst = &data->dasm_state;
3151
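	/* 8/16-bit values are widened into a 32-bit register; on x86_64 a 32-bit
	   write also zeroes the upper half, so Rd() is sufficient */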
3152	if (ir_type_size[type] > 2) {
3153		|	ASM_REG_REG_OP mov, type, dst, src
3154	} else if (ir_type_size[type] == 2) {
3155		if (IR_IS_TYPE_SIGNED(type)) {
3156			|	movsx Rd(dst), Rw(src)
3157		} else {
3158			|	movzx Rd(dst), Rw(src)
3159		}
3160	} else /* if (ir_type_size[type] == 1) */ {
3161		if (IR_IS_TYPE_SIGNED(type)) {
3162			|	movsx Rd(dst), Rb(src)
3163		} else {
3164			|	movzx Rd(dst), Rb(src)
3165		}
3166	}
3167}
3168
3169static void ir_emit_fp_mov(ir_ctx *ctx, ir_type type, ir_reg dst, ir_reg src)
3170{
3171	ir_backend_data *data = ctx->data;
3172	dasm_State **Dst = &data->dasm_state;
3173
3174	|	ASM_FP_REG_REG_OP movap, type, dst, src
3175}
3176
3177static ir_mem ir_fuse_addr_const(ir_ctx *ctx, ir_ref ref)
3178{
3179	ir_mem mem;
3180	ir_insn *addr_insn = &ctx->ir_base[ref];
3181
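	/* a constant address is encoded as a bare displacement with no base or
	   index register, which only works when it fits in a signed 32-bit
	   disp32, hence the asserts below */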
3182	IR_ASSERT(IR_IS_CONST_REF(ref));
3183	if (IR_IS_SYM_CONST(addr_insn->op)) {
3184		void *addr = ir_sym_val(ctx, addr_insn);
3185		IR_ASSERT(sizeof(void*) == 4 || IR_IS_SIGNED_32BIT((intptr_t)addr));
3186		mem = IR_MEM_O((int32_t)(intptr_t)addr);
3187	} else {
3188		IR_ASSERT(sizeof(void*) == 4 || IR_IS_SIGNED_32BIT(addr_insn->val.i64));
3189		mem = IR_MEM_O(addr_insn->val.i32);
3190	}
3191	return mem;
3192}
3193
3194static ir_mem ir_fuse_addr(ir_ctx *ctx, ir_ref root, ir_ref ref)
3195{
3196	uint32_t rule = ctx->rules[ref];
3197	ir_insn *insn = &ctx->ir_base[ref];
3198	ir_insn *op1_insn, *op2_insn, *offset_insn;
3199	ir_ref base_reg_ref, index_reg_ref;
3200	ir_reg base_reg = IR_REG_NONE, index_reg;
3201	int32_t offset = 0, scale;
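	/* base_reg_ref/index_reg_ref address per-operand register slots: assuming
	   ctx->regs stores sizeof(ir_ref) int8_t entries per instruction, the
	   slot index is ref * sizeof(ir_ref) + op_num (see the
	   ((int8_t*)ctx->regs)[...] access below) */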
3202
3203	IR_ASSERT(((rule & IR_RULE_MASK) >= IR_LEA_OB &&
3204			(rule & IR_RULE_MASK) <= IR_LEA_SI_B) ||
3205		rule == IR_STATIC_ALLOCA);
3206	switch (rule & IR_RULE_MASK) {
3207		default:
3208			IR_ASSERT(0);
3209		case IR_LEA_OB:
3210			offset_insn = insn;
3211			if (ir_rule(ctx, insn->op1) == IR_STATIC_ALLOCA) {
3212				offset = IR_SPILL_POS_TO_OFFSET(ctx->ir_base[insn->op1].op3);
3213				base_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
3214				base_reg_ref = IR_UNUSED;
3215			} else {
3216				base_reg_ref = ref * sizeof(ir_ref) + 1;
3217			}
3218			index_reg_ref = IR_UNUSED;
3219			scale = 1;
3220			break;
3221		case IR_LEA_SI:
3222			scale = ctx->ir_base[insn->op2].val.i32;
3223			index_reg_ref = ref * sizeof(ir_ref) + 1;
3224			base_reg_ref = IR_UNUSED;
3225			offset_insn = NULL;
3226			break;
3227		case IR_LEA_SIB:
3228			base_reg_ref = index_reg_ref = ref * sizeof(ir_ref) + 1;
3229			scale = ctx->ir_base[insn->op2].val.i32 - 1;
3230			offset_insn = NULL;
3231			break;
3232		case IR_LEA_IB:
3233			if (ir_rule(ctx, insn->op1) == IR_STATIC_ALLOCA) {
3234				offset = IR_SPILL_POS_TO_OFFSET(ctx->ir_base[insn->op1].op3);
3235				base_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
3236				base_reg_ref = IR_UNUSED;
3237				index_reg_ref = ref * sizeof(ir_ref) + 2;
3238			} else if (ir_rule(ctx, insn->op2) == IR_STATIC_ALLOCA) {
3239				offset = IR_SPILL_POS_TO_OFFSET(ctx->ir_base[insn->op2].op3);
3240				base_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
3241				base_reg_ref = IR_UNUSED;
3242				index_reg_ref = ref * sizeof(ir_ref) + 1;
3243			} else {
3244				base_reg_ref = ref * sizeof(ir_ref) + 1;
3245				index_reg_ref = ref * sizeof(ir_ref) + 2;
3246			}
3247			offset_insn = NULL;
3248			scale = 1;
3249			break;
3250		case IR_LEA_OB_I:
3251			op1_insn = &ctx->ir_base[insn->op1];
3252			offset_insn = op1_insn;
3253			scale = 1;
3254			if (ir_rule(ctx, insn->op2) == IR_STATIC_ALLOCA) {
3255				offset = IR_SPILL_POS_TO_OFFSET(ctx->ir_base[insn->op2].op3);
3256				base_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
3257				base_reg_ref = IR_UNUSED;
3258				index_reg_ref = insn->op1 * sizeof(ir_ref) + 1;
3259			} else if (ir_rule(ctx, op1_insn->op1) == IR_STATIC_ALLOCA) {
3260				offset = IR_SPILL_POS_TO_OFFSET(ctx->ir_base[op1_insn->op1].op3);
3261				base_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
3262				base_reg_ref = IR_UNUSED;
3263				index_reg_ref = ref * sizeof(ir_ref) + 2;
3264			} else {
3265				base_reg_ref = insn->op1 * sizeof(ir_ref) + 1;
3266				index_reg_ref = ref * sizeof(ir_ref) + 2;
3267			}
3268			break;
3269		case IR_LEA_I_OB:
3270			op2_insn = &ctx->ir_base[insn->op2];
3271			offset_insn = op2_insn;
3272			scale = 1;
3273			if (ir_rule(ctx, insn->op1) == IR_STATIC_ALLOCA) {
3274				offset = IR_SPILL_POS_TO_OFFSET(ctx->ir_base[insn->op1].op3);
3275				base_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
3276				base_reg_ref = IR_UNUSED;
3277				index_reg_ref = insn->op2 * sizeof(ir_ref) + 1;
3278			} else if (ir_rule(ctx, op2_insn->op1) == IR_STATIC_ALLOCA) {
3279				offset = IR_SPILL_POS_TO_OFFSET(ctx->ir_base[op2_insn->op1].op3);
3280				base_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
3281				base_reg_ref = IR_UNUSED;
3282				index_reg_ref = ref * sizeof(ir_ref) + 1;
3283			} else {
3284				base_reg_ref = ref * sizeof(ir_ref) + 1;
3285				index_reg_ref = insn->op2 * sizeof(ir_ref) + 1;
3286			}
3287			break;
3288		case IR_LEA_SI_O:
3289			index_reg_ref = insn->op1 * sizeof(ir_ref) + 1;
3290			op1_insn = &ctx->ir_base[insn->op1];
3291			scale = ctx->ir_base[op1_insn->op2].val.i32;
3292			offset_insn = insn;
3293			base_reg_ref = IR_UNUSED;
3294			break;
3295		case IR_LEA_SIB_O:
3296			base_reg_ref = index_reg_ref = insn->op1 * sizeof(ir_ref) + 1;
3297			op1_insn = &ctx->ir_base[insn->op1];
3298			scale = ctx->ir_base[op1_insn->op2].val.i32 - 1;
3299			offset_insn = insn;
3300			break;
3301		case IR_LEA_IB_O:
3302			base_reg_ref = insn->op1 * sizeof(ir_ref) + 1;
3303			index_reg_ref = insn->op1 * sizeof(ir_ref) + 2;
3304			offset_insn = insn;
3305			scale = 1;
3306			break;
3307		case IR_LEA_OB_SI:
3308			index_reg_ref = insn->op2 * sizeof(ir_ref) + 1;
3309			op1_insn = &ctx->ir_base[insn->op1];
3310			offset_insn = op1_insn;
3311			op2_insn = &ctx->ir_base[insn->op2];
3312			scale = ctx->ir_base[op2_insn->op2].val.i32;
3313			if (ir_rule(ctx, op1_insn->op1) == IR_STATIC_ALLOCA) {
3314				offset = IR_SPILL_POS_TO_OFFSET(ctx->ir_base[op1_insn->op1].op3);
3315				base_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
3316				base_reg_ref = IR_UNUSED;
3317			} else {
3318				base_reg_ref = insn->op1 * sizeof(ir_ref) + 1;
3319			}
3320			break;
3321		case IR_LEA_SI_OB:
3322			index_reg_ref = insn->op1 * sizeof(ir_ref) + 1;
3323			op1_insn = &ctx->ir_base[insn->op1];
3324			scale = ctx->ir_base[op1_insn->op2].val.i32;
3325			op2_insn = &ctx->ir_base[insn->op2];
3326			offset_insn = op2_insn;
3327			if (ir_rule(ctx, op2_insn->op1) == IR_STATIC_ALLOCA) {
3328				offset = IR_SPILL_POS_TO_OFFSET(ctx->ir_base[op2_insn->op1].op3);
3329				base_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
3330				base_reg_ref = IR_UNUSED;
3331			} else {
3332				base_reg_ref = insn->op2 * sizeof(ir_ref) + 1;
3333			}
3334			break;
3335		case IR_LEA_B_SI:
3336			if (ir_rule(ctx, insn->op1) == IR_STATIC_ALLOCA) {
3337				offset = IR_SPILL_POS_TO_OFFSET(ctx->ir_base[insn->op1].op3);
3338				base_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
3339				base_reg_ref = IR_UNUSED;
3340			} else {
3341				base_reg_ref = ref * sizeof(ir_ref) + 1;
3342			}
3343			index_reg_ref = insn->op2 * sizeof(ir_ref) + 1;
3344			op2_insn = &ctx->ir_base[insn->op2];
3345			scale = ctx->ir_base[op2_insn->op2].val.i32;
3346			offset_insn = NULL;
3347			break;
3348		case IR_LEA_SI_B:
3349			index_reg_ref = insn->op1 * sizeof(ir_ref) + 1;
3350			if (ir_rule(ctx, insn->op2) == IR_STATIC_ALLOCA) {
3351				offset = IR_SPILL_POS_TO_OFFSET(ctx->ir_base[insn->op2].op3);
3352				base_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
3353				base_reg_ref = IR_UNUSED;
3354			} else {
3355				base_reg_ref = ref * sizeof(ir_ref) + 2;
3356			}
3357			op1_insn = &ctx->ir_base[insn->op1];
3358			scale = ctx->ir_base[op1_insn->op2].val.i32;
3359			offset_insn = NULL;
3360			break;
3361		case IR_ALLOCA:
3362			offset = IR_SPILL_POS_TO_OFFSET(insn->op3);
3363			base_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
3364			base_reg_ref = index_reg_ref = IR_UNUSED;
3365			scale = 1;
3366			offset_insn = NULL;
3367			break;
3368	}
3369
3370	if (offset_insn) {
3371		ir_insn *addr_insn = &ctx->ir_base[offset_insn->op2];
3372
3373		if (IR_IS_SYM_CONST(addr_insn->op)) {
3374			void *addr = ir_sym_val(ctx, addr_insn);
3375			IR_ASSERT(sizeof(void*) != 8 || IR_IS_SIGNED_32BIT((intptr_t)addr));
3376			offset += (int64_t)(intptr_t)(addr);
3377		} else {
3378			if (offset_insn->op == IR_SUB) {
3379				offset -= addr_insn->val.i32;
3380			} else {
3381				offset += addr_insn->val.i32;
3382			}
3383		}
3384	}
3385
3386	if (base_reg_ref) {
3387		if (UNEXPECTED(ctx->rules[base_reg_ref / sizeof(ir_ref)] & IR_FUSED_REG)) {
3388			base_reg = ir_get_fused_reg(ctx, root, base_reg_ref);
3389		} else {
3390			base_reg = ((int8_t*)ctx->regs)[base_reg_ref];
3391		}
3392		IR_ASSERT(base_reg != IR_REG_NONE);
3393		if (IR_REG_SPILLED(base_reg)) {
3394			base_reg = IR_REG_NUM(base_reg);
3395			ir_emit_load(ctx, insn->type, base_reg, ((ir_ref*)ctx->ir_base)[base_reg_ref]);
3396		}
3397	}
3398
3399	index_reg = IR_REG_NONE;
3400	if (index_reg_ref) {
3401		if (base_reg_ref
3402			&& ((ir_ref*)ctx->ir_base)[index_reg_ref]
3403				== ((ir_ref*)ctx->ir_base)[base_reg_ref]) {
3404			index_reg = base_reg;
3405		} else {
3406			if (UNEXPECTED(ctx->rules[index_reg_ref / sizeof(ir_ref)] & IR_FUSED_REG)) {
3407				index_reg = ir_get_fused_reg(ctx, root, index_reg_ref);
3408			} else {
3409				index_reg = ((int8_t*)ctx->regs)[index_reg_ref];
3410			}
3411			IR_ASSERT(index_reg != IR_REG_NONE);
3412			if (IR_REG_SPILLED(index_reg)) {
3413				index_reg = IR_REG_NUM(index_reg);
3414				ir_emit_load(ctx, insn->type, index_reg, ((ir_ref*)ctx->ir_base)[index_reg_ref]);
3415			}
3416		}
3417	}
3418
3419	return IR_MEM(base_reg, offset, index_reg, scale);
3420}
3421
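/* Choose the memory operand for a fused LOAD/STORE: use the already allocated
 * address register when there is one, otherwise fold a constant address or a
 * fused address expression (base + index*scale + offset) into the operand. */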
3422static ir_mem ir_fuse_mem(ir_ctx *ctx, ir_ref root, ir_ref ref, ir_insn *mem_insn, ir_reg reg)
3423{
3424	if (reg != IR_REG_NONE) {
3425		if (IR_REG_SPILLED(reg)) {
3426			reg = IR_REG_NUM(reg);
3427			ir_emit_load(ctx, IR_ADDR, reg, mem_insn->op2);
3428		}
3429		return IR_MEM_B(reg);
3430	} else if (IR_IS_CONST_REF(mem_insn->op2)) {
3431		return ir_fuse_addr_const(ctx, mem_insn->op2);
3432	} else {
3433		return ir_fuse_addr(ctx, root, mem_insn->op2);
3434	}
3435}
3436
3437static ir_mem ir_fuse_load(ir_ctx *ctx, ir_ref root, ir_ref ref)
3438{
3439	ir_insn *load_insn = &ctx->ir_base[ref];
3440	ir_reg reg;
3441
3442	IR_ASSERT(load_insn->op == IR_LOAD);
3443	if (UNEXPECTED(ctx->rules[ref] & IR_FUSED_REG)) {
3444		reg = ir_get_fused_reg(ctx, root, ref * sizeof(ir_ref) + 2);
3445	} else {
3446		reg = ctx->regs[ref][2];
3447	}
3448	return ir_fuse_mem(ctx, root, ref, load_insn, reg);
3449}
3450
3451static int32_t ir_fuse_imm(ir_ctx *ctx, ir_ref ref)
3452{
3453	ir_insn *val_insn = &ctx->ir_base[ref];
3454
3455	IR_ASSERT(IR_IS_CONST_REF(ref));
3456	if (IR_IS_SYM_CONST(val_insn->op)) {
3457		void *addr = ir_sym_val(ctx, val_insn);
3458		IR_ASSERT(IR_IS_SIGNED_32BIT((intptr_t)addr));
3459		return (int32_t)(intptr_t)addr;
3460	} else {
3461		IR_ASSERT(IR_IS_SIGNED_32BIT(val_insn->val.i32));
3462		return val_insn->val.i32;
3463	}
3464}
3465
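/* Extended load: materializes constants (including SYM/FUNC addresses and STR
 * labels), static alloca addresses, fused memory operands and spill slots into
 * the given register. */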
3466static void ir_emit_load_ex(ir_ctx *ctx, ir_type type, ir_reg reg, ir_ref src, ir_ref root)
3467{
3468	if (IR_IS_CONST_REF(src)) {
3469		if (IR_IS_TYPE_INT(type)) {
3470			ir_insn *insn = &ctx->ir_base[src];
3471
3472			if (insn->op == IR_SYM || insn->op == IR_FUNC) {
3473				void *addr = ir_sym_val(ctx, insn);
3474				ir_emit_load_imm_int(ctx, type, reg, (intptr_t)addr);
3475			} else if (insn->op == IR_STR) {
3476				ir_backend_data *data = ctx->data;
3477				dasm_State **Dst = &data->dasm_state;
3478				int label = ir_const_label(ctx, src);
3479
3480				|	lea Ra(reg), aword [=>label]
3481			} else {
3482				ir_emit_load_imm_int(ctx, type, reg, insn->val.i64);
3483			}
3484		} else {
3485			ir_emit_load_imm_fp(ctx, type, reg, src);
3486		}
3487	} else if (ir_rule(ctx, src) == IR_STATIC_ALLOCA) {
3488		ir_load_local_addr(ctx, reg, src);
3489	} else {
3490		ir_mem mem;
3491
3492		if (ir_rule(ctx, src) & IR_FUSED) {
3493			mem = ir_fuse_load(ctx, root, src);
3494		} else {
3495			mem = ir_ref_spill_slot(ctx, src);
3496		}
3497		ir_emit_load_mem(ctx, type, reg, mem);
3498	}
3499}
3500
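/* Standard prologue: establish RBP if a frame pointer is used, push the used
 * callee-saved GP registers, reserve the stack frame, save the used
 * callee-saved XMM registers, and for variadic functions spill the unnamed
 * argument registers into the register save area. */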
3501static void ir_emit_prologue(ir_ctx *ctx)
3502{
3503	ir_backend_data *data = ctx->data;
3504	dasm_State **Dst = &data->dasm_state;
3505	int offset = ctx->stack_frame_size + ctx->call_stack_size;
3506
3507	if (ctx->flags & IR_USE_FRAME_POINTER) {
3508		|	push Ra(IR_REG_RBP)
3509		|	mov Ra(IR_REG_RBP), Ra(IR_REG_RSP)
3510	}
3511	if (IR_REGSET_INTERSECTION((ir_regset)ctx->used_preserved_regs, IR_REGSET_GP)) {
3512		int i;
3513		ir_regset used_preserved_regs = IR_REGSET_INTERSECTION((ir_regset)ctx->used_preserved_regs, IR_REGSET_GP);
3514
3515		for (i = IR_REG_GP_FIRST; i <= IR_REG_GP_LAST; i++) {
3516			if (IR_REGSET_IN(used_preserved_regs, i)) {
3517				offset -= sizeof(void*);
3518				|	push Ra(i)
3519			}
3520		}
3521	}
3522	if (ctx->stack_frame_size + ctx->call_stack_size) {
3523		if (ctx->fixed_stack_red_zone) {
3524			IR_ASSERT(ctx->stack_frame_size + ctx->call_stack_size <= ctx->fixed_stack_red_zone);
3525		} else if (offset) {
3526			|	sub Ra(IR_REG_RSP), offset
3527		}
3528	}
3529	if (IR_REGSET_INTERSECTION((ir_regset)ctx->used_preserved_regs, IR_REGSET_FP)) {
3530		ir_reg fp;
3531		int i;
3532		ir_regset used_preserved_regs = IR_REGSET_INTERSECTION((ir_regset)ctx->used_preserved_regs, IR_REGSET_FP);
3533
3534		if (ctx->flags & IR_USE_FRAME_POINTER) {
3535			fp = IR_REG_FRAME_POINTER;
3536			offset -= ctx->stack_frame_size + ctx->call_stack_size;
3537		} else {
3538			fp = IR_REG_STACK_POINTER;
3539		}
3540		for (i = IR_REG_FP_FIRST; i <= IR_REG_FP_LAST; i++) {
3541			if (IR_REGSET_IN(used_preserved_regs, i)) {
3542				offset -= sizeof(void*);
3543				if (ctx->mflags & IR_X86_AVX) {
3544					|	vmovsd qword [Ra(fp)+offset], xmm(i-IR_REG_FP_FIRST)
3545				} else {
3546					|	movsd qword [Ra(fp)+offset], xmm(i-IR_REG_FP_FIRST)
3547				}
3548			}
3549		}
3550	}
3551	if ((ctx->flags & IR_VARARG_FUNC) && (ctx->flags2 & IR_HAS_VA_START)) {
3552#if defined(_WIN64)
3553		ir_reg fp;
3554		int offset;
3555
3556		if (ctx->flags & IR_USE_FRAME_POINTER) {
3557			fp = IR_REG_FRAME_POINTER;
3558			offset = sizeof(void*) * 2;
3559		} else {
3560			fp = IR_REG_STACK_POINTER;
3561			offset = ctx->stack_frame_size + ctx->call_stack_size + sizeof(void*);
3562		}
3563		|	mov [Ra(fp)+offset], Ra(IR_REG_INT_ARG1)
3564		|	mov [Ra(fp)+offset+8], Ra(IR_REG_INT_ARG2)
3565		|	mov [Ra(fp)+offset+16], Ra(IR_REG_INT_ARG3)
3566		|	mov [Ra(fp)+offset+24], Ra(IR_REG_INT_ARG4)
3567#elif defined(IR_TARGET_X64)
3568|.if X64
3569		const int8_t *int_reg_params = _ir_int_reg_params;
3570		const int8_t *fp_reg_params = _ir_fp_reg_params;
3571		uint32_t i;
3572		ir_reg fp;
3573		int offset;
3574
3575		if (ctx->flags & IR_USE_FRAME_POINTER) {
3576			fp = IR_REG_FRAME_POINTER;
3577
3578			offset = -(ctx->stack_frame_size - ctx->stack_frame_alignment - ctx->locals_area_size);
3579		} else {
3580			fp = IR_REG_STACK_POINTER;
3581			offset = ctx->locals_area_size + ctx->call_stack_size;
3582		}
3583
3584		if ((ctx->flags2 & (IR_HAS_VA_ARG_GP|IR_HAS_VA_COPY)) && ctx->gp_reg_params < IR_REG_INT_ARGS) {
3585			/* skip named args */
3586			offset += sizeof(void*) * ctx->gp_reg_params;
3587			for (i = ctx->gp_reg_params; i < IR_REG_INT_ARGS; i++) {
3588				|	mov qword [Ra(fp)+offset], Rq(int_reg_params[i])
3589				offset += sizeof(void*);
3590			}
3591		}
3592		if ((ctx->flags2 & (IR_HAS_VA_ARG_FP|IR_HAS_VA_COPY)) && ctx->fp_reg_params < IR_REG_FP_ARGS) {
3593			|	test al, al
3594			|	je	>1
3595			/* skip named args */
3596			offset += 16 * ctx->fp_reg_params;
3597			for (i = ctx->fp_reg_params; i < IR_REG_FP_ARGS; i++) {
3598				|	movaps [Ra(fp)+offset], xmm(fp_reg_params[i]-IR_REG_FP_FIRST)
3599				offset += 16;
3600			}
3601			|1:
3602		}
3603|.endif
3604#endif
3605	}
3606}
3607
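/* Epilogue: restore the saved XMM and GP registers in reverse order and
 * unwind the stack frame (through RBP when a frame pointer is used), leaving
 * the stack ready for RET. */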
3608static void ir_emit_epilogue(ir_ctx *ctx)
3609{
3610	ir_backend_data *data = ctx->data;
3611	dasm_State **Dst = &data->dasm_state;
3612
3613	if (IR_REGSET_INTERSECTION((ir_regset)ctx->used_preserved_regs, IR_REGSET_FP)) {
3614		int i;
3615		int offset;
3616		ir_reg fp;
3617		ir_regset used_preserved_regs = (ir_regset)ctx->used_preserved_regs;
3618
3619		if (ctx->flags & IR_USE_FRAME_POINTER) {
3620			fp = IR_REG_FRAME_POINTER;
3621			offset = 0;
3622		} else {
3623			fp = IR_REG_STACK_POINTER;
3624			offset = ctx->stack_frame_size + ctx->call_stack_size;
3625		}
3626		for (i = 0; i < IR_REG_NUM; i++) {
3627			if (IR_REGSET_IN(used_preserved_regs, i)) {
3628				if (i < IR_REG_FP_FIRST) {
3629					offset -= sizeof(void*);
3630				} else {
3631					offset -= sizeof(void*);
3632					if (ctx->mflags & IR_X86_AVX) {
3633						|	vmovsd xmm(i-IR_REG_FP_FIRST), qword [Ra(fp)+offset]
3634					} else {
3635						|	movsd xmm(i-IR_REG_FP_FIRST), qword [Ra(fp)+offset]
3636					}
3637				}
3638			}
3639		}
3640	}
3641
3642	if (IR_REGSET_INTERSECTION((ir_regset)ctx->used_preserved_regs, IR_REGSET_GP)) {
3643		int i;
3644		ir_regset used_preserved_regs = IR_REGSET_INTERSECTION((ir_regset)ctx->used_preserved_regs, IR_REGSET_GP);
3645		int offset;
3646
3647		if (ctx->flags & IR_USE_FRAME_POINTER) {
3648			offset = 0;
3649		} else {
3650			offset = ctx->stack_frame_size + ctx->call_stack_size;
3651		}
3656		for (i = IR_REG_GP_LAST; i >= IR_REG_GP_FIRST; i--) {
3657			if (IR_REGSET_IN(used_preserved_regs, i)) {
3658				offset -= sizeof(void*);
3659			}
3660		}
3662		if (ctx->flags & IR_USE_FRAME_POINTER) {
3663			|	lea Ra(IR_REG_RSP), [Ra(IR_REG_RBP)+offset]
3664		} else if (offset) {
3665			|	add Ra(IR_REG_RSP), offset
3666		}
3667		for (i = IR_REG_GP_LAST; i >= IR_REG_GP_FIRST; i--) {
3668			if (IR_REGSET_IN(used_preserved_regs, i)) {
3669				|	pop Ra(i)
3670			}
3671		}
3672		if (ctx->flags & IR_USE_FRAME_POINTER) {
3673			|	pop Ra(IR_REG_RBP)
3674		}
3675	} else if (ctx->flags & IR_USE_FRAME_POINTER) {
3676		|	mov Ra(IR_REG_RSP), Ra(IR_REG_RBP)
3677		|	pop Ra(IR_REG_RBP)
3678	} else if (ctx->stack_frame_size + ctx->call_stack_size) {
3679		if (ctx->fixed_stack_red_zone) {
3680			IR_ASSERT(ctx->stack_frame_size + ctx->call_stack_size <= ctx->fixed_stack_red_zone);
3681		} else {
3682			|	add Ra(IR_REG_RSP), (ctx->stack_frame_size + ctx->call_stack_size)
3683		}
3684	}
3685}
3686
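/* Two-operand integer ALU operation (ADD/SUB/MUL/OR/AND/XOR): op1 is moved
 * into the result register if necessary; op2 may be a register, a 32-bit
 * immediate or a (possibly fused) memory operand. */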
3687static void ir_emit_binop_int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
3688{
3689	ir_backend_data *data = ctx->data;
3690	dasm_State **Dst = &data->dasm_state;
3691	ir_type type = insn->type;
3692	ir_ref op1 = insn->op1;
3693	ir_ref op2 = insn->op2;
3694	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
3695	ir_reg op1_reg = ctx->regs[def][1];
3696	ir_reg op2_reg = ctx->regs[def][2];
3697
3698	IR_ASSERT(def_reg != IR_REG_NONE);
3699
3700	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
3701		op1_reg = IR_REG_NUM(op1_reg);
3702		ir_emit_load(ctx, type, op1_reg, op1);
3703	}
3704	if (def_reg != op1_reg) {
3705		if (op1_reg != IR_REG_NONE) {
3706			ir_emit_mov(ctx, type, def_reg, op1_reg);
3707		} else {
3708			ir_emit_load(ctx, type, def_reg, op1);
3709		}
3710		if (op1 == op2) {
3711			op2_reg = def_reg;
3712		}
3713	}
3714
3715	if (op2_reg != IR_REG_NONE) {
3716		if (IR_REG_SPILLED(op2_reg)) {
3717			op2_reg = IR_REG_NUM(op2_reg);
3718			if (op1 != op2) {
3719				ir_emit_load(ctx, type, op2_reg, op2);
3720			}
3721		}
3722		switch (insn->op) {
3723			default:
3724				IR_ASSERT(0 && "NIY binary op");
3725			case IR_ADD:
3726			case IR_ADD_OV:
3727				|	ASM_REG_REG_OP add, type, def_reg, op2_reg
3728				break;
3729			case IR_SUB:
3730			case IR_SUB_OV:
3731				|	ASM_REG_REG_OP sub, type, def_reg, op2_reg
3732				break;
3733			case IR_MUL:
3734			case IR_MUL_OV:
3735				|	ASM_REG_REG_MUL imul, type, def_reg, op2_reg
3736				break;
3737			case IR_OR:
3738				|	ASM_REG_REG_OP or, type, def_reg, op2_reg
3739				break;
3740			case IR_AND:
3741				|	ASM_REG_REG_OP and, type, def_reg, op2_reg
3742				break;
3743			case IR_XOR:
3744				|	ASM_REG_REG_OP xor, type, def_reg, op2_reg
3745				break;
3746		}
3747	} else if (IR_IS_CONST_REF(op2)) {
3748		int32_t val = ir_fuse_imm(ctx, op2);
3749
3750		switch (insn->op) {
3751			default:
3752				IR_ASSERT(0 && "NIY binary op");
3753			case IR_ADD:
3754			case IR_ADD_OV:
3755				|	ASM_REG_IMM_OP add, type, def_reg, val
3756				break;
3757			case IR_SUB:
3758			case IR_SUB_OV:
3759				|	ASM_REG_IMM_OP sub, type, def_reg, val
3760				break;
3761			case IR_MUL:
3762			case IR_MUL_OV:
3763				|	ASM_REG_IMM_MUL imul, type, def_reg, val
3764				break;
3765			case IR_OR:
3766				|	ASM_REG_IMM_OP or, type, def_reg, val
3767				break;
3768			case IR_AND:
3769				|	ASM_REG_IMM_OP and, type, def_reg, val
3770				break;
3771			case IR_XOR:
3772				|	ASM_REG_IMM_OP xor, type, def_reg, val
3773				break;
3774		}
3775	} else {
3776		ir_mem mem;
3777
3778		if (ir_rule(ctx, op2) & IR_FUSED) {
3779			mem = ir_fuse_load(ctx, def, op2);
3780		} else {
3781			mem = ir_ref_spill_slot(ctx, op2);
3782		}
3783		switch (insn->op) {
3784			default:
3785				IR_ASSERT(0 && "NIY binary op");
3786			case IR_ADD:
3787			case IR_ADD_OV:
3788				|	ASM_REG_MEM_OP add, type, def_reg, mem
3789				break;
3790			case IR_SUB:
3791			case IR_SUB_OV:
3792				|	ASM_REG_MEM_OP sub, type, def_reg, mem
3793				break;
3794			case IR_MUL:
3795			case IR_MUL_OV:
3796				|	ASM_REG_MEM_MUL imul, type, def_reg, mem
3797				break;
3798			case IR_OR:
3799				|	ASM_REG_MEM_OP or, type, def_reg, mem
3800				break;
3801			case IR_AND:
3802				|	ASM_REG_MEM_OP and, type, def_reg, mem
3803				break;
3804			case IR_XOR:
3805				|	ASM_REG_MEM_OP xor, type, def_reg, mem
3806				break;
3807		}
3808	}
3809	if (IR_REG_SPILLED(ctx->regs[def][0])) {
3810		ir_emit_store(ctx, type, def, def_reg);
3811	}
3812}
3813
3814static void ir_emit_imul3(ir_ctx *ctx, ir_ref def, ir_insn *insn)
3815{
3816	ir_backend_data *data = ctx->data;
3817	dasm_State **Dst = &data->dasm_state;
3818	ir_type type = insn->type;
3819	ir_ref op1 = insn->op1;
3820	ir_ref op2 = insn->op2;
3821	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
3822	ir_reg op1_reg = ctx->regs[def][1];
3823	int32_t val = ir_fuse_imm(ctx, op2);
3824
3825	IR_ASSERT(def_reg != IR_REG_NONE);
3826	IR_ASSERT(!IR_IS_CONST_REF(op1));
3827
3828	if (op1_reg != IR_REG_NONE) {
3829		if (IR_REG_SPILLED(op1_reg)) {
3830			op1_reg = IR_REG_NUM(op1_reg);
3831			ir_emit_load(ctx, type, op1_reg, op1);
3832		}
3833		switch (ir_type_size[type]) {
3834			default:
3835				IR_ASSERT(0);
3836			case 2:
3837				|	imul Rw(def_reg), Rw(op1_reg), val
3838				break;
3839			case 4:
3840				|	imul Rd(def_reg), Rd(op1_reg), val
3841				break;
3842|.if X64
3843||			case 8:
3844|				imul Rq(def_reg), Rq(op1_reg), val
3845||				break;
3846|.endif
3847		}
3848	} else {
3849		ir_mem mem;
3850
3851		if (ir_rule(ctx, op1) & IR_FUSED) {
3852			mem = ir_fuse_load(ctx, def, op1);
3853		} else {
3854			mem = ir_ref_spill_slot(ctx, op1);
3855		}
3856		|	ASM_REG_MEM_TXT_MUL imul, type, def_reg, mem, val
3857	}
3858	if (IR_REG_SPILLED(ctx->regs[def][0])) {
3859		ir_emit_store(ctx, type, def, def_reg);
3860	}
3861}
3862
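/* Integer MIN/MAX via CMP + CMOVcc; the condition code depends on the
 * signedness of the type. */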
3863static void ir_emit_min_max_int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
3864{
3865	ir_backend_data *data = ctx->data;
3866	dasm_State **Dst = &data->dasm_state;
3867	ir_type type = insn->type;
3868	ir_ref op1 = insn->op1;
3869	ir_ref op2 = insn->op2;
3870	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
3871	ir_reg op1_reg = ctx->regs[def][1];
3872	ir_reg op2_reg = ctx->regs[def][2];
3873
3874	IR_ASSERT(def_reg != IR_REG_NONE && op2_reg != IR_REG_NONE);
3875
3876	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
3877		op1_reg = IR_REG_NUM(op1_reg);
3878		ir_emit_load(ctx, type, op1_reg, op1);
3879	}
3880	if (def_reg != op1_reg) {
3881		if (op1_reg != IR_REG_NONE) {
3882			ir_emit_mov(ctx, type, def_reg, op1_reg);
3883		} else {
3884			ir_emit_load(ctx, type, def_reg, op1);
3885		}
3886	}
3887
3888	if (IR_REG_SPILLED(op2_reg)) {
3889		op2_reg = IR_REG_NUM(op2_reg);
3890		if (op1 != op2) {
3891			ir_emit_load(ctx, type, op2_reg, op2);
3892		}
3893	}
3894
3895	if (op1 == op2) {
3896		return;
3897	}
3898
3899	|	ASM_REG_REG_OP cmp, type, def_reg, op2_reg
3900	if (insn->op == IR_MIN) {
3901		if (IR_IS_TYPE_SIGNED(type)) {
3902			|	ASM_REG_REG_OP2 cmovg, type, def_reg, op2_reg
3903		} else {
3904			|	ASM_REG_REG_OP2 cmova, type, def_reg, op2_reg
3905		}
3906	} else {
3907		IR_ASSERT(insn->op == IR_MAX);
3908		if (IR_IS_TYPE_SIGNED(type)) {
3909			|	ASM_REG_REG_OP2 cmovl, type, def_reg, op2_reg
3910		} else {
3911			|	ASM_REG_REG_OP2 cmovb, type, def_reg, op2_reg
3912		}
3913	}
3914
3915	if (IR_REG_SPILLED(ctx->regs[def][0])) {
3916		ir_emit_store(ctx, type, def, def_reg);
3917	}
3918}
3919
3920static void ir_emit_overflow(ir_ctx *ctx, ir_ref def, ir_insn *insn)
3921{
3922	ir_backend_data *data = ctx->data;
3923	dasm_State **Dst = &data->dasm_state;
3924	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
3925	ir_type type = ctx->ir_base[insn->op1].type;
3926
3927	IR_ASSERT(def_reg != IR_REG_NONE);
3928	IR_ASSERT(IR_IS_TYPE_INT(type));
3929	if (IR_IS_TYPE_SIGNED(type)) {
3930		|	seto Rb(def_reg)
3931	} else {
3932		|	setc Rb(def_reg)
3933	}
3934	if (IR_REG_SPILLED(ctx->regs[def][0])) {
3935		ir_emit_store(ctx, insn->type, def, def_reg);
3936	}
3937}
3938
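/* Branch on the overflow of the preceding arithmetic instruction: OF for
 * signed types, CF for unsigned ones. The condition is inverted when the
 * "true" successor is the fall-through block. */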
3939static void ir_emit_overflow_and_branch(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn, uint32_t next_block)
3940{
3941	ir_backend_data *data = ctx->data;
3942	dasm_State **Dst = &data->dasm_state;
3943	ir_insn *overflow_insn = &ctx->ir_base[insn->op2];
3944	ir_type type = ctx->ir_base[overflow_insn->op1].type;
3945	uint32_t true_block, false_block;
3946	bool reverse = 0;
3947
3948	ir_get_true_false_blocks(ctx, b, &true_block, &false_block);
3949	if (true_block == next_block) {
3950		reverse = 1;
3951		true_block = false_block;
3952		false_block = 0;
3953	} else if (false_block == next_block) {
3954		false_block = 0;
3955	}
3956
3957	if (IR_IS_TYPE_SIGNED(type)) {
3958		if (reverse) {
3959			|	jno =>true_block
3960		} else {
3961			|	jo =>true_block
3962		}
3963	} else {
3964		if (reverse) {
3965			|	jnc =>true_block
3966		} else {
3967			|	jc =>true_block
3968		}
3969	}
3970	if (false_block) {
3971		|	jmp =>false_block
3972	}
3973}
3974
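/* Read-modify-write form of a binary ALU op: STORE(addr, OP(LOAD(addr), x))
 * is emitted as a single instruction operating directly on memory. */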
3975static void ir_emit_mem_binop_int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
3976{
3977	ir_backend_data *data = ctx->data;
3978	dasm_State **Dst = &data->dasm_state;
3979	ir_insn *op_insn = &ctx->ir_base[insn->op3];
3980	ir_type type = op_insn->type;
3981	ir_ref op2 = op_insn->op2;
3982	ir_reg op2_reg = ctx->regs[insn->op3][2];
3983	ir_mem mem;
3984
3985	if (insn->op == IR_STORE) {
3986		mem = ir_fuse_mem(ctx, def, def, insn, ctx->regs[def][2]);
3987	} else {
3988		IR_ASSERT(insn->op == IR_VSTORE);
3989		mem = ir_var_spill_slot(ctx, insn->op2);
3990	}
3991
3992	if (op2_reg == IR_REG_NONE) {
3993		int32_t val = ir_fuse_imm(ctx, op2);
3994
3995		switch (op_insn->op) {
3996			default:
3997				IR_ASSERT(0 && "NIY binary op");
3998			case IR_ADD:
3999			case IR_ADD_OV:
4000				|	ASM_MEM_IMM_OP add, type, mem, val
4001				break;
4002			case IR_SUB:
4003			case IR_SUB_OV:
4004				|	ASM_MEM_IMM_OP sub, type, mem, val
4005				break;
4006			case IR_OR:
4007				|	ASM_MEM_IMM_OP or, type, mem, val
4008				break;
4009			case IR_AND:
4010				|	ASM_MEM_IMM_OP and, type, mem, val
4011				break;
4012			case IR_XOR:
4013				|	ASM_MEM_IMM_OP xor, type, mem, val
4014				break;
4015		}
4016	} else {
4017		if (IR_REG_SPILLED(op2_reg)) {
4018			op2_reg = IR_REG_NUM(op2_reg);
4019			ir_emit_load(ctx, type, op2_reg, op2);
4020		}
4021		switch (op_insn->op) {
4022			default:
4023				IR_ASSERT(0 && "NIY binary op");
4024			case IR_ADD:
4025			case IR_ADD_OV:
4026				|	ASM_MEM_REG_OP add, type, mem, op2_reg
4027				break;
4028			case IR_SUB:
4029			case IR_SUB_OV:
4030				|	ASM_MEM_REG_OP sub, type, mem, op2_reg
4031				break;
4032			case IR_OR:
4033				|	ASM_MEM_REG_OP or, type, mem, op2_reg
4034				break;
4035			case IR_AND:
4036				|	ASM_MEM_REG_OP and, type, mem, op2_reg
4037				break;
4038			case IR_XOR:
4039				|	ASM_MEM_REG_OP xor, type, mem, op2_reg
4040				break;
4041		}
4042	}
4043}
4044
4045static void ir_emit_reg_binop_int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
4046{
4047	ir_backend_data *data = ctx->data;
4048	dasm_State **Dst = &data->dasm_state;
4049	ir_insn *op_insn = &ctx->ir_base[insn->op2];
4050	ir_type type = op_insn->type;
4051	ir_ref op2 = op_insn->op2;
4052	ir_reg op2_reg = ctx->regs[insn->op2][2];
4053	ir_reg reg;
4054
4055	IR_ASSERT(insn->op == IR_RSTORE);
4056	reg = insn->op3;
4057
4058	if (op2_reg == IR_REG_NONE) {
4059		int32_t val = ir_fuse_imm(ctx, op2);
4060
4061		switch (op_insn->op) {
4062			default:
4063				IR_ASSERT(0 && "NIY binary op");
4064			case IR_ADD:
4065				|	ASM_REG_IMM_OP add, type, reg, val
4066				break;
4067			case IR_SUB:
4068				|	ASM_REG_IMM_OP sub, type, reg, val
4069				break;
4070			case IR_OR:
4071				|	ASM_REG_IMM_OP or, type, reg, val
4072				break;
4073			case IR_AND:
4074				|	ASM_REG_IMM_OP and, type, reg, val
4075				break;
4076			case IR_XOR:
4077				|	ASM_REG_IMM_OP xor, type, reg, val
4078				break;
4079		}
4080	} else {
4081		if (IR_REG_SPILLED(op2_reg)) {
4082			op2_reg = IR_REG_NUM(op2_reg);
4083			ir_emit_load(ctx, type, op2_reg, op2);
4084		}
4085		switch (op_insn->op) {
4086			default:
4087				IR_ASSERT(0 && "NIY binary op");
4088			case IR_ADD:
4089				|	ASM_REG_REG_OP add, type, reg, op2_reg
4090				break;
4091			case IR_SUB:
4092				|	ASM_REG_REG_OP sub, type, reg, op2_reg
4093				break;
4094			case IR_OR:
4095				|	ASM_REG_REG_OP or, type, reg, op2_reg
4096				break;
4097			case IR_AND:
4098				|	ASM_REG_REG_OP and, type, reg, op2_reg
4099				break;
4100			case IR_XOR:
4101				|	ASM_REG_REG_OP xor, type, reg, op2_reg
4102				break;
4103		}
4104	}
4105}
4106
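/* Strength reduction for multiplication, unsigned division and unsigned
 * modulo by a power of two: SHL/SHR/AND (ADD reg, reg for the *2 case). */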
4107static void ir_emit_mul_div_mod_pwr2(ir_ctx *ctx, ir_ref def, ir_insn *insn)
4108{
4109	ir_backend_data *data = ctx->data;
4110	dasm_State **Dst = &data->dasm_state;
4111	ir_type type = insn->type;
4112	ir_ref op1 = insn->op1;
4113	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
4114	ir_reg op1_reg = ctx->regs[def][1];
4115
4116	IR_ASSERT(IR_IS_CONST_REF(insn->op2));
4117	IR_ASSERT(!IR_IS_SYM_CONST(ctx->ir_base[insn->op2].op));
4118	IR_ASSERT(def_reg != IR_REG_NONE);
4119
4120	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
4121		op1_reg = IR_REG_NUM(op1_reg);
4122		ir_emit_load(ctx, type, op1_reg, op1);
4123	}
4124	if (def_reg != op1_reg) {
4125		if (op1_reg != IR_REG_NONE) {
4126			ir_emit_mov(ctx, type, def_reg, op1_reg);
4127		} else {
4128			ir_emit_load(ctx, type, def_reg, op1);
4129		}
4130	}
4131	if (insn->op == IR_MUL) {
4132		uint32_t shift = IR_LOG2(ctx->ir_base[insn->op2].val.u64);
4133
4134		if (shift == 1) {
4135			|	ASM_REG_REG_OP add, type, def_reg, def_reg
4136		} else {
4137			|	ASM_REG_IMM_OP shl, type, def_reg, shift
4138		}
4139	} else if (insn->op == IR_DIV) {
4140		uint32_t shift = IR_LOG2(ctx->ir_base[insn->op2].val.u64);
4141
4142		|	ASM_REG_IMM_OP shr, type, def_reg, shift
4143	} else {
4144		IR_ASSERT(insn->op == IR_MOD);
4145		uint64_t mask = ctx->ir_base[insn->op2].val.u64 - 1;
4146
4147|.if X64
4148||		if (ir_type_size[type] == 8 && ctx->regs[def][2] != IR_REG_NONE) {
4149||			ir_reg op2_reg = ctx->regs[def][2];
4150||
4151||			op2_reg = IR_REG_NUM(op2_reg);
4152||			ir_emit_load_imm_int(ctx, type, op2_reg, mask);
4153			|	ASM_REG_REG_OP and, type, def_reg, op2_reg
4154||		} else {
4155|.endif
4156			|	ASM_REG_IMM_OP and, type, def_reg, mask
4157|.if X64
4158||		}
4159|.endif
4160	}
4161	if (IR_REG_SPILLED(ctx->regs[def][0])) {
4162		ir_emit_store(ctx, type, def, def_reg);
4163	}
4164}
4165
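/* Signed division by 2^n must round toward zero, so negative dividends are
 * biased by 2^n-1 before the arithmetic right shift (LEA + TEST/CMOVNS, or a
 * shift of the sign bit down when n == 1). */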
4166static void ir_emit_sdiv_pwr2(ir_ctx *ctx, ir_ref def, ir_insn *insn)
4167{
4168	ir_backend_data *data = ctx->data;
4169	dasm_State **Dst = &data->dasm_state;
4170	ir_type type = insn->type;
4171	ir_ref op1 = insn->op1;
4172	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
4173	ir_reg op1_reg = ctx->regs[def][1];
4174	uint32_t shift = IR_LOG2(ctx->ir_base[insn->op2].val.u64);
4175	int64_t offset = ctx->ir_base[insn->op2].val.u64 - 1;
4176
4177	IR_ASSERT(shift != 0);
4178	IR_ASSERT(IR_IS_CONST_REF(insn->op2));
4179	IR_ASSERT(!IR_IS_SYM_CONST(ctx->ir_base[insn->op2].op));
4180	IR_ASSERT(op1_reg != IR_REG_NONE && def_reg != IR_REG_NONE && op1_reg != def_reg);
4181
4182	if (IR_REG_SPILLED(op1_reg)) {
4183		op1_reg = IR_REG_NUM(op1_reg);
4184		ir_emit_load(ctx, type, op1_reg, op1);
4185	}
4186
4187	if (shift == 1) {
4188|.if X64
4189||		if (ir_type_size[type] == 8) {
4190			|	mov Rq(def_reg), Rq(op1_reg)
4191			|	ASM_REG_IMM_OP shr, type, def_reg, 63
4192			|	add Rq(def_reg), Rq(op1_reg)
4193||		} else {
4194|.endif
4195			|	mov Rd(def_reg), Rd(op1_reg)
4196			|	ASM_REG_IMM_OP shr, type, def_reg, (ir_type_size[type]*8-1)
4197			|	add Rd(def_reg), Rd(op1_reg)
4198|.if X64
4199||		}
4200|.endif
4201	} else {
4202|.if X64
4203||		if (ir_type_size[type] == 8) {
4204||			ir_reg op2_reg = ctx->regs[def][2];
4205||
4206||			if (op2_reg != IR_REG_NONE) {
4207||				op2_reg = IR_REG_NUM(op2_reg);
4208||				ir_emit_load_imm_int(ctx, type, op2_reg, offset);
4209				|	lea Rq(def_reg), [Rq(op1_reg)+Rq(op2_reg)]
4210||			} else {
4211				|	lea Rq(def_reg), [Rq(op1_reg)+(int32_t)offset]
4212||			}
4213||		} else {
4214|.endif
4215			|	lea Rd(def_reg), [Rd(op1_reg)+(int32_t)offset]
4216|.if X64
4217||		}
4218|.endif
4219		|	ASM_REG_REG_OP test, type, op1_reg, op1_reg
4220		|	ASM_REG_REG_OP2 cmovns, type, def_reg, op1_reg
4221	}
4222	|	ASM_REG_IMM_OP sar, type, def_reg, shift
4223
4224	if (IR_REG_SPILLED(ctx->regs[def][0])) {
4225		ir_emit_store(ctx, type, def, def_reg);
4226	}
4227}
4228
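/* Signed modulo by 2^n: tmp_reg receives the sign bits shifted into the low n
 * positions (t = 2^n-1 for negative values, 0 otherwise), and the result is
 * computed as ((x + t) & (2^n - 1)) - t. */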
4229static void ir_emit_smod_pwr2(ir_ctx *ctx, ir_ref def, ir_insn *insn)
4230{
4231	ir_backend_data *data = ctx->data;
4232	dasm_State **Dst = &data->dasm_state;
4233	ir_type type = insn->type;
4234	ir_ref op1 = insn->op1;
4235	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
4236	ir_reg op1_reg = ctx->regs[def][1];
4237	ir_reg tmp_reg = ctx->regs[def][3];
4238	uint32_t shift = IR_LOG2(ctx->ir_base[insn->op2].val.u64);
4239	uint64_t mask = ctx->ir_base[insn->op2].val.u64 - 1;
4240
4241	IR_ASSERT(shift != 0);
4242	IR_ASSERT(IR_IS_CONST_REF(insn->op2));
4243	IR_ASSERT(!IR_IS_SYM_CONST(ctx->ir_base[insn->op2].op));
4244	IR_ASSERT(def_reg != IR_REG_NONE && tmp_reg != IR_REG_NONE && def_reg != tmp_reg);
4245
4246	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
4247		op1_reg = IR_REG_NUM(op1_reg);
4248		ir_emit_load(ctx, type, op1_reg, op1);
4249	}
4250	if (def_reg != op1_reg) {
4251		if (op1_reg != IR_REG_NONE) {
4252			ir_emit_mov(ctx, type, def_reg, op1_reg);
4253		} else {
4254			ir_emit_load(ctx, type, def_reg, op1);
4255		}
4256	}
4257	if (tmp_reg != op1_reg) {
4258		ir_emit_mov(ctx, type, tmp_reg, def_reg);
4259	}
4260
4262	if (shift == 1) {
4263		|	ASM_REG_IMM_OP shr, type, tmp_reg, (ir_type_size[type]*8-1)
4264	} else {
4265		|	ASM_REG_IMM_OP sar, type, tmp_reg, (ir_type_size[type]*8-1)
4266		|	ASM_REG_IMM_OP shr, type, tmp_reg, (ir_type_size[type]*8-shift)
4267	}
4268	|	ASM_REG_REG_OP add, type, def_reg, tmp_reg
4269
4270|.if X64
4271||	if (ir_type_size[type] == 8 && ctx->regs[def][2] != IR_REG_NONE) {
4272||		ir_reg op2_reg = ctx->regs[def][2];
4273||
4274||		op2_reg = IR_REG_NUM(op2_reg);
4275||		ir_emit_load_imm_int(ctx, type, op2_reg, mask);
4276		|	ASM_REG_REG_OP and, type, def_reg, op2_reg
4277||	} else {
4278|.endif
4279		|	ASM_REG_IMM_OP and, type, def_reg, mask
4280|.if X64
4281||	}
4282|.endif
4283
4284	|	ASM_REG_REG_OP sub, type, def_reg, tmp_reg
4285
4286	if (IR_REG_SPILLED(ctx->regs[def][0])) {
4287		ir_emit_store(ctx, type, def, def_reg);
4288	}
4289}
4290
4291static void ir_emit_mem_mul_div_mod_pwr2(ir_ctx *ctx, ir_ref def, ir_insn *insn)
4292{
4293	ir_backend_data *data = ctx->data;
4294	dasm_State **Dst = &data->dasm_state;
4295	ir_insn *op_insn = &ctx->ir_base[insn->op3];
4296	ir_type type = op_insn->type;
4297	ir_mem mem;
4298
4299	IR_ASSERT(IR_IS_CONST_REF(op_insn->op2));
4300	IR_ASSERT(!IR_IS_SYM_CONST(ctx->ir_base[op_insn->op2].op));
4301
4302	if (insn->op == IR_STORE) {
4303		mem = ir_fuse_mem(ctx, def, def, insn, ctx->regs[def][2]);
4304	} else {
4305		IR_ASSERT(insn->op == IR_VSTORE);
4306		mem = ir_var_spill_slot(ctx, insn->op2);
4307	}
4308
4309	if (op_insn->op == IR_MUL) {
4310		uint32_t shift = IR_LOG2(ctx->ir_base[op_insn->op2].val.u64);
4311		|	ASM_MEM_IMM_OP shl, type, mem, shift
4312	} else if (op_insn->op == IR_DIV) {
4313		uint32_t shift = IR_LOG2(ctx->ir_base[op_insn->op2].val.u64);
4314		|	ASM_MEM_IMM_OP shr, type, mem, shift
4315	} else {
4316		IR_ASSERT(op_insn->op == IR_MOD);
4317		uint64_t mask = ctx->ir_base[op_insn->op2].val.u64 - 1;
4318		IR_ASSERT(IR_IS_UNSIGNED_32BIT(mask));
4319		|	ASM_MEM_IMM_OP and, type, mem, mask
4320	}
4321}
4322
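/* Variable shifts and rotates: x86 takes the count in CL, so op2 is moved
 * into RCX first; the register allocator guarantees def_reg != RCX. */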
4323static void ir_emit_shift(ir_ctx *ctx, ir_ref def, ir_insn *insn)
4324{
4325	ir_backend_data *data = ctx->data;
4326	dasm_State **Dst = &data->dasm_state;
4327	ir_type type = insn->type;
4328	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
4329	ir_reg op1_reg = ctx->regs[def][1];
4330	ir_reg op2_reg = ctx->regs[def][2];
4331
4332	IR_ASSERT(def_reg != IR_REG_NONE && def_reg != IR_REG_RCX);
4333	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
4334		op1_reg = IR_REG_NUM(op1_reg);
4335		ir_emit_load(ctx, type, op1_reg, insn->op1);
4336	}
4337	if (op2_reg != IR_REG_NONE && IR_REG_SPILLED(op2_reg)) {
4338		op2_reg = IR_REG_NUM(op2_reg);
4339		ir_emit_load(ctx, type, op2_reg, insn->op2);
4340	}
4341	if (op2_reg != IR_REG_RCX) {
4342		if (op1_reg == IR_REG_RCX) {
4343			ir_emit_mov(ctx, type, def_reg, op1_reg);
4344			op1_reg = def_reg;
4345		}
4346		if (op2_reg != IR_REG_NONE) {
4347			ir_emit_mov(ctx, type, IR_REG_RCX, op2_reg);
4348		} else {
4349			ir_emit_load(ctx, type, IR_REG_RCX, insn->op2);
4350		}
4351	}
4352	if (def_reg != op1_reg) {
4353		if (op1_reg != IR_REG_NONE) {
4354			ir_emit_mov(ctx, type, def_reg, op1_reg);
4355		} else {
4356			ir_emit_load(ctx, type, def_reg, insn->op1);
4357		}
4358	}
4359	switch (insn->op) {
4360		default:
4361			IR_ASSERT(0);
4362		case IR_SHL:
4363			|	ASM_REG_TXT_OP shl, insn->type, def_reg, cl
4364			break;
4365		case IR_SHR:
4366			|	ASM_REG_TXT_OP shr, insn->type, def_reg, cl
4367			break;
4368		case IR_SAR:
4369			|	ASM_REG_TXT_OP sar, insn->type, def_reg, cl
4370			break;
4371		case IR_ROL:
4372			|	ASM_REG_TXT_OP rol, insn->type, def_reg, cl
4373			break;
4374		case IR_ROR:
4375			|	ASM_REG_TXT_OP ror, insn->type, def_reg, cl
4376			break;
4377	}
4378	if (IR_REG_SPILLED(ctx->regs[def][0])) {
4379		ir_emit_store(ctx, type, def, def_reg);
4380	}
4381}
4382
4383static void ir_emit_mem_shift(ir_ctx *ctx, ir_ref def, ir_insn *insn)
4384{
4385	ir_backend_data *data = ctx->data;
4386	dasm_State **Dst = &data->dasm_state;
4387	ir_insn *op_insn = &ctx->ir_base[insn->op3];
4388	ir_type type = op_insn->type;
4389	ir_ref op2 = op_insn->op2;
4390	ir_reg op2_reg = ctx->regs[insn->op3][2];
4391	ir_mem mem;
4392
4393	if (insn->op == IR_STORE) {
4394		mem = ir_fuse_mem(ctx, def, def, insn, ctx->regs[def][2]);
4395	} else {
4396		IR_ASSERT(insn->op == IR_VSTORE);
4397		mem = ir_var_spill_slot(ctx, insn->op2);
4398	}
4399
4400	if (op2_reg != IR_REG_NONE && IR_REG_SPILLED(op2_reg)) {
4401		op2_reg = IR_REG_NUM(op2_reg);
4402		ir_emit_load(ctx, type, op2_reg, op2);
4403	}
4404	if (op2_reg != IR_REG_RCX) {
4405		if (op2_reg != IR_REG_NONE) {
4406			ir_emit_mov(ctx, type, IR_REG_RCX, op2_reg);
4407		} else {
4408			ir_emit_load(ctx, type, IR_REG_RCX, op2);
4409		}
4410	}
4411	switch (op_insn->op) {
4412		default:
4413			IR_ASSERT(0);
4414		case IR_SHL:
4415			|	ASM_MEM_TXT_OP shl, type, mem, cl
4416			break;
4417		case IR_SHR:
4418			|	ASM_MEM_TXT_OP shr, type, mem, cl
4419			break;
4420		case IR_SAR:
4421			|	ASM_MEM_TXT_OP sar, type, mem, cl
4422			break;
4423		case IR_ROL:
4424			|	ASM_MEM_TXT_OP rol, type, mem, cl
4425			break;
4426		case IR_ROR:
4427			|	ASM_MEM_TXT_OP ror, type, mem, cl
4428			break;
4429	}
4430}
4431
4432static void ir_emit_shift_const(ir_ctx *ctx, ir_ref def, ir_insn *insn)
4433{
4434	ir_backend_data *data = ctx->data;
4435	dasm_State **Dst = &data->dasm_state;
4436	int32_t shift;
4437	ir_type type = insn->type;
4438	ir_ref op1 = insn->op1;
4439	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
4440	ir_reg op1_reg = ctx->regs[def][1];
4441
4442	IR_ASSERT(!IR_IS_SYM_CONST(ctx->ir_base[insn->op2].op));
4443	IR_ASSERT(IR_IS_SIGNED_32BIT(ctx->ir_base[insn->op2].val.i64));
4444	shift = ctx->ir_base[insn->op2].val.i32;
4445	IR_ASSERT(def_reg != IR_REG_NONE);
4446
4447	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
4448		op1_reg = IR_REG_NUM(op1_reg);
4449		ir_emit_load(ctx, type, op1_reg, op1);
4450	}
4451	if (def_reg != op1_reg) {
4452		if (op1_reg != IR_REG_NONE) {
4453			ir_emit_mov(ctx, type, def_reg, op1_reg);
4454		} else {
4455			ir_emit_load(ctx, type, def_reg, op1);
4456		}
4457	}
4458	switch (insn->op) {
4459		default:
4460			IR_ASSERT(0);
4461		case IR_SHL:
4462			|	ASM_REG_IMM_OP shl, insn->type, def_reg, shift
4463			break;
4464		case IR_SHR:
4465			|	ASM_REG_IMM_OP shr, insn->type, def_reg, shift
4466			break;
4467		case IR_SAR:
4468			|	ASM_REG_IMM_OP sar, insn->type, def_reg, shift
4469			break;
4470		case IR_ROL:
4471			|	ASM_REG_IMM_OP rol, insn->type, def_reg, shift
4472			break;
4473		case IR_ROR:
4474			|	ASM_REG_IMM_OP ror, insn->type, def_reg, shift
4475			break;
4476	}
4477	if (IR_REG_SPILLED(ctx->regs[def][0])) {
4478		ir_emit_store(ctx, type, def, def_reg);
4479	}
4480}
4481
4482static void ir_emit_mem_shift_const(ir_ctx *ctx, ir_ref def, ir_insn *insn)
4483{
4484	ir_backend_data *data = ctx->data;
4485	dasm_State **Dst = &data->dasm_state;
4486	ir_insn *op_insn = &ctx->ir_base[insn->op3];
4487	ir_type type = op_insn->type;
4488	int32_t shift;
4489	ir_mem mem;
4490
4491	IR_ASSERT(IR_IS_CONST_REF(op_insn->op2));
4492	IR_ASSERT(!IR_IS_SYM_CONST(ctx->ir_base[op_insn->op2].op));
4493	IR_ASSERT(IR_IS_SIGNED_32BIT(ctx->ir_base[op_insn->op2].val.i64));
4494	shift = ctx->ir_base[op_insn->op2].val.i32;
4495	if (insn->op == IR_STORE) {
4496		mem = ir_fuse_mem(ctx, def, def, insn, ctx->regs[def][2]);
4497	} else {
4498		IR_ASSERT(insn->op == IR_VSTORE);
4499		mem = ir_var_spill_slot(ctx, insn->op2);
4500	}
4501
4502	switch (op_insn->op) {
4503		default:
4504			IR_ASSERT(0);
4505		case IR_SHL:
4506			|	ASM_MEM_IMM_OP shl, type, mem, shift
4507			break;
4508		case IR_SHR:
4509			|	ASM_MEM_IMM_OP shr, type, mem, shift
4510			break;
4511		case IR_SAR:
4512			|	ASM_MEM_IMM_OP sar, type, mem, shift
4513			break;
4514		case IR_ROL:
4515			|	ASM_MEM_IMM_OP rol, type, mem, shift
4516			break;
4517		case IR_ROR:
4518			|	ASM_MEM_IMM_OP ror, type, mem, shift
4519			break;
4520	}
4521}
4522
4523static void ir_emit_op_int(ir_ctx *ctx, ir_ref def, ir_insn *insn, uint32_t rule)
4524{
4525	ir_backend_data *data = ctx->data;
4526	dasm_State **Dst = &data->dasm_state;
4527	ir_type type = insn->type;
4528	ir_ref op1 = insn->op1;
4529	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
4530	ir_reg op1_reg = ctx->regs[def][1];
4531
4532	IR_ASSERT(def_reg != IR_REG_NONE);
4533
4534	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
4535		op1_reg = IR_REG_NUM(op1_reg);
4536		ir_emit_load(ctx, type, op1_reg, op1);
4537	}
4538	if (def_reg != op1_reg) {
4539		if (op1_reg != IR_REG_NONE) {
4540			ir_emit_mov(ctx, type, def_reg, op1_reg);
4541		} else {
4542			ir_emit_load(ctx, type, def_reg, op1);
4543		}
4544	}
4545	if (rule == IR_INC) {
4546		|	ASM_REG_OP inc, insn->type, def_reg
4547	} else if (rule == IR_DEC) {
4548		|	ASM_REG_OP dec, insn->type, def_reg
4549	} else if (insn->op == IR_NOT) {
4550		|	ASM_REG_OP not, insn->type, def_reg
4551	} else if (insn->op == IR_NEG) {
4552		|	ASM_REG_OP neg, insn->type, def_reg
4553	} else {
4554		IR_ASSERT(insn->op == IR_BSWAP);
4555		switch (ir_type_size[insn->type]) {
4556			default:
4557				IR_ASSERT(0);
4558			case 4:
4559				|	bswap Rd(def_reg)
4560				break;
4561			case 8:
4562				IR_ASSERT(sizeof(void*) == 8);
4563|.if X64
4564				|	bswap Rq(def_reg)
4565|.endif
4566				break;
4567		}
4568	}
4569	if (IR_REG_SPILLED(ctx->regs[def][0])) {
4570		ir_emit_store(ctx, type, def, def_reg);
4571	}
4572}
4573
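/* CTLZ/CTTZ/CTPOP: uses LZCNT/TZCNT/POPCNT when available (IR_X86_BMI1),
 * otherwise falls back to BSR/BSF, converting the bit index into a
 * leading-zero count with an XOR against width-1. */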
4574static void ir_emit_bit_count(ir_ctx *ctx, ir_ref def, ir_insn *insn)
4575{
4576	ir_backend_data *data = ctx->data;
4577	dasm_State **Dst = &data->dasm_state;
4578	ir_type type = insn->type;
4579	ir_ref op1 = insn->op1;
4580	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
4581	ir_reg op1_reg = ctx->regs[def][1];
4582
4583	IR_ASSERT(def_reg != IR_REG_NONE);
4584
4585	if (op1_reg != IR_REG_NONE) {
4586		if (IR_REG_SPILLED(op1_reg)) {
4587			op1_reg = IR_REG_NUM(op1_reg);
4588			ir_emit_load(ctx, type, op1_reg, op1);
4589		}
4590		switch (ir_type_size[insn->type]) {
4591			default:
4592				IR_ASSERT(0);
4593			case 2:
4594				if (insn->op == IR_CTLZ) {
4595					if (ctx->mflags & IR_X86_BMI1) {
4596						|	lzcnt Rw(def_reg), Rw(op1_reg)
4597					} else {
4598						|	bsr Rw(def_reg), Rw(op1_reg)
4599						|	xor Rw(def_reg), 0xf
4600					}
4601				} else if (insn->op == IR_CTTZ) {
4602					if (ctx->mflags & IR_X86_BMI1) {
4603						|	tzcnt Rw(def_reg), Rw(op1_reg)
4604					} else {
4605						|	bsf Rw(def_reg), Rw(op1_reg)
4606					}
4607				} else {
4608					IR_ASSERT(insn->op == IR_CTPOP);
4609					|	popcnt Rw(def_reg), Rw(op1_reg)
4610				}
4611				break;
4612			case 1:
4613				|	movzx Rd(op1_reg), Rb(op1_reg)
4614				if (insn->op == IR_CTLZ) {
4615					if (ctx->mflags & IR_X86_BMI1) {
4616						|	lzcnt Rd(def_reg), Rd(op1_reg)
4617						|	sub Rd(def_reg), 24
4618					} else {
4619						|	bsr Rd(def_reg), Rd(op1_reg)
4620						|	xor Rw(def_reg), 0x7
4621					}
4622					break;
4623				}
4624				IR_FALLTHROUGH;
4625			case 4:
4626				if (insn->op == IR_CTLZ) {
4627					if (ctx->mflags & IR_X86_BMI1) {
4628						|	lzcnt Rd(def_reg), Rd(op1_reg)
4629					} else {
4630						|	bsr Rd(def_reg), Rd(op1_reg)
4631						|	xor Rw(def_reg), 0x1f
4632					}
4633				} else if (insn->op == IR_CTTZ) {
4634					if (ctx->mflags & IR_X86_BMI1) {
4635						|	tzcnt Rd(def_reg), Rd(op1_reg)
4636					} else {
4637						|	bsf Rd(def_reg), Rd(op1_reg)
4638					}
4639				} else {
4640					IR_ASSERT(insn->op == IR_CTPOP);
4641					|	popcnt Rd(def_reg), Rd(op1_reg)
4642				}
4643				break;
4644|.if X64
4645			case 8:
4646				if (insn->op == IR_CTLZ) {
4647					if (ctx->mflags & IR_X86_BMI1) {
4648						|	lzcnt Rq(def_reg), Rq(op1_reg)
4649					} else {
4650						|	bsr Rq(def_reg), Rq(op1_reg)
4651						|	xor Rw(def_reg), 0x3f
4652					}
4653				} else if (insn->op == IR_CTTZ) {
4654					if (ctx->mflags & IR_X86_BMI1) {
4655						|	tzcnt Rq(def_reg), Rq(op1_reg)
4656					} else {
4657						|	bsf Rq(def_reg), Rq(op1_reg)
4658					}
4659				} else {
4660					IR_ASSERT(insn->op == IR_CTPOP);
4661					|	popcnt Rq(def_reg), Rq(op1_reg)
4662				}
4663				break;
4664|.endif
4665		}
4666	} else {
4667		ir_mem mem;
4668
4669		if (ir_rule(ctx, op1) & IR_FUSED) {
4670			mem = ir_fuse_load(ctx, def, op1);
4671		} else {
4672			mem = ir_ref_spill_slot(ctx, op1);
4673		}
4674		switch (ir_type_size[insn->type]) {
4675			default:
4676				IR_ASSERT(0);
4677			case 2:
4678				if (insn->op == IR_CTLZ) {
4679					if (ctx->mflags & IR_X86_BMI1) {
4680						|	ASM_TXT_TMEM_OP lzcnt, Rw(def_reg), word, mem
4681					} else {
4682						|	ASM_TXT_TMEM_OP bsr, Rw(def_reg), word, mem
4683						|	xor Rw(def_reg), 0xf
4684					}
4685				} else if (insn->op == IR_CTTZ) {
4686					if (ctx->mflags & IR_X86_BMI1) {
4687						|	ASM_TXT_TMEM_OP tzcnt, Rw(def_reg), word, mem
4688					} else {
4689						|	ASM_TXT_TMEM_OP bsf, Rw(def_reg), word, mem
4690					}
4691				} else {
4692					|	ASM_TXT_TMEM_OP popcnt, Rw(def_reg), word, mem
4693				}
4694				break;
4695			case 4:
4696				if (insn->op == IR_CTLZ) {
4697					if (ctx->mflags & IR_X86_BMI1) {
4698						|	ASM_TXT_TMEM_OP lzcnt, Rd(def_reg), dword, mem
4699					} else {
4700						|	ASM_TXT_TMEM_OP bsr, Rd(def_reg), dword, mem
4701						|	xor Rw(def_reg), 0x1f
4702					}
4703				} else if (insn->op == IR_CTTZ) {
4704					if (ctx->mflags & IR_X86_BMI1) {
4705						|	ASM_TXT_TMEM_OP tzcnt, Rd(def_reg), dword, mem
4706					} else {
4707						|	ASM_TXT_TMEM_OP bsf, Rd(def_reg), dword, mem
4708					}
4709				} else {
4710					|	ASM_TXT_TMEM_OP popcnt, Rd(def_reg), dword, mem
4711				}
4712				break;
4713|.if X64
4714			case 8:
4715				if (insn->op == IR_CTLZ) {
4716					if (ctx->mflags & IR_X86_BMI1) {
4717						|	ASM_TXT_TMEM_OP lzcnt, Rq(def_reg), qword, mem
4718					} else {
4719						|	ASM_TXT_TMEM_OP bsr, Rq(def_reg), qword, mem
4720						|	xor Rw(def_reg), 0x3f
4721					}
4722				} else if (insn->op == IR_CTTZ) {
4723					if (ctx->mflags & IR_X86_BMI1) {
4724						|	ASM_TXT_TMEM_OP tzcnt, Rq(def_reg), qword, mem
4725					} else {
4726						|	ASM_TXT_TMEM_OP bsf, Rq(def_reg), qword, mem
4727					}
4728				} else {
4729					|	ASM_TXT_TMEM_OP popcnt, Rq(def_reg), qword, mem
4730				}
4731				break;
4732|.endif
4733		}
4734	}
4735
4736	if (IR_REG_SPILLED(ctx->regs[def][0])) {
4737		ir_emit_store(ctx, type, def, def_reg);
4738	}
4739}
4740
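/* Population count without the POPCNT instruction: the classic SWAR
 * divide-and-conquer using the 0x55../0x33../0x0f.. masks; the 64-bit variant
 * needs an extra register (mov64) for the wide constants. */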
4741static void ir_emit_ctpop(ir_ctx *ctx, ir_ref def, ir_insn *insn)
4742{
4743	ir_backend_data *data = ctx->data;
4744	dasm_State **Dst = &data->dasm_state;
4745	ir_type type = insn->type;
4746	ir_ref op1 = insn->op1;
4747	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
4748	ir_reg op1_reg = ctx->regs[def][1];
4749	ir_reg tmp_reg = ctx->regs[def][2];
4750|.if X64
4751||	ir_reg const_reg = ctx->regs[def][3];
4752|.endif
4753
4754	IR_ASSERT(def_reg != IR_REG_NONE && tmp_reg != IR_REG_NONE);
4755	if (op1_reg == IR_REG_NONE) {
4756		ir_emit_load(ctx, type, def_reg, op1);
4757		if (ir_type_size[insn->type] == 1) {
4758			|	movzx Rd(def_reg), Rb(def_reg)
4759		} else if (ir_type_size[insn->type] == 2) {
4760			|	movzx Rd(def_reg), Rw(def_reg)
4761		}
4762	} else {
4763		if (IR_REG_SPILLED(op1_reg)) {
4764			op1_reg = IR_REG_NUM(op1_reg);
4765			ir_emit_load(ctx, type, op1_reg, op1);
4766		}
4767		switch (ir_type_size[insn->type]) {
4768			default:
4769				IR_ASSERT(0);
4770			case 1:
4771				|	movzx Rd(def_reg), Rb(op1_reg)
4772				break;
4773			case 2:
4774				|	movzx Rd(def_reg), Rw(op1_reg)
4775				break;
4776			case 4:
4777				|	mov Rd(def_reg), Rd(op1_reg)
4778				break;
4779|.if X64
4780||			case 8:
4781				|	mov Rq(def_reg), Rq(op1_reg)
4782||				break;
4783|.endif
4784		}
4785	}
4786	switch (ir_type_size[insn->type]) {
4787		default:
4788			IR_ASSERT(0);
4789		case 1:
4790			|	mov Rd(tmp_reg), Rd(def_reg)
4791			|	shr Rd(def_reg), 1
4792			|	and Rd(def_reg), 0x55
4793			|	sub Rd(tmp_reg), Rd(def_reg)
4794			|	mov Rd(def_reg), Rd(tmp_reg)
4795			|	and Rd(def_reg), 0x33
4796			|	shr Rd(tmp_reg), 2
4797			|	and Rd(tmp_reg), 0x33
4798			|	add Rd(tmp_reg), Rd(def_reg)
4799			|	mov Rd(def_reg), Rd(tmp_reg)
4800			|	shr Rd(def_reg), 4
4801			|	add Rd(def_reg), Rd(tmp_reg)
4802			|	and Rd(def_reg), 0x0f
4803			break;
4804		case 2:
4805			|	mov Rd(tmp_reg), Rd(def_reg)
4806			|	shr Rd(def_reg), 1
4807			|	and Rd(def_reg), 0x5555
4808			|	sub Rd(tmp_reg), Rd(def_reg)
4809			|	mov Rd(def_reg), Rd(tmp_reg)
4810			|	and Rd(def_reg), 0x3333
4811			|	shr Rd(tmp_reg), 2
4812			|	and Rd(tmp_reg), 0x3333
4813			|	add Rd(tmp_reg), Rd(def_reg)
4814			|	mov Rd(def_reg), Rd(tmp_reg)
4815			|	shr Rd(def_reg), 4
4816			|	add Rd(def_reg), Rd(tmp_reg)
4817			|	and Rd(def_reg), 0x0f0f
4818			|	mov	Rd(tmp_reg), Rd(def_reg)
4819			|	shr Rd(tmp_reg), 8
4820			|	and Rd(def_reg), 0x0f
4821			|	add Rd(def_reg), Rd(tmp_reg)
4822			break;
4823		case 4:
4824			|	mov Rd(tmp_reg), Rd(def_reg)
4825			|	shr Rd(def_reg), 1
4826			|	and Rd(def_reg), 0x55555555
4827			|	sub Rd(tmp_reg), Rd(def_reg)
4828			|	mov Rd(def_reg), Rd(tmp_reg)
4829			|	and Rd(def_reg), 0x33333333
4830			|	shr Rd(tmp_reg), 2
4831			|	and Rd(tmp_reg), 0x33333333
4832			|	add Rd(tmp_reg), Rd(def_reg)
4833			|	mov Rd(def_reg), Rd(tmp_reg)
4834			|	shr Rd(def_reg), 4
4835			|	add Rd(def_reg), Rd(tmp_reg)
4836			|	and Rd(def_reg), 0x0f0f0f0f
4837			|	imul Rd(def_reg), 0x01010101
4838			|	shr Rd(def_reg), 24
4839			break;
4840|.if X64
4841||		case 8:
4842||			IR_ASSERT(const_reg != IR_REG_NONE);
4843			|	mov Rq(tmp_reg), Rq(def_reg)
4844			|	shr Rq(def_reg), 1
4845			|	mov64 Rq(const_reg), 0x5555555555555555
4846			|	and Rq(def_reg), Rq(const_reg)
4847			|	sub Rq(tmp_reg), Rq(def_reg)
4848			|	mov Rq(def_reg), Rq(tmp_reg)
4849			|	mov64 Rq(const_reg), 0x3333333333333333
4850			|	and Rq(def_reg), Rq(const_reg)
4851			|	shr Rq(tmp_reg), 2
4852			|	and Rq(tmp_reg), Rq(const_reg)
4853			|	add Rq(tmp_reg), Rq(def_reg)
4854			|	mov Rq(def_reg), Rq(tmp_reg)
4855			|	shr Rq(def_reg), 4
4856			|	add Rq(def_reg), Rq(tmp_reg)
4857			|	mov64 Rq(const_reg), 0x0f0f0f0f0f0f0f0f
4858			|	and Rq(def_reg), Rq(const_reg)
4859			|	mov64 Rq(const_reg), 0x0101010101010101
4860			|	imul Rq(def_reg), Rq(const_reg)
4861			|	shr Rq(def_reg), 56
4862||			break;
4863|.endif
4864	}
4865
4866	if (IR_REG_SPILLED(ctx->regs[def][0])) {
4867		ir_emit_store(ctx, type, def, def_reg);
4868	}
4869}
4870
4871static void ir_emit_mem_op_int(ir_ctx *ctx, ir_ref def, ir_insn *insn, uint32_t rule)
4872{
4873	ir_backend_data *data = ctx->data;
4874	dasm_State **Dst = &data->dasm_state;
4875	ir_insn *op_insn = &ctx->ir_base[insn->op3];
4876	ir_type type = op_insn->type;
4877	ir_mem mem;
4878
4879	if (insn->op == IR_STORE) {
4880		mem = ir_fuse_mem(ctx, def, def, insn, ctx->regs[def][2]);
4881	} else {
4882		IR_ASSERT(insn->op == IR_VSTORE);
4883		mem = ir_var_spill_slot(ctx, insn->op2);
4884	}
4885
4886	if (rule == IR_MEM_INC) {
4887		|	ASM_MEM_OP inc, type, mem
4888	} else if (rule == IR_MEM_DEC) {
4889		|	ASM_MEM_OP dec, type, mem
4890	} else if (op_insn->op == IR_NOT) {
4891		|	ASM_MEM_OP not, type, mem
4892	} else {
4893		IR_ASSERT(op_insn->op == IR_NEG);
4894		|	ASM_MEM_OP neg, type, mem
4895	}
4896}
4897
4898static void ir_emit_abs_int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
4899{
4900	ir_backend_data *data = ctx->data;
4901	dasm_State **Dst = &data->dasm_state;
4902	ir_type type = insn->type;
4903	ir_ref op1 = insn->op1;
4904	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
4905	ir_reg op1_reg = ctx->regs[def][1];
4906
4907	IR_ASSERT(def_reg != IR_REG_NONE && op1_reg != IR_REG_NONE);
4908
4909	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
4910		op1_reg = IR_REG_NUM(op1_reg);
4911		ir_emit_load(ctx, type, op1_reg, op1);
4912	}
4913
4914	IR_ASSERT(def_reg != op1_reg);
4915
4916	ir_emit_mov(ctx, insn->type, def_reg, op1_reg);
4917	|	ASM_REG_OP neg, insn->type, def_reg
4918	|	ASM_REG_REG_OP2 cmovs, type, def_reg, op1_reg
4919	if (IR_REG_SPILLED(ctx->regs[def][0])) {
4920		ir_emit_store(ctx, type, def, def_reg);
4921	}
4922}
4923
4924static void ir_emit_bool_not_int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
4925{
4926	ir_backend_data *data = ctx->data;
4927	dasm_State **Dst = &data->dasm_state;
4928	ir_type type = ctx->ir_base[insn->op1].type;
4929	ir_ref op1 = insn->op1;
4930	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
4931	ir_reg op1_reg = ctx->regs[def][1];
4932
4933	IR_ASSERT(def_reg != IR_REG_NONE);
4934
4935	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
4936		op1_reg = IR_REG_NUM(op1_reg);
4937		ir_emit_load(ctx, type, op1_reg, op1);
4938	}
4939
4940	if (op1_reg != IR_REG_NONE) {
4941		|	ASM_REG_REG_OP test, type, op1_reg, op1_reg
4942	} else {
4943		ir_mem mem = ir_ref_spill_slot(ctx, op1);
4944
4945		|	ASM_MEM_IMM_OP cmp, type, mem, 0
4946	}
4947	|	sete Rb(def_reg)
4948
4949	if (IR_REG_SPILLED(ctx->regs[def][0])) {
4950		ir_emit_store(ctx, insn->type, def, def_reg);
4951	}
4952}
4953
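/* Widening MUL/IMUL and DIV/IDIV use the fixed RAX/RDX register pair: the
 * dividend is extended into RDX:RAX first (CQO/CDQ/CWD, or XOR RDX for
 * unsigned), the product/quotient ends up in RAX and the remainder in RDX
 * (AH for 8-bit operations). */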
4954static void ir_emit_mul_div_mod(ir_ctx *ctx, ir_ref def, ir_insn *insn)
4955{
4956	ir_backend_data *data = ctx->data;
4957	dasm_State **Dst = &data->dasm_state;
4958	ir_type type = insn->type;
4959	ir_ref op1 = insn->op1;
4960	ir_ref op2 = insn->op2;
4961	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
4962	ir_reg op1_reg = ctx->regs[def][1];
4963	ir_reg op2_reg = ctx->regs[def][2];
4964	ir_mem mem;
4965
4966	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
4967		op1_reg = IR_REG_NUM(op1_reg);
4968		ir_emit_load(ctx, type, op1_reg, op1);
4969	}
4970	if (op1_reg != IR_REG_RAX) {
4971		if (op1_reg != IR_REG_NONE) {
4972			ir_emit_mov(ctx, type, IR_REG_RAX, op1_reg);
4973		} else {
4974			ir_emit_load(ctx, type, IR_REG_RAX, op1);
4975		}
4976	}
4977	if (op2_reg == IR_REG_NONE && op1 == op2) {
4978		op2_reg = IR_REG_RAX;
4979	} else if (op2_reg != IR_REG_NONE) {
4980		if (IR_REG_SPILLED(op2_reg)) {
4981			op2_reg = IR_REG_NUM(op2_reg);
4982			ir_emit_load(ctx, type, op2_reg, op2);
4983		}
4984	} else if (IR_IS_CONST_REF(op2)
4985	 && (insn->op == IR_MUL || insn->op == IR_MUL_OV)) {
4986		op2_reg = IR_REG_RDX;
4987		ir_emit_load(ctx, type, op2_reg, op2);
4988	}
4989	if (insn->op == IR_MUL || insn->op == IR_MUL_OV) {
4990		if (IR_IS_TYPE_SIGNED(insn->type)) {
4991			if (op2_reg != IR_REG_NONE) {
4992				|	ASM_REG_OP imul, type, op2_reg
4993			} else {
4994				if (ir_rule(ctx, op2) & IR_FUSED) {
4995					mem = ir_fuse_load(ctx, def, op2);
4996				} else {
4997					mem = ir_ref_spill_slot(ctx, op2);
4998				}
4999				|	ASM_MEM_OP imul, type, mem
5000			}
5001		} else {
5002			if (op2_reg != IR_REG_NONE) {
5003				|	ASM_REG_OP mul, type, op2_reg
5004			} else {
5005				if (ir_rule(ctx, op2) & IR_FUSED) {
5006					mem = ir_fuse_load(ctx, def, op2);
5007				} else {
5008					mem = ir_ref_spill_slot(ctx, op2);
5009				}
5010				|	ASM_MEM_OP mul, type, mem
5011			}
5012		}
5013	} else {
5014		if (IR_IS_TYPE_SIGNED(type)) {
5015			if (ir_type_size[type] == 8) {
5016				|	cqo
5017			} else if (ir_type_size[type] == 4) {
5018				|	cdq
5019			} else if (ir_type_size[type] == 2) {
5020				|	cwd
5021			} else {
5022				|	movsx ax, al
5023			}
5024			if (op2_reg != IR_REG_NONE) {
5025				|	ASM_REG_OP idiv, type, op2_reg
5026			} else {
5027				if (ir_rule(ctx, op2) & IR_FUSED) {
5028					mem = ir_fuse_load(ctx, def, op2);
5029				} else {
5030					mem = ir_ref_spill_slot(ctx, op2);
5031				}
5032				|	ASM_MEM_OP idiv, type, mem
5033			}
5034		} else {
5035			if (ir_type_size[type] == 1) {
5036				|	movzx ax, al
5037			} else {
5038				|	ASM_REG_REG_OP xor, type, IR_REG_RDX, IR_REG_RDX
5039			}
5040			if (op2_reg != IR_REG_NONE) {
5041				|	ASM_REG_OP div, type, op2_reg
5042			} else {
5043				if (ir_rule(ctx, op2) & IR_FUSED) {
5044					mem = ir_fuse_load(ctx, def, op2);
5045				} else {
5046					mem = ir_ref_spill_slot(ctx, op2);
5047				}
5048				|	ASM_MEM_OP div, type, mem
5049			}
5050		}
5051	}
5052
5053	if (insn->op == IR_MUL || insn->op == IR_MUL_OV || insn->op == IR_DIV) {
5054		if (def_reg != IR_REG_NONE) {
5055			if (def_reg != IR_REG_RAX) {
5056				ir_emit_mov(ctx, type, def_reg, IR_REG_RAX);
5057			}
5058			if (IR_REG_SPILLED(ctx->regs[def][0])) {
5059				ir_emit_store(ctx, type, def, def_reg);
5060			}
5061		} else {
5062			ir_emit_store(ctx, type, def, IR_REG_RAX);
5063		}
5064	} else {
5065		IR_ASSERT(insn->op == IR_MOD);
5066		if (ir_type_size[type] == 1) {
5067			if (def_reg != IR_REG_NONE) {
5068				|	mov al, ah
5069				if (def_reg != IR_REG_RAX) {
5070					|	mov Rb(def_reg), al
5071				}
5072				if (IR_REG_SPILLED(ctx->regs[def][0])) {
5073					ir_emit_store(ctx, type, def, def_reg);
5074				}
5075			} else {
5076				ir_reg fp;
5077				int32_t offset = ir_ref_spill_slot_offset(ctx, def, &fp);
5078
5079				/* the 8-bit remainder is returned in AH; store it directly to the spill slot */
5080				|	mov byte [Ra(fp)+offset], ah
5081			}
5082		} else {
5083			if (def_reg != IR_REG_NONE) {
5084				if (def_reg != IR_REG_RDX) {
5085					ir_emit_mov(ctx, type, def_reg, IR_REG_RDX);
5086				}
5087				if (IR_REG_SPILLED(ctx->regs[def][0])) {
5088					ir_emit_store(ctx, type, def, def_reg);
5089				}
5090			} else {
5091				ir_emit_store(ctx, type, def, IR_REG_RDX);
5092			}
5093		}
5094	}
5095}
5096
5097static void ir_rodata(ir_ctx *ctx)
5098{
5099	ir_backend_data *data = ctx->data;
5100	dasm_State **Dst = &data->dasm_state;
5101
5102	|.rodata
5103	if (!data->rodata_label) {
5104		int label = data->rodata_label = ctx->cfg_blocks_count + ctx->consts_count + 2;
5105		|=>label:
5106	}
5107}
5108
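/* SSE has no direct FP negation or absolute value: XOR/AND the value with a
 * 16-byte sign-bit mask that is emitted into .rodata on first use. */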
5109static void ir_emit_op_fp(ir_ctx *ctx, ir_ref def, ir_insn *insn)
5110{
5111	ir_backend_data *data = ctx->data;
5112	dasm_State **Dst = &data->dasm_state;
5113	ir_type type = insn->type;
5114	ir_ref op1 = insn->op1;
5115	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
5116	ir_reg op1_reg = ctx->regs[def][1];
5117
5118	IR_ASSERT(def_reg != IR_REG_NONE);
5119
5120	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
5121		op1_reg = IR_REG_NUM(op1_reg);
5122		ir_emit_load(ctx, type, op1_reg, op1);
5123	}
5124	if (def_reg != op1_reg) {
5125		if (op1_reg != IR_REG_NONE) {
5126			ir_emit_fp_mov(ctx, type, def_reg, op1_reg);
5127		} else {
5128			ir_emit_load(ctx, type, def_reg, op1);
5129		}
5130	}
5131	if (insn->op == IR_NEG) {
5132		if (insn->type == IR_DOUBLE) {
5133			if (!data->double_neg_const) {
5134				data->double_neg_const = 1;
5135				ir_rodata(ctx);
5136				|.align 16
5137				|->double_neg_const:
5138				|.dword 0, 0x80000000, 0, 0
5139				|.code
5140			}
5141			if (ctx->mflags & IR_X86_AVX) {
5142				|	vxorpd xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), [->double_neg_const]
5143			} else {
5144				|	xorpd xmm(def_reg-IR_REG_FP_FIRST), [->double_neg_const]
5145			}
5146		} else {
5147			IR_ASSERT(insn->type == IR_FLOAT);
5148			if (!data->float_neg_const) {
5149				data->float_neg_const = 1;
5150				ir_rodata(ctx);
5151				|.align 16
5152				|->float_neg_const:
5153				|.dword 0x80000000, 0, 0, 0
5154				|.code
5155			}
5156			if (ctx->mflags & IR_X86_AVX) {
5157				|	vxorps xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), [->float_neg_const]
5158			} else {
5159				|	xorps xmm(def_reg-IR_REG_FP_FIRST), [->float_neg_const]
5160			}
5161		}
5162	} else {
5163		IR_ASSERT(insn->op == IR_ABS);
5164		if (insn->type == IR_DOUBLE) {
5165			if (!data->double_abs_const) {
5166				data->double_abs_const = 1;
5167				ir_rodata(ctx);
5168				|.align 16
5169				|->double_abs_const:
5170				|.dword 0xffffffff, 0x7fffffff, 0, 0
5171				|.code
5172			}
5173			if (ctx->mflags & IR_X86_AVX) {
5174				|	vandpd xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), [->double_abs_const]
5175			} else {
5176				|	andpd xmm(def_reg-IR_REG_FP_FIRST), [->double_abs_const]
5177			}
5178		} else {
5179			IR_ASSERT(insn->type == IR_FLOAT);
5180			if (!data->float_abs_const) {
5181				data->float_abs_const = 1;
5182				ir_rodata(ctx);
5183				|.align 16
5184				|->float_abs_const:
5185				|.dword 0x7fffffff, 0, 0, 0
5186				|.code
5187			}
5188			if (ctx->mflags & IR_X86_AVX) {
5189				|	vandps xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), [->float_abs_const]
5190			} else {
5191				|	andps xmm(def_reg-IR_REG_FP_FIRST), [->float_abs_const]
5192			}
5193		}
5194	}
5195	if (IR_REG_SPILLED(ctx->regs[def][0])) {
5196		ir_emit_store(ctx, insn->type, def, def_reg);
5197	}
5198}
5199
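/* Emit a binary FP op using two-operand SSE2 encodings: op1 is first copied
 * into def_reg, then combined with op2 taken from a register, a constant-pool
 * label, or a (possibly fused) memory operand. */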
5200static void ir_emit_binop_sse2(ir_ctx *ctx, ir_ref def, ir_insn *insn)
5201{
5202	ir_backend_data *data = ctx->data;
5203	dasm_State **Dst = &data->dasm_state;
5204	ir_type type = insn->type;
5205	ir_ref op1 = insn->op1;
5206	ir_ref op2 = insn->op2;
5207	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
5208	ir_reg op1_reg = ctx->regs[def][1];
5209	ir_reg op2_reg = ctx->regs[def][2];
5210
5211	IR_ASSERT(def_reg != IR_REG_NONE);
5212
5213	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
5214		op1_reg = IR_REG_NUM(op1_reg);
5215		ir_emit_load(ctx, type, op1_reg, op1);
5216	}
5217	if (def_reg != op1_reg) {
5218		if (op1_reg != IR_REG_NONE) {
5219			ir_emit_fp_mov(ctx, type, def_reg, op1_reg);
5220		} else {
5221			ir_emit_load(ctx, type, def_reg, op1);
5222		}
5223		if (op1 == op2) {
5224			op2_reg = def_reg;
5225		}
5226	}
5227	if (op2_reg != IR_REG_NONE) {
5228		if (IR_REG_SPILLED(op2_reg)) {
5229			op2_reg = IR_REG_NUM(op2_reg);
5230			if (op1 != op2) {
5231				ir_emit_load(ctx, type, op2_reg, op2);
5232			}
5233		}
5234		switch (insn->op) {
5235			default:
5236				IR_ASSERT(0 && "NIY binary op");
5237			case IR_ADD:
5238				|	ASM_SSE2_REG_REG_OP adds, type, def_reg, op2_reg
5239				break;
5240			case IR_SUB:
5241				|	ASM_SSE2_REG_REG_OP subs, type, def_reg, op2_reg
5242				break;
5243			case IR_MUL:
5244				|	ASM_SSE2_REG_REG_OP muls, type, def_reg, op2_reg
5245				break;
5246			case IR_DIV:
5247				|	ASM_SSE2_REG_REG_OP divs, type, def_reg, op2_reg
5248				break;
5249			case IR_MIN:
5250				|	ASM_SSE2_REG_REG_OP mins, type, def_reg, op2_reg
5251				break;
5252			case IR_MAX:
5253				|	ASM_SSE2_REG_REG_OP maxs, type, def_reg, op2_reg
5254				break;
5255		}
5256	} else if (IR_IS_CONST_REF(op2)) {
5257		int label = ir_const_label(ctx, op2);
5258
5259		switch (insn->op) {
5260			default:
5261				IR_ASSERT(0 && "NIY binary op");
5262			case IR_ADD:
5263				|	ASM_SSE2_REG_TXT_OP adds, type, def_reg, [=>label]
5264				break;
5265			case IR_SUB:
5266				|	ASM_SSE2_REG_TXT_OP subs, type, def_reg, [=>label]
5267				break;
5268			case IR_MUL:
5269				|	ASM_SSE2_REG_TXT_OP muls, type, def_reg, [=>label]
5270				break;
5271			case IR_DIV:
5272				|	ASM_SSE2_REG_TXT_OP divs, type, def_reg, [=>label]
5273				break;
5274			case IR_MIN:
5275				|	ASM_SSE2_REG_TXT_OP mins, type, def_reg, [=>label]
5276				break;
5277			case IR_MAX:
5278				|	ASM_SSE2_REG_TXT_OP maxs, type, def_reg, [=>label]
5279				break;
5280		}
5281	} else {
5282		ir_mem mem;
5283
5284		if (ir_rule(ctx, op2) & IR_FUSED) {
5285			mem = ir_fuse_load(ctx, def, op2);
5286		} else {
5287			mem = ir_ref_spill_slot(ctx, op2);
5288		}
5289		switch (insn->op) {
5290			default:
5291				IR_ASSERT(0 && "NIY binary op");
5292			case IR_ADD:
5293				|	ASM_SSE2_REG_MEM_OP adds, type, def_reg, mem
5294				break;
5295			case IR_SUB:
5296				|	ASM_SSE2_REG_MEM_OP subs, type, def_reg, mem
5297				break;
5298			case IR_MUL:
5299				|	ASM_SSE2_REG_MEM_OP muls, type, def_reg, mem
5300				break;
5301			case IR_DIV:
5302				|	ASM_SSE2_REG_MEM_OP divs, type, def_reg, mem
5303				break;
5304			case IR_MIN:
5305				|	ASM_SSE2_REG_MEM_OP mins, type, def_reg, mem
5306				break;
5307			case IR_MAX:
5308				|	ASM_SSE2_REG_MEM_OP maxs, type, def_reg, mem
5309				break;
5310		}
5311	}
5312	if (IR_REG_SPILLED(ctx->regs[def][0])) {
5313		ir_emit_store(ctx, insn->type, def, def_reg);
5314	}
5315}
5316
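/* AVX variant of the binary FP op: the three-operand VEX encodings write
 * def_reg directly, so op1 only has to be in a register, never copied. */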
5317static void ir_emit_binop_avx(ir_ctx *ctx, ir_ref def, ir_insn *insn)
5318{
5319	ir_backend_data *data = ctx->data;
5320	dasm_State **Dst = &data->dasm_state;
5321	ir_type type = insn->type;
5322	ir_ref op1 = insn->op1;
5323	ir_ref op2 = insn->op2;
5324	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
5325	ir_reg op1_reg = ctx->regs[def][1];
5326	ir_reg op2_reg = ctx->regs[def][2];
5327
5328	IR_ASSERT(def_reg != IR_REG_NONE && op1_reg != IR_REG_NONE);
5329
5330	if (IR_REG_SPILLED(op1_reg)) {
5331		op1_reg = IR_REG_NUM(op1_reg);
5332		ir_emit_load(ctx, type, op1_reg, op1);
5333	}
5334	if (op2_reg != IR_REG_NONE) {
5335		if (IR_REG_SPILLED(op2_reg)) {
5336			op2_reg = IR_REG_NUM(op2_reg);
5337			if (op1 != op2) {
5338				ir_emit_load(ctx, type, op2_reg, op2);
5339			}
5340		}
5341		switch (insn->op) {
5342			default:
5343				IR_ASSERT(0 && "NIY binary op");
5344			case IR_ADD:
5345				|	ASM_AVX_REG_REG_REG_OP vadds, type, def_reg, op1_reg, op2_reg
5346				break;
5347			case IR_SUB:
5348				|	ASM_AVX_REG_REG_REG_OP vsubs, type, def_reg, op1_reg, op2_reg
5349				break;
5350			case IR_MUL:
5351				|	ASM_AVX_REG_REG_REG_OP vmuls, type, def_reg, op1_reg, op2_reg
5352				break;
5353			case IR_DIV:
5354				|	ASM_AVX_REG_REG_REG_OP vdivs, type, def_reg, op1_reg, op2_reg
5355				break;
5356			case IR_MIN:
5357				|	ASM_AVX_REG_REG_REG_OP vmins, type, def_reg, op1_reg, op2_reg
5358				break;
5359			case IR_MAX:
5360				|	ASM_AVX_REG_REG_REG_OP vmaxs, type, def_reg, op1_reg, op2_reg
5361				break;
5362		}
5363	} else if (IR_IS_CONST_REF(op2)) {
5364		int label = ir_const_label(ctx, op2);
5365
5366		switch (insn->op) {
5367			default:
5368				IR_ASSERT(0 && "NIY binary op");
5369			case IR_ADD:
5370				|	ASM_AVX_REG_REG_TXT_OP vadds, type, def_reg, op1_reg, [=>label]
5371				break;
5372			case IR_SUB:
5373				|	ASM_AVX_REG_REG_TXT_OP vsubs, type, def_reg, op1_reg, [=>label]
5374				break;
5375			case IR_MUL:
5376				|	ASM_AVX_REG_REG_TXT_OP vmuls, type, def_reg, op1_reg, [=>label]
5377				break;
5378			case IR_DIV:
5379				|	ASM_AVX_REG_REG_TXT_OP vdivs, type, def_reg, op1_reg, [=>label]
5380				break;
5381			case IR_MIN:
5382				|	ASM_AVX_REG_REG_TXT_OP vmins, type, def_reg, op1_reg, [=>label]
5383				break;
5384			case IR_MAX:
5385				|	ASM_AVX_REG_REG_TXT_OP vmaxs, type, def_reg, op1_reg, [=>label]
5386				break;
5387		}
5388	} else {
5389		ir_mem mem;
5390
5391		if (ir_rule(ctx, op2) & IR_FUSED) {
5392			mem = ir_fuse_load(ctx, def, op2);
5393		} else {
5394			mem = ir_ref_spill_slot(ctx, op2);
5395		}
5396		switch (insn->op) {
5397			default:
5398				IR_ASSERT(0 && "NIY binary op");
5399			case IR_ADD:
5400				|	ASM_AVX_REG_REG_MEM_OP vadds, type, def_reg, op1_reg, mem
5401				break;
5402			case IR_SUB:
5403				|	ASM_AVX_REG_REG_MEM_OP vsubs, type, def_reg, op1_reg, mem
5404				break;
5405			case IR_MUL:
5406				|	ASM_AVX_REG_REG_MEM_OP vmuls, type, def_reg, op1_reg, mem
5407				break;
5408			case IR_DIV:
5409				|	ASM_AVX_REG_REG_MEM_OP vdivs, type, def_reg, op1_reg, mem
5410				break;
5411			case IR_MIN:
5412				|	ASM_AVX_REG_REG_MEM_OP vmins, type, def_reg, op1_reg, mem
5413				break;
5414			case IR_MAX:
5415				|	ASM_AVX_REG_REG_MEM_OP vmaxs, type, def_reg, op1_reg, mem
5416				break;
5417		}
5418	}
5419	if (IR_REG_SPILLED(ctx->regs[def][0])) {
5420		ir_emit_store(ctx, insn->type, def, def_reg);
5421	}
5422}
5423
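/* Emit the "cmp" (or "test") that sets EFLAGS for an integer comparison.
 * Either operand may live in a register, an immediate or a (possibly fused)
 * memory location; comparison against a constant zero is strength-reduced
 * to "test reg, reg". */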
5424static void ir_emit_cmp_int_common(ir_ctx *ctx, ir_type type, ir_ref root, ir_insn *insn, ir_reg op1_reg, ir_ref op1, ir_reg op2_reg, ir_ref op2)
5425{
5426	ir_backend_data *data = ctx->data;
5427	dasm_State **Dst = &data->dasm_state;
5428
5429	if (op1_reg != IR_REG_NONE) {
5430		if (op2_reg != IR_REG_NONE) {
5431			|	ASM_REG_REG_OP cmp, type, op1_reg, op2_reg
5432		} else if (IR_IS_CONST_REF(op2) && !IR_IS_SYM_CONST(ctx->ir_base[op2].op) && ctx->ir_base[op2].val.u64 == 0) {
5433			|	ASM_REG_REG_OP test, type, op1_reg, op1_reg
5434		} else if (IR_IS_CONST_REF(op2)) {
5435			int32_t val = ir_fuse_imm(ctx, op2);
5436			|	ASM_REG_IMM_OP cmp, type, op1_reg, val
5437		} else {
5438			ir_mem mem;
5439
5440			if (ir_rule(ctx, op2) & IR_FUSED) {
5441				mem = ir_fuse_load(ctx, root, op2);
5442			} else {
5443				mem = ir_ref_spill_slot(ctx, op2);
5444			}
5445			|	ASM_REG_MEM_OP cmp, type, op1_reg, mem
5446		}
5447	} else if (IR_IS_CONST_REF(op1)) {
5448		IR_ASSERT(0);
5449	} else {
5450		ir_mem mem;
5451
5452		if (ir_rule(ctx, op1) & IR_FUSED) {
5453			mem = ir_fuse_load(ctx, root, op1);
5454		} else {
5455			mem = ir_ref_spill_slot(ctx, op1);
5456		}
5457		if (op2_reg != IR_REG_NONE) {
5458			|	ASM_MEM_REG_OP cmp, type, mem, op2_reg
5459		} else {
5460			int32_t val = ir_fuse_imm(ctx, op2);
5461			|	ASM_MEM_IMM_OP cmp, type, mem, val
5462		}
5463	}
5464}
5465
5466static void ir_emit_cmp_int_common2(ir_ctx *ctx, ir_ref root, ir_ref ref, ir_insn *cmp_insn)
5467{
5468	ir_type type = ctx->ir_base[cmp_insn->op1].type;
5469	ir_ref op1 = cmp_insn->op1;
5470	ir_ref op2 = cmp_insn->op2;
5471	ir_reg op1_reg = ctx->regs[ref][1];
5472	ir_reg op2_reg = ctx->regs[ref][2];
5473
5474	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
5475		op1_reg = IR_REG_NUM(op1_reg);
5476		ir_emit_load(ctx, type, op1_reg, op1);
5477	}
5478	if (op2_reg != IR_REG_NONE && IR_REG_SPILLED(op2_reg)) {
5479		op2_reg = IR_REG_NUM(op2_reg);
5480		if (op1 != op2) {
5481			ir_emit_load(ctx, type, op2_reg, op2);
5482		}
5483	}
5484
5485	ir_emit_cmp_int_common(ctx, type, root, cmp_insn, op1_reg, op1, op2_reg, op2);
5486}
5487
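/* Materialize the currently set EFLAGS condition as a 0/1 value in the low
 * byte of def_reg. */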
5488static void _ir_emit_setcc_int(ir_ctx *ctx, uint8_t op, ir_reg def_reg)
5489{
5490	ir_backend_data *data = ctx->data;
5491	dasm_State **Dst = &data->dasm_state;
5492
5493	switch (op) {
5494		default:
5495			IR_ASSERT(0 && "NIY binary op");
5496		case IR_EQ:
5497			|	sete Rb(def_reg)
5498			break;
5499		case IR_NE:
5500			|	setne Rb(def_reg)
5501			break;
5502		case IR_LT:
5503			|	setl Rb(def_reg)
5504			break;
5505		case IR_GE:
5506			|	setge Rb(def_reg)
5507			break;
5508		case IR_LE:
5509			|	setle Rb(def_reg)
5510			break;
5511		case IR_GT:
5512			|	setg Rb(def_reg)
5513			break;
5514		case IR_ULT:
5515			|	setb Rb(def_reg)
5516			break;
5517		case IR_UGE:
5518			|	setae Rb(def_reg)
5519			break;
5520		case IR_ULE:
5521			|	setbe Rb(def_reg)
5522			break;
5523		case IR_UGT:
5524			|	seta Rb(def_reg)
5525			break;
5526	}
5527}
5528
5529static void _ir_emit_setcc_int_mem(ir_ctx *ctx, uint8_t op, ir_mem mem)
5530{
5531	ir_backend_data *data = ctx->data;
5532	dasm_State **Dst = &data->dasm_state;
5533
5535	switch (op) {
5536		default:
5537			IR_ASSERT(0 && "NIY binary op");
5538		case IR_EQ:
5539			|	ASM_TMEM_OP sete, byte, mem
5540			break;
5541		case IR_NE:
5542			|	ASM_TMEM_OP setne, byte, mem
5543			break;
5544		case IR_LT:
5545			|	ASM_TMEM_OP setl, byte, mem
5546			break;
5547		case IR_GE:
5548			|	ASM_TMEM_OP setge, byte, mem
5549			break;
5550		case IR_LE:
5551			|	ASM_TMEM_OP setle, byte, mem
5552			break;
5553		case IR_GT:
5554			|	ASM_TMEM_OP setg, byte, mem
5555			break;
5556		case IR_ULT:
5557			|	ASM_TMEM_OP setb, byte, mem
5558			break;
5559		case IR_UGE:
5560			|	ASM_TMEM_OP setae, byte, mem
5561			break;
5562		case IR_ULE:
5563			|	ASM_TMEM_OP setbe, byte, mem
5564			break;
5565		case IR_UGT:
5566			|	ASM_TMEM_OP seta, byte, mem
5567			break;
5568	}
5569}
5570
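/* Compare two integers and produce a boolean result. Unsigned comparisons
 * with a constant zero are folded: "x < 0" is always false, "x >= 0" always
 * true, and "x <= 0" / "x > 0" degrade to equality tests against zero. */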
5571static void ir_emit_cmp_int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
5572{
5573	ir_backend_data *data = ctx->data;
5574	dasm_State **Dst = &data->dasm_state;
5575	ir_type type = ctx->ir_base[insn->op1].type;
5576	ir_op op = insn->op;
5577	ir_ref op1 = insn->op1;
5578	ir_ref op2 = insn->op2;
5579	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
5580	ir_reg op1_reg = ctx->regs[def][1];
5581	ir_reg op2_reg = ctx->regs[def][2];
5582
5583	IR_ASSERT(def_reg != IR_REG_NONE);
5584	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
5585		op1_reg = IR_REG_NUM(op1_reg);
5586		ir_emit_load(ctx, type, op1_reg, op1);
5587	}
5588	if (op2_reg != IR_REG_NONE && IR_REG_SPILLED(op2_reg)) {
5589		op2_reg = IR_REG_NUM(op2_reg);
5590		if (op1 != op2) {
5591			ir_emit_load(ctx, type, op2_reg, op2);
5592		}
5593	}
5594	if (IR_IS_CONST_REF(op2) && !IR_IS_SYM_CONST(ctx->ir_base[op2].op) && ctx->ir_base[op2].val.u64 == 0) {
5595		if (op == IR_ULT) {
5596			/* always false */
5597			|	xor Ra(def_reg), Ra(def_reg)
5598			if (IR_REG_SPILLED(ctx->regs[def][0])) {
5599				ir_emit_store(ctx, insn->type, def, def_reg);
5600			}
5601			return;
5602		} else if (op == IR_UGE) {
5603			/* always true */
5604			|	ASM_REG_IMM_OP mov, insn->type, def_reg, 1
5605			if (IR_REG_SPILLED(ctx->regs[def][0])) {
5606				ir_emit_store(ctx, insn->type, def, def_reg);
5607			}
5608			return;
5609		} else if (op == IR_ULE) {
5610			op = IR_EQ;
5611		} else if (op == IR_UGT) {
5612			op = IR_NE;
5613		}
5614	}
5615	ir_emit_cmp_int_common(ctx, type, def, insn, op1_reg, op1, op2_reg, op2);
5616	_ir_emit_setcc_int(ctx, op, def_reg);
5617	if (IR_REG_SPILLED(ctx->regs[def][0])) {
5618		ir_emit_store(ctx, insn->type, def, def_reg);
5619	}
5620}
5621
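/* Emit "test" for a fused IR_AND whose result is only checked against zero.
 * For IR_EQ/IR_NE the masks 0xff, 0xff00, 0xffff and -1 are narrowed to the
 * corresponding partial-register forms. */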
5622static void ir_emit_test_int_common(ir_ctx *ctx, ir_ref root, ir_ref ref, ir_op op)
5623{
5624	ir_backend_data *data = ctx->data;
5625	dasm_State **Dst = &data->dasm_state;
5626	ir_insn *binop_insn = &ctx->ir_base[ref];
5627	ir_type type = binop_insn->type;
5628	ir_ref op1 = binop_insn->op1;
5629	ir_ref op2 = binop_insn->op2;
5630	ir_reg op1_reg = ctx->regs[ref][1];
5631	ir_reg op2_reg = ctx->regs[ref][2];
5632
5633	IR_ASSERT(binop_insn->op == IR_AND);
5634	if (op1_reg != IR_REG_NONE) {
5635		if (IR_REG_SPILLED(op1_reg)) {
5636			op1_reg = IR_REG_NUM(op1_reg);
5637			ir_emit_load(ctx, type, op1_reg, op1);
5638		}
5639		if (op2_reg != IR_REG_NONE) {
5640			if (IR_REG_SPILLED(op2_reg)) {
5641				op2_reg = IR_REG_NUM(op2_reg);
5642				if (op1 != op2) {
5643					ir_emit_load(ctx, type, op2_reg, op2);
5644				}
5645			}
5646			|	ASM_REG_REG_OP test, type, op1_reg, op2_reg
5647		} else if (IR_IS_CONST_REF(op2)) {
5648			int32_t val = ir_fuse_imm(ctx, op2);
5649
5650			if ((op == IR_EQ || op == IR_NE) && val == 0xff && (sizeof(void*) == 8 || op1_reg <= IR_REG_R3)) {
5651				|	test Rb(op1_reg), Rb(op1_reg)
5652			} else if ((op == IR_EQ || op == IR_NE) && val == 0xff00 && op1_reg <= IR_REG_R3) {
5653				if (op1_reg == IR_REG_RAX) {
5654					|	test ah, ah
5655				} else if (op1_reg == IR_REG_RBX) {
5656					|	test bh, bh
5657				} else if (op1_reg == IR_REG_RCX) {
5658					|	test ch, ch
5659				} else if (op1_reg == IR_REG_RDX) {
5660					|	test dh, dh
5661				} else {
5662					IR_ASSERT(0);
5663				}
5664			} else if ((op == IR_EQ || op == IR_NE) && val == 0xffff) {
5665				|	test Rw(op1_reg), Rw(op1_reg)
5666			} else if ((op == IR_EQ || op == IR_NE) && val == -1) {
5667				|	test Rd(op1_reg), Rd(op1_reg)
5668			} else {
5669				|	ASM_REG_IMM_OP test, type, op1_reg, val
5670			}
5671		} else {
5672			ir_mem mem;
5673
5674			if (ir_rule(ctx, op2) & IR_FUSED) {
5675				mem = ir_fuse_load(ctx, root, op2);
5676			} else {
5677				mem = ir_ref_spill_slot(ctx, op2);
5678			}
5679			|	ASM_REG_MEM_OP test, type, op1_reg, mem
5680		}
5681	} else if (IR_IS_CONST_REF(op1)) {
5682		IR_ASSERT(0);
5683	} else {
5684		ir_mem mem;
5685
5686		if (ir_rule(ctx, op1) & IR_FUSED) {
5687			mem = ir_fuse_load(ctx, root, op1);
5688		} else {
5689			mem = ir_ref_spill_slot(ctx, op1);
5690		}
5691		if (op2_reg != IR_REG_NONE) {
5692			if (IR_REG_SPILLED(op2_reg)) {
5693				op2_reg = IR_REG_NUM(op2_reg);
5694				if (op1 != op2) {
5695					ir_emit_load(ctx, type, op2_reg, op2);
5696				}
5697			}
5698			|	ASM_MEM_REG_OP test, type, mem, op2_reg
5699		} else {
5700			IR_ASSERT(!IR_IS_CONST_REF(op1));
5701			int32_t val = ir_fuse_imm(ctx, op2);
5702			|	ASM_MEM_IMM_OP test, type, mem, val
5703		}
5704	}
5705}
5706
5707static void ir_emit_testcc_int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
5708{
5709	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
5710
5711	IR_ASSERT(def_reg != IR_REG_NONE);
5712	ir_emit_test_int_common(ctx, def, insn->op1, insn->op);
5713	_ir_emit_setcc_int(ctx, insn->op, def_reg);
5714	if (IR_REG_SPILLED(ctx->regs[def][0])) {
5715		ir_emit_store(ctx, insn->type, def, def_reg);
5716	}
5717}
5718
5719static void ir_emit_setcc_int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
5720{
5721	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
5722
5723	IR_ASSERT(def_reg != IR_REG_NONE);
5724	_ir_emit_setcc_int(ctx, insn->op, def_reg);
5725	if (IR_REG_SPILLED(ctx->regs[def][0])) {
5726		ir_emit_store(ctx, insn->type, def, def_reg);
5727	}
5728}
5729
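/* Emit "ucomiss"/"ucomisd" for an FP comparison. For IR_EQ/IR_NE the operands
 * may be swapped to get op1 into a register; the (possibly swapped) predicate
 * is returned so callers can pick condition codes that handle the unordered
 * (NaN) case. */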
5730static ir_op ir_emit_cmp_fp_common(ir_ctx *ctx, ir_ref root, ir_ref cmp_ref, ir_insn *cmp_insn)
5731{
5732	ir_backend_data *data = ctx->data;
5733	dasm_State **Dst = &data->dasm_state;
5734	ir_type type = ctx->ir_base[cmp_insn->op1].type;
5735	ir_op op = cmp_insn->op;
5736	ir_ref op1, op2;
5737	ir_reg op1_reg, op2_reg;
5738
5739	op1 = cmp_insn->op1;
5740	op2 = cmp_insn->op2;
5741	op1_reg = ctx->regs[cmp_ref][1];
5742	op2_reg = ctx->regs[cmp_ref][2];
5743
5744	if (op1_reg == IR_REG_NONE && op2_reg != IR_REG_NONE && (op == IR_EQ || op == IR_NE)) {
5745		ir_reg tmp_reg;
5746
5747		SWAP_REFS(op1, op2);
5748		tmp_reg = op1_reg;
5749		op1_reg = op2_reg;
5750		op2_reg = tmp_reg;
5751	}
5752
5754	IR_ASSERT(op1_reg != IR_REG_NONE);
5755	if (IR_REG_SPILLED(op1_reg)) {
5756		op1_reg = IR_REG_NUM(op1_reg);
5757		ir_emit_load(ctx, type, op1_reg, op1);
5758	}
5759	if (op2_reg != IR_REG_NONE) {
5760		if (IR_REG_SPILLED(op2_reg)) {
5761			op2_reg = IR_REG_NUM(op2_reg);
5762			if (op1 != op2) {
5763				ir_emit_load(ctx, type, op2_reg, op2);
5764			}
5765		}
5766		|	ASM_FP_REG_REG_OP ucomis, type, op1_reg, op2_reg
5767	} else if (IR_IS_CONST_REF(op2)) {
5768		int label = ir_const_label(ctx, op2);
5769
5770		|	ASM_FP_REG_TXT_OP ucomis, type, op1_reg, [=>label]
5771	} else {
5772		ir_mem mem;
5773
5774		if (ir_rule(ctx, op2) & IR_FUSED) {
5775			mem = ir_fuse_load(ctx, root, op2);
5776		} else {
5777			mem = ir_ref_spill_slot(ctx, op2);
5778		}
5779		|	ASM_FP_REG_MEM_OP ucomis, type, op1_reg, mem
5780	}
5781	return op;
5782}
5783
5784static void ir_emit_cmp_fp(ir_ctx *ctx, ir_ref def, ir_insn *insn)
5785{
5786	ir_backend_data *data = ctx->data;
5787	dasm_State **Dst = &data->dasm_state;
5788	ir_op op = ir_emit_cmp_fp_common(ctx, def, def, insn);
5789	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
5790	ir_reg tmp_reg = ctx->regs[def][3];
5791
5792	IR_ASSERT(def_reg != IR_REG_NONE);
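	/* ucomis reports an unordered (NaN) result via PF, so several predicates
	 * combine a parity-based setcc with a cmov on the comparison flags (or
	 * vice versa) to give NaN the correct answer. */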
5793	switch (op) {
5794		default:
5795			IR_ASSERT(0 && "NIY binary op");
5796		case IR_EQ:
5797			|	setnp Rb(def_reg)
5798			|	mov Rd(tmp_reg), 0
5799			|	cmovne Rd(def_reg), Rd(tmp_reg)
5800			break;
5801		case IR_NE:
5802			|	setp Rb(def_reg)
5803			|	mov Rd(tmp_reg), 1
5804			|	cmovne Rd(def_reg), Rd(tmp_reg)
5805			break;
5806		case IR_LT:
5807			|	setnp Rb(def_reg)
5808			|	mov Rd(tmp_reg), 0
5809			|	cmovae Rd(def_reg), Rd(tmp_reg)
5810			break;
5811		case IR_GE:
5812			|	setae Rb(def_reg)
5813			break;
5814		case IR_LE:
5815			|	setnp Rb(def_reg)
5816			|	mov Rd(tmp_reg), 0
5817			|	cmova Rd(def_reg), Rd(tmp_reg)
5818			break;
5819		case IR_GT:
5820			|	seta Rb(def_reg)
5821			break;
5822		case IR_ULT:
5823			|	setb Rb(def_reg)
5824			break;
5825		case IR_UGE:
5826			|	setp Rb(def_reg)
5827			|	mov Rd(tmp_reg), 1
5828			|	cmovae Rd(def_reg), Rd(tmp_reg)
5829			break;
5830		case IR_ULE:
5831			|	setbe Rb(def_reg)
5832			break;
5833		case IR_UGT:
5834			|	setp Rb(def_reg)
5835			|	mov Rd(tmp_reg), 1
5836			|	cmova Rd(def_reg), Rd(tmp_reg)
5837			break;
5838	}
5839	if (IR_REG_SPILLED(ctx->regs[def][0])) {
5840		ir_emit_store(ctx, insn->type, def, def_reg);
5841	}
5842}
5843
5844static void ir_emit_jmp_true(ir_ctx *ctx, uint32_t b, ir_ref def, uint32_t next_block)
5845{
5846	uint32_t true_block, false_block;
5847	ir_backend_data *data = ctx->data;
5848	dasm_State **Dst = &data->dasm_state;
5849
5850	ir_get_true_false_blocks(ctx, b, &true_block, &false_block);
5851	if (true_block != next_block) {
5852		|	jmp =>true_block
5853	}
5854}
5855
5856static void ir_emit_jmp_false(ir_ctx *ctx, uint32_t b, ir_ref def, uint32_t next_block)
5857{
5858	uint32_t true_block, false_block;
5859	ir_backend_data *data = ctx->data;
5860	dasm_State **Dst = &data->dasm_state;
5861
5862	ir_get_true_false_blocks(ctx, b, &true_block, &false_block);
5863	if (false_block != next_block) {
5864		|	jmp =>false_block
5865	}
5866}
5867
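/* Emit the conditional branch(es) for a comparison. When the true target is
 * the fall-through block, the predicate is reversed to avoid an unconditional
 * JMP; FP predicates also need extra "jp" branches for unordered results. */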
5868static void ir_emit_jcc(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn, uint32_t next_block, uint8_t op, bool int_cmp)
5869{
5870	uint32_t true_block, false_block;
5871	ir_backend_data *data = ctx->data;
5872	dasm_State **Dst = &data->dasm_state;
5873
5874	ir_get_true_false_blocks(ctx, b, &true_block, &false_block);
5875	if (true_block == next_block) {
5876		/* swap to avoid unconditional JMP */
5877		if (int_cmp || op == IR_EQ || op == IR_NE) {
5878			op ^= 1; // reverse
5879		} else {
5880			op ^= 5; // reverse
5881		}
5882		true_block = false_block;
5883		false_block = 0;
5884	} else if (false_block == next_block) {
5885		false_block = 0;
5886	}
5887
5888	if (int_cmp) {
5889		switch (op) {
5890			default:
5891				IR_ASSERT(0 && "NIY binary op");
5892			case IR_EQ:
5893				|	je =>true_block
5894				break;
5895			case IR_NE:
5896				|	jne =>true_block
5897				break;
5898			case IR_LT:
5899				|	jl =>true_block
5900				break;
5901			case IR_GE:
5902				|	jge =>true_block
5903				break;
5904			case IR_LE:
5905				|	jle =>true_block
5906				break;
5907			case IR_GT:
5908				|	jg =>true_block
5909				break;
5910			case IR_ULT:
5911				|	jb =>true_block
5912				break;
5913			case IR_UGE:
5914				|	jae =>true_block
5915				break;
5916			case IR_ULE:
5917				|	jbe =>true_block
5918				break;
5919			case IR_UGT:
5920				|	ja =>true_block
5921				break;
5922		}
5923	} else {
5924		switch (op) {
5925			default:
5926				IR_ASSERT(0 && "NIY binary op");
5927			case IR_EQ:
5928				if (!false_block) {
5929					|	jp >1
5930					|	je =>true_block
5931					|1:
5932				} else {
5933					|	jp =>false_block
5934					|	je =>true_block
5935				}
5936				break;
5937			case IR_NE:
5938				|	jne =>true_block
5939				|	jp =>true_block
5940				break;
5941			case IR_LT:
5942				if (!false_block) {
5943					|	jp >1
5944					|	jb =>true_block
5945					|1:
5946				} else {
5947					|	jp =>false_block
5948					|	jb =>true_block
5949				}
5950				break;
5951			case IR_GE:
5952				|	jae =>true_block
5953				break;
5954			case IR_LE:
5955				if (!false_block) {
5956					|	jp >1
5957					|	jbe =>true_block
5958					|1:
5959				} else {
5960					|	jp =>false_block
5961					|	jbe =>true_block
5962				}
5963				break;
5964			case IR_GT:
5965				|	ja =>true_block
5966				break;
5967			case IR_ULT:
5968				|	jb =>true_block
5969				break;
5970			case IR_UGE:
5971				|	jp =>true_block
5972				|	jae =>true_block
5973				break;
5974			case IR_ULE:
5975				|	jbe =>true_block
5976				break;
5977			case IR_UGT:
5978				|	jp =>true_block
5979				|	ja =>true_block
5980				break;
5981		}
5982	}
5983	if (false_block) {
5984		|	jmp =>false_block
5985	}
5986}
5987
5988static void ir_emit_cmp_and_branch_int(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn, uint32_t next_block)
5989{
5990	ir_insn *cmp_insn = &ctx->ir_base[insn->op2];
5991	ir_op op = cmp_insn->op;
5992	ir_type type = ctx->ir_base[cmp_insn->op1].type;
5993	ir_ref op1 = cmp_insn->op1;
5994	ir_ref op2 = cmp_insn->op2;
5995	ir_reg op1_reg = ctx->regs[insn->op2][1];
5996	ir_reg op2_reg = ctx->regs[insn->op2][2];
5997
5998	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
5999		op1_reg = IR_REG_NUM(op1_reg);
6000		ir_emit_load(ctx, type, op1_reg, op1);
6001	}
6002	if (op2_reg != IR_REG_NONE && IR_REG_SPILLED(op2_reg)) {
6003		op2_reg = IR_REG_NUM(op2_reg);
6004		if (op1 != op2) {
6005			ir_emit_load(ctx, type, op2_reg, op2);
6006		}
6007	}
6008	if (IR_IS_CONST_REF(op2) && !IR_IS_SYM_CONST(ctx->ir_base[op2].op) && ctx->ir_base[op2].val.u64 == 0) {
6009		if (op == IR_ULT) {
6010			/* always false */
6011			ir_emit_jmp_false(ctx, b, def, next_block);
6012			return;
6013		} else if (op == IR_UGE) {
6014			/* always true */
6015			ir_emit_jmp_true(ctx, b, def, next_block);
6016			return;
6017		} else if (op == IR_ULE) {
6018			op = IR_EQ;
6019		} else if (op == IR_UGT) {
6020			op = IR_NE;
6021		}
6022	}
6023
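	/* If this block was entered from an IF that branched on an identical
	 * comparison, the flags it set are assumed to still be live and the
	 * "cmp" can be skipped. */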
6024	bool same_comparison = false;
6025	ir_insn *prev_insn = &ctx->ir_base[insn->op1];
6026	if (prev_insn->op == IR_IF_TRUE || prev_insn->op == IR_IF_FALSE) {
6027		if (ir_rule(ctx, prev_insn->op1) == IR_CMP_AND_BRANCH_INT) {
6028			prev_insn = &ctx->ir_base[prev_insn->op1];
6029			prev_insn = &ctx->ir_base[prev_insn->op2];
6030			if (prev_insn->op1 == cmp_insn->op1 && prev_insn->op2 == cmp_insn->op2) {
6031				same_comparison = true;
6032			}
6033		}
6034	}
6035	if (!same_comparison) {
6036		ir_emit_cmp_int_common(ctx, type, def, cmp_insn, op1_reg, op1, op2_reg, op2);
6037	}
6038	ir_emit_jcc(ctx, b, def, insn, next_block, op, 1);
6039}
6040
6041static void ir_emit_test_and_branch_int(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn, uint32_t next_block)
6042{
6043	ir_ref op2 = insn->op2;
6044	ir_op op = ctx->ir_base[op2].op;
6045
6046	if (op >= IR_EQ && op <= IR_UGT) {
6047		op2 = ctx->ir_base[op2].op1;
6048	} else {
6049		IR_ASSERT(op == IR_AND);
6050		op = IR_NE;
6051	}
6052
6053	ir_emit_test_int_common(ctx, def, op2, op);
6054	ir_emit_jcc(ctx, b, def, insn, next_block, op, 1);
6055}
6056
6057static void ir_emit_cmp_and_branch_fp(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn, uint32_t next_block)
6058{
6059	ir_op op = ir_emit_cmp_fp_common(ctx, def, insn->op2, &ctx->ir_base[insn->op2]);
6060	ir_emit_jcc(ctx, b, def, insn, next_block, op, 0);
6061}
6062
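/* Emit an IF over an integer value: "test reg, reg" or "cmp mem, 0" followed
 * by a JNE to the true block. Constant conditions and IR_STATIC_ALLOCA
 * addresses (always non-zero) reduce to a direct jump. */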
6063static void ir_emit_if_int(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn, uint32_t next_block)
6064{
6065	ir_type type = ctx->ir_base[insn->op2].type;
6066	ir_reg op2_reg = ctx->regs[def][2];
6067	ir_backend_data *data = ctx->data;
6068	dasm_State **Dst = &data->dasm_state;
6069
6070	if (op2_reg != IR_REG_NONE) {
6071		if (IR_REG_SPILLED(op2_reg)) {
6072			op2_reg = IR_REG_NUM(op2_reg);
6073			ir_emit_load(ctx, type, op2_reg, insn->op2);
6074		}
6075		|	ASM_REG_REG_OP test, type, op2_reg, op2_reg
6076	} else if (IR_IS_CONST_REF(insn->op2)) {
6077		uint32_t true_block, false_block;
6078
6079		ir_get_true_false_blocks(ctx, b, &true_block, &false_block);
6080		if (ir_const_is_true(&ctx->ir_base[insn->op2])) {
6081			if (true_block != next_block) {
6082				|	jmp =>true_block
6083			}
6084		} else {
6085			if (false_block != next_block) {
6086				|	jmp =>false_block
6087			}
6088		}
6089		return;
6090	} else if (ir_rule(ctx, insn->op2) == IR_STATIC_ALLOCA) {
6091		uint32_t true_block, false_block;
6092
6093		ir_get_true_false_blocks(ctx, b, &true_block, &false_block);
6094		if (true_block != next_block) {
6095			|	jmp =>true_block
6096		}
6097		return;
6098	} else {
6099		ir_mem mem;
6100
6101		if (ir_rule(ctx, insn->op2) & IR_FUSED) {
6102			mem = ir_fuse_load(ctx, def, insn->op2);
6103		} else {
6104			mem = ir_ref_spill_slot(ctx, insn->op2);
6105		}
6106		|	ASM_MEM_IMM_OP cmp, type, mem, 0
6107	}
6108	ir_emit_jcc(ctx, b, def, insn, next_block, IR_NE, 1);
6109}
6110
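/* Emit IR_COND (op1 ? op2 : op3). Integer results selected on an integer
 * condition use cmov; otherwise the value is chosen with compare-and-branch
 * sequences, including a "jp" guard when the FP condition may be NaN. */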
6111static void ir_emit_cond(ir_ctx *ctx, ir_ref def, ir_insn *insn)
6112{
6113	ir_backend_data *data = ctx->data;
6114	dasm_State **Dst = &data->dasm_state;
6115	ir_type type = insn->type;
6116	ir_ref op1 = insn->op1;
6117	ir_ref op2 = insn->op2;
6118	ir_ref op3 = insn->op3;
6119	ir_type op1_type = ctx->ir_base[op1].type;
6120	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
6121	ir_reg op1_reg = ctx->regs[def][1];
6122	ir_reg op2_reg = ctx->regs[def][2];
6123	ir_reg op3_reg = ctx->regs[def][3];
6124
6125	IR_ASSERT(def_reg != IR_REG_NONE);
6126
6127	if (op2_reg != IR_REG_NONE && IR_REG_SPILLED(op2_reg)) {
6128		op2_reg = IR_REG_NUM(op2_reg);
6129		ir_emit_load(ctx, type, op2_reg, op2);
6130		if (op1 == op2) {
6131			op1_reg = op2_reg;
6132		}
6133		if (op3 == op2) {
6134			op3_reg = op2_reg;
6135		}
6136	}
6137	if (op3_reg != IR_REG_NONE && op3 != op2 && IR_REG_SPILLED(op3_reg)) {
6138		op3_reg = IR_REG_NUM(op3_reg);
6139		ir_emit_load(ctx, type, op3_reg, op3);
6140		if (op1 == op3) {
6141			op1_reg = op3_reg;
6142		}
6143	}
6144	if (op1_reg != IR_REG_NONE && op1 != op2 && op1 != op3 && IR_REG_SPILLED(op1_reg)) {
6145		op1_reg = IR_REG_NUM(op1_reg);
6146		ir_emit_load(ctx, op1_type, op1_reg, op1);
6147	}
6148
6149	if (IR_IS_TYPE_INT(op1_type)) {
6150		if (op1_reg != IR_REG_NONE) {
6151			|	ASM_REG_REG_OP test, op1_type, op1_reg, op1_reg
6152		} else {
6153			ir_mem mem = ir_ref_spill_slot(ctx, op1);
6154
6155			|	ASM_MEM_IMM_OP cmp, op1_type, mem, 0
6156		}
6157		if (IR_IS_TYPE_INT(type)) {
6158			IR_ASSERT(op2_reg != IR_REG_NONE || op3_reg != IR_REG_NONE);
6159			if (op3_reg != IR_REG_NONE) {
6160				if (op3_reg == def_reg) {
6161					IR_ASSERT(op2_reg != IR_REG_NONE);
6162					|	ASM_REG_REG_OP2 cmovne, type, def_reg, op2_reg
6163				} else {
6164					if (op2_reg != IR_REG_NONE) {
6165						if (def_reg != op2_reg) {
6166							if (IR_IS_TYPE_INT(type)) {
6167								ir_emit_mov(ctx, type, def_reg, op2_reg);
6168							} else {
6169								ir_emit_fp_mov(ctx, type, def_reg, op2_reg);
6170							}
6171						}
6172					} else if (IR_IS_CONST_REF(op2) && !IR_IS_SYM_CONST(ctx->ir_base[op2].op)) {
6173						/* prevent "xor" and flags clobbering */
6174						ir_emit_mov_imm_int(ctx, type, def_reg, ctx->ir_base[op2].val.i64);
6175					} else {
6176						ir_emit_load_ex(ctx, type, def_reg, op2, def);
6177					}
6178					|	ASM_REG_REG_OP2 cmove, type, def_reg, op3_reg
6179				}
6180			} else {
6181				IR_ASSERT(op2_reg != IR_REG_NONE && op2_reg != def_reg);
6182				if (IR_IS_CONST_REF(op3) && !IR_IS_SYM_CONST(ctx->ir_base[op3].op)) {
6183					/* prevent "xor" and flags clobbering */
6184					ir_emit_mov_imm_int(ctx, type, def_reg, ctx->ir_base[op3].val.i64);
6185				} else {
6186					ir_emit_load_ex(ctx, type, def_reg, op3, def);
6187				}
6188				|	ASM_REG_REG_OP2 cmovne, type, def_reg, op2_reg
6189			}
6190
6191			if (IR_REG_SPILLED(ctx->regs[def][0])) {
6192				ir_emit_store(ctx, type, def, def_reg);
6193			}
6194			return;
6195		}
6196		|	je >2
6197	} else {
6198		if (!data->double_zero_const) {
6199			data->double_zero_const = 1;
6200			ir_rodata(ctx);
6201			|.align 16
6202			|->double_zero_const:
6203			|.dword 0, 0
6204			|.code
6205		}
6206		|	ASM_FP_REG_TXT_OP ucomis, op1_type, op1_reg, [->double_zero_const]
6207		|	jp >1
6208		|	je >2
6209		|1:
6210	}
6211
6212	if (op2_reg != IR_REG_NONE) {
6213		if (def_reg != op2_reg) {
6214			if (IR_IS_TYPE_INT(type)) {
6215				ir_emit_mov(ctx, type, def_reg, op2_reg);
6216			} else {
6217				ir_emit_fp_mov(ctx, type, def_reg, op2_reg);
6218			}
6219		}
6220	} else {
6221		ir_emit_load_ex(ctx, type, def_reg, op2, def);
6222	}
6223	|	jmp >3
6224	|2:
6225	if (op3_reg != IR_REG_NONE) {
6226		if (def_reg != op3_reg) {
6227			if (IR_IS_TYPE_INT(type)) {
6228				ir_emit_mov(ctx, type, def_reg, op3_reg);
6229			} else {
6230				ir_emit_fp_mov(ctx, type, def_reg, op3_reg);
6231			}
6232		}
6233	} else {
6234		ir_emit_load_ex(ctx, type, def_reg, op3, def);
6235	}
6236	|3:
6237
6238	if (IR_REG_SPILLED(ctx->regs[def][0])) {
6239		ir_emit_store(ctx, type, def, def_reg);
6240	}
6241}
6242
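/* IR_COND fused with an integer comparison: the comparison sets EFLAGS, one
 * arm is preloaded into def_reg, and the other is moved in with a cmov on the
 * reversed predicate (FP-typed results fall back to branches). */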
6243static void ir_emit_cond_cmp_int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
6244{
6245	ir_backend_data *data = ctx->data;
6246	dasm_State **Dst = &data->dasm_state;
6247	ir_type type = insn->type;
6248	ir_ref op2 = insn->op2;
6249	ir_ref op3 = insn->op3;
6250	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
6251	ir_reg op2_reg = ctx->regs[def][2];
6252	ir_reg op3_reg = ctx->regs[def][3];
6253	ir_op op;
6254
6255	if (op2_reg != IR_REG_NONE && IR_REG_SPILLED(op2_reg)) {
6256		op2_reg = IR_REG_NUM(op2_reg);
6257		ir_emit_load(ctx, type, op2_reg, op2);
6258		if (op3 == op2) {
6259			op3_reg = op2_reg;
6260		}
6261	}
6262	if (op3_reg != IR_REG_NONE && op3 != op2 && IR_REG_SPILLED(op3_reg)) {
6263		op3_reg = IR_REG_NUM(op3_reg);
6264		ir_emit_load(ctx, type, op3_reg, op3);
6265	}
6266
6267	ir_emit_cmp_int_common2(ctx, def, insn->op1, &ctx->ir_base[insn->op1]);
6268	op = ctx->ir_base[insn->op1].op;
6269
6270	if (IR_IS_TYPE_INT(type)) {
6271		if (op3_reg != IR_REG_NONE) {
6272			if (op3_reg == def_reg) {
6273				IR_ASSERT(op2_reg != IR_REG_NONE);
6274				op3_reg = op2_reg;
6275				op ^= 1; // reverse
6276			} else {
6277				if (op2_reg != IR_REG_NONE) {
6278					if (def_reg != op2_reg) {
6279						/* "type" is integer in this branch, so a plain mov suffices */
6280						ir_emit_mov(ctx, type, def_reg, op2_reg);
6284					}
6285				} else if (IR_IS_CONST_REF(op2) && !IR_IS_SYM_CONST(ctx->ir_base[op2].op)) {
6286					/* prevent "xor" and flags clobbering */
6287					ir_emit_mov_imm_int(ctx, type, def_reg, ctx->ir_base[op2].val.i64);
6288				} else {
6289					ir_emit_load_ex(ctx, type, def_reg, op2, def);
6290				}
6291			}
6292		} else {
6293			IR_ASSERT(op2_reg != IR_REG_NONE && op2_reg != def_reg);
6294			if (IR_IS_CONST_REF(op3) && !IR_IS_SYM_CONST(ctx->ir_base[op3].op)) {
6295				/* prevent "xor" and flags clobbering */
6296				ir_emit_mov_imm_int(ctx, type, def_reg, ctx->ir_base[op3].val.i64);
6297			} else {
6298				ir_emit_load_ex(ctx, type, def_reg, op3, def);
6299			}
6300			op3_reg = op2_reg;
6301			op ^= 1; // reverse
6302		}
6303
6304		switch (op) {
6305			default:
6306				IR_ASSERT(0 && "NIY binary op");
6307			case IR_EQ:
6308				|	ASM_REG_REG_OP2 cmovne, type, def_reg, op3_reg
6309				break;
6310			case IR_NE:
6311				|	ASM_REG_REG_OP2 cmove, type, def_reg, op3_reg
6312				break;
6313			case IR_LT:
6314				|	ASM_REG_REG_OP2 cmovge, type, def_reg, op3_reg
6315				break;
6316			case IR_GE:
6317				|	ASM_REG_REG_OP2 cmovl, type, def_reg, op3_reg
6318				break;
6319			case IR_LE:
6320				|	ASM_REG_REG_OP2 cmovg, type, def_reg, op3_reg
6321				break;
6322			case IR_GT:
6323				|	ASM_REG_REG_OP2 cmovle, type, def_reg, op3_reg
6324				break;
6325			case IR_ULT:
6326				|	ASM_REG_REG_OP2 cmovae, type, def_reg, op3_reg
6327				break;
6328			case IR_UGE:
6329				|	ASM_REG_REG_OP2 cmovb, type, def_reg, op3_reg
6330				break;
6331			case IR_ULE:
6332				|	ASM_REG_REG_OP2 cmova, type, def_reg, op3_reg
6333				break;
6334			case IR_UGT:
6335				|	ASM_REG_REG_OP2 cmovbe, type, def_reg, op3_reg
6336				break;
6337		}
6338	} else {
6339		switch (op) {
6340			default:
6341				IR_ASSERT(0 && "NIY binary op");
6342			case IR_EQ:
6343				|	jne >2
6344				break;
6345			case IR_NE:
6346				|	je >2
6347				break;
6348			case IR_LT:
6349				|	jge >2
6350				break;
6351			case IR_GE:
6352				|	jl >2
6353				break;
6354			case IR_LE:
6355				|	jg >2
6356				break;
6357			case IR_GT:
6358				|	jle >2
6359				break;
6360			case IR_ULT:
6361				|	jae >2
6362				break;
6363			case IR_UGE:
6364				|	jb >2
6365				break;
6366			case IR_ULE:
6367				|	ja >2
6368				break;
6369			case IR_UGT:
6370				|	jbe >2
6371				break;
6372		}
6373		|1:
6374
6375		if (op2_reg != IR_REG_NONE) {
6376			if (def_reg != op2_reg) {
6377				if (IR_IS_TYPE_INT(type)) {
6378					ir_emit_mov(ctx, type, def_reg, op2_reg);
6379				} else {
6380					ir_emit_fp_mov(ctx, type, def_reg, op2_reg);
6381				}
6382			}
6383		} else {
6384			ir_emit_load_ex(ctx, type, def_reg, op2, def);
6385		}
6386		|	jmp >3
6387		|2:
6388		if (op3_reg != IR_REG_NONE) {
6389			if (def_reg != op3_reg) {
6390				if (IR_IS_TYPE_INT(type)) {
6391					ir_emit_mov(ctx, type, def_reg, op3_reg);
6392				} else {
6393					ir_emit_fp_mov(ctx, type, def_reg, op3_reg);
6394				}
6395			}
6396		} else {
6397			ir_emit_load_ex(ctx, type, def_reg, op3, def);
6398		}
6399		|3:
6400	}
6401
6402	if (IR_REG_SPILLED(ctx->regs[def][0])) {
6403		ir_emit_store(ctx, type, def, def_reg);
6404	}
6405}
6406
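/* IR_COND fused with an FP comparison: branch on the ucomis flags (including
 * "jp" for NaN) to load either op2 or op3 into the result register. */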
6407static void ir_emit_cond_cmp_fp(ir_ctx *ctx, ir_ref def, ir_insn *insn)
6408{
6409	ir_backend_data *data = ctx->data;
6410	dasm_State **Dst = &data->dasm_state;
6411	ir_type type = insn->type;
6412	ir_ref op2 = insn->op2;
6413	ir_ref op3 = insn->op3;
6414	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
6415	ir_reg op2_reg = ctx->regs[def][2];
6416	ir_reg op3_reg = ctx->regs[def][3];
6417	ir_op op;
6418
6419	if (op2_reg != IR_REG_NONE && IR_REG_SPILLED(op2_reg)) {
6420		op2_reg = IR_REG_NUM(op2_reg);
6421		ir_emit_load(ctx, type, op2_reg, op2);
6422		if (op3 == op2) {
6423			op3_reg = op2_reg;
6424		}
6425	}
6426	if (op3_reg != IR_REG_NONE && op3 != op2 && IR_REG_SPILLED(op3_reg)) {
6427		op3_reg = IR_REG_NUM(op3_reg);
6428		ir_emit_load(ctx, type, op3_reg, op3);
6429	}
6430
6431	op = ir_emit_cmp_fp_common(ctx, def, insn->op1, &ctx->ir_base[insn->op1]);
6432
6433	switch (op) {
6434		default:
6435			IR_ASSERT(0 && "NIY binary op");
6436		case IR_EQ:
6437			|	jne >2
6438			|	jp >2
6439			break;
6440		case IR_NE:
6441			|	jp >1
6442			|	je >2
6443			break;
6444		case IR_LT:
6445			|	jp >2
6446			|	jae >2
6447			break;
6448		case IR_GE:
6449			|	jb >2
6450			break;
6451		case IR_LE:
6452			|	jp >2
6453			|	ja >2
6454			break;
6455		case IR_GT:
6456			|	jbe >2
6457			break;
6458		case IR_ULT:
6459			|	jae >2
6460			break;
6461		case IR_UGE:
6462			|	jp >1
6463			|	jb >2
6464			break;
6465		case IR_ULE:
6466			|	ja >2
6467			break;
6468		case IR_UGT:
6469			|	jp >1
6470			|	jbe >2
6471			break;
6472	}
6473	|1:
6474
6475	if (op2_reg != IR_REG_NONE) {
6476		if (def_reg != op2_reg) {
6477			if (IR_IS_TYPE_INT(type)) {
6478				ir_emit_mov(ctx, type, def_reg, op2_reg);
6479			} else {
6480				ir_emit_fp_mov(ctx, type, def_reg, op2_reg);
6481			}
6482		}
6483	} else {
6484		ir_emit_load_ex(ctx, type, def_reg, op2, def);
6485	}
6486	|	jmp >3
6487	|2:
6488	if (op3_reg != IR_REG_NONE) {
6489		if (def_reg != op3_reg) {
6490			if (IR_IS_TYPE_INT(type)) {
6491				ir_emit_mov(ctx, type, def_reg, op3_reg);
6492			} else {
6493				ir_emit_fp_mov(ctx, type, def_reg, op3_reg);
6494			}
6495		}
6496	} else {
6497		ir_emit_load_ex(ctx, type, def_reg, op3, def);
6498	}
6499	|3:
6500
6501	if (IR_REG_SPILLED(ctx->regs[def][0])) {
6502		ir_emit_store(ctx, type, def, def_reg);
6503	}
6504}
6505
6506static void ir_emit_return_void(ir_ctx *ctx)
6507{
6508	ir_backend_data *data = ctx->data;
6509	dasm_State **Dst = &data->dasm_state;
6510
6511	ir_emit_epilogue(ctx);
6512
6513#ifdef IR_TARGET_X86
6514	if (sizeof(void*) == 4 && (ctx->flags & IR_FASTCALL_FUNC) && ctx->param_stack_size) {
6515		|	ret ctx->param_stack_size
6516		return;
6517	}
6518#endif
6519
6520	|	ret
6521}
6522
6523static void ir_emit_return_int(ir_ctx *ctx, ir_ref ref, ir_insn *insn)
6524{
6525	ir_reg op2_reg = ctx->regs[ref][2];
6526
6527	if (op2_reg != IR_REG_INT_RET1) {
6528		ir_type type = ctx->ir_base[insn->op2].type;
6529
6530		if (op2_reg != IR_REG_NONE && !IR_REG_SPILLED(op2_reg)) {
6531			ir_emit_mov(ctx, type, IR_REG_INT_RET1, op2_reg);
6532		} else {
6533			ir_emit_load(ctx, type, IR_REG_INT_RET1, insn->op2);
6534		}
6535	}
6536	ir_emit_return_void(ctx);
6537}
6538
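/* Return an FP value. With an SSE return register this is a plain move; on
 * x86 builds without IR_REG_FP_RET1 the value is returned in st(0), so it is
 * loaded with "fld" (spilling an XMM value to a stack slot first if needed). */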
6539static void ir_emit_return_fp(ir_ctx *ctx, ir_ref ref, ir_insn *insn)
6540{
6541	ir_reg op2_reg = ctx->regs[ref][2];
6542	ir_type type = ctx->ir_base[insn->op2].type;
6543
6544#ifdef IR_REG_FP_RET1
6545	if (op2_reg != IR_REG_FP_RET1) {
6546		if (op2_reg != IR_REG_NONE && !IR_REG_SPILLED(op2_reg)) {
6547			ir_emit_fp_mov(ctx, type, IR_REG_FP_RET1, op2_reg);
6548		} else {
6549			ir_emit_load(ctx, type, IR_REG_FP_RET1, insn->op2);
6550		}
6551	}
6552#else
6553	ir_backend_data *data = ctx->data;
6554	dasm_State **Dst = &data->dasm_state;
6555
6556	if (op2_reg == IR_REG_NONE || IR_REG_SPILLED(op2_reg)) {
6557		ir_reg fp;
6558		int32_t offset = ir_ref_spill_slot_offset(ctx, insn->op2, &fp);
6559
6560		if (type == IR_DOUBLE) {
6561			|	fld qword [Ra(fp)+offset]
6562		} else {
6563			IR_ASSERT(type == IR_FLOAT);
6564			|	fld dword [Ra(fp)+offset]
6565		}
6566	} else {
6567		int32_t offset = ctx->ret_slot;
6568		ir_reg fp;
6569
6570		IR_ASSERT(offset != -1);
6571		offset = IR_SPILL_POS_TO_OFFSET(offset);
6572		fp = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
6573		ir_emit_store_mem_fp(ctx, type, IR_MEM_BO(fp, offset), op2_reg);
6574		if (type == IR_DOUBLE) {
6575			|	fld qword [Ra(fp)+offset]
6576		} else {
6577			IR_ASSERT(type == IR_FLOAT);
6578			|	fld dword [Ra(fp)+offset]
6579		}
6580	}
6581#endif
6582	ir_emit_return_void(ctx);
6583}
6584
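/* Sign-extend a narrower integer into def_reg with "movsx"/"movsxd", reading
 * the source from a register or a (possibly fused) memory operand. */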
6585static void ir_emit_sext(ir_ctx *ctx, ir_ref def, ir_insn *insn)
6586{
6587	ir_type dst_type = insn->type;
6588	ir_type src_type = ctx->ir_base[insn->op1].type;
6589	ir_backend_data *data = ctx->data;
6590	dasm_State **Dst = &data->dasm_state;
6591	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
6592	ir_reg op1_reg = ctx->regs[def][1];
6593
6594	IR_ASSERT(IR_IS_TYPE_INT(src_type));
6595	IR_ASSERT(IR_IS_TYPE_INT(dst_type));
6596	IR_ASSERT(ir_type_size[dst_type] > ir_type_size[src_type]);
6597	IR_ASSERT(def_reg != IR_REG_NONE);
6598
6599	if (op1_reg != IR_REG_NONE) {
6600		if (IR_REG_SPILLED(op1_reg)) {
6601			op1_reg = IR_REG_NUM(op1_reg);
6602			ir_emit_load(ctx, src_type, op1_reg, insn->op1);
6603		}
6604		if (ir_type_size[src_type] == 1) {
6605			if (ir_type_size[dst_type] == 2) {
6606				|	movsx Rw(def_reg), Rb(op1_reg)
6607			} else if (ir_type_size[dst_type] == 4) {
6608				|	movsx Rd(def_reg), Rb(op1_reg)
6609			} else {
6610				IR_ASSERT(ir_type_size[dst_type] == 8);
6611				IR_ASSERT(sizeof(void*) == 8);
6612|.if X64
6613				|	movsx Rq(def_reg), Rb(op1_reg)
6614|.endif
6615			}
6616		} else if (ir_type_size[src_type] == 2) {
6617			if (ir_type_size[dst_type] == 4) {
6618				|	movsx Rd(def_reg), Rw(op1_reg)
6619			} else {
6620				IR_ASSERT(ir_type_size[dst_type] == 8);
6621				IR_ASSERT(sizeof(void*) == 8);
6622|.if X64
6623				|	movsx Rq(def_reg), Rw(op1_reg)
6624|.endif
6625			}
6626		} else {
6627			IR_ASSERT(ir_type_size[src_type] == 4);
6628			IR_ASSERT(ir_type_size[dst_type] == 8);
6629			IR_ASSERT(sizeof(void*) == 8);
6630|.if X64
6631			|	movsxd Rq(def_reg), Rd(op1_reg)
6632|.endif
6633		}
6634	} else if (IR_IS_CONST_REF(insn->op1)) {
6635		IR_ASSERT(0);
6636	} else {
6637		ir_mem mem;
6638
6639		if (ir_rule(ctx, insn->op1) & IR_FUSED) {
6640			mem = ir_fuse_load(ctx, def, insn->op1);
6641		} else {
6642			mem = ir_ref_spill_slot(ctx, insn->op1);
6643		}
6644
6645		if (ir_type_size[src_type] == 1) {
6646			if (ir_type_size[dst_type] == 2) {
6647				|	ASM_TXT_TMEM_OP movsx, Rw(def_reg), byte, mem
6648			} else if (ir_type_size[dst_type] == 4) {
6649				|	ASM_TXT_TMEM_OP movsx, Rd(def_reg), byte, mem
6650			} else {
6651				IR_ASSERT(ir_type_size[dst_type] == 8);
6652				IR_ASSERT(sizeof(void*) == 8);
6653|.if X64
6654				|	ASM_TXT_TMEM_OP movsx, Rq(def_reg), byte, mem
6655|.endif
6656			}
6657		} else if (ir_type_size[src_type] == 2) {
6658			if (ir_type_size[dst_type] == 4) {
6659				|	ASM_TXT_TMEM_OP movsx, Rd(def_reg), word, mem
6660			} else {
6661				IR_ASSERT(ir_type_size[dst_type] == 8);
6662				IR_ASSERT(sizeof(void*) == 8);
6663|.if X64
6664				|	ASM_TXT_TMEM_OP movsx, Rq(def_reg), word, mem
6665|.endif
6666			}
6667		} else {
6668			IR_ASSERT(ir_type_size[src_type] == 4);
6669			IR_ASSERT(ir_type_size[dst_type] == 8);
6670			IR_ASSERT(sizeof(void*) == 8);
6671|.if X64
6672			|	ASM_TXT_TMEM_OP movsxd, Rq(def_reg), dword, mem
6673|.endif
6674		}
6675	}
6676	if (IR_REG_SPILLED(ctx->regs[def][0])) {
6677		ir_emit_store(ctx, dst_type, def, def_reg);
6678	}
6679}
6680
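/* Zero-extend a narrower integer into def_reg with "movzx"; the 32->64-bit
 * case relies on the implicit zeroing of the upper half done by any 32-bit
 * "mov". */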
6681static void ir_emit_zext(ir_ctx *ctx, ir_ref def, ir_insn *insn)
6682{
6683	ir_type dst_type = insn->type;
6684	ir_type src_type = ctx->ir_base[insn->op1].type;
6685	ir_backend_data *data = ctx->data;
6686	dasm_State **Dst = &data->dasm_state;
6687	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
6688	ir_reg op1_reg = ctx->regs[def][1];
6689
6690	IR_ASSERT(IR_IS_TYPE_INT(src_type));
6691	IR_ASSERT(IR_IS_TYPE_INT(dst_type));
6692	IR_ASSERT(ir_type_size[dst_type] > ir_type_size[src_type]);
6693	IR_ASSERT(def_reg != IR_REG_NONE);
6694
6695	if (op1_reg != IR_REG_NONE) {
6696		if (IR_REG_SPILLED(op1_reg)) {
6697			op1_reg = IR_REG_NUM(op1_reg);
6698			ir_emit_load(ctx, src_type, op1_reg, insn->op1);
6699		}
6700		if (ir_type_size[src_type] == 1) {
6701			if (ir_type_size[dst_type] == 2) {
6702				|	movzx Rw(def_reg), Rb(op1_reg)
6703			} else if (ir_type_size[dst_type] == 4) {
6704				|	movzx Rd(def_reg), Rb(op1_reg)
6705			} else {
6706				IR_ASSERT(ir_type_size[dst_type] == 8);
6707				IR_ASSERT(sizeof(void*) == 8);
6708|.if X64
6709				|	movzx Rq(def_reg), Rb(op1_reg)
6710|.endif
6711			}
6712		} else if (ir_type_size[src_type] == 2) {
6713			if (ir_type_size[dst_type] == 4) {
6714				|	movzx Rd(def_reg), Rw(op1_reg)
6715			} else {
6716				IR_ASSERT(ir_type_size[dst_type] == 8);
6717				IR_ASSERT(sizeof(void*) == 8);
6718|.if X64
6719				|	movzx Rq(def_reg), Rw(op1_reg)
6720|.endif
6721			}
6722		} else {
6723			IR_ASSERT(ir_type_size[src_type] == 4);
6724			IR_ASSERT(ir_type_size[dst_type] == 8);
6725			IR_ASSERT(sizeof(void*) == 8);
6726|.if X64
6727			/* Avoid a zero-extending move to the same register; a preceding 32-bit write should already have cleared the upper half (this may not always be safe ???) */
6728			if (op1_reg != def_reg) {
6729				|	mov Rd(def_reg), Rd(op1_reg)
6730			}
6731|.endif
6732		}
6733	} else if (IR_IS_CONST_REF(insn->op1)) {
6734		IR_ASSERT(0);
6735	} else {
6736		ir_mem mem;
6737
6738		if (ir_rule(ctx, insn->op1) & IR_FUSED) {
6739			mem = ir_fuse_load(ctx, def, insn->op1);
6740		} else {
6741			mem = ir_ref_spill_slot(ctx, insn->op1);
6742		}
6743
6744		if (ir_type_size[src_type] == 1) {
6745			if (ir_type_size[dst_type] == 2) {
6746				|	ASM_TXT_TMEM_OP movzx, Rw(def_reg), byte, mem
6747			} else if (ir_type_size[dst_type] == 4) {
6748				|	ASM_TXT_TMEM_OP movzx, Rd(def_reg), byte, mem
6749			} else {
6750				IR_ASSERT(ir_type_size[dst_type] == 8);
6751				IR_ASSERT(sizeof(void*) == 8);
6752|.if X64
6753				|	ASM_TXT_TMEM_OP movzx, Rq(def_reg), byte, mem
6754|.endif
6755			}
6756		} else if (ir_type_size[src_type] == 2) {
6757			if (ir_type_size[dst_type] == 4) {
6758				|	ASM_TXT_TMEM_OP movzx, Rd(def_reg), word, mem
6759			} else {
6760				IR_ASSERT(ir_type_size[dst_type] == 8);
6761				IR_ASSERT(sizeof(void*) == 8);
6762|.if X64
6763				|	ASM_TXT_TMEM_OP movzx, Rq(def_reg), word, mem
6764|.endif
6765			}
6766		} else {
6767			IR_ASSERT(ir_type_size[src_type] == 4);
6768			IR_ASSERT(ir_type_size[dst_type] == 8);
6769|.if X64
6770			|	ASM_TXT_TMEM_OP mov, Rd(def_reg), dword, mem
6771|.endif
6772		}
6773	}
6774	if (IR_REG_SPILLED(ctx->regs[def][0])) {
6775		ir_emit_store(ctx, dst_type, def, def_reg);
6776	}
6777}
6778
6779static void ir_emit_trunc(ir_ctx *ctx, ir_ref def, ir_insn *insn)
6780{
6781	ir_type dst_type = insn->type;
6782	ir_type src_type = ctx->ir_base[insn->op1].type;
6783	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
6784	ir_reg op1_reg = ctx->regs[def][1];
6785
6786	IR_ASSERT(IR_IS_TYPE_INT(src_type));
6787	IR_ASSERT(IR_IS_TYPE_INT(dst_type));
6788	IR_ASSERT(ir_type_size[dst_type] < ir_type_size[src_type]);
6789	IR_ASSERT(def_reg != IR_REG_NONE);
6790	if (op1_reg != IR_REG_NONE) {
6791		if (IR_REG_SPILLED(op1_reg)) {
6792			op1_reg = IR_REG_NUM(op1_reg);
6793			ir_emit_load(ctx, src_type, op1_reg, insn->op1);
6794		}
6795		if (op1_reg != def_reg) {
6796			ir_emit_mov(ctx, dst_type, def_reg, op1_reg);
6797		}
6798	} else {
6799		ir_emit_load_ex(ctx, dst_type, def_reg, insn->op1, def);
6800	}
6801	if (IR_REG_SPILLED(ctx->regs[def][0])) {
6802		ir_emit_store(ctx, dst_type, def, def_reg);
6803	}
6804}
6805
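/* Reinterpret a value as a same-sized type: plain moves within a register
 * class, "movd" (or a constant/memory load) when crossing between GP and XMM
 * registers. */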
6806static void ir_emit_bitcast(ir_ctx *ctx, ir_ref def, ir_insn *insn)
6807{
6808	ir_type dst_type = insn->type;
6809	ir_type src_type = ctx->ir_base[insn->op1].type;
6810	ir_backend_data *data = ctx->data;
6811	dasm_State **Dst = &data->dasm_state;
6812	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
6813	ir_reg op1_reg = ctx->regs[def][1];
6814
6815	IR_ASSERT(ir_type_size[dst_type] == ir_type_size[src_type]);
6816	IR_ASSERT(def_reg != IR_REG_NONE);
6817	if (IR_IS_TYPE_INT(src_type) && IR_IS_TYPE_INT(dst_type)) {
6818		if (op1_reg != IR_REG_NONE) {
6819			if (IR_REG_SPILLED(op1_reg)) {
6820				op1_reg = IR_REG_NUM(op1_reg);
6821				ir_emit_load(ctx, src_type, op1_reg, insn->op1);
6822			}
6823			if (op1_reg != def_reg) {
6824				ir_emit_mov(ctx, dst_type, def_reg, op1_reg);
6825			}
6826		} else {
6827			ir_emit_load_ex(ctx, dst_type, def_reg, insn->op1, def);
6828		}
6829	} else if (IR_IS_TYPE_FP(src_type) && IR_IS_TYPE_FP(dst_type)) {
6830		if (op1_reg != IR_REG_NONE) {
6831			if (IR_REG_SPILLED(op1_reg)) {
6832				op1_reg = IR_REG_NUM(op1_reg);
6833				ir_emit_load(ctx, src_type, op1_reg, insn->op1);
6834			}
6835			if (op1_reg != def_reg) {
6836				ir_emit_fp_mov(ctx, dst_type, def_reg, op1_reg);
6837			}
6838		} else {
6839			ir_emit_load_ex(ctx, dst_type, def_reg, insn->op1, def);
6840		}
6841	} else if (IR_IS_TYPE_FP(src_type)) {
6842		IR_ASSERT(IR_IS_TYPE_INT(dst_type));
6843		if (op1_reg != IR_REG_NONE) {
6844			if (IR_REG_SPILLED(op1_reg)) {
6845				op1_reg = IR_REG_NUM(op1_reg);
6846				ir_emit_load(ctx, src_type, op1_reg, insn->op1);
6847			}
6848			if (src_type == IR_DOUBLE) {
6849				IR_ASSERT(sizeof(void*) == 8);
6850|.if X64
6851				if (ctx->mflags & IR_X86_AVX) {
6852					|	vmovd Rq(def_reg), xmm(op1_reg-IR_REG_FP_FIRST)
6853				} else {
6854					|	movd Rq(def_reg), xmm(op1_reg-IR_REG_FP_FIRST)
6855				}
6856|.endif
6857			} else {
6858				IR_ASSERT(src_type == IR_FLOAT);
6859				if (ctx->mflags & IR_X86_AVX) {
6860					|	vmovd Rd(def_reg), xmm(op1_reg-IR_REG_FP_FIRST)
6861				} else {
6862					|	movd Rd(def_reg), xmm(op1_reg-IR_REG_FP_FIRST)
6863				}
6864			}
6865		} else if (IR_IS_CONST_REF(insn->op1)) {
6866			ir_insn *_insn = &ctx->ir_base[insn->op1];
6867			IR_ASSERT(!IR_IS_SYM_CONST(_insn->op));
6868			if (src_type == IR_DOUBLE) {
6869				IR_ASSERT(sizeof(void*) == 8);
6870|.if X64
6871				|	mov64 Rq(def_reg), _insn->val.i64
6872|.endif
6873			} else {
6874				IR_ASSERT(src_type == IR_FLOAT);
6875				|	mov Rd(def_reg), _insn->val.i32
6876			}
6877		} else {
6878			ir_mem mem;
6879
6880			if (ir_rule(ctx, insn->op1) & IR_FUSED) {
6881				mem = ir_fuse_load(ctx, def, insn->op1);
6882			} else {
6883				mem = ir_ref_spill_slot(ctx, insn->op1);
6884			}
6885
6886			if (src_type == IR_DOUBLE) {
6887				IR_ASSERT(sizeof(void*) == 8);
6888|.if X64
6889				|	ASM_TXT_TMEM_OP mov, Rq(def_reg), qword, mem
6890|.endif
6891			} else {
6892				IR_ASSERT(src_type == IR_FLOAT);
6893				|	ASM_TXT_TMEM_OP mov, Rd(def_reg), dword, mem
6894			}
6895		}
6896	} else if (IR_IS_TYPE_FP(dst_type)) {
6897		IR_ASSERT(IR_IS_TYPE_INT(src_type));
6898		if (op1_reg != IR_REG_NONE) {
6899			if (IR_REG_SPILLED(op1_reg)) {
6900				op1_reg = IR_REG_NUM(op1_reg);
6901				ir_emit_load(ctx, src_type, op1_reg, insn->op1);
6902			}
6903			if (dst_type == IR_DOUBLE) {
6904				IR_ASSERT(sizeof(void*) == 8);
6905|.if X64
6906				if (ctx->mflags & IR_X86_AVX) {
6907					|	vmovd xmm(def_reg-IR_REG_FP_FIRST), Rq(op1_reg)
6908				} else {
6909					|	movd xmm(def_reg-IR_REG_FP_FIRST), Rq(op1_reg)
6910				}
6911|.endif
6912			} else {
6913				IR_ASSERT(dst_type == IR_FLOAT);
6914				if (ctx->mflags & IR_X86_AVX) {
6915					|	vmovd xmm(def_reg-IR_REG_FP_FIRST), Rd(op1_reg)
6916				} else {
6917					|	movd xmm(def_reg-IR_REG_FP_FIRST), Rd(op1_reg)
6918				}
6919			}
6920		} else if (IR_IS_CONST_REF(insn->op1)) {
6921			int label = ir_const_label(ctx, insn->op1);
6922
6923			|	ASM_FP_REG_TXT_OP movs, dst_type, def_reg, [=>label]
6924		} else {
6925			ir_mem mem;
6926
6927			if (ir_rule(ctx, insn->op1) & IR_FUSED) {
6928				mem = ir_fuse_load(ctx, def, insn->op1);
6929			} else {
6930				mem = ir_ref_spill_slot(ctx, insn->op1);
6931			}
6932
6933			|	ASM_FP_REG_MEM_OP movs, dst_type, def_reg, mem
6934		}
6935	}
6936	if (IR_REG_SPILLED(ctx->regs[def][0])) {
6937		ir_emit_store(ctx, dst_type, def, def_reg);
6938	}
6939}
6940
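/* Convert an integer to float/double with "cvtsi2ss"/"cvtsi2sd". Sub-32-bit
 * sources are widened first, and the destination XMM register is zeroed with
 * "pxor"/"vxorps" beforehand to break the false dependency on its previous
 * contents. */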
6941static void ir_emit_int2fp(ir_ctx *ctx, ir_ref def, ir_insn *insn)
6942{
6943	ir_type dst_type = insn->type;
6944	ir_type src_type = ctx->ir_base[insn->op1].type;
6945	ir_backend_data *data = ctx->data;
6946	dasm_State **Dst = &data->dasm_state;
6947	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
6948	ir_reg op1_reg = ctx->regs[def][1];
6949
6950	IR_ASSERT(IR_IS_TYPE_INT(src_type));
6951	IR_ASSERT(IR_IS_TYPE_FP(dst_type));
6952	IR_ASSERT(def_reg != IR_REG_NONE);
6953	if (op1_reg != IR_REG_NONE) {
6954		bool src64 = 0;
6955
6956		if (IR_REG_SPILLED(op1_reg)) {
6957			op1_reg = IR_REG_NUM(op1_reg);
6958			ir_emit_load(ctx, src_type, op1_reg, insn->op1);
6959		}
6960		if (IR_IS_TYPE_SIGNED(src_type)) {
6961			if (ir_type_size[src_type] < 4) {
6962|.if X64
6963||				if (ir_type_size[src_type] == 1) {
6964					| movsx Rq(op1_reg), Rb(op1_reg)
6965||				} else {
6966					| movsx Rq(op1_reg), Rw(op1_reg)
6967||				}
6968||				src64 = 1;
6969|.else
6970||				if (ir_type_size[src_type] == 1) {
6971					| movsx Rd(op1_reg), Rb(op1_reg)
6972||				} else {
6973					| movsx Rd(op1_reg), Rw(op1_reg)
6974||				}
6975|.endif
6976			} else if (ir_type_size[src_type] > 4) {
6977				src64 = 1;
6978			}
6979		} else {
6980			if (ir_type_size[src_type] < 8) {
6981|.if X64
6982||				if (ir_type_size[src_type] == 1) {
6983					| movzx Rq(op1_reg), Rb(op1_reg)
6984||				} else if (ir_type_size[src_type] == 2) {
6985					| movzx Rq(op1_reg), Rw(op1_reg)
6986||				}
6987||				src64 = 1;
6988|.else
6989||				if (ir_type_size[src_type] == 1) {
6990					| movzx Rd(op1_reg), Rb(op1_reg)
6991||				} else if (ir_type_size[src_type] == 2) {
6992					| movzx Rd(op1_reg), Rw(op1_reg)
6993||				}
6994|.endif
6995			} else {
6996				// TODO: uint64_t -> double
6997				src64 = 1;
6998			}
6999		}
7000		if (!src64) {
7001			if (dst_type == IR_DOUBLE) {
7002				if (ctx->mflags & IR_X86_AVX) {
7003					|	vxorps xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
7004					|	vcvtsi2sd xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), Rd(op1_reg)
7005				} else {
7006					|	pxor xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
7007					|	cvtsi2sd xmm(def_reg-IR_REG_FP_FIRST), Rd(op1_reg)
7008				}
7009			} else {
7010				IR_ASSERT(dst_type == IR_FLOAT);
7011				if (ctx->mflags & IR_X86_AVX) {
7012					|	vxorps xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
7013					|	vcvtsi2ss xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), Rd(op1_reg)
7014				} else {
7015					|	pxor xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
7016					|	cvtsi2ss xmm(def_reg-IR_REG_FP_FIRST), Rd(op1_reg)
7017				}
7018			}
7019		} else {
7020			IR_ASSERT(sizeof(void*) == 8);
7021|.if X64
7022			if (dst_type == IR_DOUBLE) {
7023				if (ctx->mflags & IR_X86_AVX) {
7024					|	vxorps xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
7025					|	vcvtsi2sd xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), Rq(op1_reg)
7026				} else {
7027					|	pxor xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
7028					|	cvtsi2sd xmm(def_reg-IR_REG_FP_FIRST), Rq(op1_reg)
7029				}
7030			} else {
7031				IR_ASSERT(dst_type == IR_FLOAT);
7032				if (ctx->mflags & IR_X86_AVX) {
7033					|	vxorps xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
7034					|	vcvtsi2ss xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), Rq(op1_reg)
7035				} else {
7036					|	pxor xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
7037					|	cvtsi2ss xmm(def_reg-IR_REG_FP_FIRST), Rq(op1_reg)
7038				}
7039			}
7040|.endif
7041		}
7042	} else {
7043		ir_mem mem;
7044		bool src64 = ir_type_size[src_type] == 8;
7045
7046		if (ir_rule(ctx, insn->op1) & IR_FUSED) {
7047			mem = ir_fuse_load(ctx, def, insn->op1);
7048		} else {
7049			mem = ir_ref_spill_slot(ctx, insn->op1);
7050		}
7051
7052		if (!src64) {
7053			if (dst_type == IR_DOUBLE) {
7054				if (ctx->mflags & IR_X86_AVX) {
7055					|	vxorps xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
7056					|	ASM_TXT_TXT_TMEM_OP vcvtsi2sd, xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), dword, mem
7057				} else {
7058					|	pxor xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
7059					|	ASM_TXT_TMEM_OP cvtsi2sd, xmm(def_reg-IR_REG_FP_FIRST), dword, mem
7060				}
7061			} else {
7062				IR_ASSERT(dst_type == IR_FLOAT);
7063				if (ctx->mflags & IR_X86_AVX) {
7064					|	vxorps xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
7065					|	ASM_TXT_TXT_TMEM_OP vcvtsi2ss, xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), dword, mem
7066				} else {
7067					|	pxor xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
7068					|	ASM_TXT_TMEM_OP cvtsi2ss, xmm(def_reg-IR_REG_FP_FIRST), dword, mem
7069				}
7070			}
7071		} else {
7072			IR_ASSERT(sizeof(void*) == 8);
7073|.if X64
7074			if (dst_type == IR_DOUBLE) {
7075				if (ctx->mflags & IR_X86_AVX) {
7076					|	vxorps xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
7077					|	ASM_TXT_TXT_TMEM_OP vcvtsi2sd, xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), qword, mem
7078				} else {
7079					|	pxor xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
7080					|	ASM_TXT_TMEM_OP cvtsi2sd, xmm(def_reg-IR_REG_FP_FIRST), qword, mem
7081				}
7082			} else {
7083				IR_ASSERT(dst_type == IR_FLOAT);
7084				if (ctx->mflags & IR_X86_AVX) {
7085					|	vxorps xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
7086					|	ASM_TXT_TXT_TMEM_OP vcvtsi2ss, xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), qword, mem
7087				} else {
7088					|	pxor xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
7089					|	ASM_TXT_TMEM_OP cvtsi2ss, xmm(def_reg-IR_REG_FP_FIRST), qword, mem
7090				}
7091			}
7092|.endif
7093		}
7094	}
7095	if (IR_REG_SPILLED(ctx->regs[def][0])) {
7096		ir_emit_store(ctx, dst_type, def, def_reg);
7097	}
7098}
7099
7100static void ir_emit_fp2int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
7101{
7102	ir_type dst_type = insn->type;
7103	ir_type src_type = ctx->ir_base[insn->op1].type;
7104	ir_backend_data *data = ctx->data;
7105	dasm_State **Dst = &data->dasm_state;
7106	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
7107	ir_reg op1_reg = ctx->regs[def][1];
7108	bool dst64 = 0;
7109
7110	IR_ASSERT(IR_IS_TYPE_FP(src_type));
7111	IR_ASSERT(IR_IS_TYPE_INT(dst_type));
7112	IR_ASSERT(def_reg != IR_REG_NONE);
7113	if (IR_IS_TYPE_SIGNED(dst_type) ? ir_type_size[dst_type] == 8 : ir_type_size[dst_type] >= 4) {
7114		// TODO: we might need to perform truncation from 32/64 bit integer
7115		dst64 = 1;
7116	}
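	/* The 64-bit form is needed not only for signed 64-bit results: unsigned
	 * destinations of 4+ bytes use it too, because cvttsd2si/cvttss2si are
	 * signed-only conversions and a 32-bit destination would mangle unsigned
	 * values >= 2^31. */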
7117	if (op1_reg != IR_REG_NONE) {
7118		if (IR_REG_SPILLED(op1_reg)) {
7119			op1_reg = IR_REG_NUM(op1_reg);
7120			ir_emit_load(ctx, src_type, op1_reg, insn->op1);
7121		}
7122		if (!dst64) {
7123			if (src_type == IR_DOUBLE) {
7124				if (ctx->mflags & IR_X86_AVX) {
7125					|	vcvttsd2si Rd(def_reg), xmm(op1_reg-IR_REG_FP_FIRST)
7126				} else {
7127					|	cvttsd2si Rd(def_reg), xmm(op1_reg-IR_REG_FP_FIRST)
7128				}
7129			} else {
7130				IR_ASSERT(src_type == IR_FLOAT);
7131				if (ctx->mflags & IR_X86_AVX) {
7132					|	vcvttss2si Rd(def_reg), xmm(op1_reg-IR_REG_FP_FIRST)
7133				} else {
7134					|	cvttss2si Rd(def_reg), xmm(op1_reg-IR_REG_FP_FIRST)
7135				}
7136			}
7137		} else {
7138			IR_ASSERT(sizeof(void*) == 8);
7139|.if X64
7140			if (src_type == IR_DOUBLE) {
7141				if (ctx->mflags & IR_X86_AVX) {
7142					|	vcvttsd2si Rq(def_reg), xmm(op1_reg-IR_REG_FP_FIRST)
7143				} else {
7144					|	cvttsd2si Rq(def_reg), xmm(op1_reg-IR_REG_FP_FIRST)
7145				}
7146			} else {
7147				IR_ASSERT(src_type == IR_FLOAT);
7148				if (ctx->mflags & IR_X86_AVX) {
7149					|	vcvttss2si Rq(def_reg), xmm(op1_reg-IR_REG_FP_FIRST)
7150				} else {
7151					|	cvttss2si Rq(def_reg), xmm(op1_reg-IR_REG_FP_FIRST)
7152				}
7153			}
7154|.endif
7155		}
7156	} else if (IR_IS_CONST_REF(insn->op1)) {
7157		int label = ir_const_label(ctx, insn->op1);
7158
7159		if (!dst64) {
7160			if (src_type == IR_DOUBLE) {
7161				if (ctx->mflags & IR_X86_AVX) {
7162					|	vcvttsd2si Rd(def_reg), qword [=>label]
7163				} else {
7164					|	cvttsd2si Rd(def_reg), qword [=>label]
7165				}
7166			} else {
7167				IR_ASSERT(src_type == IR_FLOAT);
7168				if (ctx->mflags & IR_X86_AVX) {
7169					|	vcvttss2si Rd(def_reg), dword [=>label]
7170				} else {
7171					|	cvttss2si Rd(def_reg), dword [=>label]
7172				}
7173			}
7174		} else {
7175			IR_ASSERT(sizeof(void*) == 8);
7176|.if X64
7177			if (src_type == IR_DOUBLE) {
7178				if (ctx->mflags & IR_X86_AVX) {
7179					|	vcvttsd2si Rq(def_reg), qword [=>label]
7180				} else {
7181					|	cvttsd2si Rq(def_reg), qword [=>label]
7182				}
7183			} else {
7184				IR_ASSERT(src_type == IR_FLOAT);
7185				if (ctx->mflags & IR_X86_AVX) {
7186					|	vcvttss2si Rq(def_reg), dword [=>label]
7187				} else {
7188					|	cvttss2si Rq(def_reg), dword [=>label]
7189				}
7190			}
7191|.endif
7192		}
7193	} else {
7194		ir_mem mem;
7195
7196		if (ir_rule(ctx, insn->op1) & IR_FUSED) {
7197			mem = ir_fuse_load(ctx, def, insn->op1);
7198		} else {
7199			mem = ir_ref_spill_slot(ctx, insn->op1);
7200		}
7201
7202		if (!dst64) {
7203			if (src_type == IR_DOUBLE) {
7204				if (ctx->mflags & IR_X86_AVX) {
7205					|	ASM_TXT_TMEM_OP vcvttsd2si, Rd(def_reg), qword, mem
7206				} else {
7207					|	ASM_TXT_TMEM_OP cvttsd2si, Rd(def_reg), qword, mem
7208				}
7209			} else {
7210				IR_ASSERT(src_type == IR_FLOAT);
7211				if (ctx->mflags & IR_X86_AVX) {
7212					|	ASM_TXT_TMEM_OP vcvttss2si, Rd(def_reg), dword, mem
7213				} else {
7214					|	ASM_TXT_TMEM_OP cvttss2si, Rd(def_reg), dword, mem
7215				}
7216			}
7217		} else {
7218			IR_ASSERT(sizeof(void*) == 8);
7219|.if X64
7220			if (src_type == IR_DOUBLE) {
7221				if (ctx->mflags & IR_X86_AVX) {
7222					|	ASM_TXT_TMEM_OP vcvttsd2si, Rq(def_reg), qword, mem
7223				} else {
7224					|	ASM_TXT_TMEM_OP cvttsd2si, Rq(def_reg), qword, mem
7225				}
7226			} else {
7227				IR_ASSERT(src_type == IR_FLOAT);
7228				if (ctx->mflags & IR_X86_AVX) {
7229					|	ASM_TXT_TMEM_OP vcvttss2si, Rq(def_reg), dword, mem
7230				} else {
7231					|	ASM_TXT_TMEM_OP cvttss2si, Rq(def_reg), dword, mem
7232				}
7233			}
7234|.endif
7235		}
7236	}
7237	if (IR_REG_SPILLED(ctx->regs[def][0])) {
7238		ir_emit_store(ctx, dst_type, def, def_reg);
7239	}
7240}
7241
7242static void ir_emit_fp2fp(ir_ctx *ctx, ir_ref def, ir_insn *insn)
7243{
7244	ir_type dst_type = insn->type;
7245	ir_type src_type = ctx->ir_base[insn->op1].type;
7246	ir_backend_data *data = ctx->data;
7247	dasm_State **Dst = &data->dasm_state;
7248	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
7249	ir_reg op1_reg = ctx->regs[def][1];
7250
7251	IR_ASSERT(IR_IS_TYPE_FP(src_type));
7252	IR_ASSERT(IR_IS_TYPE_FP(dst_type));
7253	IR_ASSERT(def_reg != IR_REG_NONE);
7254	if (op1_reg != IR_REG_NONE) {
7255		if (IR_REG_SPILLED(op1_reg)) {
7256			op1_reg = IR_REG_NUM(op1_reg);
7257			ir_emit_load(ctx, src_type, op1_reg, insn->op1);
7258		}
7259		if (src_type == dst_type) {
7260			if (op1_reg != def_reg) {
7261				ir_emit_fp_mov(ctx, dst_type, def_reg, op1_reg);
7262			}
7263		} else if (src_type == IR_DOUBLE) {
7264			if (ctx->mflags & IR_X86_AVX) {
7265				|	vcvtsd2ss xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), xmm(op1_reg-IR_REG_FP_FIRST)
7266			} else {
7267				|	cvtsd2ss xmm(def_reg-IR_REG_FP_FIRST), xmm(op1_reg-IR_REG_FP_FIRST)
7268			}
7269		} else {
7270			IR_ASSERT(src_type == IR_FLOAT);
7271			if (ctx->mflags & IR_X86_AVX) {
7272				|	vcvtss2sd xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), xmm(op1_reg-IR_REG_FP_FIRST)
7273			} else {
7274				|	cvtss2sd xmm(def_reg-IR_REG_FP_FIRST), xmm(op1_reg-IR_REG_FP_FIRST)
7275			}
7276		}
7277	} else if (IR_IS_CONST_REF(insn->op1)) {
7278		int label = ir_const_label(ctx, insn->op1);
7279
7280		if (src_type == IR_DOUBLE) {
7281			if (ctx->mflags & IR_X86_AVX) {
7282				|	vcvtsd2ss xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), qword [=>label]
7283			} else {
7284				|	cvtsd2ss xmm(def_reg-IR_REG_FP_FIRST), qword [=>label]
7285			}
7286		} else {
7287			IR_ASSERT(src_type == IR_FLOAT);
7288			if (ctx->mflags & IR_X86_AVX) {
7289				|	vcvtss2sd xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), dword [=>label]
7290			} else {
7291				|	cvtss2sd xmm(def_reg-IR_REG_FP_FIRST), dword [=>label]
7292			}
7293		}
7294	} else {
7295		ir_mem mem;
7296
7297		if (ir_rule(ctx, insn->op1) & IR_FUSED) {
7298			mem = ir_fuse_load(ctx, def, insn->op1);
7299		} else {
7300			mem = ir_ref_spill_slot(ctx, insn->op1);
7301		}
7302
7303		if (src_type == IR_DOUBLE) {
7304			if (ctx->mflags & IR_X86_AVX) {
7305				|	ASM_TXT_TXT_TMEM_OP vcvtsd2ss, xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), qword, mem
7306			} else {
7307				|	ASM_TXT_TMEM_OP cvtsd2ss, xmm(def_reg-IR_REG_FP_FIRST), qword, mem
7308			}
7309		} else {
7310			IR_ASSERT(src_type == IR_FLOAT);
7311			if (ctx->mflags & IR_X86_AVX) {
7312				|	ASM_TXT_TXT_TMEM_OP vcvtss2sd, xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), dword, mem
7313			} else {
7314				|	ASM_TXT_TMEM_OP cvtss2sd, xmm(def_reg-IR_REG_FP_FIRST), dword, mem
7315			}
7316		}
7317	}
7318	if (IR_REG_SPILLED(ctx->regs[def][0])) {
7319		ir_emit_store(ctx, dst_type, def, def_reg);
7320	}
7321}
7322
7323static void ir_emit_copy_int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
7324{
7325	ir_type type = insn->type;
7326	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
7327	ir_reg op1_reg = ctx->regs[def][1];
7328
7329	IR_ASSERT(def_reg != IR_REG_NONE || op1_reg != IR_REG_NONE);
7330	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
7331		op1_reg = IR_REG_NUM(op1_reg);
7332		ir_emit_load(ctx, type, op1_reg, insn->op1);
7333	}
7334	if (def_reg == op1_reg) {
7335		/* same reg */
7336	} else if (def_reg != IR_REG_NONE && op1_reg != IR_REG_NONE) {
7337		ir_emit_mov(ctx, type, def_reg, op1_reg);
7338	} else if (def_reg != IR_REG_NONE) {
7339		ir_emit_load(ctx, type, def_reg, insn->op1);
7340	} else if (op1_reg != IR_REG_NONE) {
7341		ir_emit_store(ctx, type, def, op1_reg);
7342	} else {
7343		IR_ASSERT(0);
7344	}
7345	if (def_reg != IR_REG_NONE && IR_REG_SPILLED(ctx->regs[def][0])) {
7346		ir_emit_store(ctx, type, def, def_reg);
7347	}
7348}
7349
7350static void ir_emit_copy_fp(ir_ctx *ctx, ir_ref def, ir_insn *insn)
7351{
7352	ir_type type = insn->type;
7353	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
7354	ir_reg op1_reg = ctx->regs[def][1];
7355
7356	IR_ASSERT(def_reg != IR_REG_NONE || op1_reg != IR_REG_NONE);
7357	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
7358		op1_reg = IR_REG_NUM(op1_reg);
7359		ir_emit_load(ctx, type, op1_reg, insn->op1);
7360	}
7361	if (def_reg == op1_reg) {
7362		/* same reg */
7363	} else if (def_reg != IR_REG_NONE && op1_reg != IR_REG_NONE) {
7364		ir_emit_fp_mov(ctx, type, def_reg, op1_reg);
7365	} else if (def_reg != IR_REG_NONE) {
7366		ir_emit_load(ctx, type, def_reg, insn->op1);
7367	} else if (op1_reg != IR_REG_NONE) {
7368		ir_emit_store(ctx, type, def, op1_reg);
7369	} else {
7370		IR_ASSERT(0);
7371	}
7372	if (def_reg != IR_REG_NONE && IR_REG_SPILLED(ctx->regs[def][0])) {
7373		ir_emit_store(ctx, type, def, def_reg);
7374	}
7375}
7376
7377static void ir_emit_vaddr(ir_ctx *ctx, ir_ref def, ir_insn *insn)
7378{
7379	ir_backend_data *data = ctx->data;
7380	dasm_State **Dst = &data->dasm_state;
7381	ir_type type = insn->type;
7382	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
7383	ir_mem mem;
7384	int32_t offset;
7385	ir_reg fp;
7386
7387	IR_ASSERT(def_reg != IR_REG_NONE);
7388	mem = ir_var_spill_slot(ctx, insn->op1);
7389	fp = IR_MEM_BASE(mem);
7390	offset = IR_MEM_OFFSET(mem);
7391	|	lea Ra(def_reg), aword [Ra(fp)+offset]
7392	if (IR_REG_SPILLED(ctx->regs[def][0])) {
7393		ir_emit_store(ctx, type, def, def_reg);
7394	}
7395}
7396
7397static void ir_emit_vload(ir_ctx *ctx, ir_ref def, ir_insn *insn)
7398{
7399	ir_insn *var_insn = &ctx->ir_base[insn->op2];
7400	ir_type type = insn->type;
7401	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
7402	ir_reg fp;
7403	ir_mem mem;
7404
7405	IR_ASSERT(var_insn->op == IR_VAR);
7406	fp = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
7407	mem = IR_MEM_BO(fp, IR_SPILL_POS_TO_OFFSET(var_insn->op3));
7408	if (def_reg == IR_REG_NONE && ir_is_same_mem_var(ctx, def, var_insn->op3)) {
7409		return; // fake load
7410	}
7411	IR_ASSERT(def_reg != IR_REG_NONE);
7412
7413	ir_emit_load_mem(ctx, type, def_reg, mem);
7414	if (IR_REG_SPILLED(ctx->regs[def][0])) {
7415		ir_emit_store(ctx, type, def, def_reg);
7416	}
7417}
7418
7419static void ir_emit_vstore_int(ir_ctx *ctx, ir_ref ref, ir_insn *insn)
7420{
7421	ir_insn *var_insn = &ctx->ir_base[insn->op2];
7422	ir_insn *val_insn = &ctx->ir_base[insn->op3];
7423	ir_type type = val_insn->type;
7424	ir_reg op3_reg = ctx->regs[ref][3];
7425	ir_reg fp;
7426	ir_mem mem;
7427
7428	IR_ASSERT(var_insn->op == IR_VAR);
7429	fp = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
7430	mem = IR_MEM_BO(fp, IR_SPILL_POS_TO_OFFSET(var_insn->op3));
7431	if ((op3_reg == IR_REG_NONE || IR_REG_SPILLED(op3_reg))
7432	 && !IR_IS_CONST_REF(insn->op3)
7433	 && ir_rule(ctx, insn->op3) != IR_STATIC_ALLOCA
7434	 && ir_is_same_mem_var(ctx, insn->op3, var_insn->op3)) {
7435		return; // fake store
7436	}
7437	if (IR_IS_CONST_REF(insn->op3)) {
7438		ir_emit_store_mem_int_const(ctx, type, mem, insn->op3, op3_reg, 0);
7439	} else {
7440		IR_ASSERT(op3_reg != IR_REG_NONE);
7441		if (IR_REG_SPILLED(op3_reg)) {
7442			op3_reg = IR_REG_NUM(op3_reg);
7443			ir_emit_load(ctx, type, op3_reg, insn->op3);
7444		}
7445		ir_emit_store_mem_int(ctx, type, mem, op3_reg);
7446	}
7447}
7448
7449static void ir_emit_vstore_fp(ir_ctx *ctx, ir_ref ref, ir_insn *insn)
7450{
7451	ir_insn *var_insn = &ctx->ir_base[insn->op2];
7452	ir_type type = ctx->ir_base[insn->op3].type;
7453	ir_reg op3_reg = ctx->regs[ref][3];
7454	ir_reg fp;
7455	ir_mem mem;
7456
7457	IR_ASSERT(var_insn->op == IR_VAR);
7458	fp = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
7459	mem = IR_MEM_BO(fp, IR_SPILL_POS_TO_OFFSET(var_insn->op3));
7460	if ((op3_reg == IR_REG_NONE || IR_REG_SPILLED(op3_reg))
7461	 && !IR_IS_CONST_REF(insn->op3)
7462	 && ir_rule(ctx, insn->op3) != IR_STATIC_ALLOCA
7463	 && ir_is_same_mem_var(ctx, insn->op3, var_insn->op3)) {
7464		return; // fake store
7465	}
7466	if (IR_IS_CONST_REF(insn->op3)) {
7467		ir_emit_store_mem_fp_const(ctx, type, mem, insn->op3, IR_REG_NONE, op3_reg);
7468	} else {
7469		IR_ASSERT(op3_reg != IR_REG_NONE);
7470		if (IR_REG_SPILLED(op3_reg)) {
7471			op3_reg = IR_REG_NUM(op3_reg);
7472			ir_emit_load(ctx, type, op3_reg, insn->op3);
7473		}
7474		ir_emit_store_mem_fp(ctx, type, mem, op3_reg);
7475	}
7476}
7477
7478static void ir_emit_load_int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
7479{
7480	ir_type type = insn->type;
7481	ir_reg op2_reg = ctx->regs[def][2];
7482	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
7483	ir_mem mem;
7484
7485	if (ctx->use_lists[def].count == 1) {
7486		/* dead load */
7487		return;
7488	}
7489	IR_ASSERT(def_reg != IR_REG_NONE);
7490	if (op2_reg != IR_REG_NONE) {
7491		if (IR_REG_SPILLED(op2_reg)) {
7492			op2_reg = IR_REG_NUM(op2_reg);
7493			IR_ASSERT(ctx->ir_base[insn->op2].type == IR_ADDR);
7494			ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2);
7495		}
7496		mem = IR_MEM_B(op2_reg);
7497	} else if (IR_IS_CONST_REF(insn->op2)) {
7498		mem = ir_fuse_addr_const(ctx, insn->op2);
7499	} else {
7500		IR_ASSERT(ir_rule(ctx, insn->op2) & IR_FUSED);
7501		mem = ir_fuse_addr(ctx, def, insn->op2);
7502		if (IR_REG_SPILLED(ctx->regs[def][0]) && ir_is_same_spill_slot(ctx, def, mem)) {
7503			if (!ir_may_avoid_spill_load(ctx, def, def)) {
7504				ir_emit_load_mem_int(ctx, type, def_reg, mem);
7505			}
7506			/* avoid load to the same location (valid only when register is not reused) */
7507			return;
7508		}
7509	}
7510
7511	ir_emit_load_mem_int(ctx, type, def_reg, mem);
7512	if (IR_REG_SPILLED(ctx->regs[def][0])) {
7513		ir_emit_store(ctx, type, def, def_reg);
7514	}
7515}
7516
7517static void ir_emit_load_fp(ir_ctx *ctx, ir_ref def, ir_insn *insn)
7518{
7519	ir_type type = insn->type;
7520	ir_reg op2_reg = ctx->regs[def][2];
7521	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
7522	ir_mem mem;
7523
7524	if (ctx->use_lists[def].count == 1) {
7525		/* dead load */
7526		return;
7527	}
7528	IR_ASSERT(def_reg != IR_REG_NONE);
7529	if (op2_reg != IR_REG_NONE) {
7530		if (IR_REG_SPILLED(op2_reg)) {
7531			op2_reg = IR_REG_NUM(op2_reg);
7532			IR_ASSERT(ctx->ir_base[insn->op2].type == IR_ADDR);
7533			ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2);
7534		}
7535		mem = IR_MEM_B(op2_reg);
7536	} else if (IR_IS_CONST_REF(insn->op2)) {
7537		mem = ir_fuse_addr_const(ctx, insn->op2);
7538	} else {
7539		IR_ASSERT(ir_rule(ctx, insn->op2) & IR_FUSED);
7540		mem = ir_fuse_addr(ctx, def, insn->op2);
7541		if (IR_REG_SPILLED(ctx->regs[def][0]) && ir_is_same_spill_slot(ctx, def, mem)) {
7542			if (!ir_may_avoid_spill_load(ctx, def, def)) {
7543				ir_emit_load_mem_fp(ctx, type, def_reg, mem);
7544			}
7545			/* avoid load to the same location (valid only when register is not reused) */
7546			return;
7547		}
7548	}
7549
7550	ir_emit_load_mem_fp(ctx, type, def_reg, mem);
7551	if (IR_REG_SPILLED(ctx->regs[def][0])) {
7552		ir_emit_store(ctx, type, def, def_reg);
7553	}
7554}
7555
7556static void ir_emit_store_int(ir_ctx *ctx, ir_ref ref, ir_insn *insn)
7557{
7558	ir_insn *val_insn = &ctx->ir_base[insn->op3];
7559	ir_type type = val_insn->type;
7560	ir_reg op2_reg = ctx->regs[ref][2];
7561	ir_reg op3_reg = ctx->regs[ref][3];
7562	ir_mem mem;
7563
7564	if (op2_reg != IR_REG_NONE) {
7565		if (IR_REG_SPILLED(op2_reg)) {
7566			op2_reg = IR_REG_NUM(op2_reg);
7567			IR_ASSERT(ctx->ir_base[insn->op2].type == IR_ADDR);
7568			ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2);
7569		}
7570		mem = IR_MEM_B(op2_reg);
7571	} else if (IR_IS_CONST_REF(insn->op2)) {
7572		mem = ir_fuse_addr_const(ctx, insn->op2);
7573	} else {
7574		IR_ASSERT(ir_rule(ctx, insn->op2) & IR_FUSED);
7575		mem = ir_fuse_addr(ctx, ref, insn->op2);
7576		if (!IR_IS_CONST_REF(insn->op3)
7577		 && IR_REG_SPILLED(op3_reg)
7578		 && ir_rule(ctx, insn->op3) != IR_STATIC_ALLOCA
7579		 && ir_is_same_spill_slot(ctx, insn->op3, mem)) {
7580			if (!ir_may_avoid_spill_load(ctx, insn->op3, ref)) {
7581				op3_reg = IR_REG_NUM(op3_reg);
7582				ir_emit_load(ctx, type, op3_reg, insn->op3);
7583			}
7584			/* avoid store to the same location */
7585			return;
7586		}
7587	}
7588
7589	if (IR_IS_CONST_REF(insn->op3)) {
7590		ir_emit_store_mem_int_const(ctx, type, mem, insn->op3, op3_reg, 0);
7591	} else {
7592		IR_ASSERT(op3_reg != IR_REG_NONE);
7593		if (IR_REG_SPILLED(op3_reg)) {
7594			op3_reg = IR_REG_NUM(op3_reg);
7595			ir_emit_load(ctx, type, op3_reg, insn->op3);
7596		}
7597		ir_emit_store_mem_int(ctx, type, mem, op3_reg);
7598	}
7599}
7600
7601static void ir_emit_cmp_and_store_int(ir_ctx *ctx, ir_ref ref, ir_insn *insn)
7602{
7603	ir_reg addr_reg = ctx->regs[ref][2];
7604	ir_mem mem;
7605	ir_insn *cmp_insn = &ctx->ir_base[insn->op3];
7606	ir_op op = cmp_insn->op;
7607	ir_type type = ctx->ir_base[cmp_insn->op1].type;
7608	ir_ref op1 = cmp_insn->op1;
7609	ir_ref op2 = cmp_insn->op2;
7610	ir_reg op1_reg = ctx->regs[insn->op3][1];
7611	ir_reg op2_reg = ctx->regs[insn->op3][2];
7612
7613	if (addr_reg != IR_REG_NONE) {
7614		if (IR_REG_SPILLED(addr_reg)) {
7615			addr_reg = IR_REG_NUM(addr_reg);
7616			IR_ASSERT(ctx->ir_base[insn->op2].type == IR_ADDR);
7617			ir_emit_load(ctx, IR_ADDR, addr_reg, insn->op2);
7618		}
7619		mem = IR_MEM_B(addr_reg);
7620	} else if (IR_IS_CONST_REF(insn->op2)) {
7621		mem = ir_fuse_addr_const(ctx, insn->op2);
7622	} else {
7623		IR_ASSERT(ir_rule(ctx, insn->op2) & IR_FUSED);
7624		mem = ir_fuse_addr(ctx, ref, insn->op2);
7625	}
7626
7627	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
7628		op1_reg = IR_REG_NUM(op1_reg);
7629		ir_emit_load(ctx, type, op1_reg, op1);
7630	}
7631	if (op2_reg != IR_REG_NONE && IR_REG_SPILLED(op2_reg)) {
7632		op2_reg = IR_REG_NUM(op2_reg);
7633		if (op1 != op2) {
7634			ir_emit_load(ctx, type, op2_reg, op2);
7635		}
7636	}
7637
7638	ir_emit_cmp_int_common(ctx, type, ref, cmp_insn, op1_reg, op1, op2_reg, op2);
7639	_ir_emit_setcc_int_mem(ctx, op, mem);
7640}
7641
7642static void ir_emit_store_fp(ir_ctx *ctx, ir_ref ref, ir_insn *insn)
7643{
7644	ir_type type = ctx->ir_base[insn->op3].type;
7645	ir_reg op2_reg = ctx->regs[ref][2];
7646	ir_reg op3_reg = ctx->regs[ref][3];
7647	ir_mem mem;
7648
7649	IR_ASSERT(op3_reg != IR_REG_NONE);
7650	if (op2_reg != IR_REG_NONE) {
7651		if (IR_REG_SPILLED(op2_reg)) {
7652			op2_reg = IR_REG_NUM(op2_reg);
7653			IR_ASSERT(ctx->ir_base[insn->op2].type == IR_ADDR);
7654			ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2);
7655		}
7656		mem = IR_MEM_B(op2_reg);
7657	} else if (IR_IS_CONST_REF(insn->op2)) {
7658		mem = ir_fuse_addr_const(ctx, insn->op2);
7659	} else {
7660		IR_ASSERT(ir_rule(ctx, insn->op2) & IR_FUSED);
7661		mem = ir_fuse_addr(ctx, ref, insn->op2);
7662		if (!IR_IS_CONST_REF(insn->op3)
7663		 && IR_REG_SPILLED(op3_reg)
7664		 && ir_rule(ctx, insn->op3) != IR_STATIC_ALLOCA
7665		 && ir_is_same_spill_slot(ctx, insn->op3, mem)) {
7666			if (!ir_may_avoid_spill_load(ctx, insn->op3, ref)) {
7667				op3_reg = IR_REG_NUM(op3_reg);
7668				ir_emit_load(ctx, type, op3_reg, insn->op3);
7669			}
7670			/* avoid store to the same location */
7671			return;
7672		}
7673	}
7674
7675	if (IR_IS_CONST_REF(insn->op3)) {
7676		ir_emit_store_mem_fp_const(ctx, type, mem, insn->op3, IR_REG_NONE, op3_reg);
7677	} else {
7678		IR_ASSERT(op3_reg != IR_REG_NONE);
7679		if (IR_REG_SPILLED(op3_reg)) {
7680			op3_reg = IR_REG_NUM(op3_reg);
7681			ir_emit_load(ctx, type, op3_reg, insn->op3);
7682		}
7683		ir_emit_store_mem_fp(ctx, type, mem, op3_reg);
7684	}
7685}
7686
7687static void ir_emit_rload(ir_ctx *ctx, ir_ref def, ir_insn *insn)
7688{
7689	ir_reg src_reg = insn->op2;
7690	ir_type type = insn->type;
7691
7692	if (IR_REGSET_IN(IR_REGSET_UNION((ir_regset)ctx->fixed_regset, IR_REGSET_FIXED), src_reg)) {
7693		if (ctx->vregs[def]
7694		 && ctx->live_intervals[ctx->vregs[def]]
7695		 && ctx->live_intervals[ctx->vregs[def]]->stack_spill_pos != -1) {
7696			ir_emit_store(ctx, type, def, src_reg);
7697		}
7698	} else {
7699		ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
7700
7701		if (def_reg == IR_REG_NONE) {
7702			/* op3 is used as a flag indicating that the value is already stored in memory.
7703			 * If op3 is set, we don't have to store the value again (in case of spilling).
7704			 */
7705			if (!insn->op3 || !ir_is_same_spill_slot(ctx, def, IR_MEM_BO(ctx->spill_base, insn->op3))) {
7706				ir_emit_store(ctx, type, def, src_reg);
7707			}
7708		} else {
7709			if (src_reg != def_reg) {
7710				if (IR_IS_TYPE_INT(type)) {
7711					ir_emit_mov(ctx, type, def_reg, src_reg);
7712				} else {
7713					IR_ASSERT(IR_IS_TYPE_FP(type));
7714					ir_emit_fp_mov(ctx, type, def_reg, src_reg);
7715				}
7716			}
7717			if (IR_REG_SPILLED(ctx->regs[def][0])
7718			 && (!insn->op3 || !ir_is_same_spill_slot(ctx, def, IR_MEM_BO(ctx->spill_base, insn->op3)))) {
7719				ir_emit_store(ctx, type, def, def_reg);
7720			}
7721		}
7722	}
7723}
7724
7725static void ir_emit_rstore(ir_ctx *ctx, ir_ref ref, ir_insn *insn)
7726{
7727	ir_type type = ctx->ir_base[insn->op2].type;
7728	ir_reg op2_reg = ctx->regs[ref][2];
7729	ir_reg dst_reg = insn->op3;
7730
7731	if (op2_reg != IR_REG_NONE) {
7732		if (IR_REG_SPILLED(op2_reg)) {
7733			op2_reg = IR_REG_NUM(op2_reg);
7734			ir_emit_load(ctx, type, op2_reg, insn->op2);
7735		}
7736		if (op2_reg != dst_reg) {
7737			if (IR_IS_TYPE_INT(type)) {
7738				ir_emit_mov(ctx, type, dst_reg, op2_reg);
7739			} else {
7740				IR_ASSERT(IR_IS_TYPE_FP(type));
7741				ir_emit_fp_mov(ctx, type, dst_reg, op2_reg);
7742			}
7743		}
7744	} else {
7745		ir_emit_load_ex(ctx, type, dst_reg, insn->op2, ref);
7746	}
7747}
7748
7749static void ir_emit_alloca(ir_ctx *ctx, ir_ref def, ir_insn *insn)
7750{
7751	ir_backend_data *data = ctx->data;
7752	dasm_State **Dst = &data->dasm_state;
7753	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
7754
7755	if (ctx->use_lists[def].count == 1) {
7756		/* dead alloca */
7757		return;
7758	}
7759	if (IR_IS_CONST_REF(insn->op2)) {
7760		ir_insn *val = &ctx->ir_base[insn->op2];
7761		int32_t size = val->val.i32;
7762
7763		IR_ASSERT(IR_IS_TYPE_INT(val->type));
7764		IR_ASSERT(!IR_IS_SYM_CONST(val->op));
7765		IR_ASSERT(IR_IS_TYPE_UNSIGNED(val->type) || val->val.i64 >= 0);
7766		IR_ASSERT(IR_IS_SIGNED_32BIT(val->val.i64));
7767
7768		/* Stack must be 16 byte aligned */
7769		size = IR_ALIGNED_SIZE(size, 16);
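		/* e.g. a 24-byte request reserves 32 bytes here */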
7770		|	ASM_REG_IMM_OP sub, IR_ADDR, IR_REG_RSP, size
7771		if (!(ctx->flags & IR_USE_FRAME_POINTER)) {
7772			ctx->call_stack_size += size;
7773		}
7774	} else {
7775		int32_t alignment = 16;
7776		ir_reg op2_reg = ctx->regs[def][2];
7777		ir_type type = ctx->ir_base[insn->op2].type;
7778
7779		IR_ASSERT(ctx->flags & IR_FUNCTION);
7780		IR_ASSERT(ctx->flags & IR_USE_FRAME_POINTER);
7781		IR_ASSERT(def_reg != IR_REG_NONE);
7782		if (op2_reg != IR_REG_NONE && IR_REG_SPILLED(op2_reg)) {
7783			op2_reg = IR_REG_NUM(op2_reg);
7784			ir_emit_load(ctx, type, op2_reg, insn->op2);
7785		}
7786		if (def_reg != op2_reg) {
7787			if (op2_reg != IR_REG_NONE) {
7788				ir_emit_mov(ctx, type, def_reg, op2_reg);
7789			} else {
7790				ir_emit_load(ctx, type, def_reg, insn->op2);
7791			}
7792		}
7793
7794		|	ASM_REG_IMM_OP add, IR_ADDR, def_reg, (alignment-1)
7795		|	ASM_REG_IMM_OP and, IR_ADDR, def_reg, ~(alignment-1)
7796		|	ASM_REG_REG_OP sub, IR_ADDR, IR_REG_RSP, def_reg
7797	}
7798	if (def_reg != IR_REG_NONE) {
7799		|	mov Ra(def_reg), Ra(IR_REG_RSP)
7800		if (IR_REG_SPILLED(ctx->regs[def][0])) {
7801			ir_emit_store(ctx, insn->type, def, def_reg);
7802		}
7803	} else {
7804		ir_emit_store(ctx, IR_ADDR, def, IR_REG_STACK_POINTER);
7805	}
7806}
7807
7808static void ir_emit_afree(ir_ctx *ctx, ir_ref def, ir_insn *insn)
7809{
7810	ir_backend_data *data = ctx->data;
7811	dasm_State **Dst = &data->dasm_state;
7812
7813	if (IR_IS_CONST_REF(insn->op2)) {
7814		ir_insn *val = &ctx->ir_base[insn->op2];
7815		int32_t size = val->val.i32;
7816
7817		IR_ASSERT(IR_IS_TYPE_INT(val->type));
7818		IR_ASSERT(!IR_IS_SYM_CONST(val->op));
7819		IR_ASSERT(IR_IS_TYPE_UNSIGNED(val->type) || val->val.i64 > 0);
7820		IR_ASSERT(IR_IS_SIGNED_32BIT(val->val.i64));
7821
7822		/* Stack must be 16 byte aligned */
7823		size = IR_ALIGNED_SIZE(size, 16);
7824		|	ASM_REG_IMM_OP add, IR_ADDR, IR_REG_RSP, size
7825		if (!(ctx->flags & IR_USE_FRAME_POINTER)) {
7826			ctx->call_stack_size -= size;
7827		}
7828	} else {
7829//		int32_t alignment = 16;
7830		ir_reg op2_reg = ctx->regs[def][2];
7831		ir_type type = ctx->ir_base[insn->op2].type;
7832
7833		IR_ASSERT(ctx->flags & IR_FUNCTION);
7834		if (op2_reg != IR_REG_NONE && IR_REG_SPILLED(op2_reg)) {
7835			op2_reg = IR_REG_NUM(op2_reg);
7836			ir_emit_load(ctx, type, op2_reg, insn->op2);
7837		}
7838
7839		// TODO: alignment ???
7840
7841		|	ASM_REG_REG_OP add, IR_ADDR, IR_REG_RSP, op2_reg
7842	}
7843}
7844
7845static void ir_emit_block_begin(ir_ctx *ctx, ir_ref def, ir_insn *insn)
7846{
7847	ir_backend_data *data = ctx->data;
7848	dasm_State **Dst = &data->dasm_state;
7849	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
7850
7851	|	mov Ra(def_reg), Ra(IR_REG_RSP)
7852
7853	if (IR_REG_SPILLED(ctx->regs[def][0])) {
7854		ir_emit_store(ctx, IR_ADDR, def, def_reg);
7855	}
7856}
7857
7858static void ir_emit_block_end(ir_ctx *ctx, ir_ref def, ir_insn *insn)
7859{
7860	ir_backend_data *data = ctx->data;
7861	dasm_State **Dst = &data->dasm_state;
7862	ir_reg op2_reg = ctx->regs[def][2];
7863
7864	IR_ASSERT(op2_reg != IR_REG_NONE);
7865	if (IR_REG_SPILLED(op2_reg)) {
7866		op2_reg = IR_REG_NUM(op2_reg);
7867		ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2);
7868	}
7869
7870	|	mov Ra(IR_REG_RSP), Ra(op2_reg)
7871}
7872
7873static void ir_emit_frame_addr(ir_ctx *ctx, ir_ref def)
7874{
7875	ir_backend_data *data = ctx->data;
7876	dasm_State **Dst = &data->dasm_state;
7877	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
7878
7879	if (ctx->flags & IR_USE_FRAME_POINTER) {
7880		|	mov Ra(def_reg), Ra(IR_REG_RBP)
7881	} else {
7882		|	lea Ra(def_reg), [Ra(IR_REG_RSP)+(ctx->stack_frame_size + ctx->call_stack_size)]
7883	}
7884	if (IR_REG_SPILLED(ctx->regs[def][0])) {
7885		ir_emit_store(ctx, IR_ADDR, def, def_reg);
7886	}
7887}
7888
7889static void ir_emit_va_start(ir_ctx *ctx, ir_ref def, ir_insn *insn)
7890{
7891#if defined(_WIN64) || defined(IR_TARGET_X86)
7892	ir_backend_data *data = ctx->data;
7893	dasm_State **Dst = &data->dasm_state;
7894	ir_reg fp;
7895	int arg_area_offset;
7896	ir_reg op2_reg = ctx->regs[def][2];
7897	ir_reg tmp_reg = ctx->regs[def][3];
7898	int32_t offset;
7899
7900	IR_ASSERT(tmp_reg != IR_REG_NONE);
7901	if (op2_reg != IR_REG_NONE) {
7902		if (IR_REG_SPILLED(op2_reg)) {
7903			op2_reg = IR_REG_NUM(op2_reg);
7904			ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2);
7905		}
7906		offset = 0;
7907	} else {
7908		IR_ASSERT(ir_rule(ctx, insn->op2) == IR_STATIC_ALLOCA);
7909		op2_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
7910		offset = IR_SPILL_POS_TO_OFFSET(ctx->ir_base[insn->op2].op3);
7911	}
7912
7913	if (ctx->flags & IR_USE_FRAME_POINTER) {
7914		fp = IR_REG_FRAME_POINTER;
7915		arg_area_offset = sizeof(void*) * 2 + ctx->param_stack_size;
7916	} else {
7917		fp = IR_REG_STACK_POINTER;
7918		arg_area_offset = ctx->stack_frame_size + ctx->call_stack_size + sizeof(void*) + ctx->param_stack_size;
7919	}
7920	|	lea Ra(tmp_reg), aword [Ra(fp)+arg_area_offset]
7921	|	mov aword [Ra(op2_reg)+offset], Ra(tmp_reg)
7922#elif defined(IR_TARGET_X64)
7923|.if X64
7924	ir_backend_data *data = ctx->data;
7925	dasm_State **Dst = &data->dasm_state;
7926	ir_reg fp;
7927	int reg_save_area_offset;
7928	int overflow_arg_area_offset;
7929	ir_reg op2_reg = ctx->regs[def][2];
7930	ir_reg tmp_reg = ctx->regs[def][3];
7931	bool have_reg_save_area = 0;
7932	int32_t offset;
7933
7934	IR_ASSERT(tmp_reg != IR_REG_NONE);
7935	if (op2_reg != IR_REG_NONE) {
7936		if (IR_REG_SPILLED(op2_reg)) {
7937			op2_reg = IR_REG_NUM(op2_reg);
7938			ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2);
7939		}
7940		offset = 0;
7941	} else {
7942		IR_ASSERT(ir_rule(ctx, insn->op2) == IR_STATIC_ALLOCA);
7943		op2_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
7944		offset = IR_SPILL_POS_TO_OFFSET(ctx->ir_base[insn->op2].op3);
7945	}
7946
7947	if (ctx->flags & IR_USE_FRAME_POINTER) {
7948		fp = IR_REG_FRAME_POINTER;
7949		reg_save_area_offset = -(ctx->stack_frame_size - ctx->stack_frame_alignment - ctx->locals_area_size);
7950		overflow_arg_area_offset = sizeof(void*) * 2 + ctx->param_stack_size;
7951	} else {
7952		fp = IR_REG_STACK_POINTER;
7953		reg_save_area_offset = ctx->locals_area_size + ctx->call_stack_size;
7954		overflow_arg_area_offset = ctx->stack_frame_size + ctx->call_stack_size + sizeof(void*) + ctx->param_stack_size;
7955	}
7956
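	/* The stores below assume the System V AMD64 va_list layout (a sketch for
	 * orientation; the actual ir_va_list definition lives elsewhere in IR):
	 *
	 *     typedef struct {
	 *         uint32_t gp_offset;          // bytes consumed from the GP part of reg_save_area
	 *         uint32_t fp_offset;          // bytes consumed from the FP part of reg_save_area
	 *         void    *overflow_arg_area;  // next stack-passed argument
	 *         void    *reg_save_area;      // spilled GP and XMM argument registers
	 *     } ir_va_list;
	 */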
7957	if ((ctx->flags2 & (IR_HAS_VA_ARG_GP|IR_HAS_VA_COPY)) && ctx->gp_reg_params < IR_REG_INT_ARGS) {
7958		|	lea Ra(tmp_reg), aword [Ra(fp)+reg_save_area_offset]
7959		have_reg_save_area = 1;
7960		/* Set va_list.gp_offset */
7961		|	mov dword [Ra(op2_reg)+(offset+offsetof(ir_va_list, gp_offset))], sizeof(void*) * ctx->gp_reg_params
7962	} else {
7963		reg_save_area_offset -= sizeof(void*) * IR_REG_INT_ARGS;
7964		/* Set va_list.gp_offset */
7965		|	mov dword [Ra(op2_reg)+(offset+offsetof(ir_va_list, gp_offset))], sizeof(void*) * IR_REG_INT_ARGS
7966	}
7967	if ((ctx->flags2 & (IR_HAS_VA_ARG_FP|IR_HAS_VA_COPY)) && ctx->fp_reg_params < IR_REG_FP_ARGS) {
7968		if (!have_reg_save_area) {
7969			|	lea Ra(tmp_reg), aword [Ra(fp)+reg_save_area_offset]
7970			have_reg_save_area = 1;
7971		}
7972		/* Set va_list.fp_offset */
7973		|	mov dword [Ra(op2_reg)+(offset+offsetof(ir_va_list, fp_offset))], sizeof(void*) * IR_REG_INT_ARGS + 16 * ctx->fp_reg_params
7974	} else {
7975		/* Set va_list.fp_offset */
7976		|	mov dword [Ra(op2_reg)+(offset+offsetof(ir_va_list, fp_offset))], sizeof(void*) * IR_REG_INT_ARGS + 16 * IR_REG_FP_ARGS
7977	}
7978	if (have_reg_save_area) {
7979		/* Set va_list.reg_save_area */
7980		|	mov qword [Ra(op2_reg)+(offset+offsetof(ir_va_list, reg_save_area))], Ra(tmp_reg)
7981	}
7982	|	lea Ra(tmp_reg), aword [Ra(fp)+overflow_arg_area_offset]
7983	/* Set va_list.overflow_arg_area */
7984	|	mov qword [Ra(op2_reg)+(offset+offsetof(ir_va_list, overflow_arg_area))], Ra(tmp_reg)
7985|.endif
7986#else
7987	IR_ASSERT(0 && "NIY va_start");
7988#endif
7989}
7990
7991static void ir_emit_va_copy(ir_ctx *ctx, ir_ref def, ir_insn *insn)
7992{
7993#if defined(_WIN64) || defined(IR_TARGET_X86)
7994	ir_backend_data *data = ctx->data;
7995	dasm_State **Dst = &data->dasm_state;
7996	ir_reg tmp_reg = ctx->regs[def][1];
7997	ir_reg op2_reg = ctx->regs[def][2];
7998	ir_reg op3_reg = ctx->regs[def][3];
7999	int32_t op2_offset, op3_offset;
8000
8001	IR_ASSERT(tmp_reg != IR_REG_NONE);
8002	if (op2_reg != IR_REG_NONE) {
8003		if (IR_REG_SPILLED(op2_reg)) {
8004			op2_reg = IR_REG_NUM(op2_reg);
8005			ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2);
8006		}
8007		op2_offset = 0;
8008	} else {
8009		IR_ASSERT(ir_rule(ctx, insn->op2) == IR_STATIC_ALLOCA);
8010		op2_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
8011		op2_offset = IR_SPILL_POS_TO_OFFSET(ctx->ir_base[insn->op2].op3);
8012	}
8013	if (op3_reg != IR_REG_NONE) {
8014		if (IR_REG_SPILLED(op3_reg)) {
8015			op3_reg = IR_REG_NUM(op3_reg);
8016			ir_emit_load(ctx, IR_ADDR, op3_reg, insn->op3);
8017		}
8018		op3_offset = 0;
8019	} else {
8020		IR_ASSERT(ir_rule(ctx, insn->op3) == IR_STATIC_ALLOCA);
8021		op3_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
8022		op3_offset = IR_SPILL_POS_TO_OFFSET(ctx->ir_base[insn->op3].op3);
8023	}
8024	|	mov Ra(tmp_reg), aword [Ra(op3_reg)+op3_offset]
8025	|	mov aword [Ra(op2_reg)+op2_offset], Ra(tmp_reg)
8026#elif defined(IR_TARGET_X64)
8027|.if X64
8028	ir_backend_data *data = ctx->data;
8029	dasm_State **Dst = &data->dasm_state;
8030	ir_reg tmp_reg = ctx->regs[def][1];
8031	ir_reg op2_reg = ctx->regs[def][2];
8032	ir_reg op3_reg = ctx->regs[def][3];
8033	int32_t op2_offset, op3_offset;
8034
8035	IR_ASSERT(tmp_reg != IR_REG_NONE);
8036	if (op2_reg != IR_REG_NONE) {
8037		if (IR_REG_SPILLED(op2_reg)) {
8038			op2_reg = IR_REG_NUM(op2_reg);
8039			ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2);
8040		}
8041		op2_offset = 0;
8042	} else {
8043		IR_ASSERT(ir_rule(ctx, insn->op2) == IR_STATIC_ALLOCA);
8044		op2_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
8045		op2_offset = IR_SPILL_POS_TO_OFFSET(ctx->ir_base[insn->op2].op3);
8046	}
8047	if (op3_reg != IR_REG_NONE) {
8048		if (IR_REG_SPILLED(op3_reg)) {
8049			op3_reg = IR_REG_NUM(op3_reg);
8050			ir_emit_load(ctx, IR_ADDR, op3_reg, insn->op3);
8051		}
8052		op3_offset = 0;
8053	} else {
8054		IR_ASSERT(ir_rule(ctx, insn->op3) == IR_STATIC_ALLOCA);
8055		op3_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
8056		op3_offset = IR_SPILL_POS_TO_OFFSET(ctx->ir_base[insn->op3].op3);
8057	}
8058	|	mov Rd(tmp_reg), dword [Ra(op3_reg)+(op3_offset+offsetof(ir_va_list, gp_offset))]
8059	|	mov dword [Ra(op2_reg)+(op2_offset+offsetof(ir_va_list, gp_offset))], Rd(tmp_reg)
8060	|	mov Rd(tmp_reg), dword [Ra(op3_reg)+(op3_offset+offsetof(ir_va_list, fp_offset))]
8061	|	mov dword [Ra(op2_reg)+(op2_offset+offsetof(ir_va_list, fp_offset))], Rd(tmp_reg)
8062	|	mov Ra(tmp_reg), aword [Ra(op3_reg)+(op3_offset+offsetof(ir_va_list, overflow_arg_area))]
8063	|	mov aword [Ra(op2_reg)+(op2_offset+offsetof(ir_va_list, overflow_arg_area))], Ra(tmp_reg)
8064	|	mov Ra(tmp_reg), aword [Ra(op3_reg)+(op3_offset+offsetof(ir_va_list, reg_save_area))]
8065	|	mov aword [Ra(op2_reg)+(op2_offset+offsetof(ir_va_list, reg_save_area))], Ra(tmp_reg)
8066|.endif
8067#else
8068	IR_ASSERT(0 && "NIY va_copy");
8069#endif
8070}
8071
8072static void ir_emit_va_arg(ir_ctx *ctx, ir_ref def, ir_insn *insn)
8073{
8074#if defined(_WIN64) || defined(IR_TARGET_X86)
8075	ir_backend_data *data = ctx->data;
8076	dasm_State **Dst = &data->dasm_state;
8077	ir_type type = insn->type;
8078	ir_reg def_reg = ctx->regs[def][0];
8079	ir_reg op2_reg = ctx->regs[def][2];
8080	ir_reg tmp_reg = ctx->regs[def][3];
8081	int32_t offset;
8082
8083	IR_ASSERT(def_reg != IR_REG_NONE && tmp_reg != IR_REG_NONE);
8084	if (op2_reg != IR_REG_NONE) {
8085		if (IR_REG_SPILLED(op2_reg)) {
8086			op2_reg = IR_REG_NUM(op2_reg);
8087			ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2);
8088		}
8089		offset = 0;
8090	} else {
8091		IR_ASSERT(ir_rule(ctx, insn->op2) == IR_STATIC_ALLOCA);
8092		op2_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
8093		offset = IR_SPILL_POS_TO_OFFSET(ctx->ir_base[insn->op2].op3);
8094	}
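	/* On x86 and Windows x64 a va_list is a plain pointer: load it, read the
	 * argument it points to, advance it past the (at least pointer-sized)
	 * slot, and store it back. */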
8095	|	mov Ra(tmp_reg), aword [Ra(op2_reg)+offset]
8096	ir_emit_load_mem(ctx, type, def_reg, IR_MEM_B(tmp_reg));
8097	|	add Ra(tmp_reg), IR_MAX(ir_type_size[type], sizeof(void*))
8098	|	mov aword [Ra(op2_reg)+offset], Ra(tmp_reg)
8099	if (IR_REG_SPILLED(ctx->regs[def][0])) {
8100		ir_emit_store(ctx, type, def, def_reg);
8101	}
8102#elif defined(IR_TARGET_X64)
8103|.if X64
8104	ir_backend_data *data = ctx->data;
8105	dasm_State **Dst = &data->dasm_state;
8106	ir_type type = insn->type;
8107	ir_reg def_reg = ctx->regs[def][0];
8108	ir_reg op2_reg = ctx->regs[def][2];
8109	ir_reg tmp_reg = ctx->regs[def][3];
8110	int32_t offset;
8111
8112	IR_ASSERT(def_reg != IR_REG_NONE && tmp_reg != IR_REG_NONE);
8113	if (op2_reg != IR_REG_NONE) {
8114		if (IR_REG_SPILLED(op2_reg)) {
8115			op2_reg = IR_REG_NUM(op2_reg);
8116			ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2);
8117		}
8118		offset = 0;
8119	} else {
8120		IR_ASSERT(ir_rule(ctx, insn->op2) == IR_STATIC_ALLOCA);
8121		op2_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
8122		offset = IR_SPILL_POS_TO_OFFSET(ctx->ir_base[insn->op2].op3);
8123	}
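	/* System V va_arg: gp_offset/fp_offset count bytes already consumed from
	 * reg_save_area; once gp_offset reaches sizeof(void*) * IR_REG_INT_ARGS
	 * (48 bytes with the usual six GP argument registers), the argument is
	 * taken from overflow_arg_area instead. */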
8124	if (IR_IS_TYPE_INT(type)) {
8125		|	mov Rd(tmp_reg), dword [Ra(op2_reg)+(offset+offsetof(ir_va_list, gp_offset))]
8126		|	cmp Rd(tmp_reg), sizeof(void*)*IR_REG_INT_ARGS
8127		|	jge >1
8128		|	add Rd(tmp_reg), sizeof(void*)
8129		|	mov dword [Ra(op2_reg)+(offset+offsetof(ir_va_list, gp_offset))], Rd(tmp_reg)
8130		|	add Ra(tmp_reg), aword [Ra(op2_reg)+(offset+offsetof(ir_va_list, reg_save_area))]
8131		|	jmp >2
8132		|1:
8133		|	mov Ra(tmp_reg), aword [Ra(op2_reg)+(offset+offsetof(ir_va_list, overflow_arg_area))]
8134		|	add Ra(tmp_reg), sizeof(void*)
8135		|	mov aword [Ra(op2_reg)+(offset+offsetof(ir_va_list, overflow_arg_area))], Ra(tmp_reg)
8136		|2:
8137		|	mov Ra(def_reg), aword [Ra(tmp_reg)-sizeof(void*)]
8138	} else {
8139		|	mov Rd(tmp_reg), dword [Ra(op2_reg)+(offset+offsetof(ir_va_list, fp_offset))]
8140		|	cmp Rd(tmp_reg), sizeof(void*) * IR_REG_INT_ARGS + 16 * IR_REG_FP_ARGS
8141		|	jge >1
8142		|	add Rd(tmp_reg), 16
8143		|	mov dword [Ra(op2_reg)+(offset+offsetof(ir_va_list, fp_offset))], Rd(tmp_reg)
8144		|	add Ra(tmp_reg), aword [Ra(op2_reg)+(offset+offsetof(ir_va_list, reg_save_area))]
8145		ir_emit_load_mem_fp(ctx, type, def_reg, IR_MEM_BO(tmp_reg, -16));
8146		|	jmp >2
8147		|1:
8148		|	mov Ra(tmp_reg), aword [Ra(op2_reg)+(offset+offsetof(ir_va_list, overflow_arg_area))]
8149		ir_emit_load_mem_fp(ctx, type, def_reg, IR_MEM_BO(tmp_reg, 0));
8150		|	add Ra(tmp_reg), 8
8151		|	mov aword [Ra(op2_reg)+(offset+offsetof(ir_va_list, overflow_arg_area))], Ra(tmp_reg)
8152		|2:
8153	}
8154	if (IR_REG_SPILLED(ctx->regs[def][0])) {
8155		ir_emit_store(ctx, type, def, def_reg);
8156	}
8157|.endif
8158#else
8159	IR_ASSERT(0 && "NIY va_arg");
8160#endif
8161}
8162
8163static void ir_emit_switch(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn)
8164{
8165	ir_backend_data *data = ctx->data;
8166	dasm_State **Dst = &data->dasm_state;
8167	ir_type type;
8168	ir_block *bb;
8169	ir_insn *use_insn, *val;
8170	uint32_t n, *p, use_block;
8171	int i;
8172	int label, default_label = 0;
8173	int count = 0;
8174	ir_val min, max;
8175	ir_reg op2_reg = ctx->regs[def][2];
8176	ir_reg tmp_reg = ctx->regs[def][3];
8177
8178	type = ctx->ir_base[insn->op2].type;
8179	IR_ASSERT(tmp_reg != IR_REG_NONE);
8180	if (IR_IS_TYPE_SIGNED(type)) {
8181		min.u64 = 0x7fffffffffffffff;
8182		max.u64 = 0x8000000000000000;
8183	} else {
8184		min.u64 = 0xffffffffffffffff;
8185		max.u64 = 0x0;
8186	}
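	/* min/max are seeded with the extreme representable values, so the first
	 * IR_CASE_VAL visited below replaces both */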
8187
8188	bb = &ctx->cfg_blocks[b];
8189	p = &ctx->cfg_edges[bb->successors];
8190	for (n = bb->successors_count; n != 0; p++, n--) {
8191		use_block = *p;
8192		use_insn = &ctx->ir_base[ctx->cfg_blocks[use_block].start];
8193		if (use_insn->op == IR_CASE_VAL) {
8194			val = &ctx->ir_base[use_insn->op2];
8195			IR_ASSERT(!IR_IS_SYM_CONST(val->op));
8196			if (IR_IS_TYPE_SIGNED(type)) {
8197				IR_ASSERT(IR_IS_TYPE_SIGNED(val->type));
8198				min.i64 = IR_MIN(min.i64, val->val.i64);
8199				max.i64 = IR_MAX(max.i64, val->val.i64);
8200			} else {
8201				IR_ASSERT(!IR_IS_TYPE_SIGNED(val->type));
8202				min.u64 = (int64_t)IR_MIN(min.u64, val->val.u64);
8203				max.u64 = (int64_t)IR_MAX(max.u64, val->val.u64);
8204			}
8205			count++;
8206		} else {
8207			IR_ASSERT(use_insn->op == IR_CASE_DEFAULT);
8208			default_label = ir_skip_empty_target_blocks(ctx, use_block);
8209		}
8210	}
8211
8212	IR_ASSERT(op2_reg != IR_REG_NONE);
8213	if (IR_REG_SPILLED(op2_reg)) {
8214		op2_reg = IR_REG_NUM(op2_reg);
8215		ir_emit_load(ctx, type, op2_reg, insn->op2);
8216	}
8217
8218	/* Generate a table jmp or a sequence of compare-and-branch instructions */
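	/* The jump table is preferred when the case values are dense: e.g. 4 cases
	 * spanning 10..25 give a range of 15 < 4 * 8 = 32, so a 16-entry table is
	 * emitted (a sketch of the heuristic below, not an exact cost model). */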
8219	if (count > 2 && (max.i64-min.i64) < count * 8) {
8220		int *labels = ir_mem_malloc(sizeof(int) * (size_t)(max.i64 - min.i64 + 1));
8221
8222		for (i = 0; i <= (max.i64 - min.i64); i++) {
8223			labels[i] = default_label;
8224		}
8225		p = &ctx->cfg_edges[bb->successors];
8226		for (n = bb->successors_count; n != 0; p++, n--) {
8227			use_block = *p;
8228			use_insn = &ctx->ir_base[ctx->cfg_blocks[use_block].start];
8229			if (use_insn->op == IR_CASE_VAL) {
8230				val = &ctx->ir_base[use_insn->op2];
8231				IR_ASSERT(!IR_IS_SYM_CONST(val->op));
8232				label = ir_skip_empty_target_blocks(ctx, use_block);
8233				labels[val->val.i64 - min.i64] = label;
8234			}
8235		}
8236
8237		switch (ir_type_size[type]) {
8238			default:
8239				IR_ASSERT(0 && "Unsupported type size");
8240			case 1:
8241				if (IR_IS_TYPE_SIGNED(type)) {
8242					|	movsx Ra(op2_reg), Rb(op2_reg)
8243				} else {
8244					|	movzx Ra(op2_reg), Rb(op2_reg)
8245				}
8246				break;
8247			case 2:
8248				if (IR_IS_TYPE_SIGNED(type)) {
8249					|	movsx Ra(op2_reg), Rw(op2_reg)
8250				} else {
8251					|	movzx Ra(op2_reg), Rw(op2_reg)
8252				}
8253				break;
8254			case 4:
8255|.if X64
8256				if (IR_IS_TYPE_SIGNED(type)) {
8257					|	movsxd Ra(op2_reg), Rd(op2_reg)
8258				} else {
8259					|	mov Rd(op2_reg), Rd(op2_reg)    // a 32-bit mov implicitly zero-extends to 64 bits
8260				}
8261				break;
8262||			case 8:
8263|.endif
8264				break;
8265		}
8266
8267		if (min.i64 != 0) {
8268			int64_t offset = -min.i64;
8269
8270			if (IR_IS_SIGNED_32BIT(offset)) {
8271				|	lea Ra(tmp_reg), [Ra(op2_reg)+(int32_t)offset]
8272			} else {
8273				IR_ASSERT(sizeof(void*) == 8);
8274|.if X64
8275				|	mov64 Rq(tmp_reg), offset
8276				|	add Ra(tmp_reg), Ra(op2_reg)
8277|.endif
8278			}
8279			if (default_label) {
8280				offset = max.i64 - min.i64;
8281
8282				IR_ASSERT(IR_IS_SIGNED_32BIT(offset));
8283				|	cmp Ra(tmp_reg), (int32_t)offset
8284				|	ja =>default_label
8285			}
8286|.if X64
8287			if (ctx->code_buffer
8288			 && IR_IS_SIGNED_32BIT((char*)ctx->code_buffer->start)
8289			 && IR_IS_SIGNED_32BIT((char*)ctx->code_buffer->end)) {
8290				|	jmp aword [Ra(tmp_reg)*8+>1]
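				/* with a 32-bit-addressable code buffer the absolute address of the
				 * table at >1 fits into the jmp's SIB disp32, so no extra register
				 * is needed */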
8291			} else {
8292				int64_t offset = -min.i64;
8293
8294				IR_ASSERT(IR_IS_SIGNED_32BIT(offset));
8295				offset *= 8;
8296				IR_ASSERT(IR_IS_SIGNED_32BIT(offset));
8297				|	lea Ra(tmp_reg), aword [>1]
8298				|	jmp aword [Ra(tmp_reg)+Ra(op2_reg)*8+offset]
8299			}
8300|.else
8301			|	jmp aword [Ra(tmp_reg)*4+>1]
8302|.endif
8303		} else {
8304			if (default_label) {
8305				int64_t offset = max.i64;
8306
8307				IR_ASSERT(IR_IS_SIGNED_32BIT(offset));
8308				|	cmp Ra(op2_reg), (int32_t)offset
8309				|	ja =>default_label
8310			}
8311|.if X64
8312			if (ctx->code_buffer
8313			 && IR_IS_SIGNED_32BIT((char*)ctx->code_buffer->start)
8314			 && IR_IS_SIGNED_32BIT((char*)ctx->code_buffer->end)) {
8315				|	jmp aword [Ra(op2_reg)*8+>1]
8316			} else {
8317				|	lea Ra(tmp_reg), aword [>1]
8318				|	jmp aword [Ra(tmp_reg)+Ra(op2_reg)*8]
8319			}
8320|.else
8321			|	jmp aword [Ra(op2_reg)*4+>1]
8322|.endif
8323		}
8324
8325		|.jmp_table
8326		if (!data->jmp_table_label) {
8327			data->jmp_table_label = ctx->cfg_blocks_count + ctx->consts_count + 3;
8328			|=>data->jmp_table_label:
8329		}
8330		|.align aword
8331		|1:
8332		for (i = 0; i <= (max.i64 - min.i64); i++) {
8333			int b = labels[i];
8334			if (b) {
8335				ir_block *bb = &ctx->cfg_blocks[b];
8336				ir_insn *insn = &ctx->ir_base[bb->end];
8337
8338				if (insn->op == IR_IJMP && IR_IS_CONST_REF(insn->op2)) {
8339					ir_ref prev = ctx->prev_ref[bb->end];
8340					if (prev != bb->start && ctx->ir_base[prev].op == IR_SNAPSHOT) {
8341						prev = ctx->prev_ref[prev];
8342					}
8343					if (prev == bb->start) {
8344						void *addr = ir_jmp_addr(ctx, insn, &ctx->ir_base[insn->op2]);
8345
8346						|	.aword &addr
8347						if (ctx->ir_base[bb->start].op != IR_CASE_DEFAULT) {
8348							bb->flags |= IR_BB_EMPTY;
8349						}
8350						continue;
8351					}
8352				}
8353				|	.aword =>b
8354			} else {
8355				|	.aword 0
8356			}
8357		}
8358		|.code
8359		ir_mem_free(labels);
8360	} else {
8361		p = &ctx->cfg_edges[bb->successors];
8362		for (n = bb->successors_count; n != 0; p++, n--) {
8363			use_block = *p;
8364			use_insn = &ctx->ir_base[ctx->cfg_blocks[use_block].start];
8365			if (use_insn->op == IR_CASE_VAL) {
8366				val = &ctx->ir_base[use_insn->op2];
8367				IR_ASSERT(!IR_IS_SYM_CONST(val->op));
8368				label = ir_skip_empty_target_blocks(ctx, use_block);
8369				if (IR_IS_32BIT(type, val->val)) {
8370					|	ASM_REG_IMM_OP cmp, type, op2_reg, val->val.i32
8371				} else {
8372					IR_ASSERT(sizeof(void*) == 8);
8373|.if X64
8374					|	mov64 Ra(tmp_reg), val->val.i64
8375					|	ASM_REG_REG_OP cmp, type, op2_reg, tmp_reg
8376|.endif
8377				}
8378				|	je =>label
8379			}
8380		}
8381		if (default_label) {
8382			|	jmp =>default_label
8383		}
8384	}
8385}
8386
8387static int32_t ir_call_used_stack(ir_ctx *ctx, ir_insn *insn)
8388{
8389	int j, n;
8390	ir_type type;
8391	int int_param = 0;
8392	int fp_param = 0;
8393	int int_reg_params_count = IR_REG_INT_ARGS;
8394	int fp_reg_params_count = IR_REG_FP_ARGS;
8395	int32_t used_stack = 0;
8396
8397#ifdef IR_HAVE_FASTCALL
8398	if (sizeof(void*) == 4 && ir_is_fastcall(ctx, insn)) {
8399		int_reg_params_count = IR_REG_INT_FCARGS;
8400		fp_reg_params_count = IR_REG_FP_FCARGS;
8401	}
8402#endif
8403
8404	n = insn->inputs_count;
8405	for (j = 3; j <= n; j++) {
8406		type = ctx->ir_base[ir_insn_op(insn, j)].type;
8407		if (IR_IS_TYPE_INT(type)) {
8408			if (int_param >= int_reg_params_count) {
8409				used_stack += IR_MAX(sizeof(void*), ir_type_size[type]);
8410			}
8411			int_param++;
8412#ifdef _WIN64
8413			/* the WIN64 calling convention uses a common counter for int and fp registers */
8414			fp_param++;
8415#endif
8416		} else {
8417			IR_ASSERT(IR_IS_TYPE_FP(type));
8418			if (fp_param >= fp_reg_params_count) {
8419				used_stack += IR_MAX(sizeof(void*), ir_type_size[type]);
8420			}
8421			fp_param++;
8422#ifdef _WIN64
8423			/* the WIN64 calling convention uses a common counter for int and fp registers */
8424			int_param++;
8425#endif
8426		}
8427	}
8428
8429	/* Reserve "home space" or "shadow store" for register arguments (used by the Windows x64 ABI) */
8430	used_stack += IR_SHADOW_ARGS;
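	/* e.g. a System V x86_64 call with 8 integer arguments passes 6 in
	 * registers and 2 on the stack, giving used_stack = 16; IR_SHADOW_ARGS
	 * is 0 there, while the Windows x64 ABI adds its 32-byte shadow space
	 * (a sketch assuming the usual six GP argument registers). */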
8431
8432	return used_stack;
8433}
8434
8435static int32_t ir_emit_arguments(ir_ctx *ctx, ir_ref def, ir_insn *insn, ir_reg tmp_reg)
8436{
8437	ir_backend_data *data = ctx->data;
8438	dasm_State **Dst = &data->dasm_state;
8439	int j, n;
8440	ir_ref arg;
8441	ir_insn *arg_insn;
8442	uint8_t type;
8443	ir_reg src_reg, dst_reg;
8444	int int_param = 0;
8445	int fp_param = 0;
8446	int count = 0;
8447	int int_reg_params_count = IR_REG_INT_ARGS;
8448	int fp_reg_params_count = IR_REG_FP_ARGS;
8449	const int8_t *int_reg_params = _ir_int_reg_params;
8450	const int8_t *fp_reg_params = _ir_fp_reg_params;
8451	int32_t used_stack, stack_offset = IR_SHADOW_ARGS;
8452	ir_copy *copies;
8453	bool do_pass3 = 0;
8454	/* For temporaries we may use any scratch registers except for registers used for parameters */
8455	ir_reg tmp_fp_reg = IR_REG_FP_LAST; /* Temporary register for FP loads and swap */
8456
8457	n = insn->inputs_count;
8458	if (n < 3) {
8459		return 0;
8460	}
8461
8462	if (tmp_reg == IR_REG_NONE) {
8463		tmp_reg = IR_REG_RAX;
8464	}
8465
8466#ifdef IR_HAVE_FASTCALL
8467	if (sizeof(void*) == 4 && ir_is_fastcall(ctx, insn)) {
8468		int_reg_params_count = IR_REG_INT_FCARGS;
8469		fp_reg_params_count = IR_REG_FP_FCARGS;
8470		int_reg_params = _ir_int_fc_reg_params;
8471		fp_reg_params = _ir_fp_fc_reg_params;
8472	}
8473#endif
8474
8475	if (insn->op == IR_CALL
8476	 && (ctx->flags & IR_PREALLOCATED_STACK)
8477#ifdef IR_HAVE_FASTCALL
8478	 && !ir_is_fastcall(ctx, insn) /* fast call functions restore stack pointer */
8479#endif
8480	) {
8481		// TODO: support for preallocated stack
8482		used_stack = 0;
8483	} else {
8484		used_stack = ir_call_used_stack(ctx, insn);
8485		if (IR_SHADOW_ARGS
8486		 && insn->op == IR_TAILCALL
8487		 && used_stack == IR_SHADOW_ARGS) {
8488			used_stack = 0;
8489		}
8490		if (ctx->fixed_call_stack_size
8491		 && used_stack <= ctx->fixed_call_stack_size
8492#ifdef IR_HAVE_FASTCALL
8493		 && !ir_is_fastcall(ctx, insn) /* fast call functions restore stack pointer */
8494#endif
8495		) {
8496			used_stack = 0;
8497		} else {
8498			/* Stack must be 16 byte aligned */
8499			int32_t aligned_stack = IR_ALIGNED_SIZE(used_stack, 16);
8500			ctx->call_stack_size += aligned_stack;
8501			if (aligned_stack) {
8502				|	sub Ra(IR_REG_RSP), aligned_stack
8503			}
8504		}
8505	}
8506
8507	/* 1. move all register arguments that should be passed through the stack
8508	 *    and collect arguments that should be passed through registers */
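	/* REG->REG moves are collected rather than emitted immediately because
	 * they may form cycles (e.g. a call whose first two arguments arrive in
	 * each other's registers needs an rdi <-> rsi swap); ir_parallel_copy
	 * below resolves such cycles with the temporary registers. */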
8509	copies = ir_mem_malloc((n - 2) * sizeof(ir_copy));
8510	for (j = 3; j <= n; j++) {
8511		arg = ir_insn_op(insn, j);
8512		src_reg = ir_get_alocated_reg(ctx, def, j);
8513		arg_insn = &ctx->ir_base[arg];
8514		type = arg_insn->type;
8515		if (IR_IS_TYPE_INT(type)) {
8516			if (int_param < int_reg_params_count) {
8517				dst_reg = int_reg_params[int_param];
8518			} else {
8519				dst_reg = IR_REG_NONE; /* pass argument through stack */
8520			}
8521			int_param++;
8522#ifdef _WIN64
8523			/* the WIN64 calling convention uses a common counter for int and fp registers */
8524			fp_param++;
8525#endif
8526		} else {
8527			IR_ASSERT(IR_IS_TYPE_FP(type));
8528			if (fp_param < fp_reg_params_count) {
8529				dst_reg = fp_reg_params[fp_param];
8530			} else {
8531				dst_reg = IR_REG_NONE; /* pass argument through stack */
8532			}
8533			fp_param++;
8534#ifdef _WIN64
8535			/* the WIN64 calling convention uses a common counter for int and fp registers */
8536			int_param++;
8537#endif
8538		}
8539		if (dst_reg != IR_REG_NONE) {
8540			if (src_reg == IR_REG_NONE) {
8541				/* delay CONST->REG and MEM->REG moves to third pass */
8542				do_pass3 = 1;
8543			} else {
8544				if (IR_REG_SPILLED(src_reg)) {
8545					src_reg = IR_REG_NUM(src_reg);
8546					ir_emit_load(ctx, type, src_reg, arg);
8547				}
8548				if (src_reg != dst_reg) {
8549					/* delay REG->REG moves to second pass */
8550					copies[count].type = type;
8551					copies[count].from = src_reg;
8552					copies[count].to = dst_reg;
8553					count++;
8554				}
8555			}
8556		} else {
8557			/* Pass register arguments to stack (REG->MEM moves) */
8558			if (!IR_IS_CONST_REF(arg) && src_reg != IR_REG_NONE && !IR_REG_SPILLED(src_reg)) {
8559				ir_emit_store_mem(ctx, type, IR_MEM_BO(IR_REG_STACK_POINTER, stack_offset), src_reg);
8560			} else {
8561				do_pass3 = 1;
8562			}
8563			stack_offset += IR_MAX(sizeof(void*), ir_type_size[type]);
8564		}
8565	}
8566
8567	/* 2. move all arguments that should be passed from one register to another (REG->REG moves) */
8568	if (count) {
8569		ir_parallel_copy(ctx, copies, count, tmp_reg, tmp_fp_reg);
8570	}
8571	ir_mem_free(copies);
8572
8573	/* 3. move the remaining memory and immediate values */
8574	if (do_pass3) {
8575		stack_offset = IR_SHADOW_ARGS;
8576		int_param = 0;
8577		fp_param = 0;
8578		for (j = 3; j <= n; j++) {
8579			arg = ir_insn_op(insn, j);
8580			src_reg = ir_get_alocated_reg(ctx, def, j);
8581			arg_insn = &ctx->ir_base[arg];
8582			type = arg_insn->type;
8583			if (IR_IS_TYPE_INT(type)) {
8584				if (int_param < int_reg_params_count) {
8585					dst_reg = int_reg_params[int_param];
8586				} else {
8587					dst_reg = IR_REG_NONE; /* argument already passed through stack */
8588				}
8589				int_param++;
8590#ifdef _WIN64
8591				/* the WIN64 calling convention uses a common counter for int and fp registers */
8592				fp_param++;
8593#endif
8594			} else {
8595				IR_ASSERT(IR_IS_TYPE_FP(type));
8596				if (fp_param < fp_reg_params_count) {
8597					dst_reg = fp_reg_params[fp_param];
8598				} else {
8599					dst_reg = IR_REG_NONE; /* argument already passed through stack */
8600				}
8601				fp_param++;
8602#ifdef _WIN64
8603				/* the WIN64 calling convention uses a common counter for int and fp registers */
8604				int_param++;
8605#endif
8606			}
8607			if (dst_reg != IR_REG_NONE) {
8608				if (src_reg == IR_REG_NONE) {
8609					if (IR_IS_TYPE_INT(type)) {
8610						if (IR_IS_CONST_REF(arg)) {
8611							if (type == IR_I8 || type == IR_I16) {
8612								type = IR_I32;
8613							} else if (type == IR_U8 || type == IR_U16) {
8614								type = IR_U32;
8615							}
8616							ir_emit_load(ctx, type, dst_reg, arg);
8617						} else if (ctx->vregs[arg]) {
8618							ir_mem mem = ir_ref_spill_slot(ctx, arg);
8619
8620							if (ir_type_size[type] > 2) {
8621								ir_emit_load_mem_int(ctx, type, dst_reg, mem);
8622							} else if (ir_type_size[type] == 2) {
8623								if (type == IR_I16) {
8624									|	ASM_TXT_TMEM_OP movsx, Rd(dst_reg), word, mem
8625								} else {
8626									|	ASM_TXT_TMEM_OP movzx, Rd(dst_reg), word, mem
8627								}
8628							} else {
8629								IR_ASSERT(ir_type_size[type] == 1);
8630								if (type == IR_I8) {
8631									|	ASM_TXT_TMEM_OP movsx, Rd(dst_reg), byte, mem
8632								} else {
8633									|	ASM_TXT_TMEM_OP movzx, Rd(dst_reg), byte, mem
8634								}
8635							}
8636						} else {
8637							ir_load_local_addr(ctx, dst_reg, arg);
8638						}
8639					} else {
8640						ir_emit_load(ctx, type, dst_reg, arg);
8641					}
8642				}
8643			} else {
8644				ir_mem mem = IR_MEM_BO(IR_REG_STACK_POINTER, stack_offset);
8645
8646				if (IR_IS_TYPE_INT(type)) {
8647					if (IR_IS_CONST_REF(arg)) {
8648						ir_emit_store_mem_int_const(ctx, type, mem, arg, tmp_reg, 1);
8649					} else if (src_reg == IR_REG_NONE) {
8650						IR_ASSERT(tmp_reg != IR_REG_NONE);
8651						ir_emit_load(ctx, type, tmp_reg, arg);
8652						ir_emit_store_mem_int(ctx, type, mem, tmp_reg);
8653					} else if (IR_REG_SPILLED(src_reg)) {
8654						src_reg = IR_REG_NUM(src_reg);
8655						ir_emit_load(ctx, type, src_reg, arg);
8656						ir_emit_store_mem_int(ctx, type, mem, src_reg);
8657					}
8658				} else {
8659					if (IR_IS_CONST_REF(arg)) {
8660						ir_emit_store_mem_fp_const(ctx, type, mem, arg, tmp_reg, tmp_fp_reg);
8661					} else if (src_reg == IR_REG_NONE) {
8662						IR_ASSERT(tmp_fp_reg != IR_REG_NONE);
8663						ir_emit_load(ctx, type, tmp_fp_reg, arg);
8664						ir_emit_store_mem_fp(ctx, type, mem, tmp_fp_reg);
8665					} else if (IR_REG_SPILLED(src_reg)) {
8666						src_reg = IR_REG_NUM(src_reg);
8667						ir_emit_load(ctx, type, src_reg, arg);
8668						ir_emit_store_mem_fp(ctx, type, mem, src_reg);
8669					}
8670				}
8671				stack_offset += IR_MAX(sizeof(void*), ir_type_size[type]);
8672			}
8673		}
8674	}
8675
8676#ifdef _WIN64
8677	/* the WIN64 calling convention requires duplication of parameters passed in FP registers into GP ones */
8678	if (ir_is_vararg(ctx, insn)) {
8679		n = IR_MIN(n, IR_MAX_REG_ARGS + 2);
8680		for (j = 3; j <= n; j++) {
8681			arg = ir_insn_op(insn, j);
8682			arg_insn = &ctx->ir_base[arg];
8683			type = arg_insn->type;
8684			if (IR_IS_TYPE_FP(type)) {
8685				src_reg = fp_reg_params[j-3];
8686				dst_reg = int_reg_params[j-3];
8687|.if X64
8688				if (ctx->mflags & IR_X86_AVX) {
8689					|	vmovd Rq(dst_reg), xmm(src_reg-IR_REG_FP_FIRST)
8690				} else {
8691					|	movd Rq(dst_reg), xmm(src_reg-IR_REG_FP_FIRST)
8692				}
8693|.endif
8694			}
8695		}
8696	}
8697#endif
8698#ifdef IR_REG_VARARG_FP_REGS
8699	/* set the hidden argument (%al on System V AMD64) to the number of vector registers used */
8700	if (ir_is_vararg(ctx, insn)) {
8701		fp_param = IR_MIN(fp_param, fp_reg_params_count);
8702		|	mov Rd(IR_REG_VARARG_FP_REGS), fp_param
8703	}
8704#endif
8705
8706	return used_stack;
8707}
8708
8709static void ir_emit_call_ex(ir_ctx *ctx, ir_ref def, ir_insn *insn, int32_t used_stack)
8710{
8711	ir_backend_data *data = ctx->data;
8712	dasm_State **Dst = &data->dasm_state;
8713	ir_reg def_reg;
8714
8715	if (IR_IS_CONST_REF(insn->op2)) {
8716		void *addr = ir_call_addr(ctx, insn, &ctx->ir_base[insn->op2]);
8717
8718		if (sizeof(void*) == 4 || IR_MAY_USE_32BIT_ADDR(ctx->code_buffer, addr)) {
8719			|	call aword &addr
8720		} else {
8721|.if X64
8722||			ir_reg tmp_reg = IR_REG_RAX;
8723
8724#ifdef IR_REG_VARARG_FP_REGS
8725||			if (ir_is_vararg(ctx, insn)) {
8726||				tmp_reg = IR_REG_R11;
8727||			}
8728#endif
8729||			if (IR_IS_SIGNED_32BIT(addr)) {
8730				|	mov Rq(tmp_reg), ((ptrdiff_t)addr)    // 0x48 0xc7 0xc0 <imm-32-bit>
8731||			} else {
8732				|	mov64 Rq(tmp_reg), ((ptrdiff_t)addr)  // 0x48 0xb8 <imm-64-bit>
8733||			}
8734			|	call Rq(tmp_reg)
8735|.endif
8736		}
8737	} else {
8738		ir_reg op2_reg = ctx->regs[def][2];
8739
8740		if (op2_reg != IR_REG_NONE) {
8741			if (IR_REG_SPILLED(op2_reg)) {
8742				op2_reg = IR_REG_NUM(op2_reg);
8743				ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2);
8744			}
8745			|	call Ra(op2_reg)
8746		} else {
8747			ir_mem mem;
8748
8749			if (ir_rule(ctx, insn->op2) & IR_FUSED) {
8750				mem = ir_fuse_load(ctx, def, insn->op2);
8751			} else {
8752				mem = ir_ref_spill_slot(ctx, insn->op2);
8753			}
8754
8755			|	ASM_TMEM_OP call, aword, mem
8756		}
8757	}
8758
8759	if (used_stack) {
8760		int32_t aligned_stack = IR_ALIGNED_SIZE(used_stack, 16);
8761
8762		ctx->call_stack_size -= aligned_stack;
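		/* Fastcall callees pop their own stack arguments, so the caller only
		 * releases the alignment padding (if any); for all other conventions
		 * the caller releases the whole aligned area. */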
8763		if (ir_is_fastcall(ctx, insn)) {
8764			aligned_stack -= used_stack;
8765			if (aligned_stack) {
8766				|	add Ra(IR_REG_RSP), aligned_stack
8767			}
8768		} else {
8769			|	add Ra(IR_REG_RSP), aligned_stack
8770		}
8771	}
8772
8773	if (insn->type != IR_VOID) {
8774		if (IR_IS_TYPE_INT(insn->type)) {
8775			def_reg = IR_REG_NUM(ctx->regs[def][0]);
8776			if (def_reg != IR_REG_NONE) {
8777				if (def_reg != IR_REG_INT_RET1) {
8778					ir_emit_mov(ctx, insn->type, def_reg, IR_REG_INT_RET1);
8779				}
8780				if (IR_REG_SPILLED(ctx->regs[def][0])) {
8781					ir_emit_store(ctx, insn->type, def, def_reg);
8782				}
8783			} else if (ctx->use_lists[def].count > 1) {
8784				ir_emit_store(ctx, insn->type, def, IR_REG_INT_RET1);
8785			}
8786		} else {
8787			IR_ASSERT(IR_IS_TYPE_FP(insn->type));
8788			def_reg = IR_REG_NUM(ctx->regs[def][0]);
8789#ifdef IR_REG_FP_RET1
8790			if (def_reg != IR_REG_NONE) {
8791				if (def_reg != IR_REG_FP_RET1) {
8792					ir_emit_fp_mov(ctx, insn->type, def_reg, IR_REG_FP_RET1);
8793				}
8794				if (IR_REG_SPILLED(ctx->regs[def][0])) {
8795					ir_emit_store(ctx, insn->type, def, def_reg);
8796				}
8797			} else if (ctx->use_lists[def].count > 1) {
8798				ir_emit_store(ctx, insn->type, def, IR_REG_FP_RET1);
8799			}
8800#else
8801			if (ctx->use_lists[def].count > 1) {
8802				int32_t offset;
8803				ir_reg fp;
8804
8805				if (def_reg == IR_REG_NONE) {
8806					offset = ir_ref_spill_slot_offset(ctx, def, &fp);
8807					if (insn->type == IR_DOUBLE) {
8808						|	fstp qword [Ra(fp)+offset]
8809					} else {
8810						IR_ASSERT(insn->type == IR_FLOAT);
8811						|	fstp dword [Ra(fp)+offset]
8812					}
8813				} else {
8814					offset = ctx->ret_slot;
8815					IR_ASSERT(offset != -1);
8816					offset = IR_SPILL_POS_TO_OFFSET(offset);
8817					fp = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
8818					if (insn->type == IR_DOUBLE) {
8819						|	fstp qword [Ra(fp)+offset]
8820					} else {
8821						IR_ASSERT(insn->type == IR_FLOAT);
8822						|	fstp dword [Ra(fp)+offset]
8823					}
8824					ir_emit_load_mem_fp(ctx, insn->type, def_reg, IR_MEM_BO(fp, offset));
8825					if (IR_REG_SPILLED(ctx->regs[def][0])) {
8826						ir_emit_store(ctx, insn->type, def, def_reg);
8827					}
8828				}
8829			}
8830#endif
8831		}
8832	}
8833}
8834
8835static void ir_emit_call(ir_ctx *ctx, ir_ref def, ir_insn *insn)
8836{
8837	int32_t used_stack = ir_emit_arguments(ctx, def, insn, ctx->regs[def][1]);
8838	ir_emit_call_ex(ctx, def, insn, used_stack);
8839}
8840
8841static void ir_emit_tailcall(ir_ctx *ctx, ir_ref def, ir_insn *insn)
8842{
8843	ir_backend_data *data = ctx->data;
8844	dasm_State **Dst = &data->dasm_state;
8845	int32_t used_stack = ir_emit_arguments(ctx, def, insn, ctx->regs[def][1]);
8846
8847	if (used_stack != 0) {
8848		ir_emit_call_ex(ctx, def, insn, used_stack);
8849		ir_emit_return_void(ctx);
8850		return;
8851	}
8852
8853	ir_emit_epilogue(ctx);
8854
8855	if (IR_IS_CONST_REF(insn->op2)) {
8856		void *addr = ir_call_addr(ctx, insn, &ctx->ir_base[insn->op2]);
8857
8858		if (sizeof(void*) == 4 || IR_MAY_USE_32BIT_ADDR(ctx->code_buffer, addr)) {
8859			|	jmp aword &addr
8860		} else {
8861|.if X64
8862||			ir_reg tmp_reg = IR_REG_RAX;
8863
8864#ifdef IR_REG_VARARG_FP_REGS
8865||			if (ir_is_vararg(ctx, insn)) {
8866||				tmp_reg = IR_REG_R11;
8867||			}
8868#endif
8869||			if (IR_IS_SIGNED_32BIT(addr)) {
8870				|	mov Rq(tmp_reg), ((ptrdiff_t)addr)    // 0x48 0xc7 0xc0 <imm-32-bit>
8871||			} else {
8872				|	mov64 Rq(tmp_reg), ((ptrdiff_t)addr)  // 0x48 0xb8 <imm-64-bit>
8873||			}
8874			|	jmp Rq(tmp_reg)
8875|.endif
8876		}
8877	} else {
8878		ir_reg op2_reg = ctx->regs[def][2];
8879
8880		if (op2_reg != IR_REG_NONE) {
8881			if (IR_REG_SPILLED(op2_reg)) {
8882				op2_reg = IR_REG_NUM(op2_reg);
8883				ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2);
8884			}
8885			|	jmp Ra(op2_reg)
8886		} else {
8887			ir_mem mem;
8888
8889			if (ir_rule(ctx, insn->op2) & IR_FUSED) {
8890				mem = ir_fuse_load(ctx, def, insn->op2);
8891			} else {
8892				mem = ir_ref_spill_slot(ctx, insn->op2);
8893			}
8894			|	ASM_TMEM_OP jmp, aword, mem
8895		}
8896	}
8897}
8898
8899static void ir_emit_ijmp(ir_ctx *ctx, ir_ref def, ir_insn *insn)
8900{
8901	ir_backend_data *data = ctx->data;
8902	dasm_State **Dst = &data->dasm_state;
8903	ir_reg op2_reg = ctx->regs[def][2];
8904
8905	if (IR_IS_CONST_REF(insn->op2)) {
8906		void *addr = ir_jmp_addr(ctx, insn, &ctx->ir_base[insn->op2]);
8907
8908		if (sizeof(void*) == 4 || IR_MAY_USE_32BIT_ADDR(ctx->code_buffer, addr)) {
8909			|	jmp aword &addr
8910		} else {
8911|.if X64
8912			if (IR_IS_SIGNED_32BIT(addr)) {
8913				|	mov rax, ((ptrdiff_t)addr)    // 0x48 0xc7 0xc0 <imm-32-bit>
8914			} else {
8915				|	mov64 rax, ((ptrdiff_t)addr)  // 0x48 0xb8 <imm-64-bit>
8916			}
8917			|	jmp rax
8918|.endif
8919		}
8920	} else if (ir_rule(ctx, insn->op2) & IR_FUSED) {
8921		ir_mem mem = ir_fuse_load(ctx, def, insn->op2);
8922		|	ASM_TMEM_OP jmp, aword, mem
8923	} else if (op2_reg != IR_REG_NONE) {
8924		if (IR_REG_SPILLED(op2_reg)) {
8925			op2_reg = IR_REG_NUM(op2_reg);
8926			ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2);
8927		}
8928		|	jmp Ra(op2_reg)
8929	} else {
8930		ir_mem mem = ir_ref_spill_slot(ctx, insn->op2);
8931
8932		|	ASM_TMEM_OP jmp, aword, mem
8933	}
8934}
8935
8936static bool ir_emit_guard_jcc(ir_ctx *ctx, uint32_t b, ir_ref def, uint32_t next_block, uint8_t op, void *addr, bool int_cmp)
8937{
8938	ir_backend_data *data = ctx->data;
8939	dasm_State **Dst = &data->dasm_state;
8940	ir_insn *next_insn = &ctx->ir_base[def + 1];
8941
8942	if (next_insn->op == IR_END || next_insn->op == IR_LOOP_END) {
8943		ir_block *bb = &ctx->cfg_blocks[b];
8944		uint32_t target;
8945
8946		if (!(bb->flags & IR_BB_DESSA_MOVES)) {
8947			target = ctx->cfg_edges[bb->successors];
8948			if (UNEXPECTED(bb->successors_count == 2)) {
8949				if (ctx->cfg_blocks[target].flags & IR_BB_ENTRY) {
8950					target = ctx->cfg_edges[bb->successors + 1];
8951				} else {
8952					IR_ASSERT(ctx->cfg_blocks[ctx->cfg_edges[bb->successors + 1]].flags & IR_BB_ENTRY);
8953				}
8954			} else {
8955				IR_ASSERT(bb->successors_count == 1);
8956			}
8957			target = ir_skip_empty_target_blocks(ctx, target);
8958			if (target != next_block) {
8959				if (int_cmp) {
8960					switch (op) {
8961						default:
8962							IR_ASSERT(0 && "NIY binary op");
8963						case IR_EQ:
8964							|	jne =>target
8965							break;
8966						case IR_NE:
8967							|	je =>target
8968							break;
8969						case IR_LT:
8970							|	jge =>target
8971							break;
8972						case IR_GE:
8973							|	jl =>target
8974							break;
8975						case IR_LE:
8976							|	jg =>target
8977							break;
8978						case IR_GT:
8979							|	jle =>target
8980							break;
8981						case IR_ULT:
8982							|	jae =>target
8983							break;
8984						case IR_UGE:
8985							|	jb =>target
8986							break;
8987						case IR_ULE:
8988							|	ja =>target
8989							break;
8990						case IR_UGT:
8991							|	jbe =>target
8992							break;
8993					}
8994				} else {
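					/* ucomiss/ucomisd flag an unordered (NaN) result via PF; the
					 * jp branches below dispatch that case explicitly for
					 * EQ/NE/GE/GT, while LT/LE rely on CF being set on NaN. */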
8995					switch (op) {
8996						default:
8997							IR_ASSERT(0 && "NIY binary op");
8998						case IR_EQ:
8999							|	jne =>target
9000							|	jp =>target
9001							break;
9002						case IR_NE:
9003							|	jp &addr
9004							|	je =>target
9005							break;
9006						case IR_LT:
9007							|	jae =>target
9008							break;
9009						case IR_GE:
9010							|	jp &addr
9011							|	jb =>target
9012							break;
9013						case IR_LE:
9014							|	ja =>target
9015							break;
9016						case IR_GT:
9017							|	jp &addr
9018							|	jbe =>target
9019							break;
9020					}
9021				}
9022				|	jmp &addr
9023				return 1;
9024			}
9025		}
9026	} else if (next_insn->op == IR_IJMP && IR_IS_CONST_REF(next_insn->op2)) {
9027		void *target_addr = ir_jmp_addr(ctx, next_insn, &ctx->ir_base[next_insn->op2]);
9028
9029		if (sizeof(void*) == 4 || IR_MAY_USE_32BIT_ADDR(ctx->code_buffer, target_addr)) {
9030			if (int_cmp) {
9031				switch (op) {
9032					default:
9033						IR_ASSERT(0 && "NIY binary op");
9034					case IR_EQ:
9035						|	jne &target_addr
9036						break;
9037					case IR_NE:
9038						|	je &target_addr
9039						break;
9040					case IR_LT:
9041						|	jge &target_addr
9042						break;
9043					case IR_GE:
9044						|	jl &target_addr
9045						break;
9046					case IR_LE:
9047						|	jg &target_addr
9048						break;
9049					case IR_GT:
9050						|	jle &target_addr
9051						break;
9052					case IR_ULT:
9053						|	jae &target_addr
9054						break;
9055					case IR_UGE:
9056						|	jb &target_addr
9057						break;
9058					case IR_ULE:
9059						|	ja &target_addr
9060						break;
9061					case IR_UGT:
9062						|	jbe &target_addr
9063						break;
9064				}
9065			} else {
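				/* same PF-based NaN dispatch as in the branch above */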
9066				switch (op) {
9067					default:
9068						IR_ASSERT(0 && "NIY binary op");
9069					case IR_EQ:
9070						|	jne &target_addr
9071						|	jp &target_addr
9072						break;
9073					case IR_NE:
9074						|	jp &addr
9075						|	je &target_addr
9076						break;
9077					case IR_LT:
9078						|	jae &target_addr
9079						break;
9080					case IR_GE:
9081						|	jp &addr
9082						|	jb &target_addr
9083						break;
9084					case IR_LE:
9085						|	ja &target_addr
9086						break;
9087					case IR_GT:
9088						|	jp &addr
9089						|	jbe &target_addr
9090						break;
9091				}
9092			}
9093			|	jmp &addr
9094			return 1;
9095		}
9096	}
9097
9098	if (int_cmp) {
9099		switch (op) {
9100			default:
9101				IR_ASSERT(0 && "NIY binary op");
9102			case IR_EQ:
9103				|	je &addr
9104				break;
9105			case IR_NE:
9106				|	jne &addr
9107				break;
9108			case IR_LT:
9109				|	jl &addr
9110				break;
9111			case IR_GE:
9112				|	jge &addr
9113				break;
9114			case IR_LE:
9115				|	jle &addr
9116				break;
9117			case IR_GT:
9118				|	jg &addr
9119				break;
9120			case IR_ULT:
9121				|	jb &addr
9122				break;
9123			case IR_UGE:
9124				|	jae &addr
9125				break;
9126			case IR_ULE:
9127				|	jbe &addr
9128				break;
9129			case IR_UGT:
9130				|	ja &addr
9131				break;
9132		}
9133	} else {
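		/* Direct FP form: EQ/LT/LE use jp to skip the exit when the compare
		 * was unordered (NaN), NE exits on NaN as well, and GE/GT (jae/ja)
		 * already fall through on NaN because CF is set. */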
9134		switch (op) {
9135			default:
9136				IR_ASSERT(0 && "NIY binary op");
9137			case IR_EQ:
9138				|	jp >1
9139				|	je &addr
9140				|1:
9141				break;
9142			case IR_NE:
9143				|	jne &addr
9144				|	jp &addr
9145				break;
9146			case IR_LT:
9147				|	jp >1
9148				|	jb &addr
9149				|1:
9150				break;
9151			case IR_GE:
9152				|	jae &addr
9153				break;
9154			case IR_LE:
9155				|	jp >1
9156				|	jbe &addr
9157				|1:
9158				break;
9159			case IR_GT:
9160				|	ja &addr
9161				break;
9162//			case IR_ULT: fprintf(stderr, "\tjb .LL%d\n", true_block); break;
9163//			case IR_UGE: fprintf(stderr, "\tjae .LL%d\n", true_block); break;
9164//			case IR_ULE: fprintf(stderr, "\tjbe .LL%d\n", true_block); break;
9165//			case IR_UGT: fprintf(stderr, "\tja .LL%d\n", true_block); break;
9166		}
9167	}
9168	return 0;
9169}
9170
9171static bool ir_emit_guard(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn, uint32_t next_block)
9172{
9173	ir_backend_data *data = ctx->data;
9174	dasm_State **Dst = &data->dasm_state;
9175	ir_reg op2_reg = ctx->regs[def][2];
9176	ir_type type = ctx->ir_base[insn->op2].type;
9177	void *addr;
9178
9179	IR_ASSERT(IR_IS_TYPE_INT(type));
9180	if (IR_IS_CONST_REF(insn->op2)) {
9181		bool is_true = ir_ref_is_true(ctx, insn->op2);
9182
9183		if ((insn->op == IR_GUARD && !is_true) || (insn->op == IR_GUARD_NOT && is_true)) {
9184			addr = ir_jmp_addr(ctx, insn, &ctx->ir_base[insn->op3]);
9185			if (sizeof(void*) == 4 || IR_MAY_USE_32BIT_ADDR(ctx->code_buffer, addr)) {
9186				|	jmp aword &addr
9187			} else {
9188|.if X64
9189				if (IR_IS_SIGNED_32BIT(addr)) {
9190					|	mov rax, ((ptrdiff_t)addr)    // 0x48 0xc7 0xc0 <imm-32-bit>
9191				} else {
9192					|	mov64 rax, ((ptrdiff_t)addr)  // 0x48 0xb8 <imm-64-bit>
9193				}
9194				|	jmp aword [rax]
9195|.endif
9196			}
9197		}
9198		return 0;
9199	}
9200
9201	if (op2_reg != IR_REG_NONE) {
9202		if (IR_REG_SPILLED(op2_reg)) {
9203			op2_reg = IR_REG_NUM(op2_reg);
9204			ir_emit_load(ctx, type, op2_reg, insn->op2);
9205		}
9206		|	ASM_REG_REG_OP test, type, op2_reg, op2_reg
9207	} else {
9208		ir_mem mem;
9209
9210		if (ir_rule(ctx, insn->op2) & IR_FUSED) {
9211			mem = ir_fuse_load(ctx, def, insn->op2);
9212		} else {
9213			mem = ir_ref_spill_slot(ctx, insn->op2);
9214		}
9215		|	ASM_MEM_IMM_OP cmp, type, mem, 0
9216	}
9217
9218	addr = ir_jmp_addr(ctx, insn, &ctx->ir_base[insn->op3]);
9219	if (sizeof(void*) == 4 || IR_MAY_USE_32BIT_ADDR(ctx->code_buffer, addr)) {
9220		ir_op op;
9221
9222		if (insn->op == IR_GUARD) {
9223			op = IR_EQ;
9224		} else {
9225			op = IR_NE;
9226		}
9227		return ir_emit_guard_jcc(ctx, b, def, next_block, op, addr, 1);
9228	} else {
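		/* the exit address does not fit into a rel32 operand, so branch to a
		 * cold-section stub that performs the far jump */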
9229|.if X64
9230		if (insn->op == IR_GUARD) {
9231			|	je >1
9232		} else {
9233			|	jne >1
9234		}
9235		|.cold_code
9236		|1:
9237		if (IR_IS_SIGNED_32BIT(addr)) {
9238			|	mov rax, ((ptrdiff_t)addr)    // 0x48 0xc7 0xc0 <imm-32-bit>
9239		} else {
9240			|	mov64 rax, ((ptrdiff_t)addr)  // 0x48 0xb8 <imm-64-bit>
9241		}
9242		|	jmp aword [rax]
9243		|.code
9244|.endif
9245		return 0;
9246	}
9247}
9248
9249static bool ir_emit_guard_cmp_int(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn, uint32_t next_block)
9250{
9251	ir_backend_data *data = ctx->data;
9252	dasm_State **Dst = &data->dasm_state;
9253	ir_insn *cmp_insn = &ctx->ir_base[insn->op2];
9254	ir_op op = cmp_insn->op;
9255	ir_type type = ctx->ir_base[cmp_insn->op1].type;
9256	ir_ref op1 = cmp_insn->op1;
9257	ir_ref op2 = cmp_insn->op2;
9258	ir_reg op1_reg = ctx->regs[insn->op2][1];
9259	ir_reg op2_reg = ctx->regs[insn->op2][2];
9260	void *addr;
9261
9262	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
9263		op1_reg = IR_REG_NUM(op1_reg);
9264		ir_emit_load(ctx, type, op1_reg, op1);
9265	}
9266	if (op2_reg != IR_REG_NONE && IR_REG_SPILLED(op2_reg)) {
9267		op2_reg = IR_REG_NUM(op2_reg);
9268		if (op1 != op2) {
9269			ir_emit_load(ctx, type, op2_reg, op2);
9270		}
9271	}
9272
9273	addr = ir_jmp_addr(ctx, insn, &ctx->ir_base[insn->op3]);
9274	if (IR_IS_CONST_REF(op2) && !IR_IS_SYM_CONST(ctx->ir_base[op2].op) && ctx->ir_base[op2].val.u64 == 0) {
9275		if (op == IR_ULT) {
9276			/* always false */
9277			if (sizeof(void*) == 4 || IR_MAY_USE_32BIT_ADDR(ctx->code_buffer, addr)) {
9278				|	jmp aword &addr
9279			} else {
9280|.if X64
9281				if (IR_IS_SIGNED_32BIT(addr)) {
9282					|	mov rax, ((ptrdiff_t)addr)    // 0x48 0xc7 0xc0 <imm-32-bit>
9283				} else {
9284					|	mov64 rax, ((ptrdiff_t)addr)  // 0x48 0xb8 <imm-64-bit>
9285				}
9286				|	jmp aword [rax]
9287|.endif
9288			}
9289			return 0;
9290		} else if (op == IR_UGE) {
9291			/* always true */
9292			return 0;
9293		} else if (op == IR_ULE) {
9294			op = IR_EQ;
9295		} else if (op == IR_UGT) {
9296			op = IR_NE;
9297		}
9298	}
9299	ir_emit_cmp_int_common(ctx, type, def, cmp_insn, op1_reg, op1, op2_reg, op2);
9300
9301	if (insn->op == IR_GUARD) {
9302		op ^= 1; // reverse: cmp ops are paired, so op^1 negates the predicate (EQ<->NE, LT<->GE, ...)
9303	}
9304
9305	return ir_emit_guard_jcc(ctx, b, def, next_block, op, addr, 1);
9306}
9307
9308static bool ir_emit_guard_cmp_fp(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn, uint32_t next_block)
9309{
9310	ir_op op = ir_emit_cmp_fp_common(ctx, def, insn->op2, &ctx->ir_base[insn->op2]);
9311	void *addr = ir_jmp_addr(ctx, insn, &ctx->ir_base[insn->op3]);
9312
9313	if (insn->op == IR_GUARD) {
9314		op ^= 1; // reverse: op^1 negates the predicate (see above)
9315	}
9316	return ir_emit_guard_jcc(ctx, b, def, next_block, op, addr, 0);
9317}
9318
9319static bool ir_emit_guard_test_int(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn, uint32_t next_block)
9320{
9321	void *addr = ir_jmp_addr(ctx, insn, &ctx->ir_base[insn->op3]);
9322	ir_op op = (insn->op == IR_GUARD) ? IR_EQ : IR_NE;
9323
9324	ir_emit_test_int_common(ctx, def, insn->op2, op);
9325	return ir_emit_guard_jcc(ctx, b, def, next_block, op, addr, 1);
9326}
9327
9328static bool ir_emit_guard_jcc_int(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn, uint32_t next_block)
9329{
9330	void *addr = ir_jmp_addr(ctx, insn, &ctx->ir_base[insn->op3]);
9331	ir_op op = ctx->ir_base[insn->op2].op;
9332
9333	if (insn->op == IR_GUARD) {
9334		op ^= 1; // reverse: op^1 negates the predicate (see above)
9335	}
9336	return ir_emit_guard_jcc(ctx, b, def, next_block, op, addr, 1);
9337}
9338
9339static bool ir_emit_guard_overflow(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn)
9340{
9341	ir_backend_data *data = ctx->data;
9342	dasm_State **Dst = &data->dasm_state;
9343	ir_type type;
9344	void *addr = ir_jmp_addr(ctx, insn, &ctx->ir_base[insn->op3]);
9345
9346	type = ctx->ir_base[ctx->ir_base[insn->op2].op1].type;
9347
9348	IR_ASSERT(IR_IS_TYPE_INT(type));
9349	if (IR_IS_TYPE_SIGNED(type)) {
9350		if (insn->op == IR_GUARD) {
9351			|	jno &addr
9352		} else {
9353			|	jo &addr
9354		}
9355	} else {
9356		if (insn->op == IR_GUARD) {
9357			|	jnc &addr
9358		} else {
9359			|	jc &addr
9360		}
9361	}
9362	return 0;
9363}
9364
9365static void ir_emit_lea(ir_ctx *ctx, ir_ref def, ir_type type)
9366{
9367	ir_backend_data *data = ctx->data;
9368	dasm_State **Dst = &data->dasm_state;
9369	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
9370	ir_mem mem = ir_fuse_addr(ctx, def, def);
9371
9372	IR_ASSERT(def_reg != IR_REG_NONE);
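	/* When the fused address is just def_reg + reg (zero offset, scale 1),
	 * a plain ADD is shorter than the equivalent LEA. */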
9373	if (ir_type_size[type] == 4) {
9374		if (IR_MEM_BASE(mem) == def_reg
9375		 && IR_MEM_OFFSET(mem) == 0
9376		 && IR_MEM_SCALE(mem) == 1
9377		 && IR_MEM_INDEX(mem) != IR_REG_NONE) {
9378			ir_reg reg = IR_MEM_INDEX(mem);
9379			|	add Rd(def_reg), Rd(reg)
9380		} else if (IR_MEM_INDEX(mem) == def_reg
9381		 && IR_MEM_OFFSET(mem) == 0
9382		 && IR_MEM_SCALE(mem) == 1
9383		 && IR_MEM_BASE(mem) != IR_REG_NONE) {
9384			ir_reg reg = IR_MEM_BASE(mem);
9385			|	add Rd(def_reg), Rd(reg)
9386		} else {
9387			|	ASM_TXT_TMEM_OP lea, Rd(def_reg), dword, mem
9388		}
9389	} else {
9390		if (IR_MEM_BASE(mem) == def_reg
9391		 && IR_MEM_OFFSET(mem) == 0
9392		 && IR_MEM_SCALE(mem) == 1
9393		 && IR_MEM_INDEX(mem) != IR_REG_NONE) {
9394			ir_reg reg = IR_MEM_INDEX(mem);
9395			|	add Ra(def_reg), Ra(reg)
9396		} else if (IR_MEM_INDEX(mem) == def_reg
9397		 && IR_MEM_OFFSET(mem) == 0
9398		 && IR_MEM_SCALE(mem) == 1
9399		 && IR_MEM_BASE(mem) != IR_REG_NONE) {
9400			ir_reg reg = IR_MEM_BASE(mem);
9401			|	add Ra(def_reg), Ra(reg)
9402		} else {
9403			|	ASM_TXT_TMEM_OP lea, Ra(def_reg), aword, mem
9404		}
9405	}
9406	if (IR_REG_SPILLED(ctx->regs[def][0])) {
9407		ir_emit_store(ctx, type, def, def_reg);
9408	}
9409}
9410
9411static void ir_emit_tls(ir_ctx *ctx, ir_ref def, ir_insn *insn)
9412{
9413	ir_backend_data *data = ctx->data;
9414	dasm_State **Dst = &data->dasm_state;
9415	ir_reg reg = IR_REG_NUM(ctx->regs[def][0]);
9416
9417	if (ctx->use_lists[def].count == 1) {
9418		/* dead load */
9419		return;
9420	}
9421
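	/* TLS access is OS-specific: Windows keeps the TLS slot array in the TEB
	 * (gs:[0x58] on x64, fs:[0x2c] on x86), while the ELF and Mach-O ABIs
	 * address thread data directly off the fs/gs segment base. */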
9422|.if X64WIN
9423|	gs
9424|	mov Ra(reg), aword [0x58]
9425|	mov Ra(reg), aword [Ra(reg)+insn->op2]
9426|	mov Ra(reg), aword [Ra(reg)+insn->op3]
9427|.elif WIN
9428|	fs
9429|	mov Ra(reg), aword [0x2c]
9430|	mov Ra(reg), aword [Ra(reg)+insn->op2]
9431|	mov Ra(reg), aword [Ra(reg)+insn->op3]
9432|.elif X64APPLE
9433|	gs
9434||	if (insn->op3 == IR_NULL) {
9435|		mov Ra(reg), aword [insn->op2]
9436||	} else {
9437|		mov Ra(reg), aword [insn->op2]
9438|		mov Ra(reg), aword [Ra(reg)+insn->op3]
9439||	}
9440|.elif X64
9441|	fs
9442||	if (insn->op3 == IR_NULL) {
9443|		mov Ra(reg), aword [insn->op2]
9444||	} else {
9445|		mov Ra(reg), [0x8]
9446|		mov Ra(reg), aword [Ra(reg)+insn->op2]
9447|		mov Ra(reg), aword [Ra(reg)+insn->op3]
9448||	}
9449|.else
9450|	gs
9451||	if (insn->op3 == IR_NULL) {
9452|		mov Ra(reg), aword [insn->op2]
9453||	} else {
9454|		mov Ra(reg), [0x4]
9455|		mov Ra(reg), aword [Ra(reg)+insn->op2]
9456|		mov Ra(reg), aword [Ra(reg)+insn->op3]
9457||	}
9458|.endif
9459	if (IR_REG_SPILLED(ctx->regs[def][0])) {
9460		ir_emit_store(ctx, IR_ADDR, def, reg);
9461	}
9462}
9463
9464static void ir_emit_sse_sqrt(ir_ctx *ctx, ir_ref def, ir_insn *insn)
9465{
9466	ir_backend_data *data = ctx->data;
9467	dasm_State **Dst = &data->dasm_state;
9468	ir_reg op3_reg = ctx->regs[def][3];
9469	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
9470
9471	IR_ASSERT(IR_IS_TYPE_FP(insn->type));
9472	IR_ASSERT(def_reg != IR_REG_NONE && op3_reg != IR_REG_NONE);
9473
9474	if (IR_REG_SPILLED(op3_reg)) {
9475		op3_reg = IR_REG_NUM(op3_reg);
9476		ir_emit_load(ctx, IR_ADDR, op3_reg, insn->op3);
9477	}
9478
9479	|	ASM_FP_REG_REG_OP sqrts, insn->type, def_reg, op3_reg
9480
9481	if (IR_REG_SPILLED(ctx->regs[def][0])) {
9482		ir_emit_store(ctx, insn->type, def, def_reg);
9483	}
9484}
9485
9486static void ir_emit_sse_round(ir_ctx *ctx, ir_ref def, ir_insn *insn, int round_op)
9487{
9488	ir_backend_data *data = ctx->data;
9489	dasm_State **Dst = &data->dasm_state;
9490	ir_reg op3_reg = ctx->regs[def][3];
9491	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
9492
9493	IR_ASSERT(IR_IS_TYPE_FP(insn->type));
9494	IR_ASSERT(def_reg != IR_REG_NONE && op3_reg != IR_REG_NONE);
9495
9496	if (IR_REG_SPILLED(op3_reg)) {
9497		op3_reg = IR_REG_NUM(op3_reg);
9498		ir_emit_load(ctx, IR_ADDR, op3_reg, insn->op3);
9499	}
9500
9501	if (ctx->mflags & IR_X86_AVX) {
9502		|	ASM_SSE2_REG_REG_REG_TXT_OP vrounds, insn->type, def_reg, def_reg, op3_reg, round_op
9503	} else {
9504		|	ASM_SSE2_REG_REG_TXT_OP rounds, insn->type, def_reg, op3_reg, round_op
9505	}
9506
9507	if (IR_REG_SPILLED(ctx->regs[def][0])) {
9508		ir_emit_store(ctx, insn->type, def, def_reg);
9509	}
9510}
9511
9512static void ir_emit_exitcall(ir_ctx *ctx, ir_ref def, ir_insn *insn)
9513{
9514	ir_backend_data *data = ctx->data;
9515	dasm_State **Dst = &data->dasm_state;
9516	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
9517
9518	IR_ASSERT(def_reg != IR_REG_NONE);
9519
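	/* Spill the complete CPU state so the exit handler can inspect and
	 * restore it: all GP registers followed by the XMM registers.  Slot 4,
	 * RSP's position in the dump, is filled in below with the stack pointer
	 * value the exiting code had, and the word that was on top of the stack
	 * (presumably the return address pushed by the call into this stub)
	 * becomes the handler's first argument. */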
9520	|.if X64
9521	|	sub rsp, 16*8+16*8+8 /* CPU regs + SSE regs */
9522	|	mov aword [rsp+0*8], rax
9523	|	mov aword [rsp+1*8], rcx
9524	|	mov aword [rsp+2*8], rdx
9525	|	mov aword [rsp+3*8], rbx
9526	|	mov aword [rsp+5*8], rbp
9527	|	mov aword [rsp+6*8], rsi
9528	|	mov aword [rsp+7*8], rdi
9529	|	mov aword [rsp+8*8], r8
9530	|	mov aword [rsp+9*8], r9
9531	|	mov aword [rsp+10*8], r10
9532	|	mov aword [rsp+11*8], r11
9533	|	mov aword [rsp+12*8], r12
9534	|	mov aword [rsp+13*8], r13
9535	|	mov aword [rsp+14*8], r14
9536	|	mov aword [rsp+15*8], r15
9537	|	movsd qword [rsp+16*8+0*8], xmm0
9538	|	movsd qword [rsp+16*8+1*8], xmm1
9539	|	movsd qword [rsp+16*8+2*8], xmm2
9540	|	movsd qword [rsp+16*8+3*8], xmm3
9541	|	movsd qword [rsp+16*8+4*8], xmm4
9542	|	movsd qword [rsp+16*8+5*8], xmm5
9543	|	movsd qword [rsp+16*8+6*8], xmm6
9544	|	movsd qword [rsp+16*8+7*8], xmm7
9545	|	movsd qword [rsp+16*8+8*8], xmm8
9546	|	movsd qword [rsp+16*8+9*8], xmm9
9547	|	movsd qword [rsp+16*8+10*8], xmm10
9548	|	movsd qword [rsp+16*8+11*8], xmm11
9549	|	movsd qword [rsp+16*8+12*8], xmm12
9550	|	movsd qword [rsp+16*8+13*8], xmm13
9551	|	movsd qword [rsp+16*8+14*8], xmm14
9552	|	movsd qword [rsp+16*8+15*8], xmm15
9553	|
9554	|	mov Ra(IR_REG_INT_ARG2), rsp
9555	|	lea Ra(IR_REG_INT_ARG1), [rsp+16*8+16*8+16]
9556	|	mov aword [rsp+4*8], Ra(IR_REG_INT_ARG1)
9557	|	mov Ra(IR_REG_INT_ARG1), [rsp+16*8+16*8+8]
9558	|.if X64WIN
9559	|	sub rsp, 32 /* shadow space */
9560	|.endif
9561	|.else
9562	|	sub esp, 8*4+8*8+12 /* CPU regs + SSE regs */
9563	|	mov aword [esp+0*4], eax
9564	|	mov aword [esp+1*4], ecx
9565	|	mov aword [esp+2*4], edx
9566	|	mov aword [esp+3*4], ebx
9567	|	mov aword [esp+5*4], ebp
9568	|	mov aword [esp+6*4], esi
9569	|	mov aword [esp+7*4], edi
9570	|	movsd qword [esp+8*4+0*8], xmm0
9571	|	movsd qword [esp+8*4+1*8], xmm1
9572	|	movsd qword [esp+8*4+2*8], xmm2
9573	|	movsd qword [esp+8*4+3*8], xmm3
9574	|	movsd qword [esp+8*4+4*8], xmm4
9575	|	movsd qword [esp+8*4+5*8], xmm5
9576	|	movsd qword [esp+8*4+6*8], xmm6
9577	|	movsd qword [esp+8*4+7*8], xmm7
9578	|
9579	|	mov Ra(IR_REG_INT_FCARG2), esp
9580	|	lea Ra(IR_REG_INT_FCARG1), [esp+8*4+8*8+16]
9581	|	mov aword [esp+4*4], Ra(IR_REG_INT_FCARG1)
9582	|	mov Ra(IR_REG_INT_FCARG1), [esp+8*4+8*8+12]
9583	|.endif
9584
9585	if (IR_IS_CONST_REF(insn->op2)) {
9586		void *addr = ir_call_addr(ctx, insn, &ctx->ir_base[insn->op2]);
9587
9588		if (sizeof(void*) == 4 || IR_MAY_USE_32BIT_ADDR(ctx->code_buffer, addr)) {
9589			|	call aword &addr
9590		} else {
9591|.if X64
9592			if (IR_IS_SIGNED_32BIT(addr)) {
9593				|	mov rax, ((ptrdiff_t)addr)    // 0x48 0xc7 0xc0 <imm-32-bit>
9594			} else {
9595				|	mov64 rax, ((ptrdiff_t)addr)  // 0x48 0xb8 <imm-64-bit>
9596			}
9597			|	call rax
9598|.endif
9599		}
9600	} else {
9601		IR_ASSERT(0);
9602	}
9603
9604	// restore SP (drop the register save area)
9605	|.if X64WIN
9606	|	add rsp, 32+16*8+16*8+16 /* shadow space + CPU regs + SSE regs */
9607	|.elif X64
9608	|	add rsp, 16*8+16*8+16 /* CPU regs + SSE regs */
9609	|.else
9610	|	add esp, 8*4+8*8+16 /* CPU regs + SSE regs */
9611	|.endif
9612
9613	if (def_reg != IR_REG_INT_RET1) {
9614		ir_emit_mov(ctx, insn->type, def_reg, IR_REG_INT_RET1);
9615	}
9616	if (IR_REG_SPILLED(ctx->regs[def][0])) {
9617		ir_emit_store(ctx, insn->type, def, def_reg);
9618	}
9619}
9620
9621static void ir_emit_param_move(ir_ctx *ctx, uint8_t type, ir_reg from_reg, ir_reg to_reg, ir_ref to, int32_t offset)
9622{
9623	ir_reg fp = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
9624
9625	IR_ASSERT(from_reg != IR_REG_NONE || to_reg != IR_REG_NONE);
9626
9627	if (IR_IS_TYPE_INT(type)) {
9628		if (from_reg != IR_REG_NONE) {
9629			if (to_reg != IR_REG_NONE) {
9630				ir_emit_mov(ctx, type, to_reg, from_reg);
9631			} else {
9632				ir_emit_store(ctx, type, to, from_reg);
9633			}
9634		} else {
9635			ir_emit_load_mem_int(ctx, type, to_reg, IR_MEM_BO(fp, offset));
9636		}
9637	} else {
9638		if (from_reg != IR_REG_NONE) {
9639			if (to_reg != IR_REG_NONE) {
9640				ir_emit_fp_mov(ctx, type, to_reg, from_reg);
9641			} else {
9642				ir_emit_store(ctx, type, to, from_reg);
9643			}
9644		} else {
9645			ir_emit_load_mem_fp(ctx, type, to_reg, IR_MEM_BO(fp, offset));
9646		}
9647	}
9648}
9649
9650static void ir_emit_load_params(ir_ctx *ctx)
9651{
9652	ir_use_list *use_list = &ctx->use_lists[1];
9653	ir_insn *insn;
9654	ir_ref i, n, *p, use;
9655	int int_param_num = 0;
9656	int fp_param_num = 0;
9657	ir_reg src_reg;
9658	ir_reg dst_reg;
9659	// TODO: Calling convention specific
9660	int int_reg_params_count = IR_REG_INT_ARGS;
9661	int fp_reg_params_count = IR_REG_FP_ARGS;
9662	const int8_t *int_reg_params = _ir_int_reg_params;
9663	const int8_t *fp_reg_params = _ir_fp_reg_params;
9664	int32_t stack_offset = 0;
9665
9666#ifdef IR_TARGET_X86
9667	if (sizeof(void*) == 4 && (ctx->flags & IR_FASTCALL_FUNC)) {
9668		int_reg_params_count = IR_REG_INT_FCARGS;
9669		fp_reg_params_count = IR_REG_FP_FCARGS;
9670		int_reg_params = _ir_int_fc_reg_params;
9671		fp_reg_params = _ir_fp_fc_reg_params;
9672	}
9673#endif
9674
9675	if (ctx->flags & IR_USE_FRAME_POINTER) {
9676		stack_offset = sizeof(void*) * 2; /* skip old frame pointer and return address */
9677	} else {
9678		stack_offset = sizeof(void*) + ctx->stack_frame_size + ctx->call_stack_size; /* skip return address */
9679	}
9680	n = use_list->count;
9681	for (i = 0, p = &ctx->use_edges[use_list->refs]; i < n; i++, p++) {
9682		use = *p;
9683		insn = &ctx->ir_base[use];
9684		if (insn->op == IR_PARAM) {
9685			if (IR_IS_TYPE_INT(insn->type)) {
9686				if (int_param_num < int_reg_params_count) {
9687					src_reg = int_reg_params[int_param_num];
9688				} else {
9689					src_reg = IR_REG_NONE;
9690				}
9691				int_param_num++;
9692#ifdef _WIN64
9693				/* WIN64 calling convention uses a common counter for int and fp registers */
9694				fp_param_num++;
9695#endif
9696			} else {
9697				if (fp_param_num < fp_reg_params_count) {
9698					src_reg = fp_reg_params[fp_param_num];
9699				} else {
9700					src_reg = IR_REG_NONE;
9701				}
9702				fp_param_num++;
9703#ifdef _WIN64
9704				/* WIN64 calling convention uses a common counter for int and fp registers */
9705				int_param_num++;
9706#endif
9707			}
9708			if (ctx->vregs[use]) {
9709				dst_reg = IR_REG_NUM(ctx->regs[use][0]);
9710				IR_ASSERT(src_reg != IR_REG_NONE || dst_reg != IR_REG_NONE ||
9711					stack_offset == ctx->live_intervals[ctx->vregs[use]]->stack_spill_pos +
9712						((ctx->flags & IR_USE_FRAME_POINTER) ?
9713							-(ctx->stack_frame_size - ctx->stack_frame_alignment) :
9714							ctx->call_stack_size));
9715				if (src_reg != dst_reg) {
9716					ir_emit_param_move(ctx, insn->type, src_reg, dst_reg, use, stack_offset);
9717				}
9718				if (dst_reg != IR_REG_NONE && IR_REG_SPILLED(ctx->regs[use][0])) {
9719					ir_emit_store(ctx, insn->type, use, dst_reg);
9720				}
9721			}
9722			if (src_reg == IR_REG_NONE) {
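				/* advance past this argument's stack slot(s): slots are pointer
				 * sized, and on 32-bit targets a wider type (double/int64)
				 * occupies several 4-byte slots */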
9723				if (sizeof(void*) == 8) {
9724					stack_offset += sizeof(void*);
9725				} else {
9726					stack_offset += IR_MAX(sizeof(void*), ir_type_size[insn->type]);
9727				}
9728			}
9729		}
9730	}
9731}
9732
9733static ir_reg ir_get_free_reg(ir_type type, ir_regset available)
9734{
9735	if (IR_IS_TYPE_INT(type)) {
9736		available = IR_REGSET_INTERSECTION(available, IR_REGSET_GP);
9737	} else {
9738		IR_ASSERT(IR_IS_TYPE_FP(type));
9739		available = IR_REGSET_INTERSECTION(available, IR_REGSET_FP);
9740	}
9741	IR_ASSERT(!IR_REGSET_IS_EMPTY(available));
9742	return IR_REGSET_FIRST(available);
9743}
9744
9745static int ir_fix_dessa_tmps(ir_ctx *ctx, uint8_t type, ir_ref from, ir_ref to)
9746{
9747	ir_backend_data *data = ctx->data;
9748	ir_ref ref = ctx->cfg_blocks[data->dessa_from_block].end;
9749
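	/* Reserve default temporaries (RAX for int, XMM0 for fp) on the block-end
	 * instruction, so that the resolver of parallel DESSA moves always has a
	 * scratch register available. */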
9750	if (to == 0) {
9751		if (IR_IS_TYPE_INT(type)) {
9752			if (ctx->regs[ref][0] == IR_REG_NONE) {
9753				ctx->regs[ref][0] = IR_REG_RAX;
9754			}
9755		} else {
9756			IR_ASSERT(IR_IS_TYPE_FP(type));
9757			if (ctx->regs[ref][1] == IR_REG_NONE) {
9758				ctx->regs[ref][1] = IR_REG_XMM0;
9759			}
9760		}
9761	} else if (from != 0) {
9762		if (IR_IS_TYPE_INT(type)) {
9763			if (ctx->regs[ref][0] == IR_REG_NONE) {
9764				ctx->regs[ref][0] = IR_REG_RAX;
9765			}
9766		} else {
9767			IR_ASSERT(IR_IS_TYPE_FP(type));
9768			if (ctx->regs[ref][1] == IR_REG_NONE) {
9769				ctx->regs[ref][1] = IR_REG_XMM0;
9770			}
9771		}
9772	}
9773	return 1;
9774}
9775
9776static void ir_fix_param_spills(ir_ctx *ctx)
9777{
9778	ir_use_list *use_list = &ctx->use_lists[1];
9779	ir_insn *insn;
9780	ir_ref i, n, *p, use;
9781	int int_param_num = 0;
9782	int fp_param_num = 0;
9783	ir_reg src_reg;
9784	// TODO: Calling convention specific
9785	int int_reg_params_count = IR_REG_INT_ARGS;
9786	int fp_reg_params_count = IR_REG_FP_ARGS;
9787	const int8_t *int_reg_params = _ir_int_reg_params;
9788	const int8_t *fp_reg_params = _ir_fp_reg_params;
9789	int32_t stack_start = 0;
9790	int32_t stack_offset = 0;
9791
9792#ifdef IR_TARGET_X86
9793	if (sizeof(void*) == 4 && (ctx->flags & IR_FASTCALL_FUNC)) {
9794		int_reg_params_count = IR_REG_INT_FCARGS;
9795		fp_reg_params_count = IR_REG_FP_FCARGS;
9796		int_reg_params = _ir_int_fc_reg_params;
9797		fp_reg_params = _ir_fp_fc_reg_params;
9798	}
9799#endif
9800
9801	if (ctx->flags & IR_USE_FRAME_POINTER) {
9802		/* skip old frame pointer and return address */
9803		stack_start = sizeof(void*) * 2 + (ctx->stack_frame_size - ctx->stack_frame_alignment);
9804	} else {
9805		 /* skip return address */
9806		stack_start = sizeof(void*) + ctx->stack_frame_size;
9807	}
9808	n = use_list->count;
9809	for (i = 0, p = &ctx->use_edges[use_list->refs]; i < n; i++, p++) {
9810		use = *p;
9811		insn = &ctx->ir_base[use];
9812		if (insn->op == IR_PARAM) {
9813			if (IR_IS_TYPE_INT(insn->type)) {
9814				if (int_param_num < int_reg_params_count) {
9815					src_reg = int_reg_params[int_param_num];
9816				} else {
9817					src_reg = IR_REG_NONE;
9818				}
9819				int_param_num++;
9820#ifdef _WIN64
9821				/* WIN64 calling convention uses a common counter for int and fp registers */
9822				fp_param_num++;
9823#endif
9824			} else {
9825				if (fp_param_num < fp_reg_params_count) {
9826					src_reg = fp_reg_params[fp_param_num];
9827				} else {
9828					src_reg = IR_REG_NONE;
9829				}
9830				fp_param_num++;
9831#ifdef _WIN64
9832				/* WIN64 calling convention uses a common counter for int and fp registers */
9833				int_param_num++;
9834#endif
9835			}
9836			if (src_reg == IR_REG_NONE) {
9837				if (ctx->vregs[use]) {
9838					ir_live_interval *ival = ctx->live_intervals[ctx->vregs[use]];
9839					if ((ival->flags & IR_LIVE_INTERVAL_MEM_PARAM)
9840					 && ival->stack_spill_pos == -1
9841					 && (ival->next || ival->reg == IR_REG_NONE)) {
9842						ival->stack_spill_pos = stack_start + stack_offset;
9843					}
9844				}
9845				if (sizeof(void*) == 8) {
9846					stack_offset += sizeof(void*);
9847				} else {
9848					stack_offset += IR_MAX(sizeof(void*), ir_type_size[insn->type]);
9849				}
9850			}
9851		}
9852	}
9853
9854#ifdef _WIN64
9855	/* WIN64 reserves a shadow (home) area for the register parameters */
9856	stack_offset += IR_MIN(int_param_num, int_reg_params_count) * sizeof(void*);
9857#endif
9858	ctx->gp_reg_params = IR_MIN(int_param_num, int_reg_params_count);
9859	ctx->fp_reg_params = IR_MIN(fp_param_num, fp_reg_params_count);
9860	ctx->param_stack_size = stack_offset;
9861}
9862
9863static void ir_allocate_unique_spill_slots(ir_ctx *ctx)
9864{
9865	uint32_t b;
9866	ir_block *bb;
9867	ir_insn *insn;
9868	ir_ref i, n, j, *p;
9869	uint32_t *rule, insn_flags;
9870	ir_backend_data *data = ctx->data;
9871	ir_regset available = 0;
9872	ir_target_constraints constraints;
9873	uint32_t def_flags;
9874	ir_reg reg;
9875
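	/* Fallback assignment used when code is emitted without a prior register
	 * allocation pass: every virtual register gets its own spill slot, and
	 * each instruction receives temporaries from the scratch set according to
	 * its target constraints. */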
9876#ifndef IR_REG_FP_RET1
9877	if (ctx->flags2 & IR_HAS_FP_RET_SLOT) {
9878		ctx->ret_slot = ir_allocate_spill_slot(ctx, IR_DOUBLE, &data->ra_data);
9879	} else if (ctx->ret_type == IR_FLOAT || ctx->ret_type == IR_DOUBLE) {
9880		ctx->ret_slot = ir_allocate_spill_slot(ctx, ctx->ret_type, &data->ra_data);
9881	} else {
9882		ctx->ret_slot = -1;
9883	}
9884#endif
9885
9886	ctx->regs = ir_mem_malloc(sizeof(ir_regs) * ctx->insns_count);
9887	memset(ctx->regs, IR_REG_NONE, sizeof(ir_regs) * ctx->insns_count);
9888
9889	/* vregs + tmp + fixed + SCRATCH + ALL */
9890	ctx->live_intervals = ir_mem_calloc(ctx->vregs_count + 1 + IR_REG_NUM + 2, sizeof(ir_live_interval*));
9891
9892	if (!ctx->arena) {
9893		ctx->arena = ir_arena_create(16 * 1024);
9894	}
9895
9896	for (b = 1, bb = ctx->cfg_blocks + b; b <= ctx->cfg_blocks_count; b++, bb++) {
9897		IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
9898		for (i = bb->start, insn = ctx->ir_base + i, rule = ctx->rules + i; i <= bb->end;) {
9899			switch (ctx->rules ? *rule : insn->op) {
9900				case IR_START:
9901				case IR_BEGIN:
9902				case IR_END:
9903				case IR_IF_TRUE:
9904				case IR_IF_FALSE:
9905				case IR_CASE_VAL:
9906				case IR_CASE_DEFAULT:
9907				case IR_MERGE:
9908				case IR_LOOP_BEGIN:
9909				case IR_LOOP_END:
9910					break;
9911#ifndef IR_REG_FP_RET1
9912				case IR_CALL:
9913					if (ctx->ret_slot == -1 && (insn->type == IR_FLOAT || insn->type == IR_DOUBLE)) {
9914						ctx->ret_slot = ir_allocate_spill_slot(ctx, IR_DOUBLE, &data->ra_data);
9915					}
9916#endif
9917					IR_FALLTHROUGH;
9918				default:
9919					def_flags = ir_get_target_constraints(ctx, i, &constraints);
9920					if (ctx->rules
9921					 && *rule != IR_CMP_AND_BRANCH_INT
9922					 && *rule != IR_CMP_AND_BRANCH_FP
9923					 && *rule != IR_TEST_AND_BRANCH_INT
9924					 && *rule != IR_GUARD_CMP_INT
9925					 && *rule != IR_GUARD_CMP_FP) {
9926						available = IR_REGSET_SCRATCH;
9927					}
9928					if (ctx->vregs[i]) {
9929						reg = constraints.def_reg;
9930						if (reg != IR_REG_NONE && IR_REGSET_IN(available, reg)) {
9931							IR_REGSET_EXCL(available, reg);
9932							ctx->regs[i][0] = reg | IR_REG_SPILL_STORE;
9933						} else if (def_flags & IR_USE_MUST_BE_IN_REG) {
9934							if (insn->op == IR_VLOAD
9935							 && ctx->live_intervals[ctx->vregs[i]]
9936							 && ctx->live_intervals[ctx->vregs[i]]->stack_spill_pos != -1) {
9937								/* pass */
9938							} else if (insn->op != IR_PARAM) {
9939								reg = ir_get_free_reg(insn->type, available);
9940								IR_REGSET_EXCL(available, reg);
9941								ctx->regs[i][0] = reg | IR_REG_SPILL_STORE;
9942							}
9943						}
9944						if (!ctx->live_intervals[ctx->vregs[i]]) {
9945							ir_live_interval *ival = ir_arena_alloc(&ctx->arena, sizeof(ir_live_interval));
9946							memset(ival, 0, sizeof(ir_live_interval));
9947							ctx->live_intervals[ctx->vregs[i]] = ival;
9948							ival->type = insn->type;
9949							ival->reg = IR_REG_NONE;
9950							ival->vreg = ctx->vregs[i];
9951							ival->stack_spill_pos = -1;
9952							if (insn->op == IR_PARAM && reg == IR_REG_NONE) {
9953								ival->flags |= IR_LIVE_INTERVAL_MEM_PARAM;
9954							} else {
9955								ival->stack_spill_pos = ir_allocate_spill_slot(ctx, ival->type, &data->ra_data);
9956							}
9957						} else if (insn->op == IR_PARAM) {
9958							IR_ASSERT(0 && "unexpected PARAM");
9959							return;
9960						}
9961					} else if (insn->op == IR_VAR) {
9962						ir_use_list *use_list = &ctx->use_lists[i];
9963						ir_ref n = use_list->count;
9964
9965						if (n > 0) {
9966							int32_t stack_spill_pos = insn->op3 = ir_allocate_spill_slot(ctx, insn->type, &data->ra_data);
9967							ir_ref i, *p, use;
9968							ir_insn *use_insn;
9969
9970							for (i = 0, p = &ctx->use_edges[use_list->refs]; i < n; i++, p++) {
9971								use = *p;
9972								use_insn = &ctx->ir_base[use];
9973								if (use_insn->op == IR_VLOAD) {
9974									if (ctx->vregs[use]
9975									 && !ctx->live_intervals[ctx->vregs[use]]) {
9976										ir_live_interval *ival = ir_arena_alloc(&ctx->arena, sizeof(ir_live_interval));
9977										memset(ival, 0, sizeof(ir_live_interval));
9978										ctx->live_intervals[ctx->vregs[use]] = ival;
9979										ival->type = insn->type;
9980										ival->reg = IR_REG_NONE;
9981										ival->vreg = ctx->vregs[use];
9982										ival->stack_spill_pos = stack_spill_pos;
9983									}
9984								} else if (use_insn->op == IR_VSTORE) {
9985									if (!IR_IS_CONST_REF(use_insn->op3)
9986									 && ctx->vregs[use_insn->op3]
9987									 && !ctx->live_intervals[ctx->vregs[use_insn->op3]]) {
9988										ir_live_interval *ival = ir_arena_alloc(&ctx->arena, sizeof(ir_live_interval));
9989										memset(ival, 0, sizeof(ir_live_interval));
9990										ctx->live_intervals[ctx->vregs[use_insn->op3]] = ival;
9991										ival->type = insn->type;
9992										ival->reg = IR_REG_NONE;
9993										ival->vreg = ctx->vregs[use_insn->op3];
9994										ival->stack_spill_pos = stack_spill_pos;
9995									}
9996								}
9997							}
9998						}
9999					}
10000
10001					insn_flags = ir_op_flags[insn->op];
10002					n = constraints.tmps_count;
10003					if (n) {
10004						do {
10005							n--;
10006							if (constraints.tmp_regs[n].type) {
10007								ir_reg reg = ir_get_free_reg(constraints.tmp_regs[n].type, available);
10008								ir_ref *ops = insn->ops;
10009								IR_REGSET_EXCL(available, reg);
10010								if (constraints.tmp_regs[n].num > 0
10011								 && IR_IS_CONST_REF(ops[constraints.tmp_regs[n].num])) {
10012									/* rematerialization */
10013									reg |= IR_REG_SPILL_LOAD;
10014								}
10015								ctx->regs[i][constraints.tmp_regs[n].num] = reg;
10016							} else if (constraints.tmp_regs[n].reg == IR_REG_SCRATCH) {
10017								available = IR_REGSET_DIFFERENCE(available, IR_REGSET_SCRATCH);
10018							} else {
10019								IR_REGSET_EXCL(available, constraints.tmp_regs[n].reg);
10020							}
10021						} while (n);
10022					}
10023					n = insn->inputs_count;
10024					for (j = 1, p = insn->ops + 1; j <= n; j++, p++) {
10025						ir_ref input = *p;
10026						if (IR_OPND_KIND(insn_flags, j) == IR_OPND_DATA && input > 0 && ctx->vregs[input]) {
10027							if ((def_flags & IR_DEF_REUSES_OP1_REG) && j == 1) {
10028								ir_reg reg = IR_REG_NUM(ctx->regs[i][0]);
10029								ctx->regs[i][1] = reg | IR_REG_SPILL_LOAD;
10030							} else {
10031								uint8_t use_flags = IR_USE_FLAGS(def_flags, j);
10032								ir_reg reg = (j < constraints.hints_count) ? constraints.hints[j] : IR_REG_NONE;
10033
10034								if (reg != IR_REG_NONE && IR_REGSET_IN(available, reg)) {
10035									IR_REGSET_EXCL(available, reg);
10036									ctx->regs[i][j] = reg | IR_REG_SPILL_LOAD;
10037								} else if (j > 1 && input == insn->op1 && ctx->regs[i][1] != IR_REG_NONE) {
10038									ctx->regs[i][j] = ctx->regs[i][1];
10039								} else if (use_flags & IR_USE_MUST_BE_IN_REG) {
10040									reg = ir_get_free_reg(ctx->ir_base[input].type, available);
10041									IR_REGSET_EXCL(available, reg);
10042									ctx->regs[i][j] = reg | IR_REG_SPILL_LOAD;
10043								}
10044							}
10045						}
10046					}
10047					break;
10048			}
10049			n = ir_insn_len(insn);
10050			i += n;
10051			insn += n;
10052			rule += n;
10053		}
10054		if (bb->flags & IR_BB_DESSA_MOVES) {
10055			data->dessa_from_block = b;
10056			ir_gen_dessa_moves(ctx, b, ir_fix_dessa_tmps);
10057		}
10058	}
10059
10060	ctx->used_preserved_regs = ctx->fixed_save_regset;
10061	ctx->flags |= IR_NO_STACK_COMBINE;
10062	ir_fix_stack_frame(ctx);
10063}
10064
10065static void ir_preallocate_call_stack(ir_ctx *ctx)
10066{
10067	int call_stack_size, peak_call_stack_size = 0;
10068	ir_ref i, n;
10069	ir_insn *insn;
10070
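	/* Find the largest outgoing-argument area of any CALL, so the prologue
	 * can reserve it once instead of adjusting the stack pointer around every
	 * call site. */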
10071	for (i = 1, insn = ctx->ir_base + 1; i < ctx->insns_count;) {
10072		if (insn->op == IR_CALL) {
10073			call_stack_size = ir_call_used_stack(ctx, insn);
10074			if (call_stack_size > peak_call_stack_size
10075#ifdef IR_HAVE_FASTCALL
10076			 && !ir_is_fastcall(ctx, insn) /* fast call functions restore stack pointer */
10077#endif
10078			) {
10079				peak_call_stack_size = call_stack_size;
10080			}
10081		}
10082		n = ir_insn_len(insn);
10083		i += n;
10084		insn += n;
10085	}
10086	if (peak_call_stack_size) {
10087		ctx->call_stack_size = peak_call_stack_size;
10088		ctx->flags |= IR_PREALLOCATED_STACK;
10089	}
10090}
10091
10092void ir_fix_stack_frame(ir_ctx *ctx)
10093{
10094	uint32_t additional_size = 0;
10095
10096	ctx->locals_area_size = ctx->stack_frame_size;
10097
10098#if defined(IR_TARGET_X64) && !defined(_WIN64)
10099	if ((ctx->flags & IR_VARARG_FUNC) && (ctx->flags2 & IR_HAS_VA_START)) {
10100		ctx->flags2 |= IR_16B_FRAME_ALIGNMENT;
10101		ctx->stack_frame_size = IR_ALIGNED_SIZE(ctx->stack_frame_size, 16);
10102		ctx->locals_area_size = ctx->stack_frame_size;
10103		if ((ctx->flags2 & (IR_HAS_VA_ARG_GP|IR_HAS_VA_COPY)) && ctx->gp_reg_params < IR_REG_INT_ARGS) {
10104			additional_size += sizeof(void*) * IR_REG_INT_ARGS;
10105		}
10106		if ((ctx->flags2 & (IR_HAS_VA_ARG_FP|IR_HAS_VA_COPY)) && ctx->fp_reg_params < IR_REG_FP_ARGS) {
10107			additional_size += 16 * IR_REG_FP_ARGS;
10108		}
10109	}
10110#endif
10111
10112	if (ctx->used_preserved_regs) {
10113		ir_regset used_preserved_regs = (ir_regset)ctx->used_preserved_regs;
10114		ir_reg reg;
10115		(void) reg;
10116
10117		IR_REGSET_FOREACH(used_preserved_regs, reg) {
10118			additional_size += sizeof(void*);
10119		} IR_REGSET_FOREACH_END();
10120	}
10121
10122	ctx->stack_frame_size = IR_ALIGNED_SIZE(ctx->stack_frame_size, sizeof(void*));
10123	ctx->stack_frame_size += additional_size;
10124	ctx->stack_frame_alignment = 0;
10125	ctx->call_stack_size = 0;
10126
10127	if (ctx->flags2 & IR_16B_FRAME_ALIGNMENT) {
10128		/* Stack must be 16 byte aligned */
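		/* The required padding depends on what is already on the stack at
		 * entry: nothing for a non-function snippet, the return address plus
		 * the saved frame pointer when a frame pointer is used, and the return
		 * address (plus any preallocated call area) otherwise. */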
10129		if (!(ctx->flags & IR_FUNCTION)) {
10130			while (IR_ALIGNED_SIZE(ctx->stack_frame_size, 16) != ctx->stack_frame_size) {
10131				ctx->stack_frame_size += sizeof(void*);
10132				ctx->stack_frame_alignment += sizeof(void*);
10133			}
10134		} else if (ctx->flags & IR_USE_FRAME_POINTER) {
10135			while (IR_ALIGNED_SIZE(ctx->stack_frame_size + sizeof(void*) * 2, 16) != ctx->stack_frame_size + sizeof(void*) * 2) {
10136				ctx->stack_frame_size += sizeof(void*);
10137				ctx->stack_frame_alignment += sizeof(void*);
10138			}
10139		} else {
10140			if (!(ctx->flags & IR_NO_STACK_COMBINE)) {
10141				ir_preallocate_call_stack(ctx);
10142			}
10143			while (IR_ALIGNED_SIZE(ctx->stack_frame_size + ctx->call_stack_size + sizeof(void*), 16) !=
10144					ctx->stack_frame_size + ctx->call_stack_size + sizeof(void*)) {
10145				ctx->stack_frame_size += sizeof(void*);
10146				ctx->stack_frame_alignment += sizeof(void*);
10147			}
10148		}
10149	}
10150
10151	ir_fix_param_spills(ctx);
10152}
10153
10154static void* dasm_labels[ir_lb_MAX];
10155
10156static uint32_t _ir_next_block(ir_ctx *ctx, uint32_t _b)
10157{
10158	uint32_t b = ctx->cfg_schedule[++_b];
10159
10160	/* Check for empty ENTRY block */
10161	while (b && ((ctx->cfg_blocks[b].flags & (IR_BB_START|IR_BB_EMPTY)) == IR_BB_EMPTY)) {
10162		b = ctx->cfg_schedule[++_b];
10163	}
10164	return b;
10165}
10166
10167void *ir_emit_code(ir_ctx *ctx, size_t *size_ptr)
10168{
10169	uint32_t _b, b, n, target;
10170	ir_block *bb;
10171	ir_ref i;
10172	ir_insn *insn;
10173	uint32_t *rule;
10174	ir_backend_data data;
10175	dasm_State **Dst;
10176	int ret;
10177	void *entry;
10178	size_t size;
10179
10180	data.ra_data.unused_slot_4 = 0;
10181	data.ra_data.unused_slot_2 = 0;
10182	data.ra_data.unused_slot_1 = 0;
10183	data.ra_data.handled = NULL;
10184	data.rodata_label = 0;
10185	data.jmp_table_label = 0;
10186	data.double_neg_const = 0;
10187	data.float_neg_const = 0;
10188	data.double_abs_const = 0;
10189	data.float_abs_const = 0;
10190	data.double_zero_const = 0;
10191	ctx->data = &data;
10192
10193	if (!ctx->live_intervals) {
10194		ctx->stack_frame_size = 0;
10195		ctx->stack_frame_alignment = 0;
10196		ctx->call_stack_size = 0;
10197		ctx->used_preserved_regs = 0;
10198		ir_allocate_unique_spill_slots(ctx);
10199	}
10200
10201	if (ctx->fixed_stack_frame_size != -1) {
10202		if (ctx->fixed_stack_red_zone) {
10203			IR_ASSERT(ctx->fixed_stack_red_zone == ctx->fixed_stack_frame_size + ctx->fixed_call_stack_size);
10204		}
10205		if (ctx->stack_frame_size > ctx->fixed_stack_frame_size) {
10206			// TODO: report error to caller
10207#ifdef IR_DEBUG_MESSAGES
10208			fprintf(stderr, "IR Compilation Aborted: ctx->stack_frame_size > ctx->fixed_stack_frame_size at %s:%d\n",
10209				__FILE__, __LINE__);
10210#endif
10211			ctx->data = NULL;
10212			ctx->status = IR_ERROR_FIXED_STACK_FRAME_OVERFLOW;
10213			return NULL;
10214		}
10215		ctx->stack_frame_size = ctx->fixed_stack_frame_size;
10216		ctx->call_stack_size = ctx->fixed_call_stack_size;
10217		ctx->stack_frame_alignment = 0;
10218	}
10219
10220	Dst = &data.dasm_state;
10221	data.dasm_state = NULL;
10222	dasm_init(&data.dasm_state, DASM_MAXSECTION);
10223	dasm_setupglobal(&data.dasm_state, dasm_labels, ir_lb_MAX);
10224	dasm_setup(&data.dasm_state, dasm_actions);
10225	/* labels for each block + for each constant + rodata label + jmp_table label + for each entry */
10226	dasm_growpc(&data.dasm_state, ctx->cfg_blocks_count + 1 + ctx->consts_count + 1 + 1 + 1 + ctx->entries_count);
10227	data.emit_constants = ir_bitset_malloc(ctx->consts_count);
10228
10229	if ((ctx->flags & IR_GEN_ENDBR) && (ctx->flags & IR_START_BR_TARGET)) {
10230		|.if X64
10231		|	endbr64
10232		|.else
10233		|	endbr32
10234		|.endif
10235	}
10236
10237	if (!(ctx->flags & IR_SKIP_PROLOGUE)) {
10238		ir_emit_prologue(ctx);
10239	}
10240	if (ctx->flags & IR_FUNCTION) {
10241		ir_emit_load_params(ctx);
10242	}
10243
10244	if (UNEXPECTED(!ctx->cfg_schedule)) {
10245		uint32_t *list = ctx->cfg_schedule = ir_mem_malloc(sizeof(uint32_t) * (ctx->cfg_blocks_count + 2));
10246		for (b = 0; b <= ctx->cfg_blocks_count; b++) {
10247			list[b] = b;
10248		}
10249		list[ctx->cfg_blocks_count + 1] = 0;
10250	}
10251
10252	for (_b = 1; _b <= ctx->cfg_blocks_count; _b++) {
10253		b = ctx->cfg_schedule[_b];
10254		bb = &ctx->cfg_blocks[b];
10255		IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
10256		if ((bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) == IR_BB_EMPTY) {
10257			continue;
10258		}
10259		if (bb->flags & IR_BB_ALIGN_LOOP) {
10260			|	.align IR_LOOP_ALIGNMENT
10261		}
10262		|=>b:
10263
10264		i = bb->start;
10265		insn = ctx->ir_base + i;
10266		if (bb->flags & IR_BB_ENTRY) {
10267			uint32_t label = ctx->cfg_blocks_count + ctx->consts_count + 4 + insn->op3;
10268
10269			|=>label:
10270			if ((ctx->flags & IR_GEN_ENDBR) && (ctx->flags & IR_ENTRY_BR_TARGET)) {
10271				|.if X64
10272				|	endbr64
10273				|.else
10274				|	endbr32
10275				|.endif
10276			}
10277			ir_emit_prologue(ctx);
10278			ctx->entries[insn->op3] = i;
10279		}
10280
10281		/* skip first instruction */
10282		n = ir_insn_len(insn);
10283		i += n;
10284		insn += n;
10285		rule = ctx->rules + i;
10286
10287		while (i <= bb->end) {
10288			if (!((*rule) & (IR_FUSED|IR_SKIPPED)))
10289			switch ((*rule) & IR_RULE_MASK) {
10290				case IR_VAR:
10291				case IR_PARAM:
10292				case IR_PI:
10293				case IR_PHI:
10294				case IR_SNAPSHOT:
10295				case IR_VA_END:
10296					break;
10297				case IR_LEA_OB:
10298				case IR_LEA_SI:
10299				case IR_LEA_SIB:
10300				case IR_LEA_IB:
10301				case IR_LEA_OB_I:
10302				case IR_LEA_I_OB:
10303				case IR_LEA_SI_O:
10304				case IR_LEA_SIB_O:
10305				case IR_LEA_IB_O:
10306				case IR_LEA_OB_SI:
10307				case IR_LEA_SI_OB:
10308				case IR_LEA_B_SI:
10309				case IR_LEA_SI_B:
10310					ir_emit_lea(ctx, i, insn->type);
10311					break;
10312				case IR_MUL_PWR2:
10313				case IR_DIV_PWR2:
10314				case IR_MOD_PWR2:
10315					ir_emit_mul_div_mod_pwr2(ctx, i, insn);
10316					break;
10317				case IR_SDIV_PWR2:
10318					ir_emit_sdiv_pwr2(ctx, i, insn);
10319					break;
10320				case IR_SMOD_PWR2:
10321					ir_emit_smod_pwr2(ctx, i, insn);
10322					break;
10323				case IR_SHIFT:
10324					ir_emit_shift(ctx, i, insn);
10325					break;
10326				case IR_SHIFT_CONST:
10327					ir_emit_shift_const(ctx, i, insn);
10328					break;
10329				case IR_BIT_COUNT:
10330					ir_emit_bit_count(ctx, i, insn);
10331					break;
10332				case IR_CTPOP:
10333					ir_emit_ctpop(ctx, i, insn);
10334					break;
10335				case IR_INC:
10336				case IR_DEC:
10337				case IR_OP_INT:
10338					ir_emit_op_int(ctx, i, insn, *rule);
10339					break;
10340				case IR_ABS_INT:
10341					ir_emit_abs_int(ctx, i, insn);
10342					break;
10343				case IR_BOOL_NOT_INT:
10344					ir_emit_bool_not_int(ctx, i, insn);
10345					break;
10346				case IR_OP_FP:
10347					ir_emit_op_fp(ctx, i, insn);
10348					break;
10349				case IR_IMUL3:
10350					ir_emit_imul3(ctx, i, insn);
10351					break;
10352				case IR_BINOP_INT:
10353					ir_emit_binop_int(ctx, i, insn);
10354					break;
10355				case IR_BINOP_SSE2:
10356					ir_emit_binop_sse2(ctx, i, insn);
10357					break;
10358				case IR_BINOP_AVX:
10359					ir_emit_binop_avx(ctx, i, insn);
10360					break;
10361				case IR_MUL_INT:
10362				case IR_DIV_INT:
10363				case IR_MOD_INT:
10364					ir_emit_mul_div_mod(ctx, i, insn);
10365					break;
10366				case IR_CMP_INT:
10367					ir_emit_cmp_int(ctx, i, insn);
10368					break;
10369				case IR_TESTCC_INT:
10370					ir_emit_testcc_int(ctx, i, insn);
10371					break;
10372				case IR_SETCC_INT:
10373					ir_emit_setcc_int(ctx, i, insn);
10374					break;
10375				case IR_CMP_FP:
10376					ir_emit_cmp_fp(ctx, i, insn);
10377					break;
10378				case IR_SEXT:
10379					ir_emit_sext(ctx, i, insn);
10380					break;
10381				case IR_ZEXT:
10382					ir_emit_zext(ctx, i, insn);
10383					break;
10384				case IR_TRUNC:
10385					ir_emit_trunc(ctx, i, insn);
10386					break;
10387				case IR_BITCAST:
10388				case IR_PROTO:
10389					ir_emit_bitcast(ctx, i, insn);
10390					break;
10391				case IR_INT2FP:
10392					ir_emit_int2fp(ctx, i, insn);
10393					break;
10394				case IR_FP2INT:
10395					ir_emit_fp2int(ctx, i, insn);
10396					break;
10397				case IR_FP2FP:
10398					ir_emit_fp2fp(ctx, i, insn);
10399					break;
10400				case IR_COPY_INT:
10401					ir_emit_copy_int(ctx, i, insn);
10402					break;
10403				case IR_COPY_FP:
10404					ir_emit_copy_fp(ctx, i, insn);
10405					break;
10406				case IR_CMP_AND_STORE_INT:
10407					ir_emit_cmp_and_store_int(ctx, i, insn);
10408					break;
10409				case IR_CMP_AND_BRANCH_INT:
10410					ir_emit_cmp_and_branch_int(ctx, b, i, insn, _ir_next_block(ctx, _b));
10411					break;
10412				case IR_CMP_AND_BRANCH_FP:
10413					ir_emit_cmp_and_branch_fp(ctx, b, i, insn, _ir_next_block(ctx, _b));
10414					break;
10415				case IR_TEST_AND_BRANCH_INT:
10416					ir_emit_test_and_branch_int(ctx, b, i, insn, _ir_next_block(ctx, _b));
10417					break;
10418				case IR_JCC_INT:
10419					{
10420						ir_op op = ctx->ir_base[insn->op2].op;
10421
10422						if (op == IR_ADD ||
10423						    op == IR_SUB ||
10424//						    op == IR_MUL ||
10425						    op == IR_OR  ||
10426						    op == IR_AND ||
10427						    op == IR_XOR) {
10428							op = IR_NE;
10429						} else {
10430							IR_ASSERT(op >= IR_EQ && op <= IR_UGT);
10431						}
10432						ir_emit_jcc(ctx, b, i, insn, _ir_next_block(ctx, _b), op, 1);
10433					}
10434					break;
10435				case IR_GUARD_CMP_INT:
10436					if (ir_emit_guard_cmp_int(ctx, b, i, insn, _ir_next_block(ctx, _b))) {
10437						goto next_block;
10438					}
10439					break;
10440				case IR_GUARD_CMP_FP:
10441					if (ir_emit_guard_cmp_fp(ctx, b, i, insn, _ir_next_block(ctx, _b))) {
10442						goto next_block;
10443					}
10444					break;
10445				case IR_GUARD_TEST_INT:
10446					if (ir_emit_guard_test_int(ctx, b, i, insn, _ir_next_block(ctx, _b))) {
10447						goto next_block;
10448					}
10449					break;
10450				case IR_GUARD_JCC_INT:
10451					if (ir_emit_guard_jcc_int(ctx, b, i, insn, _ir_next_block(ctx, _b))) {
10452						goto next_block;
10453					}
10454					break;
10455				case IR_IF_INT:
10456					ir_emit_if_int(ctx, b, i, insn, _ir_next_block(ctx, _b));
10457					break;
10458				case IR_COND:
10459					ir_emit_cond(ctx, i, insn);
10460					break;
10461				case IR_COND_CMP_INT:
10462					ir_emit_cond_cmp_int(ctx, i, insn);
10463					break;
10464				case IR_COND_CMP_FP:
10465					ir_emit_cond_cmp_fp(ctx, i, insn);
10466					break;
10467				case IR_SWITCH:
10468					ir_emit_switch(ctx, b, i, insn);
10469					break;
10470				case IR_MIN_MAX_INT:
10471					ir_emit_min_max_int(ctx, i, insn);
10472					break;
10473				case IR_OVERFLOW:
10474					ir_emit_overflow(ctx, i, insn);
10475					break;
10476				case IR_OVERFLOW_AND_BRANCH:
10477					ir_emit_overflow_and_branch(ctx, b, i, insn, _ir_next_block(ctx, _b));
10478					break;
10479				case IR_END:
10480				case IR_LOOP_END:
10481					if (bb->flags & IR_BB_OSR_ENTRY_LOADS) {
10482						ir_emit_osr_entry_loads(ctx, b, bb);
10483					}
10484					if (bb->flags & IR_BB_DESSA_MOVES) {
10485						ir_emit_dessa_moves(ctx, b, bb);
10486					}
10487					do {
10488						ir_ref succ = ctx->cfg_edges[bb->successors];
10489
10490						if (UNEXPECTED(bb->successors_count == 2)) {
10491							if (ctx->cfg_blocks[succ].flags & IR_BB_ENTRY) {
10492								succ = ctx->cfg_edges[bb->successors + 1];
10493							} else {
10494								IR_ASSERT(ctx->cfg_blocks[ctx->cfg_edges[bb->successors + 1]].flags & IR_BB_ENTRY);
10495							}
10496						} else {
10497							IR_ASSERT(bb->successors_count == 1);
10498						}
10499						target = ir_skip_empty_target_blocks(ctx, succ);
10500						if (target != _ir_next_block(ctx, _b)) {
10501							|	jmp =>target
10502						}
10503					} while (0);
10504					break;
10505				case IR_RETURN_VOID:
10506					ir_emit_return_void(ctx);
10507					break;
10508				case IR_RETURN_INT:
10509					ir_emit_return_int(ctx, i, insn);
10510					break;
10511				case IR_RETURN_FP:
10512					ir_emit_return_fp(ctx, i, insn);
10513					break;
10514				case IR_CALL:
10515					ir_emit_call(ctx, i, insn);
10516					break;
10517				case IR_TAILCALL:
10518					ir_emit_tailcall(ctx, i, insn);
10519					break;
10520				case IR_IJMP:
10521					ir_emit_ijmp(ctx, i, insn);
10522					break;
10523				case IR_MEM_OP_INT:
10524				case IR_MEM_INC:
10525				case IR_MEM_DEC:
10526					ir_emit_mem_op_int(ctx, i, insn, *rule);
10527					break;
10528				case IR_MEM_BINOP_INT:
10529					ir_emit_mem_binop_int(ctx, i, insn);
10530					break;
10531				case IR_MEM_MUL_PWR2:
10532				case IR_MEM_DIV_PWR2:
10533				case IR_MEM_MOD_PWR2:
10534					ir_emit_mem_mul_div_mod_pwr2(ctx, i, insn);
10535					break;
10536				case IR_MEM_SHIFT:
10537					ir_emit_mem_shift(ctx, i, insn);
10538					break;
10539				case IR_MEM_SHIFT_CONST:
10540					ir_emit_mem_shift_const(ctx, i, insn);
10541					break;
10542				case IR_REG_BINOP_INT:
10543					ir_emit_reg_binop_int(ctx, i, insn);
10544					break;
10545				case IR_VADDR:
10546					ir_emit_vaddr(ctx, i, insn);
10547					break;
10548				case IR_VLOAD:
10549					ir_emit_vload(ctx, i, insn);
10550					break;
10551				case IR_VSTORE_INT:
10552					ir_emit_vstore_int(ctx, i, insn);
10553					break;
10554				case IR_VSTORE_FP:
10555					ir_emit_vstore_fp(ctx, i, insn);
10556					break;
10557				case IR_RLOAD:
10558					ir_emit_rload(ctx, i, insn);
10559					break;
10560				case IR_RSTORE:
10561					ir_emit_rstore(ctx, i, insn);
10562					break;
10563				case IR_LOAD_INT:
10564					ir_emit_load_int(ctx, i, insn);
10565					break;
10566				case IR_LOAD_FP:
10567					ir_emit_load_fp(ctx, i, insn);
10568					break;
10569				case IR_STORE_INT:
10570					ir_emit_store_int(ctx, i, insn);
10571					break;
10572				case IR_STORE_FP:
10573					ir_emit_store_fp(ctx, i, insn);
10574					break;
10575				case IR_ALLOCA:
10576					ir_emit_alloca(ctx, i, insn);
10577					break;
10578				case IR_VA_START:
10579					ir_emit_va_start(ctx, i, insn);
10580					break;
10581				case IR_VA_COPY:
10582					ir_emit_va_copy(ctx, i, insn);
10583					break;
10584				case IR_VA_ARG:
10585					ir_emit_va_arg(ctx, i, insn);
10586					break;
10587				case IR_AFREE:
10588					ir_emit_afree(ctx, i, insn);
10589					break;
10590				case IR_BLOCK_BEGIN:
10591					ir_emit_block_begin(ctx, i, insn);
10592					break;
10593				case IR_BLOCK_END:
10594					ir_emit_block_end(ctx, i, insn);
10595					break;
10596				case IR_FRAME_ADDR:
10597					ir_emit_frame_addr(ctx, i);
10598					break;
10599				case IR_EXITCALL:
10600					ir_emit_exitcall(ctx, i, insn);
10601					break;
10602				case IR_GUARD:
10603				case IR_GUARD_NOT:
10604					if (ir_emit_guard(ctx, b, i, insn, _ir_next_block(ctx, _b))) {
10605						goto next_block;
10606					}
10607					break;
10608				case IR_GUARD_OVERFLOW:
10609					if (ir_emit_guard_overflow(ctx, b, i, insn)) {
10610						goto next_block;
10611					}
10612					break;
10613				case IR_SSE_SQRT:
10614					ir_emit_sse_sqrt(ctx, i, insn);
10615					break;
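				/* The last argument is the ROUNDSS/ROUNDSD imm8:
				 * bits 1:0 select the rounding mode, bit 2 selects the
				 * MXCSR mode instead, and bit 3 suppresses the inexact
				 * exception.  Hence 4 = MXCSR mode (rint), 9 = down
				 * (floor), 10 = up (ceil), 11 = toward zero (trunc),
				 * 12 = MXCSR mode + no inexact (nearbyint). */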
10616				case IR_SSE_RINT:
10617					ir_emit_sse_round(ctx, i, insn, 4);
10618					break;
10619				case IR_SSE_FLOOR:
10620					ir_emit_sse_round(ctx, i, insn, 9);
10621					break;
10622				case IR_SSE_CEIL:
10623					ir_emit_sse_round(ctx, i, insn, 10);
10624					break;
10625				case IR_SSE_TRUNC:
10626					ir_emit_sse_round(ctx, i, insn, 11);
10627					break;
10628				case IR_SSE_NEARBYINT:
10629					ir_emit_sse_round(ctx, i, insn, 12);
10630					break;
10631				case IR_TLS:
10632					ir_emit_tls(ctx, i, insn);
10633					break;
10634				case IR_TRAP:
10635					|	int3
10636					break;
10637				default:
10638					IR_ASSERT(0 && "NIY rule/instruction");
10639					ir_mem_free(data.emit_constants);
10640					dasm_free(&data.dasm_state);
10641					ctx->data = NULL;
10642					ctx->status = IR_ERROR_UNSUPPORTED_CODE_RULE;
10643					return NULL;
10644			}
10645			n = ir_insn_len(insn);
10646			i += n;
10647			insn += n;
10648			rule += n;
10649		}
10650next_block:;
10651	}
10652
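	/*
	 * Materialize the constants collected during emission into .rodata.
	 * Each constant is emitted under the PC label cfg_blocks_count +
	 * <constant index>, matching the references generated above: FP
	 * constants as raw 4/8-byte images, IR_STR constants as NUL-terminated
	 * byte strings.
	 */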
10653	if (data.rodata_label) {
10654		|.rodata
10655	}
10656	IR_BITSET_FOREACH(data.emit_constants, ir_bitset_len(ctx->consts_count), i) {
10657		insn = &ctx->ir_base[-i];
10658		if (IR_IS_TYPE_FP(insn->type)) {
10659			int label = ctx->cfg_blocks_count + i;
10660
10661			if (!data.rodata_label) {
10662				data.rodata_label = ctx->cfg_blocks_count + ctx->consts_count + 2;
10663
10664				|.rodata
10665				|=>data.rodata_label:
10666			}
10667			if (insn->type == IR_DOUBLE) {
10668				|.align 8
10669				|=>label:
10670				|.dword insn->val.u32, insn->val.u32_hi
10671			} else {
10672				IR_ASSERT(insn->type == IR_FLOAT);
10673				|.align 4
10674				|=>label:
10675				|.dword insn->val.u32
10676			}
10677		} else if (insn->op == IR_STR) {
10678			int label = ctx->cfg_blocks_count + i;
10679			const char *str = ir_get_str(ctx, insn->val.str);
10680			int j = 0; /* byte index within the string; 'i' is the enclosing bitset iterator */
10681
10682			if (!data.rodata_label) {
10683				data.rodata_label = ctx->cfg_blocks_count + ctx->consts_count + 2;
10684
10685				|.rodata
10686				|=>data.rodata_label:
10687			}
10688			|.align 8
10689			|=>label:
10690			while (str[j]) {
10691				char c = str[j];
10692
10693				|.byte c
10694				j++;
10695			}
10696			|.byte 0
10697
10698		} else {
10699			IR_ASSERT(0);
10700		}
10701	} IR_BITSET_FOREACH_END();
10702	if (data.rodata_label) {
10703		|.code
10704	}
10705	ir_mem_free(data.emit_constants);
10706
10707	if (ctx->status) {
10708		dasm_free(&data.dasm_state);
10709		ctx->data = NULL;
10710		return NULL;
10711	}
10712
10713	ret = dasm_link(&data.dasm_state, size_ptr);
10714	if (ret != DASM_S_OK) {
10715		IR_ASSERT(0);
10716		dasm_free(&data.dasm_state);
10717		ctx->data = NULL;
10718		ctx->status = IR_ERROR_LINK;
10719		return NULL;
10720	}
10721	size = *size_ptr;
10722
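	/*
	 * Reserve memory for the generated code: either carve a 16-byte-aligned
	 * chunk out of the caller-provided code buffer, or mmap a fresh region
	 * and make it writable for encoding.
	 */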
10723	if (ctx->code_buffer) {
10724		entry = ctx->code_buffer->pos;
10725		entry = (void*)IR_ALIGNED_SIZE(((size_t)(entry)), 16);
10726		if (size > (size_t)((char*)ctx->code_buffer->end - (char*)entry)) {
			dasm_free(&data.dasm_state); /* don't leak the DynAsm state */
10727			ctx->data = NULL;
10728			ctx->status = IR_ERROR_CODE_MEM_OVERFLOW;
10729			return NULL;
10730		}
10731		ctx->code_buffer->pos = (char*)entry + size;
10732	} else {
10733		entry = ir_mem_mmap(size);
10734		if (!entry) {
10735			dasm_free(&data.dasm_state);
10736			ctx->data = NULL;
10737			ctx->status = IR_ERROR_CODE_MEM_OVERFLOW;
10738			return NULL;
10739		}
10740		ir_mem_unprotect(entry, size);
10741	}
10742
10743	ret = dasm_encode(&data.dasm_state, entry);
10744	if (ret != DASM_S_OK) {
10745		IR_ASSERT(0);
10746		dasm_free(&data.dasm_state);
10747		if (ctx->code_buffer) {
10748			if (ctx->code_buffer->pos == (char*)entry + size) {
10749				/* rollback: restore the buffer position to the (aligned) entry */
10750				ctx->code_buffer->pos = (char*)entry;
10751			}
10752		} else {
10753			ir_mem_unmap(entry, size);
10754		}
10755		ctx->data = NULL;
10756		ctx->status = IR_ERROR_ENCODE;
10757		return NULL;
10758	}
10759
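	/* Record the offsets of the optional jump-table and .rodata sections
	 * (0 is used as "section not emitted"). */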
10760	if (data.jmp_table_label) {
10761		uint32_t offset = dasm_getpclabel(&data.dasm_state, data.jmp_table_label);
10762		ctx->jmp_table_offset = offset;
10763	} else {
10764		ctx->jmp_table_offset = 0;
10765	}
10766	if (data.rodata_label) {
10767		uint32_t offset = dasm_getpclabel(&data.dasm_state, data.rodata_label);
10768		ctx->rodata_offset = offset;
10769	} else {
10770		ctx->rodata_offset = 0;
10771	}
10772
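	/*
	 * Resolve each recorded entry's DynAsm label to a byte offset from the
	 * start of the generated code; insn->op3 holds the entry's label ordinal
	 * up to this point and is overwritten with that offset.
	 */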
10773	if (ctx->entries_count) {
10774		/* For all entries */
10775		i = ctx->entries_count;
10776		do {
10777			ir_insn *insn = &ctx->ir_base[ctx->entries[--i]];
10778			uint32_t offset = dasm_getpclabel(&data.dasm_state, ctx->cfg_blocks_count + ctx->consts_count + 4 + insn->op3);
10779			insn->op3 = offset;
10780		} while (i != 0);
10781	}
10782
10783	dasm_free(&data.dasm_state);
10784
10785	ir_mem_flush(entry, size);
10786
10787#if defined(__GNUC__)
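	/*
	 * With IR_GEN_CACHE_DEMOTE on a CLDEMOTE-capable CPU, demote every
	 * 64-byte cache line of the freshly written code from this core's
	 * caches toward the shared LLC, so the core that executes it next can
	 * fetch it with less coherence traffic.  CLDEMOTE (0F 1C /0, here with
	 * a [%rsi] operand) is emitted as raw bytes because older assemblers
	 * lack the mnemonic.
	 */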
10788	if ((ctx->flags & IR_GEN_CACHE_DEMOTE) && (ctx->mflags & IR_X86_CLDEMOTE)) {
10789		uintptr_t start = (uintptr_t)entry;
10790		uintptr_t p = start & ~(uintptr_t)0x3F; /* align down to a 64-byte cache line */
10791
10792		do {
10793			/* _cldemote(p); */
10794			asm volatile(".byte 0x0f, 0x1c, 0x06" :: "S" (p));
10795			p += 64;
10796		} while (p < start + size);
10797	}
10798#endif
10799
10800	if (!ctx->code_buffer) {
10801		ir_mem_protect(entry, size);
10802	}
10803
10804	ctx->data = NULL;
10805	return entry;
10806}
10807
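/*
 * Emit a group of side-exit trampolines: each of the 'exit_points_per_group'
 * entries pushes its index within the group and jumps (or falls through) to a
 * common tail that biases the pushed value by 'first_exit_point' and jumps to
 * the shared handler at 'exit_addr'.
 */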
10808const void *ir_emit_exitgroup(uint32_t first_exit_point, uint32_t exit_points_per_group, const void *exit_addr, ir_code_buffer *code_buffer, size_t *size_ptr)
10809{
10810	void *entry;
10811	size_t size;
10812	uint32_t i;
10813	dasm_State **Dst, *dasm_state;
10814	int ret;
10815
10816	IR_ASSERT(code_buffer);
10817	IR_ASSERT(sizeof(void*) == 4 || IR_MAY_USE_32BIT_ADDR(code_buffer, exit_addr));
10818
10819	Dst = &dasm_state;
10820	dasm_state = NULL;
10821	dasm_init(&dasm_state, DASM_MAXSECTION);
10822	dasm_setupglobal(&dasm_state, dasm_labels, ir_lb_MAX);
10823	dasm_setup(&dasm_state, dasm_actions);
10824
10825	for (i = 0; i < exit_points_per_group - 1; i++) {
10826		|	push byte i
10827		|	.byte 0xeb, (4*(exit_points_per_group-i)-6) // jmp >1
10828	}
10829	|	push byte i
10830	|// 1:
10831	|	add aword [r4], first_exit_point
10832	|	jmp aword &exit_addr
10833
10834	ret = dasm_link(&dasm_state, &size);
10835	if (ret != DASM_S_OK) {
10836		IR_ASSERT(0);
10837		dasm_free(&dasm_state);
10838		return NULL;
10839	}
10840
10841	entry = code_buffer->pos;
10842	entry = (void*)IR_ALIGNED_SIZE(((size_t)(entry)), 16);
10843	if (size > (size_t)((char*)code_buffer->end - (char*)entry)) {
		dasm_free(&dasm_state); /* don't leak the DynAsm state */
10844		return NULL;
10845	}
10846	code_buffer->pos = (char*)entry + size;
10847
10848	ret = dasm_encode(&dasm_state, entry);
10849	if (ret != DASM_S_OK) {
10850		IR_ASSERT(0);
10851		dasm_free(&dasm_state);
10852		if (code_buffer->pos == (char*)entry + size) {
10853			/* rollback: restore the buffer position to the (aligned) entry */
10854			code_buffer->pos = (char*)entry;
10855		}
10856		return NULL;
10857	}
10858
10859	dasm_free(&dasm_state);
10860
10861	ir_mem_flush(entry, size);
10862
10863	*size_ptr = size;
10864	return entry;
10865}
10866
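/*
 * A thunk is needed only on 64-bit targets when 'addr' is not guaranteed to
 * be within +/-2GB of the code buffer, i.e. not reachable by a rel32
 * displacement.
 */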
10867bool ir_needs_thunk(ir_code_buffer *code_buffer, void *addr)
10868{
10869	return sizeof(void*) == 8 && !IR_MAY_USE_32BIT_ADDR(code_buffer, addr);
10870}
10871
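/*
 * Emit a minimal jump thunk at the current code-buffer position: on x86_64
 * an indirect "jmp [rip+disp32]" through an inline 8-byte address slot (so
 * ir_fix_thunk can re-point it later), on 32-bit x86 a direct "jmp rel32".
 */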
10872void *ir_emit_thunk(ir_code_buffer *code_buffer, void *addr, size_t *size_ptr)
10873{
10874	void *entry;
10875	size_t size;
10876	dasm_State **Dst, *dasm_state;
10877	int ret;
10878
10879	Dst = &dasm_state;
10880	dasm_state = NULL;
10881	dasm_init(&dasm_state, DASM_MAXSECTION);
10882	dasm_setupglobal(&dasm_state, dasm_labels, ir_lb_MAX);
10883	dasm_setup(&dasm_state, dasm_actions);
10884
10885	|.code
10886	|.if X64
10887	|	jmp aword [>1]
10888	|1:
10889	|	.aword &addr
10890	|.else
10891	|	jmp &addr
10892	|.endif
10893
10894	ret = dasm_link(&dasm_state, &size);
10895	if (ret != DASM_S_OK) {
10896		IR_ASSERT(0);
10897		dasm_free(&dasm_state);
10898		return NULL;
10899	}
10900
10901	if (size > (size_t)((char*)code_buffer->end - (char*)code_buffer->pos)) {
10902		dasm_free(&dasm_state);
10903		return NULL;
10904	}
10905
10906	entry = code_buffer->pos;
10907	ret = dasm_encode(&dasm_state, entry);
10908	if (ret != DASM_S_OK) {
10909		dasm_free(&dasm_state);
10910		return NULL;
10911	}
10912
10913	*size_ptr = size;
10914	code_buffer->pos = (char*)code_buffer->pos + size;
10915
10916	dasm_free(&dasm_state);
10917	ir_mem_flush(entry, size);
10918
10919	return entry;
10920}
10921
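/*
 * Re-point an existing thunk to a new target.  An out-of-range target
 * requires the indirect "jmp [rip+disp32]" form, whose 64-bit address slot
 * is updated in place; a target within rel32 range is handled by rewriting
 * the thunk as a direct "jmp rel32".
 */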
10922void ir_fix_thunk(void *thunk_entry, void *addr)
10923{
10924	unsigned char *code = thunk_entry;
10925
10926	if (sizeof(void*) == 8 && !IR_IS_SIGNED_32BIT(((unsigned char*)addr - (code + 5)))) {
10927		int32_t *offset_ptr;
10928		void **addr_ptr;
10929
10930		IR_ASSERT(code[0] == 0xff && code[1] == 0x25);
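		/* FF 25 = jmp [rip+disp32]; the 8-byte address slot
		 * lives at code + 6 + disp32 */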
10931		offset_ptr = (int32_t*)(code + 2);
10932		addr_ptr = (void**)(code + 6 + *offset_ptr);
10933		*addr_ptr = addr;
10934	} else {
10935		int32_t *addr_ptr;
10936
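		/* E9 = jmp rel32, relative to the end of the 5-byte instruction */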
10937		code[0] = 0xe9;
10938		addr_ptr = (int32_t*)(code + 1);
10939		*addr_ptr = (int32_t)((unsigned char*)addr - (code + 5));
10940	}
10941}
10942