1 /*
2  *    Stack-less Just-In-Time compiler
3  *
4  *    Copyright Zoltan Herczeg (hzmester@freemail.hu). All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without modification, are
7  * permitted provided that the following conditions are met:
8  *
9  *   1. Redistributions of source code must retain the above copyright notice, this list of
10  *      conditions and the following disclaimer.
11  *
12  *   2. Redistributions in binary form must reproduce the above copyright notice, this list
13  *      of conditions and the following disclaimer in the documentation and/or other materials
14  *      provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) AND CONTRIBUTORS ``AS IS'' AND ANY
17  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
19  * SHALL THE COPYRIGHT HOLDER(S) OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
21  * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
22  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
24  * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  */
26 
27 SLJIT_API_FUNC_ATTRIBUTE const char* sljit_get_platform_name(void)
28 {
29 #if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
30 	return "x86" SLJIT_CPUINFO " ABI:fastcall";
31 #else
32 	return "x86" SLJIT_CPUINFO;
33 #endif
34 }
35 
36 /*
37    32b register indexes:
38      0 - EAX
39      1 - ECX
40      2 - EDX
41      3 - EBX
42      4 - ESP
43      5 - EBP
44      6 - ESI
45      7 - EDI
46 */
47 
48 /*
49    64b register indexes:
50      0 - RAX
51      1 - RCX
52      2 - RDX
53      3 - RBX
54      4 - RSP
55      5 - RBP
56      6 - RSI
57      7 - RDI
58      8 - R8   - From now on REX prefix is required
59      9 - R9
60     10 - R10
61     11 - R11
62     12 - R12
63     13 - R13
64     14 - R14
65     15 - R15
66 */
67 
68 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
69 
70 /* Last register + 1. */
71 #define TMP_REG1	(SLJIT_NUMBER_OF_REGISTERS + 2)
72 
73 static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 3] = {
74 	0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 7, 6, 3, 4, 5
75 };
76 
77 #define CHECK_EXTRA_REGS(p, w, do) \
78 	if (p >= SLJIT_R3 && p <= SLJIT_S3) { \
79 		if (p <= compiler->scratches) \
80 			w = compiler->saveds_offset - ((p) - SLJIT_R2) * (sljit_sw)sizeof(sljit_sw); \
81 		else \
82 			w = compiler->locals_offset + ((p) - SLJIT_S2) * (sljit_sw)sizeof(sljit_sw); \
83 		p = SLJIT_MEM1(SLJIT_SP); \
84 		do; \
85 	}
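/* On x86-32 SLJIT exposes more virtual registers than the CPU provides, so the
   registers in the SLJIT_R3..SLJIT_S3 range (reg_map value 0 above) live in
   stack slots. CHECK_EXTRA_REGS rewrites such operands into an
   SLJIT_MEM1(SLJIT_SP) access with the matching offset, which is also why
   SLJIT_HAS_VIRTUAL_REGISTERS reports 1 on this target. Roughly, callers use
   it as in sljit_emit_op1 below:

     CHECK_EXTRA_REGS(dst, dstw, dst_is_ereg = 1);
     CHECK_EXTRA_REGS(src, srcw, (void)0);
*/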
86 
87 #else /* SLJIT_CONFIG_X86_32 */
88 
89 /* Last register + 1. */
90 #define TMP_REG1	(SLJIT_NUMBER_OF_REGISTERS + 2)
91 #define TMP_REG2	(SLJIT_NUMBER_OF_REGISTERS + 3)
92 
93 /* Note: r12 & 0x7 == 0b100, which in the ModRM r/m field means a SIB byte follows.
94    Note: avoid using r12 and r13 for memory addressing;
95    therefore r12 is better used as a higher saved register. */
96 #ifndef _WIN64
97 /* Args: rdi(=7), rsi(=6), rdx(=2), rcx(=1), r8, r9. Scratches: rax(=0), r10, r11 */
98 static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 4] = {
99 	0, 0, 6, 7, 1, 8, 11, 10, 12, 5, 13, 14, 15, 3, 4, 2, 9
100 };
101 /* low-map. reg_map & 0x7. */
102 static const sljit_u8 reg_lmap[SLJIT_NUMBER_OF_REGISTERS + 4] = {
103 	0, 0, 6, 7, 1, 0, 3,  2,  4,  5,  5,  6,  7, 3, 4, 2, 1
104 };
105 #else
106 /* Args: rcx(=1), rdx(=2), r8, r9. Scratches: rax(=0), r10, r11 */
107 static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 4] = {
108 	0, 0, 2, 8, 1, 11, 12, 5, 13, 14, 15, 7, 6, 3, 4, 9, 10
109 };
110 /* low-map. reg_map & 0x7. */
111 static const sljit_u8 reg_lmap[SLJIT_NUMBER_OF_REGISTERS + 4] = {
112 	0, 0, 2, 0, 1,  3,  4, 5,  5,  6,  7, 7, 6, 3, 4, 1,  2
113 };
114 #endif
115 
116 /* Args: xmm0-xmm3 */
117 static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1] = {
118 	4, 0, 1, 2, 3, 5, 6
119 };
120 /* low-map. freg_map & 0x7. */
121 static const sljit_u8 freg_lmap[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1] = {
122 	4, 0, 1, 2, 3, 5, 6
123 };
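/* reg_lmap / freg_lmap keep only the low three bits of each hardware register
   number: those bits go into the ModRM/SIB reg and r/m fields, while the
   fourth bit (r8-r15, xmm8-xmm15) must be carried by a REX prefix bit. */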
124 
125 #define REX_W		0x48
126 #define REX_R		0x44
127 #define REX_X		0x42
128 #define REX_B		0x41
129 #define REX		0x40
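/* The REX bits above are OR-ed together as needed. For example, 48 89 c8
   encodes "mov rax, rcx" (REX_W selects 64-bit operand size); changing the
   prefix to 49 (REX_W | REX_B) extends the ModRM r/m field, giving
   "mov r8, rcx". */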
130 
131 #ifndef _WIN64
132 #define HALFWORD_MAX 0x7fffffffl
133 #define HALFWORD_MIN -0x80000000l
134 #else
135 #define HALFWORD_MAX 0x7fffffffll
136 #define HALFWORD_MIN -0x80000000ll
137 #endif
138 
139 #define IS_HALFWORD(x)		((x) <= HALFWORD_MAX && (x) >= HALFWORD_MIN)
140 #define NOT_HALFWORD(x)		((x) > HALFWORD_MAX || (x) < HALFWORD_MIN)
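/* On x86-64 most instructions only take a sign-extended 32-bit immediate or
   displacement. IS_HALFWORD decides whether a value fits that field; if not,
   it is first materialized with a separate 64-bit load (see emit_load_imm64 in
   sljitNativeX86_64.c and the BINARY_IMM macro below). */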
141 
142 #define CHECK_EXTRA_REGS(p, w, do)
143 
144 #endif /* SLJIT_CONFIG_X86_32 */
145 
146 #define TMP_FREG	(0)
147 
148 /* Size flags for emit_x86_instruction: */
149 #define EX86_BIN_INS		0x0010
150 #define EX86_SHIFT_INS		0x0020
151 #define EX86_REX		0x0040
152 #define EX86_NO_REXW		0x0080
153 #define EX86_BYTE_ARG		0x0100
154 #define EX86_HALF_ARG		0x0200
155 #define EX86_PREF_66		0x0400
156 #define EX86_PREF_F2		0x0800
157 #define EX86_PREF_F3		0x1000
158 #define EX86_SSE2_OP1		0x2000
159 #define EX86_SSE2_OP2		0x4000
160 #define EX86_SSE2		(EX86_SSE2_OP1 | EX86_SSE2_OP2)
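/* These flags are OR-ed into the size argument of emit_x86_instruction
   (defined in sljitNativeX86_32.c / sljitNativeX86_64.c) to request operand
   prefixes, REX handling and immediate sizes, e.g.
   emit_x86_instruction(compiler, 1 | EX86_BYTE_ARG | EX86_NO_REXW, ...). */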
161 
162 /* --------------------------------------------------------------------- */
163 /*  Instruction forms                                                    */
164 /* --------------------------------------------------------------------- */
165 
166 #define ADD		(/* BINARY */ 0 << 3)
167 #define ADD_EAX_i32	0x05
168 #define ADD_r_rm	0x03
169 #define ADD_rm_r	0x01
170 #define ADDSD_x_xm	0x58
171 #define ADC		(/* BINARY */ 2 << 3)
172 #define ADC_EAX_i32	0x15
173 #define ADC_r_rm	0x13
174 #define ADC_rm_r	0x11
175 #define AND		(/* BINARY */ 4 << 3)
176 #define AND_EAX_i32	0x25
177 #define AND_r_rm	0x23
178 #define AND_rm_r	0x21
179 #define ANDPD_x_xm	0x54
180 #define BSR_r_rm	(/* GROUP_0F */ 0xbd)
181 #define CALL_i32	0xe8
182 #define CALL_rm		(/* GROUP_FF */ 2 << 3)
183 #define CDQ		0x99
184 #define CMOVE_r_rm	(/* GROUP_0F */ 0x44)
185 #define CMP		(/* BINARY */ 7 << 3)
186 #define CMP_EAX_i32	0x3d
187 #define CMP_r_rm	0x3b
188 #define CMP_rm_r	0x39
189 #define CVTPD2PS_x_xm	0x5a
190 #define CVTSI2SD_x_rm	0x2a
191 #define CVTTSD2SI_r_xm	0x2c
192 #define DIV		(/* GROUP_F7 */ 6 << 3)
193 #define DIVSD_x_xm	0x5e
194 #define FSTPS		0xd9
195 #define FSTPD		0xdd
196 #define INT3		0xcc
197 #define IDIV		(/* GROUP_F7 */ 7 << 3)
198 #define IMUL		(/* GROUP_F7 */ 5 << 3)
199 #define IMUL_r_rm	(/* GROUP_0F */ 0xaf)
200 #define IMUL_r_rm_i8	0x6b
201 #define IMUL_r_rm_i32	0x69
202 #define JE_i8		0x74
203 #define JNE_i8		0x75
204 #define JMP_i8		0xeb
205 #define JMP_i32		0xe9
206 #define JMP_rm		(/* GROUP_FF */ 4 << 3)
207 #define LEA_r_m		0x8d
208 #define MOV_r_rm	0x8b
209 #define MOV_r_i32	0xb8
210 #define MOV_rm_r	0x89
211 #define MOV_rm_i32	0xc7
212 #define MOV_rm8_i8	0xc6
213 #define MOV_rm8_r8	0x88
214 #define MOVSD_x_xm	0x10
215 #define MOVSD_xm_x	0x11
216 #define MOVSXD_r_rm	0x63
217 #define MOVSX_r_rm8	(/* GROUP_0F */ 0xbe)
218 #define MOVSX_r_rm16	(/* GROUP_0F */ 0xbf)
219 #define MOVZX_r_rm8	(/* GROUP_0F */ 0xb6)
220 #define MOVZX_r_rm16	(/* GROUP_0F */ 0xb7)
221 #define MUL		(/* GROUP_F7 */ 4 << 3)
222 #define MULSD_x_xm	0x59
223 #define NEG_rm		(/* GROUP_F7 */ 3 << 3)
224 #define NOP		0x90
225 #define NOT_rm		(/* GROUP_F7 */ 2 << 3)
226 #define OR		(/* BINARY */ 1 << 3)
227 #define OR_r_rm		0x0b
228 #define OR_EAX_i32	0x0d
229 #define OR_rm_r		0x09
230 #define OR_rm8_r8	0x08
231 #define POP_r		0x58
232 #define POP_rm		0x8f
233 #define POPF		0x9d
234 #define PREFETCH	0x18
235 #define PUSH_i32	0x68
236 #define PUSH_r		0x50
237 #define PUSH_rm		(/* GROUP_FF */ 6 << 3)
238 #define PUSHF		0x9c
239 #define RET_near	0xc3
240 #define RET_i16		0xc2
241 #define SBB		(/* BINARY */ 3 << 3)
242 #define SBB_EAX_i32	0x1d
243 #define SBB_r_rm	0x1b
244 #define SBB_rm_r	0x19
245 #define SAR		(/* SHIFT */ 7 << 3)
246 #define SHL		(/* SHIFT */ 4 << 3)
247 #define SHR		(/* SHIFT */ 5 << 3)
248 #define SUB		(/* BINARY */ 5 << 3)
249 #define SUB_EAX_i32	0x2d
250 #define SUB_r_rm	0x2b
251 #define SUB_rm_r	0x29
252 #define SUBSD_x_xm	0x5c
253 #define TEST_EAX_i32	0xa9
254 #define TEST_rm_r	0x85
255 #define UCOMISD_x_xm	0x2e
256 #define UNPCKLPD_x_xm	0x14
257 #define XCHG_EAX_r	0x90
258 #define XCHG_r_rm	0x87
259 #define XOR		(/* BINARY */ 6 << 3)
260 #define XOR_EAX_i32	0x35
261 #define XOR_r_rm	0x33
262 #define XOR_rm_r	0x31
263 #define XORPD_x_xm	0x57
264 
265 #define GROUP_0F	0x0f
266 #define GROUP_F7	0xf7
267 #define GROUP_FF	0xff
268 #define GROUP_BINARY_81	0x81
269 #define GROUP_BINARY_83	0x83
270 #define GROUP_SHIFT_1	0xd1
271 #define GROUP_SHIFT_N	0xc1
272 #define GROUP_SHIFT_CL	0xd3
273 
274 #define MOD_REG		0xc0
275 #define MOD_DISP8	0x40
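/* ModRM byte layout: mod (2 bits) | reg (3 bits) | r/m (3 bits).
   MOD_REG (0xc0) selects the register-direct form; MOD_DISP8 (0x40) selects a
   memory operand with an 8-bit displacement. */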
276 
277 #define INC_SIZE(s)			(*inst++ = (s), compiler->size += (s))
278 
279 #define PUSH_REG(r)			(*inst++ = (PUSH_r + (r)))
280 #define POP_REG(r)			(*inst++ = (POP_r + (r)))
281 #define RET()				(*inst++ = (RET_near))
282 #define RET_I16(n)			(*inst++ = (RET_i16), *inst++ = n, *inst++ = 0)
283 /* r32, r/m32 */
284 #define MOV_RM(mod, reg, rm)		(*inst++ = (MOV_r_rm), *inst++ = (mod) << 6 | (reg) << 3 | (rm))
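/* For example, MOV_RM(MOD_REG, reg_map[dst], reg_map[src]) emits the two-byte
   register form 8b /r, i.e. "mov dst, src". */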
285 
286 /* Multithreading does not affect these static variables, since they store
287    built-in CPU features. Therefore they can safely be overwritten by different
288    threads if they detect the CPU features at the same time. */
289 #if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
290 static sljit_s32 cpu_has_sse2 = -1;
291 #endif
292 static sljit_s32 cpu_has_cmov = -1;
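/* get_cpu_features() below fills these from CPUID leaf 1:
   EDX bit 26 is SSE2, EDX bit 15 is CMOV. */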
293 
294 #ifdef _WIN32_WCE
295 #include <cmnintrin.h>
296 #elif defined(_MSC_VER) && _MSC_VER >= 1400
297 #include <intrin.h>
298 #endif
299 
300 /******************************************************/
301 /*    Unaligned-store functions                       */
302 /******************************************************/
303 
304 static SLJIT_INLINE void sljit_unaligned_store_s16(void *addr, sljit_s16 value)
305 {
306 	SLJIT_MEMCPY(addr, &value, sizeof(value));
307 }
308 
309 static SLJIT_INLINE void sljit_unaligned_store_s32(void *addr, sljit_s32 value)
310 {
311 	SLJIT_MEMCPY(addr, &value, sizeof(value));
312 }
313 
314 static SLJIT_INLINE void sljit_unaligned_store_sw(void *addr, sljit_sw value)
315 {
316 	SLJIT_MEMCPY(addr, &value, sizeof(value));
317 }
318 
319 /******************************************************/
320 /*    Utility functions                               */
321 /******************************************************/
322 
323 static void get_cpu_features(void)
324 {
325 	sljit_u32 features;
326 
327 #if defined(_MSC_VER) && _MSC_VER >= 1400
328 
329 	int CPUInfo[4];
330 	__cpuid(CPUInfo, 1);
331 	features = (sljit_u32)CPUInfo[3];
332 
333 #elif defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__SUNPRO_C)
334 
335 	/* AT&T syntax. */
336 	__asm__ (
337 		"movl $0x1, %%eax\n"
338 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
339 		/* On x86-32, there is no red zone, so this
340 		   should work (no need for a local variable). */
341 		"push %%ebx\n"
342 #endif
343 		"cpuid\n"
344 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
345 		"pop %%ebx\n"
346 #endif
347 		"movl %%edx, %0\n"
348 		: "=g" (features)
349 		:
350 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
351 		: "%eax", "%ecx", "%edx"
352 #else
353 		: "%rax", "%rbx", "%rcx", "%rdx"
354 #endif
355 	);
356 
357 #else /* _MSC_VER && _MSC_VER >= 1400 */
358 
359 	/* Intel syntax. */
360 	__asm {
361 		mov eax, 1
362 		cpuid
363 		mov features, edx
364 	}
365 
366 #endif /* _MSC_VER && _MSC_VER >= 1400 */
367 
368 #if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
369 	cpu_has_sse2 = (features >> 26) & 0x1;
370 #endif
371 	cpu_has_cmov = (features >> 15) & 0x1;
372 }
373 
374 static sljit_u8 get_jump_code(sljit_s32 type)
375 {
376 	switch (type) {
377 	case SLJIT_EQUAL:
378 	case SLJIT_EQUAL_F64:
379 		return 0x84 /* je */;
380 
381 	case SLJIT_NOT_EQUAL:
382 	case SLJIT_NOT_EQUAL_F64:
383 		return 0x85 /* jne */;
384 
385 	case SLJIT_LESS:
386 	case SLJIT_LESS_F64:
387 		return 0x82 /* jc */;
388 
389 	case SLJIT_GREATER_EQUAL:
390 	case SLJIT_GREATER_EQUAL_F64:
391 		return 0x83 /* jae */;
392 
393 	case SLJIT_GREATER:
394 	case SLJIT_GREATER_F64:
395 		return 0x87 /* jnbe */;
396 
397 	case SLJIT_LESS_EQUAL:
398 	case SLJIT_LESS_EQUAL_F64:
399 		return 0x86 /* jbe */;
400 
401 	case SLJIT_SIG_LESS:
402 		return 0x8c /* jl */;
403 
404 	case SLJIT_SIG_GREATER_EQUAL:
405 		return 0x8d /* jnl */;
406 
407 	case SLJIT_SIG_GREATER:
408 		return 0x8f /* jnle */;
409 
410 	case SLJIT_SIG_LESS_EQUAL:
411 		return 0x8e /* jle */;
412 
413 	case SLJIT_OVERFLOW:
414 		return 0x80 /* jo */;
415 
416 	case SLJIT_NOT_OVERFLOW:
417 		return 0x81 /* jno */;
418 
419 	case SLJIT_UNORDERED_F64:
420 		return 0x8a /* jp */;
421 
422 	case SLJIT_ORDERED_F64:
423 		return 0x8b /* jpo */;
424 	}
425 	return 0;
426 }
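/* The values above are the second byte of the two-byte 0f 8x (rel32) forms.
   The short rel8 forms use the same condition code in the 7x opcode range, so
   the near-jump emitter below simply subtracts 0x10 from the returned value. */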
427 
428 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
429 static sljit_u8* generate_far_jump_code(struct sljit_jump *jump, sljit_u8 *code_ptr, sljit_sw executable_offset);
430 #else
431 static sljit_u8* generate_far_jump_code(struct sljit_jump *jump, sljit_u8 *code_ptr);
432 static sljit_u8* generate_put_label_code(struct sljit_put_label *put_label, sljit_u8 *code_ptr, sljit_uw max_label);
433 #endif
434 
435 static sljit_u8* generate_near_jump_code(struct sljit_jump *jump, sljit_u8 *code_ptr, sljit_u8 *code, sljit_sw executable_offset)
436 {
437 	sljit_s32 type = jump->flags >> TYPE_SHIFT;
438 	sljit_s32 short_jump;
439 	sljit_uw label_addr;
440 
441 	if (jump->flags & JUMP_LABEL)
442 		label_addr = (sljit_uw)(code + jump->u.label->size);
443 	else
444 		label_addr = jump->u.target - executable_offset;
445 
446 	short_jump = (sljit_sw)(label_addr - (jump->addr + 2)) >= -128 && (sljit_sw)(label_addr - (jump->addr + 2)) <= 127;
447 
448 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
449 	if ((sljit_sw)(label_addr - (jump->addr + 1)) > HALFWORD_MAX || (sljit_sw)(label_addr - (jump->addr + 1)) < HALFWORD_MIN)
450 		return generate_far_jump_code(jump, code_ptr);
451 #endif
452 
453 	if (type == SLJIT_JUMP) {
454 		if (short_jump)
455 			*code_ptr++ = JMP_i8;
456 		else
457 			*code_ptr++ = JMP_i32;
458 		jump->addr++;
459 	}
460 	else if (type >= SLJIT_FAST_CALL) {
461 		short_jump = 0;
462 		*code_ptr++ = CALL_i32;
463 		jump->addr++;
464 	}
465 	else if (short_jump) {
466 		*code_ptr++ = get_jump_code(type) - 0x10;
467 		jump->addr++;
468 	}
469 	else {
470 		*code_ptr++ = GROUP_0F;
471 		*code_ptr++ = get_jump_code(type);
472 		jump->addr += 2;
473 	}
474 
475 	if (short_jump) {
476 		jump->flags |= PATCH_MB;
477 		code_ptr += sizeof(sljit_s8);
478 	} else {
479 		jump->flags |= PATCH_MW;
480 		code_ptr += sizeof(sljit_s32);
481 	}
482 
483 	return code_ptr;
484 }
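/* Only the opcode bytes are written here; code_ptr is advanced past the rel8
   or rel32 displacement and the hole is filled in later by the PATCH_MB /
   PATCH_MW loop in sljit_generate_code, once the label addresses are known. */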
485 
486 SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compiler)
487 {
488 	struct sljit_memory_fragment *buf;
489 	sljit_u8 *code;
490 	sljit_u8 *code_ptr;
491 	sljit_u8 *buf_ptr;
492 	sljit_u8 *buf_end;
493 	sljit_u8 len;
494 	sljit_sw executable_offset;
495 	sljit_sw jump_addr;
496 
497 	struct sljit_label *label;
498 	struct sljit_jump *jump;
499 	struct sljit_const *const_;
500 	struct sljit_put_label *put_label;
501 
502 	CHECK_ERROR_PTR();
503 	CHECK_PTR(check_sljit_generate_code(compiler));
504 	reverse_buf(compiler);
505 
506 	/* Second code generation pass. */
507 	code = (sljit_u8*)SLJIT_MALLOC_EXEC(compiler->size, compiler->exec_allocator_data);
508 	PTR_FAIL_WITH_EXEC_IF(code);
509 	buf = compiler->buf;
510 
511 	code_ptr = code;
512 	label = compiler->labels;
513 	jump = compiler->jumps;
514 	const_ = compiler->consts;
515 	put_label = compiler->put_labels;
516 	executable_offset = SLJIT_EXEC_OFFSET(code);
517 
518 	do {
519 		buf_ptr = buf->memory;
520 		buf_end = buf_ptr + buf->used_size;
521 		do {
522 			len = *buf_ptr++;
523 			if (len > 0) {
524 				/* The code is already generated. */
525 				SLJIT_MEMCPY(code_ptr, buf_ptr, len);
526 				code_ptr += len;
527 				buf_ptr += len;
528 			}
529 			else {
530 				switch (*buf_ptr) {
531 				case 0:
532 					label->addr = (sljit_uw)SLJIT_ADD_EXEC_OFFSET(code_ptr, executable_offset);
533 					label->size = code_ptr - code;
534 					label = label->next;
535 					break;
536 				case 1:
537 					jump->addr = (sljit_uw)code_ptr;
538 					if (!(jump->flags & SLJIT_REWRITABLE_JUMP))
539 						code_ptr = generate_near_jump_code(jump, code_ptr, code, executable_offset);
540 					else {
541 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
542 						code_ptr = generate_far_jump_code(jump, code_ptr, executable_offset);
543 #else
544 						code_ptr = generate_far_jump_code(jump, code_ptr);
545 #endif
546 					}
547 					jump = jump->next;
548 					break;
549 				case 2:
550 					const_->addr = ((sljit_uw)code_ptr) - sizeof(sljit_sw);
551 					const_ = const_->next;
552 					break;
553 				default:
554 					SLJIT_ASSERT(*buf_ptr == 3);
555 					SLJIT_ASSERT(put_label->label);
556 					put_label->addr = (sljit_uw)code_ptr;
557 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
558 					code_ptr = generate_put_label_code(put_label, code_ptr, (sljit_uw)SLJIT_ADD_EXEC_OFFSET(code, executable_offset) + put_label->label->size);
559 #endif
560 					put_label = put_label->next;
561 					break;
562 				}
563 				buf_ptr++;
564 			}
565 		} while (buf_ptr < buf_end);
566 		SLJIT_ASSERT(buf_ptr == buf_end);
567 		buf = buf->next;
568 	} while (buf);
569 
570 	SLJIT_ASSERT(!label);
571 	SLJIT_ASSERT(!jump);
572 	SLJIT_ASSERT(!const_);
573 	SLJIT_ASSERT(!put_label);
574 	SLJIT_ASSERT(code_ptr <= code + compiler->size);
575 
576 	jump = compiler->jumps;
577 	while (jump) {
578 		jump_addr = jump->addr + executable_offset;
579 
580 		if (jump->flags & PATCH_MB) {
581 			SLJIT_ASSERT((sljit_sw)(jump->u.label->addr - (jump_addr + sizeof(sljit_s8))) >= -128 && (sljit_sw)(jump->u.label->addr - (jump_addr + sizeof(sljit_s8))) <= 127);
582 			*(sljit_u8*)jump->addr = (sljit_u8)(jump->u.label->addr - (jump_addr + sizeof(sljit_s8)));
583 		} else if (jump->flags & PATCH_MW) {
584 			if (jump->flags & JUMP_LABEL) {
585 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
586 				sljit_unaligned_store_sw((void*)jump->addr, (sljit_sw)(jump->u.label->addr - (jump_addr + sizeof(sljit_sw))));
587 #else
588 				SLJIT_ASSERT((sljit_sw)(jump->u.label->addr - (jump_addr + sizeof(sljit_s32))) >= HALFWORD_MIN && (sljit_sw)(jump->u.label->addr - (jump_addr + sizeof(sljit_s32))) <= HALFWORD_MAX);
589 				sljit_unaligned_store_s32((void*)jump->addr, (sljit_s32)(jump->u.label->addr - (jump_addr + sizeof(sljit_s32))));
590 #endif
591 			}
592 			else {
593 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
594 				sljit_unaligned_store_sw((void*)jump->addr, (sljit_sw)(jump->u.target - (jump_addr + sizeof(sljit_sw))));
595 #else
596 				SLJIT_ASSERT((sljit_sw)(jump->u.target - (jump_addr + sizeof(sljit_s32))) >= HALFWORD_MIN && (sljit_sw)(jump->u.target - (jump_addr + sizeof(sljit_s32))) <= HALFWORD_MAX);
597 				sljit_unaligned_store_s32((void*)jump->addr, (sljit_s32)(jump->u.target - (jump_addr + sizeof(sljit_s32))));
598 #endif
599 			}
600 		}
601 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
602 		else if (jump->flags & PATCH_MD)
603 			sljit_unaligned_store_sw((void*)jump->addr, jump->u.label->addr);
604 #endif
605 
606 		jump = jump->next;
607 	}
608 
609 	put_label = compiler->put_labels;
610 	while (put_label) {
611 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
612 		sljit_unaligned_store_sw((void*)(put_label->addr - sizeof(sljit_sw)), (sljit_sw)put_label->label->addr);
613 #else
614 		if (put_label->flags & PATCH_MD) {
615 			SLJIT_ASSERT(put_label->label->addr > HALFWORD_MAX);
616 			sljit_unaligned_store_sw((void*)(put_label->addr - sizeof(sljit_sw)), (sljit_sw)put_label->label->addr);
617 		}
618 		else {
619 			SLJIT_ASSERT(put_label->label->addr <= HALFWORD_MAX);
620 			sljit_unaligned_store_s32((void*)(put_label->addr - sizeof(sljit_s32)), (sljit_s32)put_label->label->addr);
621 		}
622 #endif
623 
624 		put_label = put_label->next;
625 	}
626 
627 	compiler->error = SLJIT_ERR_COMPILED;
628 	compiler->executable_offset = executable_offset;
629 	compiler->executable_size = code_ptr - code;
630 
631 	code = (sljit_u8*)SLJIT_ADD_EXEC_OFFSET(code, executable_offset);
632 
633 	SLJIT_UPDATE_WX_FLAGS(code, (sljit_u8*)SLJIT_ADD_EXEC_OFFSET(code_ptr, executable_offset), 1);
634 	return (void*)code;
635 }
636 
637 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_has_cpu_feature(sljit_s32 feature_type)
638 {
639 	switch (feature_type) {
640 	case SLJIT_HAS_FPU:
641 #ifdef SLJIT_IS_FPU_AVAILABLE
642 		return SLJIT_IS_FPU_AVAILABLE;
643 #elif (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
644 		if (cpu_has_sse2 == -1)
645 			get_cpu_features();
646 		return cpu_has_sse2;
647 #else /* SLJIT_DETECT_SSE2 */
648 		return 1;
649 #endif /* SLJIT_DETECT_SSE2 */
650 
651 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
652 	case SLJIT_HAS_VIRTUAL_REGISTERS:
653 		return 1;
654 #endif
655 
656 	case SLJIT_HAS_CLZ:
657 	case SLJIT_HAS_CMOV:
658 		if (cpu_has_cmov == -1)
659 			get_cpu_features();
660 		return cpu_has_cmov;
661 
662 	case SLJIT_HAS_PREFETCH:
663 		return 1;
664 
665 	case SLJIT_HAS_SSE2:
666 #if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
667 		if (cpu_has_sse2 == -1)
668 			get_cpu_features();
669 		return cpu_has_sse2;
670 #else
671 		return 1;
672 #endif
673 
674 	default:
675 		return 0;
676 	}
677 }
678 
679 /* --------------------------------------------------------------------- */
680 /*  Operators                                                            */
681 /* --------------------------------------------------------------------- */
682 
683 #define BINARY_OPCODE(opcode) (((opcode ## _EAX_i32) << 24) | ((opcode ## _r_rm) << 16) | ((opcode ## _rm_r) << 8) | (opcode))
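/* For example, BINARY_OPCODE(ADD) evaluates to 0x05030100: the accumulator
   form (add eax, imm32), the reg <- r/m form, the r/m <- reg form and the /0
   group opcode packed into one sljit_u32, which emit_cum_binary and
   emit_non_cum_binary unpack again. */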
684 
685 static sljit_s32 emit_cum_binary(struct sljit_compiler *compiler,
686 	sljit_u32 op_types,
687 	sljit_s32 dst, sljit_sw dstw,
688 	sljit_s32 src1, sljit_sw src1w,
689 	sljit_s32 src2, sljit_sw src2w);
690 
691 static sljit_s32 emit_non_cum_binary(struct sljit_compiler *compiler,
692 	sljit_u32 op_types,
693 	sljit_s32 dst, sljit_sw dstw,
694 	sljit_s32 src1, sljit_sw src1w,
695 	sljit_s32 src2, sljit_sw src2w);
696 
697 static sljit_s32 emit_mov(struct sljit_compiler *compiler,
698 	sljit_s32 dst, sljit_sw dstw,
699 	sljit_s32 src, sljit_sw srcw);
700 
701 #define EMIT_MOV(compiler, dst, dstw, src, srcw) \
702 	FAIL_IF(emit_mov(compiler, dst, dstw, src, srcw));
703 
704 static SLJIT_INLINE sljit_s32 emit_sse2_store(struct sljit_compiler *compiler,
705 	sljit_s32 single, sljit_s32 dst, sljit_sw dstw, sljit_s32 src);
706 
707 static SLJIT_INLINE sljit_s32 emit_sse2_load(struct sljit_compiler *compiler,
708 	sljit_s32 single, sljit_s32 dst, sljit_s32 src, sljit_sw srcw);
709 
710 static sljit_s32 emit_cmp_binary(struct sljit_compiler *compiler,
711 	sljit_s32 src1, sljit_sw src1w,
712 	sljit_s32 src2, sljit_sw src2w);
713 
714 static SLJIT_INLINE sljit_s32 emit_endbranch(struct sljit_compiler *compiler)
715 {
716 #if (defined SLJIT_CONFIG_X86_CET && SLJIT_CONFIG_X86_CET)
717 	/* Emit endbr32/endbr64 when CET is enabled.  */
718 	sljit_u8 *inst;
719 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
720 	FAIL_IF(!inst);
721 	INC_SIZE(4);
722 	*inst++ = 0xf3;
723 	*inst++ = 0x0f;
724 	*inst++ = 0x1e;
725 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
726 	*inst = 0xfb;
727 #else
728 	*inst = 0xfa;
729 #endif
730 #else /* !SLJIT_CONFIG_X86_CET */
731 	SLJIT_UNUSED_ARG(compiler);
732 #endif /* SLJIT_CONFIG_X86_CET */
733 	return SLJIT_SUCCESS;
734 }
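/* f3 0f 1e fa is endbr64 and f3 0f 1e fb is endbr32; both decode as a
   multi-byte NOP on processors without CET, so emitting them is harmless. */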
735 
736 #if (defined SLJIT_CONFIG_X86_CET && SLJIT_CONFIG_X86_CET) && defined (__SHSTK__)
737 
738 static SLJIT_INLINE sljit_s32 emit_rdssp(struct sljit_compiler *compiler, sljit_s32 reg)
739 {
740 	sljit_u8 *inst;
741 	sljit_s32 size;
742 
743 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
744 	size = 5;
745 #else
746 	size = 4;
747 #endif
748 
749 	inst = (sljit_u8*)ensure_buf(compiler, 1 + size);
750 	FAIL_IF(!inst);
751 	INC_SIZE(size);
752 	*inst++ = 0xf3;
753 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
754 	*inst++ = REX_W | (reg_map[reg] <= 7 ? 0 : REX_B);
755 #endif
756 	*inst++ = 0x0f;
757 	*inst++ = 0x1e;
758 	*inst = (0x3 << 6) | (0x1 << 3) | (reg_map[reg] & 0x7);
759 	return SLJIT_SUCCESS;
760 }
761 
762 static SLJIT_INLINE sljit_s32 emit_incssp(struct sljit_compiler *compiler, sljit_s32 reg)
763 {
764 	sljit_u8 *inst;
765 	sljit_s32 size;
766 
767 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
768 	size = 5;
769 #else
770 	size = 4;
771 #endif
772 
773 	inst = (sljit_u8*)ensure_buf(compiler, 1 + size);
774 	FAIL_IF(!inst);
775 	INC_SIZE(size);
776 	*inst++ = 0xf3;
777 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
778 	*inst++ = REX_W | (reg_map[reg] <= 7 ? 0 : REX_B);
779 #endif
780 	*inst++ = 0x0f;
781 	*inst++ = 0xae;
782 	*inst = (0x3 << 6) | (0x5 << 3) | (reg_map[reg] & 0x7);
783 	return SLJIT_SUCCESS;
784 }
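/* The two helpers above emit the CET shadow-stack instructions directly:
   rdssp is f3 (REX.W) 0f 1e with /1 in the ModRM reg field, and incssp is
   f3 (REX.W) 0f ae with /5, both in register-direct form. */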
785 
786 #endif /* SLJIT_CONFIG_X86_CET && __SHSTK__ */
787 
788 static SLJIT_INLINE sljit_s32 cpu_has_shadow_stack(void)
789 {
790 #if (defined SLJIT_CONFIG_X86_CET && SLJIT_CONFIG_X86_CET) && defined (__SHSTK__)
791 	return _get_ssp() != 0;
792 #else /* !SLJIT_CONFIG_X86_CET || !__SHSTK__ */
793 	return 0;
794 #endif /* SLJIT_CONFIG_X86_CET && __SHSTK__ */
795 }
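/* _get_ssp() is the compiler intrinsic for rdssp; it is documented to return 0
   when the shadow stack is not active, so a nonzero value means the generated
   code must keep the shadow stack in sync (see adjust_shadow_stack below). */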
796 
797 static SLJIT_INLINE sljit_s32 adjust_shadow_stack(struct sljit_compiler *compiler,
798 	sljit_s32 src, sljit_sw srcw, sljit_s32 base, sljit_sw disp)
799 {
800 #if (defined SLJIT_CONFIG_X86_CET && SLJIT_CONFIG_X86_CET) && defined (__SHSTK__)
801 	sljit_u8 *inst, *jz_after_cmp_inst;
802 	sljit_uw size_jz_after_cmp_inst;
803 
804 	sljit_uw size_before_rdssp_inst = compiler->size;
805 
806 	/* Generate "RDSSP TMP_REG1". */
807 	FAIL_IF(emit_rdssp(compiler, TMP_REG1));
808 
809 	/* Load return address on shadow stack into TMP_REG1. */
810 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
811 	SLJIT_ASSERT(reg_map[TMP_REG1] == 5);
812 
813 	/* Hand code unsupported "mov 0x0(%ebp),%ebp". */
814 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 3);
815 	FAIL_IF(!inst);
816 	INC_SIZE(3);
817 	*inst++ = 0x8b;
818 	*inst++ = 0x6d;
819 	*inst = 0;
820 #else /* !SLJIT_CONFIG_X86_32 */
821 	EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_MEM1(TMP_REG1), 0);
822 #endif /* SLJIT_CONFIG_X86_32 */
823 
824 	if (src == SLJIT_UNUSED) {
825 		/* Return address is on stack.  */
826 		src = SLJIT_MEM1(base);
827 		srcw = disp;
828 	}
829 
830 	/* Compare return address against TMP_REG1. */
831 	FAIL_IF(emit_cmp_binary (compiler, TMP_REG1, 0, src, srcw));
832 
833 	/* Generate JZ to skip the shadow stack adjustment when the shadow
834 	   stack matches the normal stack. */
835 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
836 	FAIL_IF(!inst);
837 	INC_SIZE(2);
838 	*inst++ = get_jump_code(SLJIT_EQUAL) - 0x10;
839 	size_jz_after_cmp_inst = compiler->size;
840 	jz_after_cmp_inst = inst;
841 
842 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
843 	/* REX_W is not necessary. */
844 	compiler->mode32 = 1;
845 #endif
846 	/* Load 1 into TMP_REG1. */
847 	EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, 1);
848 
849 	/* Generate "INCSSP TMP_REG1". */
850 	FAIL_IF(emit_incssp(compiler, TMP_REG1));
851 
852 	/* Jump back to "RDSSP TMP_REG1" to check shadow stack again. */
853 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
854 	FAIL_IF(!inst);
855 	INC_SIZE(2);
856 	*inst++ = JMP_i8;
857 	*inst = size_before_rdssp_inst - compiler->size;
858 
859 	*jz_after_cmp_inst = compiler->size - size_jz_after_cmp_inst;
860 #else /* !SLJIT_CONFIG_X86_CET || !__SHSTK__ */
861 	SLJIT_UNUSED_ARG(compiler);
862 	SLJIT_UNUSED_ARG(src);
863 	SLJIT_UNUSED_ARG(srcw);
864 	SLJIT_UNUSED_ARG(base);
865 	SLJIT_UNUSED_ARG(disp);
866 #endif /* SLJIT_CONFIG_X86_CET && __SHSTK__ */
867 	return SLJIT_SUCCESS;
868 }
869 
870 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
871 #include "sljitNativeX86_32.c"
872 #else
873 #include "sljitNativeX86_64.c"
874 #endif
875 
876 static sljit_s32 emit_mov(struct sljit_compiler *compiler,
877 	sljit_s32 dst, sljit_sw dstw,
878 	sljit_s32 src, sljit_sw srcw)
879 {
880 	sljit_u8* inst;
881 
882 	SLJIT_ASSERT(dst != SLJIT_UNUSED);
883 
884 	if (FAST_IS_REG(src)) {
885 		inst = emit_x86_instruction(compiler, 1, src, 0, dst, dstw);
886 		FAIL_IF(!inst);
887 		*inst = MOV_rm_r;
888 		return SLJIT_SUCCESS;
889 	}
890 	if (src & SLJIT_IMM) {
891 		if (FAST_IS_REG(dst)) {
892 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
893 			return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
894 #else
895 			if (!compiler->mode32) {
896 				if (NOT_HALFWORD(srcw))
897 					return emit_load_imm64(compiler, dst, srcw);
898 			}
899 			else
900 				return emit_do_imm32(compiler, (reg_map[dst] >= 8) ? REX_B : 0, MOV_r_i32 + reg_lmap[dst], srcw);
901 #endif
902 		}
903 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
904 		if (!compiler->mode32 && NOT_HALFWORD(srcw)) {
905 			/* Immediate-to-memory move. Only the SLJIT_MOV operation copies
906 			   an immediate directly into memory, so TMP_REG1 can be used. */
907 			FAIL_IF(emit_load_imm64(compiler, TMP_REG1, srcw));
908 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
909 			FAIL_IF(!inst);
910 			*inst = MOV_rm_r;
911 			return SLJIT_SUCCESS;
912 		}
913 #endif
914 		inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, dstw);
915 		FAIL_IF(!inst);
916 		*inst = MOV_rm_i32;
917 		return SLJIT_SUCCESS;
918 	}
919 	if (FAST_IS_REG(dst)) {
920 		inst = emit_x86_instruction(compiler, 1, dst, 0, src, srcw);
921 		FAIL_IF(!inst);
922 		*inst = MOV_r_rm;
923 		return SLJIT_SUCCESS;
924 	}
925 
926 	/* Memory-to-memory move. Only the SLJIT_MOV operation copies
927 	   data from memory to memory, so TMP_REG1 can be used. */
928 	inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src, srcw);
929 	FAIL_IF(!inst);
930 	*inst = MOV_r_rm;
931 	inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
932 	FAIL_IF(!inst);
933 	*inst = MOV_rm_r;
934 	return SLJIT_SUCCESS;
935 }
936 
937 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compiler *compiler, sljit_s32 op)
938 {
939 	sljit_u8 *inst;
940 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
941 	sljit_s32 size;
942 #endif
943 
944 	CHECK_ERROR();
945 	CHECK(check_sljit_emit_op0(compiler, op));
946 
947 	switch (GET_OPCODE(op)) {
948 	case SLJIT_BREAKPOINT:
949 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
950 		FAIL_IF(!inst);
951 		INC_SIZE(1);
952 		*inst = INT3;
953 		break;
954 	case SLJIT_NOP:
955 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
956 		FAIL_IF(!inst);
957 		INC_SIZE(1);
958 		*inst = NOP;
959 		break;
960 	case SLJIT_LMUL_UW:
961 	case SLJIT_LMUL_SW:
962 	case SLJIT_DIVMOD_UW:
963 	case SLJIT_DIVMOD_SW:
964 	case SLJIT_DIV_UW:
965 	case SLJIT_DIV_SW:
966 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
967 #ifdef _WIN64
968 		SLJIT_ASSERT(
969 			reg_map[SLJIT_R0] == 0
970 			&& reg_map[SLJIT_R1] == 2
971 			&& reg_map[TMP_REG1] > 7);
972 #else
973 		SLJIT_ASSERT(
974 			reg_map[SLJIT_R0] == 0
975 			&& reg_map[SLJIT_R1] < 7
976 			&& reg_map[TMP_REG1] == 2);
977 #endif
978 		compiler->mode32 = op & SLJIT_I32_OP;
979 #endif
980 		SLJIT_COMPILE_ASSERT((SLJIT_DIVMOD_UW & 0x2) == 0 && SLJIT_DIV_UW - 0x2 == SLJIT_DIVMOD_UW, bad_div_opcode_assignments);
981 
982 		op = GET_OPCODE(op);
983 		if ((op | 0x2) == SLJIT_DIV_UW) {
984 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) || defined(_WIN64)
985 			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_R1, 0);
986 			inst = emit_x86_instruction(compiler, 1, SLJIT_R1, 0, SLJIT_R1, 0);
987 #else
988 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, TMP_REG1, 0);
989 #endif
990 			FAIL_IF(!inst);
991 			*inst = XOR_r_rm;
992 		}
993 
994 		if ((op | 0x2) == SLJIT_DIV_SW) {
995 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) || defined(_WIN64)
996 			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_R1, 0);
997 #endif
998 
999 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1000 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
1001 			FAIL_IF(!inst);
1002 			INC_SIZE(1);
1003 			*inst = CDQ;
1004 #else
1005 			if (compiler->mode32) {
1006 				inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
1007 				FAIL_IF(!inst);
1008 				INC_SIZE(1);
1009 				*inst = CDQ;
1010 			} else {
1011 				inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
1012 				FAIL_IF(!inst);
1013 				INC_SIZE(2);
1014 				*inst++ = REX_W;
1015 				*inst = CDQ;
1016 			}
1017 #endif
1018 		}
1019 
1020 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1021 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
1022 		FAIL_IF(!inst);
1023 		INC_SIZE(2);
1024 		*inst++ = GROUP_F7;
1025 		*inst = MOD_REG | ((op >= SLJIT_DIVMOD_UW) ? reg_map[TMP_REG1] : reg_map[SLJIT_R1]);
1026 #else
1027 #ifdef _WIN64
1028 		size = (!compiler->mode32 || op >= SLJIT_DIVMOD_UW) ? 3 : 2;
1029 #else
1030 		size = (!compiler->mode32) ? 3 : 2;
1031 #endif
1032 		inst = (sljit_u8*)ensure_buf(compiler, 1 + size);
1033 		FAIL_IF(!inst);
1034 		INC_SIZE(size);
1035 #ifdef _WIN64
1036 		if (!compiler->mode32)
1037 			*inst++ = REX_W | ((op >= SLJIT_DIVMOD_UW) ? REX_B : 0);
1038 		else if (op >= SLJIT_DIVMOD_UW)
1039 			*inst++ = REX_B;
1040 		*inst++ = GROUP_F7;
1041 		*inst = MOD_REG | ((op >= SLJIT_DIVMOD_UW) ? reg_lmap[TMP_REG1] : reg_lmap[SLJIT_R1]);
1042 #else
1043 		if (!compiler->mode32)
1044 			*inst++ = REX_W;
1045 		*inst++ = GROUP_F7;
1046 		*inst = MOD_REG | reg_map[SLJIT_R1];
1047 #endif
1048 #endif
1049 		switch (op) {
1050 		case SLJIT_LMUL_UW:
1051 			*inst |= MUL;
1052 			break;
1053 		case SLJIT_LMUL_SW:
1054 			*inst |= IMUL;
1055 			break;
1056 		case SLJIT_DIVMOD_UW:
1057 		case SLJIT_DIV_UW:
1058 			*inst |= DIV;
1059 			break;
1060 		case SLJIT_DIVMOD_SW:
1061 		case SLJIT_DIV_SW:
1062 			*inst |= IDIV;
1063 			break;
1064 		}
1065 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) && !defined(_WIN64)
1066 		if (op <= SLJIT_DIVMOD_SW)
1067 			EMIT_MOV(compiler, SLJIT_R1, 0, TMP_REG1, 0);
1068 #else
1069 		if (op >= SLJIT_DIV_UW)
1070 			EMIT_MOV(compiler, SLJIT_R1, 0, TMP_REG1, 0);
1071 #endif
1072 		break;
1073 	case SLJIT_ENDBR:
1074 		return emit_endbranch(compiler);
1075 	case SLJIT_SKIP_FRAMES_BEFORE_RETURN:
1076 		return skip_frames_before_return(compiler);
1077 	}
1078 
1079 	return SLJIT_SUCCESS;
1080 }
1081 
1082 #define ENCODE_PREFIX(prefix) \
1083 	do { \
1084 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 1); \
1085 		FAIL_IF(!inst); \
1086 		INC_SIZE(1); \
1087 		*inst = (prefix); \
1088 	} while (0)
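/* ENCODE_PREFIX emits a single raw byte; it is used below for one-byte forms
   such as XCHG_EAX_r + reg, the short "xchg eax, reg" encoding. */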
1089 
1090 static sljit_s32 emit_mov_byte(struct sljit_compiler *compiler, sljit_s32 sign,
1091 	sljit_s32 dst, sljit_sw dstw,
1092 	sljit_s32 src, sljit_sw srcw)
1093 {
1094 	sljit_u8* inst;
1095 	sljit_s32 dst_r;
1096 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1097 	sljit_s32 work_r;
1098 #endif
1099 
1100 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1101 	compiler->mode32 = 0;
1102 #endif
1103 
1104 	if (src & SLJIT_IMM) {
1105 		if (FAST_IS_REG(dst)) {
1106 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1107 			return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
1108 #else
1109 			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, 0);
1110 			FAIL_IF(!inst);
1111 			*inst = MOV_rm_i32;
1112 			return SLJIT_SUCCESS;
1113 #endif
1114 		}
1115 		inst = emit_x86_instruction(compiler, 1 | EX86_BYTE_ARG | EX86_NO_REXW, SLJIT_IMM, srcw, dst, dstw);
1116 		FAIL_IF(!inst);
1117 		*inst = MOV_rm8_i8;
1118 		return SLJIT_SUCCESS;
1119 	}
1120 
1121 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
1122 
1123 	if ((dst & SLJIT_MEM) && FAST_IS_REG(src)) {
1124 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1125 		if (reg_map[src] >= 4) {
1126 			SLJIT_ASSERT(dst_r == TMP_REG1);
1127 			EMIT_MOV(compiler, TMP_REG1, 0, src, 0);
1128 		} else
1129 			dst_r = src;
1130 #else
1131 		dst_r = src;
1132 #endif
1133 	}
1134 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1135 	else if (FAST_IS_REG(src) && reg_map[src] >= 4) {
1136 		/* src, dst are registers. */
1137 		SLJIT_ASSERT(SLOW_IS_REG(dst));
1138 		if (reg_map[dst] < 4) {
1139 			if (dst != src)
1140 				EMIT_MOV(compiler, dst, 0, src, 0);
1141 			inst = emit_x86_instruction(compiler, 2, dst, 0, dst, 0);
1142 			FAIL_IF(!inst);
1143 			*inst++ = GROUP_0F;
1144 			*inst = sign ? MOVSX_r_rm8 : MOVZX_r_rm8;
1145 		}
1146 		else {
1147 			if (dst != src)
1148 				EMIT_MOV(compiler, dst, 0, src, 0);
1149 			if (sign) {
1150 				/* shl reg, 24 */
1151 				inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 24, dst, 0);
1152 				FAIL_IF(!inst);
1153 				*inst |= SHL;
1154 				/* sar reg, 24 */
1155 				inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 24, dst, 0);
1156 				FAIL_IF(!inst);
1157 				*inst |= SAR;
1158 			}
1159 			else {
1160 				inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, 0xff, dst, 0);
1161 				FAIL_IF(!inst);
1162 				*(inst + 1) |= AND;
1163 			}
1164 		}
1165 		return SLJIT_SUCCESS;
1166 	}
1167 #endif
1168 	else {
1169 		/* src is either a memory operand or, on x86-32, a register with reg_map[src] < 4. */
1170 		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src, srcw);
1171 		FAIL_IF(!inst);
1172 		*inst++ = GROUP_0F;
1173 		*inst = sign ? MOVSX_r_rm8 : MOVZX_r_rm8;
1174 	}
1175 
1176 	if (dst & SLJIT_MEM) {
1177 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1178 		if (dst_r == TMP_REG1) {
1179 			/* Find an unused register whose reg_map value is < 4. */
1180 			if ((dst & REG_MASK) == SLJIT_R0) {
1181 				if ((dst & OFFS_REG_MASK) == TO_OFFS_REG(SLJIT_R1))
1182 					work_r = SLJIT_R2;
1183 				else
1184 					work_r = SLJIT_R1;
1185 			}
1186 			else {
1187 				if ((dst & OFFS_REG_MASK) != TO_OFFS_REG(SLJIT_R0))
1188 					work_r = SLJIT_R0;
1189 				else if ((dst & REG_MASK) == SLJIT_R1)
1190 					work_r = SLJIT_R2;
1191 				else
1192 					work_r = SLJIT_R1;
1193 			}
1194 
1195 			if (work_r == SLJIT_R0) {
1196 				ENCODE_PREFIX(XCHG_EAX_r + reg_map[TMP_REG1]);
1197 			}
1198 			else {
1199 				inst = emit_x86_instruction(compiler, 1, work_r, 0, dst_r, 0);
1200 				FAIL_IF(!inst);
1201 				*inst = XCHG_r_rm;
1202 			}
1203 
1204 			inst = emit_x86_instruction(compiler, 1, work_r, 0, dst, dstw);
1205 			FAIL_IF(!inst);
1206 			*inst = MOV_rm8_r8;
1207 
1208 			if (work_r == SLJIT_R0) {
1209 				ENCODE_PREFIX(XCHG_EAX_r + reg_map[TMP_REG1]);
1210 			}
1211 			else {
1212 				inst = emit_x86_instruction(compiler, 1, work_r, 0, dst_r, 0);
1213 				FAIL_IF(!inst);
1214 				*inst = XCHG_r_rm;
1215 			}
1216 		}
1217 		else {
1218 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, dst, dstw);
1219 			FAIL_IF(!inst);
1220 			*inst = MOV_rm8_r8;
1221 		}
1222 #else
1223 		inst = emit_x86_instruction(compiler, 1 | EX86_REX | EX86_NO_REXW, dst_r, 0, dst, dstw);
1224 		FAIL_IF(!inst);
1225 		*inst = MOV_rm8_r8;
1226 #endif
1227 	}
1228 
1229 	return SLJIT_SUCCESS;
1230 }
1231 
1232 static sljit_s32 emit_prefetch(struct sljit_compiler *compiler, sljit_s32 op,
1233 	sljit_s32 src, sljit_sw srcw)
1234 {
1235 	sljit_u8* inst;
1236 
1237 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1238 	compiler->mode32 = 1;
1239 #endif
1240 
1241 	inst = emit_x86_instruction(compiler, 2, 0, 0, src, srcw);
1242 	FAIL_IF(!inst);
1243 	*inst++ = GROUP_0F;
1244 	*inst++ = PREFETCH;
1245 
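	/* 0f 18 /n: the ModRM reg field selects the locality hint:
	   /0 prefetchnta (the value left when no hint bit is OR-ed in),
	   /1 prefetcht0, /2 prefetcht1, /3 prefetcht2. */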
1246 	if (op == SLJIT_PREFETCH_L1)
1247 		*inst |= (1 << 3);
1248 	else if (op == SLJIT_PREFETCH_L2)
1249 		*inst |= (2 << 3);
1250 	else if (op == SLJIT_PREFETCH_L3)
1251 		*inst |= (3 << 3);
1252 
1253 	return SLJIT_SUCCESS;
1254 }
1255 
1256 static sljit_s32 emit_mov_half(struct sljit_compiler *compiler, sljit_s32 sign,
1257 	sljit_s32 dst, sljit_sw dstw,
1258 	sljit_s32 src, sljit_sw srcw)
1259 {
1260 	sljit_u8* inst;
1261 	sljit_s32 dst_r;
1262 
1263 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1264 	compiler->mode32 = 0;
1265 #endif
1266 
1267 	if (src & SLJIT_IMM) {
1268 		if (FAST_IS_REG(dst)) {
1269 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1270 			return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
1271 #else
1272 			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, 0);
1273 			FAIL_IF(!inst);
1274 			*inst = MOV_rm_i32;
1275 			return SLJIT_SUCCESS;
1276 #endif
1277 		}
1278 		inst = emit_x86_instruction(compiler, 1 | EX86_HALF_ARG | EX86_NO_REXW | EX86_PREF_66, SLJIT_IMM, srcw, dst, dstw);
1279 		FAIL_IF(!inst);
1280 		*inst = MOV_rm_i32;
1281 		return SLJIT_SUCCESS;
1282 	}
1283 
1284 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
1285 
1286 	if ((dst & SLJIT_MEM) && FAST_IS_REG(src))
1287 		dst_r = src;
1288 	else {
1289 		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src, srcw);
1290 		FAIL_IF(!inst);
1291 		*inst++ = GROUP_0F;
1292 		*inst = sign ? MOVSX_r_rm16 : MOVZX_r_rm16;
1293 	}
1294 
1295 	if (dst & SLJIT_MEM) {
1296 		inst = emit_x86_instruction(compiler, 1 | EX86_NO_REXW | EX86_PREF_66, dst_r, 0, dst, dstw);
1297 		FAIL_IF(!inst);
1298 		*inst = MOV_rm_r;
1299 	}
1300 
1301 	return SLJIT_SUCCESS;
1302 }
1303 
1304 static sljit_s32 emit_unary(struct sljit_compiler *compiler, sljit_u8 opcode,
1305 	sljit_s32 dst, sljit_sw dstw,
1306 	sljit_s32 src, sljit_sw srcw)
1307 {
1308 	sljit_u8* inst;
1309 
1310 	if (dst == src && dstw == srcw) {
1311 		/* Same input and output */
1312 		inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
1313 		FAIL_IF(!inst);
1314 		*inst++ = GROUP_F7;
1315 		*inst |= opcode;
1316 		return SLJIT_SUCCESS;
1317 	}
1318 
1319 	if (SLJIT_UNLIKELY(dst == SLJIT_UNUSED))
1320 		dst = TMP_REG1;
1321 
1322 	if (FAST_IS_REG(dst)) {
1323 		EMIT_MOV(compiler, dst, 0, src, srcw);
1324 		inst = emit_x86_instruction(compiler, 1, 0, 0, dst, 0);
1325 		FAIL_IF(!inst);
1326 		*inst++ = GROUP_F7;
1327 		*inst |= opcode;
1328 		return SLJIT_SUCCESS;
1329 	}
1330 
1331 	EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
1332 	inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
1333 	FAIL_IF(!inst);
1334 	*inst++ = GROUP_F7;
1335 	*inst |= opcode;
1336 	EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
1337 	return SLJIT_SUCCESS;
1338 }
1339 
1340 static sljit_s32 emit_not_with_flags(struct sljit_compiler *compiler,
1341 	sljit_s32 dst, sljit_sw dstw,
1342 	sljit_s32 src, sljit_sw srcw)
1343 {
1344 	sljit_u8* inst;
1345 
1346 	if (dst == SLJIT_UNUSED)
1347 		dst = TMP_REG1;
1348 
1349 	if (FAST_IS_REG(dst)) {
1350 		EMIT_MOV(compiler, dst, 0, src, srcw);
1351 		inst = emit_x86_instruction(compiler, 1, 0, 0, dst, 0);
1352 		FAIL_IF(!inst);
1353 		*inst++ = GROUP_F7;
1354 		*inst |= NOT_rm;
1355 		inst = emit_x86_instruction(compiler, 1, dst, 0, dst, 0);
1356 		FAIL_IF(!inst);
1357 		*inst = OR_r_rm;
1358 		return SLJIT_SUCCESS;
1359 	}
1360 
1361 	EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
1362 	inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
1363 	FAIL_IF(!inst);
1364 	*inst++ = GROUP_F7;
1365 	*inst |= NOT_rm;
1366 	inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, TMP_REG1, 0);
1367 	FAIL_IF(!inst);
1368 	*inst = OR_r_rm;
1369 	EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
1370 	return SLJIT_SUCCESS;
1371 }
1372 
1373 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1374 static const sljit_sw emit_clz_arg = 32 + 31;
1375 #endif
1376 
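/* CLZ is computed as bsr(src) ^ 31 (or ^ 63 in 64-bit mode), since x ^ 31 ==
   31 - x for 0 <= x <= 31. BSR leaves its destination undefined when the
   source is zero, so a CMOV (or the generic fallback) first loads 32 + 31,
   which the final XOR turns into the expected result of 32 (or 64). */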
1377 static sljit_s32 emit_clz(struct sljit_compiler *compiler, sljit_s32 op_flags,
1378 	sljit_s32 dst, sljit_sw dstw,
1379 	sljit_s32 src, sljit_sw srcw)
1380 {
1381 	sljit_u8* inst;
1382 	sljit_s32 dst_r;
1383 
1384 	SLJIT_UNUSED_ARG(op_flags);
1385 
1386 	if (cpu_has_cmov == -1)
1387 		get_cpu_features();
1388 
1389 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
1390 
1391 	inst = emit_x86_instruction(compiler, 2, dst_r, 0, src, srcw);
1392 	FAIL_IF(!inst);
1393 	*inst++ = GROUP_0F;
1394 	*inst = BSR_r_rm;
1395 
1396 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1397 	if (cpu_has_cmov) {
1398 		if (dst_r != TMP_REG1) {
1399 			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, 32 + 31);
1400 			inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG1, 0);
1401 		}
1402 		else
1403 			inst = emit_x86_instruction(compiler, 2, dst_r, 0, SLJIT_MEM0(), (sljit_sw)&emit_clz_arg);
1404 
1405 		FAIL_IF(!inst);
1406 		*inst++ = GROUP_0F;
1407 		*inst = CMOVE_r_rm;
1408 	}
1409 	else
1410 		FAIL_IF(sljit_emit_cmov_generic(compiler, SLJIT_EQUAL, dst_r, SLJIT_IMM, 32 + 31));
1411 
1412 	inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, 31, dst_r, 0);
1413 #else
1414 	if (cpu_has_cmov) {
1415 		EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_IMM, !(op_flags & SLJIT_I32_OP) ? (64 + 63) : (32 + 31));
1416 
1417 		inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG2, 0);
1418 		FAIL_IF(!inst);
1419 		*inst++ = GROUP_0F;
1420 		*inst = CMOVE_r_rm;
1421 	}
1422 	else
1423 		FAIL_IF(sljit_emit_cmov_generic(compiler, SLJIT_EQUAL, dst_r, SLJIT_IMM, !(op_flags & SLJIT_I32_OP) ? (64 + 63) : (32 + 31)));
1424 
1425 	inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, !(op_flags & SLJIT_I32_OP) ? 63 : 31, dst_r, 0);
1426 #endif
1427 
1428 	FAIL_IF(!inst);
1429 	*(inst + 1) |= XOR;
1430 
1431 	if (dst & SLJIT_MEM)
1432 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
1433 	return SLJIT_SUCCESS;
1434 }
1435 
1436 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op1(struct sljit_compiler *compiler, sljit_s32 op,
1437 	sljit_s32 dst, sljit_sw dstw,
1438 	sljit_s32 src, sljit_sw srcw)
1439 {
1440 	sljit_s32 op_flags = GET_ALL_FLAGS(op);
1441 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1442 	sljit_s32 dst_is_ereg = 0;
1443 #endif
1444 
1445 	CHECK_ERROR();
1446 	CHECK(check_sljit_emit_op1(compiler, op, dst, dstw, src, srcw));
1447 	ADJUST_LOCAL_OFFSET(dst, dstw);
1448 	ADJUST_LOCAL_OFFSET(src, srcw);
1449 
1450 	CHECK_EXTRA_REGS(dst, dstw, dst_is_ereg = 1);
1451 	CHECK_EXTRA_REGS(src, srcw, (void)0);
1452 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1453 	compiler->mode32 = op_flags & SLJIT_I32_OP;
1454 #endif
1455 
1456 	op = GET_OPCODE(op);
1457 
1458 	if (op >= SLJIT_MOV && op <= SLJIT_MOV_P) {
1459 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1460 		compiler->mode32 = 0;
1461 #endif
1462 
1463 		if (FAST_IS_REG(src) && src == dst) {
1464 			if (!TYPE_CAST_NEEDED(op))
1465 				return SLJIT_SUCCESS;
1466 		}
1467 
1468 		if (op_flags & SLJIT_I32_OP) {
1469 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1470 			if (src & SLJIT_MEM) {
1471 				if (op == SLJIT_MOV_S32)
1472 					op = SLJIT_MOV_U32;
1473 			}
1474 			else if (src & SLJIT_IMM) {
1475 				if (op == SLJIT_MOV_U32)
1476 					op = SLJIT_MOV_S32;
1477 			}
1478 #endif
1479 		}
1480 
1481 		if (src & SLJIT_IMM) {
1482 			switch (op) {
1483 			case SLJIT_MOV_U8:
1484 				srcw = (sljit_u8)srcw;
1485 				break;
1486 			case SLJIT_MOV_S8:
1487 				srcw = (sljit_s8)srcw;
1488 				break;
1489 			case SLJIT_MOV_U16:
1490 				srcw = (sljit_u16)srcw;
1491 				break;
1492 			case SLJIT_MOV_S16:
1493 				srcw = (sljit_s16)srcw;
1494 				break;
1495 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1496 			case SLJIT_MOV_U32:
1497 				srcw = (sljit_u32)srcw;
1498 				break;
1499 			case SLJIT_MOV_S32:
1500 				srcw = (sljit_s32)srcw;
1501 				break;
1502 #endif
1503 			}
1504 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1505 			if (SLJIT_UNLIKELY(dst_is_ereg))
1506 				return emit_mov(compiler, dst, dstw, src, srcw);
1507 #endif
1508 		}
1509 
1510 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1511 		if (SLJIT_UNLIKELY(dst_is_ereg) && (!(op == SLJIT_MOV || op == SLJIT_MOV_U32 || op == SLJIT_MOV_S32 || op == SLJIT_MOV_P) || (src & SLJIT_MEM))) {
1512 			SLJIT_ASSERT(dst == SLJIT_MEM1(SLJIT_SP));
1513 			dst = TMP_REG1;
1514 		}
1515 #endif
1516 
1517 		switch (op) {
1518 		case SLJIT_MOV:
1519 		case SLJIT_MOV_P:
1520 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1521 		case SLJIT_MOV_U32:
1522 		case SLJIT_MOV_S32:
1523 #endif
1524 			FAIL_IF(emit_mov(compiler, dst, dstw, src, srcw));
1525 			break;
1526 		case SLJIT_MOV_U8:
1527 			FAIL_IF(emit_mov_byte(compiler, 0, dst, dstw, src, srcw));
1528 			break;
1529 		case SLJIT_MOV_S8:
1530 			FAIL_IF(emit_mov_byte(compiler, 1, dst, dstw, src, srcw));
1531 			break;
1532 		case SLJIT_MOV_U16:
1533 			FAIL_IF(emit_mov_half(compiler, 0, dst, dstw, src, srcw));
1534 			break;
1535 		case SLJIT_MOV_S16:
1536 			FAIL_IF(emit_mov_half(compiler, 1, dst, dstw, src, srcw));
1537 			break;
1538 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1539 		case SLJIT_MOV_U32:
1540 			FAIL_IF(emit_mov_int(compiler, 0, dst, dstw, src, srcw));
1541 			break;
1542 		case SLJIT_MOV_S32:
1543 			FAIL_IF(emit_mov_int(compiler, 1, dst, dstw, src, srcw));
1544 			break;
1545 #endif
1546 		}
1547 
1548 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1549 		if (SLJIT_UNLIKELY(dst_is_ereg) && dst == TMP_REG1)
1550 			return emit_mov(compiler, SLJIT_MEM1(SLJIT_SP), dstw, TMP_REG1, 0);
1551 #endif
1552 		return SLJIT_SUCCESS;
1553 	}
1554 
1555 	switch (op) {
1556 	case SLJIT_NOT:
1557 		if (SLJIT_UNLIKELY(op_flags & SLJIT_SET_Z))
1558 			return emit_not_with_flags(compiler, dst, dstw, src, srcw);
1559 		return emit_unary(compiler, NOT_rm, dst, dstw, src, srcw);
1560 
1561 	case SLJIT_NEG:
1562 		return emit_unary(compiler, NEG_rm, dst, dstw, src, srcw);
1563 
1564 	case SLJIT_CLZ:
1565 		return emit_clz(compiler, op_flags, dst, dstw, src, srcw);
1566 	}
1567 
1568 	return SLJIT_SUCCESS;
1569 }
1570 
1571 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1572 
1573 #define BINARY_IMM(op_imm, op_mr, immw, arg, argw) \
1574 	if (IS_HALFWORD(immw) || compiler->mode32) { \
1575 		inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, immw, arg, argw); \
1576 		FAIL_IF(!inst); \
1577 		*(inst + 1) |= (op_imm); \
1578 	} \
1579 	else { \
1580 		FAIL_IF(emit_load_imm64(compiler, (arg == TMP_REG1) ? TMP_REG2 : TMP_REG1, immw)); \
1581 		inst = emit_x86_instruction(compiler, 1, (arg == TMP_REG1) ? TMP_REG2 : TMP_REG1, 0, arg, argw); \
1582 		FAIL_IF(!inst); \
1583 		*inst = (op_mr); \
1584 	}
1585 
1586 #define BINARY_EAX_IMM(op_eax_imm, immw) \
1587 	FAIL_IF(emit_do_imm32(compiler, (!compiler->mode32) ? REX_W : 0, (op_eax_imm), immw))
1588 
1589 #else
1590 
1591 #define BINARY_IMM(op_imm, op_mr, immw, arg, argw) \
1592 	inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, immw, arg, argw); \
1593 	FAIL_IF(!inst); \
1594 	*(inst + 1) |= (op_imm);
1595 
1596 #define BINARY_EAX_IMM(op_eax_imm, immw) \
1597 	FAIL_IF(emit_do_imm(compiler, (op_eax_imm), immw))
1598 
1599 #endif
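/* BINARY_IMM emits the immediate group form (0x81/0x83, selected inside
   emit_x86_instruction) when the immediate fits into a sign-extended 32-bit
   field, which is always the case on x86-32; otherwise, on x86-64, the
   immediate is first loaded into TMP_REG1 or TMP_REG2 and the register form
   op_mr is used instead. */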
1600 
1601 static sljit_s32 emit_cum_binary(struct sljit_compiler *compiler,
1602 	sljit_u32 op_types,
1603 	sljit_s32 dst, sljit_sw dstw,
1604 	sljit_s32 src1, sljit_sw src1w,
1605 	sljit_s32 src2, sljit_sw src2w)
1606 {
1607 	sljit_u8* inst;
1608 	sljit_u8 op_eax_imm = (op_types >> 24);
1609 	sljit_u8 op_rm = (op_types >> 16) & 0xff;
1610 	sljit_u8 op_mr = (op_types >> 8) & 0xff;
1611 	sljit_u8 op_imm = op_types & 0xff;
1612 
1613 	if (dst == SLJIT_UNUSED) {
1614 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1615 		if (src2 & SLJIT_IMM) {
1616 			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
1617 		}
1618 		else {
1619 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
1620 			FAIL_IF(!inst);
1621 			*inst = op_rm;
1622 		}
1623 		return SLJIT_SUCCESS;
1624 	}
1625 
1626 	if (dst == src1 && dstw == src1w) {
1627 		if (src2 & SLJIT_IMM) {
1628 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1629 			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
1630 #else
1631 			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128)) {
1632 #endif
1633 				BINARY_EAX_IMM(op_eax_imm, src2w);
1634 			}
1635 			else {
1636 				BINARY_IMM(op_imm, op_mr, src2w, dst, dstw);
1637 			}
1638 		}
1639 		else if (FAST_IS_REG(dst)) {
1640 			inst = emit_x86_instruction(compiler, 1, dst, dstw, src2, src2w);
1641 			FAIL_IF(!inst);
1642 			*inst = op_rm;
1643 		}
1644 		else if (FAST_IS_REG(src2)) {
1645 			/* Special exception for sljit_emit_op_flags. */
1646 			inst = emit_x86_instruction(compiler, 1, src2, src2w, dst, dstw);
1647 			FAIL_IF(!inst);
1648 			*inst = op_mr;
1649 		}
1650 		else {
1651 			EMIT_MOV(compiler, TMP_REG1, 0, src2, src2w);
1652 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
1653 			FAIL_IF(!inst);
1654 			*inst = op_mr;
1655 		}
1656 		return SLJIT_SUCCESS;
1657 	}
1658 
1659 	/* Only for cumulative operations. */
1660 	if (dst == src2 && dstw == src2w) {
1661 		if (src1 & SLJIT_IMM) {
1662 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1663 			if ((dst == SLJIT_R0) && (src1w > 127 || src1w < -128) && (compiler->mode32 || IS_HALFWORD(src1w))) {
1664 #else
1665 			if ((dst == SLJIT_R0) && (src1w > 127 || src1w < -128)) {
1666 #endif
1667 				BINARY_EAX_IMM(op_eax_imm, src1w);
1668 			}
1669 			else {
1670 				BINARY_IMM(op_imm, op_mr, src1w, dst, dstw);
1671 			}
1672 		}
1673 		else if (FAST_IS_REG(dst)) {
1674 			inst = emit_x86_instruction(compiler, 1, dst, dstw, src1, src1w);
1675 			FAIL_IF(!inst);
1676 			*inst = op_rm;
1677 		}
1678 		else if (FAST_IS_REG(src1)) {
1679 			inst = emit_x86_instruction(compiler, 1, src1, src1w, dst, dstw);
1680 			FAIL_IF(!inst);
1681 			*inst = op_mr;
1682 		}
1683 		else {
1684 			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1685 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
1686 			FAIL_IF(!inst);
1687 			*inst = op_mr;
1688 		}
1689 		return SLJIT_SUCCESS;
1690 	}
1691 
1692 	/* General version. */
1693 	if (FAST_IS_REG(dst)) {
1694 		EMIT_MOV(compiler, dst, 0, src1, src1w);
1695 		if (src2 & SLJIT_IMM) {
1696 			BINARY_IMM(op_imm, op_mr, src2w, dst, 0);
1697 		}
1698 		else {
1699 			inst = emit_x86_instruction(compiler, 1, dst, 0, src2, src2w);
1700 			FAIL_IF(!inst);
1701 			*inst = op_rm;
1702 		}
1703 	}
1704 	else {
1705 		/* This version requires fewer memory writes. */
1706 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1707 		if (src2 & SLJIT_IMM) {
1708 			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
1709 		}
1710 		else {
1711 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
1712 			FAIL_IF(!inst);
1713 			*inst = op_rm;
1714 		}
1715 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
1716 	}
1717 
1718 	return SLJIT_SUCCESS;
1719 }
1720 
1721 static sljit_s32 emit_non_cum_binary(struct sljit_compiler *compiler,
1722 	sljit_u32 op_types,
1723 	sljit_s32 dst, sljit_sw dstw,
1724 	sljit_s32 src1, sljit_sw src1w,
1725 	sljit_s32 src2, sljit_sw src2w)
1726 {
1727 	sljit_u8* inst;
1728 	sljit_u8 op_eax_imm = (op_types >> 24);
1729 	sljit_u8 op_rm = (op_types >> 16) & 0xff;
1730 	sljit_u8 op_mr = (op_types >> 8) & 0xff;
1731 	sljit_u8 op_imm = op_types & 0xff;
1732 
1733 	if (dst == SLJIT_UNUSED) {
1734 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1735 		if (src2 & SLJIT_IMM) {
1736 			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
1737 		}
1738 		else {
1739 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
1740 			FAIL_IF(!inst);
1741 			*inst = op_rm;
1742 		}
1743 		return SLJIT_SUCCESS;
1744 	}
1745 
1746 	if (dst == src1 && dstw == src1w) {
1747 		if (src2 & SLJIT_IMM) {
1748 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1749 			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
1750 #else
1751 			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128)) {
1752 #endif
1753 				BINARY_EAX_IMM(op_eax_imm, src2w);
1754 			}
1755 			else {
1756 				BINARY_IMM(op_imm, op_mr, src2w, dst, dstw);
1757 			}
1758 		}
1759 		else if (FAST_IS_REG(dst)) {
1760 			inst = emit_x86_instruction(compiler, 1, dst, dstw, src2, src2w);
1761 			FAIL_IF(!inst);
1762 			*inst = op_rm;
1763 		}
1764 		else if (FAST_IS_REG(src2)) {
1765 			inst = emit_x86_instruction(compiler, 1, src2, src2w, dst, dstw);
1766 			FAIL_IF(!inst);
1767 			*inst = op_mr;
1768 		}
1769 		else {
1770 			EMIT_MOV(compiler, TMP_REG1, 0, src2, src2w);
1771 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
1772 			FAIL_IF(!inst);
1773 			*inst = op_mr;
1774 		}
1775 		return SLJIT_SUCCESS;
1776 	}
1777 
1778 	/* General version. */
1779 	if (FAST_IS_REG(dst) && dst != src2) {
1780 		EMIT_MOV(compiler, dst, 0, src1, src1w);
1781 		if (src2 & SLJIT_IMM) {
1782 			BINARY_IMM(op_imm, op_mr, src2w, dst, 0);
1783 		}
1784 		else {
1785 			inst = emit_x86_instruction(compiler, 1, dst, 0, src2, src2w);
1786 			FAIL_IF(!inst);
1787 			*inst = op_rm;
1788 		}
1789 	}
1790 	else {
1791 		/* This version requires fewer memory writes. */
1792 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1793 		if (src2 & SLJIT_IMM) {
1794 			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
1795 		}
1796 		else {
1797 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
1798 			FAIL_IF(!inst);
1799 			*inst = op_rm;
1800 		}
1801 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
1802 	}
1803 
1804 	return SLJIT_SUCCESS;
1805 }
1806 
1807 static sljit_s32 emit_mul(struct sljit_compiler *compiler,
1808 	sljit_s32 dst, sljit_sw dstw,
1809 	sljit_s32 src1, sljit_sw src1w,
1810 	sljit_s32 src2, sljit_sw src2w)
1811 {
1812 	sljit_u8* inst;
1813 	sljit_s32 dst_r;
1814 
1815 	dst_r = SLOW_IS_REG(dst) ? dst : TMP_REG1;
1816 
1817 	/* Register destination. */
1818 	if (dst_r == src1 && !(src2 & SLJIT_IMM)) {
1819 		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src2, src2w);
1820 		FAIL_IF(!inst);
1821 		*inst++ = GROUP_0F;
1822 		*inst = IMUL_r_rm;
1823 	}
1824 	else if (dst_r == src2 && !(src1 & SLJIT_IMM)) {
1825 		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src1, src1w);
1826 		FAIL_IF(!inst);
1827 		*inst++ = GROUP_0F;
1828 		*inst = IMUL_r_rm;
1829 	}
1830 	else if (src1 & SLJIT_IMM) {
1831 		if (src2 & SLJIT_IMM) {
1832 			EMIT_MOV(compiler, dst_r, 0, SLJIT_IMM, src2w);
1833 			src2 = dst_r;
1834 			src2w = 0;
1835 		}
1836 
1837 		if (src1w <= 127 && src1w >= -128) {
1838 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
1839 			FAIL_IF(!inst);
1840 			*inst = IMUL_r_rm_i8;
1841 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
1842 			FAIL_IF(!inst);
1843 			INC_SIZE(1);
1844 			*inst = (sljit_s8)src1w;
1845 		}
1846 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1847 		else {
1848 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
1849 			FAIL_IF(!inst);
1850 			*inst = IMUL_r_rm_i32;
1851 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
1852 			FAIL_IF(!inst);
1853 			INC_SIZE(4);
1854 			sljit_unaligned_store_sw(inst, src1w);
1855 		}
1856 #else
1857 		else if (IS_HALFWORD(src1w)) {
1858 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
1859 			FAIL_IF(!inst);
1860 			*inst = IMUL_r_rm_i32;
1861 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
1862 			FAIL_IF(!inst);
1863 			INC_SIZE(4);
1864 			sljit_unaligned_store_s32(inst, (sljit_s32)src1w);
1865 		}
1866 		else {
1867 			if (dst_r != src2)
1868 				EMIT_MOV(compiler, dst_r, 0, src2, src2w);
1869 			FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src1w));
1870 			inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG2, 0);
1871 			FAIL_IF(!inst);
1872 			*inst++ = GROUP_0F;
1873 			*inst = IMUL_r_rm;
1874 		}
1875 #endif
1876 	}
1877 	else if (src2 & SLJIT_IMM) {
1878 		/* Note: src1 is NOT immediate. */
1879 
1880 		if (src2w <= 127 && src2w >= -128) {
1881 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
1882 			FAIL_IF(!inst);
1883 			*inst = IMUL_r_rm_i8;
1884 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
1885 			FAIL_IF(!inst);
1886 			INC_SIZE(1);
1887 			*inst = (sljit_s8)src2w;
1888 		}
1889 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1890 		else {
1891 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
1892 			FAIL_IF(!inst);
1893 			*inst = IMUL_r_rm_i32;
1894 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
1895 			FAIL_IF(!inst);
1896 			INC_SIZE(4);
1897 			sljit_unaligned_store_sw(inst, src2w);
1898 		}
1899 #else
1900 		else if (IS_HALFWORD(src2w)) {
1901 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
1902 			FAIL_IF(!inst);
1903 			*inst = IMUL_r_rm_i32;
1904 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
1905 			FAIL_IF(!inst);
1906 			INC_SIZE(4);
1907 			sljit_unaligned_store_s32(inst, (sljit_s32)src2w);
1908 		}
1909 		else {
1910 			if (dst_r != src1)
1911 				EMIT_MOV(compiler, dst_r, 0, src1, src1w);
1912 			FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src2w));
1913 			inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG2, 0);
1914 			FAIL_IF(!inst);
1915 			*inst++ = GROUP_0F;
1916 			*inst = IMUL_r_rm;
1917 		}
1918 #endif
1919 	}
1920 	else {
1921 		/* Neither argument is immediate. */
1922 		if (ADDRESSING_DEPENDS_ON(src2, dst_r))
1923 			dst_r = TMP_REG1;
1924 		EMIT_MOV(compiler, dst_r, 0, src1, src1w);
1925 		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src2, src2w);
1926 		FAIL_IF(!inst);
1927 		*inst++ = GROUP_0F;
1928 		*inst = IMUL_r_rm;
1929 	}
1930 
1931 	if (dst & SLJIT_MEM)
1932 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
1933 
1934 	return SLJIT_SUCCESS;
1935 }
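/* Illustrative note on the IMUL forms chosen above (a sketch, not authoritative docs):
       imul r32, r/m32          ->  0F AF /r      (register * register/memory)
       imul r32, r/m32, imm8    ->  6B /r ib      (immediate fits in a signed byte)
       imul r32, r/m32, imm32   ->  69 /r id      (wider immediate)
   On x86-64 an immediate that does not fit in 32 bits cannot be encoded at all, so
   emit_mul() first loads it into TMP_REG2 with emit_load_imm64() and falls back to
   the two-operand 0F AF form. */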
1936 
1937 static sljit_s32 emit_lea_binary(struct sljit_compiler *compiler,
1938 	sljit_s32 dst, sljit_sw dstw,
1939 	sljit_s32 src1, sljit_sw src1w,
1940 	sljit_s32 src2, sljit_sw src2w)
1941 {
1942 	sljit_u8* inst;
1943 	sljit_s32 dst_r, done = 0;
1944 
1945 	/* These cases are better left to be handled the normal way. */
1946 	if (dst == src1 && dstw == src1w)
1947 		return SLJIT_ERR_UNSUPPORTED;
1948 	if (dst == src2 && dstw == src2w)
1949 		return SLJIT_ERR_UNSUPPORTED;
1950 
1951 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
1952 
1953 	if (FAST_IS_REG(src1)) {
1954 		if (FAST_IS_REG(src2)) {
1955 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM2(src1, src2), 0);
1956 			FAIL_IF(!inst);
1957 			*inst = LEA_r_m;
1958 			done = 1;
1959 		}
1960 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1961 		if ((src2 & SLJIT_IMM) && (compiler->mode32 || IS_HALFWORD(src2w))) {
1962 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src1), (sljit_s32)src2w);
1963 #else
1964 		if (src2 & SLJIT_IMM) {
1965 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src1), src2w);
1966 #endif
1967 			FAIL_IF(!inst);
1968 			*inst = LEA_r_m;
1969 			done = 1;
1970 		}
1971 	}
1972 	else if (FAST_IS_REG(src2)) {
1973 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1974 		if ((src1 & SLJIT_IMM) && (compiler->mode32 || IS_HALFWORD(src1w))) {
1975 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src2), (sljit_s32)src1w);
1976 #else
1977 		if (src1 & SLJIT_IMM) {
1978 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src2), src1w);
1979 #endif
1980 			FAIL_IF(!inst);
1981 			*inst = LEA_r_m;
1982 			done = 1;
1983 		}
1984 	}
1985 
1986 	if (done) {
1987 		if (dst_r == TMP_REG1)
1988 			return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
1989 		return SLJIT_SUCCESS;
1990 	}
1991 	return SLJIT_ERR_UNSUPPORTED;
1992 }
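/* Illustrative sketch (assumed example registers, not taken from the sources): when
   no flags are requested, a three-operand addition can be folded into a single LEA,
   which computes an effective address without touching EFLAGS:
       lea eax, [ecx + edx]     ; eax = ecx + edx, flags preserved
       lea eax, [ecx + 16]      ; eax = ecx + 16,  flags preserved
   This is why sljit_emit_op2() tries emit_lea_binary() first for SLJIT_ADD (and for
   SLJIT_SUB with an immediate, negating the constant) when HAS_FLAGS(op) is false. */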
1993 
1994 static sljit_s32 emit_cmp_binary(struct sljit_compiler *compiler,
1995 	sljit_s32 src1, sljit_sw src1w,
1996 	sljit_s32 src2, sljit_sw src2w)
1997 {
1998 	sljit_u8* inst;
1999 
2000 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2001 	if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
2002 #else
2003 	if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128)) {
2004 #endif
2005 		BINARY_EAX_IMM(CMP_EAX_i32, src2w);
2006 		return SLJIT_SUCCESS;
2007 	}
2008 
2009 	if (FAST_IS_REG(src1)) {
2010 		if (src2 & SLJIT_IMM) {
2011 			BINARY_IMM(CMP, CMP_rm_r, src2w, src1, 0);
2012 		}
2013 		else {
2014 			inst = emit_x86_instruction(compiler, 1, src1, 0, src2, src2w);
2015 			FAIL_IF(!inst);
2016 			*inst = CMP_r_rm;
2017 		}
2018 		return SLJIT_SUCCESS;
2019 	}
2020 
2021 	if (FAST_IS_REG(src2) && !(src1 & SLJIT_IMM)) {
2022 		inst = emit_x86_instruction(compiler, 1, src2, 0, src1, src1w);
2023 		FAIL_IF(!inst);
2024 		*inst = CMP_rm_r;
2025 		return SLJIT_SUCCESS;
2026 	}
2027 
2028 	if (src2 & SLJIT_IMM) {
2029 		if (src1 & SLJIT_IMM) {
2030 			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2031 			src1 = TMP_REG1;
2032 			src1w = 0;
2033 		}
2034 		BINARY_IMM(CMP, CMP_rm_r, src2w, src1, src1w);
2035 	}
2036 	else {
2037 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2038 		inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
2039 		FAIL_IF(!inst);
2040 		*inst = CMP_r_rm;
2041 	}
2042 	return SLJIT_SUCCESS;
2043 }
2044 
2045 static sljit_s32 emit_test_binary(struct sljit_compiler *compiler,
2046 	sljit_s32 src1, sljit_sw src1w,
2047 	sljit_s32 src2, sljit_sw src2w)
2048 {
2049 	sljit_u8* inst;
2050 
2051 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2052 	if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
2053 #else
2054 	if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128)) {
2055 #endif
2056 		BINARY_EAX_IMM(TEST_EAX_i32, src2w);
2057 		return SLJIT_SUCCESS;
2058 	}
2059 
2060 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2061 	if (src2 == SLJIT_R0 && (src1 & SLJIT_IMM) && (src1w > 127 || src1w < -128) && (compiler->mode32 || IS_HALFWORD(src1w))) {
2062 #else
2063 	if (src2 == SLJIT_R0 && (src1 & SLJIT_IMM) && (src1w > 127 || src1w < -128)) {
2064 #endif
2065 		BINARY_EAX_IMM(TEST_EAX_i32, src1w);
2066 		return SLJIT_SUCCESS;
2067 	}
2068 
2069 	if (!(src1 & SLJIT_IMM)) {
2070 		if (src2 & SLJIT_IMM) {
2071 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2072 			if (IS_HALFWORD(src2w) || compiler->mode32) {
2073 				inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, src1w);
2074 				FAIL_IF(!inst);
2075 				*inst = GROUP_F7;
2076 			}
2077 			else {
2078 				FAIL_IF(emit_load_imm64(compiler, TMP_REG1, src2w));
2079 				inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src1, src1w);
2080 				FAIL_IF(!inst);
2081 				*inst = TEST_rm_r;
2082 			}
2083 #else
2084 			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, src1w);
2085 			FAIL_IF(!inst);
2086 			*inst = GROUP_F7;
2087 #endif
2088 			return SLJIT_SUCCESS;
2089 		}
2090 		else if (FAST_IS_REG(src1)) {
2091 			inst = emit_x86_instruction(compiler, 1, src1, 0, src2, src2w);
2092 			FAIL_IF(!inst);
2093 			*inst = TEST_rm_r;
2094 			return SLJIT_SUCCESS;
2095 		}
2096 	}
2097 
2098 	if (!(src2 & SLJIT_IMM)) {
2099 		if (src1 & SLJIT_IMM) {
2100 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2101 			if (IS_HALFWORD(src1w) || compiler->mode32) {
2102 				inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src1w, src2, src2w);
2103 				FAIL_IF(!inst);
2104 				*inst = GROUP_F7;
2105 			}
2106 			else {
2107 				FAIL_IF(emit_load_imm64(compiler, TMP_REG1, src1w));
2108 				inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
2109 				FAIL_IF(!inst);
2110 				*inst = TEST_rm_r;
2111 			}
2112 #else
2113 			inst = emit_x86_instruction(compiler, 1, src1, src1w, src2, src2w);
2114 			FAIL_IF(!inst);
2115 			*inst = GROUP_F7;
2116 #endif
2117 			return SLJIT_SUCCESS;
2118 		}
2119 		else if (FAST_IS_REG(src2)) {
2120 			inst = emit_x86_instruction(compiler, 1, src2, 0, src1, src1w);
2121 			FAIL_IF(!inst);
2122 			*inst = TEST_rm_r;
2123 			return SLJIT_SUCCESS;
2124 		}
2125 	}
2126 
2127 	EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2128 	if (src2 & SLJIT_IMM) {
2129 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2130 		if (IS_HALFWORD(src2w) || compiler->mode32) {
2131 			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, TMP_REG1, 0);
2132 			FAIL_IF(!inst);
2133 			*inst = GROUP_F7;
2134 		}
2135 		else {
2136 			FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src2w));
2137 			inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, TMP_REG1, 0);
2138 			FAIL_IF(!inst);
2139 			*inst = TEST_rm_r;
2140 		}
2141 #else
2142 		inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, TMP_REG1, 0);
2143 		FAIL_IF(!inst);
2144 		*inst = GROUP_F7;
2145 #endif
2146 	}
2147 	else {
2148 		inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
2149 		FAIL_IF(!inst);
2150 		*inst = TEST_rm_r;
2151 	}
2152 	return SLJIT_SUCCESS;
2153 }
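/* Illustrative note: TEST performs an AND purely for its flag effects and is symmetric
   in its operands, which is why either operand order can be emitted above. Sketch of
   the encodings used (32-bit operand size):
       test eax, imm32    ->  A9 id       (TEST_EAX_i32 accumulator short form)
       test r/m32, r32    ->  85 /r       (TEST_rm_r)
       test r/m32, imm32  ->  F7 /0 id    (GROUP_F7)
   sljit_emit_op2() routes SLJIT_AND with an unused destination here, since only the
   flags are needed. */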
2154 
2155 static sljit_s32 emit_shift(struct sljit_compiler *compiler,
2156 	sljit_u8 mode,
2157 	sljit_s32 dst, sljit_sw dstw,
2158 	sljit_s32 src1, sljit_sw src1w,
2159 	sljit_s32 src2, sljit_sw src2w)
2160 {
2161 	sljit_u8* inst;
2162 
2163 	if ((src2 & SLJIT_IMM) || (src2 == SLJIT_PREF_SHIFT_REG)) {
2164 		if (dst == src1 && dstw == src1w) {
2165 			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, dst, dstw);
2166 			FAIL_IF(!inst);
2167 			*inst |= mode;
2168 			return SLJIT_SUCCESS;
2169 		}
2170 		if (dst == SLJIT_UNUSED) {
2171 			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2172 			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, TMP_REG1, 0);
2173 			FAIL_IF(!inst);
2174 			*inst |= mode;
2175 			return SLJIT_SUCCESS;
2176 		}
2177 		if (dst == SLJIT_PREF_SHIFT_REG && src2 == SLJIT_PREF_SHIFT_REG) {
2178 			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2179 			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2180 			FAIL_IF(!inst);
2181 			*inst |= mode;
2182 			EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2183 			return SLJIT_SUCCESS;
2184 		}
2185 		if (FAST_IS_REG(dst)) {
2186 			EMIT_MOV(compiler, dst, 0, src1, src1w);
2187 			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, dst, 0);
2188 			FAIL_IF(!inst);
2189 			*inst |= mode;
2190 			return SLJIT_SUCCESS;
2191 		}
2192 
2193 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2194 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, TMP_REG1, 0);
2195 		FAIL_IF(!inst);
2196 		*inst |= mode;
2197 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
2198 		return SLJIT_SUCCESS;
2199 	}
2200 
2201 	if (dst == SLJIT_PREF_SHIFT_REG) {
2202 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2203 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
2204 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2205 		FAIL_IF(!inst);
2206 		*inst |= mode;
2207 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2208 	}
2209 	else if (SLOW_IS_REG(dst) && dst != src2 && !ADDRESSING_DEPENDS_ON(src2, dst)) {
2210 		if (src1 != dst)
2211 			EMIT_MOV(compiler, dst, 0, src1, src1w);
2212 		EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_PREF_SHIFT_REG, 0);
2213 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
2214 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, dst, 0);
2215 		FAIL_IF(!inst);
2216 		*inst |= mode;
2217 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2218 	}
2219 	else {
2220 		/* This case is complex, since ecx itself may be used for
2221 		   addressing, and that case must be supported as well. */
2222 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2223 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2224 		EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), 0, SLJIT_PREF_SHIFT_REG, 0);
2225 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
2226 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2227 		FAIL_IF(!inst);
2228 		*inst |= mode;
2229 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, SLJIT_MEM1(SLJIT_SP), 0);
2230 #else
2231 		EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_PREF_SHIFT_REG, 0);
2232 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
2233 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2234 		FAIL_IF(!inst);
2235 		*inst |= mode;
2236 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG2, 0);
2237 #endif
2238 		if (dst != SLJIT_UNUSED)
2239 			return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
2240 	}
2241 
2242 	return SLJIT_SUCCESS;
2243 }
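/* Illustrative sketch of the generic fallback above (assumed register names): x86
   variable-count shifts only take the count in CL, so SLJIT_PREF_SHIFT_REG is ecx/rcx
   and the emitter has to shuffle ecx around the shift when it is live:
       mov  tmp1, src1      ; value to shift
       mov  save, ecx       ; save ecx (TMP_REG2 on 64-bit, a stack slot on 32-bit)
       mov  ecx, src2       ; count must end up in cl
       shl  tmp1, cl
       mov  ecx, save       ; restore ecx
       mov  dst, tmp1
*/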
2244 
2245 static sljit_s32 emit_shift_with_flags(struct sljit_compiler *compiler,
2246 	sljit_u8 mode, sljit_s32 set_flags,
2247 	sljit_s32 dst, sljit_sw dstw,
2248 	sljit_s32 src1, sljit_sw src1w,
2249 	sljit_s32 src2, sljit_sw src2w)
2250 {
2251 	/* The CPU does not set flags if the shift count is 0. */
2252 	if (src2 & SLJIT_IMM) {
2253 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2254 		if ((src2w & 0x3f) != 0 || (compiler->mode32 && (src2w & 0x1f) != 0))
2255 			return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);
2256 #else
2257 		if ((src2w & 0x1f) != 0)
2258 			return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);
2259 #endif
2260 		if (!set_flags)
2261 			return emit_mov(compiler, dst, dstw, src1, src1w);
2262 		/* OR dst, src, 0 */
2263 		return emit_cum_binary(compiler, BINARY_OPCODE(OR),
2264 			dst, dstw, src1, src1w, SLJIT_IMM, 0);
2265 	}
2266 
2267 	if (!set_flags)
2268 		return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);
2269 
2270 	if (!FAST_IS_REG(dst))
2271 		FAIL_IF(emit_cmp_binary(compiler, src1, src1w, SLJIT_IMM, 0));
2272 
2273 	FAIL_IF(emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w));
2274 
2275 	if (FAST_IS_REG(dst))
2276 		return emit_cmp_binary(compiler, (dst == SLJIT_UNUSED) ? TMP_REG1 : dst, dstw, SLJIT_IMM, 0);
2277 	return SLJIT_SUCCESS;
2278 }
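/* Note on the flag handling above (a reading of the code, not extra documentation):
   a shift by zero leaves EFLAGS untouched. So when flags were requested and the count
   is a masked-to-zero immediate, an "or dst, src1, 0" is emitted instead, which both
   moves the value and sets the flags. For a variable count the value is compared
   against zero explicitly: before the shift when the destination is memory (a zero
   count then keeps those flags valid), or after the shift when it is a register. */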
2279 
2280 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2(struct sljit_compiler *compiler, sljit_s32 op,
2281 	sljit_s32 dst, sljit_sw dstw,
2282 	sljit_s32 src1, sljit_sw src1w,
2283 	sljit_s32 src2, sljit_sw src2w)
2284 {
2285 	CHECK_ERROR();
2286 	CHECK(check_sljit_emit_op2(compiler, op, dst, dstw, src1, src1w, src2, src2w));
2287 	ADJUST_LOCAL_OFFSET(dst, dstw);
2288 	ADJUST_LOCAL_OFFSET(src1, src1w);
2289 	ADJUST_LOCAL_OFFSET(src2, src2w);
2290 
2291 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
2292 	CHECK_EXTRA_REGS(src1, src1w, (void)0);
2293 	CHECK_EXTRA_REGS(src2, src2w, (void)0);
2294 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2295 	compiler->mode32 = op & SLJIT_I32_OP;
2296 #endif
2297 
2298 	if (dst == SLJIT_UNUSED && !HAS_FLAGS(op))
2299 		return SLJIT_SUCCESS;
2300 
2301 	switch (GET_OPCODE(op)) {
2302 	case SLJIT_ADD:
2303 		if (!HAS_FLAGS(op)) {
2304 			if (emit_lea_binary(compiler, dst, dstw, src1, src1w, src2, src2w) != SLJIT_ERR_UNSUPPORTED)
2305 				return compiler->error;
2306 		}
2307 		return emit_cum_binary(compiler, BINARY_OPCODE(ADD),
2308 			dst, dstw, src1, src1w, src2, src2w);
2309 	case SLJIT_ADDC:
2310 		return emit_cum_binary(compiler, BINARY_OPCODE(ADC),
2311 			dst, dstw, src1, src1w, src2, src2w);
2312 	case SLJIT_SUB:
2313 		if (!HAS_FLAGS(op)) {
2314 			if ((src2 & SLJIT_IMM) && emit_lea_binary(compiler, dst, dstw, src1, src1w, SLJIT_IMM, -src2w) != SLJIT_ERR_UNSUPPORTED)
2315 				return compiler->error;
2316 			if (SLOW_IS_REG(dst) && src2 == dst) {
2317 				FAIL_IF(emit_non_cum_binary(compiler, BINARY_OPCODE(SUB), dst, 0, dst, 0, src1, src1w));
2318 				return emit_unary(compiler, NEG_rm, dst, 0, dst, 0);
2319 			}
2320 		}
2321 
2322 		if (dst == SLJIT_UNUSED)
2323 			return emit_cmp_binary(compiler, src1, src1w, src2, src2w);
2324 		return emit_non_cum_binary(compiler, BINARY_OPCODE(SUB),
2325 			dst, dstw, src1, src1w, src2, src2w);
2326 	case SLJIT_SUBC:
2327 		return emit_non_cum_binary(compiler, BINARY_OPCODE(SBB),
2328 			dst, dstw, src1, src1w, src2, src2w);
2329 	case SLJIT_MUL:
2330 		return emit_mul(compiler, dst, dstw, src1, src1w, src2, src2w);
2331 	case SLJIT_AND:
2332 		if (dst == SLJIT_UNUSED)
2333 			return emit_test_binary(compiler, src1, src1w, src2, src2w);
2334 		return emit_cum_binary(compiler, BINARY_OPCODE(AND),
2335 			dst, dstw, src1, src1w, src2, src2w);
2336 	case SLJIT_OR:
2337 		return emit_cum_binary(compiler, BINARY_OPCODE(OR),
2338 			dst, dstw, src1, src1w, src2, src2w);
2339 	case SLJIT_XOR:
2340 		return emit_cum_binary(compiler, BINARY_OPCODE(XOR),
2341 			dst, dstw, src1, src1w, src2, src2w);
2342 	case SLJIT_SHL:
2343 		return emit_shift_with_flags(compiler, SHL, HAS_FLAGS(op),
2344 			dst, dstw, src1, src1w, src2, src2w);
2345 	case SLJIT_LSHR:
2346 		return emit_shift_with_flags(compiler, SHR, HAS_FLAGS(op),
2347 			dst, dstw, src1, src1w, src2, src2w);
2348 	case SLJIT_ASHR:
2349 		return emit_shift_with_flags(compiler, SAR, HAS_FLAGS(op),
2350 			dst, dstw, src1, src1w, src2, src2w);
2351 	}
2352 
2353 	return SLJIT_SUCCESS;
2354 }
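/* Usage sketch for the dispatcher above; the snippet assumes a compiler object and
   function entry created elsewhere, so it is kept compiled out. */
#if 0
	/* R0 = R0 + R1, also requesting the zero flag (flag-less adds may become LEA). */
	sljit_emit_op2(compiler, SLJIT_ADD | SLJIT_SET_Z, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_R1, 0);
	/* R2 = R0 * 10; multiplications are always routed through emit_mul(). */
	sljit_emit_op2(compiler, SLJIT_MUL, SLJIT_R2, 0, SLJIT_R0, 0, SLJIT_IMM, 10);
#endif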
2355 
2356 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_src(struct sljit_compiler *compiler, sljit_s32 op,
2357 	sljit_s32 src, sljit_sw srcw)
2358 {
2359 	CHECK_ERROR();
2360 	CHECK(check_sljit_emit_op_src(compiler, op, src, srcw));
2361 	ADJUST_LOCAL_OFFSET(src, srcw);
2362 
2363 	CHECK_EXTRA_REGS(src, srcw, (void)0);
2364 
2365 	switch (op) {
2366 	case SLJIT_FAST_RETURN:
2367 		return emit_fast_return(compiler, src, srcw);
2368 	case SLJIT_SKIP_FRAMES_BEFORE_FAST_RETURN:
2369 		/* Don't adjust the shadow stack if it isn't enabled. */
2370 		if (!cpu_has_shadow_stack ())
2371 			return SLJIT_SUCCESS;
2372 		return adjust_shadow_stack(compiler, src, srcw, SLJIT_UNUSED, 0);
2373 	case SLJIT_PREFETCH_L1:
2374 	case SLJIT_PREFETCH_L2:
2375 	case SLJIT_PREFETCH_L3:
2376 	case SLJIT_PREFETCH_ONCE:
2377 		return emit_prefetch(compiler, op, src, srcw);
2378 	}
2379 
2380 	return SLJIT_SUCCESS;
2381 }
2382 
2383 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_register_index(sljit_s32 reg)
2384 {
2385 	CHECK_REG_INDEX(check_sljit_get_register_index(reg));
2386 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2387 	if (reg >= SLJIT_R3 && reg <= SLJIT_R8)
2388 		return -1;
2389 #endif
2390 	return reg_map[reg];
2391 }
2392 
2393 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_float_register_index(sljit_s32 reg)
2394 {
2395 	CHECK_REG_INDEX(check_sljit_get_float_register_index(reg));
2396 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2397 	return reg;
2398 #else
2399 	return freg_map[reg];
2400 #endif
2401 }
2402 
2403 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_custom(struct sljit_compiler *compiler,
2404 	void *instruction, sljit_s32 size)
2405 {
2406 	sljit_u8 *inst;
2407 
2408 	CHECK_ERROR();
2409 	CHECK(check_sljit_emit_op_custom(compiler, instruction, size));
2410 
2411 	inst = (sljit_u8*)ensure_buf(compiler, 1 + size);
2412 	FAIL_IF(!inst);
2413 	INC_SIZE(size);
2414 	SLJIT_MEMCPY(inst, instruction, size);
2415 	return SLJIT_SUCCESS;
2416 }
2417 
2418 /* --------------------------------------------------------------------- */
2419 /*  Floating point operators                                             */
2420 /* --------------------------------------------------------------------- */
2421 
2422 /* Alignment(3) + 4 * 16 bytes. */
2423 static sljit_s32 sse2_data[3 + (4 * 4)];
2424 static sljit_s32 *sse2_buffer;
2425 
2426 static void init_compiler(void)
2427 {
2428 	/* Align to 16 bytes. */
2429 	sse2_buffer = (sljit_s32*)(((sljit_uw)sse2_data + 15) & ~0xf);
2430 
2431 	/* Single precision constants (each constant is 16 bytes long). */
2432 	sse2_buffer[0] = 0x80000000;
2433 	sse2_buffer[4] = 0x7fffffff;
2434 	/* Double precision constants (each constant is 16 bytes long). */
2435 	sse2_buffer[8] = 0;
2436 	sse2_buffer[9] = 0x80000000;
2437 	sse2_buffer[12] = 0xffffffff;
2438 	sse2_buffer[13] = 0x7fffffff;
2439 }
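/* Layout of the buffer initialised above (each constant sits in its own 16-byte,
   16-byte-aligned slot; only the low lane is meaningful for the scalar operations):
       sse2_buffer + 0  : 0x80000000            single precision sign mask
       sse2_buffer + 4  : 0x7fffffff            single precision abs mask
       sse2_buffer + 8  : 0x8000000000000000    double precision sign mask
       sse2_buffer + 12 : 0x7fffffffffffffff    double precision abs mask
   sljit_emit_fop1() applies them with XORPD (SLJIT_NEG_F64) and ANDPD (SLJIT_ABS_F64)
   through SLJIT_MEM0(). */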
2440 
2441 static sljit_s32 emit_sse2(struct sljit_compiler *compiler, sljit_u8 opcode,
2442 	sljit_s32 single, sljit_s32 xmm1, sljit_s32 xmm2, sljit_sw xmm2w)
2443 {
2444 	sljit_u8 *inst;
2445 
2446 	inst = emit_x86_instruction(compiler, 2 | (single ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, xmm1, 0, xmm2, xmm2w);
2447 	FAIL_IF(!inst);
2448 	*inst++ = GROUP_0F;
2449 	*inst = opcode;
2450 	return SLJIT_SUCCESS;
2451 }
2452 
2453 static sljit_s32 emit_sse2_logic(struct sljit_compiler *compiler, sljit_u8 opcode,
2454 	sljit_s32 pref66, sljit_s32 xmm1, sljit_s32 xmm2, sljit_sw xmm2w)
2455 {
2456 	sljit_u8 *inst;
2457 
2458 	inst = emit_x86_instruction(compiler, 2 | (pref66 ? EX86_PREF_66 : 0) | EX86_SSE2, xmm1, 0, xmm2, xmm2w);
2459 	FAIL_IF(!inst);
2460 	*inst++ = GROUP_0F;
2461 	*inst = opcode;
2462 	return SLJIT_SUCCESS;
2463 }
2464 
2465 static SLJIT_INLINE sljit_s32 emit_sse2_load(struct sljit_compiler *compiler,
2466 	sljit_s32 single, sljit_s32 dst, sljit_s32 src, sljit_sw srcw)
2467 {
2468 	return emit_sse2(compiler, MOVSD_x_xm, single, dst, src, srcw);
2469 }
2470 
2471 static SLJIT_INLINE sljit_s32 emit_sse2_store(struct sljit_compiler *compiler,
2472 	sljit_s32 single, sljit_s32 dst, sljit_sw dstw, sljit_s32 src)
2473 {
2474 	return emit_sse2(compiler, MOVSD_xm_x, single, src, dst, dstw);
2475 }
2476 
2477 static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_sw_from_f64(struct sljit_compiler *compiler, sljit_s32 op,
2478 	sljit_s32 dst, sljit_sw dstw,
2479 	sljit_s32 src, sljit_sw srcw)
2480 {
2481 	sljit_s32 dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
2482 	sljit_u8 *inst;
2483 
2484 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2485 	if (GET_OPCODE(op) == SLJIT_CONV_SW_FROM_F64)
2486 		compiler->mode32 = 0;
2487 #endif
2488 
2489 	inst = emit_x86_instruction(compiler, 2 | ((op & SLJIT_F32_OP) ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2_OP2, dst_r, 0, src, srcw);
2490 	FAIL_IF(!inst);
2491 	*inst++ = GROUP_0F;
2492 	*inst = CVTTSD2SI_r_xm;
2493 
2494 	if (dst & SLJIT_MEM)
2495 		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
2496 	return SLJIT_SUCCESS;
2497 }
2498 
2499 static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_f64_from_sw(struct sljit_compiler *compiler, sljit_s32 op,
2500 	sljit_s32 dst, sljit_sw dstw,
2501 	sljit_s32 src, sljit_sw srcw)
2502 {
2503 	sljit_s32 dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG;
2504 	sljit_u8 *inst;
2505 
2506 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2507 	if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_SW)
2508 		compiler->mode32 = 0;
2509 #endif
2510 
2511 	if (src & SLJIT_IMM) {
2512 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2513 		if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_S32)
2514 			srcw = (sljit_s32)srcw;
2515 #endif
2516 		EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
2517 		src = TMP_REG1;
2518 		srcw = 0;
2519 	}
2520 
2521 	inst = emit_x86_instruction(compiler, 2 | ((op & SLJIT_F32_OP) ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2_OP1, dst_r, 0, src, srcw);
2522 	FAIL_IF(!inst);
2523 	*inst++ = GROUP_0F;
2524 	*inst = CVTSI2SD_x_rm;
2525 
2526 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2527 	compiler->mode32 = 1;
2528 #endif
2529 	if (dst_r == TMP_FREG)
2530 		return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, TMP_FREG);
2531 	return SLJIT_SUCCESS;
2532 }
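/* Usage sketch for the two conversion helpers above (compiled out; assumes a compiler
   set up elsewhere). They back the public opcodes via sljit_emit_fop1(). */
#if 0
	/* R0 = (sljit_sw)FR0, truncated towards zero (CVTTSD2SI). */
	sljit_emit_fop1(compiler, SLJIT_CONV_SW_FROM_F64, SLJIT_R0, 0, SLJIT_FR0, 0);
	/* FR1 = (double)R0 (CVTSI2SD). */
	sljit_emit_fop1(compiler, SLJIT_CONV_F64_FROM_SW, SLJIT_FR1, 0, SLJIT_R0, 0);
#endif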
2533 
2534 static SLJIT_INLINE sljit_s32 sljit_emit_fop1_cmp(struct sljit_compiler *compiler, sljit_s32 op,
2535 	sljit_s32 src1, sljit_sw src1w,
2536 	sljit_s32 src2, sljit_sw src2w)
2537 {
2538 	if (!FAST_IS_REG(src1)) {
2539 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, TMP_FREG, src1, src1w));
2540 		src1 = TMP_FREG;
2541 	}
2542 
2543 	return emit_sse2_logic(compiler, UCOMISD_x_xm, !(op & SLJIT_F32_OP), src1, src2, src2w);
2544 }
2545 
2546 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop1(struct sljit_compiler *compiler, sljit_s32 op,
2547 	sljit_s32 dst, sljit_sw dstw,
2548 	sljit_s32 src, sljit_sw srcw)
2549 {
2550 	sljit_s32 dst_r;
2551 
2552 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2553 	compiler->mode32 = 1;
2554 #endif
2555 
2556 	CHECK_ERROR();
2557 	SELECT_FOP1_OPERATION_WITH_CHECKS(compiler, op, dst, dstw, src, srcw);
2558 
2559 	if (GET_OPCODE(op) == SLJIT_MOV_F64) {
2560 		if (FAST_IS_REG(dst))
2561 			return emit_sse2_load(compiler, op & SLJIT_F32_OP, dst, src, srcw);
2562 		if (FAST_IS_REG(src))
2563 			return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, src);
2564 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, TMP_FREG, src, srcw));
2565 		return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, TMP_FREG);
2566 	}
2567 
2568 	if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_F32) {
2569 		dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG;
2570 		if (FAST_IS_REG(src)) {
2571 			/* We overwrite the high bits of the source. From the SLJIT point of view,
2572 			   this is not an issue.
2573 			   Note: In SSE3, we could also use MOVDDUP and MOVSLDUP. */
2574 			FAIL_IF(emit_sse2_logic(compiler, UNPCKLPD_x_xm, op & SLJIT_F32_OP, src, src, 0));
2575 		}
2576 		else {
2577 			FAIL_IF(emit_sse2_load(compiler, !(op & SLJIT_F32_OP), TMP_FREG, src, srcw));
2578 			src = TMP_FREG;
2579 		}
2580 
2581 		FAIL_IF(emit_sse2_logic(compiler, CVTPD2PS_x_xm, op & SLJIT_F32_OP, dst_r, src, 0));
2582 		if (dst_r == TMP_FREG)
2583 			return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, TMP_FREG);
2584 		return SLJIT_SUCCESS;
2585 	}
2586 
2587 	if (FAST_IS_REG(dst)) {
2588 		dst_r = dst;
2589 		if (dst != src)
2590 			FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, dst_r, src, srcw));
2591 	}
2592 	else {
2593 		dst_r = TMP_FREG;
2594 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, dst_r, src, srcw));
2595 	}
2596 
2597 	switch (GET_OPCODE(op)) {
2598 	case SLJIT_NEG_F64:
2599 		FAIL_IF(emit_sse2_logic(compiler, XORPD_x_xm, 1, dst_r, SLJIT_MEM0(), (sljit_sw)(op & SLJIT_F32_OP ? sse2_buffer : sse2_buffer + 8)));
2600 		break;
2601 
2602 	case SLJIT_ABS_F64:
2603 		FAIL_IF(emit_sse2_logic(compiler, ANDPD_x_xm, 1, dst_r, SLJIT_MEM0(), (sljit_sw)(op & SLJIT_F32_OP ? sse2_buffer + 4 : sse2_buffer + 12)));
2604 		break;
2605 	}
2606 
2607 	if (dst_r == TMP_FREG)
2608 		return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, TMP_FREG);
2609 	return SLJIT_SUCCESS;
2610 }
2611 
2612 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop2(struct sljit_compiler *compiler, sljit_s32 op,
2613 	sljit_s32 dst, sljit_sw dstw,
2614 	sljit_s32 src1, sljit_sw src1w,
2615 	sljit_s32 src2, sljit_sw src2w)
2616 {
2617 	sljit_s32 dst_r;
2618 
2619 	CHECK_ERROR();
2620 	CHECK(check_sljit_emit_fop2(compiler, op, dst, dstw, src1, src1w, src2, src2w));
2621 	ADJUST_LOCAL_OFFSET(dst, dstw);
2622 	ADJUST_LOCAL_OFFSET(src1, src1w);
2623 	ADJUST_LOCAL_OFFSET(src2, src2w);
2624 
2625 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2626 	compiler->mode32 = 1;
2627 #endif
2628 
2629 	if (FAST_IS_REG(dst)) {
2630 		dst_r = dst;
2631 		if (dst == src1)
2632 			; /* Do nothing here. */
2633 		else if (dst == src2 && (op == SLJIT_ADD_F64 || op == SLJIT_MUL_F64)) {
2634 			/* Swap arguments. */
2635 			src2 = src1;
2636 			src2w = src1w;
2637 		}
2638 		else if (dst != src2)
2639 			FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, dst_r, src1, src1w));
2640 		else {
2641 			dst_r = TMP_FREG;
2642 			FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, TMP_FREG, src1, src1w));
2643 		}
2644 	}
2645 	else {
2646 		dst_r = TMP_FREG;
2647 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, TMP_FREG, src1, src1w));
2648 	}
2649 
2650 	switch (GET_OPCODE(op)) {
2651 	case SLJIT_ADD_F64:
2652 		FAIL_IF(emit_sse2(compiler, ADDSD_x_xm, op & SLJIT_F32_OP, dst_r, src2, src2w));
2653 		break;
2654 
2655 	case SLJIT_SUB_F64:
2656 		FAIL_IF(emit_sse2(compiler, SUBSD_x_xm, op & SLJIT_F32_OP, dst_r, src2, src2w));
2657 		break;
2658 
2659 	case SLJIT_MUL_F64:
2660 		FAIL_IF(emit_sse2(compiler, MULSD_x_xm, op & SLJIT_F32_OP, dst_r, src2, src2w));
2661 		break;
2662 
2663 	case SLJIT_DIV_F64:
2664 		FAIL_IF(emit_sse2(compiler, DIVSD_x_xm, op & SLJIT_F32_OP, dst_r, src2, src2w));
2665 		break;
2666 	}
2667 
2668 	if (dst_r == TMP_FREG)
2669 		return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, TMP_FREG);
2670 	return SLJIT_SUCCESS;
2671 }
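/* Note on the operand handling above: SLJIT_ADD_F64 and SLJIT_MUL_F64 are commutative,
   so when dst == src2 the operands are simply swapped and the destination register is
   used in place. SLJIT_SUB_F64 and SLJIT_DIV_F64 are not, so that case falls back to
   building the result in TMP_FREG and storing it afterwards. */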
2672 
2673 /* --------------------------------------------------------------------- */
2674 /*  Conditional instructions                                             */
2675 /* --------------------------------------------------------------------- */
2676 
2677 SLJIT_API_FUNC_ATTRIBUTE struct sljit_label* sljit_emit_label(struct sljit_compiler *compiler)
2678 {
2679 	sljit_u8 *inst;
2680 	struct sljit_label *label;
2681 
2682 	CHECK_ERROR_PTR();
2683 	CHECK_PTR(check_sljit_emit_label(compiler));
2684 
2685 	if (compiler->last_label && compiler->last_label->size == compiler->size)
2686 		return compiler->last_label;
2687 
2688 	label = (struct sljit_label*)ensure_abuf(compiler, sizeof(struct sljit_label));
2689 	PTR_FAIL_IF(!label);
2690 	set_label(label, compiler);
2691 
2692 	inst = (sljit_u8*)ensure_buf(compiler, 2);
2693 	PTR_FAIL_IF(!inst);
2694 
2695 	*inst++ = 0;
2696 	*inst++ = 0;
2697 
2698 	return label;
2699 }
2700 
2701 SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_jump(struct sljit_compiler *compiler, sljit_s32 type)
2702 {
2703 	sljit_u8 *inst;
2704 	struct sljit_jump *jump;
2705 
2706 	CHECK_ERROR_PTR();
2707 	CHECK_PTR(check_sljit_emit_jump(compiler, type));
2708 
2709 	jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
2710 	PTR_FAIL_IF_NULL(jump);
2711 	set_jump(jump, compiler, (type & SLJIT_REWRITABLE_JUMP) | ((type & 0xff) << TYPE_SHIFT));
2712 	type &= 0xff;
2713 
2714 	/* Worst case size. */
2715 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2716 	compiler->size += (type >= SLJIT_JUMP) ? 5 : 6;
2717 #else
2718 	compiler->size += (type >= SLJIT_JUMP) ? (10 + 3) : (2 + 10 + 3);
2719 #endif
2720 
2721 	inst = (sljit_u8*)ensure_buf(compiler, 2);
2722 	PTR_FAIL_IF_NULL(inst);
2723 
2724 	*inst++ = 0;
2725 	*inst++ = 1;
2726 	return jump;
2727 }
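/* Sketch of the worst-case size reservation above (an informal reading, not a spec):
     x86-32: jmp rel32 is 5 bytes (E9 + rel32); a conditional jcc rel32 is 6 (0F 8x + rel32).
     x86-64: a target further than +/-2GB needs "mov reg, imm64" (10 bytes) followed by an
             indirect jmp/call through that register (up to 3 bytes with a REX prefix),
             plus 2 more bytes for the short inverted jcc that skips it when the jump is
             conditional - matching the 10 + 3 and 2 + 10 + 3 totals. */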
2728 
2729 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_ijump(struct sljit_compiler *compiler, sljit_s32 type, sljit_s32 src, sljit_sw srcw)
2730 {
2731 	sljit_u8 *inst;
2732 	struct sljit_jump *jump;
2733 
2734 	CHECK_ERROR();
2735 	CHECK(check_sljit_emit_ijump(compiler, type, src, srcw));
2736 	ADJUST_LOCAL_OFFSET(src, srcw);
2737 
2738 	CHECK_EXTRA_REGS(src, srcw, (void)0);
2739 
2740 	if (src == SLJIT_IMM) {
2741 		jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
2742 		FAIL_IF_NULL(jump);
2743 		set_jump(jump, compiler, JUMP_ADDR | (type << TYPE_SHIFT));
2744 		jump->u.target = srcw;
2745 
2746 		/* Worst case size. */
2747 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2748 		compiler->size += 5;
2749 #else
2750 		compiler->size += 10 + 3;
2751 #endif
2752 
2753 		inst = (sljit_u8*)ensure_buf(compiler, 2);
2754 		FAIL_IF_NULL(inst);
2755 
2756 		*inst++ = 0;
2757 		*inst++ = 1;
2758 	}
2759 	else {
2760 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2761 		/* REX_W is not necessary (src is not immediate). */
2762 		compiler->mode32 = 1;
2763 #endif
2764 		inst = emit_x86_instruction(compiler, 1, 0, 0, src, srcw);
2765 		FAIL_IF(!inst);
2766 		*inst++ = GROUP_FF;
2767 		*inst |= (type >= SLJIT_FAST_CALL) ? CALL_rm : JMP_rm;
2768 	}
2769 	return SLJIT_SUCCESS;
2770 }
2771 
2772 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *compiler, sljit_s32 op,
2773 	sljit_s32 dst, sljit_sw dstw,
2774 	sljit_s32 type)
2775 {
2776 	sljit_u8 *inst;
2777 	sljit_u8 cond_set = 0;
2778 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2779 	sljit_s32 reg;
2780 #endif
2781 	/* ADJUST_LOCAL_OFFSET and CHECK_EXTRA_REGS might overwrite these values. */
2782 	sljit_s32 dst_save = dst;
2783 	sljit_sw dstw_save = dstw;
2784 
2785 	CHECK_ERROR();
2786 	CHECK(check_sljit_emit_op_flags(compiler, op, dst, dstw, type));
2787 
2788 	ADJUST_LOCAL_OFFSET(dst, dstw);
2789 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
2790 
2791 	type &= 0xff;
2792 	/* setcc = jcc + 0x10. */
2793 	cond_set = get_jump_code(type) + 0x10;
2794 
2795 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2796 	if (GET_OPCODE(op) == SLJIT_OR && !GET_ALL_FLAGS(op) && FAST_IS_REG(dst)) {
2797 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 4 + 3);
2798 		FAIL_IF(!inst);
2799 		INC_SIZE(4 + 3);
2800 		/* Set low register to conditional flag. */
2801 		*inst++ = (reg_map[TMP_REG1] <= 7) ? REX : REX_B;
2802 		*inst++ = GROUP_0F;
2803 		*inst++ = cond_set;
2804 		*inst++ = MOD_REG | reg_lmap[TMP_REG1];
2805 		*inst++ = REX | (reg_map[TMP_REG1] <= 7 ? 0 : REX_R) | (reg_map[dst] <= 7 ? 0 : REX_B);
2806 		*inst++ = OR_rm8_r8;
2807 		*inst++ = MOD_REG | (reg_lmap[TMP_REG1] << 3) | reg_lmap[dst];
2808 		return SLJIT_SUCCESS;
2809 	}
2810 
2811 	reg = (GET_OPCODE(op) < SLJIT_ADD && FAST_IS_REG(dst)) ? dst : TMP_REG1;
2812 
2813 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 4 + 4);
2814 	FAIL_IF(!inst);
2815 	INC_SIZE(4 + 4);
2816 	/* Set low register to conditional flag. */
2817 	*inst++ = (reg_map[reg] <= 7) ? REX : REX_B;
2818 	*inst++ = GROUP_0F;
2819 	*inst++ = cond_set;
2820 	*inst++ = MOD_REG | reg_lmap[reg];
2821 	*inst++ = REX_W | (reg_map[reg] <= 7 ? 0 : (REX_B | REX_R));
2822 	/* The movzx instruction does not affect flags. */
2823 	*inst++ = GROUP_0F;
2824 	*inst++ = MOVZX_r_rm8;
2825 	*inst = MOD_REG | (reg_lmap[reg] << 3) | reg_lmap[reg];
2826 
2827 	if (reg != TMP_REG1)
2828 		return SLJIT_SUCCESS;
2829 
2830 	if (GET_OPCODE(op) < SLJIT_ADD) {
2831 		compiler->mode32 = GET_OPCODE(op) != SLJIT_MOV;
2832 		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
2833 	}
2834 
2835 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
2836 		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
2837 	compiler->skip_checks = 1;
2838 #endif
2839 	return sljit_emit_op2(compiler, op, dst_save, dstw_save, dst_save, dstw_save, TMP_REG1, 0);
2840 
2841 #else
2842 	/* The SLJIT_CONFIG_X86_32 code path starts here. */
2843 	if (GET_OPCODE(op) < SLJIT_ADD && FAST_IS_REG(dst)) {
2844 		if (reg_map[dst] <= 4) {
2845 			/* Low byte is accessible. */
2846 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 3 + 3);
2847 			FAIL_IF(!inst);
2848 			INC_SIZE(3 + 3);
2849 			/* Set low byte to conditional flag. */
2850 			*inst++ = GROUP_0F;
2851 			*inst++ = cond_set;
2852 			*inst++ = MOD_REG | reg_map[dst];
2853 
2854 			*inst++ = GROUP_0F;
2855 			*inst++ = MOVZX_r_rm8;
2856 			*inst = MOD_REG | (reg_map[dst] << 3) | reg_map[dst];
2857 			return SLJIT_SUCCESS;
2858 		}
2859 
2860 		/* Low byte is not accessible. */
2861 		if (cpu_has_cmov == -1)
2862 			get_cpu_features();
2863 
2864 		if (cpu_has_cmov) {
2865 			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, 1);
2866 			/* A xor reg, reg operation would overwrite the flags. */
2867 			EMIT_MOV(compiler, dst, 0, SLJIT_IMM, 0);
2868 
2869 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 3);
2870 			FAIL_IF(!inst);
2871 			INC_SIZE(3);
2872 
2873 			*inst++ = GROUP_0F;
2874 			/* cmovcc = setcc - 0x50. */
2875 			*inst++ = cond_set - 0x50;
2876 			*inst++ = MOD_REG | (reg_map[dst] << 3) | reg_map[TMP_REG1];
2877 			return SLJIT_SUCCESS;
2878 		}
2879 
2880 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 1 + 3 + 3 + 1);
2881 		FAIL_IF(!inst);
2882 		INC_SIZE(1 + 3 + 3 + 1);
2883 		*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
2884 		/* Set al to conditional flag. */
2885 		*inst++ = GROUP_0F;
2886 		*inst++ = cond_set;
2887 		*inst++ = MOD_REG | 0 /* eax */;
2888 
2889 		*inst++ = GROUP_0F;
2890 		*inst++ = MOVZX_r_rm8;
2891 		*inst++ = MOD_REG | (reg_map[dst] << 3) | 0 /* eax */;
2892 		*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
2893 		return SLJIT_SUCCESS;
2894 	}
2895 
2896 	if (GET_OPCODE(op) == SLJIT_OR && !GET_ALL_FLAGS(op) && FAST_IS_REG(dst) && reg_map[dst] <= 4) {
2897 		SLJIT_ASSERT(reg_map[SLJIT_R0] == 0);
2898 
2899 		if (dst != SLJIT_R0) {
2900 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 1 + 3 + 2 + 1);
2901 			FAIL_IF(!inst);
2902 			INC_SIZE(1 + 3 + 2 + 1);
2903 			/* Set low register to conditional flag. */
2904 			*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
2905 			*inst++ = GROUP_0F;
2906 			*inst++ = cond_set;
2907 			*inst++ = MOD_REG | 0 /* eax */;
2908 			*inst++ = OR_rm8_r8;
2909 			*inst++ = MOD_REG | (0 /* eax */ << 3) | reg_map[dst];
2910 			*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
2911 		}
2912 		else {
2913 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 2 + 3 + 2 + 2);
2914 			FAIL_IF(!inst);
2915 			INC_SIZE(2 + 3 + 2 + 2);
2916 			/* Set low register to conditional flag. */
2917 			*inst++ = XCHG_r_rm;
2918 			*inst++ = MOD_REG | (1 /* ecx */ << 3) | reg_map[TMP_REG1];
2919 			*inst++ = GROUP_0F;
2920 			*inst++ = cond_set;
2921 			*inst++ = MOD_REG | 1 /* ecx */;
2922 			*inst++ = OR_rm8_r8;
2923 			*inst++ = MOD_REG | (1 /* ecx */ << 3) | 0 /* eax */;
2924 			*inst++ = XCHG_r_rm;
2925 			*inst++ = MOD_REG | (1 /* ecx */ << 3) | reg_map[TMP_REG1];
2926 		}
2927 		return SLJIT_SUCCESS;
2928 	}
2929 
2930 	/* Set TMP_REG1 to the bit. */
2931 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 1 + 3 + 3 + 1);
2932 	FAIL_IF(!inst);
2933 	INC_SIZE(1 + 3 + 3 + 1);
2934 	*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
2935 	/* Set al to conditional flag. */
2936 	*inst++ = GROUP_0F;
2937 	*inst++ = cond_set;
2938 	*inst++ = MOD_REG | 0 /* eax */;
2939 
2940 	*inst++ = GROUP_0F;
2941 	*inst++ = MOVZX_r_rm8;
2942 	*inst++ = MOD_REG | (0 << 3) /* eax */ | 0 /* eax */;
2943 
2944 	*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
2945 
2946 	if (GET_OPCODE(op) < SLJIT_ADD)
2947 		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
2948 
2949 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
2950 		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
2951 	compiler->skip_checks = 1;
2952 #endif
2953 	return sljit_emit_op2(compiler, op, dst_save, dstw_save, dst_save, dstw_save, TMP_REG1, 0);
2954 #endif /* SLJIT_CONFIG_X86_64 */
2955 }
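/* Encoding note and usage sketch for the SETcc paths above: the long jcc opcodes are
   0F 80+cc and the setcc opcodes are 0F 90+cc, hence "setcc = jcc + 0x10"; likewise
   cmovcc is 0F 40+cc. On x86-32 only eax..ebx have byte-addressable low halves, which
   is why destinations mapped elsewhere go through eax (xchg) or a cmov. The compiled-out
   snippet below assumes a compiler created elsewhere. */
#if 0
	/* R0 = (R1 == R2) ? 1 : 0 */
	sljit_emit_op2(compiler, SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, SLJIT_R1, 0, SLJIT_R2, 0);
	sljit_emit_op_flags(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_EQUAL);
#endif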
2956 
2957 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_cmov(struct sljit_compiler *compiler, sljit_s32 type,
2958 	sljit_s32 dst_reg,
2959 	sljit_s32 src, sljit_sw srcw)
2960 {
2961 	sljit_u8* inst;
2962 
2963 	CHECK_ERROR();
2964 	CHECK(check_sljit_emit_cmov(compiler, type, dst_reg, src, srcw));
2965 
2966 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2967 	dst_reg &= ~SLJIT_I32_OP;
2968 
2969 	if (!sljit_has_cpu_feature(SLJIT_HAS_CMOV) || (dst_reg >= SLJIT_R3 && dst_reg <= SLJIT_S3))
2970 		return sljit_emit_cmov_generic(compiler, type, dst_reg, src, srcw);
2971 #else
2972 	if (!sljit_has_cpu_feature(SLJIT_HAS_CMOV))
2973 		return sljit_emit_cmov_generic(compiler, type, dst_reg, src, srcw);
2974 #endif
2975 
2976 	/* ADJUST_LOCAL_OFFSET is not needed. */
2977 	CHECK_EXTRA_REGS(src, srcw, (void)0);
2978 
2979 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2980 	compiler->mode32 = dst_reg & SLJIT_I32_OP;
2981 	dst_reg &= ~SLJIT_I32_OP;
2982 #endif
2983 
2984 	if (SLJIT_UNLIKELY(src & SLJIT_IMM)) {
2985 		EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, srcw);
2986 		src = TMP_REG1;
2987 		srcw = 0;
2988 	}
2989 
2990 	inst = emit_x86_instruction(compiler, 2, dst_reg, 0, src, srcw);
2991 	FAIL_IF(!inst);
2992 	*inst++ = GROUP_0F;
2993 	*inst = get_jump_code(type & 0xff) - 0x40;
2994 	return SLJIT_SUCCESS;
2995 }
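/* Usage sketch for sljit_emit_cmov() (compiled out; assumes a compiler created
   elsewhere). The emitted opcode byte is 0F 40+cc, i.e. get_jump_code(type) - 0x40. */
#if 0
	/* R0 = unsigned min(R0, R1): set LESS if R1 < R0, then conditionally move. */
	sljit_emit_op2(compiler, SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, SLJIT_R1, 0, SLJIT_R0, 0);
	sljit_emit_cmov(compiler, SLJIT_LESS, SLJIT_R0, SLJIT_R1, 0);
#endif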
2996 
2997 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_local_base(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw offset)
2998 {
2999 	CHECK_ERROR();
3000 	CHECK(check_sljit_get_local_base(compiler, dst, dstw, offset));
3001 	ADJUST_LOCAL_OFFSET(dst, dstw);
3002 
3003 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
3004 
3005 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3006 	compiler->mode32 = 0;
3007 #endif
3008 
3009 	ADJUST_LOCAL_OFFSET(SLJIT_MEM1(SLJIT_SP), offset);
3010 
3011 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3012 	if (NOT_HALFWORD(offset)) {
3013 		FAIL_IF(emit_load_imm64(compiler, TMP_REG1, offset));
3014 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
3015 		SLJIT_ASSERT(emit_lea_binary(compiler, dst, dstw, SLJIT_SP, 0, TMP_REG1, 0) != SLJIT_ERR_UNSUPPORTED);
3016 		return compiler->error;
3017 #else
3018 		return emit_lea_binary(compiler, dst, dstw, SLJIT_SP, 0, TMP_REG1, 0);
3019 #endif
3020 	}
3021 #endif
3022 
3023 	if (offset != 0)
3024 		return emit_lea_binary(compiler, dst, dstw, SLJIT_SP, 0, SLJIT_IMM, offset);
3025 	return emit_mov(compiler, dst, dstw, SLJIT_SP, 0);
3026 }
3027 
3028 SLJIT_API_FUNC_ATTRIBUTE struct sljit_const* sljit_emit_const(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw init_value)
3029 {
3030 	sljit_u8 *inst;
3031 	struct sljit_const *const_;
3032 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3033 	sljit_s32 reg;
3034 #endif
3035 
3036 	CHECK_ERROR_PTR();
3037 	CHECK_PTR(check_sljit_emit_const(compiler, dst, dstw, init_value));
3038 	ADJUST_LOCAL_OFFSET(dst, dstw);
3039 
3040 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
3041 
3042 	const_ = (struct sljit_const*)ensure_abuf(compiler, sizeof(struct sljit_const));
3043 	PTR_FAIL_IF(!const_);
3044 	set_const(const_, compiler);
3045 
3046 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3047 	compiler->mode32 = 0;
3048 	reg = FAST_IS_REG(dst) ? dst : TMP_REG1;
3049 
3050 	if (emit_load_imm64(compiler, reg, init_value))
3051 		return NULL;
3052 #else
3053 	if (emit_mov(compiler, dst, dstw, SLJIT_IMM, init_value))
3054 		return NULL;
3055 #endif
3056 
3057 	inst = (sljit_u8*)ensure_buf(compiler, 2);
3058 	PTR_FAIL_IF(!inst);
3059 
3060 	*inst++ = 0;
3061 	*inst++ = 2;
3062 
3063 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3064 	if (dst & SLJIT_MEM)
3065 		if (emit_mov(compiler, dst, dstw, TMP_REG1, 0))
3066 			return NULL;
3067 #endif
3068 
3069 	return const_;
3070 }
3071 
3072 SLJIT_API_FUNC_ATTRIBUTE struct sljit_put_label* sljit_emit_put_label(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw)
3073 {
3074 	struct sljit_put_label *put_label;
3075 	sljit_u8 *inst;
3076 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3077 	sljit_s32 reg;
3078 	sljit_uw start_size;
3079 #endif
3080 
3081 	CHECK_ERROR_PTR();
3082 	CHECK_PTR(check_sljit_emit_put_label(compiler, dst, dstw));
3083 	ADJUST_LOCAL_OFFSET(dst, dstw);
3084 
3085 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
3086 
3087 	put_label = (struct sljit_put_label*)ensure_abuf(compiler, sizeof(struct sljit_put_label));
3088 	PTR_FAIL_IF(!put_label);
3089 	set_put_label(put_label, compiler, 0);
3090 
3091 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3092 	compiler->mode32 = 0;
3093 	reg = FAST_IS_REG(dst) ? dst : TMP_REG1;
3094 
3095 	if (emit_load_imm64(compiler, reg, 0))
3096 		return NULL;
3097 #else
3098 	if (emit_mov(compiler, dst, dstw, SLJIT_IMM, 0))
3099 		return NULL;
3100 #endif
3101 
3102 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3103 	if (dst & SLJIT_MEM) {
3104 		start_size = compiler->size;
3105 		if (emit_mov(compiler, dst, dstw, TMP_REG1, 0))
3106 			return NULL;
3107 		put_label->flags = compiler->size - start_size;
3108 	}
3109 #endif
3110 
3111 	inst = (sljit_u8*)ensure_buf(compiler, 2);
3112 	PTR_FAIL_IF(!inst);
3113 
3114 	*inst++ = 0;
3115 	*inst++ = 3;
3116 
3117 	return put_label;
3118 }
3119 
3120 SLJIT_API_FUNC_ATTRIBUTE void sljit_set_jump_addr(sljit_uw addr, sljit_uw new_target, sljit_sw executable_offset)
3121 {
3122 	SLJIT_UNUSED_ARG(executable_offset);
3123 
3124 	SLJIT_UPDATE_WX_FLAGS((void*)addr, (void*)(addr + sizeof(sljit_uw)), 0);
3125 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
3126 	sljit_unaligned_store_sw((void*)addr, new_target - (addr + 4) - (sljit_uw)executable_offset);
3127 #else
3128 	sljit_unaligned_store_sw((void*)addr, (sljit_sw) new_target);
3129 #endif
3130 	SLJIT_UPDATE_WX_FLAGS((void*)addr, (void*)(addr + sizeof(sljit_uw)), 1);
3131 }
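/* Worked example for the patch above (illustrative numbers): on x86-32 the patched
   field is the rel32 operand of a jump/call, which is relative to the end of that
   4-byte field at run time. With executable_offset == 0, addr == 0x1000 and
   new_target == 0x1050 the stored value is 0x1050 - (0x1000 + 4) = 0x4c. On x86-64
   the field is the imm64 of a "mov reg, imm64", so the absolute target is stored
   unchanged. */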
3132 
3133 SLJIT_API_FUNC_ATTRIBUTE void sljit_set_const(sljit_uw addr, sljit_sw new_constant, sljit_sw executable_offset)
3134 {
3135 	SLJIT_UNUSED_ARG(executable_offset);
3136 
3137 	SLJIT_UPDATE_WX_FLAGS((void*)addr, (void*)(addr + sizeof(sljit_sw)), 0);
3138 	sljit_unaligned_store_sw((void*)addr, new_constant);
3139 	SLJIT_UPDATE_WX_FLAGS((void*)addr, (void*)(addr + sizeof(sljit_sw)), 1);
3140 }
3141