1 /*
2  *    Stack-less Just-In-Time compiler
3  *
4  *    Copyright 2009-2012 Zoltan Herczeg (hzmester@freemail.hu). All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without modification, are
7  * permitted provided that the following conditions are met:
8  *
9  *   1. Redistributions of source code must retain the above copyright notice, this list of
10  *      conditions and the following disclaimer.
11  *
12  *   2. Redistributions in binary form must reproduce the above copyright notice, this list
13  *      of conditions and the following disclaimer in the documentation and/or other materials
14  *      provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) AND CONTRIBUTORS ``AS IS'' AND ANY
17  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
19  * SHALL THE COPYRIGHT HOLDER(S) OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
21  * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
22  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
24  * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  */
26 
sljit_get_platform_name(void)27 SLJIT_API_FUNC_ATTRIBUTE SLJIT_CONST char* sljit_get_platform_name(void)
28 {
29 	return "x86" SLJIT_CPUINFO;
30 }
31 
32 /*
33    32b register indexes:
34      0 - EAX
35      1 - ECX
36      2 - EDX
37      3 - EBX
38      4 - none
39      5 - EBP
40      6 - ESI
41      7 - EDI
42 */
43 
44 /*
45    64b register indexes:
46      0 - RAX
47      1 - RCX
48      2 - RDX
49      3 - RBX
50      4 - none
51      5 - RBP
52      6 - RSI
53      7 - RDI
54      8 - R8   - From now on REX prefix is required
55      9 - R9
56     10 - R10
57     11 - R11
58     12 - R12
59     13 - R13
60     14 - R14
61     15 - R15
62 */
63 
64 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
65 
66 /* Last register + 1. */
67 #define TMP_REG1	(SLJIT_NUMBER_OF_REGISTERS + 2)
68 
69 static SLJIT_CONST sljit_ub reg_map[SLJIT_NUMBER_OF_REGISTERS + 3] = {
70 	0, 0, 2, 1, 0, 0, 0, 0, 7, 6, 3, 4, 5
71 };
72 
73 #define CHECK_EXTRA_REGS(p, w, do) \
74 	if (p >= SLJIT_R3 && p <= SLJIT_R6) { \
75 		w = SLJIT_LOCALS_OFFSET + ((p) - (SLJIT_R3 + 4)) * sizeof(sljit_sw); \
76 		p = SLJIT_MEM1(SLJIT_SP); \
77 		do; \
78 	}
79 
80 #else /* SLJIT_CONFIG_X86_32 */
81 
82 /* Last register + 1. */
83 #define TMP_REG1	(SLJIT_NUMBER_OF_REGISTERS + 2)
84 #define TMP_REG2	(SLJIT_NUMBER_OF_REGISTERS + 3)
85 #define TMP_REG3	(SLJIT_NUMBER_OF_REGISTERS + 4)
86 
87 /* Note: r12 & 0x7 == 0b100, which decoded as SIB byte present
88    Note: avoid to use r12 and r13 for memory addessing
89    therefore r12 is better for SAVED_EREG than SAVED_REG. */
90 #ifndef _WIN64
91 /* 1st passed in rdi, 2nd argument passed in rsi, 3rd in rdx. */
92 static SLJIT_CONST sljit_ub reg_map[SLJIT_NUMBER_OF_REGISTERS + 5] = {
93 	0, 0, 6, 1, 8, 11, 10, 12, 5, 13, 14, 15, 3, 4, 2, 7, 9
94 };
95 /* low-map. reg_map & 0x7. */
96 static SLJIT_CONST sljit_ub reg_lmap[SLJIT_NUMBER_OF_REGISTERS + 5] = {
97 	0, 0, 6, 1, 0, 3,  2,  4,  5,  5,  6,  7, 3, 4, 2, 7, 1
98 };
99 #else
100 /* 1st passed in rcx, 2nd argument passed in rdx, 3rd in r8. */
101 static SLJIT_CONST sljit_ub reg_map[SLJIT_NUMBER_OF_REGISTERS + 5] = {
102 	0, 0, 2, 1, 11, 12, 5, 13, 14, 15, 7, 6, 3, 4, 10, 8, 9
103 };
104 /* low-map. reg_map & 0x7. */
105 static SLJIT_CONST sljit_ub reg_lmap[SLJIT_NUMBER_OF_REGISTERS + 5] = {
106 	0, 0, 2, 1, 3,  4,  5,  5, 6,  7,  7, 6, 3, 4, 2,  0, 1
107 };
108 #endif
109 
110 #define REX_W		0x48
111 #define REX_R		0x44
112 #define REX_X		0x42
113 #define REX_B		0x41
114 #define REX		0x40
115 
116 #ifndef _WIN64
117 #define HALFWORD_MAX 0x7fffffffl
118 #define HALFWORD_MIN -0x80000000l
119 #else
120 #define HALFWORD_MAX 0x7fffffffll
121 #define HALFWORD_MIN -0x80000000ll
122 #endif
123 
124 #define IS_HALFWORD(x)		((x) <= HALFWORD_MAX && (x) >= HALFWORD_MIN)
125 #define NOT_HALFWORD(x)		((x) > HALFWORD_MAX || (x) < HALFWORD_MIN)
126 
127 #define CHECK_EXTRA_REGS(p, w, do)
128 
129 #endif /* SLJIT_CONFIG_X86_32 */
130 
131 #define TMP_FREG	(0)
132 
133 /* Size flags for emit_x86_instruction: */
134 #define EX86_BIN_INS		0x0010
135 #define EX86_SHIFT_INS		0x0020
136 #define EX86_REX		0x0040
137 #define EX86_NO_REXW		0x0080
138 #define EX86_BYTE_ARG		0x0100
139 #define EX86_HALF_ARG		0x0200
140 #define EX86_PREF_66		0x0400
141 #define EX86_PREF_F2		0x0800
142 #define EX86_PREF_F3		0x1000
143 #define EX86_SSE2_OP1		0x2000
144 #define EX86_SSE2_OP2		0x4000
145 #define EX86_SSE2		(EX86_SSE2_OP1 | EX86_SSE2_OP2)
146 
147 /* --------------------------------------------------------------------- */
148 /*  Instrucion forms                                                     */
149 /* --------------------------------------------------------------------- */
150 
151 #define ADD		(/* BINARY */ 0 << 3)
152 #define ADD_EAX_i32	0x05
153 #define ADD_r_rm	0x03
154 #define ADD_rm_r	0x01
155 #define ADDSD_x_xm	0x58
156 #define ADC		(/* BINARY */ 2 << 3)
157 #define ADC_EAX_i32	0x15
158 #define ADC_r_rm	0x13
159 #define ADC_rm_r	0x11
160 #define AND		(/* BINARY */ 4 << 3)
161 #define AND_EAX_i32	0x25
162 #define AND_r_rm	0x23
163 #define AND_rm_r	0x21
164 #define ANDPD_x_xm	0x54
165 #define BSR_r_rm	(/* GROUP_0F */ 0xbd)
166 #define CALL_i32	0xe8
167 #define CALL_rm		(/* GROUP_FF */ 2 << 3)
168 #define CDQ		0x99
169 #define CMOVNE_r_rm	(/* GROUP_0F */ 0x45)
170 #define CMP		(/* BINARY */ 7 << 3)
171 #define CMP_EAX_i32	0x3d
172 #define CMP_r_rm	0x3b
173 #define CMP_rm_r	0x39
174 #define CVTPD2PS_x_xm	0x5a
175 #define CVTSI2SD_x_rm	0x2a
176 #define CVTTSD2SI_r_xm	0x2c
177 #define DIV		(/* GROUP_F7 */ 6 << 3)
178 #define DIVSD_x_xm	0x5e
179 #define INT3		0xcc
180 #define IDIV		(/* GROUP_F7 */ 7 << 3)
181 #define IMUL		(/* GROUP_F7 */ 5 << 3)
182 #define IMUL_r_rm	(/* GROUP_0F */ 0xaf)
183 #define IMUL_r_rm_i8	0x6b
184 #define IMUL_r_rm_i32	0x69
185 #define JE_i8		0x74
186 #define JNE_i8		0x75
187 #define JMP_i8		0xeb
188 #define JMP_i32		0xe9
189 #define JMP_rm		(/* GROUP_FF */ 4 << 3)
190 #define LEA_r_m		0x8d
191 #define MOV_r_rm	0x8b
192 #define MOV_r_i32	0xb8
193 #define MOV_rm_r	0x89
194 #define MOV_rm_i32	0xc7
195 #define MOV_rm8_i8	0xc6
196 #define MOV_rm8_r8	0x88
197 #define MOVSD_x_xm	0x10
198 #define MOVSD_xm_x	0x11
199 #define MOVSXD_r_rm	0x63
200 #define MOVSX_r_rm8	(/* GROUP_0F */ 0xbe)
201 #define MOVSX_r_rm16	(/* GROUP_0F */ 0xbf)
202 #define MOVZX_r_rm8	(/* GROUP_0F */ 0xb6)
203 #define MOVZX_r_rm16	(/* GROUP_0F */ 0xb7)
204 #define MUL		(/* GROUP_F7 */ 4 << 3)
205 #define MULSD_x_xm	0x59
206 #define NEG_rm		(/* GROUP_F7 */ 3 << 3)
207 #define NOP		0x90
208 #define NOT_rm		(/* GROUP_F7 */ 2 << 3)
209 #define OR		(/* BINARY */ 1 << 3)
210 #define OR_r_rm		0x0b
211 #define OR_EAX_i32	0x0d
212 #define OR_rm_r		0x09
213 #define OR_rm8_r8	0x08
214 #define POP_r		0x58
215 #define POP_rm		0x8f
216 #define POPF		0x9d
217 #define PUSH_i32	0x68
218 #define PUSH_r		0x50
219 #define PUSH_rm		(/* GROUP_FF */ 6 << 3)
220 #define PUSHF		0x9c
221 #define RET_near	0xc3
222 #define RET_i16		0xc2
223 #define SBB		(/* BINARY */ 3 << 3)
224 #define SBB_EAX_i32	0x1d
225 #define SBB_r_rm	0x1b
226 #define SBB_rm_r	0x19
227 #define SAR		(/* SHIFT */ 7 << 3)
228 #define SHL		(/* SHIFT */ 4 << 3)
229 #define SHR		(/* SHIFT */ 5 << 3)
230 #define SUB		(/* BINARY */ 5 << 3)
231 #define SUB_EAX_i32	0x2d
232 #define SUB_r_rm	0x2b
233 #define SUB_rm_r	0x29
234 #define SUBSD_x_xm	0x5c
235 #define TEST_EAX_i32	0xa9
236 #define TEST_rm_r	0x85
237 #define UCOMISD_x_xm	0x2e
238 #define UNPCKLPD_x_xm	0x14
239 #define XCHG_EAX_r	0x90
240 #define XCHG_r_rm	0x87
241 #define XOR		(/* BINARY */ 6 << 3)
242 #define XOR_EAX_i32	0x35
243 #define XOR_r_rm	0x33
244 #define XOR_rm_r	0x31
245 #define XORPD_x_xm	0x57
246 
247 #define GROUP_0F	0x0f
248 #define GROUP_F7	0xf7
249 #define GROUP_FF	0xff
250 #define GROUP_BINARY_81	0x81
251 #define GROUP_BINARY_83	0x83
252 #define GROUP_SHIFT_1	0xd1
253 #define GROUP_SHIFT_N	0xc1
254 #define GROUP_SHIFT_CL	0xd3
255 
256 #define MOD_REG		0xc0
257 #define MOD_DISP8	0x40
258 
259 #define INC_SIZE(s)			(*inst++ = (s), compiler->size += (s))
260 
261 #define PUSH_REG(r)			(*inst++ = (PUSH_r + (r)))
262 #define POP_REG(r)			(*inst++ = (POP_r + (r)))
263 #define RET()				(*inst++ = (RET_near))
264 #define RET_I16(n)			(*inst++ = (RET_i16), *inst++ = n, *inst++ = 0)
265 /* r32, r/m32 */
266 #define MOV_RM(mod, reg, rm)		(*inst++ = (MOV_r_rm), *inst++ = (mod) << 6 | (reg) << 3 | (rm))
267 
268 /* Multithreading does not affect these static variables, since they store
269    built-in CPU features. Therefore they can be overwritten by different threads
270    if they detect the CPU features in the same time. */
271 #if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
272 static sljit_si cpu_has_sse2 = -1;
273 #endif
274 static sljit_si cpu_has_cmov = -1;
275 
276 #ifdef _WIN32_WCE
277 #include <cmnintrin.h>
278 #elif defined(_MSC_VER) && _MSC_VER >= 1400
279 #include <intrin.h>
280 #endif
281 
get_cpu_features(void)282 static void get_cpu_features(void)
283 {
284 	sljit_ui features;
285 
286 #if defined(_MSC_VER) && _MSC_VER >= 1400
287 
288 	int CPUInfo[4];
289 	__cpuid(CPUInfo, 1);
290 	features = (sljit_ui)CPUInfo[3];
291 
292 #elif defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__SUNPRO_C)
293 
294 	/* AT&T syntax. */
295 	__asm__ (
296 		"movl $0x1, %%eax\n"
297 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
298 		/* On x86-32, there is no red zone, so this
299 		   should work (no need for a local variable). */
300 		"push %%ebx\n"
301 #endif
302 		"cpuid\n"
303 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
304 		"pop %%ebx\n"
305 #endif
306 		"movl %%edx, %0\n"
307 		: "=g" (features)
308 		:
309 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
310 		: "%eax", "%ecx", "%edx"
311 #else
312 		: "%rax", "%rbx", "%rcx", "%rdx"
313 #endif
314 	);
315 
316 #else /* _MSC_VER && _MSC_VER >= 1400 */
317 
318 	/* Intel syntax. */
319 	__asm {
320 		mov eax, 1
321 		cpuid
322 		mov features, edx
323 	}
324 
325 #endif /* _MSC_VER && _MSC_VER >= 1400 */
326 
327 #if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
328 	cpu_has_sse2 = (features >> 26) & 0x1;
329 #endif
330 	cpu_has_cmov = (features >> 15) & 0x1;
331 }
332 
get_jump_code(sljit_si type)333 static sljit_ub get_jump_code(sljit_si type)
334 {
335 	switch (type) {
336 	case SLJIT_EQUAL:
337 	case SLJIT_D_EQUAL:
338 		return 0x84 /* je */;
339 
340 	case SLJIT_NOT_EQUAL:
341 	case SLJIT_D_NOT_EQUAL:
342 		return 0x85 /* jne */;
343 
344 	case SLJIT_LESS:
345 	case SLJIT_D_LESS:
346 		return 0x82 /* jc */;
347 
348 	case SLJIT_GREATER_EQUAL:
349 	case SLJIT_D_GREATER_EQUAL:
350 		return 0x83 /* jae */;
351 
352 	case SLJIT_GREATER:
353 	case SLJIT_D_GREATER:
354 		return 0x87 /* jnbe */;
355 
356 	case SLJIT_LESS_EQUAL:
357 	case SLJIT_D_LESS_EQUAL:
358 		return 0x86 /* jbe */;
359 
360 	case SLJIT_SIG_LESS:
361 		return 0x8c /* jl */;
362 
363 	case SLJIT_SIG_GREATER_EQUAL:
364 		return 0x8d /* jnl */;
365 
366 	case SLJIT_SIG_GREATER:
367 		return 0x8f /* jnle */;
368 
369 	case SLJIT_SIG_LESS_EQUAL:
370 		return 0x8e /* jle */;
371 
372 	case SLJIT_OVERFLOW:
373 	case SLJIT_MUL_OVERFLOW:
374 		return 0x80 /* jo */;
375 
376 	case SLJIT_NOT_OVERFLOW:
377 	case SLJIT_MUL_NOT_OVERFLOW:
378 		return 0x81 /* jno */;
379 
380 	case SLJIT_D_UNORDERED:
381 		return 0x8a /* jp */;
382 
383 	case SLJIT_D_ORDERED:
384 		return 0x8b /* jpo */;
385 	}
386 	return 0;
387 }
388 
389 static sljit_ub* generate_far_jump_code(struct sljit_jump *jump, sljit_ub *code_ptr, sljit_si type);
390 
391 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
392 static sljit_ub* generate_fixed_jump(sljit_ub *code_ptr, sljit_sw addr, sljit_si type);
393 #endif
394 
generate_near_jump_code(struct sljit_jump * jump,sljit_ub * code_ptr,sljit_ub * code,sljit_si type)395 static sljit_ub* generate_near_jump_code(struct sljit_jump *jump, sljit_ub *code_ptr, sljit_ub *code, sljit_si type)
396 {
397 	sljit_si short_jump;
398 	sljit_uw label_addr;
399 
400 	if (jump->flags & JUMP_LABEL)
401 		label_addr = (sljit_uw)(code + jump->u.label->size);
402 	else
403 		label_addr = jump->u.target;
404 	short_jump = (sljit_sw)(label_addr - (jump->addr + 2)) >= -128 && (sljit_sw)(label_addr - (jump->addr + 2)) <= 127;
405 
406 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
407 	if ((sljit_sw)(label_addr - (jump->addr + 1)) > HALFWORD_MAX || (sljit_sw)(label_addr - (jump->addr + 1)) < HALFWORD_MIN)
408 		return generate_far_jump_code(jump, code_ptr, type);
409 #endif
410 
411 	if (type == SLJIT_JUMP) {
412 		if (short_jump)
413 			*code_ptr++ = JMP_i8;
414 		else
415 			*code_ptr++ = JMP_i32;
416 		jump->addr++;
417 	}
418 	else if (type >= SLJIT_FAST_CALL) {
419 		short_jump = 0;
420 		*code_ptr++ = CALL_i32;
421 		jump->addr++;
422 	}
423 	else if (short_jump) {
424 		*code_ptr++ = get_jump_code(type) - 0x10;
425 		jump->addr++;
426 	}
427 	else {
428 		*code_ptr++ = GROUP_0F;
429 		*code_ptr++ = get_jump_code(type);
430 		jump->addr += 2;
431 	}
432 
433 	if (short_jump) {
434 		jump->flags |= PATCH_MB;
435 		code_ptr += sizeof(sljit_sb);
436 	} else {
437 		jump->flags |= PATCH_MW;
438 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
439 		code_ptr += sizeof(sljit_sw);
440 #else
441 		code_ptr += sizeof(sljit_si);
442 #endif
443 	}
444 
445 	return code_ptr;
446 }
447 
sljit_generate_code(struct sljit_compiler * compiler)448 SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compiler)
449 {
450 	struct sljit_memory_fragment *buf;
451 	sljit_ub *code;
452 	sljit_ub *code_ptr;
453 	sljit_ub *buf_ptr;
454 	sljit_ub *buf_end;
455 	sljit_ub len;
456 
457 	struct sljit_label *label;
458 	struct sljit_jump *jump;
459 	struct sljit_const *const_;
460 
461 	CHECK_ERROR_PTR();
462 	CHECK_PTR(check_sljit_generate_code(compiler));
463 	reverse_buf(compiler);
464 
465 	/* Second code generation pass. */
466 	code = (sljit_ub*)SLJIT_MALLOC_EXEC(compiler->size);
467 	PTR_FAIL_WITH_EXEC_IF(code);
468 	buf = compiler->buf;
469 
470 	code_ptr = code;
471 	label = compiler->labels;
472 	jump = compiler->jumps;
473 	const_ = compiler->consts;
474 	do {
475 		buf_ptr = buf->memory;
476 		buf_end = buf_ptr + buf->used_size;
477 		do {
478 			len = *buf_ptr++;
479 			if (len > 0) {
480 				/* The code is already generated. */
481 				SLJIT_MEMMOVE(code_ptr, buf_ptr, len);
482 				code_ptr += len;
483 				buf_ptr += len;
484 			}
485 			else {
486 				if (*buf_ptr >= 4) {
487 					jump->addr = (sljit_uw)code_ptr;
488 					if (!(jump->flags & SLJIT_REWRITABLE_JUMP))
489 						code_ptr = generate_near_jump_code(jump, code_ptr, code, *buf_ptr - 4);
490 					else
491 						code_ptr = generate_far_jump_code(jump, code_ptr, *buf_ptr - 4);
492 					jump = jump->next;
493 				}
494 				else if (*buf_ptr == 0) {
495 					label->addr = (sljit_uw)code_ptr;
496 					label->size = code_ptr - code;
497 					label = label->next;
498 				}
499 				else if (*buf_ptr == 1) {
500 					const_->addr = ((sljit_uw)code_ptr) - sizeof(sljit_sw);
501 					const_ = const_->next;
502 				}
503 				else {
504 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
505 					*code_ptr++ = (*buf_ptr == 2) ? CALL_i32 : JMP_i32;
506 					buf_ptr++;
507 					*(sljit_sw*)code_ptr = *(sljit_sw*)buf_ptr - ((sljit_sw)code_ptr + sizeof(sljit_sw));
508 					code_ptr += sizeof(sljit_sw);
509 					buf_ptr += sizeof(sljit_sw) - 1;
510 #else
511 					code_ptr = generate_fixed_jump(code_ptr, *(sljit_sw*)(buf_ptr + 1), *buf_ptr);
512 					buf_ptr += sizeof(sljit_sw);
513 #endif
514 				}
515 				buf_ptr++;
516 			}
517 		} while (buf_ptr < buf_end);
518 		SLJIT_ASSERT(buf_ptr == buf_end);
519 		buf = buf->next;
520 	} while (buf);
521 
522 	SLJIT_ASSERT(!label);
523 	SLJIT_ASSERT(!jump);
524 	SLJIT_ASSERT(!const_);
525 
526 	jump = compiler->jumps;
527 	while (jump) {
528 		if (jump->flags & PATCH_MB) {
529 			SLJIT_ASSERT((sljit_sw)(jump->u.label->addr - (jump->addr + sizeof(sljit_sb))) >= -128 && (sljit_sw)(jump->u.label->addr - (jump->addr + sizeof(sljit_sb))) <= 127);
530 			*(sljit_ub*)jump->addr = (sljit_ub)(jump->u.label->addr - (jump->addr + sizeof(sljit_sb)));
531 		} else if (jump->flags & PATCH_MW) {
532 			if (jump->flags & JUMP_LABEL) {
533 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
534 				*(sljit_sw*)jump->addr = (sljit_sw)(jump->u.label->addr - (jump->addr + sizeof(sljit_sw)));
535 #else
536 				SLJIT_ASSERT((sljit_sw)(jump->u.label->addr - (jump->addr + sizeof(sljit_si))) >= HALFWORD_MIN && (sljit_sw)(jump->u.label->addr - (jump->addr + sizeof(sljit_si))) <= HALFWORD_MAX);
537 				*(sljit_si*)jump->addr = (sljit_si)(jump->u.label->addr - (jump->addr + sizeof(sljit_si)));
538 #endif
539 			}
540 			else {
541 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
542 				*(sljit_sw*)jump->addr = (sljit_sw)(jump->u.target - (jump->addr + sizeof(sljit_sw)));
543 #else
544 				SLJIT_ASSERT((sljit_sw)(jump->u.target - (jump->addr + sizeof(sljit_si))) >= HALFWORD_MIN && (sljit_sw)(jump->u.target - (jump->addr + sizeof(sljit_si))) <= HALFWORD_MAX);
545 				*(sljit_si*)jump->addr = (sljit_si)(jump->u.target - (jump->addr + sizeof(sljit_si)));
546 #endif
547 			}
548 		}
549 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
550 		else if (jump->flags & PATCH_MD)
551 			*(sljit_sw*)jump->addr = jump->u.label->addr;
552 #endif
553 
554 		jump = jump->next;
555 	}
556 
557 	/* Maybe we waste some space because of short jumps. */
558 	SLJIT_ASSERT(code_ptr <= code + compiler->size);
559 	compiler->error = SLJIT_ERR_COMPILED;
560 	compiler->executable_size = code_ptr - code;
561 	return (void*)code;
562 }
563 
564 /* --------------------------------------------------------------------- */
565 /*  Operators                                                            */
566 /* --------------------------------------------------------------------- */
567 
568 static sljit_si emit_cum_binary(struct sljit_compiler *compiler,
569 	sljit_ub op_rm, sljit_ub op_mr, sljit_ub op_imm, sljit_ub op_eax_imm,
570 	sljit_si dst, sljit_sw dstw,
571 	sljit_si src1, sljit_sw src1w,
572 	sljit_si src2, sljit_sw src2w);
573 
574 static sljit_si emit_non_cum_binary(struct sljit_compiler *compiler,
575 	sljit_ub op_rm, sljit_ub op_mr, sljit_ub op_imm, sljit_ub op_eax_imm,
576 	sljit_si dst, sljit_sw dstw,
577 	sljit_si src1, sljit_sw src1w,
578 	sljit_si src2, sljit_sw src2w);
579 
580 static sljit_si emit_mov(struct sljit_compiler *compiler,
581 	sljit_si dst, sljit_sw dstw,
582 	sljit_si src, sljit_sw srcw);
583 
emit_save_flags(struct sljit_compiler * compiler)584 static SLJIT_INLINE sljit_si emit_save_flags(struct sljit_compiler *compiler)
585 {
586 	sljit_ub *inst;
587 
588 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
589 	inst = (sljit_ub*)ensure_buf(compiler, 1 + 5);
590 	FAIL_IF(!inst);
591 	INC_SIZE(5);
592 #else
593 	inst = (sljit_ub*)ensure_buf(compiler, 1 + 6);
594 	FAIL_IF(!inst);
595 	INC_SIZE(6);
596 	*inst++ = REX_W;
597 #endif
598 	*inst++ = LEA_r_m; /* lea esp/rsp, [esp/rsp + sizeof(sljit_sw)] */
599 	*inst++ = 0x64;
600 	*inst++ = 0x24;
601 	*inst++ = (sljit_ub)sizeof(sljit_sw);
602 	*inst++ = PUSHF;
603 	compiler->flags_saved = 1;
604 	return SLJIT_SUCCESS;
605 }
606 
emit_restore_flags(struct sljit_compiler * compiler,sljit_si keep_flags)607 static SLJIT_INLINE sljit_si emit_restore_flags(struct sljit_compiler *compiler, sljit_si keep_flags)
608 {
609 	sljit_ub *inst;
610 
611 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
612 	inst = (sljit_ub*)ensure_buf(compiler, 1 + 5);
613 	FAIL_IF(!inst);
614 	INC_SIZE(5);
615 	*inst++ = POPF;
616 #else
617 	inst = (sljit_ub*)ensure_buf(compiler, 1 + 6);
618 	FAIL_IF(!inst);
619 	INC_SIZE(6);
620 	*inst++ = POPF;
621 	*inst++ = REX_W;
622 #endif
623 	*inst++ = LEA_r_m; /* lea esp/rsp, [esp/rsp - sizeof(sljit_sw)] */
624 	*inst++ = 0x64;
625 	*inst++ = 0x24;
626 	*inst++ = (sljit_ub)-(sljit_sb)sizeof(sljit_sw);
627 	compiler->flags_saved = keep_flags;
628 	return SLJIT_SUCCESS;
629 }
630 
631 #ifdef _WIN32
632 #include <malloc.h>
633 
sljit_grow_stack(sljit_sw local_size)634 static void SLJIT_CALL sljit_grow_stack(sljit_sw local_size)
635 {
636 	/* Workaround for calling the internal _chkstk() function on Windows.
637 	This function touches all 4k pages belongs to the requested stack space,
638 	which size is passed in local_size. This is necessary on Windows where
639 	the stack can only grow in 4k steps. However, this function just burn
640 	CPU cycles if the stack is large enough. However, you don't know it in
641 	advance, so it must always be called. I think this is a bad design in
642 	general even if it has some reasons. */
643 	*(volatile sljit_si*)alloca(local_size) = 0;
644 }
645 
646 #endif
647 
648 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
649 #include "sljitNativeX86_32.c"
650 #else
651 #include "sljitNativeX86_64.c"
652 #endif
653 
emit_mov(struct sljit_compiler * compiler,sljit_si dst,sljit_sw dstw,sljit_si src,sljit_sw srcw)654 static sljit_si emit_mov(struct sljit_compiler *compiler,
655 	sljit_si dst, sljit_sw dstw,
656 	sljit_si src, sljit_sw srcw)
657 {
658 	sljit_ub* inst;
659 
660 	if (dst == SLJIT_UNUSED) {
661 		/* No destination, doesn't need to setup flags. */
662 		if (src & SLJIT_MEM) {
663 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src, srcw);
664 			FAIL_IF(!inst);
665 			*inst = MOV_r_rm;
666 		}
667 		return SLJIT_SUCCESS;
668 	}
669 	if (FAST_IS_REG(src)) {
670 		inst = emit_x86_instruction(compiler, 1, src, 0, dst, dstw);
671 		FAIL_IF(!inst);
672 		*inst = MOV_rm_r;
673 		return SLJIT_SUCCESS;
674 	}
675 	if (src & SLJIT_IMM) {
676 		if (FAST_IS_REG(dst)) {
677 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
678 			return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
679 #else
680 			if (!compiler->mode32) {
681 				if (NOT_HALFWORD(srcw))
682 					return emit_load_imm64(compiler, dst, srcw);
683 			}
684 			else
685 				return emit_do_imm32(compiler, (reg_map[dst] >= 8) ? REX_B : 0, MOV_r_i32 + reg_lmap[dst], srcw);
686 #endif
687 		}
688 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
689 		if (!compiler->mode32 && NOT_HALFWORD(srcw)) {
690 			FAIL_IF(emit_load_imm64(compiler, TMP_REG2, srcw));
691 			inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, dst, dstw);
692 			FAIL_IF(!inst);
693 			*inst = MOV_rm_r;
694 			return SLJIT_SUCCESS;
695 		}
696 #endif
697 		inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, dstw);
698 		FAIL_IF(!inst);
699 		*inst = MOV_rm_i32;
700 		return SLJIT_SUCCESS;
701 	}
702 	if (FAST_IS_REG(dst)) {
703 		inst = emit_x86_instruction(compiler, 1, dst, 0, src, srcw);
704 		FAIL_IF(!inst);
705 		*inst = MOV_r_rm;
706 		return SLJIT_SUCCESS;
707 	}
708 
709 	/* Memory to memory move. Requires two instruction. */
710 	inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src, srcw);
711 	FAIL_IF(!inst);
712 	*inst = MOV_r_rm;
713 	inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
714 	FAIL_IF(!inst);
715 	*inst = MOV_rm_r;
716 	return SLJIT_SUCCESS;
717 }
718 
719 #define EMIT_MOV(compiler, dst, dstw, src, srcw) \
720 	FAIL_IF(emit_mov(compiler, dst, dstw, src, srcw));
721 
sljit_emit_op0(struct sljit_compiler * compiler,sljit_si op)722 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler, sljit_si op)
723 {
724 	sljit_ub *inst;
725 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
726 	sljit_si size;
727 #endif
728 
729 	CHECK_ERROR();
730 	CHECK(check_sljit_emit_op0(compiler, op));
731 
732 	switch (GET_OPCODE(op)) {
733 	case SLJIT_BREAKPOINT:
734 		inst = (sljit_ub*)ensure_buf(compiler, 1 + 1);
735 		FAIL_IF(!inst);
736 		INC_SIZE(1);
737 		*inst = INT3;
738 		break;
739 	case SLJIT_NOP:
740 		inst = (sljit_ub*)ensure_buf(compiler, 1 + 1);
741 		FAIL_IF(!inst);
742 		INC_SIZE(1);
743 		*inst = NOP;
744 		break;
745 	case SLJIT_LUMUL:
746 	case SLJIT_LSMUL:
747 	case SLJIT_UDIVMOD:
748 	case SLJIT_SDIVMOD:
749 	case SLJIT_UDIVI:
750 	case SLJIT_SDIVI:
751 		compiler->flags_saved = 0;
752 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
753 #ifdef _WIN64
754 		SLJIT_COMPILE_ASSERT(
755 			reg_map[SLJIT_R0] == 0
756 			&& reg_map[SLJIT_R1] == 2
757 			&& reg_map[TMP_REG1] > 7,
758 			invalid_register_assignment_for_div_mul);
759 #else
760 		SLJIT_COMPILE_ASSERT(
761 			reg_map[SLJIT_R0] == 0
762 			&& reg_map[SLJIT_R1] < 7
763 			&& reg_map[TMP_REG1] == 2,
764 			invalid_register_assignment_for_div_mul);
765 #endif
766 		compiler->mode32 = op & SLJIT_INT_OP;
767 #endif
768 		SLJIT_COMPILE_ASSERT((SLJIT_UDIVMOD & 0x2) == 0 && SLJIT_UDIVI - 0x2 == SLJIT_UDIVMOD, bad_div_opcode_assignments);
769 
770 		op = GET_OPCODE(op);
771 		if ((op | 0x2) == SLJIT_UDIVI) {
772 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) || defined(_WIN64)
773 			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_R1, 0);
774 			inst = emit_x86_instruction(compiler, 1, SLJIT_R1, 0, SLJIT_R1, 0);
775 #else
776 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, TMP_REG1, 0);
777 #endif
778 			FAIL_IF(!inst);
779 			*inst = XOR_r_rm;
780 		}
781 
782 		if ((op | 0x2) == SLJIT_SDIVI) {
783 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) || defined(_WIN64)
784 			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_R1, 0);
785 #endif
786 
787 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
788 			inst = (sljit_ub*)ensure_buf(compiler, 1 + 1);
789 			FAIL_IF(!inst);
790 			INC_SIZE(1);
791 			*inst = CDQ;
792 #else
793 			if (compiler->mode32) {
794 				inst = (sljit_ub*)ensure_buf(compiler, 1 + 1);
795 				FAIL_IF(!inst);
796 				INC_SIZE(1);
797 				*inst = CDQ;
798 			} else {
799 				inst = (sljit_ub*)ensure_buf(compiler, 1 + 2);
800 				FAIL_IF(!inst);
801 				INC_SIZE(2);
802 				*inst++ = REX_W;
803 				*inst = CDQ;
804 			}
805 #endif
806 		}
807 
808 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
809 		inst = (sljit_ub*)ensure_buf(compiler, 1 + 2);
810 		FAIL_IF(!inst);
811 		INC_SIZE(2);
812 		*inst++ = GROUP_F7;
813 		*inst = MOD_REG | ((op >= SLJIT_UDIVMOD) ? reg_map[TMP_REG1] : reg_map[SLJIT_R1]);
814 #else
815 #ifdef _WIN64
816 		size = (!compiler->mode32 || op >= SLJIT_UDIVMOD) ? 3 : 2;
817 #else
818 		size = (!compiler->mode32) ? 3 : 2;
819 #endif
820 		inst = (sljit_ub*)ensure_buf(compiler, 1 + size);
821 		FAIL_IF(!inst);
822 		INC_SIZE(size);
823 #ifdef _WIN64
824 		if (!compiler->mode32)
825 			*inst++ = REX_W | ((op >= SLJIT_UDIVMOD) ? REX_B : 0);
826 		else if (op >= SLJIT_UDIVMOD)
827 			*inst++ = REX_B;
828 		*inst++ = GROUP_F7;
829 		*inst = MOD_REG | ((op >= SLJIT_UDIVMOD) ? reg_lmap[TMP_REG1] : reg_lmap[SLJIT_R1]);
830 #else
831 		if (!compiler->mode32)
832 			*inst++ = REX_W;
833 		*inst++ = GROUP_F7;
834 		*inst = MOD_REG | reg_map[SLJIT_R1];
835 #endif
836 #endif
837 		switch (op) {
838 		case SLJIT_LUMUL:
839 			*inst |= MUL;
840 			break;
841 		case SLJIT_LSMUL:
842 			*inst |= IMUL;
843 			break;
844 		case SLJIT_UDIVMOD:
845 		case SLJIT_UDIVI:
846 			*inst |= DIV;
847 			break;
848 		case SLJIT_SDIVMOD:
849 		case SLJIT_SDIVI:
850 			*inst |= IDIV;
851 			break;
852 		}
853 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) && !defined(_WIN64)
854 		if (op <= SLJIT_SDIVMOD)
855 			EMIT_MOV(compiler, SLJIT_R1, 0, TMP_REG1, 0);
856 #else
857 		if (op >= SLJIT_UDIVI)
858 			EMIT_MOV(compiler, SLJIT_R1, 0, TMP_REG1, 0);
859 #endif
860 		break;
861 	}
862 
863 	return SLJIT_SUCCESS;
864 }
865 
866 #define ENCODE_PREFIX(prefix) \
867 	do { \
868 		inst = (sljit_ub*)ensure_buf(compiler, 1 + 1); \
869 		FAIL_IF(!inst); \
870 		INC_SIZE(1); \
871 		*inst = (prefix); \
872 	} while (0)
873 
emit_mov_byte(struct sljit_compiler * compiler,sljit_si sign,sljit_si dst,sljit_sw dstw,sljit_si src,sljit_sw srcw)874 static sljit_si emit_mov_byte(struct sljit_compiler *compiler, sljit_si sign,
875 	sljit_si dst, sljit_sw dstw,
876 	sljit_si src, sljit_sw srcw)
877 {
878 	sljit_ub* inst;
879 	sljit_si dst_r;
880 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
881 	sljit_si work_r;
882 #endif
883 
884 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
885 	compiler->mode32 = 0;
886 #endif
887 
888 	if (dst == SLJIT_UNUSED && !(src & SLJIT_MEM))
889 		return SLJIT_SUCCESS; /* Empty instruction. */
890 
891 	if (src & SLJIT_IMM) {
892 		if (FAST_IS_REG(dst)) {
893 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
894 			return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
895 #else
896 			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, 0);
897 			FAIL_IF(!inst);
898 			*inst = MOV_rm_i32;
899 			return SLJIT_SUCCESS;
900 #endif
901 		}
902 		inst = emit_x86_instruction(compiler, 1 | EX86_BYTE_ARG | EX86_NO_REXW, SLJIT_IMM, srcw, dst, dstw);
903 		FAIL_IF(!inst);
904 		*inst = MOV_rm8_i8;
905 		return SLJIT_SUCCESS;
906 	}
907 
908 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
909 
910 	if ((dst & SLJIT_MEM) && FAST_IS_REG(src)) {
911 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
912 		if (reg_map[src] >= 4) {
913 			SLJIT_ASSERT(dst_r == TMP_REG1);
914 			EMIT_MOV(compiler, TMP_REG1, 0, src, 0);
915 		} else
916 			dst_r = src;
917 #else
918 		dst_r = src;
919 #endif
920 	}
921 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
922 	else if (FAST_IS_REG(src) && reg_map[src] >= 4) {
923 		/* src, dst are registers. */
924 		SLJIT_ASSERT(SLOW_IS_REG(dst));
925 		if (reg_map[dst] < 4) {
926 			if (dst != src)
927 				EMIT_MOV(compiler, dst, 0, src, 0);
928 			inst = emit_x86_instruction(compiler, 2, dst, 0, dst, 0);
929 			FAIL_IF(!inst);
930 			*inst++ = GROUP_0F;
931 			*inst = sign ? MOVSX_r_rm8 : MOVZX_r_rm8;
932 		}
933 		else {
934 			if (dst != src)
935 				EMIT_MOV(compiler, dst, 0, src, 0);
936 			if (sign) {
937 				/* shl reg, 24 */
938 				inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 24, dst, 0);
939 				FAIL_IF(!inst);
940 				*inst |= SHL;
941 				/* sar reg, 24 */
942 				inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 24, dst, 0);
943 				FAIL_IF(!inst);
944 				*inst |= SAR;
945 			}
946 			else {
947 				inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, 0xff, dst, 0);
948 				FAIL_IF(!inst);
949 				*(inst + 1) |= AND;
950 			}
951 		}
952 		return SLJIT_SUCCESS;
953 	}
954 #endif
955 	else {
956 		/* src can be memory addr or reg_map[src] < 4 on x86_32 architectures. */
957 		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src, srcw);
958 		FAIL_IF(!inst);
959 		*inst++ = GROUP_0F;
960 		*inst = sign ? MOVSX_r_rm8 : MOVZX_r_rm8;
961 	}
962 
963 	if (dst & SLJIT_MEM) {
964 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
965 		if (dst_r == TMP_REG1) {
966 			/* Find a non-used register, whose reg_map[src] < 4. */
967 			if ((dst & REG_MASK) == SLJIT_R0) {
968 				if ((dst & OFFS_REG_MASK) == TO_OFFS_REG(SLJIT_R1))
969 					work_r = SLJIT_R2;
970 				else
971 					work_r = SLJIT_R1;
972 			}
973 			else {
974 				if ((dst & OFFS_REG_MASK) != TO_OFFS_REG(SLJIT_R0))
975 					work_r = SLJIT_R0;
976 				else if ((dst & REG_MASK) == SLJIT_R1)
977 					work_r = SLJIT_R2;
978 				else
979 					work_r = SLJIT_R1;
980 			}
981 
982 			if (work_r == SLJIT_R0) {
983 				ENCODE_PREFIX(XCHG_EAX_r + reg_map[TMP_REG1]);
984 			}
985 			else {
986 				inst = emit_x86_instruction(compiler, 1, work_r, 0, dst_r, 0);
987 				FAIL_IF(!inst);
988 				*inst = XCHG_r_rm;
989 			}
990 
991 			inst = emit_x86_instruction(compiler, 1, work_r, 0, dst, dstw);
992 			FAIL_IF(!inst);
993 			*inst = MOV_rm8_r8;
994 
995 			if (work_r == SLJIT_R0) {
996 				ENCODE_PREFIX(XCHG_EAX_r + reg_map[TMP_REG1]);
997 			}
998 			else {
999 				inst = emit_x86_instruction(compiler, 1, work_r, 0, dst_r, 0);
1000 				FAIL_IF(!inst);
1001 				*inst = XCHG_r_rm;
1002 			}
1003 		}
1004 		else {
1005 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, dst, dstw);
1006 			FAIL_IF(!inst);
1007 			*inst = MOV_rm8_r8;
1008 		}
1009 #else
1010 		inst = emit_x86_instruction(compiler, 1 | EX86_REX | EX86_NO_REXW, dst_r, 0, dst, dstw);
1011 		FAIL_IF(!inst);
1012 		*inst = MOV_rm8_r8;
1013 #endif
1014 	}
1015 
1016 	return SLJIT_SUCCESS;
1017 }
1018 
emit_mov_half(struct sljit_compiler * compiler,sljit_si sign,sljit_si dst,sljit_sw dstw,sljit_si src,sljit_sw srcw)1019 static sljit_si emit_mov_half(struct sljit_compiler *compiler, sljit_si sign,
1020 	sljit_si dst, sljit_sw dstw,
1021 	sljit_si src, sljit_sw srcw)
1022 {
1023 	sljit_ub* inst;
1024 	sljit_si dst_r;
1025 
1026 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1027 	compiler->mode32 = 0;
1028 #endif
1029 
1030 	if (dst == SLJIT_UNUSED && !(src & SLJIT_MEM))
1031 		return SLJIT_SUCCESS; /* Empty instruction. */
1032 
1033 	if (src & SLJIT_IMM) {
1034 		if (FAST_IS_REG(dst)) {
1035 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1036 			return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
1037 #else
1038 			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, 0);
1039 			FAIL_IF(!inst);
1040 			*inst = MOV_rm_i32;
1041 			return SLJIT_SUCCESS;
1042 #endif
1043 		}
1044 		inst = emit_x86_instruction(compiler, 1 | EX86_HALF_ARG | EX86_NO_REXW | EX86_PREF_66, SLJIT_IMM, srcw, dst, dstw);
1045 		FAIL_IF(!inst);
1046 		*inst = MOV_rm_i32;
1047 		return SLJIT_SUCCESS;
1048 	}
1049 
1050 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
1051 
1052 	if ((dst & SLJIT_MEM) && FAST_IS_REG(src))
1053 		dst_r = src;
1054 	else {
1055 		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src, srcw);
1056 		FAIL_IF(!inst);
1057 		*inst++ = GROUP_0F;
1058 		*inst = sign ? MOVSX_r_rm16 : MOVZX_r_rm16;
1059 	}
1060 
1061 	if (dst & SLJIT_MEM) {
1062 		inst = emit_x86_instruction(compiler, 1 | EX86_NO_REXW | EX86_PREF_66, dst_r, 0, dst, dstw);
1063 		FAIL_IF(!inst);
1064 		*inst = MOV_rm_r;
1065 	}
1066 
1067 	return SLJIT_SUCCESS;
1068 }
1069 
emit_unary(struct sljit_compiler * compiler,sljit_ub opcode,sljit_si dst,sljit_sw dstw,sljit_si src,sljit_sw srcw)1070 static sljit_si emit_unary(struct sljit_compiler *compiler, sljit_ub opcode,
1071 	sljit_si dst, sljit_sw dstw,
1072 	sljit_si src, sljit_sw srcw)
1073 {
1074 	sljit_ub* inst;
1075 
1076 	if (dst == SLJIT_UNUSED) {
1077 		EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
1078 		inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
1079 		FAIL_IF(!inst);
1080 		*inst++ = GROUP_F7;
1081 		*inst |= opcode;
1082 		return SLJIT_SUCCESS;
1083 	}
1084 	if (dst == src && dstw == srcw) {
1085 		/* Same input and output */
1086 		inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
1087 		FAIL_IF(!inst);
1088 		*inst++ = GROUP_F7;
1089 		*inst |= opcode;
1090 		return SLJIT_SUCCESS;
1091 	}
1092 	if (FAST_IS_REG(dst)) {
1093 		EMIT_MOV(compiler, dst, 0, src, srcw);
1094 		inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
1095 		FAIL_IF(!inst);
1096 		*inst++ = GROUP_F7;
1097 		*inst |= opcode;
1098 		return SLJIT_SUCCESS;
1099 	}
1100 	EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
1101 	inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
1102 	FAIL_IF(!inst);
1103 	*inst++ = GROUP_F7;
1104 	*inst |= opcode;
1105 	EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
1106 	return SLJIT_SUCCESS;
1107 }
1108 
emit_not_with_flags(struct sljit_compiler * compiler,sljit_si dst,sljit_sw dstw,sljit_si src,sljit_sw srcw)1109 static sljit_si emit_not_with_flags(struct sljit_compiler *compiler,
1110 	sljit_si dst, sljit_sw dstw,
1111 	sljit_si src, sljit_sw srcw)
1112 {
1113 	sljit_ub* inst;
1114 
1115 	if (dst == SLJIT_UNUSED) {
1116 		EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
1117 		inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
1118 		FAIL_IF(!inst);
1119 		*inst++ = GROUP_F7;
1120 		*inst |= NOT_rm;
1121 		inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, TMP_REG1, 0);
1122 		FAIL_IF(!inst);
1123 		*inst = OR_r_rm;
1124 		return SLJIT_SUCCESS;
1125 	}
1126 	if (FAST_IS_REG(dst)) {
1127 		EMIT_MOV(compiler, dst, 0, src, srcw);
1128 		inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
1129 		FAIL_IF(!inst);
1130 		*inst++ = GROUP_F7;
1131 		*inst |= NOT_rm;
1132 		inst = emit_x86_instruction(compiler, 1, dst, 0, dst, 0);
1133 		FAIL_IF(!inst);
1134 		*inst = OR_r_rm;
1135 		return SLJIT_SUCCESS;
1136 	}
1137 	EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
1138 	inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
1139 	FAIL_IF(!inst);
1140 	*inst++ = GROUP_F7;
1141 	*inst |= NOT_rm;
1142 	inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, TMP_REG1, 0);
1143 	FAIL_IF(!inst);
1144 	*inst = OR_r_rm;
1145 	EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
1146 	return SLJIT_SUCCESS;
1147 }
1148 
emit_clz(struct sljit_compiler * compiler,sljit_si op_flags,sljit_si dst,sljit_sw dstw,sljit_si src,sljit_sw srcw)1149 static sljit_si emit_clz(struct sljit_compiler *compiler, sljit_si op_flags,
1150 	sljit_si dst, sljit_sw dstw,
1151 	sljit_si src, sljit_sw srcw)
1152 {
1153 	sljit_ub* inst;
1154 	sljit_si dst_r;
1155 
1156 	SLJIT_UNUSED_ARG(op_flags);
1157 	if (SLJIT_UNLIKELY(dst == SLJIT_UNUSED)) {
1158 		/* Just set the zero flag. */
1159 		EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
1160 		inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
1161 		FAIL_IF(!inst);
1162 		*inst++ = GROUP_F7;
1163 		*inst |= NOT_rm;
1164 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1165 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 31, TMP_REG1, 0);
1166 #else
1167 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, !(op_flags & SLJIT_INT_OP) ? 63 : 31, TMP_REG1, 0);
1168 #endif
1169 		FAIL_IF(!inst);
1170 		*inst |= SHR;
1171 		return SLJIT_SUCCESS;
1172 	}
1173 
1174 	if (SLJIT_UNLIKELY(src & SLJIT_IMM)) {
1175 		EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, srcw);
1176 		src = TMP_REG1;
1177 		srcw = 0;
1178 	}
1179 
1180 	inst = emit_x86_instruction(compiler, 2, TMP_REG1, 0, src, srcw);
1181 	FAIL_IF(!inst);
1182 	*inst++ = GROUP_0F;
1183 	*inst = BSR_r_rm;
1184 
1185 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1186 	if (FAST_IS_REG(dst))
1187 		dst_r = dst;
1188 	else {
1189 		/* Find an unused temporary register. */
1190 		if ((dst & REG_MASK) != SLJIT_R0 && (dst & OFFS_REG_MASK) != TO_OFFS_REG(SLJIT_R0))
1191 			dst_r = SLJIT_R0;
1192 		else if ((dst & REG_MASK) != SLJIT_R1 && (dst & OFFS_REG_MASK) != TO_OFFS_REG(SLJIT_R1))
1193 			dst_r = SLJIT_R1;
1194 		else
1195 			dst_r = SLJIT_R2;
1196 		EMIT_MOV(compiler, dst, dstw, dst_r, 0);
1197 	}
1198 	EMIT_MOV(compiler, dst_r, 0, SLJIT_IMM, 32 + 31);
1199 #else
1200 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG2;
1201 	compiler->mode32 = 0;
1202 	EMIT_MOV(compiler, dst_r, 0, SLJIT_IMM, !(op_flags & SLJIT_INT_OP) ? 64 + 63 : 32 + 31);
1203 	compiler->mode32 = op_flags & SLJIT_INT_OP;
1204 #endif
1205 
1206 	if (cpu_has_cmov == -1)
1207 		get_cpu_features();
1208 
1209 	if (cpu_has_cmov) {
1210 		inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG1, 0);
1211 		FAIL_IF(!inst);
1212 		*inst++ = GROUP_0F;
1213 		*inst = CMOVNE_r_rm;
1214 	} else {
1215 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1216 		inst = (sljit_ub*)ensure_buf(compiler, 1 + 4);
1217 		FAIL_IF(!inst);
1218 		INC_SIZE(4);
1219 
1220 		*inst++ = JE_i8;
1221 		*inst++ = 2;
1222 		*inst++ = MOV_r_rm;
1223 		*inst++ = MOD_REG | (reg_map[dst_r] << 3) | reg_map[TMP_REG1];
1224 #else
1225 		inst = (sljit_ub*)ensure_buf(compiler, 1 + 5);
1226 		FAIL_IF(!inst);
1227 		INC_SIZE(5);
1228 
1229 		*inst++ = JE_i8;
1230 		*inst++ = 3;
1231 		*inst++ = REX_W | (reg_map[dst_r] >= 8 ? REX_R : 0) | (reg_map[TMP_REG1] >= 8 ? REX_B : 0);
1232 		*inst++ = MOV_r_rm;
1233 		*inst++ = MOD_REG | (reg_lmap[dst_r] << 3) | reg_lmap[TMP_REG1];
1234 #endif
1235 	}
1236 
1237 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1238 	inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, 31, dst_r, 0);
1239 #else
1240 	inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, !(op_flags & SLJIT_INT_OP) ? 63 : 31, dst_r, 0);
1241 #endif
1242 	FAIL_IF(!inst);
1243 	*(inst + 1) |= XOR;
1244 
1245 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1246 	if (dst & SLJIT_MEM) {
1247 		inst = emit_x86_instruction(compiler, 1, dst_r, 0, dst, dstw);
1248 		FAIL_IF(!inst);
1249 		*inst = XCHG_r_rm;
1250 	}
1251 #else
1252 	if (dst & SLJIT_MEM)
1253 		EMIT_MOV(compiler, dst, dstw, TMP_REG2, 0);
1254 #endif
1255 	return SLJIT_SUCCESS;
1256 }
1257 
sljit_emit_op1(struct sljit_compiler * compiler,sljit_si op,sljit_si dst,sljit_sw dstw,sljit_si src,sljit_sw srcw)1258 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op1(struct sljit_compiler *compiler, sljit_si op,
1259 	sljit_si dst, sljit_sw dstw,
1260 	sljit_si src, sljit_sw srcw)
1261 {
1262 	sljit_ub* inst;
1263 	sljit_si update = 0;
1264 	sljit_si op_flags = GET_ALL_FLAGS(op);
1265 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1266 	sljit_si dst_is_ereg = 0;
1267 	sljit_si src_is_ereg = 0;
1268 #else
1269 #	define src_is_ereg 0
1270 #endif
1271 
1272 	CHECK_ERROR();
1273 	CHECK(check_sljit_emit_op1(compiler, op, dst, dstw, src, srcw));
1274 	ADJUST_LOCAL_OFFSET(dst, dstw);
1275 	ADJUST_LOCAL_OFFSET(src, srcw);
1276 
1277 	CHECK_EXTRA_REGS(dst, dstw, dst_is_ereg = 1);
1278 	CHECK_EXTRA_REGS(src, srcw, src_is_ereg = 1);
1279 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1280 	compiler->mode32 = op_flags & SLJIT_INT_OP;
1281 #endif
1282 
1283 	op = GET_OPCODE(op);
1284 	if (op >= SLJIT_MOV && op <= SLJIT_MOVU_P) {
1285 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1286 		compiler->mode32 = 0;
1287 #endif
1288 
1289 		if (op_flags & SLJIT_INT_OP) {
1290 			if (FAST_IS_REG(src) && src == dst) {
1291 				if (!TYPE_CAST_NEEDED(op))
1292 					return SLJIT_SUCCESS;
1293 			}
1294 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1295 			if (op == SLJIT_MOV_SI && (src & SLJIT_MEM))
1296 				op = SLJIT_MOV_UI;
1297 			if (op == SLJIT_MOVU_SI && (src & SLJIT_MEM))
1298 				op = SLJIT_MOVU_UI;
1299 			if (op == SLJIT_MOV_UI && (src & SLJIT_IMM))
1300 				op = SLJIT_MOV_SI;
1301 			if (op == SLJIT_MOVU_UI && (src & SLJIT_IMM))
1302 				op = SLJIT_MOVU_SI;
1303 #endif
1304 		}
1305 
1306 		SLJIT_COMPILE_ASSERT(SLJIT_MOV + 8 == SLJIT_MOVU, movu_offset);
1307 		if (op >= SLJIT_MOVU) {
1308 			update = 1;
1309 			op -= 8;
1310 		}
1311 
1312 		if (src & SLJIT_IMM) {
1313 			switch (op) {
1314 			case SLJIT_MOV_UB:
1315 				srcw = (sljit_ub)srcw;
1316 				break;
1317 			case SLJIT_MOV_SB:
1318 				srcw = (sljit_sb)srcw;
1319 				break;
1320 			case SLJIT_MOV_UH:
1321 				srcw = (sljit_uh)srcw;
1322 				break;
1323 			case SLJIT_MOV_SH:
1324 				srcw = (sljit_sh)srcw;
1325 				break;
1326 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1327 			case SLJIT_MOV_UI:
1328 				srcw = (sljit_ui)srcw;
1329 				break;
1330 			case SLJIT_MOV_SI:
1331 				srcw = (sljit_si)srcw;
1332 				break;
1333 #endif
1334 			}
1335 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1336 			if (SLJIT_UNLIKELY(dst_is_ereg))
1337 				return emit_mov(compiler, dst, dstw, src, srcw);
1338 #endif
1339 		}
1340 
1341 		if (SLJIT_UNLIKELY(update) && (src & SLJIT_MEM) && !src_is_ereg && (src & REG_MASK) && (srcw != 0 || (src & OFFS_REG_MASK) != 0)) {
1342 			inst = emit_x86_instruction(compiler, 1, src & REG_MASK, 0, src, srcw);
1343 			FAIL_IF(!inst);
1344 			*inst = LEA_r_m;
1345 			src &= SLJIT_MEM | 0xf;
1346 			srcw = 0;
1347 		}
1348 
1349 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1350 		if (SLJIT_UNLIKELY(dst_is_ereg) && (!(op == SLJIT_MOV || op == SLJIT_MOV_UI || op == SLJIT_MOV_SI || op == SLJIT_MOV_P) || (src & SLJIT_MEM))) {
1351 			SLJIT_ASSERT(dst == SLJIT_MEM1(SLJIT_SP));
1352 			dst = TMP_REG1;
1353 		}
1354 #endif
1355 
1356 		switch (op) {
1357 		case SLJIT_MOV:
1358 		case SLJIT_MOV_P:
1359 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1360 		case SLJIT_MOV_UI:
1361 		case SLJIT_MOV_SI:
1362 #endif
1363 			FAIL_IF(emit_mov(compiler, dst, dstw, src, srcw));
1364 			break;
1365 		case SLJIT_MOV_UB:
1366 			FAIL_IF(emit_mov_byte(compiler, 0, dst, dstw, src, srcw));
1367 			break;
1368 		case SLJIT_MOV_SB:
1369 			FAIL_IF(emit_mov_byte(compiler, 1, dst, dstw, src, srcw));
1370 			break;
1371 		case SLJIT_MOV_UH:
1372 			FAIL_IF(emit_mov_half(compiler, 0, dst, dstw, src, srcw));
1373 			break;
1374 		case SLJIT_MOV_SH:
1375 			FAIL_IF(emit_mov_half(compiler, 1, dst, dstw, src, srcw));
1376 			break;
1377 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1378 		case SLJIT_MOV_UI:
1379 			FAIL_IF(emit_mov_int(compiler, 0, dst, dstw, src, srcw));
1380 			break;
1381 		case SLJIT_MOV_SI:
1382 			FAIL_IF(emit_mov_int(compiler, 1, dst, dstw, src, srcw));
1383 			break;
1384 #endif
1385 		}
1386 
1387 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1388 		if (SLJIT_UNLIKELY(dst_is_ereg) && dst == TMP_REG1)
1389 			return emit_mov(compiler, SLJIT_MEM1(SLJIT_SP), dstw, TMP_REG1, 0);
1390 #endif
1391 
1392 		if (SLJIT_UNLIKELY(update) && (dst & SLJIT_MEM) && (dst & REG_MASK) && (dstw != 0 || (dst & OFFS_REG_MASK) != 0)) {
1393 			inst = emit_x86_instruction(compiler, 1, dst & REG_MASK, 0, dst, dstw);
1394 			FAIL_IF(!inst);
1395 			*inst = LEA_r_m;
1396 		}
1397 		return SLJIT_SUCCESS;
1398 	}
1399 
1400 	if (SLJIT_UNLIKELY(GET_FLAGS(op_flags)))
1401 		compiler->flags_saved = 0;
1402 
1403 	switch (op) {
1404 	case SLJIT_NOT:
1405 		if (SLJIT_UNLIKELY(op_flags & SLJIT_SET_E))
1406 			return emit_not_with_flags(compiler, dst, dstw, src, srcw);
1407 		return emit_unary(compiler, NOT_rm, dst, dstw, src, srcw);
1408 
1409 	case SLJIT_NEG:
1410 		if (SLJIT_UNLIKELY(op_flags & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
1411 			FAIL_IF(emit_save_flags(compiler));
1412 		return emit_unary(compiler, NEG_rm, dst, dstw, src, srcw);
1413 
1414 	case SLJIT_CLZ:
1415 		if (SLJIT_UNLIKELY(op_flags & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
1416 			FAIL_IF(emit_save_flags(compiler));
1417 		return emit_clz(compiler, op_flags, dst, dstw, src, srcw);
1418 	}
1419 
1420 	return SLJIT_SUCCESS;
1421 
1422 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1423 #	undef src_is_ereg
1424 #endif
1425 }
1426 
1427 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1428 
1429 #define BINARY_IMM(op_imm, op_mr, immw, arg, argw) \
1430 	if (IS_HALFWORD(immw) || compiler->mode32) { \
1431 		inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, immw, arg, argw); \
1432 		FAIL_IF(!inst); \
1433 		*(inst + 1) |= (op_imm); \
1434 	} \
1435 	else { \
1436 		FAIL_IF(emit_load_imm64(compiler, TMP_REG2, immw)); \
1437 		inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, arg, argw); \
1438 		FAIL_IF(!inst); \
1439 		*inst = (op_mr); \
1440 	}
1441 
1442 #define BINARY_EAX_IMM(op_eax_imm, immw) \
1443 	FAIL_IF(emit_do_imm32(compiler, (!compiler->mode32) ? REX_W : 0, (op_eax_imm), immw))
1444 
1445 #else
1446 
1447 #define BINARY_IMM(op_imm, op_mr, immw, arg, argw) \
1448 	inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, immw, arg, argw); \
1449 	FAIL_IF(!inst); \
1450 	*(inst + 1) |= (op_imm);
1451 
1452 #define BINARY_EAX_IMM(op_eax_imm, immw) \
1453 	FAIL_IF(emit_do_imm(compiler, (op_eax_imm), immw))
1454 
1455 #endif
1456 
emit_cum_binary(struct sljit_compiler * compiler,sljit_ub op_rm,sljit_ub op_mr,sljit_ub op_imm,sljit_ub op_eax_imm,sljit_si dst,sljit_sw dstw,sljit_si src1,sljit_sw src1w,sljit_si src2,sljit_sw src2w)1457 static sljit_si emit_cum_binary(struct sljit_compiler *compiler,
1458 	sljit_ub op_rm, sljit_ub op_mr, sljit_ub op_imm, sljit_ub op_eax_imm,
1459 	sljit_si dst, sljit_sw dstw,
1460 	sljit_si src1, sljit_sw src1w,
1461 	sljit_si src2, sljit_sw src2w)
1462 {
1463 	sljit_ub* inst;
1464 
1465 	if (dst == SLJIT_UNUSED) {
1466 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1467 		if (src2 & SLJIT_IMM) {
1468 			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
1469 		}
1470 		else {
1471 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
1472 			FAIL_IF(!inst);
1473 			*inst = op_rm;
1474 		}
1475 		return SLJIT_SUCCESS;
1476 	}
1477 
1478 	if (dst == src1 && dstw == src1w) {
1479 		if (src2 & SLJIT_IMM) {
1480 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1481 			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
1482 #else
1483 			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128)) {
1484 #endif
1485 				BINARY_EAX_IMM(op_eax_imm, src2w);
1486 			}
1487 			else {
1488 				BINARY_IMM(op_imm, op_mr, src2w, dst, dstw);
1489 			}
1490 		}
1491 		else if (FAST_IS_REG(dst)) {
1492 			inst = emit_x86_instruction(compiler, 1, dst, dstw, src2, src2w);
1493 			FAIL_IF(!inst);
1494 			*inst = op_rm;
1495 		}
1496 		else if (FAST_IS_REG(src2)) {
1497 			/* Special exception for sljit_emit_op_flags. */
1498 			inst = emit_x86_instruction(compiler, 1, src2, src2w, dst, dstw);
1499 			FAIL_IF(!inst);
1500 			*inst = op_mr;
1501 		}
1502 		else {
1503 			EMIT_MOV(compiler, TMP_REG1, 0, src2, src2w);
1504 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
1505 			FAIL_IF(!inst);
1506 			*inst = op_mr;
1507 		}
1508 		return SLJIT_SUCCESS;
1509 	}
1510 
1511 	/* Only for cumulative operations. */
1512 	if (dst == src2 && dstw == src2w) {
1513 		if (src1 & SLJIT_IMM) {
1514 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1515 			if ((dst == SLJIT_R0) && (src1w > 127 || src1w < -128) && (compiler->mode32 || IS_HALFWORD(src1w))) {
1516 #else
1517 			if ((dst == SLJIT_R0) && (src1w > 127 || src1w < -128)) {
1518 #endif
1519 				BINARY_EAX_IMM(op_eax_imm, src1w);
1520 			}
1521 			else {
1522 				BINARY_IMM(op_imm, op_mr, src1w, dst, dstw);
1523 			}
1524 		}
1525 		else if (FAST_IS_REG(dst)) {
1526 			inst = emit_x86_instruction(compiler, 1, dst, dstw, src1, src1w);
1527 			FAIL_IF(!inst);
1528 			*inst = op_rm;
1529 		}
1530 		else if (FAST_IS_REG(src1)) {
1531 			inst = emit_x86_instruction(compiler, 1, src1, src1w, dst, dstw);
1532 			FAIL_IF(!inst);
1533 			*inst = op_mr;
1534 		}
1535 		else {
1536 			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1537 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
1538 			FAIL_IF(!inst);
1539 			*inst = op_mr;
1540 		}
1541 		return SLJIT_SUCCESS;
1542 	}
1543 
1544 	/* General version. */
1545 	if (FAST_IS_REG(dst)) {
1546 		EMIT_MOV(compiler, dst, 0, src1, src1w);
1547 		if (src2 & SLJIT_IMM) {
1548 			BINARY_IMM(op_imm, op_mr, src2w, dst, 0);
1549 		}
1550 		else {
1551 			inst = emit_x86_instruction(compiler, 1, dst, 0, src2, src2w);
1552 			FAIL_IF(!inst);
1553 			*inst = op_rm;
1554 		}
1555 	}
1556 	else {
1557 		/* This version requires less memory writing. */
1558 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1559 		if (src2 & SLJIT_IMM) {
1560 			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
1561 		}
1562 		else {
1563 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
1564 			FAIL_IF(!inst);
1565 			*inst = op_rm;
1566 		}
1567 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
1568 	}
1569 
1570 	return SLJIT_SUCCESS;
1571 }
1572 
1573 static sljit_si emit_non_cum_binary(struct sljit_compiler *compiler,
1574 	sljit_ub op_rm, sljit_ub op_mr, sljit_ub op_imm, sljit_ub op_eax_imm,
1575 	sljit_si dst, sljit_sw dstw,
1576 	sljit_si src1, sljit_sw src1w,
1577 	sljit_si src2, sljit_sw src2w)
1578 {
1579 	sljit_ub* inst;
1580 
1581 	if (dst == SLJIT_UNUSED) {
1582 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1583 		if (src2 & SLJIT_IMM) {
1584 			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
1585 		}
1586 		else {
1587 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
1588 			FAIL_IF(!inst);
1589 			*inst = op_rm;
1590 		}
1591 		return SLJIT_SUCCESS;
1592 	}
1593 
1594 	if (dst == src1 && dstw == src1w) {
1595 		if (src2 & SLJIT_IMM) {
1596 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1597 			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
1598 #else
1599 			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128)) {
1600 #endif
1601 				BINARY_EAX_IMM(op_eax_imm, src2w);
1602 			}
1603 			else {
1604 				BINARY_IMM(op_imm, op_mr, src2w, dst, dstw);
1605 			}
1606 		}
1607 		else if (FAST_IS_REG(dst)) {
1608 			inst = emit_x86_instruction(compiler, 1, dst, dstw, src2, src2w);
1609 			FAIL_IF(!inst);
1610 			*inst = op_rm;
1611 		}
1612 		else if (FAST_IS_REG(src2)) {
1613 			inst = emit_x86_instruction(compiler, 1, src2, src2w, dst, dstw);
1614 			FAIL_IF(!inst);
1615 			*inst = op_mr;
1616 		}
1617 		else {
1618 			EMIT_MOV(compiler, TMP_REG1, 0, src2, src2w);
1619 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
1620 			FAIL_IF(!inst);
1621 			*inst = op_mr;
1622 		}
1623 		return SLJIT_SUCCESS;
1624 	}
1625 
1626 	/* General version. */
1627 	if (FAST_IS_REG(dst) && dst != src2) {
1628 		EMIT_MOV(compiler, dst, 0, src1, src1w);
1629 		if (src2 & SLJIT_IMM) {
1630 			BINARY_IMM(op_imm, op_mr, src2w, dst, 0);
1631 		}
1632 		else {
1633 			inst = emit_x86_instruction(compiler, 1, dst, 0, src2, src2w);
1634 			FAIL_IF(!inst);
1635 			*inst = op_rm;
1636 		}
1637 	}
1638 	else {
1639 		/* This version requires less memory writing. */
1640 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1641 		if (src2 & SLJIT_IMM) {
1642 			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
1643 		}
1644 		else {
1645 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
1646 			FAIL_IF(!inst);
1647 			*inst = op_rm;
1648 		}
1649 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
1650 	}
1651 
1652 	return SLJIT_SUCCESS;
1653 }
1654 
1655 static sljit_si emit_mul(struct sljit_compiler *compiler,
1656 	sljit_si dst, sljit_sw dstw,
1657 	sljit_si src1, sljit_sw src1w,
1658 	sljit_si src2, sljit_sw src2w)
1659 {
1660 	sljit_ub* inst;
1661 	sljit_si dst_r;
1662 
1663 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
1664 
1665 	/* Register destination. */
1666 	if (dst_r == src1 && !(src2 & SLJIT_IMM)) {
1667 		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src2, src2w);
1668 		FAIL_IF(!inst);
1669 		*inst++ = GROUP_0F;
1670 		*inst = IMUL_r_rm;
1671 	}
1672 	else if (dst_r == src2 && !(src1 & SLJIT_IMM)) {
1673 		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src1, src1w);
1674 		FAIL_IF(!inst);
1675 		*inst++ = GROUP_0F;
1676 		*inst = IMUL_r_rm;
1677 	}
1678 	else if (src1 & SLJIT_IMM) {
1679 		if (src2 & SLJIT_IMM) {
1680 			EMIT_MOV(compiler, dst_r, 0, SLJIT_IMM, src2w);
1681 			src2 = dst_r;
1682 			src2w = 0;
1683 		}
1684 
1685 		if (src1w <= 127 && src1w >= -128) {
1686 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
1687 			FAIL_IF(!inst);
1688 			*inst = IMUL_r_rm_i8;
1689 			inst = (sljit_ub*)ensure_buf(compiler, 1 + 1);
1690 			FAIL_IF(!inst);
1691 			INC_SIZE(1);
1692 			*inst = (sljit_sb)src1w;
1693 		}
1694 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1695 		else {
1696 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
1697 			FAIL_IF(!inst);
1698 			*inst = IMUL_r_rm_i32;
1699 			inst = (sljit_ub*)ensure_buf(compiler, 1 + 4);
1700 			FAIL_IF(!inst);
1701 			INC_SIZE(4);
1702 			*(sljit_sw*)inst = src1w;
1703 		}
1704 #else
1705 		else if (IS_HALFWORD(src1w)) {
1706 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
1707 			FAIL_IF(!inst);
1708 			*inst = IMUL_r_rm_i32;
1709 			inst = (sljit_ub*)ensure_buf(compiler, 1 + 4);
1710 			FAIL_IF(!inst);
1711 			INC_SIZE(4);
1712 			*(sljit_si*)inst = (sljit_si)src1w;
1713 		}
1714 		else {
1715 			EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_IMM, src1w);
1716 			if (dst_r != src2)
1717 				EMIT_MOV(compiler, dst_r, 0, src2, src2w);
1718 			inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG2, 0);
1719 			FAIL_IF(!inst);
1720 			*inst++ = GROUP_0F;
1721 			*inst = IMUL_r_rm;
1722 		}
1723 #endif
1724 	}
1725 	else if (src2 & SLJIT_IMM) {
1726 		/* Note: src1 is NOT immediate. */
1727 
1728 		if (src2w <= 127 && src2w >= -128) {
1729 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
1730 			FAIL_IF(!inst);
1731 			*inst = IMUL_r_rm_i8;
1732 			inst = (sljit_ub*)ensure_buf(compiler, 1 + 1);
1733 			FAIL_IF(!inst);
1734 			INC_SIZE(1);
1735 			*inst = (sljit_sb)src2w;
1736 		}
1737 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1738 		else {
1739 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
1740 			FAIL_IF(!inst);
1741 			*inst = IMUL_r_rm_i32;
1742 			inst = (sljit_ub*)ensure_buf(compiler, 1 + 4);
1743 			FAIL_IF(!inst);
1744 			INC_SIZE(4);
1745 			*(sljit_sw*)inst = src2w;
1746 		}
1747 #else
1748 		else if (IS_HALFWORD(src2w)) {
1749 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
1750 			FAIL_IF(!inst);
1751 			*inst = IMUL_r_rm_i32;
1752 			inst = (sljit_ub*)ensure_buf(compiler, 1 + 4);
1753 			FAIL_IF(!inst);
1754 			INC_SIZE(4);
1755 			*(sljit_si*)inst = (sljit_si)src2w;
1756 		}
1757 		else {
1758 			EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_IMM, src2w);
1759 			if (dst_r != src1)
1760 				EMIT_MOV(compiler, dst_r, 0, src1, src1w);
1761 			inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG2, 0);
1762 			FAIL_IF(!inst);
1763 			*inst++ = GROUP_0F;
1764 			*inst = IMUL_r_rm;
1765 		}
1766 #endif
1767 	}
1768 	else {
1769 		/* Neither argument is immediate. */
1770 		if (ADDRESSING_DEPENDS_ON(src2, dst_r))
1771 			dst_r = TMP_REG1;
1772 		EMIT_MOV(compiler, dst_r, 0, src1, src1w);
1773 		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src2, src2w);
1774 		FAIL_IF(!inst);
1775 		*inst++ = GROUP_0F;
1776 		*inst = IMUL_r_rm;
1777 	}
1778 
1779 	if (dst_r == TMP_REG1)
1780 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
1781 
1782 	return SLJIT_SUCCESS;
1783 }
1784 
1785 static sljit_si emit_lea_binary(struct sljit_compiler *compiler, sljit_si keep_flags,
1786 	sljit_si dst, sljit_sw dstw,
1787 	sljit_si src1, sljit_sw src1w,
1788 	sljit_si src2, sljit_sw src2w)
1789 {
1790 	sljit_ub* inst;
1791 	sljit_si dst_r, done = 0;
1792 
1793 	/* These cases better be left to handled by normal way. */
1794 	if (!keep_flags) {
1795 		if (dst == src1 && dstw == src1w)
1796 			return SLJIT_ERR_UNSUPPORTED;
1797 		if (dst == src2 && dstw == src2w)
1798 			return SLJIT_ERR_UNSUPPORTED;
1799 	}
1800 
1801 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
1802 
1803 	if (FAST_IS_REG(src1)) {
1804 		if (FAST_IS_REG(src2)) {
1805 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM2(src1, src2), 0);
1806 			FAIL_IF(!inst);
1807 			*inst = LEA_r_m;
1808 			done = 1;
1809 		}
1810 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1811 		if ((src2 & SLJIT_IMM) && (compiler->mode32 || IS_HALFWORD(src2w))) {
1812 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src1), (sljit_si)src2w);
1813 #else
1814 		if (src2 & SLJIT_IMM) {
1815 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src1), src2w);
1816 #endif
1817 			FAIL_IF(!inst);
1818 			*inst = LEA_r_m;
1819 			done = 1;
1820 		}
1821 	}
1822 	else if (FAST_IS_REG(src2)) {
1823 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1824 		if ((src1 & SLJIT_IMM) && (compiler->mode32 || IS_HALFWORD(src1w))) {
1825 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src2), (sljit_si)src1w);
1826 #else
1827 		if (src1 & SLJIT_IMM) {
1828 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src2), src1w);
1829 #endif
1830 			FAIL_IF(!inst);
1831 			*inst = LEA_r_m;
1832 			done = 1;
1833 		}
1834 	}
1835 
1836 	if (done) {
1837 		if (dst_r == TMP_REG1)
1838 			return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
1839 		return SLJIT_SUCCESS;
1840 	}
1841 	return SLJIT_ERR_UNSUPPORTED;
1842 }
1843 
1844 static sljit_si emit_cmp_binary(struct sljit_compiler *compiler,
1845 	sljit_si src1, sljit_sw src1w,
1846 	sljit_si src2, sljit_sw src2w)
1847 {
1848 	sljit_ub* inst;
1849 
1850 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1851 	if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
1852 #else
1853 	if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128)) {
1854 #endif
1855 		BINARY_EAX_IMM(CMP_EAX_i32, src2w);
1856 		return SLJIT_SUCCESS;
1857 	}
1858 
1859 	if (FAST_IS_REG(src1)) {
1860 		if (src2 & SLJIT_IMM) {
1861 			BINARY_IMM(CMP, CMP_rm_r, src2w, src1, 0);
1862 		}
1863 		else {
1864 			inst = emit_x86_instruction(compiler, 1, src1, 0, src2, src2w);
1865 			FAIL_IF(!inst);
1866 			*inst = CMP_r_rm;
1867 		}
1868 		return SLJIT_SUCCESS;
1869 	}
1870 
1871 	if (FAST_IS_REG(src2) && !(src1 & SLJIT_IMM)) {
1872 		inst = emit_x86_instruction(compiler, 1, src2, 0, src1, src1w);
1873 		FAIL_IF(!inst);
1874 		*inst = CMP_rm_r;
1875 		return SLJIT_SUCCESS;
1876 	}
1877 
1878 	if (src2 & SLJIT_IMM) {
1879 		if (src1 & SLJIT_IMM) {
1880 			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1881 			src1 = TMP_REG1;
1882 			src1w = 0;
1883 		}
1884 		BINARY_IMM(CMP, CMP_rm_r, src2w, src1, src1w);
1885 	}
1886 	else {
1887 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1888 		inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
1889 		FAIL_IF(!inst);
1890 		*inst = CMP_r_rm;
1891 	}
1892 	return SLJIT_SUCCESS;
1893 }
1894 
1895 static sljit_si emit_test_binary(struct sljit_compiler *compiler,
1896 	sljit_si src1, sljit_sw src1w,
1897 	sljit_si src2, sljit_sw src2w)
1898 {
1899 	sljit_ub* inst;
1900 
1901 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1902 	if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
1903 #else
1904 	if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128)) {
1905 #endif
1906 		BINARY_EAX_IMM(TEST_EAX_i32, src2w);
1907 		return SLJIT_SUCCESS;
1908 	}
1909 
1910 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1911 	if (src2 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src1w > 127 || src1w < -128) && (compiler->mode32 || IS_HALFWORD(src1w))) {
1912 #else
1913 	if (src2 == SLJIT_R0 && (src1 & SLJIT_IMM) && (src1w > 127 || src1w < -128)) {
1914 #endif
1915 		BINARY_EAX_IMM(TEST_EAX_i32, src1w);
1916 		return SLJIT_SUCCESS;
1917 	}
1918 
1919 	if (!(src1 & SLJIT_IMM)) {
1920 		if (src2 & SLJIT_IMM) {
1921 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1922 			if (IS_HALFWORD(src2w) || compiler->mode32) {
1923 				inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, src1w);
1924 				FAIL_IF(!inst);
1925 				*inst = GROUP_F7;
1926 			}
1927 			else {
1928 				FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src2w));
1929 				inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, src1, src1w);
1930 				FAIL_IF(!inst);
1931 				*inst = TEST_rm_r;
1932 			}
1933 #else
1934 			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, src1w);
1935 			FAIL_IF(!inst);
1936 			*inst = GROUP_F7;
1937 #endif
1938 			return SLJIT_SUCCESS;
1939 		}
1940 		else if (FAST_IS_REG(src1)) {
1941 			inst = emit_x86_instruction(compiler, 1, src1, 0, src2, src2w);
1942 			FAIL_IF(!inst);
1943 			*inst = TEST_rm_r;
1944 			return SLJIT_SUCCESS;
1945 		}
1946 	}
1947 
1948 	if (!(src2 & SLJIT_IMM)) {
1949 		if (src1 & SLJIT_IMM) {
1950 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1951 			if (IS_HALFWORD(src1w) || compiler->mode32) {
1952 				inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src1w, src2, src2w);
1953 				FAIL_IF(!inst);
1954 				*inst = GROUP_F7;
1955 			}
1956 			else {
1957 				FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src1w));
1958 				inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, src2, src2w);
1959 				FAIL_IF(!inst);
1960 				*inst = TEST_rm_r;
1961 			}
1962 #else
1963 			inst = emit_x86_instruction(compiler, 1, src1, src1w, src2, src2w);
1964 			FAIL_IF(!inst);
1965 			*inst = GROUP_F7;
1966 #endif
1967 			return SLJIT_SUCCESS;
1968 		}
1969 		else if (FAST_IS_REG(src2)) {
1970 			inst = emit_x86_instruction(compiler, 1, src2, 0, src1, src1w);
1971 			FAIL_IF(!inst);
1972 			*inst = TEST_rm_r;
1973 			return SLJIT_SUCCESS;
1974 		}
1975 	}
1976 
1977 	EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1978 	if (src2 & SLJIT_IMM) {
1979 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1980 		if (IS_HALFWORD(src2w) || compiler->mode32) {
1981 			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, TMP_REG1, 0);
1982 			FAIL_IF(!inst);
1983 			*inst = GROUP_F7;
1984 		}
1985 		else {
1986 			FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src2w));
1987 			inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, TMP_REG1, 0);
1988 			FAIL_IF(!inst);
1989 			*inst = TEST_rm_r;
1990 		}
1991 #else
1992 		inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, TMP_REG1, 0);
1993 		FAIL_IF(!inst);
1994 		*inst = GROUP_F7;
1995 #endif
1996 	}
1997 	else {
1998 		inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
1999 		FAIL_IF(!inst);
2000 		*inst = TEST_rm_r;
2001 	}
2002 	return SLJIT_SUCCESS;
2003 }
2004 
2005 static sljit_si emit_shift(struct sljit_compiler *compiler,
2006 	sljit_ub mode,
2007 	sljit_si dst, sljit_sw dstw,
2008 	sljit_si src1, sljit_sw src1w,
2009 	sljit_si src2, sljit_sw src2w)
2010 {
2011 	sljit_ub* inst;
2012 
2013 	if ((src2 & SLJIT_IMM) || (src2 == SLJIT_PREF_SHIFT_REG)) {
2014 		if (dst == src1 && dstw == src1w) {
2015 			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, dst, dstw);
2016 			FAIL_IF(!inst);
2017 			*inst |= mode;
2018 			return SLJIT_SUCCESS;
2019 		}
2020 		if (dst == SLJIT_UNUSED) {
2021 			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2022 			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, TMP_REG1, 0);
2023 			FAIL_IF(!inst);
2024 			*inst |= mode;
2025 			return SLJIT_SUCCESS;
2026 		}
2027 		if (dst == SLJIT_PREF_SHIFT_REG && src2 == SLJIT_PREF_SHIFT_REG) {
2028 			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2029 			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2030 			FAIL_IF(!inst);
2031 			*inst |= mode;
2032 			EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2033 			return SLJIT_SUCCESS;
2034 		}
2035 		if (FAST_IS_REG(dst)) {
2036 			EMIT_MOV(compiler, dst, 0, src1, src1w);
2037 			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, dst, 0);
2038 			FAIL_IF(!inst);
2039 			*inst |= mode;
2040 			return SLJIT_SUCCESS;
2041 		}
2042 
2043 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2044 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, TMP_REG1, 0);
2045 		FAIL_IF(!inst);
2046 		*inst |= mode;
2047 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
2048 		return SLJIT_SUCCESS;
2049 	}
2050 
2051 	if (dst == SLJIT_PREF_SHIFT_REG) {
2052 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2053 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
2054 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2055 		FAIL_IF(!inst);
2056 		*inst |= mode;
2057 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2058 	}
2059 	else if (FAST_IS_REG(dst) && dst != src2 && !ADDRESSING_DEPENDS_ON(src2, dst)) {
2060 		if (src1 != dst)
2061 			EMIT_MOV(compiler, dst, 0, src1, src1w);
2062 		EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_PREF_SHIFT_REG, 0);
2063 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
2064 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, dst, 0);
2065 		FAIL_IF(!inst);
2066 		*inst |= mode;
2067 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2068 	}
2069 	else {
2070 		/* This case is really difficult, since ecx itself may used for
2071 		   addressing, and we must ensure to work even in that case. */
2072 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2073 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2074 		EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_PREF_SHIFT_REG, 0);
2075 #else
2076 		/* [esp+0] contains the flags. */
2077 		EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_sw), SLJIT_PREF_SHIFT_REG, 0);
2078 #endif
2079 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
2080 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2081 		FAIL_IF(!inst);
2082 		*inst |= mode;
2083 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2084 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG2, 0);
2085 #else
2086 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_sw));
2087 #endif
2088 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
2089 	}
2090 
2091 	return SLJIT_SUCCESS;
2092 }
2093 
2094 static sljit_si emit_shift_with_flags(struct sljit_compiler *compiler,
2095 	sljit_ub mode, sljit_si set_flags,
2096 	sljit_si dst, sljit_sw dstw,
2097 	sljit_si src1, sljit_sw src1w,
2098 	sljit_si src2, sljit_sw src2w)
2099 {
2100 	/* The CPU does not set flags if the shift count is 0. */
2101 	if (src2 & SLJIT_IMM) {
2102 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2103 		if ((src2w & 0x3f) != 0 || (compiler->mode32 && (src2w & 0x1f) != 0))
2104 			return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);
2105 #else
2106 		if ((src2w & 0x1f) != 0)
2107 			return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);
2108 #endif
2109 		if (!set_flags)
2110 			return emit_mov(compiler, dst, dstw, src1, src1w);
2111 		/* OR dst, src, 0 */
2112 		return emit_cum_binary(compiler, OR_r_rm, OR_rm_r, OR, OR_EAX_i32,
2113 			dst, dstw, src1, src1w, SLJIT_IMM, 0);
2114 	}
2115 
2116 	if (!set_flags)
2117 		return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);
2118 
2119 	if (!FAST_IS_REG(dst))
2120 		FAIL_IF(emit_cmp_binary(compiler, src1, src1w, SLJIT_IMM, 0));
2121 
2122 	FAIL_IF(emit_shift(compiler,mode, dst, dstw, src1, src1w, src2, src2w));
2123 
2124 	if (FAST_IS_REG(dst))
2125 		return emit_cmp_binary(compiler, dst, dstw, SLJIT_IMM, 0);
2126 	return SLJIT_SUCCESS;
2127 }
2128 
2129 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op2(struct sljit_compiler *compiler, sljit_si op,
2130 	sljit_si dst, sljit_sw dstw,
2131 	sljit_si src1, sljit_sw src1w,
2132 	sljit_si src2, sljit_sw src2w)
2133 {
2134 	CHECK_ERROR();
2135 	CHECK(check_sljit_emit_op2(compiler, op, dst, dstw, src1, src1w, src2, src2w));
2136 	ADJUST_LOCAL_OFFSET(dst, dstw);
2137 	ADJUST_LOCAL_OFFSET(src1, src1w);
2138 	ADJUST_LOCAL_OFFSET(src2, src2w);
2139 
2140 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
2141 	CHECK_EXTRA_REGS(src1, src1w, (void)0);
2142 	CHECK_EXTRA_REGS(src2, src2w, (void)0);
2143 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2144 	compiler->mode32 = op & SLJIT_INT_OP;
2145 #endif
2146 
2147 	if (GET_OPCODE(op) >= SLJIT_MUL) {
2148 		if (SLJIT_UNLIKELY(GET_FLAGS(op)))
2149 			compiler->flags_saved = 0;
2150 		else if (SLJIT_UNLIKELY(op & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
2151 			FAIL_IF(emit_save_flags(compiler));
2152 	}
2153 
2154 	switch (GET_OPCODE(op)) {
2155 	case SLJIT_ADD:
2156 		if (!GET_FLAGS(op)) {
2157 			if (emit_lea_binary(compiler, op & SLJIT_KEEP_FLAGS, dst, dstw, src1, src1w, src2, src2w) != SLJIT_ERR_UNSUPPORTED)
2158 				return compiler->error;
2159 		}
2160 		else
2161 			compiler->flags_saved = 0;
2162 		if (SLJIT_UNLIKELY(op & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
2163 			FAIL_IF(emit_save_flags(compiler));
2164 		return emit_cum_binary(compiler, ADD_r_rm, ADD_rm_r, ADD, ADD_EAX_i32,
2165 			dst, dstw, src1, src1w, src2, src2w);
2166 	case SLJIT_ADDC:
2167 		if (SLJIT_UNLIKELY(compiler->flags_saved)) /* C flag must be restored. */
2168 			FAIL_IF(emit_restore_flags(compiler, 1));
2169 		else if (SLJIT_UNLIKELY(op & SLJIT_KEEP_FLAGS))
2170 			FAIL_IF(emit_save_flags(compiler));
2171 		if (SLJIT_UNLIKELY(GET_FLAGS(op)))
2172 			compiler->flags_saved = 0;
2173 		return emit_cum_binary(compiler, ADC_r_rm, ADC_rm_r, ADC, ADC_EAX_i32,
2174 			dst, dstw, src1, src1w, src2, src2w);
2175 	case SLJIT_SUB:
2176 		if (!GET_FLAGS(op)) {
2177 			if ((src2 & SLJIT_IMM) && emit_lea_binary(compiler, op & SLJIT_KEEP_FLAGS, dst, dstw, src1, src1w, SLJIT_IMM, -src2w) != SLJIT_ERR_UNSUPPORTED)
2178 				return compiler->error;
2179 		}
2180 		else
2181 			compiler->flags_saved = 0;
2182 		if (SLJIT_UNLIKELY(op & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
2183 			FAIL_IF(emit_save_flags(compiler));
2184 		if (dst == SLJIT_UNUSED)
2185 			return emit_cmp_binary(compiler, src1, src1w, src2, src2w);
2186 		return emit_non_cum_binary(compiler, SUB_r_rm, SUB_rm_r, SUB, SUB_EAX_i32,
2187 			dst, dstw, src1, src1w, src2, src2w);
2188 	case SLJIT_SUBC:
2189 		if (SLJIT_UNLIKELY(compiler->flags_saved)) /* C flag must be restored. */
2190 			FAIL_IF(emit_restore_flags(compiler, 1));
2191 		else if (SLJIT_UNLIKELY(op & SLJIT_KEEP_FLAGS))
2192 			FAIL_IF(emit_save_flags(compiler));
2193 		if (SLJIT_UNLIKELY(GET_FLAGS(op)))
2194 			compiler->flags_saved = 0;
2195 		return emit_non_cum_binary(compiler, SBB_r_rm, SBB_rm_r, SBB, SBB_EAX_i32,
2196 			dst, dstw, src1, src1w, src2, src2w);
2197 	case SLJIT_MUL:
2198 		return emit_mul(compiler, dst, dstw, src1, src1w, src2, src2w);
2199 	case SLJIT_AND:
2200 		if (dst == SLJIT_UNUSED)
2201 			return emit_test_binary(compiler, src1, src1w, src2, src2w);
2202 		return emit_cum_binary(compiler, AND_r_rm, AND_rm_r, AND, AND_EAX_i32,
2203 			dst, dstw, src1, src1w, src2, src2w);
2204 	case SLJIT_OR:
2205 		return emit_cum_binary(compiler, OR_r_rm, OR_rm_r, OR, OR_EAX_i32,
2206 			dst, dstw, src1, src1w, src2, src2w);
2207 	case SLJIT_XOR:
2208 		return emit_cum_binary(compiler, XOR_r_rm, XOR_rm_r, XOR, XOR_EAX_i32,
2209 			dst, dstw, src1, src1w, src2, src2w);
2210 	case SLJIT_SHL:
2211 		return emit_shift_with_flags(compiler, SHL, GET_FLAGS(op),
2212 			dst, dstw, src1, src1w, src2, src2w);
2213 	case SLJIT_LSHR:
2214 		return emit_shift_with_flags(compiler, SHR, GET_FLAGS(op),
2215 			dst, dstw, src1, src1w, src2, src2w);
2216 	case SLJIT_ASHR:
2217 		return emit_shift_with_flags(compiler, SAR, GET_FLAGS(op),
2218 			dst, dstw, src1, src1w, src2, src2w);
2219 	}
2220 
2221 	return SLJIT_SUCCESS;
2222 }
2223 
2224 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_get_register_index(sljit_si reg)
2225 {
2226 	CHECK_REG_INDEX(check_sljit_get_register_index(reg));
2227 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2228 	if (reg >= SLJIT_R3 && reg <= SLJIT_R6)
2229 		return -1;
2230 #endif
2231 	return reg_map[reg];
2232 }
2233 
2234 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_get_float_register_index(sljit_si reg)
2235 {
2236 	CHECK_REG_INDEX(check_sljit_get_float_register_index(reg));
2237 	return reg;
2238 }
2239 
2240 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op_custom(struct sljit_compiler *compiler,
2241 	void *instruction, sljit_si size)
2242 {
2243 	sljit_ub *inst;
2244 
2245 	CHECK_ERROR();
2246 	CHECK(check_sljit_emit_op_custom(compiler, instruction, size));
2247 
2248 	inst = (sljit_ub*)ensure_buf(compiler, 1 + size);
2249 	FAIL_IF(!inst);
2250 	INC_SIZE(size);
2251 	SLJIT_MEMMOVE(inst, instruction, size);
2252 	return SLJIT_SUCCESS;
2253 }
2254 
2255 /* --------------------------------------------------------------------- */
2256 /*  Floating point operators                                             */
2257 /* --------------------------------------------------------------------- */
2258 
2259 /* Alignment + 2 * 16 bytes. */
2260 static sljit_si sse2_data[3 + (4 + 4) * 2];
2261 static sljit_si *sse2_buffer;
2262 
2263 static void init_compiler(void)
2264 {
2265 	sse2_buffer = (sljit_si*)(((sljit_uw)sse2_data + 15) & ~0xf);
2266 	/* Single precision constants. */
2267 	sse2_buffer[0] = 0x80000000;
2268 	sse2_buffer[4] = 0x7fffffff;
2269 	/* Double precision constants. */
2270 	sse2_buffer[8] = 0;
2271 	sse2_buffer[9] = 0x80000000;
2272 	sse2_buffer[12] = 0xffffffff;
2273 	sse2_buffer[13] = 0x7fffffff;
2274 }
2275 
2276 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_is_fpu_available(void)
2277 {
2278 #ifdef SLJIT_IS_FPU_AVAILABLE
2279 	return SLJIT_IS_FPU_AVAILABLE;
2280 #elif (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
2281 	if (cpu_has_sse2 == -1)
2282 		get_cpu_features();
2283 	return cpu_has_sse2;
2284 #else /* SLJIT_DETECT_SSE2 */
2285 	return 1;
2286 #endif /* SLJIT_DETECT_SSE2 */
2287 }
2288 
2289 static sljit_si emit_sse2(struct sljit_compiler *compiler, sljit_ub opcode,
2290 	sljit_si single, sljit_si xmm1, sljit_si xmm2, sljit_sw xmm2w)
2291 {
2292 	sljit_ub *inst;
2293 
2294 	inst = emit_x86_instruction(compiler, 2 | (single ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, xmm1, 0, xmm2, xmm2w);
2295 	FAIL_IF(!inst);
2296 	*inst++ = GROUP_0F;
2297 	*inst = opcode;
2298 	return SLJIT_SUCCESS;
2299 }
2300 
2301 static sljit_si emit_sse2_logic(struct sljit_compiler *compiler, sljit_ub opcode,
2302 	sljit_si pref66, sljit_si xmm1, sljit_si xmm2, sljit_sw xmm2w)
2303 {
2304 	sljit_ub *inst;
2305 
2306 	inst = emit_x86_instruction(compiler, 2 | (pref66 ? EX86_PREF_66 : 0) | EX86_SSE2, xmm1, 0, xmm2, xmm2w);
2307 	FAIL_IF(!inst);
2308 	*inst++ = GROUP_0F;
2309 	*inst = opcode;
2310 	return SLJIT_SUCCESS;
2311 }
2312 
2313 static SLJIT_INLINE sljit_si emit_sse2_load(struct sljit_compiler *compiler,
2314 	sljit_si single, sljit_si dst, sljit_si src, sljit_sw srcw)
2315 {
2316 	return emit_sse2(compiler, MOVSD_x_xm, single, dst, src, srcw);
2317 }
2318 
2319 static SLJIT_INLINE sljit_si emit_sse2_store(struct sljit_compiler *compiler,
2320 	sljit_si single, sljit_si dst, sljit_sw dstw, sljit_si src)
2321 {
2322 	return emit_sse2(compiler, MOVSD_xm_x, single, src, dst, dstw);
2323 }
2324 
2325 static SLJIT_INLINE sljit_si sljit_emit_fop1_convw_fromd(struct sljit_compiler *compiler, sljit_si op,
2326 	sljit_si dst, sljit_sw dstw,
2327 	sljit_si src, sljit_sw srcw)
2328 {
2329 	sljit_si dst_r = SLOW_IS_REG(dst) ? dst : TMP_REG1;
2330 	sljit_ub *inst;
2331 
2332 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2333 	if (GET_OPCODE(op) == SLJIT_CONVW_FROMD)
2334 		compiler->mode32 = 0;
2335 #endif
2336 
2337 	inst = emit_x86_instruction(compiler, 2 | ((op & SLJIT_SINGLE_OP) ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2_OP2, dst_r, 0, src, srcw);
2338 	FAIL_IF(!inst);
2339 	*inst++ = GROUP_0F;
2340 	*inst = CVTTSD2SI_r_xm;
2341 
2342 	if (dst_r == TMP_REG1 && dst != SLJIT_UNUSED)
2343 		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
2344 	return SLJIT_SUCCESS;
2345 }
2346 
2347 static SLJIT_INLINE sljit_si sljit_emit_fop1_convd_fromw(struct sljit_compiler *compiler, sljit_si op,
2348 	sljit_si dst, sljit_sw dstw,
2349 	sljit_si src, sljit_sw srcw)
2350 {
2351 	sljit_si dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG;
2352 	sljit_ub *inst;
2353 
2354 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2355 	if (GET_OPCODE(op) == SLJIT_CONVD_FROMW)
2356 		compiler->mode32 = 0;
2357 #endif
2358 
2359 	if (src & SLJIT_IMM) {
2360 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2361 		if (GET_OPCODE(op) == SLJIT_CONVD_FROMI)
2362 			srcw = (sljit_si)srcw;
2363 #endif
2364 		EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
2365 		src = TMP_REG1;
2366 		srcw = 0;
2367 	}
2368 
2369 	inst = emit_x86_instruction(compiler, 2 | ((op & SLJIT_SINGLE_OP) ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2_OP1, dst_r, 0, src, srcw);
2370 	FAIL_IF(!inst);
2371 	*inst++ = GROUP_0F;
2372 	*inst = CVTSI2SD_x_rm;
2373 
2374 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2375 	compiler->mode32 = 1;
2376 #endif
2377 	if (dst_r == TMP_FREG)
2378 		return emit_sse2_store(compiler, op & SLJIT_SINGLE_OP, dst, dstw, TMP_FREG);
2379 	return SLJIT_SUCCESS;
2380 }
2381 
2382 static SLJIT_INLINE sljit_si sljit_emit_fop1_cmp(struct sljit_compiler *compiler, sljit_si op,
2383 	sljit_si src1, sljit_sw src1w,
2384 	sljit_si src2, sljit_sw src2w)
2385 {
2386 	compiler->flags_saved = 0;
2387 	if (!FAST_IS_REG(src1)) {
2388 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_SINGLE_OP, TMP_FREG, src1, src1w));
2389 		src1 = TMP_FREG;
2390 	}
2391 	return emit_sse2_logic(compiler, UCOMISD_x_xm, !(op & SLJIT_SINGLE_OP), src1, src2, src2w);
2392 }
2393 
2394 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_fop1(struct sljit_compiler *compiler, sljit_si op,
2395 	sljit_si dst, sljit_sw dstw,
2396 	sljit_si src, sljit_sw srcw)
2397 {
2398 	sljit_si dst_r;
2399 
2400 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2401 	compiler->mode32 = 1;
2402 #endif
2403 
2404 	CHECK_ERROR();
2405 	SELECT_FOP1_OPERATION_WITH_CHECKS(compiler, op, dst, dstw, src, srcw);
2406 
2407 	if (GET_OPCODE(op) == SLJIT_DMOV) {
2408 		if (FAST_IS_REG(dst))
2409 			return emit_sse2_load(compiler, op & SLJIT_SINGLE_OP, dst, src, srcw);
2410 		if (FAST_IS_REG(src))
2411 			return emit_sse2_store(compiler, op & SLJIT_SINGLE_OP, dst, dstw, src);
2412 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_SINGLE_OP, TMP_FREG, src, srcw));
2413 		return emit_sse2_store(compiler, op & SLJIT_SINGLE_OP, dst, dstw, TMP_FREG);
2414 	}
2415 
2416 	if (GET_OPCODE(op) == SLJIT_CONVD_FROMS) {
2417 		dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG;
2418 		if (FAST_IS_REG(src)) {
2419 			/* We overwrite the high bits of source. From SLJIT point of view,
2420 			   this is not an issue.
2421 			   Note: In SSE3, we could also use MOVDDUP and MOVSLDUP. */
2422 			FAIL_IF(emit_sse2_logic(compiler, UNPCKLPD_x_xm, op & SLJIT_SINGLE_OP, src, src, 0));
2423 		}
2424 		else {
2425 			FAIL_IF(emit_sse2_load(compiler, !(op & SLJIT_SINGLE_OP), TMP_FREG, src, srcw));
2426 			src = TMP_FREG;
2427 		}
2428 
2429 		FAIL_IF(emit_sse2_logic(compiler, CVTPD2PS_x_xm, op & SLJIT_SINGLE_OP, dst_r, src, 0));
2430 		if (dst_r == TMP_FREG)
2431 			return emit_sse2_store(compiler, op & SLJIT_SINGLE_OP, dst, dstw, TMP_FREG);
2432 		return SLJIT_SUCCESS;
2433 	}
2434 
2435 	if (SLOW_IS_REG(dst)) {
2436 		dst_r = dst;
2437 		if (dst != src)
2438 			FAIL_IF(emit_sse2_load(compiler, op & SLJIT_SINGLE_OP, dst_r, src, srcw));
2439 	}
2440 	else {
2441 		dst_r = TMP_FREG;
2442 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_SINGLE_OP, dst_r, src, srcw));
2443 	}
2444 
2445 	switch (GET_OPCODE(op)) {
2446 	case SLJIT_DNEG:
2447 		FAIL_IF(emit_sse2_logic(compiler, XORPD_x_xm, 1, dst_r, SLJIT_MEM0(), (sljit_sw)(op & SLJIT_SINGLE_OP ? sse2_buffer : sse2_buffer + 8)));
2448 		break;
2449 
2450 	case SLJIT_DABS:
2451 		FAIL_IF(emit_sse2_logic(compiler, ANDPD_x_xm, 1, dst_r, SLJIT_MEM0(), (sljit_sw)(op & SLJIT_SINGLE_OP ? sse2_buffer + 4 : sse2_buffer + 12)));
2452 		break;
2453 	}
2454 
2455 	if (dst_r == TMP_FREG)
2456 		return emit_sse2_store(compiler, op & SLJIT_SINGLE_OP, dst, dstw, TMP_FREG);
2457 	return SLJIT_SUCCESS;
2458 }
2459 
2460 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_fop2(struct sljit_compiler *compiler, sljit_si op,
2461 	sljit_si dst, sljit_sw dstw,
2462 	sljit_si src1, sljit_sw src1w,
2463 	sljit_si src2, sljit_sw src2w)
2464 {
2465 	sljit_si dst_r;
2466 
2467 	CHECK_ERROR();
2468 	CHECK(check_sljit_emit_fop2(compiler, op, dst, dstw, src1, src1w, src2, src2w));
2469 	ADJUST_LOCAL_OFFSET(dst, dstw);
2470 	ADJUST_LOCAL_OFFSET(src1, src1w);
2471 	ADJUST_LOCAL_OFFSET(src2, src2w);
2472 
2473 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2474 	compiler->mode32 = 1;
2475 #endif
2476 
2477 	if (FAST_IS_REG(dst)) {
2478 		dst_r = dst;
2479 		if (dst == src1)
2480 			; /* Do nothing here. */
2481 		else if (dst == src2 && (op == SLJIT_DADD || op == SLJIT_DMUL)) {
2482 			/* Swap arguments. */
2483 			src2 = src1;
2484 			src2w = src1w;
2485 		}
2486 		else if (dst != src2)
2487 			FAIL_IF(emit_sse2_load(compiler, op & SLJIT_SINGLE_OP, dst_r, src1, src1w));
2488 		else {
2489 			dst_r = TMP_FREG;
2490 			FAIL_IF(emit_sse2_load(compiler, op & SLJIT_SINGLE_OP, TMP_FREG, src1, src1w));
2491 		}
2492 	}
2493 	else {
2494 		dst_r = TMP_FREG;
2495 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_SINGLE_OP, TMP_FREG, src1, src1w));
2496 	}
2497 
2498 	switch (GET_OPCODE(op)) {
2499 	case SLJIT_DADD:
2500 		FAIL_IF(emit_sse2(compiler, ADDSD_x_xm, op & SLJIT_SINGLE_OP, dst_r, src2, src2w));
2501 		break;
2502 
2503 	case SLJIT_DSUB:
2504 		FAIL_IF(emit_sse2(compiler, SUBSD_x_xm, op & SLJIT_SINGLE_OP, dst_r, src2, src2w));
2505 		break;
2506 
2507 	case SLJIT_DMUL:
2508 		FAIL_IF(emit_sse2(compiler, MULSD_x_xm, op & SLJIT_SINGLE_OP, dst_r, src2, src2w));
2509 		break;
2510 
2511 	case SLJIT_DDIV:
2512 		FAIL_IF(emit_sse2(compiler, DIVSD_x_xm, op & SLJIT_SINGLE_OP, dst_r, src2, src2w));
2513 		break;
2514 	}
2515 
2516 	if (dst_r == TMP_FREG)
2517 		return emit_sse2_store(compiler, op & SLJIT_SINGLE_OP, dst, dstw, TMP_FREG);
2518 	return SLJIT_SUCCESS;
2519 }
2520 
2521 /* --------------------------------------------------------------------- */
2522 /*  Conditional instructions                                             */
2523 /* --------------------------------------------------------------------- */
2524 
2525 SLJIT_API_FUNC_ATTRIBUTE struct sljit_label* sljit_emit_label(struct sljit_compiler *compiler)
2526 {
2527 	sljit_ub *inst;
2528 	struct sljit_label *label;
2529 
2530 	CHECK_ERROR_PTR();
2531 	CHECK_PTR(check_sljit_emit_label(compiler));
2532 
2533 	/* We should restore the flags before the label,
2534 	   since other taken jumps has their own flags as well. */
2535 	if (SLJIT_UNLIKELY(compiler->flags_saved))
2536 		PTR_FAIL_IF(emit_restore_flags(compiler, 0));
2537 
2538 	if (compiler->last_label && compiler->last_label->size == compiler->size)
2539 		return compiler->last_label;
2540 
2541 	label = (struct sljit_label*)ensure_abuf(compiler, sizeof(struct sljit_label));
2542 	PTR_FAIL_IF(!label);
2543 	set_label(label, compiler);
2544 
2545 	inst = (sljit_ub*)ensure_buf(compiler, 2);
2546 	PTR_FAIL_IF(!inst);
2547 
2548 	*inst++ = 0;
2549 	*inst++ = 0;
2550 
2551 	return label;
2552 }
2553 
2554 SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_jump(struct sljit_compiler *compiler, sljit_si type)
2555 {
2556 	sljit_ub *inst;
2557 	struct sljit_jump *jump;
2558 
2559 	CHECK_ERROR_PTR();
2560 	CHECK_PTR(check_sljit_emit_jump(compiler, type));
2561 
2562 	if (SLJIT_UNLIKELY(compiler->flags_saved)) {
2563 		if ((type & 0xff) <= SLJIT_JUMP)
2564 			PTR_FAIL_IF(emit_restore_flags(compiler, 0));
2565 		compiler->flags_saved = 0;
2566 	}
2567 
2568 	jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
2569 	PTR_FAIL_IF_NULL(jump);
2570 	set_jump(jump, compiler, type & SLJIT_REWRITABLE_JUMP);
2571 	type &= 0xff;
2572 
2573 	if (type >= SLJIT_CALL1)
2574 		PTR_FAIL_IF(call_with_args(compiler, type));
2575 
2576 	/* Worst case size. */
2577 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2578 	compiler->size += (type >= SLJIT_JUMP) ? 5 : 6;
2579 #else
2580 	compiler->size += (type >= SLJIT_JUMP) ? (10 + 3) : (2 + 10 + 3);
2581 #endif
2582 
2583 	inst = (sljit_ub*)ensure_buf(compiler, 2);
2584 	PTR_FAIL_IF_NULL(inst);
2585 
2586 	*inst++ = 0;
2587 	*inst++ = type + 4;
2588 	return jump;
2589 }
2590 
2591 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_ijump(struct sljit_compiler *compiler, sljit_si type, sljit_si src, sljit_sw srcw)
2592 {
2593 	sljit_ub *inst;
2594 	struct sljit_jump *jump;
2595 
2596 	CHECK_ERROR();
2597 	CHECK(check_sljit_emit_ijump(compiler, type, src, srcw));
2598 	ADJUST_LOCAL_OFFSET(src, srcw);
2599 
2600 	CHECK_EXTRA_REGS(src, srcw, (void)0);
2601 
2602 	if (SLJIT_UNLIKELY(compiler->flags_saved)) {
2603 		if (type <= SLJIT_JUMP)
2604 			FAIL_IF(emit_restore_flags(compiler, 0));
2605 		compiler->flags_saved = 0;
2606 	}
2607 
2608 	if (type >= SLJIT_CALL1) {
2609 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2610 #if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
2611 		if (src == SLJIT_R2) {
2612 			EMIT_MOV(compiler, TMP_REG1, 0, src, 0);
2613 			src = TMP_REG1;
2614 		}
2615 		if (src == SLJIT_MEM1(SLJIT_SP) && type >= SLJIT_CALL3)
2616 			srcw += sizeof(sljit_sw);
2617 #endif
2618 #endif
2619 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) && defined(_WIN64)
2620 		if (src == SLJIT_R2) {
2621 			EMIT_MOV(compiler, TMP_REG1, 0, src, 0);
2622 			src = TMP_REG1;
2623 		}
2624 #endif
2625 		FAIL_IF(call_with_args(compiler, type));
2626 	}
2627 
2628 	if (src == SLJIT_IMM) {
2629 		jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
2630 		FAIL_IF_NULL(jump);
2631 		set_jump(jump, compiler, JUMP_ADDR);
2632 		jump->u.target = srcw;
2633 
2634 		/* Worst case size. */
2635 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2636 		compiler->size += 5;
2637 #else
2638 		compiler->size += 10 + 3;
2639 #endif
2640 
2641 		inst = (sljit_ub*)ensure_buf(compiler, 2);
2642 		FAIL_IF_NULL(inst);
2643 
2644 		*inst++ = 0;
2645 		*inst++ = type + 4;
2646 	}
2647 	else {
2648 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2649 		/* REX_W is not necessary (src is not immediate). */
2650 		compiler->mode32 = 1;
2651 #endif
2652 		inst = emit_x86_instruction(compiler, 1, 0, 0, src, srcw);
2653 		FAIL_IF(!inst);
2654 		*inst++ = GROUP_FF;
2655 		*inst |= (type >= SLJIT_FAST_CALL) ? CALL_rm : JMP_rm;
2656 	}
2657 	return SLJIT_SUCCESS;
2658 }
2659 
2660 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op_flags(struct sljit_compiler *compiler, sljit_si op,
2661 	sljit_si dst, sljit_sw dstw,
2662 	sljit_si src, sljit_sw srcw,
2663 	sljit_si type)
2664 {
2665 	sljit_ub *inst;
2666 	sljit_ub cond_set = 0;
2667 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2668 	sljit_si reg;
2669 #else
2670 	/* CHECK_EXTRA_REGS migh overwrite these values. */
2671 	sljit_si dst_save = dst;
2672 	sljit_sw dstw_save = dstw;
2673 #endif
2674 
2675 	CHECK_ERROR();
2676 	CHECK(check_sljit_emit_op_flags(compiler, op, dst, dstw, src, srcw, type));
2677 	SLJIT_UNUSED_ARG(srcw);
2678 
2679 	if (dst == SLJIT_UNUSED)
2680 		return SLJIT_SUCCESS;
2681 
2682 	ADJUST_LOCAL_OFFSET(dst, dstw);
2683 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
2684 	if (SLJIT_UNLIKELY(compiler->flags_saved))
2685 		FAIL_IF(emit_restore_flags(compiler, op & SLJIT_KEEP_FLAGS));
2686 
2687 	type &= 0xff;
2688 	/* setcc = jcc + 0x10. */
2689 	cond_set = get_jump_code(type) + 0x10;
2690 
2691 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2692 	if (GET_OPCODE(op) == SLJIT_OR && !GET_ALL_FLAGS(op) && FAST_IS_REG(dst) && dst == src) {
2693 		inst = (sljit_ub*)ensure_buf(compiler, 1 + 4 + 3);
2694 		FAIL_IF(!inst);
2695 		INC_SIZE(4 + 3);
2696 		/* Set low register to conditional flag. */
2697 		*inst++ = (reg_map[TMP_REG1] <= 7) ? REX : REX_B;
2698 		*inst++ = GROUP_0F;
2699 		*inst++ = cond_set;
2700 		*inst++ = MOD_REG | reg_lmap[TMP_REG1];
2701 		*inst++ = REX | (reg_map[TMP_REG1] <= 7 ? 0 : REX_R) | (reg_map[dst] <= 7 ? 0 : REX_B);
2702 		*inst++ = OR_rm8_r8;
2703 		*inst++ = MOD_REG | (reg_lmap[TMP_REG1] << 3) | reg_lmap[dst];
2704 		return SLJIT_SUCCESS;
2705 	}
2706 
2707 	reg = (op == SLJIT_MOV && FAST_IS_REG(dst)) ? dst : TMP_REG1;
2708 
2709 	inst = (sljit_ub*)ensure_buf(compiler, 1 + 4 + 4);
2710 	FAIL_IF(!inst);
2711 	INC_SIZE(4 + 4);
2712 	/* Set low register to conditional flag. */
2713 	*inst++ = (reg_map[reg] <= 7) ? REX : REX_B;
2714 	*inst++ = GROUP_0F;
2715 	*inst++ = cond_set;
2716 	*inst++ = MOD_REG | reg_lmap[reg];
2717 	*inst++ = REX_W | (reg_map[reg] <= 7 ? 0 : (REX_B | REX_R));
2718 	*inst++ = GROUP_0F;
2719 	*inst++ = MOVZX_r_rm8;
2720 	*inst = MOD_REG | (reg_lmap[reg] << 3) | reg_lmap[reg];
2721 
2722 	if (reg != TMP_REG1)
2723 		return SLJIT_SUCCESS;
2724 
2725 	if (GET_OPCODE(op) < SLJIT_ADD) {
2726 		compiler->mode32 = GET_OPCODE(op) != SLJIT_MOV;
2727 		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
2728 	}
2729 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
2730 		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
2731 	compiler->skip_checks = 1;
2732 #endif
2733 	return sljit_emit_op2(compiler, op, dst, dstw, dst, dstw, TMP_REG1, 0);
2734 #else /* SLJIT_CONFIG_X86_64 */
2735 	if (GET_OPCODE(op) < SLJIT_ADD && FAST_IS_REG(dst)) {
2736 		if (reg_map[dst] <= 4) {
2737 			/* Low byte is accessible. */
2738 			inst = (sljit_ub*)ensure_buf(compiler, 1 + 3 + 3);
2739 			FAIL_IF(!inst);
2740 			INC_SIZE(3 + 3);
2741 			/* Set low byte to conditional flag. */
2742 			*inst++ = GROUP_0F;
2743 			*inst++ = cond_set;
2744 			*inst++ = MOD_REG | reg_map[dst];
2745 
2746 			*inst++ = GROUP_0F;
2747 			*inst++ = MOVZX_r_rm8;
2748 			*inst = MOD_REG | (reg_map[dst] << 3) | reg_map[dst];
2749 			return SLJIT_SUCCESS;
2750 		}
2751 
2752 		/* Low byte is not accessible. */
2753 		if (cpu_has_cmov == -1)
2754 			get_cpu_features();
2755 
2756 		if (cpu_has_cmov) {
2757 			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, 1);
2758 			/* a xor reg, reg operation would overwrite the flags. */
2759 			EMIT_MOV(compiler, dst, 0, SLJIT_IMM, 0);
2760 
2761 			inst = (sljit_ub*)ensure_buf(compiler, 1 + 3);
2762 			FAIL_IF(!inst);
2763 			INC_SIZE(3);
2764 
2765 			*inst++ = GROUP_0F;
2766 			/* cmovcc = setcc - 0x50. */
2767 			*inst++ = cond_set - 0x50;
2768 			*inst++ = MOD_REG | (reg_map[dst] << 3) | reg_map[TMP_REG1];
2769 			return SLJIT_SUCCESS;
2770 		}
2771 
2772 		inst = (sljit_ub*)ensure_buf(compiler, 1 + 1 + 3 + 3 + 1);
2773 		FAIL_IF(!inst);
2774 		INC_SIZE(1 + 3 + 3 + 1);
2775 		*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
2776 		/* Set al to conditional flag. */
2777 		*inst++ = GROUP_0F;
2778 		*inst++ = cond_set;
2779 		*inst++ = MOD_REG | 0 /* eax */;
2780 
2781 		*inst++ = GROUP_0F;
2782 		*inst++ = MOVZX_r_rm8;
2783 		*inst++ = MOD_REG | (reg_map[dst] << 3) | 0 /* eax */;
2784 		*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
2785 		return SLJIT_SUCCESS;
2786 	}
2787 
2788 	if (GET_OPCODE(op) == SLJIT_OR && !GET_ALL_FLAGS(op) && FAST_IS_REG(dst) && dst == src && reg_map[dst] <= 4) {
2789 		SLJIT_COMPILE_ASSERT(reg_map[SLJIT_R0] == 0, scratch_reg1_must_be_eax);
2790 		if (dst != SLJIT_R0) {
2791 			inst = (sljit_ub*)ensure_buf(compiler, 1 + 1 + 3 + 2 + 1);
2792 			FAIL_IF(!inst);
2793 			INC_SIZE(1 + 3 + 2 + 1);
2794 			/* Set low register to conditional flag. */
2795 			*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
2796 			*inst++ = GROUP_0F;
2797 			*inst++ = cond_set;
2798 			*inst++ = MOD_REG | 0 /* eax */;
2799 			*inst++ = OR_rm8_r8;
2800 			*inst++ = MOD_REG | (0 /* eax */ << 3) | reg_map[dst];
2801 			*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
2802 		}
2803 		else {
2804 			inst = (sljit_ub*)ensure_buf(compiler, 1 + 2 + 3 + 2 + 2);
2805 			FAIL_IF(!inst);
2806 			INC_SIZE(2 + 3 + 2 + 2);
2807 			/* Set low register to conditional flag. */
2808 			*inst++ = XCHG_r_rm;
2809 			*inst++ = MOD_REG | (1 /* ecx */ << 3) | reg_map[TMP_REG1];
2810 			*inst++ = GROUP_0F;
2811 			*inst++ = cond_set;
2812 			*inst++ = MOD_REG | 1 /* ecx */;
2813 			*inst++ = OR_rm8_r8;
2814 			*inst++ = MOD_REG | (1 /* ecx */ << 3) | 0 /* eax */;
2815 			*inst++ = XCHG_r_rm;
2816 			*inst++ = MOD_REG | (1 /* ecx */ << 3) | reg_map[TMP_REG1];
2817 		}
2818 		return SLJIT_SUCCESS;
2819 	}
2820 
2821 	/* Set TMP_REG1 to the bit. */
2822 	inst = (sljit_ub*)ensure_buf(compiler, 1 + 1 + 3 + 3 + 1);
2823 	FAIL_IF(!inst);
2824 	INC_SIZE(1 + 3 + 3 + 1);
2825 	*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
2826 	/* Set al to conditional flag. */
2827 	*inst++ = GROUP_0F;
2828 	*inst++ = cond_set;
2829 	*inst++ = MOD_REG | 0 /* eax */;
2830 
2831 	*inst++ = GROUP_0F;
2832 	*inst++ = MOVZX_r_rm8;
2833 	*inst++ = MOD_REG | (0 << 3) /* eax */ | 0 /* eax */;
2834 
2835 	*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
2836 
2837 	if (GET_OPCODE(op) < SLJIT_ADD)
2838 		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
2839 
2840 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
2841 		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
2842 	compiler->skip_checks = 1;
2843 #endif
2844 	return sljit_emit_op2(compiler, op, dst_save, dstw_save, dst_save, dstw_save, TMP_REG1, 0);
2845 #endif /* SLJIT_CONFIG_X86_64 */
2846 }
2847 
2848 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_get_local_base(struct sljit_compiler *compiler, sljit_si dst, sljit_sw dstw, sljit_sw offset)
2849 {
2850 	CHECK_ERROR();
2851 	CHECK(check_sljit_get_local_base(compiler, dst, dstw, offset));
2852 	ADJUST_LOCAL_OFFSET(dst, dstw);
2853 
2854 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
2855 
2856 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2857 	compiler->mode32 = 0;
2858 #endif
2859 
2860 	ADJUST_LOCAL_OFFSET(SLJIT_MEM1(SLJIT_SP), offset);
2861 
2862 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2863 	if (NOT_HALFWORD(offset)) {
2864 		FAIL_IF(emit_load_imm64(compiler, TMP_REG1, offset));
2865 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
2866 		SLJIT_ASSERT(emit_lea_binary(compiler, SLJIT_KEEP_FLAGS, dst, dstw, SLJIT_SP, 0, TMP_REG1, 0) != SLJIT_ERR_UNSUPPORTED);
2867 		return compiler->error;
2868 #else
2869 		return emit_lea_binary(compiler, SLJIT_KEEP_FLAGS, dst, dstw, SLJIT_SP, 0, TMP_REG1, 0);
2870 #endif
2871 	}
2872 #endif
2873 
2874 	if (offset != 0)
2875 		return emit_lea_binary(compiler, SLJIT_KEEP_FLAGS, dst, dstw, SLJIT_SP, 0, SLJIT_IMM, offset);
2876 	return emit_mov(compiler, dst, dstw, SLJIT_SP, 0);
2877 }
2878 
2879 SLJIT_API_FUNC_ATTRIBUTE struct sljit_const* sljit_emit_const(struct sljit_compiler *compiler, sljit_si dst, sljit_sw dstw, sljit_sw init_value)
2880 {
2881 	sljit_ub *inst;
2882 	struct sljit_const *const_;
2883 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2884 	sljit_si reg;
2885 #endif
2886 
2887 	CHECK_ERROR_PTR();
2888 	CHECK_PTR(check_sljit_emit_const(compiler, dst, dstw, init_value));
2889 	ADJUST_LOCAL_OFFSET(dst, dstw);
2890 
2891 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
2892 
2893 	const_ = (struct sljit_const*)ensure_abuf(compiler, sizeof(struct sljit_const));
2894 	PTR_FAIL_IF(!const_);
2895 	set_const(const_, compiler);
2896 
2897 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2898 	compiler->mode32 = 0;
2899 	reg = SLOW_IS_REG(dst) ? dst : TMP_REG1;
2900 
2901 	if (emit_load_imm64(compiler, reg, init_value))
2902 		return NULL;
2903 #else
2904 	if (dst == SLJIT_UNUSED)
2905 		dst = TMP_REG1;
2906 
2907 	if (emit_mov(compiler, dst, dstw, SLJIT_IMM, init_value))
2908 		return NULL;
2909 #endif
2910 
2911 	inst = (sljit_ub*)ensure_buf(compiler, 2);
2912 	PTR_FAIL_IF(!inst);
2913 
2914 	*inst++ = 0;
2915 	*inst++ = 1;
2916 
2917 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2918 	if (dst & SLJIT_MEM)
2919 		if (emit_mov(compiler, dst, dstw, TMP_REG1, 0))
2920 			return NULL;
2921 #endif
2922 
2923 	return const_;
2924 }
2925 
2926 SLJIT_API_FUNC_ATTRIBUTE void sljit_set_jump_addr(sljit_uw addr, sljit_uw new_addr)
2927 {
2928 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2929 	*(sljit_sw*)addr = new_addr - (addr + 4);
2930 #else
2931 	*(sljit_uw*)addr = new_addr;
2932 #endif
2933 }
2934 
2935 SLJIT_API_FUNC_ATTRIBUTE void sljit_set_const(sljit_uw addr, sljit_sw new_constant)
2936 {
2937 	*(sljit_sw*)addr = new_constant;
2938 }
2939 
2940 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_x86_is_sse2_available(void)
2941 {
2942 #if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
2943 	if (cpu_has_sse2 == -1)
2944 		get_cpu_features();
2945 	return cpu_has_sse2;
2946 #else
2947 	return 1;
2948 #endif
2949 }
2950 
2951 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_x86_is_cmov_available(void)
2952 {
2953 	if (cpu_has_cmov == -1)
2954 		get_cpu_features();
2955 	return cpu_has_cmov;
2956 }
2957 
2958 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_x86_emit_cmov(struct sljit_compiler *compiler,
2959 	sljit_si type,
2960 	sljit_si dst_reg,
2961 	sljit_si src, sljit_sw srcw)
2962 {
2963 	sljit_ub* inst;
2964 
2965 	CHECK_ERROR();
2966 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
2967 	CHECK_ARGUMENT(sljit_x86_is_cmov_available());
2968 	CHECK_ARGUMENT(!(type & ~(0xff | SLJIT_INT_OP)));
2969 	CHECK_ARGUMENT((type & 0xff) >= SLJIT_EQUAL && (type & 0xff) <= SLJIT_D_ORDERED);
2970 	CHECK_ARGUMENT(FUNCTION_CHECK_IS_REG(dst_reg & ~SLJIT_INT_OP));
2971 	FUNCTION_CHECK_SRC(src, srcw);
2972 #endif
2973 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
2974 	if (SLJIT_UNLIKELY(!!compiler->verbose)) {
2975 		fprintf(compiler->verbose, "  x86_cmov%s %s%s, ",
2976 			!(dst_reg & SLJIT_INT_OP) ? "" : ".i",
2977 			JUMP_PREFIX(type), jump_names[type & 0xff]);
2978 		sljit_verbose_reg(compiler, dst_reg & ~SLJIT_INT_OP);
2979 		fprintf(compiler->verbose, ", ");
2980 		sljit_verbose_param(compiler, src, srcw);
2981 		fprintf(compiler->verbose, "\n");
2982 	}
2983 #endif
2984 
2985 	ADJUST_LOCAL_OFFSET(src, srcw);
2986 	CHECK_EXTRA_REGS(src, srcw, (void)0);
2987 
2988 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2989 	compiler->mode32 = dst_reg & SLJIT_INT_OP;
2990 #endif
2991 	dst_reg &= ~SLJIT_INT_OP;
2992 
2993 	if (SLJIT_UNLIKELY(src & SLJIT_IMM)) {
2994 		EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, srcw);
2995 		src = TMP_REG1;
2996 		srcw = 0;
2997 	}
2998 
2999 	inst = emit_x86_instruction(compiler, 2, dst_reg, 0, src, srcw);
3000 	FAIL_IF(!inst);
3001 	*inst++ = GROUP_0F;
3002 	*inst = get_jump_code(type & 0xff) - 0x40;
3003 	return SLJIT_SUCCESS;
3004 }
3005