/*
 *    Stack-less Just-In-Time compiler
 *
 *    Copyright Zoltan Herczeg (hzmester@freemail.hu). All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are
 * permitted provided that the following conditions are met:
 *
 *   1. Redistributions of source code must retain the above copyright notice, this list of
 *      conditions and the following disclaimer.
 *
 *   2. Redistributions in binary form must reproduce the above copyright notice, this list
 *      of conditions and the following disclaimer in the documentation and/or other materials
 *      provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) AND CONTRIBUTORS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
 * SHALL THE COPYRIGHT HOLDER(S) OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#if defined(__has_feature)
#if __has_feature(memory_sanitizer)
#include <sanitizer/msan_interface.h>
#endif /* __has_feature(memory_sanitizer) */
#endif /* defined(__has_feature) */

SLJIT_API_FUNC_ATTRIBUTE const char* sljit_get_platform_name(void)
{
	return "x86" SLJIT_CPUINFO;
}

/*
   32b register indexes:
     0 - EAX
     1 - ECX
     2 - EDX
     3 - EBX
     4 - ESP
     5 - EBP
     6 - ESI
     7 - EDI
*/

/*
   64b register indexes:
     0 - RAX
     1 - RCX
     2 - RDX
     3 - RBX
     4 - RSP
     5 - RBP
     6 - RSI
     7 - RDI
     8 - R8   - From now on REX prefix is required
     9 - R9
    10 - R10
    11 - R11
    12 - R12
    13 - R13
    14 - R14
    15 - R15
*/
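
/* Illustrative example (not part of the compiler): registers 8-15 do not
   fit into the 3-bit ModR/M fields, so their upper bit goes into the REX
   prefix extension bits. For instance "mov rax, r12" is encoded as
     0x49 0x8b 0xc4   (REX.W|REX.B, MOV_r_rm, ModR/M: mod=11 reg=rax rm=r12&7)
   while "mov eax, edx" needs no prefix at all: 0x8b 0xc2. */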

#define TMP_REG1	(SLJIT_NUMBER_OF_REGISTERS + 2)
#define TMP_FREG	(SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1)

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)


static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 3] = {
	0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 5, 7, 6, 4, 3
};

static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2] = {
	0, 1, 2, 3, 4, 5, 6, 7, 0
};

#define CHECK_EXTRA_REGS(p, w, do) \
	if (p >= SLJIT_R3 && p <= SLJIT_S3) { \
		w = (2 * SSIZE_OF(sw)) + ((p) - SLJIT_R3) * SSIZE_OF(sw); \
		p = SLJIT_MEM1(SLJIT_SP); \
		do; \
	}
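
/* Illustrative example: on x86-32 the registers above SLJIT_R3 have no
   machine register (their reg_map entry is 0); they live in the stack
   frame instead. CHECK_EXTRA_REGS rewrites e.g. SLJIT_R4 into
   SLJIT_MEM1(SLJIT_SP) with offset (2 + (SLJIT_R4 - SLJIT_R3)) *
   SSIZE_OF(sw), i.e. the third machine word above the stack pointer. */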

#else /* SLJIT_CONFIG_X86_32 */

#define TMP_REG2	(SLJIT_NUMBER_OF_REGISTERS + 3)

/* Note: r12 & 0x7 == 0b100, which is decoded as "SIB byte present".
   Note: avoid using r12 and r13 for memory addressing;
   therefore r12 is better used as a higher saved register. */
#ifndef _WIN64
/* Args: rdi(=7), rsi(=6), rdx(=2), rcx(=1), r8, r9. Scratches: rax(=0), r10, r11 */
static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 4] = {
	0, 0, 6, 7, 1, 8, 11, 10, 12, 5, 13, 14, 15, 3, 4, 2, 9
};
/* low-map. reg_map & 0x7. */
static const sljit_u8 reg_lmap[SLJIT_NUMBER_OF_REGISTERS + 4] = {
	0, 0, 6, 7, 1, 0,  3,  2,  4, 5,  5,  6,  7, 3, 4, 2, 1
};
#else
/* Args: rcx(=1), rdx(=2), r8, r9. Scratches: rax(=0), r10, r11 */
static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 4] = {
	0, 0, 2, 8, 1, 11, 12, 5, 13, 14, 15, 7, 6, 3, 4, 9, 10
};
/* low-map. reg_map & 0x7. */
static const sljit_u8 reg_lmap[SLJIT_NUMBER_OF_REGISTERS + 4] = {
	0, 0, 2, 0, 1,  3,  4, 5,  5,  6,  7, 7, 6, 3, 4, 1,  2
};
#endif

/* Args: xmm0-xmm3 */
static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2] = {
	0, 0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4
};
/* low-map. freg_map & 0x7. */
static const sljit_u8 freg_lmap[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2] = {
	0, 0, 1, 2, 3, 5, 6, 7, 0, 1,  2,  3,  4,  5,  6,  7, 4
};

#define REX_W		0x48
#define REX_R		0x44
#define REX_X		0x42
#define REX_B		0x41
#define REX		0x40

#ifndef _WIN64
#define HALFWORD_MAX 0x7fffffffl
#define HALFWORD_MIN -0x80000000l
#else
#define HALFWORD_MAX 0x7fffffffll
#define HALFWORD_MIN -0x80000000ll
#endif

#define IS_HALFWORD(x)		((x) <= HALFWORD_MAX && (x) >= HALFWORD_MIN)
#define NOT_HALFWORD(x)		((x) > HALFWORD_MAX || (x) < HALFWORD_MIN)
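
/* Illustrative example: x86-64 immediates in most instructions are 32-bit
   values sign-extended to 64 bits, so IS_HALFWORD(0x7fffffff) holds while
   IS_HALFWORD(0x80000000) does not; such a value must first be loaded
   into a register (see the emit_load_imm64 calls below). */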

#define CHECK_EXTRA_REGS(p, w, do)

#endif /* SLJIT_CONFIG_X86_32 */

#define U8(v)			((sljit_u8)(v))

/* Size flags for emit_x86_instruction: */
#define EX86_BIN_INS		((sljit_uw)0x000010)
#define EX86_SHIFT_INS		((sljit_uw)0x000020)
#define EX86_BYTE_ARG		((sljit_uw)0x000040)
#define EX86_HALF_ARG		((sljit_uw)0x000080)
/* Size flags for both emit_x86_instruction and emit_vex_instruction: */
#define EX86_REX		((sljit_uw)0x000100)
#define EX86_NO_REXW		((sljit_uw)0x000200)
#define EX86_PREF_66		((sljit_uw)0x000400)
#define EX86_PREF_F2		((sljit_uw)0x000800)
#define EX86_PREF_F3		((sljit_uw)0x001000)
#define EX86_SSE2_OP1		((sljit_uw)0x002000)
#define EX86_SSE2_OP2		((sljit_uw)0x004000)
#define EX86_SSE2		(EX86_SSE2_OP1 | EX86_SSE2_OP2)
#define EX86_VEX_EXT		((sljit_uw)0x008000)
/* Op flags for emit_vex_instruction: */
#define VEX_OP_0F38		((sljit_uw)0x010000)
#define VEX_OP_0F3A		((sljit_uw)0x020000)
#define VEX_SSE2_OPV		((sljit_uw)0x040000)
#define VEX_AUTO_W		((sljit_uw)0x080000)
#define VEX_W			((sljit_uw)0x100000)
#define VEX_256			((sljit_uw)0x200000)

#define EX86_SELECT_66(op)	(((op) & SLJIT_32) ? 0 : EX86_PREF_66)
#define EX86_SELECT_F2_F3(op)	(((op) & SLJIT_32) ? EX86_PREF_F3 : EX86_PREF_F2)

/* --------------------------------------------------------------------- */
/*  Instruction forms                                                    */
/* --------------------------------------------------------------------- */

#define ADD			(/* BINARY */ 0 << 3)
#define ADD_EAX_i32		0x05
#define ADD_r_rm		0x03
#define ADD_rm_r		0x01
#define ADDSD_x_xm		0x58
#define ADC			(/* BINARY */ 2 << 3)
#define ADC_EAX_i32		0x15
#define ADC_r_rm		0x13
#define ADC_rm_r		0x11
#define AND			(/* BINARY */ 4 << 3)
#define AND_EAX_i32		0x25
#define AND_r_rm		0x23
#define AND_rm_r		0x21
#define ANDPD_x_xm		0x54
#define BSR_r_rm		(/* GROUP_0F */ 0xbd)
#define BSF_r_rm		(/* GROUP_0F */ 0xbc)
#define BSWAP_r			(/* GROUP_0F */ 0xc8)
#define CALL_i32		0xe8
#define CALL_rm			(/* GROUP_FF */ 2 << 3)
#define CDQ			0x99
#define CMOVE_r_rm		(/* GROUP_0F */ 0x44)
#define CMP			(/* BINARY */ 7 << 3)
#define CMP_EAX_i32		0x3d
#define CMP_r_rm		0x3b
#define CMP_rm_r		0x39
#define CMPS_x_xm		0xc2
#define CMPXCHG_rm_r		0xb1
#define CMPXCHG_rm8_r		0xb0
#define CVTPD2PS_x_xm		0x5a
#define CVTPS2PD_x_xm		0x5a
#define CVTSI2SD_x_rm		0x2a
#define CVTTSD2SI_r_xm		0x2c
#define DIV			(/* GROUP_F7 */ 6 << 3)
#define DIVSD_x_xm		0x5e
#define EXTRACTPS_x_xm		0x17
#define FLDS			0xd9
#define FLDL			0xdd
#define FSTPS			0xd9
#define FSTPD			0xdd
#define INSERTPS_x_xm		0x21
#define INT3			0xcc
#define IDIV			(/* GROUP_F7 */ 7 << 3)
#define IMUL			(/* GROUP_F7 */ 5 << 3)
#define IMUL_r_rm		(/* GROUP_0F */ 0xaf)
#define IMUL_r_rm_i8		0x6b
#define IMUL_r_rm_i32		0x69
#define JL_i8			0x7c
#define JE_i8			0x74
#define JNC_i8			0x73
#define JNE_i8			0x75
#define JMP_i8			0xeb
#define JMP_i32			0xe9
#define JMP_rm			(/* GROUP_FF */ 4 << 3)
#define LEA_r_m			0x8d
#define LOOP_i8			0xe2
#define LZCNT_r_rm		(/* GROUP_F3 */ /* GROUP_0F */ 0xbd)
#define MOV_r_rm		0x8b
#define MOV_r_i32		0xb8
#define MOV_rm_r		0x89
#define MOV_rm_i32		0xc7
#define MOV_rm8_i8		0xc6
#define MOV_rm8_r8		0x88
#define MOVAPS_x_xm		0x28
#define MOVAPS_xm_x		0x29
#define MOVD_x_rm		0x6e
#define MOVD_rm_x		0x7e
#define MOVDDUP_x_xm		0x12
#define MOVDQA_x_xm		0x6f
#define MOVDQA_xm_x		0x7f
#define MOVHLPS_x_x		0x12
#define MOVHPD_m_x		0x17
#define MOVHPD_x_m		0x16
#define MOVLHPS_x_x		0x16
#define MOVLPD_m_x		0x13
#define MOVLPD_x_m		0x12
#define MOVMSKPS_r_x		(/* GROUP_0F */ 0x50)
#define MOVQ_x_xm		(/* GROUP_0F */ 0x7e)
#define MOVSD_x_xm		0x10
#define MOVSD_xm_x		0x11
#define MOVSHDUP_x_xm		0x16
#define MOVSXD_r_rm		0x63
#define MOVSX_r_rm8		(/* GROUP_0F */ 0xbe)
#define MOVSX_r_rm16		(/* GROUP_0F */ 0xbf)
#define MOVUPS_x_xm		0x10
#define MOVZX_r_rm8		(/* GROUP_0F */ 0xb6)
#define MOVZX_r_rm16		(/* GROUP_0F */ 0xb7)
#define MUL			(/* GROUP_F7 */ 4 << 3)
#define MULSD_x_xm		0x59
#define NEG_rm			(/* GROUP_F7 */ 3 << 3)
#define NOP			0x90
#define NOT_rm			(/* GROUP_F7 */ 2 << 3)
#define OR			(/* BINARY */ 1 << 3)
#define OR_r_rm			0x0b
#define OR_EAX_i32		0x0d
#define OR_rm_r			0x09
#define OR_rm8_r8		0x08
#define ORPD_x_xm		0x56
#define PACKSSWB_x_xm		(/* GROUP_0F */ 0x63)
#define PAND_x_xm		0xdb
#define PCMPEQD_x_xm		0x76
#define PINSRB_x_rm_i8		0x20
#define PINSRW_x_rm_i8		0xc4
#define PINSRD_x_rm_i8		0x22
#define PEXTRB_rm_x_i8		0x14
#define PEXTRW_rm_x_i8		0x15
#define PEXTRD_rm_x_i8		0x16
#define PMOVMSKB_r_x		(/* GROUP_0F */ 0xd7)
#define PMOVSXBD_x_xm		0x21
#define PMOVSXBQ_x_xm		0x22
#define PMOVSXBW_x_xm		0x20
#define PMOVSXDQ_x_xm		0x25
#define PMOVSXWD_x_xm		0x23
#define PMOVSXWQ_x_xm		0x24
#define PMOVZXBD_x_xm		0x31
#define PMOVZXBQ_x_xm		0x32
#define PMOVZXBW_x_xm		0x30
#define PMOVZXDQ_x_xm		0x35
#define PMOVZXWD_x_xm		0x33
#define PMOVZXWQ_x_xm		0x34
#define POP_r			0x58
#define POP_rm			0x8f
#define POPF			0x9d
#define POR_x_xm		0xeb
#define PREFETCH		0x18
#define PSHUFB_x_xm		0x00
#define PSHUFD_x_xm		0x70
#define PSHUFLW_x_xm		0x70
#define PSRLDQ_x		0x73
#define PSLLD_x_i8		0x72
#define PSLLQ_x_i8		0x73
#define PUSH_i32		0x68
#define PUSH_r			0x50
#define PUSH_rm			(/* GROUP_FF */ 6 << 3)
#define PUSHF			0x9c
#define PXOR_x_xm		0xef
#define ROL			(/* SHIFT */ 0 << 3)
#define ROR			(/* SHIFT */ 1 << 3)
#define RET_near		0xc3
#define RET_i16			0xc2
#define SBB			(/* BINARY */ 3 << 3)
#define SBB_EAX_i32		0x1d
#define SBB_r_rm		0x1b
#define SBB_rm_r		0x19
#define SAR			(/* SHIFT */ 7 << 3)
#define SHL			(/* SHIFT */ 4 << 3)
#define SHLD			(/* GROUP_0F */ 0xa5)
#define SHRD			(/* GROUP_0F */ 0xad)
#define SHR			(/* SHIFT */ 5 << 3)
#define SHUFPS_x_xm		0xc6
#define SUB			(/* BINARY */ 5 << 3)
#define SUB_EAX_i32		0x2d
#define SUB_r_rm		0x2b
#define SUB_rm_r		0x29
#define SUBSD_x_xm		0x5c
#define TEST_EAX_i32		0xa9
#define TEST_rm_r		0x85
#define TZCNT_r_rm		(/* GROUP_F3 */ /* GROUP_0F */ 0xbc)
#define UCOMISD_x_xm		0x2e
#define UNPCKLPD_x_xm		0x14
#define UNPCKLPS_x_xm		0x14
#define VBROADCASTSD_x_xm	0x19
#define VBROADCASTSS_x_xm	0x18
#define VEXTRACTF128_x_ym	0x19
#define VEXTRACTI128_x_ym	0x39
#define VINSERTF128_y_y_xm	0x18
#define VINSERTI128_y_y_xm	0x38
#define VPBROADCASTB_x_xm	0x78
#define VPBROADCASTD_x_xm	0x58
#define VPBROADCASTQ_x_xm	0x59
#define VPBROADCASTW_x_xm	0x79
#define VPERMPD_y_ym		0x01
#define VPERMQ_y_ym		0x00
#define XCHG_EAX_r		0x90
#define XCHG_r_rm		0x87
#define XOR			(/* BINARY */ 6 << 3)
#define XOR_EAX_i32		0x35
#define XOR_r_rm		0x33
#define XOR_rm_r		0x31
#define XORPD_x_xm		0x57

#define GROUP_0F		0x0f
#define GROUP_66		0x66
#define GROUP_F3		0xf3
#define GROUP_F7		0xf7
#define GROUP_FF		0xff
#define GROUP_BINARY_81		0x81
#define GROUP_BINARY_83		0x83
#define GROUP_SHIFT_1		0xd1
#define GROUP_SHIFT_N		0xc1
#define GROUP_SHIFT_CL		0xd3
#define GROUP_LOCK		0xf0

#define MOD_REG			0xc0
#define MOD_DISP8		0x40

#define INC_SIZE(s)		(*inst++ = U8(s), compiler->size += (s))

#define PUSH_REG(r)		(*inst++ = U8(PUSH_r + (r)))
#define POP_REG(r)		(*inst++ = U8(POP_r + (r)))
#define RET()			(*inst++ = RET_near)
#define RET_I16(n)		(*inst++ = RET_i16, *inst++ = U8(n), *inst++ = 0)
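
/* Sketch (illustrative) of the canonical byte-emitting pattern used below:
     inst = (sljit_u8*)ensure_buf(compiler, 1 + n);
     FAIL_IF(!inst);
     INC_SIZE(n);
     inst[0] = ...;  (write the n instruction bytes)
   ensure_buf reserves n bytes plus a length byte; INC_SIZE stores the
   length and steps past it, and the second pass in sljit_generate_code
   copies each length-prefixed chunk verbatim into executable memory. */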

/* These static variables only store detected built-in CPU features, so
   they are safe without locking: if several threads detect the CPU
   features at the same time, they overwrite each other with identical
   values. */
#define CPU_FEATURE_DETECTED		0x001
#if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
#define CPU_FEATURE_SSE2		0x002
#endif
#define CPU_FEATURE_SSE41		0x004
#define CPU_FEATURE_LZCNT		0x008
#define CPU_FEATURE_TZCNT		0x010
#define CPU_FEATURE_CMOV		0x020
#define CPU_FEATURE_AVX			0x040
#define CPU_FEATURE_AVX2		0x080

static sljit_u32 cpu_feature_list = 0;

#ifdef _WIN32_WCE
#include <cmnintrin.h>
#elif defined(_MSC_VER) && _MSC_VER >= 1400
#include <intrin.h>
#endif

/******************************************************/
/*    Unaligned-store functions                       */
/******************************************************/

static SLJIT_INLINE void sljit_unaligned_store_s16(void *addr, sljit_s16 value)
{
	SLJIT_MEMCPY(addr, &value, sizeof(value));
}

static SLJIT_INLINE void sljit_unaligned_store_s32(void *addr, sljit_s32 value)
{
	SLJIT_MEMCPY(addr, &value, sizeof(value));
}

static SLJIT_INLINE void sljit_unaligned_store_sw(void *addr, sljit_sw value)
{
	SLJIT_MEMCPY(addr, &value, sizeof(value));
}
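
/* Note: memcpy is used instead of a cast such as "*(sljit_s16*)addr = value"
   because dereferencing a misaligned pointer is undefined behavior in C;
   compilers lower these fixed-size copies to a single unaligned store on
   x86, so there is no runtime cost. */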

/******************************************************/
/*    Utility functions                               */
/******************************************************/

static void execute_cpu_id(sljit_u32 info[4])
{
#if defined(_MSC_VER) && _MSC_VER >= 1400

	__cpuidex((int*)info, (int)info[0], (int)info[2]);

#elif defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__SUNPRO_C) || defined(__TINYC__)

	/* AT&T syntax. */
	__asm__ (
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		"movl %0, %%esi\n"
		"movl (%%esi), %%eax\n"
		"movl 8(%%esi), %%ecx\n"
		"pushl %%ebx\n"
		"cpuid\n"
		"movl %%eax, (%%esi)\n"
		"movl %%ebx, 4(%%esi)\n"
		"popl %%ebx\n"
		"movl %%ecx, 8(%%esi)\n"
		"movl %%edx, 12(%%esi)\n"
#else /* !SLJIT_CONFIG_X86_32 */
		"movq %0, %%rsi\n"
		"movl (%%rsi), %%eax\n"
		"movl 8(%%rsi), %%ecx\n"
		"cpuid\n"
		"movl %%eax, (%%rsi)\n"
		"movl %%ebx, 4(%%rsi)\n"
		"movl %%ecx, 8(%%rsi)\n"
		"movl %%edx, 12(%%rsi)\n"
#endif /* SLJIT_CONFIG_X86_32 */
		:
		: "r" (info)
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		: "memory", "eax", "ecx", "edx", "esi"
#else /* !SLJIT_CONFIG_X86_32 */
		: "memory", "rax", "rbx", "rcx", "rdx", "rsi"
#endif /* SLJIT_CONFIG_X86_32 */
	);

#else /* _MSC_VER < 1400 */

	/* Intel syntax. */
	__asm {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		mov esi, info
		mov eax, [esi]
		mov ecx, [esi + 8]
		cpuid
		mov [esi], eax
		mov [esi + 4], ebx
		mov [esi + 8], ecx
		mov [esi + 12], edx
#else /* !SLJIT_CONFIG_X86_32 */
		mov rsi, info
		mov eax, [rsi]
		mov ecx, [rsi + 8]
		cpuid
		mov [rsi], eax
		mov [rsi + 4], ebx
		mov [rsi + 8], ecx
		mov [rsi + 12], edx
#endif /* SLJIT_CONFIG_X86_32 */
	}

#endif /* _MSC_VER && _MSC_VER >= 1400 */

#if defined(__has_feature)
#if __has_feature(memory_sanitizer)
	__msan_unpoison(info, 4 * sizeof(sljit_u32));
#endif /* __has_feature(memory_sanitizer) */
#endif /* defined(__has_feature) */

}

static void get_cpu_features(void)
{
	sljit_u32 feature_list = CPU_FEATURE_DETECTED;
	sljit_u32 info[4];
	sljit_u32 max_id;

	info[0] = 0;
	execute_cpu_id(info);
	max_id = info[0];

	if (max_id >= 7) {
		info[0] = 7;
		info[2] = 0;
		execute_cpu_id(info);

		if (info[1] & 0x8) /* CPUID.07H:EBX.BMI1 [bit 3], provides TZCNT. */
			feature_list |= CPU_FEATURE_TZCNT;
		if (info[1] & 0x20) /* CPUID.07H:EBX.AVX2 [bit 5]. */
			feature_list |= CPU_FEATURE_AVX2;
	}

	if (max_id >= 1) {
		info[0] = 1;
		execute_cpu_id(info);

		if (info[2] & 0x80000) /* CPUID.01H:ECX.SSE4.1 [bit 19]. */
			feature_list |= CPU_FEATURE_SSE41;
		if (info[2] & 0x10000000) /* CPUID.01H:ECX.AVX [bit 28]. */
			feature_list |= CPU_FEATURE_AVX;
#if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
		if (info[3] & 0x4000000) /* CPUID.01H:EDX.SSE2 [bit 26]. */
			feature_list |= CPU_FEATURE_SSE2;
#endif
		if (info[3] & 0x8000) /* CPUID.01H:EDX.CMOV [bit 15]. */
			feature_list |= CPU_FEATURE_CMOV;
	}

	info[0] = 0x80000001;
	info[2] = 0; /* Silences an incorrect compiler warning. */
	execute_cpu_id(info);

	if (info[2] & 0x20) /* CPUID.80000001H:ECX.LZCNT/ABM [bit 5]. */
		feature_list |= CPU_FEATURE_LZCNT;

	cpu_feature_list = feature_list;
}

static sljit_u8 get_jump_code(sljit_uw type)
{
	switch (type) {
	case SLJIT_EQUAL:
	case SLJIT_ATOMIC_STORED:
	case SLJIT_F_EQUAL:
	case SLJIT_UNORDERED_OR_EQUAL:
		return 0x84 /* je */;

	case SLJIT_NOT_EQUAL:
	case SLJIT_ATOMIC_NOT_STORED:
	case SLJIT_F_NOT_EQUAL:
	case SLJIT_ORDERED_NOT_EQUAL:
		return 0x85 /* jne */;

	case SLJIT_LESS:
	case SLJIT_CARRY:
	case SLJIT_F_LESS:
	case SLJIT_UNORDERED_OR_LESS:
	case SLJIT_UNORDERED_OR_GREATER:
		return 0x82 /* jc */;

	case SLJIT_GREATER_EQUAL:
	case SLJIT_NOT_CARRY:
	case SLJIT_F_GREATER_EQUAL:
	case SLJIT_ORDERED_GREATER_EQUAL:
	case SLJIT_ORDERED_LESS_EQUAL:
		return 0x83 /* jae */;

	case SLJIT_GREATER:
	case SLJIT_F_GREATER:
	case SLJIT_ORDERED_LESS:
	case SLJIT_ORDERED_GREATER:
		return 0x87 /* jnbe */;

	case SLJIT_LESS_EQUAL:
	case SLJIT_F_LESS_EQUAL:
	case SLJIT_UNORDERED_OR_GREATER_EQUAL:
	case SLJIT_UNORDERED_OR_LESS_EQUAL:
		return 0x86 /* jbe */;

	case SLJIT_SIG_LESS:
		return 0x8c /* jl */;

	case SLJIT_SIG_GREATER_EQUAL:
		return 0x8d /* jnl */;

	case SLJIT_SIG_GREATER:
		return 0x8f /* jnle */;

	case SLJIT_SIG_LESS_EQUAL:
		return 0x8e /* jle */;

	case SLJIT_OVERFLOW:
		return 0x80 /* jo */;

	case SLJIT_NOT_OVERFLOW:
		return 0x81 /* jno */;

	case SLJIT_UNORDERED:
	case SLJIT_ORDERED_EQUAL: /* NaN. */
		return 0x8a /* jp */;

	case SLJIT_ORDERED:
	case SLJIT_UNORDERED_OR_NOT_EQUAL: /* Not NaN. */
		return 0x8b /* jpo */;
	}
	return 0;
}
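
/* Note: the values above are the second opcode byte of the long form
   "0F 8x jcc rel32". The short form "jcc rel8" uses the same condition
   nibble at 0x7x, hence the "get_jump_code(type) - 0x10" conversion in
   generate_near_jump_code and emit_cmov_generic below. */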

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
static sljit_u8* generate_far_jump_code(struct sljit_jump *jump, sljit_u8 *code_ptr, sljit_sw executable_offset);
#else
static sljit_u8* generate_far_jump_code(struct sljit_jump *jump, sljit_u8 *code_ptr);
static sljit_u8* generate_put_label_code(struct sljit_put_label *put_label, sljit_u8 *code_ptr, sljit_uw max_label);
#endif

static sljit_u8* generate_near_jump_code(struct sljit_jump *jump, sljit_u8 *code_ptr, sljit_u8 *code, sljit_sw executable_offset)
{
	sljit_uw type = jump->flags >> TYPE_SHIFT;
	sljit_s32 short_jump;
	sljit_uw label_addr;

	if (jump->flags & JUMP_LABEL)
		label_addr = (sljit_uw)(code + jump->u.label->size);
	else
		label_addr = jump->u.target - (sljit_uw)executable_offset;

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	if ((sljit_sw)(label_addr - (jump->addr + 2)) > HALFWORD_MAX || (sljit_sw)(label_addr - (jump->addr + 6)) < HALFWORD_MIN)
		return generate_far_jump_code(jump, code_ptr);
#endif

	short_jump = (sljit_sw)(label_addr - (jump->addr + 2)) >= -128 && (sljit_sw)(label_addr - (jump->addr + 2)) <= 127;

	if (type == SLJIT_JUMP) {
		if (short_jump)
			*code_ptr++ = JMP_i8;
		else
			*code_ptr++ = JMP_i32;
		jump->addr++;
	}
	else if (type >= SLJIT_FAST_CALL) {
		short_jump = 0;
		*code_ptr++ = CALL_i32;
		jump->addr++;
	}
	else if (short_jump) {
		*code_ptr++ = U8(get_jump_code(type) - 0x10);
		jump->addr++;
	}
	else {
		*code_ptr++ = GROUP_0F;
		*code_ptr++ = get_jump_code(type);
		jump->addr += 2;
	}

	if (short_jump) {
		jump->flags |= PATCH_MB;
		code_ptr += sizeof(sljit_s8);
	} else {
		jump->flags |= PATCH_MW;
		code_ptr += sizeof(sljit_s32);
	}

	return code_ptr;
}

SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compiler)
{
	struct sljit_memory_fragment *buf;
	sljit_u8 *code;
	sljit_u8 *code_ptr;
	sljit_u8 *buf_ptr;
	sljit_u8 *buf_end;
	sljit_u8 len;
	sljit_sw executable_offset;
	sljit_uw jump_addr;

	struct sljit_label *label;
	struct sljit_jump *jump;
	struct sljit_const *const_;
	struct sljit_put_label *put_label;

	CHECK_ERROR_PTR();
	CHECK_PTR(check_sljit_generate_code(compiler));
	reverse_buf(compiler);

	/* Second code generation pass. */
	code = (sljit_u8*)SLJIT_MALLOC_EXEC(compiler->size, compiler->exec_allocator_data);
	PTR_FAIL_WITH_EXEC_IF(code);
	buf = compiler->buf;

	code_ptr = code;
	label = compiler->labels;
	jump = compiler->jumps;
	const_ = compiler->consts;
	put_label = compiler->put_labels;
	executable_offset = SLJIT_EXEC_OFFSET(code);

	do {
		buf_ptr = buf->memory;
		buf_end = buf_ptr + buf->used_size;
		do {
			len = *buf_ptr++;
			if (len > 0) {
				/* The code is already generated. */
				SLJIT_MEMCPY(code_ptr, buf_ptr, len);
				code_ptr += len;
				buf_ptr += len;
			}
			else {
				switch (*buf_ptr) {
				case 0:
					label->addr = (sljit_uw)SLJIT_ADD_EXEC_OFFSET(code_ptr, executable_offset);
					label->size = (sljit_uw)(code_ptr - code);
					label = label->next;
					break;
				case 1:
					jump->addr = (sljit_uw)code_ptr;
					if (!(jump->flags & SLJIT_REWRITABLE_JUMP))
						code_ptr = generate_near_jump_code(jump, code_ptr, code, executable_offset);
					else {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
						code_ptr = generate_far_jump_code(jump, code_ptr, executable_offset);
#else
						code_ptr = generate_far_jump_code(jump, code_ptr);
#endif
					}
					jump = jump->next;
					break;
				case 2:
					const_->addr = ((sljit_uw)code_ptr) - sizeof(sljit_sw);
					const_ = const_->next;
					break;
				default:
					SLJIT_ASSERT(*buf_ptr == 3);
					SLJIT_ASSERT(put_label->label);
					put_label->addr = (sljit_uw)code_ptr;
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
					code_ptr = generate_put_label_code(put_label, code_ptr, (sljit_uw)SLJIT_ADD_EXEC_OFFSET(code, executable_offset) + put_label->label->size);
#endif
					put_label = put_label->next;
					break;
				}
				buf_ptr++;
			}
		} while (buf_ptr < buf_end);
		SLJIT_ASSERT(buf_ptr == buf_end);
		buf = buf->next;
	} while (buf);

	SLJIT_ASSERT(!label);
	SLJIT_ASSERT(!jump);
	SLJIT_ASSERT(!const_);
	SLJIT_ASSERT(!put_label);
	SLJIT_ASSERT(code_ptr <= code + compiler->size);

	jump = compiler->jumps;
	while (jump) {
		if (jump->flags & (PATCH_MB | PATCH_MW)) {
			if (jump->flags & JUMP_LABEL)
				jump_addr = jump->u.label->addr;
			else
				jump_addr = jump->u.target;

			jump_addr -= jump->addr + (sljit_uw)executable_offset;

			if (jump->flags & PATCH_MB) {
				jump_addr -= sizeof(sljit_s8);
				SLJIT_ASSERT((sljit_sw)jump_addr >= -128 && (sljit_sw)jump_addr <= 127);
				*(sljit_u8*)jump->addr = U8(jump_addr);
			} else {
				jump_addr -= sizeof(sljit_s32);
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
				sljit_unaligned_store_sw((void*)jump->addr, (sljit_sw)jump_addr);
#else
				SLJIT_ASSERT((sljit_sw)jump_addr >= HALFWORD_MIN && (sljit_sw)jump_addr <= HALFWORD_MAX);
				sljit_unaligned_store_s32((void*)jump->addr, (sljit_s32)jump_addr);
#endif
			}
		}
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
		else if (jump->flags & PATCH_MD) {
			SLJIT_ASSERT(jump->flags & JUMP_LABEL);
			sljit_unaligned_store_sw((void*)jump->addr, (sljit_sw)jump->u.label->addr);
		}
#endif

		jump = jump->next;
	}

	put_label = compiler->put_labels;
	while (put_label) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		sljit_unaligned_store_sw((void*)(put_label->addr - sizeof(sljit_sw)), (sljit_sw)put_label->label->addr);
#else
		if (put_label->flags & PATCH_MD) {
			SLJIT_ASSERT(put_label->label->addr > HALFWORD_MAX);
			sljit_unaligned_store_sw((void*)(put_label->addr - sizeof(sljit_sw)), (sljit_sw)put_label->label->addr);
		}
		else {
			SLJIT_ASSERT(put_label->label->addr <= HALFWORD_MAX);
			sljit_unaligned_store_s32((void*)(put_label->addr - sizeof(sljit_s32)), (sljit_s32)put_label->label->addr);
		}
#endif

		put_label = put_label->next;
	}

	compiler->error = SLJIT_ERR_COMPILED;
	compiler->executable_offset = executable_offset;
	compiler->executable_size = (sljit_uw)(code_ptr - code);

	code = (sljit_u8*)SLJIT_ADD_EXEC_OFFSET(code, executable_offset);

	SLJIT_UPDATE_WX_FLAGS(code, (sljit_u8*)SLJIT_ADD_EXEC_OFFSET(code_ptr, executable_offset), 1);
	return (void*)code;
}

SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_has_cpu_feature(sljit_s32 feature_type)
{
	switch (feature_type) {
	case SLJIT_HAS_FPU:
#ifdef SLJIT_IS_FPU_AVAILABLE
		return (SLJIT_IS_FPU_AVAILABLE) != 0;
#elif (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
		if (cpu_feature_list == 0)
			get_cpu_features();
		return (cpu_feature_list & CPU_FEATURE_SSE2) != 0;
#else /* SLJIT_DETECT_SSE2 */
		return 1;
#endif /* SLJIT_DETECT_SSE2 */

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
	case SLJIT_HAS_VIRTUAL_REGISTERS:
		return 1;
#endif /* SLJIT_CONFIG_X86_32 */

	case SLJIT_HAS_CLZ:
		if (cpu_feature_list == 0)
			get_cpu_features();

		return (cpu_feature_list & CPU_FEATURE_LZCNT) ? 1 : 2;

	case SLJIT_HAS_CTZ:
		if (cpu_feature_list == 0)
			get_cpu_features();

		return (cpu_feature_list & CPU_FEATURE_TZCNT) ? 1 : 2;

	case SLJIT_HAS_CMOV:
		if (cpu_feature_list == 0)
			get_cpu_features();
		return (cpu_feature_list & CPU_FEATURE_CMOV) != 0;

	case SLJIT_HAS_REV:
	case SLJIT_HAS_ROT:
	case SLJIT_HAS_PREFETCH:
	case SLJIT_HAS_COPY_F32:
	case SLJIT_HAS_COPY_F64:
	case SLJIT_HAS_ATOMIC:
		return 1;

#if !(defined SLJIT_IS_FPU_AVAILABLE) || SLJIT_IS_FPU_AVAILABLE
	case SLJIT_HAS_AVX:
		if (cpu_feature_list == 0)
			get_cpu_features();
		return (cpu_feature_list & CPU_FEATURE_AVX) != 0;
	case SLJIT_HAS_AVX2:
		if (cpu_feature_list == 0)
			get_cpu_features();
		return (cpu_feature_list & CPU_FEATURE_AVX2) != 0;
	case SLJIT_HAS_SIMD:
		if (cpu_feature_list == 0)
			get_cpu_features();
		return (cpu_feature_list & CPU_FEATURE_SSE41) != 0;
#endif /* SLJIT_IS_FPU_AVAILABLE */
	default:
		return 0;
	}
}

SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_cmp_info(sljit_s32 type)
{
	switch (type) {
	case SLJIT_ORDERED_EQUAL:
	case SLJIT_UNORDERED_OR_NOT_EQUAL:
		return 2;
	}

	return 0;
}

/* --------------------------------------------------------------------- */
/*  Operators                                                            */
/* --------------------------------------------------------------------- */

#define BINARY_OPCODE(opcode) (((opcode ## _EAX_i32) << 24) | ((opcode ## _r_rm) << 16) | ((opcode ## _rm_r) << 8) | (opcode))
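
/* Worked example (illustrative): BINARY_OPCODE(ADD) packs the four ADD
   encodings into one word:
     (0x05 << 24) | (0x03 << 16) | (0x01 << 8) | 0x00 == 0x05030100
   i.e. the EAX-immediate form, reg<-rm form, rm<-reg form and the /0
   ModR/M digit used by the 0x81/0x83 immediate groups. */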

#define BINARY_IMM32(op_imm, immw, arg, argw) \
	do { \
		inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, immw, arg, argw); \
		FAIL_IF(!inst); \
		*(inst + 1) |= (op_imm); \
	} while (0)
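
/* Note: emit_x86_instruction returns a pointer to the opcode byte, so
   inst[1] is the ModR/M byte; OR-ing op_imm (a /digit value such as
   ADD == 0 << 3 or SUB == 5 << 3) into its reg field selects the
   operation within the 0x81/0x83 immediate group. */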

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)

#define BINARY_IMM(op_imm, op_mr, immw, arg, argw) \
	do { \
		if (IS_HALFWORD(immw) || compiler->mode32) { \
			BINARY_IMM32(op_imm, immw, arg, argw); \
		} \
		else { \
			FAIL_IF(emit_load_imm64(compiler, (arg == TMP_REG1) ? TMP_REG2 : TMP_REG1, immw)); \
			inst = emit_x86_instruction(compiler, 1, (arg == TMP_REG1) ? TMP_REG2 : TMP_REG1, 0, arg, argw); \
			FAIL_IF(!inst); \
			*inst = (op_mr); \
		} \
	} while (0)

#define BINARY_EAX_IMM(op_eax_imm, immw) \
	FAIL_IF(emit_do_imm32(compiler, (!compiler->mode32) ? REX_W : 0, (op_eax_imm), immw))

#else /* !SLJIT_CONFIG_X86_64 */

#define BINARY_IMM(op_imm, op_mr, immw, arg, argw) \
	BINARY_IMM32(op_imm, immw, arg, argw)

#define BINARY_EAX_IMM(op_eax_imm, immw) \
	FAIL_IF(emit_do_imm(compiler, (op_eax_imm), immw))

#endif /* SLJIT_CONFIG_X86_64 */

static sljit_s32 emit_byte(struct sljit_compiler *compiler, sljit_u8 byte)
{
	sljit_u8 *inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
	FAIL_IF(!inst);
	INC_SIZE(1);
	*inst = byte;
	return SLJIT_SUCCESS;
}

static sljit_s32 emit_mov(struct sljit_compiler *compiler,
	sljit_s32 dst, sljit_sw dstw,
	sljit_s32 src, sljit_sw srcw);

#define EMIT_MOV(compiler, dst, dstw, src, srcw) \
	FAIL_IF(emit_mov(compiler, dst, dstw, src, srcw));

static sljit_s32 emit_groupf(struct sljit_compiler *compiler,
	sljit_uw op,
	sljit_s32 dst, sljit_s32 src, sljit_sw srcw);

static sljit_s32 emit_groupf_ext(struct sljit_compiler *compiler,
	sljit_uw op,
	sljit_s32 dst, sljit_s32 src, sljit_sw srcw);

static SLJIT_INLINE sljit_s32 emit_sse2_store(struct sljit_compiler *compiler,
	sljit_s32 single, sljit_s32 dst, sljit_sw dstw, sljit_s32 src);

static SLJIT_INLINE sljit_s32 emit_sse2_load(struct sljit_compiler *compiler,
	sljit_s32 single, sljit_s32 dst, sljit_s32 src, sljit_sw srcw);

static sljit_s32 emit_cmp_binary(struct sljit_compiler *compiler,
	sljit_s32 src1, sljit_sw src1w,
	sljit_s32 src2, sljit_sw src2w);

static sljit_s32 emit_cmov_generic(struct sljit_compiler *compiler, sljit_s32 type,
	sljit_s32 dst_reg,
	sljit_s32 src, sljit_sw srcw);

static SLJIT_INLINE sljit_s32 emit_endbranch(struct sljit_compiler *compiler)
{
#if (defined SLJIT_CONFIG_X86_CET && SLJIT_CONFIG_X86_CET)
	/* Emit endbr32/endbr64 when CET is enabled.  */
	sljit_u8 *inst;
	inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
	FAIL_IF(!inst);
	INC_SIZE(4);
	inst[0] = GROUP_F3;
	inst[1] = GROUP_0F;
	inst[2] = 0x1e;
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
	inst[3] = 0xfb;
#else /* !SLJIT_CONFIG_X86_32 */
	inst[3] = 0xfa;
#endif /* SLJIT_CONFIG_X86_32 */
#else /* !SLJIT_CONFIG_X86_CET */
	SLJIT_UNUSED_ARG(compiler);
#endif /* SLJIT_CONFIG_X86_CET */
	return SLJIT_SUCCESS;
}

#if (defined SLJIT_CONFIG_X86_CET && SLJIT_CONFIG_X86_CET) && defined (__SHSTK__)

static SLJIT_INLINE sljit_s32 emit_rdssp(struct sljit_compiler *compiler, sljit_s32 reg)
{
	sljit_u8 *inst;
	sljit_s32 size;

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	size = 5;
#else
	size = 4;
#endif

	inst = (sljit_u8*)ensure_buf(compiler, 1 + size);
	FAIL_IF(!inst);
	INC_SIZE(size);
	*inst++ = GROUP_F3;
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	*inst++ = REX_W | (reg_map[reg] <= 7 ? 0 : REX_B);
#endif
	inst[0] = GROUP_0F;
	inst[1] = 0x1e;
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	inst[2] = U8(MOD_REG | (0x1 << 3) | reg_lmap[reg]);
#else
	inst[2] = U8(MOD_REG | (0x1 << 3) | reg_map[reg]);
#endif
	return SLJIT_SUCCESS;
}

static SLJIT_INLINE sljit_s32 emit_incssp(struct sljit_compiler *compiler, sljit_s32 reg)
{
	sljit_u8 *inst;
	sljit_s32 size;

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	size = 5;
#else
	size = 4;
#endif

	inst = (sljit_u8*)ensure_buf(compiler, 1 + size);
	FAIL_IF(!inst);
	INC_SIZE(size);
	*inst++ = GROUP_F3;
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	*inst++ = REX_W | (reg_map[reg] <= 7 ? 0 : REX_B);
#endif
	inst[0] = GROUP_0F;
	inst[1] = 0xae;
	inst[2] = (0x3 << 6) | (0x5 << 3) | (reg_map[reg] & 0x7);
	return SLJIT_SUCCESS;
}

#endif /* SLJIT_CONFIG_X86_CET && __SHSTK__ */

static SLJIT_INLINE sljit_s32 cpu_has_shadow_stack(void)
{
#if (defined SLJIT_CONFIG_X86_CET && SLJIT_CONFIG_X86_CET) && defined (__SHSTK__)
	return _get_ssp() != 0;
#else /* !SLJIT_CONFIG_X86_CET || !__SHSTK__ */
	return 0;
#endif /* SLJIT_CONFIG_X86_CET && __SHSTK__ */
}

static SLJIT_INLINE sljit_s32 adjust_shadow_stack(struct sljit_compiler *compiler,
	sljit_s32 src, sljit_sw srcw)
{
#if (defined SLJIT_CONFIG_X86_CET && SLJIT_CONFIG_X86_CET) && defined (__SHSTK__)
	sljit_u8 *inst, *jz_after_cmp_inst;
	sljit_uw size_jz_after_cmp_inst;

	sljit_uw size_before_rdssp_inst = compiler->size;

	/* Generate "RDSSP TMP_REG1". */
	FAIL_IF(emit_rdssp(compiler, TMP_REG1));

	/* Load return address on shadow stack into TMP_REG1. */
	EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_MEM1(TMP_REG1), 0);

	/* Compare return address against TMP_REG1. */
	FAIL_IF(emit_cmp_binary(compiler, TMP_REG1, 0, src, srcw));
	/* Generate JZ to skip the shadow stack adjustment when the shadow
	   stack matches the normal stack. */
	inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
	FAIL_IF(!inst);
	INC_SIZE(2);
	*inst++ = get_jump_code(SLJIT_EQUAL) - 0x10;
	size_jz_after_cmp_inst = compiler->size;
	jz_after_cmp_inst = inst;

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	/* REX_W is not necessary. */
	compiler->mode32 = 1;
#endif
	/* Load 1 into TMP_REG1. */
	EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, 1);

	/* Generate "INCSSP TMP_REG1". */
	FAIL_IF(emit_incssp(compiler, TMP_REG1));

	/* Jump back to "RDSSP TMP_REG1" to check shadow stack again. */
	inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
	FAIL_IF(!inst);
	INC_SIZE(2);
	inst[0] = JMP_i8;
	inst[1] = size_before_rdssp_inst - compiler->size;

	*jz_after_cmp_inst = compiler->size - size_jz_after_cmp_inst;
#else /* !SLJIT_CONFIG_X86_CET || !__SHSTK__ */
	SLJIT_UNUSED_ARG(compiler);
	SLJIT_UNUSED_ARG(src);
	SLJIT_UNUSED_ARG(srcw);
#endif /* SLJIT_CONFIG_X86_CET && __SHSTK__ */
	return SLJIT_SUCCESS;
}

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
#include "sljitNativeX86_32.c"
#else
#include "sljitNativeX86_64.c"
#endif

static sljit_s32 emit_mov(struct sljit_compiler *compiler,
	sljit_s32 dst, sljit_sw dstw,
	sljit_s32 src, sljit_sw srcw)
{
	sljit_u8* inst;

	if (FAST_IS_REG(src)) {
		inst = emit_x86_instruction(compiler, 1, src, 0, dst, dstw);
		FAIL_IF(!inst);
		*inst = MOV_rm_r;
		return SLJIT_SUCCESS;
	}

	if (src == SLJIT_IMM) {
		if (FAST_IS_REG(dst)) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
			return emit_do_imm(compiler, MOV_r_i32 | reg_map[dst], srcw);
#else
			if (!compiler->mode32) {
				if (NOT_HALFWORD(srcw))
					return emit_load_imm64(compiler, dst, srcw);
			}
			else
				return emit_do_imm32(compiler, (reg_map[dst] >= 8) ? REX_B : 0, U8(MOV_r_i32 | reg_lmap[dst]), srcw);
#endif
		}
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
		if (!compiler->mode32 && NOT_HALFWORD(srcw)) {
			/* Immediate to memory move. Only SLJIT_MOV operation copies
			   an immediate directly into memory so TMP_REG1 can be used. */
			FAIL_IF(emit_load_imm64(compiler, TMP_REG1, srcw));
			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
			FAIL_IF(!inst);
			*inst = MOV_rm_r;
			return SLJIT_SUCCESS;
		}
#endif
		inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, dstw);
		FAIL_IF(!inst);
		*inst = MOV_rm_i32;
		return SLJIT_SUCCESS;
	}
	if (FAST_IS_REG(dst)) {
		inst = emit_x86_instruction(compiler, 1, dst, 0, src, srcw);
		FAIL_IF(!inst);
		*inst = MOV_r_rm;
		return SLJIT_SUCCESS;
	}

	/* Memory to memory move. Only SLJIT_MOV operation copies
	   data from memory to memory so TMP_REG1 can be used. */
	inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src, srcw);
	FAIL_IF(!inst);
	*inst = MOV_r_rm;
	inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
	FAIL_IF(!inst);
	*inst = MOV_rm_r;
	return SLJIT_SUCCESS;
}

static sljit_s32 emit_cmov_generic(struct sljit_compiler *compiler, sljit_s32 type,
	sljit_s32 dst_reg,
	sljit_s32 src, sljit_sw srcw)
{
	sljit_u8* inst;
	sljit_uw size;

	SLJIT_ASSERT(type >= SLJIT_EQUAL && type <= SLJIT_ORDERED_LESS_EQUAL);

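	/* Sketch of the technique: without CMOV support the conditional move
	   is emulated by a short jump over a plain MOV. inst[0] below is a
	   "jcc rel8" with the inverted condition (type ^ 0x1 flips the
	   condition, - 0x10 converts the long jcc form to the short one),
	   and inst[1] is patched afterwards with the size of the MOV. */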
	inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
	FAIL_IF(!inst);
	INC_SIZE(2);
	inst[0] = U8(get_jump_code((sljit_uw)type ^ 0x1) - 0x10);

	size = compiler->size;
	EMIT_MOV(compiler, dst_reg, 0, src, srcw);

	inst[1] = U8(compiler->size - size);
	return SLJIT_SUCCESS;
}

SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compiler *compiler, sljit_s32 op)
{
	sljit_u8 *inst;
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	sljit_uw size;
#endif

	CHECK_ERROR();
	CHECK(check_sljit_emit_op0(compiler, op));

	switch (GET_OPCODE(op)) {
	case SLJIT_BREAKPOINT:
		return emit_byte(compiler, INT3);
	case SLJIT_NOP:
		return emit_byte(compiler, NOP);
	case SLJIT_LMUL_UW:
	case SLJIT_LMUL_SW:
	case SLJIT_DIVMOD_UW:
	case SLJIT_DIVMOD_SW:
	case SLJIT_DIV_UW:
	case SLJIT_DIV_SW:
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
#ifdef _WIN64
		SLJIT_ASSERT(
			reg_map[SLJIT_R0] == 0
			&& reg_map[SLJIT_R1] == 2
			&& reg_map[TMP_REG1] > 7);
#else
		SLJIT_ASSERT(
			reg_map[SLJIT_R0] == 0
			&& reg_map[SLJIT_R1] < 7
			&& reg_map[TMP_REG1] == 2);
#endif
		compiler->mode32 = op & SLJIT_32;
#endif
		SLJIT_COMPILE_ASSERT((SLJIT_DIVMOD_UW & 0x2) == 0 && SLJIT_DIV_UW - 0x2 == SLJIT_DIVMOD_UW, bad_div_opcode_assignments);

		op = GET_OPCODE(op);
		if ((op | 0x2) == SLJIT_DIV_UW) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) || defined(_WIN64)
			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_R1, 0);
			inst = emit_x86_instruction(compiler, 1, SLJIT_R1, 0, SLJIT_R1, 0);
#else
			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, TMP_REG1, 0);
#endif
			FAIL_IF(!inst);
			*inst = XOR_r_rm;
		}

		if ((op | 0x2) == SLJIT_DIV_SW) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) || defined(_WIN64)
			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_R1, 0);
#endif

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
			FAIL_IF(emit_byte(compiler, CDQ));
#else
			if (!compiler->mode32) {
				inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
				FAIL_IF(!inst);
				INC_SIZE(2);
				inst[0] = REX_W;
				inst[1] = CDQ;
			} else
				FAIL_IF(emit_byte(compiler, CDQ));
#endif
		}

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
		FAIL_IF(!inst);
		INC_SIZE(2);
		inst[0] = GROUP_F7;
		inst[1] = MOD_REG | ((op >= SLJIT_DIVMOD_UW) ? reg_map[TMP_REG1] : reg_map[SLJIT_R1]);
#else /* !SLJIT_CONFIG_X86_32 */
#ifdef _WIN64
		size = (!compiler->mode32 || op >= SLJIT_DIVMOD_UW) ? 3 : 2;
#else /* !_WIN64 */
		size = (!compiler->mode32) ? 3 : 2;
#endif /* _WIN64 */
		inst = (sljit_u8*)ensure_buf(compiler, 1 + size);
		FAIL_IF(!inst);
		INC_SIZE(size);
#ifdef _WIN64
		if (!compiler->mode32)
			*inst++ = REX_W | ((op >= SLJIT_DIVMOD_UW) ? REX_B : 0);
		else if (op >= SLJIT_DIVMOD_UW)
			*inst++ = REX_B;
		inst[0] = GROUP_F7;
		inst[1] = MOD_REG | ((op >= SLJIT_DIVMOD_UW) ? reg_lmap[TMP_REG1] : reg_lmap[SLJIT_R1]);
#else /* !_WIN64 */
		if (!compiler->mode32)
			*inst++ = REX_W;
		inst[0] = GROUP_F7;
		inst[1] = MOD_REG | reg_map[SLJIT_R1];
#endif /* _WIN64 */
#endif /* SLJIT_CONFIG_X86_32 */
		switch (op) {
		case SLJIT_LMUL_UW:
			inst[1] |= MUL;
			break;
		case SLJIT_LMUL_SW:
			inst[1] |= IMUL;
			break;
		case SLJIT_DIVMOD_UW:
		case SLJIT_DIV_UW:
			inst[1] |= DIV;
			break;
		case SLJIT_DIVMOD_SW:
		case SLJIT_DIV_SW:
			inst[1] |= IDIV;
			break;
		}
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) && !defined(_WIN64)
		if (op <= SLJIT_DIVMOD_SW)
			EMIT_MOV(compiler, SLJIT_R1, 0, TMP_REG1, 0);
#else
		if (op >= SLJIT_DIV_UW)
			EMIT_MOV(compiler, SLJIT_R1, 0, TMP_REG1, 0);
#endif
		break;
	case SLJIT_ENDBR:
		return emit_endbranch(compiler);
	case SLJIT_SKIP_FRAMES_BEFORE_RETURN:
		return skip_frames_before_return(compiler);
	}

	return SLJIT_SUCCESS;
}

static sljit_s32 emit_mov_byte(struct sljit_compiler *compiler, sljit_s32 sign,
	sljit_s32 dst, sljit_sw dstw,
	sljit_s32 src, sljit_sw srcw)
{
	sljit_u8* inst;
	sljit_s32 dst_r;

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	compiler->mode32 = 0;
#endif

	if (src == SLJIT_IMM) {
		if (FAST_IS_REG(dst)) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
			return emit_do_imm(compiler, MOV_r_i32 | reg_map[dst], srcw);
#else
			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, 0);
			FAIL_IF(!inst);
			*inst = MOV_rm_i32;
			return SLJIT_SUCCESS;
#endif
		}
		inst = emit_x86_instruction(compiler, 1 | EX86_BYTE_ARG | EX86_NO_REXW, SLJIT_IMM, srcw, dst, dstw);
		FAIL_IF(!inst);
		*inst = MOV_rm8_i8;
		return SLJIT_SUCCESS;
	}

	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;

	if ((dst & SLJIT_MEM) && FAST_IS_REG(src)) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		if (reg_map[src] >= 4) {
			SLJIT_ASSERT(dst_r == TMP_REG1);
			EMIT_MOV(compiler, TMP_REG1, 0, src, 0);
		} else
			dst_r = src;
#else
		dst_r = src;
#endif
	} else {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		if (FAST_IS_REG(src) && reg_map[src] >= 4) {
			/* Both src and dst are registers. */
			SLJIT_ASSERT(FAST_IS_REG(dst));

			if (src == dst && !sign) {
				inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, 0xff, dst, 0);
				FAIL_IF(!inst);
				*(inst + 1) |= AND;
				return SLJIT_SUCCESS;
			}

			EMIT_MOV(compiler, TMP_REG1, 0, src, 0);
			src = TMP_REG1;
			srcw = 0;
		}
#endif /* SLJIT_CONFIG_X86_32 */

		/* src is either a memory operand, or a register with reg_map[src] < 4 on x86-32. */
		FAIL_IF(emit_groupf(compiler, sign ? MOVSX_r_rm8 : MOVZX_r_rm8, dst_r, src, srcw));
	}

	if (dst & SLJIT_MEM) {
		inst = emit_x86_instruction(compiler, 1 | EX86_REX | EX86_NO_REXW, dst_r, 0, dst, dstw);
		FAIL_IF(!inst);
		*inst = MOV_rm8_r8;
	}

	return SLJIT_SUCCESS;
}

static sljit_s32 emit_prefetch(struct sljit_compiler *compiler, sljit_s32 op,
	sljit_s32 src, sljit_sw srcw)
{
	sljit_u8* inst;

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	compiler->mode32 = 1;
#endif

	inst = emit_x86_instruction(compiler, 2, 0, 0, src, srcw);
	FAIL_IF(!inst);
	inst[0] = GROUP_0F;
	inst[1] = PREFETCH;

	if (op == SLJIT_PREFETCH_L1)
		inst[2] |= (1 << 3);
	else if (op == SLJIT_PREFETCH_L2)
		inst[2] |= (2 << 3);
	else if (op == SLJIT_PREFETCH_L3)
		inst[2] |= (3 << 3);

	return SLJIT_SUCCESS;
}

static sljit_s32 emit_mov_half(struct sljit_compiler *compiler, sljit_s32 sign,
	sljit_s32 dst, sljit_sw dstw,
	sljit_s32 src, sljit_sw srcw)
{
	sljit_u8* inst;
	sljit_s32 dst_r;

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	compiler->mode32 = 0;
#endif

	if (src == SLJIT_IMM) {
		if (FAST_IS_REG(dst)) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
			return emit_do_imm(compiler, MOV_r_i32 | reg_map[dst], srcw);
#else
			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, 0);
			FAIL_IF(!inst);
			*inst = MOV_rm_i32;
			return SLJIT_SUCCESS;
#endif
		}
		inst = emit_x86_instruction(compiler, 1 | EX86_HALF_ARG | EX86_NO_REXW | EX86_PREF_66, SLJIT_IMM, srcw, dst, dstw);
		FAIL_IF(!inst);
		*inst = MOV_rm_i32;
		return SLJIT_SUCCESS;
	}

	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;

	if ((dst & SLJIT_MEM) && FAST_IS_REG(src))
		dst_r = src;
	else
		FAIL_IF(emit_groupf(compiler, sign ? MOVSX_r_rm16 : MOVZX_r_rm16, dst_r, src, srcw));

	if (dst & SLJIT_MEM) {
		inst = emit_x86_instruction(compiler, 1 | EX86_NO_REXW | EX86_PREF_66, dst_r, 0, dst, dstw);
		FAIL_IF(!inst);
		*inst = MOV_rm_r;
	}

	return SLJIT_SUCCESS;
}

static sljit_s32 emit_unary(struct sljit_compiler *compiler, sljit_u8 opcode,
	sljit_s32 dst, sljit_sw dstw,
	sljit_s32 src, sljit_sw srcw)
{
	sljit_u8* inst;

	if (dst == src && dstw == srcw) {
		/* Same input and output */
		inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
		FAIL_IF(!inst);
		inst[0] = GROUP_F7;
		inst[1] |= opcode;
		return SLJIT_SUCCESS;
	}

	if (FAST_IS_REG(dst)) {
		EMIT_MOV(compiler, dst, 0, src, srcw);
		inst = emit_x86_instruction(compiler, 1, 0, 0, dst, 0);
		FAIL_IF(!inst);
		inst[0] = GROUP_F7;
		inst[1] |= opcode;
		return SLJIT_SUCCESS;
	}

	EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
	inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
	FAIL_IF(!inst);
	inst[0] = GROUP_F7;
	inst[1] |= opcode;
	EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
	return SLJIT_SUCCESS;
}

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
static const sljit_sw emit_clz_arg = 32 + 31;
static const sljit_sw emit_ctz_arg = 32;
#endif
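
/* Worked example (illustrative): BSR returns the index of the highest set
   bit, so CLZ is computed as BSR(x) XOR 31 in 32-bit mode. BSR/BSF leave
   the destination undefined when the input is 0, therefore the fallback
   value 32 + 31 is loaded via CMOV in that case, which the final XOR 31
   turns into 32, the expected CLZ(0). CTZ(0) similarly falls back to 32
   (or 64 in 64-bit mode) with no XOR needed after BSF. */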

static sljit_s32 emit_clz_ctz(struct sljit_compiler *compiler, sljit_s32 is_clz,
	sljit_s32 dst, sljit_sw dstw,
	sljit_s32 src, sljit_sw srcw)
{
	sljit_u8* inst;
	sljit_s32 dst_r;
	sljit_sw max;

	SLJIT_ASSERT(cpu_feature_list != 0);

	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;

	if (is_clz ? (cpu_feature_list & CPU_FEATURE_LZCNT) : (cpu_feature_list & CPU_FEATURE_TZCNT)) {
		FAIL_IF(emit_groupf(compiler, (is_clz ? LZCNT_r_rm : TZCNT_r_rm) | EX86_PREF_F3, dst_r, src, srcw));

		if (dst & SLJIT_MEM)
			EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
		return SLJIT_SUCCESS;
	}

	FAIL_IF(emit_groupf(compiler, is_clz ? BSR_r_rm : BSF_r_rm, dst_r, src, srcw));

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
	max = is_clz ? (32 + 31) : 32;

	if (cpu_feature_list & CPU_FEATURE_CMOV) {
		if (dst_r != TMP_REG1) {
			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, max);
			inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG1, 0);
		}
		else
			inst = emit_x86_instruction(compiler, 2, dst_r, 0, SLJIT_MEM0(), is_clz ? (sljit_sw)&emit_clz_arg : (sljit_sw)&emit_ctz_arg);

		FAIL_IF(!inst);
		inst[0] = GROUP_0F;
		inst[1] = CMOVE_r_rm;
	}
	else
		FAIL_IF(emit_cmov_generic(compiler, SLJIT_EQUAL, dst_r, SLJIT_IMM, max));

	if (is_clz) {
		inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, 31, dst_r, 0);
		FAIL_IF(!inst);
		*(inst + 1) |= XOR;
	}
#else
	if (is_clz)
		max = compiler->mode32 ? (32 + 31) : (64 + 63);
	else
		max = compiler->mode32 ? 32 : 64;

	if (cpu_feature_list & CPU_FEATURE_CMOV) {
		EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_IMM, max);
		FAIL_IF(emit_groupf(compiler, CMOVE_r_rm, dst_r, TMP_REG2, 0));
	} else
		FAIL_IF(emit_cmov_generic(compiler, SLJIT_EQUAL, dst_r, SLJIT_IMM, max));

	if (is_clz) {
		inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, max >> 1, dst_r, 0);
		FAIL_IF(!inst);
		*(inst + 1) |= XOR;
	}
#endif

	if (dst & SLJIT_MEM)
		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
	return SLJIT_SUCCESS;
}

static sljit_s32 emit_bswap(struct sljit_compiler *compiler,
	sljit_s32 op,
	sljit_s32 dst, sljit_sw dstw,
	sljit_s32 src, sljit_sw srcw)
{
	sljit_u8 *inst;
	sljit_s32 dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
	sljit_uw size;
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	sljit_u8 rex = 0;
#else /* !SLJIT_CONFIG_X86_64 */
	sljit_s32 dst_is_ereg = op & SLJIT_32;
#endif /* SLJIT_CONFIG_X86_64 */

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	if (op == SLJIT_REV_U32 || op == SLJIT_REV_S32)
		compiler->mode32 = 1;
#else /* !SLJIT_CONFIG_X86_64 */
	op &= ~SLJIT_32;
#endif /* SLJIT_CONFIG_X86_64 */

	if (src != dst_r) {
		/* Only the lower 16 bits are read for eregs. */
		if (op == SLJIT_REV_U16 || op == SLJIT_REV_S16)
			FAIL_IF(emit_mov_half(compiler, 0, dst_r, 0, src, srcw));
		else
			EMIT_MOV(compiler, dst_r, 0, src, srcw);
	}

	size = 2;
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	if (!compiler->mode32)
		rex = REX_W;

	if (reg_map[dst_r] >= 8)
		rex |= REX_B;

	if (rex != 0)
		size++;
#endif /* SLJIT_CONFIG_X86_64 */

	inst = (sljit_u8*)ensure_buf(compiler, 1 + size);
	FAIL_IF(!inst);
	INC_SIZE(size);

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	if (rex != 0)
		*inst++ = rex;

	inst[0] = GROUP_0F;
	inst[1] = BSWAP_r | reg_lmap[dst_r];
#else /* !SLJIT_CONFIG_X86_64 */
	inst[0] = GROUP_0F;
	inst[1] = BSWAP_r | reg_map[dst_r];
#endif /* SLJIT_CONFIG_X86_64 */

	if (op == SLJIT_REV_U16 || op == SLJIT_REV_S16) {
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
		size = compiler->mode32 ? 16 : 48;
#else /* !SLJIT_CONFIG_X86_64 */
		size = 16;
#endif /* SLJIT_CONFIG_X86_64 */

		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, (sljit_sw)size, dst_r, 0);
		FAIL_IF(!inst);
		if (op == SLJIT_REV_U16)
			inst[1] |= SHR;
		else
			inst[1] |= SAR;
	}

	if (dst & SLJIT_MEM) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		if (dst_is_ereg)
			op = SLJIT_REV;
#endif /* SLJIT_CONFIG_X86_32 */
		if (op == SLJIT_REV_U16 || op == SLJIT_REV_S16)
			return emit_mov_half(compiler, 0, dst, dstw, TMP_REG1, 0);

		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
	}

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	if (op == SLJIT_REV_S32) {
		compiler->mode32 = 0;
		inst = emit_x86_instruction(compiler, 1, dst, 0, dst, 0);
		FAIL_IF(!inst);
		*inst = MOVSXD_r_rm;
	}
#endif /* SLJIT_CONFIG_X86_64 */

	return SLJIT_SUCCESS;
}

SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op1(struct sljit_compiler *compiler, sljit_s32 op,
	sljit_s32 dst, sljit_sw dstw,
	sljit_s32 src, sljit_sw srcw)
{
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
	sljit_s32 dst_is_ereg = 0;
#else /* !SLJIT_CONFIG_X86_32 */
	sljit_s32 op_flags = GET_ALL_FLAGS(op);
#endif /* SLJIT_CONFIG_X86_32 */

	CHECK_ERROR();
	CHECK(check_sljit_emit_op1(compiler, op, dst, dstw, src, srcw));
	ADJUST_LOCAL_OFFSET(dst, dstw);
	ADJUST_LOCAL_OFFSET(src, srcw);

	CHECK_EXTRA_REGS(dst, dstw, dst_is_ereg = 1);
	CHECK_EXTRA_REGS(src, srcw, (void)0);
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	compiler->mode32 = op_flags & SLJIT_32;
#endif /* SLJIT_CONFIG_X86_64 */

	op = GET_OPCODE(op);

	if (op >= SLJIT_MOV && op <= SLJIT_MOV_P) {
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
		compiler->mode32 = 0;
#endif /* SLJIT_CONFIG_X86_64 */

		if (FAST_IS_REG(src) && src == dst) {
			if (!TYPE_CAST_NEEDED(op))
				return SLJIT_SUCCESS;
		}

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
		if (op_flags & SLJIT_32) {
			if (src & SLJIT_MEM) {
				if (op == SLJIT_MOV_S32)
					op = SLJIT_MOV_U32;
			}
			else if (src == SLJIT_IMM) {
				if (op == SLJIT_MOV_U32)
					op = SLJIT_MOV_S32;
			}
		}
#endif /* SLJIT_CONFIG_X86_64 */

		if (src == SLJIT_IMM) {
			switch (op) {
			case SLJIT_MOV_U8:
				srcw = (sljit_u8)srcw;
				break;
			case SLJIT_MOV_S8:
				srcw = (sljit_s8)srcw;
				break;
			case SLJIT_MOV_U16:
				srcw = (sljit_u16)srcw;
				break;
			case SLJIT_MOV_S16:
				srcw = (sljit_s16)srcw;
				break;
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
			case SLJIT_MOV_U32:
				srcw = (sljit_u32)srcw;
				break;
			case SLJIT_MOV_S32:
				srcw = (sljit_s32)srcw;
				break;
#endif /* SLJIT_CONFIG_X86_64 */
			}
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
			if (SLJIT_UNLIKELY(dst_is_ereg))
				return emit_mov(compiler, dst, dstw, src, srcw);
#endif /* SLJIT_CONFIG_X86_32 */
		}

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		if (SLJIT_UNLIKELY(dst_is_ereg) && (!(op == SLJIT_MOV || op == SLJIT_MOV_U32 || op == SLJIT_MOV_S32 || op == SLJIT_MOV_P) || (src & SLJIT_MEM))) {
			SLJIT_ASSERT(dst == SLJIT_MEM1(SLJIT_SP));
			dst = TMP_REG1;
		}
#endif /* SLJIT_CONFIG_X86_32 */

		switch (op) {
		case SLJIT_MOV:
		case SLJIT_MOV_P:
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		case SLJIT_MOV_U32:
		case SLJIT_MOV_S32:
		case SLJIT_MOV32:
#endif /* SLJIT_CONFIG_X86_32 */
			EMIT_MOV(compiler, dst, dstw, src, srcw);
			break;
		case SLJIT_MOV_U8:
			FAIL_IF(emit_mov_byte(compiler, 0, dst, dstw, src, srcw));
			break;
		case SLJIT_MOV_S8:
			FAIL_IF(emit_mov_byte(compiler, 1, dst, dstw, src, srcw));
			break;
		case SLJIT_MOV_U16:
			FAIL_IF(emit_mov_half(compiler, 0, dst, dstw, src, srcw));
			break;
		case SLJIT_MOV_S16:
			FAIL_IF(emit_mov_half(compiler, 1, dst, dstw, src, srcw));
			break;
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
		case SLJIT_MOV_U32:
			FAIL_IF(emit_mov_int(compiler, 0, dst, dstw, src, srcw));
			break;
		case SLJIT_MOV_S32:
			FAIL_IF(emit_mov_int(compiler, 1, dst, dstw, src, srcw));
			break;
		case SLJIT_MOV32:
			compiler->mode32 = 1;
			EMIT_MOV(compiler, dst, dstw, src, srcw);
			compiler->mode32 = 0;
			break;
#endif /* SLJIT_CONFIG_X86_64 */
		}

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		if (SLJIT_UNLIKELY(dst_is_ereg) && dst == TMP_REG1)
			return emit_mov(compiler, SLJIT_MEM1(SLJIT_SP), dstw, TMP_REG1, 0);
#endif /* SLJIT_CONFIG_X86_32 */
		return SLJIT_SUCCESS;
	}

	switch (op) {
	case SLJIT_CLZ:
	case SLJIT_CTZ:
		return emit_clz_ctz(compiler, (op == SLJIT_CLZ), dst, dstw, src, srcw);
	case SLJIT_REV:
	case SLJIT_REV_U16:
	case SLJIT_REV_S16:
1813 	case SLJIT_REV_U32:
1814 	case SLJIT_REV_S32:
1815 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1816 		if (dst_is_ereg)
1817 			op |= SLJIT_32;
1818 #endif /* SLJIT_CONFIG_X86_32 */
1819 		return emit_bswap(compiler, op, dst, dstw, src, srcw);
1820 	}
1821 
1822 	return SLJIT_SUCCESS;
1823 }
1824 
1825 static sljit_s32 emit_cum_binary(struct sljit_compiler *compiler,
1826 	sljit_u32 op_types,
1827 	sljit_s32 dst, sljit_sw dstw,
1828 	sljit_s32 src1, sljit_sw src1w,
1829 	sljit_s32 src2, sljit_sw src2w)
1830 {
1831 	sljit_u8* inst;
1832 	sljit_u8 op_eax_imm = U8(op_types >> 24);
1833 	sljit_u8 op_rm = U8((op_types >> 16) & 0xff);
1834 	sljit_u8 op_mr = U8((op_types >> 8) & 0xff);
1835 	sljit_u8 op_imm = U8(op_types & 0xff);
1836 
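	/* op_types packs four opcode bytes (see BINARY_OPCODE): the eax/imm32
	   short form, the reg, reg/mem form, the reg/mem, reg form and the
	   ModR/M /digit used by BINARY_IMM for the immediate forms. */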
1837 	if (dst == src1 && dstw == src1w) {
1838 		if (src2 == SLJIT_IMM) {
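			/* The 'op eax, imm32' short form saves the ModR/M byte, but the
			   sign-extended imm8 form used by BINARY_IMM is shorter still,
			   so eax is special-cased only for immediates outside the
			   signed 8 bit range. */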
1839 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1840 			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
1841 #else
1842 			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128)) {
1843 #endif
1844 				BINARY_EAX_IMM(op_eax_imm, src2w);
1845 			}
1846 			else {
1847 				BINARY_IMM(op_imm, op_mr, src2w, dst, dstw);
1848 			}
1849 		}
1850 		else if (FAST_IS_REG(dst)) {
1851 			inst = emit_x86_instruction(compiler, 1, dst, dstw, src2, src2w);
1852 			FAIL_IF(!inst);
1853 			*inst = op_rm;
1854 		}
1855 		else if (FAST_IS_REG(src2)) {
1856 			/* Special exception for sljit_emit_op_flags. */
1857 			inst = emit_x86_instruction(compiler, 1, src2, src2w, dst, dstw);
1858 			FAIL_IF(!inst);
1859 			*inst = op_mr;
1860 		}
1861 		else {
1862 			EMIT_MOV(compiler, TMP_REG1, 0, src2, src2w);
1863 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
1864 			FAIL_IF(!inst);
1865 			*inst = op_mr;
1866 		}
1867 		return SLJIT_SUCCESS;
1868 	}
1869 
1870 	/* Only for cumulative operations: dst may match src2 because the operands can be swapped. */
1871 	if (dst == src2 && dstw == src2w) {
1872 		if (src1 == SLJIT_IMM) {
1873 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1874 			if ((dst == SLJIT_R0) && (src1w > 127 || src1w < -128) && (compiler->mode32 || IS_HALFWORD(src1w))) {
1875 #else
1876 			if ((dst == SLJIT_R0) && (src1w > 127 || src1w < -128)) {
1877 #endif
1878 				BINARY_EAX_IMM(op_eax_imm, src1w);
1879 			}
1880 			else {
1881 				BINARY_IMM(op_imm, op_mr, src1w, dst, dstw);
1882 			}
1883 		}
1884 		else if (FAST_IS_REG(dst)) {
1885 			inst = emit_x86_instruction(compiler, 1, dst, dstw, src1, src1w);
1886 			FAIL_IF(!inst);
1887 			*inst = op_rm;
1888 		}
1889 		else if (FAST_IS_REG(src1)) {
1890 			inst = emit_x86_instruction(compiler, 1, src1, src1w, dst, dstw);
1891 			FAIL_IF(!inst);
1892 			*inst = op_mr;
1893 		}
1894 		else {
1895 			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1896 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
1897 			FAIL_IF(!inst);
1898 			*inst = op_mr;
1899 		}
1900 		return SLJIT_SUCCESS;
1901 	}
1902 
1903 	/* General version. */
1904 	if (FAST_IS_REG(dst)) {
1905 		EMIT_MOV(compiler, dst, 0, src1, src1w);
1906 		if (src2 == SLJIT_IMM) {
1907 			BINARY_IMM(op_imm, op_mr, src2w, dst, 0);
1908 		}
1909 		else {
1910 			inst = emit_x86_instruction(compiler, 1, dst, 0, src2, src2w);
1911 			FAIL_IF(!inst);
1912 			*inst = op_rm;
1913 		}
1914 	}
1915 	else {
1916 		/* This version requires fewer memory writes. */
1917 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1918 		if (src2 == SLJIT_IMM) {
1919 			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
1920 		}
1921 		else {
1922 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
1923 			FAIL_IF(!inst);
1924 			*inst = op_rm;
1925 		}
1926 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
1927 	}
1928 
1929 	return SLJIT_SUCCESS;
1930 }
1931 
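/* Same as emit_cum_binary, except that the operands of SUB/SBB cannot be
   swapped, so there is no dst == src2 shortcut. */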
1932 static sljit_s32 emit_non_cum_binary(struct sljit_compiler *compiler,
1933 	sljit_u32 op_types,
1934 	sljit_s32 dst, sljit_sw dstw,
1935 	sljit_s32 src1, sljit_sw src1w,
1936 	sljit_s32 src2, sljit_sw src2w)
1937 {
1938 	sljit_u8* inst;
1939 	sljit_u8 op_eax_imm = U8(op_types >> 24);
1940 	sljit_u8 op_rm = U8((op_types >> 16) & 0xff);
1941 	sljit_u8 op_mr = U8((op_types >> 8) & 0xff);
1942 	sljit_u8 op_imm = U8(op_types & 0xff);
1943 
1944 	if (dst == src1 && dstw == src1w) {
1945 		if (src2 == SLJIT_IMM) {
1946 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1947 			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
1948 #else
1949 			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128)) {
1950 #endif
1951 				BINARY_EAX_IMM(op_eax_imm, src2w);
1952 			}
1953 			else {
1954 				BINARY_IMM(op_imm, op_mr, src2w, dst, dstw);
1955 			}
1956 		}
1957 		else if (FAST_IS_REG(dst)) {
1958 			inst = emit_x86_instruction(compiler, 1, dst, dstw, src2, src2w);
1959 			FAIL_IF(!inst);
1960 			*inst = op_rm;
1961 		}
1962 		else if (FAST_IS_REG(src2)) {
1963 			inst = emit_x86_instruction(compiler, 1, src2, src2w, dst, dstw);
1964 			FAIL_IF(!inst);
1965 			*inst = op_mr;
1966 		}
1967 		else {
1968 			EMIT_MOV(compiler, TMP_REG1, 0, src2, src2w);
1969 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
1970 			FAIL_IF(!inst);
1971 			*inst = op_mr;
1972 		}
1973 		return SLJIT_SUCCESS;
1974 	}
1975 
1976 	/* General version. */
1977 	if (FAST_IS_REG(dst) && dst != src2) {
1978 		EMIT_MOV(compiler, dst, 0, src1, src1w);
1979 		if (src2 == SLJIT_IMM) {
1980 			BINARY_IMM(op_imm, op_mr, src2w, dst, 0);
1981 		}
1982 		else {
1983 			inst = emit_x86_instruction(compiler, 1, dst, 0, src2, src2w);
1984 			FAIL_IF(!inst);
1985 			*inst = op_rm;
1986 		}
1987 	}
1988 	else {
1989 		/* This version requires fewer memory writes. */
1990 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1991 		if (src2 == SLJIT_IMM) {
1992 			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
1993 		}
1994 		else {
1995 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
1996 			FAIL_IF(!inst);
1997 			*inst = op_rm;
1998 		}
1999 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
2000 	}
2001 
2002 	return SLJIT_SUCCESS;
2003 }
2004 
2005 static sljit_s32 emit_mul(struct sljit_compiler *compiler,
2006 	sljit_s32 dst, sljit_sw dstw,
2007 	sljit_s32 src1, sljit_sw src1w,
2008 	sljit_s32 src2, sljit_sw src2w)
2009 {
2010 	sljit_u8* inst;
2011 	sljit_s32 dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
2012 
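	/* Three IMUL forms are used below: IMUL r, r/m (0F AF),
	   IMUL r, r/m, imm8 (6B) and IMUL r, r/m, imm32 (69). */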
2013 	/* Register destination. */
2014 	if (dst_r == src1 && src2 != SLJIT_IMM) {
2015 		FAIL_IF(emit_groupf(compiler, IMUL_r_rm, dst_r, src2, src2w));
2016 	} else if (dst_r == src2 && src1 != SLJIT_IMM) {
2017 		FAIL_IF(emit_groupf(compiler, IMUL_r_rm, dst_r, src1, src1w));
2018 	} else if (src1 == SLJIT_IMM) {
2019 		if (src2 == SLJIT_IMM) {
2020 			EMIT_MOV(compiler, dst_r, 0, SLJIT_IMM, src2w);
2021 			src2 = dst_r;
2022 			src2w = 0;
2023 		}
2024 
2025 		if (src1w <= 127 && src1w >= -128) {
2026 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
2027 			FAIL_IF(!inst);
2028 			*inst = IMUL_r_rm_i8;
2029 
2030 			FAIL_IF(emit_byte(compiler, U8(src1w)));
2031 		}
2032 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2033 		else {
2034 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
2035 			FAIL_IF(!inst);
2036 			*inst = IMUL_r_rm_i32;
2037 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
2038 			FAIL_IF(!inst);
2039 			INC_SIZE(4);
2040 			sljit_unaligned_store_sw(inst, src1w);
2041 		}
2042 #else
2043 		else if (IS_HALFWORD(src1w)) {
2044 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
2045 			FAIL_IF(!inst);
2046 			*inst = IMUL_r_rm_i32;
2047 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
2048 			FAIL_IF(!inst);
2049 			INC_SIZE(4);
2050 			sljit_unaligned_store_s32(inst, (sljit_s32)src1w);
2051 		}
2052 		else {
2053 			if (dst_r != src2)
2054 				EMIT_MOV(compiler, dst_r, 0, src2, src2w);
2055 			FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src1w));
2056 			FAIL_IF(emit_groupf(compiler, IMUL_r_rm, dst_r, TMP_REG2, 0));
2057 		}
2058 #endif
2059 	}
2060 	else if (src2 == SLJIT_IMM) {
2061 		/* Note: src1 is NOT immediate. */
2062 
2063 		if (src2w <= 127 && src2w >= -128) {
2064 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
2065 			FAIL_IF(!inst);
2066 			*inst = IMUL_r_rm_i8;
2067 
2068 			FAIL_IF(emit_byte(compiler, U8(src2w)));
2069 		}
2070 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2071 		else {
2072 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
2073 			FAIL_IF(!inst);
2074 			*inst = IMUL_r_rm_i32;
2075 
2076 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
2077 			FAIL_IF(!inst);
2078 			INC_SIZE(4);
2079 			sljit_unaligned_store_sw(inst, src2w);
2080 		}
2081 #else
2082 		else if (IS_HALFWORD(src2w)) {
2083 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
2084 			FAIL_IF(!inst);
2085 			*inst = IMUL_r_rm_i32;
2086 
2087 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
2088 			FAIL_IF(!inst);
2089 			INC_SIZE(4);
2090 			sljit_unaligned_store_s32(inst, (sljit_s32)src2w);
2091 		} else {
2092 			if (dst_r != src1)
2093 				EMIT_MOV(compiler, dst_r, 0, src1, src1w);
2094 			FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src2w));
2095 			FAIL_IF(emit_groupf(compiler, IMUL_r_rm, dst_r, TMP_REG2, 0));
2096 		}
2097 #endif
2098 	} else {
2099 		/* Neither argument is immediate. */
2100 		if (ADDRESSING_DEPENDS_ON(src2, dst_r))
2101 			dst_r = TMP_REG1;
2102 		EMIT_MOV(compiler, dst_r, 0, src1, src1w);
2103 		FAIL_IF(emit_groupf(compiler, IMUL_r_rm, dst_r, src2, src2w));
2104 	}
2105 
2106 	if (dst & SLJIT_MEM)
2107 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
2108 
2109 	return SLJIT_SUCCESS;
2110 }
2111 
2112 static sljit_s32 emit_lea_binary(struct sljit_compiler *compiler,
2113 	sljit_s32 dst, sljit_sw dstw,
2114 	sljit_s32 src1, sljit_sw src1w,
2115 	sljit_s32 src2, sljit_sw src2w)
2116 {
2117 	sljit_u8* inst;
2118 	sljit_s32 dst_r, done = 0;
2119 
2120 	/* These cases are better handled by the normal code path. */
2121 	if (dst == src1 && dstw == src1w)
2122 		return SLJIT_ERR_UNSUPPORTED;
2123 	if (dst == src2 && dstw == src2w)
2124 		return SLJIT_ERR_UNSUPPORTED;
2125 
2126 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
2127 
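	/* LEA computes reg+reg or reg+imm in a single instruction without
	   affecting the flags, and its destination may differ from both
	   sources. */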
2128 	if (FAST_IS_REG(src1)) {
2129 		if (FAST_IS_REG(src2)) {
2130 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM2(src1, src2), 0);
2131 			FAIL_IF(!inst);
2132 			*inst = LEA_r_m;
2133 			done = 1;
2134 		}
2135 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2136 		if (src2 == SLJIT_IMM && (compiler->mode32 || IS_HALFWORD(src2w))) {
2137 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src1), (sljit_s32)src2w);
2138 #else
2139 		if (src2 == SLJIT_IMM) {
2140 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src1), src2w);
2141 #endif
2142 			FAIL_IF(!inst);
2143 			*inst = LEA_r_m;
2144 			done = 1;
2145 		}
2146 	}
2147 	else if (FAST_IS_REG(src2)) {
2148 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2149 		if (src1 == SLJIT_IMM && (compiler->mode32 || IS_HALFWORD(src1w))) {
2150 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src2), (sljit_s32)src1w);
2151 #else
2152 		if (src1 == SLJIT_IMM) {
2153 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src2), src1w);
2154 #endif
2155 			FAIL_IF(!inst);
2156 			*inst = LEA_r_m;
2157 			done = 1;
2158 		}
2159 	}
2160 
2161 	if (done) {
2162 		if (dst_r == TMP_REG1)
2163 			return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
2164 		return SLJIT_SUCCESS;
2165 	}
2166 	return SLJIT_ERR_UNSUPPORTED;
2167 }
2168 
2169 static sljit_s32 emit_cmp_binary(struct sljit_compiler *compiler,
2170 	sljit_s32 src1, sljit_sw src1w,
2171 	sljit_s32 src2, sljit_sw src2w)
2172 {
2173 	sljit_u8* inst;
2174 
2175 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2176 	if (src1 == SLJIT_R0 && src2 == SLJIT_IMM && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
2177 #else
2178 	if (src1 == SLJIT_R0 && src2 == SLJIT_IMM && (src2w > 127 || src2w < -128)) {
2179 #endif
2180 		BINARY_EAX_IMM(CMP_EAX_i32, src2w);
2181 		return SLJIT_SUCCESS;
2182 	}
2183 
2184 	if (FAST_IS_REG(src1)) {
2185 		if (src2 == SLJIT_IMM) {
2186 			BINARY_IMM(CMP, CMP_rm_r, src2w, src1, 0);
2187 		}
2188 		else {
2189 			inst = emit_x86_instruction(compiler, 1, src1, 0, src2, src2w);
2190 			FAIL_IF(!inst);
2191 			*inst = CMP_r_rm;
2192 		}
2193 		return SLJIT_SUCCESS;
2194 	}
2195 
2196 	if (FAST_IS_REG(src2) && src1 != SLJIT_IMM) {
2197 		inst = emit_x86_instruction(compiler, 1, src2, 0, src1, src1w);
2198 		FAIL_IF(!inst);
2199 		*inst = CMP_rm_r;
2200 		return SLJIT_SUCCESS;
2201 	}
2202 
2203 	if (src2 == SLJIT_IMM) {
2204 		if (src1 == SLJIT_IMM) {
2205 			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2206 			src1 = TMP_REG1;
2207 			src1w = 0;
2208 		}
2209 		BINARY_IMM(CMP, CMP_rm_r, src2w, src1, src1w);
2210 	}
2211 	else {
2212 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2213 		inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
2214 		FAIL_IF(!inst);
2215 		*inst = CMP_r_rm;
2216 	}
2217 	return SLJIT_SUCCESS;
2218 }
2219 
2220 static sljit_s32 emit_test_binary(struct sljit_compiler *compiler,
2221 	sljit_s32 src1, sljit_sw src1w,
2222 	sljit_s32 src2, sljit_sw src2w)
2223 {
2224 	sljit_u8* inst;
2225 
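	/* TEST performs src1 & src2 and only sets the flags; the operands are
	   interchangeable, hence the two symmetric eax/immediate fast paths
	   below. */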
2226 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2227 	if (src1 == SLJIT_R0 && src2 == SLJIT_IMM && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
2228 #else
2229 	if (src1 == SLJIT_R0 && src2 == SLJIT_IMM && (src2w > 127 || src2w < -128)) {
2230 #endif
2231 		BINARY_EAX_IMM(TEST_EAX_i32, src2w);
2232 		return SLJIT_SUCCESS;
2233 	}
2234 
2235 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2236 	if (src2 == SLJIT_R0 && src1 == SLJIT_IMM && (src1w > 127 || src1w < -128) && (compiler->mode32 || IS_HALFWORD(src1w))) {
2237 #else
2238 	if (src2 == SLJIT_R0 && src1 == SLJIT_IMM && (src1w > 127 || src1w < -128)) {
2239 #endif
2240 		BINARY_EAX_IMM(TEST_EAX_i32, src1w);
2241 		return SLJIT_SUCCESS;
2242 	}
2243 
2244 	if (src1 != SLJIT_IMM) {
2245 		if (src2 == SLJIT_IMM) {
2246 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2247 			if (IS_HALFWORD(src2w) || compiler->mode32) {
2248 				inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, src1w);
2249 				FAIL_IF(!inst);
2250 				*inst = GROUP_F7;
2251 			}
2252 			else {
2253 				FAIL_IF(emit_load_imm64(compiler, TMP_REG1, src2w));
2254 				inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src1, src1w);
2255 				FAIL_IF(!inst);
2256 				*inst = TEST_rm_r;
2257 			}
2258 #else
2259 			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, src1w);
2260 			FAIL_IF(!inst);
2261 			*inst = GROUP_F7;
2262 #endif
2263 			return SLJIT_SUCCESS;
2264 		}
2265 		else if (FAST_IS_REG(src1)) {
2266 			inst = emit_x86_instruction(compiler, 1, src1, 0, src2, src2w);
2267 			FAIL_IF(!inst);
2268 			*inst = TEST_rm_r;
2269 			return SLJIT_SUCCESS;
2270 		}
2271 	}
2272 
2273 	if (src2 != SLJIT_IMM) {
2274 		if (src1 == SLJIT_IMM) {
2275 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2276 			if (IS_HALFWORD(src1w) || compiler->mode32) {
2277 				inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src1w, src2, src2w);
2278 				FAIL_IF(!inst);
2279 				*inst = GROUP_F7;
2280 			}
2281 			else {
2282 				FAIL_IF(emit_load_imm64(compiler, TMP_REG1, src1w));
2283 				inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
2284 				FAIL_IF(!inst);
2285 				*inst = TEST_rm_r;
2286 			}
2287 #else
2288 			inst = emit_x86_instruction(compiler, 1, src1, src1w, src2, src2w);
2289 			FAIL_IF(!inst);
2290 			*inst = GROUP_F7;
2291 #endif
2292 			return SLJIT_SUCCESS;
2293 		}
2294 		else if (FAST_IS_REG(src2)) {
2295 			inst = emit_x86_instruction(compiler, 1, src2, 0, src1, src1w);
2296 			FAIL_IF(!inst);
2297 			*inst = TEST_rm_r;
2298 			return SLJIT_SUCCESS;
2299 		}
2300 	}
2301 
2302 	EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2303 	if (src2 == SLJIT_IMM) {
2304 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2305 		if (IS_HALFWORD(src2w) || compiler->mode32) {
2306 			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, TMP_REG1, 0);
2307 			FAIL_IF(!inst);
2308 			*inst = GROUP_F7;
2309 		}
2310 		else {
2311 			FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src2w));
2312 			inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, TMP_REG1, 0);
2313 			FAIL_IF(!inst);
2314 			*inst = TEST_rm_r;
2315 		}
2316 #else
2317 		inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, TMP_REG1, 0);
2318 		FAIL_IF(!inst);
2319 		*inst = GROUP_F7;
2320 #endif
2321 	}
2322 	else {
2323 		inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
2324 		FAIL_IF(!inst);
2325 		*inst = TEST_rm_r;
2326 	}
2327 	return SLJIT_SUCCESS;
2328 }
2329 
2330 static sljit_s32 emit_shift(struct sljit_compiler *compiler,
2331 	sljit_u8 mode,
2332 	sljit_s32 dst, sljit_sw dstw,
2333 	sljit_s32 src1, sljit_sw src1w,
2334 	sljit_s32 src2, sljit_sw src2w)
2335 {
2336 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2337 	sljit_s32 mode32;
2338 #endif
2339 	sljit_u8* inst;
2340 
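	/* Variable shift counts must be in cl (SLJIT_PREF_SHIFT_REG) on x86;
	   the cases below route the count into ecx while preserving its
	   previous contents. */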
2341 	if (src2 == SLJIT_IMM || src2 == SLJIT_PREF_SHIFT_REG) {
2342 		if (dst == src1 && dstw == src1w) {
2343 			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, dst, dstw);
2344 			FAIL_IF(!inst);
2345 			inst[1] |= mode;
2346 			return SLJIT_SUCCESS;
2347 		}
2348 		if (dst == SLJIT_PREF_SHIFT_REG && src2 == SLJIT_PREF_SHIFT_REG) {
2349 			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2350 			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2351 			FAIL_IF(!inst);
2352 			inst[1] |= mode;
2353 			EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2354 			return SLJIT_SUCCESS;
2355 		}
2356 		if (FAST_IS_REG(dst)) {
2357 			EMIT_MOV(compiler, dst, 0, src1, src1w);
2358 			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, dst, 0);
2359 			FAIL_IF(!inst);
2360 			inst[1] |= mode;
2361 			return SLJIT_SUCCESS;
2362 		}
2363 
2364 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2365 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, TMP_REG1, 0);
2366 		FAIL_IF(!inst);
2367 		inst[1] |= mode;
2368 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
2369 		return SLJIT_SUCCESS;
2370 	}
2371 
2372 	if (dst == SLJIT_PREF_SHIFT_REG) {
2373 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2374 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
2375 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2376 		FAIL_IF(!inst);
2377 		inst[1] |= mode;
2378 		return emit_mov(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2379 	}
2380 
2381 	if (FAST_IS_REG(dst) && dst != src2 && dst != TMP_REG1 && !ADDRESSING_DEPENDS_ON(src2, dst)) {
2382 		if (src1 != dst)
2383 			EMIT_MOV(compiler, dst, 0, src1, src1w);
2384 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2385 		mode32 = compiler->mode32;
2386 		compiler->mode32 = 0;
2387 #endif
2388 		EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_PREF_SHIFT_REG, 0);
2389 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2390 		compiler->mode32 = mode32;
2391 #endif
2392 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
2393 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, dst, 0);
2394 		FAIL_IF(!inst);
2395 		inst[1] |= mode;
2396 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2397 		compiler->mode32 = 0;
2398 #endif
2399 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2400 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2401 		compiler->mode32 = mode32;
2402 #endif
2403 		return SLJIT_SUCCESS;
2404 	}
2405 
2406 	/* The complex case: ecx itself may be used for addressing, and
2407 	   that use must keep working while ecx holds the shift count. */
2408 	EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2409 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2410 	EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), 0, SLJIT_PREF_SHIFT_REG, 0);
2411 #else /* !SLJIT_CONFIG_X86_32 */
2412 	mode32 = compiler->mode32;
2413 	compiler->mode32 = 0;
2414 	EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_PREF_SHIFT_REG, 0);
2415 	compiler->mode32 = mode32;
2416 #endif /* SLJIT_CONFIG_X86_32 */
2417 
2418 	EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
2419 	inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2420 	FAIL_IF(!inst);
2421 	inst[1] |= mode;
2422 
2423 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2424 	EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, SLJIT_MEM1(SLJIT_SP), 0);
2425 #else
2426 	compiler->mode32 = 0;
2427 	EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG2, 0);
2428 	compiler->mode32 = mode32;
2429 #endif /* SLJIT_CONFIG_X86_32 */
2430 
2431 	if (dst != TMP_REG1)
2432 		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
2433 
2434 	return SLJIT_SUCCESS;
2435 }
2436 
2437 static sljit_s32 emit_shift_with_flags(struct sljit_compiler *compiler,
2438 	sljit_u8 mode, sljit_s32 set_flags,
2439 	sljit_s32 dst, sljit_sw dstw,
2440 	sljit_s32 src1, sljit_sw src1w,
2441 	sljit_s32 src2, sljit_sw src2w)
2442 {
2443 	/* The CPU does not set flags if the shift count is 0. */
2444 	if (src2 == SLJIT_IMM) {
2445 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2446 		src2w &= compiler->mode32 ? 0x1f : 0x3f;
2447 #else /* !SLJIT_CONFIG_X86_64 */
2448 		src2w &= 0x1f;
2449 #endif /* SLJIT_CONFIG_X86_64 */
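		/* The immediate is masked exactly as the hardware masks the
		   count: mod 32, or mod 64 for 64-bit shifts. */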
2450 		if (src2w != 0)
2451 			return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);
2452 
2453 		if (!set_flags)
2454 			return emit_mov(compiler, dst, dstw, src1, src1w);
2455 		/* OR dst, src, 0 - keeps the value, but sets the flags. */
2456 		return emit_cum_binary(compiler, BINARY_OPCODE(OR),
2457 			dst, dstw, src1, src1w, SLJIT_IMM, 0);
2458 	}
2459 
2460 	if (!set_flags)
2461 		return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);
2462 
2463 	if (!FAST_IS_REG(dst))
2464 		FAIL_IF(emit_cmp_binary(compiler, src1, src1w, SLJIT_IMM, 0));
2465 
2466 	FAIL_IF(emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w));
2467 
2468 	if (FAST_IS_REG(dst))
2469 		return emit_cmp_binary(compiler, dst, dstw, SLJIT_IMM, 0);
2470 	return SLJIT_SUCCESS;
2471 }
2472 
2473 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2(struct sljit_compiler *compiler, sljit_s32 op,
2474 	sljit_s32 dst, sljit_sw dstw,
2475 	sljit_s32 src1, sljit_sw src1w,
2476 	sljit_s32 src2, sljit_sw src2w)
2477 {
2478 	CHECK_ERROR();
2479 	CHECK(check_sljit_emit_op2(compiler, op, 0, dst, dstw, src1, src1w, src2, src2w));
2480 	ADJUST_LOCAL_OFFSET(dst, dstw);
2481 	ADJUST_LOCAL_OFFSET(src1, src1w);
2482 	ADJUST_LOCAL_OFFSET(src2, src2w);
2483 
2484 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
2485 	CHECK_EXTRA_REGS(src1, src1w, (void)0);
2486 	CHECK_EXTRA_REGS(src2, src2w, (void)0);
2487 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2488 	compiler->mode32 = op & SLJIT_32;
2489 #endif
2490 
2491 	SLJIT_ASSERT(dst != TMP_REG1 || HAS_FLAGS(op));
2492 
2493 	switch (GET_OPCODE(op)) {
2494 	case SLJIT_ADD:
2495 		if (!HAS_FLAGS(op)) {
2496 			if (emit_lea_binary(compiler, dst, dstw, src1, src1w, src2, src2w) != SLJIT_ERR_UNSUPPORTED)
2497 				return compiler->error;
2498 		}
2499 		return emit_cum_binary(compiler, BINARY_OPCODE(ADD),
2500 			dst, dstw, src1, src1w, src2, src2w);
2501 	case SLJIT_ADDC:
2502 		return emit_cum_binary(compiler, BINARY_OPCODE(ADC),
2503 			dst, dstw, src1, src1w, src2, src2w);
2504 	case SLJIT_SUB:
2505 		if (src1 == SLJIT_IMM && src1w == 0)
2506 			return emit_unary(compiler, NEG_rm, dst, dstw, src2, src2w);
2507 
2508 		if (!HAS_FLAGS(op)) {
2509 			if (src2 == SLJIT_IMM && emit_lea_binary(compiler, dst, dstw, src1, src1w, SLJIT_IMM, -src2w) != SLJIT_ERR_UNSUPPORTED)
2510 				return compiler->error;
2511 			if (FAST_IS_REG(dst) && src2 == dst) {
2512 				FAIL_IF(emit_non_cum_binary(compiler, BINARY_OPCODE(SUB), dst, 0, dst, 0, src1, src1w));
2513 				return emit_unary(compiler, NEG_rm, dst, 0, dst, 0);
2514 			}
2515 		}
2516 
2517 		return emit_non_cum_binary(compiler, BINARY_OPCODE(SUB),
2518 			dst, dstw, src1, src1w, src2, src2w);
2519 	case SLJIT_SUBC:
2520 		return emit_non_cum_binary(compiler, BINARY_OPCODE(SBB),
2521 			dst, dstw, src1, src1w, src2, src2w);
2522 	case SLJIT_MUL:
2523 		return emit_mul(compiler, dst, dstw, src1, src1w, src2, src2w);
2524 	case SLJIT_AND:
2525 		return emit_cum_binary(compiler, BINARY_OPCODE(AND),
2526 			dst, dstw, src1, src1w, src2, src2w);
2527 	case SLJIT_OR:
2528 		return emit_cum_binary(compiler, BINARY_OPCODE(OR),
2529 			dst, dstw, src1, src1w, src2, src2w);
2530 	case SLJIT_XOR:
2531 		if (!HAS_FLAGS(op)) {
2532 			if (src2 == SLJIT_IMM && src2w == -1)
2533 				return emit_unary(compiler, NOT_rm, dst, dstw, src1, src1w);
2534 			if (src1 == SLJIT_IMM && src1w == -1)
2535 				return emit_unary(compiler, NOT_rm, dst, dstw, src2, src2w);
2536 		}
2537 
2538 		return emit_cum_binary(compiler, BINARY_OPCODE(XOR),
2539 			dst, dstw, src1, src1w, src2, src2w);
2540 	case SLJIT_SHL:
2541 	case SLJIT_MSHL:
2542 		return emit_shift_with_flags(compiler, SHL, HAS_FLAGS(op),
2543 			dst, dstw, src1, src1w, src2, src2w);
2544 	case SLJIT_LSHR:
2545 	case SLJIT_MLSHR:
2546 		return emit_shift_with_flags(compiler, SHR, HAS_FLAGS(op),
2547 			dst, dstw, src1, src1w, src2, src2w);
2548 	case SLJIT_ASHR:
2549 	case SLJIT_MASHR:
2550 		return emit_shift_with_flags(compiler, SAR, HAS_FLAGS(op),
2551 			dst, dstw, src1, src1w, src2, src2w);
2552 	case SLJIT_ROTL:
2553 		return emit_shift_with_flags(compiler, ROL, 0,
2554 			dst, dstw, src1, src1w, src2, src2w);
2555 	case SLJIT_ROTR:
2556 		return emit_shift_with_flags(compiler, ROR, 0,
2557 			dst, dstw, src1, src1w, src2, src2w);
2558 	}
2559 
2560 	return SLJIT_SUCCESS;
2561 }
2562 
2563 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2u(struct sljit_compiler *compiler, sljit_s32 op,
2564 	sljit_s32 src1, sljit_sw src1w,
2565 	sljit_s32 src2, sljit_sw src2w)
2566 {
2567 	sljit_s32 opcode = GET_OPCODE(op);
2568 
2569 	CHECK_ERROR();
2570 	CHECK(check_sljit_emit_op2(compiler, op, 1, 0, 0, src1, src1w, src2, src2w));
2571 
2572 	if (opcode != SLJIT_SUB && opcode != SLJIT_AND) {
2573 		SLJIT_SKIP_CHECKS(compiler);
2574 		return sljit_emit_op2(compiler, op, TMP_REG1, 0, src1, src1w, src2, src2w);
2575 	}
2576 
2577 	ADJUST_LOCAL_OFFSET(src1, src1w);
2578 	ADJUST_LOCAL_OFFSET(src2, src2w);
2579 
2580 	CHECK_EXTRA_REGS(src1, src1w, (void)0);
2581 	CHECK_EXTRA_REGS(src2, src2w, (void)0);
2582 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2583 	compiler->mode32 = op & SLJIT_32;
2584 #endif
2585 
2586 	if (opcode == SLJIT_SUB) {
2587 		return emit_cmp_binary(compiler, src1, src1w, src2, src2w);
2588 	}
2589 	return emit_test_binary(compiler, src1, src1w, src2, src2w);
2590 }
2591 
2592 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_shift_into(struct sljit_compiler *compiler, sljit_s32 op,
2593 	sljit_s32 dst_reg,
2594 	sljit_s32 src1_reg,
2595 	sljit_s32 src2_reg,
2596 	sljit_s32 src3, sljit_sw src3w)
2597 {
2598 	sljit_s32 is_rotate, is_left, move_src1;
2599 	sljit_u8* inst;
2600 	sljit_sw src1w = 0;
2601 	sljit_sw dstw = 0;
2602 	/* The whole register must be saved even for 32-bit operations. */
2603 	sljit_u8 restore_ecx = 0;
2604 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2605 	sljit_sw src2w = 0;
2606 	sljit_s32 restore_sp4 = 0;
2607 #endif /* SLJIT_CONFIG_X86_32 */
2608 
2609 	CHECK_ERROR();
2610 	CHECK(check_sljit_emit_shift_into(compiler, op, dst_reg, src1_reg, src2_reg, src3, src3w));
2611 	ADJUST_LOCAL_OFFSET(src3, src3w);
2612 
2613 	CHECK_EXTRA_REGS(dst_reg, dstw, (void)0);
2614 	CHECK_EXTRA_REGS(src3, src3w, (void)0);
2615 
2616 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2617 	compiler->mode32 = op & SLJIT_32;
2618 #endif /* SLJIT_CONFIG_X86_64 */
2619 
2620 	if (src3 == SLJIT_IMM) {
2621 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2622 		src3w &= 0x1f;
2623 #else /* !SLJIT_CONFIG_X86_32 */
2624 		src3w &= (op & SLJIT_32) ? 0x1f : 0x3f;
2625 #endif /* SLJIT_CONFIG_X86_32 */
2626 
2627 		if (src3w == 0)
2628 			return SLJIT_SUCCESS;
2629 	}
2630 
2631 	is_left = (GET_OPCODE(op) == SLJIT_SHL || GET_OPCODE(op) == SLJIT_MSHL);
2632 
2633 	is_rotate = (src1_reg == src2_reg);
2634 	CHECK_EXTRA_REGS(src1_reg, src1w, (void)0);
2635 	CHECK_EXTRA_REGS(src2_reg, src2w, (void)0);
2636 
2637 	if (is_rotate)
2638 		return emit_shift(compiler, is_left ? ROL : ROR, dst_reg, dstw, src1_reg, src1w, src3, src3w);
2639 
2640 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2641 	if (src2_reg & SLJIT_MEM) {
2642 		EMIT_MOV(compiler, TMP_REG1, 0, src2_reg, src2w);
2643 		src2_reg = TMP_REG1;
2644 	}
2645 #endif /* SLJIT_CONFIG_X86_32 */
2646 
2647 	if (dst_reg == SLJIT_PREF_SHIFT_REG && src3 != SLJIT_IMM && (src3 != SLJIT_PREF_SHIFT_REG || src1_reg != SLJIT_PREF_SHIFT_REG)) {
2648 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2649 		EMIT_MOV(compiler, TMP_REG1, 0, src1_reg, src1w);
2650 		src1_reg = TMP_REG1;
2651 		src1w = 0;
2652 #else /* !SLJIT_CONFIG_X86_64 */
2653 		if (src2_reg != TMP_REG1) {
2654 			EMIT_MOV(compiler, TMP_REG1, 0, src1_reg, src1w);
2655 			src1_reg = TMP_REG1;
2656 			src1w = 0;
2657 		} else if ((src1_reg & SLJIT_MEM) || src1_reg == SLJIT_PREF_SHIFT_REG) {
2658 			restore_sp4 = (src3 == SLJIT_R0) ? SLJIT_R1 : SLJIT_R0;
2659 			EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_s32), restore_sp4, 0);
2660 			EMIT_MOV(compiler, restore_sp4, 0, src1_reg, src1w);
2661 			src1_reg = restore_sp4;
2662 			src1w = 0;
2663 		} else {
2664 			EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_s32), src1_reg, 0);
2665 			restore_sp4 = src1_reg;
2666 		}
2667 #endif /* SLJIT_CONFIG_X86_64 */
2668 
2669 		if (src3 != SLJIT_PREF_SHIFT_REG)
2670 			EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src3, src3w);
2671 	} else {
2672 		if (src2_reg == SLJIT_PREF_SHIFT_REG && src3 != SLJIT_IMM && src3 != SLJIT_PREF_SHIFT_REG) {
2673 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2674 			compiler->mode32 = 0;
2675 #endif /* SLJIT_CONFIG_X86_64 */
2676 			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_PREF_SHIFT_REG, 0);
2677 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2678 			compiler->mode32 = op & SLJIT_32;
2679 #endif /* SLJIT_CONFIG_X86_64 */
2680 			src2_reg = TMP_REG1;
2681 			restore_ecx = 1;
2682 		}
2683 
2684 		move_src1 = 0;
2685 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2686 		if (dst_reg != src1_reg) {
2687 			if (dst_reg != src3) {
2688 				EMIT_MOV(compiler, dst_reg, 0, src1_reg, src1w);
2689 				src1_reg = dst_reg;
2690 				src1w = 0;
2691 			} else
2692 				move_src1 = 1;
2693 		}
2694 #else /* !SLJIT_CONFIG_X86_64 */
2695 		if (dst_reg & SLJIT_MEM) {
2696 			if (src2_reg != TMP_REG1) {
2697 				EMIT_MOV(compiler, TMP_REG1, 0, src1_reg, src1w);
2698 				src1_reg = TMP_REG1;
2699 				src1w = 0;
2700 			} else if ((src1_reg & SLJIT_MEM) || src1_reg == SLJIT_PREF_SHIFT_REG) {
2701 				restore_sp4 = (src3 == SLJIT_R0) ? SLJIT_R1 : SLJIT_R0;
2702 				EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_s32), restore_sp4, 0);
2703 				EMIT_MOV(compiler, restore_sp4, 0, src1_reg, src1w);
2704 				src1_reg = restore_sp4;
2705 				src1w = 0;
2706 			} else {
2707 				EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_s32), src1_reg, 0);
2708 				restore_sp4 = src1_reg;
2709 			}
2710 		} else if (dst_reg != src1_reg) {
2711 			if (dst_reg != src3) {
2712 				EMIT_MOV(compiler, dst_reg, 0, src1_reg, src1w);
2713 				src1_reg = dst_reg;
2714 				src1w = 0;
2715 			} else
2716 				move_src1 = 1;
2717 		}
2718 #endif /* SLJIT_CONFIG_X86_64 */
2719 
2720 		if (src3 != SLJIT_IMM && src3 != SLJIT_PREF_SHIFT_REG) {
2721 			if (!restore_ecx) {
2722 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2723 				compiler->mode32 = 0;
2724 				EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_PREF_SHIFT_REG, 0);
2725 				compiler->mode32 = op & SLJIT_32;
2726 				restore_ecx = 1;
2727 #else /* !SLJIT_CONFIG_X86_64 */
2728 				if (src1_reg != TMP_REG1 && src2_reg != TMP_REG1) {
2729 					EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_PREF_SHIFT_REG, 0);
2730 					restore_ecx = 1;
2731 				} else {
2732 					EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), 0, SLJIT_PREF_SHIFT_REG, 0);
2733 					restore_ecx = 2;
2734 				}
2735 #endif /* SLJIT_CONFIG_X86_64 */
2736 			}
2737 			EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src3, src3w);
2738 		}
2739 
2740 		if (move_src1) {
2741 			EMIT_MOV(compiler, dst_reg, 0, src1_reg, src1w);
2742 			src1_reg = dst_reg;
2743 			src1w = 0;
2744 		}
2745 	}
2746 
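	/* SHLD/SHRD shift src1_reg while filling the vacated bits from
	   src2_reg. The CL-count forms are encoded 0F A5 / 0F AD; the
	   imm8-count forms are one less (0F A4 / 0F AC), hence the '- 1'
	   for immediates below. */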
2747 	inst = emit_x86_instruction(compiler, 2, src2_reg, 0, src1_reg, src1w);
2748 	FAIL_IF(!inst);
2749 	inst[0] = GROUP_0F;
2750 
2751 	if (src3 == SLJIT_IMM) {
2752 		inst[1] = U8((is_left ? SHLD : SHRD) - 1);
2753 
2754 		/* Immediate argument is added separately. */
2755 		FAIL_IF(emit_byte(compiler, U8(src3w)));
2756 	} else
2757 		inst[1] = U8(is_left ? SHLD : SHRD);
2758 
2759 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2760 	if (restore_ecx) {
2761 		compiler->mode32 = 0;
2762 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2763 	}
2764 
2765 	if (src1_reg != dst_reg) {
2766 		compiler->mode32 = op & SLJIT_32;
2767 		return emit_mov(compiler, dst_reg, dstw, src1_reg, 0);
2768 	}
2769 #else /* !SLJIT_CONFIG_X86_64 */
2770 	if (restore_ecx)
2771 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, restore_ecx == 1 ? TMP_REG1 : SLJIT_MEM1(SLJIT_SP), 0);
2772 
2773 	if (src1_reg != dst_reg)
2774 		EMIT_MOV(compiler, dst_reg, dstw, src1_reg, 0);
2775 
2776 	if (restore_sp4)
2777 		return emit_mov(compiler, restore_sp4, 0, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_s32));
2778 #endif /* SLJIT_CONFIG_X86_32 */
2779 
2780 	return SLJIT_SUCCESS;
2781 }
2782 
2783 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_src(struct sljit_compiler *compiler, sljit_s32 op,
2784 	sljit_s32 src, sljit_sw srcw)
2785 {
2786 	CHECK_ERROR();
2787 	CHECK(check_sljit_emit_op_src(compiler, op, src, srcw));
2788 	ADJUST_LOCAL_OFFSET(src, srcw);
2789 
2790 	CHECK_EXTRA_REGS(src, srcw, (void)0);
2791 
2792 	switch (op) {
2793 	case SLJIT_FAST_RETURN:
2794 		return emit_fast_return(compiler, src, srcw);
2795 	case SLJIT_SKIP_FRAMES_BEFORE_FAST_RETURN:
2796 		/* Don't adjust the shadow stack if it isn't enabled. */
2797 		if (!cpu_has_shadow_stack ())
2798 			return SLJIT_SUCCESS;
2799 		return adjust_shadow_stack(compiler, src, srcw);
2800 	case SLJIT_PREFETCH_L1:
2801 	case SLJIT_PREFETCH_L2:
2802 	case SLJIT_PREFETCH_L3:
2803 	case SLJIT_PREFETCH_ONCE:
2804 		return emit_prefetch(compiler, op, src, srcw);
2805 	}
2806 
2807 	return SLJIT_SUCCESS;
2808 }
2809 
2810 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_dst(struct sljit_compiler *compiler, sljit_s32 op,
2811 	sljit_s32 dst, sljit_sw dstw)
2812 {
2813 	CHECK_ERROR();
2814 	CHECK(check_sljit_emit_op_dst(compiler, op, dst, dstw));
2815 	ADJUST_LOCAL_OFFSET(dst, dstw);
2816 
2817 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
2818 
2819 	switch (op) {
2820 	case SLJIT_FAST_ENTER:
2821 		return emit_fast_enter(compiler, dst, dstw);
2822 	case SLJIT_GET_RETURN_ADDRESS:
2823 		return sljit_emit_get_return_address(compiler, dst, dstw);
2824 	}
2825 
2826 	return SLJIT_SUCCESS;
2827 }
2828 
2829 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_register_index(sljit_s32 type, sljit_s32 reg)
2830 {
2831 	CHECK_REG_INDEX(check_sljit_get_register_index(type, reg));
2832 
2833 	if (type == SLJIT_GP_REGISTER) {
2834 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2835 		if (reg >= SLJIT_R3 && reg <= SLJIT_R8)
2836 			return -1;
2837 #endif /* SLJIT_CONFIG_X86_32 */
2838 		return reg_map[reg];
2839 	}
2840 
2841 	if (type != SLJIT_FLOAT_REGISTER && type != SLJIT_SIMD_REG_128 && type != SLJIT_SIMD_REG_256 && type != SLJIT_SIMD_REG_512)
2842 		return -1;
2843 
2844 	return freg_map[reg];
2845 }
2846 
2847 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_custom(struct sljit_compiler *compiler,
2848 	void *instruction, sljit_u32 size)
2849 {
2850 	sljit_u8 *inst;
2851 
2852 	CHECK_ERROR();
2853 	CHECK(check_sljit_emit_op_custom(compiler, instruction, size));
2854 
2855 	inst = (sljit_u8*)ensure_buf(compiler, 1 + size);
2856 	FAIL_IF(!inst);
2857 	INC_SIZE(size);
2858 	SLJIT_MEMCPY(inst, instruction, size);
2859 	return SLJIT_SUCCESS;
2860 }
2861 
2862 /* --------------------------------------------------------------------- */
2863 /*  Floating point operators                                             */
2864 /* --------------------------------------------------------------------- */
2865 
2866 /* Alignment padding (3 words) + 4 constants, 16 bytes each. */
2867 static sljit_u32 sse2_data[3 + (4 * 4)];
2868 static sljit_u32 *sse2_buffer;
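/* The constants are the sign-bit masks used by the floating point NEG (XOR)
   and ABS (AND) operations below, in single and double precision variants. */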
2869 
2870 static void init_compiler(void)
2871 {
2872 	get_cpu_features();
2873 
2874 	/* Align to 16 bytes. */
2875 	sse2_buffer = (sljit_u32*)(((sljit_uw)sse2_data + 15) & ~(sljit_uw)0xf);
2876 
2877 	/* Single precision constants (each constant is 16 bytes long). */
2878 	sse2_buffer[0] = 0x80000000;
2879 	sse2_buffer[4] = 0x7fffffff;
2880 	/* Double precision constants (each constant is 16 bytes long). */
2881 	sse2_buffer[8] = 0;
2882 	sse2_buffer[9] = 0x80000000;
2883 	sse2_buffer[12] = 0xffffffff;
2884 	sse2_buffer[13] = 0x7fffffff;
2885 }
2886 
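/* Emits a two byte (0F prefixed) instruction: the low byte of 'op' is the
   second opcode byte, the remaining bits are EX86_* prefix flags. */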
2887 static sljit_s32 emit_groupf(struct sljit_compiler *compiler,
2888 	sljit_uw op,
2889 	sljit_s32 dst, sljit_s32 src, sljit_sw srcw)
2890 {
2891 	sljit_u8 *inst = emit_x86_instruction(compiler, 2 | (op & ~(sljit_uw)0xff), dst, 0, src, srcw);
2892 	FAIL_IF(!inst);
2893 	inst[0] = GROUP_0F;
2894 	inst[1] = op & 0xff;
2895 	return SLJIT_SUCCESS;
2896 }
2897 
2898 static sljit_s32 emit_groupf_ext(struct sljit_compiler *compiler,
2899 	sljit_uw op,
2900 	sljit_s32 dst, sljit_s32 src, sljit_sw srcw)
2901 {
2902 	sljit_u8 *inst;
2903 
2904 	SLJIT_ASSERT((op & EX86_SSE2) && ((op & VEX_OP_0F38) || (op & VEX_OP_0F3A)));
2905 
2906 	inst = emit_x86_instruction(compiler, 3 | (op & ~((sljit_uw)0xff | VEX_OP_0F38 | VEX_OP_0F3A)), dst, 0, src, srcw);
2907 	FAIL_IF(!inst);
2908 	inst[0] = GROUP_0F;
2909 	inst[1] = U8((op & VEX_OP_0F38) ? 0x38 : 0x3A);
2910 	inst[2] = op & 0xff;
2911 	return SLJIT_SUCCESS;
2912 }
2913 
2914 static SLJIT_INLINE sljit_s32 emit_sse2_load(struct sljit_compiler *compiler,
2915 	sljit_s32 single, sljit_s32 dst, sljit_s32 src, sljit_sw srcw)
2916 {
2917 	return emit_groupf(compiler, MOVSD_x_xm | (single ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, dst, src, srcw);
2918 }
2919 
2920 static SLJIT_INLINE sljit_s32 emit_sse2_store(struct sljit_compiler *compiler,
2921 	sljit_s32 single, sljit_s32 dst, sljit_sw dstw, sljit_s32 src)
2922 {
2923 	return emit_groupf(compiler, MOVSD_xm_x | (single ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, src, dst, dstw);
2924 }
2925 
2926 static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_sw_from_f64(struct sljit_compiler *compiler, sljit_s32 op,
2927 	sljit_s32 dst, sljit_sw dstw,
2928 	sljit_s32 src, sljit_sw srcw)
2929 {
2930 	sljit_s32 dst_r;
2931 
2932 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
2933 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
2934 
2935 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2936 	if (GET_OPCODE(op) == SLJIT_CONV_SW_FROM_F64)
2937 		compiler->mode32 = 0;
2938 #endif
2939 
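	/* CVTTSD2SI / CVTTSS2SI convert with truncation (round toward zero). */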
2940 	FAIL_IF(emit_groupf(compiler, CVTTSD2SI_r_xm | EX86_SELECT_F2_F3(op) | EX86_SSE2_OP2, dst_r, src, srcw));
2941 
2942 	if (dst & SLJIT_MEM)
2943 		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
2944 	return SLJIT_SUCCESS;
2945 }
2946 
2947 static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_f64_from_sw(struct sljit_compiler *compiler, sljit_s32 op,
2948 	sljit_s32 dst, sljit_sw dstw,
2949 	sljit_s32 src, sljit_sw srcw)
2950 {
2951 	sljit_s32 dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG;
2952 
2953 	CHECK_EXTRA_REGS(src, srcw, (void)0);
2954 
2955 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2956 	if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_SW)
2957 		compiler->mode32 = 0;
2958 #endif
2959 
2960 	if (src == SLJIT_IMM) {
2961 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2962 		if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_S32)
2963 			srcw = (sljit_s32)srcw;
2964 #endif
2965 		EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
2966 		src = TMP_REG1;
2967 		srcw = 0;
2968 	}
2969 
2970 	FAIL_IF(emit_groupf(compiler, CVTSI2SD_x_rm | EX86_SELECT_F2_F3(op) | EX86_SSE2_OP1, dst_r, src, srcw));
2971 
2972 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2973 	compiler->mode32 = 1;
2974 #endif
2975 	if (dst_r == TMP_FREG)
2976 		return emit_sse2_store(compiler, op & SLJIT_32, dst, dstw, TMP_FREG);
2977 	return SLJIT_SUCCESS;
2978 }
2979 
2980 static SLJIT_INLINE sljit_s32 sljit_emit_fop1_cmp(struct sljit_compiler *compiler, sljit_s32 op,
2981 	sljit_s32 src1, sljit_sw src1w,
2982 	sljit_s32 src2, sljit_sw src2w)
2983 {
2984 	switch (GET_FLAG_TYPE(op)) {
2985 	case SLJIT_ORDERED_EQUAL:
2986 		/* Also: SLJIT_UNORDERED_OR_NOT_EQUAL */
2987 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src1, src1w));
2988 		FAIL_IF(emit_groupf(compiler, CMPS_x_xm | EX86_SELECT_F2_F3(op) | EX86_SSE2, TMP_FREG, src2, src2w));
2989 
2990 		/* EQ */
2991 		FAIL_IF(emit_byte(compiler, 0));
2992 
2993 		src1 = TMP_FREG;
2994 		src2 = TMP_FREG;
2995 		src2w = 0;
2996 		break;
2997 
2998 	case SLJIT_ORDERED_LESS:
2999 	case SLJIT_UNORDERED_OR_GREATER:
3000 		/* Also: SLJIT_UNORDERED_OR_GREATER_EQUAL, SLJIT_ORDERED_LESS_EQUAL  */
3001 		if (!FAST_IS_REG(src2)) {
3002 			FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src2, src2w));
3003 			src2 = TMP_FREG;
3004 		}
3005 
3006 		return emit_groupf(compiler, UCOMISD_x_xm | EX86_SELECT_66(op) | EX86_SSE2, src2, src1, src1w);
3007 	}
3008 
3009 	if (!FAST_IS_REG(src1)) {
3010 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src1, src1w));
3011 		src1 = TMP_FREG;
3012 	}
3013 
3014 	return emit_groupf(compiler, UCOMISD_x_xm | EX86_SELECT_66(op) | EX86_SSE2, src1, src2, src2w);
3015 }
3016 
3017 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop1(struct sljit_compiler *compiler, sljit_s32 op,
3018 	sljit_s32 dst, sljit_sw dstw,
3019 	sljit_s32 src, sljit_sw srcw)
3020 {
3021 	sljit_s32 dst_r;
3022 	sljit_u8 *inst;
3023 
3024 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3025 	compiler->mode32 = 1;
3026 #endif
3027 
3028 	CHECK_ERROR();
3029 	SELECT_FOP1_OPERATION_WITH_CHECKS(compiler, op, dst, dstw, src, srcw);
3030 
3031 	if (GET_OPCODE(op) == SLJIT_MOV_F64) {
3032 		if (FAST_IS_REG(dst))
3033 			return emit_sse2_load(compiler, op & SLJIT_32, dst, src, srcw);
3034 		if (FAST_IS_REG(src))
3035 			return emit_sse2_store(compiler, op & SLJIT_32, dst, dstw, src);
3036 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src, srcw));
3037 		return emit_sse2_store(compiler, op & SLJIT_32, dst, dstw, TMP_FREG);
3038 	}
3039 
3040 	if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_F32) {
3041 		dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG;
3042 		if (FAST_IS_REG(src)) {
3043 			/* We overwrite the high bits of the source. From SLJIT's point
3044 			   of view, this is not an issue.
3045 			   Note: in SSE3, MOVDDUP and MOVSLDUP could also be used. */
3046 			FAIL_IF(emit_groupf(compiler, UNPCKLPD_x_xm | ((op & SLJIT_32) ? EX86_PREF_66 : 0) | EX86_SSE2, src, src, 0));
3047 		} else {
3048 			FAIL_IF(emit_sse2_load(compiler, !(op & SLJIT_32), TMP_FREG, src, srcw));
3049 			src = TMP_FREG;
3050 		}
3051 
3052 		FAIL_IF(emit_groupf(compiler, CVTPD2PS_x_xm | ((op & SLJIT_32) ? EX86_PREF_66 : 0) | EX86_SSE2, dst_r, src, 0));
3053 		if (dst_r == TMP_FREG)
3054 			return emit_sse2_store(compiler, op & SLJIT_32, dst, dstw, TMP_FREG);
3055 		return SLJIT_SUCCESS;
3056 	}
3057 
3058 	if (FAST_IS_REG(dst)) {
3059 		dst_r = (dst == src) ? TMP_FREG : dst;
3060 
3061 		if (src & SLJIT_MEM)
3062 			FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src, srcw));
3063 
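		/* Build the mask in a register: PCMPEQD with itself sets all bits,
		   then a logical shift keeps either everything except the sign bit
		   (right by 1, for ABS) or only the sign bit (left by 31/63, for NEG). */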
3064 		FAIL_IF(emit_groupf(compiler, PCMPEQD_x_xm | EX86_PREF_66 | EX86_SSE2, dst_r, dst_r, 0));
3065 
3066 		inst = emit_x86_instruction(compiler, 2 | EX86_PREF_66 | EX86_SSE2_OP2, 0, 0, dst_r, 0);
		FAIL_IF(!inst);
3067 		inst[0] = GROUP_0F;
3068 		/* Same as PSRLD_x / PSRLQ_x */
3069 		inst[1] = (op & SLJIT_32) ? PSLLD_x_i8 : PSLLQ_x_i8;
3070 
3071 		if (GET_OPCODE(op) == SLJIT_ABS_F64) {
3072 			inst[2] |= 2 << 3;
3073 			FAIL_IF(emit_byte(compiler, 1));
3074 		} else {
3075 			inst[2] |= 6 << 3;
3076 			FAIL_IF(emit_byte(compiler, ((op & SLJIT_32) ? 31 : 63)));
3077 		}
3078 
3079 		if (dst_r != TMP_FREG)
3080 			dst_r = (src & SLJIT_MEM) ? TMP_FREG : src;
3081 		return emit_groupf(compiler, (GET_OPCODE(op) == SLJIT_NEG_F64 ? XORPD_x_xm : ANDPD_x_xm) | EX86_SELECT_66(op) | EX86_SSE2, dst, dst_r, 0);
3082 	}
3083 
3084 	FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src, srcw));
3085 
3086 	switch (GET_OPCODE(op)) {
3087 	case SLJIT_NEG_F64:
3088 		FAIL_IF(emit_groupf(compiler, XORPD_x_xm | EX86_SELECT_66(op) | EX86_SSE2, TMP_FREG, SLJIT_MEM0(), (sljit_sw)((op & SLJIT_32) ? sse2_buffer : sse2_buffer + 8)));
3089 		break;
3090 
3091 	case SLJIT_ABS_F64:
3092 		FAIL_IF(emit_groupf(compiler, ANDPD_x_xm | EX86_SELECT_66(op) | EX86_SSE2, TMP_FREG, SLJIT_MEM0(), (sljit_sw)((op & SLJIT_32) ? sse2_buffer + 4 : sse2_buffer + 12)));
3093 		break;
3094 	}
3095 
3096 	return emit_sse2_store(compiler, op & SLJIT_32, dst, dstw, TMP_FREG);
3097 }
3098 
3099 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop2(struct sljit_compiler *compiler, sljit_s32 op,
3100 	sljit_s32 dst, sljit_sw dstw,
3101 	sljit_s32 src1, sljit_sw src1w,
3102 	sljit_s32 src2, sljit_sw src2w)
3103 {
3104 	sljit_s32 dst_r;
3105 
3106 	CHECK_ERROR();
3107 	CHECK(check_sljit_emit_fop2(compiler, op, dst, dstw, src1, src1w, src2, src2w));
3108 	ADJUST_LOCAL_OFFSET(dst, dstw);
3109 	ADJUST_LOCAL_OFFSET(src1, src1w);
3110 	ADJUST_LOCAL_OFFSET(src2, src2w);
3111 
3112 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3113 	compiler->mode32 = 1;
3114 #endif
3115 
3116 	if (FAST_IS_REG(dst)) {
3117 		dst_r = dst;
3118 		if (dst == src1)
3119 			; /* Do nothing here. */
3120 		else if (dst == src2 && (op == SLJIT_ADD_F64 || op == SLJIT_MUL_F64)) {
3121 			/* Swap arguments. */
3122 			src2 = src1;
3123 			src2w = src1w;
3124 		}
3125 		else if (dst != src2)
3126 			FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, dst_r, src1, src1w));
3127 		else {
3128 			dst_r = TMP_FREG;
3129 			FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src1, src1w));
3130 		}
3131 	}
3132 	else {
3133 		dst_r = TMP_FREG;
3134 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src1, src1w));
3135 	}
3136 
3137 	switch (GET_OPCODE(op)) {
3138 	case SLJIT_ADD_F64:
3139 		FAIL_IF(emit_groupf(compiler, ADDSD_x_xm | EX86_SELECT_F2_F3(op) | EX86_SSE2, dst_r, src2, src2w));
3140 		break;
3141 
3142 	case SLJIT_SUB_F64:
3143 		FAIL_IF(emit_groupf(compiler, SUBSD_x_xm | EX86_SELECT_F2_F3(op) | EX86_SSE2, dst_r, src2, src2w));
3144 		break;
3145 
3146 	case SLJIT_MUL_F64:
3147 		FAIL_IF(emit_groupf(compiler, MULSD_x_xm | EX86_SELECT_F2_F3(op) | EX86_SSE2, dst_r, src2, src2w));
3148 		break;
3149 
3150 	case SLJIT_DIV_F64:
3151 		FAIL_IF(emit_groupf(compiler, DIVSD_x_xm | EX86_SELECT_F2_F3(op) | EX86_SSE2, dst_r, src2, src2w));
3152 		break;
3153 	}
3154 
3155 	if (dst_r == TMP_FREG)
3156 		return emit_sse2_store(compiler, op & SLJIT_32, dst, dstw, TMP_FREG);
3157 	return SLJIT_SUCCESS;
3158 }
3159 
3160 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop2r(struct sljit_compiler *compiler, sljit_s32 op,
3161 	sljit_s32 dst_freg,
3162 	sljit_s32 src1, sljit_sw src1w,
3163 	sljit_s32 src2, sljit_sw src2w)
3164 {
3165 	sljit_uw pref;
3166 
3167 	CHECK_ERROR();
3168 	CHECK(check_sljit_emit_fop2r(compiler, op, dst_freg, src1, src1w, src2, src2w));
3169 	ADJUST_LOCAL_OFFSET(src1, src1w);
3170 	ADJUST_LOCAL_OFFSET(src2, src2w);
3171 
3172 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3173 	compiler->mode32 = 1;
3174 #endif
3175 
3176 	if (dst_freg == src1) {
3177 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src2, src2w));
3178 		pref = EX86_SELECT_66(op) | EX86_SSE2;
3179 		FAIL_IF(emit_groupf(compiler, XORPD_x_xm | pref, TMP_FREG, src1, src1w));
3180 		FAIL_IF(emit_groupf(compiler, ANDPD_x_xm | pref, TMP_FREG, SLJIT_MEM0(), (sljit_sw)((op & SLJIT_32) ? sse2_buffer : sse2_buffer + 8)));
3181 		return emit_groupf(compiler, XORPD_x_xm | pref, dst_freg, TMP_FREG, 0);
3182 	}
3183 
3184 	if (src1 & SLJIT_MEM) {
3185 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src1, src1w));
3186 		src1 = TMP_FREG;
3187 		src1w = 0;
3188 	}
3189 
3190 	if (dst_freg != src2)
3191 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, dst_freg, src2, src2w));
3192 
3193 	pref = EX86_SELECT_66(op) | EX86_SSE2;
3194 	FAIL_IF(emit_groupf(compiler, XORPD_x_xm | pref, dst_freg, src1, src1w));
3195 	FAIL_IF(emit_groupf(compiler, ANDPD_x_xm | pref, dst_freg, SLJIT_MEM0(), (sljit_sw)((op & SLJIT_32) ? sse2_buffer : sse2_buffer + 8)));
3196 	return emit_groupf(compiler, XORPD_x_xm | pref, dst_freg, src1, src1w);
3197 }
3198 
3199 /* --------------------------------------------------------------------- */
3200 /*  Conditional instructions                                             */
3201 /* --------------------------------------------------------------------- */
3202 
3203 SLJIT_API_FUNC_ATTRIBUTE struct sljit_label* sljit_emit_label(struct sljit_compiler *compiler)
3204 {
3205 	sljit_u8 *inst;
3206 	struct sljit_label *label;
3207 
3208 	CHECK_ERROR_PTR();
3209 	CHECK_PTR(check_sljit_emit_label(compiler));
3210 
3211 	if (compiler->last_label && compiler->last_label->size == compiler->size)
3212 		return compiler->last_label;
3213 
3214 	label = (struct sljit_label*)ensure_abuf(compiler, sizeof(struct sljit_label));
3215 	PTR_FAIL_IF(!label);
3216 	set_label(label, compiler);
3217 
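	/* Emit a zero-size record; the second byte tells the code generator
	   that this position is a label (0) rather than a jump (1). */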
3218 	inst = (sljit_u8*)ensure_buf(compiler, 2);
3219 	PTR_FAIL_IF(!inst);
3220 	inst[0] = 0;
3221 	inst[1] = 0;
3222 
3223 	return label;
3224 }
3225 
3226 SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_jump(struct sljit_compiler *compiler, sljit_s32 type)
3227 {
3228 	sljit_u8 *inst;
3229 	struct sljit_jump *jump;
3230 
3231 	CHECK_ERROR_PTR();
3232 	CHECK_PTR(check_sljit_emit_jump(compiler, type));
3233 
3234 	jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
3235 	PTR_FAIL_IF_NULL(jump);
3236 	set_jump(jump, compiler, (sljit_u32)((type & SLJIT_REWRITABLE_JUMP) | ((type & 0xff) << TYPE_SHIFT)));
3237 	type &= 0xff;
3238 
3239 	/* Worst case size. */
3240 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
3241 	compiler->size += (type >= SLJIT_JUMP) ? 5 : 6;
3242 #else
3243 	compiler->size += (type >= SLJIT_JUMP) ? (10 + 3) : (2 + 10 + 3);
3244 #endif
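	/* The 64-bit worst case (2 + 10 + 3): a short inverted branch around
	   the sequence, a mov r64, imm64 and an indirect jump. */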
3245 
3246 	inst = (sljit_u8*)ensure_buf(compiler, 2);
3247 	PTR_FAIL_IF_NULL(inst);
3248 
3249 	inst[0] = 0;
3250 	inst[1] = 1;
3251 	return jump;
3252 }
3253 
3254 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_ijump(struct sljit_compiler *compiler, sljit_s32 type, sljit_s32 src, sljit_sw srcw)
3255 {
3256 	sljit_u8 *inst;
3257 	struct sljit_jump *jump;
3258 
3259 	CHECK_ERROR();
3260 	CHECK(check_sljit_emit_ijump(compiler, type, src, srcw));
3261 	ADJUST_LOCAL_OFFSET(src, srcw);
3262 
3263 	CHECK_EXTRA_REGS(src, srcw, (void)0);
3264 
3265 	if (src == SLJIT_IMM) {
3266 		jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
3267 		FAIL_IF_NULL(jump);
3268 		set_jump(jump, compiler, (sljit_u32)(JUMP_ADDR | (type << TYPE_SHIFT)));
3269 		jump->u.target = (sljit_uw)srcw;
3270 
3271 		/* Worst case size. */
3272 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
3273 		compiler->size += 5;
3274 #else
3275 		compiler->size += 10 + 3;
3276 #endif
3277 
3278 		inst = (sljit_u8*)ensure_buf(compiler, 2);
3279 		FAIL_IF_NULL(inst);
3280 
3281 		inst[0] = 0;
3282 		inst[1] = 1;
3283 	}
3284 	else {
3285 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3286 		/* REX_W is not necessary (src is not immediate). */
3287 		compiler->mode32 = 1;
3288 #endif
3289 		inst = emit_x86_instruction(compiler, 1, 0, 0, src, srcw);
3290 		FAIL_IF(!inst);
3291 		inst[0] = GROUP_FF;
3292 		inst[1] = U8(inst[1] | ((type >= SLJIT_FAST_CALL) ? CALL_rm : JMP_rm));
3293 	}
3294 	return SLJIT_SUCCESS;
3295 }
3296 
3297 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *compiler, sljit_s32 op,
3298 	sljit_s32 dst, sljit_sw dstw,
3299 	sljit_s32 type)
3300 {
3301 	sljit_u8 *inst;
3302 	sljit_u8 cond_set;
3303 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3304 	sljit_s32 reg;
3305 #endif /* SLJIT_CONFIG_X86_64 */
3306 	/* ADJUST_LOCAL_OFFSET and CHECK_EXTRA_REGS might overwrite these values. */
3307 	sljit_s32 dst_save = dst;
3308 	sljit_sw dstw_save = dstw;
3309 
3310 	CHECK_ERROR();
3311 	CHECK(check_sljit_emit_op_flags(compiler, op, dst, dstw, type));
3312 
3313 	ADJUST_LOCAL_OFFSET(dst, dstw);
3314 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
3315 
3316 	/* setcc = jcc + 0x10. */
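	/* For example, je (0F 84) becomes sete (0F 94). */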
3317 	cond_set = U8(get_jump_code((sljit_uw)type) + 0x10);
3318 
3319 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3320 	if (GET_OPCODE(op) == SLJIT_OR && !GET_ALL_FLAGS(op) && FAST_IS_REG(dst)) {
3321 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 4 + 3);
3322 		FAIL_IF(!inst);
3323 		INC_SIZE(4 + 3);
3324 		/* Set the low byte of the register to the condition flag. */
3325 		inst[0] = (reg_map[TMP_REG1] <= 7) ? REX : REX_B;
3326 		inst[1] = GROUP_0F;
3327 		inst[2] = cond_set;
3328 		inst[3] = MOD_REG | reg_lmap[TMP_REG1];
3329 		inst[4] = U8(REX | (reg_map[TMP_REG1] <= 7 ? 0 : REX_R) | (reg_map[dst] <= 7 ? 0 : REX_B));
3330 		inst[5] = OR_rm8_r8;
3331 		inst[6] = U8(MOD_REG | (reg_lmap[TMP_REG1] << 3) | reg_lmap[dst]);
3332 		return SLJIT_SUCCESS;
3333 	}
3334 
3335 	reg = (GET_OPCODE(op) < SLJIT_ADD && FAST_IS_REG(dst)) ? dst : TMP_REG1;
3336 
3337 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 4 + 4);
3338 	FAIL_IF(!inst);
3339 	INC_SIZE(4 + 4);
3340 	/* Set the low byte of the register to the condition flag. */
3341 	inst[0] = (reg_map[reg] <= 7) ? REX : REX_B;
3342 	inst[1] = GROUP_0F;
3343 	inst[2] = cond_set;
3344 	inst[3] = MOD_REG | reg_lmap[reg];
3345 	inst[4] = REX_W | (reg_map[reg] <= 7 ? 0 : (REX_B | REX_R));
3346 	/* The movzx instruction does not affect flags. */
3347 	inst[5] = GROUP_0F;
3348 	inst[6] = MOVZX_r_rm8;
3349 	inst[7] = U8(MOD_REG | (reg_lmap[reg] << 3) | reg_lmap[reg]);
3350 
3351 	if (reg != TMP_REG1)
3352 		return SLJIT_SUCCESS;
3353 
3354 	if (GET_OPCODE(op) < SLJIT_ADD) {
3355 		compiler->mode32 = GET_OPCODE(op) != SLJIT_MOV;
3356 		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
3357 	}
3358 
3359 	SLJIT_SKIP_CHECKS(compiler);
3360 	return sljit_emit_op2(compiler, op, dst_save, dstw_save, dst_save, dstw_save, TMP_REG1, 0);
3361 
3362 #else /* !SLJIT_CONFIG_X86_64 */
3363 	SLJIT_ASSERT(reg_map[TMP_REG1] < 4);
3364 
3365 	/* SLJIT_CONFIG_X86_32 code path: setcc can only write byte addressable registers (hence the assertion above). */
3366 	if (GET_OPCODE(op) < SLJIT_ADD && FAST_IS_REG(dst) && reg_map[dst] <= 4) {
3367 		/* Low byte is accessible. */
3368 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 3 + 3);
3369 		FAIL_IF(!inst);
3370 		INC_SIZE(3 + 3);
3371 		/* Set low byte to conditional flag. */
3372 		inst[0] = GROUP_0F;
3373 		inst[1] = cond_set;
3374 		inst[2] = U8(MOD_REG | reg_map[dst]);
3375 
3376 		inst[3] = GROUP_0F;
3377 		inst[4] = MOVZX_r_rm8;
3378 		inst[5] = U8(MOD_REG | (reg_map[dst] << 3) | reg_map[dst]);
3379 		return SLJIT_SUCCESS;
3380 	}
3381 
3382 	if (GET_OPCODE(op) == SLJIT_OR && !GET_ALL_FLAGS(op) && FAST_IS_REG(dst) && reg_map[dst] <= 4) {
3383 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 3 + 2);
3384 		FAIL_IF(!inst);
3385 		INC_SIZE(3 + 2);
3386 
3387 		/* Set low byte to conditional flag. */
3388 		inst[0] = GROUP_0F;
3389 		inst[1] = cond_set;
3390 		inst[2] = U8(MOD_REG | reg_map[TMP_REG1]);
3391 
3392 		inst[3] = OR_rm8_r8;
3393 		inst[4] = U8(MOD_REG | (reg_map[TMP_REG1] << 3) | reg_map[dst]);
3394 		return SLJIT_SUCCESS;
3395 	}
3396 
3397 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 3 + 3);
3398 	FAIL_IF(!inst);
3399 	INC_SIZE(3 + 3);
3400 	/* Set low byte to conditional flag. */
3401 	inst[0] = GROUP_0F;
3402 	inst[1] = cond_set;
3403 	inst[2] = U8(MOD_REG | reg_map[TMP_REG1]);
3404 
3405 	inst[3] = GROUP_0F;
3406 	inst[4] = MOVZX_r_rm8;
3407 	inst[5] = U8(MOD_REG | (reg_map[TMP_REG1] << 3) | reg_map[TMP_REG1]);
3408 
3409 	if (GET_OPCODE(op) < SLJIT_ADD)
3410 		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
3411 
3412 	SLJIT_SKIP_CHECKS(compiler);
3413 	return sljit_emit_op2(compiler, op, dst_save, dstw_save, dst_save, dstw_save, TMP_REG1, 0);
3414 #endif /* SLJIT_CONFIG_X86_64 */
3415 }
3416 
3417 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_select(struct sljit_compiler *compiler, sljit_s32 type,
3418 	sljit_s32 dst_reg,
3419 	sljit_s32 src1, sljit_sw src1w,
3420 	sljit_s32 src2_reg)
3421 {
3422 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
3423 	sljit_s32 dst = dst_reg;
3424 	sljit_sw dstw = 0;
3425 #endif /* SLJIT_CONFIG_X86_32 */
3426 	sljit_sw src2w = 0;
3427 
3428 	CHECK_ERROR();
3429 	CHECK(check_sljit_emit_select(compiler, type, dst_reg, src1, src1w, src2_reg));
3430 
3431 	ADJUST_LOCAL_OFFSET(src1, src1w);
3432 
3433 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
3434 	CHECK_EXTRA_REGS(src1, src1w, (void)0);
3435 	CHECK_EXTRA_REGS(src2_reg, src2w, (void)0);
3436 
3437 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3438 	compiler->mode32 = type & SLJIT_32;
3439 #endif /* SLJIT_CONFIG_X86_64 */
3440 	type &= ~SLJIT_32;
3441 
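	/* Condition codes are encoded in inverted pairs, so the type ^= 0x1
	   below flips the condition whenever the two sources are swapped. */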
3442 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
3443 	if (dst & SLJIT_MEM) {
3444 		if (src1 == SLJIT_IMM || (!(src1 & SLJIT_MEM) && (src2_reg & SLJIT_MEM))) {
3445 			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
3446 			src1 = src2_reg;
3447 			src1w = src2w;
3448 			type ^= 0x1;
3449 		} else
3450 			EMIT_MOV(compiler, TMP_REG1, 0, src2_reg, src2w);
3451 
3452 		dst_reg = TMP_REG1;
3453 	} else {
3454 #endif /* SLJIT_CONFIG_X86_32 */
3455 		if (dst_reg != src2_reg) {
3456 			if (dst_reg == src1) {
3457 				src1 = src2_reg;
3458 				src1w = src2w;
3459 				type ^= 0x1;
3460 			} else {
3461 				if (ADDRESSING_DEPENDS_ON(src1, dst_reg)) {
3462 					EMIT_MOV(compiler, dst_reg, 0, src1, src1w);
3463 					src1 = src2_reg;
3464 					src1w = src2w;
3465 					type ^= 0x1;
3466 				} else
3467 					EMIT_MOV(compiler, dst_reg, 0, src2_reg, src2w);
3468 			}
3469 		}
3470 
3471 		if (SLJIT_UNLIKELY(src1 == SLJIT_IMM)) {
3472 			SLJIT_ASSERT(dst_reg != TMP_REG1);
3473 			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
3474 			src1 = TMP_REG1;
3475 			src1w = 0;
3476 		}
3477 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
3478 	}
3479 #endif /* SLJIT_CONFIG_X86_32 */
3480 
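	/* In the 0F opcode map CMOVcc is Jcc - 0x40, hence the adjustment. */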
3481 	if (sljit_has_cpu_feature(SLJIT_HAS_CMOV))
3482 		FAIL_IF(emit_groupf(compiler, U8(get_jump_code((sljit_uw)type) - 0x40), dst_reg, src1, src1w));
3483 	else
3484 		FAIL_IF(emit_cmov_generic(compiler, type, dst_reg, src1, src1w));
3485 
3486 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
3487 	if (dst_reg == TMP_REG1)
3488 		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
3489 #endif /* SLJIT_CONFIG_X86_32 */
3490 	return SLJIT_SUCCESS;
3491 }
3492 
3493 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fselect(struct sljit_compiler *compiler, sljit_s32 type,
3494 	sljit_s32 dst_freg,
3495 	sljit_s32 src1, sljit_sw src1w,
3496 	sljit_s32 src2_freg)
3497 {
3498 	sljit_u8* inst;
3499 	sljit_uw size;
3500 
3501 	CHECK_ERROR();
3502 	CHECK(check_sljit_emit_fselect(compiler, type, dst_freg, src1, src1w, src2_freg));
3503 
3504 	ADJUST_LOCAL_OFFSET(src1, src1w);
3505 
3506 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3507 	compiler->mode32 = 1;
3508 #endif /* SLJIT_CONFIG_X86_64 */
3509 
3510 	if (dst_freg != src2_freg) {
3511 		if (dst_freg == src1) {
3512 			src1 = src2_freg;
3513 			src1w = 0;
3514 			type ^= 0x1;
3515 		} else
3516 			FAIL_IF(emit_sse2_load(compiler, type & SLJIT_32, dst_freg, src2_freg, 0));
3517 	}
3518 
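	/* A short Jcc (the long form opcode - 0x10) with the inverted
	   condition jumps over the conditional load; its 8 bit displacement
	   is patched below once the size of the load is known. */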
3519 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
3520 	FAIL_IF(!inst);
3521 	INC_SIZE(2);
3522 	inst[0] = U8(get_jump_code((sljit_uw)(type & ~SLJIT_32) ^ 0x1) - 0x10);
3523 
3524 	size = compiler->size;
3525 	FAIL_IF(emit_sse2_load(compiler, type & SLJIT_32, dst_freg, src1, src1w));
3526 
3527 	inst[1] = U8(compiler->size - size);
3528 	return SLJIT_SUCCESS;
3529 }
3530 
3531 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *compiler, sljit_s32 type,
3532 	sljit_s32 freg,
3533 	sljit_s32 srcdst, sljit_sw srcdstw)
3534 {
3535 	sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type);
3536 	sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type);
3537 	sljit_s32 alignment = SLJIT_SIMD_GET_ELEM2_SIZE(type);
3538 	sljit_uw op;
3539 
3540 	CHECK_ERROR();
3541 	CHECK(check_sljit_emit_simd_mov(compiler, type, freg, srcdst, srcdstw));
3542 
3543 	ADJUST_LOCAL_OFFSET(srcdst, srcdstw);
3544 
3545 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3546 	compiler->mode32 = 1;
3547 #endif /* SLJIT_CONFIG_X86_64 */
3548 
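	/* reg_size is the log2 of the vector width: 4 selects a 16 byte XMM
	   register, 5 a 32 byte YMM register (AVX2 only). */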
3549 	switch (reg_size) {
3550 	case 4:
3551 		op = EX86_SSE2;
3552 		break;
3553 	case 5:
3554 		if (!(cpu_feature_list & CPU_FEATURE_AVX2))
3555 			return SLJIT_ERR_UNSUPPORTED;
3556 		op = EX86_SSE2 | VEX_256;
3557 		break;
3558 	default:
3559 		return SLJIT_ERR_UNSUPPORTED;
3560 	}
3561 
3562 	if (!(srcdst & SLJIT_MEM))
3563 		alignment = reg_size;
3564 
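	/* Aligned forms are usable whenever the operand alignment reaches the
	   register size. For floats the store opcode is the load opcode plus
	   one; for integers the 66/F3 prefix selects MOVDQA/MOVDQU. */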
3565 	if (type & SLJIT_SIMD_FLOAT) {
3566 		if (elem_size == 2 || elem_size == 3) {
3567 			op |= alignment >= reg_size ? MOVAPS_x_xm : MOVUPS_x_xm;
3568 
3569 			if (elem_size == 3)
3570 				op |= EX86_PREF_66;
3571 
3572 			if (type & SLJIT_SIMD_STORE)
3573 				op += 1;
3574 		} else
3575 			return SLJIT_ERR_UNSUPPORTED;
3576 	} else {
3577 		op |= ((type & SLJIT_SIMD_STORE) ? MOVDQA_xm_x : MOVDQA_x_xm)
3578 			| (alignment >= reg_size ? EX86_PREF_66 : EX86_PREF_F3);
3579 	}
3580 
3581 	if (type & SLJIT_SIMD_TEST)
3582 		return SLJIT_SUCCESS;
3583 
3584 	if (op & VEX_256)
3585 		return emit_vex_instruction(compiler, op, freg, 0, srcdst, srcdstw);
3586 
3587 	return emit_groupf(compiler, op, freg, srcdst, srcdstw);
3588 }
3589 
3590 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compiler *compiler, sljit_s32 type,
3591 	sljit_s32 freg,
3592 	sljit_s32 src, sljit_sw srcw)
3593 {
3594 	sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type);
3595 	sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type);
3596 	sljit_u8 *inst;
3597 	sljit_u8 opcode = 0;
3598 	sljit_uw size;
3599 
3600 	CHECK_ERROR();
3601 	CHECK(check_sljit_emit_simd_replicate(compiler, type, freg, src, srcw));
3602 
3603 	ADJUST_LOCAL_OFFSET(src, srcw);
3604 
3605 	if (!(type & SLJIT_SIMD_FLOAT)) {
3606 		CHECK_EXTRA_REGS(src, srcw, (void)0);
3607 	}
3608 
3609 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
3610 	if ((type & SLJIT_SIMD_FLOAT) ? (elem_size < 2 || elem_size > 3) : (elem_size > 2))
3611 		return SLJIT_ERR_UNSUPPORTED;
3612 #else /* !SLJIT_CONFIG_X86_32 */
3613 	compiler->mode32 = 1;
3614 
3615 	if (elem_size > 3 || ((type & SLJIT_SIMD_FLOAT) && elem_size < 2))
3616 		return SLJIT_ERR_UNSUPPORTED;
3617 #endif /* SLJIT_CONFIG_X86_32 */
3618 
3619 	if (cpu_feature_list & CPU_FEATURE_AVX2) {
3620 		if (reg_size < 4 || reg_size > 5)
3621 			return SLJIT_ERR_UNSUPPORTED;
3622 
3623 		if (src != SLJIT_IMM && (reg_size == 5 || elem_size < 3 || !(type & SLJIT_SIMD_FLOAT))) {
3624 			if (type & SLJIT_SIMD_TEST)
3625 				return SLJIT_SUCCESS;
3626 
3627 			if (!(src & SLJIT_MEM) && !(type & SLJIT_SIMD_FLOAT)) {
3628 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3629 				if (elem_size >= 3)
3630 					compiler->mode32 = 0;
3631 #endif /* SLJIT_CONFIG_X86_64 */
3632 				FAIL_IF(emit_groupf(compiler, MOVD_x_rm | EX86_PREF_66 | EX86_SSE2_OP1, freg, src, srcw));
3633 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3634 				compiler->mode32 = 1;
3635 #endif /* SLJIT_CONFIG_X86_64 */
3636 				src = freg;
3637 				srcw = 0;
3638 			}
3639 
3640 			switch (elem_size) {
3641 			case 0:
3642 				size = VPBROADCASTB_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2;
3643 				break;
3644 			case 1:
3645 				size = VPBROADCASTW_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2;
3646 				break;
3647 			case 2:
3648 				size = ((type & SLJIT_SIMD_FLOAT) ? VBROADCASTSS_x_xm : VPBROADCASTD_x_xm) | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2;
3649 				break;
3650 			default:
3651 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
3652 				size = VBROADCASTSD_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2;
3653 #else /* !SLJIT_CONFIG_X86_32 */
3654 				size = ((type & SLJIT_SIMD_FLOAT) ? VBROADCASTSD_x_xm : VPBROADCASTQ_x_xm) | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2;
3655 #endif /* SLJIT_CONFIG_X86_32 */
3656 				break;
3657 			}
3658 
3659 			if (reg_size == 5)
3660 				size |= VEX_256;
3661 
3662 			return emit_vex_instruction(compiler, size, freg, 0, src, srcw);
3663 		}
3664 	} else if (reg_size != 4)
3665 		return SLJIT_ERR_UNSUPPORTED;
3666 
3667 	if (type & SLJIT_SIMD_TEST)
3668 		return SLJIT_SUCCESS;
3669 
3670 	if (type & SLJIT_SIMD_FLOAT) {
3671 		if (src == SLJIT_IMM) {
3672 			if (reg_size == 5)
3673 				return emit_vex_instruction(compiler, XORPD_x_xm | VEX_256 | (elem_size == 3 ? EX86_PREF_66 : 0) | EX86_SSE2 | VEX_SSE2_OPV, freg, freg, freg, 0);
3674 
3675 			return emit_groupf(compiler, XORPD_x_xm | (elem_size == 3 ? EX86_PREF_66 : 0) | EX86_SSE2, freg, freg, 0);
3676 		}
3677 
3678 		if (elem_size == 2 && freg != src) {
3679 			FAIL_IF(emit_sse2_load(compiler, 1, freg, src, srcw));
3680 			src = freg;
3681 			srcw = 0;
3682 		}
3683 
3684 		FAIL_IF(emit_groupf(compiler, (elem_size == 2 ? SHUFPS_x_xm : MOVDDUP_x_xm) | (elem_size == 2 ? 0 : EX86_PREF_F2) | EX86_SSE2, freg, src, srcw));
3685 
3686 		if (elem_size == 2)
3687 			return emit_byte(compiler, 0);
3688 		return SLJIT_SUCCESS;
3689 	}
3690 
3691 	if (src == SLJIT_IMM) {
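		/* Widen 8/16 bit immediates to 32 bits by self-replication,
		   so a single dword broadcast can be used. */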
3692 		if (elem_size == 0) {
3693 			srcw = (sljit_u8)srcw;
3694 			srcw |= srcw << 8;
3695 			srcw |= srcw << 16;
3696 			elem_size = 2;
3697 		} else if (elem_size == 1) {
3698 			srcw = (sljit_u16)srcw;
3699 			srcw |= srcw << 16;
3700 			elem_size = 2;
3701 		}
3702 
3703 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
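		/* Canonicalize a 32 bit all-ones pattern to -1, so the PCMPEQD
		   all-ones fast path below is taken on 64 bit as well. */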
3704 		if (elem_size == 2 && (sljit_s32)srcw == -1)
3705 			srcw = -1;
3706 #endif /* SLJIT_CONFIG_X86_64 */
3707 
3708 		if (srcw == 0 || srcw == -1) {
3709 			if (reg_size == 5)
3710 				return emit_vex_instruction(compiler, (srcw == 0 ? PXOR_x_xm : PCMPEQD_x_xm) | VEX_256 | EX86_PREF_66 | EX86_SSE2 | VEX_SSE2_OPV, freg, freg, freg, 0);
3711 
3712 			return emit_groupf(compiler, (srcw == 0 ? PXOR_x_xm : PCMPEQD_x_xm) | EX86_PREF_66 | EX86_SSE2, freg, freg, 0);
3713 		}
3714 
3715 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3716 		if (elem_size == 3)
3717 			FAIL_IF(emit_load_imm64(compiler, TMP_REG1, srcw));
3718 		else
3719 #endif /* SLJIT_CONFIG_X86_64 */
3720 			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, srcw);
3721 
3722 		src = TMP_REG1;
3723 		srcw = 0;
3724 	}
3725 
3726 	size = 2;
3727 	opcode = MOVD_x_rm;
3728 
3729 	switch (elem_size) {
3730 	case 0:
3731 		if (!FAST_IS_REG(src)) {
3732 			opcode = 0x3a /* Prefix of PINSRB_x_rm_i8. */;
3733 			size = 3;
3734 		}
3735 		break;
3736 	case 1:
3737 		if (!FAST_IS_REG(src))
3738 			opcode = PINSRW_x_rm_i8;
3739 		break;
3740 	case 2:
3741 		break;
3742 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3743 	case 3:
3744 		/* MOVQ */
3745 		compiler->mode32 = 0;
3746 		break;
3747 #endif /* SLJIT_CONFIG_X86_64 */
3748 	}
3749 
3750 	inst = emit_x86_instruction(compiler, size | EX86_PREF_66 | EX86_SSE2_OP1, freg, 0, src, srcw);
3751 	FAIL_IF(!inst);
3752 	inst[0] = GROUP_0F;
3753 	inst[1] = opcode;
3754 
3755 	if (reg_size == 5) {
3756 		SLJIT_ASSERT(opcode == MOVD_x_rm);
3757 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
3758 		size = VPBROADCASTD_x_xm;
3759 #else /* !SLJIT_CONFIG_X86_32 */
3760 		size = (elem_size == 3) ? VPBROADCASTQ_x_xm : VPBROADCASTD_x_xm;
3761 #endif /* SLJIT_CONFIG_X86_32 */
3762 		return emit_vex_instruction(compiler, size | VEX_256 | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, 0, freg, 0);
3763 	}
3764 
3765 	if (size == 3) {
3766 		SLJIT_ASSERT(opcode == 0x3a);
3767 		inst[2] = PINSRB_x_rm_i8;
3768 	}
3769 
3770 	if (opcode != MOVD_x_rm)
3771 		FAIL_IF(emit_byte(compiler, 0));
3772 
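	/* Shuffle immediates used below: 0x00 broadcasts element 0, while
	   0x44 (01 00 01 00 in 2 bit fields) duplicates the low qword. */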
3773 	switch (elem_size) {
3774 	case 0:
3775 		FAIL_IF(emit_groupf(compiler, PXOR_x_xm | EX86_PREF_66 | EX86_SSE2, TMP_FREG, TMP_FREG, 0));
3776 		return emit_groupf_ext(compiler, PSHUFB_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, TMP_FREG, 0);
3777 	case 1:
3778 		FAIL_IF(emit_groupf(compiler, PSHUFLW_x_xm | EX86_PREF_F2 | EX86_SSE2, freg, freg, 0));
3779 		FAIL_IF(emit_byte(compiler, 0));
3780 		/* fallthrough */
3781 	default:
3782 		FAIL_IF(emit_groupf(compiler, PSHUFD_x_xm | EX86_PREF_66 | EX86_SSE2, freg, freg, 0));
3783 		return emit_byte(compiler, 0);
3784 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3785 	case 3:
3786 		compiler->mode32 = 1;
3787 		FAIL_IF(emit_groupf(compiler, PSHUFD_x_xm | EX86_PREF_66 | EX86_SSE2, freg, freg, 0));
3788 		return emit_byte(compiler, 0x44);
3789 #endif /* SLJIT_CONFIG_X86_64 */
3790 	}
3791 }
3792 
3793 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compiler *compiler, sljit_s32 type,
3794 	sljit_s32 freg, sljit_s32 lane_index,
3795 	sljit_s32 srcdst, sljit_sw srcdstw)
3796 {
3797 	sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type);
3798 	sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type);
3799 	sljit_u8 *inst;
3800 	sljit_u8 opcode = 0;
3801 	sljit_uw size;
3802 	sljit_s32 freg_orig = freg;
3803 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
3804 	sljit_s32 srcdst_is_ereg = 0;
3805 	sljit_s32 srcdst_orig = 0;
3806 	sljit_sw srcdstw_orig = 0;
3807 #endif /* SLJIT_CONFIG_X86_32 */
3808 
3809 	CHECK_ERROR();
3810 	CHECK(check_sljit_emit_simd_lane_mov(compiler, type, freg, lane_index, srcdst, srcdstw));
3811 
3812 	ADJUST_LOCAL_OFFSET(srcdst, srcdstw);
3813 
3814 	if (reg_size == 5) {
3815 		if (!(cpu_feature_list & CPU_FEATURE_AVX2))
3816 			return SLJIT_ERR_UNSUPPORTED;
3817 	} else if (reg_size != 4)
3818 		return SLJIT_ERR_UNSUPPORTED;
3819 
3820 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
3821 	if ((type & SLJIT_SIMD_FLOAT) ? (elem_size < 2 || elem_size > 3) : elem_size > 2)
3822 		return SLJIT_ERR_UNSUPPORTED;
3823 #else /* !SLJIT_CONFIG_X86_32 */
3824 	if (elem_size > 3 || ((type & SLJIT_SIMD_FLOAT) && elem_size < 2))
3825 		return SLJIT_ERR_UNSUPPORTED;
3826 #endif /* SLJIT_CONFIG_X86_32 */
3827 
3828 	if (type & SLJIT_SIMD_TEST)
3829 		return SLJIT_SUCCESS;
3830 
3831 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3832 	compiler->mode32 = 1;
3833 #else /* !SLJIT_CONFIG_X86_64 */
3834 	if (!(type & SLJIT_SIMD_FLOAT)) {
3835 		CHECK_EXTRA_REGS(srcdst, srcdstw, srcdst_is_ereg = 1);
3836 
3837 		if ((type & SLJIT_SIMD_STORE) && ((srcdst_is_ereg && elem_size < 2) || (elem_size == 0 && (type & SLJIT_SIMD_LANE_SIGNED) && FAST_IS_REG(srcdst) && reg_map[srcdst] >= 4))) {
3838 			srcdst_orig = srcdst;
3839 			srcdstw_orig = srcdstw;
3840 			srcdst = TMP_REG1;
3841 			srcdstw = 0;
3842 		}
3843 	}
3844 #endif /* SLJIT_CONFIG_X86_64 */
3845 
3846 	if (type & SLJIT_SIMD_LANE_ZERO) {
3847 		if (lane_index == 0) {
3848 			if (!(type & SLJIT_SIMD_FLOAT)) {
3849 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3850 				if (elem_size == 3) {
3851 					compiler->mode32 = 0;
3852 					elem_size = 2;
3853 				}
3854 #endif /* SLJIT_CONFIG_X86_64 */
3855 				if (srcdst == SLJIT_IMM) {
3856 					if (elem_size == 0)
3857 						srcdstw = (sljit_u8)srcdstw;
3858 					else if (elem_size == 1)
3859 						srcdstw = (sljit_u16)srcdstw;
3860 
3861 					EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, srcdstw);
3862 					srcdst = TMP_REG1;
3863 					srcdstw = 0;
3864 					elem_size = 2;
3865 				}
3866 
3867 				if (elem_size == 2) {
3868 					if (reg_size == 4)
3869 						return emit_groupf(compiler, MOVD_x_rm | EX86_PREF_66 | EX86_SSE2_OP1, freg, srcdst, srcdstw);
3870 					return emit_vex_instruction(compiler, MOVD_x_rm | VEX_AUTO_W | EX86_PREF_66 | EX86_SSE2_OP1, freg, 0, srcdst, srcdstw);
3871 				}
3872 			} else if (srcdst & SLJIT_MEM) {
3873 				SLJIT_ASSERT(elem_size == 2 || elem_size == 3);
3874 
3875 				if (reg_size == 4)
3876 					return emit_groupf(compiler, MOVSD_x_xm | (elem_size == 2 ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, freg, srcdst, srcdstw);
3877 				return emit_vex_instruction(compiler, MOVSD_x_xm | (elem_size == 2 ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, freg, 0, srcdst, srcdstw);
3878 			} else if (elem_size == 3) {
3879 				if (reg_size == 4)
3880 					return emit_groupf(compiler, MOVQ_x_xm | EX86_PREF_F3 | EX86_SSE2, freg, srcdst, 0);
3881 				return emit_vex_instruction(compiler, MOVQ_x_xm | EX86_PREF_F3 | EX86_SSE2, freg, 0, srcdst, 0);
3882 			}
3883 		}
3884 
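		/* With 256 bit registers a lane in the upper half is handled in
		   a 128 bit temporary: the half is extracted (or freshly built
		   for SLJIT_SIMD_LANE_ZERO), modified, then merged back below
		   with VINSERTF128/VINSERTI128 or a half-swapping VPERMPD/VPERMQ
		   (immediate 0x4e). */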
3885 		if (reg_size == 5 && lane_index >= (1 << (4 - elem_size))) {
3886 			freg = TMP_FREG;
3887 			lane_index -= (1 << (4 - elem_size));
3888 		} else if ((type & SLJIT_SIMD_FLOAT) && freg == srcdst) {
3889 			FAIL_IF(emit_sse2_load(compiler, elem_size == 2, TMP_FREG, srcdst, srcdstw));
3890 			srcdst = TMP_FREG;
3891 			srcdstw = 0;
3892 		}
3893 
3894 		size = ((!(type & SLJIT_SIMD_FLOAT) || elem_size != 2) ? EX86_PREF_66 : 0)
3895 			| ((type & SLJIT_SIMD_FLOAT) ? XORPD_x_xm : PXOR_x_xm) | EX86_SSE2;
3896 
3897 		if (reg_size == 5)
3898 			FAIL_IF(emit_vex_instruction(compiler, size | VEX_256 | VEX_SSE2_OPV, freg, freg, freg, 0));
3899 		else
3900 			FAIL_IF(emit_groupf(compiler, size, freg, freg, 0));
3901 	} else if (reg_size == 5 && lane_index >= (1 << (4 - elem_size))) {
3902 		FAIL_IF(emit_vex_instruction(compiler, ((type & SLJIT_SIMD_FLOAT) ? VEXTRACTF128_x_ym : VEXTRACTI128_x_ym) | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2, freg, 0, TMP_FREG, 0));
3903 		FAIL_IF(emit_byte(compiler, 1));
3904 
3905 		freg = TMP_FREG;
3906 		lane_index -= (1 << (4 - elem_size));
3907 	}
3908 
3909 	if (type & SLJIT_SIMD_FLOAT) {
3910 		if (elem_size == 3) {
3911 			if (srcdst & SLJIT_MEM) {
3912 				if (type & SLJIT_SIMD_STORE)
3913 					size = lane_index == 0 ? MOVLPD_m_x : MOVHPD_m_x;
3914 				else
3915 					size = lane_index == 0 ? MOVLPD_x_m : MOVHPD_x_m;
3916 
3917 				FAIL_IF(emit_groupf(compiler, size | EX86_PREF_66 | EX86_SSE2, freg, srcdst, srcdstw));
3918 
3919 				/* In case of store, freg is not TMP_FREG. */
3920 			} else if (type & SLJIT_SIMD_STORE) {
3921 				if (lane_index == 1)
3922 					return emit_groupf(compiler, MOVHLPS_x_x | EX86_SSE2, srcdst, freg, 0);
3923 				return emit_sse2_load(compiler, 0, srcdst, freg, 0);
3924 			} else {
3925 				if (lane_index == 1)
3926 					FAIL_IF(emit_groupf(compiler, MOVLHPS_x_x | EX86_SSE2, freg, srcdst, 0));
3927 				else
3928 					FAIL_IF(emit_sse2_store(compiler, 0, freg, 0, srcdst));
3929 			}
3930 		} else if (type & SLJIT_SIMD_STORE) {
3931 			if (lane_index == 0)
3932 				return emit_sse2_store(compiler, 1, srcdst, srcdstw, freg);
3933 
3934 			if (srcdst & SLJIT_MEM) {
3935 				FAIL_IF(emit_groupf_ext(compiler, EXTRACTPS_x_xm | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2, freg, srcdst, srcdstw));
3936 				return emit_byte(compiler, U8(lane_index));
3937 			}
3938 
3939 			if (srcdst == freg)
3940 				size = SHUFPS_x_xm | EX86_SSE2;
3941 			else {
3942 				if (cpu_feature_list & CPU_FEATURE_AVX) {
3943 					FAIL_IF(emit_vex_instruction(compiler, SHUFPS_x_xm | EX86_SSE2 | VEX_SSE2_OPV, srcdst, freg, freg, 0));
3944 					return emit_byte(compiler, U8(lane_index));
3945 				}
3946 
3947 				switch (lane_index) {
3948 				case 1:
3949 					size = MOVSHDUP_x_xm | EX86_PREF_F3 | EX86_SSE2;
3950 					break;
3951 				case 2:
3952 					size = MOVHLPS_x_x | EX86_SSE2;
3953 					break;
3954 				default:
3955 					SLJIT_ASSERT(lane_index == 3);
3956 					size = PSHUFD_x_xm | EX86_PREF_66 | EX86_SSE2;
3957 					break;
3958 				}
3959 			}
3960 
3961 			FAIL_IF(emit_groupf(compiler, size, srcdst, freg, 0));
3962 
3963 			size &= 0xff;
3964 			if (size == SHUFPS_x_xm || size == PSHUFD_x_xm)
3965 				return emit_byte(compiler, U8(lane_index));
3966 
3967 			return SLJIT_SUCCESS;
3968 		} else {
3969 			if (lane_index != 0 || (srcdst & SLJIT_MEM)) {
3970 				FAIL_IF(emit_groupf_ext(compiler, INSERTPS_x_xm | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2, freg, srcdst, srcdstw));
3971 				FAIL_IF(emit_byte(compiler, U8(lane_index << 4)));
3972 			} else
3973 				FAIL_IF(emit_sse2_store(compiler, 1, freg, 0, srcdst));
3974 		}
3975 
3976 		if (freg != TMP_FREG || (type & SLJIT_SIMD_STORE))
3977 			return SLJIT_SUCCESS;
3978 
3979 		SLJIT_ASSERT(reg_size == 5);
3980 
3981 		if (type & SLJIT_SIMD_LANE_ZERO) {
3982 			FAIL_IF(emit_vex_instruction(compiler, VPERMPD_y_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | VEX_W | EX86_SSE2, freg_orig, 0, TMP_FREG, 0));
3983 			return emit_byte(compiler, 0x4e);
3984 		}
3985 
3986 		FAIL_IF(emit_vex_instruction(compiler, VINSERTF128_y_y_xm | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2 | VEX_SSE2_OPV, freg_orig, freg_orig, TMP_FREG, 0));
3987 		return emit_byte(compiler, 1);
3988 	}
3989 
3990 	if (srcdst == SLJIT_IMM) {
3991 		EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, srcdstw);
3992 		srcdst = TMP_REG1;
3993 		srcdstw = 0;
3994 	}
3995 
3996 	size = 3;
3997 
3998 	switch (elem_size) {
3999 	case 0:
4000 		opcode = (type & SLJIT_SIMD_STORE) ? PEXTRB_rm_x_i8 : PINSRB_x_rm_i8;
4001 		break;
4002 	case 1:
4003 		if (!(type & SLJIT_SIMD_STORE)) {
4004 			size = 2;
4005 			opcode = PINSRW_x_rm_i8;
4006 		} else
4007 			opcode = PEXTRW_rm_x_i8;
4008 		break;
4009 	case 2:
4010 		opcode = (type & SLJIT_SIMD_STORE) ? PEXTRD_rm_x_i8 : PINSRD_x_rm_i8;
4011 		break;
4012 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
4013 	case 3:
4014 		/* PINSRQ / PEXTRQ */
4015 		opcode = (type & SLJIT_SIMD_STORE) ? PEXTRD_rm_x_i8 : PINSRD_x_rm_i8;
4016 		compiler->mode32 = 0;
4017 		break;
4018 #endif /* SLJIT_CONFIG_X86_64 */
4019 	}
4020 
4021 	inst = emit_x86_instruction(compiler, size | EX86_PREF_66 | EX86_SSE2_OP1, freg, 0, srcdst, srcdstw);
4022 	FAIL_IF(!inst);
4023 	inst[0] = GROUP_0F;
4024 
4025 	if (size == 3) {
4026 		inst[1] = 0x3a;
4027 		inst[2] = opcode;
4028 	} else
4029 		inst[1] = opcode;
4030 
4031 	FAIL_IF(emit_byte(compiler, U8(lane_index)));
4032 
4033 	if (!(type & SLJIT_SIMD_LANE_SIGNED) || (srcdst & SLJIT_MEM)) {
4034 		if (freg == TMP_FREG && !(type & SLJIT_SIMD_STORE)) {
4035 			SLJIT_ASSERT(reg_size == 5);
4036 
4037 			if (type & SLJIT_SIMD_LANE_ZERO) {
4038 				FAIL_IF(emit_vex_instruction(compiler, VPERMQ_y_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | VEX_W | EX86_SSE2, freg_orig, 0, TMP_FREG, 0));
4039 				return emit_byte(compiler, 0x4e);
4040 			}
4041 
4042 			FAIL_IF(emit_vex_instruction(compiler, VINSERTI128_y_y_xm | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2 | VEX_SSE2_OPV, freg_orig, freg_orig, TMP_FREG, 0));
4043 			return emit_byte(compiler, 1);
4044 		}
4045 
4046 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
4047 		if (srcdst_orig & SLJIT_MEM)
4048 			return emit_mov(compiler, srcdst_orig, srcdstw_orig, TMP_REG1, 0);
4049 #endif /* SLJIT_CONFIG_X86_32 */
4050 		return SLJIT_SUCCESS;
4051 	}
4052 
4053 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
4054 	if (elem_size >= 3)
4055 		return SLJIT_SUCCESS;
4056 
4057 	compiler->mode32 = (type & SLJIT_32);
4058 
4059 	size = 2;
4060 
4061 	if (elem_size == 0)
4062 		size |= EX86_REX;
4063 
4064 	if (elem_size == 2) {
4065 		if (type & SLJIT_32)
4066 			return SLJIT_SUCCESS;
4067 
4068 		SLJIT_ASSERT(!(compiler->mode32));
4069 		size = 1;
4070 	}
4071 
4072 	inst = emit_x86_instruction(compiler, size, srcdst, 0, srcdst, 0);
4073 	FAIL_IF(!inst);
4074 
4075 	if (size != 1) {
4076 		inst[0] = GROUP_0F;
4077 		inst[1] = U8((elem_size == 0) ? MOVSX_r_rm8 : MOVSX_r_rm16);
4078 	} else
4079 		inst[0] = MOVSXD_r_rm;
4080 #else /* !SLJIT_CONFIG_X86_64 */
4081 	if (elem_size >= 2)
4082 		return SLJIT_SUCCESS;
4083 
4084 	FAIL_IF(emit_groupf(compiler, (elem_size == 0) ? MOVSX_r_rm8 : MOVSX_r_rm16,
4085 		(srcdst_orig != 0 && FAST_IS_REG(srcdst_orig)) ? srcdst_orig : srcdst, srcdst, 0));
4086 
4087 	if (srcdst_orig & SLJIT_MEM)
4088 		return emit_mov(compiler, srcdst_orig, srcdstw_orig, TMP_REG1, 0);
4089 #endif /* SLJIT_CONFIG_X86_64 */
4090 	return SLJIT_SUCCESS;
4091 }
4092 
4093 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_compiler *compiler, sljit_s32 type,
4094 	sljit_s32 freg,
4095 	sljit_s32 src, sljit_s32 src_lane_index)
4096 {
4097 	sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type);
4098 	sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type);
4099 	sljit_uw pref;
4100 	sljit_u8 byte;
4101 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
4102 	sljit_s32 opcode3 = TMP_REG1;
4103 #else /* !SLJIT_CONFIG_X86_32 */
4104 	sljit_s32 opcode3 = SLJIT_S0;
4105 #endif /* SLJIT_CONFIG_X86_32 */
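	/* opcode3 is simply a register whose hardware encoding is 3: passed
	   as the register operand, it supplies the /3 opcode extension field
	   required by PSRLDQ (checked by the assertion below). */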
4106 
4107 	CHECK_ERROR();
4108 	CHECK(check_sljit_emit_simd_lane_replicate(compiler, type, freg, src, src_lane_index));
4109 
4110 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
4111 	compiler->mode32 = 1;
4112 #endif /* SLJIT_CONFIG_X86_64 */
4113 	SLJIT_ASSERT(reg_map[opcode3] == 3);
4114 
4115 	if (reg_size == 5) {
4116 		if (!(cpu_feature_list & CPU_FEATURE_AVX2))
4117 			return SLJIT_ERR_UNSUPPORTED;
4118 	} else if (reg_size != 4)
4119 		return SLJIT_ERR_UNSUPPORTED;
4120 
4121 	if (type & SLJIT_SIMD_FLOAT) {
4122 		pref = 0;
4123 		byte = U8(src_lane_index);
4124 
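		/* Repeating the 2 bit lane selector in every field of a shuffle
		   immediate (index * 0x55) broadcasts a single element. */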
4125 		if (elem_size == 3) {
4126 			if (type & SLJIT_SIMD_TEST)
4127 				return SLJIT_SUCCESS;
4128 
4129 			if (reg_size == 5) {
4130 				if (src_lane_index == 0)
4131 					return emit_vex_instruction(compiler, VBROADCASTSD_x_xm | VEX_256 | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, 0, src, 0);
4132 
4133 				FAIL_IF(emit_vex_instruction(compiler, VPERMPD_y_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | VEX_W | EX86_SSE2, freg, 0, src, 0));
4134 
4135 				byte = U8(byte | (byte << 2));
4136 				return emit_byte(compiler, U8(byte | (byte << 4)));
4137 			}
4138 
4139 			if (src_lane_index == 0)
4140 				return emit_groupf(compiler, MOVDDUP_x_xm | EX86_PREF_F2 | EX86_SSE2, freg, src, 0);
4141 
4142 			/* The 66 prefix changes SHUFPS_x_xm into SHUFPD_x_xm. */
4143 			pref = EX86_PREF_66;
4144 		} else if (elem_size != 2)
4145 			return SLJIT_ERR_UNSUPPORTED;
4146 		else if (type & SLJIT_SIMD_TEST)
4147 			return SLJIT_SUCCESS;
4148 
4149 		if (reg_size == 5) {
4150 			SLJIT_ASSERT(elem_size == 2);
4151 
4152 			if (src_lane_index == 0)
4153 				return emit_vex_instruction(compiler, VBROADCASTSS_x_xm | VEX_256 | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, 0, src, 0);
4154 
4155 			FAIL_IF(emit_vex_instruction(compiler, VPERMPD_y_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | VEX_W | EX86_SSE2, freg, 0, src, 0));
4156 
4157 			byte = 0x44;
4158 			if (src_lane_index >= 4) {
4159 				byte = 0xee;
4160 				src_lane_index -= 4;
4161 			}
4162 
4163 			FAIL_IF(emit_byte(compiler, byte));
4164 			FAIL_IF(emit_vex_instruction(compiler, SHUFPS_x_xm | VEX_256 | pref | EX86_SSE2 | VEX_SSE2_OPV, freg, freg, freg, 0));
4165 			byte = U8(src_lane_index);
4166 		} else if (freg != src && (cpu_feature_list & CPU_FEATURE_AVX)) {
4167 			FAIL_IF(emit_vex_instruction(compiler, SHUFPS_x_xm | pref | EX86_SSE2 | VEX_SSE2_OPV, freg, src, src, 0));
4168 		} else {
4169 			if (freg != src)
4170 				FAIL_IF(emit_groupf(compiler, MOVAPS_x_xm | pref | EX86_SSE2, freg, src, 0));
4171 
4172 			FAIL_IF(emit_groupf(compiler, SHUFPS_x_xm | pref | EX86_SSE2, freg, freg, 0));
4173 		}
4174 
4175 		if (elem_size == 2) {
4176 			byte = U8(byte | (byte << 2));
4177 			byte = U8(byte | (byte << 4));
4178 		} else
4179 			byte = U8(byte | (byte << 1));
4180 
4181 		return emit_byte(compiler, U8(byte));
4182 	}
4183 
4184 	if (type & SLJIT_SIMD_TEST)
4185 		return SLJIT_SUCCESS;
4186 
4187 	if (elem_size == 0) {
4188 		if (reg_size == 5 && src_lane_index >= 16) {
4189 			FAIL_IF(emit_vex_instruction(compiler, VPERMQ_y_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | VEX_W | EX86_SSE2, freg, 0, src, 0));
4190 			FAIL_IF(emit_byte(compiler, src_lane_index >= 24 ? 0xff : 0xaa));
4191 			src_lane_index &= 0x7;
4192 			src = freg;
4193 		}
4194 
4195 		if ((freg != src && !(cpu_feature_list & CPU_FEATURE_AVX2)) || src_lane_index != 0) {
4196 			pref = 0;
4197 
4198 			if ((src_lane_index & 0x3) == 0) {
4199 				pref = EX86_PREF_66;
4200 				byte = U8(src_lane_index >> 2);
4201 			} else if (src_lane_index < 8 && (src_lane_index & 0x1) == 0) {
4202 				pref = EX86_PREF_F2;
4203 				byte = U8(src_lane_index >> 1);
4204 			} else {
4205 				if (freg == src || !(cpu_feature_list & CPU_FEATURE_AVX2)) {
4206 					if (freg != src)
4207 						FAIL_IF(emit_groupf(compiler, MOVDQA_x_xm | EX86_PREF_66 | EX86_SSE2, freg, src, 0));
4208 
4209 					FAIL_IF(emit_groupf(compiler, PSRLDQ_x | EX86_PREF_66 | EX86_SSE2_OP2, opcode3, freg, 0));
4210 				} else
4211 					FAIL_IF(emit_vex_instruction(compiler, PSRLDQ_x | EX86_PREF_66 | EX86_SSE2_OP2 | VEX_SSE2_OPV, opcode3, freg, src, 0));
4212 
4213 				FAIL_IF(emit_byte(compiler, U8(src_lane_index)));
4214 			}
4215 
4216 			if (pref != 0) {
4217 				FAIL_IF(emit_groupf(compiler, PSHUFLW_x_xm | pref | EX86_SSE2, freg, src, 0));
4218 				FAIL_IF(emit_byte(compiler, byte));
4219 			}
4220 
4221 			src = freg;
4222 		}
4223 
4224 		if (cpu_feature_list & CPU_FEATURE_AVX2)
4225 			return emit_vex_instruction(compiler, VPBROADCASTB_x_xm | (reg_size == 5 ? VEX_256 : 0) | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, 0, src, 0);
4226 
4227 		SLJIT_ASSERT(reg_size == 4);
4228 		FAIL_IF(emit_groupf(compiler, PXOR_x_xm | EX86_PREF_66 | EX86_SSE2, TMP_FREG, TMP_FREG, 0));
4229 		return emit_groupf_ext(compiler, PSHUFB_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, TMP_FREG, 0);
4230 	}
4231 
4232 	if ((cpu_feature_list & CPU_FEATURE_AVX2) && src_lane_index == 0 && elem_size <= 3) {
4233 		switch (elem_size) {
4234 		case 1:
4235 			pref = VPBROADCASTW_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2;
4236 			break;
4237 		case 2:
4238 			pref = VPBROADCASTD_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2;
4239 			break;
4240 		default:
4241 			pref = VPBROADCASTQ_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2;
4242 			break;
4243 		}
4244 
4245 		if (reg_size == 5)
4246 			pref |= VEX_256;
4247 
4248 		return emit_vex_instruction(compiler, pref, freg, 0, src, 0);
4249 	}
4250 
4251 	if (reg_size == 5) {
4252 		switch (elem_size) {
4253 		case 1:
4254 			byte = U8(src_lane_index & 0x3);
4255 			src_lane_index >>= 2;
4256 			pref = PSHUFLW_x_xm | VEX_256 | ((src_lane_index & 1) == 0 ? EX86_PREF_F2 : EX86_PREF_F3) | EX86_SSE2;
4257 			break;
4258 		case 2:
4259 			byte = U8(src_lane_index & 0x3);
4260 			src_lane_index >>= 1;
4261 			pref = PSHUFD_x_xm | VEX_256 | EX86_PREF_66 | EX86_SSE2;
4262 			break;
4263 		case 3:
4264 			pref = 0;
4265 			break;
4266 		default:
4267 			FAIL_IF(emit_vex_instruction(compiler, VPERMQ_y_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | VEX_W | EX86_SSE2, freg, 0, src, 0));
4268 			return emit_byte(compiler, U8(src_lane_index == 0 ? 0x44 : 0xee));
4269 		}
4270 
4271 		if (pref != 0) {
4272 			FAIL_IF(emit_vex_instruction(compiler, pref, freg, 0, src, 0));
4273 			byte = U8(byte | (byte << 2));
4274 			FAIL_IF(emit_byte(compiler, U8(byte | (byte << 4))));
4275 
4276 			if (src_lane_index == 0)
4277 				return emit_vex_instruction(compiler, VPBROADCASTQ_x_xm | VEX_256 | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, 0, freg, 0);
4278 
4279 			src = freg;
4280 		}
4281 
4282 		FAIL_IF(emit_vex_instruction(compiler, VPERMQ_y_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | VEX_W | EX86_SSE2, freg, 0, src, 0));
4283 		byte = U8(src_lane_index);
4284 		byte = U8(byte | (byte << 2));
4285 		return emit_byte(compiler, U8(byte | (byte << 4)));
4286 	}
4287 
4288 	switch (elem_size) {
4289 	case 1:
4290 		byte = U8(src_lane_index & 0x3);
4291 		src_lane_index >>= 1;
4292 		pref = (src_lane_index & 2) == 0 ? EX86_PREF_F2 : EX86_PREF_F3;
4293 
4294 		FAIL_IF(emit_groupf(compiler, PSHUFLW_x_xm | pref | EX86_SSE2, freg, src, 0));
4295 		byte = U8(byte | (byte << 2));
4296 		FAIL_IF(emit_byte(compiler, U8(byte | (byte << 4))));
4297 
4298 		if ((cpu_feature_list & CPU_FEATURE_AVX2) && pref == EX86_PREF_F2)
4299 			return emit_vex_instruction(compiler, VPBROADCASTD_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, 0, freg, 0);
4300 
4301 		src = freg;
4302 		/* fallthrough */
4303 	case 2:
4304 		byte = U8(src_lane_index);
4305 		byte = U8(byte | (byte << 2));
4306 		break;
4307 	default:
4308 		byte = U8(src_lane_index << 1);
4309 		byte = U8(byte | (byte << 2) | 0x4);
4310 		break;
4311 	}
4312 
4313 	FAIL_IF(emit_groupf(compiler, PSHUFD_x_xm | EX86_PREF_66 | EX86_SSE2, freg, src, 0));
4314 	return emit_byte(compiler, U8(byte | (byte << 4)));
4315 }
4316 
4317 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler *compiler, sljit_s32 type,
4318 	sljit_s32 freg,
4319 	sljit_s32 src, sljit_sw srcw)
4320 {
4321 	sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type);
4322 	sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type);
4323 	sljit_s32 elem2_size = SLJIT_SIMD_GET_ELEM2_SIZE(type);
4324 	sljit_u8 opcode;
4325 
4326 	CHECK_ERROR();
4327 	CHECK(check_sljit_emit_simd_extend(compiler, type, freg, src, srcw));
4328 
4329 	ADJUST_LOCAL_OFFSET(src, srcw);
4330 
4331 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
4332 	compiler->mode32 = 1;
4333 #endif /* SLJIT_CONFIG_X86_64 */
4334 
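	/* Integer widening maps onto the SSE4.1 PMOVSX/PMOVZX family chosen
	   below; the float case is a single CVTPS2PD. */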
4335 	if (reg_size == 5) {
4336 		if (!(cpu_feature_list & CPU_FEATURE_AVX2))
4337 			return SLJIT_ERR_UNSUPPORTED;
4338 	} else if (reg_size != 4)
4339 		return SLJIT_ERR_UNSUPPORTED;
4340 
4341 	if (type & SLJIT_SIMD_FLOAT) {
4342 		if (elem_size != 2 || elem2_size != 3)
4343 			return SLJIT_ERR_UNSUPPORTED;
4344 
4345 		if (type & SLJIT_SIMD_TEST)
4346 			return SLJIT_SUCCESS;
4347 
4348 		if (reg_size == 4)
4349 			return emit_groupf(compiler, CVTPS2PD_x_xm | EX86_SSE2, freg, src, srcw);
4350 		return emit_vex_instruction(compiler, CVTPS2PD_x_xm | VEX_256 | EX86_SSE2, freg, 0, src, srcw);
4351 	}
4352 
4353 	switch (elem_size) {
4354 	case 0:
4355 		if (elem2_size == 1)
4356 			opcode = (type & SLJIT_SIMD_EXTEND_SIGNED) ? PMOVSXBW_x_xm : PMOVZXBW_x_xm;
4357 		else if (elem2_size == 2)
4358 			opcode = (type & SLJIT_SIMD_EXTEND_SIGNED) ? PMOVSXBD_x_xm : PMOVZXBD_x_xm;
4359 		else if (elem2_size == 3)
4360 			opcode = (type & SLJIT_SIMD_EXTEND_SIGNED) ? PMOVSXBQ_x_xm : PMOVZXBQ_x_xm;
4361 		else
4362 			return SLJIT_ERR_UNSUPPORTED;
4363 		break;
4364 	case 1:
4365 		if (elem2_size == 2)
4366 			opcode = (type & SLJIT_SIMD_EXTEND_SIGNED) ? PMOVSXWD_x_xm : PMOVZXWD_x_xm;
4367 		else if (elem2_size == 3)
4368 			opcode = (type & SLJIT_SIMD_EXTEND_SIGNED) ? PMOVSXWQ_x_xm : PMOVZXWQ_x_xm;
4369 		else
4370 			return SLJIT_ERR_UNSUPPORTED;
4371 		break;
4372 	case 2:
4373 		if (elem2_size == 3)
4374 			opcode = (type & SLJIT_SIMD_EXTEND_SIGNED) ? PMOVSXDQ_x_xm : PMOVZXDQ_x_xm;
4375 		else
4376 			return SLJIT_ERR_UNSUPPORTED;
4377 		break;
4378 	default:
4379 		return SLJIT_ERR_UNSUPPORTED;
4380 	}
4381 
4382 	if (type & SLJIT_SIMD_TEST)
4383 		return SLJIT_SUCCESS;
4384 
4385 	if (reg_size == 4)
4386 		return emit_groupf_ext(compiler, opcode | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, src, srcw);
4387 	return emit_vex_instruction(compiler, opcode | VEX_256 | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, 0, src, srcw);
4388 }
4389 
4390 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *compiler, sljit_s32 type,
4391 	sljit_s32 freg,
4392 	sljit_s32 dst, sljit_sw dstw)
4393 {
4394 	sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type);
4395 	sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type);
4396 	sljit_s32 dst_r;
4397 	sljit_uw pref;
4398 	sljit_u8 *inst;
4399 
4400 	CHECK_ERROR();
4401 	CHECK(check_sljit_emit_simd_sign(compiler, type, freg, dst, dstw));
4402 
4403 	ADJUST_LOCAL_OFFSET(dst, dstw);
4404 
4405 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
4406 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
4407 	compiler->mode32 = 1;
4408 #endif /* SLJIT_CONFIG_X86_64 */
4409 
4410 	if (elem_size > 3 || ((type & SLJIT_SIMD_FLOAT) && elem_size < 2))
4411 		return SLJIT_ERR_UNSUPPORTED;
4412 
4413 	if (reg_size == 4) {
4414 		if (type & SLJIT_SIMD_TEST)
4415 			return SLJIT_SUCCESS;
4416 
4417 		pref = EX86_PREF_66 | EX86_SSE2_OP2;
4418 
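		/* There is no 16 bit PMOVMSKB form: words are saturate-packed to
		   bytes first. Only the upper eight bytes of the packed result
		   come from freg, hence the shift by 8 after PMOVMSKB. */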
4419 		switch (elem_size) {
4420 		case 1:
4421 			FAIL_IF(emit_groupf(compiler, PACKSSWB_x_xm | EX86_PREF_66 | EX86_SSE2, TMP_FREG, freg, 0));
4422 			freg = TMP_FREG;
4423 			break;
4424 		case 2:
4425 			pref = EX86_SSE2_OP2;
4426 			break;
4427 		}
4428 
4429 		dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
4430 		FAIL_IF(emit_groupf(compiler, (elem_size < 2 ? PMOVMSKB_r_x : MOVMSKPS_r_x) | pref, dst_r, freg, 0));
4431 
4432 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
4433 		compiler->mode32 = type & SLJIT_32;
4434 #endif /* SLJIT_CONFIG_X86_64 */
4435 
4436 		if (elem_size == 1) {
4437 			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 8, dst_r, 0);
4438 			FAIL_IF(!inst);
4439 			inst[1] |= SHR;
4440 		}
4441 
4442 		if (dst_r == TMP_REG1)
4443 			return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
4444 
4445 		return SLJIT_SUCCESS;
4446 	}
4447 
4448 	if (reg_size != 5 || !(cpu_feature_list & CPU_FEATURE_AVX2))
4449 		return SLJIT_ERR_UNSUPPORTED;
4450 
4451 	if (type & SLJIT_SIMD_TEST)
4452 		return SLJIT_SUCCESS;
4453 
4454 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
4455 
4456 	if (elem_size == 1) {
4457 		FAIL_IF(emit_vex_instruction(compiler, VEXTRACTI128_x_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2, freg, 0, TMP_FREG, 0));
4458 		FAIL_IF(emit_byte(compiler, 1));
4459 		FAIL_IF(emit_vex_instruction(compiler, PACKSSWB_x_xm | VEX_256 | EX86_PREF_66 | EX86_SSE2 | VEX_SSE2_OPV, TMP_FREG, freg, TMP_FREG, 0));
4460 		FAIL_IF(emit_groupf(compiler, PMOVMSKB_r_x | EX86_PREF_66 | EX86_SSE2_OP2, dst_r, TMP_FREG, 0));
4461 	} else {
4462 		pref = MOVMSKPS_r_x | VEX_256 | EX86_SSE2_OP2;
4463 
4464 		if (elem_size == 0)
4465 			pref = PMOVMSKB_r_x | VEX_256 | EX86_PREF_66 | EX86_SSE2_OP2;
4466 		else if (elem_size == 3)
4467 			pref |= EX86_PREF_66;
4468 
4469 		FAIL_IF(emit_vex_instruction(compiler, pref, dst_r, 0, freg, 0));
4470 	}
4471 
4472 	if (dst_r == TMP_REG1) {
4473 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
4474 		compiler->mode32 = type & SLJIT_32;
4475 #endif /* SLJIT_CONFIG_X86_64 */
4476 		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
4477 	}
4478 
4479 	return SLJIT_SUCCESS;
4480 }
4481 
4482 static sljit_s32 emit_simd_mov(struct sljit_compiler *compiler, sljit_s32 type,
4483 	sljit_s32 dst_freg, sljit_s32 src_freg)
4484 {
4485 	sljit_uw op = ((type & SLJIT_SIMD_FLOAT) ? MOVAPS_x_xm : MOVDQA_x_xm) | EX86_SSE2;
4486 
4487 	SLJIT_ASSERT(SLJIT_SIMD_GET_REG_SIZE(type) == 4);
4488 
4489 	if (!(type & SLJIT_SIMD_FLOAT) || SLJIT_SIMD_GET_ELEM_SIZE(type) == 3)
4490 		op |= EX86_PREF_66;
4491 
4492 	return emit_groupf(compiler, op, dst_freg, src_freg, 0);
4493 }
4494 
4495 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *compiler, sljit_s32 type,
4496 	sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2_freg)
4497 {
4498 	sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type);
4499 	sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type);
4500 	sljit_s32 needs_move = 0;
4501 	sljit_uw op = 0;
4502 
4503 	CHECK_ERROR();
4504 	CHECK(check_sljit_emit_simd_op2(compiler, type, dst_freg, src1_freg, src2_freg));
4505 
4506 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
4507 	compiler->mode32 = 1;
4508 #endif /* SLJIT_CONFIG_X86_64 */
4509 
4510 	if (reg_size == 5) {
4511 		if (!(cpu_feature_list & CPU_FEATURE_AVX2))
4512 			return SLJIT_ERR_UNSUPPORTED;
4513 	} else if (reg_size != 4)
4514 		return SLJIT_ERR_UNSUPPORTED;
4515 
4516 	if ((type & SLJIT_SIMD_FLOAT) && (elem_size < 2 || elem_size > 3))
4517 		return SLJIT_ERR_UNSUPPORTED;
4518 
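	/* The three bitwise ops map onto PAND/POR/PXOR for integers and the
	   ANDPS/ANDPD style opcodes for floats; the 66 prefix is needed for
	   the integer forms and for the double (PD) variants. */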
4519 	switch (SLJIT_SIMD_GET_OPCODE(type)) {
4520 	case SLJIT_SIMD_OP2_AND:
4521 		op = (type & SLJIT_SIMD_FLOAT) ? ANDPD_x_xm : PAND_x_xm;
4522 
4523 		if (!(type & SLJIT_SIMD_FLOAT) || elem_size == 3)
4524 			op |= EX86_PREF_66;
4525 		break;
4526 	case SLJIT_SIMD_OP2_OR:
4527 		op = (type & SLJIT_SIMD_FLOAT) ? ORPD_x_xm : POR_x_xm;
4528 
4529 		if (!(type & SLJIT_SIMD_FLOAT) || elem_size == 3)
4530 			op |= EX86_PREF_66;
4531 		break;
4532 	case SLJIT_SIMD_OP2_XOR:
4533 		op = (type & SLJIT_SIMD_FLOAT) ? XORPD_x_xm : PXOR_x_xm;
4534 
4535 		if (!(type & SLJIT_SIMD_FLOAT) || elem_size == 3)
4536 			op |= EX86_PREF_66;
4537 		break;
4538 	}
4539 
4540 	if (type & SLJIT_SIMD_TEST)
4541 		return SLJIT_SUCCESS;
4542 
4543 	needs_move = dst_freg != src1_freg && dst_freg != src2_freg;
4544 
4545 	if (reg_size == 5 || (needs_move && (cpu_feature_list & CPU_FEATURE_AVX2))) {
4546 		if (reg_size == 5)
4547 			op |= VEX_256;
4548 
4549 		return emit_vex_instruction(compiler, op | EX86_SSE2 | VEX_SSE2_OPV, dst_freg, src1_freg, src2_freg, 0);
4550 	}
4551 
4552 	if (needs_move) {
4553 		FAIL_IF(emit_simd_mov(compiler, type, dst_freg, src1_freg));
4554 	} else if (dst_freg != src1_freg) {
4555 		SLJIT_ASSERT(dst_freg == src2_freg);
4556 		src2_freg = src1_freg;
4557 	}
4558 
4559 	FAIL_IF(emit_groupf(compiler, op | EX86_SSE2, dst_freg, src2_freg, 0));
4560 	return SLJIT_SUCCESS;
4561 }
4562 
4563 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_load(struct sljit_compiler *compiler, sljit_s32 op,
4564 	sljit_s32 dst_reg,
4565 	sljit_s32 mem_reg)
4566 {
4567 	CHECK_ERROR();
4568 	CHECK(check_sljit_emit_atomic_load(compiler, op, dst_reg, mem_reg));
4569 
4570 	SLJIT_SKIP_CHECKS(compiler);
4571 	return sljit_emit_op1(compiler, op, dst_reg, 0, SLJIT_MEM1(mem_reg), 0);
4572 }
4573 
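/* Together with sljit_emit_atomic_store() below this forms a
   compare-and-swap pair. A minimal usage sketch, assuming the public
   sljit API (SLJIT_SET_ATOMIC_STORED and SLJIT_ATOMIC_NOT_STORED are
   taken from sljitLir.h) and that SLJIT_R2 holds the address:

	struct sljit_label *retry = sljit_emit_label(compiler);
	sljit_emit_atomic_load(compiler, SLJIT_MOV32, SLJIT_R0, SLJIT_R2);
	sljit_emit_op2(compiler, SLJIT_ADD32, SLJIT_R1, 0, SLJIT_R0, 0, SLJIT_IMM, 1);
	sljit_emit_atomic_store(compiler, SLJIT_MOV32 | SLJIT_SET_ATOMIC_STORED,
		SLJIT_R1, SLJIT_R2, SLJIT_R0);
	sljit_set_label(sljit_emit_jump(compiler, SLJIT_ATOMIC_NOT_STORED), retry);
*/
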
4574 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_store(struct sljit_compiler *compiler, sljit_s32 op,
4575 	sljit_s32 src_reg,
4576 	sljit_s32 mem_reg,
4577 	sljit_s32 temp_reg)
4578 {
4579 	sljit_uw pref;
4580 	sljit_s32 free_reg = TMP_REG1;
4581 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
4582 	sljit_sw srcw = 0;
4583 	sljit_sw tempw = 0;
4584 #endif /* SLJIT_CONFIG_X86_32 */
4585 
4586 	CHECK_ERROR();
4587 	CHECK(check_sljit_emit_atomic_store(compiler, op, src_reg, mem_reg, temp_reg));
4588 	CHECK_EXTRA_REGS(src_reg, srcw, (void)0);
4589 	CHECK_EXTRA_REGS(temp_reg, tempw, (void)0);
4590 
4591 	SLJIT_ASSERT(FAST_IS_REG(src_reg) || src_reg == SLJIT_MEM1(SLJIT_SP));
4592 	SLJIT_ASSERT(FAST_IS_REG(temp_reg) || temp_reg == SLJIT_MEM1(SLJIT_SP));
4593 
4594 	op = GET_OPCODE(op);
4595 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
4596 	if ((src_reg & SLJIT_MEM) || (op == SLJIT_MOV_U8 && reg_map[src_reg] >= 4)) {
4597 		/* Src is a virtual register or its low byte is not accessible. */
4598 		SLJIT_ASSERT(src_reg != SLJIT_R1);
4599 		free_reg = src_reg;
4600 
4601 		EMIT_MOV(compiler, TMP_REG1, 0, src_reg, srcw);
4602 		src_reg = TMP_REG1;
4603 
4604 		if (mem_reg == src_reg)
4605 			mem_reg = TMP_REG1;
4606 	}
4607 #endif /* SLJIT_CONFIG_X86_32 */
4608 
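	/* CMPXCHG implicitly compares with and updates EAX/RAX, so the
	   expected value in temp_reg is staged in R0 while the previous
	   contents of R0 are parked in free_reg. */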
4609 	if (temp_reg != SLJIT_R0) {
4610 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
4611 		compiler->mode32 = 0;
4612 
4613 		EMIT_MOV(compiler, free_reg, 0, SLJIT_R0, 0);
4614 		EMIT_MOV(compiler, SLJIT_R0, 0, temp_reg, 0);
4615 
4616 		if (src_reg == SLJIT_R0)
4617 			src_reg = free_reg;
4618 		if (mem_reg == SLJIT_R0)
4619 			mem_reg = free_reg;
4620 #else /* !SLJIT_CONFIG_X86_64 */
4621 		if (src_reg == TMP_REG1 && mem_reg == SLJIT_R0 && (free_reg & SLJIT_MEM)) {
4622 			EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), 0, SLJIT_R1, 0);
4623 			EMIT_MOV(compiler, SLJIT_R1, 0, SLJIT_R0, 0);
4624 			EMIT_MOV(compiler, SLJIT_R0, 0, temp_reg, tempw);
4625 
4626 			mem_reg = SLJIT_R1;
4627 			free_reg = SLJIT_R1;
4628 		} else {
4629 			EMIT_MOV(compiler, free_reg, 0, SLJIT_R0, 0);
4630 			EMIT_MOV(compiler, SLJIT_R0, 0, temp_reg, tempw);
4631 
4632 			if (src_reg == SLJIT_R0)
4633 				src_reg = free_reg;
4634 			if (mem_reg == SLJIT_R0)
4635 				mem_reg = free_reg;
4636 		}
4637 #endif /* SLJIT_CONFIG_X86_64 */
4638 	}
4639 
4640 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
4641 	compiler->mode32 = op != SLJIT_MOV && op != SLJIT_MOV_P;
4642 #endif /* SLJIT_CONFIG_X86_64 */
4643 
4644 	/* Lock prefix. */
4645 	FAIL_IF(emit_byte(compiler, GROUP_LOCK));
4646 
4647 	pref = 0;
4648 	if (op == SLJIT_MOV_U16)
4649 		pref = EX86_HALF_ARG | EX86_PREF_66;
4650 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
4651 	if (op == SLJIT_MOV_U8)
4652 		pref = EX86_REX;
4653 #endif /* SLJIT_CONFIG_X86_64 */
4654 
4655 	FAIL_IF(emit_groupf(compiler, (op == SLJIT_MOV_U8 ? CMPXCHG_rm8_r : CMPXCHG_rm_r) | pref, src_reg, SLJIT_MEM1(mem_reg), 0));
4656 
4657 	if (temp_reg != SLJIT_R0) {
4658 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
4659 		compiler->mode32 = 0;
4660 		return emit_mov(compiler, SLJIT_R0, 0, TMP_REG1, 0);
4661 #else /* !SLJIT_CONFIG_X86_64 */
4662 		EMIT_MOV(compiler, SLJIT_R0, 0, free_reg, 0);
4663 		if (free_reg != TMP_REG1)
4664 			return emit_mov(compiler, free_reg, 0, (free_reg == SLJIT_R1) ? SLJIT_MEM1(SLJIT_SP) : TMP_REG1, 0);
4665 #endif /* SLJIT_CONFIG_X86_64 */
4666 	}
4667 	return SLJIT_SUCCESS;
4668 }
4669 
4670 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_local_base(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw offset)
4671 {
4672 	CHECK_ERROR();
4673 	CHECK(check_sljit_get_local_base(compiler, dst, dstw, offset));
4674 	ADJUST_LOCAL_OFFSET(dst, dstw);
4675 
4676 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
4677 
4678 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
4679 	compiler->mode32 = 0;
4680 #endif
4681 
4682 	ADJUST_LOCAL_OFFSET(SLJIT_MEM1(SLJIT_SP), offset);
4683 
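	/* LEA displacements are 32 bit signed, so on x86-64 larger offsets
	   must be materialized in a register first. */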
4684 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
4685 	if (NOT_HALFWORD(offset)) {
4686 		FAIL_IF(emit_load_imm64(compiler, TMP_REG1, offset));
4687 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
4688 		SLJIT_ASSERT(emit_lea_binary(compiler, dst, dstw, SLJIT_SP, 0, TMP_REG1, 0) != SLJIT_ERR_UNSUPPORTED);
4689 		return compiler->error;
4690 #else
4691 		return emit_lea_binary(compiler, dst, dstw, SLJIT_SP, 0, TMP_REG1, 0);
4692 #endif
4693 	}
4694 #endif
4695 
4696 	if (offset != 0)
4697 		return emit_lea_binary(compiler, dst, dstw, SLJIT_SP, 0, SLJIT_IMM, offset);
4698 	return emit_mov(compiler, dst, dstw, SLJIT_SP, 0);
4699 }
4700 
4701 SLJIT_API_FUNC_ATTRIBUTE struct sljit_const* sljit_emit_const(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw init_value)
4702 {
4703 	sljit_u8 *inst;
4704 	struct sljit_const *const_;
4705 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
4706 	sljit_s32 reg;
4707 #endif
4708 
4709 	CHECK_ERROR_PTR();
4710 	CHECK_PTR(check_sljit_emit_const(compiler, dst, dstw, init_value));
4711 	ADJUST_LOCAL_OFFSET(dst, dstw);
4712 
4713 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
4714 
4715 	const_ = (struct sljit_const*)ensure_abuf(compiler, sizeof(struct sljit_const));
4716 	PTR_FAIL_IF(!const_);
4717 	set_const(const_, compiler);
4718 
4719 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
4720 	compiler->mode32 = 0;
4721 	reg = FAST_IS_REG(dst) ? dst : TMP_REG1;
4722 
4723 	if (emit_load_imm64(compiler, reg, init_value))
4724 		return NULL;
4725 #else
4726 	if (emit_mov(compiler, dst, dstw, SLJIT_IMM, init_value))
4727 		return NULL;
4728 #endif
4729 
4730 	inst = (sljit_u8*)ensure_buf(compiler, 2);
4731 	PTR_FAIL_IF(!inst);
4732 
4733 	inst[0] = 0;
4734 	inst[1] = 2;
4735 
4736 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
4737 	if (dst & SLJIT_MEM)
4738 		if (emit_mov(compiler, dst, dstw, TMP_REG1, 0))
4739 			return NULL;
4740 #endif
4741 
4742 	return const_;
4743 }
4744 
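/* A patching sketch for the function above, assuming sljit_get_const_addr()
   from sljitLir.h and that sljit_generate_code() has already run (which
   fills in the recorded address); the value 42 is arbitrary:

	struct sljit_const *c = sljit_emit_const(compiler, SLJIT_R0, 0, 0);
	...
	sljit_set_const(sljit_get_const_addr(c), 42, executable_offset);
*/
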
4745 SLJIT_API_FUNC_ATTRIBUTE struct sljit_put_label* sljit_emit_put_label(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw)
4746 {
4747 	struct sljit_put_label *put_label;
4748 	sljit_u8 *inst;
4749 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
4750 	sljit_s32 reg;
4751 	sljit_uw start_size;
4752 #endif
4753 
4754 	CHECK_ERROR_PTR();
4755 	CHECK_PTR(check_sljit_emit_put_label(compiler, dst, dstw));
4756 	ADJUST_LOCAL_OFFSET(dst, dstw);
4757 
4758 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
4759 
4760 	put_label = (struct sljit_put_label*)ensure_abuf(compiler, sizeof(struct sljit_put_label));
4761 	PTR_FAIL_IF(!put_label);
4762 	set_put_label(put_label, compiler, 0);
4763 
4764 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
4765 	compiler->mode32 = 0;
4766 	reg = FAST_IS_REG(dst) ? dst : TMP_REG1;
4767 
4768 	if (emit_load_imm64(compiler, reg, 0))
4769 		return NULL;
4770 #else
4771 	if (emit_mov(compiler, dst, dstw, SLJIT_IMM, 0))
4772 		return NULL;
4773 #endif
4774 
4775 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
4776 	if (dst & SLJIT_MEM) {
4777 		start_size = compiler->size;
4778 		if (emit_mov(compiler, dst, dstw, TMP_REG1, 0))
4779 			return NULL;
4780 		put_label->flags = compiler->size - start_size;
4781 	}
4782 #endif
4783 
4784 	inst = (sljit_u8*)ensure_buf(compiler, 2);
4785 	PTR_FAIL_IF(!inst);
4786 
4787 	inst[0] = 0;
4788 	inst[1] = 3;
4789 
4790 	return put_label;
4791 }
4792 
4793 SLJIT_API_FUNC_ATTRIBUTE void sljit_set_jump_addr(sljit_uw addr, sljit_uw new_target, sljit_sw executable_offset)
4794 {
4795 	SLJIT_UNUSED_ARG(executable_offset);
4796 
4797 	SLJIT_UPDATE_WX_FLAGS((void*)addr, (void*)(addr + sizeof(sljit_uw)), 0);
4798 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
4799 	sljit_unaligned_store_sw((void*)addr, (sljit_sw)(new_target - (addr + 4) - (sljit_uw)executable_offset));
4800 #else
4801 	sljit_unaligned_store_sw((void*)addr, (sljit_sw)new_target);
4802 #endif
4803 	SLJIT_UPDATE_WX_FLAGS((void*)addr, (void*)(addr + sizeof(sljit_uw)), 1);
4804 }
4805 
4806 SLJIT_API_FUNC_ATTRIBUTE void sljit_set_const(sljit_uw addr, sljit_sw new_constant, sljit_sw executable_offset)
4807 {
4808 	SLJIT_UNUSED_ARG(executable_offset);
4809 
4810 	SLJIT_UPDATE_WX_FLAGS((void*)addr, (void*)(addr + sizeof(sljit_sw)), 0);
4811 	sljit_unaligned_store_sw((void*)addr, new_constant);
4812 	SLJIT_UPDATE_WX_FLAGS((void*)addr, (void*)(addr + sizeof(sljit_sw)), 1);
4813 }
4814