1 /*
2  *    Stack-less Just-In-Time compiler
3  *
4  *    Copyright Zoltan Herczeg (hzmester@freemail.hu). All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without modification, are
7  * permitted provided that the following conditions are met:
8  *
9  *   1. Redistributions of source code must retain the above copyright notice, this list of
10  *      conditions and the following disclaimer.
11  *
12  *   2. Redistributions in binary form must reproduce the above copyright notice, this list
13  *      of conditions and the following disclaimer in the documentation and/or other materials
14  *      provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) AND CONTRIBUTORS ``AS IS'' AND ANY
17  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
19  * SHALL THE COPYRIGHT HOLDER(S) OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
21  * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
22  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
24  * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  */
26 
sljit_get_platform_name(void)27 SLJIT_API_FUNC_ATTRIBUTE const char* sljit_get_platform_name(void)
28 {
29 	return "x86" SLJIT_CPUINFO;
30 }
31 
32 /*
33    32b register indexes:
34      0 - EAX
35      1 - ECX
36      2 - EDX
37      3 - EBX
38      4 - ESP
39      5 - EBP
40      6 - ESI
41      7 - EDI
42 */
43 
44 /*
45    64b register indexes:
46      0 - RAX
47      1 - RCX
48      2 - RDX
49      3 - RBX
50      4 - RSP
51      5 - RBP
52      6 - RSI
53      7 - RDI
54      8 - R8   - From now on REX prefix is required
55      9 - R9
56     10 - R10
57     11 - R11
58     12 - R12
59     13 - R13
60     14 - R14
61     15 - R15
62 */
63 
64 #define TMP_REG1	(SLJIT_NUMBER_OF_REGISTERS + 2)
65 #define TMP_FREG	(SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1)
66 
67 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
68 
69 static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 3] = {
70 	0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 5, 7, 6, 4, 3
71 };
72 
73 static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2] = {
74 	0, 1, 2, 3, 4, 5, 6, 7, 0
75 };
76 
77 #define CHECK_EXTRA_REGS(p, w, do) \
78 	if (p >= SLJIT_R3 && p <= SLJIT_S3) { \
79 		w = (2 * SSIZE_OF(sw)) + ((p) - SLJIT_R3) * SSIZE_OF(sw); \
80 		p = SLJIT_MEM1(SLJIT_SP); \
81 		do; \
82 	}
83 
84 #else /* SLJIT_CONFIG_X86_32 */
85 
86 #define TMP_REG2	(SLJIT_NUMBER_OF_REGISTERS + 3)
87 
88 /* Note: r12 & 0x7 == 0b100, which decoded as SIB byte present
89    Note: avoid to use r12 and r13 for memory addressing
90    therefore r12 is better to be a higher saved register. */
91 #ifndef _WIN64
92 /* Args: rdi(=7), rsi(=6), rdx(=2), rcx(=1), r8, r9. Scratches: rax(=0), r10, r11 */
93 static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 4] = {
94 	0, 0, 6, 7, 1, 8, 11, 10, 12, 5, 13, 14, 15, 3, 4, 2, 9
95 };
96 /* low-map. reg_map & 0x7. */
97 static const sljit_u8 reg_lmap[SLJIT_NUMBER_OF_REGISTERS + 4] = {
98 	0, 0, 6, 7, 1, 0,  3,  2,  4, 5,  5,  6,  7, 3, 4, 2, 1
99 };
100 #else
101 /* Args: rcx(=1), rdx(=2), r8, r9. Scratches: rax(=0), r10, r11 */
102 static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 4] = {
103 	0, 0, 2, 8, 1, 11, 12, 5, 13, 14, 15, 7, 6, 3, 4, 9, 10
104 };
105 /* low-map. reg_map & 0x7. */
106 static const sljit_u8 reg_lmap[SLJIT_NUMBER_OF_REGISTERS + 4] = {
107 	0, 0, 2, 0, 1,  3,  4, 5,  5,  6,  7, 7, 6, 3, 4, 1,  2
108 };
109 #endif
110 
111 /* Args: xmm0-xmm3 */
112 static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2] = {
113 	0, 0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4
114 };
115 /* low-map. freg_map & 0x7. */
116 static const sljit_u8 freg_lmap[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2] = {
117 	0, 0, 1, 2, 3, 5, 6, 7, 0, 1,  2,  3,  4,  5,  6,  7, 4
118 };
119 
120 #define REX_W		0x48
121 #define REX_R		0x44
122 #define REX_X		0x42
123 #define REX_B		0x41
124 #define REX		0x40
125 
126 #ifndef _WIN64
127 #define HALFWORD_MAX 0x7fffffffl
128 #define HALFWORD_MIN -0x80000000l
129 #else
130 #define HALFWORD_MAX 0x7fffffffll
131 #define HALFWORD_MIN -0x80000000ll
132 #endif
133 
134 #define IS_HALFWORD(x)		((x) <= HALFWORD_MAX && (x) >= HALFWORD_MIN)
135 #define NOT_HALFWORD(x)		((x) > HALFWORD_MAX || (x) < HALFWORD_MIN)
136 
137 #define CHECK_EXTRA_REGS(p, w, do)
138 
139 #endif /* SLJIT_CONFIG_X86_32 */
140 
141 #define U8(v)			((sljit_u8)(v))
142 
143 /* Size flags for emit_x86_instruction: */
144 #define EX86_BIN_INS		((sljit_uw)0x000010)
145 #define EX86_SHIFT_INS		((sljit_uw)0x000020)
146 #define EX86_BYTE_ARG		((sljit_uw)0x000040)
147 #define EX86_HALF_ARG		((sljit_uw)0x000080)
148 /* Size flags for both emit_x86_instruction and emit_vex_instruction: */
149 #define EX86_REX		((sljit_uw)0x000100)
150 #define EX86_NO_REXW		((sljit_uw)0x000200)
151 #define EX86_PREF_66		((sljit_uw)0x000400)
152 #define EX86_PREF_F2		((sljit_uw)0x000800)
153 #define EX86_PREF_F3		((sljit_uw)0x001000)
154 #define EX86_SSE2_OP1		((sljit_uw)0x002000)
155 #define EX86_SSE2_OP2		((sljit_uw)0x004000)
156 #define EX86_SSE2		(EX86_SSE2_OP1 | EX86_SSE2_OP2)
157 #define EX86_VEX_EXT		((sljit_uw)0x008000)
158 /* Op flags for emit_vex_instruction: */
159 #define VEX_OP_0F38		((sljit_uw)0x010000)
160 #define VEX_OP_0F3A		((sljit_uw)0x020000)
161 #define VEX_SSE2_OPV		((sljit_uw)0x040000)
162 #define VEX_AUTO_W		((sljit_uw)0x080000)
163 #define VEX_W			((sljit_uw)0x100000)
164 #define VEX_256			((sljit_uw)0x200000)
165 
166 #define EX86_SELECT_66(op)	(((op) & SLJIT_32) ? 0 : EX86_PREF_66)
167 #define EX86_SELECT_F2_F3(op)	(((op) & SLJIT_32) ? EX86_PREF_F3 : EX86_PREF_F2)
168 
169 /* --------------------------------------------------------------------- */
170 /*  Instruction forms                                                    */
171 /* --------------------------------------------------------------------- */
172 
173 #define ADD			(/* BINARY */ 0 << 3)
174 #define ADD_EAX_i32		0x05
175 #define ADD_r_rm		0x03
176 #define ADD_rm_r		0x01
177 #define ADDSD_x_xm		0x58
178 #define ADC			(/* BINARY */ 2 << 3)
179 #define ADC_EAX_i32		0x15
180 #define ADC_r_rm		0x13
181 #define ADC_rm_r		0x11
182 #define AND			(/* BINARY */ 4 << 3)
183 #define AND_EAX_i32		0x25
184 #define AND_r_rm		0x23
185 #define AND_rm_r		0x21
186 #define ANDPD_x_xm		0x54
187 #define BSR_r_rm		(/* GROUP_0F */ 0xbd)
188 #define BSF_r_rm		(/* GROUP_0F */ 0xbc)
189 #define BSWAP_r			(/* GROUP_0F */ 0xc8)
190 #define CALL_i32		0xe8
191 #define CALL_rm			(/* GROUP_FF */ 2 << 3)
192 #define CDQ			0x99
193 #define CMOVE_r_rm		(/* GROUP_0F */ 0x44)
194 #define CMP			(/* BINARY */ 7 << 3)
195 #define CMP_EAX_i32		0x3d
196 #define CMP_r_rm		0x3b
197 #define CMP_rm_r		0x39
198 #define CMPS_x_xm		0xc2
199 #define CMPXCHG_rm_r		0xb1
200 #define CMPXCHG_rm8_r		0xb0
201 #define CVTPD2PS_x_xm		0x5a
202 #define CVTPS2PD_x_xm		0x5a
203 #define CVTSI2SD_x_rm		0x2a
204 #define CVTTSD2SI_r_xm		0x2c
205 #define DIV			(/* GROUP_F7 */ 6 << 3)
206 #define DIVSD_x_xm		0x5e
207 #define EXTRACTPS_x_xm		0x17
208 #define FLDS			0xd9
209 #define FLDL			0xdd
210 #define FSTPS			0xd9
211 #define FSTPD			0xdd
212 #define INSERTPS_x_xm		0x21
213 #define INT3			0xcc
214 #define IDIV			(/* GROUP_F7 */ 7 << 3)
215 #define IMUL			(/* GROUP_F7 */ 5 << 3)
216 #define IMUL_r_rm		(/* GROUP_0F */ 0xaf)
217 #define IMUL_r_rm_i8		0x6b
218 #define IMUL_r_rm_i32		0x69
219 #define JL_i8			0x7c
220 #define JE_i8			0x74
221 #define JNC_i8			0x73
222 #define JNE_i8			0x75
223 #define JMP_i8			0xeb
224 #define JMP_i32			0xe9
225 #define JMP_rm			(/* GROUP_FF */ 4 << 3)
226 #define LEA_r_m			0x8d
227 #define LOOP_i8			0xe2
228 #define LZCNT_r_rm		(/* GROUP_F3 */ /* GROUP_0F */ 0xbd)
229 #define MOV_r_rm		0x8b
230 #define MOV_r_i32		0xb8
231 #define MOV_rm_r		0x89
232 #define MOV_rm_i32		0xc7
233 #define MOV_rm8_i8		0xc6
234 #define MOV_rm8_r8		0x88
235 #define MOVAPS_x_xm		0x28
236 #define MOVAPS_xm_x		0x29
237 #define MOVD_x_rm		0x6e
238 #define MOVD_rm_x		0x7e
239 #define MOVDDUP_x_xm		0x12
240 #define MOVDQA_x_xm		0x6f
241 #define MOVDQA_xm_x		0x7f
242 #define MOVHLPS_x_x		0x12
243 #define MOVHPD_m_x		0x17
244 #define MOVHPD_x_m		0x16
245 #define MOVLHPS_x_x		0x16
246 #define MOVLPD_m_x		0x13
247 #define MOVLPD_x_m		0x12
248 #define MOVMSKPS_r_x		(/* GROUP_0F */ 0x50)
249 #define MOVQ_x_xm		(/* GROUP_0F */ 0x7e)
250 #define MOVSD_x_xm		0x10
251 #define MOVSD_xm_x		0x11
252 #define MOVSHDUP_x_xm		0x16
253 #define MOVSXD_r_rm		0x63
254 #define MOVSX_r_rm8		(/* GROUP_0F */ 0xbe)
255 #define MOVSX_r_rm16		(/* GROUP_0F */ 0xbf)
256 #define MOVUPS_x_xm		0x10
257 #define MOVZX_r_rm8		(/* GROUP_0F */ 0xb6)
258 #define MOVZX_r_rm16		(/* GROUP_0F */ 0xb7)
259 #define MUL			(/* GROUP_F7 */ 4 << 3)
260 #define MULSD_x_xm		0x59
261 #define NEG_rm			(/* GROUP_F7 */ 3 << 3)
262 #define NOP			0x90
263 #define NOT_rm			(/* GROUP_F7 */ 2 << 3)
264 #define OR			(/* BINARY */ 1 << 3)
265 #define OR_r_rm			0x0b
266 #define OR_EAX_i32		0x0d
267 #define OR_rm_r			0x09
268 #define OR_rm8_r8		0x08
269 #define ORPD_x_xm		0x56
270 #define PACKSSWB_x_xm		(/* GROUP_0F */ 0x63)
271 #define PAND_x_xm		0xdb
272 #define PCMPEQD_x_xm		0x76
273 #define PINSRB_x_rm_i8		0x20
274 #define PINSRW_x_rm_i8		0xc4
275 #define PINSRD_x_rm_i8		0x22
276 #define PEXTRB_rm_x_i8		0x14
277 #define PEXTRW_rm_x_i8		0x15
278 #define PEXTRD_rm_x_i8		0x16
279 #define PMOVMSKB_r_x		(/* GROUP_0F */ 0xd7)
280 #define PMOVSXBD_x_xm		0x21
281 #define PMOVSXBQ_x_xm		0x22
282 #define PMOVSXBW_x_xm		0x20
283 #define PMOVSXDQ_x_xm		0x25
284 #define PMOVSXWD_x_xm		0x23
285 #define PMOVSXWQ_x_xm		0x24
286 #define PMOVZXBD_x_xm		0x31
287 #define PMOVZXBQ_x_xm		0x32
288 #define PMOVZXBW_x_xm		0x30
289 #define PMOVZXDQ_x_xm		0x35
290 #define PMOVZXWD_x_xm		0x33
291 #define PMOVZXWQ_x_xm		0x34
292 #define POP_r			0x58
293 #define POP_rm			0x8f
294 #define POPF			0x9d
295 #define POR_x_xm		0xeb
296 #define PREFETCH		0x18
297 #define PSHUFB_x_xm		0x00
298 #define PSHUFD_x_xm		0x70
299 #define PSHUFLW_x_xm		0x70
300 #define PSRLDQ_x		0x73
301 #define PSLLD_x_i8		0x72
302 #define PSLLQ_x_i8		0x73
303 #define PUSH_i32		0x68
304 #define PUSH_r			0x50
305 #define PUSH_rm			(/* GROUP_FF */ 6 << 3)
306 #define PUSHF			0x9c
307 #define PXOR_x_xm		0xef
308 #define ROL			(/* SHIFT */ 0 << 3)
309 #define ROR			(/* SHIFT */ 1 << 3)
310 #define RET_near		0xc3
311 #define RET_i16			0xc2
312 #define SBB			(/* BINARY */ 3 << 3)
313 #define SBB_EAX_i32		0x1d
314 #define SBB_r_rm		0x1b
315 #define SBB_rm_r		0x19
316 #define SAR			(/* SHIFT */ 7 << 3)
317 #define SHL			(/* SHIFT */ 4 << 3)
318 #define SHLD			(/* GROUP_0F */ 0xa5)
319 #define SHRD			(/* GROUP_0F */ 0xad)
320 #define SHR			(/* SHIFT */ 5 << 3)
321 #define SHUFPS_x_xm		0xc6
322 #define SUB			(/* BINARY */ 5 << 3)
323 #define SUB_EAX_i32		0x2d
324 #define SUB_r_rm		0x2b
325 #define SUB_rm_r		0x29
326 #define SUBSD_x_xm		0x5c
327 #define TEST_EAX_i32		0xa9
328 #define TEST_rm_r		0x85
329 #define TZCNT_r_rm		(/* GROUP_F3 */ /* GROUP_0F */ 0xbc)
330 #define UCOMISD_x_xm		0x2e
331 #define UNPCKLPD_x_xm		0x14
332 #define UNPCKLPS_x_xm		0x14
333 #define VBROADCASTSD_x_xm	0x19
334 #define VBROADCASTSS_x_xm	0x18
335 #define VEXTRACTF128_x_ym	0x19
336 #define VEXTRACTI128_x_ym	0x39
337 #define VINSERTF128_y_y_xm	0x18
338 #define VINSERTI128_y_y_xm	0x38
339 #define VPBROADCASTB_x_xm	0x78
340 #define VPBROADCASTD_x_xm	0x58
341 #define VPBROADCASTQ_x_xm	0x59
342 #define VPBROADCASTW_x_xm	0x79
343 #define VPERMPD_y_ym		0x01
344 #define VPERMQ_y_ym		0x00
345 #define XCHG_EAX_r		0x90
346 #define XCHG_r_rm		0x87
347 #define XOR			(/* BINARY */ 6 << 3)
348 #define XOR_EAX_i32		0x35
349 #define XOR_r_rm		0x33
350 #define XOR_rm_r		0x31
351 #define XORPD_x_xm		0x57
352 
353 #define GROUP_0F		0x0f
354 #define GROUP_66		0x66
355 #define GROUP_F3		0xf3
356 #define GROUP_F7		0xf7
357 #define GROUP_FF		0xff
358 #define GROUP_BINARY_81		0x81
359 #define GROUP_BINARY_83		0x83
360 #define GROUP_SHIFT_1		0xd1
361 #define GROUP_SHIFT_N		0xc1
362 #define GROUP_SHIFT_CL		0xd3
363 #define GROUP_LOCK		0xf0
364 
365 #define MOD_REG			0xc0
366 #define MOD_DISP8		0x40
367 
368 #define INC_SIZE(s)		(*inst++ = U8(s), compiler->size += (s))
369 
370 #define PUSH_REG(r)		(*inst++ = U8(PUSH_r + (r)))
371 #define POP_REG(r)		(*inst++ = U8(POP_r + (r)))
372 #define RET()			(*inst++ = RET_near)
373 #define RET_I16(n)		(*inst++ = RET_i16, *inst++ = U8(n), *inst++ = 0)
374 
375 #define SLJIT_INST_LABEL	255
376 #define SLJIT_INST_JUMP		254
377 #define SLJIT_INST_MOV_ADDR	253
378 #define SLJIT_INST_CONST	252
379 
380 /* Multithreading does not affect these static variables, since they store
381    built-in CPU features. Therefore they can be overwritten by different threads
382    if they detect the CPU features in the same time. */
383 #define CPU_FEATURE_DETECTED		0x001
384 #if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
385 #define CPU_FEATURE_SSE2		0x002
386 #endif
387 #define CPU_FEATURE_SSE41		0x004
388 #define CPU_FEATURE_LZCNT		0x008
389 #define CPU_FEATURE_TZCNT		0x010
390 #define CPU_FEATURE_CMOV		0x020
391 #define CPU_FEATURE_AVX			0x040
392 #define CPU_FEATURE_AVX2		0x080
393 #define CPU_FEATURE_OSXSAVE		0x100
394 
395 static sljit_u32 cpu_feature_list = 0;
396 
397 #ifdef _WIN32_WCE
398 #include <cmnintrin.h>
399 #elif defined(_MSC_VER) && _MSC_VER >= 1400
400 #include <intrin.h>
401 #endif
402 
403 /******************************************************/
404 /*    Unaligned-store functions                       */
405 /******************************************************/
406 
sljit_unaligned_store_s16(void * addr,sljit_s16 value)407 static SLJIT_INLINE void sljit_unaligned_store_s16(void *addr, sljit_s16 value)
408 {
409 	SLJIT_MEMCPY(addr, &value, sizeof(value));
410 }
411 
sljit_unaligned_store_s32(void * addr,sljit_s32 value)412 static SLJIT_INLINE void sljit_unaligned_store_s32(void *addr, sljit_s32 value)
413 {
414 	SLJIT_MEMCPY(addr, &value, sizeof(value));
415 }
416 
sljit_unaligned_store_sw(void * addr,sljit_sw value)417 static SLJIT_INLINE void sljit_unaligned_store_sw(void *addr, sljit_sw value)
418 {
419 	SLJIT_MEMCPY(addr, &value, sizeof(value));
420 }
421 
422 /******************************************************/
423 /*    Utility functions                               */
424 /******************************************************/
425 
execute_cpu_id(sljit_u32 info[4])426 static void execute_cpu_id(sljit_u32 info[4])
427 {
428 #if defined(_MSC_VER) && _MSC_VER >= 1400
429 
430 	__cpuidex((int*)info, (int)info[0], (int)info[2]);
431 
432 #elif defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__SUNPRO_C) || defined(__TINYC__)
433 
434 	/* AT&T syntax. */
435 	__asm__ (
436 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
437 		"movl %0, %%esi\n"
438 		"movl (%%esi), %%eax\n"
439 		"movl 8(%%esi), %%ecx\n"
440 		"pushl %%ebx\n"
441 		"cpuid\n"
442 		"movl %%eax, (%%esi)\n"
443 		"movl %%ebx, 4(%%esi)\n"
444 		"popl %%ebx\n"
445 		"movl %%ecx, 8(%%esi)\n"
446 		"movl %%edx, 12(%%esi)\n"
447 #else /* !SLJIT_CONFIG_X86_32 */
448 		"movq %0, %%rsi\n"
449 		"movl (%%rsi), %%eax\n"
450 		"movl 8(%%rsi), %%ecx\n"
451 		"cpuid\n"
452 		"movl %%eax, (%%rsi)\n"
453 		"movl %%ebx, 4(%%rsi)\n"
454 		"movl %%ecx, 8(%%rsi)\n"
455 		"movl %%edx, 12(%%rsi)\n"
456 #endif /* SLJIT_CONFIG_X86_32 */
457 		:
458 		: "r" (info)
459 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
460 		: "memory", "eax", "ecx", "edx", "esi"
461 #else /* !SLJIT_CONFIG_X86_32 */
462 		: "memory", "rax", "rbx", "rcx", "rdx", "rsi"
463 #endif /* SLJIT_CONFIG_X86_32 */
464 	);
465 
466 #else /* _MSC_VER < 1400 */
467 
468 	/* Intel syntax. */
469 	__asm {
470 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
471 		mov esi, info
472 		mov eax, [esi]
473 		mov ecx, [esi + 8]
474 		cpuid
475 		mov [esi], eax
476 		mov [esi + 4], ebx
477 		mov [esi + 8], ecx
478 		mov [esi + 12], edx
479 #else /* !SLJIT_CONFIG_X86_32 */
480 		mov rsi, info
481 		mov eax, [rsi]
482 		mov ecx, [rsi + 8]
483 		cpuid
484 		mov [rsi], eax
485 		mov [rsi + 4], ebx
486 		mov [rsi + 8], ecx
487 		mov [rsi + 12], edx
488 #endif /* SLJIT_CONFIG_X86_32 */
489 	}
490 
491 #endif /* _MSC_VER && _MSC_VER >= 1400 */
492 }
493 
execute_get_xcr0_low(void)494 static sljit_u32 execute_get_xcr0_low(void)
495 {
496 	sljit_u32 xcr0;
497 
498 #if defined(_MSC_VER) && _MSC_VER >= 1400
499 
500 	xcr0 = (sljit_u32)_xgetbv(0);
501 
502 #elif defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__SUNPRO_C) || defined(__TINYC__)
503 
504 	/* AT&T syntax. */
505 	__asm__ (
506 		"xorl %%ecx, %%ecx\n"
507 		"xgetbv\n"
508 		: "=a" (xcr0)
509 		:
510 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
511 		: "ecx", "edx"
512 #else /* !SLJIT_CONFIG_X86_32 */
513 		: "rcx", "rdx"
514 #endif /* SLJIT_CONFIG_X86_32 */
515 	);
516 
517 #else /* _MSC_VER < 1400 */
518 
519 	/* Intel syntax. */
520 	__asm {
521 		mov ecx, 0
522 		xgetbv
523 		mov xcr0, eax
524 	}
525 
526 #endif /* _MSC_VER && _MSC_VER >= 1400 */
527 	return xcr0;
528 }
529 
get_cpu_features(void)530 static void get_cpu_features(void)
531 {
532 	sljit_u32 feature_list = CPU_FEATURE_DETECTED;
533 	sljit_u32 info[4] = {0};
534 	sljit_u32 max_id;
535 
536 	execute_cpu_id(info);
537 	max_id = info[0];
538 
539 	if (max_id >= 7) {
540 		info[0] = 7;
541 		info[2] = 0;
542 		execute_cpu_id(info);
543 
544 		if (info[1] & 0x8)
545 			feature_list |= CPU_FEATURE_TZCNT;
546 		if (info[1] & 0x20)
547 			feature_list |= CPU_FEATURE_AVX2;
548 	}
549 
550 	if (max_id >= 1) {
551 		info[0] = 1;
552 		execute_cpu_id(info);
553 
554 		if (info[2] & 0x80000)
555 			feature_list |= CPU_FEATURE_SSE41;
556 		if (info[2] & 0x8000000)
557 			feature_list |= CPU_FEATURE_OSXSAVE;
558 		if (info[2] & 0x10000000)
559 			feature_list |= CPU_FEATURE_AVX;
560 #if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
561 		if (info[3] & 0x4000000)
562 			feature_list |= CPU_FEATURE_SSE2;
563 #endif
564 		if (info[3] & 0x8000)
565 			feature_list |= CPU_FEATURE_CMOV;
566 	}
567 
568 	info[0] = 0x80000001;
569 	execute_cpu_id(info);
570 
571 	if (info[2] & 0x20)
572 		feature_list |= CPU_FEATURE_LZCNT;
573 
574 	if ((feature_list & CPU_FEATURE_OSXSAVE) && (execute_get_xcr0_low() & 0x4) == 0)
575 		feature_list &= ~(sljit_u32)(CPU_FEATURE_AVX | CPU_FEATURE_AVX2);
576 
577 	cpu_feature_list = feature_list;
578 }
579 
get_jump_code(sljit_uw type)580 static sljit_u8 get_jump_code(sljit_uw type)
581 {
582 	switch (type) {
583 	case SLJIT_EQUAL:
584 	case SLJIT_ATOMIC_STORED:
585 	case SLJIT_F_EQUAL:
586 	case SLJIT_UNORDERED_OR_EQUAL:
587 		return 0x84 /* je */;
588 
589 	case SLJIT_NOT_EQUAL:
590 	case SLJIT_ATOMIC_NOT_STORED:
591 	case SLJIT_F_NOT_EQUAL:
592 	case SLJIT_ORDERED_NOT_EQUAL:
593 		return 0x85 /* jne */;
594 
595 	case SLJIT_LESS:
596 	case SLJIT_CARRY:
597 	case SLJIT_F_LESS:
598 	case SLJIT_UNORDERED_OR_LESS:
599 	case SLJIT_UNORDERED_OR_GREATER:
600 		return 0x82 /* jc */;
601 
602 	case SLJIT_GREATER_EQUAL:
603 	case SLJIT_NOT_CARRY:
604 	case SLJIT_F_GREATER_EQUAL:
605 	case SLJIT_ORDERED_GREATER_EQUAL:
606 	case SLJIT_ORDERED_LESS_EQUAL:
607 		return 0x83 /* jae */;
608 
609 	case SLJIT_GREATER:
610 	case SLJIT_F_GREATER:
611 	case SLJIT_ORDERED_LESS:
612 	case SLJIT_ORDERED_GREATER:
613 		return 0x87 /* jnbe */;
614 
615 	case SLJIT_LESS_EQUAL:
616 	case SLJIT_F_LESS_EQUAL:
617 	case SLJIT_UNORDERED_OR_GREATER_EQUAL:
618 	case SLJIT_UNORDERED_OR_LESS_EQUAL:
619 		return 0x86 /* jbe */;
620 
621 	case SLJIT_SIG_LESS:
622 		return 0x8c /* jl */;
623 
624 	case SLJIT_SIG_GREATER_EQUAL:
625 		return 0x8d /* jnl */;
626 
627 	case SLJIT_SIG_GREATER:
628 		return 0x8f /* jnle */;
629 
630 	case SLJIT_SIG_LESS_EQUAL:
631 		return 0x8e /* jle */;
632 
633 	case SLJIT_OVERFLOW:
634 		return 0x80 /* jo */;
635 
636 	case SLJIT_NOT_OVERFLOW:
637 		return 0x81 /* jno */;
638 
639 	case SLJIT_UNORDERED:
640 	case SLJIT_ORDERED_EQUAL: /* NaN. */
641 		return 0x8a /* jp */;
642 
643 	case SLJIT_ORDERED:
644 	case SLJIT_UNORDERED_OR_NOT_EQUAL: /* Not NaN. */
645 		return 0x8b /* jpo */;
646 	}
647 	return 0;
648 }
649 
650 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
651 static sljit_u8* detect_far_jump_type(struct sljit_jump *jump, sljit_u8 *code_ptr, sljit_sw executable_offset);
652 #else /* !SLJIT_CONFIG_X86_32 */
653 static sljit_u8* detect_far_jump_type(struct sljit_jump *jump, sljit_u8 *code_ptr);
654 static sljit_u8* generate_mov_addr_code(struct sljit_jump *jump, sljit_u8 *code_ptr, sljit_u8 *code, sljit_sw executable_offset);
655 #endif /* SLJIT_CONFIG_X86_32 */
656 
detect_near_jump_type(struct sljit_jump * jump,sljit_u8 * code_ptr,sljit_u8 * code,sljit_sw executable_offset)657 static sljit_u8* detect_near_jump_type(struct sljit_jump *jump, sljit_u8 *code_ptr, sljit_u8 *code, sljit_sw executable_offset)
658 {
659 	sljit_uw type = jump->flags >> TYPE_SHIFT;
660 	sljit_s32 short_jump;
661 	sljit_uw label_addr;
662 
663 	if (jump->flags & JUMP_ADDR)
664 		label_addr = jump->u.target - (sljit_uw)executable_offset;
665 	else
666 		label_addr = (sljit_uw)(code + jump->u.label->size);
667 
668 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
669 	if ((sljit_sw)(label_addr - (sljit_uw)(code_ptr + 6)) > HALFWORD_MAX || (sljit_sw)(label_addr - (sljit_uw)(code_ptr + 5)) < HALFWORD_MIN)
670 		return detect_far_jump_type(jump, code_ptr);
671 #endif /* SLJIT_CONFIG_X86_64 */
672 
673 	short_jump = (sljit_sw)(label_addr - (sljit_uw)(code_ptr + 2)) >= -0x80 && (sljit_sw)(label_addr - (sljit_uw)(code_ptr + 2)) <= 0x7f;
674 
675 	if (type == SLJIT_JUMP) {
676 		if (short_jump)
677 			*code_ptr++ = JMP_i8;
678 		else
679 			*code_ptr++ = JMP_i32;
680 	} else if (type > SLJIT_JUMP) {
681 		short_jump = 0;
682 		*code_ptr++ = CALL_i32;
683 	} else if (short_jump) {
684 		*code_ptr++ = U8(get_jump_code(type) - 0x10);
685 	} else {
686 		*code_ptr++ = GROUP_0F;
687 		*code_ptr++ = get_jump_code(type);
688 	}
689 
690 	jump->addr = (sljit_uw)code_ptr;
691 
692 	if (short_jump) {
693 		jump->flags |= PATCH_MB;
694 		code_ptr += sizeof(sljit_s8);
695 	} else {
696 		jump->flags |= PATCH_MW;
697 		code_ptr += sizeof(sljit_s32);
698 	}
699 
700 	return code_ptr;
701 }
702 
generate_jump_or_mov_addr(struct sljit_jump * jump,sljit_sw executable_offset)703 static void generate_jump_or_mov_addr(struct sljit_jump *jump, sljit_sw executable_offset)
704 {
705 	sljit_uw flags = jump->flags;
706 	sljit_uw addr = (flags & JUMP_ADDR) ? jump->u.target : jump->u.label->u.addr;
707 	sljit_uw jump_addr = jump->addr;
708 	SLJIT_UNUSED_ARG(executable_offset);
709 
710 	if (SLJIT_UNLIKELY(flags & JUMP_MOV_ADDR)) {
711 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
712 		sljit_unaligned_store_sw((void*)(jump_addr - sizeof(sljit_sw)), (sljit_sw)addr);
713 #else /* SLJIT_CONFIG_X86_32 */
714 		if (flags & PATCH_MD) {
715 			SLJIT_ASSERT(addr > HALFWORD_MAX);
716 			sljit_unaligned_store_sw((void*)(jump_addr - sizeof(sljit_sw)), (sljit_sw)addr);
717 			return;
718 		}
719 
720 		if (flags & PATCH_MW) {
721 			addr -= (sljit_uw)SLJIT_ADD_EXEC_OFFSET((sljit_u8*)jump_addr, executable_offset);
722 			SLJIT_ASSERT((sljit_sw)addr <= HALFWORD_MAX && (sljit_sw)addr >= HALFWORD_MIN);
723 		} else {
724 			SLJIT_ASSERT(addr <= HALFWORD_MAX);
725 		}
726 		sljit_unaligned_store_s32((void*)(jump_addr - sizeof(sljit_s32)), (sljit_s32)addr);
727 #endif /* !SLJIT_CONFIG_X86_32 */
728 		return;
729 	}
730 
731 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
732 	if (SLJIT_UNLIKELY(flags & PATCH_MD)) {
733 		SLJIT_ASSERT(!(flags & JUMP_ADDR));
734 		sljit_unaligned_store_sw((void*)jump_addr, (sljit_sw)addr);
735 		return;
736 	}
737 #endif /* SLJIT_CONFIG_X86_64 */
738 
739 	addr -= (sljit_uw)SLJIT_ADD_EXEC_OFFSET((sljit_u8*)jump_addr, executable_offset);
740 
741 	if (flags & PATCH_MB) {
742 		addr -= sizeof(sljit_s8);
743 		SLJIT_ASSERT((sljit_sw)addr <= 0x7f && (sljit_sw)addr >= -0x80);
744 		*(sljit_u8*)jump_addr = U8(addr);
745 		return;
746 	} else if (flags & PATCH_MW) {
747 		addr -= sizeof(sljit_s32);
748 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
749 		sljit_unaligned_store_sw((void*)jump_addr, (sljit_sw)addr);
750 #else /* !SLJIT_CONFIG_X86_32 */
751 		SLJIT_ASSERT((sljit_sw)addr <= HALFWORD_MAX && (sljit_sw)addr >= HALFWORD_MIN);
752 		sljit_unaligned_store_s32((void*)jump_addr, (sljit_s32)addr);
753 #endif /* SLJIT_CONFIG_X86_32 */
754 	}
755 }
756 
reduce_code_size(struct sljit_compiler * compiler)757 static void reduce_code_size(struct sljit_compiler *compiler)
758 {
759 	struct sljit_label *label;
760 	struct sljit_jump *jump;
761 	sljit_uw next_label_size;
762 	sljit_uw next_jump_addr;
763 	sljit_uw next_min_addr;
764 	sljit_uw size_reduce = 0;
765 	sljit_sw diff;
766 	sljit_uw type;
767 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
768 	sljit_uw size_reduce_max;
769 #endif /* SLJIT_DEBUG */
770 
771 	label = compiler->labels;
772 	jump = compiler->jumps;
773 
774 	next_label_size = SLJIT_GET_NEXT_SIZE(label);
775 	next_jump_addr = SLJIT_GET_NEXT_ADDRESS(jump);
776 
777 	while (1) {
778 		next_min_addr = next_label_size;
779 		if (next_jump_addr < next_min_addr)
780 			next_min_addr = next_jump_addr;
781 
782 		if (next_min_addr == SLJIT_MAX_ADDRESS)
783 			break;
784 
785 		if (next_min_addr == next_label_size) {
786 			label->size -= size_reduce;
787 
788 			label = label->next;
789 			next_label_size = SLJIT_GET_NEXT_SIZE(label);
790 		}
791 
792 		if (next_min_addr != next_jump_addr)
793 			continue;
794 
795 		if (!(jump->flags & JUMP_MOV_ADDR)) {
796 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
797 			size_reduce_max = size_reduce + (((jump->flags >> TYPE_SHIFT) < SLJIT_JUMP) ? CJUMP_MAX_SIZE : JUMP_MAX_SIZE);
798 #endif /* SLJIT_DEBUG */
799 
800 			if (!(jump->flags & SLJIT_REWRITABLE_JUMP)) {
801 				if (jump->flags & JUMP_ADDR) {
802 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
803 					if (jump->u.target <= 0xffffffffl)
804 						size_reduce += sizeof(sljit_s32);
805 #endif /* SLJIT_CONFIG_X86_64 */
806 				} else {
807 					/* Unit size: instruction. */
808 					diff = (sljit_sw)jump->u.label->size - (sljit_sw)(jump->addr - size_reduce);
809 					type = jump->flags >> TYPE_SHIFT;
810 
811 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
812 					if (type == SLJIT_JUMP) {
813 						if (diff <= 0x7f + 2 && diff >= -0x80 + 2)
814 							size_reduce += JUMP_MAX_SIZE - 2;
815 						else if (diff <= HALFWORD_MAX + 5 && diff >= HALFWORD_MIN + 5)
816 							size_reduce += JUMP_MAX_SIZE - 5;
817 					} else if (type < SLJIT_JUMP) {
818 						if (diff <= 0x7f + 2 && diff >= -0x80 + 2)
819 							size_reduce += CJUMP_MAX_SIZE - 2;
820 						else if (diff <= HALFWORD_MAX + 6 && diff >= HALFWORD_MIN + 6)
821 							size_reduce += CJUMP_MAX_SIZE - 6;
822 					} else  {
823 						if (diff <= HALFWORD_MAX + 5 && diff >= HALFWORD_MIN + 5)
824 							size_reduce += JUMP_MAX_SIZE - 5;
825 					}
826 #else /* !SLJIT_CONFIG_X86_64 */
827 					if (type == SLJIT_JUMP) {
828 						if (diff <= 0x7f + 2 && diff >= -0x80 + 2)
829 							size_reduce += JUMP_MAX_SIZE - 2;
830 					} else if (type < SLJIT_JUMP) {
831 						if (diff <= 0x7f + 2 && diff >= -0x80 + 2)
832 							size_reduce += CJUMP_MAX_SIZE - 2;
833 					}
834 #endif /* SLJIT_CONFIG_X86_64 */
835 				}
836 			}
837 
838 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
839 			jump->flags |= (size_reduce_max - size_reduce) << JUMP_SIZE_SHIFT;
840 #endif /* SLJIT_DEBUG */
841 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
842 		} else {
843 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
844 			size_reduce_max = size_reduce + 10;
845 #endif /* SLJIT_DEBUG */
846 
847 			if (!(jump->flags & JUMP_ADDR)) {
848 				diff = (sljit_sw)jump->u.label->size - (sljit_sw)(jump->addr - size_reduce - 3);
849 
850 				if (diff <= HALFWORD_MAX && diff >= HALFWORD_MIN)
851 					size_reduce += 3;
852 			} else if (jump->u.target <= 0xffffffffl)
853 				size_reduce += (jump->flags & MOV_ADDR_HI) ? 4 : 5;
854 
855 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
856 			jump->flags |= (size_reduce_max - size_reduce) << JUMP_SIZE_SHIFT;
857 #endif /* SLJIT_DEBUG */
858 #endif /* SLJIT_CONFIG_X86_64 */
859 		}
860 
861 		jump = jump->next;
862 		next_jump_addr = SLJIT_GET_NEXT_ADDRESS(jump);
863 	}
864 
865 	compiler->size -= size_reduce;
866 }
867 
sljit_generate_code(struct sljit_compiler * compiler,sljit_s32 options,void * exec_allocator_data)868 SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compiler, sljit_s32 options, void *exec_allocator_data)
869 {
870 	struct sljit_memory_fragment *buf;
871 	sljit_u8 *code;
872 	sljit_u8 *code_ptr;
873 	sljit_u8 *buf_ptr;
874 	sljit_u8 *buf_end;
875 	sljit_u8 len;
876 	sljit_sw executable_offset;
877 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
878 	sljit_uw addr;
879 #endif /* SLJIT_DEBUG */
880 
881 	struct sljit_label *label;
882 	struct sljit_jump *jump;
883 	struct sljit_const *const_;
884 
885 	CHECK_ERROR_PTR();
886 	CHECK_PTR(check_sljit_generate_code(compiler));
887 
888 	reduce_code_size(compiler);
889 
890 	/* Second code generation pass. */
891 	code = (sljit_u8*)allocate_executable_memory(compiler->size, options, exec_allocator_data, &executable_offset);
892 	PTR_FAIL_WITH_EXEC_IF(code);
893 
894 	reverse_buf(compiler);
895 	buf = compiler->buf;
896 
897 	code_ptr = code;
898 	label = compiler->labels;
899 	jump = compiler->jumps;
900 	const_ = compiler->consts;
901 
902 	do {
903 		buf_ptr = buf->memory;
904 		buf_end = buf_ptr + buf->used_size;
905 		do {
906 			len = *buf_ptr++;
907 			SLJIT_ASSERT(len > 0);
908 			if (len < SLJIT_INST_CONST) {
909 				/* The code is already generated. */
910 				SLJIT_MEMCPY(code_ptr, buf_ptr, len);
911 				code_ptr += len;
912 				buf_ptr += len;
913 			} else {
914 				switch (len) {
915 				case SLJIT_INST_LABEL:
916 					label->u.addr = (sljit_uw)SLJIT_ADD_EXEC_OFFSET(code_ptr, executable_offset);
917 					label->size = (sljit_uw)(code_ptr - code);
918 					label = label->next;
919 					break;
920 				case SLJIT_INST_JUMP:
921 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
922 					addr = (sljit_uw)code_ptr;
923 #endif /* SLJIT_DEBUG */
924 					if (!(jump->flags & SLJIT_REWRITABLE_JUMP))
925 						code_ptr = detect_near_jump_type(jump, code_ptr, code, executable_offset);
926 					else {
927 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
928 						code_ptr = detect_far_jump_type(jump, code_ptr, executable_offset);
929 #else /* !SLJIT_CONFIG_X86_32 */
930 						code_ptr = detect_far_jump_type(jump, code_ptr);
931 #endif /* SLJIT_CONFIG_X86_32 */
932 					}
933 
934 					SLJIT_ASSERT((sljit_uw)code_ptr - addr <= ((jump->flags >> JUMP_SIZE_SHIFT) & 0x1f));
935 					jump = jump->next;
936 					break;
937 				case SLJIT_INST_MOV_ADDR:
938 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
939 					code_ptr = generate_mov_addr_code(jump, code_ptr, code, executable_offset);
940 #endif /* SLJIT_CONFIG_X86_64 */
941 					jump->addr = (sljit_uw)code_ptr;
942 					jump = jump->next;
943 					break;
944 				default:
945 					SLJIT_ASSERT(len == SLJIT_INST_CONST);
946 					const_->addr = ((sljit_uw)code_ptr) - sizeof(sljit_sw);
947 					const_ = const_->next;
948 					break;
949 				}
950 			}
951 		} while (buf_ptr < buf_end);
952 
953 		SLJIT_ASSERT(buf_ptr == buf_end);
954 		buf = buf->next;
955 	} while (buf);
956 
957 	SLJIT_ASSERT(!label);
958 	SLJIT_ASSERT(!jump);
959 	SLJIT_ASSERT(!const_);
960 	SLJIT_ASSERT(code_ptr <= code + compiler->size);
961 
962 	jump = compiler->jumps;
963 	while (jump) {
964 		generate_jump_or_mov_addr(jump, executable_offset);
965 		jump = jump->next;
966 	}
967 
968 	compiler->error = SLJIT_ERR_COMPILED;
969 	compiler->executable_offset = executable_offset;
970 	compiler->executable_size = (sljit_uw)(code_ptr - code);
971 
972 	code = (sljit_u8*)SLJIT_ADD_EXEC_OFFSET(code, executable_offset);
973 
974 	SLJIT_UPDATE_WX_FLAGS(code, (sljit_u8*)SLJIT_ADD_EXEC_OFFSET(code_ptr, executable_offset), 1);
975 	return (void*)code;
976 }
977 
sljit_has_cpu_feature(sljit_s32 feature_type)978 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_has_cpu_feature(sljit_s32 feature_type)
979 {
980 	switch (feature_type) {
981 	case SLJIT_HAS_FPU:
982 #ifdef SLJIT_IS_FPU_AVAILABLE
983 		return (SLJIT_IS_FPU_AVAILABLE) != 0;
984 #elif (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
985 		if (cpu_feature_list == 0)
986 			get_cpu_features();
987 		return (cpu_feature_list & CPU_FEATURE_SSE2) != 0;
988 #else /* SLJIT_DETECT_SSE2 */
989 		return 1;
990 #endif /* SLJIT_DETECT_SSE2 */
991 
992 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
993 	case SLJIT_HAS_VIRTUAL_REGISTERS:
994 		return 1;
995 #endif /* SLJIT_CONFIG_X86_32 */
996 
997 	case SLJIT_HAS_CLZ:
998 		if (cpu_feature_list == 0)
999 			get_cpu_features();
1000 
1001 		return (cpu_feature_list & CPU_FEATURE_LZCNT) ? 1 : 2;
1002 
1003 	case SLJIT_HAS_CTZ:
1004 		if (cpu_feature_list == 0)
1005 			get_cpu_features();
1006 
1007 		return (cpu_feature_list & CPU_FEATURE_TZCNT) ? 1 : 2;
1008 
1009 	case SLJIT_HAS_CMOV:
1010 		if (cpu_feature_list == 0)
1011 			get_cpu_features();
1012 		return (cpu_feature_list & CPU_FEATURE_CMOV) != 0;
1013 
1014 	case SLJIT_HAS_REV:
1015 	case SLJIT_HAS_ROT:
1016 	case SLJIT_HAS_PREFETCH:
1017 	case SLJIT_HAS_COPY_F32:
1018 	case SLJIT_HAS_COPY_F64:
1019 	case SLJIT_HAS_ATOMIC:
1020 		return 1;
1021 
1022 #if !(defined SLJIT_IS_FPU_AVAILABLE) || SLJIT_IS_FPU_AVAILABLE
1023 	case SLJIT_HAS_AVX:
1024 		if (cpu_feature_list == 0)
1025 			get_cpu_features();
1026 		return (cpu_feature_list & CPU_FEATURE_AVX) != 0;
1027 	case SLJIT_HAS_AVX2:
1028 		if (cpu_feature_list == 0)
1029 			get_cpu_features();
1030 		return (cpu_feature_list & CPU_FEATURE_AVX2) != 0;
1031 	case SLJIT_HAS_SIMD:
1032 		if (cpu_feature_list == 0)
1033 			get_cpu_features();
1034 		return (cpu_feature_list & CPU_FEATURE_SSE41) != 0;
1035 #endif /* SLJIT_IS_FPU_AVAILABLE */
1036 	default:
1037 		return 0;
1038 	}
1039 }
1040 
sljit_cmp_info(sljit_s32 type)1041 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_cmp_info(sljit_s32 type)
1042 {
1043 	switch (type) {
1044 	case SLJIT_ORDERED_EQUAL:
1045 	case SLJIT_UNORDERED_OR_NOT_EQUAL:
1046 		return 2;
1047 	}
1048 
1049 	return 0;
1050 }
1051 
1052 /* --------------------------------------------------------------------- */
1053 /*  Operators                                                            */
1054 /* --------------------------------------------------------------------- */
1055 
1056 #define BINARY_OPCODE(opcode) (((opcode ## _EAX_i32) << 24) | ((opcode ## _r_rm) << 16) | ((opcode ## _rm_r) << 8) | (opcode))
1057 
1058 #define BINARY_IMM32(op_imm, immw, arg, argw) \
1059 	do { \
1060 		inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, immw, arg, argw); \
1061 		FAIL_IF(!inst); \
1062 		*(inst + 1) |= (op_imm); \
1063 	} while (0)
1064 
1065 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1066 
1067 #define BINARY_IMM(op_imm, op_mr, immw, arg, argw) \
1068 	do { \
1069 		if (IS_HALFWORD(immw) || compiler->mode32) { \
1070 			BINARY_IMM32(op_imm, immw, arg, argw); \
1071 		} \
1072 		else { \
1073 			FAIL_IF(emit_load_imm64(compiler, FAST_IS_REG(arg) ? TMP_REG2 : TMP_REG1, immw)); \
1074 			inst = emit_x86_instruction(compiler, 1, FAST_IS_REG(arg) ? TMP_REG2 : TMP_REG1, 0, arg, argw); \
1075 			FAIL_IF(!inst); \
1076 			*inst = (op_mr); \
1077 		} \
1078 	} while (0)
1079 
1080 #define BINARY_EAX_IMM(op_eax_imm, immw) \
1081 	FAIL_IF(emit_do_imm32(compiler, (!compiler->mode32) ? REX_W : 0, (op_eax_imm), immw))
1082 
1083 #else /* !SLJIT_CONFIG_X86_64 */
1084 
1085 #define BINARY_IMM(op_imm, op_mr, immw, arg, argw) \
1086 	BINARY_IMM32(op_imm, immw, arg, argw)
1087 
1088 #define BINARY_EAX_IMM(op_eax_imm, immw) \
1089 	FAIL_IF(emit_do_imm(compiler, (op_eax_imm), immw))
1090 
1091 #endif /* SLJIT_CONFIG_X86_64 */
1092 
emit_byte(struct sljit_compiler * compiler,sljit_u8 byte)1093 static sljit_s32 emit_byte(struct sljit_compiler *compiler, sljit_u8 byte)
1094 {
1095 	sljit_u8 *inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
1096 	FAIL_IF(!inst);
1097 	INC_SIZE(1);
1098 	*inst = byte;
1099 	return SLJIT_SUCCESS;
1100 }
1101 
1102 static sljit_s32 emit_mov(struct sljit_compiler *compiler,
1103 	sljit_s32 dst, sljit_sw dstw,
1104 	sljit_s32 src, sljit_sw srcw);
1105 
1106 #define EMIT_MOV(compiler, dst, dstw, src, srcw) \
1107 	FAIL_IF(emit_mov(compiler, dst, dstw, src, srcw));
1108 
1109 static sljit_s32 emit_groupf(struct sljit_compiler *compiler,
1110 	sljit_uw op,
1111 	sljit_s32 dst, sljit_s32 src, sljit_sw srcw);
1112 
1113 static sljit_s32 emit_groupf_ext(struct sljit_compiler *compiler,
1114 	sljit_uw op,
1115 	sljit_s32 dst, sljit_s32 src, sljit_sw srcw);
1116 
1117 static SLJIT_INLINE sljit_s32 emit_sse2_store(struct sljit_compiler *compiler,
1118 	sljit_s32 single, sljit_s32 dst, sljit_sw dstw, sljit_s32 src);
1119 
1120 static SLJIT_INLINE sljit_s32 emit_sse2_load(struct sljit_compiler *compiler,
1121 	sljit_s32 single, sljit_s32 dst, sljit_s32 src, sljit_sw srcw);
1122 
1123 static sljit_s32 emit_cmp_binary(struct sljit_compiler *compiler,
1124 	sljit_s32 src1, sljit_sw src1w,
1125 	sljit_s32 src2, sljit_sw src2w);
1126 
1127 static sljit_s32 emit_cmov_generic(struct sljit_compiler *compiler, sljit_s32 type,
1128 	sljit_s32 dst_reg,
1129 	sljit_s32 src, sljit_sw srcw);
1130 
emit_endbranch(struct sljit_compiler * compiler)1131 static SLJIT_INLINE sljit_s32 emit_endbranch(struct sljit_compiler *compiler)
1132 {
1133 #if (defined SLJIT_CONFIG_X86_CET && SLJIT_CONFIG_X86_CET)
1134 	/* Emit endbr32/endbr64 when CET is enabled.  */
1135 	sljit_u8 *inst;
1136 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
1137 	FAIL_IF(!inst);
1138 	INC_SIZE(4);
1139 	inst[0] = GROUP_F3;
1140 	inst[1] = GROUP_0F;
1141 	inst[2] = 0x1e;
1142 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1143 	inst[3] = 0xfb;
1144 #else /* !SLJIT_CONFIG_X86_32 */
1145 	inst[3] = 0xfa;
1146 #endif /* SLJIT_CONFIG_X86_32 */
1147 #else /* !SLJIT_CONFIG_X86_CET */
1148 	SLJIT_UNUSED_ARG(compiler);
1149 #endif /* SLJIT_CONFIG_X86_CET */
1150 	return SLJIT_SUCCESS;
1151 }
1152 
1153 #if (defined SLJIT_CONFIG_X86_CET && SLJIT_CONFIG_X86_CET) && defined (__SHSTK__)
1154 
emit_rdssp(struct sljit_compiler * compiler,sljit_s32 reg)1155 static SLJIT_INLINE sljit_s32 emit_rdssp(struct sljit_compiler *compiler, sljit_s32 reg)
1156 {
1157 	sljit_u8 *inst;
1158 	sljit_s32 size;
1159 
1160 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1161 	size = 5;
1162 #else
1163 	size = 4;
1164 #endif
1165 
1166 	inst = (sljit_u8*)ensure_buf(compiler, 1 + size);
1167 	FAIL_IF(!inst);
1168 	INC_SIZE(size);
1169 	*inst++ = GROUP_F3;
1170 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1171 	*inst++ = REX_W | (reg_map[reg] <= 7 ? 0 : REX_B);
1172 #endif
1173 	inst[0] = GROUP_0F;
1174 	inst[1] = 0x1e;
1175 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1176 	inst[2] = U8(MOD_REG | (0x1 << 3) | reg_lmap[reg]);
1177 #else
1178 	inst[2] = U8(MOD_REG | (0x1 << 3) | reg_map[reg]);
1179 #endif
1180 	return SLJIT_SUCCESS;
1181 }
1182 
emit_incssp(struct sljit_compiler * compiler,sljit_s32 reg)1183 static SLJIT_INLINE sljit_s32 emit_incssp(struct sljit_compiler *compiler, sljit_s32 reg)
1184 {
1185 	sljit_u8 *inst;
1186 	sljit_s32 size;
1187 
1188 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1189 	size = 5;
1190 #else
1191 	size = 4;
1192 #endif
1193 
1194 	inst = (sljit_u8*)ensure_buf(compiler, 1 + size);
1195 	FAIL_IF(!inst);
1196 	INC_SIZE(size);
1197 	*inst++ = GROUP_F3;
1198 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1199 	*inst++ = REX_W | (reg_map[reg] <= 7 ? 0 : REX_B);
1200 #endif
1201 	inst[0] = GROUP_0F;
1202 	inst[1] = 0xae;
1203 	inst[2] = (0x3 << 6) | (0x5 << 3) | (reg_map[reg] & 0x7);
1204 	return SLJIT_SUCCESS;
1205 }
1206 
1207 #endif /* SLJIT_CONFIG_X86_CET && __SHSTK__ */
1208 
cpu_has_shadow_stack(void)1209 static SLJIT_INLINE sljit_s32 cpu_has_shadow_stack(void)
1210 {
1211 #if (defined SLJIT_CONFIG_X86_CET && SLJIT_CONFIG_X86_CET) && defined (__SHSTK__)
1212 	return _get_ssp() != 0;
1213 #else /* !SLJIT_CONFIG_X86_CET || !__SHSTK__ */
1214 	return 0;
1215 #endif /* SLJIT_CONFIG_X86_CET && __SHSTK__ */
1216 }
1217 
adjust_shadow_stack(struct sljit_compiler * compiler,sljit_s32 src,sljit_sw srcw)1218 static SLJIT_INLINE sljit_s32 adjust_shadow_stack(struct sljit_compiler *compiler,
1219 	sljit_s32 src, sljit_sw srcw)
1220 {
1221 #if (defined SLJIT_CONFIG_X86_CET && SLJIT_CONFIG_X86_CET) && defined (__SHSTK__)
1222 	sljit_u8 *inst, *jz_after_cmp_inst;
1223 	sljit_uw size_jz_after_cmp_inst;
1224 
1225 	sljit_uw size_before_rdssp_inst = compiler->size;
1226 
1227 	/* Generate "RDSSP TMP_REG1". */
1228 	FAIL_IF(emit_rdssp(compiler, TMP_REG1));
1229 
1230 	/* Load return address on shadow stack into TMP_REG1. */
1231 	EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_MEM1(TMP_REG1), 0);
1232 
1233 	/* Compare return address against TMP_REG1. */
1234 	FAIL_IF(emit_cmp_binary (compiler, TMP_REG1, 0, src, srcw));
1235 
1236 	/* Generate JZ to skip shadow stack ajdustment when shadow
1237 	   stack matches normal stack. */
1238 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
1239 	FAIL_IF(!inst);
1240 	INC_SIZE(2);
1241 	*inst++ = get_jump_code(SLJIT_EQUAL) - 0x10;
1242 	size_jz_after_cmp_inst = compiler->size;
1243 	jz_after_cmp_inst = inst;
1244 
1245 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1246 	/* REX_W is not necessary. */
1247 	compiler->mode32 = 1;
1248 #endif
1249 	/* Load 1 into TMP_REG1. */
1250 	EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, 1);
1251 
1252 	/* Generate "INCSSP TMP_REG1". */
1253 	FAIL_IF(emit_incssp(compiler, TMP_REG1));
1254 
1255 	/* Jump back to "RDSSP TMP_REG1" to check shadow stack again. */
1256 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
1257 	FAIL_IF(!inst);
1258 	INC_SIZE(2);
1259 	inst[0] = JMP_i8;
1260 	inst[1] = size_before_rdssp_inst - compiler->size;
1261 
1262 	*jz_after_cmp_inst = compiler->size - size_jz_after_cmp_inst;
1263 #else /* !SLJIT_CONFIG_X86_CET || !__SHSTK__ */
1264 	SLJIT_UNUSED_ARG(compiler);
1265 	SLJIT_UNUSED_ARG(src);
1266 	SLJIT_UNUSED_ARG(srcw);
1267 #endif /* SLJIT_CONFIG_X86_CET && __SHSTK__ */
1268 	return SLJIT_SUCCESS;
1269 }
1270 
1271 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1272 #include "sljitNativeX86_32.c"
1273 #else
1274 #include "sljitNativeX86_64.c"
1275 #endif
1276 
emit_mov(struct sljit_compiler * compiler,sljit_s32 dst,sljit_sw dstw,sljit_s32 src,sljit_sw srcw)1277 static sljit_s32 emit_mov(struct sljit_compiler *compiler,
1278 	sljit_s32 dst, sljit_sw dstw,
1279 	sljit_s32 src, sljit_sw srcw)
1280 {
1281 	sljit_u8* inst;
1282 
1283 	if (FAST_IS_REG(src)) {
1284 		inst = emit_x86_instruction(compiler, 1, src, 0, dst, dstw);
1285 		FAIL_IF(!inst);
1286 		*inst = MOV_rm_r;
1287 		return SLJIT_SUCCESS;
1288 	}
1289 
1290 	if (src == SLJIT_IMM) {
1291 		if (FAST_IS_REG(dst)) {
1292 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1293 			return emit_do_imm(compiler, MOV_r_i32 | reg_map[dst], srcw);
1294 #else
1295 			if (!compiler->mode32) {
1296 				if (NOT_HALFWORD(srcw))
1297 					return emit_load_imm64(compiler, dst, srcw);
1298 			}
1299 			else
1300 				return emit_do_imm32(compiler, (reg_map[dst] >= 8) ? REX_B : 0, U8(MOV_r_i32 | reg_lmap[dst]), srcw);
1301 #endif
1302 		}
1303 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1304 		if (!compiler->mode32 && NOT_HALFWORD(srcw)) {
1305 			/* Immediate to memory move. Only SLJIT_MOV operation copies
1306 			   an immediate directly into memory so TMP_REG1 can be used. */
1307 			FAIL_IF(emit_load_imm64(compiler, TMP_REG1, srcw));
1308 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
1309 			FAIL_IF(!inst);
1310 			*inst = MOV_rm_r;
1311 			return SLJIT_SUCCESS;
1312 		}
1313 #endif
1314 		inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, dstw);
1315 		FAIL_IF(!inst);
1316 		*inst = MOV_rm_i32;
1317 		return SLJIT_SUCCESS;
1318 	}
1319 	if (FAST_IS_REG(dst)) {
1320 		inst = emit_x86_instruction(compiler, 1, dst, 0, src, srcw);
1321 		FAIL_IF(!inst);
1322 		*inst = MOV_r_rm;
1323 		return SLJIT_SUCCESS;
1324 	}
1325 
1326 	/* Memory to memory move. Only SLJIT_MOV operation copies
1327 	   data from memory to memory so TMP_REG1 can be used. */
1328 	inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src, srcw);
1329 	FAIL_IF(!inst);
1330 	*inst = MOV_r_rm;
1331 	inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
1332 	FAIL_IF(!inst);
1333 	*inst = MOV_rm_r;
1334 	return SLJIT_SUCCESS;
1335 }
1336 
emit_cmov_generic(struct sljit_compiler * compiler,sljit_s32 type,sljit_s32 dst_reg,sljit_s32 src,sljit_sw srcw)1337 static sljit_s32 emit_cmov_generic(struct sljit_compiler *compiler, sljit_s32 type,
1338 	sljit_s32 dst_reg,
1339 	sljit_s32 src, sljit_sw srcw)
1340 {
1341 	sljit_u8* inst;
1342 	sljit_uw size;
1343 
1344 	SLJIT_ASSERT(type >= SLJIT_EQUAL && type <= SLJIT_ORDERED_LESS_EQUAL);
1345 
1346 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
1347 	FAIL_IF(!inst);
1348 	INC_SIZE(2);
1349 	inst[0] = U8(get_jump_code((sljit_uw)type ^ 0x1) - 0x10);
1350 
1351 	size = compiler->size;
1352 	EMIT_MOV(compiler, dst_reg, 0, src, srcw);
1353 
1354 	inst[1] = U8(compiler->size - size);
1355 	return SLJIT_SUCCESS;
1356 }
1357 
sljit_emit_op0(struct sljit_compiler * compiler,sljit_s32 op)1358 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compiler *compiler, sljit_s32 op)
1359 {
1360 	sljit_u8 *inst;
1361 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1362 	sljit_uw size;
1363 #endif
1364 
1365 	CHECK_ERROR();
1366 	CHECK(check_sljit_emit_op0(compiler, op));
1367 
1368 	switch (GET_OPCODE(op)) {
1369 	case SLJIT_BREAKPOINT:
1370 		return emit_byte(compiler, INT3);
1371 	case SLJIT_NOP:
1372 		return emit_byte(compiler, NOP);
1373 	case SLJIT_LMUL_UW:
1374 	case SLJIT_LMUL_SW:
1375 	case SLJIT_DIVMOD_UW:
1376 	case SLJIT_DIVMOD_SW:
1377 	case SLJIT_DIV_UW:
1378 	case SLJIT_DIV_SW:
1379 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1380 #ifdef _WIN64
1381 		SLJIT_ASSERT(
1382 			reg_map[SLJIT_R0] == 0
1383 			&& reg_map[SLJIT_R1] == 2
1384 			&& reg_map[TMP_REG1] > 7);
1385 #else
1386 		SLJIT_ASSERT(
1387 			reg_map[SLJIT_R0] == 0
1388 			&& reg_map[SLJIT_R1] < 7
1389 			&& reg_map[TMP_REG1] == 2);
1390 #endif
1391 		compiler->mode32 = op & SLJIT_32;
1392 #endif
1393 		SLJIT_COMPILE_ASSERT((SLJIT_DIVMOD_UW & 0x2) == 0 && SLJIT_DIV_UW - 0x2 == SLJIT_DIVMOD_UW, bad_div_opcode_assignments);
1394 
1395 		op = GET_OPCODE(op);
1396 		if ((op | 0x2) == SLJIT_DIV_UW) {
1397 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) || defined(_WIN64)
1398 			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_R1, 0);
1399 			inst = emit_x86_instruction(compiler, 1, SLJIT_R1, 0, SLJIT_R1, 0);
1400 #else
1401 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, TMP_REG1, 0);
1402 #endif
1403 			FAIL_IF(!inst);
1404 			*inst = XOR_r_rm;
1405 		}
1406 
1407 		if ((op | 0x2) == SLJIT_DIV_SW) {
1408 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) || defined(_WIN64)
1409 			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_R1, 0);
1410 #endif
1411 
1412 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1413 			FAIL_IF(emit_byte(compiler, CDQ));
1414 #else
1415 			if (!compiler->mode32) {
1416 				inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
1417 				FAIL_IF(!inst);
1418 				INC_SIZE(2);
1419 				inst[0] = REX_W;
1420 				inst[1] = CDQ;
1421 			} else
1422 				FAIL_IF(emit_byte(compiler, CDQ));
1423 #endif
1424 		}
1425 
1426 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1427 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
1428 		FAIL_IF(!inst);
1429 		INC_SIZE(2);
1430 		inst[0] = GROUP_F7;
1431 		inst[1] = MOD_REG | ((op >= SLJIT_DIVMOD_UW) ? reg_map[TMP_REG1] : reg_map[SLJIT_R1]);
1432 #else /* !SLJIT_CONFIG_X86_32 */
1433 #ifdef _WIN64
1434 		size = (!compiler->mode32 || op >= SLJIT_DIVMOD_UW) ? 3 : 2;
1435 #else /* !_WIN64 */
1436 		size = (!compiler->mode32) ? 3 : 2;
1437 #endif /* _WIN64 */
1438 		inst = (sljit_u8*)ensure_buf(compiler, 1 + size);
1439 		FAIL_IF(!inst);
1440 		INC_SIZE(size);
1441 #ifdef _WIN64
1442 		if (!compiler->mode32)
1443 			*inst++ = REX_W | ((op >= SLJIT_DIVMOD_UW) ? REX_B : 0);
1444 		else if (op >= SLJIT_DIVMOD_UW)
1445 			*inst++ = REX_B;
1446 		inst[0] = GROUP_F7;
1447 		inst[1] = MOD_REG | ((op >= SLJIT_DIVMOD_UW) ? reg_lmap[TMP_REG1] : reg_lmap[SLJIT_R1]);
1448 #else /* !_WIN64 */
1449 		if (!compiler->mode32)
1450 			*inst++ = REX_W;
1451 		inst[0] = GROUP_F7;
1452 		inst[1] = MOD_REG | reg_map[SLJIT_R1];
1453 #endif /* _WIN64 */
1454 #endif /* SLJIT_CONFIG_X86_32 */
1455 		switch (op) {
1456 		case SLJIT_LMUL_UW:
1457 			inst[1] |= MUL;
1458 			break;
1459 		case SLJIT_LMUL_SW:
1460 			inst[1] |= IMUL;
1461 			break;
1462 		case SLJIT_DIVMOD_UW:
1463 		case SLJIT_DIV_UW:
1464 			inst[1] |= DIV;
1465 			break;
1466 		case SLJIT_DIVMOD_SW:
1467 		case SLJIT_DIV_SW:
1468 			inst[1] |= IDIV;
1469 			break;
1470 		}
1471 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) && !defined(_WIN64)
1472 		if (op <= SLJIT_DIVMOD_SW)
1473 			EMIT_MOV(compiler, SLJIT_R1, 0, TMP_REG1, 0);
1474 #else
1475 		if (op >= SLJIT_DIV_UW)
1476 			EMIT_MOV(compiler, SLJIT_R1, 0, TMP_REG1, 0);
1477 #endif
1478 		break;
1479 	case SLJIT_ENDBR:
1480 		return emit_endbranch(compiler);
1481 	case SLJIT_SKIP_FRAMES_BEFORE_RETURN:
1482 		return skip_frames_before_return(compiler);
1483 	}
1484 
1485 	return SLJIT_SUCCESS;
1486 }
1487 
emit_mov_byte(struct sljit_compiler * compiler,sljit_s32 sign,sljit_s32 dst,sljit_sw dstw,sljit_s32 src,sljit_sw srcw)1488 static sljit_s32 emit_mov_byte(struct sljit_compiler *compiler, sljit_s32 sign,
1489 	sljit_s32 dst, sljit_sw dstw,
1490 	sljit_s32 src, sljit_sw srcw)
1491 {
1492 	sljit_u8* inst;
1493 	sljit_s32 dst_r;
1494 
1495 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1496 	compiler->mode32 = 0;
1497 #endif
1498 
1499 	if (src == SLJIT_IMM) {
1500 		if (FAST_IS_REG(dst)) {
1501 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1502 			return emit_do_imm(compiler, MOV_r_i32 | reg_map[dst], srcw);
1503 #else
1504 			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, 0);
1505 			FAIL_IF(!inst);
1506 			*inst = MOV_rm_i32;
1507 			return SLJIT_SUCCESS;
1508 #endif
1509 		}
1510 		inst = emit_x86_instruction(compiler, 1 | EX86_BYTE_ARG | EX86_NO_REXW, SLJIT_IMM, srcw, dst, dstw);
1511 		FAIL_IF(!inst);
1512 		*inst = MOV_rm8_i8;
1513 		return SLJIT_SUCCESS;
1514 	}
1515 
1516 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
1517 
1518 	if ((dst & SLJIT_MEM) && FAST_IS_REG(src)) {
1519 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1520 		if (reg_map[src] >= 4) {
1521 			SLJIT_ASSERT(dst_r == TMP_REG1);
1522 			EMIT_MOV(compiler, TMP_REG1, 0, src, 0);
1523 		} else
1524 			dst_r = src;
1525 #else
1526 		dst_r = src;
1527 #endif
1528 	} else {
1529 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1530 		if (FAST_IS_REG(src) && reg_map[src] >= 4) {
1531 			/* Both src and dst are registers. */
1532 			SLJIT_ASSERT(FAST_IS_REG(dst));
1533 
1534 			if (src == dst && !sign) {
1535 				inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, 0xff, dst, 0);
1536 				FAIL_IF(!inst);
1537 				*(inst + 1) |= AND;
1538 				return SLJIT_SUCCESS;
1539 			}
1540 
1541 			EMIT_MOV(compiler, TMP_REG1, 0, src, 0);
1542 			src = TMP_REG1;
1543 			srcw = 0;
1544 		}
1545 #endif /* !SLJIT_CONFIG_X86_32 */
1546 
1547 		/* src can be memory addr or reg_map[src] < 4 on x86_32 architectures. */
1548 		FAIL_IF(emit_groupf(compiler, sign ? MOVSX_r_rm8 : MOVZX_r_rm8, dst_r, src, srcw));
1549 	}
1550 
1551 	if (dst & SLJIT_MEM) {
1552 		inst = emit_x86_instruction(compiler, 1 | EX86_REX | EX86_NO_REXW, dst_r, 0, dst, dstw);
1553 		FAIL_IF(!inst);
1554 		*inst = MOV_rm8_r8;
1555 	}
1556 
1557 	return SLJIT_SUCCESS;
1558 }
1559 
emit_prefetch(struct sljit_compiler * compiler,sljit_s32 op,sljit_s32 src,sljit_sw srcw)1560 static sljit_s32 emit_prefetch(struct sljit_compiler *compiler, sljit_s32 op,
1561 	sljit_s32 src, sljit_sw srcw)
1562 {
1563 	sljit_u8* inst;
1564 
1565 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1566 	compiler->mode32 = 1;
1567 #endif
1568 
1569 	inst = emit_x86_instruction(compiler, 2, 0, 0, src, srcw);
1570 	FAIL_IF(!inst);
1571 	inst[0] = GROUP_0F;
1572 	inst[1] = PREFETCH;
1573 
1574 	if (op == SLJIT_PREFETCH_L1)
1575 		inst[2] |= (1 << 3);
1576 	else if (op == SLJIT_PREFETCH_L2)
1577 		inst[2] |= (2 << 3);
1578 	else if (op == SLJIT_PREFETCH_L3)
1579 		inst[2] |= (3 << 3);
1580 
1581 	return SLJIT_SUCCESS;
1582 }
1583 
emit_mov_half(struct sljit_compiler * compiler,sljit_s32 sign,sljit_s32 dst,sljit_sw dstw,sljit_s32 src,sljit_sw srcw)1584 static sljit_s32 emit_mov_half(struct sljit_compiler *compiler, sljit_s32 sign,
1585 	sljit_s32 dst, sljit_sw dstw,
1586 	sljit_s32 src, sljit_sw srcw)
1587 {
1588 	sljit_u8* inst;
1589 	sljit_s32 dst_r;
1590 
1591 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1592 	compiler->mode32 = 0;
1593 #endif
1594 
1595 	if (src == SLJIT_IMM) {
1596 		if (FAST_IS_REG(dst)) {
1597 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1598 			return emit_do_imm(compiler, MOV_r_i32 | reg_map[dst], srcw);
1599 #else
1600 			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, 0);
1601 			FAIL_IF(!inst);
1602 			*inst = MOV_rm_i32;
1603 			return SLJIT_SUCCESS;
1604 #endif
1605 		}
1606 		inst = emit_x86_instruction(compiler, 1 | EX86_HALF_ARG | EX86_NO_REXW | EX86_PREF_66, SLJIT_IMM, srcw, dst, dstw);
1607 		FAIL_IF(!inst);
1608 		*inst = MOV_rm_i32;
1609 		return SLJIT_SUCCESS;
1610 	}
1611 
1612 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
1613 
1614 	if ((dst & SLJIT_MEM) && FAST_IS_REG(src))
1615 		dst_r = src;
1616 	else
1617 		FAIL_IF(emit_groupf(compiler, sign ? MOVSX_r_rm16 : MOVZX_r_rm16, dst_r, src, srcw));
1618 
1619 	if (dst & SLJIT_MEM) {
1620 		inst = emit_x86_instruction(compiler, 1 | EX86_NO_REXW | EX86_PREF_66, dst_r, 0, dst, dstw);
1621 		FAIL_IF(!inst);
1622 		*inst = MOV_rm_r;
1623 	}
1624 
1625 	return SLJIT_SUCCESS;
1626 }
1627 
emit_unary(struct sljit_compiler * compiler,sljit_u8 opcode,sljit_s32 dst,sljit_sw dstw,sljit_s32 src,sljit_sw srcw)1628 static sljit_s32 emit_unary(struct sljit_compiler *compiler, sljit_u8 opcode,
1629 	sljit_s32 dst, sljit_sw dstw,
1630 	sljit_s32 src, sljit_sw srcw)
1631 {
1632 	sljit_u8* inst;
1633 
1634 	if (dst == src && dstw == srcw) {
1635 		/* Same input and output */
1636 		inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
1637 		FAIL_IF(!inst);
1638 		inst[0] = GROUP_F7;
1639 		inst[1] |= opcode;
1640 		return SLJIT_SUCCESS;
1641 	}
1642 
1643 	if (FAST_IS_REG(dst)) {
1644 		EMIT_MOV(compiler, dst, 0, src, srcw);
1645 		inst = emit_x86_instruction(compiler, 1, 0, 0, dst, 0);
1646 		FAIL_IF(!inst);
1647 		inst[0] = GROUP_F7;
1648 		inst[1] |= opcode;
1649 		return SLJIT_SUCCESS;
1650 	}
1651 
1652 	EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
1653 	inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
1654 	FAIL_IF(!inst);
1655 	inst[0] = GROUP_F7;
1656 	inst[1] |= opcode;
1657 	EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
1658 	return SLJIT_SUCCESS;
1659 }
1660 
1661 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1662 static const sljit_sw emit_clz_arg = 32 + 31;
1663 static const sljit_sw emit_ctz_arg = 32;
1664 #endif
1665 
emit_clz_ctz(struct sljit_compiler * compiler,sljit_s32 is_clz,sljit_s32 dst,sljit_sw dstw,sljit_s32 src,sljit_sw srcw)1666 static sljit_s32 emit_clz_ctz(struct sljit_compiler *compiler, sljit_s32 is_clz,
1667 	sljit_s32 dst, sljit_sw dstw,
1668 	sljit_s32 src, sljit_sw srcw)
1669 {
1670 	sljit_u8* inst;
1671 	sljit_s32 dst_r;
1672 	sljit_sw max;
1673 
1674 	SLJIT_ASSERT(cpu_feature_list != 0);
1675 
1676 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
1677 
1678 	if (is_clz ? (cpu_feature_list & CPU_FEATURE_LZCNT) : (cpu_feature_list & CPU_FEATURE_TZCNT)) {
1679 		FAIL_IF(emit_groupf(compiler, (is_clz ? LZCNT_r_rm : TZCNT_r_rm) | EX86_PREF_F3, dst_r, src, srcw));
1680 
1681 		if (dst & SLJIT_MEM)
1682 			EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
1683 		return SLJIT_SUCCESS;
1684 	}
1685 
1686 	FAIL_IF(emit_groupf(compiler, is_clz ? BSR_r_rm : BSF_r_rm, dst_r, src, srcw));
1687 
1688 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1689 	max = is_clz ? (32 + 31) : 32;
1690 
1691 	if (cpu_feature_list & CPU_FEATURE_CMOV) {
1692 		if (dst_r != TMP_REG1) {
1693 			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, max);
1694 			inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG1, 0);
1695 		}
1696 		else
1697 			inst = emit_x86_instruction(compiler, 2, dst_r, 0, SLJIT_MEM0(), is_clz ? (sljit_sw)&emit_clz_arg : (sljit_sw)&emit_ctz_arg);
1698 
1699 		FAIL_IF(!inst);
1700 		inst[0] = GROUP_0F;
1701 		inst[1] = CMOVE_r_rm;
1702 	}
1703 	else
1704 		FAIL_IF(emit_cmov_generic(compiler, SLJIT_EQUAL, dst_r, SLJIT_IMM, max));
1705 
1706 	if (is_clz) {
1707 		inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, 31, dst_r, 0);
1708 		FAIL_IF(!inst);
1709 		*(inst + 1) |= XOR;
1710 	}
1711 #else
1712 	if (is_clz)
1713 		max = compiler->mode32 ? (32 + 31) : (64 + 63);
1714 	else
1715 		max = compiler->mode32 ? 32 : 64;
1716 
1717 	if (cpu_feature_list & CPU_FEATURE_CMOV) {
1718 		EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_IMM, max);
1719 		FAIL_IF(emit_groupf(compiler, CMOVE_r_rm, dst_r, TMP_REG2, 0));
1720 	} else
1721 		FAIL_IF(emit_cmov_generic(compiler, SLJIT_EQUAL, dst_r, SLJIT_IMM, max));
1722 
1723 	if (is_clz) {
1724 		inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, max >> 1, dst_r, 0);
1725 		FAIL_IF(!inst);
1726 		*(inst + 1) |= XOR;
1727 	}
1728 #endif
1729 
1730 	if (dst & SLJIT_MEM)
1731 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
1732 	return SLJIT_SUCCESS;
1733 }
1734 
emit_bswap(struct sljit_compiler * compiler,sljit_s32 op,sljit_s32 dst,sljit_sw dstw,sljit_s32 src,sljit_sw srcw)1735 static sljit_s32 emit_bswap(struct sljit_compiler *compiler,
1736 	sljit_s32 op,
1737 	sljit_s32 dst, sljit_sw dstw,
1738 	sljit_s32 src, sljit_sw srcw)
1739 {
1740 	sljit_u8 *inst;
1741 	sljit_s32 dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
1742 	sljit_uw size;
1743 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1744 	sljit_u8 rex = 0;
1745 #else /* !SLJIT_CONFIG_X86_64 */
1746 	sljit_s32 dst_is_ereg = op & SLJIT_32;
1747 #endif /* SLJIT_CONFIG_X86_64 */
1748 
1749 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1750 	if (op == SLJIT_REV_U32 || op == SLJIT_REV_S32)
1751 		compiler->mode32 = 1;
1752 #else /* !SLJIT_CONFIG_X86_64 */
1753 	op &= ~SLJIT_32;
1754 #endif /* SLJIT_CONFIG_X86_64 */
1755 
1756 	if (src != dst_r) {
1757 		/* Only the lower 16 bit is read for eregs. */
1758 		if (op == SLJIT_REV_U16 || op == SLJIT_REV_S16)
1759 			FAIL_IF(emit_mov_half(compiler, 0, dst_r, 0, src, srcw));
1760 		else
1761 			EMIT_MOV(compiler, dst_r, 0, src, srcw);
1762 	}
1763 
1764 	size = 2;
1765 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1766 	if (!compiler->mode32)
1767 		rex = REX_W;
1768 
1769 	if (reg_map[dst_r] >= 8)
1770 		rex |= REX_B;
1771 
1772 	if (rex != 0)
1773 		size++;
1774 #endif /* SLJIT_CONFIG_X86_64 */
1775 
1776 	inst = (sljit_u8*)ensure_buf(compiler, 1 + size);
1777 	FAIL_IF(!inst);
1778 	INC_SIZE(size);
1779 
1780 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1781 	if (rex != 0)
1782 		*inst++ = rex;
1783 
1784 	inst[0] = GROUP_0F;
1785 	inst[1] = BSWAP_r | reg_lmap[dst_r];
1786 #else /* !SLJIT_CONFIG_X86_64 */
1787 	inst[0] = GROUP_0F;
1788 	inst[1] = BSWAP_r | reg_map[dst_r];
1789 #endif /* SLJIT_CONFIG_X86_64 */
1790 
1791 	if (op == SLJIT_REV_U16 || op == SLJIT_REV_S16) {
1792 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1793 		size = compiler->mode32 ? 16 : 48;
1794 #else /* !SLJIT_CONFIG_X86_64 */
1795 		size = 16;
1796 #endif /* SLJIT_CONFIG_X86_64 */
1797 
1798 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, (sljit_sw)size, dst_r, 0);
1799 		FAIL_IF(!inst);
1800 		if (op == SLJIT_REV_U16)
1801 			inst[1] |= SHR;
1802 		else
1803 			inst[1] |= SAR;
1804 	}
1805 
1806 	if (dst & SLJIT_MEM) {
1807 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1808 		if (dst_is_ereg)
1809 			op = SLJIT_REV;
1810 #endif /* SLJIT_CONFIG_X86_32 */
1811 		if (op == SLJIT_REV_U16 || op == SLJIT_REV_S16)
1812 			return emit_mov_half(compiler, 0, dst, dstw, TMP_REG1, 0);
1813 
1814 		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
1815 	}
1816 
1817 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1818 	if (op == SLJIT_REV_S32) {
1819 		compiler->mode32 = 0;
1820 		inst = emit_x86_instruction(compiler, 1, dst, 0, dst, 0);
1821 		FAIL_IF(!inst);
1822 		*inst = MOVSXD_r_rm;
1823 	}
1824 #endif /* SLJIT_CONFIG_X86_64 */
1825 
1826 	return SLJIT_SUCCESS;
1827 }
1828 
sljit_emit_op1(struct sljit_compiler * compiler,sljit_s32 op,sljit_s32 dst,sljit_sw dstw,sljit_s32 src,sljit_sw srcw)1829 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op1(struct sljit_compiler *compiler, sljit_s32 op,
1830 	sljit_s32 dst, sljit_sw dstw,
1831 	sljit_s32 src, sljit_sw srcw)
1832 {
1833 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1834 	sljit_s32 dst_is_ereg = 0;
1835 #else /* !SLJIT_CONFIG_X86_32 */
1836 	sljit_s32 op_flags = GET_ALL_FLAGS(op);
1837 #endif /* SLJIT_CONFIG_X86_32 */
1838 
1839 	CHECK_ERROR();
1840 	CHECK(check_sljit_emit_op1(compiler, op, dst, dstw, src, srcw));
1841 	ADJUST_LOCAL_OFFSET(dst, dstw);
1842 	ADJUST_LOCAL_OFFSET(src, srcw);
1843 
1844 	CHECK_EXTRA_REGS(dst, dstw, dst_is_ereg = 1);
1845 	CHECK_EXTRA_REGS(src, srcw, (void)0);
1846 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1847 	compiler->mode32 = op_flags & SLJIT_32;
1848 #endif /* SLJIT_CONFIG_X86_64 */
1849 
1850 	op = GET_OPCODE(op);
1851 
1852 	if (op >= SLJIT_MOV && op <= SLJIT_MOV_P) {
1853 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1854 		compiler->mode32 = 0;
1855 #endif /* SLJIT_CONFIG_X86_64 */
1856 
1857 		if (FAST_IS_REG(src) && src == dst) {
1858 			if (!TYPE_CAST_NEEDED(op))
1859 				return SLJIT_SUCCESS;
1860 		}
1861 
1862 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1863 		if (op_flags & SLJIT_32) {
1864 			if (src & SLJIT_MEM) {
1865 				if (op == SLJIT_MOV_S32)
1866 					op = SLJIT_MOV_U32;
1867 			}
1868 			else if (src == SLJIT_IMM) {
1869 				if (op == SLJIT_MOV_U32)
1870 					op = SLJIT_MOV_S32;
1871 			}
1872 		}
1873 #endif /* SLJIT_CONFIG_X86_64 */
1874 
1875 		if (src == SLJIT_IMM) {
1876 			switch (op) {
1877 			case SLJIT_MOV_U8:
1878 				srcw = (sljit_u8)srcw;
1879 				break;
1880 			case SLJIT_MOV_S8:
1881 				srcw = (sljit_s8)srcw;
1882 				break;
1883 			case SLJIT_MOV_U16:
1884 				srcw = (sljit_u16)srcw;
1885 				break;
1886 			case SLJIT_MOV_S16:
1887 				srcw = (sljit_s16)srcw;
1888 				break;
1889 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1890 			case SLJIT_MOV_U32:
1891 				srcw = (sljit_u32)srcw;
1892 				break;
1893 			case SLJIT_MOV_S32:
1894 				srcw = (sljit_s32)srcw;
1895 				break;
1896 #endif /* SLJIT_CONFIG_X86_64 */
1897 			}
1898 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1899 			if (SLJIT_UNLIKELY(dst_is_ereg))
1900 				return emit_mov(compiler, dst, dstw, src, srcw);
1901 #endif /* SLJIT_CONFIG_X86_32 */
1902 		}
1903 
1904 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1905 		if (SLJIT_UNLIKELY(dst_is_ereg) && (!(op == SLJIT_MOV || op == SLJIT_MOV_U32 || op == SLJIT_MOV_S32 || op == SLJIT_MOV_P) || (src & SLJIT_MEM))) {
1906 			SLJIT_ASSERT(dst == SLJIT_MEM1(SLJIT_SP));
1907 			dst = TMP_REG1;
1908 		}
1909 #endif /* SLJIT_CONFIG_X86_32 */
1910 
1911 		switch (op) {
1912 		case SLJIT_MOV:
1913 		case SLJIT_MOV_P:
1914 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1915 		case SLJIT_MOV_U32:
1916 		case SLJIT_MOV_S32:
1917 		case SLJIT_MOV32:
1918 #endif /* SLJIT_CONFIG_X86_32 */
1919 			EMIT_MOV(compiler, dst, dstw, src, srcw);
1920 			break;
1921 		case SLJIT_MOV_U8:
1922 			FAIL_IF(emit_mov_byte(compiler, 0, dst, dstw, src, srcw));
1923 			break;
1924 		case SLJIT_MOV_S8:
1925 			FAIL_IF(emit_mov_byte(compiler, 1, dst, dstw, src, srcw));
1926 			break;
1927 		case SLJIT_MOV_U16:
1928 			FAIL_IF(emit_mov_half(compiler, 0, dst, dstw, src, srcw));
1929 			break;
1930 		case SLJIT_MOV_S16:
1931 			FAIL_IF(emit_mov_half(compiler, 1, dst, dstw, src, srcw));
1932 			break;
1933 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1934 		case SLJIT_MOV_U32:
1935 			FAIL_IF(emit_mov_int(compiler, 0, dst, dstw, src, srcw));
1936 			break;
1937 		case SLJIT_MOV_S32:
1938 			FAIL_IF(emit_mov_int(compiler, 1, dst, dstw, src, srcw));
1939 			break;
1940 		case SLJIT_MOV32:
1941 			compiler->mode32 = 1;
1942 			EMIT_MOV(compiler, dst, dstw, src, srcw);
1943 			compiler->mode32 = 0;
1944 			break;
1945 #endif /* SLJIT_CONFIG_X86_64 */
1946 		}
1947 
1948 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1949 		if (SLJIT_UNLIKELY(dst_is_ereg) && dst == TMP_REG1)
1950 			return emit_mov(compiler, SLJIT_MEM1(SLJIT_SP), dstw, TMP_REG1, 0);
1951 #endif /* SLJIT_CONFIG_X86_32 */
1952 		return SLJIT_SUCCESS;
1953 	}
1954 
1955 	switch (op) {
1956 	case SLJIT_CLZ:
1957 	case SLJIT_CTZ:
1958 		return emit_clz_ctz(compiler, (op == SLJIT_CLZ), dst, dstw, src, srcw);
1959 	case SLJIT_REV:
1960 	case SLJIT_REV_U16:
1961 	case SLJIT_REV_S16:
1962 	case SLJIT_REV_U32:
1963 	case SLJIT_REV_S32:
1964 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1965 		if (dst_is_ereg)
1966 			op |= SLJIT_32;
1967 #endif /* SLJIT_CONFIG_X86_32 */
1968 		return emit_bswap(compiler, op, dst, dstw, src, srcw);
1969 	}
1970 
1971 	return SLJIT_SUCCESS;
1972 }
1973 
emit_cum_binary(struct sljit_compiler * compiler,sljit_u32 op_types,sljit_s32 dst,sljit_sw dstw,sljit_s32 src1,sljit_sw src1w,sljit_s32 src2,sljit_sw src2w)1974 static sljit_s32 emit_cum_binary(struct sljit_compiler *compiler,
1975 	sljit_u32 op_types,
1976 	sljit_s32 dst, sljit_sw dstw,
1977 	sljit_s32 src1, sljit_sw src1w,
1978 	sljit_s32 src2, sljit_sw src2w)
1979 {
1980 	sljit_u8* inst;
1981 	sljit_u8 op_eax_imm = U8(op_types >> 24);
1982 	sljit_u8 op_rm = U8((op_types >> 16) & 0xff);
1983 	sljit_u8 op_mr = U8((op_types >> 8) & 0xff);
1984 	sljit_u8 op_imm = U8(op_types & 0xff);
1985 
1986 	if (dst == src1 && dstw == src1w) {
1987 		if (src2 == SLJIT_IMM) {
1988 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1989 			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
1990 #else
1991 			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128)) {
1992 #endif
1993 				BINARY_EAX_IMM(op_eax_imm, src2w);
1994 			}
1995 			else {
1996 				BINARY_IMM(op_imm, op_mr, src2w, dst, dstw);
1997 			}
1998 		}
1999 		else if (FAST_IS_REG(dst)) {
2000 			inst = emit_x86_instruction(compiler, 1, dst, dstw, src2, src2w);
2001 			FAIL_IF(!inst);
2002 			*inst = op_rm;
2003 		}
2004 		else if (FAST_IS_REG(src2)) {
2005 			/* Special exception for sljit_emit_op_flags. */
2006 			inst = emit_x86_instruction(compiler, 1, src2, src2w, dst, dstw);
2007 			FAIL_IF(!inst);
2008 			*inst = op_mr;
2009 		}
2010 		else {
2011 			EMIT_MOV(compiler, TMP_REG1, 0, src2, src2w);
2012 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
2013 			FAIL_IF(!inst);
2014 			*inst = op_mr;
2015 		}
2016 		return SLJIT_SUCCESS;
2017 	}
2018 
2019 	/* Only for cumulative operations. */
2020 	if (dst == src2 && dstw == src2w) {
2021 		if (src1 == SLJIT_IMM) {
2022 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2023 			if ((dst == SLJIT_R0) && (src1w > 127 || src1w < -128) && (compiler->mode32 || IS_HALFWORD(src1w))) {
2024 #else
2025 			if ((dst == SLJIT_R0) && (src1w > 127 || src1w < -128)) {
2026 #endif
2027 				BINARY_EAX_IMM(op_eax_imm, src1w);
2028 			}
2029 			else {
2030 				BINARY_IMM(op_imm, op_mr, src1w, dst, dstw);
2031 			}
2032 		}
2033 		else if (FAST_IS_REG(dst)) {
2034 			inst = emit_x86_instruction(compiler, 1, dst, dstw, src1, src1w);
2035 			FAIL_IF(!inst);
2036 			*inst = op_rm;
2037 		}
2038 		else if (FAST_IS_REG(src1)) {
2039 			inst = emit_x86_instruction(compiler, 1, src1, src1w, dst, dstw);
2040 			FAIL_IF(!inst);
2041 			*inst = op_mr;
2042 		}
2043 		else {
2044 			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2045 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
2046 			FAIL_IF(!inst);
2047 			*inst = op_mr;
2048 		}
2049 		return SLJIT_SUCCESS;
2050 	}
2051 
2052 	/* General version. */
2053 	if (FAST_IS_REG(dst)) {
2054 		EMIT_MOV(compiler, dst, 0, src1, src1w);
2055 		if (src2 == SLJIT_IMM) {
2056 			BINARY_IMM(op_imm, op_mr, src2w, dst, 0);
2057 		}
2058 		else {
2059 			inst = emit_x86_instruction(compiler, 1, dst, 0, src2, src2w);
2060 			FAIL_IF(!inst);
2061 			*inst = op_rm;
2062 		}
2063 	}
2064 	else {
2065 		/* This version requires less memory writing. */
2066 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2067 		if (src2 == SLJIT_IMM) {
2068 			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
2069 		}
2070 		else {
2071 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
2072 			FAIL_IF(!inst);
2073 			*inst = op_rm;
2074 		}
2075 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
2076 	}
2077 
2078 	return SLJIT_SUCCESS;
2079 }
2080 
2081 static sljit_s32 emit_non_cum_binary(struct sljit_compiler *compiler,
2082 	sljit_u32 op_types,
2083 	sljit_s32 dst, sljit_sw dstw,
2084 	sljit_s32 src1, sljit_sw src1w,
2085 	sljit_s32 src2, sljit_sw src2w)
2086 {
2087 	sljit_u8* inst;
2088 	sljit_u8 op_eax_imm = U8(op_types >> 24);
2089 	sljit_u8 op_rm = U8((op_types >> 16) & 0xff);
2090 	sljit_u8 op_mr = U8((op_types >> 8) & 0xff);
2091 	sljit_u8 op_imm = U8(op_types & 0xff);
2092 
2093 	if (dst == src1 && dstw == src1w) {
2094 		if (src2 == SLJIT_IMM) {
2095 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2096 			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
2097 #else
2098 			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128)) {
2099 #endif
2100 				BINARY_EAX_IMM(op_eax_imm, src2w);
2101 			}
2102 			else {
2103 				BINARY_IMM(op_imm, op_mr, src2w, dst, dstw);
2104 			}
2105 		}
2106 		else if (FAST_IS_REG(dst)) {
2107 			inst = emit_x86_instruction(compiler, 1, dst, dstw, src2, src2w);
2108 			FAIL_IF(!inst);
2109 			*inst = op_rm;
2110 		}
2111 		else if (FAST_IS_REG(src2)) {
2112 			inst = emit_x86_instruction(compiler, 1, src2, src2w, dst, dstw);
2113 			FAIL_IF(!inst);
2114 			*inst = op_mr;
2115 		}
2116 		else {
2117 			EMIT_MOV(compiler, TMP_REG1, 0, src2, src2w);
2118 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
2119 			FAIL_IF(!inst);
2120 			*inst = op_mr;
2121 		}
2122 		return SLJIT_SUCCESS;
2123 	}
2124 
2125 	/* General version. */
2126 	if (FAST_IS_REG(dst) && dst != src2) {
2127 		EMIT_MOV(compiler, dst, 0, src1, src1w);
2128 		if (src2 == SLJIT_IMM) {
2129 			BINARY_IMM(op_imm, op_mr, src2w, dst, 0);
2130 		}
2131 		else {
2132 			inst = emit_x86_instruction(compiler, 1, dst, 0, src2, src2w);
2133 			FAIL_IF(!inst);
2134 			*inst = op_rm;
2135 		}
2136 	}
2137 	else {
2138 		/* This version requires less memory writing. */
2139 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2140 		if (src2 == SLJIT_IMM) {
2141 			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
2142 		}
2143 		else {
2144 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
2145 			FAIL_IF(!inst);
2146 			*inst = op_rm;
2147 		}
2148 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
2149 	}
2150 
2151 	return SLJIT_SUCCESS;
2152 }
2153 
2154 static sljit_s32 emit_mul(struct sljit_compiler *compiler,
2155 	sljit_s32 dst, sljit_sw dstw,
2156 	sljit_s32 src1, sljit_sw src1w,
2157 	sljit_s32 src2, sljit_sw src2w)
2158 {
2159 	sljit_u8* inst;
2160 	sljit_s32 dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
2161 
2162 	/* Register destination. */
2163 	if (dst_r == src1 && src2 != SLJIT_IMM) {
2164 		FAIL_IF(emit_groupf(compiler, IMUL_r_rm, dst_r, src2, src2w));
2165 	} else if (dst_r == src2 && src1 != SLJIT_IMM) {
2166 		FAIL_IF(emit_groupf(compiler, IMUL_r_rm, dst_r, src1, src1w));
2167 	} else if (src1 == SLJIT_IMM) {
2168 		if (src2 == SLJIT_IMM) {
2169 			EMIT_MOV(compiler, dst_r, 0, SLJIT_IMM, src2w);
2170 			src2 = dst_r;
2171 			src2w = 0;
2172 		}
2173 
2174 		if (src1w <= 127 && src1w >= -128) {
2175 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
2176 			FAIL_IF(!inst);
2177 			*inst = IMUL_r_rm_i8;
2178 
2179 			FAIL_IF(emit_byte(compiler, U8(src1w)));
2180 		}
2181 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2182 		else {
2183 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
2184 			FAIL_IF(!inst);
2185 			*inst = IMUL_r_rm_i32;
2186 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
2187 			FAIL_IF(!inst);
2188 			INC_SIZE(4);
2189 			sljit_unaligned_store_sw(inst, src1w);
2190 		}
2191 #else
2192 		else if (IS_HALFWORD(src1w)) {
2193 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
2194 			FAIL_IF(!inst);
2195 			*inst = IMUL_r_rm_i32;
2196 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
2197 			FAIL_IF(!inst);
2198 			INC_SIZE(4);
2199 			sljit_unaligned_store_s32(inst, (sljit_s32)src1w);
2200 		}
2201 		else {
2202 			if (dst_r != src2)
2203 				EMIT_MOV(compiler, dst_r, 0, src2, src2w);
2204 			FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src1w));
2205 			FAIL_IF(emit_groupf(compiler, IMUL_r_rm, dst_r, TMP_REG2, 0));
2206 		}
2207 #endif
2208 	}
2209 	else if (src2 == SLJIT_IMM) {
2210 		/* Note: src1 is NOT immediate. */
2211 
2212 		if (src2w <= 127 && src2w >= -128) {
2213 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
2214 			FAIL_IF(!inst);
2215 			*inst = IMUL_r_rm_i8;
2216 
2217 			FAIL_IF(emit_byte(compiler, U8(src2w)));
2218 		}
2219 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2220 		else {
2221 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
2222 			FAIL_IF(!inst);
2223 			*inst = IMUL_r_rm_i32;
2224 
2225 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
2226 			FAIL_IF(!inst);
2227 			INC_SIZE(4);
2228 			sljit_unaligned_store_sw(inst, src2w);
2229 		}
2230 #else
2231 		else if (IS_HALFWORD(src2w)) {
2232 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
2233 			FAIL_IF(!inst);
2234 			*inst = IMUL_r_rm_i32;
2235 
2236 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
2237 			FAIL_IF(!inst);
2238 			INC_SIZE(4);
2239 			sljit_unaligned_store_s32(inst, (sljit_s32)src2w);
2240 		} else {
2241 			if (dst_r != src1)
2242 				EMIT_MOV(compiler, dst_r, 0, src1, src1w);
2243 			FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src2w));
2244 			FAIL_IF(emit_groupf(compiler, IMUL_r_rm, dst_r, TMP_REG2, 0));
2245 		}
2246 #endif
2247 	} else {
2248 		/* Neither argument is immediate. */
2249 		if (ADDRESSING_DEPENDS_ON(src2, dst_r))
2250 			dst_r = TMP_REG1;
2251 		EMIT_MOV(compiler, dst_r, 0, src1, src1w);
2252 		FAIL_IF(emit_groupf(compiler, IMUL_r_rm, dst_r, src2, src2w));
2253 	}
2254 
2255 	if (dst & SLJIT_MEM)
2256 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
2257 
2258 	return SLJIT_SUCCESS;
2259 }
2260 
2261 static sljit_s32 emit_lea_binary(struct sljit_compiler *compiler,
2262 	sljit_s32 dst, sljit_sw dstw,
2263 	sljit_s32 src1, sljit_sw src1w,
2264 	sljit_s32 src2, sljit_sw src2w)
2265 {
2266 	sljit_u8* inst;
2267 	sljit_s32 dst_r, done = 0;
2268 
2269 	/* These cases better be left to handled by normal way. */
2270 	if (dst == src1 && dstw == src1w)
2271 		return SLJIT_ERR_UNSUPPORTED;
2272 	if (dst == src2 && dstw == src2w)
2273 		return SLJIT_ERR_UNSUPPORTED;
2274 
2275 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
2276 
2277 	if (FAST_IS_REG(src1)) {
2278 		if (FAST_IS_REG(src2)) {
2279 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM2(src1, src2), 0);
2280 			FAIL_IF(!inst);
2281 			*inst = LEA_r_m;
2282 			done = 1;
2283 		}
2284 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2285 		if (src2 == SLJIT_IMM && (compiler->mode32 || IS_HALFWORD(src2w))) {
2286 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src1), (sljit_s32)src2w);
2287 #else
2288 		if (src2 == SLJIT_IMM) {
2289 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src1), src2w);
2290 #endif
2291 			FAIL_IF(!inst);
2292 			*inst = LEA_r_m;
2293 			done = 1;
2294 		}
2295 	}
2296 	else if (FAST_IS_REG(src2)) {
2297 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2298 		if (src1 == SLJIT_IMM && (compiler->mode32 || IS_HALFWORD(src1w))) {
2299 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src2), (sljit_s32)src1w);
2300 #else
2301 		if (src1 == SLJIT_IMM) {
2302 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src2), src1w);
2303 #endif
2304 			FAIL_IF(!inst);
2305 			*inst = LEA_r_m;
2306 			done = 1;
2307 		}
2308 	}
2309 
2310 	if (done) {
2311 		if (dst_r == TMP_REG1)
2312 			return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
2313 		return SLJIT_SUCCESS;
2314 	}
2315 	return SLJIT_ERR_UNSUPPORTED;
2316 }
2317 
2318 static sljit_s32 emit_cmp_binary(struct sljit_compiler *compiler,
2319 	sljit_s32 src1, sljit_sw src1w,
2320 	sljit_s32 src2, sljit_sw src2w)
2321 {
2322 	sljit_u8* inst;
2323 
2324 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2325 	if (src1 == SLJIT_R0 && src2 == SLJIT_IMM && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
2326 #else
2327 	if (src1 == SLJIT_R0 && src2 == SLJIT_IMM && (src2w > 127 || src2w < -128)) {
2328 #endif
2329 		BINARY_EAX_IMM(CMP_EAX_i32, src2w);
2330 		return SLJIT_SUCCESS;
2331 	}
2332 
2333 	if (FAST_IS_REG(src1)) {
2334 		if (src2 == SLJIT_IMM) {
2335 			BINARY_IMM(CMP, CMP_rm_r, src2w, src1, 0);
2336 		}
2337 		else {
2338 			inst = emit_x86_instruction(compiler, 1, src1, 0, src2, src2w);
2339 			FAIL_IF(!inst);
2340 			*inst = CMP_r_rm;
2341 		}
2342 		return SLJIT_SUCCESS;
2343 	}
2344 
2345 	if (FAST_IS_REG(src2) && src1 != SLJIT_IMM) {
2346 		inst = emit_x86_instruction(compiler, 1, src2, 0, src1, src1w);
2347 		FAIL_IF(!inst);
2348 		*inst = CMP_rm_r;
2349 		return SLJIT_SUCCESS;
2350 	}
2351 
2352 	if (src2 == SLJIT_IMM) {
2353 		if (src1 == SLJIT_IMM) {
2354 			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2355 			src1 = TMP_REG1;
2356 			src1w = 0;
2357 		}
2358 		BINARY_IMM(CMP, CMP_rm_r, src2w, src1, src1w);
2359 	}
2360 	else {
2361 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2362 		inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
2363 		FAIL_IF(!inst);
2364 		*inst = CMP_r_rm;
2365 	}
2366 	return SLJIT_SUCCESS;
2367 }
2368 
2369 static sljit_s32 emit_test_binary(struct sljit_compiler *compiler,
2370 	sljit_s32 src1, sljit_sw src1w,
2371 	sljit_s32 src2, sljit_sw src2w)
2372 {
2373 	sljit_u8* inst;
2374 
2375 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2376 	if (src1 == SLJIT_R0 && src2 == SLJIT_IMM && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
2377 #else
2378 	if (src1 == SLJIT_R0 && src2 == SLJIT_IMM && (src2w > 127 || src2w < -128)) {
2379 #endif
2380 		BINARY_EAX_IMM(TEST_EAX_i32, src2w);
2381 		return SLJIT_SUCCESS;
2382 	}
2383 
2384 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2385 	if (src2 == SLJIT_R0 && src1 == SLJIT_IMM && (src1w > 127 || src1w < -128) && (compiler->mode32 || IS_HALFWORD(src1w))) {
2386 #else
2387 	if (src2 == SLJIT_R0 && src1 == SLJIT_IMM && (src1w > 127 || src1w < -128)) {
2388 #endif
2389 		BINARY_EAX_IMM(TEST_EAX_i32, src1w);
2390 		return SLJIT_SUCCESS;
2391 	}
2392 
2393 	if (src1 != SLJIT_IMM) {
2394 		if (src2 == SLJIT_IMM) {
2395 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2396 			if (IS_HALFWORD(src2w) || compiler->mode32) {
2397 				inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, src1w);
2398 				FAIL_IF(!inst);
2399 				*inst = GROUP_F7;
2400 			} else {
2401 				FAIL_IF(emit_load_imm64(compiler, FAST_IS_REG(src1) ? TMP_REG2 : TMP_REG1, src2w));
2402 				inst = emit_x86_instruction(compiler, 1, FAST_IS_REG(src1) ? TMP_REG2 : TMP_REG1, 0, src1, src1w);
2403 				FAIL_IF(!inst);
2404 				*inst = TEST_rm_r;
2405 			}
2406 #else
2407 			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, src1w);
2408 			FAIL_IF(!inst);
2409 			*inst = GROUP_F7;
2410 #endif
2411 			return SLJIT_SUCCESS;
2412 		}
2413 		else if (FAST_IS_REG(src1)) {
2414 			inst = emit_x86_instruction(compiler, 1, src1, 0, src2, src2w);
2415 			FAIL_IF(!inst);
2416 			*inst = TEST_rm_r;
2417 			return SLJIT_SUCCESS;
2418 		}
2419 	}
2420 
2421 	if (src2 != SLJIT_IMM) {
2422 		if (src1 == SLJIT_IMM) {
2423 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2424 			if (IS_HALFWORD(src1w) || compiler->mode32) {
2425 				inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src1w, src2, src2w);
2426 				FAIL_IF(!inst);
2427 				*inst = GROUP_F7;
2428 			}
2429 			else {
2430 				FAIL_IF(emit_load_imm64(compiler, TMP_REG1, src1w));
2431 				inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
2432 				FAIL_IF(!inst);
2433 				*inst = TEST_rm_r;
2434 			}
2435 #else
2436 			inst = emit_x86_instruction(compiler, 1, src1, src1w, src2, src2w);
2437 			FAIL_IF(!inst);
2438 			*inst = GROUP_F7;
2439 #endif
2440 			return SLJIT_SUCCESS;
2441 		}
2442 		else if (FAST_IS_REG(src2)) {
2443 			inst = emit_x86_instruction(compiler, 1, src2, 0, src1, src1w);
2444 			FAIL_IF(!inst);
2445 			*inst = TEST_rm_r;
2446 			return SLJIT_SUCCESS;
2447 		}
2448 	}
2449 
2450 	EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2451 	if (src2 == SLJIT_IMM) {
2452 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2453 		if (IS_HALFWORD(src2w) || compiler->mode32) {
2454 			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, TMP_REG1, 0);
2455 			FAIL_IF(!inst);
2456 			*inst = GROUP_F7;
2457 		}
2458 		else {
2459 			FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src2w));
2460 			inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, TMP_REG1, 0);
2461 			FAIL_IF(!inst);
2462 			*inst = TEST_rm_r;
2463 		}
2464 #else
2465 		inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, TMP_REG1, 0);
2466 		FAIL_IF(!inst);
2467 		*inst = GROUP_F7;
2468 #endif
2469 	}
2470 	else {
2471 		inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
2472 		FAIL_IF(!inst);
2473 		*inst = TEST_rm_r;
2474 	}
2475 	return SLJIT_SUCCESS;
2476 }
2477 
2478 static sljit_s32 emit_shift(struct sljit_compiler *compiler,
2479 	sljit_u8 mode,
2480 	sljit_s32 dst, sljit_sw dstw,
2481 	sljit_s32 src1, sljit_sw src1w,
2482 	sljit_s32 src2, sljit_sw src2w)
2483 {
2484 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2485 	sljit_s32 mode32;
2486 #endif
2487 	sljit_u8* inst;
2488 
2489 	if (src2 == SLJIT_IMM || src2 == SLJIT_PREF_SHIFT_REG) {
2490 		if (dst == src1 && dstw == src1w) {
2491 			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, dst, dstw);
2492 			FAIL_IF(!inst);
2493 			inst[1] |= mode;
2494 			return SLJIT_SUCCESS;
2495 		}
2496 		if (dst == SLJIT_PREF_SHIFT_REG && src2 == SLJIT_PREF_SHIFT_REG) {
2497 			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2498 			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2499 			FAIL_IF(!inst);
2500 			inst[1] |= mode;
2501 			EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2502 			return SLJIT_SUCCESS;
2503 		}
2504 		if (FAST_IS_REG(dst)) {
2505 			EMIT_MOV(compiler, dst, 0, src1, src1w);
2506 			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, dst, 0);
2507 			FAIL_IF(!inst);
2508 			inst[1] |= mode;
2509 			return SLJIT_SUCCESS;
2510 		}
2511 
2512 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2513 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, TMP_REG1, 0);
2514 		FAIL_IF(!inst);
2515 		inst[1] |= mode;
2516 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
2517 		return SLJIT_SUCCESS;
2518 	}
2519 
2520 	if (dst == SLJIT_PREF_SHIFT_REG) {
2521 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2522 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
2523 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2524 		FAIL_IF(!inst);
2525 		inst[1] |= mode;
2526 		return emit_mov(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2527 	}
2528 
2529 	if (FAST_IS_REG(dst) && dst != src2 && dst != TMP_REG1 && !ADDRESSING_DEPENDS_ON(src2, dst)) {
2530 		if (src1 != dst)
2531 			EMIT_MOV(compiler, dst, 0, src1, src1w);
2532 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2533 		mode32 = compiler->mode32;
2534 		compiler->mode32 = 0;
2535 #endif
2536 		EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_PREF_SHIFT_REG, 0);
2537 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2538 		compiler->mode32 = mode32;
2539 #endif
2540 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
2541 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, dst, 0);
2542 		FAIL_IF(!inst);
2543 		inst[1] |= mode;
2544 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2545 		compiler->mode32 = 0;
2546 #endif
2547 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2548 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2549 		compiler->mode32 = mode32;
2550 #endif
2551 		return SLJIT_SUCCESS;
2552 	}
2553 
2554 	/* This case is complex since ecx itself may be used for
2555 	   addressing, and this case must be supported as well. */
2556 	EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2557 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2558 	EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), 0, SLJIT_PREF_SHIFT_REG, 0);
2559 #else /* !SLJIT_CONFIG_X86_32 */
2560 	mode32 = compiler->mode32;
2561 	compiler->mode32 = 0;
2562 	EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_PREF_SHIFT_REG, 0);
2563 	compiler->mode32 = mode32;
2564 #endif /* SLJIT_CONFIG_X86_32 */
2565 
2566 	EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
2567 	inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2568 	FAIL_IF(!inst);
2569 	inst[1] |= mode;
2570 
2571 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2572 	EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, SLJIT_MEM1(SLJIT_SP), 0);
2573 #else
2574 	compiler->mode32 = 0;
2575 	EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG2, 0);
2576 	compiler->mode32 = mode32;
2577 #endif /* SLJIT_CONFIG_X86_32 */
2578 
2579 	if (dst != TMP_REG1)
2580 		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
2581 
2582 	return SLJIT_SUCCESS;
2583 }
2584 
2585 static sljit_s32 emit_shift_with_flags(struct sljit_compiler *compiler,
2586 	sljit_u8 mode, sljit_s32 set_flags,
2587 	sljit_s32 dst, sljit_sw dstw,
2588 	sljit_s32 src1, sljit_sw src1w,
2589 	sljit_s32 src2, sljit_sw src2w)
2590 {
2591 	/* The CPU does not set flags if the shift count is 0. */
2592 	if (src2 == SLJIT_IMM) {
2593 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2594 		src2w &= compiler->mode32 ? 0x1f : 0x3f;
2595 #else /* !SLJIT_CONFIG_X86_64 */
2596 		src2w &= 0x1f;
2597 #endif /* SLJIT_CONFIG_X86_64 */
2598 		if (src2w != 0)
2599 			return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);
2600 
2601 		if (!set_flags)
2602 			return emit_mov(compiler, dst, dstw, src1, src1w);
2603 		/* OR dst, src, 0 */
2604 		return emit_cum_binary(compiler, BINARY_OPCODE(OR),
2605 			dst, dstw, src1, src1w, SLJIT_IMM, 0);
2606 	}
2607 
2608 	if (!set_flags)
2609 		return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);
2610 
2611 	if (!FAST_IS_REG(dst))
2612 		FAIL_IF(emit_cmp_binary(compiler, src1, src1w, SLJIT_IMM, 0));
2613 
2614 	FAIL_IF(emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w));
2615 
2616 	if (FAST_IS_REG(dst))
2617 		return emit_cmp_binary(compiler, dst, dstw, SLJIT_IMM, 0);
2618 	return SLJIT_SUCCESS;
2619 }
2620 
2621 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2(struct sljit_compiler *compiler, sljit_s32 op,
2622 	sljit_s32 dst, sljit_sw dstw,
2623 	sljit_s32 src1, sljit_sw src1w,
2624 	sljit_s32 src2, sljit_sw src2w)
2625 {
2626 	CHECK_ERROR();
2627 	CHECK(check_sljit_emit_op2(compiler, op, 0, dst, dstw, src1, src1w, src2, src2w));
2628 	ADJUST_LOCAL_OFFSET(dst, dstw);
2629 	ADJUST_LOCAL_OFFSET(src1, src1w);
2630 	ADJUST_LOCAL_OFFSET(src2, src2w);
2631 
2632 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
2633 	CHECK_EXTRA_REGS(src1, src1w, (void)0);
2634 	CHECK_EXTRA_REGS(src2, src2w, (void)0);
2635 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2636 	compiler->mode32 = op & SLJIT_32;
2637 #endif
2638 
2639 	switch (GET_OPCODE(op)) {
2640 	case SLJIT_ADD:
2641 		if (!HAS_FLAGS(op)) {
2642 			if (emit_lea_binary(compiler, dst, dstw, src1, src1w, src2, src2w) != SLJIT_ERR_UNSUPPORTED)
2643 				return compiler->error;
2644 		}
2645 		return emit_cum_binary(compiler, BINARY_OPCODE(ADD),
2646 			dst, dstw, src1, src1w, src2, src2w);
2647 	case SLJIT_ADDC:
2648 		return emit_cum_binary(compiler, BINARY_OPCODE(ADC),
2649 			dst, dstw, src1, src1w, src2, src2w);
2650 	case SLJIT_SUB:
2651 		if (src1 == SLJIT_IMM && src1w == 0)
2652 			return emit_unary(compiler, NEG_rm, dst, dstw, src2, src2w);
2653 
2654 		if (!HAS_FLAGS(op)) {
2655 			if (src2 == SLJIT_IMM && emit_lea_binary(compiler, dst, dstw, src1, src1w, SLJIT_IMM, -src2w) != SLJIT_ERR_UNSUPPORTED)
2656 				return compiler->error;
2657 			if (FAST_IS_REG(dst) && src2 == dst) {
2658 				FAIL_IF(emit_non_cum_binary(compiler, BINARY_OPCODE(SUB), dst, 0, dst, 0, src1, src1w));
2659 				return emit_unary(compiler, NEG_rm, dst, 0, dst, 0);
2660 			}
2661 		}
2662 
2663 		return emit_non_cum_binary(compiler, BINARY_OPCODE(SUB),
2664 			dst, dstw, src1, src1w, src2, src2w);
2665 	case SLJIT_SUBC:
2666 		return emit_non_cum_binary(compiler, BINARY_OPCODE(SBB),
2667 			dst, dstw, src1, src1w, src2, src2w);
2668 	case SLJIT_MUL:
2669 		return emit_mul(compiler, dst, dstw, src1, src1w, src2, src2w);
2670 	case SLJIT_AND:
2671 		return emit_cum_binary(compiler, BINARY_OPCODE(AND),
2672 			dst, dstw, src1, src1w, src2, src2w);
2673 	case SLJIT_OR:
2674 		return emit_cum_binary(compiler, BINARY_OPCODE(OR),
2675 			dst, dstw, src1, src1w, src2, src2w);
2676 	case SLJIT_XOR:
2677 		if (!HAS_FLAGS(op)) {
2678 			if (src2 == SLJIT_IMM && src2w == -1)
2679 				return emit_unary(compiler, NOT_rm, dst, dstw, src1, src1w);
2680 			if (src1 == SLJIT_IMM && src1w == -1)
2681 				return emit_unary(compiler, NOT_rm, dst, dstw, src2, src2w);
2682 		}
2683 
2684 		return emit_cum_binary(compiler, BINARY_OPCODE(XOR),
2685 			dst, dstw, src1, src1w, src2, src2w);
2686 	case SLJIT_SHL:
2687 	case SLJIT_MSHL:
2688 		return emit_shift_with_flags(compiler, SHL, HAS_FLAGS(op),
2689 			dst, dstw, src1, src1w, src2, src2w);
2690 	case SLJIT_LSHR:
2691 	case SLJIT_MLSHR:
2692 		return emit_shift_with_flags(compiler, SHR, HAS_FLAGS(op),
2693 			dst, dstw, src1, src1w, src2, src2w);
2694 	case SLJIT_ASHR:
2695 	case SLJIT_MASHR:
2696 		return emit_shift_with_flags(compiler, SAR, HAS_FLAGS(op),
2697 			dst, dstw, src1, src1w, src2, src2w);
2698 	case SLJIT_ROTL:
2699 		return emit_shift_with_flags(compiler, ROL, 0,
2700 			dst, dstw, src1, src1w, src2, src2w);
2701 	case SLJIT_ROTR:
2702 		return emit_shift_with_flags(compiler, ROR, 0,
2703 			dst, dstw, src1, src1w, src2, src2w);
2704 	}
2705 
2706 	return SLJIT_SUCCESS;
2707 }
2708 
2709 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2u(struct sljit_compiler *compiler, sljit_s32 op,
2710 	sljit_s32 src1, sljit_sw src1w,
2711 	sljit_s32 src2, sljit_sw src2w)
2712 {
2713 	sljit_s32 opcode = GET_OPCODE(op);
2714 
2715 	CHECK_ERROR();
2716 	CHECK(check_sljit_emit_op2(compiler, op, 1, 0, 0, src1, src1w, src2, src2w));
2717 
2718 	if (opcode != SLJIT_SUB && opcode != SLJIT_AND) {
2719 		SLJIT_SKIP_CHECKS(compiler);
2720 		return sljit_emit_op2(compiler, op, TMP_REG1, 0, src1, src1w, src2, src2w);
2721 	}
2722 
2723 	ADJUST_LOCAL_OFFSET(src1, src1w);
2724 	ADJUST_LOCAL_OFFSET(src2, src2w);
2725 
2726 	CHECK_EXTRA_REGS(src1, src1w, (void)0);
2727 	CHECK_EXTRA_REGS(src2, src2w, (void)0);
2728 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2729 	compiler->mode32 = op & SLJIT_32;
2730 #endif
2731 
2732 	if (opcode == SLJIT_SUB)
2733 		return emit_cmp_binary(compiler, src1, src1w, src2, src2w);
2734 
2735 	return emit_test_binary(compiler, src1, src1w, src2, src2w);
2736 }
2737 
2738 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2r(struct sljit_compiler *compiler, sljit_s32 op,
2739 	sljit_s32 dst_reg,
2740 	sljit_s32 src1, sljit_sw src1w,
2741 	sljit_s32 src2, sljit_sw src2w)
2742 {
2743 	sljit_u8* inst;
2744 	sljit_sw dstw = 0;
2745 
2746 	CHECK_ERROR();
2747 	CHECK(check_sljit_emit_op2r(compiler, op, dst_reg, src1, src1w, src2, src2w));
2748 	ADJUST_LOCAL_OFFSET(src1, src1w);
2749 	ADJUST_LOCAL_OFFSET(src2, src2w);
2750 
2751 	CHECK_EXTRA_REGS(dst_reg, dstw, (void)0);
2752 	CHECK_EXTRA_REGS(src1, src1w, (void)0);
2753 	CHECK_EXTRA_REGS(src2, src2w, (void)0);
2754 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2755 	compiler->mode32 = op & SLJIT_32;
2756 #endif
2757 
2758 	switch (GET_OPCODE(op)) {
2759 	case SLJIT_MULADD:
2760 		FAIL_IF(emit_mul(compiler, TMP_REG1, 0, src1, src1w, src2, src2w));
2761 		inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst_reg, dstw);
2762 		FAIL_IF(!inst);
2763 		*inst = ADD_rm_r;
2764 		return SLJIT_SUCCESS;
2765 	}
2766 
2767 	return SLJIT_SUCCESS;
2768 }
2769 
2770 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_shift_into(struct sljit_compiler *compiler, sljit_s32 op,
2771 	sljit_s32 dst_reg,
2772 	sljit_s32 src1_reg,
2773 	sljit_s32 src2_reg,
2774 	sljit_s32 src3, sljit_sw src3w)
2775 {
2776 	sljit_s32 is_rotate, is_left, move_src1;
2777 	sljit_u8* inst;
2778 	sljit_sw src1w = 0;
2779 	sljit_sw dstw = 0;
2780 	/* The whole register must be saved even for 32 bit operations. */
2781 	sljit_u8 restore_ecx = 0;
2782 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2783 	sljit_sw src2w = 0;
2784 	sljit_s32 restore_sp4 = 0;
2785 #endif /* SLJIT_CONFIG_X86_32 */
2786 
2787 	CHECK_ERROR();
2788 	CHECK(check_sljit_emit_shift_into(compiler, op, dst_reg, src1_reg, src2_reg, src3, src3w));
2789 	ADJUST_LOCAL_OFFSET(src3, src3w);
2790 
2791 	CHECK_EXTRA_REGS(dst_reg, dstw, (void)0);
2792 	CHECK_EXTRA_REGS(src3, src3w, (void)0);
2793 
2794 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2795 	compiler->mode32 = op & SLJIT_32;
2796 #endif /* SLJIT_CONFIG_X86_64 */
2797 
2798 	if (src3 == SLJIT_IMM) {
2799 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2800 		src3w &= 0x1f;
2801 #else /* !SLJIT_CONFIG_X86_32 */
2802 		src3w &= (op & SLJIT_32) ? 0x1f : 0x3f;
2803 #endif /* SLJIT_CONFIG_X86_32 */
2804 
2805 		if (src3w == 0)
2806 			return SLJIT_SUCCESS;
2807 	}
2808 
2809 	is_left = (GET_OPCODE(op) == SLJIT_SHL || GET_OPCODE(op) == SLJIT_MSHL);
2810 
2811 	is_rotate = (src1_reg == src2_reg);
2812 	CHECK_EXTRA_REGS(src1_reg, src1w, (void)0);
2813 	CHECK_EXTRA_REGS(src2_reg, src2w, (void)0);
2814 
2815 	if (is_rotate)
2816 		return emit_shift(compiler, is_left ? ROL : ROR, dst_reg, dstw, src1_reg, src1w, src3, src3w);
2817 
2818 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2819 	if (src2_reg & SLJIT_MEM) {
2820 		EMIT_MOV(compiler, TMP_REG1, 0, src2_reg, src2w);
2821 		src2_reg = TMP_REG1;
2822 	}
2823 #endif /* SLJIT_CONFIG_X86_32 */
2824 
2825 	if (dst_reg == SLJIT_PREF_SHIFT_REG && src3 != SLJIT_IMM && (src3 != SLJIT_PREF_SHIFT_REG || src1_reg != SLJIT_PREF_SHIFT_REG)) {
2826 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2827 		EMIT_MOV(compiler, TMP_REG1, 0, src1_reg, src1w);
2828 		src1_reg = TMP_REG1;
2829 		src1w = 0;
2830 #else /* !SLJIT_CONFIG_X86_64 */
2831 		if (src2_reg != TMP_REG1) {
2832 			EMIT_MOV(compiler, TMP_REG1, 0, src1_reg, src1w);
2833 			src1_reg = TMP_REG1;
2834 			src1w = 0;
2835 		} else if ((src1_reg & SLJIT_MEM) || src1_reg == SLJIT_PREF_SHIFT_REG) {
2836 			restore_sp4 = (src3 == SLJIT_R0) ? SLJIT_R1 : SLJIT_R0;
2837 			EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_s32), restore_sp4, 0);
2838 			EMIT_MOV(compiler, restore_sp4, 0, src1_reg, src1w);
2839 			src1_reg = restore_sp4;
2840 			src1w = 0;
2841 		} else {
2842 			EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_s32), src1_reg, 0);
2843 			restore_sp4 = src1_reg;
2844 		}
2845 #endif /* SLJIT_CONFIG_X86_64 */
2846 
2847 		if (src3 != SLJIT_PREF_SHIFT_REG)
2848 			EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src3, src3w);
2849 	} else {
2850 		if (src2_reg == SLJIT_PREF_SHIFT_REG && src3 != SLJIT_IMM && src3 != SLJIT_PREF_SHIFT_REG) {
2851 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2852 			compiler->mode32 = 0;
2853 #endif /* SLJIT_CONFIG_X86_64 */
2854 			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_PREF_SHIFT_REG, 0);
2855 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2856 			compiler->mode32 = op & SLJIT_32;
2857 #endif /* SLJIT_CONFIG_X86_64 */
2858 			src2_reg = TMP_REG1;
2859 			restore_ecx = 1;
2860 		}
2861 
2862 		move_src1 = 0;
2863 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2864 		if (dst_reg != src1_reg) {
2865 			if (dst_reg != src3) {
2866 				EMIT_MOV(compiler, dst_reg, 0, src1_reg, src1w);
2867 				src1_reg = dst_reg;
2868 				src1w = 0;
2869 			} else
2870 				move_src1 = 1;
2871 		}
2872 #else /* !SLJIT_CONFIG_X86_64 */
2873 		if (dst_reg & SLJIT_MEM) {
2874 			if (src2_reg != TMP_REG1) {
2875 				EMIT_MOV(compiler, TMP_REG1, 0, src1_reg, src1w);
2876 				src1_reg = TMP_REG1;
2877 				src1w = 0;
2878 			} else if ((src1_reg & SLJIT_MEM) || src1_reg == SLJIT_PREF_SHIFT_REG) {
2879 				restore_sp4 = (src3 == SLJIT_R0) ? SLJIT_R1 : SLJIT_R0;
2880 				EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_s32), restore_sp4, 0);
2881 				EMIT_MOV(compiler, restore_sp4, 0, src1_reg, src1w);
2882 				src1_reg = restore_sp4;
2883 				src1w = 0;
2884 			} else {
2885 				EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_s32), src1_reg, 0);
2886 				restore_sp4 = src1_reg;
2887 			}
2888 		} else if (dst_reg != src1_reg) {
2889 			if (dst_reg != src3) {
2890 				EMIT_MOV(compiler, dst_reg, 0, src1_reg, src1w);
2891 				src1_reg = dst_reg;
2892 				src1w = 0;
2893 			} else
2894 				move_src1 = 1;
2895 		}
2896 #endif /* SLJIT_CONFIG_X86_64 */
2897 
2898 		if (src3 != SLJIT_IMM && src3 != SLJIT_PREF_SHIFT_REG) {
2899 			if (!restore_ecx) {
2900 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2901 				compiler->mode32 = 0;
2902 				EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_PREF_SHIFT_REG, 0);
2903 				compiler->mode32 = op & SLJIT_32;
2904 				restore_ecx = 1;
2905 #else /* !SLJIT_CONFIG_X86_64 */
2906 				if (src1_reg != TMP_REG1 && src2_reg != TMP_REG1) {
2907 					EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_PREF_SHIFT_REG, 0);
2908 					restore_ecx = 1;
2909 				} else {
2910 					EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), 0, SLJIT_PREF_SHIFT_REG, 0);
2911 					restore_ecx = 2;
2912 				}
2913 #endif /* SLJIT_CONFIG_X86_64 */
2914 			}
2915 			EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src3, src3w);
2916 		}
2917 
2918 		if (move_src1) {
2919 			EMIT_MOV(compiler, dst_reg, 0, src1_reg, src1w);
2920 			src1_reg = dst_reg;
2921 			src1w = 0;
2922 		}
2923 	}
2924 
2925 	inst = emit_x86_instruction(compiler, 2, src2_reg, 0, src1_reg, src1w);
2926 	FAIL_IF(!inst);
2927 	inst[0] = GROUP_0F;
2928 
2929 	if (src3 == SLJIT_IMM) {
2930 		inst[1] = U8((is_left ? SHLD : SHRD) - 1);
2931 
2932 		/* Immediate argument is added separately. */
2933 		FAIL_IF(emit_byte(compiler, U8(src3w)));
2934 	} else
2935 		inst[1] = U8(is_left ? SHLD : SHRD);
2936 
2937 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2938 	if (restore_ecx) {
2939 		compiler->mode32 = 0;
2940 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2941 	}
2942 
2943 	if (src1_reg != dst_reg) {
2944 		compiler->mode32 = op & SLJIT_32;
2945 		return emit_mov(compiler, dst_reg, dstw, src1_reg, 0);
2946 	}
2947 #else /* !SLJIT_CONFIG_X86_64 */
2948 	if (restore_ecx)
2949 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, restore_ecx == 1 ? TMP_REG1 : SLJIT_MEM1(SLJIT_SP), 0);
2950 
2951 	if (src1_reg != dst_reg)
2952 		EMIT_MOV(compiler, dst_reg, dstw, src1_reg, 0);
2953 
2954 	if (restore_sp4)
2955 		return emit_mov(compiler, restore_sp4, 0, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_s32));
2956 #endif /* SLJIT_CONFIG_X86_32 */
2957 
2958 	return SLJIT_SUCCESS;
2959 }
2960 
2961 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_src(struct sljit_compiler *compiler, sljit_s32 op,
2962 	sljit_s32 src, sljit_sw srcw)
2963 {
2964 	CHECK_ERROR();
2965 	CHECK(check_sljit_emit_op_src(compiler, op, src, srcw));
2966 	ADJUST_LOCAL_OFFSET(src, srcw);
2967 
2968 	CHECK_EXTRA_REGS(src, srcw, (void)0);
2969 
2970 	switch (op) {
2971 	case SLJIT_FAST_RETURN:
2972 		return emit_fast_return(compiler, src, srcw);
2973 	case SLJIT_SKIP_FRAMES_BEFORE_FAST_RETURN:
2974 		/* Don't adjust shadow stack if it isn't enabled.  */
2975 		if (!cpu_has_shadow_stack ())
2976 			return SLJIT_SUCCESS;
2977 		return adjust_shadow_stack(compiler, src, srcw);
2978 	case SLJIT_PREFETCH_L1:
2979 	case SLJIT_PREFETCH_L2:
2980 	case SLJIT_PREFETCH_L3:
2981 	case SLJIT_PREFETCH_ONCE:
2982 		return emit_prefetch(compiler, op, src, srcw);
2983 	}
2984 
2985 	return SLJIT_SUCCESS;
2986 }
2987 
2988 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_dst(struct sljit_compiler *compiler, sljit_s32 op,
2989 	sljit_s32 dst, sljit_sw dstw)
2990 {
2991 	CHECK_ERROR();
2992 	CHECK(check_sljit_emit_op_dst(compiler, op, dst, dstw));
2993 	ADJUST_LOCAL_OFFSET(dst, dstw);
2994 
2995 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
2996 
2997 	switch (op) {
2998 	case SLJIT_FAST_ENTER:
2999 		return emit_fast_enter(compiler, dst, dstw);
3000 	case SLJIT_GET_RETURN_ADDRESS:
3001 		return sljit_emit_get_return_address(compiler, dst, dstw);
3002 	}
3003 
3004 	return SLJIT_SUCCESS;
3005 }
3006 
3007 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_register_index(sljit_s32 type, sljit_s32 reg)
3008 {
3009 	CHECK_REG_INDEX(check_sljit_get_register_index(type, reg));
3010 
3011 	if (type == SLJIT_GP_REGISTER) {
3012 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
3013 		if (reg >= SLJIT_R3 && reg <= SLJIT_R8)
3014 			return -1;
3015 #endif /* SLJIT_CONFIG_X86_32 */
3016 		return reg_map[reg];
3017 	}
3018 
3019 	if (type != SLJIT_FLOAT_REGISTER && type != SLJIT_SIMD_REG_128 && type != SLJIT_SIMD_REG_256 && type != SLJIT_SIMD_REG_512)
3020 		return -1;
3021 
3022 	return freg_map[reg];
3023 }
3024 
3025 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_custom(struct sljit_compiler *compiler,
3026 	void *instruction, sljit_u32 size)
3027 {
3028 	sljit_u8 *inst;
3029 
3030 	CHECK_ERROR();
3031 	CHECK(check_sljit_emit_op_custom(compiler, instruction, size));
3032 
3033 	inst = (sljit_u8*)ensure_buf(compiler, 1 + size);
3034 	FAIL_IF(!inst);
3035 	INC_SIZE(size);
3036 	SLJIT_MEMCPY(inst, instruction, size);
3037 	return SLJIT_SUCCESS;
3038 }
3039 
3040 /* --------------------------------------------------------------------- */
3041 /*  Floating point operators                                             */
3042 /* --------------------------------------------------------------------- */
3043 
3044 /* Alignment(3) + 4 * 16 bytes. */
3045 static sljit_u32 sse2_data[3 + (4 * 4)];
3046 static sljit_u32 *sse2_buffer;
3047 
3048 static void init_compiler(void)
3049 {
3050 	get_cpu_features();
3051 
3052 	/* Align to 16 bytes. */
3053 	sse2_buffer = (sljit_u32*)(((sljit_uw)sse2_data + 15) & ~(sljit_uw)0xf);
3054 
3055 	/* Single precision constants (each constant is 16 byte long). */
3056 	sse2_buffer[0] = 0x80000000;
3057 	sse2_buffer[4] = 0x7fffffff;
3058 	/* Double precision constants (each constant is 16 byte long). */
3059 	sse2_buffer[8] = 0;
3060 	sse2_buffer[9] = 0x80000000;
3061 	sse2_buffer[12] = 0xffffffff;
3062 	sse2_buffer[13] = 0x7fffffff;
3063 }
3064 
3065 static sljit_s32 emit_groupf(struct sljit_compiler *compiler,
3066 	sljit_uw op,
3067 	sljit_s32 dst, sljit_s32 src, sljit_sw srcw)
3068 {
3069 	sljit_u8 *inst = emit_x86_instruction(compiler, 2 | (op & ~(sljit_uw)0xff), dst, 0, src, srcw);
3070 	FAIL_IF(!inst);
3071 	inst[0] = GROUP_0F;
3072 	inst[1] = op & 0xff;
3073 	return SLJIT_SUCCESS;
3074 }
3075 
3076 static sljit_s32 emit_groupf_ext(struct sljit_compiler *compiler,
3077 	sljit_uw op,
3078 	sljit_s32 dst, sljit_s32 src, sljit_sw srcw)
3079 {
3080 	sljit_u8 *inst;
3081 
3082 	SLJIT_ASSERT((op & EX86_SSE2) && ((op & VEX_OP_0F38) || (op & VEX_OP_0F3A)));
3083 
3084 	inst = emit_x86_instruction(compiler, 3 | (op & ~((sljit_uw)0xff | VEX_OP_0F38 | VEX_OP_0F3A)), dst, 0, src, srcw);
3085 	FAIL_IF(!inst);
3086 	inst[0] = GROUP_0F;
3087 	inst[1] = U8((op & VEX_OP_0F38) ? 0x38 : 0x3A);
3088 	inst[2] = op & 0xff;
3089 	return SLJIT_SUCCESS;
3090 }
3091 
3092 static SLJIT_INLINE sljit_s32 emit_sse2_load(struct sljit_compiler *compiler,
3093 	sljit_s32 single, sljit_s32 dst, sljit_s32 src, sljit_sw srcw)
3094 {
3095 	return emit_groupf(compiler, MOVSD_x_xm | (single ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, dst, src, srcw);
3096 }
3097 
3098 static SLJIT_INLINE sljit_s32 emit_sse2_store(struct sljit_compiler *compiler,
3099 	sljit_s32 single, sljit_s32 dst, sljit_sw dstw, sljit_s32 src)
3100 {
3101 	return emit_groupf(compiler, MOVSD_xm_x | (single ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, src, dst, dstw);
3102 }
3103 
3104 static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_sw_from_f64(struct sljit_compiler *compiler, sljit_s32 op,
3105 	sljit_s32 dst, sljit_sw dstw,
3106 	sljit_s32 src, sljit_sw srcw)
3107 {
3108 	sljit_s32 dst_r;
3109 
3110 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
3111 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
3112 
3113 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3114 	if (GET_OPCODE(op) == SLJIT_CONV_SW_FROM_F64)
3115 		compiler->mode32 = 0;
3116 #endif
3117 
3118 	FAIL_IF(emit_groupf(compiler, CVTTSD2SI_r_xm | EX86_SELECT_F2_F3(op) | EX86_SSE2_OP2, dst_r, src, srcw));
3119 
3120 	if (dst & SLJIT_MEM)
3121 		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
3122 	return SLJIT_SUCCESS;
3123 }
3124 
3125 static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_f64_from_sw(struct sljit_compiler *compiler, sljit_s32 op,
3126 	sljit_s32 dst, sljit_sw dstw,
3127 	sljit_s32 src, sljit_sw srcw)
3128 {
3129 	sljit_s32 dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG;
3130 
3131 	CHECK_EXTRA_REGS(src, srcw, (void)0);
3132 
3133 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3134 	if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_SW)
3135 		compiler->mode32 = 0;
3136 #endif
3137 
3138 	if (src == SLJIT_IMM) {
3139 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3140 		if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_S32)
3141 			srcw = (sljit_s32)srcw;
3142 #endif
3143 		EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
3144 		src = TMP_REG1;
3145 		srcw = 0;
3146 	}
3147 
3148 	FAIL_IF(emit_groupf(compiler, CVTSI2SD_x_rm | EX86_SELECT_F2_F3(op) | EX86_SSE2_OP1, dst_r, src, srcw));
3149 
3150 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3151 	compiler->mode32 = 1;
3152 #endif
3153 	if (dst_r == TMP_FREG)
3154 		return emit_sse2_store(compiler, op & SLJIT_32, dst, dstw, TMP_FREG);
3155 	return SLJIT_SUCCESS;
3156 }
3157 
3158 static SLJIT_INLINE sljit_s32 sljit_emit_fop1_cmp(struct sljit_compiler *compiler, sljit_s32 op,
3159 	sljit_s32 src1, sljit_sw src1w,
3160 	sljit_s32 src2, sljit_sw src2w)
3161 {
3162 	switch (GET_FLAG_TYPE(op)) {
3163 	case SLJIT_ORDERED_EQUAL:
3164 		/* Also: SLJIT_UNORDERED_OR_NOT_EQUAL */
3165 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src1, src1w));
3166 		FAIL_IF(emit_groupf(compiler, CMPS_x_xm | EX86_SELECT_F2_F3(op) | EX86_SSE2, TMP_FREG, src2, src2w));
3167 
3168 		/* EQ */
3169 		FAIL_IF(emit_byte(compiler, 0));
3170 
3171 		src1 = TMP_FREG;
3172 		src2 = TMP_FREG;
3173 		src2w = 0;
3174 		break;
3175 
3176 	case SLJIT_ORDERED_LESS:
3177 	case SLJIT_UNORDERED_OR_GREATER:
3178 		/* Also: SLJIT_UNORDERED_OR_GREATER_EQUAL, SLJIT_ORDERED_LESS_EQUAL  */
3179 		if (!FAST_IS_REG(src2)) {
3180 			FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src2, src2w));
3181 			src2 = TMP_FREG;
3182 		}
3183 
3184 		return emit_groupf(compiler, UCOMISD_x_xm | EX86_SELECT_66(op) | EX86_SSE2, src2, src1, src1w);
3185 	}
3186 
3187 	if (!FAST_IS_REG(src1)) {
3188 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src1, src1w));
3189 		src1 = TMP_FREG;
3190 	}
3191 
3192 	return emit_groupf(compiler, UCOMISD_x_xm | EX86_SELECT_66(op) | EX86_SSE2, src1, src2, src2w);
3193 }
3194 
3195 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop1(struct sljit_compiler *compiler, sljit_s32 op,
3196 	sljit_s32 dst, sljit_sw dstw,
3197 	sljit_s32 src, sljit_sw srcw)
3198 {
3199 	sljit_s32 dst_r;
3200 	sljit_u8 *inst;
3201 
3202 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3203 	compiler->mode32 = 1;
3204 #endif
3205 
3206 	CHECK_ERROR();
3207 	SELECT_FOP1_OPERATION_WITH_CHECKS(compiler, op, dst, dstw, src, srcw);
3208 
3209 	if (GET_OPCODE(op) == SLJIT_MOV_F64) {
3210 		if (FAST_IS_REG(dst))
3211 			return emit_sse2_load(compiler, op & SLJIT_32, dst, src, srcw);
3212 		if (FAST_IS_REG(src))
3213 			return emit_sse2_store(compiler, op & SLJIT_32, dst, dstw, src);
3214 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src, srcw));
3215 		return emit_sse2_store(compiler, op & SLJIT_32, dst, dstw, TMP_FREG);
3216 	}
3217 
3218 	if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_F32) {
3219 		dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG;
3220 		if (FAST_IS_REG(src)) {
3221 			/* We overwrite the high bits of source. From SLJIT point of view,
3222 			   this is not an issue.
3223 			   Note: In SSE3, we could also use MOVDDUP and MOVSLDUP. */
3224 			FAIL_IF(emit_groupf(compiler, UNPCKLPD_x_xm | ((op & SLJIT_32) ? EX86_PREF_66 : 0) | EX86_SSE2, src, src, 0));
3225 		} else {
3226 			FAIL_IF(emit_sse2_load(compiler, !(op & SLJIT_32), TMP_FREG, src, srcw));
3227 			src = TMP_FREG;
3228 		}
3229 
3230 		FAIL_IF(emit_groupf(compiler, CVTPD2PS_x_xm | ((op & SLJIT_32) ? EX86_PREF_66 : 0) | EX86_SSE2, dst_r, src, 0));
3231 		if (dst_r == TMP_FREG)
3232 			return emit_sse2_store(compiler, op & SLJIT_32, dst, dstw, TMP_FREG);
3233 		return SLJIT_SUCCESS;
3234 	}
3235 
3236 	if (FAST_IS_REG(dst)) {
3237 		dst_r = (dst == src) ? TMP_FREG : dst;
3238 
3239 		if (src & SLJIT_MEM)
3240 			FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src, srcw));
3241 
3242 		FAIL_IF(emit_groupf(compiler, PCMPEQD_x_xm | EX86_PREF_66 | EX86_SSE2, dst_r, dst_r, 0));
3243 
3244 		inst = emit_x86_instruction(compiler, 2 | EX86_PREF_66 | EX86_SSE2_OP2, 0, 0, dst_r, 0);
3245 		inst[0] = GROUP_0F;
3246 		/* Same as PSRLD_x / PSRLQ_x */
3247 		inst[1] = (op & SLJIT_32) ? PSLLD_x_i8 : PSLLQ_x_i8;
3248 
3249 		if (GET_OPCODE(op) == SLJIT_ABS_F64) {
3250 			inst[2] |= 2 << 3;
3251 			FAIL_IF(emit_byte(compiler, 1));
3252 		} else {
3253 			inst[2] |= 6 << 3;
3254 			FAIL_IF(emit_byte(compiler, ((op & SLJIT_32) ? 31 : 63)));
3255 		}
3256 
3257 		if (dst_r != TMP_FREG)
3258 			dst_r = (src & SLJIT_MEM) ? TMP_FREG : src;
3259 		return emit_groupf(compiler, (GET_OPCODE(op) == SLJIT_NEG_F64 ? XORPD_x_xm : ANDPD_x_xm) | EX86_SSE2, dst, dst_r, 0);
3260 	}
3261 
3262 	FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src, srcw));
3263 
3264 	switch (GET_OPCODE(op)) {
3265 	case SLJIT_NEG_F64:
3266 		FAIL_IF(emit_groupf(compiler, XORPD_x_xm | EX86_SELECT_66(op) | EX86_SSE2, TMP_FREG, SLJIT_MEM0(), (sljit_sw)((op & SLJIT_32) ? sse2_buffer : sse2_buffer + 8)));
3267 		break;
3268 
3269 	case SLJIT_ABS_F64:
3270 		FAIL_IF(emit_groupf(compiler, ANDPD_x_xm | EX86_SELECT_66(op) | EX86_SSE2, TMP_FREG, SLJIT_MEM0(), (sljit_sw)((op & SLJIT_32) ? sse2_buffer + 4 : sse2_buffer + 12)));
3271 		break;
3272 	}
3273 
3274 	return emit_sse2_store(compiler, op & SLJIT_32, dst, dstw, TMP_FREG);
3275 }
3276 
3277 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop2(struct sljit_compiler *compiler, sljit_s32 op,
3278 	sljit_s32 dst, sljit_sw dstw,
3279 	sljit_s32 src1, sljit_sw src1w,
3280 	sljit_s32 src2, sljit_sw src2w)
3281 {
3282 	sljit_s32 dst_r;
3283 
3284 	CHECK_ERROR();
3285 	CHECK(check_sljit_emit_fop2(compiler, op, dst, dstw, src1, src1w, src2, src2w));
3286 	ADJUST_LOCAL_OFFSET(dst, dstw);
3287 	ADJUST_LOCAL_OFFSET(src1, src1w);
3288 	ADJUST_LOCAL_OFFSET(src2, src2w);
3289 
3290 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3291 	compiler->mode32 = 1;
3292 #endif
3293 
3294 	if (FAST_IS_REG(dst)) {
3295 		dst_r = dst;
3296 		if (dst == src1)
3297 			; /* Do nothing here. */
3298 		else if (dst == src2 && (GET_OPCODE(op) == SLJIT_ADD_F64 || GET_OPCODE(op) == SLJIT_MUL_F64)) {
3299 			/* Swap arguments. */
3300 			src2 = src1;
3301 			src2w = src1w;
3302 		} else if (dst != src2)
3303 			FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, dst_r, src1, src1w));
3304 		else {
3305 			dst_r = TMP_FREG;
3306 			FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src1, src1w));
3307 		}
3308 	} else {
3309 		dst_r = TMP_FREG;
3310 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src1, src1w));
3311 	}
3312 
3313 	switch (GET_OPCODE(op)) {
3314 	case SLJIT_ADD_F64:
3315 		FAIL_IF(emit_groupf(compiler, ADDSD_x_xm | EX86_SELECT_F2_F3(op) | EX86_SSE2, dst_r, src2, src2w));
3316 		break;
3317 
3318 	case SLJIT_SUB_F64:
3319 		FAIL_IF(emit_groupf(compiler, SUBSD_x_xm | EX86_SELECT_F2_F3(op) | EX86_SSE2, dst_r, src2, src2w));
3320 		break;
3321 
3322 	case SLJIT_MUL_F64:
3323 		FAIL_IF(emit_groupf(compiler, MULSD_x_xm | EX86_SELECT_F2_F3(op) | EX86_SSE2, dst_r, src2, src2w));
3324 		break;
3325 
3326 	case SLJIT_DIV_F64:
3327 		FAIL_IF(emit_groupf(compiler, DIVSD_x_xm | EX86_SELECT_F2_F3(op) | EX86_SSE2, dst_r, src2, src2w));
3328 		break;
3329 	}
3330 
3331 	if (dst_r != dst)
3332 		return emit_sse2_store(compiler, op & SLJIT_32, dst, dstw, TMP_FREG);
3333 	return SLJIT_SUCCESS;
3334 }
3335 
3336 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop2r(struct sljit_compiler *compiler, sljit_s32 op,
3337 	sljit_s32 dst_freg,
3338 	sljit_s32 src1, sljit_sw src1w,
3339 	sljit_s32 src2, sljit_sw src2w)
3340 {
3341 	sljit_uw pref;
3342 
3343 	CHECK_ERROR();
3344 	CHECK(check_sljit_emit_fop2r(compiler, op, dst_freg, src1, src1w, src2, src2w));
3345 	ADJUST_LOCAL_OFFSET(src1, src1w);
3346 	ADJUST_LOCAL_OFFSET(src2, src2w);
3347 
3348 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3349 	compiler->mode32 = 1;
3350 #endif
3351 
3352 	if (dst_freg == src1) {
3353 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src2, src2w));
3354 		pref = EX86_SELECT_66(op) | EX86_SSE2;
3355 		FAIL_IF(emit_groupf(compiler, XORPD_x_xm | pref, TMP_FREG, src1, src1w));
3356 		FAIL_IF(emit_groupf(compiler, ANDPD_x_xm | pref, TMP_FREG, SLJIT_MEM0(), (sljit_sw)((op & SLJIT_32) ? sse2_buffer : sse2_buffer + 8)));
3357 		return emit_groupf(compiler, XORPD_x_xm | pref, dst_freg, TMP_FREG, 0);
3358 	}
3359 
3360 	if (src1 & SLJIT_MEM) {
3361 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src1, src1w));
3362 		src1 = TMP_FREG;
3363 		src1w = 0;
3364 	}
3365 
3366 	if (dst_freg != src2)
3367 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, dst_freg, src2, src2w));
3368 
3369 	pref = EX86_SELECT_66(op) | EX86_SSE2;
3370 	FAIL_IF(emit_groupf(compiler, XORPD_x_xm | pref, dst_freg, src1, src1w));
3371 	FAIL_IF(emit_groupf(compiler, ANDPD_x_xm | pref, dst_freg, SLJIT_MEM0(), (sljit_sw)((op & SLJIT_32) ? sse2_buffer : sse2_buffer + 8)));
3372 	return emit_groupf(compiler, XORPD_x_xm | pref, dst_freg, src1, src1w);
3373 }
3374 
3375 /* --------------------------------------------------------------------- */
3376 /*  Conditional instructions                                             */
3377 /* --------------------------------------------------------------------- */
3378 
3379 SLJIT_API_FUNC_ATTRIBUTE struct sljit_label* sljit_emit_label(struct sljit_compiler *compiler)
3380 {
3381 	sljit_u8 *inst;
3382 	struct sljit_label *label;
3383 
3384 	CHECK_ERROR_PTR();
3385 	CHECK_PTR(check_sljit_emit_label(compiler));
3386 
3387 	if (compiler->last_label && compiler->last_label->size == compiler->size)
3388 		return compiler->last_label;
3389 
3390 	label = (struct sljit_label*)ensure_abuf(compiler, sizeof(struct sljit_label));
3391 	PTR_FAIL_IF(!label);
3392 	set_label(label, compiler);
3393 
3394 	inst = (sljit_u8*)ensure_buf(compiler, 1);
3395 	PTR_FAIL_IF(!inst);
3396 	inst[0] = SLJIT_INST_LABEL;
3397 
3398 	return label;
3399 }
3400 
3401 SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_jump(struct sljit_compiler *compiler, sljit_s32 type)
3402 {
3403 	sljit_u8 *inst;
3404 	struct sljit_jump *jump;
3405 
3406 	CHECK_ERROR_PTR();
3407 	CHECK_PTR(check_sljit_emit_jump(compiler, type));
3408 
3409 	jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
3410 	PTR_FAIL_IF_NULL(jump);
3411 	set_jump(jump, compiler, (sljit_u32)((type & SLJIT_REWRITABLE_JUMP) | ((type & 0xff) << TYPE_SHIFT)));
3412 	type &= 0xff;
3413 
3414 	jump->addr = compiler->size;
3415 	/* Worst case size. */
3416 	compiler->size += (type >= SLJIT_JUMP) ? JUMP_MAX_SIZE : CJUMP_MAX_SIZE;
3417 	inst = (sljit_u8*)ensure_buf(compiler, 1);
3418 	PTR_FAIL_IF_NULL(inst);
3419 
3420 	inst[0] = SLJIT_INST_JUMP;
3421 	return jump;
3422 }
3423 
3424 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_ijump(struct sljit_compiler *compiler, sljit_s32 type, sljit_s32 src, sljit_sw srcw)
3425 {
3426 	sljit_u8 *inst;
3427 	struct sljit_jump *jump;
3428 
3429 	CHECK_ERROR();
3430 	CHECK(check_sljit_emit_ijump(compiler, type, src, srcw));
3431 	ADJUST_LOCAL_OFFSET(src, srcw);
3432 
3433 	CHECK_EXTRA_REGS(src, srcw, (void)0);
3434 
3435 	if (src == SLJIT_IMM) {
3436 		jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
3437 		FAIL_IF_NULL(jump);
3438 		set_jump(jump, compiler, (sljit_u32)(JUMP_ADDR | (type << TYPE_SHIFT)));
3439 		jump->u.target = (sljit_uw)srcw;
3440 
3441 		jump->addr = compiler->size;
3442 		/* Worst case size. */
3443 		compiler->size += JUMP_MAX_SIZE;
3444 		inst = (sljit_u8*)ensure_buf(compiler, 1);
3445 		FAIL_IF_NULL(inst);
3446 
3447 		inst[0] = SLJIT_INST_JUMP;
3448 	} else {
3449 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3450 		/* REX_W is not necessary (src is not immediate). */
3451 		compiler->mode32 = 1;
3452 #endif
3453 		inst = emit_x86_instruction(compiler, 1, 0, 0, src, srcw);
3454 		FAIL_IF(!inst);
3455 		inst[0] = GROUP_FF;
3456 		inst[1] = U8(inst[1] | ((type >= SLJIT_FAST_CALL) ? CALL_rm : JMP_rm));
3457 	}
3458 	return SLJIT_SUCCESS;
3459 }
3460 
3461 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *compiler, sljit_s32 op,
3462 	sljit_s32 dst, sljit_sw dstw,
3463 	sljit_s32 type)
3464 {
3465 	sljit_u8 *inst;
3466 	sljit_u8 cond_set;
3467 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3468 	sljit_s32 reg;
3469 #endif /* !SLJIT_CONFIG_X86_64 */
3470 	/* ADJUST_LOCAL_OFFSET and CHECK_EXTRA_REGS might overwrite these values. */
3471 	sljit_s32 dst_save = dst;
3472 	sljit_sw dstw_save = dstw;
3473 
3474 	CHECK_ERROR();
3475 	CHECK(check_sljit_emit_op_flags(compiler, op, dst, dstw, type));
3476 
3477 	ADJUST_LOCAL_OFFSET(dst, dstw);
3478 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
3479 
3480 	/* setcc = jcc + 0x10. */
3481 	cond_set = U8(get_jump_code((sljit_uw)type) + 0x10);
3482 
3483 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3484 	if (GET_OPCODE(op) == SLJIT_OR && !GET_ALL_FLAGS(op) && FAST_IS_REG(dst)) {
3485 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 4 + 3);
3486 		FAIL_IF(!inst);
3487 		INC_SIZE(4 + 3);
3488 		/* Set low register to conditional flag. */
3489 		inst[0] = (reg_map[TMP_REG1] <= 7) ? REX : REX_B;
3490 		inst[1] = GROUP_0F;
3491 		inst[2] = cond_set;
3492 		inst[3] = MOD_REG | reg_lmap[TMP_REG1];
3493 		inst[4] = U8(REX | (reg_map[TMP_REG1] <= 7 ? 0 : REX_R) | (reg_map[dst] <= 7 ? 0 : REX_B));
3494 		inst[5] = OR_rm8_r8;
3495 		inst[6] = U8(MOD_REG | (reg_lmap[TMP_REG1] << 3) | reg_lmap[dst]);
3496 		return SLJIT_SUCCESS;
3497 	}
3498 
3499 	reg = (GET_OPCODE(op) < SLJIT_ADD && FAST_IS_REG(dst)) ? dst : TMP_REG1;
3500 
3501 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 4 + 4);
3502 	FAIL_IF(!inst);
3503 	INC_SIZE(4 + 4);
3504 	/* Set low register to conditional flag. */
3505 	inst[0] = (reg_map[reg] <= 7) ? REX : REX_B;
3506 	inst[1] = GROUP_0F;
3507 	inst[2] = cond_set;
3508 	inst[3] = MOD_REG | reg_lmap[reg];
3509 	inst[4] = REX_W | (reg_map[reg] <= 7 ? 0 : (REX_B | REX_R));
3510 	/* The movzx instruction does not affect flags. */
3511 	inst[5] = GROUP_0F;
3512 	inst[6] = MOVZX_r_rm8;
3513 	inst[7] = U8(MOD_REG | (reg_lmap[reg] << 3) | reg_lmap[reg]);
3514 
3515 	if (reg != TMP_REG1)
3516 		return SLJIT_SUCCESS;
3517 
3518 	if (GET_OPCODE(op) < SLJIT_ADD) {
3519 		compiler->mode32 = GET_OPCODE(op) != SLJIT_MOV;
3520 		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
3521 	}
3522 
3523 	SLJIT_SKIP_CHECKS(compiler);
3524 	return sljit_emit_op2(compiler, op, dst_save, dstw_save, dst_save, dstw_save, TMP_REG1, 0);
3525 
3526 #else /* !SLJIT_CONFIG_X86_64 */
3527 	SLJIT_ASSERT(reg_map[TMP_REG1] < 4);
3528 
3529 	/* The SLJIT_CONFIG_X86_32 code path starts here. */
3530 	if (GET_OPCODE(op) < SLJIT_ADD && FAST_IS_REG(dst) && reg_map[dst] <= 4) {
3531 		/* Low byte is accessible. */
3532 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 3 + 3);
3533 		FAIL_IF(!inst);
3534 		INC_SIZE(3 + 3);
3535 		/* Set low byte to conditional flag. */
3536 		inst[0] = GROUP_0F;
3537 		inst[1] = cond_set;
3538 		inst[2] = U8(MOD_REG | reg_map[dst]);
3539 
3540 		inst[3] = GROUP_0F;
3541 		inst[4] = MOVZX_r_rm8;
3542 		inst[5] = U8(MOD_REG | (reg_map[dst] << 3) | reg_map[dst]);
3543 		return SLJIT_SUCCESS;
3544 	}
3545 
3546 	if (GET_OPCODE(op) == SLJIT_OR && !GET_ALL_FLAGS(op) && FAST_IS_REG(dst) && reg_map[dst] <= 4) {
3547 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 3 + 2);
3548 		FAIL_IF(!inst);
3549 		INC_SIZE(3 + 2);
3550 
3551 		/* Set low byte to conditional flag. */
3552 		inst[0] = GROUP_0F;
3553 		inst[1] = cond_set;
3554 		inst[2] = U8(MOD_REG | reg_map[TMP_REG1]);
3555 
3556 		inst[3] = OR_rm8_r8;
3557 		inst[4] = U8(MOD_REG | (reg_map[TMP_REG1] << 3) | reg_map[dst]);
3558 		return SLJIT_SUCCESS;
3559 	}
3560 
3561 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 3 + 3);
3562 	FAIL_IF(!inst);
3563 	INC_SIZE(3 + 3);
3564 	/* Set low byte to conditional flag. */
3565 	inst[0] = GROUP_0F;
3566 	inst[1] = cond_set;
3567 	inst[2] = U8(MOD_REG | reg_map[TMP_REG1]);
3568 
3569 	inst[3] = GROUP_0F;
3570 	inst[4] = MOVZX_r_rm8;
3571 	inst[5] = U8(MOD_REG | (reg_map[TMP_REG1] << 3) | reg_map[TMP_REG1]);
3572 
3573 	if (GET_OPCODE(op) < SLJIT_ADD)
3574 		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
3575 
3576 	SLJIT_SKIP_CHECKS(compiler);
3577 	return sljit_emit_op2(compiler, op, dst_save, dstw_save, dst_save, dstw_save, TMP_REG1, 0);
3578 #endif /* SLJIT_CONFIG_X86_64 */
3579 }
3580 
3581 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fselect(struct sljit_compiler *compiler, sljit_s32 type,
3582 	sljit_s32 dst_freg,
3583 	sljit_s32 src1, sljit_sw src1w,
3584 	sljit_s32 src2_freg)
3585 {
3586 	sljit_u8* inst;
3587 	sljit_uw size;
3588 
3589 	CHECK_ERROR();
3590 	CHECK(check_sljit_emit_fselect(compiler, type, dst_freg, src1, src1w, src2_freg));
3591 
3592 	ADJUST_LOCAL_OFFSET(src1, src1w);
3593 
3594 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3595 	compiler->mode32 = 1;
3596 #endif /* SLJIT_CONFIG_X86_64 */
3597 
3598 	if (dst_freg != src2_freg) {
3599 		if (dst_freg == src1) {
3600 			src1 = src2_freg;
3601 			src1w = 0;
3602 			type ^= 0x1;
3603 		} else
3604 			FAIL_IF(emit_sse2_load(compiler, type & SLJIT_32, dst_freg, src2_freg, 0));
3605 	}
3606 
3607 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
3608 	FAIL_IF(!inst);
3609 	INC_SIZE(2);
3610 	inst[0] = U8(get_jump_code((sljit_uw)(type & ~SLJIT_32) ^ 0x1) - 0x10);
3611 
3612 	size = compiler->size;
3613 	FAIL_IF(emit_sse2_load(compiler, type & SLJIT_32, dst_freg, src1, src1w));
3614 
3615 	inst[1] = U8(compiler->size - size);
3616 	return SLJIT_SUCCESS;
3617 }
3618 
3619 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *compiler, sljit_s32 type,
3620 	sljit_s32 freg,
3621 	sljit_s32 srcdst, sljit_sw srcdstw)
3622 {
3623 	sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type);
3624 	sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type);
3625 	sljit_s32 alignment = SLJIT_SIMD_GET_ELEM2_SIZE(type);
3626 	sljit_uw op;
3627 
3628 	CHECK_ERROR();
3629 	CHECK(check_sljit_emit_simd_mov(compiler, type, freg, srcdst, srcdstw));
3630 
3631 	ADJUST_LOCAL_OFFSET(srcdst, srcdstw);
3632 
3633 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3634 	compiler->mode32 = 1;
3635 #endif /* SLJIT_CONFIG_X86_64 */
3636 
3637 	switch (reg_size) {
3638 	case 4:
3639 		op = EX86_SSE2;
3640 		break;
3641 	case 5:
3642 		if (!(cpu_feature_list & CPU_FEATURE_AVX2))
3643 			return SLJIT_ERR_UNSUPPORTED;
3644 		op = EX86_SSE2 | VEX_256;
3645 		break;
3646 	default:
3647 		return SLJIT_ERR_UNSUPPORTED;
3648 	}
3649 
3650 	if (!(srcdst & SLJIT_MEM))
3651 		alignment = reg_size;
3652 
3653 	if (type & SLJIT_SIMD_FLOAT) {
3654 		if (elem_size == 2 || elem_size == 3) {
3655 			op |= alignment >= reg_size ? MOVAPS_x_xm : MOVUPS_x_xm;
3656 
3657 			if (elem_size == 3)
3658 				op |= EX86_PREF_66;
3659 
3660 			if (type & SLJIT_SIMD_STORE)
3661 				op += 1;
3662 		} else
3663 			return SLJIT_ERR_UNSUPPORTED;
3664 	} else {
3665 		op |= ((type & SLJIT_SIMD_STORE) ? MOVDQA_xm_x : MOVDQA_x_xm)
3666 			| (alignment >= reg_size ? EX86_PREF_66 : EX86_PREF_F3);
3667 	}
3668 
3669 	if (type & SLJIT_SIMD_TEST)
3670 		return SLJIT_SUCCESS;
3671 
3672 	if ((op & VEX_256) || ((cpu_feature_list & CPU_FEATURE_AVX) && (compiler->options & SLJIT_ENTER_USE_VEX)))
3673 		return emit_vex_instruction(compiler, op, freg, 0, srcdst, srcdstw);
3674 
3675 	return emit_groupf(compiler, op, freg, srcdst, srcdstw);
3676 }
3677 
3678 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compiler *compiler, sljit_s32 type,
3679 	sljit_s32 freg,
3680 	sljit_s32 src, sljit_sw srcw)
3681 {
3682 	sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type);
3683 	sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type);
3684 	sljit_s32 use_vex = (cpu_feature_list & CPU_FEATURE_AVX) && (compiler->options & SLJIT_ENTER_USE_VEX);
3685 	sljit_u8 *inst;
3686 	sljit_u8 opcode = 0;
3687 	sljit_uw op;
3688 
3689 	CHECK_ERROR();
3690 	CHECK(check_sljit_emit_simd_replicate(compiler, type, freg, src, srcw));
3691 
3692 	ADJUST_LOCAL_OFFSET(src, srcw);
3693 
3694 	if (!(type & SLJIT_SIMD_FLOAT)) {
3695 		CHECK_EXTRA_REGS(src, srcw, (void)0);
3696 	}
3697 
3698 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
3699 	if ((type & SLJIT_SIMD_FLOAT) ? (elem_size < 2 || elem_size > 3) : (elem_size > 2))
3700 		return SLJIT_ERR_UNSUPPORTED;
3701 #else /* !SLJIT_CONFIG_X86_32 */
3702 	compiler->mode32 = 1;
3703 
3704 	if (elem_size > 3 || ((type & SLJIT_SIMD_FLOAT) && elem_size < 2))
3705 		return SLJIT_ERR_UNSUPPORTED;
3706 #endif /* SLJIT_CONFIG_X86_32 */
3707 
3708 	if (reg_size != 4 && (reg_size != 5 || !(cpu_feature_list & CPU_FEATURE_AVX2)))
3709 		return SLJIT_ERR_UNSUPPORTED;
3710 
3711 	if (type & SLJIT_SIMD_TEST)
3712 		return SLJIT_SUCCESS;
3713 
3714 	if (reg_size == 5)
3715 		use_vex = 1;
3716 
3717 	if (use_vex && src != SLJIT_IMM) {
3718 		op = 0;
3719 
3720 		switch (elem_size) {
3721 		case 0:
3722 			if (cpu_feature_list & CPU_FEATURE_AVX2)
3723 				op = VPBROADCASTB_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2;
3724 			break;
3725 		case 1:
3726 			if (cpu_feature_list & CPU_FEATURE_AVX2)
3727 				op = VPBROADCASTW_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2;
3728 			break;
3729 		case 2:
3730 			if (type & SLJIT_SIMD_FLOAT) {
3731 				if ((cpu_feature_list & CPU_FEATURE_AVX2) || ((cpu_feature_list & CPU_FEATURE_AVX) && (src & SLJIT_MEM)))
3732 					op = VBROADCASTSS_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2;
3733 			} else if (cpu_feature_list & CPU_FEATURE_AVX2)
3734 				op = VPBROADCASTD_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2;
3735 			break;
3736 		default:
3737 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3738 			if (!(type & SLJIT_SIMD_FLOAT)) {
3739 				if (cpu_feature_list & CPU_FEATURE_AVX2)
3740 					op = VPBROADCASTQ_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2;
3741 				break;
3742 			}
3743 #endif /* SLJIT_CONFIG_X86_64 */
3744 
3745 			if (reg_size == 5)
3746 				op = VBROADCASTSD_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2;
3747 			break;
3748 		}
3749 
3750 		if (op != 0) {
3751 			if (!(src & SLJIT_MEM) && !(type & SLJIT_SIMD_FLOAT)) {
3752 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3753 				if (elem_size >= 3)
3754 					compiler->mode32 = 0;
3755 #endif /* SLJIT_CONFIG_X86_64 */
3756 				FAIL_IF(emit_vex_instruction(compiler, MOVD_x_rm | VEX_AUTO_W | EX86_PREF_66 | EX86_SSE2_OP1, freg, 0, src, srcw));
3757 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3758 				compiler->mode32 = 1;
3759 #endif /* SLJIT_CONFIG_X86_64 */
3760 				src = freg;
3761 				srcw = 0;
3762 			}
3763 
3764 			if (reg_size == 5)
3765 				op |= VEX_256;
3766 
3767 			return emit_vex_instruction(compiler, op, freg, 0, src, srcw);
3768 		}
3769 	}
3770 
3771 	if (type & SLJIT_SIMD_FLOAT) {
3772 		if (src == SLJIT_IMM) {
3773 			if (use_vex)
3774 				return emit_vex_instruction(compiler, XORPD_x_xm | (reg_size == 5 ? VEX_256 : 0) | (elem_size == 3 ? EX86_PREF_66 : 0) | EX86_SSE2 | VEX_SSE2_OPV, freg, freg, freg, 0);
3775 
3776 			return emit_groupf(compiler, XORPD_x_xm | (elem_size == 3 ? EX86_PREF_66 : 0) | EX86_SSE2, freg, freg, 0);
3777 		}
3778 
3779 		SLJIT_ASSERT(reg_size == 4);
3780 
3781 		if (use_vex) {
3782 			if (elem_size == 3)
3783 				return emit_vex_instruction(compiler, MOVDDUP_x_xm | EX86_PREF_F2 | EX86_SSE2, freg, 0, src, srcw);
3784 
3785 			SLJIT_ASSERT(!(src & SLJIT_MEM));
3786 			FAIL_IF(emit_vex_instruction(compiler, SHUFPS_x_xm | EX86_SSE2 | VEX_SSE2_OPV, freg, src, src, 0));
3787 			return emit_byte(compiler, 0);
3788 		}
3789 
3790 		if (elem_size == 2 && freg != src) {
3791 			FAIL_IF(emit_sse2_load(compiler, 1, freg, src, srcw));
3792 			src = freg;
3793 			srcw = 0;
3794 		}
3795 
3796 		op = (elem_size == 2 ? SHUFPS_x_xm : MOVDDUP_x_xm) | (elem_size == 2 ? 0 : EX86_PREF_F2) | EX86_SSE2;
3797 		FAIL_IF(emit_groupf(compiler, op, freg, src, srcw));
3798 
3799 		if (elem_size == 2)
3800 			return emit_byte(compiler, 0);
3801 		return SLJIT_SUCCESS;
3802 	}
3803 
3804 	if (src == SLJIT_IMM) {
3805 		if (elem_size == 0) {
3806 			srcw = (sljit_u8)srcw;
3807 			srcw |= srcw << 8;
3808 			srcw |= srcw << 16;
3809 			elem_size = 2;
3810 		} else if (elem_size == 1) {
3811 			srcw = (sljit_u16)srcw;
3812 			srcw |= srcw << 16;
3813 			elem_size = 2;
3814 		}
3815 
3816 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3817 		if (elem_size == 2 && (sljit_s32)srcw == -1)
3818 			srcw = -1;
3819 #endif /* SLJIT_CONFIG_X86_64 */
3820 
3821 		if (srcw == 0 || srcw == -1) {
3822 			if (use_vex)
3823 				return emit_vex_instruction(compiler, (srcw == 0 ? PXOR_x_xm : PCMPEQD_x_xm) | (reg_size == 5 ? VEX_256 : 0) | EX86_PREF_66 | EX86_SSE2 | VEX_SSE2_OPV, freg, freg, freg, 0);
3824 
3825 			return emit_groupf(compiler, (srcw == 0 ? PXOR_x_xm : PCMPEQD_x_xm) | EX86_PREF_66 | EX86_SSE2, freg, freg, 0);
3826 		}
3827 
3828 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3829 		if (elem_size == 3)
3830 			FAIL_IF(emit_load_imm64(compiler, TMP_REG1, srcw));
3831 		else
3832 #endif /* SLJIT_CONFIG_X86_64 */
3833 			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, srcw);
3834 
3835 		src = TMP_REG1;
3836 		srcw = 0;
3837 
3838 	}
3839 
3840 	op = 2;
3841 	opcode = MOVD_x_rm;
3842 
3843 	switch (elem_size) {
3844 	case 0:
3845 		if (!FAST_IS_REG(src)) {
3846 			opcode = 0x3a /* Prefix of PINSRB_x_rm_i8. */;
3847 			op = 3;
3848 		}
3849 		break;
3850 	case 1:
3851 		if (!FAST_IS_REG(src))
3852 			opcode = PINSRW_x_rm_i8;
3853 		break;
3854 	case 2:
3855 		break;
3856 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3857 	case 3:
3858 		/* MOVQ */
3859 		compiler->mode32 = 0;
3860 		break;
3861 #endif /* SLJIT_CONFIG_X86_64 */
3862 	}
3863 
3864 	if (use_vex) {
3865 		if (opcode != MOVD_x_rm) {
3866 			op = (opcode == 0x3a) ? (PINSRB_x_rm_i8 | VEX_OP_0F3A) : opcode;
3867 			FAIL_IF(emit_vex_instruction(compiler, op | EX86_PREF_66 | EX86_SSE2_OP1 | VEX_SSE2_OPV, freg, freg, src, srcw));
3868 		} else
3869 			FAIL_IF(emit_vex_instruction(compiler, MOVD_x_rm | VEX_AUTO_W | EX86_PREF_66 | EX86_SSE2_OP1, freg, 0, src, srcw));
3870 	} else {
3871 		inst = emit_x86_instruction(compiler, op | EX86_PREF_66 | EX86_SSE2_OP1, freg, 0, src, srcw);
3872 		FAIL_IF(!inst);
3873 		inst[0] = GROUP_0F;
3874 		inst[1] = opcode;
3875 
3876 		if (op == 3) {
3877 			SLJIT_ASSERT(opcode == 0x3a);
3878 			inst[2] = PINSRB_x_rm_i8;
3879 		}
3880 	}
3881 
3882 	if (use_vex && elem_size >= 2) {
3883 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
3884 		op = VPBROADCASTD_x_xm;
3885 #else /* !SLJIT_CONFIG_X86_32 */
3886 		op = (elem_size == 3) ? VPBROADCASTQ_x_xm : VPBROADCASTD_x_xm;
3887 #endif /* SLJIT_CONFIG_X86_32 */
3888 		return emit_vex_instruction(compiler, op | ((reg_size == 5) ? VEX_256 : 0) | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, 0, freg, 0);
3889 	}
3890 
3891 	SLJIT_ASSERT(reg_size == 4);
3892 
3893 	if (opcode != MOVD_x_rm)
3894 		FAIL_IF(emit_byte(compiler, 0));
3895 
3896 	switch (elem_size) {
3897 	case 0:
3898 		if (use_vex) {
3899 			FAIL_IF(emit_vex_instruction(compiler, PXOR_x_xm | EX86_PREF_66 | EX86_SSE2 | VEX_SSE2_OPV, TMP_FREG, TMP_FREG, TMP_FREG, 0));
3900 			return emit_vex_instruction(compiler, PSHUFB_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2 | VEX_SSE2_OPV, freg, freg, TMP_FREG, 0);
3901 		}
3902 		FAIL_IF(emit_groupf(compiler, PXOR_x_xm | EX86_PREF_66 | EX86_SSE2, TMP_FREG, TMP_FREG, 0));
3903 		return emit_groupf_ext(compiler, PSHUFB_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, TMP_FREG, 0);
3904 	case 1:
3905 		if (use_vex)
3906 			FAIL_IF(emit_vex_instruction(compiler, PSHUFLW_x_xm | EX86_PREF_F2 | EX86_SSE2, freg, 0, freg, 0));
3907 		else
3908 			FAIL_IF(emit_groupf(compiler, PSHUFLW_x_xm | EX86_PREF_F2 | EX86_SSE2, freg, freg, 0));
3909 		FAIL_IF(emit_byte(compiler, 0));
3910 		/* fallthrough */
3911 	default:
3912 		if (use_vex)
3913 			FAIL_IF(emit_vex_instruction(compiler, PSHUFD_x_xm | EX86_PREF_66 | EX86_SSE2, freg, 0, freg, 0));
3914 		else
3915 			FAIL_IF(emit_groupf(compiler, PSHUFD_x_xm | EX86_PREF_66 | EX86_SSE2, freg, freg, 0));
3916 		return emit_byte(compiler, 0);
3917 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3918 	case 3:
3919 		compiler->mode32 = 1;
3920 		if (use_vex)
3921 			FAIL_IF(emit_vex_instruction(compiler, PSHUFD_x_xm | EX86_PREF_66 | EX86_SSE2, freg, 0, freg, 0));
3922 		else
3923 			FAIL_IF(emit_groupf(compiler, PSHUFD_x_xm | EX86_PREF_66 | EX86_SSE2, freg, freg, 0));
3924 		return emit_byte(compiler, 0x44);
3925 #endif /* SLJIT_CONFIG_X86_64 */
3926 	}
3927 }
3928 
3929 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compiler *compiler, sljit_s32 type,
3930 	sljit_s32 freg, sljit_s32 lane_index,
3931 	sljit_s32 srcdst, sljit_sw srcdstw)
3932 {
3933 	sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type);
3934 	sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type);
3935 	sljit_s32 use_vex = (cpu_feature_list & CPU_FEATURE_AVX) && (compiler->options & SLJIT_ENTER_USE_VEX);
3936 	sljit_u8 *inst;
3937 	sljit_u8 opcode = 0;
3938 	sljit_uw op;
3939 	sljit_s32 freg_orig = freg;
3940 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
3941 	sljit_s32 srcdst_is_ereg = 0;
3942 	sljit_s32 srcdst_orig = 0;
3943 	sljit_sw srcdstw_orig = 0;
3944 #endif /* SLJIT_CONFIG_X86_32 */
3945 
3946 	CHECK_ERROR();
3947 	CHECK(check_sljit_emit_simd_lane_mov(compiler, type, freg, lane_index, srcdst, srcdstw));
3948 
3949 	ADJUST_LOCAL_OFFSET(srcdst, srcdstw);
3950 
3951 	if (reg_size == 5) {
3952 		if (!(cpu_feature_list & CPU_FEATURE_AVX2))
3953 			return SLJIT_ERR_UNSUPPORTED;
3954 		use_vex = 1;
3955 	} else if (reg_size != 4)
3956 		return SLJIT_ERR_UNSUPPORTED;
3957 
3958 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
3959 	if ((type & SLJIT_SIMD_FLOAT) ? (elem_size < 2 || elem_size > 3) : elem_size > 2)
3960 		return SLJIT_ERR_UNSUPPORTED;
3961 #else /* SLJIT_CONFIG_X86_32 */
3962 	if (elem_size > 3 || ((type & SLJIT_SIMD_FLOAT) && elem_size < 2))
3963 		return SLJIT_ERR_UNSUPPORTED;
3964 #endif /* SLJIT_CONFIG_X86_32 */
3965 
3966 	if (type & SLJIT_SIMD_TEST)
3967 		return SLJIT_SUCCESS;
3968 
3969 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3970 	compiler->mode32 = 1;
3971 #else /* !SLJIT_CONFIG_X86_64 */
3972 	if (!(type & SLJIT_SIMD_FLOAT)) {
3973 		CHECK_EXTRA_REGS(srcdst, srcdstw, srcdst_is_ereg = 1);
3974 
3975 		if ((type & SLJIT_SIMD_STORE) && ((srcdst_is_ereg && elem_size < 2) || (elem_size == 0 && (type & SLJIT_SIMD_LANE_SIGNED) && FAST_IS_REG(srcdst) && reg_map[srcdst] >= 4))) {
3976 			srcdst_orig = srcdst;
3977 			srcdstw_orig = srcdstw;
3978 			srcdst = TMP_REG1;
3979 			srcdstw = 0;
3980 		}
3981 	}
3982 #endif /* SLJIT_CONFIG_X86_64 */
3983 
3984 	if (type & SLJIT_SIMD_LANE_ZERO) {
3985 		if (lane_index == 0) {
3986 			if (!(type & SLJIT_SIMD_FLOAT)) {
3987 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3988 				if (elem_size == 3) {
3989 					compiler->mode32 = 0;
3990 					elem_size = 2;
3991 				}
3992 #endif /* SLJIT_CONFIG_X86_64 */
3993 				if (srcdst == SLJIT_IMM) {
3994 					if (elem_size == 0)
3995 						srcdstw = (sljit_u8)srcdstw;
3996 					else if (elem_size == 1)
3997 						srcdstw = (sljit_u16)srcdstw;
3998 
3999 					EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, srcdstw);
4000 					srcdst = TMP_REG1;
4001 					srcdstw = 0;
4002 					elem_size = 2;
4003 				}
4004 
4005 				if (elem_size == 2) {
4006 					if (use_vex)
4007 						return emit_vex_instruction(compiler, MOVD_x_rm | VEX_AUTO_W | EX86_PREF_66 | EX86_SSE2_OP1, freg, 0, srcdst, srcdstw);
4008 					return emit_groupf(compiler, MOVD_x_rm | EX86_PREF_66 | EX86_SSE2_OP1, freg, srcdst, srcdstw);
4009 				}
4010 			} else if (srcdst & SLJIT_MEM) {
4011 				SLJIT_ASSERT(elem_size == 2 || elem_size == 3);
4012 
4013 				if (use_vex)
4014 					return emit_vex_instruction(compiler, MOVSD_x_xm | (elem_size == 2 ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, freg, 0, srcdst, srcdstw);
4015 				return emit_groupf(compiler, MOVSD_x_xm | (elem_size == 2 ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, freg, srcdst, srcdstw);
4016 			} else if (elem_size == 3) {
4017 				if (use_vex)
4018 					return emit_vex_instruction(compiler, MOVQ_x_xm | EX86_PREF_F3 | EX86_SSE2, freg, 0, srcdst, 0);
4019 				return emit_groupf(compiler, MOVQ_x_xm | EX86_PREF_F3 | EX86_SSE2, freg, srcdst, 0);
4020 			} else if (use_vex) {
4021 				FAIL_IF(emit_vex_instruction(compiler, XORPD_x_xm | EX86_SSE2 | VEX_SSE2_OPV, TMP_FREG, TMP_FREG, TMP_FREG, 0));
4022 				return emit_vex_instruction(compiler, MOVSD_x_xm | EX86_PREF_F3 | EX86_SSE2 | VEX_SSE2_OPV, freg, TMP_FREG, srcdst, 0);
4023 			}
4024 		}
4025 
4026 		if (reg_size == 5 && lane_index >= (1 << (4 - elem_size))) {
4027 			freg = TMP_FREG;
4028 			lane_index -= (1 << (4 - elem_size));
4029 		} else if ((type & SLJIT_SIMD_FLOAT) && freg == srcdst) {
4030 			if (use_vex)
4031 				FAIL_IF(emit_vex_instruction(compiler, MOVSD_x_xm | (elem_size == 2 ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2 | VEX_SSE2_OPV, TMP_FREG, TMP_FREG, srcdst, srcdstw));
4032 			else
4033 				FAIL_IF(emit_sse2_load(compiler, elem_size == 2, TMP_FREG, srcdst, srcdstw));
4034 			srcdst = TMP_FREG;
4035 			srcdstw = 0;
4036 		}
4037 
4038 		op = ((!(type & SLJIT_SIMD_FLOAT) || elem_size != 2) ? EX86_PREF_66 : 0)
4039 			| ((type & SLJIT_SIMD_FLOAT) ? XORPD_x_xm : PXOR_x_xm) | EX86_SSE2;
4040 
4041 		if (use_vex)
4042 			FAIL_IF(emit_vex_instruction(compiler, op | (reg_size == 5 ? VEX_256 : 0) | VEX_SSE2_OPV, freg, freg, freg, 0));
4043 		else
4044 			FAIL_IF(emit_groupf(compiler, op, freg, freg, 0));
4045 	} else if (reg_size == 5 && lane_index >= (1 << (4 - elem_size))) {
4046 		FAIL_IF(emit_vex_instruction(compiler, ((type & SLJIT_SIMD_FLOAT) ? VEXTRACTF128_x_ym : VEXTRACTI128_x_ym) | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2, freg, 0, TMP_FREG, 0));
4047 		FAIL_IF(emit_byte(compiler, 1));
4048 
4049 		freg = TMP_FREG;
4050 		lane_index -= (1 << (4 - elem_size));
4051 	}
4052 
4053 	if (type & SLJIT_SIMD_FLOAT) {
4054 		if (elem_size == 3) {
4055 			if (srcdst & SLJIT_MEM) {
4056 				if (type & SLJIT_SIMD_STORE)
4057 					op = lane_index == 0 ? MOVLPD_m_x : MOVHPD_m_x;
4058 				else
4059 					op = lane_index == 0 ? MOVLPD_x_m : MOVHPD_x_m;
4060 
4061 				/* VEX prefix clears upper bits of the target register. */
4062 				if (use_vex && ((type & SLJIT_SIMD_STORE) || reg_size == 4 || freg == TMP_FREG))
4063 					FAIL_IF(emit_vex_instruction(compiler, op | EX86_PREF_66 | EX86_SSE2
4064 						| ((type & SLJIT_SIMD_STORE) ? 0 : VEX_SSE2_OPV), freg, (type & SLJIT_SIMD_STORE) ? 0 : freg, srcdst, srcdstw));
4065 				else
4066 					FAIL_IF(emit_groupf(compiler, op | EX86_PREF_66 | EX86_SSE2, freg, srcdst, srcdstw));
4067 
4068 				/* In case of store, freg is not TMP_FREG. */
4069 			} else if (type & SLJIT_SIMD_STORE) {
4070 				if (lane_index == 1) {
4071 					if (use_vex)
4072 						return emit_vex_instruction(compiler, MOVHLPS_x_x | EX86_SSE2 | VEX_SSE2_OPV, srcdst, srcdst, freg, 0);
4073 					return emit_groupf(compiler, MOVHLPS_x_x | EX86_SSE2, srcdst, freg, 0);
4074 				}
4075 				if (use_vex)
4076 					return emit_vex_instruction(compiler, MOVSD_x_xm | EX86_PREF_F2 | EX86_SSE2 | VEX_SSE2_OPV, srcdst, srcdst, freg, 0);
4077 				return emit_sse2_load(compiler, 0, srcdst, freg, 0);
4078 			} else if (use_vex && (reg_size == 4 || freg == TMP_FREG)) {
4079 				if (lane_index == 1)
4080 					FAIL_IF(emit_vex_instruction(compiler, MOVLHPS_x_x | EX86_SSE2 | VEX_SSE2_OPV, freg, freg, srcdst, 0));
4081 				else
4082 					FAIL_IF(emit_vex_instruction(compiler, MOVSD_x_xm | EX86_PREF_F2 | EX86_SSE2 | VEX_SSE2_OPV, freg, freg, srcdst, 0));
4083 			} else {
4084 				if (lane_index == 1)
4085 					FAIL_IF(emit_groupf(compiler, MOVLHPS_x_x | EX86_SSE2, freg, srcdst, 0));
4086 				else
4087 					FAIL_IF(emit_sse2_load(compiler, 0, freg, srcdst, 0));
4088 			}
4089 		} else if (type & SLJIT_SIMD_STORE) {
4090 			if (lane_index == 0) {
4091 				if (use_vex)
4092 					return emit_vex_instruction(compiler, ((srcdst & SLJIT_MEM) ? MOVSD_xm_x : MOVSD_x_xm) | EX86_PREF_F3 | EX86_SSE2
4093 						| ((srcdst & SLJIT_MEM) ? 0 : VEX_SSE2_OPV), freg, ((srcdst & SLJIT_MEM) ? 0 : freg), srcdst, srcdstw);
4094 				return emit_sse2_store(compiler, 1, srcdst, srcdstw, freg);
4095 			}
4096 
4097 			if (srcdst & SLJIT_MEM) {
4098 				if (use_vex)
4099 					FAIL_IF(emit_vex_instruction(compiler, EXTRACTPS_x_xm | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2, freg, 0, srcdst, srcdstw));
4100 				else
4101 					FAIL_IF(emit_groupf_ext(compiler, EXTRACTPS_x_xm | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2, freg, srcdst, srcdstw));
4102 				return emit_byte(compiler, U8(lane_index));
4103 			}
4104 
4105 			if (use_vex) {
4106 				FAIL_IF(emit_vex_instruction(compiler, SHUFPS_x_xm | EX86_SSE2 | VEX_SSE2_OPV, srcdst, freg, freg, 0));
4107 				return emit_byte(compiler, U8(lane_index));
4108 			}
4109 
4110 			if (srcdst == freg)
4111 				op = SHUFPS_x_xm | EX86_SSE2;
4112 			else {
4113 				switch (lane_index) {
4114 				case 1:
4115 					op = MOVSHDUP_x_xm | EX86_PREF_F3 | EX86_SSE2;
4116 					break;
4117 				case 2:
4118 					op = MOVHLPS_x_x | EX86_SSE2;
4119 					break;
4120 				default:
4121 					SLJIT_ASSERT(lane_index == 3);
4122 					op = PSHUFD_x_xm | EX86_PREF_66 | EX86_SSE2;
4123 					break;
4124 				}
4125 			}
4126 
4127 			FAIL_IF(emit_groupf(compiler, op, srcdst, freg, 0));
4128 
4129 			op &= 0xff;
4130 			if (op == SHUFPS_x_xm || op == PSHUFD_x_xm)
4131 				return emit_byte(compiler, U8(lane_index));
4132 
4133 			return SLJIT_SUCCESS;
4134 		} else {
4135 			if (lane_index != 0 || (srcdst & SLJIT_MEM)) {
4136 				FAIL_IF(emit_groupf_ext(compiler, INSERTPS_x_xm | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2, freg, srcdst, srcdstw));
4137 				FAIL_IF(emit_byte(compiler, U8(lane_index << 4)));
4138 			} else
4139 				FAIL_IF(emit_sse2_store(compiler, 1, freg, 0, srcdst));
4140 		}
4141 
4142 		if (freg != TMP_FREG || (type & SLJIT_SIMD_STORE))
4143 			return SLJIT_SUCCESS;
4144 
4145 		SLJIT_ASSERT(reg_size == 5);
4146 
4147 		if (type & SLJIT_SIMD_LANE_ZERO) {
4148 			FAIL_IF(emit_vex_instruction(compiler, VPERMPD_y_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | VEX_W | EX86_SSE2, freg_orig, 0, TMP_FREG, 0));
4149 			return emit_byte(compiler, 0x4e);
4150 		}
4151 
4152 		FAIL_IF(emit_vex_instruction(compiler, VINSERTF128_y_y_xm | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2 | VEX_SSE2_OPV, freg_orig, freg_orig, TMP_FREG, 0));
4153 		return emit_byte(compiler, 1);
4154 	}
4155 
4156 	if (srcdst == SLJIT_IMM) {
4157 		EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, srcdstw);
4158 		srcdst = TMP_REG1;
4159 		srcdstw = 0;
4160 	}
4161 
4162 	op = 3;
4163 
4164 	switch (elem_size) {
4165 	case 0:
4166 		opcode = (type & SLJIT_SIMD_STORE) ? PEXTRB_rm_x_i8 : PINSRB_x_rm_i8;
4167 		break;
4168 	case 1:
4169 		if (!(type & SLJIT_SIMD_STORE)) {
4170 			op = 2;
4171 			opcode = PINSRW_x_rm_i8;
4172 		} else
4173 			opcode = PEXTRW_rm_x_i8;
4174 		break;
4175 	case 2:
4176 		opcode = (type & SLJIT_SIMD_STORE) ? PEXTRD_rm_x_i8 : PINSRD_x_rm_i8;
4177 		break;
4178 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
4179 	case 3:
4180 		/* PINSRQ / PEXTRQ */
4181 		opcode = (type & SLJIT_SIMD_STORE) ? PEXTRD_rm_x_i8 : PINSRD_x_rm_i8;
4182 		compiler->mode32 = 0;
4183 		break;
4184 #endif /* SLJIT_CONFIG_X86_64 */
4185 	}
4186 
4187 	if (use_vex && (type & SLJIT_SIMD_STORE)) {
4188 		op = opcode | ((op == 3) ? VEX_OP_0F3A : 0);
4189 		FAIL_IF(emit_vex_instruction(compiler, op | EX86_PREF_66 | VEX_AUTO_W | EX86_SSE2_OP1 | VEX_SSE2_OPV, freg, 0, srcdst, srcdstw));
4190 	} else {
4191 		inst = emit_x86_instruction(compiler, op | EX86_PREF_66 | EX86_SSE2_OP1, freg, 0, srcdst, srcdstw);
4192 		FAIL_IF(!inst);
4193 		inst[0] = GROUP_0F;
4194 
4195 		if (op == 3) {
4196 			inst[1] = 0x3a;
4197 			inst[2] = opcode;
4198 		} else
4199 			inst[1] = opcode;
4200 	}
4201 
4202 	FAIL_IF(emit_byte(compiler, U8(lane_index)));
4203 
4204 	if (!(type & SLJIT_SIMD_LANE_SIGNED) || (srcdst & SLJIT_MEM)) {
4205 		if (freg == TMP_FREG && !(type & SLJIT_SIMD_STORE)) {
4206 			SLJIT_ASSERT(reg_size == 5);
4207 
4208 			if (type & SLJIT_SIMD_LANE_ZERO) {
4209 				FAIL_IF(emit_vex_instruction(compiler, VPERMQ_y_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | VEX_W | EX86_SSE2, freg_orig, 0, TMP_FREG, 0));
4210 				return emit_byte(compiler, 0x4e);
4211 			}
4212 
4213 			FAIL_IF(emit_vex_instruction(compiler, VINSERTI128_y_y_xm | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2 | VEX_SSE2_OPV, freg_orig, freg_orig, TMP_FREG, 0));
4214 			return emit_byte(compiler, 1);
4215 		}
4216 
4217 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
4218 		if (srcdst_orig & SLJIT_MEM)
4219 			return emit_mov(compiler, srcdst_orig, srcdstw_orig, TMP_REG1, 0);
4220 #endif /* SLJIT_CONFIG_X86_32 */
4221 		return SLJIT_SUCCESS;
4222 	}
4223 
4224 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
4225 	if (elem_size >= 3)
4226 		return SLJIT_SUCCESS;
4227 
4228 	compiler->mode32 = (type & SLJIT_32);
4229 
4230 	op = 2;
4231 
4232 	if (elem_size == 0)
4233 		op |= EX86_REX;
4234 
4235 	if (elem_size == 2) {
4236 		if (type & SLJIT_32)
4237 			return SLJIT_SUCCESS;
4238 
4239 		SLJIT_ASSERT(!(compiler->mode32));
4240 		op = 1;
4241 	}
4242 
4243 	inst = emit_x86_instruction(compiler, op, srcdst, 0, srcdst, 0);
4244 	FAIL_IF(!inst);
4245 
4246 	if (op != 1) {
4247 		inst[0] = GROUP_0F;
4248 		inst[1] = U8((elem_size == 0) ? MOVSX_r_rm8 : MOVSX_r_rm16);
4249 	} else
4250 		inst[0] = MOVSXD_r_rm;
4251 #else /* !SLJIT_CONFIG_X86_64 */
4252 	if (elem_size >= 2)
4253 		return SLJIT_SUCCESS;
4254 
4255 	FAIL_IF(emit_groupf(compiler, (elem_size == 0) ? MOVSX_r_rm8 : MOVSX_r_rm16,
4256 		(srcdst_orig != 0 && FAST_IS_REG(srcdst_orig)) ? srcdst_orig : srcdst, srcdst, 0));
4257 
4258 	if (srcdst_orig & SLJIT_MEM)
4259 		return emit_mov(compiler, srcdst_orig, srcdstw_orig, TMP_REG1, 0);
4260 #endif /* SLJIT_CONFIG_X86_64 */
4261 	return SLJIT_SUCCESS;
4262 }
4263 
4264 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_compiler *compiler, sljit_s32 type,
4265 	sljit_s32 freg,
4266 	sljit_s32 src, sljit_s32 src_lane_index)
4267 {
4268 	sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type);
4269 	sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type);
4270 	sljit_s32 use_vex = (cpu_feature_list & CPU_FEATURE_AVX) && (compiler->options & SLJIT_ENTER_USE_VEX);
4271 	sljit_uw pref;
4272 	sljit_u8 byte;
4273 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
4274 	sljit_s32 opcode3 = TMP_REG1;
4275 #else /* !SLJIT_CONFIG_X86_32 */
4276 	sljit_s32 opcode3 = SLJIT_S0;
4277 #endif /* SLJIT_CONFIG_X86_32 */
4278 
4279 	CHECK_ERROR();
4280 	CHECK(check_sljit_emit_simd_lane_replicate(compiler, type, freg, src, src_lane_index));
4281 
4282 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
4283 	compiler->mode32 = 1;
4284 #endif /* SLJIT_CONFIG_X86_64 */
4285 	SLJIT_ASSERT(reg_map[opcode3] == 3);
4286 
4287 	if (reg_size == 5) {
4288 		if (!(cpu_feature_list & CPU_FEATURE_AVX2))
4289 			return SLJIT_ERR_UNSUPPORTED;
4290 		use_vex = 1;
4291 	} else if (reg_size != 4)
4292 		return SLJIT_ERR_UNSUPPORTED;
4293 
4294 	if (type & SLJIT_SIMD_FLOAT) {
4295 		pref = 0;
4296 		byte = U8(src_lane_index);
4297 
4298 		if (elem_size == 3) {
4299 			if (type & SLJIT_SIMD_TEST)
4300 				return SLJIT_SUCCESS;
4301 
4302 			if (reg_size == 5) {
4303 				if (src_lane_index == 0)
4304 					return emit_vex_instruction(compiler, VBROADCASTSD_x_xm | VEX_256 | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, 0, src, 0);
4305 
4306 				FAIL_IF(emit_vex_instruction(compiler, VPERMPD_y_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | VEX_W | EX86_SSE2, freg, 0, src, 0));
4307 
4308 				byte = U8(byte | (byte << 2));
4309 				return emit_byte(compiler, U8(byte | (byte << 4)));
4310 			}
4311 
4312 			if (src_lane_index == 0) {
4313 				if (use_vex)
4314 					return emit_vex_instruction(compiler, MOVDDUP_x_xm | EX86_PREF_F2 | EX86_SSE2, freg, 0, src, 0);
4315 				return emit_groupf(compiler, MOVDDUP_x_xm | EX86_PREF_F2 | EX86_SSE2, freg, src, 0);
4316 			}
4317 
4318 			/* Changes it to SHUFPD_x_xm. */
4319 			pref = EX86_PREF_66;
4320 		} else if (elem_size != 2)
4321 			return SLJIT_ERR_UNSUPPORTED;
4322 		else if (type & SLJIT_SIMD_TEST)
4323 			return SLJIT_SUCCESS;
4324 
4325 		if (reg_size == 5) {
4326 			SLJIT_ASSERT(elem_size == 2);
4327 
4328 			if (src_lane_index == 0)
4329 				return emit_vex_instruction(compiler, VBROADCASTSS_x_xm | VEX_256 | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, 0, src, 0);
4330 
4331 			FAIL_IF(emit_vex_instruction(compiler, VPERMPD_y_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | VEX_W | EX86_SSE2, freg, 0, src, 0));
4332 
4333 			byte = 0x44;
4334 			if (src_lane_index >= 4) {
4335 				byte = 0xee;
4336 				src_lane_index -= 4;
4337 			}
4338 
4339 			FAIL_IF(emit_byte(compiler, byte));
4340 			FAIL_IF(emit_vex_instruction(compiler, SHUFPS_x_xm | VEX_256 | pref | EX86_SSE2 | VEX_SSE2_OPV, freg, freg, freg, 0));
4341 			byte = U8(src_lane_index);
4342 		} else if (use_vex) {
4343 			FAIL_IF(emit_vex_instruction(compiler, SHUFPS_x_xm | pref | EX86_SSE2 | VEX_SSE2_OPV, freg, src, src, 0));
4344 		} else {
4345 			if (freg != src)
4346 				FAIL_IF(emit_groupf(compiler, MOVAPS_x_xm | pref | EX86_SSE2, freg, src, 0));
4347 
4348 			FAIL_IF(emit_groupf(compiler, SHUFPS_x_xm | pref | EX86_SSE2, freg, freg, 0));
4349 		}
4350 
4351 		if (elem_size == 2) {
4352 			byte = U8(byte | (byte << 2));
4353 			byte = U8(byte | (byte << 4));
4354 		} else
4355 			byte = U8(byte | (byte << 1));
4356 
4357 		return emit_byte(compiler, U8(byte));
4358 	}
4359 
4360 	if (type & SLJIT_SIMD_TEST)
4361 		return SLJIT_SUCCESS;
4362 
4363 	if (elem_size == 0) {
4364 		if (reg_size == 5 && src_lane_index >= 16) {
4365 			FAIL_IF(emit_vex_instruction(compiler, VPERMQ_y_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | VEX_W | EX86_SSE2, freg, 0, src, 0));
4366 			FAIL_IF(emit_byte(compiler, src_lane_index >= 24 ? 0xff : 0xaa));
4367 			src_lane_index &= 0x7;
4368 			src = freg;
4369 		}
4370 
4371 		if (src_lane_index != 0 || (freg != src && (!(cpu_feature_list & CPU_FEATURE_AVX2) || !use_vex))) {
4372 			pref = 0;
4373 
4374 			if ((src_lane_index & 0x3) == 0) {
4375 				pref = EX86_PREF_66;
4376 				byte = U8(src_lane_index >> 2);
4377 			} else if (src_lane_index < 8 && (src_lane_index & 0x1) == 0) {
4378 				pref = EX86_PREF_F2;
4379 				byte = U8(src_lane_index >> 1);
4380 			} else {
4381 				if (!use_vex) {
4382 					if (freg != src)
4383 						FAIL_IF(emit_groupf(compiler, MOVDQA_x_xm | EX86_PREF_66 | EX86_SSE2, freg, src, 0));
4384 
4385 					FAIL_IF(emit_groupf(compiler, PSRLDQ_x | EX86_PREF_66 | EX86_SSE2_OP2, opcode3, freg, 0));
4386 				} else
4387 					FAIL_IF(emit_vex_instruction(compiler, PSRLDQ_x | EX86_PREF_66 | EX86_SSE2_OP2 | VEX_SSE2_OPV, opcode3, freg, src, 0));
4388 
4389 				FAIL_IF(emit_byte(compiler, U8(src_lane_index)));
4390 			}
4391 
4392 			if (pref != 0) {
4393 				if (use_vex)
4394 					FAIL_IF(emit_vex_instruction(compiler, PSHUFLW_x_xm | pref | EX86_SSE2, freg, 0, src, 0));
4395 				else
4396 					FAIL_IF(emit_groupf(compiler, PSHUFLW_x_xm | pref | EX86_SSE2, freg, src, 0));
4397 				FAIL_IF(emit_byte(compiler, byte));
4398 			}
4399 
4400 			src = freg;
4401 		}
4402 
4403 		if (use_vex && (cpu_feature_list & CPU_FEATURE_AVX2))
4404 			return emit_vex_instruction(compiler, VPBROADCASTB_x_xm | (reg_size == 5 ? VEX_256 : 0) | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, 0, src, 0);
4405 
4406 		SLJIT_ASSERT(reg_size == 4);
4407 		FAIL_IF(emit_groupf(compiler, PXOR_x_xm | EX86_PREF_66 | EX86_SSE2, TMP_FREG, TMP_FREG, 0));
4408 		return emit_groupf_ext(compiler, PSHUFB_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, TMP_FREG, 0);
4409 	}
4410 
4411 	if ((cpu_feature_list & CPU_FEATURE_AVX2) && use_vex && src_lane_index == 0 && elem_size <= 3) {
4412 		switch (elem_size) {
4413 		case 1:
4414 			pref = VPBROADCASTW_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2;
4415 			break;
4416 		case 2:
4417 			pref = VPBROADCASTD_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2;
4418 			break;
4419 		default:
4420 			pref = VPBROADCASTQ_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2;
4421 			break;
4422 		}
4423 
4424 		if (reg_size == 5)
4425 			pref |= VEX_256;
4426 
4427 		return emit_vex_instruction(compiler, pref, freg, 0, src, 0);
4428 	}
4429 
4430 	if (reg_size == 5) {
4431 		switch (elem_size) {
4432 		case 1:
4433 			byte = U8(src_lane_index & 0x3);
4434 			src_lane_index >>= 2;
4435 			pref = PSHUFLW_x_xm | VEX_256 | ((src_lane_index & 1) == 0 ? EX86_PREF_F2 : EX86_PREF_F3) | EX86_SSE2;
4436 			break;
4437 		case 2:
4438 			byte = U8(src_lane_index & 0x3);
4439 			src_lane_index >>= 1;
4440 			pref = PSHUFD_x_xm | VEX_256 | EX86_PREF_66 | EX86_SSE2;
4441 			break;
4442 		case 3:
4443 			pref = 0;
4444 			break;
4445 		default:
4446 			FAIL_IF(emit_vex_instruction(compiler, VPERMQ_y_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | VEX_W | EX86_SSE2, freg, 0, src, 0));
4447 			return emit_byte(compiler, U8(src_lane_index == 0 ? 0x44 : 0xee));
4448 		}
4449 
4450 		if (pref != 0) {
4451 			FAIL_IF(emit_vex_instruction(compiler, pref, freg, 0, src, 0));
4452 			byte = U8(byte | (byte << 2));
4453 			FAIL_IF(emit_byte(compiler, U8(byte | (byte << 4))));
4454 
4455 			if (src_lane_index == 0)
4456 				return emit_vex_instruction(compiler, VPBROADCASTQ_x_xm | VEX_256 | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, 0, freg, 0);
4457 
4458 			src = freg;
4459 		}
4460 
4461 		FAIL_IF(emit_vex_instruction(compiler, VPERMQ_y_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | VEX_W | EX86_SSE2, freg, 0, src, 0));
4462 		byte = U8(src_lane_index);
4463 		byte = U8(byte | (byte << 2));
4464 		return emit_byte(compiler, U8(byte | (byte << 4)));
4465 	}
4466 
4467 	switch (elem_size) {
4468 	case 1:
4469 		byte = U8(src_lane_index & 0x3);
4470 		src_lane_index >>= 1;
4471 		pref = (src_lane_index & 2) == 0 ? EX86_PREF_F2 : EX86_PREF_F3;
4472 
4473 		if (use_vex)
4474 			FAIL_IF(emit_vex_instruction(compiler, PSHUFLW_x_xm | pref | EX86_SSE2, freg, 0, src, 0));
4475 		else
4476 			FAIL_IF(emit_groupf(compiler, PSHUFLW_x_xm | pref | EX86_SSE2, freg, src, 0));
4477 		byte = U8(byte | (byte << 2));
4478 		FAIL_IF(emit_byte(compiler, U8(byte | (byte << 4))));
4479 
4480 		if ((cpu_feature_list & CPU_FEATURE_AVX2) && use_vex && pref == EX86_PREF_F2)
4481 			return emit_vex_instruction(compiler, VPBROADCASTD_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, 0, freg, 0);
4482 
4483 		src = freg;
4484 		/* fallthrough */
4485 	case 2:
4486 		byte = U8(src_lane_index);
4487 		byte = U8(byte | (byte << 2));
4488 		break;
4489 	default:
4490 		byte = U8(src_lane_index << 1);
4491 		byte = U8(byte | (byte << 2) | 0x4);
4492 		break;
4493 	}
4494 
4495 	if (use_vex)
4496 		FAIL_IF(emit_vex_instruction(compiler, PSHUFD_x_xm | EX86_PREF_66 | EX86_SSE2, freg, 0, src, 0));
4497 	else
4498 		FAIL_IF(emit_groupf(compiler, PSHUFD_x_xm | EX86_PREF_66 | EX86_SSE2, freg, src, 0));
4499 	return emit_byte(compiler, U8(byte | (byte << 4)));
4500 }
4501 
4502 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler *compiler, sljit_s32 type,
4503 	sljit_s32 freg,
4504 	sljit_s32 src, sljit_sw srcw)
4505 {
4506 	sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type);
4507 	sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type);
4508 	sljit_s32 elem2_size = SLJIT_SIMD_GET_ELEM2_SIZE(type);
4509 	sljit_s32 use_vex = (cpu_feature_list & CPU_FEATURE_AVX) && (compiler->options & SLJIT_ENTER_USE_VEX);
4510 	sljit_u8 opcode;
4511 
4512 	CHECK_ERROR();
4513 	CHECK(check_sljit_emit_simd_extend(compiler, type, freg, src, srcw));
4514 
4515 	ADJUST_LOCAL_OFFSET(src, srcw);
4516 
4517 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
4518 	compiler->mode32 = 1;
4519 #endif /* SLJIT_CONFIG_X86_64 */
4520 
4521 	if (reg_size == 5) {
4522 		if (!(cpu_feature_list & CPU_FEATURE_AVX2))
4523 			return SLJIT_ERR_UNSUPPORTED;
4524 		use_vex = 1;
4525 	} else if (reg_size != 4)
4526 		return SLJIT_ERR_UNSUPPORTED;
4527 
4528 	if (type & SLJIT_SIMD_FLOAT) {
4529 		if (elem_size != 2 || elem2_size != 3)
4530 			return SLJIT_ERR_UNSUPPORTED;
4531 
4532 		if (type & SLJIT_SIMD_TEST)
4533 			return SLJIT_SUCCESS;
4534 
4535 		if (use_vex)
4536 			return emit_vex_instruction(compiler, CVTPS2PD_x_xm | ((reg_size == 5) ? VEX_256 : 0) | EX86_SSE2, freg, 0, src, srcw);
4537 		return emit_groupf(compiler, CVTPS2PD_x_xm | EX86_SSE2, freg, src, srcw);
4538 	}
4539 
4540 	switch (elem_size) {
4541 	case 0:
4542 		if (elem2_size == 1)
4543 			opcode = (type & SLJIT_SIMD_EXTEND_SIGNED) ? PMOVSXBW_x_xm : PMOVZXBW_x_xm;
4544 		else if (elem2_size == 2)
4545 			opcode = (type & SLJIT_SIMD_EXTEND_SIGNED) ? PMOVSXBD_x_xm : PMOVZXBD_x_xm;
4546 		else if (elem2_size == 3)
4547 			opcode = (type & SLJIT_SIMD_EXTEND_SIGNED) ? PMOVSXBQ_x_xm : PMOVZXBQ_x_xm;
4548 		else
4549 			return SLJIT_ERR_UNSUPPORTED;
4550 		break;
4551 	case 1:
4552 		if (elem2_size == 2)
4553 			opcode = (type & SLJIT_SIMD_EXTEND_SIGNED) ? PMOVSXWD_x_xm : PMOVZXWD_x_xm;
4554 		else if (elem2_size == 3)
4555 			opcode = (type & SLJIT_SIMD_EXTEND_SIGNED) ? PMOVSXWQ_x_xm : PMOVZXWQ_x_xm;
4556 		else
4557 			return SLJIT_ERR_UNSUPPORTED;
4558 		break;
4559 	case 2:
4560 		if (elem2_size == 3)
4561 			opcode = (type & SLJIT_SIMD_EXTEND_SIGNED) ? PMOVSXDQ_x_xm : PMOVZXDQ_x_xm;
4562 		else
4563 			return SLJIT_ERR_UNSUPPORTED;
4564 		break;
4565 	default:
4566 		return SLJIT_ERR_UNSUPPORTED;
4567 	}
4568 
4569 	if (type & SLJIT_SIMD_TEST)
4570 		return SLJIT_SUCCESS;
4571 
4572 	if (use_vex)
4573 		return emit_vex_instruction(compiler, opcode | ((reg_size == 5) ? VEX_256 : 0) | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, 0, src, srcw);
4574 	return emit_groupf_ext(compiler, opcode | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, src, srcw);
4575 }
4576 
4577 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *compiler, sljit_s32 type,
4578 	sljit_s32 freg,
4579 	sljit_s32 dst, sljit_sw dstw)
4580 {
4581 	sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type);
4582 	sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type);
4583 	sljit_s32 use_vex = (cpu_feature_list & CPU_FEATURE_AVX) && (compiler->options & SLJIT_ENTER_USE_VEX);
4584 	sljit_s32 dst_r;
4585 	sljit_uw op;
4586 	sljit_u8 *inst;
4587 
4588 	CHECK_ERROR();
4589 	CHECK(check_sljit_emit_simd_sign(compiler, type, freg, dst, dstw));
4590 
4591 	ADJUST_LOCAL_OFFSET(dst, dstw);
4592 
4593 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
4594 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
4595 	compiler->mode32 = 1;
4596 #endif /* SLJIT_CONFIG_X86_64 */
4597 
4598 	if (elem_size > 3 || ((type & SLJIT_SIMD_FLOAT) && elem_size < 2))
4599 		return SLJIT_ERR_UNSUPPORTED;
4600 
4601 	if (reg_size == 4) {
4602 		if (type & SLJIT_SIMD_TEST)
4603 			return SLJIT_SUCCESS;
4604 
4605 		op = EX86_PREF_66 | EX86_SSE2_OP2;
4606 
4607 		switch (elem_size) {
4608 		case 1:
4609 			if (use_vex)
4610 				FAIL_IF(emit_vex_instruction(compiler, PACKSSWB_x_xm | EX86_PREF_66 | EX86_SSE2 | VEX_SSE2_OPV, TMP_FREG, freg, freg, 0));
4611 			else
4612 				FAIL_IF(emit_groupf(compiler, PACKSSWB_x_xm | EX86_PREF_66 | EX86_SSE2, TMP_FREG, freg, 0));
4613 			freg = TMP_FREG;
4614 			break;
4615 		case 2:
4616 			op = EX86_SSE2_OP2;
4617 			break;
4618 		}
4619 
4620 		dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
4621 		op |= (elem_size < 2) ? PMOVMSKB_r_x : MOVMSKPS_r_x;
4622 
4623 		if (use_vex)
4624 			FAIL_IF(emit_vex_instruction(compiler, op, dst_r, 0, freg, 0));
4625 		else
4626 			FAIL_IF(emit_groupf(compiler, op, dst_r, freg, 0));
4627 
4628 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
4629 		compiler->mode32 = type & SLJIT_32;
4630 #endif /* SLJIT_CONFIG_X86_64 */
4631 
4632 		if (elem_size == 1) {
4633 			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 8, dst_r, 0);
4634 			FAIL_IF(!inst);
4635 			inst[1] |= SHR;
4636 		}
4637 
4638 		if (dst_r == TMP_REG1)
4639 			return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
4640 
4641 		return SLJIT_SUCCESS;
4642 	}
4643 
4644 	if (reg_size != 5 || !(cpu_feature_list & CPU_FEATURE_AVX2))
4645 		return SLJIT_ERR_UNSUPPORTED;
4646 
4647 	if (type & SLJIT_SIMD_TEST)
4648 		return SLJIT_SUCCESS;
4649 
4650 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
4651 
4652 	if (elem_size == 1) {
4653 		FAIL_IF(emit_vex_instruction(compiler, VEXTRACTI128_x_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2, freg, 0, TMP_FREG, 0));
4654 		FAIL_IF(emit_byte(compiler, 1));
4655 		FAIL_IF(emit_vex_instruction(compiler, PACKSSWB_x_xm | VEX_256 | EX86_PREF_66 | EX86_SSE2 | VEX_SSE2_OPV, TMP_FREG, freg, TMP_FREG, 0));
4656 		FAIL_IF(emit_groupf(compiler, PMOVMSKB_r_x | EX86_PREF_66 | EX86_SSE2_OP2, dst_r, TMP_FREG, 0));
4657 	} else {
4658 		op = MOVMSKPS_r_x | VEX_256 | EX86_SSE2_OP2;
4659 
4660 		if (elem_size == 0)
4661 			op = PMOVMSKB_r_x | VEX_256 | EX86_PREF_66 | EX86_SSE2_OP2;
4662 		else if (elem_size == 3)
4663 			op |= EX86_PREF_66;
4664 
4665 		FAIL_IF(emit_vex_instruction(compiler, op, dst_r, 0, freg, 0));
4666 	}
4667 
4668 	if (dst_r == TMP_REG1) {
4669 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
4670 		compiler->mode32 = type & SLJIT_32;
4671 #endif /* SLJIT_CONFIG_X86_64 */
4672 		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
4673 	}
4674 
4675 	return SLJIT_SUCCESS;
4676 }
4677 
4678 static sljit_s32 emit_simd_mov(struct sljit_compiler *compiler, sljit_s32 type,
4679 	sljit_s32 dst_freg, sljit_s32 src_freg)
4680 {
4681 	sljit_uw op = ((type & SLJIT_SIMD_FLOAT) ? MOVAPS_x_xm : MOVDQA_x_xm) | EX86_SSE2;
4682 
4683 	SLJIT_ASSERT(SLJIT_SIMD_GET_REG_SIZE(type) == 4);
4684 
4685 	if (!(type & SLJIT_SIMD_FLOAT) || SLJIT_SIMD_GET_ELEM_SIZE(type) == 3)
4686 		op |= EX86_PREF_66;
4687 
4688 	return emit_groupf(compiler, op, dst_freg, src_freg, 0);
4689 }
4690 
4691 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *compiler, sljit_s32 type,
4692 	sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2_freg)
4693 {
4694 	sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type);
4695 	sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type);
4696 	sljit_uw op = 0;
4697 
4698 	CHECK_ERROR();
4699 	CHECK(check_sljit_emit_simd_op2(compiler, type, dst_freg, src1_freg, src2_freg));
4700 
4701 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
4702 	compiler->mode32 = 1;
4703 #endif /* SLJIT_CONFIG_X86_64 */
4704 
4705 	if (reg_size == 5) {
4706 		if (!(cpu_feature_list & CPU_FEATURE_AVX2))
4707 			return SLJIT_ERR_UNSUPPORTED;
4708 	} else if (reg_size != 4)
4709 		return SLJIT_ERR_UNSUPPORTED;
4710 
4711 	if ((type & SLJIT_SIMD_FLOAT) && (elem_size < 2 || elem_size > 3))
4712 		return SLJIT_ERR_UNSUPPORTED;
4713 
4714 	switch (SLJIT_SIMD_GET_OPCODE(type)) {
4715 	case SLJIT_SIMD_OP2_AND:
4716 		op = (type & SLJIT_SIMD_FLOAT) ? ANDPD_x_xm : PAND_x_xm;
4717 
4718 		if (!(type & SLJIT_SIMD_FLOAT) || elem_size == 3)
4719 			op |= EX86_PREF_66;
4720 		break;
4721 	case SLJIT_SIMD_OP2_OR:
4722 		op = (type & SLJIT_SIMD_FLOAT) ? ORPD_x_xm : POR_x_xm;
4723 
4724 		if (!(type & SLJIT_SIMD_FLOAT) || elem_size == 3)
4725 			op |= EX86_PREF_66;
4726 		break;
4727 	case SLJIT_SIMD_OP2_XOR:
4728 		op = (type & SLJIT_SIMD_FLOAT) ? XORPD_x_xm : PXOR_x_xm;
4729 
4730 		if (!(type & SLJIT_SIMD_FLOAT) || elem_size == 3)
4731 			op |= EX86_PREF_66;
4732 		break;
4733 	}
4734 
4735 	if (type & SLJIT_SIMD_TEST)
4736 		return SLJIT_SUCCESS;
4737 
4738 	if (reg_size == 5 || ((cpu_feature_list & CPU_FEATURE_AVX) && (compiler->options & SLJIT_ENTER_USE_VEX))) {
4739 		if (reg_size == 5)
4740 			op |= VEX_256;
4741 
4742 		return emit_vex_instruction(compiler, op | EX86_SSE2 | VEX_SSE2_OPV, dst_freg, src1_freg, src2_freg, 0);
4743 	}
4744 
4745 	if (dst_freg != src1_freg) {
4746 		if (dst_freg == src2_freg)
4747 			src2_freg = src1_freg;
4748 		else
4749 			FAIL_IF(emit_simd_mov(compiler, type, dst_freg, src1_freg));
4750 	}
4751 
4752 	FAIL_IF(emit_groupf(compiler, op | EX86_SSE2, dst_freg, src2_freg, 0));
4753 	return SLJIT_SUCCESS;
4754 }
4755 
4756 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_load(struct sljit_compiler *compiler, sljit_s32 op,
4757 	sljit_s32 dst_reg,
4758 	sljit_s32 mem_reg)
4759 {
4760 	CHECK_ERROR();
4761 	CHECK(check_sljit_emit_atomic_load(compiler, op, dst_reg, mem_reg));
4762 
4763 	SLJIT_SKIP_CHECKS(compiler);
4764 	return sljit_emit_op1(compiler, op, dst_reg, 0, SLJIT_MEM1(mem_reg), 0);
4765 }
4766 
4767 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_store(struct sljit_compiler *compiler, sljit_s32 op,
4768 	sljit_s32 src_reg,
4769 	sljit_s32 mem_reg,
4770 	sljit_s32 temp_reg)
4771 {
4772 	sljit_uw pref;
4773 	sljit_s32 free_reg = TMP_REG1;
4774 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
4775 	sljit_sw srcw = 0;
4776 	sljit_sw tempw = 0;
4777 #endif /* SLJIT_CONFIG_X86_32 */
4778 
4779 	CHECK_ERROR();
4780 	CHECK(check_sljit_emit_atomic_store(compiler, op, src_reg, mem_reg, temp_reg));
4781 	CHECK_EXTRA_REGS(src_reg, srcw, (void)0);
4782 	CHECK_EXTRA_REGS(temp_reg, tempw, (void)0);
4783 
4784 	SLJIT_ASSERT(FAST_IS_REG(src_reg) || src_reg == SLJIT_MEM1(SLJIT_SP));
4785 	SLJIT_ASSERT(FAST_IS_REG(temp_reg) || temp_reg == SLJIT_MEM1(SLJIT_SP));
4786 
4787 	op = GET_OPCODE(op);
4788 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
4789 	if ((src_reg & SLJIT_MEM) || (op == SLJIT_MOV_U8 && reg_map[src_reg] >= 4)) {
4790 		/* Src is virtual register or its low byte is not accessible. */
4791 		SLJIT_ASSERT(src_reg != SLJIT_R1);
4792 		free_reg = src_reg;
4793 
4794 		EMIT_MOV(compiler, TMP_REG1, 0, src_reg, srcw);
4795 		src_reg = TMP_REG1;
4796 
4797 		if (mem_reg == src_reg)
4798 			mem_reg = TMP_REG1;
4799 	}
4800 #endif /* SLJIT_CONFIG_X86_32 */
4801 
4802 	if (temp_reg != SLJIT_R0) {
4803 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
4804 		compiler->mode32 = 0;
4805 
4806 		EMIT_MOV(compiler, free_reg, 0, SLJIT_R0, 0);
4807 		EMIT_MOV(compiler, SLJIT_R0, 0, temp_reg, 0);
4808 
4809 		if (src_reg == SLJIT_R0)
4810 			src_reg = free_reg;
4811 		if (mem_reg == SLJIT_R0)
4812 			mem_reg = free_reg;
4813 #else /* !SLJIT_CONFIG_X86_64 */
4814 		if (src_reg == TMP_REG1 && mem_reg == SLJIT_R0 && (free_reg & SLJIT_MEM)) {
4815 			EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), 0, SLJIT_R1, 0);
4816 			EMIT_MOV(compiler, SLJIT_R1, 0, SLJIT_R0, 0);
4817 			EMIT_MOV(compiler, SLJIT_R0, 0, temp_reg, tempw);
4818 
4819 			mem_reg = SLJIT_R1;
4820 			free_reg = SLJIT_R1;
4821 		} else {
4822 			EMIT_MOV(compiler, free_reg, 0, SLJIT_R0, 0);
4823 			EMIT_MOV(compiler, SLJIT_R0, 0, temp_reg, tempw);
4824 
4825 			if (src_reg == SLJIT_R0)
4826 				src_reg = free_reg;
4827 			if (mem_reg == SLJIT_R0)
4828 				mem_reg = free_reg;
4829 		}
4830 #endif /* SLJIT_CONFIG_X86_64 */
4831 	}
4832 
4833 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
4834 	compiler->mode32 = op != SLJIT_MOV && op != SLJIT_MOV_P;
4835 #endif /* SLJIT_CONFIG_X86_64 */
4836 
4837 	/* Lock prefix. */
4838 	FAIL_IF(emit_byte(compiler, GROUP_LOCK));
4839 
4840 	pref = 0;
4841 	if (op == SLJIT_MOV_U16)
4842 		pref = EX86_HALF_ARG | EX86_PREF_66;
4843 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
4844 	if (op == SLJIT_MOV_U8)
4845 		pref = EX86_REX;
4846 #endif /* SLJIT_CONFIG_X86_64 */
4847 
4848 	FAIL_IF(emit_groupf(compiler, (op == SLJIT_MOV_U8 ? CMPXCHG_rm8_r : CMPXCHG_rm_r) | pref, src_reg, SLJIT_MEM1(mem_reg), 0));
4849 
4850 	if (temp_reg != SLJIT_R0) {
4851 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
4852 		compiler->mode32 = 0;
4853 		return emit_mov(compiler, SLJIT_R0, 0, TMP_REG1, 0);
4854 #else /* !SLJIT_CONFIG_X86_64 */
4855 		EMIT_MOV(compiler, SLJIT_R0, 0, free_reg, 0);
4856 		if (free_reg != TMP_REG1)
4857 			return emit_mov(compiler, free_reg, 0, (free_reg == SLJIT_R1) ? SLJIT_MEM1(SLJIT_SP) : TMP_REG1, 0);
4858 #endif /* SLJIT_CONFIG_X86_64 */
4859 	}
4860 	return SLJIT_SUCCESS;
4861 }
4862 
4863 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_local_base(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw offset)
4864 {
4865 	CHECK_ERROR();
4866 	CHECK(check_sljit_get_local_base(compiler, dst, dstw, offset));
4867 	ADJUST_LOCAL_OFFSET(dst, dstw);
4868 
4869 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
4870 
4871 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
4872 	compiler->mode32 = 0;
#endif /* SLJIT_CONFIG_X86_64 */
4874 
4875 	ADJUST_LOCAL_OFFSET(SLJIT_MEM1(SLJIT_SP), offset);
4876 
4877 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
4878 	if (NOT_HALFWORD(offset)) {
4879 		FAIL_IF(emit_load_imm64(compiler, TMP_REG1, offset));
4880 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
4881 		SLJIT_ASSERT(emit_lea_binary(compiler, dst, dstw, SLJIT_SP, 0, TMP_REG1, 0) != SLJIT_ERR_UNSUPPORTED);
4882 		return compiler->error;
#else /* !SLJIT_DEBUG */
		return emit_lea_binary(compiler, dst, dstw, SLJIT_SP, 0, TMP_REG1, 0);
#endif /* SLJIT_DEBUG */
	}
#endif /* SLJIT_CONFIG_X86_64 */
4888 
4889 	if (offset != 0)
4890 		return emit_lea_binary(compiler, dst, dstw, SLJIT_SP, 0, SLJIT_IMM, offset);
4891 	return emit_mov(compiler, dst, dstw, SLJIT_SP, 0);
4892 }
4893 
4894 SLJIT_API_FUNC_ATTRIBUTE struct sljit_const* sljit_emit_const(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw init_value)
4895 {
4896 	sljit_u8 *inst;
4897 	struct sljit_const *const_;
4898 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
4899 	sljit_s32 reg;
#endif /* SLJIT_CONFIG_X86_64 */
4901 
4902 	CHECK_ERROR_PTR();
4903 	CHECK_PTR(check_sljit_emit_const(compiler, dst, dstw, init_value));
4904 	ADJUST_LOCAL_OFFSET(dst, dstw);
4905 
4906 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
4907 
4908 	const_ = (struct sljit_const*)ensure_abuf(compiler, sizeof(struct sljit_const));
4909 	PTR_FAIL_IF(!const_);
4910 	set_const(const_, compiler);
4911 
4912 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
4913 	compiler->mode32 = 0;
4914 	reg = FAST_IS_REG(dst) ? dst : TMP_REG1;
4915 
4916 	if (emit_load_imm64(compiler, reg, init_value))
4917 		return NULL;
#else /* !SLJIT_CONFIG_X86_64 */
	if (emit_mov(compiler, dst, dstw, SLJIT_IMM, init_value))
		return NULL;
#endif /* SLJIT_CONFIG_X86_64 */
4922 
4923 	inst = (sljit_u8*)ensure_buf(compiler, 1);
4924 	PTR_FAIL_IF(!inst);
4925 
4926 	inst[0] = SLJIT_INST_CONST;
4927 
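	/* x86-64 has no 64-bit immediate-to-memory store, so when dst is a
	   memory operand the value is materialized in TMP_REG1 above and
	   stored from there below. */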
4928 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
4929 	if (dst & SLJIT_MEM)
4930 		if (emit_mov(compiler, dst, dstw, TMP_REG1, 0))
4931 			return NULL;
#endif /* SLJIT_CONFIG_X86_64 */
4933 
4934 	return const_;
4935 }
4936 
4937 SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_mov_addr(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw)
4938 {
4939 	struct sljit_jump *jump;
4940 	sljit_u8 *inst;
4941 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
4942 	sljit_s32 reg;
4943 #endif /* SLJIT_CONFIG_X86_64 */
4944 
4945 	CHECK_ERROR_PTR();
4946 	CHECK_PTR(check_sljit_emit_mov_addr(compiler, dst, dstw));
4947 	ADJUST_LOCAL_OFFSET(dst, dstw);
4948 
4949 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
4950 
4951 	jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
4952 	PTR_FAIL_IF(!jump);
4953 	set_mov_addr(jump, compiler, 0);
4954 
4955 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
4956 	compiler->mode32 = 0;
4957 	reg = FAST_IS_REG(dst) ? dst : TMP_REG1;
4958 
4959 	PTR_FAIL_IF(emit_load_imm64(compiler, reg, 0));
4960 	jump->addr = compiler->size;
4961 
4962 	if (reg_map[reg] >= 8)
4963 		jump->flags |= MOV_ADDR_HI;
4964 #else /* !SLJIT_CONFIG_X86_64 */
4965 	PTR_FAIL_IF(emit_mov(compiler, dst, dstw, SLJIT_IMM, 0));
4966 #endif /* SLJIT_CONFIG_X86_64 */
4967 
4968 	inst = (sljit_u8*)ensure_buf(compiler, 1);
4969 	PTR_FAIL_IF(!inst);
4970 
4971 	inst[0] = SLJIT_INST_MOV_ADDR;
4972 
4973 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
4974 	if (dst & SLJIT_MEM)
4975 		PTR_FAIL_IF(emit_mov(compiler, dst, dstw, TMP_REG1, 0));
4976 #endif /* SLJIT_CONFIG_X86_64 */
4977 
4978 	return jump;
4979 }
4980 
4981 SLJIT_API_FUNC_ATTRIBUTE void sljit_set_jump_addr(sljit_uw addr, sljit_uw new_target, sljit_sw executable_offset)
4982 {
4983 	SLJIT_UNUSED_ARG(executable_offset);
4984 
4985 	SLJIT_UPDATE_WX_FLAGS((void*)addr, (void*)(addr + sizeof(sljit_uw)), 0);
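	/* On x86-32 the patched word is the rel32 displacement of a CALL/JMP
	   whose 4-byte immediate starts at addr: the next instruction begins at
	   addr + 4, giving new_target - (addr + 4) - executable_offset. On
	   x86-64 the absolute target (the imm64 of a MOVABS) is stored. */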
4986 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
4987 	sljit_unaligned_store_sw((void*)addr, (sljit_sw)(new_target - (addr + 4) - (sljit_uw)executable_offset));
#else /* !SLJIT_CONFIG_X86_32 */
	sljit_unaligned_store_sw((void*)addr, (sljit_sw)new_target);
#endif /* SLJIT_CONFIG_X86_32 */
4991 	SLJIT_UPDATE_WX_FLAGS((void*)addr, (void*)(addr + sizeof(sljit_uw)), 1);
4992 }
4993 
4994 SLJIT_API_FUNC_ATTRIBUTE void sljit_set_const(sljit_uw addr, sljit_sw new_constant, sljit_sw executable_offset)
4995 {
4996 	SLJIT_UNUSED_ARG(executable_offset);
4997 
4998 	SLJIT_UPDATE_WX_FLAGS((void*)addr, (void*)(addr + sizeof(sljit_sw)), 0);
4999 	sljit_unaligned_store_sw((void*)addr, new_constant);
5000 	SLJIT_UPDATE_WX_FLAGS((void*)addr, (void*)(addr + sizeof(sljit_sw)), 1);
5001 }
5002