1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 This module by Zoltan Herczeg
10 Original API code Copyright (c) 1997-2012 University of Cambridge
11 New API code Copyright (c) 2016-2019 University of Cambridge
12
13 -----------------------------------------------------------------------------
14 Redistribution and use in source and binary forms, with or without
15 modification, are permitted provided that the following conditions are met:
16
17 * Redistributions of source code must retain the above copyright notice,
18 this list of conditions and the following disclaimer.
19
20 * Redistributions in binary form must reproduce the above copyright
21 notice, this list of conditions and the following disclaimer in the
22 documentation and/or other materials provided with the distribution.
23
24 * Neither the name of the University of Cambridge nor the names of its
25 contributors may be used to endorse or promote products derived from
26 this software without specific prior written permission.
27
28 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
29 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
32 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 POSSIBILITY OF SUCH DAMAGE.
39 -----------------------------------------------------------------------------
40 */
41
42 #if !(defined SUPPORT_VALGRIND)
43
44 #if ((defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86) \
45 || (defined SLJIT_CONFIG_S390X && SLJIT_CONFIG_S390X) \
46 || (defined SLJIT_CONFIG_LOONGARCH_64 && SLJIT_CONFIG_LOONGARCH_64))
47
/* Selects how a loaded data vector is tested against the searched
character(s) by fast_forward_char_pair_sse2_compare():

  vector_compare_match1   equality against one character vector
  vector_compare_match1i  the data is OR-ed with a mask vector first and
                          then compared against one character vector (used
                          when the two characters differ in a single bit)
  vector_compare_match2   equality against two character vectors, the two
                          results are OR-ed together */
typedef enum {
  vector_compare_match1 = 0,
  vector_compare_match1i = 1,
  vector_compare_match2 = 2
} vector_compare_type;
53
54 #if (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86)
max_fast_forward_char_pair_offset(void)55 static SLJIT_INLINE sljit_s32 max_fast_forward_char_pair_offset(void)
56 {
57 #if PCRE2_CODE_UNIT_WIDTH == 8
58 /* The AVX2 code path is currently disabled. */
59 /* return sljit_has_cpu_feature(SLJIT_HAS_AVX2) ? 31 : 15; */
60 return 15;
61 #elif PCRE2_CODE_UNIT_WIDTH == 16
62 /* The AVX2 code path is currently disabled. */
63 /* return sljit_has_cpu_feature(SLJIT_HAS_AVX2) ? 15 : 7; */
64 return 7;
65 #elif PCRE2_CODE_UNIT_WIDTH == 32
66 /* The AVX2 code path is currently disabled. */
67 /* return sljit_has_cpu_feature(SLJIT_HAS_AVX2) ? 7 : 3; */
68 return 3;
69 #else
70 #error "Unsupported unit width"
71 #endif
72 }
73 #else /* !SLJIT_CONFIG_X86 */
max_fast_forward_char_pair_offset(void)74 static SLJIT_INLINE sljit_s32 max_fast_forward_char_pair_offset(void)
75 {
76 #if PCRE2_CODE_UNIT_WIDTH == 8
77 return 15;
78 #elif PCRE2_CODE_UNIT_WIDTH == 16
79 return 7;
80 #elif PCRE2_CODE_UNIT_WIDTH == 32
81 return 3;
82 #else
83 #error "Unsupported unit width"
84 #endif
85 }
86 #endif /* SLJIT_CONFIG_X86 */
87
88 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
jump_if_utf_char_start(struct sljit_compiler * compiler,sljit_s32 reg)89 static struct sljit_jump *jump_if_utf_char_start(struct sljit_compiler *compiler, sljit_s32 reg)
90 {
91 #if PCRE2_CODE_UNIT_WIDTH == 8
92 OP2(SLJIT_AND, reg, 0, reg, 0, SLJIT_IMM, 0xc0);
93 return CMP(SLJIT_NOT_EQUAL, reg, 0, SLJIT_IMM, 0x80);
94 #elif PCRE2_CODE_UNIT_WIDTH == 16
95 OP2(SLJIT_AND, reg, 0, reg, 0, SLJIT_IMM, 0xfc00);
96 return CMP(SLJIT_NOT_EQUAL, reg, 0, SLJIT_IMM, 0xdc00);
97 #else
98 #error "Unknown code width"
99 #endif
100 }
101 #endif
102
#endif /* SLJIT_CONFIG_X86 || SLJIT_CONFIG_S390X || SLJIT_CONFIG_LOONGARCH_64 */
104
105 #if (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86)
106
character_to_int32(PCRE2_UCHAR chr)107 static sljit_s32 character_to_int32(PCRE2_UCHAR chr)
108 {
109 sljit_u32 value = chr;
110 #if PCRE2_CODE_UNIT_WIDTH == 8
111 #define SIMD_COMPARE_TYPE_INDEX 0
112 return (sljit_s32)((value << 24) | (value << 16) | (value << 8) | value);
113 #elif PCRE2_CODE_UNIT_WIDTH == 16
114 #define SIMD_COMPARE_TYPE_INDEX 1
115 return (sljit_s32)((value << 16) | value);
116 #elif PCRE2_CODE_UNIT_WIDTH == 32
117 #define SIMD_COMPARE_TYPE_INDEX 2
118 return (sljit_s32)(value);
119 #else
120 #error "Unsupported unit width"
121 #endif
122 }
123
/* Emits one step of a vector comparison as a raw x86 instruction.

A complete comparison is produced by calling this function with step 0, 1, 2
and 3 in order; steps that have nothing to do for the given compare type emit
no instruction. Splitting the work into steps lets the caller interleave two
independent comparisons.

Emitted instructions per compare type (128 bit registers):
  vector_compare_match1:   step 2: PCMPEQ dst, cmp1
  vector_compare_match1i:  step 0: POR dst, cmp2
                           step 2: PCMPEQ dst, cmp1
  vector_compare_match2:   step 0: MOVDQA tmp, dst
                           step 1: PCMPEQ dst, cmp1
                           step 2: PCMPEQ tmp, cmp2
                           step 3: POR dst, tmp

For 256 bit registers the instructions are VEX encoded. The three operand
form makes the MOVDQA copy unnecessary, so for vector_compare_match2 step 0
emits the "tmp = (dst == cmp2)" comparison directly and step 2 is empty.
Note that callers in this file currently always pass SLJIT_SIMD_REG_128.

Arguments:
  compiler       the sljit compiler the instruction is appended to
  compare_type   the kind of comparison, see vector_compare_type
  reg_type       SLJIT_SIMD_REG_128 or SLJIT_SIMD_REG_256
  step           the step to emit, 0 to 3
  dst_ind        machine index of the register holding the data / result
  cmp1_ind       machine index of the first character vector
  cmp2_ind       machine index of the second character (or mask) vector
  tmp_ind        machine index of a scratch vector (match2 only)

The register indexes are placed directly into the three bit ModRM fields, so
they are expected to be in the 0-7 range. */
static void fast_forward_char_pair_sse2_compare(struct sljit_compiler *compiler, vector_compare_type compare_type,
  sljit_s32 reg_type, int step, sljit_s32 dst_ind, sljit_s32 cmp1_ind, sljit_s32 cmp2_ind, sljit_s32 tmp_ind)
{
sljit_u8 instruction[4];

/* The first two bytes are the same for every instruction emitted below. */
if (reg_type == SLJIT_SIMD_REG_128)
  {
  /* 0x66 operand size prefix followed by the 0x0f opcode escape byte. */
  instruction[0] = 0x66;
  instruction[1] = 0x0f;
  }
else
  {
  /* Two byte VEX prefix. The first source operand field (vvvv) is stored
  inverted and is filled in later by xor-ing the register index into it. */
  instruction[0] = 0xc5;
  instruction[1] = 0xfd;
  }

SLJIT_ASSERT(step >= 0 && step <= 3);

if (compare_type != vector_compare_match2)
  {
  /* Single character comparison: at most two instructions are needed. */
  if (step == 0)
    {
    if (compare_type == vector_compare_match1i)
      {
      /* POR xmm1, xmm2/m128 */
      /* Sets the differing bit in the data before the comparison. */
      if (reg_type == SLJIT_SIMD_REG_256)
        instruction[1] ^= (dst_ind << 3);

      /* Prefix is filled. */
      instruction[2] = 0xeb;
      instruction[3] = 0xc0 | (dst_ind << 3) | cmp2_ind;
      sljit_emit_op_custom(compiler, instruction, 4);
      }
    return;
    }

  /* Steps 1 and 3 are empty for these compare types. */
  if (step != 2)
    return;

  /* PCMPEQB/W/D xmm1, xmm2/m128 */
  if (reg_type == SLJIT_SIMD_REG_256)
    instruction[1] ^= (dst_ind << 3);

  /* Prefix is filled. */
  instruction[2] = 0x74 + SIMD_COMPARE_TYPE_INDEX;
  instruction[3] = 0xc0 | (dst_ind << 3) | cmp1_ind;
  sljit_emit_op_custom(compiler, instruction, 4);
  return;
  }

/* vector_compare_match2 from here. */
if (reg_type == SLJIT_SIMD_REG_256)
  {
  /* The comparison normally done in step 2 is emitted in step 0 instead
  (see below), so step 2 has nothing left to do. */
  if (step == 2)
    return;

  if (step == 0)
    {
    /* No copy is needed with three operands: redirect to the step 2
    comparison with dst as the first source operand. */
    step = 2;
    instruction[1] ^= (dst_ind << 3);
    }
  }

switch (step)
  {
  case 0:
  SLJIT_ASSERT(reg_type == SLJIT_SIMD_REG_128);

  /* MOVDQA xmm1, xmm2/m128 */
  /* Copies the data into tmp, because PCMPEQ overwrites its destination. */
  /* Prefix is filled. */
  instruction[2] = 0x6f;
  instruction[3] = 0xc0 | (tmp_ind << 3) | dst_ind;
  sljit_emit_op_custom(compiler, instruction, 4);
  return;

  case 1:
  /* PCMPEQB/W/D xmm1, xmm2/m128 */
  /* dst = (dst == cmp1) */
  if (reg_type == SLJIT_SIMD_REG_256)
    instruction[1] ^= (dst_ind << 3);

  /* Prefix is filled. */
  instruction[2] = 0x74 + SIMD_COMPARE_TYPE_INDEX;
  instruction[3] = 0xc0 | (dst_ind << 3) | cmp1_ind;
  sljit_emit_op_custom(compiler, instruction, 4);
  return;

  case 2:
  /* PCMPEQB/W/D xmm1, xmm2/m128 */
  /* tmp = (data == cmp2), where data is the copy made in step 0 (128 bit)
  or dst itself (256 bit, reached from step 0). */
  /* Prefix is filled. */
  instruction[2] = 0x74 + SIMD_COMPARE_TYPE_INDEX;
  instruction[3] = 0xc0 | (tmp_ind << 3) | cmp2_ind;
  sljit_emit_op_custom(compiler, instruction, 4);
  return;

  case 3:
  /* POR xmm1, xmm2/m128 */
  /* Merges the two comparison results: dst |= tmp */
  if (reg_type == SLJIT_SIMD_REG_256)
    instruction[1] ^= (dst_ind << 3);

  /* Prefix is filled. */
  instruction[2] = 0xeb;
  instruction[3] = 0xc0 | (dst_ind << 3) | tmp_ind;
  sljit_emit_op_custom(compiler, instruction, 4);
  return;
  }
}
230
/* Evaluates to the result of sljit_has_cpu_feature(SLJIT_HAS_SIMD). */
#define JIT_HAS_FAST_FORWARD_CHAR_SIMD (sljit_has_cpu_feature(SLJIT_HAS_SIMD))

/* Emits code which advances STR_PTR to the next occurrence of char1 or char2
using 128 bit vector comparisons.

The generated code first processes the aligned 16 byte block containing
STR_PTR (ignoring the bytes before STR_PTR), then continues with whole
aligned blocks until a match is found or STR_END is reached.

Register usage of the generated code: TMP1 and TMP2 are overwritten, vector
registers SLJIT_FR0-SLJIT_FR3 are used as data / character / scratch vectors.

Arguments:
  common     the JIT compiler state
  char1      first accepted code unit
  char2      second accepted code unit (equal to char1 for a single one)
  offset     only used in UTF mode for 8 / 16 bit code units: when greater
             than zero, the code unit at STR_PTR - offset must be a
             character start, otherwise the search is restarted */
static void fast_forward_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_s32 offset)
{
DEFINE_COMPILER;
sljit_u8 instruction[8];
/* The AVX2 code path is currently disabled. */
/* sljit_s32 reg_type = sljit_has_cpu_feature(SLJIT_HAS_AVX2) ? SLJIT_SIMD_REG_256 : SLJIT_SIMD_REG_128; */
sljit_s32 reg_type = SLJIT_SIMD_REG_128;
sljit_s32 value;
struct sljit_label *start;
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
struct sljit_label *restart;
#endif
struct sljit_jump *quit;
struct sljit_jump *partial_quit[2];
vector_compare_type compare_type = vector_compare_match1;
sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
sljit_s32 data_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR0);
sljit_s32 cmp1_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR1);
sljit_s32 cmp2_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR2);
sljit_s32 tmp_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR3);
sljit_u32 bit = 0;
int i;

/* offset is only referenced in the UTF specific code at the end. */
SLJIT_UNUSED_ARG(offset);

/* Choose the comparison: characters differing in exactly one bit are
matched by setting that bit in the data and comparing once, other character
pairs need two comparisons. */
if (char1 != char2)
  {
  bit = char1 ^ char2;
  compare_type = vector_compare_match1i;

  if (!is_powerof2(bit))
    {
    bit = 0;
    compare_type = vector_compare_match2;
    }
  }

/* Nothing to search when STR_PTR is already at (or after) STR_END. In
complete mode this is a failed match, otherwise the jump is resolved at the
end of this function. */
partial_quit[0] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
if (common->mode == PCRE2_JIT_COMPLETE)
  add_jump(compiler, &common->failed_match, partial_quit[0]);

/* First part (unaligned start) */
/* Load the replicated character(s) into lane 0 of FR1 (and FR2). FR2 holds
the bit mask for match1i and the second character for match2. */
value = SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_32 | SLJIT_SIMD_LANE_ZERO;
sljit_emit_simd_lane_mov(compiler, value, SLJIT_FR1, 0, SLJIT_IMM, character_to_int32(char1 | bit));

if (char1 != char2)
  sljit_emit_simd_lane_mov(compiler, value, SLJIT_FR2, 0, SLJIT_IMM, character_to_int32(bit != 0 ? bit : char2));

/* TMP2 keeps the unaligned start position. */
OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);

/* Broadcast lane 0 to every 32 bit lane. */
sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_FR1, SLJIT_FR1, 0);

if (char1 != char2)
  sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_FR2, SLJIT_FR2, 0);

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
/* The UTF check at the end jumps back here with TMP2 == STR_PTR. */
restart = LABEL();
#endif

/* STR_PTR: start of the aligned block, TMP2: byte offset inside the block. */
value = (reg_type == SLJIT_SIMD_REG_256) ? 0x1f : 0xf;
OP2(SLJIT_AND, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, ~value);
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, value);

value = (reg_type == SLJIT_SIMD_REG_256) ? SLJIT_SIMD_MEM_ALIGNED_256 : SLJIT_SIMD_MEM_ALIGNED_128;
sljit_emit_simd_mov(compiler, reg_type | value, SLJIT_FR0, SLJIT_MEM1(STR_PTR), 0);

for (i = 0; i < 4; i++)
  fast_forward_char_pair_sse2_compare(compiler, compare_type, reg_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);

/* TMP1 receives one bit per byte of the comparison result. Shifting it right
by the in-block offset drops the bytes before the original STR_PTR, and
STR_PTR is set back to its original value. */
sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_FR0, TMP1, 0);
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);

/* A match was found in the first block. */
quit = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);

/* No match: return to the aligned address before entering the loop. */
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);

/* Second part (aligned) */
start = LABEL();

/* Advance to the next aligned block. */
value = (reg_type == SLJIT_SIMD_REG_256) ? 32 : 16;
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, value);

partial_quit[1] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
if (common->mode == PCRE2_JIT_COMPLETE)
  add_jump(compiler, &common->failed_match, partial_quit[1]);

value = (reg_type == SLJIT_SIMD_REG_256) ? SLJIT_SIMD_MEM_ALIGNED_256 : SLJIT_SIMD_MEM_ALIGNED_128;
sljit_emit_simd_mov(compiler, reg_type | value, SLJIT_FR0, SLJIT_MEM1(STR_PTR), 0);
for (i = 0; i < 4; i++)
  fast_forward_char_pair_sse2_compare(compiler, compare_type, reg_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);

/* Continue with the next block while no byte matched. */
sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_FR0, TMP1, 0);
CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);

JUMPHERE(quit);

/* The register index is stored in three bit ModRM fields without a REX
prefix. */
SLJIT_ASSERT(tmp1_reg_ind < 8);
/* BSF r32, r/m32 */
/* TMP1 = index of the lowest set bit = byte offset of the first match. */
instruction[0] = 0x0f;
instruction[1] = 0xbc;
instruction[2] = 0xc0 | (tmp1_reg_ind << 3) | tmp1_reg_ind;
sljit_emit_op_custom(compiler, instruction, 3);

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);

if (common->mode != PCRE2_JIT_COMPLETE)
  {
  /* Partial matching modes: both "end reached" jumps land here, and a
  STR_PTR beyond STR_END is limited to STR_END.
  NOTE(review): this relies on SELECT choosing STR_END when the GREATER flag
  is set - the macro is defined outside this chunk, confirm. */
  JUMPHERE(partial_quit[0]);
  JUMPHERE(partial_quit[1]);
  OP2U(SLJIT_SUB | SLJIT_SET_GREATER, STR_PTR, 0, STR_END, 0);
  SELECT(SLJIT_GREATER, STR_PTR, STR_END, 0, STR_PTR);
  }
else
  /* The match found in the last block may be at or after STR_END. */
  add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
if (common->utf && offset > 0)
  {
  SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE);

  /* The code unit "offset" units before the match must start a character. */
  OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offset));

  quit = jump_if_utf_char_start(compiler, TMP1);

  /* Otherwise skip one code unit and search again from the new position. */
  OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
  add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
  OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);
  JUMPTO(SLJIT_JUMP, restart);

  JUMPHERE(quit);
  }
#endif
}
367
368 #define JIT_HAS_FAST_REQUESTED_CHAR_SIMD (sljit_has_cpu_feature(SLJIT_HAS_SIMD))
369
fast_requested_char_simd(compiler_common * common,PCRE2_UCHAR char1,PCRE2_UCHAR char2)370 static jump_list *fast_requested_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2)
371 {
372 DEFINE_COMPILER;
373 sljit_u8 instruction[8];
374 /* The AVX2 code path is currently disabled. */
375 /* sljit_s32 reg_type = sljit_has_cpu_feature(SLJIT_HAS_AVX2) ? SLJIT_SIMD_REG_256 : SLJIT_SIMD_REG_128; */
376 sljit_s32 reg_type = SLJIT_SIMD_REG_128;
377 sljit_s32 value;
378 struct sljit_label *start;
379 struct sljit_jump *quit;
380 jump_list *not_found = NULL;
381 vector_compare_type compare_type = vector_compare_match1;
382 sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
383 sljit_s32 data_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR0);
384 sljit_s32 cmp1_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR1);
385 sljit_s32 cmp2_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR2);
386 sljit_s32 tmp_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR3);
387 sljit_u32 bit = 0;
388 int i;
389
390 if (char1 != char2)
391 {
392 bit = char1 ^ char2;
393 compare_type = vector_compare_match1i;
394
395 if (!is_powerof2(bit))
396 {
397 bit = 0;
398 compare_type = vector_compare_match2;
399 }
400 }
401
402 add_jump(compiler, ¬_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
403 OP1(SLJIT_MOV, TMP2, 0, TMP1, 0);
404 OP1(SLJIT_MOV, TMP3, 0, STR_PTR, 0);
405
406 /* First part (unaligned start) */
407
408 value = SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_32 | SLJIT_SIMD_LANE_ZERO;
409 sljit_emit_simd_lane_mov(compiler, value, SLJIT_FR1, 0, SLJIT_IMM, character_to_int32(char1 | bit));
410
411 if (char1 != char2)
412 sljit_emit_simd_lane_mov(compiler, value, SLJIT_FR2, 0, SLJIT_IMM, character_to_int32(bit != 0 ? bit : char2));
413
414 OP1(SLJIT_MOV, STR_PTR, 0, TMP2, 0);
415
416 sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_FR1, SLJIT_FR1, 0);
417
418 if (char1 != char2)
419 sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_FR2, SLJIT_FR2, 0);
420
421 value = (reg_type == SLJIT_SIMD_REG_256) ? 0x1f : 0xf;
422 OP2(SLJIT_AND, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, ~value);
423 OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, value);
424
425 value = (reg_type == SLJIT_SIMD_REG_256) ? SLJIT_SIMD_MEM_ALIGNED_256 : SLJIT_SIMD_MEM_ALIGNED_128;
426 sljit_emit_simd_mov(compiler, reg_type | value, SLJIT_FR0, SLJIT_MEM1(STR_PTR), 0);
427
428 for (i = 0; i < 4; i++)
429 fast_forward_char_pair_sse2_compare(compiler, compare_type, reg_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
430
431 sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_FR0, TMP1, 0);
432 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
433 OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);
434
435 quit = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);
436
437 OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
438
439 /* Second part (aligned) */
440 start = LABEL();
441
442 value = (reg_type == SLJIT_SIMD_REG_256) ? 32 : 16;
443 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, value);
444
445 add_jump(compiler, ¬_found, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
446
447 value = (reg_type == SLJIT_SIMD_REG_256) ? SLJIT_SIMD_MEM_ALIGNED_256 : SLJIT_SIMD_MEM_ALIGNED_128;
448 sljit_emit_simd_mov(compiler, reg_type | value, SLJIT_FR0, SLJIT_MEM1(STR_PTR), 0);
449
450 for (i = 0; i < 4; i++)
451 fast_forward_char_pair_sse2_compare(compiler, compare_type, reg_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
452
453 sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_FR0, TMP1, 0);
454 CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);
455
456 JUMPHERE(quit);
457
458 SLJIT_ASSERT(tmp1_reg_ind < 8);
459 /* BSF r32, r/m32 */
460 instruction[0] = 0x0f;
461 instruction[1] = 0xbc;
462 instruction[2] = 0xc0 | (tmp1_reg_ind << 3) | tmp1_reg_ind;
463 sljit_emit_op_custom(compiler, instruction, 3);
464
465 OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, STR_PTR, 0);
466 add_jump(compiler, ¬_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
467
468 OP1(SLJIT_MOV, STR_PTR, 0, TMP3, 0);
469 return not_found;
470 }
471
472 #ifndef _WIN64
473
/* Evaluates to the result of sljit_has_cpu_feature(SLJIT_HAS_SIMD). */
#define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD (sljit_has_cpu_feature(SLJIT_HAS_SIMD))

/* Emits code which advances STR_PTR to the next position where the code unit
at offset offs1 is char1a or char1b AND the code unit at offset offs2 is
char2a or char2b. Only complete matching mode is supported and offs1 must be
greater than offs2 (both are asserted).

Two data vectors are compared in parallel: the first is loaded from the
(aligned) position of the offs1 character, the second from "diff" bytes
before it, where diff is the byte distance of the two offsets. The two
comparison results are AND-ed, so a set bit marks a position where both
characters match.

Register usage of the generated code: TMP1 and TMP2 are overwritten, TMP3
preserves STR_END when common->match_end_ptr is set. Vector registers
SLJIT_FR0-SLJIT_FR6 and SLJIT_TMP_FR0 are used.

Arguments:
  common     the JIT compiler state
  offs1      offset of the first character in code units
  char1a     first accepted code unit at offs1
  char1b     second accepted code unit at offs1 (may equal char1a)
  offs2      offset of the second character in code units (0 <= offs2 < offs1)
  char2a     first accepted code unit at offs2
  char2b     second accepted code unit at offs2 (may equal char2a) */
static void fast_forward_char_pair_simd(compiler_common *common, sljit_s32 offs1,
  PCRE2_UCHAR char1a, PCRE2_UCHAR char1b, sljit_s32 offs2, PCRE2_UCHAR char2a, PCRE2_UCHAR char2b)
{
DEFINE_COMPILER;
sljit_u8 instruction[8];
/* The AVX2 code path is currently disabled. */
/* sljit_s32 reg_type = sljit_has_cpu_feature(SLJIT_HAS_AVX2) ? SLJIT_SIMD_REG_256 : SLJIT_SIMD_REG_128; */
sljit_s32 reg_type = SLJIT_SIMD_REG_128;
sljit_s32 value;
vector_compare_type compare1_type = vector_compare_match1;
vector_compare_type compare2_type = vector_compare_match1;
sljit_u32 bit1 = 0;
sljit_u32 bit2 = 0;
/* Distance of the two characters in bytes. */
sljit_u32 diff = IN_UCHARS(offs1 - offs2);
sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
sljit_s32 data1_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR0);
sljit_s32 data2_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR1);
sljit_s32 cmp1a_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR2);
sljit_s32 cmp2a_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR3);
sljit_s32 cmp1b_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR4);
sljit_s32 cmp2b_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR5);
sljit_s32 tmp1_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR6);
sljit_s32 tmp2_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_TMP_FR0);
struct sljit_label *start;
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
struct sljit_label *restart;
#endif
struct sljit_jump *jump[2];
int i;

SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE && offs1 > offs2 && offs2 >= 0);
SLJIT_ASSERT(diff <= (unsigned)IN_UCHARS(max_fast_forward_char_pair_offset()));

/* Initialize. */
if (common->match_end_ptr != 0)
  {
  /* STR_END is saved in TMP3 (restored at the end of this function) and is
  temporarily lowered to the value stored at match_end_ptr plus offs1 + 1 code
  units when that is smaller.
  NOTE(review): this relies on SELECT choosing TMP1 when the LESS flag is
  set - the macro is defined outside this chunk, confirm. */
  OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr);
  OP1(SLJIT_MOV, TMP3, 0, STR_END, 0);
  OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1));

  OP2U(SLJIT_SUB | SLJIT_SET_LESS, TMP1, 0, STR_END, 0);
  SELECT(SLJIT_LESS, STR_END, TMP1, 0, STR_END);
  }

/* From here STR_PTR points to the position of the offs1 character. */
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));
add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

/* Prepare the first character: TMP1 gets the replicated character, TMP2 the
replicated bit mask (match1i) or second character (match2). */
if (char1a == char1b)
  OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1a));
else
  {
  bit1 = char1a ^ char1b;
  if (is_powerof2(bit1))
    {
    compare1_type = vector_compare_match1i;
    OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1a | bit1));
    OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(bit1));
    }
  else
    {
    compare1_type = vector_compare_match2;
    bit1 = 0;
    OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1a));
    OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(char1b));
    }
  }

/* FR2 (cmp1a) and FR4 (cmp1b) belong to the first character. */
value = SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_32 | SLJIT_SIMD_LANE_ZERO;
sljit_emit_simd_lane_mov(compiler, value, SLJIT_FR2, 0, TMP1, 0);

if (char1a != char1b)
  sljit_emit_simd_lane_mov(compiler, value, SLJIT_FR4, 0, TMP2, 0);

/* Prepare the second character the same way. */
if (char2a == char2b)
  OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char2a));
else
  {
  bit2 = char2a ^ char2b;
  if (is_powerof2(bit2))
    {
    compare2_type = vector_compare_match1i;
    OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char2a | bit2));
    OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(bit2));
    }
  else
    {
    compare2_type = vector_compare_match2;
    bit2 = 0;
    OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char2a));
    OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(char2b));
    }
  }

/* FR3 (cmp2a) and FR5 (cmp2b) belong to the second character. */
sljit_emit_simd_lane_mov(compiler, value, SLJIT_FR3, 0, TMP1, 0);

if (char2a != char2b)
  sljit_emit_simd_lane_mov(compiler, value, SLJIT_FR5, 0, TMP2, 0);

/* Broadcast lane 0 of each character vector to every 32 bit lane. */
sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_FR2, SLJIT_FR2, 0);
if (char1a != char1b)
  sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_FR4, SLJIT_FR4, 0);

sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_FR3, SLJIT_FR3, 0);
if (char2a != char2b)
  sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_FR5, SLJIT_FR5, 0);

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
/* The UTF check at the end jumps back here. */
restart = LABEL();
#endif

/* TMP1: address of the second character for the current STR_PTR,
TMP2: the unaligned STR_PTR, STR_PTR: start of the aligned block. */
OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, diff);
OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);
value = (reg_type == SLJIT_SIMD_REG_256) ? ~0x1f : ~0xf;
OP2(SLJIT_AND, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, value);

value = (reg_type == SLJIT_SIMD_REG_256) ? SLJIT_SIMD_MEM_ALIGNED_256 : SLJIT_SIMD_MEM_ALIGNED_128;
sljit_emit_simd_mov(compiler, reg_type | value, SLJIT_FR0, SLJIT_MEM1(STR_PTR), 0);

/* When TMP1 is not below the aligned address, the second data vector is
derived from the first one by a byte shift (code after the jump target)
instead of being loaded from memory. */
jump[0] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_PTR, 0);

/* Otherwise load the second data vector from diff bytes before the aligned
address. No alignment flag is passed for this load. */
sljit_emit_simd_mov(compiler, reg_type, SLJIT_FR1, SLJIT_MEM1(STR_PTR), -(sljit_sw)diff);
jump[1] = JUMP(SLJIT_JUMP);

JUMPHERE(jump[0]);

if (reg_type == SLJIT_SIMD_REG_256)
  {
  /* VEX encoded variant. Not reached at present, because reg_type is fixed
  to SLJIT_SIMD_REG_128 above. */
  if (diff != 16)
    {
    /* PSLLDQ ymm1, ymm2, imm8 */
    instruction[0] = 0xc5;
    instruction[1] = (sljit_u8)(0xf9 ^ (data2_ind << 3));
    instruction[2] = 0x73;
    instruction[3] = 0xc0 | (7 << 3) | data1_ind;
    instruction[4] = diff & 0xf;
    sljit_emit_op_custom(compiler, instruction, 5);
    }

  instruction[0] = 0xc4;
  instruction[1] = 0xe3;
  if (diff < 16)
    {
    /* VINSERTI128 xmm1, xmm2, xmm3/m128 */
    /* instruction[0] = 0xc4; */
    /* instruction[1] = 0xe3; */
    instruction[2] = (sljit_u8)(0x7d ^ (data2_ind << 3));
    instruction[3] = 0x38;
    SLJIT_ASSERT(sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR) <= 7);
    instruction[4] = 0x40 | (data2_ind << 3) | sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);
    instruction[5] = (sljit_u8)(16 - diff);
    instruction[6] = 1;
    sljit_emit_op_custom(compiler, instruction, 7);
    }
  else
    {
    /* VPERM2I128 xmm1, xmm2, xmm3/m128 */
    /* instruction[0] = 0xc4; */
    /* instruction[1] = 0xe3; */
    value = (diff == 16) ? data1_ind : data2_ind;
    instruction[2] = (sljit_u8)(0x7d ^ (value << 3));
    instruction[3] = 0x46;
    instruction[4] = 0xc0 | (data2_ind << 3) | value;
    instruction[5] = 0x08;
    sljit_emit_op_custom(compiler, instruction, 6);
    }
  }
else
  {
  /* data2 = data1 shifted towards the higher bytes by diff bytes, so byte
  i of data2 is byte i - diff of the aligned block (zero for i < diff). */

  /* MOVDQA xmm1, xmm2/m128 */
  instruction[0] = 0x66;
  instruction[1] = 0x0f;
  instruction[2] = 0x6f;
  instruction[3] = 0xc0 | (data2_ind << 3) | data1_ind;
  sljit_emit_op_custom(compiler, instruction, 4);

  /* PSLLDQ xmm1, imm8 */
  /* instruction[0] = 0x66; */
  /* instruction[1] = 0x0f; */
  instruction[2] = 0x73;
  instruction[3] = 0xc0 | (7 << 3) | data2_ind;
  instruction[4] = diff;
  sljit_emit_op_custom(compiler, instruction, 5);
  }

JUMPHERE(jump[1]);

/* TMP2 = byte offset of the original STR_PTR inside the aligned block. */
value = (reg_type == SLJIT_SIMD_REG_256) ? 0x1f : 0xf;
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, value);

/* The steps of the two comparisons are interleaved. */
for (i = 0; i < 4; i++)
  {
  fast_forward_char_pair_sse2_compare(compiler, compare2_type, reg_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp2_ind);
  fast_forward_char_pair_sse2_compare(compiler, compare1_type, reg_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind);
  }

/* Both characters must match: FR0 &= FR1, then TMP1 receives one bit per
byte of the combined result. */
sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_AND | reg_type, SLJIT_FR0, SLJIT_FR0, SLJIT_FR1);
sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_FR0, TMP1, 0);

/* Ignore matches before the first STR_PTR. */
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);

/* A match was found in the first block. */
jump[0] = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);

/* No match: return to the aligned address before entering the loop. */
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);

/* Main loop. */
start = LABEL();

/* Advance to the next aligned block. */
value = (reg_type == SLJIT_SIMD_REG_256) ? 32 : 16;
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, value);
add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

/* Aligned load for the first character, a load from diff bytes earlier
(no alignment flag) for the second one. */
value = (reg_type == SLJIT_SIMD_REG_256) ? SLJIT_SIMD_MEM_ALIGNED_256 : SLJIT_SIMD_MEM_ALIGNED_128;
sljit_emit_simd_mov(compiler, reg_type | value, SLJIT_FR0, SLJIT_MEM1(STR_PTR), 0);
sljit_emit_simd_mov(compiler, reg_type, SLJIT_FR1, SLJIT_MEM1(STR_PTR), -(sljit_sw)diff);

/* Same comparisons as above. The scratch registers are passed in the
opposite order here, each comparison still gets its own scratch register. */
for (i = 0; i < 4; i++)
  {
  fast_forward_char_pair_sse2_compare(compiler, compare1_type, reg_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp2_ind);
  fast_forward_char_pair_sse2_compare(compiler, compare2_type, reg_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp1_ind);
  }

sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_AND | reg_type, SLJIT_FR0, SLJIT_FR0, SLJIT_FR1);
sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_FR0, TMP1, 0);

/* Continue with the next block while no position matched. */
CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);

JUMPHERE(jump[0]);

/* The register index is stored in three bit ModRM fields without a REX
prefix. */
SLJIT_ASSERT(tmp1_reg_ind < 8);
/* BSF r32, r/m32 */
/* TMP1 = index of the lowest set bit = byte offset of the first match. */
instruction[0] = 0x0f;
instruction[1] = 0xbc;
instruction[2] = 0xc0 | (tmp1_reg_ind << 3) | tmp1_reg_ind;
sljit_emit_op_custom(compiler, instruction, 3);

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);

/* The match may be at or after STR_END because whole blocks are compared. */
add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
if (common->utf)
  {
  /* STR_PTR - offs1 is the candidate match start: it must be the first
  code unit of a character. */
  OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offs1));

  jump[0] = jump_if_utf_char_start(compiler, TMP1);

  /* Otherwise skip one code unit and search again if input remains. */
  OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
  CMPTO(SLJIT_LESS, STR_PTR, 0, STR_END, 0, restart);

  add_jump(compiler, &common->failed_match, JUMP(SLJIT_JUMP));

  JUMPHERE(jump[0]);
  }
#endif

/* Undo the initial adjustment: STR_PTR points to the match start again. */
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));

/* Restore the STR_END saved during initialization. */
if (common->match_end_ptr != 0)
  OP1(SLJIT_MOV, STR_END, 0, TMP3, 0);
}
738
739 #endif /* !_WIN64 */
740
741 #undef SIMD_COMPARE_TYPE_INDEX
742
743 #endif /* SLJIT_CONFIG_X86 */
744
745 #if (defined SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64 && (defined __ARM_NEON || defined __ARM_NEON__))
746
747 #include <arm_neon.h>
748
/* Packs up to four single byte values into one unsigned int, so they can be
passed to a helper function as a single integer argument (x) and accessed
individually (c.c1 ... c.c4). */
typedef union {
  unsigned int x;
  struct {
    unsigned char c1;
    unsigned char c2;
    unsigned char c3;
    unsigned char c4;
  } c;
} int_char;
753
754 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
utf_continue(PCRE2_SPTR s)755 static SLJIT_INLINE int utf_continue(PCRE2_SPTR s)
756 {
757 #if PCRE2_CODE_UNIT_WIDTH == 8
758 return (*s & 0xc0) == 0x80;
759 #elif PCRE2_CODE_UNIT_WIDTH == 16
760 return (*s & 0xfc00) == 0xdc00;
761 #else
762 #error "Unknown code width"
763 #endif
764 }
765 #endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 */
766
/* Code unit width specific names for the NEON helpers:

  VECTOR_FACTOR   number of code units in one 128 bit vector
  vect_t          the NEON vector type with one lane per code unit
  VLD1Q ... VANDQ the load / compare-equal / or / store / duplicate /
                  extract / and intrinsics for that lane width
  quad_word       a 16 byte buffer viewed as code units (mem) or as two
                  64 bit integers (dw)

These names are not used in this chunk directly; presumably they are
consumed by pcre2_jit_neon_inc.h, which is included after these
definitions. Any width other than 8 or 16 selects the 32 bit variant. */
#if PCRE2_CODE_UNIT_WIDTH == 8
# define VECTOR_FACTOR 16
# define vect_t uint8x16_t
# define VLD1Q(X) vld1q_u8((sljit_u8 *)(X))
# define VCEQQ vceqq_u8
# define VORRQ vorrq_u8
# define VST1Q vst1q_u8
# define VDUPQ vdupq_n_u8
# define VEXTQ vextq_u8
# define VANDQ vandq_u8
typedef union {
  uint8_t mem[16];
  uint64_t dw[2];
} quad_word;
#elif PCRE2_CODE_UNIT_WIDTH == 16
# define VECTOR_FACTOR 8
# define vect_t uint16x8_t
# define VLD1Q(X) vld1q_u16((sljit_u16 *)(X))
# define VCEQQ vceqq_u16
# define VORRQ vorrq_u16
# define VST1Q vst1q_u16
# define VDUPQ vdupq_n_u16
# define VEXTQ vextq_u16
# define VANDQ vandq_u16
typedef union {
  uint16_t mem[8];
  uint64_t dw[2];
} quad_word;
#else
# define VECTOR_FACTOR 4
# define vect_t uint32x4_t
# define VLD1Q(X) vld1q_u32((sljit_u32 *)(X))
# define VCEQQ vceqq_u32
# define VORRQ vorrq_u32
# define VST1Q vst1q_u32
# define VDUPQ vdupq_n_u32
# define VEXTQ vextq_u32
# define VANDQ vandq_u32
typedef union {
  uint32_t mem[4];
  uint64_t dw[2];
} quad_word;
#endif
810
/* pcre2_jit_neon_inc.h is included once for every combination of a search
kind macro (FFCS, FFCS_2, FFCS_MASK) and, when UTF is supported for 8 / 16
bit code units, the FF_UTF macro. Each macro is undefined again after its
inclusions. The helper functions referenced by the NEON version of
fast_forward_char_simd() in this file (ffcs, ffcs_utf, ffcs_2, ffcs_2_utf,
ffcs_mask, ffcs_mask_utf) are presumably produced by these inclusions; the
header itself is not part of this chunk. */

/* Search kind FFCS: used when the two characters are the same. */
#define FFCS
#include "pcre2_jit_neon_inc.h"
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
# define FF_UTF
# include "pcre2_jit_neon_inc.h"
# undef FF_UTF
#endif
#undef FFCS

/* Search kind FFCS_2: used for two unrelated characters. */
#define FFCS_2
#include "pcre2_jit_neon_inc.h"
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
# define FF_UTF
# include "pcre2_jit_neon_inc.h"
# undef FF_UTF
#endif
#undef FFCS_2

/* Search kind FFCS_MASK: used for two characters differing in one bit. */
#define FFCS_MASK
#include "pcre2_jit_neon_inc.h"
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
# define FF_UTF
# include "pcre2_jit_neon_inc.h"
# undef FF_UTF
#endif
#undef FFCS_MASK
837
838 #define JIT_HAS_FAST_FORWARD_CHAR_SIMD 1
839
/* Emits JIT code that calls one of the NEON search helpers (ffcs, ffcs_mask or
   ffcs_2, or their _utf variants) and moves STR_PTR to the returned address.

   common   compiler state; supplies the sljit compiler, the JIT mode and the
            utf flag
   char1    first accepted code unit
   char2    second accepted code unit (equal to char1 for a single-char search)
   offset   passed to the helper in SLJIT_R2; the _utf helper is selected only
            when common->utf is set and offset > 0

   Helper selection:
     char1 == char2                   -> ffcs
     char1 ^ char2 is a power of two  -> ffcs_mask, called with (char1 | mask, mask)
     otherwise                        -> ffcs_2

   NOTE(review): the packed characters are placed in SLJIT_R4 while the call
   is declared with SLJIT_ARGS4, and SLJIT_R3 is not written here - confirm
   against the helper signatures in pcre2_jit_neon_inc.h. */
static void fast_forward_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_s32 offset)
{
DEFINE_COMPILER;
int_char ic;
struct sljit_jump *partial_quit, *quit;
PCRE2_UCHAR mask = char1 ^ char2;

/* Spill STR_PTR and TMP3 to the local slots; both are reloaded after the call. */
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS0, STR_PTR, 0);
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS1, TMP3, 0);

/* Helper arguments: R0 = STR_END, R1 = address of the LOCALS0 slot (the
   spilled STR_PTR), R2 = offset. */
OP1(SLJIT_MOV, SLJIT_R0, 0, STR_END, 0);
GET_LOCAL_BASE(SLJIT_R1, 0, LOCALS0);
OP1(SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, offset);

if (char1 == char2)
  {
  /* Single character search. */
  ic.c.c1 = char1;
  ic.c.c2 = char2;
  OP1(SLJIT_MOV, SLJIT_R4, 0, SLJIT_IMM, ic.x);

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
  if (common->utf && offset > 0)
    sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
      SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_utf));
  else
#endif
    sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
      SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs));
  }
else if (is_powerof2(mask))
  {
  /* The two characters differ in exactly one bit: pass the character with
     that bit set, plus the bit itself. */
  ic.c.c1 = char1 | mask;
  ic.c.c2 = mask;
  OP1(SLJIT_MOV, SLJIT_R4, 0, SLJIT_IMM, ic.x);

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
  if (common->utf && offset > 0)
    sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
      SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_mask_utf));
  else
#endif
    sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
      SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_mask));
  }
else
  {
  /* Two unrelated characters. */
  ic.c.c1 = char1;
  ic.c.c2 = char2;
  OP1(SLJIT_MOV, SLJIT_R4, 0, SLJIT_IMM, ic.x);

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
  if (common->utf && offset > 0)
    sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
      SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_2_utf));
  else
#endif
    sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
      SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_2));
  }

/* Reload the spilled registers. */
OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), LOCALS0);
OP1(SLJIT_MOV, TMP3, 0, SLJIT_MEM1(SLJIT_SP), LOCALS1);

/* A zero return value takes the partial_quit jump. */
partial_quit = CMP(SLJIT_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0);

if (common->mode == PCRE2_JIT_COMPLETE)
  {
  /* Complete mode: a zero return goes to failed_match, otherwise STR_PTR
     takes the returned address. */
  add_jump(compiler, &common->failed_match, partial_quit);
  OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_RETURN_REG, 0);
  return;
  }

/* Partial modes: a non-zero return becomes STR_PTR and skips the fix-up.
   A zero return lands on the fix-up with the reloaded STR_PTR, which is
   compared with STR_END and replaced by it when greater (per the SELECT
   argument order). */
OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_RETURN_REG, 0);
quit = CMP(SLJIT_NOT_ZERO, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0);
JUMPHERE(partial_quit);
OP2U(SLJIT_SUB | SLJIT_SET_GREATER, STR_PTR, 0, STR_END, 0);
SELECT(SLJIT_GREATER, STR_PTR, STR_END, 0, STR_PTR);
JUMPHERE(quit);
}
932
/* Comparison strategies understood by fast_forward_char_pair_compare(). */
typedef enum {
  compare_match1 = 0,   /* data compared for equality with cmp1 */
  compare_match1i = 1,  /* data OR-ed with cmp2, then compared with cmp1 */
  compare_match2 = 2    /* data compared with cmp1 and with cmp2, results OR-ed */
} compare_type;
938
/* Produces the match vector for one character position.

   ctype  comparison strategy
   dst    vector of input data
   cmp1   first comparison vector
   cmp2   second comparison vector (unused for compare_match1)

   Returns:
     compare_match2   VCEQQ(dst, cmp1) OR-ed with VCEQQ(dst, cmp2)
     compare_match1i  VCEQQ(dst | cmp2, cmp1)
     anything else    VCEQQ(dst, cmp1)
   VCEQQ / VORRQ are project macros, presumably lane-wise equality and
   bitwise OR - confirm where they are defined. */
static inline vect_t fast_forward_char_pair_compare(compare_type ctype, vect_t dst, vect_t cmp1, vect_t cmp2)
{
vect_t first;
vect_t second;

switch (ctype)
  {
  case compare_match2:
    first = VCEQQ(dst, cmp1);
    second = VCEQQ(dst, cmp2);
    return VORRQ(first, second);

  case compare_match1i:
    first = VORRQ(dst, cmp2);
    return VCEQQ(first, cmp1);

  default:
    return VCEQQ(dst, cmp1);
  }
}
955
max_fast_forward_char_pair_offset(void)956 static SLJIT_INLINE sljit_u32 max_fast_forward_char_pair_offset(void)
957 {
958 #if PCRE2_CODE_UNIT_WIDTH == 8
959 return 15;
960 #elif PCRE2_CODE_UNIT_WIDTH == 16
961 return 7;
962 #elif PCRE2_CODE_UNIT_WIDTH == 32
963 return 3;
964 #else
965 #error "Unsupported unit width"
966 #endif
967 }
968
969 /* ARM doesn't have a shift left across lanes. */
shift_left_n_lanes(vect_t a,sljit_u8 n)970 static SLJIT_INLINE vect_t shift_left_n_lanes(vect_t a, sljit_u8 n)
971 {
972 vect_t zero = VDUPQ(0);
973 SLJIT_ASSERT(0 < n && n < VECTOR_FACTOR);
974 /* VEXTQ takes an immediate as last argument. */
975 #define C(X) case X: return VEXTQ(zero, a, VECTOR_FACTOR - X);
976 switch (n)
977 {
978 C(1); C(2); C(3);
979 #if PCRE2_CODE_UNIT_WIDTH != 32
980 C(4); C(5); C(6); C(7);
981 # if PCRE2_CODE_UNIT_WIDTH != 16
982 C(8); C(9); C(10); C(11); C(12); C(13); C(14); C(15);
983 # endif
984 #endif
985 default:
986 /* Based on the ASSERT(0 < n && n < VECTOR_FACTOR) above, this won't
987 happen. The return is still here for compilers to not warn. */
988 return a;
989 }
990 }
991
992 #define FFCPS
993 #define FFCPS_DIFF1
994 #define FFCPS_CHAR1A2A
995
996 #define FFCPS_0
997 #include "pcre2_jit_neon_inc.h"
998 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
999 # define FF_UTF
1000 # include "pcre2_jit_neon_inc.h"
1001 # undef FF_UTF
1002 #endif
1003 #undef FFCPS_0
1004
1005 #undef FFCPS_CHAR1A2A
1006
1007 #define FFCPS_1
1008 #include "pcre2_jit_neon_inc.h"
1009 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1010 # define FF_UTF
1011 # include "pcre2_jit_neon_inc.h"
1012 # undef FF_UTF
1013 #endif
1014 #undef FFCPS_1
1015
1016 #undef FFCPS_DIFF1
1017
1018 #define FFCPS_DEFAULT
1019 #include "pcre2_jit_neon_inc.h"
1020 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1021 # define FF_UTF
1022 # include "pcre2_jit_neon_inc.h"
1023 # undef FF_UTF
1024 #endif
1025 #undef FFCPS
1026
1027 #define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD 1
1028
/* Emits JIT code that calls one of the NEON character-pair search helpers
   and moves STR_PTR to the returned address.

   common          compiler state; the mode must be PCRE2_JIT_COMPLETE
   offs1           offset of the first character; must be greater than offs2
   char1a, char1b  the two accepted values for the first character
   offs2           offset of the second character
   char2a, char2b  the two accepted values for the second character

   Helper selection, with diff = IN_UCHARS(offs1 - offs2):
     diff == 1, char1a == char1b and char2a == char2b  -> ffcps_0
     diff == 1 otherwise                               -> ffcps_1
     any other diff                                    -> ffcps_default
   The _utf variant is used whenever common->utf is set.

   NOTE(review): the packed characters are placed in SLJIT_R4 while the call
   is declared with SLJIT_ARGS4; the assertion on compiler->scratches == 5
   suggests this is deliberate - confirm against the helper signatures. */
static void fast_forward_char_pair_simd(compiler_common *common, sljit_s32 offs1,
  PCRE2_UCHAR char1a, PCRE2_UCHAR char1b, sljit_s32 offs2, PCRE2_UCHAR char2a, PCRE2_UCHAR char2b)
{
DEFINE_COMPILER;
sljit_u32 diff = IN_UCHARS(offs1 - offs2);
struct sljit_jump *partial_quit;
int_char ic;

SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE);
SLJIT_ASSERT(offs1 > offs2);
SLJIT_ASSERT(diff <= IN_UCHARS(max_fast_forward_char_pair_offset()));
SLJIT_ASSERT(compiler->scratches == 5);

/* Pack the four characters into one immediate. */
ic.c.c1 = char1a;
ic.c.c2 = char1b;
ic.c.c3 = char2a;
ic.c.c4 = char2b;

/* Spill STR_PTR; its slot address is handed to the helper in R1. */
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS0, STR_PTR, 0);

/* R0 = end of the search range. */
if (common->match_end_ptr != 0)
  {
  /* Stored match end plus IN_UCHARS(offs1 + 1), replaced by STR_END when
     STR_END is the smaller of the two. */
  OP1(SLJIT_MOV, SLJIT_R0, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr);
  OP2(SLJIT_ADD, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1));

  OP2U(SLJIT_SUB | SLJIT_SET_LESS, STR_END, 0, SLJIT_R0, 0);
  SELECT(SLJIT_LESS, SLJIT_R0, STR_END, 0, SLJIT_R0);
  }
else
  OP1(SLJIT_MOV, SLJIT_R0, 0, STR_END, 0);

/* R1 = address of the LOCALS0 slot, R2 = offs1, R3 = offs2, R4 = characters. */
GET_LOCAL_BASE(SLJIT_R1, 0, LOCALS0);
OP1(SLJIT_MOV_S32, SLJIT_R2, 0, SLJIT_IMM, offs1);
OP1(SLJIT_MOV_S32, SLJIT_R3, 0, SLJIT_IMM, offs2);
OP1(SLJIT_MOV_U32, SLJIT_R4, 0, SLJIT_IMM, ic.x);

if (diff != 1)
  {
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
  if (common->utf)
    sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
      SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_default_utf));
  else
#endif
    sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
      SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_default));
  }
else if (char1a == char1b && char2a == char2b)
  {
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
  if (common->utf)
    sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
      SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_0_utf));
  else
#endif
    sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
      SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_0));
  }
else
  {
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
  if (common->utf)
    sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
      SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_1_utf));
  else
#endif
    sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
      SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_1));
  }

/* Reload STR_PTR from its slot. */
OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), LOCALS0);

/* A zero return value is routed to failed_match. */
partial_quit = CMP(SLJIT_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0);
add_jump(compiler, &common->failed_match, partial_quit);

/* Otherwise STR_PTR takes the returned address. */
OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_RETURN_REG, 0);

/* NOTE(review): partial_quit is already on the failed_match list when its
   target is also set here - confirm which target is meant to win. */
JUMPHERE(partial_quit);
}
1107
1108 #endif /* SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64 */
1109
1110 #if (defined SLJIT_CONFIG_S390X && SLJIT_CONFIG_S390X)
1111
/* VECTOR_ELEMENT_SIZE is the element-size code placed in bits 12-15 of the
   last halfword of the hand-encoded vector instructions below: 0 for 8-bit,
   1 for 16-bit and 2 for 32-bit code units (presumably byte / halfword / word
   elements - confirm against the z/Architecture instruction formats). */
#if PCRE2_CODE_UNIT_WIDTH == 8
#define VECTOR_ELEMENT_SIZE 0
#elif PCRE2_CODE_UNIT_WIDTH == 16
#define VECTOR_ELEMENT_SIZE 1
#elif PCRE2_CODE_UNIT_WIDTH == 32
#define VECTOR_ELEMENT_SIZE 2
#else
#error "Unsupported unit width"
#endif
1121
/* Emits one hand-encoded 6-byte vector load.

   compiler   sljit compiler receiving the instruction
   vlbb       TRUE selects the opcode ending in 0x07, FALSE the one ending in
              0x06 (going by the parameter name these are presumably VLBB and
              VL - confirm against the z/Architecture reference)
   dst_vreg   destination vector register number
   base_reg   machine index of the base general register
   index_reg  machine index of the index general register (callers pass 0
              when no index is wanted) */
static void load_from_mem_vector(struct sljit_compiler *compiler, BOOL vlbb, sljit_s32 dst_vreg,
  sljit_s32 base_reg, sljit_s32 index_reg)
{
sljit_u16 instruction[3];
sljit_s32 opcode_low = 0x06;

if (vlbb)
  opcode_low = 0x07;

instruction[0] = (sljit_u16)(0xe700 | (dst_vreg << 4) | index_reg);
instruction[1] = (sljit_u16)(base_reg << 12);
instruction[2] = (sljit_u16)((0x8 << 8) | opcode_low);

sljit_emit_op_custom(compiler, instruction, 6);
}
1133
1134 #if PCRE2_CODE_UNIT_WIDTH == 32
1135
/* Emits one step of loading chr into vector register dst_vreg. Callers run
   step 0 and then step 1 for each constant.

   compiler         sljit compiler receiving the instructions
   step             0 or 1
   dst_vreg         destination vector register number
   chr              character to load
   tmp_general_reg  sljit general register used as a temporary

   chr < 0x7fff: step 0 emits a single VREPI with chr as its immediate and
                 step 1 emits nothing.
   otherwise:    step 0 moves chr into tmp_general_reg and emits VLVG;
                 step 1 emits VREP on dst_vreg. */
static void replicate_imm_vector(struct sljit_compiler *compiler, int step, sljit_s32 dst_vreg,
  PCRE2_UCHAR chr, sljit_s32 tmp_general_reg)
{
sljit_u16 instruction[3];

SLJIT_ASSERT(step >= 0 && step <= 1);

if (chr < 0x7fff)
  {
  if (step != 1)
    {
    /* VREPI */
    instruction[0] = (sljit_u16)(0xe700 | (dst_vreg << 4));
    instruction[1] = (sljit_u16)chr;
    instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45);
    sljit_emit_op_custom(compiler, instruction, 6);
    }
  return;
  }

if (step != 0)
  {
  /* VREP */
  instruction[0] = (sljit_u16)(0xe700 | (dst_vreg << 4) | dst_vreg);
  instruction[1] = 0;
  instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xc << 8) | 0x4d);
  }
else
  {
  OP1(SLJIT_MOV, tmp_general_reg, 0, SLJIT_IMM, chr);

  /* VLVG */
  instruction[0] = (sljit_u16)(0xe700 | (dst_vreg << 4) | sljit_get_register_index(SLJIT_GP_REGISTER, tmp_general_reg));
  instruction[1] = 0;
  instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x22);
  }

sljit_emit_op_custom(compiler, instruction, 6);
}
1174
1175 #endif
1176
/* Emits at most one vector instruction of a three-step compare sequence.
   Callers invoke it with step 0, 1 and 2 in turn.

   compiler      sljit compiler receiving the instruction
   compare_type  comparison strategy
   step          0, 1 or 2
   dst_ind       vector register holding the data; receives the result
   cmp1_ind      vector register with the first comparison value
   cmp2_ind      vector register with the second comparison value or bit
   tmp_ind       scratch vector register (used for vector_compare_match2)

   Instructions emitted:
     step 1, any type                    VCEQ dst, dst, cmp1
     step 0, vector_compare_match1i      VO   dst, dst, cmp2
     step 0, vector_compare_match2       VCEQ tmp, dst, cmp2
     step 2, vector_compare_match2       VO   dst, dst, tmp
     every other combination             nothing */
static void fast_forward_char_pair_sse2_compare(struct sljit_compiler *compiler, vector_compare_type compare_type,
  int step, sljit_s32 dst_ind, sljit_s32 cmp1_ind, sljit_s32 cmp2_ind, sljit_s32 tmp_ind)
{
sljit_u16 instruction[3];

SLJIT_ASSERT(step >= 0 && step <= 2);

if (step == 1)
  {
  /* VCEQ */
  instruction[0] = (sljit_u16)(0xe700 | (dst_ind << 4) | dst_ind);
  instruction[1] = (sljit_u16)(cmp1_ind << 12);
  instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0xf8);
  }
else if (compare_type != vector_compare_match2)
  {
  if (step != 0 || compare_type != vector_compare_match1i)
    return;

  /* VO */
  instruction[0] = (sljit_u16)(0xe700 | (dst_ind << 4) | dst_ind);
  instruction[1] = (sljit_u16)(cmp2_ind << 12);
  instruction[2] = (sljit_u16)((0xe << 8) | 0x6a);
  }
else if (step == 0)
  {
  /* VCEQ */
  instruction[0] = (sljit_u16)(0xe700 | (tmp_ind << 4) | dst_ind);
  instruction[1] = (sljit_u16)(cmp2_ind << 12);
  instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0xf8);
  }
else if (step == 2)
  {
  /* VO */
  instruction[0] = (sljit_u16)(0xe700 | (dst_ind << 4) | dst_ind);
  instruction[1] = (sljit_u16)(tmp_ind << 12);
  instruction[2] = (sljit_u16)((0xe << 8) | 0x6a);
  }
else
  return;

sljit_emit_op_custom(compiler, instruction, 6);
}
1226
/* Feature flag: a SIMD fast_forward_char_simd() is defined for this target. */
#define JIT_HAS_FAST_FORWARD_CHAR_SIMD 1

/* Emits s390x vector code that advances STR_PTR to the first code unit that
   equals char1 or char2.

   common  compiler state (sljit compiler, JIT mode, utf flag)
   char1   first accepted code unit
   char2   second accepted code unit (equal to char1 for a single-char search)
   offset  only read in the UTF post-check at the end of the function

   Comparison strategy:
     char1 == char2                   vector_compare_match1
     char1 ^ char2 is a power of two  vector_compare_match1i (data is OR-ed
                                      with that bit, then compared with
                                      char1 | bit)
     otherwise                        vector_compare_match2

   The emitted code handles the first, possibly unaligned, 16 bytes
   separately and then loops over 16-byte aligned blocks. In
   PCRE2_JIT_COMPLETE mode, reaching STR_END jumps to common->failed_match;
   in the other modes STR_PTR is limited to STR_END instead. TMP1 and TMP2
   are used as scratch registers. */
static void fast_forward_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_s32 offset)
{
DEFINE_COMPILER;
sljit_u16 instruction[3];
struct sljit_label *start;
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
struct sljit_label *restart;
#endif
struct sljit_jump *quit;
struct sljit_jump *partial_quit[2];
vector_compare_type compare_type = vector_compare_match1;
sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);
/* Vector register numbers used by the encodings below. */
sljit_s32 data_ind = 0;
sljit_s32 tmp_ind = 1;
sljit_s32 cmp1_ind = 2;
sljit_s32 cmp2_ind = 3;
sljit_s32 zero_ind = 4;
sljit_u32 bit = 0;
int i;

/* NOTE(review): offset is read in the UTF block at the end of the function
   when that block is compiled in; this marker only matters for other builds. */
SLJIT_UNUSED_ARG(offset);

if (char1 != char2)
  {
  bit = char1 ^ char2;
  compare_type = vector_compare_match1i;

  if (!is_powerof2(bit))
    {
    bit = 0;
    compare_type = vector_compare_match2;
    }
  }

/* Nothing to search when STR_PTR is already at or past STR_END. */
partial_quit[0] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
if (common->mode == PCRE2_JIT_COMPLETE)
  add_jump(compiler, &common->failed_match, partial_quit[0]);

/* First part (unaligned start) */

/* TMP2 = STR_PTR + 16; it is rounded down to a multiple of 16 after the
   first load, giving the start of the next aligned block. */
OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 16);

#if PCRE2_CODE_UNIT_WIDTH != 32

/* VREPI */
instruction[0] = (sljit_u16)(0xe700 | (cmp1_ind << 4));
instruction[1] = (sljit_u16)(char1 | bit);
instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45);
sljit_emit_op_custom(compiler, instruction, 6);

if (char1 != char2)
  {
  /* VREPI */
  instruction[0] = (sljit_u16)(0xe700 | (cmp2_ind << 4));
  instruction[1] = (sljit_u16)(bit != 0 ? bit : char2);
  /* instruction[2] keeps the value set for the previous VREPI. */
  /* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */
  sljit_emit_op_custom(compiler, instruction, 6);
  }

#else /* PCRE2_CODE_UNIT_WIDTH == 32 */

/* NOTE(review): this loop variable shadows the function-scope `i`. */
for (int i = 0; i < 2; i++)
  {
  replicate_imm_vector(compiler, i, cmp1_ind, char1 | bit, TMP1);

  if (char1 != char2)
    replicate_imm_vector(compiler, i, cmp2_ind, bit != 0 ? bit : char2, TMP1);
  }

#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */

if (compare_type == vector_compare_match2)
  {
  /* An all-zero vector; it is the second operand of VFENE below. */
  /* VREPI */
  instruction[0] = (sljit_u16)(0xe700 | (zero_ind << 4));
  instruction[1] = 0;
  instruction[2] = (sljit_u16)((0x8 << 8) | 0x45);
  sljit_emit_op_custom(compiler, instruction, 6);
  }

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
/* The UTF post-check jumps back here after advancing STR_PTR. */
restart = LABEL();
#endif

load_from_mem_vector(compiler, TRUE, data_ind, str_ptr_reg_ind, 0);
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, ~15);

if (compare_type != vector_compare_match2)
  {
  if (compare_type == vector_compare_match1i)
    fast_forward_char_pair_sse2_compare(compiler, compare_type, 0, data_ind, cmp1_ind, cmp2_ind, tmp_ind);

  /* VFEE */
  instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
  instruction[1] = (sljit_u16)((cmp1_ind << 12) | (1 << 4));
  instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0x80);
  sljit_emit_op_custom(compiler, instruction, 6);
  }
else
  {
  for (i = 0; i < 3; i++)
    fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);

  /* VFENE */
  instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
  instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
  instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
  sljit_emit_op_custom(compiler, instruction, 6);
  }

/* Copy byte element 7 of the result into TMP1 (presumably the byte index
   produced by VFEE / VFENE - confirm against the instruction definitions). */
/* VLGVB */
instruction[0] = (sljit_u16)(0xe700 | (tmp1_reg_ind << 4) | data_ind);
instruction[1] = 7;
instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
sljit_emit_op_custom(compiler, instruction, 6);

/* Done if the advanced STR_PTR is still below the next block boundary. */
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
quit = CMP(SLJIT_LESS, STR_PTR, 0, TMP2, 0);

/* Otherwise continue from the aligned boundary; the loop adds 16 first. */
OP2(SLJIT_SUB, STR_PTR, 0, TMP2, 0, SLJIT_IMM, 16);

/* Second part (aligned) */
start = LABEL();

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);

partial_quit[1] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
if (common->mode == PCRE2_JIT_COMPLETE)
  add_jump(compiler, &common->failed_match, partial_quit[1]);

load_from_mem_vector(compiler, TRUE, data_ind, str_ptr_reg_ind, 0);

if (compare_type != vector_compare_match2)
  {
  if (compare_type == vector_compare_match1i)
    fast_forward_char_pair_sse2_compare(compiler, compare_type, 0, data_ind, cmp1_ind, cmp2_ind, tmp_ind);

  /* VFEE */
  instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
  instruction[1] = (sljit_u16)((cmp1_ind << 12) | (1 << 4));
  instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0x80);
  sljit_emit_op_custom(compiler, instruction, 6);
  }
else
  {
  for (i = 0; i < 3; i++)
    fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);

  /* VFENE */
  instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
  instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
  instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
  sljit_emit_op_custom(compiler, instruction, 6);
  }

/* The custom instruction above set the condition code; tell sljit so, then
   repeat the loop while the overflow condition holds (presumably "nothing
   found in this block" - confirm against the VFEE / VFENE definitions). */
sljit_set_current_flags(compiler, SLJIT_SET_OVERFLOW);
JUMPTO(SLJIT_OVERFLOW, start);

/* VLGVB */
instruction[0] = (sljit_u16)(0xe700 | (tmp1_reg_ind << 4) | data_ind);
instruction[1] = 7;
instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
sljit_emit_op_custom(compiler, instruction, 6);

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);

JUMPHERE(quit);

if (common->mode != PCRE2_JIT_COMPLETE)
  {
  /* Both end-of-input jumps land here; STR_PTR is replaced by STR_END when
     it is greater (per the SELECT argument order). */
  JUMPHERE(partial_quit[0]);
  JUMPHERE(partial_quit[1]);
  OP2U(SLJIT_SUB | SLJIT_SET_GREATER, STR_PTR, 0, STR_END, 0);
  SELECT(SLJIT_GREATER, STR_PTR, STR_END, 0, STR_PTR);
  }
else
  add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
if (common->utf && offset > 0)
  {
  SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE);

  /* Check the code unit `offset` units before the hit: accept the hit if
     jump_if_utf_char_start() takes its jump, otherwise step one code unit
     forward and search again from `restart`. */
  OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offset));

  quit = jump_if_utf_char_start(compiler, TMP1);

  OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
  add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

  /* Re-create the TMP2 value expected at `restart`. */
  OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 16);
  JUMPTO(SLJIT_JUMP, restart);

  JUMPHERE(quit);
  }
#endif
}
1427
1428 #define JIT_HAS_FAST_REQUESTED_CHAR_SIMD 1
1429
fast_requested_char_simd(compiler_common * common,PCRE2_UCHAR char1,PCRE2_UCHAR char2)1430 static jump_list *fast_requested_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2)
1431 {
1432 DEFINE_COMPILER;
1433 sljit_u16 instruction[3];
1434 struct sljit_label *start;
1435 struct sljit_jump *quit;
1436 jump_list *not_found = NULL;
1437 vector_compare_type compare_type = vector_compare_match1;
1438 sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
1439 sljit_s32 tmp3_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP3);
1440 sljit_s32 data_ind = 0;
1441 sljit_s32 tmp_ind = 1;
1442 sljit_s32 cmp1_ind = 2;
1443 sljit_s32 cmp2_ind = 3;
1444 sljit_s32 zero_ind = 4;
1445 sljit_u32 bit = 0;
1446 int i;
1447
1448 if (char1 != char2)
1449 {
1450 bit = char1 ^ char2;
1451 compare_type = vector_compare_match1i;
1452
1453 if (!is_powerof2(bit))
1454 {
1455 bit = 0;
1456 compare_type = vector_compare_match2;
1457 }
1458 }
1459
1460 add_jump(compiler, ¬_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
1461
1462 /* First part (unaligned start) */
1463
1464 OP2(SLJIT_ADD, TMP2, 0, TMP1, 0, SLJIT_IMM, 16);
1465
1466 #if PCRE2_CODE_UNIT_WIDTH != 32
1467
1468 /* VREPI */
1469 instruction[0] = (sljit_u16)(0xe700 | (cmp1_ind << 4));
1470 instruction[1] = (sljit_u16)(char1 | bit);
1471 instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45);
1472 sljit_emit_op_custom(compiler, instruction, 6);
1473
1474 if (char1 != char2)
1475 {
1476 /* VREPI */
1477 instruction[0] = (sljit_u16)(0xe700 | (cmp2_ind << 4));
1478 instruction[1] = (sljit_u16)(bit != 0 ? bit : char2);
1479 /* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */
1480 sljit_emit_op_custom(compiler, instruction, 6);
1481 }
1482
1483 #else /* PCRE2_CODE_UNIT_WIDTH == 32 */
1484
1485 for (int i = 0; i < 2; i++)
1486 {
1487 replicate_imm_vector(compiler, i, cmp1_ind, char1 | bit, TMP3);
1488
1489 if (char1 != char2)
1490 replicate_imm_vector(compiler, i, cmp2_ind, bit != 0 ? bit : char2, TMP3);
1491 }
1492
1493 #endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
1494
1495 if (compare_type == vector_compare_match2)
1496 {
1497 /* VREPI */
1498 instruction[0] = (sljit_u16)(0xe700 | (zero_ind << 4));
1499 instruction[1] = 0;
1500 instruction[2] = (sljit_u16)((0x8 << 8) | 0x45);
1501 sljit_emit_op_custom(compiler, instruction, 6);
1502 }
1503
1504 load_from_mem_vector(compiler, TRUE, data_ind, tmp1_reg_ind, 0);
1505 OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, ~15);
1506
1507 if (compare_type != vector_compare_match2)
1508 {
1509 if (compare_type == vector_compare_match1i)
1510 fast_forward_char_pair_sse2_compare(compiler, compare_type, 0, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1511
1512 /* VFEE */
1513 instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1514 instruction[1] = (sljit_u16)((cmp1_ind << 12) | (1 << 4));
1515 instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0x80);
1516 sljit_emit_op_custom(compiler, instruction, 6);
1517 }
1518 else
1519 {
1520 for (i = 0; i < 3; i++)
1521 fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1522
1523 /* VFENE */
1524 instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1525 instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
1526 instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
1527 sljit_emit_op_custom(compiler, instruction, 6);
1528 }
1529
1530 /* VLGVB */
1531 instruction[0] = (sljit_u16)(0xe700 | (tmp3_reg_ind << 4) | data_ind);
1532 instruction[1] = 7;
1533 instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
1534 sljit_emit_op_custom(compiler, instruction, 6);
1535
1536 OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP3, 0);
1537 quit = CMP(SLJIT_LESS, TMP1, 0, TMP2, 0);
1538
1539 OP2(SLJIT_SUB, TMP1, 0, TMP2, 0, SLJIT_IMM, 16);
1540
1541 /* Second part (aligned) */
1542 start = LABEL();
1543
1544 OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 16);
1545
1546 add_jump(compiler, ¬_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
1547
1548 load_from_mem_vector(compiler, TRUE, data_ind, tmp1_reg_ind, 0);
1549
1550 if (compare_type != vector_compare_match2)
1551 {
1552 if (compare_type == vector_compare_match1i)
1553 fast_forward_char_pair_sse2_compare(compiler, compare_type, 0, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1554
1555 /* VFEE */
1556 instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1557 instruction[1] = (sljit_u16)((cmp1_ind << 12) | (1 << 4));
1558 instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0x80);
1559 sljit_emit_op_custom(compiler, instruction, 6);
1560 }
1561 else
1562 {
1563 for (i = 0; i < 3; i++)
1564 fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1565
1566 /* VFENE */
1567 instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1568 instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
1569 instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
1570 sljit_emit_op_custom(compiler, instruction, 6);
1571 }
1572
1573 sljit_set_current_flags(compiler, SLJIT_SET_OVERFLOW);
1574 JUMPTO(SLJIT_OVERFLOW, start);
1575
1576 /* VLGVB */
1577 instruction[0] = (sljit_u16)(0xe700 | (tmp3_reg_ind << 4) | data_ind);
1578 instruction[1] = 7;
1579 instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
1580 sljit_emit_op_custom(compiler, instruction, 6);
1581
1582 OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP3, 0);
1583
1584 JUMPHERE(quit);
1585 add_jump(compiler, ¬_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
1586
1587 return not_found;
1588 }
1589
/* Feature flag: a SIMD fast_forward_char_pair_simd() is defined for this target. */
#define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD 1

/* Emits s390x vector code that advances STR_PTR to the first position where
   the code unit at offs1 is char1a or char1b and the code unit at offs2 is
   char2a or char2b.

   common          compiler state; the mode must be PCRE2_JIT_COMPLETE
   offs1           offset of the first character; must be greater than offs2
   char1a, char1b  accepted values for the first character
   offs2           offset of the second character
   char2a, char2b  accepted values for the second character

   While searching, STR_PTR is biased by IN_UCHARS(offs1); the bias is removed
   before the emitted code finishes. Reaching STR_END jumps to
   common->failed_match. TMP1 and TMP2 are used as scratch registers; TMP3
   holds the original STR_END while common->match_end_ptr is in use. */
static void fast_forward_char_pair_simd(compiler_common *common, sljit_s32 offs1,
  PCRE2_UCHAR char1a, PCRE2_UCHAR char1b, sljit_s32 offs2, PCRE2_UCHAR char2a, PCRE2_UCHAR char2b)
{
DEFINE_COMPILER;
sljit_u16 instruction[3];
struct sljit_label *start;
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
struct sljit_label *restart;
#endif
struct sljit_jump *quit;
struct sljit_jump *jump[2];
vector_compare_type compare1_type = vector_compare_match1;
vector_compare_type compare2_type = vector_compare_match1;
sljit_u32 bit1 = 0;
sljit_u32 bit2 = 0;
/* Byte distance from the first to the second character; negative because
   offs1 > offs2. */
sljit_s32 diff = IN_UCHARS(offs2 - offs1);
sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
sljit_s32 tmp2_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP2);
sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);
/* Vector register numbers used by the encodings below. */
sljit_s32 data1_ind = 0;
sljit_s32 data2_ind = 1;
sljit_s32 tmp1_ind = 2;
sljit_s32 tmp2_ind = 3;
sljit_s32 cmp1a_ind = 4;
sljit_s32 cmp1b_ind = 5;
sljit_s32 cmp2a_ind = 6;
sljit_s32 cmp2b_ind = 7;
sljit_s32 zero_ind = 8;
int i;

SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE && offs1 > offs2);
SLJIT_ASSERT(-diff <= (sljit_s32)IN_UCHARS(max_fast_forward_char_pair_offset()));
/* TMP1 is later used as the index register of a vector load, where index 0
   is what the other loads pass to mean "no index". */
SLJIT_ASSERT(tmp1_reg_ind != 0 && tmp2_reg_ind != 0);

/* Pick the comparison strategy for each character, as in the single
   character search: equal, differing in one bit, or two unrelated values. */
if (char1a != char1b)
  {
  bit1 = char1a ^ char1b;
  compare1_type = vector_compare_match1i;

  if (!is_powerof2(bit1))
    {
    bit1 = 0;
    compare1_type = vector_compare_match2;
    }
  }

if (char2a != char2b)
  {
  bit2 = char2a ^ char2b;
  compare2_type = vector_compare_match1i;

  if (!is_powerof2(bit2))
    {
    bit2 = 0;
    compare2_type = vector_compare_match2;
    }
  }

/* Initialize. */
if (common->match_end_ptr != 0)
  {
  /* Save STR_END in TMP3, then lower it to the stored match end plus
     IN_UCHARS(offs1 + 1) when that value is smaller. */
  OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr);
  OP1(SLJIT_MOV, TMP3, 0, STR_END, 0);
  OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1));

  OP2U(SLJIT_SUB | SLJIT_SET_LESS, TMP1, 0, STR_END, 0);
  SELECT(SLJIT_LESS, STR_END, TMP1, 0, STR_END);
  }

/* Bias STR_PTR by offs1 and remember the start of its 16-byte block in TMP2. */
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));
add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
OP2(SLJIT_AND, TMP2, 0, STR_PTR, 0, SLJIT_IMM, ~15);

#if PCRE2_CODE_UNIT_WIDTH != 32

/* TMP1 = STR_PTR + diff, the address of the second character. */
OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, -diff);

/* VREPI */
instruction[0] = (sljit_u16)(0xe700 | (cmp1a_ind << 4));
instruction[1] = (sljit_u16)(char1a | bit1);
instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45);
sljit_emit_op_custom(compiler, instruction, 6);

/* instruction[2] keeps the value set above for the VREPIs that follow. */
if (char1a != char1b)
  {
  /* VREPI */
  instruction[0] = (sljit_u16)(0xe700 | (cmp1b_ind << 4));
  instruction[1] = (sljit_u16)(bit1 != 0 ? bit1 : char1b);
  /* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */
  sljit_emit_op_custom(compiler, instruction, 6);
  }

/* VREPI */
instruction[0] = (sljit_u16)(0xe700 | (cmp2a_ind << 4));
instruction[1] = (sljit_u16)(char2a | bit2);
/* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */
sljit_emit_op_custom(compiler, instruction, 6);

if (char2a != char2b)
  {
  /* VREPI */
  instruction[0] = (sljit_u16)(0xe700 | (cmp2b_ind << 4));
  instruction[1] = (sljit_u16)(bit2 != 0 ? bit2 : char2b);
  /* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */
  sljit_emit_op_custom(compiler, instruction, 6);
  }

#else /* PCRE2_CODE_UNIT_WIDTH == 32 */

/* NOTE(review): this loop variable shadows the function-scope `i`. */
for (int i = 0; i < 2; i++)
  {
  replicate_imm_vector(compiler, i, cmp1a_ind, char1a | bit1, TMP1);

  if (char1a != char1b)
    replicate_imm_vector(compiler, i, cmp1b_ind, bit1 != 0 ? bit1 : char1b, TMP1);

  replicate_imm_vector(compiler, i, cmp2a_ind, char2a | bit2, TMP1);

  if (char2a != char2b)
    replicate_imm_vector(compiler, i, cmp2b_ind, bit2 != 0 ? bit2 : char2b, TMP1);
  }

/* TMP1 served as the temporary above, so the address of the second
   character (STR_PTR + diff) is computed only now. */
OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, -diff);

#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */

/* An all-zero vector; it is the second operand of VFENE below. */
/* VREPI */
instruction[0] = (sljit_u16)(0xe700 | (zero_ind << 4));
instruction[1] = 0;
instruction[2] = (sljit_u16)((0x8 << 8) | 0x45);
sljit_emit_op_custom(compiler, instruction, 6);

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
/* The UTF post-check jumps back here after advancing STR_PTR. */
restart = LABEL();
#endif

/* Load the second-character data from TMP1: with vlbb FALSE when TMP1 lies
   below the aligned block of STR_PTR, with vlbb TRUE otherwise. */
jump[0] = CMP(SLJIT_LESS, TMP1, 0, TMP2, 0);
load_from_mem_vector(compiler, TRUE, data2_ind, tmp1_reg_ind, 0);
jump[1] = JUMP(SLJIT_JUMP);
JUMPHERE(jump[0]);
load_from_mem_vector(compiler, FALSE, data2_ind, tmp1_reg_ind, 0);
JUMPHERE(jump[1]);

/* Load the first-character data; TMP2 becomes the next block boundary. */
load_from_mem_vector(compiler, TRUE, data1_ind, str_ptr_reg_ind, 0);
OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, 16);

/* The compare steps for both characters are emitted interleaved. */
for (i = 0; i < 3; i++)
  {
  fast_forward_char_pair_sse2_compare(compiler, compare1_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind);
  fast_forward_char_pair_sse2_compare(compiler, compare2_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp2_ind);
  }

/* Combine the two compare results into data1. */
/* VN */
instruction[0] = (sljit_u16)(0xe700 | (data1_ind << 4) | data1_ind);
instruction[1] = (sljit_u16)(data2_ind << 12);
instruction[2] = (sljit_u16)((0xe << 8) | 0x68);
sljit_emit_op_custom(compiler, instruction, 6);

/* VFENE */
instruction[0] = (sljit_u16)(0xe700 | (data1_ind << 4) | data1_ind);
instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
sljit_emit_op_custom(compiler, instruction, 6);

/* Copy byte element 7 of the result into TMP1 (presumably the byte index
   produced by VFENE - confirm against the instruction definition). */
/* VLGVB */
instruction[0] = (sljit_u16)(0xe700 | (tmp1_reg_ind << 4) | data1_ind);
instruction[1] = 7;
instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
sljit_emit_op_custom(compiler, instruction, 6);

/* Done if the advanced STR_PTR is still below the next block boundary. */
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
quit = CMP(SLJIT_LESS, STR_PTR, 0, TMP2, 0);

/* Otherwise continue from the aligned boundary. TMP1 now holds diff and is
   the index register of the second load in the main loop. */
OP2(SLJIT_SUB, STR_PTR, 0, TMP2, 0, SLJIT_IMM, 16);
OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, diff);

/* Main loop. */
start = LABEL();

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);
add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

/* data1 is loaded from STR_PTR, data2 from STR_PTR + TMP1 (= STR_PTR + diff). */
load_from_mem_vector(compiler, FALSE, data1_ind, str_ptr_reg_ind, 0);
load_from_mem_vector(compiler, FALSE, data2_ind, str_ptr_reg_ind, tmp1_reg_ind);

for (i = 0; i < 3; i++)
  {
  fast_forward_char_pair_sse2_compare(compiler, compare1_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind);
  fast_forward_char_pair_sse2_compare(compiler, compare2_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp2_ind);
  }

/* VN */
instruction[0] = (sljit_u16)(0xe700 | (data1_ind << 4) | data1_ind);
instruction[1] = (sljit_u16)(data2_ind << 12);
instruction[2] = (sljit_u16)((0xe << 8) | 0x68);
sljit_emit_op_custom(compiler, instruction, 6);

/* VFENE */
instruction[0] = (sljit_u16)(0xe700 | (data1_ind << 4) | data1_ind);
instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
sljit_emit_op_custom(compiler, instruction, 6);

/* The custom instruction above set the condition code; tell sljit so, then
   repeat the loop while the overflow condition holds (presumably "nothing
   found in this block" - confirm against the VFENE definition). */
sljit_set_current_flags(compiler, SLJIT_SET_OVERFLOW);
JUMPTO(SLJIT_OVERFLOW, start);

/* The index goes to TMP2 here because TMP1 still holds diff. */
/* VLGVB */
instruction[0] = (sljit_u16)(0xe700 | (tmp2_reg_ind << 4) | data1_ind);
instruction[1] = 7;
instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
sljit_emit_op_custom(compiler, instruction, 6);

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);

JUMPHERE(quit);

add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
if (common->utf)
  {
  SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE);

  /* Check the code unit at the unbiased position: accept the hit if
     jump_if_utf_char_start() takes its jump, otherwise step one code unit
     forward and search again from `restart`. This load overwrites TMP1. */
  OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offs1));

  quit = jump_if_utf_char_start(compiler, TMP1);

  OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
  add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

  /* Re-create the state expected at `restart`: TMP2 = start of the block
     containing STR_PTR, TMP1 = STR_PTR + diff. */
  OP2(SLJIT_AND, TMP2, 0, STR_PTR, 0, SLJIT_IMM, ~15);
  OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, -diff);
  JUMPTO(SLJIT_JUMP, restart);

  JUMPHERE(quit);
  }
#endif

/* Remove the offs1 bias added at the start. */
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));

/* Restore the STR_END saved in TMP3. */
if (common->match_end_ptr != 0)
  OP1(SLJIT_MOV, STR_END, 0, TMP3, 0);
}
1836
1837 #endif /* SLJIT_CONFIG_S390X */
1838
1839 #if (defined SLJIT_CONFIG_LOONGARCH_64 && SLJIT_CONFIG_LOONGARCH_64)
1840
1841 #ifdef __linux__
1842 /* Using getauxval(AT_HWCAP) under Linux for detecting whether LSX is available */
1843 #include <sys/auxv.h>
1844 #define LOONGARCH_HWCAP_LSX (1 << 4)
1845 #define HAS_LSX_SUPPORT ((getauxval(AT_HWCAP) & LOONGARCH_HWCAP_LSX) != 0)
1846 #else
1847 #define HAS_LSX_SUPPORT 0
1848 #endif
1849
1850 typedef sljit_ins sljit_u32;
1851
1852 #define SI12_IMM_MASK 0x003ffc00
1853 #define UI5_IMM_MASK 0x00007c00
1854 #define UI2_IMM_MASK 0x00000c00
1855
1856 #define VD(vd) ((sljit_ins)vd << 0)
1857 #define VJ(vj) ((sljit_ins)vj << 5)
1858 #define VK(vk) ((sljit_ins)vk << 10)
1859 #define RD_V(rd) ((sljit_ins)rd << 0)
1860 #define RJ_V(rj) ((sljit_ins)rj << 5)
1861
1862 #define IMM_SI12(imm) (((sljit_ins)(imm) << 10) & SI12_IMM_MASK)
1863 #define IMM_UI5(imm) (((sljit_ins)(imm) << 10) & UI5_IMM_MASK)
1864 #define IMM_UI2(imm) (((sljit_ins)(imm) << 10) & UI2_IMM_MASK)
1865
1866 // LSX OPCODES:
1867 #define VLD 0x2c000000
1868 #define VOR_V 0x71268000
1869 #define VAND_V 0x71260000
1870 #define VBSLL_V 0x728e0000
1871 #define VMSKLTZ_B 0x729c4000
1872 #define VPICKVE2GR_WU 0x72f3e000
1873
1874 #if PCRE2_CODE_UNIT_WIDTH == 8
1875 #define VREPLGR2VR 0x729f0000
1876 #define VSEQ 0x70000000
1877 #elif PCRE2_CODE_UNIT_WIDTH == 16
1878 #define VREPLGR2VR 0x729f0400
1879 #define VSEQ 0x70008000
1880 #else
1881 #define VREPLGR2VR 0x729f0800
1882 #define VSEQ 0x70010000
1883 #endif
1884
/* Emits the LSX instructions which compare the data held in vector register
dst_ind with one or two comparison vectors. The comparison result replaces
the data in dst_ind.

Arguments:
  compiler      the sljit compiler which receives the instructions
  compare_type  selects the emitted sequence:
                  vector_compare_match2   compare with cmp1 and with cmp2,
                                          then OR the two results
                  vector_compare_match1i  OR cmp2 into the data first, then
                                          compare with cmp1
                  anything else           compare with cmp1 only
  dst_ind       vector register index of the data (overwritten)
  cmp1_ind      vector register index of the first comparison value
  cmp2_ind      vector register index of the second comparison value
  tmp_ind       scratch vector register index (written only for
                vector_compare_match2)
*/
static void fast_forward_char_pair_lsx_compare(struct sljit_compiler *compiler, vector_compare_type compare_type,
  sljit_s32 dst_ind, sljit_s32 cmp1_ind, sljit_s32 cmp2_ind, sljit_s32 tmp_ind)
{
/* VSEQ.B/H/W dst, dst, cmp1 - shared by every variant. */
sljit_ins seq_dst_cmp1 = VSEQ | VD(dst_ind) | VJ(dst_ind) | VK(cmp1_ind);

if (compare_type == vector_compare_match2)
  {
  /* VBSLL.V tmp, dst, 0 - the shift amount is zero, so this presumably just
  copies the data into the scratch register. */
  push_inst(compiler, VBSLL_V | VD(tmp_ind) | VJ(dst_ind) | IMM_UI5(0));

  /* Compare the data with the first value ... */
  push_inst(compiler, seq_dst_cmp1);

  /* ... and its copy with the second value: VSEQ.B/H/W tmp, tmp, cmp2 */
  push_inst(compiler, VSEQ | VD(tmp_ind) | VJ(tmp_ind) | VK(cmp2_ind));

  /* VOR.V dst, tmp, dst - either comparison may succeed. */
  push_inst(compiler, VOR_V | VD(dst_ind) | VJ(tmp_ind) | VK(dst_ind));
  return;
  }

if (compare_type == vector_compare_match1i)
  {
  /* VOR.V dst, cmp2, dst - the callers in this file fill cmp2 with the single
  bit in which the two characters differ, and cmp1 with char | bit. */
  push_inst(compiler, VOR_V | VD(dst_ind) | VJ(cmp2_ind) | VK(dst_ind));
  }

push_inst(compiler, seq_dst_cmp1);
}
1914
1915 #define JIT_HAS_FAST_FORWARD_CHAR_SIMD HAS_LSX_SUPPORT
1916
/* Generates code which moves STR_PTR forward to the next code unit that is
equal to char1 or char2, testing the subject in 16 byte blocks with LSX
instructions.

The generated code first inspects the 16 byte aligned block which contains
STR_PTR (result bits that belong to bytes before STR_PTR are shifted out),
then continues with the following aligned blocks.

Arguments:
  common   the JIT compiler state
  char1    the character to search for
  char2    alternative character; when it differs from char1 in a single bit
           the vector_compare_match1i sequence is used, otherwise
           vector_compare_match2
  offset   used only in UTF mode for 8 and 16 bit code units: when positive,
           the code unit at STR_PTR - offset is tested with
           jump_if_utf_char_start() and the search is resumed one code unit
           later if that test does not take its jump

When common->mode is PCRE2_JIT_COMPLETE, reaching STR_END jumps to
common->failed_match. In the other modes the end-of-subject jumps land at the
end of this code, where STR_PTR is limited to STR_END.
*/
static void fast_forward_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_s32 offset)
{
DEFINE_COMPILER;
struct sljit_label *aligned_loop;
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
struct sljit_label *realign;
#endif
struct sljit_jump *found;
struct sljit_jump *end_reached[2];
vector_compare_type cmp_kind = vector_compare_match1;
sljit_s32 tmp1_gpr = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
sljit_s32 str_ptr_gpr = sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);
sljit_s32 vec_data = 0;
sljit_s32 vec_tmp = 1;
sljit_s32 vec_cmp1 = 2;
sljit_s32 vec_cmp2 = 3;
sljit_u32 case_bit = 0;
/* Instruction words which are emitted once for each of the two parts. */
/* VLD vd, rj, si12 */
sljit_ins ins_load = VLD | VD(vec_data) | RJ_V(str_ptr_gpr) | IMM_SI12(0);
/* VMSKLTZ.B vd, vj */
sljit_ins ins_mask = VMSKLTZ_B | VD(vec_tmp) | VJ(vec_data);
/* VPICKVE2GR.WU rd, vj, ui2 */
sljit_ins ins_pick = VPICKVE2GR_WU | RD_V(tmp1_gpr) | VJ(vec_tmp) | IMM_UI2(0);

SLJIT_UNUSED_ARG(offset);

/* Select the comparison method. */
if (char1 != char2)
  {
  case_bit = char1 ^ char2;

  if (is_powerof2(case_bit))
    cmp_kind = vector_compare_match1i;
  else
    {
    cmp_kind = vector_compare_match2;
    case_bit = 0;
    }
  }

end_reached[0] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
if (common->mode == PCRE2_JIT_COMPLETE)
  add_jump(compiler, &common->failed_match, end_reached[0]);

/* Fill the comparison vectors: cmp1 gets char1 | bit, cmp2 gets the bit
itself or, when there is no such bit, char2. */

OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char1 | case_bit);

/* VREPLGR2VR.B/H/W vd, rj */
push_inst(compiler, VREPLGR2VR | VD(vec_cmp1) | RJ_V(tmp1_gpr));

if (char1 != char2)
  {
  OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, case_bit == 0 ? char2 : case_bit);

  /* VREPLGR2VR.B/H/W vd, rj */
  push_inst(compiler, VREPLGR2VR | VD(vec_cmp2) | RJ_V(tmp1_gpr));
  }

OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
/* The UTF check at the end comes back here with TMP2 == STR_PTR. */
realign = LABEL();
#endif

/* Unaligned start: TMP2 becomes the distance of STR_PTR from the preceding
16 byte boundary, and STR_PTR is moved back to that boundary for the load. */
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf);
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);

push_inst(compiler, ins_load);
fast_forward_char_pair_lsx_compare(compiler, cmp_kind, vec_data, vec_cmp1, vec_cmp2, vec_tmp);
push_inst(compiler, ins_mask);
push_inst(compiler, ins_pick);

/* Back to the real STR_PTR; drop the result bits of the bytes before it. */
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);

found = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);

/* Nothing found: continue from the aligned address. */
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);

/* Aligned blocks. */
aligned_loop = LABEL();

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);

end_reached[1] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
if (common->mode == PCRE2_JIT_COMPLETE)
  add_jump(compiler, &common->failed_match, end_reached[1]);

push_inst(compiler, ins_load);
fast_forward_char_pair_lsx_compare(compiler, cmp_kind, vec_data, vec_cmp1, vec_cmp2, vec_tmp);
push_inst(compiler, ins_mask);
push_inst(compiler, ins_pick);

CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, aligned_loop);

JUMPHERE(found);

/* CTZ.W rd, rj - index of the lowest set result bit, added to STR_PTR. */
push_inst(compiler, CTZ_W | RD_V(tmp1_gpr) | RJ_V(tmp1_gpr));

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);

if (common->mode == PCRE2_JIT_COMPLETE)
  add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
else
  {
  /* The end-of-subject jumps were not attached to failed_match: they arrive
  here, and STR_PTR is replaced by STR_END when it is greater. */
  JUMPHERE(end_reached[0]);
  JUMPHERE(end_reached[1]);
  OP2U(SLJIT_SUB | SLJIT_SET_GREATER, STR_PTR, 0, STR_END, 0);
  SELECT(SLJIT_GREATER, STR_PTR, STR_END, 0, STR_PTR);
  }

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
if (common->utf && offset > 0)
  {
  SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE);

  OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offset));

  found = jump_if_utf_char_start(compiler, TMP1);

  /* The jump was not taken: step one code unit forward and search again. */
  OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
  add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
  OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);
  JUMPTO(SLJIT_JUMP, realign);

  JUMPHERE(found);
  }
#endif
}
2050
2051 #define JIT_HAS_FAST_REQUESTED_CHAR_SIMD HAS_LSX_SUPPORT
2052
fast_requested_char_simd(compiler_common * common,PCRE2_UCHAR char1,PCRE2_UCHAR char2)2053 static jump_list *fast_requested_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2)
2054 {
2055 DEFINE_COMPILER;
2056 struct sljit_label *start;
2057 struct sljit_jump *quit;
2058 jump_list *not_found = NULL;
2059 vector_compare_type compare_type = vector_compare_match1;
2060 sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
2061 sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);
2062 sljit_s32 data_ind = 0;
2063 sljit_s32 tmp_ind = 1;
2064 sljit_s32 cmp1_ind = 2;
2065 sljit_s32 cmp2_ind = 3;
2066 sljit_u32 bit = 0;
2067
2068 if (char1 != char2)
2069 {
2070 bit = char1 ^ char2;
2071 compare_type = vector_compare_match1i;
2072
2073 if (!is_powerof2(bit))
2074 {
2075 bit = 0;
2076 compare_type = vector_compare_match2;
2077 }
2078 }
2079
2080 add_jump(compiler, ¬_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
2081 OP1(SLJIT_MOV, TMP2, 0, TMP1, 0);
2082 OP1(SLJIT_MOV, TMP3, 0, STR_PTR, 0);
2083
2084 /* First part (unaligned start) */
2085
2086 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char1 | bit);
2087
2088 /* VREPLGR2VR vd, rj */
2089 push_inst(compiler, VREPLGR2VR | VD(cmp1_ind) | RJ_V(tmp1_reg_ind));
2090
2091 if (char1 != char2)
2092 {
2093 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, bit != 0 ? bit : char2);
2094 /* VREPLGR2VR vd, rj */
2095 push_inst(compiler, VREPLGR2VR | VD(cmp2_ind) | RJ_V(tmp1_reg_ind));
2096 }
2097
2098 OP1(SLJIT_MOV, STR_PTR, 0, TMP2, 0);
2099 OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf);
2100 OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
2101
2102 /* VLD vd, rj, si12 */
2103 push_inst(compiler, VLD | VD(data_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(0));
2104 fast_forward_char_pair_lsx_compare(compiler, compare_type, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
2105
2106 /* VMSKLTZ.B vd, vj */
2107 push_inst(compiler, VMSKLTZ_B | VD(tmp_ind) | VJ(data_ind));
2108
2109 /* VPICKVE2GR.WU rd, vj, ui2 */
2110 push_inst(compiler, VPICKVE2GR_WU | RD_V(tmp1_reg_ind) | VJ(tmp_ind) | IMM_UI2(0));
2111
2112 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
2113 OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);
2114
2115 quit = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);
2116
2117 OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
2118
2119 /* Second part (aligned) */
2120 start = LABEL();
2121
2122 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);
2123
2124 add_jump(compiler, ¬_found, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
2125
2126 /* VLD vd, rj, si12 */
2127 push_inst(compiler, VLD | VD(data_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(0));
2128 fast_forward_char_pair_lsx_compare(compiler, compare_type, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
2129
2130 /* VMSKLTZ.B vd, vj */
2131 push_inst(compiler, VMSKLTZ_B | VD(tmp_ind) | VJ(data_ind));
2132
2133 /* VPICKVE2GR.WU rd, vj, ui2 */
2134 push_inst(compiler, VPICKVE2GR_WU | RD_V(tmp1_reg_ind) | VJ(tmp_ind) | IMM_UI2(0));
2135
2136 CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);
2137
2138 JUMPHERE(quit);
2139
2140 /* CTZ.W rd, rj */
2141 push_inst(compiler, CTZ_W | RD_V(tmp1_reg_ind) | RJ_V(tmp1_reg_ind));
2142
2143 OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, STR_PTR, 0);
2144 add_jump(compiler, ¬_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
2145
2146 OP1(SLJIT_MOV, STR_PTR, 0, TMP3, 0);
2147 return not_found;
2148 }
2149
2150 #define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD HAS_LSX_SUPPORT
2151
/* Generates code which moves STR_PTR forward to the next position where the
code unit at offset offs1 is char1a or char1b and the code unit at offset
offs2 is char2a or char2b. The subject is tested in 16 byte blocks with LSX
instructions.

Arguments:
  common   the JIT compiler state; common->mode must be PCRE2_JIT_COMPLETE
  offs1    offset of the first character, must be greater than offs2
  char1a   first character, first variant
  char1b   first character, second variant (same as char1a if there is none)
  offs2    offset of the second character
  char2a   second character, first variant
  char2b   second character, second variant (same as char2a if there is none)

The generated code jumps to common->failed_match when STR_END is reached.
While searching, STR_PTR is advanced by offs1; this is undone before the
code falls through. When common->match_end_ptr is not zero, STR_END is
temporarily lowered (its original value is kept in TMP3) and restored at
the end.
*/
static void fast_forward_char_pair_simd(compiler_common *common, sljit_s32 offs1,
  PCRE2_UCHAR char1a, PCRE2_UCHAR char1b, sljit_s32 offs2, PCRE2_UCHAR char2a, PCRE2_UCHAR char2b)
{
DEFINE_COMPILER;
vector_compare_type compare1_type = vector_compare_match1;
vector_compare_type compare2_type = vector_compare_match1;
sljit_u32 bit1 = 0;
sljit_u32 bit2 = 0;
sljit_u32 diff = IN_UCHARS(offs1 - offs2);
sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
sljit_s32 tmp2_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP2);
sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);
sljit_s32 data1_ind = 0;
sljit_s32 data2_ind = 1;
sljit_s32 tmp1_ind = 2;
sljit_s32 tmp2_ind = 3;
sljit_s32 cmp1a_ind = 4;
sljit_s32 cmp1b_ind = 5;
sljit_s32 cmp2a_ind = 6;
sljit_s32 cmp2b_ind = 7;
struct sljit_label *start;
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
struct sljit_label *restart;
#endif
struct sljit_jump *jump[2];

SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE && offs1 > offs2);
SLJIT_ASSERT(diff <= (unsigned)IN_UCHARS(max_fast_forward_char_pair_offset()));

/* Initialize. */
if (common->match_end_ptr != 0)
  {
  /* STR_END = min(STR_END, saved match end + offs1 + 1); the original
  STR_END is kept in TMP3. */
  OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr);
  OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1));
  OP1(SLJIT_MOV, TMP3, 0, STR_END, 0);

  OP2U(SLJIT_SUB | SLJIT_SET_LESS, TMP1, 0, STR_END, 0);
  SELECT(SLJIT_LESS, STR_END, TMP1, 0, STR_END);
  }

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));
add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

/* Comparison vectors of the first character. */
if (char1a == char1b)
  OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char1a);
else
  {
  bit1 = char1a ^ char1b;
  if (is_powerof2(bit1))
    {
    compare1_type = vector_compare_match1i;
    OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char1a | bit1);
    OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, bit1);
    }
  else
    {
    compare1_type = vector_compare_match2;
    bit1 = 0;
    OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char1a);
    OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, char1b);
    }
  }

/* VREPLGR2VR vd, rj */
push_inst(compiler, VREPLGR2VR | VD(cmp1a_ind) | RJ_V(tmp1_reg_ind));

if (char1a != char1b)
  {
  /* VREPLGR2VR vd, rj */
  push_inst(compiler, VREPLGR2VR | VD(cmp1b_ind) | RJ_V(tmp2_reg_ind));
  }

/* Comparison vectors of the second character. */
if (char2a == char2b)
  OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char2a);
else
  {
  bit2 = char2a ^ char2b;
  if (is_powerof2(bit2))
    {
    compare2_type = vector_compare_match1i;
    OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char2a | bit2);
    OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, bit2);
    }
  else
    {
    compare2_type = vector_compare_match2;
    bit2 = 0;
    OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char2a);
    OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, char2b);
    }
  }

/* VREPLGR2VR vd, rj */
push_inst(compiler, VREPLGR2VR | VD(cmp2a_ind) | RJ_V(tmp1_reg_ind));

if (char2a != char2b)
  {
  /* VREPLGR2VR vd, rj */
  push_inst(compiler, VREPLGR2VR | VD(cmp2b_ind) | RJ_V(tmp2_reg_ind));
  }

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
restart = LABEL();
#endif

/* TMP1 = STR_PTR - diff, TMP2 = distance of STR_PTR from the preceding
16 byte boundary, STR_PTR = that boundary. */
OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, diff);
OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf);
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);

/* VLD vd, rj, si12 */
push_inst(compiler, VLD | VD(data1_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(0));

/* The second data vector is either loaded from diff bytes below the aligned
address, or, when TMP1 is not below that address, produced from the first
vector by a byte shift. */
jump[0] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_PTR, 0);

/* VLD vd, rj, si12 */
push_inst(compiler, VLD | VD(data2_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(-(sljit_s8)diff));
jump[1] = JUMP(SLJIT_JUMP);

JUMPHERE(jump[0]);

/* VBSLL.V vd, vj, ui5 */
push_inst(compiler, VBSLL_V | VD(data2_ind) | VJ(data1_ind) | IMM_UI5(diff));

JUMPHERE(jump[1]);

fast_forward_char_pair_lsx_compare(compiler, compare2_type, data2_ind, cmp2a_ind, cmp2b_ind, tmp2_ind);
fast_forward_char_pair_lsx_compare(compiler, compare1_type, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind);

/* VAND.V vd, vj, vk - the two comparison results are combined with AND,
exactly as in the main loop below. */
push_inst(compiler, VAND_V | VD(data1_ind) | VJ(data1_ind) | VK(data2_ind));

/* VMSKLTZ.B vd, vj */
push_inst(compiler, VMSKLTZ_B | VD(tmp1_ind) | VJ(data1_ind));

/* VPICKVE2GR.WU rd, vj, ui2 */
push_inst(compiler, VPICKVE2GR_WU | RD_V(tmp1_reg_ind) | VJ(tmp1_ind) | IMM_UI2(0));

/* Ignore matches before the first STR_PTR. */
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);

jump[0] = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);

OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);

/* Main loop. */
start = LABEL();

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);
add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

/* VLD vd, rj, si12 */
push_inst(compiler, VLD | VD(data1_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(0));
push_inst(compiler, VLD | VD(data2_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(-(sljit_s8)diff));

fast_forward_char_pair_lsx_compare(compiler, compare1_type, data1_ind, cmp1a_ind, cmp1b_ind, tmp2_ind);
fast_forward_char_pair_lsx_compare(compiler, compare2_type, data2_ind, cmp2a_ind, cmp2b_ind, tmp1_ind);

/* VAND.V vd, vj, vk */
push_inst(compiler, VAND_V | VD(data1_ind) | VJ(data1_ind) | VK(data2_ind));

/* VMSKLTZ.B vd, vj */
push_inst(compiler, VMSKLTZ_B | VD(tmp1_ind) | VJ(data1_ind));

/* VPICKVE2GR.WU rd, vj, ui2 */
push_inst(compiler, VPICKVE2GR_WU | RD_V(tmp1_reg_ind) | VJ(tmp1_ind) | IMM_UI2(0));

CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);

JUMPHERE(jump[0]);

/* CTZ.W rd, rj */
push_inst(compiler, CTZ_W | RD_V(tmp1_reg_ind) | RJ_V(tmp1_reg_ind));

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);

add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
if (common->utf)
  {
  OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offs1));

  jump[0] = jump_if_utf_char_start(compiler, TMP1);

  /* The jump was not taken: retry from the next code unit if there is one. */
  OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
  CMPTO(SLJIT_LESS, STR_PTR, 0, STR_END, 0, restart);

  add_jump(compiler, &common->failed_match, JUMP(SLJIT_JUMP));

  JUMPHERE(jump[0]);
  }
#endif

/* Undo the offs1 adjustment made at the beginning. */
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));

if (common->match_end_ptr != 0)
  OP1(SLJIT_MOV, STR_END, 0, TMP3, 0);
}
2352
2353 #endif /* SLJIT_CONFIG_LOONGARCH_64 */
2354
2355 #endif /* !SUPPORT_VALGRIND */
2356