1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8                        Written by Philip Hazel
9                     This module by Zoltan Herczeg
10      Original API code Copyright (c) 1997-2012 University of Cambridge
11           New API code Copyright (c) 2016-2019 University of Cambridge
12 
13 -----------------------------------------------------------------------------
14 Redistribution and use in source and binary forms, with or without
15 modification, are permitted provided that the following conditions are met:
16 
17     * Redistributions of source code must retain the above copyright notice,
18       this list of conditions and the following disclaimer.
19 
20     * Redistributions in binary form must reproduce the above copyright
21       notice, this list of conditions and the following disclaimer in the
22       documentation and/or other materials provided with the distribution.
23 
24     * Neither the name of the University of Cambridge nor the names of its
25       contributors may be used to endorse or promote products derived from
26       this software without specific prior written permission.
27 
28 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
29 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
32 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 POSSIBILITY OF SUCH DAMAGE.
39 -----------------------------------------------------------------------------
40 */
41 
42 #if !(defined SUPPORT_VALGRIND)
43 
44 #if ((defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86) \
45      || (defined SLJIT_CONFIG_S390X && SLJIT_CONFIG_S390X) \
46      || (defined SLJIT_CONFIG_LOONGARCH_64 && SLJIT_CONFIG_LOONGARCH_64))
47 
/* Selects the instruction sequence used to compare a vector of input data
with the searched character(s). */
typedef enum {
  /* One character: a single equality compare. */
  vector_compare_match1 = 0,
  /* Two characters which differ in exactly one bit: that bit is set in
  the data first, then a single equality compare is done. */
  vector_compare_match1i = 1,
  /* Two unrelated characters: two equality compares whose results are
  combined with OR. */
  vector_compare_match2 = 2
} vector_compare_type;
53 
54 #if (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86)
max_fast_forward_char_pair_offset(void)55 static SLJIT_INLINE sljit_s32 max_fast_forward_char_pair_offset(void)
56 {
57 #if PCRE2_CODE_UNIT_WIDTH == 8
58 /* The AVX2 code path is currently disabled. */
59 /* return sljit_has_cpu_feature(SLJIT_HAS_AVX2) ? 31 : 15; */
60 return 15;
61 #elif PCRE2_CODE_UNIT_WIDTH == 16
62 /* The AVX2 code path is currently disabled. */
63 /* return sljit_has_cpu_feature(SLJIT_HAS_AVX2) ? 15 : 7; */
64 return 7;
65 #elif PCRE2_CODE_UNIT_WIDTH == 32
66 /* The AVX2 code path is currently disabled. */
67 /* return sljit_has_cpu_feature(SLJIT_HAS_AVX2) ? 7 : 3; */
68 return 3;
69 #else
70 #error "Unsupported unit width"
71 #endif
72 }
73 #else /* !SLJIT_CONFIG_X86 */
max_fast_forward_char_pair_offset(void)74 static SLJIT_INLINE sljit_s32 max_fast_forward_char_pair_offset(void)
75 {
76 #if PCRE2_CODE_UNIT_WIDTH == 8
77 return 15;
78 #elif PCRE2_CODE_UNIT_WIDTH == 16
79 return 7;
80 #elif PCRE2_CODE_UNIT_WIDTH == 32
81 return 3;
82 #else
83 #error "Unsupported unit width"
84 #endif
85 }
86 #endif /* SLJIT_CONFIG_X86 */
87 
88 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
jump_if_utf_char_start(struct sljit_compiler * compiler,sljit_s32 reg)89 static struct sljit_jump *jump_if_utf_char_start(struct sljit_compiler *compiler, sljit_s32 reg)
90 {
91 #if PCRE2_CODE_UNIT_WIDTH == 8
92 OP2(SLJIT_AND, reg, 0, reg, 0, SLJIT_IMM, 0xc0);
93 return CMP(SLJIT_NOT_EQUAL, reg, 0, SLJIT_IMM, 0x80);
94 #elif PCRE2_CODE_UNIT_WIDTH == 16
95 OP2(SLJIT_AND, reg, 0, reg, 0, SLJIT_IMM, 0xfc00);
96 return CMP(SLJIT_NOT_EQUAL, reg, 0, SLJIT_IMM, 0xdc00);
97 #else
98 #error "Unknown code width"
99 #endif
100 }
101 #endif
102 
#endif /* SLJIT_CONFIG_X86 || SLJIT_CONFIG_S390X || SLJIT_CONFIG_LOONGARCH_64 */
104 
105 #if (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86)
106 
character_to_int32(PCRE2_UCHAR chr)107 static sljit_s32 character_to_int32(PCRE2_UCHAR chr)
108 {
109 sljit_u32 value = chr;
110 #if PCRE2_CODE_UNIT_WIDTH == 8
111 #define SIMD_COMPARE_TYPE_INDEX 0
112 return (sljit_s32)((value << 24) | (value << 16) | (value << 8) | value);
113 #elif PCRE2_CODE_UNIT_WIDTH == 16
114 #define SIMD_COMPARE_TYPE_INDEX 1
115 return (sljit_s32)((value << 16) | value);
116 #elif PCRE2_CODE_UNIT_WIDTH == 32
117 #define SIMD_COMPARE_TYPE_INDEX 2
118 return (sljit_s32)(value);
119 #else
120 #error "Unsupported unit width"
121 #endif
122 }
123 
/* Emits (at most) one instruction of the sequence which turns the data
vector in register dst_ind into a match mask. Callers invoke this function
with step = 0, 1, 2, 3 in that order; splitting the sequence into steps lets
a caller interleave the sequences of two independent data vectors.

Arguments:
  compiler      the sljit compiler
  compare_type  how the data is matched (see vector_compare_type)
  reg_type      SLJIT_SIMD_REG_128 selects the SSE2 encoding (66 0F prefix),
                anything else a two byte VEX prefix
  step          0..3, the step of the sequence to emit
  dst_ind       machine index of the data / result register
  cmp1_ind      machine index of the first comparand register
  cmp2_ind      machine index of the second comparand (or bit mask) register
  tmp_ind       machine index of a scratch register (vector_compare_match2)

Instruction emitted per step:
  match1         step 2: PCMPEQ dst, cmp1
  match1i        step 0: POR dst, cmp2      step 2: PCMPEQ dst, cmp1
  match2, 128    step 0: MOVDQA tmp, dst    step 1: PCMPEQ dst, cmp1
                 step 2: PCMPEQ tmp, cmp2   step 3: POR dst, tmp
  match2, 256    step 0: PCMPEQ tmp, dst, cmp2 (three operand form)
                 step 1: PCMPEQ dst, cmp1   step 3: POR dst, tmp
All other (compare_type, step) combinations emit nothing. */
static void fast_forward_char_pair_sse2_compare(struct sljit_compiler *compiler, vector_compare_type compare_type,
  sljit_s32 reg_type, int step, sljit_s32 dst_ind, sljit_s32 cmp1_ind, sljit_s32 cmp2_ind, sljit_s32 tmp_ind)
{
sljit_u8 code[4];
sljit_u8 opcode;
sljit_s32 reg_field;
sljit_s32 rm_field;

SLJIT_ASSERT(step >= 0 && step <= 3);

/* First decide which instruction (if any) belongs to this step. */
if (compare_type != vector_compare_match2)
  {
  if (step == 0 && compare_type == vector_compare_match1i)
    {
    /* POR xmm1, xmm2/m128 */
    opcode = 0xeb;
    reg_field = dst_ind;
    rm_field = cmp2_ind;
    }
  else if (step == 2)
    {
    /* PCMPEQB/W/D xmm1, xmm2/m128 */
    opcode = 0x74 + SIMD_COMPARE_TYPE_INDEX;
    reg_field = dst_ind;
    rm_field = cmp1_ind;
    }
  else
    return;
  }
else
  {
  if (reg_type == SLJIT_SIMD_REG_256)
    {
    /* The three operand form does not need the register copy, so the
    second compare is done in step 0 and step 2 has nothing left to do. */
    if (step == 2)
      return;
    if (step == 0)
      step = 2;
    }

  switch (step)
    {
    case 0:
    SLJIT_ASSERT(reg_type == SLJIT_SIMD_REG_128);
    /* MOVDQA xmm1, xmm2/m128 */
    opcode = 0x6f;
    reg_field = tmp_ind;
    rm_field = dst_ind;
    break;

    case 1:
    /* PCMPEQB/W/D xmm1, xmm2/m128 */
    opcode = 0x74 + SIMD_COMPARE_TYPE_INDEX;
    reg_field = dst_ind;
    rm_field = cmp1_ind;
    break;

    case 2:
    /* PCMPEQB/W/D xmm1, xmm2/m128 */
    opcode = 0x74 + SIMD_COMPARE_TYPE_INDEX;
    reg_field = tmp_ind;
    rm_field = cmp2_ind;
    break;

    case 3:
    /* POR xmm1, xmm2/m128 */
    opcode = 0xeb;
    reg_field = dst_ind;
    rm_field = tmp_ind;
    break;

    default:
    return;
    }
  }

/* Then encode it: prefix, opcode, register-direct ModRM byte. */
if (reg_type == SLJIT_SIMD_REG_128)
  {
  code[0] = 0x66;
  code[1] = 0x0f;
  }
else
  {
  /* Two byte VEX prefix. */
  code[0] = 0xc5;
  code[1] = 0xfd;

  /* Every 256 bit instruction emitted here uses dst as its first source
  operand, which is stored (inverted) in bits 3-6 of the prefix. */
  if (reg_type == SLJIT_SIMD_REG_256)
    code[1] ^= (dst_ind << 3);
  }

code[2] = opcode;
code[3] = 0xc0 | (reg_field << 3) | rm_field;
sljit_emit_op_custom(compiler, code, 4);
}
230 
231 #define JIT_HAS_FAST_FORWARD_CHAR_SIMD (sljit_has_cpu_feature(SLJIT_HAS_SIMD))
232 
fast_forward_char_simd(compiler_common * common,PCRE2_UCHAR char1,PCRE2_UCHAR char2,sljit_s32 offset)233 static void fast_forward_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_s32 offset)
234 {
235 DEFINE_COMPILER;
236 sljit_u8 instruction[8];
237 /* The AVX2 code path is currently disabled. */
238 /* sljit_s32 reg_type = sljit_has_cpu_feature(SLJIT_HAS_AVX2) ? SLJIT_SIMD_REG_256 : SLJIT_SIMD_REG_128; */
239 sljit_s32 reg_type = SLJIT_SIMD_REG_128;
240 sljit_s32 value;
241 struct sljit_label *start;
242 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
243 struct sljit_label *restart;
244 #endif
245 struct sljit_jump *quit;
246 struct sljit_jump *partial_quit[2];
247 vector_compare_type compare_type = vector_compare_match1;
248 sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
249 sljit_s32 data_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR0);
250 sljit_s32 cmp1_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR1);
251 sljit_s32 cmp2_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR2);
252 sljit_s32 tmp_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR3);
253 sljit_u32 bit = 0;
254 int i;
255 
256 SLJIT_UNUSED_ARG(offset);
257 
258 if (char1 != char2)
259   {
260   bit = char1 ^ char2;
261   compare_type = vector_compare_match1i;
262 
263   if (!is_powerof2(bit))
264     {
265     bit = 0;
266     compare_type = vector_compare_match2;
267     }
268   }
269 
270 partial_quit[0] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
271 if (common->mode == PCRE2_JIT_COMPLETE)
272   add_jump(compiler, &common->failed_match, partial_quit[0]);
273 
274 /* First part (unaligned start) */
275 value = SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_32 | SLJIT_SIMD_LANE_ZERO;
276 sljit_emit_simd_lane_mov(compiler, value, SLJIT_FR1, 0, SLJIT_IMM, character_to_int32(char1 | bit));
277 
278 if (char1 != char2)
279   sljit_emit_simd_lane_mov(compiler, value, SLJIT_FR2, 0, SLJIT_IMM, character_to_int32(bit != 0 ? bit : char2));
280 
281 OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);
282 
283 sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_FR1, SLJIT_FR1, 0);
284 
285 if (char1 != char2)
286   sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_FR2, SLJIT_FR2, 0);
287 
288 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
289 restart = LABEL();
290 #endif
291 
292 value = (reg_type == SLJIT_SIMD_REG_256) ? 0x1f : 0xf;
293 OP2(SLJIT_AND, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, ~value);
294 OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, value);
295 
296 value = (reg_type == SLJIT_SIMD_REG_256) ? SLJIT_SIMD_MEM_ALIGNED_256 : SLJIT_SIMD_MEM_ALIGNED_128;
297 sljit_emit_simd_mov(compiler, reg_type | value, SLJIT_FR0, SLJIT_MEM1(STR_PTR), 0);
298 
299 for (i = 0; i < 4; i++)
300   fast_forward_char_pair_sse2_compare(compiler, compare_type, reg_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
301 
302 sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_FR0, TMP1, 0);
303 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
304 OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);
305 
306 quit = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);
307 
308 OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
309 
310 /* Second part (aligned) */
311 start = LABEL();
312 
313 value = (reg_type == SLJIT_SIMD_REG_256) ? 32 : 16;
314 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, value);
315 
316 partial_quit[1] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
317 if (common->mode == PCRE2_JIT_COMPLETE)
318   add_jump(compiler, &common->failed_match, partial_quit[1]);
319 
320 value = (reg_type == SLJIT_SIMD_REG_256) ? SLJIT_SIMD_MEM_ALIGNED_256 : SLJIT_SIMD_MEM_ALIGNED_128;
321 sljit_emit_simd_mov(compiler, reg_type | value, SLJIT_FR0, SLJIT_MEM1(STR_PTR), 0);
322 for (i = 0; i < 4; i++)
323   fast_forward_char_pair_sse2_compare(compiler, compare_type, reg_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
324 
325 sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_FR0, TMP1, 0);
326 CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);
327 
328 JUMPHERE(quit);
329 
330 SLJIT_ASSERT(tmp1_reg_ind < 8);
331 /* BSF r32, r/m32 */
332 instruction[0] = 0x0f;
333 instruction[1] = 0xbc;
334 instruction[2] = 0xc0 | (tmp1_reg_ind << 3) | tmp1_reg_ind;
335 sljit_emit_op_custom(compiler, instruction, 3);
336 
337 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
338 
339 if (common->mode != PCRE2_JIT_COMPLETE)
340   {
341   JUMPHERE(partial_quit[0]);
342   JUMPHERE(partial_quit[1]);
343   OP2U(SLJIT_SUB | SLJIT_SET_GREATER, STR_PTR, 0, STR_END, 0);
344   SELECT(SLJIT_GREATER, STR_PTR, STR_END, 0, STR_PTR);
345   }
346 else
347   add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
348 
349 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
350 if (common->utf && offset > 0)
351   {
352   SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE);
353 
354   OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offset));
355 
356   quit = jump_if_utf_char_start(compiler, TMP1);
357 
358   OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
359   add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
360   OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);
361   JUMPTO(SLJIT_JUMP, restart);
362 
363   JUMPHERE(quit);
364   }
365 #endif
366 }
367 
368 #define JIT_HAS_FAST_REQUESTED_CHAR_SIMD (sljit_has_cpu_feature(SLJIT_HAS_SIMD))
369 
fast_requested_char_simd(compiler_common * common,PCRE2_UCHAR char1,PCRE2_UCHAR char2)370 static jump_list *fast_requested_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2)
371 {
372 DEFINE_COMPILER;
373 sljit_u8 instruction[8];
374 /* The AVX2 code path is currently disabled. */
375 /* sljit_s32 reg_type = sljit_has_cpu_feature(SLJIT_HAS_AVX2) ? SLJIT_SIMD_REG_256 : SLJIT_SIMD_REG_128; */
376 sljit_s32 reg_type = SLJIT_SIMD_REG_128;
377 sljit_s32 value;
378 struct sljit_label *start;
379 struct sljit_jump *quit;
380 jump_list *not_found = NULL;
381 vector_compare_type compare_type = vector_compare_match1;
382 sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
383 sljit_s32 data_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR0);
384 sljit_s32 cmp1_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR1);
385 sljit_s32 cmp2_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR2);
386 sljit_s32 tmp_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR3);
387 sljit_u32 bit = 0;
388 int i;
389 
390 if (char1 != char2)
391   {
392   bit = char1 ^ char2;
393   compare_type = vector_compare_match1i;
394 
395   if (!is_powerof2(bit))
396     {
397     bit = 0;
398     compare_type = vector_compare_match2;
399     }
400   }
401 
402 add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
403 OP1(SLJIT_MOV, TMP2, 0, TMP1, 0);
404 OP1(SLJIT_MOV, TMP3, 0, STR_PTR, 0);
405 
406 /* First part (unaligned start) */
407 
408 value = SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_32 | SLJIT_SIMD_LANE_ZERO;
409 sljit_emit_simd_lane_mov(compiler, value, SLJIT_FR1, 0, SLJIT_IMM, character_to_int32(char1 | bit));
410 
411 if (char1 != char2)
412   sljit_emit_simd_lane_mov(compiler, value, SLJIT_FR2, 0, SLJIT_IMM, character_to_int32(bit != 0 ? bit : char2));
413 
414 OP1(SLJIT_MOV, STR_PTR, 0, TMP2, 0);
415 
416 sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_FR1, SLJIT_FR1, 0);
417 
418 if (char1 != char2)
419   sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_FR2, SLJIT_FR2, 0);
420 
421 value = (reg_type == SLJIT_SIMD_REG_256) ? 0x1f : 0xf;
422 OP2(SLJIT_AND, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, ~value);
423 OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, value);
424 
425 value = (reg_type == SLJIT_SIMD_REG_256) ? SLJIT_SIMD_MEM_ALIGNED_256 : SLJIT_SIMD_MEM_ALIGNED_128;
426 sljit_emit_simd_mov(compiler, reg_type | value, SLJIT_FR0, SLJIT_MEM1(STR_PTR), 0);
427 
428 for (i = 0; i < 4; i++)
429   fast_forward_char_pair_sse2_compare(compiler, compare_type, reg_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
430 
431 sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_FR0, TMP1, 0);
432 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
433 OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);
434 
435 quit = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);
436 
437 OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
438 
439 /* Second part (aligned) */
440 start = LABEL();
441 
442 value = (reg_type == SLJIT_SIMD_REG_256) ? 32 : 16;
443 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, value);
444 
445 add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
446 
447 value = (reg_type == SLJIT_SIMD_REG_256) ? SLJIT_SIMD_MEM_ALIGNED_256 : SLJIT_SIMD_MEM_ALIGNED_128;
448 sljit_emit_simd_mov(compiler, reg_type | value, SLJIT_FR0, SLJIT_MEM1(STR_PTR), 0);
449 
450 for (i = 0; i < 4; i++)
451   fast_forward_char_pair_sse2_compare(compiler, compare_type, reg_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
452 
453 sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_FR0, TMP1, 0);
454 CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);
455 
456 JUMPHERE(quit);
457 
458 SLJIT_ASSERT(tmp1_reg_ind < 8);
459 /* BSF r32, r/m32 */
460 instruction[0] = 0x0f;
461 instruction[1] = 0xbc;
462 instruction[2] = 0xc0 | (tmp1_reg_ind << 3) | tmp1_reg_ind;
463 sljit_emit_op_custom(compiler, instruction, 3);
464 
465 OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, STR_PTR, 0);
466 add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
467 
468 OP1(SLJIT_MOV, STR_PTR, 0, TMP3, 0);
469 return not_found;
470 }
471 
472 #ifndef _WIN64
473 
474 #define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD (sljit_has_cpu_feature(SLJIT_HAS_SIMD))
475 
fast_forward_char_pair_simd(compiler_common * common,sljit_s32 offs1,PCRE2_UCHAR char1a,PCRE2_UCHAR char1b,sljit_s32 offs2,PCRE2_UCHAR char2a,PCRE2_UCHAR char2b)476 static void fast_forward_char_pair_simd(compiler_common *common, sljit_s32 offs1,
477   PCRE2_UCHAR char1a, PCRE2_UCHAR char1b, sljit_s32 offs2, PCRE2_UCHAR char2a, PCRE2_UCHAR char2b)
478 {
479 DEFINE_COMPILER;
480 sljit_u8 instruction[8];
481 /* The AVX2 code path is currently disabled. */
482 /* sljit_s32 reg_type = sljit_has_cpu_feature(SLJIT_HAS_AVX2) ? SLJIT_SIMD_REG_256 : SLJIT_SIMD_REG_128; */
483 sljit_s32 reg_type = SLJIT_SIMD_REG_128;
484 sljit_s32 value;
485 vector_compare_type compare1_type = vector_compare_match1;
486 vector_compare_type compare2_type = vector_compare_match1;
487 sljit_u32 bit1 = 0;
488 sljit_u32 bit2 = 0;
489 sljit_u32 diff = IN_UCHARS(offs1 - offs2);
490 sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
491 sljit_s32 data1_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR0);
492 sljit_s32 data2_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR1);
493 sljit_s32 cmp1a_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR2);
494 sljit_s32 cmp2a_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR3);
495 sljit_s32 cmp1b_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR4);
496 sljit_s32 cmp2b_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR5);
497 sljit_s32 tmp1_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR6);
498 sljit_s32 tmp2_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_TMP_FR0);
499 struct sljit_label *start;
500 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
501 struct sljit_label *restart;
502 #endif
503 struct sljit_jump *jump[2];
504 int i;
505 
506 SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE && offs1 > offs2 && offs2 >= 0);
507 SLJIT_ASSERT(diff <= (unsigned)IN_UCHARS(max_fast_forward_char_pair_offset()));
508 
509 /* Initialize. */
510 if (common->match_end_ptr != 0)
511   {
512   OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr);
513   OP1(SLJIT_MOV, TMP3, 0, STR_END, 0);
514   OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1));
515 
516   OP2U(SLJIT_SUB | SLJIT_SET_LESS, TMP1, 0, STR_END, 0);
517   SELECT(SLJIT_LESS, STR_END, TMP1, 0, STR_END);
518   }
519 
520 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));
521 add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
522 
523 if (char1a == char1b)
524   OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1a));
525 else
526   {
527   bit1 = char1a ^ char1b;
528   if (is_powerof2(bit1))
529     {
530     compare1_type = vector_compare_match1i;
531     OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1a | bit1));
532     OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(bit1));
533     }
534   else
535     {
536     compare1_type = vector_compare_match2;
537     bit1 = 0;
538     OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1a));
539     OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(char1b));
540     }
541   }
542 
543 value = SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_32 | SLJIT_SIMD_LANE_ZERO;
544 sljit_emit_simd_lane_mov(compiler, value, SLJIT_FR2, 0, TMP1, 0);
545 
546 if (char1a != char1b)
547   sljit_emit_simd_lane_mov(compiler, value, SLJIT_FR4, 0, TMP2, 0);
548 
549 if (char2a == char2b)
550   OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char2a));
551 else
552   {
553   bit2 = char2a ^ char2b;
554   if (is_powerof2(bit2))
555     {
556     compare2_type = vector_compare_match1i;
557     OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char2a | bit2));
558     OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(bit2));
559     }
560   else
561     {
562     compare2_type = vector_compare_match2;
563     bit2 = 0;
564     OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char2a));
565     OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(char2b));
566     }
567   }
568 
569 sljit_emit_simd_lane_mov(compiler, value, SLJIT_FR3, 0, TMP1, 0);
570 
571 if (char2a != char2b)
572   sljit_emit_simd_lane_mov(compiler, value, SLJIT_FR5, 0, TMP2, 0);
573 
574 sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_FR2, SLJIT_FR2, 0);
575 if (char1a != char1b)
576   sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_FR4, SLJIT_FR4, 0);
577 
578 sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_FR3, SLJIT_FR3, 0);
579 if (char2a != char2b)
580   sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_FR5, SLJIT_FR5, 0);
581 
582 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
583 restart = LABEL();
584 #endif
585 
586 OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, diff);
587 OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);
588 value = (reg_type == SLJIT_SIMD_REG_256) ? ~0x1f : ~0xf;
589 OP2(SLJIT_AND, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, value);
590 
591 value = (reg_type == SLJIT_SIMD_REG_256) ? SLJIT_SIMD_MEM_ALIGNED_256 : SLJIT_SIMD_MEM_ALIGNED_128;
592 sljit_emit_simd_mov(compiler, reg_type | value, SLJIT_FR0, SLJIT_MEM1(STR_PTR), 0);
593 
594 jump[0] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_PTR, 0);
595 
596 sljit_emit_simd_mov(compiler, reg_type, SLJIT_FR1, SLJIT_MEM1(STR_PTR), -(sljit_sw)diff);
597 jump[1] = JUMP(SLJIT_JUMP);
598 
599 JUMPHERE(jump[0]);
600 
601 if (reg_type == SLJIT_SIMD_REG_256)
602   {
603   if (diff != 16)
604     {
605     /* PSLLDQ ymm1, ymm2, imm8 */
606     instruction[0] = 0xc5;
607     instruction[1] = (sljit_u8)(0xf9 ^ (data2_ind << 3));
608     instruction[2] = 0x73;
609     instruction[3] = 0xc0 | (7 << 3) | data1_ind;
610     instruction[4] = diff & 0xf;
611     sljit_emit_op_custom(compiler, instruction, 5);
612     }
613 
614   instruction[0] = 0xc4;
615   instruction[1] = 0xe3;
616   if (diff < 16)
617     {
618     /* VINSERTI128 xmm1, xmm2, xmm3/m128 */
619     /* instruction[0] = 0xc4; */
620     /* instruction[1] = 0xe3; */
621     instruction[2] = (sljit_u8)(0x7d ^ (data2_ind << 3));
622     instruction[3] = 0x38;
623     SLJIT_ASSERT(sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR) <= 7);
624     instruction[4] = 0x40 | (data2_ind << 3) | sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);
625     instruction[5] = (sljit_u8)(16 - diff);
626     instruction[6] = 1;
627     sljit_emit_op_custom(compiler, instruction, 7);
628     }
629   else
630     {
631     /* VPERM2I128 xmm1, xmm2, xmm3/m128 */
632     /* instruction[0] = 0xc4; */
633     /* instruction[1] = 0xe3; */
634     value = (diff == 16) ? data1_ind : data2_ind;
635     instruction[2] = (sljit_u8)(0x7d ^ (value << 3));
636     instruction[3] = 0x46;
637     instruction[4] = 0xc0 | (data2_ind << 3) | value;
638     instruction[5] = 0x08;
639     sljit_emit_op_custom(compiler, instruction, 6);
640     }
641   }
642 else
643   {
644   /* MOVDQA xmm1, xmm2/m128 */
645   instruction[0] = 0x66;
646   instruction[1] = 0x0f;
647   instruction[2] = 0x6f;
648   instruction[3] = 0xc0 | (data2_ind << 3) | data1_ind;
649   sljit_emit_op_custom(compiler, instruction, 4);
650 
651   /* PSLLDQ xmm1, imm8 */
652   /* instruction[0] = 0x66; */
653   /* instruction[1] = 0x0f; */
654   instruction[2] = 0x73;
655   instruction[3] = 0xc0 | (7 << 3) | data2_ind;
656   instruction[4] = diff;
657   sljit_emit_op_custom(compiler, instruction, 5);
658   }
659 
660 JUMPHERE(jump[1]);
661 
662 value = (reg_type == SLJIT_SIMD_REG_256) ? 0x1f : 0xf;
663 OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, value);
664 
665 for (i = 0; i < 4; i++)
666   {
667   fast_forward_char_pair_sse2_compare(compiler, compare2_type, reg_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp2_ind);
668   fast_forward_char_pair_sse2_compare(compiler, compare1_type, reg_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind);
669   }
670 
671 sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_AND | reg_type, SLJIT_FR0, SLJIT_FR0, SLJIT_FR1);
672 sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_FR0, TMP1, 0);
673 
674 /* Ignore matches before the first STR_PTR. */
675 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
676 OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);
677 
678 jump[0] = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);
679 
680 OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
681 
682 /* Main loop. */
683 start = LABEL();
684 
685 value = (reg_type == SLJIT_SIMD_REG_256) ? 32 : 16;
686 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, value);
687 add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
688 
689 value = (reg_type == SLJIT_SIMD_REG_256) ? SLJIT_SIMD_MEM_ALIGNED_256 : SLJIT_SIMD_MEM_ALIGNED_128;
690 sljit_emit_simd_mov(compiler, reg_type | value, SLJIT_FR0, SLJIT_MEM1(STR_PTR), 0);
691 sljit_emit_simd_mov(compiler, reg_type, SLJIT_FR1, SLJIT_MEM1(STR_PTR), -(sljit_sw)diff);
692 
693 for (i = 0; i < 4; i++)
694   {
695   fast_forward_char_pair_sse2_compare(compiler, compare1_type, reg_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp2_ind);
696   fast_forward_char_pair_sse2_compare(compiler, compare2_type, reg_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp1_ind);
697   }
698 
699 sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_AND | reg_type, SLJIT_FR0, SLJIT_FR0, SLJIT_FR1);
700 sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_FR0, TMP1, 0);
701 
702 CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);
703 
704 JUMPHERE(jump[0]);
705 
706 SLJIT_ASSERT(tmp1_reg_ind < 8);
707 /* BSF r32, r/m32 */
708 instruction[0] = 0x0f;
709 instruction[1] = 0xbc;
710 instruction[2] = 0xc0 | (tmp1_reg_ind << 3) | tmp1_reg_ind;
711 sljit_emit_op_custom(compiler, instruction, 3);
712 
713 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
714 
715 add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
716 
717 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
718 if (common->utf)
719   {
720   OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offs1));
721 
722   jump[0] = jump_if_utf_char_start(compiler, TMP1);
723 
724   OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
725   CMPTO(SLJIT_LESS, STR_PTR, 0, STR_END, 0, restart);
726 
727   add_jump(compiler, &common->failed_match, JUMP(SLJIT_JUMP));
728 
729   JUMPHERE(jump[0]);
730   }
731 #endif
732 
733 OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));
734 
735 if (common->match_end_ptr != 0)
736   OP1(SLJIT_MOV, STR_END, 0, TMP3, 0);
737 }
738 
739 #endif /* !_WIN64 */
740 
741 #undef SIMD_COMPARE_TYPE_INDEX
742 
743 #endif /* SLJIT_CONFIG_X86 */
744 
745 #if (defined SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64 && (defined __ARM_NEON || defined __ARM_NEON__))
746 
747 #include <arm_neon.h>
748 
/* Overlays four unsigned chars on an unsigned int, so that a few character
values can be handed over as one integer. */
typedef union {
  unsigned int x;
  struct {
    unsigned char c1;
    unsigned char c2;
    unsigned char c3;
    unsigned char c4;
  } c;
} int_char;
753 
754 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
utf_continue(PCRE2_SPTR s)755 static SLJIT_INLINE int utf_continue(PCRE2_SPTR s)
756 {
757 #if PCRE2_CODE_UNIT_WIDTH == 8
758 return (*s & 0xc0) == 0x80;
759 #elif PCRE2_CODE_UNIT_WIDTH == 16
760 return (*s & 0xfc00) == 0xdc00;
761 #else
762 #error "Unknown code width"
763 #endif
764 }
765 #endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 */
766 
/* Code unit width specific aliases of the NEON vector type and intrinsics,
presumably for the pcre2_jit_neon_inc.h inclusions which follow in this file.

  vect_t         128 bit vector of code units
  VECTOR_FACTOR  number of code units in one vector
  quad_word      a vector sized buffer, readable as code units (mem) or as
                 two 64 bit halves (dw) */
#if PCRE2_CODE_UNIT_WIDTH == 8

# define vect_t uint8x16_t
# define VECTOR_FACTOR 16
/* Create, load, store. */
# define VDUPQ vdupq_n_u8
# define VLD1Q(X) vld1q_u8((sljit_u8 *)(X))
# define VST1Q vst1q_u8
/* Compare, combine, extract. */
# define VCEQQ vceqq_u8
# define VANDQ vandq_u8
# define VORRQ vorrq_u8
# define VEXTQ vextq_u8

typedef union {
  uint8_t mem[16];
  uint64_t dw[2];
} quad_word;

#elif PCRE2_CODE_UNIT_WIDTH == 16

# define vect_t uint16x8_t
# define VECTOR_FACTOR 8
/* Create, load, store. */
# define VDUPQ vdupq_n_u16
# define VLD1Q(X) vld1q_u16((sljit_u16 *)(X))
# define VST1Q vst1q_u16
/* Compare, combine, extract. */
# define VCEQQ vceqq_u16
# define VANDQ vandq_u16
# define VORRQ vorrq_u16
# define VEXTQ vextq_u16

typedef union {
  uint16_t mem[8];
  uint64_t dw[2];
} quad_word;

#else

# define vect_t uint32x4_t
# define VECTOR_FACTOR 4
/* Create, load, store. */
# define VDUPQ vdupq_n_u32
# define VLD1Q(X) vld1q_u32((sljit_u32 *)(X))
# define VST1Q vst1q_u32
/* Compare, combine, extract. */
# define VCEQQ vceqq_u32
# define VANDQ vandq_u32
# define VORRQ vorrq_u32
# define VEXTQ vextq_u32

typedef union {
  uint32_t mem[4];
  uint64_t dw[2];
} quad_word;

#endif
810 
811 #define FFCS
812 #include "pcre2_jit_neon_inc.h"
813 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
814 # define FF_UTF
815 # include "pcre2_jit_neon_inc.h"
816 # undef FF_UTF
817 #endif
818 #undef FFCS
819 
820 #define FFCS_2
821 #include "pcre2_jit_neon_inc.h"
822 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
823 # define FF_UTF
824 # include "pcre2_jit_neon_inc.h"
825 # undef FF_UTF
826 #endif
827 #undef FFCS_2
828 
829 #define FFCS_MASK
830 #include "pcre2_jit_neon_inc.h"
831 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
832 # define FF_UTF
833 # include "pcre2_jit_neon_inc.h"
834 # undef FF_UTF
835 #endif
836 #undef FFCS_MASK
837 
838 #define JIT_HAS_FAST_FORWARD_CHAR_SIMD 1
839 
/* Emits the four-argument call to one NEON search helper. */
#define FFCS_EMIT_CALL(helper) \
  sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W), \
                   SLJIT_IMM, SLJIT_FUNC_ADDR(helper))

/* Emits JIT code that moves STR_PTR forward to the next occurrence of char1
   or char2 by calling a NEON helper routine.

   common  compiler state (sljit compiler, matching mode, utf flag)
   char1   first code unit to look for
   char2   second code unit to look for; equal to char1 when only one
           character is wanted
   offset  loaded into SLJIT_R2 for the helper; when it is positive and
           common->utf is set, the *_utf helper variant is called

   Helper arguments as set up here: SLJIT_R0 = STR_END, SLJIT_R1 = address of
   the LOCALS0 slot (which holds the saved STR_PTR), SLJIT_R2 = offset,
   SLJIT_R4 = the packed characters. A zero return value is treated as
   "not found". */
static void fast_forward_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_s32 offset)
{
DEFINE_COMPILER;
int_char packed;
struct sljit_jump *not_found, *found;
enum { variant_single, variant_mask, variant_two } variant;

/* Decide which helper to use and pack its character argument. No code is
   emitted by this part. */
if (char1 == char2)
  {
  variant = variant_single;
  packed.c.c1 = char1;
  packed.c.c2 = char2;
  }
else
  {
  PCRE2_UCHAR mask = char1 ^ char2;

  if (is_powerof2(mask))
    {
    /* The characters differ in one bit only: pass (char1 | mask) together
       with the mask itself. */
    variant = variant_mask;
    packed.c.c1 = char1 | mask;
    packed.c.c2 = mask;
    }
  else
    {
    variant = variant_two;
    packed.c.c1 = char1;
    packed.c.c2 = char2;
    }
  }

/* Spill STR_PTR and TMP3 around the call. */
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS0, STR_PTR, 0);
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS1, TMP3, 0);

/* Load the helper arguments. */
OP1(SLJIT_MOV, SLJIT_R0, 0, STR_END, 0);
GET_LOCAL_BASE(SLJIT_R1, 0, LOCALS0);
OP1(SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, offset);
OP1(SLJIT_MOV, SLJIT_R4, 0, SLJIT_IMM, packed.x);

switch (variant)
  {
  case variant_single:
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
  if (common->utf && offset > 0)
    FFCS_EMIT_CALL(ffcs_utf);
  else
#endif
    FFCS_EMIT_CALL(ffcs);
  break;

  case variant_mask:
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
  if (common->utf && offset > 0)
    FFCS_EMIT_CALL(ffcs_mask_utf);
  else
#endif
    FFCS_EMIT_CALL(ffcs_mask);
  break;

  case variant_two:
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
  if (common->utf && offset > 0)
    FFCS_EMIT_CALL(ffcs_2_utf);
  else
#endif
    FFCS_EMIT_CALL(ffcs_2);
  break;
  }

/* Reload the spilled registers. */
OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), LOCALS0);
OP1(SLJIT_MOV, TMP3, 0, SLJIT_MEM1(SLJIT_SP), LOCALS1);

/* A zero result means the helper found nothing. */
not_found = CMP(SLJIT_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0);

if (common->mode == PCRE2_JIT_COMPLETE)
  {
  /* Complete matching: "not found" fails the match outright. */
  add_jump(compiler, &common->failed_match, not_found);
  OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_RETURN_REG, 0);
  return;
  }

/* Partial matching: take the helper's result as the new STR_PTR. When the
   result was zero, fall into the STR_END fix-up below instead of failing. */
OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_RETURN_REG, 0);
found = CMP(SLJIT_NOT_ZERO, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0);
JUMPHERE(not_found);
OP2U(SLJIT_SUB | SLJIT_SET_GREATER, STR_PTR, 0, STR_END, 0);
SELECT(SLJIT_GREATER, STR_PTR, STR_END, 0, STR_PTR);
JUMPHERE(found);
}

#undef FFCS_EMIT_CALL
932 
/* Comparison modes understood by fast_forward_char_pair_compare():
     compare_match1   data == cmp1
     compare_match1i  (data | cmp2) == cmp1
     compare_match2   data == cmp1, or data == cmp2 */
typedef enum {
  compare_match1 = 0,
  compare_match1i = 1,
  compare_match2 = 2
} compare_type;
938 
/* Compares every lane of dst against the pattern vector(s) and returns the
   lane-wise result of the comparison.

   ctype  comparison mode
   dst    the data to test
   cmp1   first comparison vector
   cmp2   second comparison vector: OR-ed into the data for compare_match1i,
          used as the alternative value for compare_match2, and ignored for
          compare_match1 */
static inline vect_t fast_forward_char_pair_compare(compare_type ctype, vect_t dst, vect_t cmp1, vect_t cmp2)
{
switch (ctype)
  {
  case compare_match2:
  /* A lane matches when it equals either comparison value. */
  return VORRQ(VCEQQ(dst, cmp1), VCEQQ(dst, cmp2));

  case compare_match1i:
  /* Merge the cmp2 bits into the data before the equality test. */
  return VCEQQ(VORRQ(dst, cmp2), cmp1);

  default:
  return VCEQQ(dst, cmp1);
  }
}
955 
max_fast_forward_char_pair_offset(void)956 static SLJIT_INLINE sljit_u32 max_fast_forward_char_pair_offset(void)
957 {
958 #if PCRE2_CODE_UNIT_WIDTH == 8
959 return 15;
960 #elif PCRE2_CODE_UNIT_WIDTH == 16
961 return 7;
962 #elif PCRE2_CODE_UNIT_WIDTH == 32
963 return 3;
964 #else
965 #error "Unsupported unit width"
966 #endif
967 }
968 
969 /* ARM doesn't have a shift left across lanes. */
shift_left_n_lanes(vect_t a,sljit_u8 n)970 static SLJIT_INLINE vect_t shift_left_n_lanes(vect_t a, sljit_u8 n)
971 {
972 vect_t zero = VDUPQ(0);
973 SLJIT_ASSERT(0 < n && n < VECTOR_FACTOR);
974 /* VEXTQ takes an immediate as last argument. */
975 #define C(X) case X: return VEXTQ(zero, a, VECTOR_FACTOR - X);
976 switch (n)
977   {
978   C(1); C(2); C(3);
979 #if PCRE2_CODE_UNIT_WIDTH != 32
980   C(4); C(5); C(6); C(7);
981 # if PCRE2_CODE_UNIT_WIDTH != 16
982   C(8); C(9); C(10); C(11); C(12); C(13); C(14); C(15);
983 # endif
984 #endif
985   default:
986     /* Based on the ASSERT(0 < n && n < VECTOR_FACTOR) above, this won't
987        happen. The return is still here for compilers to not warn. */
988     return a;
989   }
990 }
991 
992 #define FFCPS
993 #define FFCPS_DIFF1
994 #define FFCPS_CHAR1A2A
995 
996 #define FFCPS_0
997 #include "pcre2_jit_neon_inc.h"
998 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
999 # define FF_UTF
1000 # include "pcre2_jit_neon_inc.h"
1001 # undef FF_UTF
1002 #endif
1003 #undef FFCPS_0
1004 
1005 #undef FFCPS_CHAR1A2A
1006 
1007 #define FFCPS_1
1008 #include "pcre2_jit_neon_inc.h"
1009 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1010 # define FF_UTF
1011 # include "pcre2_jit_neon_inc.h"
1012 # undef FF_UTF
1013 #endif
1014 #undef FFCPS_1
1015 
1016 #undef FFCPS_DIFF1
1017 
1018 #define FFCPS_DEFAULT
1019 #include "pcre2_jit_neon_inc.h"
1020 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1021 # define FF_UTF
1022 # include "pcre2_jit_neon_inc.h"
1023 # undef FF_UTF
1024 #endif
1025 #undef FFCPS
1026 
1027 #define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD 1
1028 
/* Emits the four-argument call to one NEON pair-search helper. */
#define FFCPS_EMIT_CALL(helper) \
  sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W), \
                   SLJIT_IMM, SLJIT_FUNC_ADDR(helper))

/* Emits JIT code that moves STR_PTR forward to the next position where a
   character pair matches, by calling a NEON helper routine.

   common          compiler state; must be in PCRE2_JIT_COMPLETE mode
   offs1           offset of the first character; must be greater than offs2
   char1a, char1b  the two accepted values of the first character
   offs2           offset of the second character
   char2a, char2b  the two accepted values of the second character

   Helper arguments as set up here: SLJIT_R0 = end of the search range,
   SLJIT_R1 = address of the LOCALS0 slot (which holds the saved STR_PTR),
   SLJIT_R2 = offs1, SLJIT_R3 = offs2, SLJIT_R4 = the four packed
   characters. A zero return value jumps to common->failed_match. */
static void fast_forward_char_pair_simd(compiler_common *common, sljit_s32 offs1,
  PCRE2_UCHAR char1a, PCRE2_UCHAR char1b, sljit_s32 offs2, PCRE2_UCHAR char2a, PCRE2_UCHAR char2b)
{
DEFINE_COMPILER;
sljit_u32 diff = IN_UCHARS(offs1 - offs2);
struct sljit_jump *no_match;
int_char packed;
enum { variant_0, variant_1, variant_default } variant;

SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE && offs1 > offs2);
SLJIT_ASSERT(diff <= IN_UCHARS(max_fast_forward_char_pair_offset()));
SLJIT_ASSERT(compiler->scratches == 5);

/* Choose the helper: a distance of exactly one selects a specialised
   routine, with a further special case when neither character has an
   alternative value. No code is emitted by this part. */
if (diff != 1)
  variant = variant_default;
else if (char1a == char1b && char2a == char2b)
  variant = variant_0;
else
  variant = variant_1;

/* Spill STR_PTR around the call. */
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS0, STR_PTR, 0);

/* First argument: the end of the search range. */
if (common->match_end_ptr != 0)
  {
  /* Use the stored value plus (offs1 + 1) code units, but never more than
     STR_END. */
  OP1(SLJIT_MOV, SLJIT_R0, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr);
  OP2(SLJIT_ADD, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1));

  OP2U(SLJIT_SUB | SLJIT_SET_LESS, STR_END, 0, SLJIT_R0, 0);
  SELECT(SLJIT_LESS, SLJIT_R0, STR_END, 0, SLJIT_R0);
  }
else
  OP1(SLJIT_MOV, SLJIT_R0, 0, STR_END, 0);

/* Remaining arguments. */
GET_LOCAL_BASE(SLJIT_R1, 0, LOCALS0);
OP1(SLJIT_MOV_S32, SLJIT_R2, 0, SLJIT_IMM, offs1);
OP1(SLJIT_MOV_S32, SLJIT_R3, 0, SLJIT_IMM, offs2);
packed.c.c1 = char1a;
packed.c.c2 = char1b;
packed.c.c3 = char2a;
packed.c.c4 = char2b;
OP1(SLJIT_MOV_U32, SLJIT_R4, 0, SLJIT_IMM, packed.x);

switch (variant)
  {
  case variant_0:
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
  if (common->utf)
    FFCPS_EMIT_CALL(ffcps_0_utf);
  else
#endif
    FFCPS_EMIT_CALL(ffcps_0);
  break;

  case variant_1:
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
  if (common->utf)
    FFCPS_EMIT_CALL(ffcps_1_utf);
  else
#endif
    FFCPS_EMIT_CALL(ffcps_1);
  break;

  case variant_default:
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
  if (common->utf)
    FFCPS_EMIT_CALL(ffcps_default_utf);
  else
#endif
    FFCPS_EMIT_CALL(ffcps_default);
  break;
  }

/* Reload the spilled STR_PTR. */
OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), LOCALS0);

/* A zero result means the helper found nothing: fail the match. */
no_match = CMP(SLJIT_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0);
add_jump(compiler, &common->failed_match, no_match);

/* Otherwise the result is the new STR_PTR. */
OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_RETURN_REG, 0);

/* NOTE(review): this jump was already queued on common->failed_match above,
   so binding it here as well looks redundant - confirm before removing. */
JUMPHERE(no_match);
}

#undef FFCPS_EMIT_CALL
1107 
1108 #endif /* SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64 */
1109 
1110 #if (defined SLJIT_CONFIG_S390X && SLJIT_CONFIG_S390X)
1111 
1112 #if PCRE2_CODE_UNIT_WIDTH == 8
1113 #define VECTOR_ELEMENT_SIZE 0
1114 #elif PCRE2_CODE_UNIT_WIDTH == 16
1115 #define VECTOR_ELEMENT_SIZE 1
1116 #elif PCRE2_CODE_UNIT_WIDTH == 32
1117 #define VECTOR_ELEMENT_SIZE 2
1118 #else
1119 #error "Unsupported unit width"
1120 #endif
1121 
/* Emits one 6-byte vector load instruction as a custom sljit operation.

   compiler   the sljit compiler that receives the instruction
   vlbb       selects the low opcode byte: 0x07 when TRUE, 0x06 when FALSE
              (presumably "load to block boundary" versus a plain vector
              load - confirm against the instruction set reference)
   dst_vreg   vector register field of the destination
   base_reg   machine index of the base address register
   index_reg  machine index of the index register; callers pass 0 when no
              index register is used */
static void load_from_mem_vector(struct sljit_compiler *compiler, BOOL vlbb, sljit_s32 dst_vreg,
  sljit_s32 base_reg, sljit_s32 index_reg)
{
sljit_u16 insn[3];
const int opcode_low = vlbb ? 0x07 : 0x06;

insn[0] = (sljit_u16)(0xe700 | (dst_vreg << 4) | index_reg);
insn[1] = (sljit_u16)(base_reg << 12);
insn[2] = (sljit_u16)((0x8 << 8) | opcode_low);

sljit_emit_op_custom(compiler, insn, 6);
}
1133 
1134 #if PCRE2_CODE_UNIT_WIDTH == 32
1135 
/* Emits code that fills every element of a vector register with chr. The
   work is split into two steps so that callers can interleave the steps of
   several registers.

   compiler         the sljit compiler that receives the instructions
   step             0 or 1 (asserted)
   dst_vreg         vector register field of the destination
   chr              the value to replicate
   tmp_general_reg  sljit general register, overwritten when chr does not
                    fit the immediate form

   chr < 0x7fff:  step 0 emits a single VREPI; step 1 emits nothing.
   otherwise:     step 0 moves chr into tmp_general_reg and emits VLVG;
                  step 1 emits VREP on the destination register. */
static void replicate_imm_vector(struct sljit_compiler *compiler, int step, sljit_s32 dst_vreg,
  PCRE2_UCHAR chr, sljit_s32 tmp_general_reg)
{
sljit_u16 insn[3];

SLJIT_ASSERT(step >= 0 && step <= 1);

if (chr < 0x7fff)
  {
  if (step == 0)
    {
    /* VREPI */
    insn[0] = (sljit_u16)(0xe700 | (dst_vreg << 4));
    insn[1] = (sljit_u16)chr;
    insn[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45);
    sljit_emit_op_custom(compiler, insn, 6);
    }
  return;
  }

if (step == 0)
  {
  OP1(SLJIT_MOV, tmp_general_reg, 0, SLJIT_IMM, chr);

  /* VLVG */
  insn[0] = (sljit_u16)(0xe700 | (dst_vreg << 4) | sljit_get_register_index(SLJIT_GP_REGISTER, tmp_general_reg));
  insn[1] = 0;
  insn[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x22);
  }
else
  {
  /* VREP */
  insn[0] = (sljit_u16)(0xe700 | (dst_vreg << 4) | dst_vreg);
  insn[1] = 0;
  insn[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xc << 8) | 0x4d);
  }

sljit_emit_op_custom(compiler, insn, 6);
}
1174 
1175 #endif
1176 
/* Emits one step of the three-step vector comparison sequence. Callers run
   the steps in order 0, 1, 2; depending on the comparison type a step emits
   one instruction or nothing.

   compiler      the sljit compiler that receives the instruction
   compare_type  vector_compare_match1, _match1i or _match2
   step          0, 1 or 2 (asserted)
   dst_ind       vector register holding the data, replaced by the result
   cmp1_ind      vector register holding the first comparison value
   cmp2_ind      vector register holding the second comparison value
   tmp_ind       scratch vector register, used for _match2 only

   Instructions emitted per step:
     match1:   1: VCEQ dst, dst, cmp1
     match1i:  0: VO dst, dst, cmp2     1: VCEQ dst, dst, cmp1
     match2:   0: VCEQ tmp, dst, cmp2   1: VCEQ dst, dst, cmp1
               2: VO dst, dst, tmp */
static void fast_forward_char_pair_sse2_compare(struct sljit_compiler *compiler, vector_compare_type compare_type,
  int step, sljit_s32 dst_ind, sljit_s32 cmp1_ind, sljit_s32 cmp2_ind, sljit_s32 tmp_ind)
{
sljit_u16 insn[3];
sljit_s32 target_ind = dst_ind;
sljit_s32 other_ind;
int is_equality;

SLJIT_ASSERT(step >= 0 && step <= 2);

/* Work out which instruction, if any, belongs to this step. */
if (step == 1)
  {
  other_ind = cmp1_ind;
  is_equality = 1;
  }
else if (compare_type != vector_compare_match2)
  {
  if (step != 0 || compare_type != vector_compare_match1i)
    return;

  other_ind = cmp2_ind;
  is_equality = 0;
  }
else if (step == 0)
  {
  target_ind = tmp_ind;
  other_ind = cmp2_ind;
  is_equality = 1;
  }
else if (step == 2)
  {
  other_ind = tmp_ind;
  is_equality = 0;
  }
else
  return;

insn[0] = (sljit_u16)(0xe700 | (target_ind << 4) | dst_ind);
insn[1] = (sljit_u16)(other_ind << 12);

if (is_equality)
  /* VCEQ */
  insn[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0xf8);
else
  /* VO */
  insn[2] = (sljit_u16)((0xe << 8) | 0x6a);

sljit_emit_op_custom(compiler, insn, 6);
}
1226 
1227 #define JIT_HAS_FAST_FORWARD_CHAR_SIMD 1
1228 
fast_forward_char_simd(compiler_common * common,PCRE2_UCHAR char1,PCRE2_UCHAR char2,sljit_s32 offset)1229 static void fast_forward_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_s32 offset)
1230 {
1231 DEFINE_COMPILER;
1232 sljit_u16 instruction[3];
1233 struct sljit_label *start;
1234 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1235 struct sljit_label *restart;
1236 #endif
1237 struct sljit_jump *quit;
1238 struct sljit_jump *partial_quit[2];
1239 vector_compare_type compare_type = vector_compare_match1;
1240 sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
1241 sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);
1242 sljit_s32 data_ind = 0;
1243 sljit_s32 tmp_ind = 1;
1244 sljit_s32 cmp1_ind = 2;
1245 sljit_s32 cmp2_ind = 3;
1246 sljit_s32 zero_ind = 4;
1247 sljit_u32 bit = 0;
1248 int i;
1249 
1250 SLJIT_UNUSED_ARG(offset);
1251 
1252 if (char1 != char2)
1253   {
1254   bit = char1 ^ char2;
1255   compare_type = vector_compare_match1i;
1256 
1257   if (!is_powerof2(bit))
1258     {
1259     bit = 0;
1260     compare_type = vector_compare_match2;
1261     }
1262   }
1263 
1264 partial_quit[0] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
1265 if (common->mode == PCRE2_JIT_COMPLETE)
1266   add_jump(compiler, &common->failed_match, partial_quit[0]);
1267 
1268 /* First part (unaligned start) */
1269 
1270 OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 16);
1271 
1272 #if PCRE2_CODE_UNIT_WIDTH != 32
1273 
1274 /* VREPI */
1275 instruction[0] = (sljit_u16)(0xe700 | (cmp1_ind << 4));
1276 instruction[1] = (sljit_u16)(char1 | bit);
1277 instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45);
1278 sljit_emit_op_custom(compiler, instruction, 6);
1279 
1280 if (char1 != char2)
1281   {
1282   /* VREPI */
1283   instruction[0] = (sljit_u16)(0xe700 | (cmp2_ind << 4));
1284   instruction[1] = (sljit_u16)(bit != 0 ? bit : char2);
1285   /* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */
1286   sljit_emit_op_custom(compiler, instruction, 6);
1287   }
1288 
1289 #else /* PCRE2_CODE_UNIT_WIDTH == 32 */
1290 
1291 for (int i = 0; i < 2; i++)
1292   {
1293   replicate_imm_vector(compiler, i, cmp1_ind, char1 | bit, TMP1);
1294 
1295   if (char1 != char2)
1296     replicate_imm_vector(compiler, i, cmp2_ind, bit != 0 ? bit : char2, TMP1);
1297   }
1298 
1299 #endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
1300 
1301 if (compare_type == vector_compare_match2)
1302   {
1303   /* VREPI */
1304   instruction[0] = (sljit_u16)(0xe700 | (zero_ind << 4));
1305   instruction[1] = 0;
1306   instruction[2] = (sljit_u16)((0x8 << 8) | 0x45);
1307   sljit_emit_op_custom(compiler, instruction, 6);
1308   }
1309 
1310 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1311 restart = LABEL();
1312 #endif
1313 
1314 load_from_mem_vector(compiler, TRUE, data_ind, str_ptr_reg_ind, 0);
1315 OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, ~15);
1316 
1317 if (compare_type != vector_compare_match2)
1318   {
1319   if (compare_type == vector_compare_match1i)
1320     fast_forward_char_pair_sse2_compare(compiler, compare_type, 0, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1321 
1322   /* VFEE */
1323   instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1324   instruction[1] = (sljit_u16)((cmp1_ind << 12) | (1 << 4));
1325   instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0x80);
1326   sljit_emit_op_custom(compiler, instruction, 6);
1327   }
1328 else
1329   {
1330   for (i = 0; i < 3; i++)
1331     fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1332 
1333   /* VFENE */
1334   instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1335   instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
1336   instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
1337   sljit_emit_op_custom(compiler, instruction, 6);
1338   }
1339 
1340 /* VLGVB */
1341 instruction[0] = (sljit_u16)(0xe700 | (tmp1_reg_ind << 4) | data_ind);
1342 instruction[1] = 7;
1343 instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
1344 sljit_emit_op_custom(compiler, instruction, 6);
1345 
1346 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
1347 quit = CMP(SLJIT_LESS, STR_PTR, 0, TMP2, 0);
1348 
1349 OP2(SLJIT_SUB, STR_PTR, 0, TMP2, 0, SLJIT_IMM, 16);
1350 
1351 /* Second part (aligned) */
1352 start = LABEL();
1353 
1354 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);
1355 
1356 partial_quit[1] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
1357 if (common->mode == PCRE2_JIT_COMPLETE)
1358   add_jump(compiler, &common->failed_match, partial_quit[1]);
1359 
1360 load_from_mem_vector(compiler, TRUE, data_ind, str_ptr_reg_ind, 0);
1361 
1362 if (compare_type != vector_compare_match2)
1363   {
1364   if (compare_type == vector_compare_match1i)
1365     fast_forward_char_pair_sse2_compare(compiler, compare_type, 0, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1366 
1367   /* VFEE */
1368   instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1369   instruction[1] = (sljit_u16)((cmp1_ind << 12) | (1 << 4));
1370   instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0x80);
1371   sljit_emit_op_custom(compiler, instruction, 6);
1372   }
1373 else
1374   {
1375   for (i = 0; i < 3; i++)
1376     fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1377 
1378   /* VFENE */
1379   instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1380   instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
1381   instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
1382   sljit_emit_op_custom(compiler, instruction, 6);
1383   }
1384 
1385 sljit_set_current_flags(compiler, SLJIT_SET_OVERFLOW);
1386 JUMPTO(SLJIT_OVERFLOW, start);
1387 
1388 /* VLGVB */
1389 instruction[0] = (sljit_u16)(0xe700 | (tmp1_reg_ind << 4) | data_ind);
1390 instruction[1] = 7;
1391 instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
1392 sljit_emit_op_custom(compiler, instruction, 6);
1393 
1394 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
1395 
1396 JUMPHERE(quit);
1397 
1398 if (common->mode != PCRE2_JIT_COMPLETE)
1399   {
1400   JUMPHERE(partial_quit[0]);
1401   JUMPHERE(partial_quit[1]);
1402   OP2U(SLJIT_SUB | SLJIT_SET_GREATER, STR_PTR, 0, STR_END, 0);
1403   SELECT(SLJIT_GREATER, STR_PTR, STR_END, 0, STR_PTR);
1404   }
1405 else
1406   add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
1407 
1408 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1409 if (common->utf && offset > 0)
1410   {
1411   SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE);
1412 
1413   OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offset));
1414 
1415   quit = jump_if_utf_char_start(compiler, TMP1);
1416 
1417   OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
1418   add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
1419 
1420   OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 16);
1421   JUMPTO(SLJIT_JUMP, restart);
1422 
1423   JUMPHERE(quit);
1424   }
1425 #endif
1426 }
1427 
1428 #define JIT_HAS_FAST_REQUESTED_CHAR_SIMD 1
1429 
fast_requested_char_simd(compiler_common * common,PCRE2_UCHAR char1,PCRE2_UCHAR char2)1430 static jump_list *fast_requested_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2)
1431 {
1432 DEFINE_COMPILER;
1433 sljit_u16 instruction[3];
1434 struct sljit_label *start;
1435 struct sljit_jump *quit;
1436 jump_list *not_found = NULL;
1437 vector_compare_type compare_type = vector_compare_match1;
1438 sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
1439 sljit_s32 tmp3_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP3);
1440 sljit_s32 data_ind = 0;
1441 sljit_s32 tmp_ind = 1;
1442 sljit_s32 cmp1_ind = 2;
1443 sljit_s32 cmp2_ind = 3;
1444 sljit_s32 zero_ind = 4;
1445 sljit_u32 bit = 0;
1446 int i;
1447 
1448 if (char1 != char2)
1449   {
1450   bit = char1 ^ char2;
1451   compare_type = vector_compare_match1i;
1452 
1453   if (!is_powerof2(bit))
1454     {
1455     bit = 0;
1456     compare_type = vector_compare_match2;
1457     }
1458   }
1459 
1460 add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
1461 
1462 /* First part (unaligned start) */
1463 
1464 OP2(SLJIT_ADD, TMP2, 0, TMP1, 0, SLJIT_IMM, 16);
1465 
1466 #if PCRE2_CODE_UNIT_WIDTH != 32
1467 
1468 /* VREPI */
1469 instruction[0] = (sljit_u16)(0xe700 | (cmp1_ind << 4));
1470 instruction[1] = (sljit_u16)(char1 | bit);
1471 instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45);
1472 sljit_emit_op_custom(compiler, instruction, 6);
1473 
1474 if (char1 != char2)
1475   {
1476   /* VREPI */
1477   instruction[0] = (sljit_u16)(0xe700 | (cmp2_ind << 4));
1478   instruction[1] = (sljit_u16)(bit != 0 ? bit : char2);
1479   /* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */
1480   sljit_emit_op_custom(compiler, instruction, 6);
1481   }
1482 
1483 #else /* PCRE2_CODE_UNIT_WIDTH == 32 */
1484 
1485 for (int i = 0; i < 2; i++)
1486   {
1487   replicate_imm_vector(compiler, i, cmp1_ind, char1 | bit, TMP3);
1488 
1489   if (char1 != char2)
1490     replicate_imm_vector(compiler, i, cmp2_ind, bit != 0 ? bit : char2, TMP3);
1491   }
1492 
1493 #endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
1494 
1495 if (compare_type == vector_compare_match2)
1496   {
1497   /* VREPI */
1498   instruction[0] = (sljit_u16)(0xe700 | (zero_ind << 4));
1499   instruction[1] = 0;
1500   instruction[2] = (sljit_u16)((0x8 << 8) | 0x45);
1501   sljit_emit_op_custom(compiler, instruction, 6);
1502   }
1503 
1504 load_from_mem_vector(compiler, TRUE, data_ind, tmp1_reg_ind, 0);
1505 OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, ~15);
1506 
1507 if (compare_type != vector_compare_match2)
1508   {
1509   if (compare_type == vector_compare_match1i)
1510     fast_forward_char_pair_sse2_compare(compiler, compare_type, 0, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1511 
1512   /* VFEE */
1513   instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1514   instruction[1] = (sljit_u16)((cmp1_ind << 12) | (1 << 4));
1515   instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0x80);
1516   sljit_emit_op_custom(compiler, instruction, 6);
1517   }
1518 else
1519   {
1520   for (i = 0; i < 3; i++)
1521     fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1522 
1523   /* VFENE */
1524   instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1525   instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
1526   instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
1527   sljit_emit_op_custom(compiler, instruction, 6);
1528   }
1529 
1530 /* VLGVB */
1531 instruction[0] = (sljit_u16)(0xe700 | (tmp3_reg_ind << 4) | data_ind);
1532 instruction[1] = 7;
1533 instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
1534 sljit_emit_op_custom(compiler, instruction, 6);
1535 
1536 OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP3, 0);
1537 quit = CMP(SLJIT_LESS, TMP1, 0, TMP2, 0);
1538 
1539 OP2(SLJIT_SUB, TMP1, 0, TMP2, 0, SLJIT_IMM, 16);
1540 
1541 /* Second part (aligned) */
1542 start = LABEL();
1543 
1544 OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 16);
1545 
1546 add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
1547 
1548 load_from_mem_vector(compiler, TRUE, data_ind, tmp1_reg_ind, 0);
1549 
1550 if (compare_type != vector_compare_match2)
1551   {
1552   if (compare_type == vector_compare_match1i)
1553     fast_forward_char_pair_sse2_compare(compiler, compare_type, 0, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1554 
1555   /* VFEE */
1556   instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1557   instruction[1] = (sljit_u16)((cmp1_ind << 12) | (1 << 4));
1558   instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0x80);
1559   sljit_emit_op_custom(compiler, instruction, 6);
1560   }
1561 else
1562   {
1563   for (i = 0; i < 3; i++)
1564     fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1565 
1566   /* VFENE */
1567   instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1568   instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
1569   instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
1570   sljit_emit_op_custom(compiler, instruction, 6);
1571   }
1572 
1573 sljit_set_current_flags(compiler, SLJIT_SET_OVERFLOW);
1574 JUMPTO(SLJIT_OVERFLOW, start);
1575 
1576 /* VLGVB */
1577 instruction[0] = (sljit_u16)(0xe700 | (tmp3_reg_ind << 4) | data_ind);
1578 instruction[1] = 7;
1579 instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
1580 sljit_emit_op_custom(compiler, instruction, 6);
1581 
1582 OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP3, 0);
1583 
1584 JUMPHERE(quit);
1585 add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
1586 
1587 return not_found;
1588 }
1589 
1590 #define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD 1
1591 
fast_forward_char_pair_simd(compiler_common * common,sljit_s32 offs1,PCRE2_UCHAR char1a,PCRE2_UCHAR char1b,sljit_s32 offs2,PCRE2_UCHAR char2a,PCRE2_UCHAR char2b)1592 static void fast_forward_char_pair_simd(compiler_common *common, sljit_s32 offs1,
1593   PCRE2_UCHAR char1a, PCRE2_UCHAR char1b, sljit_s32 offs2, PCRE2_UCHAR char2a, PCRE2_UCHAR char2b)
1594 {
1595 DEFINE_COMPILER;
1596 sljit_u16 instruction[3];
1597 struct sljit_label *start;
1598 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1599 struct sljit_label *restart;
1600 #endif
1601 struct sljit_jump *quit;
1602 struct sljit_jump *jump[2];
1603 vector_compare_type compare1_type = vector_compare_match1;
1604 vector_compare_type compare2_type = vector_compare_match1;
1605 sljit_u32 bit1 = 0;
1606 sljit_u32 bit2 = 0;
1607 sljit_s32 diff = IN_UCHARS(offs2 - offs1);
1608 sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
1609 sljit_s32 tmp2_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP2);
1610 sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);
1611 sljit_s32 data1_ind = 0;
1612 sljit_s32 data2_ind = 1;
1613 sljit_s32 tmp1_ind = 2;
1614 sljit_s32 tmp2_ind = 3;
1615 sljit_s32 cmp1a_ind = 4;
1616 sljit_s32 cmp1b_ind = 5;
1617 sljit_s32 cmp2a_ind = 6;
1618 sljit_s32 cmp2b_ind = 7;
1619 sljit_s32 zero_ind = 8;
1620 int i;
1621 
1622 SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE && offs1 > offs2);
1623 SLJIT_ASSERT(-diff <= (sljit_s32)IN_UCHARS(max_fast_forward_char_pair_offset()));
1624 SLJIT_ASSERT(tmp1_reg_ind != 0 && tmp2_reg_ind != 0);
1625 
1626 if (char1a != char1b)
1627   {
1628   bit1 = char1a ^ char1b;
1629   compare1_type = vector_compare_match1i;
1630 
1631   if (!is_powerof2(bit1))
1632     {
1633     bit1 = 0;
1634     compare1_type = vector_compare_match2;
1635     }
1636   }
1637 
1638 if (char2a != char2b)
1639   {
1640   bit2 = char2a ^ char2b;
1641   compare2_type = vector_compare_match1i;
1642 
1643   if (!is_powerof2(bit2))
1644     {
1645     bit2 = 0;
1646     compare2_type = vector_compare_match2;
1647     }
1648   }
1649 
1650 /* Initialize. */
1651 if (common->match_end_ptr != 0)
1652   {
1653   OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr);
1654   OP1(SLJIT_MOV, TMP3, 0, STR_END, 0);
1655   OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1));
1656 
1657   OP2U(SLJIT_SUB | SLJIT_SET_LESS, TMP1, 0, STR_END, 0);
1658   SELECT(SLJIT_LESS, STR_END, TMP1, 0, STR_END);
1659   }
1660 
1661 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));
1662 add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
1663 OP2(SLJIT_AND, TMP2, 0, STR_PTR, 0, SLJIT_IMM, ~15);
1664 
1665 #if PCRE2_CODE_UNIT_WIDTH != 32
1666 
1667 OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, -diff);
1668 
1669 /* VREPI */
1670 instruction[0] = (sljit_u16)(0xe700 | (cmp1a_ind << 4));
1671 instruction[1] = (sljit_u16)(char1a | bit1);
1672 instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45);
1673 sljit_emit_op_custom(compiler, instruction, 6);
1674 
1675 if (char1a != char1b)
1676   {
1677   /* VREPI */
1678   instruction[0] = (sljit_u16)(0xe700 | (cmp1b_ind << 4));
1679   instruction[1] = (sljit_u16)(bit1 != 0 ? bit1 : char1b);
1680   /* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */
1681   sljit_emit_op_custom(compiler, instruction, 6);
1682   }
1683 
1684 /* VREPI */
1685 instruction[0] = (sljit_u16)(0xe700 | (cmp2a_ind << 4));
1686 instruction[1] = (sljit_u16)(char2a | bit2);
1687 /* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */
1688 sljit_emit_op_custom(compiler, instruction, 6);
1689 
1690 if (char2a != char2b)
1691   {
1692   /* VREPI */
1693   instruction[0] = (sljit_u16)(0xe700 | (cmp2b_ind << 4));
1694   instruction[1] = (sljit_u16)(bit2 != 0 ? bit2 : char2b);
1695   /* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */
1696   sljit_emit_op_custom(compiler, instruction, 6);
1697   }
1698 
1699 #else /* PCRE2_CODE_UNIT_WIDTH == 32 */
1700 
1701 for (int i = 0; i < 2; i++)
1702   {
1703   replicate_imm_vector(compiler, i, cmp1a_ind, char1a | bit1, TMP1);
1704 
1705   if (char1a != char1b)
1706     replicate_imm_vector(compiler, i, cmp1b_ind, bit1 != 0 ? bit1 : char1b, TMP1);
1707 
1708   replicate_imm_vector(compiler, i, cmp2a_ind, char2a | bit2, TMP1);
1709 
1710   if (char2a != char2b)
1711     replicate_imm_vector(compiler, i, cmp2b_ind, bit2 != 0 ? bit2 : char2b, TMP1);
1712   }
1713 
1714 OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, -diff);
1715 
1716 #endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
1717 
1718 /* VREPI */
1719 instruction[0] = (sljit_u16)(0xe700 | (zero_ind << 4));
1720 instruction[1] = 0;
1721 instruction[2] = (sljit_u16)((0x8 << 8) | 0x45);
1722 sljit_emit_op_custom(compiler, instruction, 6);
1723 
1724 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1725 restart = LABEL();
1726 #endif
1727 
1728 jump[0] = CMP(SLJIT_LESS, TMP1, 0, TMP2, 0);
1729 load_from_mem_vector(compiler, TRUE, data2_ind, tmp1_reg_ind, 0);
1730 jump[1] = JUMP(SLJIT_JUMP);
1731 JUMPHERE(jump[0]);
1732 load_from_mem_vector(compiler, FALSE, data2_ind, tmp1_reg_ind, 0);
1733 JUMPHERE(jump[1]);
1734 
1735 load_from_mem_vector(compiler, TRUE, data1_ind, str_ptr_reg_ind, 0);
1736 OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, 16);
1737 
1738 for (i = 0; i < 3; i++)
1739   {
1740   fast_forward_char_pair_sse2_compare(compiler, compare1_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind);
1741   fast_forward_char_pair_sse2_compare(compiler, compare2_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp2_ind);
1742   }
1743 
1744 /* VN */
1745 instruction[0] = (sljit_u16)(0xe700 | (data1_ind << 4) | data1_ind);
1746 instruction[1] = (sljit_u16)(data2_ind << 12);
1747 instruction[2] = (sljit_u16)((0xe << 8) | 0x68);
1748 sljit_emit_op_custom(compiler, instruction, 6);
1749 
1750 /* VFENE */
1751 instruction[0] = (sljit_u16)(0xe700 | (data1_ind << 4) | data1_ind);
1752 instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
1753 instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
1754 sljit_emit_op_custom(compiler, instruction, 6);
1755 
1756 /* VLGVB */
1757 instruction[0] = (sljit_u16)(0xe700 | (tmp1_reg_ind << 4) | data1_ind);
1758 instruction[1] = 7;
1759 instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
1760 sljit_emit_op_custom(compiler, instruction, 6);
1761 
1762 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
1763 quit = CMP(SLJIT_LESS, STR_PTR, 0, TMP2, 0);
1764 
1765 OP2(SLJIT_SUB, STR_PTR, 0, TMP2, 0, SLJIT_IMM, 16);
1766 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, diff);
1767 
1768 /* Main loop. */
1769 start = LABEL();
1770 
1771 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);
1772 add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
1773 
1774 load_from_mem_vector(compiler, FALSE, data1_ind, str_ptr_reg_ind, 0);
1775 load_from_mem_vector(compiler, FALSE, data2_ind, str_ptr_reg_ind, tmp1_reg_ind);
1776 
1777 for (i = 0; i < 3; i++)
1778   {
1779   fast_forward_char_pair_sse2_compare(compiler, compare1_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind);
1780   fast_forward_char_pair_sse2_compare(compiler, compare2_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp2_ind);
1781   }
1782 
1783 /* VN */
1784 instruction[0] = (sljit_u16)(0xe700 | (data1_ind << 4) | data1_ind);
1785 instruction[1] = (sljit_u16)(data2_ind << 12);
1786 instruction[2] = (sljit_u16)((0xe << 8) | 0x68);
1787 sljit_emit_op_custom(compiler, instruction, 6);
1788 
1789 /* VFENE */
1790 instruction[0] = (sljit_u16)(0xe700 | (data1_ind << 4) | data1_ind);
1791 instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
1792 instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
1793 sljit_emit_op_custom(compiler, instruction, 6);
1794 
1795 sljit_set_current_flags(compiler, SLJIT_SET_OVERFLOW);
1796 JUMPTO(SLJIT_OVERFLOW, start);
1797 
1798 /* VLGVB */
1799 instruction[0] = (sljit_u16)(0xe700 | (tmp2_reg_ind << 4) | data1_ind);
1800 instruction[1] = 7;
1801 instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
1802 sljit_emit_op_custom(compiler, instruction, 6);
1803 
1804 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
1805 
1806 JUMPHERE(quit);
1807 
1808 add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
1809 
1810 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1811 if (common->utf)
1812   {
1813   SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE);
1814 
1815   OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offs1));
1816 
1817   quit = jump_if_utf_char_start(compiler, TMP1);
1818 
1819   OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
1820   add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
1821 
1822   /* TMP1 contains diff. */
1823   OP2(SLJIT_AND, TMP2, 0, STR_PTR, 0, SLJIT_IMM, ~15);
1824   OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, -diff);
1825   JUMPTO(SLJIT_JUMP, restart);
1826 
1827   JUMPHERE(quit);
1828   }
1829 #endif
1830 
1831 OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));
1832 
1833 if (common->match_end_ptr != 0)
1834   OP1(SLJIT_MOV, STR_END, 0, TMP3, 0);
1835 }
1836 
1837 #endif /* SLJIT_CONFIG_S390X */
1838 
1839 #if (defined SLJIT_CONFIG_LOONGARCH_64 && SLJIT_CONFIG_LOONGARCH_64)
1840 
#ifdef __linux__
/* Using getauxval(AT_HWCAP) under Linux for detecting whether LSX is available.
HAS_LSX_SUPPORT expands to the getauxval() call itself, so the capability word
is queried every time the macro is evaluated. */
#include <sys/auxv.h>
#define LOONGARCH_HWCAP_LSX  (1 << 4)
#define HAS_LSX_SUPPORT ((getauxval(AT_HWCAP) & LOONGARCH_HWCAP_LSX) != 0)
#else
/* No detection method is provided for other systems: LSX is reported as
unavailable. */
#define HAS_LSX_SUPPORT 0
#endif

/* NOTE(review): as written this declares sljit_u32 as an alias of sljit_ins,
so sljit_ins must already be declared by the including translation unit.
Confirm that this direction (rather than declaring sljit_ins) is intended. */
typedef sljit_ins sljit_u32;
1851 
/* Bit masks of the immediate fields of an LSX instruction word. All three
fields start at bit 10: si12 covers bits 10-21, ui5 bits 10-14 and ui2 bits
10-11. */
#define SI12_IMM_MASK   0x003ffc00
#define UI5_IMM_MASK    0x00007c00
#define UI2_IMM_MASK    0x00000c00

/* Place a register index into its operand field: vd/rd at bit 0, vj/rj at
bit 5 and vk at bit 10. The argument is parenthesized so that any expression
(not only a plain identifier) is converted and shifted as a whole. */
#define VD(vd)      ((sljit_ins)(vd) << 0)
#define VJ(vj)      ((sljit_ins)(vj) << 5)
#define VK(vk)      ((sljit_ins)(vk) << 10)
#define RD_V(rd)    ((sljit_ins)(rd) << 0)
#define RJ_V(rj)    ((sljit_ins)(rj) << 5)

/* Place an immediate into its field; bits which do not fit are masked off. */
#define IMM_SI12(imm)   (((sljit_ins)(imm) << 10) & SI12_IMM_MASK)
#define IMM_UI5(imm)    (((sljit_ins)(imm) << 10) & UI5_IMM_MASK)
#define IMM_UI2(imm)    (((sljit_ins)(imm) << 10) & UI2_IMM_MASK)

/* LSX opcodes. */
#define VLD           0x2c000000
#define VOR_V         0x71268000
#define VAND_V        0x71260000
#define VBSLL_V       0x728e0000
#define VMSKLTZ_B     0x729c4000
#define VPICKVE2GR_WU 0x72f3e000

/* The replicate and compare opcodes depend on the code unit width: the
.B, .H or .W form is selected for 8, 16 or 32 bit code units. */
#if PCRE2_CODE_UNIT_WIDTH == 8
#define VREPLGR2VR  0x729f0000
#define VSEQ        0x70000000
#elif PCRE2_CODE_UNIT_WIDTH == 16
#define VREPLGR2VR  0x729f0400
#define VSEQ        0x70008000
#else
#define VREPLGR2VR  0x729f0800
#define VSEQ        0x70010000
#endif
1884 
/* Emits the LSX instructions which compare the data held in vector register
dst_ind with the prepared comparison vectors and leave the result in dst_ind.

  vector_compare_match1:  one VSEQ against cmp1_ind.
  vector_compare_match1i: dst_ind is OR-ed with cmp2_ind first, then compared
                          with cmp1_ind.
  vector_compare_match2:  the data is compared with cmp1_ind and cmp2_ind
                          separately and the two results are OR-ed; tmp_ind
                          is overwritten by this variant only. */
static void fast_forward_char_pair_lsx_compare(struct sljit_compiler *compiler, vector_compare_type compare_type,
  sljit_s32 dst_ind, sljit_s32 cmp1_ind, sljit_s32 cmp2_ind, sljit_s32 tmp_ind)
{
/* VSEQ.B/H/W vd, vj, vk: compares dst_ind with cmp1_ind, needed by every variant. */
const sljit_ins seq_with_cmp1 = VSEQ | VD(dst_ind) | VJ(dst_ind) | VK(cmp1_ind);

if (compare_type == vector_compare_match2)
  {
  /* VBSLL.V vd, vj, ui5: a byte shift by zero is used to copy the data into tmp_ind. */
  push_inst(compiler, VBSLL_V | VD(tmp_ind) | VJ(dst_ind) | IMM_UI5(0));

  push_inst(compiler, seq_with_cmp1);

  /* VSEQ.B/H/W vd, vj, vk: the copy is compared with the second character. */
  push_inst(compiler, VSEQ | VD(tmp_ind) | VJ(tmp_ind) | VK(cmp2_ind));

  /* VOR.V vd, vj, vk: merge the two results. */
  push_inst(compiler, VOR_V | VD(dst_ind) | VJ(tmp_ind) | VK(dst_ind));
  return;
  }

if (compare_type == vector_compare_match1i)
  {
  /* VOR.V vd, vj, vk: OR the data with cmp2_ind before the compare. */
  push_inst(compiler, VOR_V | VD(dst_ind) | VJ(cmp2_ind) | VK(dst_ind));
  }

push_inst(compiler, seq_with_cmp1);
}
1914 
1915 #define JIT_HAS_FAST_FORWARD_CHAR_SIMD HAS_LSX_SUPPORT
1916 
/* Generates LSX based code which moves STR_PTR forward to the next code unit
equal to char1 or char2.

The first 16 byte block is loaded from STR_PTR rounded down to a 16 byte
boundary and the results belonging to the bytes before the original STR_PTR
are shifted out. The following blocks are scanned by an aligned loop.

When common->mode is PCRE2_JIT_COMPLETE, running past STR_END jumps to
common->failed_match. Otherwise the generated code continues with STR_PTR
limited to STR_END.

In UTF mode with offset > 0 the code unit at -offset is checked with
jump_if_utf_char_start(), and the search is restarted from the next code unit
when that check does not accept it.

TMP1 and TMP2 are overwritten, and vector registers 0-3 are used. */
static void fast_forward_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_s32 offset)
{
DEFINE_COMPILER;
struct sljit_label *aligned_loop;
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
struct sljit_label *rescan;
struct sljit_jump *at_char_start;
#endif
struct sljit_jump *found;
struct sljit_jump *out_of_input[2];
vector_compare_type cmp_kind = vector_compare_match1;
sljit_s32 gpr_tmp1 = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
sljit_s32 gpr_str_ptr = sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);
sljit_s32 vec_data = 0;
sljit_s32 vec_tmp = 1;
sljit_s32 vec_cmp1 = 2;
sljit_s32 vec_cmp2 = 3;
sljit_u32 case_bit = 0;

SLJIT_UNUSED_ARG(offset);

/* Characters which differ in a single bit are handled by one compare. */
if (char1 != char2)
  {
  case_bit = char1 ^ char2;

  if (is_powerof2(case_bit))
    cmp_kind = vector_compare_match1i;
  else
    {
    case_bit = 0;
    cmp_kind = vector_compare_match2;
    }
  }

out_of_input[0] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
if (common->mode == PCRE2_JIT_COMPLETE)
  add_jump(compiler, &common->failed_match, out_of_input[0]);

/* Head block (unaligned start): set up the comparison vectors first. */

OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char1 | case_bit);

/* VREPLGR2VR.B/H/W vd, rj */
push_inst(compiler, VREPLGR2VR | VD(vec_cmp1) | RJ_V(gpr_tmp1));

/* The second vector is needed only when the two characters differ. */
if (cmp_kind != vector_compare_match1)
  {
  OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, case_bit != 0 ? case_bit : char2);

  /* VREPLGR2VR.B/H/W vd, rj */
  push_inst(compiler, VREPLGR2VR | VD(vec_cmp2) | RJ_V(gpr_tmp1));
  }

OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
rescan = LABEL();
#endif

/* TMP2 becomes the distance from the 16 byte boundary, STR_PTR is rounded down. */
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf);
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);

/* VLD vd, rj, si12 */
push_inst(compiler, VLD | VD(vec_data) | RJ_V(gpr_str_ptr) | IMM_SI12(0));
fast_forward_char_pair_lsx_compare(compiler, cmp_kind, vec_data, vec_cmp1, vec_cmp2, vec_tmp);

/* VMSKLTZ.B vd, vj */
push_inst(compiler, VMSKLTZ_B | VD(vec_tmp) | VJ(vec_data));

/* VPICKVE2GR.WU rd, vj, ui2 */
push_inst(compiler, VPICKVE2GR_WU | RD_V(gpr_tmp1) | VJ(vec_tmp) | IMM_UI2(0));

/* Restore STR_PTR and drop the mask bits of the bytes before it. */
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);

found = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);

OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);

/* Aligned loop: one 16 byte block per iteration. */
aligned_loop = LABEL();

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);

out_of_input[1] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
if (common->mode == PCRE2_JIT_COMPLETE)
  add_jump(compiler, &common->failed_match, out_of_input[1]);

/* VLD vd, rj, si12 */
push_inst(compiler, VLD | VD(vec_data) | RJ_V(gpr_str_ptr) | IMM_SI12(0));
fast_forward_char_pair_lsx_compare(compiler, cmp_kind, vec_data, vec_cmp1, vec_cmp2, vec_tmp);

/* VMSKLTZ.B vd, vj */
push_inst(compiler, VMSKLTZ_B | VD(vec_tmp) | VJ(vec_data));

/* VPICKVE2GR.WU rd, vj, ui2 */
push_inst(compiler, VPICKVE2GR_WU | RD_V(gpr_tmp1) | VJ(vec_tmp) | IMM_UI2(0));

CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, aligned_loop);

JUMPHERE(found);

/* The index of the lowest set mask bit is added to STR_PTR. */

/* CTZ.W rd, rj */
push_inst(compiler, CTZ_W | RD_V(gpr_tmp1) | RJ_V(gpr_tmp1));

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);

if (common->mode == PCRE2_JIT_COMPLETE)
  add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
else
  {
  /* The end-of-input exits arrive here: STR_PTR must not be above STR_END. */
  JUMPHERE(out_of_input[0]);
  JUMPHERE(out_of_input[1]);
  OP2U(SLJIT_SUB | SLJIT_SET_GREATER, STR_PTR, 0, STR_END, 0);
  SELECT(SLJIT_GREATER, STR_PTR, STR_END, 0, STR_PTR);
  }

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
if (common->utf && offset > 0)
  {
  SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE);

  OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offset));

  at_char_start = jump_if_utf_char_start(compiler, TMP1);

  /* Otherwise step one code unit forward and search again. */
  OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
  add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
  OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);
  JUMPTO(SLJIT_JUMP, rescan);

  JUMPHERE(at_char_start);
  }
#endif
}
2050 
2051 #define JIT_HAS_FAST_REQUESTED_CHAR_SIMD HAS_LSX_SUPPORT
2052 
fast_requested_char_simd(compiler_common * common,PCRE2_UCHAR char1,PCRE2_UCHAR char2)2053 static jump_list *fast_requested_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2)
2054 {
2055 DEFINE_COMPILER;
2056 struct sljit_label *start;
2057 struct sljit_jump *quit;
2058 jump_list *not_found = NULL;
2059 vector_compare_type compare_type = vector_compare_match1;
2060 sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
2061 sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);
2062 sljit_s32 data_ind = 0;
2063 sljit_s32 tmp_ind = 1;
2064 sljit_s32 cmp1_ind = 2;
2065 sljit_s32 cmp2_ind = 3;
2066 sljit_u32 bit = 0;
2067 
2068 if (char1 != char2)
2069   {
2070   bit = char1 ^ char2;
2071   compare_type = vector_compare_match1i;
2072 
2073   if (!is_powerof2(bit))
2074     {
2075     bit = 0;
2076     compare_type = vector_compare_match2;
2077     }
2078   }
2079 
2080 add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
2081 OP1(SLJIT_MOV, TMP2, 0, TMP1, 0);
2082 OP1(SLJIT_MOV, TMP3, 0, STR_PTR, 0);
2083 
2084 /* First part (unaligned start) */
2085 
2086 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char1 | bit);
2087 
2088 /* VREPLGR2VR vd, rj */
2089 push_inst(compiler, VREPLGR2VR | VD(cmp1_ind) | RJ_V(tmp1_reg_ind));
2090 
2091 if (char1 != char2)
2092   {
2093   OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, bit != 0 ? bit : char2);
2094   /* VREPLGR2VR vd, rj */
2095   push_inst(compiler, VREPLGR2VR | VD(cmp2_ind) | RJ_V(tmp1_reg_ind));
2096   }
2097 
2098 OP1(SLJIT_MOV, STR_PTR, 0, TMP2, 0);
2099 OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf);
2100 OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
2101 
2102 /* VLD vd, rj, si12 */
2103 push_inst(compiler, VLD | VD(data_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(0));
2104 fast_forward_char_pair_lsx_compare(compiler, compare_type, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
2105 
2106 /* VMSKLTZ.B vd, vj */
2107 push_inst(compiler, VMSKLTZ_B | VD(tmp_ind) | VJ(data_ind));
2108 
2109 /* VPICKVE2GR.WU rd, vj, ui2 */
2110 push_inst(compiler, VPICKVE2GR_WU | RD_V(tmp1_reg_ind) | VJ(tmp_ind) | IMM_UI2(0));
2111 
2112 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
2113 OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);
2114 
2115 quit = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);
2116 
2117 OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
2118 
2119 /* Second part (aligned) */
2120 start = LABEL();
2121 
2122 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);
2123 
2124 add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
2125 
2126 /* VLD vd, rj, si12 */
2127 push_inst(compiler, VLD | VD(data_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(0));
2128 fast_forward_char_pair_lsx_compare(compiler, compare_type, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
2129 
2130 /* VMSKLTZ.B vd, vj */
2131 push_inst(compiler, VMSKLTZ_B | VD(tmp_ind) | VJ(data_ind));
2132 
2133 /* VPICKVE2GR.WU rd, vj, ui2 */
2134 push_inst(compiler, VPICKVE2GR_WU | RD_V(tmp1_reg_ind) | VJ(tmp_ind) | IMM_UI2(0));
2135 
2136 CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);
2137 
2138 JUMPHERE(quit);
2139 
2140 /* CTZ.W rd, rj */
2141 push_inst(compiler, CTZ_W | RD_V(tmp1_reg_ind) | RJ_V(tmp1_reg_ind));
2142 
2143 OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, STR_PTR, 0);
2144 add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
2145 
2146 OP1(SLJIT_MOV, STR_PTR, 0, TMP3, 0);
2147 return not_found;
2148 }
2149 
2150 #define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD HAS_LSX_SUPPORT
2151 
/* Generates LSX based code which moves STR_PTR forward to the next position
where char1a or char1b is present at offset offs1 and char2a or char2b is
present at offset offs2.

Only PCRE2_JIT_COMPLETE mode is supported and offs1 must be greater than
offs2 (both are asserted). The scan is done with STR_PTR advanced by offs1
code units; the second character is read diff = IN_UCHARS(offs1 - offs2)
bytes before it. Running past STR_END jumps to common->failed_match.

When common->match_end_ptr is non-zero, STR_END is replaced for the duration
of the search by the value selected from the stored match end plus offs1 + 1
code units and the original STR_END; the original value is kept in TMP3 and
restored before returning to the caller's code.

In UTF mode the code unit at the candidate start is checked with
jump_if_utf_char_start(), and the search is restarted from the next code unit
when that check does not accept it.

TMP1, TMP2 (and TMP3 in the match_end_ptr case) are overwritten, and vector
registers 0-7 are used. */
static void fast_forward_char_pair_simd(compiler_common *common, sljit_s32 offs1,
  PCRE2_UCHAR char1a, PCRE2_UCHAR char1b, sljit_s32 offs2, PCRE2_UCHAR char2a, PCRE2_UCHAR char2b)
{
DEFINE_COMPILER;
vector_compare_type compare1_type = vector_compare_match1;
vector_compare_type compare2_type = vector_compare_match1;
sljit_u32 bit1 = 0;
sljit_u32 bit2 = 0;
sljit_u32 diff = IN_UCHARS(offs1 - offs2);
sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
sljit_s32 tmp2_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP2);
sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);
sljit_s32 data1_ind = 0;
sljit_s32 data2_ind = 1;
sljit_s32 tmp1_ind = 2;
sljit_s32 tmp2_ind = 3;
sljit_s32 cmp1a_ind = 4;
sljit_s32 cmp1b_ind = 5;
sljit_s32 cmp2a_ind = 6;
sljit_s32 cmp2b_ind = 7;
struct sljit_label *start;
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
struct sljit_label *restart;
#endif
struct sljit_jump *jump[2];

SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE && offs1 > offs2);
SLJIT_ASSERT(diff <= IN_UCHARS(max_fast_forward_char_pair_offset()));

/* Initialize. */
if (common->match_end_ptr != 0)
  {
  OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr);
  OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1));
  /* The original STR_END is restored from TMP3 at the end. */
  OP1(SLJIT_MOV, TMP3, 0, STR_END, 0);

  OP2U(SLJIT_SUB | SLJIT_SET_LESS, TMP1, 0, STR_END, 0);
  SELECT(SLJIT_LESS, STR_END, TMP1, 0, STR_END);
  }

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));
add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

/* Comparison vectors of the first character: TMP1 provides cmp1a and
TMP2 provides cmp1b (needed only when the two variants differ). */
if (char1a == char1b)
  OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char1a);
else
  {
  bit1 = char1a ^ char1b;
  if (is_powerof2(bit1))
    {
    compare1_type = vector_compare_match1i;
    OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char1a | bit1);
    OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, bit1);
    }
  else
    {
    compare1_type = vector_compare_match2;
    bit1 = 0;
    OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char1a);
    OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, char1b);
    }
  }

/* VREPLGR2VR vd, rj */
push_inst(compiler, VREPLGR2VR | VD(cmp1a_ind) | RJ_V(tmp1_reg_ind));

if (char1a != char1b)
  {
  /* VREPLGR2VR vd, rj */
  push_inst(compiler, VREPLGR2VR | VD(cmp1b_ind) | RJ_V(tmp2_reg_ind));
  }

/* Comparison vectors of the second character. */
if (char2a == char2b)
  OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char2a);
else
  {
  bit2 = char2a ^ char2b;
  if (is_powerof2(bit2))
    {
    compare2_type = vector_compare_match1i;
    OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char2a | bit2);
    OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, bit2);
    }
  else
    {
    compare2_type = vector_compare_match2;
    bit2 = 0;
    OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char2a);
    OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, char2b);
    }
  }

/* VREPLGR2VR vd, rj */
push_inst(compiler, VREPLGR2VR | VD(cmp2a_ind) | RJ_V(tmp1_reg_ind));

if (char2a != char2b)
  {
  /* VREPLGR2VR vd, rj */
  push_inst(compiler, VREPLGR2VR | VD(cmp2b_ind) | RJ_V(tmp2_reg_ind));
  }

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
restart = LABEL();
#endif

/* TMP1 is the address of the second character, TMP2 is the distance of
STR_PTR from the 16 byte boundary it is rounded down to. */
OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, diff);
OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf);
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);

/* VLD vd, rj, si12 */
push_inst(compiler, VLD | VD(data1_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(0));

/* When the second character is inside the block which has just been
loaded, data2 is produced by shifting data1 instead of a second load. */
jump[0] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_PTR, 0);

/* VLD vd, rj, si12 */
push_inst(compiler, VLD | VD(data2_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(-(sljit_s8)diff));
jump[1] = JUMP(SLJIT_JUMP);

JUMPHERE(jump[0]);

/* VBSLL.V vd, vj, ui5 */
push_inst(compiler, VBSLL_V | VD(data2_ind) | VJ(data1_ind) | IMM_UI5(diff));

JUMPHERE(jump[1]);

fast_forward_char_pair_lsx_compare(compiler, compare2_type, data2_ind, cmp2a_ind, cmp2b_ind, tmp2_ind);
fast_forward_char_pair_lsx_compare(compiler, compare1_type, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind);

/* Both characters must match at the same position, so the two results are
combined with AND, in the same way as in the main loop below. */

/* VAND.V vd, vj, vk */
push_inst(compiler, VAND_V | VD(data1_ind) | VJ(data1_ind) | VK(data2_ind));

/* VMSKLTZ.B vd, vj */
push_inst(compiler, VMSKLTZ_B | VD(tmp1_ind) | VJ(data1_ind));

/* VPICKVE2GR.WU rd, vj, ui2 */
push_inst(compiler, VPICKVE2GR_WU | RD_V(tmp1_reg_ind) | VJ(tmp1_ind) | IMM_UI2(0));

/* Ignore matches before the first STR_PTR. */
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);

jump[0] = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);

OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);

/* Main loop. */
start = LABEL();

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);
add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

/* VLD vd, rj, si12 */
push_inst(compiler, VLD | VD(data1_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(0));
push_inst(compiler, VLD | VD(data2_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(-(sljit_s8)diff));

fast_forward_char_pair_lsx_compare(compiler, compare1_type, data1_ind, cmp1a_ind, cmp1b_ind, tmp2_ind);
fast_forward_char_pair_lsx_compare(compiler, compare2_type, data2_ind, cmp2a_ind, cmp2b_ind, tmp1_ind);

/* VAND.V vd, vj, vk */
push_inst(compiler, VAND_V | VD(data1_ind) | VJ(data1_ind) | VK(data2_ind));

/* VMSKLTZ.B vd, vj */
push_inst(compiler, VMSKLTZ_B | VD(tmp1_ind) | VJ(data1_ind));

/* VPICKVE2GR.WU rd, vj, ui2 */
push_inst(compiler, VPICKVE2GR_WU | RD_V(tmp1_reg_ind) | VJ(tmp1_ind) | IMM_UI2(0));

CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);

JUMPHERE(jump[0]);

/* The index of the lowest set mask bit is added to STR_PTR. */

/* CTZ.W rd, rj */
push_inst(compiler, CTZ_W | RD_V(tmp1_reg_ind) | RJ_V(tmp1_reg_ind));

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);

add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
if (common->utf)
  {
  OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offs1));

  jump[0] = jump_if_utf_char_start(compiler, TMP1);

  /* Otherwise step one code unit forward and search again. */
  OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
  CMPTO(SLJIT_LESS, STR_PTR, 0, STR_END, 0, restart);

  add_jump(compiler, &common->failed_match, JUMP(SLJIT_JUMP));

  JUMPHERE(jump[0]);
  }
#endif

/* Undo the offs1 adjustment which was applied before the search. */
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));

if (common->match_end_ptr != 0)
  OP1(SLJIT_MOV, STR_END, 0, TMP3, 0);
}
2352 
2353 #endif /* SLJIT_CONFIG_LOONGARCH_64 */
2354 
2355 #endif /* !SUPPORT_VALGRIND */
2356