1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 This module by Zoltan Herczeg
10 Original API code Copyright (c) 1997-2012 University of Cambridge
11 New API code Copyright (c) 2016-2019 University of Cambridge
12
13 -----------------------------------------------------------------------------
14 Redistribution and use in source and binary forms, with or without
15 modification, are permitted provided that the following conditions are met:
16
17 * Redistributions of source code must retain the above copyright notice,
18 this list of conditions and the following disclaimer.
19
20 * Redistributions in binary form must reproduce the above copyright
21 notice, this list of conditions and the following disclaimer in the
22 documentation and/or other materials provided with the distribution.
23
24 * Neither the name of the University of Cambridge nor the names of its
25 contributors may be used to endorse or promote products derived from
26 this software without specific prior written permission.
27
28 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
29 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
32 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 POSSIBILITY OF SUCH DAMAGE.
39 -----------------------------------------------------------------------------
40 */
41
42 #if !(defined SUPPORT_VALGRIND)
43
44 #if ((defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86) \
45 || (defined SLJIT_CONFIG_S390X && SLJIT_CONFIG_S390X))
46
/* Selects how a loaded vector is tested against the search character(s).
   The descriptions below follow the x86 emitter
   fast_forward_char_pair_sse2_compare() in this file. */
typedef enum {
  /* One value: a single equality compare against cmp1. */
  vector_compare_match1,
  /* Two values that differ in exactly one bit: the data is OR-ed with cmp2
     (the differing bit) and then compared for equality with cmp1. */
  vector_compare_match1i,
  /* Two unrelated values: the data is compared with cmp1 and with cmp2
     separately and the two results are OR-ed together. */
  vector_compare_match2,
} vector_compare_type;
52
max_fast_forward_char_pair_offset(void)53 static SLJIT_INLINE sljit_u32 max_fast_forward_char_pair_offset(void)
54 {
55 #if PCRE2_CODE_UNIT_WIDTH == 8
56 return 15;
57 #elif PCRE2_CODE_UNIT_WIDTH == 16
58 return 7;
59 #elif PCRE2_CODE_UNIT_WIDTH == 32
60 return 3;
61 #else
62 #error "Unsupported unit width"
63 #endif
64 }
65
66 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
jump_if_utf_char_start(struct sljit_compiler * compiler,sljit_s32 reg)67 static struct sljit_jump *jump_if_utf_char_start(struct sljit_compiler *compiler, sljit_s32 reg)
68 {
69 #if PCRE2_CODE_UNIT_WIDTH == 8
70 OP2(SLJIT_AND, reg, 0, reg, 0, SLJIT_IMM, 0xc0);
71 return CMP(SLJIT_NOT_EQUAL, reg, 0, SLJIT_IMM, 0x80);
72 #elif PCRE2_CODE_UNIT_WIDTH == 16
73 OP2(SLJIT_AND, reg, 0, reg, 0, SLJIT_IMM, 0xfc00);
74 return CMP(SLJIT_NOT_EQUAL, reg, 0, SLJIT_IMM, 0xdc00);
75 #else
76 #error "Unknown code width"
77 #endif
78 }
79 #endif
80
81 #endif /* SLJIT_CONFIG_X86 || SLJIT_CONFIG_S390X */
82
83 #if (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86)
84
character_to_int32(PCRE2_UCHAR chr)85 static sljit_s32 character_to_int32(PCRE2_UCHAR chr)
86 {
87 sljit_u32 value = chr;
88 #if PCRE2_CODE_UNIT_WIDTH == 8
89 #define SSE2_COMPARE_TYPE_INDEX 0
90 return (sljit_s32)((value << 24) | (value << 16) | (value << 8) | value);
91 #elif PCRE2_CODE_UNIT_WIDTH == 16
92 #define SSE2_COMPARE_TYPE_INDEX 1
93 return (sljit_s32)((value << 16) | value);
94 #elif PCRE2_CODE_UNIT_WIDTH == 32
95 #define SSE2_COMPARE_TYPE_INDEX 2
96 return (sljit_s32)(value);
97 #else
98 #error "Unsupported unit width"
99 #endif
100 }
101
/* Emits a 128-bit load: dst_xmm_reg <- [src_general_reg + offset].

   The aligned form (MOVDQA, prefix 0x66) is emitted when the low four bits
   of `offset` are zero; otherwise the unaligned form (MOVDQU, prefix 0xf3)
   is used. A zero offset is encoded without a displacement byte, any other
   offset with an 8-bit displacement. Both register indexes are asserted to
   be below 8. */
static void load_from_mem_sse2(struct sljit_compiler *compiler, sljit_s32 dst_xmm_reg, sljit_s32 src_general_reg, sljit_s8 offset)
{
const sljit_u8 disp8 = (sljit_u8)offset;
sljit_u8 instruction[5];
sljit_u8 modrm;

SLJIT_ASSERT(dst_xmm_reg < 8);
SLJIT_ASSERT(src_general_reg < 8);

/* ModRM reg/rm fields; the mod bits are added below. */
modrm = (sljit_u8)((dst_xmm_reg << 3) | src_general_reg);

/* MOVDQA / MOVDQU xmm1, m128 */
instruction[0] = (disp8 & 0xf) != 0 ? 0xf3 : 0x66;
instruction[1] = 0x0f;
instruction[2] = 0x6f;

if (offset != 0)
  {
  /* mod = 01: [reg + disp8] */
  instruction[3] = 0x40 | modrm;
  instruction[4] = disp8;
  sljit_emit_op_custom(compiler, instruction, 5);
  return;
  }

/* mod = 00: [reg] */
instruction[3] = modrm;
sljit_emit_op_custom(compiler, instruction, 4);
}
125
/* Emits at most one SSE2 register-to-register instruction: the `step`-th
   (0..3) instruction of the comparison selected by `compare_type`. Callers
   invoke it for step = 0, 1, 2, 3 in turn.

     match1  : step 2: PCMPEQ dst, cmp1
     match1i : step 0: POR    dst, cmp2      step 2: PCMPEQ dst, cmp1
     match2  : step 0: MOVDQA tmp, dst       step 1: PCMPEQ dst, cmp1
               step 2: PCMPEQ tmp, cmp2      step 3: POR    dst, tmp

   Steps not listed emit nothing. All *_ind arguments are XMM register
   indexes placed directly into the ModRM byte. */
static void fast_forward_char_pair_sse2_compare(struct sljit_compiler *compiler, vector_compare_type compare_type,
  int step, sljit_s32 dst_ind, sljit_s32 cmp1_ind, sljit_s32 cmp2_ind, sljit_s32 tmp_ind)
{
sljit_u8 instruction[4];
sljit_u8 opcode;
sljit_s32 reg_field;
sljit_s32 rm_field;

SLJIT_ASSERT(step >= 0 && step <= 3);

if (compare_type == vector_compare_match2)
  {
  switch (step)
    {
    case 0:
    /* MOVDQA xmm1, xmm2/m128 */
    opcode = 0x6f;
    reg_field = tmp_ind;
    rm_field = dst_ind;
    break;

    case 1:
    /* PCMPEQB/W/D xmm1, xmm2/m128 */
    opcode = 0x74 + SSE2_COMPARE_TYPE_INDEX;
    reg_field = dst_ind;
    rm_field = cmp1_ind;
    break;

    case 2:
    /* PCMPEQB/W/D xmm1, xmm2/m128 */
    opcode = 0x74 + SSE2_COMPARE_TYPE_INDEX;
    reg_field = tmp_ind;
    rm_field = cmp2_ind;
    break;

    case 3:
    /* POR xmm1, xmm2/m128 */
    opcode = 0xeb;
    reg_field = dst_ind;
    rm_field = tmp_ind;
    break;

    default:
    return;
    }
  }
else if (step == 0)
  {
  if (compare_type != vector_compare_match1i)
    return;

  /* POR xmm1, xmm2/m128 */
  opcode = 0xeb;
  reg_field = dst_ind;
  rm_field = cmp2_ind;
  }
else if (step == 2)
  {
  /* PCMPEQB/W/D xmm1, xmm2/m128 */
  opcode = 0x74 + SSE2_COMPARE_TYPE_INDEX;
  reg_field = dst_ind;
  rm_field = cmp1_ind;
  }
else
  return;

instruction[0] = 0x66;
instruction[1] = 0x0f;
instruction[2] = opcode;
instruction[3] = 0xc0 | (reg_field << 3) | rm_field;
sljit_emit_op_custom(compiler, instruction, 4);
}
202
203 #define JIT_HAS_FAST_FORWARD_CHAR_SIMD (sljit_has_cpu_feature(SLJIT_HAS_SSE2))
204
/* Emits x86 SSE2 code that scans the subject forward from STR_PTR, 16 bytes
   per iteration, for the first code unit equal to char1 or char2, and leaves
   STR_PTR at that position.

   In PCRE2_JIT_COMPLETE mode, reaching STR_END adds a jump to
   common->failed_match; in the other modes STR_PTR is clamped to STR_END
   instead. When UTF is enabled and offset > 0, a hit is accepted only if
   jump_if_utf_char_start() jumps for the code unit at STR_PTR - offset;
   otherwise STR_PTR is advanced by one code unit and the search restarts.

   XMM usage: 0 = loaded data, 1 = scratch, 2 = first compare value,
   3 = second compare value. TMP1 and TMP2 are overwritten. */
static void fast_forward_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_s32 offset)
{
DEFINE_COMPILER;
sljit_u8 instruction[8];
struct sljit_label *start;
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
struct sljit_label *restart;
#endif
struct sljit_jump *quit;
struct sljit_jump *partial_quit[2];
vector_compare_type compare_type = vector_compare_match1;
sljit_s32 tmp1_reg_ind = sljit_get_register_index(TMP1);
sljit_s32 str_ptr_reg_ind = sljit_get_register_index(STR_PTR);
sljit_s32 data_ind = 0;
sljit_s32 tmp_ind = 1;
sljit_s32 cmp1_ind = 2;
sljit_s32 cmp2_ind = 3;
sljit_u32 bit = 0;
int i;

/* offset is only referenced inside the UTF-specific block at the end. */
SLJIT_UNUSED_ARG(offset);

/* Choose the comparison: identical characters need one compare; characters
   differing in a single bit are handled by OR-ing that bit into the data;
   anything else needs two compares. */
if (char1 != char2)
  {
  bit = char1 ^ char2;
  compare_type = vector_compare_match1i;

  if (!is_powerof2(bit))
    {
    bit = 0;
    compare_type = vector_compare_match2;
    }
  }

/* Nothing to scan if STR_PTR is already at or past STR_END. */
partial_quit[0] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
if (common->mode == PCRE2_JIT_COMPLETE)
  add_jump(compiler, &common->failed_match, partial_quit[0]);

/* First part (unaligned start) */

OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1 | bit));

SLJIT_ASSERT(tmp1_reg_ind < 8);

/* MOVD xmm, r/m32 */
instruction[0] = 0x66;
instruction[1] = 0x0f;
instruction[2] = 0x6e;
instruction[3] = 0xc0 | (cmp1_ind << 3) | tmp1_reg_ind;
sljit_emit_op_custom(compiler, instruction, 4);

if (char1 != char2)
  {
  /* Second compare value: the differing bit (match1i) or char2 (match2). */
  OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(bit != 0 ? bit : char2));

  /* MOVD xmm, r/m32 */
  instruction[3] = 0xc0 | (cmp2_ind << 3) | tmp1_reg_ind;
  sljit_emit_op_custom(compiler, instruction, 4);
  }

/* Keep the unaligned start address in TMP2. */
OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);

/* PSHUFD xmm1, xmm2/m128, imm8 (imm8 = 0 copies dword 0 to all four dwords) */
/* instruction[0] = 0x66; */
/* instruction[1] = 0x0f; */
instruction[2] = 0x70;
instruction[3] = 0xc0 | (cmp1_ind << 3) | cmp1_ind;
instruction[4] = 0;
sljit_emit_op_custom(compiler, instruction, 5);

if (char1 != char2)
  {
  /* PSHUFD xmm1, xmm2/m128, imm8 */
  instruction[3] = 0xc0 | (cmp2_ind << 3) | cmp2_ind;
  sljit_emit_op_custom(compiler, instruction, 5);
  }

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
/* Re-entry point used when a hit is rejected by the UTF check below. */
restart = LABEL();
#endif
/* Round STR_PTR down to a 16-byte boundary; TMP2 keeps the byte offset of
   the real start inside that block. */
OP2(SLJIT_AND, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, ~0xf);
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf);

load_from_mem_sse2(compiler, data_ind, str_ptr_reg_ind, 0);
for (i = 0; i < 4; i++)
  fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);

/* PMOVMSKB reg, xmm */
/* instruction[0] = 0x66; */
/* instruction[1] = 0x0f; */
instruction[2] = 0xd7;
instruction[3] = 0xc0 | (tmp1_reg_ind << 3) | data_ind;
sljit_emit_op_custom(compiler, instruction, 4);

/* Restore the real start and shift out the mask bits that belong to bytes
   before it. */
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);

quit = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);

/* No hit in the first block: go back to the aligned address for the loop. */
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);

/* Second part (aligned) */
start = LABEL();

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);

partial_quit[1] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
if (common->mode == PCRE2_JIT_COMPLETE)
  add_jump(compiler, &common->failed_match, partial_quit[1]);

load_from_mem_sse2(compiler, data_ind, str_ptr_reg_ind, 0);
for (i = 0; i < 4; i++)
  fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);

/* PMOVMSKB reg, xmm */
/* instruction[0] = 0x66; */
/* instruction[1] = 0x0f; */
instruction[2] = 0xd7;
instruction[3] = 0xc0 | (tmp1_reg_ind << 3) | data_ind;
sljit_emit_op_custom(compiler, instruction, 4);

/* Loop while the mask is empty. */
CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);

JUMPHERE(quit);

/* BSF r32, r/m32: index of the lowest set mask bit, i.e. the byte offset of
   the first hit. */
instruction[0] = 0x0f;
instruction[1] = 0xbc;
instruction[2] = 0xc0 | (tmp1_reg_ind << 3) | tmp1_reg_ind;
sljit_emit_op_custom(compiler, instruction, 3);

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);

if (common->mode != PCRE2_JIT_COMPLETE)
  {
  /* Both early exits land here; STR_PTR is then limited to STR_END. */
  JUMPHERE(partial_quit[0]);
  JUMPHERE(partial_quit[1]);
  OP2U(SLJIT_SUB | SLJIT_SET_GREATER, STR_PTR, 0, STR_END, 0);
  CMOV(SLJIT_GREATER, STR_PTR, STR_END, 0);
  }
else
  add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
if (common->utf && offset > 0)
  {
  SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE);

  /* Inspect the code unit `offset` units before the hit. */
  OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offset));

  quit = jump_if_utf_char_start(compiler, TMP1);

  /* Rejected: step one code unit forward and search again. */
  OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
  add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
  OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);
  JUMPTO(SLJIT_JUMP, restart);

  JUMPHERE(quit);
  }
#endif
}
366
367 #define JIT_HAS_FAST_REQUESTED_CHAR_SIMD (sljit_has_cpu_feature(SLJIT_HAS_SSE2))
368
fast_requested_char_simd(compiler_common * common,PCRE2_UCHAR char1,PCRE2_UCHAR char2)369 static jump_list *fast_requested_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2)
370 {
371 DEFINE_COMPILER;
372 sljit_u8 instruction[8];
373 struct sljit_label *start;
374 struct sljit_jump *quit;
375 jump_list *not_found = NULL;
376 vector_compare_type compare_type = vector_compare_match1;
377 sljit_s32 tmp1_reg_ind = sljit_get_register_index(TMP1);
378 sljit_s32 str_ptr_reg_ind = sljit_get_register_index(STR_PTR);
379 sljit_s32 data_ind = 0;
380 sljit_s32 tmp_ind = 1;
381 sljit_s32 cmp1_ind = 2;
382 sljit_s32 cmp2_ind = 3;
383 sljit_u32 bit = 0;
384 int i;
385
386 if (char1 != char2)
387 {
388 bit = char1 ^ char2;
389 compare_type = vector_compare_match1i;
390
391 if (!is_powerof2(bit))
392 {
393 bit = 0;
394 compare_type = vector_compare_match2;
395 }
396 }
397
398 add_jump(compiler, ¬_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
399 OP1(SLJIT_MOV, TMP2, 0, TMP1, 0);
400 OP1(SLJIT_MOV, TMP3, 0, STR_PTR, 0);
401
402 /* First part (unaligned start) */
403
404 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1 | bit));
405
406 SLJIT_ASSERT(tmp1_reg_ind < 8);
407
408 /* MOVD xmm, r/m32 */
409 instruction[0] = 0x66;
410 instruction[1] = 0x0f;
411 instruction[2] = 0x6e;
412 instruction[3] = 0xc0 | (cmp1_ind << 3) | tmp1_reg_ind;
413 sljit_emit_op_custom(compiler, instruction, 4);
414
415 if (char1 != char2)
416 {
417 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(bit != 0 ? bit : char2));
418
419 /* MOVD xmm, r/m32 */
420 instruction[3] = 0xc0 | (cmp2_ind << 3) | tmp1_reg_ind;
421 sljit_emit_op_custom(compiler, instruction, 4);
422 }
423
424 OP1(SLJIT_MOV, STR_PTR, 0, TMP2, 0);
425
426 /* PSHUFD xmm1, xmm2/m128, imm8 */
427 /* instruction[0] = 0x66; */
428 /* instruction[1] = 0x0f; */
429 instruction[2] = 0x70;
430 instruction[3] = 0xc0 | (cmp1_ind << 3) | cmp1_ind;
431 instruction[4] = 0;
432 sljit_emit_op_custom(compiler, instruction, 5);
433
434 if (char1 != char2)
435 {
436 /* PSHUFD xmm1, xmm2/m128, imm8 */
437 instruction[3] = 0xc0 | (cmp2_ind << 3) | cmp2_ind;
438 sljit_emit_op_custom(compiler, instruction, 5);
439 }
440
441 OP2(SLJIT_AND, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, ~0xf);
442 OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf);
443
444 load_from_mem_sse2(compiler, data_ind, str_ptr_reg_ind, 0);
445 for (i = 0; i < 4; i++)
446 fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
447
448 /* PMOVMSKB reg, xmm */
449 /* instruction[0] = 0x66; */
450 /* instruction[1] = 0x0f; */
451 instruction[2] = 0xd7;
452 instruction[3] = 0xc0 | (tmp1_reg_ind << 3) | data_ind;
453 sljit_emit_op_custom(compiler, instruction, 4);
454
455 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
456 OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);
457
458 quit = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);
459
460 OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
461
462 /* Second part (aligned) */
463 start = LABEL();
464
465 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);
466
467 add_jump(compiler, ¬_found, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
468
469 load_from_mem_sse2(compiler, data_ind, str_ptr_reg_ind, 0);
470 for (i = 0; i < 4; i++)
471 fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
472
473 /* PMOVMSKB reg, xmm */
474 /* instruction[0] = 0x66; */
475 /* instruction[1] = 0x0f; */
476 instruction[2] = 0xd7;
477 instruction[3] = 0xc0 | (tmp1_reg_ind << 3) | data_ind;
478 sljit_emit_op_custom(compiler, instruction, 4);
479
480 CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);
481
482 JUMPHERE(quit);
483
484 /* BSF r32, r/m32 */
485 instruction[0] = 0x0f;
486 instruction[1] = 0xbc;
487 instruction[2] = 0xc0 | (tmp1_reg_ind << 3) | tmp1_reg_ind;
488 sljit_emit_op_custom(compiler, instruction, 3);
489
490 OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, STR_PTR, 0);
491 add_jump(compiler, ¬_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
492
493 OP1(SLJIT_MOV, STR_PTR, 0, TMP3, 0);
494 return not_found;
495 }
496
497 #ifndef _WIN64
498
499 #define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD (sljit_has_cpu_feature(SLJIT_HAS_SSE2))
500
/* Emits x86 SSE2 code that moves STR_PTR forward to the first position where
   the code unit at offset offs1 equals char1a or char1b AND the code unit at
   offset offs2 equals char2a or char2b. Only PCRE2_JIT_COMPLETE mode with
   offs1 > offs2 is supported (asserted); failure adds a jump to
   common->failed_match.

   While scanning, STR_PTR points at the offs1 position; offs1 is subtracted
   again before returning. If common->match_end_ptr is set, STR_END is
   temporarily lowered (saved in TMP3) and restored at the end.

   XMM usage: 0/1 = data at the offs1/offs2 positions, 2/3 = scratch,
   4/5 = compare values for the first character, 6/7 = for the second.
   TMP1 and TMP2 are overwritten. */
static void fast_forward_char_pair_simd(compiler_common *common, sljit_s32 offs1,
  PCRE2_UCHAR char1a, PCRE2_UCHAR char1b, sljit_s32 offs2, PCRE2_UCHAR char2a, PCRE2_UCHAR char2b)
{
DEFINE_COMPILER;
sljit_u8 instruction[8];
vector_compare_type compare1_type = vector_compare_match1;
vector_compare_type compare2_type = vector_compare_match1;
sljit_u32 bit1 = 0;
sljit_u32 bit2 = 0;
/* Distance between the two inspected positions, converted with IN_UCHARS. */
sljit_u32 diff = IN_UCHARS(offs1 - offs2);
sljit_s32 tmp1_reg_ind = sljit_get_register_index(TMP1);
sljit_s32 tmp2_reg_ind = sljit_get_register_index(TMP2);
sljit_s32 str_ptr_reg_ind = sljit_get_register_index(STR_PTR);
sljit_s32 data1_ind = 0;
sljit_s32 data2_ind = 1;
sljit_s32 tmp1_ind = 2;
sljit_s32 tmp2_ind = 3;
sljit_s32 cmp1a_ind = 4;
sljit_s32 cmp1b_ind = 5;
sljit_s32 cmp2a_ind = 6;
sljit_s32 cmp2b_ind = 7;
struct sljit_label *start;
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
struct sljit_label *restart;
#endif
struct sljit_jump *jump[2];
int i;

SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE && offs1 > offs2);
SLJIT_ASSERT(diff <= IN_UCHARS(max_fast_forward_char_pair_offset()));
SLJIT_ASSERT(tmp1_reg_ind < 8 && tmp2_reg_ind == 1);

/* Initialize. */
if (common->match_end_ptr != 0)
  {
  /* STR_END = min(STR_END, saved match end + IN_UCHARS(offs1 + 1)); the
     original STR_END is kept in TMP3. */
  OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr);
  OP1(SLJIT_MOV, TMP3, 0, STR_END, 0);
  OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1));

  OP2U(SLJIT_SUB | SLJIT_SET_LESS, TMP1, 0, STR_END, 0);
  CMOV(SLJIT_LESS, STR_END, TMP1, 0);
  }

/* Scan with STR_PTR pointing at the offs1 position. */
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));
add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

/* MOVD xmm, r/m32 */
instruction[0] = 0x66;
instruction[1] = 0x0f;
instruction[2] = 0x6e;

/* Compare values for the first character: TMP1 -> cmp1a, TMP2 -> cmp1b. */
if (char1a == char1b)
  OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1a));
else
  {
  bit1 = char1a ^ char1b;
  if (is_powerof2(bit1))
    {
    compare1_type = vector_compare_match1i;
    OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1a | bit1));
    OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(bit1));
    }
  else
    {
    compare1_type = vector_compare_match2;
    bit1 = 0;
    OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1a));
    OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(char1b));
    }
  }

instruction[3] = 0xc0 | (cmp1a_ind << 3) | tmp1_reg_ind;
sljit_emit_op_custom(compiler, instruction, 4);

if (char1a != char1b)
  {
  instruction[3] = 0xc0 | (cmp1b_ind << 3) | tmp2_reg_ind;
  sljit_emit_op_custom(compiler, instruction, 4);
  }

/* Compare values for the second character: TMP1 -> cmp2a, TMP2 -> cmp2b. */
if (char2a == char2b)
  OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char2a));
else
  {
  bit2 = char2a ^ char2b;
  if (is_powerof2(bit2))
    {
    compare2_type = vector_compare_match1i;
    OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char2a | bit2));
    OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(bit2));
    }
  else
    {
    compare2_type = vector_compare_match2;
    bit2 = 0;
    OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char2a));
    OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(char2b));
    }
  }

instruction[3] = 0xc0 | (cmp2a_ind << 3) | tmp1_reg_ind;
sljit_emit_op_custom(compiler, instruction, 4);

if (char2a != char2b)
  {
  instruction[3] = 0xc0 | (cmp2b_ind << 3) | tmp2_reg_ind;
  sljit_emit_op_custom(compiler, instruction, 4);
  }

/* PSHUFD xmm1, xmm2/m128, imm8 (imm8 = 0 copies dword 0 to all four dwords) */
/* instruction[0] = 0x66; */
/* instruction[1] = 0x0f; */
instruction[2] = 0x70;
instruction[4] = 0;

instruction[3] = 0xc0 | (cmp1a_ind << 3) | cmp1a_ind;
sljit_emit_op_custom(compiler, instruction, 5);

if (char1a != char1b)
  {
  instruction[3] = 0xc0 | (cmp1b_ind << 3) | cmp1b_ind;
  sljit_emit_op_custom(compiler, instruction, 5);
  }

instruction[3] = 0xc0 | (cmp2a_ind << 3) | cmp2a_ind;
sljit_emit_op_custom(compiler, instruction, 5);

if (char2a != char2b)
  {
  instruction[3] = 0xc0 | (cmp2b_ind << 3) | cmp2b_ind;
  sljit_emit_op_custom(compiler, instruction, 5);
  }

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
/* Re-entry point used when a hit is rejected by the UTF check below. */
restart = LABEL();
#endif

/* TMP1 = start - diff, TMP2 = unaligned start, STR_PTR = start rounded down
   to a 16-byte boundary. */
OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, diff);
OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);
OP2(SLJIT_AND, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, ~0xf);

load_from_mem_sse2(compiler, data1_ind, str_ptr_reg_ind, 0);

/* The second vector is either loaded from [STR_PTR - diff] or, when
   TMP1 >= STR_PTR, derived from the first vector by a byte shift. */
jump[0] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_PTR, 0);

load_from_mem_sse2(compiler, data2_ind, str_ptr_reg_ind, -(sljit_s8)diff);
jump[1] = JUMP(SLJIT_JUMP);

JUMPHERE(jump[0]);

/* MOVDQA xmm1, xmm2/m128 */
/* instruction[0] = 0x66; */
/* instruction[1] = 0x0f; */
instruction[2] = 0x6f;
instruction[3] = 0xc0 | (data2_ind << 3) | data1_ind;
sljit_emit_op_custom(compiler, instruction, 4);

/* PSLLDQ xmm1, imm8 */
/* instruction[0] = 0x66; */
/* instruction[1] = 0x0f; */
instruction[2] = 0x73;
instruction[3] = 0xc0 | (7 << 3) | data2_ind;
instruction[4] = diff;
sljit_emit_op_custom(compiler, instruction, 5);

JUMPHERE(jump[1]);

/* TMP2 = byte offset of the real start inside the aligned block. */
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf);

/* The two comparisons are emitted interleaved, one step of each at a time. */
for (i = 0; i < 4; i++)
  {
  fast_forward_char_pair_sse2_compare(compiler, compare2_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp2_ind);
  fast_forward_char_pair_sse2_compare(compiler, compare1_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind);
  }

/* PAND xmm1, xmm2/m128: keep only positions where both characters matched. */
/* instruction[0] = 0x66; */
/* instruction[1] = 0x0f; */
instruction[2] = 0xdb;
instruction[3] = 0xc0 | (data1_ind << 3) | data2_ind;
sljit_emit_op_custom(compiler, instruction, 4);

/* PMOVMSKB reg, xmm (source is xmm0, i.e. data1_ind) */
/* instruction[0] = 0x66; */
/* instruction[1] = 0x0f; */
instruction[2] = 0xd7;
instruction[3] = 0xc0 | (tmp1_reg_ind << 3) | 0;
sljit_emit_op_custom(compiler, instruction, 4);

/* Ignore matches before the first STR_PTR. */
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);

jump[0] = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);

/* No hit in the first block: go back to the aligned address for the loop. */
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);

/* Main loop. */
start = LABEL();

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);
add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

load_from_mem_sse2(compiler, data1_ind, str_ptr_reg_ind, 0);
load_from_mem_sse2(compiler, data2_ind, str_ptr_reg_ind, -(sljit_s8)diff);

for (i = 0; i < 4; i++)
  {
  fast_forward_char_pair_sse2_compare(compiler, compare1_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp2_ind);
  fast_forward_char_pair_sse2_compare(compiler, compare2_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp1_ind);
  }

/* PAND xmm1, xmm2/m128 */
/* instruction[0] = 0x66; */
/* instruction[1] = 0x0f; */
instruction[2] = 0xdb;
instruction[3] = 0xc0 | (data1_ind << 3) | data2_ind;
sljit_emit_op_custom(compiler, instruction, 4);

/* PMOVMSKB reg, xmm */
/* instruction[0] = 0x66; */
/* instruction[1] = 0x0f; */
instruction[2] = 0xd7;
instruction[3] = 0xc0 | (tmp1_reg_ind << 3) | 0;
sljit_emit_op_custom(compiler, instruction, 4);

/* Loop while the mask is empty. */
CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);

JUMPHERE(jump[0]);

/* BSF r32, r/m32: index of the lowest set mask bit, i.e. the byte offset of
   the first hit. */
instruction[0] = 0x0f;
instruction[1] = 0xbc;
instruction[2] = 0xc0 | (tmp1_reg_ind << 3) | tmp1_reg_ind;
sljit_emit_op_custom(compiler, instruction, 3);

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);

add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
if (common->utf)
  {
  /* Inspect the code unit at the candidate match start (STR_PTR - offs1). */
  OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offs1));

  jump[0] = jump_if_utf_char_start(compiler, TMP1);

  /* Rejected: step one code unit forward and search again, or fail if that
     reaches STR_END. */
  OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
  CMPTO(SLJIT_LESS, STR_PTR, 0, STR_END, 0, restart);

  add_jump(compiler, &common->failed_match, JUMP(SLJIT_JUMP));

  JUMPHERE(jump[0]);
  }
#endif

/* Undo the initial "+ offs1" so STR_PTR points at the match start. */
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));

if (common->match_end_ptr != 0)
  OP1(SLJIT_MOV, STR_END, 0, TMP3, 0);
}
762
763 #endif /* !_WIN64 */
764
765 #undef SSE2_COMPARE_TYPE_INDEX
766
767 #endif /* SLJIT_CONFIG_X86 */
768
769 #if (defined SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64 && (defined __ARM_NEON || defined __ARM_NEON__))
770
771 #include <arm_neon.h>
772
/* Overlays an unsigned int with four individually addressable bytes. The
   ARM64 fast_forward_char_simd() in this file fills c1 and c2 and passes x
   to the out-of-line helper as a single immediate. */
typedef union {
  unsigned int x;
  struct { unsigned char c1, c2, c3, c4; } c;
} int_char;
777
778 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
utf_continue(PCRE2_SPTR s)779 static SLJIT_INLINE int utf_continue(PCRE2_SPTR s)
780 {
781 #if PCRE2_CODE_UNIT_WIDTH == 8
782 return (*s & 0xc0) == 0x80;
783 #elif PCRE2_CODE_UNIT_WIDTH == 16
784 return (*s & 0xfc00) == 0xdc00;
785 #else
786 #error "Unknown code width"
787 #endif
788 }
789 #endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 */
790
/* Width-specific aliases for the NEON intrinsics used by this file and by
   pcre2_jit_neon_inc.h. For each code unit width:
     VECTOR_FACTOR  number of code units in one vect_t
     vect_t         the vector type
     VLD1Q/VST1Q    load / store
     VCEQQ          lane-wise equality compare
     VORRQ/VANDQ    bitwise OR / AND
     VDUPQ          fill every lane with one scalar
     VEXTQ          extract a vector from a pair of vectors
   quad_word overlays the lanes of one vector with two 64-bit words. */
#if PCRE2_CODE_UNIT_WIDTH == 8
# define VECTOR_FACTOR 16
# define vect_t uint8x16_t
# define VLD1Q(X) vld1q_u8((sljit_u8 *)(X))
# define VCEQQ vceqq_u8
# define VORRQ vorrq_u8
# define VST1Q vst1q_u8
# define VDUPQ vdupq_n_u8
# define VEXTQ vextq_u8
# define VANDQ vandq_u8
typedef union {
  uint8_t mem[16];
  uint64_t dw[2];
} quad_word;
#elif PCRE2_CODE_UNIT_WIDTH == 16
# define VECTOR_FACTOR 8
# define vect_t uint16x8_t
# define VLD1Q(X) vld1q_u16((sljit_u16 *)(X))
# define VCEQQ vceqq_u16
# define VORRQ vorrq_u16
# define VST1Q vst1q_u16
# define VDUPQ vdupq_n_u16
# define VEXTQ vextq_u16
# define VANDQ vandq_u16
typedef union {
  uint16_t mem[8];
  uint64_t dw[2];
} quad_word;
#else
/* Any width other than 8 or 16 is treated as 32-bit here. */
# define VECTOR_FACTOR 4
# define vect_t uint32x4_t
# define VLD1Q(X) vld1q_u32((sljit_u32 *)(X))
# define VCEQQ vceqq_u32
# define VORRQ vorrq_u32
# define VST1Q vst1q_u32
# define VDUPQ vdupq_n_u32
# define VEXTQ vextq_u32
# define VANDQ vandq_u32
typedef union {
  uint32_t mem[4];
  uint64_t dw[2];
} quad_word;
#endif
834
835 #define FFCS
836 #include "pcre2_jit_neon_inc.h"
837 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
838 # define FF_UTF
839 # include "pcre2_jit_neon_inc.h"
840 # undef FF_UTF
841 #endif
842 #undef FFCS
843
844 #define FFCS_2
845 #include "pcre2_jit_neon_inc.h"
846 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
847 # define FF_UTF
848 # include "pcre2_jit_neon_inc.h"
849 # undef FF_UTF
850 #endif
851 #undef FFCS_2
852
853 #define FFCS_MASK
854 #include "pcre2_jit_neon_inc.h"
855 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
856 # define FF_UTF
857 # include "pcre2_jit_neon_inc.h"
858 # undef FF_UTF
859 #endif
860 #undef FFCS_MASK
861
862 #define JIT_HAS_FAST_FORWARD_CHAR_SIMD 1
863
/* Emits ARM64 code that calls one of the out-of-line NEON search helpers
   (ffcs*, generated from pcre2_jit_neon_inc.h) and moves STR_PTR to the
   address it returns.

   The helper variant depends on the characters:
     char1 == char2              -> ffcs      (packed word: char1, char2)
     differ in exactly one bit   -> ffcs_mask (packed word: char1 | mask, mask)
     otherwise                   -> ffcs_2    (packed word: char1, char2)
   and the *_utf variant is used when UTF is enabled and offset > 0.

   A zero return value means nothing was found: in PCRE2_JIT_COMPLETE mode
   that adds a jump to common->failed_match, in the other modes STR_PTR keeps
   the value it had before the call. STR_PTR and TMP3 are saved in LOCALS0 and
   LOCALS1 around the call.

   NOTE(review): the packed characters are placed in SLJIT_R4 while SLJIT_R3
   is never set and the call is declared with SLJIT_ARGS4 - confirm against
   the helper prototype in pcre2_jit_neon_inc.h.
   NOTE(review): int_char stores each character in an unsigned char, so code
   units wider than 8 bits would be truncated here - confirm how the helpers
   handle 16/32-bit builds. */
static void fast_forward_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_s32 offset)
{
DEFINE_COMPILER;
int_char ic;
struct sljit_jump *partial_quit;

/* Only c1 and c2 are assigned below; clear the whole word first so that no
   indeterminate bytes end up in the immediate embedded in the JIT code. */
ic.x = 0;

/* Save temporary registers. */
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS0, STR_PTR, 0);
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS1, TMP3, 0);

/* Prepare function arguments */
OP1(SLJIT_MOV, SLJIT_R0, 0, STR_END, 0);
OP1(SLJIT_MOV, SLJIT_R1, 0, STR_PTR, 0);
OP1(SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, offset);

if (char1 == char2)
  {
  ic.c.c1 = char1;
  ic.c.c2 = char2;
  OP1(SLJIT_MOV, SLJIT_R4, 0, SLJIT_IMM, ic.x);

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
  if (common->utf && offset > 0)
    sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
                     SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_utf));
  else
    sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
                     SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs));
#else
  sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
                   SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs));
#endif
  }
else
  {
  PCRE2_UCHAR mask = char1 ^ char2;
  if (is_powerof2(mask))
    {
    /* The characters differ in one bit: pass the value with that bit set
       together with the bit itself. */
    ic.c.c1 = char1 | mask;
    ic.c.c2 = mask;
    OP1(SLJIT_MOV, SLJIT_R4, 0, SLJIT_IMM, ic.x);

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
    if (common->utf && offset > 0)
      sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
                       SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_mask_utf));
    else
      sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
                       SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_mask));
#else
    sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
                     SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_mask));
#endif
    }
  else
    {
    ic.c.c1 = char1;
    ic.c.c2 = char2;
    OP1(SLJIT_MOV, SLJIT_R4, 0, SLJIT_IMM, ic.x);

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
    if (common->utf && offset > 0)
      sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
                       SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_2_utf));
    else
      sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
                       SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_2));
#else
    sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
                     SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_2));
#endif
    }
  }
/* Restore registers. */
OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), LOCALS0);
OP1(SLJIT_MOV, TMP3, 0, SLJIT_MEM1(SLJIT_SP), LOCALS1);

/* Check return value: zero means the helper found nothing. */
partial_quit = CMP(SLJIT_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0);
if (common->mode == PCRE2_JIT_COMPLETE)
  add_jump(compiler, &common->failed_match, partial_quit);

/* Fast forward STR_PTR to the address returned by the helper. */
OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_RETURN_REG, 0);

if (common->mode != PCRE2_JIT_COMPLETE)
  JUMPHERE(partial_quit);
}
951
/* Comparison mode understood by fast_forward_char_pair_compare(). */
typedef enum {
  /* Lane-wise equality with cmp1 only. */
  compare_match1,
  /* OR the data with cmp2 first, then test equality with cmp1. */
  compare_match1i,
  /* Test equality with cmp1 and with cmp2 and OR the two results. */
  compare_match2,
} compare_type;
957
/* Compares every lane of `dst` with the search value(s) and returns the
   lane-wise result of the compare intrinsics:
     compare_match2  : (dst == cmp1) | (dst == cmp2)
     compare_match1i : (dst | cmp2) == cmp1
     anything else   : dst == cmp1 */
static inline vect_t fast_forward_char_pair_compare(compare_type ctype, vect_t dst, vect_t cmp1, vect_t cmp2)
{
switch (ctype)
  {
  case compare_match2:
  return VORRQ(VCEQQ(dst, cmp1), VCEQQ(dst, cmp2));

  case compare_match1i:
  return VCEQQ(VORRQ(dst, cmp2), cmp1);

  default:
  return VCEQQ(dst, cmp1);
  }
}
974
max_fast_forward_char_pair_offset(void)975 static SLJIT_INLINE sljit_u32 max_fast_forward_char_pair_offset(void)
976 {
977 #if PCRE2_CODE_UNIT_WIDTH == 8
978 return 15;
979 #elif PCRE2_CODE_UNIT_WIDTH == 16
980 return 7;
981 #elif PCRE2_CODE_UNIT_WIDTH == 32
982 return 3;
983 #else
984 #error "Unsupported unit width"
985 #endif
986 }
987
988 /* ARM doesn't have a shift left across lanes. */
shift_left_n_lanes(vect_t a,sljit_u8 n)989 static SLJIT_INLINE vect_t shift_left_n_lanes(vect_t a, sljit_u8 n)
990 {
991 vect_t zero = VDUPQ(0);
992 SLJIT_ASSERT(0 < n && n < VECTOR_FACTOR);
993 /* VEXTQ takes an immediate as last argument. */
994 #define C(X) case X: return VEXTQ(zero, a, VECTOR_FACTOR - X);
995 switch (n)
996 {
997 C(1); C(2); C(3);
998 #if PCRE2_CODE_UNIT_WIDTH != 32
999 C(4); C(5); C(6); C(7);
1000 # if PCRE2_CODE_UNIT_WIDTH != 16
1001 C(8); C(9); C(10); C(11); C(12); C(13); C(14); C(15);
1002 # endif
1003 #endif
1004 default:
1005 /* Based on the ASSERT(0 < n && n < VECTOR_FACTOR) above, this won't
1006 happen. The return is still here for compilers to not warn. */
1007 return a;
1008 }
1009 }
1010
1011 #define FFCPS
1012 #define FFCPS_DIFF1
1013 #define FFCPS_CHAR1A2A
1014
1015 #define FFCPS_0
1016 #include "pcre2_jit_neon_inc.h"
1017 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1018 # define FF_UTF
1019 # include "pcre2_jit_neon_inc.h"
1020 # undef FF_UTF
1021 #endif
1022 #undef FFCPS_0
1023
1024 #undef FFCPS_CHAR1A2A
1025
1026 #define FFCPS_1
1027 #include "pcre2_jit_neon_inc.h"
1028 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1029 # define FF_UTF
1030 # include "pcre2_jit_neon_inc.h"
1031 # undef FF_UTF
1032 #endif
1033 #undef FFCPS_1
1034
1035 #undef FFCPS_DIFF1
1036
1037 #define FFCPS_DEFAULT
1038 #include "pcre2_jit_neon_inc.h"
1039 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1040 # define FF_UTF
1041 # include "pcre2_jit_neon_inc.h"
1042 # undef FF_UTF
1043 #endif
1044 #undef FFCPS
1045
1046 #define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD 1
1047
fast_forward_char_pair_simd(compiler_common * common,sljit_s32 offs1,PCRE2_UCHAR char1a,PCRE2_UCHAR char1b,sljit_s32 offs2,PCRE2_UCHAR char2a,PCRE2_UCHAR char2b)1048 static void fast_forward_char_pair_simd(compiler_common *common, sljit_s32 offs1,
1049 PCRE2_UCHAR char1a, PCRE2_UCHAR char1b, sljit_s32 offs2, PCRE2_UCHAR char2a, PCRE2_UCHAR char2b)
1050 {
1051 DEFINE_COMPILER;
1052 sljit_u32 diff = IN_UCHARS(offs1 - offs2);
1053 struct sljit_jump *partial_quit;
1054 int_char ic;
1055 SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE && offs1 > offs2);
1056 SLJIT_ASSERT(diff <= IN_UCHARS(max_fast_forward_char_pair_offset()));
1057 SLJIT_ASSERT(compiler->scratches == 5);
1058
1059 /* Save temporary register STR_PTR. */
1060 OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS0, STR_PTR, 0);
1061
1062 /* Prepare arguments for the function call. */
1063 if (common->match_end_ptr == 0)
1064 OP1(SLJIT_MOV, SLJIT_R0, 0, STR_END, 0);
1065 else
1066 {
1067 OP1(SLJIT_MOV, SLJIT_R0, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr);
1068 OP2(SLJIT_ADD, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1));
1069
1070 OP2U(SLJIT_SUB | SLJIT_SET_LESS, STR_END, 0, SLJIT_R0, 0);
1071 CMOV(SLJIT_LESS, SLJIT_R0, STR_END, 0);
1072 }
1073
1074 OP1(SLJIT_MOV, SLJIT_R1, 0, STR_PTR, 0);
1075 OP1(SLJIT_MOV_S32, SLJIT_R2, 0, SLJIT_IMM, offs1);
1076 OP1(SLJIT_MOV_S32, SLJIT_R3, 0, SLJIT_IMM, offs2);
1077 ic.c.c1 = char1a;
1078 ic.c.c2 = char1b;
1079 ic.c.c3 = char2a;
1080 ic.c.c4 = char2b;
1081 OP1(SLJIT_MOV_U32, SLJIT_R4, 0, SLJIT_IMM, ic.x);
1082
1083 if (diff == 1) {
1084 if (char1a == char1b && char2a == char2b) {
1085 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1086 if (common->utf)
1087 sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
1088 SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_0_utf));
1089 else
1090 #endif
1091 sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
1092 SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_0));
1093 } else {
1094 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1095 if (common->utf)
1096 sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
1097 SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_1_utf));
1098 else
1099 #endif
1100 sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
1101 SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_1));
1102 }
1103 } else {
1104 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1105 if (common->utf)
1106 sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
1107 SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_default_utf));
1108 else
1109 #endif
1110 sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
1111 SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_default));
1112 }
1113
1114 /* Restore STR_PTR register. */
1115 OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), LOCALS0);
1116
1117 /* Check return value. */
1118 partial_quit = CMP(SLJIT_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0);
1119 add_jump(compiler, &common->failed_match, partial_quit);
1120
1121 /* Fast forward STR_PTR to the result of memchr. */
1122 OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_RETURN_REG, 0);
1123
1124 JUMPHERE(partial_quit);
1125 }
1126
1127 #endif /* SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64 */
1128
1129 #if (defined SLJIT_CONFIG_S390X && SLJIT_CONFIG_S390X)
1130
1131 #if PCRE2_CODE_UNIT_WIDTH == 8
1132 #define VECTOR_ELEMENT_SIZE 0
1133 #elif PCRE2_CODE_UNIT_WIDTH == 16
1134 #define VECTOR_ELEMENT_SIZE 1
1135 #elif PCRE2_CODE_UNIT_WIDTH == 32
1136 #define VECTOR_ELEMENT_SIZE 2
1137 #else
1138 #error "Unsupported unit width"
1139 #endif
1140
/* Emits a six byte custom instruction that loads a vector register from the
address formed by base_reg and index_reg.

  compiler   the sljit compiler that receives the instruction
  vlbb       TRUE selects the form with final opcode byte 0x07, FALSE the one
             with 0x06 (NOTE(review): presumably VLBB and VL - confirm against
             the z/Architecture Principles of Operation)
  dst_vreg   index of the destination vector register
  base_reg   machine index of the base register
  index_reg  machine index of the index register; callers pass 0 when no
             index is used */
static void load_from_mem_vector(struct sljit_compiler *compiler, BOOL vlbb, sljit_s32 dst_vreg,
  sljit_s32 base_reg, sljit_s32 index_reg)
{
sljit_u16 insn[3];
/* The two load forms differ only in the last opcode byte. */
sljit_u16 opcode_tail = vlbb ? 0x07 : 0x06;

insn[0] = (sljit_u16)(0xe700 | (dst_vreg << 4) | index_reg);
insn[1] = (sljit_u16)(base_reg << 12);
insn[2] = (sljit_u16)((0x8 << 8) | opcode_tail);

sljit_emit_op_custom(compiler, insn, 6);
}
1152
#if PCRE2_CODE_UNIT_WIDTH == 32

/* Emits code that fills every element of a vector register with chr. The
work is split into two steps so that callers can interleave the steps of
several registers: the caller invokes the function with step 0 and then with
step 1.

  compiler         the sljit compiler that receives the instructions
  step             0 or 1
  dst_vreg         index of the vector register to fill
  chr              the character to replicate
  tmp_general_reg  sljit register used as a temporary when chr is not emitted
                   as an immediate

For chr < 0x7fff a single VREPI is emitted in step 0 and step 1 does nothing.
Otherwise step 0 moves chr into tmp_general_reg and emits VLVG, and step 1
emits VREP. */
static void replicate_imm_vector(struct sljit_compiler *compiler, int step, sljit_s32 dst_vreg,
  PCRE2_UCHAR chr, sljit_s32 tmp_general_reg)
{
sljit_u16 insn[3];

SLJIT_ASSERT(step >= 0 && step <= 1);

if (chr < 0x7fff)
  {
  /* The value is emitted as an immediate; everything happens in one step. */
  if (step != 1)
    {
    /* VREPI */
    insn[0] = (sljit_u16)(0xe700 | (dst_vreg << 4));
    insn[1] = (sljit_u16)chr;
    insn[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45);
    sljit_emit_op_custom(compiler, insn, 6);
    }
  return;
  }

if (step != 0)
  {
  /* VREP */
  insn[0] = (sljit_u16)(0xe700 | (dst_vreg << 4) | dst_vreg);
  insn[1] = 0;
  insn[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xc << 8) | 0x4d);
  sljit_emit_op_custom(compiler, insn, 6);
  return;
  }

/* Step 0 of the long form: the value goes through a general register. */
OP1(SLJIT_MOV, tmp_general_reg, 0, SLJIT_IMM, chr);

/* VLVG */
insn[0] = (sljit_u16)(0xe700 | (dst_vreg << 4) | sljit_get_register_index(tmp_general_reg));
insn[1] = 0;
insn[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x22);
sljit_emit_op_custom(compiler, insn, 6);
}

#endif
1195
/* Emits one step of the vector comparison of dst_ind against the comparison
operands. Callers run steps 0, 1 and 2 in order (or only the steps they need)
so that the steps of two independent comparisons can be interleaved.

  compiler      the sljit compiler that receives the instruction
  compare_type  vector_compare_match1, vector_compare_match1i or
                vector_compare_match2
  step          0, 1 or 2
  dst_ind       vector register holding the data and receiving the result
  cmp1_ind      vector register with the first comparison operand
  cmp2_ind      vector register with the second comparison operand
  tmp_ind       scratch vector register (used by vector_compare_match2)

Instructions emitted per step:
  step 0: match1i: VO with cmp2 into dst; match2: VCEQ of dst and cmp2 into
          tmp; match1: nothing
  step 1: VCEQ of dst and cmp1 into dst (all types)
  step 2: match2: VO with tmp into dst; other types: nothing */
static void fast_forward_char_pair_sse2_compare(struct sljit_compiler *compiler, vector_compare_type compare_type,
  int step, sljit_s32 dst_ind, sljit_s32 cmp1_ind, sljit_s32 cmp2_ind, sljit_s32 tmp_ind)
{
sljit_u16 insn[3];

SLJIT_ASSERT(step >= 0 && step <= 2);

switch (step)
  {
  case 0:
  if (compare_type == vector_compare_match1i)
    {
    /* VO */
    insn[0] = (sljit_u16)(0xe700 | (dst_ind << 4) | dst_ind);
    insn[1] = (sljit_u16)(cmp2_ind << 12);
    insn[2] = (sljit_u16)((0xe << 8) | 0x6a);
    }
  else if (compare_type == vector_compare_match2)
    {
    /* VCEQ */
    insn[0] = (sljit_u16)(0xe700 | (tmp_ind << 4) | dst_ind);
    insn[1] = (sljit_u16)(cmp2_ind << 12);
    insn[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0xf8);
    }
  else
    return;
  break;

  case 1:
  /* VCEQ */
  insn[0] = (sljit_u16)(0xe700 | (dst_ind << 4) | dst_ind);
  insn[1] = (sljit_u16)(cmp1_ind << 12);
  insn[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0xf8);
  break;

  case 2:
  if (compare_type != vector_compare_match2)
    return;

  /* VO */
  insn[0] = (sljit_u16)(0xe700 | (dst_ind << 4) | dst_ind);
  insn[1] = (sljit_u16)(tmp_ind << 12);
  insn[2] = (sljit_u16)((0xe << 8) | 0x6a);
  break;

  default:
  return;
  }

sljit_emit_op_custom(compiler, insn, 6);
}
1245
1246 #define JIT_HAS_FAST_FORWARD_CHAR_SIMD 1
1247
fast_forward_char_simd(compiler_common * common,PCRE2_UCHAR char1,PCRE2_UCHAR char2,sljit_s32 offset)1248 static void fast_forward_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_s32 offset)
1249 {
1250 DEFINE_COMPILER;
1251 sljit_u16 instruction[3];
1252 struct sljit_label *start;
1253 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1254 struct sljit_label *restart;
1255 #endif
1256 struct sljit_jump *quit;
1257 struct sljit_jump *partial_quit[2];
1258 vector_compare_type compare_type = vector_compare_match1;
1259 sljit_s32 tmp1_reg_ind = sljit_get_register_index(TMP1);
1260 sljit_s32 str_ptr_reg_ind = sljit_get_register_index(STR_PTR);
1261 sljit_s32 data_ind = 0;
1262 sljit_s32 tmp_ind = 1;
1263 sljit_s32 cmp1_ind = 2;
1264 sljit_s32 cmp2_ind = 3;
1265 sljit_s32 zero_ind = 4;
1266 sljit_u32 bit = 0;
1267 int i;
1268
1269 SLJIT_UNUSED_ARG(offset);
1270
1271 if (char1 != char2)
1272 {
1273 bit = char1 ^ char2;
1274 compare_type = vector_compare_match1i;
1275
1276 if (!is_powerof2(bit))
1277 {
1278 bit = 0;
1279 compare_type = vector_compare_match2;
1280 }
1281 }
1282
1283 partial_quit[0] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
1284 if (common->mode == PCRE2_JIT_COMPLETE)
1285 add_jump(compiler, &common->failed_match, partial_quit[0]);
1286
1287 /* First part (unaligned start) */
1288
1289 OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 16);
1290
1291 #if PCRE2_CODE_UNIT_WIDTH != 32
1292
1293 /* VREPI */
1294 instruction[0] = (sljit_u16)(0xe700 | (cmp1_ind << 4));
1295 instruction[1] = (sljit_u16)(char1 | bit);
1296 instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45);
1297 sljit_emit_op_custom(compiler, instruction, 6);
1298
1299 if (char1 != char2)
1300 {
1301 /* VREPI */
1302 instruction[0] = (sljit_u16)(0xe700 | (cmp2_ind << 4));
1303 instruction[1] = (sljit_u16)(bit != 0 ? bit : char2);
1304 /* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */
1305 sljit_emit_op_custom(compiler, instruction, 6);
1306 }
1307
1308 #else /* PCRE2_CODE_UNIT_WIDTH == 32 */
1309
1310 for (int i = 0; i < 2; i++)
1311 {
1312 replicate_imm_vector(compiler, i, cmp1_ind, char1 | bit, TMP1);
1313
1314 if (char1 != char2)
1315 replicate_imm_vector(compiler, i, cmp2_ind, bit != 0 ? bit : char2, TMP1);
1316 }
1317
1318 #endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
1319
1320 if (compare_type == vector_compare_match2)
1321 {
1322 /* VREPI */
1323 instruction[0] = (sljit_u16)(0xe700 | (zero_ind << 4));
1324 instruction[1] = 0;
1325 instruction[2] = (sljit_u16)((0x8 << 8) | 0x45);
1326 sljit_emit_op_custom(compiler, instruction, 6);
1327 }
1328
1329 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1330 restart = LABEL();
1331 #endif
1332
1333 load_from_mem_vector(compiler, TRUE, data_ind, str_ptr_reg_ind, 0);
1334 OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, ~15);
1335
1336 if (compare_type != vector_compare_match2)
1337 {
1338 if (compare_type == vector_compare_match1i)
1339 fast_forward_char_pair_sse2_compare(compiler, compare_type, 0, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1340
1341 /* VFEE */
1342 instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1343 instruction[1] = (sljit_u16)((cmp1_ind << 12) | (1 << 4));
1344 instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0x80);
1345 sljit_emit_op_custom(compiler, instruction, 6);
1346 }
1347 else
1348 {
1349 for (i = 0; i < 3; i++)
1350 fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1351
1352 /* VFENE */
1353 instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1354 instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
1355 instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
1356 sljit_emit_op_custom(compiler, instruction, 6);
1357 }
1358
1359 /* VLGVB */
1360 instruction[0] = (sljit_u16)(0xe700 | (tmp1_reg_ind << 4) | data_ind);
1361 instruction[1] = 7;
1362 instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
1363 sljit_emit_op_custom(compiler, instruction, 6);
1364
1365 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
1366 quit = CMP(SLJIT_LESS, STR_PTR, 0, TMP2, 0);
1367
1368 OP2(SLJIT_SUB, STR_PTR, 0, TMP2, 0, SLJIT_IMM, 16);
1369
1370 /* Second part (aligned) */
1371 start = LABEL();
1372
1373 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);
1374
1375 partial_quit[1] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
1376 if (common->mode == PCRE2_JIT_COMPLETE)
1377 add_jump(compiler, &common->failed_match, partial_quit[1]);
1378
1379 load_from_mem_vector(compiler, TRUE, data_ind, str_ptr_reg_ind, 0);
1380
1381 if (compare_type != vector_compare_match2)
1382 {
1383 if (compare_type == vector_compare_match1i)
1384 fast_forward_char_pair_sse2_compare(compiler, compare_type, 0, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1385
1386 /* VFEE */
1387 instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1388 instruction[1] = (sljit_u16)((cmp1_ind << 12) | (1 << 4));
1389 instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0x80);
1390 sljit_emit_op_custom(compiler, instruction, 6);
1391 }
1392 else
1393 {
1394 for (i = 0; i < 3; i++)
1395 fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1396
1397 /* VFENE */
1398 instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1399 instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
1400 instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
1401 sljit_emit_op_custom(compiler, instruction, 6);
1402 }
1403
1404 sljit_set_current_flags(compiler, SLJIT_SET_OVERFLOW);
1405 JUMPTO(SLJIT_OVERFLOW, start);
1406
1407 /* VLGVB */
1408 instruction[0] = (sljit_u16)(0xe700 | (tmp1_reg_ind << 4) | data_ind);
1409 instruction[1] = 7;
1410 instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
1411 sljit_emit_op_custom(compiler, instruction, 6);
1412
1413 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
1414
1415 JUMPHERE(quit);
1416
1417 if (common->mode != PCRE2_JIT_COMPLETE)
1418 {
1419 JUMPHERE(partial_quit[0]);
1420 JUMPHERE(partial_quit[1]);
1421 OP2U(SLJIT_SUB | SLJIT_SET_GREATER, STR_PTR, 0, STR_END, 0);
1422 CMOV(SLJIT_GREATER, STR_PTR, STR_END, 0);
1423 }
1424 else
1425 add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
1426
1427 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1428 if (common->utf && offset > 0)
1429 {
1430 SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE);
1431
1432 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offset));
1433
1434 quit = jump_if_utf_char_start(compiler, TMP1);
1435
1436 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
1437 add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
1438
1439 OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 16);
1440 JUMPTO(SLJIT_JUMP, restart);
1441
1442 JUMPHERE(quit);
1443 }
1444 #endif
1445 }
1446
1447 #define JIT_HAS_FAST_REQUESTED_CHAR_SIMD 1
1448
fast_requested_char_simd(compiler_common * common,PCRE2_UCHAR char1,PCRE2_UCHAR char2)1449 static jump_list *fast_requested_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2)
1450 {
1451 DEFINE_COMPILER;
1452 sljit_u16 instruction[3];
1453 struct sljit_label *start;
1454 struct sljit_jump *quit;
1455 jump_list *not_found = NULL;
1456 vector_compare_type compare_type = vector_compare_match1;
1457 sljit_s32 tmp1_reg_ind = sljit_get_register_index(TMP1);
1458 sljit_s32 tmp3_reg_ind = sljit_get_register_index(TMP3);
1459 sljit_s32 data_ind = 0;
1460 sljit_s32 tmp_ind = 1;
1461 sljit_s32 cmp1_ind = 2;
1462 sljit_s32 cmp2_ind = 3;
1463 sljit_s32 zero_ind = 4;
1464 sljit_u32 bit = 0;
1465 int i;
1466
1467 if (char1 != char2)
1468 {
1469 bit = char1 ^ char2;
1470 compare_type = vector_compare_match1i;
1471
1472 if (!is_powerof2(bit))
1473 {
1474 bit = 0;
1475 compare_type = vector_compare_match2;
1476 }
1477 }
1478
1479 add_jump(compiler, ¬_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
1480
1481 /* First part (unaligned start) */
1482
1483 OP2(SLJIT_ADD, TMP2, 0, TMP1, 0, SLJIT_IMM, 16);
1484
1485 #if PCRE2_CODE_UNIT_WIDTH != 32
1486
1487 /* VREPI */
1488 instruction[0] = (sljit_u16)(0xe700 | (cmp1_ind << 4));
1489 instruction[1] = (sljit_u16)(char1 | bit);
1490 instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45);
1491 sljit_emit_op_custom(compiler, instruction, 6);
1492
1493 if (char1 != char2)
1494 {
1495 /* VREPI */
1496 instruction[0] = (sljit_u16)(0xe700 | (cmp2_ind << 4));
1497 instruction[1] = (sljit_u16)(bit != 0 ? bit : char2);
1498 /* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */
1499 sljit_emit_op_custom(compiler, instruction, 6);
1500 }
1501
1502 #else /* PCRE2_CODE_UNIT_WIDTH == 32 */
1503
1504 for (int i = 0; i < 2; i++)
1505 {
1506 replicate_imm_vector(compiler, i, cmp1_ind, char1 | bit, TMP3);
1507
1508 if (char1 != char2)
1509 replicate_imm_vector(compiler, i, cmp2_ind, bit != 0 ? bit : char2, TMP3);
1510 }
1511
1512 #endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
1513
1514 if (compare_type == vector_compare_match2)
1515 {
1516 /* VREPI */
1517 instruction[0] = (sljit_u16)(0xe700 | (zero_ind << 4));
1518 instruction[1] = 0;
1519 instruction[2] = (sljit_u16)((0x8 << 8) | 0x45);
1520 sljit_emit_op_custom(compiler, instruction, 6);
1521 }
1522
1523 load_from_mem_vector(compiler, TRUE, data_ind, tmp1_reg_ind, 0);
1524 OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, ~15);
1525
1526 if (compare_type != vector_compare_match2)
1527 {
1528 if (compare_type == vector_compare_match1i)
1529 fast_forward_char_pair_sse2_compare(compiler, compare_type, 0, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1530
1531 /* VFEE */
1532 instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1533 instruction[1] = (sljit_u16)((cmp1_ind << 12) | (1 << 4));
1534 instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0x80);
1535 sljit_emit_op_custom(compiler, instruction, 6);
1536 }
1537 else
1538 {
1539 for (i = 0; i < 3; i++)
1540 fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1541
1542 /* VFENE */
1543 instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1544 instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
1545 instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
1546 sljit_emit_op_custom(compiler, instruction, 6);
1547 }
1548
1549 /* VLGVB */
1550 instruction[0] = (sljit_u16)(0xe700 | (tmp3_reg_ind << 4) | data_ind);
1551 instruction[1] = 7;
1552 instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
1553 sljit_emit_op_custom(compiler, instruction, 6);
1554
1555 OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP3, 0);
1556 quit = CMP(SLJIT_LESS, TMP1, 0, TMP2, 0);
1557
1558 OP2(SLJIT_SUB, TMP1, 0, TMP2, 0, SLJIT_IMM, 16);
1559
1560 /* Second part (aligned) */
1561 start = LABEL();
1562
1563 OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 16);
1564
1565 add_jump(compiler, ¬_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
1566
1567 load_from_mem_vector(compiler, TRUE, data_ind, tmp1_reg_ind, 0);
1568
1569 if (compare_type != vector_compare_match2)
1570 {
1571 if (compare_type == vector_compare_match1i)
1572 fast_forward_char_pair_sse2_compare(compiler, compare_type, 0, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1573
1574 /* VFEE */
1575 instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1576 instruction[1] = (sljit_u16)((cmp1_ind << 12) | (1 << 4));
1577 instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0x80);
1578 sljit_emit_op_custom(compiler, instruction, 6);
1579 }
1580 else
1581 {
1582 for (i = 0; i < 3; i++)
1583 fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1584
1585 /* VFENE */
1586 instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1587 instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
1588 instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
1589 sljit_emit_op_custom(compiler, instruction, 6);
1590 }
1591
1592 sljit_set_current_flags(compiler, SLJIT_SET_OVERFLOW);
1593 JUMPTO(SLJIT_OVERFLOW, start);
1594
1595 /* VLGVB */
1596 instruction[0] = (sljit_u16)(0xe700 | (tmp3_reg_ind << 4) | data_ind);
1597 instruction[1] = 7;
1598 instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
1599 sljit_emit_op_custom(compiler, instruction, 6);
1600
1601 OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP3, 0);
1602
1603 JUMPHERE(quit);
1604 add_jump(compiler, ¬_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
1605
1606 return not_found;
1607 }
1608
1609 #define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD 1
1610
fast_forward_char_pair_simd(compiler_common * common,sljit_s32 offs1,PCRE2_UCHAR char1a,PCRE2_UCHAR char1b,sljit_s32 offs2,PCRE2_UCHAR char2a,PCRE2_UCHAR char2b)1611 static void fast_forward_char_pair_simd(compiler_common *common, sljit_s32 offs1,
1612 PCRE2_UCHAR char1a, PCRE2_UCHAR char1b, sljit_s32 offs2, PCRE2_UCHAR char2a, PCRE2_UCHAR char2b)
1613 {
1614 DEFINE_COMPILER;
1615 sljit_u16 instruction[3];
1616 struct sljit_label *start;
1617 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1618 struct sljit_label *restart;
1619 #endif
1620 struct sljit_jump *quit;
1621 struct sljit_jump *jump[2];
1622 vector_compare_type compare1_type = vector_compare_match1;
1623 vector_compare_type compare2_type = vector_compare_match1;
1624 sljit_u32 bit1 = 0;
1625 sljit_u32 bit2 = 0;
1626 sljit_s32 diff = IN_UCHARS(offs2 - offs1);
1627 sljit_s32 tmp1_reg_ind = sljit_get_register_index(TMP1);
1628 sljit_s32 tmp2_reg_ind = sljit_get_register_index(TMP2);
1629 sljit_s32 str_ptr_reg_ind = sljit_get_register_index(STR_PTR);
1630 sljit_s32 data1_ind = 0;
1631 sljit_s32 data2_ind = 1;
1632 sljit_s32 tmp1_ind = 2;
1633 sljit_s32 tmp2_ind = 3;
1634 sljit_s32 cmp1a_ind = 4;
1635 sljit_s32 cmp1b_ind = 5;
1636 sljit_s32 cmp2a_ind = 6;
1637 sljit_s32 cmp2b_ind = 7;
1638 sljit_s32 zero_ind = 8;
1639 int i;
1640
1641 SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE && offs1 > offs2);
1642 SLJIT_ASSERT(-diff <= (sljit_s32)IN_UCHARS(max_fast_forward_char_pair_offset()));
1643 SLJIT_ASSERT(tmp1_reg_ind != 0 && tmp2_reg_ind != 0);
1644
1645 if (char1a != char1b)
1646 {
1647 bit1 = char1a ^ char1b;
1648 compare1_type = vector_compare_match1i;
1649
1650 if (!is_powerof2(bit1))
1651 {
1652 bit1 = 0;
1653 compare1_type = vector_compare_match2;
1654 }
1655 }
1656
1657 if (char2a != char2b)
1658 {
1659 bit2 = char2a ^ char2b;
1660 compare2_type = vector_compare_match1i;
1661
1662 if (!is_powerof2(bit2))
1663 {
1664 bit2 = 0;
1665 compare2_type = vector_compare_match2;
1666 }
1667 }
1668
1669 /* Initialize. */
1670 if (common->match_end_ptr != 0)
1671 {
1672 OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr);
1673 OP1(SLJIT_MOV, TMP3, 0, STR_END, 0);
1674 OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1));
1675
1676 OP2U(SLJIT_SUB | SLJIT_SET_LESS, TMP1, 0, STR_END, 0);
1677 CMOV(SLJIT_LESS, STR_END, TMP1, 0);
1678 }
1679
1680 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));
1681 add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
1682 OP2(SLJIT_AND, TMP2, 0, STR_PTR, 0, SLJIT_IMM, ~15);
1683
1684 #if PCRE2_CODE_UNIT_WIDTH != 32
1685
1686 OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, -diff);
1687
1688 /* VREPI */
1689 instruction[0] = (sljit_u16)(0xe700 | (cmp1a_ind << 4));
1690 instruction[1] = (sljit_u16)(char1a | bit1);
1691 instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45);
1692 sljit_emit_op_custom(compiler, instruction, 6);
1693
1694 if (char1a != char1b)
1695 {
1696 /* VREPI */
1697 instruction[0] = (sljit_u16)(0xe700 | (cmp1b_ind << 4));
1698 instruction[1] = (sljit_u16)(bit1 != 0 ? bit1 : char1b);
1699 /* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */
1700 sljit_emit_op_custom(compiler, instruction, 6);
1701 }
1702
1703 /* VREPI */
1704 instruction[0] = (sljit_u16)(0xe700 | (cmp2a_ind << 4));
1705 instruction[1] = (sljit_u16)(char2a | bit2);
1706 /* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */
1707 sljit_emit_op_custom(compiler, instruction, 6);
1708
1709 if (char2a != char2b)
1710 {
1711 /* VREPI */
1712 instruction[0] = (sljit_u16)(0xe700 | (cmp2b_ind << 4));
1713 instruction[1] = (sljit_u16)(bit2 != 0 ? bit2 : char2b);
1714 /* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */
1715 sljit_emit_op_custom(compiler, instruction, 6);
1716 }
1717
1718 #else /* PCRE2_CODE_UNIT_WIDTH == 32 */
1719
1720 for (int i = 0; i < 2; i++)
1721 {
1722 replicate_imm_vector(compiler, i, cmp1a_ind, char1a | bit1, TMP1);
1723
1724 if (char1a != char1b)
1725 replicate_imm_vector(compiler, i, cmp1b_ind, bit1 != 0 ? bit1 : char1b, TMP1);
1726
1727 replicate_imm_vector(compiler, i, cmp2a_ind, char2a | bit2, TMP1);
1728
1729 if (char2a != char2b)
1730 replicate_imm_vector(compiler, i, cmp2b_ind, bit2 != 0 ? bit2 : char2b, TMP1);
1731 }
1732
1733 OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, -diff);
1734
1735 #endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
1736
1737 /* VREPI */
1738 instruction[0] = (sljit_u16)(0xe700 | (zero_ind << 4));
1739 instruction[1] = 0;
1740 instruction[2] = (sljit_u16)((0x8 << 8) | 0x45);
1741 sljit_emit_op_custom(compiler, instruction, 6);
1742
1743 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1744 restart = LABEL();
1745 #endif
1746
1747 jump[0] = CMP(SLJIT_LESS, TMP1, 0, TMP2, 0);
1748 load_from_mem_vector(compiler, TRUE, data2_ind, tmp1_reg_ind, 0);
1749 jump[1] = JUMP(SLJIT_JUMP);
1750 JUMPHERE(jump[0]);
1751 load_from_mem_vector(compiler, FALSE, data2_ind, tmp1_reg_ind, 0);
1752 JUMPHERE(jump[1]);
1753
1754 load_from_mem_vector(compiler, TRUE, data1_ind, str_ptr_reg_ind, 0);
1755 OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, 16);
1756
1757 for (i = 0; i < 3; i++)
1758 {
1759 fast_forward_char_pair_sse2_compare(compiler, compare1_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind);
1760 fast_forward_char_pair_sse2_compare(compiler, compare2_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp2_ind);
1761 }
1762
1763 /* VN */
1764 instruction[0] = (sljit_u16)(0xe700 | (data1_ind << 4) | data1_ind);
1765 instruction[1] = (sljit_u16)(data2_ind << 12);
1766 instruction[2] = (sljit_u16)((0xe << 8) | 0x68);
1767 sljit_emit_op_custom(compiler, instruction, 6);
1768
1769 /* VFENE */
1770 instruction[0] = (sljit_u16)(0xe700 | (data1_ind << 4) | data1_ind);
1771 instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
1772 instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
1773 sljit_emit_op_custom(compiler, instruction, 6);
1774
1775 /* VLGVB */
1776 instruction[0] = (sljit_u16)(0xe700 | (tmp1_reg_ind << 4) | data1_ind);
1777 instruction[1] = 7;
1778 instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
1779 sljit_emit_op_custom(compiler, instruction, 6);
1780
1781 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
1782 quit = CMP(SLJIT_LESS, STR_PTR, 0, TMP2, 0);
1783
1784 OP2(SLJIT_SUB, STR_PTR, 0, TMP2, 0, SLJIT_IMM, 16);
1785 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, diff);
1786
1787 /* Main loop. */
1788 start = LABEL();
1789
1790 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);
1791 add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
1792
1793 load_from_mem_vector(compiler, FALSE, data1_ind, str_ptr_reg_ind, 0);
1794 load_from_mem_vector(compiler, FALSE, data2_ind, str_ptr_reg_ind, tmp1_reg_ind);
1795
1796 for (i = 0; i < 3; i++)
1797 {
1798 fast_forward_char_pair_sse2_compare(compiler, compare1_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind);
1799 fast_forward_char_pair_sse2_compare(compiler, compare2_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp2_ind);
1800 }
1801
1802 /* VN */
1803 instruction[0] = (sljit_u16)(0xe700 | (data1_ind << 4) | data1_ind);
1804 instruction[1] = (sljit_u16)(data2_ind << 12);
1805 instruction[2] = (sljit_u16)((0xe << 8) | 0x68);
1806 sljit_emit_op_custom(compiler, instruction, 6);
1807
1808 /* VFENE */
1809 instruction[0] = (sljit_u16)(0xe700 | (data1_ind << 4) | data1_ind);
1810 instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
1811 instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
1812 sljit_emit_op_custom(compiler, instruction, 6);
1813
1814 sljit_set_current_flags(compiler, SLJIT_SET_OVERFLOW);
1815 JUMPTO(SLJIT_OVERFLOW, start);
1816
1817 /* VLGVB */
1818 instruction[0] = (sljit_u16)(0xe700 | (tmp2_reg_ind << 4) | data1_ind);
1819 instruction[1] = 7;
1820 instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
1821 sljit_emit_op_custom(compiler, instruction, 6);
1822
1823 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
1824
1825 JUMPHERE(quit);
1826
1827 add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
1828
1829 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1830 if (common->utf)
1831 {
1832 SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE);
1833
1834 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offs1));
1835
1836 quit = jump_if_utf_char_start(compiler, TMP1);
1837
1838 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
1839 add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
1840
1841 /* TMP1 contains diff. */
1842 OP2(SLJIT_AND, TMP2, 0, STR_PTR, 0, SLJIT_IMM, ~15);
1843 OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, -diff);
1844 JUMPTO(SLJIT_JUMP, restart);
1845
1846 JUMPHERE(quit);
1847 }
1848 #endif
1849
1850 OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));
1851
1852 if (common->match_end_ptr != 0)
1853 OP1(SLJIT_MOV, STR_END, 0, TMP3, 0);
1854 }
1855
1856 #endif /* SLJIT_CONFIG_S390X */
1857
1858 #endif /* !SUPPORT_VALGRIND */
1859