# regular expression test set
# Each line has at least three fields, separated by one or more tabs.  ""
# stands for an empty field.  The first field is an RE.  The second field is
# the flags.  If the C flag is given, regcomp() is expected to fail, and the
# third field is the error name (minus the leading REG_).
#
# Otherwise it is expected to succeed, and the third field is the string to
# try matching it against.  If there is no fourth field, the match is
# expected to fail.  If there is a fourth field, it is the substring that
# the RE is expected to match.  If there is a fifth field, it is a comma-
# separated list of what the subexpressions should match, with - indicating
# no match for that one.  In both the fourth and fifth fields, a (sub)field
# starting with @ indicates that the (sub)expression is expected to match
# a null string followed by the stuff after the @; this provides a way to
# test where null strings match.  The character `N' in REs and strings
# is newline, `S' is space, `T' is tab, `Z' is NUL.
#
# The full list of flags:
#	-	placeholder, does nothing
#	b	RE is a BRE, not an ERE
#	&	try it as both an ERE and a BRE
#	C	regcomp() error expected, third field is error name
#	i	REG_ICASE
#	m	("mundane") REG_NOSPEC
#	s	REG_NOSUB (not really testable)
#	n	REG_NEWLINE
#	^	REG_NOTBOL
#	$	REG_NOTEOL
#	#	REG_STARTEND (see below)
#	p	REG_PEND
#
# For REG_STARTEND, the start/end offsets are those of the substring
# enclosed in ().
34 35# basics 36a & a a 37abc & abc abc 38abc|de - abc abc 39a|b|c - abc a 40 41# parentheses and perversions thereof 42a(b)c - abc abc 43a\(b\)c b abc abc 44a( C EPAREN 45a( b a( a( 46a\( - a( a( 47a\( bC EPAREN 48a\(b bC EPAREN 49a(b C EPAREN 50a(b b a(b a(b 51# gag me with a right parenthesis -- 1003.2 goofed here (my fault, partly) 52a) - a) a) 53) - ) ) 54# end gagging (in a just world, those *should* give EPAREN) 55a) b a) a) 56a\) bC EPAREN 57\) bC EPAREN 58a()b - ab ab 59a\(\)b b ab ab 60 61# anchoring and REG_NEWLINE 62^abc$ & abc abc 63a^b - a^b 64a^b b a^b a^b 65a$b - a$b 66a$b b a$b a$b 67^ & abc @abc 68$ & abc @ 69^$ & "" @ 70$^ - "" @ 71\($\)\(^\) b "" @ 72# stop retching, those are legitimate (although disgusting) 73^^ - "" @ 74$$ - "" @ 75b$ & abNc 76b$ &n abNc b 77^b$ & aNbNc 78^b$ &n aNbNc b 79^$ &n aNNb @Nb 80^$ n abc 81^$ n abcN @ 82$^ n aNNb @Nb 83\($\)\(^\) bn aNNb @Nb 84^^ n^ aNNb @Nb 85$$ n aNNb @NN 86^a ^ a 87a$ $ a 88^a ^n aNb 89^b ^n aNb b 90a$ $n bNa 91b$ $n bNa b 92a*(^b$)c* - b b 93a*\(^b$\)c* b b b 94 95# certain syntax errors and non-errors 96| C EMPTY 97| b | | 98* C BADRPT 99* b * * 100+ C BADRPT 101? C BADRPT 102"" &C EMPTY 103() - abc @abc 104\(\) b abc @abc 105a||b C EMPTY 106|ab C EMPTY 107ab| C EMPTY 108(|a)b C EMPTY 109(a|)b C EMPTY 110(*a) C BADRPT 111(+a) C BADRPT 112(?a) C BADRPT 113({1}a) C BADRPT 114\(\{1\}a\) bC BADRPT 115(a|*b) C BADRPT 116(a|+b) C BADRPT 117(a|?b) C BADRPT 118(a|{1}b) C BADRPT 119^* C BADRPT 120^* b * * 121^+ C BADRPT 122^? 
C BADRPT 123^{1} C BADRPT 124^\{1\} bC BADRPT 125 126# metacharacters, backslashes 127a.c & abc abc 128a[bc]d & abd abd 129a\*c & a*c a*c 130a\\b & a\b a\b 131a\\\*b & a\*b a\*b 132a\bc & abc abc 133a\ &C EESCAPE 134a\\bc & a\bc a\bc 135\{ bC BADRPT 136a\[b & a[b a[b 137a[b &C EBRACK 138# trailing $ is a peculiar special case for the BRE code 139a$ & a a 140a$ & a$ 141a\$ & a 142a\$ & a$ a$ 143a\\$ & a 144a\\$ & a$ 145a\\$ & a\$ 146a\\$ & a\ a\ 147 148# back references, ugh 149a\(b\)\2c bC ESUBREG 150a\(b\1\)c bC ESUBREG 151a\(b*\)c\1d b abbcbbd abbcbbd bb 152a\(b*\)c\1d b abbcbd 153a\(b*\)c\1d b abbcbbbd 154^\(.\)\1 b abc 155a\([bc]\)\1d b abcdabbd abbd b 156a\(\([bc]\)\2\)*d b abbccd abbccd 157a\(\([bc]\)\2\)*d b abbcbd 158# actually, this next one probably ought to fail, but the spec is unclear 159a\(\(b\)*\2\)*d b abbbd abbbd 160# here is a case that no NFA implementation does right 161\(ab*\)[ab]*\1 b ababaaa ababaaa a 162# check out normal matching in the presence of back refs 163\(a\)\1bcd b aabcd aabcd 164\(a\)\1bc*d b aabcd aabcd 165\(a\)\1bc*d b aabd aabd 166\(a\)\1bc*d b aabcccd aabcccd 167\(a\)\1bc*[ce]d b aabcccd aabcccd 168^\(a\)\1b\(c\)*cd$ b aabcccd aabcccd 169 170# ordinary repetitions 171ab*c & abc abc 172ab+c - abc abc 173ab?c - abc abc 174a\(*\)b b a*b a*b 175a\(**\)b b ab ab 176a\(***\)b bC BADRPT 177*a b *a *a 178**a b a a 179***a bC BADRPT 180 181# the dreaded bounded repetitions 182{ & { { 183{abc & {abc {abc 184{1 C BADRPT 185{1} C BADRPT 186a{b & a{b a{b 187a{1}b - ab ab 188a\{1\}b b ab ab 189a{1,}b - ab ab 190a\{1,\}b b ab ab 191a{1,2}b - aab aab 192a\{1,2\}b b aab aab 193a{1 C EBRACE 194a\{1 bC EBRACE 195a{1a C EBRACE 196a\{1a bC EBRACE 197a{1a} C BADBR 198a\{1a\} bC BADBR 199a{,2} - a{,2} a{,2} 200a\{,2\} bC BADBR 201a{,} - a{,} a{,} 202a\{,\} bC BADBR 203a{1,x} C BADBR 204a\{1,x\} bC BADBR 205a{1,x C EBRACE 206a\{1,x bC EBRACE 207a{300} C BADBR 208a\{300\} bC BADBR 209a{1,0} C BADBR 210a\{1,0\} bC BADBR 211ab{0,0}c - abcac ac 
212ab\{0,0\}c b abcac ac 213ab{0,1}c - abcac abc 214ab\{0,1\}c b abcac abc 215ab{0,3}c - abbcac abbc 216ab\{0,3\}c b abbcac abbc 217ab{1,1}c - acabc abc 218ab\{1,1\}c b acabc abc 219ab{1,3}c - acabc abc 220ab\{1,3\}c b acabc abc 221ab{2,2}c - abcabbc abbc 222ab\{2,2\}c b abcabbc abbc 223ab{2,4}c - abcabbc abbc 224ab\{2,4\}c b abcabbc abbc 225((a{1,10}){1,10}){1,10} - a a a,a 226 227# multiple repetitions 228a** &C BADRPT 229a++ C BADRPT 230a?? C BADRPT 231a*+ C BADRPT 232a*? C BADRPT 233a+* C BADRPT 234a+? C BADRPT 235a?* C BADRPT 236a?+ C BADRPT 237a{1}{1} C BADRPT 238a*{1} C BADRPT 239a+{1} C BADRPT 240a?{1} C BADRPT 241a{1}* C BADRPT 242a{1}+ C BADRPT 243a{1}? C BADRPT 244a*{b} - a{b} a{b} 245a\{1\}\{1\} bC BADRPT 246a*\{1\} bC BADRPT 247a\{1\}* bC BADRPT 248 249# brackets, and numerous perversions thereof 250a[b]c & abc abc 251a[ab]c & abc abc 252a[^ab]c & adc adc 253a[]b]c & a]c a]c 254a[[b]c & a[c a[c 255a[-b]c & a-c a-c 256a[^]b]c & adc adc 257a[^-b]c & adc adc 258a[b-]c & a-c a-c 259a[b &C EBRACK 260a[] &C EBRACK 261a[1-3]c & a2c a2c 262a[3-1]c &C ERANGE 263a[1-3-5]c &C ERANGE 264a[[.-.]--]c & a-c a-c 265a[1- &C ERANGE 266a[[. &C EBRACK 267a[[.x &C EBRACK 268a[[.x. &C EBRACK 269a[[.x.] 
&C EBRACK 270a[[.x.]] & ax ax 271a[[.x,.]] &C ECOLLATE 272a[[.one.]]b & a1b a1b 273a[[.notdef.]]b &C ECOLLATE 274a[[.].]]b & a]b a]b 275a[[:alpha:]]c & abc abc 276a[[:notdef:]]c &C ECTYPE 277a[[: &C EBRACK 278a[[:alpha &C EBRACK 279a[[:alpha:] &C EBRACK 280a[[:alpha,:] &C ECTYPE 281a[[:]:]]b &C ECTYPE 282a[[:-:]]b &C ECTYPE 283a[[:alph:]] &C ECTYPE 284a[[:alphabet:]] &C ECTYPE 285[[:alnum:]]+ - -%@a0X- a0X 286[[:alpha:]]+ - -%@aX0- aX 287[[:blank:]]+ - aSSTb SST 288[[:cntrl:]]+ - aNTb NT 289[[:digit:]]+ - a019b 019 290[[:graph:]]+ - Sa%bS a%b 291[[:lower:]]+ - AabC ab 292[[:print:]]+ - NaSbN aSb 293[[:punct:]]+ - S%-&T %-& 294[[:space:]]+ - aSNTb SNT 295[[:upper:]]+ - aBCd BC 296[[:xdigit:]]+ - p0f3Cq 0f3C 297a[[=b=]]c & abc abc 298a[[= &C EBRACK 299a[[=b &C EBRACK 300a[[=b= &C EBRACK 301a[[=b=] &C EBRACK 302a[[=b,=]] &C ECOLLATE 303a[[=one=]]b & a1b a1b 304 305# complexities 306a(((b)))c - abc abc 307a(b|(c))d - abd abd 308a(b*|c)d - abbd abbd 309# just gotta have one DFA-buster, of course 310a[ab]{20} - aaaaabaaaabaaaabaaaab aaaaabaaaabaaaabaaaab 311# and an inline expansion in case somebody gets tricky 312a[ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab] - aaaaabaaaabaaaabaaaab aaaaabaaaabaaaabaaaab 313# and in case somebody just slips in an NFA... 
314a[ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab](wee|week)(knights|night) - aaaaabaaaabaaaabaaaabweeknights aaaaabaaaabaaaabaaaabweeknights 315# fish for anomalies as the number of states passes 32 31612345678901234567890123456789 - a12345678901234567890123456789b 12345678901234567890123456789 317123456789012345678901234567890 - a123456789012345678901234567890b 123456789012345678901234567890 3181234567890123456789012345678901 - a1234567890123456789012345678901b 1234567890123456789012345678901 31912345678901234567890123456789012 - a12345678901234567890123456789012b 12345678901234567890123456789012 320123456789012345678901234567890123 - a123456789012345678901234567890123b 123456789012345678901234567890123 321# and one really big one, beyond any plausible word width 3221234567890123456789012345678901234567890123456789012345678901234567890 - a1234567890123456789012345678901234567890123456789012345678901234567890b 1234567890123456789012345678901234567890123456789012345678901234567890 323# fish for problems as brackets go past 8 324[ab][cd][ef][gh][ij][kl][mn] - xacegikmoq acegikm 325[ab][cd][ef][gh][ij][kl][mn][op] - xacegikmoq acegikmo 326[ab][cd][ef][gh][ij][kl][mn][op][qr] - xacegikmoqy acegikmoq 327[ab][cd][ef][gh][ij][kl][mn][op][q] - xacegikmoqy acegikmoq 328 329# subtleties of matching 330abc & xabcy abc 331a\(b\)?c\1d b acd 332aBc i Abc Abc 333a[Bc]*d i abBCcd abBCcd 3340[[:upper:]]1 &i 0a1 0a1 3350[[:lower:]]1 &i 0A1 0A1 336a[^b]c &i abc 337a[^b]c &i aBc 338a[^b]c &i adc adc 339[a]b[c] - abc abc 340[a]b[a] - aba aba 341[abc]b[abc] - abc abc 342[abc]b[abd] - abd abd 343a(b?c)+d - accd accd 344(wee|week)(knights|night) - weeknights weeknights 345(we|wee|week|frob)(knights|night|day) - weeknights weeknights 346a[bc]d - xyzaaabcaababdacd abd 347a[ab]c - aaabc abc 348abc s abc abc 349a* & b @b 350 351# Let's have some fun -- try to match a C comment. 352# first the obvious, which looks okay at first glance... 
353/\*.*\*/ - /*x*/ /*x*/ 354# but... 355/\*.*\*/ - /*x*/y/*z*/ /*x*/y/*z*/ 356# okay, we must not match */ inside; try to do that... 357/\*([^*]|\*[^/])*\*/ - /*x*/ /*x*/ 358/\*([^*]|\*[^/])*\*/ - /*x*/y/*z*/ /*x*/ 359# but... 360/\*([^*]|\*[^/])*\*/ - /*x**/y/*z*/ /*x**/y/*z*/ 361# and a still fancier version, which does it right (I think)... 362/\*([^*]|\*+[^*/])*\*+/ - /*x*/ /*x*/ 363/\*([^*]|\*+[^*/])*\*+/ - /*x*/y/*z*/ /*x*/ 364/\*([^*]|\*+[^*/])*\*+/ - /*x**/y/*z*/ /*x**/ 365/\*([^*]|\*+[^*/])*\*+/ - /*x****/y/*z*/ /*x****/ 366/\*([^*]|\*+[^*/])*\*+/ - /*x**x*/y/*z*/ /*x**x*/ 367/\*([^*]|\*+[^*/])*\*+/ - /*x***x/y/*z*/ /*x***x/y/*z*/ 368 369# subexpressions 370a(b)(c)d - abcd abcd b,c 371a(((b)))c - abc abc b,b,b 372a(b|(c))d - abd abd b,- 373a(b*|c|e)d - abbd abbd bb 374a(b*|c|e)d - acd acd c 375a(b*|c|e)d - ad ad @d 376a(b?)c - abc abc b 377a(b?)c - ac ac @c 378a(b+)c - abc abc b 379a(b+)c - abbbc abbbc bbb 380a(b*)c - ac ac @c 381(a|ab)(bc([de]+)f|cde) - abcdef abcdef a,bcdef,de 382# the regression tester only asks for 9 subexpressions 383a(b)(c)(d)(e)(f)(g)(h)(i)(j)k - abcdefghijk abcdefghijk b,c,d,e,f,g,h,i,j 384a(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)l - abcdefghijkl abcdefghijkl b,c,d,e,f,g,h,i,j,k 385a([bc]?)c - abc abc b 386a([bc]?)c - ac ac @c 387a([bc]+)c - abc abc b 388a([bc]+)c - abcc abcc bc 389a([bc]+)bc - abcbc abcbc bc 390a(bb+|b)b - abb abb b 391a(bbb+|bb+|b)b - abb abb b 392a(bbb+|bb+|b)b - abbb abbb bb 393a(bbb+|bb+|b)bb - abbb abbb b 394(.*).* - abcdef abcdef abcdef 395(a*)* - bc @b @b 396 397# do we get the right subexpression when it is used more than once? 
398a(b|c)*d - ad ad - 399a(b|c)*d - abcd abcd c 400a(b|c)+d - abd abd b 401a(b|c)+d - abcd abcd c 402a(b|c?)+d - ad ad @d 403a(b|c?)+d - abcd abcd @d 404a(b|c){0,0}d - ad ad - 405a(b|c){0,1}d - ad ad - 406a(b|c){0,1}d - abd abd b 407a(b|c){0,2}d - ad ad - 408a(b|c){0,2}d - abcd abcd c 409a(b|c){0,}d - ad ad - 410a(b|c){0,}d - abcd abcd c 411a(b|c){1,1}d - abd abd b 412a(b|c){1,1}d - acd acd c 413a(b|c){1,2}d - abd abd b 414a(b|c){1,2}d - abcd abcd c 415a(b|c){1,}d - abd abd b 416a(b|c){1,}d - abcd abcd c 417a(b|c){2,2}d - acbd acbd b 418a(b|c){2,2}d - abcd abcd c 419a(b|c){2,4}d - abcd abcd c 420a(b|c){2,4}d - abcbd abcbd b 421a(b|c){2,4}d - abcbcd abcbcd c 422a(b|c){2,}d - abcd abcd c 423a(b|c){2,}d - abcbd abcbd b 424a(b+|((c)*))+d - abd abd @d,@d,- 425a(b+|((c)*))+d - abcd abcd @d,@d,- 426 427# check out the STARTEND option 428[abc] &# a(b)c b 429[abc] &# a(d)c 430[abc] &# a(bc)d b 431[abc] &# a(dc)d c 432. &# a()c 433b.*c &# b(bc)c bc 434b.* &# b(bc)c bc 435.*c &# b(bc)c bc 436 437# plain strings, with the NOSPEC flag 438abc m abc abc 439abc m xabcy abc 440abc m xyz 441a*b m aba*b a*b 442a*b m ab 443"" mC EMPTY 444 445# cases involving NULs 446aZb & a a 447aZb &p a 448aZb &p# (aZb) aZb 449aZ*b &p# (ab) ab 450a.b &# (aZb) aZb 451a.* &# (aZb)c aZb 452 453# word boundaries (ick) 454[[:<:]]a & a a 455[[:<:]]a & ba 456[[:<:]]a & -a a 457a[[:>:]] & a a 458a[[:>:]] & ab 459a[[:>:]] & a- a 460[[:<:]]a.c[[:>:]] & axcd-dayc-dazce-abc abc 461[[:<:]]a.c[[:>:]] & axcd-dayc-dazce-abc-q abc 462[[:<:]]a.c[[:>:]] & axc-dayc-dazce-abc axc 463[[:<:]]b.c[[:>:]] & a_bxc-byc_d-bzc-q bzc 464[[:<:]].x..[[:>:]] & y_xa_-_xb_y-_xc_-axdc _xc_ 465[[:<:]]a_b[[:>:]] & x_a_b 466 467# past problems, and suspected problems 468(A[1])|(A[2])|(A[3])|(A[4])|(A[5])|(A[6])|(A[7])|(A[8])|(A[9])|(A[A]) - A1 A1 469abcdefghijklmnop i abcdefghijklmnop abcdefghijklmnop 470abcdefghijklmnopqrstuv i abcdefghijklmnopqrstuv abcdefghijklmnopqrstuv 471(ALAK)|(ALT[AB])|(CC[123]1)|(CM[123]1)|(GAMC)|(LC[23][EO 
])|(SEM[1234])|(SL[ES][12])|(SLWW)|(SLF )|(SLDT)|(VWH[12])|(WH[34][EW])|(WP1[ESN]) - CC11 CC11 472CC[13]1|a{21}[23][EO][123][Es][12]a{15}aa[34][EW]aaaaaaa[X]a - CC11 CC11 473Char \([a-z0-9_]*\)\[.* b Char xyz[k Char xyz[k xyz 474a?b - ab ab 475-\{0,1\}[0-9]*$ b -5 -5 476