xref: /PHP-7.4/ext/json/json_scanner.re (revision 0cf7de1c)
/*
  +----------------------------------------------------------------------+
  | PHP Version 7                                                        |
  +----------------------------------------------------------------------+
  | Copyright (c) The PHP Group                                          |
  +----------------------------------------------------------------------+
  | This source file is subject to version 3.01 of the PHP license,      |
  | that is bundled with this package in the file LICENSE, and is        |
  | available through the world-wide-web at the following url:           |
  | http://www.php.net/license/3_01.txt                                  |
  | If you did not receive a copy of the PHP license and are unable to   |
  | obtain it through the world-wide-web, please send a note to          |
  | license@php.net so we can mail you a copy immediately.               |
  +----------------------------------------------------------------------+
  | Author: Jakub Zelenka <bukka@php.net>                                |
  +----------------------------------------------------------------------+
*/
18
19#include "php.h"
20#include "php_json_scanner.h"
21#include "php_json_scanner_defs.h"
22#include "php_json_parser.h"
23#include "json_parser.tab.h"
24
/*
 * Names used by the re2c-generated scanner code. All of them map onto fields
 * of the php_json_scanner that is in scope as `s`.
 */
#define	YYCTYPE     php_json_ctype
#define	YYCURSOR    s->cursor
#define	YYLIMIT     s->limit
#define	YYMARKER    s->marker
#define	YYCTXMARKER s->ctxmarker

/* The current start condition is stored in s->state. */
#define YYGETCONDITION()        s->state
#define YYSETCONDITION(yystate) s->state = yystate

/* Expands to nothing: the input buffer is never refilled. */
#define	YYFILL(n)

/* Store condition `condition` (its yyc-prefixed constant) in s->state. */
#define PHP_JSON_CONDITION_SET(condition) YYSETCONDITION(yyc##condition)
/* Jump straight to the yyc_-prefixed label of `condition`. */
#define PHP_JSON_CONDITION_GOTO(condition) goto yyc_##condition
/*
 * Store the condition and jump to it. Wrapped in do/while(0) so that the two
 * statements stay together when the macro is the body of an unbraced if/else.
 */
#define PHP_JSON_CONDITION_SET_AND_GOTO(condition) \
	do { \
		PHP_JSON_CONDITION_SET(condition); \
		PHP_JSON_CONDITION_GOTO(condition); \
	} while(0)
/*
 * Jump to the second string pass: STR_P2_BIN when s->utf8_invalid is set,
 * STR_P2_UTF otherwise. s->state is not modified.
 */
#define PHP_JSON_CONDITION_GOTO_STR_P2() \
	do { \
		if (s->utf8_invalid) { \
			PHP_JSON_CONDITION_GOTO(STR_P2_BIN); \
		} else { \
			PHP_JSON_CONDITION_GOTO(STR_P2_UTF); \
		} \
	} while(0)


/*
 * Copy the pending literal bytes that precede the sequence just matched.
 * php_json_scanner_copy_string() leaves out esc_size + 1 bytes before the
 * cursor, so:
 *   COPY_ESC    (0)  leaves out 1 byte   (a backslash, a quote or one bad byte)
 *   COPY_UTF    (5)  leaves out 6 bytes  (one \uXXXX escape)
 *   COPY_UTF_SP (11) leaves out 12 bytes (two consecutive \uXXXX escapes)
 */
#define PHP_JSON_SCANNER_COPY_ESC() php_json_scanner_copy_string(s, 0)
#define PHP_JSON_SCANNER_COPY_UTF() php_json_scanner_copy_string(s, 5)
#define PHP_JSON_SCANNER_COPY_UTF_SP() php_json_scanner_copy_string(s, 11)

/*
 * Digit count from which an integer literal needs the extra range check
 * against LONG_MIN_DIGITS in the INT rule of php_json_scan().
 */
#define PHP_JSON_INT_MAX_LENGTH (MAX_LENGTH_OF_LONG - 1)
56
57
58static void php_json_scanner_copy_string(php_json_scanner *s, int esc_size)
59{
60	size_t len = s->cursor - s->str_start - esc_size - 1;
61	if (len) {
62		memcpy(s->pstr, s->str_start, len);
63		s->pstr += len;
64	}
65}
66
/*
 * Convert one hexadecimal digit character to its value.
 *
 * Returns 0-15 for '0'-'9', 'A'-'F' and 'a'-'f', and -1 for any other
 * character.
 */
static int php_json_hex_to_int(char code)
{
	if (code >= '0' && code <= '9') {
		return code - '0';
	} else if (code >= 'A' && code <= 'F') {
		return code - ('A' - 10);
	} else if (code >= 'a' && code <= 'f') {
		return code - ('a' - 10);
	} else {
		/* this should never happen (just to suppress compiler warning) */
		return -1;
	}
}
80
81static int php_json_ucs2_to_int_ex(php_json_scanner *s, int size, int start)
82{
83	int i, code = 0;
84	php_json_ctype *pc = s->cursor - start;
85	for (i = 0; i < size; i++) {
86		code |= php_json_hex_to_int(*(pc--)) << (i * 4);
87	}
88	return code;
89}
90
91static int php_json_ucs2_to_int(php_json_scanner *s, int size)
92{
93	return php_json_ucs2_to_int_ex(s, size, 1);
94}
95
96void php_json_scanner_init(php_json_scanner *s, char *str, size_t str_len, int options)
97{
98	s->cursor = (php_json_ctype *) str;
99	s->limit = (php_json_ctype *) str + str_len;
100	s->options = options;
101	PHP_JSON_CONDITION_SET(JS);
102}
103
104int php_json_scan(php_json_scanner *s)
105{
106	ZVAL_NULL(&s->value);
107
108std:
109	s->token = s->cursor;
110
111/*!re2c
112	re2c:indent:top = 1;
113	re2c:yyfill:enable = 0;
114
115	DIGIT   = [0-9] ;
116	DIGITNZ = [1-9] ;
117	UINT    = "0" | ( DIGITNZ DIGIT* ) ;
118	INT     = "-"? UINT ;
119	HEX     = DIGIT | [a-fA-F] ;
120	HEXNZ   = DIGITNZ | [a-fA-F] ;
121	HEX7    = [0-7] ;
122	HEXC    = DIGIT | [a-cA-C] ;
123	FLOAT   = INT "." DIGIT+ ;
124	EXP     = ( INT | FLOAT ) [eE] [+-]? DIGIT+ ;
125	NL      = "\r"? "\n" ;
126	WS      = [ \t\r]+ ;
127	EOI     = "\000";
128	CTRL    = [\x00-\x1F] ;
129	UTF8T   = [\x80-\xBF] ;
130	UTF8_1  = [\x00-\x7F] ;
131	UTF8_2  = [\xC2-\xDF] UTF8T ;
132	UTF8_3A = "\xE0" [\xA0-\xBF] UTF8T ;
133	UTF8_3B = [\xE1-\xEC] UTF8T{2} ;
134	UTF8_3C = "\xED" [\x80-\x9F] UTF8T ;
135	UTF8_3D = [\xEE-\xEF] UTF8T{2} ;
136	UTF8_3  = UTF8_3A | UTF8_3B | UTF8_3C | UTF8_3D ;
137	UTF8_4A = "\xF0"[\x90-\xBF] UTF8T{2} ;
138	UTF8_4B = [\xF1-\xF3] UTF8T{3} ;
139	UTF8_4C = "\xF4" [\x80-\x8F] UTF8T{2} ;
140	UTF8_4  = UTF8_4A | UTF8_4B | UTF8_4C ;
141	UTF8    = UTF8_1 | UTF8_2 | UTF8_3 | UTF8_4 ;
142	ANY     = [^] ;
143	ESCPREF = "\\" ;
144	ESCSYM  = ( "\"" | "\\" | "/" | [bfnrt] ) ;
145	ESC     = ESCPREF ESCSYM ;
146	UTFSYM  = "u" ;
147	UTFPREF = ESCPREF UTFSYM ;
148	UCS2    = UTFPREF HEX{4} ;
149	UTF16_1 = UTFPREF "00" HEX7 HEX ;
150	UTF16_2 = UTFPREF "0" HEX7 HEX{2} ;
151	UTF16_3 = UTFPREF ( ( ( HEXC | [efEF] ) HEX ) | ( [dD] HEX7 ) ) HEX{2} ;
152	UTF16_4 = UTFPREF [dD] [89abAB] HEX{2} UTFPREF [dD] [c-fC-F] HEX{2} ;
153
154	<JS>"{"                  { return '{'; }
155	<JS>"}"                  { return '}'; }
156	<JS>"["                  { return '['; }
157	<JS>"]"                  { return ']'; }
158	<JS>":"                  { return ':'; }
159	<JS>","                  { return ','; }
160	<JS>"null"               {
161		ZVAL_NULL(&s->value);
162		return PHP_JSON_T_NUL;
163	}
164	<JS>"true"               {
165		ZVAL_TRUE(&s->value);
166		return PHP_JSON_T_TRUE;
167	}
168	<JS>"false"              {
169		ZVAL_FALSE(&s->value);
170		return PHP_JSON_T_FALSE;
171	}
172	<JS>INT                  {
173		zend_bool bigint = 0, negative = s->token[0] == '-';
174		size_t digits = (size_t) (s->cursor - s->token - negative);
175		if (digits >= PHP_JSON_INT_MAX_LENGTH) {
176			if (digits == PHP_JSON_INT_MAX_LENGTH) {
177				int cmp = strncmp((char *) (s->token + negative), LONG_MIN_DIGITS, PHP_JSON_INT_MAX_LENGTH);
178				if (!(cmp < 0 || (cmp == 0 && negative))) {
179					bigint = 1;
180				}
181			} else {
182				bigint = 1;
183			}
184		}
185		if (!bigint) {
186			ZVAL_LONG(&s->value, ZEND_STRTOL((char *) s->token, NULL, 10));
187			return PHP_JSON_T_INT;
188		} else if (s->options & PHP_JSON_BIGINT_AS_STRING) {
189			ZVAL_STRINGL(&s->value, (char *) s->token, s->cursor - s->token);
190			return PHP_JSON_T_STRING;
191		} else {
192			ZVAL_DOUBLE(&s->value, zend_strtod((char *) s->token, NULL));
193			return PHP_JSON_T_DOUBLE;
194		}
195	}
196	<JS>FLOAT|EXP            {
197		ZVAL_DOUBLE(&s->value, zend_strtod((char *) s->token, NULL));
198		return PHP_JSON_T_DOUBLE;
199	}
200	<JS>NL|WS                { goto std; }
201	<JS>EOI                  {
202		if (s->limit < s->cursor) {
203			return PHP_JSON_T_EOI;
204		} else {
205			s->errcode = PHP_JSON_ERROR_CTRL_CHAR;
206			return PHP_JSON_T_ERROR;
207		}
208	}
209	<JS>["]                  {
210		s->str_start = s->cursor;
211		s->str_esc = 0;
212		s->utf8_invalid = 0;
213		s->utf8_invalid_count = 0;
214		PHP_JSON_CONDITION_SET_AND_GOTO(STR_P1);
215	}
216	<JS>CTRL                 {
217		s->errcode = PHP_JSON_ERROR_CTRL_CHAR;
218		return PHP_JSON_T_ERROR;
219	}
220	<JS>UTF8                 {
221		s->errcode = PHP_JSON_ERROR_SYNTAX;
222		return PHP_JSON_T_ERROR;
223	}
224	<JS>ANY                  {
225		s->errcode = PHP_JSON_ERROR_UTF8;
226		return PHP_JSON_T_ERROR;
227	}
228
229	<STR_P1>CTRL             {
230		s->errcode = PHP_JSON_ERROR_CTRL_CHAR;
231		return PHP_JSON_T_ERROR;
232	}
233	<STR_P1>UTF16_1          {
234		s->str_esc += 5;
235		PHP_JSON_CONDITION_GOTO(STR_P1);
236	}
237	<STR_P1>UTF16_2          {
238		s->str_esc += 4;
239		PHP_JSON_CONDITION_GOTO(STR_P1);
240	}
241	<STR_P1>UTF16_3          {
242		s->str_esc += 3;
243		PHP_JSON_CONDITION_GOTO(STR_P1);
244	}
245	<STR_P1>UTF16_4          {
246		s->str_esc += 8;
247		PHP_JSON_CONDITION_GOTO(STR_P1);
248	}
249	<STR_P1>UCS2             {
250		s->errcode = PHP_JSON_ERROR_UTF16;
251		return PHP_JSON_T_ERROR;
252	}
253	<STR_P1>ESC              {
254		s->str_esc++;
255		PHP_JSON_CONDITION_GOTO(STR_P1);
256	}
257	<STR_P1>ESCPREF           {
258		s->errcode = PHP_JSON_ERROR_SYNTAX;
259		return PHP_JSON_T_ERROR;
260	}
261	<STR_P1>["]              {
262		zend_string *str;
263		size_t len = s->cursor - s->str_start - s->str_esc - 1 + s->utf8_invalid_count;
264		if (len == 0) {
265			PHP_JSON_CONDITION_SET(JS);
266			ZVAL_EMPTY_STRING(&s->value);
267			return PHP_JSON_T_ESTRING;
268		}
269		str = zend_string_alloc(len, 0);
270		ZSTR_VAL(str)[len] = '\0';
271		ZVAL_STR(&s->value, str);
272		if (s->str_esc || s->utf8_invalid) {
273			s->pstr = (php_json_ctype *) Z_STRVAL(s->value);
274			s->cursor = s->str_start;
275			PHP_JSON_CONDITION_GOTO_STR_P2();
276		} else {
277			memcpy(Z_STRVAL(s->value), s->str_start, len);
278			PHP_JSON_CONDITION_SET(JS);
279			return PHP_JSON_T_STRING;
280		}
281	}
282	<STR_P1>UTF8             { PHP_JSON_CONDITION_GOTO(STR_P1); }
283	<STR_P1>ANY              {
284		if (s->options & (PHP_JSON_INVALID_UTF8_IGNORE | PHP_JSON_INVALID_UTF8_SUBSTITUTE)) {
285			if (s->options & PHP_JSON_INVALID_UTF8_SUBSTITUTE) {
286				if (s->utf8_invalid_count > INT_MAX - 2) {
287					s->errcode = PHP_JSON_ERROR_UTF8;
288					return PHP_JSON_T_ERROR;
289				}
290				s->utf8_invalid_count += 2;
291			} else {
292				s->utf8_invalid_count--;
293			}
294			s->utf8_invalid = 1;
295			PHP_JSON_CONDITION_GOTO(STR_P1);
296		}
297		s->errcode = PHP_JSON_ERROR_UTF8;
298		return PHP_JSON_T_ERROR;
299	}
300
301	<STR_P2_UTF,STR_P2_BIN>UTF16_1             {
302		int utf16 = php_json_ucs2_to_int(s, 2);
303		PHP_JSON_SCANNER_COPY_UTF();
304		*(s->pstr++) = (char) utf16;
305		s->str_start = s->cursor;
306		PHP_JSON_CONDITION_GOTO_STR_P2();
307	}
308	<STR_P2_UTF,STR_P2_BIN>UTF16_2             {
309		int utf16 = php_json_ucs2_to_int(s, 3);
310		PHP_JSON_SCANNER_COPY_UTF();
311		*(s->pstr++) = (char) (0xc0 | (utf16 >> 6));
312		*(s->pstr++) = (char) (0x80 | (utf16 & 0x3f));
313		s->str_start = s->cursor;
314		PHP_JSON_CONDITION_GOTO_STR_P2();
315	}
316	<STR_P2_UTF,STR_P2_BIN>UTF16_3             {
317		int utf16 = php_json_ucs2_to_int(s, 4);
318		PHP_JSON_SCANNER_COPY_UTF();
319		*(s->pstr++) = (char) (0xe0 | (utf16 >> 12));
320		*(s->pstr++) = (char) (0x80 | ((utf16 >> 6) & 0x3f));
321		*(s->pstr++) = (char) (0x80 | (utf16 & 0x3f));
322		s->str_start = s->cursor;
323		PHP_JSON_CONDITION_GOTO_STR_P2();
324	}
325	<STR_P2_UTF,STR_P2_BIN>UTF16_4             {
326		int utf32, utf16_hi, utf16_lo;
327		utf16_hi = php_json_ucs2_to_int(s, 4);
328		utf16_lo = php_json_ucs2_to_int_ex(s, 4, 7);
329		utf32 = ((utf16_lo & 0x3FF) << 10) + (utf16_hi & 0x3FF) + 0x10000;
330		PHP_JSON_SCANNER_COPY_UTF_SP();
331		*(s->pstr++) = (char) (0xf0 | (utf32 >> 18));
332		*(s->pstr++) = (char) (0x80 | ((utf32 >> 12) & 0x3f));
333		*(s->pstr++) = (char) (0x80 | ((utf32 >> 6) & 0x3f));
334		*(s->pstr++) = (char) (0x80 | (utf32 & 0x3f));
335		s->str_start = s->cursor;
336		PHP_JSON_CONDITION_GOTO_STR_P2();
337	}
338	<STR_P2_UTF,STR_P2_BIN>ESCPREF          {
339		char esc;
340		PHP_JSON_SCANNER_COPY_ESC();
341		switch (*s->cursor) {
342			case 'b':
343				esc = '\b';
344				break;
345			case 'f':
346				esc = '\f';				break;
347			case 'n':
348				esc = '\n';
349				break;
350			case 'r':
351				esc = '\r';
352				break;
353			case 't':
354				esc = '\t';
355				break;
356			case '\\':
357			case '/':
358			case '"':
359				esc = *s->cursor;
360				break;
361			default:
362				s->errcode = PHP_JSON_ERROR_SYNTAX;
363				return PHP_JSON_T_ERROR;
364		}
365		*(s->pstr++) = esc;
366		++YYCURSOR;
367		s->str_start = s->cursor;
368		PHP_JSON_CONDITION_GOTO_STR_P2();
369	}
370	<STR_P2_UTF,STR_P2_BIN>["] => JS        {
371		PHP_JSON_SCANNER_COPY_ESC();
372		return PHP_JSON_T_STRING;
373	}
374	<STR_P2_BIN>UTF8         { PHP_JSON_CONDITION_GOTO(STR_P2_BIN); }
375	<STR_P2_BIN>ANY          {
376		if (s->utf8_invalid) {
377			PHP_JSON_SCANNER_COPY_ESC();
378			if (s->options & PHP_JSON_INVALID_UTF8_SUBSTITUTE) {
379				*(s->pstr++) = (char) (0xe0 | (0xfffd >> 12));
380				*(s->pstr++) = (char) (0x80 | ((0xfffd >> 6) & 0x3f));
381				*(s->pstr++) = (char) (0x80 | (0xfffd & 0x3f));
382			}
383			s->str_start = s->cursor;
384		}
385		PHP_JSON_CONDITION_GOTO(STR_P2_BIN);
386	}
387	<STR_P2_UTF>ANY          { PHP_JSON_CONDITION_GOTO(STR_P2_UTF); }
388
389	<*>ANY                   {
390		s->errcode = PHP_JSON_ERROR_SYNTAX;
391		return PHP_JSON_T_ERROR;
392	}
393*/
394
395}
396