/* xref: /php-src/ext/json/json_scanner.re (revision efee76b8) */
/*
  +----------------------------------------------------------------------+
  | Copyright (c) The PHP Group                                          |
  +----------------------------------------------------------------------+
  | This source file is subject to version 3.01 of the PHP license,      |
  | that is bundled with this package in the file LICENSE, and is        |
  | available through the world-wide-web at the following url:           |
  | https://www.php.net/license/3_01.txt                                 |
  | If you did not receive a copy of the PHP license and are unable to   |
  | obtain it through the world-wide-web, please send a note to          |
  | license@php.net so we can mail you a copy immediately.               |
  +----------------------------------------------------------------------+
  | Author: Jakub Zelenka <bukka@php.net>                                |
  +----------------------------------------------------------------------+
*/

#include "php.h"
#include "php_json_scanner.h"
#include "php_json_scanner_defs.h"
#include "php_json_parser.h"
#include "json_parser.tab.h"

/*
 * re2c interface macros. The generated scanner accesses the fields of the
 * php_json_scanner pointed to by the local variable `s` through these names.
 */
#define YYCTYPE     php_json_ctype
#define YYCURSOR    s->cursor
#define YYLIMIT     s->limit
#define YYMARKER    s->marker
#define YYCTXMARKER s->ctxmarker

/* The current start condition is stored in s->state. */
#define YYGETCONDITION()        s->state
#define YYSETCONDITION(yystate) s->state = yystate

/* Expands to nothing: no buffer refill is performed. */
#define YYFILL(n)

/* Store condition `condition` as the current start condition. */
#define PHP_JSON_CONDITION_SET(condition) YYSETCONDITION(yyc##condition)
/* Jump to the code label of condition `condition`. */
#define PHP_JSON_CONDITION_GOTO(condition) goto yyc_##condition
/*
 * Store the condition and jump to it. Wrapped in do/while(0) so the two
 * statements behave as one, e.g. as the body of an unbraced `if`.
 */
#define PHP_JSON_CONDITION_SET_AND_GOTO(condition) \
	do { \
		PHP_JSON_CONDITION_SET(condition); \
		PHP_JSON_CONDITION_GOTO(condition); \
	} while (0)
/*
 * Jump to the second string pass: the STR_P2_BIN condition when
 * s->utf8_invalid is set, the STR_P2_UTF condition otherwise.
 */
#define PHP_JSON_CONDITION_GOTO_STR_P2() \
	do { \
		if (s->utf8_invalid) { \
			PHP_JSON_CONDITION_GOTO(STR_P2_BIN); \
		} else { \
			PHP_JSON_CONDITION_GOTO(STR_P2_UTF); \
		} \
	} while(0)


/*
 * Flush the pending literal bytes of the string to the output buffer.
 * The second argument of php_json_scanner_copy_string() is the number of
 * bytes, in addition to the byte just before the cursor, that are left out
 * of the copy:
 *   0  - only the last byte (a backslash or the closing quote),
 *   5  - a six byte \uXXXX escape,
 *   11 - a twelve byte \uXXXX\uXXXX surrogate pair.
 */
#define PHP_JSON_SCANNER_COPY_ESC() php_json_scanner_copy_string(s, 0)
#define PHP_JSON_SCANNER_COPY_UTF() php_json_scanner_copy_string(s, 5)
#define PHP_JSON_SCANNER_COPY_UTF_SP() php_json_scanner_copy_string(s, 11)

/* Digit-count threshold used by the integer rule of the scanner. */
#define PHP_JSON_INT_MAX_LENGTH (MAX_LENGTH_OF_LONG - 1)


56static void php_json_scanner_copy_string(php_json_scanner *s, size_t esc_size)
57{
58	size_t len = (size_t)(s->cursor - s->str_start - esc_size - 1);
59	if (len) {
60		memcpy(s->pstr, s->str_start, len);
61		s->pstr += len;
62	}
63}
64
/*
 * Convert one ASCII hexadecimal digit to its numeric value.
 *
 * code - character to convert; '0'-'9', 'A'-'F' and 'a'-'f' are accepted
 *
 * Returns the value 0..15, or -1 when code is not a hexadecimal digit.
 */
static int php_json_hex_to_int(unsigned char code)
{
	if (code >= '0' && code <= '9') {
		return code - '0';
	} else if (code >= 'A' && code <= 'F') {
		return code - ('A' - 10);
	} else if (code >= 'a' && code <= 'f') {
		return code - ('a' - 10);
	} else {
		/* this should never happen (just to suppress compiler warning) */
		return -1;
	}
}

79static int php_json_ucs2_to_int_ex(php_json_scanner *s, int size, int start)
80{
81	int i, code = 0;
82	php_json_ctype *pc = s->cursor - start;
83	for (i = 0; i < size; i++) {
84		code |= php_json_hex_to_int(*(pc--)) << (i * 4);
85	}
86	return code;
87}
88
89static int php_json_ucs2_to_int(php_json_scanner *s, int size)
90{
91	return php_json_ucs2_to_int_ex(s, size, 1);
92}
93
94void php_json_scanner_init(php_json_scanner *s, const char *str, size_t str_len, int options)
95{
96	s->cursor = (php_json_ctype *) str;
97	s->limit = (php_json_ctype *) str + str_len;
98	s->options = options;
99	PHP_JSON_CONDITION_SET(JS);
100}
101
102int php_json_scan(php_json_scanner *s)
103{
104	ZVAL_NULL(&s->value);
105
106std:
107	s->token = s->cursor;
108
109/*!re2c
110	re2c:indent:top = 1;
111	re2c:yyfill:enable = 0;
112
113	DIGIT   = [0-9] ;
114	DIGITNZ = [1-9] ;
115	UINT    = "0" | ( DIGITNZ DIGIT* ) ;
116	INT     = "-"? UINT ;
117	HEX     = DIGIT | [a-fA-F] ;
118	HEXNZ   = DIGITNZ | [a-fA-F] ;
119	HEX7    = [0-7] ;
120	HEXC    = DIGIT | [a-cA-C] ;
121	FLOAT   = INT "." DIGIT+ ;
122	EXP     = ( INT | FLOAT ) [eE] [+-]? DIGIT+ ;
123	NL      = "\r"? "\n" ;
124	WS      = [ \t\r]+ ;
125	EOI     = "\000";
126	CTRL    = [\x00-\x1F] ;
127	UTF8T   = [\x80-\xBF] ;
128	UTF8_1  = [\x00-\x7F] ;
129	UTF8_2  = [\xC2-\xDF] UTF8T ;
130	UTF8_3A = "\xE0" [\xA0-\xBF] UTF8T ;
131	UTF8_3B = [\xE1-\xEC] UTF8T{2} ;
132	UTF8_3C = "\xED" [\x80-\x9F] UTF8T ;
133	UTF8_3D = [\xEE-\xEF] UTF8T{2} ;
134	UTF8_3  = UTF8_3A | UTF8_3B | UTF8_3C | UTF8_3D ;
135	UTF8_4A = "\xF0"[\x90-\xBF] UTF8T{2} ;
136	UTF8_4B = [\xF1-\xF3] UTF8T{3} ;
137	UTF8_4C = "\xF4" [\x80-\x8F] UTF8T{2} ;
138	UTF8_4  = UTF8_4A | UTF8_4B | UTF8_4C ;
139	UTF8    = UTF8_1 | UTF8_2 | UTF8_3 | UTF8_4 ;
140	ANY     = [^] ;
141	ESCPREF = "\\" ;
142	ESCSYM  = ( "\"" | "\\" | "/" | [bfnrt] ) ;
143	ESC     = ESCPREF ESCSYM ;
144	UTFSYM  = "u" ;
145	UTFPREF = ESCPREF UTFSYM ;
146	UCS2    = UTFPREF HEX{4} ;
147	UTF16_1 = UTFPREF "00" HEX7 HEX ;
148	UTF16_2 = UTFPREF "0" HEX7 HEX{2} ;
149	UTF16_3 = UTFPREF ( ( ( HEXC | [efEF] ) HEX ) | ( [dD] HEX7 ) ) HEX{2} ;
150	UTF16_4 = UTFPREF [dD] [89abAB] HEX{2} UTFPREF [dD] [c-fC-F] HEX{2} ;
151
152	<JS>"{"                  { return '{'; }
153	<JS>"}"                  { return '}'; }
154	<JS>"["                  { return '['; }
155	<JS>"]"                  { return ']'; }
156	<JS>":"                  { return ':'; }
157	<JS>","                  { return ','; }
158	<JS>"null"               {
159		ZVAL_NULL(&s->value);
160		return PHP_JSON_T_NUL;
161	}
162	<JS>"true"               {
163		ZVAL_TRUE(&s->value);
164		return PHP_JSON_T_TRUE;
165	}
166	<JS>"false"              {
167		ZVAL_FALSE(&s->value);
168		return PHP_JSON_T_FALSE;
169	}
170	<JS>INT                  {
171		bool bigint = 0, negative = s->token[0] == '-';
172		size_t digits = (size_t) (s->cursor - s->token - negative);
173		if (digits >= PHP_JSON_INT_MAX_LENGTH) {
174			if (digits == PHP_JSON_INT_MAX_LENGTH) {
175				int cmp = strncmp((char *) (s->token + negative), LONG_MIN_DIGITS, PHP_JSON_INT_MAX_LENGTH);
176				if (!(cmp < 0 || (cmp == 0 && negative))) {
177					bigint = 1;
178				}
179			} else {
180				bigint = 1;
181			}
182		}
183		if (!bigint) {
184			ZVAL_LONG(&s->value, ZEND_STRTOL((char *) s->token, NULL, 10));
185			return PHP_JSON_T_INT;
186		} else if (s->options & PHP_JSON_BIGINT_AS_STRING) {
187			ZVAL_STRINGL(&s->value, (char *) s->token, (size_t)(s->cursor - s->token));
188			return PHP_JSON_T_STRING;
189		} else {
190			ZVAL_DOUBLE(&s->value, zend_strtod((char *) s->token, NULL));
191			return PHP_JSON_T_DOUBLE;
192		}
193	}
194	<JS>FLOAT|EXP            {
195		ZVAL_DOUBLE(&s->value, zend_strtod((char *) s->token, NULL));
196		return PHP_JSON_T_DOUBLE;
197	}
198	<JS>NL|WS                { goto std; }
199	<JS>EOI                  {
200		if (s->limit < s->cursor) {
201			return PHP_JSON_T_EOI;
202		} else {
203			s->errcode = PHP_JSON_ERROR_CTRL_CHAR;
204			return PHP_JSON_T_ERROR;
205		}
206	}
207	<JS>["]                  {
208		s->str_start = s->cursor;
209		s->str_esc = 0;
210		s->utf8_invalid = 0;
211		s->utf8_invalid_count = 0;
212		PHP_JSON_CONDITION_SET_AND_GOTO(STR_P1);
213	}
214	<JS>CTRL                 {
215		s->errcode = PHP_JSON_ERROR_CTRL_CHAR;
216		return PHP_JSON_T_ERROR;
217	}
218	<JS>UTF8                 {
219		s->errcode = PHP_JSON_ERROR_SYNTAX;
220		return PHP_JSON_T_ERROR;
221	}
222	<JS>ANY                  {
223		s->errcode = PHP_JSON_ERROR_UTF8;
224		return PHP_JSON_T_ERROR;
225	}
226
227	<STR_P1>CTRL             {
228		s->errcode = PHP_JSON_ERROR_CTRL_CHAR;
229		return PHP_JSON_T_ERROR;
230	}
231	<STR_P1>UTF16_1          {
232		s->str_esc += 5;
233		PHP_JSON_CONDITION_GOTO(STR_P1);
234	}
235	<STR_P1>UTF16_2          {
236		s->str_esc += 4;
237		PHP_JSON_CONDITION_GOTO(STR_P1);
238	}
239	<STR_P1>UTF16_3          {
240		s->str_esc += 3;
241		PHP_JSON_CONDITION_GOTO(STR_P1);
242	}
243	<STR_P1>UTF16_4          {
244		s->str_esc += 8;
245		PHP_JSON_CONDITION_GOTO(STR_P1);
246	}
247	<STR_P1>UCS2             {
248		s->errcode = PHP_JSON_ERROR_UTF16;
249		return PHP_JSON_T_ERROR;
250	}
251	<STR_P1>ESC              {
252		s->str_esc++;
253		PHP_JSON_CONDITION_GOTO(STR_P1);
254	}
255	<STR_P1>ESCPREF           {
256		s->errcode = PHP_JSON_ERROR_SYNTAX;
257		return PHP_JSON_T_ERROR;
258	}
259	<STR_P1>["]              {
260		zend_string *str;
261		size_t len = (size_t)(s->cursor - s->str_start - s->str_esc - 1 + s->utf8_invalid_count);
262		if (len == 0) {
263			PHP_JSON_CONDITION_SET(JS);
264			ZVAL_EMPTY_STRING(&s->value);
265			return PHP_JSON_T_ESTRING;
266		}
267		str = zend_string_alloc(len, 0);
268		ZSTR_VAL(str)[len] = '\0';
269		ZVAL_STR(&s->value, str);
270		if (s->str_esc || s->utf8_invalid) {
271			s->pstr = (php_json_ctype *) Z_STRVAL(s->value);
272			s->cursor = s->str_start;
273			PHP_JSON_CONDITION_GOTO_STR_P2();
274		} else {
275			memcpy(Z_STRVAL(s->value), s->str_start, len);
276			PHP_JSON_CONDITION_SET(JS);
277			return PHP_JSON_T_STRING;
278		}
279	}
280	<STR_P1>UTF8             { PHP_JSON_CONDITION_GOTO(STR_P1); }
281	<STR_P1>ANY              {
282		if (s->options & (PHP_JSON_INVALID_UTF8_IGNORE | PHP_JSON_INVALID_UTF8_SUBSTITUTE)) {
283			if (s->options & PHP_JSON_INVALID_UTF8_SUBSTITUTE) {
284				if (s->utf8_invalid_count > INT_MAX - 2) {
285					s->errcode = PHP_JSON_ERROR_UTF8;
286					return PHP_JSON_T_ERROR;
287				}
288				s->utf8_invalid_count += 2;
289			} else {
290				s->utf8_invalid_count--;
291			}
292			s->utf8_invalid = 1;
293			PHP_JSON_CONDITION_GOTO(STR_P1);
294		}
295		s->errcode = PHP_JSON_ERROR_UTF8;
296		return PHP_JSON_T_ERROR;
297	}
298
299	<STR_P2_UTF,STR_P2_BIN>UTF16_1             {
300		int utf16 = php_json_ucs2_to_int(s, 2);
301		PHP_JSON_SCANNER_COPY_UTF();
302		*(s->pstr++) = (unsigned char) utf16;
303		s->str_start = s->cursor;
304		PHP_JSON_CONDITION_GOTO_STR_P2();
305	}
306	<STR_P2_UTF,STR_P2_BIN>UTF16_2             {
307		int utf16 = php_json_ucs2_to_int(s, 3);
308		PHP_JSON_SCANNER_COPY_UTF();
309		*(s->pstr++) = (unsigned char) (0xc0 | (utf16 >> 6));
310		*(s->pstr++) = (unsigned char) (0x80 | (utf16 & 0x3f));
311		s->str_start = s->cursor;
312		PHP_JSON_CONDITION_GOTO_STR_P2();
313	}
314	<STR_P2_UTF,STR_P2_BIN>UTF16_3             {
315		int utf16 = php_json_ucs2_to_int(s, 4);
316		PHP_JSON_SCANNER_COPY_UTF();
317		*(s->pstr++) = (unsigned char) (0xe0 | (utf16 >> 12));
318		*(s->pstr++) = (unsigned char) (0x80 | ((utf16 >> 6) & 0x3f));
319		*(s->pstr++) = (unsigned char) (0x80 | (utf16 & 0x3f));
320		s->str_start = s->cursor;
321		PHP_JSON_CONDITION_GOTO_STR_P2();
322	}
323	<STR_P2_UTF,STR_P2_BIN>UTF16_4             {
324		int utf32, utf16_hi, utf16_lo;
325		utf16_hi = php_json_ucs2_to_int(s, 4);
326		utf16_lo = php_json_ucs2_to_int_ex(s, 4, 7);
327		utf32 = ((utf16_lo & 0x3FF) << 10) + (utf16_hi & 0x3FF) + 0x10000;
328		PHP_JSON_SCANNER_COPY_UTF_SP();
329		*(s->pstr++) = (unsigned char) (0xf0 | (utf32 >> 18));
330		*(s->pstr++) = (unsigned char) (0x80 | ((utf32 >> 12) & 0x3f));
331		*(s->pstr++) = (unsigned char) (0x80 | ((utf32 >> 6) & 0x3f));
332		*(s->pstr++) = (unsigned char) (0x80 | (utf32 & 0x3f));
333		s->str_start = s->cursor;
334		PHP_JSON_CONDITION_GOTO_STR_P2();
335	}
336	<STR_P2_UTF,STR_P2_BIN>ESCPREF          {
337		unsigned char esc;
338		PHP_JSON_SCANNER_COPY_ESC();
339		switch (*s->cursor) {
340			case 'b':
341				esc = '\b';
342				break;
343			case 'f':
344				esc = '\f';				break;
345			case 'n':
346				esc = '\n';
347				break;
348			case 'r':
349				esc = '\r';
350				break;
351			case 't':
352				esc = '\t';
353				break;
354			case '\\':
355			case '/':
356			case '"':
357				esc = *s->cursor;
358				break;
359			default:
360				s->errcode = PHP_JSON_ERROR_SYNTAX;
361				return PHP_JSON_T_ERROR;
362		}
363		*(s->pstr++) = esc;
364		++YYCURSOR;
365		s->str_start = s->cursor;
366		PHP_JSON_CONDITION_GOTO_STR_P2();
367	}
368	<STR_P2_UTF,STR_P2_BIN>["] => JS        {
369		PHP_JSON_SCANNER_COPY_ESC();
370		return PHP_JSON_T_STRING;
371	}
372	<STR_P2_BIN>UTF8         { PHP_JSON_CONDITION_GOTO(STR_P2_BIN); }
373	<STR_P2_BIN>ANY          {
374		if (s->utf8_invalid) {
375			PHP_JSON_SCANNER_COPY_ESC();
376			if (s->options & PHP_JSON_INVALID_UTF8_SUBSTITUTE) {
377				*(s->pstr++) = (unsigned char) (0xe0 | (0xfffd >> 12));
378				*(s->pstr++) = (unsigned char) (0x80 | ((0xfffd >> 6) & 0x3f));
379				*(s->pstr++) = (unsigned char) (0x80 | (0xfffd & 0x3f));
380			}
381			s->str_start = s->cursor;
382		}
383		PHP_JSON_CONDITION_GOTO(STR_P2_BIN);
384	}
385	<STR_P2_UTF>ANY          { PHP_JSON_CONDITION_GOTO(STR_P2_UTF); }
386
387	<*>ANY                   {
388		s->errcode = PHP_JSON_ERROR_SYNTAX;
389		return PHP_JSON_T_ERROR;
390	}
391*/
392
393}
394