/* xref: /PHP-8.0/ext/fileinfo/libmagic/is_json.c (revision c3eeab01) */
1 /*-
2  * Copyright (c) 2018 Christos Zoulas
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
15  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
16  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
18  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
19  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
20  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
21  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
22  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
23  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
24  * POSSIBILITY OF SUCH DAMAGE.
25  */
26 
/*
 * Parse JSON object serialization format (RFC-7159)
 */
30 
31 #ifndef TEST
32 #include "file.h"
33 
34 #ifndef lint
35 FILE_RCSID("@(#)$File: is_json.c,v 1.15 2020/06/07 19:05:47 christos Exp $")
36 #endif
37 
38 #include <string.h>
39 #include "magic.h"
40 #endif
41 
/*
 * Debug tracing helper.
 *
 * When DEBUG is defined, DPRINTF(a, b, c) prints the label `a', the byte
 * that `b' points at (as two hex digits and as a character) and at most 20
 * characters of the text at `c'.  Otherwise it expands to an empty statement
 * so that call sites cost nothing.
 */
#ifdef DEBUG
#include <stdio.h>
#define DPRINTF(a, b, c)	\
    printf("%s [%.2x/%c] %.20s\n", (a), *(b), *(b), (const char *)(c))
#else
#define DPRINTF(a, b, c)	do { } while (/*CONSTCOND*/0)
#endif
49 
/*
 * Indices into the statistics array (size_t st[JSON_MAX]) that the parser
 * updates as it recognizes values.
 */
enum {
	JSON_ARRAY	= 0,	/* arrays accepted by json_parse() */
	JSON_CONSTANT	= 1,	/* true, false, null */
	JSON_NUMBER	= 2,
	JSON_OBJECT	= 3,
	JSON_STRING	= 4,
	JSON_ARRAYN	= 5,	/* bumped by json_parse_array() when an array closes */
	JSON_MAX	= 6	/* number of counters, not an index */
};
57 
/*
 * JSON_COUNT selects how thorough the check is:
 *	non-zero:	count all the values; this requires the whole data file
 *	zero:		stop once an object or an array has been found
 * It defaults to 0 unless it has already been defined before this point.
 */
#ifndef JSON_COUNT
#define JSON_COUNT 0
#endif
67 
/*
 * Forward declaration: the array and object parsers below call back into
 * json_parse() for their nested values.
 */
static int json_parse(const unsigned char **ucp, const unsigned char *ue,
	size_t *st, size_t lvl);
70 
/*
 * Return 1 if uc is one of the four bytes treated as JSON whitespace
 * (space, line feed, carriage return, horizontal tab), 0 otherwise.
 */
static int
json_isspace(const unsigned char uc)
{
	return uc == ' ' || uc == '\n' || uc == '\r' || uc == '\t';
}
84 
/*
 * Return 1 if uc is a decimal digit '0'..'9', 0 otherwise.
 * (C guarantees that the digit characters are contiguous.)
 */
static int
json_isdigit(unsigned char uc)
{
	return uc >= '0' && uc <= '9';
}
96 
/*
 * Return 1 if uc is a hexadecimal digit (0-9, a-f, A-F), 0 otherwise.
 * Used to validate the four digits of a \uXXXX string escape.
 */
static int
json_isxdigit(unsigned char uc)
{
	if (json_isdigit(uc))
		return 1;
	switch (uc) {
	case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
	case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
		return 1;
	default:
		return 0;
	}
}
110 
/*
 * Advance uc over any run of JSON whitespace, never going past ue.
 * Returns the first non-whitespace position, or ue if the input ran out.
 */
static const unsigned char *
json_skip_space(const unsigned char *uc, const unsigned char *ue)
{
	while (uc < ue && json_isspace(*uc))
		uc++;
	return uc;
}
118 
/*
 * Scan the body of a JSON string.
 *
 * On entry *ucp points just past the opening double quote (the callers
 * consume it).  ue is one past the last byte of input.
 *
 * Returns 1 with *ucp just past the closing quote when the string is
 * well formed.  Returns 0, with *ucp at the point where scanning stopped,
 * when the input contains a NUL byte, an unknown or incomplete escape, or
 * ends before the closing quote.
 */
static int
json_parse_string(const unsigned char **ucp, const unsigned char *ue)
{
	const unsigned char *uc = *ucp;
	size_t i;

	DPRINTF("Parse string: ", uc, *ucp);
	while (uc < ue) {
		switch (*uc++) {
		case '\0':
			goto out;
		case '\\':
			/* A backslash must be followed by an escape character. */
			if (uc == ue)
				goto out;
			switch (*uc++) {
			case '\0':
				goto out;
			case '"':
			case '\\':
			case '/':
			case 'b':
			case 'f':
			case 'n':
			case 'r':
			case 't':
				continue;
			case 'u':
				/* \u needs exactly four hex digits. */
				if (ue - uc < 4) {
					uc = ue;
					goto out;
				}
				for (i = 0; i < 4; i++)
					if (!json_isxdigit(*uc++))
						goto out;
				continue;
			default:
				goto out;
			}
			/*
			 * Not reached: every escape case above either
			 * continues the loop or jumps to out.
			 */
		case '"':
			*ucp = uc;
			DPRINTF("Good string: ", uc, *ucp);
			return 1;
		default:
			continue;
		}
	}
out:
	DPRINTF("Bad string: ", uc, *ucp);
	*ucp = uc;
	return 0;
}
170 
171 static int
json_parse_array(const unsigned char ** ucp,const unsigned char * ue,size_t * st,size_t lvl)172 json_parse_array(const unsigned char **ucp, const unsigned char *ue,
173 	size_t *st, size_t lvl)
174 {
175 	const unsigned char *uc = *ucp;
176 
177 	DPRINTF("Parse array: ", uc, *ucp);
178 	while (uc < ue) {
179 		if (*uc == ']')
180 			goto done;
181 		if (!json_parse(&uc, ue, st, lvl + 1))
182 			goto out;
183 		if (uc == ue)
184 			goto out;
185 		switch (*uc) {
186 		case ',':
187 			uc++;
188 			continue;
189 		case ']':
190 		done:
191 			st[JSON_ARRAYN]++;
192 			*ucp = uc + 1;
193 			DPRINTF("Good array: ", uc, *ucp);
194 			return 1;
195 		default:
196 			goto out;
197 		}
198 	}
199 out:
200 	DPRINTF("Bad array: ", uc,  *ucp);
201 	*ucp = uc;
202 	return 0;
203 }
204 
/*
 * Parse the members of a JSON object.
 *
 * On entry *ucp points just past the opening '{' (json_parse() consumes
 * it).  Each member is a string key, a ':' and a value parsed with
 * json_parse() at nesting level lvl + 1; members are separated by ','.
 * A '}' is accepted wherever a key could start, so an empty object and a
 * trailing comma are both tolerated.
 *
 * Returns 1 with *ucp just past the closing '}' on success; returns 0 with
 * *ucp at the position where parsing stopped otherwise.
 */
static int
json_parse_object(const unsigned char **ucp, const unsigned char *ue,
	size_t *st, size_t lvl)
{
	const unsigned char *uc = *ucp;

	DPRINTF("Parse object: ", uc, *ucp);
	while (uc < ue) {
		uc = json_skip_space(uc, ue);
		if (uc == ue)
			goto out;
		if (*uc == '}') {
			uc++;
			goto done;
		}
		/* The key: an opening quote followed by a string body. */
		if (*uc++ != '"') {
			DPRINTF("not string", uc, *ucp);
			goto out;
		}
		DPRINTF("next field", uc, *ucp);
		if (!json_parse_string(&uc, ue)) {
			DPRINTF("not string", uc, *ucp);
			goto out;
		}
		uc = json_skip_space(uc, ue);
		if (uc == ue)
			goto out;
		if (*uc++ != ':') {
			DPRINTF("not colon", uc, *ucp);
			goto out;
		}
		/* The value; json_parse() skips surrounding whitespace. */
		if (!json_parse(&uc, ue, st, lvl + 1)) {
			DPRINTF("not json", uc, *ucp);
			goto out;
		}
		if (uc == ue)
			goto out;
		switch (*uc++) {
		case ',':
			continue;
		case '}': /* { */
		done:
			*ucp = uc;
			DPRINTF("Good object: ", uc, *ucp);
			return 1;
		default:
			/* Only affects the trace below; out: overwrites it. */
			*ucp = uc - 1;
			DPRINTF("not more", uc, *ucp);
			goto out;
		}
	}
out:
	DPRINTF("Bad object: ", uc, *ucp);
	*ucp = uc;
	return 0;
}
260 
/*
 * Scan a number starting at *ucp: an optional '-', digits, an optional '.'
 * followed by digits, and an optional exponent ('e' or 'E', an optional
 * sign, digits).
 *
 * The check is lenient: at least one digit is required in the mantissa as
 * a whole, so both "1." and ".5" are accepted.  Once an exponent marker has
 * been seen, at least one exponent digit is required.
 *
 * Returns 1 if a number was recognized and 0 otherwise; *ucp is left where
 * scanning stopped.  With no input at all (*ucp == ue) it returns 0 and
 * leaves *ucp alone.
 */
static int
json_parse_number(const unsigned char **ucp, const unsigned char *ue)
{
	const unsigned char *uc = *ucp;
	int got = 0;

	DPRINTF("Parse number: ", uc, *ucp);
	if (uc == ue)
		return 0;
	if (*uc == '-')
		uc++;

	/* Integer part. */
	for (; uc < ue; uc++) {
		if (!json_isdigit(*uc))
			break;
		got = 1;
	}
	if (uc == ue)
		goto out;

	/* Fraction. */
	if (*uc == '.')
		uc++;
	for (; uc < ue; uc++) {
		if (!json_isdigit(*uc))
			break;
		got = 1;
	}
	if (uc == ue)
		goto out;

	/* Exponent: only valid after a mantissa, and needs its own digits. */
	if (got && (*uc == 'e' || *uc == 'E')) {
		uc++;
		got = 0;
		if (uc == ue)
			goto out;
		if (*uc == '+' || *uc == '-')
			uc++;
		for (; uc < ue; uc++) {
			if (!json_isdigit(*uc))
				break;
			got = 1;
		}
	}
out:
	if (!got)
		DPRINTF("Bad number: ", uc, *ucp);
	else
		DPRINTF("Good number: ", uc, *ucp);
	*ucp = uc;
	return got;
}
310 
/*
 * Match the rest of a literal constant ("true", "false" or "null").
 *
 * On entry *ucp points just past the first character of the constant,
 * which json_parse() has already matched and consumed.  str is the complete
 * literal and len is its sizeof(), i.e. it counts the terminating NUL.
 *
 * Every remaining character must match; the previous implementation
 * ignored the comparison result and accepted any bytes of the right length.
 *
 * Returns 1 with *ucp just past the constant when it matches.  Returns 0
 * with *ucp at the first mismatching byte, or at ue if the input ended
 * first.
 */
static int
json_parse_const(const unsigned char **ucp, const unsigned char *ue,
    const char *str, size_t len)
{
	const unsigned char *uc = *ucp;
	/* Characters still to match: drop the NUL and the leading character. */
	size_t left = len > 2 ? len - 2 : 0;
	int ok = 1;

	DPRINTF("Parse const: ", uc, *ucp);
	while (left > 0) {
		if (uc == ue || *uc != (unsigned char)*++str) {
			ok = 0;
			break;
		}
		uc++;
		left--;
	}
	if (!ok)
		DPRINTF("Bad const: ", uc, *ucp);
	*ucp = uc;
	return ok;
}
327 
328 static int
json_parse(const unsigned char ** ucp,const unsigned char * ue,size_t * st,size_t lvl)329 json_parse(const unsigned char **ucp, const unsigned char *ue,
330     size_t *st, size_t lvl)
331 {
332 	const unsigned char *uc;
333 	int rv = 0;
334 	int t;
335 
336 	uc = json_skip_space(*ucp, ue);
337 	if (uc == ue)
338 		goto out;
339 
340 	// Avoid recursion
341 	if (lvl > 20)
342 		return 0;
343 #if JSON_COUNT
344 	/* bail quickly if not counting */
345 	if (lvl > 1 && (st[JSON_OBJECT] || st[JSON_ARRAYN]))
346 		return 1;
347 #endif
348 
349 	DPRINTF("Parse general: ", uc, *ucp);
350 	switch (*uc++) {
351 	case '"':
352 		rv = json_parse_string(&uc, ue);
353 		t = JSON_STRING;
354 		break;
355 	case '[':
356 		rv = json_parse_array(&uc, ue, st, lvl + 1);
357 		t = JSON_ARRAY;
358 		break;
359 	case '{': /* '}' */
360 		rv = json_parse_object(&uc, ue, st, lvl + 1);
361 		t = JSON_OBJECT;
362 		break;
363 	case 't':
364 		rv = json_parse_const(&uc, ue, "true", sizeof("true"));
365 		t = JSON_CONSTANT;
366 		break;
367 	case 'f':
368 		rv = json_parse_const(&uc, ue, "false", sizeof("false"));
369 		t = JSON_CONSTANT;
370 		break;
371 	case 'n':
372 		rv = json_parse_const(&uc, ue, "null", sizeof("null"));
373 		t = JSON_CONSTANT;
374 		break;
375 	default:
376 		--uc;
377 		rv = json_parse_number(&uc, ue);
378 		t = JSON_NUMBER;
379 		break;
380 	}
381 	if (rv)
382 		st[t]++;
383 	uc = json_skip_space(uc, ue);
384 out:
385 	*ucp = uc;
386 	DPRINTF("End general: ", uc, *ucp);
387 	if (lvl == 0)
388 		return rv && (st[JSON_ARRAYN] || st[JSON_OBJECT]);
389 	return rv;
390 }
391 
392 #ifndef TEST
393 int
file_is_json(struct magic_set * ms,const struct buffer * b)394 file_is_json(struct magic_set *ms, const struct buffer *b)
395 {
396 	const unsigned char *uc = CAST(const unsigned char *, b->fbuf);
397 	const unsigned char *ue = uc + b->flen;
398 	size_t st[JSON_MAX];
399 	int mime = ms->flags & MAGIC_MIME;
400 
401 
402 	if ((ms->flags & (MAGIC_APPLE|MAGIC_EXTENSION)) != 0)
403 		return 0;
404 
405 	memset(st, 0, sizeof(st));
406 
407 	if (!json_parse(&uc, ue, st, 0))
408 		return 0;
409 
410 	if (mime == MAGIC_MIME_ENCODING)
411 		return 1;
412 	if (mime) {
413 		if (file_printf(ms, "application/json") == -1)
414 			return -1;
415 		return 1;
416 	}
417 	if (file_printf(ms, "JSON data") == -1)
418 		return -1;
419 #if JSON_COUNT
420 #define P(n) st[n], st[n] > 1 ? "s" : ""
421 	if (file_printf(ms, " (%" SIZE_T_FORMAT "u object%s, %" SIZE_T_FORMAT
422 	    "u array%s, %" SIZE_T_FORMAT "u string%s, %" SIZE_T_FORMAT
423 	    "u constant%s, %" SIZE_T_FORMAT "u number%s, %" SIZE_T_FORMAT
424 	    "u >1array%s)",
425 	    P(JSON_OBJECT), P(JSON_ARRAY), P(JSON_STRING), P(JSON_CONSTANT),
426 	    P(JSON_NUMBER), P(JSON_ARRAYN))
427 	    == -1)
428 		return -1;
429 #endif
430 	return 1;
431 }
432 
433 #else
434 
435 #include <sys/types.h>
436 #include <sys/stat.h>
437 #include <stdio.h>
438 #include <fcntl.h>
439 #include <unistd.h>
440 #include <stdlib.h>
441 #include <stdint.h>
442 #include <err.h>
443 
444 int
main(int argc,char * argv[])445 main(int argc, char *argv[])
446 {
447 	int fd, rv;
448 	struct stat st;
449 	unsigned char *p;
450 	size_t stats[JSON_MAX];
451 
452 	if ((fd = open(argv[1], O_RDONLY)) == -1)
453 		err(EXIT_FAILURE, "Can't open `%s'", argv[1]);
454 
455 	if (fstat(fd, &st) == -1)
456 		err(EXIT_FAILURE, "Can't stat `%s'", argv[1]);
457 
458 	if ((p = malloc(st.st_size)) == NULL)
459 		err(EXIT_FAILURE, "Can't allocate %jd bytes",
460 		    (intmax_t)st.st_size);
461 	if (read(fd, p, st.st_size) != st.st_size)
462 		err(EXIT_FAILURE, "Can't read %jd bytes",
463 		    (intmax_t)st.st_size);
464 	memset(stats, 0, sizeof(stats));
465 	printf("is json %d\n", json_parse((const unsigned char **)&p,
466 	    p + st.st_size, stats, 0));
467 	return 0;
468 }
469 #endif
470