xref: /PHP-8.3/sapi/fuzzer/fuzzer-mbstring.c (revision 5f2587eb)
1 /*
2    +----------------------------------------------------------------------+
3    | Copyright (c) The PHP Group                                          |
4    +----------------------------------------------------------------------+
5    | This source file is subject to version 3.01 of the PHP license,      |
6    | that is bundled with this package in the file LICENSE, and is        |
7    | available through the world-wide-web at the following url:           |
8    | https://www.php.net/license/3_01.txt                                 |
9    | If you did not receive a copy of the PHP license and are unable to   |
10    | obtain it through the world-wide-web, please send a note to          |
11    | license@php.net so we can mail you a copy immediately.               |
12    +----------------------------------------------------------------------+
13    | Authors: Stanislav Malyshev <stas@php.net>                           |
14    +----------------------------------------------------------------------+
15  */
16 
17 
18 #include "zend.h"
19 #include "fuzzer.h"
20 #include "fuzzer-sapi.h"
21 #include "ext/mbstring/mbstring.h"
22 
convert_encoding(const uint8_t * Data,size_t Size,const mbfl_encoding * FromEncoding,const mbfl_encoding * ToEncoding,size_t BufSize,unsigned int * NumErrors)23 zend_string* convert_encoding(const uint8_t *Data, size_t Size, const mbfl_encoding *FromEncoding, const mbfl_encoding *ToEncoding, size_t BufSize, unsigned int *NumErrors)
24 {
25 	uint32_t *wchar_buf = ecalloc(BufSize, sizeof(uint32_t));
26 	unsigned int state = 0;
27 
28 	mb_convert_buf buf;
29 	mb_convert_buf_init(&buf, Size, '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR);
30 
31 	while (Size) {
32 		size_t out_len = FromEncoding->to_wchar((unsigned char**)&Data, &Size, wchar_buf, BufSize, &state);
33 		ZEND_ASSERT(out_len <= BufSize);
34 		ToEncoding->from_wchar(wchar_buf, out_len, &buf, !Size);
35 	}
36 
37 	*NumErrors = buf.errors;
38 	zend_string *result = mb_convert_buf_result(&buf, ToEncoding);
39 	efree(wchar_buf);
40 	return result;
41 }
42 
/* Assert that two zend_strings have the same length and the same bytes at every
 * position. */
void assert_zend_string_eql(zend_string *str1, zend_string *str2)
{
	ZEND_ASSERT(ZSTR_LEN(str1) == ZSTR_LEN(str2));

	/* Use a size_t index: ZSTR_LEN() is a size_t, so an int counter would be
	 * converted to unsigned for the comparison and could overflow on strings
	 * longer than INT_MAX. */
	const size_t len = ZSTR_LEN(str1);
	for (size_t i = 0; i < len; i++) {
		ZEND_ASSERT(ZSTR_VAL(str1)[i] == ZSTR_VAL(str2)[i]);
	}
}
50 
LLVMFuzzerTestOneInput(const uint8_t * Data,size_t Size)51 int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
52 	const uint8_t *Comma1 = memchr(Data, ',', Size);
53 	if (!Comma1) {
54 		return 0;
55 	}
56 
57 	size_t ToEncodingNameLen = Comma1 - Data;
58 	char *ToEncodingName = estrndup((char *) Data, ToEncodingNameLen);
59 	Data = Comma1 + 1;
60 	Size -= ToEncodingNameLen + 1;
61 
62 	const uint8_t *Comma2 = memchr(Data, ',', Size);
63 	if (!Comma2) {
64 		efree(ToEncodingName);
65 		return 0;
66 	}
67 
68 	size_t FromEncodingNameLen = Comma2 - Data;
69 	char *FromEncodingName = estrndup((char *) Data, FromEncodingNameLen);
70 	Data = Comma2 + 1;
71 	Size -= FromEncodingNameLen + 1;
72 
73 	const mbfl_encoding *ToEncoding = mbfl_name2encoding(ToEncodingName);
74 	const mbfl_encoding *FromEncoding = mbfl_name2encoding(FromEncodingName);
75 
76 	if (!ToEncoding || !FromEncoding || Size < 2 || fuzzer_request_startup() == FAILURE) {
77 		efree(ToEncodingName);
78 		efree(FromEncodingName);
79 		return 0;
80 	}
81 
82 	/* Rather than converting an entire (possibly very long) string at once, mbstring converts
83 	 * strings 'chunk by chunk'; the decoder will run until it fills up its output buffer with
84 	 * wchars, then the encoder will process those wchars, then the decoder runs again until it
85 	 * again fills up its output buffer, and so on
86 	 *
87 	 * The most error-prone part of the decoder/encoder code is where we exit a decoder/encoder
88 	 * function and save its state to allow later resumption
89 	 * To stress-test that aspect of the decoders/encoders, try performing an encoding conversion
90 	 * operation with different, random buffer sizes
91 	 * If the code is correct, the result should always be the same either way */
92 	size_t bufsize1 = *Data++;
93 	size_t bufsize2 = *Data++;
94 	bufsize1 = MAX(bufsize1, MBSTRING_MIN_WCHAR_BUFSIZE);
95 	bufsize2 = MAX(bufsize2, MBSTRING_MIN_WCHAR_BUFSIZE);
96 	Size -= 2;
97 
98 	unsigned int errors1 = 0, errors2 = 0;
99 
100 	zend_string *Result1 = convert_encoding(Data, Size, FromEncoding, ToEncoding, bufsize1, &errors1);
101 	zend_string *Result2 = convert_encoding(Data, Size, FromEncoding, ToEncoding, bufsize2, &errors2);
102 
103 	assert_zend_string_eql(Result1, Result2);
104 	ZEND_ASSERT(errors1 == errors2);
105 
106 	/* For some text encodings, we have specialized validation functions. These should always be
107 	 * stricter than the conversion functions; if the conversion function receives invalid input
108 	 * and emits an error marker (MBFL_BAD_INPUT), then the validation function should always
109 	 * return false. However, if the conversion function does not emit any error marker, it may
110 	 * still happen in some cases that the validation function returns false. */
111 	if (FromEncoding->check != NULL) {
112 		bool good = FromEncoding->check((unsigned char*)Data, Size);
113 		if (errors1 > 0) {
114 			ZEND_ASSERT(!good);
115 		}
116 	}
117 
118 	zend_string_release(Result1);
119 	zend_string_release(Result2);
120 	efree(ToEncodingName);
121 	efree(FromEncodingName);
122 
123 	fuzzer_request_shutdown();
124 	return 0;
125 }
126 
LLVMFuzzerInitialize(int * argc,char *** argv)127 int LLVMFuzzerInitialize(int *argc, char ***argv) {
128 	fuzzer_init_php(NULL);
129 	return 0;
130 }
131