xref: /php-src/ext/libxml/mime_sniff.c (revision 34ec4b35)
1 /*
2    +----------------------------------------------------------------------+
3    | Copyright (c) The PHP Group                                          |
4    +----------------------------------------------------------------------+
5    | This source file is subject to version 3.01 of the PHP license,      |
6    | that is bundled with this package in the file LICENSE, and is        |
7    | available through the world-wide-web at the following url:           |
8    | https://www.php.net/license/3_01.txt                                 |
9    | If you did not receive a copy of the PHP license and are unable to   |
10    | obtain it through the world-wide-web, please send a note to          |
11    | license@php.net so we can mail you a copy immediately.               |
12    +----------------------------------------------------------------------+
13    | Authors: Niels Dossche <nielsdos@php.net>                            |
14    +----------------------------------------------------------------------+
15 */
16 
/* This file implements the MIME type parsing algorithm from https://mimesniff.spec.whatwg.org/#parsing-a-mime-type (Date: 2023-09-27)
 * It is a strict implementation of the algorithm, i.e. it does not accept malformed headers.
 * In particular, it exposes php_libxml_sniff_charset_from_string() and php_libxml_sniff_charset_from_stream()
 * to parse the charset from the Content-Type header.
 */
21 
22 #ifdef HAVE_CONFIG_H
23 #include "config.h"
24 #endif
25 
26 #include "php.h"
27 #ifdef HAVE_LIBXML
28 
29 #include "php_libxml.h"
30 
/* Predicate for collect_a_sequence_of_code_points(): true for every byte except '/'. */
static bool is_not_slash(char c)
{
	return !(c == '/');
}
35 
/* Predicate for collect_a_sequence_of_code_points(): true for every byte except ';'. */
static bool is_not_semicolon(char c)
{
	return !(c == ';');
}
40 
/* Predicate for collect_a_sequence_of_code_points(): false for ';' and '=', true for anything else. */
static bool is_not_semicolon_or_equals(char c)
{
	return !(c == ';' || c == '=');
}
45 
/* Predicate for collect_a_sequence_of_code_points(): false for '"' and '\\', true for anything else. */
static bool is_not_quote_or_backslash(char c)
{
	switch (c) {
		case '"':
		case '\\':
			return false;
		default:
			return true;
	}
}
50 
/* https://fetch.spec.whatwg.org/#http-tab-or-space
 * True for U+0009 TAB and U+0020 SPACE only. */
static bool is_http_tab_or_space(char c)
{
	if (c == 0x20) {
		return true;
	}
	return c == 0x09;
}
56 
/* https://fetch.spec.whatwg.org/#http-whitespace
 * True for U+000A LF, U+000D CR, and everything is_http_tab_or_space() accepts. */
static bool is_http_whitespace(char c)
{
	switch (c) {
		case 0x0A:
		case 0x0D:
			return true;
		default:
			return is_http_tab_or_space(c);
	}
}
62 
/* https://mimesniff.spec.whatwg.org/#http-quoted-string-token-code-point
 * Accepts TAB and every byte from 0x20 upwards except DEL (0x7F).
 * Note: the parameter must stay unsigned, otherwise bytes >= 0x80 would compare as negative
 * and fail the lower-bound check. */
static bool is_http_quoted_string_token(unsigned char c)
{
	if (c == 0x09) {
		return true;
	}
	if (c < 0x20) {
		return false;
	}
	return c != 0x7F;
}
68 
69 /* https://infra.spec.whatwg.org/#collect-a-sequence-of-code-points
70  * Implemented by returning the length of the sequence */
collect_a_sequence_of_code_points(const char * position,const char * end,bool (* condition)(char))71 static zend_always_inline size_t collect_a_sequence_of_code_points(const char *position, const char *end, bool (*condition)(char))
72 {
73 	const char *start = position;
74 	while (position < end && condition(*position)) {
75 		position++;
76 	}
77 	return position - start;
78 }
79 
80 /* https://fetch.spec.whatwg.org/#collect-an-http-quoted-string with extract-value always true */
collect_an_http_quoted_string_with_extract_value(const char * position,const char * end,const char ** position_out)81 static zend_string *collect_an_http_quoted_string_with_extract_value(const char *position, const char *end, const char **position_out)
82 {
83 	/* 1. Saving positionStart is not necessary, as in the extract-value == true variant we don't use it */
84 
85 	/* 2. Let value be the empty string */
86 	zend_string *value = zend_string_alloc(end - position /* can't be longer than this */, false);
87 	ZSTR_LEN(value) = 0;
88 
89 	/* 3. Assert */
90 	ZEND_ASSERT(*position == '"');
91 
92 	/* 4. Advance */
93 	position++;
94 
95 	/* 5. While true */
96 	while (true) {
97 		/* 5.1. Append the result of collect a sequence of code points that are not '"' or '\\' */
98 		size_t length = collect_a_sequence_of_code_points(position, end, is_not_quote_or_backslash);
99 		memcpy(ZSTR_VAL(value) + ZSTR_LEN(value), position, length);
100 		ZSTR_LEN(value) += length;
101 		position += length;
102 
103 		/* 5.2. Past end check */
104 		if (position >= end) {
105 			break;
106 		}
107 
108 		/* 5.3. quoteOrBackslash is the code point at position */
109 		char quote_or_backslash = *position;
110 
111 		/* 5.4. Advance */
112 		position++;
113 
114 		/* 5.5. quote_or_backslash is '\\', deal with escaping */
115 		if (quote_or_backslash == '\\') {
116 			/* 5.5.1. Past end check */
117 			if (position >= end) {
118 				ZSTR_VAL(value)[ZSTR_LEN(value)] = '\\';
119 				ZSTR_LEN(value)++;
120 				break;
121 			}
122 
123 			/* 5.5.2. Append code point at position */
124 			ZSTR_VAL(value)[ZSTR_LEN(value)] = *position;
125 			ZSTR_LEN(value)++;
126 
127 			/* 5.5.3. Advance */
128 			position++;
129 		} else {
130 			/* 5.6. Otherwise: assert and break */
131 			ZEND_ASSERT(quote_or_backslash == '"');
132 			break;
133 		}
134 	}
135 
136 	ZSTR_VAL(value)[ZSTR_LEN(value)] = '\0';
137 
138 	*position_out = position;
139 
140 	/* 6. extract-value is always true, return value */
141 	/* Step 7 is not needed because we always return here already */
142 	return value;
143 }
144 
/* https://infra.spec.whatwg.org/#ascii-alphanumeric
 * True for '0'-'9', 'a'-'z' and 'A'-'Z'. */
static bool is_ascii_alpha_numeric(char c)
{
	if (c >= '0' && c <= '9') {
		return true;
	}
	if (c >= 'a' && c <= 'z') {
		return true;
	}
	return c >= 'A' && c <= 'Z';
}
150 
/* https://mimesniff.spec.whatwg.org/#http-token-code-point
 * True for the punctuation bytes listed below and for ASCII alphanumerics. */
static bool is_http_token(char c)
{
	switch (c) {
		case 0x21: /* ! */
		case 0x23: /* # */
		case 0x24: /* $ */
		case 0x25: /* % */
		case 0x26: /* & */
		case 0x27: /* ' */
		case 0x2A: /* * */
		case 0x2B: /* + */
		case 0x2D: /* - */
		case 0x2E: /* . */
		case 0x5E: /* ^ */
		case 0x5F: /* _ */
		case 0x60: /* ` */
		case 0x7C: /* | */
		case 0x7E: /* ~ */
			return true;
		default:
			return is_ascii_alpha_numeric(c);
	}
}
162 
/* Returns true when the len bytes at start are unusable as a MIME type/subtype: either there are
 * no bytes at all, or at least one of them is not an HTTP token code point. */
static bool is_empty_string_or_does_not_solely_contain_http_token_code_points(const char *start, size_t len)
{
	if (len == 0) {
		return true;
	}

	for (size_t i = 0; i < len; i++) {
		if (!is_http_token(start[i])) {
			return true;
		}
	}

	return false;
}
177 
/* Returns true when each of the len bytes at start is an HTTP quoted-string token code point.
 * An empty range trivially qualifies. */
static bool solely_contains_http_quoted_string_tokens(const char *start, size_t len)
{
	for (size_t i = 0; i < len; i++) {
		if (!is_http_quoted_string_token(start[i])) {
			return false;
		}
	}

	return true;
}
189 
190 /* https://mimesniff.spec.whatwg.org/#parsing-a-mime-type
191  * Note: We only care about the charset detection */
php_libxml_sniff_charset_from_string(const char * start,const char * end)192 PHP_LIBXML_API zend_string *php_libxml_sniff_charset_from_string(const char *start, const char *end)
193 {
194 	/* 1. Remove leading & trailing HTTP whitespace */
195 	while (start < end && is_http_whitespace(*start)) {
196 		start++;
197 	}
198 	while (start < end && is_http_whitespace(*(end - 1))) {
199 		end--;
200 	}
201 
202 	/* 2. Position variable: no-op because we move the start pointer instead */
203 
204 	/* 3. Collect sequence of code points that are not '/' (for type) */
205 	size_t type_length = collect_a_sequence_of_code_points(start, end, is_not_slash);
206 
207 	/* 4. Empty string or not solely http tokens */
208 	if (is_empty_string_or_does_not_solely_contain_http_token_code_points(start, type_length)) {
209 		return NULL;
210 	}
211 	start += type_length;
212 
213 	/* 5. Failure if past end of input (note: end is one past the last char; in practice this is only possible if no '/' was found) */
214 	if (start >= end) {
215 		return NULL;
216 	}
217 
218 	/* 6. Skip '/' */
219 	start++;
220 
221 	/* 7. Collect sequence of code points that are not ';' (for subtype) */
222 	size_t subtype_length = collect_a_sequence_of_code_points(start, end, is_not_semicolon);
223 
224 	/* 8. Remove trailing HTTP whitespace from subtype, but we don't care about subtype, so no-op */
225 
226 	/* 9. Empty string or not solely http tokens */
227 	if (is_empty_string_or_does_not_solely_contain_http_token_code_points(start, subtype_length)) {
228 		return NULL;
229 	}
230 	start += subtype_length;
231 
232 	/* 10. Initialise stuff, no-op as well as we don't care about anything other than charset */
233 
234 	/* 11. Loop with check: position not past end */
235 	while (start < end) {
236 		/* 11.1. Advance position */
237 		start++;
238 
239 		/* 11.2. Collect sequence that *is* HTTP whitespace */
240 		size_t whitespace_length = collect_a_sequence_of_code_points(start, end, is_http_whitespace);
241 		start += whitespace_length;
242 
243 		/* 11.3. Collect a sequence of code points that are not ';' or '=' (for parameterName) */
244 		size_t parameter_name_length = collect_a_sequence_of_code_points(start, end, is_not_semicolon_or_equals);
245 		const char *parameter_name = start;
246 		start += parameter_name_length;
247 
248 		/* 11.4. Convert parameter_name to ASCII lowercase, no-op because we are only interested in charset which we'll match down below */
249 
250 		/* 11.5. Position past input check */
251 		if (start < end) {
252 			if (*start == ';') {
253 				continue;
254 			}
255 			start++;
256 		} else {
257 			/* 11.6. */
258 			break;
259 		}
260 
261 		/* 11.7. Let parameterValue be null */
262 		zend_string *parameter_value = NULL;
263 
264 		/* 11.8. Quoted string check */
265 		if (*start == '"') {
266 			/* 11.8.1. Set parameterValue to the result of collecting an HTTP quoted string */
267 			parameter_value = collect_an_http_quoted_string_with_extract_value(start, end, &start);
268 
269 			/* 11.8.2. Collect a sequence of code points that are not ';' */
270 			start += collect_a_sequence_of_code_points(start, end, is_not_semicolon);
271 		} else {
272 			/* 11.9. Otherwise */
273 			/* 11.9.1. Set parameterValue to the result of collecting a sequence of code points that are not ';' */
274 			size_t parameter_value_length = collect_a_sequence_of_code_points(start, end, is_not_semicolon);
275 			parameter_value = zend_string_init(start, parameter_value_length, false);
276 			start += parameter_name_length;
277 
278 			/* 11.9.2. Remove trailing HTTP whitespace from parameterValue */
279 			while (ZSTR_LEN(parameter_value) > 0 && is_http_whitespace(ZSTR_VAL(parameter_value)[ZSTR_LEN(parameter_value) - 1])) {
280 				ZSTR_LEN(parameter_value)--;
281 			}
282 			ZSTR_VAL(parameter_value)[ZSTR_LEN(parameter_value)] = '\0';
283 
284 			/* 11.9.3. Continue if parameterValue is empty */
285 			if (ZSTR_LEN(parameter_value) == 0) {
286 				zend_string_release_ex(parameter_value, false);
287 				continue;
288 			}
289 		}
290 
291 		/* 11.10. We diverge from the spec here: we're only interested in charset.
292 		 *        Furthermore, as only the first match matters, we can stop immediately with the loop once we set the charset. */
293 		if (parameter_name_length == strlen("charset")
294 			&& strncasecmp(parameter_name, "charset", strlen("charset")) == 0 /* Because of lowercasing in step 11.4 */
295 			&& solely_contains_http_quoted_string_tokens(ZSTR_VAL(parameter_value), ZSTR_LEN(parameter_value))) {
296 			return parameter_value;
297 		}
298 
299 		zend_string_release_ex(parameter_value, false);
300 	}
301 
302 	/* 12. Return mimetype, a no-op / spec divergence */
303 	return NULL;
304 }
305 
php_libxml_sniff_charset_from_stream(const php_stream * s)306 PHP_LIBXML_API zend_string *php_libxml_sniff_charset_from_stream(const php_stream *s)
307 {
308 	if (Z_TYPE(s->wrapperdata) == IS_ARRAY) {
309 		zval *header;
310 
311 		ZEND_HASH_FOREACH_VAL_IND(Z_ARRVAL(s->wrapperdata), header) {
312 			const char buf[] = "Content-Type:";
313 			if (Z_TYPE_P(header) == IS_STRING &&
314 					!zend_binary_strncasecmp(Z_STRVAL_P(header), Z_STRLEN_P(header), buf, sizeof(buf)-1, sizeof(buf)-1)) {
315 				return php_libxml_sniff_charset_from_string(Z_STRVAL_P(header) + sizeof(buf) - 1, Z_STRVAL_P(header) + Z_STRLEN_P(header));
316 			}
317 		} ZEND_HASH_FOREACH_END();
318 	}
319 
320 	return NULL;
321 }
322 
323 #endif  /* HAVE_LIBXML */
324