xref: /PHP-Parser/lib/PhpParser/Lexer.php (revision f7d484aa)
1<?php declare(strict_types=1);
2
3namespace PhpParser;
4
5require __DIR__ . '/compatibility_tokens.php';
6
/**
 * Converts PHP source code into an array of Token objects and reports lexing errors (invalid
 * characters, unterminated comments) through an ErrorHandler.
 */
class Lexer {
    /**
     * Tokenize the provided source code.
     *
     * The token array is in the same format as provided by the PhpToken::tokenize() method in
     * PHP 8.0. The tokens are instances of PhpParser\Token, to abstract over a polyfill
     * implementation in earlier PHP versions.
     *
     * The token array is terminated by a sentinel token with token ID 0.
     * The token array does not discard any tokens (i.e. whitespace and comments are included).
     * The token position attributes are against this token array.
     *
     * The xdebug.scream ini setting is switched off while lexing and is restored afterwards,
     * including when the error handler throws.
     *
     * @param string $code The source code to tokenize.
     * @param ErrorHandler|null $errorHandler Error handler to use for lexing errors. Defaults to
     *                                        ErrorHandler\Throwing.
     * @return Token[] Tokens
     */
    public function tokenize(string $code, ?ErrorHandler $errorHandler = null): array {
        if (null === $errorHandler) {
            $errorHandler = new ErrorHandler\Throwing();
        }

        // xdebug.scream disables the "@" operator, which would defeat the suppression used
        // below. ini_set() returns the previous value, or false if the setting could not be
        // changed (e.g. Xdebug is not loaded), in which case there is nothing to restore.
        $scream = ini_set('xdebug.scream', '0');

        try {
            $tokens = @Token::tokenize($code);
            $this->postprocessTokens($tokens, $errorHandler);
        } finally {
            // Restore in a finally block: the error handler may throw from
            // postprocessTokens(), and the ini setting must not stay modified in that case.
            if (false !== $scream) {
                ini_set('xdebug.scream', $scream);
            }
        }

        return $tokens;
    }

    /**
     * Report a T_BAD_CHARACTER token to the error handler.
     *
     * The reported error starts and ends on the token's own line and file position.
     *
     * @param Token $token The offending token; its text is the invalid character.
     * @param ErrorHandler $errorHandler Handler that receives the error.
     */
    private function handleInvalidCharacter(Token $token, ErrorHandler $errorHandler): void {
        $chr = $token->text;
        if ($chr === "\0") {
            // PHP cuts error message after null byte, so need special case
            $errorMsg = 'Unexpected null byte';
        } else {
            $errorMsg = sprintf(
                'Unexpected character "%s" (ASCII %d)', $chr, ord($chr)
            );
        }

        $errorHandler->handleError(new Error($errorMsg, [
            'startLine' => $token->line,
            'endLine' => $token->line,
            'startFilePos' => $token->pos,
            'endFilePos' => $token->pos,
        ]));
    }

    /**
     * Check whether a token is a block comment that lacks its closing delimiter.
     *
     * @param Token $token Token to inspect.
     * @return bool True only for T_COMMENT / T_DOC_COMMENT tokens that start with the block
     *              comment opener but are not closed by a separate terminator.
     */
    private function isUnterminatedComment(Token $token): bool {
        if (!$token->is([\T_COMMENT, \T_DOC_COMMENT])) {
            return false;
        }

        $text = $token->text;
        if (substr($text, 0, 2) !== '/*') {
            // Not a block comment (e.g. "//" or "#" style), so there is no terminator to miss.
            return false;
        }

        // The closing delimiter cannot share its "*" with the opening one, so the shortest
        // terminated block comment is the four-byte "/**/". Without the length check the
        // three-byte text "/*/" would pass the suffix test and be treated as terminated.
        return \strlen($text) < 4 || substr($text, -2) !== '*/';
    }

    /**
     * Report lexing errors found in the token array and canonicalize it in place.
     *
     * @param list<Token> $tokens Token array, modified by reference: ampersand tokens get
     *                            their ID rewritten and a sentinel token is appended.
     * @param ErrorHandler $errorHandler Handler that receives the reported errors.
     */
    protected function postprocessTokens(array &$tokens, ErrorHandler $errorHandler): void {
        // This function reports errors (bad characters and unterminated comments) in the token
        // array, and performs certain canonicalizations:
        //  * Use PHP 8.1 T_AMPERSAND_NOT_FOLLOWED_BY_VAR_OR_VARARG and
        //    T_AMPERSAND_FOLLOWED_BY_VAR_OR_VARARG tokens used to disambiguate intersection types.
        //  * Add a sentinel token with ID 0.

        $numTokens = \count($tokens);
        if ($numTokens === 0) {
            // Empty input edge case: Just add the sentinel token.
            $tokens[] = new Token(0, "\0", 1, 0);
            return;
        }

        for ($i = 0; $i < $numTokens; $i++) {
            $token = $tokens[$i];
            if ($token->id === \T_BAD_CHARACTER) {
                $this->handleInvalidCharacter($token, $errorHandler);
            }

            if ($token->id === \ord('&')) {
                // Look past any whitespace tokens to find what the ampersand applies to.
                $next = $i + 1;
                while (isset($tokens[$next]) && $tokens[$next]->id === \T_WHITESPACE) {
                    $next++;
                }
                $followedByVarOrVarArg = isset($tokens[$next]) &&
                    $tokens[$next]->is([\T_VARIABLE, \T_ELLIPSIS]);
                // Token objects are shared with the array, so this updates $tokens[$i] too.
                $token->id = $followedByVarOrVarArg
                    ? \T_AMPERSAND_FOLLOWED_BY_VAR_OR_VARARG
                    : \T_AMPERSAND_NOT_FOLLOWED_BY_VAR_OR_VARARG;
            }
        }

        // Check for unterminated comment. Only the last token is inspected.
        $lastToken = $tokens[$numTokens - 1];
        if ($this->isUnterminatedComment($lastToken)) {
            $errorHandler->handleError(new Error('Unterminated comment', [
                'startLine' => $lastToken->line,
                'endLine' => $lastToken->getEndLine(),
                'startFilePos' => $lastToken->pos,
                'endFilePos' => $lastToken->getEndPos(),
            ]));
        }

        // Add sentinel token, positioned at the end of the last real token.
        $tokens[] = new Token(0, "\0", $lastToken->getEndLine(), $lastToken->getEndPos());
    }
}
117