<?php declare(strict_types=1);

namespace PhpParser;

require __DIR__ . '/compatibility_tokens.php';

class Lexer {
    /**
     * Tokenize the provided source code.
     *
     * The token array is in the same format as provided by the PhpToken::tokenize() method in
     * PHP 8.0. The tokens are instances of PhpParser\Token, to abstract over a polyfill
     * implementation in earlier PHP versions.
     *
     * The token array is terminated by a sentinel token with token ID 0.
     * The token array does not discard any tokens (i.e. whitespace and comments are included).
     * The token position attributes are against this token array.
     *
     * @param string $code The source code to tokenize.
     * @param ErrorHandler|null $errorHandler Error handler to use for lexing errors. Defaults to
     *                                        ErrorHandler\Throwing.
     * @return Token[] Tokens
     */
    public function tokenize(string $code, ?ErrorHandler $errorHandler = null): array {
        if (null === $errorHandler) {
            $errorHandler = new ErrorHandler\Throwing();
        }

        // Xdebug's "scream" mode disables the @ operator, so switch it off while tokenizing.
        // ini_set() returns the previous value, or false if the setting could not be changed.
        $scream = ini_set('xdebug.scream', '0');

        try {
            // Diagnostics emitted by the tokenizer itself are silenced; lexing problems are
            // reported through the error handler in postprocessTokens() instead.
            $tokens = @Token::tokenize($code);
            $this->postprocessTokens($tokens, $errorHandler);
        } finally {
            // The error handler may throw while reporting a lexing error. Restore the ini
            // setting on that path as well, so that it does not stay modified for the caller.
            if (false !== $scream) {
                ini_set('xdebug.scream', $scream);
            }
        }

        return $tokens;
    }

    /**
     * Report a T_BAD_CHARACTER token to the error handler.
     *
     * @param Token $token The offending token; its text is the invalid character.
     * @param ErrorHandler $errorHandler Error handler that receives the error.
     */
    private function handleInvalidCharacter(Token $token, ErrorHandler $errorHandler): void {
        $chr = $token->text;
        if ($chr === "\0") {
            // PHP cuts error message after null byte, so need special case
            $errorMsg = 'Unexpected null byte';
        } else {
            $errorMsg = sprintf(
                'Unexpected character "%s" (ASCII %d)', $chr, ord($chr)
            );
        }

        $errorHandler->handleError(new Error($errorMsg, [
            'startLine' => $token->line,
            'endLine' => $token->line,
            'startFilePos' => $token->pos,
            'endFilePos' => $token->pos,
        ]));
    }

    /**
     * Check whether a token is a block comment that is missing its closing delimiter.
     *
     * @param Token $token Token to inspect.
     * @return bool True if the token is a comment that opens with "/*" but is not closed.
     */
    private function isUnterminatedComment(Token $token): bool {
        if (!$token->is([\T_COMMENT, \T_DOC_COMMENT])) {
            return false;
        }

        $text = $token->text;
        if (substr($text, 0, 2) !== '/*') {
            // Not a block comment (e.g. a "//" or "#" line comment).
            return false;
        }

        // The shortest terminated block comment is "/**/" (4 bytes). Anything shorter cannot be
        // terminated: in "/*/" the trailing "*/" overlaps the opening delimiter, so checking the
        // suffix alone would wrongly treat it as closed.
        return \strlen($text) < 4 || substr($text, -2) !== '*/';
    }

    /**
     * Report lexing errors found in the token array and canonicalize it in place.
     *
     * @param list<Token> $tokens Token array, modified by reference.
     * @param ErrorHandler $errorHandler Error handler that receives lexing errors.
     */
    protected function postprocessTokens(array &$tokens, ErrorHandler $errorHandler): void {
        // This function reports errors (bad characters and unterminated comments) in the token
        // array, and performs certain canonicalizations:
        //  * Use PHP 8.1 T_AMPERSAND_NOT_FOLLOWED_BY_VAR_OR_VARARG and
        //    T_AMPERSAND_FOLLOWED_BY_VAR_OR_VARARG tokens used to disambiguate intersection types.
        //  * Add a sentinel token with ID 0.

        $numTokens = \count($tokens);
        if ($numTokens === 0) {
            // Empty input edge case: Just add the sentinel token.
            $tokens[] = new Token(0, "\0", 1, 0);
            return;
        }

        for ($i = 0; $i < $numTokens; $i++) {
            $token = $tokens[$i];
            if ($token->id === \T_BAD_CHARACTER) {
                $this->handleInvalidCharacter($token, $errorHandler);
            }

            if ($token->id === \ord('&')) {
                // Look past any whitespace tokens to find what follows the ampersand.
                $next = $i + 1;
                while (isset($tokens[$next]) && $tokens[$next]->id === \T_WHITESPACE) {
                    $next++;
                }
                $followedByVarOrVarArg = isset($tokens[$next]) &&
                    $tokens[$next]->is([\T_VARIABLE, \T_ELLIPSIS]);
                $token->id = $followedByVarOrVarArg
                    ? \T_AMPERSAND_FOLLOWED_BY_VAR_OR_VARARG
                    : \T_AMPERSAND_NOT_FOLLOWED_BY_VAR_OR_VARARG;
            }
        }

        // Check for unterminated comment
        $lastToken = $tokens[$numTokens - 1];
        if ($this->isUnterminatedComment($lastToken)) {
            $errorHandler->handleError(new Error('Unterminated comment', [
                'startLine' => $lastToken->line,
                'endLine' => $lastToken->getEndLine(),
                'startFilePos' => $lastToken->pos,
                'endFilePos' => $lastToken->getEndPos(),
            ]));
        }

        // Add sentinel token.
        $tokens[] = new Token(0, "\0", $lastToken->getEndLine(), $lastToken->getEndPos());
    }
}