diff --git a/src/Tokenizer.php b/src/Tokenizer.php index f8f35a6..b974b69 100644 --- a/src/Tokenizer.php +++ b/src/Tokenizer.php @@ -722,6 +722,12 @@ final class Tokenizer 'YEARWEEK', ]; + /** @var list<string> */ + private array $dataTypeModifiers = [ + 'WITH TIME ZONE', + 'WITHOUT TIME ZONE', + ]; + /** Regular expression for tokenizing. */ private readonly string $tokenizeRegex; @@ -834,11 +840,13 @@ private function makeRegexFromList(array $values, bool $sorted = false): string private function makeTokenizeRegexes(): array { // Set up regular expressions - $regexBoundaries = $this->makeRegexFromList($this->boundaries); - $regexReserved = $this->makeRegexFromList($this->reserved); - $regexReservedToplevel = str_replace(' ', '\s+', $this->makeRegexFromList($this->reservedToplevel)); - $regexReservedNewline = str_replace(' ', '\s+', $this->makeRegexFromList($this->reservedNewline)); - $regexFunction = $this->makeRegexFromList($this->functions); + + $regexBoundaries = $this->makeRegexFromList($this->boundaries); + $regexReserved = $this->makeRegexFromList($this->reserved); + $regexReservedToplevel = str_replace(' ', '\s+', $this->makeRegexFromList($this->reservedToplevel)); + $regexReservedNewline = str_replace(' ', '\s+', $this->makeRegexFromList($this->reservedNewline)); + $regexFunction = $this->makeRegexFromList($this->functions); + $regexDataTypeModifiers = str_replace(' ', '\s+', $this->makeRegexFromList($this->dataTypeModifiers)); return [ Token::TOKEN_TYPE_WHITESPACE => '\s+', @@ -866,6 +874,10 @@ private function makeTokenizeRegexes(): array Token::TOKEN_TYPE_NUMBER => '(?:\d+(?:\.\d+)?|0x[\da-fA-F]+|0b[01]+)(?=$|\s|"\'`|' . $regexBoundaries . ')', // punctuation and symbols Token::TOKEN_TYPE_BOUNDARY => $regexBoundaries, + // data type modifiers: match 'WITH TIME ZONE' / 'WITHOUT TIME ZONE' as a single token so that they are not confused with the 'WITH' of a CTE + Token::TOKEN_TYPE_RESERVED => '(? '(? 
[ + [ + new Token(Token::TOKEN_TYPE_RESERVED, 'TIMESTAMP'), + new Token(Token::TOKEN_TYPE_BOUNDARY, '('), + new Token(Token::TOKEN_TYPE_NUMBER, '0'), + new Token(Token::TOKEN_TYPE_BOUNDARY, ')'), + new Token(Token::TOKEN_TYPE_WHITESPACE, ' '), + new Token(Token::TOKEN_TYPE_RESERVED, 'WITH TIME ZONE'), + ], + 'TIMESTAMP(0) WITH TIME ZONE', + ]; + + yield 'WITHOUT TIME ZONE as single token' => [ + [ + new Token(Token::TOKEN_TYPE_RESERVED, 'TIME'), + new Token(Token::TOKEN_TYPE_WHITESPACE, ' '), + new Token(Token::TOKEN_TYPE_RESERVED, 'WITHOUT TIME ZONE'), + ], + 'TIME WITHOUT TIME ZONE', + ]; + + yield 'CTE WITH still works' => [ + [ + new Token(Token::TOKEN_TYPE_RESERVED_TOPLEVEL, 'WITH'), + new Token(Token::TOKEN_TYPE_WHITESPACE, ' '), + new Token(Token::TOKEN_TYPE_WORD, 'cte'), + new Token(Token::TOKEN_TYPE_WHITESPACE, ' '), + new Token(Token::TOKEN_TYPE_RESERVED, 'AS'), + ], + 'WITH cte AS', + ]; } public function testTokenizeLongConcat(): void