Skip to content

Commit 9426969

Browse files
committed
Implement "next_token()" & "get_token()" API
1 parent 7155b7b commit 9426969

File tree

2 files changed

+163
-53
lines changed

2 files changed

+163
-53
lines changed

tests/mysql/WP_MySQL_Lexer_Tests.php

Lines changed: 87 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -3,22 +3,75 @@
33
use PHPUnit\Framework\TestCase;
44

55
class WP_MySQL_Lexer_Tests extends TestCase {
6+
public function test_tokenize_valid_input(): void {
7+
$lexer = new WP_MySQL_Lexer( 'SELECT id FROM users' );
8+
9+
// SELECT
10+
$this->assertTrue( $lexer->next_token() );
11+
$this->assertSame( WP_MySQL_Lexer::SELECT_SYMBOL, $lexer->get_token()->get_type() );
12+
13+
// id
14+
$this->assertTrue( $lexer->next_token() );
15+
$this->assertSame( WP_MySQL_Lexer::IDENTIFIER, $lexer->get_token()->get_type() );
16+
17+
// FROM
18+
$this->assertTrue( $lexer->next_token() );
19+
$this->assertSame( WP_MySQL_Lexer::FROM_SYMBOL, $lexer->get_token()->get_type() );
20+
21+
// users
22+
$this->assertTrue( $lexer->next_token() );
23+
$this->assertSame( WP_MySQL_Lexer::IDENTIFIER, $lexer->get_token()->get_type() );
24+
25+
// EOF
26+
$this->assertTrue( $lexer->next_token() );
27+
$this->assertSame( WP_MySQL_Lexer::EOF, $lexer->get_token()->get_type() );
28+
29+
// No more tokens.
30+
$this->assertFalse( $lexer->next_token() );
31+
$this->assertNull( $lexer->get_token() );
32+
33+
// Again, no more tokens.
34+
$this->assertFalse( $lexer->next_token() );
35+
$this->assertNull( $lexer->get_token() );
36+
}
37+
38+
public function test_tokenize_invalid_input(): void {
39+
$lexer = new WP_MySQL_Lexer( "SELECT x'ab01xyz'" );
40+
41+
// SELECT
42+
$this->assertTrue( $lexer->next_token() );
43+
$this->assertSame( WP_MySQL_Lexer::SELECT_SYMBOL, $lexer->get_token()->get_type() );
44+
45+
// Invalid input.
46+
$this->assertFalse( $lexer->next_token() );
47+
$this->assertNull( $lexer->get_token() );
48+
49+
// No more tokens.
50+
$this->assertFalse( $lexer->next_token() );
51+
$this->assertNull( $lexer->get_token() );
52+
53+
// Again, no more tokens.
54+
$this->assertFalse( $lexer->next_token() );
55+
$this->assertNull( $lexer->get_token() );
56+
}
57+
658
/**
759
* Test that the whole U+0080 to U+FFFF UTF-8 range is valid in an identifier.
860
* The validity is checked against PCRE with the "u" (PCRE_UTF8) modifier set.
961
*/
1062
public function test_identifier_utf8_range(): void {
1163
for ( $i = 0x80; $i < 0xffff; $i += 1 ) {
12-
$value = mb_chr( $i, 'UTF-8' );
13-
$lexer = new WP_MySQL_Lexer( $value );
14-
$type = $lexer->next_token()->get_type();
64+
$value = mb_chr( $i, 'UTF-8' );
65+
66+
$lexer = new WP_MySQL_Lexer( $value );
67+
$this->assertTrue( $lexer->next_token() );
68+
69+
$type = $lexer->get_token()->get_type();
1570
$is_valid = preg_match( '/^[\x{0080}-\x{ffff}]$/u', $value );
1671
if ( $is_valid ) {
1772
$this->assertSame( WP_MySQL_Lexer::IDENTIFIER, $type );
18-
} elseif ( strlen( $value ) === 0 ) {
19-
$this->assertSame( WP_MySQL_Lexer::EOF, $type );
2073
} else {
21-
$this->assertSame( WP_MySQL_Lexer::INVALID_INPUT, $type );
74+
$this->assertSame( WP_MySQL_Lexer::EOF, $type );
2275
}
2376
}
2477
}
@@ -33,14 +86,19 @@ public function test_identifier_utf8_range(): void {
3386
public function test_identifier_utf8_two_byte_sequences(): void {
3487
for ( $byte_1 = 128; $byte_1 <= 255; $byte_1 += 1 ) {
3588
for ( $byte_2 = 128; $byte_2 <= 255; $byte_2 += 1 ) {
36-
$value = chr( $byte_1 ) . chr( $byte_2 );
89+
$value = chr( $byte_1 ) . chr( $byte_2 );
90+
91+
$lexer = new WP_MySQL_Lexer( $value );
92+
$result = $lexer->next_token();
93+
$token = $lexer->get_token();
94+
3795
$is_valid = preg_match( '/^[\x{0080}-\x{ffff}]$/u', $value );
38-
$lexer = new WP_MySQL_Lexer( $value );
39-
$type = $lexer->next_token()->get_type();
4096
if ( $is_valid ) {
41-
$this->assertSame( WP_MySQL_Lexer::IDENTIFIER, $type );
97+
$this->assertTrue( $result );
98+
$this->assertSame( WP_MySQL_Lexer::IDENTIFIER, $token->get_type() );
4299
} else {
43-
$this->assertSame( WP_MySQL_Lexer::INVALID_INPUT, $type );
100+
$this->assertFalse( $result );
101+
$this->assertNull( $token );
44102
}
45103
}
46104
}
@@ -58,14 +116,19 @@ public function test_identifier_utf8_three_byte_sequences(): void {
58116
for ( $byte_1 = 0xE0; $byte_1 <= 0xFF; $byte_1 += 1 ) {
59117
for ( $byte_2 = 128; $byte_2 <= 255; $byte_2 += 1 ) {
60118
for ( $byte_3 = 128; $byte_3 <= 255; $byte_3 += 1 ) {
61-
$value = chr( $byte_1 ) . chr( $byte_2 ) . chr( $byte_3 );
119+
$value = chr( $byte_1 ) . chr( $byte_2 ) . chr( $byte_3 );
120+
121+
$lexer = new WP_MySQL_Lexer( $value );
122+
$result = $lexer->next_token();
123+
$token = $lexer->get_token();
124+
62125
$is_valid = preg_match( '/^[\x{0080}-\x{ffff}]$/u', $value );
63-
$lexer = new WP_MySQL_Lexer( $value );
64-
$type = $lexer->next_token()->get_type();
65126
if ( $is_valid ) {
66-
$this->assertSame( WP_MySQL_Lexer::IDENTIFIER, $type );
127+
$this->assertTrue( $result );
128+
$this->assertSame( WP_MySQL_Lexer::IDENTIFIER, $token->get_type() );
67129
} else {
68-
$this->assertSame( WP_MySQL_Lexer::INVALID_INPUT, $type );
130+
$this->assertFalse( $result );
131+
$this->assertNull( $token );
69132
}
70133
}
71134
}
@@ -77,8 +140,8 @@ public function test_identifier_utf8_three_byte_sequences(): void {
77140
*/
78141
public function test_integer_types( $input, $expected ): void {
79142
$lexer = new WP_MySQL_Lexer( $input );
80-
$type = $lexer->next_token()->get_type();
81-
$this->assertSame( $expected, $type );
143+
$this->assertTrue( $lexer->next_token() );
144+
$this->assertSame( $expected, $lexer->get_token()->get_type() );
82145
}
83146

84147
public function data_integer_types(): array {
@@ -145,20 +208,20 @@ public function data_identifier_or_number(): array {
145208
array( '0b01xyz', array( WP_MySQL_Lexer::IDENTIFIER, WP_MySQL_Lexer::EOF ) ), // identifier
146209
array( '0b', array( WP_MySQL_Lexer::IDENTIFIER, WP_MySQL_Lexer::EOF ) ), // identifier
147210
array( "b'01'", array( WP_MySQL_Lexer::BIN_NUMBER, WP_MySQL_Lexer::EOF ) ),
148-
array( "b'01xyz'", array( WP_MySQL_Lexer::INVALID_INPUT, WP_MySQL_Lexer::IDENTIFIER, WP_MySQL_Lexer::INVALID_INPUT, WP_MySQL_Lexer::EOF ) ),
211+
array( "b'01xyz'", array() ), // invalid input
149212
array( "b''", array( WP_MySQL_Lexer::BIN_NUMBER, WP_MySQL_Lexer::EOF ) ),
150-
array( "b'", array( WP_MySQL_Lexer::INVALID_INPUT, WP_MySQL_Lexer::EOF ) ),
151-
array( "b'01", array( WP_MySQL_Lexer::INVALID_INPUT, WP_MySQL_Lexer::EOF ) ),
213+
array( "b'", array() ), // invalid input
214+
array( "b'01", array() ), // invalid input
152215

153216
// hex
154217
array( '0xab01', array( WP_MySQL_Lexer::HEX_NUMBER, WP_MySQL_Lexer::EOF ) ),
155218
array( '0xab01xyz', array( WP_MySQL_Lexer::IDENTIFIER, WP_MySQL_Lexer::EOF ) ), // identifier
156219
array( '0x', array( WP_MySQL_Lexer::IDENTIFIER, WP_MySQL_Lexer::EOF ) ), // identifier
157220
array( "x'ab01'", array( WP_MySQL_Lexer::HEX_NUMBER, WP_MySQL_Lexer::EOF ) ),
158-
array( "x'ab01xyz'", array( WP_MySQL_Lexer::INVALID_INPUT, WP_MySQL_Lexer::IDENTIFIER, WP_MySQL_Lexer::INVALID_INPUT, WP_MySQL_Lexer::EOF ) ),
221+
array( "x'ab01xyz'", array() ), // invalid input
159222
array( "x''", array( WP_MySQL_Lexer::HEX_NUMBER, WP_MySQL_Lexer::EOF ) ),
160-
array( "x'", array( WP_MySQL_Lexer::INVALID_INPUT, WP_MySQL_Lexer::EOF ) ),
161-
array( "x'ab", array( WP_MySQL_Lexer::INVALID_INPUT, WP_MySQL_Lexer::EOF ) ),
223+
array( "x'", array() ), // invalid input
224+
array( "x'ab", array() ), // invalid input
162225

163226
// decimal
164227
array( '123.456', array( WP_MySQL_Lexer::DECIMAL_NUMBER, WP_MySQL_Lexer::EOF ) ),

wp-includes/mysql/class-wp-mysql-lexer.php

Lines changed: 76 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -936,9 +936,8 @@ class WP_MySQL_Lexer {
936936
const MYSQL_COMMENT_END = 902;
937937

938938
// Special tokens
939-
const WHITESPACE = 0;
940-
const EOF = -1;
941-
const INVALID_INPUT = -2;
939+
const WHITESPACE = 0;
940+
const EOF = -1;
942941

943942
/**
944943
* A map of SQL keyword string values to their corresponding token types.
@@ -2151,6 +2150,17 @@ class WP_MySQL_Lexer {
21512150
*/
21522151
private $token_starts_at = 0;
21532152

2153+
/**
2154+
* The type of the current token.
2155+
*
2156+
* When a token is successfully recognized and read, this value is set to the
2157+
* constant representing the token type. When no token was read yet, or the
2158+
* end of the SQL payload or an invalid token is reached, this value is null.
2159+
*
2160+
* @var int|null
2161+
*/
2162+
private $token_type;
2163+
21542164
/**
21552165
* Whether the tokenizer is inside an active MySQL-specific comment.
21562166
*
@@ -2184,22 +2194,56 @@ public function __construct(
21842194
*
21852195
* This method reads bytes from the SQL payload until a token is recognized.
21862196
* It starts from "$this->sql[ $this->bytes_already_read ]", advances the
2187-
* number of bytes read, and returns a WP_MySQL_Token object. When the end of
2188-
* the SQL payload is reached, the method always returns an EOF token.
2197+
* number of bytes read, and returns a boolean indicating whether a token
2198+
* was successfully recognized and read. When the end of the SQL payload
2199+
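* or an invalid token is reached, the method returns false. Note that the EOF token itself is read successfully (the method returns true for it); false is returned on the call after EOF.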
21892200
*
2190-
* @return WP_MySQL_Token A token object representing the next recognized token.
2201+
* @return bool Whether a token was successfully recognized and read.
21912202
*/
2192-
public function next_token(): WP_MySQL_Token {
2203+
public function next_token(): bool {
2204+
// We already reached the end of the SQL payload or an invalid token.
2205+
// Don't attempt to read any more bytes, and bail out immediately.
2206+
if (
2207+
self::EOF === $this->token_type
2208+
|| ( null === $this->token_type && $this->bytes_already_read > 0 )
2209+
) {
2210+
$this->token_type = null;
2211+
return false;
2212+
}
2213+
21932214
do {
21942215
$this->token_starts_at = $this->bytes_already_read;
2195-
$type = $this->read_next_token();
2216+
$this->token_type = $this->read_next_token();
21962217
} while (
2197-
self::WHITESPACE === $type
2198-
|| self::COMMENT === $type
2199-
|| self::MYSQL_COMMENT_START === $type
2200-
|| self::MYSQL_COMMENT_END === $type
2218+
self::WHITESPACE === $this->token_type
2219+
|| self::COMMENT === $this->token_type
2220+
|| self::MYSQL_COMMENT_START === $this->token_type
2221+
|| self::MYSQL_COMMENT_END === $this->token_type
22012222
);
2202-
return new WP_MySQL_Token( $type, $this->get_current_token_bytes() );
2223+
2224+
// Invalid input.
2225+
if ( null === $this->token_type ) {
2226+
return false;
2227+
}
2228+
return true;
2229+
}
2230+
2231+
/**
2232+
* Return the current token represented as a WP_MySQL_Token object.
2233+
*
2234+
* When no token was read yet, or the end of the SQL payload or an invalid
2235+
* token is reached, the method returns null.
2236+
*
2237+
* @TODO: Consider referential stability ($lexer->get_token() === $lexer->get_token()),
2238+
* or separate getters for the token type and token bytes (no token objects).
2239+
*
2240+
* @return WP_MySQL_Token|null An object representing the current token, or null when there is no current token.
2241+
*/
2242+
public function get_token(): ?WP_MySQL_Token {
2243+
if ( null === $this->token_type ) {
2244+
return null;
2245+
}
2246+
return new WP_MySQL_Token( $this->token_type, $this->get_current_token_bytes() );
22032247
}
22042248

22052249
/**
@@ -2209,17 +2253,20 @@ public function next_token(): WP_MySQL_Token {
22092253
* by "$this->sql[ $this->bytes_already_read ]", and reads all tokens until
22102254
* the end of the SQL payload is reached, returning an array of token objects.
22112255
*
2212-
* It can be used to tokenize the whole SQL payload at once, at the expense of
2213-
* storing all token objects in memory at the same time.
2256+
* When an invalid token is reached, the method stops and returns the partial
2257+
* sequence of valid tokens. In this case, the EOF token will not be included.
2258+
*
2259+
* This method can be used to tokenize the whole SQL payload at once, at the
2260+
* expense of storing all token objects in memory at the same time.
22142261
*
22152262
* @return WP_MySQL_Token[] An array of token objects representing the remaining tokens.
22162263
*/
22172264
public function remaining_tokens(): array {
22182265
$tokens = array();
2219-
do {
2220-
$token = $this->next_token();
2266+
while ( true === $this->next_token() ) {
2267+
$token = $this->get_token();
22212268
$tokens[] = $token;
2222-
} while ( WP_MySQL_Lexer::EOF !== $token->type );
2269+
}
22232270
return $tokens;
22242271
}
22252272

@@ -2281,7 +2328,7 @@ public static function get_token_name( int $token_id ): ?string {
22812328
return $token_name ? $token_name : null;
22822329
}
22832330

2284-
private function read_next_token(): int {
2331+
private function read_next_token(): ?int {
22852332
$byte = $this->sql[ $this->bytes_already_read ] ?? null;
22862333
$next_byte = $this->sql[ $this->bytes_already_read + 1 ] ?? null;
22872334

@@ -2362,13 +2409,13 @@ private function read_next_token(): int {
23622409
if ( $this->mysql_version >= 50713 ) {
23632410
$type = self::JSON_UNQUOTED_SEPARATOR_SYMBOL;
23642411
} else {
2365-
$type = self::INVALID_INPUT;
2412+
return null; // Invalid input.
23662413
}
23672414
} else {
23682415
if ( $this->mysql_version >= 50708 ) {
23692416
$type = self::JSON_SEPARATOR_SYMBOL;
23702417
} else {
2371-
$type = self::INVALID_INPUT;
2418+
return null; // Invalid input.
23722419
}
23732420
}
23742421
} else {
@@ -2474,7 +2521,7 @@ private function read_next_token(): int {
24742521
$this->bytes_already_read += 1; // Consume the 'N'.
24752522
$type = self::NULL2_SYMBOL;
24762523
} else {
2477-
$type = self::INVALID_INPUT;
2524+
return null; // Invalid input.
24782525
}
24792526
} elseif ( '#' === $byte ) {
24802527
$type = $this->read_line_comment();
@@ -2531,7 +2578,7 @@ private function get_current_token_bytes(): string {
25312578
* See:
25322579
* https://dev.mysql.com/doc/refman/8.4/en/identifiers.html
25332580
*/
2534-
private function read_identifier(): int {
2581+
private function read_identifier(): ?int {
25352582
$started_at = $this->bytes_already_read;
25362583
while ( true ) {
25372584
// First, let's try to parse an ASCII sequence.
@@ -2590,10 +2637,10 @@ private function read_identifier(): int {
25902637

25912638
return $this->bytes_already_read - $started_at > 0
25922639
? self::IDENTIFIER
2593-
: self::INVALID_INPUT;
2640+
: null; // Invalid input.
25942641
}
25952642

2596-
private function read_number(): int {
2643+
private function read_number(): ?int {
25972644
// @TODO: Support numeric-only identifier parts after "." (e.g., 1ea10.1).
25982645

25992646
$byte = $this->sql[ $this->bytes_already_read ] ?? null;
@@ -2619,7 +2666,7 @@ private function read_number(): int {
26192666
$this->bytes_already_read >= strlen( $this->sql )
26202667
|| "'" !== $this->sql[ $this->bytes_already_read ]
26212668
) {
2622-
return self::INVALID_INPUT;
2669+
return null; // Invalid input.
26232670
}
26242671
$this->bytes_already_read += 1; // Consume the "'".
26252672
}
@@ -2642,7 +2689,7 @@ private function read_number(): int {
26422689
$this->bytes_already_read >= strlen( $this->sql )
26432690
|| "'" !== $this->sql[ $this->bytes_already_read ]
26442691
) {
2645-
return self::INVALID_INPUT;
2692+
return null; // Invalid input.
26462693
}
26472694
$this->bytes_already_read += 1; // Consume the "'".
26482695
}
@@ -2759,7 +2806,7 @@ private function read_number(): int {
27592806
*
27602807
* The quote character - ', ", or ` - is read from the current position in the SQL payload (the method takes no parameters).
27612808
*/
2762-
private function read_quoted_text(): int {
2809+
private function read_quoted_text(): ?int {
27632810
$quote = $this->sql[ $this->bytes_already_read ];
27642811
$this->bytes_already_read += 1; // Consume the quote.
27652812

@@ -2792,7 +2839,7 @@ private function read_quoted_text(): int {
27922839

27932840
// Unclosed string - unexpected EOF.
27942841
if ( ( $this->sql[ $at ] ?? null ) !== $quote ) {
2795-
return self::INVALID_INPUT;
2842+
return null; // Invalid input.
27962843
}
27972844

27982845
// Check if the quote is doubled.

0 commit comments

Comments
 (0)