Skip to content

Commit 8bc63f0

Browse files
authored
Correctly tokenize nested comments (apache#1629)
1 parent 94ea206 commit 8bc63f0

File tree

4 files changed

+108
-17
lines changed

4 files changed

+108
-17
lines changed

src/dialect/generic.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,4 +131,8 @@ impl Dialect for GenericDialect {
131131
fn supports_empty_projections(&self) -> bool {
132132
true
133133
}
134+
135+
/// Opts the generic dialect into nested multi-line comments, e.g. `/* /* nested */ */`.
/// The tokenizer only counts an inner `/*` as a new nesting level when this returns `true`.
fn supports_nested_comments(&self) -> bool {
136+
true
137+
}
134138
}

src/dialect/mod.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -682,6 +682,12 @@ pub trait Dialect: Debug + Any {
682682
false
683683
}
684684

685+
/// Returns true if the dialect supports nested multi-line comments,
686+
/// e.g. `/* /* nested */ */`.
///
/// Defaults to `false`. In that case the tokenizer ends a multi-line comment at the
/// first `*/` and keeps any inner `/*` as ordinary comment text.
687+
fn supports_nested_comments(&self) -> bool {
688+
false
689+
}
690+
685691
/// Returns true if this dialect supports treating the equals operator `=` within a `SelectItem`
686692
/// as an alias assignment operator, rather than a boolean expression.
687693
/// For example: the following statements are equivalent for such a dialect:

src/dialect/postgresql.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -241,6 +241,10 @@ impl Dialect for PostgreSqlDialect {
241241
fn supports_empty_projections(&self) -> bool {
242242
true
243243
}
244+
245+
/// Enables nested multi-line comments for the PostgreSQL dialect, so the tokenizer
/// tracks nesting depth and `/* /* nested */ */` is read as a single comment.
fn supports_nested_comments(&self) -> bool {
246+
true
247+
}
244248
}
245249

246250
pub fn parse_create(parser: &mut Parser) -> Option<Result<Statement, ParserError>> {

src/tokenizer.rs

Lines changed: 94 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1855,28 +1855,33 @@ impl<'a> Tokenizer<'a> {
18551855
) -> Result<Option<Token>, TokenizerError> {
18561856
let mut s = String::new();
18571857
let mut nested = 1;
1858-
let mut last_ch = ' ';
1858+
let supports_nested_comments = self.dialect.supports_nested_comments();
18591859

18601860
loop {
18611861
match chars.next() {
1862-
Some(ch) => {
1863-
if last_ch == '/' && ch == '*' {
1864-
nested += 1;
1865-
} else if last_ch == '*' && ch == '/' {
1866-
nested -= 1;
1867-
if nested == 0 {
1868-
s.pop();
1869-
break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
1870-
}
1862+
Some('/') if matches!(chars.peek(), Some('*')) && supports_nested_comments => {
1863+
chars.next(); // consume the '*'
1864+
s.push('/');
1865+
s.push('*');
1866+
nested += 1;
1867+
}
1868+
Some('*') if matches!(chars.peek(), Some('/')) => {
1869+
chars.next(); // consume the '/'
1870+
nested -= 1;
1871+
if nested == 0 {
1872+
break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
18711873
}
1874+
s.push('*');
1875+
s.push('/');
1876+
}
1877+
Some(ch) => {
18721878
s.push(ch);
1873-
last_ch = ch;
18741879
}
18751880
None => {
18761881
break self.tokenizer_error(
18771882
chars.location(),
18781883
"Unexpected EOF while in a multi-line comment",
1879-
)
1884+
);
18801885
}
18811886
}
18821887
}
@@ -2718,18 +2723,90 @@ mod tests {
27182723

27192724
#[test]
27202725
fn tokenize_nested_multiline_comment() {
2721-
let sql = String::from("0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1");
2726+
let dialect = GenericDialect {};
2727+
let test_cases = vec![
2728+
(
2729+
"0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1",
2730+
vec![
2731+
Token::Number("0".to_string(), false),
2732+
Token::Whitespace(Whitespace::MultiLineComment(
2733+
"multi-line\n* \n/* comment \n /*comment*/*/ ".into(),
2734+
)),
2735+
Token::Whitespace(Whitespace::Space),
2736+
Token::Div,
2737+
Token::Word(Word {
2738+
value: "comment".to_string(),
2739+
quote_style: None,
2740+
keyword: Keyword::COMMENT,
2741+
}),
2742+
Token::Mul,
2743+
Token::Div,
2744+
Token::Number("1".to_string(), false),
2745+
],
2746+
),
2747+
(
2748+
"0/*multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/*/1",
2749+
vec![
2750+
Token::Number("0".to_string(), false),
2751+
Token::Whitespace(Whitespace::MultiLineComment(
2752+
"multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/".into(),
2753+
)),
2754+
Token::Number("1".to_string(), false),
2755+
],
2756+
),
2757+
(
2758+
"SELECT 1/* a /* b */ c */0",
2759+
vec![
2760+
Token::make_keyword("SELECT"),
2761+
Token::Whitespace(Whitespace::Space),
2762+
Token::Number("1".to_string(), false),
2763+
Token::Whitespace(Whitespace::MultiLineComment(" a /* b */ c ".to_string())),
2764+
Token::Number("0".to_string(), false),
2765+
],
2766+
),
2767+
];
2768+
2769+
for (sql, expected) in test_cases {
2770+
let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
2771+
compare(expected, tokens);
2772+
}
2773+
}
2774+
2775+
#[test]
2776+
fn tokenize_nested_multiline_comment_empty() {
2777+
let sql = "select 1/*/**/*/0";
27222778

27232779
let dialect = GenericDialect {};
2724-
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2780+
let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
27252781
let expected = vec![
2782+
Token::make_keyword("select"),
2783+
Token::Whitespace(Whitespace::Space),
2784+
Token::Number("1".to_string(), false),
2785+
Token::Whitespace(Whitespace::MultiLineComment("/**/".to_string())),
27262786
Token::Number("0".to_string(), false),
2787+
];
2788+
2789+
compare(expected, tokens);
2790+
}
2791+
2792+
#[test]
2793+
fn tokenize_nested_comments_if_not_supported() {
2794+
let dialect = SQLiteDialect {};
2795+
let sql = "SELECT 1/*/* nested comment */*/0";
2796+
let tokens = Tokenizer::new(&dialect, sql).tokenize();
2797+
let expected = vec![
2798+
Token::make_keyword("SELECT"),
2799+
Token::Whitespace(Whitespace::Space),
2800+
Token::Number("1".to_string(), false),
27272801
Token::Whitespace(Whitespace::MultiLineComment(
2728-
"multi-line\n* \n/* comment \n /*comment*/*/ */ /comment".to_string(),
2802+
"/* nested comment ".to_string(),
27292803
)),
2730-
Token::Number("1".to_string(), false),
2804+
Token::Mul,
2805+
Token::Div,
2806+
Token::Number("0".to_string(), false),
27312807
];
2732-
compare(expected, tokens);
2808+
2809+
compare(expected, tokens.unwrap());
27332810
}
27342811

27352812
#[test]

0 commit comments

Comments
 (0)