Commit 07ed839

Introduce CST infrastructure based on rowan
This adds the common code to allow building a lossless syntax tree based on the `rowan` crate, in parallel with our current AST. The motivation is discussed in the README.

The CST is a flat list of tokens by default; structure can be added incrementally in future commits.

Two changes are split into their own commits and should be squashed with this one once the GreenNodeBuilder change is upstreamed:

- Fix `TBD: rowan's builder does not allow reverting to a checkpoint`
- Hide rowan behind a non-default `cst` feature

Other tasks to be done later, marked with `TBD` in the code:

- Fix the `Token`/`SyntaxKind` duplication, changing the former to store a slice of the original source code, e.g. `(SyntaxKind, SmolStr)`. This should also fix the currently disabled test cases where `Token`'s `to_string()` does not return the original string:
  * `parse_escaped_single_quote_string_predicate`
  * `parse_literal_string` (the case of `HexLiteralString`)
- Fix the hack in `parse_keyword()` to remap the token type (rust-analyzer has `bump_remap()` for this)
- Fix the `Parser::pending` hack (this needs a rethink of the parser's API)

Probably related is the issue of handling whitespace and comments: the way this prototype handles them looks wrong.
1 parent e088c8a commit 07ed839
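
To make the "lossless" property concrete, here is a minimal sketch (an editor's illustration against the APIs added in this commit, not code from the commit; it assumes `GenericDialect` and a well-behaved input, since `Token::to_string()` is known not to round-trip in the cases listed above):

```rust
use sqlparser::dialect::GenericDialect;
use sqlparser::parser::Parser;
use sqlparser::tokenizer::Tokenizer;

fn main() {
    let sql = "SELECT  a, b   FROM t"; // irregular whitespace on purpose
    let tokens = Tokenizer::new(&GenericDialect {}, sql).tokenize().unwrap();
    let mut parser = Parser::new(tokens);
    // Parsing produces the AST result and, as a side effect, builds the CST.
    let _ast = parser.parse_statements();
    // `syntax()` consumes the parser and returns the CST root; its text
    // reproduces the original input exactly, whitespace included.
    assert_eq!(parser.syntax().text().to_string(), sql);
}
```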

File tree

8 files changed: +290 -10 lines

Cargo.toml

Lines changed: 1 addition & 0 deletions

@@ -21,6 +21,7 @@ path = "src/lib.rs"
 [dependencies]
 bigdecimal = { version = "0.1.0", optional = true }
 log = "0.4.5"
+rowan = "0.10.0"

 [dev-dependencies]
 simple_logger = "1.0.1"

examples/cli.rs

Lines changed: 14 additions & 3 deletions

@@ -17,7 +17,7 @@
 use std::fs;

 use sqlparser::dialect::*;
-use sqlparser::parser::Parser;
+use sqlparser::{parser::Parser, tokenizer::Tokenizer};

 fn main() {
     simple_logger::init().unwrap();
@@ -45,7 +45,17 @@ fn main() {
         chars.next();
         chars.as_str()
     };
-    let parse_result = Parser::parse_sql(&*dialect, without_bom);
+
+    let tokens = Tokenizer::new(&*dialect, without_bom)
+        .tokenize()
+        .unwrap_or_else(|e| {
+            println!("Error tokenizing: {:?}", e);
+            std::process::exit(1);
+        });
+
+    let mut parser = Parser::new(tokens);
+    let parse_result = parser.parse_statements();
+
     match parse_result {
         Ok(statements) => {
             println!(
@@ -57,7 +67,8 @@ fn main() {
                 .join("\n")
             );
             println!("Parse results:\n{:#?}", statements);
-            std::process::exit(0);
+
+            println!("Parse tree:\n{:#?}", parser.syntax());
         }
         Err(e) => {
             println!("Error during parsing: {:?}", e);

src/cst.rs

Lines changed: 142 additions & 0 deletions

@@ -0,0 +1,142 @@
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! A lossless syntax tree that preserves the full fidelity of the source text,
+//! including whitespace.
+//!
+//! We refer to it as a CST, short for Concrete Syntax Tree, in order to
+//! contrast it with an AST, although it's not a grammar-based parse tree.
+//!
+//! The design is based on rust-analyzer's
+//! https://github.com/rust-analyzer/rust-analyzer/blob/2020-04-27/docs/dev/syntax.md
+//! The RA folks generously made their syntax tree implementation available as the
+//! `rowan` crate, which we re-use.
+use crate::tokenizer::Token;
+
+/// Each node of the CST has a "kind", a variant from this enum representing
+/// its type.
+///
+/// There are separate kinds for leaf nodes (tokens, such as "whitespace"
+/// or "number") and non-leafs, such as "expression", but they are not
+/// distinguished at the type level.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
+#[allow(non_camel_case_types)]
+#[repr(u16)]
+#[rustfmt::skip]
+pub enum SyntaxKind {
+    // First, the variants from the `Token` enum.
+
+    // Note: we rely on the token "kinds" having the same internal `u16`
+    // representation as the `Token` enum, meaning the number and the
+    // order of these must match the `Token` enum exactly.
+
+    // TBD: change `Token` to be a `(SyntaxKind, SmolStr)`.
+
+    /// A keyword (like SELECT) or an optionally quoted SQL identifier
+    Word = 0,
+    /// An unsigned numeric literal
+    Number,
+    /// A character that could not be tokenized
+    Char,
+    /// Single quoted string: i.e: 'string'
+    SingleQuotedString,
+    /// "National" string literal: i.e: N'string'
+    NationalStringLiteral,
+    /// Hexadecimal string literal: i.e.: X'deadbeef'
+    HexStringLiteral,
+    /// Comma
+    Comma,
+    /// Whitespace (space, tab, etc)
+    Whitespace,
+    /// Equality operator `=`
+    Eq,
+    /// Not Equals operator `<>` (or `!=` in some dialects)
+    Neq,
+    /// Less Than operator `<`
+    Lt,
+    /// Greater Than operator `>`
+    Gt,
+    /// Less Than Or Equals operator `<=`
+    LtEq,
+    /// Greater Than Or Equals operator `>=`
+    GtEq,
+    /// Plus operator `+`
+    Plus,
+    /// Minus operator `-`
+    Minus,
+    /// Multiplication operator `*`
+    Mult,
+    /// Division operator `/`
+    Div,
+    /// Modulo Operator `%`
+    Mod,
+    /// Left parenthesis `(`
+    LParen,
+    /// Right parenthesis `)`
+    RParen,
+    /// Period (used for compound identifiers or projections into nested types)
+    Period,
+    /// Colon `:`
+    Colon,
+    /// DoubleColon `::` (used for casting in postgresql)
+    DoubleColon,
+    /// SemiColon `;` used as separator for COPY and payload
+    SemiColon,
+    /// Backslash `\` used in terminating the COPY payload with `\.`
+    Backslash,
+    /// Left bracket `[`
+    LBracket,
+    /// Right bracket `]`
+    RBracket,
+    /// Ampersand &
+    Ampersand,
+    /// Left brace `{`
+    LBrace,
+    /// Right brace `}`
+    RBrace,
+
+    // Other kinds representing non-leaf nodes in the syntax tree.
+    ROOT,
+    ERR,
+    KW,
+
+    // Sentinel value
+    LAST
+}
+
+impl Token {
+    pub fn kind(&self) -> SyntaxKind {
+        // From https://github.com/rust-lang/rfcs/blob/master/text/2363-arbitrary-enum-discriminant.md
+        unsafe { *(self as *const Self as *const SyntaxKind) }
+    }
+}
+
+impl From<SyntaxKind> for rowan::SyntaxKind {
+    fn from(kind: SyntaxKind) -> Self {
+        Self(kind as u16)
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub enum Lang {}
+impl rowan::Language for Lang {
+    type Kind = SyntaxKind;
+    fn kind_from_raw(raw: rowan::SyntaxKind) -> Self::Kind {
+        assert!(raw.0 <= SyntaxKind::LAST as u16);
+        unsafe { std::mem::transmute::<u16, SyntaxKind>(raw.0) }
+    }
+    fn kind_to_raw(kind: Self::Kind) -> rowan::SyntaxKind {
+        kind.into()
+    }
+}
+
+pub type SyntaxNode = rowan::SyntaxNode<Lang>;
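
Since `Token::kind()` reads the enum discriminant through an unsafe pointer cast, the "number and order must match" invariant above is load-bearing. Here is a sketch of a test that could guard it (an editor's illustration, not part of this commit):

```rust
#[cfg(test)]
mod tests {
    use crate::cst::SyntaxKind;
    use crate::tokenizer::{Token, Whitespace};

    #[test]
    fn token_kinds_match_syntax_kinds() {
        // Spot-check a few unit and payload-carrying variants; reordering
        // either enum should trip at least one of these assertions.
        assert_eq!(Token::Comma.kind(), SyntaxKind::Comma);
        assert_eq!(Token::Whitespace(Whitespace::Space).kind(), SyntaxKind::Whitespace);
        assert_eq!(Token::LParen.kind(), SyntaxKind::LParen);
        assert_eq!(Token::RBrace.kind(), SyntaxKind::RBrace);
    }
}
```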

src/lib.rs

Lines changed: 1 addition & 0 deletions

@@ -35,6 +35,7 @@
 #![warn(clippy::all)]

 pub mod ast;
+pub mod cst;
 pub mod dialect;
 pub mod parser;
 pub mod tokenizer;

src/parser.rs

Lines changed: 116 additions & 5 deletions

@@ -21,6 +21,8 @@ use super::tokenizer::*;
 use std::error::Error;
 use std::fmt;

+use crate::{cst, cst::SyntaxKind as SK};
+
 #[derive(Debug, Clone, PartialEq)]
 pub enum ParserError {
     TokenizerError(String),
@@ -38,6 +40,7 @@ macro_rules! parser_err {
 pub struct Marker {
     /// position in the token stream (`parser.index`)
     index: usize,
+    builder_checkpoint: rowan::Checkpoint,
 }

 #[derive(PartialEq)]
@@ -79,12 +82,63 @@ pub struct Parser {
     tokens: Vec<Token>,
     /// The index of the first unprocessed token in `self.tokens`
     index: usize,
+    builder: rowan::GreenNodeBuilder<'static>,
+
+    // TBD: the parser currently provides an API to move around the token
+    // stream without restrictions (`next_token`/`prev_token`), while the
+    // `builder` does not. To work around this, we keep a list of "pending"
+    // tokens which have already been processed via `next_token`, but may
+    // be put back via `prev_token`.
+    pending: Vec<(cst::SyntaxKind, rowan::SmolStr)>,
+}
+
+/// `ret!(expr => via self.complete(m, SK::FOO, ..))` runs the following steps:
+/// 1) Evaluates `expr`, possibly advancing the parser's position in the token stream;
+/// 2) Closes the current branch of the CST, identified by `m`, and sets its kind to
+///    `SyntaxKind::FOO`;
+/// 3) Returns the value of `expr`, which should be the typed AST node corresponding
+///    to the recently closed branch.
+/// The weird syntax prevents rustfmt from making each call to this macro take up 5 lines.
+macro_rules! ret {
+    { $e: expr => via $self: ident .complete($m: ident, $syntax_kind: expr, ..) } => {
+        {
+            let rv = $e;
+            $self.complete($m, $syntax_kind, rv)
+        }
+    };
 }

 impl Parser {
     /// Parse the specified tokens
     pub fn new(tokens: Vec<Token>) -> Self {
-        Parser { tokens, index: 0 }
+        let mut parser = Parser {
+            tokens,
+            index: 0,
+            builder: rowan::GreenNodeBuilder::new(),
+            pending: vec![],
+        };
+        parser.builder.start_node(SK::ROOT.into());
+        parser
+    }
+
+    pub fn syntax(mut self) -> cst::SyntaxNode {
+        if self.peek_token().is_some() {
+            // Not at end-of-file: either some extraneous tokens are left after
+            // successfully parsing something, or we've bailed with an error.
+            //
+            // TBD: ideally we wouldn't abandon the "current" branch of the
+            // CST on error, instead `.complete()`-ing it as usual, but that's
+            // in conflict with having to return the typed AST, which can't
+            // lack certain bits.
+            self.builder.start_node(SK::ERR.into());
+            while self.next_token().is_some() {}
+            self.builder.finish_node();
+        } else {
+            // TBD: this is required until all parser methods end with `ret!`.
+            self.flush_pending_buffer();
+        }
+        self.builder.finish_node();
+        cst::SyntaxNode::new_root(self.builder.finish())
     }

     /// Parse a SQL statement and produce an Abstract Syntax Tree (AST)
@@ -749,11 +803,44 @@ impl Parser {
     }

     pub fn start(&mut self) -> Marker {
-        Marker { index: self.index }
+        self.flush_pending_buffer();
+        Marker {
+            index: self.index,
+            builder_checkpoint: self.builder.checkpoint(),
+        }
+    }
+
+    fn start_if<F>(&mut self, f: F) -> Option<Marker>
+    where
+        F: FnOnce(&mut Parser) -> bool,
+    {
+        self.flush_pending_buffer();
+        let m = self.start();
+        if f(self) {
+            Some(m)
+        } else {
+            None
+        }
     }

     pub fn reset(&mut self, m: Marker) {
         self.index = m.index;
+        self.pending.truncate(0);
+        // TBD: rowan's builder does not allow reverting to a checkpoint
+    }
+
+    pub fn complete<T>(&mut self, m: Marker, kind: cst::SyntaxKind, rv: T) -> T {
+        self.flush_pending_buffer();
+        self.builder
+            .start_node_at(m.builder_checkpoint, kind.into());
+        self.builder.finish_node();
+        rv
+    }
+
+    pub fn flush_pending_buffer(&mut self) {
+        for (kind, s) in self.pending.drain(..) {
+            self.builder.token(kind.into(), s);
+        }
     }

     /// Return the first non-whitespace token that has not yet been processed
@@ -783,9 +870,10 @@ impl Parser {
     /// (or None if reached end-of-file) and mark it as processed. OK to call
     /// repeatedly after reaching EOF.
     pub fn next_token(&mut self) -> Option<Token> {
+        self.flush_pending_buffer();
+
         loop {
-            self.index += 1;
-            match self.tokens.get(self.index - 1) {
+            match self.next_token_no_skip() {
                 Some(Token::Whitespace(_)) => continue,
                 token => return token.cloned(),
             }
@@ -795,7 +883,12 @@ impl Parser {
     /// Return the first unprocessed token, possibly whitespace.
     pub fn next_token_no_skip(&mut self) -> Option<&Token> {
         self.index += 1;
-        self.tokens.get(self.index - 1)
+        #[allow(clippy::let_and_return)]
+        let token = self.tokens.get(self.index - 1);
+        if let Some(t) = token {
+            self.pending.push((t.kind(), t.to_string().into()));
+        }
+        token
     }

     /// Push back the last non-whitespace token. Must be called after
@@ -805,9 +898,23 @@ impl Parser {
         loop {
             assert!(self.index > 0);
             self.index -= 1;
+
+            if !self.pending.is_empty() {
+                self.pending.pop();
+            } else {
+                assert!(self.index >= self.tokens.len()); // past EOF
+            }
+
             if let Some(Token::Whitespace(_)) = self.tokens.get(self.index) {
                 continue;
             }
+
+            // There may be only one non-whitespace token in `pending`, as by
+            // convention backtracking (i.e. going more than one token back)
+            // is done via `start`/`reset` instead.
+            for tok in &self.pending {
+                assert!(tok.0 == SK::Whitespace);
+            }
             return;
         }
     }
@@ -832,6 +939,10 @@ impl Parser {
         match self.peek_token() {
             Some(Token::Word(ref k)) if expected.eq_ignore_ascii_case(&k.keyword) => {
                 self.next_token();
+                // TBD: a hack to change the "kind" of the token just processed
+                let mut p = self.pending.pop().unwrap();
+                p.0 = SK::KW.into();
+                self.pending.push(p);
                 true
             }
             _ => false,
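
Tying the pieces together: `start()` flushes pending tokens and records a rowan checkpoint, the parse method consumes tokens (which accumulate in `pending` and get flushed into the builder), and `complete()` wraps everything consumed since the checkpoint into one labeled node. A hypothetical parser method might look like this (an editor's sketch, not code from the commit: `SK::ERR` stands in for a future non-leaf kind, since only `ROOT`/`ERR`/`KW` exist so far, and `expect_token`/`parse_expr` stand in for the parser's existing helpers):

```rust
fn parse_parenthesized_expr(&mut self) -> Result<Expr, ParserError> {
    // Record a checkpoint; every token consumed from here on will end up
    // inside the node closed by `complete`.
    let m = self.start();
    self.expect_token(&Token::LParen)?;
    let expr = self.parse_expr()?;
    self.expect_token(&Token::RParen)?;
    // Evaluate, close the branch opened at `m` with the given kind, and
    // return the typed AST value.
    Ok(ret!(expr => via self.complete(m, SK::ERR, ..)))
}
```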
