Commit 07ed839

Introduce CST infrastructure based on rowan
This adds the common code to allow building a lossless syntax tree based on the `rowan` crate, in parallel with our current AST. The motivation is discussed in the README.

The CST is a flat list of tokens by default; structure can be added incrementally in future commits.

Two changes are split into their own commits and should be squashed with this one once the GreenNodeBuilder change is upstreamed:

- Fix `TBD: rowan's builder does not allow reverting to a checkpoint`
- Hide rowan behind a non-default `cst` feature

Other tasks to be done later, marked with `TBD` in the code:

- Fix the `Token`/`SyntaxKind` duplication, changing the former to store a slice of the original source code, e.g. `(SyntaxKind, SmolStr)`. This should also fix the currently disabled test cases where `Token`'s `to_string()` does not return the original string:
  * `parse_escaped_single_quote_string_predicate`
  * `parse_literal_string` (the case of `HexLiteralString`)
- Fix the hack in `parse_keyword()` to remap the token type (rust-analyzer has `bump_remap()` for this)
- Fix the `Parser::pending` hack (this needs a rethink of the parser's API)

Probably related is the issue of handling whitespace and comments: the way this prototype handles them looks wrong.
1 parent e088c8a commit 07ed839
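
To make the "lossless" property concrete, here is a minimal sketch (an editor's illustration against the APIs added in this commit, not code from the commit; it assumes `GenericDialect` and a well-behaved input, since `Token::to_string()` is known not to round-trip in the cases listed above):

```rust
use sqlparser::dialect::GenericDialect;
use sqlparser::parser::Parser;
use sqlparser::tokenizer::Tokenizer;

fn main() {
    let sql = "SELECT  a, b   FROM t"; // irregular whitespace on purpose
    let tokens = Tokenizer::new(&GenericDialect {}, sql).tokenize().unwrap();
    let mut parser = Parser::new(tokens);
    // Parsing produces the AST result and, as a side effect, builds the CST.
    let _ast = parser.parse_statements();
    // `syntax()` consumes the parser and returns the CST root; its text
    // reproduces the original input exactly, whitespace included.
    assert_eq!(parser.syntax().text().to_string(), sql);
}
```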

File tree

8 files changed: +290 -10 lines

Cargo.toml

Lines changed: 1 addition & 0 deletions

@@ -21,6 +21,7 @@ path = "src/lib.rs"
 [dependencies]
 bigdecimal = { version = "0.1.0", optional = true }
 log = "0.4.5"
+rowan = "0.10.0"

 [dev-dependencies]
 simple_logger = "1.0.1"

examples/cli.rs

Lines changed: 14 additions & 3 deletions

@@ -17,7 +17,7 @@
 use std::fs;

 use sqlparser::dialect::*;
-use sqlparser::parser::Parser;
+use sqlparser::{parser::Parser, tokenizer::Tokenizer};

 fn main() {
     simple_logger::init().unwrap();
@@ -45,7 +45,17 @@ fn main() {
         chars.next();
         chars.as_str()
     };
-    let parse_result = Parser::parse_sql(&*dialect, without_bom);
+
+    let tokens = Tokenizer::new(&*dialect, without_bom)
+        .tokenize()
+        .unwrap_or_else(|e| {
+            println!("Error tokenizing: {:?}", e);
+            std::process::exit(1);
+        });
+
+    let mut parser = Parser::new(tokens);
+    let parse_result = parser.parse_statements();
+
     match parse_result {
         Ok(statements) => {
             println!(
@@ -57,7 +67,8 @@ fn main() {
                 .join("\n")
             );
             println!("Parse results:\n{:#?}", statements);
-            std::process::exit(0);
+
+            println!("Parse tree:\n{:#?}", parser.syntax());
         }
         Err(e) => {
             println!("Error during parsing: {:?}", e);

src/cst.rs

Lines changed: 142 additions & 0 deletions

@@ -0,0 +1,142 @@
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! A lossless syntax tree that preserves the full fidelity of the source text,
+//! including whitespace.
+//!
+//! We refer to it as a CST, short for Concrete Syntax Tree, in order to
+//! contrast it with an AST, although it's not a grammar-based parse tree.
+//!
+//! The design is based on rust-analyzer's
+//! https://github.com/rust-analyzer/rust-analyzer/blob/2020-04-27/docs/dev/syntax.md
+//! The RA folks generously made their syntax tree implementation available as the
+//! `rowan` crate, which we re-use.
+use crate::tokenizer::Token;
+
+/// Each node of the CST has a "kind", a variant from this enum representing
+/// its type.
+///
+/// There are separate kinds for leaf nodes (tokens, such as "whitespace"
+/// or "number") and non-leafs, such as "expression", but they are not
+/// distinguished at the type level.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
+#[allow(non_camel_case_types)]
+#[repr(u16)]
+#[rustfmt::skip]
+pub enum SyntaxKind {
+    // First, the variants from the `Token` enum.
+
+    // Note: we rely on the token "kinds" having the same internal `u16`
+    // representation as the `Token` enum, meaning the number and the
+    // order of these must match the `Token` enum exactly.
+
+    // TBD: change `Token` to be a `(SyntaxKind, SmolStr)`.
+
+    /// A keyword (like SELECT) or an optionally quoted SQL identifier
+    Word = 0,
+    /// An unsigned numeric literal
+    Number,
+    /// A character that could not be tokenized
+    Char,
+    /// Single quoted string: i.e: 'string'
+    SingleQuotedString,
+    /// "National" string literal: i.e: N'string'
+    NationalStringLiteral,
+    /// Hexadecimal string literal: i.e.: X'deadbeef'
+    HexStringLiteral,
+    /// Comma
+    Comma,
+    /// Whitespace (space, tab, etc)
+    Whitespace,
+    /// Equality operator `=`
+    Eq,
+    /// Not Equals operator `<>` (or `!=` in some dialects)
+    Neq,
+    /// Less Than operator `<`
+    Lt,
+    /// Greater Than operator `>`
+    Gt,
+    /// Less Than Or Equals operator `<=`
+    LtEq,
+    /// Greater Than Or Equals operator `>=`
+    GtEq,
+    /// Plus operator `+`
+    Plus,
+    /// Minus operator `-`
+    Minus,
+    /// Multiplication operator `*`
+    Mult,
+    /// Division operator `/`
+    Div,
+    /// Modulo Operator `%`
+    Mod,
+    /// Left parenthesis `(`
+    LParen,
+    /// Right parenthesis `)`
+    RParen,
+    /// Period (used for compound identifiers or projections into nested types)
+    Period,
+    /// Colon `:`
+    Colon,
+    /// DoubleColon `::` (used for casting in postgresql)
+    DoubleColon,
+    /// SemiColon `;` used as separator for COPY and payload
+    SemiColon,
+    /// Backslash `\` used in terminating the COPY payload with `\.`
+    Backslash,
+    /// Left bracket `[`
+    LBracket,
+    /// Right bracket `]`
+    RBracket,
+    /// Ampersand &
+    Ampersand,
+    /// Left brace `{`
+    LBrace,
+    /// Right brace `}`
+    RBrace,
+
+    // Other kinds representing non-leaf nodes in the syntax tree.
+    ROOT,
+    ERR,
+    KW,
+
+    // Sentinel value
+    LAST
+}
+
+impl Token {
+    pub fn kind(&self) -> SyntaxKind {
+        // From https://github.com/rust-lang/rfcs/blob/master/text/2363-arbitrary-enum-discriminant.md
+        unsafe { *(self as *const Self as *const SyntaxKind) }
+    }
+}
+
+impl From<SyntaxKind> for rowan::SyntaxKind {
+    fn from(kind: SyntaxKind) -> Self {
+        Self(kind as u16)
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub enum Lang {}
+impl rowan::Language for Lang {
+    type Kind = SyntaxKind;
+    fn kind_from_raw(raw: rowan::SyntaxKind) -> Self::Kind {
+        assert!(raw.0 <= SyntaxKind::LAST as u16);
+        unsafe { std::mem::transmute::<u16, SyntaxKind>(raw.0) }
+    }
+    fn kind_to_raw(kind: Self::Kind) -> rowan::SyntaxKind {
+        kind.into()
+    }
+}
+
+pub type SyntaxNode = rowan::SyntaxNode<Lang>;
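
Since `Token::kind()` reads the enum discriminant through an unsafe pointer cast, the "number and order must match" invariant above is load-bearing. Here is a sketch of a test that could guard it (an editor's illustration, not part of this commit):

```rust
#[cfg(test)]
mod tests {
    use crate::cst::SyntaxKind;
    use crate::tokenizer::{Token, Whitespace};

    #[test]
    fn token_kinds_match_syntax_kinds() {
        // Spot-check a few unit and payload-carrying variants; reordering
        // either enum should trip at least one of these assertions.
        assert_eq!(Token::Comma.kind(), SyntaxKind::Comma);
        assert_eq!(Token::Whitespace(Whitespace::Space).kind(), SyntaxKind::Whitespace);
        assert_eq!(Token::LParen.kind(), SyntaxKind::LParen);
        assert_eq!(Token::RBrace.kind(), SyntaxKind::RBrace);
    }
}
```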

src/lib.rs

Lines changed: 1 addition & 0 deletions

@@ -35,6 +35,7 @@
 #![warn(clippy::all)]

 pub mod ast;
+pub mod cst;
 pub mod dialect;
 pub mod parser;
 pub mod tokenizer;

src/parser.rs

Lines changed: 116 additions & 5 deletions

@@ -21,6 +21,8 @@ use super::tokenizer::*;
 use std::error::Error;
 use std::fmt;

+use crate::{cst, cst::SyntaxKind as SK};
+
 #[derive(Debug, Clone, PartialEq)]
 pub enum ParserError {
     TokenizerError(String),
@@ -38,6 +40,7 @@ macro_rules! parser_err {
 pub struct Marker {
     /// position in the token stream (`parser.index`)
     index: usize,
+    builder_checkpoint: rowan::Checkpoint,
 }

 #[derive(PartialEq)]
@@ -79,12 +82,63 @@ pub struct Parser {
     tokens: Vec<Token>,
     /// The index of the first unprocessed token in `self.tokens`
     index: usize,
+    builder: rowan::GreenNodeBuilder<'static>,
+
+    // TBD: the parser currently provides an API to move around the token
+    // stream without restrictions (`next_token`/`prev_token`), while the
+    // `builder` does not. To work around this, we keep a list of "pending"
+    // tokens which have already been processed via `next_token`, but may
+    // be put back via `prev_token`.
+    pending: Vec<(cst::SyntaxKind, rowan::SmolStr)>,
+}
+
+/// `ret!(expr => via self.complete(m, SK::FOO, ..))` runs the following steps:
+/// 1) Evaluates `expr`, possibly advancing the parser's position in the token stream;
+/// 2) Closes the current branch of the CST, identified by `m`, and sets its kind to
+///    `SyntaxKind::FOO`;
+/// 3) Returns the value of `expr`, which should be the typed AST node corresponding
+///    to the recently closed branch.
+/// The weird syntax prevents rustfmt from making each call to this macro take up 5 lines.
+macro_rules! ret {
+    { $e: expr => via $self: ident .complete($m: ident, $syntax_kind: expr, ..) } => {
+        {
+            let rv = $e;
+            $self.complete($m, $syntax_kind, rv)
+        }
+    };
 }

 impl Parser {
     /// Parse the specified tokens
     pub fn new(tokens: Vec<Token>) -> Self {
-        Parser { tokens, index: 0 }
+        let mut parser = Parser {
+            tokens,
+            index: 0,
+            builder: rowan::GreenNodeBuilder::new(),
+            pending: vec![],
+        };
+        parser.builder.start_node(SK::ROOT.into());
+        parser
+    }
+
+    pub fn syntax(mut self) -> cst::SyntaxNode {
+        if self.peek_token().is_some() {
+            // Not at end-of-file: either some extraneous tokens are left after
+            // successfully parsing something, or we've bailed with an error.
+            //
+            // TBD: ideally we wouldn't abandon the "current" branch of the
+            // CST on error, instead `.complete()`-ing it as usual, but that's
+            // in conflict with having to return the typed AST, which can't
+            // lack certain bits.
+            self.builder.start_node(SK::ERR.into());
+            while self.next_token().is_some() {}
+            self.builder.finish_node();
+        } else {
+            // TBD: this is required until all parser methods end with `ret!`.
+            self.flush_pending_buffer();
+        }
+        self.builder.finish_node();
+        cst::SyntaxNode::new_root(self.builder.finish())
     }

     /// Parse a SQL statement and produce an Abstract Syntax Tree (AST)
@@ -749,11 +803,44 @@ impl Parser {
     }

     pub fn start(&mut self) -> Marker {
-        Marker { index: self.index }
+        self.flush_pending_buffer();
+        Marker {
+            index: self.index,
+            builder_checkpoint: self.builder.checkpoint(),
+        }
+    }
+
+    fn start_if<F>(&mut self, f: F) -> Option<Marker>
+    where
+        F: FnOnce(&mut Parser) -> bool,
+    {
+        self.flush_pending_buffer();
+        let m = self.start();
+        if f(self) {
+            Some(m)
+        } else {
+            None
+        }
     }

     pub fn reset(&mut self, m: Marker) {
         self.index = m.index;
+        self.pending.truncate(0);
+        // TBD: rowan's builder does not allow reverting to a checkpoint
+    }
+
+    pub fn complete<T>(&mut self, m: Marker, kind: cst::SyntaxKind, rv: T) -> T {
+        self.flush_pending_buffer();
+        self.builder
+            .start_node_at(m.builder_checkpoint, kind.into());
+        self.builder.finish_node();
+        rv
+    }
+
+    pub fn flush_pending_buffer(&mut self) {
+        for (kind, s) in self.pending.drain(..) {
+            self.builder.token(kind.into(), s);
+        }
     }

     /// Return the first non-whitespace token that has not yet been processed
@@ -783,9 +870,10 @@ impl Parser {
     /// (or None if reached end-of-file) and mark it as processed. OK to call
     /// repeatedly after reaching EOF.
     pub fn next_token(&mut self) -> Option<Token> {
+        self.flush_pending_buffer();
+
         loop {
-            self.index += 1;
-            match self.tokens.get(self.index - 1) {
+            match self.next_token_no_skip() {
                 Some(Token::Whitespace(_)) => continue,
                 token => return token.cloned(),
             }
@@ -795,7 +883,12 @@ impl Parser {
     /// Return the first unprocessed token, possibly whitespace.
     pub fn next_token_no_skip(&mut self) -> Option<&Token> {
         self.index += 1;
-        self.tokens.get(self.index - 1)
+        #[allow(clippy::let_and_return)]
+        let token = self.tokens.get(self.index - 1);
+        if let Some(t) = token {
+            self.pending.push((t.kind(), t.to_string().into()));
+        }
+        token
     }

     /// Push back the last non-whitespace token. Must be called after
@@ -805,9 +898,23 @@ impl Parser {
         loop {
             assert!(self.index > 0);
             self.index -= 1;
+
+            if !self.pending.is_empty() {
+                self.pending.pop();
+            } else {
+                assert!(self.index >= self.tokens.len()); // past EOF
+            }
+
             if let Some(Token::Whitespace(_)) = self.tokens.get(self.index) {
                 continue;
             }
+
+            // There may be only one non-whitespace token in `pending`, as by
+            // convention backtracking (i.e. going more than one token back)
+            // is done via `start`/`reset` instead.
+            for tok in &self.pending {
+                assert!(tok.0 == SK::Whitespace);
+            }
             return;
         }
     }
@@ -832,6 +939,10 @@ impl Parser {
         match self.peek_token() {
             Some(Token::Word(ref k)) if expected.eq_ignore_ascii_case(&k.keyword) => {
                 self.next_token();
+                // TBD: a hack to change the "kind" of the token just processed
+                let mut p = self.pending.pop().unwrap();
+                p.0 = SK::KW.into();
+                self.pending.push(p);
                 true
             }
             _ => false,
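
Tying the pieces together: `start()` flushes pending tokens and records a rowan checkpoint, the parse method consumes tokens (which accumulate in `pending` and get flushed into the builder), and `complete()` wraps everything consumed since the checkpoint into one labeled node. A hypothetical parser method might look like this (an editor's sketch, not code from the commit: `SK::ERR` stands in for a future non-leaf kind, since only `ROOT`/`ERR`/`KW` exist so far, and `expect_token`/`parse_expr` stand in for the parser's existing helpers):

```rust
fn parse_parenthesized_expr(&mut self) -> Result<Expr, ParserError> {
    // Record a checkpoint; every token consumed from here on will end up
    // inside the node closed by `complete`.
    let m = self.start();
    self.expect_token(&Token::LParen)?;
    let expr = self.parse_expr()?;
    self.expect_token(&Token::RParen)?;
    // Evaluate, close the branch opened at `m` with the given kind, and
    // return the typed AST value.
    Ok(ret!(expr => via self.complete(m, SK::ERR, ..)))
}
```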
