Skip to content

Commit e9f30c0

Browse files
committed
Initial unicode character support for identifiers and whitespace
Summary: (none given). Test Plan: Added a test. Reviewers, Subscribers, Tasks, Tags: (none given).
1 parent 19f8399 commit e9f30c0

File tree

7 files changed

+25375
-5
lines changed

7 files changed

+25375
-5
lines changed

parser/cpp/prepare-javacc-grammar.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,4 @@ pwd
33
GRAMMAR_DIR='../grammar'
44
GEN_DIR='target/generated-sources/javacc'
55
mkdir -p $GEN_DIR
6-
cat ./javacc-options.txt $GRAMMAR_DIR/nonreservedwords.txt $GRAMMAR_DIR/reservedwords.txt $GRAMMAR_DIR/sql-spec.txt $GRAMMAR_DIR/presto-extensions.txt $GRAMMAR_DIR/lexical-elements.txt > $GEN_DIR/parser_tmp.jjt
6+
cat ./javacc-options.txt $GRAMMAR_DIR/nonreservedwords.txt $GRAMMAR_DIR/reservedwords.txt $GRAMMAR_DIR/sql-spec.txt $GRAMMAR_DIR/presto-extensions.txt $GRAMMAR_DIR/unicode-identifier-start.txt $GRAMMAR_DIR/unicode-identifier-extend.txt $GRAMMAR_DIR/ws.txt $GRAMMAR_DIR/lexical-elements.txt > $GEN_DIR/parser_tmp.jjt

parser/grammar/lexical-elements.txt

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -80,12 +80,14 @@ regular_identifier()
8080

8181
| <#identifier_part: <identifier_start> | <identifier_extend> >
8282

83-
| <#identifier_start: ["a"-"z"] // temp
83+
| <#identifier_start: (<UnicodeIdentifierStart>)
8484
/*!! See the Syntax Rules.*/
85+
/* Unicode char classes: <Ll> | <Lm> | <Lo> | <Lt> | <Lu> | <Nl> */
8586
>
8687

87-
| <#identifier_extend: ["\u00B7", "0"-"9", "_"] // temp
88+
| <#identifier_extend: ["\u00B7"] | <UnicodeIdentifierExtend>
8889
//!! See the Syntax Rules.
90+
/* Unicode char classes: <Mn>, <Mc>, <Nd>, <Pc>, <Cf> */
8991
>
9092

9193
| <large_object_length_token: ( <digit> )+ <multiplier> >
@@ -150,7 +152,14 @@ delimiter_token:
150152

151153
SPECIAL_TOKEN:
152154
{
153-
<white_space: <newline> | [ " ", "\t" ] // temp
155+
<white_space: (<UnicodeWhiteSpace> |
156+
[ "\u0009" // Horizontal Tabulation
157+
, "\n" // "\\u000A" //, Line Feed
158+
, "\u000B" // Vertical Tabulation
159+
, "\f" // "\\u000C" //, Form Feed
160+
, "\r" // "\\u000D" //, Carriage Return
161+
, "\u0085" // Next Line
162+
])
154163
//!! See the Syntax Rules.
155164
>
156165

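(File path not shown in this listing — judging by the relative paths in the hunk below, this appears to be the grammar-concatenation script that is run from inside the grammar directory; confirm against the commit.)

Lines changed: 1 addition & 1 deletion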
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
# Concatenate all the fragments into a .jj file.
22
gendir='../target/generated-sources/javacc'
33
mkdir -p $gendir
4-
cat javacc-options-java.txt nonreservedwords.txt reservedwords.txt sql-spec.txt presto-extensions.txt lexical-elements.txt > $gendir/parser_tmp.jjt
4+
cat javacc-options-java.txt nonreservedwords.txt reservedwords.txt sql-spec.txt presto-extensions.txt unicode-identifier-start.txt unicode-identifier-extend.txt ws.txt lexical-elements.txt > $gendir/parser_tmp.jjt

parser/grammar/unicode-identifier-extend.txt

Lines changed: 3316 additions & 0 deletions
Large diffs are not rendered by default.

parser/grammar/unicode-identifier-start.txt

Lines changed: 22012 additions & 0 deletions
Large diffs are not rendered by default.

parser/grammar/ws.txt

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
TOKEN:
2+
{
3+
<#Zl: [
4+
"\u2028" //LINE SEPARATOR;Zl;0;WS;;;;;N;;;;;
5+
]>
6+
7+
| <#Zp: [
8+
"\u2029" //PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;;
9+
]>
10+
11+
| <#Zs: [
12+
"\u0020" //SPACE;Zs;0;WS;;;;;N;;;;;
13+
, "\u00A0" //NO-BREAK SPACE;Zs;0;CS;<noBreak> 0020;;;;N;NON-BREAKING SPACE;;;;
14+
, "\u1680" //OGHAM SPACE MARK;Zs;0;WS;;;;;N;;;;;
15+
, "\u2000" //EN QUAD;Zs;0;WS;2002;;;;N;;;;;
16+
, "\u2001" //EM QUAD;Zs;0;WS;2003;;;;N;;;;;
17+
, "\u2002" //EN SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
18+
, "\u2003" //EM SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
19+
, "\u2004" //THREE-PER-EM SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
20+
, "\u2005" //FOUR-PER-EM SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
21+
, "\u2006" //SIX-PER-EM SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
22+
, "\u2007" //FIGURE SPACE;Zs;0;WS;<noBreak> 0020;;;;N;;;;;
23+
, "\u2008" //PUNCTUATION SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
24+
, "\u2009" //THIN SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
25+
, "\u200A" //HAIR SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
26+
, "\u202F" //NARROW NO-BREAK SPACE;Zs;0;CS;<noBreak> 0020;;;;N;;;;;
27+
, "\u205F" //MEDIUM MATHEMATICAL SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
28+
, "\u3000" //IDEOGRAPHIC SPACE;Zs;0;WS;<wide> 0020;;;;N;;;;;
29+
]>
30+
31+
| <#UnicodeWhiteSpace: (<Zl> | <Zp> | <Zs>)>
32+
}

parser/src/test/java/com/facebook/coresql/parser/TestSqlParser.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ public class TestSqlParser
5252
"SELECT f(f(f(f(f(f(f(f(f(f(f(f(f(f(f(f(f(f(f(f(f(f(f(f(f(f(f(f(f(f())))))))))))))))))))))))))))));",
5353
"SELECT abs, 2 as abs;",
5454
"SELECT sqrt(x), power(y, 5), myFunction('a') FROM T;",
55+
"SELECT 1 ఒకటి;",
5556
};
5657

5758
private AstNode parse(String sql)

0 commit comments

Comments
 (0)