Skip to content

Commit 3de76c2

Browse files
committed
improve lexer
1 parent b4a4e7c commit 3de76c2

File tree

1 file changed

+33
-22
lines changed

1 file changed

+33
-22
lines changed

src/parser/lexer.ts

Lines changed: 33 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,8 @@ const word = choiceOnlyOne(latinWord, singleUcsurWord);
8686
const properWords = allAtLeastOnce(
8787
match(/[A-Z][a-zA-Z]*/, "proper word").skip(spaces),
8888
)
89-
.map((array) => array.join(" "));
89+
.map((array) => array.join(" "))
90+
.map<Token>((words) => ({ type: "proper word", words, kind: "latin" }));
9091
/** Parses a specific word, either UCSUR or latin. */
9192
function specificWord(thatWord: string): Parser<string> {
9293
return word.filter((thisWord) => {
@@ -98,13 +99,16 @@ function specificWord(thatWord: string): Parser<string> {
9899
});
99100
}
100101
/** Parses multiple a. */
101-
const multipleA = sequence(specificWord("a"), allAtLeastOnce(specificWord("a")))
102-
.map(([a, as]) => [a, ...as].length);
102+
const multipleA = sequence(
103+
specificWord("a"),
104+
count(allAtLeastOnce(specificWord("a"))),
105+
)
106+
.map<Token>(([_, count]) => ({ type: "multiple a", count: count + 1 }));
103107
/** Parses lengthened words. */
104108
const longWord = choiceOnlyOne(matchString("a"), matchString("n"))
105109
.then((word) =>
106110
count(allAtLeastOnce(matchString(word)))
107-
.map<Token & { type: "long word" }>((count) => ({
111+
.map<Token>((count) => ({
108112
type: "long word",
109113
word,
110114
length: count + 1,
@@ -124,7 +128,8 @@ const xAlaX = lazy(() => {
124128
sequence(specificWord("ala"), specificWord(word)).map(() => word)
125129
);
126130
}
127-
});
131+
})
132+
.map<Token>((word) => ({ type: "x ala x", word }));
128133

129134
Parser.endCache();
130135

@@ -139,7 +144,8 @@ const punctuation = choiceOnlyOne(
139144
)
140145
.skip(spaces),
141146
newline.map(() => "."),
142-
);
147+
)
148+
.map<Token>((punctuation) => ({ type: "punctuation", punctuation }));
143149
/**
144150
* Parses cartouche element and returns the phonemes or letters it represents.
145151
*/
@@ -181,7 +187,13 @@ const cartouche = sequence(
181187
return `${word[0].toUpperCase()}${word.slice(1)}`;
182188
});
183189
/** Parses multiple cartouches. */
184-
const cartouches = allAtLeastOnce(cartouche).map((words) => words.join(" "));
190+
const cartouches = allAtLeastOnce(cartouche)
191+
.map((words) => words.join(" "))
192+
.map<Token>((words) => ({
193+
type: "proper word",
194+
words,
195+
kind: "cartouche",
196+
}));
185197
/**
186198
* Parses long glyph container.
187199
*
@@ -243,32 +255,31 @@ const insideLongGlyph = specificSpecialUcsur(END_OF_REVERSE_LONG_GLYPH)
243255
.skip(specificSpecialUcsur(START_OF_LONG_GLYPH))
244256
.skip(spaces)
245257
.map<Token>((words) => ({ type: "inside long glyph", words }));
258+
const combinedGlyphsToken = combinedGlyphs
259+
.skip(spaces)
260+
.map<Token>((words) => ({ type: "combined glyphs", words }));
261+
const wordToken = word.map<Token>((word) => ({ type: "word", word }));
246262

247263
Parser.startCache(cache);
248264

249265
/** Parses a token. */
250-
export const token = choiceOnlyOne<Token>(
266+
export const token = choiceOnlyOne(
267+
longWord,
268+
xAlaX,
269+
multipleA,
270+
wordToken,
271+
properWords,
272+
// UCSUR only
251273
spaceLongGlyph,
252274
headedLongGlyphStart,
253-
combinedGlyphs
254-
.skip(spaces)
255-
.map((words) => ({ type: "combined glyphs", words })),
256-
properWords.map((words) => ({ type: "proper word", words, kind: "latin" })),
257-
longWord,
258-
xAlaX.map((word) => ({ type: "x ala x", word })),
259-
multipleA.map((count) => ({ type: "multiple a", count })),
260-
word.map((word) => ({ type: "word", word })),
275+
combinedGlyphsToken,
261276
// starting with non-words:
262-
punctuation.map((punctuation) => ({ type: "punctuation", punctuation })),
277+
punctuation,
263278
headlessLongGlyphEnd,
264279
headedLongGlyphEnd,
265280
headlessLongGlyphStart,
266281
insideLongGlyph,
267-
cartouches.map((words) => ({
268-
type: "proper word",
269-
words,
270-
kind: "cartouche",
271-
})),
282+
cartouches,
272283
);
273284

274285
Parser.endCache();

0 commit comments

Comments
 (0)