Skip to content

WIP: Non-recursive shunting yard algorithm for expression parsing #524

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
180 changes: 180 additions & 0 deletions src/parser.jl
Original file line number Diff line number Diff line change
Expand Up @@ -589,6 +589,186 @@ function parse_eq_star(ps::ParseState)
end
end

function binding_power(tok) # TODO: prefix ops
k = kind(tok) # lhs rhs
return k in KSet"NewlineWs EndMarker" ? (0, 0) :
k == K"(" ? (1000, 1) :
k == K")" ? (1, 1001) :
k > K"BEGIN_OPS" ? (
k < K"BEGIN_ASSIGNMENTS" ? nothing : # FIXME?
k <= K"END_ASSIGNMENTS" ? (11, 10) :
k <= K"END_PAIRARROW" ? (31, 30) :
k <= K"END_CONDITIONAL" ? (50, 50) : # FIXME Ternary a ? b : c
k <= K"END_ARROW" ? (71, 70) :
k <= K"END_LAZYOR" ? (91, 90) :
k <= K"END_LAZYAND" ? (111, 110) :
k <= K"END_COMPARISON" ? (130, 130) : # FIXME n-ary K"comparison"
k <= K"<|" ? (151, 150) :
k <= K"|>" ? (170, 171) :
k <= K"END_COLON" ? (190, 191) : # FIXME Optional infix ternary a : b : c
k <= K"END_PLUS" ? (210, 211) :
k <= K"END_TIMES" ? (230, 231) :
k <= K"END_RATIONAL" ? (240, 241) :
k <= K"END_BITSHIFTS" ? (251, 250) :
k <= K"END_POWER" ? (261, 260) :
k <= K"END_DECL" ? (270, 271) :
k == K"where" ? (280, 281) :
k == K"." ? (290, 291) :
k <= K"!" ? (310, 311) :
k <= K"'" ? (320, 1) :
k <= K"->" ? (330, 331) :
k <= K"END_UNICODE_OPS" ? (340, 341) :
nothing
) :
nothing
end

# Expression parser based on the Shunting-Yard algorithm
#
# This is a form of operator precedence parsing basically equivalent to
# "precedence climbing" / "Pratt parsing". In contrast to Pratt parsing, we
# keep all the state in an explicit stack.
#
# Some error reporting is included in this implementation; this seems fairly
# easy to do, despite what some people claim about the shunting yard. Having
# the explicit stack gives us easy access to a lot more state so it's likely
# that error reporting is easier than in a recursive formulation.
#
# Some refs:
# https://matklad.github.io/2020/04/15/from-pratt-to-dijkstra.html
# https://en.wikipedia.org/wiki/Shunting_yard_algorithm
function shunting_yard(ps::ParseState, min_binding_power)
# Operator stack. Top kept separately for efficiency
mark = position(ps) # Mark to the left of the last non-operator
prev_right_bp = min_binding_power # Right binding power of the last operator bumped
prev_op = true # True if previous was an operator
stack = []
while true
skip_nl = ps.whitespace_newline || prev_op # TODO: Or in parens.
tok = peek_token(ps, skip_newlines=skip_nl)
k = kind(tok)
bps = binding_power(k)
juxtapose_error = false
if isnothing(bps)
if !prev_op
# Treat erroneous juxtaposition as a low binding power "error operator"
juxtapose_error = true
k = K"error"
bps = (5,5)
else
mark = position(ps)
@assert is_literal(k) || is_identifier(k)
bump(ps, skip_newlines=skip_nl)
prev_op = false
continue
end
end

if prev_op && !(is_unary_op(tok) || k == K"(")
@error "bump term err"
bump_trivia(ps, error="Expected term", skip_newlines=skip_nl)
end
left_bp, right_bp = bps
# max_terms = is_suffixed(tok) ? 2 :
# (k == K"+" || k == K"++" || k == K"*") ? typemax(Int) :
# k == K":" ? 3 :
curr_chain_op = k in KSet"+ ++ *" && !is_decorated(tok)
while !isempty(stack) && stack[end].right_bp > left_bp &&
!(curr_chain_op && (t2 = stack[end].tok; kind(t2) == k && !is_decorated(t2)))
top = pop!(stack)
tk = kind(top.tok)
if tk == K"error" || is_syntactic_unary_op(k)
emit(ps, top.mark, tk)
elseif tk == K")"
if !isempty(stack) && kind(stack[end].tok) == K"("
top = pop!(stack)
emit(ps, top.mark, K"parens")
else
reset_node!(ps, top.op_mark, kind=K"error")
pos2 = ParseStreamPosition(top.op_mark.token_index+1, top.op_mark.range_index)
emit_diagnostic(ps, top.op_mark, pos2, error="found `)` without matching `(`")
end
elseif tk == K"("
reset_node!(ps, top.op_mark, kind=K"error")
pos2 = ParseStreamPosition(top.op_mark.token_index+1, top.op_mark.range_index)
emit_diagnostic(ps, top.op_mark, pos2, error="found `(` without matching `)`")
else
is_chain_op = tk in KSet"+ ++ *" && !is_decorated(top.tok) &&
!isempty(stack) && (t2 = stack[end].tok; kind(t2) == tk && !is_decorated(t2))
if is_chain_op
# Chains are consumed from right to left and we only want
# the leftmost operator token as nontrivia.
reset_node!(ps, top.op_mark, flags=TRIVIA_FLAG)
else
# FIXME PREFIX_OP_FLAG
emit(ps, top.mark, is_dotted(top.tok) ? K"dotcall" : K"call",
tk == K"'" ? POSTFIX_OP_FLAG : INFIX_FLAG)
end
end
mark = top.mark
end

if right_bp <= min_binding_power
@assert isempty(stack) # TODO?
break
end

op_mark = if juxtapose_error
bump_trivia(ps, error="Cannot juxtapose expressions", skip_newlines=skip_nl)
elseif k == K")" || k == K"("
# TODO: There's a lot of hacks here to support `(` and `)` and it's
# probably still buggy. For example, I think the termination
# condition at nonzero min_binding_power is buggy in the presence
# of parens. Overall, it's probably best just to have a separate
# special case for parens.
#
# Especially because they will need to dispatch to some quite
# complex code when we detect "anything interesting" - there's soo
# many cases where parens aren't just grouping:
# - function call syntax without commas `*(x)`
# - tuples `(x,)`
# - named tuples `(; x)`
#
# There's even cases which are almost the same as the above, but
# where parens are just grouping after all, such as
# - unary operator syntax where `()` is grouping like `+(x)`
#
# See parse_paren and parse_unary etc :(
#
bump_trivia(ps, skip_newlines=skip_nl)
m = position(ps)
bump(ps, TRIVIA_FLAG, skip_newlines=skip_nl)
if k == K"("
mark = m
end
m
else
bump_trivia(ps, skip_newlines=skip_nl) # or support skip_newlines in bump_dotsplit
bump_dotsplit(ps, remap_kind=K"Identifier")
end

prev_op = !(k == K"'" || k == K")") # postfix ops

# TODO: Deal with postfix unary ops here
push!(stack, (; mark, right_bp, tok, op_mark))
end
end

# Hacky convenience driver for shunting_yard
function parse_sy(text; kws...)
parse_sy(SyntaxNode, text; kws...)
end

function parse_sy(::Type{TreeType}, text; ignore_errors=false) where {TreeType}
stream = ParseStream(text)
shunting_yard(ParseState(stream), 0)
validate_tokens(stream)
if !ignore_errors && !isempty(stream.diagnostics)
throw(ParseError(stream))
end
build_tree(TreeType, stream, keep_parens=true)
end

# a = b ==> (= a b)
#
# flisp: parse-assignment
Expand Down
Loading