-
Notifications
You must be signed in to change notification settings - Fork 291
Fix parsing reference for nested fields #965
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -572,51 +572,54 @@ def _convert_scalar(value: Any, iceberg_type: IcebergType) -> pa.scalar: | |
|
||
|
||
class _ConvertToArrowExpression(BoundBooleanExpressionVisitor[pc.Expression]): | ||
def _flat_name_to_list(self, name: str) -> List[str]: | ||
return name.split(".") | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is a risky route. I'd rather migrate to a |
||
|
||
def visit_in(self, term: BoundTerm[Any], literals: Set[Any]) -> pc.Expression: | ||
pyarrow_literals = pa.array(literals, type=schema_to_pyarrow(term.ref().field.field_type)) | ||
return pc.field(term.ref().field.name).isin(pyarrow_literals) | ||
return pc.field(*self._flat_name_to_list(term.ref().name)).isin(pyarrow_literals) | ||
|
||
def visit_not_in(self, term: BoundTerm[Any], literals: Set[Any]) -> pc.Expression: | ||
pyarrow_literals = pa.array(literals, type=schema_to_pyarrow(term.ref().field.field_type)) | ||
return ~pc.field(term.ref().field.name).isin(pyarrow_literals) | ||
return ~pc.field(*self._flat_name_to_list(term.ref().name)).isin(pyarrow_literals) | ||
|
||
def visit_is_nan(self, term: BoundTerm[Any]) -> pc.Expression: | ||
ref = pc.field(term.ref().field.name) | ||
ref = pc.field(*self._flat_name_to_list(term.ref().name)) | ||
return pc.is_nan(ref) | ||
|
||
def visit_not_nan(self, term: BoundTerm[Any]) -> pc.Expression: | ||
ref = pc.field(term.ref().field.name) | ||
ref = pc.field(*self._flat_name_to_list(term.ref().name)) | ||
return ~pc.is_nan(ref) | ||
|
||
def visit_is_null(self, term: BoundTerm[Any]) -> pc.Expression: | ||
return pc.field(term.ref().field.name).is_null(nan_is_null=False) | ||
return pc.field(*self._flat_name_to_list(term.ref().name)).is_null(nan_is_null=False) | ||
|
||
def visit_not_null(self, term: BoundTerm[Any]) -> pc.Expression: | ||
return pc.field(term.ref().field.name).is_valid() | ||
return pc.field(*self._flat_name_to_list(term.ref().name)).is_valid() | ||
|
||
def visit_equal(self, term: BoundTerm[Any], literal: Literal[Any]) -> pc.Expression: | ||
return pc.field(term.ref().field.name) == _convert_scalar(literal.value, term.ref().field.field_type) | ||
return pc.field(*self._flat_name_to_list(term.ref().name)) == _convert_scalar(literal.value, term.ref().field.field_type) | ||
|
||
def visit_not_equal(self, term: BoundTerm[Any], literal: Literal[Any]) -> pc.Expression: | ||
return pc.field(term.ref().field.name) != _convert_scalar(literal.value, term.ref().field.field_type) | ||
return pc.field(*self._flat_name_to_list(term.ref().name)) != _convert_scalar(literal.value, term.ref().field.field_type) | ||
|
||
def visit_greater_than_or_equal(self, term: BoundTerm[Any], literal: Literal[Any]) -> pc.Expression: | ||
return pc.field(term.ref().field.name) >= _convert_scalar(literal.value, term.ref().field.field_type) | ||
return pc.field(*self._flat_name_to_list(term.ref().name)) >= _convert_scalar(literal.value, term.ref().field.field_type) | ||
|
||
def visit_greater_than(self, term: BoundTerm[Any], literal: Literal[Any]) -> pc.Expression: | ||
return pc.field(term.ref().field.name) > _convert_scalar(literal.value, term.ref().field.field_type) | ||
return pc.field(*self._flat_name_to_list(term.ref().name)) > _convert_scalar(literal.value, term.ref().field.field_type) | ||
|
||
def visit_less_than(self, term: BoundTerm[Any], literal: Literal[Any]) -> pc.Expression: | ||
return pc.field(term.ref().field.name) < _convert_scalar(literal.value, term.ref().field.field_type) | ||
return pc.field(*self._flat_name_to_list(term.ref().name)) < _convert_scalar(literal.value, term.ref().field.field_type) | ||
|
||
def visit_less_than_or_equal(self, term: BoundTerm[Any], literal: Literal[Any]) -> pc.Expression: | ||
return pc.field(term.ref().field.name) <= _convert_scalar(literal.value, term.ref().field.field_type) | ||
return pc.field(*self._flat_name_to_list(term.ref().name)) <= _convert_scalar(literal.value, term.ref().field.field_type) | ||
|
||
def visit_starts_with(self, term: BoundTerm[Any], literal: Literal[Any]) -> pc.Expression: | ||
return pc.starts_with(pc.field(term.ref().field.name), literal.value) | ||
return pc.starts_with(pc.field(*self._flat_name_to_list(term.ref().name)), literal.value) | ||
|
||
def visit_not_starts_with(self, term: BoundTerm[Any], literal: Literal[Any]) -> pc.Expression: | ||
return ~pc.starts_with(pc.field(term.ref().field.name), literal.value) | ||
return ~pc.starts_with(pc.field(*self._flat_name_to_list(term.ref().name)), literal.value) | ||
|
||
def visit_true(self) -> pc.Expression: | ||
return pc.scalar(True) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -52,7 +52,7 @@ def test_false() -> None: | |
def test_is_null() -> None: | ||
assert IsNull("foo") == parser.parse("foo is null") | ||
assert IsNull("foo") == parser.parse("foo IS NULL") | ||
assert IsNull("foo") == parser.parse("table.foo IS NULL") | ||
assert IsNull("table.foo") == parser.parse("table.foo IS NULL") | ||
|
||
|
||
def test_not_null() -> None: | ||
|
@@ -199,3 +199,7 @@ def test_with_function() -> None: | |
parser.parse("foo = 1 and lower(bar) = '2'") | ||
|
||
assert "Expected end of text, found 'and'" in str(exc_info) | ||
|
||
|
||
def test_nested_field_equality() -> None: | ||
assert EqualTo("foo.first", "a") == parser.parse("foo.first == 'a'") | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
I think the key to success is to have some kind of syntax for quoting literals. For example: https://spark.apache.org/docs/latest/sql-ref-literals.html
Then we can parse something like:
'a.b' -> Reference(('a.b',))
'a.b'.c -> Reference(('a.b', 'c'))
a.b.c -> Reference(('a', 'b', 'c'))
Or folks have to use:
row_filter=EqualTo(('a.b',), 123)
row_filter=EqualTo(('a.b', 'c'), 123)
row_filter=EqualTo(('a', 'b', 'c'), 123)
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
There is also an interesting proposal on the spec side of things: apache/iceberg#10883 Related: apache/iceberg#598 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: extract out `"."` as a variable