Skip to content

Commit 9ee11c2

Browse files
committed
reimplement _check_schema_compatible
1 parent 6a9b612 commit 9ee11c2

File tree

2 files changed

+45
-7
lines changed

2 files changed

+45
-7
lines changed

pyiceberg/io/pyarrow.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2036,24 +2036,29 @@ def _check_schema_compatible(table_schema: Schema, other_schema: pa.Schema, down
20362036
"""
20372037
Check if the `table_schema` is compatible with `other_schema`.
20382038
2039-
Two schemas are considered compatible when they are equal in terms of the Iceberg Schema type.
2039+
The schemas are compatible if:
2040+
- All fields in `other_schema` are present in `table_schema`. (other_schema <= table_schema)
2041+
- All required fields in `table_schema` are present in `other_schema`.
20402042
20412043
Raises:
20422044
ValueError: If the schemas are not compatible.
20432045
"""
2046+
from pyiceberg.io.pyarrow import _pyarrow_to_schema_without_ids, pyarrow_to_schema
2047+
20442048
name_mapping = table_schema.name_mapping
20452049
try:
2046-
task_schema = pyarrow_to_schema(
2047-
other_schema, name_mapping=name_mapping, downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us
2048-
)
2050+
other_schema = pyarrow_to_schema(other_schema, name_mapping=name_mapping)
20492051
except ValueError as e:
2050-
other_schema = _pyarrow_to_schema_without_ids(other_schema, downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us)
2052+
other_schema = _pyarrow_to_schema_without_ids(other_schema)
20512053
additional_names = set(other_schema.column_names) - set(table_schema.column_names)
20522054
raise ValueError(
20532055
f"PyArrow table contains more columns: {', '.join(sorted(additional_names))}. Update the schema first (hint, use union_by_name)."
20542056
) from e
20552057

2056-
if table_schema.as_struct() != task_schema.as_struct():
2058+
missing_table_schema_fields = {field for field in other_schema.fields if field not in table_schema.fields}
2059+
required_table_schema_fields = {field for field in table_schema.fields if field.required}
2060+
missing_required_fields = {field for field in required_table_schema_fields if field not in other_schema.fields}
2061+
if missing_table_schema_fields or missing_required_fields:
20572062
from rich.console import Console
20582063
from rich.table import Table as RichTable
20592064

@@ -2066,7 +2071,7 @@ def _check_schema_compatible(table_schema: Schema, other_schema: pa.Schema, down
20662071

20672072
for lhs in table_schema.fields:
20682073
try:
2069-
rhs = task_schema.find_field(lhs.field_id)
2074+
rhs = other_schema.find_field(lhs.field_id)
20702075
rich_table.add_row("✅" if lhs == rhs else "❌", str(lhs), str(rhs))
20712076
except ValueError:
20722077
rich_table.add_row("❌", str(lhs), "Missing")

tests/io/test_pyarrow.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1799,6 +1799,39 @@ def test_schema_mismatch_additional_field(table_schema_simple: Schema) -> None:
17991799
_check_schema_compatible(table_schema_simple, other_schema)
18001800

18011801

1802+
def test_schema_compatible(table_schema_simple: Schema) -> None:
1803+
try:
1804+
_check_schema_compatible(table_schema_simple, table_schema_simple.as_arrow())
1805+
except Exception:
1806+
pytest.fail("Unexpected Exception raised when calling `_check_schema_compatible`")
1807+
1808+
1809+
def test_schema_projection(table_schema_simple: Schema) -> None:
1810+
# remove optional `baz` field from `table_schema_simple`
1811+
other_schema = pa.schema((
1812+
pa.field("foo", pa.string(), nullable=True),
1813+
pa.field("bar", pa.int32(), nullable=False),
1814+
))
1815+
try:
1816+
_check_schema_compatible(table_schema_simple, other_schema)
1817+
except Exception:
1818+
pytest.fail("Unexpected Exception raised when calling `_check_schema_compatible`")
1819+
1820+
1821+
def test_schema_downcast(table_schema_simple: Schema) -> None:
1822+
# large_string type is compatible with string type
1823+
other_schema = pa.schema((
1824+
pa.field("foo", pa.large_string(), nullable=True),
1825+
pa.field("bar", pa.int32(), nullable=False),
1826+
pa.field("baz", pa.bool_(), nullable=True),
1827+
))
1828+
1829+
try:
1830+
_check_schema_compatible(table_schema_simple, other_schema)
1831+
except Exception:
1832+
pytest.fail("Unexpected Exception raised when calling `_check_schema_compatible`")
1833+
1834+
18021835
def test_schema_downcast(table_schema_simple: Schema) -> None:
18031836
# large_string type is compatible with string type
18041837
other_schema = pa.schema((

0 commit comments

Comments
 (0)