Skip to content

Commit 3c52167

Browse files
Fokko and kevinjqliu
authored
Fix the snapshot summary of a partial overwrite (#1879)
# Rationale for this change @kevinjqliu PTAL. I took the liberty of providing a fix for this since I was curious where this was coming from, hope you don't mind! I've cherry-picked your commit with the test. ![image](https://github.com/user-attachments/assets/14227da9-1f4a-4411-88f0-309907d3d332) Java produces: ```json { "added-data-files": "1", "added-files-size": "707", "added-records": "2", "app-id": "local-1743678304626", "changed-partition-count": "1", "deleted-data-files": "1", "deleted-records": "3", "engine-name": "spark", "engine-version": "3.5.5", "iceberg-version": "Apache Iceberg 1.8.1 (commit 9ce0fcf0af7becf25ad9fc996c3bad2afdcfd33d)", "removed-files-size": "693", "spark.app.id": "local-1743678304626", "total-data-files": "3", "total-delete-files": "0", "total-equality-deletes": "0", "total-files-size": "1993", "total-position-deletes": "0", "total-records": "4" } ``` # Are these changes tested? # Are there any user-facing changes? <!-- In the case of user-facing changes, please add the changelog label. --> --------- Co-authored-by: Kevin Liu <[email protected]> Co-authored-by: Kevin Liu <[email protected]>
1 parent 881b2d5 commit 3c52167

File tree

5 files changed

+115
-27
lines changed

5 files changed

+115
-27
lines changed

pyiceberg/table/snapshots.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
from pyiceberg.manifest import DataFile, DataFileContent, ManifestFile, _manifests
2929
from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec
3030
from pyiceberg.schema import Schema
31+
from pyiceberg.utils.deprecated import deprecation_message
3132

3233
if TYPE_CHECKING:
3334
from pyiceberg.table.metadata import TableMetadata
@@ -356,6 +357,11 @@ def update_snapshot_summaries(
356357
raise ValueError(f"Operation not implemented: {summary.operation}")
357358

358359
if truncate_full_table and summary.operation == Operation.OVERWRITE and previous_summary is not None:
360+
deprecation_message(
361+
deprecated_in="0.10.0",
362+
removed_in="0.11.0",
363+
help_message="The truncate-full-table shouldn't be used.",
364+
)
359365
summary = _truncate_table_summary(summary, previous_summary)
360366

361367
if not previous_summary:

pyiceberg/table/update/snapshot.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -236,7 +236,6 @@ def _summary(self, snapshot_properties: Dict[str, str] = EMPTY_DICT) -> Summary:
236236
return update_snapshot_summaries(
237237
summary=Summary(operation=self._operation, **ssc.build(), **snapshot_properties),
238238
previous_summary=previous_snapshot.summary if previous_snapshot is not None else None,
239-
truncate_full_table=self._operation == Operation.OVERWRITE,
240239
)
241240

242241
def _commit(self) -> UpdatesAndRequirements:

tests/integration/test_deletes.py

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -467,21 +467,19 @@ def test_partitioned_table_positional_deletes_sequence_number(spark: SparkSessio
467467
assert snapshots[2].summary == Summary(
468468
Operation.OVERWRITE,
469469
**{
470-
"added-files-size": snapshots[2].summary["total-files-size"],
471470
"added-data-files": "1",
471+
"added-files-size": snapshots[2].summary["added-files-size"],
472472
"added-records": "2",
473473
"changed-partition-count": "1",
474-
"total-files-size": snapshots[2].summary["total-files-size"],
475-
"total-delete-files": "0",
476-
"total-data-files": "1",
477-
"total-position-deletes": "0",
478-
"total-records": "2",
479-
"total-equality-deletes": "0",
480-
"deleted-data-files": "2",
481-
"removed-delete-files": "1",
482-
"deleted-records": "5",
474+
"deleted-data-files": "1",
475+
"deleted-records": "3",
483476
"removed-files-size": snapshots[2].summary["removed-files-size"],
484-
"removed-position-deletes": "1",
477+
"total-data-files": "2",
478+
"total-delete-files": "1",
479+
"total-equality-deletes": "0",
480+
"total-files-size": snapshots[2].summary["total-files-size"],
481+
"total-position-deletes": "1",
482+
"total-records": "4",
485483
},
486484
)
487485

tests/integration/test_writes/test_writes.py

Lines changed: 94 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -250,7 +250,7 @@ def test_summaries(spark: SparkSession, session_catalog: Catalog, arrow_table_wi
250250
"total-records": "0",
251251
}
252252

253-
# Overwrite
253+
# Append
254254
assert summaries[3] == {
255255
"added-data-files": "1",
256256
"added-files-size": str(file_size),
@@ -264,6 +264,99 @@ def test_summaries(spark: SparkSession, session_catalog: Catalog, arrow_table_wi
264264
}
265265

266266

267+
@pytest.mark.integration
268+
def test_summaries_partial_overwrite(spark: SparkSession, session_catalog: Catalog) -> None:
269+
identifier = "default.test_summaries_partial_overwrite"
270+
TEST_DATA = {
271+
"id": [1, 2, 3, 1, 1],
272+
"name": ["AB", "CD", "EF", "CD", "EF"],
273+
}
274+
pa_schema = pa.schema(
275+
[
276+
pa.field("id", pa.int32()),
277+
pa.field("name", pa.string()),
278+
]
279+
)
280+
arrow_table = pa.Table.from_pydict(TEST_DATA, schema=pa_schema)
281+
tbl = _create_table(session_catalog, identifier, {"format-version": "2"}, schema=pa_schema)
282+
with tbl.update_spec() as txn:
283+
txn.add_identity("id")
284+
tbl.append(arrow_table)
285+
286+
assert len(tbl.inspect.data_files()) == 3
287+
288+
tbl.delete(delete_filter="id == 1 and name = 'AB'") # partial overwrite data from 1 data file
289+
290+
rows = spark.sql(
291+
f"""
292+
SELECT operation, summary
293+
FROM {identifier}.snapshots
294+
ORDER BY committed_at ASC
295+
"""
296+
).collect()
297+
298+
operations = [row.operation for row in rows]
299+
assert operations == ["append", "overwrite"]
300+
301+
summaries = [row.summary for row in rows]
302+
303+
file_size = int(summaries[0]["added-files-size"])
304+
assert file_size > 0
305+
306+
# APPEND
307+
assert summaries[0] == {
308+
"added-data-files": "3",
309+
"added-files-size": "2570",
310+
"added-records": "5",
311+
"changed-partition-count": "3",
312+
"total-data-files": "3",
313+
"total-delete-files": "0",
314+
"total-equality-deletes": "0",
315+
"total-files-size": "2570",
316+
"total-position-deletes": "0",
317+
"total-records": "5",
318+
}
319+
# Java produces:
320+
# {
321+
# "added-data-files": "1",
322+
# "added-files-size": "707",
323+
# "added-records": "2",
324+
# "app-id": "local-1743678304626",
325+
# "changed-partition-count": "1",
326+
# "deleted-data-files": "1",
327+
# "deleted-records": "3",
328+
# "engine-name": "spark",
329+
# "engine-version": "3.5.5",
330+
# "iceberg-version": "Apache Iceberg 1.8.1 (commit 9ce0fcf0af7becf25ad9fc996c3bad2afdcfd33d)",
331+
# "removed-files-size": "693",
332+
# "spark.app.id": "local-1743678304626",
333+
# "total-data-files": "3",
334+
# "total-delete-files": "0",
335+
# "total-equality-deletes": "0",
336+
# "total-files-size": "1993",
337+
# "total-position-deletes": "0",
338+
# "total-records": "4"
339+
# }
340+
files = tbl.inspect.data_files()
341+
assert len(files) == 3
342+
assert summaries[1] == {
343+
"added-data-files": "1",
344+
"added-files-size": "859",
345+
"added-records": "2",
346+
"changed-partition-count": "1",
347+
"deleted-data-files": "1",
348+
"deleted-records": "3",
349+
"removed-files-size": "866",
350+
"total-data-files": "3",
351+
"total-delete-files": "0",
352+
"total-equality-deletes": "0",
353+
"total-files-size": "2563",
354+
"total-position-deletes": "0",
355+
"total-records": "4",
356+
}
357+
assert len(tbl.scan().to_pandas()) == 4
358+
359+
267360
@pytest.mark.integration
268361
def test_data_files(spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table) -> None:
269362
identifier = "default.arrow_data_files"

tests/table/test_snapshots.py

Lines changed: 6 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -289,7 +289,6 @@ def test_merge_snapshot_summaries_overwrite_summary() -> None:
289289
"total-position-deletes": "1",
290290
"total-records": "1",
291291
},
292-
truncate_full_table=True,
293292
)
294293

295294
expected = {
@@ -299,18 +298,12 @@ def test_merge_snapshot_summaries_overwrite_summary() -> None:
299298
"added-files-size": "4",
300299
"added-position-deletes": "5",
301300
"added-records": "6",
302-
"total-data-files": "1",
303-
"total-records": "6",
304-
"total-delete-files": "2",
305-
"total-equality-deletes": "3",
306-
"total-files-size": "4",
307-
"total-position-deletes": "5",
308-
"deleted-data-files": "1",
309-
"removed-delete-files": "1",
310-
"deleted-records": "1",
311-
"removed-files-size": "1",
312-
"removed-position-deletes": "1",
313-
"removed-equality-deletes": "1",
301+
"total-data-files": "2",
302+
"total-delete-files": "3",
303+
"total-records": "7",
304+
"total-files-size": "5",
305+
"total-position-deletes": "6",
306+
"total-equality-deletes": "4",
314307
}
315308

316309
assert actual.additional_properties == expected
@@ -337,7 +330,6 @@ def test_invalid_type() -> None:
337330
},
338331
),
339332
previous_summary={"total-data-files": "abc"}, # should be a number
340-
truncate_full_table=True,
341333
)
342334

343335
assert "Could not parse summary property total-data-files to an int: abc" in str(e.value)

0 commit comments

Comments (0)