Skip to content

Commit 5bbcb11

Browse files
committed
produce overwrite operation
1 parent 322ebdd commit 5bbcb11

File tree

1 file changed

+70
-0
lines changed

1 file changed

+70
-0
lines changed

tests/integration/test_writes/test_writes.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -262,6 +262,76 @@ def test_summaries(spark: SparkSession, session_catalog: Catalog, arrow_table_wi
262262
}
263263

264264

265+
@pytest.mark.integration
def test_summaries_partial_overwrite(spark: SparkSession, session_catalog: Catalog) -> None:
    """Check snapshot-summary properties after a partial overwrite on a partitioned table.

    Appends 5 rows across 3 identity partitions, then deletes a single row from
    one data file (a "partial overwrite"), and asserts the `operation` and
    `summary` columns of the table's `snapshots` metadata table.

    NOTE(review): the second block of assertions intentionally pins the CURRENT
    (buggy) summary values produced by the delete path — see the inline
    "wrong should be ..." comments. Do not "correct" them without fixing the
    underlying summary-production bug first.
    """
    identifier = "default.test_summaries_partial_overwrite"
    TEST_DATA = {
        "id": [1, 2, 3, 1, 1],
        "name": ["AB", "CD", "EF", "CD", "EF"],
    }
    # Dictionary-encoded fields; non-nullable (third arg False).
    pa_schema = pa.schema(
        [
            pa.field("id", pa.dictionary(pa.int32(), pa.int32(), False)),
            pa.field("name", pa.dictionary(pa.int32(), pa.string(), False)),
        ]
    )
    arrow_table = pa.Table.from_pydict(TEST_DATA, schema=pa_schema)
    tbl = _create_table(session_catalog, identifier, {"format-version": "2"}, schema=pa_schema)
    with tbl.update_spec() as txn:
        txn.add_identity("id")  # partition by `id` to create 3 data files
    tbl.append(arrow_table)  # append
    tbl.delete(delete_filter="id == 1 and name = 'AB'")  # partial overwrite data from 1 data file

    rows = spark.sql(
        f"""
        SELECT operation, summary
        FROM {identifier}.snapshots
        ORDER BY committed_at ASC
        """
    ).collect()

    operations = [row.operation for row in rows]
    assert operations == ["append", "overwrite"]

    summaries = [row.summary for row in rows]

    # Sanity: the append actually wrote bytes.
    file_size = int(summaries[0]["added-files-size"])
    assert file_size > 0

    # APPEND
    assert summaries[0] == {
        "added-data-files": "3",
        "added-files-size": "2848",
        "added-records": "5",
        "changed-partition-count": "3",
        "total-data-files": "3",
        "total-delete-files": "0",
        "total-equality-deletes": "0",
        "total-files-size": "2848",
        "total-position-deletes": "0",
        "total-records": "5",
    }
    # BUG `deleted-data-files` property is being replaced by the previous summary's `total-data-files` value
    # OVERWRITE from tbl.delete
    assert summaries[1] == {
        "added-data-files": "1",
        "added-files-size": "859",
        "added-records": "2",  # wrong should be 0
        "changed-partition-count": "1",
        "deleted-data-files": "3",  # wrong should be 1
        "deleted-records": "5",  # wrong should be 1
        "removed-files-size": "2848",
        "total-data-files": "1",  # wrong should be 3
        "total-delete-files": "0",
        "total-equality-deletes": "0",
        "total-files-size": "859",
        "total-position-deletes": "0",
        "total-records": "2",  # wrong should be 4
    }
    # Ground truth from the table itself: 3 data files remain, 4 rows survive the delete.
    assert len(tbl.inspect.data_files()) == 3
    assert len(tbl.scan().to_pandas()) == 4
333+
334+
265335
@pytest.mark.integration
266336
def test_data_files(spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table) -> None:
267337
identifier = "default.arrow_data_files"

0 commit comments

Comments (0)