@@ -262,6 +262,76 @@ def test_summaries(spark: SparkSession, session_catalog: Catalog, arrow_table_wi
     }


+@pytest.mark.integration
+def test_summaries_partial_overwrite(spark: SparkSession, session_catalog: Catalog) -> None:
+    identifier = "default.test_summaries_partial_overwrite"
+    TEST_DATA = {
+        "id": [1, 2, 3, 1, 1],
+        "name": ["AB", "CD", "EF", "CD", "EF"],
+    }
+    pa_schema = pa.schema(
+        [
+            pa.field("id", pa.dictionary(pa.int32(), pa.int32(), False)),
+            pa.field("name", pa.dictionary(pa.int32(), pa.string(), False)),
+        ]
+    )
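+    # Both columns use dictionary-encoded Arrow types; Iceberg has no dictionary
+    # type, so the values are stored as their underlying int32/string types.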
+    arrow_table = pa.Table.from_pydict(TEST_DATA, schema=pa_schema)
+    tbl = _create_table(session_catalog, identifier, {"format-version": "2"}, schema=pa_schema)
+    with tbl.update_spec() as txn:
+        txn.add_identity("id")  # partition by `id` to create 3 data files
+    tbl.append(arrow_table)  # append
+    tbl.delete(delete_filter="id == 1 and name = 'AB'")  # partial overwrite data from 1 data file
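+    # The filter matches only 1 of the 3 rows in the id=1 partition file. The table
+    # has no delete files (see `total-delete-files` below), so the delete rewrites
+    # that file, and the second snapshot is committed as an "overwrite".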
+
+    rows = spark.sql(
+        f"""
+        SELECT operation, summary
+        FROM {identifier}.snapshots
+        ORDER BY committed_at ASC
+        """
+    ).collect()
+
+    operations = [row.operation for row in rows]
+    assert operations == ["append", "overwrite"]
+
+    summaries = [row.summary for row in rows]
+
+    file_size = int(summaries[0]["added-files-size"])
+    assert file_size > 0
+
+    # APPEND
+    assert summaries[0] == {
+        "added-data-files": "3",
+        "added-files-size": "2848",
+        "added-records": "5",
+        "changed-partition-count": "3",
+        "total-data-files": "3",
+        "total-delete-files": "0",
+        "total-equality-deletes": "0",
+        "total-files-size": "2848",
+        "total-position-deletes": "0",
+        "total-records": "5",
+    }
+    # BUG: the `deleted-data-files` property is being replaced by the previous summary's `total-data-files` value
+    # OVERWRITE from tbl.delete
+    assert summaries[1] == {
+        "added-data-files": "1",
+        "added-files-size": "859",
+        "added-records": "2",  # wrong, should be 0
+        "changed-partition-count": "1",
+        "deleted-data-files": "3",  # wrong, should be 1
+        "deleted-records": "5",  # wrong, should be 1
+        "removed-files-size": "2848",
+        "total-data-files": "1",  # wrong, should be 3
+        "total-delete-files": "0",
+        "total-equality-deletes": "0",
+        "total-files-size": "859",
+        "total-position-deletes": "0",
+        "total-records": "2",  # wrong, should be 4
+    }
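+    # The table data itself is correct: 3 data files remain and 4 of the 5 rows
+    # survive the delete, so only the snapshot summary metadata is off.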
+    assert len(tbl.inspect.data_files()) == 3
+    assert len(tbl.scan().to_pandas()) == 4
+
+
 @pytest.mark.integration
 def test_data_files(spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table) -> None:
     identifier = "default.arrow_data_files"