Skip to content

Commit 3fb92aa

Browse files
Fokko and Sreesh Maheshwar
authored
Partition statistics metadata reading (#2146)
# Rationale for this change Took @smaheshwar-pltr's draft PR and added a test: #2033. Cherry-picked his original work to ensure it is attributed to the original author. # Are these changes tested? # Are there any user-facing changes? <!-- In the case of user-facing changes, please add the changelog label. --> --------- Co-authored-by: Sreesh Maheshwar <[email protected]>
1 parent bbb1c25 commit 3fb92aa

File tree

4 files changed

+75
-4
lines changed

4 files changed

+75
-4
lines changed

pyiceberg/table/metadata.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
SortOrder,
3737
assign_fresh_sort_order_ids,
3838
)
39-
from pyiceberg.table.statistics import StatisticsFile
39+
from pyiceberg.table.statistics import PartitionStatisticsFile, StatisticsFile
4040
from pyiceberg.typedef import (
4141
EMPTY_DICT,
4242
IcebergBaseModel,
@@ -222,6 +222,14 @@ class TableMetadataCommonFields(IcebergBaseModel):
222222
table correctly. A table can contain many statistics files
223223
associated with different table snapshots."""
224224

225+
partition_statistics: List[PartitionStatisticsFile] = Field(alias="partition-statistics", default_factory=list)
226+
"""An optional list of partition statistics files.
227+
Partition statistics are not required for reading or planning
228+
and readers may ignore them. Each table snapshot may be associated
229+
with at most one partition statistics file. A writer can optionally
230+
write the partition statistics file during each write operation,
231+
or it can also be computed on demand."""
232+
225233
# validators
226234
@field_validator("properties", mode="before")
227235
def transform_properties_dict_value_to_str(cls, properties: Properties) -> Dict[str, str]:

pyiceberg/table/statistics.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,15 +29,24 @@ class BlobMetadata(IcebergBaseModel):
2929
properties: Optional[Dict[str, str]] = None
3030

3131

32-
class StatisticsFile(IcebergBaseModel):
32+
class StatisticsCommonFields(IcebergBaseModel):
33+
"""Common fields between table and partition statistics structs found on metadata."""
34+
3335
snapshot_id: int = Field(alias="snapshot-id")
3436
statistics_path: str = Field(alias="statistics-path")
3537
file_size_in_bytes: int = Field(alias="file-size-in-bytes")
38+
39+
40+
class StatisticsFile(StatisticsCommonFields):
3641
file_footer_size_in_bytes: int = Field(alias="file-footer-size-in-bytes")
3742
key_metadata: Optional[str] = Field(alias="key-metadata", default=None)
3843
blob_metadata: List[BlobMetadata] = Field(alias="blob-metadata")
3944

4045

46+
class PartitionStatisticsFile(StatisticsCommonFields):
47+
pass
48+
49+
4150
def filter_statistics_by_snapshot_id(
4251
statistics: List[StatisticsFile],
4352
reject_snapshot_id: int,

tests/table/test_metadata.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -173,13 +173,13 @@ def test_updating_metadata(example_table_metadata_v2: Dict[str, Any]) -> None:
173173
def test_serialize_v1(example_table_metadata_v1: Dict[str, Any]) -> None:
174174
table_metadata = TableMetadataV1(**example_table_metadata_v1)
175175
table_metadata_json = table_metadata.model_dump_json()
176-
expected = """{"location":"s3://bucket/test/location","table-uuid":"d20125c8-7284-442c-9aea-15fee620737c","last-updated-ms":1602638573874,"last-column-id":3,"schemas":[{"type":"struct","fields":[{"id":1,"name":"x","type":"long","required":true},{"id":2,"name":"y","type":"long","required":true,"doc":"comment"},{"id":3,"name":"z","type":"long","required":true}],"schema-id":0,"identifier-field-ids":[]}],"current-schema-id":0,"partition-specs":[{"spec-id":0,"fields":[{"source-id":1,"field-id":1000,"transform":"identity","name":"x"}]}],"default-spec-id":0,"last-partition-id":1000,"properties":{},"snapshots":[{"snapshot-id":1925,"timestamp-ms":1602638573822,"manifest-list":"s3://bucket/test/manifest-list"}],"snapshot-log":[],"metadata-log":[],"sort-orders":[{"order-id":0,"fields":[]}],"default-sort-order-id":0,"refs":{},"statistics":[],"format-version":1,"schema":{"type":"struct","fields":[{"id":1,"name":"x","type":"long","required":true},{"id":2,"name":"y","type":"long","required":true,"doc":"comment"},{"id":3,"name":"z","type":"long","required":true}],"schema-id":0,"identifier-field-ids":[]},"partition-spec":[{"name":"x","transform":"identity","source-id":1,"field-id":1000}]}"""
176+
expected = """{"location":"s3://bucket/test/location","table-uuid":"d20125c8-7284-442c-9aea-15fee620737c","last-updated-ms":1602638573874,"last-column-id":3,"schemas":[{"type":"struct","fields":[{"id":1,"name":"x","type":"long","required":true},{"id":2,"name":"y","type":"long","required":true,"doc":"comment"},{"id":3,"name":"z","type":"long","required":true}],"schema-id":0,"identifier-field-ids":[]}],"current-schema-id":0,"partition-specs":[{"spec-id":0,"fields":[{"source-id":1,"field-id":1000,"transform":"identity","name":"x"}]}],"default-spec-id":0,"last-partition-id":1000,"properties":{},"snapshots":[{"snapshot-id":1925,"timestamp-ms":1602638573822,"manifest-list":"s3://bucket/test/manifest-list"}],"snapshot-log":[],"metadata-log":[],"sort-orders":[{"order-id":0,"fields":[]}],"default-sort-order-id":0,"refs":{},"statistics":[],"partition-statistics":[],"format-version":1,"schema":{"type":"struct","fields":[{"id":1,"name":"x","type":"long","required":true},{"id":2,"name":"y","type":"long","required":true,"doc":"comment"},{"id":3,"name":"z","type":"long","required":true}],"schema-id":0,"identifier-field-ids":[]},"partition-spec":[{"name":"x","transform":"identity","source-id":1,"field-id":1000}]}"""
177177
assert table_metadata_json == expected
178178

179179

180180
def test_serialize_v2(example_table_metadata_v2: Dict[str, Any]) -> None:
181181
table_metadata = TableMetadataV2(**example_table_metadata_v2).model_dump_json()
182-
expected = """{"location":"s3://bucket/test/location","table-uuid":"9c12d441-03fe-4693-9a96-a0705ddf69c1","last-updated-ms":1602638573590,"last-column-id":3,"schemas":[{"type":"struct","fields":[{"id":1,"name":"x","type":"long","required":true}],"schema-id":0,"identifier-field-ids":[]},{"type":"struct","fields":[{"id":1,"name":"x","type":"long","required":true},{"id":2,"name":"y","type":"long","required":true,"doc":"comment"},{"id":3,"name":"z","type":"long","required":true}],"schema-id":1,"identifier-field-ids":[1,2]}],"current-schema-id":1,"partition-specs":[{"spec-id":0,"fields":[{"source-id":1,"field-id":1000,"transform":"identity","name":"x"}]}],"default-spec-id":0,"last-partition-id":1000,"properties":{"read.split.target.size":"134217728"},"current-snapshot-id":3055729675574597004,"snapshots":[{"snapshot-id":3051729675574597004,"sequence-number":0,"timestamp-ms":1515100955770,"manifest-list":"s3://a/b/1.avro","summary":{"operation":"append"}},{"snapshot-id":3055729675574597004,"parent-snapshot-id":3051729675574597004,"sequence-number":1,"timestamp-ms":1555100955770,"manifest-list":"s3://a/b/2.avro","summary":{"operation":"append"},"schema-id":1}],"snapshot-log":[{"snapshot-id":3051729675574597004,"timestamp-ms":1515100955770},{"snapshot-id":3055729675574597004,"timestamp-ms":1555100955770}],"metadata-log":[{"metadata-file":"s3://bucket/.../v1.json","timestamp-ms":1515100}],"sort-orders":[{"order-id":3,"fields":[{"source-id":2,"transform":"identity","direction":"asc","null-order":"nulls-first"},{"source-id":3,"transform":"bucket[4]","direction":"desc","null-order":"nulls-last"}]}],"default-sort-order-id":3,"refs":{"test":{"snapshot-id":3051729675574597004,"type":"tag","max-ref-age-ms":10000000},"main":{"snapshot-id":3055729675574597004,"type":"branch"}},"statistics":[],"format-version":2,"last-sequence-number":34}"""
182+
expected = """{"location":"s3://bucket/test/location","table-uuid":"9c12d441-03fe-4693-9a96-a0705ddf69c1","last-updated-ms":1602638573590,"last-column-id":3,"schemas":[{"type":"struct","fields":[{"id":1,"name":"x","type":"long","required":true}],"schema-id":0,"identifier-field-ids":[]},{"type":"struct","fields":[{"id":1,"name":"x","type":"long","required":true},{"id":2,"name":"y","type":"long","required":true,"doc":"comment"},{"id":3,"name":"z","type":"long","required":true}],"schema-id":1,"identifier-field-ids":[1,2]}],"current-schema-id":1,"partition-specs":[{"spec-id":0,"fields":[{"source-id":1,"field-id":1000,"transform":"identity","name":"x"}]}],"default-spec-id":0,"last-partition-id":1000,"properties":{"read.split.target.size":"134217728"},"current-snapshot-id":3055729675574597004,"snapshots":[{"snapshot-id":3051729675574597004,"sequence-number":0,"timestamp-ms":1515100955770,"manifest-list":"s3://a/b/1.avro","summary":{"operation":"append"}},{"snapshot-id":3055729675574597004,"parent-snapshot-id":3051729675574597004,"sequence-number":1,"timestamp-ms":1555100955770,"manifest-list":"s3://a/b/2.avro","summary":{"operation":"append"},"schema-id":1}],"snapshot-log":[{"snapshot-id":3051729675574597004,"timestamp-ms":1515100955770},{"snapshot-id":3055729675574597004,"timestamp-ms":1555100955770}],"metadata-log":[{"metadata-file":"s3://bucket/.../v1.json","timestamp-ms":1515100}],"sort-orders":[{"order-id":3,"fields":[{"source-id":2,"transform":"identity","direction":"asc","null-order":"nulls-first"},{"source-id":3,"transform":"bucket[4]","direction":"desc","null-order":"nulls-last"}]}],"default-sort-order-id":3,"refs":{"test":{"snapshot-id":3051729675574597004,"type":"tag","max-ref-age-ms":10000000},"main":{"snapshot-id":3055729675574597004,"type":"branch"}},"statistics":[],"partition-statistics":[],"format-version":2,"last-sequence-number":34}"""
183183
assert table_metadata == expected
184184

185185

tests/table/test_statistics.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
from pyiceberg.table.statistics import BlobMetadata, PartitionStatisticsFile, StatisticsFile
18+
19+
20+
def test_partition_statistics_file() -> None:
21+
partition_statistics_file_json = (
22+
"""{"snapshot-id":123,"statistics-path":"s3://bucket/statistics.parquet","file-size-in-bytes":345}"""
23+
)
24+
partition_statistics_file = PartitionStatisticsFile.model_validate_json(partition_statistics_file_json)
25+
26+
assert partition_statistics_file == PartitionStatisticsFile(
27+
snapshot_id=123, statistics_path="s3://bucket/statistics.parquet", file_size_in_bytes=345
28+
)
29+
30+
assert partition_statistics_file.model_dump_json() == partition_statistics_file_json
31+
32+
33+
def test_statistics_file() -> None:
34+
statistics_file_json = """{"snapshot-id":123,"statistics-path":"s3://bucket/statistics.parquet","file-size-in-bytes":345,"file-footer-size-in-bytes":456,"blob-metadata":[{"type":"apache-datasketches-theta-v1","snapshot-id":567,"sequence-number":22,"fields":[1,2,3],"properties":{"foo":"bar"}}]}"""
35+
statistics_file = StatisticsFile.model_validate_json(statistics_file_json)
36+
37+
assert statistics_file == StatisticsFile(
38+
snapshot_id=123,
39+
statistics_path="s3://bucket/statistics.parquet",
40+
file_size_in_bytes=345,
41+
file_footer_size_in_bytes=456,
42+
key_metadata=None,
43+
blob_metadata=[
44+
BlobMetadata(
45+
type="apache-datasketches-theta-v1",
46+
snapshot_id=567,
47+
sequence_number=22,
48+
fields=[1, 2, 3],
49+
properties={"foo": "bar"},
50+
)
51+
],
52+
)
53+
54+
assert statistics_file.model_dump_json() == statistics_file_json

0 commit comments

Comments
 (0)