Skip to content

Commit 382a548

Browse files
committed
Avro: Fix tests and add missing content header
While working on #2004 I've noticed some small discrepancies that I think would be good to address in a separate PR.
1 parent 904c0b7 commit 382a548

File tree

3 files changed

+36
-14
lines changed

3 files changed

+36
-14
lines changed

pyiceberg/manifest.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1279,6 +1279,7 @@ def __init__(
12791279
"parent-snapshot-id": str(parent_snapshot_id) if parent_snapshot_id is not None else "null",
12801280
"sequence-number": str(sequence_number),
12811281
"format-version": "2",
1282+
"content": "data",
12821283
AVRO_CODEC_KEY: compression,
12831284
},
12841285
)

tests/conftest.py

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@
4747
import boto3
4848
import pytest
4949
from moto import mock_aws
50+
from pydantic_core import to_json
5051

5152
from pyiceberg.catalog import Catalog, load_catalog
5253
from pyiceberg.catalog.noop import NoopCatalog
@@ -67,10 +68,12 @@
6768
)
6869
from pyiceberg.io.fsspec import FsspecFileIO
6970
from pyiceberg.manifest import DataFile, FileFormat
71+
from pyiceberg.partitioning import PartitionField, PartitionSpec
7072
from pyiceberg.schema import Accessor, Schema
7173
from pyiceberg.serializers import ToOutputFile
7274
from pyiceberg.table import FileScanTask, Table
7375
from pyiceberg.table.metadata import TableMetadataV1, TableMetadataV2
76+
from pyiceberg.transforms import DayTransform, IdentityTransform
7477
from pyiceberg.types import (
7578
BinaryType,
7679
BooleanType,
@@ -1858,15 +1861,40 @@ def simple_map() -> MapType:
18581861

18591862

18601863
@pytest.fixture(scope="session")
1861-
def generated_manifest_entry_file(avro_schema_manifest_entry: Dict[str, Any]) -> Generator[str, None, None]:
1864+
def test_schema() -> Schema:
1865+
return Schema(
1866+
NestedField(1, "VendorID", IntegerType(), False), NestedField(2, "tpep_pickup_datetime", TimestampType(), False)
1867+
)
1868+
1869+
1870+
@pytest.fixture(scope="session")
1871+
def test_partition_spec() -> Schema:
1872+
return PartitionSpec(
1873+
PartitionField(1, 1000, IdentityTransform(), "VendorID"),
1874+
PartitionField(2, 1001, DayTransform(), "tpep_pickup_datetime"),
1875+
)
1876+
1877+
1878+
@pytest.fixture(scope="session")
1879+
def generated_manifest_entry_file(
1880+
avro_schema_manifest_entry: Dict[str, Any], test_schema: Schema, test_partition_spec: PartitionSpec
1881+
) -> Generator[str, None, None]:
18621882
from fastavro import parse_schema, writer
18631883

18641884
parsed_schema = parse_schema(avro_schema_manifest_entry)
18651885

18661886
with TemporaryDirectory() as tmpdir:
18671887
tmp_avro_file = tmpdir + "/manifest.avro"
18681888
with open(tmp_avro_file, "wb") as out:
1869-
writer(out, parsed_schema, manifest_entry_records)
1889+
writer(
1890+
out,
1891+
parsed_schema,
1892+
manifest_entry_records,
1893+
metadata={
1894+
"schema": test_schema.model_dump_json(),
1895+
"partition-spec": to_json(test_partition_spec.fields).decode("utf-8"),
1896+
},
1897+
)
18701898
yield tmp_avro_file
18711899

18721900

tests/utils/test_manifest.py

Lines changed: 5 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -38,10 +38,9 @@
3838
write_manifest,
3939
write_manifest_list,
4040
)
41-
from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionField, PartitionSpec
41+
from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec
4242
from pyiceberg.schema import Schema
4343
from pyiceberg.table.snapshots import Operation, Snapshot, Summary
44-
from pyiceberg.transforms import IdentityTransform
4544
from pyiceberg.typedef import Record, TableVersion
4645
from pyiceberg.types import IntegerType, NestedField
4746

@@ -363,6 +362,8 @@ def test_write_manifest(
363362
generated_manifest_file_file_v1: str,
364363
generated_manifest_file_file_v2: str,
365364
format_version: TableVersion,
365+
test_schema: Schema,
366+
test_partition_spec: PartitionSpec,
366367
compression: AvroCompressionCodec,
367368
) -> None:
368369
io = load_file_io()
@@ -376,20 +377,12 @@ def test_write_manifest(
376377
)
377378
demo_manifest_file = snapshot.manifests(io)[0]
378379
manifest_entries = demo_manifest_file.fetch_manifest_entry(io)
379-
test_schema = Schema(
380-
NestedField(1, "VendorID", IntegerType(), False), NestedField(2, "tpep_pickup_datetime", IntegerType(), False)
381-
)
382-
test_spec = PartitionSpec(
383-
PartitionField(source_id=1, field_id=1, transform=IdentityTransform(), name="VendorID"),
384-
PartitionField(source_id=2, field_id=2, transform=IdentityTransform(), name="tpep_pickup_datetime"),
385-
spec_id=demo_manifest_file.partition_spec_id,
386-
)
387380
with TemporaryDirectory() as tmpdir:
388381
tmp_avro_file = tmpdir + "/test_write_manifest.avro"
389382
output = io.new_output(tmp_avro_file)
390383
with write_manifest(
391384
format_version=format_version,
392-
spec=test_spec,
385+
spec=test_partition_spec,
393386
schema=test_schema,
394387
output_file=output,
395388
snapshot_id=8744736658442914487,
@@ -404,7 +397,7 @@ def test_write_manifest(
404397

405398
expected_metadata = {
406399
"schema": test_schema.model_dump_json(),
407-
"partition-spec": """[{"source-id":1,"field-id":1,"transform":"identity","name":"VendorID"},{"source-id":2,"field-id":2,"transform":"identity","name":"tpep_pickup_datetime"}]""",
400+
"partition-spec": """[{"source-id":1,"field-id":1000,"transform":"identity","name":"VendorID"},{"source-id":2,"field-id":1001,"transform":"day","name":"tpep_pickup_datetime"}]""",
408401
"partition-spec-id": str(demo_manifest_file.partition_spec_id),
409402
"format-version": str(format_version),
410403
}

0 commit comments

Comments
 (0)