Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 38 additions & 10 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
import boto3
import pytest
from moto import mock_aws
from pydantic_core import to_json

from pyiceberg.catalog import Catalog, load_catalog
from pyiceberg.catalog.noop import NoopCatalog
Expand All @@ -67,10 +68,12 @@
)
from pyiceberg.io.fsspec import FsspecFileIO
from pyiceberg.manifest import DataFile, FileFormat
from pyiceberg.partitioning import PartitionField, PartitionSpec
from pyiceberg.schema import Accessor, Schema
from pyiceberg.serializers import ToOutputFile
from pyiceberg.table import FileScanTask, Table
from pyiceberg.table.metadata import TableMetadataV1, TableMetadataV2
from pyiceberg.transforms import DayTransform, IdentityTransform
from pyiceberg.types import (
BinaryType,
BooleanType,
Expand Down Expand Up @@ -1255,8 +1258,8 @@ def metadata_location_gz(tmp_path_factory: pytest.TempPathFactory) -> str:
{"key": 15, "value": 0},
],
"lower_bounds": [
{"key": 2, "value": b"2020-04-01 00:00"},
{"key": 3, "value": b"2020-04-01 00:12"},
{"key": 2, "value": b"\x01\x00\x00\x00\x00\x00\x00\x00"},
{"key": 3, "value": b"\x01\x00\x00\x00\x00\x00\x00\x00"},
{"key": 7, "value": b"\x03\x00\x00\x00"},
{"key": 8, "value": b"\x01\x00\x00\x00"},
{"key": 10, "value": b"\xf6(\\\x8f\xc2\x05S\xc0"},
Expand All @@ -1270,8 +1273,8 @@ def metadata_location_gz(tmp_path_factory: pytest.TempPathFactory) -> str:
{"key": 19, "value": b"\x00\x00\x00\x00\x00\x00\x04\xc0"},
],
"upper_bounds": [
{"key": 2, "value": b"2020-04-30 23:5:"},
{"key": 3, "value": b"2020-05-01 00:41"},
{"key": 2, "value": b"\x06\x00\x00\x00\x00\x00\x00\x00"},
{"key": 3, "value": b"\x06\x00\x00\x00\x00\x00\x00\x00"},
{"key": 7, "value": b"\t\x01\x00\x00"},
{"key": 8, "value": b"\t\x01\x00\x00"},
{"key": 10, "value": b"\xcd\xcc\xcc\xcc\xcc,_@"},
Expand Down Expand Up @@ -1376,8 +1379,8 @@ def metadata_location_gz(tmp_path_factory: pytest.TempPathFactory) -> str:
],
"lower_bounds": [
{"key": 1, "value": b"\x01\x00\x00\x00"},
{"key": 2, "value": b"2020-04-01 00:00"},
{"key": 3, "value": b"2020-04-01 00:03"},
{"key": 2, "value": b"\x01\x00\x00\x00\x00\x00\x00\x00"},
{"key": 3, "value": b"\x01\x00\x00\x00\x00\x00\x00\x00"},
{"key": 4, "value": b"\x00\x00\x00\x00"},
{"key": 5, "value": b"\x01\x00\x00\x00"},
{"key": 6, "value": b"N"},
Expand All @@ -1396,8 +1399,8 @@ def metadata_location_gz(tmp_path_factory: pytest.TempPathFactory) -> str:
],
"upper_bounds": [
{"key": 1, "value": b"\x01\x00\x00\x00"},
{"key": 2, "value": b"2020-04-30 23:5:"},
{"key": 3, "value": b"2020-05-01 00:1:"},
{"key": 2, "value": b"\x06\x00\x00\x00\x00\x00\x00\x00"},
{"key": 3, "value": b"\x06\x00\x00\x00\x00\x00\x00\x00"},
{"key": 4, "value": b"\x06\x00\x00\x00"},
{"key": 5, "value": b"c\x00\x00\x00"},
{"key": 6, "value": b"Y"},
Expand Down Expand Up @@ -1858,15 +1861,40 @@ def simple_map() -> MapType:


@pytest.fixture(scope="session")
def generated_manifest_entry_file(avro_schema_manifest_entry: Dict[str, Any]) -> Generator[str, None, None]:
def test_schema() -> Schema:
return Schema(
NestedField(1, "VendorID", IntegerType(), False), NestedField(2, "tpep_pickup_datetime", TimestampType(), False)
)


@pytest.fixture(scope="session")
def test_partition_spec() -> Schema:
return PartitionSpec(
PartitionField(1, 1000, IdentityTransform(), "VendorID"),
PartitionField(2, 1001, DayTransform(), "tpep_pickup_day"),
)


@pytest.fixture(scope="session")
def generated_manifest_entry_file(
avro_schema_manifest_entry: Dict[str, Any], test_schema: Schema, test_partition_spec: PartitionSpec
) -> Generator[str, None, None]:
from fastavro import parse_schema, writer

parsed_schema = parse_schema(avro_schema_manifest_entry)

with TemporaryDirectory() as tmpdir:
tmp_avro_file = tmpdir + "/manifest.avro"
with open(tmp_avro_file, "wb") as out:
writer(out, parsed_schema, manifest_entry_records)
writer(
out,
parsed_schema,
manifest_entry_records,
metadata={
"schema": test_schema.model_dump_json(),
"partition-spec": to_json(test_partition_spec.fields).decode("utf-8"),
},
)
yield tmp_avro_file


Expand Down
33 changes: 13 additions & 20 deletions tests/utils/test_manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,9 @@
write_manifest,
write_manifest_list,
)
from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionField, PartitionSpec
from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec
from pyiceberg.schema import Schema
from pyiceberg.table.snapshots import Operation, Snapshot, Summary
from pyiceberg.transforms import IdentityTransform
from pyiceberg.typedef import Record, TableVersion
from pyiceberg.types import IntegerType, NestedField

Expand Down Expand Up @@ -154,8 +153,8 @@ def test_read_manifest_entry(generated_manifest_entry_file: str) -> None:
}
assert data_file.nan_value_counts == {16: 0, 17: 0, 18: 0, 19: 0, 10: 0, 11: 0, 12: 0, 13: 0, 14: 0, 15: 0}
assert data_file.lower_bounds == {
2: b"2020-04-01 00:00",
3: b"2020-04-01 00:12",
2: b"\x01\x00\x00\x00\x00\x00\x00\x00",
3: b"\x01\x00\x00\x00\x00\x00\x00\x00",
7: b"\x03\x00\x00\x00",
8: b"\x01\x00\x00\x00",
10: b"\xf6(\\\x8f\xc2\x05S\xc0",
Expand All @@ -169,8 +168,8 @@ def test_read_manifest_entry(generated_manifest_entry_file: str) -> None:
19: b"\x00\x00\x00\x00\x00\x00\x04\xc0",
}
assert data_file.upper_bounds == {
2: b"2020-04-30 23:5:",
3: b"2020-05-01 00:41",
2: b"\x06\x00\x00\x00\x00\x00\x00\x00",
3: b"\x06\x00\x00\x00\x00\x00\x00\x00",
7: b"\t\x01\x00\x00",
8: b"\t\x01\x00\x00",
10: b"\xcd\xcc\xcc\xcc\xcc,_@",
Expand Down Expand Up @@ -363,6 +362,8 @@ def test_write_manifest(
generated_manifest_file_file_v1: str,
generated_manifest_file_file_v2: str,
format_version: TableVersion,
test_schema: Schema,
test_partition_spec: PartitionSpec,
compression: AvroCompressionCodec,
) -> None:
io = load_file_io()
Expand All @@ -376,20 +377,12 @@ def test_write_manifest(
)
demo_manifest_file = snapshot.manifests(io)[0]
manifest_entries = demo_manifest_file.fetch_manifest_entry(io)
test_schema = Schema(
NestedField(1, "VendorID", IntegerType(), False), NestedField(2, "tpep_pickup_datetime", IntegerType(), False)
)
test_spec = PartitionSpec(
PartitionField(source_id=1, field_id=1, transform=IdentityTransform(), name="VendorID"),
PartitionField(source_id=2, field_id=2, transform=IdentityTransform(), name="tpep_pickup_datetime"),
spec_id=demo_manifest_file.partition_spec_id,
)
with TemporaryDirectory() as tmpdir:
tmp_avro_file = tmpdir + "/test_write_manifest.avro"
output = io.new_output(tmp_avro_file)
with write_manifest(
format_version=format_version,
spec=test_spec,
spec=test_partition_spec,
schema=test_schema,
output_file=output,
snapshot_id=8744736658442914487,
Expand All @@ -404,7 +397,7 @@ def test_write_manifest(

expected_metadata = {
"schema": test_schema.model_dump_json(),
"partition-spec": """[{"source-id":1,"field-id":1,"transform":"identity","name":"VendorID"},{"source-id":2,"field-id":2,"transform":"identity","name":"tpep_pickup_datetime"}]""",
"partition-spec": """[{"source-id":1,"field-id":1000,"transform":"identity","name":"VendorID"},{"source-id":2,"field-id":1001,"transform":"day","name":"tpep_pickup_day"}]""",
"partition-spec-id": str(demo_manifest_file.partition_spec_id),
"format-version": str(format_version),
}
Expand Down Expand Up @@ -497,8 +490,8 @@ def test_write_manifest(
}
assert data_file.nan_value_counts == {16: 0, 17: 0, 18: 0, 19: 0, 10: 0, 11: 0, 12: 0, 13: 0, 14: 0, 15: 0}
assert data_file.lower_bounds == {
2: b"2020-04-01 00:00",
3: b"2020-04-01 00:12",
2: b"\x01\x00\x00\x00\x00\x00\x00\x00",
3: b"\x01\x00\x00\x00\x00\x00\x00\x00",
7: b"\x03\x00\x00\x00",
8: b"\x01\x00\x00\x00",
10: b"\xf6(\\\x8f\xc2\x05S\xc0",
Expand All @@ -512,8 +505,8 @@ def test_write_manifest(
19: b"\x00\x00\x00\x00\x00\x00\x04\xc0",
}
assert data_file.upper_bounds == {
2: b"2020-04-30 23:5:",
3: b"2020-05-01 00:41",
2: b"\x06\x00\x00\x00\x00\x00\x00\x00",
3: b"\x06\x00\x00\x00\x00\x00\x00\x00",
7: b"\t\x01\x00\x00",
8: b"\t\x01\x00\x00",
10: b"\xcd\xcc\xcc\xcc\xcc,_@",
Expand Down