apache · dingo4dev · May 14, 2025 · May 17, 2025 · May 17, 2025
diff --git a/pyiceberg/avro/writer.py b/pyiceberg/avro/writer.py
@@ -32,6 +32,7 @@
     List,
     Optional,
     Tuple,
+    Union,
 )
 from uuid import UUID
 
@@ -121,8 +122,22 @@ def write(self, encoder: BinaryEncoder, val: Any) -> None:
 
 @dataclass(frozen=True)
 class UUIDWriter(Writer):
-    def write(self, encoder: BinaryEncoder, val: UUID) -> None:
-        encoder.write(val.bytes)
+    def write(self, encoder: BinaryEncoder, val: Union[UUID, str, bytes]) -> None:
+        if isinstance(val, UUID):
+            encoder.write(val.bytes)
+        elif isinstance(val, bytes):
+            encoder.write(val)
+        elif isinstance(val, str):
+            if val.startswith("b'") and val.endswith("'"):
+                # Handle string representation of bytes
+                # Convert the escaped string to actual bytes
+                byte_string = val[2:-1].encode("utf-8").decode("unicode_escape").encode("latin1")
+                encoder.write(UUID(bytes=byte_string).bytes)
+            else:
+                # Regular UUID string
+                encoder.write(UUID(val).bytes)
+        else:
+            raise TypeError(f"Expected UUID, bytes, or string, got {type(val)}")
 
 
 @dataclass(frozen=True)

diff --git a/pyiceberg/conversions.py b/pyiceberg/conversions.py
@@ -267,10 +267,13 @@ def _(_: StringType, value: str) -> bytes:
 
 
 @to_bytes.register(UUIDType)
-def _(_: UUIDType, value: Union[uuid.UUID, bytes]) -> bytes:
+def _(_: UUIDType, value: Union[uuid.UUID, bytes, str]) -> bytes:
     if isinstance(value, bytes):
-        return value
-    return value.bytes
+        return str(uuid.UUID(bytes=value)).encode(UTF8)
+    elif isinstance(value, uuid.UUID):
+        return str(value).encode(UTF8)
+    else:
+        return str(uuid.UUID(value)).encode(UTF8)
 
 
 @to_bytes.register(BinaryType)
@@ -355,11 +358,15 @@ def _(_: StringType, b: bytes) -> str:
 
 @from_bytes.register(BinaryType)
 @from_bytes.register(FixedType)
-@from_bytes.register(UUIDType)
 def _(_: PrimitiveType, b: bytes) -> bytes:
     return b
 
 
+@from_bytes.register(UUIDType)
+def _(_: UUIDType, b: bytes) -> uuid.UUID:
+    return uuid.UUID(b.decode(UTF8))
+
+
 @from_bytes.register(DecimalType)
 def _(primitive_type: DecimalType, buf: bytes) -> Decimal:
     unscaled = int.from_bytes(buf, "big", signed=True)

diff --git a/pyiceberg/expressions/literals.py b/pyiceberg/expressions/literals.py
@@ -144,7 +144,7 @@ def literal(value: L) -> Literal[L]:
     elif isinstance(value, str):
         return StringLiteral(value)
     elif isinstance(value, UUID):
-        return UUIDLiteral(value.bytes)  # type: ignore
+        return UUIDLiteral(UUID(bytes=value.bytes))
     elif isinstance(value, bytes):
         return BinaryLiteral(value)
     elif isinstance(value, Decimal):
@@ -586,8 +586,8 @@ def _(self, _: TimestamptzType) -> Literal[int]:
         return TimestampLiteral(timestamptz_to_micros(self.value))
 
     @to.register(UUIDType)
-    def _(self, _: UUIDType) -> Literal[bytes]:
-        return UUIDLiteral(UUID(self.value).bytes)
+    def _(self, _: UUIDType) -> Literal[UUID]:
+        return UUIDLiteral(UUID(self.value))
 
     @to.register(DecimalType)
     def _(self, type_var: DecimalType) -> Literal[Decimal]:
@@ -631,30 +631,30 @@ def __repr__(self) -> str:
         return f"literal({repr(self.value)})"
 
 
-class UUIDLiteral(Literal[bytes]):
-    def __init__(self, value: bytes) -> None:
-        super().__init__(value, bytes)
+class UUIDLiteral(Literal[UUID]):
+    def __init__(self, value: UUID) -> None:
+        super().__init__(value, UUID)
 
     @singledispatchmethod
     def to(self, type_var: IcebergType) -> Literal:  # type: ignore
         raise TypeError(f"Cannot convert UUIDLiteral into {type_var}")
 
     @to.register(UUIDType)
-    def _(self, _: UUIDType) -> Literal[bytes]:
+    def _(self, _: UUIDType) -> Literal[UUID]:
         return self
 
     @to.register(FixedType)
     def _(self, type_var: FixedType) -> Literal[bytes]:
         if len(type_var) == UUID_BYTES_LENGTH:
-            return FixedLiteral(self.value)
+            return FixedLiteral(self.value.bytes)
         else:
             raise TypeError(
                 f"Cannot convert UUIDLiteral into {type_var}, different length: {len(type_var)} <> {UUID_BYTES_LENGTH}"
             )
 
     @to.register(BinaryType)
     def _(self, _: BinaryType) -> Literal[bytes]:
-        return BinaryLiteral(self.value)
+        return BinaryLiteral(self.value.bytes)
 
 
 class FixedLiteral(Literal[bytes]):
@@ -679,9 +679,9 @@ def _(self, _: BinaryType) -> Literal[bytes]:
         return BinaryLiteral(self.value)
 
     @to.register(UUIDType)
-    def _(self, type_var: UUIDType) -> Literal[bytes]:
+    def _(self, type_var: UUIDType) -> Literal[UUID]:
         if len(self.value) == UUID_BYTES_LENGTH:
-            return UUIDLiteral(self.value)
+            return UUIDLiteral(UUID(bytes=self.value))
         else:
             raise TypeError(
                 f"Could not convert {self.value!r} into a {type_var}, lengths differ {len(self.value)} <> {UUID_BYTES_LENGTH}"
@@ -710,9 +710,9 @@ def _(self, type_var: FixedType) -> Literal[bytes]:
             )
 
     @to.register(UUIDType)
-    def _(self, type_var: UUIDType) -> Literal[bytes]:
+    def _(self, type_var: UUIDType) -> Literal[UUID]:
         if len(self.value) == UUID_BYTES_LENGTH:
-            return UUIDLiteral(self.value)
+            return UUIDLiteral(UUID(bytes=self.value))
         else:
             raise TypeError(
                 f"Cannot convert BinaryLiteral into {type_var}, different length: {UUID_BYTES_LENGTH} <> {len(self.value)}"

diff --git a/pyiceberg/partitioning.py b/pyiceberg/partitioning.py
@@ -467,8 +467,17 @@ def _(type: IcebergType, value: Optional[time]) -> Optional[int]:
 
 
 @_to_partition_representation.register(UUIDType)
-def _(type: IcebergType, value: Optional[uuid.UUID]) -> Optional[str]:
-    return str(value) if value is not None else None
+def _(type: IcebergType, value: Optional[Union[uuid.UUID, int, bytes]]) -> Optional[Union[str, int]]:
+    if value is None:
+        return None
+    elif isinstance(value, bytes):
+        return str(uuid.UUID(bytes=value))  # IdentityTransform
+    elif isinstance(value, uuid.UUID):
+        return str(value)  # IdentityTransform
+    elif isinstance(value, int):
+        return value  # BucketTransform
+    else:
+        raise ValueError(f"Type not recognized: {value}")
 
 
 @_to_partition_representation.register(PrimitiveType)

diff --git a/tests/expressions/test_literals.py b/tests/expressions/test_literals.py
@@ -373,7 +373,7 @@ def test_string_to_uuid_literal() -> None:
     uuid_str = literal(str(expected))
     uuid_lit = uuid_str.to(UUIDType())
 
-    assert expected.bytes == uuid_lit.value
+    assert uuid_lit.value == expected
 
 
 def test_string_to_decimal_literal() -> None:
@@ -530,8 +530,7 @@ def test_binary_to_uuid() -> None:
     lit = literal(test_uuid.bytes)
     uuid_lit = lit.to(UUIDType())
     assert uuid_lit is not None
-    assert lit.value == uuid_lit.value
-    assert uuid_lit.value == test_uuid.bytes
+    assert uuid_lit.value == test_uuid
 
 
 def test_incompatible_binary_to_uuid() -> None:
@@ -560,8 +559,7 @@ def test_fixed_to_uuid() -> None:
     lit = literal(test_uuid.bytes).to(FixedType(16))
     uuid_lit = lit.to(UUIDType())
     assert uuid_lit is not None
-    assert lit.value == uuid_lit.value
-    assert uuid_lit.value == test_uuid.bytes
+    assert uuid_lit.value == test_uuid
 
 
 def test_incompatible_fixed_to_uuid() -> None:
@@ -900,7 +898,7 @@ def test_uuid_literal_initialization() -> None:
     test_uuid = uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7")
     uuid_literal = literal(test_uuid)
     assert isinstance(uuid_literal, Literal)
-    assert test_uuid.bytes == uuid_literal.value
+    assert test_uuid == uuid_literal.value
 
 
 def test_uuid_to_fixed() -> None:

diff --git a/tests/integration/test_writes/test_writes.py b/tests/integration/test_writes/test_writes.py
@@ -19,6 +19,7 @@
 import os
 import random
 import time
+import uuid
 from datetime import date, datetime, timedelta
 from decimal import Decimal
 from pathlib import Path
@@ -48,7 +49,7 @@
 from pyiceberg.schema import Schema
 from pyiceberg.table import TableProperties
 from pyiceberg.table.sorting import SortDirection, SortField, SortOrder
-from pyiceberg.transforms import DayTransform, HourTransform, IdentityTransform
+from pyiceberg.transforms import BucketTransform, DayTransform, HourTransform, IdentityTransform
 from pyiceberg.types import (
     DateType,
     DecimalType,
@@ -58,6 +59,7 @@
     LongType,
     NestedField,
     StringType,
+    UUIDType,
 )
 from utils import _create_table
 
@@ -1841,3 +1843,56 @@ def test_read_write_decimals(session_catalog: Catalog) -> None:
     tbl.append(arrow_table)
 
     assert tbl.scan().to_arrow() == arrow_table
+
+
+@pytest.mark.integration
+def test_read_write_uuids_partitioned(session_catalog: Catalog) -> None:
+    """Test simple reading and writing partitioned UUID data types in supported transform.
+    - BucketTransform
+    - IdentityTransform
+    """
+
+    identifier = "default.test_read_write_uuids"
+    uuids = [
+        uuid.UUID("ec9b663b-062f-4200-a130-8de19c21b800").bytes,
+        uuid.UUID("5f473c64-dbeb-449b-bdfa-b6b4185b1bde").bytes,
+        None,
+    ]
+
+    arrow_table = pa.Table.from_pydict(
+        {
+            "uuid_1": pa.array(uuids, type=pa.binary(16)),
+            "uuid_2": pa.array(uuids, type=pa.binary(16)),
+        }
+    )
+
+    tbl = _create_table(
+        session_catalog,
+        identifier,
+        properties={"format-version": 2},
+        schema=Schema(
+            NestedField(field_id=1, name="uuid_1", field_type=UUIDType(), required=False),
+            NestedField(field_id=2, name="uuid_2", field_type=UUIDType(), required=False),
+        ),
+        partition_spec=PartitionSpec(
+            PartitionField(source_id=1, field_id=1001, transform=BucketTransform(2), name="uuid_bucket"),
+            PartitionField(source_id=2, field_id=1002, transform=IdentityTransform(), name="uuid_indentity"),
+        ),
+    )
+
+    tbl.append(arrow_table)
+    assert tbl.scan().to_arrow() == arrow_table
+    # Check BucketTransform partitioning filtering
+    assert tbl.scan(row_filter=f"uuid_1 == '{uuid.UUID(bytes=uuids[0])}'").to_arrow() == pa.Table.from_pydict(
+        {
+            "uuid_1": pa.array([uuids[0]], type=pa.binary(16)),
+            "uuid_2": pa.array([uuids[0]], type=pa.binary(16)),
+        }
+    )
+    # Check IdentityTransform partitioning filtering
+    assert tbl.scan(row_filter=f"uuid_2 == '{uuid.UUID(bytes=uuids[1])}'").to_arrow() == pa.Table.from_pydict(
+        {
+            "uuid_1": pa.array([uuids[1]], type=pa.binary(16)),
+            "uuid_2": pa.array([uuids[1]], type=pa.binary(16)),
+        }
+    )
diff --git a/tests/io/test_pyarrow_stats.py b/tests/io/test_pyarrow_stats.py
@@ -66,6 +66,7 @@
     TableMetadataV1,
     TableMetadataV2,
 )
+from pyiceberg.typedef import UTF8
 from pyiceberg.types import (
     BooleanType,
     FloatType,
@@ -537,7 +538,7 @@ def test_metrics_primitive_types() -> None:
     assert datafile.lower_bounds[8] == STRUCT_INT64.pack(datetime_to_micros(datetime(2022, 1, 2, 17, 30, 34, 399)))
     assert datafile.lower_bounds[9] == STRUCT_INT64.pack(datetime_to_micros(datetime(2022, 1, 2, 17, 30, 34, 399, tz)))
     assert datafile.lower_bounds[10] == b"he"
-    assert datafile.lower_bounds[11] == uuid.uuid3(uuid.NAMESPACE_DNS, "foo").bytes
+    assert datafile.lower_bounds[11] == str(uuid.uuid3(uuid.NAMESPACE_DNS, "foo")).encode(UTF8)
     assert datafile.lower_bounds[12] == b"he"
     assert datafile.lower_bounds[13][::-1].ljust(4, b"\x00") == STRUCT_INT32.pack(12345)
     assert datafile.lower_bounds[14][::-1].ljust(8, b"\x00") == STRUCT_INT64.pack(12345679123456)
@@ -554,7 +555,7 @@ def test_metrics_primitive_types() -> None:
     assert datafile.upper_bounds[8] == STRUCT_INT64.pack(datetime_to_micros(datetime(2023, 2, 4, 13, 21, 4, 354)))
     assert datafile.upper_bounds[9] == STRUCT_INT64.pack(datetime_to_micros(datetime(2023, 2, 4, 13, 21, 4, 354, tz)))
     assert datafile.upper_bounds[10] == b"wp"
-    assert datafile.upper_bounds[11] == uuid.uuid3(uuid.NAMESPACE_DNS, "bar").bytes
+    assert datafile.upper_bounds[11] == str(uuid.uuid3(uuid.NAMESPACE_DNS, "bar")).encode(UTF8)
     assert datafile.upper_bounds[12] == b"wp"
     assert datafile.upper_bounds[13][::-1].ljust(4, b"\x00") == STRUCT_INT32.pack(67891)
     assert datafile.upper_bounds[14][::-1].ljust(8, b"\x00") == STRUCT_INT64.pack(67891234678912)

diff --git a/tests/table/test_partitioning.py b/tests/table/test_partitioning.py
@@ -21,14 +21,15 @@
 
 import pytest
 
-from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionField, PartitionSpec
+from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionField, PartitionFieldValue, PartitionKey, PartitionSpec
 from pyiceberg.schema import Schema
 from pyiceberg.transforms import (
     BucketTransform,
     DayTransform,
     HourTransform,
     IdentityTransform,
     MonthTransform,
+    Transform,
     TruncateTransform,
     YearTransform,
 )
@@ -217,6 +218,41 @@ def test_transform_consistency_with_pyarrow_transform(source_type: PrimitiveType
                 raise
 
 
+@pytest.mark.parametrize(
+    "source_type, _transform, input_value,expected_value",
+    [
+        (UUIDType(), BucketTransform(2), UUID("ec9b663b-062f-4200-a130-8de19c21b800").bytes, 0),
+        (
+            UUIDType(),
+            IdentityTransform(),
+            UUID("ec9b663b-062f-4200-a130-8de19c21b800"),
+            UUID("ec9b663b-062f-4200-a130-8de19c21b800"),
+        ),
+        (UUIDType(), TruncateTransform(1), UUID("ec9b663b-062f-4200-a130-8de19c21b800").bytes, None),
+    ],
+)
+def test_transform_uuid_partition_key(
+    source_type: PrimitiveType, _transform: Transform[Any, Any], input_value: UUID, expected_value: Any
+) -> None:
+    """
+    Tests that UUID values can be correctly transformed and used as partition keys with various transformation functions.
+    """
+    schema = Schema(NestedField(field_id=1, name="uuid", field_type=source_type, required=True))
+    partition_field = PartitionField(source_id=1, field_id=1001, transform=_transform, name="uuid_partition")
+    spec = PartitionSpec(partition_field)
+
+    if _transform.can_transform(source_type):
+        transformer = _transform.transform(source=source_type)
+
+        value = transformer(input_value)
+        assert value == expected_value
+
+        partition_field_value = PartitionFieldValue(field=partition_field, value=value)
+        partition_key = PartitionKey(field_values=[partition_field_value], partition_spec=spec, schema=schema)
+        assert partition_key.field_values[0].field == partition_field
+        assert partition_key.field_values[0].value == expected_value
+
+
 def test_deserialize_partition_field_v2() -> None:
     json_partition_spec = """{"source-id": 1, "field-id": 1000, "transform": "truncate[19]", "name": "str_truncate"}"""