Merge pull request #253 from lincc-frameworks/read-empty-parquet

hombit · web-flow · commit 7791e5db58cf · 2025-05-07T14:48:21.000-04:00
Fix read_parquet for parquet files with no data
diff --git a/src/nested_pandas/nestedframe/io.py b/src/nested_pandas/nestedframe/io.py
@@ -9,6 +9,7 @@
 from upath import UPath
 
 from ..series.dtype import NestedDtype
+from ..series.utils import table_to_struct_array
 from .core import NestedFrame
 
 
@@ -140,7 +141,7 @@ def read_parquet(
         indices_to_remove = []
         for col, indices in nested_structures.items():
             # Build a struct column from the columns
-            structs[col] = table.select(indices).to_struct_array()
+            structs[col] = table_to_struct_array(table.select(indices))
             indices_to_remove.extend(indices)
 
         # Remove the original columns in reverse order to avoid index shifting
diff --git a/src/nested_pandas/series/utils.py b/src/nested_pandas/series/utils.py
@@ -190,3 +190,15 @@ def transpose_list_struct_array(array: pa.ListArray) -> pa.StructArray:
         fields.append(list_array)
 
     return pa.StructArray.from_arrays(fields, names=array.type.value_type.names)
+
+
+def table_to_struct_array(table: pa.Table) -> pa.StructArray:
+    """Convert ``table`` to a struct array, as ``pa.Table.to_struct_array`` does.
+
+    Zero-row tables are built straight from the schema to avoid a pyarrow bug:
+    https://github.com/apache/arrow/issues/46355
+    """
+    if table.num_rows > 0:
+        return table.to_struct_array()
+    empty_structs = pa.array([], type=pa.struct(table.schema))
+    return cast(pa.StructArray, empty_structs)
diff --git a/tests/nested_pandas/nestedframe/test_io.py b/tests/nested_pandas/nestedframe/test_io.py
@@ -254,3 +254,28 @@ def test_pandas_read_parquet():
 
         # Check the columns
         assert df.columns.tolist() == ["a", "b", "nested"]
+
+
+def test_read_empty_parquet():
+    """Test that a zero-row frame round-trips through a parquet file."""
+    orig_nf = generate_data(1, 2).iloc[:0]
+
+    with tempfile.NamedTemporaryFile("wb", suffix=".parquet") as tmpfile:
+        orig_nf.to_parquet(tmpfile.name)
+        # Read every column back.
+        # dtypes are not compared because of:
+        # https://github.com/lincc-frameworks/nested-pandas/issues/252
+        assert_frame_equal(read_parquet(tmpfile.name), orig_nf, check_dtype=False)
+        # Read only a subset: column "a" plus two of the "nested.*" columns.
+        assert_frame_equal(
+            read_parquet(
+                tmpfile.name,
+                columns=[
+                    "a",
+                    "nested.flux",
+                    "nested.band",
+                ],
+            ),
+            orig_nf.drop(["b", "nested.t"], axis=1),
+            check_dtype=False,
+        )