Skip to content

Commit 7791e5d

Browse files
authored
Merge pull request #253 from lincc-frameworks/read-empty-parquet
Fix read_parquet for parquet files with no data
2 parents efaa27d + debacc3 commit 7791e5d

File tree

3 files changed

+39
-1
lines changed

3 files changed

+39
-1
lines changed

src/nested_pandas/nestedframe/io.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from upath import UPath
1010

1111
from ..series.dtype import NestedDtype
12+
from ..series.utils import table_to_struct_array
1213
from .core import NestedFrame
1314

1415

@@ -140,7 +141,7 @@ def read_parquet(
140141
indices_to_remove = []
141142
for col, indices in nested_structures.items():
142143
# Build a struct column from the columns
143-
structs[col] = table.select(indices).to_struct_array()
144+
structs[col] = table_to_struct_array(table.select(indices))
144145
indices_to_remove.extend(indices)
145146

146147
# Remove the original columns in reverse order to avoid index shifting

src/nested_pandas/series/utils.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,3 +190,15 @@ def transpose_list_struct_array(array: pa.ListArray) -> pa.StructArray:
190190
fields.append(list_array)
191191

192192
return pa.StructArray.from_arrays(fields, names=array.type.value_type.names)
193+
194+
195+
def table_to_struct_array(table: pa.Table) -> pa.StructArray:
    """Convert a pyarrow table into a struct array, including empty tables.

    This is a thin wrapper around ``pa.Table.to_struct_array`` that
    special-cases zero-row tables, because pyarrow has a bug for them:
    https://github.com/apache/arrow/issues/46355

    Parameters
    ----------
    table : pa.Table
        Table whose columns become the fields of the struct array.

    Returns
    -------
    pa.StructArray
        For a table with at least one row, whatever
        ``table.to_struct_array()`` returns. For a zero-row table, an empty
        array whose struct type is built from the table schema.
    """
    if len(table) > 0:
        return table.to_struct_array()

    # Zero rows: build the empty array directly from the schema rather than
    # going through pyarrow's conversion.
    struct_type = pa.struct(table.schema)
    empty_array = pa.array([], type=struct_type)
    return cast(pa.StructArray, empty_array)

tests/nested_pandas/nestedframe/test_io.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -254,3 +254,28 @@ def test_pandas_read_parquet():
254254

255255
# Check the columns
256256
assert df.columns.tolist() == ["a", "b", "nested"]
257+
258+
259+
def test_read_empty_parquet():
    """Test that we can read empty parquet files.

    A zero-row frame is written to a temporary parquet file and read back,
    first with all columns and then with a subset of base and nested columns.
    """
    # Slice to zero rows so the written file carries a schema but no data
    orig_nf = generate_data(1, 2).iloc[:0]

    # tempfile appends the suffix verbatim, so the leading dot must be explicit
    # to get a real ".parquet" extension.
    with tempfile.NamedTemporaryFile("wb", suffix=".parquet") as tmpfile:
        orig_nf.to_parquet(tmpfile.name)

        # All columns
        # Do not check dtype because of:
        # https://github.com/lincc-frameworks/nested-pandas/issues/252
        assert_frame_equal(read_parquet(tmpfile.name), orig_nf, check_dtype=False)

        # Few columns: the expected frame drops everything that is not requested
        requested_columns = ["a", "nested.flux", "nested.band"]
        assert_frame_equal(
            read_parquet(tmpfile.name, columns=requested_columns),
            orig_nf.drop(["b", "nested.t"], axis=1),
            check_dtype=False,
        )

0 commit comments

Comments
 (0)