File tree Expand file tree Collapse file tree 3 files changed +39
-1
lines changed
tests/nested_pandas/nestedframe Expand file tree Collapse file tree 3 files changed +39
-1
lines changed Original file line number Diff line number Diff line change 9
9
from upath import UPath
10
10
11
11
from ..series .dtype import NestedDtype
12
+ from ..series .utils import table_to_struct_array
12
13
from .core import NestedFrame
13
14
14
15
@@ -140,7 +141,7 @@ def read_parquet(
140
141
indices_to_remove = []
141
142
for col , indices in nested_structures .items ():
142
143
# Build a struct column from the columns
143
- structs [col ] = table .select (indices ). to_struct_array ( )
144
+ structs [col ] = table_to_struct_array ( table .select (indices ))
144
145
indices_to_remove .extend (indices )
145
146
146
147
# Remove the original columns in reverse order to avoid index shifting
Original file line number Diff line number Diff line change @@ -190,3 +190,15 @@ def transpose_list_struct_array(array: pa.ListArray) -> pa.StructArray:
190
190
fields .append (list_array )
191
191
192
192
return pa .StructArray .from_arrays (fields , names = array .type .value_type .names )
193
+
194
+
195
def table_to_struct_array(table: pa.Table) -> pa.StructArray:
    """Convert a pyarrow table to a struct array, tolerating empty tables.

    This wraps ``pa.Table.to_struct_array`` because pyarrow has a bug for
    empty tables:
    https://github.com/apache/arrow/issues/46355

    Parameters
    ----------
    table : pa.Table
        Table whose columns become the fields of the struct.

    Returns
    -------
    pa.StructArray
        For a zero-length table, an empty array whose struct type is built
        from the table schema; otherwise the result of
        ``table.to_struct_array()``.

    Notes
    -----
    NOTE(review): pyarrow documents ``Table.to_struct_array`` as returning a
    chunked array, so the non-empty path may not be a plain ``StructArray``
    -- confirm against the callers' expectations.
    """
    # Common case first: delegate straight to pyarrow.
    if len(table) > 0:
        return table.to_struct_array()

    # Empty table: build the empty struct array by hand from the schema
    # instead of calling the pyarrow method affected by the bug above.
    struct_type = pa.struct(table.schema)
    empty = pa.array([], type=struct_type)
    return cast(pa.StructArray, empty)
Original file line number Diff line number Diff line change @@ -254,3 +254,28 @@ def test_pandas_read_parquet():
254
254
255
255
# Check the columns
256
256
assert df .columns .tolist () == ["a" , "b" , "nested" ]
257
+
258
+
259
def test_read_empty_parquet():
    """Test that we can read empty parquet files.

    Writes a zero-row frame to a temporary parquet file and checks that
    ``read_parquet`` returns an equal frame, both when loading every column
    and when loading a subset of base and nested sub-columns.
    """
    # Slice to zero rows so the written file has no data.
    orig_nf = generate_data(1, 2).iloc[:0]

    # tempfile does not put a dot between the name and the suffix, so the
    # dot must be part of the suffix for the file to get a real extension.
    with tempfile.NamedTemporaryFile("wb", suffix=".parquet") as tmpfile:
        orig_nf.to_parquet(tmpfile.name)
        # All columns
        # Do not check dtype because of:
        # https://github.com/lincc-frameworks/nested-pandas/issues/252
        assert_frame_equal(read_parquet(tmpfile.name), orig_nf, check_dtype=False)
        # Few columns
        assert_frame_equal(
            read_parquet(
                tmpfile.name,
                columns=[
                    "a",
                    "nested.flux",
                    "nested.band",
                ],
            ),
            orig_nf.drop(["b", "nested.t"], axis=1),
            check_dtype=False,
        )
You can’t perform that action at this time.
0 commit comments