Skip to content

Commit 27dd430

Browse files
authored
Merge pull request #181 from lincc-frameworks/fix-from-lists
Fix pack_lists for multiple-chunk series
2 parents 3cb19bd + 5d39c52 commit 27dd430

File tree

2 files changed

+64
-4
lines changed

2 files changed

+64
-4
lines changed

src/nested_pandas/series/packer.py

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -196,10 +196,34 @@ def pack_lists(df: pd.DataFrame, name: str | None = None, *, validate: bool = Tr
196196
nested_pandas.series.dtype.NestedDtype : The dtype of the output series.
197197
nested_pandas.series.packer.pack_flat : Pack a "flat" dataframe with repeated indexes.
198198
"""
199-
struct_array = pa.StructArray.from_arrays(
200-
[df[column] for column in df.columns],
201-
names=df.columns,
202-
)
199+
# When series is converted to pa.array it may be both Array and ChunkedArray
200+
# We convert it to chunked for the sake of consistency
201+
pa_arrays_maybe_chunked = {column: pa.array(df[column]) for column in df.columns}
202+
pa_chunked_arrays = {
203+
column: arr if isinstance(arr, pa.ChunkedArray) else pa.chunked_array([arr])
204+
for column, arr in pa_arrays_maybe_chunked.items()
205+
}
206+
207+
# If all chunk arrays have the same chunk lengths, we can build a chunked struct array with no
208+
# data copying.
209+
chunk_lengths = pa.array([[len(chunk) for chunk in arr.chunks] for arr in pa_chunked_arrays.values()])
210+
if all(chunk_length == chunk_lengths[0] for chunk_length in chunk_lengths):
211+
chunks = []
212+
numpy_chunks = next(iter(pa_chunked_arrays.values())).num_chunks
213+
for i in range(numpy_chunks):
214+
chunks.append(
215+
pa.StructArray.from_arrays(
216+
[arr.chunk(i) for arr in pa_chunked_arrays.values()],
217+
names=pa_chunked_arrays.keys(),
218+
)
219+
)
220+
struct_array = pa.chunked_array(chunks)
221+
else: # "flatten" the chunked arrays
222+
struct_array = pa.StructArray.from_arrays(
223+
[arr.combine_chunks() for arr in pa_chunked_arrays.values()],
224+
names=pa_chunked_arrays.keys(),
225+
)
226+
203227
ext_array = NestedExtensionArray(struct_array, validate=validate)
204228
return pd.Series(
205229
ext_array,

tests/nested_pandas/series/test_packer.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -286,6 +286,42 @@ def test_pack_lists():
286286
assert_series_equal(series.nest.get_list_series(field_name), packed_df[field_name])
287287

288288

289+
def test_pack_lists_with_chunked_arrays():
290+
"""Issue https://github.com/lincc-frameworks/nested-pandas/issues/180"""
291+
chunked_a = pd.Series(
292+
pa.chunked_array([pa.array([[1, 2, 3], [4, 5]])] * 3),
293+
dtype=pd.ArrowDtype(pa.list_(pa.int64())),
294+
name="a",
295+
)
296+
chunked_b = pd.Series(
297+
pa.chunked_array([pa.array([[0.0, 1.0, 2.0], [3.0, 4.0]])] * 3),
298+
dtype=pd.ArrowDtype(pa.list_(pa.float64())),
299+
name="b",
300+
)
301+
list_df = pd.DataFrame({"a": chunked_a, "b": chunked_b}, index=[0, 1, 2, 3, 4, 5])
302+
series = packer.pack_lists(list_df)
303+
assert_series_equal(series.nest.get_list_series("a"), chunked_a)
304+
assert_series_equal(series.nest.get_list_series("b"), chunked_b)
305+
306+
307+
def test_pack_lists_with_uneven_chunked_arrays():
308+
"""Issue https://github.com/lincc-frameworks/nested-pandas/issues/180"""
309+
chunked_a = pd.Series(
310+
pa.chunked_array([pa.array([[1, 2, 3], [4, 5]])] * 3),
311+
dtype=pd.ArrowDtype(pa.list_(pa.int64())),
312+
name="a",
313+
)
314+
chunked_b = pd.Series(
315+
pa.array([[0.0, 1.0, 2.0], [3.0, 4.0]] * 3),
316+
dtype=pd.ArrowDtype(pa.list_(pa.float64())),
317+
name="b",
318+
)
319+
list_df = pd.DataFrame({"a": chunked_a, "b": chunked_b}, index=[0, 1, 2, 3, 4, 5])
320+
series = packer.pack_lists(list_df)
321+
assert_series_equal(series.nest.get_list_series("a"), chunked_a)
322+
assert_series_equal(series.nest.get_list_series("b"), chunked_b)
323+
324+
289325
def test_pack_seq_with_dfs_and_index():
290326
"""Test pack_seq()."""
291327
dfs = [

0 commit comments

Comments
 (0)