Skip to content

Commit 2402e19

Browse files
committed
support kwargs for read_parquet
1 parent 71828ad commit 2402e19

File tree

3 files changed

+43
-4
lines changed

3 files changed

+43
-4
lines changed

src/nested_pandas/nestedframe/io.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ def read_parquet(
1414
data: str | UPath | bytes,
1515
columns: list[str] | None = None,
1616
reject_nesting: list[str] | str | None = None,
17+
**kwargs,
1718
) -> NestedFrame:
1819
"""
1920
Load a parquet object from a file path into a NestedFrame.
@@ -82,12 +83,12 @@ def read_parquet(
8283
# Check if `data` is a file-like object
8384
if hasattr(data, "read"):
8485
# If `data` is a file-like object, pass it directly to pyarrow
85-
table = pq.read_table(data, columns=columns)
86+
table = pq.read_table(data, columns=columns, **kwargs)
8687
else:
8788
# Otherwise, treat `data` as a file path and use UPath
8889
path = UPath(data)
89-
with path.open("rb") as f:
90-
table = pq.read_table(f, columns=columns)
90+
filesystem = kwargs.pop("filesystem", path.fs)
91+
table = pq.read_table(path.path, columns=columns, filesystem=filesystem, **kwargs)
9192

9293
# Resolve partial loading of nested structures
9394
# Using pyarrow to avoid naming conflicts from partial loading ("flux" vs "lc.flux")
@@ -148,7 +149,7 @@ def read_parquet(
148149
# not zero-copy, but reduce memory pressure via the self_destruct kwarg
149150
# https://arrow.apache.org/docs/python/pandas.html#reducing-memory-use-in-table-to-pandas
150151
df = NestedFrame(
151-
table.to_pandas(types_mapper=lambda ty: pd.ArrowDtype(ty), split_blocks=True, self_destruct=True)
152+
table.to_pandas(types_mapper=pd.ArrowDtype, split_blocks=True, self_destruct=True)
152153
)
153154
del table
154155
# Attempt to cast struct columns to NestedDTypes

tests/nested_pandas/e2e_tests/test_issue89.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
"""Based on https://github.com/lincc-frameworks/nested-pandas/issues/89"""
2+
from pyarrow.dataset import partitioning
23

34
import nested_pandas as npd
45
import numpy as np
@@ -16,11 +17,13 @@ def test_issue89():
1617
object_ndf = npd.read_parquet(
1718
f"{catalogs_dir}/ztf_object/Norder=3/Dir=0/Npix=432.parquet",
1819
columns=["ra", "dec", "ps1_objid"],
20+
partitioning=None,
1921
).set_index("ps1_objid")
2022

2123
source_ndf = npd.read_parquet(
2224
f"{catalogs_dir}/ztf_source/Norder=6/Dir=20000/Npix=27711.parquet",
2325
columns=["mjd", "mag", "magerr", "band", "ps1_objid", "catflags"],
26+
partitioning=None,
2427
).set_index("ps1_objid")
2528

2629
object_ndf = object_ndf.add_nested(source_ndf, "ztf_source")

tests/nested_pandas/nestedframe/test_io.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
import pyarrow as pa
66
import pyarrow.parquet as pq
77
import pytest
8+
from upath import UPath
9+
810
from nested_pandas import read_parquet
911
from nested_pandas.datasets import generate_data
1012
from pandas.testing import assert_frame_equal
@@ -26,6 +28,39 @@ def test_read_parquet():
2628
assert nf.lincc.nest.fields == ["band", "frameworks"]
2729

2830

31+
def test_read_parquet_directory():
32+
"""Test reading a parquet file with no columns specified"""
33+
# Load in the example file
34+
nf = read_parquet("tests/test_data")
35+
36+
# Check the columns
37+
assert nf.columns.tolist() == ["a", "flux", "nested", "lincc"]
38+
39+
# Make sure nested columns were recognized
40+
assert nf.nested_columns == ["nested", "lincc"]
41+
42+
# Check the nested columns
43+
assert nf.nested.nest.fields == ["t", "flux", "band"]
44+
assert nf.lincc.nest.fields == ["band", "frameworks"]
45+
46+
47+
def test_read_parquet_directory_with_filesystem():
48+
"""Test reading a parquet file with no columns specified"""
49+
# Load in the example file
50+
path = UPath("tests/test_data")
51+
nf = read_parquet(path.path, filesystem=path.fs)
52+
53+
# Check the columns
54+
assert nf.columns.tolist() == ["a", "flux", "nested", "lincc"]
55+
56+
# Make sure nested columns were recognized
57+
assert nf.nested_columns == ["nested", "lincc"]
58+
59+
# Check the nested columns
60+
assert nf.nested.nest.fields == ["t", "flux", "band"]
61+
assert nf.lincc.nest.fields == ["band", "frameworks"]
62+
63+
2964
def test_file_object_read_parquet():
3065
"""Test reading parquet from a file-object"""
3166
with open("tests/test_data/nested.parquet", "rb") as f:

0 commit comments

Comments
 (0)