support kwargs for read_parquet

smcguire-cmu · smcguire-cmu · commit 2402e197806c · 2025-04-25T20:18:44.000-04:00
diff --git a/src/nested_pandas/nestedframe/io.py b/src/nested_pandas/nestedframe/io.py
@@ -14,6 +14,7 @@ def read_parquet(
     data: str | UPath | bytes,
     columns: list[str] | None = None,
     reject_nesting: list[str] | str | None = None,
+    **kwargs,
 ) -> NestedFrame:
     """
     Load a parquet object from a file path into a NestedFrame.
@@ -82,12 +83,12 @@ def read_parquet(
     # Check if `data` is a file-like object
     if hasattr(data, "read"):
         # If `data` is a file-like object, pass it directly to pyarrow
-        table = pq.read_table(data, columns=columns)
+        table = pq.read_table(data, columns=columns, **kwargs)
     else:
         # Otherwise, treat `data` as a file path and use UPath
         path = UPath(data)
-        with path.open("rb") as f:
-            table = pq.read_table(f, columns=columns)
+        filesystem = kwargs.pop("filesystem", path.fs)
+        table = pq.read_table(path.path, columns=columns, filesystem=filesystem, **kwargs)
 
     # Resolve partial loading of nested structures
     # Using pyarrow to avoid naming conflicts from partial loading ("flux" vs "lc.flux")
@@ -148,7 +149,7 @@ def read_parquet(
     # not zero-copy, but reduce memory pressure via the self_destruct kwarg
     # https://arrow.apache.org/docs/python/pandas.html#reducing-memory-use-in-table-to-pandas
     df = NestedFrame(
-        table.to_pandas(types_mapper=lambda ty: pd.ArrowDtype(ty), split_blocks=True, self_destruct=True)
+        table.to_pandas(types_mapper=pd.ArrowDtype, split_blocks=True, self_destruct=True)
     )
     del table
     # Attempt to cast struct columns to NestedDTypes
diff --git a/tests/nested_pandas/e2e_tests/test_issue89.py b/tests/nested_pandas/e2e_tests/test_issue89.py
@@ -1,4 +1,5 @@
 """Based on https://github.com/lincc-frameworks/nested-pandas/issues/89"""
+from pyarrow.dataset import partitioning
 
 import nested_pandas as npd
 import numpy as np
@@ -16,11 +17,13 @@ def test_issue89():
     object_ndf = npd.read_parquet(
         f"{catalogs_dir}/ztf_object/Norder=3/Dir=0/Npix=432.parquet",
         columns=["ra", "dec", "ps1_objid"],
+        partitioning=None,
     ).set_index("ps1_objid")
 
     source_ndf = npd.read_parquet(
         f"{catalogs_dir}/ztf_source/Norder=6/Dir=20000/Npix=27711.parquet",
         columns=["mjd", "mag", "magerr", "band", "ps1_objid", "catflags"],
+        partitioning=None,
     ).set_index("ps1_objid")
 
     object_ndf = object_ndf.add_nested(source_ndf, "ztf_source")
diff --git a/tests/nested_pandas/nestedframe/test_io.py b/tests/nested_pandas/nestedframe/test_io.py
@@ -5,6 +5,8 @@
 import pyarrow as pa
 import pyarrow.parquet as pq
 import pytest
+from upath import UPath
+
 from nested_pandas import read_parquet
 from nested_pandas.datasets import generate_data
 from pandas.testing import assert_frame_equal
@@ -26,6 +28,39 @@ def test_read_parquet():
     assert nf.lincc.nest.fields == ["band", "frameworks"]
 
 
+def test_read_parquet_directory():
+    """Test reading a directory of parquet files with no columns specified"""
+    # Load in the example directory
+    nf = read_parquet("tests/test_data")
+
+    # Check the columns
+    assert nf.columns.tolist() == ["a", "flux", "nested", "lincc"]
+
+    # Make sure nested columns were recognized
+    assert nf.nested_columns == ["nested", "lincc"]
+
+    # Check the nested columns
+    assert nf.nested.nest.fields == ["t", "flux", "band"]
+    assert nf.lincc.nest.fields == ["band", "frameworks"]
+
+
+def test_read_parquet_directory_with_filesystem():
+    """Test reading a directory of parquet files with an explicit filesystem"""
+    # Load in the example directory, passing the path and filesystem separately
+    path = UPath("tests/test_data")
+    nf = read_parquet(path.path, filesystem=path.fs)
+
+    # Check the columns
+    assert nf.columns.tolist() == ["a", "flux", "nested", "lincc"]
+
+    # Make sure nested columns were recognized
+    assert nf.nested_columns == ["nested", "lincc"]
+
+    # Check the nested columns
+    assert nf.nested.nest.fields == ["t", "flux", "band"]
+    assert nf.lincc.nest.fields == ["band", "frameworks"]
+
+
 def test_file_object_read_parquet():
     """Test reading parquet from a file-object"""
     with open("tests/test_data/nested.parquet", "rb") as f: