Merge pull request #221 from lincc-frameworks/result_nesting

dougbrn · web-flow · commit cf7f4cef50cc · 2025-03-12T16:35:23.000-07:00
Enable Inference of Nested Structures to Reduce outputs
diff --git a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py
@@ -845,7 +845,7 @@ def sort_values(
                 return None
             return new_df
 
-    def reduce(self, func, *args, **kwargs) -> NestedFrame:  # type: ignore[override]
+    def reduce(self, func, *args, infer_nesting=True, **kwargs) -> NestedFrame:  # type: ignore[override]
         """
         Takes a function and applies it to each top-level row of the NestedFrame.
 
@@ -862,6 +862,12 @@ def reduce(self, func, *args, **kwargs) -> NestedFrame:  # type: ignore[override
         args : positional arguments
             Positional arguments to pass to the function, the first *args should be the names of the
             columns to apply the function to.
+        infer_nesting : bool, default True
+            If True, output columns whose names follow the nested naming
+            scheme "<layer>.<field>" are packed into nested structures by
+            `reduce`. E.g. "nested.b" and "nested.c" will be packed into a column
+            called "nested" with columns "b" and "c". If False, all outputs
+            will be returned as base columns.
         kwargs : keyword arguments, optional
             Keyword arguments to pass to the function.
 
@@ -915,7 +921,30 @@ def reduce(self, func, *args, **kwargs) -> NestedFrame:  # type: ignore[override
                 iterators.append(self[layer].array.iter_field_lists(col))
 
         results = [func(*cols, *extra_args, **kwargs) for cols in zip(*iterators)]
-        return NestedFrame(results, index=self.index)
+        results_nf = NestedFrame(results, index=self.index)
+
+        if infer_nesting:
+            # find potential nested structures from columns
+            nested_cols = list(
+                np.unique(
+                    [
+                        column.split(".", 1)[0]
+                        for column in results_nf.columns
+                        if isinstance(column, str) and "." in column
+                    ]
+                )
+            )
+
+            # pack results into nested structures
+            for layer in nested_cols:
+                layer_cols = [col for col in results_nf.columns if col.startswith(f"{layer}.")]
+                rename_df = results_nf[layer_cols].rename(columns=lambda x: x.split(".", 1)[1])
+                nested_col = pack_lists(rename_df, name=layer)
+                results_nf = results_nf[
+                    [col for col in results_nf.columns if not col.startswith(f"{layer}.")]
+                ].join(nested_col)
+
+        return results_nf
 
     def to_parquet(self, path, by_layer=False, **kwargs) -> None:
         """Creates parquet file(s) with the data of a NestedFrame, either
diff --git a/tests/nested_pandas/nestedframe/test_nestedframe.py b/tests/nested_pandas/nestedframe/test_nestedframe.py
@@ -1022,6 +1022,67 @@ def cols_allclose(col1, col2):
     )
 
 
+def test_reduce_infer_nesting():
+    """Test that reduce packs dotted output names ("layer.field") into nested columns."""
+
+    ndf = generate_data(3, 20, seed=1)
+
+    # Simple case: one base column plus a single-field nested column "lc"
+    def complex_output(flux):
+        return {
+            "max_flux": np.max(flux),
+            "lc.flux_quantiles": np.quantile(flux, [0.1, 0.2, 0.3, 0.4, 0.5]),
+        }
+
+    result = ndf.reduce(complex_output, "nested.flux")
+    assert list(result.columns) == ["max_flux", "lc"]
+    assert list(result.lc.nest.fields) == ["flux_quantiles"]
+
+    # Two "lc."-prefixed outputs are packed as two fields of the same nested column
+    def complex_output(flux):
+        return {
+            "max_flux": np.max(flux),
+            "lc.flux_quantiles": np.quantile(flux, [0.1, 0.2, 0.3, 0.4, 0.5]),
+            "lc.labels": [0.1, 0.2, 0.3, 0.4, 0.5],
+        }
+
+    result = ndf.reduce(complex_output, "nested.flux")
+    assert list(result.columns) == ["max_flux", "lc"]
+    assert list(result.lc.nest.fields) == ["flux_quantiles", "labels"]
+
+    # Tuple output gives integer column names, which are left as base columns
+    def complex_output(flux):
+        return np.max(flux), np.quantile(flux, [0.1, 0.2, 0.3, 0.4, 0.5]), [0.1, 0.2, 0.3, 0.4, 0.5]
+
+    result = ndf.reduce(complex_output, "nested.flux")
+    assert list(result.columns) == [0, 1, 2]
+
+    # Distinct prefixes ("lc" and "meta") produce separate nested columns
+    def complex_output(flux):
+        return {
+            "max_flux": np.max(flux),
+            "lc.flux_quantiles": np.quantile(flux, [0.1, 0.2, 0.3, 0.4, 0.5]),
+            "lc.labels": [0.1, 0.2, 0.3, 0.4, 0.5],
+            "meta.colors": ["green", "red", "blue"],
+        }
+
+    result = ndf.reduce(complex_output, "nested.flux")
+    assert list(result.columns) == ["max_flux", "lc", "meta"]
+    assert list(result.lc.nest.fields) == ["flux_quantiles", "labels"]
+    assert list(result.meta.nest.fields) == ["colors"]
+
+    # Output with no base columns at all: the result holds only the nested column
+    def complex_output(flux):
+        return {
+            "lc.flux_quantiles": np.quantile(flux, [0.1, 0.2, 0.3, 0.4, 0.5]),
+            "lc.labels": [0.1, 0.2, 0.3, 0.4, 0.5],
+        }
+
+    result = ndf.reduce(complex_output, "nested.flux")
+    assert list(result.columns) == ["lc"]
+    assert list(result.lc.nest.fields) == ["flux_quantiles", "labels"]
+
+
 def test_scientific_notation():
     """
     Test that NestedFrame.query handles constants that are written in scientific notation.