Merge pull request #243 from lincc-frameworks/ppt-2.0.6

hombit · web-flow · commit f10acc0e4234 · 2025-04-22T14:48:26.000-04:00
PPT 2.0.6, doctest fixes, NestedFrame.__repr__
diff --git a/.copier-answers.yml b/.copier-answers.yml
@@ -1,5 +1,5 @@
 # Changes here will be overwritten by Copier
-_commit: v2.0.5
+_commit: v2.0.6
 _src_path: gh:lincc-frameworks/python-project-template
 author_email: brantd@uw.edu
 author_name: LINCC Frameworks
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -97,7 +97,7 @@ repos:
             "-d", # Flag for cached environment and doctrees
             "./docs/_build/doctrees", # Directory
             "-D", # Flag to override settings in conf.py
-            "exclude_patterns=notebooks/*", # Exclude our notebooks from pre-commit
+            "exclude_patterns=notebooks/*,_build", # Exclude notebooks and build dir from pre-commit
           ]
     # Run unit tests, verify that they pass. Note that coverage is run against
     # the ./src directory here because that is what will be committed. In the
diff --git a/.setup_dev.sh b/.setup_dev.sh
@@ -1,10 +1,19 @@
 #!/usr/bin/env bash
 
+# Bash Unofficial strict mode (http://redsymbol.net/articles/unofficial-bash-strict-mode/) 
+# and (https://disconnected.systems/blog/another-bash-strict-mode/)
+set -o nounset # Any uninitialized variable is an error
+set -o errexit # Exit the script on the failure of any command to execute without error
+set -o pipefail # Fail command pipelines on the failure of any individual step
+IFS=$'\n\t' #set internal field separator to avoid iteration errors
+# Trap errors (ERR) and report the failing line and command before exiting
+trap 's=$?; echo "$0: Error on line "$LINENO": $BASH_COMMAND"; exit $s' ERR
+
 # This script should be run by new developers to install this package in
 # editable mode and configure their local environment
 
 echo "Checking virtual environment"
-if [ -z "${VIRTUAL_ENV}" ] && [ -z "${CONDA_PREFIX}" ]; then
+if [ "${VIRTUAL_ENV:-missing}" = "missing" ] && [ "${CONDA_PREFIX:-missing}" = "missing" ]; then
     echo 'No virtual environment detected: none of $VIRTUAL_ENV or $CONDA_PREFIX is set.'
     echo
     echo "=== This script is going to install the project in the system python environment ==="
@@ -20,7 +29,7 @@ fi
 
 echo "Checking pip version"
 MINIMUM_PIP_VERSION=22
-pipversion=( $(python -m pip --version | awk '{print $2}' | sed 's/\./ /g') )
+pipversion=( $(python -m pip --version | awk '{print $2}' | sed 's/\./\n\t/g') )
 if let "${pipversion[0]}<${MINIMUM_PIP_VERSION}"; then
     echo "Insufficient version of pip found. Requires at least version ${MINIMUM_PIP_VERSION}."
     echo "See https://lincc-ppt.readthedocs.io/ for details."
@@ -32,7 +41,7 @@ python -m pip install -e . > /dev/null
 
 echo "Installing developer dependencies in local environment"
 python -m pip install -e .'[dev]' > /dev/null
-if [ -f docs/requirements.txt ]; then python -m pip install -r docs/requirements.txt; fi
+if [ -f docs/requirements.txt ]; then python -m pip install -r docs/requirements.txt > /dev/null; fi
 
 echo "Installing pre-commit"
 pre-commit install > /dev/null
diff --git a/pyproject.toml b/pyproject.toml
@@ -56,7 +56,10 @@ write_to = "src/nested_pandas/_version.py"
 [tool.pytest.ini_options]
 testpaths = [
     "tests",
+    "src",
+    "docs",
 ]
+addopts = "--doctest-modules --doctest-glob=*.rst"
 
 [tool.black]
 line-length = 110
diff --git a/src/nested_pandas/datasets/generation.py b/src/nested_pandas/datasets/generation.py
@@ -24,8 +24,9 @@ def generate_data(n_base, n_layer, seed=None) -> NestedFrame:
 
     Examples
     --------
-    >>> nested_pandas.datasets.generate_data(10,100)
-    >>> nested_pandas.datasets.generate_data(10, {"nested_a": 100, "nested_b": 200})
+    >>> from nested_pandas.datasets import generate_data
+    >>> nf1 = generate_data(10,100)
+    >>> nf2 = generate_data(10, {"nested_a": 100, "nested_b": 200})
     """
     # use provided seed, "None" acts as if no seed is provided
     randomstate = np.random.RandomState(seed=seed)
diff --git a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py
@@ -278,7 +278,7 @@ def add_nested(
         Examples
         --------
 
-
+        >>> import nested_pandas as npd
         >>> nf = npd.NestedFrame({"a": [1, 2, 3], "b": [4, 5, 6]},
         ...            index=[0,1,2])
         >>> nf2 = npd.NestedFrame({"c":[1,2,3,4,5,6,7,8,9]},
@@ -320,11 +320,12 @@ def nest_lists(self, name: str, columns: list[str]) -> NestedFrame:
         Examples
         --------
 
+        >>> import nested_pandas as npd
         >>> nf = npd.NestedFrame({"c":[1,2,3], "d":[2,4,6],
         ...                   "e":[[1,2,3], [4,5,6], [7,8,9]]},
         ...                   index=[0,1,2])
 
-        >>> nf.nest_lists(columns=["c","d"], name="nested")
+        >>> nf.nest_lists(columns=["e"], name="nested")
            c  d                nested
         0  1  2  [{e: 1}; …] (3 rows)
         1  2  4  [{e: 4}; …] (3 rows)
@@ -367,6 +368,7 @@ def from_flat(cls, df, base_columns, nested_columns=None, on: str | None = None,
         Examples
         --------
 
+        >>> import nested_pandas as npd
         >>> nf = npd.NestedFrame({"a":[1,1,1,2,2], "b":[2,2,2,4,4],
         ...                   "c":[1,2,3,4,5], "d":[2,4,6,8,10]},
         ...                   index=[0,0,0,1,1])
@@ -424,6 +426,7 @@ def from_lists(cls, df, base_columns=None, list_columns=None, name="nested"):
         Examples
         --------
 
+        >>> import nested_pandas as npd
         >>> nf = npd.NestedFrame({"c":[1,2,3], "d":[2,4,6],
         ...                   "e":[[1,2,3], [4,5,6], [7,8,9]]},
         ...                   index=[0,1,2])
@@ -605,7 +608,7 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> NestedFrame |
 
         >>> nf = nf.query("nested.t > 10")
         >>> nf
-           a         b                                             nested
+                  a         b                                             nested
         0  0.417022  0.184677  [{t: 13.40935, flux: 98.886109, band: 'g'}; …]...
         1  0.720324  0.372520  [{t: 13.70439, flux: 68.650093, band: 'g'}; …]...
         2  0.000114  0.691121  [{t: 11.173797, flux: 28.044399, band: 'r'}; …...
@@ -619,9 +622,7 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> NestedFrame |
         with rows of that particular nested structure filtered. For example,
         querying the NestedFrame "df" with nested structure "my_nested" as
         below will return all rows of df, but with my_nested filtered by the
-        condition:
-
-        >>> df.query("mynested.a > 2")
+        condition: `df.query("my_nested.a > 2")`
         """
         if not isinstance(expr, str):
             msg = f"expr must be a string to be evaluated, {type(expr)} given"
@@ -786,7 +787,7 @@ def dropna(
         >>> # this query empties several of the nested dataframes
         >>> nf = nf.query("nested.t > 19")
         >>> nf
-            a         b                                        nested
+                  a         b                                        nested
         0  0.417022  0.184677                                          None
         1  0.720324  0.372520   [{t: 19.365232, flux: 90.85955, band: 'r'}]
         2  0.000114  0.691121  [{t: 19.157791, flux: 14.672857, band: 'r'}]
@@ -796,7 +797,7 @@ def dropna(
 
         >>> # dropna removes rows with those emptied dataframes
         >>> nf.dropna(subset="nested")
-            a         b                                        nested
+                  a         b                                        nested
         1  0.720324  0.372520   [{t: 19.365232, flux: 90.85955, band: 'r'}]
         2  0.000114  0.691121  [{t: 19.157791, flux: 14.672857, band: 'r'}]
 
@@ -806,9 +807,20 @@ def dropna(
         >>> nf = generate_data(5,5, seed=1)
         >>> # Either on the whole dataframe
         >>> nf.dropna(on_nested="nested")
+                  a         b                                             nested
+        0  0.417022  0.184677  [{t: 8.38389, flux: 31.551563, band: 'r'}; …] ...
+        1  0.720324  0.372520  [{t: 13.70439, flux: 68.650093, band: 'g'}; …]...
+        2  0.000114  0.691121  [{t: 4.089045, flux: 83.462567, band: 'g'}; …]...
+        3  0.302333  0.793535  [{t: 17.562349, flux: 1.828828, band: 'g'}; …]...
+        4  0.146756  1.077633  [{t: 0.547752, flux: 75.014431, band: 'g'}; …]...
         >>> # or on a specific nested column
         >>> nf.dropna(subset="nested.t")
-
+                  a         b                                             nested
+        0  0.417022  0.184677  [{t: 8.38389, flux: 31.551563, band: 'r'}; …] ...
+        1  0.720324  0.372520  [{t: 13.70439, flux: 68.650093, band: 'g'}; …]...
+        2  0.000114  0.691121  [{t: 4.089045, flux: 83.462567, band: 'g'}; …]...
+        3  0.302333  0.793535  [{t: 17.562349, flux: 1.828828, band: 'g'}; …]...
+        4  0.146756  1.077633  [{t: 0.547752, flux: 75.014431, band: 'g'}; …]...
 
         Notes
         -----
@@ -909,7 +921,7 @@ def sort_values(
 
         >>> # Sort nested values
         >>> nf.sort_values(by="nested.band")
-           a         b                                             nested
+                  a         b                                             nested
         0  0.417022  0.184677  [{t: 13.40935, flux: 98.886109, band: 'g'}; …]...
         1  0.720324  0.372520  [{t: 13.70439, flux: 68.650093, band: 'g'}; …]...
         2  0.000114  0.691121  [{t: 4.089045, flux: 83.462567, band: 'g'}; …]...
@@ -1017,13 +1029,15 @@ def reduce(self, func, *args, infer_nesting=True, **kwargs) -> NestedFrame:  # t
         >>> from nested_pandas.datasets.generation import generate_data
         >>> import numpy as np
         >>> nf = generate_data(5,5, seed=1)
-
+        >>>
         >>> # define a custom user function
+        >>> # reduce will return a NestedFrame with two columns
         >>> def example_func(base_col, nested_col):
-        >>>    '''reduce will return a NestedFrame with two columns'''
-        >>>    return {"mean": np.mean(nested_col),
-        ...            "mean_minus_base": np.mean(nested_col) - base_col}
-
+        ...     return {
+        ...         "mean": np.mean(nested_col),
+        ...         "mean_minus_base": np.mean(nested_col) - base_col,
+        ...     }
+        >>>
         >>> # apply the function
         >>> nf.reduce(example_func, "a", "nested.t")
                 mean  mean_minus_base
@@ -1038,8 +1052,8 @@ def reduce(self, func, *args, infer_nesting=True, **kwargs) -> NestedFrame:  # t
 
         >>> # define a custom user function that returns nested structure
         >>> def example_func(base_col1, base_col2, nested_col):
-        >>>    '''reduce will return a NestedFrame with nested structure'''
-        >>>    return {"offsets.t_a": nested_col - base_col1,
+        ...    '''reduce will return a NestedFrame with nested structure'''
+        ...    return {"offsets.t_a": nested_col - base_col1,
         ...            "offsets.t_b": nested_col - base_col2}
 
         By giving both output columns the prefix "offsets.", we signal
diff --git a/src/nested_pandas/nestedframe/io.py b/src/nested_pandas/nestedframe/io.py
@@ -64,12 +64,12 @@ def read_parquet(
     Simple loading example:
 
     >>> import nested_pandas as npd
-    >>> nf = npd.read_parquet("path/to/file.parquet")
+    >>> nf = npd.read_parquet("path/to/file.parquet")  # doctest: +SKIP
 
     Partial loading:
 
     >>> #Load only the "flux" sub-column of the "nested" column
-    >>> nf = npd.read_parquet("path/to/file.parquet", columns=["a", "nested.flux"])
+    >>> nf = npd.read_parquet("path/to/file.parquet", columns=["a", "nested.flux"])  # doctest: +SKIP
     """
 
     # Type convergence for reject_nesting
diff --git a/src/nested_pandas/series/accessor.py b/src/nested_pandas/series/accessor.py
@@ -258,7 +258,7 @@ def with_list_field(self, field: str, value: ArrayLike) -> pd.Series:
         ...                                                 [["g","g"],
         ...                                                  ["r","r"]])
         >>> # Look at one row of the series
-        >>> nested_with_avg[0]
+        >>> nf_new_band[0]
                   t       flux band new_band
         0  2.935118  39.676747    g        g
         1  3.725204  41.919451    r        g
diff --git a/src/nested_pandas/series/dtype.py b/src/nested_pandas/series/dtype.py
@@ -56,6 +56,9 @@ def name(self) -> str:
         fields = ", ".join([f"{field}: [{dtype!s}]" for field, dtype in nice_dtypes.items()])
         return f"nested<{fields}>"
 
+    def __repr__(self) -> str:
+        return self.name
+
     @classmethod
     def construct_array_type(cls) -> Type[ExtensionArray]:
         """Corresponding array type, always NestedExtensionArray"""
diff --git a/src/nested_pandas/utils/utils.py b/src/nested_pandas/utils/utils.py
@@ -27,8 +27,12 @@ def count_nested(df, nested, by=None, join=True) -> NestedFrame:
     Examples
     --------
 
+    >>> import pandas as pd
+    >>> # Show all columns
+    >>> pd.set_option("display.width", 200)
+    >>> pd.set_option("display.max_columns", None)
     >>> from nested_pandas.datasets.generation import generate_data
-    >>> nf = generate_data(5,10,seed=1)
+    >>> nf = generate_data(5, 10, seed=1)
 
     >>> from nested_pandas.utils import count_nested
     >>> count_nested(nf, "nested")

Original file line number	Diff line number	Diff line change
`@@ -97,7 +97,7 @@ repos:`
`97`	`97`	`"-d", # Flag for cached environment and doctrees`
`98`	`98`	`"./docs/_build/doctrees", # Directory`
`99`	`99`	`"-D", # Flag to override settings in conf.py`
`100`		`- "exclude_patterns=notebooks/*", # Exclude our notebooks from pre-commit`
	`100`	`+ "exclude_patterns=notebooks/*,_build", # Exclude notebooks and build dir from pre-commit`
`101`	`101`	`]`
`102`	`102`	`# Run unit tests, verify that they pass. Note that coverage is run against`
`103`	`103`	`# the ./src directory here because that is what will be committed. In the`