From b288d0ee46ba3a3845a93ddb0433898fe2212435 Mon Sep 17 00:00:00 2001 From: Kristen Thyng Date: Fri, 28 Apr 2023 15:19:34 -0500 Subject: [PATCH 1/7] accessor changes: * checks both column names and index names for matching with regular expressions * for axis and coordinates as well as for custom names/vocabs * Some improvements in the handling of guessing when matches are not found --- cf_pandas/accessor.py | 60 ++++++++++++++++++++++++++++--------------- 1 file changed, 39 insertions(+), 21 deletions(-) diff --git a/cf_pandas/accessor.py b/cf_pandas/accessor.py index 1dabf01..1407c99 100644 --- a/cf_pandas/accessor.py +++ b/cf_pandas/accessor.py @@ -58,7 +58,8 @@ class CFAccessor: """Dataframe accessor analogous to cf-xarray accessor.""" def __init__(self, pandas_obj): - self._validate(pandas_obj) + # don't automatically validate but can when needed + # self._validate(pandas_obj) self._obj = pandas_obj @staticmethod @@ -110,9 +111,12 @@ def __getitem__(self, key: str) -> Union[pd.Series, pd.DataFrame]: else: col_names = _get_custom_criteria(self._obj, key) - # return series - if len(col_names) == 1: + # return series for column + if len(col_names) == 1 and col_names[0] in self._obj.columns: return self._obj[col_names[0]] + # return index + elif len(col_names) == 1 and col_names[0] in self._obj.index.names: + return self._obj.index.get_level_values(col_names[0]) # return DataFrame elif len(col_names) > 1: return self._obj[col_names] @@ -247,6 +251,32 @@ def custom_keys(self): } return vardict + + @property + def axes_cols(self) -> List[str]: + """ + Property that returns a list of column names from the axes mapping. + + Returns + ------- + list + Variable names that are the column names which represent axes. + """ + + return list(itertools.chain(*[*self.axes.values()])) + + @property + def coordinates_cols(self) -> List[str]: + """ + Property that returns a list of column names from the coordinates mapping. 
+ + Returns + ------- + list + Variable names that are the column names which represent coordinates. + """ + + return list(itertools.chain(*[*self.coordinates.values()])) @property def standard_names(self): @@ -313,26 +343,14 @@ def _get_axis_coord(obj: Union[DataFrame, Series], key: str) -> list: f"cf_xarray did not understand key {key!r}. Expected one of {valid_keys!r}" ) - # search_in = set() - # attrs_or_encoding = ChainMap(obj.attrs, obj.encoding) - # coordinates = attrs_or_encoding.get("coordinates", None) - - # # Handles case where the coordinates attribute is None - # # This is used to tell xarray to not write a coordinates attribute - # if coordinates: - # search_in.update(coordinates.split(" ")) - # if not search_in: - # search_in = set(obj.coords) - - # # maybe only do this for key in _AXIS_NAMES? - # search_in.update(obj.indexes) - - # search_in = search_in & set(obj.coords) + # loop over column names and index names results: set = set() - for col in obj.columns: - # var = obj.coords[coord] + cols_and_indices = list(obj.columns) + cols_and_indices += obj.index.names + # remove None if in names from index + cols_and_indices = [name for name in cols_and_indices if name is not None] + for col in cols_and_indices: if key in coordinate_criteria: - # import pdb; pdb.set_trace() for criterion, expected in coordinate_criteria[key].items(): # allow for the column header having a space in it that separate # the name from the units, for example From ec8abb7fa9b6df2ff6ac2beb7eaf4b3a8a90865d Mon Sep 17 00:00:00 2001 From: Kristen Thyng Date: Fri, 28 Apr 2023 15:22:16 -0500 Subject: [PATCH 2/7] updates to guess_regex: * for Z, "dbars" no longer has to be at the start of the string to match * for time, "time" no longer has to be at the start of the string to match * for latitude and longitude, "lat" and "lon", respectively, no longer have to be at the start of the string to match. 
This should probably be a change for all such matches, but I want to discuss with cf-xarray people. --- cf_pandas/criteria.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/cf_pandas/criteria.py b/cf_pandas/criteria.py index 5b30f35..be00bb7 100644 --- a/cf_pandas/criteria.py +++ b/cf_pandas/criteria.py @@ -103,15 +103,24 @@ coordinate_criteria["X"]["long_name"] += ("cell index along first dimension",) coordinate_criteria["Y"]["long_name"] += ("cell index along second dimension",) +# changes allow for the pattern string to not be at the start of the comparison string +# like (?=.*lon) guess_regex = { - "time": re.compile("\\bt\\b|(time|min|hour|day|week|month|year)[0-9]*"), + "time": re.compile("\\bt\\b|(?=.*time|min|hour|day|week|month|year)[0-9]*"), + # "time": re.compile("\\bt\\b|(time|min|hour|day|week|month|year)[0-9]*"), "Z": re.compile( - "(z|nav_lev|gdep|lv_|[o]*lev|bottom_top|sigma|h(ei)?ght|altitude|depth|" + "(z|nav_lev|gdep|lv_|[o]*lev|bottom_top|sigma|(?=.*dbars)|h(ei)?ght|altitude|depth|" "isobaric|pres|isotherm)[a-z_]*[0-9]*" ), + # "Z": re.compile( + # "(z|nav_lev|gdep|lv_|[o]*lev|bottom_top|sigma|h(ei)?ght|altitude|depth|" + # "isobaric|pres|isotherm)[a-z_]*[0-9]*" + # ), "Y": re.compile("y|j|nlat|nj"), - "latitude": re.compile("y?(nav_lat|lat|gphi)[a-z0-9]*"), + "latitude": re.compile("y?(nav_lat|(?=.*lat)|gphi)[a-z0-9]*"), + # "latitude": re.compile("(?i)y?(?=.*lat)[a-z0-9]*"), "X": re.compile("x|i|nlon|ni"), - "longitude": re.compile("x?(nav_lon|lon|glam)[a-z0-9]*"), + # "longitude": re.compile("(?i)x?(?=.*lon)[a-z0-9]*"), + "longitude": re.compile("x?(nav_lon|(?=.*lon)|glam)[a-z0-9]*"), } guess_regex["T"] = guess_regex["time"] From efd2dcb8805f9a23a3e7c5ca285975fd8d3f0e71 Mon Sep 17 00:00:00 2001 From: Kristen Thyng Date: Fri, 28 Apr 2023 15:22:41 -0500 Subject: [PATCH 3/7] removed some comments --- cf_pandas/criteria.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/cf_pandas/criteria.py 
b/cf_pandas/criteria.py index be00bb7..bf8226a 100644 --- a/cf_pandas/criteria.py +++ b/cf_pandas/criteria.py @@ -118,9 +118,7 @@ # ), "Y": re.compile("y|j|nlat|nj"), "latitude": re.compile("y?(nav_lat|(?=.*lat)|gphi)[a-z0-9]*"), - # "latitude": re.compile("(?i)y?(?=.*lat)[a-z0-9]*"), "X": re.compile("x|i|nlon|ni"), - # "longitude": re.compile("(?i)x?(?=.*lon)[a-z0-9]*"), "longitude": re.compile("x?(nav_lon|(?=.*lon)|glam)[a-z0-9]*"), } guess_regex["T"] = guess_regex["time"] From 859f08440ff86dc8d5e7b21971d077d316c76399 Mon Sep 17 00:00:00 2001 From: Kristen Thyng Date: Fri, 28 Apr 2023 15:23:53 -0500 Subject: [PATCH 4/7] linting --- cf_pandas/accessor.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cf_pandas/accessor.py b/cf_pandas/accessor.py index 1407c99..da0190c 100644 --- a/cf_pandas/accessor.py +++ b/cf_pandas/accessor.py @@ -114,7 +114,7 @@ def __getitem__(self, key: str) -> Union[pd.Series, pd.DataFrame]: # return series for column if len(col_names) == 1 and col_names[0] in self._obj.columns: return self._obj[col_names[0]] - # return index + # return index elif len(col_names) == 1 and col_names[0] in self._obj.index.names: return self._obj.index.get_level_values(col_names[0]) # return DataFrame @@ -251,7 +251,7 @@ def custom_keys(self): } return vardict - + @property def axes_cols(self) -> List[str]: """ @@ -262,9 +262,9 @@ def axes_cols(self) -> List[str]: list Variable names that are the column names which represent axes. """ - + return list(itertools.chain(*[*self.axes.values()])) - + @property def coordinates_cols(self) -> List[str]: """ @@ -275,7 +275,7 @@ def coordinates_cols(self) -> List[str]: list Variable names that are the column names which represent coordinates. 
""" - + return list(itertools.chain(*[*self.coordinates.values()])) @property From 0b2d38344239a2d83904d1bb41b1e4765c5b75a0 Mon Sep 17 00:00:00 2001 From: Kristen Thyng Date: Fri, 28 Apr 2023 15:30:52 -0500 Subject: [PATCH 5/7] updated test for validation --- cf_pandas/accessor.py | 6 +++--- tests/test_accessor.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cf_pandas/accessor.py b/cf_pandas/accessor.py index da0190c..0577f39 100644 --- a/cf_pandas/accessor.py +++ b/cf_pandas/accessor.py @@ -62,14 +62,14 @@ def __init__(self, pandas_obj): # self._validate(pandas_obj) self._obj = pandas_obj - @staticmethod - def _validate(obj): + # @staticmethod + def _validate(self): """what is necessary for basic use.""" # verify that necessary keys are present. Z would also be nice but might be missing. # but don't use the accessor to check keys = ["T", "longitude", "latitude"] - missing_keys = [key for key in keys if len(_get_axis_coord(obj, key)) == 0] + missing_keys = [key for key in keys if len(_get_axis_coord(self._obj, key)) == 0] if len(missing_keys) > 0: raise AttributeError( f'{"longitude", "latitude", "time"} must be identifiable in DataFrame but {missing_keys} are missing.' 
diff --git a/tests/test_accessor.py b/tests/test_accessor.py index 6ce4f67..360b564 100644 --- a/tests/test_accessor.py +++ b/tests/test_accessor.py @@ -33,7 +33,7 @@ def test_validate(): ] ) with pytest.raises(AttributeError): - df.cf.keys() + df.cf._validate() def test_match_criteria_key_accessor(): From c5dec821a6a4ed477becb196d85e165e21cd6246 Mon Sep 17 00:00:00 2001 From: Kristen Thyng Date: Fri, 28 Apr 2023 15:51:15 -0500 Subject: [PATCH 6/7] test and fix for index being guessed --- cf_pandas/accessor.py | 19 +++++++++++++------ tests/test_accessor.py | 11 +++++++++++ 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/cf_pandas/accessor.py b/cf_pandas/accessor.py index 0577f39..d2cfdd8 100644 --- a/cf_pandas/accessor.py +++ b/cf_pandas/accessor.py @@ -69,7 +69,9 @@ def _validate(self): # verify that necessary keys are present. Z would also be nice but might be missing. # but don't use the accessor to check keys = ["T", "longitude", "latitude"] - missing_keys = [key for key in keys if len(_get_axis_coord(self._obj, key)) == 0] + missing_keys = [ + key for key in keys if len(_get_axis_coord(self._obj, key)) == 0 + ] if len(missing_keys) > 0: raise AttributeError( f'{"longitude", "latitude", "time"} must be identifiable in DataFrame but {missing_keys} are missing.' 
@@ -368,14 +370,19 @@ def _get_axis_coord(obj: Union[DataFrame, Series], key: str) -> list: # units = getattr(col.data, "units", None) # if units in expected: # results.update((col,)) - # also use the guess_regex approach by default, but only if no results so far # this takes the logic from cf-xarray guess_coord_axis if len(results) == 0: - if key in ("T", "time") and _is_datetime_like(obj[col]): - results.update((col,)) - continue # prevent second detection - + if col in obj.columns: + if key in ("T", "time") and _is_datetime_like(obj[col]): + results.update((col,)) + continue # prevent second detection + elif col in obj.index.names: + if key in ("T", "time") and _is_datetime_like( + obj.index.get_level_values(col) + ): + results.update((col,)) + continue # prevent second detection pattern = guess_regex[key] if pattern.match(col.lower()): results.update((col,)) diff --git a/tests/test_accessor.py b/tests/test_accessor.py index 360b564..4351b30 100644 --- a/tests/test_accessor.py +++ b/tests/test_accessor.py @@ -128,3 +128,14 @@ def test_get_by_guess_regex(): assert df.cf["longitude"].name == "lon" assert df.cf["latitude"].name == "lat" assert df.cf["time"].name == "min" + + df = pd.DataFrame(columns=["blah_lon", "table_lat"]) + assert df.cf["longitude"].name == "blah_lon" + assert df.cf["latitude"].name == "table_lat" + + +def test_index(): + """Test when time is in index.""" + df = pd.DataFrame(index=["m_time"]) + df.index.rename("m_time", inplace=True) + assert df.cf["T"].name == "m_time" From 0b991b44c5da62388d4b99e73385fd512e23ce06 Mon Sep 17 00:00:00 2001 From: Kristen Thyng Date: Fri, 28 Apr 2023 15:56:56 -0500 Subject: [PATCH 7/7] added another test --- tests/test_accessor.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/test_accessor.py b/tests/test_accessor.py index 4351b30..990db14 100644 --- a/tests/test_accessor.py +++ b/tests/test_accessor.py @@ -139,3 +139,9 @@ def test_index(): df = pd.DataFrame(index=["m_time"]) 
df.index.rename("m_time", inplace=True) assert df.cf["T"].name == "m_time" + + +def test_cols(): + df = pd.DataFrame(columns=["m_time", "lon", "lat", "temp"]) + assert df.cf.axes_cols == ["m_time"] + assert sorted(df.cf.coordinates_cols) == ["lat", "lon", "m_time"]