Skip to content

Commit 614f6de

Browse files
authored
Merge pull request #26 from kthyng/improvements
The accessor can now return indices, and some guess_regex terms can now match in the middle of a string.
2 parents 83208a6 + 0b991b4 commit 614f6de

File tree

3 files changed

+83
-34
lines changed

3 files changed

+83
-34
lines changed

cf_pandas/accessor.py

Lines changed: 54 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -58,17 +58,20 @@ class CFAccessor:
5858
"""Dataframe accessor analogous to cf-xarray accessor."""
5959

6060
def __init__(self, pandas_obj):
61-
self._validate(pandas_obj)
61+
# don't automatically validate but can when needed
62+
# self._validate(pandas_obj)
6263
self._obj = pandas_obj
6364

64-
@staticmethod
65-
def _validate(obj):
65+
# @staticmethod
66+
def _validate(self):
6667
"""what is necessary for basic use."""
6768

6869
# verify that necessary keys are present. Z would also be nice but might be missing.
6970
# but don't use the accessor to check
7071
keys = ["T", "longitude", "latitude"]
71-
missing_keys = [key for key in keys if len(_get_axis_coord(obj, key)) == 0]
72+
missing_keys = [
73+
key for key in keys if len(_get_axis_coord(self._obj, key)) == 0
74+
]
7275
if len(missing_keys) > 0:
7376
raise AttributeError(
7477
f'{"longitude", "latitude", "time"} must be identifiable in DataFrame but {missing_keys} are missing.'
@@ -110,9 +113,12 @@ def __getitem__(self, key: str) -> Union[pd.Series, pd.DataFrame]:
110113
else:
111114
col_names = _get_custom_criteria(self._obj, key)
112115

113-
# return series
114-
if len(col_names) == 1:
116+
# return series for column
117+
if len(col_names) == 1 and col_names[0] in self._obj.columns:
115118
return self._obj[col_names[0]]
119+
# return index
120+
elif len(col_names) == 1 and col_names[0] in self._obj.index.names:
121+
return self._obj.index.get_level_values(col_names[0])
116122
# return DataFrame
117123
elif len(col_names) > 1:
118124
return self._obj[col_names]
@@ -248,6 +254,32 @@ def custom_keys(self):
248254

249255
return vardict
250256

257+
@property
258+
def axes_cols(self) -> List[str]:
259+
"""
260+
Property that returns a list of column names from the axes mapping.
261+
262+
Returns
263+
-------
264+
list
265+
Variable names that are the column names which represent axes.
266+
"""
267+
268+
return list(itertools.chain(*[*self.axes.values()]))
269+
270+
@property
271+
def coordinates_cols(self) -> List[str]:
272+
"""
273+
Property that returns a list of column names from the coordinates mapping.
274+
275+
Returns
276+
-------
277+
list
278+
Variable names that are the column names which represent coordinates.
279+
"""
280+
281+
return list(itertools.chain(*[*self.coordinates.values()]))
282+
251283
@property
252284
def standard_names(self):
253285
"""
@@ -313,26 +345,14 @@ def _get_axis_coord(obj: Union[DataFrame, Series], key: str) -> list:
313345
f"cf_xarray did not understand key {key!r}. Expected one of {valid_keys!r}"
314346
)
315347

316-
# search_in = set()
317-
# attrs_or_encoding = ChainMap(obj.attrs, obj.encoding)
318-
# coordinates = attrs_or_encoding.get("coordinates", None)
319-
320-
# # Handles case where the coordinates attribute is None
321-
# # This is used to tell xarray to not write a coordinates attribute
322-
# if coordinates:
323-
# search_in.update(coordinates.split(" "))
324-
# if not search_in:
325-
# search_in = set(obj.coords)
326-
327-
# # maybe only do this for key in _AXIS_NAMES?
328-
# search_in.update(obj.indexes)
329-
330-
# search_in = search_in & set(obj.coords)
348+
# loop over column names and index names
331349
results: set = set()
332-
for col in obj.columns:
333-
# var = obj.coords[coord]
350+
cols_and_indices = list(obj.columns)
351+
cols_and_indices += obj.index.names
352+
# remove None if in names from index
353+
cols_and_indices = [name for name in cols_and_indices if name is not None]
354+
for col in cols_and_indices:
334355
if key in coordinate_criteria:
335-
# import pdb; pdb.set_trace()
336356
for criterion, expected in coordinate_criteria[key].items():
337357
# allow for the column header having a space in it that separate
338358
# the name from the units, for example
@@ -350,14 +370,19 @@ def _get_axis_coord(obj: Union[DataFrame, Series], key: str) -> list:
350370
# units = getattr(col.data, "units", None)
351371
# if units in expected:
352372
# results.update((col,))
353-
354373
# also use the guess_regex approach by default, but only if no results so far
355374
# this takes the logic from cf-xarray guess_coord_axis
356375
if len(results) == 0:
357-
if key in ("T", "time") and _is_datetime_like(obj[col]):
358-
results.update((col,))
359-
continue # prevent second detection
360-
376+
if col in obj.columns:
377+
if key in ("T", "time") and _is_datetime_like(obj[col]):
378+
results.update((col,))
379+
continue # prevent second detection
380+
elif col in obj.index.names:
381+
if key in ("T", "time") and _is_datetime_like(
382+
obj.index.get_level_values(col)
383+
):
384+
results.update((col,))
385+
continue # prevent second detection
361386
pattern = guess_regex[key]
362387
if pattern.match(col.lower()):
363388
results.update((col,))

cf_pandas/criteria.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -103,15 +103,22 @@
103103
coordinate_criteria["X"]["long_name"] += ("cell index along first dimension",)
104104
coordinate_criteria["Y"]["long_name"] += ("cell index along second dimension",)
105105

106+
# changes allow for the pattern string to not be at the start of the comparison string
107+
# like (?=.*lon)
106108
guess_regex = {
107-
"time": re.compile("\\bt\\b|(time|min|hour|day|week|month|year)[0-9]*"),
109+
"time": re.compile("\\bt\\b|(?=.*time|min|hour|day|week|month|year)[0-9]*"),
110+
# "time": re.compile("\\bt\\b|(time|min|hour|day|week|month|year)[0-9]*"),
108111
"Z": re.compile(
109-
"(z|nav_lev|gdep|lv_|[o]*lev|bottom_top|sigma|h(ei)?ght|altitude|depth|"
112+
"(z|nav_lev|gdep|lv_|[o]*lev|bottom_top|sigma|(?=.*dbars)|h(ei)?ght|altitude|depth|"
110113
"isobaric|pres|isotherm)[a-z_]*[0-9]*"
111114
),
115+
# "Z": re.compile(
116+
# "(z|nav_lev|gdep|lv_|[o]*lev|bottom_top|sigma|h(ei)?ght|altitude|depth|"
117+
# "isobaric|pres|isotherm)[a-z_]*[0-9]*"
118+
# ),
112119
"Y": re.compile("y|j|nlat|nj"),
113-
"latitude": re.compile("y?(nav_lat|lat|gphi)[a-z0-9]*"),
120+
"latitude": re.compile("y?(nav_lat|(?=.*lat)|gphi)[a-z0-9]*"),
114121
"X": re.compile("x|i|nlon|ni"),
115-
"longitude": re.compile("x?(nav_lon|lon|glam)[a-z0-9]*"),
122+
"longitude": re.compile("x?(nav_lon|(?=.*lon)|glam)[a-z0-9]*"),
116123
}
117124
guess_regex["T"] = guess_regex["time"]

tests/test_accessor.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ def test_validate():
3333
]
3434
)
3535
with pytest.raises(AttributeError):
36-
df.cf.keys()
36+
df.cf._validate()
3737

3838

3939
def test_match_criteria_key_accessor():
@@ -128,3 +128,20 @@ def test_get_by_guess_regex():
128128
assert df.cf["longitude"].name == "lon"
129129
assert df.cf["latitude"].name == "lat"
130130
assert df.cf["time"].name == "min"
131+
132+
df = pd.DataFrame(columns=["blah_lon", "table_lat"])
133+
assert df.cf["longitude"].name == "blah_lon"
134+
assert df.cf["latitude"].name == "table_lat"
135+
136+
137+
def test_index():
138+
"""Test when time is in index."""
139+
df = pd.DataFrame(index=["m_time"])
140+
df.index.rename("m_time", inplace=True)
141+
assert df.cf["T"].name == "m_time"
142+
143+
144+
def test_cols():
145+
df = pd.DataFrame(columns=["m_time", "lon", "lat", "temp"])
146+
assert df.cf.axes_cols == ["m_time"]
147+
assert sorted(df.cf.coordinates_cols) == ["lat", "lon", "m_time"]

0 commit comments

Comments
 (0)