Implement default time resolution for CF time encoding/decoding #9580

Status: Closed · wants to merge 6 commits
18 changes: 4 additions & 14 deletions xarray/coding/cftime_offsets.py
@@ -64,7 +64,7 @@
 from xarray.core.pdcompat import (
     NoDefault,
     count_not_none,
-    nanosecond_precision_timestamp,
+    default_precision_timestamp,
     no_default,
 )
 from xarray.core.utils import emit_user_level_warning
@@ -83,21 +83,13 @@
 T_FreqStr = TypeVar("T_FreqStr", str, None)


-def _nanosecond_precision_timestamp(*args, **kwargs):
-    # As of pandas version 3.0, pd.to_datetime(Timestamp(...)) will try to
-    # infer the appropriate datetime precision. Until xarray supports
-    # non-nanosecond precision times, we will use this constructor wrapper to
-    # explicitly create nanosecond-precision Timestamp objects.
-    return pd.Timestamp(*args, **kwargs).as_unit("ns")
-
-
 def get_date_type(calendar, use_cftime=True):
     """Return the cftime date type for a given calendar name."""
     if cftime is None:
         raise ImportError("cftime is required for dates with non-standard calendars")
     else:
         if _is_standard_calendar(calendar) and not use_cftime:
-            return _nanosecond_precision_timestamp
+            return default_precision_timestamp

     calendars = {
         "noleap": cftime.DatetimeNoLeap,
@@ -1475,10 +1467,8 @@ def date_range_like(source, calendar, use_cftime=None):
     if is_np_datetime_like(source.dtype):
         # We want to use datetime fields (datetime64 object don't have them)
         source_calendar = "standard"
-        # TODO: the strict enforcement of nanosecond precision Timestamps can be
-        # relaxed when addressing GitHub issue #7493.
-        source_start = nanosecond_precision_timestamp(source_start)
-        source_end = nanosecond_precision_timestamp(source_end)
+        source_start = default_precision_timestamp(source_start)
+        source_end = default_precision_timestamp(source_end)
     else:
         if isinstance(source, CFTimeIndex):
             source_calendar = source.calendar
132 changes: 96 additions & 36 deletions xarray/coding/times.py
@@ -24,7 +24,8 @@
 from xarray.core.common import contains_cftime_datetimes, is_np_datetime_like
 from xarray.core.duck_array_ops import asarray, ravel, reshape
 from xarray.core.formatting import first_n_items, format_timestamp, last_item
-from xarray.core.pdcompat import nanosecond_precision_timestamp
+from xarray.core.options import _get_datetime_resolution
+from xarray.core.pdcompat import default_precision_timestamp
 from xarray.core.utils import emit_user_level_warning
 from xarray.core.variable import Variable
 from xarray.namedarray.parallelcompat import T_ChunkedArray, get_chunked_array_type
@@ -193,9 +194,7 @@ def _unpack_time_units_and_ref_date(units: str) -> tuple[str, pd.Timestamp]:
     # same as _unpack_netcdf_time_units but finalizes ref_date for
     # processing in encode_cf_datetime
     time_units, _ref_date = _unpack_netcdf_time_units(units)
-    # TODO: the strict enforcement of nanosecond precision Timestamps can be
-    # relaxed when addressing GitHub issue #7493.
-    ref_date = nanosecond_precision_timestamp(_ref_date)
+    ref_date = default_precision_timestamp(_ref_date)
     # If the ref_date Timestamp is timezone-aware, convert to UTC and
     # make it timezone-naive (GH 2649).
     if ref_date.tz is not None:
@@ -245,6 +244,25 @@ def _decode_datetime_with_cftime(
     return np.array([], dtype=object)


+def _check_date_for_units_since_refdate(
+    date, unit: str, ref_date: pd.Timestamp
+) -> None:
+    delta = date * np.timedelta64(1, unit)
+    if not np.isnan(delta):
+        # this will raise on dtype overflow for integer dtypes
+        if date.dtype.kind in "iu" and not np.int64(delta) == date:
+            raise OutOfBoundsTimedelta(
+                "DType overflow in Datetime/Timedelta calculation."
+            )
+        # this will raise on overflow
+        ref_date_unit = np.datetime_data(ref_date.asm8)[0]
+        (
+            (ref_date + delta).as_unit(ref_date_unit)
+            if hasattr(ref_date, "as_unit")
+            else (ref_date + delta)._as_unit(ref_date_unit)
+        )
+
+
 def _decode_datetime_with_pandas(
     flat_num_dates: np.ndarray, units: str, calendar: str
 ) -> np.ndarray:
@@ -266,20 +284,56 @@ def _decode_datetime_with_pandas(
     time_units, ref_date_str = _unpack_netcdf_time_units(units)
     time_units = _netcdf_to_numpy_timeunit(time_units)
     try:
         # TODO: the strict enforcement of nanosecond precision Timestamps can be
-        # relaxed when addressing GitHub issue #7493.
-        ref_date = nanosecond_precision_timestamp(ref_date_str)
+        # relaxed to non-nanosecond resolution
+        ref_date = pd.Timestamp(ref_date_str)
+        # strip tz information
+        if ref_date.tz is not None:
+            ref_date = ref_date.tz_convert(None)
+        # get default unit and delta
+        default_unit = _get_datetime_resolution()
+        default_delta = np.timedelta64(1, default_unit).astype("timedelta64[ns]")
+        # get ref_date and time delta
+        ref_date_unit = (
+            ref_date.unit
+            if hasattr(ref_date, "unit")
+            else np.datetime_data(ref_date.asm8)[0]
+        )
+        ref_date_delta = np.timedelta64(1, ref_date_unit).astype("timedelta64[ns]")
+        time_delta = np.timedelta64(1, time_units).astype("timedelta64[ns]")
Member:

Have you considered the case where flat_num_dates has a floating point dtype? In that situation one could end up needing finer precision than the time_units specified in the encoding.

But maybe another question at this stage is whether we would want the encoding information (reference date or time units) to affect the datetime precision we decode to—currently it appears one could end up with non-default precision if the time units in the file specified something finer than the user-specified precision via set_options, which could be surprising.

In the long term, however, I agree that it would be nice to have some way of inferring this automatically.
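
A quick illustration of that floating-point concern (a hedged aside, not part of the PR; plain NumPy with made-up values):

```python
import numpy as np

# Float ordinals under "days since 1970-01-01": 0.5 days is 12 hours, so
# decoding at the encoded "days" unit would silently truncate the offset.
vals = np.array([0.0, 0.5, 1.0])
deltas = vals * np.timedelta64(86_400_000_000, "us")  # days -> microseconds
print(np.datetime64("1970-01-01", "us") + deltas)
# ['1970-01-01T00:00:00.000000' '1970-01-01T12:00:00.000000'
#  '1970-01-02T00:00:00.000000']
```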

Contributor Author:

From my perspective, the decoding process should just do what it is supposed to do, which is decode into the designated datetimes as given by time_unit since reference_date. As our final dtype (np.datetime64) is based on the Unix epoch (1970-01-01, with different possible resolutions), we need to find the fitting resolution:

  • days since 1970-01-01 00:00:00 -> "s" (reference_date takes precedence over time_unit, since it is given at second resolution)
  • seconds since 1970-01-01 -> "s" (time_unit takes precedence over reference_date)
  • milliseconds since 1970-01-01 00:00:00.000000001 -> "ns" (reference_date takes precedence over time_unit)

For now we have a third possible element to consider, the default resolution, which currently is "ns" (or might be changed to something else now).

The currently proposed workflow with the default resolution should eventually no longer be needed, once all the issues have been ironed out. Or we could use something like None to specify automatic inference.

> But maybe another question at this stage is whether we would want the encoding information (reference date or time units) to affect the datetime precision we decode to—currently it appears one could end up with non-default precision if the time units in the file specified something finer than the user-specified precision via set_options, which could be surprising.

Maybe the idea of a default resolution should only come into consideration in the encoding process, when the user does not specify any units?

Contributor Author:

Or no default resolution at all and try to infer everything (if that's possible at all)?

A good example showing the ambiguities between on-disk and in-memory is something like

  • on-disk: [0, 1, 2, 3] with days since 1970-01-01 12:00:00
    will translate to:
  • in-memory: [12, 36, 60, 84] with datetime64[h]
  • in-memory default "s": [43200, 129600, 216000, 302400] with datetime64[s]

A default resolution of seconds ("s") would be no problem for the above, but for this example:

  • on-disk: [0, 1, 2, 3] with seconds since 1970-01-01 00:00:00.001
    will translate to:
  • in-memory: [1, 1001, 2001, 3001] with datetime64[ms]
  • in-memory default "s": [0, 1, 2, 3] with datetime64[s]

we would run into issues. The more I think about this, the more I think inferring as much as possible is the way to go, so that we do not round, cut off, or otherwise introduce time mismatches between on-disk and in-memory values.
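
The two cases can be reproduced with plain NumPy (a minimal sketch of the decode arithmetic, not xarray's code path; the values are the ones from the comment above):

```python
import numpy as np

# Case 1: "days since 1970-01-01 12:00:00" -- hour resolution suffices
ref_h = np.datetime64("1970-01-01T12:00", "h")
print(ref_h + np.array([0, 1, 2, 3]) * np.timedelta64(24, "h"))
# ['1970-01-01T12' '1970-01-02T12' '1970-01-03T12' '1970-01-04T12']

# Case 2: "seconds since 1970-01-01 00:00:00.001" -- needs milliseconds
ref_ms = np.datetime64("1970-01-01T00:00:00.001", "ms")
print(ref_ms + np.array([0, 1, 2, 3]) * np.timedelta64(1000, "ms"))
# ['1970-01-01T00:00:00.001' '1970-01-01T00:00:01.001'
#  '1970-01-01T00:00:02.001' '1970-01-01T00:00:03.001']
# a second-resolution default would round these back to whole seconds
```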

Member (@spencerkclark), Oct 7, 2024:

Indeed, I agree that some kind of inference would be best in the long term; ideally we do not lose precision relative to what was written on disk. It is just somewhat complicated (in the case of floating-point times, one would need to take into account the values themselves in addition to the metadata), but we can try to hash out now what that could look like (or punt and just say floats are always decoded to a user-configurable precision, defaulting to nanosecond).

For context, I interpreted this PR as temporarily aiming to maintain a consistent NumPy datetime precision throughout xarray, while at least allowing for some flexibility (hence the modifications to the casting logic in variable.py to also respect the time_resolution set in set_options), so it was surprising to see one place where datetime precisions other than that specified in set_options could appear.

In your eyes, at what point do we remove the casting logic and fully allow mixed-precision datetime values? Is this merely an issue with the tests at this stage?

Contributor Author:

@spencerkclark Sorry, I went down that rabbit hole over the last few days. I think I'm well on the way to having things sorted out. There are different locations in the code and tests where the gears mesh together.

The overall decoding and encoding logic seems to work in my local feature branch. I'll update this PR when I've fixed all the bits and pieces. Need to dig further now, back into the hole 🐰

Member:

No worries—if at any point you feel it would be helpful to discuss anything in a more interactive way we could try to find a time to meet. Happy to keep things asynchronous / open though too. Thanks again for your work on this!

+        # choose the highest resolution
+        new_time_units = {
+            ref_date_delta: ref_date_unit,
+            time_delta: time_units,
+            default_delta: default_unit,
+        }[min(default_delta, ref_date_delta, time_delta)]
+        # transform to the highest needed resolution
+        # this will raise accordingly
+        ref_date = (
+            ref_date.as_unit(new_time_units)
+            if hasattr(ref_date, "as_unit")
+            else ref_date._as_unit(new_time_units)
+        )
     except ValueError as err:
         # ValueError is raised by pd.Timestamp for non-ISO timestamp
         # strings, in which case we fall back to using cftime
         raise OutOfBoundsDatetime from err

+    dunit = (
+        ref_date.unit
+        if hasattr(ref_date, "unit")
+        else np.datetime_data(ref_date.asm8)[0]
+    )
+
     with warnings.catch_warnings():
         warnings.filterwarnings("ignore", "invalid value encountered", RuntimeWarning)
         if flat_num_dates.size > 0:
             # avoid size 0 datetimes GH1329
-            pd.to_timedelta(flat_num_dates.min(), time_units) + ref_date
-            pd.to_timedelta(flat_num_dates.max(), time_units) + ref_date
+            _check_date_for_units_since_refdate(
+                flat_num_dates.min(), time_units, ref_date
+            )
+            _check_date_for_units_since_refdate(
+                flat_num_dates.max(), time_units, ref_date
+            )

     # To avoid integer overflow when converting to nanosecond units for integer
     # dtypes smaller than np.int64 cast all integer and unsigned integer dtype
@@ -292,20 +346,25 @@ def _decode_datetime_with_pandas(
     elif flat_num_dates.dtype.kind in "f":
         flat_num_dates = flat_num_dates.astype(np.float64)

-    # Cast input ordinals to integers of nanoseconds because pd.to_timedelta
-    # works much faster when dealing with integers (GH 1399).
-    # properly handle NaN/NaT to prevent casting NaN to int
+    # keep NaT/nan mask
     nan = np.isnan(flat_num_dates) | (flat_num_dates == np.iinfo(np.int64).min)
-    flat_num_dates = flat_num_dates * _NS_PER_TIME_DELTA[time_units]
-    flat_num_dates_ns_int = np.zeros_like(flat_num_dates, dtype=np.int64)
-    flat_num_dates_ns_int[nan] = np.iinfo(np.int64).min
-    flat_num_dates_ns_int[~nan] = flat_num_dates[~nan].astype(np.int64)

-    # Use pd.to_timedelta to safely cast integer values to timedeltas,
-    # and add those to a Timestamp to safely produce a DatetimeIndex. This
-    # ensures that we do not encounter integer overflow at any point in the
-    # process without raising OutOfBoundsDatetime.
-    return (pd.to_timedelta(flat_num_dates_ns_int, "ns") + ref_date).values
+    # in case we need to change the unit, we fix the numbers here
+    # this should be safe, as errors would have been raised above
+    ns_time_unit = _NS_PER_TIME_DELTA[time_units]
+    ns_dunit = _NS_PER_TIME_DELTA[dunit]
+    if flat_num_dates.dtype.kind in "iuf" and (ns_time_unit > ns_dunit):
+        flat_num_dates *= np.int64(ns_time_unit / ns_dunit)
+        time_units = dunit
+
+    # Cast input ordinals to integers and properly handle NaN/NaT
+    # to prevent casting NaN to int
+    flat_num_dates_int = np.zeros_like(flat_num_dates, dtype=np.int64)
+    flat_num_dates_int[nan] = np.iinfo(np.int64).min
+    flat_num_dates_int[~nan] = flat_num_dates[~nan].astype(np.int64)
+
+    # cast to timedelta64[time_units] and add to ref_date
+    return ref_date + flat_num_dates_int.astype(f"timedelta64[{time_units}]")


def decode_cf_datetime(
@@ -370,7 +429,7 @@ def to_timedelta_unboxed(value, **kwargs):

 def to_datetime_unboxed(value, **kwargs):
     result = pd.to_datetime(value, **kwargs).to_numpy()
-    assert result.dtype == "datetime64[ns]"
+    assert result.dtype == f"datetime64[{_get_datetime_resolution()}]"
     return result


@@ -390,7 +449,11 @@ def _unit_timedelta_cftime(units: str) -> timedelta:

 def _unit_timedelta_numpy(units: str) -> np.timedelta64:
     numpy_units = _netcdf_to_numpy_timeunit(units)
-    return np.timedelta64(_NS_PER_TIME_DELTA[numpy_units], "ns")
+    default_unit = _get_datetime_resolution()
+    return np.timedelta64(
+        int(_NS_PER_TIME_DELTA[numpy_units] / _NS_PER_TIME_DELTA[default_unit]),
+        default_unit,
+    )


def _infer_time_units_from_diff(unique_timedeltas) -> str:
Expand All @@ -411,7 +474,10 @@ def _infer_time_units_from_diff(unique_timedeltas) -> str:


def _time_units_to_timedelta64(units: str) -> np.timedelta64:
return np.timedelta64(1, _netcdf_to_numpy_timeunit(units)).astype("timedelta64[ns]")
default_unit = _get_datetime_resolution()
return np.timedelta64(1, _netcdf_to_numpy_timeunit(units)).astype(
f"timedelta64[{default_unit}]"
)


def infer_calendar_name(dates) -> CFCalendar:
@@ -440,13 +506,11 @@ def infer_datetime_units(dates) -> str:
         unique time deltas in `dates`)
     """
     dates = ravel(np.asarray(dates))
-    if np.asarray(dates).dtype == "datetime64[ns]":
+    if np.issubdtype(np.asarray(dates).dtype, "datetime64"):
         dates = to_datetime_unboxed(dates)
         dates = dates[pd.notnull(dates)]
         reference_date = dates[0] if len(dates) > 0 else "1970-01-01"
-        # TODO: the strict enforcement of nanosecond precision Timestamps can be
-        # relaxed when addressing GitHub issue #7493.
-        reference_date = nanosecond_precision_timestamp(reference_date)
+        reference_date = default_precision_timestamp(reference_date)
     else:
         reference_date = dates[0] if len(dates) > 0 else "1970-01-01"
         reference_date = format_cftime_datetime(reference_date)
@@ -479,17 +543,15 @@ def cftime_to_nptime(times, raise_on_invalid: bool = True) -> np.ndarray:
     If raise_on_invalid is True (default), invalid dates trigger a ValueError.
     Otherwise, the invalid element is replaced by np.NaT."""
     times = np.asarray(times)
-    # TODO: the strict enforcement of nanosecond precision datetime values can
-    # be relaxed when addressing GitHub issue #7493.
-    new = np.empty(times.shape, dtype="M8[ns]")
+    new = np.empty(times.shape, dtype=f"M8[{_get_datetime_resolution()}]")
     dt: pd.Timestamp | Literal["NaT"]
     for i, t in np.ndenumerate(times):
         try:
             # Use pandas.Timestamp in place of datetime.datetime, because
             # NumPy casts it safely to np.datetime64[ns] for dates outside
             # 1678 to 2262 (this is not currently the case for
             # datetime.datetime).
-            dt = nanosecond_precision_timestamp(
+            dt = default_precision_timestamp(
                 t.year, t.month, t.day, t.hour, t.minute, t.second, t.microsecond
             )
         except ValueError as e:
@@ -546,10 +608,8 @@ def convert_time_or_go_back(date, date_type):

     This is meant to convert end-of-month dates into a new calendar.
     """
-    # TODO: the strict enforcement of nanosecond precision Timestamps can be
-    # relaxed when addressing GitHub issue #7493.
     if date_type == pd.Timestamp:
-        date_type = nanosecond_precision_timestamp
+        date_type = default_precision_timestamp
     try:
         return date_type(
             date.year,
@@ -757,7 +817,7 @@ def _eagerly_encode_cf_datetime(
         if not _is_standard_calendar(calendar) or dates.dtype.kind == "O":
             # parse with cftime instead
             raise OutOfBoundsDatetime
-        assert dates.dtype == "datetime64[ns]"
+        assert np.issubdtype(dates.dtype, "datetime64")

         time_units, ref_date = _unpack_time_units_and_ref_date(units)
         time_delta = _time_units_to_timedelta64(time_units)
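
The resolution-selection logic in `_decode_datetime_with_pandas` above can be sketched standalone (an illustrative helper, not part of the PR; `pick_decode_resolution` just mirrors the min-over-timedeltas trick):

```python
import numpy as np
import pandas as pd

def pick_decode_resolution(ref_date: pd.Timestamp, time_unit: str, default_unit: str) -> str:
    """Pick the finest of: reference-date unit, encoded time unit, default unit."""
    ref_unit = np.datetime_data(ref_date.asm8)[0]
    # express each candidate as a nanosecond timedelta; the smallest one wins
    candidates = {
        np.timedelta64(1, ref_unit).astype("timedelta64[ns]"): ref_unit,
        np.timedelta64(1, time_unit).astype("timedelta64[ns]"): time_unit,
        np.timedelta64(1, default_unit).astype("timedelta64[ns]"): default_unit,
    }
    return candidates[min(candidates)]

# "milliseconds since 1970-01-01 00:00:00.000000001", default resolution "s":
ref = pd.Timestamp("1970-01-01 00:00:00.000000001")
print(pick_decode_resolution(ref, "ms", "s"))  # "ns": the reference date wins
```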
11 changes: 11 additions & 0 deletions xarray/core/options.py
@@ -32,6 +32,7 @@
     "use_numbagg",
     "use_opt_einsum",
     "use_flox",
+    "time_resolution",
 ]

 class T_Options(TypedDict):
@@ -59,6 +60,7 @@ class T_Options(TypedDict):
     use_flox: bool
     use_numbagg: bool
     use_opt_einsum: bool
+    time_resolution: Literal["s", "ms", "us", "ns"]


 OPTIONS: T_Options = {
@@ -86,10 +88,12 @@ class T_Options(TypedDict):
     "use_flox": True,
     "use_numbagg": True,
     "use_opt_einsum": True,
+    "time_resolution": "ns",
 }

 _JOIN_OPTIONS = frozenset(["inner", "outer", "left", "right", "exact"])
 _DISPLAY_OPTIONS = frozenset(["text", "html"])
+_TIME_RESOLUTION_OPTIONS = frozenset(["s", "ms", "us", "ns"])


def _positive_integer(value: Any) -> bool:
@@ -117,6 +121,7 @@ def _positive_integer(value: Any) -> bool:
     "use_opt_einsum": lambda value: isinstance(value, bool),
     "use_flox": lambda value: isinstance(value, bool),
     "warn_for_unclosed_files": lambda value: isinstance(value, bool),
+    "time_resolution": _TIME_RESOLUTION_OPTIONS.__contains__,
 }


@@ -158,6 +163,10 @@ def _get_keep_attrs(default: bool) -> bool:
     return _get_boolean_with_default("keep_attrs", default)


+def _get_datetime_resolution() -> Literal["s", "ms", "us", "ns"]:
+    return OPTIONS["time_resolution"]
+
+
 class set_options:
     """
     Set options for xarray in a controlled context.
@@ -258,6 +267,8 @@ class set_options:
     warn_for_unclosed_files : bool, default: False
         Whether or not to issue a warning when unclosed files are
         deallocated. This is mostly useful for debugging.
+    time_resolution : {"s", "ms", "us", "ns"}, default: "ns"
+        Time resolution used for CF encoding/decoding.

     Examples
     --------
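
Example use of the new option (a sketch assuming this PR is applied; "example.nc" is a placeholder file with a CF-encoded time coordinate):

```python
import xarray as xr

# Opt in to second resolution for CF decoding within this context only.
with xr.set_options(time_resolution="s"):
    ds = xr.open_dataset("example.nc")
    print(ds["time"].dtype)  # expected: datetime64[s] instead of datetime64[ns]
```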
24 changes: 15 additions & 9 deletions xarray/core/pdcompat.py
@@ -38,8 +38,10 @@
 from enum import Enum
 from typing import Literal

+import numpy as np
 import pandas as pd
-from packaging.version import Version
+
+from xarray.core.options import _get_datetime_resolution


 def count_not_none(*args) -> int:
@@ -73,13 +75,17 @@ def __repr__(self) -> str:
 NoDefault = Literal[_NoDefault.no_default]  # For typing following pandas


-def nanosecond_precision_timestamp(*args, **kwargs) -> pd.Timestamp:
-    """Return a nanosecond-precision Timestamp object.
+def default_precision_timestamp(*args, **kwargs) -> pd.Timestamp:
+    """Return a Timestamp object with the default precision.

-    Note this function should no longer be needed after addressing GitHub issue
-    #7493.
+    Xarray's default is "ns". This can be overridden by setting
+    set_options(time_resolution="us") or any other resolution
+    of {"s", "ms", "us", "ns"}.
     """
-    if Version(pd.__version__) >= Version("2.0.0"):
-        return pd.Timestamp(*args, **kwargs).as_unit("ns")
-    else:
-        return pd.Timestamp(*args, **kwargs)
+    dt = pd.Timestamp(*args, **kwargs)
+    units = ["s", "ms", "us", "ns"]
+    default = _get_datetime_resolution()
+    unit = dt.unit if hasattr(dt, "unit") else np.datetime_data(dt.asm8)[0]
+    if units.index(default) > units.index(unit):
+        dt = dt.as_unit(default)
+    return dt
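
How the promotion rule behaves (a sketch assuming pandas >= 2.0, where Timestamp carries a .unit attribute):

```python
import pandas as pd
import xarray as xr
from xarray.core.pdcompat import default_precision_timestamp

# With the default "ns" resolution, coarser timestamps are promoted ...
print(default_precision_timestamp("2000-01-01").unit)  # "ns"

# ... but a coarser default never downcasts below the parsed precision
with xr.set_options(time_resolution="s"):
    ts = default_precision_timestamp("2000-01-01 00:00:00.123")
    print(ts.unit)  # "ms"
```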