API: timestamp resolution inference: default to microseconds when possible #62031

Draft · wants to merge 13 commits into base: main
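
In short: when inferring a resolution, inputs that carry no sub-second information (date objects, date- or second-level strings, `%f` fractions of up to six digits) now default to microseconds ("us") instead of seconds ("s"), while inputs that genuinely need finer precision still get "ns". A rough sketch of the intended user-facing behavior on this branch (outputs are what the change is expected to produce, not verified here):

>>> import datetime
>>> import pandas as pd
>>> pd.Timestamp("2011-01-01").unit                          # previously inferred "s"
'us'
>>> pd.Timestamp(datetime.date(2011, 1, 1)).unit
'us'
>>> pd.to_datetime(["2011-01-01 00:00:00.123456789"]).unit   # sub-"us" input stays "ns"
'ns'
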
2 changes: 1 addition & 1 deletion pandas/_libs/tslib.pyx
@@ -355,7 +355,7 @@ cpdef array_to_datetime(
                iresult[i] = parse_pydatetime(val, &dts, creso=creso)

            elif PyDate_Check(val):
-                item_reso = NPY_DATETIMEUNIT.NPY_FR_s
+                item_reso = NPY_DATETIMEUNIT.NPY_FR_us
                state.update_creso(item_reso)
                if infer_reso:
                    creso = state.creso

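A quick illustration of this hunk (expected output on this branch, not verified): plain `datetime.date` values flowing through `array_to_datetime`, e.g. via `pd.to_datetime`, now infer a microsecond unit instead of seconds.

>>> import datetime
>>> import pandas as pd
>>> pd.to_datetime([datetime.date(2023, 1, 1), datetime.date(2023, 1, 2)]).unit
'us'
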
3 changes: 2 additions & 1 deletion pandas/_libs/tslibs/conversion.pxd
@@ -35,7 +35,8 @@ cdef _TSObject convert_to_tsobject(object ts, tzinfo tz, str unit,

cdef _TSObject convert_datetime_to_tsobject(datetime ts, tzinfo tz,
                                            int32_t nanos=*,
-                                            NPY_DATETIMEUNIT reso=*)
+                                            NPY_DATETIMEUNIT reso=*,
+                                            NPY_DATETIMEUNIT best_reso=*)

cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz,
                                       bint dayfirst=*,

23 changes: 16 additions & 7 deletions pandas/_libs/tslibs/conversion.pyx
@@ -33,6 +33,7 @@ from pandas._libs.missing cimport checknull_with_nat_and_na
from pandas._libs.tslibs.dtypes cimport (
    abbrev_to_npy_unit,
    get_supported_reso,
+    get_supported_reso_for_dts,
    npy_unit_to_attrname,
    periods_per_second,
)
@@ -422,10 +423,9 @@ cdef _TSObject convert_to_tsobject(object ts, tzinfo tz, str unit,
        return convert_datetime_to_tsobject(ts, tz, nanos, reso=reso)
    elif PyDate_Check(ts):
        # Keep the converter same as PyDateTime's
-        # For date object we give the lowest supported resolution, i.e. "s"
        ts = datetime.combine(ts, time())
        return convert_datetime_to_tsobject(
-            ts, tz, nanos=0, reso=NPY_DATETIMEUNIT.NPY_FR_s
+            ts, tz, nanos=0, reso=NPY_DATETIMEUNIT.NPY_FR_us
        )
    else:
        from .period import Period
@@ -453,7 +453,8 @@ cdef _TSObject convert_datetime_to_tsobject(
    datetime ts,
    tzinfo tz,
    int32_t nanos=0,
-    NPY_DATETIMEUNIT reso=NPY_FR_ns,
+    NPY_DATETIMEUNIT reso=NPY_DATETIMEUNIT.NPY_FR_GENERIC,
+    NPY_DATETIMEUNIT best_reso=NPY_DATETIMEUNIT.NPY_FR_GENERIC,
):
    """
    Convert a datetime (or Timestamp) input `ts`, along with optional timezone
@@ -480,7 +481,6 @@
        _TSObject obj = _TSObject()
        int64_t pps

-    obj.creso = reso
    obj.fold = ts.fold
    if tz is not None:

@@ -507,6 +507,10 @@
    if nanos:
        obj.dts.ps = nanos * 1000

+    if reso == NPY_DATETIMEUNIT.NPY_FR_GENERIC:
+        reso = get_supported_reso_for_dts(best_reso, &obj.dts)
+    obj.creso = reso
+
    try:
        obj.value = npy_datetimestruct_to_datetime(reso, &obj.dts)
    except OverflowError as err:
@@ -622,7 +626,7 @@ cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz,
            &out_tzoffset, False
        )
        if not string_to_dts_failed:
-            reso = get_supported_reso(out_bestunit)
+            reso = get_supported_reso_for_dts(out_bestunit, &dts)
            check_dts_bounds(&dts, reso)
            obj = _TSObject()
            obj.dts = dts
@@ -660,8 +664,13 @@
            out_bestunit=&out_bestunit,
            nanos=&nanos,
        )
-        reso = get_supported_reso(out_bestunit)
-        return convert_datetime_to_tsobject(dt, tz, nanos=nanos, reso=reso)
+        return convert_datetime_to_tsobject(
+            dt,
+            tz,
+            nanos=nanos,
+            reso=NPY_DATETIMEUNIT.NPY_FR_GENERIC,
+            best_reso=out_bestunit
+        )


cdef check_overflows(_TSObject obj, NPY_DATETIMEUNIT reso=NPY_FR_ns):

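Taken together, these changes mean `convert_datetime_to_tsobject` now derives the resolution from the parsed best unit and the value itself whenever no explicit resolution is passed. A user-level sketch (outputs are the expected behavior on this branch, not verified):

>>> import pandas as pd
>>> pd.Timestamp("2011-01-01 09:00").unit              # minute-level string
'us'
>>> pd.Timestamp("2011-01-01 09:00:00.123").unit       # millisecond digits still fit in "us"
'us'
>>> pd.Timestamp("2011-01-01 09:00:00.123456789").unit
'ns'
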
8 changes: 7 additions & 1 deletion pandas/_libs/tslibs/dtypes.pxd
@@ -1,6 +1,9 @@
from numpy cimport int64_t

-from pandas._libs.tslibs.np_datetime cimport NPY_DATETIMEUNIT
+from pandas._libs.tslibs.np_datetime cimport (
+    NPY_DATETIMEUNIT,
+    npy_datetimestruct,
+)


cdef str npy_unit_to_abbrev(NPY_DATETIMEUNIT unit)
@@ -9,6 +12,9 @@ cdef NPY_DATETIMEUNIT freq_group_code_to_npy_unit(int freq) noexcept nogil
cpdef int64_t periods_per_day(NPY_DATETIMEUNIT reso=*) except? -1
cpdef int64_t periods_per_second(NPY_DATETIMEUNIT reso) except? -1
cdef NPY_DATETIMEUNIT get_supported_reso(NPY_DATETIMEUNIT reso)
+cdef NPY_DATETIMEUNIT get_supported_reso_for_dts(
+    NPY_DATETIMEUNIT reso, npy_datetimestruct* dts
+)
cdef bint is_supported_unit(NPY_DATETIMEUNIT reso)

cdef dict c_OFFSET_TO_PERIOD_FREQSTR

49 changes: 49 additions & 0 deletions pandas/_libs/tslibs/dtypes.pyx
@@ -2,11 +2,21 @@
# originals
from enum import Enum

+import numpy as np
+
+from cpython.object cimport (
+    Py_GE,
+    Py_LE,
+)
+
from pandas._libs.tslibs.ccalendar cimport c_MONTH_NUMBERS
from pandas._libs.tslibs.np_datetime cimport (
    NPY_DATETIMEUNIT,
+    cmp_dtstructs,
    get_conversion_factor,
    import_pandas_datetime,
+    npy_datetimestruct,
+    pandas_datetime_to_datetimestruct,
)

import_pandas_datetime()
@@ -504,6 +514,45 @@ cdef NPY_DATETIMEUNIT get_supported_reso(NPY_DATETIMEUNIT reso):
    return reso


+cdef npy_datetimestruct dts_us_min, dts_us_max
+pandas_datetime_to_datetimestruct(
+    np.iinfo(np.int64).min + 1, NPY_DATETIMEUNIT.NPY_FR_us, &dts_us_min
+)
+pandas_datetime_to_datetimestruct(
+    np.iinfo(np.int64).max, NPY_DATETIMEUNIT.NPY_FR_us, &dts_us_max
[Review comment] util.INT64_MAX
+)
+
+
+cdef NPY_DATETIMEUNIT get_supported_reso_for_dts(
+    NPY_DATETIMEUNIT reso, npy_datetimestruct* dts
+):
+    # Similar to the above, but taking the actual datetime value into account,
+    # defaulting to 'us' if possible.
+    if reso == NPY_DATETIMEUNIT.NPY_FR_GENERIC:
+        return NPY_DATETIMEUNIT.NPY_FR_ns
[Review comment] should this be FR_us?
+    # if dts.ps != 0:
[Review comment] why are the .ps checks not necessary?
+    #     return NPY_DATETIMEUNIT.NPY_FR_ns
+    # elif (
+    #     cmp_dtstructs(dts, &dts_us_min, Py_GE)
+    #     and cmp_dtstructs(dts, &dts_us_max, Py_LE)
+    # ):
+    #     return NPY_DATETIMEUNIT.NPY_FR_us
+    # else:
+    #     return NPY_DATETIMEUNIT.NPY_FR_s
+    if reso < NPY_DATETIMEUNIT.NPY_FR_us:
+        if (
+            cmp_dtstructs(dts, &dts_us_min, Py_GE)
+            and cmp_dtstructs(dts, &dts_us_max, Py_LE)
+        ):
+            return NPY_DATETIMEUNIT.NPY_FR_us
+        else:
+            # TODO still distinguish between ms or s?
+            return NPY_DATETIMEUNIT.NPY_FR_s
+    elif reso > NPY_DATETIMEUNIT.NPY_FR_ns:
+        return NPY_DATETIMEUNIT.NPY_FR_ns
+    return reso
+
+
cdef bint is_supported_unit(NPY_DATETIMEUNIT reso):
    return (
        reso == NPY_DATETIMEUNIT.NPY_FR_ns

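For readers who prefer not to trace the Cython, here is a pure-Python sketch of the selection rule in `get_supported_reso_for_dts` (the function name, the unit list, and the year-based bounds check are illustrative simplifications, not pandas API; the GENERIC branch above is left out):

_UNITS = ["Y", "M", "W", "D", "h", "m", "s", "ms", "us", "ns", "ps", "fs", "as"]

def supported_reso_for_value(best_unit: str, year: int) -> str:
    """Pick a storage unit from the parser's best unit plus the value itself."""
    fineness = _UNITS.index(best_unit)
    if fineness < _UNITS.index("us"):
        # No sub-microsecond detail in the input: default to "us" when the value
        # fits the datetime64[us] range; the Cython code checks the exact bounds
        # derived from INT64_MIN + 1 / INT64_MAX, approximated here by year.
        if abs(year - 1970) < 290_000:
            return "us"
        return "s"  # TODO in the PR: still distinguish "ms" vs "s"?
    if fineness > _UNITS.index("ns"):
        # Best unit finer than nanoseconds (ps/fs/as): cap at "ns".
        return "ns"
    return best_unit

Under this rule, supported_reso_for_value("ms", 2011) gives "us", while supported_reso_for_value("ps", 2011) gives "ns".
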
13 changes: 5 additions & 8 deletions pandas/_libs/tslibs/strptime.pyx
@@ -53,6 +53,7 @@ from pandas._libs.tslibs.conversion cimport (
)
from pandas._libs.tslibs.dtypes cimport (
    get_supported_reso,
+    get_supported_reso_for_dts,
    npy_unit_to_abbrev,
    npy_unit_to_attrname,
)
@@ -421,7 +422,7 @@ def array_strptime(
                continue
            elif PyDate_Check(val):
                state.found_other = True
-                item_reso = NPY_DATETIMEUNIT.NPY_FR_s
+                item_reso = NPY_DATETIMEUNIT.NPY_FR_us
                state.update_creso(item_reso)
                if infer_reso:
                    creso = state.creso
@@ -460,7 +461,7 @@ def array_strptime(
            if string_to_dts_succeeded:
                # No error reported by string_to_dts, pick back up
                # where we left off
-                item_reso = get_supported_reso(out_bestunit)
+                item_reso = get_supported_reso_for_dts(out_bestunit, &dts)
                state.update_creso(item_reso)
                if infer_reso:
                    creso = state.creso
@@ -622,7 +623,7 @@ cdef tzinfo _parse_with_format(
            f"time data \"{val}\" doesn't match format \"{fmt}\""
        )

-    item_reso[0] = NPY_DATETIMEUNIT.NPY_FR_s
+    item_reso[0] = NPY_DATETIMEUNIT.NPY_FR_us

    iso_year = -1
    year = 1900
@@ -710,11 +711,7 @@ cdef tzinfo _parse_with_format(
        elif parse_code == 10:
            # e.g. val='10:10:10.100'; fmt='%H:%M:%S.%f'
            s = found_dict["f"]
-            if len(s) <= 3:
-                item_reso[0] = NPY_DATETIMEUNIT.NPY_FR_ms
-            elif len(s) <= 6:
-                item_reso[0] = NPY_DATETIMEUNIT.NPY_FR_us
-            else:
+            if len(s) > 6:
                item_reso[0] = NPY_FR_ns
            # Pad to always return nanoseconds
            s += "0" * (9 - len(s))

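A sketch of the intended effect on format-based parsing (outputs are what this branch is expected to produce, not verified): with `%f`, up to six fractional digits now stay at the default microsecond unit, and only longer fractions promote to nanoseconds.

>>> import pandas as pd
>>> pd.to_datetime(["10:10:10.100"], format="%H:%M:%S.%f").unit
'us'
>>> pd.to_datetime(["10:10:10.123456789"], format="%H:%M:%S.%f").unit
'ns'
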
2 changes: 1 addition & 1 deletion pandas/conftest.py
@@ -935,7 +935,7 @@ def rand_series_with_duplicate_datetimeindex() -> Series:
        (Period("2012-02-01", freq="D"), "period[D]"),
        (
            Timestamp("2011-01-01", tz="US/Eastern"),
-            DatetimeTZDtype(unit="s", tz="US/Eastern"),
+            DatetimeTZDtype(unit="us", tz="US/Eastern"),
        ),
        (Timedelta(seconds=500), "timedelta64[ns]"),
    ]

2 changes: 1 addition & 1 deletion pandas/core/algorithms.py
@@ -355,7 +355,7 @@ def unique(values):
array([2, 1])

>>> pd.unique(pd.Series([pd.Timestamp("20160101"), pd.Timestamp("20160101")]))
-array(['2016-01-01T00:00:00'], dtype='datetime64[s]')
+array(['2016-01-01T00:00:00.000000'], dtype='datetime64[us]')

>>> pd.unique(
...     pd.Series(

12 changes: 6 additions & 6 deletions pandas/core/arrays/datetimelike.py
@@ -1906,11 +1906,11 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]:

>>> rng_tz.floor("2h", ambiguous=False)
DatetimeIndex(['2021-10-31 02:00:00+01:00'],
-              dtype='datetime64[s, Europe/Amsterdam]', freq=None)
+              dtype='datetime64[us, Europe/Amsterdam]', freq=None)

>>> rng_tz.floor("2h", ambiguous=True)
DatetimeIndex(['2021-10-31 02:00:00+02:00'],
-              dtype='datetime64[s, Europe/Amsterdam]', freq=None)
+              dtype='datetime64[us, Europe/Amsterdam]', freq=None)
"""

_floor_example = """>>> rng.floor('h')
@@ -1933,11 +1933,11 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]:

>>> rng_tz.floor("2h", ambiguous=False)
DatetimeIndex(['2021-10-31 02:00:00+01:00'],
-              dtype='datetime64[s, Europe/Amsterdam]', freq=None)
+              dtype='datetime64[us, Europe/Amsterdam]', freq=None)

>>> rng_tz.floor("2h", ambiguous=True)
DatetimeIndex(['2021-10-31 02:00:00+02:00'],
-              dtype='datetime64[s, Europe/Amsterdam]', freq=None)
+              dtype='datetime64[us, Europe/Amsterdam]', freq=None)
"""

_ceil_example = """>>> rng.ceil('h')
@@ -1960,11 +1960,11 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]:

>>> rng_tz.ceil("h", ambiguous=False)
DatetimeIndex(['2021-10-31 02:00:00+01:00'],
-              dtype='datetime64[s, Europe/Amsterdam]', freq=None)
+              dtype='datetime64[us, Europe/Amsterdam]', freq=None)

>>> rng_tz.ceil("h", ambiguous=True)
DatetimeIndex(['2021-10-31 02:00:00+02:00'],
-              dtype='datetime64[s, Europe/Amsterdam]', freq=None)
+              dtype='datetime64[us, Europe/Amsterdam]', freq=None)
"""

20 changes: 10 additions & 10 deletions pandas/core/arrays/datetimes.py
@@ -220,7 +220,7 @@ class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps): # type: ignore[misc]
... )
<DatetimeArray>
['2023-01-01 00:00:00', '2023-01-02 00:00:00']
-Length: 2, dtype: datetime64[s]
+Length: 2, dtype: datetime64[us]
"""

_typ = "datetimearray"
@@ -614,7 +614,7 @@ def tz(self) -> tzinfo | None:
>>> s
0 2020-01-01 10:00:00+00:00
1 2020-02-01 11:00:00+00:00
-dtype: datetime64[s, UTC]
+dtype: datetime64[us, UTC]
>>> s.dt.tz
datetime.timezone.utc

@@ -1044,7 +1044,7 @@ def tz_localize(
4 2018-10-28 02:30:00+01:00
5 2018-10-28 03:00:00+01:00
6 2018-10-28 03:30:00+01:00
-dtype: datetime64[s, CET]
+dtype: datetime64[us, CET]

In some cases, inferring the DST is impossible. In such cases, you can
pass an ndarray to the ambiguous parameter to set the DST explicitly
@@ -1056,7 +1056,7 @@
0 2018-10-28 01:20:00+02:00
1 2018-10-28 02:36:00+02:00
2 2018-10-28 03:46:00+01:00
-dtype: datetime64[s, CET]
+dtype: datetime64[us, CET]

If the DST transition causes nonexistent times, you can shift these
dates forward or backwards with a timedelta object or `'shift_forward'`
@@ -1439,7 +1439,7 @@ def time(self) -> npt.NDArray[np.object_]:
>>> s
0 2020-01-01 10:00:00+00:00
1 2020-02-01 11:00:00+00:00
-dtype: datetime64[s, UTC]
+dtype: datetime64[us, UTC]
>>> s.dt.time
0 10:00:00
1 11:00:00
@@ -1482,7 +1482,7 @@ def timetz(self) -> npt.NDArray[np.object_]:
>>> s
0 2020-01-01 10:00:00+00:00
1 2020-02-01 11:00:00+00:00
-dtype: datetime64[s, UTC]
+dtype: datetime64[us, UTC]
>>> s.dt.timetz
0 10:00:00+00:00
1 11:00:00+00:00
@@ -1524,7 +1524,7 @@ def date(self) -> npt.NDArray[np.object_]:
>>> s
0 2020-01-01 10:00:00+00:00
1 2020-02-01 11:00:00+00:00
-dtype: datetime64[s, UTC]
+dtype: datetime64[us, UTC]
>>> s.dt.date
0 2020-01-01
1 2020-02-01
@@ -1873,7 +1873,7 @@ def isocalendar(self) -> DataFrame:
>>> s
0 2020-01-01 10:00:00+00:00
1 2020-02-01 11:00:00+00:00
-dtype: datetime64[s, UTC]
+dtype: datetime64[us, UTC]
>>> s.dt.dayofyear
0 1
1 32
@@ -1909,7 +1909,7 @@ def isocalendar(self) -> DataFrame:
>>> s
0 2020-01-01 10:00:00+00:00
1 2020-04-01 11:00:00+00:00
-dtype: datetime64[s, UTC]
+dtype: datetime64[us, UTC]
>>> s.dt.quarter
0 1
1 2
@@ -1945,7 +1945,7 @@ def isocalendar(self) -> DataFrame:
>>> s
0 2020-01-01 10:00:00+00:00
1 2020-02-01 11:00:00+00:00
-dtype: datetime64[s, UTC]
+dtype: datetime64[us, UTC]
>>> s.dt.daysinmonth
0 31
1 29

2 changes: 1 addition & 1 deletion pandas/core/base.py
@@ -1376,7 +1376,7 @@ def factorize(
0 2000-03-11
1 2000-03-12
2 2000-03-13
-dtype: datetime64[s]
+dtype: datetime64[us]

>>> ser.searchsorted('3/14/2000')
np.int64(3)

2 changes: 1 addition & 1 deletion pandas/core/dtypes/missing.py
@@ -150,7 +150,7 @@ def isna(obj: object) -> bool | npt.NDArray[np.bool_] | NDFrame:
>>> index = pd.DatetimeIndex(["2017-07-05", "2017-07-06", None, "2017-07-08"])
>>> index
DatetimeIndex(['2017-07-05', '2017-07-06', 'NaT', '2017-07-08'],
-              dtype='datetime64[s]', freq=None)
+              dtype='datetime64[us]', freq=None)
>>> pd.isna(index)
array([False, False, True, False])
