Skip to content

Commit e74f323

Browse files
samukweku, samuel.oranyeli, ericmjl, and pre-commit-ci[bot]
authored
convert_excel_date, convert_matlab_date for Polars (#1365)
Implemented `convert_excel_date` and `convert_matlab_date` for Polars. ------ Co-authored-by: samuel.oranyeli <[email protected]> Co-authored-by: Eric Ma <[email protected]> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 45ad9c0 commit e74f323

7 files changed

+194
-51
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
## [Unreleased]
44

5+
- [ENH] Added `convert_excel_date` and `convert_matlab_date` methods for polars. - Issue #1352 @samukweku
56
- [ENH] Added a `complete` method for polars. - Issue #1352 @samukweku
67
- [ENH] `read_commandline` function now supports polars - Issue #1352
78
- [ENH] Improved performance for non-equi joins when using numba - @samukweku PR #1341

janitor/functions/convert_date.py

+47-39
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,22 @@
1-
import datetime as dt
2-
from typing import Hashable
1+
from typing import Hashable, Union
32

43
import pandas as pd
54
import pandas_flavor as pf
6-
from pandas.api.types import is_numeric_dtype
75
from pandas.errors import OutOfBoundsDatetime
86

9-
from janitor.utils import deprecated_alias
7+
from janitor.utils import deprecated_alias, refactored_function
108

119

1210
@pf.register_dataframe_method
13-
@deprecated_alias(column="column_name")
11+
@deprecated_alias(column="column_names")
1412
def convert_excel_date(
15-
df: pd.DataFrame, column_name: Hashable
13+
df: pd.DataFrame, column_names: Union[Hashable, list]
1614
) -> pd.DataFrame:
1715
"""Convert Excel's serial date format into Python datetime format.
1816
19-
This method mutates the original DataFrame.
17+
This method does not mutate the original DataFrame.
2018
21-
Implementation is also from
19+
Implementation is based on
2220
[Stack Overflow](https://stackoverflow.com/questions/38454403/convert-excel-style-date-with-pandas).
2321
2422
Examples:
@@ -38,40 +36,36 @@ def convert_excel_date(
3836
3937
Args:
4038
df: A pandas DataFrame.
41-
column_name: A column name.
42-
43-
Raises:
44-
ValueError: If there are non numeric values in the column.
39+
column_names: A column name, or a list of column names.
4540
4641
Returns:
4742
A pandas DataFrame with corrected dates.
4843
""" # noqa: E501
4944

50-
if not is_numeric_dtype(df[column_name]):
51-
raise ValueError(
52-
"There are non-numeric values in the column. "
53-
"All values must be numeric."
45+
if not isinstance(column_names, list):
46+
column_names = [column_names]
47+
# https://stackoverflow.com/a/65460255/7175713
48+
dictionary = {
49+
column_name: pd.to_datetime(
50+
df[column_name], unit="D", origin="1899-12-30"
5451
)
52+
for column_name in column_names
53+
}
5554

56-
df[column_name] = pd.TimedeltaIndex(
57-
df[column_name], unit="d"
58-
) + dt.datetime(
59-
1899, 12, 30
60-
) # noqa: W503
61-
return df
55+
return df.assign(**dictionary)
6256

6357

6458
@pf.register_dataframe_method
65-
@deprecated_alias(column="column_name")
59+
@deprecated_alias(column="column_names")
6660
def convert_matlab_date(
67-
df: pd.DataFrame, column_name: Hashable
61+
df: pd.DataFrame, column_names: Union[Hashable, list]
6862
) -> pd.DataFrame:
6963
"""Convert Matlab's serial date number into Python datetime format.
7064
71-
Implementation is also from
65+
Implementation is based on
7266
[Stack Overflow](https://stackoverflow.com/questions/13965740/converting-matlabs-datenum-format-to-python).
7367
74-
This method mutates the original DataFrame.
68+
This method does not mutate the original DataFrame.
7569
7670
Examples:
7771
>>> import pandas as pd
@@ -84,29 +78,38 @@ def convert_matlab_date(
8478
2 737124.498500
8579
3 737124.000000
8680
>>> df.convert_matlab_date('date')
87-
date
88-
0 2018-03-06 00:00:00.000000
89-
1 2018-03-05 19:34:50.563200
90-
2 2018-03-05 11:57:50.399999
91-
3 2018-03-05 00:00:00.000000
81+
date
82+
0 2018-03-06 00:00:00.000000000
83+
1 2018-03-05 19:34:50.563199671
84+
2 2018-03-05 11:57:50.399998876
85+
3 2018-03-05 00:00:00.000000000
9286
9387
Args:
9488
df: A pandas DataFrame.
95-
column_name: A column name.
89+
column_names: A column name, or a list of column names.
9690
9791
Returns:
9892
A pandas DataFrame with corrected dates.
9993
""" # noqa: E501
100-
days = pd.Series([dt.timedelta(v % 1) for v in df[column_name]])
101-
df[column_name] = (
102-
df[column_name].astype(int).apply(dt.datetime.fromordinal)
103-
+ days
104-
- dt.timedelta(days=366)
105-
)
106-
return df
94+
# https://stackoverflow.com/a/49135037/7175713
95+
if not isinstance(column_names, list):
96+
column_names = [column_names]
97+
dictionary = {
98+
column_name: pd.to_datetime(df[column_name] - 719529, unit="D")
99+
for column_name in column_names
100+
}
107101

102+
return df.assign(**dictionary)
108103

104+
105+
@pf.register_dataframe_method
109106
@pf.register_dataframe_method
107+
@refactored_function(
108+
message=(
109+
"This function will be deprecated in a 1.x release. "
110+
"Please use `pd.to_datetime` instead."
111+
)
112+
)
110113
@deprecated_alias(column="column_name")
111114
def convert_unix_date(df: pd.DataFrame, column_name: Hashable) -> pd.DataFrame:
112115
"""Convert unix epoch time into Python datetime format.
@@ -116,6 +119,11 @@ def convert_unix_date(df: pd.DataFrame, column_name: Hashable) -> pd.DataFrame:
116119
117120
This method mutates the original DataFrame.
118121
122+
!!!note
123+
124+
This function will be deprecated in a 1.x release.
125+
Please use `pd.to_datetime` instead.
126+
119127
Examples:
120128
>>> import pandas as pd
121129
>>> import janitor

janitor/polars/__init__.py

+3
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from .clean_names import clean_names, make_clean_names
22
from .complete import complete
3+
from .dates_to_polars import convert_excel_date, convert_matlab_date
34
from .pivot_longer import pivot_longer, pivot_longer_spec
45
from .row_to_names import row_to_names
56

@@ -10,4 +11,6 @@
1011
"make_clean_names",
1112
"row_to_names",
1213
"complete",
14+
"convert_excel_date",
15+
"convert_matlab_date",
1316
]

janitor/polars/dates_to_polars.py

+112
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
from __future__ import annotations
2+
3+
from janitor.utils import import_message
4+
5+
from .polars_flavor import register_expr_method
6+
7+
try:
8+
import polars as pl
9+
except ImportError:
10+
import_message(
11+
submodule="polars",
12+
package="polars",
13+
conda_channel="conda-forge",
14+
pip_install=True,
15+
)
16+
17+
18+
@register_expr_method
def convert_excel_date(expr: pl.Expr) -> pl.Expr:
    """
    Turn an Excel serial date number into a polars temporal expression.

    The serial number is interpreted as a count of days, which is added
    to the 1899-12-30 origin.

    The approach is inspired by
    [Stack Overflow](https://stackoverflow.com/questions/38454403/convert-excel-style-date-with-pandas).

    Examples:
        >>> import polars as pl
        >>> import janitor.polars
        >>> df = pl.DataFrame({"date": [39690, 39690, 37118]})
        >>> df
        shape: (3, 1)
        ┌───────┐
        │ date  │
        │ ---   │
        │ i64   │
        ╞═══════╡
        │ 39690 │
        │ 39690 │
        │ 37118 │
        └───────┘
        >>> expression = pl.col('date').convert_excel_date().alias('date_')
        >>> df.with_columns(expression)
        shape: (3, 2)
        ┌───────┬────────────┐
        │ date  ┆ date_      │
        │ ---   ┆ ---        │
        │ i64   ┆ date       │
        ╞═══════╪════════════╡
        │ 39690 ┆ 2008-08-30 │
        │ 39690 ┆ 2008-08-30 │
        │ 37118 ┆ 2001-08-15 │
        └───────┴────────────┘

    !!! info "New in version 0.28.0"

    Returns:
        A polars Expression.
    """  # noqa: E501
    # Serial number 0 corresponds to this origin date.
    excel_origin = pl.date(year=1899, month=12, day=30)
    return pl.duration(days=expr) + excel_origin
62+
63+
64+
@register_expr_method
def convert_matlab_date(expr: pl.Expr) -> pl.Expr:
    """
    Turn a Matlab serial date number into a polars datetime expression.

    The serial number is shifted so that it counts days from 1970-01-01,
    scaled to milliseconds, and added to that origin as a duration.

    The implementation follows
    [Stack Overflow](https://stackoverflow.com/questions/13965740/converting-matlabs-datenum-format-to-python).

    Examples:
        >>> import polars as pl
        >>> import janitor.polars
        >>> df = pl.DataFrame({"date": [737125.0, 737124.815863, 737124.4985, 737124]})
        >>> df
        shape: (4, 1)
        ┌───────────────┐
        │ date          │
        │ ---           │
        │ f64           │
        ╞═══════════════╡
        │ 737125.0      │
        │ 737124.815863 │
        │ 737124.4985   │
        │ 737124.0      │
        └───────────────┘
        >>> expression = pl.col('date').convert_matlab_date().alias('date_')
        >>> df.with_columns(expression)
        shape: (4, 2)
        ┌───────────────┬─────────────────────────┐
        │ date          ┆ date_                   │
        │ ---           ┆ ---                     │
        │ f64           ┆ datetime[μs]            │
        ╞═══════════════╪═════════════════════════╡
        │ 737125.0      ┆ 2018-03-06 00:00:00     │
        │ 737124.815863 ┆ 2018-03-05 19:34:50.563 │
        │ 737124.4985   ┆ 2018-03-05 11:57:50.399 │
        │ 737124.0      ┆ 2018-03-05 00:00:00     │
        └───────────────┴─────────────────────────┘

    !!! info "New in version 0.28.0"

    Returns:
        A polars Expression.
    """  # noqa: E501
    # https://stackoverflow.com/questions/13965740/converting-matlabs-datenum-format-to-python
    # 719529 is the serial number that lands on the 1970-01-01 origin below.
    days_since_epoch = expr.sub(719529)
    # 86_400_000 milliseconds in one day (24 * 60 * 60 * 1000).
    millis_since_epoch = days_since_epoch.mul(86_400_000)
    unix_epoch = pl.datetime(year=1970, month=1, day=1)
    return pl.duration(milliseconds=millis_since_epoch) + unix_epoch

tests/functions/test_convert_excel_date.py

-12
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,3 @@ def test_convert_excel_date():
1818
)
1919

2020
assert df["hire_date"].dtype == "M8[ns]"
21-
22-
23-
@pytest.mark.functions
24-
def test_convert_excel_date_with_string_data():
25-
"""Raises ValueError if values of column are not numeric"""
26-
df = pd.read_excel(
27-
Path(pytest.EXAMPLES_DIR) / "notebooks" / "dirty_data.xlsx",
28-
engine="openpyxl",
29-
).clean_names()
30-
31-
with pytest.raises(ValueError):
32-
df.convert_excel_date("certification")
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
import polars as pl
2+
3+
import janitor.polars # noqa: F401
4+
5+
6+
def test_convert_excel_date():
    """An Excel serial column converts to a temporal dtype."""
    frame = pl.DataFrame({"dates": [42580.3333333333]})

    converted = frame.with_columns(
        pl.col("dates").convert_excel_date().alias("dd")
    ).get_column("dd")

    assert converted.dtype.is_temporal() is True
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
import polars as pl
2+
3+
import janitor.polars # noqa: F401
4+
5+
6+
def test_convert_matlab_date():
    """A Matlab serial column converts to a temporal dtype."""
    serial_dates = [
        733_301.0,
        729_159.0,
        734_471.0,
        737_299.563_296_356_5,
        737_300.000_000_000_0,
    ]
    frame = pl.DataFrame({"dates": serial_dates})

    converted = frame.with_columns(
        pl.col("dates").convert_matlab_date().alias("dd")
    ).get_column("dd")

    assert converted.dtype.is_temporal() is True

0 commit comments

Comments
 (0)