Commit 30ce3b3

Merge branch 'main' into fix-59242
2 parents 8813f77 + 70edaa0 commit 30ce3b3

File tree: 25 files changed, +526 -133 lines

25 files changed

+526
-133
lines changed

doc/source/user_guide/cookbook.rst
Lines changed: 1 addition & 1 deletion

@@ -874,7 +874,7 @@ Timeseries
 <https://stackoverflow.com/questions/13893227/vectorized-look-up-of-values-in-pandas-dataframe>`__

 `Aggregation and plotting time series
-<https://nipunbatra.github.io/blog/visualisation/2013/05/01/aggregation-timeseries.html>`__
+<https://nipunbatra.github.io/blog/posts/2013-05-01-aggregation-timeseries.html>`__

 Turn a matrix with hours in columns and days in rows into a continuous row sequence in the form of a time series.
 `How to rearrange a Python pandas DataFrame?

doc/source/whatsnew/v3.0.0.rst
Lines changed: 3 additions & 0 deletions

@@ -35,6 +35,7 @@ Other enhancements
 - :class:`pandas.api.typing.NoDefault` is available for typing ``no_default``
 - :func:`DataFrame.to_excel` now raises an ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`)
 - :func:`pandas.merge` now validates the ``how`` parameter input (merge type) (:issue:`59435`)
+- :func:`pandas.merge`, :meth:`DataFrame.merge` and :meth:`DataFrame.join` now support anti joins (``left_anti`` and ``right_anti``) in the ``how`` parameter (:issue:`42916`)
 - :func:`read_spss` now supports kwargs to be passed to pyreadstat (:issue:`56356`)
 - :func:`read_stata` now returns ``datetime64`` resolutions better matching those natively stored in the stata format (:issue:`55642`)
 - :meth:`DataFrame.agg` called with ``axis=1`` and a ``func`` which relabels the result index now raises a ``NotImplementedError`` (:issue:`58807`).

@@ -68,6 +69,7 @@ Other enhancements
 - :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`)
 - :meth:`Series.str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`)
 - :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`)
+- :py:class:`frozenset` elements in pandas objects are now natively printed (:issue:`60690`)
 - Implemented :meth:`Series.str.isascii` and :meth:`Series.str.isascii` (:issue:`59091`)
 - Multiplying two :class:`DateOffset` objects will now raise a ``TypeError`` instead of a ``RecursionError`` (:issue:`59442`)
 - Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`)

@@ -631,6 +633,7 @@ Datetimelike
 - Bug in :func:`date_range` where using a negative frequency value would not include all points between the start and end values (:issue:`56147`)
 - Bug in :func:`tseries.api.guess_datetime_format` would fail to infer time format when "%Y" == "%H%M" (:issue:`57452`)
 - Bug in :func:`tseries.frequencies.to_offset` would fail to parse frequency strings starting with "LWOM" (:issue:`59218`)
+- Bug in :meth:`DataFrame.min` and :meth:`DataFrame.max` casting ``datetime64`` and ``timedelta64`` columns to ``float64`` and losing precision (:issue:`60850`)
 - Bug in :meth:`Dataframe.agg` with df with missing values resulting in IndexError (:issue:`58810`)
 - Bug in :meth:`DatetimeIndex.is_year_start` and :meth:`DatetimeIndex.is_quarter_start` does not raise on Custom business days frequencies bigger then "1C" (:issue:`58664`)
 - Bug in :meth:`DatetimeIndex.is_year_start` and :meth:`DatetimeIndex.is_quarter_start` returning ``False`` on double-digit frequencies (:issue:`58523`)

pandas/_libs/interval.pyx
Lines changed: 6 additions & 0 deletions

@@ -209,6 +209,12 @@ cdef class IntervalMixin:
         """
         Indicates if an interval is empty, meaning it contains no points.

+        An interval is considered empty if its `left` and `right` endpoints
+        are equal, and it is not closed on both sides. This means that the
+        interval does not include any real points. In the case of an
+        :class:`pandas.arrays.IntervalArray` or :class:`IntervalIndex`, the
+        property returns a boolean array indicating the emptiness of each interval.
+
         Returns
         -------
         bool or ndarray
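
A minimal sketch of the behavior the added docstring text describes; the commented values follow that description and are not output captured from this commit.

    import pandas as pd

    # A zero-length interval that is not closed on both sides contains no points.
    pd.Interval(0, 0, closed="left").is_empty    # True
    # Closing both endpoints keeps the single point 0, so the interval is not empty.
    pd.Interval(0, 0, closed="both").is_empty    # False
    # On an IntervalIndex the property returns one boolean flag per interval.
    pd.IntervalIndex([pd.Interval(0, 0, closed="right"), pd.Interval(0, 1)]).is_empty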

pandas/_libs/tslibs/period.pyx
Lines changed: 6 additions & 0 deletions

@@ -2140,6 +2140,12 @@ cdef class _Period(PeriodMixin):
         """
         Get day of the month that a Period falls on.

+        The `day` property provides a simple way to access the day component
+        of a `Period` object, which represents time spans in various frequencies
+        (e.g., daily, hourly, monthly). If the period's frequency does not include
+        a day component (e.g., yearly or quarterly periods), the returned day
+        corresponds to the first day of that period.
+
         Returns
        -------
         int
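
A small sketch of the accessor the added paragraph describes, using ordinary Period construction (illustrative values, not taken from this commit).

    import pandas as pd

    # Daily and sub-daily periods expose their day-of-month component directly.
    pd.Period("2025-03-14", freq="D").day          # 14
    pd.Period("2025-03-14 10:00", freq="h").day    # 14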

pandas/_typing.py
Lines changed: 3 additions & 1 deletion

@@ -442,7 +442,9 @@ def closed(self) -> bool:
 AnyAll = Literal["any", "all"]

 # merge
-MergeHow = Literal["left", "right", "inner", "outer", "cross"]
+MergeHow = Literal[
+    "left", "right", "inner", "outer", "cross", "left_anti", "right_anti"
+]
 MergeValidate = Literal[
     "one_to_one",
     "1:1",

pandas/core/computation/parsing.py
Lines changed: 0 additions & 44 deletions

@@ -123,16 +123,6 @@ def clean_column_name(name: Hashable) -> Hashable:
     -------
     name : hashable
         Returns the name after tokenizing and cleaning.
-
-    Notes
-    -----
-    For some cases, a name cannot be converted to a valid Python identifier.
-    In that case :func:`tokenize_string` raises a SyntaxError.
-    In that case, we just return the name unmodified.
-
-    If this name was used in the query string (this makes the query call impossible)
-    an error will be raised by :func:`tokenize_backtick_quoted_string` instead,
-    which is not caught and propagates to the user level.
     """
     try:
         # Escape backticks

@@ -145,40 +135,6 @@ def clean_column_name(name: Hashable) -> Hashable:
         return name


-def tokenize_backtick_quoted_string(
-    token_generator: Iterator[tokenize.TokenInfo], source: str, string_start: int
-) -> tuple[int, str]:
-    """
-    Creates a token from a backtick quoted string.
-
-    Moves the token_generator forwards till right after the next backtick.
-
-    Parameters
-    ----------
-    token_generator : Iterator[tokenize.TokenInfo]
-        The generator that yields the tokens of the source string (Tuple[int, str]).
-        The generator is at the first token after the backtick (`)
-
-    source : str
-        The Python source code string.
-
-    string_start : int
-        This is the start of backtick quoted string inside the source string.
-
-    Returns
-    -------
-    tok: Tuple[int, str]
-        The token that represents the backtick quoted string.
-        The integer is equal to BACKTICK_QUOTED_STRING (100).
-    """
-    for _, tokval, start, _, _ in token_generator:
-        if tokval == "`":
-            string_end = start[1]
-            break
-
-    return BACKTICK_QUOTED_STRING, source[string_start:string_end]
-
-
 class ParseState(Enum):
     DEFAULT = 0
     IN_BACKTICK = 1

pandas/core/frame.py
Lines changed: 16 additions & 2 deletions

@@ -315,7 +315,8 @@
 ----------%s
 right : DataFrame or named Series
     Object to merge with.
-how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner'
+how : {'left', 'right', 'outer', 'inner', 'cross', 'left_anti', 'right_anti'},
+    default 'inner'
     Type of merge to be performed.

     * left: use only keys from left frame, similar to a SQL left outer join;

@@ -328,6 +329,10 @@
       join; preserve the order of the left keys.
     * cross: creates the cartesian product from both frames, preserves the order
       of the left keys.
+    * left_anti: use only keys from left frame that are not in right frame, similar
+      to SQL left anti join; preserve key order.
+    * right_anti: use only keys from right frame that are not in left frame, similar
+      to SQL right anti join; preserve key order.
 on : label or list
     Column or index level names to join on. These must be found in both
     DataFrames. If `on` is None and not merging on indexes then this defaults

@@ -4793,6 +4798,10 @@ def select_dtypes(self, include=None, exclude=None) -> DataFrame:
         """
         Return a subset of the DataFrame's columns based on the column dtypes.

+        This method allows for filtering columns based on their data types.
+        It is useful when working with heterogeneous DataFrames where operations
+        need to be performed on a specific subset of data types.
+
         Parameters
         ----------
         include, exclude : scalar or list-like
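
A minimal sketch of the column filtering the added select_dtypes paragraph describes; the frame and dtypes below are illustrative assumptions.

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": [1.5, 2.5], "c": ["x", "y"]})

    # Keep only the numeric columns ("a" and "b").
    df.select_dtypes(include="number")
    # Keep only the float column ("b").
    df.select_dtypes(include=["float"])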

@@ -10609,7 +10618,8 @@ def join(
         values given, the `other` DataFrame must have a MultiIndex. Can
         pass an array as the join key if it is not already contained in
         the calling DataFrame. Like an Excel VLOOKUP operation.
-    how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'left'
+    how : {'left', 'right', 'outer', 'inner', 'cross', 'left_anti', 'right_anti'},
+        default 'left'
         How to handle the operation of the two objects.

         * left: use calling frame's index (or column if on is specified)

@@ -10621,6 +10631,10 @@ def join(
           of the calling's one.
         * cross: creates the cartesian product from both frames, preserves the order
           of the left keys.
+        * left_anti: use set difference of calling frame's index and `other`'s
+          index.
+        * right_anti: use set difference of `other`'s index and calling frame's
+          index.
     lsuffix : str, default ''
         Suffix to use from left frame's overlapping columns.
     rsuffix : str, default ''
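
A minimal sketch of the index-based anti join the new ``left_anti`` bullet describes for DataFrame.join, assuming the feature behaves as documented above (illustrative frames, not output from this commit).

    import pandas as pd

    left = pd.DataFrame({"x": [1, 2, 3]}, index=["a", "b", "c"])
    right = pd.DataFrame({"y": [10, 20]}, index=["a", "b"])

    # Keep only the rows of `left` whose index labels do not appear in `right`;
    # here only the row labelled "c" survives, and the unmatched right-hand
    # column "y" is expected to come back as missing values.
    left.join(right, how="left_anti")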

pandas/core/nanops.py
Lines changed: 9 additions & 2 deletions

@@ -1093,11 +1093,14 @@ def reduction(
         if values.size == 0:
             return _na_for_min_count(values, axis)

+        dtype = values.dtype
         values, mask = _get_values(
             values, skipna, fill_value_typ=fill_value_typ, mask=mask
         )
         result = getattr(values, meth)(axis)
-        result = _maybe_null_out(result, axis, mask, values.shape)
+        result = _maybe_null_out(
+            result, axis, mask, values.shape, datetimelike=dtype.kind in "mM"
+        )
         return result

     return reduction

@@ -1499,6 +1502,7 @@ def _maybe_null_out(
     mask: npt.NDArray[np.bool_] | None,
     shape: tuple[int, ...],
     min_count: int = 1,
+    datetimelike: bool = False,
 ) -> np.ndarray | float | NaTType:
     """
     Returns

@@ -1520,7 +1524,10 @@ def _maybe_null_out(
         null_mask = np.broadcast_to(below_count, new_shape)

     if np.any(null_mask):
-        if is_numeric_dtype(result):
+        if datetimelike:
+            # GH#60646 For datetimelike, no need to cast to float
+            result[null_mask] = iNaT
+        elif is_numeric_dtype(result):
             if np.iscomplexobj(result):
                 result = result.astype("c16")
             elif not is_float_dtype(result):
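
A small sketch of the behavior this change targets, following the whatsnew entry for GH 60850; the data are illustrative and the commented reprs are expectations, not captured output.

    import pandas as pd

    df = pd.DataFrame(
        {"ts": pd.to_datetime(["2024-01-01 00:00:00.000000001", None])}
    )

    # Reductions over datetime64/timedelta64 columns are expected to keep their
    # dtype: positions that must be nulled out are written as iNaT (shown as NaT)
    # instead of routing the result through float64, which could drop nanoseconds.
    df.min(skipna=False)   # ts   NaT
    df.min()               # ts   2024-01-01 00:00:00.000000001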

pandas/core/reshape/merge.py
Lines changed: 86 additions & 11 deletions

@@ -180,7 +180,8 @@ def merge(
     First pandas object to merge.
 right : DataFrame or named Series
     Second pandas object to merge.
-how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner'
+how : {'left', 'right', 'outer', 'inner', 'cross', 'left_anti', 'right_anti},
+    default 'inner'
     Type of merge to be performed.

     * left: use only keys from left frame, similar to a SQL left outer join;

@@ -193,6 +194,10 @@ def merge(
       join; preserve the order of the left keys.
     * cross: creates the cartesian product from both frames, preserves the order
       of the left keys.
+    * left_anti: use only keys from left frame that are not in right frame, similar
+      to SQL left anti join; preserve key order.
+    * right_anti: use only keys from right frame that are not in left frame, similar
+      to SQL right anti join; preserve key order.
 on : label or list
     Column or index level names to join on. These must be found in both
     DataFrames. If `on` is None and not merging on indexes then this defaults

@@ -953,7 +958,7 @@ def __init__(
         self,
         left: DataFrame | Series,
         right: DataFrame | Series,
-        how: JoinHow | Literal["asof"] = "inner",
+        how: JoinHow | Literal["left_anti", "right_anti", "asof"] = "inner",
         on: IndexLabel | AnyArrayLike | None = None,
         left_on: IndexLabel | AnyArrayLike | None = None,
         right_on: IndexLabel | AnyArrayLike | None = None,

@@ -968,7 +973,7 @@ def __init__(
         _right = _validate_operand(right)
         self.left = self.orig_left = _left
         self.right = self.orig_right = _right
-        self.how = how
+        self.how, self.anti_join = self._validate_how(how)

         self.on = com.maybe_make_list(on)

@@ -998,14 +1003,6 @@ def __init__(
             )
             raise MergeError(msg)

-        # GH 59435: raise when "how" is not a valid Merge type
-        merge_type = {"left", "right", "inner", "outer", "cross", "asof"}
-        if how not in merge_type:
-            raise ValueError(
-                f"'{how}' is not a valid Merge type: "
-                f"left, right, inner, outer, cross, asof"
-            )
-
         self.left_on, self.right_on = self._validate_left_right_on(left_on, right_on)

         (

@@ -1035,6 +1032,37 @@ def __init__(
         if validate is not None:
             self._validate_validate_kwd(validate)

+    @final
+    def _validate_how(
+        self, how: JoinHow | Literal["left_anti", "right_anti", "asof"]
+    ) -> tuple[JoinHow | Literal["asof"], bool]:
+        """
+        Validate the 'how' parameter and return the actual join type and whether
+        this is an anti join.
+        """
+        # GH 59435: raise when "how" is not a valid Merge type
+        merge_type = {
+            "left",
+            "right",
+            "inner",
+            "outer",
+            "left_anti",
+            "right_anti",
+            "cross",
+            "asof",
+        }
+        if how not in merge_type:
+            raise ValueError(
+                f"'{how}' is not a valid Merge type: "
+                f"left, right, inner, outer, left_anti, right_anti, cross, asof"
+            )
+        anti_join = False
+        if how in {"left_anti", "right_anti"}:
+            how = how.split("_")[0]  # type: ignore[assignment]
+            anti_join = True
+        how = cast(JoinHow | Literal["asof"], how)
+        return how, anti_join
+
     def _maybe_require_matching_dtypes(
         self, left_join_keys: list[ArrayLike], right_join_keys: list[ArrayLike]
     ) -> None:

@@ -1405,6 +1433,11 @@ def _get_join_info(
         n = len(left_ax) if left_indexer is None else len(left_indexer)
         join_index = default_index(n)

+        if self.anti_join:
+            join_index, left_indexer, right_indexer = self._handle_anti_join(
+                join_index, left_indexer, right_indexer
+            )
+
         return join_index, left_indexer, right_indexer

     @final

@@ -1447,6 +1480,48 @@ def _create_join_index(
             return index.copy()
         return index.take(indexer)

+    @final
+    def _handle_anti_join(
+        self,
+        join_index: Index,
+        left_indexer: npt.NDArray[np.intp] | None,
+        right_indexer: npt.NDArray[np.intp] | None,
+    ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
+        """
+        Handle anti join by returning the correct join index and indexers
+
+        Parameters
+        ----------
+        join_index : Index
+            join index
+        left_indexer : np.ndarray[np.intp] or None
+            left indexer
+        right_indexer : np.ndarray[np.intp] or None
+            right indexer
+
+        Returns
+        -------
+        Index, np.ndarray[np.intp] or None, np.ndarray[np.intp] or None
+        """
+        # Make sure indexers are not None
+        if left_indexer is None:
+            left_indexer = np.arange(len(self.left))
+        if right_indexer is None:
+            right_indexer = np.arange(len(self.right))
+
+        assert self.how in {"left", "right"}
+        if self.how == "left":
+            # Filter to rows where left keys are not in right keys
+            filt = right_indexer == -1
+        else:
+            # Filter to rows where right keys are not in left keys
+            filt = left_indexer == -1
+        join_index = join_index[filt]
+        left_indexer = left_indexer[filt]
+        right_indexer = right_indexer[filt]
+
+        return join_index, left_indexer, right_indexer
+
     @final
     def _get_merge_keys(
         self,
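
A minimal usage sketch of the anti-join modes wired up in this file; the frames are illustrative and the commented results follow the docstring rather than captured runs.

    import pandas as pd

    left = pd.DataFrame({"key": ["a", "b", "c"], "lval": [1, 2, 3]})
    right = pd.DataFrame({"key": ["b", "c", "d"], "rval": [20, 30, 40]})

    # Rows of `left` whose key has no match in `right`; only key "a" should survive,
    # with the right-hand column "rval" coming back as missing values.
    pd.merge(left, right, on="key", how="left_anti")

    # Rows of `right` whose key has no match in `left`; only key "d" should survive.
    pd.merge(left, right, on="key", how="right_anti")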

pandas/io/formats/printing.py
Lines changed: 2 additions & 0 deletions

@@ -111,6 +111,8 @@ def _pprint_seq(
     """
     if isinstance(seq, set):
         fmt = "{{{body}}}"
+    elif isinstance(seq, frozenset):
+        fmt = "frozenset({body})"
     else:
         fmt = "[{body}]" if hasattr(seq, "__setitem__") else "({body})"
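
A small sketch of the formatting difference this branch introduces, assuming _pprint_seq is reached through an ordinary Series repr; the exact rendering is an expectation, not captured output.

    import pandas as pd

    s = pd.Series([frozenset({1, 2}), {3, 4}])
    print(s)
    # The frozenset element is expected to render in a "frozenset(1, 2)" style,
    # while the plain set keeps its braced "{3, 4}" formatting.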

pandas/io/orc.py
Lines changed: 7 additions & 0 deletions

@@ -45,6 +45,13 @@ def read_orc(
     """
     Load an ORC object from the file path, returning a DataFrame.

+    This method reads an ORC (Optimized Row Columnar) file into a pandas
+    DataFrame using the `pyarrow.orc` library. ORC is a columnar storage format
+    that provides efficient compression and fast retrieval for analytical workloads.
+    It allows reading specific columns, handling different filesystem
+    types (such as local storage, cloud storage via fsspec, or pyarrow filesystem),
+    and supports different data type backends, including `numpy_nullable` and `pyarrow`.
+
     Parameters
     ----------
     path : str, path object, or file-like object
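
A brief usage sketch matching the added description; the file path and column names are hypothetical, and pyarrow is assumed to be installed.

    import pandas as pd

    # Read only two columns from a local ORC file (requires pyarrow at runtime).
    df = pd.read_orc("data/events.orc", columns=["user_id", "ts"])

    # The pyarrow-backed dtypes mentioned above can be requested explicitly.
    df_arrow = pd.read_orc("data/events.orc", dtype_backend="pyarrow")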
