Skip to content

Commit 1f2dc4f

Browse files
jorisvandenbossche, khemkaran10, Khemkaran
authored
[backport 2.3.x] BUG: fix Series.str.fullmatch() and Series.str.match() with a compiled regex failing with arrow strings (#61964) (#62113)
Co-authored-by: Khemkaran Sevta <[email protected]> Co-authored-by: Khemkaran <[email protected]>
1 parent c4fa611 commit 1f2dc4f

File tree

5 files changed

+41
-10
lines changed

5 files changed

+41
-10
lines changed

doc/source/whatsnew/v2.3.2.rst

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,8 @@ Bug fixes
2525
- Fix :meth:`~DataFrame.to_json` with ``orient="table"`` to correctly use the
2626
"string" type in the JSON Table Schema for :class:`StringDtype` columns
2727
(:issue:`61889`)
28-
28+
- Fix :meth:`~Series.str.match` and :meth:`~Series.str.fullmatch` with compiled regex
29+
for the Arrow-backed string dtype (:issue:`61964`)
2930

3031
.. ---------------------------------------------------------------------------
3132
.. _whatsnew_232.contributors:

pandas/core/arrays/_arrow_string_mixins.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -301,23 +301,29 @@ def _str_contains(
301301

302302
def _str_match(
303303
self,
304-
pat: str,
304+
pat: str | re.Pattern,
305305
case: bool = True,
306306
flags: int = 0,
307307
na: Scalar | lib.NoDefault = lib.no_default,
308308
):
309-
if not pat.startswith("^"):
309+
if isinstance(pat, re.Pattern):
310+
# GH#61952
311+
pat = pat.pattern
312+
if isinstance(pat, str) and not pat.startswith("^"):
310313
pat = f"^{pat}"
311314
return self._str_contains(pat, case, flags, na, regex=True)
312315

313316
def _str_fullmatch(
314317
self,
315-
pat,
318+
pat: str | re.Pattern,
316319
case: bool = True,
317320
flags: int = 0,
318321
na: Scalar | lib.NoDefault = lib.no_default,
319322
):
320-
if not pat.endswith("$") or pat.endswith("\\$"):
323+
if isinstance(pat, re.Pattern):
324+
# GH#61952
325+
pat = pat.pattern
326+
if isinstance(pat, str) and (not pat.endswith("$") or pat.endswith("\\$")):
321327
pat = f"{pat}$"
322328
return self._str_match(pat, case, flags, na)
323329

pandas/core/strings/accessor.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1353,8 +1353,8 @@ def match(self, pat: str, case: bool = True, flags: int = 0, na=lib.no_default):
13531353
13541354
Parameters
13551355
----------
1356-
pat : str
1357-
Character sequence.
1356+
pat : str or compiled regex
1357+
Character sequence or regular expression.
13581358
case : bool, default True
13591359
If True, case sensitive.
13601360
flags : int, default 0 (no flags)

pandas/core/strings/object_array.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -245,14 +245,15 @@ def rep(x, r):
245245

246246
def _str_match(
247247
self,
248-
pat: str,
248+
pat: str | re.Pattern,
249249
case: bool = True,
250250
flags: int = 0,
251251
na: Scalar | lib.NoDefault = lib.no_default,
252252
):
253253
if not case:
254254
flags |= re.IGNORECASE
255-
255+
if isinstance(pat, re.Pattern):
256+
pat = pat.pattern
256257
regex = re.compile(pat, flags=flags)
257258

258259
f = lambda x: regex.match(x) is not None
@@ -267,7 +268,8 @@ def _str_fullmatch(
267268
):
268269
if not case:
269270
flags |= re.IGNORECASE
270-
271+
if isinstance(pat, re.Pattern):
272+
pat = pat.pattern
271273
regex = re.compile(pat, flags=flags)
272274

273275
f = lambda x: regex.fullmatch(x) is not None

pandas/tests/strings/test_find_replace.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -822,6 +822,17 @@ def test_match_case_kwarg(any_string_dtype):
822822
tm.assert_series_equal(result, expected)
823823

824824

825+
def test_match_compiled_regex(any_string_dtype):
826+
# GH#61952
827+
values = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype)
828+
result = values.str.match(re.compile(r"ab"), case=False)
829+
expected_dtype = (
830+
np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
831+
)
832+
expected = Series([True, True, True, True], dtype=expected_dtype)
833+
tm.assert_series_equal(result, expected)
834+
835+
825836
# --------------------------------------------------------------------------------------
826837
# str.fullmatch
827838
# --------------------------------------------------------------------------------------
@@ -891,6 +902,17 @@ def test_fullmatch_case_kwarg(any_string_dtype):
891902
tm.assert_series_equal(result, expected)
892903

893904

905+
def test_fullmatch_compiled_regex(any_string_dtype):
906+
# GH#61952
907+
values = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype)
908+
result = values.str.fullmatch(re.compile(r"ab"), case=False)
909+
expected_dtype = (
910+
np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
911+
)
912+
expected = Series([True, True, False, False], dtype=expected_dtype)
913+
tm.assert_series_equal(result, expected)
914+
915+
894916
# --------------------------------------------------------------------------------------
895917
# str.findall
896918
# --------------------------------------------------------------------------------------

0 commit comments

Comments
 (0)