Skip to content

Commit a7ce963

Browse files
Zeroto521pre-commit-ci[bot]samukwekuthatlittleboy
authored
[EHN] Let filter_string use parameters of Series.str.contains (pyjanitor-devs#1049)
* add `kwargs` for `filter_string` * Add documentation for parameters * Add example for case and regex parameters * Finished `filter_string` test cases * Update CHANGELOG.md * the result should be 3 * the result should be 6 * test regex parameter * correct example * update example * `str.contains` can only use in str type * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update filter.py * correct spells Co-authored-by: Jeremy Goh <[email protected]> * fill all keywords * pat -> search_string Co-authored-by: Jeremy Goh <[email protected]> * clear search_string description Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Samuel Oranyeli <[email protected]> Co-authored-by: Jeremy Goh <[email protected]>
1 parent 12fe660 commit a7ce963

File tree

3 files changed

+70
-10
lines changed

3 files changed

+70
-10
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
- [BUG] Fix SettingWithCopyWarning and other minor bugs when using `truncate_datetime_dataframe`, along with further performance improvements (PR #1040). @thatlittleboy
2121
- [ENH] Performance improvement for `conditional_join`. @samukweku
2222
- [ENH] Multiple `.value` is now supported in `pivot_longer`. #1034 @samukweku
23+
- [ENH] Let `filter_string` accept the parameters of `Series.str.contains`. Issue #1003 and #1047. @Zeroto521
2324

2425
## [v0.22.0] - 2021-11-21
2526

janitor/functions/filter.py

+42-8
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,15 @@ def filter_string(
1616
column_name: Hashable,
1717
search_string: str,
1818
complement: bool = False,
19+
case: bool = True,
20+
flags: int = 0,
21+
na=None,
22+
regex: bool = True,
1923
) -> pd.DataFrame:
2024
"""Filter a string-based column according to whether it contains a substring.
2125
22-
This is super sugary syntax that builds on top of
23-
`pandas.Series.str.contains`. It is meant to be the method-chaining
24-
equivalent of the following:
26+
This is super sugary syntax that builds on top of `pandas.Series.str.contains`.
27+
It is meant to be the method-chaining equivalent of the following:
2528
2629
```python
2730
df = df[df[column_name].str.contains(search_string)]
@@ -33,29 +36,60 @@ def filter_string(
3336
3437
>>> import pandas as pd
3538
>>> import janitor
36-
>>> df = pd.DataFrame({"a": range(3, 6), "b": ["bear", "peel", "sail"]})
39+
>>> df = pd.DataFrame({"a": range(3, 6), "b": ["bear", "peeL", "sail"]})
3740
>>> df
3841
a b
3942
0 3 bear
40-
1 4 peel
43+
1 4 peeL
4144
2 5 sail
4245
>>> df.filter_string(column_name="b", search_string="ee")
4346
a b
44-
1 4 peel
47+
1 4 peeL
48+
>>> df.filter_string(column_name="b", search_string="L", case=False)
49+
a b
50+
1 4 peeL
51+
2 5 sail
4552
46-
`search_string` is also permitted to be any valid regex pattern.
53+
Example: Filter names that do not contain `'.'` (regex mode disabled).
54+
55+
>>> import pandas as pd
56+
>>> import janitor
57+
>>> df = pd.Series(["JoseChen", "Brian.Salvi"], name="Name").to_frame()
58+
>>> df
59+
Name
60+
0 JoseChen
61+
1 Brian.Salvi
62+
>>> df.filter_string(column_name="Name", search_string=".", regex=False, complement=True)
63+
Name
64+
0 JoseChen
4765
4866
:param df: A pandas DataFrame.
4967
:param column_name: The column to filter. The column should contain strings.
5068
:param search_string: A regex pattern or a (sub-)string to search.
5169
:param complement: Whether to return the complement of the filter or not. If
5270
set to True, then the rows for which the string search fails are retained
5371
instead.
72+
:param case: If True, the search is case-sensitive.
73+
:param flags: Flags to pass through to the re module, e.g. re.IGNORECASE.
74+
:param na: Fill value for missing values. The default depends on dtype of
75+
the array. For object-dtype, `numpy.nan` is used. For `StringDtype`,
76+
`pandas.NA` is used.
77+
:param regex: If True, assumes `search_string` is a regular expression. If False,
78+
treats the `search_string` as a literal string.
5479
:returns: A filtered pandas DataFrame.
5580
""" # noqa: E501
56-
criteria = df[column_name].str.contains(search_string)
81+
82+
criteria = df[column_name].str.contains(
83+
pat=search_string,
84+
case=case,
85+
flags=flags,
86+
na=na,
87+
regex=regex,
88+
)
89+
5790
if complement:
5891
return df[~criteria]
92+
5993
return df[criteria]
6094

6195

tests/functions/test_filter_string.py

+27-2
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,38 @@
44
@pytest.mark.functions
55
def test_filter_string(dataframe):
66
df = dataframe.filter_string(
7-
column_name="animals@#$%^", search_string="bbit"
7+
column_name="animals@#$%^",
8+
search_string="bbit",
89
)
10+
911
assert len(df) == 3
1012

1113

1214
def test_filter_string_complement(dataframe):
1315
df = dataframe.filter_string(
14-
column_name="cities", search_string="hang", complement=True
16+
column_name="cities",
17+
search_string="hang",
18+
complement=True,
1519
)
20+
1621
assert len(df) == 6
22+
23+
24+
def test_filter_string_case(dataframe):
25+
df = dataframe.filter_string(
26+
column_name="cities",
27+
search_string="B",
28+
case=False,
29+
)
30+
31+
assert len(df) == 6
32+
33+
34+
def test_filter_string_regex(dataframe):
35+
df = dataframe.change_type("Bell__Chart", str).filter_string(
36+
column_name="Bell__Chart",
37+
search_string="1.",
38+
regex=False,
39+
)
40+
41+
assert len(df) == 3

0 commit comments

Comments
 (0)