[ENH] Let filter_string use parameters of Series.str.contains (pyjanitor-devs#1049)

Zeroto521 · pre-commit-ci[bot] · samukweku · web-flow · commit a7ce9638d375 · 2022-03-28T07:46:41.000-04:00
* add `kwargs` for `filter_string` * Add documentation for parameters * Add example for case and regex parameters * Finished `filter_string` test cases * Update CHANGELOG.md * the result should be 3 * the result should be 6 * test regex parameter * correct example * update example * `str.contains` can only use in str type * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update filter.py * correct spells Co-authored-by: Jeremy Goh <30731072+thatlittleboy@users.noreply.github.com> * fill all keywords * pat -> search_string Co-authored-by: Jeremy Goh <30731072+thatlittleboy@users.noreply.github.com> * clear search_string description Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Samuel Oranyeli <samueloranyeli@gmail.com> Co-authored-by: Jeremy Goh <30731072+thatlittleboy@users.noreply.github.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -20,6 +20,7 @@
 -   [BUG] Fix SettingWithCopyWarning and other minor bugs when using `truncate_datetime_dataframe`, along with further performance improvements (PR #1040). @thatlittleboy
 -   [ENH] Performance improvement for `conditional_join`. @samukweku
 -   [ENH] Multiple `.value` is now supported in `pivot_longer`. #1034 @samukweku
+-   [ENH] `filter_string` now accepts the `case`, `flags`, `na`, and `regex` parameters of `Series.str.contains`. Issue #1003 and #1047. @Zeroto521
 
 ## [v0.22.0] - 2021-11-21
 
diff --git a/janitor/functions/filter.py b/janitor/functions/filter.py
@@ -16,12 +16,15 @@ def filter_string(
     column_name: Hashable,
     search_string: str,
     complement: bool = False,
+    case: bool = True,
+    flags: int = 0,
+    na=None,
+    regex: bool = True,
 ) -> pd.DataFrame:
     """Filter a string-based column according to whether it contains a substring.
 
-    This is super sugary syntax that builds on top of
-    `pandas.Series.str.contains`. It is meant to be the method-chaining
-    equivalent of the following:
+    This is super sugary syntax that builds on top of `pandas.Series.str.contains`.
+    It is meant to be the method-chaining equivalent of the following:
 
     ```python
     df = df[df[column_name].str.contains(search_string)]]
@@ -33,29 +36,60 @@ def filter_string(
 
         >>> import pandas as pd
         >>> import janitor
-        >>> df = pd.DataFrame({"a": range(3, 6), "b": ["bear", "peel", "sail"]})
+        >>> df = pd.DataFrame({"a": range(3, 6), "b": ["bear", "peeL", "sail"]})
         >>> df
            a     b
         0  3  bear
-        1  4  peel
+        1  4  peeL
         2  5  sail
         >>> df.filter_string(column_name="b", search_string="ee")
            a     b
-        1  4  peel
+        1  4  peeL
+        >>> df.filter_string(column_name="b", search_string="L", case=False)
+           a     b
+        1  4  peeL
+        2  5  sail
 
-    `search_string` is also permitted to be any valid regex pattern.
+    Example: Filter names that do not contain `'.'` (regex mode disabled).
+
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.Series(["JoseChen", "Brian.Salvi"], name="Name").to_frame()
+        >>> df
+                  Name
+        0     JoseChen
+        1  Brian.Salvi
+        >>> df.filter_string(column_name="Name", search_string=".", regex=False, complement=True)
+               Name
+        0  JoseChen
 
     :param df: A pandas DataFrame.
     :param column_name: The column to filter. The column should contain strings.
     :param search_string: A regex pattern or a (sub-)string to search.
     :param complement: Whether to return the complement of the filter or not. If
         set to True, then the rows for which the string search fails are retained
         instead.
+    :param case: If True, the search is case sensitive.
+    :param flags: Flags to pass through to the re module, e.g. re.IGNORECASE.
+    :param na: Fill value for missing values. The default depends on dtype of
+        the array. For object-dtype, `numpy.nan` is used. For `StringDtype`,
+        `pandas.NA` is used.
+    :param regex: If True, assumes `search_string` is a regular expression. If False,
+        treats the `search_string` as a literal string.
     :returns: A filtered pandas DataFrame.
     """  # noqa: E501
-    criteria = df[column_name].str.contains(search_string)
+
+    criteria = df[column_name].str.contains(
+        pat=search_string,
+        case=case,
+        flags=flags,
+        na=na,
+        regex=regex,
+    )
+
     if complement:
         return df[~criteria]
+
     return df[criteria]
 
 
diff --git a/tests/functions/test_filter_string.py b/tests/functions/test_filter_string.py
@@ -4,13 +4,38 @@
 @pytest.mark.functions
 def test_filter_string(dataframe):
     df = dataframe.filter_string(
-        column_name="animals@#$%^", search_string="bbit"
+        column_name="animals@#$%^",
+        search_string="bbit",
     )
+
     assert len(df) == 3
 
 
 def test_filter_string_complement(dataframe):
     df = dataframe.filter_string(
-        column_name="cities", search_string="hang", complement=True
+        column_name="cities",
+        search_string="hang",
+        complement=True,
     )
+
     assert len(df) == 6
+
+
+def test_filter_string_case(dataframe):
+    df = dataframe.filter_string(
+        column_name="cities",
+        search_string="B",
+        case=False,
+    )
+
+    assert len(df) == 6
+
+
+def test_filter_string_regex(dataframe):
+    df = dataframe.change_type("Bell__Chart", str).filter_string(
+        column_name="Bell__Chart",
+        search_string="1.",
+        regex=False,
+    )
+
+    assert len(df) == 3