MWE - complete, process_text, expand_grid (#1013)

samukweku · samukweku · thatlittleboy · web-flow · commit d3146bbf0ddd · 2022-02-08T17:18:16.000-05:00
* mwe

* mwe

* mwe

* mwe

* Update janitor/functions/complete.py

Co-authored-by: Jeremy Goh <30731072+thatlittleboy@users.noreply.github.com>

* Update janitor/functions/expand_grid.py

Co-authored-by: Jeremy Goh <30731072+thatlittleboy@users.noreply.github.com>

* Update janitor/functions/process_text.py

Co-authored-by: Jeremy Goh <30731072+thatlittleboy@users.noreply.github.com>

* Update janitor/functions/process_text.py

Co-authored-by: Jeremy Goh <30731072+thatlittleboy@users.noreply.github.com>

* Update janitor/functions/process_text.py

Co-authored-by: Jeremy Goh <30731072+thatlittleboy@users.noreply.github.com>

* mwe

* mwe

Co-authored-by: samukweku <samukweku@gmail.com>
Co-authored-by: Jeremy Goh <30731072+thatlittleboy@users.noreply.github.com>
diff --git a/janitor/functions/complete.py b/janitor/functions/complete.py
@@ -19,7 +19,7 @@ def complete(
 ) -> pd.DataFrame:
     """
     It is modeled after tidyr's `complete` function, and is a wrapper around
-    `expand_grid` and `pd.merge`.
+    [`expand_grid`][janitor.functions.expand_grid.expand_grid] and `pd.merge`.
 
     Combinations of column names or a list/tuple of column names, or even a
     dictionary of column names and new values are possible.
@@ -28,39 +28,64 @@ def complete(
 
     MultiIndex columns are not supported.
 
-    Functional usage syntax:
-
-    ```python
-
-        import pandas as pd
-        import janitor as jn
-
-        df = pd.DataFrame(...)
-
-        df = jn.complete(
-            df = df,
-            column_label,
-            (column1, column2, ...),
-            {column1: new_values, ...},
-            by = label/list_of_labels
-        )
-    ```
-
-    Method chaining syntax:
-
-    ```python
-
-        df = (
-            pd.DataFrame(...)
-            .complete(
-                column_label,
-                (column1, column2, ...),
-                {column1: new_values, ...},
-                by = label/list_of_labels
-            )
-    ```
-
-    :param df: A pandas dataframe.
+    Example:
+
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "Year": [1999, 2000, 2004, 1999, 2004],
+        ...         "Taxon": [
+        ...             "Saccharina",
+        ...             "Saccharina",
+        ...             "Saccharina",
+        ...             "Agarum",
+        ...             "Agarum",
+        ...         ],
+        ...         "Abundance": [4, 5, 2, 1, 8],
+        ...     }
+        ... )
+        >>> df
+           Year       Taxon  Abundance
+        0  1999  Saccharina          4
+        1  2000  Saccharina          5
+        2  2004  Saccharina          2
+        3  1999      Agarum          1
+        4  2004      Agarum          8
+
+    Expose missing pairings of `Year` and `Taxon`:
+
+        >>> df.complete("Year", "Taxon", sort=True)
+           Year       Taxon  Abundance
+        0  1999      Agarum        1.0
+        1  1999  Saccharina        4.0
+        2  2000      Agarum        NaN
+        3  2000  Saccharina        5.0
+        4  2004      Agarum        8.0
+        5  2004  Saccharina        2.0
+
+    Expose missing years from 1999 to 2004:
+
+        >>> df.complete(
+        ...     {"Year": range(df.Year.min(), df.Year.max() + 1)},
+        ...     "Taxon",
+        ...     sort=True,
+        ... )
+            Year       Taxon  Abundance
+        0   1999      Agarum        1.0
+        1   1999  Saccharina        4.0
+        2   2000      Agarum        NaN
+        3   2000  Saccharina        5.0
+        4   2001      Agarum        NaN
+        5   2001  Saccharina        NaN
+        6   2002      Agarum        NaN
+        7   2002  Saccharina        NaN
+        8   2003      Agarum        NaN
+        9   2003  Saccharina        NaN
+        10  2004      Agarum        8.0
+        11  2004  Saccharina        2.0
+
+    :param df: A pandas DataFrame.
     :param *columns: This refers to the columns to be
         completed. It could be column labels (string type),
         a list/tuple of column labels, or a dictionary that pairs
diff --git a/janitor/functions/expand_grid.py b/janitor/functions/expand_grid.py
@@ -26,7 +26,7 @@ def expand_grid(
 
 
     Data types are preserved in this function,
-    including Pandas' extension array dtypes.
+    including pandas' extension array dtypes.
 
     The output will always be a DataFrame, usually a MultiIndex,
     with the keys of the `others` dictionary serving as
@@ -36,41 +36,43 @@ def expand_grid(
     `others`, the columns are flattened, before the final
     cartesian DataFrame is generated.
 
-    If a Pandas Series/DataFrame is passed, and has a labeled index, or
+    If a pandas Series/DataFrame is passed, and has a labeled index, or
     a MultiIndex index, the index is discarded; the final DataFrame
     will have a RangeIndex.
 
     The MultiIndexed DataFrame can be flattened using pyjanitor's
-    `collapse_levels` method; the user can also decide to drop any of the
-    levels, via Pandas' `droplevel` method.
-
-    Functional usage syntax:
-
-    ```python
-
-        import pandas as pd
-        import janitor as jn
-
-        df = pd.DataFrame(...)
-        df = jn.expand_grid(df=df, df_key="...", others={...})
-    ```
-
-    Method-chaining usage syntax:
-
-    ```python
-        import pandas as pd
-        import janitor as jn
-
-        df = pd.DataFrame(...).expand_grid(df_key="bla",others={...})
-    ```
-
-    Usage independent of a DataFrame
-
-    ```python
-        import pandas as pd
-        from janitor import expand_grid
-
-        df = expand_grid(others = {"x":range(1,4), "y":[1,2]})
+    [`collapse_levels`][janitor.functions.collapse_levels.collapse_levels]
+    method; the user can also decide to drop any of the levels, via pandas'
+    `droplevel` method.
+
+    Example:
+
+        >>> import pandas as pd
+        >>> import janitor as jn
+        >>> df = pd.DataFrame({"x": [1, 2], "y": [2, 1]})
+        >>> data = {"z": [1, 2, 3]}
+        >>> df.expand_grid(df_key="df", others=data)
+          df     z
+           x  y  0
+        0  1  2  1
+        1  1  2  2
+        2  1  2  3
+        3  2  1  1
+        4  2  1  2
+        5  2  1  3
+
+    `expand_grid` works with non-pandas objects:
+
+        >>> data = {"x": [1, 2, 3], "y": [1, 2]}
+        >>> jn.expand_grid(others=data)
+           x  y
+           0  0
+        0  1  1
+        1  1  2
+        2  2  1
+        3  2  2
+        4  3  1
+        5  3  2
 
     :param df: A pandas DataFrame.
     :param df_key: name of key for the dataframe.
@@ -97,12 +99,10 @@ def expand_grid(
 
         if not df_key:
             raise KeyError(
-                """
-                Using `expand_grid` as part of a
-                DataFrame method chain requires that
-                a string argument be provided for
-                the `df_key` parameter.
-                """
+                "Using `expand_grid` as part of a "
+                "DataFrame method chain requires that "
+                "a string argument be provided for "
+                "the `df_key` parameter. "
             )
 
         check("df_key", df_key, [str])
diff --git a/janitor/functions/process_text.py b/janitor/functions/process_text.py
@@ -11,7 +11,7 @@ def process_text(
     df: pd.DataFrame,
     column_name: str,
     string_function: str,
-    **kwargs: str,
+    **kwargs,
 ) -> pd.DataFrame:
     """
     Apply a Pandas string method to an existing column.
@@ -21,52 +21,52 @@ def process_text(
     along with keyword arguments, if any, to the function.
 
     This modifies an existing column; it does not create a new column;
-    new columns can be created via pyjanitor's `transform_columns`.
-
-
-    A list of all the string methods in Pandas can be accessed [here](https://pandas.pydata.org/docs/user_guide/text.html#method-summary)
-
-
-    Functional usage syntax:
-
-    ```python
-        import pandas as pd
-        import janitor as jn
-
-        df = pd.DataFrame(...)
-        df = jn.process_text(
-            df = df,
-            column_name,
-            string_function = "string_func_name_here",
-            kwargs
-            )
-    ```
-
-    Method-chaining usage syntax:
-
-    ```python
-
-        import pandas as pd
-        import janitor as jn
-
-        df = (
-            pd.DataFrame(...)
-            .process_text(
-                column_name,
-                string_function = "string_func_name_here",
-                kwargs
-                )
-        )
-    ```
-
+    new columns can be created via pyjanitor's
+    [`transform_columns`][janitor.functions.transform_columns.transform_columns].
+
+
+    A list of all the string methods in pandas can be accessed [here](https://pandas.pydata.org/docs/user_guide/text.html#method-summary).
+
+
+    Example:
+
+        >>> import pandas as pd
+        >>> import janitor
+        >>> import re
+        >>> df = pd.DataFrame({"text": ["Ragnar", "sammywemmy", "ginger"],
+        ... "code": [1, 2, 3]})
+        >>> df
+                 text  code
+        0      Ragnar     1
+        1  sammywemmy     2
+        2      ginger     3
+        >>> df.process_text(column_name="text", string_function="lower")
+                 text  code
+        0      ragnar     1
+        1  sammywemmy     2
+        2      ginger     3
+
+    For string methods with parameters, simply pass the keyword arguments:
+
+        >>> df.process_text(
+        ...     column_name="text",
+        ...     string_function="extract",
+        ...     pat=r"(ag)",
+        ...     expand=False,
+        ...     flags=re.IGNORECASE,
+        ... )
+          text  code
+        0   ag     1
+        1  NaN     2
+        2  NaN     3
 
     :param df: A pandas DataFrame.
-    :param column_name: string column to be operated on.
+    :param column_name: String column to be operated on.
     :param string_function: pandas string method to be applied.
     :param kwargs: Keyword arguments for parameters of the `string_function`.
     :returns: A pandas DataFrame with modified column.
-    :raises KeyError: if ``string_function`` is not a Pandas string method.
-    :raises ValueError: if the text function returns a DataFrame, instead of a Series.
+    :raises KeyError: If `string_function` is not a pandas string method.
+    :raises ValueError: If the text function returns a DataFrame, instead of a Series.
     """  # noqa: E501
 
     check("column_name", column_name, [str])
@@ -86,10 +86,8 @@ def process_text(
 
     if isinstance(result, pd.DataFrame):
         raise ValueError(
-            """
-            The outcome of the processed text is a DataFrame,
-            which is not supported in `process_text`.
-            """
+            "The outcome of the processed text is a DataFrame, "
+            "which is not supported in `process_text`."
         )
 
     return df.assign(**{column_name: result})