Add polars support for janitor.io.xlsx_table (#1357)

samukweku · samuel.oranyeli · ericmjl · web-flow · commit 46ab4d8a5521 · 2024-06-02T20:16:17.000-04:00
* add make_clean_names function that can be applied to polars

* add examples for make_clean_names

* changelog

* limit import location for polars

* limit import location for polars

* fix polars in environment-dev.yml

* install polars in doctest

* limit polars imports - user should have polars already installed

* use subprocess.run

* add subprocess.devnull to docstrings

* add subprocess.devnull to docstrings

* add subprocess.devnull to docstrings

* add subprocess.devnull to docstrings

* add os.devnull

* add polars as requirement for docs

* add polars to tests requirements

* delete irrelevant folder

* changelog

* create submodule for polars

* fix doctests

* fix tests; add polars to documentation

* fix tests; add polars to documentation

* import janitor.polars

* control docs output for polars submodule

* exclude functions in docs rendering

* exclude functions in docs rendering

* show_submodules=true

* fix docstring rendering for polars

* Expression -> expression

* rename functions.py

* pivot_longer implemented for polars

* changelog

* keep changes related only to pivot_longer

* pd -> pl

* pd -> pl

* df.pivot_longer -> df.janitor.pivot_longer

* df.pivot_longer -> df.janitor.pivot_longer

* pd -> pl

* pd -> pl

* add >>> df

* add >>> df

* keep changes related only to polars pivot_longer

* add polars support to read_commandline

* remove irrelevant files

* minor edit to docs

* xlsx_table now supports polars

---------

Co-authored-by: samuel.oranyeli <samuel.oranyeli@grow.inc>
Co-authored-by: Eric Ma &lt;ericmjl@users.noreply.github.com&gt;
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,7 @@
 # Changelog
 
 ## [Unreleased]
+-  [ENH] `xlsx_table` function now supports polars - Issue #1352
 
 -  [ENH] Improved performance for non-equi joins when using numba - @samukweku PR #1341
 -  [ENH] Added a `clean_names` method for polars - it can be used to clean the column names, or clean column values . Issue #1343
diff --git a/janitor/io.py b/janitor/io.py
@@ -8,7 +8,7 @@
 from glob import glob
 from io import StringIO
 from itertools import chain
-from typing import IO, TYPE_CHECKING, Any, Iterable, Union
+from typing import IO, TYPE_CHECKING, Any, Iterable, Mapping, Union
 
 import pandas as pd
 
@@ -142,21 +142,23 @@ def xlsx_table(
     path: Union[str, IO, Workbook],
     sheetname: str = None,
     table: Union[str, list, tuple] = None,
-) -> Union[pd.DataFrame, dict]:
+    engine: str = "pandas",
+) -> Mapping:
     """Returns a DataFrame of values in a table in the Excel file.
 
     This applies to an Excel file, where the data range is explicitly
     specified as a Microsoft Excel table.
 
     If there is a single table in the sheet, or a string is provided
-    as an argument to the `table` parameter, a pandas DataFrame is returned;
+    as an argument to the `table` parameter, a DataFrame is returned;
     if there is more than one table in the sheet,
     and the `table` argument is `None`, or a list/tuple of names,
     a dictionary of DataFrames is returned, where the keys of the dictionary
     are the table names.
 
     Examples:
         >>> import pandas as pd
+        >>> import polars as pl
         >>> from janitor import xlsx_table
         >>> filename="../pyjanitor/tests/test_data/016-MSPTDA-Excel.xlsx"
 
@@ -170,6 +172,20 @@ def xlsx_table(
         3           4    Competition
         4           5  Long Distance
 
+        >>> xlsx_table(filename, table='dCategory', engine='polars')
+        shape: (5, 2)
+        ┌────────────┬───────────────┐
+        │ CategoryID ┆ Category      │
+        │ ---        ┆ ---           │
+        │ i64        ┆ str           │
+        ╞════════════╪═══════════════╡
+        │ 1          ┆ Beginner      │
+        │ 2          ┆ Advanced      │
+        │ 3          ┆ Freestyle     │
+        │ 4          ┆ Competition   │
+        │ 5          ┆ Long Distance │
+        └────────────┴───────────────┘
+
         Multiple tables:
 
         >>> out=xlsx_table(filename, table=["dCategory", "dSalesReps"])
@@ -189,14 +205,16 @@ def xlsx_table(
     Args:
           path: Path to the Excel File. It can also be an openpyxl Workbook.
           table: Name of a table, or list of tables in the sheet.
+          engine: DataFrame engine. Should be either pandas or polars.
+            Defaults to pandas
 
     Raises:
         AttributeError: If a workbook is provided, and is a ReadOnlyWorksheet.
         ValueError: If there are no tables in the sheet.
         KeyError: If the provided table does not exist in the sheet.
 
     Returns:
-        A pandas DataFrame, or a dictionary of DataFrames,
+        A DataFrame, or a dictionary of DataFrames,
             if there are multiple arguments for the `table` parameter,
             or the argument to `table` is `None`.
     """  # noqa : E501
@@ -219,6 +237,22 @@ def xlsx_table(
             DeprecationWarning,
             stacklevel=find_stack_level(),
         )
+    if engine not in {"pandas", "polars"}:
+        raise ValueError("engine should be one of pandas or polars.")
+    base_engine = pd
+    if engine == "polars":
+        try:
+            import polars as pl
+
+            base_engine = pl
+        except ImportError:
+            import_message(
+                submodule="polars",
+                package="polars",
+                conda_channel="conda-forge",
+                pip_install=True,
+            )
+
     if table is not None:
         check("table", table, [str, list, tuple])
         if isinstance(table, (list, tuple)):
@@ -245,13 +279,15 @@ def _create_dataframe_or_dictionary_from_table(
             header_exist = contents.headerRowCount
             coordinates = contents.ref
             data = worksheet[coordinates]
-            data = [[entry.value for entry in cell] for cell in data]
             if header_exist:
                 header, *data = data
+                header = [cell.value for cell in header]
             else:
                 header = [f"C{num}" for num in range(len(data[0]))]
-            data = pd.DataFrame(data, columns=header)
-            dictionary[table_name] = data
+            data = zip(*data)
+            data = ([entry.value for entry in cell] for cell in data)
+            data = dict(zip(header, data))
+            dictionary[table_name] = base_engine.DataFrame(data)
         return dictionary
 
     worksheets = [worksheet for worksheet in ws if worksheet.tables.items()]