xlsx_table now supports polars

samuel.oranyeli · samuel.oranyeli · commit 3b781c1a8e01 · 2024-05-01T21:42:09.000+10:00
diff --git a/.requirements/docs.in b/.requirements/docs.in
@@ -1,4 +1,5 @@
 mkdocs
+polars
 mkdocs-material
 mkdocstrings>=0.19.0
 mkdocstrings-python
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,7 +1,7 @@
 # Changelog
 
 ## [Unreleased]
--  [ENH] `read_commandline` function now supports polars - Issue #1352
+-  [ENH] `xlsx_table` function now supports polars - Issue #1352
 
 ## [v0.27.0] - 2024-03-21
 
diff --git a/janitor/io.py b/janitor/io.py
@@ -93,7 +93,7 @@ def read_csvs(
     return dfs_dict
 
 
-def read_commandline(cmd: str, engine="pandas", **kwargs: Any) -> Mapping:
+def read_commandline(cmd: str, **kwargs: Any) -> pd.DataFrame:
     """Read a CSV file based on a command-line command.
 
     For example, you may wish to run the following command on `sep-quarter.csv`
@@ -111,42 +111,26 @@ def read_commandline(cmd: str, engine="pandas", **kwargs: Any) -> Mapping:
     ```
 
     This function assumes that your command line command will return
-    an output that is parsable using the relevant engine and StringIO.
-    This function defaults to using `pd.read_csv` underneath the hood.
-    Keyword arguments are passed through as-is.
+    an output that is parsable using `pandas.read_csv` and StringIO.
+    We default to using `pd.read_csv` underneath the hood.
+    Keyword arguments are passed through to read_csv.
 
     Args:
         cmd: Shell command to preprocess a file on disk.
-        engine: DataFrame engine to process the output of the shell command.
-            Currently supports both pandas and polars.
         **kwargs: Keyword arguments that are passed through to
-            the engine's csv reader.
-
+            `pd.read_csv()`.
 
     Returns:
-        A DataFrame parsed from the stdout of the underlying
+        A pandas DataFrame parsed from the stdout of the underlying
             shell.
     """
 
     check("cmd", cmd, [str])
-    if engine not in {"pandas", "polars"}:
-        raise ValueError("engine should be either pandas or polars.")
     # adding check=True ensures that an explicit, clear error
     # is raised, so that the user can see the reason for the failure
     outcome = subprocess.run(
         cmd, shell=True, capture_output=True, text=True, check=True
     )
-    if engine == "polars":
-        try:
-            import polars as pl
-        except ImportError:
-            import_message(
-                submodule="polars",
-                package="polars",
-                conda_channel="conda-forge",
-                pip_install=True,
-            )
-        return pl.read_csv(StringIO(outcome.stdout), **kwargs)
     return pd.read_csv(StringIO(outcome.stdout), **kwargs)
 
 
@@ -158,21 +142,23 @@ def xlsx_table(
     path: Union[str, IO, Workbook],
     sheetname: str = None,
     table: Union[str, list, tuple] = None,
-) -> Union[pd.DataFrame, dict]:
+    engine: str = "pandas",
+) -> Mapping:
     """Returns a DataFrame of values in a table in the Excel file.
 
     This applies to an Excel file, where the data range is explicitly
     specified as a Microsoft Excel table.
 
     If there is a single table in the sheet, or a string is provided
-    as an argument to the `table` parameter, a pandas DataFrame is returned;
+    as an argument to the `table` parameter, a DataFrame is returned;
     if there is more than one table in the sheet,
     and the `table` argument is `None`, or a list/tuple of names,
     a dictionary of DataFrames is returned, where the keys of the dictionary
     are the table names.
 
     Examples:
         >>> import pandas as pd
+        >>> import polars as pl
         >>> from janitor import xlsx_table
         >>> filename="../pyjanitor/tests/test_data/016-MSPTDA-Excel.xlsx"
 
@@ -186,6 +172,20 @@ def xlsx_table(
         3           4    Competition
         4           5  Long Distance
 
+        >>> xlsx_table(filename, table='dCategory', engine='polars')
+        shape: (5, 2)
+        ┌────────────┬───────────────┐
+        │ CategoryID ┆ Category      │
+        │ ---        ┆ ---           │
+        │ i64        ┆ str           │
+        ╞════════════╪═══════════════╡
+        │ 1          ┆ Beginner      │
+        │ 2          ┆ Advanced      │
+        │ 3          ┆ Freestyle     │
+        │ 4          ┆ Competition   │
+        │ 5          ┆ Long Distance │
+        └────────────┴───────────────┘
+
         Multiple tables:
 
         >>> out=xlsx_table(filename, table=["dCategory", "dSalesReps"])
@@ -205,14 +205,16 @@ def xlsx_table(
     Args:
           path: Path to the Excel File. It can also be an openpyxl Workbook.
           table: Name of a table, or list of tables in the sheet.
+          engine: DataFrame engine. Should be either pandas or polars.
+            Defaults to pandas
 
     Raises:
         AttributeError: If a workbook is provided, and is a ReadOnlyWorksheet.
         ValueError: If there are no tables in the sheet.
         KeyError: If the provided table does not exist in the sheet.
 
     Returns:
-        A pandas DataFrame, or a dictionary of DataFrames,
+        A DataFrame, or a dictionary of DataFrames,
             if there are multiple arguments for the `table` parameter,
             or the argument to `table` is `None`.
     """  # noqa : E501
@@ -235,6 +237,22 @@ def xlsx_table(
             DeprecationWarning,
             stacklevel=find_stack_level(),
         )
+    if engine not in {"pandas", "polars"}:
+        raise ValueError("engine should be one of pandas or polars.")
+    base_engine = pd
+    if engine == "polars":
+        try:
+            import polars as pl
+
+            base_engine = pl
+        except ImportError:
+            import_message(
+                submodule="polars",
+                package="polars",
+                conda_channel="conda-forge",
+                pip_install=True,
+            )
+
     if table is not None:
         check("table", table, [str, list, tuple])
         if isinstance(table, (list, tuple)):
@@ -261,13 +279,15 @@ def _create_dataframe_or_dictionary_from_table(
             header_exist = contents.headerRowCount
             coordinates = contents.ref
             data = worksheet[coordinates]
-            data = [[entry.value for entry in cell] for cell in data]
             if header_exist:
                 header, *data = data
+                header = [cell.value for cell in header]
             else:
                 header = [f"C{num}" for num in range(len(data[0]))]
-            data = pd.DataFrame(data, columns=header)
-            dictionary[table_name] = data
+            data = zip(*data)
+            data = ([entry.value for entry in cell] for cell in data)
+            data = dict(zip(header, data))
+            dictionary[table_name] = base_engine.DataFrame(data)
         return dictionary
 
     worksheets = [worksheet for worksheet in ws if worksheet.tables.items()]
diff --git a/mkdocs.yml b/mkdocs.yml
@@ -45,6 +45,7 @@ nav:
       - Machine Learning: api/ml.md
       - Math: api/math.md
       # - PySpark: api/pyspark.md  # will be added back later
+      - Polars: api/polars.md
       - Timeseries: api/timeseries.md
       - XArray: api/xarray.md
   - Development Guide: devguide.md

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,5 @@`
`1`	`1`	`mkdocs`
	`2`	`+polars`
`2`	`3`	`mkdocs-material`
`3`	`4`	`mkdocstrings>=0.19.0`
`4`	`5`	`mkdocstrings-python`