Skip to content

Commit 46ab4d8

Browse files
samukwekusamuel.oranyeliericmjl
authored
Add polars support for janitor.io.xlsx_table (#1357)
* add make_clean_names function that can be applied to polars * add examples for make_clean_names * changelog * limit import location for polars * limit import location for polars * fix polars in environment-dev.yml * install polars in doctest * limit polars imports - user should have polars already installed * use subprocess.run * add subprocess.devnull to docstrings * add subprocess.devnull to docstrings * add subprocess.devnull to docstrings * add subprocess.devnull to docstrings * add os.devnull * add polars as requirement for docs * add polars to tests requirements * delete irrelevant folder * changelog * create submodule for polars * fix doctests * fix tests; add polars to documentation * fix tests; add polars to documentation * import janitor.polars * control docs output for polars submodule * exclude functions in docs rendering * exclude functions in docs rendering * show_submodules=true * fix docstring rendering for polars * Expression -> expression * rename functions.py * pivot_longer implemented for polars * changelog * keep changes related only to pivot_longer * pd -> pl * pd -> pl * df.pivot_longer -> df.janitor.pivot_longer * df.pivot_longer -> df.janitor.pivot_longer * pd -> pl * pd -> pl * add >>> df * add >>> df * keep changes related only to polars pivot_longer * add polars support to read_commandline * remove irrelevant files * minor edit to docs * xlsx_table now supports polars --------- Co-authored-by: samuel.oranyeli <[email protected]> Co-authored-by: Eric Ma <[email protected]>
1 parent 891b711 commit 46ab4d8

File tree

2 files changed

+44
-7
lines changed

2 files changed

+44
-7
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# Changelog
22

33
## [Unreleased]
4+
- [ENH] `xlsx_table` function now supports polars - Issue #1352
45

56
- [ENH] Improved performance for non-equi joins when using numba - @samukweku PR #1341
67
- [ENH] Added a `clean_names` method for polars - it can be used to clean the column names, or clean column values. Issue #1343

janitor/io.py

+43-7
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from glob import glob
99
from io import StringIO
1010
from itertools import chain
11-
from typing import IO, TYPE_CHECKING, Any, Iterable, Union
11+
from typing import IO, TYPE_CHECKING, Any, Iterable, Mapping, Union
1212

1313
import pandas as pd
1414

@@ -142,21 +142,23 @@ def xlsx_table(
142142
path: Union[str, IO, Workbook],
143143
sheetname: str = None,
144144
table: Union[str, list, tuple] = None,
145-
) -> Union[pd.DataFrame, dict]:
145+
engine: str = "pandas",
146+
) -> Mapping:
146147
"""Returns a DataFrame of values in a table in the Excel file.
147148
148149
This applies to an Excel file, where the data range is explicitly
149150
specified as a Microsoft Excel table.
150151
151152
If there is a single table in the sheet, or a string is provided
152-
as an argument to the `table` parameter, a pandas DataFrame is returned;
153+
as an argument to the `table` parameter, a DataFrame is returned;
153154
if there is more than one table in the sheet,
154155
and the `table` argument is `None`, or a list/tuple of names,
155156
a dictionary of DataFrames is returned, where the keys of the dictionary
156157
are the table names.
157158
158159
Examples:
159160
>>> import pandas as pd
161+
>>> import polars as pl
160162
>>> from janitor import xlsx_table
161163
>>> filename="../pyjanitor/tests/test_data/016-MSPTDA-Excel.xlsx"
162164
@@ -170,6 +172,20 @@ def xlsx_table(
170172
3 4 Competition
171173
4 5 Long Distance
172174
175+
>>> xlsx_table(filename, table='dCategory', engine='polars')
176+
shape: (5, 2)
177+
┌────────────┬───────────────┐
178+
│ CategoryID ┆ Category │
179+
│ --- ┆ --- │
180+
│ i64 ┆ str │
181+
╞════════════╪═══════════════╡
182+
│ 1 ┆ Beginner │
183+
│ 2 ┆ Advanced │
184+
│ 3 ┆ Freestyle │
185+
│ 4 ┆ Competition │
186+
│ 5 ┆ Long Distance │
187+
└────────────┴───────────────┘
188+
173189
Multiple tables:
174190
175191
>>> out=xlsx_table(filename, table=["dCategory", "dSalesReps"])
@@ -189,14 +205,16 @@ def xlsx_table(
189205
Args:
190206
path: Path to the Excel File. It can also be an openpyxl Workbook.
191207
table: Name of a table, or list of tables in the sheet.
208+
engine: DataFrame engine. Should be either pandas or polars.
209+
Defaults to pandas.
192210
193211
Raises:
194212
AttributeError: If a workbook is provided, and is a ReadOnlyWorksheet.
195213
ValueError: If there are no tables in the sheet.
196214
KeyError: If the provided table does not exist in the sheet.
197215
198216
Returns:
199-
A pandas DataFrame, or a dictionary of DataFrames,
217+
A DataFrame, or a dictionary of DataFrames,
200218
if there are multiple arguments for the `table` parameter,
201219
or the argument to `table` is `None`.
202220
""" # noqa : E501
@@ -219,6 +237,22 @@ def xlsx_table(
219237
DeprecationWarning,
220238
stacklevel=find_stack_level(),
221239
)
240+
if engine not in {"pandas", "polars"}:
241+
raise ValueError("engine should be one of pandas or polars.")
242+
base_engine = pd
243+
if engine == "polars":
244+
try:
245+
import polars as pl
246+
247+
base_engine = pl
248+
except ImportError:
249+
import_message(
250+
submodule="polars",
251+
package="polars",
252+
conda_channel="conda-forge",
253+
pip_install=True,
254+
)
255+
222256
if table is not None:
223257
check("table", table, [str, list, tuple])
224258
if isinstance(table, (list, tuple)):
@@ -245,13 +279,15 @@ def _create_dataframe_or_dictionary_from_table(
245279
header_exist = contents.headerRowCount
246280
coordinates = contents.ref
247281
data = worksheet[coordinates]
248-
data = [[entry.value for entry in cell] for cell in data]
249282
if header_exist:
250283
header, *data = data
284+
header = [cell.value for cell in header]
251285
else:
252286
header = [f"C{num}" for num in range(len(data[0]))]
253-
data = pd.DataFrame(data, columns=header)
254-
dictionary[table_name] = data
287+
data = zip(*data)
288+
data = ([entry.value for entry in cell] for cell in data)
289+
data = dict(zip(header, data))
290+
dictionary[table_name] = base_engine.DataFrame(data)
255291
return dictionary
256292

257293
worksheets = [worksheet for worksheet in ws if worksheet.tables.items()]

0 commit comments

Comments
 (0)