Skip to content

Commit 3b781c1

Browse files
author
samuel.oranyeli
committed
xlsx_table now supports polars
1 parent 4d9c35f commit 3b781c1

File tree

4 files changed

+51
-29
lines changed

4 files changed

+51
-29
lines changed

.requirements/docs.in

+1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
mkdocs
2+
polars
23
mkdocs-material
34
mkdocstrings>=0.19.0
45
mkdocstrings-python

CHANGELOG.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# Changelog
22

33
## [Unreleased]
4-
- [ENH] `read_commandline` function now supports polars - Issue #1352
4+
- [ENH] `xlsx_table` function now supports polars - Issue #1352
55

66
## [v0.27.0] - 2024-03-21
77

janitor/io.py

+48-28
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ def read_csvs(
9393
return dfs_dict
9494

9595

96-
def read_commandline(cmd: str, engine="pandas", **kwargs: Any) -> Mapping:
96+
def read_commandline(cmd: str, **kwargs: Any) -> pd.DataFrame:
9797
"""Read a CSV file based on a command-line command.
9898
9999
For example, you may wish to run the following command on `sep-quarter.csv`
@@ -111,42 +111,26 @@ def read_commandline(cmd: str, engine="pandas", **kwargs: Any) -> Mapping:
111111
```
112112
113113
This function assumes that your command line command will return
114-
an output that is parsable using the relevant engine and StringIO.
115-
This function defaults to using `pd.read_csv` underneath the hood.
116-
Keyword arguments are passed through as-is.
114+
an output that is parsable using `pandas.read_csv` and StringIO.
115+
`pd.read_csv` is always used under the hood.
116+
Keyword arguments are passed through to `pd.read_csv`.
117117
118118
Args:
119119
cmd: Shell command to preprocess a file on disk.
120-
engine: DataFrame engine to process the output of the shell command.
121-
Currently supports both pandas and polars.
122120
**kwargs: Keyword arguments that are passed through to
123-
the engine's csv reader.
124-
121+
`pd.read_csv()`.
125122
126123
Returns:
127-
A DataFrame parsed from the stdout of the underlying
124+
A pandas DataFrame parsed from the stdout of the underlying
128125
shell.
129126
"""
130127

131128
check("cmd", cmd, [str])
132-
if engine not in {"pandas", "polars"}:
133-
raise ValueError("engine should be either pandas or polars.")
134129
# adding check=True ensures that an explicit, clear error
135130
# is raised, so that the user can see the reason for the failure
136131
outcome = subprocess.run(
137132
cmd, shell=True, capture_output=True, text=True, check=True
138133
)
139-
if engine == "polars":
140-
try:
141-
import polars as pl
142-
except ImportError:
143-
import_message(
144-
submodule="polars",
145-
package="polars",
146-
conda_channel="conda-forge",
147-
pip_install=True,
148-
)
149-
return pl.read_csv(StringIO(outcome.stdout), **kwargs)
150134
return pd.read_csv(StringIO(outcome.stdout), **kwargs)
151135

152136

@@ -158,21 +142,23 @@ def xlsx_table(
158142
path: Union[str, IO, Workbook],
159143
sheetname: str = None,
160144
table: Union[str, list, tuple] = None,
161-
) -> Union[pd.DataFrame, dict]:
145+
engine: str = "pandas",
146+
) -> Mapping:
162147
"""Returns a DataFrame of values in a table in the Excel file.
163148
164149
This applies to an Excel file, where the data range is explicitly
165150
specified as a Microsoft Excel table.
166151
167152
If there is a single table in the sheet, or a string is provided
168-
as an argument to the `table` parameter, a pandas DataFrame is returned;
153+
as an argument to the `table` parameter, a DataFrame is returned;
169154
if there is more than one table in the sheet,
170155
and the `table` argument is `None`, or a list/tuple of names,
171156
a dictionary of DataFrames is returned, where the keys of the dictionary
172157
are the table names.
173158
174159
Examples:
175160
>>> import pandas as pd
161+
>>> import polars as pl
176162
>>> from janitor import xlsx_table
177163
>>> filename="../pyjanitor/tests/test_data/016-MSPTDA-Excel.xlsx"
178164
@@ -186,6 +172,20 @@ def xlsx_table(
186172
3 4 Competition
187173
4 5 Long Distance
188174
175+
>>> xlsx_table(filename, table='dCategory', engine='polars')
176+
shape: (5, 2)
177+
┌────────────┬───────────────┐
178+
│ CategoryID ┆ Category │
179+
│ --- ┆ --- │
180+
│ i64 ┆ str │
181+
╞════════════╪═══════════════╡
182+
│ 1 ┆ Beginner │
183+
│ 2 ┆ Advanced │
184+
│ 3 ┆ Freestyle │
185+
│ 4 ┆ Competition │
186+
│ 5 ┆ Long Distance │
187+
└────────────┴───────────────┘
188+
189189
Multiple tables:
190190
191191
>>> out=xlsx_table(filename, table=["dCategory", "dSalesReps"])
@@ -205,14 +205,16 @@ def xlsx_table(
205205
Args:
206206
path: Path to the Excel File. It can also be an openpyxl Workbook.
207207
table: Name of a table, or list of tables in the sheet.
208+
engine: DataFrame engine. Should be either pandas or polars.
209+
Defaults to pandas.
208210
209211
Raises:
210212
AttributeError: If a workbook is provided, and is a ReadOnlyWorksheet.
211213
ValueError: If there are no tables in the sheet.
212214
KeyError: If the provided table does not exist in the sheet.
213215
214216
Returns:
215-
A pandas DataFrame, or a dictionary of DataFrames,
217+
A DataFrame, or a dictionary of DataFrames,
216218
if there are multiple arguments for the `table` parameter,
217219
or the argument to `table` is `None`.
218220
""" # noqa : E501
@@ -235,6 +237,22 @@ def xlsx_table(
235237
DeprecationWarning,
236238
stacklevel=find_stack_level(),
237239
)
240+
if engine not in {"pandas", "polars"}:
241+
raise ValueError("engine should be one of pandas or polars.")
242+
base_engine = pd
243+
if engine == "polars":
244+
try:
245+
import polars as pl
246+
247+
base_engine = pl
248+
except ImportError:
249+
import_message(
250+
submodule="polars",
251+
package="polars",
252+
conda_channel="conda-forge",
253+
pip_install=True,
254+
)
255+
238256
if table is not None:
239257
check("table", table, [str, list, tuple])
240258
if isinstance(table, (list, tuple)):
@@ -261,13 +279,15 @@ def _create_dataframe_or_dictionary_from_table(
261279
header_exist = contents.headerRowCount
262280
coordinates = contents.ref
263281
data = worksheet[coordinates]
264-
data = [[entry.value for entry in cell] for cell in data]
265282
if header_exist:
266283
header, *data = data
284+
header = [cell.value for cell in header]
267285
else:
268286
header = [f"C{num}" for num in range(len(data[0]))]
269-
data = pd.DataFrame(data, columns=header)
270-
dictionary[table_name] = data
287+
data = zip(*data)
288+
data = ([entry.value for entry in cell] for cell in data)
289+
data = dict(zip(header, data))
290+
dictionary[table_name] = base_engine.DataFrame(data)
271291
return dictionary
272292

273293
worksheets = [worksheet for worksheet in ws if worksheet.tables.items()]

mkdocs.yml

+1
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ nav:
4545
- Machine Learning: api/ml.md
4646
- Math: api/math.md
4747
# - PySpark: api/pyspark.md # will be added back later
48+
- Polars: api/polars.md
4849
- Timeseries: api/timeseries.md
4950
- XArray: api/xarray.md
5051
- Development Guide: devguide.md

0 commit comments

Comments
 (0)