8
8
from glob import glob
9
9
from io import StringIO
10
10
from itertools import chain
11
- from typing import IO , TYPE_CHECKING , Any , Iterable , Union
11
+ from typing import IO , TYPE_CHECKING , Any , Iterable , Mapping , Union
12
12
13
13
import pandas as pd
14
14
@@ -142,21 +142,23 @@ def xlsx_table(
142
142
path : Union [str , IO , Workbook ],
143
143
sheetname : str = None ,
144
144
table : Union [str , list , tuple ] = None ,
145
- ) -> Union [pd .DataFrame , dict ]:
145
+ engine : str = "pandas" ,
146
+ ) -> Mapping :
146
147
"""Returns a DataFrame of values in a table in the Excel file.
147
148
148
149
This applies to an Excel file, where the data range is explicitly
149
150
specified as a Microsoft Excel table.
150
151
151
152
If there is a single table in the sheet, or a string is provided
152
- as an argument to the `table` parameter, a pandas DataFrame is returned;
153
+ as an argument to the `table` parameter, a DataFrame is returned;
153
154
if there is more than one table in the sheet,
154
155
and the `table` argument is `None`, or a list/tuple of names,
155
156
a dictionary of DataFrames is returned, where the keys of the dictionary
156
157
are the table names.
157
158
158
159
Examples:
159
160
>>> import pandas as pd
161
+ >>> import polars as pl
160
162
>>> from janitor import xlsx_table
161
163
>>> filename="../pyjanitor/tests/test_data/016-MSPTDA-Excel.xlsx"
162
164
@@ -170,6 +172,20 @@ def xlsx_table(
170
172
3 4 Competition
171
173
4 5 Long Distance
172
174
175
+ >>> xlsx_table(filename, table='dCategory', engine='polars')
176
+ shape: (5, 2)
177
+ ┌────────────┬───────────────┐
178
+ │ CategoryID ┆ Category │
179
+ │ --- ┆ --- │
180
+ │ i64 ┆ str │
181
+ ╞════════════╪═══════════════╡
182
+ │ 1 ┆ Beginner │
183
+ │ 2 ┆ Advanced │
184
+ │ 3 ┆ Freestyle │
185
+ │ 4 ┆ Competition │
186
+ │ 5 ┆ Long Distance │
187
+ └────────────┴───────────────┘
188
+
173
189
Multiple tables:
174
190
175
191
>>> out=xlsx_table(filename, table=["dCategory", "dSalesReps"])
@@ -189,14 +205,16 @@ def xlsx_table(
189
205
Args:
190
206
path: Path to the Excel File. It can also be an openpyxl Workbook.
191
207
table: Name of a table, or list of tables in the sheet.
208
+ engine: DataFrame engine. Should be either pandas or polars.
209
+ Defaults to pandas.
192
210
193
211
Raises:
194
212
AttributeError: If a workbook is provided, and is a ReadOnlyWorksheet.
195
213
ValueError: If there are no tables in the sheet.
196
214
KeyError: If the provided table does not exist in the sheet.
197
215
198
216
Returns:
199
- A pandas DataFrame, or a dictionary of DataFrames,
217
+ A DataFrame, or a dictionary of DataFrames,
200
218
if there are multiple arguments for the `table` parameter,
201
219
or the argument to `table` is `None`.
202
220
""" # noqa : E501
@@ -219,6 +237,22 @@ def xlsx_table(
219
237
DeprecationWarning ,
220
238
stacklevel = find_stack_level (),
221
239
)
240
+ if engine not in {"pandas" , "polars" }:
241
+ raise ValueError ("engine should be one of pandas or polars." )
242
+ base_engine = pd
243
+ if engine == "polars" :
244
+ try :
245
+ import polars as pl
246
+
247
+ base_engine = pl
248
+ except ImportError :
249
+ import_message (
250
+ submodule = "polars" ,
251
+ package = "polars" ,
252
+ conda_channel = "conda-forge" ,
253
+ pip_install = True ,
254
+ )
255
+
222
256
if table is not None :
223
257
check ("table" , table , [str , list , tuple ])
224
258
if isinstance (table , (list , tuple )):
@@ -245,13 +279,15 @@ def _create_dataframe_or_dictionary_from_table(
245
279
header_exist = contents .headerRowCount
246
280
coordinates = contents .ref
247
281
data = worksheet [coordinates ]
248
- data = [[entry .value for entry in cell ] for cell in data ]
249
282
if header_exist :
250
283
header , * data = data
284
+ header = [cell .value for cell in header ]
251
285
else :
252
286
header = [f"C{ num } " for num in range (len (data [0 ]))]
253
- data = pd .DataFrame (data , columns = header )
254
- dictionary [table_name ] = data
287
+ data = zip (* data )
288
+ data = ([entry .value for entry in cell ] for cell in data )
289
+ data = dict (zip (header , data ))
290
+ dictionary [table_name ] = base_engine .DataFrame (data )
255
291
return dictionary
256
292
257
293
worksheets = [worksheet for worksheet in ws if worksheet .tables .items ()]
0 commit comments