@@ -93,7 +93,7 @@ def read_csvs(
93
93
return dfs_dict
94
94
95
95
96
- def read_commandline (cmd : str , engine = "pandas" , ** kwargs : Any ) -> Mapping :
96
+ def read_commandline (cmd : str , ** kwargs : Any ) -> pd . DataFrame :
97
97
"""Read a CSV file based on a command-line command.
98
98
99
99
For example, you may wish to run the following command on `sep-quarter.csv`
@@ -111,42 +111,26 @@ def read_commandline(cmd: str, engine="pandas", **kwargs: Any) -> Mapping:
111
111
```
112
112
113
113
This function assumes that your command line command will return
114
- an output that is parsable using the relevant engine and StringIO.
115
- This function defaults to using `pd.read_csv` underneath the hood.
116
- Keyword arguments are passed through as-is .
114
+ an output that is parsable using `pandas.read_csv` and StringIO.
115
+ It always uses `pd.read_csv` under the hood.
116
+ Keyword arguments are passed through to `pd.read_csv`.
117
117
118
118
Args:
119
119
cmd: Shell command to preprocess a file on disk.
120
- engine: DataFrame engine to process the output of the shell command.
121
- Currently supports both pandas and polars.
122
120
**kwargs: Keyword arguments that are passed through to
123
- the engine's csv reader.
124
-
121
+ `pd.read_csv()`.
125
122
126
123
Returns:
127
- A DataFrame parsed from the stdout of the underlying
124
+ A pandas DataFrame parsed from the stdout of the underlying
128
125
shell.
129
126
"""
130
127
131
128
check ("cmd" , cmd , [str ])
132
- if engine not in {"pandas" , "polars" }:
133
- raise ValueError ("engine should be either pandas or polars." )
134
129
# adding check=True ensures that an explicit, clear error
135
130
# is raised, so that the user can see the reason for the failure
136
131
outcome = subprocess .run (
137
132
cmd , shell = True , capture_output = True , text = True , check = True
138
133
)
139
- if engine == "polars" :
140
- try :
141
- import polars as pl
142
- except ImportError :
143
- import_message (
144
- submodule = "polars" ,
145
- package = "polars" ,
146
- conda_channel = "conda-forge" ,
147
- pip_install = True ,
148
- )
149
- return pl .read_csv (StringIO (outcome .stdout ), ** kwargs )
150
134
return pd .read_csv (StringIO (outcome .stdout ), ** kwargs )
151
135
152
136
@@ -158,21 +142,23 @@ def xlsx_table(
158
142
path : Union [str , IO , Workbook ],
159
143
sheetname : str = None ,
160
144
table : Union [str , list , tuple ] = None ,
161
- ) -> Union [pd .DataFrame , dict ]:
145
+ engine : str = "pandas" ,
146
+ ) -> Mapping :
162
147
"""Returns a DataFrame of values in a table in the Excel file.
163
148
164
149
This applies to an Excel file, where the data range is explicitly
165
150
specified as a Microsoft Excel table.
166
151
167
152
If there is a single table in the sheet, or a string is provided
168
- as an argument to the `table` parameter, a pandas DataFrame is returned;
153
+ as an argument to the `table` parameter, a DataFrame is returned;
169
154
if there is more than one table in the sheet,
170
155
and the `table` argument is `None`, or a list/tuple of names,
171
156
a dictionary of DataFrames is returned, where the keys of the dictionary
172
157
are the table names.
173
158
174
159
Examples:
175
160
>>> import pandas as pd
161
+ >>> import polars as pl
176
162
>>> from janitor import xlsx_table
177
163
>>> filename="../pyjanitor/tests/test_data/016-MSPTDA-Excel.xlsx"
178
164
@@ -186,6 +172,20 @@ def xlsx_table(
186
172
3 4 Competition
187
173
4 5 Long Distance
188
174
175
+ >>> xlsx_table(filename, table='dCategory', engine='polars')
176
+ shape: (5, 2)
177
+ ┌────────────┬───────────────┐
178
+ │ CategoryID ┆ Category │
179
+ │ --- ┆ --- │
180
+ │ i64 ┆ str │
181
+ ╞════════════╪═══════════════╡
182
+ │ 1 ┆ Beginner │
183
+ │ 2 ┆ Advanced │
184
+ │ 3 ┆ Freestyle │
185
+ │ 4 ┆ Competition │
186
+ │ 5 ┆ Long Distance │
187
+ └────────────┴───────────────┘
188
+
189
189
Multiple tables:
190
190
191
191
>>> out=xlsx_table(filename, table=["dCategory", "dSalesReps"])
@@ -205,14 +205,16 @@ def xlsx_table(
205
205
Args:
206
206
path: Path to the Excel File. It can also be an openpyxl Workbook.
207
207
table: Name of a table, or list of tables in the sheet.
208
+ engine: DataFrame engine. Should be either pandas or polars.
209
+ Defaults to pandas.
208
210
209
211
Raises:
210
212
AttributeError: If a workbook is provided, and is a ReadOnlyWorksheet.
211
213
ValueError: If there are no tables in the sheet.
212
214
KeyError: If the provided table does not exist in the sheet.
213
215
214
216
Returns:
215
- A pandas DataFrame, or a dictionary of DataFrames,
217
+ A DataFrame, or a dictionary of DataFrames,
216
218
if there are multiple arguments for the `table` parameter,
217
219
or the argument to `table` is `None`.
218
220
""" # noqa : E501
@@ -235,6 +237,22 @@ def xlsx_table(
235
237
DeprecationWarning ,
236
238
stacklevel = find_stack_level (),
237
239
)
240
+ if engine not in {"pandas" , "polars" }:
241
+ raise ValueError ("engine should be one of pandas or polars." )
242
+ base_engine = pd
243
+ if engine == "polars" :
244
+ try :
245
+ import polars as pl
246
+
247
+ base_engine = pl
248
+ except ImportError :
249
+ import_message (
250
+ submodule = "polars" ,
251
+ package = "polars" ,
252
+ conda_channel = "conda-forge" ,
253
+ pip_install = True ,
254
+ )
255
+
238
256
if table is not None :
239
257
check ("table" , table , [str , list , tuple ])
240
258
if isinstance (table , (list , tuple )):
@@ -261,13 +279,15 @@ def _create_dataframe_or_dictionary_from_table(
261
279
header_exist = contents .headerRowCount
262
280
coordinates = contents .ref
263
281
data = worksheet [coordinates ]
264
- data = [[entry .value for entry in cell ] for cell in data ]
265
282
if header_exist :
266
283
header , * data = data
284
+ header = [cell .value for cell in header ]
267
285
else :
268
286
header = [f"C{ num } " for num in range (len (data [0 ]))]
269
- data = pd .DataFrame (data , columns = header )
270
- dictionary [table_name ] = data
287
+ data = zip (* data )
288
+ data = ([entry .value for entry in cell ] for cell in data )
289
+ data = dict (zip (header , data ))
290
+ dictionary [table_name ] = base_engine .DataFrame (data )
271
291
return dictionary
272
292
273
293
worksheets = [worksheet for worksheet in ws if worksheet .tables .items ()]
0 commit comments