Skip to content

Commit bc132a7

Browse files
samukwekusamuel.oranyeli
authored and
samuel.oranyeli
committed
Add support for polars in xlsx_cells (#1358)
1 parent e2ab8df commit bc132a7

File tree

2 files changed

+70
-8
lines changed

2 files changed

+70
-8
lines changed

CHANGELOG.md

-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
# Changelog
22

33
## [Unreleased]
4-
- [ENH] `xlsx_table` function now supports polars - Issue #1352
54

65
- [ENH] Added a `pivot_longer` method, and a `pivot_longer_spec` function for polars - Issue #1352
76

janitor/io.py

+70-7
Original file line numberDiff line numberDiff line change
@@ -337,21 +337,24 @@ def xlsx_cells(
337337
border: bool = False,
338338
protection: bool = False,
339339
comment: bool = False,
340+
engine: str = "pandas",
340341
**kwargs: Any,
341-
) -> Union[dict, pd.DataFrame]:
342+
) -> Mapping:
342343
"""Imports data from spreadsheet without coercing it into a rectangle.
343344
344345
Each cell is represented by a row in a dataframe, and includes the
345346
cell's coordinates, the value, row and column position.
346347
The cell formatting (fill, font, border, etc) can also be accessed;
347348
usually this is returned as a dictionary in the cell, and the specific
348-
cell format attribute can be accessed using `pd.Series.str.get`.
349+
cell format attribute can be accessed using `pd.Series.str.get`
350+
or `pl.Series.struct.field` if it is a polars DataFrame.
349351
350352
Inspiration for this comes from R's [tidyxl][link] package.
351353
[link]: https://nacnudus.github.io/tidyxl/reference/tidyxl.html
352354
353355
Examples:
354356
>>> import pandas as pd
357+
>>> import polars as pl
355358
>>> from janitor import xlsx_cells
356359
>>> pd.set_option("display.max_columns", None)
357360
>>> pd.set_option("display.expand_frame_repr", False)
@@ -398,6 +401,40 @@ def xlsx_cells(
398401
7 00000000
399402
Name: fill, dtype: object
400403
404+
Access cell formatting in a polars DataFrame:
405+
406+
>>> out = xlsx_cells(filename, sheetnames="highlights", engine='polars', fill=True).get_column('fill')
407+
>>> out
408+
shape: (8,)
409+
Series: 'fill' [struct[3]]
410+
[
411+
{null,{"00000000","rgb",0.0},{"00000000","rgb",0.0}}
412+
{null,{"00000000","rgb",0.0},{"00000000","rgb",0.0}}
413+
{null,{"00000000","rgb",0.0},{"00000000","rgb",0.0}}
414+
{null,{"00000000","rgb",0.0},{"00000000","rgb",0.0}}
415+
{"solid",{"FFFFFF00","rgb",0.0},{"FFFFFF00","rgb",0.0}}
416+
{"solid",{"FFFFFF00","rgb",0.0},{"FFFFFF00","rgb",0.0}}
417+
{null,{"00000000","rgb",0.0},{"00000000","rgb",0.0}}
418+
{null,{"00000000","rgb",0.0},{"00000000","rgb",0.0}}
419+
]
420+
421+
Specific cell attributes can be accessed via Polars' struct:
422+
423+
>>> out.struct.field('fgColor').struct.field('rgb')
424+
shape: (8,)
425+
Series: 'rgb' [str]
426+
[
427+
"00000000"
428+
"00000000"
429+
"00000000"
430+
"00000000"
431+
"FFFFFF00"
432+
"FFFFFF00"
433+
"00000000"
434+
"00000000"
435+
]
436+
437+
401438
Args:
402439
path: Path to the Excel File. It can also be an openpyxl Workbook.
403440
sheetnames: Names of the sheets from which the cells are to be extracted.
@@ -426,6 +463,7 @@ def xlsx_cells(
426463
It is usually returned as a dictionary.
427464
comment: If `True`, return comment properties of the cell.
428465
It is usually returned as a dictionary.
466+
engine: DataFrame engine. Should be either pandas or polars.
429467
**kwargs: Any other attributes of the cell, that can be accessed from openpyxl.
430468
431469
Raises:
@@ -434,7 +472,7 @@ def xlsx_cells(
434472
is not an openpyxl cell attribute.
435473
436474
Returns:
437-
A pandas DataFrame, or a dictionary of DataFrames.
475+
A DataFrame, or a dictionary of DataFrames.
438476
""" # noqa : E501
439477

440478
try:
@@ -462,6 +500,21 @@ def xlsx_cells(
462500
path = load_workbook(
463501
filename=path, read_only=read_only, keep_links=False
464502
)
503+
if engine not in {"pandas", "polars"}:
504+
raise ValueError("engine should be one of pandas or polars.")
505+
base_engine = pd
506+
if engine == "polars":
507+
try:
508+
import polars as pl
509+
510+
base_engine = pl
511+
except ImportError:
512+
import_message(
513+
submodule="polars",
514+
package="polars",
515+
conda_channel="conda-forge",
516+
pip_install=True,
517+
)
465518
# start_point and end_point applies if the user is interested in
466519
# only a subset of the Excel File and knows the coordinates
467520
if start_point or end_point:
@@ -533,6 +586,7 @@ def xlsx_cells(
533586
start_point,
534587
end_point,
535588
include_blank_cells,
589+
base_engine=base_engine,
536590
)
537591
for sheetname in sheetnames
538592
}
@@ -552,6 +606,7 @@ def _xlsx_cells(
552606
start_point: Union[str, int],
553607
end_point: Union[str, int],
554608
include_blank_cells: bool,
609+
base_engine,
555610
):
556611
"""
557612
Function to process a single sheet. Returns a DataFrame.
@@ -567,7 +622,7 @@ def _xlsx_cells(
567622
path_is_workbook: True/False.
568623
569624
Returns:
570-
A pandas DataFrame.
625+
A DataFrame.
571626
"""
572627

573628
if start_point:
@@ -579,15 +634,23 @@ def _xlsx_cells(
579634
if (cell.value is None) and (not include_blank_cells):
580635
continue
581636
for value in defaults:
582-
frame[value].append(getattr(cell, value, None))
637+
outcome = getattr(cell, value, None)
638+
if value.startswith("is_"):
639+
pass
640+
elif outcome is not None:
641+
outcome = str(outcome)
642+
frame[value].append(outcome)
583643
for parent, boolean_value in parameters.items():
584644
check(f"The value for {parent}", boolean_value, [bool])
585645
if not boolean_value:
586646
continue
587647
boolean_value = _object_to_dict(getattr(cell, parent, None))
648+
if isinstance(boolean_value, dict) or (boolean_value is None):
649+
pass
650+
else:
651+
boolean_value = str(boolean_value)
588652
frame[parent].append(boolean_value)
589-
590-
return pd.DataFrame(frame, copy=False)
653+
return base_engine.DataFrame(frame)
591654

592655

593656
def _object_to_dict(obj):

0 commit comments

Comments
 (0)