@@ -337,21 +337,24 @@ def xlsx_cells(
337
337
border : bool = False ,
338
338
protection : bool = False ,
339
339
comment : bool = False ,
340
+ engine : str = "pandas" ,
340
341
** kwargs : Any ,
341
- ) -> Union [ dict , pd . DataFrame ] :
342
+ ) -> Mapping :
342
343
"""Imports data from spreadsheet without coercing it into a rectangle.
343
344
344
345
Each cell is represented by a row in a dataframe, and includes the
345
346
cell's coordinates, the value, row and column position.
346
347
The cell formatting (fill, font, border, etc) can also be accessed;
347
348
usually this is returned as a dictionary in the cell, and the specific
348
- cell format attribute can be accessed using `pd.Series.str.get`.
349
+ cell format attribute can be accessed using `pd.Series.str.get`
350
+ or `pl.struct.field` if it is a polars DataFrame.
349
351
350
352
Inspiration for this comes from R's [tidyxl][link] package.
351
353
[link]: https://nacnudus.github.io/tidyxl/reference/tidyxl.html
352
354
353
355
Examples:
354
356
>>> import pandas as pd
357
+ >>> import polars as pl
355
358
>>> from janitor import xlsx_cells
356
359
>>> pd.set_option("display.max_columns", None)
357
360
>>> pd.set_option("display.expand_frame_repr", False)
@@ -398,6 +401,40 @@ def xlsx_cells(
398
401
7 00000000
399
402
Name: fill, dtype: object
400
403
404
+ Access cell formatting in a polars DataFrame:
405
+
406
+ >>> out = xlsx_cells(filename, sheetnames="highlights", engine='polars', fill=True).get_column('fill')
407
+ >>> out
408
+ shape: (8,)
409
+ Series: 'fill' [struct[3]]
410
+ [
411
+ {null,{"00000000","rgb",0.0},{"00000000","rgb",0.0}}
412
+ {null,{"00000000","rgb",0.0},{"00000000","rgb",0.0}}
413
+ {null,{"00000000","rgb",0.0},{"00000000","rgb",0.0}}
414
+ {null,{"00000000","rgb",0.0},{"00000000","rgb",0.0}}
415
+ {"solid",{"FFFFFF00","rgb",0.0},{"FFFFFF00","rgb",0.0}}
416
+ {"solid",{"FFFFFF00","rgb",0.0},{"FFFFFF00","rgb",0.0}}
417
+ {null,{"00000000","rgb",0.0},{"00000000","rgb",0.0}}
418
+ {null,{"00000000","rgb",0.0},{"00000000","rgb",0.0}}
419
+ ]
420
+
421
+ Specific cell attributes can be accessed via Polars' struct:
422
+
423
+ >>> out.struct.field('fgColor').struct.field('rgb')
424
+ shape: (8,)
425
+ Series: 'rgb' [str]
426
+ [
427
+ "00000000"
428
+ "00000000"
429
+ "00000000"
430
+ "00000000"
431
+ "FFFFFF00"
432
+ "FFFFFF00"
433
+ "00000000"
434
+ "00000000"
435
+ ]
436
+
437
+
401
438
Args:
402
439
path: Path to the Excel File. It can also be an openpyxl Workbook.
403
440
sheetnames: Names of the sheets from which the cells are to be extracted.
@@ -426,6 +463,7 @@ def xlsx_cells(
426
463
It is usually returned as a dictionary.
427
464
comment: If `True`, return comment properties of the cell.
428
465
It is usually returned as a dictionary.
466
+ engine: DataFrame engine. Should be either pandas or polars.
429
467
**kwargs: Any other attributes of the cell, that can be accessed from openpyxl.
430
468
431
469
Raises:
@@ -434,7 +472,7 @@ def xlsx_cells(
434
472
is not an openpyxl cell attribute.
435
473
436
474
Returns:
437
- A pandas DataFrame, or a dictionary of DataFrames.
475
+ A DataFrame, or a dictionary of DataFrames.
438
476
""" # noqa : E501
439
477
440
478
try :
@@ -462,6 +500,21 @@ def xlsx_cells(
462
500
path = load_workbook (
463
501
filename = path , read_only = read_only , keep_links = False
464
502
)
503
+ if engine not in {"pandas" , "polars" }:
504
+ raise ValueError ("engine should be one of pandas or polars." )
505
+ base_engine = pd
506
+ if engine == "polars" :
507
+ try :
508
+ import polars as pl
509
+
510
+ base_engine = pl
511
+ except ImportError :
512
+ import_message (
513
+ submodule = "polars" ,
514
+ package = "polars" ,
515
+ conda_channel = "conda-forge" ,
516
+ pip_install = True ,
517
+ )
465
518
# start_point and end_point applies if the user is interested in
466
519
# only a subset of the Excel File and knows the coordinates
467
520
if start_point or end_point :
@@ -533,6 +586,7 @@ def xlsx_cells(
533
586
start_point ,
534
587
end_point ,
535
588
include_blank_cells ,
589
+ base_engine = base_engine ,
536
590
)
537
591
for sheetname in sheetnames
538
592
}
@@ -552,6 +606,7 @@ def _xlsx_cells(
552
606
start_point : Union [str , int ],
553
607
end_point : Union [str , int ],
554
608
include_blank_cells : bool ,
609
+ base_engine ,
555
610
):
556
611
"""
557
612
Function to process a single sheet. Returns a DataFrame.
@@ -567,7 +622,7 @@ def _xlsx_cells(
567
622
path_is_workbook: True/False.
568
623
569
624
Returns:
570
- A pandas DataFrame.
625
+ A DataFrame.
571
626
"""
572
627
573
628
if start_point :
@@ -579,15 +634,23 @@ def _xlsx_cells(
579
634
if (cell .value is None ) and (not include_blank_cells ):
580
635
continue
581
636
for value in defaults :
582
- frame [value ].append (getattr (cell , value , None ))
637
+ outcome = getattr (cell , value , None )
638
+ if value .startswith ("is_" ):
639
+ pass
640
+ elif outcome is not None :
641
+ outcome = str (outcome )
642
+ frame [value ].append (outcome )
583
643
for parent , boolean_value in parameters .items ():
584
644
check (f"The value for { parent } " , boolean_value , [bool ])
585
645
if not boolean_value :
586
646
continue
587
647
boolean_value = _object_to_dict (getattr (cell , parent , None ))
648
+ if isinstance (boolean_value , dict ) or (boolean_value is None ):
649
+ pass
650
+ else :
651
+ boolean_value = str (boolean_value )
588
652
frame [parent ].append (boolean_value )
589
-
590
- return pd .DataFrame (frame , copy = False )
653
+ return base_engine .DataFrame (frame )
591
654
592
655
593
656
def _object_to_dict (obj ):
0 commit comments