@@ -132,6 +132,16 @@ def pivot_longer(
132
132
0 1 sp m 5564 2
133
133
1 1 rel f 65 3
134
134
135
+ Split the column labels for the above dataframe using named groups in `names_pattern`:
136
+
137
+ >>> df.pivot_longer(
138
+ ... index = 'id',
139
+ ... names_pattern = r"new_?(?P<diagnosis>.+)_(?P<gender>.)(?P<age>\\d+)",
140
+ ... )
141
+ id diagnosis gender age value
142
+ 0 1 sp m 5564 2
143
+ 1 1 rel f 65 3
144
+
135
145
Convert the dtypes of specific columns with `names_transform`:
136
146
137
147
>>> result = (df
@@ -173,6 +183,50 @@ def pivot_longer(
173
183
0 50 1 10 30
174
184
1 50 2 20 40
175
185
186
+ Replicate the above with named groups in `names_pattern` - use `_` instead of `.value`:
187
+
188
+ >>> df.pivot_longer(
189
+ ... index="unit",
190
+ ... names_pattern=r"(?P<_>x|y)_(?P<time>[0-9])(?P<__>_mean)",
191
+ ... )
192
+ unit time x_mean y_mean
193
+ 0 50 1 10 30
194
+ 1 50 2 20 40
195
+
196
+ Reshape dataframe by passing a sequence to `names_pattern`:
197
+
198
+ >>> df = pd.DataFrame({'hr1': [514, 573],
199
+ ... 'hr2': [545, 526],
200
+ ... 'team': ['Red Sox', 'Yankees'],
201
+ ... 'year1': [2007, 2007],
202
+ ... 'year2': [2008, 2008]})
203
+ >>> df
204
+ hr1 hr2 team year1 year2
205
+ 0 514 545 Red Sox 2007 2008
206
+ 1 573 526 Yankees 2007 2008
207
+ >>> df.pivot_longer(
208
+ ... index = 'team',
209
+ ... names_to = ['year', 'hr'],
210
+ ... names_pattern = ['year', 'hr']
211
+ ... )
212
+ team hr year
213
+ 0 Red Sox 514 2007
214
+ 1 Yankees 573 2007
215
+ 2 Red Sox 545 2008
216
+ 3 Yankees 526 2008
217
+
218
+ Reshape above dataframe by passing a dictionary to `names_pattern`:
219
+
220
+ >>> df.pivot_longer(
221
+ ... index = 'team',
222
+ ... names_pattern = {"year":"year", "hr":"hr"}
223
+ ... )
224
+ team hr year
225
+ 0 Red Sox 514 2007
226
+ 1 Yankees 573 2007
227
+ 2 Red Sox 545 2008
228
+ 3 Yankees 526 2008
229
+
176
230
Multiple values_to:
177
231
178
232
>>> df = pd.DataFrame(
@@ -207,7 +261,7 @@ def pivot_longer(
207
261
... column_names=slice("Mango", "Vodka"),
208
262
... names_to=("Fruit", "Drink"),
209
263
... values_to=("Pounds", "Ounces"),
210
- ... names_pattern=[r"M|O|W", r"G|V"],
264
+ ... names_pattern=["M|O|W", "G|V"],
211
265
... )
212
266
City State Fruit Pounds Drink Ounces
213
267
0 Houston Texas Mango 4 Gin 16.0
@@ -220,11 +274,34 @@ def pivot_longer(
220
274
7 Austin Texas Watermelon 99 None NaN
221
275
8 Hoover Alabama Watermelon 43 None NaN
222
276
277
+ Replicate the above transformation with a nested dictionary passed to `names_pattern`:
278
+
279
+ >>> df.pivot_longer(
280
+ ... index=["City", "State"],
281
+ ... column_names=slice("Mango", "Vodka"),
282
+ ... names_pattern={
283
+ ... "Fruit": {"Pounds": "M|O|W"},
284
+ ... "Drink": {"Ounces": "G|V"},
285
+ ... },
286
+ ... )
287
+ City State Fruit Pounds Drink Ounces
288
+ 0 Houston Texas Mango 4 Gin 16.0
289
+ 1 Austin Texas Mango 10 Gin 200.0
290
+ 2 Hoover Alabama Mango 90 Gin 34.0
291
+ 3 Houston Texas Orange 10 Vodka 20.0
292
+ 4 Austin Texas Orange 8 Vodka 33.0
293
+ 5 Hoover Alabama Orange 14 Vodka 18.0
294
+ 6 Houston Texas Watermelon 40 None NaN
295
+ 7 Austin Texas Watermelon 99 None NaN
296
+ 8 Hoover Alabama Watermelon 43 None NaN
223
297
224
298
!!! abstract "Version Changed"
225
299
226
300
- 0.24.0
227
301
- Added `dropna` parameter.
302
+ - 0.24.1
303
+ - `names_pattern` can accept a dictionary.
304
+ - named groups supported in `names_pattern`.
228
305
229
306
230
307
:param df: A pandas DataFrame.
@@ -257,13 +334,27 @@ def pivot_longer(
257
334
or regular expression. `names_sep` does not work with MultiIndex
258
335
columns.
259
336
:param names_pattern: Determines how the column name is broken up.
260
- It can be a regular expression containing matching groups (it takes
261
- the same specification as pandas' `str.extract` method), or a
262
- list/tuple of regular expressions. If it is a single regex, the
263
- number of groups must match the length of `names_to`.
337
+ It can be a regular expression containing matching groups.
338
+ Under the hood it is processed with pandas' `str.extract` function.
339
+ If it is a single regex, the number of groups must match
340
+ the length of `names_to`.
341
+ Named groups are supported if `names_to` is None. `_` is used
342
+ instead of `.value` as a placeholder in named groups.
343
+ `_` can be overloaded for multiple `.value`
344
+ calls - `_`, `__`, `___`, ...
345
+ `names_pattern` can also be a list/tuple of regular expressions.
346
+ It can also be a list/tuple of strings;
347
+ the strings will be treated as regular expressions.
348
+ Under the hood it is processed with pandas' `str.contains` function.
264
349
For a list/tuple of regular expressions,
265
350
`names_to` must also be a list/tuple and the lengths of both
266
351
arguments must match.
352
+ `names_pattern` can also be a dictionary, where the keys are
353
+ the new column names, while the values can be a regular expression
354
+ or a string which will be evaluated as a regular expression.
355
+ Alternatively, a nested dictionary can be used, where the sub
356
+ key(s) are associated with `values_to`. Please have a look
357
+ at the examples for usage.
267
358
`names_pattern` does not work with MultiIndex columns.
268
359
:param names_transform: Use this option to change the types of columns that
269
360
have been transformed to rows. This does not apply to the values' columns.
@@ -428,34 +519,40 @@ def _data_checks_pivot_longer(
428
519
if column_names is None :
429
520
column_names = df .columns .difference (index , sort = False ).tolist ()
430
521
431
- len_names_to = 0
432
522
if names_to is not None :
433
523
if isinstance (names_to , str ):
434
524
names_to = [names_to ]
435
525
elif isinstance (names_to , tuple ):
436
526
names_to = [* names_to ]
527
+
437
528
check ("names_to" , names_to , [list , str , tuple ])
438
529
439
530
uniques = set ()
440
531
for word in names_to :
441
- check (f"{ word } in names_to" , word , [str ])
532
+ check (f"' { word } ' in names_to" , word , [str ])
442
533
if (word in uniques ) and (word != ".value" ):
443
- raise ValueError (f"{ word } is duplicated in names_to." )
534
+ raise ValueError (f"' { word } ' is duplicated in names_to." )
444
535
uniques .add (word )
445
536
446
- len_names_to = len (names_to )
447
537
else :
448
538
if not any ((names_sep , names_pattern )):
449
539
names_to = ["variable" ]
450
540
451
541
check ("values_to" , values_to , [str , list , tuple ])
452
- if isinstance (values_to , (list , tuple )) and (
453
- not isinstance (names_pattern , (list , tuple ))
454
- ):
455
- raise TypeError (
456
- "values_to can be a list/tuple only "
457
- "if names_pattern is a list/tuple."
458
- )
542
+ if isinstance (values_to , (list , tuple )):
543
+ if not isinstance (names_pattern , (list , tuple )):
544
+ raise TypeError (
545
+ "values_to can be a list/tuple only "
546
+ "if names_pattern is a list/tuple."
547
+ )
548
+ if index :
549
+ exclude = set (values_to ).intersection (index )
550
+ if exclude :
551
+ raise ValueError (
552
+ f"Labels { * exclude , } in values_to already exist as "
553
+ "column labels assigned to the dataframe's "
554
+ "index parameter. Kindly use unique labels."
555
+ )
459
556
if (
460
557
(names_sep is None )
461
558
and (names_pattern is None )
@@ -475,25 +572,40 @@ def _data_checks_pivot_longer(
475
572
)
476
573
477
574
if names_pattern is not None :
478
- check ("names_pattern" , names_pattern , [ str , Pattern , list , tuple ])
479
- if names_to is None :
480
- raise ValueError ( "Kindly provide values for names_to." )
575
+ check (
576
+ "names_pattern" , names_pattern , [ str , Pattern , list , tuple , dict ]
577
+ )
481
578
if isinstance (names_pattern , (str , Pattern )):
482
- num_regex_grps = re .compile (names_pattern ).groups
483
-
484
- if len_names_to != num_regex_grps :
579
+ regex = re .compile (names_pattern )
580
+ if names_to is None :
581
+ if regex .groupindex :
582
+ names_to = regex .groupindex .keys ()
583
+ names_to = [
584
+ ".value"
585
+ if ("_" in name ) and (len (set (name )) == 1 )
586
+ else name
587
+ for name in names_to
588
+ ]
589
+ len_names_to = len (names_to )
590
+ else :
591
+ raise ValueError ("Kindly provide values for names_to." )
592
+ else :
593
+ len_names_to = len (names_to )
594
+ if len_names_to != regex .groups :
485
595
raise ValueError (
486
596
f"The length of names_to does not match "
487
597
"the number of groups in names_pattern. "
488
598
f"The length of names_to is { len_names_to } "
489
599
"while the number of groups in the regex "
490
- f"is { num_regex_grps } ."
600
+ f"is { regex . groups } ."
491
601
)
492
602
493
603
elif isinstance (names_pattern , (list , tuple )):
604
+ if names_to is None :
605
+ raise ValueError ("Kindly provide values for names_to." )
494
606
for word in names_pattern :
495
- check (f"{ word } in names_pattern" , word , [str , Pattern ])
496
-
607
+ check (f"' { word } ' in names_pattern" , word , [str , Pattern ])
608
+ len_names_to = len ( names_to )
497
609
if len (names_pattern ) != len_names_to :
498
610
raise ValueError (
499
611
f"The length of names_to does not match "
@@ -521,12 +633,91 @@ def _data_checks_pivot_longer(
521
633
check (f"{ word } in values_to" , word , [str ])
522
634
if word in names_to :
523
635
raise ValueError (
524
- f"{ word } in values_to already exists in names_to."
636
+ f"'{ word } ' in values_to "
637
+ "already exists in names_to."
525
638
)
526
639
527
640
if word in uniques :
528
- raise ValueError (f"{ word } is duplicated in values_to." )
641
+ raise ValueError (
642
+ f"'{ word } ' is duplicated in values_to."
643
+ )
529
644
uniques .add (word )
645
+ # outer keys belong to names_to
646
+ # if the values are dicts,
647
+ # then the inner key belongs to values_to
648
+ # inner keys should not exist in the outer keys
649
+ # the values (non-keys) are the regexes that become names_pattern
650
+ elif isinstance (names_pattern , dict ):
651
+ if names_to is not None :
652
+ raise ValueError (
653
+ "names_to should be None "
654
+ "when names_pattern is a dictionary"
655
+ )
656
+ for key , value in names_pattern .items ():
657
+ check (f"'{ key } ' in names_pattern" , key , [str ])
658
+ if index and (key in index ):
659
+ raise ValueError (
660
+ f"'{ key } ' in the names_pattern dictionary "
661
+ "already exists as a column label "
662
+ "assigned to the index parameter. "
663
+ "Kindly use a unique name"
664
+ )
665
+ names_to = list (names_pattern )
666
+ is_dict = (
667
+ isinstance (arg , dict ) for _ , arg in names_pattern .items ()
668
+ )
669
+ if all (is_dict ):
670
+ values_to = []
671
+ patterns = []
672
+ for key , value in names_pattern .items ():
673
+ if len (value ) != 1 :
674
+ raise ValueError (
675
+ "The length of the dictionary paired "
676
+ f"with '{ key } ' in names_pattern "
677
+ "should be length 1, instead got "
678
+ f"{ len (value )} "
679
+ )
680
+ for k , v in value .items ():
681
+ if not isinstance (k , str ):
682
+ raise TypeError (
683
+ "The key in the nested dictionary "
684
+ f"for '{ key } ' in names_pattern "
685
+ "should be a string, instead got {type(k)}"
686
+ )
687
+ if k in names_pattern :
688
+ raise ValueError (
689
+ f"'{ k } ' in the nested dictionary "
690
+ "already exists as one of the main "
691
+ "keys in names_pattern"
692
+ )
693
+ if index and (k in index ):
694
+ raise ValueError (
695
+ f"'{ k } ' in the nested dictionary "
696
+ "already exists as a column label "
697
+ "assigned to the index parameter. "
698
+ "Kindly use a unique name"
699
+ )
700
+ check (
701
+ f"The value paired with '{ k } ' "
702
+ "in the nested dictionary in names_pattern" ,
703
+ v ,
704
+ [str , Pattern ],
705
+ )
706
+ patterns .append (v )
707
+ values_to .append (k )
708
+ else :
709
+ patterns = []
710
+ for key , value in names_pattern .items ():
711
+ check (
712
+ f"The value paired with '{ key } ' "
713
+ "in the names_pattern dictionary" ,
714
+ value ,
715
+ [str , Pattern ],
716
+ )
717
+
718
+ patterns .append (value )
719
+ names_pattern = patterns
720
+ patterns = None
530
721
531
722
if names_sep is not None :
532
723
check ("names_sep" , names_sep , [str , Pattern ])
@@ -707,14 +898,6 @@ def _pivot_longer_names_pattern_sequence(
707
898
names_pattern is provided, and is a list/tuple.
708
899
"""
709
900
values_to_is_a_sequence = isinstance (values_to , (list , tuple ))
710
- if values_to_is_a_sequence and index :
711
- exclude = set (values_to ).intersection (index )
712
- if exclude :
713
- raise ValueError (
714
- f"Labels { * exclude , } in values_to already exist as "
715
- "column labels assigned to the dataframe's index parameter. "
716
- "Kindly use unique labels."
717
- )
718
901
values = df .columns
719
902
720
903
mapping = [
0 commit comments