Skip to content

Commit f137305

Browse files
ioanaifpre-commit-ci[bot]jpivarski
authored
fix: dask failing for TTrees with duplicate TBranch names (#1189)
* fix: dask failing for TTrees with duplicate TBranch names * style: pre-commit fixes * Update _dask.py * Test file name update * preserve order in `common_keys` while dropping duplicates --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Jim Pivarski <[email protected]>
1 parent e47badf commit f137305

File tree

3 files changed

+48
-3
lines changed

3 files changed

+48
-3
lines changed

src/uproot/_dask.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -606,6 +606,7 @@ def real_filter_branch(branch):
606606
filter_typename=filter_typename,
607607
filter_branch=real_filter_branch,
608608
full_paths=full_paths,
609+
ignore_duplicates=True,
609610
)
610611

611612
if common_keys is None:
@@ -747,6 +748,7 @@ def _get_dask_array_delay_open(
747748
filter_typename=filter_typename,
748749
filter_branch=filter_branch,
749750
full_paths=full_paths,
751+
ignore_duplicates=True,
750752
)
751753

752754
dask_dict = {}
@@ -1441,6 +1443,7 @@ def real_filter_branch(branch):
14411443
filter_typename=filter_typename,
14421444
filter_branch=real_filter_branch,
14431445
full_paths=full_paths,
1446+
ignore_duplicates=True,
14441447
)
14451448

14461449
if common_keys is None:
@@ -1586,7 +1589,7 @@ def _get_dak_array_delay_open(
15861589
ffile_path, fobject_path = files[0][0:2]
15871590

15881591
if known_base_form is not None:
1589-
common_keys = list(known_base_form.fields)
1592+
common_keys = list(dict.fromkeys(known_base_form.fields))
15901593
base_form = known_base_form
15911594
else:
15921595
obj = uproot._util.regularize_object_path(
@@ -1598,6 +1601,7 @@ def _get_dak_array_delay_open(
15981601
filter_typename=filter_typename,
15991602
filter_branch=filter_branch,
16001603
full_paths=full_paths,
1604+
ignore_duplicates=True,
16011605
)
16021606
base_form = _get_ttree_form(
16031607
awkward, obj, common_keys, interp_options.get("ak_add_doc")

src/uproot/behaviors/TBranch.py

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1126,6 +1126,7 @@ def keys(
11261126
filter_branch=no_filter,
11271127
recursive=True,
11281128
full_paths=True,
1129+
ignore_duplicates=False,
11291130
):
11301131
"""
11311132
Args:
@@ -1143,6 +1144,7 @@ def keys(
11431144
full_paths (bool): If True, include the full path to each subbranch
11441145
with slashes (``/``); otherwise, use the descendant's name as
11451146
the output name.
1147+
ignore_duplicates (bool): If True, skip any key whose name has already been seen, so each name is returned only once; otherwise, return every key, including repeated names.
11461148
11471149
Returns the names of the subbranches as a list of strings.
11481150
"""
@@ -1153,6 +1155,7 @@ def keys(
11531155
filter_branch=filter_branch,
11541156
recursive=recursive,
11551157
full_paths=full_paths,
1158+
ignore_duplicates=ignore_duplicates,
11561159
)
11571160
)
11581161

@@ -1279,6 +1282,7 @@ def iterkeys(
12791282
filter_branch=no_filter,
12801283
recursive=True,
12811284
full_paths=True,
1285+
ignore_duplicates=False,
12821286
):
12831287
"""
12841288
Args:
@@ -1296,6 +1300,8 @@ def iterkeys(
12961300
full_paths (bool): If True, include the full path to each subbranch
12971301
with slashes (``/``); otherwise, use the descendant's name as
12981302
the output name.
1303+
ignore_duplicates (bool): If True, skip any key whose name has already been yielded, so each name is yielded only once; otherwise, yield every key, including repeated names.
1304+
12991305
13001306
Returns the names of the subbranches as an iterator over strings.
13011307
"""
@@ -1305,6 +1311,7 @@ def iterkeys(
13051311
filter_branch=filter_branch,
13061312
recursive=recursive,
13071313
full_paths=full_paths,
1314+
ignore_duplicates=ignore_duplicates,
13081315
):
13091316
yield k
13101317

@@ -1353,6 +1360,7 @@ def iteritems(
13531360
filter_branch=no_filter,
13541361
recursive=True,
13551362
full_paths=True,
1363+
ignore_duplicates=False,
13561364
):
13571365
"""
13581366
Args:
@@ -1370,6 +1378,8 @@ def iteritems(
13701378
full_paths (bool): If True, include the full path to each subbranch
13711379
with slashes (``/``) in the name; otherwise, use the descendant's
13721380
name as the name without modification.
1381+
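ignore_duplicates (bool): If True, skip any branch whose name has already been yielded, so each name appears in only one (name, branch) pair; otherwise, yield every pair, including those with repeated names.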
1382+
13731383
13741384
Returns (name, branch) pairs of the subbranches as an iterator over
13751385
2-tuples of (str, :doc:`uproot.behaviors.TBranch.TBranch`).
@@ -1385,6 +1395,8 @@ def iteritems(
13851395
f"filter_branch must be None or a function: TBranch -> bool, not {filter_branch!r}"
13861396
)
13871397

1398+
keys_set = set()
1399+
13881400
for branch in self.branches:
13891401
if (
13901402
(
@@ -1394,7 +1406,11 @@ def iteritems(
13941406
and (filter_typename is no_filter or filter_typename(branch.typename))
13951407
and (filter_branch is no_filter or filter_branch(branch))
13961408
):
1397-
yield branch.name, branch
1409+
if ignore_duplicates and branch.name in keys_set:
1410+
pass
1411+
else:
1412+
keys_set.add(branch.name)
1413+
yield branch.name, branch
13981414

13991415
if recursive:
14001416
for k1, v in branch.iteritems(
@@ -1408,7 +1424,11 @@ def iteritems(
14081424
if filter_name is no_filter or _filter_name_deep(
14091425
filter_name, self, v
14101426
):
1411-
yield k2, v
1427+
if ignore_duplicates and branch.name in keys_set:
1428+
pass
1429+
else:
1430+
keys_set.add(k2)
1431+
yield k2, v
14121432

14131433
def itertypenames(
14141434
self,
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# BSD 3-Clause License; see https://github.com/scikit-hep/uproot5/blob/main/LICENSE
2+
3+
import pytest
4+
import uproot
5+
import skhep_testdata
6+
7+
8+
def test_dask_duplicated_keys():
9+
10+
lazy = uproot.dask(
11+
skhep_testdata.data_path("uproot-metadata-performance.root") + ":Events"
12+
)
13+
materialized = lazy.FatJet_btagDDBvLV2.compute()
14+
15+
lazy = uproot.dask(skhep_testdata.data_path("uproot-issue513.root") + ":Delphes")
16+
materialized = lazy.Particle.compute()
17+
18+
lazy = uproot.dask(
19+
skhep_testdata.data_path("uproot-issue443.root") + ":muonDataTree"
20+
)
21+
materialized = lazy.hitEnd.compute()

0 commit comments

Comments
 (0)