Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
135 changes: 110 additions & 25 deletions src/uproot/behaviors/RNTuple.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import sys
import warnings
from collections.abc import Mapping
from functools import partial

import numpy

Expand All @@ -23,6 +24,7 @@
import uproot.language.python
import uproot.source.chunk
from uproot._util import no_filter, unset
from uproot.behaviors.TBranch import _regularize_array_cache


def iterate(
Expand Down Expand Up @@ -615,12 +617,13 @@ def arrays(
entry_start=None,
entry_stop=None,
decompression_executor=None, # TODO: Not implemented yet
array_cache="inherit", # TODO: Not implemented yet
array_cache="inherit",
library="ak", # TODO: Not implemented yet
backend="cpu",
interpreter="cpu",
ak_add_doc=False,
how=None,
virtual=True,
# For compatibility reasons we also accept kwargs meant for TTrees
interpretation_executor=None,
filter_branch=unset,
Expand Down Expand Up @@ -659,7 +662,7 @@ def arrays(
is used. (Not implemented yet.)
array_cache ("inherit", None, MutableMapping, or memory size): Cache of arrays;
if "inherit", use the file's cache; if None, do not use a cache;
if a memory size, create a new cache of this size. (Not implemented yet.)
if a memory size, create a new cache of this size.
library (str or :doc:`uproot.interpretation.library.Library`): The library
that is used to represent arrays. Options are ``"np"`` for NumPy,
``"ak"`` for Awkward Array, and ``"pd"`` for Pandas. (Not implemented yet.)
Expand All @@ -676,6 +679,7 @@ def arrays(
``list``, and ``dict``. Note that the container *type itself*
must be passed as ``how``, not an instance of that type (i.e.
``how=tuple``, not ``how=()``).
virtual (bool): If True, return virtual Awkward arrays, meaning that the data will not be loaded into memory until it is accessed.
interpretation_executor (None): This argument is not used and is only included for now
for compatibility with software that was used for :doc:`uproot.behaviors.TBranch.TBranch`. This argument should not be used
and will be removed in a future version.
Expand Down Expand Up @@ -725,6 +729,8 @@ def arrays(
[c.num_entries for c in clusters[start_cluster_idx:stop_cluster_idx]]
)

array_cache = _regularize_array_cache(array_cache, self.ntuple._file)

form, field_path = self.to_akform(
filter_name=filter_name,
filter_typename=filter_typename,
Expand All @@ -747,26 +753,49 @@ def arrays(
clusters_datas,
start_cluster_idx,
stop_cluster_idx,
pad_missing_element=True,
)

for key in target_cols:
if "column" in key and "union" not in key:
key_nr = int(key.split("-")[1])
# Find how many elements should be padded at the beginning
n_padding = self.ntuple.column_records[key_nr].first_element_index
n_padding -= cluster_starts[start_cluster_idx]
n_padding = max(n_padding, 0)
dtype = None
if interpreter == "cpu":
content = self.ntuple.read_col_pages(
content_generator = partial(
self.ntuple.read_cluster_range,
key_nr,
range(start_cluster_idx, stop_cluster_idx),
pad_missing_element=True,
start_cluster_idx,
stop_cluster_idx,
missing_element_padding=n_padding,
array_cache=array_cache,
)
if virtual:
total_length, _, dtype = (
self.ntuple._expected_array_length_starts_dtype(
key_nr,
start_cluster_idx,
stop_cluster_idx,
missing_element_padding=n_padding,
)
)
if "cardinality" in key:
total_length -= 1
content = (total_length, content_generator)
else:
content = content_generator()
elif interpreter == "gpu" and backend == "cuda":
content = content_dict[key_nr]
elif interpreter == "gpu":
raise NotImplementedError(
f"Backend {backend} GDS support not implemented."
)
else:
raise NotImplementedError(f"Backend {backend} not implemented.")
dtype_byte = self.ntuple.column_records[key_nr].type
_fill_container_dict(container_dict, content, key, dtype_byte)
_fill_container_dict(container_dict, content, key, dtype_byte, dtype)

cluster_offset = cluster_starts[start_cluster_idx]
entry_start -= cluster_offset
Expand Down Expand Up @@ -1771,27 +1800,75 @@ def _cupy_insert(arr, obj, value):
return out


def _fill_container_dict(container_dict, content, key, dtype_byte):
array_library_string = uproot._util.get_array_library(content)
def _fill_container_dict(container_dict, content, key, dtype_byte, dtype):
from awkward._nplikes.numpy import Numpy
from awkward._nplikes.virtual import VirtualNDArray

if type(content) == tuple:
# Virtual arrays not yet implemented for GPU
array_library_string = "numpy"
virtual = True
length = int(content[0])
raw_generator = content[1]
else:
virtual = False
array_library_string = uproot._util.get_array_library(content)

library = numpy if array_library_string == "numpy" else uproot.extras.cupy()

if "cardinality" in key:
content = library.diff(content)

if "optional" in key:
# We need to convert from a ListOffsetArray to an IndexedOptionArray
diff = library.diff(content)
missing = library.nonzero(diff == 0)[0]
missing -= library.arange(len(missing), dtype=missing.dtype)
dtype = "int64" if content.dtype == library.uint64 else "int32"
indices = library.arange(len(content) - len(missing), dtype=dtype)
if array_library_string == "numpy":
indices = numpy.insert(indices, missing, -1)
if virtual:

def generator():
materialized = raw_generator()
materialized = library.diff(materialized)
return materialized

virtual_array = VirtualNDArray(
Numpy.instance(), shape=(length,), dtype=dtype, generator=generator
)
container_dict[f"{key}-data"] = generator
else:
content = library.diff(content)
container_dict[f"{key}-data"] = content
elif "optional" in key:
if virtual:

def generator():
# We need to convert from a ListOffsetArray to an IndexedOptionArray
materialized = raw_generator()
diff = library.diff(materialized)
missing = library.nonzero(diff == 0)[0]
missing -= library.arange(len(missing), dtype=missing.dtype)
dtype = "int64" if materialized.dtype == library.uint64 else "int32"
indices = library.arange(len(materialized) - len(missing), dtype=dtype)
if array_library_string == "numpy":
indices = numpy.insert(indices, missing, -1)
else:
indices = _cupy_insert(indices, missing, -1)
return indices

virtual_array = VirtualNDArray(
Numpy.instance(), shape=(length,), dtype=dtype, generator=generator
)
container_dict[f"{key}-index"] = generator
else:
indices = _cupy_insert(indices, missing, -1)
container_dict[f"{key}-index"] = indices
# We need to convert from a ListOffsetArray to an IndexedOptionArray
diff = library.diff(content)
missing = library.nonzero(diff == 0)[0]
missing -= library.arange(len(missing), dtype=missing.dtype)
dtype = "int64" if content.dtype == library.uint64 else "int32"
indices = library.arange(len(content) - len(missing), dtype=dtype)
if array_library_string == "numpy":
indices = numpy.insert(indices, missing, -1)
else:
indices = _cupy_insert(indices, missing, -1)
container_dict[f"{key}-index"] = indices
elif dtype_byte == uproot.const.rntuple_col_type_to_num_dict["switch"]:
if virtual:
# TODO: Figure out how to handle this one
content = raw_generator()
print(f"{length} {len(content)}")
kindex, tags = uproot.models.RNTuple._split_switch_bits(content)
# Find invalid variants and adjust buffers accordingly
invalid = numpy.flatnonzero(tags == -1)
Expand All @@ -1808,6 +1885,14 @@ def _fill_container_dict(container_dict, content, key, dtype_byte):
container_dict[f"{key}-union-index"] = library.array(kindex)
container_dict[f"{key}-union-tags"] = library.array(tags)
else:
# don't distinguish data and offsets
container_dict[f"{key}-data"] = content
container_dict[f"{key}-offsets"] = content
if virtual:
virtual_array = VirtualNDArray(
Numpy.instance(), shape=(length,), dtype=dtype, generator=raw_generator
)
# don't distinguish data and offsets
container_dict[f"{key}-data"] = raw_generator
container_dict[f"{key}-offsets"] = raw_generator
else:
# don't distinguish data and offsets
container_dict[f"{key}-data"] = content
container_dict[f"{key}-offsets"] = content
8 changes: 4 additions & 4 deletions src/uproot/const.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,8 +138,8 @@
0x0B: "float16",
0x0C: "float32",
0x0D: "float64",
0x0E: "uint32", # Index32
0x0F: "uint64", # Index64
0x0E: "int32", # Index32
0x0F: "int64", # Index64
0x10: "switch", # Switch: (uint64, uint32)
0x11: "int16", # SplitInt16: split + zigzag encoding
0x12: "uint16", # SplitUInt16: split encoding
Expand All @@ -150,8 +150,8 @@
0x17: "float16", # SplitReal16: split encoding
0x18: "float32", # SplitReal32: split encoding
0x19: "float64", # SplitReal64: split encoding
0x1A: "uint32", # SplitIndex32: split + delta encoding
0x1B: "uint64", # SplitIndex64: split + delta encoding
0x1A: "int32", # SplitIndex32: split + delta encoding
0x1B: "int64", # SplitIndex64: split + delta encoding
0x1C: "real32trunc", # Real32Trunc: float32 with truncated mantissa
0x1D: "real32quant", # Real32Quant: float32 with quantized integer representation
}
Expand Down
Loading
Loading