scikit-hep · ariostas · Oct 2, 2025 · Oct 2, 2025 · Oct 2, 2025 · Oct 2, 2025
diff --git a/src/uproot/behaviors/RNTuple.py b/src/uproot/behaviors/RNTuple.py
@@ -15,6 +15,7 @@
 import sys
 import warnings
 from collections.abc import Mapping
+from functools import partial
 
 import numpy
 
@@ -23,6 +24,7 @@
 import uproot.language.python
 import uproot.source.chunk
 from uproot._util import no_filter, unset
+from uproot.behaviors.TBranch import _regularize_array_cache
 
 
 def iterate(
@@ -615,12 +617,13 @@ def arrays(
         entry_start=None,
         entry_stop=None,
         decompression_executor=None,  # TODO: Not implemented yet
-        array_cache="inherit",  # TODO: Not implemented yet
+        array_cache="inherit",
         library="ak",  # TODO: Not implemented yet
         backend="cpu",
         interpreter="cpu",
         ak_add_doc=False,
         how=None,
+        virtual=True,
         # For compatibility reasons we also accepts kwargs meant for TTrees
         interpretation_executor=None,
         filter_branch=unset,
@@ -659,7 +662,7 @@ def arrays(
                 is used. (Not implemented yet.)
             array_cache ("inherit", None, MutableMapping, or memory size): Cache of arrays;
                 if "inherit", use the file's cache; if None, do not use a cache;
-                if a memory size, create a new cache of this size. (Not implemented yet.)
+                if a memory size, create a new cache of this size.
             library (str or :doc:`uproot.interpretation.library.Library`): The library
                 that is used to represent arrays. Options are ``"np"`` for NumPy,
                 ``"ak"`` for Awkward Array, and ``"pd"`` for Pandas. (Not implemented yet.)
@@ -676,6 +679,7 @@ def arrays(
                 ``list``, and ``dict``. Note that the container *type itself*
                 must be passed as ``how``, not an instance of that type (i.e.
                 ``how=tuple``, not ``how=()``).
+            virtual (bool): If True, return virtual Awkward arrays, meaning that the data will not be loaded into memory until it is accessed.
             interpretation_executor (None): This argument is not used and is only included for now
                 for compatibility with software that was used for :doc:`uproot.behaviors.TBranch.TBranch`. This argument should not be used
                 and will be removed in a future version.
@@ -725,6 +729,8 @@ def arrays(
             [c.num_entries for c in clusters[start_cluster_idx:stop_cluster_idx]]
         )
 
+        array_cache = _regularize_array_cache(array_cache, self.ntuple._file)
+
         form, field_path = self.to_akform(
             filter_name=filter_name,
             filter_typename=filter_typename,
@@ -747,26 +753,49 @@ def arrays(
                 clusters_datas,
                 start_cluster_idx,
                 stop_cluster_idx,
-                pad_missing_element=True,
             )
 
         for key in target_cols:
             if "column" in key and "union" not in key:
                 key_nr = int(key.split("-")[1])
+                # Find how many elements should be padded at the beginning
+                n_padding = self.ntuple.column_records[key_nr].first_element_index
+                n_padding -= cluster_starts[start_cluster_idx]
+                n_padding = max(n_padding, 0)
+                dtype = None
                 if interpreter == "cpu":
-                    content = self.ntuple.read_col_pages(
+                    content_generator = partial(
+                        self.ntuple.read_cluster_range,
                         key_nr,
-                        range(start_cluster_idx, stop_cluster_idx),
-                        pad_missing_element=True,
+                        start_cluster_idx,
+                        stop_cluster_idx,
+                        missing_element_padding=n_padding,
+                        array_cache=array_cache,
                     )
+                    if virtual:
+                        total_length, _, dtype = (
+                            self.ntuple._expected_array_length_starts_dtype(
+                                key_nr,
+                                start_cluster_idx,
+                                stop_cluster_idx,
+                                missing_element_padding=n_padding,
+                            )
+                        )
+                        if "cardinality" in key:
+                            total_length -= 1
+                        content = (total_length, content_generator)
+                    else:
+                        content = content_generator()
                 elif interpreter == "gpu" and backend == "cuda":
                     content = content_dict[key_nr]
                 elif interpreter == "gpu":
                     raise NotImplementedError(
                         f"Backend {backend} GDS support not implemented."
                     )
+                else:
+                    raise NotImplementedError(f"Backend {backend} not implemented.")
                 dtype_byte = self.ntuple.column_records[key_nr].type
-                _fill_container_dict(container_dict, content, key, dtype_byte)
+                _fill_container_dict(container_dict, content, key, dtype_byte, dtype)
 
         cluster_offset = cluster_starts[start_cluster_idx]
         entry_start -= cluster_offset
@@ -1771,27 +1800,75 @@ def _cupy_insert(arr, obj, value):
     return out
 
 
-def _fill_container_dict(container_dict, content, key, dtype_byte):
-    array_library_string = uproot._util.get_array_library(content)
+def _fill_container_dict(container_dict, content, key, dtype_byte, dtype):
+    from awkward._nplikes.numpy import Numpy
+    from awkward._nplikes.virtual import VirtualNDArray
+
+    if type(content) == tuple:
+        # Virtual arrays not yet implemented for GPU
+        array_library_string = "numpy"
+        virtual = True
+        length = int(content[0])
+        raw_generator = content[1]
+    else:
+        virtual = False
+        array_library_string = uproot._util.get_array_library(content)
 
     library = numpy if array_library_string == "numpy" else uproot.extras.cupy()
 
     if "cardinality" in key:
-        content = library.diff(content)
-
-    if "optional" in key:
-        # We need to convert from a ListOffsetArray to an IndexedOptionArray
-        diff = library.diff(content)
-        missing = library.nonzero(diff == 0)[0]
-        missing -= library.arange(len(missing), dtype=missing.dtype)
-        dtype = "int64" if content.dtype == library.uint64 else "int32"
-        indices = library.arange(len(content) - len(missing), dtype=dtype)
-        if array_library_string == "numpy":
-            indices = numpy.insert(indices, missing, -1)
+        if virtual:
+
+            def generator():
+                materialized = raw_generator()
+                materialized = library.diff(materialized)
+                return materialized
+
+            virtual_array = VirtualNDArray(
+                Numpy.instance(), shape=(length,), dtype=dtype, generator=generator
+            )
+            container_dict[f"{key}-data"] = generator
+        else:
+            content = library.diff(content)
+            container_dict[f"{key}-data"] = content
+    elif "optional" in key:
+        if virtual:
+
+            def generator():
+                # We need to convert from a ListOffsetArray to an IndexedOptionArray
+                materialized = raw_generator()
+                diff = library.diff(materialized)
+                missing = library.nonzero(diff == 0)[0]
+                missing -= library.arange(len(missing), dtype=missing.dtype)
+                dtype = "int64" if materialized.dtype == library.uint64 else "int32"
+                indices = library.arange(len(materialized) - len(missing), dtype=dtype)
+                if array_library_string == "numpy":
+                    indices = numpy.insert(indices, missing, -1)
+                else:
+                    indices = _cupy_insert(indices, missing, -1)
+                return indices
+
+            virtual_array = VirtualNDArray(
+                Numpy.instance(), shape=(length,), dtype=dtype, generator=generator
+            )
+            container_dict[f"{key}-index"] = generator
         else:
-            indices = _cupy_insert(indices, missing, -1)
-        container_dict[f"{key}-index"] = indices
+            # We need to convert from a ListOffsetArray to an IndexedOptionArray
+            diff = library.diff(content)
+            missing = library.nonzero(diff == 0)[0]
+            missing -= library.arange(len(missing), dtype=missing.dtype)
+            dtype = "int64" if content.dtype == library.uint64 else "int32"
+            indices = library.arange(len(content) - len(missing), dtype=dtype)
+            if array_library_string == "numpy":
+                indices = numpy.insert(indices, missing, -1)
+            else:
+                indices = _cupy_insert(indices, missing, -1)
+            container_dict[f"{key}-index"] = indices
     elif dtype_byte == uproot.const.rntuple_col_type_to_num_dict["switch"]:
+        if virtual:
+            # TODO: Figure out how to handle this one
+            content = raw_generator()
+            print(f"{length}  {len(content)}")
         kindex, tags = uproot.models.RNTuple._split_switch_bits(content)
         # Find invalid variants and adjust buffers accordingly
         invalid = numpy.flatnonzero(tags == -1)
@@ -1808,6 +1885,14 @@ def _fill_container_dict(container_dict, content, key, dtype_byte):
         container_dict[f"{key}-union-index"] = library.array(kindex)
         container_dict[f"{key}-union-tags"] = library.array(tags)
     else:
-        # don't distinguish data and offsets
-        container_dict[f"{key}-data"] = content
-        container_dict[f"{key}-offsets"] = content
+        if virtual:
+            virtual_array = VirtualNDArray(
+                Numpy.instance(), shape=(length,), dtype=dtype, generator=raw_generator
+            )
+            # don't distinguish data and offsets
+            container_dict[f"{key}-data"] = raw_generator
+            container_dict[f"{key}-offsets"] = raw_generator
+        else:
+            # don't distinguish data and offsets
+            container_dict[f"{key}-data"] = content
+            container_dict[f"{key}-offsets"] = content
diff --git a/src/uproot/const.py b/src/uproot/const.py
@@ -138,8 +138,8 @@
     0x0B: "float16",
     0x0C: "float32",
     0x0D: "float64",
-    0x0E: "uint32",  # Index32
-    0x0F: "uint64",  # Index64
+    0x0E: "int32",  # Index32
+    0x0F: "int64",  # Index64
     0x10: "switch",  # Switch: (uint64, uint32)
     0x11: "int16",  # SplitInt16: split + zigzag encoding
     0x12: "uint16",  # SplitUInt16: split encoding
@@ -150,8 +150,8 @@
     0x17: "float16",  # SplitReal16: split encoding
     0x18: "float32",  # SplitReal32: split encoding
     0x19: "float64",  # SplitReal64: split encoding
-    0x1A: "uint32",  # SplitIndex32: split + delta encoding
-    0x1B: "uint64",  # SplitIndex64: split + delta encoding
+    0x1A: "int32",  # SplitIndex32: split + delta encoding
+    0x1B: "int64",  # SplitIndex64: split + delta encoding
     0x1C: "real32trunc",  # Real32Trunc: float32 with truncated mantissa
     0x1D: "real32quant",  # Real32Quant: float32 with quantized integer representation
 }