75 changes: 71 additions & 4 deletions src/Base/Array4.H
@@ -6,6 +6,7 @@
#pragma once

#include "pyAMReX.H"
#include "dlpack/dlpack.h"

#include <AMReX_Array4.H>
#include <AMReX_BLassert.H>
@@ -185,6 +186,7 @@ namespace pyAMReX
*/


/*
// CPU: __array_interface__ v3
// https://numpy.org/doc/stable/reference/arrays.interface.html
.def_property_readonly("__array_interface__", [](Array4<T> const & a4) {
@@ -220,15 +222,80 @@ namespace pyAMReX
d["version"] = 3;
return d;
})
*/


// TODO: __dlpack__ __dlpack_device__
// DLPack protocol (CPU, NVIDIA GPU, AMD GPU, Intel GPU, etc.)
// DLPack v1.1 protocol (CPU, NVIDIA GPU, AMD GPU, Intel GPU, etc.)
// https://dmlc.github.io/dlpack/latest/
// https://data-apis.org/array-api/latest/design_topics/data_interchange.html
// https://github.com/data-apis/consortium-feedback/issues/1
// https://github.com/dmlc/dlpack/blob/master/include/dlpack/dlpack.h
// https://docs.cupy.dev/en/stable/user_guide/interoperability.html#dlpack-data-exchange-protocol
.def("__dlpack__", [](
Array4<T> const &a4,
/* TODO: Handle keyword arguments */
[[maybe_unused]] std::optional<py::handle> stream = std::nullopt,
[[maybe_unused]] std::optional<std::tuple<int, int>> max_version = std::nullopt,
[[maybe_unused]] std::optional<std::tuple<DLDeviceType, int32_t>> dl_device = std::nullopt,
[[maybe_unused]] std::optional<bool> copy = std::nullopt

)
{
// Allocate shape/strides arrays
constexpr int ndim = 4;
auto const len = length(a4);

// Construct DLManagedTensorVersioned (DLPack 1.1 standard)
auto *dl_mgt_tensor = new DLManagedTensorVersioned;
dl_mgt_tensor->version = DLPackVersion{};
dl_mgt_tensor->flags = 0; // No special flags
dl_mgt_tensor->dl_tensor.data = const_cast<void*>(static_cast<const void*>(a4.dataPtr()));
dl_mgt_tensor->dl_tensor.device = dlpack::detect_device_from_pointer(a4.dataPtr());
dl_mgt_tensor->dl_tensor.ndim = ndim;
dl_mgt_tensor->dl_tensor.dtype = dlpack::get_dlpack_dtype<T>();
dl_mgt_tensor->dl_tensor.shape = new int64_t[ndim]{a4.nComp(), len.z, len.y, len.x};
dl_mgt_tensor->dl_tensor.strides = new int64_t[ndim]{a4.nstride, a4.kstride, a4.jstride, 1};
dl_mgt_tensor->dl_tensor.byte_offset = 0;
dl_mgt_tensor->manager_ctx = nullptr; // TODO: we can increase/decrease the Python ref counter of the producer here
dl_mgt_tensor->deleter = [](DLManagedTensorVersioned *self) {
delete[] self->dl_tensor.shape;
delete[] self->dl_tensor.strides;
delete self;
};
// Return as Python capsule
return py::capsule(
dl_mgt_tensor,
"dltensor_versioned",
/*[](void* ptr) {
auto* tensor = static_cast<DLManagedTensorVersioned*>(ptr);
tensor->deleter(tensor);
}*/
[](PyObject *capsule)
{
if (PyCapsule_IsValid(capsule, "used_dltensor_versioned")) {
return; /* Do nothing if the capsule has been consumed. */
}
auto *p = static_cast<DLManagedTensorVersioned*>(
PyCapsule_GetPointer(capsule, "dltensor_versioned"));
if (p && p->deleter)
p->deleter(p);
}
);
},
py::arg("stream") = py::none(),
py::arg("max_version") = py::none(),
py::arg("dl_device") = py::none(),
py::arg("copy") = py::none(),
R"doc(
DLPack protocol for zero-copy tensor exchange.
See https://dmlc.github.io/dlpack/latest/ for details.
)doc"
)
.def("__dlpack_device__", [](Array4<T> const &a4) {
DLDevice device = dlpack::detect_device_from_pointer(a4.dataPtr());
return std::make_tuple(static_cast<int32_t>(device.device_type), device.device_id);
}, R"doc(
DLPack device info (device_type, device_id).
)doc")


.def("to_host", [](Array4<T> const & a4) {
// py::tuple to std::vector
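With these bindings, any DLPack-aware library can ingest an Array4 through the standard protocol hooks. A minimal consumer-side sketch, assuming a CPU build, an existing Array4 named arr, and a NumPy version recent enough to accept DLPack v1.x versioned capsules:

import numpy as np

# __dlpack_device__ reports where the data lives as (device_type, device_id);
# on a CPU build this is (kDLCPU == 1, 0).
device_type, device_id = arr.__dlpack_device__()

# np.from_dlpack calls arr.__dlpack__() and wraps the returned capsule in a
# zero-copy ndarray; the exported shape is (ncomp, nz, ny, nx) with the
# strides taken from the Array4 (nstride, kstride, jstride, 1).
view = np.from_dlpack(arr)
print(view.shape, view.strides)
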
1 change: 1 addition & 0 deletions src/CMakeLists.txt
@@ -2,6 +2,7 @@
add_subdirectory(AmrCore)
add_subdirectory(Base)
#add_subdirectory(Boundary)
add_subdirectory(dlpack)
#add_subdirectory(EB)
#add_subdirectory(Extern)
#add_subdirectory(LinearSolvers)
62 changes: 56 additions & 6 deletions src/amrex/extensions/Array4.py
@@ -1,7 +1,7 @@
"""
This file is part of pyAMReX

Copyright 2023 AMReX community
Copyright 2023-2025 AMReX community
Authors: Axel Huebl
License: BSD-3-Clause-LBNL
"""
@@ -92,9 +92,52 @@ def array4_to_cupy(self, copy=False, order="F"):
raise ValueError("The order argument must be F or C.")


def array4_to_dpnp(self, copy=False, order="F"):
"""
Provide a dpnp view into an Array4.

This includes ngrow guard cells of the box.

Note on the order of indices:
By default, this is as in AMReX in Fortran contiguous order, indexing as
x,y,z. This has performance implications for use in external libraries such
as dpnp.
The order="C" option will index as z,y,x and may perform better.
https://github.com/AMReX-Codes/pyamrex/issues/55#issuecomment-1579610074

Parameters
----------
self : amrex.Array4_*
An Array4 class in pyAMReX
copy : bool, optional
Copy the data if true, otherwise create a view (default).
order : string, optional
F order (default) or C. C is faster with external libraries.

Returns
-------
dpnp.array
A dpnp n-dimensional array.

Raises
------
ImportError
Raises an exception if dpnp is not installed
"""
import dpnp as dp

if order == "F":
return dp.from_dlpack(self, copy=copy).T
elif order == "C":
return dp.from_dlpack(self, copy=copy)
else:
raise ValueError("The order argument must be F or C.")
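A short usage sketch of the new helper, assuming a SYCL-enabled build with dpnp installed and an existing MultiFab named mf:

for mfi in mf:                    # mf is an existing amr.MultiFab
    arr = mf.array(mfi)
    x_f = arr.to_dpnp()           # Fortran order: indexed as x, y, z, comp
    x_c = arr.to_dpnp(order="C")  # C order: indexed as comp, z, y, x
    x_c[...] = 1.0                # with copy=False (default) this is a view,
                                  # so writes modify the underlying AMReX data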


def array4_to_xp(self, copy=False, order="F"):
"""
Provide a NumPy or CuPy view into an Array4, depending on amr.Config.have_gpu .
Provide a NumPy, CuPy or dpnp view into an Array4, depending on amr.Config.have_gpu
and amr.Config.gpu_backend .

This function is similar to CuPy's xp naming suggestion for CPU/GPU agnostic code:
https://docs.cupy.dev/en/stable/user_guide/basic.html#how-to-write-cpu-gpu-agnostic-code
@@ -120,14 +163,20 @@ def array4_to_xp(self, copy=False, order="F"):
Returns
-------
xp.array
A NumPy or CuPy n-dimensional array.
A NumPy, CuPy or dpnp n-dimensional array.
"""
import inspect

amr = inspect.getmodule(self)
return (
self.to_cupy(copy, order) if amr.Config.have_gpu else self.to_numpy(copy, order)
)

if amr.Config.have_gpu:
if amr.Config.gpu_backend == "SYCL":
return self.to_dpnp(copy, order)
else: # if not SYCL use cupy
return self.to_cupy(copy, order)

# if no GPU, use NumPy
return self.to_numpy(copy, order)
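This dispatch keeps downstream code backend-agnostic; a minimal sketch, assuming an existing Array4 named arr:

x = arr.to_xp(order="C")   # NumPy on CPU builds, CuPy on CUDA/HIP, dpnp on SYCL
x *= 2.0                   # the same line runs on the host or on the device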


def register_Array4_extension(amr):
@@ -144,4 +193,5 @@ def register_Array4_extension(amr):
):
Array4_type.to_numpy = array4_to_numpy
Array4_type.to_cupy = array4_to_cupy
Array4_type.to_dpnp = array4_to_dpnp
Array4_type.to_xp = array4_to_xp
80 changes: 23 additions & 57 deletions src/amrex/extensions/MultiFab.py
@@ -63,50 +63,10 @@ def mf_to_numpy(self, copy=False, order="F"):
return views


def mf_to_cupy(self, copy=False, order="F"):
"""
Provide a CuPy view into a MultiFab.

This includes ngrow guard cells of each box.

Note on the order of indices:
By default, this is as in AMReX in Fortran contiguous order, indexing as
x,y,z. This has performance implications for use in external libraries such
as cupy.
The order="C" option will index as z,y,x and perform better with cupy.
https://github.com/AMReX-Codes/pyamrex/issues/55#issuecomment-1579610074

Parameters
----------
self : amrex.MultiFab
A MultiFab class in pyAMReX
copy : bool, optional
Copy the data if true, otherwise create a view (default).
order : string, optional
F order (default) or C. C is faster with external libraries.

Returns
-------
list of cupy.array
A list of CuPy n-dimensional arrays, for each local block in the
MultiFab.

Raises
------
ImportError
Raises an exception if cupy is not installed
"""
views = []
for mfi in self:
views.append(self.array(mfi).to_cupy(copy, order))

return views


def mf_to_xp(self, copy=False, order="F"):
"""
Provide a NumPy or CuPy view into a MultiFab,
depending on amr.Config.have_gpu .
Provide a NumPy, CuPy or dpnp view into a MultiFab,
depending on amr.Config.have_gpu and amr.Config.gpu_backend .

This function is similar to CuPy's xp naming suggestion for CPU/GPU agnostic code:
https://docs.cupy.dev/en/stable/user_guide/basic.html#how-to-write-cpu-gpu-agnostic-code
@@ -132,15 +92,14 @@ def mf_to_xp(self, copy=False, order="F"):
Returns
-------
list of xp.array
A list of NumPy or CuPy n-dimensional arrays, for each local block in the
MultiFab.
A list of NumPy, CuPy or dpnp n-dimensional arrays, for each local block
in the MultiFab.
"""
import inspect
views = []
for mfi in self:
views.append(self.array(mfi).to_xp(copy, order))

amr = inspect.getmodule(self)
return (
self.to_cupy(copy, order) if amr.Config.have_gpu else self.to_numpy(copy, order)
)
return views
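A small per-box access sketch enabled by this helper, assuming an existing MultiFab named mf:

for block in mf.to_xp(order="C"):   # one NumPy/CuPy/dpnp array per local box
    block[...] = 0.0                # with copy=False (default), writes go into the MultiFab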


def copy_multifab(amr, self):
@@ -490,6 +449,10 @@ def __getitem__(self, index, with_internal_ghosts=False):
Whether to include internal ghost cells. When true, data from ghost cells may be used that
overlaps valid cells.
"""
import inspect

amr = inspect.getmodule(self)

index4 = _process_index(self, index)

# Gather the data to be included in a list to be sent to other processes
Expand All @@ -503,17 +466,19 @@ def __getitem__(self, index, with_internal_ghosts=False):
device_arr = _get_field(self, mfi)
slice_arr = device_arr[block_slices]
try:
# Copy data from device to host using cupy syntax
slice_arr = slice_arr.get()
if amr.Config.gpu_backend == "SYCL":
import dpnp

slice_arr = dpnp.asnumpy(slice_arr)
else:
# Copy data from device to host using cupy syntax
slice_arr = slice_arr.get()
except AttributeError:
# Array is already a numpy array on the host
pass
datalist.append((global_slices, slice_arr))

# Gather the data from all processors
import inspect

amr = inspect.getmodule(self)
if amr.Config.have_mpi:
npes = amr.ParallelDescriptor.NProcs()
else:
@@ -604,7 +569,10 @@ def __setitem__(self, index, value):

amr = inspect.getmodule(self)
if amr.Config.have_gpu:
import cupy as xp
if amr.Config.gpu_backend == "SYCL":
import dpnp as xp
else:
import cupy as xp
else:
xp = np
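
The same backend selection can be reused in user code that needs host copies of device data; a sketch, assuming amr is the imported pyAMReX module and dev_arr is an array obtained from to_xp:

import numpy as np

if amr.Config.have_gpu:
    if amr.Config.gpu_backend == "SYCL":
        import dpnp as xp
    else:
        import cupy as xp
else:
    xp = np

# Device-to-host transfer differs per backend: dpnp provides asnumpy(),
# CuPy arrays expose .get(), and NumPy arrays already live on the host.
if xp is np:
    host_arr = np.asarray(dev_arr)
elif amr.Config.gpu_backend == "SYCL":
    host_arr = xp.asnumpy(dev_arr)
else:
    host_arr = dev_arr.get()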

@@ -653,7 +621,6 @@ def register_MultiFab_extension(amr):
amr.MultiFab.__iter__ = lambda mfab: amr.MFIter(mfab)

amr.MultiFab.to_numpy = mf_to_numpy
amr.MultiFab.to_cupy = mf_to_cupy
amr.MultiFab.to_xp = mf_to_xp

amr.MultiFab.copy = lambda self: copy_multifab(amr, self)
Expand All @@ -669,7 +636,6 @@ def register_MultiFab_extension(amr):
amr.iMultiFab.__iter__ = lambda imfab: amr.MFIter(imfab)

amr.iMultiFab.to_numpy = mf_to_numpy
amr.iMultiFab.to_cupy = mf_to_cupy
amr.iMultiFab.to_xp = mf_to_xp

amr.iMultiFab.copy = lambda self: copy_multifab(amr, self)
6 changes: 6 additions & 0 deletions src/dlpack/CMakeLists.txt
@@ -0,0 +1,6 @@
foreach(D IN LISTS AMReX_SPACEDIM)
target_sources(pyAMReX_${D}d
PRIVATE
DLPack.cpp
)
endforeach()
35 changes: 35 additions & 0 deletions src/dlpack/DLPack.cpp
@@ -0,0 +1,35 @@
#include "pyAMReX.H"

#include "dlpack.h"


void init_DLPack(py::module& m)
{
using namespace amrex;

// register types only if not already present, e.g., from another library
// that also implements DLPack bindings and exposes the types

// TODO: py::type pyDLDeviceType = py::type::of<DLDeviceType>();
bool pyDLDeviceType = false;
if (!pyDLDeviceType) {
py::native_enum<DLDeviceType>(m, "DLDeviceType", "enum.IntEnum")
.value("kDLCPU", DLDeviceType::kDLCPU)
.value("kDLCUDA", DLDeviceType::kDLCUDA)
.value("kDLCUDAHost", DLDeviceType::kDLCUDAHost)
.value("kDLOpenCL", DLDeviceType::kDLOpenCL)
.value("kDLVulkan", DLDeviceType::kDLVulkan)
.value("kDLMetal", DLDeviceType::kDLMetal)
.value("kDLVPI", DLDeviceType::kDLVPI)
.value("kDLROCM", DLDeviceType::kDLROCM)
.value("kDLROCMHost", DLDeviceType::kDLROCMHost)
.value("kDLExtDev", DLDeviceType::kDLExtDev)
.value("kDLCUDAManaged", DLDeviceType::kDLCUDAManaged)
.value("kDLOneAPI", DLDeviceType::kDLOneAPI)
.value("kDLWebGPU", DLDeviceType::kDLWebGPU)
.value("kDLHexagon", DLDeviceType::kDLHexagon)
.value("kDLMAIA", DLDeviceType::kDLMAIA)
.finalize()
;
}

}
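
On the Python side, the registered enum can be paired with __dlpack_device__; a sketch, assuming init_DLPack is called on the top-level module (so the enum is exposed as amr.DLDeviceType) and an existing Array4 named arr:

import amrex.space3d as amr   # assuming a 3D build

dev_type, dev_id = arr.__dlpack_device__()
if dev_type == amr.DLDeviceType.kDLCPU:
    print("Array4 data lives in host memory, device id", dev_id)
elif dev_type in (amr.DLDeviceType.kDLCUDA, amr.DLDeviceType.kDLROCM,
                  amr.DLDeviceType.kDLOneAPI):
    print("Array4 data lives in device memory, device id", dev_id)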