uxlfoundation · Rbiessy · Apr 30, 2025 · Apr 22, 2025 · Apr 23, 2025 · Apr 23, 2025
@@ -338,7 +338,7 @@ Supported compilers include:
         <tr>
             <td rowspan=2 align="center">NVIDIA GPU</td>
             <td align="center">NVIDIA cuFFT</td>
-            <td align="center">Open DPC++</td>
+            <td align="center">Open DPC++</br>AdaptiveCpp</td>
             <td align="center">Dynamic, Static</td>
         </tr>
         <tr>
@@ -349,7 +349,7 @@ Supported compilers include:
         <tr>
             <td rowspan=2 align="center">AMD GPU</td>
             <td align="center">AMD rocFFT</td>
-            <td align="center">Open DPC++</td>
+            <td align="center">Open DPC++</br>AdaptiveCpp</td>
             <td align="center">Dynamic, Static</td>
         </tr>
         <tr>

@@ -57,7 +57,7 @@ additional guidance. The target architectures must be specified with
 ``HIP_TARGETS``. See the `AdaptiveCpp documentation
 <https://github.com/AdaptiveCpp/AdaptiveCpp/blob/develop/doc/using-hipsycl.md#adaptivecpp-targets-specification>`_.
 
-If a backend library supports multiple domains (i.e. BLAS, RNG), it may be
+If a backend library supports multiple domains (i.e. BLAS, DFT, RNG), it may be
 desirable to only enable selected domains. For this, the ``TARGET_DOMAINS``
 variable should be set. For further details, see :ref:`_build_target_domains`.
 
@@ -81,6 +81,9 @@ The most important supported build options are:
    * - ENABLE_CUBLAS_BACKEND
      - True, False
      - False     
+   * - ENABLE_CUFFT_BACKEND
+     - True, False
+     - False
    * - ENABLE_CURAND_BACKEND
      - True, False
      - False     
@@ -93,6 +96,9 @@ The most important supported build options are:
    * - ENABLE_ROCBLAS_BACKEND
      - True, False
      - False     
+   * - ENABLE_ROCFFT_BACKEND
+     - True, False
+     - False
    * - ENABLE_ROCRAND_BACKEND
      - True, False
      - False     
@@ -106,7 +112,7 @@ The most important supported build options are:
      - True, False
      - True      
    * - TARGET_DOMAINS (list)
-     - blas, rng
+     - blas, dft, rng
      - All supported domains
 
 Some additional build options are given in
@@ -120,8 +126,8 @@ Backends
 Building for CUDA
 ~~~~~~~~~~~~~~~~~
 
-The CUDA backends can be enabled with ``ENABLE_CUBLAS_BACKEND`` and
-``ENABLE_CURAND_BACKEND``.
+The CUDA backends can be enabled with ``ENABLE_CUBLAS_BACKEND``,
+``ENABLE_CUFFT_BACKEND`` and ``ENABLE_CURAND_BACKEND``.
 
 The target architecture must be set using the ``HIPSYCL_TARGETS`` parameter. For
 example, to target a Nvidia A100 (Ampere architecture), set
@@ -140,8 +146,8 @@ the CUDA libraries should be found automatically by CMake.
 Building for ROCm
 ~~~~~~~~~~~~~~~~~
 
-The ROCm backends can be enabled with ``ENABLE_ROCBLAS_BACKEND`` and
-``ENABLE_ROCRAND_BACKEND``.
+The ROCm backends can be enabled with ``ENABLE_ROCBLAS_BACKEND``,
+``ENABLE_ROCFFT_BACKEND`` and ``ENABLE_ROCRAND_BACKEND``.
 
 The target architecture must be set using the ``HIPSYCL_TARGETS`` parameter. See
 the `AdaptiveCpp documentation

@@ -76,7 +76,7 @@ ONEMATH_EXPORT void compute_backward(descriptor_type& desc,
             auto stream = detail::setup_stream(func_name, ih, plan);
 
             auto inout_native = reinterpret_cast<fwd<descriptor_type>*>(
-                ih.get_native_mem<sycl::backend::ext_oneapi_cuda>(inout_acc));
+                ih.get_native_mem<detail::sycl_cuda_backend>(inout_acc));
             detail::cufft_execute<detail::Direction::Backward, fwd<descriptor_type>>(
                 func_name, stream, plan, reinterpret_cast<void*>(inout_native + offsets[0]),
                 reinterpret_cast<void*>(inout_native + offsets[1]));
@@ -121,14 +121,14 @@ ONEMATH_EXPORT void compute_backward(descriptor_type& desc,
         dft::detail::fft_enqueue_task(cgh, [=](sycl::interop_handle ih) {
             auto stream = detail::setup_stream(func_name, ih, plan);
 
-            auto in_native = reinterpret_cast<void*>(
-                reinterpret_cast<bwd<descriptor_type>*>(
-                    ih.get_native_mem<sycl::backend::ext_oneapi_cuda>(in_acc)) +
-                offsets[0]);
-            auto out_native = reinterpret_cast<void*>(
-                reinterpret_cast<fwd<descriptor_type>*>(
-                    ih.get_native_mem<sycl::backend::ext_oneapi_cuda>(out_acc)) +
-                offsets[1]);
+            auto in_native =
+                reinterpret_cast<void*>(reinterpret_cast<bwd<descriptor_type>*>(
+                                            ih.get_native_mem<detail::sycl_cuda_backend>(in_acc)) +
+                                        offsets[0]);
+            auto out_native =
+                reinterpret_cast<void*>(reinterpret_cast<fwd<descriptor_type>*>(
+                                            ih.get_native_mem<detail::sycl_cuda_backend>(out_acc)) +
+                                        offsets[1]);
             detail::cufft_execute<detail::Direction::Backward, fwd<descriptor_type>>(
                 func_name, stream, plan, in_native, out_native);
         });

@@ -34,6 +34,8 @@
 #include "oneapi/math/dft/detail/cufft/onemath_dft_cufft.hpp"
 #include "oneapi/math/dft/types.hpp"
 
+#include "execute_helper.hpp"
+#include "../../execute_helper_generic.hpp"
 #include "../stride_helper.hpp"
 
 #include <cufft.h>
@@ -84,7 +86,7 @@ class cufft_commit final : public dft::detail::commit_impl<prec, dom> {
         if (fix_context) {
             // cufftDestroy changes the context so change it back.
             CUdevice interopDevice =
-                sycl::get_native<sycl::backend::ext_oneapi_cuda>(this->get_queue().get_device());
+                sycl::get_native<sycl_cuda_backend>(this->get_queue().get_device());
             CUcontext interopContext;
             if (cuDevicePrimaryCtxRetain(&interopContext, interopDevice) != CUDA_SUCCESS) {
                 throw math::exception("dft/backends/cufft", __FUNCTION__,
@@ -353,16 +355,16 @@ class cufft_commit final : public dft::detail::commit_impl<prec, dom> {
             .submit([&](sycl::handler& cgh) {
                 auto workspace_acc =
                     buffer_workspace.template get_access<sycl::access::mode::read_write>(cgh);
-                cgh.host_task([=](sycl::interop_handle ih) {
-                    auto stream = ih.get_native_queue<sycl::backend::ext_oneapi_cuda>();
+                dft::detail::fft_enqueue_task(cgh, [=](sycl::interop_handle ih) {
+                    auto stream = ih.get_native_queue<sycl_cuda_backend>();
                     auto result = cufftSetStream(plan, stream);
                     if (result != CUFFT_SUCCESS) {
                         throw oneapi::math::exception(
                             "dft/backends/cufft", "set_workspace",
                             "cufftSetStream returned " + std::to_string(result));
                     }
                     auto workspace_native = reinterpret_cast<scalar_type*>(
-                        ih.get_native_mem<sycl::backend::ext_oneapi_cuda>(workspace_acc));
+                        ih.get_native_mem<sycl_cuda_backend>(workspace_acc));
                     cufftSetWorkArea(plan, workspace_native);
                 });
             })

@@ -36,6 +36,12 @@
 
 namespace oneapi::math::dft::cufft::detail {
 
+#if defined(__ADAPTIVECPP__) || defined(__HIPSYCL__)
+constexpr auto sycl_cuda_backend{ sycl::backend::cuda };
+#else // DPC++
+constexpr auto sycl_cuda_backend{ sycl::backend::ext_oneapi_cuda };
+#endif
+
 template <dft::precision prec, dft::domain dom>
 inline dft::detail::commit_impl<prec, dom>* checked_get_commit(
     dft::detail::descriptor<prec, dom>& desc) {
@@ -142,7 +148,7 @@ void cufft_execute(const std::string& func, CUstream stream, cufftHandle plan, v
 }
 
 inline CUstream setup_stream(const std::string& func, sycl::interop_handle ih, cufftHandle plan) {
-    auto stream = ih.get_native_queue<sycl::backend::ext_oneapi_cuda>();
+    auto stream = ih.get_native_queue<sycl_cuda_backend>();
     auto result = cufftSetStream(plan, stream);
     if (result != CUFFT_SUCCESS) {
         throw oneapi::math::exception("dft/backends/cufft", func,

@@ -79,7 +79,7 @@ ONEMATH_EXPORT void compute_forward(descriptor_type& desc,
             auto stream = detail::setup_stream(func_name, ih, plan);
 
             auto inout_native = reinterpret_cast<fwd<descriptor_type>*>(
-                ih.get_native_mem<sycl::backend::ext_oneapi_cuda>(inout_acc));
+                ih.get_native_mem<detail::sycl_cuda_backend>(inout_acc));
             detail::cufft_execute<detail::Direction::Forward, fwd<descriptor_type>>(
                 func_name, stream, plan, reinterpret_cast<void*>(inout_native + offsets[0]),
                 reinterpret_cast<void*>(inout_native + offsets[1]));
@@ -124,14 +124,14 @@ ONEMATH_EXPORT void compute_forward(descriptor_type& desc,
         dft::detail::fft_enqueue_task(cgh, [=](sycl::interop_handle ih) {
             auto stream = detail::setup_stream(func_name, ih, plan);
 
-            auto in_native = reinterpret_cast<void*>(
-                reinterpret_cast<fwd<descriptor_type>*>(
-                    ih.get_native_mem<sycl::backend::ext_oneapi_cuda>(in_acc)) +
-                offsets[0]);
-            auto out_native = reinterpret_cast<void*>(
-                reinterpret_cast<bwd<descriptor_type>*>(
-                    ih.get_native_mem<sycl::backend::ext_oneapi_cuda>(out_acc)) +
-                offsets[1]);
+            auto in_native =
+                reinterpret_cast<void*>(reinterpret_cast<fwd<descriptor_type>*>(
+                                            ih.get_native_mem<detail::sycl_cuda_backend>(in_acc)) +
+                                        offsets[0]);
+            auto out_native =
+                reinterpret_cast<void*>(reinterpret_cast<bwd<descriptor_type>*>(
+                                            ih.get_native_mem<detail::sycl_cuda_backend>(out_acc)) +
+                                        offsets[1]);
             detail::cufft_execute<detail::Direction::Forward, fwd<descriptor_type>>(
                 func_name, stream, plan, in_native, out_native);
         });

@@ -34,6 +34,8 @@
 #include "oneapi/math/dft/detail/rocfft/onemath_dft_rocfft.hpp"
 #include "oneapi/math/dft/types.hpp"
 
+#include "execute_helper.hpp"
+#include "../../execute_helper_generic.hpp"
 #include "../stride_helper.hpp"
 
 #include "rocfft_handle.hpp"
@@ -557,9 +559,9 @@ class rocfft_commit final : public dft::detail::commit_impl<prec, dom> {
         this->get_queue().submit([&](sycl::handler& cgh) {
             auto workspace_acc =
                 buffer_workspace.template get_access<sycl::access::mode::read_write>(cgh);
-            cgh.host_task([=](sycl::interop_handle ih) {
+            dft::detail::fft_enqueue_task(cgh, [=](sycl::interop_handle ih) {
                 auto workspace_native = reinterpret_cast<scalar_type*>(
-                    ih.get_native_mem<sycl::backend::ext_oneapi_hip>(workspace_acc));
+                    ih.get_native_mem<sycl_hip_backend>(workspace_acc));
                 set_workspace_impl(handle, workspace_native, workspace_bytes, "set_workspace");
             });
         });

@@ -36,6 +36,12 @@
 
 namespace oneapi::math::dft::rocfft::detail {
 
+#if defined(__ADAPTIVECPP__) || defined(__HIPSYCL__)
+constexpr auto sycl_hip_backend{ sycl::backend::hip };
+#else // DPC++
+constexpr auto sycl_hip_backend{ sycl::backend::ext_oneapi_hip };
+#endif
+
 template <dft::precision prec, dft::domain dom>
 inline dft::detail::commit_impl<prec, dom>* checked_get_commit(
     dft::detail::descriptor<prec, dom>& desc) {
@@ -60,12 +66,12 @@ inline auto expect_config(DescT& desc, const char* message) {
 
 template <typename Acc>
 inline void* native_mem(sycl::interop_handle& ih, Acc& buf) {
-    return ih.get_native_mem<sycl::backend::ext_oneapi_hip>(buf);
+    return ih.get_native_mem<sycl_hip_backend>(buf);
 }
 
 inline hipStream_t setup_stream(const std::string& func, sycl::interop_handle& ih,
                                 rocfft_execution_info info) {
-    auto stream = ih.get_native_queue<sycl::backend::ext_oneapi_hip>();
+    auto stream = ih.get_native_queue<sycl_hip_backend>();
     auto result = rocfft_execution_info_set_stream(info, stream);
     if (result != rocfft_status_success) {
         throw oneapi::math::exception(

@@ -39,7 +39,11 @@ namespace oneapi::math::dft::detail {
  */
 template <typename HandlerT, typename FnT>
 static inline void fft_enqueue_task(HandlerT&& cgh, FnT&& f) {
-#ifdef SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND
+#if defined(__ADAPTIVECPP__)
+    cgh.AdaptiveCpp_enqueue_custom_operation([=](sycl::interop_handle ih) {
+#elif defined(__HIPSYCL__)
+    cgh.hipSYCL_enqueue_custom_operation([=](sycl::interop_handle ih) {
+#elif defined(SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND)
     cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) {
 #else
     cgh.host_task([=](sycl::interop_handle ih) {

@@ -571,7 +571,9 @@ inline void recommit_values(sycl::queue& sycl_queue) {
 }
 
 template <oneapi::math::dft::precision precision, oneapi::math::dft::domain domain>
-inline void change_queue_causes_wait(sycl::queue& busy_queue) {
+inline void change_queue_causes_wait([[maybe_unused]] sycl::queue& busy_queue) {
+// Skip this test in AdaptiveCpp, which doesn't support host_task
+#if !defined(__ADAPTIVECPP__) && !defined(__HIPSYCL__)
     // create a queue with work on it, and then show that work is waited on when the descriptor
     // is committed to a new queue.
     // its possible to have a false positive result, but a false negative should not be possible.
@@ -616,6 +618,7 @@ inline void change_queue_causes_wait(sycl::queue& busy_queue) {
     // busy queue task has now completed.
     auto after_status = e.template get_info<sycl::info::event::command_execution_status>();
     ASSERT_EQ(after_status, sycl::info::event_command_status::complete);
+#endif
 }
 
 template <oneapi::math::dft::precision precision, oneapi::math::dft::domain domain>