Skip to content

Commit

Permalink
Merge branch 'fft-functional-programming' into 'master'
Browse files Browse the repository at this point in the history
fft functional

See merge request correaa/boost-multi!1368
  • Loading branch information
correaa committed Mar 8, 2025
2 parents c8e308b + 1a53378 commit da79e40
Show file tree
Hide file tree
Showing 10 changed files with 931 additions and 592 deletions.
37 changes: 1 addition & 36 deletions .gitlab-ci-correaa.yml
Original file line number Diff line number Diff line change
Expand Up @@ -574,40 +574,6 @@ nvhpc-22.7:
- OMPI_ALLOW_RUN_AS_ROOT=1 OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 ctest --output-on-failure
needs: ["nvhpc"]

nvhpc-24.11 c++20 par:
stage: build
image: nvcr.io/nvidia/nvhpc:24.11-devel-cuda12.6-ubuntu24.04 # nvcr.io/nvidia/nvhpc:24.7-devel-cuda12.5-ubuntu24.04 # https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nvhpc/tags
tags:
- non-shared
- large-disk-space
- x86_64
interruptible: true
script:
- apt-get update && apt-get install --no-install-recommends -y cmake make libboost-timer-dev libboost-serialization-dev libfftw3-dev pkg-config
- /opt/nvidia/hpc_sdk/Linux_x86_64/2024/compilers/bin/nvc++ --version
- mkdir build && cd build
- CXX=/opt/nvidia/hpc_sdk/Linux_x86_64/2024/compilers/bin/nvc++ cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_STANDARD=20 -DCMAKE_CXX_FLAGS="-stdpar=multicore"
- cmake --build . --parallel 2 || cmake --build . --verbose
- OMPI_ALLOW_RUN_AS_ROOT=1 OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 ctest --output-on-failure
needs: ["nvhpc"]

nvhpc-24.11:
stage: build
image: nvcr.io/nvidia/nvhpc:24.11-devel-cuda12.6-ubuntu24.04 # nvcr.io/nvidia/nvhpc:24.7-devel-cuda12.5-ubuntu24.04 # https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nvhpc/tags
tags:
- non-shared
- large-disk-space
- x86_64
interruptible: true
script:
- apt-get update && apt-get install --no-install-recommends -y cmake make libboost-timer-dev libboost-serialization-dev libfftw3-dev pkg-config
- /opt/nvidia/hpc_sdk/Linux_x86_64/2024/compilers/bin/nvc++ --version
- mkdir build && cd build
- CXX=/opt/nvidia/hpc_sdk/Linux_x86_64/2024/compilers/bin/nvc++ cmake .. -DCMAKE_BUILD_TYPE=Release
- cmake --build . --parallel 2 || cmake --build . --verbose
- OMPI_ALLOW_RUN_AS_ROOT=1 OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 ctest --output-on-failure
needs: ["nvhpc"]

nvhpc-25.1:
stage: build
image: nvcr.io/nvidia/nvhpc:25.1-devel-cuda12.6-ubuntu24.04 # https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nvhpc/tags
Expand All @@ -620,12 +586,11 @@ nvhpc-25.1:
- apt-get update && apt-get install --no-install-recommends -y cmake make libboost-timer-dev libboost-serialization-dev libfftw3-dev pkg-config
- /opt/nvidia/hpc_sdk/Linux_x86_64/2025/compilers/bin/nvc++ --version
- mkdir build && cd build
- CXX=/opt/nvidia/hpc_sdk/Linux_x86_64/2025/compilers/bin/nvc++ cmake .. -DCMAKE_BUILD_TYPE=Release
- CXX=/opt/nvidia/hpc_sdk/Linux_x86_64/2025/compilers/bin/nvc++ cmake .. -DCMAKE_BUILD_TYPE=Release # -DCMAKE_CXX_STANDARD=20 -DCMAKE_CXX_FLAGS="-stdpar=multicore"
- cmake --build . --parallel 2 || cmake --build . --verbose
- OMPI_ALLOW_RUN_AS_ROOT=1 OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 ctest --output-on-failure
needs: ["nvhpc"]


cuda:
stage: build
allow_failure: false
Expand Down
55 changes: 35 additions & 20 deletions include/boost/multi/adaptors/cufft.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,13 @@
#include <boost/multi/detail/config/NODISCARD.hpp>

#include <array>
#include <cstddef>
#include <map>
#include <tuple>

#include<thrust/memory.h> // for raw_pointer_cast

#if not defined(__HIP_ROCclr__)
#if !defined(__HIP_ROCclr__)
#include <cufft.h>
#include <cufftXt.h>
#endif
Expand Down Expand Up @@ -83,27 +84,27 @@ constexpr sign backward{CUFFT_INVERSE};

static_assert(forward != none && none != backward && backward != forward);

template<dimensionality_type DD = -1, class Alloc = void*>
template<dimensionality_type DD, class Alloc = void*>
class plan {
Alloc alloc_;
::size_t workSize_ = 0;
void* workArea_{};

using complex_type = cufftDoubleComplex;
cufftHandle h_{}; // TODO(correaa) put this in a unique_ptr
std::array<std::pair<bool, fftw_iodim64>, DD + 1> which_iodims_{};
int first_howmany_{};

using complex_type = cufftDoubleComplex;

public:
public:
using allocator_type = Alloc;

plan(plan&& other) noexcept :
h_{std::exchange(other.h_, {})},
which_iodims_{std::exchange(other.which_iodims_, {})},
first_howmany_{std::exchange(other.first_howmany_, {})},
alloc_{std::move(other.alloc_)},
workSize_{std::exchange(other.workSize_, {})},
workArea_{std::exchange(other.workArea_, {})},
alloc_{std::move(other.alloc_)}
h_{std::exchange(other.h_, {})},
which_iodims_{std::exchange(other.which_iodims_, {})},
first_howmany_{std::exchange(other.first_howmany_, {})}
{}

template<
Expand Down Expand Up @@ -136,8 +137,8 @@ class plan {
std::array<fftw_iodim64, D> dims{};
auto const dims_end = std::transform(which_iodims.begin(), part, dims.begin(), [](auto elem) {return elem.second;});

std::array<fftw_iodim64, D> howmany_dims{};
auto const howmany_dims_end = std::transform(part, which_iodims.end() -1, howmany_dims.begin(), [](auto elem) {return elem.second;});
// std::array<fftw_iodim64, D> howmany_dims{};
// auto const howmany_dims_end = std::transform(part, which_iodims.end() -1, howmany_dims.begin(), [](auto elem) {return elem.second;});

which_iodims_ = which_iodims;
first_howmany_ = part - which_iodims.begin();
Expand All @@ -157,7 +158,7 @@ class plan {
int ostride = *(ostrides_end -1);
auto onembed = ostrides; onembed.fill(0);

for(std::size_t idx = 1; idx != ion_end - ion.begin(); ++idx) { // NOLINT(altera-unroll-loops,altera-id-dependent-backward-branch) TODO(correaa) replace with algorithm
for(std::ptrdiff_t idx = 1; idx != ion_end - ion.begin(); ++idx) { // NOLINT(altera-unroll-loops,altera-id-dependent-backward-branch) TODO(correaa) replace with algorithm
assert(ostrides[idx - 1] >= ostrides[idx]);
assert(ostrides[idx - 1] % ostrides[idx] == 0);
onembed[idx] = ostrides[idx - 1] / ostrides[idx];
Expand Down Expand Up @@ -285,22 +286,28 @@ class plan {

private:

template<typename = void>
void ExecZ2Z_(complex_type const* idata, complex_type* odata, int direction) const{
cufftSafeCall(::cufftExecZ2Z(h_, const_cast<complex_type*>(idata), odata, direction)); // NOLINT(cppcoreguidelines-pro-type-const-cast) wrap legacy interface
cufftSafeCall(cufftExecZ2Z(h_, const_cast<complex_type*>(idata), odata, direction)); // NOLINT(cppcoreguidelines-pro-type-const-cast) wrap legacy interface
// cudaDeviceSynchronize();
}

public:
template<class IPtr, class OPtr>
void execute(IPtr idata, OPtr odata, int direction) { // TODO(correaa) make const
auto execute(IPtr idata, OPtr odata, int direction)
-> decltype((void)(
reinterpret_cast<complex_type const*>(::thrust::raw_pointer_cast(idata)),
reinterpret_cast<complex_type*>(::thrust::raw_pointer_cast(odata))
))
{ // TODO(correaa) make const
if(first_howmany_ == DD) {
ExecZ2Z_(reinterpret_cast<complex_type const*>(::thrust::raw_pointer_cast(idata)), reinterpret_cast<complex_type*>(::thrust::raw_pointer_cast(odata)), direction); // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast) wrap a legacy interface
return;
}
if(first_howmany_ == DD - 1) {
if( which_iodims_[first_howmany_].first) {throw std::runtime_error{"logic error"};}
for(int idx = 0; idx != which_iodims_[first_howmany_].second.n; ++idx) { // NOLINT(altera-unroll-loops,altera-id-dependent-backward-branch)
::cufftExecZ2Z(
cufftExecZ2Z(
h_,
const_cast<complex_type*>(reinterpret_cast<complex_type const*>(::thrust::raw_pointer_cast(idata + idx*which_iodims_[first_howmany_].second.is))), // NOLINT(cppcoreguidelines-pro-type-const-cast,cppcoreguidelines-pro-type-reinterpret-cast) legacy interface
reinterpret_cast<complex_type *>(::thrust::raw_pointer_cast(odata + idx*which_iodims_[first_howmany_].second.os)) , // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast) legacy interface
Expand All @@ -315,7 +322,7 @@ class plan {
if(idata == odata) {throw std::runtime_error{"complicated inplace 2"};}
for(int idx = 0; idx != which_iodims_[first_howmany_].second.n; ++idx) { // NOLINT(altera-unroll-loops,altera-unroll-loops,altera-id-dependent-backward-branch) TODO(correaa) use an algorithm
for(int jdx = 0; jdx != which_iodims_[first_howmany_ + 1].second.n; ++jdx) { // NOLINT(altera-unroll-loops,altera-unroll-loops,altera-id-dependent-backward-branch) TODO(correaa) use an algorithm
::cufftExecZ2Z(
cufftExecZ2Z(
h_,
const_cast<complex_type*>(reinterpret_cast<complex_type const*>(::thrust::raw_pointer_cast(idata + idx*which_iodims_[first_howmany_].second.is + jdx*which_iodims_[first_howmany_ + 1].second.is))), // NOLINT(cppcoreguidelines-pro-type-const-cast,cppcoreguidelines-pro-type-reinterpret-cast) legacy interface
reinterpret_cast<complex_type *>(::thrust::raw_pointer_cast(odata + idx*which_iodims_[first_howmany_].second.os + jdx*which_iodims_[first_howmany_ + 1].second.os)) , // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast) legacy interface
Expand Down Expand Up @@ -380,24 +387,32 @@ class cached_plan {
if(it_ == LEAKY_cache.end()) {it_ = LEAKY_cache.insert(std::make_pair(std::make_tuple(which, in, out), plan<D, Alloc>(which, in, out, alloc))).first;}
}
template<class IPtr, class OPtr>
void execute(IPtr idata, OPtr odata, int direction) {
auto execute(IPtr idata, OPtr odata, int direction)
-> decltype(
(void)
(std::declval<
typename std::map<std::tuple<std::array<bool, D>, multi::layout_t<D>, multi::layout_t<D>>, plan<D, Alloc> >::iterator&
>()->second.execute(idata, odata, direction)
)
)
{
// assert(it_ != LEAKY_cache.end());
it_->second.execute(idata, odata, direction);
}
};

template<typename In, class Out, dimensionality_type D = In::rank::value, std::enable_if_t<!multi::has_get_allocator<In>::value, int> =0>
template<typename In, class Out, dimensionality_type D = In::rank::value, std::enable_if_t<!multi::has_get_allocator<In>::value, int> =0, typename = decltype(::thrust::raw_pointer_cast(std::declval<In const&>().base()))>
auto dft(std::array<bool, +D> which, In const& in, Out&& out, int sgn)
->decltype(cufft::cached_plan<D>{which, in.layout(), out.layout()}.execute(in.base(), out.base(), sgn), std::forward<Out>(out)) {
return cufft::cached_plan<D>{which, in.layout(), out.layout()}.execute(in.base(), out.base(), sgn), std::forward<Out>(out); }

template<typename In, class Out, dimensionality_type D = In::rank::value, std::enable_if_t< multi::has_get_allocator<In>::value, int> =0>
template<typename In, class Out, dimensionality_type D = In::rank::value, std::enable_if_t< multi::has_get_allocator<In>::value, int> =0, typename = decltype(raw_pointer_cast(std::declval<In const&>().base()))>
auto dft(std::array<bool, +D> which, In const& in, Out&& out, int sgn)
->decltype(cufft::cached_plan<D /*, typename std::allocator_traits<typename In::allocator_type>::rebind_alloc<char>*/ >{which, in.layout(), out.layout()/*, i.get_allocator()*/}.execute(in.base(), out.base(), sgn), std::forward<Out>(out)) {
return cufft::cached_plan<D /*, typename std::allocator_traits<typename In::allocator_type>::rebind_alloc<char>*/ >{which, in.layout(), out.layout()/*, i.get_allocator()*/}.execute(in.base(), out.base(), sgn), std::forward<Out>(out); }

template<typename In, class Out, dimensionality_type D = In::rank::value>//, std::enable_if_t<not multi::has_get_allocator<In>::value, int> =0>
auto dft_forward(std::array<bool, +D> which, In const& in, Out&& out) -> Out&& {
auto dft_forward(std::array<bool, +D> which, In const& in, Out&& out) -> Out&& {
//->decltype(cufft::plan<D>{which, i.layout(), o.layout()}.execute(i.base(), o.base(), cufft::forward), std::forward<Out>(o)) {
return cufft::cached_plan<D>{which, in.layout(), out.layout()}.execute(in.base(), out.base(), cufft::forward), std::forward<Out>(out); }

Expand Down
4 changes: 2 additions & 2 deletions include/boost/multi/adaptors/cufft/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -71,14 +71,14 @@ include(CTest)
include_directories(${CMAKE_BINARY_DIR})

# file(GLOB TEST_SRCS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.cpp)
set(TEST_SRCS cufft.cpp)
#set(TEST_SRCS cufft.cpp)

foreach(TEST_FILE ${TEST_SRCS})
set(TEST_EXE "${TEST_FILE}.x")
add_executable(${TEST_EXE} ${TEST_FILE})
if(ENABLE_CUDA OR DEFINED CXXCUDA)
set_source_files_properties(${TEST_FILE} PROPERTIES LANGUAGE CUDA)
target_compile_options(${TEST_EXE} PRIVATE -std=c++17)
# target_compile_options(${TEST_EXE} PRIVATE -std=c++17)
endif()
# target_compile_features (${TEST_EXE} PUBLIC cxx_std_17)
target_compile_definitions(${TEST_EXE} PRIVATE "BOOST_PP_VARIADICS") # needed by Boost.Test and NVCC
Expand Down
Loading

0 comments on commit da79e40

Please sign in to comment.