Skip to content

Commit

Permalink
fft functional
Browse files Browse the repository at this point in the history
  • Loading branch information
alfC committed Mar 6, 2025
1 parent 015103c commit 56d563c
Show file tree
Hide file tree
Showing 6 changed files with 907 additions and 581 deletions.
37 changes: 1 addition & 36 deletions .gitlab-ci-correaa.yml
Original file line number Diff line number Diff line change
Expand Up @@ -574,40 +574,6 @@ nvhpc-22.7:
- OMPI_ALLOW_RUN_AS_ROOT=1 OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 ctest --output-on-failure
needs: ["nvhpc"]

nvhpc-24.11 c++20 par:
stage: build
image: nvcr.io/nvidia/nvhpc:24.11-devel-cuda12.6-ubuntu24.04 # nvcr.io/nvidia/nvhpc:24.7-devel-cuda12.5-ubuntu24.04 # https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nvhpc/tags
tags:
- non-shared
- large-disk-space
- x86_64
interruptible: true
script:
- apt-get update && apt-get install --no-install-recommends -y cmake make libboost-timer-dev libboost-serialization-dev libfftw3-dev pkg-config
- /opt/nvidia/hpc_sdk/Linux_x86_64/2024/compilers/bin/nvc++ --version
- mkdir build && cd build
- CXX=/opt/nvidia/hpc_sdk/Linux_x86_64/2024/compilers/bin/nvc++ cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_STANDARD=20 -DCMAKE_CXX_FLAGS="-stdpar=multicore"
- cmake --build . --parallel 2 || cmake --build . --verbose
- OMPI_ALLOW_RUN_AS_ROOT=1 OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 ctest --output-on-failure
needs: ["nvhpc"]

nvhpc-24.11:
stage: build
image: nvcr.io/nvidia/nvhpc:24.11-devel-cuda12.6-ubuntu24.04 # nvcr.io/nvidia/nvhpc:24.7-devel-cuda12.5-ubuntu24.04 # https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nvhpc/tags
tags:
- non-shared
- large-disk-space
- x86_64
interruptible: true
script:
- apt-get update && apt-get install --no-install-recommends -y cmake make libboost-timer-dev libboost-serialization-dev libfftw3-dev pkg-config
- /opt/nvidia/hpc_sdk/Linux_x86_64/2024/compilers/bin/nvc++ --version
- mkdir build && cd build
- CXX=/opt/nvidia/hpc_sdk/Linux_x86_64/2024/compilers/bin/nvc++ cmake .. -DCMAKE_BUILD_TYPE=Release
- cmake --build . --parallel 2 || cmake --build . --verbose
- OMPI_ALLOW_RUN_AS_ROOT=1 OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 ctest --output-on-failure
needs: ["nvhpc"]

nvhpc-25.1:
stage: build
image: nvcr.io/nvidia/nvhpc:25.1-devel-cuda12.6-ubuntu24.04 # https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nvhpc/tags
Expand All @@ -620,12 +586,11 @@ nvhpc-25.1:
- apt-get update && apt-get install --no-install-recommends -y cmake make libboost-timer-dev libboost-serialization-dev libfftw3-dev pkg-config
- /opt/nvidia/hpc_sdk/Linux_x86_64/2025/compilers/bin/nvc++ --version
- mkdir build && cd build
- CXX=/opt/nvidia/hpc_sdk/Linux_x86_64/2025/compilers/bin/nvc++ cmake .. -DCMAKE_BUILD_TYPE=Release
- CXX=/opt/nvidia/hpc_sdk/Linux_x86_64/2025/compilers/bin/nvc++ cmake .. -DCMAKE_BUILD_TYPE=Release # -DCMAKE_CXX_STANDARD=20 -DCMAKE_CXX_FLAGS="-stdpar=multicore"
- cmake --build . --parallel 2 || cmake --build . --verbose
- OMPI_ALLOW_RUN_AS_ROOT=1 OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 ctest --output-on-failure
needs: ["nvhpc"]


cuda:
stage: build
allow_failure: false
Expand Down
29 changes: 15 additions & 14 deletions include/boost/multi/adaptors/cufft.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,13 @@
#include <boost/multi/detail/config/NODISCARD.hpp>

#include <array>
#include <cstddef>
#include <map>
#include <tuple>

#include<thrust/memory.h> // for raw_pointer_cast

#if not defined(__HIP_ROCclr__)
#if !defined(__HIP_ROCclr__)
#include <cufft.h>
#include <cufftXt.h>
#endif
Expand Down Expand Up @@ -88,22 +89,22 @@ class plan {
Alloc alloc_;
::size_t workSize_ = 0;
void* workArea_{};

using complex_type = cufftDoubleComplex;
cufftHandle h_{}; // TODO(correaa) put this in a unique_ptr
std::array<std::pair<bool, fftw_iodim64>, DD + 1> which_iodims_{};
int first_howmany_{};

using complex_type = cufftDoubleComplex;

public:
public:
using allocator_type = Alloc;

plan(plan&& other) noexcept :
h_{std::exchange(other.h_, {})},
which_iodims_{std::exchange(other.which_iodims_, {})},
first_howmany_{std::exchange(other.first_howmany_, {})},
alloc_{std::move(other.alloc_)},
workSize_{std::exchange(other.workSize_, {})},
workArea_{std::exchange(other.workArea_, {})},
alloc_{std::move(other.alloc_)}
h_{std::exchange(other.h_, {})},
which_iodims_{std::exchange(other.which_iodims_, {})},
first_howmany_{std::exchange(other.first_howmany_, {})}
{}

template<
Expand Down Expand Up @@ -136,8 +137,8 @@ class plan {
std::array<fftw_iodim64, D> dims{};
auto const dims_end = std::transform(which_iodims.begin(), part, dims.begin(), [](auto elem) {return elem.second;});

std::array<fftw_iodim64, D> howmany_dims{};
auto const howmany_dims_end = std::transform(part, which_iodims.end() -1, howmany_dims.begin(), [](auto elem) {return elem.second;});
// std::array<fftw_iodim64, D> howmany_dims{};
// auto const howmany_dims_end = std::transform(part, which_iodims.end() -1, howmany_dims.begin(), [](auto elem) {return elem.second;});

which_iodims_ = which_iodims;
first_howmany_ = part - which_iodims.begin();
Expand All @@ -157,7 +158,7 @@ class plan {
int ostride = *(ostrides_end -1);
auto onembed = ostrides; onembed.fill(0);

for(std::size_t idx = 1; idx != ion_end - ion.begin(); ++idx) { // NOLINT(altera-unroll-loops,altera-id-dependent-backward-branch) TODO(correaa) replace with algorithm
for(std::ptrdiff_t idx = 1; idx != ion_end - ion.begin(); ++idx) { // NOLINT(altera-unroll-loops,altera-id-dependent-backward-branch) TODO(correaa) replace with algorithm
assert(ostrides[idx - 1] >= ostrides[idx]);
assert(ostrides[idx - 1] % ostrides[idx] == 0);
onembed[idx] = ostrides[idx - 1] / ostrides[idx];
Expand Down Expand Up @@ -286,7 +287,7 @@ class plan {
private:

void ExecZ2Z_(complex_type const* idata, complex_type* odata, int direction) const{
cufftSafeCall(::cufftExecZ2Z(h_, const_cast<complex_type*>(idata), odata, direction)); // NOLINT(cppcoreguidelines-pro-type-const-cast) wrap legacy interface
cufftSafeCall(cufftExecZ2Z(h_, const_cast<complex_type*>(idata), odata, direction)); // NOLINT(cppcoreguidelines-pro-type-const-cast) wrap legacy interface
// cudaDeviceSynchronize();
}

Expand All @@ -300,7 +301,7 @@ class plan {
if(first_howmany_ == DD - 1) {
if( which_iodims_[first_howmany_].first) {throw std::runtime_error{"logic error"};}
for(int idx = 0; idx != which_iodims_[first_howmany_].second.n; ++idx) { // NOLINT(altera-unroll-loops,altera-id-dependent-backward-branch)
::cufftExecZ2Z(
cufftExecZ2Z(
h_,
const_cast<complex_type*>(reinterpret_cast<complex_type const*>(::thrust::raw_pointer_cast(idata + idx*which_iodims_[first_howmany_].second.is))), // NOLINT(cppcoreguidelines-pro-type-const-cast,cppcoreguidelines-pro-type-reinterpret-cast) legacy interface
reinterpret_cast<complex_type *>(::thrust::raw_pointer_cast(odata + idx*which_iodims_[first_howmany_].second.os)) , // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast) legacy interface
Expand All @@ -315,7 +316,7 @@ class plan {
if(idata == odata) {throw std::runtime_error{"complicated inplace 2"};}
for(int idx = 0; idx != which_iodims_[first_howmany_].second.n; ++idx) { // NOLINT(altera-unroll-loops,altera-unroll-loops,altera-id-dependent-backward-branch) TODO(correaa) use an algorithm
for(int jdx = 0; jdx != which_iodims_[first_howmany_ + 1].second.n; ++jdx) { // NOLINT(altera-unroll-loops,altera-unroll-loops,altera-id-dependent-backward-branch) TODO(correaa) use an algorithm
::cufftExecZ2Z(
cufftExecZ2Z(
h_,
const_cast<complex_type*>(reinterpret_cast<complex_type const*>(::thrust::raw_pointer_cast(idata + idx*which_iodims_[first_howmany_].second.is + jdx*which_iodims_[first_howmany_ + 1].second.is))), // NOLINT(cppcoreguidelines-pro-type-const-cast,cppcoreguidelines-pro-type-reinterpret-cast) legacy interface
reinterpret_cast<complex_type *>(::thrust::raw_pointer_cast(odata + idx*which_iodims_[first_howmany_].second.os + jdx*which_iodims_[first_howmany_ + 1].second.os)) , // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast) legacy interface
Expand Down
Loading

0 comments on commit 56d563c

Please sign in to comment.