From 220506b98cadc05cf466f807af6593a59eea9cf4 Mon Sep 17 00:00:00 2001 From: Simon Frasch Date: Thu, 7 Jan 2021 14:03:46 +0100 Subject: [PATCH] Readme and examples updated --- README.md | 54 ++-- docs/source/details.rst | 8 +- docs/source/examples.rst | 517 ++++++++++++++++++++++----------------- docs/source/index.rst | 4 +- examples/example.c | 29 ++- examples/example.cpp | 47 ++-- examples/example.f90 | 39 ++- 7 files changed, 416 insertions(+), 282 deletions(-) diff --git a/README.md b/README.md index aa0a8f7..4d74fe6 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,12 @@ To allow for pre-allocation and reuse of memory, the design is based on two clas - **Grid**: Provides memory for transforms up to a given size. - **Transform**: Created with information on sparse input data and is associated with a *Grid*. Maximum size is limited by *Grid* dimensions. Internal reference counting to *Grid* objects guarantee a valid state until *Transform* object destruction. -The user provides memory for storing sparse frequency domain data, while a *Transform* provides memory for space domain data. This implies, that executing a *Transform* will override the space domain data of all other *Transforms* associated with the same *Grid*. +A transform can be computed in-place and out-of-place. Addtionally, an internally allocated work buffer can optionally be used for input / output of space domain data. + +### New Features in v1.0 +- Support for externally allocated memory for space domain data including in-place and out-of-place transforms +- Optional asynchronous computation when using GPUs +- Simplified / direct transform handle creation if no resource reuse through grid handles is required ## Documentation Documentation can be found [here](https://spfft.readthedocs.io/en/latest/). @@ -88,21 +93,21 @@ int main(int argc, char** argv) { // Use default OpenMP value const int numThreads = -1; - // use all elements in this example. + // Use all elements in this example. const int numFrequencyElements = dimX * dimY * dimZ; // Slice length in space domain. Equivalent to dimZ for non-distributed case. const int localZLength = dimZ; - // interleaved complex numbers + // Interleaved complex numbers std::vector frequencyElements; frequencyElements.reserve(2 * numFrequencyElements); - // indices of frequency elements + // Indices of frequency elements std::vector indices; indices.reserve(dimX * dimY * dimZ * 3); - // initialize frequency domain values and indices + // Initialize frequency domain values and indices double initValue = 0.0; for (int xIndex = 0; xIndex < dimX; ++xIndex) { for (int yIndex = 0; yIndex < dimY; ++yIndex) { @@ -126,31 +131,48 @@ int main(int argc, char** argv) { std::cout << frequencyElements[2 * i] << ", " << frequencyElements[2 * i + 1] << std::endl; } - // create local Grid. For distributed computations, a MPI Communicator has to be provided + // Create local Grid. For distributed computations, a MPI Communicator has to be provided spfft::Grid grid(dimX, dimY, dimZ, dimX * dimY, SPFFT_PU_HOST, numThreads); - // create transform + // Create transform. + // Note: A transform handle can be created without a grid if no resource sharing is desired. spfft::Transform transform = grid.create_transform(SPFFT_PU_HOST, SPFFT_TRANS_C2C, dimX, dimY, dimZ, localZLength, numFrequencyElements, SPFFT_INDEX_TRIPLETS, indices.data()); - // Get pointer to space domain data. Alignment fullfills requirements for std::complex. - // Can also be read as std::complex elements (guaranteed by the standard to be binary compatible - // since C++11). - double* spaceDomain = transform.space_domain_data(SPFFT_PU_HOST); - // transform backward + /////////////////////////////////////////////////// + // Option A: Reuse internal buffer for space domain + /////////////////////////////////////////////////// + + // Transform backward transform.backward(frequencyElements.data(), SPFFT_PU_HOST); + // Get pointer to buffer with space domain data. Is guaranteed to be castable to a valid + // std::complex pointer. Using the internal working buffer as input / output can help reduce + // memory usage. + double* spaceDomainPtr = transform.space_domain_data(SPFFT_PU_HOST); + std::cout << std::endl << "After backward transform:" << std::endl; for (int i = 0; i < transform.local_slice_size(); ++i) { - std::cout << spaceDomain[2 * i] << ", " << spaceDomain[2 * i + 1] << std::endl; + std::cout << spaceDomainPtr[2 * i] << ", " << spaceDomainPtr[2 * i + 1] << std::endl; } - // transform forward - transform.forward(SPFFT_PU_HOST, frequencyElements.data(), SPFFT_NO_SCALING); + ///////////////////////////////////////////////// + // Option B: Use external buffer for space domain + ///////////////////////////////////////////////// + + std::vector spaceDomainVec(2 * transform.local_slice_size()); + + // Transform backward + transform.backward(frequencyElements.data(), spaceDomainVec.data()); + + // Transform forward + transform.forward(spaceDomainVec.data(), frequencyElements.data(), SPFFT_NO_SCALING); + + // Note: In-place transforms are also supported by passing the same pointer for input and output. - std::cout << std::endl << "After forward transform (without scaling):" << std::endl; + std::cout << std::endl << "After forward transform (without normalization):" << std::endl; for (int i = 0; i < numFrequencyElements; ++i) { std::cout << frequencyElements[2 * i] << ", " << frequencyElements[2 * i + 1] << std::endl; } diff --git a/docs/source/details.rst b/docs/source/details.rst index 75cb07b..d7e83c8 100644 --- a/docs/source/details.rst +++ b/docs/source/details.rst @@ -13,7 +13,6 @@ Transform Definition - :math:`\omega_{N}^{k,n} = e^{2\pi i \frac{k n}{N}}`: *Backward* transform from frequency domain to space domain - Complex Number Format --------------------- SpFFT always assumes an interleaved format in double or single precision. The alignment of memory provided for space domain data is guaranteed to fulfill to the requirements for std::complex (for C++11), C complex types and GPU complex types of CUDA or ROCm. @@ -90,9 +89,14 @@ The execution of transforms is thread-safe if GPU --- -| Saving transfer time between host and GPU is key to good performance for execution with GPUs. Ideally, both input and output is located on GPU memory. If host memory pointers are provided as input or output, it is helpful to use pinned memory through the CUDA or ROCm API. +| Saving transfer time between host and GPU is key to good performance for execution with GPUs. Ideally, both input and output is located on GPU memory. If host memory pointers are provided as input or output, it is beneficial to use pinned memory through the CUDA or ROCm API. | If available, GPU aware MPI can be utilized, to safe on the otherwise required transfers between host and GPU in preparation of the MPI exchange. This can greatly impact performance and is enabled by compiling the library with the CMake option SPFFT_GPU_DIRECT set to ON. .. note:: Additional environment variables may have to be set for some MPI implementations, to allow GPUDirect usage. .. note:: The execution of a transform is synchronized with the default stream. + +Multi-GPU +--------- +Multi-GPU support is not available for individual transform operations, but each Grid / Transform can be associated to a different GPU. At creation time, the current GPU id is stored internally and used for operations later on. So by either using the asynchronous execution mode or using the multi-transform functionality, multiple GPUs can be used at the same time. + diff --git a/docs/source/examples.rst b/docs/source/examples.rst index 34ec4dd..d06f06d 100644 --- a/docs/source/examples.rst +++ b/docs/source/examples.rst @@ -6,289 +6,344 @@ C++ .. code-block:: c++ - #include - #include - #include +#include +#include +#include - #include "spfft/spfft.hpp" +#include "spfft/spfft.hpp" - int main(int argc, char** argv) { - const int dimX = 2; - const int dimY = 2; - const int dimZ = 2; +int main(int argc, char** argv) { + const int dimX = 2; + const int dimY = 2; + const int dimZ = 2; - std::cout << "Dimensions: x = " << dimX << ", y = " << dimY << ", z = " << dimZ << std::endl - << std::endl; + std::cout << "Dimensions: x = " << dimX << ", y = " << dimY << ", z = " << dimZ << std::endl + << std::endl; - // Use default OpenMP value - const int numThreads = -1; + // Use default OpenMP value + const int numThreads = -1; - // use all elements in this example. - const int numFrequencyElements = dimX * dimY * dimZ; + // Use all elements in this example. + const int numFrequencyElements = dimX * dimY * dimZ; - // Slice length in space domain. Equivalent to dimZ for non-distributed case. - const int localZLength = dimZ; + // Slice length in space domain. Equivalent to dimZ for non-distributed case. + const int localZLength = dimZ; - // interleaved complex numbers - std::vector frequencyElements; - frequencyElements.reserve(2 * numFrequencyElements); + // Interleaved complex numbers + std::vector frequencyElements; + frequencyElements.reserve(2 * numFrequencyElements); - // indices of frequency elements - std::vector indices; - indices.reserve(3 * numFrequencyElements); + // Indices of frequency elements + std::vector indices; + indices.reserve(dimX * dimY * dimZ * 3); - // initialize frequency domain values and indices - double initValue = 0.0; - for (int xIndex = 0; xIndex < dimX; ++xIndex) { - for (int yIndex = 0; yIndex < dimY; ++yIndex) { - for (int zIndex = 0; zIndex < dimZ; ++zIndex) { - // init with interleaved complex numbers - frequencyElements.emplace_back(initValue); - frequencyElements.emplace_back(-initValue); + // Initialize frequency domain values and indices + double initValue = 0.0; + for (int xIndex = 0; xIndex < dimX; ++xIndex) { + for (int yIndex = 0; yIndex < dimY; ++yIndex) { + for (int zIndex = 0; zIndex < dimZ; ++zIndex) { + // init with interleaved complex numbers + frequencyElements.emplace_back(initValue); + frequencyElements.emplace_back(-initValue); - // add index triplet for value - indices.emplace_back(xIndex); - indices.emplace_back(yIndex); - indices.emplace_back(zIndex); + // add index triplet for value + indices.emplace_back(xIndex); + indices.emplace_back(yIndex); + indices.emplace_back(zIndex); - initValue += 1.0; - } - } - } + initValue += 1.0; + } + } + } - std::cout << "Input:" << std::endl; - for (int i = 0; i < numFrequencyElements; ++i) { - std::cout << frequencyElements[2 * i] << ", " << frequencyElements[2 * i + 1] << std::endl; - } + std::cout << "Input:" << std::endl; + for (int i = 0; i < numFrequencyElements; ++i) { + std::cout << frequencyElements[2 * i] << ", " << frequencyElements[2 * i + 1] << std::endl; + } - // create local Grid. For distributed computations, a MPI Communicator has to be provided - spfft::Grid grid(dimX, dimY, dimZ, dimX * dimY, SPFFT_PU_HOST, numThreads); + // Create local Grid. For distributed computations, a MPI Communicator has to be provided + spfft::Grid grid(dimX, dimY, dimZ, dimX * dimY, SPFFT_PU_HOST, numThreads); - // create transform - spfft::Transform transform = - grid.create_transform(SPFFT_PU_HOST, SPFFT_TRANS_C2C, dimX, dimY, dimZ, localZLength, - numFrequencyElements, SPFFT_INDEX_TRIPLETS, indices.data()); + // Create transform. + // Note: A transform handle can be created without a grid if no resource sharing is desired. + spfft::Transform transform = + grid.create_transform(SPFFT_PU_HOST, SPFFT_TRANS_C2C, dimX, dimY, dimZ, localZLength, + numFrequencyElements, SPFFT_INDEX_TRIPLETS, indices.data()); - // Get pointer to space domain data. Alignment fullfills requirements for std::complex. - // Can also be read as std::complex elements (guaranteed by the standard to be binary compatible - // since C++11). - double* spaceDomain = transform.space_domain_data(SPFFT_PU_HOST); - // transform backward - transform.backward(frequencyElements.data(), SPFFT_PU_HOST); + /////////////////////////////////////////////////// + // Option A: Reuse internal buffer for space domain + /////////////////////////////////////////////////// - std::cout << std::endl << "After backward transform:" << std::endl; - for (int i = 0; i < transform.local_slice_size(); ++i) { - std::cout << spaceDomain[2 * i] << ", " << spaceDomain[2 * i + 1] << std::endl; - } + // Transform backward + transform.backward(frequencyElements.data(), SPFFT_PU_HOST); - // transform forward - transform.forward(SPFFT_PU_HOST, frequencyElements.data(), SPFFT_NO_SCALING); + // Get pointer to buffer with space domain data. Is guaranteed to be castable to a valid + // std::complex pointer. Using the internal working buffer as input / output can help reduce + // memory usage. + double* spaceDomainPtr = transform.space_domain_data(SPFFT_PU_HOST); - std::cout << std::endl << "After forward transform (without scaling):" << std::endl; - for (int i = 0; i < numFrequencyElements; ++i) { - std::cout << frequencyElements[2 * i] << ", " << frequencyElements[2 * i + 1] << std::endl; - } + std::cout << std::endl << "After backward transform:" << std::endl; + for (int i = 0; i < transform.local_slice_size(); ++i) { + std::cout << spaceDomainPtr[2 * i] << ", " << spaceDomainPtr[2 * i + 1] << std::endl; + } - return 0; - } + ///////////////////////////////////////////////// + // Option B: Use external buffer for space domain + ///////////////////////////////////////////////// + std::vector spaceDomainVec(2 * transform.local_slice_size()); + + // Transform backward + transform.backward(frequencyElements.data(), spaceDomainVec.data()); + + // Transform forward + transform.forward(spaceDomainVec.data(), frequencyElements.data(), SPFFT_NO_SCALING); + + // Note: In-place transforms are also supported by passing the same pointer for input and output. + + std::cout << std::endl << "After forward transform (without normalization):" << std::endl; + for (int i = 0; i < numFrequencyElements; ++i) { + std::cout << frequencyElements[2 * i] << ", " << frequencyElements[2 * i + 1] << std::endl; + } + + return 0; +} C - .. code-block:: c - #include - #include +#include +#include + +#include "spfft/spfft.h" - #include "spfft/spfft.h" +int main(int argc, char** argv) { + const int dimX = 2; + const int dimY = 2; + const int dimZ = 2; - int main(int argc, char** argv) { - const int dimX = 2; - const int dimY = 2; - const int dimZ = 2; + printf("Dimensions: x = %d, y = %d, z = %d\n\n", dimX, dimY, dimZ); - printf("Dimensions: x = %d, y = %d, z = %d\n\n", dimX, dimY, dimZ); + /* Use default OpenMP value */ + const int numThreads = -1; - /* Use default OpenMP value */ - const int numThreads = -1; + /* use all elements in this example. */ + const int numFrequencyElements = dimX * dimY * dimZ; - /* use all elements in this example. */ - const int numFrequencyElements = dimX * dimY * dimZ; + /* Slice length in space domain. Equivalent to dimZ for non-distributed case. */ + const int localZLength = dimZ; - /* Slice length in space domain. Equivalent to dimZ for non-distributed case. */ - const int localZLength = dimZ; + /* interleaved complex numbers */ + double* frequencyElements = (double*)malloc(2 * sizeof(double) * numFrequencyElements); - /* interleaved complex numbers */ - double* frequencyElements = (double*)malloc(2 * sizeof(double) * numFrequencyElements); + /* indices of frequency elements */ + int* indices = (int*)malloc(3 * sizeof(int) * numFrequencyElements); - /* indices of frequency elements */ - int* indices = (int*)malloc(3 * sizeof(int) * numFrequencyElements); + /* initialize frequency domain values and indices */ + double initValue = 0.0; + size_t count = 0; + for (int xIndex = 0; xIndex < dimX; ++xIndex) { + for (int yIndex = 0; yIndex < dimY; ++yIndex) { + for (int zIndex = 0; zIndex < dimZ; ++zIndex, ++count) { + /* init values */ + frequencyElements[2 * count] = initValue; + frequencyElements[2 * count + 1] = -initValue; - /* initialize frequency domain values and indices */ - double initValue = 0.0; - size_t count = 0; - for (int xIndex = 0; xIndex < dimX; ++xIndex) { - for (int yIndex = 0; yIndex < dimY; ++yIndex) { - for (int zIndex = 0; zIndex < dimZ; ++zIndex, ++count) { - /* init values */ - frequencyElements[2 * count] = initValue; - frequencyElements[2 * count + 1] = -initValue; + /* add index triplet for value */ + indices[3 * count] = xIndex; + indices[3 * count + 1] = yIndex; + indices[3 * count + 2] = zIndex; - /* add index triplet for value */ - indices[3 * count] = xIndex; - indices[3 * count + 1] = yIndex; - indices[3 * count + 2] = zIndex; + initValue += 1.0; + } + } + } - initValue += 1.0; - } - } - } + printf("Input:\n"); + for (size_t i = 0; i < dimX * dimY * dimZ; ++i) { + printf("%f, %f\n", frequencyElements[2 * i], frequencyElements[2 * i + 1]); + } + printf("\n"); - printf("Input:\n"); - for (size_t i = 0; i < dimX * dimY * dimZ; ++i) { - printf("%f, %f\n", frequencyElements[2 * i], frequencyElements[2 * i + 1]); - } - printf("\n"); + SpfftError status = 0; - SpfftError status = 0; + /* create local Grid. For distributed computations, a MPI Communicator has to be provided */ + SpfftGrid grid; + status = spfft_grid_create(&grid, dimX, dimY, dimZ, dimX * dimY, SPFFT_PU_HOST, numThreads); + if (status != SPFFT_SUCCESS) exit(status); - /* create local Grid. For distributed computations, a MPI Communicator has to be provided */ - SpfftGrid grid; - status = spfft_grid_create(&grid, dimX, dimY, dimZ, dimX * dimY, SPFFT_PU_HOST, numThreads); - if (status != SPFFT_SUCCESS) exit(status); + /* create transform */ + SpfftTransform transform; + status = spfft_transform_create(&transform, grid, SPFFT_PU_HOST, SPFFT_TRANS_C2C, dimX, dimY, + dimZ, localZLength, numFrequencyElements, SPFFT_INDEX_TRIPLETS, indices); + if (status != SPFFT_SUCCESS) exit(status); - /* create transform */ - SpfftTransform transform; - status = spfft_transform_create(&transform, grid, SPFFT_PU_HOST, SPFFT_TRANS_C2C, dimX, dimY, - dimZ, localZLength, numFrequencyElements, SPFFT_INDEX_TRIPLETS, indices); - if (status != SPFFT_SUCCESS) exit(status); + /* grid can be safely destroyed after creating all transforms */ + status = spfft_grid_destroy(grid); + if (status != SPFFT_SUCCESS) exit(status); - /* grid can be safely destroyed after creating all transforms */ - status = spfft_grid_destroy(grid); - if (status != SPFFT_SUCCESS) exit(status); - /* get pointer to space domain data. Alignment is guaranteed to fullfill requirements C complex - types */ - double* spaceDomain; - status = spfft_transform_get_space_domain(transform, SPFFT_PU_HOST, &spaceDomain); - if (status != SPFFT_SUCCESS) exit(status); + /************************************************** + Option A: Reuse internal buffer for space domain + ***************************************************/ - /* transform backward */ - status = spfft_transform_backward(transform, frequencyElements, SPFFT_PU_HOST); - if (status != SPFFT_SUCCESS) exit(status); + /* Get pointer to buffer with space domain data. Is guaranteed to be castable to a valid + complex type pointer. Using the internal working buffer as input / output can help reduce + memory usage.*/ + double* spaceDomain; + status = spfft_transform_get_space_domain(transform, SPFFT_PU_HOST, &spaceDomain); + if (status != SPFFT_SUCCESS) exit(status); - printf("After backward transform:\n"); - for (size_t i = 0; i < dimX * dimY * dimZ; ++i) { - printf("%f, %f\n", spaceDomain[2 * i], spaceDomain[2 * i + 1]); - } - printf("\n"); + /* transform backward */ + status = spfft_transform_backward(transform, frequencyElements, SPFFT_PU_HOST); + if (status != SPFFT_SUCCESS) exit(status); - /* transform forward */ - status = spfft_transform_forward(transform, SPFFT_PU_HOST, frequencyElements, SPFFT_NO_SCALING); - if (status != SPFFT_SUCCESS) exit(status); + printf("After backward transform:\n"); + for (size_t i = 0; i < dimX * dimY * dimZ; ++i) { + printf("%f, %f\n", spaceDomain[2 * i], spaceDomain[2 * i + 1]); + } + printf("\n"); - printf("After forward transform (without scaling):\n"); - for (size_t i = 0; i < dimX * dimY * dimZ; ++i) { - printf("%f, %f\n", frequencyElements[2 * i], frequencyElements[2 * i + 1]); - } - /* destroying the final transform will free the associated memory */ - status = spfft_transform_destroy(transform); - if (status != SPFFT_SUCCESS) exit(status); + /********************************************** + Option B: Use external buffer for space domain + ***********************************************/ + spaceDomain = (double*)malloc(2 * sizeof(double) * dimX * dimY * dimZ); - return 0; - } + /* transform backward */ + status = spfft_transform_backward_ptr(transform, frequencyElements, spaceDomain); + if (status != SPFFT_SUCCESS) exit(status); + /* transform forward */ + status = spfft_transform_forward_ptr(transform, spaceDomain, frequencyElements, SPFFT_NO_SCALING); + if (status != SPFFT_SUCCESS) exit(status); + + /* Note: In-place transforms are also supported by passing the same pointer for input and output. */ + + printf("After forward transform (without normalization):\n"); + for (size_t i = 0; i < dimX * dimY * dimZ; ++i) { + printf("%f, %f\n", frequencyElements[2 * i], frequencyElements[2 * i + 1]); + } + + /* destroying the final transform will free the associated memory */ + status = spfft_transform_destroy(transform); + if (status != SPFFT_SUCCESS) exit(status); + + free(spaceDomain); + free(frequencyElements); + + return 0; +} Fortran ------- .. code-block:: fortran - program main - use iso_c_binding - use spfft - implicit none - integer :: i, j, k, counter - integer, parameter :: dimX = 2 - integer, parameter :: dimY = 2 - integer, parameter :: dimZ = 2 - integer, parameter :: maxNumLocalZColumns = dimX * dimY - integer, parameter :: processingUnit = 1 - integer, parameter :: maxNumThreads = -1 - type(c_ptr) :: grid = c_null_ptr - type(c_ptr) :: transform = c_null_ptr - integer :: errorCode = 0 - integer, dimension(dimX * dimY * dimZ * 3):: indices = 0 - complex(C_DOUBLE_COMPLEX), dimension(dimX * dimY * dimZ):: frequencyElements - complex(C_DOUBLE_COMPLEX), pointer :: spaceDomain(:,:,:) - type(c_ptr) :: realValuesPtr - - - counter = 0 - do k = 1, dimZ - do j = 1, dimY - do i = 1, dimX - frequencyElements(counter + 1) = cmplx(counter, -counter) - indices(counter * 3 + 1) = i - 1 - indices(counter * 3 + 2) = j - 1 - indices(counter * 3 + 3) = k - 1 - counter = counter + 1 - end do - end do - end do - - ! print input - print *, "Input:" - do i = 1, size(frequencyElements) - print *, frequencyElements(i) - end do - - - ! create grid and transform - errorCode = spfft_grid_create(grid, dimX, dimY, dimZ, maxNumLocalZColumns, processingUnit, maxNumThreads); - if (errorCode /= SPFFT_SUCCESS) error stop - errorCode = spfft_transform_create(transform, grid, processingUnit, 0, dimX, dimY, dimZ, dimZ,& - size(frequencyElements), SPFFT_INDEX_TRIPLETS, indices) - if (errorCode /= SPFFT_SUCCESS) error stop - - ! grid can be safely destroyed after creating all required transforms - errorCode = spfft_grid_destroy(grid) - if (errorCode /= SPFFT_SUCCESS) error stop - - ! set space domain array to use memory allocted by the library - errorCode = spfft_transform_get_space_domain(transform, processingUnit, realValuesPtr) - if (errorCode /= SPFFT_SUCCESS) error stop - - ! transform backward - errorCode = spfft_transform_backward(transform, frequencyElements, processingUnit) - if (errorCode /= SPFFT_SUCCESS) error stop - - - call c_f_pointer(realValuesPtr, spaceDomain, [dimX,dimY,dimZ]) - - print *, "" - print *, "After backward transform:" - do k = 1, size(spaceDomain, 3) - do j = 1, size(spaceDomain, 2) - do i = 1, size(spaceDomain, 1) - print *, spaceDomain(i, j, k) - end do - end do - end do - - ! transform forward (will invalidate space domain data) - errorCode = spfft_transform_forward(transform, processingUnit, frequencyElements, 0) - if (errorCode /= SPFFT_SUCCESS) error stop - - print *, "" - print *, "After forward transform (without scaling):" - do i = 1, size(frequencyElements) - print *, frequencyElements(i) - end do - - ! destroying the final transform will free the associated memory - errorCode = spfft_transform_destroy(transform) - if (errorCode /= SPFFT_SUCCESS) error stop - - end + +program main + use iso_c_binding + use spfft + implicit none + integer :: i, j, k, counter + integer, parameter :: dimX = 2 + integer, parameter :: dimY = 2 + integer, parameter :: dimZ = 2 + integer, parameter :: maxNumLocalZColumns = dimX * dimY + integer, parameter :: processingUnit = 1 + integer, parameter :: maxNumThreads = -1 + type(c_ptr) :: grid = c_null_ptr + type(c_ptr) :: transform = c_null_ptr + integer :: errorCode = 0 + integer, dimension(dimX * dimY * dimZ * 3):: indices = 0 + complex(C_DOUBLE_COMPLEX), dimension(dimX * dimY * dimZ):: frequencyElements + real(C_DOUBLE), dimension(2*dimX * dimY * dimZ):: spaceDomain + complex(C_DOUBLE_COMPLEX), pointer :: spaceDomainPtr(:,:,:) + type(c_ptr) :: realValuesPtr + + + counter = 0 + do k = 1, dimZ + do j = 1, dimY + do i = 1, dimX + frequencyElements(counter + 1) = cmplx(counter, -counter) + indices(counter * 3 + 1) = i - 1 + indices(counter * 3 + 2) = j - 1 + indices(counter * 3 + 3) = k - 1 + counter = counter + 1 + end do + end do + end do + + ! print input + print *, "Input:" + do i = 1, size(frequencyElements) + print *, frequencyElements(i) + end do + + + ! create grid + errorCode = spfft_grid_create(grid, dimX, dimY, dimZ, maxNumLocalZColumns, processingUnit, maxNumThreads); + if (errorCode /= SPFFT_SUCCESS) error stop + + ! create transform + ! Note: A transform handle can be created without a grid if no resource sharing is desired. + errorCode = spfft_transform_create(transform, grid, processingUnit, 0, dimX, dimY, dimZ, dimZ,& + size(frequencyElements), SPFFT_INDEX_TRIPLETS, indices) + if (errorCode /= SPFFT_SUCCESS) error stop + + ! grid can be safely destroyed after creating all required transforms + errorCode = spfft_grid_destroy(grid) + if (errorCode /= SPFFT_SUCCESS) error stop + + + ! ************************************************* + ! Option A: Reuse internal buffer for space domain + ! ************************************************* + + ! set space domain array to use memory allocted by the library + errorCode = spfft_transform_get_space_domain(transform, processingUnit, realValuesPtr) + if (errorCode /= SPFFT_SUCCESS) error stop + + ! transform backward + errorCode = spfft_transform_backward(transform, frequencyElements, processingUnit) + if (errorCode /= SPFFT_SUCCESS) error stop + + + call c_f_pointer(realValuesPtr, spaceDomainPtr, [dimX,dimY,dimZ]) + + print *, "" + print *, "After backward transform:" + do k = 1, size(spaceDomainPtr, 3) + do j = 1, size(spaceDomainPtr, 2) + do i = 1, size(spaceDomainPtr, 1) + print *, spaceDomainPtr(i, j, k) + end do + end do + end do + + + ! ********************************************** + ! Option B: Use external buffer for space domain + ! ********************************************** + + ! transform backward + errorCode = spfft_transform_backward_ptr(transform, frequencyElements, spaceDomain) + if (errorCode /= SPFFT_SUCCESS) error stop + + ! transform forward + errorCode = spfft_transform_forward_ptr(transform, spaceDomain, frequencyElements, SPFFT_NO_SCALING) + if (errorCode /= SPFFT_SUCCESS) error stop + + print *, "" + print *, "After forward transform (without normalization):" + do i = 1, size(frequencyElements) + print *, frequencyElements(i) + end do + + ! destroying the final transform will free the associated memory + errorCode = spfft_transform_destroy(transform) + if (errorCode /= SPFFT_SUCCESS) error stop + +end diff --git a/docs/source/index.rst b/docs/source/index.rst index 1302fd9..2e72741 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -34,7 +34,7 @@ To allow for pre-allocation and reuse of memory, the design is based on two clas - **Grid**: Allocates memory for transforms up to a given size in each dimension. - **Transform**: Is associated with a *Grid* and can have any size up to the *Grid* dimensions. A *Transform* holds a counted reference to the underlying *Grid*. Therefore, *Transforms* created with the same *Grid* share memory, which is only freed, once the *Grid* and all associated *Transforms* are destroyed. -The user provides memory for storing sparse frequency domain data, while a *Transform* provides memory for space domain data. This implies, that executing a *Transform* will override the space domain data of all other *Transforms* associated with the same *Grid*. +A transform can be computed in-place and out-of-place. Addtionally, an internally allocated work buffer can optionally be used for input / output of space domain data. .. note:: The creation of Grids and Transforms, as well as the forward and backward execution may entail MPI calls and must be synchronized between all ranks. @@ -77,8 +77,6 @@ The user provides memory for storing sparse frequency domain data, while a *Tran - - .. Indices and tables .. ================== diff --git a/examples/example.c b/examples/example.c index 03bf3c6..c0679a3 100644 --- a/examples/example.c +++ b/examples/example.c @@ -68,8 +68,14 @@ int main(int argc, char** argv) { status = spfft_grid_destroy(grid); if (status != SPFFT_SUCCESS) exit(status); - /* get pointer to space domain data. Alignment is guaranteed to fullfill requirements C complex - types */ + + /************************************************** + Option A: Reuse internal buffer for space domain + ***************************************************/ + + /* Get pointer to buffer with space domain data. Is guaranteed to be castable to a valid + complex type pointer. Using the internal working buffer as input / output can help reduce + memory usage.*/ double* spaceDomain; status = spfft_transform_get_space_domain(transform, SPFFT_PU_HOST, &spaceDomain); if (status != SPFFT_SUCCESS) exit(status); @@ -84,11 +90,23 @@ int main(int argc, char** argv) { } printf("\n"); + + /********************************************** + Option B: Use external buffer for space domain + ***********************************************/ + spaceDomain = (double*)malloc(2 * sizeof(double) * dimX * dimY * dimZ); + + /* transform backward */ + status = spfft_transform_backward_ptr(transform, frequencyElements, spaceDomain); + if (status != SPFFT_SUCCESS) exit(status); + /* transform forward */ - status = spfft_transform_forward(transform, SPFFT_PU_HOST, frequencyElements, SPFFT_NO_SCALING); + status = spfft_transform_forward_ptr(transform, spaceDomain, frequencyElements, SPFFT_NO_SCALING); if (status != SPFFT_SUCCESS) exit(status); - printf("After forward transform (without scaling):\n"); + /* Note: In-place transforms are also supported by passing the same pointer for input and output. */ + + printf("After forward transform (without normalization):\n"); for (size_t i = 0; i < dimX * dimY * dimZ; ++i) { printf("%f, %f\n", frequencyElements[2 * i], frequencyElements[2 * i + 1]); } @@ -97,5 +115,8 @@ int main(int argc, char** argv) { status = spfft_transform_destroy(transform); if (status != SPFFT_SUCCESS) exit(status); + free(spaceDomain); + free(frequencyElements); + return 0; } diff --git a/examples/example.cpp b/examples/example.cpp index f922fbb..dd854fb 100644 --- a/examples/example.cpp +++ b/examples/example.cpp @@ -15,21 +15,21 @@ int main(int argc, char** argv) { // Use default OpenMP value const int numThreads = -1; - // use all elements in this example. + // Use all elements in this example. const int numFrequencyElements = dimX * dimY * dimZ; // Slice length in space domain. Equivalent to dimZ for non-distributed case. const int localZLength = dimZ; - // interleaved complex numbers + // Interleaved complex numbers std::vector frequencyElements; frequencyElements.reserve(2 * numFrequencyElements); - // indices of frequency elements + // Indices of frequency elements std::vector indices; indices.reserve(dimX * dimY * dimZ * 3); - // initialize frequency domain values and indices + // Initialize frequency domain values and indices double initValue = 0.0; for (int xIndex = 0; xIndex < dimX; ++xIndex) { for (int yIndex = 0; yIndex < dimY; ++yIndex) { @@ -53,31 +53,48 @@ int main(int argc, char** argv) { std::cout << frequencyElements[2 * i] << ", " << frequencyElements[2 * i + 1] << std::endl; } - // create local Grid. For distributed computations, a MPI Communicator has to be provided + // Create local Grid. For distributed computations, a MPI Communicator has to be provided spfft::Grid grid(dimX, dimY, dimZ, dimX * dimY, SPFFT_PU_HOST, numThreads); - // create transform + // Create transform. + // Note: A transform handle can be created without a grid if no resource sharing is desired. spfft::Transform transform = grid.create_transform(SPFFT_PU_HOST, SPFFT_TRANS_C2C, dimX, dimY, dimZ, localZLength, numFrequencyElements, SPFFT_INDEX_TRIPLETS, indices.data()); - // Get pointer to space domain data. Alignment fullfills requirements for std::complex. - // Can also be read as std::complex elements (guaranteed by the standard to be binary compatible - // since C++11). - double* spaceDomain = transform.space_domain_data(SPFFT_PU_HOST); - // transform backward + /////////////////////////////////////////////////// + // Option A: Reuse internal buffer for space domain + /////////////////////////////////////////////////// + + // Transform backward transform.backward(frequencyElements.data(), SPFFT_PU_HOST); + // Get pointer to buffer with space domain data. Is guaranteed to be castable to a valid + // std::complex pointer. Using the internal working buffer as input / output can help reduce + // memory usage. + double* spaceDomainPtr = transform.space_domain_data(SPFFT_PU_HOST); + std::cout << std::endl << "After backward transform:" << std::endl; for (int i = 0; i < transform.local_slice_size(); ++i) { - std::cout << spaceDomain[2 * i] << ", " << spaceDomain[2 * i + 1] << std::endl; + std::cout << spaceDomainPtr[2 * i] << ", " << spaceDomainPtr[2 * i + 1] << std::endl; } - // transform forward - transform.forward(SPFFT_PU_HOST, frequencyElements.data(), SPFFT_NO_SCALING); + ///////////////////////////////////////////////// + // Option B: Use external buffer for space domain + ///////////////////////////////////////////////// + + std::vector spaceDomainVec(2 * transform.local_slice_size()); + + // Transform backward + transform.backward(frequencyElements.data(), spaceDomainVec.data()); + + // Transform forward + transform.forward(spaceDomainVec.data(), frequencyElements.data(), SPFFT_NO_SCALING); + + // Note: In-place transforms are also supported by passing the same pointer for input and output. - std::cout << std::endl << "After forward transform (without scaling):" << std::endl; + std::cout << std::endl << "After forward transform (without normalization):" << std::endl; for (int i = 0; i < numFrequencyElements; ++i) { std::cout << frequencyElements[2 * i] << ", " << frequencyElements[2 * i + 1] << std::endl; } diff --git a/examples/example.f90 b/examples/example.f90 index f41a12c..873653b 100644 --- a/examples/example.f90 +++ b/examples/example.f90 @@ -1,4 +1,3 @@ - program main use iso_c_binding use spfft @@ -15,7 +14,8 @@ program main integer :: errorCode = 0 integer, dimension(dimX * dimY * dimZ * 3):: indices = 0 complex(C_DOUBLE_COMPLEX), dimension(dimX * dimY * dimZ):: frequencyElements - complex(C_DOUBLE_COMPLEX), pointer :: spaceDomain(:,:,:) + real(C_DOUBLE), dimension(2*dimX * dimY * dimZ):: spaceDomain + complex(C_DOUBLE_COMPLEX), pointer :: spaceDomainPtr(:,:,:) type(c_ptr) :: realValuesPtr @@ -39,9 +39,12 @@ program main end do - ! create grid and transform + ! create grid errorCode = spfft_grid_create(grid, dimX, dimY, dimZ, maxNumLocalZColumns, processingUnit, maxNumThreads); if (errorCode /= SPFFT_SUCCESS) error stop + + ! create transform + ! Note: A transform handle can be created without a grid if no resource sharing is desired. errorCode = spfft_transform_create(transform, grid, processingUnit, 0, dimX, dimY, dimZ, dimZ,& size(frequencyElements), SPFFT_INDEX_TRIPLETS, indices) if (errorCode /= SPFFT_SUCCESS) error stop @@ -50,6 +53,11 @@ program main errorCode = spfft_grid_destroy(grid) if (errorCode /= SPFFT_SUCCESS) error stop + + ! ************************************************* + ! Option A: Reuse internal buffer for space domain + ! ************************************************* + ! set space domain array to use memory allocted by the library errorCode = spfft_transform_get_space_domain(transform, processingUnit, realValuesPtr) if (errorCode /= SPFFT_SUCCESS) error stop @@ -59,24 +67,33 @@ program main if (errorCode /= SPFFT_SUCCESS) error stop - call c_f_pointer(realValuesPtr, spaceDomain, [dimX,dimY,dimZ]) + call c_f_pointer(realValuesPtr, spaceDomainPtr, [dimX,dimY,dimZ]) print *, "" print *, "After backward transform:" - do k = 1, size(spaceDomain, 3) - do j = 1, size(spaceDomain, 2) - do i = 1, size(spaceDomain, 1) - print *, spaceDomain(i, j, k) + do k = 1, size(spaceDomainPtr, 3) + do j = 1, size(spaceDomainPtr, 2) + do i = 1, size(spaceDomainPtr, 1) + print *, spaceDomainPtr(i, j, k) end do end do end do - ! transform forward (will invalidate space domain data) - errorCode = spfft_transform_forward(transform, processingUnit, frequencyElements, 0) + + ! ********************************************** + ! Option B: Use external buffer for space domain + ! ********************************************** + + ! transform backward + errorCode = spfft_transform_backward_ptr(transform, frequencyElements, spaceDomain) + if (errorCode /= SPFFT_SUCCESS) error stop + + ! transform forward + errorCode = spfft_transform_forward_ptr(transform, spaceDomain, frequencyElements, SPFFT_NO_SCALING) if (errorCode /= SPFFT_SUCCESS) error stop print *, "" - print *, "After forward transform (without scaling):" + print *, "After forward transform (without normalization):" do i = 1, size(frequencyElements) print *, frequencyElements(i) end do