diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..0020756
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,159 @@
+cmake_minimum_required(VERSION 3.8 FATAL_ERROR) # 3.8 added CUDA language support without FindCUDA
+project(SpFFT LANGUAGES CXX VERSION 0.9.1)
+
+# allow {module}_ROOT variables to be set
+if(POLICY CMP0074)
+  cmake_policy(SET CMP0074 NEW)
+endif()
+
+# use INTERFACE_LINK_LIBRARIES property if available
+if(POLICY CMP0022)
+  cmake_policy(SET CMP0022 NEW)
+endif()
+
+# set default build type to RELEASE
+if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
+  set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Build type" FORCE)
+  set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS
+    "Debug" "Release" "MinSizeRel" "RelWithDebInfo"
+    )
+endif()
+
+# set language and standard
+set(CMAKE_CXX_STANDARD 11)
+
+# add local module path
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${PROJECT_SOURCE_DIR}/cmake/modules)
+
+# Options
+option(SPFFT_STATIC "Compile as static library" OFF)
+option(SPFFT_OMP "Compile with OpenMP support" ON)
+option(SPFFT_MPI "Compile with MPI support" ON)
+option(SPFFT_GPU_DIRECT "Compile with GPU direct (GPU aware MPI) support." OFF)
+option(SPFFT_BUILD_TESTS "Build tests" OFF)
+option(SPFFT_SINGLE_PRECISION "Enable single precision support" OFF)
+option(SPFFT_INSTALL "Enable CMake install commands" ON)
+
+set(SPFFT_GPU_BACKEND "OFF" CACHE STRING "GPU backend")
+set_property(CACHE SPFFT_GPU_BACKEND PROPERTY STRINGS
+  "OFF" "CUDA" "ROCM"
+  )
+
+
+# set preferred library type
+if (SPFFT_STATIC)
+  # prefer static over dynamic libraries with the find_library() command by changing the order
+  set(CMAKE_FIND_LIBRARY_SUFFIXES_SAVE ${CMAKE_FIND_LIBRARY_SUFFIXES})
+  if(APPLE)
+    set(CMAKE_FIND_LIBRARY_SUFFIXES .a .tbd .dylib .so)
+  elseif(UNIX)
+    set(CMAKE_FIND_LIBRARY_SUFFIXES .a .so)
+  endif()
+  set(SPFFT_LIBRARY_TYPE STATIC)
+else()
+  set(SPFFT_LIBRARY_TYPE SHARED)
+endif()
+
+set(SPFFT_DEFINITIONS)
+set(SPFFT_EXTERNAL_COMPILE_OPTIONS)
+set(SPFFT_LIBS)
+set(SPFFT_EXTERNAL_LIBS)
+set(SPFFT_INTERFACE_LIBS)
+set(SPFFT_INCLUDE_DIRS)
+set(SPFFT_EXTERNAL_INCLUDE_DIRS)
+
+# Options combination check
+set(SPFFT_CUDA OFF)
+set(SPFFT_ROCM OFF)
+if(SPFFT_GPU_BACKEND)
+  if(SPFFT_GPU_BACKEND STREQUAL "CUDA")
+    set(SPFFT_CUDA ON)
+  elseif(SPFFT_GPU_BACKEND STREQUAL "ROCM")
+    set(SPFFT_ROCM ON)
+  else()
+    message(FATAL_ERROR "Invalid GPU backend option")
+  endif()
+endif()
+mark_as_advanced(SPFFT_CUDA SPFFT_ROCM)
+
+# CUDA
+if(SPFFT_CUDA)
+  enable_language(CUDA)
+  find_library(CUDA_CUDART_LIBRARY cudart PATHS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
+  find_library(CUDA_CUFFT_LIBRARY cufft PATHS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
+  list(APPEND SPFFT_EXTERNAL_LIBS ${CUDA_CUDART_LIBRARY} ${CUDA_CUFFT_LIBRARY})
+  list(APPEND SPFFT_EXTERNAL_INCLUDE_DIRS ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+endif()
+
+# ROCM
+if(SPFFT_ROCM)
+  find_package(ROCM REQUIRED COMPONENTS rocfft)
+  list(APPEND SPFFT_EXTERNAL_INCLUDE_DIRS ${ROCM_INCLUDE_DIRS})
+  list(APPEND SPFFT_EXTERNAL_LIBS ${ROCM_LIBRARIES})
+  list(APPEND SPFFT_EXTERNAL_COMPILE_OPTIONS ${ROCM_DEFINITIONS})
+endif()
+
+
+
+if(SPFFT_MPI)
+  find_package(MPI REQUIRED)
+  list(APPEND SPFFT_EXTERNAL_LIBS MPI::MPI_CXX)
+  list(APPEND SPFFT_INTERFACE_LIBS ${MPI_CXX_LIBRARIES})
+endif()
+
+if(SPFFT_OMP)
+  find_package(OpenMP REQUIRED)
+  list(APPEND SPFFT_EXTERNAL_LIBS OpenMP::OpenMP_CXX)
+  list(APPEND SPFFT_INTERFACE_LIBS ${OpenMP_CXX_LIBRARIES})
+endif()
+
+if(SPFFT_GPU_DIRECT)
+  message(STATUS "GPU Direct support enabled: Additional environment variables might have to be set before execution. (e.g. \"export MPICH_RDMA_ENABLED_CUDA=1\")")
+endif()
+
+
+
+# Use MKL if available, otherwise require FFTW3
+find_package(MKLSequential)
+if(MKLSequential_FOUND)
+  list(APPEND SPFFT_EXTERNAL_LIBS MKL::Sequential)
+  list(APPEND SPFFT_INTERFACE_LIBS ${MKLSequential_LIBRARIES})
+else()
+  find_package(FFTW REQUIRED)
+  list(APPEND SPFFT_EXTERNAL_LIBS FFTW::FFTW)
+  list(APPEND SPFFT_INTERFACE_LIBS ${FFTW_LIBRARIES})
+  if(SPFFT_SINGLE_PRECISION AND NOT FFTW_FLOAT_FOUND)
+    message(FATAL_ERROR "FFTW library with single precision support NOT FOUND. Disable SPFFT_SINGLE_PRECISION or provide path to library.")
+  endif()
+endif()
+
+
+if(SPFFT_BUILD_TESTS)
+  # enable timing with testing
+  set(SPFFT_TIMING ON)
+endif()
+
+# generate config.h
+configure_file(include/spfft/config.h.in ${PROJECT_BINARY_DIR}/spfft/config.h)
+
+list(APPEND SPFFT_INCLUDE_DIRS ${PROJECT_SOURCE_DIR}/src)
+list(APPEND SPFFT_INCLUDE_DIRS ${PROJECT_SOURCE_DIR}/include)
+list(APPEND SPFFT_INCLUDE_DIRS ${PROJECT_BINARY_DIR})
+list(APPEND SPFFT_EXTERNAL_INCLUDE_DIRS ${PROJECT_SOURCE_DIR}/ext)
+
+#############################################################################
+# All include dirs and definitions must be set before sub-directory is added!
+#############################################################################
+add_subdirectory(src)
+
+list(APPEND SPFFT_LIBS spfft)
+
+# add tests for development
+if(SPFFT_BUILD_TESTS)
+  add_subdirectory(tests)
+endif()
+
+# reset cmake library suffixes
+if(SPFFT_STATIC)
+  set(CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES_SAVE})
+endif()
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..ab85216
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,25 @@
+Copyright (c) 2019 ETH Zurich, Simon Frasch
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice,
+   this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+3. Neither the name of the copyright holder nor the names of its contributors
+   may be used to endorse or promote products derived from this software
+   without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
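Since `SPFFT_INSTALL` defaults to ON and `SpFFTConfig.cmake` (added further below in this diff) includes `SpFFTTargets.cmake`, the installed library is evidently meant to be consumed via `find_package`. A minimal sketch of a downstream project follows; note that the exported target name `SpFFT::spfft` is an assumption here, since the export set is generated by `src/CMakeLists.txt`, which is not part of this diff:

```cmake
cmake_minimum_required(VERSION 3.8)
project(spfft_consumer LANGUAGES CXX)

# Point CMake at the install prefix used above, e.g. with
# -DCMAKE_PREFIX_PATH=/usr/local, so that SpFFTConfig.cmake is found.
find_package(SpFFT REQUIRED)

add_executable(example main.cpp)

# SpFFT::spfft is the assumed name of the target imported
# through SpFFTTargets.cmake.
target_link_libraries(example PRIVATE SpFFT::spfft)
```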
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..f068ff0
--- /dev/null
+++ b/README.md
@@ -0,0 +1,165 @@
+# SpFFT
+SpFFT is a library for the computation of 3D FFTs with sparse frequency domain data, written in C++ with support for MPI, OpenMP, CUDA and ROCm.
+
+It was originally intended for transforms of data with a spherical cutoff in frequency domain, as required by some computational material science codes, but was generalized to arbitrary sparse frequency domain data.
+
+
+### Design Goals
+- Sparse frequency domain input
+- Reuse of pre-allocated memory
+- Support of negative indexing for frequency domain data
+- Unified interface for calculations on CPUs and GPUs
+- Support of Complex-To-Real and Real-To-Complex transforms, where the full Hermitian symmetry property is utilized. Therefore, there is no redundant frequency domain data, as is usually the case for dense 3D R2C / C2R transforms with libraries such as FFTW.
+- C++, C and Fortran interfaces
+
+### Interface Design
+To allow for pre-allocation and reuse of memory, the design is based on two classes:
+
+- **Grid**: Allocates memory for transforms up to a given size in each dimension.
+- **Transform**: Is created using a *Grid* and can have any size up to the maximum allowed by the *Grid*. A *Transform* holds a counted reference to the underlying *Grid*. Therefore, *Transforms* created from the same *Grid* will share the memory, which is only freed once the *Grid* and all associated *Transforms* are destroyed.
+
+The user provides memory for storing the sparse frequency domain data, while a *Transform* provides memory for the space domain data. This implies that executing a *Transform* will overwrite the space domain data of all other *Transforms* associated with the same *Grid*.
+
+## Documentation
+Documentation can be found HERE (TODO).
+
+## Requirements
+- C++ Compiler with C++11 support
+- CMake version 3.8 or greater
+- Library providing an FFTW 3.x interface (FFTW3 or Intel MKL)
+- For multi-threading: OpenMP support by the compiler
+- For GPU support: CUDA or ROCm
+
+## Installation
+The build system follows the standard CMake workflow. Example:
+```console
+mkdir build
+cd build
+cmake .. -DSPFFT_OMP=ON -DSPFFT_MPI=ON -DSPFFT_GPU_BACKEND=CUDA -DSPFFT_SINGLE_PRECISION=OFF -DCMAKE_INSTALL_PREFIX=/usr/local
+make -j8 install
+```
+
+### CMake options
+| Option                 | Default | Description                                      |
+|------------------------|---------|--------------------------------------------------|
+| SPFFT_MPI              | ON      | Enable MPI support                               |
+| SPFFT_OMP              | ON      | Enable multi-threading with OpenMP               |
+| SPFFT_GPU_BACKEND      | OFF     | Select GPU backend. Can be OFF, CUDA or ROCM     |
+| SPFFT_GPU_DIRECT       | OFF     | Use GPU aware MPI with GPUDirect                 |
+| SPFFT_SINGLE_PRECISION | OFF     | Enable single precision support                  |
+| SPFFT_STATIC           | OFF     | Build as static library                          |
+| SPFFT_BUILD_TESTS      | OFF     | Build test executables for development purposes  |
+| SPFFT_INSTALL          | ON      | Add library to install target                    |
+
+## Examples
+Further examples for C++, C and Fortran can be found in the "examples" folder. The following C++ example executes a backward and a forward transform on a small 2x2x2 grid:
+```cpp
+#include <complex>
+#include <iostream>
+#include <vector>
+
+#include "spfft/spfft.hpp"
+
+int main(int argc, char** argv) {
+  const int dimX = 2;
+  const int dimY = 2;
+  const int dimZ = 2;
+
+  std::cout << "Dimensions: x = " << dimX << ", y = " << dimY << ", z = " << dimZ << std::endl
+            << std::endl;
+
+  const int numThreads = -1; // Use default OpenMP value
+
+  std::vector<std::complex<double>> freqValues;
+  freqValues.reserve(dimX * dimY * dimZ);
+
+  std::vector<int> indices;
+  indices.reserve(dimX * dimY * dimZ * 3);
+
+  // initialize frequency domain values and indices
+  double initValue = 0.0;
+  for (int xIndex = 0; xIndex < dimX; ++xIndex) {
+    for (int yIndex = 0; yIndex < dimY; ++yIndex) {
+      for (int zIndex = 0; zIndex < dimZ; ++zIndex) {
+        // init values
+        freqValues.emplace_back(initValue, -initValue);
+
+        // add index triplet for value
+        indices.emplace_back(xIndex);
+        indices.emplace_back(yIndex);
+        indices.emplace_back(zIndex);
+
+        initValue += 1.0;
+      }
+    }
+  }
+
+  std::cout << "Input:" << std::endl;
+  for (const auto& value : freqValues) {
+    std::cout << value.real() << ", " << value.imag() << std::endl;
+  }
+
+  // create local Grid. For distributed computations, an MPI Communicator has to be provided
+  spfft::Grid grid(dimX, dimY, dimZ, dimX * dimY, SPFFT_PU_HOST, numThreads);
+
+  // create transform
+  spfft::Transform transform =
+      grid.create_transform(SPFFT_PU_HOST, SPFFT_TRANS_C2C, dimX, dimY, dimZ, dimZ,
+                            freqValues.size(), SPFFT_INDEX_TRIPLETS, indices.data());
+
+  // get pointer to space domain data. Alignment is guaranteed to fulfill requirements for
+  // std::complex
+  std::complex<double>* realValues =
+      reinterpret_cast<std::complex<double>*>(transform.space_domain_data(SPFFT_PU_HOST));
+
+  // transform backward
+  transform.backward(reinterpret_cast<double*>(freqValues.data()), SPFFT_PU_HOST);
+
+  std::cout << std::endl << "After backward transform:" << std::endl;
+  for (int i = 0; i < transform.local_slice_size(); ++i) {
+    std::cout << realValues[i].real() << ", " << realValues[i].imag() << std::endl;
+  }
+
+  // transform forward
+  transform.forward(SPFFT_PU_HOST, reinterpret_cast<double*>(freqValues.data()), SPFFT_NO_SCALING);
+
+  std::cout << std::endl << "After forward transform (without scaling):" << std::endl;
+  for (const auto& value : freqValues) {
+    std::cout << value.real() << ", " << value.imag() << std::endl;
+  }
+
+  return 0;
+}
+```
+
+## License
+
+```
+Copyright (c) 2019 ETH Zurich, Simon Frasch
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its contributors
+may be used to endorse or promote products derived from this software
+without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +``` diff --git a/cmake/SpFFTConfig.cmake b/cmake/SpFFTConfig.cmake new file mode 100644 index 0000000..9d0cf51 --- /dev/null +++ b/cmake/SpFFTConfig.cmake @@ -0,0 +1,14 @@ +# options used for building library +set(SPFFT_OMP @SPFFT_OMP@) +set(SPFFT_MPI @SPFFT_MPI@) +set(SPFFT_STATIC @SPFFT_STATIC@) +set(SPFFT_GPU_DIRECT @SPFFT_GPU_DIRECT@) +set(SPFFT_SINGLE_PRECISION @SPFFT_SINGLE_PRECISION@) +set(SPFFT_GPU_BACKEND @SPFFT_GPU_BACKEND@) + +# add version of package +include("${CMAKE_CURRENT_LIST_DIR}/SpFFTConfigVersion.cmake") + +# add library target +include("${CMAKE_CURRENT_LIST_DIR}/SpFFTTargets.cmake") + diff --git a/cmake/modules/FindFFTW.cmake b/cmake/modules/FindFFTW.cmake new file mode 100644 index 0000000..78bad8b --- /dev/null +++ b/cmake/modules/FindFFTW.cmake @@ -0,0 +1,113 @@ +# Copyright (c) 2019 ETH Zurich, Simon Frasch +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + + +#.rst: +# FindFFTW +# ----------- +# +# This module searches for the fftw3 library. 
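+#
+# The search can be directed with the FFTW_ROOT CMake variable or environment
+# variable, in which case all default paths are skipped; otherwise, pkg-config
+# (module "fftw3") is queried for hints if available. The single precision
+# library (fftw3f) is detected in addition when present.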
+# +# The following variables are set +# +# :: +# +# FFTW_FOUND - True if double precision fftw library is found +# FFTW_FLOAT_FOUND - True if single precision fftw library is found +# FFTW_LIBRARIES - The required libraries +# FFTW_INCLUDE_DIRS - The required include directory +# +# The following import target is created +# +# :: +# +# FFTW::FFTW + + + +# set paths to look for library +set(_FFTW_PATHS ${FFTW_ROOT} $ENV{FFTW_ROOT}) + +set(_FFTW_DEFAULT_PATH_SWITCH) + +if(_FFTW_PATHS) + # disable default paths if ROOT is set + set(_FFTW_DEFAULT_PATH_SWITCH NO_DEFAULT_PATH) +else() + # try to detect location with pkgconfig + find_package(PkgConfig QUIET) + if(PKG_CONFIG_FOUND) + pkg_check_modules(PKG_FFTW QUIET "fftw3") + endif() + set(_FFTW_PATHS ${PKG_FFTW_LIBRARY_DIRS}) +endif() + + +find_library( + FFTW_LIBRARIES + NAMES "fftw3" + HINTS ${_FFTW_PATHS} + PATH_SUFFIXES "lib" "lib64" + ${_FFTW_DEFAULT_PATH_SWITCH} +) +find_library( + _FFTW_FLOAT_LIBRARY + NAMES "fftw3f" + HINTS ${_FFTW_PATHS} + PATH_SUFFIXES "lib" "lib64" + ${_FFTW_DEFAULT_PATH_SWITCH} +) +find_path(FFTW_INCLUDE_DIRS + NAMES "fftw3.h" + HINTS ${_FFTW_PATHS} + PATH_SUFFIXES "include" "include/fftw" "../include" "../include/fftw" + ${_FFTW_DEFAULT_PATH_SWITCH} +) + +# check if found +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(FFTW REQUIRED_VARS FFTW_INCLUDE_DIRS FFTW_LIBRARIES ) + +# check if single precision library found +if(_FFTW_FLOAT_LIBRARY AND FFTW_FOUND) + list(APPEND FFTW_LIBRARIES ${_FFTW_FLOAT_LIBRARY}) + set(FFTW_FLOAT_FOUND TRUE) +else() + set(FFTW_FLOAT_FOUND FALSE) +endif() + + +# add target to link against +if(FFTW_FOUND) + if(NOT TARGET FFTW::FFTW) + add_library(FFTW::FFTW INTERFACE IMPORTED) + endif() + set_property(TARGET FFTW::FFTW PROPERTY INTERFACE_LINK_LIBRARIES ${FFTW_LIBRARIES}) + set_property(TARGET FFTW::FFTW PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${FFTW_INCLUDE_DIRS}) +endif() + +# prevent clutter in cache +MARK_AS_ADVANCED(FFTW_FOUND FFTW_LIBRARIES FFTW_INCLUDE_DIRS pkgcfg_lib_PKG_FFTW_fftw3 _FFTW_FLOAT_LIBRARY) diff --git a/cmake/modules/FindMKLSequential.cmake b/cmake/modules/FindMKLSequential.cmake new file mode 100644 index 0000000..bf84d87 --- /dev/null +++ b/cmake/modules/FindMKLSequential.cmake @@ -0,0 +1,138 @@ +# Copyright (c) 2019 ETH Zurich, Simon Frasch +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+
+#.rst:
+# FindMKLSequential
+# -----------
+#
+# This module searches for the sequential 32-bit integer MKL library.
+# Only looks for static libraries by default.
+#
+#
+# The following variables are set
+#
+# ::
+#
+#   MKLSequential_FOUND - True if the sequential MKL library is found
+#   MKLSequential_LIBRARIES - The required libraries
+#   MKLSequential_INCLUDE_DIRS - The required include directory
+#   MKLSequential_FFTW_INCLUDE_DIRS - The required fftw interface include directory
+#
+# The following import target is created
+#
+# ::
+#
+#   MKL::Sequential
+
+
+# try to detect location with pkgconfig
+if(NOT MKLSequential_ROOT)
+  find_package(PkgConfig QUIET)
+  if(PKG_CONFIG_FOUND)
+    # look for dynamic module, such that a -L flag can be parsed
+    pkg_check_modules(PKG_MKL QUIET "mkl-dynamic-lp64-seq")
+  endif()
+endif()
+
+# set paths to look for MKL
+set(_MKLSequential_PATHS ${MKLSequential_ROOT} $ENV{MKLROOT} ${PKG_MKL_LIBRARY_DIRS})
+
+# do not look at any default paths if a custom path was set or a pkg-config module exists
+set(_MKLSequential_DEFAULT_PATH_SWITCH)
+if(_MKLSequential_PATHS)
+  set(_MKLSequential_DEFAULT_PATH_SWITCH NO_DEFAULT_PATH)
+endif()
+
+
+# find all MKL libraries / include directories
+find_library(
+  _MKLSequential_INT_LIB
+  NAMES "mkl_intel_lp64"
+  HINTS ${_MKLSequential_PATHS}
+  PATH_SUFFIXES "intel64_lin" "intel64" "lib/intel64_lin" "lib/intel64"
+  ${_MKLSequential_DEFAULT_PATH_SWITCH}
+)
+find_library(
+  _MKLSequential_SEQ_LIB
+  NAMES "mkl_sequential"
+  HINTS ${_MKLSequential_PATHS}
+  PATH_SUFFIXES "intel64_lin" "intel64" "lib/intel64_lin" "lib/intel64"
+  ${_MKLSequential_DEFAULT_PATH_SWITCH}
+)
+find_library(
+  _MKLSequential_CORE_LIB
+  NAMES "mkl_core"
+  HINTS ${_MKLSequential_PATHS}
+  PATH_SUFFIXES "intel64_lin" "intel64" "lib/intel64_lin" "lib/intel64"
+  ${_MKLSequential_DEFAULT_PATH_SWITCH}
+)
+find_path(MKLSequential_INCLUDE_DIRS
+  NAMES "mkl.h"
+  HINTS ${_MKLSequential_PATHS}
+  PATH_SUFFIXES "include" "../include"
+  ${_MKLSequential_DEFAULT_PATH_SWITCH}
+)
+find_path(MKLSequential_FFTW_INCLUDE_DIRS
+  NAMES "fftw3.h"
+  HINTS ${_MKLSequential_PATHS}
+  PATH_SUFFIXES "include" "../include" "include/fftw" "../include/fftw" "fftw"
+  ${_MKLSequential_DEFAULT_PATH_SWITCH}
+)
+
+# check if found
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(MKLSequential REQUIRED_VARS _MKLSequential_INT_LIB
+  _MKLSequential_SEQ_LIB _MKLSequential_CORE_LIB MKLSequential_INCLUDE_DIRS MKLSequential_FFTW_INCLUDE_DIRS)
+
+# add target to link against
+if(MKLSequential_FOUND)
+  # libraries have inter-dependencies, therefore use a link group on Linux
+  if(UNIX AND NOT APPLE)
+    set(MKLSequential_LIBRARIES "-Wl,--start-group" ${_MKLSequential_INT_LIB} ${_MKLSequential_SEQ_LIB} ${_MKLSequential_CORE_LIB} "-Wl,--end-group")
+  else()
+    set(MKLSequential_LIBRARIES ${_MKLSequential_INT_LIB} ${_MKLSequential_SEQ_LIB} ${_MKLSequential_CORE_LIB})
+  endif()
+  # external libraries required on Unix
+  if(UNIX)
+    list(APPEND MKLSequential_LIBRARIES -lpthread -lm -ldl)
+  endif()
+
+  # create interface target
+  if(NOT TARGET MKL::Sequential)
+    add_library(MKL::Sequential INTERFACE IMPORTED)
+  endif()
+  set_property(TARGET MKL::Sequential PROPERTY INTERFACE_LINK_LIBRARIES ${MKLSequential_LIBRARIES})
+  set_property(TARGET MKL::Sequential PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${MKLSequential_INCLUDE_DIRS} ${MKLSequential_FFTW_INCLUDE_DIRS})
+endif()
+
+# prevent clutter in GUI
+MARK_AS_ADVANCED(MKLSequential_FOUND MKLSequential_LIBRARIES MKLSequential_INCLUDE_DIRS
+  _MKLSequential_INT_LIB _MKLSequential_SEQ_LIB _MKLSequential_CORE_LIB MKLSequential_FFTW_INCLUDE_DIRS
+  _MKLSequential_DEFAULT_PATH_SWITCH _MKLSequential_PATHS)
+
+MARK_AS_ADVANCED(pkgcfg_lib_PKG_MKL_dl pkgcfg_lib_PKG_MKL_m pkgcfg_lib_PKG_MKL_mkl_core
+  pkgcfg_lib_PKG_MKL_mkl_sequential pkgcfg_lib_PKG_MKL_mkl_intel_lp64 pkgcfg_lib_PKG_MKL_pthread)
diff --git a/cmake/modules/FindROCM.cmake b/cmake/modules/FindROCM.cmake
new file mode 100644
index 0000000..d48d486
--- /dev/null
+++ b/cmake/modules/FindROCM.cmake
@@ -0,0 +1,439 @@
+# - Find the ROCM library
+#
+# Usage:
+#   find_package(ROCM [REQUIRED] [QUIET] COMPONENTS [components ...])
+#
+# Components available:
+#   - hipblas
+#   - hipsparse
+#   - rocfft
+#   - rocblas
+#   - rocsparse
+#
+# Commands made available:
+#   rocm_hip_add_library(<name> <sources> [STATIC | SHARED] [FLAGS] [OUTPUT_DIR] [INCLUDE_DIRS])
+#   --- Compiles source files into an imported library with hipcc. No global definitions or include directories are taken into account.
+#
+# The following variables can be set for compilation:
+#   ROCM_HIPCC_FLAGS ----------------- Flags passed on to hipcc compiler
+#   ROCM_HIPCC_FLAGS_DEBUG ----------- Flags passed on to hipcc compiler in DEBUG mode
+#   ROCM_HIPCC_FLAGS_RELEASE --------- Flags passed on to hipcc compiler in RELEASE mode
+#   ROCM_HIPCC_FLAGS_RELWITHDEBINFO -- Flags passed on to hipcc compiler in RELWITHDEBINFO mode
+#   ROCM_HIPCC_FLAGS_MINSIZEREL ------ Flags passed on to hipcc compiler in MINSIZEREL mode
+#
+# The following variables can be set to specify a search location
+#   ROCM_ROOT ------------- if set, the libraries are exclusively searched under this path
+#   <MODULE_NAME>_ROOT ---- if set, search for component specific libraries at given path. Takes precedence over ROCM_ROOT
+#
+# The following variables are generated:
+#   ROCM_FOUND ------------------- true if ROCM is found on the system
+#   ROCM_LIBRARIES --------------- full path to ROCM
+#   ROCM_INCLUDE_DIRS ------------ ROCM include directories
+#   ROCM_DEFINITIONS ------------- ROCM definitions
+#   ROCM_HCC_EXECUTABLE ---------- ROCM HCC compiler
+#   ROCM_HCC-CONFIG_EXECUTABLE --- ROCM HCC config
+#   ROCM_HIPCC_EXECUTABLE -------- HIPCC compiler
+#   ROCM_HIPCONFIG_EXECUTABLE ---- hip config
+#   ROCM_HIPIFY-PERL_EXECUTABLE -- hipify
+#   ROCM_HIP_PLATFORM ------------ Platform identifier: "hcc" or "nvcc"
+#
+
+
+set(ROCM_HIPCC_FLAGS "" CACHE STRING "Flags for HIPCC Compiler")
+set(ROCM_HIPCC_FLAGS_DEBUG "-g" CACHE STRING "Debug flags for HIPCC Compiler")
+set(ROCM_HIPCC_FLAGS_RELEASE "-O3 -DNDEBUG" CACHE STRING "Release flags for HIPCC Compiler")
+set(ROCM_HIPCC_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG" CACHE STRING "Release with debug flags for HIPCC Compiler")
+set(ROCM_HIPCC_FLAGS_MINSIZEREL "-Os -DNDEBUG" CACHE STRING "Minimum size flags for HIPCC Compiler")
+
+# If the environment variable ROCM_ROOT is specified
+if(NOT ROCM_ROOT AND DEFINED ENV{ROCM_ROOT})
+  file(TO_CMAKE_PATH "$ENV{ROCM_ROOT}" ROCM_ROOT)
+  set(ROCM_ROOT "${ROCM_ROOT}" CACHE PATH "Root directory for ROCM installation.")
+endif()
+
+set(ROCM_FOUND FALSE)
+set(ROCM_LIBRARIES)
+set(ROCM_INCLUDE_DIRS)
+set(ROCM_DEFINITIONS)
+unset(ROCM_HCC_EXECUTABLE)
+unset(ROCM_HCC-CONFIG_EXECUTABLE)
+unset(ROCM_HIPCC_EXECUTABLE)
+unset(ROCM_HIPCONFIG_EXECUTABLE)
+unset(ROCM_HIPIFY-PERL_EXECUTABLE)
+unset(ROCM_HIP_PLATFORM)
+
+include(FindPackageHandleStandardArgs)
+
+
+# Finds libraries and include path for rocm modules
+# IN:
+#   - module_name: name of a module (e.g. hcc)
+#   - following arguments: name of libraries required
+# OUT:
+#   - ROCM_LIBRARIES: Appends to list of libraries
+#   - ROCM_INCLUDE_DIRS: Appends to include dirs
+function(find_rocm_module module_name)
+  # convert module name to upper case for consistent variable naming
+  string(TOUPPER ${module_name} MODULE_NAME_UPPER)
+
+
+  if(DEFINED ${MODULE_NAME_UPPER}_ROOT)
+    set(ROOT_DIR ${${MODULE_NAME_UPPER}_ROOT})
+  elseif(DEFINED ROCM_ROOT)
+    set(ROOT_DIR ${ROCM_ROOT})
+  endif()
+
+  # get absolute path to avoid issues with tilde
+  if(ROOT_DIR)
+    get_filename_component(ROOT_DIR ${ROOT_DIR} ABSOLUTE)
+  endif()
+
+  # remove module name from input arguments
+  set(LIBRARY_NAMES ${ARGV})
+  list(REMOVE_AT LIBRARY_NAMES 0)
+
+  if(${ROCM_FIND_REQUIRED})
+    set(ROCM_${MODULE_NAME_UPPER}_FIND_REQUIRED TRUE)
+  else()
+    set(ROCM_${MODULE_NAME_UPPER}_FIND_REQUIRED FALSE)
+  endif()
+  if(${ROCM_FIND_QUIETLY})
+    set(ROCM_${MODULE_NAME_UPPER}_FIND_QUIETLY TRUE)
+  else()
+    set(ROCM_${MODULE_NAME_UPPER}_FIND_QUIETLY FALSE)
+  endif()
+
+  set(ROCM_LIBRARIES_${MODULE_NAME_UPPER})
+
+  if(ROOT_DIR)
+    # find libraries
+    foreach(library_name IN LISTS LIBRARY_NAMES)
+      find_library(
+        ROCM_LIBRARIES_${library_name}
+        NAMES ${library_name}
+        PATHS ${ROOT_DIR}
+        PATH_SUFFIXES "lib" "${module_name}/lib"
+        NO_DEFAULT_PATH
+      )
+      find_package_handle_standard_args(ROCM_${MODULE_NAME_UPPER} FAIL_MESSAGE
+        "For ROCM module ${module_name}, library ${library_name} could not be found. Please specify ROCM_ROOT or ${MODULE_NAME_UPPER}_ROOT."
+ REQUIRED_VARS ROCM_LIBRARIES_${library_name}) + if(ROCM_LIBRARIES_${library_name}) + list(APPEND ROCM_LIBRARIES_${MODULE_NAME_UPPER} ${ROCM_LIBRARIES_${library_name}}) + mark_as_advanced(ROCM_LIBRARIES_${library_name}) + endif() + endforeach() + + # find include directory + find_path( + ROCM_INCLUDE_DIRS_${MODULE_NAME_UPPER} + NAMES ${module_name}/include + PATHS ${ROOT_DIR} ${ROOT_DIR}/.. + NO_DEFAULT_PATH + ) + # set include directory for module if found + if(ROCM_INCLUDE_DIRS_${MODULE_NAME_UPPER}) + set(ROCM_INCLUDE_DIRS_${MODULE_NAME_UPPER} ${ROCM_INCLUDE_DIRS_${MODULE_NAME_UPPER}}/${module_name}/include) + endif() + + else() + + foreach(library_name IN LISTS LIBRARY_NAMES) + find_library( + ROCM_LIBRARIES_${library_name} + NAMES ${library_name} + PATHS /opt/rocm + PATH_SUFFIXES "lib" "lib64" "${module_name}/lib" "rocm/${module_name}/lib" + ) + find_package_handle_standard_args(ROCM_${MODULE_NAME_UPPER} FAIL_MESSAGE + "For ROCM module ${module_name}, library ${library_name} could not be found. Please specify ROCM_ROOT or ${MODULE_NAME_UPPER}_ROOT." + REQUIRED_VARS ROCM_LIBRARIES_${library_name}) + if(ROCM_LIBRARIES_${library_name}) + list(APPEND ROCM_LIBRARIES_${MODULE_NAME_UPPER} ${ROCM_LIBRARIES_${library_name}}) + mark_as_advanced(ROCM_LIBRARIES_${library_name}) + endif() + endforeach() + + # find include directory + find_path( + ROCM_INCLUDE_DIRS_${MODULE_NAME_UPPER} + NAMES ${module_name}/include + PATHS /opt/rocm/ + ) + # set include directory for module if found + if(ROCM_INCLUDE_DIRS_${MODULE_NAME_UPPER}) + set(ROCM_INCLUDE_DIRS_${MODULE_NAME_UPPER} ${ROCM_INCLUDE_DIRS_${MODULE_NAME_UPPER}}/${module_name}/include) + endif() + endif() + + + # check if all required parts found + find_package_handle_standard_args(ROCM_${MODULE_NAME_UPPER} FAIL_MESSAGE + "ROCM module ${module_name} could not be found. Please specify ROCM_ROOT or ${MODULE_NAME_UPPER}_ROOT." + REQUIRED_VARS ROCM_INCLUDE_DIRS_${MODULE_NAME_UPPER}) + if(ROCM_INCLUDE_DIRS_${MODULE_NAME_UPPER}) + mark_as_advanced(ROCM_INCLUDE_DIRS_${MODULE_NAME_UPPER}) + endif() + + # set global variables + if(ROCM_LIBRARIES_${MODULE_NAME_UPPER}) + set(ROCM_LIBRARIES ${ROCM_LIBRARIES} ${ROCM_LIBRARIES_${MODULE_NAME_UPPER}} PARENT_SCOPE) + endif() + if(ROCM_INCLUDE_DIRS_${MODULE_NAME_UPPER}) + set(ROCM_INCLUDE_DIRS ${ROCM_INCLUDE_DIRS} ${ROCM_INCLUDE_DIRS_${MODULE_NAME_UPPER}} PARENT_SCOPE) + endif() + +endfunction() + + +# Finds executables of rocm modules +# IN: +# - module_name: name of a module (e.g. hcc) +# - executable_name: name of the executable (e.g. 
hcc)
+# OUT:
+#   - ROCM_${executable_name}_EXECUTABLE: Path to executable
+function(find_rocm_executable module_name executable_name)
+  string(TOUPPER ${module_name} MODULE_NAME_UPPER)
+  string(TOUPPER ${executable_name} EXECUTABLE_NAME_UPPER)
+  unset(ROCM_${EXECUTABLE_NAME_UPPER}_EXECUTABLE PARENT_SCOPE)
+
+  if(DEFINED ${MODULE_NAME_UPPER}_ROOT)
+    set(ROOT_DIR ${${MODULE_NAME_UPPER}_ROOT})
+  elseif(DEFINED ROCM_ROOT)
+    set(ROOT_DIR ${ROCM_ROOT})
+  endif()
+
+  # get absolute path to avoid issues with tilde
+  if(ROOT_DIR)
+    get_filename_component(ROOT_DIR ${ROOT_DIR} ABSOLUTE)
+  endif()
+
+  if(ROOT_DIR)
+    find_file(
+      ROCM_${EXECUTABLE_NAME_UPPER}_EXECUTABLE
+      NAMES ${executable_name}
+      PATHS ${ROOT_DIR}
+      PATH_SUFFIXES "bin" "${module_name}/bin"
+      NO_DEFAULT_PATH
+    )
+  else()
+    find_file(
+      ROCM_${EXECUTABLE_NAME_UPPER}_EXECUTABLE
+      NAMES ${executable_name}
+      PATHS "/opt/rocm"
+      PATH_SUFFIXES "bin" "${module_name}/bin"
+    )
+  endif()
+  set(ROCM_${EXECUTABLE_NAME_UPPER} ROCM_${EXECUTABLE_NAME_UPPER} PARENT_SCOPE)
+
+  if(${ROCM_FIND_REQUIRED})
+    set(ROCM_${MODULE_NAME_UPPER}_${EXECUTABLE_NAME_UPPER}_FIND_REQUIRED TRUE)
+  else()
+    set(ROCM_${MODULE_NAME_UPPER}_${EXECUTABLE_NAME_UPPER}_FIND_REQUIRED FALSE)
+  endif()
+  if(${ROCM_FIND_QUIETLY})
+    set(ROCM_${MODULE_NAME_UPPER}_${EXECUTABLE_NAME_UPPER}_FIND_QUIETLY TRUE)
+  else()
+    set(ROCM_${MODULE_NAME_UPPER}_${EXECUTABLE_NAME_UPPER}_FIND_QUIETLY FALSE)
+  endif()
+  find_package_handle_standard_args(ROCM FAIL_MESSAGE
+    "ROCM_${MODULE_NAME_UPPER}_${EXECUTABLE_NAME_UPPER} ${executable_name} executable could not be found. Please specify ROCM_ROOT or ${MODULE_NAME_UPPER}_ROOT."
+    REQUIRED_VARS ROCM_${EXECUTABLE_NAME_UPPER}_EXECUTABLE)
+  if(ROCM_${EXECUTABLE_NAME_UPPER}_EXECUTABLE)
+    set(ROCM_${EXECUTABLE_NAME_UPPER}_EXECUTABLE ${ROCM_${EXECUTABLE_NAME_UPPER}_EXECUTABLE} PARENT_SCOPE)
+    mark_as_advanced(ROCM_${EXECUTABLE_NAME_UPPER}_EXECUTABLE)
+  endif()
+endfunction()
+
+
+
+# find compilers
+find_rocm_executable(hcc hcc)
+find_rocm_executable(hip hipcc)
+
+if(ROCM_HIPCC_EXECUTABLE AND ROCM_HCC_EXECUTABLE)
+  set(ROCM_FOUND TRUE)
+else()
+  set(ROCM_FOUND FALSE)
+  return()
+endif()
+
+
+# find other executables and libraries
+find_rocm_executable(hcc hcc-config)
+find_rocm_executable(hip hipconfig)
+find_rocm_executable(hip hipify-perl)
+find_rocm_module(hcc LTO mcwamp mcwamp_cpu mcwamp_hsa hc_am)
+find_rocm_module(hip hip_hcc)
+find_rocm_module(rocm hsa-runtime64)
+
+
+# parse hip config
+execute_process(COMMAND ${ROCM_HIPCONFIG_EXECUTABLE} -P OUTPUT_VARIABLE ROCM_HIP_PLATFORM RESULT_VARIABLE RESULT_VALUE)
+if(NOT ${RESULT_VALUE} EQUAL 0)
+  message(FATAL_ERROR "Error parsing platform identifier from hipconfig! Code: ${RESULT_VALUE}")
+endif()
+if(NOT ROCM_HIP_PLATFORM)
+  message(FATAL_ERROR "Empty platform identifier from hipconfig!")
+endif()
+
+# set definitions
+if("${ROCM_HIP_PLATFORM}" STREQUAL "hcc")
+  set(ROCM_DEFINITIONS -D__HIP_PLATFORM_HCC__)
+elseif("${ROCM_HIP_PLATFORM}" STREQUAL "nvcc")
+  set(ROCM_DEFINITIONS -D__HIP_PLATFORM_NVCC__)
+else()
+  message(FATAL_ERROR "Could not parse platform identifier from hipconfig! Value: ${ROCM_HIP_PLATFORM}")
+endif()
+
+# find libraries for each specified component
+foreach(module_name IN LISTS ROCM_FIND_COMPONENTS)
+  # set required libraries for each module
+  if("${module_name}" STREQUAL "hipblas")
+    find_rocm_module(hipblas hipblas)
+  elseif("${module_name}" STREQUAL "hipsparse")
+    find_rocm_module(hipsparse hipsparse)
+  elseif("${module_name}" STREQUAL "rocblas")
+    find_rocm_module(rocblas rocblas)
+  elseif("${module_name}" STREQUAL "rocsparse")
+    find_rocm_module(rocsparse rocsparse)
+  elseif("${module_name}" STREQUAL "rocfft")
+    find_rocm_module(rocfft rocfft rocfft-device)
+  else()
+    message(FATAL_ERROR "Unrecognized component \"${module_name}\" in FindROCM module!")
+  endif()
+endforeach()
+
+
+# Generates library compiled with hipcc
+# Usage:
+#   rocm_hip_add_library(<name> <sources> [STATIC | SHARED] [FLAGS] [OUTPUT_DIR] [INCLUDE_DIRS])
+macro(rocm_hip_add_library)
+  cmake_parse_arguments(
+    HIP_LIB
+    "SHARED;STATIC"
+    "OUTPUT_DIR"
+    "FLAGS;INCLUDE_DIRS"
+    ${ARGN}
+  )
+  # allow either STATIC or SHARED
+  if(HIP_LIB_SHARED AND HIP_LIB_STATIC)
+    message(FATAL_ERROR "rocm_hip_add_library: library cannot be both static and shared!")
+  endif()
+
+  # default to SHARED
+  if(NOT (HIP_LIB_SHARED OR HIP_LIB_STATIC))
+    set(HIP_LIB_SHARED TRUE)
+  endif()
+
+  # default to current binary output directory
+  if(NOT HIP_LIB_OUTPUT_DIR)
+    set(HIP_LIB_OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR})
+  endif()
+
+  # parse positional arguments
+  list(LENGTH HIP_LIB_UNPARSED_ARGUMENTS NARGS)
+  if(${NARGS} LESS 2)
+    message(FATAL_ERROR "rocm_hip_add_library: Not enough arguments!")
+  endif()
+  list(GET HIP_LIB_UNPARSED_ARGUMENTS 0 HIP_LIB_NAME)
+  list(REMOVE_AT HIP_LIB_UNPARSED_ARGUMENTS 0)
+  set(HIP_LIB_SOURCES ${HIP_LIB_UNPARSED_ARGUMENTS})
+
+  # generate include flags
+  set(_ROCM_FULL_PATH_INCLUDE_FLAGS)
+  foreach(_rocm_internal_dir IN LISTS HIP_LIB_INCLUDE_DIRS)
+    if(NOT IS_ABSOLUTE ${_rocm_internal_dir})
+      get_filename_component(_rocm_internal_dir ${_rocm_internal_dir} ABSOLUTE)
+    endif()
+    list(APPEND _ROCM_FULL_PATH_INCLUDE_FLAGS -I${_rocm_internal_dir})
+  endforeach()
+
+  # generate full path to source files
+  unset(_ROCM_SOURCES)
+  foreach(source IN LISTS HIP_LIB_SOURCES)
+    if(NOT IS_ABSOLUTE ${source})
+      get_filename_component(source ${source} ABSOLUTE)
+    endif()
+    set(_ROCM_SOURCES ${_ROCM_SOURCES} ${source})
+  endforeach()
+  get_filename_component(HIP_LIB_OUTPUT_DIR ${HIP_LIB_OUTPUT_DIR} ABSOLUTE)
+
+  # generate flags to use
+  set(_ROCM_STD_FLAGS ${HIP_LIB_FLAGS} ${ROCM_HIPCC_FLAGS})
+  if(_ROCM_STD_FLAGS)
+    list(FILTER _ROCM_STD_FLAGS INCLUDE REGEX -std=)
+  endif()
+  set(_ROCM_FLAGS ${HIP_LIB_FLAGS})
+  if(CMAKE_CXX_STANDARD AND NOT _ROCM_STD_FLAGS)
+    list(APPEND _ROCM_FLAGS -std=c++${CMAKE_CXX_STANDARD})
+  endif()
+  if(CMAKE_BUILD_TYPE)
+    string(TOUPPER ${CMAKE_BUILD_TYPE} _ROCM_BUILD_TYPE_UPPER)
+    list(APPEND _ROCM_FLAGS ${ROCM_HIPCC_FLAGS_${_ROCM_BUILD_TYPE_UPPER}})
+  endif()
+
+  if(NOT ROCM_HIPCC_EXECUTABLE)
+    message(FATAL_ERROR "HIPCC executable not found!")
+  endif()
+
+  set(_ROCM_FLAGS ${_ROCM_FLAGS} -fPIC -fno-gpu-rdc)
+
+  # compile all files to .o
+  set(_ROCM_OBJS)
+  set(_ROCM_OBJ_TARGETS)
+  foreach(_rocm_file IN LISTS _ROCM_SOURCES)
+
+    # create output directory for .o file
+    get_filename_component(_ROCM_CURRENT_DIR ${_rocm_file} DIRECTORY)
+    file(RELATIVE_PATH _ROCM_CURRENT_DIR "${CMAKE_CURRENT_SOURCE_DIR}" ${_ROCM_CURRENT_DIR})
+    set(_ROCM_OBJ_OUT_DIR "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${HIP_LIB_NAME}.dir/${_ROCM_CURRENT_DIR}")
+    file(MAKE_DIRECTORY ${_ROCM_OBJ_OUT_DIR})
+
+    # set .o name and path
+    get_filename_component(_ROCM_FILE_NAME_ONLY ${_rocm_file} NAME)
+    set(_ROCM_OBJ_FILE ${_ROCM_OBJ_OUT_DIR}/${_ROCM_FILE_NAME_ONLY}.o)
+    list(APPEND _ROCM_OBJS ${_ROCM_OBJ_FILE})
+    list(APPEND _ROCM_OBJ_TARGETS HIP_TARGET_${_ROCM_FILE_NAME_ONLY})
+
+    # compile .o file
+    add_custom_target(HIP_TARGET_${_ROCM_FILE_NAME_ONLY} COMMAND ${ROCM_HIPCC_EXECUTABLE} -c ${_rocm_file} -o ${_ROCM_OBJ_FILE} ${_ROCM_FLAGS} ${_ROCM_FULL_PATH_INCLUDE_FLAGS}
+      WORKING_DIRECTORY ${_ROCM_OBJ_OUT_DIR} SOURCES ${_rocm_file})
+
+  endforeach()
+
+  # compile shared library
+  if(HIP_LIB_SHARED)
+    add_custom_target(HIP_TARGET_${HIP_LIB_NAME} COMMAND ${ROCM_HIPCC_EXECUTABLE} ${_ROCM_OBJS} -fPIC --shared -o ${HIP_LIB_OUTPUT_DIR}/lib${HIP_LIB_NAME}.so
+      ${_ROCM_FLAGS} ${_ROCM_FULL_PATH_INCLUDE_FLAGS}
+      WORKING_DIRECTORY ${HIP_LIB_OUTPUT_DIR})
+
+    add_library(${HIP_LIB_NAME} INTERFACE)
+    target_link_libraries(${HIP_LIB_NAME} INTERFACE ${HIP_LIB_OUTPUT_DIR}/lib${HIP_LIB_NAME}.so)
+
+    # add dependencies
+    add_dependencies(${HIP_LIB_NAME} HIP_TARGET_${HIP_LIB_NAME})
+    foreach(_rocm_target IN LISTS _ROCM_OBJ_TARGETS)
+      add_dependencies(HIP_TARGET_${HIP_LIB_NAME} ${_rocm_target})
+    endforeach()
+  endif()
+
+  # static library
+  if(HIP_LIB_STATIC)
+    # create library from object files
+    add_library(${HIP_LIB_NAME} ${_ROCM_OBJS})
+    set_target_properties(${HIP_LIB_NAME} PROPERTIES LINKER_LANGUAGE CXX)
+    set_source_files_properties(
+      ${_ROCM_OBJS}
+      PROPERTIES
+      EXTERNAL_OBJECT true
+      GENERATED true
+    )
+    # add dependencies
+    foreach(_rocm_target IN LISTS _ROCM_OBJ_TARGETS)
+      add_dependencies(${HIP_LIB_NAME} ${_rocm_target})
+    endforeach()
+  endif()
+
+endmacro()
+
diff --git a/docs/Doxyfile b/docs/Doxyfile
new file mode 100644
index 0000000..809a400
--- /dev/null
+++ b/docs/Doxyfile
@@ -0,0 +1,18 @@
+PROJECT_NAME = "SpFFT"
+XML_OUTPUT = build/xml
+INPUT = ../include
+INCLUDE_PATH = ../include ../src
+GENERATE_LATEX = NO
+GENERATE_MAN = NO
+GENERATE_RTF = NO
+CASE_SENSE_NAMES = NO
+GENERATE_HTML = NO
+GENERATE_XML = YES
+RECURSIVE = YES
+QUIET = YES
+JAVADOC_AUTOBRIEF = YES
+WARN_IF_UNDOCUMENTED = NO
+MACRO_EXPANSION = YES
+PREDEFINED = "SPFFT_MPI" "SPFFT_SINGLE_PRECISION"
+EXTRACT_PRIVATE = NO
+EXTRACT_ALL = YES
diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 0000000..69fe55e
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,19 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+SOURCEDIR     = source
+BUILDDIR      = build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
\ No newline at end of file
diff --git a/docs/source/conf.py b/docs/source/conf.py
new file mode 100644
index 0000000..278c5f7
--- /dev/null
+++ b/docs/source/conf.py
@@ -0,0 +1,187 @@
+# -*- coding: utf-8 -*-
+#
+# Configuration file for the Sphinx documentation builder.
+#
+# This file only contains a selection of the most common options.
For a +# full list see the documentation: +# http://www.sphinx-doc.org/en/master/config + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + +# run doxygen first for code doc xml generation +import subprocess +subprocess.call('cd .. ; doxygen', shell=True) + + +html_theme = "sphinx_rtd_theme" +# html_theme = "bootstrap" +html_theme_path = ["_themes"] + +# -- Project information ----------------------------------------------------- + +project = u'SpFFT' +copyright = u'2019, ETH Zurich' +author = u'ETH Zurich, Simon Frasch' +breathe_projects = { 'SpFFT': '../build/xml' } +highlight_language = 'c++' + +# The short X.Y version +version = u'' +# The full version, including alpha/beta/rc tags +release = u'0.1.0' + + +# -- General configuration --------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.mathjax', + 'breathe' +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The master toctree document. +master_doc = 'index' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = [] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = None + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +# html_theme_options = {} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# Custom sidebar templates, must be a dictionary that maps document names +# to template names. +# +# The default sidebars (for documents that don't match any pattern) are +# defined by theme itself. Builtin themes are using these templates by +# default: ``['localtoc.html', 'relations.html', 'sourcelink.html', +# 'searchbox.html']``. +# +# html_sidebars = {} + + +# -- Options for HTMLHelp output --------------------------------------------- + +# Output file base name for HTML help builder. 
+htmlhelp_basename = 'SpFFTdoc'
+
+
+# -- Options for LaTeX output ------------------------------------------------
+
+latex_elements = {
+    # The paper size ('letterpaper' or 'a4paper').
+    #
+    # 'papersize': 'letterpaper',
+
+    # The font size ('10pt', '11pt' or '12pt').
+    #
+    # 'pointsize': '10pt',
+
+    # Additional stuff for the LaTeX preamble.
+    #
+    # 'preamble': '',
+
+    # Latex figure (float) alignment
+    #
+    # 'figure_align': 'htbp',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+#  author, documentclass [howto, manual, or own class]).
+latex_documents = [
+    (master_doc, 'SpFFT.tex', u'SpFFT Documentation',
+     u'ETH Zurich, Simon Frasch', 'manual'),
+]
+
+
+# -- Options for manual page output ------------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+    (master_doc, 'spfft', u'SpFFT Documentation',
+     [author], 1)
+]
+
+
+# -- Options for Texinfo output ----------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+#  dir menu entry, description, category)
+texinfo_documents = [
+    (master_doc, 'SpFFT', u'SpFFT Documentation',
+     author, 'SpFFT', 'One line description of project.',
+     'Miscellaneous'),
+]
+
+
+# -- Options for Epub output -------------------------------------------------
+
+# Bibliographic Dublin Core info.
+epub_title = project
+
+# The unique identifier of the text. This can be a ISBN number
+# or the project homepage.
+#
+# epub_identifier = ''
+
+# A unique identification for the text.
+#
+# epub_uid = ''
+
+# A list of files that should not be packed into the epub file.
+epub_exclude_files = ['search.html']
+
+
+# -- Extension configuration -------------------------------------------------
diff --git a/docs/source/details.rst b/docs/source/details.rst
new file mode 100644
index 0000000..04b2b6e
--- /dev/null
+++ b/docs/source/details.rst
@@ -0,0 +1,54 @@
+Details
+=======
+
+Complex Number Format
+---------------------
+SpFFT always assumes an interleaved format in double or single precision. The alignment of memory provided for space domain data is guaranteed to fulfill the requirements for std::complex (for C++11), C complex types and GPU complex types of CUDA or ROCM.
+
+
+Indexing
+--------
+The only format for providing the indices of the sparse frequency domain data supported at the moment is index triplets in an interleaved array.
+Example: x\ :sub:`1`\ , y\ :sub:`1`\ , z\ :sub:`1`\, x\ :sub:`2`\ , y\ :sub:`2`\ , z\ :sub:`2`\ ...
+
+Indices for a dimension of size *n* must be either in the interval [0, *n* - 1] or [floor(*n*/2) - *n* + 1, floor(*n*/2)].
+
+.. note:: For R2C transforms, the full Hermitian symmetry property is exploited. All indices in X must always be in the interval [0, floor(*n*/2)], and some other index combinations (where one or two indices are 0) can be omitted without loss of information.
+
+
+Data Distribution
+-----------------
+| The order and distribution of frequency space elements can have significant impact on performance.
+| Z-columns must *not* be split between MPI ranks. Locally, elements are best grouped by z-columns and ordered by their z-index within each column.
+
+| The ideal distribution of z-columns between MPI ranks differs for execution on host and GPU.
+
+| For execution on host:
+| Indices of z-columns are ideally continuous in y on each MPI rank.
+
+| For execution on GPU:
+| Indices of z-columns are ideally continuous in x on each MPI rank.
+
+MPI Exchange
+------------
+The MPI exchange is based on a collective MPI call. The following options are available:
+
+SPFFT_EXCH_BUFFERED
+  Exchange with MPI_Alltoall. Requires repacking of data into a buffer. Possibly best optimized for large numbers of ranks by MPI implementations, but does not adjust well to non-uniform data distributions.
+
+SPFFT_EXCH_COMPACT_BUFFERED
+  Exchange with MPI_Alltoallv. Requires repacking of data into a buffer. Performance is usually close to MPI_Alltoall and it adapts well to non-uniform data distributions.
+
+SPFFT_EXCH_UNBUFFERED
+  Exchange with MPI_Alltoallw. Does not require repacking of data into a buffer (outside of the MPI library). Performance varies widely between systems and MPI implementations. It is generally difficult to optimize for large numbers of ranks, but may perform best in certain conditions.
+
+| For both *SPFFT_EXCH_BUFFERED* and *SPFFT_EXCH_COMPACT_BUFFERED*, an exchange in single precision can be selected. With transforms in double precision, the number of bytes sent and received is halved. For execution on GPUs without GPUDirect, the data transfer between GPU and host also benefits. This option can provide a significant speedup, but incurs a slight accuracy loss. The double precision values are converted to and from single precision between the transform in z and the transform in x / y, while all actual calculations are still done in the selected precision.
+
+
+GPU
+---
+| Saving transfer time between host and GPU is key to good performance for execution with GPUs. Ideally, both input and output are located in GPU memory. If host memory pointers are provided as input or output, it is helpful to use pinned memory through the CUDA or ROCm API.
+
+| If available, GPU aware MPI can be utilized to save on the otherwise required transfers between host and GPU in preparation of the MPI exchange. This can greatly impact performance and is enabled by compiling the library with the CMake option SPFFT_GPU_DIRECT set to ON.
+
+.. note:: Additional environment variables may have to be set for some MPI implementations, to allow GPUDirect usage.
diff --git a/docs/source/errors_c.rst b/docs/source/errors_c.rst
new file mode 100644
index 0000000..eb5cb2e
--- /dev/null
+++ b/docs/source/errors_c.rst
@@ -0,0 +1,5 @@
+Errors
+======
+
+.. doxygenfile:: spfft/errors.h
+   :project: SpFFT
diff --git a/docs/source/examples.rst b/docs/source/examples.rst
new file mode 100644
index 0000000..d548829
--- /dev/null
+++ b/docs/source/examples.rst
@@ -0,0 +1,275 @@
+Examples
+========
+
+C++
+----
+
+..
code-block:: c++
+
+   #include <complex>
+   #include <iostream>
+   #include <vector>
+
+   #include "spfft/spfft.hpp"
+
+   int main(int argc, char** argv) {
+     const int dimX = 2;
+     const int dimY = 2;
+     const int dimZ = 2;
+
+     std::cout << "Dimensions: x = " << dimX << ", y = " << dimY << ", z = " << dimZ << std::endl
+               << std::endl;
+
+     const int numThreads = -1; // Use default OpenMP value
+
+     std::vector<std::complex<double>> freqValues;
+     freqValues.reserve(dimX * dimY * dimZ);
+
+     std::vector<int> indices;
+     indices.reserve(dimX * dimY * dimZ * 3);
+
+     // initialize frequency domain values and indices
+     double initValue = 0.0;
+     for (int xIndex = 0; xIndex < dimX; ++xIndex) {
+       for (int yIndex = 0; yIndex < dimY; ++yIndex) {
+         for (int zIndex = 0; zIndex < dimZ; ++zIndex) {
+           // init values
+           freqValues.emplace_back(initValue, -initValue);
+
+           // add index triplet for value
+           indices.emplace_back(xIndex);
+           indices.emplace_back(yIndex);
+           indices.emplace_back(zIndex);
+
+           initValue += 1.0;
+         }
+       }
+     }
+
+     std::cout << "Input:" << std::endl;
+     for (const auto& value : freqValues) {
+       std::cout << value.real() << ", " << value.imag() << std::endl;
+     }
+
+     // create local Grid. For distributed computations, an MPI Communicator has to be provided
+     spfft::Grid grid(dimX, dimY, dimZ, dimX * dimY, SPFFT_PU_HOST, numThreads);
+
+     // create transform
+     spfft::Transform transform =
+         grid.create_transform(SPFFT_PU_HOST, SPFFT_TRANS_C2C, dimX, dimY, dimZ, dimZ,
+                               freqValues.size(), SPFFT_INDEX_TRIPLETS, indices.data());
+
+     // get pointer to space domain data. Alignment is guaranteed to fulfill requirements for
+     // std::complex
+     std::complex<double>* realValues =
+         reinterpret_cast<std::complex<double>*>(transform.space_domain_data(SPFFT_PU_HOST));
+
+     // transform backward
+     transform.backward(reinterpret_cast<double*>(freqValues.data()), SPFFT_PU_HOST);
+
+     std::cout << std::endl << "After backward transform:" << std::endl;
+     for (int i = 0; i < transform.local_slice_size(); ++i) {
+       std::cout << realValues[i].real() << ", " << realValues[i].imag() << std::endl;
+     }
+
+     // transform forward
+     transform.forward(SPFFT_PU_HOST, reinterpret_cast<double*>(freqValues.data()), SPFFT_NO_SCALING);
+
+     std::cout << std::endl << "After forward transform (without scaling):" << std::endl;
+     for (const auto& value : freqValues) {
+       std::cout << value.real() << ", " << value.imag() << std::endl;
+     }
+
+     return 0;
+   }
+
+
+C
+-
+..
code-block:: c
+
+   #include <stdio.h>
+   #include <stdlib.h>
+
+   #include "spfft/spfft.h"
+
+   int main(int argc, char** argv) {
+     const int dimX = 2;
+     const int dimY = 2;
+     const int dimZ = 2;
+
+     printf("Dimensions: x = %d, y = %d, z = %d\n\n", dimX, dimY, dimZ);
+
+     const int numThreads = -1; /* Use default OpenMP value */
+
+     double* freqValues = (double*)malloc(2 * sizeof(double) * dimX * dimY * dimZ);
+
+     int* indices = (int*)malloc(3 * sizeof(int) * dimX * dimY * dimZ);
+
+     /* initialize frequency domain values and indices */
+     double initValue = 0.0;
+     size_t count = 0;
+     for (int xIndex = 0; xIndex < dimX; ++xIndex) {
+       for (int yIndex = 0; yIndex < dimY; ++yIndex) {
+         for (int zIndex = 0; zIndex < dimZ; ++zIndex, ++count) {
+           /* init values */
+           freqValues[2 * count] = initValue;
+           freqValues[2 * count + 1] = -initValue;
+
+           /* add index triplet for value */
+           indices[3 * count] = xIndex;
+           indices[3 * count + 1] = yIndex;
+           indices[3 * count + 2] = zIndex;
+
+           initValue += 1.0;
+         }
+       }
+     }
+
+     printf("Input:\n");
+     for (size_t i = 0; i < dimX * dimY * dimZ; ++i) {
+       printf("%f, %f\n", freqValues[2 * i], freqValues[2 * i + 1]);
+     }
+     printf("\n");
+
+     SpfftError status = 0;
+
+     /* create local Grid. For distributed computations, an MPI Communicator has to be provided */
+     SpfftGrid grid;
+     status = spfft_grid_create(&grid, dimX, dimY, dimZ, dimX * dimY, SPFFT_PU_HOST, numThreads);
+     if (status != SPFFT_SUCCESS) exit(status);
+
+     /* create transform */
+     SpfftTransform transform;
+     status = spfft_transform_create(&transform, grid, SPFFT_PU_HOST, SPFFT_TRANS_C2C, dimX, dimY,
+                                     dimZ, dimZ, dimX * dimY * dimZ, SPFFT_INDEX_TRIPLETS, indices);
+     if (status != SPFFT_SUCCESS) exit(status);
+
+     /* grid can be safely destroyed after creating all transforms */
+     status = spfft_grid_destroy(grid);
+     if (status != SPFFT_SUCCESS) exit(status);
+
+     /* get pointer to space domain data. Alignment is guaranteed to fulfill requirements for C
+        complex types */
+     double* realValues;
+     status = spfft_transform_get_space_domain(transform, SPFFT_PU_HOST, &realValues);
+     if (status != SPFFT_SUCCESS) exit(status);
+
+     /* transform backward */
+     status = spfft_transform_backward(transform, freqValues, SPFFT_PU_HOST);
+     if (status != SPFFT_SUCCESS) exit(status);
+
+     printf("After backward transform:\n");
+     for (size_t i = 0; i < dimX * dimY * dimZ; ++i) {
+       printf("%f, %f\n", realValues[2 * i], realValues[2 * i + 1]);
+     }
+     printf("\n");
+
+     /* transform forward */
+     status = spfft_transform_forward(transform, SPFFT_PU_HOST, freqValues, SPFFT_NO_SCALING);
+     if (status != SPFFT_SUCCESS) exit(status);
+
+     printf("After forward transform (without scaling):\n");
+     for (size_t i = 0; i < dimX * dimY * dimZ; ++i) {
+       printf("%f, %f\n", freqValues[2 * i], freqValues[2 * i + 1]);
+     }
+
+     /* destroying the final transform will free the associated memory */
+     status = spfft_transform_destroy(transform);
+     if (status != SPFFT_SUCCESS) exit(status);
+
+     return 0;
+   }
+
+Fortran
+-------
+..
code-block:: fortran
+
+   program main
+      use iso_c_binding
+      use spfft
+      implicit none
+      integer :: i, j, k, counter
+      integer, parameter :: dimX = 2
+      integer, parameter :: dimY = 2
+      integer, parameter :: dimZ = 2
+      integer, parameter :: maxNumLocalZColumns = dimX * dimY
+      integer, parameter :: processingUnit = 1
+      integer, parameter :: maxNumThreads = -1
+      type(c_ptr) :: grid = c_null_ptr
+      type(c_ptr) :: transform = c_null_ptr
+      integer :: error = 0
+      integer, dimension(dimX * dimY * dimZ * 3):: indices = 0
+      complex(C_DOUBLE_COMPLEX), dimension(dimX * dimY * dimZ):: freqValues
+      complex(C_DOUBLE_COMPLEX), pointer :: realValues(:,:,:)
+      type(c_ptr) :: realValuesPtr
+
+
+      counter = 0
+      do k = 1, dimZ
+         do j = 1, dimY
+            do i = 1, dimX
+               freqValues(counter + 1) = cmplx(counter, counter)
+               indices(counter * 3 + 1) = i - 1
+               indices(counter * 3 + 2) = j - 1
+               indices(counter * 3 + 3) = k - 1
+               counter = counter + 1
+            end do
+         end do
+      end do
+
+      ! print input
+      print *, "Input:"
+      do i = 1, size(freqValues)
+         print *, freqValues(i)
+      end do
+
+
+      ! create grid and transform
+      error = spfft_grid_create(grid, dimX, dimY, dimZ, maxNumLocalZColumns, processingUnit, maxNumThreads);
+      if (error /= 0) stop error
+      error = spfft_transform_create(transform, grid, processingUnit, 0, dimX, dimY, dimZ, dimZ, size(freqValues), 0, indices)
+      if (error /= 0) stop error
+
+      ! grid can be safely deleted after creating all required transforms
+      error = spfft_grid_destroy(grid)
+      if (error /= 0) stop error
+
+      ! set space domain array to use memory allocated by the library
+      error = spfft_transform_get_space_domain(transform, processingUnit, realValuesPtr)
+      if (error /= 0) stop error
+
+      ! transform backward
+      error = spfft_transform_backward(transform, freqValues, processingUnit)
+      if (error /= 0) stop error
+
+
+      call c_f_pointer(realValuesPtr, realValues, [dimX,dimY,dimZ])
+
+      print *, ""
+      print *, "After backward transform:"
+      do k = 1, size(realValues, 3)
+         do j = 1, size(realValues, 2)
+            do i = 1, size(realValues, 1)
+               print *, realValues(i, j, k)
+            end do
+         end do
+      end do
+
+      ! transform forward (will invalidate space domain data)
+      error = spfft_transform_forward(transform, processingUnit, freqValues, 0)
+      if (error /= 0) stop error
+
+      print *, ""
+      print *, "After forward transform (without scaling):"
+      do i = 1, size(freqValues)
+         print *, freqValues(i)
+      end do
+
+      ! destroy transform after use
+      ! (will release memory if all transforms from the same grid are destroyed)
+      error = spfft_transform_destroy(transform)
+      if (error /= 0) stop error
+
+   end
+
diff --git a/docs/source/exceptions.rst b/docs/source/exceptions.rst
new file mode 100644
index 0000000..092064c
--- /dev/null
+++ b/docs/source/exceptions.rst
@@ -0,0 +1,5 @@
+Exceptions
+==========
+
+.. doxygenfile:: spfft/exceptions.hpp
+   :project: SpFFT
diff --git a/docs/source/grid.rst b/docs/source/grid.rst
new file mode 100644
index 0000000..01a9e16
--- /dev/null
+++ b/docs/source/grid.rst
@@ -0,0 +1,9 @@
+Grid
+====
+.. note::
+   A Grid object can be safely destroyed after transforms have been created. The transforms hold a reference counted object containing the allocated memory, which will remain valid until all transforms are destroyed as well.
+
+
+.. doxygenclass:: spfft::Grid
+   :project: SpFFT
+   :members:
diff --git a/docs/source/grid_c.rst b/docs/source/grid_c.rst
new file mode 100644
index 0000000..c59fd1c
--- /dev/null
+++ b/docs/source/grid_c.rst
@@ -0,0 +1,5 @@
+Grid
+====
+
+..
doxygenfile:: spfft/grid.h + :project: SpFFT diff --git a/docs/source/grid_float.rst b/docs/source/grid_float.rst new file mode 100644 index 0000000..f840c52 --- /dev/null +++ b/docs/source/grid_float.rst @@ -0,0 +1,12 @@ +GridFloat +========= + +.. note:: + This class is only available if single precision support is enabled, in which case the macro SPFFT_SINGLE_PRECISION is defined in config.h. + +.. note:: + A Grid object can be safely destroyed after transforms have been created. The transforms hold a reference counted object containing the allocated memory, which will remain valid until all transforms are destroyed as well. + +.. doxygenclass:: spfft::GridFloat + :project: SpFFT + :members: diff --git a/docs/source/grid_float_c.rst b/docs/source/grid_float_c.rst new file mode 100644 index 0000000..e1a46f0 --- /dev/null +++ b/docs/source/grid_float_c.rst @@ -0,0 +1,8 @@ +GridFloat +========= +.. note:: + These functions are only available if single precision support is enabled, in which case the macro SPFFT_SINGLE_PRECISION is defined in config.h. + +.. doxygenfile:: spfft/grid_float.h + :project: SpFFT + diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 0000000..4fcf6a4 --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,81 @@ +.. Copyright (c) 2019, ETH Zurich + + Distributed under the terms of the BSD 3-Clause License. + + The full license is in the file LICENSE, distributed with this software. + +SpFFT Documentation +=================== +| SpFFT is a library for the computation of 3D FFTs with sparse frequency domain data, written in C++ with support for MPI, OpenMP, CUDA and ROCm. + +| It was originally intended for transforms of data with a spherical cutoff in frequency domain, as required by some computational materials science codes, but was generalized to sparse frequency domain data. + + +Design Goals +------------ + +- Sparse frequency domain input +- Reuse of pre-allocated memory +- Support of negative indexing for frequency domain data +- Unified interface for calculations on CPUs and GPUs +- Support of Complex-To-Real and Real-To-Complex transforms, where the full Hermitian symmetry property is utilized. Therefore, there is no redundant frequency domain data, as is usually the case for dense 3D R2C / C2R transforms with libraries such as FFTW. +- C++, C and Fortran interfaces + +Interface Design +---------------- +To allow for pre-allocation and reuse of memory, the design is based on two classes: + +- **Grid**: Allocates memory for transforms up to a given size in each dimension. +- **Transform**: Is created using a *Grid* and can have any size up to the maximum allowed by the *Grid*. A *Transform* holds a counted reference to the underlying *Grid*. Therefore, *Transforms* created from the same *Grid* will share the memory, which is only freed once the *Grid* and all associated *Transforms* are destroyed. + +The user provides memory for storing the sparse frequency domain data, while a *Transform* provides memory for the space domain data. This implies that executing a *Transform* will overwrite the space domain data of all other *Transforms* associated with the same *Grid*. + +.. note:: + The creation of Grids and Transforms, as well as the forward and backward execution, may entail MPI calls and must be synchronized between all ranks.
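 + +As a minimal sketch of this design, using the C++ API (dimX, dimY, dimZ, numValues, indices and freqData are hypothetical placeholders, assumed to be set up as in the examples): + +.. code-block:: c++ + +   spfft::Grid grid(dimX, dimY, dimZ, dimX * dimY, SPFFT_PU_HOST, -1); +   spfft::Transform transform1 = grid.create_transform(SPFFT_PU_HOST, SPFFT_TRANS_C2C, dimX, dimY, dimZ, dimZ, numValues, SPFFT_INDEX_TRIPLETS, indices); +   spfft::Transform transform2 = grid.create_transform(SPFFT_PU_HOST, SPFFT_TRANS_C2C, dimX, dimY, dimZ, dimZ, numValues, SPFFT_INDEX_TRIPLETS, indices); + +   // both transforms share the memory of the same grid, so this backward +   // transform overwrites the space domain data of transform2 as well +   transform1.backward(freqData, SPFFT_PU_HOST); + +.. toctree:: + :maxdepth: 2 + :hidden: + + installation + examples + details + +..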
toctree:: + :maxdepth: 2 + :caption: C++ API REFERENCE: + :hidden: + + types + grid + grid_float + transform + transform_float + multi_transform + exceptions + +.. toctree:: + :maxdepth: 2 + :caption: C API REFERENCE: + :hidden: + + types + grid_c + grid_float_c + transform_c + transform_float_c + multi_transform_c + errors_c + + + + + + +.. Indices and tables +.. ================== + +.. * :ref:`genindex` + + diff --git a/docs/source/installation.rst b/docs/source/installation.rst new file mode 100644 index 0000000..ff0c63b --- /dev/null +++ b/docs/source/installation.rst @@ -0,0 +1,38 @@ +Installation +============ + +Requirements +------------ +- C++ Compiler with C++11 support +- CMake version 3.8 or greater +- Library providing an FFTW 3.x interface (FFTW3 or Intel MKL) +- For multi-threading: OpenMP support by the compiler +- For GPU support: CUDA or ROCm + +Build +----- + +The build system follows the standard CMake workflow. Example: + +.. code-block:: bash + + mkdir build + cd build + cmake .. -DSPFFT_OMP=ON -DSPFFT_MPI=ON -DSPFFT_GPU_BACKEND=CUDA -DSPFFT_SINGLE_PRECISION=OFF -DCMAKE_INSTALL_PREFIX=/usr/local + make -j8 install + +CMake options +------------- +====================== ======= ================================================ +Option Default Description +====================== ======= ================================================ +SPFFT_MPI ON Enable MPI support +SPFFT_OMP ON Enable multi-threading with OpenMP +SPFFT_GPU_BACKEND OFF Select GPU backend. Can be OFF, CUDA or ROCM +SPFFT_GPU_DIRECT OFF Use GPU aware MPI with GPUDirect +SPFFT_SINGLE_PRECISION OFF Enable single precision support +SPFFT_STATIC OFF Build as static library +SPFFT_BUILD_TESTS OFF Build test executables for development purposes +SPFFT_INSTALL ON Add library to install target +====================== ======= ================================================ diff --git a/docs/source/multi_transform.rst b/docs/source/multi_transform.rst new file mode 100644 index 0000000..cf4b340 --- /dev/null +++ b/docs/source/multi_transform.rst @@ -0,0 +1,10 @@ +Multi-Transform +=============== +.. note:: + Only fully independent transforms can be executed in parallel. + +.. doxygenfile:: spfft/multi_transform.hpp + :project: SpFFT + +.. doxygenfile:: spfft/multi_transform_float.hpp + :project: SpFFT diff --git a/docs/source/multi_transform_c.rst b/docs/source/multi_transform_c.rst new file mode 100644 index 0000000..0a249be --- /dev/null +++ b/docs/source/multi_transform_c.rst @@ -0,0 +1,10 @@ +Multi-Transform +=============== +.. note:: + Only fully independent transforms can be executed in parallel. + +.. doxygenfile:: spfft/multi_transform.h + :project: SpFFT + +.. doxygenfile:: spfft/multi_transform_float.h + :project: SpFFT diff --git a/docs/source/transform.rst b/docs/source/transform.rst new file mode 100644 index 0000000..ca959e3 --- /dev/null +++ b/docs/source/transform.rst @@ -0,0 +1,8 @@ +Transform +========= +.. note:: + This class only holds an internal reference counted object. The object remains in a usable state even if the associated Grid object is destroyed. In addition, copying a transform only requires an internal copy of a shared pointer. + +.. doxygenclass:: spfft::Transform + :project: SpFFT + :members: diff --git a/docs/source/transform_c.rst b/docs/source/transform_c.rst new file mode 100644 index 0000000..918f1a8 --- /dev/null +++ b/docs/source/transform_c.rst @@ -0,0 +1,7 @@ +Transform +========= +.. note:: + A transform handle only holds an internal reference counted object.
The object remains in a usable state even if the associated Grid object is destroyed. In addition, copying a transform only requires an internal copy of a shared pointer. + +.. doxygenfile:: spfft/transform.h + :project: SpFFT diff --git a/docs/source/transform_float.rst b/docs/source/transform_float.rst new file mode 100644 index 0000000..43272ce --- /dev/null +++ b/docs/source/transform_float.rst @@ -0,0 +1,12 @@ +TransformFloat +============== +.. note:: + This class is only available if single precision support is enabled, in which case the macro SPFFT_SINGLE_PRECISION is defined in config.h. + +.. note:: + This class only holds an internal reference counted object. The object remains in a usable state even if the associated Grid object is destroyed. In addition, copying a transform only requires an internal copy of a shared pointer. + + +.. doxygenclass:: spfft::TransformFloat + :project: SpFFT + :members: diff --git a/docs/source/transform_float_c.rst b/docs/source/transform_float_c.rst new file mode 100644 index 0000000..6afbe19 --- /dev/null +++ b/docs/source/transform_float_c.rst @@ -0,0 +1,8 @@ +TransformFloat +============== +.. note:: + These functions are only available if single precision support is enabled, in which case the macro SPFFT_SINGLE_PRECISION is defined in config.h. + +.. doxygenfile:: spfft/transform_float.h + :project: SpFFT + diff --git a/docs/source/types.rst b/docs/source/types.rst new file mode 100644 index 0000000..89f35e8 --- /dev/null +++ b/docs/source/types.rst @@ -0,0 +1,5 @@ +Types +===== + +.. doxygenfile:: spfft/types.h + :project: SpFFT diff --git a/examples/example.c b/examples/example.c new file mode 100644 index 0000000..725ab5f --- /dev/null +++ b/examples/example.c @@ -0,0 +1,92 @@ +#include <stdio.h> +#include <stdlib.h> + +#include "spfft/spfft.h" + +int main(int argc, char** argv) { + const int dimX = 2; + const int dimY = 2; + const int dimZ = 2; + + printf("Dimensions: x = %d, y = %d, z = %d\n\n", dimX, dimY, dimZ); + + const int numThreads = -1; /* Use default OpenMP value */ + + double* freqValues = (double*)malloc(2 * sizeof(double) * dimX * dimY * dimZ); + + int* indices = (int*)malloc(3 * sizeof(int) * dimX * dimY * dimZ); + + /* initialize frequency domain values and indices */ + double initValue = 0.0; + size_t count = 0; + for (int xIndex = 0; xIndex < dimX; ++xIndex) { + for (int yIndex = 0; yIndex < dimY; ++yIndex) { + for (int zIndex = 0; zIndex < dimZ; ++zIndex, ++count) { + /* init values */ + freqValues[2 * count] = initValue; + freqValues[2 * count + 1] = -initValue; + + /* add index triplet for value */ + indices[3 * count] = xIndex; + indices[3 * count + 1] = yIndex; + indices[3 * count + 2] = zIndex; + + initValue += 1.0; + } + } + } + + printf("Input:\n"); + for (size_t i = 0; i < dimX * dimY * dimZ; ++i) { + printf("%f, %f\n", freqValues[2 * i], freqValues[2 * i + 1]); + } + printf("\n"); + + SpfftError status = 0; + + /* create local Grid.
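A grid provides the pre-allocated memory used by transforms.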
For distributed computations, an MPI communicator has to be provided */ + SpfftGrid grid; + status = spfft_grid_create(&grid, dimX, dimY, dimZ, dimX * dimY, SPFFT_PU_HOST, numThreads); + if (status != SPFFT_SUCCESS) exit(status); + + /* create transform */ + SpfftTransform transform; + status = spfft_transform_create(&transform, grid, SPFFT_PU_HOST, SPFFT_TRANS_C2C, dimX, dimY, + dimZ, dimZ, dimX * dimY * dimZ, SPFFT_INDEX_TRIPLETS, indices); + if (status != SPFFT_SUCCESS) exit(status); + + /* grid can be safely destroyed after creating all transforms */ + status = spfft_grid_destroy(grid); + if (status != SPFFT_SUCCESS) exit(status); + + /* get pointer to space domain data. Alignment is guaranteed to fulfill the requirements of C complex + types */ + double* realValues; + status = spfft_transform_get_space_domain(transform, SPFFT_PU_HOST, &realValues); + if (status != SPFFT_SUCCESS) exit(status); + + /* transform backward */ + status = spfft_transform_backward(transform, freqValues, SPFFT_PU_HOST); + if (status != SPFFT_SUCCESS) exit(status); + + printf("After backward transform:\n"); + for (size_t i = 0; i < dimX * dimY * dimZ; ++i) { + printf("%f, %f\n", realValues[2 * i], realValues[2 * i + 1]); + } + printf("\n"); + + /* transform forward */ + status = spfft_transform_forward(transform, SPFFT_PU_HOST, freqValues, SPFFT_NO_SCALING); + if (status != SPFFT_SUCCESS) exit(status); + + printf("After forward transform (without scaling):\n"); + for (size_t i = 0; i < dimX * dimY * dimZ; ++i) { + printf("%f, %f\n", freqValues[2 * i], freqValues[2 * i + 1]); + } + + /* destroying the final transform will free the associated memory */ + status = spfft_transform_destroy(transform); + if (status != SPFFT_SUCCESS) exit(status); + + return 0; +} diff --git a/examples/example.cpp b/examples/example.cpp new file mode 100644 index 0000000..34087c8 --- /dev/null +++ b/examples/example.cpp @@ -0,0 +1,76 @@ +#include <complex> +#include <iostream> +#include <vector> + +#include "spfft/spfft.hpp" + +int main(int argc, char** argv) { + const int dimX = 2; + const int dimY = 2; + const int dimZ = 2; + + std::cout << "Dimensions: x = " << dimX << ", y = " << dimY << ", z = " << dimZ << std::endl + << std::endl; + + const int numThreads = -1; // Use default OpenMP value + + std::vector<std::complex<double>> freqValues; + freqValues.reserve(dimX * dimY * dimZ); + + std::vector<int> indices; + indices.reserve(dimX * dimY * dimZ * 3); + + // initialize frequency domain values and indices + double initValue = 0.0; + for (int xIndex = 0; xIndex < dimX; ++xIndex) { + for (int yIndex = 0; yIndex < dimY; ++yIndex) { + for (int zIndex = 0; zIndex < dimZ; ++zIndex) { + // init values + freqValues.emplace_back(initValue, -initValue); + + // add index triplet for value + indices.emplace_back(xIndex); + indices.emplace_back(yIndex); + indices.emplace_back(zIndex); + + initValue += 1.0; + } + } + } + + std::cout << "Input:" << std::endl; + for (const auto& value : freqValues) { + std::cout << value.real() << ", " << value.imag() << std::endl; + } + + // create local Grid. For distributed computations, an MPI communicator has to be provided + spfft::Grid grid(dimX, dimY, dimZ, dimX * dimY, SPFFT_PU_HOST, numThreads); + + // create transform + spfft::Transform transform = + grid.create_transform(SPFFT_PU_HOST, SPFFT_TRANS_C2C, dimX, dimY, dimZ, dimZ, + freqValues.size(), SPFFT_INDEX_TRIPLETS, indices.data()); + + // get pointer to space domain data.
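The memory is provided by the transform and is shared among all transforms created from the same grid.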
Alignment is guaranteed to fulfill the requirements for + // std::complex + std::complex<double>* realValues = + reinterpret_cast<std::complex<double>*>(transform.space_domain_data(SPFFT_PU_HOST)); + + // transform backward + transform.backward(reinterpret_cast<double*>(freqValues.data()), SPFFT_PU_HOST); + + std::cout << std::endl << "After backward transform:" << std::endl; + for (int i = 0; i < transform.local_slice_size(); ++i) { + std::cout << realValues[i].real() << ", " << realValues[i].imag() << std::endl; + } + + // transform forward + transform.forward(SPFFT_PU_HOST, reinterpret_cast<double*>(freqValues.data()), SPFFT_NO_SCALING); + + std::cout << std::endl << "After forward transform (without scaling):" << std::endl; + for (const auto& value : freqValues) { + std::cout << value.real() << ", " << value.imag() << std::endl; + } + + return 0; +} diff --git a/examples/example.f90 b/examples/example.f90 new file mode 100644 index 0000000..9658ebf --- /dev/null +++ b/examples/example.f90 @@ -0,0 +1,87 @@ + +program main + use iso_c_binding + use spfft + implicit none + integer :: i, j, k, counter + integer, parameter :: dimX = 2 + integer, parameter :: dimY = 2 + integer, parameter :: dimZ = 2 + integer, parameter :: maxNumLocalZColumns = dimX * dimY + integer, parameter :: processingUnit = 1 + integer, parameter :: maxNumThreads = -1 + type(c_ptr) :: grid = c_null_ptr + type(c_ptr) :: transform = c_null_ptr + integer :: errorCode = 0 + integer, dimension(dimX * dimY * dimZ * 3):: indices = 0 + complex(C_DOUBLE_COMPLEX), dimension(dimX * dimY * dimZ):: freqValues + complex(C_DOUBLE_COMPLEX), pointer :: realValues(:,:,:) + type(c_ptr) :: realValuesPtr + + + counter = 0 + do k = 1, dimZ + do j = 1, dimY + do i = 1, dimX + freqValues(counter + 1) = cmplx(counter, -counter) + indices(counter * 3 + 1) = i - 1 + indices(counter * 3 + 2) = j - 1 + indices(counter * 3 + 3) = k - 1 + counter = counter + 1 + end do + end do + end do + + ! print input + print *, "Input:" + do i = 1, size(freqValues) + print *, freqValues(i) + end do + + + ! create grid and transform + errorCode = spfft_grid_create(grid, dimX, dimY, dimZ, maxNumLocalZColumns, processingUnit, maxNumThreads); + if (errorCode /= SPFFT_SUCCESS) error stop + errorCode = spfft_transform_create(transform, grid, processingUnit, 0, dimX, dimY, dimZ, dimZ, size(freqValues), 0, indices) + if (errorCode /= SPFFT_SUCCESS) error stop + + ! grid can be safely destroyed after creating all required transforms + errorCode = spfft_grid_destroy(grid) + if (errorCode /= SPFFT_SUCCESS) error stop + + ! set space domain array to use memory allocated by the library + errorCode = spfft_transform_get_space_domain(transform, processingUnit, realValuesPtr) + if (errorCode /= SPFFT_SUCCESS) error stop + + ! transform backward + errorCode = spfft_transform_backward(transform, freqValues, processingUnit) + if (errorCode /= SPFFT_SUCCESS) error stop + + + call c_f_pointer(realValuesPtr, realValues, [dimX,dimY,dimZ]) + + print *, "" + print *, "After backward transform:" + do k = 1, size(realValues, 3) + do j = 1, size(realValues, 2) + do i = 1, size(realValues, 1) + print *, realValues(i, j, k) + end do + end do + end do + + ! transform forward (will invalidate space domain data) + errorCode = spfft_transform_forward(transform, processingUnit, freqValues, 0) + if (errorCode /= SPFFT_SUCCESS) error stop + + print *, "" + print *, "After forward transform (without scaling):" + do i = 1, size(freqValues) + print *, freqValues(i) + end do + + !
destroying the final transform will free the associated memory + errorCode = spfft_transform_destroy(transform) + if (errorCode /= SPFFT_SUCCESS) error stop + +end diff --git a/include/spfft/config.h.in b/include/spfft/config.h.in new file mode 100644 index 0000000..64ea307 --- /dev/null +++ b/include/spfft/config.h.in @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2019 ETH Zurich, Simon Frasch + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/***************** + * CMAKE GENERATED + *****************/ + +#ifndef SPFFT_CONFIG_H +#define SPFFT_CONFIG_H + +#cmakedefine SPFFT_CUDA +#cmakedefine SPFFT_ROCM +#cmakedefine SPFFT_MPI +#cmakedefine SPFFT_OMP +#cmakedefine SPFFT_TIMING +#cmakedefine SPFFT_SINGLE_PRECISION +#cmakedefine SPFFT_GPU_DIRECT + +#endif diff --git a/include/spfft/errors.h b/include/spfft/errors.h new file mode 100644 index 0000000..7df3531 --- /dev/null +++ b/include/spfft/errors.h @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2019 ETH Zurich, Simon Frasch + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef SPFFT_ERRORS_H +#define SPFFT_ERRORS_H + +#include "spfft/config.h" + +enum SpfftError { + /** + * Success. No error. + */ + SPFFT_SUCCESS, + /** + * Unknown error. + */ + SPFFT_UNKNOWN_ERROR, + /** + * Invalid Grid or Transform handle. + */ + SPFFT_INVALID_HANDLE_ERROR, + /** + * Integer overflow. + */ + SPFFT_OVERFLOW_ERROR, + /** + * Failed to allocate memory on host. + */ + SPFFT_ALLOCATION_ERROR, + /** + * Invalid parameter. + */ + SPFFT_INVALID_PARAMETER_ERROR, + /** + * Duplicate indices given to transform. May indicate a non-local z-column between MPI ranks. + */ + SPFFT_DUPLICATE_INDICES_ERROR, + /** + * Invalid indices given to transform. + */ + SPFFT_INVALID_INDICES_ERROR, + /** + * Library not compiled with MPI support. + */ + SPFFT_MPI_SUPPORT_ERROR, + /** + * MPI error. Only returned if error code of MPI API calls is non-zero. + */ + SPFFT_MPI_ERROR, + /** + * Parameters differ between MPI ranks. + */ + SPFFT_MPI_PARAMETER_MISMATCH_ERROR, + /** + * Failed execution on host. + */ + SPFFT_HOST_EXECUTION_ERROR, + /** + * FFTW library error. + */ + SPFFT_FFTW_ERROR, + /** + * Generic GPU error. + */ + SPFFT_GPU_ERROR, + /** + * Detected error on GPU from previous GPU API / kernel calls. + */ + SPFFT_GPU_PRECEDING_ERROR, + /** + * Library not compiled with GPU support. + */ + SPFFT_GPU_SUPPORT_ERROR, + /** + * Failed allocation on GPU. + */ + SPFFT_GPU_ALLOCATION_ERROR, + /** + * Failed to launch kernel on GPU. + */ + SPFFT_GPU_LAUNCH_ERROR, + /** + * No GPU device detected. + */ + SPFFT_GPU_NO_DEVICE_ERROR, + /** + * Invalid value passed to GPU API. + */ + SPFFT_GPU_INVALID_VALUE_ERROR, + /** + * Invalid device pointer used. + */ + SPFFT_GPU_INVALID_DEVICE_PTR_ERROR, + /** + * Failed to copy from / to GPU. + */ + SPFFT_GPU_COPY_ERROR, + /** + * Failure in GPU FFT library call. + */ + SPFFT_GPU_FFT_ERROR +}; + +#ifndef __cplusplus +/*! \cond PRIVATE */ +// C only +typedef enum SpfftError SpfftError; +/*! \endcond */ +#endif // cpp + +#endif diff --git a/include/spfft/exceptions.hpp b/include/spfft/exceptions.hpp new file mode 100644 index 0000000..525edc4 --- /dev/null +++ b/include/spfft/exceptions.hpp @@ -0,0 +1,317 @@ +/* + * Copyright (c) 2019 ETH Zurich, Simon Frasch + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef SPFFT_EXCEPTIONS_H +#define SPFFT_EXCEPTIONS_H + +#include <exception> +#include "spfft/config.h" +#include "spfft/errors.h" + +namespace spfft { + +/** + * A generic error. Base type for all other exceptions. + */ +class GenericError : public std::exception { +public: + auto what() const noexcept -> const char* override { return "SpFFT: Generic error"; } + + virtual auto error_code() const noexcept -> SpfftError { + return SpfftError::SPFFT_UNKNOWN_ERROR; + } +}; + +/** + * Overflow of integer values. + */ +class OverflowError : public GenericError { +public: + auto what() const noexcept -> const char* override { return "SpFFT: Overflow error"; } + + auto error_code() const noexcept -> SpfftError override { + return SpfftError::SPFFT_OVERFLOW_ERROR; + } +}; + +/** + * Failed allocation on host. + */ +class HostAllocationError : public GenericError { +public: + auto what() const noexcept -> const char* override { return "SpFFT: Host allocation error"; } + + auto error_code() const noexcept -> SpfftError override { + return SpfftError::SPFFT_ALLOCATION_ERROR; + } +}; + +/** + * Invalid parameter. + */ +class InvalidParameterError : public GenericError { +public: + auto what() const noexcept -> const char* override { return "SpFFT: Invalid parameter error"; } + + auto error_code() const noexcept -> SpfftError override { + return SpfftError::SPFFT_INVALID_PARAMETER_ERROR; + } +}; + +/** + * Duplicate indices given to transform. May indicate a non-local z-column between MPI ranks. + */ +class DuplicateIndicesError : public GenericError { +public: + auto what() const noexcept -> const char* override { return "SpFFT: Duplicate indices error"; } + + auto error_code() const noexcept -> SpfftError override { + return SpfftError::SPFFT_DUPLICATE_INDICES_ERROR; + } +}; + + +/** + * Invalid indices given to transform. + */ +class InvalidIndicesError : public GenericError { +public: + auto what() const noexcept -> const char* override { return "SpFFT: Invalid indices error"; } + + auto error_code() const noexcept -> SpfftError override { + return SpfftError::SPFFT_INVALID_INDICES_ERROR; + } +}; + +/** + * Library not compiled with MPI support. + */ +class MPISupportError : public GenericError { +public: + auto what() const noexcept -> const char* override { + return "SpFFT: Not compiled with MPI support error"; + } + + auto error_code() const noexcept -> SpfftError override { + return return SpfftError::SPFFT_MPI_SUPPORT_ERROR; + } +}; + +/** + * MPI error. Only thrown if error code of MPI API calls is non-zero.
+ */ +class MPIError : public GenericError { +public: + auto what() const noexcept -> const char* override { return "SpFFT: MPI error"; } + + auto error_code() const noexcept -> SpfftError override { + return SpfftError::SPFFT_MPI_ERROR; + } +}; + +/** + * Parameters differ between MPI ranks. + */ +class MPIParameterMismatchError : public GenericError { +public: + auto what() const noexcept -> const char* override { + return "SpFFT: Mismatched parameters between MPI ranks"; + } + + auto error_code() const noexcept -> SpfftError override { + return SpfftError::SPFFT_MPI_PARAMETER_MISMATCH_ERROR; + } +}; + +/** + * Failed execution on host. + */ +class HostExecutionError : public GenericError { +public: + auto what() const noexcept -> const char* override { return "SpFFT: Host execution error"; } + + auto error_code() const noexcept -> SpfftError override { + return SpfftError::SPFFT_HOST_EXECUTION_ERROR; + } +}; + +/** + * FFTW library error. + */ +class FFTWError : public GenericError { +public: + auto what() const noexcept -> const char* override { return "SpFFT: FFTW error"; } + + auto error_code() const noexcept -> SpfftError override { + return SpfftError::SPFFT_FFTW_ERROR; + } +}; + +/** + * Unknown internal error. + */ +class InternalError : public GenericError { +public: + auto what() const noexcept -> const char* override { return "SpFFT: Internal error"; } + + auto error_code() const noexcept -> SpfftError override { + return SpfftError::SPFFT_UNKNOWN_ERROR; + } +}; + +// ================================== +// GPU Errors +// ================================== +/** + * Generic GPU error. Base type for all GPU related exceptions. + */ +class GPUError : public GenericError { +public: + auto what() const noexcept -> const char* override { return "SpFFT: GPU error"; } + + auto error_code() const noexcept -> SpfftError override { + return SpfftError::SPFFT_GPU_ERROR; + } +}; + +/** + * Library not compiled with GPU support. + */ +class GPUSupportError : public GPUError { +public: + auto what() const noexcept -> const char* override { + return "SpFFT: Not compiled with GPU support"; + } + + auto error_code() const noexcept -> SpfftError override { + return SpfftError::SPFFT_GPU_SUPPORT_ERROR; + } +}; + +/** + * Detected error on GPU from previous GPU API / kernel calls. + */ +class GPUPrecedingError : public GPUError { +public: + auto what() const noexcept -> const char* override { + return "SpFFT: Detected error from preceding GPU calls."; + } + + auto error_code() const noexcept -> SpfftError override { + return SpfftError::SPFFT_GPU_PRECEDING_ERROR; + } +}; + +/** + * Failed allocation on GPU. + */ +class GPUAllocationError : public GPUError { +public: + auto what() const noexcept -> const char* override { return "SpFFT: GPU allocation error"; } + + auto error_code() const noexcept -> SpfftError override { + return SpfftError::SPFFT_GPU_ALLOCATION_ERROR; + } +}; + +/** + * Failed to launch kernel on GPU. + */ +class GPULaunchError : public GPUError { +public: + auto what() const noexcept -> const char* override { return "SpFFT: GPU launch error"; } + + auto error_code() const noexcept -> SpfftError override { + return SpfftError::SPFFT_GPU_LAUNCH_ERROR; + } +}; + +/** + * No GPU device detected.
+ */ +class GPUNoDeviceError : public GPUError { +public: + auto what() const noexcept -> const char* override { return "SpFFT: no GPU available"; } + + auto error_code() const noexcept -> SpfftError override { + return SpfftError::SPFFT_GPU_NO_DEVICE_ERROR; + } +}; + +/** + * Invalid value passed to GPU API. + */ +class GPUInvalidValueError : public GPUError { +public: + auto what() const noexcept -> const char* override { + return "SpFFT: GPU call with invalid value"; + } + + auto error_code() const noexcept -> SpfftError override { + return SpfftError::SPFFT_GPU_INVALID_VALUE_ERROR; + } +}; + +/** + * Invalid device pointer used. + */ +class GPUInvalidDevicePointerError : public GPUError { +public: + auto what() const noexcept -> const char* override { return "SpFFT: Invalid GPU pointer"; } + + auto error_code() const noexcept -> SpfftError override { + return SpfftError::SPFFT_GPU_INVALID_DEVICE_PTR_ERROR; + } +}; + +/** + * Failed to copy from / to GPU. + */ +class GPUCopyError : public GPUError { +public: + auto what() const noexcept -> const char* override { return "SpFFT: GPU Memory copy error"; } + + auto error_code() const noexcept -> SpfftError override { + return SpfftError::SPFFT_GPU_COPY_ERROR; + } +}; + +/** + * Failure in GPU FFT library call. + */ +class GPUFFTError : public GPUError { +public: + auto what() const noexcept -> const char* override { return "SpFFT: GPU FFT error"; } + + auto error_code() const noexcept -> SpfftError override { + return SpfftError::SPFFT_GPU_FFT_ERROR; + } +}; + +} // namespace spfft + +#endif diff --git a/include/spfft/grid.h b/include/spfft/grid.h new file mode 100644 index 0000000..d976d1e --- /dev/null +++ b/include/spfft/grid.h @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2019 ETH Zurich, Simon Frasch + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef SPFFT_GRID_H +#define SPFFT_GRID_H + +#include "spfft/config.h" +#include "spfft/errors.h" +#include "spfft/types.h" + +#ifdef SPFFT_MPI +#include <mpi.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif +/** + * Grid handle.
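 + * + * A handle is obtained from spfft_grid_create() or spfft_grid_create_distributed() and released with spfft_grid_destroy().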
+ */ +typedef void* SpfftGrid; + +/** + * Constructor for a local grid. + * + * @param[out] grid Handle to grid. + * @param[in] maxDimX Maximum dimension in x. + * @param[in] maxDimY Maximum dimension in y. + * @param[in] maxDimZ Maximum dimension in z. + * @param[in] maxNumLocalZColumns Maximum number of z-columns in frequency domain. + * @param[in] processingUnit The processing unit type to prepare for. Can be SPFFT_PU_HOST or + * SPFFT_PU_GPU or SPFFT_PU_HOST | SPFFT_PU_GPU. + * @param[in] maxNumThreads The maximum number of threads that transforms created with this grid are + * allowed to use. If smaller than 1, the OpenMP default value is used. + * @return Error code or SPFFT_SUCCESS. + */ +SpfftError spfft_grid_create(SpfftGrid* grid, int maxDimX, int maxDimY, int maxDimZ, + int maxNumLocalZColumns, SpfftProcessingUnitType processingUnit, + int maxNumThreads); + +#ifdef SPFFT_MPI +/** + * Constructor for a distributed grid. + * + * @param[out] grid Handle to grid. + * @param[in] maxDimX Maximum dimension in x. + * @param[in] maxDimY Maximum dimension in y. + * @param[in] maxDimZ Maximum dimension in z. + * @param[in] maxNumLocalZColumns Maximum number of z-columns in frequency domain of the + * local MPI rank. + * @param[in] maxLocalZLength Maximum length in z in space domain for the local MPI rank. + * @param[in] processingUnit The processing unit type to prepare for. Can be SPFFT_PU_HOST or + * SPFFT_PU_GPU or SPFFT_PU_HOST | SPFFT_PU_GPU. + * @param[in] maxNumThreads The maximum number of threads that transforms created with this grid are + * allowed to use. If smaller than 1, the OpenMP default value is used. + * @param[in] comm The MPI communicator to use. Will be duplicated for internal use. + * @param[in] exchangeType The type of MPI exchange to use. Possible values are + * SPFFT_EXCH_DEFAULT, SPFFT_EXCH_BUFFERED, SPFFT_EXCH_COMPACT_BUFFERED and SPFFT_EXCH_UNBUFFERED. + * @return Error code or SPFFT_SUCCESS. + */ +SpfftError spfft_grid_create_distributed(SpfftGrid* grid, int maxDimX, int maxDimY, int maxDimZ, + int maxNumLocalZColumns, int maxLocalZLength, + SpfftProcessingUnitType processingUnit, int maxNumThreads, + MPI_Comm comm, SpfftExchangeType exchangeType); +#endif + +/** + * Destroy a grid. + * + * A grid can be safely destroyed independent of any related transforms. The internal memory + * is released once all associated transforms are destroyed as well (through internal reference + * counting). + * + * @param[in] grid Handle to grid. + * @return Error code or SPFFT_SUCCESS. + */ +SpfftError spfft_grid_destroy(SpfftGrid grid); + +/** + * Access a grid parameter. + * @param[in] grid Handle to grid. + * @param[out] dimX Maximum dimension in x. + * @return Error code or SPFFT_SUCCESS. + */ +SpfftError spfft_grid_max_dim_x(SpfftGrid grid, int* dimX); + +/** + * Access a grid parameter. + * @param[in] grid Handle to grid. + * @param[out] dimY Maximum dimension in y. + * @return Error code or SPFFT_SUCCESS. + */ +SpfftError spfft_grid_max_dim_y(SpfftGrid grid, int* dimY); + +/** + * Access a grid parameter. + * @param[in] grid Handle to grid. + * @param[out] dimZ Maximum dimension in z. + * @return Error code or SPFFT_SUCCESS. + */ +SpfftError spfft_grid_max_dim_z(SpfftGrid grid, int* dimZ); + +/** + * Access a grid parameter. + * @param[in] grid Handle to grid. + * @param[out] maxNumLocalZColumns Maximum number of z-columns in frequency domain of the local MPI + * rank. + * @return Error code or SPFFT_SUCCESS.
*/ +SpfftError spfft_grid_max_num_local_z_columns(SpfftGrid grid, int* maxNumLocalZColumns); + +/** + * Access a grid parameter. + * @param[in] grid Handle to grid. + * @param[out] maxLocalZLength Maximum length in z in space domain of the local MPI rank. + * @return Error code or SPFFT_SUCCESS. + */ +SpfftError spfft_grid_max_local_z_length(SpfftGrid grid, int* maxLocalZLength); + +/** + * Access a grid parameter. + * @param[in] grid Handle to grid. + * @param[out] processingUnit The processing unit the grid has prepared for. Can be SPFFT_PU_HOST + * or SPFFT_PU_GPU or SPFFT_PU_HOST | SPFFT_PU_GPU. + * @return Error code or SPFFT_SUCCESS. + */ +SpfftError spfft_grid_processing_unit(SpfftGrid grid, SpfftProcessingUnitType* processingUnit); + +/** + * Access a grid parameter. + * @param[in] grid Handle to grid. + * @param[out] deviceId The GPU device id used. Always returns 0 if no GPU support is enabled. + * @return Error code or SPFFT_SUCCESS. + */ +SpfftError spfft_grid_device_id(SpfftGrid grid, int* deviceId); + +/** + * Access a grid parameter. + * @param[in] grid Handle to grid. + * @param[out] numThreads The exact number of threads used by transforms created from this grid. May + * be less than the maximum given to the constructor. Always 1 if not compiled with OpenMP support. + * @return Error code or SPFFT_SUCCESS. + */ +SpfftError spfft_grid_num_threads(SpfftGrid grid, int* numThreads); + +#ifdef SPFFT_MPI +/** + * Access a grid parameter. + * @param[in] grid Handle to grid. + * @param[out] comm The internal MPI communicator. + * @return Error code or SPFFT_SUCCESS. + */ +SpfftError spfft_grid_communicator(SpfftGrid grid, MPI_Comm* comm); +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/spfft/grid.hpp b/include/spfft/grid.hpp new file mode 100644 index 0000000..75c058f --- /dev/null +++ b/include/spfft/grid.hpp @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2019 ETH Zurich, Simon Frasch + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE.
+ */ +#ifndef SPFFT_GRID_HPP +#define SPFFT_GRID_HPP + +#include <memory> +#include "spfft/config.h" +#include "spfft/transform.hpp" +#include "spfft/types.h" + +#ifdef SPFFT_MPI +#include <mpi.h> +#endif + +namespace spfft { + +// Forward declaration for internal use +template <typename T> +class GridInternal; + +/** + * A Grid, which provides pre-allocated memory for double precision transforms. + */ +class Grid { +public: + /** + * Constructor for a local grid. + * + * @param[in] maxDimX Maximum dimension in x. + * @param[in] maxDimY Maximum dimension in y. + * @param[in] maxDimZ Maximum dimension in z. + * @param[in] maxNumLocalZColumns Maximum number of z-columns in frequency domain. + * @param[in] processingUnit The processing unit type to prepare for. Can be SPFFT_PU_HOST or + * SPFFT_PU_GPU or SPFFT_PU_HOST | SPFFT_PU_GPU. + * @param[in] maxNumThreads The maximum number of threads that transforms created with this grid are + * allowed to use. If smaller than 1, the OpenMP default value is used. + * @throw GenericError SpFFT error. Can be a derived type. + * @throw std::exception Error from standard library calls. Can be a derived type. + */ + Grid(int maxDimX, int maxDimY, int maxDimZ, int maxNumLocalZColumns, + SpfftProcessingUnitType processingUnit, int maxNumThreads); + +#ifdef SPFFT_MPI + /** + * Constructor for a distributed grid. + * + * @param[in] maxDimX Maximum dimension in x. + * @param[in] maxDimY Maximum dimension in y. + * @param[in] maxDimZ Maximum dimension in z. + * @param[in] maxNumLocalZColumns Maximum number of z-columns in frequency domain of the + * local MPI rank. + * @param[in] maxLocalZLength Maximum length in z in space domain for the local MPI rank. + * @param[in] processingUnit The processing unit type to prepare for. Can be SPFFT_PU_HOST or + * SPFFT_PU_GPU or SPFFT_PU_HOST | SPFFT_PU_GPU. + * @param[in] maxNumThreads The maximum number of threads that transforms created with this grid are + * allowed to use. If smaller than 1, the OpenMP default value is used. + * @param[in] comm The MPI communicator to use. Will be duplicated for internal use. + * @param[in] exchangeType The type of MPI exchange to use. Possible values are + * SPFFT_EXCH_DEFAULT, SPFFT_EXCH_BUFFERED, SPFFT_EXCH_COMPACT_BUFFERED and SPFFT_EXCH_UNBUFFERED. + * @throw GenericError SpFFT error. Can be a derived type. + * @throw std::exception Error from standard library calls. Can be a derived type. + */ + Grid(int maxDimX, int maxDimY, int maxDimZ, int maxNumLocalZColumns, int maxLocalZLength, + SpfftProcessingUnitType processingUnit, int maxNumThreads, MPI_Comm comm, + SpfftExchangeType exchangeType); +#endif + + /** + * Custom copy constructor. + * + * Creates an independent copy. Calls MPI functions for the distributed case. + */ + Grid(const Grid&); + + /** + * Default move constructor. + */ + Grid(Grid&&) = default; + + /** + * Custom copy operator. + * + * Creates an independent copy. Calls MPI functions for the distributed case. + */ + Grid& operator=(const Grid&); + + /** + * Default move operator. + */ + Grid& operator=(Grid&&) = default; + + /** + * Creates a transform from this grid object. + * + * @param[in] processingUnit The processing unit type to use. Must be either SPFFT_PU_HOST or + * SPFFT_PU_GPU and be supported by the grid itself. + * @param[in] transformType The transform type (complex to complex or real to complex). Can be + * SPFFT_TRANS_C2C or SPFFT_TRANS_R2C. + * @param[in] dimX The dimension in x. The maximum allowed depends on the grid parameters. + * @param[in] dimY The dimension in y.
The maximum allowed depends on the grid parameters. + * @param[in] dimZ The dimension in z. The maximum allowed depends on the grid parameters. + * @param[in] localZLength The length in z in space domain of the local MPI rank. + * @param[in] numLocalElements The number of elements in frequency domain of the local MPI + * rank. + * @param[in] indexFormat The index format. Only SPFFT_INDEX_TRIPLETS currently supported. + * @param[in] indices Pointer to the frequency indices. Positive and negative indexing is supported. + * @return Transform + * @throw GenericError SpFFT error. Can be a derived type. + * @throw std::exception Error from standard library calls. Can be a derived type. + */ + Transform create_transform(SpfftProcessingUnitType processingUnit, + SpfftTransformType transformType, int dimX, int dimY, int dimZ, + int localZLength, int numLocalElements, + SpfftIndexFormatType indexFormat, const int* indices) const; + + /** + * Access a grid parameter. + * @return Maximum dimension in x. + */ + int max_dim_x() const; + + /** + * Access a grid parameter. + * @return Maximum dimension in y. + */ + int max_dim_y() const; + + /** + * Access a grid parameter. + * @return Maximum dimension in z. + */ + int max_dim_z() const; + + /** + * Access a grid parameter. + * @return Maximum number of z-columns in frequency domain of the local MPI rank. + */ + int max_num_local_z_columns() const; + + /** + * Access a grid parameter. + * @return Maximum length in z in space domain of the local MPI rank. + */ + int max_local_z_length() const; + + /** + * Access a grid parameter. + * @return The processing unit the grid has prepared for. Can be SPFFT_PU_HOST or SPFFT_PU_GPU or + * SPFFT_PU_HOST | SPFFT_PU_GPU. + */ + SpfftProcessingUnitType processing_unit() const; + + /** + * Access a grid parameter. + * @return The GPU device id used. Always returns 0 if no GPU support is enabled. + */ + int device_id() const; + + /** + * Access a grid parameter. + * @return The exact number of threads used by transforms created from this grid. May be less than + * the maximum given to the constructor. Always 1 if not compiled with OpenMP support. + */ + int num_threads() const; + +#ifdef SPFFT_MPI + /** + * Access a grid parameter. + * @return The internal MPI communicator. + */ + MPI_Comm communicator() const; +#endif + +private: + std::shared_ptr<GridInternal<double>> grid_; +}; +} // namespace spfft +#endif diff --git a/include/spfft/grid_float.h b/include/spfft/grid_float.h new file mode 100644 index 0000000..cae97a0 --- /dev/null +++ b/include/spfft/grid_float.h @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2019 ETH Zurich, Simon Frasch + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef SPFFT_GRID_FLOAT_H +#define SPFFT_GRID_FLOAT_H + +#include "spfft/config.h" +#include "spfft/errors.h" +#include "spfft/types.h" + +#ifdef SPFFT_MPI +#include <mpi.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif +/** + * Grid handle. + */ +typedef void* SpfftFloatGrid; + +/** + * Constructor for a single precision local grid. + * + * @param[out] grid Handle to grid. + * @param[in] maxDimX Maximum dimension in x. + * @param[in] maxDimY Maximum dimension in y. + * @param[in] maxDimZ Maximum dimension in z. + * @param[in] maxNumLocalZColumns Maximum number of z-columns in frequency domain. + * @param[in] processingUnit The processing unit type to prepare for. Can be SPFFT_PU_HOST or + * SPFFT_PU_GPU or SPFFT_PU_HOST | SPFFT_PU_GPU. + * @param[in] maxNumThreads The maximum number of threads that transforms created with this grid are + * allowed to use. If smaller than 1, the OpenMP default value is used. + * @return Error code or SPFFT_SUCCESS. + */ +SpfftError spfft_float_grid_create(SpfftFloatGrid* grid, int maxDimX, int maxDimY, int maxDimZ, + int maxNumLocalZColumns, SpfftProcessingUnitType processingUnit, + int maxNumThreads); + +#ifdef SPFFT_MPI +/** + * Constructor for a single precision distributed grid. + * + * @param[out] grid Handle to grid. + * @param[in] maxDimX Maximum dimension in x. + * @param[in] maxDimY Maximum dimension in y. + * @param[in] maxDimZ Maximum dimension in z. + * @param[in] maxNumLocalZColumns Maximum number of z-columns in frequency domain of the + * local MPI rank. + * @param[in] maxLocalZLength Maximum length in z in space domain for the local MPI rank. + * @param[in] processingUnit The processing unit type to prepare for. Can be SPFFT_PU_HOST or + * SPFFT_PU_GPU or SPFFT_PU_HOST | SPFFT_PU_GPU. + * @param[in] maxNumThreads The maximum number of threads that transforms created with this grid are + * allowed to use. If smaller than 1, the OpenMP default value is used. + * @param[in] comm The MPI communicator to use. Will be duplicated for internal use. + * @param[in] exchangeType The type of MPI exchange to use. Possible values are + * SPFFT_EXCH_DEFAULT, SPFFT_EXCH_BUFFERED, SPFFT_EXCH_COMPACT_BUFFERED and SPFFT_EXCH_UNBUFFERED. + * @return Error code or SPFFT_SUCCESS. + */ +SpfftError spfft_float_grid_create_distributed(SpfftFloatGrid* grid, int maxDimX, int maxDimY, int maxDimZ, + int maxNumLocalZColumns, int maxLocalZLength, + SpfftProcessingUnitType processingUnit, int maxNumThreads, + MPI_Comm comm, SpfftExchangeType exchangeType); +#endif + +/** + * Destroy a grid. + * + * A grid can be safely destroyed independent of any related transforms.
The internal memory + * is released once all associated transforms are destroyed as well (through internal reference + * counting). + * + * @param[in] grid Handle to grid. + * @return Error code or SPFFT_SUCCESS. + */ +SpfftError spfft_float_grid_destroy(SpfftFloatGrid grid); + +/** + * Access a grid parameter. + * @param[in] grid Handle to grid. + * @param[out] dimX Maximum dimension in x. + * @return Error code or SPFFT_SUCCESS. + */ +SpfftError spfft_float_grid_max_dim_x(SpfftFloatGrid grid, int* dimX); + +/** + * Access a grid parameter. + * @param[in] grid Handle to grid. + * @param[out] dimY Maximum dimension in y. + * @return Error code or SPFFT_SUCCESS. + */ +SpfftError spfft_float_grid_max_dim_y(SpfftFloatGrid grid, int* dimY); + +/** + * Access a grid parameter. + * @param[in] grid Handle to grid. + * @param[out] dimZ Maximum dimension in z. + * @return Error code or SPFFT_SUCCESS. + */ +SpfftError spfft_float_grid_max_dim_z(SpfftFloatGrid grid, int* dimZ); + +/** + * Access a grid parameter. + * @param[in] grid Handle to grid. + * @param[out] maxNumLocalZColumns Maximum number of z-columns in frequency domain of the local MPI + * rank. + * @return Error code or SPFFT_SUCCESS. + */ +SpfftError spfft_float_grid_max_num_local_z_columns(SpfftFloatGrid grid, int* maxNumLocalZColumns); + +/** + * Access a grid parameter. + * @param[in] grid Handle to grid. + * @param[out] maxLocalZLength Maximum length in z in space domain of the local MPI rank. + * @return Error code or SPFFT_SUCCESS. + */ +SpfftError spfft_float_grid_max_local_z_length(SpfftFloatGrid grid, int* maxLocalZLength); + +/** + * Access a grid parameter. + * @param[in] grid Handle to grid. + * @param[out] processingUnit The processing unit the grid has prepared for. Can be SPFFT_PU_HOST + * or SPFFT_PU_GPU or SPFFT_PU_HOST | SPFFT_PU_GPU. + * @return Error code or SPFFT_SUCCESS. + */ +SpfftError spfft_float_grid_processing_unit(SpfftFloatGrid grid, SpfftProcessingUnitType* processingUnit); + +/** + * Access a grid parameter. + * @param[in] grid Handle to grid. + * @param[out] deviceId The GPU device id used. Always returns 0 if no GPU support is enabled. + * @return Error code or SPFFT_SUCCESS. + */ +SpfftError spfft_float_grid_device_id(SpfftFloatGrid grid, int* deviceId); + +/** + * Access a grid parameter. + * @param[in] grid Handle to grid. + * @param[out] numThreads The exact number of threads used by transforms created from this grid. May + * be less than the maximum given to the constructor. Always 1 if not compiled with OpenMP support. + * @return Error code or SPFFT_SUCCESS. + */ +SpfftError spfft_float_grid_num_threads(SpfftFloatGrid grid, int* numThreads); + +#ifdef SPFFT_MPI +/** + * Access a grid parameter. + * @param[in] grid Handle to grid. + * @param[out] comm The internal MPI communicator. + * @return Error code or SPFFT_SUCCESS. + */ +SpfftError spfft_float_grid_communicator(SpfftFloatGrid grid, MPI_Comm* comm); +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/spfft/grid_float.hpp b/include/spfft/grid_float.hpp new file mode 100644 index 0000000..293edee --- /dev/null +++ b/include/spfft/grid_float.hpp @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2019 ETH Zurich, Simon Frasch + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef SPFFT_GRID_FLOAT_HPP +#define SPFFT_GRID_FLOAT_HPP + +#include <memory> +#include "spfft/config.h" +#include "spfft/transform_float.hpp" +#include "spfft/types.h" + +#ifdef SPFFT_MPI +#include <mpi.h> +#endif + +namespace spfft { + +// Forward declaration for internal use +template <typename T> +class GridInternal; +#ifdef SPFFT_SINGLE_PRECISION + +/** + * A Grid, which provides pre-allocated memory for single precision transforms. + */ +class GridFloat { +public: + /** + * Constructor for a local grid. + * + * @param[in] maxDimX Maximum dimension in x. + * @param[in] maxDimY Maximum dimension in y. + * @param[in] maxDimZ Maximum dimension in z. + * @param[in] maxNumLocalZColumns Maximum number of z-columns in frequency domain. + * @param[in] processingUnit The processing unit type to prepare for. Can be SPFFT_PU_HOST or + * SPFFT_PU_GPU or SPFFT_PU_HOST | SPFFT_PU_GPU. + * @param[in] maxNumThreads The maximum number of threads that transforms created with this grid are + * allowed to use. If smaller than 1, the OpenMP default value is used. + * @throw GenericError SpFFT error. Can be a derived type. + * @throw std::exception Error from standard library calls. Can be a derived type. + */ + GridFloat(int maxDimX, int maxDimY, int maxDimZ, int maxNumLocalZColumns, + SpfftProcessingUnitType processingUnit, int maxNumThreads); + +#ifdef SPFFT_MPI + /** + * Constructor for a distributed grid. + * + * @param[in] maxDimX Maximum dimension in x. + * @param[in] maxDimY Maximum dimension in y. + * @param[in] maxDimZ Maximum dimension in z. + * @param[in] maxNumLocalZColumns Maximum number of z-columns in frequency domain of the + * local MPI rank. + * @param[in] maxLocalZLength Maximum length in z in space domain for the local MPI rank. + * @param[in] processingUnit The processing unit type to prepare for. Can be SPFFT_PU_HOST or + * SPFFT_PU_GPU or SPFFT_PU_HOST | SPFFT_PU_GPU. + * @param[in] maxNumThreads The maximum number of threads that transforms created with this grid are + * allowed to use. If smaller than 1, the OpenMP default value is used. + * @param[in] comm The MPI communicator to use. Will be duplicated for internal use. + * @param[in] exchangeType The type of MPI exchange to use. Possible values are + * SPFFT_EXCH_DEFAULT, SPFFT_EXCH_BUFFERED, SPFFT_EXCH_COMPACT_BUFFERED and SPFFT_EXCH_UNBUFFERED.
+   * @throw GenericError SpFFT error. Can be a derived type.
+   * @throw std::exception Error from standard library calls. Can be a derived type.
+   */
+  GridFloat(int maxDimX, int maxDimY, int maxDimZ, int maxNumLocalZColumns, int maxLocalZLength,
+            SpfftProcessingUnitType processingUnit, int maxNumThreads, MPI_Comm comm,
+            SpfftExchangeType exchangeType);
+#endif
+
+  /**
+   * Custom copy constructor.
+   *
+   * Creates an independent copy. Calls MPI functions for the distributed case.
+   */
+  GridFloat(const GridFloat&);
+
+  /**
+   * Default move constructor.
+   */
+  GridFloat(GridFloat&&) = default;
+
+  /**
+   * Custom copy operator.
+   *
+   * Creates an independent copy. Calls MPI functions for the distributed case.
+   */
+  GridFloat& operator=(const GridFloat&);
+
+  /**
+   * Default move operator.
+   */
+  GridFloat& operator=(GridFloat&&) = default;
+
+  /**
+   * Creates a transform from this grid object.
+   *
+   * @param[in] processingUnit The processing unit type to use. Must be either SPFFT_PU_HOST or
+   * SPFFT_PU_GPU and be supported by the grid itself.
+   * @param[in] transformType The transform type (complex to complex or real to complex). Can be
+   * SPFFT_TRANS_C2C or SPFFT_TRANS_R2C.
+   * @param[in] dimX The dimension in x. The maximum allowed depends on the grid parameters.
+   * @param[in] dimY The dimension in y. The maximum allowed depends on the grid parameters.
+   * @param[in] dimZ The dimension in z. The maximum allowed depends on the grid parameters.
+   * @param[in] localZLength The length in z in space domain of the local MPI rank.
+   * @param[in] numLocalElements The number of elements in frequency domain of the local MPI
+   * rank.
+   * @param[in] indexFormat The index format. Only SPFFT_INDEX_TRIPLETS is currently supported.
+   * @param[in] indices Pointer to the frequency indices. Positive and negative indexing is
+   * supported.
+   * @return TransformFloat
+   * @throw GenericError SpFFT error. Can be a derived type.
+   * @throw std::exception Error from standard library calls. Can be a derived type.
+   */
+  TransformFloat create_transform(SpfftProcessingUnitType processingUnit,
+                                  SpfftTransformType transformType, int dimX, int dimY, int dimZ,
+                                  int localZLength, int numLocalElements,
+                                  SpfftIndexFormatType indexFormat, const int* indices) const;
+
+  /**
+   * Access a grid parameter.
+   * @return Maximum dimension in x.
+   */
+  int max_dim_x() const;
+
+  /**
+   * Access a grid parameter.
+   * @return Maximum dimension in y.
+   */
+  int max_dim_y() const;
+
+  /**
+   * Access a grid parameter.
+   * @return Maximum dimension in z.
+   */
+  int max_dim_z() const;
+
+  /**
+   * Access a grid parameter.
+   * @return Maximum number of z-columns in frequency domain of the local MPI rank.
+   */
+  int max_num_local_z_columns() const;
+
+  /**
+   * Access a grid parameter.
+   * @return Maximum length in z in space domain of the local MPI rank.
+   */
+  int max_local_z_length() const;
+
+  /**
+   * Access a grid parameter.
+   * @return The processing unit the grid has prepared for. Can be SPFFT_PU_HOST or SPFFT_PU_GPU or
+   * SPFFT_PU_HOST | SPFFT_PU_GPU.
+   */
+  SpfftProcessingUnitType processing_unit() const;
+
+  /**
+   * Access a grid parameter.
+   * @return The GPU device id used. Always returns 0 if no GPU support is enabled.
+   */
+  int device_id() const;
+
+  /**
+   * Access a grid parameter.
+   * @return The exact number of threads used by transforms created from this grid. May be less
+   * than the maximum given to the constructor. Always 1 if not compiled with OpenMP support.
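+   *
+   * As an illustration of how these parameters fit together, a hedged single process sketch of
+   * grid and transform creation (dimX, dimY, dimZ, numFrequencyElements and indices are
+   * hypothetical and must be provided by the caller; localZLength equals dimZ without MPI):
+   * \code
+   * GridFloat grid(dimX, dimY, dimZ, dimX * dimY, SPFFT_PU_HOST, -1);
+   * TransformFloat transform = grid.create_transform(SPFFT_PU_HOST, SPFFT_TRANS_C2C, dimX, dimY,
+   *                                                  dimZ, dimZ, numFrequencyElements,
+   *                                                  SPFFT_INDEX_TRIPLETS, indices);
+   * \endcode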
+   */
+  int num_threads() const;
+
+#ifdef SPFFT_MPI
+  /**
+   * Access a grid parameter.
+   * @return The internal MPI communicator.
+   */
+  MPI_Comm communicator() const;
+#endif
+private:
+  /*! \cond PRIVATE */
+  std::shared_ptr<GridInternal<float>> grid_;
+  /*! \endcond */
+};
+#endif
+
+} // namespace spfft
+#endif
diff --git a/include/spfft/multi_transform.h b/include/spfft/multi_transform.h
new file mode 100644
index 0000000..0ec486c
--- /dev/null
+++ b/include/spfft/multi_transform.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef SPFFT_MULTI_TRANSFORM_H
+#define SPFFT_MULTI_TRANSFORM_H
+
+#include "spfft/config.h"
+#include "spfft/transform.h"
+#include "spfft/types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Execute multiple independent forward transforms at once by internal pipelining.
+ *
+ * @param[in] numTransforms Number of transforms to execute.
+ * @param[in] transforms Transforms to execute.
+ * @param[in] inputLocations Input locations for each transform.
+ * @param[out] outputPointers Output pointers for each transform.
+ * @param[in] scalingTypes Scaling types for each transform.
+ * @return Error code or SPFFT_SUCCESS.
+ */
+SpfftError spfft_multi_transform_forward(int numTransforms, SpfftTransform* transforms,
+                                         SpfftProcessingUnitType* inputLocations,
+                                         double** outputPointers, SpfftScalingType* scalingTypes);
+
+/**
+ * Execute multiple independent backward transforms at once by internal pipelining.
+ *
+ * @param[in] numTransforms Number of transforms to execute.
+ * @param[in] transforms Transforms to execute.
+ * @param[in] inputPointers Input pointers for each transform.
+ * @param[in] outputLocations Output locations for each transform.
+ * @return Error code or SPFFT_SUCCESS.
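+ *
+ * A hedged calling sketch (t1 and t2 are hypothetical transform handles created earlier,
+ * freq1 and freq2 point to their frequency domain data):
+ * \code
+ * SpfftTransform transforms[2] = {t1, t2};
+ * double* inputs[2] = {freq1, freq2};
+ * SpfftProcessingUnitType locations[2] = {SPFFT_PU_HOST, SPFFT_PU_HOST};
+ * SpfftError status = spfft_multi_transform_backward(2, transforms, inputs, locations);
+ * \endcode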
+ */ +SpfftError spfft_multi_transform_backward(int numTransforms, SpfftTransform* transforms, + double** inputPointers, + SpfftProcessingUnitType* outputLocations); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/spfft/multi_transform.hpp b/include/spfft/multi_transform.hpp new file mode 100644 index 0000000..3b52753 --- /dev/null +++ b/include/spfft/multi_transform.hpp @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2019 ETH Zurich, Simon Frasch + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef SPFFT_MULTI_TRANSFORM_HPP +#define SPFFT_MULTI_TRANSFORM_HPP + +#include "spfft/config.h" +#include "spfft/transform.hpp" +#include "spfft/types.h" + +namespace spfft { + +/** + * Execute multiple independent forward transforms at once by internal pipelining. + * + * @param[in] numTransforms Number of transforms to execute. + * @param[in] transforms Transforms to execute. + * @param[in] inputLocations Input locations for each transform. + * @param[out] outputPointers Output pointers for each transform. + * @param[in] scalingTypes Scaling types for each transform. + * @throw GenericError SpFFT error. Can be a derived type. + * @throw std::exception Error from standard library calls. Can be a derived type. + */ +void multi_transform_forward(int numTransforms, Transform* transforms, + SpfftProcessingUnitType* inputLocations, double** outputPointers, + SpfftScalingType* scalingTypes); + +/** + * Execute multiple independent backward transforms at once by internal pipelining. + * + * @param[in] numTransforms Number of transforms to execute. + * @param[in] transforms Transforms to execute. + * @param[in] inputPointers Input pointers for each transform. + * @param[in] outputLocations Output locations for each transform. + * @throw GenericError SpFFT error. Can be a derived type. + * @throw std::exception Error from standard library calls. Can be a derived type. 
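+ *
+ * A hedged usage sketch (t1 and t2 are hypothetical Transform objects created beforehand,
+ * freq1 and freq2 point to their frequency domain elements):
+ * \code
+ * Transform transforms[2] = {t1, t2};
+ * double* inputs[2] = {freq1, freq2};
+ * SpfftProcessingUnitType outputLocations[2] = {SPFFT_PU_HOST, SPFFT_PU_HOST};
+ * multi_transform_backward(2, transforms, inputs, outputLocations);
+ * \endcode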
+ */ +void multi_transform_backward(int numTransforms, Transform* transforms, double** inputPointers, + SpfftProcessingUnitType* outputLocations); + +} // namespace spfft + +#endif diff --git a/include/spfft/multi_transform_float.h b/include/spfft/multi_transform_float.h new file mode 100644 index 0000000..f99550f --- /dev/null +++ b/include/spfft/multi_transform_float.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2019 ETH Zurich, Simon Frasch + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef SPFFT_MULTI_TRANSFORM_FLOAT_H +#define SPFFT_MULTI_TRANSFORM_FLOAT_H + +#include "spfft/config.h" +#include "spfft/transform_float.h" +#include "spfft/types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Execute multiple independent forward transforms at once by internal pipelining. + * + * @param[in] numTransforms Number of transforms to execute. + * @param[in] transforms Transforms to execute. + * @param[in] inputLocations Input locations for each transform. + * @param[out] outputPointers Output pointers for each transform. + * @param[in] scalingTypes Scaling types for each transform. + * @return Error code or SPFFT_SUCCESS. + */ +SpfftError spfft_float_multi_transform_forward(int numTransforms, SpfftFloatTransform* transforms, + SpfftProcessingUnitType* inputLocations, + float** outputPointers, + SpfftScalingType* scalingTypes); + +/** + * Execute multiple independent backward transforms at once by internal pipelining. + * + * @param[in] numTransforms Number of transforms to execute. + * @param[in] transforms Transforms to execute. + * @param[in] inputPointers Input pointers for each transform. + * @param[in] outputLocations Output locations for each transform. + * @return Error code or SPFFT_SUCCESS. 
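+ *
+ * A hedged calling sketch mirroring the double precision variant (ft1 and ft2 are hypothetical
+ * single precision transform handles, freq1 and freq2 their frequency domain data):
+ * \code
+ * SpfftFloatTransform transforms[2] = {ft1, ft2};
+ * float* inputs[2] = {freq1, freq2};
+ * SpfftProcessingUnitType locations[2] = {SPFFT_PU_HOST, SPFFT_PU_HOST};
+ * SpfftError status = spfft_float_multi_transform_backward(2, transforms, inputs, locations);
+ * \endcode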
+ */
+SpfftError spfft_float_multi_transform_backward(int numTransforms, SpfftFloatTransform* transforms,
                                                float** inputPointers,
                                                SpfftProcessingUnitType* outputLocations);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/include/spfft/multi_transform_float.hpp b/include/spfft/multi_transform_float.hpp
new file mode 100644
index 0000000..3e8af72
--- /dev/null
+++ b/include/spfft/multi_transform_float.hpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef SPFFT_MULTI_TRANSFORM_FLOAT_HPP
+#define SPFFT_MULTI_TRANSFORM_FLOAT_HPP
+
+#include "spfft/config.h"
+#include "spfft/transform_float.hpp"
+#include "spfft/types.h"
+
+namespace spfft {
+
+#ifdef SPFFT_SINGLE_PRECISION
+/**
+ * Execute multiple independent forward transforms at once by internal pipelining.
+ *
+ * @param[in] numTransforms Number of transforms to execute.
+ * @param[in] transforms Transforms to execute.
+ * @param[in] inputLocations Input locations for each transform.
+ * @param[out] outputPointers Output pointers for each transform.
+ * @param[in] scalingTypes Scaling types for each transform.
+ * @throw GenericError SpFFT error. Can be a derived type.
+ * @throw std::exception Error from standard library calls. Can be a derived type.
+ */
+void multi_transform_forward(int numTransforms, TransformFloat* transforms,
+                             SpfftProcessingUnitType* inputLocations, float** outputPointers,
+                             SpfftScalingType* scalingTypes);
+
+/**
+ * Execute multiple independent backward transforms at once by internal pipelining.
+ *
+ * @param[in] numTransforms Number of transforms to execute.
+ * @param[in] transforms Transforms to execute.
+ * @param[in] inputPointers Input pointers for each transform.
+ * @param[in] outputLocations Output locations for each transform.
+ * @throw GenericError SpFFT error. Can be a derived type.
+ * @throw std::exception Error from standard library calls. Can be a derived type.
+ */ +void multi_transform_backward(int numTransforms, TransformFloat* transforms, float** inputPointers, + SpfftProcessingUnitType* outputLocations); +#endif + +} // namespace spfft + +#endif diff --git a/include/spfft/spfft.f90 b/include/spfft/spfft.f90 new file mode 100644 index 0000000..c96357b --- /dev/null +++ b/include/spfft/spfft.f90 @@ -0,0 +1,249 @@ +module spfft + +use iso_c_binding +implicit none + +! Constants +integer(c_int), parameter :: & + SPFFT_EXCH_DEFAULT = 0, & + SPFFT_EXCH_BUFFERED = 1, & + SPFFT_EXCH_BUFFERED_FLOAT = 2, & + SPFFT_EXCH_COMPACT_BUFFERED = 3, & + SPFFT_EXCH_COMPACT_BUFFERED_FLOAT = 4, & + SPFFT_EXCH_UNBUFFERED = 5, & + + SPFFT_PU_HOST = 1, & + SPFFT_PU_GPU = 2, & + + SPFFT_INDEX_TRIPLETS = 0, & + + SPFFT_TRANS_C2C = 0, & + SPFFT_TRANS_R2C = 1, & + + SPFFT_NO_SCALING = 0, & + SPFFT_FULL_SCALING = 1, & + + SPFFT_SUCCESS = 0, & + SPFFT_UNKNOWN_ERROR = 1, & + SPFFT_INVALID_HANDLE_ERROR = 2, & + SPFFT_OVERFLOW_ERROR = 3, & + SPFFT_ALLOCATION_ERROR = 4, & + SPFFT_INVALID_PARAMETER_ERROR = 5, & + SPFFT_DUPLICATE_INDICES_ERROR = 6, & + SPFFT_INVALID_INDICES_ERROR = 7, & + SPFFT_MPI_SUPPORT_ERROR = 8, & + SPFFT_MPI_ERROR = 9, & + SPFFT_MPI_PARAMETER_MISMATCH_ERROR = 10, & + SPFFT_HOST_EXECUTION_ERROR = 11, & + SPFFT_FFTW_ERROR = 12, & + SPFFT_GPU_ERROR = 13, & + SPFFT_GPU_PRECEDING_ERROR = 14, & + SPFFT_GPU_SUPPORT_ERROR = 15, & + SPFFT_GPU_ALLOCATION_ERROR = 16, & + SPFFT_GPU_LAUNCH_ERROR = 17, & + SPFFT_GPU_NO_DEVICE_ERROR = 18, & + SPFFT_GPU_INVALID_VALUE_ERROR = 19, & + SPFFT_GPU_INVALID_DEVICE_PTR_ERROR = 20, & + SPFFT_GPU_COPY_ERROR = 21, & + SPFFT_GPU_FFT_ERROR = 22 + +interface + !-------------------------- + ! Grid + !-------------------------- + integer(c_int) function spfft_grid_create(grid, maxDimX, maxDimY, maxDimZ, & + maxNumLocalZColumns, processingUnit, maxNumThreads) bind(C) + use iso_c_binding + type(c_ptr), intent(out) :: grid + integer(c_int), value :: maxDimX + integer(c_int), value :: maxDimY + integer(c_int), value :: maxDimZ + integer(c_int), value :: maxNumLocalZColumns + integer(c_int), value :: processingUnit + integer(c_int), value :: maxNumThreads + end function + + integer(c_int) function spfft_grid_create_distributed(grid, maxDimX, maxDimY, maxDimZ, & + maxNumLocalZColumns, maxLocalZLength, processingUnit, maxNumThreads,& + comm, exchangeType) bind(C, name='spfft_grid_create_distributed_fortran') + use iso_c_binding + type(c_ptr), intent(out) :: grid + integer(c_int), value :: maxDimX + integer(c_int), value :: maxDimY + integer(c_int), value :: maxDimZ + integer(c_int), value :: maxNumLocalZColumns + integer(c_int), value :: maxLocalZLength + integer(c_int), value :: processingUnit + integer(c_int), value :: maxNumThreads + integer(c_int), value :: comm + integer(c_int), value :: exchangeType + end function + + integer(c_int) function spfft_grid_destroy(grid) bind(C) + use iso_c_binding + type(c_ptr), value :: grid + end function + + integer(c_int) function spfft_grid_max_dim_x(grid, dimX) bind(C) + use iso_c_binding + type(c_ptr), value :: grid + integer(c_int), intent(out) :: dimX + end function + + integer(c_int) function spfft_grid_max_dim_y(grid, dimY) bind(C) + use iso_c_binding + type(c_ptr), value :: grid + integer(c_int), intent(out) :: dimY + end function + + integer(c_int) function spfft_grid_max_dim_z(grid, dimZ) bind(C) + use iso_c_binding + type(c_ptr), value :: grid + integer(c_int), intent(out) :: dimZ + end function + + integer(c_int) function spfft_grid_max_num_local_z_columns(grid, maxNumLocalZColumns) bind(C) + 
use iso_c_binding
+    type(c_ptr), value :: grid
+    integer(c_int), intent(out) :: maxNumLocalZColumns
+  end function
+
+  integer(c_int) function spfft_grid_max_local_z_length(grid, maxLocalZLength) bind(C)
+    use iso_c_binding
+    type(c_ptr), value :: grid
+    integer(c_int), intent(out) :: maxLocalZLength
+  end function
+
+  integer(c_int) function spfft_grid_processing_unit(grid, processingUnit) bind(C)
+    use iso_c_binding
+    type(c_ptr), value :: grid
+    integer(c_int), intent(out) :: processingUnit
+  end function
+
+  integer(c_int) function spfft_grid_device_id(grid, deviceId) bind(C)
+    use iso_c_binding
+    type(c_ptr), value :: grid
+    integer(c_int), intent(out) :: deviceId
+  end function
+
+  integer(c_int) function spfft_grid_num_threads(grid, numThreads) bind(C)
+    use iso_c_binding
+    type(c_ptr), value :: grid
+    integer(c_int), intent(out) :: numThreads
+  end function
+
+  integer(c_int) function spfft_grid_communicator(grid, comm) &
+      bind(C, name="spfft_grid_communicator_fortran")
+    use iso_c_binding
+    type(c_ptr), value :: grid
+    integer(c_int), intent(out) :: comm
+  end function
+
+  !--------------------------
+  ! Transform
+  !--------------------------
+  integer(c_int) function spfft_transform_create(transform, grid, processingUnit, &
+      transformType, dimX, dimY, dimZ, localZLength, numLocalElements, indexFormat, indices) bind(C)
+    use iso_c_binding
+    type(c_ptr), intent(out) :: transform
+    type(c_ptr), value :: grid
+    integer(c_int), value :: processingUnit
+    integer(c_int), value :: transformType
+    integer(c_int), value :: dimX
+    integer(c_int), value :: dimY
+    integer(c_int), value :: dimZ
+    integer(c_int), value :: localZLength
+    integer(c_int), value :: numLocalElements
+    integer(c_int), value :: indexFormat
+    integer(c_int), dimension(*), intent(in) :: indices
+  end function
+
+  integer(c_int) function spfft_transform_destroy(transform) bind(C)
+    use iso_c_binding
+    type(c_ptr), value :: transform
+  end function
+
+  integer(c_int) function spfft_transform_backward(transform, input, &
+      outputLocation) bind(C)
+    use iso_c_binding
+    type(c_ptr), value :: transform
+    complex(c_double), dimension(*), intent(in) :: input
+    integer(c_int), value :: outputLocation
+  end function
+
+  integer(c_int) function spfft_transform_forward(transform, inputLocation, &
+      output, scaling) bind(C)
+    use iso_c_binding
+    type(c_ptr), value :: transform
+    integer(c_int), value :: inputLocation
+    complex(c_double), dimension(*), intent(out) :: output
+    integer(c_int), value :: scaling
+  end function
+
+  integer(c_int) function spfft_transform_get_space_domain(transform, &
+      dataLocation, dataPtr) bind(C)
+    use iso_c_binding
+    type(c_ptr), value :: transform
+    integer(c_int), value :: dataLocation
+    type(c_ptr), intent(out) :: dataPtr
+  end function
+
+  integer(c_int) function spfft_transform_dim_x(transform, dimX) bind(C)
+    use iso_c_binding
+    type(c_ptr), value :: transform
+    integer(c_int), intent(out) :: dimX
+  end function
+
+  integer(c_int) function spfft_transform_dim_y(transform, dimY) bind(C)
+    use iso_c_binding
+    type(c_ptr), value :: transform
+    integer(c_int), intent(out) :: dimY
+  end function
+
+  integer(c_int) function spfft_transform_dim_z(transform, dimZ) bind(C)
+    use iso_c_binding
+    type(c_ptr), value :: transform
+    integer(c_int), intent(out) :: dimZ
+  end function
+
+  integer(c_int) function spfft_transform_local_z_length(transform, localZLength) bind(C)
+    use iso_c_binding
+    type(c_ptr), value :: transform
+    integer(c_int), intent(out) :: localZLength
+  end function
+
+  integer(c_int) function
spfft_transform_local_z_offset(transform, offset) bind(C) + use iso_c_binding + type(c_ptr), value :: transform + integer(c_int), intent(out) :: offset + end function + + integer(c_int) function spfft_transform_num_local_elements(transform, numLocalElements) bind(C) + use iso_c_binding + type(c_ptr), value :: transform + integer(c_int), intent(out) :: numLocalElements + end function + + integer(c_int) function spfft_transform_device_id(transform, deviceId) bind(C) + use iso_c_binding + type(c_ptr), value :: transform + integer(c_int), intent(out) :: deviceId + end function + + integer(c_int) function spfft_transform_num_threads(transform, numThreads) bind(C) + use iso_c_binding + type(c_ptr), value :: transform + integer(c_int), intent(out) :: numThreads + end function + + integer(c_int) function spfft_transform_communicator(transform, comm) & + bind(C, name="spfft_transform_communicator_fortran") + use iso_c_binding + type(c_ptr), value :: transform + integer(c_int), intent(out) :: comm + end function + +end interface + +end diff --git a/include/spfft/spfft.h b/include/spfft/spfft.h new file mode 100644 index 0000000..d7e1a30 --- /dev/null +++ b/include/spfft/spfft.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2019 ETH Zurich, Simon Frasch + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef SPFFT_SPFFT_H +#define SPFFT_SPFFT_H + +#include "spfft/config.h" +#include "spfft/grid.h" +#include "spfft/grid_float.h" +#include "spfft/transform.h" +#include "spfft/transform_float.h" + +#endif diff --git a/include/spfft/spfft.hpp b/include/spfft/spfft.hpp new file mode 100644 index 0000000..1129a58 --- /dev/null +++ b/include/spfft/spfft.hpp @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2019 ETH Zurich, Simon Frasch + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef SPFFT_SPFFT_HPP +#define SPFFT_SPFFT_HPP + +#include "spfft/config.h" +#include "spfft/grid.hpp" +#include "spfft/grid_float.hpp" +#include "spfft/transform.hpp" +#include "spfft/transform_float.hpp" +#include "spfft/multi_transform.hpp" +#include "spfft/multi_transform_float.hpp" + +#endif diff --git a/include/spfft/transform.h b/include/spfft/transform.h new file mode 100644 index 0000000..46eff02 --- /dev/null +++ b/include/spfft/transform.h @@ -0,0 +1,212 @@ +/* + * Copyright (c) 2019 ETH Zurich, Simon Frasch + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef SPFFT_TRANSFORM_H +#define SPFFT_TRANSFORM_H + +#include "spfft/config.h" +#include "spfft/errors.h" +#include "spfft/grid.h" +#include "spfft/types.h" + +#ifdef SPFFT_MPI +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif +/** + * Transform handle. 
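+ *
+ * A hedged end-to-end sketch of the C API (grid, dimX, dimY, dimZ, numFrequencyElements,
+ * indices and frequencyData are hypothetical and must be prepared by the caller; localZLength
+ * equals dimZ for a non-distributed transform):
+ * \code
+ * SpfftTransform transform = NULL;
+ * SpfftError status = spfft_transform_create(&transform, grid, SPFFT_PU_HOST, SPFFT_TRANS_C2C,
+ *                                            dimX, dimY, dimZ, dimZ, numFrequencyElements,
+ *                                            SPFFT_INDEX_TRIPLETS, indices);
+ * if (status == SPFFT_SUCCESS) {
+ *   status = spfft_transform_forward(transform, SPFFT_PU_HOST, frequencyData, SPFFT_NO_SCALING);
+ * }
+ * spfft_transform_destroy(transform);
+ * \endcode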
+ */ +typedef void* SpfftTransform; + +/** + * Creates a transform from a grid handle. + * + * @param[out] transform Handle to the transform. + * @param[in] grid Handle to the grid, with which the transform is created. + * @param[in] processingUnit The processing unit type to use. Must be either SPFFT_PU_HOST or + * SPFFT_PU_GPU and be supported by the grid itself. + * @param[in] transformType The transform type (complex to complex or real to complex). Can be + * SPFFT_TRANS_C2C or SPFFT_TRANS_R2C. + * @param[in] dimX The dimension in x. The maximum allowed depends on the grid parameters. + * @param[in] dimY The dimension in y. The maximum allowed depends on the grid parameters. + * @param[in] dimZ The dimension in z. The maximum allowed depends on the grid parameters. + * @param[in] localZLength The length in z in space domain of the local MPI rank. + * @param[in] numLocalElements The number of elements in frequency domain of the local MPI + * rank. + * @param[in] indexFormat The index format. Only SPFFT_INDEX_TRIPLETS currently supported. + * @param[in] indices Pointer to the frequency indices. Posive and negative indexing is supported. + * @return Error code or SPFFT_SUCCESS. + */ +SpfftError spfft_transform_create(SpfftTransform* transform, SpfftGrid grid, + SpfftProcessingUnitType processingUnit, + SpfftTransformType transformType, int dimX, int dimY, int dimZ, + int localZLength, int numLocalElements, + SpfftIndexFormatType indexFormat, const int* indices); + +/** + * Destroy a transform. + * + * @param[in] transform Handle to the transform. + * @return Error code or SPFFT_SUCCESS. + */ +SpfftError spfft_transform_destroy(SpfftTransform transform); + +/** + * Clone a transform. + * + * @param[in] transform Handle to the transform. + * @param[out] newTransform Independent transform with the same parameters, but with new underlying + * grid. + * @return Error code or SPFFT_SUCCESS. + */ +SpfftError spfft_transform_clone(SpfftTransform transform, SpfftTransform* newTransform); + +/** + * Execute a forward transform from space domain to frequency domain. + * + * @param[in] transform Handle to the transform. + * @param[in] inputLocation The processing unit, to take the input from. Can be SPFFT_PU_HOST or + * SPFFT_PU_GPU (if GPU is set as execution unit). + * @param[out] output Pointer to memory, where the frequency domain elements are written to. Can + * be located at Host or GPU memory (if GPU is set as processing unit). + * @param[in] scaling Controls scaling of output. SPFFT_NO_SCALING to disable or + * SPFFT_FULL_SCALING to scale by factor 1 / (dim_x() * dim_y() * dim_z()). + * @return Error code or SPFFT_SUCCESS. + */ +SpfftError spfft_transform_forward(SpfftTransform transform, SpfftProcessingUnitType inputLocation, + double* output, SpfftScalingType scaling); + +/** + * Execute a backward transform from frequency domain to space domain. + * + * @param[in] transform Handle to the transform. + * @param[in] input Input data in frequency domain. Must match the indices provided at transform + * creation. Can be located at Host or GPU memory, if GPU is set as processing unit. + * @param[in] outputLocation The processing unit, to place the output at. Can be SPFFT_PU_HOST or + * SPFFT_PU_GPU (if GPU is set as execution unit). + * @return Error code or SPFFT_SUCCESS. + */ +SpfftError spfft_transform_backward(SpfftTransform transform, const double* input, + SpfftProcessingUnitType outputLocation); +/** + * Provides access to the space domain data. 
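+ *
+ * A hedged sketch of retrieving host-side space domain data (transform is a hypothetical,
+ * previously created handle):
+ * \code
+ * double* spaceData = NULL;
+ * SpfftError status = spfft_transform_get_space_domain(transform, SPFFT_PU_HOST, &spaceData);
+ * \endcode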
+ *
+ * @param[in] transform Handle to the transform.
+ * @param[in] dataLocation The processing unit to query for the data. Can be SPFFT_PU_HOST or
+ * SPFFT_PU_GPU (if GPU is set as execution unit).
+ * @param[out] data Pointer to space domain data on given processing unit. Alignment is guaranteed
+ * to fulfill requirements for std::complex and C language complex types.
+ * @return Error code or SPFFT_SUCCESS.
+ */
+SpfftError spfft_transform_get_space_domain(SpfftTransform transform,
+                                            SpfftProcessingUnitType dataLocation, double** data);
+
+/**
+ * Access a transform parameter.
+ * @param[in] transform Handle to the transform.
+ * @param[out] dimX Dimension in x.
+ * @return Error code or SPFFT_SUCCESS.
+ */
+SpfftError spfft_transform_dim_x(SpfftTransform transform, int* dimX);
+
+/**
+ * Access a transform parameter.
+ * @param[in] transform Handle to the transform.
+ * @param[out] dimY Dimension in y.
+ * @return Error code or SPFFT_SUCCESS.
+ */
+SpfftError spfft_transform_dim_y(SpfftTransform transform, int* dimY);
+
+/**
+ * Access a transform parameter.
+ * @param[in] transform Handle to the transform.
+ * @param[out] dimZ Dimension in z.
+ * @return Error code or SPFFT_SUCCESS.
+ */
+SpfftError spfft_transform_dim_z(SpfftTransform transform, int* dimZ);
+
+/**
+ * Access a transform parameter.
+ * @param[in] transform Handle to the transform.
+ * @param[out] localZLength Size in z of the space domain slice held by the local MPI rank.
+ * @return Error code or SPFFT_SUCCESS.
+ */
+SpfftError spfft_transform_local_z_length(SpfftTransform transform, int* localZLength);
+
+/**
+ * Access a transform parameter.
+ * @param[in] transform Handle to the transform.
+ * @param[out] offset Offset in z of the space domain slice held by the local MPI rank.
+ * @return Error code or SPFFT_SUCCESS.
+ */
+SpfftError spfft_transform_local_z_offset(SpfftTransform transform, int* offset);
+
+/**
+ * Access a transform parameter.
+ * @param[in] transform Handle to the transform.
+ * @param[out] numLocalElements Number of local elements in frequency domain.
+ * @return Error code or SPFFT_SUCCESS.
+ */
+SpfftError spfft_transform_num_local_elements(SpfftTransform transform, int* numLocalElements);
+
+/**
+ * Access a transform parameter.
+ * @param[in] transform Handle to the transform.
+ * @param[out] deviceId The GPU device id used. Always returns 0 if no GPU support is enabled.
+ * @return Error code or SPFFT_SUCCESS.
+ */
+SpfftError spfft_transform_device_id(SpfftTransform transform, int* deviceId);
+
+/**
+ * Access a transform parameter.
+ * @param[in] transform Handle to the transform.
+ * @param[out] numThreads The exact number of threads used by the transform. May be less than the
+ * maximum given to the grid constructor. Always 1 if not compiled with OpenMP support.
+ * @return Error code or SPFFT_SUCCESS.
+ */
+SpfftError spfft_transform_num_threads(SpfftTransform transform, int* numThreads);
+
+#ifdef SPFFT_MPI
+/**
+ * Access a transform parameter.
+ * @param[in] transform Handle to the transform.
+ * @param[out] comm The internal MPI communicator.
+ * @return Error code or SPFFT_SUCCESS.
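+ *
+ * A minimal sketch (assumes SpFFT was built with MPI support):
+ * \code
+ * MPI_Comm comm = MPI_COMM_NULL;
+ * SpfftError status = spfft_transform_communicator(transform, &comm);
+ * \endcode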
+ */ +SpfftError spfft_transform_communicator(SpfftTransform transform, MPI_Comm* comm); +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/spfft/transform.hpp b/include/spfft/transform.hpp new file mode 100644 index 0000000..bb66042 --- /dev/null +++ b/include/spfft/transform.hpp @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2019 ETH Zurich, Simon Frasch + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef SPFFT_TRANSFORM_HPP +#define SPFFT_TRANSFORM_HPP + +#include +#include "spfft/config.h" +#include "spfft/types.h" + +#ifdef SPFFT_MPI +#include +#endif + +namespace spfft { + +template +class TransformInternal; + +class Grid; + +template +class MultiTransformInternal; + +template +class GridInternal; + +/** + * A transform in double precision with fixed dimensions. Shares memory with other transform created + * from the same Grid object. + */ +class Transform { +public: + using ValueType = double; + + /** + * Default copy constructor. + */ + Transform(const Transform&) = default; + + /** + * Default move constructor. + */ + Transform(Transform&&) = default; + + /** + * Default copy operator. + */ + Transform& operator=(const Transform&) = default; + + /** + * Default move operator. + */ + Transform& operator=(Transform&&) = default; + + /** + * Clone transform. + * + * @return Independent transform with the same parameters, but with new underlying grid. + */ + Transform clone() const; + + /** + * Access a transform parameter. + * @return Type of transform. + */ + SpfftTransformType type() const; + + /** + * Access a transform parameter. + * @return Dimension in x. + */ + int dim_x() const; + + /** + * Access a transform parameter. + * @return Dimension in y. + */ + int dim_y() const; + + /** + * Access a transform parameter. + * @return Dimension in z. + */ + int dim_z() const; + + /** + * Access a transform parameter. + * @return Length in z of the space domain slice held by the local MPI rank. + */ + int local_z_length() const; + + /** + * Access a transform parameter. 
+ * @return Offset in z of the space domain slice held by the local MPI rank. + */ + int local_z_offset() const; + + /** + * Access a transform parameter. + * @return Number of elements in the space domain slice held by the local MPI rank. + */ + int local_slice_size() const; + + /** + * Access a transform parameter. + * @return Number of elements in frequency domain. + */ + int num_local_elements() const; + + /** + * Access a transform parameter. + * @return The processing unit used for calculations. Can be SPFFT_PU_HOST or SPFFT_PU_GPU. + */ + SpfftProcessingUnitType processing_unit() const; + + /** + * Access a transform parameter. + * @return The GPU device id used. Returns always 0, if no GPU support is enabled. + */ + int device_id() const; + + /** + * Access a transform parameter. + * @return The exact number of threads used by transforms created from this grid. May be less than + * the maximum given to the constructor. Always 1, if not compiled with OpenMP support. + */ + int num_threads() const; + +#ifdef SPFFT_MPI + /** + * Access a transform parameter. + * @return The internal MPI communicator. + */ + MPI_Comm communicator() const; +#endif + + /** + * Provides access to the space domain data. + * + * @param[in] dataLocation The processing unit to query for the data. Can be SPFFT_PU_HOST or + * SPFFT_PU_GPU (if GPU is set as execution unit). + * @return Pointer to space domain data on given processing unit. Alignment is guaranteed to + * fulfill requirements for std::complex and C language complex types. + * @throw GenericError SpFFT error. Can be a derived type. + * @throw std::exception Error from standard library calls. Can be a derived type. + */ + double* space_domain_data(SpfftProcessingUnitType dataLocation); + + /** + * Execute a forward transform from space domain to frequency domain. + * + * @param[in] inputLocation The processing unit, to take the input from. Can be SPFFT_PU_HOST or + * SPFFT_PU_GPU (if GPU is set as execution unit). + * @param[out] output Pointer to memory, where the frequency domain elements are written to. Can + * be located at Host or GPU memory (if GPU is set as processing unit). + * @param[in] scaling Controls scaling of output. SPFFT_NO_SCALING to disable or + * SPFFT_FULL_SCALING to scale by factor 1 / (dim_x() * dim_y() * dim_z()). + * @throw GenericError SpFFT error. Can be a derived type. + * @throw std::exception Error from standard library calls. Can be a derived type. + */ + void forward(SpfftProcessingUnitType inputLocation, double* output, + SpfftScalingType scaling = SPFFT_NO_SCALING); + + /** + * Execute a backward transform from frequency domain to space domain. + * + * @param[in] input Input data in frequency domain. Must match the indices provided at transform + * creation. Can be located at Host or GPU memory, if GPU is set as processing unit. + * @param[in] outputLocation The processing unit, to place the output at. Can be SPFFT_PU_HOST or + * SPFFT_PU_GPU (if GPU is set as execution unit). + * @throw GenericError SpFFT error. Can be a derived type. + * @throw std::exception Error from standard library calls. Can be a derived type. + */ + void backward(const double* input, SpfftProcessingUnitType outputLocation); + +private: + /*! 
\cond PRIVATE */ + friend Grid; + friend MultiTransformInternal; + + Transform(const std::shared_ptr>& grid, + SpfftProcessingUnitType executionUnit, SpfftTransformType transformType, int dimX, + int dimY, int dimZ, int localZLength, int numLocalElements, + SpfftIndexFormatType dataFormat, const int* indices); + + explicit Transform(std::shared_ptr> transform); + + std::shared_ptr> transform_; + /*! \endcond */ +}; + +} // namespace spfft +#endif diff --git a/include/spfft/transform_float.h b/include/spfft/transform_float.h new file mode 100644 index 0000000..75b7e92 --- /dev/null +++ b/include/spfft/transform_float.h @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2019 ETH Zurich, Simon Frasch + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef SPFFT_TRANSFORM_FLOAT_H +#define SPFFT_TRANSFORM_FLOAT_H + +#include "spfft/config.h" +#include "spfft/errors.h" +#include "spfft/grid_float.h" +#include "spfft/types.h" + +#ifdef SPFFT_MPI +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif +/** + * Transform handle. + */ +typedef void* SpfftFloatTransform; + +/** + * Creates a single precision transform from a single precision grid handle. + * + * @param[out] transform Handle to the transform. + * @param[in] grid Handle to the grid, with which the transform is created. + * @param[in] processingUnit The processing unit type to use. Must be either SPFFT_PU_HOST or + * SPFFT_PU_GPU and be supported by the grid itself. + * @param[in] transformType The transform type (complex to complex or real to complex). Can be + * SPFFT_TRANS_C2C or SPFFT_TRANS_R2C. + * @param[in] dimX The dimension in x. The maximum allowed depends on the grid parameters. + * @param[in] dimY The dimension in y. The maximum allowed depends on the grid parameters. + * @param[in] dimZ The dimension in z. The maximum allowed depends on the grid parameters. + * @param[in] localZLength The length in z in space domain of the local MPI rank. + * @param[in] numLocalElements The number of elements in frequency domain of the local MPI + * rank. 
+ * @param[in] indexFormat The index format. Only SPFFT_INDEX_TRIPLETS currently supported. + * @param[in] indices Pointer to the frequency indices. Posive and negative indexing is supported. + * @return Error code or SPFFT_SUCCESS. + */ +SpfftError spfft_float_transform_create(SpfftFloatTransform* transform, SpfftFloatGrid grid, + SpfftProcessingUnitType processingUnit, + SpfftTransformType transformType, int dimX, int dimY, + int dimZ, int localZLength, int numLocalElements, + SpfftIndexFormatType indexFormat, const int* indices); + +/** + * Destroy a transform. + * + * @param[in] transform Handle to the transform. + * @return Error code or SPFFT_SUCCESS. + */ +SpfftError spfft_float_transform_destroy(SpfftFloatTransform transform); + +/** + * Clone a transform. + * + * @param[in] transform Handle to the transform. + * @param[out] newTransform Independent transform with the same parameters, but with new underlying + * grid. + * @return Error code or SPFFT_SUCCESS. + */ +SpfftError spfft_float_transform_clone(SpfftFloatTransform transform, + SpfftFloatTransform* newTransform); + +/** + * Execute a forward transform from space domain to frequency domain. + * + * @param[in] transform Handle to the transform. + * @param[in] inputLocation The processing unit, to take the input from. Can be SPFFT_PU_HOST or + * SPFFT_PU_GPU (if GPU is set as execution unit). + * @param[out] output Pointer to memory, where the frequency domain elements are written to. Can + * be located at Host or GPU memory (if GPU is set as processing unit). + * @param[in] scaling Controls scaling of output. SPFFT_NO_SCALING to disable or + * SPFFT_FULL_SCALING to scale by factor 1 / (dim_x() * dim_y() * dim_z()). + * @return Error code or SPFFT_SUCCESS. + */ +SpfftError spfft_float_transform_forward(SpfftFloatTransform transform, + SpfftProcessingUnitType inputLocation, float* output, + SpfftScalingType scaling); + +/** + * Execute a backward transform from frequency domain to space domain. + * + * @param[in] transform Handle to the transform. + * @param[in] input Input data in frequency domain. Must match the indices provided at transform + * creation. Can be located at Host or GPU memory, if GPU is set as processing unit. + * @param[in] outputLocation The processing unit, to place the output at. Can be SPFFT_PU_HOST or + * SPFFT_PU_GPU (if GPU is set as execution unit). + * @return Error code or SPFFT_SUCCESS. + */ +SpfftError spfft_float_transform_backward(SpfftFloatTransform transform, const float* input, + SpfftProcessingUnitType outputLocation); +/** + * Provides access to the space domain data. + * + * @param[in] transform Handle to the transform. + * @param[in] dataLocation The processing unit to query for the data. Can be SPFFT_PU_HOST or + * SPFFT_PU_GPU (if GPU is set as execution unit). + * @param[out] data Pointer to space domain data on given processing unit. Alignment is guaranteed + * to fulfill requirements for std::complex and C language complex types. + * @throw GenericError SpFFT error. Can be a derived type. + * @throw std::exception Error from standard library calls. Can be a derived type. + * @return Error code or SPFFT_SUCCESS. + */ +SpfftError spfft_float_transform_get_space_domain(SpfftFloatTransform transform, + SpfftProcessingUnitType dataLocation, + float** data); + +/** + * Access a transform parameter. + * @param[in] transform Handle to the transform. + * @param[out] dimX Dimension in x. + * @return Error code or SPFFT_SUCCESS. 
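+ *
+ * A minimal sketch of the out-parameter pattern shared by the accessors below (transform is a
+ * hypothetical, previously created handle):
+ * \code
+ * int dimX = 0;
+ * SpfftError status = spfft_float_transform_dim_x(transform, &dimX);
+ * \endcode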
+ */ +SpfftError spfft_float_transform_dim_x(SpfftFloatTransform transform, int* dimX); + +/** + * Access a transform parameter. + * @param[in] transform Handle to the transform. + * @param[out] dimY Dimension in y. + * @return Error code or SPFFT_SUCCESS. + */ +SpfftError spfft_float_transform_dim_y(SpfftFloatTransform transform, int* dimY); + +/** + * Access a transform parameter. + * @param[in] transform Handle to the transform. + * @param[out] dimZ Dimension in z. + * @return Error code or SPFFT_SUCCESS. + */ +SpfftError spfft_float_transform_dim_z(SpfftFloatTransform transform, int* dimZ); + +/** + * Access a transform parameter. + * @param[in] transform Handle to the transform. + * @param[out] localZLength size in z of the slice in space domain on the local MPI rank. + * @return Error code or SPFFT_SUCCESS. + */ +SpfftError spfft_float_transform_local_z_length(SpfftFloatTransform transform, int* localZLength); + +/** + * Access a transform parameter. + * @param[in] transform Handle to the transform. + * @param[out] offset Offset in z of the space domain slice held by the local MPI rank. + * @return Error code or SPFFT_SUCCESS. + */ +SpfftError spfft_float_transform_local_z_offset(SpfftFloatTransform transform, int* offset); + +/** + * Access a transform parameter. + * @param[in] transform Handle to the transform. + * @param[out] numLocalElements Number of local elements in frequency domain. + * @return Error code or SPFFT_SUCCESS. + */ +SpfftError spfft_float_transform_num_local_elements(SpfftFloatTransform transform, int* numLocalElements); + +/** + * Access a transform parameter. + * @param[in] transform Handle to the transform. + * @param[out] deviceId The GPU device id used. Returns always 0, if no GPU support is enabled. + * @return Error code or SPFFT_SUCCESS. + */ +SpfftError spfft_float_transform_device_id(SpfftFloatTransform transform, int* deviceId); + +/** + * Access a transform parameter. + * @param[in] transform Handle to the transform. + * @param[out] numThreads The exact number of threads used by transforms created from this grid. May + * be less than the maximum given to the constructor. Always 1, if not compiled with OpenMP support. + * @return Error code or SPFFT_SUCCESS. + */ +SpfftError spfft_float_transform_num_threads(SpfftFloatTransform transform, int* numThreads); + +#ifdef SPFFT_MPI +/** + * Access a transform parameter. + * @param[in] transform Handle to the transform. + * @param[out] comm The internal MPI communicator. + * @return Error code or SPFFT_SUCCESS. + */ +SpfftError spfft_float_transform_communicator(SpfftFloatTransform transform, MPI_Comm* comm); +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/spfft/transform_float.hpp b/include/spfft/transform_float.hpp new file mode 100644 index 0000000..f9eb074 --- /dev/null +++ b/include/spfft/transform_float.hpp @@ -0,0 +1,222 @@ +/* + * Copyright (c) 2019 ETH Zurich, Simon Frasch + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef SPFFT_TRANSFORM_FLOAT_HPP +#define SPFFT_TRANSFORM_FLOAT_HPP + +#include +#include "spfft/config.h" +#include "spfft/types.h" + +#ifdef SPFFT_MPI +#include +#endif + +namespace spfft { + +template +class TransformInternal; + +class Grid; + +template +class MultiTransformInternal; + +template +class GridInternal; + +#ifdef SPFFT_SINGLE_PRECISION + +class GridFloat; + +/** + * A transform in single precision with fixed dimensions. Shares memory with other transform created + * from the same Grid object. + */ +class TransformFloat { +public: + using ValueType = float; + /** + * Default copy constructor. + */ + TransformFloat(const TransformFloat&) = default; + + /** + * Default move constructor. + */ + TransformFloat(TransformFloat&&) = default; + + /** + * Default copy operator. + */ + TransformFloat& operator=(const TransformFloat&) = default; + + /** + * Default move operator. + */ + TransformFloat& operator=(TransformFloat&&) = default; + + /** + * Clone transform. + * + * @return Independent transform with the same parameters, but with new underlying grid. + */ + TransformFloat clone() const; + + /** + * Access a transform parameter. + * @return Type of transform. + */ + SpfftTransformType type() const; + + /** + * Access a transform parameter. + * @return Dimension in x. + */ + int dim_x() const; + + /** + * Access a transform parameter. + * @return Dimension in y. + */ + int dim_y() const; + + /** + * Access a transform parameter. + * @return Dimension in z. + */ + int dim_z() const; + + /** + * Access a transform parameter. + * @return Length in z of the space domain slice held by the local MPI rank. + */ + int local_z_length() const; + + /** + * Access a transform parameter. + * @return Offset in z of the space domain slice held by the local MPI rank. + */ + int local_z_offset() const; + + /** + * Access a transform parameter. + * @return Number of elements in the space domain slice held by the local MPI rank. + */ + int local_slice_size() const; + + /** + * Access a transform parameter. + * @return Number of elements in frequency domain. + */ + int num_local_elements() const; + + /** + * Access a transform parameter. + * @return The processing unit used for calculations. Can be SPFFT_PU_HOST or SPFFT_PU_GPU. + */ + SpfftProcessingUnitType processing_unit() const; + + /** + * Access a transform parameter. + * @return The GPU device id used. Returns always 0, if no GPU support is enabled. + */ + int device_id() const; + + /** + * Access a transform parameter. 
+   * @return The exact number of threads used by transforms created from this grid. May be less
+   * than the maximum given to the constructor. Always 1 if not compiled with OpenMP support.
+   */
+  int num_threads() const;
+
+#ifdef SPFFT_MPI
+  /**
+   * Access a transform parameter.
+   * @return The internal MPI communicator.
+   */
+  MPI_Comm communicator() const;
+#endif
+
+  /**
+   * Provides access to the space domain data.
+   *
+   * @param[in] dataLocation The processing unit to query for the data. Can be SPFFT_PU_HOST or
+   * SPFFT_PU_GPU (if GPU is set as execution unit).
+   * @return Pointer to space domain data on the given processing unit. Alignment is guaranteed to
+   * fulfill the requirements for std::complex and C language complex types.
+   * @throw GenericError SpFFT error. Can be a derived type.
+   * @throw std::exception Error from standard library calls. Can be a derived type.
+   */
+  float* space_domain_data(SpfftProcessingUnitType dataLocation);
+
+  /**
+   * Execute a forward transform from space domain to frequency domain.
+   *
+   * @param[in] inputLocation The processing unit to take the input from. Can be SPFFT_PU_HOST or
+   * SPFFT_PU_GPU (if GPU is set as execution unit).
+   * @param[out] output Pointer to memory, where the frequency domain elements are written to. Can
+   * be located in host or GPU memory (if GPU is set as processing unit).
+   * @param[in] scaling Controls scaling of output. SPFFT_NO_SCALING to disable or
+   * SPFFT_FULL_SCALING to scale by factor 1 / (dim_x() * dim_y() * dim_z()).
+   * @throw GenericError SpFFT error. Can be a derived type.
+   * @throw std::exception Error from standard library calls. Can be a derived type.
+   */
+  void forward(SpfftProcessingUnitType inputLocation, float* output,
+               SpfftScalingType scaling = SPFFT_NO_SCALING);
+
+  /**
+   * Execute a backward transform from frequency domain to space domain.
+   *
+   * @param[in] input Input data in frequency domain. Must match the indices provided at transform
+   * creation. Can be located in host or GPU memory, if GPU is set as processing unit.
+   * @param[in] outputLocation The processing unit to place the output at. Can be SPFFT_PU_HOST or
+   * SPFFT_PU_GPU (if GPU is set as execution unit).
+   * @throw GenericError SpFFT error. Can be a derived type.
+   * @throw std::exception Error from standard library calls. Can be a derived type.
+   */
+  void backward(const float* input, SpfftProcessingUnitType outputLocation);
+
+private:
+  /*! \cond PRIVATE */
+  friend GridFloat;
+  friend MultiTransformInternal<TransformFloat>;
+
+  TransformFloat(const std::shared_ptr<GridInternal<float>>& grid,
+                 SpfftProcessingUnitType executionUnit, SpfftTransformType transformType, int dimX,
+                 int dimY, int dimZ, int localZLength, int numLocalElements,
+                 SpfftIndexFormatType dataFormat, const int* indices);
+
+  explicit TransformFloat(std::shared_ptr<TransformInternal<float>> transform);
+
+  std::shared_ptr<TransformInternal<float>> transform_;
+  /*! \endcond */
+};
+#endif
+
+}  // namespace spfft
+#endif
diff --git a/include/spfft/types.h b/include/spfft/types.h
new file mode 100644
index 0000000..785bdc8
--- /dev/null
+++ b/include/spfft/types.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef SPFFT_TYPES_H +#define SPFFT_TYPES_H + +#include "spfft/config.h" + +enum SpfftExchangeType { + /** + * Default exchange. Equivalent to SPFFT_EXCH_COMPACT_BUFFERED. + */ + SPFFT_EXCH_DEFAULT, + /** + * Exchange based on MPI_Alltoall. + */ + SPFFT_EXCH_BUFFERED, + /** + * Exchange based on MPI_Alltoall in single precision. + * Slight accuracy loss for double precision transforms due to conversion to float prior to MPI + * exchange. + */ + SPFFT_EXCH_BUFFERED_FLOAT, + /** + * Exchange based on MPI_Alltoallv. + */ + SPFFT_EXCH_COMPACT_BUFFERED, + /** + * Exchange based on MPI_Alltoallv in single precision. + * Slight accuracy loss for double precision transforms due to conversion to float prior to MPI + * exchange. + */ + SPFFT_EXCH_COMPACT_BUFFERED_FLOAT, + /** + * Exchange based on MPI_Alltoallw. + */ + SPFFT_EXCH_UNBUFFERED +}; + +/** + * Processing unit type + */ +enum SpfftProcessingUnitType { + /** + * HOST + */ + SPFFT_PU_HOST = 1, + /** + * GPU + */ + SPFFT_PU_GPU = 2 +}; + +enum SpfftIndexFormatType { + /** + * Triplets of x,y,z frequency indices + */ + SPFFT_INDEX_TRIPLETS +}; + +enum SpfftTransformType { + /** + * Complex-to-Complex transform + */ + SPFFT_TRANS_C2C, + + /** + * Real-to-Complex transform + */ + SPFFT_TRANS_R2C +}; + +enum SpfftScalingType { + /** + * No scaling + */ + SPFFT_NO_SCALING, + /** + * Full scaling + */ + SPFFT_FULL_SCALING +}; + +#ifndef __cplusplus +/*! \cond PRIVATE */ +// C only +typedef enum SpfftExchangeType SpfftExchangeType; +typedef enum SpfftProcessingUnitType SpfftProcessingUnitType; +typedef enum SpfftTransformType SpfftTransformType; +typedef enum SpfftIndexFormatType SpfftIndexFormatType; +typedef enum SpfftScalingType SpfftScalingType; +/*! 
\endcond */
+#endif  // cpp
+#endif
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
new file mode 100644
index 0000000..b2df1eb
--- /dev/null
+++ b/src/CMakeLists.txt
@@ -0,0 +1,114 @@
+set(SPFFT_SOURCE_FILES
+  memory/aligned_allocation.cpp
+  timing/timing.cpp
+  timing/host_timing.cpp
+  parameters/parameters.cpp
+  execution/execution_host.cpp
+  spfft/transform.cpp
+  spfft/transform_internal.cpp
+  spfft/multi_transform.cpp
+  spfft/grid.cpp
+  spfft/grid_internal.cpp
+  )
+
+if(SPFFT_SINGLE_PRECISION)
+  list(APPEND SPFFT_SOURCE_FILES
+    spfft/transform_float.cpp
+    spfft/multi_transform_float.cpp
+    spfft/grid_float.cpp
+    )
+endif()
+
+set(SPFFT_GPU_KERNELS)
+if(SPFFT_CUDA OR SPFFT_ROCM)
+  list(APPEND SPFFT_GPU_KERNELS
+    transpose/gpu_kernels/local_transpose_kernels.cu
+    compression/gpu_kernels/compression_kernels.cu
+    symmetry/gpu_kernels/symmetry_kernels.cu
+    transpose/gpu_kernels/buffered_kernels.cu
+    transpose/gpu_kernels/compact_buffered_kernels.cu
+    )
+  list(APPEND SPFFT_SOURCE_FILES
+    execution/execution_gpu.cpp
+    )
+  if(SPFFT_MPI)
+    list(APPEND SPFFT_SOURCE_FILES
+      transpose/transpose_mpi_buffered_gpu.cpp
+      transpose/transpose_mpi_compact_buffered_gpu.cpp
+      transpose/transpose_mpi_unbuffered_gpu.cpp
+      )
+  endif()
+endif()
+
+if(SPFFT_CUDA)
+  list(APPEND SPFFT_SOURCE_FILES ${SPFFT_GPU_KERNELS})
+endif()
+
+if(SPFFT_ROCM)
+  rocm_hip_add_library(spfft_device ${SPFFT_GPU_KERNELS} STATIC INCLUDE_DIRS ${SPFFT_INCLUDE_DIRS} ${SPFFT_EXTERNAL_INCLUDE_DIRS}
+    FLAGS --amdgpu-target=gfx803 --amdgpu-target=gfx900 --amdgpu-target=gfx906)
+endif()
+
+if(SPFFT_MPI)
+  list(APPEND SPFFT_SOURCE_FILES
+    transpose/transpose_mpi_buffered_host.cpp
+    transpose/transpose_mpi_compact_buffered_host.cpp
+    transpose/transpose_mpi_unbuffered_host.cpp
+    )
+endif()
+
+add_library(spfft ${SPFFT_LIBRARY_TYPE}
+  ${SPFFT_SOURCE_FILES}
+  )
+
+target_compile_options(spfft PRIVATE ${SPFFT_DEFINITIONS} ${SPFFT_EXTERNAL_COMPILE_OPTIONS})
+target_include_directories(spfft PRIVATE ${SPFFT_EXTERNAL_INCLUDE_DIRS})
+target_include_directories(spfft PRIVATE ${PROJECT_SOURCE_DIR}/include)
+target_include_directories(spfft PRIVATE ${PROJECT_SOURCE_DIR}/src)
+target_include_directories(spfft PRIVATE ${PROJECT_BINARY_DIR})
+target_link_libraries(spfft PRIVATE ${SPFFT_EXTERNAL_LIBS})
+target_link_libraries(spfft INTERFACE ${SPFFT_INTERFACE_LIBS})
+
+if(SPFFT_ROCM)
+  target_link_libraries(spfft PRIVATE spfft_device)
+endif()
+
+target_include_directories(spfft INTERFACE $<INSTALL_INTERFACE:include>) # for install(EXPORT ...)
+target_include_directories(spfft INTERFACE $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include> $<BUILD_INTERFACE:${PROJECT_BINARY_DIR}>) # for export(...)
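+
+# Example (illustrative): a downstream CMake project can consume the exported
+# target roughly like this after installation; "my_app" is a placeholder name:
+#   find_package(SpFFT REQUIRED)                 # loads SpFFTConfig.cmake from lib/cmake
+#   target_link_libraries(my_app PRIVATE spfft)  # no namespace is set on the export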
+
+# set list of interface libraries for exporting
+if(POLICY CMP0022)
+  set_target_properties(spfft PROPERTIES INTERFACE_LINK_LIBRARIES "${SPFFT_INTERFACE_LIBS}")
+else()
+  set_target_properties(spfft PROPERTIES LINK_INTERFACE_LIBRARIES "${SPFFT_INTERFACE_LIBS}")
+endif()
+
+
+# generate cmake package
+include(CMakePackageConfigHelpers)
+write_basic_package_version_file(
+  "${PROJECT_BINARY_DIR}/SpFFTConfigVersion.cmake"
+  VERSION ${PROJECT_VERSION}
+  COMPATIBILITY AnyNewerVersion
+)
+export(TARGETS spfft FILE ${PROJECT_BINARY_DIR}/SpFFTTargets.cmake)
+configure_file(${PROJECT_SOURCE_DIR}/cmake/SpFFTConfig.cmake
+  "${PROJECT_BINARY_DIR}/SpFFTConfig.cmake"
+  @ONLY
+)
+
+# installation commands
+if(SPFFT_INSTALL)
+  install(TARGETS spfft DESTINATION lib EXPORT SpFFTTargets)
+  install(DIRECTORY ${PROJECT_SOURCE_DIR}/include/spfft DESTINATION include FILES_MATCHING PATTERN "*.h" PATTERN "*.hpp")
+  install(FILES ${PROJECT_BINARY_DIR}/spfft/config.h DESTINATION include/spfft)
+  install(EXPORT SpFFTTargets DESTINATION lib/cmake)
+  install(
+    FILES
+    "${PROJECT_BINARY_DIR}/SpFFTConfig.cmake"
+    "${PROJECT_BINARY_DIR}/SpFFTConfigVersion.cmake"
+    DESTINATION
+      lib/cmake
+  )
+endif()
diff --git a/src/compression/compression_gpu.hpp b/src/compression/compression_gpu.hpp
new file mode 100644
index 0000000..1aeccad
--- /dev/null
+++ b/src/compression/compression_gpu.hpp
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef SPFFT_COMPRESSION_GPU_HPP
+#define SPFFT_COMPRESSION_GPU_HPP
+
+#include <complex>
+#include <cstddef>
+#include <memory>
+#include <vector>
+#include "compression/gpu_kernels/compression_kernels.hpp"
+#include "compression/indices.hpp"
+#include "gpu_util/gpu_fft_api.hpp"
+#include "gpu_util/gpu_stream_handle.hpp"
+#include "gpu_util/gpu_transfer.hpp"
+#include "memory/array_view_utility.hpp"
+#include "memory/gpu_array.hpp"
+#include "memory/gpu_array_view.hpp"
+#include "parameters/parameters.hpp"
+#include "spfft/config.h"
+#include "util/common_types.hpp"
+#include "util/type_check.hpp"
+
+namespace spfft {
+
+// Handles packing and unpacking of sparse frequency values for single or double precision on GPU
+class CompressionGPU {
+public:
+  CompressionGPU(const std::shared_ptr<Parameters>& param)
+      : indicesGPU_(param->local_value_indices().size()) {
+    // stream MUST synchronize with default stream
+    copy_to_gpu(param->local_value_indices(), indicesGPU_);
+  }
+
+  // Pack values into output buffer
+  template <typename T>
+  auto compress(const GPUStreamHandle& stream,
+                const GPUArrayView2D<typename gpu::fft::ComplexType<T>::type> input, T* output,
+                const bool useScaling, const T scalingFactor = 1.0) -> void {
+    static_assert(IsFloatOrDouble<T>::value, "Type T must be float or double");
+    compress_gpu(stream.get(), create_1d_view(indicesGPU_, 0, indicesGPU_.size()), input, output,
+                 useScaling, scalingFactor);
+  }
+
+  // Unpack values into z-stick collection
+  template <typename T>
+  auto decompress(const GPUStreamHandle& stream, const T* input,
+                  GPUArrayView2D<typename gpu::fft::ComplexType<T>::type> output) -> void {
+    static_assert(IsFloatOrDouble<T>::value, "Type T must be float or double");
+    // zero the full output before scattering the sparse values into it
+    gpu::check_status(gpu::memset_async(
+        static_cast<void*>(output.data()), 0,
+        output.size() * sizeof(typename decltype(output)::ValueType), stream.get()));
+    decompress_gpu(stream.get(), create_1d_view(indicesGPU_, 0, indicesGPU_.size()), input,
+                   output);
+  }
+
+private:
+  GPUArray<int> indicesGPU_;
+};
+}  // namespace spfft
+
+#endif
+
diff --git a/src/compression/compression_host.hpp b/src/compression/compression_host.hpp
new file mode 100644
index 0000000..60f0967
--- /dev/null
+++ b/src/compression/compression_host.hpp
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef SPFFT_COMPRESSION_HOST_HPP
+#define SPFFT_COMPRESSION_HOST_HPP
+
+#include <complex>
+#include <cstring>
+#include <memory>
+#include <vector>
+#include "compression/indices.hpp"
+#include "memory/host_array_view.hpp"
+#include "memory/host_array_const_view.hpp"
+#include "parameters/parameters.hpp"
+#include "spfft/config.h"
+#include "util/common_types.hpp"
+#include "util/omp_definitions.hpp"
+
+namespace spfft {
+// Handles packing and unpacking of sparse frequency values for single or double precision on Host
+class CompressionHost {
+public:
+  CompressionHost(const int numThreads, const std::shared_ptr<Parameters>& param)
+      : numThreads_(numThreads), param_(param) {}
+
+  // Pack values into output buffer
+  template <typename T>
+  auto compress(const HostArrayView2D<std::complex<T>> input2d, T* output, bool useScaling,
+                const T scalingFactor = 1.0) const -> void {
+    const auto& indices = param_->local_value_indices();
+    auto input =
+        HostArrayConstView1D<std::complex<T>>(input2d.data(), input2d.size(), input2d.pinned());
+
+    if (useScaling) {
+      SPFFT_OMP_PRAGMA("omp for schedule(static)")
+      for (SizeType i = 0; i < indices.size(); ++i) {
+        const auto value = scalingFactor * input(indices[i]);
+        output[2 * i] = value.real();
+        output[2 * i + 1] = value.imag();
+      }
+    } else {
+      SPFFT_OMP_PRAGMA("omp for schedule(static)")
+      for (SizeType i = 0; i < indices.size(); ++i) {
+        const auto value = input(indices[i]);
+        output[2 * i] = value.real();
+        output[2 * i + 1] = value.imag();
+      }
+    }
+  }
+
+  // Unpack values into z-stick collection
+  template <typename T>
+  auto decompress(const T* input, HostArrayView2D<std::complex<T>> output2d) const -> void {
+    const auto& indices = param_->local_value_indices();
+    auto output =
+        HostArrayView1D<std::complex<T>>(output2d.data(), output2d.size(), output2d.pinned());
+
+    // ensure values are padded with zeros
+    SPFFT_OMP_PRAGMA("omp for schedule(static)")  // implicit barrier
+    for (SizeType stick = 0; stick < output2d.dim_outer(); ++stick) {
+      std::memset(static_cast<void*>(&output2d(stick, 0)), 0,
+                  sizeof(typename decltype(output2d)::ValueType) * output2d.dim_inner());
+    }
+
+    SPFFT_OMP_PRAGMA("omp for schedule(static)")
+    for (SizeType i = 0; i < indices.size(); ++i) {
+      output(indices[i]) = std::complex<T>(input[2 * i], input[2 * i + 1]);
+    }
+  }
+
+private:
+  int numThreads_;
+  std::shared_ptr<Parameters> param_;
+};
+}  // namespace spfft
+
+#endif
+
diff --git a/src/compression/gpu_kernels/compression_kernels.cu b/src/compression/gpu_kernels/compression_kernels.cu
new file mode 100644
index 0000000..8f1a551
--- /dev/null
+++ b/src/compression/gpu_kernels/compression_kernels.cu
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <algorithm>
+#include <cassert>
+#include "gpu_util/gpu_fft_api.hpp"
+#include "gpu_util/gpu_runtime.hpp"
+#include "memory/gpu_array_const_view.hpp"
+#include "memory/gpu_array_view.hpp"
+#include "memory/array_view_utility.hpp"
+
+namespace spfft {
+
+template <typename T>
+__global__ static void decompress_kernel(
+    const GPUArrayConstView1D<int> indices, const T* input,
+    GPUArrayView1D<typename gpu::fft::ComplexType<T>::type> output) {
+  // grid-stride loop over all sparse indices
+  for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < indices.size();
+       idx += gridDim.x * blockDim.x) {
+    const int valueIdx = indices(idx);
+    typename gpu::fft::ComplexType<T>::type value;
+    value.x = input[2 * idx];
+    value.y = input[2 * idx + 1];
+    output(valueIdx) = value;
+  }
+}
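+
+// Launch configuration note: the grid size in the wrappers below is capped (at
+// 4320 blocks) and the kernels iterate with a grid-stride loop, so any number
+// of indices is processed correctly regardless of the cap.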
+auto decompress_gpu(const gpu::StreamType stream, const GPUArrayView1D<int>& indices,
+                    const double* input,
+                    GPUArrayView2D<typename gpu::fft::ComplexType<double>::type> output) -> void {
+  assert(indices.size() <= output.size());
+  const dim3 threadBlock(256);
+  const dim3 threadGrid(
+      std::min(static_cast<int>((indices.size() + threadBlock.x - 1) / threadBlock.x), 4320));
+  launch_kernel(decompress_kernel<double>, threadGrid, threadBlock, 0, stream, indices, input,
+                create_1d_view(output, 0, output.size()));
+}
+
+auto decompress_gpu(const gpu::StreamType stream, const GPUArrayView1D<int>& indices,
+                    const float* input,
+                    GPUArrayView2D<typename gpu::fft::ComplexType<float>::type> output) -> void {
+  assert(indices.size() <= output.size());
+  const dim3 threadBlock(256);
+  const dim3 threadGrid(
+      std::min(static_cast<int>((indices.size() + threadBlock.x - 1) / threadBlock.x), 4320));
+  launch_kernel(decompress_kernel<float>, threadGrid, threadBlock, 0, stream, indices, input,
+                create_1d_view(output, 0, output.size()));
+}
+
+template <typename T>
+__global__ static void compress_kernel(
+    const GPUArrayConstView1D<int> indices,
+    GPUArrayConstView1D<typename gpu::fft::ComplexType<T>::type> input, T* output) {
+  for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < indices.size();
+       idx += gridDim.x * blockDim.x) {
+    const int valueIdx = indices(idx);
+    const auto value = input(valueIdx);
+    output[2 * idx] = value.x;
+    output[2 * idx + 1] = value.y;
+  }
+}
+
+template <typename T>
+__global__ static void compress_kernel_scaled(
+    const GPUArrayConstView1D<int> indices,
+    GPUArrayConstView1D<typename gpu::fft::ComplexType<T>::type> input, T* output,
+    const T scalingFactor) {
+  for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < indices.size();
+       idx += gridDim.x * blockDim.x) {
+    const int valueIdx = indices(idx);
+    const auto value = input(valueIdx);
+    output[2 * idx] = scalingFactor * value.x;
+    output[2 * idx + 1] = scalingFactor * value.y;
+  }
+}
+
+auto compress_gpu(const gpu::StreamType stream, const GPUArrayView1D<int>& indices,
+                  GPUArrayView2D<typename gpu::fft::ComplexType<double>::type> input,
+                  double* output, const bool useScaling, const double scalingFactor) -> void {
+  const dim3 threadBlock(256);
+  const dim3 threadGrid(
+      std::min(static_cast<int>((indices.size() + threadBlock.x - 1) / threadBlock.x), 4320));
+
+  if (useScaling) {
+    launch_kernel(compress_kernel_scaled<double>, threadGrid, threadBlock, 0, stream, indices,
+                  create_1d_view(input, 0, input.size()), output, scalingFactor);
+  } else {
+    launch_kernel(compress_kernel<double>, threadGrid, threadBlock, 0, stream, indices,
+                  create_1d_view(input, 0, input.size()), output);
+  }
+}
+
+auto compress_gpu(const gpu::StreamType stream, const GPUArrayView1D<int>& indices,
+                  GPUArrayView2D<typename gpu::fft::ComplexType<float>::type> input, float* output,
+                  const bool useScaling, const float scalingFactor) -> void {
+  const dim3 threadBlock(256);
+  const dim3 threadGrid(
+      std::min(static_cast<int>((indices.size() + threadBlock.x - 1) / threadBlock.x), 4320));
+  if (useScaling) {
+    launch_kernel(compress_kernel_scaled<float>, threadGrid, threadBlock, 0, stream, indices,
+                  create_1d_view(input, 0, input.size()), output, scalingFactor);
+  } else {
+    launch_kernel(compress_kernel<float>, threadGrid, threadBlock, 0, stream, indices,
+                  create_1d_view(input, 0, input.size()), output);
+  }
+}
+}  // namespace spfft
+
diff --git a/src/compression/gpu_kernels/compression_kernels.hpp b/src/compression/gpu_kernels/compression_kernels.hpp
new file mode 100644
index 0000000..6a4b01f
--- /dev/null
+++ b/src/compression/gpu_kernels/compression_kernels.hpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef COMPRESSION_KERNELS_HPP
+#define COMPRESSION_KERNELS_HPP
+#include "memory/gpu_array_view.hpp"
+#include "gpu_util/gpu_runtime_api.hpp"
+#include "gpu_util/gpu_fft_api.hpp"
+
+namespace spfft {
+
+auto decompress_gpu(const gpu::StreamType stream, const GPUArrayView1D<int>& indices,
+                    const double* input,
+                    GPUArrayView2D<typename gpu::fft::ComplexType<double>::type> output) -> void;
+
+auto decompress_gpu(const gpu::StreamType stream, const GPUArrayView1D<int>& indices,
+                    const float* input,
+                    GPUArrayView2D<typename gpu::fft::ComplexType<float>::type> output) -> void;
+
+auto compress_gpu(const gpu::StreamType stream, const GPUArrayView1D<int>& indices,
+                  GPUArrayView2D<typename gpu::fft::ComplexType<double>::type> input,
+                  double* output, const bool useScaling, const double scalingFactor) -> void;
+
+auto compress_gpu(const gpu::StreamType stream, const GPUArrayView1D<int>& indices,
+                  GPUArrayView2D<typename gpu::fft::ComplexType<float>::type> input,
+                  float* output, const bool useScaling, const float scalingFactor) -> void;
+
+}  // namespace spfft
+
+#endif
diff --git a/src/compression/indices.hpp b/src/compression/indices.hpp
new file mode 100644
index 0000000..2aa7d8d
--- /dev/null
+++ b/src/compression/indices.hpp
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef SPFFT_INDICES_HPP
+#define SPFFT_INDICES_HPP
+
+#include <algorithm>
+#include <map>
+#include <set>
+#include <utility>
+#include <vector>
+#include "spfft/config.h"
+#include "spfft/exceptions.hpp"
+#include "util/common_types.hpp"
+
+#ifdef SPFFT_MPI
+#include "mpi_util/mpi_communicator_handle.hpp"
+#include "mpi_util/mpi_datatype_handle.hpp"
+#include "mpi_util/mpi_match_elementary_type.hpp"
+#endif
+
+namespace spfft {
+
+// convert [-N, N) frequency index to [0, N) for FFT input
+inline auto to_storage_index(const int dim, const int index) -> int {
+  if (index < 0) {
+    return dim + index;
+  } else {
+    return index;
+  }
+}
+
+#ifdef SPFFT_MPI
+inline auto create_distributed_transform_indices(const MPICommunicatorHandle& comm,
+                                                 std::vector<int> localSticks)
+    -> std::vector<std::vector<int>> {
+  std::vector<MPI_Request> sendRequests(comm.size());
+
+  constexpr int tag = 442;  // random tag (must be less than 32768)
+
+  // send local stick indices
+  for (int r = 0; r < static_cast<int>(comm.size()); ++r) {
+    if (r != static_cast<int>(comm.rank())) {
+      mpi_check_status(MPI_Isend(localSticks.data(), localSticks.size(), MPI_INT, r, tag,
+                                 comm.get(), &(sendRequests[r])));
+    }
+  }
+
+  std::vector<std::vector<int>> globalXYIndices(comm.size());
+
+  // recv all other stick indices
+  for (int r = 0; r < static_cast<int>(comm.size()); ++r) {
+    if (r != static_cast<int>(comm.rank())) {
+      // get recv count
+      MPI_Status status;
+      MPI_Probe(r, tag, comm.get(), &status);
+      int recvCount = 0;
+      MPI_Get_count(&status, MPI_INT, &recvCount);
+
+      // recv data
+      globalXYIndices[r].resize(recvCount);
+      MPI_Recv(globalXYIndices[r].data(), recvCount, MPI_INT, r, tag,
+               comm.get(), MPI_STATUS_IGNORE);
+    }
+  }
+
+  // wait for all sends to finish
+  for (int r = 0; r < static_cast<int>(comm.size()); ++r) {
+    if (r != static_cast<int>(comm.rank())) {
+      MPI_Wait(&(sendRequests[r]), MPI_STATUS_IGNORE);
+    }
+  }
+
+  // move local sticks into transform indices object AFTER sends are finished
+  globalXYIndices[comm.rank()] = std::move(localSticks);
+
+  return globalXYIndices;
+}
+#endif
+
+inline auto check_stick_duplicates(const std::vector<std::vector<int>>& indicesPerRank) -> void {
+  // check for duplicate z-stick indices across all ranks
+  std::set<int> globalXYIndices;
+  for (const auto& rankIndices : indicesPerRank) {
+    for (const auto& index : rankIndices) {
+      if (globalXYIndices.count(index)) {
+        throw DuplicateIndicesError();
+      }
+
+      globalXYIndices.insert(index);
+    }
+  }
+}
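+
+// Example (illustrative): with dimX = dimY = dimZ = 4 and no hermitian symmetry,
+// the centered triplet (-1, 0, 2) maps through to_storage_index() to the storage
+// indices (3, 0, 2); all values belonging to one (x, y) stick end up contiguous in z.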
+
+// convert index triplets for every value into z-stick index / z index pairs and
+// the ordered set of z-stick xy-indices.
+inline auto convert_index_triplets(const bool hermitianSymmetry, const int dimX, const int dimY,
+                                   const int dimZ, const int numValues, const int* xIndices,
+                                   const int* yIndices, const int* zIndices, const int stride)
+    -> std::pair<std::vector<int>, std::vector<int>> {
+  // check if indices are non-negative or centered
+  bool centeredIndices = false;
+  for (int i = 0; i < numValues; ++i) {
+    if (xIndices[i * stride] < 0 || yIndices[i * stride] < 0 || zIndices[i * stride] < 0) {
+      centeredIndices = true;
+      break;
+    }
+  }
+
+  const int maxX = (hermitianSymmetry || centeredIndices ? dimX / 2 + 1 : dimX) - 1;
+  const int maxY = (centeredIndices ? dimY / 2 + 1 : dimY) - 1;
+  const int maxZ = (centeredIndices ? dimZ / 2 + 1 : dimZ) - 1;
+  const int minX = hermitianSymmetry ? 0 : maxX - dimX + 1;
+  const int minY = maxY - dimY + 1;
+  const int minZ = maxZ - dimZ + 1;
+
+  // check if indices are inside bounds
+  for (int i = 0; i < numValues; ++i) {
+    if (xIndices[i * stride] < minX || xIndices[i * stride] > maxX) throw InvalidIndicesError();
+    if (yIndices[i * stride] < minY || yIndices[i * stride] > maxY) throw InvalidIndicesError();
+    if (zIndices[i * stride] < minZ || zIndices[i * stride] > maxZ) throw InvalidIndicesError();
+  }
+
+  // store all unique xy index pairs in an ordered container
+  std::map<int, int> sortedXYIndices;  // key = index in xy-plane, value = stick index
+  for (int i = 0; i < numValues; ++i) {
+    const auto x = to_storage_index(dimX, xIndices[i * stride]);
+    const auto y = to_storage_index(dimY, yIndices[i * stride]);
+
+    sortedXYIndices[x * dimY + y] = 0;
+  }
+
+  // assign z-stick indices
+  int count = 0;
+  for (auto& pair : sortedXYIndices) {
+    pair.second = count;
+    ++count;
+  }
+
+  // store index for each element. Each z-stick is contiguous.
+  std::vector<int> valueIndices;
+  valueIndices.reserve(numValues);
+  for (int i = 0; i < numValues; ++i) {
+    const auto x = to_storage_index(dimX, xIndices[i * stride]);
+    const auto y = to_storage_index(dimY, yIndices[i * stride]);
+    const auto z = to_storage_index(dimZ, zIndices[i * stride]);
+
+    valueIndices.emplace_back(sortedXYIndices[x * dimY + y] * dimZ + z);
+  }
+
+  // store ordered unique xy-index pairs
+  std::vector<int> stickIndices;
+  stickIndices.reserve(sortedXYIndices.size());
+  for (auto& pair : sortedXYIndices) {
+    stickIndices.emplace_back(pair.first);
+  }
+
+  return {std::move(valueIndices), std::move(stickIndices)};
+}
+
+}  // namespace spfft
+
+#endif
+
diff --git a/src/execution/execution_gpu.cpp b/src/execution/execution_gpu.cpp
new file mode 100644
index 0000000..6466a7e
--- /dev/null
+++ b/src/execution/execution_gpu.cpp
@@ -0,0 +1,392 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#include "execution/execution_gpu.hpp" +#include "fft/transform_1d_gpu.hpp" +#include "fft/transform_2d_gpu.hpp" +#include "fft/transform_real_2d_gpu.hpp" +#include "gpu_util/gpu_pointer_translation.hpp" +#include "gpu_util/gpu_transfer.hpp" +#include "memory/array_view_utility.hpp" +#include "parameters/parameters.hpp" +#include "symmetry/symmetry_gpu.hpp" +#include "timing/timing.hpp" +#include "transpose/transpose_gpu.hpp" +#include "transpose/transpose_mpi_buffered_gpu.hpp" +#include "transpose/transpose_mpi_compact_buffered_gpu.hpp" +#include "transpose/transpose_mpi_unbuffered_gpu.hpp" + +namespace spfft { + +template +ExecutionGPU::ExecutionGPU(const int numThreads, std::shared_ptr param, + HostArray>& array1, + HostArray>& array2, + GPUArray::type>& gpuArray1, + GPUArray::type>& gpuArray2, + const std::shared_ptr>& fftWorkBuffer) + : stream_(false), + numThreads_(numThreads), + scalingFactor_(static_cast( + 1.0 / static_cast(param->dim_x() * param->dim_y() * param->dim_z()))), + zStickSymmetry_(new Symmetry()), + planeSymmetry_(new Symmetry()) { + const SizeType numLocalZSticks = param->num_z_sticks(0); + + // frequency data with z-sticks + freqDomainDataGPU_ = create_2d_view(gpuArray1, 0, numLocalZSticks, param->dim_z()); + freqDomainCompressedDataGPU_ = + GPUArrayView1D(reinterpret_cast(gpuArray2.data()), + param->local_value_indices().size() * 2, gpuArray2.device_id()); + + // Z + if (numLocalZSticks > 0) { + transformZ_ = std::unique_ptr( + new Transform1DGPU(freqDomainDataGPU_, stream_, fftWorkBuffer)); + if (param->transform_type() == SPFFT_TRANS_R2C) { + zStickSymmetry_.reset(new StickSymmetryGPU( + stream_, GPUArrayView1D::type>( + freqDomainDataGPU_.data() + + freqDomainDataGPU_.index(param->zero_zero_stick_index(), 0), + freqDomainDataGPU_.dim_inner(), freqDomainDataGPU_.device_id()))); + } + } + + if (numLocalZSticks > 0 && param->local_value_indices().size() > 0) { + compression_.reset(new CompressionGPU(param)); + } + + // Transpose + auto freqDomainXYGPU = create_3d_view(gpuArray2, 0, param->dim_z(), param->dim_y(), + param->dim_x_freq()); // must not overlap with z-sticks + transpose_.reset(new TransposeGPU(param, stream_, freqDomainXYGPU, freqDomainDataGPU_)); + + // XY + if (param->num_xy_planes(0) > 0) { + if (param->transform_type() == SPFFT_TRANS_R2C) { + planeSymmetry_.reset(new PlaneSymmetryGPU(stream_, freqDomainXYGPU)); + // NOTE: param->dim_x() != param->dim_x_freq() + spaceDomainDataExternalHost_ = + create_new_type_3d_view(array1, param->dim_z(), param->dim_y(), param->dim_x()); + spaceDomainDataExternalGPU_ = + create_new_type_3d_view(gpuArray1, param->dim_z(), param->dim_y(), param->dim_x()); + + transformXY_ = std::unique_ptr(new TransformReal2DGPU( + spaceDomainDataExternalGPU_, freqDomainXYGPU, stream_, fftWorkBuffer)); + + } else { + spaceDomainDataExternalHost_ = create_new_type_3d_view( + array1, param->dim_z(), param->dim_y(), 2 * param->dim_x_freq()); + spaceDomainDataExternalGPU_ = create_new_type_3d_view( 
+ freqDomainXYGPU, param->dim_z(), param->dim_y(), 2 * param->dim_x_freq()); + + transformXY_ = std::unique_ptr( + new Transform2DGPU(freqDomainXYGPU, stream_, fftWorkBuffer)); + } + } +} + +#ifdef SPFFT_MPI +template +ExecutionGPU::ExecutionGPU(MPICommunicatorHandle comm, const SpfftExchangeType exchangeType, + const int numThreads, std::shared_ptr param, + HostArray>& array1, + HostArray>& array2, + GPUArray::type>& gpuArray1, + GPUArray::type>& gpuArray2, + const std::shared_ptr>& fftWorkBuffer) + : stream_(false), + numThreads_(numThreads), + scalingFactor_(static_cast( + 1.0 / static_cast(param->dim_x() * param->dim_y() * param->dim_z()))), + zStickSymmetry_(new Symmetry()), + planeSymmetry_(new Symmetry()) { + assert(array1.data() != array2.data()); + assert(gpuArray1.data() != gpuArray2.data()); + assert(gpuArray1.device_id() == gpuArray2.device_id()); + + const SizeType numLocalZSticks = param->num_z_sticks(comm.rank()); + const SizeType numLocalXYPlanes = param->num_xy_planes(comm.rank()); + + freqDomainDataGPU_ = create_2d_view(gpuArray1, 0, numLocalZSticks, param->dim_z()); + freqDomainCompressedDataGPU_ = + GPUArrayView1D(reinterpret_cast(gpuArray2.data()), + param->local_value_indices().size() * 2, gpuArray2.device_id()); + + auto freqDomainXYGPU = create_3d_view(gpuArray2, 0, numLocalXYPlanes, param->dim_y(), + param->dim_x_freq()); // must not overlap with z-sticks + + // Z + if (numLocalZSticks > 0) { + transformZ_ = std::unique_ptr( + new Transform1DGPU(freqDomainDataGPU_, stream_, fftWorkBuffer)); + + if (param->transform_type() == SPFFT_TRANS_R2C && + param->zero_zero_stick_index() < freqDomainDataGPU_.dim_outer()) { + zStickSymmetry_.reset(new StickSymmetryGPU( + stream_, GPUArrayView1D::type>( + freqDomainDataGPU_.data() + + freqDomainDataGPU_.index(param->zero_zero_stick_index(), 0), + freqDomainDataGPU_.dim_inner(), freqDomainDataGPU_.device_id()))); + } + } + + if (numLocalZSticks > 0) { + compression_.reset(new CompressionGPU(param)); + } + + // XY + if (numLocalXYPlanes > 0) { + if (param->transform_type() == SPFFT_TRANS_R2C) { + // NOTE: param->dim_x() != param->dim_x_freq() + spaceDomainDataExternalHost_ = + create_new_type_3d_view(array1, numLocalXYPlanes, param->dim_y(), param->dim_x()); + spaceDomainDataExternalGPU_ = + create_new_type_3d_view(gpuArray1, numLocalXYPlanes, param->dim_y(), param->dim_x()); + + transformXY_ = std::unique_ptr(new TransformReal2DGPU( + spaceDomainDataExternalGPU_, freqDomainXYGPU, stream_, fftWorkBuffer)); + + planeSymmetry_.reset(new PlaneSymmetryGPU(stream_, freqDomainXYGPU)); + + } else { + spaceDomainDataExternalHost_ = create_new_type_3d_view( + array1, numLocalXYPlanes, param->dim_y(), 2 * param->dim_x_freq()); + spaceDomainDataExternalGPU_ = create_new_type_3d_view( + freqDomainXYGPU, numLocalXYPlanes, param->dim_y(), 2 * param->dim_x_freq()); + + transformXY_ = std::unique_ptr( + new Transform2DGPU(freqDomainXYGPU, stream_, fftWorkBuffer)); + } + } + + switch (exchangeType) { + case SpfftExchangeType::SPFFT_EXCH_UNBUFFERED: { + auto freqDomainDataHost = create_2d_view(array1, 0, numLocalZSticks, param->dim_z()); + auto freqDomainXYHost = + create_3d_view(array2, 0, numLocalXYPlanes, param->dim_y(), param->dim_x_freq()); + transpose_.reset( + new TransposeMPIUnbufferedGPU(param, comm, freqDomainXYHost, freqDomainXYGPU, stream_, + freqDomainDataHost, freqDomainDataGPU_, stream_)); + } break; + case SpfftExchangeType::SPFFT_EXCH_COMPACT_BUFFERED: { + const auto bufferZSize = param->total_num_xy_planes() * 
param->num_z_sticks(comm.rank()); + const auto bufferXYSize = param->total_num_z_sticks() * param->num_xy_planes(comm.rank()); + auto transposeBufferZ = create_1d_view(array2, 0, bufferZSize); + auto transposeBufferZGPU = create_1d_view(gpuArray2, 0, bufferZSize); + auto transposeBufferXY = create_1d_view(array1, 0, bufferXYSize); + auto transposeBufferXYGPU = create_1d_view(gpuArray1, 0, bufferXYSize); + transpose_.reset(new TransposeMPICompactBufferedGPU( + param, comm, transposeBufferXY, freqDomainXYGPU, transposeBufferXYGPU, stream_, + transposeBufferZ, freqDomainDataGPU_, transposeBufferZGPU, stream_)); + } break; + case SpfftExchangeType::SPFFT_EXCH_COMPACT_BUFFERED_FLOAT: { + const auto bufferZSize = param->total_num_xy_planes() * param->num_z_sticks(comm.rank()); + const auto bufferXYSize = param->total_num_z_sticks() * param->num_xy_planes(comm.rank()); + auto transposeBufferZ = create_1d_view(array2, 0, bufferZSize); + auto transposeBufferZGPU = create_1d_view(gpuArray2, 0, bufferZSize); + auto transposeBufferXY = create_1d_view(array1, 0, bufferXYSize); + auto transposeBufferXYGPU = create_1d_view(gpuArray1, 0, bufferXYSize); + transpose_.reset(new TransposeMPICompactBufferedGPU( + param, comm, transposeBufferXY, freqDomainXYGPU, transposeBufferXYGPU, stream_, + transposeBufferZ, freqDomainDataGPU_, transposeBufferZGPU, stream_)); + } break; + case SpfftExchangeType::SPFFT_EXCH_BUFFERED: { + const auto bufferSize = param->max_num_z_sticks() * param->max_num_xy_planes() * comm.size(); + auto transposeBufferZ = create_1d_view(array2, 0, bufferSize); + auto transposeBufferZGPU = create_1d_view(gpuArray2, 0, bufferSize); + auto transposeBufferXY = create_1d_view(array1, 0, bufferSize); + auto transposeBufferXYGPU = create_1d_view(gpuArray1, 0, bufferSize); + transpose_.reset(new TransposeMPIBufferedGPU( + param, comm, transposeBufferXY, freqDomainXYGPU, transposeBufferXYGPU, stream_, + transposeBufferZ, freqDomainDataGPU_, transposeBufferZGPU, stream_)); + } break; + case SpfftExchangeType::SPFFT_EXCH_BUFFERED_FLOAT: { + const auto bufferSize = param->max_num_z_sticks() * param->max_num_xy_planes() * comm.size(); + auto transposeBufferZ = create_1d_view(array2, 0, bufferSize); + auto transposeBufferZGPU = create_1d_view(gpuArray2, 0, bufferSize); + auto transposeBufferXY = create_1d_view(array1, 0, bufferSize); + auto transposeBufferXYGPU = create_1d_view(gpuArray1, 0, bufferSize); + transpose_.reset(new TransposeMPIBufferedGPU( + param, comm, transposeBufferXY, freqDomainXYGPU, transposeBufferXYGPU, stream_, + transposeBufferZ, freqDomainDataGPU_, transposeBufferZGPU, stream_)); + } break; + default: + throw InvalidParameterError(); + } +} + +// instatiate templates for float and double +#endif + +template +auto ExecutionGPU::forward_xy(const SpfftProcessingUnitType inputLocation) -> void { + + // Check for any preceding errors before starting execution + if (gpu::get_last_error() != gpu::status::Success) { + throw GPUPrecedingError(); + } + + // XY + if (transformXY_) { + if (inputLocation == SpfftProcessingUnitType::SPFFT_PU_HOST) { + copy_to_gpu_async(stream_, spaceDomainDataExternalHost_, spaceDomainDataExternalGPU_); + } + transformXY_->forward(); + } + + // transpose + if (transformXY_) transpose_->pack_forward(); +} + +template +auto ExecutionGPU::forward_exchange(const bool nonBlockingExchange) -> void { + HOST_TIMING_SCOPED("exchange_start") + transpose_->exchange_forward_start(nonBlockingExchange); +} + +template +auto ExecutionGPU::forward_z(T* output, const 
SpfftScalingType scalingType) -> void { + + HOST_TIMING_START("exechange_fininalize"); + transpose_->exchange_forward_finalize(); + HOST_TIMING_STOP("exechange_fininalize"); + + if (transformZ_) transpose_->unpack_forward(); + + // Z + if (transformZ_) transformZ_->forward(); + + // Compress + if (compression_) { + T* outputPtrHost = nullptr; + T* outputPtrGPU = nullptr; + std::tie(outputPtrHost, outputPtrGPU) = translate_gpu_pointer(output); + + if (outputPtrGPU == nullptr) { + // output on HOST + compression_->compress(stream_, freqDomainDataGPU_, freqDomainCompressedDataGPU_.data(), + scalingType == SpfftScalingType::SPFFT_FULL_SCALING, scalingFactor_); + + gpu::check_status( + gpu::memcpy_async(static_cast(outputPtrHost), + static_cast(freqDomainCompressedDataGPU_.data()), + freqDomainCompressedDataGPU_.size() * + sizeof(decltype(*(freqDomainCompressedDataGPU_.data()))), + gpu::flag::MemcpyDeviceToHost, stream_.get())); + } else { + // output on GPU + compression_->compress(stream_, freqDomainDataGPU_, outputPtrGPU, + scalingType == SpfftScalingType::SPFFT_FULL_SCALING, scalingFactor_); + } + } +} + +template +auto ExecutionGPU::backward_z(const T* input) -> void { + + // Check for any preceding errors before starting execution + if (gpu::get_last_error() != gpu::status::Success) { + throw GPUPrecedingError(); + } + + // decompress + if (compression_) { + const T* inputPtrHost = nullptr; + const T* inputPtrGPU = nullptr; + std::tie(inputPtrHost, inputPtrGPU) = translate_gpu_pointer(input); + + if (inputPtrGPU == nullptr) { + // input on HOST + gpu::check_status( + gpu::memcpy_async(static_cast(freqDomainCompressedDataGPU_.data()), + static_cast(inputPtrHost), + freqDomainCompressedDataGPU_.size() * + sizeof(decltype(*(freqDomainCompressedDataGPU_.data()))), + gpu::flag::MemcpyHostToDevice, stream_.get())); + compression_->decompress(stream_, freqDomainCompressedDataGPU_.data(), freqDomainDataGPU_); + } else { + // input on GPU + compression_->decompress(stream_, inputPtrGPU, freqDomainDataGPU_); + } + } + + // Z + if (transformZ_) { + zStickSymmetry_->apply(); + transformZ_->backward(); + } + + // transpose + if (transformZ_) transpose_->pack_backward(); +} +template +auto ExecutionGPU::backward_exchange(const bool nonBlockingExchange) -> void { + transpose_->exchange_backward_start(nonBlockingExchange); +} + +template +auto ExecutionGPU::backward_xy(const SpfftProcessingUnitType outputLocation) -> void { + + HOST_TIMING_START("exechange_fininalize"); + transpose_->exchange_backward_finalize(); + HOST_TIMING_STOP("exechange_fininalize"); + + if (transformXY_) transpose_->unpack_backward(); + + // XY + if (transformXY_) { + planeSymmetry_->apply(); + transformXY_->backward(); + if (outputLocation & SpfftProcessingUnitType::SPFFT_PU_HOST) { + copy_from_gpu_async(stream_, spaceDomainDataExternalGPU_, spaceDomainDataExternalHost_); + } + } +} + +template +auto ExecutionGPU::synchronize() -> void { + gpu::stream_synchronize(stream_.get()); +} + +template +auto ExecutionGPU::space_domain_data_host() -> HostArrayView3D { + return spaceDomainDataExternalHost_; +} + +template +auto ExecutionGPU::space_domain_data_gpu() -> GPUArrayView3D { + return spaceDomainDataExternalGPU_; +} + +// instatiate templates for float and double +template class ExecutionGPU; +#ifdef SPFFT_SINGLE_PRECISION +template class ExecutionGPU; +#endif + +} // namespace spfft diff --git a/src/execution/execution_gpu.hpp b/src/execution/execution_gpu.hpp new file mode 100644 index 0000000..430da45 --- /dev/null +++ 
b/src/execution/execution_gpu.hpp
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef SPFFT_EXECUTION_GPU
+#define SPFFT_EXECUTION_GPU
+
+#include <complex>
+#include <memory>
+#include "compression/compression_gpu.hpp"
+#include "compression/indices.hpp"
+#include "fft/transform_interface.hpp"
+#include "gpu_util/gpu_fft_api.hpp"
+#include "gpu_util/gpu_stream_handle.hpp"
+#include "memory/gpu_array.hpp"
+#include "memory/gpu_array_view.hpp"
+#include "memory/host_array.hpp"
+#include "parameters/parameters.hpp"
+#include "spfft/config.h"
+#include "spfft/types.h"
+#include "symmetry/symmetry.hpp"
+#include "transpose/transpose.hpp"
+#include "util/common_types.hpp"
+
+#ifdef SPFFT_MPI
+#include "mpi_util/mpi_communicator_handle.hpp"
+#endif
+
+namespace spfft {
+
+// Controls the execution of the 3D FFT from a compressed format in frequency space and slices in
+// space domain. Memory is NOT owned by this class and must remain valid during its lifetime.
+template <typename T>
+class ExecutionGPU {
+public:
+  // Initialize a local execution on GPU
+  ExecutionGPU(const int numThreads, std::shared_ptr<Parameters> param,
+               HostArray<std::complex<T>>& array1, HostArray<std::complex<T>>& array2,
+               GPUArray<typename gpu::fft::ComplexType<T>::type>& gpuArray1,
+               GPUArray<typename gpu::fft::ComplexType<T>::type>& gpuArray2,
+               const std::shared_ptr<GPUArray<char>>& fftWorkBuffer);
+
+#ifdef SPFFT_MPI
+  // Initialize a distributed execution on GPU
+  ExecutionGPU(MPICommunicatorHandle comm, const SpfftExchangeType exchangeType,
+               const int numThreads, std::shared_ptr<Parameters> param,
+               HostArray<std::complex<T>>& array1, HostArray<std::complex<T>>& array2,
+               GPUArray<typename gpu::fft::ComplexType<T>::type>& gpuArray1,
+               GPUArray<typename gpu::fft::ComplexType<T>::type>& gpuArray2,
+               const std::shared_ptr<GPUArray<char>>& fftWorkBuffer);
+#endif
+
+  // transform forward from a given memory location (Host or GPU).
+  // The output is located on the GPU.
+  auto forward_z(T* output, const SpfftScalingType scalingType) -> void;
+  auto forward_exchange(const bool nonBlockingExchange) -> void;
+  auto forward_xy(const SpfftProcessingUnitType inputLocation) -> void;
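+
+  // Note: a complete forward transform is the sequence forward_xy(),
+  // forward_exchange(), forward_z(); with nonBlockingExchange set, the MPI
+  // exchange may overlap with computation between the two phases.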
+
+  // transform backward into a given memory location (Host or GPU).
+  // The input is taken from the GPU.
+  auto backward_z(const T* input) -> void;
+  auto backward_exchange(const bool nonBlockingExchange) -> void;
+  auto backward_xy(const SpfftProcessingUnitType outputLocation) -> void;
+
+  auto synchronize() -> void;
+
+  // The space domain data on Host
+  auto space_domain_data_host() -> HostArrayView3D<T>;
+
+  // The space domain data on GPU
+  auto space_domain_data_gpu() -> GPUArrayView3D<T>;
+
+private:
+  GPUStreamHandle stream_;
+  int numThreads_;
+  T scalingFactor_;
+  std::unique_ptr<TransformGPU> transformZ_;
+  std::unique_ptr<Transpose> transpose_;
+  std::unique_ptr<TransformGPU> transformXY_;
+
+  std::unique_ptr<Symmetry> zStickSymmetry_;
+  std::unique_ptr<Symmetry> planeSymmetry_;
+
+  std::unique_ptr<CompressionGPU> compression_;
+
+  HostArrayView3D<T> spaceDomainDataExternalHost_;
+  GPUArrayView3D<T> spaceDomainDataExternalGPU_;
+
+  GPUArrayView2D<typename gpu::fft::ComplexType<T>::type> freqDomainDataGPU_;
+  GPUArrayView1D<T> freqDomainCompressedDataGPU_;
+};
+}  // namespace spfft
+#endif
diff --git a/src/execution/execution_host.cpp b/src/execution/execution_host.cpp
new file mode 100644
index 0000000..87ed93b
--- /dev/null
+++ b/src/execution/execution_host.cpp
@@ -0,0 +1,366 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */ + +#include "execution/execution_host.hpp" +#include "compression/indices.hpp" +#include "fft/transform_1d_host.hpp" +#include "fft/transform_real_1d_host.hpp" +#include "memory/array_view_utility.hpp" +#include "memory/host_array_view.hpp" +#include "spfft/exceptions.hpp" +#include "symmetry/symmetry_host.hpp" +#include "timing/timing.hpp" +#include "transpose/transpose_host.hpp" +#include "util/common_types.hpp" + +#ifdef SPFFT_MPI +#include "transpose/transpose_mpi_buffered_host.hpp" +#include "transpose/transpose_mpi_compact_buffered_host.hpp" +#include "transpose/transpose_mpi_unbuffered_host.hpp" +#endif + +namespace spfft { + +template +ExecutionHost::ExecutionHost(const int numThreads, std::shared_ptr param, + HostArray>& array1, + HostArray>& array2) + : numThreads_(numThreads), + scalingFactor_(static_cast( + 1.0 / static_cast(param->dim_x() * param->dim_y() * param->dim_z()))) { + HOST_TIMING_SCOPED("Execution init"); + const SizeType numLocalZSticks = param->num_z_sticks(0); + const SizeType numLocalXYPlanes = param->num_xy_planes(0); + std::set uniqueXIndices; + for (const auto& xyIndex : param->z_stick_xy_indices(0)) { + uniqueXIndices.emplace(static_cast(xyIndex / param->dim_y())); + } + + auto freqDomainZ3D = create_3d_view(array1, 0, 1, numLocalZSticks, param->dim_z()); + freqDomainData_ = create_2d_view(freqDomainZ3D, 0, numLocalZSticks, param->dim_z()); + auto freqDomainXY = + create_3d_view(array2, 0, param->dim_z(), param->dim_x_freq(), param->dim_y()); + + transpose_.reset(new TransposeHost(param, freqDomainXY, freqDomainData_)); + + if (param->local_value_indices().size() > 0) { + compression_.reset(new CompressionHost(numThreads_, param)); + } + + if (numLocalZSticks > 0) { + // Z + transformZBackward_.reset(new Transform1DPlanesHost(freqDomainZ3D, freqDomainZ3D, false, + false, FFTW_BACKWARD, numThreads)); + transformZForward_.reset(new Transform1DPlanesHost(freqDomainZ3D, freqDomainZ3D, false, + false, FFTW_FORWARD, numThreads)); + } + + if (numLocalXYPlanes > 0) { + // Y + transformYBackward_.reset(new Transform1DVerticalHost(freqDomainXY, freqDomainXY, false, + false, FFTW_BACKWARD, uniqueXIndices)); + transformYForward_.reset(new Transform1DVerticalHost(freqDomainXY, freqDomainXY, false, + false, FFTW_FORWARD, uniqueXIndices)); + + // X + if (param->transform_type() == SPFFT_TRANS_R2C) { + if (param->zero_zero_stick_index() < param->num_z_sticks(0)) { + zStickSymmetry_.reset(new StickSymmetryHost(HostArrayView1D>( + &freqDomainData_(param->zero_zero_stick_index(), 0), freqDomainData_.dim_inner(), + freqDomainData_.pinned()))); + } + + planeSymmetry_.reset(new PlaneSymmetryHost(freqDomainXY)); + + spaceDomainDataExternal_ = + create_new_type_3d_view(array1, param->dim_z(), param->dim_y(), param->dim_x()); + transformXBackward_.reset(new TransformReal1DPlanesHost( + freqDomainXY, spaceDomainDataExternal_, true, false, numThreads)); + transformXForward_.reset(new TransformReal1DPlanesHost( + spaceDomainDataExternal_, freqDomainXY, false, true, numThreads)); + } else { + zStickSymmetry_.reset(new Symmetry()); + planeSymmetry_.reset(new Symmetry()); + + auto spaceDomainData = + create_3d_view(array1, 0, param->dim_z(), param->dim_y(), param->dim_x_freq()); + spaceDomainDataExternal_ = + create_new_type_3d_view(array1, param->dim_z(), param->dim_y(), 2 * param->dim_x()); + transformXBackward_.reset(new Transform1DPlanesHost(freqDomainXY, spaceDomainData, true, + false, FFTW_BACKWARD, numThreads)); + + transformXForward_.reset(new 
Transform1DPlanesHost(spaceDomainData, freqDomainXY, false, + true, FFTW_FORWARD, numThreads)); + } + } +} + +#ifdef SPFFT_MPI + +template +ExecutionHost::ExecutionHost(MPICommunicatorHandle comm, const SpfftExchangeType exchangeType, + const int numThreads, std::shared_ptr param, + HostArray>& array1, + HostArray>& array2) + : numThreads_(numThreads), + scalingFactor_(static_cast( + 1.0 / static_cast(param->dim_x() * param->dim_y() * param->dim_z()))), + zStickSymmetry_(new Symmetry()), + planeSymmetry_(new Symmetry()) { + HOST_TIMING_SCOPED("Execution init"); + const SizeType numLocalZSticks = param->num_z_sticks(comm.rank()); + const SizeType numLocalXYPlanes = param->num_xy_planes(comm.rank()); + + // get unique x indices to only compute non-zero y-transforms + std::set uniqueXIndices; + for (SizeType r = 0; r < comm.size(); ++r) { + for (const auto& xyIndex : param->z_stick_xy_indices(r)) { + uniqueXIndices.emplace(static_cast(xyIndex / param->dim_y())); + } + } + + auto freqDomainZ3D = create_3d_view(array1, 0, 1, numLocalZSticks, param->dim_z()); + freqDomainData_ = create_2d_view(freqDomainZ3D, 0, numLocalZSticks, param->dim_z()); + + auto freqDomainXY = + create_3d_view(array2, 0, numLocalXYPlanes, param->dim_x_freq(), param->dim_y()); + + auto& spaceDomainArray = array1; + // create external view with + if (param->transform_type() == SPFFT_TRANS_R2C) { + spaceDomainDataExternal_ = create_new_type_3d_view(spaceDomainArray, numLocalXYPlanes, + param->dim_y(), param->dim_x()); + } else { + spaceDomainDataExternal_ = create_new_type_3d_view(spaceDomainArray, numLocalXYPlanes, + param->dim_y(), 2 * param->dim_x()); + } + + if (param->local_value_indices().size() > 0) { + compression_.reset(new CompressionHost(numThreads_, param)); + } + + if (numLocalZSticks > 0) { + // apply hermitian symmetry for x=0, y=0 stick + if (param->transform_type() == SPFFT_TRANS_R2C && + param->zero_zero_stick_index() < freqDomainData_.dim_outer()) { + zStickSymmetry_.reset(new StickSymmetryHost( + HostArrayView1D>(&freqDomainData_(param->zero_zero_stick_index(), 0), + freqDomainData_.dim_inner(), freqDomainData_.pinned()))); + } + transformZForward_ = std::unique_ptr(new Transform1DPlanesHost( + freqDomainZ3D, freqDomainZ3D, false, false, FFTW_FORWARD, numThreads)); + transformZBackward_ = std::unique_ptr(new Transform1DPlanesHost( + freqDomainZ3D, freqDomainZ3D, false, false, FFTW_BACKWARD, numThreads)); + } + + if (numLocalXYPlanes > 0) { + transformYBackward_.reset(new Transform1DVerticalHost(freqDomainXY, freqDomainXY, false, + false, FFTW_BACKWARD, uniqueXIndices)); + transformYForward_.reset(new Transform1DVerticalHost(freqDomainXY, freqDomainXY, false, + false, FFTW_FORWARD, uniqueXIndices)); + + if (param->transform_type() == SPFFT_TRANS_R2C) { + transformXBackward_.reset(new TransformReal1DPlanesHost( + freqDomainXY, spaceDomainDataExternal_, true, false, numThreads)); + transformXForward_.reset(new TransformReal1DPlanesHost( + spaceDomainDataExternal_, freqDomainXY, false, true, numThreads)); + + planeSymmetry_.reset(new PlaneSymmetryHost(freqDomainXY)); + + } else { + auto spaceDomainData = + create_3d_view(spaceDomainArray, 0, numLocalXYPlanes, param->dim_y(), param->dim_x()); + transformXBackward_.reset(new Transform1DPlanesHost(freqDomainXY, spaceDomainData, true, + false, FFTW_BACKWARD, numThreads)); + transformXForward_.reset(new Transform1DPlanesHost(spaceDomainData, freqDomainXY, false, + true, FFTW_FORWARD, numThreads)); + } + } + + switch (exchangeType) { + case 
SpfftExchangeType::SPFFT_EXCH_UNBUFFERED: {
+      transpose_.reset(
+          new TransposeMPIUnbufferedHost<T>(param, comm, freqDomainXY, freqDomainData_));
+    } break;
+    case SpfftExchangeType::SPFFT_EXCH_COMPACT_BUFFERED: {
+      auto transposeBufferZ = create_1d_view(
+          array2, 0, param->total_num_xy_planes() * param->num_z_sticks(comm.rank()));
+      auto transposeBufferXY = create_1d_view(
+          array1, 0, param->total_num_z_sticks() * param->num_xy_planes(comm.rank()));
+      transpose_.reset(new TransposeMPICompactBufferedHost<T, T>(
+          param, comm, freqDomainXY, freqDomainData_, transposeBufferXY, transposeBufferZ));
+    } break;
+    case SpfftExchangeType::SPFFT_EXCH_COMPACT_BUFFERED_FLOAT: {
+      auto transposeBufferZ = create_1d_view(
+          array2, 0, param->total_num_xy_planes() * param->num_z_sticks(comm.rank()));
+      auto transposeBufferXY = create_1d_view(
+          array1, 0, param->total_num_z_sticks() * param->num_xy_planes(comm.rank()));
+      transpose_.reset(new TransposeMPICompactBufferedHost<T, float>(
+          param, comm, freqDomainXY, freqDomainData_, transposeBufferXY, transposeBufferZ));
+    } break;
+    case SpfftExchangeType::SPFFT_EXCH_BUFFERED: {
+      auto transposeBufferZ = create_1d_view(
+          array2, 0, param->max_num_z_sticks() * param->max_num_xy_planes() * comm.size());
+      auto transposeBufferXY = create_1d_view(
+          array1, 0, param->max_num_z_sticks() * param->max_num_xy_planes() * comm.size());
+      transpose_.reset(new TransposeMPIBufferedHost<T, T>(
+          param, comm, freqDomainXY, freqDomainData_, transposeBufferXY, transposeBufferZ));
+    } break;
+    case SpfftExchangeType::SPFFT_EXCH_BUFFERED_FLOAT: {
+      auto transposeBufferZ = create_1d_view(
+          array2, 0, param->max_num_z_sticks() * param->max_num_xy_planes() * comm.size());
+      auto transposeBufferXY = create_1d_view(
+          array1, 0, param->max_num_z_sticks() * param->max_num_xy_planes() * comm.size());
+      transpose_.reset(new TransposeMPIBufferedHost<T, float>(
+          param, comm, freqDomainXY, freqDomainData_, transposeBufferXY, transposeBufferZ));
+    } break;
+    default:
+      throw InvalidParameterError();
+  }
+}
+#endif
+
+template <typename T>
+auto ExecutionHost<T>::forward_xy() -> void {
+  SPFFT_OMP_PRAGMA("omp parallel num_threads(numThreads_)") {
+    SPFFT_OMP_PRAGMA("omp master") { HOST_TIMING_START("x transform"); }
+    if (transformXForward_) transformXForward_->execute();
+    SPFFT_OMP_PRAGMA("omp master") { HOST_TIMING_STOP("x transform"); }
+
+    SPFFT_OMP_PRAGMA("omp master") { HOST_TIMING_START("y transform"); }
+    if (transformYForward_) transformYForward_->execute();
+    SPFFT_OMP_PRAGMA("omp master") { HOST_TIMING_STOP("y transform"); }
+
+    SPFFT_OMP_PRAGMA("omp master") { HOST_TIMING_START("pack"); }
+    if (transformYForward_) transpose_->pack_forward();
+    SPFFT_OMP_PRAGMA("omp master") { HOST_TIMING_STOP("pack"); }
+  }
+}
+
+template <typename T>
+auto ExecutionHost<T>::forward_exchange(const bool nonBlockingExchange) -> void {
+  HOST_TIMING_SCOPED("exchange_start")
+  // must be called outside omp parallel region (MPI restriction on thread id)
+  transpose_->exchange_forward_start(nonBlockingExchange);
+  // SPFFT_OMP_PRAGMA("omp barrier") // ensure exchange is done
+}
+
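The forward path is deliberately split into three phases so that the non-blocking MPI exchange started in forward_exchange() can overlap with unrelated work before forward_z() finalizes it. A minimal sketch of a caller driving one full forward transform, assuming the ExecutionHost<T> interface declared in execution_host.hpp below (run_forward is a hypothetical helper, not part of this commit):

// Hypothetical driver: one complete distributed forward transform,
// composed from the three phases exposed by ExecutionHost<T>.
template <typename T>
void run_forward(spfft::ExecutionHost<T>& exec, T* output) {
  exec.forward_xy();            // 1D FFTs along x and y on local planes, then pack send buffers
  exec.forward_exchange(true);  // start the non-blocking transpose exchange
  // ... independent work may overlap with communication here ...
  exec.forward_z(output, SpfftScalingType::SPFFT_FULL_SCALING);  // finalize, z FFTs, compress
}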
+template <typename T>
+auto ExecutionHost<T>::forward_z(T* output, const SpfftScalingType scalingType) -> void {
+  // must be called outside omp parallel region (MPI restriction on thread id)
+
+  HOST_TIMING_START("exchange_finalize");
+  transpose_->exchange_forward_finalize();
+  HOST_TIMING_STOP("exchange_finalize");
+
+  HOST_TIMING_STOP("exchange")
+  SPFFT_OMP_PRAGMA("omp parallel num_threads(numThreads_)") {
+    SPFFT_OMP_PRAGMA("omp master") { HOST_TIMING_START("unpack"); }
+    if (transformZForward_) transpose_->unpack_forward();
+    SPFFT_OMP_PRAGMA("omp master") { HOST_TIMING_STOP("unpack"); }
+
+    SPFFT_OMP_PRAGMA("omp master") { HOST_TIMING_START("z transform"); }
+    if (transformZForward_) transformZForward_->execute();
+    SPFFT_OMP_PRAGMA("omp master") { HOST_TIMING_STOP("z transform"); }
+
+    SPFFT_OMP_PRAGMA("omp master") { HOST_TIMING_START("compression"); }
+    if (compression_)
+      compression_->compress(freqDomainData_, output,
+                             scalingType == SpfftScalingType::SPFFT_FULL_SCALING, scalingFactor_);
+    SPFFT_OMP_PRAGMA("omp master") { HOST_TIMING_STOP("compression"); }
+  }
+}
+
+template <typename T>
+auto ExecutionHost<T>::backward_z(const T* input) -> void {
+  SPFFT_OMP_PRAGMA("omp parallel num_threads(numThreads_)") {
+    SPFFT_OMP_PRAGMA("omp master") { HOST_TIMING_START("compression"); }
+    if (compression_) compression_->decompress(input, freqDomainData_);
+    SPFFT_OMP_PRAGMA("omp master") { HOST_TIMING_STOP("compression"); }
+
+    SPFFT_OMP_PRAGMA("omp master") { HOST_TIMING_START("z symmetrization"); }
+    zStickSymmetry_->apply();
+    SPFFT_OMP_PRAGMA("omp master") { HOST_TIMING_STOP("z symmetrization"); }
+
+    SPFFT_OMP_PRAGMA("omp master") { HOST_TIMING_START("z transform"); }
+    if (transformZBackward_) transformZBackward_->execute();
+    SPFFT_OMP_PRAGMA("omp master") { HOST_TIMING_STOP("z transform"); }
+
+    SPFFT_OMP_PRAGMA("omp master") { HOST_TIMING_START("pack"); }
+    if (transformZBackward_) transpose_->pack_backward();
+    SPFFT_OMP_PRAGMA("omp master") { HOST_TIMING_STOP("pack"); }
+  }
+}
+
+template <typename T>
+auto ExecutionHost<T>::backward_exchange(const bool nonBlockingExchange) -> void {
+  HOST_TIMING_SCOPED("exchange_start")
+  // must be called outside omp parallel region (MPI restriction on thread id)
+  transpose_->exchange_backward_start(nonBlockingExchange);
+}
+
+template <typename T>
+auto ExecutionHost<T>::backward_xy() -> void {
+  // must be called outside omp parallel region (MPI restriction on thread id)
+  HOST_TIMING_START("exchange_finalize");
+  transpose_->exchange_backward_finalize();
+  HOST_TIMING_STOP("exchange_finalize");
+
+  HOST_TIMING_STOP("exchange")
+  SPFFT_OMP_PRAGMA("omp parallel num_threads(numThreads_)") {
+    SPFFT_OMP_PRAGMA("omp master") { HOST_TIMING_START("unpack"); }
+    if (transformYBackward_) transpose_->unpack_backward();
+    SPFFT_OMP_PRAGMA("omp master") { HOST_TIMING_STOP("unpack"); }
+
+    SPFFT_OMP_PRAGMA("omp master") { HOST_TIMING_START("xy symmetrization"); }
+    planeSymmetry_->apply();
+    SPFFT_OMP_PRAGMA("omp master") { HOST_TIMING_STOP("xy symmetrization"); }
+
+    SPFFT_OMP_PRAGMA("omp master") { HOST_TIMING_START("y transform"); }
+    if (transformYBackward_) transformYBackward_->execute();
+    SPFFT_OMP_PRAGMA("omp master") { HOST_TIMING_STOP("y transform"); }
+
+    SPFFT_OMP_PRAGMA("omp master") { HOST_TIMING_START("x transform"); }
+    if (transformXBackward_) transformXBackward_->execute();
+    SPFFT_OMP_PRAGMA("omp master") { HOST_TIMING_STOP("x transform"); }
+  }
+}
+
+template <typename T>
+auto ExecutionHost<T>::space_domain_data() -> HostArrayView3D<T> {
+  return spaceDomainDataExternal_;
+}
+
+// instantiate templates for float and double
+template class ExecutionHost<double>;
+
+#ifdef SPFFT_SINGLE_PRECISION
+template class ExecutionHost<float>;
+#endif
+
+} // namespace spfft
diff --git a/src/execution/execution_host.hpp b/src/execution/execution_host.hpp
new file mode 100644
index 0000000..c3dd3eb
--- /dev/null
+++ b/src/execution/execution_host.hpp
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef SPFFT_EXECUTION_HOST_HPP +#define SPFFT_EXECUTION_HOST_HPP + +#include +#include +#include +#include "compression/compression_host.hpp" +#include "compression/indices.hpp" +#include "fft/transform_interface.hpp" +#include "memory/host_array.hpp" +#include "memory/host_array_view.hpp" +#include "parameters/parameters.hpp" +#include "spfft/config.h" +#include "spfft/types.h" +#include "symmetry/symmetry.hpp" +#include "timing/timing.hpp" +#include "transpose/transpose.hpp" +#include "util/common_types.hpp" +#include "util/omp_definitions.hpp" + +#ifdef SPFFT_MPI +#include "mpi_util/mpi_init_handle.hpp" +#endif + +namespace spfft { + +// Controls the execution of the 3D FFT from a compressed format in frequency space and slices in +// space domain. Memory is NOT owned by this class and must remain valid during the lifetime. 
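The constructors in execution_host.cpp above recover the x coordinate of each z-stick by integer division of its flattened xy index by dim_y. A small sketch of the assumed index encoding (decode_xy_index is illustrative only, not part of this commit):

#include <cstddef>

// Assumed encoding: a z-stick at grid position (x, y) carries the flattened
// index xyIndex = x * dimY + y, so the coordinates are recovered with
// integer division and remainder.
struct StickCoord {
  std::size_t x;
  std::size_t y;
};

inline StickCoord decode_xy_index(std::size_t xyIndex, std::size_t dimY) {
  return StickCoord{xyIndex / dimY, xyIndex % dimY};
}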
+template <typename T>
+class ExecutionHost {
+public:
+  // Initialize a local execution on Host
+  ExecutionHost(const int numThreads, std::shared_ptr<Parameters> param,
+                HostArray<std::complex<T>>& array1, HostArray<std::complex<T>>& array2);
+
+#ifdef SPFFT_MPI
+  // Initialize a distributed execution on Host
+  ExecutionHost(MPICommunicatorHandle comm, const SpfftExchangeType exchangeType,
+                const int numThreads, std::shared_ptr<Parameters> param,
+                HostArray<std::complex<T>>& array1, HostArray<std::complex<T>>& array2);
+#endif
+
+  // Transform forward
+  auto forward_z(T* output, const SpfftScalingType scalingType) -> void;
+  auto forward_exchange(const bool nonBlockingExchange) -> void;
+  auto forward_xy() -> void;
+
+  // Transform backward
+  auto backward_z(const T* input) -> void;
+  auto backward_exchange(const bool nonBlockingExchange) -> void;
+  auto backward_xy() -> void;
+
+  // Access the space domain data
+  auto space_domain_data() -> HostArrayView3D<T>;
+
+private:
+  int numThreads_;
+  T scalingFactor_;
+  std::unique_ptr<TransformHost> transformZBackward_;
+  std::unique_ptr<TransformHost> transformZForward_;
+  std::unique_ptr<TransformHost> transformYBackward_;
+  std::unique_ptr<TransformHost> transformYForward_;
+  std::unique_ptr<TransformHost> transformXBackward_;
+  std::unique_ptr<TransformHost> transformXForward_;
+
+  std::unique_ptr<Transpose> transpose_;
+
+  std::unique_ptr<Symmetry> zStickSymmetry_;
+  std::unique_ptr<Symmetry> planeSymmetry_;
+
+  std::unique_ptr<CompressionHost> compression_;
+
+  HostArrayView3D<T> spaceDomainDataExternal_;
+  HostArrayView2D<std::complex<T>> freqDomainData_;
+};
+} // namespace spfft
+#endif
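For R2C transforms the backward path first applies Hermitian symmetry (the zStickSymmetry_ and planeSymmetry_ steps above) so that the subsequent complex FFTs yield purely real space-domain data. A sketch of the constraint on a single z-stick, under the assumption that only the non-negative frequencies were provided (symmetrize_stick is illustrative, not the library's implementation):

#include <complex>
#include <cstddef>
#include <vector>

// Fill the mirrored half of a length-n stick so that X[n - k] = conj(X[k]),
// the condition for the inverse FFT of the stick to be real-valued.
// The DC entry (k = 0) and, for even n, the Nyquist entry are assumed
// to already be real.
template <typename T>
void symmetrize_stick(std::vector<std::complex<T>>& stick) {
  const std::size_t n = stick.size();
  for (std::size_t k = 1; k < (n + 1) / 2; ++k) {
    stick[n - k] = std::conj(stick[k]);
  }
}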
diff --git a/src/fft/fftw_plan_1d.hpp b/src/fft/fftw_plan_1d.hpp
new file mode 100644
index 0000000..e528e99
--- /dev/null
+++ b/src/fft/fftw_plan_1d.hpp
@@ -0,0 +1,251 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef SPFFT_FFTW_PLAN_HPP
+#define SPFFT_FFTW_PLAN_HPP
+
+#include <cassert>
+#include <complex>
+#include <fftw3.h>
+#include "spfft/config.h"
+#include "spfft/exceptions.hpp"
+#include "util/common_types.hpp"
+#include "util/type_check.hpp"
+
+namespace spfft {
+
+template <typename T>
+class FFTWPlan;
+
+template <>
+class FFTWPlan<double> {
+public:
+  using ComplexType = std::complex<double>;
+
+  // Create standard 1d fftw plan.
+  // If input and output pointers are equal, an in-place transform is created.
+  FFTWPlan(ComplexType* input, ComplexType* output, const SizeType size, const int sign)
+      : plan_(nullptr),
+        size_(size),
+        inPlace_(input == output),
+        alignmentInput_(fftw_alignment_of(reinterpret_cast<double*>(input))),
+        alignmentOutput_(fftw_alignment_of(reinterpret_cast<double*>(output))) {
+    auto flags = FFTW_ESTIMATE;
+    if (input != output) {
+      flags = flags | FFTW_DESTROY_INPUT;  // allow input override for out-of-place transform
+    }
+    plan_ = fftw_plan_dft_1d(size, reinterpret_cast<fftw_complex*>(input),
+                             reinterpret_cast<fftw_complex*>(output), sign, flags);
+    if (!plan_) throw FFTWError();
+  }
+
+  // Create strided 1d fftw plan.
+  // If input and output pointers are equal, an in-place transform is created.
+  FFTWPlan(ComplexType* input, ComplexType* output, const SizeType size, const SizeType istride,
+           const SizeType ostride, const SizeType idist, const SizeType odist,
+           const SizeType howmany, const int sign) {
+    int rank = 1;
+    int n[] = {(int)size};
+    int inembed[] = {n[0]};
+    int onembed[] = {n[0]};
+    auto flags = FFTW_ESTIMATE;
+    if (input != output) {
+      flags = flags | FFTW_DESTROY_INPUT;  // allow input override for out-of-place transform
+    }
+    plan_ =
+        fftw_plan_many_dft(rank, n, (int)howmany, reinterpret_cast<fftw_complex*>(input), inembed,
+                           (int)istride, (int)idist, reinterpret_cast<fftw_complex*>(output),
+                           onembed, (int)ostride, (int)odist, sign, flags);
+    if (!plan_) throw FFTWError();
+  }
+
+  FFTWPlan(const FFTWPlan& other) = delete;
+
+  FFTWPlan(FFTWPlan&& other) noexcept {
+    if (plan_) fftw_destroy_plan(plan_);
+    plan_ = other.plan_;
+    other.plan_ = nullptr;
+  }
+
+  auto operator=(const FFTWPlan& other) -> FFTWPlan& = delete;
+
+  auto operator=(FFTWPlan&& other) noexcept -> FFTWPlan& {
+    if (plan_) fftw_destroy_plan(plan_);
+    plan_ = other.plan_;
+    other.plan_ = nullptr;
+    return *this;
+  }
+
+  // Get plan handle
+  inline auto get() -> fftw_plan { return plan_; };
+
+  // Release ownership of plan handle
+  inline auto release() -> fftw_plan {
+    fftw_plan planLocal = plan_;
+    plan_ = nullptr;
+    return planLocal;
+  };
+
+  inline auto empty() const noexcept -> bool { return !plan_; }
+
+  inline auto size() const noexcept -> SizeType { return size_; }
+
+  // Plan created with in-place transform
+  inline auto in_place() const noexcept -> bool { return inPlace_; }
+
+  // Execute on input / output provided to constructor.
+  // Undefined behaviour if empty().
+  auto execute() -> void { fftw_execute(plan_); }
+
+  // Execute on given input / output.
+  // The alignment of input and output must match the pointers given to the constructor.
+  // If the plan was not set up for in-place transforms, input and output must not be equal.
+  // Undefined behaviour if empty().
+  auto execute(ComplexType* input, ComplexType* output) -> void {
+    assert(inPlace_ == (input == output));
+    assert(fftw_alignment_of(reinterpret_cast<double*>(input)) == alignmentInput_);
+    assert(fftw_alignment_of(reinterpret_cast<double*>(output)) == alignmentOutput_);
+    fftw_execute_dft(plan_, reinterpret_cast<fftw_complex*>(input),
+                     reinterpret_cast<fftw_complex*>(output));
+  }
+
+  ~FFTWPlan() {
+    if (plan_) fftw_destroy_plan(plan_);
+    plan_ = nullptr;
+  }
+
+private:
+  fftw_plan plan_ = nullptr;
+  SizeType size_ = 0;
+  bool inPlace_ = false;
+  int alignmentInput_ = 0;
+  int alignmentOutput_ = 0;
+};
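A minimal usage sketch for the plan wrapper above, batching all rows of a small complex matrix through one strided plan (the sizes are arbitrary examples; assumes the header above and <fftw3.h> are available):

#include <complex>
#include <vector>

void example_row_transforms() {
  const int rows = 4, cols = 16;
  std::vector<std::complex<double>> data(rows * cols);
  // One plan for all rows: length cols, unit stride within a row,
  // consecutive rows separated by cols elements, transformed in place.
  spfft::FFTWPlan<double> plan(data.data(), data.data(), cols, 1, 1, cols, cols, rows,
                               FFTW_FORWARD);
  plan.execute();
}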
+#ifdef SPFFT_SINGLE_PRECISION
+template <>
+class FFTWPlan<float> {
+public:
+  using ComplexType = std::complex<float>;
+
+  // Create standard 1d fftw plan.
+  // If input and output pointers are equal, an in-place transform is created.
+  FFTWPlan(ComplexType* input, ComplexType* output, const SizeType size, const int sign)
+      : plan_(nullptr),
+        size_(size),
+        inPlace_(input == output),
+        alignmentInput_(fftwf_alignment_of(reinterpret_cast<float*>(input))),
+        alignmentOutput_(fftwf_alignment_of(reinterpret_cast<float*>(output))) {
+    plan_ = fftwf_plan_dft_1d(size, reinterpret_cast<fftwf_complex*>(input),
+                              reinterpret_cast<fftwf_complex*>(output), sign, FFTW_ESTIMATE);
+    if (!plan_) throw FFTWError();
+  }
+
+  // Create strided 1d fftw plan.
+  // If input and output pointers are equal, an in-place transform is created.
+  FFTWPlan(ComplexType* input, ComplexType* output, const SizeType size, const SizeType istride,
+           const SizeType ostride, const SizeType idist, const SizeType odist,
+           const SizeType howmany, const int sign) {
+    int rank = 1;
+    int n[] = {(int)size};
+    int inembed[] = {n[0]};
+    int onembed[] = {n[0]};
+    plan_ =
+        fftwf_plan_many_dft(rank, n, (int)howmany, reinterpret_cast<fftwf_complex*>(input),
+                            inembed, (int)istride, (int)idist,
+                            reinterpret_cast<fftwf_complex*>(output), onembed, (int)ostride,
+                            (int)odist, sign, FFTW_ESTIMATE);
+    if (!plan_) throw FFTWError();
+  }
+
+  FFTWPlan(const FFTWPlan& other) = delete;
+
+  FFTWPlan(FFTWPlan&& other) noexcept {
+    if (plan_) fftwf_destroy_plan(plan_);
+    plan_ = other.plan_;
+    other.plan_ = nullptr;
+  }
+
+  auto operator=(const FFTWPlan& other) -> FFTWPlan& = delete;
+
+  auto operator=(FFTWPlan&& other) noexcept -> FFTWPlan& {
+    if (plan_) fftwf_destroy_plan(plan_);
+    plan_ = other.plan_;
+    other.plan_ = nullptr;
+    return *this;
+  }
+
+  // Get plan handle
+  inline auto get() -> fftwf_plan { return plan_; };
+
+  // Release ownership of plan handle
+  inline auto release() -> fftwf_plan {
+    fftwf_plan planLocal = plan_;
+    plan_ = nullptr;
+    return planLocal;
+  };
+
+  inline auto empty() const noexcept -> bool { return !plan_; }
+
+  inline auto size() const noexcept -> SizeType { return size_; }
+
+  // Plan created with in-place transform
+  inline auto in_place() const noexcept -> bool { return inPlace_; }
+
+  // Execute on input / output provided to constructor.
+  // Undefined behaviour if empty().
+  auto execute() -> void { fftwf_execute(plan_); }
+
+  // Execute on given input / output.
+  // The alignment of input and output must match the pointers given to the constructor.
+  // If the plan was not set up for in-place transforms, input and output must not be equal.
+  // Undefined behaviour if empty().
+  auto execute(ComplexType* input, ComplexType* output) -> void {
+    assert(inPlace_ == (input == output));
+    assert(fftwf_alignment_of(reinterpret_cast<float*>(input)) == alignmentInput_);
+    assert(fftwf_alignment_of(reinterpret_cast<float*>(output)) == alignmentOutput_);
+    fftwf_execute_dft(plan_, reinterpret_cast<fftwf_complex*>(input),
+                      reinterpret_cast<fftwf_complex*>(output));
+  }
+
+  ~FFTWPlan() {
+    if (plan_) fftwf_destroy_plan(plan_);
+    plan_ = nullptr;
+  }
+
+private:
+  fftwf_plan plan_ = nullptr;
+  SizeType size_ = 0;
+  bool inPlace_ = false;
+  int alignmentInput_ = 0;
+  int alignmentOutput_ = 0;
+};
+
+#endif
+
+} // namespace spfft
+
+#endif
diff --git a/src/fft/fftw_real_plan_1d.hpp b/src/fft/fftw_real_plan_1d.hpp
new file mode 100644
index 0000000..66c963d
--- /dev/null
+++ b/src/fft/fftw_real_plan_1d.hpp
@@ -0,0 +1,239 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef SPFFT_FFTW_REAL_PLAN_HPP +#define SPFFT_FFTW_REAL_PLAN_HPP + +#include +#include +#include +#include "spfft/config.h" +#include "spfft/exceptions.hpp" +#include "util/common_types.hpp" +#include "util/type_check.hpp" + +namespace spfft { + +template +class FFTWRealPlan; + +template <> +class FFTWRealPlan { +public: + using ComplexType = std::complex; + + // Create strided 1d fftw real plan r2c + FFTWRealPlan(double* input, ComplexType* output, const SizeType size, const SizeType istride, + const SizeType ostride, const SizeType idist, const SizeType odist, + const SizeType howmany) { + assert(reinterpret_cast(input) != + reinterpret_cast(output)); // must not be in place + int rank = 1; + int n[] = {(int)size}; + int inembed[] = {n[0]}; + int onembed[] = {n[0]}; + auto flags = FFTW_ESTIMATE | FFTW_DESTROY_INPUT; + plan_ = fftw_plan_many_dft_r2c(rank, n, (int)howmany, input, inembed, (int)istride, (int)idist, + reinterpret_cast(output), onembed, (int)ostride, + (int)odist, flags); + if (!plan_) throw FFTWError(); + } + + // Create strided 1d fftw real plan c2r + FFTWRealPlan(ComplexType* input, double* output, const SizeType size, const SizeType istride, + const SizeType ostride, const SizeType idist, const SizeType odist, + const SizeType howmany) { + assert(reinterpret_cast(input) != + reinterpret_cast(output)); // must not be in place + int rank = 1; + int n[] = {(int)size}; + int inembed[] = {n[0]}; + int onembed[] = {n[0]}; + auto flags = FFTW_ESTIMATE | FFTW_DESTROY_INPUT; + plan_ = fftw_plan_many_dft_c2r(rank, n, (int)howmany, reinterpret_cast(input), + inembed, (int)istride, (int)idist, output, onembed, (int)ostride, + (int)odist, flags); + if (!plan_) throw FFTWError(); + } + + FFTWRealPlan(const FFTWRealPlan& other) = delete; + + FFTWRealPlan(FFTWRealPlan&& other) noexcept { + if (plan_) fftw_destroy_plan(plan_); + plan_ = other.plan_; + other.plan_ = nullptr; + } + + auto operator=(const FFTWRealPlan& other) -> FFTWRealPlan& = delete; + + auto operator=(FFTWRealPlan&& other) noexcept -> FFTWRealPlan& { + if (plan_) fftw_destroy_plan(plan_); + plan_ = other.plan_; + other.plan_ = nullptr; + return *this; + } + + // Get plan handle + inline auto get() -> fftw_plan { return plan_; }; + + // Release ownership of plan handle + inline auto release() -> fftw_plan { + fftw_plan 
planLocal = plan_; + plan_ = nullptr; + return planLocal; + }; + + inline auto empty() const noexcept -> bool { return !plan_; } + + inline auto size() const noexcept -> SizeType { return size_; } + + // Plan created with in-place transform + inline auto in_place() const noexcept -> bool { return inPlace_; } + + // Execute on input / output provided to constructor. + // Undefinded behaviour if empty(). + auto execute() -> void { fftw_execute(plan_); } + + ~FFTWRealPlan() { + if (plan_) fftw_destroy_plan(plan_); + plan_ = nullptr; + } + +private: + fftw_plan plan_ = nullptr; + SizeType size_ = 0; + bool inPlace_ = false; +}; + +#ifdef SPFFT_SINGLE_PRECISION +template <> +class FFTWRealPlan { +public: + using ComplexType = std::complex; + + // Create strided 1d fftwf real plan r2c + FFTWRealPlan(float* input, ComplexType* output, const SizeType size, const SizeType istride, + const SizeType ostride, const SizeType idist, const SizeType odist, + const SizeType howmany) { + assert(reinterpret_cast(input) != + reinterpret_cast(output)); // must not be in place + int rank = 1; + int n[] = {(int)size}; + int inembed[] = {n[0]}; + int onembed[] = {n[0]}; + auto flags = FFTW_ESTIMATE | FFTW_DESTROY_INPUT; + plan_ = fftwf_plan_many_dft_r2c(rank, n, (int)howmany, input, inembed, (int)istride, (int)idist, + reinterpret_cast(output), onembed, (int)ostride, + (int)odist, flags); + if (!plan_) throw FFTWError(); + } + + // Create strided 1d fftwf real plan c2r + FFTWRealPlan(ComplexType* input, float* output, const SizeType size, const SizeType istride, + const SizeType ostride, const SizeType idist, const SizeType odist, + const SizeType howmany) { + assert(reinterpret_cast(input) != + reinterpret_cast(output)); // must not be in place + int rank = 1; + int n[] = {(int)size}; + int inembed[] = {n[0]}; + int onembed[] = {n[0]}; + auto flags = FFTW_ESTIMATE | FFTW_DESTROY_INPUT; + plan_ = fftwf_plan_many_dft_c2r(rank, n, (int)howmany, reinterpret_cast(input), + inembed, (int)istride, (int)idist, output, onembed, + (int)ostride, (int)odist, flags); + if (!plan_) throw FFTWError(); + } + + FFTWRealPlan(const FFTWRealPlan& other) = delete; + + FFTWRealPlan(FFTWRealPlan&& other) noexcept { + if (plan_) fftwf_destroy_plan(plan_); + plan_ = other.plan_; + other.plan_ = nullptr; + } + + auto operator=(const FFTWRealPlan& other) -> FFTWRealPlan& = delete; + + auto operator=(FFTWRealPlan&& other) noexcept -> FFTWRealPlan& { + if (plan_) fftwf_destroy_plan(plan_); + plan_ = other.plan_; + other.plan_ = nullptr; + return *this; + } + + // Get plan handle + inline auto get() -> fftwf_plan { return plan_; }; + + // Release ownership of plan handle + inline auto release() -> fftwf_plan { + fftwf_plan planLocal = plan_; + plan_ = nullptr; + return planLocal; + }; + + inline auto empty() const noexcept -> bool { return !plan_; } + + inline auto size() const noexcept -> SizeType { return size_; } + + // Plan created with in-place transform + inline auto in_place() const noexcept -> bool { return inPlace_; } + + // Execute on input / output provided to constructor. + // Undefinded behaviour if empty(). + auto execute() -> void { fftwf_execute(plan_); } + + // Execute on given input / output. + // The alignment of input and output must match the pointers given to the constructor. + // If the plan was not setup for in-place transforms, input and output must not be equal + // Undefinded behaviour if empty(). 
+ auto execute(ComplexType* input, ComplexType* output) -> void { + assert(inPlace_ == (input == output)); + assert(fftwf_alignment_of(reinterpret_cast(input)) == alignmentInput_); + assert(fftwf_alignment_of(reinterpret_cast(output)) == alignmentOutput_); + fftwf_execute_dft(plan_, reinterpret_cast(input), + reinterpret_cast(output)); + } + + ~FFTWRealPlan() { + if (plan_) fftwf_destroy_plan(plan_); + plan_ = nullptr; + } + +private: + fftwf_plan plan_ = nullptr; + SizeType size_ = 0; + bool inPlace_ = false; + int alignmentInput_ = 0; + int alignmentOutput_ = 0; +}; + +#endif + +} // namespace spfft + +#endif diff --git a/src/fft/transform_1d_gpu.hpp b/src/fft/transform_1d_gpu.hpp new file mode 100644 index 0000000..0d243d6 --- /dev/null +++ b/src/fft/transform_1d_gpu.hpp @@ -0,0 +1,136 @@ + +/* + * Copyright (c) 2019 ETH Zurich, Simon Frasch + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ +#ifndef SPFFT_TRANSFORM_1D_GPU_HPP +#define SPFFT_TRANSFORM_1D_GPU_HPP + +#include +#include +#include +#include +#include "fft/transform_interface.hpp" +#include "gpu_util/gpu_fft_api.hpp" +#include "gpu_util/gpu_runtime_api.hpp" +#include "gpu_util/gpu_stream_handle.hpp" +#include "memory/gpu_array.hpp" +#include "memory/gpu_array_view.hpp" +#include "spfft/config.h" +#include "util/common_types.hpp" + +namespace spfft { + +template +class Transform1DGPU : public TransformGPU { +public: + using ValueType = T; + using ComplexType = gpu::fft::ComplexType; + + Transform1DGPU(GPUArrayView2D::type>& data, + GPUStreamHandle stream, std::shared_ptr> workBuffer) + : stream_(std::move(stream)), workBuffer_(std::move(workBuffer)), dataPtr_(data.data()) { + assert(workBuffer_); + + std::size_t worksize = 0; + + int rank = 1; + int n[1] = {data.dim_inner()}; + int nembed[1] = {data.dim_inner()}; + int stride = 1; + int dist = data.dim_inner(); + int batch = data.dim_outer(); + + // create plan + gpu::fft::check_result(gpu::fft::create(&plan_)); + gpu::fft::check_result(gpu::fft::set_auto_allocation(plan_, 0)); + gpu::fft::check_result(gpu::fft::make_plan_many( + plan_, rank, n, nembed, stride, dist, nembed, stride, dist, + gpu::fft::TransformType::ComplexToComplex::value, batch, &worksize)); + + // set stream + gpu::fft::check_result(gpu::fft::set_stream(plan_, stream_.get())); + + // resize work buffer if necessary + if (workBuffer_->size() < worksize) { + *workBuffer_ = GPUArray(worksize); + } + } + + Transform1DGPU(const Transform1DGPU& transform) = delete; + + Transform1DGPU(Transform1DGPU&& transform) noexcept + : stream_(std::move(transform.stream_)), + plan_(std::move(transform.plan_)), + workBuffer_(std::move(transform.workBuffer_)), + dataPtr_(transform.dataPtr_) { + transform.plan_ = 0; + } + + ~Transform1DGPU() { + if (plan_) { + gpu::fft::destroy(plan_); + } + } + + auto operator=(const Transform1DGPU& transform) -> Transform1DGPU& = delete; + + auto operator=(Transform1DGPU&& transform) noexcept -> Transform1DGPU& { + if (plan_) { + gpu::fft::destroy(plan_); + } + stream_ = std::move(transform.stream_); + plan_ = std::move(transform.plan_); + workBuffer_ = std::move(transform.workBuffer_); + dataPtr_ = transform.dataPtr_; + + transform.plan_ = 0; + return *this; + } + + inline auto device_id() const noexcept -> int { return stream_.device_id(); } + + auto forward() -> void override { + gpu::fft::check_result(gpu::fft::set_work_area(plan_, workBuffer_->data())); + gpu::fft::check_result( + gpu::fft::execute(plan_, dataPtr_, dataPtr_, gpu::fft::TransformDirection::Forward)); + } + + auto backward() -> void override { + gpu::fft::check_result(gpu::fft::set_work_area(plan_, workBuffer_->data())); + gpu::fft::check_result( + gpu::fft::execute(plan_, dataPtr_, dataPtr_, gpu::fft::TransformDirection::Backward)); + } + +private: + GPUStreamHandle stream_; + gpu::fft::HandleType plan_ = 0; + std::shared_ptr> workBuffer_; + typename gpu::fft::ComplexType::type* dataPtr_; +}; +} // namespace spfft + +#endif diff --git a/src/fft/transform_1d_host.hpp b/src/fft/transform_1d_host.hpp new file mode 100644 index 0000000..b85e443 --- /dev/null +++ b/src/fft/transform_1d_host.hpp @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2019 ETH Zurich, Simon Frasch + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef SPFFT_TRANSFORM_1D_HOST_HPP +#define SPFFT_TRANSFORM_1D_HOST_HPP + +#include +#include +#include +#include +#include "fft/fftw_plan_1d.hpp" +#include "fft/transform_interface.hpp" +#include "memory/host_array_view.hpp" +#include "spfft/config.h" +#include "spfft/exceptions.hpp" +#include "util/common_types.hpp" +#include "util/omp_definitions.hpp" + +namespace spfft { + +// Computes the FFT in 1D along either the innermost dimension (not transposed) or the second +// innermost dimension (transposed) +// The transforms are computed in batches aligned to inner 2d planes +template +class Transform1DPlanesHost : public TransformHost { +public: + static_assert(IsFloatOrDouble::value, "Type T must be float or double"); + using ValueType = T; + using ComplexType = std::complex; + + Transform1DPlanesHost(HostArrayView3D inputData, + HostArrayView3D outputData, bool transposeInputData, + bool transposeOutputData, int sign, int maxNumThreads) { + assert(inputData.dim_outer() == outputData.dim_outer()); + + // only one is transposed + assert((transposeInputData != transposeOutputData) || + (inputData.dim_inner() == outputData.dim_inner())); + assert((transposeInputData != transposeOutputData) || + (inputData.dim_mid() == outputData.dim_mid())); + + // none or both transposed + assert((transposeInputData == transposeOutputData) || + (inputData.dim_inner() == outputData.dim_mid())); + assert((transposeInputData == transposeOutputData) || + (inputData.dim_mid() == outputData.dim_inner())); + + // transposed case must not be in-place + assert(!(inputData.data() == outputData.data() && (transposeInputData || transposeOutputData))); + + // make sure maxNumThreads is at least 1 + SizeType numSplitsPerPlane = maxNumThreads < 1 ? 1 : maxNumThreads; + // only use at most as many splits as required to create work for every thread + if (numSplitsPerPlane > 1 && inputData.dim_outer() > numSplitsPerPlane) { + numSplitsPerPlane = 2; + } + const SizeType numTransformsPerPlane = + transposeInputData ? 
inputData.dim_inner() : inputData.dim_mid(); + // make sure there are at most as many splits as transforms per plane + numSplitsPerPlane = + numTransformsPerPlane < numSplitsPerPlane ? numTransformsPerPlane : numSplitsPerPlane; + + // set fftw plan parameters + const SizeType size = transposeInputData ? inputData.dim_mid() : inputData.dim_inner(); + const SizeType inputStride = transposeInputData ? inputData.dim_inner() : 1; + const SizeType outputStride = transposeOutputData ? outputData.dim_inner() : 1; + + const SizeType inputDist = transposeInputData ? 1 : inputData.dim_inner(); + const SizeType outputDist = transposeOutputData ? 1 : outputData.dim_inner(); + + const SizeType numTransformsPerSplit = numTransformsPerPlane / numSplitsPerPlane; + + const SizeType inputSplitStrideMid = transposeInputData ? 0 : numTransformsPerSplit; + const SizeType inputSplitStrideInner = transposeInputData ? numTransformsPerSplit : 0; + const SizeType outputSplitStrideMid = transposeOutputData ? 0 : numTransformsPerSplit; + const SizeType outputSplitStrideInner = transposeOutputData ? numTransformsPerSplit : 0; + + // determine number of transforms per plane + // create plans within each plane + transforms_.reserve(inputData.dim_outer() * numSplitsPerPlane); + for (SizeType idxOuter = 0; idxOuter < inputData.dim_outer(); ++idxOuter) { + for (SizeType idxSplit = 0; idxSplit < numSplitsPerPlane; ++idxSplit) { + const SizeType howmany = + idxSplit == numSplitsPerPlane - 1 + ? numTransformsPerSplit + numTransformsPerPlane % numSplitsPerPlane + : numTransformsPerSplit; + transforms_.emplace_back(&(inputData(idxOuter, idxSplit * inputSplitStrideMid, + idxSplit * inputSplitStrideInner)), + &(outputData(idxOuter, idxSplit * outputSplitStrideMid, + idxSplit * outputSplitStrideInner)), + size, inputStride, outputStride, inputDist, outputDist, howmany, + sign); + } + } + } + + auto execute() -> void override { + SPFFT_OMP_PRAGMA("omp for schedule(static)") + for (SizeType i = 0; i < transforms_.size(); ++i) { + transforms_[i].execute(); + } + } + +private: + std::vector> transforms_; +}; + +// Computes the FFT in 1D along either the innermost dimension (not transposed) or the second +// innermost dimension (transposed). +// The transforms are computed in batches aligned to the outer and transform dimension. +// The indices of transforms to be computed per plane can be provided as well. 
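The split logic above distributes the per-plane transforms over at most maxNumThreads FFTW batch plans, with the last batch absorbing the remainder. A condensed sketch of the batch-size rule (batch_size is illustrative only, not part of this commit):

#include <cstddef>

// Number of 1D transforms handled by batch plan idxSplit when perPlane
// transforms are divided into `splits` batches; the last batch takes the
// remainder, so all batches together cover every transform exactly once.
inline std::size_t batch_size(std::size_t perPlane, std::size_t splits, std::size_t idxSplit) {
  const std::size_t base = perPlane / splits;
  return idxSplit + 1 == splits ? base + perPlane % splits : base;
}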
+template +class Transform1DVerticalHost : public TransformHost { +public: + static_assert(IsFloatOrDouble::value, "Type T must be float or double"); + using ValueType = T; + using ComplexType = std::complex; + + Transform1DVerticalHost(HostArrayView3D inputData, + HostArrayView3D outputData, bool transposeInputData, + bool transposeOutputData, int sign, + const std::set& inputMidIndices) { + assert(inputData.dim_outer() == outputData.dim_outer()); + + // check case where only one is transposed + assert((transposeInputData != transposeOutputData) || + (inputData.dim_inner() == outputData.dim_inner())); + assert((transposeInputData != transposeOutputData) || + (inputData.dim_mid() == outputData.dim_mid())); + + // none or both transposed + assert((transposeInputData == transposeOutputData) || + (inputData.dim_inner() == outputData.dim_mid())); + assert((transposeInputData == transposeOutputData) || + (inputData.dim_mid() == outputData.dim_inner())); + + // transposed case must not be in-place + assert(!(inputData.data() == outputData.data() && (transposeInputData || transposeOutputData))); + + // set fftw plan parameters + const SizeType size = transposeInputData ? inputData.dim_mid() : inputData.dim_inner(); + const SizeType inputStride = transposeInputData ? inputData.dim_inner() : 1; + const SizeType outputStride = transposeOutputData ? outputData.dim_inner() : 1; + + const SizeType inputDist = inputData.dim_mid() * inputData.dim_inner(); + const SizeType outputDist = outputData.dim_mid() * outputData.dim_inner(); + const SizeType howmany = inputData.dim_outer(); + + // determine number of transforms per plane + // create plans within each plane + transforms_.reserve(inputMidIndices.size()); + for (const auto& midIndex : inputMidIndices) { + const SizeType idxMidInput = transposeInputData ? 0 : midIndex; + const SizeType idxInnerInput = transposeInputData ? midIndex : 0; + const SizeType idxMidOutput = transposeOutputData ? 0 : midIndex; + const SizeType idxInnerOutput = transposeOutputData ? midIndex : 0; + transforms_.emplace_back(&(inputData(0, idxMidInput, idxInnerInput)), + &(outputData(0, idxMidOutput, idxInnerOutput)), size, inputStride, + outputStride, inputDist, outputDist, howmany, sign); + } + } + + auto execute() -> void override { + SPFFT_OMP_PRAGMA("omp for schedule(static)") + for (SizeType i = 0; i < transforms_.size(); ++i) { + transforms_[i].execute(); + } + } + +private: + std::vector> transforms_; +}; + +} // namespace spfft + +#endif diff --git a/src/fft/transform_2d_gpu.hpp b/src/fft/transform_2d_gpu.hpp new file mode 100644 index 0000000..0e5265d --- /dev/null +++ b/src/fft/transform_2d_gpu.hpp @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2019 ETH Zurich, Simon Frasch + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef SPFFT_TRANSFORM_2D_GPU_HPP +#define SPFFT_TRANSFORM_2D_GPU_HPP + +#include +#include +#include +#include +#include "fft/transform_interface.hpp" +#include "gpu_util/gpu_fft_api.hpp" +#include "gpu_util/gpu_runtime_api.hpp" +#include "gpu_util/gpu_stream_handle.hpp" +#include "memory/gpu_array.hpp" +#include "memory/gpu_array_view.hpp" +#include "spfft/config.h" +#include "util/common_types.hpp" + +namespace spfft { + +template +class Transform2DGPU : public TransformGPU { +public: + using ValueType = T; + using ComplexType = gpu::fft::ComplexType; + + Transform2DGPU(GPUArrayView3D::type>& data, + GPUStreamHandle stream, std::shared_ptr> workBuffer) + : stream_(std::move(stream)), workBuffer_(std::move(workBuffer)), dataPtr_(data.data()) { + assert(workBuffer_); + + std::size_t worksize = 0; + + int rank = 2; + int n[2] = {data.dim_mid(), data.dim_inner()}; + int nembed[2] = {data.dim_mid(), data.dim_inner()}; + int stride = 1; + int dist = data.dim_inner() * data.dim_mid(); + int batch = data.dim_outer(); + + // create plan + gpu::fft::check_result(gpu::fft::create(&plan_)); + gpu::fft::check_result(gpu::fft::set_auto_allocation(plan_, 0)); + gpu::fft::check_result(gpu::fft::make_plan_many( + plan_, rank, n, nembed, stride, dist, nembed, stride, dist, + gpu::fft::TransformType::ComplexToComplex::value, batch, &worksize)); + + // set stream + gpu::fft::check_result(gpu::fft::set_stream(plan_, stream_.get())); + + // resize work buffer if necessary + if (workBuffer_->size() < worksize) { + *workBuffer_ = GPUArray(worksize); + } + } + + Transform2DGPU(const Transform2DGPU& transform) = delete; + + Transform2DGPU(Transform2DGPU&& transform) noexcept + : stream_(std::move(transform.stream_)), + plan_(std::move(transform.plan_)), + workBuffer_(std::move(transform.workBuffer_)), + dataPtr_(transform.dataPtr_) { + transform.plan_ = 0; + } + + ~Transform2DGPU() { + if (plan_) { + gpu::fft::destroy(plan_); + } + } + + auto operator=(const Transform2DGPU& transform) -> Transform2DGPU& = delete; + + auto operator=(Transform2DGPU&& transform) noexcept -> Transform2DGPU& { + if (plan_) { + gpu::fft::destroy(plan_); + } + stream_ = std::move(transform.stream_); + plan_ = std::move(transform.plan_); + workBuffer_ = std::move(transform.workBuffer_); + dataPtr_ = transform.dataPtr_; + + transform.plan_ = 0; + return *this; + } + + inline auto device_id() const noexcept -> int { return stream_.device_id(); } + + auto forward() -> void override { + gpu::fft::check_result(gpu::fft::set_work_area(plan_, workBuffer_->data())); + gpu::fft::check_result( + gpu::fft::execute(plan_, dataPtr_, dataPtr_, gpu::fft::TransformDirection::Forward)); + } + + auto backward() -> void override { + 
gpu::fft::check_result(gpu::fft::set_work_area(plan_, workBuffer_->data())); + gpu::fft::check_result( + gpu::fft::execute(plan_, dataPtr_, dataPtr_, gpu::fft::TransformDirection::Backward)); + } + +private: + GPUStreamHandle stream_; + gpu::fft::HandleType plan_ = 0; + std::shared_ptr> workBuffer_; + typename gpu::fft::ComplexType::type* dataPtr_; +}; +} // namespace spfft + +#endif diff --git a/src/fft/transform_interface.hpp b/src/fft/transform_interface.hpp new file mode 100644 index 0000000..3e1c477 --- /dev/null +++ b/src/fft/transform_interface.hpp @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2019 ETH Zurich, Simon Frasch + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef SPFFT_TRANSFORM_INTERFACE_HPP +#define SPFFT_TRANSFORM_INTERFACE_HPP +#include "spfft/config.h" + +namespace spfft { + +class TransformHost { +public: + virtual auto execute() -> void = 0; + virtual ~TransformHost() = default; +}; + +class TransformGPU { +public: + virtual auto forward() -> void = 0; + virtual auto backward() -> void = 0; + virtual ~TransformGPU() = default; +}; + +} // namespace spfft + +#endif diff --git a/src/fft/transform_real_1d_host.hpp b/src/fft/transform_real_1d_host.hpp new file mode 100644 index 0000000..6cbb8e5 --- /dev/null +++ b/src/fft/transform_real_1d_host.hpp @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2019 ETH Zurich, Simon Frasch + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef SPFFT_TRANSFORM_REAL_1D_HOST_HPP +#define SPFFT_TRANSFORM_REAL_1D_HOST_HPP + +#include +#include +#include +#include +#include "fft/fftw_real_plan_1d.hpp" +#include "fft/transform_interface.hpp" +#include "memory/array_view_utility.hpp" +#include "memory/host_array_view.hpp" +#include "spfft/config.h" +#include "spfft/exceptions.hpp" +#include "util/common_types.hpp" +#include "util/omp_definitions.hpp" +#include "util/type_check.hpp" + +namespace spfft { + +// Computes the FFT in 1D along either the innermost dimension (not transposed) or the second +// innermost dimension (transposed) +// The transforms are computed in batches aligned to inner 2d planes +template +class TransformReal1DPlanesHost : public TransformHost { +public: + static_assert(IsFloatOrDouble::value, "Type T must be float or double"); + using ValueType = T; + using ComplexType = std::complex; + + // r2c + TransformReal1DPlanesHost(HostArrayView3D inputData, HostArrayView3D outputData, + bool transposeInputData, bool transposeOutputData, int maxNumThreads) { + assert(inputData.dim_outer() == outputData.dim_outer()); + + assert(disjoint(inputData, outputData)); + + // set fftw plan parameters + const SizeType size = transposeInputData ? inputData.dim_mid() : inputData.dim_inner(); + const SizeType inputStride = transposeInputData ? inputData.dim_inner() : 1; + const SizeType outputStride = transposeOutputData ? outputData.dim_inner() : 1; + + const SizeType inputDist = transposeInputData ? 1 : inputData.dim_inner(); + const SizeType outputDist = transposeOutputData ? 1 : outputData.dim_inner(); + + // make sure maxNumThreads is at least 1 + SizeType numSplitsPerPlane = maxNumThreads < 1 ? 1 : maxNumThreads; + // only use at most as many splits as required to create work for every thread + if (numSplitsPerPlane > 1 && inputData.dim_outer() > numSplitsPerPlane) { + numSplitsPerPlane = 2; + } + const SizeType numTransformsPerPlane = + transposeInputData ? inputData.dim_inner() : inputData.dim_mid(); + // make sure there are at most as many splits as transforms per plane + numSplitsPerPlane = + numTransformsPerPlane < numSplitsPerPlane ? numTransformsPerPlane : numSplitsPerPlane; + + const SizeType numTransformsPerSplit = numTransformsPerPlane / numSplitsPerPlane; + + const SizeType inputSplitStrideMid = transposeInputData ? 0 : numTransformsPerSplit; + const SizeType inputSplitStrideInner = transposeInputData ? numTransformsPerSplit : 0; + const SizeType outputSplitStrideMid = transposeOutputData ? 0 : numTransformsPerSplit; + const SizeType outputSplitStrideInner = transposeOutputData ? 
numTransformsPerSplit : 0; + + // check for non-transposed output + assert((transposeOutputData) || + (size / 2 + 1 == outputData.dim_inner())); + + // check for transposed output + assert((!transposeOutputData) || + (size / 2 + 1 == outputData.dim_mid())); + + // determine number of transforms per plane + // create plans within each plane + transforms_.reserve(inputData.dim_outer() * numSplitsPerPlane); + for (SizeType idxOuter = 0; idxOuter < inputData.dim_outer(); ++idxOuter) { + for (SizeType idxSplit = 0; idxSplit < numSplitsPerPlane; ++idxSplit) { + const SizeType howmany = + idxSplit == numSplitsPerPlane - 1 + ? numTransformsPerSplit + numTransformsPerPlane % numSplitsPerPlane + : numTransformsPerSplit; + transforms_.emplace_back(&(inputData(idxOuter, idxSplit * inputSplitStrideMid, + idxSplit * inputSplitStrideInner)), + &(outputData(idxOuter, idxSplit * outputSplitStrideMid, + idxSplit * outputSplitStrideInner)), + size, inputStride, outputStride, inputDist, outputDist, howmany); + } + } + } + + // c2r + TransformReal1DPlanesHost(HostArrayView3D inputData, HostArrayView3D outputData, + bool transposeInputData, bool transposeOutputData, int maxNumThreads) { + assert(inputData.dim_outer() == outputData.dim_outer()); + + assert(disjoint(inputData, outputData)); + + // set fftw plan parameters + const SizeType size = transposeOutputData ? outputData.dim_mid() : outputData.dim_inner(); + const SizeType inputStride = transposeInputData ? inputData.dim_inner() : 1; + const SizeType outputStride = transposeOutputData ? outputData.dim_inner() : 1; + + const SizeType inputDist = transposeInputData ? 1 : inputData.dim_inner(); + const SizeType outputDist = transposeOutputData ? 1 : outputData.dim_inner(); + + // make sure maxNumThreads is at least 1 + SizeType numSplitsPerPlane = maxNumThreads < 1 ? 1 : maxNumThreads; + // only use at most as many splits as required to create work for every thread + if (numSplitsPerPlane > 1 && inputData.dim_outer() > numSplitsPerPlane) { + numSplitsPerPlane = 2; + } + const SizeType numTransformsPerPlane = + transposeInputData ? inputData.dim_inner() : inputData.dim_mid(); + // make sure there are at most as many splits as transforms per plane + numSplitsPerPlane = + numTransformsPerPlane < numSplitsPerPlane ? numTransformsPerPlane : numSplitsPerPlane; + + const SizeType numTransformsPerSplit = numTransformsPerPlane / numSplitsPerPlane; + + const SizeType inputSplitStrideMid = transposeInputData ? 0 : numTransformsPerSplit; + const SizeType inputSplitStrideInner = transposeInputData ? numTransformsPerSplit : 0; + const SizeType outputSplitStrideMid = transposeOutputData ? 0 : numTransformsPerSplit; + const SizeType outputSplitStrideInner = transposeOutputData ? numTransformsPerSplit : 0; + + // check for non-transposed output + assert((transposeInputData) || + (size / 2 + 1 == inputData.dim_inner())); + + // check for transposed output + assert((!transposeInputData) || + (size / 2 + 1 == inputData.dim_mid())); + + // determine number of transforms per plane + // create plans within each plane + transforms_.reserve(inputData.dim_outer() * numSplitsPerPlane); + for (SizeType idxOuter = 0; idxOuter < inputData.dim_outer(); ++idxOuter) { + for (SizeType idxSplit = 0; idxSplit < numSplitsPerPlane; ++idxSplit) { + const SizeType howmany = + idxSplit == numSplitsPerPlane - 1 + ? 
+        transforms_.emplace_back(&(inputData(idxOuter, idxSplit * inputSplitStrideMid,
+                                             idxSplit * inputSplitStrideInner)),
+                                 &(outputData(idxOuter, idxSplit * outputSplitStrideMid,
+                                              idxSplit * outputSplitStrideInner)),
+                                 size, inputStride, outputStride, inputDist, outputDist, howmany);
+      }
+    }
+  }
+
+  auto execute() -> void override {
+    SPFFT_OMP_PRAGMA("omp for schedule(static)")
+    for (SizeType i = 0; i < transforms_.size(); ++i) {
+      transforms_[i].execute();
+    }
+  }
+
+private:
+  std::vector> transforms_;
+};
+} // namespace spfft
+
+#endif
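
A note on the plan parameters collected above: size, stride, dist and howmany correspond one-to-one to FFTW's advanced interface. As a rough illustration (not part of the patch; double precision assumed, and the wrapper from fft/fftw_real_plan_1d.hpp presumably reduces to a call of this shape), a batched 1D r2c plan looks like:

#include <fftw3.h>

// Sketch only: how size / stride / dist / howmany map onto FFTW's advanced
// interface for a batch of 1D r2c transforms (a NULL nembed is read as n).
fftw_plan plan_r2c_batch(double* in, fftw_complex* out, int size, int inputStride,
                         int outputStride, int inputDist, int outputDist, int howmany) {
  int n[1] = {size};
  return fftw_plan_many_dft_r2c(1, n, howmany, in, nullptr, inputStride, inputDist,
                                out, nullptr, outputStride, outputDist, FFTW_ESTIMATE);
}
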
diff --git a/src/fft/transform_real_2d_gpu.hpp b/src/fft/transform_real_2d_gpu.hpp
new file mode 100644
index 0000000..19a63ec
--- /dev/null
+++ b/src/fft/transform_real_2d_gpu.hpp
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef SPFFT_TRANSFORM_REAL_2D_GPU_HPP
+#define SPFFT_TRANSFORM_REAL_2D_GPU_HPP
+
+#include
+#include
+#include
+#include
+#include "fft/transform_interface.hpp"
+#include "gpu_util/gpu_fft_api.hpp"
+#include "gpu_util/gpu_runtime_api.hpp"
+#include "gpu_util/gpu_stream_handle.hpp"
+#include "memory/array_view_utility.hpp"
+#include "memory/gpu_array.hpp"
+#include "memory/gpu_array_view.hpp"
+#include "spfft/config.h"
+#include "util/common_types.hpp"
+
+namespace spfft {
+
+template <typename T>
+class TransformReal2DGPU : public TransformGPU {
+public:
+  using ValueType = T;
+  using ComplexType = gpu::fft::ComplexType;
+
+  TransformReal2DGPU(GPUArrayView3D<T> spaceDomain,
+                     GPUArrayView3D<typename gpu::fft::ComplexType<T>::type> freqDomain,
+                     GPUStreamHandle stream, std::shared_ptr<GPUArray<char>> workBuffer)
+      : stream_(std::move(stream)),
+        workBuffer_(std::move(workBuffer)),
+        spaceDomainPtr_(spaceDomain.data()),
+        freqDomainPtr_(freqDomain.data()) {
+    assert(disjoint(spaceDomain, freqDomain));
+    assert(workBuffer_);
+    assert(spaceDomain.dim_outer() == freqDomain.dim_outer());
+    assert(spaceDomain.dim_mid() == freqDomain.dim_mid());
+    assert(spaceDomain.dim_inner() / 2 + 1 == freqDomain.dim_inner());
+
+    int rank = 2;
+    int n[2] = {spaceDomain.dim_mid(), spaceDomain.dim_inner()};
+    int nembedReal[2] = {spaceDomain.dim_mid(), spaceDomain.dim_inner()};
+    int nembedFreq[2] = {freqDomain.dim_mid(), freqDomain.dim_inner()};
+    int stride = 1;
+    int distReal = spaceDomain.dim_inner() * spaceDomain.dim_mid();
+    int distFreq = freqDomain.dim_inner() * freqDomain.dim_mid();
+    int batch = spaceDomain.dim_outer();
+
+    std::size_t worksizeForward = 0;
+    std::size_t worksizeBackward = 0;
+    // create plan
+    gpu::fft::check_result(gpu::fft::create(&planForward_));
+    gpu::fft::check_result(gpu::fft::create(&planBackward_));
+
+    gpu::fft::check_result(gpu::fft::set_auto_allocation(planForward_, 0));
+    gpu::fft::check_result(gpu::fft::set_auto_allocation(planBackward_, 0));
+
+    gpu::fft::check_result(gpu::fft::make_plan_many(
+        planForward_, rank, n, nembedReal, stride, distReal, nembedFreq, stride, distFreq,
+        gpu::fft::TransformType::RealToComplex<T>::value, batch, &worksizeForward));
+    gpu::fft::check_result(gpu::fft::make_plan_many(
+        planBackward_, rank, n, nembedFreq, stride, distFreq, nembedReal, stride, distReal,
+        gpu::fft::TransformType::ComplexToReal<T>::value, batch, &worksizeBackward));
+
+    // set stream
+    gpu::fft::check_result(gpu::fft::set_stream(planForward_, stream_.get()));
+    gpu::fft::check_result(gpu::fft::set_stream(planBackward_, stream_.get()));
+
+    const std::size_t worksize =
+        worksizeForward > worksizeBackward ? worksizeForward : worksizeBackward;
+    // resize work buffer if necessary
+    if (workBuffer_->size() < worksize) {
+      *workBuffer_ = GPUArray<char>(worksize);
+    }
+  }
+
+  TransformReal2DGPU(const TransformReal2DGPU& transform) = delete;
+
+  TransformReal2DGPU(TransformReal2DGPU&& transform) noexcept
+      : stream_(std::move(transform.stream_)),
+        planForward_(std::move(transform.planForward_)),
+        planBackward_(std::move(transform.planBackward_)),
+        workBuffer_(std::move(transform.workBuffer_)),
+        spaceDomainPtr_(transform.spaceDomainPtr_),
+        freqDomainPtr_(transform.freqDomainPtr_) {
+    transform.planForward_ = 0;
+    transform.planBackward_ = 0;
+  }
+
+  ~TransformReal2DGPU() {
+    if (planForward_) {
+      gpu::fft::destroy(planForward_);
+      planForward_ = 0;
+    }
+    if (planBackward_) {
+      gpu::fft::destroy(planBackward_);
+      planBackward_ = 0;
+    }
+  }
+
+  auto operator=(const TransformReal2DGPU& transform) -> TransformReal2DGPU& = delete;
+
+  auto operator=(TransformReal2DGPU&& transform) noexcept -> TransformReal2DGPU& {
+    if (planForward_) {
+      gpu::fft::destroy(planForward_);
+      planForward_ = 0;
+    }
+    if (planBackward_) {
+      gpu::fft::destroy(planBackward_);
+      planBackward_ = 0;
+    }
+    stream_ = std::move(transform.stream_);
+    planForward_ = std::move(transform.planForward_);
+    planBackward_ = std::move(transform.planBackward_);
+    workBuffer_ = std::move(transform.workBuffer_);
+    spaceDomainPtr_ = transform.spaceDomainPtr_;
+    freqDomainPtr_ = transform.freqDomainPtr_;
+
+    transform.planForward_ = 0;
+    transform.planBackward_ = 0;
+    return *this;
+  }
+
+  inline auto device_id() const noexcept -> int { return stream_.device_id(); }
+
+  auto forward() -> void override {
+    gpu::fft::check_result(gpu::fft::set_work_area(planForward_, workBuffer_->data()));
+    gpu::fft::check_result(gpu::fft::execute(planForward_, spaceDomainPtr_, freqDomainPtr_));
+  }
+
+  auto backward() -> void override {
+    gpu::fft::check_result(gpu::fft::set_work_area(planBackward_, workBuffer_->data()));
+    gpu::fft::check_result(gpu::fft::execute(planBackward_, freqDomainPtr_, spaceDomainPtr_));
+  }
+
+private:
+  GPUStreamHandle stream_;
+  gpu::fft::HandleType planForward_ = 0;
+  gpu::fft::HandleType planBackward_ = 0;
+  std::shared_ptr<GPUArray<char>> workBuffer_;
+  T* spaceDomainPtr_;
+  typename gpu::fft::ComplexType<T>::type* freqDomainPtr_;
+};
+} // namespace spfft
+
+#endif
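
The constructor above creates a forward and a backward plan that share one externally owned work buffer, sized for the larger of the two requirements. A minimal sketch of the same pattern against raw cuFFT (illustration only; error checking omitted):

#include <cufft.h>
#include <cuda_runtime.h>
#include <algorithm>

// Sketch only: forward/backward 2D r2c plans sharing one work area.
void make_plan_pair(cufftHandle& fwd, cufftHandle& bwd, int nx, int ny, int batch,
                    void** workArea) {
  int n[2] = {nx, ny};
  int nembedReal[2] = {nx, ny};
  int nembedFreq[2] = {nx, ny / 2 + 1};
  size_t wsFwd = 0, wsBwd = 0;
  cufftCreate(&fwd);
  cufftCreate(&bwd);
  cufftSetAutoAllocation(fwd, 0);  // work area is managed by the caller
  cufftSetAutoAllocation(bwd, 0);
  cufftMakePlanMany(fwd, 2, n, nembedReal, 1, nx * ny, nembedFreq, 1, nx * (ny / 2 + 1),
                    CUFFT_D2Z, batch, &wsFwd);
  cufftMakePlanMany(bwd, 2, n, nembedFreq, 1, nx * (ny / 2 + 1), nembedReal, 1, nx * ny,
                    CUFFT_Z2D, batch, &wsBwd);
  cudaMalloc(workArea, std::max(wsFwd, wsBwd));  // one buffer serves both plans
  cufftSetWorkArea(fwd, *workArea);
  cufftSetWorkArea(bwd, *workArea);
}
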
diff --git a/src/gpu_util/complex_conversion.cuh b/src/gpu_util/complex_conversion.cuh
new file mode 100644
index 0000000..5cd201c
--- /dev/null
+++ b/src/gpu_util/complex_conversion.cuh
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPFFT_GPU_COMPLEX_CONVERSION_CUH
+#define SPFFT_GPU_COMPLEX_CONVERSION_CUH
+
+#include "gpu_util/gpu_fft_api.hpp"
+
+namespace spfft {
+
+template <typename T, typename U>
+struct ConvertComplex {
+  __device__ __host__ inline static T apply(const U& val) { return val; }
+};
+
+template <>
+struct ConvertComplex<gpu::fft::ComplexFloatType, gpu::fft::ComplexDoubleType> {
+  __device__ __host__ inline static gpu::fft::ComplexFloatType apply(
+      const gpu::fft::ComplexDoubleType& val) {
+    return gpu::fft::ComplexFloatType{(float)val.x, (float)val.y};
+  }
+};
+
+template <>
+struct ConvertComplex<gpu::fft::ComplexDoubleType, gpu::fft::ComplexFloatType> {
+  __device__ __host__ inline static gpu::fft::ComplexDoubleType apply(
+      const gpu::fft::ComplexFloatType& val) {
+    return gpu::fft::ComplexDoubleType{(double)val.x, (double)val.y};
+  }
+};
+
+} // namespace spfft
+#endif
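
ConvertComplex is a no-op for matching types and widens or narrows between the two GPU complex types otherwise. A hypothetical CUDA kernel using it (illustration only, not part of the patch):

#include "gpu_util/complex_conversion.cuh"

// Sketch only: downcast a double-precision spectrum to single precision.
__global__ void downcast_spectrum(const spfft::gpu::fft::ComplexDoubleType* in,
                                  spfft::gpu::fft::ComplexFloatType* out, int n) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    out[i] = spfft::ConvertComplex<spfft::gpu::fft::ComplexFloatType,
                                   spfft::gpu::fft::ComplexDoubleType>::apply(in[i]);
  }
}
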
diff --git a/src/gpu_util/gpu_device_guard.hpp b/src/gpu_util/gpu_device_guard.hpp
new file mode 100644
index 0000000..578b6a4
--- /dev/null
+++ b/src/gpu_util/gpu_device_guard.hpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef SPFFT_GPU_DEVICE_GUARD_HPP
+#define SPFFT_GPU_DEVICE_GUARD_HPP
+
+#include "spfft/config.h"
+#if defined(SPFFT_CUDA) || defined(SPFFT_ROCM)
+#include
+#include "gpu_util/gpu_runtime_api.hpp"
+#include "spfft/exceptions.hpp"
+namespace spfft {
+class GPUDeviceGuard {
+public:
+  explicit GPUDeviceGuard(const int deviceId) : targetDeviceId_(deviceId), originalDeviceId_(0) {
+    gpu::check_status(gpu::get_device(&originalDeviceId_));
+    if (originalDeviceId_ != deviceId) {
+      gpu::check_status(gpu::set_device(deviceId));
+    }
+  }
+
+  GPUDeviceGuard() = delete;
+  GPUDeviceGuard(const GPUDeviceGuard&) = delete;
+  GPUDeviceGuard(GPUDeviceGuard&&) = delete;
+  auto operator=(const GPUDeviceGuard&) -> GPUDeviceGuard& = delete;
+  auto operator=(GPUDeviceGuard&&) -> GPUDeviceGuard& = delete;
+
+  ~GPUDeviceGuard() {
+    if (targetDeviceId_ != originalDeviceId_) {
+      gpu::set_device(originalDeviceId_);  // not checked to avoid throwing an exception in the destructor
+    }
+  }
+
+private:
+  int targetDeviceId_ = 0;
+  int originalDeviceId_ = 0;
+};
+} // namespace spfft
+
+#endif
+#endif
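
GPUDeviceGuard is a small RAII helper: the constructor switches to the target device and the destructor switches back without throwing. Usage sketch (device id 1 is an arbitrary example):

#include "gpu_util/gpu_device_guard.hpp"

void work_on_device_one() {
  spfft::GPUDeviceGuard guard(1);  // switch to device 1 for this scope
  // ... allocate memory and launch kernels on device 1 ...
}  // destructor restores the device that was active before
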
diff --git a/src/gpu_util/gpu_fft_api.hpp b/src/gpu_util/gpu_fft_api.hpp
new file mode 100644
index 0000000..51d5f73
--- /dev/null
+++ b/src/gpu_util/gpu_fft_api.hpp
@@ -0,0 +1,239 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef SPFFT_GPU_FFT_API_HPP
+#define SPFFT_GPU_FFT_API_HPP
+
+#include "spfft/config.h"
+
+#if defined(SPFFT_CUDA)
+#include <cufft.h>
+#define GPU_FFT_PREFIX(val) cufft##val
+
+#elif defined(SPFFT_ROCM)
+#include <hipfft.h>
+#define GPU_FFT_PREFIX(val) hipfft##val
+#endif
+
+// only declare namespace members if GPU support is enabled
+#if defined(SPFFT_CUDA) || defined(SPFFT_ROCM)
+
+#include <utility>
+#include "spfft/exceptions.hpp"
+
+namespace spfft {
+namespace gpu {
+namespace fft {
+
+// ==================================
+// Types
+// ==================================
+using ResultType = GPU_FFT_PREFIX(Result);
+using HandleType = GPU_FFT_PREFIX(Handle);
+using ComplexFloatType = GPU_FFT_PREFIX(Complex);
+using ComplexDoubleType = GPU_FFT_PREFIX(DoubleComplex);
+
+// Complex type selector
+template <typename T>
+struct ComplexType;
+
+template <>
+struct ComplexType<double> {
+  using type = ComplexDoubleType;
+};
+
+template <>
+struct ComplexType<float> {
+  using type = ComplexFloatType;
+};
+
+// ==================================
+// Transform direction
+// ==================================
+namespace TransformDirection {
+#ifdef SPFFT_CUDA
+constexpr auto Forward = CUFFT_FORWARD;
+constexpr auto Backward = CUFFT_INVERSE;
+#else
+constexpr auto Forward = HIPFFT_FORWARD;
+constexpr auto Backward = HIPFFT_BACKWARD;
+#endif
+} // namespace TransformDirection
+
+// ==================================
+// Transform types
+// ==================================
+namespace TransformType {
+#ifdef SPFFT_CUDA
+constexpr auto R2C = CUFFT_R2C;
+constexpr auto C2R = CUFFT_C2R;
+constexpr auto C2C = CUFFT_C2C;
+constexpr auto D2Z = CUFFT_D2Z;
+constexpr auto Z2D = CUFFT_Z2D;
+constexpr auto Z2Z = CUFFT_Z2Z;
+#else
+constexpr auto R2C = HIPFFT_R2C;
+constexpr auto C2R = HIPFFT_C2R;
+constexpr auto C2C = HIPFFT_C2C;
+constexpr auto D2Z = HIPFFT_D2Z;
+constexpr auto Z2D = HIPFFT_Z2D;
+constexpr auto Z2Z = HIPFFT_Z2Z;
+#endif
+
+// Transform type selector
+template <typename T>
+struct ComplexToComplex;
+
+template <>
+struct ComplexToComplex<double> {
+  constexpr static auto value = Z2Z;
+};
+
+template <>
+struct ComplexToComplex<float> {
+  constexpr static auto value = C2C;
+};
+
+// Transform type selector
+template <typename T>
+struct RealToComplex;
+
+template <>
+struct RealToComplex<double> {
+  constexpr static auto value = D2Z;
+};
+
+template <>
+struct RealToComplex<float> {
+  constexpr static auto value = R2C;
+};
+
+// Transform type selector
+template <typename T>
+struct ComplexToReal;
+
+template <>
+struct ComplexToReal<double> {
+  constexpr static auto value = Z2D;
+};
+
+template <>
+struct ComplexToReal<float> {
+  constexpr static auto value = C2R;
+};
+} // namespace TransformType
+
+// ==================================
+// Result values
+// ==================================
+namespace result {
+#ifdef SPFFT_CUDA
+constexpr auto Success = CUFFT_SUCCESS;
+#else
+constexpr auto Success = HIPFFT_SUCCESS;
+#endif
+} // namespace result
+
+// ==================================
+// Error check functions
+// ==================================
+inline auto check_result(ResultType error) -> void {
+  if (error != result::Success) {
+    throw GPUFFTError();
+  }
+}
+
+// ==================================
+// Execution function overloads
+// ==================================
+inline auto execute(HandleType& plan, ComplexDoubleType* iData, double* oData) -> ResultType {
+  return GPU_FFT_PREFIX(ExecZ2D)(plan, iData, oData);
+}
+
+inline auto execute(HandleType& plan, ComplexFloatType* iData, float* oData) -> ResultType {
+  return GPU_FFT_PREFIX(ExecC2R)(plan, iData, oData);
+}
+
+inline auto execute(HandleType& plan, double* iData, ComplexDoubleType* oData) -> ResultType {
+  return GPU_FFT_PREFIX(ExecD2Z)(plan, iData, oData);
+}
+
+inline auto execute(HandleType& plan, float* iData, ComplexFloatType* oData) -> ResultType {
+  return GPU_FFT_PREFIX(ExecR2C)(plan, iData, oData);
+}
+
+inline auto execute(HandleType& plan, ComplexDoubleType* iData, ComplexDoubleType* oData,
+                    int direction) -> ResultType {
+  return GPU_FFT_PREFIX(ExecZ2Z)(plan, iData, oData, direction);
+}
+
+inline auto execute(HandleType& plan, ComplexFloatType* iData, ComplexFloatType* oData,
+                    int direction) -> ResultType {
+  return GPU_FFT_PREFIX(ExecC2C)(plan, iData, oData, direction);
+}
+
+// ==================================
+// Forwarding functions to the GPU API
+// ==================================
+template <typename... ARGS>
+inline auto create(ARGS... args) -> ResultType {
+  return GPU_FFT_PREFIX(Create)(std::forward<ARGS>(args)...);
+}
+
+template <typename... ARGS>
+inline auto make_plan_many(ARGS... args) -> ResultType {
+  return GPU_FFT_PREFIX(MakePlanMany)(std::forward<ARGS>(args)...);
+}
+
+template <typename... ARGS>
+inline auto set_work_area(ARGS... args) -> ResultType {
+  return GPU_FFT_PREFIX(SetWorkArea)(std::forward<ARGS>(args)...);
+}
+
+template <typename... ARGS>
+inline auto destroy(ARGS... args) -> ResultType {
+  return GPU_FFT_PREFIX(Destroy)(std::forward<ARGS>(args)...);
+}
+
+template <typename... ARGS>
+inline auto set_stream(ARGS... args) -> ResultType {
+  return GPU_FFT_PREFIX(SetStream)(std::forward<ARGS>(args)...);
+}
+
+template <typename... ARGS>
+inline auto set_auto_allocation(ARGS... args) -> ResultType {
+  return GPU_FFT_PREFIX(SetAutoAllocation)(std::forward<ARGS>(args)...);
+}
+
+} // namespace fft
+} // namespace gpu
+} // namespace spfft
+
+#undef GPU_FFT_PREFIX
+
+#endif // defined SPFFT_CUDA || SPFFT_ROCM
+#endif
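
Since every cufft* symbol has a hipfft* counterpart with the same suffix, GPU_FFT_PREFIX gives both backends a single spelling, and backend-neutral code can be written against the wrappers alone. A small sketch (illustration only):

#include "gpu_util/gpu_fft_api.hpp"

// Sketch only: a plan handled entirely through the backend-neutral wrappers.
void plan_lifetime_demo() {
  spfft::gpu::fft::HandleType plan;
  spfft::gpu::fft::check_result(spfft::gpu::fft::create(&plan));
  spfft::gpu::fft::check_result(spfft::gpu::fft::set_auto_allocation(plan, 0));
  // ... make_plan_many(...), set_stream(...), execute(...) as in the transform classes ...
  spfft::gpu::fft::check_result(spfft::gpu::fft::destroy(plan));
}
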
diff --git a/src/gpu_util/gpu_pointer_translation.hpp b/src/gpu_util/gpu_pointer_translation.hpp
new file mode 100644
index 0000000..13aa2da
--- /dev/null
+++ b/src/gpu_util/gpu_pointer_translation.hpp
@@ -0,0 +1,84 @@
+
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef SPFFT_GPU_POINTER_TRANSLATION_HPP
+#define SPFFT_GPU_POINTER_TRANSLATION_HPP
+
+#include
+#include
+#include "gpu_util/gpu_runtime_api.hpp"
+#include "spfft/config.h"
+namespace spfft {
+
+template <typename T>
+auto translate_gpu_pointer(const T* inputPointer) -> std::pair<const T*, const T*> {
+  gpu::PointerAttributes attr;
+  auto status = gpu::pointer_get_attributes(&attr, static_cast<const void*>(inputPointer));
+  gpu::get_last_error();  // if the pointer is not registered, the error is stored and has to be cleared
+
+#ifdef SPFFT_ROCM
+  if (status != gpu::status::Success) {
+#else
+  if (status == gpu::status::ErrorInvalidValue) {
+#endif
+    // not registered with CUDA -> host pointer
+    const T* devicePtr = nullptr;
+    return {inputPointer, devicePtr};
+  } else {
+    gpu::check_status(status);
+    return {static_cast<const T*>(attr.hostPointer), static_cast<const T*>(attr.devicePointer)};
+  }
+}
+
+template <typename T>
+auto translate_gpu_pointer(T* inputPointer) -> std::pair<T*, T*> {
+  gpu::PointerAttributes attr;
+  auto status = gpu::pointer_get_attributes(&attr, static_cast<const void*>(inputPointer));
+  gpu::get_last_error();  // if the pointer is not registered, the error is stored and has to be cleared
+
+#ifdef SPFFT_ROCM
+  if (status != gpu::status::Success) {
+#else
+  if (status == gpu::status::ErrorInvalidValue) {
+#endif
+    // not registered with CUDA -> host pointer
+    T* devicePtr = nullptr;
+    return {inputPointer, devicePtr};
+  } else {
+    gpu::check_status(status);
+    return {static_cast<T*>(attr.hostPointer), static_cast<T*>(attr.devicePointer)};
+  }
+}
+
+} // namespace spfft
+
+#endif
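
translate_gpu_pointer returns a host/device pointer pair, falling back to {input, nullptr} for memory the GPU runtime does not know about. A sketch of the two cases under CUDA (illustration only):

#include <vector>
#include <cuda_runtime.h>
#include "gpu_util/gpu_pointer_translation.hpp"

// Sketch only: the two possible outcomes.
void pointer_demo() {
  std::vector<double> host(128);
  auto hostPair = spfft::translate_gpu_pointer(host.data());
  // hostPair == {host.data(), nullptr}: plain host memory

  double* device = nullptr;
  cudaMalloc(reinterpret_cast<void**>(&device), 128 * sizeof(double));
  auto devicePair = spfft::translate_gpu_pointer(device);
  // devicePair.second == device: memory known to the GPU runtime
  cudaFree(device);
}
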
diff --git a/src/gpu_util/gpu_runtime.hpp b/src/gpu_util/gpu_runtime.hpp
new file mode 100644
index 0000000..05ca682
--- /dev/null
+++ b/src/gpu_util/gpu_runtime.hpp
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef SPFFT_GPU_RUNTIME_HPP
+#define SPFFT_GPU_RUNTIME_HPP
+
+#include "spfft/config.h"
+#include "gpu_util/gpu_runtime_api.hpp"
+
+#ifdef SPFFT_ROCM
+#include <hip/hip_runtime.h>
+#endif
+
+namespace spfft {
+
+#ifdef SPFFT_CUDA
+template <typename F, typename... ARGS>
+inline auto launch_kernel(F func, const dim3 threadGrid, const dim3 threadBlock,
+                          const size_t sharedMemoryBytes, const gpu::StreamType stream,
+                          ARGS... args) -> void {
+#ifndef NDEBUG
+  gpu::device_synchronize();
+  gpu::check_status(gpu::get_last_error());  // before
+#endif
+  func<<<threadGrid, threadBlock, sharedMemoryBytes, stream>>>(std::forward<ARGS>(args)...);
+#ifndef NDEBUG
+  gpu::device_synchronize();
+  gpu::check_status(gpu::get_last_error());  // after
+#endif
+}
+#endif
+
+#ifdef SPFFT_ROCM
+template <typename F, typename... ARGS>
+inline auto launch_kernel(F func, const dim3 threadGrid, const dim3 threadBlock,
+                          const size_t sharedMemoryBytes, const gpu::StreamType stream,
+                          ARGS... args) -> void {
+#ifndef NDEBUG
+  gpu::device_synchronize();
+  gpu::check_status(gpu::get_last_error());  // before
+#endif
+  hipLaunchKernelGGL(func, threadGrid, threadBlock, sharedMemoryBytes, stream,
+                     std::forward<ARGS>(args)...);
+#ifndef NDEBUG
+  gpu::device_synchronize();
+  gpu::check_status(gpu::get_last_error());  // after
+#endif
+}
+#endif
+
+} // namespace spfft
+
+#endif
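
launch_kernel hides the <<<...>>> versus hipLaunchKernelGGL difference and, in debug builds, synchronizes before and after the launch so errors are attributed to the right kernel. A sketch with a hypothetical kernel (not part of the patch):

#include "gpu_util/gpu_runtime.hpp"
#include "gpu_util/gpu_stream_handle.hpp"

// Sketch only: launching a hypothetical kernel through the wrapper.
__global__ void scale(double* data, double factor, int n) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= factor;
}

void scale_on_stream(double* data, double factor, int n, const spfft::GPUStreamHandle& stream) {
  const dim3 block(256);
  const dim3 grid((n + block.x - 1) / block.x);
  spfft::launch_kernel(scale, grid, block, 0, stream.get(), data, factor, n);
}
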
diff --git a/src/gpu_util/gpu_runtime_api.hpp b/src/gpu_util/gpu_runtime_api.hpp
new file mode 100644
index 0000000..b86deeb
--- /dev/null
+++ b/src/gpu_util/gpu_runtime_api.hpp
@@ -0,0 +1,206 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef SPFFT_GPU_RUNTIME_RUNTIME_HPP
+#define SPFFT_GPU_RUNTIME_RUNTIME_HPP
+
+#include "spfft/config.h"
+
+#if defined(SPFFT_CUDA)
+#include <cuda_runtime_api.h>
+#define GPU_PREFIX(val) cuda##val
+
+#elif defined(SPFFT_ROCM)
+#include <hip/hip_runtime_api.h>
+#define GPU_PREFIX(val) hip##val
+#endif
+
+// only declare namespace members if GPU support is enabled
+#if defined(SPFFT_CUDA) || defined(SPFFT_ROCM)
+
+#include <utility>
+#include "spfft/exceptions.hpp"
+
+namespace spfft {
+namespace gpu {
+
+using StatusType = GPU_PREFIX(Error_t);
+using StreamType = GPU_PREFIX(Stream_t);
+
+#ifdef SPFFT_CUDA
+using PointerAttributes = GPU_PREFIX(PointerAttributes);
+#else
+using PointerAttributes = GPU_PREFIX(PointerAttribute_t);
+#endif
+
+namespace status {
+// error / return values
+constexpr StatusType Success = GPU_PREFIX(Success);
+constexpr StatusType ErrorMemoryAllocation = GPU_PREFIX(ErrorMemoryAllocation);
+constexpr StatusType ErrorLaunchOutOfResources = GPU_PREFIX(ErrorLaunchOutOfResources);
+constexpr StatusType ErrorInvalidValue = GPU_PREFIX(ErrorInvalidValue);
+constexpr StatusType ErrorInvalidResourceHandle = GPU_PREFIX(ErrorInvalidResourceHandle);
+constexpr StatusType ErrorInvalidDevice = GPU_PREFIX(ErrorInvalidDevice);
+constexpr StatusType ErrorInvalidMemcpyDirection = GPU_PREFIX(ErrorInvalidMemcpyDirection);
+constexpr StatusType ErrorInvalidDevicePointer = GPU_PREFIX(ErrorInvalidDevicePointer);
+constexpr StatusType ErrorInitializationError = GPU_PREFIX(ErrorInitializationError);
+constexpr StatusType ErrorNoDevice = GPU_PREFIX(ErrorNoDevice);
+constexpr StatusType ErrorNotReady = GPU_PREFIX(ErrorNotReady);
+constexpr StatusType ErrorUnknown = GPU_PREFIX(ErrorUnknown);
+constexpr StatusType ErrorPeerAccessNotEnabled = GPU_PREFIX(ErrorPeerAccessNotEnabled);
+constexpr StatusType ErrorPeerAccessAlreadyEnabled = GPU_PREFIX(ErrorPeerAccessAlreadyEnabled);
+constexpr StatusType ErrorHostMemoryAlreadyRegistered =
+    GPU_PREFIX(ErrorHostMemoryAlreadyRegistered);
+constexpr StatusType ErrorHostMemoryNotRegistered = GPU_PREFIX(ErrorHostMemoryNotRegistered);
+constexpr StatusType ErrorUnsupportedLimit = GPU_PREFIX(ErrorUnsupportedLimit);
+} // namespace status
+
+// flags to pass to GPU API
+namespace flag {
+constexpr auto HostRegisterDefault = GPU_PREFIX(HostRegisterDefault);
+constexpr auto HostRegisterPortable = GPU_PREFIX(HostRegisterPortable);
+constexpr auto HostRegisterMapped = GPU_PREFIX(HostRegisterMapped);
+constexpr auto HostRegisterIoMemory = GPU_PREFIX(HostRegisterIoMemory);
+
+constexpr auto StreamDefault = GPU_PREFIX(StreamDefault);
+constexpr auto StreamNonBlocking = GPU_PREFIX(StreamNonBlocking);
+
+constexpr auto MemoryTypeHost = GPU_PREFIX(MemoryTypeHost);
+constexpr auto MemoryTypeDevice = GPU_PREFIX(MemoryTypeDevice);
+#if (CUDART_VERSION >= 10000)
+constexpr auto MemoryTypeUnregistered = GPU_PREFIX(MemoryTypeUnregistered);
+constexpr auto MemoryTypeManaged = GPU_PREFIX(MemoryTypeManaged);
+#endif
+
+constexpr auto MemcpyHostToDevice = GPU_PREFIX(MemcpyHostToDevice);
+constexpr auto MemcpyDeviceToHost = GPU_PREFIX(MemcpyDeviceToHost);
+} // namespace flag
+
+// ==================================
+// Error check functions
+// ==================================
+inline auto check_status(StatusType error) -> void {
+  if (error != status::Success) {
+    if (error == status::ErrorMemoryAllocation) throw GPUAllocationError();
+    if (error == status::ErrorLaunchOutOfResources) throw GPULaunchError();
+    if (error == status::ErrorNoDevice) throw GPUNoDeviceError();
+    if (error == status::ErrorInvalidValue) throw GPUInvalidValueError();
+    if (error == status::ErrorInvalidDevicePointer) throw GPUInvalidDevicePointerError();
+
+    throw GPUError();
+  }
+}
+
+// ==================================
+// Forwarding functions to the GPU API
+// ==================================
+template <typename... ARGS>
+inline auto host_register(ARGS... args) -> StatusType {
+  return GPU_PREFIX(HostRegister)(std::forward<ARGS>(args)...);
+}
+
+template <typename... ARGS>
+inline auto host_unregister(ARGS... args) -> StatusType {
+  return GPU_PREFIX(HostUnregister)(std::forward<ARGS>(args)...);
+}
+
+template <typename... ARGS>
+inline auto stream_create_with_flags(ARGS... args) -> StatusType {
+  return GPU_PREFIX(StreamCreateWithFlags)(std::forward<ARGS>(args)...);
+}
+
+template <typename... ARGS>
+inline auto stream_destroy(ARGS... args) -> StatusType {
+  return GPU_PREFIX(StreamDestroy)(std::forward<ARGS>(args)...);
+}
+
+template <typename... ARGS>
+inline auto malloc(ARGS... args) -> StatusType {
+  return GPU_PREFIX(Malloc)(std::forward<ARGS>(args)...);
+}
+
+template <typename... ARGS>
+inline auto free(ARGS... args) -> StatusType {
+  return GPU_PREFIX(Free)(std::forward<ARGS>(args)...);
+}
+
+template <typename... ARGS>
+inline auto memcpy(ARGS... args) -> StatusType {
+  return GPU_PREFIX(Memcpy)(std::forward<ARGS>(args)...);
+}
+
+template <typename... ARGS>
+inline auto memcpy_async(ARGS... args) -> StatusType {
+  return GPU_PREFIX(MemcpyAsync)(std::forward<ARGS>(args)...);
+}
+
+template <typename... ARGS>
+inline auto get_device(ARGS... args) -> StatusType {
+  return GPU_PREFIX(GetDevice)(std::forward<ARGS>(args)...);
+}
+
+template <typename... ARGS>
+inline auto set_device(ARGS... args) -> StatusType {
+  return GPU_PREFIX(SetDevice)(std::forward<ARGS>(args)...);
+}
+
+template <typename... ARGS>
+inline auto get_device_count(ARGS... args) -> StatusType {
+  return GPU_PREFIX(GetDeviceCount)(std::forward<ARGS>(args)...);
+}
+
+template <typename... ARGS>
+inline auto stream_synchronize(ARGS... args) -> StatusType {
+  return GPU_PREFIX(StreamSynchronize)(std::forward<ARGS>(args)...);
+}
+
+template <typename... ARGS>
+inline auto memset_async(ARGS... args) -> StatusType {
+  return GPU_PREFIX(MemsetAsync)(std::forward<ARGS>(args)...);
+}
+
+template <typename... ARGS>
+inline auto pointer_get_attributes(ARGS... args) -> StatusType {
+  return GPU_PREFIX(PointerGetAttributes)(std::forward<ARGS>(args)...);
+}
+
+inline auto get_last_error() -> StatusType {
+  return GPU_PREFIX(GetLastError)();
+}
+
+inline auto device_synchronize() -> StatusType {
+  return GPU_PREFIX(DeviceSynchronize)();
+}
+
+} // namespace gpu
+} // namespace spfft
+
+#undef GPU_PREFIX
+
+#endif // defined SPFFT_CUDA || SPFFT_ROCM
+#endif
diff --git a/src/gpu_util/gpu_stream_handle.hpp b/src/gpu_util/gpu_stream_handle.hpp
new file mode 100644
index 0000000..9616b15
--- /dev/null
+++ b/src/gpu_util/gpu_stream_handle.hpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef SPFFT_GPU_STREAM_HANDLE_HPP +#define SPFFT_GPU_STREAM_HANDLE_HPP + +#include "spfft/config.h" +#if defined(SPFFT_CUDA) || defined(SPFFT_ROCM) +#include +#include "gpu_util/gpu_runtime_api.hpp" +#include "spfft/exceptions.hpp" +namespace spfft { +class GPUStreamHandle { +public: + GPUStreamHandle() : stream_(new gpu::StreamType(0)), deviceId_(0) { + gpu::check_status(gpu::get_device(&deviceId_)); + } + + explicit GPUStreamHandle(const bool blockedByDefaultStream) : deviceId_(0) { + gpu::check_status(gpu::get_device(&deviceId_)); + gpu::StreamType rawStream; + if (blockedByDefaultStream) + gpu::check_status(gpu::stream_create_with_flags(&rawStream, gpu::flag::StreamDefault)); + else + gpu::check_status(gpu::stream_create_with_flags(&rawStream, gpu::flag::StreamNonBlocking)); + + stream_ = + std::shared_ptr(new gpu::StreamType(rawStream), [](gpu::StreamType* ptr) { + gpu::stream_destroy(*ptr); + delete ptr; + }); + }; + + inline auto get() const -> gpu::StreamType { return *stream_; } + + inline auto device_id() const noexcept -> int { return deviceId_; } + +private: + std::shared_ptr stream_; + int deviceId_ = 0; +}; +} // namespace spfft + +#endif +#endif diff --git a/src/gpu_util/gpu_transfer.hpp b/src/gpu_util/gpu_transfer.hpp new file mode 100644 index 0000000..d0e26d7 --- /dev/null +++ b/src/gpu_util/gpu_transfer.hpp @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2019 ETH Zurich, Simon Frasch + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPFFT_GPU_TRANSFER_HPP +#define SPFFT_GPU_TRANSFER_HPP + +#include +#include "gpu_util/gpu_stream_handle.hpp" +#include "memory/memory_type_trait.hpp" +#include "spfft/config.h" +#include "util/common_types.hpp" + +namespace spfft { + +template +auto copy_to_gpu(const T& hostArray, U&& gpuArray) -> void { + using UType = typename std::remove_reference::type; + static_assert(!IsDeviceMemory::value, "First argument must represent host memory!"); + static_assert(IsDeviceMemory::value, "Second argument must represent device memory!"); + static_assert(sizeof(decltype(*(gpuArray.data()))) == sizeof(decltype(*(hostArray.data()))), + "Size of value types must match!"); + + assert(hostArray.size() == static_cast(gpuArray.size())); + gpu::check_status(gpu::memcpy( + static_cast(gpuArray.data()), static_cast(hostArray.data()), + gpuArray.size() * sizeof(decltype(*(gpuArray.data()))), gpu::flag::MemcpyHostToDevice)); +} + +template +auto copy_to_gpu_async(const GPUStreamHandle& stream, const T& hostArray, U&& gpuArray) -> void { + using UType = typename std::remove_reference::type; + static_assert(!IsDeviceMemory::value, "First argument must represent host memory!"); + static_assert(IsDeviceMemory::value, "Second argument must represent device memory!"); + static_assert(sizeof(decltype(*(gpuArray.data()))) == sizeof(decltype(*(hostArray.data()))), + "Size of value types must match!"); + + assert(hostArray.size() == static_cast(gpuArray.size())); + gpu::check_status(gpu::memcpy_async(static_cast(gpuArray.data()), + static_cast(hostArray.data()), + gpuArray.size() * sizeof(decltype(*(gpuArray.data()))), + gpu::flag::MemcpyHostToDevice, stream.get())); +} + +template +auto copy_from_gpu(const T& gpuArray, U&& hostArray) -> void { + using UType = typename std::remove_reference::type; + static_assert(IsDeviceMemory::value, "First argument must represent device memory!"); + static_assert(!IsDeviceMemory::value, "Second argument must represent host memory!"); + static_assert(sizeof(decltype(*(gpuArray.data()))) == sizeof(decltype(*(hostArray.data()))), + "Size of value types must match!"); + + assert(hostArray.size() == static_cast(gpuArray.size())); + gpu::check_status(gpu::memcpy( + static_cast(hostArray.data()), static_cast(gpuArray.data()), + hostArray.size() * sizeof(decltype(*(gpuArray.data()))), gpu::flag::MemcpyDeviceToHost)); +} + +template +auto copy_from_gpu_async(const GPUStreamHandle& stream, const T& gpuArray, U&& hostArray) -> void { + using UType = typename std::remove_reference::type; + static_assert(IsDeviceMemory::value, "First argument must represent device memory!"); + static_assert(!IsDeviceMemory::value, "Second argument must represent host memory!"); + static_assert(sizeof(decltype(*(gpuArray.data()))) == sizeof(decltype(*(hostArray.data()))), + "Size of value types must match!"); + + assert(hostArray.size() == static_cast(gpuArray.size())); + gpu::check_status(gpu::memcpy_async(static_cast(hostArray.data()), + 
static_cast<const void*>(gpuArray.data()),
+                                      hostArray.size() * sizeof(decltype(*(gpuArray.data()))),
+                                      gpu::flag::MemcpyDeviceToHost, stream.get()));
+}
+
+} // namespace spfft
+
+#endif
diff --git a/src/memory/aligned_allocation.cpp b/src/memory/aligned_allocation.cpp
new file mode 100644
index 0000000..5e08794
--- /dev/null
+++ b/src/memory/aligned_allocation.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "memory/aligned_allocation.hpp"
+#include
+#include
+
+namespace spfft {
+
+namespace memory {
+
+auto allocate_aligned(SizeType numBytes, SizeType alignment) -> void* {
+  // check if sizeof(void*) is power of 2
+  static_assert((sizeof(void*) & (sizeof(void*) - 1)) == 0,
+                "size of void* must be a power of 2 for alignment!");
+  // check if alignment is power of 2 and multiple of sizeof(void*)
+  if (alignment % sizeof(void*) != 0 || ((alignment & (alignment - 1)) != 0))
+    throw HostAllocationError();
+  void* ptr;
+  if (posix_memalign(&ptr, alignment, numBytes) != 0) throw HostAllocationError();
+  return ptr;
+}
+
+auto allocate_aligned(SizeType numBytes) -> void* {
+  static auto pageSize = sysconf(_SC_PAGESIZE);
+  return allocate_aligned(numBytes, static_cast<SizeType>(pageSize));
+}
+
+auto free_aligned(void* ptr) noexcept -> void {
+  free(ptr);
+}
+
+} // namespace memory
+
+} // namespace spfft
+
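
The two overloads give explicit control over alignment: the first forwards to posix_memalign (hence alignment must be a power of 2 and a multiple of sizeof(void*)), the second defaults to the page size. Usage sketch:

#include "memory/aligned_allocation.hpp"

void allocation_demo() {
  void* a = spfft::memory::allocate_aligned(4096, 64);  // 64-byte (e.g. cache line) alignment
  void* b = spfft::memory::allocate_aligned(4096);      // page-aligned via sysconf(_SC_PAGESIZE)
  spfft::memory::free_aligned(a);
  spfft::memory::free_aligned(b);
}
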
diff --git a/src/memory/aligned_allocation.hpp b/src/memory/aligned_allocation.hpp
new file mode 100644
index 0000000..f54afac
--- /dev/null
+++ b/src/memory/aligned_allocation.hpp
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPFFT_ALIGNED_ALLOCATOR_HPP
+#define SPFFT_ALIGNED_ALLOCATOR_HPP
+
+#include
+#include
+#include "spfft/config.h"
+#include "spfft/exceptions.hpp"
+#include "util/common_types.hpp"
+
+namespace spfft {
+
+namespace memory {
+
+// Allocate the given number of bytes at an address with the given alignment.
+// The alignment must be a multiple of sizeof(void*) and a power of 2.
+// Throws upon failure.
+auto allocate_aligned(SizeType numBytes, SizeType alignment) -> void*;
+
+// Allocate memory aligned at page boundaries
+auto allocate_aligned(SizeType numBytes) -> void*;
+
+// Free memory allocated with the allocate_aligned() function
+auto free_aligned(void* ptr) noexcept -> void;
+
+// construct numElements elements of type T with arguments args at the location pointed to by ptr
+template <typename T, typename... ARGS>
+auto construct_elements_in_place(T* ptr, SizeType numElements, ARGS... args) -> void;
+
+// deconstruct elements of trivially destructible type in array
+template <typename T,
+          typename std::enable_if<std::is_trivially_destructible<T>::value, int>::type = 0>
+auto deconstruct_elements(T* ptr, SizeType numElements) noexcept -> void;
+
+// deconstruct elements of non-trivially destructible type in array
+template <typename T,
+          typename std::enable_if<!std::is_trivially_destructible<T>::value, int>::type = 0>
+auto deconstruct_elements(T* ptr,
+                          SizeType numElements) noexcept(std::is_nothrow_destructible<T>::value)
+    -> void;
+
+// ======================
+// Implementation
+// ======================
+template <typename T, typename... ARGS>
+auto construct_elements_in_place(T* ptr, SizeType numElements, ARGS... args) -> void {
+  SizeType constructIdx = 0;
+  try {
+    // construct all elements
+    for (; constructIdx < numElements; ++constructIdx) {
+      new (ptr + constructIdx) T(std::forward<ARGS>(args)...);
+    }
+  } catch (...) {
+    // destruct all elements that were successfully constructed before the error
+    deconstruct_elements(ptr, constructIdx);
+    throw;
+  }
+}
+
+template <typename T,
+          typename std::enable_if<std::is_trivially_destructible<T>::value, int>::type>
+auto deconstruct_elements(T*, SizeType) noexcept -> void {}
+
+template <typename T,
+          typename std::enable_if<!std::is_trivially_destructible<T>::value, int>::type>
+auto deconstruct_elements(T* ptr,
+                          SizeType numElements) noexcept(std::is_nothrow_destructible<T>::value)
+    -> void {
+  for (SizeType destructIdx = 0; destructIdx < numElements; ++destructIdx) {
+    ptr[destructIdx].~T();
+  }
+}
+
+} // namespace memory
+} // namespace spfft
+
+#endif
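
construct_elements_in_place cleans up after itself: if a constructor throws, the elements built so far are destroyed before the exception propagates. A sketch combining it with the aligned allocator (illustration only; std::string is an arbitrary non-trivial type):

#include <string>
#include "memory/aligned_allocation.hpp"

void construction_demo() {
  void* raw = spfft::memory::allocate_aligned(16 * sizeof(std::string), 64);
  auto* strings = static_cast<std::string*>(raw);
  // constructs 16 copies of std::string("setup"); destroys partial work on throw
  spfft::memory::construct_elements_in_place(strings, spfft::SizeType(16), "setup");
  // ... use strings[0..15] ...
  spfft::memory::deconstruct_elements(strings, spfft::SizeType(16));
  spfft::memory::free_aligned(raw);
}
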
diff --git a/src/memory/array_view_utility.hpp b/src/memory/array_view_utility.hpp
new file mode 100644
index 0000000..2a28595
--- /dev/null
+++ b/src/memory/array_view_utility.hpp
@@ -0,0 +1,236 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPFFT_ARRAY_VIEW_UTILITY_HPP
+#define SPFFT_ARRAY_VIEW_UTILITY_HPP
+
+#include
+#include
+#include
+#include
+#include
+#include "memory/gpu_array_view.hpp"
+#include "memory/host_array_view.hpp"
+#include "memory/memory_type_trait.hpp"
+#include "spfft/config.h"
+#include "util/common_types.hpp"
+
+namespace spfft {
+
+template <typename T, typename U>
+auto disjoint(const T& array1, const U& array2) -> bool {
+  const void* start1 = static_cast<const void*>(array1.data());
+  const void* end1 = static_cast<const void*>(array1.data() + array1.size());
+  const void* start2 = static_cast<const void*>(array2.data());
+  const void* end2 = static_cast<const void*>(array2.data() + array2.size());
+  return !(start1 >= start2 && start1 < end2) && !(start2 >= start1 && start2 < end1);
+}
+
+namespace gpu_array_utility_internal {
+inline auto checked_cast_to_int(const SizeType value) -> int {
+  static_assert(std::is_unsigned<SizeType>::value, "Expected unsigned SizeType");
+  if (value > static_cast<SizeType>(std::numeric_limits<int>::max())) {
+    throw OverflowError();
+  }
+  return static_cast<int>(value);
+}
+} // namespace gpu_array_utility_internal
+
+// ----------------------
+// Create array view
+// ----------------------
+
+template <typename T, typename std::enable_if<!IsDeviceMemory<T>::value, int>::type = 0>
+auto create_1d_view(T& array, const SizeType startIdx, const SizeType size)
+    -> HostArrayView1D<typename T::ValueType> {
+  assert(array.size() >= startIdx + size);
+  return HostArrayView1D<typename T::ValueType>(array.data() + startIdx, size, array.pinned());
+}
+
+template <typename T, typename std::enable_if<IsDeviceMemory<T>::value, int>::type = 0>
+auto create_1d_view(T& array, const SizeType startIdx, const SizeType size)
+    -> GPUArrayView1D<typename T::ValueType> {
+  assert(array.size() >= startIdx + size);
+  return GPUArrayView1D<typename T::ValueType>(
+      array.data() + startIdx, gpu_array_utility_internal::checked_cast_to_int(size),
+      array.device_id());
+}
+
+template <typename T, typename std::enable_if<!IsDeviceMemory<T>::value, int>::type = 0>
+auto create_2d_view(T& array, const SizeType startIdx, const SizeType dimOuter,
+                    const SizeType dimInner) -> HostArrayView2D<typename T::ValueType> {
+  assert(array.size() >= startIdx + dimInner * dimOuter);
+  return HostArrayView2D<typename T::ValueType>(array.data() + startIdx, dimOuter, dimInner,
+                                                array.pinned());
+}
+
+template <typename T, typename std::enable_if<IsDeviceMemory<T>::value, int>::type = 0>
+auto create_2d_view(T& array, const SizeType startIdx, const SizeType dimOuter,
+                    const SizeType dimInner) -> GPUArrayView2D<typename T::ValueType> {
+  assert(array.size() >= startIdx + dimInner * dimOuter);
+  // check that the entire memory can be addressed with int
+  gpu_array_utility_internal::checked_cast_to_int(dimOuter * dimInner);
+  return GPUArrayView2D<typename T::ValueType>(
+      array.data() + startIdx, gpu_array_utility_internal::checked_cast_to_int(dimOuter),
+      gpu_array_utility_internal::checked_cast_to_int(dimInner), array.device_id());
+}
+
+template <typename T, typename std::enable_if<!IsDeviceMemory<T>::value, int>::type = 0>
+auto create_3d_view(T& array, const SizeType startIdx, const SizeType dimOuter,
+                    const SizeType dimMid, const SizeType dimInner)
+    -> HostArrayView3D<typename T::ValueType> {
+  assert(array.size() >= startIdx + dimOuter * dimMid * dimInner);
+  return HostArrayView3D<typename T::ValueType>(array.data() + startIdx, dimOuter, dimMid,
+                                                dimInner, array.pinned());
+}
+
+template <typename T, typename std::enable_if<IsDeviceMemory<T>::value, int>::type = 0>
+auto create_3d_view(T& array, const SizeType startIdx, const SizeType dimOuter,
+                    const SizeType dimMid, const SizeType dimInner)
+    -> GPUArrayView3D<typename T::ValueType> {
+  assert(array.size() >= startIdx + dimOuter * dimMid * dimInner);
+  // check that the entire memory can be addressed with int
+  gpu_array_utility_internal::checked_cast_to_int(dimOuter * dimMid * dimInner);
+  return GPUArrayView3D<typename T::ValueType>(
+      array.data() + startIdx, gpu_array_utility_internal::checked_cast_to_int(dimOuter),
+      gpu_array_utility_internal::checked_cast_to_int(dimMid),
+      gpu_array_utility_internal::checked_cast_to_int(dimInner), array.device_id());
+}
+
+// -------------------------------
+// Create array view with new type
+// -------------------------------
+template <typename U, typename T,
+          typename std::enable_if<!IsDeviceMemory<T>::value, int>::type = 0>
+auto create_new_type_1d_view(T& array, const SizeType size) -> HostArrayView1D<U> {
+  assert(array.size() * sizeof(typename T::ValueType) >= size * sizeof(U));
+  static_assert(alignof(typename T::ValueType) % alignof(U) == 0,
+                "Alignment of old type must be multiple of new type alignment");
+  return HostArrayView1D<U>(reinterpret_cast<U*>(array.data()), size, array.pinned());
+}
+
+template <typename U, typename T,
+          typename std::enable_if<IsDeviceMemory<T>::value, int>::type = 0>
+auto create_new_type_1d_view(T& array, const SizeType size) -> GPUArrayView1D<U> {
+  assert(array.size() * sizeof(typename T::ValueType) >= size * sizeof(U));
+  static_assert(alignof(typename T::ValueType) % alignof(U) == 0,
+                "Alignment of old type must be multiple of new type alignment");
+  return GPUArrayView1D<U>(reinterpret_cast<U*>(array.data()),
+                           gpu_array_utility_internal::checked_cast_to_int(size),
+                           array.device_id());
+}
+
+template <typename U, typename T,
+          typename std::enable_if<!IsDeviceMemory<T>::value, int>::type = 0>
+auto create_new_type_2d_view(T& array, const SizeType dimOuter, const SizeType dimInner)
+    -> HostArrayView2D<U> {
+  assert(array.size() * sizeof(typename T::ValueType) >= dimOuter * dimInner * sizeof(U));
+  static_assert(alignof(typename T::ValueType) % alignof(U) == 0,
+                "Alignment of old type must be multiple of new type alignment");
+  return HostArrayView2D<U>(reinterpret_cast<U*>(array.data()), dimOuter, dimInner,
+                            array.pinned());
+}
+
+template <typename U, typename T,
+          typename std::enable_if<IsDeviceMemory<T>::value, int>::type = 0>
+auto create_new_type_2d_view(T& array, const SizeType dimOuter, const SizeType dimInner)
+    -> GPUArrayView2D<U> {
+  assert(array.size() * sizeof(typename T::ValueType) >= dimOuter * dimInner * sizeof(U));
+  static_assert(alignof(typename T::ValueType) % alignof(U) == 0,
+                "Alignment of old type must be multiple of new type alignment");
+  // check that the entire memory can be addressed with int
+  gpu_array_utility_internal::checked_cast_to_int(dimOuter * dimInner);
+  return GPUArrayView2D<U>(
+      reinterpret_cast<U*>(array.data()), gpu_array_utility_internal::checked_cast_to_int(dimOuter),
+      gpu_array_utility_internal::checked_cast_to_int(dimInner), array.device_id());
+}
+
+template <typename U, typename T,
+          typename std::enable_if<!IsDeviceMemory<T>::value, int>::type = 0>
+auto create_new_type_3d_view(T& array, const SizeType dimOuter, const SizeType dimMid,
+                             const SizeType dimInner) -> HostArrayView3D<U> {
+  assert(array.size() * sizeof(typename T::ValueType) >= dimOuter * dimMid * dimInner * sizeof(U));
+  static_assert(alignof(typename T::ValueType) % alignof(U) == 0,
+                "Alignment of old type must be multiple of new type alignment");
+  return HostArrayView3D<U>(reinterpret_cast<U*>(array.data()), dimOuter, dimMid, dimInner,
+                            array.pinned());
+}
+
+template <typename U, typename T,
+          typename std::enable_if<IsDeviceMemory<T>::value, int>::type = 0>
+auto create_new_type_3d_view(T& array, const SizeType dimOuter, const SizeType dimMid,
+                             const SizeType dimInner) -> GPUArrayView3D<U> {
+  assert(array.size() * sizeof(typename T::ValueType) >= dimOuter * dimMid * dimInner * sizeof(U));
+  static_assert(alignof(typename T::ValueType) % alignof(U) == 0,
+                "Alignment of old type must be multiple of new type alignment");
+  // check that the entire memory can be addressed with int
+  gpu_array_utility_internal::checked_cast_to_int(dimOuter * dimMid * dimInner);
+  return GPUArrayView3D<U>(
+      reinterpret_cast<U*>(array.data()), gpu_array_utility_internal::checked_cast_to_int(dimOuter),
+      gpu_array_utility_internal::checked_cast_to_int(dimMid),
+      gpu_array_utility_internal::checked_cast_to_int(dimInner), array.device_id());
+}
+
+// --------------------------------
+// convert scalar and complex views
+// --------------------------------
and complex views +// -------------------------------- +template +auto convert_to_complex_view(HostArrayView1D view) -> HostArrayView1D> { + assert(view.size() % 2 == 0); + return HostArrayView1D>(reinterpret_cast*>(view.data()), + view.size() / 2, view.pinned()); +} + +template +auto convert_to_complex_view(HostArrayView2D view) -> HostArrayView2D> { + assert(view.dim_inner() % 2 == 0); + return HostArrayView2D>(reinterpret_cast*>(view.data()), + view.dim_outer(), view.dim_inner() / 2, view.pinned()); +} + +template +auto convert_to_complex_view(HostArrayView3D view) -> HostArrayView3D> { + assert(view.dim_inner() % 2 == 0); + return HostArrayView3D>(reinterpret_cast*>(view.data()), + view.dim_outer(), view.dim_mid(), view.dim_inner() / 2, + view.pinned()); +} + +template +auto convert_from_complex_view(HostArrayView2D> view) -> HostArrayView1D { + return HostArrayView1D(reinterpret_cast(view.data()), view.size() * 2, view.pinned()); +} + +template +auto convert_from_complex_view(HostArrayView2D> view) -> HostArrayView3D { + return HostArrayView2D(reinterpret_cast(view.data()), view.dim_outer(), + view.dim_inner() * 2, view.pinned()); +} + +template +auto convert_from_complex_view(HostArrayView3D> view) -> HostArrayView3D { + return HostArrayView3D(reinterpret_cast(view.data()), view.dim_outer(), view.dim_mid(), + view.dim_inner() * 2, view.pinned()); +} + +} // namespace spfft + +#endif diff --git a/src/memory/gpu_array.hpp b/src/memory/gpu_array.hpp new file mode 100644 index 0000000..d71eea0 --- /dev/null +++ b/src/memory/gpu_array.hpp @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2019 ETH Zurich, Simon Frasch + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
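
The helpers above carve typed, multi-dimensional windows out of one flat allocation without copying. A minimal host-side sketch of the intended use (the `example` function and all sizes are illustrative, not part of this diff; `HostArray` is defined later in the diff):

    #include <cassert>
    #include "memory/array_view_utility.hpp"
    #include "memory/host_array.hpp"

    void example() {
      spfft::HostArray<double> buffer(2 * 4 * 8);            // one flat allocation
      auto upper = spfft::create_2d_view(buffer, 0, 4, 8);   // first 4x8 block
      auto lower = spfft::create_2d_view(buffer, 32, 4, 8);  // second 4x8 block
      assert(spfft::disjoint(upper, lower));                 // views must not alias
      upper(0, 0) = 1.0;                                     // row-major access
    }
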
diff --git a/src/memory/gpu_array.hpp b/src/memory/gpu_array.hpp
new file mode 100644
index 0000000..d71eea0
--- /dev/null
+++ b/src/memory/gpu_array.hpp
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPFFT_GPU_ARRAY_HPP
+#define SPFFT_GPU_ARRAY_HPP
+
+#include <cassert>
+#include "gpu_util/gpu_runtime_api.hpp"
+#include "spfft/config.h"
+#include "util/common_types.hpp"
+
+namespace spfft {
+
+template <typename T>
+class GPUArray {
+public:
+  using ValueType = T;
+  static constexpr SizeType ORDER = 1;
+
+  GPUArray() = default;
+
+  GPUArray(const SizeType size);
+
+  GPUArray(const GPUArray& array) = delete;
+
+  GPUArray(GPUArray&& array) noexcept;
+
+  ~GPUArray();
+
+  auto operator=(const GPUArray& array) -> GPUArray& = delete;
+
+  auto operator=(GPUArray&& array) noexcept -> GPUArray&;
+
+  inline auto data() noexcept -> ValueType* { return data_; }
+
+  inline auto data() const noexcept -> const ValueType* { return data_; }
+
+  inline auto empty() const noexcept -> bool { return size_ == 0; }
+
+  inline auto size() const noexcept -> SizeType { return size_; }
+
+  inline auto device_id() const noexcept -> int { return deviceId_; }
+
+private:
+  SizeType size_ = 0;
+  ValueType* data_ = nullptr;
+  int deviceId_ = 0;
+};
+
+// ======================
+// Implementation
+// ======================
+template <typename T>
+GPUArray<T>::GPUArray(const SizeType size) : size_(size), data_(nullptr), deviceId_(0) {
+  gpu::check_status(gpu::get_device(&deviceId_));
+  if (size > 0) {
+    gpu::check_status(gpu::malloc(reinterpret_cast<void**>(&data_), size * sizeof(ValueType)));
+  }
+}
+
+template <typename T>
+GPUArray<T>::~GPUArray() {
+  if (data_) {
+    // don't check error to avoid throwing exception in destructor
+    gpu::free(data_);
+    data_ = nullptr;
+    size_ = 0;
+  }
+}
+
+template <typename T>
+GPUArray<T>::GPUArray(GPUArray&& array) noexcept
+    : size_(array.size_), data_(array.data_), deviceId_(array.deviceId_) {
+  array.data_ = nullptr;
+  array.size_ = 0;
+}
+
+template <typename T>
+auto GPUArray<T>::operator=(GPUArray&& array) noexcept -> GPUArray& {
+  if (data_) {
+    gpu::free(data_);
+  }
+  data_ = array.data_;
+  size_ = array.size_;
+  deviceId_ = array.deviceId_;
+
+  array.data_ = nullptr;
+  array.size_ = 0;
+
+  return *this;
+}
+
+}  // namespace spfft
+
+#endif
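
GPUArray is deliberately move-only: the copy operations are deleted, so a device allocation can change owner but is never duplicated implicitly. A short sketch of the ownership semantics (the `ownership_example` function and buffer size are illustrative; assumes a CUDA or ROCm build where the allocation succeeds):

    #include <utility>
    #include "memory/gpu_array.hpp"

    void ownership_example() {
      spfft::GPUArray<float> buffer(1024);  // allocates on the current device
      // spfft::GPUArray<float> copy = buffer;            // ill-formed: copy is deleted
      spfft::GPUArray<float> owner = std::move(buffer);   // pointer handoff, no copy
      // buffer is now empty; owner frees the memory in its destructor
    }
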
diff --git a/src/memory/gpu_array_const_view.hpp b/src/memory/gpu_array_const_view.hpp
new file mode 100644
index 0000000..14d9e04
--- /dev/null
+++ b/src/memory/gpu_array_const_view.hpp
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPFFT_GPU_ARRAY_CONST_VIEW_HPP
+#define SPFFT_GPU_ARRAY_CONST_VIEW_HPP
+
+#include <cassert>
+#include <cstddef>
+#include "memory/gpu_array_view.hpp"
+#include "spfft/config.h"
+#include "spfft/exceptions.hpp"
+#include "util/common_types.hpp"
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+#include "gpu_util/gpu_runtime.hpp"
+#endif
+
+namespace spfft {
+
+// T must be a built-in type
+template <typename T>
+class GPUArrayConstView1D {
+public:
+  using ValueType = T;
+  static constexpr SizeType ORDER = 1;
+
+  GPUArrayConstView1D() = default;
+
+  GPUArrayConstView1D(const ValueType* data, const int size, const int deviceId);
+
+  GPUArrayConstView1D(const GPUArrayView1D<T>&);  // conversion allowed
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  __device__ inline auto operator()(const int idx) const -> ValueType {
+    assert(idx < size_);
+#if __CUDA_ARCH__ >= 350 || defined(__HIPCC__)
+    return __ldg(data_ + idx);
+#else
+    return data_[idx];
+#endif
+  }
+
+  __host__ __device__ inline auto empty() const noexcept -> bool { return size_ == 0; }
+
+  __host__ __device__ inline auto size() const noexcept -> int { return size_; }
+
+  __host__ __device__ inline auto device_id() const noexcept -> int { return deviceId_; }
+
+#else
+
+  inline auto empty() const noexcept -> bool { return size_ == 0; }
+
+  inline auto size() const noexcept -> int { return size_; }
+
+  inline auto device_id() const noexcept -> int { return deviceId_; }
+
+#endif
+
+private:
+  int size_ = 0;
+  const ValueType* data_ = nullptr;
+  int deviceId_ = 0;
+};
+
+// T must be a built-in type
+template <typename T>
+class GPUArrayConstView2D {
+public:
+  using ValueType = T;
+  static constexpr SizeType ORDER = 2;
+
+  GPUArrayConstView2D() = default;
+
+  GPUArrayConstView2D(const ValueType* data, const int dimOuter, const int dimInner,
+                      const int deviceId);
+
+  GPUArrayConstView2D(const GPUArrayView2D<T>&);  // conversion allowed
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+
+  __device__ inline auto operator()(const int idxOuter, const int idxInner) const -> ValueType {
+    assert(idxOuter < dims_[0]);
+    assert(idxInner < dims_[1]);
+#if __CUDA_ARCH__ >= 350 || defined(__HIPCC__)
+    return __ldg(data_ + (idxOuter * dims_[1]) + idxInner);
+#else
+    return data_[(idxOuter * dims_[1]) + idxInner];
+#endif
+  }
+
+  __host__ __device__ inline auto index(const int idxOuter, const int idxInner) const noexcept
+      -> int {
+    return (idxOuter * dims_[1]) + idxInner;
+  }
+
+  __host__ __device__ inline auto empty() const noexcept -> bool { return this->size() == 0; }
+
+  __host__ __device__ inline auto size() const noexcept -> int { return dims_[0] * dims_[1]; }
+
+  __host__ __device__ inline auto dim_inner() const noexcept -> int { return dims_[1]; }
+
+  __host__ __device__ inline auto dim_outer() const noexcept -> int { return dims_[0]; }
+
+  __host__ __device__ inline auto device_id() const noexcept -> int { return deviceId_; }
+
+#else
+
+  inline auto index(const int idxOuter, const int idxInner) const noexcept -> int {
+    return (idxOuter * dims_[1]) + idxInner;
+  }
+
+  inline auto empty() const noexcept -> bool { return this->size() == 0; }
+
+  inline auto size() const noexcept -> int { return dims_[0] * dims_[1]; }
+
+  inline auto dim_inner() const noexcept -> int { return dims_[1]; }
+
+  inline auto dim_outer() const noexcept -> int { return dims_[0]; }
+
+  inline auto device_id() const noexcept -> int { return deviceId_; }
+
+#endif
+private:
+  int dims_[2] = {0, 0};
+  const ValueType* data_ = nullptr;
+  int deviceId_ = 0;
+};
+
+// T must be a built-in type
+template <typename T>
+class GPUArrayConstView3D {
+public:
+  using ValueType = T;
+  static constexpr SizeType ORDER = 3;
+
+  GPUArrayConstView3D() = default;
+
+  GPUArrayConstView3D(const ValueType* data, const int dimOuter, const int dimMid,
+                      const int dimInner, const int deviceId);
+
+  GPUArrayConstView3D(const GPUArrayView3D<T>&);  // conversion allowed
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+
+  __device__ inline auto operator()(const int idxOuter, const int idxMid,
+                                    const int idxInner) const noexcept -> ValueType {
+    assert(idxOuter < dims_[0]);
+    assert(idxMid < dims_[1]);
+    assert(idxInner < dims_[2]);
+#if __CUDA_ARCH__ >= 350 || defined(__HIPCC__)
+    return __ldg(data_ + (idxOuter * dims_[1] + idxMid) * dims_[2] + idxInner);
+#else
+    return data_[(idxOuter * dims_[1] + idxMid) * dims_[2] + idxInner];
+#endif
+  }
+
+  __host__ __device__ inline auto index(const int idxOuter, const int idxMid,
+                                        const int idxInner) const noexcept -> int {
+    return (idxOuter * dims_[1] + idxMid) * dims_[2] + idxInner;
+  }
+
+  __host__ __device__ inline auto empty() const noexcept -> bool { return this->size() == 0; }
+
+  __host__ __device__ inline auto size() const noexcept -> int {
+    return dims_[0] * dims_[1] * dims_[2];
+  }
+
+  __host__ __device__ inline auto dim_inner() const noexcept -> int { return dims_[2]; }
+
+  __host__ __device__ inline auto dim_mid() const noexcept -> int { return dims_[1]; }
+
+  __host__ __device__ inline auto dim_outer() const noexcept -> int { return dims_[0]; }
+
+  __host__ __device__ inline auto device_id() const noexcept -> int { return deviceId_; }
+
+#else
+
+  inline auto index(const int idxOuter, const int idxMid, const int idxInner) const noexcept
+      -> int {
+    return (idxOuter * dims_[1] + idxMid) * dims_[2] + idxInner;
+  }
+
+  inline auto empty() const noexcept -> bool { return this->size() == 0; }
+
+  inline auto size() const noexcept -> int { return dims_[0] * dims_[1] * dims_[2]; }
+
+  inline auto dim_inner() const noexcept -> int { return dims_[2]; }
+
+  inline auto dim_mid() const noexcept -> int { return dims_[1]; }
+
+  inline auto dim_outer() const noexcept -> int { return dims_[0]; }
+
+  inline auto device_id() const noexcept -> int { return deviceId_; }
+
+#endif
+
+private:
+  int dims_[3] = {0, 0, 0};
+  const ValueType* data_ = nullptr;
+  int deviceId_ = 0;
+};
+
+// ======================
+// Implementation
+// ======================
+
+template <typename T>
+GPUArrayConstView1D<T>::GPUArrayConstView1D(const ValueType* data, const int size,
+                                            const int deviceId)
+    : size_(size), data_(data), deviceId_(deviceId) {
+  assert(!(size != 0 && data == nullptr));
+}
+
+template <typename T>
+GPUArrayConstView1D<T>::GPUArrayConstView1D(const GPUArrayView1D<T>& view)
+    : size_(view.size()), data_(view.data()), deviceId_(view.device_id()) {}
+
+template <typename T>
+GPUArrayConstView2D<T>::GPUArrayConstView2D(const ValueType* data, const int dimOuter,
+                                            const int dimInner, const int deviceId)
+    : dims_{dimOuter, dimInner}, data_(data), deviceId_(deviceId) {
+  assert(!(dimOuter != 0 && dimInner != 0 && data == nullptr));
+  assert(dimOuter >= 0);
+  assert(dimInner >= 0);
+}
+
+template <typename T>
+GPUArrayConstView2D<T>::GPUArrayConstView2D(const GPUArrayView2D<T>& view)
+    : dims_{view.dim_outer(), view.dim_inner()}, data_(view.data()), deviceId_(view.device_id()) {}
+
+template <typename T>
+GPUArrayConstView3D<T>::GPUArrayConstView3D(const ValueType* data, const int dimOuter,
+                                            const int dimMid, const int dimInner,
+                                            const int deviceId)
+    : dims_{dimOuter, dimMid, dimInner}, data_(data), deviceId_(deviceId) {
+  assert(!(dimOuter != 0 && dimMid != 0 && dimInner != 0 && data == nullptr));
+  assert(dimOuter >= 0);
+  assert(dimMid >= 0);
+  assert(dimInner >= 0);
+}
+
+template <typename T>
+GPUArrayConstView3D<T>::GPUArrayConstView3D(const GPUArrayView3D<T>& view)
+    : dims_{view.dim_outer(), view.dim_mid(), view.dim_inner()},
+      data_(view.data()),
+      deviceId_(view.device_id()) {}
+
+}  // namespace spfft
+
+#endif
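
The const views exist so device code can route loads through the read-only cache path (`__ldg`) on hardware that supports it. A sketch of a kernel consuming one (the `scale` kernel and launch configuration are illustrative, not part of this diff, and assume compilation with nvcc or hipcc):

    #include "memory/gpu_array_const_view.hpp"
    #include "memory/gpu_array_view.hpp"

    __global__ void scale(spfft::GPUArrayView1D<float> out,
                          spfft::GPUArrayConstView1D<float> in, float factor) {
      const int idx = blockIdx.x * blockDim.x + threadIdx.x;
      if (idx < in.size()) {
        out(idx) = factor * in(idx);  // in(idx) compiles to __ldg on sm_35+ / ROCm
      }
    }
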
diff --git a/src/memory/gpu_array_view.hpp b/src/memory/gpu_array_view.hpp
new file mode 100644
index 0000000..9008db6
--- /dev/null
+++ b/src/memory/gpu_array_view.hpp
@@ -0,0 +1,284 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPFFT_GPU_ARRAY_VIEW_HPP
+#define SPFFT_GPU_ARRAY_VIEW_HPP
+
+#include <cassert>
+#include <cstddef>
+#include "spfft/config.h"
+#include "spfft/exceptions.hpp"
+#include "util/common_types.hpp"
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+#include "gpu_util/gpu_runtime.hpp"
+#endif
+
+namespace spfft {
+
+template <typename T>
+class GPUArrayView1D {
+public:
+  using ValueType = T;
+  static constexpr SizeType ORDER = 1;
+
+  GPUArrayView1D() = default;
+
+  GPUArrayView1D(ValueType* data, const int size, const int deviceId);
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  __device__ inline auto operator()(const int idx) -> ValueType& {
+    assert(idx < size_);
+    return data_[idx];
+  }
+
+  __device__ inline auto operator()(const int idx) const -> const ValueType& {
+    assert(idx < size_);
+    return data_[idx];
+  }
+
+  __host__ __device__ inline auto data() noexcept -> ValueType* { return data_; }
+
+  __host__ __device__ inline auto data() const noexcept -> const ValueType* { return data_; }
+
+  __host__ __device__ inline auto empty() const noexcept -> bool { return size_ == 0; }
+
+  __host__ __device__ inline auto size() const noexcept -> int { return size_; }
+
+  __host__ __device__ inline auto device_id() const noexcept -> int { return deviceId_; }
+
+#else
+
+  inline auto data() noexcept -> ValueType* { return data_; }
+
+  inline auto data() const noexcept -> const ValueType* { return data_; }
+
+  inline auto empty() const noexcept -> bool { return size_ == 0; }
+
+  inline auto size() const noexcept -> int { return size_; }
+
+  inline auto device_id() const noexcept -> int { return deviceId_; }
+
+#endif
+
+private:
+  int size_ = 0;
+  ValueType* data_ = nullptr;
+  int deviceId_ = 0;
+};
+
+template <typename T>
+class GPUArrayView2D {
+public:
+  using ValueType = T;
+  static constexpr SizeType ORDER = 2;
+
+  GPUArrayView2D() = default;
+
+  GPUArrayView2D(ValueType* data, const int dimOuter, const int dimInner, const int deviceId);
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+
+  __device__ inline auto operator()(const int idxOuter, const int idxInner) -> ValueType& {
+    assert(idxOuter < dims_[0]);
+    assert(idxInner < dims_[1]);
+    return data_[(idxOuter * dims_[1]) + idxInner];
+  }
+
+  __device__ inline auto operator()(const int idxOuter, const int idxInner) const
+      -> const ValueType& {
+    assert(idxOuter < dims_[0]);
+    assert(idxInner < dims_[1]);
+    return data_[(idxOuter * dims_[1]) + idxInner];
+  }
+
+  __host__ __device__ inline auto index(const int idxOuter, const int idxInner) const noexcept
+      -> int {
+    return (idxOuter * dims_[1]) + idxInner;
+  }
+
+  __host__ __device__ inline auto data() noexcept -> ValueType* { return data_; }
+
+  __host__ __device__ inline auto data() const noexcept -> const ValueType* { return data_; }
+
+  __host__ __device__ inline auto empty() const noexcept -> bool { return this->size() == 0; }
+
+  __host__ __device__ inline auto size() const noexcept -> int { return dims_[0] * dims_[1]; }
+
+  __host__ __device__ inline auto dim_inner() const noexcept -> int { return dims_[1]; }
+
+  __host__ __device__ inline auto dim_outer() const noexcept -> int { return dims_[0]; }
+
+  __host__ __device__ inline auto device_id() const noexcept -> int { return deviceId_; }
+
+#else
+
+  inline auto index(const int idxOuter, const int idxInner) const noexcept -> int {
+    return (idxOuter * dims_[1]) + idxInner;
+  }
+
+  inline auto data() noexcept -> ValueType* { return data_; }
+
+  inline auto data() const noexcept -> const ValueType* { return data_; }
+
+  inline auto empty() const noexcept -> bool { return this->size() == 0; }
+
+  inline auto size() const noexcept -> int { return dims_[0] * dims_[1]; }
+
+  inline auto dim_inner() const noexcept -> int { return dims_[1]; }
+
+  inline auto dim_outer() const noexcept -> int { return dims_[0]; }
+
+  inline auto device_id() const noexcept -> int { return deviceId_; }
+
+#endif
+private:
+  int dims_[2] = {0, 0};
+  ValueType* data_ = nullptr;
+  int deviceId_ = 0;
+};
+
+template <typename T>
+class GPUArrayView3D {
+public:
+  using ValueType = T;
+  static constexpr SizeType ORDER = 3;
+
+  GPUArrayView3D() = default;
+
+  GPUArrayView3D(ValueType* data, const int dimOuter, const int dimMid, const int dimInner,
+                 const int deviceId);
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+
+  __device__ inline auto operator()(const int idxOuter, const int idxMid,
+                                    const int idxInner) noexcept -> ValueType& {
+    assert(idxOuter < dims_[0]);
+    assert(idxMid < dims_[1]);
+    assert(idxInner < dims_[2]);
+    return data_[(idxOuter * dims_[1] + idxMid) * dims_[2] + idxInner];
+  }
+
+  __device__ inline auto operator()(const int idxOuter, const int idxMid,
+                                    const int idxInner) const noexcept -> const ValueType& {
+    assert(idxOuter < dims_[0]);
+    assert(idxMid < dims_[1]);
+    assert(idxInner < dims_[2]);
+    return data_[(idxOuter * dims_[1] + idxMid) * dims_[2] + idxInner];
+  }
+
+  __host__ __device__ inline auto index(const int idxOuter, const int idxMid,
+                                        const int idxInner) const noexcept -> int {
+    return (idxOuter * dims_[1] + idxMid) * dims_[2] + idxInner;
+  }
+
+  __host__ __device__ inline auto data() noexcept -> ValueType* { return data_; }
+
+  __host__ __device__ inline auto data() const noexcept -> const ValueType* { return data_; }
+
+  __host__ __device__ inline auto empty() const noexcept -> bool { return this->size() == 0; }
+
+  __host__ __device__ inline auto size() const noexcept -> int {
+    return dims_[0] * dims_[1] * dims_[2];
+  }
+
+  __host__ __device__ inline auto dim_inner() const noexcept -> int { return dims_[2]; }
+
+  __host__ __device__ inline auto dim_mid() const noexcept -> int { return dims_[1]; }
+
+  __host__ __device__ inline auto dim_outer() const noexcept -> int { return dims_[0]; }
+
+  __host__ __device__ inline auto device_id() const noexcept -> int { return deviceId_; }
+
+#else
+
+  inline auto index(const int idxOuter, const int idxMid, const int idxInner) const noexcept
+      -> int {
+    return (idxOuter * dims_[1] + idxMid) * dims_[2] + idxInner;
+  }
+
+  inline auto data() noexcept -> ValueType* { return data_; }
+
+  inline auto data() const noexcept -> const ValueType* { return data_; }
+
+  inline auto empty() const noexcept -> bool { return this->size() == 0; }
+
+  inline auto size() const noexcept -> int { return dims_[0] * dims_[1] * dims_[2]; }
+
+  inline auto dim_inner() const noexcept -> int { return dims_[2]; }
+
+  inline auto dim_mid() const noexcept -> int { return dims_[1]; }
+
+  inline auto dim_outer() const noexcept -> int { return dims_[0]; }
+
+  inline auto device_id() const noexcept -> int { return deviceId_; }
+
+#endif
+
+private:
+  int dims_[3] = {0, 0, 0};
+  ValueType* data_ = nullptr;
+  int deviceId_ = 0;
+};
+
+// ======================
+// Implementation
+// ======================
+template <typename T>
+GPUArrayView1D<T>::GPUArrayView1D(ValueType* data, const int size, const int deviceId)
+    : size_(size), data_(data), deviceId_(deviceId) {
+  assert(!(size != 0 && data == nullptr));
+}
+
+template <typename T>
+GPUArrayView2D<T>::GPUArrayView2D(ValueType* data, const int dimOuter, const int dimInner,
+                                  const int deviceId)
+    : dims_{dimOuter, dimInner}, data_(data), deviceId_(deviceId) {
+  assert(!(dimOuter != 0 && dimInner != 0 && data == nullptr));
+  assert(dimOuter >= 0);
+  assert(dimInner >= 0);
+}
+
+template <typename T>
+GPUArrayView3D<T>::GPUArrayView3D(ValueType* data, const int dimOuter, const int dimMid,
+                                  const int dimInner, const int deviceId)
+    : dims_{dimOuter, dimMid, dimInner}, data_(data), deviceId_(deviceId) {
+  assert(!(dimOuter != 0 && dimMid != 0 && dimInner != 0 && data == nullptr));
+  assert(dimOuter >= 0);
+  assert(dimMid >= 0);
+  assert(dimInner >= 0);
+}
+
+}  // namespace spfft
+
+#endif
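
All 2D and 3D views in this diff share the same row-major layout with the innermost dimension contiguous, matching the `index()` members above. For reference, the 3D linearization written out as a plain function (illustrative only, not part of the library):

    // flat offset of element (o, m, i) in a dimOuter x dimMid x dimInner view
    inline int flat_index_3d(int o, int m, int i, int dimMid, int dimInner) {
      return (o * dimMid + m) * dimInner + i;
    }
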
diff --git a/src/memory/host_array.hpp b/src/memory/host_array.hpp
new file mode 100644
index 0000000..1e500a4
--- /dev/null
+++ b/src/memory/host_array.hpp
@@ -0,0 +1,237 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPFFT_HOST_ARRAY_HPP
+#define SPFFT_HOST_ARRAY_HPP
+
+#include <cassert>
+#include <memory>
+#include <type_traits>
+#include <utility>
+#include "gpu_util/gpu_runtime_api.hpp"
+#include "memory/aligned_allocation.hpp"
+#include "spfft/config.h"
+#include "util/common_types.hpp"
+
+namespace spfft {
+
+// Fixed-size array with data aligned to page boundaries, fulfilling the
+// requirements for pinned memory with ROCm.
+// The data can be pinned in memory, if GPU support is enabled.
+// The destructor of type T must not throw.
+template <typename T>
+class HostArray {
+public:
+  static_assert(std::is_nothrow_destructible<T>::value,
+                "Destructor of ValueType for HostArray must be noexcept.");
+
+  using ValueType = T;
+  using Iterator = T*;
+  using ConstIterator = const T*;
+  static constexpr SizeType ORDER = 1;
+
+  // Construct empty array
+  HostArray() noexcept;
+
+  // Create array with given size. Additional parameters are passed to the
+  // constructor of each element of type T.
+  // Throws exception upon allocation or element construction failure
+  template <typename... ARGS>
+  HostArray(SizeType size, ARGS... args);
+
+  HostArray(const HostArray& array) = delete;
+
+  HostArray(HostArray&& array) noexcept;
+
+  ~HostArray() noexcept(std::is_nothrow_destructible<T>::value);
+
+  auto operator=(const HostArray& array) -> HostArray& = delete;
+
+  auto operator=(HostArray&& array) noexcept -> HostArray&;
+
+  inline auto operator[](const SizeType idx) -> ValueType& {
+    assert(idx < size_);
+    return data_[idx];
+  }
+
+  inline auto operator[](const SizeType idx) const -> const ValueType& {
+    assert(idx < size_);
+    return data_[idx];
+  }
+
+  inline auto operator()(const SizeType idx) -> ValueType& {
+    assert(idx < size_);
+    return data_[idx];
+  }
+
+  inline auto operator()(const SizeType idx) const -> const ValueType& {
+    assert(idx < size_);
+    return data_[idx];
+  }
+
+  inline auto size() const noexcept -> SizeType { return size_; }
+
+  inline auto pinned() const noexcept -> bool { return pinned_; }
+
+  // Attempt to pin memory. Return true on success and false otherwise
+  auto pin_memory() noexcept -> bool;
+
+  // Unpin memory if pinned. Does nothing otherwise
+  auto unpin_memory() noexcept -> void;
+
+  inline auto data() noexcept -> ValueType* { return data_; }
+
+  inline auto data() const noexcept -> const ValueType* { return data_; }
+
+  inline auto begin() noexcept -> Iterator { return data_; }
+
+  inline auto begin() const noexcept -> ConstIterator { return data_; }
+
+  inline auto cbegin() const noexcept -> ConstIterator { return data_; }
+
+  inline auto end() noexcept -> Iterator { return data_ + size_; }
+
+  inline auto end() const noexcept -> ConstIterator { return data_ + size_; }
+
+  inline auto cend() const noexcept -> ConstIterator { return data_ + size_; }
+
+  // undefined behaviour for empty array
+  inline auto front() -> ValueType& { return data_[0]; }
+
+  // undefined behaviour for empty array
+  inline auto front() const -> const ValueType& { return data_[0]; }
+
+  // undefined behaviour for empty array
+  inline auto back() -> ValueType& { return data_[size_ - 1]; }
+
+  // undefined behaviour for empty array
+  inline auto back() const -> const ValueType& { return data_[size_ - 1]; }
+
+  inline auto empty() const noexcept -> bool { return size_ == 0; }
+
+private:
+  T* data_ = nullptr;
+  SizeType size_ = 0;
+  bool pinned_ = false;
+};
+
+// ======================
+// Implementation
+// ======================
+
+template <typename T>
+HostArray<T>::HostArray() noexcept : data_(nullptr), size_(0), pinned_(false) {}
+
+template <typename T>
+template <typename... ARGS>
+HostArray<T>::HostArray(SizeType size, ARGS... args)
+    : data_(static_cast<T*>(memory::allocate_aligned(size * sizeof(T)))),
+      size_(size),
+      pinned_(false) {
+  try {
+    memory::construct_elements_in_place(data_, size, std::forward<ARGS>(args)...);
+  } catch (...) {
+    size_ = 0;
+    memory::free_aligned(data_);
+    data_ = nullptr;
+    throw;
+  }
+}
+
+template <typename T>
+HostArray<T>::HostArray(HostArray&& array) noexcept : data_(nullptr), size_(0), pinned_(false) {
+  data_ = array.data_;
+  array.data_ = nullptr;
+
+  size_ = array.size_;
+  array.size_ = 0;
+
+  pinned_ = array.pinned_;
+  array.pinned_ = false;
+}
+
+template <typename T>
+HostArray<T>::~HostArray() noexcept(std::is_nothrow_destructible<T>::value) {
+  if (data_) {
+    this->unpin_memory();
+    memory::deconstruct_elements(data_, size_);
+    memory::free_aligned(data_);
+    data_ = nullptr;
+    size_ = 0;
+  }
+  assert(data_ == nullptr);
+  assert(size_ == 0);
+  assert(!pinned_);
+}
+
+template <typename T>
+auto HostArray<T>::operator=(HostArray&& array) noexcept -> HostArray& {
+  if (data_) {
+    this->unpin_memory();
+    memory::deconstruct_elements(data_, size_);
+    memory::free_aligned(data_);
+  }
+
+  data_ = array.data_;
+  array.data_ = nullptr;
+
+  size_ = array.size_;
+  array.size_ = 0;
+
+  pinned_ = array.pinned_;
+  array.pinned_ = false;
+
+  return *this;
+}
+
+template <typename T>
+auto HostArray<T>::pin_memory() noexcept -> bool {
+#if defined(SPFFT_CUDA) || defined(SPFFT_ROCM)
+  if (!pinned_ && data_) {
+    if (gpu::host_register(static_cast<void*>(data_), size_ * sizeof(ValueType),
+                           gpu::flag::HostRegisterDefault) == gpu::status::Success) {
+      pinned_ = true;
+    }
+  }
+#endif
+  return pinned_;
+}
+
+template <typename T>
+auto HostArray<T>::unpin_memory() noexcept -> void {
+#if defined(SPFFT_CUDA) || defined(SPFFT_ROCM)
+  if (pinned_) {
+    gpu::host_unregister(static_cast<void*>(data_));
+    pinned_ = false;
+  }
+#endif
+}
+
+}  // namespace spfft
+
+#endif
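
`pin_memory()` is opt-in and failure-tolerant: it reports whether page-locking succeeded instead of throwing, and the destructor unpins automatically. A usage sketch (the `staging_example` function and its size parameter are illustrative):

    #include <complex>
    #include <cstddef>
    #include "memory/host_array.hpp"

    void staging_example(std::size_t n) {
      spfft::HostArray<std::complex<double>> staging(n);
      if (staging.pin_memory()) {
        // page-locked: device transfers from staging.data() may run asynchronously
      }
      // no explicit cleanup needed; ~HostArray() unpins and frees
    }
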
diff --git a/src/memory/host_array_const_view.hpp b/src/memory/host_array_const_view.hpp
new file mode 100644
index 0000000..bd0e9fc
--- /dev/null
+++ b/src/memory/host_array_const_view.hpp
@@ -0,0 +1,253 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPFFT_HOST_ARRAY_CONST_VIEW_HPP
+#define SPFFT_HOST_ARRAY_CONST_VIEW_HPP
+
+#include <array>
+#include <cassert>
+#include "memory/host_array_view.hpp"
+#include "spfft/config.h"
+#include "util/common_types.hpp"
+
+namespace spfft {
+
+template <typename T>
+class HostArrayConstView1D {
+public:
+  using ValueType = T;
+  using Iterator = T*;
+  using ConstIterator = const T*;
+
+  static constexpr SizeType ORDER = 1;
+
+  HostArrayConstView1D() = default;
+
+  HostArrayConstView1D(const HostArrayConstView1D&) = default;
+
+  HostArrayConstView1D(HostArrayConstView1D&&) = default;
+
+  HostArrayConstView1D(const ValueType* data, const SizeType size, const bool pinned);
+
+  // conversion from non-const view
+  HostArrayConstView1D(const HostArrayView1D<T>& view)
+      : size_(view.size()), pinned_(view.pinned()), data_(view.data()) {}
+
+  inline auto operator()(const SizeType idx) const -> const ValueType& {
+    assert(idx < size_);
+    return data_[idx];
+  }
+
+  inline auto pinned() const noexcept -> bool { return pinned_; }
+
+  inline auto data() const noexcept -> const ValueType* { return data_; }
+
+  inline auto empty() const noexcept -> bool { return size_ == 0; }
+
+  inline auto size() const noexcept -> SizeType { return size_; }
+
+  inline auto begin() const noexcept -> ConstIterator { return data_; }
+
+  inline auto cbegin() const noexcept -> ConstIterator { return data_; }
+
+  inline auto end() const noexcept -> ConstIterator { return data_ + size_; }
+
+  inline auto cend() const noexcept -> ConstIterator { return data_ + size_; }
+
+private:
+  SizeType size_ = 0;
+  bool pinned_ = false;
+  const ValueType* data_ = nullptr;
+};
+
+template <typename T>
+class HostArrayConstView2D {
+public:
+  using ValueType = T;
+  using Iterator = T*;
+  using ConstIterator = const T*;
+
+  static constexpr SizeType ORDER = 2;
+
+  HostArrayConstView2D() = default;
+
+  HostArrayConstView2D(const HostArrayConstView2D&) = default;
+
+  HostArrayConstView2D(HostArrayConstView2D&&) = default;
+
+  HostArrayConstView2D(const ValueType* data, const SizeType dimOuter, const SizeType dimInner,
+                       const bool pinned);
+
+  HostArrayConstView2D(const ValueType* data, const std::array<SizeType, 2>& dims,
+                       const bool pinned);
+
+  // conversion from non-const view
+  HostArrayConstView2D(const HostArrayView2D<T>& view)
+      : dims_({view.dim_outer(), view.dim_inner()}), pinned_(view.pinned()), data_(view.data()) {}
+
+  inline auto operator()(const SizeType idxOuter, const SizeType idxInner) const
+      -> const ValueType& {
+    assert(idxOuter < dims_[0]);
+    assert(idxInner < dims_[1]);
+    return data_[(idxOuter * dims_[1]) + idxInner];
+  }
+
+  inline auto index(const SizeType idxOuter, const SizeType idxInner) const noexcept -> SizeType {
+    return (idxOuter * dims_[1]) + idxInner;
+  }
+
+  inline auto pinned() const noexcept -> bool { return pinned_; }
+
+  inline auto data() const noexcept -> const ValueType* { return data_; }
+
+  inline auto empty() const noexcept -> bool { return this->size() == 0; }
+
+  inline auto size() const noexcept -> SizeType { return dims_[0] * dims_[1]; }
+
+  inline auto dim_inner() const noexcept -> SizeType { return dims_[1]; }
+
+  inline auto dim_outer() const noexcept -> SizeType { return dims_[0]; }
+
+  inline auto begin() const noexcept -> ConstIterator { return data_; }
+
+  inline auto cbegin() const noexcept -> ConstIterator { return data_; }
+
+  inline auto end() const noexcept -> ConstIterator { return data_ + size(); }
+
+  inline auto cend() const noexcept -> ConstIterator { return data_ + size(); }
+
+private:
+  std::array<SizeType, 2> dims_ = {0, 0};
+  bool pinned_ = false;
+  const ValueType* data_ = nullptr;
+};
+
+template <typename T>
+class HostArrayConstView3D {
+public:
+  using ValueType = T;
+  using Iterator = T*;
+  using ConstIterator = const T*;
+
+  static constexpr SizeType ORDER = 3;
+
+  HostArrayConstView3D() = default;
+
+  HostArrayConstView3D(const HostArrayConstView3D&) = default;
+
+  HostArrayConstView3D(HostArrayConstView3D&&) = default;
+
+  HostArrayConstView3D(const ValueType* data, const SizeType dimOuter, const SizeType dimMid,
+                       const SizeType dimInner, const bool pinned);
+
+  HostArrayConstView3D(const ValueType* data, const std::array<SizeType, 3>& dims,
+                       const bool pinned);
+
+  // conversion from non-const view
+  HostArrayConstView3D(const HostArrayView3D<T>& view)
+      : dims_({view.dim_outer(), view.dim_mid(), view.dim_inner()}),
+        pinned_(view.pinned()),
+        data_(view.data()) {}
+
+  inline auto operator()(const SizeType idxOuter, const SizeType idxMid,
+                         const SizeType idxInner) const noexcept -> const ValueType& {
+    assert(idxOuter < dims_[0]);
+    assert(idxMid < dims_[1]);
+    assert(idxInner < dims_[2]);
+    return data_[(idxOuter * dims_[1] + idxMid) * dims_[2] + idxInner];
+  }
+
+  inline auto index(const SizeType idxOuter, const SizeType idxMid, const SizeType idxInner) const
+      noexcept -> SizeType {
+    return (idxOuter * dims_[1] + idxMid) * dims_[2] + idxInner;
+  }
+
+  inline auto pinned() const noexcept -> bool { return pinned_; }
+
+  inline auto data() const noexcept -> const ValueType* { return data_; }
+
+  inline auto empty() const noexcept -> bool { return this->size() == 0; }
+
+  inline auto size() const noexcept -> SizeType { return dims_[0] * dims_[1] * dims_[2]; }
+
+  inline auto dim_inner() const noexcept -> SizeType { return dims_[2]; }
+
+  inline auto dim_mid() const noexcept -> SizeType { return dims_[1]; }
+
+  inline auto dim_outer() const noexcept -> SizeType { return dims_[0]; }
+
+  inline auto begin() const noexcept -> ConstIterator { return data_; }
+
+  inline auto cbegin() const noexcept -> ConstIterator { return data_; }
+
+  inline auto end() const noexcept -> ConstIterator { return data_ + size(); }
+
+  inline auto cend() const noexcept -> ConstIterator { return data_ + size(); }
+
+private:
+  std::array<SizeType, 3> dims_ = {0, 0, 0};
+  bool pinned_ = false;
+  const ValueType* data_ = nullptr;
+};
+
+// ======================
+// Implementation
+// ======================
+
+template <typename T>
+HostArrayConstView1D<T>::HostArrayConstView1D(const ValueType* data, const SizeType size,
+                                              const bool pinned)
+    : size_(size), pinned_(pinned), data_(data) {
+  assert(!(size != 0 && data == nullptr));
+}
+
+template <typename T>
+HostArrayConstView2D<T>::HostArrayConstView2D(const ValueType* data, const SizeType dimOuter,
+                                              const SizeType dimInner, const bool pinned)
+    : dims_({dimOuter, dimInner}), pinned_(pinned), data_(data) {}
+
+template <typename T>
+HostArrayConstView2D<T>::HostArrayConstView2D(const ValueType* data,
+                                              const std::array<SizeType, 2>& dims,
+                                              const bool pinned)
+    : dims_(dims), pinned_(pinned), data_(data) {}
+
+template <typename T>
+HostArrayConstView3D<T>::HostArrayConstView3D(const ValueType* data, const SizeType dimOuter,
+                                              const SizeType dimMid, const SizeType dimInner,
+                                              const bool pinned)
+    : dims_({dimOuter, dimMid, dimInner}), pinned_(pinned), data_(data) {}
+
+template <typename T>
+HostArrayConstView3D<T>::HostArrayConstView3D(const ValueType* data,
+                                              const std::array<SizeType, 3>& dims,
+                                              const bool pinned)
+    : dims_(dims), pinned_(pinned), data_(data) {}
+}  // namespace spfft
+#endif
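
Because the const views convert implicitly from their mutable counterparts, read-only functions can take a `HostArrayConstView*` parameter and accept either. A small sketch (the `sum` and `caller` functions are illustrative):

    #include "memory/host_array_const_view.hpp"
    #include "memory/host_array_view.hpp"

    double sum(spfft::HostArrayConstView1D<double> v) {
      double s = 0.0;
      for (const auto& x : v) s += x;  // const views are iterable
      return s;
    }

    void caller(spfft::HostArrayView1D<double> mut) {
      const double s = sum(mut);  // implicit view -> const-view conversion
      (void)s;
    }
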
diff --git a/src/memory/host_array_view.hpp b/src/memory/host_array_view.hpp
new file mode 100644
index 0000000..8ea7a29
--- /dev/null
+++ b/src/memory/host_array_view.hpp
@@ -0,0 +1,257 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPFFT_HOST_ARRAY_VIEW_HPP
+#define SPFFT_HOST_ARRAY_VIEW_HPP
+
+#include <array>
+#include <cassert>
+#include "spfft/config.h"
+#include "util/common_types.hpp"
+
+namespace spfft {
+
+template <typename T>
+class HostArrayView1D {
+public:
+  using ValueType = T;
+  using Iterator = T*;
+  using ConstIterator = const T*;
+
+  static constexpr SizeType ORDER = 1;
+
+  HostArrayView1D() = default;
+
+  HostArrayView1D(ValueType* data, const SizeType size, const bool pinned);
+
+  inline auto operator()(const SizeType idx) -> ValueType& {
+    assert(idx < size_);
+    return data_[idx];
+  }
+
+  inline auto operator()(const SizeType idx) const -> const ValueType& {
+    assert(idx < size_);
+    return data_[idx];
+  }
+
+  inline auto pinned() const noexcept -> bool { return pinned_; }
+
+  inline auto data() noexcept -> ValueType* { return data_; }
+
+  inline auto data() const noexcept -> const ValueType* { return data_; }
+
+  inline auto empty() const noexcept -> bool { return size_ == 0; }
+
+  inline auto size() const noexcept -> SizeType { return size_; }
+
+  inline auto begin() noexcept -> Iterator { return data_; }
+
+  inline auto begin() const noexcept -> ConstIterator { return data_; }
+
+  inline auto cbegin() const noexcept -> ConstIterator { return data_; }
+
+  inline auto end() noexcept -> Iterator { return data_ + size_; }
+
+  inline auto end() const noexcept -> ConstIterator { return data_ + size_; }
+
+  inline auto cend() const noexcept -> ConstIterator { return data_ + size_; }
+
+private:
+  SizeType size_ = 0;
+  bool pinned_ = false;
+  ValueType* data_ = nullptr;
+};
+
+template <typename T>
+class HostArrayView2D {
+public:
+  using ValueType = T;
+  using Iterator = T*;
+  using ConstIterator = const T*;
+
+  static constexpr SizeType ORDER = 2;
+
+  HostArrayView2D() = default;
+
+  HostArrayView2D(ValueType* data, const SizeType dimOuter, const SizeType dimInner,
+                  const bool pinned);
+
+  HostArrayView2D(ValueType* data, const std::array<SizeType, 2>& dims, const bool pinned);
+
+  inline auto operator()(const SizeType idxOuter, const SizeType idxInner) -> ValueType& {
+    assert(idxOuter < dims_[0]);
+    assert(idxInner < dims_[1]);
+    return data_[(idxOuter * dims_[1]) + idxInner];
+  }
+
+  inline auto operator()(const SizeType idxOuter, const SizeType idxInner) const
+      -> const ValueType& {
+    assert(idxOuter < dims_[0]);
+    assert(idxInner < dims_[1]);
+    return data_[(idxOuter * dims_[1]) + idxInner];
+  }
+
+  inline auto index(const SizeType idxOuter, const SizeType idxInner) const noexcept -> SizeType {
+    return (idxOuter * dims_[1]) + idxInner;
+  }
+
+  inline auto pinned() const noexcept -> bool { return pinned_; }
+
+  inline auto data() noexcept -> ValueType* { return data_; }
+
+  inline auto data() const noexcept -> const ValueType* { return data_; }
+
+  inline auto empty() const noexcept -> bool { return this->size() == 0; }
+
+  inline auto size() const noexcept -> SizeType { return dims_[0] * dims_[1]; }
+
+  inline auto dim_inner() const noexcept -> SizeType { return dims_[1]; }
+
+  inline auto dim_outer() const noexcept -> SizeType { return dims_[0]; }
+
+  inline auto begin() noexcept -> Iterator { return data_; }
+
+  inline auto begin() const noexcept -> ConstIterator { return data_; }
+
+  inline auto cbegin() const noexcept -> ConstIterator { return data_; }
+
+  inline auto end() noexcept -> Iterator { return data_ + size(); }
+
+  inline auto end() const noexcept -> ConstIterator { return data_ + size(); }
+
+  inline auto cend() const noexcept -> ConstIterator { return data_ + size(); }
+
+private:
+  std::array<SizeType, 2> dims_ = {0, 0};
+  bool pinned_ = false;
+  ValueType* data_ = nullptr;
+};
+
+template <typename T>
+class HostArrayView3D {
+public:
+  using ValueType = T;
+  using Iterator = T*;
+  using ConstIterator = const T*;
+
+  static constexpr SizeType ORDER = 3;
+
+  HostArrayView3D() = default;
+
+  HostArrayView3D(ValueType* data, const SizeType dimOuter, const SizeType dimMid,
+                  const SizeType dimInner, const bool pinned);
+
+  HostArrayView3D(ValueType* data, const std::array<SizeType, 3>& dims, const bool pinned);
+
+  inline auto operator()(const SizeType idxOuter, const SizeType idxMid,
+                         const SizeType idxInner) noexcept -> ValueType& {
+    assert(idxOuter < dims_[0]);
+    assert(idxMid < dims_[1]);
+    assert(idxInner < dims_[2]);
+    return data_[(idxOuter * dims_[1] + idxMid) * dims_[2] + idxInner];
+  }
+
+  inline auto operator()(const SizeType idxOuter, const SizeType idxMid,
+                         const SizeType idxInner) const noexcept -> const ValueType& {
+    assert(idxOuter < dims_[0]);
+    assert(idxMid < dims_[1]);
+    assert(idxInner < dims_[2]);
+    return data_[(idxOuter * dims_[1] + idxMid) * dims_[2] + idxInner];
+  }
+
+  inline auto index(const SizeType idxOuter, const SizeType idxMid, const SizeType idxInner) const
+      noexcept -> SizeType {
+    return (idxOuter * dims_[1] + idxMid) * dims_[2] + idxInner;
+  }
+
+  inline auto pinned() const noexcept -> bool { return pinned_; }
+
+  inline auto data() noexcept -> ValueType* { return data_; }
+
+  inline auto data() const noexcept -> const ValueType* { return data_; }
+
+  inline auto empty() const noexcept -> bool { return this->size() == 0; }
+
+  inline auto size() const noexcept -> SizeType { return dims_[0] * dims_[1] * dims_[2]; }
+
+  inline auto dim_inner() const noexcept -> SizeType { return dims_[2]; }
+
+  inline auto dim_mid() const noexcept -> SizeType { return dims_[1]; }
+
+  inline auto dim_outer() const noexcept -> SizeType { return dims_[0]; }
+
+  inline auto begin() noexcept -> Iterator { return data_; }
+
+  inline auto begin() const noexcept -> ConstIterator { return data_; }
+
+  inline auto cbegin() const noexcept -> ConstIterator { return data_; }
+
+  inline auto end() noexcept -> Iterator { return data_ + size(); }
+
+  inline auto end() const noexcept -> ConstIterator { return data_ + size(); }
+
+  inline auto cend() const noexcept -> ConstIterator { return data_ + size(); }
+
+private:
+  std::array<SizeType, 3> dims_ = {0, 0, 0};
+  bool pinned_ = false;
+  ValueType* data_ = nullptr;
+};
+
+// ======================
+// Implementation
+// ======================
+
+template <typename T>
+HostArrayView1D<T>::HostArrayView1D(ValueType* data, const SizeType size, const bool pinned)
+    : size_(size), pinned_(pinned), data_(data) {
+  assert(!(size != 0 && data == nullptr));
+}
+
+template <typename T>
+HostArrayView2D<T>::HostArrayView2D(ValueType* data, const SizeType dimOuter,
+                                    const SizeType dimInner, const bool pinned)
+    : dims_({dimOuter, dimInner}), pinned_(pinned), data_(data) {}
+
+template <typename T>
+HostArrayView2D<T>::HostArrayView2D(ValueType* data, const std::array<SizeType, 2>& dims,
+                                    const bool pinned)
+    : dims_(dims), pinned_(pinned), data_(data) {}
+
+template <typename T>
+HostArrayView3D<T>::HostArrayView3D(ValueType* data, const SizeType dimOuter,
+                                    const SizeType dimMid, const SizeType dimInner,
+                                    const bool pinned)
+    : dims_({dimOuter, dimMid, dimInner}), pinned_(pinned), data_(data) {}
+
+template <typename T>
+HostArrayView3D<T>::HostArrayView3D(ValueType* data, const std::array<SizeType, 3>& dims,
+                                    const bool pinned)
+    : dims_(dims), pinned_(pinned), data_(data) {}
+}  // namespace spfft
+#endif
diff --git a/src/memory/memory_type_trait.hpp b/src/memory/memory_type_trait.hpp
new file mode 100644
index 0000000..5edb792
--- /dev/null
+++ b/src/memory/memory_type_trait.hpp
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef SPFFT_MEMORY_TYPE_TRAIT_HPP
+#define SPFFT_MEMORY_TYPE_TRAIT_HPP
+#include "spfft/config.h"
+
+#if defined(SPFFT_CUDA) || defined(SPFFT_ROCM)
+#include "memory/gpu_array.hpp"
+#include "memory/gpu_array_view.hpp"
+#include "memory/gpu_array_const_view.hpp"
+#endif
+
+namespace spfft {
+
+template <typename T>
+struct IsDeviceMemory {
+  constexpr static bool value = false;
+};
+
+#if defined(SPFFT_CUDA) || defined(SPFFT_ROCM)
+template <typename T>
+struct IsDeviceMemory<GPUArray<T>> {
+  constexpr static bool value = true;
+};
+
+template <typename T>
+struct IsDeviceMemory<GPUArrayView1D<T>> {
+  constexpr static bool value = true;
+};
+
+template <typename T>
+struct IsDeviceMemory<GPUArrayView2D<T>> {
+  constexpr static bool value = true;
+};
+
+template <typename T>
+struct IsDeviceMemory<GPUArrayView3D<T>> {
+  constexpr static bool value = true;
+};
+
+template <typename T>
+struct IsDeviceMemory<GPUArrayConstView1D<T>> {
+  constexpr static bool value = true;
+};
+
+template <typename T>
+struct IsDeviceMemory<GPUArrayConstView2D<T>> {
+  constexpr static bool value = true;
+};
+
+template <typename T>
+struct IsDeviceMemory<GPUArrayConstView3D<T>> {
+  constexpr static bool value = true;
+};
+
+#endif
+}  // namespace spfft
+#endif
diff --git a/src/mpi_util/mpi_check_status.hpp b/src/mpi_util/mpi_check_status.hpp
new file mode 100644
index 0000000..c790d12
--- /dev/null
+++ b/src/mpi_util/mpi_check_status.hpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef SPFFT_MPI_CHECK_STATUS_HPP
+#define SPFFT_MPI_CHECK_STATUS_HPP
+
+#include <mpi.h>
+#include "spfft/config.h"
+#include "spfft/exceptions.hpp"
+
+namespace spfft {
+inline auto mpi_check_status(int status) -> void {
+  if (status != MPI_SUCCESS) {
+    throw MPIError();
+  }
+}
+}  // namespace spfft
+
+#endif
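
This trait is what drives the `enable_if`-based overload selection in array_view_utility.hpp earlier in this diff. Its behaviour can be made visible with compile-time checks (a sketch; the device-side assertion assumes a CUDA or ROCm build, since the GPU types only exist there):

    #include "memory/host_array_view.hpp"
    #include "memory/memory_type_trait.hpp"

    static_assert(!spfft::IsDeviceMemory<spfft::HostArrayView1D<float>>::value,
                  "host views are not device memory");
    #if defined(SPFFT_CUDA) || defined(SPFFT_ROCM)
    static_assert(spfft::IsDeviceMemory<spfft::GPUArray<float>>::value,
                  "GPUArray is device memory");
    #endif
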
diff --git a/src/mpi_util/mpi_communicator_handle.hpp b/src/mpi_util/mpi_communicator_handle.hpp
new file mode 100644
index 0000000..917184c
--- /dev/null
+++ b/src/mpi_util/mpi_communicator_handle.hpp
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef SPFFT_MPI_COMMUNICATOR_HANDLE_HPP
+#define SPFFT_MPI_COMMUNICATOR_HANDLE_HPP
+
+#include <mpi.h>
+#include <cassert>
+#include <memory>
+#include "mpi_util/mpi_check_status.hpp"
+#include "spfft/config.h"
+#include "spfft/exceptions.hpp"
+#include "util/common_types.hpp"
+
+namespace spfft {
+
+// MPI Communicator, which creates a duplicate at construction time.
+// Copies of the object share the same communicator, which is reference counted.
+class MPICommunicatorHandle {
+public:
+  MPICommunicatorHandle() : comm_(new MPI_Comm(MPI_COMM_SELF)), size_(1), rank_(0) {}
+
+  MPICommunicatorHandle(const MPI_Comm& comm) {
+    // create copy of communicator
+    MPI_Comm newComm;
+    mpi_check_status(MPI_Comm_dup(comm, &newComm));
+
+    comm_ = std::shared_ptr<MPI_Comm>(new MPI_Comm(newComm), [](MPI_Comm* ptr) {
+      MPI_Comm_free(ptr);
+      delete ptr;
+    });
+
+    int sizeInt, rankInt;
+    mpi_check_status(MPI_Comm_size(*comm_, &sizeInt));
+    mpi_check_status(MPI_Comm_rank(*comm_, &rankInt));
+
+    if (sizeInt < 1 || rankInt < 0) {
+      throw MPIError();
+    }
+    rank_ = static_cast<SizeType>(rankInt);
+    size_ = static_cast<SizeType>(sizeInt);
+  }
+
+  inline auto get() const -> const MPI_Comm& { return *comm_; }
+
+  inline auto size() const noexcept -> SizeType { return size_; }
+
+  inline auto rank() const noexcept -> SizeType { return rank_; }
+
+private:
+  std::shared_ptr<MPI_Comm> comm_ = nullptr;
+  SizeType size_ = 1;
+  SizeType rank_ = 0;
+};
+
+}  // namespace spfft
+
+#endif
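
The handle gives MPI communicators shared-pointer lifetime semantics: construction duplicates the communicator, copies share the duplicate, and the custom deleter frees it exactly once. A sketch (the `comm_example` function is illustrative and assumes MPI_Init has already been called):

    #include <mpi.h>
    #include "mpi_util/mpi_communicator_handle.hpp"

    void comm_example() {
      spfft::MPICommunicatorHandle comm(MPI_COMM_WORLD);  // duplicates MPI_COMM_WORLD
      spfft::MPICommunicatorHandle alias = comm;          // shares the duplicate
      // MPI_Comm_free runs once, when the last handle referencing the
      // duplicate is destroyed; MPI_COMM_WORLD itself is never freed
    }
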
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef SPFFT_MPI_DATATYPE_HANDLE_HPP
+#define SPFFT_MPI_DATATYPE_HANDLE_HPP
+
+#include <mpi.h>
+#include <cassert>
+#include <memory>
+#include "mpi_util/mpi_check_status.hpp"
+#include "spfft/config.h"
+
+namespace spfft {
+
+// Storage for MPI datatypes
+class MPIDatatypeHandle {
+public:
+  MPIDatatypeHandle() = default;
+
+  // Create custom datatype with ownership
+  // Does not call MPI_Type_commit!
+  // Can take predefined MPI types such as MPI_DOUBLE, on which MPI_Type_free() will not be called
+  // NOTE: Freeing an MPI_Datatype on which this type depends does not affect this type (see "The
+  // MPI core")
+  MPIDatatypeHandle(const MPI_Datatype& mpiType) {
+    assert(mpiType != MPI_DATATYPE_NULL);
+    int numIntegers, numAddresses, numDatatypes, combiner;
+    mpi_check_status(
+        MPI_Type_get_envelope(mpiType, &numIntegers, &numAddresses, &numDatatypes, &combiner));
+    if (combiner != MPI_COMBINER_NAMED && combiner != MPI_COMBINER_DUP) {
+      // take ownership and call MPI_Type_free upon release
+      type_ = std::shared_ptr<MPI_Datatype>(new MPI_Datatype(mpiType), [](MPI_Datatype* ptr) {
+        assert(*ptr != MPI_DATATYPE_NULL);
+        MPI_Type_free(ptr);
+        delete ptr;
+      });
+    } else {
+      // only copy type descriptor, will not call MPI_Type_free()
+      type_ = std::make_shared<MPI_Datatype>(mpiType);
+    }
+  }
+
+  inline auto get() const -> const MPI_Datatype& {
+    assert(type_);
+    assert(*type_ != MPI_DATATYPE_NULL);
+    return *type_;
+  }
+
+  inline auto empty() const noexcept -> bool { return type_ == nullptr; }
+
+  inline static MPIDatatypeHandle create_contiguous(int count, MPI_Datatype oldType) {
+    MPI_Datatype newType;
+    mpi_check_status(MPI_Type_contiguous(count, oldType, &newType));
+    mpi_check_status(MPI_Type_commit(&newType));
+    return MPIDatatypeHandle(newType);
+  }
+
+  inline static MPIDatatypeHandle create_vector(int count, int blocklength, int stride,
+                                                MPI_Datatype oldType) {
+    MPI_Datatype newType;
+    mpi_check_status(MPI_Type_vector(count, blocklength, stride, oldType, &newType));
+    mpi_check_status(MPI_Type_commit(&newType));
+    return MPIDatatypeHandle(newType);
+  }
+
+  inline static MPIDatatypeHandle create_hindexed(int count, const int arrayOfBlocklengths[],
+                                                  const MPI_Aint arrayOfDispls[],
+                                                  MPI_Datatype oldType) {
+    MPI_Datatype newType;
+    mpi_check_status(
+        MPI_Type_create_hindexed(count, arrayOfBlocklengths, arrayOfDispls, oldType, &newType));
+    mpi_check_status(MPI_Type_commit(&newType));
+    return MPIDatatypeHandle(newType);
+  }
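+
+  // Usage sketch (mirrors the exchange in parameters.cpp): a POD struct of N
+  // SizeType fields can be gathered as a single MPI element via a committed
+  // contiguous type. N, local, recv and comm are assumed to be provided by the
+  // caller:
+  //   auto type = MPIDatatypeHandle::create_contiguous(
+  //       N, MPIMatchElementaryType<SizeType>::get());
+  //   MPI_Allgather(&local, 1, type.get(), recv.data(), 1, type.get(), comm);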
+
+  inline static MPIDatatypeHandle create_subarray(int ndims, const int arrayOfSizes[],
+                                                  const int arrayOfSubsizes[],
+                                                  const int arrayOfStarts[], int order,
+                                                  MPI_Datatype oldType) {
+    MPI_Datatype newType;
+    mpi_check_status(MPI_Type_create_subarray(ndims, arrayOfSizes, arrayOfSubsizes, arrayOfStarts,
+                                              order, oldType, &newType));
+    mpi_check_status(MPI_Type_commit(&newType));
+    return MPIDatatypeHandle(newType);
+  }
+
+private:
+  std::shared_ptr<MPI_Datatype> type_ = nullptr;
+};
+
+} // namespace spfft
+
+#endif
diff --git a/src/mpi_util/mpi_init_handle.hpp b/src/mpi_util/mpi_init_handle.hpp
new file mode 100644
index 0000000..835c8b3
--- /dev/null
+++ b/src/mpi_util/mpi_init_handle.hpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef SPFFT_MPI_INIT_HANDLE_HPP
+#define SPFFT_MPI_INIT_HANDLE_HPP
+
+#include <mpi.h>
+#include "mpi_util/mpi_check_status.hpp"
+#include "spfft/config.h"
+
+namespace spfft {
+
+// RAII handle for MPI initialization: initializes MPI at construction if it is
+// not yet initialized and, if requested, finalizes it on destruction.
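+// Illustrative usage sketch (not part of the library; the surrounding code is
+// hypothetical):
+//
+//   int main(int argc, char** argv) {
+//     spfft::MPIInitHandle initHandle(argc, argv, true);  // finalize on scope exit
+//     // ... set up MPICommunicatorHandle, grids and transforms here ...
+//   }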
+class MPIInitHandle { +public: + MPIInitHandle(int& argc, char**& argv, bool callFinalize) : callFinalize_(callFinalize) { + int initialized; + MPI_Initialized(&initialized); + if (!initialized) { + // MPI_Init(&argc, &argv); + int provided; + mpi_check_status(MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided)); + } + } + + // unmovable + MPIInitHandle(const MPIInitHandle& other) = delete; + MPIInitHandle(MPIInitHandle&& other) = delete; + auto operator=(const MPIInitHandle& other) -> MPIInitHandle& = delete; + auto operator=(MPIInitHandle&& other) -> MPIInitHandle& = delete; + + ~MPIInitHandle() { + if (callFinalize_) { + MPI_Finalize(); + } + } + +private: + bool callFinalize_ = false; +}; + +} // namespace spfft + +#endif diff --git a/src/mpi_util/mpi_match_elementary_type.hpp b/src/mpi_util/mpi_match_elementary_type.hpp new file mode 100644 index 0000000..12b2e15 --- /dev/null +++ b/src/mpi_util/mpi_match_elementary_type.hpp @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2019 ETH Zurich, Simon Frasch + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */
+#ifndef SPFFT_MPI_MATCH_ELEMENTARY_TYPE_HPP
+#define SPFFT_MPI_MATCH_ELEMENTARY_TYPE_HPP
+
+#include <mpi.h>
+#include "spfft/config.h"
+
+namespace spfft {
+
+template <typename T>
+struct MPIMatchElementaryType;
+
+template <>
+struct MPIMatchElementaryType<char> {
+  inline static auto get() -> MPI_Datatype { return MPI_CHAR; }
+};
+
+template <>
+struct MPIMatchElementaryType<short> {
+  inline static auto get() -> MPI_Datatype { return MPI_SHORT; }
+};
+
+template <>
+struct MPIMatchElementaryType<int> {
+  inline static auto get() -> MPI_Datatype { return MPI_INT; }
+};
+
+template <>
+struct MPIMatchElementaryType<long> {
+  inline static auto get() -> MPI_Datatype { return MPI_LONG; }
+};
+
+template <>
+struct MPIMatchElementaryType<long long> {
+  inline static auto get() -> MPI_Datatype { return MPI_LONG_LONG; }
+};
+
+template <>
+struct MPIMatchElementaryType<signed char> {
+  inline static auto get() -> MPI_Datatype { return MPI_SIGNED_CHAR; }
+};
+
+template <>
+struct MPIMatchElementaryType<unsigned char> {
+  inline static auto get() -> MPI_Datatype { return MPI_UNSIGNED_CHAR; }
+};
+
+template <>
+struct MPIMatchElementaryType<unsigned short> {
+  inline static auto get() -> MPI_Datatype { return MPI_UNSIGNED_SHORT; }
+};
+
+template <>
+struct MPIMatchElementaryType<unsigned> {
+  inline static auto get() -> MPI_Datatype { return MPI_UNSIGNED; }
+};
+
+template <>
+struct MPIMatchElementaryType<unsigned long> {
+  inline static auto get() -> MPI_Datatype { return MPI_UNSIGNED_LONG; }
+};
+
+template <>
+struct MPIMatchElementaryType<unsigned long long> {
+  inline static auto get() -> MPI_Datatype { return MPI_UNSIGNED_LONG_LONG; }
+};
+
+template <>
+struct MPIMatchElementaryType<float> {
+  inline static auto get() -> MPI_Datatype { return MPI_FLOAT; }
+};
+
+template <>
+struct MPIMatchElementaryType<double> {
+  inline static auto get() -> MPI_Datatype { return MPI_DOUBLE; }
+};
+
+template <>
+struct MPIMatchElementaryType<long double> {
+  inline static auto get() -> MPI_Datatype { return MPI_LONG_DOUBLE; }
+};
+
+} // namespace spfft
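+
+// Usage note (illustrative): MPIMatchElementaryType<int>::get() yields MPI_INT,
+// so a std::vector<int> named v could be sent as
+//   MPI_Send(v.data(), static_cast<int>(v.size()),
+//            MPIMatchElementaryType<int>::get(), dest, tag, comm);
+// where dest, tag and comm are assumed to be defined by the caller.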
+
+#endif
diff --git a/src/mpi_util/mpi_request_handle.hpp b/src/mpi_util/mpi_request_handle.hpp
new file mode 100644
index 0000000..eff3ff7
--- /dev/null
+++ b/src/mpi_util/mpi_request_handle.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef SPFFT_MPI_REQUEST_HANDLE_HPP
+#define SPFFT_MPI_REQUEST_HANDLE_HPP
+
+#include <mpi.h>
+#include "mpi_util/mpi_check_status.hpp"
+#include "spfft/config.h"
+
+namespace spfft {
+
+// Storage for an MPI request, with support for waiting on an active request
+class MPIRequestHandle {
+public:
+  MPIRequestHandle() = default;
+
+  MPIRequestHandle(const MPIRequestHandle&) = delete;
+
+  MPIRequestHandle(MPIRequestHandle&&) = default;
+
+  auto operator=(const MPIRequestHandle& other) -> MPIRequestHandle& = delete;
+
+  auto operator=(MPIRequestHandle&& other) -> MPIRequestHandle& = default;
+
+  inline auto get_and_activate() -> MPI_Request* {
+    activated_ = true;
+    return &mpiRequest_;
+  }
+
+  inline auto wait_if_active() -> void {
+    if (activated_) {
+      activated_ = false;
+      MPI_Wait(&mpiRequest_, MPI_STATUS_IGNORE);
+    }
+  }
+
+private:
+  MPI_Request mpiRequest_ = MPI_REQUEST_NULL;
+  bool activated_ = false;
+};
+
+} // namespace spfft
+
+#endif
diff --git a/src/parameters/parameters.cpp b/src/parameters/parameters.cpp
new file mode 100644
index 0000000..330a231
--- /dev/null
+++ b/src/parameters/parameters.cpp
@@ -0,0 +1,180 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "parameters/parameters.hpp"
+#include <algorithm>
+#include <numeric>
+#include <tuple>
+
+#ifdef SPFFT_MPI
+#include "mpi_util/mpi_check_status.hpp"
+#include "mpi_util/mpi_datatype_handle.hpp"
+#include "mpi_util/mpi_match_elementary_type.hpp"
+#endif
+
+namespace spfft {
+
+#ifdef SPFFT_MPI
+Parameters::Parameters(const MPICommunicatorHandle& comm, const SpfftTransformType transformType,
+                       const SizeType dimX, const SizeType dimY, const SizeType dimZ,
+                       const SizeType numLocalXYPlanes, const SizeType numLocalElements,
+                       SpfftIndexFormatType indexFormat, const int* indices)
+    : transformType_(transformType),
+      dimX_(dimX),
+      dimXFreq_(transformType == SPFFT_TRANS_R2C ? dimX / 2 + 1 : dimX),
+      dimY_(dimY),
+      dimZ_(dimZ),
+      totalNumXYPlanes_(dimZ),
+      comm_rank_(comm.rank()),
+      comm_size_(comm.size()) {
+  // helper struct to exchange information
+  struct TransposeParameter {
+    SizeType dimX;
+    SizeType dimY;
+    SizeType dimZ;
+    SizeType numLocalXYPlanes;
+    SizeType numLocalZSticks;
+  };
+
+  // Only index triplets supported (for now)
+  if (indexFormat != SPFFT_INDEX_TRIPLETS) {
+    throw InternalError();
+  }
+
+  // convert indices to internal format
+  std::vector<int> localStickIndices;
+  std::tie(freqValueIndices_, localStickIndices) =
+      convert_index_triplets(transformType == SPFFT_TRANS_R2C, dimX, dimY, dimZ, numLocalElements,
+                             indices, indices + 1, indices + 2, 3);
+
+  stickIndicesPerRank_ = create_distributed_transform_indices(comm, std::move(localStickIndices));
+  check_stick_duplicates(stickIndicesPerRank_);
+
+  const SizeType numLocalZSticks = stickIndicesPerRank_[comm.rank()].size();
+
+  TransposeParameter paramLocal =
+      TransposeParameter{dimX, dimY, dimZ, numLocalXYPlanes, numLocalZSticks};
+
+  // exchange local parameters
+  MPIDatatypeHandle parameterType = MPIDatatypeHandle::create_contiguous(
+      sizeof(TransposeParameter) / sizeof(SizeType), MPIMatchElementaryType<SizeType>::get());
+
+  std::vector<TransposeParameter> paramPerRank(comm.size());
+  mpi_check_status(MPI_Allgather(&paramLocal, 1, parameterType.get(), paramPerRank.data(), 1,
+                                 parameterType.get(), comm.get()));
+
+  // Check parameters
+  SizeType numZSticksTotal = 0;
+  SizeType numXYPlanesTotal = 0;
+  for (const auto& p : paramPerRank) {
+    // dimensions must match for all ranks
+    if (p.dimX != paramLocal.dimX || p.dimY != paramLocal.dimY || p.dimZ != paramLocal.dimZ) {
+      throw MPIParameterMismatchError();
+    }
+    numZSticksTotal += p.numLocalZSticks;
+    numXYPlanesTotal += p.numLocalXYPlanes;
+  }
+  if (numZSticksTotal > dimX * dimY) {
+    // More z sticks than possible
+    throw MPIParameterMismatchError();
+  }
+  if (numXYPlanesTotal != dimZ) {
+    throw MPIParameterMismatchError();
+  }
+
+  // store all parameters in members
+  numZSticksPerRank_.reserve(comm.size());
+  numXYPlanesPerRank_.reserve(comm.size());
+  xyPlaneOffsets_.reserve(comm.size());
+  SizeType startIndex = 0;
+  SizeType xyPlaneOffset = 0;
+  for (const auto& p : paramPerRank) {
+    numZSticksPerRank_.emplace_back(p.numLocalZSticks);
+    numXYPlanesPerRank_.emplace_back(p.numLocalXYPlanes);
+    xyPlaneOffsets_.emplace_back(xyPlaneOffset);
+    startIndex += p.numLocalZSticks;
+    xyPlaneOffset += p.numLocalXYPlanes;
+  }
+
+  maxNumZSticks_ = *std::max_element(numZSticksPerRank_.begin(), numZSticksPerRank_.end());
+  maxNumXYPlanes_ = *std::max_element(numXYPlanesPerRank_.begin(), numXYPlanesPerRank_.end());
+  totalNumZSticks_ =
+      std::accumulate(numZSticksPerRank_.begin(), numZSticksPerRank_.end(), SizeType(0));
+
+  // check if this rank holds the x=0, y=0 z-stick, which is treated specially for the real to
+  // complex case
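+  // (Explanatory note: for R2C transforms only about half the frequency domain is
+  // stored, and the stick at (x, y) = (0, 0) holds the values that must satisfy
+  // conjugate symmetry along z, so later stages need to know where it lives.)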
+  zeroZeroStickIndex_ = 0;
+  for (const auto& index : stickIndicesPerRank_[comm.rank()]) {
+    if (index == 0) {
+      break;
+    }
+    ++zeroZeroStickIndex_;
+  }
+}
+#endif
+
+Parameters::Parameters(const SpfftTransformType transformType, const SizeType dimX,
+                       const SizeType dimY, const SizeType dimZ, const SizeType numLocalElements,
+                       SpfftIndexFormatType indexFormat, const int* indices)
+    : transformType_(transformType),
+      dimX_(dimX),
+      dimXFreq_(transformType == SPFFT_TRANS_R2C ? dimX / 2 + 1 : dimX),
+      dimY_(dimY),
+      dimZ_(dimZ),
+      maxNumXYPlanes_(dimZ),
+      totalNumXYPlanes_(dimZ),
+      comm_rank_(0),
+      comm_size_(1),
+      numXYPlanesPerRank_(1, dimZ),
+      xyPlaneOffsets_(1, 0) {
+  // Only index triplets supported (for now)
+  if (indexFormat != SPFFT_INDEX_TRIPLETS) {
+    throw InternalError();
+  }
+
+  std::vector<int> localStickIndices;
+  std::tie(freqValueIndices_, localStickIndices) =
+      convert_index_triplets(transformType == SPFFT_TRANS_R2C, dimX, dimY, dimZ, numLocalElements,
+                             indices, indices + 1, indices + 2, 3);
+  stickIndicesPerRank_.emplace_back(std::move(localStickIndices));
+  check_stick_duplicates(stickIndicesPerRank_);
+
+  maxNumZSticks_ = stickIndicesPerRank_[0].size();
+  totalNumZSticks_ = stickIndicesPerRank_[0].size();
+  numZSticksPerRank_.assign(1, stickIndicesPerRank_[0].size());
+  zeroZeroStickIndex_ = 0;
+  for (const auto& index : stickIndicesPerRank_[0]) {
+    if (index == 0) {
+      break;
+    }
+    ++zeroZeroStickIndex_;
+  }
+}
+
+} // namespace spfft
diff --git a/src/parameters/parameters.hpp b/src/parameters/parameters.hpp
new file mode 100644
index 0000000..fcc3a3f
--- /dev/null
+++ b/src/parameters/parameters.hpp
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef SPFFT_PARAMETERS_HPP
+#define SPFFT_PARAMETERS_HPP
+
+#include <cassert>
+#include <limits>
+#include <vector>
+#include "compression/indices.hpp"
+#include "spfft/config.h"
+#include "spfft/exceptions.hpp"
+#include "util/common_types.hpp"
+#include "memory/host_array_const_view.hpp"
+#include "spfft/types.h"
+
+#ifdef SPFFT_MPI
+#include "mpi_util/mpi_communicator_handle.hpp"
+#endif
+
+namespace spfft {
+
+class Parameters {
+public:
+#ifdef SPFFT_MPI
+  Parameters(const MPICommunicatorHandle& comm, const SpfftTransformType transformType,
+             const SizeType dimX, const SizeType dimY, const SizeType dimZ,
+             const SizeType numLocalXYPlanes, const SizeType numLocalElements,
+             SpfftIndexFormatType indexFormat, const int* indices);
+#endif
+
+  Parameters(const SpfftTransformType transformType, const SizeType dimX, const SizeType dimY,
+             const SizeType dimZ, const SizeType numLocalElements, SpfftIndexFormatType indexFormat,
+             const int* indices);
+
+  inline auto dim_x() const noexcept -> SizeType { return dimX_; }
+
+  inline auto dim_x_freq() const noexcept -> SizeType { return dimXFreq_; }
+
+  inline auto dim_y() const noexcept -> SizeType { return dimY_; }
+
+  inline auto dim_z() const noexcept -> SizeType { return dimZ_; }
+
+  inline auto max_num_z_sticks() const noexcept -> SizeType { return maxNumZSticks_; }
+
+  inline auto max_num_xy_planes() const noexcept -> SizeType { return maxNumXYPlanes_; }
+
+  inline auto total_num_z_sticks() const noexcept -> SizeType { return totalNumZSticks_; }
+
+  inline auto total_num_xy_planes() const noexcept -> SizeType { return totalNumXYPlanes_; }
+
+  inline auto transform_type() const noexcept -> SpfftTransformType { return transformType_; }
+
+  inline auto zero_zero_stick_index() const noexcept -> SizeType { return zeroZeroStickIndex_; }
+
+  inline auto num_xy_planes(const SizeType rank) const -> SizeType {
+    assert(rank < numXYPlanesPerRank_.size());
+    return numXYPlanesPerRank_[rank];
+  }
+
+  inline auto local_num_xy_planes() const -> SizeType {
+    assert(comm_rank_ < numXYPlanesPerRank_.size());
+    return numXYPlanesPerRank_[comm_rank_];
+  }
+
+  inline auto xy_plane_offset(const SizeType rank) const -> SizeType {
+    assert(rank < numXYPlanesPerRank_.size());
+    return xyPlaneOffsets_[rank];
+  }
+
+  inline auto local_xy_plane_offset() const -> SizeType {
+    assert(comm_rank_ < numXYPlanesPerRank_.size());
+    return xyPlaneOffsets_[comm_rank_];
+  }
+
+  inline auto num_z_sticks(const SizeType rank) const -> SizeType {
+    assert(rank < numZSticksPerRank_.size());
+    return numZSticksPerRank_[rank];
+  }
+
+  inline auto local_num_z_sticks() const -> SizeType {
+    assert(comm_rank_ < numZSticksPerRank_.size());
+    return numZSticksPerRank_[comm_rank_];
+  }
+
+  inline auto z_stick_xy_indices(const SizeType rank) const -> HostArrayConstView1D<int> {
+    assert(rank < stickIndicesPerRank_.size());
+    assert(num_z_sticks(rank) == stickIndicesPerRank_[rank].size());
+    return HostArrayConstView1D<int>(stickIndicesPerRank_[rank].data(),
+                                     stickIndicesPerRank_[rank].size(), false);
+  }
+
+  inline auto local_z_stick_xy_indices() const -> HostArrayConstView1D<int> {
+    assert(comm_rank_ < stickIndicesPerRank_.size());
+    assert(num_z_sticks(comm_rank_) == stickIndicesPerRank_[comm_rank_].size());
+    return HostArrayConstView1D<int>(stickIndicesPerRank_[comm_rank_].data(),
+                                     stickIndicesPerRank_[comm_rank_].size(), false);
+  }
+
+  inline auto local_value_indices() const -> const std::vector<int>& { return freqValueIndices_; }
+
+  inline auto local_num_elements() const -> SizeType { return freqValueIndices_.size(); }
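+
+  // Illustrative layout (hypothetical numbers): with comm_size_ == 2 and
+  // dimZ_ == 5, numXYPlanesPerRank_ might be {3, 2} with xyPlaneOffsets_ == {0, 3},
+  // i.e. rank 1 owns the xy planes with z indices 3 and 4. The offsets are prefix
+  // sums of the per-rank plane counts (see parameters.cpp).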
+
+  inline auto comm_rank() const -> SizeType { return comm_rank_; }
+
+  inline auto comm_size() const -> SizeType { return comm_size_; }
+
+private:
+  SpfftTransformType transformType_;
+  SizeType zeroZeroStickIndex_ = std::numeric_limits<SizeType>::max();
+  SizeType dimX_ = 0;
+  SizeType dimXFreq_ = 0;
+  SizeType dimY_ = 0;
+  SizeType dimZ_ = 0;
+  SizeType maxNumZSticks_ = 0;
+  SizeType maxNumXYPlanes_ = 0;
+  SizeType totalNumZSticks_ = 0;
+  SizeType totalNumXYPlanes_ = 0;
+  SizeType comm_rank_ = 0;
+  SizeType comm_size_ = 1;
+  std::vector<SizeType> numZSticksPerRank_;
+  std::vector<SizeType> numXYPlanesPerRank_;
+  std::vector<SizeType> xyPlaneOffsets_;
+  std::vector<std::vector<int>> stickIndicesPerRank_;
+  std::vector<int> freqValueIndices_;
+};
+
+} // namespace spfft
+
+#endif
diff --git a/src/spfft/grid.cpp b/src/spfft/grid.cpp
new file mode 100644
index 0000000..bd31819
--- /dev/null
+++ b/src/spfft/grid.cpp
@@ -0,0 +1,295 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spfft/grid.hpp"
+#include "spfft/grid.h"
+#include "spfft/grid_internal.hpp"
+
+namespace spfft {
+
+Grid::Grid(int maxDimX, int maxDimY, int maxDimZ, int maxNumLocalZColumns,
+           SpfftProcessingUnitType processingUnit, int maxNumThreads)
+    : grid_(new GridInternal<double>(maxDimX, maxDimY, maxDimZ, maxNumLocalZColumns,
+                                     processingUnit, maxNumThreads)) {}
+#ifdef SPFFT_MPI
+Grid::Grid(int maxDimX, int maxDimY, int maxDimZ, int maxNumLocalZColumns, int maxLocalZLength,
+           SpfftProcessingUnitType processingUnit, int maxNumThreads, MPI_Comm comm,
+           SpfftExchangeType exchangeType)
+    : grid_(new GridInternal<double>(maxDimX, maxDimY, maxDimZ, maxNumLocalZColumns,
+                                     maxLocalZLength, processingUnit, maxNumThreads, comm,
+                                     exchangeType)) {}
+#endif
+
+Grid::Grid(const Grid& grid) : grid_(new GridInternal<double>(*(grid.grid_))) {}
+
+Grid& Grid::operator=(const Grid& grid) {
+  grid_.reset(new GridInternal<double>(*(grid.grid_)));
+  return *this;
+}
+
+Transform Grid::create_transform(SpfftProcessingUnitType processingUnit,
+                                 SpfftTransformType transformType, int dimX, int dimY, int dimZ,
+                                 int localZLength, int numLocalElements,
+                                 SpfftIndexFormatType indexFormat, const int* indices) const {
+  return Transform(grid_, processingUnit, transformType, dimX, dimY, dimZ, localZLength,
+                   numLocalElements, indexFormat, indices);
+}
+
+int Grid::max_dim_x() const { return grid_->max_dim_x(); }
+
+int Grid::max_dim_y() const { return grid_->max_dim_y(); }
+
+int Grid::max_dim_z() const { return grid_->max_dim_z(); }
+
+int Grid::max_num_local_z_columns() const { return grid_->max_num_local_z_columns(); }
+
+int Grid::max_local_z_length() const { return grid_->max_num_local_xy_planes(); }
+
+SpfftProcessingUnitType Grid::processing_unit() const { return grid_->processing_unit(); }
+
+int Grid::device_id() const { return grid_->device_id(); }
+
+int Grid::num_threads() const { return grid_->num_threads(); }
+
+#ifdef SPFFT_MPI
+MPI_Comm Grid::communicator() const { return grid_->communicator().get(); }
+#endif
+} // namespace spfft
+
+//---------------------
+// C API
+//---------------------
+
+extern "C" {
+SpfftError spfft_grid_create(SpfftGrid* grid, int maxDimX, int maxDimY, int maxDimZ,
+                             int maxNumLocalZSticks, SpfftProcessingUnitType processingUnit,
+                             int maxNumThreads) {
+  try {
+    *grid = new spfft::Grid(maxDimX, maxDimY, maxDimZ, maxNumLocalZSticks, processingUnit,
+                            maxNumThreads);
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+#ifdef SPFFT_MPI
+SpfftError spfft_grid_create_distributed(SpfftGrid* grid, int maxDimX, int maxDimY, int maxDimZ,
+                                         int maxNumLocalZSticks, int maxLocalZLength,
+                                         SpfftProcessingUnitType processingUnit, int maxNumThreads,
+                                         MPI_Comm comm, SpfftExchangeType exchangeType) {
+  try {
+    *grid = new spfft::Grid(maxDimX, maxDimY, maxDimZ, maxNumLocalZSticks, maxLocalZLength,
+                            processingUnit, maxNumThreads, comm, exchangeType);
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
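+
+// Illustrative C API call sequence (hypothetical sizes, error handling shortened;
+// -1 for maxNumThreads requests the OpenMP default):
+//
+//   SpfftGrid grid = NULL;
+//   SpfftError err = spfft_grid_create(&grid, 16, 16, 16, 16 * 16, SPFFT_PU_HOST, -1);
+//   if (err != SPFFT_SUCCESS) { /* handle error */ }
+//   /* ... create and execute transforms on the grid ... */
+//   spfft_grid_destroy(grid);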
+
+SpfftError spfft_grid_create_distributed_fortran(SpfftGrid* grid, int maxDimX, int maxDimY,
+                                                 int maxDimZ, int maxNumLocalZSticks,
+                                                 int maxLocalZLength,
+                                                 SpfftProcessingUnitType processingUnit,
+                                                 int maxNumThreads, int commFortran,
+                                                 SpfftExchangeType exchangeType) {
+  try {
+    MPI_Comm comm = MPI_Comm_f2c(commFortran);
+    *grid = new spfft::Grid(maxDimX, maxDimY, maxDimZ, maxNumLocalZSticks, maxLocalZLength,
+                            processingUnit, maxNumThreads, comm, exchangeType);
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+#endif
+
+SpfftError spfft_grid_destroy(SpfftGrid grid) {
+  if (!grid) {
+    return SpfftError::SPFFT_INVALID_HANDLE_ERROR;
+  }
+  try {
+    delete reinterpret_cast<spfft::Grid*>(grid);
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  grid = nullptr;
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+SpfftError spfft_grid_max_dim_x(SpfftGrid grid, int* dimX) {
+  if (!grid) {
+    return SpfftError::SPFFT_INVALID_HANDLE_ERROR;
+  }
+  try {
+    *dimX = reinterpret_cast<spfft::Grid*>(grid)->max_dim_x();
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+SpfftError spfft_grid_max_dim_y(SpfftGrid grid, int* dimY) {
+  if (!grid) {
+    return SpfftError::SPFFT_INVALID_HANDLE_ERROR;
+  }
+  try {
+    *dimY = reinterpret_cast<spfft::Grid*>(grid)->max_dim_y();
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+SpfftError spfft_grid_max_dim_z(SpfftGrid grid, int* dimZ) {
+  if (!grid) {
+    return SpfftError::SPFFT_INVALID_HANDLE_ERROR;
+  }
+  try {
+    *dimZ = reinterpret_cast<spfft::Grid*>(grid)->max_dim_z();
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+SpfftError spfft_grid_max_num_local_z_columns(SpfftGrid grid, int* maxNumLocalZColumns) {
+  if (!grid) {
+    return SpfftError::SPFFT_INVALID_HANDLE_ERROR;
+  }
+  try {
+    *maxNumLocalZColumns = reinterpret_cast<spfft::Grid*>(grid)->max_num_local_z_columns();
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+SpfftError spfft_grid_max_local_z_length(SpfftGrid grid, int* maxLocalZLength) {
+  if (!grid) {
+    return SpfftError::SPFFT_INVALID_HANDLE_ERROR;
+  }
+  try {
+    *maxLocalZLength = reinterpret_cast<spfft::Grid*>(grid)->max_local_z_length();
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+SpfftError spfft_grid_processing_unit(SpfftGrid grid, SpfftProcessingUnitType* processingUnit) {
+  if (!grid) {
+    return SpfftError::SPFFT_INVALID_HANDLE_ERROR;
+  }
+  try {
+    *processingUnit = reinterpret_cast<spfft::Grid*>(grid)->processing_unit();
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+SpfftError spfft_grid_device_id(SpfftGrid grid, int* deviceId) {
+  if (!grid) {
+    return SpfftError::SPFFT_INVALID_HANDLE_ERROR;
+  }
+  try {
+    *deviceId = reinterpret_cast<spfft::Grid*>(grid)->device_id();
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+SpfftError spfft_grid_num_threads(SpfftGrid grid, int* numThreads) {
+  if (!grid) {
+    return SpfftError::SPFFT_INVALID_HANDLE_ERROR;
+  }
+  try {
+    *numThreads = reinterpret_cast<spfft::Grid*>(grid)->num_threads();
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+#ifdef SPFFT_MPI
+SpfftError spfft_grid_communicator(SpfftGrid grid, MPI_Comm* comm) {
+  if (!grid) {
+    return SpfftError::SPFFT_INVALID_HANDLE_ERROR;
+  }
+  try {
+    *comm = reinterpret_cast<spfft::Grid*>(grid)->communicator();
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+SpfftError spfft_grid_communicator_fortran(SpfftGrid grid, int* commFortran) {
+  if (!grid) {
+    return SpfftError::SPFFT_INVALID_HANDLE_ERROR;
+  }
+  try {
+    *commFortran = MPI_Comm_c2f(reinterpret_cast<spfft::Grid*>(grid)->communicator());
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+#endif
+}
+
diff --git a/src/spfft/grid_float.cpp b/src/spfft/grid_float.cpp
new file mode 100644
index 0000000..40ae4c3
--- /dev/null
+++ b/src/spfft/grid_float.cpp
@@ -0,0 +1,282 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spfft/grid_float.hpp"
+#include "spfft/grid_float.h"
+#include "spfft/grid_internal.hpp"
+
+#ifdef SPFFT_SINGLE_PRECISION
+namespace spfft {
+GridFloat::GridFloat(int maxDimX, int maxDimY, int maxDimZ, int maxNumLocalZColumns,
+                     SpfftProcessingUnitType processingUnit, int maxNumThreads)
+    : grid_(new GridInternal<float>(maxDimX, maxDimY, maxDimZ, maxNumLocalZColumns, processingUnit,
+                                    maxNumThreads)) {}
+
+#ifdef SPFFT_MPI
+GridFloat::GridFloat(int maxDimX, int maxDimY, int maxDimZ, int maxNumLocalZColumns,
+                     int maxLocalZLength, SpfftProcessingUnitType processingUnit, int maxNumThreads,
+                     MPI_Comm comm, SpfftExchangeType exchangeType)
+    : grid_(new GridInternal<float>(maxDimX, maxDimY, maxDimZ, maxNumLocalZColumns, maxLocalZLength,
+                                    processingUnit, maxNumThreads, comm, exchangeType)) {}
+#endif
+
+GridFloat::GridFloat(const GridFloat& grid) : grid_(new GridInternal<float>(*(grid.grid_))) {}
+
+GridFloat& GridFloat::operator=(const GridFloat& grid) {
+  grid_.reset(new GridInternal<float>(*(grid.grid_)));
+  return *this;
+}
+
+TransformFloat GridFloat::create_transform(SpfftProcessingUnitType processingUnit,
+                                           SpfftTransformType transformType, int dimX, int dimY,
+                                           int dimZ, int localZLength, int numLocalElements,
+                                           SpfftIndexFormatType indexFormat,
+                                           const int* indices) const {
+  return TransformFloat(grid_, processingUnit, transformType, dimX, dimY, dimZ, localZLength,
+                        numLocalElements, indexFormat, indices);
+}
+
+int GridFloat::max_dim_x() const { return grid_->max_dim_x(); }
+
+int GridFloat::max_dim_y() const { return grid_->max_dim_y(); }
+
+int GridFloat::max_dim_z() const { return grid_->max_dim_z(); }
+
+int GridFloat::max_num_local_z_columns() const { return grid_->max_num_local_z_columns(); }
+
+int GridFloat::max_local_z_length() const { return grid_->max_num_local_xy_planes(); }
+
+SpfftProcessingUnitType GridFloat::processing_unit() const { return grid_->processing_unit(); }
+
+int GridFloat::device_id() const { return grid_->device_id(); }
+
+int GridFloat::num_threads() const { return grid_->num_threads(); }
+
+#ifdef SPFFT_MPI
+MPI_Comm GridFloat::communicator() const { return grid_->communicator().get(); }
+#endif
+
+} // namespace spfft
+
+//---------------------
+// C API
+//---------------------
+
+extern "C" {
+SpfftError spfft_float_grid_create(SpfftFloatGrid* grid, int maxDimX, int maxDimY, int maxDimZ,
+                                   int maxNumLocalZSticks, SpfftProcessingUnitType processingUnit,
+                                   int maxNumThreads) {
+  try {
+    *grid = new spfft::GridFloat(maxDimX, maxDimY, maxDimZ, maxNumLocalZSticks, processingUnit,
+                                 maxNumThreads);
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+#ifdef SPFFT_MPI
+SpfftError spfft_float_grid_create_distributed(SpfftFloatGrid* grid, int maxDimX, int maxDimY,
+                                               int maxDimZ, int maxNumLocalZSticks,
+                                               int maxLocalZLength,
+                                               SpfftProcessingUnitType processingUnit,
+                                               int maxNumThreads, MPI_Comm comm,
+                                               SpfftExchangeType exchangeType) {
+  try {
+    *grid = new spfft::GridFloat(maxDimX, maxDimY, maxDimZ, maxNumLocalZSticks, maxLocalZLength,
+                                 processingUnit, maxNumThreads, comm, exchangeType);
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+#endif
+
+SpfftError spfft_float_grid_destroy(SpfftFloatGrid grid) {
+  if (!grid) {
+    return SpfftError::SPFFT_INVALID_HANDLE_ERROR;
+  }
+  try {
+    delete reinterpret_cast<spfft::GridFloat*>(grid);
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  grid = nullptr;
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+SpfftError spfft_float_grid_max_dim_x(SpfftFloatGrid grid, int* dimX) {
+  if (!grid) {
+    return SpfftError::SPFFT_INVALID_HANDLE_ERROR;
+  }
+  try {
+    *dimX = reinterpret_cast<spfft::GridFloat*>(grid)->max_dim_x();
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+SpfftError spfft_float_grid_max_dim_y(SpfftFloatGrid grid, int* dimY) {
+  if (!grid) {
+    return SpfftError::SPFFT_INVALID_HANDLE_ERROR;
+  }
+  try {
+    *dimY = reinterpret_cast<spfft::GridFloat*>(grid)->max_dim_y();
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+SpfftError spfft_float_grid_max_dim_z(SpfftFloatGrid grid, int* dimZ) {
+  if (!grid) {
+    return SpfftError::SPFFT_INVALID_HANDLE_ERROR;
+  }
+  try {
+    *dimZ = reinterpret_cast<spfft::GridFloat*>(grid)->max_dim_z();
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+SpfftError spfft_float_grid_max_num_local_z_columns(SpfftFloatGrid grid,
+                                                    int* maxNumLocalZColumns) {
+  if (!grid) {
+    return SpfftError::SPFFT_INVALID_HANDLE_ERROR;
+  }
+  try {
+    *maxNumLocalZColumns = reinterpret_cast<spfft::GridFloat*>(grid)->max_num_local_z_columns();
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+SpfftError spfft_float_grid_max_local_z_length(SpfftFloatGrid grid, int* maxLocalZLength) {
+  if (!grid) {
+    return SpfftError::SPFFT_INVALID_HANDLE_ERROR;
+  }
+  try {
+    *maxLocalZLength = reinterpret_cast<spfft::GridFloat*>(grid)->max_local_z_length();
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+SpfftError spfft_float_grid_processing_unit(SpfftFloatGrid grid,
+                                            SpfftProcessingUnitType* processingUnit) {
+  if (!grid) {
+    return SpfftError::SPFFT_INVALID_HANDLE_ERROR;
+  }
+  try {
+    *processingUnit = reinterpret_cast<spfft::GridFloat*>(grid)->processing_unit();
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+SpfftError spfft_float_grid_device_id(SpfftFloatGrid grid, int* deviceId) {
+  if (!grid) {
+    return SpfftError::SPFFT_INVALID_HANDLE_ERROR;
+  }
+  try {
+    *deviceId = reinterpret_cast<spfft::GridFloat*>(grid)->device_id();
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+SpfftError spfft_float_grid_num_threads(SpfftFloatGrid grid, int* numThreads) {
+  if (!grid) {
+    return SpfftError::SPFFT_INVALID_HANDLE_ERROR;
+  }
+  try {
+    *numThreads = reinterpret_cast<spfft::GridFloat*>(grid)->num_threads();
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+#ifdef SPFFT_MPI
+SpfftError spfft_float_grid_communicator(SpfftFloatGrid grid, MPI_Comm* comm) {
+  if (!grid) {
+    return SpfftError::SPFFT_INVALID_HANDLE_ERROR;
+  }
+  try {
+    *comm = reinterpret_cast<spfft::GridFloat*>(grid)->communicator();
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+SpfftError spfft_float_grid_communicator_fortran(SpfftFloatGrid grid, int* commFortran) {
+  if (!grid) {
+    return SpfftError::SPFFT_INVALID_HANDLE_ERROR;
+  }
+  try {
+    *commFortran = MPI_Comm_c2f(reinterpret_cast<spfft::GridFloat*>(grid)->communicator());
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+#endif
+
+}
+
+#endif
diff --git a/src/spfft/grid_internal.cpp b/src/spfft/grid_internal.cpp
new file mode 100644
index 0000000..e0e3865
--- /dev/null
+++ b/src/spfft/grid_internal.cpp
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#include "spfft/config.h"
+
+#include <algorithm>
+#include <limits>
+#include "spfft/grid_internal.hpp"
+
+#ifdef SPFFT_MPI
+#include "mpi_util/mpi_check_status.hpp"
+#include "mpi_util/mpi_datatype_handle.hpp"
+#include "mpi_util/mpi_match_elementary_type.hpp"
+#endif
+
+#if defined(SPFFT_CUDA) || defined(SPFFT_ROCM)
+#include "gpu_util/gpu_runtime_api.hpp"
+#include "gpu_util/gpu_device_guard.hpp"
+#endif
+
+namespace spfft {
+
+template <typename T>
+GridInternal<T>::GridInternal(int maxDimX, int maxDimY, int maxDimZ, int maxNumLocalZSticks,
+                              SpfftProcessingUnitType executionUnit, int numThreads)
+    : isLocal_(true),
+      executionUnit_(executionUnit),
+      deviceId_(0),
+      numThreads_(numThreads),
+      maxDimX_(maxDimX),
+      maxDimY_(maxDimY),
+      maxDimZ_(maxDimZ),
+      maxNumLocalZSticks_(maxNumLocalZSticks),
+      maxNumLocalXYPlanes_(maxDimZ) {
+  // input check
+  if (maxDimX <= 0 || maxDimY <= 0 || maxDimZ <= 0 || maxNumLocalZSticks < 0) {
+    throw InvalidParameterError();
+  }
+  if (!(executionUnit &
+        (SpfftProcessingUnitType::SPFFT_PU_HOST | SpfftProcessingUnitType::SPFFT_PU_GPU))) {
+    throw InvalidParameterError();
+  }
+
+  // set number of threads to default omp value if not valid
+  if (numThreads < 1) {
+    numThreads = omp_get_max_threads();
+    numThreads_ = omp_get_max_threads();
+  }
+
+  // store device id
+#if (defined(SPFFT_CUDA) || defined(SPFFT_ROCM))
+  gpu::check_status(gpu::get_device(&deviceId_));
+#endif
+
+  // allocate memory
+  if (executionUnit & SpfftProcessingUnitType::SPFFT_PU_HOST) {
+    arrayHost1_ = HostArray<std::complex<T>>(static_cast<SizeType>(maxDimX * maxDimY * maxDimZ));
+    arrayHost2_ = HostArray<std::complex<T>>(static_cast<SizeType>(maxDimX * maxDimY * maxDimZ));
+  }
+  if (executionUnit & SpfftProcessingUnitType::SPFFT_PU_GPU) {
+#if (defined(SPFFT_CUDA) || defined(SPFFT_ROCM))
+    if (arrayHost1_.empty()) {
+      // not already created for CPU, which always requires at least as much memory
+      arrayHost1_ =
+          HostArray<std::complex<T>>(static_cast<SizeType>(maxNumLocalZSticks * maxDimZ));
+      arrayHost2_ = HostArray<std::complex<T>>(static_cast<SizeType>(maxDimX * maxDimY * maxDimZ));
+    }
+    arrayHost1_.pin_memory();
+    arrayHost2_.pin_memory();
+    arrayGPU1_ = GPUArray<typename gpu::fft::ComplexType<T>::type>(
+        static_cast<SizeType>(maxNumLocalZSticks * maxDimZ));
+    arrayGPU2_ = GPUArray<typename gpu::fft::ComplexType<T>::type>(
+        static_cast<SizeType>(maxDimX * maxDimY * maxDimZ));
+
+    // each transform will resize the work buffer as needed
+    fftWorkBuffer_.reset(new GPUArray<char>());
+
+#else
+    throw GPUSupportError();
+#endif
+  }
+}
+
+#ifdef SPFFT_MPI
+template <typename T>
+GridInternal<T>::GridInternal(int maxDimX, int maxDimY, int maxDimZ, int maxNumLocalZSticks,
+                              int maxNumLocalXYPlanes, SpfftProcessingUnitType executionUnit,
+                              int numThreads, MPI_Comm comm, SpfftExchangeType exchangeType)
+    : isLocal_(false),
+      executionUnit_(executionUnit),
+      deviceId_(0),
+      numThreads_(numThreads),
+      maxDimX_(maxDimX),
+      maxDimY_(maxDimY),
+      maxDimZ_(maxDimZ),
+      maxNumLocalZSticks_(maxNumLocalZSticks),
+      maxNumLocalXYPlanes_(maxNumLocalXYPlanes),
+      comm_(comm),
+      exchangeType_(exchangeType) {
+  // input check
+  if (static_cast<long long>(maxDimX) * static_cast<long long>(maxDimY) *
+          static_cast<long long>(maxNumLocalXYPlanes) >
+      std::numeric_limits<int>::max()) {
+    throw OverflowError();
+  }
+  if (static_cast<long long>(maxNumLocalZSticks) * static_cast<long long>(maxDimZ) >
+      std::numeric_limits<int>::max()) {
+    throw OverflowError();
+  }
+  if (maxDimX <= 0 || maxDimY <= 0 || maxDimZ <= 0 || maxNumLocalZSticks < 0) {
+    throw InvalidParameterError();
+  }
+  if (!(executionUnit &
+        (SpfftProcessingUnitType::SPFFT_PU_HOST | SpfftProcessingUnitType::SPFFT_PU_GPU))) {
+    throw InvalidParameterError();
+  }
+  if (exchangeType != SpfftExchangeType::SPFFT_EXCH_DEFAULT &&
+      exchangeType != SpfftExchangeType::SPFFT_EXCH_BUFFERED &&
+      exchangeType != SpfftExchangeType::SPFFT_EXCH_BUFFERED_FLOAT &&
+      exchangeType != SpfftExchangeType::SPFFT_EXCH_COMPACT_BUFFERED &&
+      exchangeType != SpfftExchangeType::SPFFT_EXCH_COMPACT_BUFFERED_FLOAT &&
+      exchangeType != SpfftExchangeType::SPFFT_EXCH_UNBUFFERED) {
+    throw InvalidParameterError();
+  }
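+
+  // Worked example of the mismatch check below (illustrative): if rank 0 passes
+  // exchangeType 0b01 and rank 1 passes 0b10, the MPI_BOR all-reduce yields 0b11
+  // on every rank, which differs from both local values, so both ranks detect
+  // the mismatch.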
+
+  // compare parameters between ranks
+  {
+    int errorDetected = 0;
+    int exchangeAll = exchangeType;
+    int executionUnitAll = executionUnit;
+
+    // Bitwise or will lead to a mismatch on at least one rank if not all values are equal
+    mpi_check_status(MPI_Allreduce(MPI_IN_PLACE, &exchangeAll, 1, MPI_INT, MPI_BOR, comm_.get()));
+    mpi_check_status(
+        MPI_Allreduce(MPI_IN_PLACE, &executionUnitAll, 1, MPI_INT, MPI_BOR, comm_.get()));
+
+    if (exchangeAll != exchangeType || executionUnitAll != executionUnit) {
+      errorDetected = 1;
+    }
+
+    // check if any rank has detected an error
+    mpi_check_status(
+        MPI_Allreduce(MPI_IN_PLACE, &errorDetected, 1, MPI_INT, MPI_SUM, comm_.get()));
+    if (errorDetected) {
+      throw MPIParameterMismatchError();
+    }
+  }
+
+  // set number of threads to default omp value if not valid
+  if (numThreads < 1) {
+    numThreads = omp_get_max_threads();
+    numThreads_ = omp_get_max_threads();
+  }
+
+  // set default exchange type
+  if (exchangeType == SpfftExchangeType::SPFFT_EXCH_DEFAULT) {
+    exchangeType = SpfftExchangeType::SPFFT_EXCH_COMPACT_BUFFERED;
+    exchangeType_ = SpfftExchangeType::SPFFT_EXCH_COMPACT_BUFFERED;
+  }
+
+  // mark as local if comm size is 1
+  if (comm_.size() == 1) isLocal_ = true;
+
+  // store device id
+#if (defined(SPFFT_CUDA) || defined(SPFFT_ROCM))
+  gpu::check_status(gpu::get_device(&deviceId_));
+#endif
+
+  int requiredSize = 0;
+  switch (exchangeType) {
+    case SpfftExchangeType::SPFFT_EXCH_BUFFERED: {
+      decltype(maxNumLocalXYPlanes_) globalMaxNumXYPlanes = 0;
+      decltype(maxNumLocalZSticks_) globalMaxNumZSticks = 0;
+      MPI_Allreduce(&maxNumLocalXYPlanes_, &globalMaxNumXYPlanes, 1,
+                    MPIMatchElementaryType<decltype(maxNumLocalXYPlanes_)>::get(), MPI_MAX, comm);
+      MPI_Allreduce(&maxNumLocalZSticks_, &globalMaxNumZSticks, 1,
+                    MPIMatchElementaryType<decltype(maxNumLocalZSticks_)>::get(), MPI_MAX, comm);
+      requiredSize = std::max(
+          {globalMaxNumXYPlanes * globalMaxNumZSticks * static_cast<int>(comm_.size() + 1),
+           maxDimX_ * maxDimY_ * maxNumLocalXYPlanes_, maxDimZ_ * maxNumLocalZSticks_});
+    } break;
+    default: {
+      // AUTO or COMPACT_BUFFERED or UNBUFFERED
+      requiredSize =
+          std::max(maxDimX_ * maxDimY_ * maxNumLocalXYPlanes_, maxDimZ_ * maxNumLocalZSticks_);
+
+    } break;
+  }
+
+  // Host
+  arrayHost1_ = HostArray<std::complex<T>>(static_cast<SizeType>(requiredSize));
+  arrayHost2_ = HostArray<std::complex<T>>(static_cast<SizeType>(requiredSize));
+
+  // GPU
+  if (executionUnit & SpfftProcessingUnitType::SPFFT_PU_GPU) {
+#if (defined(SPFFT_CUDA) || defined(SPFFT_ROCM))
+    arrayHost1_.pin_memory();
+    arrayHost2_.pin_memory();
+    arrayGPU1_ =
+        GPUArray<typename gpu::fft::ComplexType<T>::type>(static_cast<SizeType>(requiredSize));
+    arrayGPU2_ =
+        GPUArray<typename gpu::fft::ComplexType<T>::type>(static_cast<SizeType>(requiredSize));
+
+    // each transform will resize the work buffer as needed
+    fftWorkBuffer_.reset(new GPUArray<char>());
+#else
+    throw GPUSupportError();
+#endif
+  }
+}
+#endif
+
+template <typename T>
+GridInternal<T>::GridInternal(const GridInternal& grid)
+    : isLocal_(grid.isLocal_),
+      executionUnit_(grid.executionUnit_),
+      deviceId_(grid.deviceId_),
+      numThreads_(grid.numThreads_),
+      maxDimX_(grid.maxDimX_),
+      maxDimY_(grid.maxDimY_),
+      maxDimZ_(grid.maxDimZ_),
+      maxNumLocalZSticks_(grid.maxNumLocalZSticks_),
+      maxNumLocalXYPlanes_(grid.maxNumLocalXYPlanes_),
+      arrayHost1_(grid.arrayHost1_.size()),
+      arrayHost2_(grid.arrayHost2_.size()) {
+#ifdef SPFFT_MPI
+  if (!grid.isLocal_) comm_ = MPICommunicatorHandle(grid.comm_.get());
+  exchangeType_ = grid.exchangeType_;
+#endif
+#if defined(SPFFT_CUDA) || defined(SPFFT_ROCM)
+  if (grid.executionUnit_ & SPFFT_PU_GPU) {
+    // named guard; an unnamed temporary would be destroyed immediately
+    GPUDeviceGuard deviceGuard(grid.device_id());
+
+    if (grid.arrayGPU1_.size() > 0)
+      arrayGPU1_ = GPUArray<typename gpu::fft::ComplexType<T>::type>(grid.arrayGPU1_.size());
+    if (grid.arrayGPU2_.size() > 0)
+      arrayGPU2_ = GPUArray<typename gpu::fft::ComplexType<T>::type>(grid.arrayGPU2_.size());
+    if (grid.fftWorkBuffer_) fftWorkBuffer_.reset(new GPUArray<char>(grid.fftWorkBuffer_->size()));
+  }
+#endif
+}
+
+// instantiate templates for float and double
+template class GridInternal<double>;
+#ifdef SPFFT_SINGLE_PRECISION
+template class GridInternal<float>;
+#endif
+
+} // namespace spfft
diff --git a/src/spfft/grid_internal.hpp b/src/spfft/grid_internal.hpp
new file mode 100644
index 0000000..2d3cdd6
--- /dev/null
+++ b/src/spfft/grid_internal.hpp
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef SPFFT_GRID_INTERNAL_HPP
+#define SPFFT_GRID_INTERNAL_HPP
+#include "spfft/config.h"
+
+#include <cassert>
+#include <complex>
+#include <memory>
+#include "memory/host_array.hpp"
+#include "spfft/types.h"
+#include "util/common_types.hpp"
+#include "util/omp_definitions.hpp"
+#include "util/type_check.hpp"
+
+#ifdef SPFFT_MPI
+#include <mpi.h>
+#include "mpi_util/mpi_communicator_handle.hpp"
+#endif
+
+#if defined(SPFFT_CUDA) || defined(SPFFT_ROCM)
+#include "gpu_util/gpu_fft_api.hpp"
+#include "memory/gpu_array.hpp"
+#endif
+
+namespace spfft {
+
+template <typename T>
+class GridInternal {
+public:
+  static_assert(IsFloatOrDouble<T>::value, "Type T must be float or double");
+  using ValueType = T;
+  using ComplexType = std::complex<T>;
+
+  GridInternal(int maxDimX, int maxDimY, int maxDimZ, int maxNumLocalZSticks,
+               SpfftProcessingUnitType executionUnit, int numThreads);
+
+#ifdef SPFFT_MPI
+  GridInternal(int maxDimX, int maxDimY, int maxDimZ, int maxNumLocalZSticks,
+               int maxNumLocalXYPlanes, SpfftProcessingUnitType executionUnit, int numThreads,
+               MPI_Comm comm, SpfftExchangeType exchangeType);
+#endif
+
+  GridInternal(const GridInternal& grid);
+
+  GridInternal(GridInternal&&) = default;
+
+  inline GridInternal& operator=(const GridInternal& grid) {
+    *this = GridInternal(grid);
+    return *this;
+  }
+
+  inline GridInternal& operator=(GridInternal&&) = default;
+
+  inline auto max_dim_x() const noexcept -> int { return maxDimX_; }
+
+  inline auto max_dim_y() const noexcept -> int { return maxDimY_; }
+
+  inline auto max_dim_z() const noexcept -> int { return maxDimZ_; }
+
+  inline auto max_num_local_z_columns() const noexcept -> int { return maxNumLocalZSticks_; }
+
+  inline auto max_num_local_xy_planes() const noexcept -> int { return maxNumLocalXYPlanes_; }
+
+  inline auto device_id() const noexcept -> int { return deviceId_; }
+
+  inline auto num_threads() const noexcept -> int { return numThreads_; }
+
+  inline auto array_host_1() -> HostArray<std::complex<T>>& { return arrayHost1_; }
+
+  inline auto array_host_2() -> HostArray<std::complex<T>>& { return arrayHost2_; }
+
+  inline auto processing_unit() -> SpfftProcessingUnitType { return executionUnit_; }
+
+  inline auto local() -> bool { return isLocal_; }
+
+#if defined(SPFFT_CUDA) || defined(SPFFT_ROCM)
+  inline auto array_gpu_1() -> GPUArray<typename gpu::fft::ComplexType<T>::type>& {
+    return arrayGPU1_;
+  }
+
+  inline auto array_gpu_2() -> GPUArray<typename gpu::fft::ComplexType<T>::type>& {
+    return arrayGPU2_;
+  }
+
+  inline auto fft_work_buffer() -> const std::shared_ptr<GPUArray<char>>& {
+    assert(fftWorkBuffer_);
+    return fftWorkBuffer_;
+  }
+#endif
+
+#ifdef SPFFT_MPI
+  inline auto communicator() const -> const MPICommunicatorHandle& { return comm_; }
+
+  inline auto exchange_type() const -> SpfftExchangeType { return exchangeType_; }
+#endif
+
+private:
+  bool isLocal_;
+  SpfftProcessingUnitType executionUnit_;
+  int deviceId_, numThreads_;
+  int maxDimX_, maxDimY_, maxDimZ_, maxNumLocalZSticks_, maxNumLocalXYPlanes_;
+
+  HostArray<std::complex<T>> arrayHost1_;
+  HostArray<std::complex<T>> arrayHost2_;
+
+#if defined(SPFFT_CUDA) || defined(SPFFT_ROCM)
+  GPUArray<typename gpu::fft::ComplexType<T>::type> arrayGPU1_;
+  GPUArray<typename gpu::fft::ComplexType<T>::type> arrayGPU2_;
+  std::shared_ptr<GPUArray<char>> fftWorkBuffer_;
+#endif
+
+#ifdef SPFFT_MPI
+  MPICommunicatorHandle comm_;
+  SpfftExchangeType exchangeType_;
+#endif
+};
+
+} // namespace spfft
+#endif
diff --git a/src/spfft/multi_transform.cpp b/src/spfft/multi_transform.cpp
new file mode 100644
index 0000000..860b246
--- /dev/null
+++ b/src/spfft/multi_transform.cpp
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#include "spfft/multi_transform.h"
+#include <vector>
+#include "spfft/config.h"
+#include "spfft/multi_transform.hpp"
+#include "spfft/multi_transform_internal.hpp"
+#include "spfft/types.h"
+
+namespace spfft {
+
+void multi_transform_forward(int numTransforms, Transform* transforms,
+                             SpfftProcessingUnitType* inputLocations, double** outputPointers,
+                             SpfftScalingType* scalingTypes) {
+  MultiTransformInternal<Transform>::forward(numTransforms, transforms, inputLocations,
+                                             outputPointers, scalingTypes);
+}
+
+void multi_transform_backward(int numTransforms, Transform* transforms, double** inputPointers,
+                              SpfftProcessingUnitType* outputLocations) {
+  MultiTransformInternal<Transform>::backward(numTransforms, transforms, inputPointers,
+                                              outputLocations);
+}
+} // namespace spfft
+
+extern "C" {
+
+SpfftError spfft_multi_transform_forward(int numTransforms, SpfftTransform* transforms,
+                                         SpfftProcessingUnitType* inputLocations,
+                                         double** outputPointers, SpfftScalingType* scalingTypes) {
+  try {
+    multi_transform_forward(numTransforms, reinterpret_cast<spfft::Transform*>(transforms),
+                            inputLocations, outputPointers, scalingTypes);
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+SpfftError spfft_multi_transform_backward(int numTransforms, SpfftTransform* transforms,
+                                          double** inputPointers,
+                                          SpfftProcessingUnitType* outputLocations) {
+  try {
+    multi_transform_backward(numTransforms, reinterpret_cast<spfft::Transform*>(transforms),
+                             inputPointers, outputLocations);
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+}
diff --git a/src/spfft/multi_transform_float.cpp b/src/spfft/multi_transform_float.cpp
new file mode 100644
index 0000000..5854587
--- /dev/null
+++ b/src/spfft/multi_transform_float.cpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#include "spfft/config.h"
+#include "spfft/types.h"
+#include "spfft/multi_transform_float.h"
+#include "spfft/multi_transform_float.hpp"
+#include "spfft/multi_transform_internal.hpp"
+
+#ifdef SPFFT_SINGLE_PRECISION
+
+namespace spfft {
+
+void multi_transform_forward(int numTransforms, TransformFloat* transforms,
+                             SpfftProcessingUnitType* inputLocations, float** outputPointers,
+                             SpfftScalingType* scalingTypes) {
+  MultiTransformInternal<TransformFloat>::forward(numTransforms, transforms, inputLocations,
+                                                  outputPointers, scalingTypes);
+}
+
+void multi_transform_backward(int numTransforms, TransformFloat* transforms, float** inputPointers,
+                              SpfftProcessingUnitType* outputLocations) {
+  MultiTransformInternal<TransformFloat>::backward(numTransforms, transforms, inputPointers,
+                                                   outputLocations);
+}
+
+} // namespace spfft
+
+
+extern "C" {
+
+SpfftError spfft_float_multi_transform_forward(int numTransforms, SpfftFloatTransform* transforms,
+                                               SpfftProcessingUnitType* inputLocations,
+                                               float** outputPointers,
+                                               SpfftScalingType* scalingTypes) {
+  try {
+    multi_transform_forward(numTransforms, reinterpret_cast<spfft::TransformFloat*>(transforms),
+                            inputLocations, outputPointers, scalingTypes);
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+SpfftError spfft_float_multi_transform_backward(int numTransforms, SpfftFloatTransform* transforms,
+                                                float** inputPointers,
+                                                SpfftProcessingUnitType* outputLocations) {
+  try {
+    multi_transform_backward(numTransforms, reinterpret_cast<spfft::TransformFloat*>(transforms),
+                             inputPointers, outputLocations);
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+}
+#endif
diff --git a/src/spfft/multi_transform_internal.hpp b/src/spfft/multi_transform_internal.hpp
new file mode 100644
index 0000000..ec1e849
--- /dev/null
+++ b/src/spfft/multi_transform_internal.hpp
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef SPFFT_MULTI_TRANSFORM_INTERNAL_HPP
+#define SPFFT_MULTI_TRANSFORM_INTERNAL_HPP
+
+#include "spfft/exceptions.hpp"
+#include "spfft/transform.hpp"
+#include "spfft/transform_internal.hpp"
+#include "timing/timing.hpp"
+
+#ifdef SPFFT_SINGLE_PRECISION
+#include "spfft/transform_float.hpp"
+#endif
+
+namespace spfft {
+
+template <typename TransformType>
+class MultiTransformInternal {
+public:
+  using ValueType = typename TransformType::ValueType;
+
+  inline static auto forward(const int numTransforms, TransformType* transforms,
+                             SpfftProcessingUnitType* inputLocations, ValueType** outputPointers,
+                             SpfftScalingType* scalingTypes) -> void {
+    HOST_TIMING_SCOPED("forward")
+
+    // transforms must not share grids
+    for (int t1 = 0; t1 < numTransforms; ++t1) {
+      for (int t2 = t1 + 1; t2 < numTransforms; ++t2) {
+        if (transforms[t1].transform_->shared_grid(*(transforms[t2].transform_))) {
+          throw InvalidParameterError();
+        }
+      }
+    }
+
+    // launch all GPU transforms first
+    for (int t = 0; t < numTransforms; ++t) {
+      if (transforms[t].transform_->processing_unit() == SPFFT_PU_GPU) {
+        transforms[t].transform_->forward_xy(inputLocations[t]);
+      }
+    }
+
+    // launch all CPU transforms including MPI exchange
+    for (int t = 0; t < numTransforms; ++t) {
+      if (transforms[t].transform_->processing_unit() != SPFFT_PU_GPU) {
+        transforms[t].transform_->forward_xy(inputLocations[t]);
+        transforms[t].transform_->forward_exchange();
+      }
+    }
+
+    // launch all GPU MPI exchanges and transforms
+    for (int t = 0; t < numTransforms; ++t) {
+      if (transforms[t].transform_->processing_unit() == SPFFT_PU_GPU) {
+        transforms[t].transform_->forward_exchange();
+        transforms[t].transform_->forward_z(outputPointers[t], scalingTypes[t]);
+      }
+    }
+
+    // launch all remaining CPU transforms
+    for (int t = 0; t < numTransforms; ++t) {
+      if (transforms[t].transform_->processing_unit() != SPFFT_PU_GPU) {
+        transforms[t].transform_->forward_z(outputPointers[t], scalingTypes[t]);
+      }
+    }
+
+    // synchronize all transforms
+    for (int t = 0; t < numTransforms; ++t) {
+      transforms[t].transform_->synchronize();
+    }
+  }
+
+  inline static auto backward(const int numTransforms, TransformType* transforms,
+                              ValueType** inputPointers, SpfftProcessingUnitType* outputLocations)
+      -> void {
+    HOST_TIMING_SCOPED("backward")
+
+    // transforms must not share grids
+    for (int t1 = 0; t1 < numTransforms; ++t1) {
+      for (int t2 = t1 + 1; t2 < numTransforms; ++t2) {
+        if (transforms[t1].transform_->shared_grid(*(transforms[t2].transform_))) {
+          throw InvalidParameterError();
+        }
+      }
+    }
+
+    // launch all GPU transforms first
+    for (int t = 0; t < numTransforms; ++t) {
+      if (transforms[t].transform_->processing_unit() == SPFFT_PU_GPU) {
+        transforms[t].transform_->backward_z(inputPointers[t]);
+      }
+    }
+
+    // launch all CPU transforms including MPI exchange
+    for (int t = 0; t < numTransforms; ++t) {
+      if (transforms[t].transform_->processing_unit() != SPFFT_PU_GPU) {
+        transforms[t].transform_->backward_z(inputPointers[t]);
+        transforms[t].transform_->backward_exchange();
+      }
+    }
+
+    // launch all GPU MPI exchanges and transforms
+    for (int t = 0; t < numTransforms; ++t) {
+      if (transforms[t].transform_->processing_unit() == SPFFT_PU_GPU) {
+        transforms[t].transform_->backward_exchange();
+        transforms[t].transform_->backward_xy(outputLocations[t]);
+      }
+    }
+
+    // launch all remaining CPU transforms
+    for (int t = 0; t < numTransforms; ++t) {
+      if (transforms[t].transform_->processing_unit() != SPFFT_PU_GPU) {
+        transforms[t].transform_->backward_xy(outputLocations[t]);
+      }
+    }
+
+    // synchronize all transforms
+    for (int t = 0; t < numTransforms; ++t) {
+      transforms[t].transform_->synchronize();
+    }
+  }
+};
+} // namespace spfft
+
+#endif
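[Editor's note] The launch order above is deliberate: GPU stages are issued first so they run asynchronously while the CPU transforms and their exchanges execute, and a final pass synchronizes everything. A minimal usage sketch of the public entry point follows; it assumes two double-precision transforms created on *separate* grids (shared grids throw InvalidParameterError), and all variable names here are illustrative, not part of the library:

  // Sketch only: transformA/transformB and the output buffers are created elsewhere.
  spfft::Transform transforms[2] = {transformA, transformB};
  SpfftProcessingUnitType inputLocations[2] = {SPFFT_PU_HOST, SPFFT_PU_HOST};
  double* outputPointers[2] = {freqDataA, freqDataB};  // frequency-domain outputs
  SpfftScalingType scalingTypes[2] = {SPFFT_NO_SCALING, SPFFT_NO_SCALING};
  spfft::multi_transform_forward(2, transforms, inputLocations, outputPointers, scalingTypes);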
diff --git a/src/spfft/transform.cpp b/src/spfft/transform.cpp
new file mode 100644
index 0000000..af6142b
--- /dev/null
+++ b/src/spfft/transform.cpp
@@ -0,0 +1,360 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spfft/transform.hpp"
+#include "spfft/grid.hpp"
+#include "spfft/transform.h"
+#include "spfft/transform_internal.hpp"
+#include "parameters/parameters.hpp"
+
+namespace spfft {
+
+//---------------------
+// Double precision
+//---------------------
+Transform::Transform(const std::shared_ptr<GridInternal<double>>& grid,
+                     SpfftProcessingUnitType processingUnit, SpfftTransformType transformType,
+                     int dimX, int dimY, int dimZ, int localZLength, int numLocalElements,
+                     SpfftIndexFormatType indexFormat, const int* indices)
+
+{
+  if (dimX < 0 || dimY < 0 || dimZ < 0 || localZLength < 0 || numLocalElements < 0 ||
+      (!indices && numLocalElements > 0)) {
+    throw InvalidParameterError();
+  }
+  std::shared_ptr<Parameters> param;
+  if (!grid->local()) {
+#ifdef SPFFT_MPI
+    param.reset(new Parameters(grid->communicator(), transformType, dimX,
+                               dimY, dimZ, localZLength, numLocalElements,
+                               indexFormat, indices));
+#else
+    throw MPISupportError();
+#endif
+  } else {
+    param.reset(new Parameters(
+        transformType, dimX, dimY, dimZ, numLocalElements, indexFormat, indices));
+  }
+
+  transform_.reset(new TransformInternal<double>(processingUnit, grid, std::move(param)));
+}
+
+Transform::Transform(std::shared_ptr<TransformInternal<double>> transform)
+    : transform_(std::move(transform)) {}
+
+Transform Transform::clone() const {
+  return Transform(std::shared_ptr<TransformInternal<double>>(
+      new TransformInternal<double>(transform_->clone())));
+}
+
+double* Transform::space_domain_data(SpfftProcessingUnitType dataLocation) {
+  return transform_->space_domain_data(dataLocation);
+}
+
+void Transform::forward(SpfftProcessingUnitType inputLocation, double* output,
+                        SpfftScalingType scaling) {
+  transform_->forward(inputLocation, output, scaling);
+}
+
+void Transform::backward(const double* input, SpfftProcessingUnitType outputLocation) {
+  transform_->backward(input, outputLocation);
+}
+
+SpfftTransformType Transform::type() const { return transform_->type(); }
+
+int Transform::dim_x() const { return transform_->dim_x(); }
+
+int Transform::dim_y() const { return transform_->dim_y(); }
+
+int Transform::dim_z() const { return transform_->dim_z(); }
+
+int Transform::local_z_length() const { return transform_->num_local_xy_planes(); }
+
+int Transform::local_z_offset() const { return transform_->local_xy_plane_offset(); }
+
+int Transform::local_slice_size() const {
+  return dim_x() * dim_y() * local_z_length();
+}
+
+int Transform::num_local_elements() const { return transform_->num_local_elements(); }
+
+SpfftProcessingUnitType Transform::processing_unit() const { return transform_->processing_unit(); }
+
+int Transform::device_id() const { return transform_->device_id(); }
+
+int Transform::num_threads() const { return transform_->num_threads(); }
+
+#ifdef SPFFT_MPI
+MPI_Comm Transform::communicator() const { return transform_->communicator(); }
+#endif
+
+} // namespace spfft
+
+//---------------------
+// C API
+//---------------------
+
+extern "C" {
+SpfftError spfft_transform_create(SpfftTransform* transform, SpfftGrid grid,
+                                  SpfftProcessingUnitType processingUnit,
+                                  SpfftTransformType transformType, int dimX, int dimY, int dimZ,
+                                  int localZLength, int numLocalElements,
+                                  SpfftIndexFormatType indexFormat, const int* indices) {
+  try {
+    *transform = new spfft::Transform(reinterpret_cast<spfft::Grid*>(grid)->create_transform(
+        processingUnit, transformType, dimX, dimY, dimZ, localZLength, numLocalElements,
+        indexFormat, indices));
+
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+SpfftError spfft_transform_destroy(SpfftTransform transform) {
+  if (!transform) {
+    return SpfftError::SPFFT_INVALID_HANDLE_ERROR;
+  }
+  try {
+    delete reinterpret_cast<spfft::Transform*>(transform);
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  transform = nullptr;
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+SpfftError spfft_transform_clone(SpfftTransform transform, SpfftTransform* newTransform) {
+  if (!transform) {
+    return SpfftError::SPFFT_INVALID_HANDLE_ERROR;
+  }
+  try {
+    *newTransform = new spfft::Transform(reinterpret_cast<spfft::Transform*>(transform)->clone());
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+
+SpfftError spfft_transform_forward(SpfftTransform transform, SpfftProcessingUnitType inputLocation,
+                                   double* output, SpfftScalingType scaling) {
+  if (!transform) {
+    return SpfftError::SPFFT_INVALID_HANDLE_ERROR;
+  }
+  try {
+    reinterpret_cast<spfft::Transform*>(transform)->forward(inputLocation, output, scaling);
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+SpfftError spfft_transform_backward(SpfftTransform transform, const double* input,
+                                    SpfftProcessingUnitType outputLocation) {
+  if (!transform) {
+    return SpfftError::SPFFT_INVALID_HANDLE_ERROR;
+  }
+  try {
+    reinterpret_cast<spfft::Transform*>(transform)->backward(input, outputLocation);
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+SpfftError spfft_transform_get_space_domain(SpfftTransform transform,
+                                            SpfftProcessingUnitType dataLocation, double** data) {
+  if (!transform) {
+    return SpfftError::SPFFT_INVALID_HANDLE_ERROR;
+  }
+  try {
+    *data = reinterpret_cast<spfft::Transform*>(transform)->space_domain_data(dataLocation);
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+SpfftError spfft_transform_dim_x(SpfftTransform transform, int* dimX) {
+  if (!transform) {
+    return SpfftError::SPFFT_INVALID_HANDLE_ERROR;
+  }
+  try {
+    *dimX = reinterpret_cast<spfft::Transform*>(transform)->dim_x();
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+SpfftError spfft_transform_dim_y(SpfftTransform transform, int* dimY) {
+  if (!transform) {
+    return SpfftError::SPFFT_INVALID_HANDLE_ERROR;
+  }
+  try {
+    *dimY = reinterpret_cast<spfft::Transform*>(transform)->dim_y();
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+SpfftError spfft_transform_dim_z(SpfftTransform transform, int* dimZ) {
+  if (!transform) {
+    return SpfftError::SPFFT_INVALID_HANDLE_ERROR;
+  }
+  try {
+    *dimZ = reinterpret_cast<spfft::Transform*>(transform)->dim_z();
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+SpfftError spfft_transform_local_z_length(SpfftTransform transform, int* localZLength) {
+  if (!transform) {
+    return SpfftError::SPFFT_INVALID_HANDLE_ERROR;
+  }
+  try {
+    *localZLength = reinterpret_cast<spfft::Transform*>(transform)->local_z_length();
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+SpfftError spfft_transform_local_z_offset(SpfftTransform transform, int* offset) {
+  if (!transform) {
+    return SpfftError::SPFFT_INVALID_HANDLE_ERROR;
+  }
+  try {
+    *offset = reinterpret_cast<spfft::Transform*>(transform)->local_z_offset();
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+SpfftError spfft_transform_num_local_elements(SpfftTransform transform, int* numLocalElements) {
+  if (!transform) {
+    return SpfftError::SPFFT_INVALID_HANDLE_ERROR;
+  }
+  try {
+    *numLocalElements = reinterpret_cast<spfft::Transform*>(transform)->num_local_elements();
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+SpfftError spfft_transform_device_id(SpfftTransform transform, int* deviceId) {
+  if (!transform) {
+    return SpfftError::SPFFT_INVALID_HANDLE_ERROR;
+  }
+  try {
+    *deviceId = reinterpret_cast<spfft::Transform*>(transform)->device_id();
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+SpfftError spfft_transform_num_threads(SpfftTransform transform, int* numThreads) {
+  if (!transform) {
+    return SpfftError::SPFFT_INVALID_HANDLE_ERROR;
+  }
+  try {
+    *numThreads = reinterpret_cast<spfft::Transform*>(transform)->num_threads();
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+#ifdef SPFFT_MPI
+SpfftError spfft_transform_communicator(SpfftTransform transform, MPI_Comm* comm) {
+  if (!transform) {
+    return SpfftError::SPFFT_INVALID_HANDLE_ERROR;
+  }
+  try {
+    *comm = reinterpret_cast<spfft::Transform*>(transform)->communicator();
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+SpfftError spfft_transform_communicator_fortran(SpfftTransform transform, int* commFortran) {
+  if (!transform) {
+    return SpfftError::SPFFT_INVALID_HANDLE_ERROR;
+  }
+  try {
+    *commFortran = MPI_Comm_c2f(reinterpret_cast<spfft::Transform*>(transform)->communicator());
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+#endif
+
+} // extern C
+
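[Editor's note] Every C entry point above follows the same shape: reject a null handle, forward to the C++ object through a reinterpret_cast, and translate exceptions into SpfftError codes so that no exception ever crosses the C boundary. A caller is therefore expected to check each return value; a hypothetical call site (sketch only; the creation of the `transform` handle and the `freqData` buffer is omitted):

  // Sketch: error-checked use of the C API; names are illustrative.
  double* spaceDomain = nullptr;
  SpfftError err = spfft_transform_get_space_domain(transform, SPFFT_PU_HOST, &spaceDomain);
  if (err != SPFFT_SUCCESS) return err;
  // ... fill spaceDomain with real-space data ...
  err = spfft_transform_forward(transform, SPFFT_PU_HOST, freqData, SPFFT_NO_SCALING);
  if (err != SPFFT_SUCCESS) return err;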
diff --git a/src/spfft/transform_float.cpp b/src/spfft/transform_float.cpp
new file mode 100644
index 0000000..b306a14
--- /dev/null
+++ b/src/spfft/transform_float.cpp
@@ -0,0 +1,352 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spfft/transform_float.hpp"
+#include "spfft/transform_float.h"
+#include "spfft/grid_float.hpp"
+#include "spfft/transform_internal.hpp"
+
+#ifdef SPFFT_SINGLE_PRECISION
+namespace spfft {
+TransformFloat::TransformFloat(const std::shared_ptr<GridInternal<float>>& grid,
+                               SpfftProcessingUnitType processingUnit,
+                               SpfftTransformType transformType, int dimX, int dimY, int dimZ,
+                               int localZLength, int numLocalElements,
+                               SpfftIndexFormatType indexFormat, const int* indices) {
+  std::shared_ptr<Parameters> param;
+  if (!grid->local()) {
+#ifdef SPFFT_MPI
+    param.reset(new Parameters(grid->communicator(), transformType, dimX, dimY, dimZ, localZLength,
+                               numLocalElements, indexFormat, indices));
+#else
+    throw MPISupportError();
+#endif
+  } else {
+    param.reset(new Parameters(transformType, dimX, dimY, dimZ, numLocalElements, indexFormat,
+                               indices));
+  }
+
+  transform_.reset(new TransformInternal<float>(processingUnit, grid, std::move(param)));
+}
+
+TransformFloat::TransformFloat(std::shared_ptr<TransformInternal<float>> transform)
+    : transform_(std::move(transform)) {}
+
+TransformFloat TransformFloat::clone() const {
+  return TransformFloat(std::shared_ptr<TransformInternal<float>>(
+      new TransformInternal<float>(transform_->clone())));
+}
+
+float* TransformFloat::space_domain_data(SpfftProcessingUnitType dataLocation) {
+  return transform_->space_domain_data(dataLocation);
+}
+
+void TransformFloat::forward(SpfftProcessingUnitType inputLocation, float* output,
+                             SpfftScalingType scaling) {
+  transform_->forward(inputLocation, output, scaling);
+}
+
+void TransformFloat::backward(const float* input, SpfftProcessingUnitType outputLocation) {
+  transform_->backward(input, outputLocation);
+}
+
+SpfftTransformType TransformFloat::type() const { return transform_->type(); }
+
+int TransformFloat::dim_x() const { return transform_->dim_x(); }
+
+int TransformFloat::dim_y() const { return transform_->dim_y(); }
+
+int TransformFloat::dim_z() const { return transform_->dim_z(); }
+
+int TransformFloat::local_z_length() const { return transform_->num_local_xy_planes(); }
+
+int TransformFloat::local_z_offset() const { return transform_->local_xy_plane_offset(); }
+
+int TransformFloat::local_slice_size() const {
+  return dim_x() * dim_y() * local_z_length();
+}
+
+int TransformFloat::num_local_elements() const { return transform_->num_local_elements(); }
+
+SpfftProcessingUnitType TransformFloat::processing_unit() const {
+  return transform_->processing_unit();
+}
+
+int TransformFloat::device_id() const { return transform_->device_id(); }
+
+int TransformFloat::num_threads() const { return transform_->num_threads(); }
+
+#ifdef SPFFT_MPI
+MPI_Comm TransformFloat::communicator() const { return transform_->communicator(); }
+#endif
+
+
+} // namespace spfft
+
+//---------------------
+// C API
+//---------------------
+
+extern "C" {
+SpfftError spfft_float_transform_create(SpfftFloatTransform* transform, SpfftFloatGrid grid,
+                                        SpfftProcessingUnitType processingUnit,
+                                        SpfftTransformType transformType, int dimX, int dimY,
+                                        int dimZ, int localZLength, int numLocalElements,
+                                        SpfftIndexFormatType indexFormat, const int* indices) {
+  try {
+    *transform =
+        new spfft::TransformFloat(reinterpret_cast<spfft::GridFloat*>(grid)->create_transform(
+            processingUnit, transformType, dimX, dimY, dimZ, localZLength, numLocalElements,
+            indexFormat, indices));
+
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+SpfftError spfft_float_transform_destroy(SpfftFloatTransform transform) {
+  if (!transform) {
+    return SpfftError::SPFFT_INVALID_HANDLE_ERROR;
+  }
+  try {
+    delete reinterpret_cast<spfft::TransformFloat*>(transform);
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  transform = nullptr;
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+SpfftError spfft_float_transform_clone(SpfftFloatTransform transform,
+                                       SpfftFloatTransform* newTransform) {
+  if (!transform) {
+    return SpfftError::SPFFT_INVALID_HANDLE_ERROR;
+  }
+  try {
+    *newTransform =
+        new spfft::TransformFloat(reinterpret_cast<spfft::TransformFloat*>(transform)->clone());
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+SpfftError spfft_float_transform_forward(SpfftFloatTransform transform,
+                                         SpfftProcessingUnitType inputLocation, float* output,
+                                         SpfftScalingType scaling) {
+  if (!transform) {
+    return SpfftError::SPFFT_INVALID_HANDLE_ERROR;
+  }
+  try {
+    reinterpret_cast<spfft::TransformFloat*>(transform)->forward(inputLocation, output, scaling);
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+SpfftError spfft_float_transform_backward(SpfftFloatTransform transform, const float* input,
+                                          SpfftProcessingUnitType outputLocation) {
+  if (!transform) {
+    return SpfftError::SPFFT_INVALID_HANDLE_ERROR;
+  }
+  try {
+    reinterpret_cast<spfft::TransformFloat*>(transform)->backward(input, outputLocation);
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+SpfftError spfft_float_transform_get_space_domain(SpfftFloatTransform transform,
+                                                  SpfftProcessingUnitType dataLocation,
+                                                  float** data) {
+  if (!transform) {
+    return SpfftError::SPFFT_INVALID_HANDLE_ERROR;
+  }
+  try {
+    *data = reinterpret_cast<spfft::TransformFloat*>(transform)->space_domain_data(dataLocation);
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+SpfftError spfft_float_transform_dim_x(SpfftFloatTransform transform, int* dimX) {
+  if (!transform) {
+    return SpfftError::SPFFT_INVALID_HANDLE_ERROR;
+  }
+  try {
+    *dimX = reinterpret_cast<spfft::TransformFloat*>(transform)->dim_x();
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+SpfftError spfft_float_transform_dim_y(SpfftFloatTransform transform, int* dimY) {
+  if (!transform) {
+    return SpfftError::SPFFT_INVALID_HANDLE_ERROR;
+  }
+  try {
+    *dimY = reinterpret_cast<spfft::TransformFloat*>(transform)->dim_y();
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+SpfftError spfft_float_transform_dim_z(SpfftFloatTransform transform, int* dimZ) {
+  if (!transform) {
+    return SpfftError::SPFFT_INVALID_HANDLE_ERROR;
+  }
+  try {
+    *dimZ = reinterpret_cast<spfft::TransformFloat*>(transform)->dim_z();
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+SpfftError spfft_float_transform_local_z_length(SpfftFloatTransform transform, int* localZLength) {
+  if (!transform) {
+    return SpfftError::SPFFT_INVALID_HANDLE_ERROR;
+  }
+  try {
+    *localZLength = reinterpret_cast<spfft::TransformFloat*>(transform)->local_z_length();
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+SpfftError spfft_float_transform_local_z_offset(SpfftFloatTransform transform, int* offset) {
+  if (!transform) {
+    return SpfftError::SPFFT_INVALID_HANDLE_ERROR;
+  }
+  try {
+    *offset = reinterpret_cast<spfft::TransformFloat*>(transform)->local_z_offset();
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+SpfftError spfft_float_transform_num_local_elements(SpfftFloatTransform transform,
+                                                    int* numLocalElements) {
+  if (!transform) {
+    return SpfftError::SPFFT_INVALID_HANDLE_ERROR;
+  }
+  try {
+    *numLocalElements = reinterpret_cast<spfft::TransformFloat*>(transform)->num_local_elements();
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+SpfftError spfft_float_transform_device_id(SpfftFloatTransform transform, int* deviceId) {
+  if (!transform) {
+    return SpfftError::SPFFT_INVALID_HANDLE_ERROR;
+  }
+  try {
+    *deviceId = reinterpret_cast<spfft::TransformFloat*>(transform)->device_id();
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+SpfftError spfft_float_transform_num_threads(SpfftFloatTransform transform, int* numThreads) {
+  if (!transform) {
+    return SpfftError::SPFFT_INVALID_HANDLE_ERROR;
+  }
+  try {
+    *numThreads = reinterpret_cast<spfft::TransformFloat*>(transform)->num_threads();
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+#ifdef SPFFT_MPI
+SpfftError spfft_float_transform_communicator(SpfftFloatTransform transform, MPI_Comm* comm) {
+  if (!transform) {
+    return SpfftError::SPFFT_INVALID_HANDLE_ERROR;
+  }
+  try {
+    *comm = reinterpret_cast<spfft::TransformFloat*>(transform)->communicator();
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+
+SpfftError spfft_float_transform_communicator_fortran(SpfftFloatTransform transform,
+                                                      int* commFortran) {
+  if (!transform) {
+    return SpfftError::SPFFT_INVALID_HANDLE_ERROR;
+  }
+  try {
+    *commFortran =
+        MPI_Comm_c2f(reinterpret_cast<spfft::TransformFloat*>(transform)->communicator());
+  } catch (const spfft::GenericError& e) {
+    return e.error_code();
+  } catch (...) {
+    return SpfftError::SPFFT_UNKNOWN_ERROR;
+  }
+  return SpfftError::SPFFT_SUCCESS;
+}
+#endif
+
+} // extern C
+
+#endif
diff --git a/src/spfft/transform_internal.cpp b/src/spfft/transform_internal.cpp
new file mode 100644
index 0000000..0667053
--- /dev/null
+++ b/src/spfft/transform_internal.cpp
@@ -0,0 +1,347 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <cassert>
+#include "compression/indices.hpp"
+#include "execution/execution_host.hpp"
+#include "parameters/parameters.hpp"
+#include "spfft/config.h"
+#include "spfft/exceptions.hpp"
+
+#include "spfft/transform_internal.hpp"
+
+#if defined(SPFFT_CUDA) || defined(SPFFT_ROCM)
+#include "gpu_util/gpu_device_guard.hpp"
+#include "gpu_util/gpu_transfer.hpp"
+#endif
+
+namespace spfft {
+template <typename T>
+TransformInternal<T>::TransformInternal(SpfftProcessingUnitType executionUnit,
+                                        std::shared_ptr<GridInternal<T>> grid,
+                                        std::shared_ptr<Parameters> param)
+    : executionUnit_(executionUnit), grid_(std::move(grid)), param_(std::move(param)) {
+#if defined(SPFFT_CUDA) || defined(SPFFT_ROCM)
+  // set device for current thread
+  GPUDeviceGuard deviceGuard(grid_->device_id());
+#endif
+
+  // ----------------------
+  // Input Check
+  // ----------------------
+  if (!grid_) {
+    throw InvalidParameterError();
+  }
+  if (param_->local_num_xy_planes() > static_cast<SizeType>(grid_->max_num_local_xy_planes())) {
+    throw InvalidParameterError();
+  }
+  if (grid_->local() && param_->dim_z() != param_->local_num_xy_planes()) {
+    throw InvalidParameterError();
+  }
+  if (param_->local_num_z_sticks() > static_cast<SizeType>(grid_->max_num_local_z_columns())) {
+    throw InvalidParameterError();
+  }
+  if (param_->dim_x() > static_cast<SizeType>(grid_->max_dim_x()) ||
+      param_->dim_y() > static_cast<SizeType>(grid_->max_dim_y()) ||
+      param_->dim_z() > static_cast<SizeType>(grid_->max_dim_z())) {
+    throw InvalidParameterError();
+  }
+  if (!(executionUnit & grid_->processing_unit())) {
+    // must match memory initialization parameters for grid
+    throw InvalidParameterError();
+  }
+  if (executionUnit != SpfftProcessingUnitType::SPFFT_PU_HOST &&
+      executionUnit != SpfftProcessingUnitType::SPFFT_PU_GPU) {
+    // must be exclusively CPU or GPU
+    throw InvalidParameterError();
+  }
+#ifdef SPFFT_MPI
+  if (grid_->communicator().size() != param_->comm_size() ||
+      grid_->communicator().rank() != param_->comm_rank()) {
+    throw InternalError();
+  }
+#endif
+
+  // create execution
+  if (grid_->local()) {
+    // ----------------------
+    // Local
+    // ----------------------
+    if (executionUnit == SpfftProcessingUnitType::SPFFT_PU_HOST) {
+      execHost_.reset(new ExecutionHost<T>(grid_->num_threads(), param_, grid_->array_host_1(),
+                                           grid_->array_host_2()));
+    } else {
+      // GPU
+#if (defined(SPFFT_CUDA) || defined(SPFFT_ROCM))
+      execGPU_.reset(new ExecutionGPU<T>(
+          grid_->num_threads(), param_, grid_->array_host_1(), grid_->array_host_2(),
+          grid_->array_gpu_1(), grid_->array_gpu_2(), grid_->fft_work_buffer()));
+
+#else
+      throw GPUSupportError();
+#endif
+    }
+
+  } else {
+    // ----------------------
+    // Distributed
+    // ----------------------
+#ifdef SPFFT_MPI
+    if (executionUnit == SpfftProcessingUnitType::SPFFT_PU_HOST) {
+      // CPU
+      execHost_.reset(new ExecutionHost<T>(grid_->communicator(), grid_->exchange_type(),
+                                           grid_->num_threads(), param_, grid_->array_host_1(),
+                                           grid_->array_host_2()));
+    } else {
+      // GPU
+#if (defined(SPFFT_CUDA) || defined(SPFFT_ROCM))
+      execGPU_.reset(new ExecutionGPU<T>(grid_->communicator(), grid_->exchange_type(),
+                                         grid_->num_threads(), param_, grid_->array_host_1(),
+                                         grid_->array_host_2(), grid_->array_gpu_1(),
+                                         grid_->array_gpu_2(), grid_->fft_work_buffer()));
+
+#else // GPU
+      throw GPUSupportError();
+#endif // GPU
+    }
+#else // MPI
+    throw MPISupportError();
+#endif // MPI
+  }
+}
+
+template <typename T>
+auto TransformInternal<T>::forward(const SpfftProcessingUnitType inputLocation, T* output,
+                                   SpfftScalingType scaling) -> void {
+  HOST_TIMING_SCOPED("forward")
+  if (executionUnit_ == SpfftProcessingUnitType::SPFFT_PU_HOST) {
+    assert(execHost_);
+    if (inputLocation != SpfftProcessingUnitType::SPFFT_PU_HOST) {
+      throw InvalidParameterError();
+    }
+    execHost_->forward_xy();
+    execHost_->forward_exchange(false);
+    execHost_->forward_z(output, scaling);
+  } else {
+#if (defined(SPFFT_CUDA) || defined(SPFFT_ROCM))
+    assert(execGPU_);
+    // set device for current thread
+    GPUDeviceGuard deviceGuard(grid_->device_id());
+
+    execGPU_->forward_xy(inputLocation);
+    execGPU_->forward_exchange(false);
+    execGPU_->forward_z(output, scaling);
+    execGPU_->synchronize();
+#else
+    throw GPUSupportError();
+#endif
+  }
+}
+
+template <typename T>
+auto TransformInternal<T>::clone() const -> TransformInternal<T> {
+  std::shared_ptr<GridInternal<T>> newGrid(new GridInternal<T>(*grid_));
+  return TransformInternal<T>(executionUnit_, std::move(newGrid), param_);
+}
+
+template <typename T>
+auto TransformInternal<T>::forward_xy(const SpfftProcessingUnitType inputLocation) -> void {
+  if (executionUnit_ == SpfftProcessingUnitType::SPFFT_PU_HOST) {
+    assert(execHost_);
+    if (inputLocation != SpfftProcessingUnitType::SPFFT_PU_HOST) {
+      throw InvalidParameterError();
+    }
+    execHost_->forward_xy();
+  } else {
+#if (defined(SPFFT_CUDA) || defined(SPFFT_ROCM))
+    assert(execGPU_);
+    // set device for current thread
+    GPUDeviceGuard deviceGuard(grid_->device_id());
+
+    execGPU_->forward_xy(inputLocation);
+#else
+    throw GPUSupportError();
+#endif
+  }
+}
+
+template <typename T>
+auto TransformInternal<T>::forward_exchange() -> void {
+  if (executionUnit_ == SpfftProcessingUnitType::SPFFT_PU_HOST) {
+    assert(execHost_);
+    execHost_->forward_exchange(true);
+  } else {
+#if (defined(SPFFT_CUDA) || defined(SPFFT_ROCM))
+    assert(execGPU_);
+    // set device for current thread
+    GPUDeviceGuard deviceGuard(grid_->device_id());
+
+    execGPU_->forward_exchange(true);
+#else
+    throw GPUSupportError();
+#endif
+  }
+}
+
+template <typename T>
+auto TransformInternal<T>::forward_z(T* output, SpfftScalingType scaling) -> void {
+  if (executionUnit_ == SpfftProcessingUnitType::SPFFT_PU_HOST) {
+    assert(execHost_);
+    execHost_->forward_z(output, scaling);
+  } else {
+#if (defined(SPFFT_CUDA) || defined(SPFFT_ROCM))
+    assert(execGPU_);
+    // set device for current thread
+    GPUDeviceGuard deviceGuard(grid_->device_id());
+
+    execGPU_->forward_z(output, scaling);
+#else
+    throw GPUSupportError();
+#endif
+  }
+}
+
+template <typename T>
+auto TransformInternal<T>::backward(const T* input, const SpfftProcessingUnitType outputLocation)
+    -> void {
+  HOST_TIMING_SCOPED("backward")
+  // check whether the requested output location matches the execution unit
+  if (executionUnit_ == SpfftProcessingUnitType::SPFFT_PU_HOST) {
+    assert(execHost_);
+    if (outputLocation != SpfftProcessingUnitType::SPFFT_PU_HOST) {
+      throw InvalidParameterError();
+    }
+
+    execHost_->backward_z(input);
+    execHost_->backward_exchange(false);
+    execHost_->backward_xy();
+  } else {
+#if (defined(SPFFT_CUDA) || defined(SPFFT_ROCM))
+    // set device for current thread
+    GPUDeviceGuard deviceGuard(grid_->device_id());
+    execGPU_->backward_z(input);
+    execGPU_->backward_exchange(false);
+    execGPU_->backward_xy(outputLocation);
+    execGPU_->synchronize();
+#else
+    throw GPUSupportError();
+#endif
+  }
+}
+
+template <typename T>
+auto TransformInternal<T>::backward_z(const T* input) -> void {
+  if (executionUnit_ == SpfftProcessingUnitType::SPFFT_PU_HOST) {
+    assert(execHost_);
+
+    execHost_->backward_z(input);
+  } else {
+#if (defined(SPFFT_CUDA) || defined(SPFFT_ROCM))
+    // set device for current thread
+    GPUDeviceGuard deviceGuard(grid_->device_id());
+    execGPU_->backward_z(input);
+#else
+    throw GPUSupportError();
+#endif
+  }
+}
+
+template <typename T>
+auto TransformInternal<T>::backward_exchange() -> void {
+  if (executionUnit_ == SpfftProcessingUnitType::SPFFT_PU_HOST) {
+    assert(execHost_);
+
+    execHost_->backward_exchange(true);
+  } else {
+#if (defined(SPFFT_CUDA) || defined(SPFFT_ROCM))
+    // set device for current thread
+    GPUDeviceGuard deviceGuard(grid_->device_id());
+    execGPU_->backward_exchange(true);
+#else
+    throw GPUSupportError();
+#endif
+  }
+}
+
+template <typename T>
+auto TransformInternal<T>::backward_xy(const SpfftProcessingUnitType outputLocation) -> void {
+  if (executionUnit_ == SpfftProcessingUnitType::SPFFT_PU_HOST) {
+    assert(execHost_);
+    if (outputLocation != SpfftProcessingUnitType::SPFFT_PU_HOST) {
+      throw InvalidParameterError();
+    }
+
+    execHost_->backward_xy();
+  } else {
+#if (defined(SPFFT_CUDA) || defined(SPFFT_ROCM))
+    // set device for current thread
+    GPUDeviceGuard deviceGuard(grid_->device_id());
+    execGPU_->backward_xy(outputLocation);
+#else
+    throw GPUSupportError();
+#endif
+  }
+}
+
+template <typename T>
+auto TransformInternal<T>::space_domain_data(SpfftProcessingUnitType location) -> T* {
+#if (defined(SPFFT_CUDA) || defined(SPFFT_ROCM))
+  if (executionUnit_ == SpfftProcessingUnitType::SPFFT_PU_GPU) {
+    // GPU
+    if (location == SpfftProcessingUnitType::SPFFT_PU_GPU) {
+      return execGPU_->space_domain_data_gpu().data();
+    } else {
+      return execGPU_->space_domain_data_host().data();
+    }
+  }
+#endif
+
+  // CPU
+  if (location != SpfftProcessingUnitType::SPFFT_PU_HOST) throw InvalidParameterError();
+  return execHost_->space_domain_data().data();
+}
+
+template <typename T>
+auto TransformInternal<T>::synchronize() -> void {
+#if (defined(SPFFT_CUDA) || defined(SPFFT_ROCM))
+  if (execGPU_) execGPU_->synchronize();
+#endif
+}
+
+
+// instantiate templates for float and double
+template class TransformInternal<double>;
+#ifdef SPFFT_SINGLE_PRECISION
+template class TransformInternal<float>;
+#endif
+
+} // namespace spfft
+
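[Editor's note] A remark on clone() above: it deep-copies the grid (new GridInternal<T>(*grid_)) while sharing the immutable Parameters object, so each clone owns its own work arrays and can run concurrently with the original. A sketch of the intended pattern, one transform per OpenMP thread (illustrative only; baseTransform, the per-thread output buffers, and the inclusion of omp.h are assumed):

  // Each thread gets its own clone, so no scratch memory is shared between threads.
  #pragma omp parallel
  {
    spfft::Transform threadLocalTransform = baseTransform.clone();
    threadLocalTransform.forward(SPFFT_PU_HOST, outputPointers[omp_get_thread_num()],
                                 SPFFT_NO_SCALING);
  }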
diff --git a/src/spfft/transform_internal.hpp b/src/spfft/transform_internal.hpp
new file mode 100644
index 0000000..c1846a2
--- /dev/null
+++ b/src/spfft/transform_internal.hpp
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef SPFFT_TRANSFORM_INTERNAL_HPP
+#define SPFFT_TRANSFORM_INTERNAL_HPP
+
+#include <memory>
+#include "execution/execution_host.hpp"
+#include "parameters/parameters.hpp"
+#include "spfft/config.h"
+#include "spfft/grid_internal.hpp"
+#include "spfft/types.h"
+#include "util/common_types.hpp"
+
+#if defined(SPFFT_CUDA) || defined(SPFFT_ROCM)
+#include "compression/compression_gpu.hpp"
+#include "execution/execution_gpu.hpp"
+#endif
+
+namespace spfft {
+template <typename T>
+class TransformInternal {
+public:
+  TransformInternal(SpfftProcessingUnitType executionUnit, std::shared_ptr<GridInternal<T>> grid,
+                    std::shared_ptr<Parameters> param);
+
+  auto clone() const -> TransformInternal<T>;
+
+  inline auto type() const noexcept -> SpfftTransformType { return param_->transform_type(); }
+
+  inline auto dim_x() const noexcept -> int { return param_->dim_x(); }
+
+  inline auto dim_y() const noexcept -> int { return param_->dim_y(); }
+
+  inline auto dim_z() const noexcept -> int { return param_->dim_z(); }
+
+  inline auto num_local_xy_planes() const noexcept -> int { return param_->local_num_xy_planes(); }
+
+  inline auto local_xy_plane_offset() const noexcept -> int {
+    return param_->local_xy_plane_offset();
+  }
+
+  inline auto processing_unit() const noexcept -> SpfftProcessingUnitType { return executionUnit_; }
+
+  inline auto device_id() const -> int { return grid_->device_id(); }
+
+  inline auto num_threads() const -> int { return grid_->num_threads(); }
+
+  inline auto num_local_elements() const -> int { return param_->local_num_elements(); }
+
+  inline auto shared_grid(const TransformInternal<T>& other) const -> bool {
+    return other.grid_ == grid_;
+  }
+
+  inline auto transform_type() const -> SpfftTransformType {
+    return param_->transform_type();
+  }
+
+#ifdef SPFFT_MPI
+  inline auto communicator() const -> MPI_Comm { return grid_->communicator().get(); }
+#endif
+
+  // full forward transform with blocking communication
+  auto forward(const SpfftProcessingUnitType inputLocation, T* output, SpfftScalingType scaling)
+      -> void;
+
+  // transform in x and y
+  auto forward_xy(const SpfftProcessingUnitType inputLocation) -> void;
+
+  // start non-blocking exchange
+  auto forward_exchange() -> void;
+
+  // finalize exchange and transform z
+  auto forward_z(T* output, SpfftScalingType scaling) -> void;
+
+  // full backward transform with blocking communication
+  auto backward(const T* input, const SpfftProcessingUnitType outputLocation) -> void;
+
+  // transform in x and y
+  auto backward_xy(const SpfftProcessingUnitType outputLocation) -> void;
+
+  // start non-blocking exchange
+  auto backward_exchange() -> void;
+
+  // finalize exchange and transform z
+  auto backward_z(const T* input) -> void;
+
+  // must be called after step-wise transforms on GPUs
+  auto synchronize() -> void;
+
+  auto space_domain_data(SpfftProcessingUnitType location) -> T*;
+
+private:
+  SpfftProcessingUnitType executionUnit_;
+  std::shared_ptr<Parameters> param_;
+  std::shared_ptr<GridInternal<T>> grid_;
+
+  std::unique_ptr<ExecutionHost<T>> execHost_;
+#if (defined(SPFFT_CUDA) || defined(SPFFT_ROCM))
+  std::unique_ptr<ExecutionGPU<T>> execGPU_;
+#endif
+};
+
+} // namespace spfft
+
+#endif
+
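[Editor's note] The kernels in the next file fill in redundant frequency-domain entries: for a transform with real-valued space-domain data, the coefficients obey X(N - k) = conj(X(k)), so only one half needs to be computed and the mirror half is reconstructed. A host-side reference of the 1D stick symmetrization, assuming the same convention as the kernels (index 0 untouched, only nonzero entries mirrored; this is an illustrative sketch, not part of the library):

  #include <complex>
  #include <vector>

  // Reference (host) version of what the GPU stick-symmetrization kernels compute.
  void symmetrize_stick_reference(std::vector<std::complex<double>>& data) {
    const std::size_t n = data.size();
    for (std::size_t idx = 1; idx < n; ++idx) {
      const auto value = data[idx];
      if (value.real() != 0.0 || value.imag() != 0.0) {
        data[n - idx] = std::conj(value);  // X(N - k) = conj(X(k))
      }
    }
  }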
diff --git a/src/symmetry/gpu_kernels/symmetry_kernels.cu b/src/symmetry/gpu_kernels/symmetry_kernels.cu
new file mode 100644
index 0000000..9721243
--- /dev/null
+++ b/src/symmetry/gpu_kernels/symmetry_kernels.cu
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <cassert>
+#include "gpu_util/gpu_fft_api.hpp"
+#include "gpu_util/gpu_runtime.hpp"
+#include "memory/gpu_array_const_view.hpp"
+#include "memory/gpu_array_view.hpp"
+
+namespace spfft {
+
+template <typename T>
+__global__ static void symmetrize_plane_kernel(
+    GPUArrayView3D<typename gpu::fft::ComplexType<T>::type> data, const int startIndex,
+    const int numIndices) {
+  assert(startIndex + numIndices <= data.dim_mid());
+  int idxMid = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idxMid < numIndices) {
+    idxMid += startIndex;
+    auto value = data(blockIdx.y, idxMid, 0);
+    if (value.x != T(0) || value.y != T(0)) {
+      value.y = -value.y;
+      data(blockIdx.y, data.dim_mid() - idxMid, 0) = value;
+    }
+  }
+}
+
+auto symmetrize_plane_gpu(const gpu::StreamType stream,
+                          const GPUArrayView3D<gpu::fft::ComplexType<double>::type>& data)
+    -> void {
+  assert(data.size() > 2);
+  {
+    const int startIndex = 1;
+    const int numIndices = data.dim_mid() / 2;
+    const dim3 threadBlock(256);
+    const dim3 threadGrid((numIndices + threadBlock.x - 1) / threadBlock.x, data.dim_outer());
+    launch_kernel(symmetrize_plane_kernel<double>, threadGrid, threadBlock, 0, stream, data,
+                  startIndex, numIndices);
+  }
+  {
+    const int startIndex = data.dim_mid() / 2 + 1;
+    const int numIndices = data.dim_mid() - startIndex;
+    const dim3 threadBlock(256);
+    const dim3 threadGrid((numIndices + threadBlock.x - 1) / threadBlock.x, data.dim_outer());
+    launch_kernel(symmetrize_plane_kernel<double>, threadGrid, threadBlock, 0, stream, data,
+                  startIndex, numIndices);
+  }
+}
+
+auto symmetrize_plane_gpu(const gpu::StreamType stream,
+                          const GPUArrayView3D<gpu::fft::ComplexType<float>::type>& data)
+    -> void {
+  assert(data.size() > 2);
+  {
+    const int startIndex = 1;
+    const int numIndices = data.dim_mid() / 2;
+    const dim3 threadBlock(256);
+    const dim3 threadGrid((numIndices + threadBlock.x - 1) / threadBlock.x, data.dim_outer());
+    launch_kernel(symmetrize_plane_kernel<float>, threadGrid, threadBlock, 0, stream, data,
+                  startIndex, numIndices);
+  }
+  {
+    const int startIndex = data.dim_mid() / 2 + 1;
+    const int numIndices = data.dim_mid() - startIndex;
+    const dim3 threadBlock(256);
+    const dim3 threadGrid((numIndices + threadBlock.x - 1) / threadBlock.x, data.dim_outer());
+    launch_kernel(symmetrize_plane_kernel<float>, threadGrid, threadBlock, 0, stream, data,
+                  startIndex, numIndices);
+  }
+}
+
+template <typename T>
+__global__ static void symmetrize_stick_kernel(
+    GPUArrayView1D<typename gpu::fft::ComplexType<T>::type> data, const int startIndex,
+    const int numIndices) {
+  assert(startIndex + numIndices <= data.size());
+  int idxInner = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idxInner < numIndices) {
+    idxInner += startIndex;
+    auto value = data(idxInner);
+    if (value.x != T(0) || value.y != T(0)) {
+      value.y = -value.y;
+      data(data.size() - idxInner) = value;
+    }
+  }
+}
+
+auto symmetrize_stick_gpu(const gpu::StreamType stream,
+                          const GPUArrayView1D<gpu::fft::ComplexType<double>::type>& data)
+    -> void {
+  assert(data.size() > 2);
+  {
+    const int startIndex = 1;
+    const int numIndices = data.size() / 2;
+    const dim3 threadBlock(256);
+    const dim3 threadGrid((numIndices + threadBlock.x - 1) / threadBlock.x);
+    launch_kernel(symmetrize_stick_kernel<double>, threadGrid, threadBlock, 0, stream, data,
+                  startIndex, numIndices);
+  }
+  {
+    const int startIndex = data.size() / 2 + 1;
+    const int numIndices = data.size() - startIndex;
+    const dim3 threadBlock(256);
+    const dim3 threadGrid((numIndices + threadBlock.x - 1) / threadBlock.x);
+    launch_kernel(symmetrize_stick_kernel<double>, threadGrid, threadBlock, 0, stream, data,
+                  startIndex, numIndices);
+  }
+}
+
+
+auto symmetrize_stick_gpu(const gpu::StreamType stream,
+                          const GPUArrayView1D<gpu::fft::ComplexType<float>::type>& data)
+    -> void {
+  assert(data.size() > 2);
+  {
+    const int startIndex = 1;
+    const int numIndices = data.size() / 2;
+    const dim3 threadBlock(256);
+    const dim3 threadGrid((numIndices + threadBlock.x - 1) / threadBlock.x);
+    launch_kernel(symmetrize_stick_kernel<float>, threadGrid, threadBlock, 0, stream, data,
+                  startIndex, numIndices);
+  }
+  {
+    const int startIndex = data.size() / 2 + 1;
+    const int numIndices = data.size() - startIndex;
+    const dim3 threadBlock(256);
+    const dim3 threadGrid((numIndices + threadBlock.x - 1) / threadBlock.x);
+    launch_kernel(symmetrize_stick_kernel<float>, threadGrid, threadBlock, 0, stream, data,
+                  startIndex, numIndices);
+  }
+}
+
+} // namespace spfft
+
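[Editor's note] Each symmetrization above is issued as two launches on the same stream, splitting the index range at the midpoint. This appears to be done so that, within a single launch, the read range and the write range overlap only at the self-conjugate midpoint (handled by a single thread), avoiding one thread reading an element while another thread of the same launch writes it; the in-order stream then makes the second launch safe with respect to the first. A worked illustration of the ranges (illustrative comment only, for a stick of size n = 8):

  // Launch 1: startIndex = 1, numIndices = 8 / 2 = 4      -> reads idx 1..4, writes 8 - idx = 7..4
  // Launch 2: startIndex = 8 / 2 + 1 = 5, numIndices = 3  -> reads idx 5..7, writes 8 - idx = 3..1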
+ */
+#ifndef SPFFT_SYMMETRY_KERNELS_HPP
+#define SPFFT_SYMMETRY_KERNELS_HPP
+#include "gpu_util/gpu_fft_api.hpp"
+#include "gpu_util/gpu_runtime_api.hpp"
+#include "memory/gpu_array_view.hpp"
+
+namespace spfft {
+
+auto symmetrize_plane_gpu(const gpu::StreamType stream,
+                          const GPUArrayView3D<typename gpu::fft::ComplexType<double>::type>& data)
+    -> void;
+
+auto symmetrize_plane_gpu(const gpu::StreamType stream,
+                          const GPUArrayView3D<typename gpu::fft::ComplexType<float>::type>& data)
+    -> void;
+
+auto symmetrize_stick_gpu(const gpu::StreamType stream,
+                          const GPUArrayView1D<typename gpu::fft::ComplexType<double>::type>& data)
+    -> void;
+
+auto symmetrize_stick_gpu(const gpu::StreamType stream,
+                          const GPUArrayView1D<typename gpu::fft::ComplexType<float>::type>& data)
+    -> void;
+
+} // namespace spfft
+
+#endif
diff --git a/src/symmetry/symmetry.hpp b/src/symmetry/symmetry.hpp
new file mode 100644
index 0000000..2e3e0b6
--- /dev/null
+++ b/src/symmetry/symmetry.hpp
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef SPFFT_SYMMETRY_HPP
+#define SPFFT_SYMMETRY_HPP
+
+#include "spfft/config.h"
+
+namespace spfft {
+
+// Base type for applying hermitian symmetry; the default implementation is a no-op
+class Symmetry {
+public:
+  virtual auto apply() -> void {}
+
+  virtual ~Symmetry() = default;
+};
+} // namespace spfft
+
+#endif
+
diff --git a/src/symmetry/symmetry_gpu.hpp b/src/symmetry/symmetry_gpu.hpp
new file mode 100644
index 0000000..635b572
--- /dev/null
+++ b/src/symmetry/symmetry_gpu.hpp
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef SPFFT_SYMMETRY_GPU_HPP
+#define SPFFT_SYMMETRY_GPU_HPP
+
+#include <utility>
+#include "gpu_util/gpu_fft_api.hpp"
+#include "gpu_util/gpu_stream_handle.hpp"
+#include "memory/gpu_array_view.hpp"
+#include "spfft/config.h"
+#include "symmetry/gpu_kernels/symmetry_kernels.hpp"
+#include "symmetry/symmetry.hpp"
+#include "util/common_types.hpp"
+#include "util/omp_definitions.hpp"
+
+namespace spfft {
+
+// This class will apply the 1D hermitian symmetry along the inner dimension on the plane with mid
+// index 0
+template <typename T>
+class PlaneSymmetryGPU : public Symmetry {
+public:
+  PlaneSymmetryGPU(GPUStreamHandle stream,
+                   const GPUArrayView3D<typename gpu::fft::ComplexType<T>::type>& data)
+      : stream_(std::move(stream)), data_(data) {}
+
+  auto apply() -> void override {
+    if (data_.dim_mid() > 2 && data_.size() > 0) {
+      symmetrize_plane_gpu(stream_.get(), data_);
+    }
+  }
+
+private:
+  GPUStreamHandle stream_;
+  GPUArrayView3D<typename gpu::fft::ComplexType<T>::type> data_;
+};
+
+// This class will apply the hermitian symmetry in 1d
+template <typename T>
+class StickSymmetryGPU : public Symmetry {
+public:
+  StickSymmetryGPU(GPUStreamHandle stream,
+                   const GPUArrayView1D<typename gpu::fft::ComplexType<T>::type>& stick)
+      : stream_(std::move(stream)), stick_(stick) {}
+
+  auto apply() -> void override {
+    if (stick_.size() > 2) {
+      symmetrize_stick_gpu(stream_.get(), stick_);
+    }
+  }
+
+private:
+  GPUStreamHandle stream_;
+  GPUArrayView1D<typename gpu::fft::ComplexType<T>::type> stick_;
+};
+} // namespace spfft
+
+#endif
+
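These wrappers only enqueue work on the stored stream; apply() returns without synchronizing. A hypothetical call site (the names apply_plane_symmetry_example, stream and freqXY are assumptions for illustration, not part of this patch):

// Sketch: symmetry objects are selected at setup time and applied through the
// common Symmetry base class, so callers need not know the concrete type.
inline auto apply_plane_symmetry_example(
    GPUStreamHandle stream,
    const GPUArrayView3D<typename gpu::fft::ComplexType<double>::type>& freqXY) -> void {
  PlaneSymmetryGPU<double> planeSymmetry(std::move(stream), freqXY);
  Symmetry& symmetry = planeSymmetry;
  symmetry.apply();  // enqueues symmetrize_plane_gpu on the stream
}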
diff --git a/src/symmetry/symmetry_host.hpp b/src/symmetry/symmetry_host.hpp
new file mode 100644
index 0000000..3217e63
--- /dev/null
+++ b/src/symmetry/symmetry_host.hpp
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef SPFFT_SYMMETRY_HOST_HPP
+#define SPFFT_SYMMETRY_HOST_HPP
+
+#include <complex>
+#include "memory/host_array_view.hpp"
+#include "spfft/config.h"
+#include "symmetry/symmetry.hpp"
+#include "util/common_types.hpp"
+#include "util/omp_definitions.hpp"
+
+namespace spfft {
+
+// This class will apply the 1D hermitian symmetry along the inner dimension on the plane with mid
+// index 0
+template <typename T>
+class PlaneSymmetryHost : public Symmetry {
+public:
+  explicit PlaneSymmetryHost(const HostArrayView3D<std::complex<T>>& data) : data_(data) {}
+
+  auto apply() -> void override {
+    constexpr std::complex<T> zeroElement;
+    // Data may be conjugated twice, but this way symmetry is applied independent of positive or
+    // negative frequencies provided
+    SPFFT_OMP_PRAGMA("omp for schedule(static)")
+    for (SizeType idxOuter = 0; idxOuter < data_.dim_outer(); ++idxOuter) {
+      for (SizeType idxInner = 1; idxInner < data_.dim_inner(); ++idxInner) {
+        const auto value = data_(idxOuter, 0, idxInner);
+        if (value != zeroElement) {
+          data_(idxOuter, 0, data_.dim_inner() - idxInner) = std::conj(value);
+        }
+      }
+    }
+  }
+
+private:
+  HostArrayView3D<std::complex<T>> data_;
+};
+
+// This class will apply the hermitian symmetry in 1d
+template <typename T>
+class StickSymmetryHost : public Symmetry {
+public:
+  explicit StickSymmetryHost(const HostArrayView1D<std::complex<T>>& stick) : stick_(stick) {}
+
+  auto apply() -> void override {
+    constexpr std::complex<T> zeroElement;
+    // Data may be conjugated twice, but this way symmetry is applied independent of positive or
+    // negative frequencies provided
+    SPFFT_OMP_PRAGMA("omp for schedule(static)")
+    for (SizeType idxInner = 1; idxInner < stick_.size() / 2 + 1; ++idxInner) {
+      const auto value = stick_(idxInner);
+      if (value != zeroElement) {
+        stick_(stick_.size() - idxInner) = std::conj(value);
+      }
+    }
+    SPFFT_OMP_PRAGMA("omp for schedule(static)")
+    for (SizeType idxInner = stick_.size() / 2 + 1; idxInner < stick_.size(); ++idxInner) {
+      const auto value = stick_(idxInner);
+      if (value != zeroElement) {
+        stick_(stick_.size() - idxInner) = std::conj(value);
+      }
+    }
+  }
+
+private:
+  HostArrayView1D<std::complex<T>> stick_;
+};
+} // namespace spfft
+
+#endif
+
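To see why conjugating twice is harmless, consider a stick (x0, x1, 0, 0) of length 4: the first loop writes conj(x1) to index 3; the second loop then sees the now non-zero index 3 and writes conj(conj(x1)) = x1 back to index 1, leaving (x0, x1, 0, conj(x1)). The result satisfies the symmetry no matter which half of the spectrum the caller filled in.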
diff --git a/src/timing/host_timing.cpp b/src/timing/host_timing.cpp
new file mode 100644
index 0000000..72d5d2d
--- /dev/null
+++ b/src/timing/host_timing.cpp
@@ -0,0 +1,384 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "timing/host_timing.hpp"
+#include <algorithm>
+#include <cmath>
+#include <iomanip>
+#include <iostream>
+#include <list>
+#include <numeric>
+#include <ratio>
+#include <sstream>
+#include <string>
+#include <tuple>
+#include <vector>
+
+namespace spfft {
+namespace timing {
+
+// ======================
+// Local helper
+// ======================
+namespace {
+// Helper struct for creating a tree of timings
+struct HostTimeStampPair {
+  std::string identifier;
+  double time = 0.0;
+  std::size_t startIdx = 0;
+  std::size_t stopIdx = 0;
+  TimingResult* nodePtr = nullptr;
+};
+auto calc_median(const std::vector<double>::iterator& begin,
+                 const std::vector<double>::iterator& end) -> double {
+  const auto n = end - begin;
+  if (n == 0) return *begin;
+  if (n % 2 == 0) {
+    return (*(begin + n / 2) + *(begin + n / 2 - 1)) / 2.0;
+  } else {
+    return *(begin + n / 2);
+  }
+}
+
+auto calculate_statistic(std::vector<double> values)
+    -> std::tuple<double, double, double, double, double, double, double> {
+  if (values.empty()) return std::make_tuple(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0);
+  std::sort(values.begin(), values.end());
+
+  const double min = values.front();
+  const double max = values.back();
+
+  const double median = calc_median(values.begin(), values.end());
+  const double sum = std::accumulate(values.begin(), values.end(), 0.0);
+  const double mean = sum / values.size();
+
+  const double lowerQuartile = calc_median(values.begin(), values.begin() + values.size() / 2);
+  const double upperQuartile = calc_median(
+      values.begin() + values.size() / 2 + (values.size() % 2) * (values.size() > 1), values.end());
+
+  return std::make_tuple(sum, mean, median, min, max, lowerQuartile, upperQuartile);
+}
+
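As a worked example of the quartile logic: for the sorted values {1, 2, 3, 4}, calc_median over all four returns (2 + 3) / 2 = 2.5, the lower quartile is the median of the first two values, 1.5, and the upper quartile is the median of {3, 4}, 3.5. For odd sizes the (values.size() % 2) offset excludes the middle element from the upper half, so {1, 2, 3, 4, 5} yields quartiles 1.5 and 4.5 around the median 3.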
+ result << "M"; + break; + case 3: + result << "k"; + break; + case 0: + break; + case -3: + result << "m"; + break; + case -6: + result << "u"; + break; + case -9: + result << "n"; + break; + case -12: + result << "p"; + break; + case -15: + result << "f"; + break; + case -18: + result << "a"; + break; + case -21: + result << "z"; + break; + case -24: + result << "y"; + break; + default: + result << "?"; + } + result << "s"; + return result.str(); +} + +// print timing nodes in tree recursively +auto print_node(const std::size_t identifierSpace, const std::string& nodePrefix, + const TimingResult& node, const bool isSubNode, const double parentTotTime) + -> void { + double sum, mean, median, min, max, lowerQuartile, upperQuartile; + std::tie(sum, mean, median, min, max, lowerQuartile, upperQuartile) = + calculate_statistic(node.timings); + + const double percentage = + (parentTotTime < sum || parentTotTime == 0) ? 100.0 : sum / parentTotTime * 100.0; + + std::stringstream percentageStream; + percentageStream << std::fixed << std::setprecision(2) << percentage; + + std::cout << std::left << std::setw(identifierSpace); + if (isSubNode) + std::cout << nodePrefix + "- " + node.identifier; + else + std::cout << nodePrefix + node.identifier; + std::cout << std::right << std::setw(8) << node.timings.size(); + std::cout << std::right << std::setw(15) << format_time(sum); + std::cout << std::right << std::setw(15) << percentageStream.str(); + std::cout << std::right << std::setw(15) << format_time(mean); + std::cout << std::right << std::setw(15) << format_time(median); + std::cout << std::right << std::setw(15) << format_time(min); + std::cout << std::right << std::setw(15) << format_time(max); + std::cout << std::endl; + + for (const auto& subNode : node.subNodes) { + print_node(identifierSpace, nodePrefix + std::string(" |"), subNode, true, sum); + } +} + +// determine length of padding required for printing entire tree identifiers recursively +auto max_node_identifier_length(const TimingResult& node, const std::size_t recursionDepth, + const std::size_t addPerLevel, const std::size_t parentMax) + -> std::size_t { + std::size_t currentLength = node.identifier.length() + recursionDepth * addPerLevel; + std::size_t max = currentLength > parentMax ? 
+// determine length of padding required for printing entire tree identifiers recursively
+auto max_node_identifier_length(const TimingResult& node, const std::size_t recursionDepth,
+                                const std::size_t addPerLevel, const std::size_t parentMax)
+    -> std::size_t {
+  std::size_t currentLength = node.identifier.length() + recursionDepth * addPerLevel;
+  std::size_t max = currentLength > parentMax ? currentLength : parentMax;
+  for (const auto& subNode : node.subNodes) {
+    const std::size_t subMax =
+        max_node_identifier_length(subNode, recursionDepth + 1, addPerLevel, max);
+    if (subMax > max) max = subMax;
+  }
+
+  return max;
+}
+
+auto export_node_json(const std::string& padding, const std::list<TimingResult>& nodeList,
+                      std::stringstream& stream) -> void {
+  stream << "{" << std::endl;
+  const std::string nodePadding = padding + "  ";
+  const std::string subNodePadding = nodePadding + "  ";
+  for (const auto& node : nodeList) {
+    stream << nodePadding << "\"" << node.identifier << "\" : {" << std::endl;
+    stream << subNodePadding << "\"timings\" : [";
+    for (const auto& value : node.timings) {
+      stream << value;
+      if (&value != &(node.timings.back())) stream << ", ";
+    }
+    stream << "]," << std::endl;
+    stream << subNodePadding << "\"sub-timings\" : ";
+    export_node_json(subNodePadding, node.subNodes, stream);
+    stream << nodePadding << "}";
+    if (&node != &(nodeList.back())) stream << ",";
+    stream << std::endl;
+  }
+  stream << padding << "}" << std::endl;
+}
+} // namespace
+
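The exporter emits nested objects keyed by identifier. An illustrative output (hand-written here, slightly simplified, not generated) for one top-level timing with a single sub-timing:

{
  "fft" : {
    "timings" : [1.200000e-03, 1.180000e-03],
    "sub-timings" : {
      "transpose" : {
        "timings" : [4.000000e-04],
        "sub-timings" : {
        }
      }
    }
  }
}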
+// ======================
+// HostTiming
+// ======================
+auto HostTiming::process_timings() -> std::list<TimingResult> {
+  std::list<TimingResult> results;
+
+  std::vector<HostTimeStampPair> timePairs;
+  timePairs.reserve(timeStamps_.size() / 2);
+
+  // create pairs of start / stop timings
+  for (std::size_t i = 0; i < timeStamps_.size(); ++i) {
+    if (timeStamps_[i].type == TimeStampType::Start) {
+      HostTimeStampPair pair;
+      pair.startIdx = i;
+      pair.identifier = std::string(timeStamps_[i].identifierPtr);
+      std::size_t numInnerMatchingIdentifiers = 0;
+      // search for matching stop after start
+      for (std::size_t j = i + 1; j < timeStamps_.size(); ++j) {
+        // only consider matching identifiers
+        if (std::string(timeStamps_[j].identifierPtr) ==
+            std::string(timeStamps_[i].identifierPtr)) {
+          if (timeStamps_[j].type == TimeStampType::Stop && numInnerMatchingIdentifiers == 0) {
+            // Matching stop found
+            std::chrono::duration<double> duration = timeStamps_[j].time - timeStamps_[i].time;
+            pair.time = duration.count();
+            pair.stopIdx = j;
+            timePairs.push_back(pair);
+            if (pair.time < 0) {
+              std::cerr
+                  << "WARNING: Host Timing -> Measured time is negative. Non-steady system-clock?!"
+                  << std::endl;
+            }
+            break;
+          } else if (timeStamps_[j].type == TimeStampType::Stop &&
+                     numInnerMatchingIdentifiers > 0) {
+            // inner stop with matching identifier
+            --numInnerMatchingIdentifiers;
+          } else if (timeStamps_[j].type == TimeStampType::Start) {
+            // inner start with matching identifier
+            ++numInnerMatchingIdentifiers;
+          }
+        }
+      }
+      if (pair.stopIdx == 0) {
+        std::cerr << "WARNING: Host Timing -> Start / stop time stamps do not match for \""
+                  << timeStamps_[i].identifierPtr << "\"!" << std::endl;
+      }
+    }
+  }
+
+  // create tree of timings, where sub-nodes represent timings fully enclosed by another start /
+  // stop pair. Use the fact that timePairs is sorted by startIdx.
+  for (std::size_t i = 0; i < timePairs.size(); ++i) {
+    auto& pair = timePairs[i];
+
+    // find potential parent by going backwards through pairs, starting with the current pair
+    // position
+    for (auto timePairIt = timePairs.rbegin() + (timePairs.size() - i);
+         timePairIt != timePairs.rend(); ++timePairIt) {
+      if (timePairIt->stopIdx > pair.stopIdx && timePairIt->nodePtr != nullptr) {
+        auto& parentNode = *(timePairIt->nodePtr);
+        // check if sub-node with identifier exists
+        bool nodeFound = false;
+        for (auto& subNode : parentNode.subNodes) {
+          if (subNode.identifier == pair.identifier) {
+            nodeFound = true;
+            subNode.timings.push_back(pair.time);
+            // mark node position in pair for finding sub-nodes
+            pair.nodePtr = &(subNode);
+            break;
+          }
+        }
+        if (!nodeFound) {
+          // create new sub-node
+          TimingResult newNode;
+          newNode.identifier = pair.identifier;
+          newNode.timings.push_back(pair.time);
+          parentNode.subNodes.push_back(std::move(newNode));
+          // mark node position in pair for finding sub-nodes
+          pair.nodePtr = &(parentNode.subNodes.back());
+        }
+        break;
+      }
+    }
+
+    // No parent found, must be top level node
+    if (pair.nodePtr == nullptr) {
+      // Check if top level node with same name exists
+      for (auto& topNode : results) {
+        if (topNode.identifier == pair.identifier) {
+          topNode.timings.push_back(pair.time);
+          pair.nodePtr = &(topNode);
+          break;
+        }
+      }
+    }
+
+    // New top level node
+    if (pair.nodePtr == nullptr) {
+      TimingResult newNode;
+      newNode.identifier = pair.identifier;
+      newNode.timings.push_back(pair.time);
+      // newNode.parent = nullptr;
+      results.push_back(std::move(newNode));
+
+      // mark node position in pair for finding sub-nodes
+      pair.nodePtr = &(results.back());
+    }
+  }
+
+  return results;
+}
+
+auto HostTiming::print_timings() -> void {
+  auto timings = process_timings();
+  // calculate space for printing identifiers
+  std::size_t identifierSpace = 0;
+  for (const auto& node : timings) {
+    const auto nodeMax = max_node_identifier_length(node, 0, 2, identifierSpace);
+    if (nodeMax > identifierSpace) identifierSpace = nodeMax;
+  }
+  identifierSpace += 3;
+
+  const auto totalLength = identifierSpace + 8 + 6 * 15;
+  std::cout << std::string(totalLength, '=') << std::endl;
+
+  // header
+  std::cout << std::right << std::setw(identifierSpace + 8) << "#";
+  std::cout << std::right << std::setw(15) << "Total";
+  std::cout << std::right << std::setw(15) << "%";
+  std::cout << std::right << std::setw(15) << "Mean";
+  std::cout << std::right << std::setw(15) << "Median";
+  std::cout << std::right << std::setw(15) << "Min";
+  std::cout << std::right << std::setw(15) << "Max";
+  std::cout << std::endl;
+
+  std::cout << std::string(totalLength, '-') << std::endl;
+
+  // print all timings
+  for (auto& node : timings) {
+    print_node(identifierSpace, std::string(), node, false, 0.0);
+    std::cout << std::endl;
+  }
+  std::cout << std::string(totalLength, '=') << std::endl;
+}
+
+auto HostTiming::export_json() -> std::string {
+  auto nodeList = process_timings();
+  std::stringstream jsonStream;
+  jsonStream << std::scientific;
+  export_node_json("", nodeList, jsonStream);
+  return jsonStream.str();
+}
+
+} // namespace timing
+} // namespace spfft
+
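The pairing pass counts nested starts with the same identifier, so an outer start matches its outermost stop, and fully enclosed pairs become sub-nodes of the enclosing pair. A small sketch of the intended use, based only on the API above:

#include "timing/host_timing.hpp"

void example_trace(spfft::timing::HostTiming& timer) {
  timer.start("outer");
  timer.start("inner");
  timer.stop("inner");   // pairs with the "inner" start
  timer.stop("outer");   // pairs with the "outer" start
  auto tree = timer.process_timings();  // "inner" becomes a sub-node of "outer"
}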
diff --git a/src/timing/host_timing.hpp b/src/timing/host_timing.hpp
new file mode 100644
index 0000000..2bed484
--- /dev/null
+++ b/src/timing/host_timing.hpp
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPFFT_HOST_TIMING_HPP
+#define SPFFT_HOST_TIMING_HPP
+
+#include <chrono>
+#include <cstddef>
+#include <list>
+#include <string>
+#include <vector>
+#include "spfft/config.h"
+
+namespace spfft {
+namespace timing {
+
+using HostClockType = std::chrono::high_resolution_clock;
+
+enum class TimeStampType { Start, Stop, Empty };
+
+struct HostTimeStamp {
+  HostTimeStamp() : type(TimeStampType::Empty) {}
+
+  // Identifier pointer must point to compile time string literal
+  HostTimeStamp(const char* identifier, const TimeStampType& stampType)
+      : time(HostClockType::now()), identifierPtr(identifier), type(stampType) {}
+
+  HostClockType::time_point time;
+  const char* identifierPtr;
+  TimeStampType type;
+};
+
+struct TimingResult {
+  std::string identifier;
+  std::vector<double> timings;
+  std::list<TimingResult> subNodes;
+};
+
+class HostTimingScoped;
+
+class HostTiming {
+public:
+  // reserve 1'000'000 time stamps by default
+  HostTiming() { timeStamps_.reserve(1000 * 1000); }
+
+  // explicit reserve size
+  explicit HostTiming(std::size_t reserveCount) { timeStamps_.reserve(reserveCount); }
+
+  // start with string literal
+  template <std::size_t N>
+  inline auto start(const char (&identifierPtr)[N]) -> void {
+    asm volatile("" ::: "memory");  // prevent compiler reordering
+    timeStamps_.emplace_back(identifierPtr, TimeStampType::Start);
+    asm volatile("" ::: "memory");
+  }
+
+  // start with string; more overhead than with string literals
+  inline auto start(std::string identifier) -> void {
+    asm volatile("" ::: "memory");
+    identifierStrings_.emplace_back(std::move(identifier));
+    timeStamps_.emplace_back(identifierStrings_.back().c_str(), TimeStampType::Start);
+    asm volatile("" ::: "memory");
+  }
+
+  // stop with string literal
+  template <std::size_t N>
+  inline auto stop(const char (&identifierPtr)[N]) -> void {
+    asm volatile("" ::: "memory");
+    timeStamps_.emplace_back(identifierPtr, TimeStampType::Stop);
+    asm volatile("" ::: "memory");
+  }
+
+  // stop with string; more overhead than with string literals
+  inline auto stop(std::string identifier) -> void {
+    asm volatile("" ::: "memory");
+    identifierStrings_.emplace_back(std::move(identifier));
+    timeStamps_.emplace_back(identifierStrings_.back().c_str(), TimeStampType::Stop);
+    asm volatile("" ::: "memory");
+  }
+
+  // reset timer
+  inline auto reset() -> void {
+    timeStamps_.clear();
+    timeStamps_.reserve(1000 * 1000);
+    identifierStrings_.clear();
+  }
+
+  // pretty print to cout
+  auto print_timings() -> void;
+
+  // process timings as tree structure
+  auto process_timings() -> std::list<TimingResult>;
+
+  // simple json export
+  auto export_json() -> std::string;
+
+private:
+  inline auto stop_unchecked(const char* identifierPtr) -> void {
+    asm volatile("" ::: "memory");
+    timeStamps_.emplace_back(identifierPtr, TimeStampType::Stop);
+    asm volatile("" ::: "memory");
+  }
+
+  friend HostTimingScoped;
+
+  std::vector<HostTimeStamp> timeStamps_;
+  std::list<std::string> identifierStrings_;
+};
+
+// Helper class, which calls start() upon creation and stop() on destruction
+class HostTimingScoped {
+public:
+  // timer reference must be valid for the entire lifetime
+  template <std::size_t N>
+  HostTimingScoped(const char (&identifierPtr)[N], HostTiming& timer)
+      : identifierPtr_(identifierPtr), timer_(timer) {
+    timer_.start(identifierPtr);
+  }
+
+  HostTimingScoped(std::string identifier, HostTiming& timer)
+      : identifierPtr_(nullptr), identifier_(std::move(identifier)), timer_(timer) {
+    timer_.start(identifier_);
+  }
+
+  HostTimingScoped(const HostTimingScoped&) = delete;
+  HostTimingScoped(HostTimingScoped&&) = delete;
+  auto operator=(const HostTimingScoped&) -> HostTimingScoped& = delete;
+  auto operator=(HostTimingScoped&&) -> HostTimingScoped& = delete;
+
+  ~HostTimingScoped() {
+    if (identifierPtr_) {
+      timer_.stop_unchecked(identifierPtr_);
+    } else {
+      timer_.stop(std::move(identifier_));
+    }
+  }
+
+private:
+  const char* identifierPtr_;
+  std::string identifier_;
+  HostTiming& timer_;
+};
+
+} // namespace timing
+} // namespace spfft
+
+#endif
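HostTimingScoped ties the stop call to scope exit, which keeps start/stop pairs balanced even on early returns or exceptions. A minimal sketch using only the API above:

#include "timing/host_timing.hpp"

void timed_section(spfft::timing::HostTiming& timer) {
  spfft::timing::HostTimingScoped scoped("section", timer);  // records the start stamp
  // ... work to be measured ...
}  // destructor records the matching stop stamp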
diff --git a/src/timing/timing.cpp b/src/timing/timing.cpp
new file mode 100644
index 0000000..31b3785
--- /dev/null
+++ b/src/timing/timing.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "timing/host_timing.hpp"
+
+namespace spfft {
+namespace timing {
+HostTiming GlobalHostTimer;
+} // namespace timing
+} // namespace spfft
diff --git a/src/timing/timing.hpp b/src/timing/timing.hpp
new file mode 100644
index 0000000..12e07c6
--- /dev/null
+++ b/src/timing/timing.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPFFT_TIMING_HPP
+#define SPFFT_TIMING_HPP
+
+#include "spfft/config.h"
+#ifdef SPFFT_TIMING
+#include <chrono>
+#include <string>
+#include "timing/host_timing.hpp"
+
+namespace spfft {
+namespace timing {
+extern HostTiming GlobalHostTimer;
+} // namespace timing
+} // namespace spfft
+
+#define HOST_TIMING_CONCAT_IMPL(x, y) x##y
+#define HOST_TIMING_MACRO_CONCAT(x, y) HOST_TIMING_CONCAT_IMPL(x, y)
+
+#define HOST_TIMING_SCOPED(identifier)                        \
+  ::spfft::timing::HostTimingScoped HOST_TIMING_MACRO_CONCAT( \
+      scopedHostTimerMacroGenerated, __COUNTER__)(identifier, ::spfft::timing::GlobalHostTimer);
+
+#define HOST_TIMING_START(identifier) ::spfft::timing::GlobalHostTimer.start(identifier);
+
+#define HOST_TIMING_STOP(identifier) ::spfft::timing::GlobalHostTimer.stop(identifier);
+
+#define HOST_TIMING_PRINT() ::spfft::timing::GlobalHostTimer.print_timings();
+
+#define HOST_TIMING_EXPORT_JSON_STRING() ::spfft::timing::GlobalHostTimer.export_json()
+
+#define HOST_TIMING_PROCESS_TIMINGS() ::spfft::timing::GlobalHostTimer.process_timings()
+
+#else
+
+#define HOST_TIMING_START(identifier)
+#define HOST_TIMING_STOP(identifier)
+#define HOST_TIMING_SCOPED(identifier)
+#define HOST_TIMING_PRINT()
+#define HOST_TIMING_EXPORT_JSON_STRING() std::string();
+#define HOST_TIMING_PROCESS_TIMINGS()
+
+#endif
+
+#endif
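With SPFFT_TIMING defined, the macros forward to the global timer; without it they expand to nothing, so instrumented code compiles away. A hypothetical instrumented function (transform_step is an illustrative name, not part of this patch):

#include "timing/timing.hpp"

void transform_step() {
  HOST_TIMING_SCOPED("transform")  // unique variable name via __COUNTER__
  HOST_TIMING_START("fft");
  // ... FFT work ...
  HOST_TIMING_STOP("fft");
}
// at the end of a run: HOST_TIMING_PRINT();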
diff --git a/src/transpose/gpu_kernels/buffered_kernels.cu b/src/transpose/gpu_kernels/buffered_kernels.cu
new file mode 100644
index 0000000..130f7af
--- /dev/null
+++ b/src/transpose/gpu_kernels/buffered_kernels.cu
@@ -0,0 +1,310 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <algorithm>
+#include <cassert>
+#include "gpu_util/complex_conversion.cuh"
+#include "gpu_util/gpu_fft_api.hpp"
+#include "gpu_util/gpu_runtime.hpp"
+#include "memory/gpu_array_const_view.hpp"
+#include "memory/gpu_array_view.hpp"
+#include "memory/array_view_utility.hpp"
+
+namespace spfft {
+
+// Packs z-sticks into buffer for MPI_Alltoall
+// Dimensions of buffer are (numRanks, maxNumZSticks, maxNumXYPlanes)
+// Dimensions of freqZData are (numLocalZSticks, dimZ)
+template <typename DataType, typename BufferType>
+__global__ static void buffered_pack_backward_kernel(const GPUArrayConstView1D<int> numXYPlanes,
+                                                     const GPUArrayConstView1D<int> xyPlaneOffsets,
+                                                     const GPUArrayConstView2D<DataType> freqZData,
+                                                     GPUArrayView3D<BufferType> buffer) {
+  const int xyPlaneIndex = threadIdx.x + blockIdx.x * blockDim.x;
+  for (int r = 0; r < numXYPlanes.size(); ++r) {
+    if (xyPlaneIndex < numXYPlanes(r)) {
+      const int xyOffset = xyPlaneOffsets(r);
+      for (int zStickIndex = blockIdx.y; zStickIndex < freqZData.dim_outer();
+           zStickIndex += gridDim.y) {
+        buffer(r, zStickIndex, xyPlaneIndex) = ConvertComplex<BufferType, DataType>::apply(
+            freqZData(zStickIndex, xyPlaneIndex + xyOffset));
+      }
+    }
+  }
+}
+
+template <typename DataType, typename BufferType>
+static auto buffered_pack_backward_launch(const gpu::StreamType stream, const int maxNumXYPlanes,
+                                          const GPUArrayView1D<int>& numXYPlanes,
+                                          const GPUArrayView1D<int>& xyPlaneOffsets,
+                                          const GPUArrayView2D<DataType>& freqZData,
+                                          GPUArrayView3D<BufferType> buffer) -> void {
+  assert(xyPlaneOffsets.size() == numXYPlanes.size());
+  assert(buffer.size() >= freqZData.size());
+  assert(buffer.dim_outer() == xyPlaneOffsets.size());
+  assert(buffer.dim_inner() == maxNumXYPlanes);
+  const dim3 threadBlock(128);
+  const dim3 threadGrid((maxNumXYPlanes + threadBlock.x - 1) / threadBlock.x,
+                        std::min(freqZData.dim_outer(), 4320));
+  assert(threadGrid.x > 0);
+  assert(threadGrid.y > 0);
+  launch_kernel(buffered_pack_backward_kernel<DataType, BufferType>, threadGrid, threadBlock, 0,
+                stream, numXYPlanes, xyPlaneOffsets, freqZData, buffer);
+}
+
+auto buffered_pack_backward(
+    const gpu::StreamType stream, const int maxNumXYPlanes, const GPUArrayView1D<int>& numXYPlanes,
+    const GPUArrayView1D<int>& xyPlaneOffsets,
+    const GPUArrayView2D<typename gpu::fft::ComplexType<double>::type>& freqZData,
+    GPUArrayView3D<typename gpu::fft::ComplexType<double>::type> buffer) -> void {
+  buffered_pack_backward_launch(stream, maxNumXYPlanes, numXYPlanes, xyPlaneOffsets, freqZData,
+                                buffer);
+}
+
+auto buffered_pack_backward(
+    const gpu::StreamType stream, const int maxNumXYPlanes, const GPUArrayView1D<int>& numXYPlanes,
+    const GPUArrayView1D<int>& xyPlaneOffsets,
+    const GPUArrayView2D<typename gpu::fft::ComplexType<double>::type>& freqZData,
+    GPUArrayView3D<typename gpu::fft::ComplexType<float>::type> buffer) -> void {
+  buffered_pack_backward_launch(stream, maxNumXYPlanes, numXYPlanes, xyPlaneOffsets, freqZData,
+                                buffer);
+}
+
+auto buffered_pack_backward(
+    const gpu::StreamType stream, const int maxNumXYPlanes, const GPUArrayView1D<int>& numXYPlanes,
+    const GPUArrayView1D<int>& xyPlaneOffsets,
+    const GPUArrayView2D<typename gpu::fft::ComplexType<float>::type>& freqZData,
+    GPUArrayView3D<typename gpu::fft::ComplexType<float>::type> buffer) -> void {
+  buffered_pack_backward_launch(stream, maxNumXYPlanes, numXYPlanes, xyPlaneOffsets, freqZData,
+                                buffer);
+}
+
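The padded (numRanks, maxNumZSticks, maxNumXYPlanes) layout gives every rank a block of identical size, at the cost of sending unused elements; the flat position of an element follows directly from the view dimensions. A small sketch (padded_buffer_offset is an illustrative helper, not part of this patch):

// Flat offset of (rank r, z-stick s, xy-plane p) in the padded exchange buffer.
inline int padded_buffer_offset(int r, int s, int p, int maxNumZSticks, int maxNumXYPlanes) {
  return (r * maxNumZSticks + s) * maxNumXYPlanes + p;
}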
+// Unpacks z-sticks from buffer after MPI_Alltoall
+// Dimensions of buffer are (numRanks, maxNumZSticks, maxNumXYPlanes)
+// Dimensions of freqXYData are (numLocalXYPlanes, dimY, dimX)
+template <typename DataType, typename BufferType>
+__global__ static void buffered_unpack_backward_kernel(
+    const GPUArrayConstView1D<int> numZSticks, const GPUArrayConstView1D<int> indices,
+    const GPUArrayConstView3D<BufferType> buffer, GPUArrayView2D<DataType> freqXYDataFlat) {
+  // buffer.dim_mid() is equal to maxNumZSticks
+  const int xyPlaneIndex = threadIdx.x + blockIdx.x * blockDim.x;
+  if (xyPlaneIndex < freqXYDataFlat.dim_outer()) {
+    for (int r = 0; r < numZSticks.size(); ++r) {
+      const int numCurrentZSticks = numZSticks(r);
+      for (int zStickIndex = blockIdx.y; zStickIndex < numCurrentZSticks;
+           zStickIndex += gridDim.y) {
+        const int currentIndex = indices(r * buffer.dim_mid() + zStickIndex);
+        freqXYDataFlat(xyPlaneIndex, currentIndex) =
+            ConvertComplex<DataType, BufferType>::apply(buffer(r, zStickIndex, xyPlaneIndex));
+      }
+    }
+  }
+}
+
+template <typename DataType, typename BufferType>
+static auto buffered_unpack_backward_launch(const gpu::StreamType stream, const int maxNumXYPlanes,
+                                            const GPUArrayView1D<int>& numZSticks,
+                                            const GPUArrayView1D<int>& indices,
+                                            const GPUArrayView3D<BufferType>& buffer,
+                                            GPUArrayView3D<DataType> freqXYData) -> void {
+  assert(buffer.dim_outer() == numZSticks.size());
+  assert(buffer.dim_inner() == maxNumXYPlanes);
+  assert(indices.size() == buffer.dim_mid() * numZSticks.size());
+  const dim3 threadBlock(128);
+  const dim3 threadGrid((freqXYData.dim_outer() + threadBlock.x - 1) / threadBlock.x,
+                        std::min(buffer.dim_mid(), 4320));
+  assert(threadGrid.x > 0);
+  assert(threadGrid.y > 0);
+  launch_kernel(buffered_unpack_backward_kernel<DataType, BufferType>, threadGrid, threadBlock, 0,
+                stream, numZSticks, indices, buffer,
+                GPUArrayView2D<DataType>(freqXYData.data(), freqXYData.dim_outer(),
+                                         freqXYData.dim_mid() * freqXYData.dim_inner(),
+                                         freqXYData.device_id()));
+}
+
+auto buffered_unpack_backward(
+    const gpu::StreamType stream, const int maxNumXYPlanes, const GPUArrayView1D<int>& numZSticks,
+    const GPUArrayView1D<int>& indices,
+    const GPUArrayView3D<typename gpu::fft::ComplexType<double>::type>& buffer,
+    GPUArrayView3D<typename gpu::fft::ComplexType<double>::type> freqXYData) -> void {
+  buffered_unpack_backward_launch(stream, maxNumXYPlanes, numZSticks, indices, buffer, freqXYData);
+}
+
+auto buffered_unpack_backward(
+    const gpu::StreamType stream, const int maxNumXYPlanes, const GPUArrayView1D<int>& numZSticks,
+    const GPUArrayView1D<int>& indices,
+    const GPUArrayView3D<typename gpu::fft::ComplexType<float>::type>& buffer,
+    GPUArrayView3D<typename gpu::fft::ComplexType<double>::type> freqXYData) -> void {
+  buffered_unpack_backward_launch(stream, maxNumXYPlanes, numZSticks, indices, buffer, freqXYData);
+}
+
+auto buffered_unpack_backward(
+    const gpu::StreamType stream, const int maxNumXYPlanes, const GPUArrayView1D<int>& numZSticks,
+    const GPUArrayView1D<int>& indices,
+    const GPUArrayView3D<typename gpu::fft::ComplexType<float>::type>& buffer,
+    GPUArrayView3D<typename gpu::fft::ComplexType<float>::type> freqXYData) -> void {
+  buffered_unpack_backward_launch(stream, maxNumXYPlanes, numZSticks, indices, buffer, freqXYData);
+}
+
+// Unpacks z-sticks from buffer after MPI_Alltoall
+// Dimensions of buffer are (numRanks, maxNumZSticks, maxNumXYPlanes)
+// Dimensions of freqZData are (numLocalZSticks, dimZ)
+template <typename DataType, typename BufferType>
+__global__ static void buffered_unpack_forward_kernel(const GPUArrayConstView1D<int> numXYPlanes,
+                                                      const GPUArrayConstView1D<int> xyPlaneOffsets,
+                                                      const GPUArrayConstView3D<BufferType> buffer,
+                                                      GPUArrayView2D<DataType> freqZData) {
+  const int xyPlaneIndex = threadIdx.x + blockIdx.x * blockDim.x;
+  for (int r = 0; r < numXYPlanes.size(); ++r) {
+    if (xyPlaneIndex < numXYPlanes(r)) {
+      const int xyOffset = xyPlaneOffsets(r);
+      for (int zStickIndex = blockIdx.y; zStickIndex < freqZData.dim_outer();
+           zStickIndex += gridDim.y) {
+        freqZData(zStickIndex, xyPlaneIndex + xyOffset) =
+            ConvertComplex<DataType, BufferType>::apply(buffer(r, zStickIndex, xyPlaneIndex));
+      }
+    }
+  }
+}
+
+template <typename DataType, typename BufferType>
+static auto buffered_unpack_forward_launch(const gpu::StreamType stream, const int maxNumXYPlanes,
+                                           const GPUArrayView1D<int>& numXYPlanes,
+                                           const GPUArrayView1D<int>& xyPlaneOffsets,
+                                           const GPUArrayView3D<BufferType>& buffer,
+                                           GPUArrayView2D<DataType> freqZData) -> void {
+  assert(xyPlaneOffsets.size() == numXYPlanes.size());
+  assert(buffer.size() >= freqZData.size());
+  assert(buffer.dim_outer() == xyPlaneOffsets.size());
+  assert(buffer.dim_inner() == maxNumXYPlanes);
+  const dim3 threadBlock(128);
+  const dim3 threadGrid((maxNumXYPlanes + threadBlock.x - 1) / threadBlock.x,
+                        std::min(freqZData.dim_outer(), 4320));
+  assert(threadGrid.x > 0);
+  assert(threadGrid.y > 0);
+  launch_kernel(buffered_unpack_forward_kernel<DataType, BufferType>, threadGrid, threadBlock, 0,
+                stream, numXYPlanes, xyPlaneOffsets, buffer, freqZData);
+}
+
+auto buffered_unpack_forward(
+    const gpu::StreamType stream, const int maxNumXYPlanes, const GPUArrayView1D<int>& numXYPlanes,
+    const GPUArrayView1D<int>& xyPlaneOffsets,
+    const GPUArrayView3D<typename gpu::fft::ComplexType<double>::type>& buffer,
+    GPUArrayView2D<typename gpu::fft::ComplexType<double>::type> freqZData) -> void {
+  buffered_unpack_forward_launch(stream, maxNumXYPlanes, numXYPlanes, xyPlaneOffsets, buffer,
+                                 freqZData);
+}
+
+auto buffered_unpack_forward(
+    const gpu::StreamType stream, const int maxNumXYPlanes, const GPUArrayView1D<int>& numXYPlanes,
+    const GPUArrayView1D<int>& xyPlaneOffsets,
+    const GPUArrayView3D<typename gpu::fft::ComplexType<float>::type>& buffer,
+    GPUArrayView2D<typename gpu::fft::ComplexType<double>::type> freqZData) -> void {
+  buffered_unpack_forward_launch(stream, maxNumXYPlanes, numXYPlanes, xyPlaneOffsets, buffer,
+                                 freqZData);
+}
+
+auto buffered_unpack_forward(
+    const gpu::StreamType stream, const int maxNumXYPlanes, const GPUArrayView1D<int>& numXYPlanes,
+    const GPUArrayView1D<int>& xyPlaneOffsets,
+    const GPUArrayView3D<typename gpu::fft::ComplexType<float>::type>& buffer,
+    GPUArrayView2D<typename gpu::fft::ComplexType<float>::type> freqZData) -> void {
+  buffered_unpack_forward_launch(stream, maxNumXYPlanes, numXYPlanes, xyPlaneOffsets, buffer,
+                                 freqZData);
+}
+
+// Packs z-sticks into buffer for MPI_Alltoall
+// Dimensions of buffer are (numRanks, maxNumZSticks, maxNumXYPlanes)
+// Dimensions of freqXYData are (numLocalXYPlanes, dimY, dimX)
+template <typename DataType, typename BufferType>
+__global__ static void buffered_pack_forward_kernel(
+    const GPUArrayConstView1D<int> numZSticks, const GPUArrayConstView1D<int> indices,
+    const GPUArrayConstView2D<DataType> freqXYDataFlat, GPUArrayView3D<BufferType> buffer) {
+  // buffer.dim_mid() is equal to maxNumZSticks
+  const int xyPlaneIndex = threadIdx.x + blockIdx.x * blockDim.x;
+  if (xyPlaneIndex < freqXYDataFlat.dim_outer()) {
+    for (int r = 0; r < numZSticks.size(); ++r) {
+      const int numCurrentZSticks = numZSticks(r);
+      for (int zStickIndex = blockIdx.y; zStickIndex < numCurrentZSticks;
+           zStickIndex += gridDim.y) {
+        const int currentIndex = indices(r * buffer.dim_mid() + zStickIndex);
+        buffer(r, zStickIndex, xyPlaneIndex) = ConvertComplex<BufferType, DataType>::apply(
+            freqXYDataFlat(xyPlaneIndex, currentIndex));
+      }
+    }
+  }
+}
+
+template <typename DataType, typename BufferType>
+static auto buffered_pack_forward_launch(const gpu::StreamType stream, const int maxNumXYPlanes,
+                                         const GPUArrayView1D<int>& numZSticks,
+                                         const GPUArrayView1D<int>& indices,
+                                         const GPUArrayView3D<DataType>& freqXYData,
+                                         GPUArrayView3D<BufferType> buffer) -> void {
+  assert(buffer.dim_outer() == numZSticks.size());
+  assert(buffer.dim_inner() == maxNumXYPlanes);
+  assert(indices.size() == buffer.dim_mid() * numZSticks.size());
+  const dim3 threadBlock(128);
+  const dim3 threadGrid((freqXYData.dim_outer() + threadBlock.x - 1) / threadBlock.x,
+                        std::min(buffer.dim_mid(), 4320));
+  assert(threadGrid.x > 0);
+  assert(threadGrid.y > 0);
+  launch_kernel(buffered_pack_forward_kernel<DataType, BufferType>, threadGrid, threadBlock, 0,
+                stream, numZSticks, indices,
+                GPUArrayConstView2D<DataType>(freqXYData.data(), freqXYData.dim_outer(),
+                                              freqXYData.dim_mid() * freqXYData.dim_inner(),
+                                              freqXYData.device_id()),
+                buffer);
+}
+
+auto buffered_pack_forward(
+    const gpu::StreamType stream, const int maxNumXYPlanes, const GPUArrayView1D<int>& numZSticks,
+    const GPUArrayView1D<int>& indices,
+    const GPUArrayView3D<typename gpu::fft::ComplexType<double>::type>& freqXYData,
+    GPUArrayView3D<typename gpu::fft::ComplexType<double>::type> buffer) -> void {
+  buffered_pack_forward_launch(stream, maxNumXYPlanes, numZSticks, indices, freqXYData, buffer);
+}
+
+auto buffered_pack_forward(
+    const gpu::StreamType stream, const int maxNumXYPlanes, const GPUArrayView1D<int>& numZSticks,
+    const GPUArrayView1D<int>& indices,
+    const GPUArrayView3D<typename gpu::fft::ComplexType<double>::type>& freqXYData,
+    GPUArrayView3D<typename gpu::fft::ComplexType<float>::type> buffer) -> void {
+  buffered_pack_forward_launch(stream, maxNumXYPlanes, numZSticks, indices, freqXYData, buffer);
+}
+
+auto buffered_pack_forward(
+    const gpu::StreamType stream, const int maxNumXYPlanes, const GPUArrayView1D<int>& numZSticks,
+    const GPUArrayView1D<int>& indices,
+    const GPUArrayView3D<typename gpu::fft::ComplexType<float>::type>& freqXYData,
+    GPUArrayView3D<typename gpu::fft::ComplexType<float>::type> buffer) -> void {
+  buffered_pack_forward_launch(stream, maxNumXYPlanes, numZSticks, indices, freqXYData, buffer);
+}
+
+} // namespace spfft
+
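Each function above carries overloads that combine double precision z-stick data with a single precision buffer, presumably so that the exchanged volume can be halved while the transform itself stays in double precision; ConvertComplex performs the per-element cast during packing and unpacking. Since all rank blocks have equal size, a plain MPI_Alltoall matches this layout. A sketch under the assumption of a float buffer (variable names are illustrative):

// Each rank block holds maxNumZSticks * maxNumXYPlanes complex values,
// i.e. twice as many floats:
// MPI_Alltoall(sendBuffer, 2 * maxNumZSticks * maxNumXYPlanes, MPI_FLOAT,
//              recvBuffer, 2 * maxNumZSticks * maxNumXYPlanes, MPI_FLOAT, comm);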
diff --git a/src/transpose/gpu_kernels/buffered_kernels.hpp b/src/transpose/gpu_kernels/buffered_kernels.hpp
new file mode 100644
index 0000000..e257162
--- /dev/null
+++ b/src/transpose/gpu_kernels/buffered_kernels.hpp
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef SPFFT_BUFFERED_KERNELS_HPP
+#define SPFFT_BUFFERED_KERNELS_HPP
+#include <complex>
+#include "gpu_util/gpu_fft_api.hpp"
+#include "memory/gpu_array_view.hpp"
+
+namespace spfft {
+
+auto buffered_pack_backward(
+    const gpu::StreamType stream, const int maxNumXYPlanes, const GPUArrayView1D<int>& numXYPlanes,
+    const GPUArrayView1D<int>& xyPlaneOffsets,
+    const GPUArrayView2D<typename gpu::fft::ComplexType<double>::type>& freqZData,
+    GPUArrayView3D<typename gpu::fft::ComplexType<double>::type> buffer) -> void;
+
+auto buffered_pack_backward(
+    const gpu::StreamType stream, const int maxNumXYPlanes, const GPUArrayView1D<int>& numXYPlanes,
+    const GPUArrayView1D<int>& xyPlaneOffsets,
+    const GPUArrayView2D<typename gpu::fft::ComplexType<double>::type>& freqZData,
+    GPUArrayView3D<typename gpu::fft::ComplexType<float>::type> buffer) -> void;
+
+auto buffered_pack_backward(
+    const gpu::StreamType stream, const int maxNumXYPlanes, const GPUArrayView1D<int>& numXYPlanes,
+    const GPUArrayView1D<int>& xyPlaneOffsets,
+    const GPUArrayView2D<typename gpu::fft::ComplexType<float>::type>& freqZData,
+    GPUArrayView3D<typename gpu::fft::ComplexType<float>::type> buffer) -> void;
+
+auto buffered_unpack_backward(
+    const gpu::StreamType stream, const int maxNumXYPlanes, const GPUArrayView1D<int>& numZSticks,
+    const GPUArrayView1D<int>& indices,
+    const GPUArrayView3D<typename gpu::fft::ComplexType<double>::type>& buffer,
+    GPUArrayView3D<typename gpu::fft::ComplexType<double>::type> freqXYData) -> void;
+
+auto buffered_unpack_backward(
+    const gpu::StreamType stream, const int maxNumXYPlanes, const GPUArrayView1D<int>& numZSticks,
+    const GPUArrayView1D<int>& indices,
+    const GPUArrayView3D<typename gpu::fft::ComplexType<float>::type>& buffer,
+    GPUArrayView3D<typename gpu::fft::ComplexType<double>::type> freqXYData) -> void;
+
+auto buffered_unpack_backward(
+    const gpu::StreamType stream, const int maxNumXYPlanes, const GPUArrayView1D<int>& numZSticks,
+    const GPUArrayView1D<int>& indices,
+    const GPUArrayView3D<typename gpu::fft::ComplexType<float>::type>& buffer,
+    GPUArrayView3D<typename gpu::fft::ComplexType<float>::type> freqXYData) -> void;
+
+auto buffered_unpack_forward(
+    const gpu::StreamType stream, const int maxNumXYPlanes, const GPUArrayView1D<int>& numXYPlanes,
+    const GPUArrayView1D<int>& xyPlaneOffsets,
+    const GPUArrayView3D<typename gpu::fft::ComplexType<double>::type>& buffer,
+    GPUArrayView2D<typename gpu::fft::ComplexType<double>::type> freqZData) -> void;
+
+auto buffered_unpack_forward(
+    const gpu::StreamType stream, const int maxNumXYPlanes, const GPUArrayView1D<int>& numXYPlanes,
+    const GPUArrayView1D<int>& xyPlaneOffsets,
+    const GPUArrayView3D<typename gpu::fft::ComplexType<float>::type>& buffer,
+    GPUArrayView2D<typename gpu::fft::ComplexType<double>::type> freqZData) -> void;
+
+auto buffered_unpack_forward(
+    const gpu::StreamType stream, const int maxNumXYPlanes, const GPUArrayView1D<int>& numXYPlanes,
+    const GPUArrayView1D<int>& xyPlaneOffsets,
+    const GPUArrayView3D<typename gpu::fft::ComplexType<float>::type>& buffer,
+    GPUArrayView2D<typename gpu::fft::ComplexType<float>::type> freqZData) -> void;
+
+auto buffered_pack_forward(
+    const gpu::StreamType stream, const int maxNumXYPlanes, const GPUArrayView1D<int>& numZSticks,
+    const GPUArrayView1D<int>& indices,
+    const GPUArrayView3D<typename gpu::fft::ComplexType<double>::type>& freqXYData,
+    GPUArrayView3D<typename gpu::fft::ComplexType<double>::type> buffer) -> void;
+
+auto buffered_pack_forward(
+    const gpu::StreamType stream, const int maxNumXYPlanes, const GPUArrayView1D<int>& numZSticks,
+    const GPUArrayView1D<int>& indices,
+    const GPUArrayView3D<typename gpu::fft::ComplexType<double>::type>& freqXYData,
+    GPUArrayView3D<typename gpu::fft::ComplexType<float>::type> buffer) -> void;
+
+auto buffered_pack_forward(
+    const gpu::StreamType stream, const int maxNumXYPlanes, const GPUArrayView1D<int>& numZSticks,
+    const GPUArrayView1D<int>& indices,
+    const GPUArrayView3D<typename gpu::fft::ComplexType<float>::type>& freqXYData,
+    GPUArrayView3D<typename gpu::fft::ComplexType<float>::type> buffer) -> void;
+
+} // namespace spfft
+
+#endif
diff --git a/src/transpose/gpu_kernels/compact_buffered_kernels.cu b/src/transpose/gpu_kernels/compact_buffered_kernels.cu
new file mode 100644
index 0000000..d916313
--- /dev/null
+++ b/src/transpose/gpu_kernels/compact_buffered_kernels.cu
@@ -0,0 +1,294 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <algorithm>
+#include <cassert>
+#include "gpu_util/complex_conversion.cuh"
+#include "gpu_util/gpu_fft_api.hpp"
+#include "gpu_util/gpu_runtime.hpp"
+#include "memory/array_view_utility.hpp"
+#include "memory/gpu_array_const_view.hpp"
+
+namespace spfft {
+
+template <typename DataType, typename BufferType>
+__global__ static void compact_buffered_pack_backward_kernel(
+    const GPUArrayConstView1D<int> numXYPlanes, const GPUArrayConstView1D<int> xyPlaneOffsets,
+    const GPUArrayConstView2D<DataType> freqZData, GPUArrayView1D<BufferType> buffer) {
+  const int xyPlaneIndex = threadIdx.x + blockIdx.x * blockDim.x;
+  int bufferOffset = 0;
+  for (int r = 0; r < numXYPlanes.size(); ++r) {
+    const int numCurrentXYPlanes = numXYPlanes(r);
+    if (xyPlaneIndex < numCurrentXYPlanes) {
+      for (int zStickIndex = blockIdx.y; zStickIndex < freqZData.dim_outer();
+           zStickIndex += gridDim.y) {
+        buffer(bufferOffset + zStickIndex * numCurrentXYPlanes + xyPlaneIndex) =
+            ConvertComplex<BufferType, DataType>::apply(
+                freqZData(zStickIndex, xyPlaneIndex + xyPlaneOffsets(r)));
+      }
+    }
+    bufferOffset += numCurrentXYPlanes * freqZData.dim_outer();
+  }
+}
+
+template <typename DataType, typename BufferType>
+static auto compact_buffered_pack_backward_launch(const gpu::StreamType stream,
+                                                  const int maxNumXYPlanes,
+                                                  const GPUArrayView1D<int> numXYPlanes,
+                                                  const GPUArrayView1D<int>& xyPlaneOffsets,
+                                                  const GPUArrayView2D<DataType>& freqZData,
+                                                  GPUArrayView1D<BufferType> buffer) -> void {
+  assert(xyPlaneOffsets.size() == numXYPlanes.size());
+  const dim3 threadBlock(128);
+  const dim3 threadGrid((maxNumXYPlanes + threadBlock.x - 1) / threadBlock.x,
+                        std::min(freqZData.dim_outer(), 4320));
+  launch_kernel(compact_buffered_pack_backward_kernel<DataType, BufferType>, threadGrid,
+                threadBlock, 0, stream, numXYPlanes, xyPlaneOffsets, freqZData, buffer);
+}
+
+auto compact_buffered_pack_backward(
+    const gpu::StreamType stream, const int maxNumXYPlanes, const GPUArrayView1D<int> numXYPlanes,
+    const GPUArrayView1D<int>& xyPlaneOffsets,
+    const GPUArrayView2D<typename gpu::fft::ComplexType<double>::type>& freqZData,
+    GPUArrayView1D<typename gpu::fft::ComplexType<double>::type> buffer) -> void {
+  compact_buffered_pack_backward_launch(stream, maxNumXYPlanes, numXYPlanes, xyPlaneOffsets,
+                                        freqZData, buffer);
+}
+
+auto compact_buffered_pack_backward(
+    const gpu::StreamType stream, const int maxNumXYPlanes, const GPUArrayView1D<int> numXYPlanes,
+    const GPUArrayView1D<int>& xyPlaneOffsets,
+    const GPUArrayView2D<typename gpu::fft::ComplexType<double>::type>& freqZData,
+    GPUArrayView1D<typename gpu::fft::ComplexType<float>::type> buffer) -> void {
+  compact_buffered_pack_backward_launch(stream, maxNumXYPlanes, numXYPlanes, xyPlaneOffsets,
+                                        freqZData, buffer);
+}
+
+auto compact_buffered_pack_backward(
+    const gpu::StreamType stream, const int maxNumXYPlanes, const GPUArrayView1D<int> numXYPlanes,
+    const GPUArrayView1D<int>& xyPlaneOffsets,
+    const GPUArrayView2D<typename gpu::fft::ComplexType<float>::type>& freqZData,
+    GPUArrayView1D<typename gpu::fft::ComplexType<float>::type> buffer) -> void {
+  compact_buffered_pack_backward_launch(stream, maxNumXYPlanes, numXYPlanes, xyPlaneOffsets,
+                                        freqZData, buffer);
+}
+
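In contrast to the padded variant, the compact layout concatenates per-rank blocks of size numXYPlanes(r) * numLocalZSticks without padding, matching the bufferOffset accumulation in the kernel above; the exchange therefore needs per-rank counts and displacements, as used by MPI_Alltoallv. A sketch of the host-side bookkeeping (numRanks, numXYPlanesHost and numLocalZSticks are hypothetical names):

// Per-rank send counts and displacements for the compact buffer.
std::vector<int> counts(numRanks), displs(numRanks);
int offset = 0;
for (int r = 0; r < numRanks; ++r) {
  counts[r] = numXYPlanesHost[r] * numLocalZSticks;  // block size for rank r
  displs[r] = offset;
  offset += counts[r];
}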
+template <typename DataType, typename BufferType>
+__global__ static void compact_buffered_unpack_backward_kernel(
+    const int maxNumZSticks, const GPUArrayConstView1D<int> numZSticks,
+    const GPUArrayConstView1D<int> indices, const GPUArrayConstView1D<BufferType> buffer,
+    GPUArrayView2D<DataType> freqXYData) {
+  const int xyPlaneIndex = threadIdx.x + blockIdx.x * blockDim.x;
+  int bufferOffset = 0;
+  if (xyPlaneIndex < freqXYData.dim_outer()) {
+    for (int r = 0; r < numZSticks.size(); ++r) {
+      const int numCurrentZSticks = numZSticks(r);
+      for (int zStickIndex = blockIdx.y; zStickIndex < numCurrentZSticks;
+           zStickIndex += gridDim.y) {
+        const int currentIndex = indices(r * maxNumZSticks + zStickIndex);
+        freqXYData(xyPlaneIndex, currentIndex) = ConvertComplex<DataType, BufferType>::apply(
+            buffer(bufferOffset + zStickIndex * freqXYData.dim_outer() + xyPlaneIndex));
+      }
+      bufferOffset += numCurrentZSticks * freqXYData.dim_outer();
+    }
+  }
+}
+
+template <typename DataType, typename BufferType>
+static auto compact_buffered_unpack_backward_launch(const gpu::StreamType stream,
+                                                    const int maxNumZSticks,
+                                                    const GPUArrayView1D<int>& numZSticks,
+                                                    const GPUArrayView1D<int>& indices,
+                                                    const GPUArrayView1D<BufferType>& buffer,
+                                                    GPUArrayView3D<DataType> freqXYData) -> void {
+  const dim3 threadBlock(128);
+  const dim3 threadGrid((freqXYData.dim_outer() + threadBlock.x - 1) / threadBlock.x,
+                        std::min(maxNumZSticks, 4320));
+  launch_kernel(compact_buffered_unpack_backward_kernel<DataType, BufferType>, threadGrid,
+                threadBlock, 0, stream, maxNumZSticks, numZSticks, indices, buffer,
+                GPUArrayView2D<DataType>(freqXYData.data(), freqXYData.dim_outer(),
+                                         freqXYData.dim_mid() * freqXYData.dim_inner(),
+                                         freqXYData.device_id()));
+}
+
+auto compact_buffered_unpack_backward(
+    const gpu::StreamType stream, const int maxNumZSticks, const GPUArrayView1D<int>& numZSticks,
+    const GPUArrayView1D<int>& indices,
+    const GPUArrayView1D<typename gpu::fft::ComplexType<double>::type>& buffer,
+    GPUArrayView3D<typename gpu::fft::ComplexType<double>::type> freqXYData) -> void {
+  compact_buffered_unpack_backward_launch(stream, maxNumZSticks, numZSticks, indices, buffer,
+                                          freqXYData);
+}
+
+auto compact_buffered_unpack_backward(
+    const gpu::StreamType stream, const int maxNumZSticks, const GPUArrayView1D<int>& numZSticks,
+    const GPUArrayView1D<int>& indices,
+    const GPUArrayView1D<typename gpu::fft::ComplexType<float>::type>& buffer,
+    GPUArrayView3D<typename gpu::fft::ComplexType<double>::type> freqXYData) -> void {
+  compact_buffered_unpack_backward_launch(stream, maxNumZSticks, numZSticks, indices, buffer,
+                                          freqXYData);
+}
+
+auto compact_buffered_unpack_backward(
+    const gpu::StreamType stream, const int maxNumZSticks, const GPUArrayView1D<int>& numZSticks,
+    const GPUArrayView1D<int>& indices,
+    const GPUArrayView1D<typename gpu::fft::ComplexType<float>::type>& buffer,
+    GPUArrayView3D<typename gpu::fft::ComplexType<float>::type> freqXYData) -> void {
+  compact_buffered_unpack_backward_launch(stream, maxNumZSticks, numZSticks, indices, buffer,
+                                          freqXYData);
+}
+
xyPlaneIndex = threadIdx.x + blockIdx.x * blockDim.x; + int bufferOffset = 0; + for (int r = 0; r < numXYPlanes.size(); ++r) { + const int numCurrentXYPlanes = numXYPlanes(r); + if (xyPlaneIndex < numCurrentXYPlanes) { + for (int zStickIndex = blockIdx.y; zStickIndex < freqZData.dim_outer(); + zStickIndex += gridDim.y) { + freqZData(zStickIndex, xyPlaneIndex + xyPlaneOffsets(r)) = + ConvertComplex::apply( + buffer(bufferOffset + zStickIndex * numCurrentXYPlanes + xyPlaneIndex)); + } + } + bufferOffset += numCurrentXYPlanes * freqZData.dim_outer(); + } +} + +template +static auto compact_buffered_unpack_forward_launch(const gpu::StreamType stream, + const int maxNumXYPlanes, + const GPUArrayView1D numXYPlanes, + const GPUArrayView1D& xyPlaneOffsets, + const GPUArrayView1D& buffer, + GPUArrayView2D freqZData) -> void { + assert(xyPlaneOffsets.size() == numXYPlanes.size()); + const dim3 threadBlock(128); + const dim3 threadGrid((maxNumXYPlanes + threadBlock.x - 1) / threadBlock.x, + std::min(freqZData.dim_outer(), 4320)); + launch_kernel(compact_buffered_unpack_forward_kernel, threadGrid, + threadBlock, 0, stream, numXYPlanes, xyPlaneOffsets, buffer, freqZData); +} + +auto compact_buffered_unpack_forward( + const gpu::StreamType stream, const int maxNumXYPlanes, const GPUArrayView1D numXYPlanes, + const GPUArrayView1D& xyPlaneOffsets, + const GPUArrayView1D::type>& buffer, + GPUArrayView2D::type> freqZData) -> void { + compact_buffered_unpack_forward_launch(stream, maxNumXYPlanes, numXYPlanes, xyPlaneOffsets, + buffer, freqZData); +} + +auto compact_buffered_unpack_forward( + const gpu::StreamType stream, const int maxNumXYPlanes, const GPUArrayView1D numXYPlanes, + const GPUArrayView1D& xyPlaneOffsets, + const GPUArrayView1D::type>& buffer, + GPUArrayView2D::type> freqZData) -> void { + compact_buffered_unpack_forward_launch(stream, maxNumXYPlanes, numXYPlanes, xyPlaneOffsets, + buffer, freqZData); +} + +auto compact_buffered_unpack_forward( + const gpu::StreamType stream, const int maxNumXYPlanes, const GPUArrayView1D numXYPlanes, + const GPUArrayView1D& xyPlaneOffsets, + const GPUArrayView1D::type>& buffer, + GPUArrayView2D::type> freqZData) -> void { + compact_buffered_unpack_forward_launch(stream, maxNumXYPlanes, numXYPlanes, xyPlaneOffsets, + buffer, freqZData); +} + +template +__global__ static void compact_buffered_pack_forward_kernel( + const int maxNumZSticks, const GPUArrayConstView1D numZSticks, + const GPUArrayConstView1D indices, const GPUArrayConstView2D freqXYData, + GPUArrayView1D buffer) { + const int xyPlaneIndex = threadIdx.x + blockIdx.x * blockDim.x; + int bufferOffset = 0; + if (xyPlaneIndex < freqXYData.dim_outer()) { + for (int r = 0; r < numZSticks.size(); ++r) { + const int numCurrentZSticks = numZSticks(r); + for (int zStickIndex = blockIdx.y; zStickIndex < numCurrentZSticks; + zStickIndex += gridDim.y) { + const int currentIndex = indices(r * maxNumZSticks + zStickIndex); + buffer(bufferOffset + zStickIndex * freqXYData.dim_outer() + xyPlaneIndex) = + ConvertComplex::apply(freqXYData(xyPlaneIndex, currentIndex)); + } + bufferOffset += numCurrentZSticks * freqXYData.dim_outer(); + } + } +} + +template +static auto compact_buffered_pack_forward_launch(const gpu::StreamType stream, + const int maxNumZSticks, + const GPUArrayView1D& numZSticks, + const GPUArrayView1D& indices, + const GPUArrayView3D& freqXYData, + GPUArrayView1D buffer) -> void { + const dim3 threadBlock(128); + const dim3 threadGrid((freqXYData.dim_outer() + threadBlock.x - 1) / threadBlock.x, + 
std::min(maxNumZSticks, 4320)); + launch_kernel(compact_buffered_pack_forward_kernel, threadGrid, + threadBlock, 0, stream, maxNumZSticks, numZSticks, indices, + GPUArrayConstView2D(freqXYData.data(), freqXYData.dim_outer(), + freqXYData.dim_mid() * freqXYData.dim_inner(), + freqXYData.device_id()), + buffer); +} + +auto compact_buffered_pack_forward( + const gpu::StreamType stream, const int maxNumZSticks, const GPUArrayView1D& numZSticks, + const GPUArrayView1D& indices, + const GPUArrayView3D::type>& freqXYData, + GPUArrayView1D::type> buffer) -> void { + compact_buffered_pack_forward_launch(stream, maxNumZSticks, numZSticks, indices, freqXYData, + buffer); +} + +auto compact_buffered_pack_forward( + const gpu::StreamType stream, const int maxNumZSticks, const GPUArrayView1D& numZSticks, + const GPUArrayView1D& indices, + const GPUArrayView3D::type>& freqXYData, + GPUArrayView1D::type> buffer) -> void { + compact_buffered_pack_forward_launch(stream, maxNumZSticks, numZSticks, indices, freqXYData, + buffer); +} + +auto compact_buffered_pack_forward( + const gpu::StreamType stream, const int maxNumZSticks, const GPUArrayView1D& numZSticks, + const GPUArrayView1D& indices, + const GPUArrayView3D::type>& freqXYData, + GPUArrayView1D::type> buffer) -> void { + compact_buffered_pack_forward_launch(stream, maxNumZSticks, numZSticks, indices, freqXYData, + buffer); +} + +} // namespace spfft + diff --git a/src/transpose/gpu_kernels/compact_buffered_kernels.hpp b/src/transpose/gpu_kernels/compact_buffered_kernels.hpp new file mode 100644 index 0000000..247d698 --- /dev/null +++ b/src/transpose/gpu_kernels/compact_buffered_kernels.hpp @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2019 ETH Zurich, Simon Frasch + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
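Annotation (not part of the patch): the kernels above all share one buffer layout, which is easiest to see in plain host code. The sketch below is a minimal reference implementation of the backward packing, under the assumption that freqZData is stored stick-major as in the GPU views; the function name and signature are hypothetical and for illustration only.

#include <complex>
#include <cstddef>
#include <vector>

std::vector<std::complex<double>> pack_backward_reference(
    const std::vector<int>& numXYPlanes, const std::vector<int>& xyPlaneOffsets,
    const std::vector<std::vector<std::complex<double>>>& freqZData) {
  std::vector<std::complex<double>> buffer;
  for (std::size_t r = 0; r < numXYPlanes.size(); ++r) {
    // rank r receives one contiguous chunk; within it, values are ordered
    // stick-major: chunkBase + stick * numXYPlanes[r] + plane
    for (std::size_t stick = 0; stick < freqZData.size(); ++stick) {
      for (int plane = 0; plane < numXYPlanes[r]; ++plane) {
        buffer.push_back(freqZData[stick][plane + xyPlaneOffsets[r]]);
      }
    }
  }
  return buffer;
}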
diff --git a/src/transpose/gpu_kernels/compact_buffered_kernels.hpp b/src/transpose/gpu_kernels/compact_buffered_kernels.hpp
new file mode 100644
index 0000000..247d698
--- /dev/null
+++ b/src/transpose/gpu_kernels/compact_buffered_kernels.hpp
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef SPFFT_COMPACT_BUFFERED_KERNELS_HPP
+#define SPFFT_COMPACT_BUFFERED_KERNELS_HPP
+#include <complex>
+#include "gpu_util/gpu_fft_api.hpp"
+#include "memory/gpu_array_view.hpp"
+
+namespace spfft {
+
+auto compact_buffered_pack_backward(
+    const gpu::StreamType stream, const int maxNumXYPlanes, const GPUArrayView1D<int> numXYPlanes,
+    const GPUArrayView1D<int>& xyPlaneOffsets,
+    const GPUArrayView2D<typename gpu::fft::ComplexType<double>::type>& freqZData,
+    GPUArrayView1D<typename gpu::fft::ComplexType<double>::type> buffer) -> void;
+
+auto compact_buffered_pack_backward(
+    const gpu::StreamType stream, const int maxNumXYPlanes, const GPUArrayView1D<int> numXYPlanes,
+    const GPUArrayView1D<int>& xyPlaneOffsets,
+    const GPUArrayView2D<typename gpu::fft::ComplexType<float>::type>& freqZData,
+    GPUArrayView1D<typename gpu::fft::ComplexType<float>::type> buffer) -> void;
+
+auto compact_buffered_pack_backward(
+    const gpu::StreamType stream, const int maxNumXYPlanes, const GPUArrayView1D<int> numXYPlanes,
+    const GPUArrayView1D<int>& xyPlaneOffsets,
+    const GPUArrayView2D<typename gpu::fft::ComplexType<double>::type>& freqZData,
+    GPUArrayView1D<typename gpu::fft::ComplexType<float>::type> buffer) -> void;
+
+auto compact_buffered_unpack_backward(
+    const gpu::StreamType stream, const int maxNumZSticks, const GPUArrayView1D<int>& numZSticks,
+    const GPUArrayView1D<int>& indices,
+    const GPUArrayView1D<typename gpu::fft::ComplexType<double>::type>& buffer,
+    GPUArrayView3D<typename gpu::fft::ComplexType<double>::type> freqXYData) -> void;
+
+auto compact_buffered_unpack_backward(
+    const gpu::StreamType stream, const int maxNumZSticks, const GPUArrayView1D<int>& numZSticks,
+    const GPUArrayView1D<int>& indices,
+    const GPUArrayView1D<typename gpu::fft::ComplexType<float>::type>& buffer,
+    GPUArrayView3D<typename gpu::fft::ComplexType<float>::type> freqXYData) -> void;
+
+auto compact_buffered_unpack_backward(
+    const gpu::StreamType stream, const int maxNumZSticks, const GPUArrayView1D<int>& numZSticks,
+    const GPUArrayView1D<int>& indices,
+    const GPUArrayView1D<typename gpu::fft::ComplexType<float>::type>& buffer,
+    GPUArrayView3D<typename gpu::fft::ComplexType<double>::type> freqXYData) -> void;
+
+auto compact_buffered_unpack_forward(
+    const gpu::StreamType stream, const int maxNumXYPlanes, const GPUArrayView1D<int> numXYPlanes,
+    const GPUArrayView1D<int>& xyPlaneOffsets,
+    const GPUArrayView1D<typename gpu::fft::ComplexType<double>::type>& buffer,
+    GPUArrayView2D<typename gpu::fft::ComplexType<double>::type> freqZData) -> void;
+
+auto compact_buffered_unpack_forward(
+    const gpu::StreamType stream, const int maxNumXYPlanes, const GPUArrayView1D<int> numXYPlanes,
+    const GPUArrayView1D<int>& xyPlaneOffsets,
+    const GPUArrayView1D<typename gpu::fft::ComplexType<float>::type>& buffer,
+    GPUArrayView2D<typename gpu::fft::ComplexType<float>::type> freqZData) -> void;
+
+auto compact_buffered_unpack_forward(
+    const gpu::StreamType stream, const int maxNumXYPlanes, const GPUArrayView1D<int> numXYPlanes,
+    const GPUArrayView1D<int>& xyPlaneOffsets,
+    const GPUArrayView1D<typename gpu::fft::ComplexType<float>::type>& buffer,
+    GPUArrayView2D<typename gpu::fft::ComplexType<double>::type> freqZData) -> void;
+
+auto compact_buffered_pack_forward(
+    const gpu::StreamType stream, const int maxNumZSticks, const GPUArrayView1D<int>& numZSticks,
+    const GPUArrayView1D<int>& indices,
+    const GPUArrayView3D<typename gpu::fft::ComplexType<double>::type>& freqXYData,
+    GPUArrayView1D<typename gpu::fft::ComplexType<double>::type> buffer) -> void;
+
+auto compact_buffered_pack_forward(
+    const gpu::StreamType stream, const int maxNumZSticks, const GPUArrayView1D<int>& numZSticks,
+    const GPUArrayView1D<int>& indices,
+    const GPUArrayView3D<typename gpu::fft::ComplexType<float>::type>& freqXYData,
+    GPUArrayView1D<typename gpu::fft::ComplexType<float>::type> buffer) -> void;
+
+auto compact_buffered_pack_forward(
+    const gpu::StreamType stream, const int maxNumZSticks, const GPUArrayView1D<int>& numZSticks,
+    const GPUArrayView1D<int>& indices,
+    const GPUArrayView3D<typename gpu::fft::ComplexType<double>::type>& freqXYData,
+    GPUArrayView1D<typename gpu::fft::ComplexType<float>::type> buffer) -> void;
+
+} // namespace spfft
+
+#endif
diff --git a/src/transpose/gpu_kernels/local_transpose_kernels.cu b/src/transpose/gpu_kernels/local_transpose_kernels.cu
new file mode 100644
index 0000000..064aa59
--- /dev/null
+++ b/src/transpose/gpu_kernels/local_transpose_kernels.cu
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <algorithm>
+#include <cassert>
+#include "gpu_util/gpu_fft_api.hpp"
+#include "gpu_util/gpu_runtime.hpp"
+#include "memory/gpu_array_const_view.hpp"
+#include "memory/array_view_utility.hpp"
+
+namespace spfft {
+
+// ------------------
+// Backward
+// ------------------
+
+// Places data from z-sticks into a full 3d grid
+template <typename T>
+__global__ static void transpose_backward_kernel(const GPUArrayConstView1D<int> indices,
+                                                 const GPUArrayConstView2D<T> freqZData,
+                                                 GPUArrayView2D<T> spaceDomainFlat) {
+  const int z = threadIdx.x + blockIdx.x * blockDim.x;
+
+  if (z < freqZData.dim_inner()) {
+    for (int stickIndex = blockIdx.y; stickIndex < indices.size(); stickIndex += gridDim.y) {
+      const auto stickXYIndex = indices(stickIndex);
+      spaceDomainFlat(z, stickXYIndex) = freqZData(stickIndex, z);
+    }
+  }
+}
+
+auto local_transpose_backward(
+    const gpu::StreamType stream, const GPUArrayView1D<int> indices,
+    const GPUArrayView2D<typename gpu::fft::ComplexType<double>::type>& freqZData,
+    GPUArrayView3D<typename gpu::fft::ComplexType<double>::type> spaceDomain) -> void {
+  assert(indices.size() == freqZData.dim_outer());
+  assert(indices.size() <= spaceDomain.dim_inner() * spaceDomain.dim_mid());
+  assert(spaceDomain.dim_outer() == freqZData.dim_inner());
+  const dim3 threadBlock(128);
+  const dim3 threadGrid((freqZData.dim_inner() + threadBlock.x - 1) / threadBlock.x,
+                        std::min(freqZData.dim_outer(), 2160));
+  launch_kernel(transpose_backward_kernel<typename gpu::fft::ComplexType<double>::type>,
+                threadGrid, threadBlock, 0, stream, indices, freqZData,
+                GPUArrayView2D<typename gpu::fft::ComplexType<double>::type>(
+                    spaceDomain.data(), spaceDomain.dim_outer(),
+                    spaceDomain.dim_mid() * spaceDomain.dim_inner(), spaceDomain.device_id()));
+}
+
+auto local_transpose_backward(
+    const gpu::StreamType stream, const GPUArrayView1D<int> indices,
+    const GPUArrayView2D<typename gpu::fft::ComplexType<float>::type>& freqZData,
+    GPUArrayView3D<typename gpu::fft::ComplexType<float>::type> spaceDomain) -> void {
+  assert(indices.size() == freqZData.dim_outer());
+  assert(indices.size() <= spaceDomain.dim_inner() * spaceDomain.dim_mid());
+  assert(spaceDomain.dim_outer() == freqZData.dim_inner());
+  const dim3 threadBlock(128);
+  const dim3 threadGrid((freqZData.dim_inner() + threadBlock.x - 1) / threadBlock.x,
+                        std::min(freqZData.dim_outer(), 2160));
+  launch_kernel(transpose_backward_kernel<typename gpu::fft::ComplexType<float>::type>,
+                threadGrid, threadBlock, 0, stream, indices, freqZData,
+                GPUArrayView2D<typename gpu::fft::ComplexType<float>::type>(
+                    spaceDomain.data(), spaceDomain.dim_outer(),
+                    spaceDomain.dim_mid() * spaceDomain.dim_inner(), spaceDomain.device_id()));
+}
+
+// ------------------
+// Forward
+// ------------------
+
+template <typename T>
+__global__ static void transpose_forward_kernel(const GPUArrayConstView1D<int> indices,
+                                                const GPUArrayConstView2D<T> spaceDomainFlat,
+                                                GPUArrayView2D<T> freqZData) {
+  const int z = threadIdx.x + blockIdx.x * blockDim.x;
+
+  if (z < freqZData.dim_inner()) {
+    for (int stickIndex = blockIdx.y; stickIndex < indices.size(); stickIndex += gridDim.y) {
+      const auto stickXYIndex = indices(stickIndex);
+      freqZData(stickIndex, z) = spaceDomainFlat(z, stickXYIndex);
+    }
+  }
+}
+
+auto local_transpose_forward(
+    const gpu::StreamType stream, const GPUArrayView1D<int> indices,
+    const GPUArrayView3D<typename gpu::fft::ComplexType<double>::type>& spaceDomain,
+    GPUArrayView2D<typename gpu::fft::ComplexType<double>::type> freqZData) -> void {
+  assert(indices.size() == freqZData.dim_outer());
+  assert(indices.size() <= spaceDomain.dim_inner() * spaceDomain.dim_mid());
+  assert(spaceDomain.dim_outer() == freqZData.dim_inner());
+  const dim3 threadBlock(128);
+  const dim3 threadGrid((freqZData.dim_inner() + threadBlock.x - 1) / threadBlock.x,
+                        std::min(freqZData.dim_outer(), 2160));
+  launch_kernel(transpose_forward_kernel<typename gpu::fft::ComplexType<double>::type>,
+                threadGrid, threadBlock, 0, stream, indices,
+                GPUArrayConstView2D<typename gpu::fft::ComplexType<double>::type>(
+                    spaceDomain.data(), spaceDomain.dim_outer(),
+                    spaceDomain.dim_mid() * spaceDomain.dim_inner(), spaceDomain.device_id()),
+                freqZData);
+}
+
+auto local_transpose_forward(
+    const gpu::StreamType stream, const GPUArrayView1D<int> indices,
+    const GPUArrayView3D<typename gpu::fft::ComplexType<float>::type>& spaceDomain,
+    GPUArrayView2D<typename gpu::fft::ComplexType<float>::type> freqZData) -> void {
+  assert(indices.size() == freqZData.dim_outer());
+  assert(indices.size() <= spaceDomain.dim_inner() * spaceDomain.dim_mid());
+  assert(spaceDomain.dim_outer() == freqZData.dim_inner());
+  const dim3 threadBlock(128);
+  const dim3 threadGrid((freqZData.dim_inner() + threadBlock.x - 1) / threadBlock.x,
+                        std::min(freqZData.dim_outer(), 2160));
+  launch_kernel(transpose_forward_kernel<typename gpu::fft::ComplexType<float>::type>,
+                threadGrid, threadBlock, 0, stream, indices,
+                GPUArrayConstView2D<typename gpu::fft::ComplexType<float>::type>(
+                    spaceDomain.data(), spaceDomain.dim_outer(),
+                    spaceDomain.dim_mid() * spaceDomain.dim_inner(), spaceDomain.device_id()),
+                freqZData);
+}
+
+} // namespace spfft
+
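Annotation (not part of the patch): all transpose kernels above share the same launch geometry, so a short sketch may help. The x dimension covers z values with 128 threads per block; the y dimension is capped (2160 here, 4320 in the compact buffered kernels) and the kernels stride over remaining sticks via gridDim.y, so the cap only bounds the grid size, not the work covered. The helper name and the interpretation of the cap are assumptions for illustration.

#include <algorithm>

dim3 make_transpose_grid(const int dimZ, const int numSticks, const int yCap /* e.g. 2160 */) {
  const dim3 threadBlock(128);
  // ceil(dimZ / 128) blocks in x; at most yCap blocks in y
  return dim3((dimZ + threadBlock.x - 1) / threadBlock.x,
              std::min(numSticks, yCap));
}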
diff --git a/src/transpose/gpu_kernels/local_transpose_kernels.hpp b/src/transpose/gpu_kernels/local_transpose_kernels.hpp
new file mode 100644
index 0000000..e5beccf
--- /dev/null
+++ b/src/transpose/gpu_kernels/local_transpose_kernels.hpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <complex>
+#include "gpu_util/gpu_fft_api.hpp"
+#include "memory/gpu_array_view.hpp"
+
+namespace spfft {
+
+// ------------------
+// Backward
+// ------------------
+
+auto local_transpose_backward(
+    const gpu::StreamType stream, const GPUArrayView1D<int> indices,
+    const GPUArrayView2D<typename gpu::fft::ComplexType<double>::type>& freqZData,
+    GPUArrayView3D<typename gpu::fft::ComplexType<double>::type> spaceDomain) -> void;
+
+auto local_transpose_backward(
+    const gpu::StreamType stream, const GPUArrayView1D<int> indices,
+    const GPUArrayView2D<typename gpu::fft::ComplexType<float>::type>& freqZData,
+    GPUArrayView3D<typename gpu::fft::ComplexType<float>::type> spaceDomain) -> void;
+
+// ------------------
+// Forward
+// ------------------
+
+auto local_transpose_forward(
+    const gpu::StreamType stream, const GPUArrayView1D<int> indices,
+    const GPUArrayView3D<typename gpu::fft::ComplexType<double>::type>& spaceDomain,
+    GPUArrayView2D<typename gpu::fft::ComplexType<double>::type> freqZData) -> void;
+
+auto local_transpose_forward(
+    const gpu::StreamType stream, const GPUArrayView1D<int> indices,
+    const GPUArrayView3D<typename gpu::fft::ComplexType<float>::type>& spaceDomain,
+    GPUArrayView2D<typename gpu::fft::ComplexType<float>::type> freqZData) -> void;
+
+} // namespace spfft
+
diff --git a/src/transpose/transpose.hpp b/src/transpose/transpose.hpp
new file mode 100644
index 0000000..88b3d34
--- /dev/null
+++ b/src/transpose/transpose.hpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef SPFFT_TRANSPOSE_HPP
+#define SPFFT_TRANSPOSE_HPP
+
+#include "spfft/config.h"
+#include "util/common_types.hpp"
+
+namespace spfft {
+
+class Transpose {
+public:
+  virtual auto pack_forward() -> void {}
+  virtual auto exchange_forward_start(const bool nonBlockingExchange) -> void = 0;
+  virtual auto exchange_forward_finalize() -> void {}
+  virtual auto unpack_forward() -> void {}
+
+  inline auto forward() -> void {
+    this->pack_forward();
+    this->exchange_forward_start(false);
+    this->exchange_forward_finalize();
+    this->unpack_forward();
+  }
+
+  virtual auto pack_backward() -> void {}
+  virtual auto exchange_backward_start(const bool nonBlockingExchange) -> void = 0;
+  virtual auto exchange_backward_finalize() -> void {}
+  virtual auto unpack_backward() -> void {}
+
+  inline auto backward() -> void {
+    this->pack_backward();
+    this->exchange_backward_start(false);
+    this->exchange_backward_finalize();
+    this->unpack_backward();
+  }
+
+  virtual ~Transpose() = default;
+};
+} // namespace spfft
+#endif
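Annotation (not part of the patch): the split pack/exchange-start/exchange-finalize/unpack interface exists so that callers are not forced through the blocking forward()/backward() wrappers. A usage sketch, mirroring the sequencing of forward() above but with a non-blocking exchange (the free function and the "other work" slot are hypothetical):

void forward_with_overlap(spfft::Transpose& t) {
  t.pack_forward();
  t.exchange_forward_start(true);  // request a non-blocking exchange
  // ... independent work could run here while the exchange is in flight ...
  t.exchange_forward_finalize();   // wait for the exchange to complete
  t.unpack_forward();
}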
diff --git a/src/transpose/transpose_gpu.hpp b/src/transpose/transpose_gpu.hpp
new file mode 100644
index 0000000..ef96470
--- /dev/null
+++ b/src/transpose/transpose_gpu.hpp
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef SPFFT_TRANSPOSE_GPU_HPP
+#define SPFFT_TRANSPOSE_GPU_HPP
+
+#include <cassert>
+#include <complex>
+#include <memory>
+#include <utility>
+#include <vector>
+#include "parameters/parameters.hpp"
+#include "spfft/config.h"
+#include "spfft/exceptions.hpp"
+#include "transpose.hpp"
+#include "util/common_types.hpp"
+#include "util/type_check.hpp"
+
+#include "gpu_util/gpu_fft_api.hpp"
+#include "gpu_util/gpu_stream_handle.hpp"
+#include "memory/array_view_utility.hpp"
+#include "memory/gpu_array.hpp"
+#include "memory/gpu_array_view.hpp"
+#include "transpose/gpu_kernels/local_transpose_kernels.hpp"
+
+namespace spfft {
+// Transpose Z sticks, such that data is represented by xy planes, where the y-dimension is
+// continuous and vice versa
+template <typename T>
+class TransposeGPU : public Transpose {
+  static_assert(IsFloatOrDouble<T>::value, "Type T must be float or double");
+  using ValueType = T;
+  using ComplexType = typename gpu::fft::ComplexType<T>::type;
+
+public:
+  TransposeGPU(const std::shared_ptr<Parameters>& param, GPUStreamHandle stream,
+               GPUArrayView3D<ComplexType> spaceDomainData,
+               GPUArrayView2D<ComplexType> freqDomainData)
+      : stream_(std::move(stream)),
+        spaceDomainData_(spaceDomainData),
+        freqDomainData_(freqDomainData),
+        indices_(param->num_z_sticks(0)) {
+    // single node only checks
+    assert(spaceDomainData.dim_outer() == freqDomainData.dim_inner());
+
+    // check data dimensions and parameters
+    assert(param->dim_x_freq() == spaceDomainData.dim_inner());
+    assert(param->dim_y() == spaceDomainData.dim_mid());
+    assert(param->dim_z() == spaceDomainData.dim_outer());
+    assert(param->dim_z() == freqDomainData.dim_inner());
+    assert(param->num_z_sticks(0) == freqDomainData.dim_outer());
+
+    // data must be disjoint
+    assert(disjoint(spaceDomainData, freqDomainData));
+
+    // copy xy indices
+    const auto zStickXYIndices = param->z_stick_xy_indices(0);
+
+    std::vector<int> transposedIndices;
+    transposedIndices.reserve(zStickXYIndices.size());
+
+    for (const auto& index : zStickXYIndices) {
+      const int x = index / param->dim_y();
+      const int y = index - x * param->dim_y();
+      transposedIndices.emplace_back(y * param->dim_x_freq() + x);
+    }
+
+    copy_to_gpu(transposedIndices, indices_);
+  }
+
+  auto exchange_backward_start(const bool) -> void override {
+    gpu::check_status(gpu::memset_async(
+        static_cast<void*>(spaceDomainData_.data()), 0,
+        spaceDomainData_.size() * sizeof(typename decltype(spaceDomainData_)::ValueType),
+        stream_.get()));
+    if (freqDomainData_.size() > 0 && spaceDomainData_.size() > 0) {
+      local_transpose_backward(stream_.get(), create_1d_view(indices_, 0, indices_.size()),
+                               freqDomainData_, spaceDomainData_);
+    }
+  }
+
+  auto unpack_backward() -> void override {}
+
+  auto exchange_forward_start(const bool) -> void override {
+    if (freqDomainData_.size() > 0 && spaceDomainData_.size() > 0) {
+      local_transpose_forward(stream_.get(), create_1d_view(indices_, 0, indices_.size()),
+                              spaceDomainData_, freqDomainData_);
+    }
+  }
+
+  auto unpack_forward() -> void override {}
+
+private:
+  GPUStreamHandle stream_;
+  GPUArrayView3D<ComplexType> spaceDomainData_;
+  GPUArrayView2D<ComplexType> freqDomainData_;
+  GPUArray<int> indices_;
+};
+} // namespace spfft
+#endif
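Annotation (not part of the patch): the index transposition done in the TransposeGPU constructor above is easy to get wrong, so here it is as a standalone sketch. A z-stick stored at xy index x * dimY + y is looked up in the space domain at y * dimXFreq + x, because the space-domain view keeps x as the innermost dimension. The helper name is hypothetical:

int transpose_xy_index(const int index, const int dimY, const int dimXFreq) {
  const int x = index / dimY;        // recover x from the stick's xy index
  const int y = index - x * dimY;    // remainder is y
  return y * dimXFreq + x;           // flat index with x innermost
}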
diff --git a/src/transpose/transpose_host.hpp b/src/transpose/transpose_host.hpp
new file mode 100644
index 0000000..4ec7a00
--- /dev/null
+++ b/src/transpose/transpose_host.hpp
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef SPFFT_TRANSPOSE_HOST_HPP
+#define SPFFT_TRANSPOSE_HOST_HPP
+
+#include <cassert>
+#include <complex>
+#include <cstring>
+#include <memory>
+#include <utility>
+#include <vector>
+#include "memory/host_array_view.hpp"
+#include "parameters/parameters.hpp"
+#include "spfft/config.h"
+#include "spfft/exceptions.hpp"
+#include "transpose.hpp"
+#include "util/common_types.hpp"
+#include "util/omp_definitions.hpp"
+#include "util/type_check.hpp"
+
+namespace spfft {
+// Transpose Z sticks, such that data is represented by xy planes, where the y-dimension is
+// continuous and vice versa
+template <typename T>
+class TransposeHost : public Transpose {
+  static_assert(IsFloatOrDouble<T>::value, "Type T must be float or double");
+  using ValueType = T;
+  using ComplexType = std::complex<T>;
+
+public:
+  TransposeHost(const std::shared_ptr<Parameters>& param,
+                HostArrayView3D<ComplexType> spaceDomainData,
+                HostArrayView2D<ComplexType> freqDomainData)
+      : spaceDomainData_(spaceDomainData), freqDomainData_(freqDomainData), param_(param) {
+    // single rank only checks
+    assert(spaceDomainData.dim_outer() == freqDomainData.dim_inner());
+
+    // check data dimensions and parameters
+    assert(param->dim_x_freq() == spaceDomainData.dim_mid());
+    assert(param->dim_y() == spaceDomainData.dim_inner());
+    assert(param->dim_z() == spaceDomainData.dim_outer());
+    assert(param->dim_z() == freqDomainData.dim_inner());
+    assert(param->num_z_sticks(0) == freqDomainData.dim_outer());
+
+    // data must be disjoint
+    assert(disjoint(spaceDomainData, freqDomainData));
+  }
+
+  auto exchange_backward_start(const bool) -> void override {}
+
+  auto unpack_backward() -> void override {
+    SPFFT_OMP_PRAGMA("omp for schedule(static)") // implicit barrier
+    for (SizeType z = 0; z < spaceDomainData_.dim_outer(); ++z) {
+      std::memset(static_cast<void*>(&spaceDomainData_(z, 0, 0)), 0,
+                  sizeof(typename decltype(spaceDomainData_)::ValueType) *
+                      spaceDomainData_.dim_inner() * spaceDomainData_.dim_mid());
+    }
+
+    const SizeType unrolledLoopEnd =
+        freqDomainData_.dim_outer() < 4 ? 0 : freqDomainData_.dim_outer() - 3;
+
+    auto stickIndicesView = param_->z_stick_xy_indices(0);
+
+    auto spaceDomainDataFlat =
+        create_2d_view(spaceDomainData_, 0, spaceDomainData_.dim_outer(),
+                       spaceDomainData_.dim_mid() * spaceDomainData_.dim_inner());
+
+    // unrolled loop
+    SPFFT_OMP_PRAGMA("omp for schedule(static) nowait")
+    for (SizeType zStickIndex = 0; zStickIndex < unrolledLoopEnd; zStickIndex += 4) {
+      const SizeType xyIndex1 = stickIndicesView(zStickIndex);
+      const SizeType xyIndex2 = stickIndicesView(zStickIndex + 1);
+      const SizeType xyIndex3 = stickIndicesView(zStickIndex + 2);
+      const SizeType xyIndex4 = stickIndicesView(zStickIndex + 3);
+      for (SizeType zIndex = 0; zIndex < freqDomainData_.dim_inner(); ++zIndex) {
+        spaceDomainDataFlat(zIndex, xyIndex1) = freqDomainData_(zStickIndex, zIndex);
+        spaceDomainDataFlat(zIndex, xyIndex2) = freqDomainData_(zStickIndex + 1, zIndex);
+        spaceDomainDataFlat(zIndex, xyIndex3) = freqDomainData_(zStickIndex + 2, zIndex);
+        spaceDomainDataFlat(zIndex, xyIndex4) = freqDomainData_(zStickIndex + 3, zIndex);
+      }
+    }
+
+    // transpose remaining elements
+    SPFFT_OMP_PRAGMA("omp for schedule(static)") // keep barrier
+    for (SizeType zStickIndex = unrolledLoopEnd; zStickIndex < freqDomainData_.dim_outer();
+         zStickIndex += 1) {
+      const SizeType xyIndex = stickIndicesView(zStickIndex);
+      for (SizeType zIndex = 0; zIndex < freqDomainData_.dim_inner(); ++zIndex) {
+        spaceDomainDataFlat(zIndex, xyIndex) = freqDomainData_(zStickIndex, zIndex);
+      }
+    }
+  }
+
+  auto exchange_forward_start(const bool) -> void override {}
+
+  auto unpack_forward() -> void override {
+    const SizeType unrolledLoopEnd =
+        freqDomainData_.dim_outer() < 4 ? 0 : freqDomainData_.dim_outer() - 3;
+
+    auto stickIndicesView = param_->z_stick_xy_indices(0);
+
+    auto spaceDomainDataFlat =
+        create_2d_view(spaceDomainData_, 0, spaceDomainData_.dim_outer(),
+                       spaceDomainData_.dim_mid() * spaceDomainData_.dim_inner());
+
+    // unrolled loop
+    SPFFT_OMP_PRAGMA("omp for schedule(static) nowait")
+    for (SizeType zStickIndex = 0; zStickIndex < unrolledLoopEnd; zStickIndex += 4) {
+      const SizeType xyIndex1 = stickIndicesView(zStickIndex);
+      const SizeType xyIndex2 = stickIndicesView(zStickIndex + 1);
+      const SizeType xyIndex3 = stickIndicesView(zStickIndex + 2);
+      const SizeType xyIndex4 = stickIndicesView(zStickIndex + 3);
+      for (SizeType zIndex = 0; zIndex < freqDomainData_.dim_inner(); ++zIndex) {
+        freqDomainData_(zStickIndex, zIndex) = spaceDomainDataFlat(zIndex, xyIndex1);
+        freqDomainData_(zStickIndex + 1, zIndex) = spaceDomainDataFlat(zIndex, xyIndex2);
+        freqDomainData_(zStickIndex + 2, zIndex) = spaceDomainDataFlat(zIndex, xyIndex3);
+        freqDomainData_(zStickIndex + 3, zIndex) = spaceDomainDataFlat(zIndex, xyIndex4);
+      }
+    }
+
+    // transpose remaining elements
+    SPFFT_OMP_PRAGMA("omp for schedule(static)") // keep barrier
+    for (SizeType zStickIndex = unrolledLoopEnd; zStickIndex < freqDomainData_.dim_outer();
+         zStickIndex += 1) {
+      const SizeType xyIndex = stickIndicesView(zStickIndex);
+      for (SizeType zIndex = 0; zIndex < freqDomainData_.dim_inner(); ++zIndex) {
+        freqDomainData_(zStickIndex, zIndex) = spaceDomainDataFlat(zIndex, xyIndex);
+      }
+    }
+  }
+
+private:
+  HostArrayView3D<ComplexType> spaceDomainData_;
+  HostArrayView2D<ComplexType> freqDomainData_;
+  std::shared_ptr<Parameters> param_;
+};
+} // namespace spfft
+#endif
diff --git a/src/transpose/transpose_mpi_buffered_gpu.cpp b/src/transpose/transpose_mpi_buffered_gpu.cpp
new file mode 100644
index 0000000..ed85b3a
--- /dev/null
+++ b/src/transpose/transpose_mpi_buffered_gpu.cpp
@@ -0,0 +1,297 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <algorithm>
+#include <cassert>
+#include <complex>
+#include <cstring>
+#include <memory>
+#include <vector>
+#include "memory/array_view_utility.hpp"
+#include "memory/host_array_view.hpp"
+#include "parameters/parameters.hpp"
+#include "spfft/exceptions.hpp"
+#include "transpose.hpp"
+#include "util/common_types.hpp"
+#include "util/omp_definitions.hpp"
+#include "util/type_check.hpp"
+
+#if defined(SPFFT_MPI) && (defined(SPFFT_CUDA) || defined(SPFFT_ROCM))
+#include "gpu_util/gpu_fft_api.hpp"
+#include "gpu_util/gpu_transfer.hpp"
+#include "mpi_util/mpi_check_status.hpp"
+#include "mpi_util/mpi_communicator_handle.hpp"
+#include "mpi_util/mpi_datatype_handle.hpp"
+#include "mpi_util/mpi_match_elementary_type.hpp"
+#include "transpose/gpu_kernels/buffered_kernels.hpp"
+#include "transpose/transpose_mpi_buffered_gpu.hpp"
+
+namespace spfft {
+template <typename T, typename U>
+TransposeMPIBufferedGPU<T, U>::TransposeMPIBufferedGPU(
+    const std::shared_ptr<Parameters>& param, MPICommunicatorHandle comm,
+    HostArrayView1D<ComplexType> spaceDomainBufferHost,
+    GPUArrayView3D<ComplexGPUType> spaceDomainDataGPU,
+    GPUArrayView1D<ComplexGPUType> spaceDomainBufferGPU, GPUStreamHandle spaceDomainStream,
+    HostArrayView1D<ComplexType> freqDomainBufferHost,
+    GPUArrayView2D<ComplexGPUType> freqDomainDataGPU,
+    GPUArrayView1D<ComplexGPUType> freqDomainBufferGPU, GPUStreamHandle freqDomainStream)
+    : param_(param),
+      comm_(std::move(comm)),
+      spaceDomainBufferHost_(create_new_type_1d_view<ComplexExchangeType>(
+          spaceDomainBufferHost,
+          comm_.size() * param_->max_num_xy_planes() * param_->max_num_z_sticks())),
+      freqDomainBufferHost_(create_new_type_1d_view<ComplexExchangeType>(
+          freqDomainBufferHost,
+          comm_.size() * param_->max_num_xy_planes() * param_->max_num_z_sticks())),
+      spaceDomainDataGPU_(spaceDomainDataGPU),
+      freqDomainDataGPU_(freqDomainDataGPU),
+      spaceDomainBufferGPU_(create_new_type_3d_view<ComplexExchangeGPUType>(
+          spaceDomainBufferGPU, comm_.size(), param_->max_num_z_sticks(),
+          param_->max_num_xy_planes())),
+      freqDomainBufferGPU_(create_new_type_3d_view<ComplexExchangeGPUType>(
+          freqDomainBufferGPU, comm_.size(), param_->max_num_z_sticks(),
+          param_->max_num_xy_planes())),
+      spaceDomainStream_(std::move(spaceDomainStream)),
+      freqDomainStream_(std::move(freqDomainStream)) {
+  assert(param_->dim_y() == spaceDomainDataGPU.dim_mid());
+  assert(param_->dim_x_freq() == spaceDomainDataGPU.dim_inner());
+  assert(param_->num_xy_planes(comm_.rank()) == spaceDomainDataGPU.dim_outer());
+  assert(param_->dim_z() == freqDomainDataGPU.dim_inner());
+  assert(param_->num_z_sticks(comm_.rank()) == freqDomainDataGPU.dim_outer());
+
+  assert(spaceDomainBufferGPU.size() >=
+         param_->max_num_xy_planes() * param_->max_num_z_sticks() * comm_.size());
+  assert(freqDomainBufferGPU.size() >=
+         param_->max_num_xy_planes() * param_->max_num_z_sticks() * comm_.size());
+  assert(spaceDomainBufferHost.size() >=
+         param_->max_num_xy_planes() * param_->max_num_z_sticks() * comm_.size());
+  assert(freqDomainBufferHost.size() >=
+         param_->max_num_xy_planes() * param_->max_num_z_sticks() * comm_.size());
+
+  // assert(disjoint(spaceDomainDataGPU, freqDomainDataGPU));
+  assert(disjoint(spaceDomainDataGPU, spaceDomainBufferGPU));
+  assert(disjoint(freqDomainDataGPU, freqDomainBufferGPU));
+  assert(disjoint(spaceDomainBufferHost, freqDomainBufferHost));
+#ifdef SPFFT_GPU_DIRECT
+  assert(disjoint(spaceDomainBufferGPU, freqDomainBufferGPU));
+#endif
+
+  // create underlying type
+  mpiTypeHandle_ = MPIDatatypeHandle::create_contiguous(2, MPIMatchElementaryType<U>::get());
+
+  // copy relevant parameters
+  std::vector<int> numZSticksHost(comm_.size());
+  std::vector<int> numXYPlanesHost(comm_.size());
+  std::vector<int> xyPlaneOffsetsHost(comm_.size());
+  std::vector<int> indicesHost(comm_.size() * param_->max_num_z_sticks());
+  for (SizeType r = 0; r < comm_.size(); ++r) {
+    numZSticksHost[r] = static_cast<int>(param_->num_z_sticks(r));
+    numXYPlanesHost[r] = static_cast<int>(param_->num_xy_planes(r));
+    xyPlaneOffsetsHost[r] = static_cast<int>(param_->xy_plane_offset(r));
+    const auto zStickXYIndices = param_->z_stick_xy_indices(r);
+    for (SizeType i = 0; i < zStickXYIndices.size(); ++i) {
+      // transpose stick index
+      const int xyIndex = zStickXYIndices(i);
+      const int x = xyIndex / param_->dim_y();
+      const int y = xyIndex - x * param_->dim_y();
+      indicesHost[r * param_->max_num_z_sticks() + i] = y * param_->dim_x_freq() + x;
+    }
+  }
+
+  numZSticksGPU_ = GPUArray<int>(numZSticksHost.size());
+  numXYPlanesGPU_ = GPUArray<int>(numXYPlanesHost.size());
+  xyPlaneOffsetsGPU_ = GPUArray<int>(xyPlaneOffsetsHost.size());
+  indicesGPU_ = GPUArray<int>(indicesHost.size());
+
+  copy_to_gpu(numZSticksHost, numZSticksGPU_);
+  copy_to_gpu(numXYPlanesHost, numXYPlanesGPU_);
+  copy_to_gpu(xyPlaneOffsetsHost, xyPlaneOffsetsGPU_);
+  copy_to_gpu(indicesHost, indicesGPU_);
+}
+
+template <typename T, typename U>
+auto TransposeMPIBufferedGPU<T, U>::pack_backward() -> void {
+  if (freqDomainDataGPU_.size() > 0 && freqDomainBufferGPU_.size() > 0) {
+    buffered_pack_backward(freqDomainStream_.get(), param_->max_num_xy_planes(),
+                           create_1d_view(numXYPlanesGPU_, 0, numXYPlanesGPU_.size()),
+                           create_1d_view(xyPlaneOffsetsGPU_, 0, xyPlaneOffsetsGPU_.size()),
+                           freqDomainDataGPU_, freqDomainBufferGPU_);
+#ifndef SPFFT_GPU_DIRECT
+    copy_from_gpu_async(freqDomainStream_, freqDomainBufferGPU_, freqDomainBufferHost_);
+#endif
+  }
+}
+
+template <typename T, typename U>
+auto TransposeMPIBufferedGPU<T, U>::unpack_backward() -> void {
+  if (spaceDomainDataGPU_.size() > 0) {
+    gpu::check_status(gpu::memset_async(
+        static_cast<void*>(spaceDomainDataGPU_.data()), 0,
+        spaceDomainDataGPU_.size() * sizeof(typename decltype(spaceDomainDataGPU_)::ValueType),
+        spaceDomainStream_.get()));
+    if (spaceDomainBufferGPU_.size() > 0) {
+#ifndef SPFFT_GPU_DIRECT
+      copy_to_gpu_async(spaceDomainStream_, spaceDomainBufferHost_, spaceDomainBufferGPU_);
+#endif
+      buffered_unpack_backward(spaceDomainStream_.get(), param_->max_num_xy_planes(),
+                               create_1d_view(numZSticksGPU_, 0, numZSticksGPU_.size()),
+                               create_1d_view(indicesGPU_, 0, indicesGPU_.size()),
+                               spaceDomainBufferGPU_, spaceDomainDataGPU_);
+    }
+  }
+}
+
+template <typename T, typename U>
+auto TransposeMPIBufferedGPU<T, U>::exchange_backward_start(const bool nonBlockingExchange)
+    -> void {
+  assert(omp_get_thread_num() == 0); // only master thread must be allowed to enter
+
+  gpu::check_status(gpu::stream_synchronize(freqDomainStream_.get()));
+
+  if (nonBlockingExchange) {
+#ifdef SPFFT_GPU_DIRECT
+    // exchange data
+    mpi_check_status(MPI_Ialltoall(
+        freqDomainBufferGPU_.data(), param_->max_num_z_sticks() * param_->max_num_xy_planes(),
+        mpiTypeHandle_.get(), spaceDomainBufferGPU_.data(),
+        param_->max_num_z_sticks() * param_->max_num_xy_planes(), mpiTypeHandle_.get(), comm_.get(),
+        mpiRequest_.get_and_activate()));
+#else
+    // exchange data
+    mpi_check_status(MPI_Ialltoall(
+        freqDomainBufferHost_.data(), param_->max_num_z_sticks() * param_->max_num_xy_planes(),
+        mpiTypeHandle_.get(), spaceDomainBufferHost_.data(),
+        param_->max_num_z_sticks() * param_->max_num_xy_planes(), mpiTypeHandle_.get(), comm_.get(),
+        mpiRequest_.get_and_activate()));
+#endif
+  } else {
+#ifdef SPFFT_GPU_DIRECT
+    // exchange data
+    mpi_check_status(MPI_Alltoall(freqDomainBufferGPU_.data(),
+                                  param_->max_num_z_sticks() * param_->max_num_xy_planes(),
+                                  mpiTypeHandle_.get(), spaceDomainBufferGPU_.data(),
+                                  param_->max_num_z_sticks() * param_->max_num_xy_planes(),
+                                  mpiTypeHandle_.get(), comm_.get()));
+#else
+    // exchange data
+    mpi_check_status(MPI_Alltoall(freqDomainBufferHost_.data(),
+                                  param_->max_num_z_sticks() * param_->max_num_xy_planes(),
+                                  mpiTypeHandle_.get(), spaceDomainBufferHost_.data(),
+                                  param_->max_num_z_sticks() * param_->max_num_xy_planes(),
+                                  mpiTypeHandle_.get(), comm_.get()));
+#endif
+  }
+}
+
+template <typename T, typename U>
+auto TransposeMPIBufferedGPU<T, U>::exchange_forward_finalize() -> void {
+  mpiRequest_.wait_if_active();
+}
+
+template <typename T, typename U>
+auto TransposeMPIBufferedGPU<T, U>::pack_forward() -> void {
+  if (spaceDomainDataGPU_.size() > 0 && spaceDomainBufferGPU_.size() > 0) {
+    buffered_pack_forward(spaceDomainStream_.get(), param_->max_num_xy_planes(),
+                          create_1d_view(numZSticksGPU_, 0, numZSticksGPU_.size()),
+                          create_1d_view(indicesGPU_, 0, indicesGPU_.size()), spaceDomainDataGPU_,
+                          spaceDomainBufferGPU_);
+
+#ifndef SPFFT_GPU_DIRECT
+    copy_from_gpu_async(spaceDomainStream_, spaceDomainBufferGPU_, spaceDomainBufferHost_);
+#endif
+  }
+}
+
+template <typename T, typename U>
+auto TransposeMPIBufferedGPU<T, U>::unpack_forward() -> void {
+  if (freqDomainDataGPU_.size() > 0 && freqDomainBufferGPU_.size() > 0) {
+#ifndef SPFFT_GPU_DIRECT
+    copy_to_gpu_async(freqDomainStream_, freqDomainBufferHost_, freqDomainBufferGPU_);
+#endif
+    buffered_unpack_forward(freqDomainStream_.get(), param_->max_num_xy_planes(),
+                            create_1d_view(numXYPlanesGPU_, 0, numXYPlanesGPU_.size()),
+                            create_1d_view(xyPlaneOffsetsGPU_, 0, xyPlaneOffsetsGPU_.size()),
+                            freqDomainBufferGPU_, freqDomainDataGPU_);
+  }
+}
+
+template <typename T, typename U>
+auto TransposeMPIBufferedGPU<T, U>::exchange_forward_start(const bool nonBlockingExchange) -> void {
+  assert(omp_get_thread_num() == 0); // only master thread must be allowed to enter
+
+  gpu::check_status(gpu::stream_synchronize(spaceDomainStream_.get()));
+
+  if (nonBlockingExchange) {
+#ifdef SPFFT_GPU_DIRECT
+    // exchange data
+    mpi_check_status(MPI_Ialltoall(
+        spaceDomainBufferGPU_.data(), param_->max_num_z_sticks() * param_->max_num_xy_planes(),
+        mpiTypeHandle_.get(), freqDomainBufferGPU_.data(),
+        param_->max_num_z_sticks() * param_->max_num_xy_planes(), mpiTypeHandle_.get(), comm_.get(),
+        mpiRequest_.get_and_activate()));
+#else
+    // exchange data
+    mpi_check_status(MPI_Ialltoall(
+        spaceDomainBufferHost_.data(), param_->max_num_z_sticks() * param_->max_num_xy_planes(),
+        mpiTypeHandle_.get(), freqDomainBufferHost_.data(),
+        param_->max_num_z_sticks() * param_->max_num_xy_planes(), mpiTypeHandle_.get(), comm_.get(),
+        mpiRequest_.get_and_activate()));
+#endif
+  } else {
+#ifdef SPFFT_GPU_DIRECT
+    // exchange data
+    mpi_check_status(MPI_Alltoall(spaceDomainBufferGPU_.data(),
+                                  param_->max_num_z_sticks() * param_->max_num_xy_planes(),
+                                  mpiTypeHandle_.get(), freqDomainBufferGPU_.data(),
+                                  param_->max_num_z_sticks() * param_->max_num_xy_planes(),
+                                  mpiTypeHandle_.get(), comm_.get()));
+#else
+    // exchange data
+    mpi_check_status(MPI_Alltoall(spaceDomainBufferHost_.data(),
+                                  param_->max_num_z_sticks() * param_->max_num_xy_planes(),
+                                  mpiTypeHandle_.get(), freqDomainBufferHost_.data(),
+                                  param_->max_num_z_sticks() * param_->max_num_xy_planes(),
+                                  mpiTypeHandle_.get(), comm_.get()));
+#endif
+  }
+}
+
+template <typename T, typename U>
+auto TransposeMPIBufferedGPU<T, U>::exchange_backward_finalize() -> void {
+  mpiRequest_.wait_if_active();
+}
+
+// Instantiate class for float and double
+#ifdef SPFFT_SINGLE_PRECISION
+template class TransposeMPIBufferedGPU<float, float>;
+#endif
+template class TransposeMPIBufferedGPU<double, double>;
+template class TransposeMPIBufferedGPU<double, float>;
+} // namespace spfft
+#endif // SPFFT_MPI
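Annotation (not part of the patch): the exchange above always sends max_num_z_sticks() * max_num_xy_planes() elements to every rank, padding to the largest block, so a single fixed-count MPI_Alltoall can be used instead of a variable-count MPI_Alltoallv. A hedged sketch of that pattern with the raw MPI API (function name hypothetical; the real code goes through mpiTypeHandle_ and mpi_check_status):

#include <mpi.h>

void exchange_padded(const void* sendBuf, void* recvBuf, const int maxSticks,
                     const int maxPlanes, MPI_Datatype complexType, MPI_Comm comm) {
  // identical block size for every rank; unused tail entries are padding
  const int blockSize = maxSticks * maxPlanes;
  MPI_Alltoall(sendBuf, blockSize, complexType, recvBuf, blockSize, complexType, comm);
}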
diff --git a/src/transpose/transpose_mpi_buffered_gpu.hpp b/src/transpose/transpose_mpi_buffered_gpu.hpp
new file mode 100644
index 0000000..90ab7db
--- /dev/null
+++ b/src/transpose/transpose_mpi_buffered_gpu.hpp
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef SPFFT_TRANSPOSE_MPI_BUFFERED_GPU_HPP
+#define SPFFT_TRANSPOSE_MPI_BUFFERED_GPU_HPP
+
+#include <complex>
+#include <memory>
+#include "gpu_util/gpu_fft_api.hpp"
+#include "gpu_util/gpu_stream_handle.hpp"
+#include "memory/gpu_array.hpp"
+#include "memory/host_array_view.hpp"
+#include "parameters/parameters.hpp"
+#include "spfft/config.h"
+#include "transpose.hpp"
+#include "util/common_types.hpp"
+#include "util/type_check.hpp"
+
+#if defined(SPFFT_MPI) && (defined(SPFFT_CUDA) || defined(SPFFT_ROCM))
+#include "mpi_util/mpi_communicator_handle.hpp"
+#include "mpi_util/mpi_datatype_handle.hpp"
+#include "mpi_util/mpi_request_handle.hpp"
+
+namespace spfft {
+template <typename T, typename U>
+class TransposeMPIBufferedGPU : public Transpose {
+  static_assert(IsFloatOrDouble<T>::value, "Type T must be float or double");
+  using ValueType = T;
+  using ComplexType = std::complex<T>;
+  using ComplexExchangeType = std::complex<U>;
+  using ComplexGPUType = typename gpu::fft::ComplexType<T>::type;
+  using ComplexExchangeGPUType = typename gpu::fft::ComplexType<U>::type;
+
+public:
+  // spaceDomainDataGPU and freqDomainDataGPU must NOT overlap
+  // spaceDomainDataGPU and spaceDomainBufferGPU must NOT overlap
+  // freqDomainDataGPU and freqDomainBufferGPU must NOT overlap
+  // spaceDomainBufferGPU and freqDomainBufferGPU must NOT overlap
+  // spaceDomainBufferHost and freqDomainBufferHost must NOT overlap
+  //
+  // spaceDomainBufferGPU and freqDomainDataGPU MAY overlap
+  // freqDomainBufferGPU and spaceDomainDataGPU MAY overlap
+  TransposeMPIBufferedGPU(const std::shared_ptr<Parameters>& param, MPICommunicatorHandle comm,
+                          HostArrayView1D<ComplexType> spaceDomainBufferHost,
+                          GPUArrayView3D<ComplexGPUType> spaceDomainDataGPU,
+                          GPUArrayView1D<ComplexGPUType> spaceDomainBufferGPU,
+                          GPUStreamHandle spaceDomainStream,
+                          HostArrayView1D<ComplexType> freqDomainBufferHost,
+                          GPUArrayView2D<ComplexGPUType> freqDomainDataGPU,
+                          GPUArrayView1D<ComplexGPUType> freqDomainBufferGPU,
+                          GPUStreamHandle freqDomainStream);
+
+  auto pack_backward() -> void override;
+  auto exchange_backward_start(const bool nonBlockingExchange) -> void override;
+  auto exchange_backward_finalize() -> void override;
+  auto unpack_backward() -> void override;
+
+  auto pack_forward() -> void override;
+  auto exchange_forward_start(const bool nonBlockingExchange) -> void override;
+  auto exchange_forward_finalize() -> void override;
+  auto unpack_forward() -> void override;
+
+private:
+  std::shared_ptr<Parameters> param_;
+  MPIDatatypeHandle mpiTypeHandle_;
+  MPICommunicatorHandle comm_;
+  MPIRequestHandle mpiRequest_;
+
+  HostArrayView1D<ComplexExchangeType> spaceDomainBufferHost_;
+  HostArrayView1D<ComplexExchangeType> freqDomainBufferHost_;
+  GPUArrayView3D<ComplexGPUType> spaceDomainDataGPU_;
+  GPUArrayView2D<ComplexGPUType> freqDomainDataGPU_;
+  GPUArrayView3D<ComplexExchangeGPUType> spaceDomainBufferGPU_;
+  GPUArrayView3D<ComplexExchangeGPUType> freqDomainBufferGPU_;
+  GPUStreamHandle spaceDomainStream_;
+  GPUStreamHandle freqDomainStream_;
+
+  GPUArray<int> numZSticksGPU_;
+  GPUArray<int> numXYPlanesGPU_;
+  GPUArray<int> xyPlaneOffsetsGPU_;
+  GPUArray<int> indicesGPU_;
+};
+
+} // namespace spfft
+#endif // SPFFT_MPI
+#endif
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include "memory/array_view_utility.hpp" +#include "memory/host_array_view.hpp" +#include "parameters/parameters.hpp" +#include "spfft/exceptions.hpp" +#include "transpose.hpp" +#include "util/common_types.hpp" +#include "util/omp_definitions.hpp" +#include "util/type_check.hpp" + +#ifdef SPFFT_MPI +#include "mpi_util/mpi_check_status.hpp" +#include "mpi_util/mpi_communicator_handle.hpp" +#include "mpi_util/mpi_datatype_handle.hpp" +#include "mpi_util/mpi_match_elementary_type.hpp" +#include "transpose/transpose_mpi_buffered_host.hpp" + +namespace spfft { +template +TransposeMPIBufferedHost::TransposeMPIBufferedHost( + const std::shared_ptr& param, MPICommunicatorHandle comm, + HostArrayView3D spaceDomainData, HostArrayView2D freqDomainData, + HostArrayView1D spaceDomainBuffer, HostArrayView1D freqDomainBuffer) + : param_(param), + comm_(std::move(comm)), + spaceDomainData_(spaceDomainData), + freqDomainData_(freqDomainData), + spaceDomainBuffer_(create_new_type_1d_view(spaceDomainBuffer, + spaceDomainBuffer.size())), + freqDomainBuffer_( + create_new_type_1d_view(freqDomainBuffer, freqDomainBuffer.size())) { + // assert(param_->dim_x_freq() == spaceDomainData.dim_mid()); + assert(param_->dim_y() == spaceDomainData.dim_inner()); + assert(param_->num_xy_planes(comm_.rank()) == spaceDomainData.dim_outer()); + assert(param_->dim_z() == freqDomainData.dim_inner()); + assert(param_->num_z_sticks(comm_.rank()) == freqDomainData.dim_outer()); + + assert(spaceDomainBuffer.size() >= + param_->max_num_xy_planes() * param_->max_num_z_sticks() * comm_.size()); + assert(freqDomainBuffer.size() >= + param_->max_num_xy_planes() * param_->max_num_z_sticks() * comm_.size()); + + assert(disjoint(spaceDomainData, freqDomainData)); + assert(disjoint(spaceDomainData, spaceDomainBuffer)); + assert(disjoint(freqDomainData, freqDomainBuffer)); + assert(disjoint(spaceDomainBuffer, freqDomainBuffer)); + + // create underlying type + mpiTypeHandle_ = MPIDatatypeHandle::create_contiguous(2, MPIMatchElementaryType::get()); +} + +template +auto TransposeMPIBufferedHost::pack_backward() -> void { + auto freqDomainBuffer3d = create_3d_view(freqDomainBuffer_, 0, comm_.size(), + param_->max_num_z_sticks(), param_->max_num_xy_planes()); + // transpose locally from 
(numLocalZSticks, dimZ) to (dimZ, numLocalZSticks) with spacing + // between ranks + for (SizeType r = 0; r < static_cast(comm_.size()); ++r) { + const auto xyPlaneOffset = param_->xy_plane_offset(r); + SPFFT_OMP_PRAGMA("omp for schedule(static) nowait") + for (SizeType zStickIndex = 0; zStickIndex < freqDomainData_.dim_outer(); ++zStickIndex) { + for (SizeType xyPlaneIndex = 0; xyPlaneIndex < param_->num_xy_planes(r); ++xyPlaneIndex) { + freqDomainBuffer3d(r, zStickIndex, xyPlaneIndex) = + freqDomainData_(zStickIndex, xyPlaneIndex + xyPlaneOffset); + } + } + } + SPFFT_OMP_PRAGMA("omp barrier") +} + +template +auto TransposeMPIBufferedHost::unpack_backward() -> void { + // zero target data location (not all values are overwritten upon unpacking) + SPFFT_OMP_PRAGMA("omp for schedule(static)") // implicit barrier + for (SizeType z = 0; z < spaceDomainData_.dim_outer(); ++z) { + std::memset(static_cast(&spaceDomainData_(z, 0, 0)), 0, + sizeof(typename decltype(spaceDomainData_)::ValueType) * + spaceDomainData_.dim_inner() * spaceDomainData_.dim_mid()); + } + + auto spaceDomainDataFlat = + create_2d_view(spaceDomainData_, 0, spaceDomainData_.dim_outer(), + spaceDomainData_.dim_mid() * spaceDomainData_.dim_inner()); + + // unpack from (numZSticksTotal, numLocalXYPlanes) to (numLocalXYPlanes, dimX, dimY) + const auto numLocalXYPlanes = param_->num_xy_planes(comm_.rank()); + for (SizeType r = 0; r < (SizeType)comm_.size(); ++r) { + const auto zStickXYIndices = param_->z_stick_xy_indices(r); + // take care with unsigned type + const SizeType unrolledLoopEnd = zStickXYIndices.size() < 4 ? 0 : zStickXYIndices.size() - 3; + + auto spaceDomainBuffer2d = create_2d_view( + spaceDomainBuffer_, r * param_->max_num_xy_planes() * param_->max_num_z_sticks(), + param_->max_num_z_sticks(), param_->max_num_xy_planes()); + + SPFFT_OMP_PRAGMA("omp for schedule(static) nowait") + for (SizeType zStickIndex = 0; zStickIndex < unrolledLoopEnd; zStickIndex += 4) { + // manual loop unrolling for better performance + const SizeType xyIndex1 = zStickXYIndices(zStickIndex); + const SizeType xyIndex2 = zStickXYIndices(zStickIndex + 1); + const SizeType xyIndex3 = zStickXYIndices(zStickIndex + 2); + const SizeType xyIndex4 = zStickXYIndices(zStickIndex + 3); + for (SizeType zIndex = 0; zIndex < numLocalXYPlanes; ++zIndex) { + spaceDomainDataFlat(zIndex, xyIndex1) = spaceDomainBuffer2d(zStickIndex, zIndex); + spaceDomainDataFlat(zIndex, xyIndex2) = spaceDomainBuffer2d(zStickIndex + 1, zIndex); + spaceDomainDataFlat(zIndex, xyIndex3) = spaceDomainBuffer2d(zStickIndex + 2, zIndex); + spaceDomainDataFlat(zIndex, xyIndex4) = spaceDomainBuffer2d(zStickIndex + 3, zIndex); + } + } + SPFFT_OMP_PRAGMA("omp for schedule(static) nowait") + for (SizeType zStickIndex = unrolledLoopEnd; zStickIndex < zStickXYIndices.size(); + zStickIndex += 1) { + const SizeType xyIndex = zStickXYIndices(zStickIndex); + for (SizeType zIndex = 0; zIndex < numLocalXYPlanes; ++zIndex) { + spaceDomainDataFlat(zIndex, xyIndex) = spaceDomainBuffer2d(zStickIndex, zIndex); + } + } + } + SPFFT_OMP_PRAGMA("omp barrier") +} + +template +auto TransposeMPIBufferedHost::exchange_backward_start(const bool nonBlockingExchange) + -> void { + assert(omp_get_thread_num() == 0); // only master thread must be allowed to enter + + // exchange data + if (nonBlockingExchange) { + mpi_check_status(MPI_Ialltoall( + freqDomainBuffer_.data(), param_->max_num_z_sticks() * param_->max_num_xy_planes(), + mpiTypeHandle_.get(), spaceDomainBuffer_.data(), + param_->max_num_z_sticks() * 
param_->max_num_xy_planes(), mpiTypeHandle_.get(), comm_.get(), + mpiRequest_.get_and_activate())); + } else { + mpi_check_status(MPI_Alltoall(freqDomainBuffer_.data(), + param_->max_num_z_sticks() * param_->max_num_xy_planes(), + mpiTypeHandle_.get(), spaceDomainBuffer_.data(), + param_->max_num_z_sticks() * param_->max_num_xy_planes(), + mpiTypeHandle_.get(), comm_.get())); + } +} + +template +auto TransposeMPIBufferedHost::exchange_backward_finalize() -> void { + mpiRequest_.wait_if_active(); +} + +template +auto TransposeMPIBufferedHost::pack_forward() -> void { + auto spaceDomainDataFlat = + create_2d_view(spaceDomainData_, 0, spaceDomainData_.dim_outer(), + spaceDomainData_.dim_mid() * spaceDomainData_.dim_inner()); + + // pack from (numLocalXYPlanes, dimX, dimY) to (numZSticksTotal, numLocalXYPlanes) + const auto numLocalXYPlanes = param_->num_xy_planes(comm_.rank()); + for (SizeType r = 0; r < (SizeType)comm_.size(); ++r) { + const auto zStickXYIndices = param_->z_stick_xy_indices(r); + // take care with unsigned type + const SizeType unrolledLoopEnd = zStickXYIndices.size() < 4 ? 0 : zStickXYIndices.size() - 3; + + auto spaceDomainBuffer2d = create_2d_view( + spaceDomainBuffer_, r * param_->max_num_xy_planes() * param_->max_num_z_sticks(), + param_->max_num_z_sticks(), param_->max_num_xy_planes()); + + SPFFT_OMP_PRAGMA("omp for schedule(static) nowait") + for (SizeType zStickIndex = 0; zStickIndex < unrolledLoopEnd; zStickIndex += 4) { + // manual loop unrolling for better performance + const SizeType xyIndex1 = zStickXYIndices(zStickIndex); + const SizeType xyIndex2 = zStickXYIndices(zStickIndex + 1); + const SizeType xyIndex3 = zStickXYIndices(zStickIndex + 2); + const SizeType xyIndex4 = zStickXYIndices(zStickIndex + 3); + for (SizeType zIndex = 0; zIndex < numLocalXYPlanes; ++zIndex) { + spaceDomainBuffer2d(zStickIndex, zIndex) = spaceDomainDataFlat(zIndex, xyIndex1); + spaceDomainBuffer2d(zStickIndex + 1, zIndex) = spaceDomainDataFlat(zIndex, xyIndex2); + spaceDomainBuffer2d(zStickIndex + 2, zIndex) = spaceDomainDataFlat(zIndex, xyIndex3); + spaceDomainBuffer2d(zStickIndex + 3, zIndex) = spaceDomainDataFlat(zIndex, xyIndex4); + } + } + SPFFT_OMP_PRAGMA("omp for schedule(static) nowait") + for (SizeType zStickIndex = unrolledLoopEnd; zStickIndex < zStickXYIndices.size(); + zStickIndex += 1) { + const SizeType xyIndex = zStickXYIndices(zStickIndex); + for (SizeType zIndex = 0; zIndex < numLocalXYPlanes; ++zIndex) { + spaceDomainBuffer2d(zStickIndex, zIndex) = spaceDomainDataFlat(zIndex, xyIndex); + } + } + } + SPFFT_OMP_PRAGMA("omp barrier") +} + +template +auto TransposeMPIBufferedHost::unpack_forward() -> void { + auto freqDomainBuffer3d = create_3d_view(freqDomainBuffer_, 0, comm_.size(), + param_->max_num_z_sticks(), param_->max_num_xy_planes()); + for (SizeType r = 0; r < static_cast(comm_.size()); ++r) { + const auto xyPlaneOffset = param_->xy_plane_offset(r); + SPFFT_OMP_PRAGMA("omp for schedule(static) nowait") + for (SizeType zStickIndex = 0; zStickIndex < freqDomainData_.dim_outer(); ++zStickIndex) { + for (SizeType xyPlaneIndex = 0; xyPlaneIndex < param_->num_xy_planes(r); ++xyPlaneIndex) { + freqDomainData_(zStickIndex, xyPlaneIndex + xyPlaneOffset) = + freqDomainBuffer3d(r, zStickIndex, xyPlaneIndex); + } + } + } + SPFFT_OMP_PRAGMA("omp barrier") +} + +template +auto TransposeMPIBufferedHost::exchange_forward_start(const bool nonBlockingExchange) + -> void { + assert(omp_get_thread_num() == 0); // only master thread must be allowed to enter + + // exchange data + 
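+  // each rank sends and receives fixed-size blocks of max_num_z_sticks() * max_num_xy_planes()
+  // elements, so a plain (I)Alltoall without per-rank counts or displacements suffices;
+  // the padding in the oversized buffers is exchanged along with the data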
if (nonBlockingExchange) { + mpi_check_status(MPI_Ialltoall( + spaceDomainBuffer_.data(), param_->max_num_z_sticks() * param_->max_num_xy_planes(), + mpiTypeHandle_.get(), freqDomainBuffer_.data(), + param_->max_num_z_sticks() * param_->max_num_xy_planes(), mpiTypeHandle_.get(), comm_.get(), + mpiRequest_.get_and_activate())); + } else { + mpi_check_status(MPI_Alltoall(spaceDomainBuffer_.data(), + param_->max_num_z_sticks() * param_->max_num_xy_planes(), + mpiTypeHandle_.get(), freqDomainBuffer_.data(), + param_->max_num_z_sticks() * param_->max_num_xy_planes(), + mpiTypeHandle_.get(), comm_.get())); + } +} + +template +auto TransposeMPIBufferedHost::exchange_forward_finalize() -> void { + mpiRequest_.wait_if_active(); +} + +// Instantiate class for float and double +#ifdef SPFFT_SINGLE_PRECISION +template class TransposeMPIBufferedHost; +#endif +template class TransposeMPIBufferedHost; +template class TransposeMPIBufferedHost; +} // namespace spfft +#endif // SPFFT_MPI diff --git a/src/transpose/transpose_mpi_buffered_host.hpp b/src/transpose/transpose_mpi_buffered_host.hpp new file mode 100644 index 0000000..34bab10 --- /dev/null +++ b/src/transpose/transpose_mpi_buffered_host.hpp @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2019 ETH Zurich, Simon Frasch + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ +#ifndef SPFFT_TRANSPOSE_MPI_BUFFERED_HOST_HPP +#define SPFFT_TRANSPOSE_MPI_BUFFERED_HOST_HPP + +#include +#include +#include "memory/host_array_view.hpp" +#include "parameters/parameters.hpp" +#include "spfft/config.h" +#include "transpose.hpp" +#include "util/common_types.hpp" +#include "util/type_check.hpp" + +#ifdef SPFFT_MPI +#include "mpi_util/mpi_communicator_handle.hpp" +#include "mpi_util/mpi_datatype_handle.hpp" +#include "mpi_util/mpi_request_handle.hpp" + +namespace spfft { +template +class TransposeMPIBufferedHost : public Transpose { + static_assert(IsFloatOrDouble::value, "Type T must be float or double"); + using ValueType = T; + using ComplexType = std::complex; + using ComplexExchangeType = std::complex; + +public: + // spaceDomainData and freqDomainData must NOT overlap + // spaceDomainData and spaceDomainBuffer must NOT overlap + // freqDomainData and freqDomainBuffer must NOT overlap + // spaceDomainBuffer and freqDomainBuffer must NOT overlap + // + // spaceDomainBuffer and freqDomainData MAY overlap + // freqDomainBuffer and spaceDomainData MAY overlap + TransposeMPIBufferedHost(const std::shared_ptr& param, MPICommunicatorHandle comm, + HostArrayView3D spaceDomainData, + HostArrayView2D freqDomainData, + HostArrayView1D spaceDomainBuffer, + HostArrayView1D freqDomainBuffer); + + auto pack_backward() -> void override; + auto exchange_backward_start(const bool nonBlockingExchange) -> void override; + auto exchange_backward_finalize() -> void override; + auto unpack_backward() -> void override; + + auto pack_forward() -> void override; + auto exchange_forward_start(const bool nonBlockingExchange) -> void override; + auto exchange_forward_finalize() -> void override; + auto unpack_forward() -> void override; + +private: + std::shared_ptr param_; + MPIDatatypeHandle mpiTypeHandle_; + MPICommunicatorHandle comm_; + MPIRequestHandle mpiRequest_; + + HostArrayView3D spaceDomainData_; + HostArrayView2D freqDomainData_; + HostArrayView1D spaceDomainBuffer_; + HostArrayView1D freqDomainBuffer_; + +}; + +} // namespace spfft +#endif // SPFFT_MPI +#endif diff --git a/src/transpose/transpose_mpi_compact_buffered_gpu.cpp b/src/transpose/transpose_mpi_compact_buffered_gpu.cpp new file mode 100644 index 0000000..65e7c60 --- /dev/null +++ b/src/transpose/transpose_mpi_compact_buffered_gpu.cpp @@ -0,0 +1,313 @@ +/* + * Copyright (c) 2019 ETH Zurich, Simon Frasch + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spfft/config.h" +#if defined(SPFFT_MPI) && (defined(SPFFT_CUDA) || defined(SPFFT_ROCM)) +#include +#include +#include +#include +#include +#include +#include "memory/array_view_utility.hpp" +#include "memory/host_array_view.hpp" +#include "parameters/parameters.hpp" +#include "spfft/exceptions.hpp" +#include "transpose.hpp" +#include "util/common_types.hpp" +#include "util/omp_definitions.hpp" +#include "util/type_check.hpp" + +#include "gpu_util/gpu_fft_api.hpp" +#include "gpu_util/gpu_transfer.hpp" +#include "mpi_util/mpi_check_status.hpp" +#include "mpi_util/mpi_communicator_handle.hpp" +#include "mpi_util/mpi_datatype_handle.hpp" +#include "mpi_util/mpi_match_elementary_type.hpp" +#include "transpose/gpu_kernels/compact_buffered_kernels.hpp" +#include "transpose/transpose_mpi_compact_buffered_gpu.hpp" + +namespace spfft { +template +TransposeMPICompactBufferedGPU::TransposeMPICompactBufferedGPU( + const std::shared_ptr& param, MPICommunicatorHandle comm, + HostArrayView1D spaceDomainBufferHost, + GPUArrayView3D spaceDomainDataGPU, + GPUArrayView1D spaceDomainBufferGPU, GPUStreamHandle spaceDomainStream, + HostArrayView1D freqDomainBufferHost, + GPUArrayView2D freqDomainDataGPU, + GPUArrayView1D freqDomainBufferGPU, GPUStreamHandle freqDomainStream) + : param_(param), + comm_(std::move(comm)), + spaceDomainBufferHost_(create_new_type_1d_view( + spaceDomainBufferHost, + param_->num_xy_planes(comm_.rank()) * param_->total_num_z_sticks())), + freqDomainBufferHost_(create_new_type_1d_view( + freqDomainBufferHost, + param_->total_num_xy_planes() * param_->num_z_sticks(comm_.rank()))), + spaceDomainDataGPU_(spaceDomainDataGPU), + freqDomainDataGPU_(freqDomainDataGPU), + spaceDomainBufferGPU_(create_new_type_1d_view( + spaceDomainBufferGPU, + param_->total_num_z_sticks() * param_->num_xy_planes(comm_.rank()))), + freqDomainBufferGPU_(create_new_type_1d_view( + freqDomainBufferGPU, param_->num_z_sticks(comm_.rank()) * param_->total_num_xy_planes())), + spaceDomainStream_(std::move(spaceDomainStream)), + freqDomainStream_(std::move(freqDomainStream)) { + assert(param_->dim_y() == spaceDomainDataGPU.dim_mid()); + assert(param_->dim_x_freq() == spaceDomainDataGPU.dim_inner()); + assert(param_->num_xy_planes(comm_.rank()) == spaceDomainDataGPU.dim_outer()); + assert(param_->dim_z() == freqDomainDataGPU.dim_inner()); + assert(param_->num_z_sticks(comm_.rank()) == freqDomainDataGPU.dim_outer()); + + assert(spaceDomainBufferGPU.size() >= + param_->total_num_z_sticks() * param_->num_xy_planes(comm_.rank())); + assert(spaceDomainBufferHost.size() >= + param_->total_num_z_sticks() * param_->num_xy_planes(comm_.rank())); + assert(freqDomainBufferGPU.size() >= + param_->total_num_xy_planes() * param_->num_z_sticks(comm_.rank())); + assert(freqDomainBufferHost.size() >= + param_->total_num_xy_planes() * param_->num_z_sticks(comm_.rank())); + + // assert(disjoint(spaceDomainDataGPU, freqDomainDataGPU)); + assert(disjoint(spaceDomainDataGPU, 
spaceDomainBufferGPU)); + assert(disjoint(freqDomainDataGPU, freqDomainBufferGPU)); + assert(disjoint(spaceDomainBufferHost, freqDomainBufferHost)); +#ifdef SPFFT_GPU_DIRECT + assert(disjoint(spaceDomainBufferGPU, freqDomainBufferGPU)); +#endif + + // create underlying type + mpiTypeHandle_ = MPIDatatypeHandle::create_contiguous(2, MPIMatchElementaryType::get()); + + // prepare mpi parameters + spaceDomainCount_.resize(comm_.size()); + freqDomainCount_.resize(comm_.size()); + const SizeType numLocalZSticks = param_->num_z_sticks(comm_.rank()); + const SizeType numLocalXYPlanes = param_->num_xy_planes(comm_.rank()); + for (SizeType r = 0; r < (SizeType)comm_.size(); ++r) { + freqDomainCount_[r] = numLocalZSticks * param_->num_xy_planes(r); + spaceDomainCount_[r] = param_->num_z_sticks(r) * numLocalXYPlanes; + } + + spaceDomainDispls_.resize(comm_.size()); + freqDomainDispls_.resize(comm_.size()); + int currentFreqDomainDispls = 0; + int currentSpaceDomainDispls = 0; + for (SizeType r = 0; r < (SizeType)comm_.size(); ++r) { + assert(currentSpaceDomainDispls + spaceDomainCount_[r] <= + static_cast(spaceDomainBufferHost.size())); + assert(currentFreqDomainDispls + freqDomainCount_[r] <= + static_cast(freqDomainBufferHost.size())); + spaceDomainDispls_[r] = currentSpaceDomainDispls; + freqDomainDispls_[r] = currentFreqDomainDispls; + currentSpaceDomainDispls += spaceDomainCount_[r]; + currentFreqDomainDispls += freqDomainCount_[r]; + } + + // copy relevant parameters to gpu + std::vector numZSticksHost(comm_.size()); + std::vector numXYPlanesHost(comm_.size()); + std::vector xyPlaneOffsetsHost(comm_.size()); + std::vector indicesHost(comm_.size() * param_->max_num_z_sticks()); + for (SizeType r = 0; r < comm_.size(); ++r) { + numZSticksHost[r] = static_cast(param_->num_z_sticks(r)); + numXYPlanesHost[r] = static_cast(param_->num_xy_planes(r)); + xyPlaneOffsetsHost[r] = static_cast(param_->xy_plane_offset(r)); + const auto zStickXYIndices = param_->z_stick_xy_indices(r); + for (SizeType i = 0; i < zStickXYIndices.size(); ++i) { + // transpose stick index + const int xyIndex = zStickXYIndices(i); + const int x = xyIndex / param_->dim_y(); + const int y = xyIndex - x * param_->dim_y(); + indicesHost[r * param_->max_num_z_sticks() + i] = y * param_->dim_x_freq() + x; + } + } + numZSticksGPU_ = GPUArray(numZSticksHost.size()); + numXYPlanesGPU_ = GPUArray(numXYPlanesHost.size()); + xyPlaneOffsetsGPU_ = GPUArray(xyPlaneOffsetsHost.size()); + indicesGPU_ = GPUArray(indicesHost.size()); + + copy_to_gpu(numZSticksHost, numZSticksGPU_); + copy_to_gpu(numXYPlanesHost, numXYPlanesGPU_); + copy_to_gpu(xyPlaneOffsetsHost, xyPlaneOffsetsGPU_); + copy_to_gpu(indicesHost, indicesGPU_); +} + +template +auto TransposeMPICompactBufferedGPU::pack_backward() -> void { + if (freqDomainDataGPU_.size() > 0 && freqDomainBufferGPU_.size() > 0) { + compact_buffered_pack_backward(freqDomainStream_.get(), param_->max_num_xy_planes(), + create_1d_view(numXYPlanesGPU_, 0, numXYPlanesGPU_.size()), + create_1d_view(xyPlaneOffsetsGPU_, 0, xyPlaneOffsetsGPU_.size()), + freqDomainDataGPU_, freqDomainBufferGPU_); +#ifndef SPFFT_GPU_DIRECT + copy_from_gpu_async(freqDomainStream_, freqDomainBufferGPU_, freqDomainBufferHost_); +#endif + } +} + +template +auto TransposeMPICompactBufferedGPU::unpack_backward() -> void { + if (spaceDomainDataGPU_.size() > 0) { + gpu::check_status(gpu::memset_async( + static_cast(spaceDomainDataGPU_.data()), 0, + spaceDomainDataGPU_.size() * sizeof(typename decltype(spaceDomainDataGPU_)::ValueType), + 
spaceDomainStream_.get()));
+    if (spaceDomainBufferGPU_.size() > 0) {
+#ifndef SPFFT_GPU_DIRECT
+      copy_to_gpu_async(spaceDomainStream_, spaceDomainBufferHost_, spaceDomainBufferGPU_);
+#endif
+      compact_buffered_unpack_backward(spaceDomainStream_.get(), param_->max_num_z_sticks(),
+                                       create_1d_view(numZSticksGPU_, 0, numZSticksGPU_.size()),
+                                       create_1d_view(indicesGPU_, 0, indicesGPU_.size()),
+                                       spaceDomainBufferGPU_, spaceDomainDataGPU_);
+    }
+  }
+}
+
+template <typename T, typename U>
+auto TransposeMPICompactBufferedGPU<T, U>::exchange_backward_start(const bool nonBlockingExchange)
+    -> void {
+  assert(omp_get_thread_num() == 0);  // only master thread must be allowed to enter
+
+  gpu::check_status(gpu::stream_synchronize(freqDomainStream_.get()));
+
+  // exchange data
+  if (nonBlockingExchange) {
+#ifdef SPFFT_GPU_DIRECT
+    mpi_check_status(MPI_Ialltoallv(freqDomainBufferGPU_.data(), freqDomainCount_.data(),
+                                    freqDomainDispls_.data(), mpiTypeHandle_.get(),
+                                    spaceDomainBufferGPU_.data(), spaceDomainCount_.data(),
+                                    spaceDomainDispls_.data(), mpiTypeHandle_.get(), comm_.get(),
+                                    mpiRequest_.get_and_activate()));
+#else
+    mpi_check_status(MPI_Ialltoallv(freqDomainBufferHost_.data(), freqDomainCount_.data(),
+                                    freqDomainDispls_.data(), mpiTypeHandle_.get(),
+                                    spaceDomainBufferHost_.data(), spaceDomainCount_.data(),
+                                    spaceDomainDispls_.data(), mpiTypeHandle_.get(), comm_.get(),
+                                    mpiRequest_.get_and_activate()));
+#endif
+  } else {
+#ifdef SPFFT_GPU_DIRECT
+    mpi_check_status(MPI_Alltoallv(freqDomainBufferGPU_.data(), freqDomainCount_.data(),
+                                   freqDomainDispls_.data(), mpiTypeHandle_.get(),
+                                   spaceDomainBufferGPU_.data(), spaceDomainCount_.data(),
+                                   spaceDomainDispls_.data(), mpiTypeHandle_.get(), comm_.get()));
+#else
+    mpi_check_status(MPI_Alltoallv(freqDomainBufferHost_.data(), freqDomainCount_.data(),
+                                   freqDomainDispls_.data(), mpiTypeHandle_.get(),
+                                   spaceDomainBufferHost_.data(), spaceDomainCount_.data(),
+                                   spaceDomainDispls_.data(), mpiTypeHandle_.get(), comm_.get()));
+#endif
+  }
+}
+
+template <typename T, typename U>
+auto TransposeMPICompactBufferedGPU<T, U>::exchange_backward_finalize() -> void {
+  mpiRequest_.wait_if_active();
+}
+
+template <typename T, typename U>
+auto TransposeMPICompactBufferedGPU<T, U>::pack_forward() -> void {
+  if (spaceDomainDataGPU_.size() > 0 && spaceDomainBufferGPU_.size() > 0) {
+    compact_buffered_pack_forward(spaceDomainStream_.get(), param_->max_num_z_sticks(),
+                                  create_1d_view(numZSticksGPU_, 0, numZSticksGPU_.size()),
+                                  create_1d_view(indicesGPU_, 0, indicesGPU_.size()),
+                                  spaceDomainDataGPU_, spaceDomainBufferGPU_);
+
+#ifndef SPFFT_GPU_DIRECT
+    copy_from_gpu_async(spaceDomainStream_, spaceDomainBufferGPU_, spaceDomainBufferHost_);
+#endif
+  }
+}
+
+template <typename T, typename U>
+auto TransposeMPICompactBufferedGPU<T, U>::unpack_forward() -> void {
+  if (freqDomainDataGPU_.size() > 0 && freqDomainBufferGPU_.size() > 0) {
+#ifndef SPFFT_GPU_DIRECT
+    copy_to_gpu_async(freqDomainStream_, freqDomainBufferHost_, freqDomainBufferGPU_);
+#endif
+    compact_buffered_unpack_forward(
+        freqDomainStream_.get(), param_->max_num_xy_planes(),
+        create_1d_view(numXYPlanesGPU_, 0, numXYPlanesGPU_.size()),
+        create_1d_view(xyPlaneOffsetsGPU_, 0, xyPlaneOffsetsGPU_.size()), freqDomainBufferGPU_,
+        freqDomainDataGPU_);
+  }
+}
+
+template <typename T, typename U>
+auto TransposeMPICompactBufferedGPU<T, U>::exchange_forward_start(const bool nonBlockingExchange)
+    -> void {
+  assert(omp_get_thread_num() == 0);  // only master thread must be allowed to enter
+
+  gpu::check_status(gpu::stream_synchronize(spaceDomainStream_.get()));
+
+  // exchange data
+  if (nonBlockingExchange) {
+#ifdef SPFFT_GPU_DIRECT
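+    // with GPU direct, device buffers are handed to MPI directly; this requires a
+    // GPU-aware MPI implementation (see the SPFFT_GPU_DIRECT option)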
mpi_check_status(MPI_Ialltoallv(spaceDomainBufferGPU_.data(), spaceDomainCount_.data(), + spaceDomainDispls_.data(), mpiTypeHandle_.get(), + freqDomainBufferGPU_.data(), freqDomainCount_.data(), + freqDomainDispls_.data(), mpiTypeHandle_.get(), comm_.get(), + mpiRequest_.get_and_activate())); +#else + mpi_check_status(MPI_Ialltoallv(spaceDomainBufferHost_.data(), spaceDomainCount_.data(), + spaceDomainDispls_.data(), mpiTypeHandle_.get(), + freqDomainBufferHost_.data(), freqDomainCount_.data(), + freqDomainDispls_.data(), mpiTypeHandle_.get(), comm_.get(), + mpiRequest_.get_and_activate())); +#endif + } else { +#ifdef SPFFT_GPU_DIRECT + mpi_check_status(MPI_Alltoallv(spaceDomainBufferGPU_.data(), spaceDomainCount_.data(), + spaceDomainDispls_.data(), mpiTypeHandle_.get(), + freqDomainBufferGPU_.data(), freqDomainCount_.data(), + freqDomainDispls_.data(), mpiTypeHandle_.get(), comm_.get())); +#else + mpi_check_status(MPI_Alltoallv(spaceDomainBufferHost_.data(), spaceDomainCount_.data(), + spaceDomainDispls_.data(), mpiTypeHandle_.get(), + freqDomainBufferHost_.data(), freqDomainCount_.data(), + freqDomainDispls_.data(), mpiTypeHandle_.get(), comm_.get())); +#endif + } +} + +template +auto TransposeMPICompactBufferedGPU::exchange_forward_finalize() -> void { + mpiRequest_.wait_if_active(); +} + +// Instantiate class for float and double +#ifdef SPFFT_SINGLE_PRECISION +template class TransposeMPICompactBufferedGPU; +#endif +template class TransposeMPICompactBufferedGPU; +template class TransposeMPICompactBufferedGPU; +} // namespace spfft +#endif // SPFFT_MPI diff --git a/src/transpose/transpose_mpi_compact_buffered_gpu.hpp b/src/transpose/transpose_mpi_compact_buffered_gpu.hpp new file mode 100644 index 0000000..7d86b06 --- /dev/null +++ b/src/transpose/transpose_mpi_compact_buffered_gpu.hpp @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2019 ETH Zurich, Simon Frasch + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ +#ifndef SPFFT_TRANSPOSE_MPI_COMPACT_BUFFERED_GPU_HPP +#define SPFFT_TRANSPOSE_MPI_COMPACT_BUFFERED_GPU_HPP + +#include +#include +#include "gpu_util/gpu_fft_api.hpp" +#include "gpu_util/gpu_stream_handle.hpp" +#include "memory/gpu_array.hpp" +#include "memory/host_array_view.hpp" +#include "parameters/parameters.hpp" +#include "spfft/config.h" +#include "transpose.hpp" +#include "util/common_types.hpp" +#include "util/type_check.hpp" + +#if defined(SPFFT_MPI) && (defined(SPFFT_CUDA) || defined(SPFFT_ROCM)) +#include "mpi_util/mpi_communicator_handle.hpp" +#include "mpi_util/mpi_datatype_handle.hpp" +#include "mpi_util/mpi_request_handle.hpp" + +namespace spfft { +template +class TransposeMPICompactBufferedGPU : public Transpose { + static_assert(IsFloatOrDouble::value, "Type T must be float or double"); + + using ValueType = T; + using ComplexType = std::complex; + using ComplexExchangeType = std::complex; + using ComplexGPUType = typename gpu::fft::ComplexType::type; + using ComplexExchangeGPUType = typename gpu::fft::ComplexType::type; + +public: + // spaceDomainDataGPU and freqDomainDataGPU must NOT overlap + // spaceDomainDataGPU and spaceDomainBufferGPU must NOT overlap + // freqDomainDataGPU and freqDomainBufferGPU must NOT overlap + // spaceDomainBufferGPU and freqDomainBufferGPU must NOT overlap + // spaceDomainBufferHost and freqDomainBufferHost must NOT overlap + // + // spaceDomainBufferGPU and freqDomainDataGPU MAY overlap + // freqDomainBufferGPU and spaceDomainDataGPU MAY overlap + TransposeMPICompactBufferedGPU(const std::shared_ptr& param, + MPICommunicatorHandle comm, + HostArrayView1D spaceDomainBufferHost, + GPUArrayView3D spaceDomainDataGPU, + GPUArrayView1D spaceDomainBufferGPU, + GPUStreamHandle spaceDomainStream, + HostArrayView1D freqDomainBufferHost, + GPUArrayView2D freqDomainDataGPU, + GPUArrayView1D freqDomainBufferGPU, + GPUStreamHandle freqDomainStream); + + auto pack_backward() -> void override; + auto exchange_backward_start(const bool nonBlockingExchange) -> void override; + auto exchange_backward_finalize() -> void override; + auto unpack_backward() -> void override; + + auto pack_forward() -> void override; + auto exchange_forward_start(const bool nonBlockingExchange) -> void override; + auto exchange_forward_finalize() -> void override; + auto unpack_forward() -> void override; + +private: + std::shared_ptr param_; + MPIDatatypeHandle mpiTypeHandle_; + MPICommunicatorHandle comm_; + MPIRequestHandle mpiRequest_; + std::vector spaceDomainDispls_; + std::vector freqDomainDispls_; + std::vector spaceDomainCount_; + std::vector freqDomainCount_; + + HostArrayView1D spaceDomainBufferHost_; + HostArrayView1D freqDomainBufferHost_; + GPUArrayView3D spaceDomainDataGPU_; + GPUArrayView2D freqDomainDataGPU_; + GPUArrayView1D spaceDomainBufferGPU_; + GPUArrayView1D freqDomainBufferGPU_; + GPUStreamHandle spaceDomainStream_; + GPUStreamHandle freqDomainStream_; + + GPUArray numZSticksGPU_; + GPUArray numXYPlanesGPU_; + GPUArray xyPlaneOffsetsGPU_; + GPUArray indicesGPU_; +}; + +} // namespace spfft +#endif // SPFFT_MPI +#endif diff --git a/src/transpose/transpose_mpi_compact_buffered_host.cpp b/src/transpose/transpose_mpi_compact_buffered_host.cpp new file mode 100644 index 0000000..c105f07 --- /dev/null +++ b/src/transpose/transpose_mpi_compact_buffered_host.cpp @@ -0,0 +1,294 @@ +/* + * Copyright (c) 2019 ETH Zurich, Simon Frasch + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the 
following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#include +#include +#include +#include +#include +#include +#include "memory/array_view_utility.hpp" +#include "memory/host_array_view.hpp" +#include "parameters/parameters.hpp" +#include "spfft/exceptions.hpp" +#include "transpose.hpp" +#include "util/common_types.hpp" +#include "util/omp_definitions.hpp" +#include "util/type_check.hpp" + +#ifdef SPFFT_MPI +#include "mpi_util/mpi_check_status.hpp" +#include "mpi_util/mpi_communicator_handle.hpp" +#include "mpi_util/mpi_datatype_handle.hpp" +#include "mpi_util/mpi_match_elementary_type.hpp" +#include "transpose/transpose_mpi_compact_buffered_host.hpp" + +namespace spfft { +template +TransposeMPICompactBufferedHost::TransposeMPICompactBufferedHost( + const std::shared_ptr& param, MPICommunicatorHandle comm, + HostArrayView3D spaceDomainData, HostArrayView2D freqDomainData, + HostArrayView1D spaceDomainBuffer, HostArrayView1D freqDomainBuffer) + : param_(param), + comm_(std::move(comm)), + spaceDomainData_(spaceDomainData), + freqDomainData_(freqDomainData), + spaceDomainBuffer_(create_new_type_1d_view(spaceDomainBuffer, + spaceDomainBuffer.size())), + freqDomainBuffer_( + create_new_type_1d_view(freqDomainBuffer, freqDomainBuffer.size())) { + assert(param_->dim_x_freq() == spaceDomainData.dim_mid()); + assert(param_->dim_y() == spaceDomainData.dim_inner()); + assert(param_->num_xy_planes(comm_.rank()) == spaceDomainData.dim_outer()); + assert(param_->dim_z() == freqDomainData.dim_inner()); + assert(param_->num_z_sticks(comm_.rank()) == freqDomainData.dim_outer()); + + assert(spaceDomainBuffer.size() >= + param_->total_num_z_sticks() * param_->num_xy_planes(comm_.rank())); + assert(freqDomainBuffer.size() >= + param_->total_num_xy_planes() * param_->num_z_sticks(comm_.rank())); + + assert(disjoint(spaceDomainData, freqDomainData)); + assert(disjoint(spaceDomainData, spaceDomainBuffer)); + assert(disjoint(freqDomainData, freqDomainBuffer)); + assert(disjoint(spaceDomainBuffer, freqDomainBuffer)); + + // create underlying type + mpiTypeHandle_ = MPIDatatypeHandle::create_contiguous(2, MPIMatchElementaryType::get()); + + spaceDomainCount_.resize(comm_.size()); + 
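+  // element counts per target rank for MPI_Alltoallv: unlike the buffered variant, only
+  // the data actually present is exchanged (numLocalZSticks * num_xy_planes(r) in the
+  // frequency domain, num_z_sticks(r) * numLocalXYPlanes in the space domain)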
freqDomainCount_.resize(comm_.size()); + const SizeType numLocalZSticks = param_->num_z_sticks(comm_.rank()); + const SizeType numLocalXYPlanes = param_->num_xy_planes(comm_.rank()); + for (SizeType r = 0; r < comm_.size(); ++r) { + freqDomainCount_[r] = numLocalZSticks * param_->num_xy_planes(r); + spaceDomainCount_[r] = param_->num_z_sticks(r) * numLocalXYPlanes; + } + + spaceDomainDispls_.resize(comm_.size()); + freqDomainDispls_.resize(comm_.size()); + int currentFreqDomainDispls = 0; + int currentSpaceDomainDispls = 0; + for (SizeType r = 0; r < comm_.size(); ++r) { + assert(currentSpaceDomainDispls + spaceDomainCount_[r] <= + static_cast(spaceDomainBuffer.size())); + assert(currentFreqDomainDispls + freqDomainCount_[r] <= + static_cast(freqDomainBuffer.size())); + spaceDomainDispls_[r] = currentSpaceDomainDispls; + freqDomainDispls_[r] = currentFreqDomainDispls; + currentSpaceDomainDispls += spaceDomainCount_[r]; + currentFreqDomainDispls += freqDomainCount_[r]; + } +} + +template +auto TransposeMPICompactBufferedHost::pack_backward() -> void { + // transpose locally from (numLocalZSticks, dimZ) to (dimZ, numLocalZSticks) + for (SizeType r = 0; r < static_cast(comm_.size()); ++r) { + const auto xyPlaneOffset = param_->xy_plane_offset(r); + auto freqDomainBuffer2d = create_2d_view(freqDomainBuffer_, freqDomainDispls_[r], + freqDomainData_.dim_outer(), param_->num_xy_planes(r)); + + SPFFT_OMP_PRAGMA("omp for schedule(static) nowait") + for (SizeType zStickIndex = 0; zStickIndex < freqDomainData_.dim_outer(); ++zStickIndex) { + for (SizeType zIndex = 0; zIndex < param_->num_xy_planes(r); ++zIndex) { + freqDomainBuffer2d(zStickIndex, zIndex) = + freqDomainData_(zStickIndex, zIndex + xyPlaneOffset); + } + } + } + SPFFT_OMP_PRAGMA("omp barrier") +} + +template +auto TransposeMPICompactBufferedHost::unpack_backward() -> void { + // zero target data location (not all values are overwritten upon unpacking) + SPFFT_OMP_PRAGMA("omp for schedule(static)") // implicit barrier + for (SizeType z = 0; z < spaceDomainData_.dim_outer(); ++z) { + std::memset(static_cast(&spaceDomainData_(z, 0, 0)), 0, + sizeof(typename decltype(spaceDomainData_)::ValueType) * + spaceDomainData_.dim_inner() * spaceDomainData_.dim_mid()); + } + + auto spaceDomainDataFlat = + create_2d_view(spaceDomainData_, 0, spaceDomainData_.dim_outer(), + spaceDomainData_.dim_mid() * spaceDomainData_.dim_inner()); + + // unpack from (numZSticksTotal, numLocalXYPlanes) to (numLocalXYPlanes, dimX, dimY) + const auto numLocalXYPlanes = param_->num_xy_planes(comm_.rank()); + for (SizeType r = 0; r < (SizeType)comm_.size(); ++r) { + const auto zStickXYIndices = param_->z_stick_xy_indices(r); + // take care with unsigned type + const SizeType unrolledLoopEnd = zStickXYIndices.size() < 4 ? 
0 : zStickXYIndices.size() - 3;
+
+    auto recvBuffer = create_2d_view(spaceDomainBuffer_, spaceDomainDispls_[r],
+                                     zStickXYIndices.size(), numLocalXYPlanes);
+    SPFFT_OMP_PRAGMA("omp for schedule(static) nowait")
+    for (SizeType zStickIndex = 0; zStickIndex < unrolledLoopEnd; zStickIndex += 4) {
+      const SizeType xyIndex1 = zStickXYIndices(zStickIndex);
+      const SizeType xyIndex2 = zStickXYIndices(zStickIndex + 1);
+      const SizeType xyIndex3 = zStickXYIndices(zStickIndex + 2);
+      const SizeType xyIndex4 = zStickXYIndices(zStickIndex + 3);
+
+      // manual loop unrolling for better performance
+      for (SizeType zIndex = 0; zIndex < numLocalXYPlanes; ++zIndex) {
+        spaceDomainDataFlat(zIndex, xyIndex1) = recvBuffer(zStickIndex, zIndex);
+        spaceDomainDataFlat(zIndex, xyIndex2) = recvBuffer(zStickIndex + 1, zIndex);
+        spaceDomainDataFlat(zIndex, xyIndex3) = recvBuffer(zStickIndex + 2, zIndex);
+        spaceDomainDataFlat(zIndex, xyIndex4) = recvBuffer(zStickIndex + 3, zIndex);
+      }
+    }
+    SPFFT_OMP_PRAGMA("omp for schedule(static) nowait")
+    for (SizeType zStickIndex = unrolledLoopEnd; zStickIndex < zStickXYIndices.size();
+         zStickIndex += 1) {
+      const SizeType xyIndex = zStickXYIndices(zStickIndex);
+      for (SizeType zIndex = 0; zIndex < numLocalXYPlanes; ++zIndex) {
+        spaceDomainDataFlat(zIndex, xyIndex) = recvBuffer(zStickIndex, zIndex);
+      }
+    }
+  }
+  SPFFT_OMP_PRAGMA("omp barrier")
+}
+
+template <typename T, typename U>
+auto TransposeMPICompactBufferedHost<T, U>::exchange_backward_finalize() -> void {
+  mpiRequest_.wait_if_active();
+}
+
+template <typename T, typename U>
+auto TransposeMPICompactBufferedHost<T, U>::exchange_backward_start(const bool nonBlockingExchange)
+    -> void {
+  assert(omp_get_thread_num() == 0);  // only master thread must be allowed to enter
+
+  // exchange data
+  if (nonBlockingExchange) {
+    mpi_check_status(MPI_Ialltoallv(freqDomainBuffer_.data(), freqDomainCount_.data(),
+                                    freqDomainDispls_.data(), mpiTypeHandle_.get(),
+                                    spaceDomainBuffer_.data(), spaceDomainCount_.data(),
+                                    spaceDomainDispls_.data(), mpiTypeHandle_.get(), comm_.get(),
+                                    mpiRequest_.get_and_activate()));
+  } else {
+    mpi_check_status(MPI_Alltoallv(freqDomainBuffer_.data(), freqDomainCount_.data(),
+                                   freqDomainDispls_.data(), mpiTypeHandle_.get(),
+                                   spaceDomainBuffer_.data(), spaceDomainCount_.data(),
+                                   spaceDomainDispls_.data(), mpiTypeHandle_.get(), comm_.get()));
+  }
+}
+
+template <typename T, typename U>
+auto TransposeMPICompactBufferedHost<T, U>::pack_forward() -> void {
+  auto spaceDomainDataFlat =
+      create_2d_view(spaceDomainData_, 0, spaceDomainData_.dim_outer(),
+                     spaceDomainData_.dim_mid() * spaceDomainData_.dim_inner());
+
+  // pack from (numLocalXYPlanes, dimX, dimY) to (numZSticksTotal, numLocalXYPlanes)
+  const auto numLocalXYPlanes = param_->num_xy_planes(comm_.rank());
+  for (SizeType r = 0; r < (SizeType)comm_.size(); ++r) {
+    const auto zStickXYIndices = param_->z_stick_xy_indices(r);
+    // take care with unsigned type
+    const SizeType unrolledLoopEnd = zStickXYIndices.size() < 4 ? 0 : zStickXYIndices.size() - 3;
+
+    auto recvBuffer = create_2d_view(spaceDomainBuffer_, spaceDomainDispls_[r],
+                                     zStickXYIndices.size(), numLocalXYPlanes);
+
+    SPFFT_OMP_PRAGMA("omp for schedule(static) nowait")
+    for (SizeType zStickIndex = 0; zStickIndex < unrolledLoopEnd; zStickIndex += 4) {
+      // manual loop unrolling for better performance
+      const SizeType xyIndex1 = zStickXYIndices(zStickIndex);
+      const SizeType xyIndex2 = zStickXYIndices(zStickIndex + 1);
+      const SizeType xyIndex3 = zStickXYIndices(zStickIndex + 2);
+      const SizeType xyIndex4 = zStickXYIndices(zStickIndex + 3);
+      for (SizeType zIndex = 0; zIndex < numLocalXYPlanes; ++zIndex) {
+        recvBuffer(zStickIndex, zIndex) = spaceDomainDataFlat(zIndex, xyIndex1);
+        recvBuffer(zStickIndex + 1, zIndex) = spaceDomainDataFlat(zIndex, xyIndex2);
+        recvBuffer(zStickIndex + 2, zIndex) = spaceDomainDataFlat(zIndex, xyIndex3);
+        recvBuffer(zStickIndex + 3, zIndex) = spaceDomainDataFlat(zIndex, xyIndex4);
+      }
+    }
+    SPFFT_OMP_PRAGMA("omp for schedule(static) nowait")
+    for (SizeType zStickIndex = unrolledLoopEnd; zStickIndex < zStickXYIndices.size();
+         zStickIndex += 1) {
+      const SizeType xyIndex = zStickXYIndices(zStickIndex);
+      for (SizeType zIndex = 0; zIndex < numLocalXYPlanes; ++zIndex) {
+        recvBuffer(zStickIndex, zIndex) = spaceDomainDataFlat(zIndex, xyIndex);
+      }
+    }
+  }
+  SPFFT_OMP_PRAGMA("omp barrier")
+}
+
+template <typename T, typename U>
+auto TransposeMPICompactBufferedHost<T, U>::exchange_forward_finalize() -> void {
+  mpiRequest_.wait_if_active();
+}
+
+template <typename T, typename U>
+auto TransposeMPICompactBufferedHost<T, U>::unpack_forward() -> void {
+  // transpose locally from (dimZ, numLocalZSticks) to (numLocalZSticks, dimZ)
+  for (SizeType r = 0; r < static_cast<SizeType>(comm_.size()); ++r) {
+    const auto xyPlaneOffset = param_->xy_plane_offset(r);
+    auto freqDomainBuffer2d = create_2d_view(freqDomainBuffer_, freqDomainDispls_[r],
+                                             freqDomainData_.dim_outer(), param_->num_xy_planes(r));
+
+    SPFFT_OMP_PRAGMA("omp for schedule(static) nowait")
+    for (SizeType zStickIndex = 0; zStickIndex < freqDomainData_.dim_outer(); ++zStickIndex) {
+      for (SizeType zIndex = 0; zIndex < param_->num_xy_planes(r); ++zIndex) {
+        freqDomainData_(zStickIndex, zIndex + xyPlaneOffset) =
+            freqDomainBuffer2d(zStickIndex, zIndex);
+      }
+    }
+  }
+  SPFFT_OMP_PRAGMA("omp barrier")
+}
+
+template <typename T, typename U>
+auto TransposeMPICompactBufferedHost<T, U>::exchange_forward_start(const bool nonBlockingExchange)
+    -> void {
+  assert(omp_get_thread_num() == 0);  // only master thread must be allowed to enter
+
+  if (nonBlockingExchange) {
+    mpi_check_status(MPI_Ialltoallv(spaceDomainBuffer_.data(), spaceDomainCount_.data(),
+                                    spaceDomainDispls_.data(), mpiTypeHandle_.get(),
+                                    freqDomainBuffer_.data(), freqDomainCount_.data(),
+                                    freqDomainDispls_.data(), mpiTypeHandle_.get(), comm_.get(),
+                                    mpiRequest_.get_and_activate()));
+  } else {
+    mpi_check_status(MPI_Alltoallv(spaceDomainBuffer_.data(), spaceDomainCount_.data(),
+                                   spaceDomainDispls_.data(), mpiTypeHandle_.get(),
+                                   freqDomainBuffer_.data(), freqDomainCount_.data(),
+                                   freqDomainDispls_.data(), mpiTypeHandle_.get(), comm_.get()));
+  }
+}
+
+// Instantiate class for float and double
+#ifdef SPFFT_SINGLE_PRECISION
+template class TransposeMPICompactBufferedHost<float, float>;
+#endif
+template class TransposeMPICompactBufferedHost<double, double>;
+template class TransposeMPICompactBufferedHost<double, float>;
+}  // namespace spfft
+#endif  // SPFFT_MPI
diff --git a/src/transpose/transpose_mpi_compact_buffered_host.hpp b/src/transpose/transpose_mpi_compact_buffered_host.hpp
new file mode 100644
index 0000000..493c22d
--- /dev/null
+++
b/src/transpose/transpose_mpi_compact_buffered_host.hpp @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2019 ETH Zurich, Simon Frasch + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef SPFFT_TRANSPOSE_MPI_COMPACT_BUFFERED_HOST_HPP +#define SPFFT_TRANSPOSE_MPI_COMPACT_BUFFERED_HOST_HPP + +#include +#include +#include "memory/host_array_view.hpp" +#include "parameters/parameters.hpp" +#include "spfft/config.h" +#include "transpose.hpp" +#include "util/type_check.hpp" + +#ifdef SPFFT_MPI +#include "mpi_util/mpi_datatype_handle.hpp" +#include "mpi_util/mpi_communicator_handle.hpp" +#include "mpi_util/mpi_request_handle.hpp" + +namespace spfft { +template +class TransposeMPICompactBufferedHost : public Transpose { + static_assert(IsFloatOrDouble::value, "Type T must be float or double"); + using ValueType = T; + using ComplexType = std::complex; + using ComplexExchangeType = std::complex; + +public: + // spaceDomainData and freqDomainData must NOT overlap + // spaceDomainData and spaceDomainBuffer must NOT overlap + // freqDomainData and freqDomainBuffer must NOT overlap + // spaceDomainBuffer and freqDomainBuffer must NOT overlap + // + // spaceDomainBuffer and freqDomainData MAY overlap + // freqDomainBuffer and spaceDomainData MAY overlap + TransposeMPICompactBufferedHost(const std::shared_ptr& param, + MPICommunicatorHandle comm, + HostArrayView3D spaceDomainData, + HostArrayView2D freqDomainData, + HostArrayView1D spaceDomainBuffer, + HostArrayView1D freqDomainBuffer); + + auto pack_backward() -> void override; + auto exchange_backward_start(const bool nonBlockingExchange) -> void override; + auto exchange_backward_finalize() -> void override; + auto unpack_backward() -> void override; + + auto pack_forward() -> void override; + auto exchange_forward_start(const bool nonBlockingExchange) -> void override; + auto exchange_forward_finalize() -> void override; + auto unpack_forward() -> void override; + +private: + std::shared_ptr param_; + MPIDatatypeHandle mpiTypeHandle_; + MPICommunicatorHandle comm_; + MPIRequestHandle mpiRequest_; + + 
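+  // views of caller-owned data arrays and exchange buffers; the buffers are
+  // reinterpreted to the exchange precision ComplexExchangeType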
HostArrayView3D spaceDomainData_; + HostArrayView2D freqDomainData_; + HostArrayView1D spaceDomainBuffer_; + HostArrayView1D freqDomainBuffer_; + + std::vector spaceDomainDispls_; + std::vector freqDomainDispls_; + std::vector spaceDomainCount_; + std::vector freqDomainCount_; +}; + +} // namespace spfft +#endif // SPFFT_MPI +#endif diff --git a/src/transpose/transpose_mpi_unbuffered_gpu.cpp b/src/transpose/transpose_mpi_unbuffered_gpu.cpp new file mode 100644 index 0000000..a947f14 --- /dev/null +++ b/src/transpose/transpose_mpi_unbuffered_gpu.cpp @@ -0,0 +1,272 @@ +/* + * Copyright (c) 2019 ETH Zurich, Simon Frasch + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include "gpu_util/gpu_transfer.hpp"
+#include "memory/array_view_utility.hpp"
+#include "memory/host_array_view.hpp"
+#include "parameters/parameters.hpp"
+#include "spfft/exceptions.hpp"
+#include "transpose.hpp"
+#include "util/common_types.hpp"
+#include "util/omp_definitions.hpp"
+#include "util/type_check.hpp"
+
+#ifdef SPFFT_MPI
+#include "mpi_util/mpi_check_status.hpp"
+#include "mpi_util/mpi_communicator_handle.hpp"
+#include "mpi_util/mpi_datatype_handle.hpp"
+#include "mpi_util/mpi_match_elementary_type.hpp"
+#include "transpose/transpose_mpi_unbuffered_gpu.hpp"
+
+namespace spfft {
+template <typename T>
+TransposeMPIUnbufferedGPU<T>::TransposeMPIUnbufferedGPU(
+    const std::shared_ptr<Parameters>& param, MPICommunicatorHandle comm,
+    HostArrayView3D<ComplexType> spaceDomainData,
+    GPUArrayView3D<typename gpu::fft::ComplexType<T>::type> spaceDomainDataGPU,
+    GPUStreamHandle spaceDomainStream, HostArrayView2D<ComplexType> freqDomainData,
+    GPUArrayView2D<typename gpu::fft::ComplexType<T>::type> freqDomainDataGPU,
+    GPUStreamHandle freqDomainStream)
+    : comm_(std::move(comm)),
+      spaceDomainData_(spaceDomainData),
+      freqDomainData_(freqDomainData),
+      spaceDomainDataGPU_(spaceDomainDataGPU),
+      freqDomainDataGPU_(freqDomainDataGPU),
+      numLocalXYPlanes_(spaceDomainData.dim_outer()),
+      spaceDomainStream_(std::move(spaceDomainStream)),
+      freqDomainStream_(std::move(freqDomainStream)) {
+  assert(disjoint(spaceDomainData, freqDomainData));
+  assert(param->dim_x_freq() == spaceDomainData.dim_inner());
+  assert(param->dim_y() == spaceDomainData.dim_mid());
+  assert(param->num_xy_planes(comm_.rank()) == spaceDomainData.dim_outer());
+  assert(param->dim_z() == freqDomainData.dim_inner());
+  assert(param->num_z_sticks(comm_.rank()) == freqDomainData.dim_outer());
+
+  // create underlying type
+  MPIDatatypeHandle complexType =
+      MPIDatatypeHandle::create_contiguous(2, MPIMatchElementaryType<T>::get());
+
+  // create types in frequency space for each rank:
+  // each type represents a fixed length part of every z stick the rank holds
+  freqDomainTypeHandles_.reserve(comm_.size());
+  freqDomainCount_.reserve(comm_.size());
+  freqDomainTypes_.reserve(comm_.size());
+  freqDomainDispls_.assign(comm_.size(), 0);
+
+  const SizeType numLocalZSticks = param->num_z_sticks(comm_.rank());
+  const SizeType numLocalXYPlanes = param->num_xy_planes(comm_.rank());
+  for (SizeType r = 0; r < comm_.size(); ++r) {
+    if (param->num_xy_planes(r) > 0 && numLocalZSticks > 0) {
+      const int ndims = 2;
+      const int arrayOfSizes[] = {(int)numLocalZSticks, (int)freqDomainData_.dim_inner()};
+      const int arrayOfSubsizes[] = {(int)numLocalZSticks, (int)param->num_xy_planes(r)};
+      const int arrayOfStarts[] = {(int)0, (int)param->xy_plane_offset(r)};
+      const int order = MPI_ORDER_C;
+
+      freqDomainCount_.emplace_back(1);
+      freqDomainTypeHandles_.emplace_back(MPIDatatypeHandle::create_subarray(
+          ndims, arrayOfSizes, arrayOfSubsizes, arrayOfStarts, order, complexType.get()));
+      freqDomainTypes_.emplace_back(freqDomainTypeHandles_.back().get());
+    } else {
+      freqDomainCount_.emplace_back(0);
+      freqDomainTypeHandles_.emplace_back(complexType);
+      freqDomainTypes_.emplace_back(freqDomainTypeHandles_.back().get());
+    }
+  }
+
+  // create types in space domain for each rank:
+  // each type represents a batch of partial z sticks with inner stride dimX*dimY and placed
+  // according to the associated x/y indices
+  std::vector<int> indexedBlocklengths;
+  std::vector<MPI_Aint> indexedDispls;
+
+  spaceDomainTypes_.reserve(comm_.size());
+  spaceDomainCount_.reserve(comm_.size());
+  spaceDomainDispls_.assign(comm_.size(), 0);
+  for (SizeType r = 0; r < comm_.size(); ++r) {
+    if (param->num_z_sticks(r) > 0 && numLocalXYPlanes > 0) {
+      // data type for single z stick part
+      MPIDatatypeHandle stridedZStickType = MPIDatatypeHandle::create_vector(
+          numLocalXYPlanes, 1, spaceDomainData_.dim_inner() * spaceDomainData_.dim_mid(),
+          complexType.get());
+
+      const auto zStickXYIndices = param->z_stick_xy_indices(r);
+
+      indexedBlocklengths.resize(zStickXYIndices.size(), 1);
+      indexedDispls.resize(zStickXYIndices.size());
+      // displacements of all z stick parts to be sent to current rank
+      for (SizeType idxZStick = 0; idxZStick < zStickXYIndices.size(); ++idxZStick) {
+        // transpose stick index
+        const int xyIndex = zStickXYIndices(idxZStick);
+        const int x = xyIndex / param->dim_y();
+        const int y = xyIndex - x * param->dim_y();
+
+        indexedDispls[idxZStick] = 2 * sizeof(T) * (y * param->dim_x_freq() + x);
+      }
+
+      spaceDomainCount_.emplace_back(1);
+      spaceDomainTypeHandles_.emplace_back(
+          MPIDatatypeHandle::create_hindexed(zStickXYIndices.size(), indexedBlocklengths.data(),
+                                             indexedDispls.data(), stridedZStickType.get()));
+      spaceDomainTypes_.emplace_back(spaceDomainTypeHandles_.back().get());
+    } else {
+      spaceDomainCount_.emplace_back(0);
+      spaceDomainTypeHandles_.emplace_back(complexType);
+      spaceDomainTypes_.emplace_back(complexType.get());
+    }
+  }
+}
+
+template <typename T>
+auto TransposeMPIUnbufferedGPU<T>::pack_backward() -> void {
+#ifdef SPFFT_GPU_DIRECT
+  gpu::check_status(gpu::memset_async(
+      static_cast<void*>(spaceDomainDataGPU_.data()), 0,
+      spaceDomainDataGPU_.size() * sizeof(typename decltype(spaceDomainDataGPU_)::ValueType),
+      spaceDomainStream_.get()));
+#else
+  copy_from_gpu_async(freqDomainStream_, freqDomainDataGPU_, freqDomainData_);
+  // zero target data location (not all values are overwritten upon unpacking)
+  std::memset(static_cast<void*>(spaceDomainData_.data()), 0,
+              sizeof(typename decltype(spaceDomainData_)::ValueType) * spaceDomainData_.size());
+#endif
+}
+
+template <typename T>
+auto TransposeMPIUnbufferedGPU<T>::unpack_backward() -> void {
+#ifndef SPFFT_GPU_DIRECT
+  copy_to_gpu_async(spaceDomainStream_, spaceDomainData_, spaceDomainDataGPU_);
+#endif
+}
+
+template <typename T>
+auto TransposeMPIUnbufferedGPU<T>::exchange_backward_start(const bool nonBlockingExchange) -> void {
+  assert(omp_get_thread_num() == 0);  // only master thread must be allowed to enter
+
+  gpu::check_status(gpu::stream_synchronize(freqDomainStream_.get()));
+
+  if (nonBlockingExchange) {
+#ifdef SPFFT_GPU_DIRECT
+    mpi_check_status(MPI_Ialltoallw(freqDomainDataGPU_.data(), freqDomainCount_.data(),
+                                    freqDomainDispls_.data(), freqDomainTypes_.data(),
+                                    spaceDomainDataGPU_.data(), spaceDomainCount_.data(),
+                                    spaceDomainDispls_.data(), spaceDomainTypes_.data(),
+                                    comm_.get(), mpiRequest_.get_and_activate()));
+#else
+    mpi_check_status(MPI_Ialltoallw(freqDomainData_.data(), freqDomainCount_.data(),
+                                    freqDomainDispls_.data(), freqDomainTypes_.data(),
+                                    spaceDomainData_.data(), spaceDomainCount_.data(),
+                                    spaceDomainDispls_.data(), spaceDomainTypes_.data(),
+                                    comm_.get(), mpiRequest_.get_and_activate()));
+#endif
+  } else {
+#ifdef SPFFT_GPU_DIRECT
+    mpi_check_status(
+        MPI_Alltoallw(freqDomainDataGPU_.data(), freqDomainCount_.data(), freqDomainDispls_.data(),
+                      freqDomainTypes_.data(), spaceDomainDataGPU_.data(), spaceDomainCount_.data(),
+                      spaceDomainDispls_.data(), spaceDomainTypes_.data(), comm_.get()));
+#else
+    mpi_check_status(
+        MPI_Alltoallw(freqDomainData_.data(), freqDomainCount_.data(), freqDomainDispls_.data(),
+                      freqDomainTypes_.data(), spaceDomainData_.data(), spaceDomainCount_.data(),
+                      spaceDomainDispls_.data(), spaceDomainTypes_.data(), comm_.get()));
+#endif
+  }
+}
+
+template <typename T>
+auto TransposeMPIUnbufferedGPU<T>::exchange_backward_finalize() -> void {
+  mpiRequest_.wait_if_active();
+}
+
+template <typename T>
+auto TransposeMPIUnbufferedGPU<T>::exchange_forward_start(const bool nonBlockingExchange) -> void {
+  assert(omp_get_thread_num() == 0);  // only master thread must be allowed to enter
+
+  gpu::check_status(gpu::stream_synchronize(spaceDomainStream_.get()));
+
+  if (nonBlockingExchange) {
+#ifdef SPFFT_GPU_DIRECT
+    mpi_check_status(MPI_Ialltoallw(spaceDomainDataGPU_.data(), spaceDomainCount_.data(),
+                                    spaceDomainDispls_.data(), spaceDomainTypes_.data(),
+                                    freqDomainDataGPU_.data(), freqDomainCount_.data(),
+                                    freqDomainDispls_.data(), freqDomainTypes_.data(), comm_.get(),
+                                    mpiRequest_.get_and_activate()));
+#else
+    mpi_check_status(MPI_Ialltoallw(spaceDomainData_.data(), spaceDomainCount_.data(),
+                                    spaceDomainDispls_.data(), spaceDomainTypes_.data(),
+                                    freqDomainData_.data(), freqDomainCount_.data(),
+                                    freqDomainDispls_.data(), freqDomainTypes_.data(), comm_.get(),
+                                    mpiRequest_.get_and_activate()));
+#endif
+  } else {
+#ifdef SPFFT_GPU_DIRECT
+    mpi_check_status(MPI_Alltoallw(spaceDomainDataGPU_.data(), spaceDomainCount_.data(),
+                                   spaceDomainDispls_.data(), spaceDomainTypes_.data(),
+                                   freqDomainDataGPU_.data(), freqDomainCount_.data(),
+                                   freqDomainDispls_.data(), freqDomainTypes_.data(), comm_.get()));
+#else
+    mpi_check_status(MPI_Alltoallw(spaceDomainData_.data(), spaceDomainCount_.data(),
+                                   spaceDomainDispls_.data(), spaceDomainTypes_.data(),
+                                   freqDomainData_.data(), freqDomainCount_.data(),
+                                   freqDomainDispls_.data(), freqDomainTypes_.data(), comm_.get()));
+#endif
+  }
+}
+
+template <typename T>
+auto TransposeMPIUnbufferedGPU<T>::exchange_forward_finalize() -> void {
+  mpiRequest_.wait_if_active();
+}
+
+template <typename T>
+auto TransposeMPIUnbufferedGPU<T>::pack_forward() -> void {
+#ifndef SPFFT_GPU_DIRECT
+  copy_from_gpu_async(spaceDomainStream_, spaceDomainDataGPU_, spaceDomainData_);
+#endif
+}
+
+template <typename T>
+auto TransposeMPIUnbufferedGPU<T>::unpack_forward() -> void {
+#ifndef SPFFT_GPU_DIRECT
+  copy_to_gpu_async(freqDomainStream_, freqDomainData_, freqDomainDataGPU_);
+#endif
+}
+
+// Instantiate class for float and double
+#ifdef SPFFT_SINGLE_PRECISION
+template class TransposeMPIUnbufferedGPU<float>;
+#endif
+template class TransposeMPIUnbufferedGPU<double>;
+}  // namespace spfft
+#endif  // SPFFT_MPI
diff --git a/src/transpose/transpose_mpi_unbuffered_gpu.hpp b/src/transpose/transpose_mpi_unbuffered_gpu.hpp
new file mode 100644
index 0000000..eeceeb7
--- /dev/null
+++ b/src/transpose/transpose_mpi_unbuffered_gpu.hpp
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef SPFFT_TRANSPOSE_MPI_UNBUFFERED_GPU_HPP +#define SPFFT_TRANSPOSE_MPI_UNBUFFERED_GPU_HPP + +#include +#include +#include "gpu_util/gpu_fft_api.hpp" +#include "gpu_util/gpu_stream_handle.hpp" +#include "memory/gpu_array.hpp" +#include "memory/gpu_array_view.hpp" +#include "memory/host_array_view.hpp" +#include "parameters/parameters.hpp" +#include "spfft/config.h" +#include "transpose.hpp" +#include "util/common_types.hpp" +#include "util/type_check.hpp" + +#ifdef SPFFT_MPI +#include "mpi_util/mpi_communicator_handle.hpp" +#include "mpi_util/mpi_datatype_handle.hpp" +#include "mpi_util/mpi_request_handle.hpp" + +namespace spfft { +template +class TransposeMPIUnbufferedGPU : public Transpose { + static_assert(IsFloatOrDouble::value, "Type T must be float or double"); + using ValueType = T; + using ComplexType = std::complex; + +public: + TransposeMPIUnbufferedGPU( + const std::shared_ptr& param, MPICommunicatorHandle comm, + HostArrayView3D spaceDomainData, + GPUArrayView3D::type> spaceDomainDataGPU, + GPUStreamHandle spaceDomainStream, HostArrayView2D freqDomainData, + GPUArrayView2D::type> freqDomainDataGPU, + GPUStreamHandle freqDomainStream); + + auto pack_backward() -> void override; + auto exchange_backward_start(const bool nonBlockingExchange) -> void override; + auto exchange_backward_finalize() -> void override; + auto unpack_backward() -> void override; + + auto pack_forward() -> void override; + auto exchange_forward_start(const bool nonBlockingExchange) -> void override; + auto exchange_forward_finalize() -> void override; + auto unpack_forward() -> void override; + +private: + MPICommunicatorHandle comm_; + MPIRequestHandle mpiRequest_; + + HostArrayView3D spaceDomainData_; + HostArrayView2D freqDomainData_; + + GPUArrayView3D::type> spaceDomainDataGPU_; + GPUArrayView2D::type> freqDomainDataGPU_; + + SizeType numLocalXYPlanes_; + GPUStreamHandle spaceDomainStream_; + GPUStreamHandle freqDomainStream_; + + std::vector freqDomainTypeHandles_; + std::vector freqDomainTypes_; + std::vector freqDomainDispls_; + std::vector freqDomainCount_; + std::vector spaceDomainTypeHandles_; + std::vector spaceDomainTypes_; + std::vector spaceDomainDispls_; + std::vector spaceDomainCount_; +}; + +} // namespace spfft +#endif // SPFFT_MPI +#endif diff --git a/src/transpose/transpose_mpi_unbuffered_host.cpp b/src/transpose/transpose_mpi_unbuffered_host.cpp new file mode 100644 index 0000000..fc69818 --- /dev/null +++ b/src/transpose/transpose_mpi_unbuffered_host.cpp @@ -0,0 +1,194 @@ +/* + * Copyright (c) 2019 ETH Zurich, Simon Frasch + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#include +#include +#include +#include +#include +#include +#include "memory/array_view_utility.hpp" +#include "memory/host_array_view.hpp" +#include "parameters/parameters.hpp" +#include "spfft/exceptions.hpp" +#include "transpose.hpp" +#include "util/common_types.hpp" +#include "util/omp_definitions.hpp" +#include "util/type_check.hpp" + +#ifdef SPFFT_MPI +#include "mpi_util/mpi_check_status.hpp" +#include "mpi_util/mpi_communicator_handle.hpp" +#include "mpi_util/mpi_datatype_handle.hpp" +#include "mpi_util/mpi_match_elementary_type.hpp" +#include "transpose/transpose_mpi_unbuffered_host.hpp" + +namespace spfft { +template +TransposeMPIUnbufferedHost::TransposeMPIUnbufferedHost( + const std::shared_ptr& param, MPICommunicatorHandle comm, + HostArrayView3D spaceDomainData, HostArrayView2D freqDomainData) + : comm_(std::move(comm)), + spaceDomainData_(spaceDomainData), + freqDomainData_(freqDomainData), + numLocalXYPlanes_(spaceDomainData.dim_outer()) { + assert(disjoint(spaceDomainData, freqDomainData)); + assert(param->dim_x_freq() == spaceDomainData.dim_mid()); + assert(param->dim_y() == spaceDomainData.dim_inner()); + assert(param->num_xy_planes(comm_.rank()) == spaceDomainData.dim_outer()); + assert(param->dim_z() == freqDomainData.dim_inner()); + assert(param->num_z_sticks(comm_.rank()) == freqDomainData.dim_outer()); + + // create underlying type + MPIDatatypeHandle complexType = + MPIDatatypeHandle::create_contiguous(2, MPIMatchElementaryType::get()); + + // create types in frequency space for each rank: + // each type represents a fixed length part of every z stick the rank holds + freqDomainTypeHandles_.reserve(comm_.size()); + freqDomainCount_.reserve(comm_.size()); + freqDomainTypes_.reserve(comm_.size()); + freqDomainDispls_.assign(comm_.size(), 0); + + const SizeType numLocalZSticks = param->num_z_sticks(comm_.rank()); + const SizeType numLocalXYPlanes = param->num_xy_planes(comm_.rank()); + for (SizeType r = 0; r < comm_.size(); ++r) { + if (param->num_xy_planes(r) > 0 && numLocalZSticks > 0) { + const int ndims = 2; + const int arrayOfSizes[] = {(int)numLocalZSticks, (int)freqDomainData_.dim_inner()}; + const int arrayOfSubsizes[] = 
{(int)numLocalZSticks, (int)param->num_xy_planes(r)};
+      const int arrayOfStarts[] = {(int)0, (int)param->xy_plane_offset(r)};
+      const int order = MPI_ORDER_C;
+
+      freqDomainCount_.emplace_back(1);
+      freqDomainTypeHandles_.emplace_back(MPIDatatypeHandle::create_subarray(
+          ndims, arrayOfSizes, arrayOfSubsizes, arrayOfStarts, order, complexType.get()));
+      freqDomainTypes_.emplace_back(freqDomainTypeHandles_.back().get());
+    } else {
+      freqDomainCount_.emplace_back(0);
+      freqDomainTypeHandles_.emplace_back(complexType);
+      freqDomainTypes_.emplace_back(freqDomainTypeHandles_.back().get());
+    }
+  }
+
+  // create types in space domain for each rank:
+  // each type represents a batch of partial z sticks with inner stride dimX*dimY and placed
+  // according to the associated x/y indices
+  std::vector<int> indexedBlocklengths;
+  std::vector<MPI_Aint> indexedDispls;
+
+  spaceDomainTypes_.reserve(comm_.size());
+  spaceDomainCount_.reserve(comm_.size());
+  spaceDomainDispls_.assign(comm_.size(), 0);
+  for (SizeType r = 0; r < comm_.size(); ++r) {
+    if (param->num_z_sticks(r) > 0 && numLocalXYPlanes > 0) {
+      // data type for single z stick part
+      MPIDatatypeHandle stridedZStickType = MPIDatatypeHandle::create_vector(
+          numLocalXYPlanes, 1, spaceDomainData_.dim_inner() * spaceDomainData_.dim_mid(),
+          complexType.get());
+
+      const auto zStickXYIndices = param->z_stick_xy_indices(r);
+
+      indexedBlocklengths.resize(zStickXYIndices.size(), 1);
+      indexedDispls.resize(zStickXYIndices.size());
+      // displacements of all z stick parts to be sent to current rank
+      for (SizeType idxZStick = 0; idxZStick < zStickXYIndices.size(); ++idxZStick) {
+        const auto& xyIndex = zStickXYIndices(idxZStick);
+        indexedDispls[idxZStick] = 2 * sizeof(T) * xyIndex;
+      }
+
+      spaceDomainCount_.emplace_back(1);
+      spaceDomainTypeHandles_.emplace_back(
+          MPIDatatypeHandle::create_hindexed(zStickXYIndices.size(), indexedBlocklengths.data(),
+                                             indexedDispls.data(), stridedZStickType.get()));
+      spaceDomainTypes_.emplace_back(spaceDomainTypeHandles_.back().get());
+    } else {
+      spaceDomainCount_.emplace_back(0);
+      spaceDomainTypeHandles_.emplace_back(complexType);
+      spaceDomainTypes_.emplace_back(complexType.get());
+    }
+  }
+}
+
+template <typename T>
+auto TransposeMPIUnbufferedHost<T>::exchange_backward_start(const bool nonBlockingExchange)
+    -> void {
+  assert(omp_get_thread_num() == 0);  // only master thread must be allowed to enter
+
+  // zero target data location (not all values are overwritten upon unpacking)
+  std::memset(static_cast<void*>(spaceDomainData_.data()), 0,
+              sizeof(typename decltype(spaceDomainData_)::ValueType) * spaceDomainData_.size());
+
+  if (nonBlockingExchange) {
+    mpi_check_status(MPI_Ialltoallw(freqDomainData_.data(), freqDomainCount_.data(),
+                                    freqDomainDispls_.data(), freqDomainTypes_.data(),
+                                    spaceDomainData_.data(), spaceDomainCount_.data(),
+                                    spaceDomainDispls_.data(), spaceDomainTypes_.data(),
+                                    comm_.get(), mpiRequest_.get_and_activate()));
+  } else {
+    mpi_check_status(
+        MPI_Alltoallw(freqDomainData_.data(), freqDomainCount_.data(), freqDomainDispls_.data(),
+                      freqDomainTypes_.data(), spaceDomainData_.data(), spaceDomainCount_.data(),
+                      spaceDomainDispls_.data(), spaceDomainTypes_.data(), comm_.get()));
+  }
+}
+
+template <typename T>
+auto TransposeMPIUnbufferedHost<T>::exchange_backward_finalize() -> void {
+  mpiRequest_.wait_if_active();
+}
+
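Both directions of the unbuffered exchange reduce to a single (I)Alltoallw call in which all packing is delegated to the derived datatypes built in the constructor above. A condensed sketch of that pattern (a hypothetical helper, not SpFFT code): the per-rank counts are 0 or 1, and the displacements can stay 0 because each datatype already encodes strides and offsets.

```cpp
// Bare MPI_Alltoallw pattern behind the exchange above (illustrative only).
// Each rank sends/receives at most one element of a rank-specific derived
// datatype; the datatypes encode the memory layout, so displacements are zero.
#include <mpi.h>
#include <vector>

void unbuffered_exchange(MPI_Comm comm, void* sendBuf, void* recvBuf,
                         std::vector<MPI_Datatype>& sendTypes,
                         std::vector<MPI_Datatype>& recvTypes,
                         std::vector<int>& sendCounts, std::vector<int>& recvCounts,
                         bool nonBlocking) {
  int size = 0;
  MPI_Comm_size(comm, &size);
  std::vector<int> displs(size, 0);  // layout is fully described by the datatypes
  if (nonBlocking) {
    MPI_Request request;
    MPI_Ialltoallw(sendBuf, sendCounts.data(), displs.data(), sendTypes.data(), recvBuf,
                   recvCounts.data(), displs.data(), recvTypes.data(), comm, &request);
    // ... overlap independent work here ...
    MPI_Wait(&request, MPI_STATUS_IGNORE);
  } else {
    MPI_Alltoallw(sendBuf, sendCounts.data(), displs.data(), sendTypes.data(), recvBuf,
                  recvCounts.data(), displs.data(), recvTypes.data(), comm);
  }
}
```

With `nonBlockingExchange == true`, the request activated by `MPI_Ialltoallw` is what `exchange_backward_finalize()` and `exchange_forward_finalize()` wait on, which is how communication can overlap with independent work.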
+template <typename T>
+auto TransposeMPIUnbufferedHost<T>::exchange_forward_start(const bool nonBlockingExchange)
+    -> void {
+  assert(omp_get_thread_num() == 0);  // only master thread must be allowed to enter
+
+  if (nonBlockingExchange) {
+    mpi_check_status(MPI_Ialltoallw(spaceDomainData_.data(), spaceDomainCount_.data(),
+                                    spaceDomainDispls_.data(), spaceDomainTypes_.data(),
+                                    freqDomainData_.data(), freqDomainCount_.data(),
+                                    freqDomainDispls_.data(), freqDomainTypes_.data(), comm_.get(),
+                                    mpiRequest_.get_and_activate()));
+  } else {
+    mpi_check_status(MPI_Alltoallw(spaceDomainData_.data(), spaceDomainCount_.data(),
+                                   spaceDomainDispls_.data(), spaceDomainTypes_.data(),
+                                   freqDomainData_.data(), freqDomainCount_.data(),
+                                   freqDomainDispls_.data(), freqDomainTypes_.data(), comm_.get()));
+  }
+}
+
+template <typename T>
+auto TransposeMPIUnbufferedHost<T>::exchange_forward_finalize() -> void {
+  mpiRequest_.wait_if_active();
+}
+
+// Instantiate class for float and double
+#ifdef SPFFT_SINGLE_PRECISION
+template class TransposeMPIUnbufferedHost<float>;
+#endif
+template class TransposeMPIUnbufferedHost<double>;
+}  // namespace spfft
+#endif  // SPFFT_MPI
diff --git a/src/transpose/transpose_mpi_unbuffered_host.hpp b/src/transpose/transpose_mpi_unbuffered_host.hpp
new file mode 100644
index 0000000..58dfaa8
--- /dev/null
+++ b/src/transpose/transpose_mpi_unbuffered_host.hpp
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */ +#ifndef SPFFT_TRANSPOSE_MPI_UNBUFFERED_HOST_HPP +#define SPFFT_TRANSPOSE_MPI_UNBUFFERED_HOST_HPP + +#include +#include +#include "memory/host_array_view.hpp" +#include "parameters/parameters.hpp" +#include "spfft/config.h" +#include "transpose.hpp" +#include "util/common_types.hpp" +#include "util/type_check.hpp" + +#ifdef SPFFT_MPI +#include "mpi_util/mpi_communicator_handle.hpp" +#include "mpi_util/mpi_datatype_handle.hpp" +#include "mpi_util/mpi_request_handle.hpp" + +namespace spfft { +template +class TransposeMPIUnbufferedHost : public Transpose { + static_assert(IsFloatOrDouble::value, "Type T must be float or double"); + using ValueType = T; + using ComplexType = std::complex; + +public: + TransposeMPIUnbufferedHost(const std::shared_ptr& param, MPICommunicatorHandle comm, + HostArrayView3D spaceDomainData, + HostArrayView2D freqDomainData); + + auto exchange_backward_start(const bool nonBlockingExchange) -> void override; + auto exchange_backward_finalize() -> void override; + + auto exchange_forward_start(const bool nonBlockingExchange) -> void override; + auto exchange_forward_finalize() -> void override; + +private: + MPICommunicatorHandle comm_; + MPIRequestHandle mpiRequest_; + + HostArrayView3D spaceDomainData_; + HostArrayView2D freqDomainData_; + + SizeType numLocalXYPlanes_; + + std::vector freqDomainTypeHandles_; + std::vector freqDomainTypes_; + std::vector freqDomainDispls_; + std::vector freqDomainCount_; + std::vector spaceDomainTypeHandles_; + std::vector spaceDomainTypes_; + std::vector spaceDomainDispls_; + std::vector spaceDomainCount_; +}; + +} // namespace spfft +#endif // SPFFT_MPI +#endif diff --git a/src/util/common_types.hpp b/src/util/common_types.hpp new file mode 100644 index 0000000..7fc00bc --- /dev/null +++ b/src/util/common_types.hpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2019 ETH Zurich, Simon Frasch + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */
+#ifndef SPFFT_COMMON_TYPES_HPP
+#define SPFFT_COMMON_TYPES_HPP
+
+#include "spfft/config.h"
+namespace spfft {
+
+typedef unsigned long long SizeType;
+typedef long long SignedType;
+
+}  // namespace spfft
+#endif
diff --git a/src/util/omp_definitions.hpp b/src/util/omp_definitions.hpp
new file mode 100644
index 0000000..c80e55b
--- /dev/null
+++ b/src/util/omp_definitions.hpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef SPFFT_OMP_DEFINITIONS_HPP
+#define SPFFT_OMP_DEFINITIONS_HPP
+
+#include "spfft/config.h"
+#ifdef SPFFT_OMP
+#include <omp.h>
+#define SPFFT_OMP_PRAGMA(content) _Pragma(content)
+#else
+#define SPFFT_OMP_PRAGMA(content)
+namespace spfft {
+inline int omp_get_num_threads() { return 1; }
+inline int omp_get_thread_num() { return 0; }
+inline int omp_get_max_threads() { return 1; }
+inline int omp_in_parallel() { return 0; }
+inline int omp_get_nested() { return 0; }
+inline int omp_get_num_procs() { return 1; }
+inline int omp_get_level() { return 0; }
+inline void omp_set_nested(int) {}
+}  // namespace spfft
+#endif
+
+#endif
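A small usage sketch (hypothetical function, not from the source tree) of the wrapper above: with SPFFT_OMP defined, `SPFFT_OMP_PRAGMA` emits the pragma via `_Pragma`; without it, the macro expands to nothing and the `omp_*` stubs keep serial builds compiling unchanged.

```cpp
#include "util/omp_definitions.hpp"

// Hypothetical example: parallel when built with SPFFT_OMP, serial otherwise,
// with no #ifdef in the calling code.
void scale_array(double* data, long long n, double factor) {
  SPFFT_OMP_PRAGMA("omp parallel for")
  for (long long i = 0; i < n; ++i) {
    data[i] *= factor;
  }
}
```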
diff --git a/src/util/type_check.hpp b/src/util/type_check.hpp
new file mode 100644
index 0000000..3bb9de3
--- /dev/null
+++ b/src/util/type_check.hpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef SPFFT_TYPE_CHECK
+#define SPFFT_TYPE_CHECK
+
+#include <type_traits>
+#include "spfft/config.h"
+
+namespace spfft {
+
+template <typename T>
+struct IsFloatOrDouble
+    : std::integral_constant<bool,
+                             std::is_same<float, typename std::remove_cv<T>::type>::value ||
+                                 std::is_same<double, typename std::remove_cv<T>::type>::value> {};
+}  // namespace spfft
+
+#endif
diff --git a/style_guide.md b/style_guide.md
new file mode 100644
index 0000000..fe6fbce
--- /dev/null
+++ b/style_guide.md
@@ -0,0 +1,49 @@
+# Style Guide for SpFFT
+## Formatting
+The formatting style is based on the Google style guide, with the following exceptions:
+- Column size is limited to 100 instead of 80
+- Access modifiers such as public and private are offset by -2
+
+Clang-Format is used to format all files.
+
+## Naming
+The following rules are not strict; consistency when using external types is preferred.
+
+### Files
+Use underscores for separation.
+File suffixes:
+- C++: .cpp and .hpp
+- C: .c and .h
+- CUDA: .cu and .cuh
+
+Example:
+`my_class.cpp`
+
+### Types
+Use camel case and start with a capital letter.
+Example:
+`using MyType = int;`
+
+### Variables
+Use camel case and start with a lower case letter.
+Example:
+`int myValue = 0;`
+
+#### Class / Struct Members
+Use a trailing underscore for non-public member variables. Public members are named like normal variables.
+
+### Functions
+Function names use underscores and are all lower case.
+Example:
+`my_function(int);`
+
+### Namespaces
+Namespaces are all lower case and use underscores.
+Example:
+`namespace my_space {}`
+
+### Macros
+Macros are all capital and use underscores.
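A combined sketch of the naming rules above (illustrative only, not taken from the code base):

```cpp
namespace my_space {                      // namespaces: lower case, underscores

using MyType = int;                       // types: camel case, capitalized

class MyClass {
public:
  int publicValue = 0;                    // public members: named like variables

  auto my_function(int myValue) -> int {  // functions: lower case, underscores
    return myValue + hiddenValue_;
  }

private:
  int hiddenValue_ = 0;                   // non-public members: trailing underscore
};

}  // namespace my_space
```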
+Example: +`#define MY_MACRO_VALUE 1` + diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt new file mode 100644 index 0000000..af4d889 --- /dev/null +++ b/tests/CMakeLists.txt @@ -0,0 +1,114 @@ + +if(SPFFT_BUILD_TESTS) + cmake_minimum_required(VERSION 3.11 FATAL_ERROR) # git fetch module requires at least 3.11 + set(BUILD_GMOCK OFF CACHE BOOL "") + set(INSTALL_GTEST OFF CACHE BOOL "") + mark_as_advanced(BUILD_GMOCK INSTALL_GTEST) + include(FetchContent) + + # add googletest + FetchContent_Declare( + googletest + GIT_REPOSITORY https://github.com/google/googletest.git + GIT_TAG release-1.8.1 + ) + FetchContent_GetProperties(googletest) + if(NOT googletest_POPULATED) + message(STATUS "Downloading Google Test repository...") + FetchContent_Populate(googletest) + endif() + add_subdirectory(${googletest_SOURCE_DIR} ${googletest_BINARY_DIR}) + + # add gtest_mpi + FetchContent_Declare( + gtest_mpi + GIT_REPOSITORY https://github.com/AdhocMan/gtest_mpi.git + GIT_TAG v1.0.0 + ) + FetchContent_GetProperties(gtest_mpi) + if(NOT gtest_mpi_POPULATED) + message(STATUS "Downloading Google Test MPI extension repository...") + FetchContent_Populate(gtest_mpi) + endif() + add_subdirectory(${gtest_mpi_SOURCE_DIR} ${gtest_mpi_BINARY_DIR}) + + # add command line parser + FetchContent_Declare( + cli11 + GIT_REPOSITORY https://github.com/CLIUtils/CLI11.git + GIT_TAG v1.7.1 + ) + FetchContent_GetProperties(cli11) + if(NOT cli11_POPULATED) + message(STATUS "Downloading CLI11 command line parser repository...") + FetchContent_Populate(cli11) + endif() + list(APPEND SPFFT_EXTERNAL_INCLUDE_DIRS ${cli11_SOURCE_DIR}/include) + + # add json parser + set(JSON_Install OFF CACHE BOOL "") + FetchContent_Declare( + json + GIT_REPOSITORY https://github.com/nlohmann/json.git + GIT_TAG v3.6.1 + ) + FetchContent_GetProperties(json) + if(NOT json_POPULATED) + message(STATUS "Downloading json repository...") + FetchContent_Populate(json) + endif() + set(JSON_BuildTests OFF CACHE INTERNAL "") + add_subdirectory(${json_SOURCE_DIR} ${json_BINARY_DIR}) + list(APPEND SPFFT_EXTERNAL_LIBS nlohmann_json::nlohmann_json) + + + list(APPEND SPFFT_INCLUDE_DIRS ${PROJECT_SOURCE_DIR}/tests) + +# testing executables + add_executable(main programs/main.cpp) + target_link_libraries(main PRIVATE ${SPFFT_LIBS} ${SPFFT_EXTERNAL_LIBS}) + target_compile_options(main PRIVATE ${SPFFT_DEFINITIONS} ${SPFFT_EXTERNAL_COMPILE_OPTIONS}) + target_include_directories(main PRIVATE ${SPFFT_INCLUDE_DIRS} ${SPFFT_EXTERNAL_INCLUDE_DIRS}) + + if(SPFFT_MPI) + add_executable(main_mpi programs/main_mpi.cpp) + target_link_libraries(main_mpi PRIVATE ${SPFFT_LIBS} ${SPFFT_EXTERNAL_LIBS}) + target_compile_options(main_mpi PRIVATE ${SPFFT_DEFINITIONS} ${SPFFT_EXTERNAL_COMPILE_OPTIONS}) + target_include_directories(main_mpi PRIVATE ${SPFFT_INCLUDE_DIRS} ${SPFFT_EXTERNAL_INCLUDE_DIRS}) + + add_executable(benchmark programs/benchmark.cpp) + target_link_libraries(benchmark PRIVATE ${SPFFT_LIBS} ${SPFFT_EXTERNAL_LIBS}) + target_compile_options(benchmark PRIVATE ${SPFFT_DEFINITIONS} ${SPFFT_EXTERNAL_COMPILE_OPTIONS}) + target_include_directories(benchmark PRIVATE ${SPFFT_INCLUDE_DIRS} ${SPFFT_EXTERNAL_INCLUDE_DIRS}) + endif() + + # if(SPFFT_CUDA) + # set_property(TARGET main PROPERTY CUDA_SEPARABLE_COMPILATION ON) + # endif() + + add_executable(run_local_tests + run_local_tests.cpp + local_tests/test_host_array.cpp + local_tests/test_disjoint.cpp + local_tests/test_local_transform.cpp + ) + target_link_libraries(run_local_tests PRIVATE gtest_main gtest_mpi) + 
target_link_libraries(run_local_tests PRIVATE ${SPFFT_LIBS} ${SPFFT_EXTERNAL_LIBS}) + target_compile_options(run_local_tests PRIVATE ${SPFFT_DEFINITIONS} ${SPFFT_EXTERNAL_COMPILE_OPTIONS}) + target_include_directories(run_local_tests PRIVATE ${SPFFT_INCLUDE_DIRS} ${SPFFT_EXTERNAL_INCLUDE_DIRS}) + + if(SPFFT_MPI) + add_executable(run_mpi_tests + run_mpi_tests.cpp + mpi_tests/test_transform.cpp + mpi_tests/test_multi_transform.cpp + mpi_tests/test_transpose.cpp + mpi_tests/test_transpose_gpu.cpp + ) + target_link_libraries(run_mpi_tests PRIVATE gtest_main gtest_mpi) + target_link_libraries(run_mpi_tests PRIVATE ${SPFFT_LIBS} ${SPFFT_EXTERNAL_LIBS}) + target_compile_options(run_mpi_tests PRIVATE ${SPFFT_DEFINITIONS} ${SPFFT_EXTERNAL_COMPILE_OPTIONS}) + target_include_directories(run_mpi_tests PRIVATE ${SPFFT_INCLUDE_DIRS} ${SPFFT_EXTERNAL_INCLUDE_DIRS}) + endif() + +endif() diff --git a/tests/local_tests/test_disjoint.cpp b/tests/local_tests/test_disjoint.cpp new file mode 100644 index 0000000..33cc956 --- /dev/null +++ b/tests/local_tests/test_disjoint.cpp @@ -0,0 +1,111 @@ +#include +#include "gtest/gtest.h" +#include "memory/array_view_utility.hpp" +#include "memory/host_array.hpp" +#include "memory/host_array_view.hpp" + +using namespace spfft; + +class DisjointTest : public ::testing::Test { +protected: + void SetUp() override { array_ = HostArray(100); } + + HostArray array_; +}; + +TEST_F(DisjointTest, dim1AndDim1) { + { + auto view1 = create_1d_view(array_, 0, 10); + auto view2 = create_1d_view(array_, 0, 10); + EXPECT_FALSE(disjoint(view1, view2)); + } + { + auto view1 = create_1d_view(array_, 0, 10); + auto view2 = create_1d_view(array_, 5, 10); + EXPECT_FALSE(disjoint(view1, view2)); + } + { + auto view1 = create_1d_view(array_, 0, 10); + auto view2 = create_1d_view(array_, 10, 10); + EXPECT_TRUE(disjoint(view1, view2)); + } +} + +TEST_F(DisjointTest, dim1AndDim2) { + { + auto view1 = create_1d_view(array_, 0, 10); + auto view2 = create_2d_view(array_, 0, 2, 5); + EXPECT_FALSE(disjoint(view1, view2)); + } + { + auto view1 = create_1d_view(array_, 0, 10); + auto view2 = create_2d_view(array_, 5, 2, 5); + EXPECT_FALSE(disjoint(view1, view2)); + } + { + auto view1 = create_1d_view(array_, 0, 10); + auto view2 = create_2d_view(array_, 10, 5, 2); + EXPECT_TRUE(disjoint(view1, view2)); + } +} + +TEST_F(DisjointTest, dim1AndDim3) { + { + auto view1 = create_1d_view(array_, 0, 10); + auto view2 = create_3d_view(array_, 0, 2, 2, 2); + EXPECT_FALSE(disjoint(view1, view2)); + } + { + auto view1 = create_1d_view(array_, 0, 10); + auto view2 = create_3d_view(array_, 5, 2, 2, 2); + EXPECT_FALSE(disjoint(view1, view2)); + } + { + auto view1 = create_1d_view(array_, 0, 10); + auto view2 = create_3d_view(array_, 10, 5, 2, 2); + EXPECT_TRUE(disjoint(view1, view2)); + } +} + +TEST_F(DisjointTest, dim2AndDim3) { + { + auto view1 = create_2d_view(array_, 0, 2, 3); + auto view2 = create_3d_view(array_, 0, 2, 3, 2); + EXPECT_FALSE(disjoint(view1, view2)); + } + { + auto view1 = create_2d_view(array_, 0, 2, 3); + auto view2 = create_3d_view(array_, 5, 2, 2, 2); + EXPECT_FALSE(disjoint(view1, view2)); + } + { + auto view1 = create_2d_view(array_, 0, 2, 3); + auto view2 = create_3d_view(array_, 6, 5, 2, 2); + EXPECT_TRUE(disjoint(view1, view2)); + } +} + +TEST_F(DisjointTest, dim3AndDim3) { + { + auto view1 = create_3d_view(array_, 0, 2, 3, 4); + auto view2 = create_3d_view(array_, 0, 2, 3, 2); + EXPECT_FALSE(disjoint(view1, view2)); + } + { + auto view1 = create_3d_view(array_, 0, 2, 3, 4); + auto view2 = 
create_3d_view(array_, 5, 2, 2, 2); + EXPECT_FALSE(disjoint(view1, view2)); + } + { + auto view1 = create_3d_view(array_, 0, 2, 3, 2); + auto view2 = create_3d_view(array_, 12, 5, 2, 2); + EXPECT_TRUE(disjoint(view1, view2)); + } +} + +TEST_F(DisjointTest, DifferentValueTypes) { + auto view1 = create_3d_view(array_, 0, 2, 3, 4); + auto view2 = + HostArrayView3D(reinterpret_cast(array_.data()), 2, 3, 4, false); + EXPECT_FALSE(disjoint(view1, view2)); +} diff --git a/tests/local_tests/test_host_array.cpp b/tests/local_tests/test_host_array.cpp new file mode 100644 index 0000000..2650ac7 --- /dev/null +++ b/tests/local_tests/test_host_array.cpp @@ -0,0 +1,48 @@ +#include +#include "gtest/gtest.h" +#include "memory/host_array.hpp" + +using namespace spfft; + +class HostArrayTest : public ::testing::Test { +protected: + void SetUp() override { + array_ = HostArray(5); + + int count = 0; + auto data_ptr = array_.data(); + for (SizeType i = 0; i < 5; ++i) { + data_ptr[i] = ++count; + } + } + + HostArray array_; +}; + +TEST_F(HostArrayTest, Iterators) { + ASSERT_EQ(*array_.begin(), 1); + ASSERT_EQ(*(array_.end() - 1), 5); + int count = 0; + for (auto& val : array_) { + EXPECT_EQ(val, ++count); + } +} + + +TEST_F(HostArrayTest, OperatorAccess) { + int count = 0; + ASSERT_EQ(array_.size(), 5); + for (SizeType i = 0; i < array_.size(); ++i) { + ASSERT_EQ(array_[i], ++count); + } + count = 0; + for (SizeType i = 0; i < array_.size(); ++i) { + ASSERT_EQ(array_(i), ++count); + } + +} + +TEST_F(HostArrayTest, Accessors) { + ASSERT_EQ(array_.front(), 1); + ASSERT_EQ(array_.back(), 5); +} diff --git a/tests/local_tests/test_local_transform.cpp b/tests/local_tests/test_local_transform.cpp new file mode 100644 index 0000000..95609f3 --- /dev/null +++ b/tests/local_tests/test_local_transform.cpp @@ -0,0 +1,113 @@ +#include "test_util/test_transform.hpp" +#include +#include +#include +#include +#include +#include +#include "gtest/gtest.h" +#include "memory/array_view_utility.hpp" +#include "memory/host_array.hpp" +#include "memory/host_array_view.hpp" +#include "parameters/parameters.hpp" +#include "spfft/grid.hpp" +#include "spfft/transform.hpp" +#include "test_util/test_check_values.hpp" +#include "test_util/generate_indices.hpp" +#include "util/common_types.hpp" + +class TestLocalTransform : public TransformTest { +protected: + TestLocalTransform() + : TransformTest(), + grid_(dimX_, dimY_, dimZ_, dimX_ * dimY_, std::get<1>(GetParam()), -1) {} + + auto grid() -> Grid& override { return grid_; } + + Grid grid_; +}; +TEST_P(TestLocalTransform, ForwardC2C) { + try { + std::vector zStickDistribution(comm_size(), 1.0); + std::vector xyPlaneDistribution(comm_size(), 1.0); + test_forward_c2c(zStickDistribution, xyPlaneDistribution); + } catch (const std::exception& e) { + std::cout << "ERROR: Rank " << comm_rank() << ", " << e.what() << std::endl; + ASSERT_TRUE(false); + } +} + +TEST_P(TestLocalTransform, BackwardC2C) { + try { + std::vector zStickDistribution(comm_size(), 1.0); + std::vector xyPlaneDistribution(comm_size(), 1.0); + + test_backward_c2c(zStickDistribution, xyPlaneDistribution); + } catch (const std::exception& e) { + std::cout << "ERROR: Rank " << comm_rank() << ", " << e.what() << std::endl; + ASSERT_TRUE(false); + } +} + +TEST_P(TestLocalTransform, R2C) { + try { + std::vector xyPlaneDistribution(comm_size(), 1.0); + test_r2c(xyPlaneDistribution); + } catch (const std::exception& e) { + std::cout << "ERROR: Rank " << comm_rank() << ", " << e.what() << std::endl; + ASSERT_TRUE(false); + } +} + +// 
Show exchange name instead of enum value for test output +static auto param_type_names( + const ::testing::TestParamInfo< + std::tuple>& info) + -> std::string { + const auto exchType = std::get<0>(info.param); + const auto procType = std::get<1>(info.param); + std::string name; + switch (procType) { + case SpfftProcessingUnitType::SPFFT_PU_HOST: { + name += "Host"; + } break; + case SpfftProcessingUnitType::SPFFT_PU_GPU: { + name += "GPU"; + } break; + default: { name += "Host+GPU"; } + } + name += "Size"; + name += std::to_string(std::get<2>(info.param)); + name += "x"; + name += std::to_string(std::get<3>(info.param)); + name += "x"; + name += std::to_string(std::get<4>(info.param)); + return name; +} + +// instantiate tests with parameters +#if defined(SPFFT_CUDA) || defined(SPFFT_ROCM) +#define TEST_PROCESSING_UNITS \ + SpfftProcessingUnitType::SPFFT_PU_HOST, SpfftProcessingUnitType::SPFFT_PU_GPU +#else +#define TEST_PROCESSING_UNITS SpfftProcessingUnitType::SPFFT_PU_HOST +#endif + +INSTANTIATE_TEST_CASE_P(FullTest, TestLocalTransform, + ::testing::Combine(::testing::Values(SpfftExchangeType::SPFFT_EXCH_DEFAULT), + ::testing::Values(TEST_PROCESSING_UNITS), + ::testing::Values(1, 2, 11, 12, 13, 100), + ::testing::Values(1, 2, 11, 12, 13, 100), + ::testing::Values(1, 2, 11, 12, 13, 100), + ::testing::Values(false)), + param_type_names); + +INSTANTIATE_TEST_CASE_P(CenteredIndicesTest, TestLocalTransform, + ::testing::Combine(::testing::Values(SpfftExchangeType::SPFFT_EXCH_DEFAULT), + ::testing::Values(TEST_PROCESSING_UNITS), + ::testing::Values(1, 2, 11, 100), + ::testing::Values(1, 2, 11, 100), + ::testing::Values(1, 2, 11, 100), + ::testing::Values(true)), + param_type_names); + diff --git a/tests/mpi_tests/test_multi_transform.cpp b/tests/mpi_tests/test_multi_transform.cpp new file mode 100644 index 0000000..803f069 --- /dev/null +++ b/tests/mpi_tests/test_multi_transform.cpp @@ -0,0 +1,92 @@ +#include "test_util/test_transform.hpp" +#include +#include +#include +#include +#include +#include +#include +#include "gtest/gtest.h" +#include "memory/array_view_utility.hpp" +#include "memory/host_array.hpp" +#include "memory/host_array_view.hpp" +#include "mpi_util/mpi_communicator_handle.hpp" +#include "parameters/parameters.hpp" +#include "spfft/spfft.hpp" +#include "test_util/generate_indices.hpp" +#include "util/common_types.hpp" + +TEST(MPIMultiTransformTest, BackwardsForwards) { + try { + MPICommunicatorHandle comm(MPI_COMM_WORLD); + const std::vector zStickDistribution(comm.size(), 1.0); + const std::vector xyPlaneDistribution(comm.size(), 1.0); + + const int dimX = comm.size() * 10; + const int dimY = comm.size() * 11; + const int dimZ = comm.size() * 12; + + const int numTransforms = 3; + + std::mt19937 randGen(42); + const auto valueIndicesPerRank = + create_value_indices(randGen, zStickDistribution, 0.7, 0.7, dimX, dimY, dimZ, false); + const int numLocalXYPlanes = + calculate_num_local_xy_planes(comm.rank(), dimZ, xyPlaneDistribution); + + const auto& localIndices = valueIndicesPerRank[comm.rank()]; + const int numValues = localIndices.size() / 3; + std::vector>> freqValuesPerTrans( + numTransforms, std::vector>(numValues)); + + std::vector freqValuePtr; + for(auto& values: freqValuesPerTrans) { + freqValuePtr.push_back(reinterpret_cast(values.data())); + } + + // set frequency values to constant for each transform + for(std::size_t i = 0; i < freqValuesPerTrans.size(); ++i) { + for(auto& val : freqValuesPerTrans[i]) { + val = std::complex(i, i); + } + } + + std::vector 
transforms; + + // create first transforms + transforms.push_back(Grid(dimX, dimY, dimZ, dimX * dimY, numLocalXYPlanes, SPFFT_PU_HOST, -1, + comm.get(), SPFFT_EXCH_DEFAULT) + .create_transform(SPFFT_PU_HOST, SPFFT_TRANS_C2C, dimX, dimY, dimZ, + numLocalXYPlanes, numValues, SPFFT_INDEX_TRIPLETS, + localIndices.data())); + // clone first transform + for(int i = 1; i < numTransforms; ++i) { + transforms.push_back(transforms.front().clone()); + } + + std::vector processingUnits(numTransforms, SPFFT_PU_HOST); + std::vector scalingTypes(numTransforms, SPFFT_NO_SCALING); + + // backward + multi_transform_backward(numTransforms, transforms.data(), freqValuePtr.data(), processingUnits.data()); + + // forward + multi_transform_forward(numTransforms, transforms.data(), processingUnits.data(), + freqValuePtr.data(), scalingTypes.data()); + + + // check all values + for(std::size_t i = 0; i < freqValuesPerTrans.size(); ++i) { + const auto targetValue = std::complex(i * dimX * dimY * dimZ, i * dimX * dimY * dimZ); + for(auto& val : freqValuesPerTrans[i]) { + ASSERT_NEAR(targetValue.real(), val.real(), 1e-8); + ASSERT_NEAR(targetValue.imag(), val.imag(), 1e-8); + } + } + + } catch (const std::exception& e) { + std::cout << "ERROR: " << e.what() << std::endl; + ASSERT_TRUE(false); + } +} + diff --git a/tests/mpi_tests/test_transform.cpp b/tests/mpi_tests/test_transform.cpp new file mode 100644 index 0000000..2560a8e --- /dev/null +++ b/tests/mpi_tests/test_transform.cpp @@ -0,0 +1,193 @@ +#include "test_util/test_transform.hpp" +#include +#include +#include +#include +#include +#include +#include +#include "gtest/gtest.h" +#include "memory/array_view_utility.hpp" +#include "memory/host_array.hpp" +#include "memory/host_array_view.hpp" +#include "mpi_util/mpi_communicator_handle.hpp" +#include "parameters/parameters.hpp" +#include "spfft/grid.hpp" +#include "spfft/transform.hpp" +#include "test_util/test_check_values.hpp" +#include "test_util/generate_indices.hpp" +#include "util/common_types.hpp" + +class MPITransformTest : public TransformTest { +protected: + MPITransformTest() + : TransformTest(), + comm_(MPI_COMM_WORLD), + grid_(dimX_, dimY_, dimZ_, dimX_ * dimY_, dimZ_, std::get<1>(GetParam()), -1, comm_.get(), + std::get<0>(GetParam())) {} + + auto comm_rank() -> SizeType override { return comm_.rank(); } + + auto comm_size() -> SizeType override { return comm_.size(); } + + auto grid() -> Grid& override { return grid_; } + + MPICommunicatorHandle comm_; + Grid grid_; +}; +TEST_P(MPITransformTest, ForwardUniformDistribution) { + try { + std::vector zStickDistribution(comm_size(), 1.0); + std::vector xyPlaneDistribution(comm_size(), 1.0); + test_forward_c2c(zStickDistribution, xyPlaneDistribution); + } catch (const std::exception& e) { + std::cout << "ERROR: Rank " << comm_rank() << ", " << e.what() << std::endl; + ASSERT_TRUE(false); + } +} + +TEST_P(MPITransformTest, BackwardAllOneRank) { + try { + std::vector zStickDistribution(comm_size(), 0.0); + zStickDistribution[0] = 1.0; + std::vector xyPlaneDistribution(comm_size(), 0.0); + xyPlaneDistribution[0] = 1.0; + + test_backward_c2c(zStickDistribution, xyPlaneDistribution); + } catch (const std::exception& e) { + std::cout << "ERROR: Rank " << comm_rank() << ", " << e.what() << std::endl; + ASSERT_TRUE(false); + } +} + +TEST_P(MPITransformTest, ForwardAllOneRank) { + try { + std::vector zStickDistribution(comm_size(), 0.0); + zStickDistribution[0] = 1.0; + std::vector xyPlaneDistribution(comm_size(), 0.0); + xyPlaneDistribution[0] = 1.0; + + 
test_forward_c2c(zStickDistribution, xyPlaneDistribution); + } catch (const std::exception& e) { + std::cout << "ERROR: Rank " << comm_rank() << ", " << e.what() << std::endl; + ASSERT_TRUE(false); + } +} + +TEST_P(MPITransformTest, BackwardAllOneRankPerSide) { + try { + std::vector zStickDistribution(comm_size(), 0.0); + zStickDistribution[0] = 1.0; + std::vector xyPlaneDistribution(comm_size(), 0.0); + xyPlaneDistribution[comm_size() - 1] = 1.0; + + test_backward_c2c(zStickDistribution, xyPlaneDistribution); + } catch (const std::exception& e) { + std::cout << "ERROR: Rank " << comm_rank() << ", " << e.what() << std::endl; + ASSERT_TRUE(false); + } +} + +TEST_P(MPITransformTest, ForwardAllOneRankPerSide) { + try { + std::vector zStickDistribution(comm_size(), 0.0); + zStickDistribution[0] = 1.0; + std::vector xyPlaneDistribution(comm_size(), 0.0); + xyPlaneDistribution[comm_size() - 1] = 1.0; + + test_forward_c2c(zStickDistribution, xyPlaneDistribution); + } catch (const std::exception& e) { + std::cout << "ERROR: Rank " << comm_rank() << ", " << e.what() << std::endl; + ASSERT_TRUE(false); + } +} + +TEST_P(MPITransformTest, R2CUniformDistribution) { + try { + std::vector xyPlaneDistribution(comm_size(), 1.0); + test_r2c(xyPlaneDistribution); + } catch (const std::exception& e) { + std::cout << "ERROR: Rank " << comm_rank() << ", " << e.what() << std::endl; + ASSERT_TRUE(false); + } +} + +TEST_P(MPITransformTest, R2COneRankAllPlanes) { + try { + std::vector xyPlaneDistribution(comm_size(), 0.0); + xyPlaneDistribution[0] = 1.0; + test_r2c(xyPlaneDistribution); + } catch (const std::exception& e) { + std::cout << "ERROR: Rank " << comm_rank() << ", " << e.what() << std::endl; + ASSERT_TRUE(false); + } +} + +// Show exchange name instead of enum value for test output +static auto param_type_names( + const ::testing::TestParamInfo< + std::tuple>& info) + -> std::string { + const auto exchType = std::get<0>(info.param); + const auto procType = std::get<1>(info.param); + std::string name; + switch (exchType) { + case SpfftExchangeType::SPFFT_EXCH_BUFFERED: { + name += "Buffered"; + } break; + case SpfftExchangeType::SPFFT_EXCH_COMPACT_BUFFERED: { + name += "CompactBuffered"; + } break; + case SpfftExchangeType::SPFFT_EXCH_UNBUFFERED: { + name += "Unbuffered"; + } break; + default: + name += "Default"; + } + switch (procType) { + case SpfftProcessingUnitType::SPFFT_PU_HOST: { + name += "Host"; + } break; + case SpfftProcessingUnitType::SPFFT_PU_GPU: { + name += "GPU"; + } break; + default: { name += "Host+GPU"; } + } + name += "Size"; + name += std::to_string(std::get<2>(info.param)); + name += "x"; + name += std::to_string(std::get<3>(info.param)); + name += "x"; + name += std::to_string(std::get<4>(info.param)); + return name; +} + +// instantiate tests with parameters +#if defined(SPFFT_CUDA) || defined(SPFFT_ROCM) +#define TEST_PROCESSING_UNITS \ + SpfftProcessingUnitType::SPFFT_PU_HOST, SpfftProcessingUnitType::SPFFT_PU_GPU +#else +#define TEST_PROCESSING_UNITS SpfftProcessingUnitType::SPFFT_PU_HOST +#endif + +INSTANTIATE_TEST_CASE_P( + FullTest, MPITransformTest, + ::testing::Combine(::testing::Values(SpfftExchangeType::SPFFT_EXCH_BUFFERED, + SpfftExchangeType::SPFFT_EXCH_COMPACT_BUFFERED, + SpfftExchangeType::SPFFT_EXCH_UNBUFFERED, + SpfftExchangeType::SPFFT_EXCH_DEFAULT), + ::testing::Values(TEST_PROCESSING_UNITS), + ::testing::Values(1, 2, 11, 12, 13, 100), + ::testing::Values(1, 2, 11, 12, 13, 100), + ::testing::Values(1, 2, 11, 12, 13, 100), ::testing::Values(false)), + 
    param_type_names);
+
+INSTANTIATE_TEST_CASE_P(CenteredIndicesTest, MPITransformTest,
+                        ::testing::Combine(::testing::Values(SpfftExchangeType::SPFFT_EXCH_DEFAULT),
+                                           ::testing::Values(TEST_PROCESSING_UNITS),
+                                           ::testing::Values(1, 2, 11, 100),
+                                           ::testing::Values(1, 2, 11, 100),
+                                           ::testing::Values(1, 2, 11, 100),
+                                           ::testing::Values(true)),
+                        param_type_names);
+
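The transpose test fixture below distributes z sticks across ranks without any communication: every rank seeds the same generator, draws the identical random sequence, and keeps only the sticks whose draw matches its own rank. A standalone sketch of that idea (hypothetical helper, not the fixture itself):

```cpp
#include <random>
#include <utility>
#include <vector>

// Deterministic stick ownership: with the same seed on every rank, all ranks
// agree on which rank owns which (x, y) z stick without exchanging messages.
std::vector<std::pair<int, int>> my_sticks(int myRank, int commSize, int dimX, int dimY) {
  std::mt19937 gen(42);  // identical seed on every rank
  std::uniform_real_distribution<double> dis(0.0, 1.0);
  std::uniform_int_distribution<int> rankSelector(0, commSize - 1);
  std::vector<std::pair<int, int>> sticks;
  for (int x = 0; x < dimX; ++x) {
    for (int y = 0; y < dimY; ++y) {
      // roughly half the sticks exist; each existing stick belongs to one rank
      if (dis(gen) < 0.5 && rankSelector(gen) == myRank) sticks.emplace_back(x, y);
    }
  }
  return sticks;
}
```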
diff --git a/tests/mpi_tests/test_transpose.cpp b/tests/mpi_tests/test_transpose.cpp
new file mode 100644
index 0000000..f6598a1
--- /dev/null
+++ b/tests/mpi_tests/test_transpose.cpp
@@ -0,0 +1,186 @@
+#include <algorithm>
+#include <complex>
+#include <memory>
+#include <random>
+#include <vector>
+#include "gtest/gtest.h"
+#include "memory/array_view_utility.hpp"
+#include "memory/host_array.hpp"
+#include "memory/host_array_view.hpp"
+#include "mpi_util/mpi_communicator_handle.hpp"
+#include "parameters/parameters.hpp"
+#include "transpose/transpose_mpi_buffered_host.hpp"
+#include "transpose/transpose_mpi_compact_buffered_host.hpp"
+#include "transpose/transpose_mpi_unbuffered_host.hpp"
+#include "util/common_types.hpp"
+
+using namespace spfft;
+
+class TransposeTest : public ::testing::Test {
+protected:
+  void SetUp() override {
+    comm_ = MPICommunicatorHandle(MPI_COMM_WORLD);
+
+    SizeType dimX = 2 * comm_.size();
+    SizeType dimY = 3 * comm_.size();
+    SizeType dimZ = 4 * comm_.size();
+
+    // create memory space
+    array1_ = HostArray<std::complex<double>>(dimX * dimY * dimZ, std::complex<double>(1.0, 1.0));
+    array2_ = HostArray<std::complex<double>>(dimX * dimY * dimZ, std::complex<double>(1.0, 1.0));
+    fullArray_ = HostArray<std::complex<double>>(dimX * dimY * dimZ);
+
+    // plane split between ranks
+    const SizeType numLocalXYPlanes =
+        (dimZ / comm_.size()) + (comm_.rank() == comm_.size() - 1 ? dimZ % comm_.size() : 0);
+    const SizeType localXYPlaneOffset = (dimZ / comm_.size()) * comm_.rank();
+
+    // create all indices the same way (random generator must be equally initialized)
+    std::mt19937 sharedRandGen(42);
+    std::uniform_real_distribution<double> dis(0.0, 1.0);
+    std::uniform_int_distribution<int> rankSelector(0, comm_.size() - 1);
+
+    std::vector<int> indexTriplets;
+    indexTriplets.reserve(dimX * dimY * dimZ);
+    for (int x = 0; x < static_cast<int>(dimX); ++x) {
+      for (int y = 0; y < static_cast<int>(dimY); ++y) {
+        // create sparse z stick distribution
+        if (dis(sharedRandGen) < 0.5 &&
+            rankSelector(sharedRandGen) == static_cast<int>(comm_.rank())) {
+          for (int z = 0; z < static_cast<int>(dimZ); ++z) {
+            indexTriplets.push_back(x);
+            indexTriplets.push_back(y);
+            indexTriplets.push_back(z);
+          }
+        }
+      }
+    }
+
+    paramPtr_.reset(new Parameters(comm_, SPFFT_TRANS_C2C, dimX, dimY, dimZ, numLocalXYPlanes,
+                                   indexTriplets.size() / 3, SPFFT_INDEX_TRIPLETS,
+                                   indexTriplets.data()));
+
+    // initialize random z-stick data
+    auto fullView = create_3d_view(fullArray_, 0, dimX, dimY, dimZ);
+    auto freqView = create_2d_view(array1_, 0, paramPtr_->num_z_sticks(comm_.rank()), dimZ);
+
+    for (SizeType r = 0; r < comm_.size(); ++r) {
+      for (const auto& stickIdx : paramPtr_->z_stick_xy_indices(r)) {
+        const auto x = stickIdx / dimY;
+        const auto y = stickIdx - x * dimY;
+        for (SizeType z = 0; z < freqView.dim_inner(); ++z) {
+          fullView(x, y, z) = std::complex<double>(dis(sharedRandGen), dis(sharedRandGen));
+        }
+      }
+    }
+
+    // copy data into sticks
+    SizeType count = 0;
+    for (const auto& stickIdx : paramPtr_->z_stick_xy_indices(comm_.rank())) {
+      const auto x = stickIdx / dimY;
+      const auto y = stickIdx - x * dimY;
+      for (SizeType z = 0; z < freqView.dim_inner(); ++z) {
+        freqView(count, z) = fullView(x, y, z);
+      }
+      ++count;
+    }
+  }
+
+  MPICommunicatorHandle comm_;
+  std::shared_ptr<Parameters> paramPtr_;
+  HostArray<std::complex<double>> array1_;
+  HostArray<std::complex<double>> array2_;
+  HostArray<std::complex<double>> fullArray_;
+};
+
+static void check_space_domain(const HostArrayView3D<std::complex<double>>& realView,
+                               const HostArrayView3D<std::complex<double>>& fullView,
+                               const SizeType planeOffset, const SizeType numLocalXYPlanes) {
+  for (SizeType z = 0; z < numLocalXYPlanes; ++z) {
+    for (SizeType x = 0; x < fullView.dim_outer(); ++x) {
+      for (SizeType y = 0; y < fullView.dim_mid(); ++y) {
+        EXPECT_EQ(realView(z, x, y).real(), fullView(x, y, z + planeOffset).real());
+        EXPECT_EQ(realView(z, x, y).imag(), fullView(x, y, z + planeOffset).imag());
+      }
+    }
+  }
+}
+
+static void check_freq_domain(const HostArrayView2D<std::complex<double>>& freqView,
+                              const HostArrayView3D<std::complex<double>>& fullView,
+                              HostArrayConstView1D<int> xyIndices) {
+  for (SizeType stickIdx = 0; stickIdx < freqView.dim_outer(); ++stickIdx) {
+    const auto x = xyIndices(stickIdx) / fullView.dim_mid();
+    const auto y = xyIndices(stickIdx) - x * fullView.dim_mid();
+    for (SizeType z = 0; z < freqView.dim_inner(); ++z) {
+      EXPECT_EQ(freqView(stickIdx, z).real(), fullView(x, y, z).real());
+      EXPECT_EQ(freqView(stickIdx, z).imag(), fullView(x, y, z).imag());
+    }
+  }
+}
+
+TEST_F(TransposeTest, Unbuffered) {
+  auto freqXYView = create_3d_view(array2_, 0, paramPtr_->num_xy_planes(comm_.rank()),
+                                   paramPtr_->dim_x(), paramPtr_->dim_y());
+  auto freqView =
+      create_2d_view(array1_, 0, paramPtr_->num_z_sticks(comm_.rank()), paramPtr_->dim_z());
+  auto fullView =
+      create_3d_view(fullArray_, 0, paramPtr_->dim_x(), paramPtr_->dim_y(), paramPtr_->dim_z());
+
+  TransposeMPIUnbufferedHost<double> transpose(paramPtr_, comm_, freqXYView, freqView);
+
+  transpose.backward();
+  check_space_domain(freqXYView, fullView, paramPtr_->xy_plane_offset(comm_.rank()),
+                     paramPtr_->num_xy_planes(comm_.rank()));
+
+  transpose.forward();
+  check_freq_domain(freqView, fullView, paramPtr_->z_stick_xy_indices(comm_.rank()));
+}
+
+TEST_F(TransposeTest, CompactBuffered) {
+  auto freqXYView = create_3d_view(array2_, 0, paramPtr_->num_xy_planes(comm_.rank()),
+                                   paramPtr_->dim_x(), paramPtr_->dim_y());
+  auto freqView =
+      create_2d_view(array1_, 0, paramPtr_->num_z_sticks(comm_.rank()), paramPtr_->dim_z());
+  auto fullView =
+      create_3d_view(fullArray_, 0, paramPtr_->dim_x(), paramPtr_->dim_y(), paramPtr_->dim_z());
+
+  auto transposeBufferZ = create_1d_view(
+      array2_, 0, paramPtr_->total_num_xy_planes() * paramPtr_->num_z_sticks(comm_.rank()));
+  auto transposeBufferXY = create_1d_view(
+      array1_, 0, paramPtr_->total_num_z_sticks() * paramPtr_->num_xy_planes(comm_.rank()));
+
+  TransposeMPICompactBufferedHost<double, double> transpose(paramPtr_, comm_, freqXYView, freqView,
+                                                            transposeBufferXY, transposeBufferZ);
+
+  transpose.backward();
+  check_space_domain(freqXYView, fullView, paramPtr_->xy_plane_offset(comm_.rank()),
+                     paramPtr_->num_xy_planes(comm_.rank()));
+  transpose.forward();
+  check_freq_domain(freqView, fullView, paramPtr_->z_stick_xy_indices(comm_.rank()));
+}
+
+TEST_F(TransposeTest, Buffered) {
+  auto freqXYView = create_3d_view(array2_, 0, paramPtr_->num_xy_planes(comm_.rank()),
+                                   paramPtr_->dim_x(), paramPtr_->dim_y());
+  auto freqView =
+      create_2d_view(array1_, 0, paramPtr_->num_z_sticks(comm_.rank()), paramPtr_->dim_z());
+  auto fullView =
+      create_3d_view(fullArray_, 0, paramPtr_->dim_x(), paramPtr_->dim_y(), paramPtr_->dim_z());
+
+  auto transposeBufferZ = create_1d_view(
+      array2_, 0, paramPtr_->max_num_z_sticks() * paramPtr_->max_num_xy_planes() * comm_.size());
+  auto transposeBufferXY = create_1d_view(
+      array1_, 0, paramPtr_->max_num_z_sticks() * paramPtr_->max_num_xy_planes() * comm_.size());
+  TransposeMPIBufferedHost<double, double> transpose(paramPtr_, comm_, freqXYView, freqView,
+                                                     transposeBufferXY, transposeBufferZ);
+
+  transpose.backward();
+  check_space_domain(freqXYView, fullView, paramPtr_->xy_plane_offset(comm_.rank()),
+                     paramPtr_->num_xy_planes(comm_.rank()));
+  transpose.forward();
+  check_freq_domain(freqView, fullView, paramPtr_->z_stick_xy_indices(comm_.rank()));
+}
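The three host variants above differ mainly in how their staging buffers are sized. The helpers below (names invented for illustration) restate the two sizing rules the tests use; padding every block to the global maximum in the buffered case presumably keeps all rank-to-rank blocks equally sized at the cost of extra memory.

```cpp
#include <cstddef>

// compact-buffered: size buffers by the exact number of exchanged elements
std::size_t compact_buffer_size(std::size_t totalNumXYPlanes, std::size_t numLocalZSticks) {
  return totalNumXYPlanes * numLocalZSticks;
}

// buffered: one maximum-sized block per rank, so all blocks are identical
std::size_t padded_buffer_size(std::size_t maxNumZSticks, std::size_t maxNumXYPlanes,
                               std::size_t commSize) {
  return maxNumZSticks * maxNumXYPlanes * commSize;
}
```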
diff --git a/tests/mpi_tests/test_transpose_gpu.cpp b/tests/mpi_tests/test_transpose_gpu.cpp
new file mode 100644
index 0000000..0d88355
--- /dev/null
+++ b/tests/mpi_tests/test_transpose_gpu.cpp
@@ -0,0 +1,254 @@
+#include <algorithm>
+#include <complex>
+#include <memory>
+#include <random>
+#include <vector>
+#include "gtest/gtest.h"
+#include "memory/array_view_utility.hpp"
+#include "memory/host_array.hpp"
+#include "memory/host_array_view.hpp"
+#include "mpi_util/mpi_communicator_handle.hpp"
+#include "parameters/parameters.hpp"
+#include "util/common_types.hpp"
+
+#if defined(SPFFT_CUDA) || defined(SPFFT_ROCM)
+#include "execution/execution_gpu.hpp"
+#include "memory/gpu_array.hpp"
+#include "transpose/transpose_mpi_buffered_gpu.hpp"
+#include "transpose/transpose_mpi_compact_buffered_gpu.hpp"
+#include "transpose/transpose_mpi_unbuffered_gpu.hpp"
+
+using namespace spfft;
+
+class TransposeGPUTest : public ::testing::Test {
+protected:
+  void SetUp() override {
+    comm_ = MPICommunicatorHandle(MPI_COMM_WORLD);
+
+    SizeType dimX = 2 * comm_.size();
+    SizeType dimY = 3 * comm_.size();
+    SizeType dimZ = 4 * comm_.size();
+
+    // create memory space
+    array1_ = HostArray<std::complex<double>>(dimX * dimY * dimZ, std::complex<double>(1.0, 1.0));
+    array2_ = HostArray<std::complex<double>>(dimX * dimY * dimZ, std::complex<double>(1.0, 1.0));
+    fullArray_ = HostArray<std::complex<double>>(dimX * dimY * dimZ);
+    gpuArray1_ = GPUArray<typename gpu::fft::ComplexType<double>::type>(array1_.size());
+    gpuArray2_ = GPUArray<typename gpu::fft::ComplexType<double>::type>(array1_.size());
+
+    // pin arrays for fast transfers to and from the GPU
+    array1_.pin_memory();
+    array2_.pin_memory();
+
+    // plane split between ranks
+    const SizeType numLocalXYPlanes =
+        (dimZ / comm_.size()) + (comm_.rank() == comm_.size() - 1 ? dimZ % comm_.size() : 0);
+    const SizeType localXYPlaneOffset = (dimZ / comm_.size()) * comm_.rank();
+
+    // create all indices the same way (random generator must be equally initialized)
+    std::mt19937 sharedRandGen(42);
+    std::uniform_real_distribution<double> dis(0.0, 1.0);
+    std::uniform_int_distribution<int> rankSelector(0, comm_.size() - 1);
+
+    std::vector<int> indexTriplets;
+    indexTriplets.reserve(dimX * dimY * dimZ);
+    for (int x = 0; x < static_cast<int>(dimX); ++x) {
+      for (int y = 0; y < static_cast<int>(dimY); ++y) {
+        // create sparse z stick distribution
+        if (dis(sharedRandGen) < 0.5 &&
+            rankSelector(sharedRandGen) == static_cast<int>(comm_.rank())) {
+          for (int z = 0; z < static_cast<int>(dimZ); ++z) {
+            indexTriplets.push_back(x);
+            indexTriplets.push_back(y);
+            indexTriplets.push_back(z);
+          }
+        }
+      }
+    }
+    paramPtr_.reset(new Parameters(comm_, SPFFT_TRANS_C2C, dimX, dimY, dimZ, numLocalXYPlanes,
+                                   indexTriplets.size() / 3, SPFFT_INDEX_TRIPLETS,
+                                   indexTriplets.data()));
+
+    // initialize random z-stick data
+    auto fullView = create_3d_view(fullArray_, 0, dimX, dimY, dimZ);
+    auto freqView = create_2d_view(array1_, 0, paramPtr_->num_z_sticks(comm_.rank()), dimZ);
+
+    for (SizeType r = 0; r < comm_.size(); ++r) {
+      for (const auto& stickIdx : paramPtr_->z_stick_xy_indices(r)) {
+        const auto x = stickIdx / dimY;
+        const auto y = stickIdx - x * dimY;
+        for (SizeType z = 0; z < freqView.dim_inner(); ++z) {
+          fullView(x, y, z) = std::complex<double>(dis(sharedRandGen), dis(sharedRandGen));
+        }
+      }
+    }
+
+    // copy data into sticks
+    SizeType count = 0;
+    for (const auto& stickIdx : paramPtr_->z_stick_xy_indices(comm_.rank())) {
+      const auto x = stickIdx / dimY;
+      const auto y = stickIdx - x * dimY;
+      for (SizeType z = 0; z < freqView.dim_inner(); ++z) {
+        freqView(count, z) = fullView(x, y, z);
+      }
+      ++count;
+    }
+  }
+
+  MPICommunicatorHandle comm_;
+  std::shared_ptr<Parameters> paramPtr_;
+  HostArray<std::complex<double>> array1_;
+  HostArray<std::complex<double>> array2_;
+  HostArray<std::complex<double>> fullArray_;
+  GPUArray<typename gpu::fft::ComplexType<double>::type> gpuArray1_;
+  GPUArray<typename gpu::fft::ComplexType<double>::type> gpuArray2_;
+};
+
+static void check_space_domain(const HostArrayView3D<std::complex<double>>& realView,
+                               const HostArrayView3D<std::complex<double>>& fullView,
+                               const SizeType planeOffset, const SizeType numLocalXYPlanes) {
+  for (SizeType z = 0; z < numLocalXYPlanes; ++z) {
+    for (SizeType x = 0; x < fullView.dim_outer(); ++x) {
+      for (SizeType y = 0; y < fullView.dim_mid(); ++y) {
+        EXPECT_EQ(realView(z, x, y).real(), fullView(x, y, z + planeOffset).real());
+        EXPECT_EQ(realView(z, x, y).imag(), fullView(x, y, z + planeOffset).imag());
+      }
+    }
+  }
+}
+
+static void check_freq_domain(const HostArrayView2D<std::complex<double>>& freqView,
+                              const HostArrayView3D<std::complex<double>>& fullView,
+                              HostArrayConstView1D<int> xyIndices) {
+  for (SizeType stickIdx = 0; stickIdx < freqView.dim_outer(); ++stickIdx) {
+    const auto x = xyIndices(stickIdx) / fullView.dim_mid();
+    const auto y = xyIndices(stickIdx) - x * fullView.dim_mid();
+    for (SizeType z = 0; z < freqView.dim_inner(); ++z) {
+      EXPECT_EQ(freqView(stickIdx, z).real(), fullView(x, y, z).real());
+      EXPECT_EQ(freqView(stickIdx, z).imag(), fullView(x, y, z).imag());
+    }
+  }
+}
+
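The GPU tests that follow all stage data the same way: pin the host arrays once, enqueue asynchronous copies and the transpose on a stream, then synchronize before checking results. A reduced sketch of that pattern against the raw CUDA runtime (the tests themselves go through SpFFT's gpu:: wrappers; error checks are omitted for brevity):

```cpp
#include <cuda_runtime.h>
#include <complex>
#include <cstddef>

// Illustrative round trip: pinned host memory enables truly asynchronous
// copies, and the stream synchronize makes results visible on the host.
void staged_roundtrip(std::complex<double>* hostBuf, std::size_t n) {
  const std::size_t bytes = n * sizeof(std::complex<double>);
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  cudaHostRegister(hostBuf, bytes, cudaHostRegisterDefault);  // "pin" the host buffer
  void* devBuf = nullptr;
  cudaMalloc(&devBuf, bytes);
  cudaMemcpyAsync(devBuf, hostBuf, bytes, cudaMemcpyHostToDevice, stream);
  // ... transpose / transform kernels would be enqueued on the same stream ...
  cudaMemcpyAsync(hostBuf, devBuf, bytes, cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);  // host data valid after this point
  cudaFree(devBuf);
  cudaHostUnregister(hostBuf);
  cudaStreamDestroy(stream);
}
```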
+TEST_F(TransposeGPUTest, Buffered) {
+  auto freqXYView = create_3d_view(array2_, 0, paramPtr_->num_xy_planes(comm_.rank()),
+                                   paramPtr_->dim_y(), paramPtr_->dim_x());
+  auto freqXYViewGPU = create_3d_view(gpuArray2_, 0, paramPtr_->num_xy_planes(comm_.rank()),
+                                      paramPtr_->dim_y(), paramPtr_->dim_x());
+  auto freqView =
+      create_2d_view(array1_, 0, paramPtr_->num_z_sticks(comm_.rank()), paramPtr_->dim_z());
+  auto freqViewGPU =
+      create_2d_view(gpuArray1_, 0, paramPtr_->num_z_sticks(comm_.rank()), 
paramPtr_->dim_z()); + + auto fullView = + create_3d_view(fullArray_, 0, paramPtr_->dim_x(), paramPtr_->dim_y(), paramPtr_->dim_z()); + + GPUStreamHandle stream(false); + auto transposeBufferZ = create_1d_view( + array2_, 0, + comm_.size() * paramPtr_->max_num_xy_planes() * paramPtr_->max_num_z_sticks()); + auto transposeBufferZGPU = create_1d_view( + gpuArray2_, 0, + comm_.size() * paramPtr_->max_num_xy_planes() * paramPtr_->max_num_z_sticks()); + auto transposeBufferXY = create_1d_view( + array1_, 0, + comm_.size() * paramPtr_->max_num_xy_planes() * paramPtr_->max_num_z_sticks()); + auto transposeBufferXYGPU = create_1d_view( + gpuArray1_, 0, + comm_.size() * paramPtr_->max_num_xy_planes() * paramPtr_->max_num_z_sticks()); + + TransposeMPIBufferedGPU transpose( + paramPtr_, comm_, transposeBufferXY, freqXYViewGPU, transposeBufferXYGPU, stream, + transposeBufferZ, freqViewGPU, transposeBufferZGPU, stream); + + copy_to_gpu_async(stream, freqView, freqViewGPU); + transpose.backward(); + copy_from_gpu_async(stream, freqXYViewGPU, freqXYView); + gpu::check_status(gpu::stream_synchronize(stream.get())); + check_space_domain(freqXYView, fullView, paramPtr_->xy_plane_offset(comm_.rank()), + paramPtr_->num_xy_planes(comm_.rank())); + + transpose.forward(); + copy_from_gpu_async(stream, freqViewGPU, freqView); + gpu::check_status(gpu::stream_synchronize(stream.get())); + check_freq_domain(freqView, fullView, paramPtr_->z_stick_xy_indices(comm_.rank())); +} + +TEST_F(TransposeGPUTest, CompactBuffered) { + auto freqXYView = create_3d_view(array2_, 0, paramPtr_->num_xy_planes(comm_.rank()), + paramPtr_->dim_y(), paramPtr_->dim_x()); + auto freqXYViewGPU = create_3d_view(gpuArray2_, 0, paramPtr_->num_xy_planes(comm_.rank()), + paramPtr_->dim_y(), paramPtr_->dim_x()); + auto freqView = + create_2d_view(array1_, 0, paramPtr_->num_z_sticks(comm_.rank()), paramPtr_->dim_z()); + auto freqViewGPU = + create_2d_view(gpuArray1_, 0, paramPtr_->num_z_sticks(comm_.rank()), paramPtr_->dim_z()); + + auto fullView = + create_3d_view(fullArray_, 0, paramPtr_->dim_x(), paramPtr_->dim_y(), paramPtr_->dim_z()); + + GPUStreamHandle stream(false); + auto transposeBufferZ = create_1d_view( + array2_, 0, + comm_.size() * paramPtr_->max_num_xy_planes() * paramPtr_->max_num_z_sticks()); + auto transposeBufferZGPU = create_1d_view( + gpuArray2_, 0, + comm_.size() * paramPtr_->max_num_xy_planes() * paramPtr_->max_num_z_sticks()); + auto transposeBufferXY = create_1d_view( + array1_, 0, + comm_.size() * paramPtr_->max_num_xy_planes() * paramPtr_->max_num_z_sticks()); + auto transposeBufferXYGPU = create_1d_view( + gpuArray1_, 0, + comm_.size() * paramPtr_->max_num_xy_planes() * paramPtr_->max_num_z_sticks()); + + TransposeMPICompactBufferedGPU transpose( + paramPtr_, comm_, transposeBufferXY, freqXYViewGPU, transposeBufferXYGPU, stream, + transposeBufferZ, freqViewGPU, transposeBufferZGPU, stream); + + copy_to_gpu_async(stream, freqView, freqViewGPU); + transpose.pack_backward(); + transpose.backward(); + transpose.unpack_backward(); + copy_from_gpu_async(stream, freqXYViewGPU, freqXYView); + gpu::check_status(gpu::stream_synchronize(stream.get())); + check_space_domain(freqXYView, fullView, paramPtr_->xy_plane_offset(comm_.rank()), + paramPtr_->num_xy_planes(comm_.rank())); + + transpose.forward(); + copy_from_gpu_async(stream, freqViewGPU, freqView); + gpu::check_status(gpu::stream_synchronize(stream.get())); + check_freq_domain(freqView, fullView, paramPtr_->z_stick_xy_indices(comm_.rank())); +} + +TEST_F(TransposeGPUTest, 
Unbuffered) {
+  auto freqXYView = create_3d_view(array2_, 0, paramPtr_->num_xy_planes(comm_.rank()),
+                                   paramPtr_->dim_y(), paramPtr_->dim_x());
+  auto freqXYViewGPU = create_3d_view(gpuArray2_, 0, paramPtr_->num_xy_planes(comm_.rank()),
+                                      paramPtr_->dim_y(), paramPtr_->dim_x());
+  auto freqView =
+      create_2d_view(array1_, 0, paramPtr_->num_z_sticks(comm_.rank()), paramPtr_->dim_z());
+  auto freqViewGPU =
+      create_2d_view(gpuArray1_, 0, paramPtr_->num_z_sticks(comm_.rank()), paramPtr_->dim_z());
+
+  auto fullView =
+      create_3d_view(fullArray_, 0, paramPtr_->dim_x(), paramPtr_->dim_y(), paramPtr_->dim_z());
+
+  GPUStreamHandle stream(false);
+
+  TransposeMPIUnbufferedGPU transpose(paramPtr_, comm_, freqXYView, freqXYViewGPU, stream,
+                                      freqView, freqViewGPU, stream);
+
+  copy_to_gpu_async(stream, freqView, freqViewGPU);
+  transpose.backward();
+  copy_from_gpu_async(stream, freqXYViewGPU, freqXYView);
+  gpu::check_status(gpu::stream_synchronize(stream.get()));
+  check_space_domain(freqXYView, fullView, paramPtr_->xy_plane_offset(comm_.rank()),
+                     paramPtr_->num_xy_planes(comm_.rank()));
+
+  transpose.forward();
+  copy_from_gpu_async(stream, freqViewGPU, freqView);
+  gpu::check_status(gpu::stream_synchronize(stream.get()));
+  check_freq_domain(freqView, fullView, paramPtr_->z_stick_xy_indices(comm_.rank()));
+}
+#endif
diff --git a/tests/programs/benchmark.cpp b/tests/programs/benchmark.cpp
new file mode 100644
index 0000000..6246ee8
--- /dev/null
+++ b/tests/programs/benchmark.cpp
@@ -0,0 +1,284 @@
+#include <algorithm>
+#include <complex>
+#include <ctime>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <list>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+#include "CLI/CLI.hpp"
+#include "fft/transform_1d_host.hpp"
+#include "memory/host_array.hpp"
+#include "spfft/config.h"
+
+#include "nlohmann/json.hpp"
+#include "timing/timing.hpp"
+
+#include <mpi.h>
+#include "memory/array_view_utility.hpp"
+#include "mpi_util/mpi_communicator_handle.hpp"
+#include "mpi_util/mpi_init_handle.hpp"
+#include "util/omp_definitions.hpp"
+
+#include "spfft/grid.hpp"
+#include "spfft/multi_transform.hpp"
+#include "spfft/transform.hpp"
+
+#include <unistd.h>  // for MPI debugging
+
+#if defined(SPFFT_CUDA) || defined(SPFFT_ROCM)
+#include "gpu_util/gpu_runtime_api.hpp"
+#include "gpu_util/gpu_transfer.hpp"
+#include "memory/gpu_array.hpp"
+#endif
+
+namespace std {
+void to_json(nlohmann::json& j, const std::list<::spfft::timing::TimingResult>& resList) {
+  for (const auto& res : resList) {
+    if (res.subNodes.empty())
+      j[res.identifier] = {{"values", res.timings}};
+    else
+      j[res.identifier] = {{"values", res.timings}, {"sub-timings", res.subNodes}};
+  }
+}
+}  // namespace std
+
+using namespace spfft;
+
+void run_benchmark(const int dimX, const int dimY, const int dimZ, const int numLocalZSticks,
+                   const int numLocalXYPlanes, const SpfftProcessingUnitType executionUnit,
+                   const SpfftProcessingUnitType targetUnit, const int numThreads,
+                   const SpfftExchangeType exchangeType, const std::vector<int>& indices,
+                   const int numRepeats, const int numTransforms, double** freqValuesPTR) {
+  std::vector<Transform> transforms;
+  for (int t = 0; t < numTransforms; ++t) {
+    Grid grid(dimX, dimY, dimZ, numLocalZSticks, numLocalXYPlanes, executionUnit, numThreads,
+              MPI_COMM_WORLD, exchangeType);
+
+    auto transform = grid.create_transform(
+        executionUnit, SpfftTransformType::SPFFT_TRANS_C2C, dimX, dimY, dimZ, numLocalXYPlanes,
+        indices.size() / 3, SpfftIndexFormatType::SPFFT_INDEX_TRIPLETS, indices.data());
+    transforms.emplace_back(std::move(transform));
+  }
+  std::vector<SpfftProcessingUnitType> targetUnits(numTransforms, targetUnit);
+  std::vector<SpfftScalingType> scalingTypes(numTransforms, SPFFT_NO_SCALING);
+
+  // run once for warm cache
+  {
+    HOST_TIMING_SCOPED("Warming")
+    multi_transform_backward(transforms.size(), transforms.data(), freqValuesPTR,
+                             targetUnits.data());
+    multi_transform_forward(transforms.size(), transforms.data(), targetUnits.data(), freqValuesPTR,
+                            scalingTypes.data());
+  }
+
+  std::string exchName("Compact buffered");
+  if (exchangeType == SpfftExchangeType::SPFFT_EXCH_BUFFERED) {
+    exchName = "Buffered";
+  } else if (exchangeType == SpfftExchangeType::SPFFT_EXCH_UNBUFFERED) {
+    exchName = "Unbuffered";
+  } else if (exchangeType == SpfftExchangeType::SPFFT_EXCH_COMPACT_BUFFERED_FLOAT) {
+    exchName = "Compact buffered float";
+  } else if (exchangeType == SpfftExchangeType::SPFFT_EXCH_BUFFERED_FLOAT) {
+    exchName = "Buffered float";
+  }
+
+  HOST_TIMING_SCOPED(exchName)
+  if (numTransforms == 1) {
+    for (int repeat = 0; repeat < numRepeats; ++repeat) {
+      transforms.front().backward(*freqValuesPTR, targetUnits.front());
+      transforms.front().forward(targetUnits.front(), *freqValuesPTR, scalingTypes.front());
+    }
+  } else {
+    for (int repeat = 0; repeat < numRepeats; ++repeat) {
+      multi_transform_backward(transforms.size(), transforms.data(), freqValuesPTR,
+                               targetUnits.data());
+      multi_transform_forward(transforms.size(), transforms.data(), targetUnits.data(),
+                              freqValuesPTR, scalingTypes.data());
+    }
+  }
+}
+
+int main(int argc, char** argv) {
+  MPIInitHandle initHandle(argc, argv, true);
+  MPICommunicatorHandle comm(MPI_COMM_WORLD);
+
+#if defined(SPFFT_CUDA) || defined(SPFFT_ROCM)
+  // set device for multi-gpu nodes
+  int deviceCount = 0;
+  gpu::check_status(gpu::get_device_count(&deviceCount));
+  if (deviceCount > 1) {
+    gpu::check_status(gpu::set_device(comm.rank() % deviceCount));
+  }
+#endif
+
+  // if(comm.rank() == 0) {
+  //   std::cout << "PID = " << getpid() << std::endl;
+  // }
+  // bool waitLoop = comm.rank() == 0;
+  // while(waitLoop) {
+  //   sleep(5);
+  // }
+
+  int numRepeats = 1;
+  int gridDimSize = 4;
+  int numTransforms = 1;
+  std::string outputFileName;
+  std::string exchName;
+  std::string procName;
+  double sparsity = 1.0;
+
+  CLI::App app{"fft test"};
+  app.add_option("-n", gridDimSize, "Size of symmetric fft grid in each dimension")->required();
+  app.add_option("-r", numRepeats, "Number of repeats")->required();
+  app.add_option("-o", outputFileName, "Output file name")->required();
+  app.add_option("-t", numTransforms, "Number of transforms")->default_val("1");
+  app.add_option("-s", sparsity, "Sparsity");
+  app.add_set("-e", exchName,
+              std::set<std::string>{"all", "compact", "compactFloat", "buffered", "bufferedFloat",
+                                    "unbuffered"},
+              "Exchange type")
+      ->required();
+  app.add_set("-p", procName, std::set<std::string>{"cpu", "gpu", "gpu-gpu"}, "Processing unit")
+      ->required();
+  CLI11_PARSE(app, argc, argv);
+
+  int dimX = gridDimSize;
+  int dimY = gridDimSize;
+  int dimZ = gridDimSize;
+
+  const int numThreads = omp_get_max_threads();
+
+  const SizeType numLocalXYPlanes =
+      (dimZ / comm.size()) + (comm.rank() < dimZ % comm.size() ? 1 : 0);
+  int numLocalZSticks = 0;
+
+  std::vector<int> xyzIndices;
+  {
+    // std::mt19937 randGen(42);
+    // std::uniform_real_distribution<double> uniformRandDis(0.0, 1.0);
+    // create all global x-y index pairs
+    std::vector<std::pair<int, int>> xyIndicesGlobal;
+    xyIndicesGlobal.reserve(dimX * dimY);
+    for (int x = 0; x < static_cast<int>(dimX * sparsity); ++x) {
+      for (int y = 0; y < static_cast<int>(dimY); ++y) {
+        xyIndicesGlobal.emplace_back(x, y);
+      }
+    }
+
+    // distribute z-sticks as evenly as possible
+    numLocalZSticks = (xyIndicesGlobal.size()) / comm.size() +
+                      (comm.rank() < (xyIndicesGlobal.size()) % comm.size() ? 1 : 0);
+    const int offset = ((xyIndicesGlobal.size()) / comm.size()) * comm.rank() +
+                       std::min(comm.rank(), static_cast<int>(xyIndicesGlobal.size()) % comm.size());
+
+    // assemble index triplets
+    xyzIndices.reserve(numLocalZSticks);
+    for (int i = offset; i < offset + numLocalZSticks; ++i) {
+      for (int z = 0; z < dimZ; ++z) {
+        xyzIndices.push_back(xyIndicesGlobal[i].first);
+        xyzIndices.push_back(xyIndicesGlobal[i].second);
+        xyzIndices.push_back(z);
+      }
+    }
+  }
+
+  // store full z-sticks values
+
+  const auto executionUnit = procName == "cpu" ? SpfftProcessingUnitType::SPFFT_PU_HOST
+                                               : SpfftProcessingUnitType::SPFFT_PU_GPU;
+  const auto targetUnit = procName == "gpu-gpu" ? SpfftProcessingUnitType::SPFFT_PU_GPU
+                                                : SpfftProcessingUnitType::SPFFT_PU_HOST;
+
+  std::vector<double*> freqValuesPointers(numTransforms);
+  std::vector<std::vector<std::complex<double>>> freqValues;
+  for (int t = 0; t < numTransforms; ++t) freqValues.emplace_back(xyzIndices.size() / 3);
+#if defined(SPFFT_CUDA) || defined(SPFFT_ROCM)
+  std::vector<GPUArray<typename gpu::fft::ComplexType<double>::type>> freqValuesGPU;
+  for (int t = 0; t < numTransforms; ++t) freqValuesGPU.emplace_back(xyzIndices.size() / 3);
+
+  for (int t = 0; t < numTransforms; ++t) {
+    freqValuesPointers[t] = procName == "gpu-gpu"
+                                ? reinterpret_cast<double*>(freqValuesGPU[t].data())
+                                : reinterpret_cast<double*>(freqValues[t].data());
+  }
+#else
+  for (int t = 0; t < numTransforms; ++t) {
+    freqValuesPointers[t] = reinterpret_cast<double*>(freqValues[t].data());
+  }
+#endif
+
+  if (comm.rank() == 0) {
+    std::cout << "Num MPI ranks: " << comm.size() << std::endl;
+    std::cout << "Grid size: " << dimX << ", " << dimY << ", " << dimZ << std::endl;
+    std::cout << "Proc: " << procName << std::endl;
+  }
+
+  if (exchName == "all") {
+    run_benchmark(dimX, dimY, dimZ, numLocalZSticks, numLocalXYPlanes, executionUnit, targetUnit,
+                  numThreads, SpfftExchangeType::SPFFT_EXCH_BUFFERED, xyzIndices, numRepeats,
+                  numTransforms, freqValuesPointers.data());
+    run_benchmark(dimX, dimY, dimZ, numLocalZSticks, numLocalXYPlanes, executionUnit, targetUnit,
+                  numThreads, SpfftExchangeType::SPFFT_EXCH_COMPACT_BUFFERED, xyzIndices,
+                  numRepeats, numTransforms, freqValuesPointers.data());
+    run_benchmark(dimX, dimY, dimZ, numLocalZSticks, numLocalXYPlanes, executionUnit, targetUnit,
+                  numThreads, SpfftExchangeType::SPFFT_EXCH_UNBUFFERED, xyzIndices, numRepeats,
+                  numTransforms, freqValuesPointers.data());
+  } else {
+    auto exchangeType = SpfftExchangeType::SPFFT_EXCH_DEFAULT;
+    if (exchName == "compact") {
+      exchangeType = SpfftExchangeType::SPFFT_EXCH_COMPACT_BUFFERED;
+    } else if (exchName == "compactFloat") {
+      exchangeType = SpfftExchangeType::SPFFT_EXCH_COMPACT_BUFFERED_FLOAT;
+    } else if (exchName == "buffered") {
+      exchangeType = SpfftExchangeType::SPFFT_EXCH_BUFFERED;
+    } else if (exchName == "bufferedFloat") {
+      exchangeType = SpfftExchangeType::SPFFT_EXCH_BUFFERED_FLOAT;
+    } else if (exchName == "unbuffered") {
+      exchangeType = SpfftExchangeType::SPFFT_EXCH_UNBUFFERED;
+    }
+
+    run_benchmark(dimX, dimY, dimZ, numLocalZSticks, numLocalXYPlanes, executionUnit, targetUnit,
+                  numThreads, exchangeType, xyzIndices, numRepeats, numTransforms,
+                  freqValuesPointers.data());
+  }
+
+  MPI_Barrier(MPI_COMM_WORLD);
+  if (comm.rank() == 0) {
+    HOST_TIMING_PRINT();
+    if (!outputFileName.empty()) {
+      nlohmann::json j;
+      const std::time_t t = std::time(nullptr);
+      std::string time(std::ctime(&t));
+      time.pop_back();
+      j["timings"] = HOST_TIMING_PROCESS_TIMINGS();
+#ifdef SPFFT_GPU_DIRECT
+      const bool gpuDirectEnabled = true;
+#else
+      const bool gpuDirectEnabled = false;
+#endif
+
+      const bool data_on_gpu = procName == "gpu-gpu";
+      j["parameters"] = {{"proc", procName},
+                         {"data_on_gpu", data_on_gpu},
+                         {"gpu_direct", gpuDirectEnabled},
+                         {"num_ranks", comm.size()},
+                         {"num_threads", numThreads},
+                         {"dim_x", dimX},
+                         {"dim_y", dimY},
+                         {"dim_z", dimZ},
+                         {"exchange_type", exchName},
+                         {"num_repeats", numRepeats},
+                         {"time", time}};
+      std::ofstream file(outputFileName);
+      file << std::setw(2) << j;
+      file.close();
+    }
+  }
+
+  return 0;
+}
diff --git a/tests/programs/main.cpp b/tests/programs/main.cpp
new file mode 100644
index 0000000..8cd8fe0
--- /dev/null
+++ b/tests/programs/main.cpp
@@ -0,0 +1,185 @@
+#include <cmath>
+#include <complex>
+#include <fftw3.h>
+#include <iomanip>
+#include <iostream>
+#include <vector>
+#include "CLI/CLI.hpp"
+#include "fft/transform_1d_host.hpp"
+#include "memory/array_view_utility.hpp"
+#include "memory/host_array.hpp"
+#include "spfft/config.h"
+#include "timing/timing.hpp"
+
+#include "spfft/grid.hpp"
+#include "spfft/transform.hpp"
+
+using namespace spfft;
+
+static bool enablePrint = false;
+
+auto print_view_3d(const HostArrayView3D<std::complex<double>>& view, std::string label) -> void {
+  if (!enablePrint) return;
+  // std::cout << std::scientific;
+  std::cout << std::fixed;
+  std::cout << std::setprecision(2);
std::cout << " -------------------- " << std::endl; + std::cout << label << ":" << std::endl; + for (SizeType idxOuter = 0; idxOuter < view.dim_outer(); ++idxOuter) { + for (SizeType idxMid = 0; idxMid < view.dim_mid(); ++idxMid) { + for (SizeType idxInner = 0; idxInner < view.dim_inner(); ++idxInner) { + const auto& value = view(idxOuter, idxMid, idxInner); + std::cout << std::setw(8) << std::right << value.real(); + if (std::signbit(value.imag())) { + std::cout << " - "; + } else { + std::cout << " + "; + } + std::cout << std::left << std::setw(6) << std::abs(value.imag()); + } + std::cout << " | "; + } + std::cout << std::endl; + } + std::cout << " -------------------- " << std::endl; +} + +auto print_view_3d_transposed(const HostArrayView3D>& view, std::string label) + -> void { + if (!enablePrint) return; + // std::cout << std::scientific; + std::cout << std::fixed; + std::cout << std::setprecision(2); + std::cout << " -------------------- " << std::endl; + std::cout << label << ":" << std::endl; + for (SizeType idxInner = 0; idxInner < view.dim_inner(); ++idxInner) { + for (SizeType idxMid = 0; idxMid < view.dim_mid(); ++idxMid) { + for (SizeType idxOuter = 0; idxOuter < view.dim_outer(); ++idxOuter) { + const auto& value = view(idxOuter, idxMid, idxInner); + std::cout << std::setw(8) << std::right << value.real(); + if (std::signbit(value.imag())) { + std::cout << " - "; + } else { + std::cout << " + "; + } + std::cout << std::left << std::setw(6) << std::abs(value.imag()); + } + std::cout << " | "; + } + std::cout << std::endl; + } + std::cout << " -------------------- " << std::endl; +} + +// #define print_view_3d(...) +// #define print_view_3d_transposed(...) + +int main(int argc, char** argv) { + SizeType numRepeats = 1; + SizeType gridDimSize = 4; + + CLI::App app{"Single node fft test"}; + app.add_option("-n", gridDimSize, "Size of symmetric fft grid in each dimension")->required(); + app.add_option("-r", numRepeats, "Number of repeats")->default_val("1"); + app.add_flag("-p", enablePrint, "Enable print"); + CLI11_PARSE(app, argc, argv); + + SizeType dimX = gridDimSize; + SizeType dimY = gridDimSize; + SizeType dimZ = gridDimSize; + std::vector xyzIndices; + + for (int x = 0; x < static_cast(dimX); ++x) { + for (int y = 0; y < static_cast(dimY); ++y) { + for (int z = 0; z < static_cast(dimZ); ++z) { + xyzIndices.push_back(x); + xyzIndices.push_back(y); + xyzIndices.push_back(z); + } + } + } + + // create full 3d freq view + HostArray> array1(dimX * dimY * dimZ); + auto fftwView = create_3d_view(array1, 0, dimX, dimY, dimZ); + SizeType counter = 1; + for (SizeType x = 0; x < dimX; ++x) { + for (SizeType y = 0; y < dimY; ++y) { + for (SizeType z = 0; z < dimZ; ++z) { + fftwView(x, y, z) = std::complex(counter, counter); + ++counter; + } + } + } + + // store full z-sticks values + HostArray> arrayPacked(dimZ * dimX * dimY); + + SizeType valueIndex = 0; + for (SizeType i = 0; i < xyzIndices.size(); i += 3, ++valueIndex) { + arrayPacked(valueIndex) = fftwView(xyzIndices[i], xyzIndices[i + 1], xyzIndices[i + 2]); + } + auto freqDomainZ = create_3d_view(arrayPacked, 0, 1, dimX * dimY, dimZ); + +#if defined(SPFFT_CUDA) || defined(SPFFT_ROCM) + const auto executionUnit = SpfftProcessingUnitType::SPFFT_PU_GPU; +#else + const auto executionUnit = SpfftProcessingUnitType::SPFFT_PU_HOST; +#endif + + Grid grid(dimX, dimY, dimZ, dimX * dimY, executionUnit, -1); + + auto transform = grid.create_transform( + executionUnit, SpfftTransformType::SPFFT_TRANS_C2C, dimX, dimY, dimZ, dimZ, + 
xyzIndices.size() / 3, SpfftIndexFormatType::SPFFT_INDEX_TRIPLETS, xyzIndices.data()); + + // output initial z stick + print_view_3d(freqDomainZ, "Freq input"); + + auto spaceDomainView = HostArrayView3D>( + reinterpret_cast*>( + transform.space_domain_data(SpfftProcessingUnitType::SPFFT_PU_HOST)), + dimZ, dimY, dimX, false); + for (SizeType repeat = 0; repeat < numRepeats; ++repeat) { + transform.backward(reinterpret_cast(arrayPacked.data()), + SpfftProcessingUnitType::SPFFT_PU_HOST); + print_view_3d(spaceDomainView, "Real"); + transform.forward(SpfftProcessingUnitType::SPFFT_PU_HOST, + reinterpret_cast(arrayPacked.data())); + } + + // output final z stick + print_view_3d(freqDomainZ, "Freq after forward and"); + + HOST_TIMING_START("FFTW 3d init backward") + fftw_plan plan3DBackward = + fftw_plan_dft_3d(dimX, dimY, dimZ, (fftw_complex*)fftwView.data(), + (fftw_complex*)fftwView.data(), FFTW_BACKWARD, FFTW_ESTIMATE); + HOST_TIMING_STOP("FFTW 3d init backward") + + HOST_TIMING_START("FFTW 3d init forward") + fftw_plan plan3DForward = + fftw_plan_dft_3d(dimX, dimY, dimZ, (fftw_complex*)fftwView.data(), + (fftw_complex*)fftwView.data(), FFTW_FORWARD, FFTW_ESTIMATE); + HOST_TIMING_STOP("FFTW 3d init forward") + + for (SizeType repeat = 0; repeat < numRepeats; ++repeat) { + HOST_TIMING_START("FFTW 3d backward") + fftw_execute(plan3DBackward); + HOST_TIMING_STOP("FFTW 3d backward") + + print_view_3d_transposed(fftwView, "FFTW ref real"); + + HOST_TIMING_START("FFTW 3d forward") + fftw_execute(plan3DForward); + HOST_TIMING_STOP("FFTW 3d forward") + } + print_view_3d(fftwView, "FFTW freq after forward and"); + + fftw_destroy_plan(plan3DBackward); + fftw_destroy_plan(plan3DForward); + + HOST_TIMING_PRINT(); + + return 0; +} diff --git a/tests/programs/main_mpi.cpp b/tests/programs/main_mpi.cpp new file mode 100644 index 0000000..31e27e3 --- /dev/null +++ b/tests/programs/main_mpi.cpp @@ -0,0 +1,375 @@ +/* +#define PRINT_VAR(variable) \ + { \ + int mpiRankPrint, mpiPrintSize; \ + MPI_Comm_rank(MPI_COMM_WORLD, &mpiRankPrint); \ + MPI_Comm_size(MPI_COMM_WORLD, &mpiPrintSize); \ + for (int mpiRankPrintIdx = 0; mpiRankPrintIdx < mpiPrintSize; ++mpiRankPrintIdx) { \ + MPI_Barrier(MPI_COMM_WORLD); \ + if (mpiRankPrint != mpiRankPrintIdx) continue; \ + std::cout << "rank " << mpiRankPrint << ", " << #variable << " = " << variable << std::endl; \ + } \ + } +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "CLI/CLI.hpp" +#include "memory/host_array.hpp" +#include "spfft/grid.hpp" +#include "spfft/transform.hpp" + +#define SPFFT_ENABLE_TIMING 1 +#include "timing/timing.hpp" + +#include +#include "memory/array_view_utility.hpp" +#include "mpi_util/mpi_communicator_handle.hpp" +#include "mpi_util/mpi_init_handle.hpp" + +#if defined(SPFFT_CUDA) || defined(SPFFT_ROCM) +#include "memory/gpu_array.hpp" +#endif + +#include // for MPI debugging +using namespace spfft; + +static bool enablePrint = false; + +auto print_view_3d(const HostArrayView3D>& view, std::string label) -> void { + if (!enablePrint) return; + int rank, size; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + for (int r = 0; r < size; ++r) { + MPI_Barrier(MPI_COMM_WORLD); + if (r != rank) continue; + std::stringstream stream; + // stream << std::scientific; + stream << std::fixed; + stream << std::setprecision(1); + stream << " -------------------- " << std::endl; + stream << "Rank = " << rank << ", " << label << ":" << std::endl; + for (SizeType idxOuter = 0; 
idxOuter < view.dim_outer(); ++idxOuter) { + for (SizeType idxMid = 0; idxMid < view.dim_mid(); ++idxMid) { + for (SizeType idxInner = 0; idxInner < view.dim_inner(); ++idxInner) { + const auto& value = view(idxOuter, idxMid, idxInner); + stream << std::setw(8) << std::right << value.real(); + if (std::signbit(value.imag())) { + stream << " - "; + } else { + stream << " + "; + } + stream << std::left << std::setw(8) << std::abs(value.imag()); + } + stream << " | "; + } + stream << std::endl; + } + stream << " -------------------- " << std::endl; + std::cout << stream.str(); + } + MPI_Barrier(MPI_COMM_WORLD); +} + +auto print_view_3d(const HostArrayView3D& view, std::string label) -> void { + if (!enablePrint) return; + int rank, size; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + for (int r = 0; r < size; ++r) { + MPI_Barrier(MPI_COMM_WORLD); + if (r != rank) continue; + std::stringstream stream; + // stream << std::scientific; + stream << std::fixed; + stream << std::setprecision(1); + stream << " -------------------- " << std::endl; + stream << "Rank = " << rank << ", " << label << ":" << std::endl; + for (SizeType idxOuter = 0; idxOuter < view.dim_outer(); ++idxOuter) { + for (SizeType idxMid = 0; idxMid < view.dim_mid(); ++idxMid) { + for (SizeType idxInner = 0; idxInner < view.dim_inner(); ++idxInner) { + const auto& value = view(idxOuter, idxMid, idxInner); + stream << std::setw(8) << std::right << value; + } + stream << " | "; + } + stream << std::endl; + } + stream << " -------------------- " << std::endl; + std::cout << stream.str(); + } + MPI_Barrier(MPI_COMM_WORLD); +} + +auto print_view_3d_transposed(const HostArrayView3D>& view, std::string label) + -> void { + if (!enablePrint) return; + int rank, size; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + for (int r = 0; r < size; ++r) { + MPI_Barrier(MPI_COMM_WORLD); + if (r != rank) continue; + std::stringstream stream; + // stream << std::scientific; + stream << std::fixed; + stream << std::setprecision(1); + stream << " -------------------- " << std::endl; + stream << label << ":" << std::endl; + for (SizeType idxInner = 0; idxInner < view.dim_inner(); ++idxInner) { + for (SizeType idxMid = 0; idxMid < view.dim_mid(); ++idxMid) { + for (SizeType idxOuter = 0; idxOuter < view.dim_outer(); ++idxOuter) { + const auto& value = view(idxOuter, idxMid, idxInner); + stream << std::setw(8) << std::right << value.real(); + if (std::signbit(value.imag())) { + stream << " - "; + } else { + stream << " + "; + } + stream << std::left << std::setw(8) << std::abs(value.imag()); + } + stream << " | "; + } + stream << std::endl; + } + stream << " -------------------- " << std::endl; + std::cout << stream.str(); + } + MPI_Barrier(MPI_COMM_WORLD); +} + +auto print_view_3d_transposed(const HostArrayView3D& view, std::string label) -> void { + if (!enablePrint) return; + int rank, size; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + for (int r = 0; r < size; ++r) { + MPI_Barrier(MPI_COMM_WORLD); + if (r != rank) continue; + std::stringstream stream; + // stream << std::scientific; + stream << std::fixed; + stream << std::setprecision(1); + stream << " -------------------- " << std::endl; + stream << label << ":" << std::endl; + for (SizeType idxInner = 0; idxInner < view.dim_inner(); ++idxInner) { + for (SizeType idxMid = 0; idxMid < view.dim_mid(); ++idxMid) { + for (SizeType idxOuter = 0; idxOuter < view.dim_outer(); ++idxOuter) { + const 
auto& value = view(idxOuter, idxMid, idxInner); + stream << std::setw(8) << std::right << value; + } + stream << " | "; + } + stream << std::endl; + } + stream << " -------------------- " << std::endl; + std::cout << stream.str(); + } + MPI_Barrier(MPI_COMM_WORLD); +} + +int main(int argc, char** argv) { + MPIInitHandle initHandle(argc, argv, true); + MPICommunicatorHandle comm(MPI_COMM_WORLD); + + // if(comm.rank() == 0) { + // std::cout << "PID = " << getpid() << std::endl; + // } + // bool waitLoop = comm.rank() == 0; + // while(waitLoop) { + // sleep(5); + // } + + SizeType numRepeats = 1; + SizeType gridDimSize = 4; + bool realToComplex = false; + auto transformType = SpfftTransformType::SPFFT_TRANS_C2C; + CLI::App app{"Single node fft test"}; + app.add_option("-n", gridDimSize, "Size of symmetric fft grid in each dimension")->required(); + app.add_option("-r", numRepeats, "Number of repeats")->default_val("1"); + app.add_flag("-p", enablePrint, "Enable print"); + app.add_flag("-s", realToComplex, "Enable realToComplex"); + CLI11_PARSE(app, argc, argv); + + if (realToComplex) { + transformType = SpfftTransformType::SPFFT_TRANS_R2C; + } + + SizeType dimX = gridDimSize; + SizeType dimY = gridDimSize; + SizeType dimZ = gridDimSize; + // SizeType dimX = 11; + // SizeType dimY = 12; + // SizeType dimZ = 13; + + const SizeType numLocalXYPlanes = + (dimZ / comm.size()) + (comm.rank() == comm.size() - 1 ? dimZ % comm.size() : 0); + const SizeType numLocalYIndices = + (dimY / comm.size()) + (comm.rank() == comm.size() - 1 ? dimY % comm.size() : 0); + const SizeType numLocalZSticks = dimX * numLocalYIndices; + + // create xy indices + std::vector xyzIndices; + int freqDimX = dimX; + if (realToComplex) { + freqDimX = dimX / 2 + 1; + } + for (int x = 0; x < static_cast(freqDimX); ++x) { + for (int y = 0; y < static_cast(numLocalYIndices); ++y) { + for (int z = 0; z < static_cast(dimZ); ++z) { + if (!realToComplex || !(x == 0 && y + comm.rank() * (dimY / comm.size()) >= dimY / 2 + 1)) { + xyzIndices.push_back(x); + xyzIndices.push_back(y + comm.rank() * (dimY / comm.size())); + xyzIndices.push_back(z); + } + } + } + } + + // create full 3d freq view + HostArray> array1(dimX * dimY * dimZ); + auto fftwView = create_3d_view(array1, 0, dimX, dimY, dimZ); + SizeType counter = 0; + std::mt19937 randGen; + std::uniform_real_distribution uniformRandDis; + for (SizeType x = 0; x < dimX; ++x) { + for (SizeType y = 0; y < dimY; ++y) { + for (SizeType z = 0; z < dimZ; ++z) { + if (realToComplex) { + // fftwView(x, y, z) = std::complex(counter, 0.0); + fftwView(x, y, z) = std::complex(uniformRandDis(randGen), 0.0); + } else { + fftwView(x, y, z) = std::complex(counter, counter); + } + ++counter; + } + } + } + + // store full z-sticks values + HostArray> arrayPacked(dimZ * numLocalZSticks); + + auto freqDomainZ = create_3d_view(arrayPacked, 0, 1, numLocalZSticks, dimZ); + // output initial z stick + +#if defined(SPFFT_CUDA) || defined(SPFFT_ROCM) + const auto executionUnit = SpfftProcessingUnitType::SPFFT_PU_GPU; +#else + const auto executionUnit = SpfftProcessingUnitType::SPFFT_PU_HOST; +#endif + + Grid grid(dimX, dimY, dimZ, numLocalZSticks, numLocalXYPlanes, executionUnit, -1, MPI_COMM_WORLD, + SpfftExchangeType::SPFFT_EXCH_BUFFERED); + + auto transform = grid.create_transform( + executionUnit, transformType, dimX, dimY, dimZ, numLocalXYPlanes, xyzIndices.size() / 3, + SpfftIndexFormatType::SPFFT_INDEX_TRIPLETS, xyzIndices.data()); + + if (realToComplex) { + auto spaceDomainView = + 
HostArrayView3D(transform.space_domain_data(SpfftProcessingUnitType::SPFFT_PU_HOST), + numLocalXYPlanes, dimY, dimX, false); + for (SizeType x = 0; x < dimX; ++x) { + for (SizeType y = 0; y < dimY; ++y) { + for (SizeType z = 0; z < numLocalXYPlanes; ++z) { + spaceDomainView(z, y, x) = fftwView(x, y, z + comm.rank() * (dimZ / comm.size())).real(); + } + } + } + for (SizeType repeat = 0; repeat < numRepeats; ++repeat) { + print_view_3d(spaceDomainView, "Real init"); + transform.forward(SpfftProcessingUnitType::SPFFT_PU_HOST, + reinterpret_cast(arrayPacked.data())); + print_view_3d(freqDomainZ, "Freq"); + transform.backward(reinterpret_cast(arrayPacked.data()), + SpfftProcessingUnitType::SPFFT_PU_HOST); + print_view_3d(spaceDomainView, "Real after back and forth"); + } + } else { + SizeType valueIndex = 0; + for (SizeType i = 0; i < xyzIndices.size(); i += 3, ++valueIndex) { + arrayPacked(valueIndex) = fftwView(xyzIndices[i], xyzIndices[i + 1], xyzIndices[i + 2]); + } + auto spaceDomainView = HostArrayView3D>( + reinterpret_cast*>( + transform.space_domain_data(SpfftProcessingUnitType::SPFFT_PU_HOST)), + numLocalXYPlanes, dimY, dimX, false); + + for (SizeType repeat = 0; repeat < numRepeats; ++repeat) { + print_view_3d(freqDomainZ, "Freq input"); + transform.backward(reinterpret_cast(arrayPacked.data()), + SpfftProcessingUnitType::SPFFT_PU_HOST); + print_view_3d(spaceDomainView, "Real"); + transform.forward(SpfftProcessingUnitType::SPFFT_PU_HOST, + reinterpret_cast(arrayPacked.data())); + print_view_3d(freqDomainZ, "Freq after forward and backward"); + } + } + + // output final z stick + + // calculate reference + + MPI_Barrier(MPI_COMM_WORLD); + + HOST_TIMING_START("FFTW 3d init backward") + fftw_plan plan3DBackward = + fftw_plan_dft_3d(dimX, dimY, dimZ, (fftw_complex*)fftwView.data(), + (fftw_complex*)fftwView.data(), FFTW_BACKWARD, FFTW_ESTIMATE); + HOST_TIMING_STOP("FFTW 3d init backward") + + HOST_TIMING_START("FFTW 3d init forward") + fftw_plan plan3DForward = + fftw_plan_dft_3d(dimX, dimY, dimZ, (fftw_complex*)fftwView.data(), + (fftw_complex*)fftwView.data(), FFTW_FORWARD, FFTW_ESTIMATE); + HOST_TIMING_STOP("FFTW 3d init forward") + + if (realToComplex) { + for (SizeType repeat = 0; repeat < numRepeats; ++repeat) { + print_view_3d_transposed(fftwView, "FFTW ref real"); + + HOST_TIMING_START("FFTW 3d forward") + fftw_execute(plan3DForward); + HOST_TIMING_STOP("FFTW 3d forward") + + print_view_3d(fftwView, "FFTW freq"); + + HOST_TIMING_START("FFTW 3d backward") + fftw_execute(plan3DBackward); + HOST_TIMING_STOP("FFTW 3d backward") + + print_view_3d_transposed(fftwView, "FFTW real after back and forth"); + } + } else { + for (SizeType repeat = 0; repeat < numRepeats; ++repeat) { + HOST_TIMING_START("FFTW 3d backward") + fftw_execute(plan3DBackward); + HOST_TIMING_STOP("FFTW 3d backward") + + print_view_3d_transposed(fftwView, "FFTW ref real"); + + HOST_TIMING_START("FFTW 3d forward") + fftw_execute(plan3DForward); + HOST_TIMING_STOP("FFTW 3d forward") + print_view_3d(fftwView, "FFTW freq after forward and"); + } + } + + fftw_destroy_plan(plan3DBackward); + fftw_destroy_plan(plan3DForward); + + MPI_Barrier(MPI_COMM_WORLD); + if (comm.rank() == 0) { + HOST_TIMING_PRINT(); + } + + return 0; +} diff --git a/tests/run_local_tests.cpp b/tests/run_local_tests.cpp new file mode 100644 index 0000000..d058ab8 --- /dev/null +++ b/tests/run_local_tests.cpp @@ -0,0 +1,7 @@ +#include "gtest/gtest.h" + +int main(int argc, char *argv[]) +{ + ::testing::InitGoogleTest(&argc, argv); + return 
+}
diff --git a/tests/run_mpi_tests.cpp b/tests/run_mpi_tests.cpp
new file mode 100644
index 0000000..e7ac17a
--- /dev/null
+++ b/tests/run_mpi_tests.cpp
@@ -0,0 +1,29 @@
+#include <mpi.h>
+#include "gtest/gtest.h"
+#include "gtest_mpi/gtest_mpi.hpp"
+
+int main(int argc, char* argv[]) {
+  // Initialize MPI before any call to gtest_mpi
+  MPI_Init(&argc, &argv);
+
+  // Initialize google test
+  ::testing::InitGoogleTest(&argc, argv);
+
+  // Add a test environment, which will initialize a test communicator
+  // (a duplicate of MPI_COMM_WORLD)
+  ::testing::AddGlobalTestEnvironment(new gtest_mpi::MPITestEnvironment());
+
+  auto& test_listeners = ::testing::UnitTest::GetInstance()->listeners();
+
+  // Remove default listener and replace with the custom MPI listener
+  delete test_listeners.Release(test_listeners.default_result_printer());
+  test_listeners.Append(new gtest_mpi::PrettyMPIUnitTestResultPrinter());
+
+  // run tests
+  auto exit_code = RUN_ALL_TESTS();
+
+  // Finalize MPI before exiting
+  MPI_Finalize();
+
+  return exit_code;
+}
diff --git a/tests/test_util/generate_indices.hpp b/tests/test_util/generate_indices.hpp
new file mode 100644
index 0000000..9ccb79b
--- /dev/null
+++ b/tests/test_util/generate_indices.hpp
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef SPFFT_GENERATE_INDICES_HPP
+#define SPFFT_GENERATE_INDICES_HPP
+
+#include <numeric>
+#include <random>
+#include <vector>
+#include "spfft/config.h"
+
+namespace spfft {
+// creates randomly distributed indices for all ranks according to the input distributions
+template <typename T>
+auto create_value_indices(T& sharedRandGen, const std::vector<double>& zStickDistribution,
+                          const double totalZStickFraction, const double zStickFillFraction,
+                          const int dimX, const int dimY, const int dimZ,
+                          const bool hermitianSymmetry) -> std::vector<std::vector<int>> {
+  std::uniform_real_distribution<double> uniformRandDis(0.0, 1.0);
+  std::discrete_distribution<int> rankSelectDis(zStickDistribution.begin(),
+                                                zStickDistribution.end());
+
+  const double zStickFractionSum =
+      std::accumulate(zStickDistribution.begin(), zStickDistribution.end(), 0.0);
+
+  std::vector<std::vector<std::pair<int, int>>> xyIndicesPerRank(zStickDistribution.size());
+
+  const int dimXFreq = hermitianSymmetry ? dimX / 2 + 1 : dimX;
+  const int dimYFreq = hermitianSymmetry ? dimY / 2 + 1 : dimY;
+  for (int x = 0; x < dimXFreq; ++x) {
+    for (int y = 0; y < dimY; ++y) {
+      if (!(x == 0 && y >= dimYFreq) && uniformRandDis(sharedRandGen) < totalZStickFraction) {
+        const auto selectedRank = rankSelectDis(sharedRandGen);
+        xyIndicesPerRank[selectedRank].emplace_back(std::make_pair(x, y));
+      }
+    }
+  }
+
+  const int dimZFreq = hermitianSymmetry ? dimZ / 2 + 1 : dimZ;
+  std::vector<std::vector<int>> valueIndices(zStickDistribution.size());
+  auto valueIndicesIt = valueIndices.begin();
+  for (const auto& xyIndices : xyIndicesPerRank) {
+    for (const auto& xyIndex : xyIndices) {
+      for (int z = 0; z < dimZ; ++z) {
+        // only add half x=0, y=0 stick if hermitian symmetry is used
+        if (!(hermitianSymmetry && xyIndex.first == 0 && xyIndex.second == 0 && z >= dimZFreq) &&
+            uniformRandDis(sharedRandGen) < zStickFillFraction) {
+          valueIndicesIt->emplace_back(xyIndex.first);
+          valueIndicesIt->emplace_back(xyIndex.second);
+          valueIndicesIt->emplace_back(z);
+        }
+      }
+    }
+    ++valueIndicesIt;
+  }
+
+  return valueIndices;
+}
+
+inline auto center_indices(const int dimX, const int dimY, const int dimZ,
+                           std::vector<std::vector<int>>& indicesPerRank) -> void {
+  const int positiveSizeX = dimX / 2 + 1;
+  const int positiveSizeY = dimY / 2 + 1;
+  const int positiveSizeZ = dimZ / 2 + 1;
+  for (auto& rankIndices : indicesPerRank) {
+    for (std::size_t i = 0; i < rankIndices.size(); i += 3) {
+      if (rankIndices[i] >= positiveSizeX) rankIndices[i] -= dimX;
+      if (rankIndices[i + 1] >= positiveSizeY) rankIndices[i + 1] -= dimY;
+      if (rankIndices[i + 2] >= positiveSizeZ) rankIndices[i + 2] -= dimZ;
+    }
+  }
+}
+
+// assigns a number of xy planes to the local rank according to the xy plane distribution
+inline auto calculate_num_local_xy_planes(const int rank, const int dimZ,
+                                          const std::vector<double>& planeRankDistribution) -> int {
+  const double planeDistriSum =
+      std::accumulate(planeRankDistribution.begin(), planeRankDistribution.end(), 0.0);
+  std::vector<int> numXYPlanesPerRank(planeRankDistribution.size());
+  for (std::size_t i = 0; i < planeRankDistribution.size(); ++i) {
+    numXYPlanesPerRank[i] = planeRankDistribution[i] / planeDistriSum * dimZ;
+  }
+
+  int numMissingPlanes =
+      dimZ - std::accumulate(numXYPlanesPerRank.begin(), numXYPlanesPerRank.end(), 0);
+  for (auto& val : numXYPlanesPerRank) {
+    // add missing planes to rank with non-zero number
+    if (val > 0 && numMissingPlanes > 0) {
+      val += numMissingPlanes;
+      numMissingPlanes = 0;
+      break;
+    }
+    // subtract extra planes
+    if (numMissingPlanes < 0) {
+      val -= std::min(val, -numMissingPlanes);
+      numMissingPlanes += val;
+      if (numMissingPlanes >= 0) {
+        numMissingPlanes = 0;
+        break;
+      }
+    }
+  }
+
+  // if all ranks have 0 planes, some planes have to be assigned somewhere
+  if (numMissingPlanes > 0) {
+    numXYPlanesPerRank[0] = numMissingPlanes;
+  }
+  return numXYPlanesPerRank[rank];
+}
+
+}  // namespace spfft
+
+#endif
+
diff --git a/tests/test_util/test_check_values.hpp b/tests/test_util/test_check_values.hpp
new file mode 100644
index 0000000..3d9f481
--- /dev/null
+++ b/tests/test_util/test_check_values.hpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2019 ETH Zurich, Simon Frasch
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef SPFFT_TEST_CHECK_VALUES_HPP
+#define SPFFT_TEST_CHECK_VALUES_HPP
+
+#include <cassert>
+#include <complex>
+#include <vector>
+#include "gtest/gtest.h"
+#include "memory/host_array_view.hpp"
+#include "spfft/config.h"
+
+namespace spfft {
+
+inline void check_c2c_space_domain(const HostArrayView3D<std::complex<double>>& realView,
+                                   const HostArrayView3D<std::complex<double>>& fftwView,
+                                   const SizeType planeOffset, const SizeType numLocalXYPlanes) {
+  for (SizeType z = 0; z < numLocalXYPlanes; ++z) {
+    for (SizeType x = 0; x < fftwView.dim_outer(); ++x) {
+      for (SizeType y = 0; y < fftwView.dim_mid(); ++y) {
+        ASSERT_NEAR(realView(z, y, x).real(), fftwView(x, y, z + planeOffset).real(), 1e-6);
+        ASSERT_NEAR(realView(z, y, x).imag(), fftwView(x, y, z + planeOffset).imag(), 1e-6);
+      }
+    }
+  }
+}
+
+inline void check_r2c_space_domain(const HostArrayView3D<double>& realView,
+                                   const HostArrayView3D<std::complex<double>>& fftwView,
+                                   const SizeType planeOffset, const SizeType numLocalXYPlanes) {
+  for (SizeType z = 0; z < numLocalXYPlanes; ++z) {
+    for (SizeType x = 0; x < fftwView.dim_outer(); ++x) {
+      for (SizeType y = 0; y < fftwView.dim_mid(); ++y) {
+        ASSERT_NEAR(realView(z, y, x), fftwView(x, y, z + planeOffset).real(), 1e-6);
+      }
+    }
+  }
+}
+
+inline void check_freq_domain(const std::vector<std::complex<double>>& freqValues,
+                              const HostArrayView3D<std::complex<double>>& fftwView,
+                              const std::vector<int>& indices) {
+  assert(indices.size() == freqValues.size() * 3);
+
+  for (SizeType i = 0; i < freqValues.size(); ++i) {
+    int x = indices[i * 3];
+    int y = indices[i * 3 + 1];
+    int z = indices[i * 3 + 2];
+    if (x < 0) x = fftwView.dim_outer() + x;
+    if (y < 0) y = fftwView.dim_mid() + y;
+    if (z < 0) z = fftwView.dim_inner() + z;
+    ASSERT_NEAR(freqValues[i].real(), fftwView(x, y, z).real(), 1e-6);
+    ASSERT_NEAR(freqValues[i].imag(), fftwView(x, y, z).imag(), 1e-6);
+  }
+}
+
+}  // namespace spfft
+
+#endif
+
diff --git a/tests/test_util/test_transform.hpp b/tests/test_util/test_transform.hpp
new file mode 100644
index 0000000..a256914
--- /dev/null
+++ b/tests/test_util/test_transform.hpp
@@ -0,0 +1,278 @@
+#ifndef SPFFT_TEST_TRANSFORM_HPP
+#define SPFFT_TEST_TRANSFORM_HPP
+
+#include <algorithm>
+#include <complex>
+#include <fftw3.h>
+#include <memory>
+#include <random>
+#include <tuple>
+#include <vector>
+#include "gtest/gtest.h"
+#include "memory/array_view_utility.hpp"
+#include "memory/host_array.hpp"
+#include "memory/host_array_view.hpp"
+#include "parameters/parameters.hpp"
+#include "spfft/grid.hpp"
+#include "spfft/transform.hpp"
+#include "test_util/test_check_values.hpp"
+#include "test_util/generate_indices.hpp"
+#include "util/common_types.hpp"
+
+#if defined(SPFFT_CUDA) || defined(SPFFT_ROCM)
+#include "gpu_util/gpu_fft_api.hpp"
+#include "gpu_util/gpu_transfer.hpp"
+#include "memory/gpu_array.hpp"
+#endif
+
+using namespace spfft;
+class TransformTest
+    : public ::testing::TestWithParam<
+          std::tuple<SpfftExchangeType, SpfftProcessingUnitType, int, int, int, bool>> {
+protected:
+  TransformTest()
+      : dimX_(std::get<2>(GetParam())),
+        dimY_(std::get<3>(GetParam())),
+        dimZ_(std::get<4>(GetParam())),
+        fftwArray_(dimX_ * dimY_ * dimZ_),
+        fftwView_(create_3d_view(fftwArray_, 0, dimX_, dimY_, dimZ_)),
+        centeredIndices_(std::get<5>(GetParam())) {
+    // initialize fftw plans
+    fftwPlanBackward_ =
+        fftw_plan_dft_3d(dimX_, dimY_, dimZ_, (fftw_complex*)fftwArray_.data(),
+                         (fftw_complex*)fftwArray_.data(), FFTW_BACKWARD, FFTW_ESTIMATE);
+    fftwPlanForward_ =
+        fftw_plan_dft_3d(dimX_, dimY_, dimZ_, (fftw_complex*)fftwArray_.data(),
+                         (fftw_complex*)fftwArray_.data(), FFTW_FORWARD, FFTW_ESTIMATE);
+  }
+
+  inline auto test_backward_c2c(const std::vector<double>& zStickDistribution,
+                                const std::vector<double>& xyPlaneDistribution) -> void;
+
+  inline auto test_forward_c2c(const std::vector<double>& zStickDistribution,
+                               const std::vector<double>& xyPlaneDistribution) -> void;
+
+  inline auto test_r2c(const std::vector<double>& xyPlaneDistribution) -> void;
+
+  virtual auto comm_rank() -> SizeType { return 0; }
+
+  virtual auto comm_size() -> SizeType { return 1; }
+
+  virtual auto grid() -> Grid& = 0;
+
+  ~TransformTest() override {
+    if (fftwPlanBackward_) fftw_destroy_plan(fftwPlanBackward_);
+    if (fftwPlanForward_) fftw_destroy_plan(fftwPlanForward_);
+    fftwPlanBackward_ = nullptr;
+    fftwPlanForward_ = nullptr;
+  }
+
+  int dimX_, dimY_, dimZ_;
+  HostArray<std::complex<double>> fftwArray_;
+  HostArrayView3D<std::complex<double>> fftwView_;
+  fftw_plan fftwPlanBackward_ = nullptr;
+  fftw_plan fftwPlanForward_ = nullptr;
+  bool centeredIndices_;
+};
+
+auto TransformTest::test_backward_c2c(const std::vector<double>& zStickDistribution,
+                                      const std::vector<double>& xyPlaneDistribution) -> void {
+  std::mt19937 randGen(42);
+  std::uniform_real_distribution<double> uniformRandDis(0.0, 1.0);
+  auto valueIndicesPerRank =
+      create_value_indices(randGen, zStickDistribution, 0.7, 0.7, dimX_, dimY_, dimZ_, false);
+  const int numLocalXYPlanes =
+      calculate_num_local_xy_planes(comm_rank(), dimZ_, xyPlaneDistribution);
+
+  // assign values to fftw input
+  for (const auto& valueIndices : valueIndicesPerRank) {
+    for (std::size_t i = 0; i < valueIndices.size(); i += 3) {
+      fftwView_(valueIndices[i], valueIndices[i + 1], valueIndices[i + 2]) =
+          std::complex<double>(uniformRandDis(randGen), uniformRandDis(randGen));
+    }
+  }
+
+  // extract local rank values
+  std::vector<std::complex<double>> values(valueIndicesPerRank[comm_rank()].size() / 3);
+  for (std::size_t i = 0; i < values.size(); ++i) {
+    const auto x = valueIndicesPerRank[comm_rank()][i * 3];
+    const auto y = valueIndicesPerRank[comm_rank()][i * 3 + 1];
+    const auto z = valueIndicesPerRank[comm_rank()][i * 3 + 2];
+    values[i] = fftwView_(x, y, z);
+  }
+
+  if (centeredIndices_) {
+    center_indices(dimX_, dimY_, dimZ_, valueIndicesPerRank);
+  }
+
+  auto transform = grid().create_transform(
+      std::get<1>(GetParam()), SpfftTransformType::SPFFT_TRANS_C2C, dimX_, dimY_, dimZ_,
+      numLocalXYPlanes, values.size(), SpfftIndexFormatType::SPFFT_INDEX_TRIPLETS,
+      valueIndicesPerRank[comm_rank()].data());
+
+  HostArrayView3D<std::complex<double>> realView(
+      reinterpret_cast<std::complex<double>*>(
+          transform.space_domain_data(SpfftProcessingUnitType::SPFFT_PU_HOST)),
+      numLocalXYPlanes, dimY_, dimX_, false);
+
+  fftw_execute(fftwPlanBackward_);
+
+#if defined(SPFFT_CUDA) || defined(SPFFT_ROCM)
+  if (std::get<1>(GetParam()) == SpfftProcessingUnitType::SPFFT_PU_GPU) {
+    // copy frequency values to GPU
+    GPUArray<typename gpu::fft::ComplexType<double>::type> valuesGPU(values.size());
+    copy_to_gpu(values, valuesGPU);
+
+    // transform
+    transform.backward(reinterpret_cast<double*>(valuesGPU.data()),
+                       SpfftProcessingUnitType::SPFFT_PU_GPU);
+    // run twice to ensure memory is zeroed correctly
+    transform.backward(reinterpret_cast<double*>(valuesGPU.data()),
+                       SpfftProcessingUnitType::SPFFT_PU_GPU);
+
+    // use transform buffer to copy values
+    GPUArrayView3D<typename gpu::fft::ComplexType<double>::type> realViewGPU(
+        reinterpret_cast<typename gpu::fft::ComplexType<double>::type*>(
+            transform.space_domain_data(SpfftProcessingUnitType::SPFFT_PU_GPU)),
+        numLocalXYPlanes, dimY_, dimX_, false);
+    copy_from_gpu(realViewGPU, realView);
+  }
+#endif
+  if (std::get<1>(GetParam()) == SpfftProcessingUnitType::SPFFT_PU_HOST) {
+    transform.backward(reinterpret_cast<double*>(values.data()),
+                       SpfftProcessingUnitType::SPFFT_PU_HOST);
+    // run twice to ensure memory is zeroed correctly
+    transform.backward(reinterpret_cast<double*>(values.data()),
+                       SpfftProcessingUnitType::SPFFT_PU_HOST);
+  }
+  check_c2c_space_domain(realView, fftwView_, transform.local_z_offset(), numLocalXYPlanes);
+}
+
+auto TransformTest::test_forward_c2c(const std::vector<double>& zStickDistribution,
+                                     const std::vector<double>& xyPlaneDistribution) -> void {
+  std::mt19937 randGen(42);
+  std::uniform_real_distribution<double> uniformRandDis(0.0, 1.0);
+  auto valueIndicesPerRank =
+      create_value_indices(randGen, zStickDistribution, 0.7, 0.7, dimX_, dimY_, dimZ_, false);
+  const int numLocalXYPlanes =
+      calculate_num_local_xy_planes(comm_rank(), dimZ_, xyPlaneDistribution);
+
+  // assign values to fftw input
+  for (const auto& valueIndices : valueIndicesPerRank) {
+    for (std::size_t i = 0; i < valueIndices.size(); i += 3) {
+      fftwView_(valueIndices[i], valueIndices[i + 1], valueIndices[i + 2]) =
+          std::complex<double>(uniformRandDis(randGen), uniformRandDis(randGen));
+    }
+  }
+
+  std::vector<std::complex<double>> freqValues(valueIndicesPerRank[comm_rank()].size() / 3);
+
+  if (centeredIndices_) {
+    center_indices(dimX_, dimY_, dimZ_, valueIndicesPerRank);
+  }
+
+  auto transform = grid().create_transform(
+      std::get<1>(GetParam()), SpfftTransformType::SPFFT_TRANS_C2C, dimX_, dimY_, dimZ_,
+      numLocalXYPlanes, freqValues.size(), SpfftIndexFormatType::SPFFT_INDEX_TRIPLETS,
+      valueIndicesPerRank[comm_rank()].data());
+
+  HostArrayView3D<std::complex<double>> realView(
+      reinterpret_cast<std::complex<double>*>(
+          transform.space_domain_data(SpfftProcessingUnitType::SPFFT_PU_HOST)),
+      numLocalXYPlanes, dimY_, dimX_, false);
+
+  fftw_execute(fftwPlanBackward_);
+
+  // copy space domain values from fftw buffer
+  const auto zOffset = transform.local_z_offset();
+  for (int z = 0; z < numLocalXYPlanes; ++z) {
+    for (int y = 0; y < dimY_; ++y) {
+      for (int x = 0; x < dimX_; ++x) {
+        realView(z, y, x) = fftwView_(x, y, z + zOffset);
+      }
+    }
+  }
+
+  fftw_execute(fftwPlanForward_);
+
+#if defined(SPFFT_CUDA) || defined(SPFFT_ROCM)
+  if (std::get<1>(GetParam()) == SpfftProcessingUnitType::SPFFT_PU_GPU) {
+    // use transform buffer to copy values
+    GPUArrayView3D<typename gpu::fft::ComplexType<double>::type> realViewGPU(
+        reinterpret_cast<typename gpu::fft::ComplexType<double>::type*>(
+            transform.space_domain_data(SpfftProcessingUnitType::SPFFT_PU_GPU)),
+        numLocalXYPlanes, dimY_, dimX_, false);
+    copy_to_gpu(realView, realViewGPU);
+
+    GPUArray<typename gpu::fft::ComplexType<double>::type> freqValuesGPU(freqValues.size());
+    transform.forward(SpfftProcessingUnitType::SPFFT_PU_GPU,
+                      reinterpret_cast<double*>(freqValuesGPU.data()));
+    copy_from_gpu(freqValuesGPU, freqValues);
+  }
+#endif
+  if (std::get<1>(GetParam()) == SpfftProcessingUnitType::SPFFT_PU_HOST) {
+    transform.forward(SpfftProcessingUnitType::SPFFT_PU_HOST,
+                      reinterpret_cast<double*>(freqValues.data()));
+  }
+
+  check_freq_domain(freqValues, fftwView_, valueIndicesPerRank[comm_rank()]);
+}
+
+auto TransformTest::test_r2c(const std::vector<double>& xyPlaneDistribution) -> void {
+  std::mt19937 randGen(42);
+  std::uniform_real_distribution<double> uniformRandDis(0.0, 1.0);
+
+  // create full set of global z-sticks (up to dimX_ / 2 + 1, due to symmetry)
+  std::vector<double> zStickDistribution(xyPlaneDistribution.size(), 1.0);
+  auto valueIndicesPerRank =
+      create_value_indices(randGen, zStickDistribution, 1.0, 1.0, dimX_, dimY_, dimZ_, true);
+  const int numLocalXYPlanes =
+      calculate_num_local_xy_planes(comm_rank(), dimZ_, xyPlaneDistribution);
+
+  // assign values to fftw input
+  for (const auto& valueIndices : valueIndicesPerRank) {
+    for (std::size_t i = 0; i < valueIndices.size(); i += 3) {
+      fftwView_(valueIndices[i], valueIndices[i + 1], valueIndices[i + 2]) =
+          std::complex<double>(uniformRandDis(randGen), 0.0);
+    }
+  }
+
+  std::vector<std::complex<double>> freqValues(valueIndicesPerRank[comm_rank()].size() / 3);
+
+  if (centeredIndices_) {
+    center_indices(dimX_, dimY_, dimZ_, valueIndicesPerRank);
+  }
+
+  auto transform = grid().create_transform(
+      std::get<1>(GetParam()), SpfftTransformType::SPFFT_TRANS_R2C, dimX_, dimY_, dimZ_,
+      numLocalXYPlanes, freqValues.size(), SpfftIndexFormatType::SPFFT_INDEX_TRIPLETS,
+      valueIndicesPerRank[comm_rank()].data());
+
+  HostArrayView3D<double> realView(
+      transform.space_domain_data(SpfftProcessingUnitType::SPFFT_PU_HOST), numLocalXYPlanes, dimY_,
+      dimX_, false);
+
+  // copy space domain values from fftw buffer
+  const auto zOffset = transform.local_z_offset();
+  for (int z = 0; z < numLocalXYPlanes; ++z) {
+    for (int y = 0; y < dimY_; ++y) {
+      for (int x = 0; x < dimX_; ++x) {
+        realView(z, y, x) = fftwView_(x, y, z + zOffset).real();
+      }
+    }
+  }
+
+  // check forward
+  transform.forward(SpfftProcessingUnitType::SPFFT_PU_HOST,
+                    reinterpret_cast<double*>(freqValues.data()));
+  fftw_execute(fftwPlanForward_);
+  check_freq_domain(freqValues, fftwView_, valueIndicesPerRank[comm_rank()]);
+
+  // check backward
+  transform.backward(reinterpret_cast<double*>(freqValues.data()),
+                     SpfftProcessingUnitType::SPFFT_PU_HOST);
+  fftw_execute(fftwPlanBackward_);
+  check_r2c_space_domain(realView, fftwView_, transform.local_z_offset(), numLocalXYPlanes);
+}
+
+#endif
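
For reference, a minimal sketch of how the TransformTest fixture above is meant to be consumed (this sketch is not part of the diff): a single-process test case can derive from TransformTest, own a Grid built from the tuple parameters, and be instantiated over them. The fixture name TransformLocalTest and the chosen parameter values are hypothetical; the Grid constructor follows the single-node form used in tests/programs/main.cpp, and INSTANTIATE_TEST_CASE_P matches the googletest vintage of this code base.

class TransformLocalTest : public TransformTest {
protected:
  TransformLocalTest()
      : TransformTest(),
        // single-node Grid, as in main.cpp: dimX_ * dimY_ is the maximum number of z-sticks
        grid_(dimX_, dimY_, dimZ_, dimX_ * dimY_, std::get<1>(GetParam()), -1) {}

  auto grid() -> Grid& override { return grid_; }

  Grid grid_;
};

TEST_P(TransformLocalTest, BackwardC2C) {
  // single rank: trivial distributions of z-sticks and xy-planes
  std::vector<double> zStickDistribution(comm_size(), 1.0);
  std::vector<double> xyPlaneDistribution(comm_size(), 1.0);
  test_backward_c2c(zStickDistribution, xyPlaneDistribution);
}

INSTANTIATE_TEST_CASE_P(
    Local, TransformLocalTest,
    ::testing::Combine(::testing::Values(SpfftExchangeType::SPFFT_EXCH_DEFAULT),
                       ::testing::Values(SpfftProcessingUnitType::SPFFT_PU_HOST),
                       ::testing::Values(7, 8), ::testing::Values(7, 8),
                       ::testing::Values(7, 8), ::testing::Values(false, true)));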