scverse · Intron7 · Sep 11, 2025 · Sep 11, 2025 · Sep 11, 2025 · Sep 11, 2025
diff --git a/.clang-format b/.clang-format
@@ -0,0 +1,22 @@
+BasedOnStyle: Google
+Language: Cpp
+
+# Keep opening braces on the same line as the statement or declaration
+BreakBeforeBraces: Attach
+AllowShortFunctionsOnASingleLine: None
+
+# Bin-pack parameters and arguments: fill each line instead of one per line
+BinPackParameters: true
+BinPackArguments: true
+
+# Typical CUDA/C++ ergonomics
+IndentWidth: 2
+ColumnLimit: 100
+PointerAlignment: Left
+DerivePointerAlignment: false
+
+# Leave #include order untouched to avoid unrelated churn from reordering
+SortIncludes: false
+
+# Always put a line break after a template<...> declaration
+AlwaysBreakTemplateDeclarations: Yes
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
@@ -1,36 +1,95 @@
-# This workflow will upload a Python Package using Twine when a release is created
-# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
+# https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
+# https://github.com/pypa/cibuildwheel/blob/main/examples/github-deploy.yml
 
-# This workflow uses actions that are not certified by GitHub.
-# They are provided by a third-party and are governed by
-# separate terms of service, privacy policy, and support
-# documentation.
-
-name: Upload Python Package
+name: Build and upload to PyPI
 
 on:
+  workflow_dispatch:
+  pull_request:
+  push:
+    branches: [main]
   release:
     types: [published]
 
 jobs:
-  deploy:
+  build_wheels:
+    name: Build wheels for ${{ matrix.os }}
+    runs-on: ${{ matrix.runs-on }}
+    strategy:
+      matrix:
+        include:
+          - os: linux-intel
+            runs-on: ubuntu-latest
+            cibw_image: "ghcr.io/scverse/rapids_singlecell:manylinux_2_28_x86_64_cuda12.9"
+            dockerfile: "docker/manylinux_2_28_x86_64_cuda12.9.Dockerfile"
+          - os: linux-arm
+            runs-on: ubuntu-24.04-arm
+            cibw_image: "ghcr.io/scverse/rapids_singlecell:manylinux_2_28_aarch64_cuda12.9"
+            dockerfile: "docker/manylinux_2_28_aarch64_cuda12.9.Dockerfile"
+
+    steps:
+      - uses: actions/checkout@v5
+
+      - name: Build CUDA manylinux image
+        run: |
+          docker build -t "${{ matrix.cibw_image }}" -f "${{ matrix.dockerfile }}" docker
+
+      # cibuildwheel action (Linux-only wheels inside our custom manylinux+CUDA images)
+      - name: Build wheels (CUDA 12.9)
+        # NOTE(review): the action reference was garbled in this copy
+        # ("pypa/[email protected]"); restored as a pinned cibuildwheel tag — confirm the version.
+        uses: pypa/cibuildwheel@v3.1.4
+        env:
+          # Skip musllinux
+          CIBW_SKIP: '*-musllinux*'
+          # Point cibuildwheel to our CUDA manylinux images (per-arch);
+          # the image for the other architecture is left empty
+          CIBW_MANYLINUX_X86_64_IMAGE: ${{ matrix.os == 'linux-intel' && matrix.cibw_image || '' }}
+          CIBW_MANYLINUX_AARCH64_IMAGE: ${{ matrix.os == 'linux-arm' && matrix.cibw_image || '' }}
+          # Make CUDA visible inside the build container
+          # (folded scalar: the three assignments are joined by spaces into one line)
+          CIBW_ENVIRONMENT: >
+            CUDA_PATH=/usr/local/cuda
+            LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
+            PATH=/usr/local/cuda/bin:$PATH
+          # Tooling to build a nanobind/scikit-build-core extension
+          # (folded scalar: a single "pip install" command)
+          CIBW_BEFORE_BUILD: >
+            python -m pip install -U pip
+            scikit-build-core cmake ninja nanobind
+          # No runtime tests (CI has no GPU)
+          CIBW_TEST_SKIP: "*"
+          CIBW_TEST_COMMAND: ""
+          # Bundle redistributable CUDA libs & ensure manylinux compliance
+          # NOTE(review): this environment variable takes precedence over
+          # [tool.cibuildwheel.linux].repair-wheel-command in pyproject.toml, so the
+          # abi3audit check configured there does not run here — confirm that is intended.
+          CIBW_REPAIR_WHEEL_COMMAND: "auditwheel repair -w {dest_dir} {wheel}"
+          # Be somewhat chatty to see compile/link flags
+          CIBW_BUILD_VERBOSITY: "1"
+
+      - uses: actions/upload-artifact@v4
+        with:
+          name: cibw-wheels-${{ matrix.os }}-${{ strategy.job-index }}
+          path: ./wheelhouse/*.whl
 
+  build_sdist:
+    name: Build source distribution
     runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v5
+      - name: Build sdist
+        run: pipx run build --sdist
+      - uses: actions/upload-artifact@v4
+        with:
+          name: cibw-sdist
+          path: dist/*.tar.gz
 
+  upload_pypi:
+    needs: [build_wheels, build_sdist]
+    runs-on: ubuntu-latest
     environment: publish
-
     permissions:
       id-token: write
-
+    if: github.event_name == 'release' && github.event.action == 'published'
     steps:
-    - uses: actions/checkout@v4
-    - name: Set up Python
-      uses: actions/setup-python@v5
-      with:
-        python-version: '3.x'
-    - name: Install CLI tool
-      run: pip install build
-    - name: Build package
-      run: python -m build
-    - name: Publish package
-      uses: pypa/gh-action-pypi-publish@release/v1
+      - uses: actions/download-artifact@v5
+        with:
+          # unpacks all CIBW artifacts into dist/
+          pattern: cibw-*
+          path: dist
+          merge-multiple: true
+
+      - uses: pypa/gh-action-pypi-publish@release/v1
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -32,3 +32,9 @@ repos:
     -   id: codespell
         additional_dependencies:
         - tomli
+-   repo: https://github.com/pre-commit/mirrors-clang-format
+    rev: v18.1.8
+    hooks:
+    -   id: clang-format
+        args: [--style=file, -i]
+        types_or: [c, c++, cuda]
diff --git a/.readthedocs.yml b/.readthedocs.yml
@@ -6,14 +6,15 @@ build:
   os: ubuntu-24.04
   tools:
     python: "3.12"
+
   commands:
     # Install and set up uv
     - asdf plugin add uv
     - asdf install uv latest
     - asdf global uv latest
 
     # Use uv to synchronize dependencies
-    - uv pip install --system .[doc]
+    - CMAKE_ARGS="-DRSC_BUILD_EXTENSIONS=OFF" uv pip install --system ".[doc]"
 
     # Build documentation using sphinx
     - python -m sphinx -T -b html -d docs/_build/doctrees -D language=en docs $READTHEDOCS_OUTPUT/html

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -0,0 +1,67 @@
+cmake_minimum_required(VERSION 3.24)
+
+project(rapids_singlecell_cuda LANGUAGES CXX)
+
+# Option to disable building compiled extensions (for docs/RTD)
+option(RSC_BUILD_EXTENSIONS "Build CUDA/C++ extensions" ON)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+
+if (RSC_BUILD_EXTENSIONS)
+  enable_language(CUDA)
+  find_package(Python REQUIRED COMPONENTS Interpreter Development.Module ${SKBUILD_SABI_COMPONENT})
+  find_package(nanobind CONFIG REQUIRED)
+  find_package(CUDAToolkit REQUIRED)
+else()
+  message(STATUS "RSC_BUILD_EXTENSIONS=OFF -> skipping compiled extensions for docs")
+endif()
+
+# Helper to declare a nanobind CUDA module uniformly.
+#   target: name of the extension module target to create
+#   src:    source file compiled into the module
+# Does nothing when RSC_BUILD_EXTENSIONS is OFF.
+function(add_nb_cuda_module target src)
+  if (RSC_BUILD_EXTENSIONS)
+    # Create the extension via nanobind with the STABLE_ABI and LTO options
+    nanobind_add_module(${target} STABLE_ABI LTO
+        ${src}
+    )
+    # Link privately against the CUDA runtime target
+    target_link_libraries(${target} PRIVATE CUDA::cudart)
+    set_target_properties(${target} PROPERTIES
+        CUDA_SEPARABLE_COMPILATION ON
+    )
+    # Install the built module into the rapids_singlecell/_cuda package directory
+    install(TARGETS ${target} LIBRARY DESTINATION rapids_singlecell/_cuda)
+    # Also copy built module into source tree for editable installs
+    add_custom_command(TARGET ${target} POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E copy
+            $<TARGET_FILE:${target}>
+            ${PROJECT_SOURCE_DIR}/src/rapids_singlecell/_cuda/$<TARGET_FILE_NAME:${target}>
+    )
+  endif()
+endfunction()
+
+if (RSC_BUILD_EXTENSIONS)
+  # CUDA modules
+  add_nb_cuda_module(_mean_var_cuda     src/rapids_singlecell/_cuda/mean_var/mean_var.cu)
+  add_nb_cuda_module(_sparse2dense_cuda src/rapids_singlecell/_cuda/sparse2dense/sparse2dense.cu)
+  add_nb_cuda_module(_scale_cuda        src/rapids_singlecell/_cuda/scale/scale.cu)
+  add_nb_cuda_module(_qc_cuda           src/rapids_singlecell/_cuda/qc/qc.cu)
+  add_nb_cuda_module(_qc_dask_cuda      src/rapids_singlecell/_cuda/qc_dask/qc_kernels_dask.cu)
+  add_nb_cuda_module(_bbknn_cuda        src/rapids_singlecell/_cuda/bbknn/bbknn.cu)
+  add_nb_cuda_module(_norm_cuda         src/rapids_singlecell/_cuda/norm/norm.cu)
+  add_nb_cuda_module(_pr_cuda           src/rapids_singlecell/_cuda/pr/pr.cu)
+  add_nb_cuda_module(_nn_descent_cuda   src/rapids_singlecell/_cuda/nn_descent/nn_descent.cu)
+  add_nb_cuda_module(_aucell_cuda       src/rapids_singlecell/_cuda/aucell/aucell.cu)
+  add_nb_cuda_module(_nanmean_cuda      src/rapids_singlecell/_cuda/nanmean/nanmean.cu)
+  add_nb_cuda_module(_autocorr_cuda     src/rapids_singlecell/_cuda/autocorr/autocorr.cu)
+  add_nb_cuda_module(_cooc_cuda         src/rapids_singlecell/_cuda/cooc/cooc.cu)
+  add_nb_cuda_module(_aggr_cuda         src/rapids_singlecell/_cuda/aggr/aggr.cu)
+  add_nb_cuda_module(_spca_cuda         src/rapids_singlecell/_cuda/spca/spca.cu)
+  add_nb_cuda_module(_ligrec_cuda       src/rapids_singlecell/_cuda/ligrec/ligrec.cu)
+  add_nb_cuda_module(_pv_cuda           src/rapids_singlecell/_cuda/pv/pv.cu)
+  # Harmony CUDA modules
+  add_nb_cuda_module(_harmony_scatter_cuda   src/rapids_singlecell/_cuda/harmony/scatter/scatter.cu)
+  add_nb_cuda_module(_harmony_outer_cuda     src/rapids_singlecell/_cuda/harmony/outer/outer.cu)
+  add_nb_cuda_module(_harmony_colsum_cuda    src/rapids_singlecell/_cuda/harmony/colsum/colsum.cu)
+  add_nb_cuda_module(_harmony_kmeans_cuda    src/rapids_singlecell/_cuda/harmony/kmeans/kmeans.cu)
+  add_nb_cuda_module(_harmony_normalize_cuda src/rapids_singlecell/_cuda/harmony/normalize/normalize.cu)
+  add_nb_cuda_module(_harmony_pen_cuda       src/rapids_singlecell/_cuda/harmony/pen/pen.cu)
+endif()
diff --git a/docker/manylinux_2_28_aarch64_cuda12.9.Dockerfile b/docker/manylinux_2_28_aarch64_cuda12.9.Dockerfile
@@ -0,0 +1,18 @@
+FROM quay.io/pypa/manylinux_2_28_aarch64
+
+RUN yum -y install dnf-plugins-core && \
+    dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo && \
+    yum -y clean all && yum -y makecache && \
+    yum -y install \
+        cuda-nvcc-12-9 \
+        cuda-cudart-12-9 \
+        cuda-cudart-devel-12-9 \
+        libcublas-12-9 \
+        libcublas-devel-12-9 \
+        libcusparse-12-9 \
+        libcusparse-devel-12-9 && \
+    yum clean all
+
+ENV CUDA_HOME=/usr/local/cuda
+ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:${LD_LIBRARY_PATH}
+ENV PATH=/usr/local/cuda/bin:${PATH}
diff --git a/docker/manylinux_2_28_x86_64_cuda12.9.Dockerfile b/docker/manylinux_2_28_x86_64_cuda12.9.Dockerfile
@@ -0,0 +1,20 @@
+FROM quay.io/pypa/manylinux_2_28_x86_64
+
+# Add NVIDIA CUDA repo (RHEL8/Alma8 base in manylinux_2_28)
+RUN yum -y install dnf-plugins-core && \
+    dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo && \
+    yum -y clean all && yum -y makecache && \
+    # Install only what you actually link against
+    yum -y install \
+      cuda-nvcc-12-9 \
+      cuda-cudart-12-9 \
+      cuda-cudart-devel-12-9 \
+      libcublas-12-9 \
+      libcublas-devel-12-9 \
+      libcusparse-12-9 \
+      libcusparse-devel-12-9 && \
+    yum clean all
+
+ENV CUDA_HOME=/usr/local/cuda
+ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:${LD_LIBRARY_PATH}
+ENV PATH=/usr/local/cuda/bin:${PATH}
diff --git a/docs/release-notes/0.13.3.md → docs/release-notes/0.14.0.md b/docs/release-notes/0.13.3.md → docs/release-notes/0.14.0.md
@@ -1,8 +1,8 @@
-### 0.13.3 {small}`the-future`
+### 0.14.0 {small}`the-future`
 
 ```{rubric} Features
 ```
-
+* convert all `cupy.RawKernel`s into compiled CUDA extensions built with nanobind {pr}`455` {smaller}`S Dicks & P Angerer`
 
 ```{rubric} Performance
 ```

diff --git a/docs/release-notes/index.md b/docs/release-notes/index.md
@@ -2,9 +2,11 @@
 
 # Release notes
 
-## Version 0.13.0
-```{include} /release-notes/0.13.3.md
+## Version 0.14.0
+```{include} /release-notes/0.14.0.md
 ```
+
+## Version 0.13.0
 ```{include} /release-notes/0.13.2.md
 ```
 ```{include} /release-notes/0.13.1.md

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,11 +1,16 @@
 [build-system]
-requires = [ "hatchling", "hatch-vcs" ]
-build-backend = "hatchling.build"
+requires = [
+    "scikit-build-core>=0.10",
+    "nanobind>=2.0.0",
+    "pybind11-stubgen",
+    "setuptools-scm>=8",
+]
+build-backend = "scikit_build_core.build"
 
 [project]
 name = "rapids_singlecell"
 description = "running single cell analysis on Nvidia GPUs"
-requires-python = ">=3.11, <3.14"
+requires-python = ">=3.12, <3.14"
 license = { file = "LICENSE" }
 authors = [ { name = "Severin Dicks" } ]
 readme = { file = "README.md", content-type = "text/markdown" }
@@ -107,21 +112,41 @@ markers = [
     "gpu: tests that use a GPU (currently unused, but needs to be specified here as we import anndata.tests.helpers, which uses it)",
 ]
 
-[tool.hatch.build]
-# exclude big files that don’t need to be installed
-exclude = [
-    "tests",
-    "docs",
-    "notebooks",
-]
-[tool.hatch.build.hooks.vcs]
-version-file = "src/rapids_singlecell/_version.py"
+[tool.setuptools_scm]
+write_to = "src/rapids_singlecell/_version.py"
+# Optional: these match the setuptools-scm defaults and are stated explicitly
+version_scheme = "guess-next-dev"
+local_scheme = "node-and-date"
 
-[tool.hatch.version]
-source = "vcs"
+[tool.scikit-build]
+# Use limited ABI wheels (one wheel for all Python minor versions on one platform)
+wheel.py-api = "cp312"
+wheel.packages = [ "src/rapids_singlecell", "src/testing" ]
+cmake.version = ">=3.24"
+cmake.build-type = "Release"
+ninja.version = ">=1.10"
+experimental = false
+cmake.args = [ "-DCMAKE_CUDA_ARCHITECTURES=75;80;86;89;90;100;120" ]
+build-dir = "build"
+metadata.version.provider = "scikit_build_core.metadata.setuptools_scm"
+sdist.include = [ "src/rapids_singlecell/_version.py" ]
 
-[tool.hatch.build.targets.wheel]
-packages = [ 'src/rapids_singlecell', 'src/testing' ]
+# Use abi3audit to catch issues with Limited API wheels
+[tool.cibuildwheel.linux]
+repair-wheel-command = [
+    "auditwheel repair -w {dest_dir} {wheel}",
+    "pipx run abi3audit --strict --report {wheel}",
+]
+[tool.cibuildwheel.macos]
+repair-wheel-command = [
+    "delocate-wheel --require-archs {delocate_archs} -w {dest_dir} -v {wheel}",
+    "pipx run abi3audit --strict --report {wheel}",
+]
+[tool.cibuildwheel.windows]
+repair-wheel-command = [
+    "copy {wheel} {dest_dir}",
+    "pipx run abi3audit --strict --report {wheel}",
+]
 
 [tool.codespell]
 skip = '*.ipynb,*.csv'

diff --git a/src/rapids_singlecell/_cuda/__init__.py b/src/rapids_singlecell/_cuda/__init__.py
@@ -0,0 +1,3 @@
+from __future__ import annotations
+
+# Subpackage for CUDA extensions (built via scikit-build-core/nanobind)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		from __future__ import annotations

		# Subpackage for CUDA extensions (built via scikit-build-core/nanobind)