devitocodes · ggorman · Oct 1, 2025 · Oct 2, 2025
diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml
@@ -0,0 +1,8 @@
+# actionlint configuration file
+# https://github.com/rhysd/actionlint/blob/main/docs/config.md
+
+self-hosted-runner:
+  # Custom labels for self-hosted runners
+  labels:
+    - nvidiagpu  # Self-hosted runner for NVIDIA GPU builds
+    - amdgpu     # Self-hosted runner for AMD GPU builds
diff --git a/.github/workflows/docker-bases.yml b/.github/workflows/docker-bases.yml
@@ -16,6 +16,9 @@
       cpu:
         type: boolean
         default: false
+      arm64:
+        type: boolean
+        default: false
       nvidia:
         type: boolean
         default: false
@@ -34,51 +37,84 @@
 
 jobs:
   #######################################################
-  ############## Basic gcc CPU ##########################
+  ############## GCC Multi-Architecture Base ############
   #######################################################
-  deploy-cpu-bases:
-    if: inputs.cpu
-    name: "cpu-base"
-    runs-on: ubuntu-latest
+  deploy-gcc-bases:
+    if: inputs.cpu || inputs.arm64
+    name: "gcc-${{ matrix.arch }}-${{ matrix.gcc || 'default' }}"
+    runs-on: ${{ matrix.runner }}
     env:
       DOCKER_BUILDKIT: "1"
 
     strategy:
       matrix:
-        gcc: ["", "14"]
+        include:
+          # x86_64 builds - both default and GCC 14
+          - arch: amd64
+            runner: ubuntu-22.04
+            platform: linux/amd64
+            tag_prefix: cpu
+            gcc: ""
+            input_flag: cpu
+          - arch: amd64
+            runner: ubuntu-22.04
+            platform: linux/amd64
+            tag_prefix: cpu
+            gcc: "14"
+            input_flag: cpu
+          # ARM64 build - default GCC only (building GCC from source is too slow)
+          - arch: arm64
+            runner: ubuntu-24.04-arm
+            platform: linux/arm64
+            tag_prefix: arm64
+            gcc: ""
+            input_flag: arm64
 
     steps:
+      - name: Check if should build  # gate: does this matrix entry match a requested input?
+        id: check  # later steps run only when steps.check.outputs.skip != 'true'
+        run: |
+          if [[ "${{ matrix.input_flag }}" == "cpu" && "${{ inputs.cpu }}" == "true" ]] || \
+             [[ "${{ matrix.input_flag }}" == "arm64" && "${{ inputs.arm64 }}" == "true" ]]; then
+            echo "skip=false" >> "$GITHUB_OUTPUT"  # entry was requested: run the remaining steps
+          else
+            echo "skip=true" >> "$GITHUB_OUTPUT"  # entry was not requested: remaining steps are skipped
+          fi
+
       - name: Checkout devito
+        if: steps.check.outputs.skip != 'true'
         uses: actions/checkout@v5
 
       - name: Check event name
+        if: steps.check.outputs.skip != 'true'
         run: echo ${{ github.event_name }}
 
-      - name: Set up QEMU
-        uses: docker/setup-qemu-action@v3
-
       - name: Set up Docker Buildx
+        if: steps.check.outputs.skip != 'true'
         uses: docker/setup-buildx-action@v3
 
       - name: Login to DockerHub
+        if: steps.check.outputs.skip != 'true'
         uses: docker/login-action@v3
         with:
           username: ${{ secrets.DOCKER_USERNAME }}
           password: ${{ secrets.DOCKER_PASSWORD }}
 
-      - name: GCC image
+      - name: Build and push GCC image
+        if: steps.check.outputs.skip != 'true'
         uses: docker/build-push-action@v6
         with:
           context: .
           file: "./docker/Dockerfile.cpu"
           push: true
+          platforms: ${{ matrix.platform }}
           build-args: "gcc=${{ matrix.gcc }}"
-          tags: "devitocodes/bases:cpu-gcc${{ matrix.gcc }}"
+          tags: "devitocodes/bases:${{ matrix.tag_prefix }}-gcc${{ matrix.gcc }}"
 
   #######################################################
   ############## Intel OneApi CPU #######################
  #######################################################
  deploy-oneapi-bases:
    if: inputs.intel
    name: "oneapi-base"
    runs-on: ubuntu-latest

diff --git a/.github/workflows/docker-devito.yml b/.github/workflows/docker-devito.yml
@@ -46,6 +46,12 @@ jobs:
             test: 'tests/test_operator.py'
             runner: ubuntu-latest
 
+          - base: 'bases:arm64-gcc'
+            tag: 'arm64-gcc'
+            flag: '--init -t'
+            test: 'tests/test_operator.py'
+            runner: ubuntu-24.04-arm  # GitHub-hosted ARM64 runner (free for public repos)
+
     steps:
       - name: Checkout devito
         uses: actions/checkout@v5

diff --git a/.github/workflows/pytest-core-mpi.yml b/.github/workflows/pytest-core-mpi.yml
@@ -66,25 +66,32 @@ jobs:
       runs-on: ${{ matrix.os }}
       strategy:
         matrix:
-          name: [gcc, icx]
+          name: [gcc, icx, gcc-arm64]
           include:
             - name: gcc
               arch: gcc
-              os: ubuntu-latest
+              base_prefix: cpu
+              os: ubuntu-22.04
               mpiflag: ""
             - name: icx
               arch: icx
-              os: ubuntu-latest
+              base_prefix: cpu
+              os: ubuntu-22.04
               # Need safe math for icx due to inaccuracy with mpi+sinc interpolation
               mpiflag: "-e DEVITO_SAFE_MATH=1"
+            - name: gcc-arm64
+              arch: gcc
+              base_prefix: arm64
+              os: ubuntu-24.04-arm
+              mpiflag: ""
 
       steps:
       - name: Checkout devito
         uses: actions/checkout@v5
 
       - name: Build docker image
         run: |
-          docker build . --file docker/Dockerfile.devito --tag devito_img --build-arg base=devitocodes/bases:cpu-${{ matrix.arch }}
+          docker build . --file docker/Dockerfile.devito --tag devito_img --build-arg base=devitocodes/bases:${{ matrix.base_prefix }}-${{ matrix.arch }}
 
       - name: Test with pytest
         run: |

diff --git a/.github/workflows/pytest-core-nompi.yml b/.github/workflows/pytest-core-nompi.yml
@@ -39,7 +39,9 @@ jobs:
            pytest-osx-py312-clang-omp,
            pytest-docker-py310-gcc-omp,
            pytest-docker-py310-icx-omp,
-           pytest-ubuntu-py313-gcc14-omp
+           pytest-ubuntu-py313-gcc14-omp,
+           pytest-ubuntu-py312-gcc14-omp-arm64,
+           pytest-ubuntu-py311-gcc13-omp-arm64
         ]
         set: [base, adjoint]
         include:
@@ -113,6 +115,20 @@ jobs:
           language: "openmp"
           sympy: "1.14"
 
+        - name: pytest-ubuntu-py312-gcc14-omp-arm64
+          python-version: '3.12'
+          os: ubuntu-24.04-arm
+          arch: "gcc-14"
+          language: "openmp"
+          sympy: "1.14"
+
+        - name: pytest-ubuntu-py311-gcc13-omp-arm64
+          python-version: '3.11'
+          os: ubuntu-24.04-arm
+          arch: "gcc-13"
+          language: "openmp"
+          sympy: "1.13"
+
         - set: base
           test-set: 'not adjoint'
 

diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu
@@ -45,7 +45,7 @@ RUN cd /tmp && mkdir openmpi && \
     ../configure --prefix=/opt/openmpi/ \
                  --disable-mpi-fortran \
                  --enable-mca-no-build=btl-uct --enable-mpi1-compatibility && \
-    make -j ${nproc} && \
+    make -j $(nproc) && \
     make install && \
     cd /tmp && rm -rf /tmp/openmpi
 

diff --git a/docker/README.md b/docker/README.md
@@ -8,11 +8,12 @@ Devito provides several images that target different architectures and compilers
 
 ### [Devito] on CPU
 
-We provide two CPU images:
-- `devito:gcc-*` with the standard GNU gcc compiler.
-- `devito:icx-*` with the Intel C compiler for Intel architectures.
+We provide CPU images for different architectures and compilers:
+- `devito:gcc-*` with the standard GNU gcc compiler (x86_64).
+- `devito:icx-*` with the Intel C compiler for Intel architectures (x86_64).
+- `devito:arm64-gcc-*` with the standard GNU gcc compiler for ARM64 architectures (AWS Graviton, Apple Silicon, etc.).
 
-These images provide a working environment for any CPU architecture and come with [Devito], `gcc/icx` and `mpi` preinstalled, and utilities such as `jupyter` for usability and exploration of the package.
+These images provide a working environment for their respective CPU architectures and come with [Devito], `gcc/icx` and `mpi` preinstalled, and utilities such as `jupyter` for usability and exploration of the package.
 
 To run this image locally, you will first need to install `docker`. Then, the following commands will get you started:
 
@@ -38,6 +39,38 @@ In addition, the following legacy tags are available:
 - `devito:cpu-*` that corresponds to `devito:gcc-*`
 
 
+### [Devito] on ARM64
+
+We provide images built natively for ARM64 (aarch64) processors:
+- `devito:arm64-gcc-*` with the standard GNU gcc compiler for ARM64 architectures.
+
+These images support various ARM64 platforms including:
+- AWS Graviton2/3/4 instances
+- Apple Silicon (M1/M2/M3) via Docker Desktop
+- ARM-based cloud instances
+
+Devito automatically detects the specific ARM64 variant at runtime and applies appropriate optimizations through its JIT compiler.
+
+To run on ARM64 systems:
+
+```bash
+# Pull image and start a bash shell
+docker run --rm -it -p 8888:8888 devitocodes/devito:arm64-gcc-latest /bin/bash
+
+# or start a Jupyter notebook server on port 8888
+docker run --rm -it -p 8888:8888 devitocodes/devito:arm64-gcc-latest
+
+# Run an example
+docker run --rm -it devitocodes/devito:arm64-gcc-latest python examples/seismic/acoustic/acoustic_example.py
+```
+
+To run as the current user with the current directory mounted (for example, on AWS Graviton instances):
+
+```bash
+docker run --rm -it -v `pwd`:`pwd` -w `pwd` -u $(id -u):$(id -g) devitocodes/devito:arm64-gcc-latest python examples/seismic/acoustic/acoustic_example.py
+```
+
+
 ### [Devito] on GPU
 
 Second, we provide three images to run [Devito] on GPUs, tagged `devito:nvidia-nvc-*`, and `devito:amd-*`.

diff --git a/tests/test_interpolation.py b/tests/test_interpolation.py
@@ -1144,26 +1144,31 @@ def test_interpolate_subdomain_mpi(self, mode):
 
         op.apply()
 
-        if grid.distributor.myrank == 0:
-            assert np.all(np.isclose(sr0.data, [3.75, 0.]))
-            assert np.all(np.isclose(sr1.data, [0., 0.]))
-            assert np.all(np.isclose(sr2.data, [0., 0.]))
-            assert np.all(np.isclose(sr3.data, [0., 0.]))
-        elif grid.distributor.myrank == 1:
-            assert np.all(np.isclose(sr0.data, [0., 3.]))
-            assert np.all(np.isclose(sr1.data, [0., 0.]))
-            assert np.all(np.isclose(sr2.data, [0., 3.]))
-            assert np.all(np.isclose(sr3.data, [34., 0.]))
-        elif grid.distributor.myrank == 2:
-            assert np.all(np.isclose(sr0.data, [0., 0.]))
-            assert np.all(np.isclose(sr1.data, [0., 0.]))
-            assert np.all(np.isclose(sr2.data, [0., 16.5]))
-            assert np.all(np.isclose(sr3.data, [30., 0.]))
-        elif grid.distributor.myrank == 3:
-            assert np.all(np.isclose(sr0.data, [6.75, 0.]))
-            assert np.all(np.isclose(sr1.data, [0., 48.75]))
-            assert np.all(np.isclose(sr2.data, [0., 112.5]))
-            assert np.all(np.isclose(sr3.data, [0., 0.]))
+        # Expected values for all 8 sparse points, indexed by global point index.
+        # Serial reference results, so they do not depend on the MPI decomposition
+        expected_sr0 = np.array([3.75, 9., 0., 3., 0., 13.75, 6.75, 0.])
+        expected_sr1 = np.array([0., 0., 0., 0., 0., 30.25, 2.5, 63.75])
+        expected_sr2 = np.array([0., 0., 34., 3., 30., 60.5, 9.25, 127.5])
+        expected_sr3 = np.array([0., 0., 34., 0., 30., 0., 0., 0.])
+
+        # Get the sparse points owned by this rank
+        # _dist_datamap maps rank -> list of owned point indices
+        owned_points = sr0._dist_datamap.get(grid.distributor.myrank, [])
+
+        # Check that computed values match expected values for owned points
+        for i, point_idx in enumerate(owned_points):
+            assert np.isclose(sr0.data[i], expected_sr0[point_idx]), \
+                f"Rank {grid.distributor.myrank}: sr0.data[{i}] = {sr0.data[i]}, " \
+                f"expected {expected_sr0[point_idx]} for global point {point_idx}"
+            assert np.isclose(sr1.data[i], expected_sr1[point_idx]), \
+                f"Rank {grid.distributor.myrank}: sr1.data[{i}] = {sr1.data[i]}, " \
+                f"expected {expected_sr1[point_idx]} for global point {point_idx}"
+            assert np.isclose(sr2.data[i], expected_sr2[point_idx]), \
+                f"Rank {grid.distributor.myrank}: sr2.data[{i}] = {sr2.data[i]}, " \
+                f"expected {expected_sr2[point_idx]} for global point {point_idx}"
+            assert np.isclose(sr3.data[i], expected_sr3[point_idx]), \
+                f"Rank {grid.distributor.myrank}: sr3.data[{i}] = {sr3.data[i]}, " \
+                f"expected {expected_sr3[point_idx]} for global point {point_idx}"
 
     @pytest.mark.parallel(mode=4)
     def test_inject_subdomain_mpi(self, mode):