Skip to content

Conversation

PlaidCat
Copy link
Collaborator

THIS IS BREAKING AND NOT INTENDED TO BE SHIPPED

This is to test the PR checker and the builders

THIS IS BREAKING AND NOT INTENDED TO BE SHIPPED
@PlaidCat PlaidCat merged commit 935c85c into ctrliq:{jmaple}_ciqlts8_8 Jan 13, 2025
1 check passed
github-actions bot pushed a commit that referenced this pull request May 1, 2025
JIRA: https://issues.redhat.com/browse/RHEL-68940

commit 031af50
Author: Mark Rutland <[email protected]>
Date:   Wed, 4 Jan 2023 15:16:26 +0000

    arm64: cmpxchg_double*: hazard against entire exchange variable

    The inline assembly for arm64's cmpxchg_double*() implementations use a
    +Q constraint to hazard against other accesses to the memory location
    being exchanged. However, the pointer passed to the constraint is a
    pointer to unsigned long, and thus the hazard only applies to the first
    8 bytes of the location.

    GCC can take advantage of this, assuming that other portions of the
    location are unchanged, leading to a number of potential problems.

    This is similar to what we fixed back in commit:

      fee960b ("arm64: xchg: hazard against entire exchange variable")

    ... but we forgot to adjust cmpxchg_double*() similarly at the same
    time.

    The same problem applies, as demonstrated with the following test:

    | struct big {
    |         u64 lo, hi;
    | } __aligned(128);
    |
    | unsigned long foo(struct big *b)
    | {
    |         u64 hi_old, hi_new;
    |
    |         hi_old = b->hi;
    |         cmpxchg_double_local(&b->lo, &b->hi, 0x12, 0x34, 0x56, 0x78);
    |         hi_new = b->hi;
    |
    |         return hi_old ^ hi_new;
    | }

    ... which GCC 12.1.0 compiles as:

    | 0000000000000000 <foo>:
    |    0:   d503233f        paciasp
    |    4:   aa0003e4        mov     x4, x0
    |    8:   1400000e        b       40 <foo+0x40>
    |    c:   d2800240        mov     x0, #0x12                       // #18
    |   10:   d2800681        mov     x1, #0x34                       // #52
    |   14:   aa0003e5        mov     x5, x0
    |   18:   aa0103e6        mov     x6, x1
    |   1c:   d2800ac2        mov     x2, #0x56                       // #86
    |   20:   d2800f03        mov     x3, #0x78                       // #120
    |   24:   48207c82        casp    x0, x1, x2, x3, [x4]
    |   28:   ca050000        eor     x0, x0, x5
    |   2c:   ca060021        eor     x1, x1, x6
    |   30:   aa010000        orr     x0, x0, x1
    |   34:   d2800000        mov     x0, #0x0                        // #0    <--- BANG
    |   38:   d50323bf        autiasp
    |   3c:   d65f03c0        ret
    |   40:   d2800240        mov     x0, #0x12                       // #18
    |   44:   d2800681        mov     x1, #0x34                       // #52
    |   48:   d2800ac2        mov     x2, #0x56                       // #86
    |   4c:   d2800f03        mov     x3, #0x78                       // #120
    |   50:   f9800091        prfm    pstl1strm, [x4]
    |   54:   c87f1885        ldxp    x5, x6, [x4]
    |   58:   ca0000a5        eor     x5, x5, x0
    |   5c:   ca0100c6        eor     x6, x6, x1
    |   60:   aa0600a6        orr     x6, x5, x6
    |   64:   b5000066        cbnz    x6, 70 <foo+0x70>
    |   68:   c8250c82        stxp    w5, x2, x3, [x4]
    |   6c:   35ffff45        cbnz    w5, 54 <foo+0x54>
    |   70:   d2800000        mov     x0, #0x0                        // #0     <--- BANG
    |   74:   d50323bf        autiasp
    |   78:   d65f03c0        ret

    Notice that at the lines with "BANG" comments, GCC has assumed that the
    higher 8 bytes are unchanged by the cmpxchg_double() call, and that
    `hi_old ^ hi_new` can be reduced to a constant zero, for both LSE and
    LL/SC versions of cmpxchg_double().

    This patch fixes the issue by passing a pointer to __uint128_t into the
    +Q constraint, ensuring that the compiler hazards against the entire 16
    bytes being modified.

    With this change, GCC 12.1.0 compiles the above test as:

    | 0000000000000000 <foo>:
    |    0:   f9400407        ldr     x7, [x0, #8]
    |    4:   d503233f        paciasp
    |    8:   aa0003e4        mov     x4, x0
    |    c:   1400000f        b       48 <foo+0x48>
    |   10:   d2800240        mov     x0, #0x12                       // #18
    |   14:   d2800681        mov     x1, #0x34                       // #52
    |   18:   aa0003e5        mov     x5, x0
    |   1c:   aa0103e6        mov     x6, x1
    |   20:   d2800ac2        mov     x2, #0x56                       // #86
    |   24:   d2800f03        mov     x3, #0x78                       // #120
    |   28:   48207c82        casp    x0, x1, x2, x3, [x4]
    |   2c:   ca050000        eor     x0, x0, x5
    |   30:   ca060021        eor     x1, x1, x6
    |   34:   aa010000        orr     x0, x0, x1
    |   38:   f9400480        ldr     x0, [x4, #8]
    |   3c:   d50323bf        autiasp
    |   40:   ca0000e0        eor     x0, x7, x0
    |   44:   d65f03c0        ret
    |   48:   d2800240        mov     x0, #0x12                       // #18
    |   4c:   d2800681        mov     x1, #0x34                       // #52
    |   50:   d2800ac2        mov     x2, #0x56                       // #86
    |   54:   d2800f03        mov     x3, #0x78                       // #120
    |   58:   f9800091        prfm    pstl1strm, [x4]
    |   5c:   c87f1885        ldxp    x5, x6, [x4]
    |   60:   ca0000a5        eor     x5, x5, x0
    |   64:   ca0100c6        eor     x6, x6, x1
    |   68:   aa0600a6        orr     x6, x5, x6
    |   6c:   b5000066        cbnz    x6, 78 <foo+0x78>
    |   70:   c8250c82        stxp    w5, x2, x3, [x4]
    |   74:   35ffff45        cbnz    w5, 5c <foo+0x5c>
    |   78:   f9400480        ldr     x0, [x4, #8]
    |   7c:   d50323bf        autiasp
    |   80:   ca0000e0        eor     x0, x7, x0
    |   84:   d65f03c0        ret

    ... sampling the high 8 bytes before and after the cmpxchg, and
    performing an EOR, as we'd expect.

    For backporting, I've tested this atop linux-4.9.y with GCC 5.5.0. Note
    that linux-4.9.y is oldest currently supported stable release, and
    mandates GCC 5.1+. Unfortunately I couldn't get a GCC 5.1 binary to run
    on my machines due to library incompatibilities.

    I've also used a standalone test to check that we can use a __uint128_t
    pointer in a +Q constraint at least as far back as GCC 4.8.5 and LLVM
    3.9.1.

    Fixes: 5284e1b ("arm64: xchg: Implement cmpxchg_double")
    Fixes: e9a4b79 ("arm64: cmpxchg_dbl: patch in lse instructions when supported by the CPU")
    Reported-by: Boqun Feng <[email protected]>
    Link: https://lore.kernel.org/lkml/Y6DEfQXymYVgL3oJ@boqun-archlinux/
    Reported-by: Peter Zijlstra <[email protected]>
    Link: https://lore.kernel.org/lkml/[email protected]/
    Signed-off-by: Mark Rutland <[email protected]>
    Cc: [email protected]
    Cc: Arnd Bergmann <[email protected]>
    Cc: Catalin Marinas <[email protected]>
    Cc: Steve Capper <[email protected]>
    Cc: Will Deacon <[email protected]>
    Link: https://lore.kernel.org/r/[email protected]
    Signed-off-by: Will Deacon <[email protected]>

Signed-off-by: Waiman Long <[email protected]>
github-actions bot pushed a commit that referenced this pull request Sep 3, 2025
When there are memory-only nodes (nodes without CPUs), these nodes are not
properly initialized, causing kernel panic during boot.

of_numa_init
	of_numa_parse_cpu_nodes
		node_set(nid, numa_nodes_parsed);
	of_numa_parse_memory_nodes

In of_numa_parse_cpu_nodes, numa_nodes_parsed gets updated only for nodes
containing CPUs.  Memory-only nodes should have been updated in
of_numa_parse_memory_nodes, but they weren't.

Subsequently, when free_area_init() attempts to access NODE_DATA() for
these uninitialized memory nodes, the kernel panics due to NULL pointer
dereference.

This can be reproduced on ARM64 QEMU with 1 CPU and 2 memory nodes:

qemu-system-aarch64 \
-cpu host -nographic \
-m 4G -smp 1 \
-machine virt,accel=kvm,gic-version=3,iommu=smmuv3 \
-object memory-backend-ram,size=2G,id=mem0 \
-object memory-backend-ram,size=2G,id=mem1 \
-numa node,nodeid=0,memdev=mem0 \
-numa node,nodeid=1,memdev=mem1 \
-kernel $IMAGE \
-hda $DISK \
-append "console=ttyAMA0 root=/dev/vda rw earlycon"

[    0.000000] Booting Linux on physical CPU 0x0000000000 [0x481fd010]
[    0.000000] Linux version 6.17.0-rc1-00001-gabb4b3daf18c-dirty (yintirui@local) (gcc (GCC) 12.3.1, GNU ld (GNU Binutils) 2.41) #52 SMP PREEMPT Mon Aug 18 09:49:40 CST 2025
[    0.000000] KASLR enabled
[    0.000000] random: crng init done
[    0.000000] Machine model: linux,dummy-virt
[    0.000000] efi: UEFI not found.
[    0.000000] earlycon: pl11 at MMIO 0x0000000009000000 (options '')
[    0.000000] printk: legacy bootconsole [pl11] enabled
[    0.000000] OF: reserved mem: Reserved memory: No reserved-memory node in the DT
[    0.000000] NODE_DATA(0) allocated [mem 0xbfffd9c0-0xbfffffff]
[    0.000000] node 1 must be removed before remove section 23
[    0.000000] Zone ranges:
[    0.000000]   DMA      [mem 0x0000000040000000-0x00000000ffffffff]
[    0.000000]   DMA32    empty
[    0.000000]   Normal   [mem 0x0000000100000000-0x000000013fffffff]
[    0.000000] Movable zone start for each node
[    0.000000] Early memory node ranges
[    0.000000]   node   0: [mem 0x0000000040000000-0x00000000bfffffff]
[    0.000000]   node   1: [mem 0x00000000c0000000-0x000000013fffffff]
[    0.000000] Initmem setup node 0 [mem 0x0000000040000000-0x00000000bfffffff]
[    0.000000] Unable to handle kernel NULL pointer dereference at virtual address 00000000000000a0
[    0.000000] Mem abort info:
[    0.000000]   ESR = 0x0000000096000004
[    0.000000]   EC = 0x25: DABT (current EL), IL = 32 bits
[    0.000000]   SET = 0, FnV = 0
[    0.000000]   EA = 0, S1PTW = 0
[    0.000000]   FSC = 0x04: level 0 translation fault
[    0.000000] Data abort info:
[    0.000000]   ISV = 0, ISS = 0x00000004, ISS2 = 0x00000000
[    0.000000]   CM = 0, WnR = 0, TnD = 0, TagAccess = 0
[    0.000000]   GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
[    0.000000] [00000000000000a0] user address but active_mm is swapper
[    0.000000] Internal error: Oops: 0000000096000004 [#1]  SMP
[    0.000000] Modules linked in:
[    0.000000] CPU: 0 UID: 0 PID: 0 Comm: swapper Not tainted 6.17.0-rc1-00001-g760c6dabf762-dirty #54 PREEMPT
[    0.000000] Hardware name: linux,dummy-virt (DT)
[    0.000000] pstate: 800000c5 (Nzcv daIF -PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[    0.000000] pc : free_area_init+0x50c/0xf9c
[    0.000000] lr : free_area_init+0x5c0/0xf9c
[    0.000000] sp : ffffa02ca0f33c00
[    0.000000] x29: ffffa02ca0f33cb0 x28: 0000000000000000 x27: 0000000000000000
[    0.000000] x26: 4ec4ec4ec4ec4ec5 x25: 00000000000c0000 x24: 00000000000c0000
[    0.000000] x23: 0000000000040000 x22: 0000000000000000 x21: ffffa02ca0f3b368
[    0.000000] x20: ffffa02ca14c7b98 x19: 0000000000000000 x18: 0000000000000002
[    0.000000] x17: 000000000000cacc x16: 0000000000000001 x15: 0000000000000001
[    0.000000] x14: 0000000080000000 x13: 0000000000000018 x12: 0000000000000002
[    0.000000] x11: ffffa02ca0fd4f00 x10: ffffa02ca14bab20 x9 : ffffa02ca14bab38
[    0.000000] x8 : 00000000000c0000 x7 : 0000000000000001 x6 : 0000000000000002
[    0.000000] x5 : 0000000140000000 x4 : ffffa02ca0f33c90 x3 : ffffa02ca0f33ca0
[    0.000000] x2 : ffffa02ca0f33c98 x1 : 0000000080000000 x0 : 0000000000000001
[    0.000000] Call trace:
[    0.000000]  free_area_init+0x50c/0xf9c (P)
[    0.000000]  bootmem_init+0x110/0x1dc
[    0.000000]  setup_arch+0x278/0x60c
[    0.000000]  start_kernel+0x70/0x748
[    0.000000]  __primary_switched+0x88/0x90
[    0.000000] Code: d503201f b98093e0 52800016 f8607a93 (f9405260)
[    0.000000] ---[ end trace 0000000000000000 ]---
[    0.000000] Kernel panic - not syncing: Attempted to kill the idle task!
[    0.000000] ---[ end Kernel panic - not syncing: Attempted to kill the idle task! ]---

Link: https://lkml.kernel.org/r/[email protected]
Fixes: 7675076 ("arch_numa: switch over to numa_memblks")
Signed-off-by: Yin Tirui <[email protected]>
Acked-by: David Hildenbrand <[email protected]>
Acked-by: Mike Rapoport (Microsoft) <[email protected]>
Reviewed-by: Kefeng Wang <[email protected]>
Cc: Chen Jun <[email protected]>
Cc: Dan Williams <[email protected]>
Cc: Joanthan Cameron <[email protected]>
Cc: Rob Herring <[email protected]>
Cc: Saravana Kannan <[email protected]>
Cc: <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
github-actions bot pushed a commit that referenced this pull request Sep 10, 2025
commit ee4d098 upstream.

When there are memory-only nodes (nodes without CPUs), these nodes are not
properly initialized, causing kernel panic during boot.

of_numa_init
	of_numa_parse_cpu_nodes
		node_set(nid, numa_nodes_parsed);
	of_numa_parse_memory_nodes

In of_numa_parse_cpu_nodes, numa_nodes_parsed gets updated only for nodes
containing CPUs.  Memory-only nodes should have been updated in
of_numa_parse_memory_nodes, but they weren't.

Subsequently, when free_area_init() attempts to access NODE_DATA() for
these uninitialized memory nodes, the kernel panics due to NULL pointer
dereference.

This can be reproduced on ARM64 QEMU with 1 CPU and 2 memory nodes:

qemu-system-aarch64 \
-cpu host -nographic \
-m 4G -smp 1 \
-machine virt,accel=kvm,gic-version=3,iommu=smmuv3 \
-object memory-backend-ram,size=2G,id=mem0 \
-object memory-backend-ram,size=2G,id=mem1 \
-numa node,nodeid=0,memdev=mem0 \
-numa node,nodeid=1,memdev=mem1 \
-kernel $IMAGE \
-hda $DISK \
-append "console=ttyAMA0 root=/dev/vda rw earlycon"

[    0.000000] Booting Linux on physical CPU 0x0000000000 [0x481fd010]
[    0.000000] Linux version 6.17.0-rc1-00001-gabb4b3daf18c-dirty (yintirui@local) (gcc (GCC) 12.3.1, GNU ld (GNU Binutils) 2.41) #52 SMP PREEMPT Mon Aug 18 09:49:40 CST 2025
[    0.000000] KASLR enabled
[    0.000000] random: crng init done
[    0.000000] Machine model: linux,dummy-virt
[    0.000000] efi: UEFI not found.
[    0.000000] earlycon: pl11 at MMIO 0x0000000009000000 (options '')
[    0.000000] printk: legacy bootconsole [pl11] enabled
[    0.000000] OF: reserved mem: Reserved memory: No reserved-memory node in the DT
[    0.000000] NODE_DATA(0) allocated [mem 0xbfffd9c0-0xbfffffff]
[    0.000000] node 1 must be removed before remove section 23
[    0.000000] Zone ranges:
[    0.000000]   DMA      [mem 0x0000000040000000-0x00000000ffffffff]
[    0.000000]   DMA32    empty
[    0.000000]   Normal   [mem 0x0000000100000000-0x000000013fffffff]
[    0.000000] Movable zone start for each node
[    0.000000] Early memory node ranges
[    0.000000]   node   0: [mem 0x0000000040000000-0x00000000bfffffff]
[    0.000000]   node   1: [mem 0x00000000c0000000-0x000000013fffffff]
[    0.000000] Initmem setup node 0 [mem 0x0000000040000000-0x00000000bfffffff]
[    0.000000] Unable to handle kernel NULL pointer dereference at virtual address 00000000000000a0
[    0.000000] Mem abort info:
[    0.000000]   ESR = 0x0000000096000004
[    0.000000]   EC = 0x25: DABT (current EL), IL = 32 bits
[    0.000000]   SET = 0, FnV = 0
[    0.000000]   EA = 0, S1PTW = 0
[    0.000000]   FSC = 0x04: level 0 translation fault
[    0.000000] Data abort info:
[    0.000000]   ISV = 0, ISS = 0x00000004, ISS2 = 0x00000000
[    0.000000]   CM = 0, WnR = 0, TnD = 0, TagAccess = 0
[    0.000000]   GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
[    0.000000] [00000000000000a0] user address but active_mm is swapper
[    0.000000] Internal error: Oops: 0000000096000004 [#1]  SMP
[    0.000000] Modules linked in:
[    0.000000] CPU: 0 UID: 0 PID: 0 Comm: swapper Not tainted 6.17.0-rc1-00001-g760c6dabf762-dirty #54 PREEMPT
[    0.000000] Hardware name: linux,dummy-virt (DT)
[    0.000000] pstate: 800000c5 (Nzcv daIF -PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[    0.000000] pc : free_area_init+0x50c/0xf9c
[    0.000000] lr : free_area_init+0x5c0/0xf9c
[    0.000000] sp : ffffa02ca0f33c00
[    0.000000] x29: ffffa02ca0f33cb0 x28: 0000000000000000 x27: 0000000000000000
[    0.000000] x26: 4ec4ec4ec4ec4ec5 x25: 00000000000c0000 x24: 00000000000c0000
[    0.000000] x23: 0000000000040000 x22: 0000000000000000 x21: ffffa02ca0f3b368
[    0.000000] x20: ffffa02ca14c7b98 x19: 0000000000000000 x18: 0000000000000002
[    0.000000] x17: 000000000000cacc x16: 0000000000000001 x15: 0000000000000001
[    0.000000] x14: 0000000080000000 x13: 0000000000000018 x12: 0000000000000002
[    0.000000] x11: ffffa02ca0fd4f00 x10: ffffa02ca14bab20 x9 : ffffa02ca14bab38
[    0.000000] x8 : 00000000000c0000 x7 : 0000000000000001 x6 : 0000000000000002
[    0.000000] x5 : 0000000140000000 x4 : ffffa02ca0f33c90 x3 : ffffa02ca0f33ca0
[    0.000000] x2 : ffffa02ca0f33c98 x1 : 0000000080000000 x0 : 0000000000000001
[    0.000000] Call trace:
[    0.000000]  free_area_init+0x50c/0xf9c (P)
[    0.000000]  bootmem_init+0x110/0x1dc
[    0.000000]  setup_arch+0x278/0x60c
[    0.000000]  start_kernel+0x70/0x748
[    0.000000]  __primary_switched+0x88/0x90
[    0.000000] Code: d503201f b98093e0 52800016 f8607a93 (f9405260)
[    0.000000] ---[ end trace 0000000000000000 ]---
[    0.000000] Kernel panic - not syncing: Attempted to kill the idle task!
[    0.000000] ---[ end Kernel panic - not syncing: Attempted to kill the idle task! ]---

Link: https://lkml.kernel.org/r/[email protected]
Fixes: 7675076 ("arch_numa: switch over to numa_memblks")
Signed-off-by: Yin Tirui <[email protected]>
Acked-by: David Hildenbrand <[email protected]>
Acked-by: Mike Rapoport (Microsoft) <[email protected]>
Reviewed-by: Kefeng Wang <[email protected]>
Cc: Chen Jun <[email protected]>
Cc: Dan Williams <[email protected]>
Cc: Joanthan Cameron <[email protected]>
Cc: Rob Herring <[email protected]>
Cc: Saravana Kannan <[email protected]>
Cc: <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Greg Kroah-Hartman <[email protected]>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Development

Successfully merging this pull request may close these issues.

1 participant