[DAG] Combine `store + vselect` to `masked_store` #145176

abhishek-kaushik22 · 2025-06-21T16:19:40Z

Add a new combine to replace

(store ch (vselect cond truevec (load ch ptr offset)) ptr offset)

with

(mstore ch truevec ptr offset cond)

This saves a blend operation on targets that support conditional stores.

llvmbot · 2025-06-21T16:20:14Z

@llvm/pr-subscribers-backend-risc-v
@llvm/pr-subscribers-llvm-transforms
@llvm/pr-subscribers-backend-aarch64
@llvm/pr-subscribers-backend-arm

@llvm/pr-subscribers-backend-x86

Author: Abhishek Kaushik (abhishek-kaushik22)

Changes

Add a new combine to replace

(store ch (vselect cond truevec (load ch ptr offset)) ptr offset)

with

(mstore ch truevec ptr offset cond)

This saves a blend operation on targets that support conditional stores.

Full diff: https://github.com/llvm/llvm-project/pull/145176.diff

2 Files Affected:

(modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+78)
(added) llvm/test/CodeGen/X86/combine-storetomstore.ll (+276)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 33083c0eba695..7a8ec1b25de62 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -66,6 +66,7 @@
 #include <bitset>
 #include <cctype>
 #include <numeric>
+#include <queue>
 using namespace llvm;
 
 #define DEBUG_TYPE "x86-isel"
@@ -53403,6 +53404,80 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+/// Fold (store ch (vselect Mask, TrueVec, (load ch Ptr, Off)), Ptr, Off) into
+/// (masked_store ch, TrueVec, Ptr, Off, Mask), which saves the blend. Returns
+/// an empty SDValue if the pattern does not match or the fold is not safe.
+static SDValue foldToMaskedStore(StoreSDNode *Store, SelectionDAG &DAG,
+                                 const SDLoc &Dl,
+                                 const X86Subtarget &Subtarget) {
+  if (!Subtarget.hasAVX() && !Subtarget.hasAVX2() && !Subtarget.hasAVX512())
+    return SDValue();
+
+  // The masked store built below is non-truncating, so truncating stores are
+  // rejected together with non-simple (volatile/atomic) ones.
+  if (!Store->isSimple() || Store->isTruncatingStore())
+    return SDValue();
+
+  SDValue StoredVal = Store->getValue();
+  SDValue StorePtr = Store->getBasePtr();
+  SDValue StoreOffset = Store->getOffset();
+  EVT VT = StoredVal.getValueType();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  if (StoredVal.getOpcode() != ISD::VSELECT || !TLI.isTypeLegal(VT) ||
+      !TLI.isOperationLegalOrCustom(ISD::MSTORE, VT))
+    return SDValue();
+
+  SDValue Mask = StoredVal.getOperand(0);
+  SDValue TrueVec = StoredVal.getOperand(1);
+  SDValue FalseVec = StoredVal.getOperand(2);
+
+  // dyn_cast, not cast: the false operand can be any node, and cast<> asserts
+  // on a non-load instead of returning null. With an extending load, memory
+  // does not already hold the unselected lanes, so that case is rejected too.
+  auto *Load = dyn_cast<LoadSDNode>(FalseVec.getNode());
+  if (!Load || !Load->isSimple() ||
+      Load->getExtensionType() != ISD::NON_EXTLOAD ||
+      StorePtr != Load->getBasePtr() || StoreOffset != Load->getOffset())
+    return SDValue();
+
+  // Walk the store's chain back towards the load. Only token factors and
+  // simple, non-writing memory nodes are looked through; any other node (a
+  // call, a store, ...) may modify the location between the load and the store.
+  // Every explored path must be clean and at least one must reach the load.
+  auto IsSafeToFold = [](StoreSDNode *Store, LoadSDNode *Load) {
+    std::queue<SDValue> Worklist;
+    Worklist.push(Store->getChain());
+    bool ReachedLoad = false;
+    while (!Worklist.empty()) {
+      SDNode *Node = Worklist.front().getNode();
+      Worklist.pop();
+      if (!Node)
+        return false;
+      if (Node == Load) {
+        ReachedLoad = true; // Do not look past the load itself.
+      } else if (Node->getOpcode() == ISD::TokenFactor) {
+        for (SDValue Op : Node->op_values())
+          Worklist.push(Op);
+      } else if (const auto *MemNode = dyn_cast<MemSDNode>(Node)) {
+        if (!MemNode->isSimple() || MemNode->writeMem())
+          return false;
+        Worklist.push(MemNode->getChain());
+      } else if (Node->getOpcode() != ISD::EntryToken) {
+        return false; // Unknown chained node: assume it has side effects.
+      }
+    }
+    return ReachedLoad;
+  };
+
+  if (!IsSafeToFold(Store, Load))
+    return SDValue();
+
+  return DAG.getMaskedStore(Store->getChain(), Dl, TrueVec, StorePtr,
+                            StoreOffset, Mask, Store->getMemoryVT(),
+                            Store->getMemOperand(), Store->getAddressingMode());
+}
+
 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
                             TargetLowering::DAGCombinerInfo &DCI,
                             const X86Subtarget &Subtarget) {
@@ -53728,6 +53803,9 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
                         St->getMemOperand()->getFlags());
   }
 
+  if (SDValue MaskedStore = foldToMaskedStore(St, DAG, dl, Subtarget))
+    return MaskedStore;
+
   return SDValue();
 }
 
diff --git a/llvm/test/CodeGen/X86/combine-storetomstore.ll b/llvm/test/CodeGen/X86/combine-storetomstore.ll
new file mode 100644
index 0000000000000..75d0dd85cafda
--- /dev/null
+++ b/llvm/test/CodeGen/X86/combine-storetomstore.ll
@@ -0,0 +1,276 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx     | FileCheck %s -check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2    | FileCheck %s -check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f | FileCheck %s -check-prefix=AVX512
+
+
+; Load, select and store all use %ptr with nothing in between: the checks expect
+; a single masked store and no separate load or blend.
+define void @test_masked_store_success(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) {
+; AVX-LABEL: test_masked_store_success:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX-NEXT:    vpslld $31, %xmm2, %xmm2
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; AVX-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX-NEXT:    vmaskmovps %ymm0, %ymm1, (%rdi)
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: test_masked_store_success:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT:    vpslld $31, %ymm1, %ymm1
+; AVX2-NEXT:    vpmaskmovd %ymm0, %ymm1, (%rdi)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_masked_store_success:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT:    vpmovsxwq %xmm1, %zmm1
+; AVX512-NEXT:    vpsllq $63, %zmm1, %zmm1
+; AVX512-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; AVX512-NEXT:    vmovdqu32 %zmm0, (%rdi) {%k1}
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %load = load <8 x i32>, ptr %ptr, align 32
+  %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load
+  store <8 x i32> %sel, ptr %ptr, align 32
+  ret void
+}
+
+; Same shape as the success case but with a volatile load: the checks expect the
+; load, the blend and the plain store to stay.
+define void @test_masked_store_volatile_load(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) {
+; AVX-LABEL: test_masked_store_volatile_load:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX-NEXT:    vpslld $31, %xmm2, %xmm2
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; AVX-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX-NEXT:    vmovaps (%rdi), %ymm2
+; AVX-NEXT:    vblendvps %ymm1, %ymm0, %ymm2, %ymm0
+; AVX-NEXT:    vmovaps %ymm0, (%rdi)
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: test_masked_store_volatile_load:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT:    vpslld $31, %ymm1, %ymm1
+; AVX2-NEXT:    vmovaps (%rdi), %ymm2
+; AVX2-NEXT:    vblendvps %ymm1, %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vmovaps %ymm0, (%rdi)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_masked_store_volatile_load:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT:    vpmovsxwq %xmm1, %zmm1
+; AVX512-NEXT:    vpsllq $63, %zmm1, %zmm1
+; AVX512-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; AVX512-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; AVX512-NEXT:    vmovdqa %ymm1, (%rdi)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %load = load volatile <8 x i32>, ptr %ptr, align 32
+  %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load
+  store <8 x i32> %sel, ptr %ptr, align 32
+  ret void
+}
+
+; Same shape as the success case but with a volatile store: the checks expect
+; the load, the blend and the plain store to stay.
+define void @test_masked_store_volatile_store(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) {
+; AVX-LABEL: test_masked_store_volatile_store:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX-NEXT:    vpslld $31, %xmm2, %xmm2
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; AVX-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX-NEXT:    vmovaps (%rdi), %ymm2
+; AVX-NEXT:    vblendvps %ymm1, %ymm0, %ymm2, %ymm0
+; AVX-NEXT:    vmovaps %ymm0, (%rdi)
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: test_masked_store_volatile_store:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT:    vpslld $31, %ymm1, %ymm1
+; AVX2-NEXT:    vmovaps (%rdi), %ymm2
+; AVX2-NEXT:    vblendvps %ymm1, %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vmovaps %ymm0, (%rdi)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_masked_store_volatile_store:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT:    vpmovsxwq %xmm1, %zmm1
+; AVX512-NEXT:    vpsllq $63, %zmm1, %zmm1
+; AVX512-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; AVX512-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; AVX512-NEXT:    vmovdqa %ymm1, (%rdi)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %load = load <8 x i32>, ptr %ptr, align 32
+  %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load
+  store volatile <8 x i32> %sel, ptr %ptr, align 32
+  ret void
+}
+
+declare void @use_vec(<8 x i32>)
+
+; A zero store to %ptr, a reload and a call sit between the first load and the
+; final store: the checks expect the blend and the plain store to stay.
+define void @test_masked_store_intervening(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) {
+; AVX-LABEL: test_masked_store_intervening:
+; AVX:       # %bb.0:
+; AVX-NEXT:    pushq %rbx
+; AVX-NEXT:    .cfi_def_cfa_offset 16
+; AVX-NEXT:    subq $32, %rsp
+; AVX-NEXT:    .cfi_def_cfa_offset 48
+; AVX-NEXT:    .cfi_offset %rbx, -16
+; AVX-NEXT:    movq %rdi, %rbx
+; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX-NEXT:    vpslld $31, %xmm2, %xmm2
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; AVX-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX-NEXT:    vmovaps (%rdi), %ymm2
+; AVX-NEXT:    vblendvps %ymm1, %ymm0, %ymm2, %ymm0
+; AVX-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
+; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vmovaps %ymm0, (%rdi)
+; AVX-NEXT:    callq use_vec@PLT
+; AVX-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm0, (%rbx)
+; AVX-NEXT:    addq $32, %rsp
+; AVX-NEXT:    .cfi_def_cfa_offset 16
+; AVX-NEXT:    popq %rbx
+; AVX-NEXT:    .cfi_def_cfa_offset 8
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: test_masked_store_intervening:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    pushq %rbx
+; AVX2-NEXT:    .cfi_def_cfa_offset 16
+; AVX2-NEXT:    subq $32, %rsp
+; AVX2-NEXT:    .cfi_def_cfa_offset 48
+; AVX2-NEXT:    .cfi_offset %rbx, -16
+; AVX2-NEXT:    movq %rdi, %rbx
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT:    vpslld $31, %ymm1, %ymm1
+; AVX2-NEXT:    vmovaps (%rdi), %ymm2
+; AVX2-NEXT:    vblendvps %ymm1, %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
+; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT:    vmovaps %ymm0, (%rdi)
+; AVX2-NEXT:    callq use_vec@PLT
+; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm0, (%rbx)
+; AVX2-NEXT:    addq $32, %rsp
+; AVX2-NEXT:    .cfi_def_cfa_offset 16
+; AVX2-NEXT:    popq %rbx
+; AVX2-NEXT:    .cfi_def_cfa_offset 8
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_masked_store_intervening:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    pushq %rbx
+; AVX512-NEXT:    .cfi_def_cfa_offset 16
+; AVX512-NEXT:    subq $144, %rsp
+; AVX512-NEXT:    .cfi_def_cfa_offset 160
+; AVX512-NEXT:    .cfi_offset %rbx, -16
+; AVX512-NEXT:    movq %rdi, %rbx
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpmovsxwq %xmm1, %zmm0
+; AVX512-NEXT:    vpsllq $63, %zmm0, %zmm0
+; AVX512-NEXT:    vptestmq %zmm0, %zmm0, %k1
+; AVX512-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512-NEXT:    vmovaps (%rdi), %ymm0
+; AVX512-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovaps %ymm0, (%rdi)
+; AVX512-NEXT:    callq use_vec@PLT
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; AVX512-NEXT:    vmovdqa %ymm1, (%rbx)
+; AVX512-NEXT:    addq $144, %rsp
+; AVX512-NEXT:    .cfi_def_cfa_offset 16
+; AVX512-NEXT:    popq %rbx
+; AVX512-NEXT:    .cfi_def_cfa_offset 8
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %load = load <8 x i32>, ptr %ptr, align 32
+  store <8 x i32> zeroinitializer, ptr %ptr, align 32
+  %tmp = load <8 x i32>, ptr %ptr
+  call void @use_vec(<8 x i32> %tmp)
+  %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load
+  store <8 x i32> %sel, ptr %ptr, align 32
+  ret void
+}
+
+
+; Two load/select/store groups on different pointers. The checks expect only the
+; store to %ptr1 to become a masked store; the store to %ptr2 keeps its blend.
+define void @test_masked_store_multiple(<8 x i32> %x, <8 x i32> %y, ptr %ptr1, ptr %ptr2, <8 x i1> %cmp, <8 x i1> %cmp2) {
+; AVX-LABEL: test_masked_store_multiple:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX-NEXT:    vpslld $31, %xmm4, %xmm4
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
+; AVX-NEXT:    vpslld $31, %xmm2, %xmm2
+; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm4, %ymm2
+; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; AVX-NEXT:    vpslld $31, %xmm4, %xmm4
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
+; AVX-NEXT:    vpslld $31, %xmm3, %xmm3
+; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; AVX-NEXT:    vmovaps (%rsi), %ymm4
+; AVX-NEXT:    vblendvps %ymm3, %ymm1, %ymm4, %ymm1
+; AVX-NEXT:    vmaskmovps %ymm0, %ymm2, (%rdi)
+; AVX-NEXT:    vmovaps %ymm1, (%rsi)
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: test_masked_store_multiple:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX2-NEXT:    vpslld $31, %ymm2, %ymm2
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX2-NEXT:    vpslld $31, %ymm3, %ymm3
+; AVX2-NEXT:    vmovaps (%rsi), %ymm4
+; AVX2-NEXT:    vblendvps %ymm3, %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpmaskmovd %ymm0, %ymm2, (%rdi)
+; AVX2-NEXT:    vmovaps %ymm1, (%rsi)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_masked_store_multiple:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT:    vpmovsxwq %xmm2, %zmm2
+; AVX512-NEXT:    vpsllq $63, %zmm2, %zmm2
+; AVX512-NEXT:    vptestmq %zmm2, %zmm2, %k1
+; AVX512-NEXT:    vpmovsxwq %xmm3, %zmm2
+; AVX512-NEXT:    vpsllq $63, %zmm2, %zmm2
+; AVX512-NEXT:    vptestmq %zmm2, %zmm2, %k2
+; AVX512-NEXT:    vmovdqa (%rsi), %ymm2
+; AVX512-NEXT:    vmovdqa32 %zmm1, %zmm2 {%k2}
+; AVX512-NEXT:    vmovdqu32 %zmm0, (%rdi) {%k1}
+; AVX512-NEXT:    vmovdqa %ymm2, (%rsi)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %load = load <8 x i32>, ptr %ptr1, align 32
+  %load2 = load <8 x i32>, ptr %ptr2, align 32
+  %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load
+  %sel2 = select <8 x i1> %cmp2, <8 x i32> %y, <8 x i32> %load2
+  store <8 x i32> %sel, ptr %ptr1, align 32
+  store <8 x i32> %sel2, ptr %ptr2, align 32
+  ret void
+}

phoebewang · 2025-06-22T02:23:18Z

Can we use a pattern match instead?

abhishek-kaushik22 · 2025-06-22T11:25:24Z

Can we use a pattern match instead?

There was no m_Load to match the Load, so I've added one.

llvm/include/llvm/CodeGen/SDPatternMatch.h

llvm/lib/Target/X86/X86ISelLowering.cpp

phoebewang · 2025-06-22T13:32:32Z

Can we use a pattern match instead?

There was no m_Load to match the Load, so I've added one.

I mean add

def: Pat<(st_frag (_.VT (vselect_mask _.KRCWM:$mask, (_.VT _.RC:$src), (_.VT (ld_frag addr:$dst)))), addr:$dst), (!cast<Instruction>(Name#_.ZSuffix#mrk) addr:$ptr, _.KRCWM:$mask, _.RC:$src)>;
def: Pat<(st_frag (_.VT (vselect_mask _.KRCWM:$mask, (_.VT _.RC:$src), _.ImmAllZerosV)), addr:$dst), (!cast<Instruction>(Name#_.ZSuffix#mrkz) addr:$ptr, _.KRCWM:$mask, _.RC:$src)>;

to avx512_store.

abhishek-kaushik22 · 2025-06-22T15:43:35Z

Can we use a pattern match instead?

There was no m_Load to match the Load, so I've added one.

I mean add

def: Pat<(st_frag (_.VT (vselect_mask _.KRCWM:$mask, (_.VT _.RC:$src), (_.VT (ld_frag addr:$dst)))), addr:$dst), (!cast<Instruction>(Name#_.ZSuffix#mrk) addr:$ptr, _.KRCWM:$mask, _.RC:$src)>;
def: Pat<(st_frag (_.VT (vselect_mask _.KRCWM:$mask, (_.VT _.RC:$src), _.ImmAllZerosV)), addr:$dst), (!cast<Instruction>(Name#_.ZSuffix#mrkz) addr:$ptr, _.KRCWM:$mask, _.RC:$src)>;

to avx512_store.

Is that safe to do? There might be other stores in between the store and load, and we don't have any alias information

llvm/lib/Target/X86/X86ISelLowering.cpp

Add a new combine to replace ``` (store ch (vselect cond truevec (load ch ptr offset)) ptr offset) ``` with ``` (mstore ch truevec ptr offset cond) ```

This reverts commit 73c5a668e2c4ff72195a816b1b3c93279ed46185.

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

…llowed

…llvm-project into masked-store

abhishek-kaushik22 · 2025-07-31T16:46:42Z

@topperc @arsenm @RKSimon can you please take a look?

RKSimon

SGTM - any other comments?

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

abhishek-kaushik22 · 2025-08-04T08:03:18Z

If there are no more issues, can I please merge this?

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

abhishek-kaushik22 · 2025-08-04T13:35:31Z

I'll be merging this now, thank you everyone for the reviews :)

pranavk · 2025-09-05T18:59:24Z

This seems to be breaking some of our internal software. The backend crashes with:

ScalarizeVectorOperand Op #4: t94: ch = masked_store<(store (s64) into %ir.add.ptr, !alias.scope !2182)> t12, t98, t11, undef:i64, t96

Relevant lines of backtrace:

#8 llvm::DAGTypeLegalizer::run() 
#9 0x000055c40ae936b2 llvm::SelectionDAG::LegalizeTypes() 
#10 0x000055c40ae53808 llvm::SelectionDAGISel::CodeGenAndEmitDAG()

topperc · 2025-09-05T19:08:02Z

This seems to be breaking some of our internal software. The backend crashes with:

ScalarizeVectorOperand Op #4: t94: ch = masked_store<(store (s64) into %ir.add.ptr, !alias.scope !2182)> t12, t98, t11, undef:i64, t96

Relevant lines of backtrace:

#8 llvm::DAGTypeLegalizer::run() 
#9 0x000055c40ae936b2 llvm::SelectionDAG::LegalizeTypes() 
#10 0x000055c40ae53808 llvm::SelectionDAGISel::CodeGenAndEmitDAG()

That suggests that t96 has an illegal type that needs to be scalarized. Can you print more of the DAG at the point of the crash?

abhishek-kaushik22 · 2025-09-06T04:51:53Z

This seems to be breaking some of our internal software. The backend crashes with:

ScalarizeVectorOperand Op #4: t94: ch = masked_store<(store (s64) into %ir.add.ptr, !alias.scope !2182)> t12, t98, t11, undef:i64, t96

Relevant lines of backtrace:

#8 llvm::DAGTypeLegalizer::run() 
#9 0x000055c40ae936b2 llvm::SelectionDAG::LegalizeTypes() 
#10 0x000055c40ae53808 llvm::SelectionDAGISel::CodeGenAndEmitDAG()

Is the error "Do not know how to scalarize this operator's operand!"? I see there is no case in ScalarizeVectorOperand to scalarize ISD::MSTORE, but it shouldn't be too hard to add one. Can you please share what target is this failing on and maybe a minimal reproducer?

topperc · 2025-09-06T04:59:20Z

This seems to be breaking some of our internal software. The backend crashes with:
ScalarizeVectorOperand Op #4: t94: ch = masked_store<(store (s64) into %ir.add.ptr, !alias.scope !2182)> t12, t98, t11, undef:i64, t96
Relevant lines of backtrace:
#8 llvm::DAGTypeLegalizer::run() 
#9 0x000055c40ae936b2 llvm::SelectionDAG::LegalizeTypes() 
#10 0x000055c40ae53808 llvm::SelectionDAGISel::CodeGenAndEmitDAG()
Is the error "Do not know how to scalarize this operator's operand!"? I see there is no case in ScalarizeVectorOperand to scalarize ISD::MSTORE, but it shouldn't be too hard to add one.

It's not possible to create a scalar conditional store in SelectionDAG. It would need to split the basic block and introduce a branch over the store.

abhishek-kaushik22 · 2025-09-06T05:09:03Z

It's not possible to create a scalar conditional store in SelectionDAG. It would need to split the basic block and introduce a branch over the store.

Sorry, you are right. In that case maybe we need stricter checks before we do this combine, I guess

if (!TLI.isOperationLegalOrCustom(ISD::MSTORE, VT) ||
      !TLI.allowsMisalignedMemoryAccesses(VT, AddrSpace, Alignment))
    return SDValue();

is not enough. I assumed that the isOperationLegalOrCustom will handle cases with illegal types.

jan-wassenberg · 2025-09-08T07:20:15Z

For repro-ing, one of the TUs that crashes during compilation is https://github.com/google/highway/blob/master/hwy/tests/compress_test.cc.

abhishek-kaushik22 · 2025-09-08T07:24:29Z

For repro-ing, one of the TUs that crashes during compilation is https://github.com/google/highway/blob/master/hwy/tests/compress_test.cc.

@jan-wassenberg Can you please specify the target this is failing on? I see the tests support multiple targets, does it fail on all targets?

jan-wassenberg · 2025-09-08T08:23:50Z

It's likely the HWY_NEON_BF16 target. We are able to repro the compiler crash with an UBSAN build and -march=armv8.2-a+sve.

abhishek-kaushik22 · 2025-09-08T12:15:27Z

It's likely the HWY_NEON_BF16 target. We are able to repro the compiler crash with an UBSAN build and -march=armv8.2-a+sve.

Can you please provide a preprocessed cc file that I can use to reproduce? I wasn't able to build the project on my machine

jan-wassenberg · 2025-09-08T13:00:46Z

It is difficult to share a preprocessed .cc file.
Highway is widely used, I'm curious what is the issue with building?

abhishek-kaushik22 · 2025-09-08T13:17:41Z

It is difficult to share a preprocessed .cc file. Highway is widely used, I'm curious what is the issue with building?

The issue is that I don't have an Arm machine, and I don't have the privileges to set up Arm toolchains on office servers :(

jan-wassenberg · 2025-09-08T17:51:36Z

ACK. We are looking into simplifying/reviewing the source.

pranavk · 2025-09-19T14:46:52Z

simple.txt

Here's the minimal code to repro this.
Compile flags:
-std=c++20 -c -Wno-constant-logical-operand --target=aarch64-none-linux-gnu -fno-crash-diagnostics -march=armv8.2-a+sve -O1

Crashes with same stack trace as mentioned above.

abhishek-kaushik22 · 2025-09-20T08:44:54Z

I've opened #159912 to track this. I'll try and get a PR with the fix soon.

abhishek-kaushik22 requested review from phoebewang and e-kud June 21, 2025 16:19

abhishek-kaushik22 changed the title ~~[X86] Combine store + vselect to masked_store`~~ [X86] Combine store + vselect to masked_store Jun 21, 2025

llvmbot added the backend:X86 label Jun 21, 2025

abhishek-kaushik22 requested a review from RKSimon June 21, 2025 16:25

RKSimon reviewed Jun 22, 2025

View reviewed changes

llvm/include/llvm/CodeGen/SDPatternMatch.h Outdated Show resolved Hide resolved

RKSimon reviewed Jun 22, 2025

View reviewed changes

llvm/lib/Target/X86/X86ISelLowering.cpp Outdated Show resolved Hide resolved

phoebewang reviewed Jun 23, 2025

View reviewed changes

llvm/lib/Target/X86/X86ISelLowering.cpp Outdated Show resolved Hide resolved

phoebewang reviewed Jun 23, 2025

View reviewed changes

llvm/lib/Target/X86/X86ISelLowering.cpp Outdated Show resolved Hide resolved

abhishek-kaushik22 added 6 commits June 26, 2025 23:13

[X86] Combine store + vselect to masked_store`

d9d04de

Add a new combine to replace ``` (store ch (vselect cond truevec (load ch ptr offset)) ptr offset) ``` with ``` (mstore ch truevec ptr offset cond) ```

Use pattern match

9eca209

Fix tests

c0d5cf0

Revert last 3 commits

b3a4522

Revert "[X86] Combine store + vselect to masked_store`"

04366fa

This reverts commit 73c5a668e2c4ff72195a816b1b3c93279ed46185.

Move to DAGCombiner

34fa965

abhishek-kaushik22 force-pushed the masked-store branch from 198fe9d to 34fa965 Compare June 26, 2025 17:45

llvmbot added backend:ARM backend:AArch64 llvm:codegen llvm:SelectionDAG SelectionDAGISel as well llvm:transforms labels Jun 26, 2025

abhishek-kaushik22 changed the title ~~[X86] Combine store + vselect to masked_store~~ [DAG] Combine store + vselect to masked_store Jun 26, 2025

Update macro-fuse-cmp.ll

3106f46

arsenm reviewed Jun 27, 2025

View reviewed changes

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp Outdated Show resolved Hide resolved

Use allowsMisalignedMemoryAccesses to check if unaligned stores are a…

8c14fba

…llowed

Merge branch 'masked-store' of https://github.com/abhishek-kaushik22/…

ad5ead1

…llvm-project into masked-store

abhishek-kaushik22 requested review from arsenm and topperc July 31, 2025 16:45

RKSimon reviewed Aug 2, 2025

View reviewed changes

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp Outdated Show resolved Hide resolved

abhishek-kaushik22 added 2 commits August 3, 2025 15:33

Update DAGCombiner.cpp

ed1d804

Merge branch 'main' into masked-store

f5aed1f

RKSimon reviewed Aug 4, 2025

View reviewed changes

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp Show resolved Hide resolved

abhishek-kaushik22 added 2 commits August 4, 2025 14:53

Merge branch 'main' into masked-store

6d26be2

Add address space check

f4157dd

arsenm approved these changes Aug 4, 2025

View reviewed changes

abhishek-kaushik22 merged commit 1c0ac80 into llvm:main Aug 4, 2025
9 checks passed

abhishek-kaushik22 deleted the masked-store branch August 4, 2025 13:35

[DAG] Combine store + vselect to masked_store #145176

[DAG] Combine store + vselect to masked_store #145176

Conversation

abhishek-kaushik22 commented Jun 21, 2025

Uh oh!

llvmbot commented Jun 21, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

phoebewang commented Jun 22, 2025

Uh oh!

abhishek-kaushik22 commented Jun 22, 2025

Uh oh!

Uh oh!

Uh oh!

phoebewang commented Jun 22, 2025

Uh oh!

abhishek-kaushik22 commented Jun 22, 2025

Uh oh!

Uh oh!

Uh oh!

Uh oh!

abhishek-kaushik22 commented Jul 31, 2025

Uh oh!

RKSimon left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

abhishek-kaushik22 commented Aug 4, 2025

Uh oh!

Uh oh!

abhishek-kaushik22 commented Aug 4, 2025

Uh oh!

Uh oh!

pranavk commented Sep 5, 2025

Uh oh!

topperc commented Sep 5, 2025

Uh oh!

abhishek-kaushik22 commented Sep 6, 2025

Uh oh!

topperc commented Sep 6, 2025

Uh oh!

abhishek-kaushik22 commented Sep 6, 2025

Uh oh!

jan-wassenberg commented Sep 8, 2025

Uh oh!

abhishek-kaushik22 commented Sep 8, 2025

Uh oh!

jan-wassenberg commented Sep 8, 2025

Uh oh!

abhishek-kaushik22 commented Sep 8, 2025

Uh oh!

jan-wassenberg commented Sep 8, 2025

Uh oh!

abhishek-kaushik22 commented Sep 8, 2025

Uh oh!

jan-wassenberg commented Sep 8, 2025

Uh oh!

pranavk commented Sep 19, 2025

Uh oh!

abhishek-kaushik22 commented Sep 20, 2025

Uh oh!

Uh oh!

[DAG] Combine `store + vselect` to `masked_store` #145176

[DAG] Combine `store + vselect` to `masked_store` #145176

llvmbot commented Jun 21, 2025 •

edited

Loading