[InstCombine] Generate better code for `std::bit_floor` from libstdc++ #144842

el-ev · 2025-06-19T05:35:31Z

Closes #61183.
This reverts commit 8a1373d.

Added a mask to the result of ctlz to avoid poison from out-of-range lshr.

Alive2: https://alive2.llvm.org/ce/z/WJiLdq

This reverts commit 8a1373d.

llvmbot · 2025-06-19T05:36:01Z

@llvm/pr-subscribers-llvm-transforms

Author: Iris Shi (el-ev)

Changes

Closes #61183.
This reverts commit 8a1373d.

Added a mask to the result of ctlz to avoid poison from out-of-range lshr.

Alive2: https://alive2.llvm.org/ce/z/WJiLdq

Full diff: https://github.com/llvm/llvm-project/pull/144842.diff

2 Files Affected:

(modified) llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp (+77)
(modified) llvm/test/Transforms/InstCombine/bit_floor.ll (+12-16)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 73ba0f78e8053..72fa225045662 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -3911,6 +3911,80 @@ static Value *foldSelectBitTest(SelectInst &Sel, Value *CondVal, Value *TrueVal,
   return nullptr;
 }
 
+// Transform:
+//
+//   1 << (C - ctlz(X >> 1))
+//
+// into
+//
+//   (1 << (C - 1)) >> (ctlz(X) & (BitWidth - 1))
+//
+// The mask keeps the lshr amount in range, avoiding poison.  The caller must
+// guarantee that X is nonzero.
+// TODO: Relax the requirement that X be nonzero.  We just need to require X to
+// be nonzero or the second argument of CTLZ to be true (that is, returning
+// poison on zero).
+static Instruction *foldBitFloorNonzero(Value *N, Value *X,
+                                        InstCombiner::BuilderTy &Builder) {
+  Type *NType = N->getType();
+  unsigned BitWidth = NType->getScalarSizeInBits();
+
+  // Match C - ctlz(X >> 1), where C is in (0, BitWidth].
+  // TODO: Handle C in [0, BitWidth] (with 0 included in the range), in which
+  // case 1 << (C - ctlz(X >> 1)) is equivalent to
+  // (1 << ((C - 1) & (BitWidth - 1))) >> ctlz(X).
+  const APInt *C = nullptr;
+  Value *CTLZ;
+  if (!match(N, m_OneUse(m_Shl(m_One(),
+                               m_OneUse(m_Sub(m_APInt(C), m_Value(CTLZ)))))) ||
+      !(C->ugt(0) && C->ule(BitWidth)) ||
+      !match(CTLZ, m_OneUse(m_Intrinsic<Intrinsic::ctlz>(
+                       m_OneUse(m_LShr(m_Specific(X), m_One())), m_Zero()))))
+    return nullptr;
+
+  APInt ShiftedBit = APInt::getOneBitSet(BitWidth, C->getZExtValue() - 1);
+
+  Value *NewCTLZ =
+      Builder.CreateIntrinsic(Intrinsic::ctlz, {CTLZ->getType()},
+                              {X, cast<Instruction>(CTLZ)->getOperand(1)});
+  Value *Masked =
+      Builder.CreateAnd(NewCTLZ, ConstantInt::get(NType, BitWidth - 1));
+  auto *Shift = cast<Instruction>(
+      Builder.CreateLShr(ConstantInt::get(NType, ShiftedBit), Masked));
+  Shift->setIsExact();
+  return Shift;
+}
+
+// Transform:
+//
+//   X == 0 ? 0 : (1 << (C1 - ctlz(X >> 1)))
+//
+// into
+//
+//   X == 0 ? 0 : (C2 >> (ctlz(X) & (BitWidth - 1)))
+//
+// where C2 is computed by foldBitFloorNonzero based on C1.  The caller is
+// responsible for replacing one of the select operands.
+static Instruction *foldBitFloor(SelectInst &SI,
+                                 InstCombiner::BuilderTy &Builder) {
+  Value *TrueVal = SI.getTrueValue();
+  Value *FalseVal = SI.getFalseValue();
+
+  CmpPredicate Pred;
+  Value *Cond0;
+  if (!match(SI.getCondition(), m_ICmp(Pred, m_Value(Cond0), m_Zero())) ||
+      !ICmpInst::isEquality(Pred))
+    return nullptr;
+
+  if (Pred == ICmpInst::ICMP_NE)
+    std::swap(TrueVal, FalseVal);
+
+  if (!match(TrueVal, m_Zero()))
+    return nullptr;
+
+  return foldBitFloorNonzero(FalseVal, Cond0, Builder);
+}
+
 Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) {
   Value *CondVal = SI.getCondition();
   Value *TrueVal = SI.getTrueValue();
@@ -4392,6 +4466,9 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) {
   if (Instruction *I = foldBitCeil(SI, Builder, *this))
     return I;
 
+  if (Instruction *I = foldBitFloor(SI, Builder))
+    return replaceOperand(SI, match(SI.getTrueValue(), m_Zero()) ? 2 : 1, I);
+
   if (Instruction *I = foldSelectToCmp(SI))
     return I;
 
diff --git a/llvm/test/Transforms/InstCombine/bit_floor.ll b/llvm/test/Transforms/InstCombine/bit_floor.ll
index 2872221e8aa87..bce7e3fb45eba 100644
--- a/llvm/test/Transforms/InstCombine/bit_floor.ll
+++ b/llvm/test/Transforms/InstCombine/bit_floor.ll
@@ -4,10 +4,9 @@
 define i32 @bit_floor_32(i32 %x) {
 ; CHECK-LABEL: @bit_floor_32(
 ; CHECK-NEXT:    [[EQ0:%.*]] = icmp eq i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[LSHR:%.*]] = lshr i32 [[X]], 1
-; CHECK-NEXT:    [[CTLZ:%.*]] = tail call range(i32 1, 33) i32 @llvm.ctlz.i32(i32 [[LSHR]], i1 false)
-; CHECK-NEXT:    [[SUB:%.*]] = sub nuw nsw i32 32, [[CTLZ]]
-; CHECK-NEXT:    [[SHL:%.*]] = shl nuw i32 1, [[SUB]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call range(i32 0, 33) i32 @llvm.ctlz.i32(i32 [[X]], i1 false)
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], 31
+; CHECK-NEXT:    [[SHL:%.*]] = lshr exact i32 -2147483648, [[TMP2]]
 ; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[EQ0]], i32 0, i32 [[SHL]]
 ; CHECK-NEXT:    ret i32 [[SEL]]
 ;
@@ -23,10 +22,9 @@ define i32 @bit_floor_32(i32 %x) {
 define i64 @bit_floor_64(i64 %x) {
 ; CHECK-LABEL: @bit_floor_64(
 ; CHECK-NEXT:    [[EQ0:%.*]] = icmp eq i64 [[X:%.*]], 0
-; CHECK-NEXT:    [[LSHR:%.*]] = lshr i64 [[X]], 1
-; CHECK-NEXT:    [[CTLZ:%.*]] = tail call range(i64 1, 65) i64 @llvm.ctlz.i64(i64 [[LSHR]], i1 false)
-; CHECK-NEXT:    [[SUB:%.*]] = sub nuw nsw i64 64, [[CTLZ]]
-; CHECK-NEXT:    [[SHL:%.*]] = shl nuw i64 1, [[SUB]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call range(i64 0, 65) i64 @llvm.ctlz.i64(i64 [[X]], i1 false)
+; CHECK-NEXT:    [[TMP2:%.*]] = and i64 [[TMP1]], 63
+; CHECK-NEXT:    [[SHL:%.*]] = lshr exact i64 -9223372036854775808, [[TMP2]]
 ; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[EQ0]], i64 0, i64 [[SHL]]
 ; CHECK-NEXT:    ret i64 [[SEL]]
 ;
@@ -43,10 +41,9 @@ define i64 @bit_floor_64(i64 %x) {
 define i32 @bit_floor_commuted_operands(i32 %x) {
 ; CHECK-LABEL: @bit_floor_commuted_operands(
 ; CHECK-NEXT:    [[NE0_NOT:%.*]] = icmp eq i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[LSHR:%.*]] = lshr i32 [[X]], 1
-; CHECK-NEXT:    [[CTLZ:%.*]] = tail call range(i32 1, 33) i32 @llvm.ctlz.i32(i32 [[LSHR]], i1 false)
-; CHECK-NEXT:    [[SUB:%.*]] = sub nuw nsw i32 32, [[CTLZ]]
-; CHECK-NEXT:    [[SHL:%.*]] = shl nuw i32 1, [[SUB]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call range(i32 0, 33) i32 @llvm.ctlz.i32(i32 [[X]], i1 false)
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], 31
+; CHECK-NEXT:    [[SHL:%.*]] = lshr exact i32 -2147483648, [[TMP2]]
 ; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[NE0_NOT]], i32 0, i32 [[SHL]]
 ; CHECK-NEXT:    ret i32 [[SEL]]
 ;
@@ -151,10 +148,9 @@ define i32 @bit_floor_shl_used_twice(i32 %x, ptr %p) {
 define <4 x i32> @bit_floor_v4i32(<4 x i32> %x) {
 ; CHECK-LABEL: @bit_floor_v4i32(
 ; CHECK-NEXT:    [[EQ0:%.*]] = icmp eq <4 x i32> [[X:%.*]], zeroinitializer
-; CHECK-NEXT:    [[LSHR:%.*]] = lshr <4 x i32> [[X]], splat (i32 1)
-; CHECK-NEXT:    [[CTLZ:%.*]] = tail call range(i32 1, 33) <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[LSHR]], i1 false)
-; CHECK-NEXT:    [[SUB:%.*]] = sub nuw nsw <4 x i32> splat (i32 32), [[CTLZ]]
-; CHECK-NEXT:    [[SHL:%.*]] = shl nuw <4 x i32> splat (i32 1), [[SUB]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call range(i32 0, 33) <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[X]], i1 false)
+; CHECK-NEXT:    [[TMP2:%.*]] = and <4 x i32> [[TMP1]], splat (i32 31)
+; CHECK-NEXT:    [[SHL:%.*]] = lshr exact <4 x i32> splat (i32 -2147483648), [[TMP2]]
 ; CHECK-NEXT:    [[SEL:%.*]] = select <4 x i1> [[EQ0]], <4 x i32> zeroinitializer, <4 x i32> [[SHL]]
 ; CHECK-NEXT:    ret <4 x i32> [[SEL]]
 ;

kazutakahirata

Thank you for reviving and revising this patch!

kazutakahirata · 2025-06-22T23:08:31Z

llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp

+//
+// into
+//
+//   (1 << (C - 1)) >> ctlz(X)


Could you update this comment with the mask?

Maybe you can even mention that the mask is free on many machines.

dtcxzyw · 2025-06-23T10:59:33Z

Currently, SimplifyCFG doesn't convert the branch into a select (https://godbolt.org/z/1Kn5aPrz4), so this patch may not have real-world effects.
BTW, we don't care about undef-related miscompilation in most cases. Just querying isGuaranteedNotToBeUndef on X is okay if you really care about that. Alive2: https://godbolt.org/z/1Kn5aPrz4

[InstCombine] Generate better code for std::bit_floor from libstdc++

4072d91

This reverts commit 8a1373d.

el-ev requested review from RKSimon, nunoplopes and kazutakahirata June 19, 2025 05:35

el-ev requested a review from nikic as a code owner June 19, 2025 05:35

llvmbot added llvm:instcombine Covers the InstCombine, InstSimplify and AggressiveInstCombine passes llvm:transforms labels Jun 19, 2025

This was referenced Jun 19, 2025

Task submission dtcxzyw/llvm-opt-benchmark#1312

Open

Fuzz PR144842 dtcxzyw/llvm-mutation-based-fuzz-service#64

Closed

pre-commit: PR144842 dtcxzyw/llvm-opt-benchmark#2467

Closed

kazutakahirata reviewed Jun 22, 2025

View reviewed changes

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[InstCombine] Generate better code for `std::bit_floor` from libstdc++ #144842

[InstCombine] Generate better code for `std::bit_floor` from libstdc++ #144842

el-ev commented Jun 19, 2025

Uh oh!

llvmbot commented Jun 19, 2025

Uh oh!

kazutakahirata left a comment

Uh oh!

kazutakahirata Jun 22, 2025

Uh oh!

dtcxzyw commented Jun 23, 2025

Uh oh!

Uh oh!

[InstCombine] Generate better code for std::bit_floor from libstdc++ #144842

Are you sure you want to change the base?

[InstCombine] Generate better code for std::bit_floor from libstdc++ #144842

Conversation

el-ev commented Jun 19, 2025

Uh oh!

llvmbot commented Jun 19, 2025

Uh oh!

kazutakahirata left a comment

Choose a reason for hiding this comment

Uh oh!

kazutakahirata Jun 22, 2025

Choose a reason for hiding this comment

Uh oh!

dtcxzyw commented Jun 23, 2025

Uh oh!

Uh oh!

[InstCombine] Generate better code for `std::bit_floor` from libstdc++ #144842

[InstCombine] Generate better code for `std::bit_floor` from libstdc++ #144842