Skip to content

Commit 995f741

Browse files
committed
Auto merge of #42556 - scottmcm:ctz-nz, r=BurntSushi
Get LLVM to stop generating dead assembly in next_power_of_two It turns out that LLVM can turn `@llvm.ctlz.i64(_, true)` into `@llvm.ctlz.i64(_, false)` ([`ctlz`](http://llvm.org/docs/LangRef.html#llvm-ctlz-intrinsic)) where valuable, but never does the opposite. That leads to some silly assembly getting generated in certain cases. A contrived-but-clear example https://is.gd/VAIKuC: ```rust fn foo(x:u64) -> u32 { if x == 0 { return !0; } x.leading_zeros() } ``` Generates ```asm testq %rdi, %rdi je .LBB0_1 je .LBB0_3 ; <-- wha? bsrq %rdi, %rax xorq $63, %rax retq .LBB0_1: movl $-1, %eax retq .LBB0_3: movl $64, %eax ; <-- dead retq ``` I noticed this in `next_power_of_two`, which without this PR generates the following: ```asm cmpq $2, %rcx jae .LBB1_2 movl $1, %eax retq .LBB1_2: decq %rcx je .LBB1_3 bsrq %rcx, %rcx xorq $63, %rcx jmp .LBB1_5 .LBB1_3: movl $64, %ecx ; <-- dead .LBB1_5: movq $-1, %rax shrq %cl, %rax incq %rax retq ``` And with this PR becomes ```asm cmpq $2, %rcx jae .LBB0_2 movl $1, %eax retq .LBB0_2: decq %rcx bsrq %rcx, %rcx xorl $63, %ecx movq $-1, %rax shrq %cl, %rax incq %rax retq ```
2 parents e148049 + 6d86f0c commit 995f741

File tree

5 files changed

+96
-3
lines changed

5 files changed

+96
-3
lines changed

src/libcore/intrinsics.rs

+34
Original file line numberDiff line numberDiff line change
@@ -1229,6 +1229,23 @@ extern "rust-intrinsic" {
12291229
/// ```
12301230
pub fn ctlz<T>(x: T) -> T;
12311231

1232+
/// Like `ctlz`, but extra-unsafe as it returns `undef` when
1233+
/// given an `x` with value `0`.
1234+
///
1235+
/// # Examples
1236+
///
1237+
/// ```
1238+
/// #![feature(core_intrinsics)]
1239+
///
1240+
/// use std::intrinsics::ctlz_nonzero;
1241+
///
1242+
/// let x = 0b0001_1100_u8;
1243+
/// let num_leading = unsafe { ctlz_nonzero(x) };
1244+
/// assert_eq!(num_leading, 3);
1245+
/// ```
1246+
#[cfg(not(stage0))]
1247+
pub fn ctlz_nonzero<T>(x: T) -> T;
1248+
12321249
/// Returns the number of trailing unset bits (zeroes) in an integer type `T`.
12331250
///
12341251
/// # Examples
@@ -1256,6 +1273,23 @@ extern "rust-intrinsic" {
12561273
/// ```
12571274
pub fn cttz<T>(x: T) -> T;
12581275

1276+
/// Like `cttz`, but extra-unsafe as it returns `undef` when
1277+
/// given an `x` with value `0`.
1278+
///
1279+
/// # Examples
1280+
///
1281+
/// ```
1282+
/// #![feature(core_intrinsics)]
1283+
///
1284+
/// use std::intrinsics::cttz_nonzero;
1285+
///
1286+
/// let x = 0b0011_1000_u8;
1287+
/// let num_trailing = unsafe { cttz_nonzero(x) };
1288+
/// assert_eq!(num_trailing, 3);
1289+
/// ```
1290+
#[cfg(not(stage0))]
1291+
pub fn cttz_nonzero<T>(x: T) -> T;
1292+
12591293
/// Reverses the bytes in an integer type `T`.
12601294
pub fn bswap<T>(x: T) -> T;
12611295

src/libcore/num/mod.rs

+16-1
Original file line numberDiff line numberDiff line change
@@ -1262,6 +1262,7 @@ macro_rules! uint_impl {
12621262
($SelfT:ty, $ActualT:ty, $BITS:expr,
12631263
$ctpop:path,
12641264
$ctlz:path,
1265+
$ctlz_nonzero:path,
12651266
$cttz:path,
12661267
$bswap:path,
12671268
$add_with_overflow:path,
@@ -2184,6 +2185,7 @@ macro_rules! uint_impl {
21842185
// This method cannot overflow, as in the `next_power_of_two`
21852186
// overflow cases it instead ends up returning the maximum value
21862187
// of the type, and can return 0 for 0.
2188+
#[inline]
21872189
fn one_less_than_next_power_of_two(self) -> Self {
21882190
if self <= 1 { return 0; }
21892191

@@ -2192,7 +2194,7 @@ macro_rules! uint_impl {
21922194
// (such as intel pre-haswell) have more efficient ctlz
21932195
// intrinsics when the argument is non-zero.
21942196
let p = self - 1;
2195-
let z = p.leading_zeros();
2197+
let z = unsafe { $ctlz_nonzero(p) };
21962198
<$SelfT>::max_value() >> z
21972199
}
21982200

@@ -2236,11 +2238,17 @@ macro_rules! uint_impl {
22362238
}
22372239
}
22382240

2241+
#[cfg(stage0)]
2242+
unsafe fn ctlz_nonzero<T>(x: T) -> T { intrinsics::ctlz(x) }
2243+
#[cfg(not(stage0))]
2244+
unsafe fn ctlz_nonzero<T>(x: T) -> T { intrinsics::ctlz_nonzero(x) }
2245+
22392246
#[lang = "u8"]
22402247
impl u8 {
22412248
uint_impl! { u8, u8, 8,
22422249
intrinsics::ctpop,
22432250
intrinsics::ctlz,
2251+
ctlz_nonzero,
22442252
intrinsics::cttz,
22452253
intrinsics::bswap,
22462254
intrinsics::add_with_overflow,
@@ -2253,6 +2261,7 @@ impl u16 {
22532261
uint_impl! { u16, u16, 16,
22542262
intrinsics::ctpop,
22552263
intrinsics::ctlz,
2264+
ctlz_nonzero,
22562265
intrinsics::cttz,
22572266
intrinsics::bswap,
22582267
intrinsics::add_with_overflow,
@@ -2265,6 +2274,7 @@ impl u32 {
22652274
uint_impl! { u32, u32, 32,
22662275
intrinsics::ctpop,
22672276
intrinsics::ctlz,
2277+
ctlz_nonzero,
22682278
intrinsics::cttz,
22692279
intrinsics::bswap,
22702280
intrinsics::add_with_overflow,
@@ -2277,6 +2287,7 @@ impl u64 {
22772287
uint_impl! { u64, u64, 64,
22782288
intrinsics::ctpop,
22792289
intrinsics::ctlz,
2290+
ctlz_nonzero,
22802291
intrinsics::cttz,
22812292
intrinsics::bswap,
22822293
intrinsics::add_with_overflow,
@@ -2289,6 +2300,7 @@ impl u128 {
22892300
uint_impl! { u128, u128, 128,
22902301
intrinsics::ctpop,
22912302
intrinsics::ctlz,
2303+
ctlz_nonzero,
22922304
intrinsics::cttz,
22932305
intrinsics::bswap,
22942306
intrinsics::add_with_overflow,
@@ -2302,6 +2314,7 @@ impl usize {
23022314
uint_impl! { usize, u16, 16,
23032315
intrinsics::ctpop,
23042316
intrinsics::ctlz,
2317+
ctlz_nonzero,
23052318
intrinsics::cttz,
23062319
intrinsics::bswap,
23072320
intrinsics::add_with_overflow,
@@ -2314,6 +2327,7 @@ impl usize {
23142327
uint_impl! { usize, u32, 32,
23152328
intrinsics::ctpop,
23162329
intrinsics::ctlz,
2330+
ctlz_nonzero,
23172331
intrinsics::cttz,
23182332
intrinsics::bswap,
23192333
intrinsics::add_with_overflow,
@@ -2327,6 +2341,7 @@ impl usize {
23272341
uint_impl! { usize, u64, 64,
23282342
intrinsics::ctpop,
23292343
intrinsics::ctlz,
2344+
ctlz_nonzero,
23302345
intrinsics::cttz,
23312346
intrinsics::bswap,
23322347
intrinsics::add_with_overflow,

src/librustc_trans/intrinsic.rs

+7-1
Original file line numberDiff line numberDiff line change
@@ -267,7 +267,7 @@ pub fn trans_intrinsic_call<'a, 'tcx>(bcx: &Builder<'a, 'tcx>,
267267
};
268268
bcx.call(expect, &[llargs[0], C_i32(ccx, rw), llargs[1], C_i32(ccx, cache_type)], None)
269269
},
270-
"ctlz" | "cttz" | "ctpop" | "bswap" |
270+
"ctlz" | "ctlz_nonzero" | "cttz" | "cttz_nonzero" | "ctpop" | "bswap" |
271271
"add_with_overflow" | "sub_with_overflow" | "mul_with_overflow" |
272272
"overflowing_add" | "overflowing_sub" | "overflowing_mul" |
273273
"unchecked_div" | "unchecked_rem" | "unchecked_shl" | "unchecked_shr" => {
@@ -280,6 +280,12 @@ pub fn trans_intrinsic_call<'a, 'tcx>(bcx: &Builder<'a, 'tcx>,
280280
let llfn = ccx.get_intrinsic(&format!("llvm.{}.i{}", name, width));
281281
bcx.call(llfn, &[llargs[0], y], None)
282282
}
283+
"ctlz_nonzero" | "cttz_nonzero" => {
284+
let y = C_bool(bcx.ccx, true);
285+
let llvm_name = &format!("llvm.{}.i{}", &name[..4], width);
286+
let llfn = ccx.get_intrinsic(llvm_name);
287+
bcx.call(llfn, &[llargs[0], y], None)
288+
}
283289
"ctpop" => bcx.call(ccx.get_intrinsic(&format!("llvm.ctpop.i{}", width)),
284290
&llargs, None),
285291
"bswap" => {

src/librustc_typeck/check/intrinsic.rs

+2-1
Original file line numberDiff line numberDiff line change
@@ -272,7 +272,8 @@ pub fn check_intrinsic_type<'a, 'tcx>(tcx: TyCtxt<'a, 'tcx, 'tcx>,
272272
"volatile_store" =>
273273
(1, vec![ tcx.mk_mut_ptr(param(0)), param(0) ], tcx.mk_nil()),
274274

275-
"ctpop" | "ctlz" | "cttz" | "bswap" => (1, vec![param(0)], param(0)),
275+
"ctpop" | "ctlz" | "ctlz_nonzero" | "cttz" | "cttz_nonzero" | "bswap" =>
276+
(1, vec![param(0)], param(0)),
276277

277278
"add_with_overflow" | "sub_with_overflow" | "mul_with_overflow" =>
278279
(1, vec![param(0), param(0)],

src/test/run-pass/intrinsics-integer.rs

+37
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,9 @@ mod rusti {
1414
extern "rust-intrinsic" {
1515
pub fn ctpop<T>(x: T) -> T;
1616
pub fn ctlz<T>(x: T) -> T;
17+
pub fn ctlz_nonzero<T>(x: T) -> T;
1718
pub fn cttz<T>(x: T) -> T;
19+
pub fn cttz_nonzero<T>(x: T) -> T;
1820
pub fn bswap<T>(x: T) -> T;
1921
}
2022
}
@@ -68,6 +70,21 @@ pub fn main() {
6870
assert_eq!(ctlz(100u32), 25); assert_eq!(ctlz(100i32), 25);
6971
assert_eq!(ctlz(100u64), 57); assert_eq!(ctlz(100i64), 57);
7072

73+
assert_eq!(ctlz_nonzero(1u8), 7); assert_eq!(ctlz_nonzero(1i8), 7);
74+
assert_eq!(ctlz_nonzero(1u16), 15); assert_eq!(ctlz_nonzero(1i16), 15);
75+
assert_eq!(ctlz_nonzero(1u32), 31); assert_eq!(ctlz_nonzero(1i32), 31);
76+
assert_eq!(ctlz_nonzero(1u64), 63); assert_eq!(ctlz_nonzero(1i64), 63);
77+
78+
assert_eq!(ctlz_nonzero(10u8), 4); assert_eq!(ctlz_nonzero(10i8), 4);
79+
assert_eq!(ctlz_nonzero(10u16), 12); assert_eq!(ctlz_nonzero(10i16), 12);
80+
assert_eq!(ctlz_nonzero(10u32), 28); assert_eq!(ctlz_nonzero(10i32), 28);
81+
assert_eq!(ctlz_nonzero(10u64), 60); assert_eq!(ctlz_nonzero(10i64), 60);
82+
83+
assert_eq!(ctlz_nonzero(100u8), 1); assert_eq!(ctlz_nonzero(100i8), 1);
84+
assert_eq!(ctlz_nonzero(100u16), 9); assert_eq!(ctlz_nonzero(100i16), 9);
85+
assert_eq!(ctlz_nonzero(100u32), 25); assert_eq!(ctlz_nonzero(100i32), 25);
86+
assert_eq!(ctlz_nonzero(100u64), 57); assert_eq!(ctlz_nonzero(100i64), 57);
87+
7188
assert_eq!(cttz(-1i8 as u8), 0); assert_eq!(cttz(-1i8), 0);
7289
assert_eq!(cttz(-1i16 as u16), 0); assert_eq!(cttz(-1i16), 0);
7390
assert_eq!(cttz(-1i32 as u32), 0); assert_eq!(cttz(-1i32), 0);
@@ -93,6 +110,26 @@ pub fn main() {
93110
assert_eq!(cttz(100u32), 2); assert_eq!(cttz(100i32), 2);
94111
assert_eq!(cttz(100u64), 2); assert_eq!(cttz(100i64), 2);
95112

113+
assert_eq!(cttz_nonzero(-1i8 as u8), 0); assert_eq!(cttz_nonzero(-1i8), 0);
114+
assert_eq!(cttz_nonzero(-1i16 as u16), 0); assert_eq!(cttz_nonzero(-1i16), 0);
115+
assert_eq!(cttz_nonzero(-1i32 as u32), 0); assert_eq!(cttz_nonzero(-1i32), 0);
116+
assert_eq!(cttz_nonzero(-1i64 as u64), 0); assert_eq!(cttz_nonzero(-1i64), 0);
117+
118+
assert_eq!(cttz_nonzero(1u8), 0); assert_eq!(cttz_nonzero(1i8), 0);
119+
assert_eq!(cttz_nonzero(1u16), 0); assert_eq!(cttz_nonzero(1i16), 0);
120+
assert_eq!(cttz_nonzero(1u32), 0); assert_eq!(cttz_nonzero(1i32), 0);
121+
assert_eq!(cttz_nonzero(1u64), 0); assert_eq!(cttz_nonzero(1i64), 0);
122+
123+
assert_eq!(cttz_nonzero(10u8), 1); assert_eq!(cttz_nonzero(10i8), 1);
124+
assert_eq!(cttz_nonzero(10u16), 1); assert_eq!(cttz_nonzero(10i16), 1);
125+
assert_eq!(cttz_nonzero(10u32), 1); assert_eq!(cttz_nonzero(10i32), 1);
126+
assert_eq!(cttz_nonzero(10u64), 1); assert_eq!(cttz_nonzero(10i64), 1);
127+
128+
assert_eq!(cttz_nonzero(100u8), 2); assert_eq!(cttz_nonzero(100i8), 2);
129+
assert_eq!(cttz_nonzero(100u16), 2); assert_eq!(cttz_nonzero(100i16), 2);
130+
assert_eq!(cttz_nonzero(100u32), 2); assert_eq!(cttz_nonzero(100i32), 2);
131+
assert_eq!(cttz_nonzero(100u64), 2); assert_eq!(cttz_nonzero(100i64), 2);
132+
96133
assert_eq!(bswap(0x0Au8), 0x0A); // no-op
97134
assert_eq!(bswap(0x0Ai8), 0x0A); // no-op
98135
assert_eq!(bswap(0x0A0Bu16), 0x0B0A);

0 commit comments

Comments
 (0)