-
Notifications
You must be signed in to change notification settings - Fork 13.3k
Generated Result propagation code is needlessly complex #81146
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Comments
I just tried adding <x::foo>:
push %rax
callq <x::bar>
cmp $0x2,%eax
/-- jne <x::foo+0x17>
| callq <x::println>
| mov $0x2,%eax
| xor %edx,%edx
\-> pop %rcx
retq The only thing I'm not sure is necessary is the clearing of LLVM IR before optimization passes; main::foo
; Function Attrs: noinline nonlazybind uwtable
define internal i128 @_ZN4main3foo17hb520db95bcf5df69E() unnamed_addr #2 {
start:
%0 = alloca i128, align 8
%self = alloca %"std::result::Result<(), Error>", align 8
%1 = alloca %"std::result::Result<(), Error>", align 8
; call main::bar
%2 = call i128 @_ZN4main3bar17h199b6044e637b435E()
%3 = bitcast i128* %0 to i8*
call void @llvm.lifetime.start.p0i8(i64 16, i8* %3)
store i128 %2, i128* %0, align 8
%4 = bitcast %"std::result::Result<(), Error>"* %self to i8*
%5 = bitcast i128* %0 to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %4, i8* align 8 %5, i64 16, i1 false)
%6 = bitcast i128* %0 to i8*
call void @llvm.lifetime.end.p0i8(i64 16, i8* %6)
br label %bb4
bb1: ; preds = %bb4
; call main::println
call void @_ZN4main7println17h4b182729f4c1c974E()
br label %bb3
bb2: ; preds = %bb4
%7 = bitcast %"std::result::Result<(), Error>"* %1 to %"std::result::Result<(), Error>::Err"*
%8 = bitcast %"std::result::Result<(), Error>::Err"* %7 to %Error*
%9 = bitcast %"std::result::Result<(), Error>"* %self to %"std::result::Result<(), Error>::Err"*
%10 = bitcast %"std::result::Result<(), Error>::Err"* %9 to %Error*
%11 = bitcast %Error* %8 to i8*
%12 = bitcast %Error* %10 to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %11, i8* align 8 %12, i64 16, i1 false)
%13 = bitcast %"std::result::Result<(), Error>"* %1 to i128*
%14 = load i128, i128* %13, align 8
ret i128 %14
bb3: ; preds = %bb1
%15 = bitcast %"std::result::Result<(), Error>"* %1 to %"std::result::Result<(), Error>::Ok"*
%16 = bitcast %"std::result::Result<(), Error>::Ok"* %15 to {}*
%17 = bitcast %"std::result::Result<(), Error>"* %1 to i32*
store i32 2, i32* %17, align 8
%18 = bitcast %"std::result::Result<(), Error>"* %1 to i128*
%19 = load i128, i128* %18, align 8
ret i128 %19
bb4: ; preds = %start
%20 = bitcast %"std::result::Result<(), Error>"* %self to i32*
%21 = load i32, i32* %20, align 8, !range !13
%22 = sub i32 %21, 2
%23 = icmp eq i32 %22, 0
%_2 = select i1 %23, i64 0, i64 1
%24 = icmp eq i64 %_2, 0
br i1 %24, label %bb1, label %bb2
} Remaining questions:
> The fact that Rust packs everything into an `i128` instead of representing it as a struct is probably a large part of the problem here.
@nikic I tried changing that a little while ago. For aggregate types it seems LLVM tries to pass each field in a separate register, so arguments end up getting passed indirectly more often, e.g. #[no_mangle]
fn sum(x: [u8; 4], y: [u8; 4]) -> [u8; 4] {
[x[0] + y[0], x[1] + y[1], x[2] + y[2], x[3] + y[3]]
} ...with integer types (current nightly): mov eax, edi
and eax, 65280
mov ecx, edi
and ecx, 16711680
lea edx, [rsi + rdi]
and edi, -16777216
add edi, esi
and edi, -16777216
add ecx, esi
and ecx, 16711680
or edi, ecx
add eax, esi
and eax, 65280
or edi, eax
movzx eax, dl
or eax, edi
ret Optimized LLVM IRdefine i32 @sum(i32 %0, i32 %1) unnamed_addr #0 {
start:
%.sroa.4.0.extract.shift = and i32 %0, 65280
%.sroa.5.0.extract.shift = and i32 %0, 16711680
%.sroa.6.0.extract.shift = and i32 %0, -16777216
%_3 = add i32 %1, %0
%.sroa.64.0.extract.shift8 = add i32 %.sroa.6.0.extract.shift, %1
%.sroa.46.0.insert.shift = and i32 %.sroa.64.0.extract.shift8, -16777216
%.sroa.53.0.extract.shift10 = add i32 %.sroa.5.0.extract.shift, %1
%.sroa.3.0.insert.shift = and i32 %.sroa.53.0.extract.shift10, 16711680
%.sroa.3.0.insert.insert = or i32 %.sroa.46.0.insert.shift, %.sroa.3.0.insert.shift
%.sroa.42.0.extract.shift12 = add i32 %.sroa.4.0.extract.shift, %1
%.sroa.2.0.insert.shift = and i32 %.sroa.42.0.extract.shift12, 65280
%.sroa.2.0.insert.insert = or i32 %.sroa.3.0.insert.insert, %.sroa.2.0.insert.shift
%.sroa.05.0.insert.ext = and i32 %_3, 255
%.sroa.05.0.insert.insert = or i32 %.sroa.2.0.insert.insert, %.sroa.05.0.insert.ext
ret i32 %.sroa.05.0.insert.insert
}
attributes #0 = { norecurse nounwind nonlazybind readnone uwtable "probe-stack"="inline-asm" "target-cpu"="x86-64" } ...with aggregate types: mov rax, rdi
add r9b, sil
add dl, byte ptr [rsp + 8]
add cl, byte ptr [rsp + 16]
add r8b, byte ptr [rsp + 24]
mov byte ptr [rdi + 3], r8b
mov byte ptr [rdi + 2], cl
mov byte ptr [rdi + 1], dl
mov byte ptr [rdi], r9b
ret Optimized LLVM IRdefine [4 x i8] @sum([4 x i8] %0, [4 x i8] %1) unnamed_addr #0 {
start:
%.fca.0.extract = extractvalue [4 x i8] %0, 0
%.fca.1.extract = extractvalue [4 x i8] %0, 1
%.fca.2.extract = extractvalue [4 x i8] %0, 2
%.fca.3.extract = extractvalue [4 x i8] %0, 3
%.fca.0.extract1 = extractvalue [4 x i8] %1, 0
%.fca.1.extract2 = extractvalue [4 x i8] %1, 1
%.fca.2.extract3 = extractvalue [4 x i8] %1, 2
%.fca.3.extract4 = extractvalue [4 x i8] %1, 3
%_3 = add i8 %.fca.0.extract1, %.fca.0.extract
%_8 = add i8 %.fca.1.extract2, %.fca.1.extract
%_13 = add i8 %.fca.2.extract3, %.fca.2.extract
%_18 = add i8 %.fca.3.extract4, %.fca.3.extract
%.fca.0.insert = insertvalue [4 x i8] undef, i8 %_3, 0
%.fca.1.insert = insertvalue [4 x i8] %.fca.0.insert, i8 %_8, 1
%.fca.2.insert = insertvalue [4 x i8] %.fca.1.insert, i8 %_13, 2
%.fca.3.insert = insertvalue [4 x i8] %.fca.2.insert, i8 %_18, 3
ret [4 x i8] %.fca.3.insert
}
attributes #0 = { norecurse nounwind nonlazybind readnone uwtable "probe-stack"="inline-asm" "target-cpu"="x86-64" } Is there some attribute to get LLVM to pass an argument using as few registers as possible? Edit: |
This case was found in #80463. Replacing some infallible code with code that needed to propagate `Result`s led to some performance issues. It looked like the `Result` propagation code was less efficient than it could be. The code below approximately reproduces the original issue. The assembly for `foo` (release build for x86_64 Linux target) is what we're interested in. I believe this could optimize down to something like this instead:
When `bar` returns an `Err`, all we need to do is propagate the 16-byte `Result`, already contained in `rax` and `rdx`, which is exactly where `foo` needs to return it from.

When `bar` succeeds, all we need to do is call `println` and then return `Result::Ok` (set the appropriate enum tag in `rax`).

You can step through the original assembly and see that this all happens, but that it's done in a really convoluted way:
In the `bar`-returns-`Err` case, the shifting, etc. is such that `rax` and `rdx` contain the same `Err` value returned by `bar`, as desired. But `rax` and `rdx` already contained those values after the `bar` call, so all of that shifting, etc. was extra work.

In the `bar`-returns-`Ok` case, we do set the enum tag to that of the `Ok` variant by way of `mov %ebx, %eax`, but we also modify `rdx` needlessly (in the `Err(value)` case, `rdx` contains the `value`, but it's not needed in the `Ok` case), and we modify `rcx` and `or` it into `rax` needlessly (`rcx` contains junk in this case, but `or`-ing it in doesn't affect the tag value, since the lower bits of `rcx` have been shifted out). All of this could have been replaced with a `mov $0x2,%eax`.
.LLVM is either missing the optimization or we are not providing enough information to enable it to do the optimization.
LLVM IR after optimization passes
LLVM IR before optimization passes
Also, replacing `?` with manual propagation makes a difference, though it still produces less-than-optimal code.

@rustbot label T-compiler A-LLVM I-slow
The text was updated successfully, but these errors were encountered: