-
Notifications
You must be signed in to change notification settings - Fork 14.5k
[X86] Remove extra MOV after widening atomic load #148898
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: users/jofrn/gt/07-15-_selectiondag_widen_2_x_t_vector_types_for_atomic_load
Are you sure you want to change the base?
Conversation
Warning This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack on Graphite.
This stack of pull requests is managed by Graphite. Learn more about stacking. |
@llvm/pr-subscribers-backend-x86 Author: None (jofrn) ChangesThis change adds patterns to optimize out an extra MOV Full diff: https://github.com/llvm/llvm-project/pull/148898.diff 2 Files Affected:
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index 927b2c8b22f05..26b76dd1ca83a 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -1204,6 +1204,13 @@ def : Pat<(i16 (atomic_load_nonext_16 addr:$src)), (MOV16rm addr:$src)>;
def : Pat<(i32 (atomic_load_nonext_32 addr:$src)), (MOV32rm addr:$src)>;
def : Pat<(i64 (atomic_load_nonext_64 addr:$src)), (MOV64rm addr:$src)>;
+def : Pat<(v4i32 (scalar_to_vector (i32 (zext (i16 (atomic_load_16 addr:$src)))))),
+ (MOVDI2PDIrm addr:$src)>; // load atomic <2 x i8>
+def : Pat<(v4i32 (scalar_to_vector (i32 (atomic_load_32 addr:$src)))),
+ (MOVDI2PDIrm addr:$src)>; // load atomic <2 x i16>
+def : Pat<(v2i64 (scalar_to_vector (i64 (atomic_load_64 addr:$src)))),
+ (MOV64toPQIrm addr:$src)>; // load atomic <2 x i32,float>
+
// Floating point loads/stores.
def : Pat<(atomic_store_32 (i32 (bitconvert (f32 FR32:$src))), addr:$dst),
(MOVSSmr addr:$dst, FR32:$src)>, Requires<[UseSSE1]>;
diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll
index ff5391f44bbe3..4b818b6cfa57e 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/X86/atomic-load-store.ll
@@ -319,159 +319,60 @@ define <2 x i8> @atomic_vec2_i8(ptr %x) {
define <2 x i16> @atomic_vec2_i16(ptr %x) {
; CHECK-O3-LABEL: atomic_vec2_i16:
; CHECK-O3: # %bb.0:
-; CHECK-O3-NEXT: movl (%rdi), %eax
-; CHECK-O3-NEXT: movd %eax, %xmm0
+; CHECK-O3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-O3-NEXT: retq
;
; CHECK-SSE-O3-LABEL: atomic_vec2_i16:
; CHECK-SSE-O3: # %bb.0:
-; CHECK-SSE-O3-NEXT: movl (%rdi), %eax
-; CHECK-SSE-O3-NEXT: movd %eax, %xmm0
+; CHECK-SSE-O3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-SSE-O3-NEXT: retq
;
; CHECK-AVX-O3-LABEL: atomic_vec2_i16:
; CHECK-AVX-O3: # %bb.0:
-; CHECK-AVX-O3-NEXT: movl (%rdi), %eax
-; CHECK-AVX-O3-NEXT: vmovd %eax, %xmm0
+; CHECK-AVX-O3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-AVX-O3-NEXT: retq
;
; CHECK-O0-LABEL: atomic_vec2_i16:
; CHECK-O0: # %bb.0:
-; CHECK-O0-NEXT: movl (%rdi), %eax
-; CHECK-O0-NEXT: movd %eax, %xmm0
+; CHECK-O0-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-O0-NEXT: retq
;
; CHECK-SSE-O0-LABEL: atomic_vec2_i16:
; CHECK-SSE-O0: # %bb.0:
-; CHECK-SSE-O0-NEXT: movl (%rdi), %eax
-; CHECK-SSE-O0-NEXT: movd %eax, %xmm0
+; CHECK-SSE-O0-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-SSE-O0-NEXT: retq
;
; CHECK-AVX-O0-LABEL: atomic_vec2_i16:
; CHECK-AVX-O0: # %bb.0:
-; CHECK-AVX-O0-NEXT: movl (%rdi), %eax
-; CHECK-AVX-O0-NEXT: vmovd %eax, %xmm0
+; CHECK-AVX-O0-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-AVX-O0-NEXT: retq
%ret = load atomic <2 x i16>, ptr %x acquire, align 4
ret <2 x i16> %ret
}
define <2 x ptr addrspace(270)> @atomic_vec2_ptr270(ptr %x) {
-; CHECK-O3-LABEL: atomic_vec2_ptr270:
-; CHECK-O3: # %bb.0:
-; CHECK-O3-NEXT: movq (%rdi), %rax
-; CHECK-O3-NEXT: movq %rax, %xmm0
-; CHECK-O3-NEXT: retq
-;
-; CHECK-SSE-O3-LABEL: atomic_vec2_ptr270:
-; CHECK-SSE-O3: # %bb.0:
-; CHECK-SSE-O3-NEXT: movq (%rdi), %rax
-; CHECK-SSE-O3-NEXT: movq %rax, %xmm0
-; CHECK-SSE-O3-NEXT: retq
-;
-; CHECK-AVX-O3-LABEL: atomic_vec2_ptr270:
-; CHECK-AVX-O3: # %bb.0:
-; CHECK-AVX-O3-NEXT: movq (%rdi), %rax
-; CHECK-AVX-O3-NEXT: vmovq %rax, %xmm0
-; CHECK-AVX-O3-NEXT: retq
-;
-; CHECK-O0-LABEL: atomic_vec2_ptr270:
-; CHECK-O0: # %bb.0:
-; CHECK-O0-NEXT: movq (%rdi), %rax
-; CHECK-O0-NEXT: movq %rax, %xmm0
-; CHECK-O0-NEXT: retq
-;
-; CHECK-SSE-O0-LABEL: atomic_vec2_ptr270:
-; CHECK-SSE-O0: # %bb.0:
-; CHECK-SSE-O0-NEXT: movq (%rdi), %rax
-; CHECK-SSE-O0-NEXT: movq %rax, %xmm0
-; CHECK-SSE-O0-NEXT: retq
-;
-; CHECK-AVX-O0-LABEL: atomic_vec2_ptr270:
-; CHECK-AVX-O0: # %bb.0:
-; CHECK-AVX-O0-NEXT: movq (%rdi), %rax
-; CHECK-AVX-O0-NEXT: vmovq %rax, %xmm0
-; CHECK-AVX-O0-NEXT: retq
+; CHECK-LABEL: atomic_vec2_ptr270:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq (%rdi), %xmm0
+; CHECK-NEXT: retq
%ret = load atomic <2 x ptr addrspace(270)>, ptr %x acquire, align 8
ret <2 x ptr addrspace(270)> %ret
}
define <2 x i32> @atomic_vec2_i32_align(ptr %x) {
-; CHECK-O3-LABEL: atomic_vec2_i32_align:
-; CHECK-O3: # %bb.0:
-; CHECK-O3-NEXT: movq (%rdi), %rax
-; CHECK-O3-NEXT: movq %rax, %xmm0
-; CHECK-O3-NEXT: retq
-;
-; CHECK-SSE-O3-LABEL: atomic_vec2_i32_align:
-; CHECK-SSE-O3: # %bb.0:
-; CHECK-SSE-O3-NEXT: movq (%rdi), %rax
-; CHECK-SSE-O3-NEXT: movq %rax, %xmm0
-; CHECK-SSE-O3-NEXT: retq
-;
-; CHECK-AVX-O3-LABEL: atomic_vec2_i32_align:
-; CHECK-AVX-O3: # %bb.0:
-; CHECK-AVX-O3-NEXT: movq (%rdi), %rax
-; CHECK-AVX-O3-NEXT: vmovq %rax, %xmm0
-; CHECK-AVX-O3-NEXT: retq
-;
-; CHECK-O0-LABEL: atomic_vec2_i32_align:
-; CHECK-O0: # %bb.0:
-; CHECK-O0-NEXT: movq (%rdi), %rax
-; CHECK-O0-NEXT: movq %rax, %xmm0
-; CHECK-O0-NEXT: retq
-;
-; CHECK-SSE-O0-LABEL: atomic_vec2_i32_align:
-; CHECK-SSE-O0: # %bb.0:
-; CHECK-SSE-O0-NEXT: movq (%rdi), %rax
-; CHECK-SSE-O0-NEXT: movq %rax, %xmm0
-; CHECK-SSE-O0-NEXT: retq
-;
-; CHECK-AVX-O0-LABEL: atomic_vec2_i32_align:
-; CHECK-AVX-O0: # %bb.0:
-; CHECK-AVX-O0-NEXT: movq (%rdi), %rax
-; CHECK-AVX-O0-NEXT: vmovq %rax, %xmm0
-; CHECK-AVX-O0-NEXT: retq
+; CHECK-LABEL: atomic_vec2_i32_align:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq (%rdi), %xmm0
+; CHECK-NEXT: retq
%ret = load atomic <2 x i32>, ptr %x acquire, align 8
ret <2 x i32> %ret
}
define <2 x float> @atomic_vec2_float_align(ptr %x) {
-; CHECK-O3-LABEL: atomic_vec2_float_align:
-; CHECK-O3: # %bb.0:
-; CHECK-O3-NEXT: movq (%rdi), %rax
-; CHECK-O3-NEXT: movq %rax, %xmm0
-; CHECK-O3-NEXT: retq
-;
-; CHECK-SSE-O3-LABEL: atomic_vec2_float_align:
-; CHECK-SSE-O3: # %bb.0:
-; CHECK-SSE-O3-NEXT: movq (%rdi), %rax
-; CHECK-SSE-O3-NEXT: movq %rax, %xmm0
-; CHECK-SSE-O3-NEXT: retq
-;
-; CHECK-AVX-O3-LABEL: atomic_vec2_float_align:
-; CHECK-AVX-O3: # %bb.0:
-; CHECK-AVX-O3-NEXT: movq (%rdi), %rax
-; CHECK-AVX-O3-NEXT: vmovq %rax, %xmm0
-; CHECK-AVX-O3-NEXT: retq
-;
-; CHECK-O0-LABEL: atomic_vec2_float_align:
-; CHECK-O0: # %bb.0:
-; CHECK-O0-NEXT: movq (%rdi), %rax
-; CHECK-O0-NEXT: movq %rax, %xmm0
-; CHECK-O0-NEXT: retq
-;
-; CHECK-SSE-O0-LABEL: atomic_vec2_float_align:
-; CHECK-SSE-O0: # %bb.0:
-; CHECK-SSE-O0-NEXT: movq (%rdi), %rax
-; CHECK-SSE-O0-NEXT: movq %rax, %xmm0
-; CHECK-SSE-O0-NEXT: retq
-;
-; CHECK-AVX-O0-LABEL: atomic_vec2_float_align:
-; CHECK-AVX-O0: # %bb.0:
-; CHECK-AVX-O0-NEXT: movq (%rdi), %rax
-; CHECK-AVX-O0-NEXT: vmovq %rax, %xmm0
-; CHECK-AVX-O0-NEXT: retq
+; CHECK-LABEL: atomic_vec2_float_align:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq (%rdi), %xmm0
+; CHECK-NEXT: retq
%ret = load atomic <2 x float>, ptr %x acquire, align 8
ret <2 x float> %ret
}
@@ -900,79 +801,42 @@ define <2 x i32> @atomic_vec2_i32(ptr %x) nounwind {
define <4 x i8> @atomic_vec4_i8(ptr %x) nounwind {
; CHECK-O3-LABEL: atomic_vec4_i8:
; CHECK-O3: # %bb.0:
-; CHECK-O3-NEXT: movl (%rdi), %eax
-; CHECK-O3-NEXT: movd %eax, %xmm0
+; CHECK-O3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-O3-NEXT: retq
;
; CHECK-SSE-O3-LABEL: atomic_vec4_i8:
; CHECK-SSE-O3: # %bb.0:
-; CHECK-SSE-O3-NEXT: movl (%rdi), %eax
-; CHECK-SSE-O3-NEXT: movd %eax, %xmm0
+; CHECK-SSE-O3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-SSE-O3-NEXT: retq
;
; CHECK-AVX-O3-LABEL: atomic_vec4_i8:
; CHECK-AVX-O3: # %bb.0:
-; CHECK-AVX-O3-NEXT: movl (%rdi), %eax
-; CHECK-AVX-O3-NEXT: vmovd %eax, %xmm0
+; CHECK-AVX-O3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-AVX-O3-NEXT: retq
;
; CHECK-O0-LABEL: atomic_vec4_i8:
; CHECK-O0: # %bb.0:
-; CHECK-O0-NEXT: movl (%rdi), %eax
-; CHECK-O0-NEXT: movd %eax, %xmm0
+; CHECK-O0-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-O0-NEXT: retq
;
; CHECK-SSE-O0-LABEL: atomic_vec4_i8:
; CHECK-SSE-O0: # %bb.0:
-; CHECK-SSE-O0-NEXT: movl (%rdi), %eax
-; CHECK-SSE-O0-NEXT: movd %eax, %xmm0
+; CHECK-SSE-O0-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-SSE-O0-NEXT: retq
;
; CHECK-AVX-O0-LABEL: atomic_vec4_i8:
; CHECK-AVX-O0: # %bb.0:
-; CHECK-AVX-O0-NEXT: movl (%rdi), %eax
-; CHECK-AVX-O0-NEXT: vmovd %eax, %xmm0
+; CHECK-AVX-O0-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-AVX-O0-NEXT: retq
%ret = load atomic <4 x i8>, ptr %x acquire, align 4
ret <4 x i8> %ret
}
define <4 x i16> @atomic_vec4_i16(ptr %x) nounwind {
-; CHECK-O3-LABEL: atomic_vec4_i16:
-; CHECK-O3: # %bb.0:
-; CHECK-O3-NEXT: movq (%rdi), %rax
-; CHECK-O3-NEXT: movq %rax, %xmm0
-; CHECK-O3-NEXT: retq
-;
-; CHECK-SSE-O3-LABEL: atomic_vec4_i16:
-; CHECK-SSE-O3: # %bb.0:
-; CHECK-SSE-O3-NEXT: movq (%rdi), %rax
-; CHECK-SSE-O3-NEXT: movq %rax, %xmm0
-; CHECK-SSE-O3-NEXT: retq
-;
-; CHECK-AVX-O3-LABEL: atomic_vec4_i16:
-; CHECK-AVX-O3: # %bb.0:
-; CHECK-AVX-O3-NEXT: movq (%rdi), %rax
-; CHECK-AVX-O3-NEXT: vmovq %rax, %xmm0
-; CHECK-AVX-O3-NEXT: retq
-;
-; CHECK-O0-LABEL: atomic_vec4_i16:
-; CHECK-O0: # %bb.0:
-; CHECK-O0-NEXT: movq (%rdi), %rax
-; CHECK-O0-NEXT: movq %rax, %xmm0
-; CHECK-O0-NEXT: retq
-;
-; CHECK-SSE-O0-LABEL: atomic_vec4_i16:
-; CHECK-SSE-O0: # %bb.0:
-; CHECK-SSE-O0-NEXT: movq (%rdi), %rax
-; CHECK-SSE-O0-NEXT: movq %rax, %xmm0
-; CHECK-SSE-O0-NEXT: retq
-;
-; CHECK-AVX-O0-LABEL: atomic_vec4_i16:
-; CHECK-AVX-O0: # %bb.0:
-; CHECK-AVX-O0-NEXT: movq (%rdi), %rax
-; CHECK-AVX-O0-NEXT: vmovq %rax, %xmm0
-; CHECK-AVX-O0-NEXT: retq
+; CHECK-LABEL: atomic_vec4_i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq (%rdi), %xmm0
+; CHECK-NEXT: retq
%ret = load atomic <4 x i16>, ptr %x acquire, align 8
ret <4 x i16> %ret
}
|
This change adds patterns to optimize out an extra MOV present after widening the atomic load.
@@ -1204,6 +1204,13 @@ def : Pat<(i16 (atomic_load_nonext_16 addr:$src)), (MOV16rm addr:$src)>; | |||
def : Pat<(i32 (atomic_load_nonext_32 addr:$src)), (MOV32rm addr:$src)>; | |||
def : Pat<(i64 (atomic_load_nonext_64 addr:$src)), (MOV64rm addr:$src)>; | |||
|
|||
def : Pat<(v4i32 (scalar_to_vector (i32 (zext (i16 (atomic_load_16 addr:$src)))))), | |||
(MOVDI2PDIrm addr:$src)>; // load atomic <2 x i8> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This looks wrong. I think you need pinsrw or vpbroadcastw to load 16 bits.
(MOVDI2PDIrm addr:$src)>; // load atomic <2 x i16> | ||
def : Pat<(v2i64 (scalar_to_vector (i64 (atomic_load_64 addr:$src)))), | ||
(MOV64toPQIrm addr:$src)>; // load atomic <2 x i32,float> | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You need to make these patterns SSE only and provide AVX and AVX-512 equivalent patterns.
b8727ee
to
ff8979e
Compare
73ea819
to
da4fe6c
Compare
This change adds patterns to optimize out an extra MOV
present after widening the atomic load.