| commit 7b03725097872fbd3369a7213c1d98b372aa2d78 |
| Author: Sanjay Patel <spatel@rotateright.com> |
| Date: Fri Feb 4 07:45:57 2022 -0500 |
| |
| Revert "[x86] try harder to scalarize a vector load with extracted integer op uses" |
| |
| This reverts commit b4b97ec813a02585000f30ac7d532dda74e8bfda. |
| |
| As discussed in the post-commit feedback at |
| https://reviews.llvm.org/D118376 |
| there is a stage 2 failure on a Mac when running a clang-refactor tool test. |
| |
| diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp |
| index 1d927d59d1ae..04b5984ce424 100644 |
| --- a/llvm/lib/Target/X86/X86ISelLowering.cpp |
| +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp |
| @@ -43106,38 +43106,6 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, |
| } |
| } |
| |
| - // If this extract is from a loaded vector value and will be used as an |
| - // integer, that requires a potentially expensive XMM -> GPR transfer. |
| - // Additionally, if we can convert to a scalar integer load, that will likely |
| - // be folded into a subsequent integer op. |
| - // Note: Unlike the related fold for this in DAGCombiner, this is not limited |
| - // to a single-use of the loaded vector. For the reasons above, we |
| - // expect this to be profitable even if it creates an extra load. |
| - bool LikelyUsedAsVector = any_of(N->uses(), [](SDNode *Use) { |
| - return Use->getOpcode() == ISD::STORE || |
| - Use->getOpcode() == ISD::INSERT_VECTOR_ELT || |
| - Use->getOpcode() == ISD::SCALAR_TO_VECTOR; |
| - }); |
| - auto *LoadVec = dyn_cast<LoadSDNode>(InputVector); |
| - if (LoadVec && CIdx && ISD::isNormalLoad(LoadVec) && VT.isInteger() && |
| - SrcVT.getVectorElementType() == VT && DCI.isAfterLegalizeDAG() && |
| - !LikelyUsedAsVector) { |
| - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); |
| - SDValue NewPtr = |
| - TLI.getVectorElementPointer(DAG, LoadVec->getBasePtr(), SrcVT, EltIdx); |
| - unsigned PtrOff = VT.getSizeInBits() * CIdx->getZExtValue() / 8; |
| - MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff); |
| - Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff); |
| - SDValue Load = |
| - DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment, |
| - LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo()); |
| - SDValue Chain = Load.getValue(1); |
| - SDValue From[] = {SDValue(N, 0), SDValue(LoadVec, 1)}; |
| - SDValue To[] = {Load, Chain}; |
| - DAG.ReplaceAllUsesOfValuesWith(From, To, 2); |
| - return SDValue(N, 0); |
| - } |
| - |
| return SDValue(); |
| } |
| |
| diff --git a/llvm/test/CodeGen/X86/2011-12-26-extractelement-duplicate-load.ll b/llvm/test/CodeGen/X86/2011-12-26-extractelement-duplicate-load.ll |
| index 5fb01b8c1b99..ad52d58bde1c 100644 |
| --- a/llvm/test/CodeGen/X86/2011-12-26-extractelement-duplicate-load.ll |
| +++ b/llvm/test/CodeGen/X86/2011-12-26-extractelement-duplicate-load.ll |
| @@ -10,13 +10,13 @@ |
| define <4 x i32> @test(<4 x i32>* %p) { |
| ; CHECK-LABEL: test: |
| ; CHECK: # %bb.0: |
| -; CHECK-NEXT: cmpl $3, 8(%rdi) |
| -; CHECK-NEXT: je .LBB0_1 |
| -; CHECK-NEXT: # %bb.2: |
| -; CHECK-NEXT: xorps %xmm0, %xmm0 |
| -; CHECK-NEXT: retq |
| -; CHECK-NEXT: .LBB0_1: |
| ; CHECK-NEXT: movaps (%rdi), %xmm0 |
| +; CHECK-NEXT: extractps $2, %xmm0, %eax |
| +; CHECK-NEXT: cmpl $3, %eax |
| +; CHECK-NEXT: je .LBB0_2 |
| +; CHECK-NEXT: # %bb.1: |
| +; CHECK-NEXT: xorps %xmm0, %xmm0 |
| +; CHECK-NEXT: .LBB0_2: |
| ; CHECK-NEXT: retq |
| %v = load <4 x i32>, <4 x i32>* %p |
| %e = extractelement <4 x i32> %v, i32 2 |
| diff --git a/llvm/test/CodeGen/X86/avx512-cvt.ll b/llvm/test/CodeGen/X86/avx512-cvt.ll |
| index f632db7160e6..5d7940b2fa7c 100644 |
| --- a/llvm/test/CodeGen/X86/avx512-cvt.ll |
| +++ b/llvm/test/CodeGen/X86/avx512-cvt.ll |
| @@ -148,12 +148,18 @@ define <2 x float> @sltof2f32(<2 x i64> %a) { |
| define <4 x float> @slto4f32_mem(<4 x i64>* %a) { |
| ; NODQ-LABEL: slto4f32_mem: |
| ; NODQ: # %bb.0: |
| -; NODQ-NEXT: vcvtsi2ssq 8(%rdi), %xmm0, %xmm0 |
| -; NODQ-NEXT: vcvtsi2ssq (%rdi), %xmm1, %xmm1 |
| -; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] |
| -; NODQ-NEXT: vcvtsi2ssq 16(%rdi), %xmm2, %xmm1 |
| -; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] |
| -; NODQ-NEXT: vcvtsi2ssq 24(%rdi), %xmm2, %xmm1 |
| +; NODQ-NEXT: vmovdqu (%rdi), %xmm0 |
| +; NODQ-NEXT: vmovdqu 16(%rdi), %xmm1 |
| +; NODQ-NEXT: vpextrq $1, %xmm0, %rax |
| +; NODQ-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 |
| +; NODQ-NEXT: vmovq %xmm0, %rax |
| +; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 |
| +; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] |
| +; NODQ-NEXT: vmovq %xmm1, %rax |
| +; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 |
| +; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] |
| +; NODQ-NEXT: vpextrq $1, %xmm1, %rax |
| +; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1 |
| ; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] |
| ; NODQ-NEXT: retq |
| ; |
| diff --git a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll |
| index a47dbe570fd5..bf3ffecb414f 100644 |
| --- a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll |
| +++ b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll |
| @@ -542,8 +542,10 @@ define i32 @bitcast_v64i8_to_v2i32(<64 x i8> %a0) nounwind { |
| ; AVX512: # %bb.0: |
| ; AVX512-NEXT: vpmovb2m %zmm0, %k0 |
| ; AVX512-NEXT: kmovq %k0, -{{[0-9]+}}(%rsp) |
| -; AVX512-NEXT: movl -{{[0-9]+}}(%rsp), %eax |
| -; AVX512-NEXT: addl -{{[0-9]+}}(%rsp), %eax |
| +; AVX512-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 |
| +; AVX512-NEXT: vmovd %xmm0, %ecx |
| +; AVX512-NEXT: vpextrd $1, %xmm0, %eax |
| +; AVX512-NEXT: addl %ecx, %eax |
| ; AVX512-NEXT: vzeroupper |
| ; AVX512-NEXT: retq |
| %1 = icmp slt <64 x i8> %a0, zeroinitializer |
| diff --git a/llvm/test/CodeGen/X86/extractelement-load.ll b/llvm/test/CodeGen/X86/extractelement-load.ll |
| index c0c32eb3cfce..2a7ed3a8b4e7 100644 |
| --- a/llvm/test/CodeGen/X86/extractelement-load.ll |
| +++ b/llvm/test/CodeGen/X86/extractelement-load.ll |
| @@ -301,35 +301,33 @@ define void @subextract_broadcast_load_constant(<2 x i16>* nocapture %0, i16* no |
| ret void |
| } |
| |
| -; A scalar load is favored over a XMM->GPR register transfer in this example. |
| - |
| define i32 @multi_use_load_scalarization(<4 x i32>* %p) { |
| ; X32-SSE2-LABEL: multi_use_load_scalarization: |
| ; X32-SSE2: # %bb.0: |
| ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx |
| -; X32-SSE2-NEXT: movl (%ecx), %eax |
| ; X32-SSE2-NEXT: movdqu (%ecx), %xmm0 |
| ; X32-SSE2-NEXT: pcmpeqd %xmm1, %xmm1 |
| +; X32-SSE2-NEXT: movd %xmm0, %eax |
| ; X32-SSE2-NEXT: psubd %xmm1, %xmm0 |
| ; X32-SSE2-NEXT: movdqa %xmm0, (%ecx) |
| ; X32-SSE2-NEXT: retl |
| ; |
| ; X64-SSSE3-LABEL: multi_use_load_scalarization: |
| ; X64-SSSE3: # %bb.0: |
| -; X64-SSSE3-NEXT: movl (%rdi), %eax |
| ; X64-SSSE3-NEXT: movdqu (%rdi), %xmm0 |
| ; X64-SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 |
| +; X64-SSSE3-NEXT: movd %xmm0, %eax |
| ; X64-SSSE3-NEXT: psubd %xmm1, %xmm0 |
| ; X64-SSSE3-NEXT: movdqa %xmm0, (%rdi) |
| ; X64-SSSE3-NEXT: retq |
| ; |
| ; X64-AVX-LABEL: multi_use_load_scalarization: |
| ; X64-AVX: # %bb.0: |
| -; X64-AVX-NEXT: movl (%rdi), %eax |
| ; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0 |
| ; X64-AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 |
| -; X64-AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0 |
| -; X64-AVX-NEXT: vmovdqa %xmm0, (%rdi) |
| +; X64-AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm1 |
| +; X64-AVX-NEXT: vmovdqa %xmm1, (%rdi) |
| +; X64-AVX-NEXT: vmovd %xmm0, %eax |
| ; X64-AVX-NEXT: retq |
| %v = load <4 x i32>, <4 x i32>* %p, align 1 |
| %v1 = add <4 x i32> %v, <i32 1, i32 1, i32 1, i32 1> |
| diff --git a/llvm/test/CodeGen/X86/oddsubvector.ll b/llvm/test/CodeGen/X86/oddsubvector.ll |
| index fb4c23e7de1a..a28a64f4fc60 100644 |
| --- a/llvm/test/CodeGen/X86/oddsubvector.ll |
| +++ b/llvm/test/CodeGen/X86/oddsubvector.ll |
| @@ -161,46 +161,46 @@ define <16 x i32> @PR42819(<8 x i32>* %a0) { |
| define void @PR42833() { |
| ; SSE2-LABEL: PR42833: |
| ; SSE2: # %bb.0: |
| -; SSE2-NEXT: movl b(%rip), %eax |
| -; SSE2-NEXT: movdqa c+144(%rip), %xmm0 |
| -; SSE2-NEXT: movdqa c+128(%rip), %xmm1 |
| -; SSE2-NEXT: addl c+128(%rip), %eax |
| +; SSE2-NEXT: movdqa c+144(%rip), %xmm1 |
| +; SSE2-NEXT: movdqa c+128(%rip), %xmm0 |
| +; SSE2-NEXT: movd %xmm0, %eax |
| +; SSE2-NEXT: addl b(%rip), %eax |
| ; SSE2-NEXT: movd %eax, %xmm2 |
| ; SSE2-NEXT: movd %eax, %xmm3 |
| -; SSE2-NEXT: paddd %xmm1, %xmm3 |
| +; SSE2-NEXT: paddd %xmm0, %xmm3 |
| ; SSE2-NEXT: movdqa d+144(%rip), %xmm4 |
| -; SSE2-NEXT: psubd %xmm0, %xmm4 |
| -; SSE2-NEXT: paddd %xmm0, %xmm0 |
| -; SSE2-NEXT: movdqa %xmm1, %xmm5 |
| -; SSE2-NEXT: paddd %xmm1, %xmm5 |
| +; SSE2-NEXT: psubd %xmm1, %xmm4 |
| +; SSE2-NEXT: paddd %xmm1, %xmm1 |
| +; SSE2-NEXT: movdqa %xmm0, %xmm5 |
| +; SSE2-NEXT: paddd %xmm0, %xmm5 |
| ; SSE2-NEXT: movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3] |
| -; SSE2-NEXT: movdqa %xmm0, c+144(%rip) |
| +; SSE2-NEXT: movdqa %xmm1, c+144(%rip) |
| ; SSE2-NEXT: movaps %xmm5, c+128(%rip) |
| -; SSE2-NEXT: movdqa c+160(%rip), %xmm0 |
| +; SSE2-NEXT: movdqa c+160(%rip), %xmm1 |
| ; SSE2-NEXT: movdqa c+176(%rip), %xmm3 |
| ; SSE2-NEXT: movdqa d+160(%rip), %xmm5 |
| ; SSE2-NEXT: movdqa d+176(%rip), %xmm6 |
| ; SSE2-NEXT: movdqa d+128(%rip), %xmm7 |
| -; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] |
| -; SSE2-NEXT: psubd %xmm1, %xmm7 |
| +; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] |
| +; SSE2-NEXT: psubd %xmm0, %xmm7 |
| ; SSE2-NEXT: psubd %xmm3, %xmm6 |
| -; SSE2-NEXT: psubd %xmm0, %xmm5 |
| +; SSE2-NEXT: psubd %xmm1, %xmm5 |
| ; SSE2-NEXT: movdqa %xmm5, d+160(%rip) |
| ; SSE2-NEXT: movdqa %xmm6, d+176(%rip) |
| ; SSE2-NEXT: movdqa %xmm4, d+144(%rip) |
| ; SSE2-NEXT: movdqa %xmm7, d+128(%rip) |
| ; SSE2-NEXT: paddd %xmm3, %xmm3 |
| -; SSE2-NEXT: paddd %xmm0, %xmm0 |
| -; SSE2-NEXT: movdqa %xmm0, c+160(%rip) |
| +; SSE2-NEXT: paddd %xmm1, %xmm1 |
| +; SSE2-NEXT: movdqa %xmm1, c+160(%rip) |
| ; SSE2-NEXT: movdqa %xmm3, c+176(%rip) |
| ; SSE2-NEXT: retq |
| ; |
| ; SSE42-LABEL: PR42833: |
| ; SSE42: # %bb.0: |
| -; SSE42-NEXT: movl b(%rip), %eax |
| ; SSE42-NEXT: movdqa c+144(%rip), %xmm0 |
| ; SSE42-NEXT: movdqa c+128(%rip), %xmm1 |
| -; SSE42-NEXT: addl c+128(%rip), %eax |
| +; SSE42-NEXT: movd %xmm1, %eax |
| +; SSE42-NEXT: addl b(%rip), %eax |
| ; SSE42-NEXT: movd %eax, %xmm2 |
| ; SSE42-NEXT: paddd %xmm1, %xmm2 |
| ; SSE42-NEXT: movdqa d+144(%rip), %xmm3 |
| @@ -232,20 +232,20 @@ define void @PR42833() { |
| ; |
| ; AVX1-LABEL: PR42833: |
| ; AVX1: # %bb.0: |
| -; AVX1-NEXT: movl b(%rip), %eax |
| -; AVX1-NEXT: addl c+128(%rip), %eax |
| -; AVX1-NEXT: vmovd %eax, %xmm0 |
| -; AVX1-NEXT: vmovdqa c+128(%rip), %xmm1 |
| -; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 |
| -; AVX1-NEXT: vpaddd %xmm1, %xmm1, %xmm2 |
| +; AVX1-NEXT: vmovdqa c+128(%rip), %xmm0 |
| +; AVX1-NEXT: vmovd %xmm0, %eax |
| +; AVX1-NEXT: addl b(%rip), %eax |
| +; AVX1-NEXT: vmovd %eax, %xmm1 |
| +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 |
| +; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm2 |
| ; AVX1-NEXT: vmovdqa c+144(%rip), %xmm3 |
| ; AVX1-NEXT: vpaddd %xmm3, %xmm3, %xmm3 |
| ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 |
| -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7] |
| +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7] |
| ; AVX1-NEXT: vmovdqa d+144(%rip), %xmm2 |
| ; AVX1-NEXT: vpsubd c+144(%rip), %xmm2, %xmm2 |
| -; AVX1-NEXT: vmovups %ymm0, c+128(%rip) |
| -; AVX1-NEXT: vpinsrd $0, %eax, %xmm1, %xmm0 |
| +; AVX1-NEXT: vmovups %ymm1, c+128(%rip) |
| +; AVX1-NEXT: vpinsrd $0, %eax, %xmm0, %xmm0 |
| ; AVX1-NEXT: vmovdqa d+128(%rip), %xmm1 |
| ; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm0 |
| ; AVX1-NEXT: vmovdqa d+176(%rip), %xmm1 |
| @@ -314,20 +314,20 @@ define void @PR42833() { |
| ; |
| ; XOP-LABEL: PR42833: |
| ; XOP: # %bb.0: |
| -; XOP-NEXT: movl b(%rip), %eax |
| -; XOP-NEXT: addl c+128(%rip), %eax |
| -; XOP-NEXT: vmovd %eax, %xmm0 |
| -; XOP-NEXT: vmovdqa c+128(%rip), %xmm1 |
| -; XOP-NEXT: vpaddd %xmm0, %xmm1, %xmm0 |
| -; XOP-NEXT: vpaddd %xmm1, %xmm1, %xmm2 |
| +; XOP-NEXT: vmovdqa c+128(%rip), %xmm0 |
| +; XOP-NEXT: vmovd %xmm0, %eax |
| +; XOP-NEXT: addl b(%rip), %eax |
| +; XOP-NEXT: vmovd %eax, %xmm1 |
| +; XOP-NEXT: vpaddd %xmm1, %xmm0, %xmm1 |
| +; XOP-NEXT: vpaddd %xmm0, %xmm0, %xmm2 |
| ; XOP-NEXT: vmovdqa c+144(%rip), %xmm3 |
| ; XOP-NEXT: vpaddd %xmm3, %xmm3, %xmm3 |
| ; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 |
| -; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7] |
| +; XOP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7] |
| ; XOP-NEXT: vmovdqa d+144(%rip), %xmm2 |
| ; XOP-NEXT: vpsubd c+144(%rip), %xmm2, %xmm2 |
| -; XOP-NEXT: vmovups %ymm0, c+128(%rip) |
| -; XOP-NEXT: vpinsrd $0, %eax, %xmm1, %xmm0 |
| +; XOP-NEXT: vmovups %ymm1, c+128(%rip) |
| +; XOP-NEXT: vpinsrd $0, %eax, %xmm0, %xmm0 |
| ; XOP-NEXT: vmovdqa d+128(%rip), %xmm1 |
| ; XOP-NEXT: vpsubd %xmm0, %xmm1, %xmm0 |
| ; XOP-NEXT: vmovdqa d+176(%rip), %xmm1 |
| diff --git a/llvm/test/CodeGen/X86/pr45378.ll b/llvm/test/CodeGen/X86/pr45378.ll |
| index 0f2b21224279..d5da5c73a891 100644 |
| --- a/llvm/test/CodeGen/X86/pr45378.ll |
| +++ b/llvm/test/CodeGen/X86/pr45378.ll |
| @@ -76,23 +76,28 @@ define i1 @parseHeaders2_scalar_and(i64 * %ptr) nounwind { |
| ; SSE2-LABEL: parseHeaders2_scalar_and: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: movdqu (%rdi), %xmm0 |
| -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] |
| ; SSE2-NEXT: movq %xmm0, %rax |
| -; SSE2-NEXT: testq %rax, (%rdi) |
| +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] |
| +; SSE2-NEXT: movq %xmm0, %rcx |
| +; SSE2-NEXT: testq %rcx, %rax |
| ; SSE2-NEXT: sete %al |
| ; SSE2-NEXT: retq |
| ; |
| ; SSE41-LABEL: parseHeaders2_scalar_and: |
| ; SSE41: # %bb.0: |
| -; SSE41-NEXT: movq (%rdi), %rax |
| -; SSE41-NEXT: testq %rax, 8(%rdi) |
| +; SSE41-NEXT: movdqu (%rdi), %xmm0 |
| +; SSE41-NEXT: movq %xmm0, %rax |
| +; SSE41-NEXT: pextrq $1, %xmm0, %rcx |
| +; SSE41-NEXT: testq %rcx, %rax |
| ; SSE41-NEXT: sete %al |
| ; SSE41-NEXT: retq |
| ; |
| ; AVX-LABEL: parseHeaders2_scalar_and: |
| ; AVX: # %bb.0: |
| -; AVX-NEXT: movq (%rdi), %rax |
| -; AVX-NEXT: testq %rax, 8(%rdi) |
| +; AVX-NEXT: vmovdqu (%rdi), %xmm0 |
| +; AVX-NEXT: vmovq %xmm0, %rax |
| +; AVX-NEXT: vpextrq $1, %xmm0, %rcx |
| +; AVX-NEXT: testq %rcx, %rax |
| ; AVX-NEXT: sete %al |
| ; AVX-NEXT: retq |
| %vptr = bitcast i64 * %ptr to <2 x i64> * |
| diff --git a/llvm/test/CodeGen/X86/scalar_widen_div.ll b/llvm/test/CodeGen/X86/scalar_widen_div.ll |
| index 816892d62a0b..d21b44158068 100644 |
| --- a/llvm/test/CodeGen/X86/scalar_widen_div.ll |
| +++ b/llvm/test/CodeGen/X86/scalar_widen_div.ll |
| @@ -403,29 +403,32 @@ define void @test_int_div(<3 x i32>* %dest, <3 x i32>* %old, i32 %n) { |
| ; CHECK-NEXT: testl %edx, %edx |
| ; CHECK-NEXT: jle .LBB12_3 |
| ; CHECK-NEXT: # %bb.1: # %bb.nph |
| -; CHECK-NEXT: movl %edx, %r11d |
| +; CHECK-NEXT: movl %edx, %r9d |
| ; CHECK-NEXT: xorl %ecx, %ecx |
| ; CHECK-NEXT: .p2align 4, 0x90 |
| ; CHECK-NEXT: .LBB12_2: # %for.body |
| ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 |
| -; CHECK-NEXT: movl 8(%rdi,%rcx), %r8d |
| -; CHECK-NEXT: movl (%rdi,%rcx), %r9d |
| -; CHECK-NEXT: movl 4(%rdi,%rcx), %eax |
| +; CHECK-NEXT: movdqa (%rdi,%rcx), %xmm0 |
| +; CHECK-NEXT: movdqa (%rsi,%rcx), %xmm1 |
| +; CHECK-NEXT: pextrd $1, %xmm0, %eax |
| +; CHECK-NEXT: pextrd $1, %xmm1, %r8d |
| ; CHECK-NEXT: cltd |
| -; CHECK-NEXT: idivl 4(%rsi,%rcx) |
| -; CHECK-NEXT: movl %eax, %r10d |
| -; CHECK-NEXT: movl %r9d, %eax |
| +; CHECK-NEXT: idivl %r8d |
| +; CHECK-NEXT: movl %eax, %r8d |
| +; CHECK-NEXT: movd %xmm0, %eax |
| +; CHECK-NEXT: movd %xmm1, %r10d |
| ; CHECK-NEXT: cltd |
| -; CHECK-NEXT: idivl (%rsi,%rcx) |
| -; CHECK-NEXT: movd %eax, %xmm0 |
| -; CHECK-NEXT: pinsrd $1, %r10d, %xmm0 |
| -; CHECK-NEXT: movl %r8d, %eax |
| +; CHECK-NEXT: idivl %r10d |
| +; CHECK-NEXT: movd %eax, %xmm2 |
| +; CHECK-NEXT: pinsrd $1, %r8d, %xmm2 |
| +; CHECK-NEXT: pextrd $2, %xmm0, %eax |
| +; CHECK-NEXT: pextrd $2, %xmm1, %r8d |
| ; CHECK-NEXT: cltd |
| -; CHECK-NEXT: idivl 8(%rsi,%rcx) |
| +; CHECK-NEXT: idivl %r8d |
| ; CHECK-NEXT: movl %eax, 8(%rdi,%rcx) |
| -; CHECK-NEXT: movq %xmm0, (%rdi,%rcx) |
| +; CHECK-NEXT: movq %xmm2, (%rdi,%rcx) |
| ; CHECK-NEXT: addq $16, %rcx |
| -; CHECK-NEXT: decl %r11d |
| +; CHECK-NEXT: decl %r9d |
| ; CHECK-NEXT: jne .LBB12_2 |
| ; CHECK-NEXT: .LBB12_3: # %for.end |
| ; CHECK-NEXT: retq |
| diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll |
| index 5558b7eefffb..82a633b6d8ca 100644 |
| --- a/llvm/test/CodeGen/X86/shrink_vmul.ll |
| +++ b/llvm/test/CodeGen/X86/shrink_vmul.ll |
| @@ -2072,10 +2072,10 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind { |
| ; X86-SSE-NEXT: divl %ecx |
| ; X86-SSE-NEXT: movd %edx, %xmm4 |
| ; X86-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] |
| +; X86-SSE-NEXT: movd %xmm1, %ecx |
| ; X86-SSE-NEXT: movl %esi, %eax |
| ; X86-SSE-NEXT: xorl %edx, %edx |
| -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi |
| -; X86-SSE-NEXT: divl 16(%esi) |
| +; X86-SSE-NEXT: divl %ecx |
| ; X86-SSE-NEXT: movd %edx, %xmm3 |
| ; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] |
| ; X86-SSE-NEXT: movd %xmm2, %eax |
| @@ -2086,9 +2086,10 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind { |
| ; X86-SSE-NEXT: movd %edx, %xmm1 |
| ; X86-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] |
| ; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] |
| +; X86-SSE-NEXT: movd %xmm0, %ecx |
| ; X86-SSE-NEXT: movl %edi, %eax |
| ; X86-SSE-NEXT: xorl %edx, %edx |
| -; X86-SSE-NEXT: divl (%esi) |
| +; X86-SSE-NEXT: divl %ecx |
| ; X86-SSE-NEXT: movd %edx, %xmm1 |
| ; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] |
| ; X86-SSE-NEXT: movd %xmm2, %ecx |
| @@ -2114,7 +2115,8 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind { |
| ; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] |
| ; X86-SSE-NEXT: movl (%esp), %eax # 4-byte Reload |
| ; X86-SSE-NEXT: xorl %edx, %edx |
| -; X86-SSE-NEXT: divl 32(%esi) |
| +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx |
| +; X86-SSE-NEXT: divl 32(%ecx) |
| ; X86-SSE-NEXT: movdqa {{.*#+}} xmm0 = [8199,8199,8199,8199] |
| ; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] |
| ; X86-SSE-NEXT: pmuludq %xmm0, %xmm1 |
| @@ -2149,43 +2151,53 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind { |
| ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx |
| ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax |
| ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero |
| -; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero |
| ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero |
| -; X86-AVX1-NEXT: vmovd %xmm2, %eax |
| +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero |
| +; X86-AVX1-NEXT: vmovd %xmm1, %eax |
| ; X86-AVX1-NEXT: xorl %edx, %edx |
| ; X86-AVX1-NEXT: divl 32(%ecx) |
| ; X86-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill |
| -; X86-AVX1-NEXT: vpextrd $3, %xmm1, %eax |
| +; X86-AVX1-NEXT: vpextrd $3, %xmm2, %eax |
| +; X86-AVX1-NEXT: vmovdqa (%ecx), %xmm1 |
| +; X86-AVX1-NEXT: vmovdqa 16(%ecx), %xmm3 |
| +; X86-AVX1-NEXT: vpextrd $3, %xmm3, %ecx |
| ; X86-AVX1-NEXT: xorl %edx, %edx |
| -; X86-AVX1-NEXT: divl 28(%ecx) |
| +; X86-AVX1-NEXT: divl %ecx |
| ; X86-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill |
| -; X86-AVX1-NEXT: vpextrd $2, %xmm1, %eax |
| +; X86-AVX1-NEXT: vpextrd $2, %xmm2, %eax |
| +; X86-AVX1-NEXT: vpextrd $2, %xmm3, %ecx |
| ; X86-AVX1-NEXT: xorl %edx, %edx |
| -; X86-AVX1-NEXT: divl 24(%ecx) |
| +; X86-AVX1-NEXT: divl %ecx |
| ; X86-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill |
| -; X86-AVX1-NEXT: vpextrd $1, %xmm1, %eax |
| +; X86-AVX1-NEXT: vpextrd $1, %xmm2, %eax |
| +; X86-AVX1-NEXT: vpextrd $1, %xmm3, %ecx |
| ; X86-AVX1-NEXT: xorl %edx, %edx |
| -; X86-AVX1-NEXT: divl 20(%ecx) |
| +; X86-AVX1-NEXT: divl %ecx |
| ; X86-AVX1-NEXT: movl %edx, (%esp) # 4-byte Spill |
| -; X86-AVX1-NEXT: vmovd %xmm1, %eax |
| +; X86-AVX1-NEXT: vmovd %xmm2, %eax |
| +; X86-AVX1-NEXT: vmovd %xmm3, %ecx |
| ; X86-AVX1-NEXT: xorl %edx, %edx |
| -; X86-AVX1-NEXT: divl 16(%ecx) |
| +; X86-AVX1-NEXT: divl %ecx |
| ; X86-AVX1-NEXT: movl %edx, %ebp |
| ; X86-AVX1-NEXT: vpextrd $3, %xmm0, %eax |
| +; X86-AVX1-NEXT: vpextrd $3, %xmm1, %ecx |
| ; X86-AVX1-NEXT: xorl %edx, %edx |
| -; X86-AVX1-NEXT: divl 12(%ecx) |
| +; X86-AVX1-NEXT: divl %ecx |
| ; X86-AVX1-NEXT: movl %edx, %ebx |
| ; X86-AVX1-NEXT: vpextrd $2, %xmm0, %eax |
| +; X86-AVX1-NEXT: vpextrd $2, %xmm1, %esi |
| ; X86-AVX1-NEXT: xorl %edx, %edx |
| -; X86-AVX1-NEXT: divl 8(%ecx) |
| +; X86-AVX1-NEXT: divl %esi |
| ; X86-AVX1-NEXT: movl %edx, %esi |
| ; X86-AVX1-NEXT: vpextrd $1, %xmm0, %eax |
| +; X86-AVX1-NEXT: vpextrd $1, %xmm1, %edi |
| ; X86-AVX1-NEXT: xorl %edx, %edx |
| -; X86-AVX1-NEXT: divl 4(%ecx) |
| +; X86-AVX1-NEXT: divl %edi |
| ; X86-AVX1-NEXT: movl %edx, %edi |
| ; X86-AVX1-NEXT: vmovd %xmm0, %eax |
| +; X86-AVX1-NEXT: vmovd %xmm1, %ecx |
| ; X86-AVX1-NEXT: xorl %edx, %edx |
| -; X86-AVX1-NEXT: divl (%ecx) |
| +; X86-AVX1-NEXT: divl %ecx |
| ; X86-AVX1-NEXT: vmovd %edx, %xmm0 |
| ; X86-AVX1-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 |
| ; X86-AVX1-NEXT: vpinsrd $2, %esi, %xmm0, %xmm0 |
| @@ -2211,47 +2223,58 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind { |
| ; |
| ; X86-AVX2-LABEL: PR34947: |
| ; X86-AVX2: # %bb.0: |
| +; X86-AVX2-NEXT: pushl %edi |
| ; X86-AVX2-NEXT: pushl %esi |
| ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %esi |
| ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax |
| ; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero |
| ; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero |
| -; X86-AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 |
| -; X86-AVX2-NEXT: vpextrd $1, %xmm2, %eax |
| +; X86-AVX2-NEXT: vmovdqa (%esi), %xmm2 |
| +; X86-AVX2-NEXT: vmovdqa 16(%esi), %xmm3 |
| +; X86-AVX2-NEXT: vpextrd $1, %xmm3, %ecx |
| +; X86-AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 |
| +; X86-AVX2-NEXT: vpextrd $1, %xmm4, %eax |
| ; X86-AVX2-NEXT: xorl %edx, %edx |
| -; X86-AVX2-NEXT: divl 20(%esi) |
| +; X86-AVX2-NEXT: divl %ecx |
| ; X86-AVX2-NEXT: movl %edx, %ecx |
| -; X86-AVX2-NEXT: vmovd %xmm2, %eax |
| +; X86-AVX2-NEXT: vmovd %xmm3, %edi |
| +; X86-AVX2-NEXT: vmovd %xmm4, %eax |
| ; X86-AVX2-NEXT: xorl %edx, %edx |
| -; X86-AVX2-NEXT: divl 16(%esi) |
| -; X86-AVX2-NEXT: vmovd %edx, %xmm3 |
| -; X86-AVX2-NEXT: vpinsrd $1, %ecx, %xmm3, %xmm3 |
| -; X86-AVX2-NEXT: vpextrd $2, %xmm2, %eax |
| +; X86-AVX2-NEXT: divl %edi |
| +; X86-AVX2-NEXT: vmovd %edx, %xmm5 |
| +; X86-AVX2-NEXT: vpinsrd $1, %ecx, %xmm5, %xmm5 |
| +; X86-AVX2-NEXT: vpextrd $2, %xmm3, %ecx |
| +; X86-AVX2-NEXT: vpextrd $2, %xmm4, %eax |
| ; X86-AVX2-NEXT: xorl %edx, %edx |
| -; X86-AVX2-NEXT: divl 24(%esi) |
| -; X86-AVX2-NEXT: vpinsrd $2, %edx, %xmm3, %xmm3 |
| -; X86-AVX2-NEXT: vpextrd $3, %xmm2, %eax |
| +; X86-AVX2-NEXT: divl %ecx |
| +; X86-AVX2-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5 |
| +; X86-AVX2-NEXT: vpextrd $3, %xmm3, %ecx |
| +; X86-AVX2-NEXT: vpextrd $3, %xmm4, %eax |
| ; X86-AVX2-NEXT: xorl %edx, %edx |
| -; X86-AVX2-NEXT: divl 28(%esi) |
| -; X86-AVX2-NEXT: vpinsrd $3, %edx, %xmm3, %xmm2 |
| +; X86-AVX2-NEXT: divl %ecx |
| +; X86-AVX2-NEXT: vpinsrd $3, %edx, %xmm5, %xmm3 |
| +; X86-AVX2-NEXT: vpextrd $1, %xmm2, %ecx |
| ; X86-AVX2-NEXT: vpextrd $1, %xmm1, %eax |
| ; X86-AVX2-NEXT: xorl %edx, %edx |
| -; X86-AVX2-NEXT: divl 4(%esi) |
| +; X86-AVX2-NEXT: divl %ecx |
| ; X86-AVX2-NEXT: movl %edx, %ecx |
| +; X86-AVX2-NEXT: vmovd %xmm2, %edi |
| ; X86-AVX2-NEXT: vmovd %xmm1, %eax |
| ; X86-AVX2-NEXT: xorl %edx, %edx |
| -; X86-AVX2-NEXT: divl (%esi) |
| -; X86-AVX2-NEXT: vmovd %edx, %xmm3 |
| -; X86-AVX2-NEXT: vpinsrd $1, %ecx, %xmm3, %xmm3 |
| +; X86-AVX2-NEXT: divl %edi |
| +; X86-AVX2-NEXT: vmovd %edx, %xmm4 |
| +; X86-AVX2-NEXT: vpinsrd $1, %ecx, %xmm4, %xmm4 |
| +; X86-AVX2-NEXT: vpextrd $2, %xmm2, %ecx |
| ; X86-AVX2-NEXT: vpextrd $2, %xmm1, %eax |
| ; X86-AVX2-NEXT: xorl %edx, %edx |
| -; X86-AVX2-NEXT: divl 8(%esi) |
| -; X86-AVX2-NEXT: vpinsrd $2, %edx, %xmm3, %xmm3 |
| +; X86-AVX2-NEXT: divl %ecx |
| +; X86-AVX2-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4 |
| +; X86-AVX2-NEXT: vpextrd $3, %xmm2, %ecx |
| ; X86-AVX2-NEXT: vpextrd $3, %xmm1, %eax |
| ; X86-AVX2-NEXT: xorl %edx, %edx |
| -; X86-AVX2-NEXT: divl 12(%esi) |
| -; X86-AVX2-NEXT: vpinsrd $3, %edx, %xmm3, %xmm1 |
| -; X86-AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 |
| +; X86-AVX2-NEXT: divl %ecx |
| +; X86-AVX2-NEXT: vpinsrd $3, %edx, %xmm4, %xmm1 |
| +; X86-AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 |
| ; X86-AVX2-NEXT: vmovd %xmm0, %eax |
| ; X86-AVX2-NEXT: xorl %edx, %edx |
| ; X86-AVX2-NEXT: divl 32(%esi) |
| @@ -2261,6 +2284,7 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind { |
| ; X86-AVX2-NEXT: movl %eax, (%eax) |
| ; X86-AVX2-NEXT: vmovdqa %ymm0, (%eax) |
| ; X86-AVX2-NEXT: popl %esi |
| +; X86-AVX2-NEXT: popl %edi |
| ; X86-AVX2-NEXT: vzeroupper |
| ; X86-AVX2-NEXT: retl |
| ; |
| @@ -2293,9 +2317,10 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind { |
| ; X64-SSE-NEXT: divl %edi |
| ; X64-SSE-NEXT: movd %edx, %xmm4 |
| ; X64-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] |
| +; X64-SSE-NEXT: movd %xmm1, %edi |
| ; X64-SSE-NEXT: movl %r9d, %eax |
| ; X64-SSE-NEXT: xorl %edx, %edx |
| -; X64-SSE-NEXT: divl 16(%rsi) |
| +; X64-SSE-NEXT: divl %edi |
| ; X64-SSE-NEXT: movd %edx, %xmm3 |
| ; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] |
| ; X64-SSE-NEXT: movd %xmm2, %eax |
| @@ -2306,9 +2331,10 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind { |
| ; X64-SSE-NEXT: movd %edx, %xmm1 |
| ; X64-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] |
| ; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] |
| +; X64-SSE-NEXT: movd %xmm0, %edi |
| ; X64-SSE-NEXT: movl %r10d, %eax |
| ; X64-SSE-NEXT: xorl %edx, %edx |
| -; X64-SSE-NEXT: divl (%rsi) |
| +; X64-SSE-NEXT: divl %edi |
| ; X64-SSE-NEXT: movd %edx, %xmm1 |
| ; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] |
| ; X64-SSE-NEXT: movd %xmm2, %edi |
| @@ -2359,50 +2385,60 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind { |
| ; X64-AVX1-NEXT: pushq %rbp |
| ; X64-AVX1-NEXT: pushq %rbx |
| ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero |
| -; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero |
| ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero |
| -; X64-AVX1-NEXT: vmovd %xmm2, %eax |
| +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero |
| +; X64-AVX1-NEXT: vmovd %xmm1, %eax |
| ; X64-AVX1-NEXT: xorl %edx, %edx |
| ; X64-AVX1-NEXT: divl 32(%rsi) |
| ; X64-AVX1-NEXT: movl %edx, %r8d |
| -; X64-AVX1-NEXT: vpextrd $3, %xmm1, %eax |
| +; X64-AVX1-NEXT: vpextrd $3, %xmm2, %eax |
| +; X64-AVX1-NEXT: vmovdqa (%rsi), %xmm1 |
| +; X64-AVX1-NEXT: vmovdqa 16(%rsi), %xmm3 |
| +; X64-AVX1-NEXT: vpextrd $3, %xmm3, %ecx |
| ; X64-AVX1-NEXT: xorl %edx, %edx |
| -; X64-AVX1-NEXT: divl 28(%rsi) |
| +; X64-AVX1-NEXT: divl %ecx |
| ; X64-AVX1-NEXT: movl %edx, %r9d |
| -; X64-AVX1-NEXT: vpextrd $2, %xmm1, %eax |
| +; X64-AVX1-NEXT: vpextrd $2, %xmm2, %eax |
| +; X64-AVX1-NEXT: vpextrd $2, %xmm3, %ecx |
| ; X64-AVX1-NEXT: xorl %edx, %edx |
| -; X64-AVX1-NEXT: divl 24(%rsi) |
| +; X64-AVX1-NEXT: divl %ecx |
| ; X64-AVX1-NEXT: movl %edx, %r10d |
| -; X64-AVX1-NEXT: vpextrd $1, %xmm1, %eax |
| +; X64-AVX1-NEXT: vpextrd $1, %xmm2, %eax |
| +; X64-AVX1-NEXT: vpextrd $1, %xmm3, %ecx |
| ; X64-AVX1-NEXT: xorl %edx, %edx |
| -; X64-AVX1-NEXT: divl 20(%rsi) |
| +; X64-AVX1-NEXT: divl %ecx |
| ; X64-AVX1-NEXT: movl %edx, %r11d |
| -; X64-AVX1-NEXT: vmovd %xmm1, %eax |
| +; X64-AVX1-NEXT: vmovd %xmm2, %eax |
| +; X64-AVX1-NEXT: vmovd %xmm3, %ecx |
| ; X64-AVX1-NEXT: xorl %edx, %edx |
| -; X64-AVX1-NEXT: divl 16(%rsi) |
| -; X64-AVX1-NEXT: movl %edx, %ecx |
| +; X64-AVX1-NEXT: divl %ecx |
| +; X64-AVX1-NEXT: movl %edx, %esi |
| ; X64-AVX1-NEXT: vpextrd $3, %xmm0, %eax |
| +; X64-AVX1-NEXT: vpextrd $3, %xmm1, %ecx |
| ; X64-AVX1-NEXT: xorl %edx, %edx |
| -; X64-AVX1-NEXT: divl 12(%rsi) |
| +; X64-AVX1-NEXT: divl %ecx |
| ; X64-AVX1-NEXT: movl %edx, %edi |
| ; X64-AVX1-NEXT: vpextrd $2, %xmm0, %eax |
| +; X64-AVX1-NEXT: vpextrd $2, %xmm1, %ecx |
| ; X64-AVX1-NEXT: xorl %edx, %edx |
| -; X64-AVX1-NEXT: divl 8(%rsi) |
| -; X64-AVX1-NEXT: movl %edx, %ebx |
| +; X64-AVX1-NEXT: divl %ecx |
| +; X64-AVX1-NEXT: movl %edx, %ecx |
| ; X64-AVX1-NEXT: vpextrd $1, %xmm0, %eax |
| +; X64-AVX1-NEXT: vpextrd $1, %xmm1, %ebx |
| ; X64-AVX1-NEXT: xorl %edx, %edx |
| -; X64-AVX1-NEXT: divl 4(%rsi) |
| -; X64-AVX1-NEXT: movl %edx, %ebp |
| +; X64-AVX1-NEXT: divl %ebx |
| +; X64-AVX1-NEXT: movl %edx, %ebx |
| ; X64-AVX1-NEXT: vmovd %xmm0, %eax |
| +; X64-AVX1-NEXT: vmovd %xmm1, %ebp |
| ; X64-AVX1-NEXT: xorl %edx, %edx |
| -; X64-AVX1-NEXT: divl (%rsi) |
| +; X64-AVX1-NEXT: divl %ebp |
| ; X64-AVX1-NEXT: vmovd %edx, %xmm0 |
| -; X64-AVX1-NEXT: vpinsrd $1, %ebp, %xmm0, %xmm0 |
| -; X64-AVX1-NEXT: vpinsrd $2, %ebx, %xmm0, %xmm0 |
| +; X64-AVX1-NEXT: vpinsrd $1, %ebx, %xmm0, %xmm0 |
| +; X64-AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 |
| ; X64-AVX1-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0 |
| ; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8199,8199,8199,8199] |
| ; X64-AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 |
| -; X64-AVX1-NEXT: vmovd %ecx, %xmm2 |
| +; X64-AVX1-NEXT: vmovd %esi, %xmm2 |
| ; X64-AVX1-NEXT: vpinsrd $1, %r11d, %xmm2, %xmm2 |
| ; X64-AVX1-NEXT: vpinsrd $2, %r10d, %xmm2, %xmm2 |
| ; X64-AVX1-NEXT: vpinsrd $3, %r9d, %xmm2, %xmm2 |
| @@ -2419,42 +2455,52 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind { |
| ; X64-AVX2: # %bb.0: |
| ; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero |
| ; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero |
| -; X64-AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 |
| -; X64-AVX2-NEXT: vpextrd $1, %xmm2, %eax |
| +; X64-AVX2-NEXT: vmovdqa (%rsi), %xmm2 |
| +; X64-AVX2-NEXT: vmovdqa 16(%rsi), %xmm3 |
| +; X64-AVX2-NEXT: vpextrd $1, %xmm3, %ecx |
| +; X64-AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 |
| +; X64-AVX2-NEXT: vpextrd $1, %xmm4, %eax |
| ; X64-AVX2-NEXT: xorl %edx, %edx |
| -; X64-AVX2-NEXT: divl 20(%rsi) |
| +; X64-AVX2-NEXT: divl %ecx |
| ; X64-AVX2-NEXT: movl %edx, %ecx |
| -; X64-AVX2-NEXT: vmovd %xmm2, %eax |
| +; X64-AVX2-NEXT: vmovd %xmm3, %edi |
| +; X64-AVX2-NEXT: vmovd %xmm4, %eax |
| ; X64-AVX2-NEXT: xorl %edx, %edx |
| -; X64-AVX2-NEXT: divl 16(%rsi) |
| -; X64-AVX2-NEXT: vmovd %edx, %xmm3 |
| -; X64-AVX2-NEXT: vpinsrd $1, %ecx, %xmm3, %xmm3 |
| -; X64-AVX2-NEXT: vpextrd $2, %xmm2, %eax |
| +; X64-AVX2-NEXT: divl %edi |
| +; X64-AVX2-NEXT: vmovd %edx, %xmm5 |
| +; X64-AVX2-NEXT: vpinsrd $1, %ecx, %xmm5, %xmm5 |
| +; X64-AVX2-NEXT: vpextrd $2, %xmm3, %ecx |
| +; X64-AVX2-NEXT: vpextrd $2, %xmm4, %eax |
| ; X64-AVX2-NEXT: xorl %edx, %edx |
| -; X64-AVX2-NEXT: divl 24(%rsi) |
| -; X64-AVX2-NEXT: vpinsrd $2, %edx, %xmm3, %xmm3 |
| -; X64-AVX2-NEXT: vpextrd $3, %xmm2, %eax |
| +; X64-AVX2-NEXT: divl %ecx |
| +; X64-AVX2-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5 |
| +; X64-AVX2-NEXT: vpextrd $3, %xmm3, %ecx |
| +; X64-AVX2-NEXT: vpextrd $3, %xmm4, %eax |
| ; X64-AVX2-NEXT: xorl %edx, %edx |
| -; X64-AVX2-NEXT: divl 28(%rsi) |
| -; X64-AVX2-NEXT: vpinsrd $3, %edx, %xmm3, %xmm2 |
| +; X64-AVX2-NEXT: divl %ecx |
| +; X64-AVX2-NEXT: vpinsrd $3, %edx, %xmm5, %xmm3 |
| +; X64-AVX2-NEXT: vpextrd $1, %xmm2, %ecx |
| ; X64-AVX2-NEXT: vpextrd $1, %xmm1, %eax |
| ; X64-AVX2-NEXT: xorl %edx, %edx |
| -; X64-AVX2-NEXT: divl 4(%rsi) |
| +; X64-AVX2-NEXT: divl %ecx |
| ; X64-AVX2-NEXT: movl %edx, %ecx |
| +; X64-AVX2-NEXT: vmovd %xmm2, %edi |
| ; X64-AVX2-NEXT: vmovd %xmm1, %eax |
| ; X64-AVX2-NEXT: xorl %edx, %edx |
| -; X64-AVX2-NEXT: divl (%rsi) |
| -; X64-AVX2-NEXT: vmovd %edx, %xmm3 |
| -; X64-AVX2-NEXT: vpinsrd $1, %ecx, %xmm3, %xmm3 |
| +; X64-AVX2-NEXT: divl %edi |
| +; X64-AVX2-NEXT: vmovd %edx, %xmm4 |
| +; X64-AVX2-NEXT: vpinsrd $1, %ecx, %xmm4, %xmm4 |
| +; X64-AVX2-NEXT: vpextrd $2, %xmm2, %ecx |
| ; X64-AVX2-NEXT: vpextrd $2, %xmm1, %eax |
| ; X64-AVX2-NEXT: xorl %edx, %edx |
| -; X64-AVX2-NEXT: divl 8(%rsi) |
| -; X64-AVX2-NEXT: vpinsrd $2, %edx, %xmm3, %xmm3 |
| +; X64-AVX2-NEXT: divl %ecx |
| +; X64-AVX2-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4 |
| +; X64-AVX2-NEXT: vpextrd $3, %xmm2, %ecx |
| ; X64-AVX2-NEXT: vpextrd $3, %xmm1, %eax |
| ; X64-AVX2-NEXT: xorl %edx, %edx |
| -; X64-AVX2-NEXT: divl 12(%rsi) |
| -; X64-AVX2-NEXT: vpinsrd $3, %edx, %xmm3, %xmm1 |
| -; X64-AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 |
| +; X64-AVX2-NEXT: divl %ecx |
| +; X64-AVX2-NEXT: vpinsrd $3, %edx, %xmm4, %xmm1 |
| +; X64-AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 |
| ; X64-AVX2-NEXT: vmovd %xmm0, %eax |
| ; X64-AVX2-NEXT: xorl %edx, %edx |
| ; X64-AVX2-NEXT: divl 32(%rsi) |
| diff --git a/llvm/test/CodeGen/X86/vec_cast.ll b/llvm/test/CodeGen/X86/vec_cast.ll |
| index e0089354cc95..0a6bc2f59b68 100644 |
| --- a/llvm/test/CodeGen/X86/vec_cast.ll |
| +++ b/llvm/test/CodeGen/X86/vec_cast.ll |
| @@ -156,7 +156,7 @@ define <3 x i16> @h(<3 x i32> %a) nounwind { |
| ; CHECK-WIN-LABEL: h: |
| ; CHECK-WIN: # %bb.0: |
| ; CHECK-WIN-NEXT: movdqa (%rcx), %xmm0 |
| -; CHECK-WIN-NEXT: movl (%rcx), %eax |
| +; CHECK-WIN-NEXT: movd %xmm0, %eax |
| ; CHECK-WIN-NEXT: pextrw $2, %xmm0, %edx |
| ; CHECK-WIN-NEXT: pextrw $4, %xmm0, %ecx |
| ; CHECK-WIN-NEXT: # kill: def $ax killed $ax killed $eax |
| diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll |
| index e95afd00304f..12d7b05bf3da 100644 |
| --- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll |
| +++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll |
| @@ -2895,7 +2895,8 @@ define <2 x double> @sitofp_load_2i64_to_2f64(<2 x i64> *%a) { |
| ; SSE2-LABEL: sitofp_load_2i64_to_2f64: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: movdqa (%rdi), %xmm1 |
| -; SSE2-NEXT: cvtsi2sdq (%rdi), %xmm0 |
| +; SSE2-NEXT: movq %xmm1, %rax |
| +; SSE2-NEXT: cvtsi2sd %rax, %xmm0 |
| ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] |
| ; SSE2-NEXT: movq %xmm1, %rax |
| ; SSE2-NEXT: xorps %xmm1, %xmm1 |
| @@ -2905,30 +2906,43 @@ define <2 x double> @sitofp_load_2i64_to_2f64(<2 x i64> *%a) { |
| ; |
| ; SSE41-LABEL: sitofp_load_2i64_to_2f64: |
| ; SSE41: # %bb.0: |
| -; SSE41-NEXT: cvtsi2sdq 8(%rdi), %xmm1 |
| -; SSE41-NEXT: cvtsi2sdq (%rdi), %xmm0 |
| +; SSE41-NEXT: movdqa (%rdi), %xmm0 |
| +; SSE41-NEXT: pextrq $1, %xmm0, %rax |
| +; SSE41-NEXT: cvtsi2sd %rax, %xmm1 |
| +; SSE41-NEXT: movq %xmm0, %rax |
| +; SSE41-NEXT: xorps %xmm0, %xmm0 |
| +; SSE41-NEXT: cvtsi2sd %rax, %xmm0 |
| ; SSE41-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] |
| ; SSE41-NEXT: retq |
| ; |
| ; VEX-LABEL: sitofp_load_2i64_to_2f64: |
| ; VEX: # %bb.0: |
| -; VEX-NEXT: vcvtsi2sdq 8(%rdi), %xmm0, %xmm0 |
| -; VEX-NEXT: vcvtsi2sdq (%rdi), %xmm1, %xmm1 |
| -; VEX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] |
| +; VEX-NEXT: vmovdqa (%rdi), %xmm0 |
| +; VEX-NEXT: vpextrq $1, %xmm0, %rax |
| +; VEX-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 |
| +; VEX-NEXT: vmovq %xmm0, %rax |
| +; VEX-NEXT: vcvtsi2sd %rax, %xmm2, %xmm0 |
| +; VEX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] |
| ; VEX-NEXT: retq |
| ; |
| ; AVX512F-LABEL: sitofp_load_2i64_to_2f64: |
| ; AVX512F: # %bb.0: |
| -; AVX512F-NEXT: vcvtsi2sdq 8(%rdi), %xmm0, %xmm0 |
| -; AVX512F-NEXT: vcvtsi2sdq (%rdi), %xmm1, %xmm1 |
| -; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] |
| +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 |
| +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax |
| +; AVX512F-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 |
| +; AVX512F-NEXT: vmovq %xmm0, %rax |
| +; AVX512F-NEXT: vcvtsi2sd %rax, %xmm2, %xmm0 |
| +; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] |
| ; AVX512F-NEXT: retq |
| ; |
| ; AVX512VL-LABEL: sitofp_load_2i64_to_2f64: |
| ; AVX512VL: # %bb.0: |
| -; AVX512VL-NEXT: vcvtsi2sdq 8(%rdi), %xmm0, %xmm0 |
| -; AVX512VL-NEXT: vcvtsi2sdq (%rdi), %xmm1, %xmm1 |
| -; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] |
| +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 |
| +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax |
| +; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 |
| +; AVX512VL-NEXT: vmovq %xmm0, %rax |
| +; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm2, %xmm0 |
| +; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] |
| ; AVX512VL-NEXT: retq |
| ; |
| ; AVX512DQ-LABEL: sitofp_load_2i64_to_2f64: |
| @@ -3078,14 +3092,16 @@ define <4 x double> @sitofp_load_4i64_to_4f64(<4 x i64> *%a) { |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: movdqa (%rdi), %xmm1 |
| ; SSE2-NEXT: movdqa 16(%rdi), %xmm2 |
| -; SSE2-NEXT: cvtsi2sdq (%rdi), %xmm0 |
| +; SSE2-NEXT: movq %xmm1, %rax |
| +; SSE2-NEXT: cvtsi2sd %rax, %xmm0 |
| ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] |
| ; SSE2-NEXT: movq %xmm1, %rax |
| ; SSE2-NEXT: xorps %xmm1, %xmm1 |
| ; SSE2-NEXT: cvtsi2sd %rax, %xmm1 |
| ; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] |
| +; SSE2-NEXT: movq %xmm2, %rax |
| ; SSE2-NEXT: xorps %xmm1, %xmm1 |
| -; SSE2-NEXT: cvtsi2sdq 16(%rdi), %xmm1 |
| +; SSE2-NEXT: cvtsi2sd %rax, %xmm1 |
| ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] |
| ; SSE2-NEXT: movq %xmm2, %rax |
| ; SSE2-NEXT: xorps %xmm2, %xmm2 |
| @@ -3095,46 +3111,72 @@ define <4 x double> @sitofp_load_4i64_to_4f64(<4 x i64> *%a) { |
| ; |
| ; SSE41-LABEL: sitofp_load_4i64_to_4f64: |
| ; SSE41: # %bb.0: |
| -; SSE41-NEXT: cvtsi2sdq 8(%rdi), %xmm1 |
| -; SSE41-NEXT: cvtsi2sdq (%rdi), %xmm0 |
| -; SSE41-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] |
| -; SSE41-NEXT: cvtsi2sdq 24(%rdi), %xmm2 |
| +; SSE41-NEXT: movdqa (%rdi), %xmm0 |
| +; SSE41-NEXT: movdqa 16(%rdi), %xmm1 |
| +; SSE41-NEXT: pextrq $1, %xmm0, %rax |
| +; SSE41-NEXT: cvtsi2sd %rax, %xmm2 |
| +; SSE41-NEXT: movq %xmm0, %rax |
| +; SSE41-NEXT: xorps %xmm0, %xmm0 |
| +; SSE41-NEXT: cvtsi2sd %rax, %xmm0 |
| +; SSE41-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] |
| +; SSE41-NEXT: pextrq $1, %xmm1, %rax |
| +; SSE41-NEXT: xorps %xmm2, %xmm2 |
| +; SSE41-NEXT: cvtsi2sd %rax, %xmm2 |
| +; SSE41-NEXT: movq %xmm1, %rax |
| ; SSE41-NEXT: xorps %xmm1, %xmm1 |
| -; SSE41-NEXT: cvtsi2sdq 16(%rdi), %xmm1 |
| +; SSE41-NEXT: cvtsi2sd %rax, %xmm1 |
| ; SSE41-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] |
| ; SSE41-NEXT: retq |
| ; |
| ; VEX-LABEL: sitofp_load_4i64_to_4f64: |
| ; VEX: # %bb.0: |
| -; VEX-NEXT: vcvtsi2sdq 24(%rdi), %xmm0, %xmm0 |
| -; VEX-NEXT: vcvtsi2sdq 16(%rdi), %xmm1, %xmm1 |
| -; VEX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] |
| -; VEX-NEXT: vcvtsi2sdq 8(%rdi), %xmm2, %xmm1 |
| -; VEX-NEXT: vcvtsi2sdq (%rdi), %xmm2, %xmm2 |
| -; VEX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0] |
| -; VEX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 |
| +; VEX-NEXT: vmovapd (%rdi), %xmm0 |
| +; VEX-NEXT: vmovdqa 16(%rdi), %xmm1 |
| +; VEX-NEXT: vpextrq $1, %xmm1, %rax |
| +; VEX-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 |
| +; VEX-NEXT: vmovq %xmm1, %rax |
| +; VEX-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 |
| +; VEX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] |
| +; VEX-NEXT: vpextrq $1, %xmm0, %rax |
| +; VEX-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 |
| +; VEX-NEXT: vmovq %xmm0, %rax |
| +; VEX-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 |
| +; VEX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] |
| +; VEX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 |
| ; VEX-NEXT: retq |
| ; |
| ; AVX512F-LABEL: sitofp_load_4i64_to_4f64: |
| ; AVX512F: # %bb.0: |
| -; AVX512F-NEXT: vcvtsi2sdq 24(%rdi), %xmm0, %xmm0 |
| -; AVX512F-NEXT: vcvtsi2sdq 16(%rdi), %xmm1, %xmm1 |
| -; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] |
| -; AVX512F-NEXT: vcvtsi2sdq 8(%rdi), %xmm2, %xmm1 |
| -; AVX512F-NEXT: vcvtsi2sdq (%rdi), %xmm2, %xmm2 |
| -; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0] |
| -; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 |
| +; AVX512F-NEXT: vmovapd (%rdi), %xmm0 |
| +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 |
| +; AVX512F-NEXT: vpextrq $1, %xmm1, %rax |
| +; AVX512F-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 |
| +; AVX512F-NEXT: vmovq %xmm1, %rax |
| +; AVX512F-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 |
| +; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] |
| +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax |
| +; AVX512F-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 |
| +; AVX512F-NEXT: vmovq %xmm0, %rax |
| +; AVX512F-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 |
| +; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] |
| +; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 |
| ; AVX512F-NEXT: retq |
| ; |
| ; AVX512VL-LABEL: sitofp_load_4i64_to_4f64: |
| ; AVX512VL: # %bb.0: |
| -; AVX512VL-NEXT: vcvtsi2sdq 24(%rdi), %xmm0, %xmm0 |
| -; AVX512VL-NEXT: vcvtsi2sdq 16(%rdi), %xmm1, %xmm1 |
| -; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] |
| -; AVX512VL-NEXT: vcvtsi2sdq 8(%rdi), %xmm2, %xmm1 |
| -; AVX512VL-NEXT: vcvtsi2sdq (%rdi), %xmm2, %xmm2 |
| -; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0] |
| -; AVX512VL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 |
| +; AVX512VL-NEXT: vmovapd (%rdi), %xmm0 |
| +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 |
| +; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax |
| +; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 |
| +; AVX512VL-NEXT: vmovq %xmm1, %rax |
| +; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 |
| +; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] |
| +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax |
| +; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 |
| +; AVX512VL-NEXT: vmovq %xmm0, %rax |
| +; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 |
| +; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] |
| +; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 |
| ; AVX512VL-NEXT: retq |
| ; |
| ; AVX512DQ-LABEL: sitofp_load_4i64_to_4f64: |
| @@ -3839,14 +3881,16 @@ define <4 x float> @sitofp_load_4i64_to_4f32(<4 x i64> *%a) { |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: movdqa (%rdi), %xmm1 |
| ; SSE2-NEXT: movdqa 16(%rdi), %xmm0 |
| -; SSE2-NEXT: cvtsi2ssq 16(%rdi), %xmm2 |
| +; SSE2-NEXT: movq %xmm0, %rax |
| +; SSE2-NEXT: cvtsi2ss %rax, %xmm2 |
| ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] |
| ; SSE2-NEXT: movq %xmm0, %rax |
| ; SSE2-NEXT: xorps %xmm0, %xmm0 |
| ; SSE2-NEXT: cvtsi2ss %rax, %xmm0 |
| ; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] |
| +; SSE2-NEXT: movq %xmm1, %rax |
| ; SSE2-NEXT: xorps %xmm0, %xmm0 |
| -; SSE2-NEXT: cvtsi2ssq (%rdi), %xmm0 |
| +; SSE2-NEXT: cvtsi2ss %rax, %xmm0 |
| ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] |
| ; SSE2-NEXT: movq %xmm1, %rax |
| ; SSE2-NEXT: xorps %xmm1, %xmm1 |
| @@ -3857,47 +3901,72 @@ define <4 x float> @sitofp_load_4i64_to_4f32(<4 x i64> *%a) { |
| ; |
| ; SSE41-LABEL: sitofp_load_4i64_to_4f32: |
| ; SSE41: # %bb.0: |
| -; SSE41-NEXT: cvtsi2ssq 8(%rdi), %xmm1 |
| -; SSE41-NEXT: cvtsi2ssq (%rdi), %xmm0 |
| -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] |
| -; SSE41-NEXT: xorps %xmm1, %xmm1 |
| -; SSE41-NEXT: cvtsi2ssq 16(%rdi), %xmm1 |
| -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] |
| +; SSE41-NEXT: movdqa (%rdi), %xmm0 |
| +; SSE41-NEXT: movdqa 16(%rdi), %xmm1 |
| +; SSE41-NEXT: pextrq $1, %xmm0, %rax |
| +; SSE41-NEXT: cvtsi2ss %rax, %xmm2 |
| +; SSE41-NEXT: movq %xmm0, %rax |
| +; SSE41-NEXT: xorps %xmm0, %xmm0 |
| +; SSE41-NEXT: cvtsi2ss %rax, %xmm0 |
| +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] |
| +; SSE41-NEXT: movq %xmm1, %rax |
| +; SSE41-NEXT: xorps %xmm2, %xmm2 |
| +; SSE41-NEXT: cvtsi2ss %rax, %xmm2 |
| +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] |
| +; SSE41-NEXT: pextrq $1, %xmm1, %rax |
| ; SSE41-NEXT: xorps %xmm1, %xmm1 |
| -; SSE41-NEXT: cvtsi2ssq 24(%rdi), %xmm1 |
| +; SSE41-NEXT: cvtsi2ss %rax, %xmm1 |
| ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] |
| ; SSE41-NEXT: retq |
| ; |
| ; VEX-LABEL: sitofp_load_4i64_to_4f32: |
| ; VEX: # %bb.0: |
| -; VEX-NEXT: vcvtsi2ssq 8(%rdi), %xmm0, %xmm0 |
| -; VEX-NEXT: vcvtsi2ssq (%rdi), %xmm1, %xmm1 |
| -; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] |
| -; VEX-NEXT: vcvtsi2ssq 16(%rdi), %xmm2, %xmm1 |
| -; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] |
| -; VEX-NEXT: vcvtsi2ssq 24(%rdi), %xmm2, %xmm1 |
| +; VEX-NEXT: vmovdqa (%rdi), %xmm0 |
| +; VEX-NEXT: vmovdqa 16(%rdi), %xmm1 |
| +; VEX-NEXT: vpextrq $1, %xmm0, %rax |
| +; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 |
| +; VEX-NEXT: vmovq %xmm0, %rax |
| +; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 |
| +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] |
| +; VEX-NEXT: vmovq %xmm1, %rax |
| +; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 |
| +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] |
| +; VEX-NEXT: vpextrq $1, %xmm1, %rax |
| +; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1 |
| ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] |
| ; VEX-NEXT: retq |
| ; |
| ; AVX512F-LABEL: sitofp_load_4i64_to_4f32: |
| ; AVX512F: # %bb.0: |
| -; AVX512F-NEXT: vcvtsi2ssq 8(%rdi), %xmm0, %xmm0 |
| -; AVX512F-NEXT: vcvtsi2ssq (%rdi), %xmm1, %xmm1 |
| -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] |
| -; AVX512F-NEXT: vcvtsi2ssq 16(%rdi), %xmm2, %xmm1 |
| -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] |
| -; AVX512F-NEXT: vcvtsi2ssq 24(%rdi), %xmm2, %xmm1 |
| +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 |
| +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 |
| +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax |
| +; AVX512F-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 |
| +; AVX512F-NEXT: vmovq %xmm0, %rax |
| +; AVX512F-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 |
| +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] |
| +; AVX512F-NEXT: vmovq %xmm1, %rax |
| +; AVX512F-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 |
| +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] |
| +; AVX512F-NEXT: vpextrq $1, %xmm1, %rax |
| +; AVX512F-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1 |
| ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] |
| ; AVX512F-NEXT: retq |
| ; |
| ; AVX512VL-LABEL: sitofp_load_4i64_to_4f32: |
| ; AVX512VL: # %bb.0: |
| -; AVX512VL-NEXT: vcvtsi2ssq 8(%rdi), %xmm0, %xmm0 |
| -; AVX512VL-NEXT: vcvtsi2ssq (%rdi), %xmm1, %xmm1 |
| -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] |
| -; AVX512VL-NEXT: vcvtsi2ssq 16(%rdi), %xmm2, %xmm1 |
| -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] |
| -; AVX512VL-NEXT: vcvtsi2ssq 24(%rdi), %xmm2, %xmm1 |
| +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 |
| +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 |
| +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax |
| +; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 |
| +; AVX512VL-NEXT: vmovq %xmm0, %rax |
| +; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 |
| +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] |
| +; AVX512VL-NEXT: vmovq %xmm1, %rax |
| +; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 |
| +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] |
| +; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax |
| +; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1 |
| ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] |
| ; AVX512VL-NEXT: retq |
| ; |
| @@ -3991,29 +4060,33 @@ define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) { |
| ; SSE2-NEXT: movdqa 16(%rdi), %xmm0 |
| ; SSE2-NEXT: movdqa 32(%rdi), %xmm2 |
| ; SSE2-NEXT: movdqa 48(%rdi), %xmm3 |
| -; SSE2-NEXT: cvtsi2ssq 16(%rdi), %xmm4 |
| +; SSE2-NEXT: movq %xmm0, %rax |
| +; SSE2-NEXT: cvtsi2ss %rax, %xmm4 |
| ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] |
| ; SSE2-NEXT: movq %xmm0, %rax |
| ; SSE2-NEXT: xorps %xmm0, %xmm0 |
| ; SSE2-NEXT: cvtsi2ss %rax, %xmm0 |
| ; SSE2-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] |
| +; SSE2-NEXT: movq %xmm1, %rax |
| ; SSE2-NEXT: xorps %xmm0, %xmm0 |
| -; SSE2-NEXT: cvtsi2ssq (%rdi), %xmm0 |
| +; SSE2-NEXT: cvtsi2ss %rax, %xmm0 |
| ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] |
| ; SSE2-NEXT: movq %xmm1, %rax |
| ; SSE2-NEXT: xorps %xmm1, %xmm1 |
| ; SSE2-NEXT: cvtsi2ss %rax, %xmm1 |
| ; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] |
| ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] |
| +; SSE2-NEXT: movq %xmm3, %rax |
| ; SSE2-NEXT: xorps %xmm4, %xmm4 |
| -; SSE2-NEXT: cvtsi2ssq 48(%rdi), %xmm4 |
| +; SSE2-NEXT: cvtsi2ss %rax, %xmm4 |
| ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] |
| ; SSE2-NEXT: movq %xmm1, %rax |
| ; SSE2-NEXT: xorps %xmm1, %xmm1 |
| ; SSE2-NEXT: cvtsi2ss %rax, %xmm1 |
| ; SSE2-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] |
| +; SSE2-NEXT: movq %xmm2, %rax |
| ; SSE2-NEXT: xorps %xmm1, %xmm1 |
| -; SSE2-NEXT: cvtsi2ssq 32(%rdi), %xmm1 |
| +; SSE2-NEXT: cvtsi2ss %rax, %xmm1 |
| ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] |
| ; SSE2-NEXT: movq %xmm2, %rax |
| ; SSE2-NEXT: xorps %xmm2, %xmm2 |
| @@ -4024,82 +4097,132 @@ define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) { |
| ; |
| ; SSE41-LABEL: sitofp_load_8i64_to_8f32: |
| ; SSE41: # %bb.0: |
| -; SSE41-NEXT: cvtsi2ssq 8(%rdi), %xmm1 |
| -; SSE41-NEXT: cvtsi2ssq (%rdi), %xmm0 |
| -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] |
| -; SSE41-NEXT: xorps %xmm1, %xmm1 |
| -; SSE41-NEXT: cvtsi2ssq 16(%rdi), %xmm1 |
| -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] |
| +; SSE41-NEXT: movdqa (%rdi), %xmm0 |
| +; SSE41-NEXT: movdqa 16(%rdi), %xmm1 |
| +; SSE41-NEXT: movdqa 32(%rdi), %xmm2 |
| +; SSE41-NEXT: movdqa 48(%rdi), %xmm3 |
| +; SSE41-NEXT: pextrq $1, %xmm0, %rax |
| +; SSE41-NEXT: cvtsi2ss %rax, %xmm4 |
| +; SSE41-NEXT: movq %xmm0, %rax |
| +; SSE41-NEXT: xorps %xmm0, %xmm0 |
| +; SSE41-NEXT: cvtsi2ss %rax, %xmm0 |
| +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3] |
| +; SSE41-NEXT: movq %xmm1, %rax |
| +; SSE41-NEXT: xorps %xmm4, %xmm4 |
| +; SSE41-NEXT: cvtsi2ss %rax, %xmm4 |
| +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0],xmm0[3] |
| +; SSE41-NEXT: pextrq $1, %xmm1, %rax |
| ; SSE41-NEXT: xorps %xmm1, %xmm1 |
| -; SSE41-NEXT: cvtsi2ssq 24(%rdi), %xmm1 |
| +; SSE41-NEXT: cvtsi2ss %rax, %xmm1 |
| ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] |
| -; SSE41-NEXT: cvtsi2ssq 40(%rdi), %xmm2 |
| +; SSE41-NEXT: pextrq $1, %xmm2, %rax |
| +; SSE41-NEXT: xorps %xmm4, %xmm4 |
| +; SSE41-NEXT: cvtsi2ss %rax, %xmm4 |
| +; SSE41-NEXT: movq %xmm2, %rax |
| ; SSE41-NEXT: xorps %xmm1, %xmm1 |
| -; SSE41-NEXT: cvtsi2ssq 32(%rdi), %xmm1 |
| -; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] |
| +; SSE41-NEXT: cvtsi2ss %rax, %xmm1 |
| +; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2,3] |
| +; SSE41-NEXT: movq %xmm3, %rax |
| ; SSE41-NEXT: xorps %xmm2, %xmm2 |
| -; SSE41-NEXT: cvtsi2ssq 48(%rdi), %xmm2 |
| +; SSE41-NEXT: cvtsi2ss %rax, %xmm2 |
| ; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] |
| +; SSE41-NEXT: pextrq $1, %xmm3, %rax |
| ; SSE41-NEXT: xorps %xmm2, %xmm2 |
| -; SSE41-NEXT: cvtsi2ssq 56(%rdi), %xmm2 |
| +; SSE41-NEXT: cvtsi2ss %rax, %xmm2 |
| ; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] |
| ; SSE41-NEXT: retq |
| ; |
| ; VEX-LABEL: sitofp_load_8i64_to_8f32: |
| ; VEX: # %bb.0: |
| -; VEX-NEXT: vcvtsi2ssq 40(%rdi), %xmm0, %xmm0 |
| -; VEX-NEXT: vcvtsi2ssq 32(%rdi), %xmm1, %xmm1 |
| -; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] |
| -; VEX-NEXT: vcvtsi2ssq 48(%rdi), %xmm2, %xmm1 |
| -; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] |
| -; VEX-NEXT: vcvtsi2ssq 56(%rdi), %xmm2, %xmm1 |
| +; VEX-NEXT: vmovaps (%rdi), %xmm0 |
| +; VEX-NEXT: vmovdqa 16(%rdi), %xmm1 |
| +; VEX-NEXT: vmovdqa 32(%rdi), %xmm2 |
| +; VEX-NEXT: vmovdqa 48(%rdi), %xmm3 |
| +; VEX-NEXT: vpextrq $1, %xmm2, %rax |
| +; VEX-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4 |
| +; VEX-NEXT: vmovq %xmm2, %rax |
| +; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm2 |
| +; VEX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3] |
| +; VEX-NEXT: vmovq %xmm3, %rax |
| +; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 |
| +; VEX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] |
| +; VEX-NEXT: vpextrq $1, %xmm3, %rax |
| +; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 |
| +; VEX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] |
| +; VEX-NEXT: vpextrq $1, %xmm0, %rax |
| +; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 |
| +; VEX-NEXT: vmovq %xmm0, %rax |
| +; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm0 |
| +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] |
| +; VEX-NEXT: vmovq %xmm1, %rax |
| +; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 |
| +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] |
| +; VEX-NEXT: vpextrq $1, %xmm1, %rax |
| +; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm1 |
| ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] |
| -; VEX-NEXT: vcvtsi2ssq 8(%rdi), %xmm2, %xmm1 |
| -; VEX-NEXT: vcvtsi2ssq (%rdi), %xmm2, %xmm2 |
| -; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] |
| -; VEX-NEXT: vcvtsi2ssq 16(%rdi), %xmm3, %xmm2 |
| -; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] |
| -; VEX-NEXT: vcvtsi2ssq 24(%rdi), %xmm3, %xmm2 |
| -; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] |
| -; VEX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 |
| +; VEX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 |
| ; VEX-NEXT: retq |
| ; |
| ; AVX512F-LABEL: sitofp_load_8i64_to_8f32: |
| ; AVX512F: # %bb.0: |
| -; AVX512F-NEXT: vcvtsi2ssq 40(%rdi), %xmm0, %xmm0 |
| -; AVX512F-NEXT: vcvtsi2ssq 32(%rdi), %xmm1, %xmm1 |
| -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] |
| -; AVX512F-NEXT: vcvtsi2ssq 48(%rdi), %xmm2, %xmm1 |
| -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] |
| -; AVX512F-NEXT: vcvtsi2ssq 56(%rdi), %xmm2, %xmm1 |
| +; AVX512F-NEXT: vmovaps (%rdi), %xmm0 |
| +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 |
| +; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2 |
| +; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3 |
| +; AVX512F-NEXT: vpextrq $1, %xmm2, %rax |
| +; AVX512F-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4 |
| +; AVX512F-NEXT: vmovq %xmm2, %rax |
| +; AVX512F-NEXT: vcvtsi2ss %rax, %xmm5, %xmm2 |
| +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3] |
| +; AVX512F-NEXT: vmovq %xmm3, %rax |
| +; AVX512F-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 |
| +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] |
| +; AVX512F-NEXT: vpextrq $1, %xmm3, %rax |
| +; AVX512F-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 |
| +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] |
| +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax |
| +; AVX512F-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 |
| +; AVX512F-NEXT: vmovq %xmm0, %rax |
| +; AVX512F-NEXT: vcvtsi2ss %rax, %xmm5, %xmm0 |
| +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] |
| +; AVX512F-NEXT: vmovq %xmm1, %rax |
| +; AVX512F-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 |
| +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] |
| +; AVX512F-NEXT: vpextrq $1, %xmm1, %rax |
| +; AVX512F-NEXT: vcvtsi2ss %rax, %xmm5, %xmm1 |
| ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] |
| -; AVX512F-NEXT: vcvtsi2ssq 8(%rdi), %xmm2, %xmm1 |
| -; AVX512F-NEXT: vcvtsi2ssq (%rdi), %xmm2, %xmm2 |
| -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] |
| -; AVX512F-NEXT: vcvtsi2ssq 16(%rdi), %xmm3, %xmm2 |
| -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] |
| -; AVX512F-NEXT: vcvtsi2ssq 24(%rdi), %xmm3, %xmm2 |
| -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] |
| -; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 |
| +; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 |
| ; AVX512F-NEXT: retq |
| ; |
| ; AVX512VL-LABEL: sitofp_load_8i64_to_8f32: |
| ; AVX512VL: # %bb.0: |
| -; AVX512VL-NEXT: vcvtsi2ssq 40(%rdi), %xmm0, %xmm0 |
| -; AVX512VL-NEXT: vcvtsi2ssq 32(%rdi), %xmm1, %xmm1 |
| -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] |
| -; AVX512VL-NEXT: vcvtsi2ssq 48(%rdi), %xmm2, %xmm1 |
| -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] |
| -; AVX512VL-NEXT: vcvtsi2ssq 56(%rdi), %xmm2, %xmm1 |
| +; AVX512VL-NEXT: vmovaps (%rdi), %xmm0 |
| +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 |
| +; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2 |
| +; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3 |
| +; AVX512VL-NEXT: vpextrq $1, %xmm2, %rax |
| +; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4 |
| +; AVX512VL-NEXT: vmovq %xmm2, %rax |
| +; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm5, %xmm2 |
| +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3] |
| +; AVX512VL-NEXT: vmovq %xmm3, %rax |
| +; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 |
| +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] |
| +; AVX512VL-NEXT: vpextrq $1, %xmm3, %rax |
| +; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 |
| +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] |
| +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax |
| +; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 |
| +; AVX512VL-NEXT: vmovq %xmm0, %rax |
| +; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm5, %xmm0 |
| +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] |
| +; AVX512VL-NEXT: vmovq %xmm1, %rax |
| +; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 |
| +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] |
| +; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax |
| +; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm5, %xmm1 |
| ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] |
| -; AVX512VL-NEXT: vcvtsi2ssq 8(%rdi), %xmm2, %xmm1 |
| -; AVX512VL-NEXT: vcvtsi2ssq (%rdi), %xmm2, %xmm2 |
| -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] |
| -; AVX512VL-NEXT: vcvtsi2ssq 16(%rdi), %xmm3, %xmm2 |
| -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] |
| -; AVX512VL-NEXT: vcvtsi2ssq 24(%rdi), %xmm3, %xmm2 |
| -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] |
| -; AVX512VL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 |
| +; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 |
| ; AVX512VL-NEXT: retq |
| ; |
| ; AVX512DQ-LABEL: sitofp_load_8i64_to_8f32: |
| @@ -4229,7 +4352,7 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) { |
| ; SSE2-LABEL: uitofp_load_4i64_to_4f32: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: movdqa 16(%rdi), %xmm0 |
| -; SSE2-NEXT: movq 16(%rdi), %rax |
| +; SSE2-NEXT: movq %xmm0, %rax |
| ; SSE2-NEXT: testq %rax, %rax |
| ; SSE2-NEXT: js .LBB83_1 |
| ; SSE2-NEXT: # %bb.2: |
| @@ -4243,23 +4366,23 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) { |
| ; SSE2-NEXT: cvtsi2ss %rax, %xmm1 |
| ; SSE2-NEXT: addss %xmm1, %xmm1 |
| ; SSE2-NEXT: .LBB83_3: |
| -; SSE2-NEXT: movq (%rdi), %rax |
| +; SSE2-NEXT: movdqa (%rdi), %xmm2 |
| ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] |
| -; SSE2-NEXT: movq %xmm0, %rcx |
| -; SSE2-NEXT: testq %rcx, %rcx |
| +; SSE2-NEXT: movq %xmm0, %rax |
| +; SSE2-NEXT: testq %rax, %rax |
| ; SSE2-NEXT: js .LBB83_4 |
| ; SSE2-NEXT: # %bb.5: |
| -; SSE2-NEXT: cvtsi2ss %rcx, %xmm2 |
| +; SSE2-NEXT: cvtsi2ss %rax, %xmm3 |
| ; SSE2-NEXT: jmp .LBB83_6 |
| ; SSE2-NEXT: .LBB83_4: |
| -; SSE2-NEXT: movq %rcx, %rdx |
| -; SSE2-NEXT: shrq %rdx |
| -; SSE2-NEXT: andl $1, %ecx |
| -; SSE2-NEXT: orq %rdx, %rcx |
| -; SSE2-NEXT: cvtsi2ss %rcx, %xmm2 |
| -; SSE2-NEXT: addss %xmm2, %xmm2 |
| +; SSE2-NEXT: movq %rax, %rcx |
| +; SSE2-NEXT: shrq %rcx |
| +; SSE2-NEXT: andl $1, %eax |
| +; SSE2-NEXT: orq %rcx, %rax |
| +; SSE2-NEXT: cvtsi2ss %rax, %xmm3 |
| +; SSE2-NEXT: addss %xmm3, %xmm3 |
| ; SSE2-NEXT: .LBB83_6: |
| -; SSE2-NEXT: movdqa (%rdi), %xmm3 |
| +; SSE2-NEXT: movq %xmm2, %rax |
| ; SSE2-NEXT: testq %rax, %rax |
| ; SSE2-NEXT: js .LBB83_7 |
| ; SSE2-NEXT: # %bb.8: |
| @@ -4275,8 +4398,8 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) { |
| ; SSE2-NEXT: cvtsi2ss %rax, %xmm0 |
| ; SSE2-NEXT: addss %xmm0, %xmm0 |
| ; SSE2-NEXT: .LBB83_9: |
| -; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] |
| -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] |
| +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] |
| +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] |
| ; SSE2-NEXT: movq %xmm2, %rax |
| ; SSE2-NEXT: testq %rax, %rax |
| ; SSE2-NEXT: js .LBB83_10 |
| @@ -4397,23 +4520,35 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) { |
| ; |
| ; AVX512F-LABEL: uitofp_load_4i64_to_4f32: |
| ; AVX512F: # %bb.0: |
| -; AVX512F-NEXT: vcvtusi2ssq 8(%rdi), %xmm0, %xmm0 |
| -; AVX512F-NEXT: vcvtusi2ssq (%rdi), %xmm1, %xmm1 |
| -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] |
| -; AVX512F-NEXT: vcvtusi2ssq 16(%rdi), %xmm2, %xmm1 |
| -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] |
| -; AVX512F-NEXT: vcvtusi2ssq 24(%rdi), %xmm2, %xmm1 |
| +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 |
| +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 |
| +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax |
| +; AVX512F-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2 |
| +; AVX512F-NEXT: vmovq %xmm0, %rax |
| +; AVX512F-NEXT: vcvtusi2ss %rax, %xmm3, %xmm0 |
| +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] |
| +; AVX512F-NEXT: vmovq %xmm1, %rax |
| +; AVX512F-NEXT: vcvtusi2ss %rax, %xmm3, %xmm2 |
| +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] |
| +; AVX512F-NEXT: vpextrq $1, %xmm1, %rax |
| +; AVX512F-NEXT: vcvtusi2ss %rax, %xmm3, %xmm1 |
| ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] |
| ; AVX512F-NEXT: retq |
| ; |
| ; AVX512VL-LABEL: uitofp_load_4i64_to_4f32: |
| ; AVX512VL: # %bb.0: |
| -; AVX512VL-NEXT: vcvtusi2ssq 8(%rdi), %xmm0, %xmm0 |
| -; AVX512VL-NEXT: vcvtusi2ssq (%rdi), %xmm1, %xmm1 |
| -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] |
| -; AVX512VL-NEXT: vcvtusi2ssq 16(%rdi), %xmm2, %xmm1 |
| -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] |
| -; AVX512VL-NEXT: vcvtusi2ssq 24(%rdi), %xmm2, %xmm1 |
| +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 |
| +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 |
| +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax |
| +; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2 |
| +; AVX512VL-NEXT: vmovq %xmm0, %rax |
| +; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm3, %xmm0 |
| +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] |
| +; AVX512VL-NEXT: vmovq %xmm1, %rax |
| +; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm3, %xmm2 |
| +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] |
| +; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax |
| +; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm3, %xmm1 |
| ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] |
| ; AVX512VL-NEXT: retq |
| ; |
| @@ -4566,7 +4701,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { |
| ; SSE2-LABEL: uitofp_load_8i64_to_8f32: |
| ; SSE2: # %bb.0: |
| ; SSE2-NEXT: movdqa 16(%rdi), %xmm0 |
| -; SSE2-NEXT: movq 16(%rdi), %rax |
| +; SSE2-NEXT: movq %xmm0, %rax |
| ; SSE2-NEXT: testq %rax, %rax |
| ; SSE2-NEXT: js .LBB87_1 |
| ; SSE2-NEXT: # %bb.2: |
| @@ -4580,23 +4715,23 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { |
| ; SSE2-NEXT: cvtsi2ss %rax, %xmm2 |
| ; SSE2-NEXT: addss %xmm2, %xmm2 |
| ; SSE2-NEXT: .LBB87_3: |
| -; SSE2-NEXT: movq (%rdi), %rax |
| +; SSE2-NEXT: movdqa (%rdi), %xmm3 |
| ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] |
| -; SSE2-NEXT: movq %xmm0, %rcx |
| -; SSE2-NEXT: testq %rcx, %rcx |
| +; SSE2-NEXT: movq %xmm0, %rax |
| +; SSE2-NEXT: testq %rax, %rax |
| ; SSE2-NEXT: js .LBB87_4 |
| ; SSE2-NEXT: # %bb.5: |
| -; SSE2-NEXT: cvtsi2ss %rcx, %xmm1 |
| +; SSE2-NEXT: cvtsi2ss %rax, %xmm1 |
| ; SSE2-NEXT: jmp .LBB87_6 |
| ; SSE2-NEXT: .LBB87_4: |
| -; SSE2-NEXT: movq %rcx, %rdx |
| -; SSE2-NEXT: shrq %rdx |
| -; SSE2-NEXT: andl $1, %ecx |
| -; SSE2-NEXT: orq %rdx, %rcx |
| -; SSE2-NEXT: cvtsi2ss %rcx, %xmm1 |
| +; SSE2-NEXT: movq %rax, %rcx |
| +; SSE2-NEXT: shrq %rcx |
| +; SSE2-NEXT: andl $1, %eax |
| +; SSE2-NEXT: orq %rcx, %rax |
| +; SSE2-NEXT: cvtsi2ss %rax, %xmm1 |
| ; SSE2-NEXT: addss %xmm1, %xmm1 |
| ; SSE2-NEXT: .LBB87_6: |
| -; SSE2-NEXT: movdqa (%rdi), %xmm3 |
| +; SSE2-NEXT: movq %xmm3, %rax |
| ; SSE2-NEXT: testq %rax, %rax |
| ; SSE2-NEXT: js .LBB87_7 |
| ; SSE2-NEXT: # %bb.8: |
| @@ -4612,23 +4747,23 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { |
| ; SSE2-NEXT: cvtsi2ss %rax, %xmm0 |
| ; SSE2-NEXT: addss %xmm0, %xmm0 |
| ; SSE2-NEXT: .LBB87_9: |
| -; SSE2-NEXT: movq 48(%rdi), %rax |
| +; SSE2-NEXT: movdqa 48(%rdi), %xmm6 |
| ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] |
| -; SSE2-NEXT: movq %xmm3, %rcx |
| -; SSE2-NEXT: testq %rcx, %rcx |
| +; SSE2-NEXT: movq %xmm3, %rax |
| +; SSE2-NEXT: testq %rax, %rax |
| ; SSE2-NEXT: js .LBB87_10 |
| ; SSE2-NEXT: # %bb.11: |
| -; SSE2-NEXT: cvtsi2ss %rcx, %xmm4 |
| +; SSE2-NEXT: cvtsi2ss %rax, %xmm4 |
| ; SSE2-NEXT: jmp .LBB87_12 |
| ; SSE2-NEXT: .LBB87_10: |
| -; SSE2-NEXT: movq %rcx, %rdx |
| -; SSE2-NEXT: shrq %rdx |
| -; SSE2-NEXT: andl $1, %ecx |
| -; SSE2-NEXT: orq %rdx, %rcx |
| -; SSE2-NEXT: cvtsi2ss %rcx, %xmm4 |
| +; SSE2-NEXT: movq %rax, %rcx |
| +; SSE2-NEXT: shrq %rcx |
| +; SSE2-NEXT: andl $1, %eax |
| +; SSE2-NEXT: orq %rcx, %rax |
| +; SSE2-NEXT: cvtsi2ss %rax, %xmm4 |
| ; SSE2-NEXT: addss %xmm4, %xmm4 |
| ; SSE2-NEXT: .LBB87_12: |
| -; SSE2-NEXT: movdqa 48(%rdi), %xmm5 |
| +; SSE2-NEXT: movq %xmm6, %rax |
| ; SSE2-NEXT: testq %rax, %rax |
| ; SSE2-NEXT: js .LBB87_13 |
| ; SSE2-NEXT: # %bb.14: |
| @@ -4644,27 +4779,27 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { |
| ; SSE2-NEXT: cvtsi2ss %rax, %xmm3 |
| ; SSE2-NEXT: addss %xmm3, %xmm3 |
| ; SSE2-NEXT: .LBB87_15: |
| -; SSE2-NEXT: movq 32(%rdi), %rax |
| -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] |
| -; SSE2-NEXT: movq %xmm5, %rcx |
| -; SSE2-NEXT: testq %rcx, %rcx |
| +; SSE2-NEXT: movdqa 32(%rdi), %xmm5 |
| +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] |
| +; SSE2-NEXT: movq %xmm6, %rax |
| +; SSE2-NEXT: testq %rax, %rax |
| ; SSE2-NEXT: js .LBB87_16 |
| ; SSE2-NEXT: # %bb.17: |
| -; SSE2-NEXT: xorps %xmm5, %xmm5 |
| -; SSE2-NEXT: cvtsi2ss %rcx, %xmm5 |
| +; SSE2-NEXT: xorps %xmm6, %xmm6 |
| +; SSE2-NEXT: cvtsi2ss %rax, %xmm6 |
| ; SSE2-NEXT: jmp .LBB87_18 |
| ; SSE2-NEXT: .LBB87_16: |
| -; SSE2-NEXT: movq %rcx, %rdx |
| -; SSE2-NEXT: shrq %rdx |
| -; SSE2-NEXT: andl $1, %ecx |
| -; SSE2-NEXT: orq %rdx, %rcx |
| -; SSE2-NEXT: xorps %xmm5, %xmm5 |
| -; SSE2-NEXT: cvtsi2ss %rcx, %xmm5 |
| -; SSE2-NEXT: addss %xmm5, %xmm5 |
| +; SSE2-NEXT: movq %rax, %rcx |
| +; SSE2-NEXT: shrq %rcx |
| +; SSE2-NEXT: andl $1, %eax |
| +; SSE2-NEXT: orq %rcx, %rax |
| +; SSE2-NEXT: xorps %xmm6, %xmm6 |
| +; SSE2-NEXT: cvtsi2ss %rax, %xmm6 |
| +; SSE2-NEXT: addss %xmm6, %xmm6 |
| ; SSE2-NEXT: .LBB87_18: |
| ; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] |
| ; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] |
| -; SSE2-NEXT: movdqa 32(%rdi), %xmm4 |
| +; SSE2-NEXT: movq %xmm5, %rax |
| ; SSE2-NEXT: testq %rax, %rax |
| ; SSE2-NEXT: js .LBB87_19 |
| ; SSE2-NEXT: # %bb.20: |
| @@ -4681,8 +4816,8 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { |
| ; SSE2-NEXT: addss %xmm1, %xmm1 |
| ; SSE2-NEXT: .LBB87_21: |
| ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] |
| -; SSE2-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] |
| -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] |
| +; SSE2-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] |
| +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] |
| ; SSE2-NEXT: movq %xmm2, %rax |
| ; SSE2-NEXT: testq %rax, %rax |
| ; SSE2-NEXT: js .LBB87_22 |
| @@ -4886,40 +5021,64 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { |
| ; |
| ; AVX512F-LABEL: uitofp_load_8i64_to_8f32: |
| ; AVX512F: # %bb.0: |
| -; AVX512F-NEXT: vcvtusi2ssq 40(%rdi), %xmm0, %xmm0 |
| -; AVX512F-NEXT: vcvtusi2ssq 32(%rdi), %xmm1, %xmm1 |
| -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] |
| -; AVX512F-NEXT: vcvtusi2ssq 48(%rdi), %xmm2, %xmm1 |
| -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] |
| -; AVX512F-NEXT: vcvtusi2ssq 56(%rdi), %xmm2, %xmm1 |
| +; AVX512F-NEXT: vmovaps (%rdi), %xmm0 |
| +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 |
| +; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2 |
| +; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3 |
| +; AVX512F-NEXT: vpextrq $1, %xmm2, %rax |
| +; AVX512F-NEXT: vcvtusi2ss %rax, %xmm4, %xmm4 |
| +; AVX512F-NEXT: vmovq %xmm2, %rax |
| +; AVX512F-NEXT: vcvtusi2ss %rax, %xmm5, %xmm2 |
| +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3] |
| +; AVX512F-NEXT: vmovq %xmm3, %rax |
| +; AVX512F-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4 |
| +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] |
| +; AVX512F-NEXT: vpextrq $1, %xmm3, %rax |
| +; AVX512F-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 |
| +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] |
| +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax |
| +; AVX512F-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 |
| +; AVX512F-NEXT: vmovq %xmm0, %rax |
| +; AVX512F-NEXT: vcvtusi2ss %rax, %xmm5, %xmm0 |
| +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] |
| +; AVX512F-NEXT: vmovq %xmm1, %rax |
| +; AVX512F-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 |
| +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] |
| +; AVX512F-NEXT: vpextrq $1, %xmm1, %rax |
| +; AVX512F-NEXT: vcvtusi2ss %rax, %xmm5, %xmm1 |
| ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] |
| -; AVX512F-NEXT: vcvtusi2ssq 8(%rdi), %xmm2, %xmm1 |
| -; AVX512F-NEXT: vcvtusi2ssq (%rdi), %xmm2, %xmm2 |
| -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] |
| -; AVX512F-NEXT: vcvtusi2ssq 16(%rdi), %xmm3, %xmm2 |
| -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] |
| -; AVX512F-NEXT: vcvtusi2ssq 24(%rdi), %xmm3, %xmm2 |
| -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] |
| -; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 |
| +; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 |
| ; AVX512F-NEXT: retq |
| ; |
| ; AVX512VL-LABEL: uitofp_load_8i64_to_8f32: |
| ; AVX512VL: # %bb.0: |
| -; AVX512VL-NEXT: vcvtusi2ssq 40(%rdi), %xmm0, %xmm0 |
| -; AVX512VL-NEXT: vcvtusi2ssq 32(%rdi), %xmm1, %xmm1 |
| -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] |
| -; AVX512VL-NEXT: vcvtusi2ssq 48(%rdi), %xmm2, %xmm1 |
| -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] |
| -; AVX512VL-NEXT: vcvtusi2ssq 56(%rdi), %xmm2, %xmm1 |
| +; AVX512VL-NEXT: vmovaps (%rdi), %xmm0 |
| +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 |
| +; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2 |
| +; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3 |
| +; AVX512VL-NEXT: vpextrq $1, %xmm2, %rax |
| +; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm4, %xmm4 |
| +; AVX512VL-NEXT: vmovq %xmm2, %rax |
| +; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm5, %xmm2 |
| +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3] |
| +; AVX512VL-NEXT: vmovq %xmm3, %rax |
| +; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4 |
| +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] |
| +; AVX512VL-NEXT: vpextrq $1, %xmm3, %rax |
| +; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 |
| +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] |
| +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax |
| +; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 |
| +; AVX512VL-NEXT: vmovq %xmm0, %rax |
| +; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm5, %xmm0 |
| +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] |
| +; AVX512VL-NEXT: vmovq %xmm1, %rax |
| +; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 |
| +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] |
| +; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax |
| +; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm5, %xmm1 |
| ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] |
| -; AVX512VL-NEXT: vcvtusi2ssq 8(%rdi), %xmm2, %xmm1 |
| -; AVX512VL-NEXT: vcvtusi2ssq (%rdi), %xmm2, %xmm2 |
| -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] |
| -; AVX512VL-NEXT: vcvtusi2ssq 16(%rdi), %xmm3, %xmm2 |
| -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] |
| -; AVX512VL-NEXT: vcvtusi2ssq 24(%rdi), %xmm3, %xmm2 |
| -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] |
| -; AVX512VL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 |
| +; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 |
| ; AVX512VL-NEXT: retq |
| ; |
| ; AVX512DQ-LABEL: uitofp_load_8i64_to_8f32: |