| From 8b69549dc5c5fa0f5f8632cde1c740bb2c7d8957 Mon Sep 17 00:00:00 2001 |
| From: Phoebe Wang <phoebe.wang@intel.com> |
| Date: Sun, 14 Aug 2022 09:03:09 +0800 |
| Subject: [PATCH] [X86][FP16] Promote FP16->[U]INT to FP16->FP32->[U]INT |
| |
| This is to avoid f16->i64 being lowered to `__fixhfdi`/`__fixunshfdi` on 32-bit targets, since neither libgcc nor compiler-rt provides them. https://godbolt.org/z/cjWEsea5v |
| |
| It also helps to improve the performance by promoting the vector type. |
| |
| Reviewed By: LuoYuanke |
| |
| Differential Revision: https://reviews.llvm.org/D131828 |
| --- |
| llvm/lib/Target/X86/X86ISelLowering.cpp | 23 +- |
| llvm/test/CodeGen/X86/fpclamptosat_vec.ll | 348 ++++++++---------- |
| llvm/test/CodeGen/X86/half.ll | 46 ++- |
| .../CodeGen/X86/vector-half-conversions.ll | 55 ++- |
| 4 files changed, 246 insertions(+), 226 deletions(-) |
| |
| diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp |
| index d5d41ca50553..3ff51374c114 100644 |
| --- a/llvm/lib/Target/X86/X86ISelLowering.cpp |
| +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp |
| @@ -32734,8 +32734,29 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, |
| N->getOpcode() == ISD::STRICT_FP_TO_SINT; |
| EVT VT = N->getValueType(0); |
| SDValue Src = N->getOperand(IsStrict ? 1 : 0); |
| + SDValue Chain = IsStrict ? N->getOperand(0) : SDValue(); |
| EVT SrcVT = Src.getValueType(); |
| |
| + SDValue Res; |
| + if (isSoftFP16(SrcVT)) { |
| + EVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32; |
| + if (IsStrict) { |
| + Res = |
| + DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other}, |
| + {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl, |
| + {NVT, MVT::Other}, {Chain, Src})}); |
| + Chain = Res.getValue(1); |
| + } else { |
| + Res = DAG.getNode(N->getOpcode(), dl, VT, |
| + DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src)); |
| + } |
| + Results.push_back(Res); |
| + if (IsStrict) |
| + Results.push_back(Chain); |
| + |
| + return; |
| + } |
| + |
| if (VT.isVector() && Subtarget.hasFP16() && |
| SrcVT.getVectorElementType() == MVT::f16) { |
| EVT EleVT = VT.getVectorElementType(); |
| @@ -32749,7 +32770,6 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, |
| Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops); |
| } |
| |
| - SDValue Res, Chain; |
| if (IsStrict) { |
| unsigned Opc = |
| IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI; |
| @@ -32941,7 +32961,6 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, |
| return; |
| } |
| |
| - SDValue Chain; |
| if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) { |
| Results.push_back(V); |
| if (IsStrict) |
| diff --git a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll |
| index c22656dc2a16..c6883afe07ed 100644 |
| --- a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll |
| +++ b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll |
| @@ -863,59 +863,47 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) { |
| ; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill |
| ; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero |
| ; CHECK-NEXT: callq __extendhfsf2@PLT |
| -; CHECK-NEXT: cvttss2si %xmm0, %eax |
| -; CHECK-NEXT: movd %eax, %xmm0 |
| ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload |
| ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] |
| ; CHECK-NEXT: callq __extendhfsf2@PLT |
| -; CHECK-NEXT: cvttss2si %xmm0, %eax |
| -; CHECK-NEXT: movd %eax, %xmm0 |
| -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] |
| -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| +; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 |
| +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload |
| ; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero |
| ; CHECK-NEXT: callq __extendhfsf2@PLT |
| -; CHECK-NEXT: cvttss2si %xmm0, %eax |
| -; CHECK-NEXT: movd %eax, %xmm0 |
| ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload |
| ; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] |
| ; CHECK-NEXT: callq __extendhfsf2@PLT |
| -; CHECK-NEXT: cvttss2si %xmm0, %eax |
| -; CHECK-NEXT: movd %eax, %xmm0 |
| -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] |
| -; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| +; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 |
| +; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] |
| -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload |
| ; CHECK-NEXT: psrlq $48, %xmm0 |
| ; CHECK-NEXT: callq __extendhfsf2@PLT |
| -; CHECK-NEXT: cvttss2si %xmm0, %eax |
| -; CHECK-NEXT: movd %eax, %xmm0 |
| ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload |
| ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] |
| ; CHECK-NEXT: callq __extendhfsf2@PLT |
| -; CHECK-NEXT: cvttss2si %xmm0, %eax |
| -; CHECK-NEXT: movd %eax, %xmm0 |
| -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] |
| -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| +; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 |
| +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload |
| ; CHECK-NEXT: callq __extendhfsf2@PLT |
| -; CHECK-NEXT: cvttss2si %xmm0, %eax |
| -; CHECK-NEXT: movd %eax, %xmm0 |
| -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload |
| ; CHECK-NEXT: psrld $16, %xmm0 |
| ; CHECK-NEXT: callq __extendhfsf2@PLT |
| -; CHECK-NEXT: cvttss2si %xmm0, %eax |
| -; CHECK-NEXT: movd %eax, %xmm1 |
| -; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload |
| -; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] |
| +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload |
| +; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] |
| +; CHECK-NEXT: cvttps2dq %xmm1, %xmm0 |
| ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] |
| ; CHECK-NEXT: packssdw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| @@ -940,82 +928,94 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) { |
| ; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill |
| ; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero |
| ; CHECK-NEXT: callq __extendhfsf2@PLT |
| -; CHECK-NEXT: cvttss2si %xmm0, %rax |
| -; CHECK-NEXT: movd %eax, %xmm0 |
| ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload |
| ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] |
| ; CHECK-NEXT: callq __extendhfsf2@PLT |
| -; CHECK-NEXT: cvttss2si %xmm0, %rax |
| -; CHECK-NEXT: movd %eax, %xmm0 |
| -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] |
| +; CHECK-NEXT: cvttps2dq %xmm0, %xmm1 |
| +; CHECK-NEXT: movdqa %xmm1, %xmm2 |
| +; CHECK-NEXT: psrad $31, %xmm2 |
| +; CHECK-NEXT: subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 |
| +; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 |
| +; CHECK-NEXT: pand %xmm2, %xmm0 |
| +; CHECK-NEXT: por %xmm1, %xmm0 |
| ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload |
| ; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero |
| ; CHECK-NEXT: callq __extendhfsf2@PLT |
| -; CHECK-NEXT: cvttss2si %xmm0, %rax |
| -; CHECK-NEXT: movd %eax, %xmm0 |
| ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload |
| ; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] |
| ; CHECK-NEXT: callq __extendhfsf2@PLT |
| -; CHECK-NEXT: cvttss2si %xmm0, %rax |
| -; CHECK-NEXT: movd %eax, %xmm0 |
| -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] |
| +; CHECK-NEXT: cvttps2dq %xmm0, %xmm1 |
| +; CHECK-NEXT: movdqa %xmm1, %xmm2 |
| +; CHECK-NEXT: psrad $31, %xmm2 |
| +; CHECK-NEXT: subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 |
| +; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 |
| +; CHECK-NEXT: pand %xmm2, %xmm0 |
| +; CHECK-NEXT: por %xmm1, %xmm0 |
| ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] |
| ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload |
| ; CHECK-NEXT: psrlq $48, %xmm0 |
| ; CHECK-NEXT: callq __extendhfsf2@PLT |
| -; CHECK-NEXT: cvttss2si %xmm0, %rax |
| -; CHECK-NEXT: movd %eax, %xmm0 |
| ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload |
| ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] |
| ; CHECK-NEXT: callq __extendhfsf2@PLT |
| -; CHECK-NEXT: cvttss2si %xmm0, %rax |
| -; CHECK-NEXT: movd %eax, %xmm0 |
| -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] |
| +; CHECK-NEXT: cvttps2dq %xmm0, %xmm1 |
| +; CHECK-NEXT: movdqa %xmm1, %xmm2 |
| +; CHECK-NEXT: psrad $31, %xmm2 |
| +; CHECK-NEXT: subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 |
| +; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 |
| +; CHECK-NEXT: pand %xmm2, %xmm0 |
| +; CHECK-NEXT: por %xmm1, %xmm0 |
| ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload |
| ; CHECK-NEXT: callq __extendhfsf2@PLT |
| -; CHECK-NEXT: cvttss2si %xmm0, %rax |
| -; CHECK-NEXT: movd %eax, %xmm0 |
| -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload |
| ; CHECK-NEXT: psrld $16, %xmm0 |
| ; CHECK-NEXT: callq __extendhfsf2@PLT |
| -; CHECK-NEXT: cvttss2si %xmm0, %rax |
| -; CHECK-NEXT: movd %eax, %xmm0 |
| -; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload |
| -; CHECK-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] |
| -; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload |
| -; CHECK-NEXT: # xmm4 = xmm4[0],mem[0] |
| +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload |
| +; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] |
| +; CHECK-NEXT: cvttps2dq %xmm2, %xmm0 |
| +; CHECK-NEXT: movdqa %xmm0, %xmm1 |
| +; CHECK-NEXT: psrad $31, %xmm1 |
| +; CHECK-NEXT: subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 |
| +; CHECK-NEXT: cvttps2dq %xmm2, %xmm2 |
| +; CHECK-NEXT: pand %xmm1, %xmm2 |
| +; CHECK-NEXT: por %xmm0, %xmm2 |
| +; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload |
| +; CHECK-NEXT: # xmm2 = xmm2[0],mem[0] |
| ; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] |
| -; CHECK-NEXT: movdqa %xmm4, %xmm2 |
| -; CHECK-NEXT: pxor %xmm1, %xmm2 |
| -; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [2147549183,2147549183,2147549183,2147549183] |
| -; CHECK-NEXT: movdqa %xmm3, %xmm0 |
| -; CHECK-NEXT: pcmpgtd %xmm2, %xmm0 |
| -; CHECK-NEXT: pand %xmm0, %xmm4 |
| -; CHECK-NEXT: pcmpeqd %xmm2, %xmm2 |
| -; CHECK-NEXT: pxor %xmm2, %xmm0 |
| -; CHECK-NEXT: por %xmm4, %xmm0 |
| -; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload |
| -; CHECK-NEXT: pxor %xmm4, %xmm1 |
| -; CHECK-NEXT: pcmpgtd %xmm1, %xmm3 |
| -; CHECK-NEXT: pand %xmm3, %xmm4 |
| -; CHECK-NEXT: pxor %xmm2, %xmm3 |
| -; CHECK-NEXT: por %xmm4, %xmm3 |
| -; CHECK-NEXT: pslld $16, %xmm3 |
| -; CHECK-NEXT: psrad $16, %xmm3 |
| +; CHECK-NEXT: movdqa %xmm2, %xmm3 |
| +; CHECK-NEXT: pxor %xmm1, %xmm3 |
| +; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183] |
| +; CHECK-NEXT: movdqa %xmm4, %xmm0 |
| +; CHECK-NEXT: pcmpgtd %xmm3, %xmm0 |
| +; CHECK-NEXT: pand %xmm0, %xmm2 |
| +; CHECK-NEXT: pcmpeqd %xmm3, %xmm3 |
| +; CHECK-NEXT: pxor %xmm3, %xmm0 |
| +; CHECK-NEXT: por %xmm2, %xmm0 |
| +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload |
| +; CHECK-NEXT: pxor %xmm2, %xmm1 |
| +; CHECK-NEXT: pcmpgtd %xmm1, %xmm4 |
| +; CHECK-NEXT: pand %xmm4, %xmm2 |
| +; CHECK-NEXT: pxor %xmm3, %xmm4 |
| +; CHECK-NEXT: por %xmm2, %xmm4 |
| +; CHECK-NEXT: pslld $16, %xmm4 |
| +; CHECK-NEXT: psrad $16, %xmm4 |
| ; CHECK-NEXT: pslld $16, %xmm0 |
| ; CHECK-NEXT: psrad $16, %xmm0 |
| -; CHECK-NEXT: packssdw %xmm3, %xmm0 |
| +; CHECK-NEXT: packssdw %xmm4, %xmm0 |
| ; CHECK-NEXT: addq $72, %rsp |
| ; CHECK-NEXT: .cfi_def_cfa_offset 8 |
| ; CHECK-NEXT: retq |
| @@ -1035,59 +1035,47 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) { |
| ; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill |
| ; CHECK-NEXT: psrlq $48, %xmm0 |
| ; CHECK-NEXT: callq __extendhfsf2@PLT |
| -; CHECK-NEXT: cvttss2si %xmm0, %eax |
| -; CHECK-NEXT: movd %eax, %xmm0 |
| ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload |
| ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] |
| ; CHECK-NEXT: callq __extendhfsf2@PLT |
| -; CHECK-NEXT: cvttss2si %xmm0, %eax |
| -; CHECK-NEXT: movd %eax, %xmm0 |
| -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] |
| -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| +; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 |
| +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload |
| ; CHECK-NEXT: callq __extendhfsf2@PLT |
| -; CHECK-NEXT: cvttss2si %xmm0, %eax |
| -; CHECK-NEXT: movd %eax, %xmm0 |
| -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload |
| ; CHECK-NEXT: psrld $16, %xmm0 |
| ; CHECK-NEXT: callq __extendhfsf2@PLT |
| -; CHECK-NEXT: cvttss2si %xmm0, %eax |
| -; CHECK-NEXT: movd %eax, %xmm0 |
| ; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload |
| ; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] |
| -; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload |
| -; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] |
| -; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| +; CHECK-NEXT: cvttps2dq %xmm1, %xmm0 |
| +; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] |
| +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload |
| ; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero |
| ; CHECK-NEXT: callq __extendhfsf2@PLT |
| -; CHECK-NEXT: cvttss2si %xmm0, %eax |
| -; CHECK-NEXT: movd %eax, %xmm0 |
| ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload |
| ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] |
| ; CHECK-NEXT: callq __extendhfsf2@PLT |
| -; CHECK-NEXT: cvttss2si %xmm0, %eax |
| -; CHECK-NEXT: movd %eax, %xmm0 |
| -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] |
| -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| +; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 |
| +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload |
| ; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero |
| ; CHECK-NEXT: callq __extendhfsf2@PLT |
| -; CHECK-NEXT: cvttss2si %xmm0, %eax |
| -; CHECK-NEXT: movd %eax, %xmm0 |
| ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload |
| ; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] |
| ; CHECK-NEXT: callq __extendhfsf2@PLT |
| -; CHECK-NEXT: cvttss2si %xmm0, %eax |
| -; CHECK-NEXT: movd %eax, %xmm0 |
| -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] |
| +; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 |
| ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] |
| ; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535] |
| @@ -2437,59 +2425,47 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) { |
| ; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill |
| ; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero |
| ; CHECK-NEXT: callq __extendhfsf2@PLT |
| -; CHECK-NEXT: cvttss2si %xmm0, %eax |
| -; CHECK-NEXT: movd %eax, %xmm0 |
| ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload |
| ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] |
| ; CHECK-NEXT: callq __extendhfsf2@PLT |
| -; CHECK-NEXT: cvttss2si %xmm0, %eax |
| -; CHECK-NEXT: movd %eax, %xmm0 |
| -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] |
| -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| +; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 |
| +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload |
| ; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero |
| ; CHECK-NEXT: callq __extendhfsf2@PLT |
| -; CHECK-NEXT: cvttss2si %xmm0, %eax |
| -; CHECK-NEXT: movd %eax, %xmm0 |
| ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload |
| ; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] |
| ; CHECK-NEXT: callq __extendhfsf2@PLT |
| -; CHECK-NEXT: cvttss2si %xmm0, %eax |
| -; CHECK-NEXT: movd %eax, %xmm0 |
| -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] |
| -; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| +; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 |
| +; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] |
| -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload |
| ; CHECK-NEXT: psrlq $48, %xmm0 |
| ; CHECK-NEXT: callq __extendhfsf2@PLT |
| -; CHECK-NEXT: cvttss2si %xmm0, %eax |
| -; CHECK-NEXT: movd %eax, %xmm0 |
| ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload |
| ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] |
| ; CHECK-NEXT: callq __extendhfsf2@PLT |
| -; CHECK-NEXT: cvttss2si %xmm0, %eax |
| -; CHECK-NEXT: movd %eax, %xmm0 |
| -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] |
| -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| +; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 |
| +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload |
| ; CHECK-NEXT: callq __extendhfsf2@PLT |
| -; CHECK-NEXT: cvttss2si %xmm0, %eax |
| -; CHECK-NEXT: movd %eax, %xmm0 |
| -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload |
| ; CHECK-NEXT: psrld $16, %xmm0 |
| ; CHECK-NEXT: callq __extendhfsf2@PLT |
| -; CHECK-NEXT: cvttss2si %xmm0, %eax |
| -; CHECK-NEXT: movd %eax, %xmm1 |
| -; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload |
| -; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] |
| +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload |
| +; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] |
| +; CHECK-NEXT: cvttps2dq %xmm1, %xmm0 |
| ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] |
| ; CHECK-NEXT: packssdw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| @@ -2512,82 +2488,94 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) { |
| ; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill |
| ; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero |
| ; CHECK-NEXT: callq __extendhfsf2@PLT |
| -; CHECK-NEXT: cvttss2si %xmm0, %rax |
| -; CHECK-NEXT: movd %eax, %xmm0 |
| ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload |
| ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] |
| ; CHECK-NEXT: callq __extendhfsf2@PLT |
| -; CHECK-NEXT: cvttss2si %xmm0, %rax |
| -; CHECK-NEXT: movd %eax, %xmm0 |
| -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] |
| +; CHECK-NEXT: cvttps2dq %xmm0, %xmm1 |
| +; CHECK-NEXT: movdqa %xmm1, %xmm2 |
| +; CHECK-NEXT: psrad $31, %xmm2 |
| +; CHECK-NEXT: subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 |
| +; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 |
| +; CHECK-NEXT: pand %xmm2, %xmm0 |
| +; CHECK-NEXT: por %xmm1, %xmm0 |
| ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload |
| ; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero |
| ; CHECK-NEXT: callq __extendhfsf2@PLT |
| -; CHECK-NEXT: cvttss2si %xmm0, %rax |
| -; CHECK-NEXT: movd %eax, %xmm0 |
| ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload |
| ; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] |
| ; CHECK-NEXT: callq __extendhfsf2@PLT |
| -; CHECK-NEXT: cvttss2si %xmm0, %rax |
| -; CHECK-NEXT: movd %eax, %xmm0 |
| -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] |
| +; CHECK-NEXT: cvttps2dq %xmm0, %xmm1 |
| +; CHECK-NEXT: movdqa %xmm1, %xmm2 |
| +; CHECK-NEXT: psrad $31, %xmm2 |
| +; CHECK-NEXT: subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 |
| +; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 |
| +; CHECK-NEXT: pand %xmm2, %xmm0 |
| +; CHECK-NEXT: por %xmm1, %xmm0 |
| ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] |
| ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload |
| ; CHECK-NEXT: psrlq $48, %xmm0 |
| ; CHECK-NEXT: callq __extendhfsf2@PLT |
| -; CHECK-NEXT: cvttss2si %xmm0, %rax |
| -; CHECK-NEXT: movd %eax, %xmm0 |
| ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload |
| ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] |
| ; CHECK-NEXT: callq __extendhfsf2@PLT |
| -; CHECK-NEXT: cvttss2si %xmm0, %rax |
| -; CHECK-NEXT: movd %eax, %xmm0 |
| -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] |
| +; CHECK-NEXT: cvttps2dq %xmm0, %xmm1 |
| +; CHECK-NEXT: movdqa %xmm1, %xmm2 |
| +; CHECK-NEXT: psrad $31, %xmm2 |
| +; CHECK-NEXT: subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 |
| +; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 |
| +; CHECK-NEXT: pand %xmm2, %xmm0 |
| +; CHECK-NEXT: por %xmm1, %xmm0 |
| ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload |
| ; CHECK-NEXT: callq __extendhfsf2@PLT |
| -; CHECK-NEXT: cvttss2si %xmm0, %rax |
| -; CHECK-NEXT: movd %eax, %xmm0 |
| -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload |
| ; CHECK-NEXT: psrld $16, %xmm0 |
| ; CHECK-NEXT: callq __extendhfsf2@PLT |
| -; CHECK-NEXT: cvttss2si %xmm0, %rax |
| -; CHECK-NEXT: movd %eax, %xmm0 |
| -; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload |
| -; CHECK-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] |
| -; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload |
| -; CHECK-NEXT: # xmm4 = xmm4[0],mem[0] |
| +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload |
| +; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] |
| +; CHECK-NEXT: cvttps2dq %xmm2, %xmm0 |
| +; CHECK-NEXT: movdqa %xmm0, %xmm1 |
| +; CHECK-NEXT: psrad $31, %xmm1 |
| +; CHECK-NEXT: subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 |
| +; CHECK-NEXT: cvttps2dq %xmm2, %xmm2 |
| +; CHECK-NEXT: pand %xmm1, %xmm2 |
| +; CHECK-NEXT: por %xmm0, %xmm2 |
| +; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload |
| +; CHECK-NEXT: # xmm2 = xmm2[0],mem[0] |
| ; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] |
| -; CHECK-NEXT: movdqa %xmm4, %xmm2 |
| -; CHECK-NEXT: pxor %xmm1, %xmm2 |
| -; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [2147549183,2147549183,2147549183,2147549183] |
| -; CHECK-NEXT: movdqa %xmm3, %xmm0 |
| -; CHECK-NEXT: pcmpgtd %xmm2, %xmm0 |
| -; CHECK-NEXT: pand %xmm0, %xmm4 |
| -; CHECK-NEXT: pcmpeqd %xmm2, %xmm2 |
| -; CHECK-NEXT: pxor %xmm2, %xmm0 |
| -; CHECK-NEXT: por %xmm4, %xmm0 |
| -; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload |
| -; CHECK-NEXT: pxor %xmm4, %xmm1 |
| -; CHECK-NEXT: pcmpgtd %xmm1, %xmm3 |
| -; CHECK-NEXT: pand %xmm3, %xmm4 |
| -; CHECK-NEXT: pxor %xmm2, %xmm3 |
| -; CHECK-NEXT: por %xmm4, %xmm3 |
| -; CHECK-NEXT: pslld $16, %xmm3 |
| -; CHECK-NEXT: psrad $16, %xmm3 |
| +; CHECK-NEXT: movdqa %xmm2, %xmm3 |
| +; CHECK-NEXT: pxor %xmm1, %xmm3 |
| +; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183] |
| +; CHECK-NEXT: movdqa %xmm4, %xmm0 |
| +; CHECK-NEXT: pcmpgtd %xmm3, %xmm0 |
| +; CHECK-NEXT: pand %xmm0, %xmm2 |
| +; CHECK-NEXT: pcmpeqd %xmm3, %xmm3 |
| +; CHECK-NEXT: pxor %xmm3, %xmm0 |
| +; CHECK-NEXT: por %xmm2, %xmm0 |
| +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload |
| +; CHECK-NEXT: pxor %xmm2, %xmm1 |
| +; CHECK-NEXT: pcmpgtd %xmm1, %xmm4 |
| +; CHECK-NEXT: pand %xmm4, %xmm2 |
| +; CHECK-NEXT: pxor %xmm3, %xmm4 |
| +; CHECK-NEXT: por %xmm2, %xmm4 |
| +; CHECK-NEXT: pslld $16, %xmm4 |
| +; CHECK-NEXT: psrad $16, %xmm4 |
| ; CHECK-NEXT: pslld $16, %xmm0 |
| ; CHECK-NEXT: psrad $16, %xmm0 |
| -; CHECK-NEXT: packssdw %xmm3, %xmm0 |
| +; CHECK-NEXT: packssdw %xmm4, %xmm0 |
| ; CHECK-NEXT: addq $72, %rsp |
| ; CHECK-NEXT: .cfi_def_cfa_offset 8 |
| ; CHECK-NEXT: retq |
| @@ -2606,59 +2594,47 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) { |
| ; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill |
| ; CHECK-NEXT: psrlq $48, %xmm0 |
| ; CHECK-NEXT: callq __extendhfsf2@PLT |
| -; CHECK-NEXT: cvttss2si %xmm0, %eax |
| -; CHECK-NEXT: movd %eax, %xmm0 |
| ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload |
| ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] |
| ; CHECK-NEXT: callq __extendhfsf2@PLT |
| -; CHECK-NEXT: cvttss2si %xmm0, %eax |
| -; CHECK-NEXT: movd %eax, %xmm0 |
| -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] |
| -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| +; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 |
| +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload |
| ; CHECK-NEXT: callq __extendhfsf2@PLT |
| -; CHECK-NEXT: cvttss2si %xmm0, %eax |
| -; CHECK-NEXT: movd %eax, %xmm0 |
| -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload |
| ; CHECK-NEXT: psrld $16, %xmm0 |
| ; CHECK-NEXT: callq __extendhfsf2@PLT |
| -; CHECK-NEXT: cvttss2si %xmm0, %eax |
| -; CHECK-NEXT: movd %eax, %xmm0 |
| ; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload |
| ; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] |
| -; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload |
| -; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] |
| -; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| +; CHECK-NEXT: cvttps2dq %xmm1, %xmm0 |
| +; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] |
| +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload |
| ; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero |
| ; CHECK-NEXT: callq __extendhfsf2@PLT |
| -; CHECK-NEXT: cvttss2si %xmm0, %eax |
| -; CHECK-NEXT: movd %eax, %xmm0 |
| ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload |
| ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] |
| ; CHECK-NEXT: callq __extendhfsf2@PLT |
| -; CHECK-NEXT: cvttss2si %xmm0, %eax |
| -; CHECK-NEXT: movd %eax, %xmm0 |
| -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] |
| -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| +; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 |
| +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload |
| ; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero |
| ; CHECK-NEXT: callq __extendhfsf2@PLT |
| -; CHECK-NEXT: cvttss2si %xmm0, %eax |
| -; CHECK-NEXT: movd %eax, %xmm0 |
| ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload |
| ; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] |
| ; CHECK-NEXT: callq __extendhfsf2@PLT |
| -; CHECK-NEXT: cvttss2si %xmm0, %eax |
| -; CHECK-NEXT: movd %eax, %xmm0 |
| -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] |
| +; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 |
| ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] |
| ; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535] |
| diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll |
| index 7ec40d8b220e..501db6b2dba0 100644 |
| --- a/llvm/test/CodeGen/X86/half.ll |
| +++ b/llvm/test/CodeGen/X86/half.ll |
| @@ -228,13 +228,24 @@ define i64 @test_fptosi_i64(ptr %p) #0 { |
| ; |
| ; CHECK-I686-LABEL: test_fptosi_i64: |
| ; CHECK-I686: # %bb.0: |
| -; CHECK-I686-NEXT: subl $12, %esp |
| +; CHECK-I686-NEXT: subl $28, %esp |
| ; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax |
| ; CHECK-I686-NEXT: pinsrw $0, (%eax), %xmm0 |
| ; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax |
| ; CHECK-I686-NEXT: movw %ax, (%esp) |
| -; CHECK-I686-NEXT: calll __fixhfdi |
| -; CHECK-I686-NEXT: addl $12, %esp |
| +; CHECK-I686-NEXT: calll __extendhfsf2 |
| +; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp) |
| +; CHECK-I686-NEXT: flds {{[0-9]+}}(%esp) |
| +; CHECK-I686-NEXT: fnstcw {{[0-9]+}}(%esp) |
| +; CHECK-I686-NEXT: movzwl {{[0-9]+}}(%esp), %eax |
| +; CHECK-I686-NEXT: orl $3072, %eax # imm = 0xC00 |
| +; CHECK-I686-NEXT: movw %ax, {{[0-9]+}}(%esp) |
| +; CHECK-I686-NEXT: fldcw {{[0-9]+}}(%esp) |
| +; CHECK-I686-NEXT: fistpll {{[0-9]+}}(%esp) |
| +; CHECK-I686-NEXT: fldcw {{[0-9]+}}(%esp) |
| +; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax |
| +; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %edx |
| +; CHECK-I686-NEXT: addl $28, %esp |
| ; CHECK-I686-NEXT: retl |
| %a = load half, ptr %p, align 2 |
| %r = fptosi half %a to i64 |
| @@ -315,13 +326,36 @@ define i64 @test_fptoui_i64(ptr %p) #0 { |
| ; |
| ; CHECK-I686-LABEL: test_fptoui_i64: |
| ; CHECK-I686: # %bb.0: |
| -; CHECK-I686-NEXT: subl $12, %esp |
| +; CHECK-I686-NEXT: subl $28, %esp |
| ; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax |
| ; CHECK-I686-NEXT: pinsrw $0, (%eax), %xmm0 |
| ; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax |
| ; CHECK-I686-NEXT: movw %ax, (%esp) |
| -; CHECK-I686-NEXT: calll __fixunshfdi |
| -; CHECK-I686-NEXT: addl $12, %esp |
| +; CHECK-I686-NEXT: calll __extendhfsf2 |
| +; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp) |
| +; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero |
| +; CHECK-I686-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero |
| +; CHECK-I686-NEXT: ucomiss %xmm1, %xmm0 |
| +; CHECK-I686-NEXT: jae .LBB9_2 |
| +; CHECK-I686-NEXT: # %bb.1: |
| +; CHECK-I686-NEXT: xorps %xmm1, %xmm1 |
| +; CHECK-I686-NEXT: .LBB9_2: |
| +; CHECK-I686-NEXT: subss %xmm1, %xmm0 |
| +; CHECK-I686-NEXT: movss %xmm0, {{[0-9]+}}(%esp) |
| +; CHECK-I686-NEXT: setae %al |
| +; CHECK-I686-NEXT: flds {{[0-9]+}}(%esp) |
| +; CHECK-I686-NEXT: fnstcw {{[0-9]+}}(%esp) |
| +; CHECK-I686-NEXT: movzwl {{[0-9]+}}(%esp), %ecx |
| +; CHECK-I686-NEXT: orl $3072, %ecx # imm = 0xC00 |
| +; CHECK-I686-NEXT: movw %cx, {{[0-9]+}}(%esp) |
| +; CHECK-I686-NEXT: fldcw {{[0-9]+}}(%esp) |
| +; CHECK-I686-NEXT: fistpll {{[0-9]+}}(%esp) |
| +; CHECK-I686-NEXT: fldcw {{[0-9]+}}(%esp) |
| +; CHECK-I686-NEXT: movzbl %al, %edx |
| +; CHECK-I686-NEXT: shll $31, %edx |
| +; CHECK-I686-NEXT: xorl {{[0-9]+}}(%esp), %edx |
| +; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax |
| +; CHECK-I686-NEXT: addl $28, %esp |
| ; CHECK-I686-NEXT: retl |
| %a = load half, ptr %p, align 2 |
| %r = fptoui half %a to i64 |
| diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll |
| index ac14b415f35b..ec9bb73bd2d4 100644 |
| --- a/llvm/test/CodeGen/X86/vector-half-conversions.ll |
| +++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll |
| @@ -1271,27 +1271,3 @@ define void @store_cvt_32f32_to_32f16(<32 x float> %a0, ptr %a1) nounwind { |
| store <32 x half> %1, ptr %a1 |
| ret void |
| } |
| - |
| -define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) nounwind { |
| -; ALL-LABEL: fptosi_2f16_to_4i32: |
| -; ALL: # %bb.0: |
| -; ALL-NEXT: vpsrld $16, %xmm0, %xmm1 |
| -; ALL-NEXT: vpextrw $0, %xmm1, %eax |
| -; ALL-NEXT: movzwl %ax, %eax |
| -; ALL-NEXT: vmovd %eax, %xmm1 |
| -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 |
| -; ALL-NEXT: vcvttss2si %xmm1, %eax |
| -; ALL-NEXT: vpextrw $0, %xmm0, %ecx |
| -; ALL-NEXT: movzwl %cx, %ecx |
| -; ALL-NEXT: vmovd %ecx, %xmm0 |
| -; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 |
| -; ALL-NEXT: vcvttss2si %xmm0, %ecx |
| -; ALL-NEXT: vmovd %ecx, %xmm0 |
| -; ALL-NEXT: vmovd %eax, %xmm1 |
| -; ALL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] |
| -; ALL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero |
| -; ALL-NEXT: retq |
| - %cvt = fptosi <2 x half> %a to <2 x i32> |
| - %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3> |
| - ret <4 x i32> %ext |
| -} |
| -- |
| 2.37.1.595.g718a3a8f04-goog |
| |