commit 6601be441974cb1b91c47ee20bf9a284076cf6dd
Author: Eli Friedman <efriedma@quicinc.com>
Date: Sat Jul 17 11:11:41 2021 -0700
[X86] Remove incorrect use of known bits in shuffle simplification.
This reverts commit 2a419a0b9957ebac9e11e4b43bc9fbe42a9207df.
Each element of a shufflevector result must not propagate poison from any
input element other than the one selected by the corresponding shuffle mask
entry.
The regressions outside of fptoui-may-overflow.ll can probably be
recovered some other way; for example, using isGuaranteedNotToBePoison.
See discussion on https://reviews.llvm.org/D106053 for more background.
Differential Revision: https://reviews.llvm.org/D106222
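
The unsoundness is easy to state concretely: computeKnownBits may prove a
lane zero by reasoning that is only valid when the lane is not poison, while
a shuffle mask entry that selects the zero operand must yield a well-defined
0 even when the corresponding input lane is poison. The following minimal,
self-contained C++ model illustrates the distinction; Lane, Vec4,
blendWithZero, and incorrectSimplify are names invented for this sketch, not
LLVM APIs.

  // Lanes are modeled as std::optional<int>; std::nullopt stands for poison.
  #include <array>
  #include <cstdio>
  #include <optional>

  using Lane = std::optional<int>; // nullopt == poison
  using Vec4 = std::array<Lane, 4>;

  // shufflevector semantics: a mask entry of -1 selects the constant-zero
  // operand; any other entry selects that lane of V. A zero-mask lane is a
  // well-defined 0 even if the corresponding lane of V is poison.
  Vec4 blendWithZero(const Vec4 &V, const std::array<int, 4> &Mask) {
    Vec4 R;
    for (int i = 0; i < 4; ++i)
      R[i] = (Mask[i] < 0) ? Lane(0) : V[Mask[i]];
    return R;
  }

  // The reverted combine in effect did this: "known bits say the zero'd
  // lanes of V are already zero, so return V itself." Known bits describe
  // the non-poison values a lane can take; a poison lane can violate them,
  // so the rewrite drops the poison barrier the shuffle mask provided.
  Vec4 incorrectSimplify(const Vec4 &V) { return V; }

  int main() {
    // Lane 1 is poison (think: an fptoui that overflowed). An analysis may
    // still report its bits as all-zero, since poison justifies anything.
    Vec4 V = {Lane(7), std::nullopt, Lane(9), Lane(3)};
    std::array<int, 4> Mask = {0, -1, 2, 3}; // lane 1 must become a real 0

    Vec4 Good = blendWithZero(V, Mask); // lane 1 == 0, never poison
    Vec4 Bad = incorrectSimplify(V);    // lane 1 stays poison: a miscompile

    std::printf("blend lane1 poison? %d\n", !Good[1].has_value());     // 0
    std::printf("simplified lane1 poison? %d\n", !Bad[1].has_value()); // 1
  }

In DAG terms, blendWithZero is the shuffle the combine rewrote, and
incorrectSimplify is the rewrite: returning V1 directly reintroduces poison
into lanes the mask had zeroed.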
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 8cbdf7e02385..4b10d6ea9ff3 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1711,11 +1711,6 @@ public:
bool MaskedValueIsZero(SDValue Op, const APInt &Mask,
const APInt &DemandedElts, unsigned Depth = 0) const;
- /// Return true if the DemandedElts of the vector Op are all zero. We
- /// use this predicate to simplify operations downstream.
- bool MaskedElementsAreZero(SDValue Op, const APInt &DemandedElts,
- unsigned Depth = 0) const;
-
/// Return true if '(Op & Mask) == Mask'.
/// Op and Mask are known to be the same type.
bool MaskedValueIsAllOnes(SDValue Op, const APInt &Mask,
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 8886a2d5f54c..e058c40ce6ff 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2446,15 +2446,6 @@ bool SelectionDAG::MaskedValueIsZero(SDValue V, const APInt &Mask,
return Mask.isSubsetOf(computeKnownBits(V, DemandedElts, Depth).Zero);
}
-/// Return true if the DemandedElts of the vector Op are all zero. We
-/// use this predicate to simplify operations downstream.
-bool SelectionDAG::MaskedElementsAreZero(SDValue Op, const APInt &DemandedElts,
- unsigned Depth) const {
- unsigned BitWidth = Op.getScalarValueSizeInBits();
- APInt DemandedBits = APInt::getAllOnesValue(BitWidth);
- return MaskedValueIsZero(Op, DemandedBits, DemandedElts, Depth);
-}
-
/// MaskedValueIsAllOnes - Return true if '(Op & Mask) == Mask'.
bool SelectionDAG::MaskedValueIsAllOnes(SDValue V, const APInt &Mask,
unsigned Depth) const {
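
For reference, the removed predicate was a straightforward statement about
known bits: every bit of every demanded element is known to be zero. A rough
model of that bookkeeping, with plain integer masks standing in for APInt
(the names are invented for this sketch):

  #include <cstddef>
  #include <cstdint>
  #include <vector>

  // KnownZeroBits[i] has a 1 for every bit of element i known to be zero.
  // The predicate asks: are all bits of all demanded elements known zero?
  bool maskedElementsAreZero(const std::vector<uint16_t> &KnownZeroBits,
                             uint64_t DemandedElts) {
    for (std::size_t i = 0, e = KnownZeroBits.size(); i != e; ++i)
      if ((DemandedElts >> i) & 1)
        if (KnownZeroBits[i] != 0xFFFF) // some bit not known zero
          return false;
    return true;
  }

The predicate is sound as an analysis result; the bug lay in treating "known
zero" as "defined zero" at the shuffle-combine call site removed below.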
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 7b4de8047264..5c64114cc2b3 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -35974,15 +35974,6 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
}
}
- // See if this is a blend with zero - in which case check if the zero'd
- // elements are already zero.
- if (isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0)) {
- assert(!KnownZero.isNullValue() && "Shuffle has no zero elements");
- SDValue NewV1 = CanonicalizeShuffleInput(MaskVT, V1);
- if (DAG.MaskedElementsAreZero(NewV1, KnownZero))
- return DAG.getBitcast(RootVT, NewV1);
- }
-
SDValue NewV1 = V1; // Save operand in case early exit happens.
if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
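
If this combine is ever recovered along the lines the commit message
suggests, it needs a freedom-from-poison guard on top of the known-bits
check. The sketch below is hypothetical: SelectionDAG had no
isGuaranteedNotToBePoison at the time of this commit, so the DAG-level guard
is an assumed helper, not an existing API.

  // Hypothetical: only drop the blend-with-zero if the zero'd elements are
  // both known zero *and* guaranteed not to be poison.
  if (isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0)) {
    SDValue NewV1 = CanonicalizeShuffleInput(MaskVT, V1);
    if (DAG.MaskedElementsAreZero(NewV1, KnownZero) &&
        DAG.isGuaranteedNotToBePoison(NewV1)) // assumed DAG-level helper
      return DAG.getBitcast(RootVT, NewV1);
  }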
diff --git a/llvm/test/CodeGen/X86/fptoui-may-overflow.ll b/llvm/test/CodeGen/X86/fptoui-may-overflow.ll
new file mode 100644
index 000000000000..37bdfaadaf57
--- /dev/null
+++ b/llvm/test/CodeGen/X86/fptoui-may-overflow.ll
@@ -0,0 +1,36 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s
+
+; It is legal to optimize @fptoui_zext to a single vcvttps2dq: if one of the
+; i8 results of the fptoui is poison, the corresponding i32 result of the
+; zext is also poison. We currently don't implement this optimization.
+
+define <16 x i8> @fptoui_zext(<4 x float> %arg) {
+; CHECK-LABEL: fptoui_zext:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0
+; CHECK-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; CHECK-NEXT: retq
+ %f = fptoui <4 x float> %arg to <4 x i8>
+ %z = zext <4 x i8> %f to <4 x i32>
+ %b = bitcast <4 x i32> %z to <16 x i8>
+ ret <16 x i8> %b
+}
+
+; In @fptoui_shuffle, we must preserve the vpand for correctness. Only the
+; i8 values extracted from %s may be poison; the values taken from the
+; zeroinitializer are not.
+
+define <16 x i8> @fptoui_shuffle(<4 x float> %arg) {
+; CHECK-LABEL: fptoui_shuffle:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0
+; CHECK-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %f = fptoui <4 x float> %arg to <4 x i8>
+ %s = shufflevector <4 x i8> %f, <4 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %ss = shufflevector <16 x i8> %s, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 1, i32 21, i32 22, i32 23, i32 2, i32 25, i32 26, i32 27, i32 3, i32 29, i32 30, i32 31>
+ ret <16 x i8> %ss
+}
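
The two tests differ in exactly one respect, which a small model over
optional lanes makes explicit (nullopt stands for poison; every name below
is invented for this sketch, not LLVM code): zext propagates poison
lane-for-lane, so folding @fptoui_zext down to the bare vcvttps2dq would be
sound, while the second shuffle in @fptoui_shuffle routes twelve of the
sixteen result lanes through zeroinitializer, and those lanes must remain
well-defined zeros.

  #include <array>
  #include <cstdio>
  #include <optional>

  using Lane = std::optional<int>; // nullopt == poison

  // zext <4 x i8> -> <4 x i32>: each output lane depends on exactly one
  // input lane, so the output is poison exactly where the input is.
  std::array<Lane, 4> zextLanes(const std::array<Lane, 4> &In) {
    std::array<Lane, 4> Out;
    for (int i = 0; i < 4; ++i)
      Out[i] = In[i] ? Lane(*In[i] & 0xff) : std::nullopt;
    return Out;
  }

  // The second shuffle in @fptoui_shuffle: result lanes 0, 4, 8, 12 come
  // from %s; every other lane comes from zeroinitializer and must be a
  // defined 0. Dropping the vpand would leak poison into those lanes.
  std::array<Lane, 16> shuffleWithZero(const std::array<Lane, 16> &S) {
    std::array<Lane, 16> Out;
    for (int i = 0; i < 16; ++i)
      Out[i] = (i % 4 == 0) ? S[i / 4] : Lane(0);
    return Out;
  }

  int main() {
    // Lane 1 of the fptoui result is poison (the conversion overflowed).
    std::array<Lane, 4> F = {Lane(1), std::nullopt, Lane(3), Lane(4)};
    std::array<Lane, 4> Z = zextLanes(F); // poison only at lane 1: foldable

    std::array<Lane, 16> S{}; // %s: lanes 0..3 from F; the rest are unused
    for (int i = 0; i < 4; ++i)
      S[i] = F[i];
    std::array<Lane, 16> R = shuffleWithZero(S);

    std::printf("zext lane1 poison? %d\n", !Z[1].has_value());     // 1
    std::printf("shuffle lane1 defined zero? %d\n",
                R[1].has_value() && *R[1] == 0);                   // 1
  }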
diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll
index 53f44d496263..3d82cb352076 100644
--- a/llvm/test/CodeGen/X86/oddshuffles.ll
+++ b/llvm/test/CodeGen/X86/oddshuffles.ll
@@ -2264,8 +2264,8 @@ define <16 x i32> @splat_v3i32(<3 x i32>* %ptr) {
; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT: vpinsrd $2, 8(%rdi), %xmm0, %xmm1
; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm1[1],ymm2[2,3,4,5,6,7]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6,7]
; AVX1-NEXT: retq
;
@@ -2291,8 +2291,8 @@ define <16 x i32> @splat_v3i32(<3 x i32>* %ptr) {
; XOP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; XOP-NEXT: vpinsrd $2, 8(%rdi), %xmm0, %xmm1
; XOP-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm1[1],ymm2[2,3,4,5,6,7]
-; XOP-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4,5,6,7]
+; XOP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; XOP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6,7]
; XOP-NEXT: retq
%1 = load <3 x i32>, <3 x i32>* %ptr, align 1
diff --git a/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll b/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll
index ffd19662807d..d751fc7ec002 100644
--- a/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll
+++ b/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll
@@ -3088,13 +3088,24 @@ define <2 x i64> @test_128_i64_x_2_18446744065119617024_mask_ashr_1(<2 x i64> %a
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE2-NEXT: psrad $1, %xmm0
+; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE2-NEXT: retq
;
-; X64-AVX-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_1:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vpsrad $1, %xmm0, %xmm0
-; X64-AVX-NEXT: retq
+; X64-AVX1-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_1:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrad $1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_1:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsrad $1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X64-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
+; X64-AVX2-NEXT: retq
%t0 = and <2 x i64> %a0, <i64 18446744065119617024, i64 18446744065119617024>
%t1 = ashr <2 x i64> %t0, <i64 1, i64 1>
ret <2 x i64> %t1