sys-devel/llvm/files/cherry/36911971a58d1ba8b15e97790ac816eaadb0603e.patch - third_party/overlays/chromiumos-overlay - Git at Google

 commit 36911971a58d1ba8b15e97790ac816eaadb0603e
 Author: Alexey Bataev <a.bataev@outlook.com>
 Date:   Fri May 21 06:29:23 2021 -0700

     [SLP]Better detection of perfect/shuffles matches for gather nodes.

     Implemented better scheme for perfect/shuffled matches of the gather
     nodes which allows to fix the performance regressions introduced by
     earlier patches. Starting detecting matches for broadcast nodes and
     extractelement gathering.

     Differential Revision: https://reviews.llvm.org/D102920

 diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
 index 0f8fb09b6f6c..31d2c717f900 100644
 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
 +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
 @@ -22,6 +22,7 @@
  #include "llvm/ADT/Optional.h"
  #include "llvm/ADT/PostOrderIterator.h"
  #include "llvm/ADT/STLExtras.h"
 +#include "llvm/ADT/SetOperations.h"
  #include "llvm/ADT/SetVector.h"
  #include "llvm/ADT/SmallBitVector.h"
  #include "llvm/ADT/SmallPtrSet.h"
 @@ -3689,32 +3690,17 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E) {
    if (E->State == TreeEntry::NeedToGather) {
      if (allConstant(VL))
        return 0;
 -    if (isSplat(VL)) {
 -      return ReuseShuffleCost +
 -             TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, None,
 -                                 0);
 -    }
      if (isa<InsertElementInst>(VL[0]))
        return InstructionCost::getInvalid();
 -    if (E->getOpcode() == Instruction::ExtractElement &&
 -        allSameType(VL) && allSameBlock(VL)) {
 -      SmallVector<int> Mask;
 -      Optional<TargetTransformInfo::ShuffleKind> ShuffleKind =
 -          isShuffle(VL, Mask);
 -      if (ShuffleKind.hasValue()) {
 -        InstructionCost Cost =
 -            computeExtractCost(VL, VecTy, *ShuffleKind, Mask, *TTI);
 -        AdjustExtractsCost(Cost, /*IsGather=*/true);
 -        return ReuseShuffleCost + Cost;
 -      }
 -    }
 -    InstructionCost GatherCost = 0;
      SmallVector<int> Mask;
      SmallVector<const TreeEntry *> Entries;
      Optional<TargetTransformInfo::ShuffleKind> Shuffle =
          isGatherShuffledEntry(E, Mask, Entries);
      if (Shuffle.hasValue()) {
 +      InstructionCost GatherCost = 0;
        if (ShuffleVectorInst::isIdentityMask(Mask)) {
 +        // Perfect match in the graph, will reuse the previously vectorized
 +        // node. Cost is 0.
          LLVM_DEBUG(
              dbgs()
              << "SLP: perfect diamond match for gather bundle that starts with "
 @@ -3723,12 +3709,38 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E) {
          LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size()
                            << " entries for bundle that starts with "
                            << *VL.front() << ".\n");
 +        // Detected that instead of gather we can emit a shuffle of single/two
 +        // previously vectorized nodes. Add the cost of the permutation rather
 +        // than gather.
          GatherCost = TTI->getShuffleCost(*Shuffle, VecTy, Mask);
        }
 -    } else {
 -      GatherCost = getGatherCost(VL);
 +      return ReuseShuffleCost + GatherCost;
 +    }
 +    if (isSplat(VL)) {
 +      // Found the broadcasting of the single scalar, calculate the cost as the
 +      // broadcast.
 +      return ReuseShuffleCost +
 +             TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, None,
 +                                 0);
 +    }
 +    if (E->getOpcode() == Instruction::ExtractElement && allSameType(VL) &&
 +        allSameBlock(VL)) {
 +      // Check that gather of extractelements can be represented as just a
 +      // shuffle of a single/two vectors the scalars are extracted from.
 +      SmallVector<int> Mask;
 +      Optional<TargetTransformInfo::ShuffleKind> ShuffleKind =
 +          isShuffle(VL, Mask);
 +      if (ShuffleKind.hasValue()) {
 +        // Found the bunch of extractelement instructions that must be gathered
 +        // into a vector and can be represented as a permutation elements in a
 +        // single input vector or of 2 input vectors.
 +        InstructionCost Cost =
 +            computeExtractCost(VL, VecTy, *ShuffleKind, Mask, *TTI);
 +        AdjustExtractsCost(Cost, /*IsGather=*/true);
 +        return ReuseShuffleCost + Cost;
 +      }
      }
 -    return ReuseShuffleCost + GatherCost;
 +    return ReuseShuffleCost + getGatherCost(VL);
    }
    assert((E->State == TreeEntry::Vectorize ||
            E->State == TreeEntry::ScatterVectorize) &&
 @@ -4417,6 +4429,8 @@ InstructionCost BoUpSLP::getTreeCost() {
              return false;
            auto *IE1 = cast<InsertElementInst>(VU);
            auto *IE2 = cast<InsertElementInst>(V);
 +          // Go though of insertelement instructions trying to find either VU as
 +          // the original vector for IE2 or V as the original vector for IE1.
            do {
              if (IE1 == VU || IE2 == V)
                return true;
 @@ -4519,57 +4533,127 @@ InstructionCost BoUpSLP::getTreeCost() {
  Optional<TargetTransformInfo::ShuffleKind>
  BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask,
                                 SmallVectorImpl<const TreeEntry *> &Entries) {
 +  // TODO: currently checking only for Scalars in the tree entry, need to count
 +  // reused elements too for better cost estimation.
    Mask.assign(TE->Scalars.size(), UndefMaskElem);
    Entries.clear();
 -  DenseMap<Value *, const TreeEntry *> UsedValuesEntry;
 -  unsigned VF = 0;
 -  // FIXME: Shall be replaced by GetVF function once non-power-2 patch is
 -  // landed.
 -  auto &&GetVF = [](const TreeEntry *TE) {
 -    if (!TE->ReuseShuffleIndices.empty())
 -      return TE->ReuseShuffleIndices.size();
 -    return TE->Scalars.size();
 -  };
 -  for (int I = 0, E = TE->Scalars.size(); I < E; ++I) {
 -    Value *V = TE->Scalars[I];
 +  // Build a lists of values to tree entries.
 +  DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>> ValueToTEs;
 +  for (const std::unique_ptr<TreeEntry> &EntryPtr : VectorizableTree) {
 +    if (EntryPtr.get() == TE)
 +      break;
 +    if (EntryPtr->State != TreeEntry::NeedToGather)
 +      continue;
 +    for (Value *V : EntryPtr->Scalars)
 +      ValueToTEs.try_emplace(V).first->getSecond().insert(EntryPtr.get());
 +  }
 +  // Find all tree entries used by the gathered values. If no common entries
 +  // found - not a shuffle.
 +  // Here we build a set of tree nodes for each gathered value and trying to
 +  // find the intersection between these sets. If we have at least one common
 +  // tree node for each gathered value - we have just a permutation of the
 +  // single vector. If we have 2 different sets, we're in situation where we
 +  // have a permutation of 2 input vectors.
 +  SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
 +  DenseMap<Value *, int> UsedValuesEntry;
 +  for (Value *V : TE->Scalars) {
      if (isa<UndefValue>(V))
        continue;
 -    const TreeEntry *VTE = UsedValuesEntry.lookup(V);
 -    if (!VTE) {
 -      if (Entries.size() == 2)
 -        return None;
 -      VTE = getTreeEntry(V);
 -      if (!VTE || find_if(
 -                      VectorizableTree,
 -                      [VTE, TE](const std::unique_ptr<TreeEntry> &EntryPtr) {
 -                        return EntryPtr.get() == VTE || EntryPtr.get() == TE;
 -                      })->get() == TE) {
 -        // Check if it is used in one of the gathered entries.
 -        const auto *It =
 -            find_if(VectorizableTree,
 -                    [V, TE](const std::unique_ptr<TreeEntry> &EntryPtr) {
 -                      return EntryPtr.get() == TE ||
 -                             (EntryPtr->State == TreeEntry::NeedToGather &&
 -                              is_contained(EntryPtr->Scalars, V));
 -                    });
 -        // The vector factor of shuffled entries must be the same.
 -        if (It->get() == TE)
 +    // Build a list of tree entries where V is used.
 +    SmallPtrSet<const TreeEntry *, 4> VToTEs;
 +    auto It = ValueToTEs.find(V);
 +    if (It != ValueToTEs.end())
 +      VToTEs = It->second;
 +    if (const TreeEntry *VTE = getTreeEntry(V))
 +      VToTEs.insert(VTE);
 +    if (VToTEs.empty())
 +      return None;
 +    if (UsedTEs.empty()) {
 +      // The first iteration, just insert the list of nodes to vector.
 +      UsedTEs.push_back(VToTEs);
 +    } else {
 +      // Need to check if there are any previously used tree nodes which use V.
 +      // If there are no such nodes, consider that we have another one input
 +      // vector.
 +      SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
 +      unsigned Idx = 0;
 +      for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
 +        // Do we have a non-empty intersection of previously listed tree entries
 +        // and tree entries using current V?
 +        set_intersect(VToTEs, Set);
 +        if (!VToTEs.empty()) {
 +          // Yes, write the new subset and continue analysis for the next
 +          // scalar.
 +          Set.swap(VToTEs);
 +          break;
 +        }
 +        VToTEs = SavedVToTEs;
 +        ++Idx;
 +      }
 +      // No non-empty intersection found - need to add a second set of possible
 +      // source vectors.
 +      if (Idx == UsedTEs.size()) {
 +        // If the number of input vectors is greater than 2 - not a permutation,
 +        // fallback to the regular gather.
 +        if (UsedTEs.size() == 2)
            return None;
 -        VTE = It->get();
 +        UsedTEs.push_back(SavedVToTEs);
 +        Idx = UsedTEs.size() - 1;
        }
 -      Entries.push_back(VTE);
 -      if (Entries.size() == 1) {
 -        VF = GetVF(VTE);
 -      } else if (VF != GetVF(VTE)) {
 -        assert(Entries.size() == 2 && "Expected shuffle of 1 or 2 entries.");
 -        assert(VF > 0 && "Expected non-zero vector factor.");
 -        return None;
 +      UsedValuesEntry.try_emplace(V, Idx);
 +    }
 +  }
 +
 +  unsigned VF = 0;
 +  if (UsedTEs.size() == 1) {
 +    // Try to find the perfect match in another gather node at first.
 +    auto It = find_if(UsedTEs.front(), [TE](const TreeEntry *EntryPtr) {
 +      return EntryPtr->isSame(TE->Scalars);
 +    });
 +    if (It != UsedTEs.front().end()) {
 +      Entries.push_back(*It);
 +      std::iota(Mask.begin(), Mask.end(), 0);
 +      return TargetTransformInfo::SK_PermuteSingleSrc;
 +    }
 +    // No perfect match, just shuffle, so choose the first tree node.
 +    Entries.push_back(*UsedTEs.front().begin());
 +  } else {
 +    // Try to find nodes with the same vector factor.
 +    assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
 +    // FIXME: Shall be replaced by GetVF function once non-power-2 patch is
 +    // landed.
 +    auto &&GetVF = [](const TreeEntry *TE) {
 +      if (!TE->ReuseShuffleIndices.empty())
 +        return TE->ReuseShuffleIndices.size();
 +      return TE->Scalars.size();
 +    };
 +    DenseMap<int, const TreeEntry *> VFToTE;
 +    for (const TreeEntry *TE : UsedTEs.front())
 +      VFToTE.try_emplace(GetVF(TE), TE);
 +    for (const TreeEntry *TE : UsedTEs.back()) {
 +      auto It = VFToTE.find(GetVF(TE));
 +      if (It != VFToTE.end()) {
 +        VF = It->first;
 +        Entries.push_back(It->second);
 +        Entries.push_back(TE);
 +        break;
        }
 -      for (Value *SV : VTE->Scalars)
 -        UsedValuesEntry.try_emplace(SV, VTE);
      }
 +    // No 2 source vectors with the same vector factor - give up and do regular
 +    // gather.
 +    if (Entries.empty())
 +      return None;
 +  }
 +
 +  // Build a shuffle mask for better cost estimation and vector emission.
 +  for (int I = 0, E = TE->Scalars.size(); I < E; ++I) {
 +    Value *V = TE->Scalars[I];
 +    if (isa<UndefValue>(V))
 +      continue;
 +    unsigned Idx = UsedValuesEntry.lookup(V);
 +    const TreeEntry *VTE = Entries[Idx];
      int FoundLane = findLaneForValue(VTE->Scalars, VTE->ReuseShuffleIndices, V);
 -    Mask[I] = (Entries.front() == VTE ? 0 : VF) + FoundLane;
 +    Mask[I] = Idx * VF + FoundLane;
      // Extra check required by isSingleSourceMaskImpl function (called by
      // ShuffleVectorInst::isSingleSourceMask).
      if (Mask[I] >= 2 * E)
 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll
 index f147911a6d1f..1c806c6b1f73 100644
 --- a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll
 +++ b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll
 @@ -81,15 +81,18 @@ define i8 @i(<4 x i8> %x, <4 x i8> %y) {

  define i8 @j(<4 x i8> %x, <4 x i8> %y) {
  ; CHECK-LABEL: @j(
 -; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <2 x i32> <i32 0, i32 5>
 -; CHECK-NEXT:    [[TMP2:%.*]] = mul <2 x i8> [[TMP1]], [[TMP1]]
 -; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i8> [[X]], <4 x i8> [[Y]], <2 x i32> <i32 3, i32 6>
 -; CHECK-NEXT:    [[TMP4:%.*]] = mul <2 x i8> [[TMP3]], [[TMP3]]
 -; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]]
 -; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i32 0
 -; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i32 1
 -; CHECK-NEXT:    [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]]
 -; CHECK-NEXT:    ret i8 [[TMP8]]
 +; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0
 +; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
 +; CHECK-NEXT:    [[Y1:%.*]] = extractelement <4 x i8> [[Y:%.*]], i32 1
 +; CHECK-NEXT:    [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2
 +; CHECK-NEXT:    [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
 +; CHECK-NEXT:    [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
 +; CHECK-NEXT:    [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]]
 +; CHECK-NEXT:    [[Y2Y2:%.*]] = mul i8 [[Y2]], [[Y2]]
 +; CHECK-NEXT:    [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
 +; CHECK-NEXT:    [[TMP2:%.*]] = add i8 [[Y1Y1]], [[Y2Y2]]
 +; CHECK-NEXT:    [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]]
 +; CHECK-NEXT:    ret i8 [[TMP3]]
  ;
    %x0 = extractelement <4 x i8> %x, i32 0
    %x3 = extractelement <4 x i8> %x, i32 3
 @@ -107,15 +110,18 @@ define i8 @j(<4 x i8> %x, <4 x i8> %y) {

  define i8 @k(<4 x i8> %x) {
  ; CHECK-LABEL: @k(
 -; CHECK-NEXT:    [[TMP1:%.*]] = mul <4 x i8> [[X:%.*]], [[X]]
 -; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> undef, <2 x i32> <i32 0, i32 1>
 -; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i8> [[X]], [[X]]
 -; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> undef, <2 x i32> <i32 3, i32 2>
 -; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]]
 -; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i32 0
 -; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i32 1
 -; CHECK-NEXT:    [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]]
 -; CHECK-NEXT:    ret i8 [[TMP8]]
 +; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0
 +; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
 +; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x i8> [[X]], i32 1
 +; CHECK-NEXT:    [[X2:%.*]] = extractelement <4 x i8> [[X]], i32 2
 +; CHECK-NEXT:    [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
 +; CHECK-NEXT:    [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
 +; CHECK-NEXT:    [[X1X1:%.*]] = mul i8 [[X1]], [[X1]]
 +; CHECK-NEXT:    [[X2X2:%.*]] = mul i8 [[X2]], [[X2]]
 +; CHECK-NEXT:    [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
 +; CHECK-NEXT:    [[TMP2:%.*]] = add i8 [[X1X1]], [[X2X2]]
 +; CHECK-NEXT:    [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]]
 +; CHECK-NEXT:    ret i8 [[TMP3]]
  ;
    %x0 = extractelement <4 x i8> %x, i32 0
    %x3 = extractelement <4 x i8> %x, i32 3
 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
 index 1a4d0c3b50bd..0fcc096773dd 100644
 --- a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
 +++ b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
 @@ -81,15 +81,18 @@ define i8 @i(<4 x i8> %x, <4 x i8> %y) {

  define i8 @j(<4 x i8> %x, <4 x i8> %y) {
  ; CHECK-LABEL: @j(
 -; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <2 x i32> <i32 0, i32 5>
 -; CHECK-NEXT:    [[TMP2:%.*]] = mul <2 x i8> [[TMP1]], [[TMP1]]
 -; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i8> [[X]], <4 x i8> [[Y]], <2 x i32> <i32 3, i32 6>
 -; CHECK-NEXT:    [[TMP4:%.*]] = mul <2 x i8> [[TMP3]], [[TMP3]]
 -; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]]
 -; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i32 0
 -; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i32 1
 -; CHECK-NEXT:    [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]]
 -; CHECK-NEXT:    ret i8 [[TMP8]]
 +; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0
 +; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
 +; CHECK-NEXT:    [[Y1:%.*]] = extractelement <4 x i8> [[Y:%.*]], i32 1
 +; CHECK-NEXT:    [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2
 +; CHECK-NEXT:    [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
 +; CHECK-NEXT:    [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
 +; CHECK-NEXT:    [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]]
 +; CHECK-NEXT:    [[Y2Y2:%.*]] = mul i8 [[Y2]], [[Y2]]
 +; CHECK-NEXT:    [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
 +; CHECK-NEXT:    [[TMP2:%.*]] = add i8 [[Y1Y1]], [[Y2Y2]]
 +; CHECK-NEXT:    [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]]
 +; CHECK-NEXT:    ret i8 [[TMP3]]
  ;
    %x0 = extractelement <4 x i8> %x, i32 0
    %x3 = extractelement <4 x i8> %x, i32 3
 @@ -107,15 +110,18 @@ define i8 @j(<4 x i8> %x, <4 x i8> %y) {

  define i8 @k(<4 x i8> %x) {
  ; CHECK-LABEL: @k(
 -; CHECK-NEXT:    [[TMP1:%.*]] = mul <4 x i8> [[X:%.*]], [[X]]
 -; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> undef, <2 x i32> <i32 0, i32 1>
 -; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i8> [[X]], [[X]]
 -; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> undef, <2 x i32> <i32 3, i32 2>
 -; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]]
 -; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i32 0
 -; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i32 1
 -; CHECK-NEXT:    [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]]
 -; CHECK-NEXT:    ret i8 [[TMP8]]
 +; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0
 +; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
 +; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x i8> [[X]], i32 1
 +; CHECK-NEXT:    [[X2:%.*]] = extractelement <4 x i8> [[X]], i32 2
 +; CHECK-NEXT:    [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
 +; CHECK-NEXT:    [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
 +; CHECK-NEXT:    [[X1X1:%.*]] = mul i8 [[X1]], [[X1]]
 +; CHECK-NEXT:    [[X2X2:%.*]] = mul i8 [[X2]], [[X2]]
 +; CHECK-NEXT:    [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
 +; CHECK-NEXT:    [[TMP2:%.*]] = add i8 [[X1X1]], [[X2X2]]
 +; CHECK-NEXT:    [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]]
 +; CHECK-NEXT:    ret i8 [[TMP3]]
  ;
    %x0 = extractelement <4 x i8> %x, i32 0
    %x3 = extractelement <4 x i8> %x, i32 3
 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast.ll b/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast.ll
 index d34645707584..4eeebdbb2871 100644
 --- a/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast.ll
 +++ b/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast.ll
 @@ -5,17 +5,16 @@ define i32 @diamond_broadcast(i32* noalias nocapture %B, i32* noalias nocapture
  ; CHECK-LABEL: @diamond_broadcast(
  ; CHECK-NEXT:  entry:
  ; CHECK-NEXT:    [[LD:%.*]] = load i32, i32* [[A:%.*]], align 4
 -; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[LD]], [[LD]]
 -; CHECK-NEXT:    store i32 [[MUL]], i32* [[B:%.*]], align 4
 -; CHECK-NEXT:    [[MUL8:%.*]] = mul i32 [[LD]], [[LD]]
 -; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 1
 -; CHECK-NEXT:    store i32 [[MUL8]], i32* [[ARRAYIDX9]], align 4
 -; CHECK-NEXT:    [[MUL14:%.*]] = mul i32 [[LD]], [[LD]]
 +; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1
  ; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
 -; CHECK-NEXT:    store i32 [[MUL14]], i32* [[ARRAYIDX15]], align 4
 -; CHECK-NEXT:    [[MUL20:%.*]] = mul i32 [[LD]], [[LD]]
 +; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[LD]], i32 0
 +; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[LD]], i32 1
 +; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[LD]], i32 2
 +; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[LD]], i32 3
 +; CHECK-NEXT:    [[TMP4:%.*]] = mul <4 x i32> [[TMP3]], [[TMP3]]
  ; CHECK-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
 -; CHECK-NEXT:    store i32 [[MUL20]], i32* [[ARRAYIDX21]], align 4
 +; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[B]] to <4 x i32>*
 +; CHECK-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4
  ; CHECK-NEXT:    ret i32 0
  ;
  entry:
	commit 36911971a58d1ba8b15e97790ac816eaadb0603e
	Author: Alexey Bataev <a.bataev@outlook.com>
	Date: Fri May 21 06:29:23 2021 -0700

	[SLP]Better detection of perfect/shuffles matches for gather nodes.

	Implemented better scheme for perfect/shuffled matches of the gather
	nodes which allows to fix the performance regressions introduced by
	earlier patches. Starting detecting matches for broadcast nodes and
	extractelement gathering.

	Differential Revision: https://reviews.llvm.org/D102920

	diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
	index 0f8fb09b6f6c..31d2c717f900 100644
	--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
	+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
	@@ -22,6 +22,7 @@
	#include "llvm/ADT/Optional.h"
	#include "llvm/ADT/PostOrderIterator.h"
	#include "llvm/ADT/STLExtras.h"
	+#include "llvm/ADT/SetOperations.h"
	#include "llvm/ADT/SetVector.h"
	#include "llvm/ADT/SmallBitVector.h"
	#include "llvm/ADT/SmallPtrSet.h"
	@@ -3689,32 +3690,17 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E) {
	if (E->State == TreeEntry::NeedToGather) {
	if (allConstant(VL))
	return 0;
	- if (isSplat(VL)) {
	- return ReuseShuffleCost +
	- TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, None,
	- 0);
	- }
	if (isa<InsertElementInst>(VL[0]))
	return InstructionCost::getInvalid();
	- if (E->getOpcode() == Instruction::ExtractElement &&
	- allSameType(VL) && allSameBlock(VL)) {
	- SmallVector<int> Mask;
	- Optional<TargetTransformInfo::ShuffleKind> ShuffleKind =
	- isShuffle(VL, Mask);
	- if (ShuffleKind.hasValue()) {
	- InstructionCost Cost =
	- computeExtractCost(VL, VecTy, ShuffleKind, Mask, TTI);
	- AdjustExtractsCost(Cost, /IsGather=/true);
	- return ReuseShuffleCost + Cost;
	- }
	- }
	- InstructionCost GatherCost = 0;
	SmallVector<int> Mask;
	SmallVector<const TreeEntry *> Entries;
	Optional<TargetTransformInfo::ShuffleKind> Shuffle =
	isGatherShuffledEntry(E, Mask, Entries);
	if (Shuffle.hasValue()) {
	+ InstructionCost GatherCost = 0;
	if (ShuffleVectorInst::isIdentityMask(Mask)) {
	+ // Perfect match in the graph, will reuse the previously vectorized
	+ // node. Cost is 0.
	LLVM_DEBUG(
	dbgs()
	<< "SLP: perfect diamond match for gather bundle that starts with "
	@@ -3723,12 +3709,38 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E) {
	LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size()
	<< " entries for bundle that starts with "
	<< *VL.front() << ".\n");
	+ // Detected that instead of gather we can emit a shuffle of single/two
	+ // previously vectorized nodes. Add the cost of the permutation rather
	+ // than gather.
	GatherCost = TTI->getShuffleCost(*Shuffle, VecTy, Mask);
	}
	- } else {
	- GatherCost = getGatherCost(VL);
	+ return ReuseShuffleCost + GatherCost;
	+ }
	+ if (isSplat(VL)) {
	+ // Found the broadcasting of the single scalar, calculate the cost as the
	+ // broadcast.
	+ return ReuseShuffleCost +
	+ TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, None,
	+ 0);
	+ }
	+ if (E->getOpcode() == Instruction::ExtractElement && allSameType(VL) &&
	+ allSameBlock(VL)) {
	+ // Check that gather of extractelements can be represented as just a
	+ // shuffle of a single/two vectors the scalars are extracted from.
	+ SmallVector<int> Mask;
	+ Optional<TargetTransformInfo::ShuffleKind> ShuffleKind =
	+ isShuffle(VL, Mask);
	+ if (ShuffleKind.hasValue()) {
	+ // Found the bunch of extractelement instructions that must be gathered
	+ // into a vector and can be represented as a permutation elements in a
	+ // single input vector or of 2 input vectors.
	+ InstructionCost Cost =
	+ computeExtractCost(VL, VecTy, ShuffleKind, Mask, TTI);
	+ AdjustExtractsCost(Cost, /IsGather=/true);
	+ return ReuseShuffleCost + Cost;
	+ }
	}
	- return ReuseShuffleCost + GatherCost;
	+ return ReuseShuffleCost + getGatherCost(VL);
	}
	assert((E->State == TreeEntry::Vectorize \|\|
	E->State == TreeEntry::ScatterVectorize) &&
	@@ -4417,6 +4429,8 @@ InstructionCost BoUpSLP::getTreeCost() {
	return false;
	auto *IE1 = cast<InsertElementInst>(VU);
	auto *IE2 = cast<InsertElementInst>(V);
	+ // Go though of insertelement instructions trying to find either VU as
	+ // the original vector for IE2 or V as the original vector for IE1.
	do {
	if (IE1 == VU \|\| IE2 == V)
	return true;
	@@ -4519,57 +4533,127 @@ InstructionCost BoUpSLP::getTreeCost() {
	Optional<TargetTransformInfo::ShuffleKind>
	BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask,
	SmallVectorImpl<const TreeEntry *> &Entries) {
	+ // TODO: currently checking only for Scalars in the tree entry, need to count
	+ // reused elements too for better cost estimation.
	Mask.assign(TE->Scalars.size(), UndefMaskElem);
	Entries.clear();
	- DenseMap<Value , const TreeEntry > UsedValuesEntry;
	- unsigned VF = 0;
	- // FIXME: Shall be replaced by GetVF function once non-power-2 patch is
	- // landed.
	- auto &&GetVF = [](const TreeEntry *TE) {
	- if (!TE->ReuseShuffleIndices.empty())
	- return TE->ReuseShuffleIndices.size();
	- return TE->Scalars.size();
	- };
	- for (int I = 0, E = TE->Scalars.size(); I < E; ++I) {
	- Value *V = TE->Scalars[I];
	+ // Build a lists of values to tree entries.
	+ DenseMap<Value , SmallPtrSet<const TreeEntry , 4>> ValueToTEs;
	+ for (const std::unique_ptr<TreeEntry> &EntryPtr : VectorizableTree) {
	+ if (EntryPtr.get() == TE)
	+ break;
	+ if (EntryPtr->State != TreeEntry::NeedToGather)
	+ continue;
	+ for (Value *V : EntryPtr->Scalars)
	+ ValueToTEs.try_emplace(V).first->getSecond().insert(EntryPtr.get());
	+ }
	+ // Find all tree entries used by the gathered values. If no common entries
	+ // found - not a shuffle.
	+ // Here we build a set of tree nodes for each gathered value and trying to
	+ // find the intersection between these sets. If we have at least one common
	+ // tree node for each gathered value - we have just a permutation of the
	+ // single vector. If we have 2 different sets, we're in situation where we
	+ // have a permutation of 2 input vectors.
	+ SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
	+ DenseMap<Value *, int> UsedValuesEntry;
	+ for (Value *V : TE->Scalars) {
	if (isa<UndefValue>(V))
	continue;
	- const TreeEntry *VTE = UsedValuesEntry.lookup(V);
	- if (!VTE) {
	- if (Entries.size() == 2)
	- return None;
	- VTE = getTreeEntry(V);
	- if (!VTE \|\| find_if(
	- VectorizableTree,
	- [VTE, TE](const std::unique_ptr<TreeEntry> &EntryPtr) {
	- return EntryPtr.get() == VTE \|\| EntryPtr.get() == TE;
	- })->get() == TE) {
	- // Check if it is used in one of the gathered entries.
	- const auto *It =
	- find_if(VectorizableTree,
	- [V, TE](const std::unique_ptr<TreeEntry> &EntryPtr) {
	- return EntryPtr.get() == TE \|\|
	- (EntryPtr->State == TreeEntry::NeedToGather &&
	- is_contained(EntryPtr->Scalars, V));
	- });
	- // The vector factor of shuffled entries must be the same.
	- if (It->get() == TE)
	+ // Build a list of tree entries where V is used.
	+ SmallPtrSet<const TreeEntry *, 4> VToTEs;
	+ auto It = ValueToTEs.find(V);
	+ if (It != ValueToTEs.end())
	+ VToTEs = It->second;
	+ if (const TreeEntry *VTE = getTreeEntry(V))
	+ VToTEs.insert(VTE);
	+ if (VToTEs.empty())
	+ return None;
	+ if (UsedTEs.empty()) {
	+ // The first iteration, just insert the list of nodes to vector.
	+ UsedTEs.push_back(VToTEs);
	+ } else {
	+ // Need to check if there are any previously used tree nodes which use V.
	+ // If there are no such nodes, consider that we have another one input
	+ // vector.
	+ SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
	+ unsigned Idx = 0;
	+ for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
	+ // Do we have a non-empty intersection of previously listed tree entries
	+ // and tree entries using current V?
	+ set_intersect(VToTEs, Set);
	+ if (!VToTEs.empty()) {
	+ // Yes, write the new subset and continue analysis for the next
	+ // scalar.
	+ Set.swap(VToTEs);
	+ break;
	+ }
	+ VToTEs = SavedVToTEs;
	+ ++Idx;
	+ }
	+ // No non-empty intersection found - need to add a second set of possible
	+ // source vectors.
	+ if (Idx == UsedTEs.size()) {
	+ // If the number of input vectors is greater than 2 - not a permutation,
	+ // fallback to the regular gather.
	+ if (UsedTEs.size() == 2)
	return None;
	- VTE = It->get();
	+ UsedTEs.push_back(SavedVToTEs);
	+ Idx = UsedTEs.size() - 1;
	}
	- Entries.push_back(VTE);
	- if (Entries.size() == 1) {
	- VF = GetVF(VTE);
	- } else if (VF != GetVF(VTE)) {
	- assert(Entries.size() == 2 && "Expected shuffle of 1 or 2 entries.");
	- assert(VF > 0 && "Expected non-zero vector factor.");
	- return None;
	+ UsedValuesEntry.try_emplace(V, Idx);
	+ }
	+ }
	+
	+ unsigned VF = 0;
	+ if (UsedTEs.size() == 1) {
	+ // Try to find the perfect match in another gather node at first.
	+ auto It = find_if(UsedTEs.front(), [TE](const TreeEntry *EntryPtr) {
	+ return EntryPtr->isSame(TE->Scalars);
	+ });
	+ if (It != UsedTEs.front().end()) {
	+ Entries.push_back(*It);
	+ std::iota(Mask.begin(), Mask.end(), 0);
	+ return TargetTransformInfo::SK_PermuteSingleSrc;
	+ }
	+ // No perfect match, just shuffle, so choose the first tree node.
	+ Entries.push_back(*UsedTEs.front().begin());
	+ } else {
	+ // Try to find nodes with the same vector factor.
	+ assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
	+ // FIXME: Shall be replaced by GetVF function once non-power-2 patch is
	+ // landed.
	+ auto &&GetVF = [](const TreeEntry *TE) {
	+ if (!TE->ReuseShuffleIndices.empty())
	+ return TE->ReuseShuffleIndices.size();
	+ return TE->Scalars.size();
	+ };
	+ DenseMap<int, const TreeEntry *> VFToTE;
	+ for (const TreeEntry *TE : UsedTEs.front())
	+ VFToTE.try_emplace(GetVF(TE), TE);
	+ for (const TreeEntry *TE : UsedTEs.back()) {
	+ auto It = VFToTE.find(GetVF(TE));
	+ if (It != VFToTE.end()) {
	+ VF = It->first;
	+ Entries.push_back(It->second);
	+ Entries.push_back(TE);
	+ break;
	}
	- for (Value *SV : VTE->Scalars)
	- UsedValuesEntry.try_emplace(SV, VTE);
	}
	+ // No 2 source vectors with the same vector factor - give up and do regular
	+ // gather.
	+ if (Entries.empty())
	+ return None;
	+ }
	+
	+ // Build a shuffle mask for better cost estimation and vector emission.
	+ for (int I = 0, E = TE->Scalars.size(); I < E; ++I) {
	+ Value *V = TE->Scalars[I];
	+ if (isa<UndefValue>(V))
	+ continue;
	+ unsigned Idx = UsedValuesEntry.lookup(V);
	+ const TreeEntry *VTE = Entries[Idx];
	int FoundLane = findLaneForValue(VTE->Scalars, VTE->ReuseShuffleIndices, V);
	- Mask[I] = (Entries.front() == VTE ? 0 : VF) + FoundLane;
	+ Mask[I] = Idx * VF + FoundLane;
	// Extra check required by isSingleSourceMaskImpl function (called by
	// ShuffleVectorInst::isSingleSourceMask).
	if (Mask[I] >= 2 * E)
	diff --git a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll
	index f147911a6d1f..1c806c6b1f73 100644
	--- a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll
	+++ b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll
	@@ -81,15 +81,18 @@ define i8 @i(<4 x i8> %x, <4 x i8> %y) {

	define i8 @j(<4 x i8> %x, <4 x i8> %y) {
	; CHECK-LABEL: @j(
	-; CHECK-NEXT: [[TMP1:%.]] = shufflevector <4 x i8> [[X:%.]], <4 x i8> [[Y:%.*]], <2 x i32> <i32 0, i32 5>
	-; CHECK-NEXT: [[TMP2:%.*]] = mul <2 x i8> [[TMP1]], [[TMP1]]
	-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[X]], <4 x i8> [[Y]], <2 x i32> <i32 3, i32 6>
	-; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i8> [[TMP3]], [[TMP3]]
	-; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]]
	-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i32 0
	-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i32 1
	-; CHECK-NEXT: [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]]
	-; CHECK-NEXT: ret i8 [[TMP8]]
	+; CHECK-NEXT: [[X0:%.]] = extractelement <4 x i8> [[X:%.]], i32 0
	+; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
	+; CHECK-NEXT: [[Y1:%.]] = extractelement <4 x i8> [[Y:%.]], i32 1
	+; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2
	+; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
	+; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
	+; CHECK-NEXT: [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]]
	+; CHECK-NEXT: [[Y2Y2:%.*]] = mul i8 [[Y2]], [[Y2]]
	+; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
	+; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[Y1Y1]], [[Y2Y2]]
	+; CHECK-NEXT: [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]]
	+; CHECK-NEXT: ret i8 [[TMP3]]
	;
	%x0 = extractelement <4 x i8> %x, i32 0
	%x3 = extractelement <4 x i8> %x, i32 3
	@@ -107,15 +110,18 @@ define i8 @j(<4 x i8> %x, <4 x i8> %y) {

	define i8 @k(<4 x i8> %x) {
	; CHECK-LABEL: @k(
	-; CHECK-NEXT: [[TMP1:%.]] = mul <4 x i8> [[X:%.]], [[X]]
	-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> undef, <2 x i32> <i32 0, i32 1>
	-; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i8> [[X]], [[X]]
	-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> undef, <2 x i32> <i32 3, i32 2>
	-; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]]
	-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i32 0
	-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i32 1
	-; CHECK-NEXT: [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]]
	-; CHECK-NEXT: ret i8 [[TMP8]]
	+; CHECK-NEXT: [[X0:%.]] = extractelement <4 x i8> [[X:%.]], i32 0
	+; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
	+; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i8> [[X]], i32 1
	+; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x i8> [[X]], i32 2
	+; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
	+; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
	+; CHECK-NEXT: [[X1X1:%.*]] = mul i8 [[X1]], [[X1]]
	+; CHECK-NEXT: [[X2X2:%.*]] = mul i8 [[X2]], [[X2]]
	+; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
	+; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[X1X1]], [[X2X2]]
	+; CHECK-NEXT: [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]]
	+; CHECK-NEXT: ret i8 [[TMP3]]
	;
	%x0 = extractelement <4 x i8> %x, i32 0
	%x3 = extractelement <4 x i8> %x, i32 3
	diff --git a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
	index 1a4d0c3b50bd..0fcc096773dd 100644
	--- a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
	+++ b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
	@@ -81,15 +81,18 @@ define i8 @i(<4 x i8> %x, <4 x i8> %y) {

	define i8 @j(<4 x i8> %x, <4 x i8> %y) {
	; CHECK-LABEL: @j(
	-; CHECK-NEXT: [[TMP1:%.]] = shufflevector <4 x i8> [[X:%.]], <4 x i8> [[Y:%.*]], <2 x i32> <i32 0, i32 5>
	-; CHECK-NEXT: [[TMP2:%.*]] = mul <2 x i8> [[TMP1]], [[TMP1]]
	-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[X]], <4 x i8> [[Y]], <2 x i32> <i32 3, i32 6>
	-; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i8> [[TMP3]], [[TMP3]]
	-; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]]
	-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i32 0
	-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i32 1
	-; CHECK-NEXT: [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]]
	-; CHECK-NEXT: ret i8 [[TMP8]]
	+; CHECK-NEXT: [[X0:%.]] = extractelement <4 x i8> [[X:%.]], i32 0
	+; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
	+; CHECK-NEXT: [[Y1:%.]] = extractelement <4 x i8> [[Y:%.]], i32 1
	+; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2
	+; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
	+; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
	+; CHECK-NEXT: [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]]
	+; CHECK-NEXT: [[Y2Y2:%.*]] = mul i8 [[Y2]], [[Y2]]
	+; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
	+; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[Y1Y1]], [[Y2Y2]]
	+; CHECK-NEXT: [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]]
	+; CHECK-NEXT: ret i8 [[TMP3]]
	;
	%x0 = extractelement <4 x i8> %x, i32 0
	%x3 = extractelement <4 x i8> %x, i32 3
	@@ -107,15 +110,18 @@ define i8 @j(<4 x i8> %x, <4 x i8> %y) {

	define i8 @k(<4 x i8> %x) {
	; CHECK-LABEL: @k(
	-; CHECK-NEXT: [[TMP1:%.]] = mul <4 x i8> [[X:%.]], [[X]]
	-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> undef, <2 x i32> <i32 0, i32 1>
	-; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i8> [[X]], [[X]]
	-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> undef, <2 x i32> <i32 3, i32 2>
	-; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]]
	-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i32 0
	-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i32 1
	-; CHECK-NEXT: [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]]
	-; CHECK-NEXT: ret i8 [[TMP8]]
	+; CHECK-NEXT: [[X0:%.]] = extractelement <4 x i8> [[X:%.]], i32 0
	+; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
	+; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i8> [[X]], i32 1
	+; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x i8> [[X]], i32 2
	+; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
	+; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
	+; CHECK-NEXT: [[X1X1:%.*]] = mul i8 [[X1]], [[X1]]
	+; CHECK-NEXT: [[X2X2:%.*]] = mul i8 [[X2]], [[X2]]
	+; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
	+; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[X1X1]], [[X2X2]]
	+; CHECK-NEXT: [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]]
	+; CHECK-NEXT: ret i8 [[TMP3]]
	;
	%x0 = extractelement <4 x i8> %x, i32 0
	%x3 = extractelement <4 x i8> %x, i32 3
	diff --git a/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast.ll b/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast.ll
	index d34645707584..4eeebdbb2871 100644
	--- a/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast.ll
	+++ b/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast.ll
	@@ -5,17 +5,16 @@ define i32 @diamond_broadcast(i32* noalias nocapture %B, i32* noalias nocapture
	; CHECK-LABEL: @diamond_broadcast(
	; CHECK-NEXT: entry:
	; CHECK-NEXT: [[LD:%.]] = load i32, i32 [[A:%.*]], align 4
	-; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[LD]], [[LD]]
	-; CHECK-NEXT: store i32 [[MUL]], i32* [[B:%.*]], align 4
	-; CHECK-NEXT: [[MUL8:%.*]] = mul i32 [[LD]], [[LD]]
	-; CHECK-NEXT: [[ARRAYIDX9:%.]] = getelementptr inbounds i32, i32 [[B]], i64 1
	-; CHECK-NEXT: store i32 [[MUL8]], i32* [[ARRAYIDX9]], align 4
	-; CHECK-NEXT: [[MUL14:%.*]] = mul i32 [[LD]], [[LD]]
	+; CHECK-NEXT: [[ARRAYIDX9:%.]] = getelementptr inbounds i32, i32 [[B:%.*]], i64 1
	; CHECK-NEXT: [[ARRAYIDX15:%.]] = getelementptr inbounds i32, i32 [[B]], i64 2
	-; CHECK-NEXT: store i32 [[MUL14]], i32* [[ARRAYIDX15]], align 4
	-; CHECK-NEXT: [[MUL20:%.*]] = mul i32 [[LD]], [[LD]]
	+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[LD]], i32 0
	+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[LD]], i32 1
	+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[LD]], i32 2
	+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[LD]], i32 3
	+; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i32> [[TMP3]], [[TMP3]]
	; CHECK-NEXT: [[ARRAYIDX21:%.]] = getelementptr inbounds i32, i32 [[B]], i64 3
	-; CHECK-NEXT: store i32 [[MUL20]], i32* [[ARRAYIDX21]], align 4
	+; CHECK-NEXT: [[TMP5:%.]] = bitcast i32 [[B]] to <4 x i32>*
	+; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4
	; CHECK-NEXT: ret i32 0
	;
	entry: