blob: 7be45a206bded0d9963fddcddb899188bbb2ca0a [file] [log] [blame]
commit 36911971a58d1ba8b15e97790ac816eaadb0603e
Author: Alexey Bataev <a.bataev@outlook.com>
Date: Fri May 21 06:29:23 2021 -0700
[SLP] Better detection of perfect/shuffled matches for gather nodes.
Implemented a better scheme for detecting perfect/shuffled matches of
gather nodes, which fixes the performance regressions introduced by
earlier patches. Also started detecting matches for broadcast nodes and
extractelement gathering.
Differential Revision: https://reviews.llvm.org/D102920
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 0f8fb09b6f6c..31d2c717f900 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -22,6 +22,7 @@
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
@@ -3689,32 +3690,17 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E) {
if (E->State == TreeEntry::NeedToGather) {
if (allConstant(VL))
return 0;
- if (isSplat(VL)) {
- return ReuseShuffleCost +
- TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, None,
- 0);
- }
if (isa<InsertElementInst>(VL[0]))
return InstructionCost::getInvalid();
- if (E->getOpcode() == Instruction::ExtractElement &&
- allSameType(VL) && allSameBlock(VL)) {
- SmallVector<int> Mask;
- Optional<TargetTransformInfo::ShuffleKind> ShuffleKind =
- isShuffle(VL, Mask);
- if (ShuffleKind.hasValue()) {
- InstructionCost Cost =
- computeExtractCost(VL, VecTy, *ShuffleKind, Mask, *TTI);
- AdjustExtractsCost(Cost, /*IsGather=*/true);
- return ReuseShuffleCost + Cost;
- }
- }
- InstructionCost GatherCost = 0;
SmallVector<int> Mask;
SmallVector<const TreeEntry *> Entries;
Optional<TargetTransformInfo::ShuffleKind> Shuffle =
isGatherShuffledEntry(E, Mask, Entries);
if (Shuffle.hasValue()) {
+ InstructionCost GatherCost = 0;
if (ShuffleVectorInst::isIdentityMask(Mask)) {
+ // Perfect match in the graph, will reuse the previously vectorized
+ // node. Cost is 0.
LLVM_DEBUG(
dbgs()
<< "SLP: perfect diamond match for gather bundle that starts with "
@@ -3723,12 +3709,38 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E) {
LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size()
<< " entries for bundle that starts with "
<< *VL.front() << ".\n");
+ // Detected that instead of gather we can emit a shuffle of single/two
+ // previously vectorized nodes. Add the cost of the permutation rather
+ // than gather.
GatherCost = TTI->getShuffleCost(*Shuffle, VecTy, Mask);
}
- } else {
- GatherCost = getGatherCost(VL);
+ return ReuseShuffleCost + GatherCost;
+ }
+ if (isSplat(VL)) {
+ // Found the broadcasting of the single scalar, calculate the cost as the
+ // broadcast.
+ return ReuseShuffleCost +
+ TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, None,
+ 0);
+ }
+ if (E->getOpcode() == Instruction::ExtractElement && allSameType(VL) &&
+ allSameBlock(VL)) {
+ // Check that gather of extractelements can be represented as just a
+ // shuffle of a single/two vectors the scalars are extracted from.
+ SmallVector<int> Mask;
+ Optional<TargetTransformInfo::ShuffleKind> ShuffleKind =
+ isShuffle(VL, Mask);
+ if (ShuffleKind.hasValue()) {
+ // Found a bunch of extractelement instructions that must be gathered
+ // into a vector and can be represented as a permutation of elements of a
+ // single input vector or of 2 input vectors.
+ InstructionCost Cost =
+ computeExtractCost(VL, VecTy, *ShuffleKind, Mask, *TTI);
+ AdjustExtractsCost(Cost, /*IsGather=*/true);
+ return ReuseShuffleCost + Cost;
+ }
}
- return ReuseShuffleCost + GatherCost;
+ return ReuseShuffleCost + getGatherCost(VL);
}
assert((E->State == TreeEntry::Vectorize ||
E->State == TreeEntry::ScatterVectorize) &&
@@ -4417,6 +4429,8 @@ InstructionCost BoUpSLP::getTreeCost() {
return false;
auto *IE1 = cast<InsertElementInst>(VU);
auto *IE2 = cast<InsertElementInst>(V);
+ // Go through the insertelement instructions trying to find either VU as
+ // the original vector for IE2 or V as the original vector for IE1.
do {
if (IE1 == VU || IE2 == V)
return true;
@@ -4519,57 +4533,127 @@ InstructionCost BoUpSLP::getTreeCost() {
Optional<TargetTransformInfo::ShuffleKind>
BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask,
SmallVectorImpl<const TreeEntry *> &Entries) {
+ // TODO: currently checking only for Scalars in the tree entry, need to count
+ // reused elements too for better cost estimation.
Mask.assign(TE->Scalars.size(), UndefMaskElem);
Entries.clear();
- DenseMap<Value *, const TreeEntry *> UsedValuesEntry;
- unsigned VF = 0;
- // FIXME: Shall be replaced by GetVF function once non-power-2 patch is
- // landed.
- auto &&GetVF = [](const TreeEntry *TE) {
- if (!TE->ReuseShuffleIndices.empty())
- return TE->ReuseShuffleIndices.size();
- return TE->Scalars.size();
- };
- for (int I = 0, E = TE->Scalars.size(); I < E; ++I) {
- Value *V = TE->Scalars[I];
+ // Build a map from each value to the gather tree entries that contain it.
+ DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>> ValueToTEs;
+ for (const std::unique_ptr<TreeEntry> &EntryPtr : VectorizableTree) {
+ if (EntryPtr.get() == TE)
+ break;
+ if (EntryPtr->State != TreeEntry::NeedToGather)
+ continue;
+ for (Value *V : EntryPtr->Scalars)
+ ValueToTEs.try_emplace(V).first->getSecond().insert(EntryPtr.get());
+ }
+ // Find all tree entries used by the gathered values. If no common entries
+ // found - not a shuffle.
+ // Here we build a set of tree nodes for each gathered value and try to
+ // find the intersection between these sets. If we have at least one common
+ // tree node for each gathered value - we have just a permutation of a
+ // single vector. If we have 2 different sets, we're in the situation where
+ // we have a permutation of 2 input vectors.
+ SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
+ DenseMap<Value *, int> UsedValuesEntry;
+ for (Value *V : TE->Scalars) {
if (isa<UndefValue>(V))
continue;
- const TreeEntry *VTE = UsedValuesEntry.lookup(V);
- if (!VTE) {
- if (Entries.size() == 2)
- return None;
- VTE = getTreeEntry(V);
- if (!VTE || find_if(
- VectorizableTree,
- [VTE, TE](const std::unique_ptr<TreeEntry> &EntryPtr) {
- return EntryPtr.get() == VTE || EntryPtr.get() == TE;
- })->get() == TE) {
- // Check if it is used in one of the gathered entries.
- const auto *It =
- find_if(VectorizableTree,
- [V, TE](const std::unique_ptr<TreeEntry> &EntryPtr) {
- return EntryPtr.get() == TE ||
- (EntryPtr->State == TreeEntry::NeedToGather &&
- is_contained(EntryPtr->Scalars, V));
- });
- // The vector factor of shuffled entries must be the same.
- if (It->get() == TE)
+ // Build a list of tree entries where V is used.
+ SmallPtrSet<const TreeEntry *, 4> VToTEs;
+ auto It = ValueToTEs.find(V);
+ if (It != ValueToTEs.end())
+ VToTEs = It->second;
+ if (const TreeEntry *VTE = getTreeEntry(V))
+ VToTEs.insert(VTE);
+ if (VToTEs.empty())
+ return None;
+ if (UsedTEs.empty()) {
+ // The first iteration, just insert the list of nodes to vector.
+ UsedTEs.push_back(VToTEs);
+ } else {
+ // Need to check if there are any previously used tree nodes which use V.
+ // If there are no such nodes, consider that we have one more input
+ // vector.
+ SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
+ unsigned Idx = 0;
+ for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
+ // Do we have a non-empty intersection of previously listed tree entries
+ // and tree entries using current V?
+ set_intersect(VToTEs, Set);
+ if (!VToTEs.empty()) {
+ // Yes, write the new subset and continue analysis for the next
+ // scalar.
+ Set.swap(VToTEs);
+ break;
+ }
+ VToTEs = SavedVToTEs;
+ ++Idx;
+ }
+ // No non-empty intersection found - need to add a second set of possible
+ // source vectors.
+ if (Idx == UsedTEs.size()) {
+ // If the number of input vectors is greater than 2 - not a permutation,
+ // fall back to the regular gather.
+ if (UsedTEs.size() == 2)
return None;
- VTE = It->get();
+ UsedTEs.push_back(SavedVToTEs);
+ Idx = UsedTEs.size() - 1;
}
- Entries.push_back(VTE);
- if (Entries.size() == 1) {
- VF = GetVF(VTE);
- } else if (VF != GetVF(VTE)) {
- assert(Entries.size() == 2 && "Expected shuffle of 1 or 2 entries.");
- assert(VF > 0 && "Expected non-zero vector factor.");
- return None;
+ UsedValuesEntry.try_emplace(V, Idx);
+ }
+ }
+
+ unsigned VF = 0;
+ if (UsedTEs.size() == 1) {
+ // Try to find the perfect match in another gather node at first.
+ auto It = find_if(UsedTEs.front(), [TE](const TreeEntry *EntryPtr) {
+ return EntryPtr->isSame(TE->Scalars);
+ });
+ if (It != UsedTEs.front().end()) {
+ Entries.push_back(*It);
+ std::iota(Mask.begin(), Mask.end(), 0);
+ return TargetTransformInfo::SK_PermuteSingleSrc;
+ }
+ // No perfect match, just shuffle, so choose the first tree node.
+ Entries.push_back(*UsedTEs.front().begin());
+ } else {
+ // Try to find nodes with the same vector factor.
+ assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
+ // FIXME: Shall be replaced by GetVF function once non-power-2 patch is
+ // landed.
+ auto &&GetVF = [](const TreeEntry *TE) {
+ if (!TE->ReuseShuffleIndices.empty())
+ return TE->ReuseShuffleIndices.size();
+ return TE->Scalars.size();
+ };
+ DenseMap<int, const TreeEntry *> VFToTE;
+ for (const TreeEntry *TE : UsedTEs.front())
+ VFToTE.try_emplace(GetVF(TE), TE);
+ for (const TreeEntry *TE : UsedTEs.back()) {
+ auto It = VFToTE.find(GetVF(TE));
+ if (It != VFToTE.end()) {
+ VF = It->first;
+ Entries.push_back(It->second);
+ Entries.push_back(TE);
+ break;
}
- for (Value *SV : VTE->Scalars)
- UsedValuesEntry.try_emplace(SV, VTE);
}
+ // No 2 source vectors with the same vector factor - give up and do regular
+ // gather.
+ if (Entries.empty())
+ return None;
+ }
+
+ // Build a shuffle mask for better cost estimation and vector emission.
+ for (int I = 0, E = TE->Scalars.size(); I < E; ++I) {
+ Value *V = TE->Scalars[I];
+ if (isa<UndefValue>(V))
+ continue;
+ unsigned Idx = UsedValuesEntry.lookup(V);
+ const TreeEntry *VTE = Entries[Idx];
int FoundLane = findLaneForValue(VTE->Scalars, VTE->ReuseShuffleIndices, V);
- Mask[I] = (Entries.front() == VTE ? 0 : VF) + FoundLane;
+ Mask[I] = Idx * VF + FoundLane;
// Extra check required by isSingleSourceMaskImpl function (called by
// ShuffleVectorInst::isSingleSourceMask).
if (Mask[I] >= 2 * E)
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll
index f147911a6d1f..1c806c6b1f73 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll
@@ -81,15 +81,18 @@ define i8 @i(<4 x i8> %x, <4 x i8> %y) {
define i8 @j(<4 x i8> %x, <4 x i8> %y) {
; CHECK-LABEL: @j(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <2 x i32> <i32 0, i32 5>
-; CHECK-NEXT: [[TMP2:%.*]] = mul <2 x i8> [[TMP1]], [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[X]], <4 x i8> [[Y]], <2 x i32> <i32 3, i32 6>
-; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i8> [[TMP3]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i32 1
-; CHECK-NEXT: [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]]
-; CHECK-NEXT: ret i8 [[TMP8]]
+; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0
+; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
+; CHECK-NEXT: [[Y1:%.*]] = extractelement <4 x i8> [[Y:%.*]], i32 1
+; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2
+; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
+; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
+; CHECK-NEXT: [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]]
+; CHECK-NEXT: [[Y2Y2:%.*]] = mul i8 [[Y2]], [[Y2]]
+; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
+; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[Y1Y1]], [[Y2Y2]]
+; CHECK-NEXT: [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: ret i8 [[TMP3]]
;
%x0 = extractelement <4 x i8> %x, i32 0
%x3 = extractelement <4 x i8> %x, i32 3
@@ -107,15 +110,18 @@ define i8 @j(<4 x i8> %x, <4 x i8> %y) {
define i8 @k(<4 x i8> %x) {
; CHECK-LABEL: @k(
-; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i8> [[X:%.*]], [[X]]
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> undef, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i8> [[X]], [[X]]
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> undef, <2 x i32> <i32 3, i32 2>
-; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i32 1
-; CHECK-NEXT: [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]]
-; CHECK-NEXT: ret i8 [[TMP8]]
+; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0
+; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
+; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i8> [[X]], i32 1
+; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x i8> [[X]], i32 2
+; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
+; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
+; CHECK-NEXT: [[X1X1:%.*]] = mul i8 [[X1]], [[X1]]
+; CHECK-NEXT: [[X2X2:%.*]] = mul i8 [[X2]], [[X2]]
+; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
+; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[X1X1]], [[X2X2]]
+; CHECK-NEXT: [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: ret i8 [[TMP3]]
;
%x0 = extractelement <4 x i8> %x, i32 0
%x3 = extractelement <4 x i8> %x, i32 3
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
index 1a4d0c3b50bd..0fcc096773dd 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
@@ -81,15 +81,18 @@ define i8 @i(<4 x i8> %x, <4 x i8> %y) {
define i8 @j(<4 x i8> %x, <4 x i8> %y) {
; CHECK-LABEL: @j(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <2 x i32> <i32 0, i32 5>
-; CHECK-NEXT: [[TMP2:%.*]] = mul <2 x i8> [[TMP1]], [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[X]], <4 x i8> [[Y]], <2 x i32> <i32 3, i32 6>
-; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i8> [[TMP3]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i32 1
-; CHECK-NEXT: [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]]
-; CHECK-NEXT: ret i8 [[TMP8]]
+; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0
+; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
+; CHECK-NEXT: [[Y1:%.*]] = extractelement <4 x i8> [[Y:%.*]], i32 1
+; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2
+; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
+; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
+; CHECK-NEXT: [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]]
+; CHECK-NEXT: [[Y2Y2:%.*]] = mul i8 [[Y2]], [[Y2]]
+; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
+; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[Y1Y1]], [[Y2Y2]]
+; CHECK-NEXT: [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: ret i8 [[TMP3]]
;
%x0 = extractelement <4 x i8> %x, i32 0
%x3 = extractelement <4 x i8> %x, i32 3
@@ -107,15 +110,18 @@ define i8 @j(<4 x i8> %x, <4 x i8> %y) {
define i8 @k(<4 x i8> %x) {
; CHECK-LABEL: @k(
-; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i8> [[X:%.*]], [[X]]
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> undef, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i8> [[X]], [[X]]
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> undef, <2 x i32> <i32 3, i32 2>
-; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i32 1
-; CHECK-NEXT: [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]]
-; CHECK-NEXT: ret i8 [[TMP8]]
+; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0
+; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
+; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i8> [[X]], i32 1
+; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x i8> [[X]], i32 2
+; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
+; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
+; CHECK-NEXT: [[X1X1:%.*]] = mul i8 [[X1]], [[X1]]
+; CHECK-NEXT: [[X2X2:%.*]] = mul i8 [[X2]], [[X2]]
+; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
+; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[X1X1]], [[X2X2]]
+; CHECK-NEXT: [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: ret i8 [[TMP3]]
;
%x0 = extractelement <4 x i8> %x, i32 0
%x3 = extractelement <4 x i8> %x, i32 3
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast.ll b/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast.ll
index d34645707584..4eeebdbb2871 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast.ll
@@ -5,17 +5,16 @@ define i32 @diamond_broadcast(i32* noalias nocapture %B, i32* noalias nocapture
; CHECK-LABEL: @diamond_broadcast(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[LD:%.*]] = load i32, i32* [[A:%.*]], align 4
-; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[LD]], [[LD]]
-; CHECK-NEXT: store i32 [[MUL]], i32* [[B:%.*]], align 4
-; CHECK-NEXT: [[MUL8:%.*]] = mul i32 [[LD]], [[LD]]
-; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 1
-; CHECK-NEXT: store i32 [[MUL8]], i32* [[ARRAYIDX9]], align 4
-; CHECK-NEXT: [[MUL14:%.*]] = mul i32 [[LD]], [[LD]]
+; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1
; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
-; CHECK-NEXT: store i32 [[MUL14]], i32* [[ARRAYIDX15]], align 4
-; CHECK-NEXT: [[MUL20:%.*]] = mul i32 [[LD]], [[LD]]
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[LD]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[LD]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[LD]], i32 2
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[LD]], i32 3
+; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i32> [[TMP3]], [[TMP3]]
; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
-; CHECK-NEXT: store i32 [[MUL20]], i32* [[ARRAYIDX21]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[B]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4
; CHECK-NEXT: ret i32 0
;
entry: