| commit 36911971a58d1ba8b15e97790ac816eaadb0603e |
| Author: Alexey Bataev <a.bataev@outlook.com> |
| Date: Fri May 21 06:29:23 2021 -0700 |
| |
| [SLP]Better detection of perfect/shuffled matches for gather nodes. |
| |
| Implemented a better scheme for detecting perfect/shuffled matches of |
| gather nodes, which fixes the performance regressions introduced by |
| earlier patches. Matches are now also detected for broadcast nodes and |
| extractelement gathering. |
| |
| Differential Revision: https://reviews.llvm.org/D102920 |
| |
| diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp |
| index 0f8fb09b6f6c..31d2c717f900 100644 |
| --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp |
| +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp |
| @@ -22,6 +22,7 @@ |
| #include "llvm/ADT/Optional.h" |
| #include "llvm/ADT/PostOrderIterator.h" |
| #include "llvm/ADT/STLExtras.h" |
| +#include "llvm/ADT/SetOperations.h" |
| #include "llvm/ADT/SetVector.h" |
| #include "llvm/ADT/SmallBitVector.h" |
| #include "llvm/ADT/SmallPtrSet.h" |
| @@ -3689,32 +3690,17 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E) { |
| if (E->State == TreeEntry::NeedToGather) { |
| if (allConstant(VL)) |
| return 0; |
| - if (isSplat(VL)) { |
| - return ReuseShuffleCost + |
| - TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, None, |
| - 0); |
| - } |
| if (isa<InsertElementInst>(VL[0])) |
| return InstructionCost::getInvalid(); |
| - if (E->getOpcode() == Instruction::ExtractElement && |
| - allSameType(VL) && allSameBlock(VL)) { |
| - SmallVector<int> Mask; |
| - Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = |
| - isShuffle(VL, Mask); |
| - if (ShuffleKind.hasValue()) { |
| - InstructionCost Cost = |
| - computeExtractCost(VL, VecTy, *ShuffleKind, Mask, *TTI); |
| - AdjustExtractsCost(Cost, /*IsGather=*/true); |
| - return ReuseShuffleCost + Cost; |
| - } |
| - } |
| - InstructionCost GatherCost = 0; |
| SmallVector<int> Mask; |
| SmallVector<const TreeEntry *> Entries; |
| Optional<TargetTransformInfo::ShuffleKind> Shuffle = |
| isGatherShuffledEntry(E, Mask, Entries); |
| if (Shuffle.hasValue()) { |
| + InstructionCost GatherCost = 0; |
| if (ShuffleVectorInst::isIdentityMask(Mask)) { |
| + // Perfect match in the graph, will reuse the previously vectorized |
| + // node. Cost is 0. |
| LLVM_DEBUG( |
| dbgs() |
| << "SLP: perfect diamond match for gather bundle that starts with " |
| @@ -3723,12 +3709,38 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E) { |
| LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size() |
| << " entries for bundle that starts with " |
| << *VL.front() << ".\n"); |
| + // Detected that instead of gather we can emit a shuffle of single/two |
| + // previously vectorized nodes. Add the cost of the permutation rather |
| + // than gather. |
| GatherCost = TTI->getShuffleCost(*Shuffle, VecTy, Mask); |
| } |
| - } else { |
| - GatherCost = getGatherCost(VL); |
| + return ReuseShuffleCost + GatherCost; |
| + } |
| + if (isSplat(VL)) { |
| + // Found the broadcasting of the single scalar, calculate the cost as the |
| + // broadcast. |
| + return ReuseShuffleCost + |
| + TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, None, |
| + 0); |
| + } |
| + if (E->getOpcode() == Instruction::ExtractElement && allSameType(VL) && |
| + allSameBlock(VL)) { |
| + // Check that gather of extractelements can be represented as just a |
| + // shuffle of a single/two vectors the scalars are extracted from. |
| + SmallVector<int> Mask; |
| + Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = |
| + isShuffle(VL, Mask); |
| + if (ShuffleKind.hasValue()) { |
| + // Found a bunch of extractelement instructions that must be gathered |
| + // into a vector and can be represented as a permutation of elements of |
| + // a single input vector or of 2 input vectors. |
| + InstructionCost Cost = |
| + computeExtractCost(VL, VecTy, *ShuffleKind, Mask, *TTI); |
| + AdjustExtractsCost(Cost, /*IsGather=*/true); |
| + return ReuseShuffleCost + Cost; |
| + } |
| } |
| - return ReuseShuffleCost + GatherCost; |
| + return ReuseShuffleCost + getGatherCost(VL); |
| } |
| assert((E->State == TreeEntry::Vectorize || |
| E->State == TreeEntry::ScatterVectorize) && |
| @@ -4417,6 +4429,8 @@ InstructionCost BoUpSLP::getTreeCost() { |
| return false; |
| auto *IE1 = cast<InsertElementInst>(VU); |
| auto *IE2 = cast<InsertElementInst>(V); |
| + // Go through the insertelement instructions trying to find either VU as |
| + // the original vector for IE2 or V as the original vector for IE1. |
| do { |
| if (IE1 == VU || IE2 == V) |
| return true; |
| @@ -4519,57 +4533,127 @@ InstructionCost BoUpSLP::getTreeCost() { |
| Optional<TargetTransformInfo::ShuffleKind> |
| BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask, |
| SmallVectorImpl<const TreeEntry *> &Entries) { |
| + // TODO: currently checking only for Scalars in the tree entry, need to count |
| + // reused elements too for better cost estimation. |
| Mask.assign(TE->Scalars.size(), UndefMaskElem); |
| Entries.clear(); |
| - DenseMap<Value *, const TreeEntry *> UsedValuesEntry; |
| - unsigned VF = 0; |
| - // FIXME: Shall be replaced by GetVF function once non-power-2 patch is |
| - // landed. |
| - auto &&GetVF = [](const TreeEntry *TE) { |
| - if (!TE->ReuseShuffleIndices.empty()) |
| - return TE->ReuseShuffleIndices.size(); |
| - return TE->Scalars.size(); |
| - }; |
| - for (int I = 0, E = TE->Scalars.size(); I < E; ++I) { |
| - Value *V = TE->Scalars[I]; |
| + // Build a map from each value to the gather tree entries that contain it. |
| + DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>> ValueToTEs; |
| + for (const std::unique_ptr<TreeEntry> &EntryPtr : VectorizableTree) { |
| + if (EntryPtr.get() == TE) |
| + break; |
| + if (EntryPtr->State != TreeEntry::NeedToGather) |
| + continue; |
| + for (Value *V : EntryPtr->Scalars) |
| + ValueToTEs.try_emplace(V).first->getSecond().insert(EntryPtr.get()); |
| + } |
| + // Find all tree entries used by the gathered values. If no common entries |
| + // found - not a shuffle. |
| + // Here we build a set of tree nodes for each gathered value and try to |
| + // find the intersection between these sets. If we have at least one common |
| + // tree node for each gathered value - we have just a permutation of a |
| + // single vector. If we have 2 different sets, we're in a situation where we |
| + // have a permutation of 2 input vectors. |
| + SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs; |
| + DenseMap<Value *, int> UsedValuesEntry; |
| + for (Value *V : TE->Scalars) { |
| if (isa<UndefValue>(V)) |
| continue; |
| - const TreeEntry *VTE = UsedValuesEntry.lookup(V); |
| - if (!VTE) { |
| - if (Entries.size() == 2) |
| - return None; |
| - VTE = getTreeEntry(V); |
| - if (!VTE || find_if( |
| - VectorizableTree, |
| - [VTE, TE](const std::unique_ptr<TreeEntry> &EntryPtr) { |
| - return EntryPtr.get() == VTE || EntryPtr.get() == TE; |
| - })->get() == TE) { |
| - // Check if it is used in one of the gathered entries. |
| - const auto *It = |
| - find_if(VectorizableTree, |
| - [V, TE](const std::unique_ptr<TreeEntry> &EntryPtr) { |
| - return EntryPtr.get() == TE || |
| - (EntryPtr->State == TreeEntry::NeedToGather && |
| - is_contained(EntryPtr->Scalars, V)); |
| - }); |
| - // The vector factor of shuffled entries must be the same. |
| - if (It->get() == TE) |
| + // Build a list of tree entries where V is used. |
| + SmallPtrSet<const TreeEntry *, 4> VToTEs; |
| + auto It = ValueToTEs.find(V); |
| + if (It != ValueToTEs.end()) |
| + VToTEs = It->second; |
| + if (const TreeEntry *VTE = getTreeEntry(V)) |
| + VToTEs.insert(VTE); |
| + if (VToTEs.empty()) |
| + return None; |
| + if (UsedTEs.empty()) { |
| + // The first iteration, just insert the list of nodes to vector. |
| + UsedTEs.push_back(VToTEs); |
| + } else { |
| + // Need to check if there are any previously used tree nodes which use V. |
| + // If there are no such nodes, consider that we have one more input |
| + // vector. |
| + SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs); |
| + unsigned Idx = 0; |
| + for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) { |
| + // Do we have a non-empty intersection of previously listed tree entries |
| + // and tree entries using current V? |
| + set_intersect(VToTEs, Set); |
| + if (!VToTEs.empty()) { |
| + // Yes, write the new subset and continue analysis for the next |
| + // scalar. |
| + Set.swap(VToTEs); |
| + break; |
| + } |
| + VToTEs = SavedVToTEs; |
| + ++Idx; |
| + } |
| + // No non-empty intersection found - need to add a second set of possible |
| + // source vectors. |
| + if (Idx == UsedTEs.size()) { |
| + // If the number of input vectors is greater than 2 - not a permutation, |
| + // fall back to the regular gather. |
| + if (UsedTEs.size() == 2) |
| return None; |
| - VTE = It->get(); |
| + UsedTEs.push_back(SavedVToTEs); |
| + Idx = UsedTEs.size() - 1; |
| } |
| - Entries.push_back(VTE); |
| - if (Entries.size() == 1) { |
| - VF = GetVF(VTE); |
| - } else if (VF != GetVF(VTE)) { |
| - assert(Entries.size() == 2 && "Expected shuffle of 1 or 2 entries."); |
| - assert(VF > 0 && "Expected non-zero vector factor."); |
| - return None; |
| + UsedValuesEntry.try_emplace(V, Idx); |
| + } |
| + } |
| + |
| + unsigned VF = 0; |
| + if (UsedTEs.size() == 1) { |
| + // Try to find the perfect match in another gather node at first. |
| + auto It = find_if(UsedTEs.front(), [TE](const TreeEntry *EntryPtr) { |
| + return EntryPtr->isSame(TE->Scalars); |
| + }); |
| + if (It != UsedTEs.front().end()) { |
| + Entries.push_back(*It); |
| + std::iota(Mask.begin(), Mask.end(), 0); |
| + return TargetTransformInfo::SK_PermuteSingleSrc; |
| + } |
| + // No perfect match, just shuffle, so choose the first tree node. |
| + Entries.push_back(*UsedTEs.front().begin()); |
| + } else { |
| + // Try to find nodes with the same vector factor. |
| + assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries."); |
| + // FIXME: Shall be replaced by GetVF function once non-power-2 patch is |
| + // landed. |
| + auto &&GetVF = [](const TreeEntry *TE) { |
| + if (!TE->ReuseShuffleIndices.empty()) |
| + return TE->ReuseShuffleIndices.size(); |
| + return TE->Scalars.size(); |
| + }; |
| + DenseMap<int, const TreeEntry *> VFToTE; |
| + for (const TreeEntry *TE : UsedTEs.front()) |
| + VFToTE.try_emplace(GetVF(TE), TE); |
| + for (const TreeEntry *TE : UsedTEs.back()) { |
| + auto It = VFToTE.find(GetVF(TE)); |
| + if (It != VFToTE.end()) { |
| + VF = It->first; |
| + Entries.push_back(It->second); |
| + Entries.push_back(TE); |
| + break; |
| } |
| - for (Value *SV : VTE->Scalars) |
| - UsedValuesEntry.try_emplace(SV, VTE); |
| } |
| + // No 2 source vectors with the same vector factor - give up and do regular |
| + // gather. |
| + if (Entries.empty()) |
| + return None; |
| + } |
| + |
| + // Build a shuffle mask for better cost estimation and vector emission. |
| + for (int I = 0, E = TE->Scalars.size(); I < E; ++I) { |
| + Value *V = TE->Scalars[I]; |
| + if (isa<UndefValue>(V)) |
| + continue; |
| + unsigned Idx = UsedValuesEntry.lookup(V); |
| + const TreeEntry *VTE = Entries[Idx]; |
| int FoundLane = findLaneForValue(VTE->Scalars, VTE->ReuseShuffleIndices, V); |
| - Mask[I] = (Entries.front() == VTE ? 0 : VF) + FoundLane; |
| + Mask[I] = Idx * VF + FoundLane; |
| // Extra check required by isSingleSourceMaskImpl function (called by |
| // ShuffleVectorInst::isSingleSourceMask). |
| if (Mask[I] >= 2 * E) |
| diff --git a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll |
| index f147911a6d1f..1c806c6b1f73 100644 |
| --- a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll |
| +++ b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll |
| @@ -81,15 +81,18 @@ define i8 @i(<4 x i8> %x, <4 x i8> %y) { |
| |
| define i8 @j(<4 x i8> %x, <4 x i8> %y) { |
| ; CHECK-LABEL: @j( |
| -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <2 x i32> <i32 0, i32 5> |
| -; CHECK-NEXT: [[TMP2:%.*]] = mul <2 x i8> [[TMP1]], [[TMP1]] |
| -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[X]], <4 x i8> [[Y]], <2 x i32> <i32 3, i32 6> |
| -; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i8> [[TMP3]], [[TMP3]] |
| -; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]] |
| -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i32 0 |
| -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i32 1 |
| -; CHECK-NEXT: [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]] |
| -; CHECK-NEXT: ret i8 [[TMP8]] |
| +; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0 |
| +; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3 |
| +; CHECK-NEXT: [[Y1:%.*]] = extractelement <4 x i8> [[Y:%.*]], i32 1 |
| +; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2 |
| +; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]] |
| +; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]] |
| +; CHECK-NEXT: [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]] |
| +; CHECK-NEXT: [[Y2Y2:%.*]] = mul i8 [[Y2]], [[Y2]] |
| +; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]] |
| +; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[Y1Y1]], [[Y2Y2]] |
| +; CHECK-NEXT: [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]] |
| +; CHECK-NEXT: ret i8 [[TMP3]] |
| ; |
| %x0 = extractelement <4 x i8> %x, i32 0 |
| %x3 = extractelement <4 x i8> %x, i32 3 |
| @@ -107,15 +110,18 @@ define i8 @j(<4 x i8> %x, <4 x i8> %y) { |
| |
| define i8 @k(<4 x i8> %x) { |
| ; CHECK-LABEL: @k( |
| -; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i8> [[X:%.*]], [[X]] |
| -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> undef, <2 x i32> <i32 0, i32 1> |
| -; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i8> [[X]], [[X]] |
| -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> undef, <2 x i32> <i32 3, i32 2> |
| -; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]] |
| -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i32 0 |
| -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i32 1 |
| -; CHECK-NEXT: [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]] |
| -; CHECK-NEXT: ret i8 [[TMP8]] |
| +; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0 |
| +; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3 |
| +; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i8> [[X]], i32 1 |
| +; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x i8> [[X]], i32 2 |
| +; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]] |
| +; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]] |
| +; CHECK-NEXT: [[X1X1:%.*]] = mul i8 [[X1]], [[X1]] |
| +; CHECK-NEXT: [[X2X2:%.*]] = mul i8 [[X2]], [[X2]] |
| +; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]] |
| +; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[X1X1]], [[X2X2]] |
| +; CHECK-NEXT: [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]] |
| +; CHECK-NEXT: ret i8 [[TMP3]] |
| ; |
| %x0 = extractelement <4 x i8> %x, i32 0 |
| %x3 = extractelement <4 x i8> %x, i32 3 |
| diff --git a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll |
| index 1a4d0c3b50bd..0fcc096773dd 100644 |
| --- a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll |
| +++ b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll |
| @@ -81,15 +81,18 @@ define i8 @i(<4 x i8> %x, <4 x i8> %y) { |
| |
| define i8 @j(<4 x i8> %x, <4 x i8> %y) { |
| ; CHECK-LABEL: @j( |
| -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <2 x i32> <i32 0, i32 5> |
| -; CHECK-NEXT: [[TMP2:%.*]] = mul <2 x i8> [[TMP1]], [[TMP1]] |
| -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[X]], <4 x i8> [[Y]], <2 x i32> <i32 3, i32 6> |
| -; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i8> [[TMP3]], [[TMP3]] |
| -; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]] |
| -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i32 0 |
| -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i32 1 |
| -; CHECK-NEXT: [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]] |
| -; CHECK-NEXT: ret i8 [[TMP8]] |
| +; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0 |
| +; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3 |
| +; CHECK-NEXT: [[Y1:%.*]] = extractelement <4 x i8> [[Y:%.*]], i32 1 |
| +; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2 |
| +; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]] |
| +; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]] |
| +; CHECK-NEXT: [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]] |
| +; CHECK-NEXT: [[Y2Y2:%.*]] = mul i8 [[Y2]], [[Y2]] |
| +; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]] |
| +; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[Y1Y1]], [[Y2Y2]] |
| +; CHECK-NEXT: [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]] |
| +; CHECK-NEXT: ret i8 [[TMP3]] |
| ; |
| %x0 = extractelement <4 x i8> %x, i32 0 |
| %x3 = extractelement <4 x i8> %x, i32 3 |
| @@ -107,15 +110,18 @@ define i8 @j(<4 x i8> %x, <4 x i8> %y) { |
| |
| define i8 @k(<4 x i8> %x) { |
| ; CHECK-LABEL: @k( |
| -; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i8> [[X:%.*]], [[X]] |
| -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> undef, <2 x i32> <i32 0, i32 1> |
| -; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i8> [[X]], [[X]] |
| -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> undef, <2 x i32> <i32 3, i32 2> |
| -; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]] |
| -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i32 0 |
| -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i32 1 |
| -; CHECK-NEXT: [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]] |
| -; CHECK-NEXT: ret i8 [[TMP8]] |
| +; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0 |
| +; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3 |
| +; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i8> [[X]], i32 1 |
| +; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x i8> [[X]], i32 2 |
| +; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]] |
| +; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]] |
| +; CHECK-NEXT: [[X1X1:%.*]] = mul i8 [[X1]], [[X1]] |
| +; CHECK-NEXT: [[X2X2:%.*]] = mul i8 [[X2]], [[X2]] |
| +; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]] |
| +; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[X1X1]], [[X2X2]] |
| +; CHECK-NEXT: [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]] |
| +; CHECK-NEXT: ret i8 [[TMP3]] |
| ; |
| %x0 = extractelement <4 x i8> %x, i32 0 |
| %x3 = extractelement <4 x i8> %x, i32 3 |
| diff --git a/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast.ll b/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast.ll |
| index d34645707584..4eeebdbb2871 100644 |
| --- a/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast.ll |
| +++ b/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast.ll |
| @@ -5,17 +5,16 @@ define i32 @diamond_broadcast(i32* noalias nocapture %B, i32* noalias nocapture |
| ; CHECK-LABEL: @diamond_broadcast( |
| ; CHECK-NEXT: entry: |
| ; CHECK-NEXT: [[LD:%.*]] = load i32, i32* [[A:%.*]], align 4 |
| -; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[LD]], [[LD]] |
| -; CHECK-NEXT: store i32 [[MUL]], i32* [[B:%.*]], align 4 |
| -; CHECK-NEXT: [[MUL8:%.*]] = mul i32 [[LD]], [[LD]] |
| -; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 1 |
| -; CHECK-NEXT: store i32 [[MUL8]], i32* [[ARRAYIDX9]], align 4 |
| -; CHECK-NEXT: [[MUL14:%.*]] = mul i32 [[LD]], [[LD]] |
| +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1 |
| ; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2 |
| -; CHECK-NEXT: store i32 [[MUL14]], i32* [[ARRAYIDX15]], align 4 |
| -; CHECK-NEXT: [[MUL20:%.*]] = mul i32 [[LD]], [[LD]] |
| +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[LD]], i32 0 |
| +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[LD]], i32 1 |
| +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[LD]], i32 2 |
| +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[LD]], i32 3 |
| +; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i32> [[TMP3]], [[TMP3]] |
| ; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 |
| -; CHECK-NEXT: store i32 [[MUL20]], i32* [[ARRAYIDX21]], align 4 |
| +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[B]] to <4 x i32>* |
| +; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4 |
| ; CHECK-NEXT: ret i32 0 |
| ; |
| entry: |