blob: ad09d6360b74b5114078b9e6d5b2c8223e6f2dae [file] [log] [blame]
commit d7a40a447f1ed0294c6bc8fe82b6b2460e31de06
Author: Alexey Bataev <a.bataev@outlook.com>
Date: Tue Apr 18 10:40:20 2023 -0700
Revert "[SLP]Add final resize to ShuffleCostEstimator::finalize member function and basic add member functions."
This reverts commit cd341f3f4878137d1c9e7a05c4c3a7bd8ff216dc to fix
a crash revealed by buildbot https://lab.llvm.org/buildbot#builders/124/builds/7108.
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 163e3581ea7b..a1fabec2a8d7 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -946,14 +946,9 @@ static bool isSimple(Instruction *I) {
}
/// Shuffles \p Mask in accordance with the given \p SubMask.
-/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
-/// one but two input vectors.
-static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
- bool ExtendingManyInputs = false) {
+static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask) {
if (SubMask.empty())
return;
- assert((!ExtendingManyInputs || SubMask.size() > Mask.size()) &&
- "SubMask with many inputs support must be larger than the mask.");
if (Mask.empty()) {
Mask.append(SubMask.begin(), SubMask.end());
return;
@@ -961,9 +956,8 @@ static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
SmallVector<int> NewMask(SubMask.size(), UndefMaskElem);
int TermValue = std::min(Mask.size(), SubMask.size());
for (int I = 0, E = SubMask.size(); I < E; ++I) {
- if ((!ExtendingManyInputs &&
- (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)) ||
- SubMask[I] == UndefMaskElem)
+ if (SubMask[I] >= TermValue || SubMask[I] == UndefMaskElem ||
+ Mask[SubMask[I]] >= TermValue)
continue;
NewMask[I] = Mask[SubMask[I]];
}
@@ -6794,8 +6788,6 @@ protected:
/// analysis/transformations.
class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
bool IsFinalized = false;
- SmallVector<int> CommonMask;
- SmallVector<Value *, 2> InVectors;
const TargetTransformInfo &TTI;
InstructionCost Cost = 0;
ArrayRef<Value *> VectorizedVals;
@@ -7017,53 +7009,19 @@ public:
VecTy, std::nullopt, CostKind, 0, EEVTy);
}
}
- InVectors.assign(1, VecBase);
return VecBase;
}
- void add(const TreeEntry *E1, const TreeEntry *E2, ArrayRef<int> Mask) {
- CommonMask.assign(Mask.begin(), Mask.end());
- InVectors.assign(
- 2, Constant::getNullValue(FixedVectorType::get(
- E1->Scalars.front()->getType(),
- std::max(E1->getVectorFactor(), E2->getVectorFactor()))));
- }
- void add(const TreeEntry *E1, ArrayRef<int> Mask) {
- CommonMask.assign(Mask.begin(), Mask.end());
- InVectors.assign(
- 1, Constant::getNullValue(FixedVectorType::get(
- E1->Scalars.front()->getType(), E1->getVectorFactor())));
- }
void gather(ArrayRef<Value *> VL, Value *Root = nullptr) {
Cost += getBuildVectorCost(VL, Root);
- if (!Root) {
- assert(InVectors.empty() && "Unexpected input vectors for buildvector.");
- InVectors.assign(1, Constant::getNullValue(FixedVectorType::get(
- VL.front()->getType(), VL.size())));
- }
}
/// Finalize emission of the shuffles.
- InstructionCost finalize(ArrayRef<int> ExtMask) {
+ InstructionCost finalize() {
IsFinalized = true;
- ::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true);
- if (CommonMask.empty())
- return Cost;
- int Limit = CommonMask.size() * 2;
- if (all_of(CommonMask, [=](int Idx) { return Idx < Limit; }) &&
- ShuffleVectorInst::isIdentityMask(CommonMask))
- return Cost;
- return Cost +
- TTI.getShuffleCost(InVectors.size() == 2 ? TTI::SK_PermuteTwoSrc
- : TTI::SK_PermuteSingleSrc,
- FixedVectorType::get(
- cast<VectorType>(InVectors.front()->getType())
- ->getElementType(),
- CommonMask.size()),
- CommonMask);
+ return Cost;
}
~ShuffleCostEstimator() {
- assert((IsFinalized || CommonMask.empty()) &&
- "Shuffle construction must be finalized.");
+ assert(IsFinalized && "Shuffle construction must be finalized.");
}
};
@@ -7151,30 +7109,35 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
if (Mask[I] != UndefMaskElem)
GatheredScalars[I] = PoisonValue::get(ScalarTy);
}
- LLVM_DEBUG(
- int Limit = Mask.size() * 2;
- if (*GatherShuffle == TTI::SK_PermuteSingleSrc &&
- all_of(Mask, [=](int Idx) { return Idx < Limit; }) &&
- ShuffleVectorInst::isIdentityMask(Mask)) {
- // Perfect match in the graph, will reuse the previously
- // vectorized node. Cost is 0.
- dbgs() << "SLP: perfect diamond match for gather bundle "
- "that starts with "
- << *VL.front() << ".\n";
- } else {
- dbgs() << "SLP: shuffled " << Entries.size()
- << " entries for bundle that starts with " << *VL.front()
- << ".\n";
- });
- if (Entries.size() == 1)
- Estimator.add(Entries.front(), Mask);
- else
- Estimator.add(Entries.front(), Entries.back(), Mask);
+ InstructionCost GatherCost = 0;
+ int Limit = Mask.size() * 2;
+ if (all_of(Mask, [=](int Idx) { return Idx < Limit; }) &&
+ ShuffleVectorInst::isIdentityMask(Mask)) {
+ // Perfect match in the graph, will reuse the previously vectorized
+ // node. Cost is 0.
+ LLVM_DEBUG(
+ dbgs()
+ << "SLP: perfect diamond match for gather bundle that starts with "
+ << *VL.front() << ".\n");
+ if (NeedToShuffleReuses)
+ GatherCost =
+ TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+ FinalVecTy, E->ReuseShuffleIndices);
+ } else {
+ LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size()
+ << " entries for bundle that starts with "
+ << *VL.front() << ".\n");
+ // Detected that instead of gather we can emit a shuffle of single/two
+ // previously vectorized nodes. Add the cost of the permutation rather
+ // than gather.
+ ::addMask(Mask, E->ReuseShuffleIndices);
+ GatherCost = TTI->getShuffleCost(*GatherShuffle, FinalVecTy, Mask);
+ }
Estimator.gather(
GatheredScalars,
Constant::getNullValue(FixedVectorType::get(
GatheredScalars.front()->getType(), GatheredScalars.size())));
- return Estimator.finalize(E->ReuseShuffleIndices);
+ return GatherCost + Estimator.finalize();
}
if (ExtractShuffle && all_of(GatheredScalars, PoisonValue::classof)) {
// Check that gather of extractelements can be represented as just a
@@ -7184,15 +7147,17 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
// single input vector or of 2 input vectors.
InstructionCost Cost =
computeExtractCost(VL, VecTy, *ExtractShuffle, ExtractMask, *TTI);
- return Cost + Estimator.finalize(E->ReuseShuffleIndices);
- }
- Estimator.gather(
- GatheredScalars,
- (ExtractShuffle || GatherShuffle)
- ? Constant::getNullValue(FixedVectorType::get(
- GatheredScalars.front()->getType(), GatheredScalars.size()))
- : nullptr);
- return Estimator.finalize(E->ReuseShuffleIndices);
+ if (NeedToShuffleReuses)
+ Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+ FinalVecTy, E->ReuseShuffleIndices);
+ return Cost + Estimator.finalize();
+ }
+ InstructionCost ReuseShuffleCost = 0;
+ if (NeedToShuffleReuses)
+ ReuseShuffleCost = TTI->getShuffleCost(
+ TTI::SK_PermuteSingleSrc, FinalVecTy, E->ReuseShuffleIndices);
+ Estimator.gather(GatheredScalars);
+ return ReuseShuffleCost + Estimator.finalize();
}
InstructionCost CommonCost = 0;
SmallVector<int> Mask;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
index f26d14c44db8..c5b6ac647aa3 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
@@ -93,36 +93,20 @@ define i1 @logical_or_fcmp(<4 x float> %x) {
}
define i1 @logical_and_icmp_diff_preds(<4 x i32> %x) {
-; SSE-LABEL: @logical_and_icmp_diff_preds(
-; SSE-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
-; SSE-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3
-; SSE-NEXT: [[C0:%.*]] = icmp ult i32 [[X0]], 0
-; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> poison, <2 x i32> <i32 1, i32 2>
-; SSE-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> <i32 poison, i32 0>, <2 x i32> <i32 0, i32 3>
-; SSE-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> <i32 0, i32 poison>, <2 x i32> <i32 2, i32 1>
-; SSE-NEXT: [[TMP4:%.*]] = icmp slt <2 x i32> [[TMP2]], [[TMP3]]
-; SSE-NEXT: [[C3:%.*]] = icmp slt i32 [[X3]], 0
-; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
-; SSE-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[TMP5]], i1 false
-; SSE-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
-; SSE-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[TMP6]], i1 false
-; SSE-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[C3]], i1 false
-; SSE-NEXT: ret i1 [[S3]]
-;
-; AVX-LABEL: @logical_and_icmp_diff_preds(
-; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> <i32 poison, i32 poison, i32 poison, i32 0>, <4 x i32> <i32 0, i32 3, i32 1, i32 7>
-; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> <i32 0, i32 0, i32 0, i32 poison>, <4 x i32> <i32 4, i32 5, i32 6, i32 2>
-; AVX-NEXT: [[TMP3:%.*]] = icmp ult <4 x i32> [[TMP1]], [[TMP2]]
-; AVX-NEXT: [[TMP4:%.*]] = icmp slt <4 x i32> [[TMP1]], [[TMP2]]
-; AVX-NEXT: [[TMP5:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 6, i32 7>
-; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0
-; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP5]], i32 2
-; AVX-NEXT: [[S1:%.*]] = select i1 [[TMP6]], i1 [[TMP7]], i1 false
-; AVX-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP5]], i32 3
-; AVX-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[TMP8]], i1 false
-; AVX-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP5]], i32 1
-; AVX-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[TMP9]], i1 false
-; AVX-NEXT: ret i1 [[S3]]
+; CHECK-LABEL: @logical_and_icmp_diff_preds(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> <i32 poison, i32 poison, i32 poison, i32 0>, <4 x i32> <i32 0, i32 3, i32 1, i32 7>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> <i32 0, i32 0, i32 0, i32 poison>, <4 x i32> <i32 4, i32 5, i32 6, i32 2>
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ult <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp slt <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP5]], i32 2
+; CHECK-NEXT: [[S1:%.*]] = select i1 [[TMP6]], i1 [[TMP7]], i1 false
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP5]], i32 3
+; CHECK-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[TMP8]], i1 false
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP5]], i32 1
+; CHECK-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[TMP9]], i1 false
+; CHECK-NEXT: ret i1 [[S3]]
;
%x0 = extractelement <4 x i32> %x, i32 0
%x1 = extractelement <4 x i32> %x, i32 1