| commit d7a40a447f1ed0294c6bc8fe82b6b2460e31de06 |
| Author: Alexey Bataev <a.bataev@outlook.com> |
| Date: Tue Apr 18 10:40:20 2023 -0700 |
| |
| Revert "[SLP]Add final resize to ShuffleCostEstimator::finalize member function and basic add member functions." |
| |
| This reverts commit cd341f3f4878137d1c9e7a05c4c3a7bd8ff216dc to fix |
| a crash revealed by the buildbot: https://lab.llvm.org/buildbot#builders/124/builds/7108 |
| |
| diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp |
| index 163e3581ea7b..a1fabec2a8d7 100644 |
| --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp |
| +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp |
| @@ -946,14 +946,9 @@ static bool isSimple(Instruction *I) { |
| } |
| |
| /// Shuffles \p Mask in accordance with the given \p SubMask. |
| -/// \param ExtendingManyInputs Supports reshuffling of the mask with not only |
| -/// one but two input vectors. |
| -static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask, |
| - bool ExtendingManyInputs = false) { |
| +static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask) { |
| if (SubMask.empty()) |
| return; |
| - assert((!ExtendingManyInputs || SubMask.size() > Mask.size()) && |
| - "SubMask with many inputs support must be larger than the mask."); |
| if (Mask.empty()) { |
| Mask.append(SubMask.begin(), SubMask.end()); |
| return; |
| @@ -961,9 +956,8 @@ static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask, |
| SmallVector<int> NewMask(SubMask.size(), UndefMaskElem); |
| int TermValue = std::min(Mask.size(), SubMask.size()); |
| for (int I = 0, E = SubMask.size(); I < E; ++I) { |
| - if ((!ExtendingManyInputs && |
| - (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)) || |
| - SubMask[I] == UndefMaskElem) |
| + if (SubMask[I] >= TermValue || SubMask[I] == UndefMaskElem || |
| + Mask[SubMask[I]] >= TermValue) |
| continue; |
| NewMask[I] = Mask[SubMask[I]]; |
| } |
| @@ -6794,8 +6788,6 @@ protected: |
| /// analysis/transformations. |
| class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { |
| bool IsFinalized = false; |
| - SmallVector<int> CommonMask; |
| - SmallVector<Value *, 2> InVectors; |
| const TargetTransformInfo &TTI; |
| InstructionCost Cost = 0; |
| ArrayRef<Value *> VectorizedVals; |
| @@ -7017,53 +7009,19 @@ public: |
| VecTy, std::nullopt, CostKind, 0, EEVTy); |
| } |
| } |
| - InVectors.assign(1, VecBase); |
| return VecBase; |
| } |
| - void add(const TreeEntry *E1, const TreeEntry *E2, ArrayRef<int> Mask) { |
| - CommonMask.assign(Mask.begin(), Mask.end()); |
| - InVectors.assign( |
| - 2, Constant::getNullValue(FixedVectorType::get( |
| - E1->Scalars.front()->getType(), |
| - std::max(E1->getVectorFactor(), E2->getVectorFactor())))); |
| - } |
| - void add(const TreeEntry *E1, ArrayRef<int> Mask) { |
| - CommonMask.assign(Mask.begin(), Mask.end()); |
| - InVectors.assign( |
| - 1, Constant::getNullValue(FixedVectorType::get( |
| - E1->Scalars.front()->getType(), E1->getVectorFactor()))); |
| - } |
| void gather(ArrayRef<Value *> VL, Value *Root = nullptr) { |
| Cost += getBuildVectorCost(VL, Root); |
| - if (!Root) { |
| - assert(InVectors.empty() && "Unexpected input vectors for buildvector."); |
| - InVectors.assign(1, Constant::getNullValue(FixedVectorType::get( |
| - VL.front()->getType(), VL.size()))); |
| - } |
| } |
| /// Finalize emission of the shuffles. |
| - InstructionCost finalize(ArrayRef<int> ExtMask) { |
| + InstructionCost finalize() { |
| IsFinalized = true; |
| - ::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true); |
| - if (CommonMask.empty()) |
| - return Cost; |
| - int Limit = CommonMask.size() * 2; |
| - if (all_of(CommonMask, [=](int Idx) { return Idx < Limit; }) && |
| - ShuffleVectorInst::isIdentityMask(CommonMask)) |
| - return Cost; |
| - return Cost + |
| - TTI.getShuffleCost(InVectors.size() == 2 ? TTI::SK_PermuteTwoSrc |
| - : TTI::SK_PermuteSingleSrc, |
| - FixedVectorType::get( |
| - cast<VectorType>(InVectors.front()->getType()) |
| - ->getElementType(), |
| - CommonMask.size()), |
| - CommonMask); |
| + return Cost; |
| } |
| |
| ~ShuffleCostEstimator() { |
| - assert((IsFinalized || CommonMask.empty()) && |
| - "Shuffle construction must be finalized."); |
| + assert(IsFinalized && "Shuffle construction must be finalized."); |
| } |
| }; |
| |
| @@ -7151,30 +7109,35 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, |
| if (Mask[I] != UndefMaskElem) |
| GatheredScalars[I] = PoisonValue::get(ScalarTy); |
| } |
| - LLVM_DEBUG( |
| - int Limit = Mask.size() * 2; |
| - if (*GatherShuffle == TTI::SK_PermuteSingleSrc && |
| - all_of(Mask, [=](int Idx) { return Idx < Limit; }) && |
| - ShuffleVectorInst::isIdentityMask(Mask)) { |
| - // Perfect match in the graph, will reuse the previously |
| - // vectorized node. Cost is 0. |
| - dbgs() << "SLP: perfect diamond match for gather bundle " |
| - "that starts with " |
| - << *VL.front() << ".\n"; |
| - } else { |
| - dbgs() << "SLP: shuffled " << Entries.size() |
| - << " entries for bundle that starts with " << *VL.front() |
| - << ".\n"; |
| - }); |
| - if (Entries.size() == 1) |
| - Estimator.add(Entries.front(), Mask); |
| - else |
| - Estimator.add(Entries.front(), Entries.back(), Mask); |
| + InstructionCost GatherCost = 0; |
| + int Limit = Mask.size() * 2; |
| + if (all_of(Mask, [=](int Idx) { return Idx < Limit; }) && |
| + ShuffleVectorInst::isIdentityMask(Mask)) { |
| + // Perfect match in the graph, will reuse the previously vectorized |
| + // node. Cost is 0. |
| + LLVM_DEBUG( |
| + dbgs() |
| + << "SLP: perfect diamond match for gather bundle that starts with " |
| + << *VL.front() << ".\n"); |
| + if (NeedToShuffleReuses) |
| + GatherCost = |
| + TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, |
| + FinalVecTy, E->ReuseShuffleIndices); |
| + } else { |
| + LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size() |
| + << " entries for bundle that starts with " |
| + << *VL.front() << ".\n"); |
| + // Detected that instead of gather we can emit a shuffle of single/two |
| + // previously vectorized nodes. Add the cost of the permutation rather |
| + // than gather. |
| + ::addMask(Mask, E->ReuseShuffleIndices); |
| + GatherCost = TTI->getShuffleCost(*GatherShuffle, FinalVecTy, Mask); |
| + } |
| Estimator.gather( |
| GatheredScalars, |
| Constant::getNullValue(FixedVectorType::get( |
| GatheredScalars.front()->getType(), GatheredScalars.size()))); |
| - return Estimator.finalize(E->ReuseShuffleIndices); |
| + return GatherCost + Estimator.finalize(); |
| } |
| if (ExtractShuffle && all_of(GatheredScalars, PoisonValue::classof)) { |
| // Check that gather of extractelements can be represented as just a |
| @@ -7184,15 +7147,17 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, |
| // single input vector or of 2 input vectors. |
| InstructionCost Cost = |
| computeExtractCost(VL, VecTy, *ExtractShuffle, ExtractMask, *TTI); |
| - return Cost + Estimator.finalize(E->ReuseShuffleIndices); |
| - } |
| - Estimator.gather( |
| - GatheredScalars, |
| - (ExtractShuffle || GatherShuffle) |
| - ? Constant::getNullValue(FixedVectorType::get( |
| - GatheredScalars.front()->getType(), GatheredScalars.size())) |
| - : nullptr); |
| - return Estimator.finalize(E->ReuseShuffleIndices); |
| + if (NeedToShuffleReuses) |
| + Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, |
| + FinalVecTy, E->ReuseShuffleIndices); |
| + return Cost + Estimator.finalize(); |
| + } |
| + InstructionCost ReuseShuffleCost = 0; |
| + if (NeedToShuffleReuses) |
| + ReuseShuffleCost = TTI->getShuffleCost( |
| + TTI::SK_PermuteSingleSrc, FinalVecTy, E->ReuseShuffleIndices); |
| + Estimator.gather(GatheredScalars); |
| + return ReuseShuffleCost + Estimator.finalize(); |
| } |
| InstructionCost CommonCost = 0; |
| SmallVector<int> Mask; |
| diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll |
| index f26d14c44db8..c5b6ac647aa3 100644 |
| --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll |
| +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll |
| @@ -93,36 +93,20 @@ define i1 @logical_or_fcmp(<4 x float> %x) { |
| } |
| |
| define i1 @logical_and_icmp_diff_preds(<4 x i32> %x) { |
| -; SSE-LABEL: @logical_and_icmp_diff_preds( |
| -; SSE-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0 |
| -; SSE-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3 |
| -; SSE-NEXT: [[C0:%.*]] = icmp ult i32 [[X0]], 0 |
| -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> poison, <2 x i32> <i32 1, i32 2> |
| -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> <i32 poison, i32 0>, <2 x i32> <i32 0, i32 3> |
| -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> <i32 0, i32 poison>, <2 x i32> <i32 2, i32 1> |
| -; SSE-NEXT: [[TMP4:%.*]] = icmp slt <2 x i32> [[TMP2]], [[TMP3]] |
| -; SSE-NEXT: [[C3:%.*]] = icmp slt i32 [[X3]], 0 |
| -; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 |
| -; SSE-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[TMP5]], i1 false |
| -; SSE-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 |
| -; SSE-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[TMP6]], i1 false |
| -; SSE-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[C3]], i1 false |
| -; SSE-NEXT: ret i1 [[S3]] |
| -; |
| -; AVX-LABEL: @logical_and_icmp_diff_preds( |
| -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> <i32 poison, i32 poison, i32 poison, i32 0>, <4 x i32> <i32 0, i32 3, i32 1, i32 7> |
| -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> <i32 0, i32 0, i32 0, i32 poison>, <4 x i32> <i32 4, i32 5, i32 6, i32 2> |
| -; AVX-NEXT: [[TMP3:%.*]] = icmp ult <4 x i32> [[TMP1]], [[TMP2]] |
| -; AVX-NEXT: [[TMP4:%.*]] = icmp slt <4 x i32> [[TMP1]], [[TMP2]] |
| -; AVX-NEXT: [[TMP5:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 6, i32 7> |
| -; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0 |
| -; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP5]], i32 2 |
| -; AVX-NEXT: [[S1:%.*]] = select i1 [[TMP6]], i1 [[TMP7]], i1 false |
| -; AVX-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP5]], i32 3 |
| -; AVX-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[TMP8]], i1 false |
| -; AVX-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP5]], i32 1 |
| -; AVX-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[TMP9]], i1 false |
| -; AVX-NEXT: ret i1 [[S3]] |
| +; CHECK-LABEL: @logical_and_icmp_diff_preds( |
| +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> <i32 poison, i32 poison, i32 poison, i32 0>, <4 x i32> <i32 0, i32 3, i32 1, i32 7> |
| +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> <i32 0, i32 0, i32 0, i32 poison>, <4 x i32> <i32 4, i32 5, i32 6, i32 2> |
| +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult <4 x i32> [[TMP1]], [[TMP2]] |
| +; CHECK-NEXT: [[TMP4:%.*]] = icmp slt <4 x i32> [[TMP1]], [[TMP2]] |
| +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 6, i32 7> |
| +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0 |
| +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP5]], i32 2 |
| +; CHECK-NEXT: [[S1:%.*]] = select i1 [[TMP6]], i1 [[TMP7]], i1 false |
| +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP5]], i32 3 |
| +; CHECK-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[TMP8]], i1 false |
| +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP5]], i32 1 |
| +; CHECK-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[TMP9]], i1 false |
| +; CHECK-NEXT: ret i1 [[S3]] |
| ; |
| %x0 = extractelement <4 x i32> %x, i32 0 |
| %x1 = extractelement <4 x i32> %x, i32 1 |