| commit b446ec56a2987ba02dc2c80d42100be4a4689215 |
| Author: Florian Hahn <flo@fhahn.com> |
| Date: Tue Jun 2 10:33:10 2020 +0100 |
| |
| [LV] Make sure the MaxVF is a power-of-2 by rounding down. |
| |
| LV currently only supports power of 2 vectorization factors, which has |
| been made explicit with the assertion added in |
| 840450549c9199150cbdee29acef756c19660ca1. |
| |
| However, if the width of the widest type is not a power-of-2 (e.g. the |
| 80-bit x86_fp80), the computed MaxVF may not be a power-of-2 either. This |
| patch updates computeFeasibleMaxVF to ensure the returned value is a |
| power-of-2 by rounding down to the nearest power-of-2. |
| |
| Fixes PR46139. |
| |
| Reviewers: Ayal, gilr, rengolin |
| |
| Reviewed By: Ayal |
| |
| Differential Revision: https://reviews.llvm.org/D80870 |
| |
| diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp |
| index 5e5f029578f..4f40d8d529f 100644 |
| --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp |
| +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp |
| @@ -1323,8 +1323,9 @@ public: |
| private: |
| unsigned NumPredStores = 0; |
| |
| - /// \return An upper bound for the vectorization factor, larger than zero. |
| - /// One is returned if vectorization should best be avoided due to cost. |
| + /// \return An upper bound for the vectorization factor, a power-of-2 larger |
| + /// than zero. One is returned if vectorization should best be avoided due |
| + /// to cost. |
| unsigned computeFeasibleMaxVF(unsigned ConstTripCount); |
| |
| /// The vectorization cost is a combination of the cost itself and a boolean |
| @@ -5058,9 +5059,8 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { |
| WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth); |
| |
| // Ensure MaxVF is a power of 2; the dependence distance bound may not be. |
| - WidestRegister = PowerOf2Floor(WidestRegister); |
| - |
| - unsigned MaxVectorSize = WidestRegister / WidestType; |
| + // Note that neither WidestRegister nor WidestType is guaranteed to be a power of 2. |
| + unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType); |
| |
| LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType |
| << " / " << WidestType << " bits.\n"); |
| diff --git a/llvm/test/Transforms/LoopVectorize/X86/fp80-widest-type.ll b/llvm/test/Transforms/LoopVectorize/X86/fp80-widest-type.ll |
| new file mode 100644 |
| index 00000000000..2e12e31342b |
| --- /dev/null |
| +++ b/llvm/test/Transforms/LoopVectorize/X86/fp80-widest-type.ll |
| @@ -0,0 +1,40 @@ |
| +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py |
| +; RUN: opt -loop-vectorize -S %s -mattr=+avx512f | FileCheck %s |
| + |
| +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" |
| +target triple = "x86_64-apple-macosx10.15.0" |
| + |
| +; Make sure non-power-of-2 types are handled correctly, i.e., MaxVF is still a power-of-2. |
| + |
| +; Test case from PR46139. |
| + |
| +define x86_fp80 @test() { |
| +; CHECK-LABEL: @test( |
| +; CHECK-NEXT: foo.exit: |
| +; CHECK-NEXT: br label [[FOR_BODY3_I_3:%.*]] |
| +; CHECK: for.body3.i.3: |
| +; CHECK-NEXT: [[N_ADDR_112_I_3:%.*]] = phi i64 [ [[DEC_I_3:%.*]], [[FOR_BODY3_I_3]] ], [ 24, [[FOO_EXIT:%.*]] ] |
| +; CHECK-NEXT: [[X_ADDR_111_I_3:%.*]] = phi x86_fp80 [ [[MUL_I_3:%.*]], [[FOR_BODY3_I_3]] ], [ undef, [[FOO_EXIT]] ] |
| +; CHECK-NEXT: [[MUL_I_3]] = fmul x86_fp80 [[X_ADDR_111_I_3]], 0xK40008000000000000000 |
| +; CHECK-NEXT: [[DEC_I_3]] = add nsw i64 [[N_ADDR_112_I_3]], -1 |
| +; CHECK-NEXT: [[CMP2_I_3:%.*]] = icmp sgt i64 [[N_ADDR_112_I_3]], 1 |
| +; CHECK-NEXT: br i1 [[CMP2_I_3]], label [[FOR_BODY3_I_3]], label [[FOO_EXIT_3:%.*]] |
| +; CHECK: foo.exit.3: |
| +; CHECK-NEXT: [[MUL_LCSSA:%.*]] = phi x86_fp80 [ [[MUL_I_3]], [[FOR_BODY3_I_3]] ] |
| +; CHECK-NEXT: ret x86_fp80 [[MUL_LCSSA]] |
| +; |
| +foo.exit: |
| + br label %for.body3.i.3 |
| + |
| +for.body3.i.3: ; preds = %for.body3.i.3, %foo.exit |
| + %n.addr.112.i.3 = phi i64 [ %dec.i.3, %for.body3.i.3 ], [ 24, %foo.exit ] |
| + %x.addr.111.i.3 = phi x86_fp80 [ %mul.i.3, %for.body3.i.3 ], [ undef, %foo.exit ] |
| + %mul.i.3 = fmul x86_fp80 %x.addr.111.i.3, 0xK40008000000000000000 |
| + %dec.i.3 = add nsw i64 %n.addr.112.i.3, -1 |
| + %cmp2.i.3 = icmp sgt i64 %n.addr.112.i.3, 1 |
| + br i1 %cmp2.i.3, label %for.body3.i.3, label %foo.exit.3 |
| + |
| +foo.exit.3: ; preds = %for.body3.i.3 |
| + %mul.lcssa = phi x86_fp80 [ %mul.i.3, %for.body3.i.3 ] |
| + ret x86_fp80 %mul.lcssa |
| +} |