Skip to content

Commit 85be19d

Browse files
committed
[VPlan] Explicitly unroll replicate-regions without live-outs by VF.
This patch adds a new unrollReplicateRegions transform to unroll replicate regions by VF, dissolving them. The transform creates VF copies of each replicate region's content, connects them, and converts recipes to single-scalar variants for the corresponding lanes. The initial version skips regions with live-outs (VPPredInstPHIRecipe); support for them will be added in follow-up patches. Depends on #170053.
1 parent affdde9 commit 85be19d

File tree

59 files changed

+696
-765
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

59 files changed

+696
-765
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7316,6 +7316,10 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
73167316
BestVPlan);
73177317
VPlanTransforms::runPass(VPlanTransforms::materializeBroadcasts, BestVPlan);
73187318
VPlanTransforms::runPass(VPlanTransforms::replicateByVF, BestVPlan, BestVF);
7319+
VPlanTransforms::runPass(VPlanTransforms::unrollReplicateRegions, BestVPlan,
7320+
BestVF);
7321+
VPlanTransforms::runPass(VPlanTransforms::mergeBlocksIntoPredecessors,
7322+
BestVPlan);
73197323
bool HasBranchWeights =
73207324
hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator());
73217325
if (HasBranchWeights) {

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3813,8 +3813,9 @@ class LLVM_ABI_FOR_TEST VPScalarIVStepsRecipe : public VPRecipeWithIRFlags {
38133813
getOperand(0), getOperand(1), getOperand(2), InductionOpcode,
38143814
hasFastMathFlags() ? getFastMathFlags() : FastMathFlags(),
38153815
getDebugLoc());
3816-
if (getNumOperands() == 4)
3817-
NewR->addOperand(getOperand(3));
3816+
// Add start index operand, if present.
3817+
for (VPValue *Op : drop_begin(operands(), 3))
3818+
NewR->addOperand(Op);
38183819
return NewR;
38193820
}
38203821

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -497,7 +497,7 @@ static void addReplicateRegions(VPlan &Plan) {
497497

498498
/// Remove redundant VPBasicBlocks by merging them into their predecessor if
499499
/// the predecessor has a single successor.
500-
static bool mergeBlocksIntoPredecessors(VPlan &Plan) {
500+
bool VPlanTransforms::mergeBlocksIntoPredecessors(VPlan &Plan) {
501501
SmallVector<VPBasicBlock *> WorkList;
502502
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
503503
vp_depth_first_deep(Plan.getEntry()))) {

llvm/lib/Transforms/Vectorize/VPlanTransforms.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,10 @@ struct VPlanTransforms {
178178
/// replicate regions, thereby dissolving the latter.
179179
static void replicateByVF(VPlan &Plan, ElementCount VF);
180180

181+
/// Replace replicate regions by explicitly replicating the regions' contents
182+
/// \p VF times, each copy processing a single lane.
183+
static void unrollReplicateRegions(VPlan &Plan, ElementCount VF);
184+
181185
/// Optimize \p Plan based on \p BestVF and \p BestUF. This may restrict the
182186
/// resulting plan to \p BestVF and \p BestUF.
183187
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
@@ -189,6 +193,8 @@ struct VPlanTransforms {
189193
/// block merging.
190194
LLVM_ABI_FOR_TEST static void optimize(VPlan &Plan);
191195

196+
static bool mergeBlocksIntoPredecessors(VPlan &Plan);
197+
192198
/// Wrap predicated VPReplicateRecipes with a mask operand in an if-then
193199
/// region block and remove the mask operand. Optimize the created regions by
194200
/// iteratively sinking scalar operands into the region, followed by merging

llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp

Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,9 @@
2424
#include "llvm/ADT/ScopeExit.h"
2525
#include "llvm/Analysis/IVDescriptors.h"
2626
#include "llvm/IR/Intrinsics.h"
27+
#include "llvm/Support/Debug.h"
28+
29+
#define DEBUG_TYPE "vplan"
2730

2831
using namespace llvm;
2932
using namespace llvm::VPlanPatternMatch;
@@ -124,6 +127,7 @@ class UnrollState {
124127
R->setOperand(OpIdx, getValueForPart(Op, Part));
125128
}
126129
};
130+
127131
} // namespace
128132

129133
static void addStartIndexForScalarSteps(VPScalarIVStepsRecipe *Steps,
@@ -665,3 +669,176 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
665669
for (auto *R : reverse(ToRemove))
666670
R->eraseFromParent();
667671
}
672+
673+
/// Process recipes in a single lane's blocks, updating them for lane-specific
/// operations.
674+
/// For each block in \p RegionBlocks, the block it maps to in \p Old2NewBlocks
/// is updated: a VPScalarIVStepsRecipe gets \p Lane added to its operand 3
/// (non-zero lanes only), a recipe matching
/// m_ExtractElement(m_VPValue(), m_ZeroInt()) gets operand 1 set to the
/// constant \p Lane of type \p IdxTy, and operands that are values defined in
/// the original blocks visited so far are replaced by the positionally matching
/// values of the mapped blocks. \p VF is not referenced by this function.
675+
static void processLane(VPlan &Plan, Type *IdxTy, unsigned Lane,
676+
ElementCount VF, ArrayRef<VPBlockBase *> RegionBlocks,
677+
DenseMap<VPBlockBase *, VPBlockBase *> &Old2NewBlocks) {
678+
// Shared across all blocks, so later blocks can see values defined in earlier
// ones.
DenseMap<VPValue *, VPValue *> Old2NewVPValues;
679+
for (VPBlockBase *OldVPB : RegionBlocks) {
680+
auto *OldBB = cast<VPBasicBlock>(OldVPB);
681+
auto *NewBB = cast<VPBasicBlock>(Old2NewBlocks.lookup(OldVPB));
682+
// Pair recipes (and their defined values) of the old and new block by
// position. For lane 0 the caller maps each block to itself, so values map
// to themselves.
for (const auto &[OldR, NewR] : zip(*OldBB, *NewBB)) {
683+
for (const auto &[OldV, NewV] :
684+
zip(OldR.definedValues(), NewR.definedValues()))
685+
Old2NewVPValues[OldV] = NewV;
686+
}
687+
688+
// Update lane operands and remap operands to use copies for current lane.
689+
for (VPRecipeBase &NewR : make_early_inc_range(*NewBB)) {
690+
if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(&NewR)) {
691+
VPTypeAnalysis TypeInfo(Plan);
692+
if (Lane != 0) {
693+
Type *BaseIVTy = TypeInfo.inferScalarType(Steps->getOperand(0));
694+
// The Add is created at the builder's insert point, which is Steps.
// NOTE(review): operand 3 is read unconditionally here; this appears to
// rely on a start-index operand already being present - confirm.
VPBuilder Builder(Steps);
695+
Steps->setOperand(
696+
3, Builder.createNaryOp(Instruction::Add,
697+
{Steps->getOperand(3),
698+
Plan.getConstantInt(BaseIVTy, Lane)}));
699+
}
700+
701+
} else if (match(&NewR, m_ExtractElement(m_VPValue(), m_ZeroInt())))
702+
NewR.setOperand(1, Plan.getConstantInt(IdxTy, Lane));
703+
704+
// Remap operands to use lane-specific values.
705+
for (const auto &[I, Op] : enumerate(NewR.operands())) {
706+
// Use the mapped value if the operand has an entry in Old2NewVPValues;
// lookup() yields null for operands without an entry, which are kept.
707+
if (auto *New = Old2NewVPValues.lookup(Op))
708+
NewR.setOperand(I, New);
709+
}
710+
}
711+
}
712+
}
713+
714+
/// Process a single lane: clone blocks (or reuse original for lane 0), collect
/// value mappings, and process recipes for lane-specific operations.
715+
/// For lane 0 the blocks in \p RegionBlocks are rewritten in place; for any
/// other lane each block is cloned, the clone's parent is set to the parent of
/// \p Entry, and the clones are connected like the originals (successors of
/// \p Exiting are not mirrored). Afterwards processLane() is run on the lane's
/// blocks and the pair (lane entry, lane exiting) is appended to
/// \p LaneClones. \p IdxTy is the type used for extract-index constants.
/// NOTE(review): clones are taken from \p RegionBlocks as they are at call
/// time, so this appears to expect lane 0 to be processed before the other
/// lanes - confirm against the caller.
716+
static void processSingleLane(
717+
VPlan &Plan, Type *IdxTy, unsigned Lane, ElementCount VF,
718+
ArrayRef<VPBlockBase *> RegionBlocks, VPBlockBase *Entry,
719+
VPBlockBase *Exiting,
720+
SmallVectorImpl<std::pair<VPBlockBase *, VPBlockBase *>> &LaneClones) {
721+
DenseMap<VPBlockBase *, VPBlockBase *> Old2NewBlocks;
722+
if (Lane == 0) {
723+
// Lane 0 uses the original blocks, and the recipes are adjusted:
724+
// VPReplicateRecipes are converted to single-scalar ones, branch-on-mask is
725+
// converted into BranchOnCond and extracts are created as needed.
726+
for (VPBlockBase *VPB : RegionBlocks) {
727+
// Each original block maps to itself for lane 0.
Old2NewBlocks[VPB] = VPB;
728+
729+
for (VPRecipeBase &NewR :
730+
make_early_inc_range(*cast<VPBasicBlock>(VPB))) {
731+
VPBuilder Builder(&NewR);
732+
for (const auto &[I, Op] : enumerate(NewR.operands())) {
733+
// Skip operands that don't need extraction: scalar VF (no vectors),
734+
// values defined in the same block (already scalar), or values that
735+
// are already single scalars.
736+
if (VF.isScalar() ||
737+
(Op->getDefiningRecipe() &&
738+
Op->getDefiningRecipe()->getParent() == VPB) ||
739+
vputils::isSingleScalar(Op))
740+
continue;
741+
742+
// Extract the lane from every remaining operand, i.e. one that is not
// defined by a recipe in this block and is not a single scalar.
743+
VPValue *Idx = Plan.getConstantInt(IdxTy, Lane);
744+
VPValue *Extract = Builder.createNaryOp(
745+
Instruction::ExtractElement, {Op, Idx}, NewR.getDebugLoc());
746+
NewR.setOperand(I, Extract);
747+
}
748+
749+
if (auto *RepR = dyn_cast<VPReplicateRecipe>(&NewR)) {
750+
// Replace the replicate recipe by a single-scalar one without a mask,
// built from the same underlying instruction and operands.
auto *New = new VPReplicateRecipe(
751+
RepR->getUnderlyingInstr(), RepR->operands(),
752+
/*IsSingleScalar=*/true, /*Mask=*/nullptr, *RepR, *RepR,
753+
RepR->getDebugLoc());
754+
New->insertBefore(RepR);
755+
RepR->replaceAllUsesWith(New);
756+
RepR->eraseFromParent();
757+
} else if (auto *BranchOnMask = dyn_cast<VPBranchOnMaskRecipe>(&NewR)) {
758+
// Builder still points at NewR, so the BranchOnCond is created in front
// of the branch-on-mask recipe that is erased right after.
Builder.createNaryOp(VPInstruction::BranchOnCond,
759+
{BranchOnMask->getOperand(0)},
760+
BranchOnMask->getDebugLoc());
761+
BranchOnMask->eraseFromParent();
762+
} else if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(&NewR)) {
763+
VPTypeAnalysis TypeInfo(Plan);
764+
// Only add a start index (0 for lane 0) if the recipe has exactly three
// operands, i.e. none was added yet.
if (Steps->getNumOperands() == 3)
765+
addStartIndexForScalarSteps(Steps, 0, Plan, TypeInfo);
766+
}
767+
}
768+
}
769+
} else {
770+
// Clone blocks and connect them according to original structure.
771+
for (VPBlockBase *OrigBlock : RegionBlocks) {
772+
VPBlockBase *ClonedBlock = OrigBlock->clone();
773+
Old2NewBlocks[OrigBlock] = ClonedBlock;
774+
ClonedBlock->setParent(Entry->getParent());
775+
}
776+
for (VPBlockBase *OrigBlock : RegionBlocks) {
777+
// Successors of the exiting block are not mirrored; the caller connects
// the lane's exiting block itself.
if (OrigBlock == Exiting)
778+
continue;
779+
for (VPBlockBase *OrigSucc : OrigBlock->successors())
780+
VPBlockUtils::connectBlocks(Old2NewBlocks[OrigBlock],
781+
Old2NewBlocks[OrigSucc]);
782+
}
783+
}
784+
785+
processLane(Plan, IdxTy, Lane, VF, RegionBlocks, Old2NewBlocks);
786+
LaneClones.push_back({Old2NewBlocks[Entry], Old2NewBlocks[Exiting]});
787+
}
788+
789+
/// Dissolve replicate regions of \p Plan by emitting one copy of each region's
/// blocks per lane of \p VF and chaining the copies sequentially.
///
/// Candidate regions are the replicator VPRegionBlocks found by a shallow
/// depth-first walk starting at the vector loop region's entry. Regions whose
/// exiting block contains a VPPredInstPHIRecipe are left untouched. \p VF must
/// not be scalable when a replicate region is processed (asserted).
void VPlanTransforms::unrollReplicateRegions(VPlan &Plan, ElementCount VF) {
790+
// Collect all replicate regions in the plan before modifying the CFG.
791+
SmallVector<VPRegionBlock *> ReplicateRegions;
792+
// NOTE(review): Plan.getVectorLoopRegion() is dereferenced without a null
// check - confirm a vector loop region always exists when this runs.
for (VPBlockBase *Block :
793+
vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry())) {
794+
if (auto *Region = dyn_cast<VPRegionBlock>(Block)) {
795+
if (Region->isReplicator())
796+
ReplicateRegions.push_back(Region);
797+
}
798+
}
799+
800+
// 32-bit integer type used for the extract-index constants.
Type *IdxTy = IntegerType::get(Plan.getContext(), 32);
801+
802+
for (VPRegionBlock *Region : ReplicateRegions) {
803+
assert(!VF.isScalable() && "cannot replicate across scalable VFs");
804+
805+
VPBlockBase *Entry = Region->getEntry();
806+
VPBlockBase *Exiting = Region->getExiting();
807+
808+
// Skip regions with live-outs as packing scalar results back into vectors
809+
// is not yet implemented.
810+
if (any_of(*cast<VPBasicBlock>(Exiting), IsaPred<VPPredInstPHIRecipe>))
811+
continue;
812+
813+
// Get region context before dissolving.
814+
VPBlockBase *Pred = Region->getSinglePredecessor();
815+
assert(Pred && "Replicate region must have a single predecessor");
816+
// Copy the successor list, since the region is disconnected from them below.
SmallVector<VPBlockBase *> Successors(Region->successors());
817+
818+
// Disconnect and dissolve the region.
819+
VPBlockUtils::disconnectBlocks(Pred, Region);
820+
for (VPBlockBase *Succ : Successors)
821+
VPBlockUtils::disconnectBlocks(Region, Succ);
822+
823+
// Re-parent the region's blocks to the region's own parent and hook the
// entry block up to the former predecessor of the region.
SmallVector<VPBlockBase *> RegionBlocks(vp_depth_first_shallow(Entry));
824+
VPRegionBlock *ParentRegion = Region->getParent();
825+
for (VPBlockBase *Block : RegionBlocks)
826+
Block->setParent(ParentRegion);
827+
VPBlockUtils::connectBlocks(Pred, Entry);
828+
829+
// Process each lane: clone blocks, collect value mappings, and process
830+
// recipes for lane-specific operations.
831+
// LaneClones[Lane] holds the (entry, exiting) block pair of that lane.
SmallVector<std::pair<VPBlockBase *, VPBlockBase *>> LaneClones;
832+
for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) {
833+
processSingleLane(Plan, IdxTy, Lane, VF, RegionBlocks, Entry, Exiting,
834+
LaneClones);
835+
}
836+
837+
// Connect lanes sequentially and connect last lane to successors.
838+
for (unsigned Lane = 1; Lane < VF.getKnownMinValue(); ++Lane)
839+
VPBlockUtils::connectBlocks(LaneClones[Lane - 1].second,
840+
LaneClones[Lane].first);
841+
for (VPBlockBase *Succ : Successors)
842+
VPBlockUtils::connectBlocks(LaneClones.back().second, Succ);
843+
}
844+
}

llvm/test/Transforms/LoopVectorize/AArch64/blend-costs.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -199,8 +199,7 @@ define void @test_blend_feeding_replicated_store_2(ptr noalias %src, ptr %dst, i
199199
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <16 x i1> [[TMP7]], i32 0
200200
; CHECK-NEXT: br i1 [[TMP8]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
201201
; CHECK: [[PRED_STORE_IF]]:
202-
; CHECK-NEXT: [[TMP72:%.*]] = add i32 [[IV]], 0
203-
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[TMP72]]
202+
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[IV]]
204203
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x i8> [[PREDPHI]], i32 0
205204
; CHECK-NEXT: store i8 [[TMP10]], ptr [[TMP9]], align 1
206205
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]]

0 commit comments

Comments
 (0)