LLVM 23.0.0git
VPlanTransforms.cpp
Go to the documentation of this file.
1//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements a set of utility VPlan to VPlan transformations.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPlanTransforms.h"
15#include "VPRecipeBuilder.h"
16#include "VPlan.h"
17#include "VPlanAnalysis.h"
18#include "VPlanCFG.h"
19#include "VPlanDominatorTree.h"
20#include "VPlanHelpers.h"
21#include "VPlanPatternMatch.h"
22#include "VPlanUtils.h"
23#include "VPlanVerifier.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/STLExtras.h"
28#include "llvm/ADT/SetVector.h"
30#include "llvm/ADT/TypeSwitch.h"
38#include "llvm/IR/Intrinsics.h"
39#include "llvm/IR/MDBuilder.h"
40#include "llvm/IR/Metadata.h"
45
46using namespace llvm;
47using namespace VPlanPatternMatch;
48using namespace SCEVPatternMatch;
49
// NOTE(review): this listing was extracted from a documentation page; lines
// with gaps in the embedded numbering (e.g. 50, 53, 55, 69, 90) are missing
// from the extraction, including this function's name/signature line.
// From the visible body: walks the basic blocks of the plan's vector loop
// region and replaces each VPInstruction/VPPhi that has an underlying IR
// value with a concrete widened recipe (phi, load, store, GEP, intrinsic
// call, cast, or generic widen recipe). Returns false if a call has no
// vectorizable intrinsic ID; returns true otherwise.
51 VPlan &Plan, const TargetLibraryInfo &TLI) {
52
54 Plan.getVectorLoopRegion());
56 // Skip blocks outside region
57 if (!VPBB->getParent())
58 break;
59 VPRecipeBase *Term = VPBB->getTerminator();
60 auto EndIter = Term ? Term->getIterator() : VPBB->end();
61 // Introduce each ingredient into VPlan.
62 for (VPRecipeBase &Ingredient :
63 make_early_inc_range(make_range(VPBB->begin(), EndIter))) {
64
// Only recipes tied to an underlying IR value can be converted.
65 VPValue *VPV = Ingredient.getVPSingleValue();
66 if (!VPV->getUnderlyingValue())
67 continue;
68
70
// Dispatch on the underlying instruction kind (Inst, defined on a line
// lost in this extraction) to build the matching widened recipe.
71 VPRecipeBase *NewRecipe = nullptr;
72 if (auto *PhiR = dyn_cast<VPPhi>(&Ingredient)) {
73 auto *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
74 NewRecipe = new VPWidenPHIRecipe(Phi, nullptr, PhiR->getDebugLoc());
75 for (VPValue *Op : PhiR->operands())
76 NewRecipe->addOperand(Op);
77 } else if (auto *VPI = dyn_cast<VPInstruction>(&Ingredient)) {
78 assert(!isa<PHINode>(Inst) && "phis should be handled above");
79 // Create VPWidenMemoryRecipe for loads and stores.
80 if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
81 NewRecipe = new VPWidenLoadRecipe(
82 *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
83 false /*Consecutive*/, false /*Reverse*/, *VPI,
84 Ingredient.getDebugLoc());
85 } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
86 NewRecipe = new VPWidenStoreRecipe(
87 *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
88 nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/, *VPI,
89 Ingredient.getDebugLoc());
91 NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands(), *VPI,
92 Ingredient.getDebugLoc());
93 } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
// Calls can only be widened when a vector intrinsic exists; bail out
// of the whole conversion otherwise.
94 Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
95 if (VectorID == Intrinsic::not_intrinsic)
96 return false;
97 NewRecipe = new VPWidenIntrinsicRecipe(
98 *CI, getVectorIntrinsicIDForCall(CI, &TLI),
99 drop_end(Ingredient.operands()), CI->getType(), VPIRFlags(*CI),
100 *VPI, CI->getDebugLoc());
101 } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
102 NewRecipe = new VPWidenCastRecipe(
103 CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), CI,
104 VPIRFlags(*CI), VPIRMetadata(*CI));
105 } else {
106 NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands(), *VPI,
107 *VPI, Ingredient.getDebugLoc());
108 }
109 } else {
111 "inductions must be created earlier");
112 continue;
113 }
114
// Splice the new recipe in place of the old ingredient and redirect
// all uses before erasing it.
115 NewRecipe->insertBefore(&Ingredient);
116 if (NewRecipe->getNumDefinedValues() == 1)
117 VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
118 else
119 assert(NewRecipe->getNumDefinedValues() == 0 &&
120 "Only recpies with zero or one defined values expected");
121 Ingredient.eraseFromParent();
122 }
123 }
124 return true;
125}
126
127/// Helper for extra no-alias checks via known-safe recipe and SCEV.
/// NOTE(review): the class header line and some member declarations (e.g.
/// the PSE member, around original lines 128/131) are missing from this
/// extraction; members visible below are documented as-is.
// Recipes already known safe and therefore excluded from alias checks.
129 const SmallPtrSetImpl<VPRecipeBase *> &ExcludeRecipes;
// The store group leader sinking is being attempted for.
130 VPReplicateRecipe &GroupLeader;
132 const Loop &L;
133 VPTypeAnalysis &TypeInfo;
134
135 // Return true if \p A and \p B are known to not alias for all VFs in the
136 // plan, checked via the distance between the accesses
137 bool isNoAliasViaDistance(VPReplicateRecipe *A, VPReplicateRecipe *B) const {
// Only store/store pairs are handled.
138 if (A->getOpcode() != Instruction::Store ||
139 B->getOpcode() != Instruction::Store)
140 return false;
141
// Operand 1 of a replicated store is its address.
142 VPValue *AddrA = A->getOperand(1);
143 const SCEV *SCEVA = vputils::getSCEVExprForVPValue(AddrA, PSE, &L);
144 VPValue *AddrB = B->getOperand(1);
145 const SCEV *SCEVB = vputils::getSCEVExprForVPValue(AddrB, PSE, &L);
147 return false;
148
// The distance between the two addresses must be a compile-time constant.
149 const APInt *Distance;
150 ScalarEvolution &SE = *PSE.getSE();
151 if (!match(SE.getMinusSCEV(SCEVA, SCEVB), m_scev_APInt(Distance)))
152 return false;
153
154 const DataLayout &DL = SE.getDataLayout();
155 Type *TyA = TypeInfo.inferScalarType(A->getOperand(0));
156 uint64_t SizeA = DL.getTypeStoreSize(TyA);
157 Type *TyB = TypeInfo.inferScalarType(B->getOperand(0));
158 uint64_t SizeB = DL.getTypeStoreSize(TyB);
159
160 // Use the maximum store size to ensure no overlap from either direction.
161 // Currently only handles fixed sizes, as it is only used for
162 // replicating VPReplicateRecipes.
163 uint64_t MaxStoreSize = std::max(SizeA, SizeB);
164
// No-alias holds when |distance| >= MaxVF * MaxStoreSize for the largest
// fixed VF in the plan; scalable VFs are rejected conservatively.
165 auto VFs = B->getParent()->getPlan()->vectorFactors();
167 if (MaxVF.isScalable())
168 return false;
169 return Distance->abs().uge(
170 MaxVF.multiplyCoefficientBy(MaxStoreSize).getFixedValue());
171 }
172
173public:
176 const Loop &L, VPTypeAnalysis &TypeInfo)
177 : ExcludeRecipes(ExcludeRecipes), GroupLeader(GroupLeader), PSE(PSE),
178 L(L), TypeInfo(TypeInfo) {}
179
180 /// Return true if \p R should be skipped during alias checking, either
181 /// because it's in the exclude set or because no-alias can be proven via
182 /// SCEV.
183 bool shouldSkip(VPRecipeBase &R) const {
184 auto *Store = dyn_cast<VPReplicateRecipe>(&R);
185 return ExcludeRecipes.contains(&R) ||
186 (Store && isNoAliasViaDistance(Store, &GroupLeader));
187 }
188};
189
190/// Check if a memory operation doesn't alias with memory operations in blocks
191/// between \p FirstBB and \p LastBB using scoped noalias metadata. If
192/// \p SinkInfo is std::nullopt, only recipes that may write to memory are
193/// checked (for load hoisting). Otherwise recipes that both read and write
194/// memory are checked, and SCEV is used to prove no-alias between the group
195/// leader and other replicate recipes (for store sinking).
/// Returns true only if every relevant recipe in the chain can be proven
/// not to alias \p MemLoc. NOTE(review): the parameter list's first line
/// (original line 197, carrying the function name and MemLoc parameter) is
/// missing from this extraction.
196static bool
198 VPBasicBlock *FirstBB, VPBasicBlock *LastBB,
199 std::optional<SinkStoreInfo> SinkInfo = {}) {
200 bool CheckReads = SinkInfo.has_value();
// Without noalias scope metadata nothing can be proven; be conservative.
201 if (!MemLoc.AATags.Scope)
202 return false;
203
// Walk the single-successor chain from FirstBB to LastBB.
204 for (VPBlockBase *Block = FirstBB; Block;
205 Block = Block->getSingleSuccessor()) {
206 assert(Block->getNumSuccessors() <= 1 &&
207 "Expected at most one successor in block chain");
208 auto *VPBB = cast<VPBasicBlock>(Block);
209 for (VPRecipeBase &R : *VPBB) {
// Recipes proven safe via the sink-info (exclude set or SCEV distance)
// need no metadata check.
210 if (SinkInfo && SinkInfo->shouldSkip(R))
211 continue;
212
213 // Skip recipes that don't need checking.
214 if (!R.mayWriteToMemory() && !(CheckReads && R.mayReadFromMemory()))
215 continue;
216
218 if (!Loc)
219 // Conservatively assume aliasing for memory operations without
220 // location.
221 return false;
222
224 return false;
225 }
226
227 if (Block == LastBB)
228 break;
229 }
230 return true;
231}
232
233/// Return true if we do not know how to (mechanically) hoist or sink \p R out
234/// of a loop region.
/// NOTE(review): the signature line (original line 235) and the assume-check
/// condition (line 238) are missing from this extraction.
236 // Assumes don't alias anything or throw; as long as they're guaranteed to
237 // execute, they're safe to hoist.
239 return false;
240
241 // TODO: Relax checks in the future, e.g. we could also hoist reads, if their
242 // memory location is not modified in the vector loop.
243 if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi())
244 return true;
245
246 // Allocas cannot be hoisted.
247 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
248 return RepR && RepR->getOpcode() == Instruction::Alloca;
249}
250
// Sink scalar-producing recipes into the replicate blocks that use them, so
// their results are only computed under the block's predicate. Returns true
// if any recipe was moved (callers iterate to a fixed point).
251static bool sinkScalarOperands(VPlan &Plan) {
252 auto Iter = vp_depth_first_deep(Plan.getEntry());
253 bool ScalarVFOnly = Plan.hasScalarVFOnly();
254 bool Changed = false;
255
// Lambda: queue Op's defining recipe for sinking into SinkTo if it is a
// kind of recipe we know how to sink safely.
257 auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList](
258 VPBasicBlock *SinkTo, VPValue *Op) {
259 auto *Candidate =
260 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe());
261 if (!Candidate)
262 return;
263
264 // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes
265 // for now.
267 return;
268
269 if (Candidate->getParent() == SinkTo || cannotHoistOrSinkRecipe(*Candidate))
270 return;
271
// Single-scalar replicates stay put unless the plan is scalar-VF-only.
272 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate))
273 if (!ScalarVFOnly && RepR->isSingleScalar())
274 return;
275
276 WorkList.insert({SinkTo, Candidate});
277 };
278
279 // First, collect the operands of all recipes in replicate blocks as seeds for
280 // sinking.
282 VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
283 if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
284 continue;
285 VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front());
286 if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
287 continue;
288 for (auto &Recipe : *VPBB)
289 for (VPValue *Op : Recipe.operands())
290 InsertIfValidSinkCandidate(VPBB, Op);
291 }
292
293 // Try to sink each replicate or scalar IV steps recipe in the worklist.
// Note: WorkList may grow while iterating (operands of moved recipes are
// appended), so the index-based loop is intentional.
294 for (unsigned I = 0; I != WorkList.size(); ++I) {
295 VPBasicBlock *SinkTo;
296 VPSingleDefRecipe *SinkCandidate;
297 std::tie(SinkTo, SinkCandidate) = WorkList[I];
298
299 // All recipe users of SinkCandidate must be in the same block SinkTo or all
300 // users outside of SinkTo must only use the first lane of SinkCandidate. In
301 // the latter case, we need to duplicate SinkCandidate.
302 auto UsersOutsideSinkTo =
303 make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) {
304 return cast<VPRecipeBase>(U)->getParent() != SinkTo;
305 });
306 if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) {
307 return !U->usesFirstLaneOnly(SinkCandidate);
308 }))
309 continue;
310 bool NeedsDuplicating = !UsersOutsideSinkTo.empty();
311
312 if (NeedsDuplicating) {
313 if (ScalarVFOnly)
314 continue;
315 VPSingleDefRecipe *Clone;
316 if (auto *SinkCandidateRepR =
317 dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
318 // TODO: Handle converting to uniform recipes as separate transform,
319 // then cloning should be sufficient here.
320 Instruction *I = SinkCandidate->getUnderlyingInstr();
321 Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,
322 nullptr /*Mask*/, *SinkCandidateRepR,
323 *SinkCandidateRepR);
324 // TODO: add ".cloned" suffix to name of Clone's VPValue.
325 } else {
326 Clone = SinkCandidate->clone();
327 }
328
// The clone stays behind for first-lane-only users outside SinkTo.
329 Clone->insertBefore(SinkCandidate);
330 SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
331 return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
332 });
333 }
334 SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
// The moved recipe's operands become new sink candidates.
335 for (VPValue *Op : SinkCandidate->operands())
336 InsertIfValidSinkCandidate(SinkTo, Op);
337 Changed = true;
338 }
339 return Changed;
340}
341
342/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
343/// the mask.
/// Returns nullptr when the entry is not a single-recipe block holding a
/// VPBranchOnMaskRecipe. NOTE(review): the signature line (original line
/// 344) is missing from this extraction.
345 auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
346 if (!EntryBB || EntryBB->size() != 1 ||
347 !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
348 return nullptr;
349
350 return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
351}
352
353/// If \p R is a triangle region, return the 'then' block of the triangle.
/// A triangle: entry has two successors, exactly one of which jumps to the
/// other (the merge). Returns nullptr for any other shape. NOTE(review):
/// the signature line (original line 354) is missing from this extraction.
355 auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
356 if (EntryBB->getNumSuccessors() != 2)
357 return nullptr;
358
359 auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
360 auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
361 if (!Succ0 || !Succ1)
362 return nullptr;
363
// Exactly one of the successors may itself have a successor (the 'then'
// block); the other is the merge block.
364 if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
365 return nullptr;
366 if (Succ0->getSingleSuccessor() == Succ1)
367 return Succ0;
368 if (Succ1->getSingleSuccessor() == Succ0)
369 return Succ1;
370 return nullptr;
371}
372
373// Merge replicate regions in their successor region, if a replicate region
374// is connected to a successor replicate region with the same predicate by a
375// single, empty VPBasicBlock.
// Returns true if at least one region was merged. NOTE(review): the
// function signature (original line 376) is missing from this extraction.
377 SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
378
379 // Collect replicate regions followed by an empty block, followed by another
380 // replicate region with matching masks to process front. This is to avoid
381 // iterator invalidation issues while merging regions.
384 vp_depth_first_deep(Plan.getEntry()))) {
385 if (!Region1->isReplicator())
386 continue;
387 auto *MiddleBasicBlock =
388 dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
389 if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
390 continue;
391
392 auto *Region2 =
393 dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
394 if (!Region2 || !Region2->isReplicator())
395 continue;
396
// Both regions must be guarded by the exact same mask value.
397 VPValue *Mask1 = getPredicatedMask(Region1);
398 VPValue *Mask2 = getPredicatedMask(Region2);
399 if (!Mask1 || Mask1 != Mask2)
400 continue;
401
402 assert(Mask1 && Mask2 && "both region must have conditions");
403 WorkList.push_back(Region1);
404 }
405
406 // Move recipes from Region1 to its successor region, if both are triangles.
407 for (VPRegionBlock *Region1 : WorkList) {
408 if (TransformedRegions.contains(Region1))
409 continue;
410 auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
411 auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
412
413 VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
414 VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
415 if (!Then1 || !Then2)
416 continue;
417
418 // Note: No fusion-preventing memory dependencies are expected in either
419 // region. Such dependencies should be rejected during earlier dependence
420 // checks, which guarantee accesses can be re-ordered for vectorization.
421 //
422 // Move recipes to the successor region.
423 for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
424 ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());
425
426 auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
427 auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());
428
429 // Move VPPredInstPHIRecipes from the merge block to the successor region's
430 // merge block. Update all users inside the successor region to use the
431 // original values.
432 for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
433 VPValue *PredInst1 =
434 cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
435 VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
436 Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
437 return cast<VPRecipeBase>(&U)->getParent() == Then2;
438 });
439
440 // Remove phi recipes that are unused after merging the regions.
441 if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
442 Phi1ToMove.eraseFromParent();
443 continue;
444 }
445 Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
446 }
447
448 // Remove the dead recipes in Region1's entry block.
449 for (VPRecipeBase &R :
450 make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
451 R.eraseFromParent();
452
453 // Finally, remove the first region.
454 for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
455 VPBlockUtils::disconnectBlocks(Pred, Region1);
456 VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
457 }
458 VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
459 TransformedRegions.insert(Region1);
460 }
461
462 return !TransformedRegions.empty();
463}
464
// Wrap a predicated VPReplicateRecipe in a triangular if-then replicate
// region: entry (branch-on-mask) -> "if" block (unmasked replicate) ->
// "continue" block (VPPredInstPHIRecipe merging the result). Returns the new
// region. NOTE(review): the first line of the signature (original line 465,
// naming the function and the PredRecipe parameter) is missing from this
// extraction.
466 VPlan &Plan) {
467 Instruction *Instr = PredRecipe->getUnderlyingInstr();
468 // Build the triangular if-then region.
469 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
470 assert(Instr->getParent() && "Predicated instruction not in any basic block");
471 auto *BlockInMask = PredRecipe->getMask();
472 auto *MaskDef = BlockInMask->getDefiningRecipe();
473 auto *BOMRecipe = new VPBranchOnMaskRecipe(
474 BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
475 auto *Entry =
476 Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
477
478 // Replace predicated replicate recipe with a replicate recipe without a
479 // mask but in the replicate region.
480 auto *RecipeWithoutMask = new VPReplicateRecipe(
481 PredRecipe->getUnderlyingInstr(), drop_end(PredRecipe->operands()),
482 PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe, *PredRecipe,
483 PredRecipe->getDebugLoc());
484 auto *Pred =
485 Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
486
// A merge phi is only needed when the predicated result has users.
487 VPPredInstPHIRecipe *PHIRecipe = nullptr;
488 if (PredRecipe->getNumUsers() != 0) {
489 PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
490 RecipeWithoutMask->getDebugLoc());
491 PredRecipe->replaceAllUsesWith(PHIRecipe);
492 PHIRecipe->setOperand(0, RecipeWithoutMask);
493 }
494 PredRecipe->eraseFromParent();
495 auto *Exiting =
496 Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
498 Plan.createReplicateRegion(Entry, Exiting, RegionName);
499
500 // Note: first set Entry as region entry and then connect successors starting
501 // from it in order, to propagate the "parent" of each VPBasicBlock.
502 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
503 VPBlockUtils::connectBlocks(Pred, Exiting);
504
505 return Region;
506}
507
// Collect all predicated VPReplicateRecipes up front, then wrap each one in
// its own if-then replicate region (splitting its block at the recipe).
// Collecting first avoids iterator invalidation while the CFG is rewritten.
508static void addReplicateRegions(VPlan &Plan) {
511 vp_depth_first_deep(Plan.getEntry()))) {
512 for (VPRecipeBase &R : *VPBB)
513 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
514 if (RepR->isPredicated())
515 WorkList.push_back(RepR);
516 }
517 }
518
519 unsigned BBNum = 0;
520 for (VPReplicateRecipe *RepR : WorkList) {
521 VPBasicBlock *CurrentBlock = RepR->getParent();
// Split so the recipe starts its own block, which becomes the region body.
522 VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());
523
// Name the continuation block after the original IR block, if it has one.
524 BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
525 SplitBlock->setName(
526 OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
527 // Record predicated instructions for above packing optimizations.
529 Region->setParent(CurrentBlock->getParent());
531
// If the split block replaced the region's exiting block, fix that up.
532 VPRegionBlock *ParentRegion = Region->getParent();
533 if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
534 ParentRegion->setExiting(SplitBlock);
535 }
536}
537
538/// Remove redundant VPBasicBlocks by merging them into their predecessor if
539/// the predecessor has a single successor.
/// Returns true if any block was merged. NOTE(review): the signature lines
/// (original lines 540-542) are missing from this extraction.
543 vp_depth_first_deep(Plan.getEntry()))) {
544 // Don't fold the blocks in the skeleton of the Plan into their single
545 // predecessors for now.
546 // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
547 if (!VPBB->getParent())
548 continue;
549 auto *PredVPBB =
550 dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
551 if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
552 isa<VPIRBasicBlock>(PredVPBB))
553 continue;
554 WorkList.push_back(VPBB);
555 }
556
557 for (VPBasicBlock *VPBB : WorkList) {
558 VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
// Move the block's recipes into the predecessor, then rewire the CFG.
559 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
560 R.moveBefore(*PredVPBB, PredVPBB->end());
561 VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
562 auto *ParentRegion = VPBB->getParent();
563 if (ParentRegion && ParentRegion->getExiting() == VPBB)
564 ParentRegion->setExiting(PredVPBB);
565 for (auto *Succ : to_vector(VPBB->successors())) {
567 VPBlockUtils::connectBlocks(PredVPBB, Succ);
568 }
569 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
570 }
571 return !WorkList.empty();
572}
573
// Build replicate regions for all predicated replicate recipes, then run
// sinking and region/block merging to a fixed point. NOTE(review): the
// function signature (original line 574) and the addReplicateRegions call
// (line 576) are missing from this extraction.
575 // Convert masked VPReplicateRecipes to if-then region blocks.
577
// Iterate the three simplifications until none of them makes progress.
578 bool ShouldSimplify = true;
579 while (ShouldSimplify) {
580 ShouldSimplify = sinkScalarOperands(Plan);
581 ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
582 ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
583 }
584}
585
586/// Remove redundant casts of inductions.
587///
588/// Such redundant casts are casts of induction variables that can be ignored,
589/// because we already proved that the casted phi is equal to the uncasted phi
590/// in the vectorized loop. There is no need to vectorize the cast - the same
591/// value can be used for both the phi and casts in the vector loop.
/// NOTE(review): the signature line (original line 592) and the IV
/// initialization (line 594) are missing from this extraction.
593 for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
// Truncated IVs are handled elsewhere; skip them here.
595 if (!IV || IV->getTruncInst())
596 continue;
597
598 // A sequence of IR Casts has potentially been recorded for IV, which
599 // *must be bypassed* when the IV is vectorized, because the vectorized IV
600 // will produce the desired casted value. This sequence forms a def-use
601 // chain and is provided in reverse order, ending with the cast that uses
602 // the IV phi. Search for the recipe of the last cast in the chain and
603 // replace it with the original IV. Note that only the final cast is
604 // expected to have users outside the cast-chain and the dead casts left
605 // over will be cleaned up later.
606 ArrayRef<Instruction *> Casts = IV->getInductionDescriptor().getCastInsts();
607 VPValue *FindMyCast = IV;
608 for (Instruction *IRCast : reverse(Casts)) {
// Find the recipe among FindMyCast's users whose underlying value is
// this cast instruction; it becomes the next link in the chain.
609 VPSingleDefRecipe *FoundUserCast = nullptr;
610 for (auto *U : FindMyCast->users()) {
611 auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
612 if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
613 FoundUserCast = UserCast;
614 break;
615 }
616 }
617 FindMyCast = FoundUserCast;
618 }
// Redirect users of the final cast in the chain to the IV itself.
619 FindMyCast->replaceAllUsesWith(IV);
620 }
621}
622
623/// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
624/// recipe, if it exists.
/// NOTE(review): the signature line (original line 625) and the user
/// dyn_cast (line 630) are missing from this extraction.
626 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
627 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
// Find a VPWidenCanonicalIVRecipe among the canonical IV's users, if any.
628 VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
629 for (VPUser *U : CanonicalIV->users()) {
631 if (WidenNewIV)
632 break;
633 }
634
635 if (!WidenNewIV)
636 return;
637
// Look for an existing widened induction in the header that is canonical
// (starts at 0, steps by 1) and can stand in for the new widened IV.
638 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
639 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
640 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
641
642 if (!WidenOriginalIV || !WidenOriginalIV->isCanonical())
643 continue;
644
645 // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
646 // everything WidenNewIV's users need. That is, WidenOriginalIV will
647 // generate a vector phi or all users of WidenNewIV demand the first lane
648 // only.
649 if (Plan.hasScalarVFOnly() ||
650 !vputils::onlyScalarValuesUsed(WidenOriginalIV) ||
651 vputils::onlyFirstLaneUsed(WidenNewIV)) {
652 // We are replacing a wide canonical iv with a suitable wide induction.
653 // This is used to compute header mask, hence all lanes will be used and
654 // we need to drop wrap flags only applying to lanes guaranteed to
655 // execute in the original scalar loop.
656 WidenOriginalIV->dropPoisonGeneratingFlags();
657 WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
658 WidenNewIV->eraseFromParent();
659 return;
660 }
661 }
662}
663
664/// Returns true if \p R is dead and can be removed.
/// NOTE(review): the second half of the conditional-assume check (original
/// line 670, matching the assume intrinsic) is missing from this extraction.
665static bool isDeadRecipe(VPRecipeBase &R) {
666 // Do remove conditional assume instructions as their conditions may be
667 // flattened.
668 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
669 bool IsConditionalAssume = RepR && RepR->isPredicated() &&
671 if (IsConditionalAssume)
672 return true;
673
// Side-effecting recipes must be kept even if their values are unused.
674 if (R.mayHaveSideEffects())
675 return false;
676
677 // Recipe is dead if no user keeps the recipe alive.
678 return all_of(R.definedValues(),
679 [](VPValue *V) { return V->getNumUsers() == 0; });
680}
681
// Erase dead recipes across the plan, visiting blocks in deep post-order and
// each block's recipes in reverse so chains of dead recipes are caught in a
// single pass. Also removes dead VPPhi <-> update cycles. NOTE(review): the
// function signature (original lines 682-683) is missing from this
// extraction.
684 vp_post_order_deep(Plan.getEntry()))) {
685 // The recipes in the block are processed in reverse order, to catch chains
686 // of dead recipes.
687 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
688 if (isDeadRecipe(R)) {
689 R.eraseFromParent();
690 continue;
691 }
692
693 // Check if R is a dead VPPhi <-> update cycle and remove it.
694 auto *PhiR = dyn_cast<VPPhi>(&R);
695 if (!PhiR || PhiR->getNumOperands() != 2)
696 continue;
697 VPUser *PhiUser = PhiR->getSingleUser();
698 if (!PhiUser)
699 continue;
// The cycle is dead only if the phi's single user is its own incoming
// (backedge) value's definition and that value has no other users.
700 VPValue *Incoming = PhiR->getOperand(1);
701 if (PhiUser != Incoming->getDefiningRecipe() ||
702 Incoming->getNumUsers() != 1)
703 continue;
704 PhiR->replaceAllUsesWith(PhiR->getOperand(0));
705 PhiR->eraseFromParent();
706 Incoming->getDefiningRecipe()->eraseFromParent();
707 }
708 }
709}
710
// Build a scalar-steps chain for an induction: derive a base IV from the
// canonical IV, truncate base and step to the requested width if needed, and
// return the VPScalarIVSteps recipe. NOTE(review): the first lines of the
// signature (original lines 711-712, with the function name, Plan and Kind
// parameters) are missing from this extraction.
713 Instruction::BinaryOps InductionOpcode,
714 FPMathOperator *FPBinOp, Instruction *TruncI,
715 VPIRValue *StartV, VPValue *Step, DebugLoc DL,
716 VPBuilder &Builder) {
717 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
718 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
719 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
720 VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(
721 Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx");
722
723 // Truncate base induction if needed.
724 VPTypeAnalysis TypeInfo(Plan);
725 Type *ResultTy = TypeInfo.inferScalarType(BaseIV);
726 if (TruncI) {
727 Type *TruncTy = TruncI->getType();
728 assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
729 "Not truncating.");
730 assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
731 BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
732 ResultTy = TruncTy;
733 }
734
735 // Truncate step if needed.
736 Type *StepTy = TypeInfo.inferScalarType(Step);
737 if (ResultTy != StepTy) {
738 assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
739 "Not truncating.");
740 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
// The step is loop-invariant, so its truncation is emitted in the vector
// preheader rather than inside the loop.
741 auto *VecPreheader =
743 VPBuilder::InsertPointGuard Guard(Builder);
744 Builder.setInsertPoint(VecPreheader);
745 Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
746 }
747 return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
748 &Plan.getVF(), DL);
749}
750
// Collect the transitive users of a value as a worklist-driven closure,
// stopping at header phis to avoid cycling around the loop. NOTE(review):
// the function signature and worklist initialization (original lines
// 751-752, 754) are missing from this extraction.
753 for (unsigned I = 0; I != Users.size(); ++I) {
755 if (isa<VPHeaderPHIRecipe>(Cur))
756 continue;
// Enqueue users of every value the current recipe defines; the SetVector
// deduplicates, so each user is visited at most once.
757 for (VPValue *V : Cur->definedValues())
758 Users.insert_range(V->users());
759 }
760 return Users.takeVector();
761}
762
763/// Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd
764/// (IndStart, ScalarIVSteps (0, Step)). This is used when the recipe only
765/// generates scalar values.
/// NOTE(review): the parameter line naming PtrIV (original line 767) and
/// the Steps initialization (line 772) are missing from this extraction.
766static VPValue *
768 VPlan &Plan, VPBuilder &Builder) {
// Build integer scalar steps starting at 0 with the pointer IV's step.
770 VPIRValue *StartV = Plan.getZero(ID.getStep()->getType());
771 VPValue *StepV = PtrIV->getOperand(1);
773 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
774 nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);
775
// The per-lane pointer is the induction start plus the scalar steps.
776 return Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
777 PtrIV->getDebugLoc(), "next.gep");
778}
779
780/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
781/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
782/// VPWidenPointerInductionRecipe will generate vectors only. If some users
783/// require vectors while other require scalars, the scalar uses need to extract
784/// the scalars from the generated vectors (Note that this is different to how
785/// int/fp inductions are handled). Legalize extract-from-ends using uniform
786/// VPReplicateRecipe of wide inductions to use regular VPReplicateRecipe, so
787/// the correct end value is available. Also optimize
788/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
789/// providing them scalar steps built on the canonical scalar IV and update the
790/// original IV's users. This is an optional optimization to reduce the needs of
791/// vector extracts.
/// NOTE(review): the function signature and HeaderVPBB initialization
/// (original lines 792-793) are missing from this extraction.
794 bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
795 VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
796 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
797 auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
798 if (!PhiR)
799 continue;
800
801 // Try to narrow wide and replicating recipes to uniform recipes, based on
802 // VPlan analysis.
803 // TODO: Apply to all recipes in the future, to replace legacy uniformity
804 // analysis.
805 auto Users = collectUsersRecursively(PhiR);
806 for (VPUser *U : reverse(Users)) {
807 auto *Def = dyn_cast<VPRecipeWithIRFlags>(U);
808 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
809 // Skip recipes that shouldn't be narrowed.
810 if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
811 Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
812 (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
813 continue;
814
815 // Skip recipes that may have other lanes than their first used.
817 continue;
818
// Replace the widened recipe with a uniform (single-scalar) replicate.
819 auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
820 Def->operands(), /*IsUniform*/ true,
821 /*Mask*/ nullptr, /*Flags*/ *Def);
822 Clone->insertAfter(Def);
823 Def->replaceAllUsesWith(Clone);
824 }
825
826 // Replace wide pointer inductions which have only their scalars used by
827 // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
828 if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
829 if (!Plan.hasScalarVFOnly() &&
830 !PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
831 continue;
832
833 VPValue *PtrAdd = scalarizeVPWidenPointerInduction(PtrIV, Plan, Builder);
834 PtrIV->replaceAllUsesWith(PtrAdd);
835 continue;
836 }
837
838 // Replace widened induction with scalar steps for users that only use
839 // scalars.
840 auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
841 if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
842 return U->usesScalars(WideIV);
843 }))
844 continue;
845
846 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
848 Plan, ID.getKind(), ID.getInductionOpcode(),
849 dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
850 WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
851 WideIV->getDebugLoc(), Builder);
852
853 // Update scalar users of IV to use Step instead.
854 if (!HasOnlyVectorVFs) {
855 assert(!Plan.hasScalableVF() &&
856 "plans containing a scalar VF cannot also include scalable VFs");
857 WideIV->replaceAllUsesWith(Steps);
858 } else {
// With scalable VFs only first-lane users may be redirected; otherwise
// redirect every user that only consumes scalar values.
859 bool HasScalableVF = Plan.hasScalableVF();
860 WideIV->replaceUsesWithIf(Steps,
861 [WideIV, HasScalableVF](VPUser &U, unsigned) {
862 if (HasScalableVF)
863 return U.usesFirstLaneOnly(WideIV);
864 return U.usesScalars(WideIV);
865 });
866 }
867 }
868}
869
870/// Check if \p VPV is an untruncated wide induction, either before or after the
871/// increment. If so return the header IV (before the increment), otherwise
872/// return null.
/// NOTE(review): the signature lines (original lines 873-874) are missing
/// from this extraction.
875 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);
876 if (WideIV) {
877 // VPV itself is a wide induction, separately compute the end value for exit
878 // users if it is not a truncated IV.
879 auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
880 return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
881 }
882
883 // Check if VPV is an optimizable induction increment.
// The increment must be a two-operand recipe with a wide induction on
// either side.
884 VPRecipeBase *Def = VPV->getDefiningRecipe();
885 if (!Def || Def->getNumOperands() != 2)
886 return nullptr;
887 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
888 if (!WideIV)
889 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
890 if (!WideIV)
891 return nullptr;
892
893 auto IsWideIVInc = [&]() {
894 auto &ID = WideIV->getInductionDescriptor();
895
896 // Check if VPV increments the induction by the induction step.
897 VPValue *IVStep = WideIV->getStepValue();
898 switch (ID.getInductionOpcode()) {
899 case Instruction::Add:
900 return match(VPV, m_c_Add(m_Specific(WideIV), m_Specific(IVStep)));
901 case Instruction::FAdd:
902 return match(VPV, m_c_FAdd(m_Specific(WideIV), m_Specific(IVStep)));
903 case Instruction::FSub:
904 return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
905 m_Specific(IVStep)));
906 case Instruction::Sub: {
907 // IVStep will be the negated step of the subtraction. Check if Step == -1
908 // * IVStep.
909 VPValue *Step;
910 if (!match(VPV, m_Sub(m_VPValue(), m_VPValue(Step))))
911 return false;
// Use SCEV to verify the subtracted step is exactly the negation of
// the induction step.
912 const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(IVStep, PSE);
913 const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(Step, PSE);
914 ScalarEvolution &SE = *PSE.getSE();
915 return !isa<SCEVCouldNotCompute>(IVStepSCEV) &&
916 !isa<SCEVCouldNotCompute>(StepSCEV) &&
917 IVStepSCEV == SE.getNegativeSCEV(StepSCEV);
918 }
919 default:
// Pointer inductions increment via a GEP of the IV by its step.
920 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
921 match(VPV, m_GetElementPtr(m_Specific(WideIV),
922 m_Specific(WideIV->getStepValue())));
923 }
924 llvm_unreachable("should have been covered by switch above");
925 };
926 return IsWideIVInc() ? WideIV : nullptr;
927}
928
929/// Attempts to optimize the induction variable exit values for users in the
930/// early exit block.
// Returns the computed scalar end value to use for the exit user, or nullptr
// if the value is not an optimizable untruncated wide induction.
// NOTE(review): the signature and the initial match (extracting \p Incoming
// and \p Mask from \p Op) are elided in this listing -- confirm against the
// full source.
932 VPTypeAnalysis &TypeInfo,
933 VPBlockBase *PredVPBB,
934 VPValue *Op,
936 VPValue *Incoming, *Mask;
939 return nullptr;
940
941 auto *WideIV = getOptimizableIVOf(Incoming, PSE);
942 if (!WideIV)
943 return nullptr;
944
945 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
946 if (WideIntOrFp && WideIntOrFp->getTruncInst())
947 return nullptr;
948
949 // Calculate the final index.
// The early-exit iteration is canonical-IV + index of the first active lane
// in the exit mask.
950 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
951 auto *CanonicalIV = LoopRegion->getCanonicalIV();
952 Type *CanonicalIVType = LoopRegion->getCanonicalIVType();
953 VPBuilder B(cast<VPBasicBlock>(PredVPBB));
954
955 DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc();
956 VPValue *FirstActiveLane =
957 B.createNaryOp(VPInstruction::FirstActiveLane, Mask, DL);
958 Type *FirstActiveLaneType = TypeInfo.inferScalarType(FirstActiveLane);
959 FirstActiveLane = B.createScalarZExtOrTrunc(FirstActiveLane, CanonicalIVType,
960 FirstActiveLaneType, DL);
961 VPValue *EndValue = B.createAdd(CanonicalIV, FirstActiveLane, DL);
962
963 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
964 // changed it means the exit is using the incremented value, so we need to
965 // add the step.
966 if (Incoming != WideIV) {
967 VPValue *One = Plan.getConstantInt(CanonicalIVType, 1);
968 EndValue = B.createAdd(EndValue, One, DL);
969 }
970
// Non-canonical inductions need the iteration index mapped back onto the
// induction's own start/step sequence via a derived IV.
971 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
972 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
973 VPIRValue *Start = WideIV->getStartValue();
974 VPValue *Step = WideIV->getStepValue();
975 EndValue = B.createDerivedIV(
976 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
977 Start, EndValue, Step);
978 }
979
980 return EndValue;
981}
982
983/// Compute the end value for \p WideIV, unless it is truncated. Creates a
984/// VPDerivedIVRecipe for non-canonical inductions.
// Returns nullptr for truncated wide inductions; otherwise the scalar end
// value (derived from \p VectorTC), truncated if needed to match the IV type.
// NOTE(review): the function name and first parameter are elided in this
// listing -- confirm against the full source.
986 VPBuilder &VectorPHBuilder,
987 VPTypeAnalysis &TypeInfo,
988 VPValue *VectorTC) {
989 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
990 // Truncated wide inductions resume from the last lane of their vector value
991 // in the last vector iteration which is handled elsewhere.
992 if (WideIntOrFp && WideIntOrFp->getTruncInst())
993 return nullptr;
994
995 VPIRValue *Start = WideIV->getStartValue();
996 VPValue *Step = WideIV->getStepValue();
998 VPValue *EndValue = VectorTC;
// Canonical integer IVs end exactly at the vector trip count; everything else
// needs a derived IV computed from start/step.
999 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
1000 EndValue = VectorPHBuilder.createDerivedIV(
1001 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
1002 Start, VectorTC, Step);
1003 }
1004
1005 // EndValue is derived from the vector trip count (which has the same type as
1006 // the widest induction) and thus may be wider than the induction here.
1007 Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
1008 if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
1009 EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
1010 ScalarTypeOfWideIV,
1011 WideIV->getDebugLoc());
1012 }
1013
1014 return EndValue;
1015}
1016
1017/// Attempts to optimize the induction variable exit values for users in the
1018/// exit block coming from the latch in the original scalar loop.
// Returns the scalar value to use for the exit user, or nullptr if \p Op is
// not fed by an optimizable wide induction. Uses the pre-computed per-IV end
// values in \p EndValues.
1020 VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op,
1023 VPWidenInductionRecipe *WideIV = nullptr;
1025 WideIV = getOptimizableIVOf(Incoming, PSE);
1026
1027 if (!WideIV)
1028 return nullptr;
1029
1030 VPValue *EndValue = EndValues.lookup(WideIV);
1031 assert(EndValue && "Must have computed the end value up front");
1032
1033 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
1034 // changed it means the exit is using the incremented value, so we don't
1035 // need to subtract the step.
1036 if (Incoming != WideIV)
1037 return EndValue;
1038
1039 // Otherwise, subtract the step from the EndValue.
// The subtraction form depends on the IV's scalar type: integer sub, pointer
// ptradd with a negated step, or the inverse FP operation.
1040 VPBuilder B(cast<VPBasicBlock>(PredVPBB)->getTerminator());
1041 VPValue *Step = WideIV->getStepValue();
1042 Type *ScalarTy = TypeInfo.inferScalarType(WideIV);
1043 if (ScalarTy->isIntegerTy())
1044 return B.createSub(EndValue, Step, DebugLoc::getUnknown(), "ind.escape");
1045 if (ScalarTy->isPointerTy()) {
1046 Type *StepTy = TypeInfo.inferScalarType(Step);
1047 auto *Zero = Plan.getZero(StepTy);
1048 return B.createPtrAdd(EndValue, B.createSub(Zero, Step),
1049 DebugLoc::getUnknown(), "ind.escape");
1050 }
1051 if (ScalarTy->isFloatingPointTy()) {
// Undo one step using the opposite of the induction's binop, preserving its
// fast-math flags.
1052 const auto &ID = WideIV->getInductionDescriptor();
1053 return B.createNaryOp(
1054 ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
1055 ? Instruction::FSub
1056 : Instruction::FAdd,
1057 {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
1058 }
1059 llvm_unreachable("all possible induction types must be handled");
1060 return nullptr;
1061}
1062
// Optimize induction exit values: first compute end values for all wide
// inductions, replace exiting-IV recipes in the middle block, then rewrite
// exit-block phi operands via the latch/early-exit helpers above.
// NOTE(review): the function name line is elided in this listing -- confirm
// against the full source.
1064 VPlan &Plan, PredicatedScalarEvolution &PSE, bool FoldTail) {
1065 // Compute end values for all inductions.
1066 VPTypeAnalysis TypeInfo(Plan);
1067 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1068 auto *VectorPH = cast<VPBasicBlock>(VectorRegion->getSinglePredecessor());
1069 VPBuilder VectorPHBuilder(VectorPH, VectorPH->begin());
// When folding the tail, the vector loop covers the full trip count, so
// resume from it directly; otherwise resume from the vector trip count.
1071 VPValue *ResumeTC =
1072 FoldTail ? Plan.getTripCount() : &Plan.getVectorTripCount();
1073 for (auto &Phi : VectorRegion->getEntryBasicBlock()->phis()) {
1074 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(&Phi);
1075 if (!WideIV)
1076 continue;
1078 WideIV, VectorPHBuilder, TypeInfo, ResumeTC))
1079 EndValues[WideIV] = EndValue;
1080 }
1081
// Replace exiting-IV values materialized in the middle block with the
// pre-computed end values and drop the now-dead recipes.
1082 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
1083 for (VPRecipeBase &R : make_early_inc_range(*MiddleVPBB)) {
1084 VPValue *Op;
1085 if (!match(&R, m_ExitingIVValue(m_VPValue(Op))))
1086 continue;
1087 auto *WideIV = cast<VPWidenInductionRecipe>(Op);
1088 if (VPValue *EndValue = EndValues.lookup(WideIV)) {
1089 R.getVPSingleValue()->replaceAllUsesWith(EndValue);
1090 R.eraseFromParent();
1091 }
1092 }
1093
1094 // Then, optimize exit block users.
1095 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
1096 for (VPRecipeBase &R : ExitVPBB->phis()) {
1097 auto *ExitIRI = cast<VPIRPhi>(&R);
1098
// Predecessors from the middle block take the latch-exit path; all other
// predecessors are early exits.
1099 for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
1100 VPValue *Escape = nullptr;
1101 if (PredVPBB == MiddleVPBB)
1102 Escape = optimizeLatchExitInductionUser(Plan, TypeInfo, PredVPBB,
1103 ExitIRI->getOperand(Idx),
1104 EndValues, PSE);
1105 else
1107 Plan, TypeInfo, PredVPBB, ExitIRI->getOperand(Idx), PSE);
1108 if (Escape)
1109 ExitIRI->setOperand(Idx, Escape);
1110 }
1111 }
1112 }
1113}
1114
1115/// Remove redundant ExpandSCEVRecipes in \p Plan's entry block by replacing
1116/// them with already existing recipes expanding the same SCEV expression.
// Uses a map from SCEV expression to the first recipe expanding it; later
// duplicates are RAUW'd with the first and erased.
1119
1120 for (VPRecipeBase &R :
1122 auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
1123 if (!ExpR)
1124 continue;
1125
// try_emplace keeps the first expansion for each SCEV; Inserted == false
// means an earlier recipe for this SCEV already exists.
1126 const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR);
1127 if (Inserted)
1128 continue;
1129 ExpR->replaceAllUsesWith(V->second);
1130 ExpR->eraseFromParent();
1131 }
1132}
1133
// Iteratively erase the defining recipe of \p V if dead, then re-check its
// operands -- erasing a recipe can make its operands' definitions dead too.
// NOTE(review): the signature line is elided in this listing; the body uses a
// seed value \p V and a visited set `Seen` -- confirm against the full source.
1135 SmallVector<VPValue *> WorkList;
1137 WorkList.push_back(V);
1138
1139 while (!WorkList.empty()) {
1140 VPValue *Cur = WorkList.pop_back_val();
// Skip values already processed to avoid revisiting shared operands.
1141 if (!Seen.insert(Cur).second)
1142 continue;
1143 VPRecipeBase *R = Cur->getDefiningRecipe();
1144 if (!R)
1145 continue;
1146 if (!isDeadRecipe(*R))
1147 continue;
// Queue operands before erasing: they may become dead once R is gone.
1148 append_range(WorkList, R->operands());
1149 R->eraseFromParent();
1150 }
1151}
1152
1153/// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
1154/// Returns an optional pair, where the first element indicates whether it is
1155/// an intrinsic ID.
1156static std::optional<std::pair<bool, unsigned>>
1158 return TypeSwitch<const VPSingleDefRecipe *,
1159 std::optional<std::pair<bool, unsigned>>>(R)
// Recipes with a plain opcode report (false, opcode).
1162 [](auto *I) { return std::make_pair(false, I->getOpcode()); })
// Intrinsic recipes report (true, intrinsic ID).
1163 .Case([](const VPWidenIntrinsicRecipe *I) {
1164 return std::make_pair(true, I->getVectorIntrinsicID());
1165 })
1166 .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe>([](auto *I) {
1167 // For recipes that do not directly map to LLVM IR instructions,
1168 // assign opcodes after the last VPInstruction opcode (which is also
1169 // after the last IR Instruction opcode), based on the VPRecipeID.
1170 return std::make_pair(false,
1171 VPInstruction::OpsEnd + 1 + I->getVPRecipeID());
1172 })
1173 .Default([](auto *) { return std::nullopt; });
1174}
1175
1176/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
1177/// non-nullptr VPValue for a handled opcode or intrinsic ID if corresponding \p
1178/// Operands are foldable live-ins.
1180 ArrayRef<VPValue *> Operands,
1181 const DataLayout &DL,
1182 VPTypeAnalysis &TypeInfo) {
1183 auto OpcodeOrIID = getOpcodeOrIntrinsicID(&R);
1184 if (!OpcodeOrIID)
1185 return nullptr;
1186
// Collect the underlying IR values; folding only applies when every operand
// is a live-in with a concrete IR value.
1188 for (VPValue *Op : Operands) {
1189 if (!match(Op, m_LiveIn()))
1190 return nullptr;
1191 Value *V = Op->getUnderlyingValue();
1192 if (!V)
1193 return nullptr;
1194 Ops.push_back(V);
1195 }
1196
// Dispatch to the matching InstSimplifyFolder entry point; returns nullptr
// for anything the folder cannot constant-fold.
1197 auto FoldToIRValue = [&]() -> Value * {
1198 InstSimplifyFolder Folder(DL);
1199 if (OpcodeOrIID->first) {
// Only binary intrinsics are handled here.
1200 if (R.getNumOperands() != 2)
1201 return nullptr;
1202 unsigned ID = OpcodeOrIID->second;
1203 return Folder.FoldBinaryIntrinsic(ID, Ops[0], Ops[1],
1204 TypeInfo.inferScalarType(&R));
1205 }
1206 unsigned Opcode = OpcodeOrIID->second;
1207 if (Instruction::isBinaryOp(Opcode))
1208 return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode),
1209 Ops[0], Ops[1]);
1210 if (Instruction::isCast(Opcode))
1211 return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
1212 TypeInfo.inferScalarType(R.getVPSingleValue()));
1213 switch (Opcode) {
1215 return Folder.FoldSelect(Ops[0], Ops[1],
1217 case VPInstruction::Not:
1218 return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
1220 case Instruction::Select:
1221 return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
1222 case Instruction::ICmp:
1223 case Instruction::FCmp:
1224 return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
1225 Ops[1]);
1226 case Instruction::GetElementPtr: {
1227 auto &RFlags = cast<VPRecipeWithIRFlags>(R);
1228 auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
1229 return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0],
1230 drop_begin(Ops), RFlags.getGEPNoWrapFlags());
1231 }
// Pointer-add style recipes fold as an i8 GEP (byte offset).
1234 return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()),
1235 Ops[0], Ops[1],
1236 cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
1237 // An extract of a live-in is an extract of a broadcast, so return the
1238 // broadcasted element.
1239 case Instruction::ExtractElement:
1240 assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
1241 return Ops[0];
1242 }
1243 return nullptr;
1244 };
1245
// Register the folded IR constant as a live-in of the plan.
1246 if (Value *V = FoldToIRValue())
1247 return R.getParent()->getPlan()->getOrAddLiveIn(V);
1248 return nullptr;
1249}
1250
1251/// Try to simplify VPSingleDefRecipe \p Def.
// A single pass of peephole simplifications: constant folding via
// InstSimplifyFolder, algebraic identities (and/or/add/mul/select), cast
// chains, Not/compare fusion, extract/build-vector folds, and a set of
// post-unrolling cleanups at the end.
1253 VPlan *Plan = Def->getParent()->getPlan();
1254
1255 // Simplification of live-in IR values for SingleDef recipes using
1256 // InstSimplifyFolder.
1257 const DataLayout &DL =
1259 if (VPValue *V = tryToFoldLiveIns(*Def, Def->operands(), DL, TypeInfo))
1260 return Def->replaceAllUsesWith(V);
1261
1262 // Fold PredPHI LiveIn -> LiveIn.
1263 if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Def)) {
1264 VPValue *Op = PredPHI->getOperand(0);
1265 if (isa<VPIRValue>(Op))
1266 PredPHI->replaceAllUsesWith(Op);
1267 }
1268
1269 VPBuilder Builder(Def);
1270
1271 // Avoid replacing VPInstructions with underlying values with new
1272 // VPInstructions, as we would fail to create widen/replicate recipes from the
1273 // new VPInstructions without an underlying value, and miss out on some
1274 // transformations that only apply to widened/replicated recipes later, by
1275 // doing so.
1276 // TODO: We should also not replace non-VPInstructions like VPWidenRecipe with
1277 // VPInstructions without underlying values, as those will get skipped during
1278 // cost computation.
1279 bool CanCreateNewRecipe =
1280 !isa<VPInstruction>(Def) || !Def->getUnderlyingValue();
1281
// Fold trunc(zext/sext A): drop the pair when types match, otherwise replace
// with a single narrower/wider cast.
1282 VPValue *A;
1283 if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
1284 Type *TruncTy = TypeInfo.inferScalarType(Def);
1285 Type *ATy = TypeInfo.inferScalarType(A);
1286 if (TruncTy == ATy) {
1287 Def->replaceAllUsesWith(A);
1288 } else {
1289 // Don't replace a non-widened cast recipe with a widened cast.
1290 if (!isa<VPWidenCastRecipe>(Def))
1291 return;
1292 if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1293
// Preserve the original extension kind (sext vs zext) of the inner cast.
1294 unsigned ExtOpcode = match(Def->getOperand(0), m_SExt(m_VPValue()))
1295 ? Instruction::SExt
1296 : Instruction::ZExt;
1297 auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A,
1298 TruncTy);
1299 if (auto *UnderlyingExt = Def->getOperand(0)->getUnderlyingValue()) {
1300 // UnderlyingExt has distinct return type, used to retain legacy cost.
1301 Ext->setUnderlyingValue(UnderlyingExt);
1302 }
1303 Def->replaceAllUsesWith(Ext);
1304 } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
1305 auto *Trunc = Builder.createWidenCast(Instruction::Trunc, A, TruncTy);
1306 Def->replaceAllUsesWith(Trunc);
1307 }
1308 }
1309#ifndef NDEBUG
1310 // Verify that the cached type info for both A and its users is still
1311 // accurate by comparing it to freshly computed types.
1312 VPTypeAnalysis TypeInfo2(*Plan);
1313 assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
1314 for (VPUser *U : A->users()) {
1315 auto *R = cast<VPRecipeBase>(U);
1316 for (VPValue *VPV : R->definedValues())
1317 assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
1318 }
1319#endif
1320 }
1321
1322 // Simplify (X && Y) | (X && !Y) -> X.
1323 // TODO: Split up into simpler, modular combines: (X && Y) | (X && Z) into X
1324 // && (Y | Z) and (X | !X) into true. This requires queuing newly created
1325 // recipes to be visited during simplification.
1326 VPValue *X, *Y, *Z;
1327 if (match(Def,
1330 Def->replaceAllUsesWith(X);
1331 Def->eraseFromParent();
1332 return;
1333 }
1334
1335 // x | AllOnes -> AllOnes
1336 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes())))
1337 return Def->replaceAllUsesWith(
1338 Plan->getAllOnesValue(TypeInfo.inferScalarType(Def)));
1339
1340 // x | 0 -> x
1341 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_ZeroInt())))
1342 return Def->replaceAllUsesWith(X);
1343
1344 // x | !x -> AllOnes
1346 return Def->replaceAllUsesWith(
1347 Plan->getAllOnesValue(TypeInfo.inferScalarType(Def)));
1348
1349 // x & 0 -> 0
1350 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt())))
1351 return Def->replaceAllUsesWith(
1352 Plan->getZero(TypeInfo.inferScalarType(Def)));
1353
1354 // x & AllOnes -> x
1355 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_AllOnes())))
1356 return Def->replaceAllUsesWith(X);
1357
1358 // x && false -> false
1359 if (match(Def, m_c_LogicalAnd(m_VPValue(X), m_False())))
1360 return Def->replaceAllUsesWith(Plan->getFalse());
1361
1362 // x && true -> x
1363 if (match(Def, m_c_LogicalAnd(m_VPValue(X), m_True())))
1364 return Def->replaceAllUsesWith(X);
1365
1366 // (x && y) | (x && z) -> x && (y | z)
1367 if (CanCreateNewRecipe &&
1370 // Simplify only if one of the operands has one use to avoid creating an
1371 // extra recipe.
1372 (!Def->getOperand(0)->hasMoreThanOneUniqueUser() ||
1373 !Def->getOperand(1)->hasMoreThanOneUniqueUser()))
1374 return Def->replaceAllUsesWith(
1375 Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));
1376
1377 // x && !x -> 0
1379 return Def->replaceAllUsesWith(Plan->getFalse());
1380
// select c, x, x -> x
1381 if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X))))
1382 return Def->replaceAllUsesWith(X);
1383
1384 // select c, false, true -> not c
1385 VPValue *C;
1386 if (CanCreateNewRecipe &&
1387 match(Def, m_Select(m_VPValue(C), m_False(), m_True())))
1388 return Def->replaceAllUsesWith(Builder.createNot(C));
1389
1390 // select !c, x, y -> select c, y, x
1391 if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {
1392 Def->setOperand(0, C);
1393 Def->setOperand(1, Y);
1394 Def->setOperand(2, X);
1395 return;
1396 }
1397
// a + 0 -> a
1398 if (match(Def, m_c_Add(m_VPValue(A), m_ZeroInt())))
1399 return Def->replaceAllUsesWith(A);
1400
// a * 1 -> a
1401 if (match(Def, m_c_Mul(m_VPValue(A), m_One())))
1402 return Def->replaceAllUsesWith(A);
1403
// a * 0 -> 0
1404 if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt())))
1405 return Def->replaceAllUsesWith(
1406 Plan->getZero(TypeInfo.inferScalarType(Def)));
1407
// a * (power of 2) -> a << log2
1408 const APInt *APC;
1409 if (CanCreateNewRecipe && match(Def, m_c_Mul(m_VPValue(A), m_APInt(APC))) &&
1410 APC->isPowerOf2())
1411 return Def->replaceAllUsesWith(Builder.createNaryOp(
1412 Instruction::Shl,
1413 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())},
1414 *cast<VPRecipeWithIRFlags>(Def), Def->getDebugLoc()));
1415
1416 // Don't convert udiv to lshr inside a replicate region, as VPInstructions are
1417 // not allowed in them.
1418 const VPRegionBlock *ParentRegion = Def->getParent()->getParent();
1419 bool IsInReplicateRegion = ParentRegion && ParentRegion->isReplicator();
1420 if (CanCreateNewRecipe && !IsInReplicateRegion &&
1421 match(Def, m_UDiv(m_VPValue(A), m_APInt(APC))) && APC->isPowerOf2())
1422 return Def->replaceAllUsesWith(Builder.createNaryOp(
1423 Instruction::LShr,
1424 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())},
1425 *cast<VPRecipeWithIRFlags>(Def), Def->getDebugLoc()));
1426
1427 if (match(Def, m_Not(m_VPValue(A)))) {
// not (not a) -> a
1428 if (match(A, m_Not(m_VPValue(A))))
1429 return Def->replaceAllUsesWith(A);
1430
1431 // Try to fold Not into compares by adjusting the predicate in-place.
// Only legal when every user of the compare is either this Not or a select
// on the compare, so all users can be rewritten consistently.
1432 CmpPredicate Pred;
1433 if (match(A, m_Cmp(Pred, m_VPValue(), m_VPValue()))) {
1434 auto *Cmp = cast<VPRecipeWithIRFlags>(A);
1435 if (all_of(Cmp->users(),
1437 m_Not(m_Specific(Cmp)),
1438 m_Select(m_Specific(Cmp), m_VPValue(), m_VPValue()))))) {
1439 Cmp->setPredicate(CmpInst::getInversePredicate(Pred));
1440 for (VPUser *U : to_vector(Cmp->users())) {
1441 auto *R = cast<VPSingleDefRecipe>(U);
1442 if (match(R, m_Select(m_Specific(Cmp), m_VPValue(X), m_VPValue(Y)))) {
1443 // select (cmp pred), x, y -> select (cmp inv_pred), y, x
1444 R->setOperand(1, Y);
1445 R->setOperand(2, X);
1446 } else {
1447 // not (cmp pred) -> cmp inv_pred
1448 assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
1449 R->replaceAllUsesWith(Cmp);
1450 }
1451 }
1452 // If Cmp doesn't have a debug location, use the one from the negation,
1453 // to preserve the location.
1454 if (!Cmp->getDebugLoc() && Def->getDebugLoc())
1455 Cmp->setDebugLoc(Def->getDebugLoc());
1456 }
1457 }
1458 }
1459
1460 // Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... ->
1461 // any-of (fcmp uno %A, %B), ...
// Pairs up single-use self-NaN checks into two-operand uno compares, halving
// the number of any-of operands.
1462 if (match(Def, m_AnyOf())) {
1464 VPRecipeBase *UnpairedCmp = nullptr;
1465 for (VPValue *Op : Def->operands()) {
1466 VPValue *X;
1467 if (Op->getNumUsers() > 1 ||
1469 m_Deferred(X)))) {
1470 NewOps.push_back(Op);
1471 } else if (!UnpairedCmp) {
1472 UnpairedCmp = Op->getDefiningRecipe();
1473 } else {
1474 NewOps.push_back(Builder.createFCmp(CmpInst::FCMP_UNO,
1475 UnpairedCmp->getOperand(0), X));
1476 UnpairedCmp = nullptr;
1477 }
1478 }
1479
// An odd self-check left over keeps its original form.
1480 if (UnpairedCmp)
1481 NewOps.push_back(UnpairedCmp->getVPSingleValue());
1482
1483 if (NewOps.size() < Def->getNumOperands()) {
1484 VPValue *NewAnyOf = Builder.createNaryOp(VPInstruction::AnyOf, NewOps);
1485 return Def->replaceAllUsesWith(NewAnyOf);
1486 }
1487 }
1488
1489 // Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y
1490 // This is useful for fmax/fmin without fast-math flags, where we need to
1491 // check if any operand is NaN.
1492 if (CanCreateNewRecipe &&
1494 m_Deferred(X)),
1496 m_Deferred(Y))))) {
1497 VPValue *NewCmp = Builder.createFCmp(CmpInst::FCMP_UNO, X, Y);
1498 return Def->replaceAllUsesWith(NewCmp);
1499 }
1500
1501 // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
1502 if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
1503 match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&
1504 TypeInfo.inferScalarType(Def->getOperand(1)) ==
1505 TypeInfo.inferScalarType(Def))
1506 return Def->replaceAllUsesWith(Def->getOperand(1));
1507
// Fold a wide IV step computation with step 1 down to its source, truncating
// if the types differ.
1509 m_One()))) {
1510 Type *WideStepTy = TypeInfo.inferScalarType(Def);
1511 if (TypeInfo.inferScalarType(X) != WideStepTy)
1512 X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy);
1513 Def->replaceAllUsesWith(X);
1514 return;
1515 }
1516
1517 // For i1 vp.merges produced by AnyOf reductions:
1518 // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
1520 m_VPValue(X), m_VPValue())) &&
1522 TypeInfo.inferScalarType(Def)->isIntegerTy(1)) {
1523 Def->setOperand(1, Def->getOperand(0));
1524 Def->setOperand(0, Y);
1525 return;
1526 }
1527
// A first-order recurrence phi whose two incoming values agree is redundant.
1528 if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
1529 if (Phi->getOperand(0) == Phi->getOperand(1))
1530 Phi->replaceAllUsesWith(Phi->getOperand(0));
1531 return;
1532 }
1533
1534 // Simplify MaskedCond with no block mask to its single operand.
1536 !cast<VPInstruction>(Def)->isMasked())
1537 return Def->replaceAllUsesWith(Def->getOperand(0));
1538
1539 // Look through ExtractLastLane.
1540 if (match(Def, m_ExtractLastLane(m_VPValue(A)))) {
// The last lane of an explicit build-vector is its last operand.
1541 if (match(A, m_BuildVector())) {
1542 auto *BuildVector = cast<VPInstruction>(A);
1543 Def->replaceAllUsesWith(
1544 BuildVector->getOperand(BuildVector->getNumOperands() - 1));
1545 return;
1546 }
// With only scalar VFs the vector has a single lane.
1547 if (Plan->hasScalarVFOnly())
1548 return Def->replaceAllUsesWith(A);
1549 }
1550
1551 // Look through ExtractPenultimateElement (BuildVector ....).
1553 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1554 Def->replaceAllUsesWith(
1555 BuildVector->getOperand(BuildVector->getNumOperands() - 2));
1556 return;
1557 }
1558
// Extract of a known lane from a build-vector is the matching operand.
1559 uint64_t Idx;
1561 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1562 Def->replaceAllUsesWith(BuildVector->getOperand(Idx));
1563 return;
1564 }
1565
// A build-vector of identical elements is a broadcast.
1566 if (match(Def, m_BuildVector()) && all_equal(Def->operands())) {
1567 Def->replaceAllUsesWith(
1568 Builder.createNaryOp(VPInstruction::Broadcast, Def->getOperand(0)));
1569 return;
1570 }
1571
1572 // Look through broadcast of single-scalar when used as select conditions; in
1573 // that case the scalar condition can be used directly.
1574 if (match(Def,
1577 "broadcast operand must be single-scalar");
1578 Def->setOperand(0, C);
1579 return;
1580 }
1581
// Single-operand variants of this recipe collapse to their operand.
1583 if (Def->getNumOperands() == 1)
1584 Def->replaceAllUsesWith(Def->getOperand(0));
1585 return;
1586 }
1587
// A one-operand recipe wrapping a plain IR value folds to that value.
1588 VPIRValue *IRV;
1589 if (Def->getNumOperands() == 1 &&
1591 return Def->replaceAllUsesWith(IRV);
1592
1593 // Some simplifications can only be applied after unrolling. Perform them
1594 // below.
1595 if (!Plan->isUnrolled())
1596 return;
1597
1598 // After unrolling, extract-lane may be used to extract values from multiple
1599 // scalar sources. Only simplify when extracting from a single scalar source.
1600 VPValue *LaneToExtract;
1601 if (match(Def, m_ExtractLane(m_VPValue(LaneToExtract), m_VPValue(A)))) {
1602 // Simplify extract-lane(%lane_num, %scalar_val) -> %scalar_val.
1604 return Def->replaceAllUsesWith(A);
1605
1606 // Simplify extract-lane with single source to extract-element.
1607 Def->replaceAllUsesWith(Builder.createNaryOp(
1608 Instruction::ExtractElement, {A, LaneToExtract}, Def->getDebugLoc()));
1609 return;
1610 }
1611
1612 // Hoist an invariant increment Y of a phi X, by having X start at Y.
// Only when the phi starts at zero and the add is the phi's sole user, so the
// rewrite cannot change any other use.
1613 if (match(Def, m_c_Add(m_VPValue(X), m_VPValue(Y))) && isa<VPIRValue>(Y) &&
1614 isa<VPPhi>(X)) {
1615 auto *Phi = cast<VPPhi>(X);
1616 if (Phi->getOperand(1) != Def && match(Phi->getOperand(0), m_ZeroInt()) &&
1617 Phi->getSingleUser() == Def) {
1618 Phi->setOperand(0, Y);
1619 Def->replaceAllUsesWith(Phi);
1620 return;
1621 }
1622 }
1623
1624 // Simplify unrolled VectorPointer without offset, or with zero offset, to
1625 // just the pointer operand.
1626 if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(Def))
1627 if (!VPR->getOffset() || match(VPR->getOffset(), m_ZeroInt()))
1628 return VPR->replaceAllUsesWith(VPR->getOperand(0));
1629
1630 // VPScalarIVSteps after unrolling can be replaced by their start value, if
1631 // the start index is zero and only the first lane 0 is demanded.
1632 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
1633 if (!Steps->getStartIndex() && vputils::onlyFirstLaneUsed(Steps)) {
1634 Steps->replaceAllUsesWith(Steps->getOperand(0));
1635 return;
1636 }
1637 }
1638 // Simplify redundant ReductionStartVector recipes after unrolling.
// Only in-loop reduction phis can take the raw start value directly.
1639 VPValue *StartV;
1641 m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
1642 Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
1643 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
1644 return PhiR && PhiR->isInLoop();
1645 });
1646 return;
1647 }
1648
1650 Def->replaceAllUsesWith(A);
1651 return;
1652 }
1653
// An extract-last-lane of a single-scalar value is the value itself, provided
// no other user needs the vector form.
1654 if (match(Def, m_ExtractLastLane(m_VPValue(A))) &&
1657 cast<VPReplicateRecipe>(A)->isSingleScalar())) &&
1658 all_of(A->users(),
1659 [Def, A](VPUser *U) { return U->usesScalars(A) || Def == U; })) {
1660 return Def->replaceAllUsesWith(A);
1661 }
1662
// With a single unrolled part, the last part is the value itself.
1663 if (Plan->getConcreteUF() == 1 && match(Def, m_ExtractLastPart(m_VPValue(A))))
1664 return Def->replaceAllUsesWith(A);
1665}
1666
// Driver: run simplifyRecipe over every single-def recipe in the plan.
// NOTE(review): the signature and the block-traversal setup lines are elided
// in this listing -- confirm against the full source.
1669 Plan.getEntry());
1670 VPTypeAnalysis TypeInfo(Plan);
// early-inc range: simplifyRecipe may erase the current recipe.
1672 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
1673 if (auto *Def = dyn_cast<VPSingleDefRecipe>(&R))
1674 simplifyRecipe(Def, TypeInfo);
1675 }
1676}
1677
1678/// Reassociate (headermask && x) && y -> headermask && (x && y) to allow the
1679/// header mask to be simplified further when tail folding, e.g. in
1680/// optimizeEVLMasks.
1681static void reassociateHeaderMask(VPlan &Plan) {
1682 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
1683 if (!HeaderMask)
1684 return;
1685
// Seed the worklist with direct logical-and users of the header mask; the
// loop then chases further and-users transitively.
1686 SmallVector<VPUser *> Worklist;
1687 for (VPUser *U : HeaderMask->users())
1688 if (match(U, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue())))
1690
1691 while (!Worklist.empty()) {
1692 auto *R = dyn_cast<VPSingleDefRecipe>(Worklist.pop_back_val());
1693 VPValue *X, *Y;
// Only rewrite the exact shape (headermask && x) && y.
1694 if (!R || !match(R, m_LogicalAnd(
1695 m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(X)),
1696 m_VPValue(Y))))
1697 continue;
// Users of the rewritten value may expose the same pattern again.
1698 append_range(Worklist, R->users());
1699 VPBuilder Builder(R);
1700 R->replaceAllUsesWith(
1701 Builder.createLogicalAnd(HeaderMask, Builder.createLogicalAnd(X, Y)));
1702 }
1703}
1704
// Narrow wide/replicating recipes to single-scalar recipes where VPlan
// analysis proves all lanes are identical, and turn uniform-address scatters
// into extract + scalar store.
// NOTE(review): the function name line is elided in this listing -- confirm
// against the full source.
1706 if (Plan.hasScalarVFOnly())
1707 return;
1708
1709 // Try to narrow wide and replicating recipes to single scalar recipes,
1710 // based on VPlan analysis. Only process blocks in the loop region for now,
1711 // without traversing into nested regions, as recipes in replicate regions
1712 // cannot be converted yet.
1715 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
1717 VPWidenStoreRecipe>(&R))
1718 continue;
1719 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
// Already-scalar or predicated replicates need no (or cannot take) narrowing.
1720 if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
1721 continue;
1722
1723 // Convert an unmasked scatter with an uniform address into
1724 // extract-last-lane + scalar store.
1725 // TODO: Add a profitability check comparing the cost of a scatter vs.
1726 // extract + scalar store.
1727 auto *WidenStoreR = dyn_cast<VPWidenStoreRecipe>(&R)
1728 if (WidenStoreR && vputils::isSingleScalar(WidenStoreR->getAddr()) &&
1729 !WidenStoreR->isConsecutive()) {
1730 assert(!WidenStoreR->isReverse() &&
1731 "Not consecutive memory recipes shouldn't be reversed");
1732 VPValue *Mask = WidenStoreR->getMask();
1733
1734 // Only convert the scatter to a scalar store if it is unmasked.
1735 // TODO: Support converting scatter masked by the header mask to scalar
1736 // store.
1737 if (Mask)
1738 continue;
1739
// Extract the last lane of the stored value; with a uniform address every
// lane writes the same location, so the last write wins.
1741 {WidenStoreR->getOperand(1)});
1742 Extract->insertBefore(WidenStoreR);
1743
1744 // TODO: Sink the scalar store recipe to middle block if possible.
1745 auto *ScalarStore = new VPReplicateRecipe(
1746 &WidenStoreR->getIngredient(), {Extract, WidenStoreR->getAddr()},
1747 true /*IsSingleScalar*/, nullptr /*Mask*/, {},
1748 *WidenStoreR /*Metadata*/);
1749 ScalarStore->insertBefore(WidenStoreR);
1750 WidenStoreR->eraseFromParent();
1751 continue;
1752 }
1753
// Replicated stores to a single-scalar address: store only the last
// part/lane of the stored value.
1754 auto *RepOrWidenR = dyn_cast<VPRecipeWithIRFlags>(&R);
1755 if (RepR && isa<StoreInst>(RepR->getUnderlyingInstr()) &&
1756 vputils::isSingleScalar(RepR->getOperand(1))) {
1757 auto *Clone = new VPReplicateRecipe(
1758 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1759 true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Flags*/,
1760 *RepR /*Metadata*/, RepR->getDebugLoc());
1761 Clone->insertBefore(RepOrWidenR);
1762 VPBuilder Builder(Clone);
1763 VPValue *ExtractOp = Clone->getOperand(0);
1764 if (vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1)))
1765 ExtractOp =
1766 Builder.createNaryOp(VPInstruction::ExtractLastPart, ExtractOp);
1767 ExtractOp =
1768 Builder.createNaryOp(VPInstruction::ExtractLastLane, ExtractOp);
1769 Clone->setOperand(0, ExtractOp);
1770 RepR->eraseFromParent();
1771 continue;
1772 }
1773
1774 // Skip recipes that aren't single scalars.
1775 if (!RepOrWidenR || !vputils::isSingleScalar(RepOrWidenR))
1776 continue;
1777
1778 // Predicate to check if a user of Op introduces extra broadcasts.
1779 auto IntroducesBCastOf = [](const VPValue *Op) {
1780 return [Op](const VPUser *U) {
1781 if (auto *VPI = dyn_cast<VPInstruction>(U)) {
1785 VPI->getOpcode()))
1786 return false;
1787 }
1788 return !U->usesScalars(Op);
1789 };
1790 };
1791
// Only narrow when doing so does not merely move the broadcast from this
// recipe's result onto one of its operands.
1792 if (any_of(RepOrWidenR->users(), IntroducesBCastOf(RepOrWidenR)) &&
1793 none_of(RepOrWidenR->operands(), [&](VPValue *Op) {
1794 if (any_of(
1795 make_filter_range(Op->users(), not_equal_to(RepOrWidenR)),
1796 IntroducesBCastOf(Op)))
1797 return false;
1798 // Non-constant live-ins require broadcasts, while constants do not
1799 // need explicit broadcasts.
1800 auto *IRV = dyn_cast<VPIRValue>(Op);
1801 bool LiveInNeedsBroadcast = IRV && !isa<Constant>(IRV->getValue());
1802 auto *OpR = dyn_cast<VPReplicateRecipe>(Op);
1803 return LiveInNeedsBroadcast || (OpR && OpR->isSingleScalar());
1804 }))
1805 continue;
1806
// Replace the wide/replicating recipe with a single-scalar clone.
1807 auto *Clone = new VPReplicateRecipe(
1808 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1809 true /*IsSingleScalar*/, nullptr, *RepOrWidenR);
1810 Clone->insertBefore(RepOrWidenR);
1811 RepOrWidenR->replaceAllUsesWith(Clone);
1812 if (isDeadRecipe(*RepOrWidenR))
1813 RepOrWidenR->eraseFromParent();
1814 }
1815 }
1816}
1817
1818/// Try to see if all of \p Blend's masks share a common value logically and'ed
1819/// and remove it from the masks.
// Only applies to non-normalized blends; a common factor in every mask cannot
// affect which incoming value is selected, so it can be stripped.
1821 if (Blend->isNormalized())
1822 return;
1823 VPValue *CommonEdgeMask;
// Take the candidate common factor from mask 0 ...
1824 if (!match(Blend->getMask(0),
1825 m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue())))
1826 return;
// ... and require every mask to be (CommonEdgeMask && something).
1827 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1828 if (!match(Blend->getMask(I),
1829 m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue())))
1830 return;
// Strip the common factor, keeping only the per-edge part of each mask.
1831 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1832 Blend->setMask(I, Blend->getMask(I)->getDefiningRecipe()->getOperand(1));
1833}
1834
1835/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
1836/// to make sure the masks are simplified.
1837static void simplifyBlends(VPlan &Plan) {
// NOTE(review): file lines 1838-1839 (the outer loop over the plan's basic
// blocks that defines VPBB) were lost in extraction - verify upstream.
1840 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1841 auto *Blend = dyn_cast<VPBlendRecipe>(&R);
1842 if (!Blend)
1843 continue;
1844
// First strip any mask factor shared by all incoming masks.
1845 removeCommonBlendMask(Blend);
1846
1847 // Try to remove redundant blend recipes.
1848 SmallPtrSet<VPValue *, 4> UniqueValues;
1849 if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
1850 UniqueValues.insert(Blend->getIncomingValue(0));
1851 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
1852 if (!match(Blend->getMask(I), m_False()))
1853 UniqueValues.insert(Blend->getIncomingValue(I));
1854
// Only one live incoming value: the blend is a no-op, forward the value.
1855 if (UniqueValues.size() == 1) {
1856 Blend->replaceAllUsesWith(*UniqueValues.begin());
1857 Blend->eraseFromParent();
1858 continue;
1859 }
1860
1861 if (Blend->isNormalized())
1862 continue;
1863
1864 // Normalize the blend so its first incoming value is used as the initial
1865 // value with the others blended into it.
1866
1867 unsigned StartIndex = 0;
1868 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1869 // If a value's mask is used only by the blend then it can be deadcoded.
1870 // TODO: Find the most expensive mask that can be deadcoded, or a mask
1871 // that's used by multiple blends where it can be removed from them all.
1872 VPValue *Mask = Blend->getMask(I);
1873 if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
1874 StartIndex = I;
1875 break;
1876 }
1877 }
1878
// Build the normalized operand list: start value first (no mask), then
// alternating (value, mask) pairs for the remaining incoming values.
1879 SmallVector<VPValue *, 4> OperandsWithMask;
1880 OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
1881
1882 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1883 if (I == StartIndex)
1884 continue;
1885 OperandsWithMask.push_back(Blend->getIncomingValue(I));
1886 OperandsWithMask.push_back(Blend->getMask(I));
1887 }
1888
1889 auto *NewBlend =
1890 new VPBlendRecipe(cast_or_null<PHINode>(Blend->getUnderlyingValue()),
1891 OperandsWithMask, *Blend, Blend->getDebugLoc());
1892 NewBlend->insertBefore(&R);
1893
1894 VPValue *DeadMask = Blend->getMask(StartIndex);
1895 Blend->replaceAllUsesWith(NewBlend);
1896 Blend->eraseFromParent();
// NOTE(review): file line 1897 is missing here. DeadMask is otherwise unused,
// so the lost line presumably deletes it when dead (e.g. a call to
// recursivelyDeleteDeadRecipes(DeadMask)) - confirm against upstream.
1898
1899 /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
1900 VPValue *NewMask;
1901 if (NewBlend->getNumOperands() == 3 &&
1902 match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
1903 VPValue *Inc0 = NewBlend->getOperand(0);
1904 VPValue *Inc1 = NewBlend->getOperand(1);
1905 VPValue *OldMask = NewBlend->getOperand(2);
1906 NewBlend->setOperand(0, Inc1);
1907 NewBlend->setOperand(1, Inc0);
1908 NewBlend->setOperand(2, NewMask);
// Drop the now-unused Not(%mask) instruction.
1909 if (OldMask->getNumUsers() == 0)
1910 cast<VPInstruction>(OldMask)->eraseFromParent();
1911 }
1912 }
1913 }
1914}
1915
1916/// Optimize the width of vector induction variables in \p Plan based on a known
1917/// constant Trip Count, \p BestVF and \p BestUF.
// NOTE(review): the first signature line (file line 1918, presumably
// `static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan,`) was
// lost in extraction - confirm against upstream.
1919 ElementCount BestVF,
1920 unsigned BestUF) {
1921 // Only proceed if we have not completely removed the vector region.
1922 if (!Plan.getVectorLoopRegion())
1923 return false;
1924
// Requires a fixed VF and a compile-time-constant trip count.
1925 const APInt *TC;
1926 if (!BestVF.isFixed() || !match(Plan.getTripCount(), m_APInt(TC)))
1927 return false;
1928
1929 // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
1930 // and UF. Returns at least 8.
1931 auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
1932 APInt AlignedTC =
// NOTE(review): file lines 1933-1934 (the expression rounding TC up to a
// multiple of Align) are missing from this extraction - verify upstream.
1935 APInt MaxVal = AlignedTC - 1;
1936 return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
1937 };
1938 unsigned NewBitWidth =
1939 ComputeBitWidth(*TC, BestVF.getKnownMinValue() * BestUF);
1940
1941 LLVMContext &Ctx = Plan.getContext();
1942 auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
1943
1944 bool MadeChange = false;
1945
1946 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
1947 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
1948 auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
1949
1950 // Currently only handle canonical IVs as it is trivial to replace the start
1951 // and stop values, and we currently only perform the optimization when the
1952 // IV has a single use.
1953 if (!WideIV || !WideIV->isCanonical() ||
1954 WideIV->hasMoreThanOneUniqueUser() ||
1955 NewIVTy == WideIV->getScalarType())
1956 continue;
1957
1958 // Currently only handle cases where the single user is a header-mask
1959 // comparison with the backedge-taken-count.
1960 VPUser *SingleUser = WideIV->getSingleUser();
1961 if (!SingleUser ||
1962 !match(SingleUser, m_ICmp(m_Specific(WideIV),
// NOTE(review): file lines 1963-1964 (the second ICmp operand pattern,
// presumably matching a broadcast of the backedge-taken count) are missing
// from this extraction - verify upstream.
1965 continue;
1966
1967 // Update IV operands and comparison bound to use new narrower type.
1968 auto *NewStart = Plan.getZero(NewIVTy);
1969 WideIV->setStartValue(NewStart);
1970 auto *NewStep = Plan.getConstantInt(NewIVTy, 1);
1971 WideIV->setStepValue(NewStep);
1972
// Truncate the backedge-taken count in the preheader and make the
// header-mask compare use the narrowed value.
1973 auto *NewBTC = new VPWidenCastRecipe(
1974 Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy,
1975 nullptr, VPIRFlags::getDefaultFlags(Instruction::Trunc));
1976 Plan.getVectorPreheader()->appendRecipe(NewBTC);
1977 auto *Cmp = cast<VPInstruction>(WideIV->getSingleUser());
1978 Cmp->setOperand(1, NewBTC);
1979
1980 MadeChange = true;
1981 }
1982
1983 return MadeChange;
1984}
1985
1986/// Return true if \p Cond is known to be true for given \p BestVF and \p
1987/// BestUF.
// NOTE(review): file lines 1988 and 1990-1991 are missing from this
// extraction - presumably the remaining signature lines
// (`static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan,` /
// `PredicatedScalarEvolution &PSE) {`) plus a disjunction match on Cond
// guarding the recursive any_of below. Confirm against upstream.
1989 ElementCount BestVF, unsigned BestUF,
// A disjunction is true if any of its operands can be proven true.
1992 return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,
1993 &PSE](VPValue *C) {
1994 return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, PSE);
1995 });
1996
1997 auto *CanIV = Plan.getVectorLoopRegion()->getCanonicalIV();
// NOTE(review): file line 1998 (the opening of the compare match, presumably
// `if (!match(Cond, m_SpecificICmp(CmpInst::ICMP_EQ,`) is missing - confirm.
1999 m_Specific(CanIV->getBackedgeValue()),
2000 m_Specific(&Plan.getVectorTripCount()))))
2001 return false;
2002
2003 // The compare checks CanIV + VFxUF == vector trip count. The vector trip
2004 // count is not conveniently available as SCEV so far, so we compare directly
2005 // against the original trip count. This is stricter than necessary, as we
2006 // will only return true if the trip count == vector trip count.
2007 const SCEV *VectorTripCount =
// NOTE(review): file line 2008 (the initializer, presumably fetching the
// SCEV for the plan's vector trip count) is missing - confirm upstream.
2009 if (isa<SCEVCouldNotCompute>(VectorTripCount))
2010 VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), PSE);
2011 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2012 "Trip count SCEV must be computable");
2013 ScalarEvolution &SE = *PSE.getSE();
2014 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
2015 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
2016 return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
2017}
2018
2019/// Try to replace multiple active lane masks used for control flow with
2020/// a single, wide active lane mask instruction followed by multiple
2021/// extract subvector intrinsics. This applies to the active lane mask
2022/// instructions both in the loop and in the preheader.
2023/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
2024/// new extracts from the first active lane mask, which has its last
2025/// operand (multiplier) set to UF.
// NOTE(review): several link-heavy lines (file lines 2026, 2036, 2047, 2058,
// 2060, 2065, 2070, 2077) were dropped in extraction, including the first
// signature line and some pattern-match operands - verify against upstream.
2027 unsigned UF) {
// Only profitable with the flag enabled, a vector VF and actual unrolling.
2028 if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
2029 return false;
2030
2031 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2032 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
2033 auto *Term = &ExitingVPBB->back();
2034
2035 using namespace llvm::VPlanPatternMatch;
2037 m_VPValue(), m_VPValue(), m_VPValue())))))
2038 return false;
2039
2040 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
2041 LLVMContext &Ctx = Plan.getContext();
2042
// Helper: create UF llvm.vector.extract calls that slice the wide mask ALM
// into per-part masks, inserted directly after ALM.
2043 auto ExtractFromALM = [&](VPInstruction *ALM,
2044 SmallVectorImpl<VPValue *> &Extracts) {
2045 DebugLoc DL = ALM->getDebugLoc();
2046 for (unsigned Part = 0; Part < UF; ++Part) {
2048 Ops.append({ALM, Plan.getConstantInt(64, VF.getKnownMinValue() * Part)});
2049 auto *Ext =
2050 new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
2051 IntegerType::getInt1Ty(Ctx), {}, {}, DL);
2052 Extracts[Part] = Ext;
2053 Ext->insertAfter(ALM);
2054 }
2055 };
2056
2057 // Create a list of each active lane mask phi, ordered by unroll part.
2059 for (VPRecipeBase &R : Header->phis()) {
2061 if (!Phi)
2062 continue;
2063 VPValue *Index = nullptr;
2064 match(Phi->getBackedgeValue(),
2066 assert(Index && "Expected index from ActiveLaneMask instruction");
2067
// Recover the unroll part from the index expression feeding the mask.
2068 uint64_t Part;
2069 if (match(Index,
2071 m_VPValue(), m_Mul(m_VPValue(), m_ConstantInt(Part)))))
2072 Phis[Part] = Phi;
2073 else {
2074 // Anything other than a CanonicalIVIncrementForPart is part 0
2075 assert(!match(
2076 Index,
2078 Phis[0] = Phi;
2079 }
2080 }
2081
2082 assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) &&
2083 "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
2084
// Part 0's incoming values are the masks computed in the preheader and in
// the loop; these become the single wide masks.
2085 auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
2086 auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
2087
2088 assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
2089 LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
2090 "Expected incoming values of Phi to be ActiveLaneMasks");
2091
2092 // When using wide lane masks, the return type of the get.active.lane.mask
2093 // intrinsic is VF x UF (last operand).
2094 VPValue *ALMMultiplier = Plan.getConstantInt(64, UF);
2095 EntryALM->setOperand(2, ALMMultiplier);
2096 LoopALM->setOperand(2, ALMMultiplier);
2097
2098 // Create UF x extract vectors and insert into preheader.
2099 SmallVector<VPValue *> EntryExtracts(UF);
2100 ExtractFromALM(EntryALM, EntryExtracts);
2101
2102 // Create UF x extract vectors and insert before the loop compare & branch,
2103 // updating the compare to use the first extract.
2104 SmallVector<VPValue *> LoopExtracts(UF);
2105 ExtractFromALM(LoopALM, LoopExtracts);
2106 VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
2107 Not->setOperand(0, LoopExtracts[0]);
2108
2109 // Update the incoming values of active lane mask phis.
2110 for (unsigned Part = 0; Part < UF; ++Part) {
2111 Phis[Part]->setStartValue(EntryExtracts[Part]);
2112 Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
2113 }
2114
2115 return true;
2116}
2117
2118/// Try to simplify the branch condition of \p Plan. This may restrict the
2119/// resulting plan to \p BestVF and \p BestUF.
// NOTE(review): file lines 2120, 2122, 2128, 2130, 2135, 2138, 2147, 2203 and
// 2206 were dropped in extraction - these include the remaining signature
// lines (`static bool simplifyBranchConditionForVFAndUF(VPlan &Plan,
// ElementCount BestVF,` / `PredicatedScalarEvolution &PSE) {`) and parts of
// the terminator pattern matches. Verify against upstream.
2121 unsigned BestUF,
2123 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2124 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
2125 auto *Term = &ExitingVPBB->back();
2126 VPValue *Cond;
2127 if (match(Term,
2129 m_VPValue())) ||
2131 m_VPValue(), m_VPValue(), m_VPValue()))))) {
2132 // Try to simplify the branch condition if VectorTC <= VF * UF when the
2133 // latch terminator is BranchOnCount or BranchOnCond(Not(ActiveLaneMask)).
2134 const SCEV *VectorTripCount =
2136 if (isa<SCEVCouldNotCompute>(VectorTripCount))
2137 VectorTripCount =
2139 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2140 "Trip count SCEV must be computable");
2141 ScalarEvolution &SE = *PSE.getSE();
2142 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
2143 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
2144 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, VectorTripCount, C))
2145 return false;
2146 } else if (match(Term, m_BranchOnCond(m_VPValue(Cond))) ||
2148 // For BranchOnCond, check if we can prove the condition to be true using VF
2149 // and UF.
2150 if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, PSE))
2151 return false;
2152 } else {
2153 return false;
2154 }
2155
2156 // The vector loop region only executes once. If possible, completely remove
2157 // the region, otherwise replace the terminator controlling the latch with
2158 // (BranchOnCond true).
2159 // TODO: VPWidenIntOrFpInductionRecipe is only partially supported; add
2160 // support for other non-canonical widen induction recipes (e.g.,
2161 // VPWidenPointerInductionRecipe).
2162 // TODO: fold branch-on-constant after dissolving region.
2163 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
// The region can only be dissolved when every header phi can be rewritten
// in terms of its single-iteration value.
2164 if (all_of(Header->phis(), [](VPRecipeBase &Phi) {
2165 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi))
2166 return R->isCanonical();
2167 return isa<VPCanonicalIVPHIRecipe, VPCurrentIterationPHIRecipe,
2168 VPFirstOrderRecurrencePHIRecipe, VPPhi>(&Phi);
2169 })) {
// Replace each header phi: canonical widen IVs become a StepVector, all
// other phis become their start (incoming value 0).
2170 for (VPRecipeBase &HeaderR : make_early_inc_range(Header->phis())) {
2171 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&HeaderR)) {
2172 VPBuilder Builder(Plan.getVectorPreheader());
2173 VPValue *StepV = Builder.createNaryOp(VPInstruction::StepVector, {},
2174 R->getScalarType());
2175 HeaderR.getVPSingleValue()->replaceAllUsesWith(StepV);
2176 HeaderR.eraseFromParent();
2177 continue;
2178 }
2179 auto *Phi = cast<VPPhiAccessors>(&HeaderR);
2180 HeaderR.getVPSingleValue()->replaceAllUsesWith(Phi->getIncomingValue(0));
2181 HeaderR.eraseFromParent();
2182 }
2183
// Splice the region's contents into the enclosing CFG and delete the region.
2184 VPBlockBase *Preheader = VectorRegion->getSinglePredecessor();
2185 SmallVector<VPBlockBase *> Exits = to_vector(VectorRegion->getSuccessors());
2186 VPBlockUtils::disconnectBlocks(Preheader, VectorRegion);
2187 for (VPBlockBase *Exit : Exits)
2188 VPBlockUtils::disconnectBlocks(VectorRegion, Exit);
2189
2190 for (VPBlockBase *B : vp_depth_first_shallow(VectorRegion->getEntry()))
2191 B->setParent(nullptr);
2192
2193 VPBlockUtils::connectBlocks(Preheader, Header);
2194
2195 for (VPBlockBase *Exit : Exits)
2196 VPBlockUtils::connectBlocks(ExitingVPBB, Exit);
2197
2198 // Replace terminating branch-on-two-conds with branch-on-cond to early
2199 // exit.
2200 if (Exits.size() != 1) {
2201 assert(match(Term, m_BranchOnTwoConds()) && Exits.size() == 2 &&
2202 "BranchOnTwoConds needs 2 remaining exits");
2204 Term->getOperand(0));
2205 }
2207 } else {
2208 // The vector region contains header phis for which we cannot remove the
2209 // loop region yet.
2210
2211 // For BranchOnTwoConds, set the latch exit condition to true directly.
2212 if (match(Term, m_BranchOnTwoConds())) {
2213 Term->setOperand(1, Plan.getTrue());
2214 return true;
2215 }
2216
2217 auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, {Plan.getTrue()},
2218 {}, {}, Term->getDebugLoc());
2219 ExitingVPBB->appendRecipe(BOC);
2220 }
2221
2222 Term->eraseFromParent();
2223
2224 return true;
2225}
2226
2227/// From the definition of llvm.experimental.get.vector.length,
2228/// VPInstruction::ExplicitVectorLength(%AVL) = %AVL when %AVL <= VF.
// NOTE(review): file lines 2229-2231 are missing from this extraction -
// presumably the signature (`static bool simplifyKnownEVL(VPlan &Plan,
// ElementCount VF, PredicatedScalarEvolution &PSE) {`) and the opening of
// the loop over the plan's basic blocks. Verify against upstream.
2232 vp_depth_first_deep(Plan.getEntry()))) {
2233 for (VPRecipeBase &R : *VPBB) {
2234 VPValue *AVL;
2235 if (!match(&R, m_EVL(m_VPValue(AVL))))
2236 continue;
2237
// Use SCEV to prove AVL <= VF; only then can the EVL be folded to AVL.
2238 const SCEV *AVLSCEV = vputils::getSCEVExprForVPValue(AVL, PSE);
2239 if (isa<SCEVCouldNotCompute>(AVLSCEV))
2240 continue;
2241 ScalarEvolution &SE = *PSE.getSE();
2242 const SCEV *VFSCEV = SE.getElementCount(AVLSCEV->getType(), VF);
2243 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, AVLSCEV, VFSCEV))
2244 continue;
2245
// NOTE(review): file line 2246 (the start of the statement producing Trunc,
// presumably a builder call narrowing AVL to i32) is missing - confirm.
2247 AVL, Type::getInt32Ty(Plan.getContext()), AVLSCEV->getType(),
2248 R.getDebugLoc());
// If a truncate recipe was created, try to constant-fold it away when its
// operands are live-ins.
2249 if (Trunc != AVL) {
2250 auto *TruncR = cast<VPSingleDefRecipe>(Trunc);
2251 const DataLayout &DL =
2253 VPTypeAnalysis TypeInfo(Plan);
2254 if (VPValue *Folded =
2255 tryToFoldLiveIns(*TruncR, TruncR->operands(), DL, TypeInfo))
2256 Trunc = Folded;
2257 }
2258 R.getVPSingleValue()->replaceAllUsesWith(Trunc);
2259 return true;
2260 }
2261 }
2262 return false;
2263}
2264
// NOTE(review): file lines 2265 and 2267 are missing from this extraction -
// presumably the remaining signature lines of
// `void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,`
// and `PredicatedScalarEvolution &PSE) {`. Verify against upstream.
2266 unsigned BestUF,
2268 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
2269 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
2270
// Run the VF/UF-specific simplifications; each returns whether it changed
// the plan.
2271 bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
2272 MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
2273 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
2274 MadeChange |= simplifyKnownEVL(Plan, BestVF, PSE);
2275
// Any of the above changes is only valid for this specific VF/UF pair, so
// pin the plan to it.
2276 if (MadeChange) {
2277 Plan.setVF(BestVF);
2278 assert(Plan.getConcreteUF() == BestUF && "BestUF must match the Plan's UF");
2279 }
2280}
2281
2282/// Sink users of \p FOR after the recipe defining the previous value \p
2283/// Previous of the recurrence. \returns true if all users of \p FOR could be
2284/// re-arranged as needed or false if it is not possible.
2285static bool
// NOTE(review): file line 2286 (the name/parameter line, presumably
// `sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR,`)
// is missing from this extraction - confirm against upstream.
2287 VPRecipeBase *Previous,
2288 VPDominatorTree &VPDT) {
2289 // If Previous is a live-in (no defining recipe), it naturally dominates all
2290 // recipes in the loop, so no sinking is needed.
2291 if (!Previous)
2292 return true;
2293
2294 // Collect recipes that need sinking.
// NOTE(review): file lines 2295-2296 (the declarations of WorkList and Seen
// used below) are missing from this extraction - confirm upstream.
2297 Seen.insert(Previous);
// Returns false if SinkCandidate cannot legally be sunk; otherwise records
// it (unless it already dominates Previous's position or was seen before).
2298 auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
2299 // The previous value must not depend on the users of the recurrence phi. In
2300 // that case, FOR is not a fixed order recurrence.
2301 if (SinkCandidate == Previous)
2302 return false;
2303
2304 if (isa<VPHeaderPHIRecipe>(SinkCandidate) ||
2305 !Seen.insert(SinkCandidate).second ||
2306 VPDT.properlyDominates(Previous, SinkCandidate))
2307 return true;
2308
2309 if (cannotHoistOrSinkRecipe(*SinkCandidate))
2310 return false;
2311
2312 WorkList.push_back(SinkCandidate);
2313 return true;
2314 };
2315
2316 // Recursively sink users of FOR after Previous.
// Worklist-style transitive closure: WorkList grows while iterating.
2317 WorkList.push_back(FOR);
2318 for (unsigned I = 0; I != WorkList.size(); ++I) {
2319 VPRecipeBase *Current = WorkList[I];
2320 assert(Current->getNumDefinedValues() == 1 &&
2321 "only recipes with a single defined value expected");
2322
2323 for (VPUser *User : Current->getVPSingleValue()->users()) {
2324 if (!TryToPushSinkCandidate(cast<VPRecipeBase>(User)))
2325 return false;
2326 }
2327 }
2328
2329 // Keep recipes to sink ordered by dominance so earlier instructions are
2330 // processed first.
2331 sort(WorkList, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
2332 return VPDT.properlyDominates(A, B);
2333 });
2334
// Move each candidate directly after the last moved one, preserving their
// relative (dominance) order behind Previous.
2335 for (VPRecipeBase *SinkCandidate : WorkList) {
2336 if (SinkCandidate == FOR)
2337 continue;
2338
2339 SinkCandidate->moveAfter(Previous);
2340 Previous = SinkCandidate;
2341 }
2342 return true;
2343}
2344
2345/// Try to hoist \p Previous and its operands before all users of \p FOR.
// NOTE(review): file lines 2346 and 2354 are missing from this extraction -
// presumably the signature line
// (`static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe
// *FOR,`) and the declaration of the Visited set used below. Confirm upstream.
2347 VPRecipeBase *Previous,
2348 VPDominatorTree &VPDT) {
2349 if (cannotHoistOrSinkRecipe(*Previous))
2350 return false;
2351
2352 // Collect recipes that need hoisting.
2353 SmallVector<VPRecipeBase *> HoistCandidates;
2355 VPRecipeBase *HoistPoint = nullptr;
2356 // Find the closest hoist point by looking at all users of FOR and selecting
2357 // the recipe dominating all other users.
2358 for (VPUser *U : FOR->users()) {
2359 auto *R = cast<VPRecipeBase>(U);
2360 if (!HoistPoint || VPDT.properlyDominates(R, HoistPoint))
2361 HoistPoint = R;
2362 }
2363 assert(all_of(FOR->users(),
2364 [&VPDT, HoistPoint](VPUser *U) {
2365 auto *R = cast<VPRecipeBase>(U);
2366 return HoistPoint == R ||
2367 VPDT.properlyDominates(HoistPoint, R);
2368 }) &&
2369 "HoistPoint must dominate all users of FOR");
2370
// Returns the defining recipe of HoistCandidateV if it still needs to be
// hoisted above HoistPoint, or nullptr if no hoisting is required.
2371 auto NeedsHoisting = [HoistPoint, &VPDT,
2372 &Visited](VPValue *HoistCandidateV) -> VPRecipeBase * {
2373 VPRecipeBase *HoistCandidate = HoistCandidateV->getDefiningRecipe();
2374 if (!HoistCandidate)
2375 return nullptr;
2376 VPRegionBlock *EnclosingLoopRegion =
2377 HoistCandidate->getParent()->getEnclosingLoopRegion();
2378 assert((!HoistCandidate->getRegion() ||
2379 HoistCandidate->getRegion() == EnclosingLoopRegion) &&
2380 "CFG in VPlan should still be flat, without replicate regions");
2381 // Hoist candidate was already visited, no need to hoist.
2382 if (!Visited.insert(HoistCandidate).second)
2383 return nullptr;
2384
2385 // Candidate is outside loop region or a header phi, dominates FOR users w/o
2386 // hoisting.
2387 if (!EnclosingLoopRegion || isa<VPHeaderPHIRecipe>(HoistCandidate))
2388 return nullptr;
2389
2390 // If we reached a recipe that dominates HoistPoint, we don't need to
2391 // hoist the recipe.
2392 if (VPDT.properlyDominates(HoistCandidate, HoistPoint))
2393 return nullptr;
2394 return HoistCandidate;
2395 };
2396
2397 if (!NeedsHoisting(Previous->getVPSingleValue()))
2398 return true;
2399
2400 // Recursively try to hoist Previous and its operands before all users of FOR.
2401 HoistCandidates.push_back(Previous);
2402
// Worklist-style closure over operands; HoistCandidates grows while
// iterating.
2403 for (unsigned I = 0; I != HoistCandidates.size(); ++I) {
2404 VPRecipeBase *Current = HoistCandidates[I];
2405 assert(Current->getNumDefinedValues() == 1 &&
2406 "only recipes with a single defined value expected");
2407 if (cannotHoistOrSinkRecipe(*Current))
2408 return false;
2409
2410 for (VPValue *Op : Current->operands()) {
2411 // If we reach FOR, it means the original Previous depends on some other
2412 // recurrence that in turn depends on FOR. If that is the case, we would
2413 // also need to hoist recipes involving the other FOR, which may break
2414 // dependencies.
2415 if (Op == FOR)
2416 return false;
2417
2418 if (auto *R = NeedsHoisting(Op)) {
2419 // Bail out if the recipe defines multiple values.
2420 // TODO: Hoisting such recipes requires additional handling.
2421 if (R->getNumDefinedValues() != 1)
2422 return false;
2423 HoistCandidates.push_back(R);
2424 }
2425 }
2426 }
2427
2428 // Order recipes to hoist by dominance so earlier instructions are processed
2429 // first.
2430 sort(HoistCandidates, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
2431 return VPDT.properlyDominates(A, B);
2432 });
2433
2434 for (VPRecipeBase *HoistCandidate : HoistCandidates) {
2435 HoistCandidate->moveBefore(*HoistPoint->getParent(),
2436 HoistPoint->getIterator());
2437 }
2438
2439 return true;
2440}
2441
// NOTE(review): this is the body of the fixed-order-recurrence adjustment
// entry point; file lines 2442, 2446, 2448-2449, 2453, 2458, 2479, 2492 and
// 2496 were dropped in extraction, including the first signature line
// (presumably `bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan
// &Plan,`), the RecurrencePhis/SeenPhis declarations and parts of the
// pattern matches below. Verify against upstream.
2443 VPBuilder &LoopBuilder) {
2444 VPDominatorTree VPDT(Plan);
2445
// Collect all fixed-order recurrence phis up front, as the loop below
// modifies the plan while iterating.
2447 for (VPRecipeBase &R :
2450 RecurrencePhis.push_back(FOR);
2451
2452 for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
2454 VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
2455 // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
2456 // to terminate.
2457 while (auto *PrevPhi =
2459 assert(PrevPhi->getParent() == FOR->getParent());
2460 assert(SeenPhis.insert(PrevPhi).second);
2461 Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
2462 }
2463
// Either all users of FOR can be sunk after Previous, or Previous (and its
// operands) can be hoisted before FOR's users; otherwise give up.
2464 if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT) &&
2465 !hoistPreviousBeforeFORUsers(FOR, Previous, VPDT))
2466 return false;
2467
2468 // Introduce a recipe to combine the incoming and previous values of a
2469 // fixed-order recurrence.
2470 VPBasicBlock *InsertBlock =
2471 Previous ? Previous->getParent() : FOR->getParent();
2472 if (!Previous || isa<VPHeaderPHIRecipe>(Previous))
2473 LoopBuilder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
2474 else
2475 LoopBuilder.setInsertPoint(InsertBlock,
2476 std::next(Previous->getIterator()));
2477
2478 auto *RecurSplice =
2480 {FOR, FOR->getBackedgeValue()});
2481
2482 FOR->replaceAllUsesWith(RecurSplice);
2483 // Set the first operand of RecurSplice to FOR again, after replacing
2484 // all users.
2485 RecurSplice->setOperand(0, FOR);
2486
2487 // Check for users extracting at the penultimate active lane of the FOR.
2488 // If only a single lane is active in the current iteration, we need to
2489 // select the last element from the previous iteration (from the FOR phi
2490 // directly).
2491 for (VPUser *U : RecurSplice->users()) {
2493 m_Specific(RecurSplice))))
2494 continue;
2495
2497 VPValue *LastActiveLane = cast<VPInstruction>(U)->getOperand(0);
2498 VPValue *Zero = Plan.getConstantInt(64, 0);
2499 VPValue *One = Plan.getConstantInt(64, 1);
// Penultimate lane of the current iteration...
2500 VPValue *PenultimateIndex = B.createSub(LastActiveLane, One);
2501 VPValue *PenultimateLastIter =
2502 B.createNaryOp(VPInstruction::ExtractLane,
2503 {PenultimateIndex, FOR->getBackedgeValue()});
// ...or, if only lane 0 is active, the last lane of the previous iteration.
2504 VPValue *LastPrevIter =
2505 B.createNaryOp(VPInstruction::ExtractLastLane, FOR);
2506
2507 VPValue *Cmp = B.createICmp(CmpInst::ICMP_EQ, LastActiveLane, Zero);
2508 VPValue *Sel = B.createSelect(Cmp, LastPrevIter, PenultimateLastIter);
2509 cast<VPInstruction>(U)->replaceAllUsesWith(Sel);
2510 }
2511 }
2512 return true;
2513}
2514
// NOTE(review): file lines 2515, 2517 and 2523 are missing from this
// extraction - presumably the function signature (a wrap-flag-clearing pass
// over reduction phis), the range expression of the phi loop, and the tail
// of the RecurKind filter condition. Verify against upstream.
2516 for (VPRecipeBase &R :
2518 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
2519 if (!PhiR)
2520 continue;
// Only integer add/mul/sub-style reductions carry poison-generating wrap
// flags that must be dropped.
2521 RecurKind RK = PhiR->getRecurrenceKind();
2522 if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
2524 continue;
2525
// Drop nuw/nsw (and similar) flags on the whole use-chain of the reduction
// phi, as reordering reduction operations may otherwise introduce poison.
2526 for (VPUser *U : collectUsersRecursively(PhiR))
2527 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
2528 RecWithFlags->dropPoisonGeneratingFlags();
2529 }
2530 }
2531}
2532
2533namespace {
// DenseMap traits used by cse() below: two recipes hash/compare equal when
// they would produce the same value.
// NOTE(review): file lines 2544, 2582, 2594, 2596 and 2599 were dropped in
// extraction - presumably the TypeSwitch opening line, the operand-range
// hash component, the opcode/intrinsic-ID and operand-count comparisons and
// an assert - verify against upstream.
2534struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
2535 static bool isSentinel(const VPSingleDefRecipe *Def) {
2536 return Def == getEmptyKey() || Def == getTombstoneKey();
2537 }
2538
2539 /// If recipe \p R will lower to a GEP with a non-i8 source element type,
2540 /// return that source element type.
2541 static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) {
2542 // All VPInstructions that lower to GEPs must have the i8 source element
2543 // type (as they are PtrAdds), so we omit it.
2545 .Case([](const VPReplicateRecipe *I) -> Type * {
2546 if (auto *GEP = dyn_cast<GetElementPtrInst>(I->getUnderlyingValue()))
2547 return GEP->getSourceElementType();
2548 return nullptr;
2549 })
2550 .Case<VPVectorPointerRecipe, VPWidenGEPRecipe>(
2551 [](auto *I) { return I->getSourceElementType(); })
2552 .Default([](auto *) { return nullptr; });
2553 }
2554
2555 /// Returns true if recipe \p Def can be safely handled for CSE.
2556 static bool canHandle(const VPSingleDefRecipe *Def) {
2557 // We can extend the list of handled recipes in the future,
2558 // provided we account for the data embedded in them while checking for
2559 // equality or hashing.
2560 auto C = getOpcodeOrIntrinsicID(Def);
2561
2562 // The issue with (Insert|Extract)Value is that the index of the
2563 // insert/extract is not a proper operand in LLVM IR, and hence also not in
2564 // VPlan.
2565 if (!C || (!C->first && (C->second == Instruction::InsertValue ||
2566 C->second == Instruction::ExtractValue)))
2567 return false;
2568
2569 // During CSE, we can only handle recipes that don't read from memory: if
2570 // they read from memory, there could be an intervening write to memory
2571 // before the next instance is CSE'd, leading to an incorrect result.
2572 return !Def->mayReadFromMemory();
2573 }
2574
2575 /// Hash the underlying data of \p Def.
2576 static unsigned getHashValue(const VPSingleDefRecipe *Def) {
2577 const VPlan *Plan = Def->getParent()->getPlan();
2578 VPTypeAnalysis TypeInfo(*Plan);
2579 hash_code Result = hash_combine(
2580 Def->getVPRecipeID(), getOpcodeOrIntrinsicID(Def),
2581 getGEPSourceElementType(Def), TypeInfo.inferScalarType(Def),
// Predicates are part of a compare's identity, so mix them into the hash.
2583 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def))
2584 if (RFlags->hasPredicate())
2585 return hash_combine(Result, RFlags->getPredicate());
2586 return Result;
2587 }
2588
2589 /// Check equality of underlying data of \p L and \p R.
2590 static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
2591 if (isSentinel(L) || isSentinel(R))
2592 return L == R;
2593 if (L->getVPRecipeID() != R->getVPRecipeID() ||
2595 getGEPSourceElementType(L) != getGEPSourceElementType(R) ||
2597 !equal(L->operands(), R->operands()))
2598 return false;
2600 "must have valid opcode info for both recipes");
2601 if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
2602 if (LFlags->hasPredicate() &&
2603 LFlags->getPredicate() !=
2604 cast<VPRecipeWithIRFlags>(R)->getPredicate())
2605 return false;
2606 // Recipes in replicate regions implicitly depend on predicate. If either
2607 // recipe is in a replicate region, only consider them equal if both have
2608 // the same parent.
2609 const VPRegionBlock *RegionL = L->getRegion();
2610 const VPRegionBlock *RegionR = R->getRegion();
2611 if (((RegionL && RegionL->isReplicator()) ||
2612 (RegionR && RegionR->isReplicator())) &&
2613 L->getParent() != R->getParent())
2614 return false;
// Finally, the inferred scalar result types must agree.
2615 const VPlan *Plan = L->getParent()->getPlan();
2616 VPTypeAnalysis TypeInfo(*Plan);
2617 return TypeInfo.inferScalarType(L) == TypeInfo.inferScalarType(R);
2618 }
2619};
2620} // end anonymous namespace
2621
2622/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
2623/// Plan.
// NOTE(review): file lines 2624, 2626 and 2628 are missing from this
// extraction - presumably the signature (`static void cse(VPlan &Plan) {`),
// the declaration of CSEMap (a DenseMap keyed with VPCSEDenseMapInfo) and
// the opening of the loop over the plan's basic blocks. Verify upstream.
2625 VPDominatorTree VPDT(Plan);
2627
2629 vp_depth_first_deep(Plan.getEntry()))) {
2630 for (VPRecipeBase &R : *VPBB) {
2631 auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
2632 if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
2633 continue;
// If an equivalent recipe was seen before, reuse it instead of Def.
2634 if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
2635 // V must dominate Def for a valid replacement.
2636 if (!VPDT.dominates(V->getParent(), VPBB))
2637 continue;
2638 // Only keep flags present on both V and Def.
2639 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
2640 RFlags->intersectFlags(*cast<VPRecipeWithIRFlags>(Def));
2641 Def->replaceAllUsesWith(V);
2642 continue;
2643 }
2644 CSEMap[Def] = Def;
2645 }
2646 }
2647}
2648
2649/// Move loop-invariant recipes out of the vector loop region in \p Plan.
// NOTE(review): file lines 2661, 2664, 2680 and 2683 were dropped in
// extraction - presumably the two blocksOnly<VPBasicBlock> loop headers and
// the legality checks (e.g. cannotHoistOrSinkRecipe) guarding the two
// `continue;` statements below. Verify against upstream.
2650static void licm(VPlan &Plan) {
2651 VPBasicBlock *Preheader = Plan.getVectorPreheader();
2652
2653 // Hoist any loop invariant recipes from the vector loop region to the
2654 // preheader. Perform a shallow traversal of the vector loop region, to
2655 // exclude recipes in replicate regions. Since the top-level blocks in the
2656 // vector loop region are guaranteed to execute if the vector pre-header is,
2657 // we don't need to check speculation safety.
2658 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2659 assert(Preheader->getSingleSuccessor() == LoopRegion &&
2660 "Expected vector prehader's successor to be the vector loop region");
2662 vp_depth_first_shallow(LoopRegion->getEntry()))) {
2663 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2665 continue;
// A recipe is invariant only if all operands are defined outside any loop
// region.
2666 if (any_of(R.operands(), [](VPValue *Op) {
2667 return !Op->isDefinedOutsideLoopRegions();
2668 }))
2669 continue;
2670 R.moveBefore(*Preheader, Preheader->end());
2671 }
2672 }
2673
2674#ifndef NDEBUG
2675 VPDominatorTree VPDT(Plan);
2676#endif
2677 // Sink recipes with no users inside the vector loop region if all users are
2678 // in the same exit block of the region.
2679 // TODO: Extend to sink recipes from inner loops.
2681 vp_post_order_shallow(LoopRegion->getEntry()))) {
2682 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
2684 continue;
2685
2686 // TODO: Support sinking VPReplicateRecipe after ensuring replicateByVF
2687 // handles sunk recipes correctly.
2688 if (isa<VPReplicateRecipe>(&R))
2689 continue;
2690
2691 // TODO: Use R.definedValues() instead of casting to VPSingleDefRecipe to
2692 // support recipes with multiple defined values (e.g., interleaved loads).
2693 auto *Def = cast<VPSingleDefRecipe>(&R);
2694 // Skip recipes without users as we cannot determine a sink block.
2695 // TODO: Clone sinkable recipes without users to all exit blocks to reduce
2696 // their execution frequency.
2697 if (Def->getNumUsers() == 0)
2698 continue;
2699
2700 VPBasicBlock *SinkBB = nullptr;
2701 // Cannot sink the recipe if any user
2702 // * is defined in any loop region, or
2703 // * is a phi, or
2704 // * multiple users in different blocks.
2705 if (any_of(Def->users(), [&SinkBB](VPUser *U) {
2706 auto *UserR = cast<VPRecipeBase>(U);
2707 VPBasicBlock *Parent = UserR->getParent();
2708 // TODO: If the user is a PHI node, we should check the block of
2709 // incoming value. Support PHI node users if needed.
2710 if (UserR->isPhi() || Parent->getEnclosingLoopRegion())
2711 return true;
2712 // TODO: Support sinking when users are in multiple blocks.
2713 if (SinkBB && SinkBB != Parent)
2714 return true;
2715 SinkBB = Parent;
2716 return false;
2717 }))
2718 continue;
2719
2720 // Only sink to dedicated exit blocks of the loop region.
2721 if (SinkBB->getSinglePredecessor() != LoopRegion)
2722 continue;
2723
2724 // TODO: This will need to be a check instead of an assert after
2725 // conditional branches in vectorized loops are supported.
2726 assert(VPDT.properlyDominates(VPBB, SinkBB) &&
2727 "Defining block must dominate sink block");
2728 // TODO: Clone the recipe if users are on multiple exit paths, instead of
2729 // just moving.
2730 Def->moveBefore(*SinkBB, SinkBB->getFirstNonPhi());
2731 }
2732 }
2733}
2734
// Tail of VPlanTransforms::truncateToMinimalBitwidths (the signature line of
// the original file is not present in this extraction). For each recipe with
// an entry in MinBWs, the computation is narrowed to the recorded bit-width:
// poison-generating flags are dropped, the result is zero-extended back to
// its original type (ICmps excepted), and operands are truncated to the new
// type, re-using previously created truncates.
// NOTE(review): original lines 2743, 2746-2747, 2749-2750, 2763, 2795 and
// 2804 are missing from this extraction — including what appears to be the
// declaration of the ProcessedTruncs map used below and several guard
// conditions preceding the bare `continue;` statements. Restore from the
// upstream file before treating this listing as compilable.
2736 VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
2737 if (Plan.hasScalarVFOnly())
2738 return;
2739 // Keep track of created truncates, so they can be re-used. Note that we
2740 // cannot use RAUW after creating a new truncate, as this would could make
2741 // other uses have different types for their operands, making them invalidly
2742 // typed.
2744 VPTypeAnalysis TypeInfo(Plan);
2745 VPBasicBlock *PH = Plan.getVectorPreheader();
2748 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2751 continue;
2752
2753 VPValue *ResultVPV = R.getVPSingleValue();
2754 auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
2755 unsigned NewResSizeInBits = MinBWs.lookup(UI);
2756 if (!NewResSizeInBits)
2757 continue;
2758
2759 // If the value wasn't vectorized, we must maintain the original scalar
2760 // type. Skip those here, after incrementing NumProcessedRecipes. Also
2761 // skip casts which do not need to be handled explicitly here, as
2762 // redundant casts will be removed during recipe simplification.
2764 continue;
2765
2766 Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
2767 unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
2768 assert(OldResTy->isIntegerTy() && "only integer types supported");
2769 (void)OldResSizeInBits;
2770
2771 auto *NewResTy = IntegerType::get(Plan.getContext(), NewResSizeInBits);
2772
2773 // Any wrapping introduced by shrinking this operation shouldn't be
2774 // considered undefined behavior. So, we can't unconditionally copy
2775 // arithmetic wrapping flags to VPW.
2776 if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
2777 VPW->dropPoisonGeneratingFlags();
2778
2779 if (OldResSizeInBits != NewResSizeInBits &&
2780 !match(&R, m_ICmp(m_VPValue(), m_VPValue()))) {
2781 // Extend result to original width.
2782 auto *Ext = new VPWidenCastRecipe(
2783 Instruction::ZExt, ResultVPV, OldResTy, nullptr,
2784 VPIRFlags::getDefaultFlags(Instruction::ZExt));
2785 Ext->insertAfter(&R);
2786 ResultVPV->replaceAllUsesWith(Ext);
// The extend must consume the (to-be-narrowed) result, not itself; RAUW above
// also rewired Ext's own operand, so it is restored here explicitly.
2787 Ext->setOperand(0, ResultVPV);
2788 assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
2789 } else {
2790 assert(match(&R, m_ICmp(m_VPValue(), m_VPValue())) &&
2791 "Only ICmps should not need extending the result.");
2792 }
2793
2794 assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
2796 continue;
2797
2798 // Shrink operands by introducing truncates as needed.
// Skip the select condition operand (index 0) — only its value operands are
// narrowed.
2799 unsigned StartIdx =
2800 match(&R, m_Select(m_VPValue(), m_VPValue(), m_VPValue())) ? 1 : 0;
2801 for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
2802 auto *Op = R.getOperand(Idx);
2803 unsigned OpSizeInBits =
2805 if (OpSizeInBits == NewResSizeInBits)
2806 continue;
2807 assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
2808 auto [ProcessedIter, IterIsEmpty] = ProcessedTruncs.try_emplace(Op);
2809 if (!IterIsEmpty) {
2810 R.setOperand(Idx, ProcessedIter->second);
2811 continue;
2812 }
2813
// Live-in IR values are truncated once in the preheader; loop-defined values
// are truncated right before their use.
2814 VPBuilder Builder;
2815 if (isa<VPIRValue>(Op))
2816 Builder.setInsertPoint(PH);
2817 else
2818 Builder.setInsertPoint(&R);
2819 VPWidenCastRecipe *NewOp =
2820 Builder.createWidenCast(Instruction::Trunc, Op, NewResTy);
2821 ProcessedIter->second = NewOp;
2822 R.setOperand(Idx, NewOp);
2823 }
2824
2825 }
2826 }
2827}
2828
// Body of a transform that folds away BranchOnCond terminators whose
// condition is a known constant. The enclosing function signature and the
// loop header over basic blocks (original lines 2829-2831) are missing from
// this extraction; the code below is kept byte-identical.
// For a branch on true the false successor (index 1) is removed; for a
// branch on false the true successor (index 0) is removed. Phi incoming
// values from VPBB are dropped before the edge is disconnected.
2832 VPValue *Cond;
2833 // Skip blocks that are not terminated by BranchOnCond.
2834 if (VPBB->empty() || !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
2835 continue;
2836
2837 assert(VPBB->getNumSuccessors() == 2 &&
2838 "Two successors expected for BranchOnCond");
2839 unsigned RemovedIdx;
2840 if (match(Cond, m_True()))
2841 RemovedIdx = 1;
2842 else if (match(Cond, m_False()))
2843 RemovedIdx = 0;
2844 else
2845 continue;
2846
2847 VPBasicBlock *RemovedSucc =
2848 cast<VPBasicBlock>(VPBB->getSuccessors()[RemovedIdx]);
2849 assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
2850 "There must be a single edge between VPBB and its successor");
2851 // Values coming from VPBB into phi recipes of RemoveSucc are removed from
2852 // these recipes.
2853 for (VPRecipeBase &R : RemovedSucc->phis())
2854 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB);
2855
2856 // Disconnect blocks and remove the terminator. RemovedSucc will be deleted
2857 // automatically on VPlan destruction if it becomes unreachable.
2858 VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
2859 VPBB->back().eraseFromParent();
2860 }
2861}
2862
2884
// NOTE(review): original lines 2908-2909 (the function signature, apparently
// static addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan)), 2944 (the
// right-hand side of the `LaneMaskPhi` initialization) and 2952 (the opcode
// argument of the second createOverflowingOp call) are missing from this
// extraction; everything else below is kept byte-identical.
2885// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
2886// the loop terminator with a branch-on-cond recipe with the negated
2887// active-lane-mask as operand. Note that this turns the loop into an
2888// uncountable one. Only the existing terminator is replaced, all other existing
2889// recipes/users remain unchanged, except for poison-generating flags being
2890// dropped from the canonical IV increment. Return the created
2891// VPActiveLaneMaskPHIRecipe.
2892//
2893// The function adds the following recipes:
2894//
2895// vector.ph:
2896// %EntryInc = canonical-iv-increment-for-part CanonicalIVStart
2897// %EntryALM = active-lane-mask %EntryInc, TC
2898//
2899// vector.body:
2900// ...
2901// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
2902// ...
2903// %InLoopInc = canonical-iv-increment-for-part CanonicalIVIncrement
2904// %ALM = active-lane-mask %InLoopInc, TC
2905// %Negated = Not %ALM
2906// branch-on-cond %Negated
2907//
2910 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
2911 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
2912 auto *CanonicalIVPHI = TopRegion->getCanonicalIV();
2913 VPValue *StartV = CanonicalIVPHI->getStartValue();
2914
2915 auto *CanonicalIVIncrement =
2916 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
2917 // TODO: Check if dropping the flags is needed.
2918 CanonicalIVIncrement->dropPoisonGeneratingFlags();
2919 DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
2920 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
2921 // we have to take unrolling into account. Each part needs to start at
2922 // Part * VF
2923 auto *VecPreheader = Plan.getVectorPreheader();
2924 VPBuilder Builder(VecPreheader);
2925
2926 // Create the ActiveLaneMask instruction using the correct start values.
2927 VPValue *TC = Plan.getTripCount();
2928 VPValue *VF = &Plan.getVF();
2929
2930 auto *EntryIncrement = Builder.createOverflowingOp(
2931 VPInstruction::CanonicalIVIncrementForPart, {StartV, VF}, {false, false},
2932 DL, "index.part.next");
2933
2934 // Create the active lane mask instruction in the VPlan preheader.
2935 VPValue *ALMMultiplier =
2936 Plan.getConstantInt(TopRegion->getCanonicalIVType(), 1);
2937 auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2938 {EntryIncrement, TC, ALMMultiplier}, DL,
2939 "active.lane.mask.entry");
2940
2941 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
2942 // preheader ActiveLaneMask instruction.
2943 auto *LaneMaskPhi =
2945 LaneMaskPhi->insertAfter(CanonicalIVPHI);
2946
2947 // Create the active lane mask for the next iteration of the loop before the
2948 // original terminator.
2949 VPRecipeBase *OriginalTerminator = EB->getTerminator();
2950 Builder.setInsertPoint(OriginalTerminator);
2951 auto *InLoopIncrement = Builder.createOverflowingOp(
2953 {CanonicalIVIncrement, &Plan.getVF()}, {false, false}, DL);
2954 auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2955 {InLoopIncrement, TC, ALMMultiplier}, DL,
2956 "active.lane.mask.next");
2957 LaneMaskPhi->addOperand(ALM);
2958
2959 // Replace the original terminator with BranchOnCond. We have to invert the
2960 // mask here because a true condition means jumping to the exit block.
2961 auto *NotMask = Builder.createNot(ALM, DL);
2962 Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
2963 OriginalTerminator->eraseFromParent();
2964 return LaneMaskPhi;
2965}
2966
// Tail of VPlanTransforms::addActiveLaneMask. Replaces the single header
// mask with an active-lane-mask: either a full phi/branch rewrite via
// addVPLaneMaskPhiAndUpdateExitBranch (when controlling the exit), or a
// plain ActiveLaneMask instruction inserted after the widened canonical IV.
// NOTE(review): original lines 2967 (start of the signature) and 2971 (the
// range argument of the find_if call, presumably the users of the canonical
// IV) are missing from this extraction.
2968 bool UseActiveLaneMaskForControlFlow) {
2969 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2970 auto *FoundWidenCanonicalIVUser = find_if(
2972 assert(FoundWidenCanonicalIVUser &&
2973 "Must have widened canonical IV when tail folding!");
2974 VPSingleDefRecipe *HeaderMask = vputils::findHeaderMask(Plan);
2975 auto *WideCanonicalIV =
2976 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2977 VPSingleDefRecipe *LaneMask;
2978 if (UseActiveLaneMaskForControlFlow) {
2979 LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(Plan);
2980 } else {
2981 VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
2982 VPValue *ALMMultiplier =
2983 Plan.getConstantInt(LoopRegion->getCanonicalIVType(), 1);
2984 LaneMask =
2985 B.createNaryOp(VPInstruction::ActiveLaneMask,
2986 {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
2987 nullptr, "active.lane.mask");
2988 }
2989
2990 // Walk users of WideCanonicalIV and replace the header mask of the form
2991 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
2992 // removing the old one to ensure there is always only a single header mask.
2993 HeaderMask->replaceAllUsesWith(LaneMask);
2994 HeaderMask->eraseFromParent();
2995}
2996
// Pattern-match helper: matches either exactly the mask \p In (reporting a
// remaining mask of nullptr), or (logical-and In, X), binding the remaining
// part X to \p Out.
// NOTE(review): original line 2999 is missing from this extraction — it
// presumably declared the `Out` member (`Op1_t &Out;`) that the constructor
// and match() below reference.
2997 template <typename Op0_t, typename Op1_t> struct RemoveMask_match {
2998 Op0_t In;
3000
3001 RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {}
3002
3003 template <typename OpTy> bool match(OpTy *V) const {
// Exact match of the mask itself: no residual mask remains.
3004 if (m_Specific(In).match(V)) {
3005 Out = nullptr;
3006 return true;
3007 }
// Otherwise accept (logical-and In, Out), capturing the residual mask.
3008 return m_LogicalAnd(m_Specific(In), m_VPValue(Out)).match(V);
3009 }
3010};
3011
3012/// Match a specific mask \p In, or a combination of it (logical-and In, Out).
3013/// Returns the remaining part \p Out if so, or nullptr otherwise.
// Factory wrapper so callers can write m_RemoveMask(HeaderMask, Mask) inside
// match() expressions; \p Out is bound by reference and is written by the
// matcher on a successful match.
3014 template <typename Op0_t, typename Op1_t>
3015 static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In,
3016 Op1_t &Out) {
3017 return RemoveMask_match<Op0_t, Op1_t>(In, Out);
3018}
3019
// NOTE(review): original line 3028 — the first line of the signature, which
// names this static helper (referred to as optimizeMaskToEVL at its call
// site below in this file) — is missing from this extraction.
3020/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
3021/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
3022/// recipe could be created.
3023/// \p HeaderMask Header Mask.
3024/// \p CurRecipe Recipe to be transform.
3025/// \p TypeInfo VPlan-based type analysis.
3026/// \p EVL The explicit vector length parameter of vector-predication
3027/// intrinsics.
3029 VPRecipeBase &CurRecipe,
3030 VPTypeAnalysis &TypeInfo, VPValue &EVL) {
3031 VPlan *Plan = CurRecipe.getParent()->getPlan();
3032 DebugLoc DL = CurRecipe.getDebugLoc();
3033 VPValue *Addr, *Mask, *EndPtr;
3034
3035 /// Adjust any end pointers so that they point to the end of EVL lanes not VF.
3036 auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) {
3037 auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(EndPtr)->clone();
3038 EVLEndPtr->insertBefore(&CurRecipe);
// Operand 1 of the cloned end-pointer recipe is replaced with the EVL.
3039 EVLEndPtr->setOperand(1, &EVL);
3040 return EVLEndPtr;
3041 };
3042
// Case 1: forward masked load -> VPWidenLoadEVLRecipe.
3043 if (match(&CurRecipe,
3044 m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))) &&
3045 !cast<VPWidenLoadRecipe>(CurRecipe).isReverse())
3046 return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr,
3047 EVL, Mask);
3048
// Case 2: reverse(masked load via vector-end-pointer) -> EVL load followed
// by llvm.experimental.vp.reverse.
3049 VPValue *ReversedVal;
3050 if (match(&CurRecipe, m_Reverse(m_VPValue(ReversedVal))) &&
3051 match(ReversedVal,
3052 m_MaskedLoad(m_VPValue(EndPtr), m_RemoveMask(HeaderMask, Mask))) &&
3053 match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
3054 cast<VPWidenLoadRecipe>(ReversedVal)->isReverse()) {
3055 auto *LoadR = new VPWidenLoadEVLRecipe(
3056 *cast<VPWidenLoadRecipe>(ReversedVal), AdjustEndPtr(EndPtr), EVL, Mask);
3057 LoadR->insertBefore(&CurRecipe);
3058 return new VPWidenIntrinsicRecipe(
3059 Intrinsic::experimental_vp_reverse, {LoadR, Plan->getTrue(), &EVL},
3060 TypeInfo.inferScalarType(LoadR), {}, {}, DL);
3061 }
3062
// Case 3: forward masked store -> VPWidenStoreEVLRecipe.
3063 VPValue *StoredVal;
3064 if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(StoredVal),
3065 m_RemoveMask(HeaderMask, Mask))) &&
3066 !cast<VPWidenStoreRecipe>(CurRecipe).isReverse())
3067 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
3068 StoredVal, EVL, Mask);
3069
// Case 4: reverse store — reverse the stored value with vp.reverse, then
// emit an EVL store through the adjusted end pointer.
3070 if (match(&CurRecipe,
3071 m_MaskedStore(m_VPValue(EndPtr), m_Reverse(m_VPValue(ReversedVal)),
3072 m_RemoveMask(HeaderMask, Mask))) &&
3073 match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
3074 cast<VPWidenStoreRecipe>(CurRecipe).isReverse()) {
3075 auto *NewReverse = new VPWidenIntrinsicRecipe(
3076 Intrinsic::experimental_vp_reverse,
3077 {ReversedVal, Plan->getTrue(), &EVL},
3078 TypeInfo.inferScalarType(ReversedVal), {}, {}, DL);
3079 NewReverse->insertBefore(&CurRecipe);
3080 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe),
3081 AdjustEndPtr(EndPtr), NewReverse, EVL,
3082 Mask);
3083 }
3084
// Case 5: conditional reduction masked by the header mask.
3085 if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe))
3086 if (Rdx->isConditional() &&
3087 match(Rdx->getCondOp(), m_RemoveMask(HeaderMask, Mask)))
3088 return new VPReductionEVLRecipe(*Rdx, EVL, Mask);
3089
// Case 6: masked interleave group.
3090 if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(&CurRecipe))
3091 if (Interleave->getMask() &&
3092 match(Interleave->getMask(), m_RemoveMask(HeaderMask, Mask)))
3093 return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask);
3094
// Case 7: selects on the header mask (exact or residual) -> llvm.vp.merge.
3095 VPValue *LHS, *RHS;
3096 if (match(&CurRecipe,
3097 m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS))))
3098 return new VPWidenIntrinsicRecipe(
3099 Intrinsic::vp_merge, {Plan->getTrue(), LHS, RHS, &EVL},
3100 TypeInfo.inferScalarType(LHS), {}, {}, DL);
3101
3102 if (match(&CurRecipe, m_Select(m_RemoveMask(HeaderMask, Mask), m_VPValue(LHS),
3103 m_VPValue(RHS))))
3104 return new VPWidenIntrinsicRecipe(
3105 Intrinsic::vp_merge, {Mask, LHS, RHS, &EVL},
3106 TypeInfo.inferScalarType(LHS), {}, {}, DL);
3107
// Case 8: LastActiveLane of the header mask is simply EVL - 1.
3108 if (match(&CurRecipe, m_LastActiveLane(m_Specific(HeaderMask)))) {
3109 Type *Ty = TypeInfo.inferScalarType(CurRecipe.getVPSingleValue());
3110 VPValue *ZExt =
3111 VPBuilder(&CurRecipe).createScalarCast(Instruction::ZExt, &EVL, Ty, DL);
3112 return new VPInstruction(
3113 Instruction::Sub, {ZExt, Plan->getConstantInt(Ty, 1)},
3114 VPIRFlags::getDefaultFlags(Instruction::Sub), {}, DL);
3115 }
3116
3117 return nullptr;
3118}
3119
// NOTE(review): original lines 3122 (the function signature), 3125-3126 (the
// loop/match locating the `icmp ult step-vector, EVL` header mask) and 3139
// (presumably the cast of the user U to a VPRecipeBase bound to R) are
// missing from this extraction; code lines below are kept byte-identical.
3120/// Optimize away any EVL-based header masks to VP intrinsic based recipes.
3121/// The transforms here need to preserve the original semantics.
3123 // Find the EVL-based header mask if it exists: icmp ult step-vector, EVL
3124 VPValue *HeaderMask = nullptr, *EVL = nullptr;
3127 m_VPValue(EVL))) &&
3128 match(EVL, m_EVL(m_VPValue()))) {
3129 HeaderMask = R.getVPSingleValue();
3130 break;
3131 }
3132 }
3133 if (!HeaderMask)
3134 return;
3135
3136 VPTypeAnalysis TypeInfo(Plan);
3137 SmallVector<VPRecipeBase *> OldRecipes;
// Rewrite each masked user via optimizeMaskToEVL, replacing all defined
// values and deferring the erasure of the replaced recipes.
3138 for (VPUser *U : collectUsersRecursively(HeaderMask)) {
3140 if (auto *NewR = optimizeMaskToEVL(HeaderMask, *R, TypeInfo, *EVL)) {
3141 NewR->insertBefore(R);
3142 for (auto [Old, New] :
3143 zip_equal(R->definedValues(), NewR->definedValues()))
3144 Old->replaceAllUsesWith(New);
3145 OldRecipes.push_back(R);
3146 }
3147 }
3148 // Erase old recipes at the end so we don't invalidate TypeInfo.
3149 for (VPRecipeBase *R : reverse(OldRecipes)) {
// Capture operands before erasure so now-unused defs can be cleaned up
// (the cleanup call on original line 3153 is missing from this extraction).
3150 SmallVector<VPValue *> PossiblyDead(R->operands());
3151 R->eraseFromParent();
3152 for (VPValue *Op : PossiblyDead)
3154 }
3155}
3156
// NOTE(review): several original lines are missing from this extraction:
// 3166-3167 (body of the all_of predicate over VF users), 3170 (the
// replaceUsesWithIf predicate body), 3185 (the VFxUF predicate body), 3191
// (the any_of detecting fixed-order recurrence phis), 3205-3206 (the block
// traversal loop), 3210 (the matched FirstOrderRecurrenceSplice-style
// opcode), 3214-3215 (the immediate/recipe construction feeding VPSplice)
// and 3239 (the ICmp predicate argument). Code lines are kept byte-identical.
3157/// After replacing the canonical IV with a EVL-based IV, fixup recipes that use
3158/// VF to use the EVL instead to avoid incorrect updates on the penultimate
3159/// iteration.
3160static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL) {
3161 VPTypeAnalysis TypeInfo(Plan);
3162 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3163 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3164
3165 assert(all_of(Plan.getVF().users(),
3168 "User of VF that we can't transform to EVL.");
3169 Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
3171 });
3172
3173 assert(all_of(Plan.getVFxUF().users(),
3174 [&LoopRegion, &Plan](VPUser *U) {
3175 return match(U,
3176 m_c_Add(m_Specific(LoopRegion->getCanonicalIV()),
3177 m_Specific(&Plan.getVFxUF()))) ||
3178 isa<VPWidenPointerInductionRecipe>(U);
3179 }) &&
3180 "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
3181 "increment of the canonical induction.");
3182 Plan.getVFxUF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
3183 // Only replace uses in VPWidenPointerInductionRecipe; The increment of the
3184 // canonical induction must not be updated.
3186 });
3187
3188 // Create a scalar phi to track the previous EVL if fixed-order recurrence is
3189 // contained.
3190 bool ContainsFORs =
3192 if (ContainsFORs) {
3193 // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
3194 VPValue *MaxEVL = &Plan.getVF();
3195 // Emit VPScalarCastRecipe in preheader if VF is not a 32 bits integer.
3196 VPBuilder Builder(LoopRegion->getPreheaderVPBB());
3197 MaxEVL = Builder.createScalarZExtOrTrunc(
3198 MaxEVL, Type::getInt32Ty(Plan.getContext()),
3199 TypeInfo.inferScalarType(MaxEVL), DebugLoc::getUnknown());
3200
// prev.evl carries last iteration's EVL into the current one; it feeds
// llvm.experimental.vp.splice below.
3201 Builder.setInsertPoint(Header, Header->getFirstNonPhi());
3202 VPValue *PrevEVL = Builder.createScalarPhi(
3203 {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl");
3204
3207 for (VPRecipeBase &R : *VPBB) {
3208 VPValue *V1, *V2;
3209 if (!match(&R,
3211 m_VPValue(V1), m_VPValue(V2))))
3212 continue;
3213 VPValue *Imm = Plan.getOrAddLiveIn(
3216 Intrinsic::experimental_vp_splice,
3217 {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},
3218 TypeInfo.inferScalarType(R.getVPSingleValue()), {}, {},
3219 R.getDebugLoc());
3220 VPSplice->insertBefore(&R);
3221 R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
3222 }
3223 }
3224 }
3225
3226 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
3227 if (!HeaderMask)
3228 return;
3229
3230 // Replace header masks with a mask equivalent to predicating by EVL:
3231 //
3232 // icmp ule widen-canonical-iv backedge-taken-count
3233 // ->
3234 // icmp ult step-vector, EVL
3235 VPRecipeBase *EVLR = EVL.getDefiningRecipe();
3236 VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
3237 Type *EVLType = TypeInfo.inferScalarType(&EVL);
3238 VPValue *EVLMask = Builder.createICmp(
3240 Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
3241 HeaderMask->replaceAllUsesWith(EVLMask);
3242}
3243
// NOTE(review): original lines 3298 (the first line of the signature —
// this is VPlanTransforms::addExplicitVectorLength per the doc comment) and
// 3311 (the right-hand side of the `CurrentIteration` initialization,
// presumably constructing the VPCurrentIterationPHIRecipe) are missing from
// this extraction; code lines below are kept byte-identical.
3244/// Converts a tail folded vector loop region to step by
3245/// VPInstruction::ExplicitVectorLength elements instead of VF elements each
3246/// iteration.
3247///
3248/// - Add a VPCurrentIterationPHIRecipe and related recipes to \p Plan and
3249/// replaces all uses except the canonical IV increment of
3250/// VPCanonicalIVPHIRecipe with a VPCurrentIterationPHIRecipe.
3251/// VPCanonicalIVPHIRecipe is used only for loop iterations counting after
3252/// this transformation.
3253///
3254/// - The header mask is replaced with a header mask based on the EVL.
3255///
3256/// - Plans with FORs have a new phi added to keep track of the EVL of the
3257/// previous iteration, and VPFirstOrderRecurrencePHIRecipes are replaced with
3258/// @llvm.vp.splice.
3259///
3260/// The function uses the following definitions:
3261/// %StartV is the canonical induction start value.
3262///
3263/// The function adds the following recipes:
3264///
3265/// vector.ph:
3266/// ...
3267///
3268/// vector.body:
3269/// ...
3270/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
3271/// [ %NextIter, %vector.body ]
3272/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3273/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
3274/// ...
3275/// %OpEVL = cast i32 %VPEVL to IVSize
3276/// %NextIter = add IVSize %OpEVL, %CurrentIter
3277/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3278/// ...
3279///
3280/// If MaxSafeElements is provided, the function adds the following recipes:
3281/// vector.ph:
3282/// ...
3283///
3284/// vector.body:
3285/// ...
3286/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
3287/// [ %NextIter, %vector.body ]
3288/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3289/// %cmp = cmp ult %AVL, MaxSafeElements
3290/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
3291/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
3292/// ...
3293/// %OpEVL = cast i32 %VPEVL to IVSize
3294/// %NextIter = add IVSize %OpEVL, %CurrentIter
3295/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3296/// ...
3297///
3299 VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
3300 if (Plan.hasScalarVFOnly())
3301 return;
3302 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3303 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3304
3305 auto *CanonicalIVPHI = LoopRegion->getCanonicalIV();
3306 auto *CanIVTy = LoopRegion->getCanonicalIVType();
3307 VPValue *StartV = CanonicalIVPHI->getStartValue();
3308
3309 // Create the CurrentIteration recipe in the vector loop.
3310 auto *CurrentIteration =
3312 CurrentIteration->insertAfter(CanonicalIVPHI);
3313 VPBuilder Builder(Header, Header->getFirstNonPhi());
3314 // Create the AVL (application vector length), starting from TC -> 0 in steps
3315 // of EVL.
3316 VPPhi *AVLPhi = Builder.createScalarPhi(
3317 {Plan.getTripCount()}, DebugLoc::getCompilerGenerated(), "avl");
3318 VPValue *AVL = AVLPhi;
3319
3320 if (MaxSafeElements) {
3321 // Support for MaxSafeDist for correct loop emission.
// Clamp the AVL to MaxSafeElements: AVL = min(AVL, MaxSafeElements).
3322 VPValue *AVLSafe = Plan.getConstantInt(CanIVTy, *MaxSafeElements);
3323 VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
3324 AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
3325 "safe_avl");
3326 }
3327 auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
3328 DebugLoc::getUnknown(), "evl");
3329
3330 auto *CanonicalIVIncrement =
3331 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
3332 Builder.setInsertPoint(CanonicalIVIncrement);
3333 VPValue *OpVPEVL = VPEVL;
3334
// EVL is i32; widen/narrow it to the canonical IV type before arithmetic.
3335 auto *I32Ty = Type::getInt32Ty(Plan.getContext());
3336 OpVPEVL = Builder.createScalarZExtOrTrunc(
3337 OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
3338
3339 auto *NextIter = Builder.createAdd(OpVPEVL, CurrentIteration,
3340 CanonicalIVIncrement->getDebugLoc(),
3341 "current.iteration.next",
3342 {CanonicalIVIncrement->hasNoUnsignedWrap(),
3343 CanonicalIVIncrement->hasNoSignedWrap()});
3344 CurrentIteration->addOperand(NextIter);
3345
3346 VPValue *NextAVL =
3347 Builder.createSub(AVLPhi, OpVPEVL, DebugLoc::getCompilerGenerated(),
3348 "avl.next", {/*NUW=*/true, /*NSW=*/false});
3349 AVLPhi->addOperand(NextAVL);
3350
3351 fixupVFUsersForEVL(Plan, *VPEVL);
3352 removeDeadRecipes(Plan);
3353
3354 // Replace all uses of VPCanonicalIVPHIRecipe by
3355 // VPCurrentIterationPHIRecipe except for the canonical IV increment.
3356 CanonicalIVPHI->replaceAllUsesWith(CurrentIteration);
3357 CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
3358 // TODO: support unroll factor > 1.
3359 Plan.setUF(1);
3360}
3361
// Lowers a variable-length-stepping (EVL) loop back to plain recipes: the
// VPCurrentIterationPHIRecipe becomes a scalar phi, and the now-redundant
// canonical IV phi and its increment are removed.
// NOTE(review): original lines 3362 (the function signature), 3367-3368
// (the traversal over blocks feeding the phi scan) and 3386 (presumably the
// createScalarPhi call in the builder chain) are missing from this
// extraction; code lines below are kept byte-identical.
3363 // Find the vector loop entry by locating VPCurrentIterationPHIRecipe.
3364 // There should be only one VPCurrentIteration in the entire plan.
3365 VPCurrentIterationPHIRecipe *CurrentIteration = nullptr;
3366
3369 for (VPRecipeBase &R : VPBB->phis())
3370 if (auto *PhiR = dyn_cast<VPCurrentIterationPHIRecipe>(&R)) {
3371 assert(!CurrentIteration &&
3372 "Found multiple CurrentIteration. Only one expected");
3373 CurrentIteration = PhiR;
3374 }
3375
3376 // Early return if it is not variable-length stepping.
3377 if (!CurrentIteration)
3378 return;
3379
3380 VPBasicBlock *HeaderVPBB = CurrentIteration->getParent();
3381 VPValue *CurrentIterationIncr = CurrentIteration->getBackedgeValue();
3382
3383 // Convert CurrentIteration to concrete recipe.
3384 auto *ScalarR =
3385 VPBuilder(CurrentIteration)
3387 {CurrentIteration->getStartValue(), CurrentIterationIncr},
3388 CurrentIteration->getDebugLoc(), "current.iteration.iv");
3389 CurrentIteration->replaceAllUsesWith(ScalarR);
3390 CurrentIteration->eraseFromParent();
3391
3392 // Replace CanonicalIVInc with CurrentIteration increment.
3393 auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
3394 VPValue *Backedge = CanonicalIV->getIncomingValue(1);
3395 assert(match(Backedge, m_c_Add(m_Specific(CanonicalIV),
3396 m_Specific(&Plan.getVFxUF()))) &&
3397 "Unexpected canonical iv");
3398 Backedge->replaceAllUsesWith(CurrentIterationIncr);
3399
3400 // Remove unused phi and increment.
3401 VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe();
3402 CanonicalIVIncrement->eraseFromParent();
3403 CanonicalIV->eraseFromParent();
3404}
3405
// Rewrites the latch exit condition of an EVL tail-folded loop to compare
// the next AVL against zero (AVLNext == 0) instead of comparing the
// canonical IV increment against the vector trip count.
// NOTE(review): original lines 3406 (the function signature — the name of
// this transform is not visible in this extraction), 3410 (second half of
// the early-exit condition), 3416 (the dyn_cast initializing EVLPhi), 3434
// (start of the match binding AVLNext) and 3445-3446 (inner pattern of the
// BranchOnCond/ICmp assert) are missing; code lines are kept byte-identical.
3407 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3408 // The canonical IV may not exist at this stage.
3409 if (!LoopRegion ||
3411 return;
3412 VPCanonicalIVPHIRecipe *CanIV = LoopRegion->getCanonicalIV();
3413 if (std::next(CanIV->getIterator()) == CanIV->getParent()->end())
3414 return;
3415 // The EVL IV is always immediately after the canonical IV.
3417 std::next(CanIV->getIterator()));
3418 if (!EVLPhi)
3419 return;
3420
3421 // Bail if not an EVL tail folded loop.
3422 VPValue *AVL;
3423 if (!match(EVLPhi->getBackedgeValue(),
3424 m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi))))
3425 return;
3426
3427 // The AVL may be capped to a safe distance.
3428 VPValue *SafeAVL;
3429 if (match(AVL, m_Select(m_VPValue(), m_VPValue(SafeAVL), m_VPValue())))
3430 AVL = SafeAVL;
3431
3432 VPValue *AVLNext;
3433 [[maybe_unused]] bool FoundAVLNext =
3435 m_Specific(Plan.getTripCount()), m_VPValue(AVLNext)));
3436 assert(FoundAVLNext && "Didn't find AVL backedge?");
3437
3438 VPBasicBlock *Latch = LoopRegion->getExitingBasicBlock();
3439 auto *LatchBr = cast<VPInstruction>(Latch->getTerminator());
// An unconditional (always-true) latch branch needs no rewrite.
3440 if (match(LatchBr, m_BranchOnCond(m_True())))
3441 return;
3442
3443 assert(
3444 match(LatchBr,
3447 m_Specific(&Plan.getVectorTripCount())))) &&
3448 "Expected BranchOnCond with ICmp comparing CanIV increment with vector "
3449 "trip count");
3450
3451 Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext);
3452 VPBuilder Builder(LatchBr);
3453 LatchBr->setOperand(
3454 0, Builder.createICmp(CmpInst::ICMP_EQ, AVLNext, Plan.getZero(AVLTy)));
3455}
3456
// Tail of VPlanTransforms::replaceSymbolicStrides (signature start, original
// line 3457, is missing from this extraction). Replaces symbolic stride
// VPValues with constants proven by PSE's stride predicates — both direct
// live-ins and sext/zext'ed forms — and re-expands entry-block SCEV
// expressions under the resulting rewrite map.
// NOTE(review): original line 3483 is also missing; it presumably guards the
// user loop so only sext/zext users of the stride are handled (the
// isa<SExtInst> check below distinguishes the two).
3458 VPlan &Plan, PredicatedScalarEvolution &PSE,
3459 const DenseMap<Value *, const SCEV *> &StridesMap) {
3460 // Replace VPValues for known constant strides guaranteed by predicate scalar
3461 // evolution.
// Only replace uses inside the loop region or in its immediate predecessor
// (the preheader) — code outside runs before the stride predicate is checked.
3462 auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
3463 auto *R = cast<VPRecipeBase>(&U);
3464 return R->getRegion() ||
3465 R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
3466 };
3467 ValueToSCEVMapTy RewriteMap;
3468 for (const SCEV *Stride : StridesMap.values()) {
3469 using namespace SCEVPatternMatch;
3470 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
3471 const APInt *StrideConst;
3472 if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst)))
3473 // Only handle constant strides for now.
3474 continue;
3475
3476 auto *CI = Plan.getConstantInt(*StrideConst);
3477 if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
3478 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3479
3480 // The versioned value may not be used in the loop directly but through a
3481 // sext/zext. Add new live-ins in those cases.
3482 for (Value *U : StrideV->users()) {
3484 continue;
3485 VPValue *StrideVPV = Plan.getLiveIn(U);
3486 if (!StrideVPV)
3487 continue;
// Fold the extension into the constant at the user's width.
3488 unsigned BW = U->getType()->getScalarSizeInBits();
3489 APInt C =
3490 isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
3491 VPValue *CI = Plan.getConstantInt(C);
3492 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3493 }
3494 RewriteMap[StrideV] = PSE.getSCEV(StrideV);
3495 }
3496
// Re-expand entry-block SCEV expressions with the stride rewrites applied,
// keeping the plan's trip count pointing at the new expansion.
3497 for (VPRecipeBase &R : *Plan.getEntry()) {
3498 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
3499 if (!ExpSCEV)
3500 continue;
3501 const SCEV *ScevExpr = ExpSCEV->getSCEV();
3502 auto *NewSCEV =
3503 SCEVParameterRewriter::rewrite(ScevExpr, *PSE.getSE(), RewriteMap);
3504 if (NewSCEV != ScevExpr) {
3505 VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, NewSCEV);
3506 ExpSCEV->replaceAllUsesWith(NewExp);
3507 if (Plan.getTripCount() == ExpSCEV)
3508 Plan.resetTripCount(NewExp);
3509 }
3510 }
3511}
3512
// Tail of VPlanTransforms::dropPoisonGeneratingRecipes (signature start,
// original line 3513, is missing). Walks the backward slice of every address
// feeding a consecutive widened memory access or an interleave group in a
// predicated block, and strips poison-generating flags from the recipes in
// that slice (rewriting disjoint-or into add rather than dropping disjoint).
// NOTE(review): original lines 3518 and 3520 (presumably the Visited set and
// Worklist declarations), 3534 (the first half of the isa<...> prune check),
// 3560-3561 (the underlying-instruction lookup assigned to Instr) and 3579
// (the loop header over the blocks of Iter) are missing from this
// extraction; code lines below are kept byte-identical.
3514 VPlan &Plan,
3515 const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {
3516 // Collect recipes in the backward slice of `Root` that may generate a poison
3517 // value that is used after vectorization.
3519 auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
3521 Worklist.push_back(Root);
3522
3523 // Traverse the backward slice of Root through its use-def chain.
3524 while (!Worklist.empty()) {
3525 VPRecipeBase *CurRec = Worklist.pop_back_val();
3526
3527 if (!Visited.insert(CurRec).second)
3528 continue;
3529
3530 // Prune search if we find another recipe generating a widen memory
3531 // instruction. Widen memory instructions involved in address computation
3532 // will lead to gather/scatter instructions, which don't need to be
3533 // handled.
3535 VPHeaderPHIRecipe>(CurRec))
3536 continue;
3537
3538 // This recipe contributes to the address computation of a widen
3539 // load/store. If the underlying instruction has poison-generating flags,
3540 // drop them directly.
3541 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
3542 VPValue *A, *B;
3543 // Dropping disjoint from an OR may yield incorrect results, as some
3544 // analysis may have converted it to an Add implicitly (e.g. SCEV used
3545 // for dependence analysis). Instead, replace it with an equivalent Add.
3546 // This is possible as all users of the disjoint OR only access lanes
3547 // where the operands are disjoint or poison otherwise.
3548 if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
3549 RecWithFlags->isDisjoint()) {
3550 VPBuilder Builder(RecWithFlags);
3551 VPInstruction *New =
3552 Builder.createAdd(A, B, RecWithFlags->getDebugLoc());
3553 New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
3554 RecWithFlags->replaceAllUsesWith(New);
3555 RecWithFlags->eraseFromParent();
// Continue the slice walk from the replacement add.
3556 CurRec = New;
3557 } else
3558 RecWithFlags->dropPoisonGeneratingFlags();
3559 } else {
3562 (void)Instr;
3563 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
3564 "found instruction with poison generating flags not covered by "
3565 "VPRecipeWithIRFlags");
3566 }
3567
3568 // Add new definitions to the worklist.
3569 for (VPValue *Operand : CurRec->operands())
3570 if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
3571 Worklist.push_back(OpDef);
3572 }
3573 });
3574
3575 // Traverse all the recipes in the VPlan and collect the poison-generating
3576 // recipes in the backward slice starting at the address of a VPWidenRecipe or
3577 // VPInterleaveRecipe.
3578 auto Iter = vp_depth_first_deep(Plan.getEntry());
3580 for (VPRecipeBase &Recipe : *VPBB) {
3581 if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
3582 Instruction &UnderlyingInstr = WidenRec->getIngredient();
3583 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
3584 if (AddrDef && WidenRec->isConsecutive() &&
3585 BlockNeedsPredication(UnderlyingInstr.getParent()))
3586 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3587 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
3588 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
3589 if (AddrDef) {
3590 // Check if any member of the interleave group needs predication.
3591 const InterleaveGroup<Instruction> *InterGroup =
3592 InterleaveRec->getInterleaveGroup();
3593 bool NeedPredication = false;
3594 for (int I = 0, NumMembers = InterGroup->getNumMembers();
3595 I < NumMembers; ++I) {
3596 Instruction *Member = InterGroup->getMember(I);
3597 if (Member)
3598 NeedPredication |= BlockNeedsPredication(Member->getParent());
3599 }
3600
3601 if (NeedPredication)
3602 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3603 }
3604 }
3605 }
3606 }
3607}
3608
// NOTE(review): doc-extraction artifact — the signature line naming this
// function (presumably VPlanTransforms::createInterleaveGroups, per the
// parameter list below) and the declaration of the GEP no-wrap flags `NW`
// (original line 3647) were dropped; code is kept byte-identical. Restore
// the missing lines from upstream VPlanTransforms.cpp before building.
 3610 VPlan &Plan,
 3612 &InterleaveGroups,
 3613 VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed) {
 3614 if (InterleaveGroups.empty())
 3615 return;
 3616
 3617 // Interleave memory: for each Interleave Group we marked earlier as relevant
 3618 // for this VPlan, replace the Recipes widening its memory instructions with a
 3619 // single VPInterleaveRecipe at its insertion point.
 3620 VPDominatorTree VPDT(Plan);
 3621 for (const auto *IG : InterleaveGroups) {
// Member 0's recipe seeds the combined metadata; the intersection below
// keeps only metadata common to every member of the group.
 3622 auto *Start =
 3623 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));
 3624 VPIRMetadata InterleaveMD(*Start);
 3625 SmallVector<VPValue *, 4> StoredValues;
 3626 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Start))
 3627 StoredValues.push_back(StoreR->getStoredValue());
 3628 for (unsigned I = 1; I < IG->getFactor(); ++I) {
 3629 Instruction *MemberI = IG->getMember(I);
 3630 if (!MemberI)
 3631 continue;
 3632 VPWidenMemoryRecipe *MemoryR =
 3633 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(MemberI));
 3634 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemoryR))
 3635 StoredValues.push_back(StoreR->getStoredValue());
 3636 InterleaveMD.intersect(*MemoryR);
 3637 }
 3638
// A gap mask is needed either when the group requires a scalar epilogue
// that is not allowed, or when storing through a group with gaps.
 3639 bool NeedsMaskForGaps =
 3640 (IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed) ||
 3641 (!StoredValues.empty() && !IG->isFull());
 3642
 3643 Instruction *IRInsertPos = IG->getInsertPos();
 3644 auto *InsertPos =
 3645 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IRInsertPos));
 3646
// NOTE(review): `NW` is used below but its declaration (original line 3647,
// presumably GEPNoWrapFlags) was lost in extraction.
 3648 if (auto *Gep = dyn_cast<GetElementPtrInst>(
 3649 getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
 3650 NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();
 3651
 3652 // Get or create the start address for the interleave group.
 3653 VPValue *Addr = Start->getAddr();
 3654 VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
 3655 if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPos)) {
 3656 // We cannot re-use the address of member zero because it does not
 3657 // dominate the insert position. Instead, use the address of the insert
 3658 // position and create a PtrAdd adjusting it to the address of member
 3659 // zero.
 3660 // TODO: Hoist Addr's defining recipe (and any operands as needed) to
 3661 // InsertPos or sink loads above zero members to join it.
 3662 assert(IG->getIndex(IRInsertPos) != 0 &&
 3663 "index of insert position shouldn't be zero");
 3664 auto &DL = IRInsertPos->getDataLayout();
// Offset in bytes from member zero to the insert position; negated below
// to step back from the insert position's address to member zero.
 3665 APInt Offset(32,
 3666 DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
 3667 IG->getIndex(IRInsertPos),
 3668 /*IsSigned=*/true);
 3669 VPValue *OffsetVPV = Plan.getConstantInt(-Offset);
 3670 VPBuilder B(InsertPos);
 3671 Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
 3672 }
 3673 // If the group is reverse, adjust the index to refer to the last vector
 3674 // lane instead of the first. We adjust the index from the first vector
 3675 // lane, rather than directly getting the pointer for lane VF - 1, because
 3676 // the pointer operand of the interleaved access is supposed to be uniform.
 3677 if (IG->isReverse()) {
 3678 auto *ReversePtr = new VPVectorEndPointerRecipe(
 3679 Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
 3680 -(int64_t)IG->getFactor(), NW, InsertPos->getDebugLoc());
 3681 ReversePtr->insertBefore(InsertPos);
 3682 Addr = ReversePtr;
 3683 }
 3684 auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
 3685 InsertPos->getMask(), NeedsMaskForGaps,
 3686 InterleaveMD, InsertPos->getDebugLoc());
 3687 VPIG->insertBefore(InsertPos);
 3688
// Rewire each non-void member's uses to the corresponding result of the
// interleave recipe (J counts results), then erase the per-member recipes.
 3689 unsigned J = 0;
 3690 for (unsigned i = 0; i < IG->getFactor(); ++i)
 3691 if (Instruction *Member = IG->getMember(i)) {
 3692 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
 3693 if (!Member->getType()->isVoidTy()) {
 3694 VPValue *OriginalV = MemberR->getVPSingleValue();
 3695 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
 3696 J++;
 3697 }
 3698 MemberR->eraseFromParent();
 3699 }
 3700 }
 3701}
3702
3703/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
3704/// value, phi and backedge value. In the following example:
3705///
3706/// vector.ph:
3707/// Successor(s): vector loop
3708///
3709/// <x1> vector loop: {
3710/// vector.body:
3711/// WIDEN-INDUCTION %i = phi %start, %step, %vf
3712/// ...
3713/// EMIT branch-on-count ...
3714/// No successors
3715/// }
3716///
3717/// WIDEN-INDUCTION will get expanded to:
3718///
3719/// vector.ph:
3720/// ...
3721/// vp<%induction.start> = ...
3722/// vp<%induction.increment> = ...
3723///
3724/// Successor(s): vector loop
3725///
3726/// <x1> vector loop: {
3727/// vector.body:
3728/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
3729/// ...
3730/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
3731/// EMIT branch-on-count ...
3732/// No successors
3733/// }
// NOTE(review): doc-extraction artifact — the first parameter line of the
// signature (original line 3735, presumably
// `expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR,`)
// and the declarations of `AddOp`/`MulOp` (original lines 3748-3749) were
// dropped; code is kept byte-identical. Restore from upstream before building.
 3734static void
 3736 VPTypeAnalysis &TypeInfo) {
 3737 VPlan *Plan = WidenIVR->getParent()->getPlan();
 3738 VPValue *Start = WidenIVR->getStartValue();
 3739 VPValue *Step = WidenIVR->getStepValue();
 3740 VPValue *VF = WidenIVR->getVFValue();
 3741 DebugLoc DL = WidenIVR->getDebugLoc();
 3742
 3743 // The value from the original loop to which we are mapping the new induction
 3744 // variable.
 3745 Type *Ty = TypeInfo.inferScalarType(WidenIVR);
 3746
 3747 const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
// Select integer vs. floating-point add/mul opcodes based on the induction
// kind; FP inductions use the descriptor's own induction opcode for the add.
 3750 VPIRFlags Flags = *WidenIVR;
 3751 if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
 3752 AddOp = Instruction::Add;
 3753 MulOp = Instruction::Mul;
 3754 } else {
 3755 AddOp = ID.getInductionOpcode();
 3756 MulOp = Instruction::FMul;
 3757 }
 3758
 3759 // If the phi is truncated, truncate the start and step values.
 3760 VPBuilder Builder(Plan->getVectorPreheader());
 3761 Type *StepTy = TypeInfo.inferScalarType(Step);
 3762 if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
 3763 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
 3764 Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL);
 3765 Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL);
 3766 // Truncation doesn't preserve WrapFlags.
 3767 Flags.dropPoisonGeneratingFlags();
 3768 StepTy = Ty;
 3769 }
 3770
 3771 // Construct the initial value of the vector IV in the vector loop preheader.
// NOTE(review): the initializer of IVIntTy (original line 3773) was dropped
// by extraction; it is used as the integer type for the step vector below.
 3772 Type *IVIntTy =
 3774 VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);
 3775 if (StepTy->isFloatingPointTy())
 3776 Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);
 3777
 3778 VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start);
 3779 VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
 3780
// Init = SplatStart + StepVector * SplatStep, i.e. lane i gets
// start + i * step.
 3781 Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
 3782 Init = Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags,
 3783 DebugLoc::getUnknown(), "induction");
 3784
 3785 // Create the widened phi of the vector IV.
 3786 auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), Init,
 3787 WidenIVR->getDebugLoc(), "vec.ind");
 3788 WidePHI->insertBefore(WidenIVR);
 3789
 3790 // Create the backedge value for the vector IV.
 3791 VPValue *Inc;
 3792 VPValue *Prev;
 3793 // If unrolled, use the increment and prev value from the operands.
 3794 if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
 3795 Inc = SplatVF;
 3796 Prev = WidenIVR->getLastUnrolledPartOperand();
 3797 } else {
// Place the increment computation right after VF's defining recipe so it
// dominates all uses.
 3798 if (VPRecipeBase *R = VF->getDefiningRecipe())
 3799 Builder.setInsertPoint(R->getParent(), std::next(R->getIterator()));
 3800 // Multiply the vectorization factor by the step using integer or
 3801 // floating-point arithmetic as appropriate.
 3802 if (StepTy->isFloatingPointTy())
 3803 VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
 3804 DL);
 3805 else
 3806 VF = Builder.createScalarZExtOrTrunc(VF, StepTy,
 3807 TypeInfo.inferScalarType(VF), DL);
 3808
 3809 Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
 3810 Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);
 3811 Prev = WidePHI;
 3812 }
 3813
// NOTE(review): the declaration of ExitingBB (original line 3814) was
// dropped by extraction; presumably the exiting basic block of the vector
// loop region — confirm against upstream.
 3815 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
 3816 auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags,
 3817 WidenIVR->getDebugLoc(), "vec.ind.next");
 3818
 3819 WidePHI->addOperand(Next);
 3820
 3821 WidenIVR->replaceAllUsesWith(WidePHI);
 3822}
3823
3824/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
3825/// initial value, phi and backedge value. In the following example:
3826///
3827/// <x1> vector loop: {
3828/// vector.body:
3829/// EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
3830/// ...
3831/// EMIT branch-on-count ...
3832/// }
3833///
3834/// WIDEN-POINTER-INDUCTION will get expanded to:
3835///
3836/// <x1> vector loop: {
3837/// vector.body:
3838/// EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
3839/// EMIT %mul = mul %stepvector, %step
3840/// EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
3841/// ...
3842/// EMIT %ptr.ind = ptradd %pointer.phi, %vf
3843/// EMIT branch-on-count ...
3844/// }
// NOTE(review): doc-extraction artifact — the signature line (original line
// 3845, presumably `static void expandVPWidenPointerInduction(
// VPWidenPointerInductionRecipe *R,`) was dropped; code is kept
// byte-identical. Restore from upstream before building.
 3846 VPTypeAnalysis &TypeInfo) {
 3847 VPlan *Plan = R->getParent()->getPlan();
 3848 VPValue *Start = R->getStartValue();
 3849 VPValue *Step = R->getStepValue();
 3850 VPValue *VF = R->getVFValue();
 3851
 3852 assert(R->getInductionDescriptor().getKind() ==
 3854 "Not a pointer induction according to InductionDescriptor!");
 3855 assert(TypeInfo.inferScalarType(R)->isPointerTy() && "Unexpected type.");
 3856 assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
 3857 "Recipe should have been replaced");
 3858
 3859 VPBuilder Builder(R);
 3860 DebugLoc DL = R->getDebugLoc();
 3861
 3862 // Build a scalar pointer phi.
 3863 VPPhi *ScalarPtrPhi = Builder.createScalarPhi(Start, DL, "pointer.phi");
 3864
 3865 // Create actual address geps that use the pointer phi as base and a
 3866 // vectorized version of the step value (<step*0, ..., step*N>) as offset.
 3867 Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
 3868 Type *StepTy = TypeInfo.inferScalarType(Step);
 3869 VPValue *Offset = Builder.createNaryOp(VPInstruction::StepVector, {}, StepTy);
 3870 Offset = Builder.createOverflowingOp(Instruction::Mul, {Offset, Step});
 3871 VPValue *PtrAdd =
 3872 Builder.createWidePtrAdd(ScalarPtrPhi, Offset, DL, "vector.gep");
 3873 R->replaceAllUsesWith(PtrAdd);
 3874
 3875 // Create the backedge value for the scalar pointer phi.
// NOTE(review): the declaration of ExitingBB (original line 3876) was
// dropped by extraction — confirm against upstream.
 3877 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
// Advance the scalar phi by VF * step each iteration.
 3878 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, TypeInfo.inferScalarType(VF),
 3879 DL);
 3880 VPValue *Inc = Builder.createOverflowingOp(Instruction::Mul, {Step, VF});
 3881
 3882 VPValue *InductionGEP =
 3883 Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
 3884 ScalarPtrPhi->addOperand(InductionGEP);
 3885}
3886
// NOTE(review): doc-extraction artifact — the function header line (original
// line 3887, presumably `void VPlanTransforms::dissolveLoopRegions(VPlan
// &Plan) {`) and the loop header collecting regions (original line 3890)
// were dropped; code is kept byte-identical.
 3888 // Replace loop regions with explicit CFG.
 3889 SmallVector<VPRegionBlock *> LoopRegions;
 3891 vp_depth_first_deep(Plan.getEntry()))) {
 3892 if (!R->isReplicator())
 3893 LoopRegions.push_back(R);
 3894 }
// Collect first, then dissolve: dissolving mutates the CFG being traversed.
 3895 for (VPRegionBlock *R : LoopRegions)
 3896 R->dissolveToCFGLoop();
 3897}
3898
// NOTE(review): doc-extraction artifact — the function signature (original
// lines 3899-3900), the worklist declaration/traversal header (original
// lines 3903-3904) and the builder call creating the first branch plus the
// interim block's branch (original lines 3940, 3944) were dropped; code is
// kept byte-identical. Restore from upstream before building.
 3901 // The transform runs after dissolving loop regions, so all VPBasicBlocks
 3902 // terminated with BranchOnTwoConds are reached via a shallow traversal.
 3905 if (!VPBB->empty() && match(&VPBB->back(), m_BranchOnTwoConds()))
 3906 WorkList.push_back(cast<VPInstruction>(&VPBB->back()));
 3907 }
 3908
 3909 // Expand BranchOnTwoConds instructions into explicit CFG with two new
 3910 // single-condition branches:
 3911 // 1. A branch that replaces BranchOnTwoConds, jumps to the first successor if
 3912 // the first condition is true, and otherwise jumps to a new interim block.
 3913 // 2. A branch that ends the interim block, jumps to the second successor if
 3914 // the second condition is true, and otherwise jumps to the third
 3915 // successor.
 3916 for (VPInstruction *Br : WorkList) {
 3917 assert(Br->getNumOperands() == 2 &&
 3918 "BranchOnTwoConds must have exactly 2 conditions");
 3919 DebugLoc DL = Br->getDebugLoc();
 3920 VPBasicBlock *BrOnTwoCondsBB = Br->getParent();
// Copy successors before disconnecting: disconnectBlocks mutates the list.
 3921 const auto Successors = to_vector(BrOnTwoCondsBB->getSuccessors());
 3922 assert(Successors.size() == 3 &&
 3923 "BranchOnTwoConds must have exactly 3 successors");
 3924
 3925 for (VPBlockBase *Succ : Successors)
 3926 VPBlockUtils::disconnectBlocks(BrOnTwoCondsBB, Succ);
 3927
 3928 VPValue *Cond0 = Br->getOperand(0);
 3929 VPValue *Cond1 = Br->getOperand(1);
 3930 VPBlockBase *Succ0 = Successors[0];
 3931 VPBlockBase *Succ1 = Successors[1];
 3932 VPBlockBase *Succ2 = Successors[2];
 3933 assert(!Succ0->getParent() && !Succ1->getParent() && !Succ2->getParent() &&
 3934 !BrOnTwoCondsBB->getParent() && "regions must already be dissolved");
 3935
 3936 VPBasicBlock *InterimBB =
 3937 Plan.createVPBasicBlock(BrOnTwoCondsBB->getName() + ".interim");
 3938
// NOTE(review): the chained builder call creating the BranchOnCond on Cond0
// (original line 3940) was dropped by extraction.
 3939 VPBuilder(BrOnTwoCondsBB)
 3941 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, Succ0);
 3942 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, InterimBB);
 3943
// NOTE(review): the builder call creating the interim block's BranchOnCond
// on Cond1 (original line 3944) was dropped by extraction.
 3945 VPBlockUtils::connectBlocks(InterimBB, Succ1);
 3946 VPBlockUtils::connectBlocks(InterimBB, Succ2);
 3947 Br->eraseFromParent();
 3948 }
 3949}
3950
// NOTE(review): doc-extraction artifact — the function signature (original
// line 3951, presumably `void VPlanTransforms::convertToConcreteRecipes(
// VPlan &Plan) {`), the ToRemove declaration and block-traversal header
// (original lines 3953-3954), and a few interior declarations noted below
// were dropped; code is kept byte-identical. Lowers abstract recipes
// (inductions, blends, expressions, LastActiveLane, MaskedCond,
// BranchOnCount, WideIVStep) into concrete, executable recipes.
 3952 VPTypeAnalysis TypeInfo(Plan);
 3955 vp_depth_first_deep(Plan.getEntry()))) {
 3956 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
 3957 if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
 3958 expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
 3959 ToRemove.push_back(WidenIVR);
 3960 continue;
 3961 }
 3962
 3963 if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
 3964 // If the recipe only generates scalars, scalarize it instead of
 3965 // expanding it.
 3966 if (WidenIVR->onlyScalarsGenerated(Plan.hasScalableVF())) {
 3967 VPBuilder Builder(WidenIVR);
 3968 VPValue *PtrAdd =
 3969 scalarizeVPWidenPointerInduction(WidenIVR, Plan, Builder);
 3970 WidenIVR->replaceAllUsesWith(PtrAdd);
 3971 ToRemove.push_back(WidenIVR);
 3972 continue;
 3973 }
 3974 expandVPWidenPointerInduction(WidenIVR, TypeInfo);
 3975 ToRemove.push_back(WidenIVR);
 3976 continue;
 3977 }
 3978
 3979 // Expand VPBlendRecipe into VPInstruction::Select.
 3980 VPBuilder Builder(&R);
 3981 if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
// Fold the incoming values into a chain of selects; later incomings take
// priority over earlier ones via their masks.
 3982 VPValue *Select = Blend->getIncomingValue(0);
 3983 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
 3984 Select = Builder.createSelect(Blend->getMask(I),
 3985 Blend->getIncomingValue(I), Select,
 3986 R.getDebugLoc(), "predphi", *Blend);
 3987 Blend->replaceAllUsesWith(Select);
 3988 ToRemove.push_back(Blend);
 3989 }
 3990
 3991 if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(&R)) {
 3992 if (!VEPR->getOffset()) {
 3993 assert(Plan.getConcreteUF() == 1 &&
 3994 "Expected unroller to have materialized offset for UF != 1");
 3995 VEPR->materializeOffset();
 3996 }
 3997 }
 3998
 3999 if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
 4000 Expr->decompose();
 4001 ToRemove.push_back(Expr);
 4002 }
 4003
 4004 // Expand LastActiveLane into Not + FirstActiveLane + Sub.
 4005 auto *LastActiveL = dyn_cast<VPInstruction>(&R);
 4006 if (LastActiveL &&
 4007 LastActiveL->getOpcode() == VPInstruction::LastActiveLane) {
 4008 // Create Not(Mask) for all operands.
// NOTE(review): the declaration of NotMasks (original line 4009) was
// dropped by extraction.
 4010 for (VPValue *Op : LastActiveL->operands()) {
 4011 VPValue *NotMask = Builder.createNot(Op, LastActiveL->getDebugLoc());
 4012 NotMasks.push_back(NotMask);
 4013 }
 4014
 4015 // Create FirstActiveLane on the inverted masks.
// NOTE(review): the operand line of this createNaryOp call (original line
// 4017, presumably `VPInstruction::FirstActiveLane, NotMasks,`) was
// dropped by extraction.
 4016 VPValue *FirstInactiveLane = Builder.createNaryOp(
 4018 LastActiveL->getDebugLoc(), "first.inactive.lane");
 4019
 4020 // Subtract 1 to get the last active lane.
 4021 VPValue *One = Plan.getConstantInt(64, 1);
 4022 VPValue *LastLane =
 4023 Builder.createSub(FirstInactiveLane, One,
 4024 LastActiveL->getDebugLoc(), "last.active.lane");
 4025
 4026 LastActiveL->replaceAllUsesWith(LastLane);
 4027 ToRemove.push_back(LastActiveL);
 4028 continue;
 4029 }
 4030
 4031 // Lower MaskedCond with block mask to LogicalAnd.
// NOTE(review): the guarding match/if line (original line 4032) was
// dropped by extraction.
 4033 auto *VPI = cast<VPInstruction>(&R);
 4034 assert(VPI->isMasked() &&
 4035 "Unmasked MaskedCond should be simplified earlier");
 4036 VPI->replaceAllUsesWith(Builder.createNaryOp(
 4037 VPInstruction::LogicalAnd, {VPI->getOperand(0), VPI->getMask()}));
 4038 ToRemove.push_back(VPI);
 4039 continue;
 4040 }
 4041
 4042 // Lower BranchOnCount to ICmp + BranchOnCond.
 4043 VPValue *IV, *TC;
 4044 if (match(&R, m_BranchOnCount(m_VPValue(IV), m_VPValue(TC)))) {
 4045 auto *BranchOnCountInst = cast<VPInstruction>(&R);
 4046 DebugLoc DL = BranchOnCountInst->getDebugLoc();
 4047 VPValue *Cond = Builder.createICmp(CmpInst::ICMP_EQ, IV, TC, DL);
 4048 Builder.createNaryOp(VPInstruction::BranchOnCond, Cond, DL);
 4049 ToRemove.push_back(BranchOnCountInst);
 4050 continue;
 4051 }
 4052
 4053 VPValue *VectorStep;
 4054 VPValue *ScalarStep;
// NOTE(review): the match line for WideIVStep (original line 4055) was
// dropped by extraction.
 4056 m_VPValue(VectorStep), m_VPValue(ScalarStep))))
 4057 continue;
 4058
 4059 // Expand WideIVStep.
 4060 auto *VPI = cast<VPInstruction>(&R);
 4061 Type *IVTy = TypeInfo.inferScalarType(VPI);
 4062 if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
// NOTE(review): the line selecting CastOp (original line 4063) was dropped
// by extraction; presumably chooses UIToFP for FP IV types, Trunc otherwise.
 4064 ? Instruction::UIToFP
 4065 : Instruction::Trunc;
 4066 VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
 4067 }
 4068
 4069 assert(!match(ScalarStep, m_One()) && "Expected non-unit scalar-step");
 4070 if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
 4071 ScalarStep =
 4072 Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
 4073 }
 4074
 4075 VPIRFlags Flags;
 4076 unsigned MulOpc;
 4077 if (IVTy->isFloatingPointTy()) {
 4078 MulOpc = Instruction::FMul;
 4079 Flags = VPI->getFastMathFlags();
 4080 } else {
 4081 MulOpc = Instruction::Mul;
 4082 Flags = VPIRFlags::getDefaultFlags(MulOpc);
 4083 }
 4084
 4085 VPInstruction *Mul = Builder.createNaryOp(
 4086 MulOpc, {VectorStep, ScalarStep}, Flags, R.getDebugLoc());
 4087 VectorStep = Mul;
 4088 VPI->replaceAllUsesWith(VectorStep);
 4089 ToRemove.push_back(VPI);
 4090 }
 4091 }
 4092
// Erase after the traversal so iteration above stays valid.
 4093 for (VPRecipeBase *R : ToRemove)
 4094 R->eraseFromParent();
 4095}
4096
// NOTE(review): doc-extraction artifact — the signature line naming this
// function (original line 4097) and a few interior declarations noted below
// were dropped; code is kept byte-identical. From the parameters and body,
// this rewrites uncountable early exits: it collects per-exit conditions,
// combines them for the latch, and builds a dispatch chain of
// vector.early.exit blocks — confirm the exact name against upstream.
 4098 VPBasicBlock *HeaderVPBB,
 4099 VPBasicBlock *LatchVPBB,
 4100 VPBasicBlock *MiddleVPBB) {
 4101 struct EarlyExitInfo {
 4102 VPBasicBlock *EarlyExitingVPBB;
 4103 VPIRBasicBlock *EarlyExitVPBB;
 4104 VPValue *CondToExit;
 4105 };
 4106
 4107 VPDominatorTree VPDT(Plan);
 4108 VPBuilder Builder(LatchVPBB->getTerminator());
// NOTE(review): the declaration of Exits (original line 4109, presumably a
// SmallVector<EarlyExitInfo>) was dropped by extraction.
 4110 for (VPIRBasicBlock *ExitBlock : Plan.getExitBlocks()) {
 4111 for (VPBlockBase *Pred : to_vector(ExitBlock->getPredecessors())) {
// The middle block is the normal (countable) exit path; skip it here.
 4112 if (Pred == MiddleVPBB)
 4113 continue;
 4114 // Collect condition for this early exit.
 4115 auto *EarlyExitingVPBB = cast<VPBasicBlock>(Pred);
 4116 VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
 4117 VPValue *CondOfEarlyExitingVPBB;
 4118 [[maybe_unused]] bool Matched =
 4119 match(EarlyExitingVPBB->getTerminator(),
 4120 m_BranchOnCond(m_VPValue(CondOfEarlyExitingVPBB)));
 4121 assert(Matched && "Terminator must be BranchOnCond");
 4122
 4123 // Insert the MaskedCond in the EarlyExitingVPBB so the predicator adds
 4124 // the correct block mask.
 4125 VPBuilder EarlyExitingBuilder(EarlyExitingVPBB->getTerminator());
// NOTE(review): the opcode line of this createNaryOp call (original line
// 4127, presumably VPInstruction::MaskedCond) was dropped by extraction.
// The condition is negated when the exit is on the false successor.
 4126 auto *CondToEarlyExit = EarlyExitingBuilder.createNaryOp(
 4128 TrueSucc == ExitBlock
 4129 ? CondOfEarlyExitingVPBB
 4130 : EarlyExitingBuilder.createNot(CondOfEarlyExitingVPBB));
 4131 assert((isa<VPIRValue>(CondOfEarlyExitingVPBB) ||
 4132 !VPDT.properlyDominates(EarlyExitingVPBB, LatchVPBB) ||
 4133 VPDT.properlyDominates(
 4134 CondOfEarlyExitingVPBB->getDefiningRecipe()->getParent(),
 4135 LatchVPBB)) &&
 4136 "exit condition must dominate the latch");
 4137 Exits.push_back({
 4138 EarlyExitingVPBB,
 4139 ExitBlock,
 4140 CondToEarlyExit,
 4141 });
 4142 }
 4143 }
 4144
 4145 assert(!Exits.empty() && "must have at least one early exit");
 4146 // Sort exits by RPO order to get correct program order. RPO gives a
 4147 // topological ordering of the CFG, ensuring upstream exits are checked
 4148 // before downstream exits in the dispatch chain.
// NOTE(review): the RPO traversal declaration (original line 4149) and the
// RPOIdx map declaration (original line 4151) were dropped by extraction.
 4150 HeaderVPBB);
 4152 for (const auto &[Num, VPB] : enumerate(RPOT))
 4153 RPOIdx[VPB] = Num;
 4154 llvm::sort(Exits, [&RPOIdx](const EarlyExitInfo &A, const EarlyExitInfo &B) {
 4155 return RPOIdx[A.EarlyExitingVPBB] < RPOIdx[B.EarlyExitingVPBB];
 4156 });
 4157#ifndef NDEBUG
 4158 // After RPO sorting, verify that for any pair where one exit dominates
 4159 // another, the dominating exit comes first. This is guaranteed by RPO
 4160 // (topological order) and is required for the dispatch chain correctness.
 4161 for (unsigned I = 0; I + 1 < Exits.size(); ++I)
 4162 for (unsigned J = I + 1; J < Exits.size(); ++J)
 4163 assert(!VPDT.properlyDominates(Exits[J].EarlyExitingVPBB,
 4164 Exits[I].EarlyExitingVPBB) &&
 4165 "RPO sort must place dominating exits before dominated ones");
 4166#endif
 4167
 4168 // Build the AnyOf condition for the latch terminator using logical OR
 4169 // to avoid poison propagation from later exit conditions when an earlier
 4170 // exit is taken.
 4171 VPValue *Combined = Exits[0].CondToExit;
 4172 for (const EarlyExitInfo &Info : drop_begin(Exits))
 4173 Combined = Builder.createLogicalOr(Combined, Info.CondToExit);
 4174
 4175 VPValue *IsAnyExitTaken =
 4176 Builder.createNaryOp(VPInstruction::AnyOf, {Combined});
 4177
 4178 // Create the vector.early.exit blocks.
 4179 SmallVector<VPBasicBlock *> VectorEarlyExitVPBBs(Exits.size());
 4180 for (unsigned Idx = 0; Idx != Exits.size(); ++Idx) {
 4181 Twine BlockSuffix = Exits.size() == 1 ? "" : Twine(".") + Twine(Idx);
 4182 VPBasicBlock *VectorEarlyExitVPBB =
 4183 Plan.createVPBasicBlock("vector.early.exit" + BlockSuffix);
 4184 VectorEarlyExitVPBBs[Idx] = VectorEarlyExitVPBB;
 4185 }
 4186
 4187 // Create the dispatch block (or reuse the single exit block if only one
 4188 // exit). The dispatch block computes the first active lane of the combined
 4189 // condition and, for multiple exits, chains through conditions to determine
 4190 // which exit to take.
 4191 VPBasicBlock *DispatchVPBB =
 4192 Exits.size() == 1 ? VectorEarlyExitVPBBs[0]
 4193 : Plan.createVPBasicBlock("vector.early.exit.check");
 4194 VPBuilder DispatchBuilder(DispatchVPBB, DispatchVPBB->begin());
 4195 VPValue *FirstActiveLane =
 4196 DispatchBuilder.createNaryOp(VPInstruction::FirstActiveLane, {Combined},
 4197 DebugLoc::getUnknown(), "first.active.lane");
 4198
 4199 // For each early exit, disconnect the original exiting block
 4200 // (early.exiting.I) from the exit block (ir-bb<exit.I>) and route through a
 4201 // new vector.early.exit block. Update ir-bb<exit.I>'s phis to extract their
 4202 // values at the first active lane:
 4203 //
 4204 // Input:
 4205 // early.exiting.I:
 4206 // ...
 4207 // EMIT branch-on-cond vp<%cond.I>
 4208 // Successor(s): in.loop.succ, ir-bb<exit.I>
 4209 //
 4210 // ir-bb<exit.I>:
 4211 // IR %phi = phi [ vp<%incoming.I>, early.exiting.I ], ...
 4212 //
 4213 // Output:
 4214 // early.exiting.I:
 4215 // ...
 4216 // Successor(s): in.loop.succ
 4217 //
 4218 // vector.early.exit.I:
 4219 // EMIT vp<%exit.val> = extract-lane vp<%first.lane>, vp<%incoming.I>
 4220 // Successor(s): ir-bb<exit.I>
 4221 //
 4222 // ir-bb<exit.I>:
 4223 // IR %phi = phi ... (extra operand: vp<%exit.val> from
 4224 // vector.early.exit.I)
 4225 //
 4226 for (auto [Exit, VectorEarlyExitVPBB] :
 4227 zip_equal(Exits, VectorEarlyExitVPBBs)) {
 4228 auto &[EarlyExitingVPBB, EarlyExitVPBB, _] = Exit;
 4229 // Adjust the phi nodes in EarlyExitVPBB.
 4230 // 1. remove incoming values from EarlyExitingVPBB,
 4231 // 2. extract the incoming value at FirstActiveLane
 4232 // 3. add back the extracts as last operands for the phis
 4233 // Then adjust the CFG, removing the edge between EarlyExitingVPBB and
 4234 // EarlyExitVPBB and adding a new edge between VectorEarlyExitVPBB and
 4235 // EarlyExitVPBB. The extracts at FirstActiveLane are now the incoming
 4236 // values from VectorEarlyExitVPBB.
 4237 for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
 4238 auto *ExitIRI = cast<VPIRPhi>(&R);
 4239 VPValue *IncomingVal =
 4240 ExitIRI->getIncomingValueForBlock(EarlyExitingVPBB);
 4241 VPValue *NewIncoming = IncomingVal;
 4242 if (!isa<VPIRValue>(IncomingVal)) {
 4243 VPBuilder EarlyExitBuilder(VectorEarlyExitVPBB);
 4244 NewIncoming = EarlyExitBuilder.createNaryOp(
 4245 VPInstruction::ExtractLane, {FirstActiveLane, IncomingVal},
 4246 DebugLoc::getUnknown(), "early.exit.value");
 4247 }
 4248 ExitIRI->removeIncomingValueFor(EarlyExitingVPBB);
 4249 ExitIRI->addOperand(NewIncoming);
 4250 }
 4251
 4252 EarlyExitingVPBB->getTerminator()->eraseFromParent();
 4253 VPBlockUtils::disconnectBlocks(EarlyExitingVPBB, EarlyExitVPBB);
 4254 VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);
 4255 }
 4256
 4257 // Chain through exits: for each exit, check if its condition is true at
 4258 // the first active lane. If so, take that exit; otherwise, try the next.
 4259 // The last exit needs no check since it must be taken if all others fail.
 4260 //
 4261 // For 3 exits (cond.0, cond.1, cond.2), this creates:
 4262 //
 4263 // latch:
 4264 // ...
 4265 // EMIT vp<%combined> = logical-or vp<%cond.0>, vp<%cond.1>, vp<%cond.2>
 4266 // ...
 4267 //
 4268 // vector.early.exit.check:
 4269 // EMIT vp<%first.lane> = first-active-lane vp<%combined>
 4270 // EMIT vp<%at.cond.0> = extract-lane vp<%first.lane>, vp<%cond.0>
 4271 // EMIT branch-on-cond vp<%at.cond.0>
 4272 // Successor(s): vector.early.exit.0, vector.early.exit.check.0
 4273 //
 4274 // vector.early.exit.check.0:
 4275 // EMIT vp<%at.cond.1> = extract-lane vp<%first.lane>, vp<%cond.1>
 4276 // EMIT branch-on-cond vp<%at.cond.1>
 4277 // Successor(s): vector.early.exit.1, vector.early.exit.2
 4278 VPBasicBlock *CurrentBB = DispatchVPBB;
 4279 for (auto [I, Exit] : enumerate(ArrayRef(Exits).drop_back())) {
 4280 VPValue *LaneVal = DispatchBuilder.createNaryOp(
 4281 VPInstruction::ExtractLane, {FirstActiveLane, Exit.CondToExit},
 4282 DebugLoc::getUnknown(), "exit.cond.at.lane");
 4283
 4284 // For the last dispatch, branch directly to the last exit on false;
 4285 // otherwise, create a new check block.
 4286 bool IsLastDispatch = (I + 2 == Exits.size());
 4287 VPBasicBlock *FalseBB =
 4288 IsLastDispatch ? VectorEarlyExitVPBBs.back()
 4289 : Plan.createVPBasicBlock(
 4290 Twine("vector.early.exit.check.") + Twine(I));
 4291
 4292 DispatchBuilder.createNaryOp(VPInstruction::BranchOnCond, {LaneVal});
 4293 CurrentBB->setSuccessors({VectorEarlyExitVPBBs[I], FalseBB});
 4294 VectorEarlyExitVPBBs[I]->setPredecessors({CurrentBB});
 4295 FalseBB->setPredecessors({CurrentBB});
 4296
 4297 CurrentBB = FalseBB;
 4298 DispatchBuilder.setInsertPoint(CurrentBB);
 4299 }
 4300
 4301 // Replace the latch terminator with the new branching logic.
 4302 auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
 4303 assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
 4304 "Unexpected terminator");
 4305 auto *IsLatchExitTaken =
 4306 Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
 4307 LatchExitingBranch->getOperand(1));
 4308
 4309 DebugLoc LatchDL = LatchExitingBranch->getDebugLoc();
 4310 LatchExitingBranch->eraseFromParent();
 4311 Builder.setInsertPoint(LatchVPBB);
// Successor order matches BranchOnTwoConds semantics: any-exit-taken ->
// dispatch, latch-exit-taken -> middle block, otherwise back to the header.
 4312 Builder.createNaryOp(VPInstruction::BranchOnTwoConds,
 4313 {IsAnyExitTaken, IsLatchExitTaken}, LatchDL);
 4314 LatchVPBB->clearSuccessors();
 4315 LatchVPBB->setSuccessors({DispatchVPBB, MiddleVPBB, HeaderVPBB});
 4316 DispatchVPBB->setPredecessors({LatchVPBB});
 4317}
4318
 4319/// This function tries to convert extended in-loop reductions to
 4320/// VPExpressionRecipe and clamp the \p Range if it is beneficial and
 4321/// valid. The created recipe must be decomposed to its constituent
 4322/// recipes before execution.
// NOTE(review): doc-extraction artifact — the parameter line of the
// signature (original line 4324, presumably
// `tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red,
// VPCostContext &Ctx,`) and several declaration lines inside the lambda
// (original lines 4332, 4335, 4337, 4343-4344) were dropped; code is kept
// byte-identical. Restore from upstream before building.
 4323static VPExpressionRecipe *
 4325 VFRange &Range) {
 4326 Type *RedTy = Ctx.Types.inferScalarType(Red);
 4327 VPValue *VecOp = Red->getVecOp();
 4328
 4329 // Clamp the range if using extended-reduction is profitable.
 4330 auto IsExtendedRedValidAndClampRange =
 4331 [&](unsigned Opcode, Instruction::CastOps ExtOpc, Type *SrcTy) -> bool {
 4333 [&](ElementCount VF) {
 4334 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
 4336
 4338 InstructionCost ExtCost =
 4339 cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
 4340 InstructionCost RedCost = Red->computeCost(VF, Ctx);
 4341
 4342 if (Red->isPartialReduction()) {
 4345 // FIXME: Move partial reduction creation, costing and clamping
 4346 // here from LoopVectorize.cpp.
 4347 ExtRedCost = Ctx.TTI.getPartialReductionCost(
 4348 Opcode, SrcTy, nullptr, RedTy, VF, ExtKind,
 4349 llvm::TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind,
 4350 RedTy->isFloatingPointTy()
 4351 ? std::optional{Red->getFastMathFlags()}
 4352 : std::nullopt);
 4353 } else if (!RedTy->isFloatingPointTy()) {
 4354 // TTI::getExtendedReductionCost only supports integer types.
 4355 ExtRedCost = Ctx.TTI.getExtendedReductionCost(
 4356 Opcode, ExtOpc == Instruction::CastOps::ZExt, RedTy, SrcVecTy,
 4357 Red->getFastMathFlags(), CostKind);
 4358 }
// Fused ext+reduce is chosen only when its cost beats the separate
// extend plus reduction (an invalid cost rejects the fusion).
 4359 return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
 4360 },
 4361 Range);
 4362 };
 4363
 4364 VPValue *A;
 4365 // Match reduce(ext(A)).
 4366 if (isa<VPWidenCastRecipe>(VecOp) &&
 4367 (match(VecOp, m_ZExtOrSExt(m_VPValue(A))) ||
 4368 match(VecOp, m_FPExt(m_VPValue(A)))) &&
 4369 IsExtendedRedValidAndClampRange(
 4370 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
 4371 cast<VPWidenCastRecipe>(VecOp)->getOpcode(),
 4372 Ctx.Types.inferScalarType(A)))
 4373 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
 4374
 4375 return nullptr;
 4376}
4377
/// This function tries to convert extended in-loop reductions to
/// VPExpressionRecipe and clamp the \p Range if it is beneficial
/// and valid. The created VPExpressionRecipe must be decomposed to its
/// constituent recipes before execution. Patterns of the
/// VPExpressionRecipe:
///   reduce.add(mul(...)),
///   reduce.add(mul(ext(A), ext(B))),
///   reduce.add(ext(mul(ext(A), ext(B)))).
///   reduce.fadd(fmul(ext(A), ext(B)))
static VPExpressionRecipe *
// NOTE(review): the function-name/first-parameter line is elided in this
// extraction; the reduction recipe parameter is referred to as `Red` below.
    VPCostContext &Ctx, VFRange &Range) {
  // Only additive reduction kinds (add/sub/fadd) can absorb a multiply into
  // a multiply-accumulate expression.
  unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
  if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
      Opcode != Instruction::FAdd)
    return nullptr;

  Type *RedTy = Ctx.Types.inferScalarType(Red);

  // Clamp the range if using multiply-accumulate-reduction is profitable.
  // Returns true (and clamps Range) when the fused mul-acc cost beats the
  // sum of the separate mul + reduction (+ extend) costs for each VF.
  auto IsMulAccValidAndClampRange =
      // NOTE(review): the lambda's parameter list head is elided here; the
      // visible parameters are Mul, Ext0, Ext1 and OuterExt.
          VPWidenCastRecipe *OuterExt) -> bool {
        [&](ElementCount VF) {
          // SrcTy is the pre-extension element type (RedTy if no extend).
          Type *SrcTy =
              Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy;
          InstructionCost MulAccCost;

          if (Red->isPartialReduction()) {
            Type *SrcTy2 =
                Ext1 ? Ctx.Types.inferScalarType(Ext1->getOperand(0)) : nullptr;
            // FIXME: Move partial reduction creation, costing and clamping
            // here from LoopVectorize.cpp.
            MulAccCost = Ctx.TTI.getPartialReductionCost(
                Opcode, SrcTy, SrcTy2, RedTy, VF,
                // NOTE(review): the extend-kind argument lines are partially
                // elided in this extraction.
                             Ext0->getOpcode())
                             Ext1->getOpcode())
                Mul->getOpcode(), CostKind,
                RedTy->isFloatingPointTy()
                    ? std::optional{Red->getFastMathFlags()}
                    : std::nullopt);
          } else {
            // Only partial reductions support mixed or floating-point extends
            // at the moment.
            if (Ext0 && Ext1 &&
                (Ext0->getOpcode() != Ext1->getOpcode() ||
                 Ext0->getOpcode() == Instruction::CastOps::FPExt))
              return false;

            // With no extend present, treat the input as zero-extended.
            bool IsZExt =
                !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt;
            auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
            MulAccCost = Ctx.TTI.getMulAccReductionCost(IsZExt, Opcode, RedTy,
                                                        SrcVecTy, CostKind);
          }

          // Compare the fused cost against the sum of the unfused recipes.
          InstructionCost MulCost = Mul->computeCost(VF, Ctx);
          InstructionCost RedCost = Red->computeCost(VF, Ctx);
          InstructionCost ExtCost = 0;
          if (Ext0)
            ExtCost += Ext0->computeCost(VF, Ctx);
          if (Ext1)
            ExtCost += Ext1->computeCost(VF, Ctx);
          if (OuterExt)
            ExtCost += OuterExt->computeCost(VF, Ctx);

          return MulAccCost.isValid() &&
                 MulAccCost < ExtCost + MulCost + RedCost;
        },
        Range);
  };

  VPValue *VecOp = Red->getVecOp();
  VPRecipeBase *Sub = nullptr;
  VPValue *A, *B;
  VPValue *Tmp = nullptr;

  // Try to match reduce.fadd(fmul(fpext(...), fpext(...))).
  if (match(VecOp, m_FMul(m_FPExt(m_VPValue()), m_FPExt(m_VPValue())))) {
    assert(Opcode == Instruction::FAdd &&
           "MulAccumulateReduction from an FMul must accumulate into an FAdd "
           "instruction");
    auto *FMul = dyn_cast<VPWidenRecipe>(VecOp);
    if (!FMul)
      return nullptr;

    auto *RecipeA = dyn_cast<VPWidenCastRecipe>(FMul->getOperand(0));
    auto *RecipeB = dyn_cast<VPWidenCastRecipe>(FMul->getOperand(1));

    if (RecipeA && RecipeB &&
        IsMulAccValidAndClampRange(FMul, RecipeA, RecipeB, nullptr)) {
      return new VPExpressionRecipe(RecipeA, RecipeB, FMul, Red);
    }
  }
  // All remaining patterns below are integer-only.
  if (RedTy->isFloatingPointTy())
    return nullptr;

  // Sub reductions could have a sub between the add reduction and vec op.
  if (match(VecOp, m_Sub(m_ZeroInt(), m_VPValue(Tmp)))) {
    Sub = VecOp->getDefiningRecipe();
    VecOp = Tmp;
  }

  // If ValB is a constant and can be safely extended, truncate it to the same
  // type as ExtA's operand, then extend it to the same type as ExtA. This
  // creates two uniform extends that can more easily be matched by the rest of
  // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
  // replaced with the new extend of the constant.
  auto ExtendAndReplaceConstantOp = [&Ctx, &Red](VPWidenCastRecipe *ExtA,
                                                 VPWidenCastRecipe *&ExtB,
                                                 VPValue *&ValB,
                                                 VPWidenRecipe *Mul) {
    if (!ExtA || ExtB || !isa<VPIRValue>(ValB) || Red->isPartialReduction())
      return;
    Type *NarrowTy = Ctx.Types.inferScalarType(ExtA->getOperand(0));
    Instruction::CastOps ExtOpc = ExtA->getOpcode();
    const APInt *Const;
    // NOTE(review): the callee on the line after m_APInt is elided in this
    // extraction (presumably a "can the constant be extended safely" check).
    if (!match(ValB, m_APInt(Const)) ||
            Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
      return;
    // The truncate ensures that the type of each extended operand is the
    // same, and it's been proven that the constant can be extended from
    // NarrowTy safely. Necessary since ExtA's extended operand would be
    // e.g. an i8, while the const will likely be an i32. This will be
    // elided by later optimisations.
    VPBuilder Builder(Mul);
    auto *Trunc =
        Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy);
    Type *WideTy = Ctx.Types.inferScalarType(ExtA);
    ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy);
    Mul->setOperand(1, ExtB);
  };

  // Try to match reduce.add(mul(...)).
  if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
    // NOTE(review): the RecipeA/RecipeB definitions are elided in this
    // extraction (presumably dyn_casts of A and B to VPWidenCastRecipe).
    auto *Mul = cast<VPWidenRecipe>(VecOp);

    // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
    ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);

    // Match reduce.add/sub(mul(ext, ext)).
    if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
        match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
        IsMulAccValidAndClampRange(Mul, RecipeA, RecipeB, nullptr)) {
      if (Sub)
        return new VPExpressionRecipe(RecipeA, RecipeB, Mul,
                                      cast<VPWidenRecipe>(Sub), Red);
      return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
    }
    // TODO: Add an expression type for this variant with a negated mul
    if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
      return new VPExpressionRecipe(Mul, Red);
  }
  // TODO: Add an expression type for negated versions of other expression
  // variants.
  if (Sub)
    return nullptr;

  // Match reduce.add(ext(mul(A, B))).
  if (!Red->isPartialReduction() &&
      match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) {
    auto *Ext = cast<VPWidenCastRecipe>(VecOp);
    auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
    // NOTE(review): the Ext0/Ext1 definitions are elided in this extraction
    // (presumably dyn_casts of Mul's operands to VPWidenCastRecipe).

    // reduce.add(ext(mul(ext, const)))
    // -> reduce.add(ext(mul(ext, ext(const))))
    ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);

    // reduce.add(ext(mul(ext(A), ext(B))))
    // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
    // The inner extends must either have the same opcode as the outer extend or
    // be the same, in which case the multiply can never result in a negative
    // value and the outer extend can be folded away by doing wider
    // extends for the operands of the mul.
    if (Ext0 && Ext1 &&
        (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
        Ext0->getOpcode() == Ext1->getOpcode() &&
        IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
      auto *NewExt0 = new VPWidenCastRecipe(
          Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), nullptr,
          *Ext0, *Ext0, Ext0->getDebugLoc());
      NewExt0->insertBefore(Ext0);

      // Reuse NewExt0 when both mul operands come from the same extend.
      VPWidenCastRecipe *NewExt1 = NewExt0;
      if (Ext0 != Ext1) {
        NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
                                        Ext->getResultType(), nullptr, *Ext1,
                                        *Ext1, Ext1->getDebugLoc());
        NewExt1->insertBefore(Ext1);
      }
      Mul->setOperand(0, NewExt0);
      Mul->setOperand(1, NewExt1);
      Red->setOperand(1, Mul);
      return new VPExpressionRecipe(NewExt0, NewExt1, Mul, Red);
    }
  }
  return nullptr;
}
4587
/// This function tries to create abstract recipes from the reduction recipe for
/// following optimizations and cost estimation.
// NOTE(review): the function-name line is elided in this extraction; the
// reduction recipe parameter is referred to as `Red` below.
                                                VPCostContext &Ctx,
                                                VFRange &Range) {
  VPExpressionRecipe *AbstractR = nullptr;
  // Remember the insertion point (right after Red) before any matching runs.
  auto IP = std::next(Red->getIterator());
  auto *VPBB = Red->getParent();
  // Prefer the mul-accumulate form; fall back to a plain extended reduction.
  if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
    AbstractR = MulAcc;
  else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
    AbstractR = ExtRed;
  // Cannot create abstract inloop reduction recipes.
  if (!AbstractR)
    return;

  // Insert the expression recipe and redirect all of Red's users to it.
  AbstractR->insertBefore(*VPBB, IP);
  Red->replaceAllUsesWith(AbstractR);
}
4607
4618
// NOTE(review): the function signature is elided in this extraction; the body
// materializes explicit Broadcast VPInstructions for values defined outside
// the vector loop that have vector users.
  // Nothing to broadcast if only scalar VFs remain.
  if (Plan.hasScalarVFOnly())
    return;

#ifndef NDEBUG
  // Dominator tree only needed for the assertion below.
  VPDominatorTree VPDT(Plan);
#endif

  // Candidates: live-ins plus everything defined in the entry block.
  SmallVector<VPValue *> VPValues;
  append_range(VPValues, Plan.getLiveIns());
  for (VPRecipeBase &R : *Plan.getEntry())
    append_range(VPValues, R.definedValues());

  auto *VectorPreheader = Plan.getVectorPreheader();
  for (VPValue *VPV : VPValues) {
    // NOTE(review): the first part of this skip-condition is elided in this
    // extraction; the visible part skips constant live-in IR values.
        (isa<VPIRValue>(VPV) && isa<Constant>(VPV->getLiveInIRValue())))
      continue;

    // Add explicit broadcast at the insert point that dominates all users.
    VPBasicBlock *HoistBlock = VectorPreheader;
    VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
    for (VPUser *User : VPV->users()) {
      if (User->usesScalars(VPV))
        continue;
      if (cast<VPRecipeBase>(User)->getParent() == VectorPreheader)
        HoistPoint = HoistBlock->begin();
      else
        assert(VPDT.dominates(VectorPreheader,
                              cast<VPRecipeBase>(User)->getParent()) &&
               "All users must be in the vector preheader or dominated by it");
    }

    VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
    auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
    // Only vector users are redirected; scalar users keep the original value.
    VPV->replaceUsesWithIf(Broadcast,
                           [VPV, Broadcast](VPUser &U, unsigned Idx) {
                             return Broadcast != &U && !U.usesScalars(VPV);
                           });
  }
}
4662
// NOTE(review): the function signature is elided in this extraction; the body
// hoists loop-invariant, single-scalar loads with noalias-scope metadata to
// the vector preheader when metadata proves they cannot alias any store.
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();

  // Collect candidate loads with invariant addresses and noalias scopes
  // metadata and memory-writing recipes with noalias metadata.
  // NOTE(review): the declarations of CandidateLoads/Stores and the head of
  // this loop are elided in this extraction.
      vp_depth_first_shallow(LoopRegion->getEntry()))) {
    for (VPRecipeBase &R : *VPBB) {
      // Only handle single-scalar replicated loads with invariant addresses.
      if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
        if (RepR->isPredicated() || !RepR->isSingleScalar() ||
            RepR->getOpcode() != Instruction::Load)
          continue;

        VPValue *Addr = RepR->getOperand(0);
        if (Addr->isDefinedOutsideLoopRegions()) {
          // NOTE(review): the computation of Loc (memory location of the
          // load) is elided in this extraction.
          if (!Loc.AATags.Scope)
            continue;
          CandidateLoads.push_back({RepR, Loc});
        }
      }
      // Bail out entirely on any writer without usable noalias metadata,
      // since it could alias every candidate load.
      if (R.mayWriteToMemory()) {
        if (!Loc || !Loc->AATags.Scope || !Loc->AATags.NoAlias)
          return;
        Stores.push_back(*Loc);
      }
    }
  }

  VPBasicBlock *Preheader = Plan.getVectorPreheader();
  for (auto &[LoadRecipe, LoadLoc] : CandidateLoads) {
    // Hoist the load to the preheader if it doesn't alias with any stores
    // according to the noalias metadata. Other loads should have been hoisted
    // by other passes.
    const AAMDNodes &LoadAA = LoadLoc.AATags;
    if (all_of(Stores, [&](const MemoryLocation &StoreLoc) {
          // NOTE(review): the scope-disjointness call head is elided here.
                LoadAA.Scope, StoreLoc.AATags.NoAlias);
        })) {
      LoadRecipe->moveBefore(*Preheader, Preheader->getFirstNonPhi());
    }
  }
}
4710
// Collect common metadata from a group of replicate recipes by intersecting
// metadata from all recipes in the group.
// NOTE(review): the function signature line is elided in this extraction;
// `Recipes` is the group of replicate recipes.
  // Start from the first recipe's metadata and drop anything not shared by
  // every other member.
  VPIRMetadata CommonMetadata = *Recipes.front();
  for (VPReplicateRecipe *Recipe : drop_begin(Recipes))
    CommonMetadata.intersect(*Recipe);
  return CommonMetadata;
}
4719
// Groups predicated replicated loads/stores by address SCEV and collects,
// per address, groups whose masks are equal or complementary (M1 == NOT(M2)),
// sorted in dominance order.
template <unsigned Opcode>
// NOTE(review): the return type / name / leading parameter lines are elided
// in this extraction; visible parameters end with `const Loop *L`.
                               const Loop *L) {
  static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
                "Only Load and Store opcodes supported");
  constexpr bool IsLoad = (Opcode == Instruction::Load);
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  VPDominatorTree VPDT(Plan);
  VPTypeAnalysis TypeInfo(Plan);

  // Group predicated operations by their address SCEV.
  // NOTE(review): the declaration of RecipesByAddress is elided in this
  // extraction.
  for (VPBlockBase *Block : vp_depth_first_shallow(LoopRegion->getEntry())) {
    auto *VPBB = cast<VPBasicBlock>(Block);
    for (VPRecipeBase &R : *VPBB) {
      auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
      if (!RepR || RepR->getOpcode() != Opcode || !RepR->isPredicated())
        continue;

      // For loads, operand 0 is address; for stores, operand 1 is address.
      VPValue *Addr = RepR->getOperand(IsLoad ? 0 : 1);
      const SCEV *AddrSCEV = vputils::getSCEVExprForVPValue(Addr, PSE, L);
      if (!isa<SCEVCouldNotCompute>(AddrSCEV))
        RecipesByAddress[AddrSCEV].push_back(RepR);
    }
  }

  // For each address, collect operations with the same or complementary masks.
  // NOTE(review): the declaration of AllGroups is elided in this extraction.
  auto GetLoadStoreValueType = [&](VPReplicateRecipe *Recipe) {
    // Loads: type of the loaded value; stores: type of the stored operand.
    return TypeInfo.inferScalarType(IsLoad ? Recipe : Recipe->getOperand(0));
  };
  for (auto &[Addr, Recipes] : RecipesByAddress) {
    if (Recipes.size() < 2)
      continue;

    // Collect groups with the same or complementary masks. Entries are
    // nulled out as they are consumed into a group.
    for (VPReplicateRecipe *&RecipeI : Recipes) {
      if (!RecipeI)
        continue;

      VPValue *MaskI = RecipeI->getMask();
      Type *TypeI = GetLoadStoreValueType(RecipeI);
      // NOTE(review): the declaration of Group is elided in this extraction.
      Group.push_back(RecipeI);
      RecipeI = nullptr;

      // Find all operations with the same or complementary masks.
      bool HasComplementaryMask = false;
      for (VPReplicateRecipe *&RecipeJ : Recipes) {
        if (!RecipeJ)
          continue;

        VPValue *MaskJ = RecipeJ->getMask();
        Type *TypeJ = GetLoadStoreValueType(RecipeJ);
        if (TypeI == TypeJ) {
          // Check if any operation in the group has a complementary mask with
          // another, that is M1 == NOT(M2) or M2 == NOT(M1).
          HasComplementaryMask |= match(MaskI, m_Not(m_Specific(MaskJ))) ||
                                  match(MaskJ, m_Not(m_Specific(MaskI)));
          Group.push_back(RecipeJ);
          RecipeJ = nullptr;
        }
      }

      if (HasComplementaryMask) {
        assert(Group.size() >= 2 && "must have at least 2 entries");
        // Sort replicates by dominance order, with earliest (most dominating)
        // first.
        sort(Group, [&VPDT](VPReplicateRecipe *A, VPReplicateRecipe *B) {
          return VPDT.properlyDominates(A, B);
        });
        AllGroups.push_back(std::move(Group));
      }
    }
  }

  return AllGroups;
}
4801
// Find the recipe with minimum alignment in the group.
// InstType is the underlying IR instruction type (LoadInst or StoreInst).
template <typename InstType>
static VPReplicateRecipe *
// NOTE(review): the function-name/parameter line is elided in this
// extraction; `Group` is the non-empty group of replicate recipes.
  return *min_element(Group, [](VPReplicateRecipe *A, VPReplicateRecipe *B) {
    // Compare by the alignment of the underlying load/store instructions.
    return cast<InstType>(A->getUnderlyingInstr())->getAlign() <
           cast<InstType>(B->getUnderlyingInstr())->getAlign();
  });
}
4811
// NOTE(review): the function signature is elided in this extraction; the body
// replaces groups of predicated loads from the same address (with
// complementary masks) by a single unpredicated load.
                                             const Loop *L) {
  // NOTE(review): the call collecting the groups is elided in this
  // extraction.
  auto Groups =
  if (Groups.empty())
    return;

  // Process each group of loads.
  for (auto &Group : Groups) {
    // Try to use the earliest (most dominating) load to replace all others.
    VPReplicateRecipe *EarliestLoad = Group[0];
    VPBasicBlock *FirstBB = EarliestLoad->getParent();
    VPBasicBlock *LastBB = Group.back()->getParent();

    // Check that the load doesn't alias with stores between first and last.
    auto LoadLoc = vputils::getMemoryLocation(*EarliestLoad);
    if (!LoadLoc || !canHoistOrSinkWithNoAliasCheck(*LoadLoc, FirstBB, LastBB))
      continue;

    // Collect common metadata from all loads in the group.
    VPIRMetadata CommonMetadata = getCommonMetadata(Group);

    // Find the load with minimum alignment to use. Using the most
    // conservative alignment keeps the merged load correct for all members.
    auto *LoadWithMinAlign = findRecipeWithMinAlign<LoadInst>(Group);

    bool IsSingleScalar = EarliestLoad->isSingleScalar();
    assert(all_of(Group,
                  [IsSingleScalar](VPReplicateRecipe *R) {
                    return R->isSingleScalar() == IsSingleScalar;
                  }) &&
           "all members in group must agree on IsSingleScalar");

    // Create an unpredicated version of the earliest load with common
    // metadata.
    auto *UnpredicatedLoad = new VPReplicateRecipe(
        LoadWithMinAlign->getUnderlyingInstr(), {EarliestLoad->getOperand(0)},
        IsSingleScalar, /*Mask=*/nullptr, *EarliestLoad, CommonMetadata);

    UnpredicatedLoad->insertBefore(EarliestLoad);

    // Replace all loads in the group with the unpredicated load.
    for (VPReplicateRecipe *Load : Group) {
      Load->replaceAllUsesWith(UnpredicatedLoad);
      Load->eraseFromParent();
    }
  }
}
4860
// Returns true if the group of stores in \p StoresToSink can be sunk to the
// position of the last member, based on noalias-scope metadata.
static bool
// NOTE(review): the function-name/first-parameter line is elided in this
// extraction; the group parameter is referred to as `StoresToSink` below.
                               PredicatedScalarEvolution &PSE, const Loop &L,
                               VPTypeAnalysis &TypeInfo) {
  auto StoreLoc = vputils::getMemoryLocation(*StoresToSink.front());
  if (!StoreLoc || !StoreLoc->AATags.Scope)
    return false;

  // When sinking a group of stores, all members of the group alias each other.
  // Skip them during the alias checks.
  SmallPtrSet<VPRecipeBase *, 4> StoresToSinkSet(StoresToSink.begin(),
                                                 StoresToSink.end());

  VPBasicBlock *FirstBB = StoresToSink.front()->getParent();
  VPBasicBlock *LastBB = StoresToSink.back()->getParent();
  SinkStoreInfo SinkInfo(StoresToSinkSet, *StoresToSink[0], PSE, L, TypeInfo);
  return canHoistOrSinkWithNoAliasCheck(*StoreLoc, FirstBB, LastBB, SinkInfo);
}
4879
// NOTE(review): the function signature is elided in this extraction; the body
// merges groups of predicated stores to the same address into one
// unpredicated store of a select-chain of the stored values.
                                           const Loop *L) {
  // NOTE(review): the call collecting the groups is elided in this
  // extraction.
  auto Groups =
  if (Groups.empty())
    return;

  VPTypeAnalysis TypeInfo(Plan);

  for (auto &Group : Groups) {
    if (!canSinkStoreWithNoAliasCheck(Group, PSE, *L, TypeInfo))
      continue;

    // Use the last (most dominated) store's location for the unconditional
    // store.
    VPReplicateRecipe *LastStore = Group.back();
    VPBasicBlock *InsertBB = LastStore->getParent();

    // Collect common alias metadata from all stores in the group.
    VPIRMetadata CommonMetadata = getCommonMetadata(Group);

    // Build select chain for stored values: later (more dominated) stores
    // take precedence when their mask is active.
    VPValue *SelectedValue = Group[0]->getOperand(0);
    VPBuilder Builder(InsertBB, LastStore->getIterator());

    bool IsSingleScalar = Group[0]->isSingleScalar();
    for (unsigned I = 1; I < Group.size(); ++I) {
      assert(IsSingleScalar == Group[I]->isSingleScalar() &&
             "all members in group must agree on IsSingleScalar");
      VPValue *Mask = Group[I]->getMask();
      VPValue *Value = Group[I]->getOperand(0);
      SelectedValue = Builder.createSelect(Mask, Value, SelectedValue,
                                           Group[I]->getDebugLoc());
    }

    // Find the store with minimum alignment to use. The most conservative
    // alignment is correct for all members.
    auto *StoreWithMinAlign = findRecipeWithMinAlign<StoreInst>(Group);

    // Create unconditional store with selected value and common metadata.
    auto *UnpredicatedStore = new VPReplicateRecipe(
        StoreWithMinAlign->getUnderlyingInstr(),
        {SelectedValue, LastStore->getOperand(1)}, IsSingleScalar,
        /*Mask=*/nullptr, *LastStore, CommonMetadata);
    UnpredicatedStore->insertBefore(*InsertBB, LastStore->getIterator());

    // Remove all predicated stores from the group.
    for (VPReplicateRecipe *Store : Group)
      Store->eraseFromParent();
  }
}
4931
    // NOTE(review): the function-name line is elided in this extraction; the
    // function materializes a constant vector trip count when the scalar trip
    // count is a SCEV constant.
    VPlan &Plan, ElementCount BestVF, unsigned BestUF,
  assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
  assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");

  VPValue *TC = Plan.getTripCount();
  // Skip cases for which the trip count may be non-trivial to materialize.
  // I.e., when a scalar tail is absent - due to tail folding, or when a scalar
  // tail is required.
  // NOTE(review): the middle line of this condition is elided in this
  // extraction.
  if (!Plan.hasScalarTail() ||
      Plan.getScalarPreheader() ||
      !isa<VPIRValue>(TC))
    return;

  // Materialize vector trip counts for constants early if it can simply
  // be computed as (Original TC / VF * UF) * VF * UF.
  // TODO: Compute vector trip counts for loops requiring a scalar epilogue and
  // tail-folded loops.
  ScalarEvolution &SE = *PSE.getSE();
  auto *TCScev = SE.getSCEV(TC->getLiveInIRValue());
  if (!isa<SCEVConstant>(TCScev))
    return;
  // VecTC = (TC udiv (VF * UF)) * (VF * UF), i.e. TC rounded down to a
  // multiple of the vector step.
  const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF);
  auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF);
  if (auto *ConstVecTC = dyn_cast<SCEVConstant>(VecTCScev))
    Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
}
4961
                                              // NOTE(review): the
                                              // function-name line and the
                                              // definition of BTC (the
                                              // backedge-taken-count value)
                                              // are elided in this extraction.
                                              VPBasicBlock *VectorPH) {
  // Nothing to materialize if the backedge-taken count is unused.
  if (BTC->getNumUsers() == 0)
    return;

  // Backedge-taken count = trip count - 1, computed in the vector preheader.
  VPBuilder Builder(VectorPH, VectorPH->begin());
  auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
  auto *TCMO =
      Builder.createSub(Plan.getTripCount(), Plan.getConstantInt(TCTy, 1),
                        DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
  BTC->replaceAllUsesWith(TCMO);
}
4975
// NOTE(review): the function signature is elided in this extraction; the body
// materializes explicit BuildVector (pack) and Unpack VPInstructions for
// replicating recipes and their scalar users.
  if (Plan.hasScalarVFOnly())
    return;

  VPTypeAnalysis TypeInfo(Plan);
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  // NOTE(review): the argument to the first blocksOnly call is elided in this
  // extraction.
  auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
  auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
      vp_depth_first_shallow(LoopRegion->getEntry()));
  // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes,
  // VPScalarIVStepsRecipe and VPInstructions, excluding ones in replicate
  // regions. Those are not materialized explicitly yet. Those vector users are
  // still handled in VPReplicateRegion::execute(), via shouldPack().
  // TODO: materialize build vectors for replicating recipes in replicating
  // regions.
  for (VPBasicBlock *VPBB :
       concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
      // NOTE(review): the recipe-kind filter condition is elided in this
      // extraction.
        continue;
      auto *DefR = cast<VPSingleDefRecipe>(&R);
      // A user needs the packed vector if it consumes vector values or lives
      // outside the loop region (e.g. inside a replicate region).
      auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
        VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
        return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
      };
      // NOTE(review): one line of this condition is elided in this
      // extraction.
      if ((isa<VPReplicateRecipe>(DefR) &&
           cast<VPReplicateRecipe>(DefR)->isSingleScalar()) ||
          (isa<VPInstruction>(DefR) &&
           !cast<VPInstruction>(DefR)->doesGeneratePerAllLanes())) ||
          none_of(DefR->users(), UsesVectorOrInsideReplicateRegion))
        continue;

      // Struct results need BuildStructVector; plain results BuildVector.
      // NOTE(review): the opcode-selection operands are elided in this
      // extraction.
      Type *ScalarTy = TypeInfo.inferScalarType(DefR);
      unsigned Opcode = ScalarTy->isStructTy()
      auto *BuildVector = new VPInstruction(Opcode, {DefR});
      BuildVector->insertAfter(DefR);

      DefR->replaceUsesWithIf(
          BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
                           VPUser &U, unsigned) {
            return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
          });
    }
  }

  // Create explicit VPInstructions to convert vectors to scalars. The current
  // implementation is conservative - it may miss some cases that may or may not
  // be vector values. TODO: introduce Unpacks speculatively - remove them later
  // if they are known to operate on scalar values.
  for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
      // NOTE(review): the recipe-kind filter condition is elided in this
      // extraction.
        continue;
      for (VPValue *Def : R.definedValues()) {
        // Skip recipes that are single-scalar or only have their first lane
        // used.
        // TODO: The Defs skipped here may or may not be vector values.
        // Introduce Unpacks, and remove them later, if they are guaranteed to
        // produce scalar values.
        // NOTE(review): the skip condition itself is elided in this
        // extraction.
          continue;

        // At the moment, we create unpacks only for scalar users outside
        // replicate regions. Recipes inside replicate regions still extract the
        // required lanes implicitly.
        // TODO: Remove once replicate regions are unrolled completely.
        auto IsCandidateUnpackUser = [Def](VPUser *U) {
          VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
          return U->usesScalars(Def) &&
                 (!ParentRegion || !ParentRegion->isReplicator());
        };
        if (none_of(Def->users(), IsCandidateUnpackUser))
          continue;

        // Phi results must be unpacked after the phi section, not inline.
        auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
        if (R.isPhi())
          Unpack->insertBefore(*VPBB, VPBB->getFirstNonPhi());
        else
          Unpack->insertAfter(&R);
        Def->replaceUsesWithIf(Unpack,
                               [&IsCandidateUnpackUser](VPUser &U, unsigned) {
                                 return IsCandidateUnpackUser(&U);
                               });
      }
    }
  }
}
5068
5070 VPBasicBlock *VectorPHVPBB,
5071 bool TailByMasking,
5072 bool RequiresScalarEpilogue,
5073 VPValue *Step) {
5074 VPSymbolicValue &VectorTC = Plan.getVectorTripCount();
5075 // There's nothing to do if there are no users of the vector trip count or its
5076 // IR value has already been set.
5077 if (VectorTC.getNumUsers() == 0 || VectorTC.getUnderlyingValue())
5078 return;
5079
5080 VPValue *TC = Plan.getTripCount();
5081 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(TC);
5082 VPBasicBlock::iterator InsertPt = VectorPHVPBB->begin();
5083 if (auto *StepR = Step->getDefiningRecipe()) {
5084 assert(StepR->getParent() == VectorPHVPBB &&
5085 "Step must be defined in VectorPHVPBB");
5086 // Insert after Step's definition to maintain valid def-use ordering.
5087 InsertPt = std::next(StepR->getIterator());
5088 }
5089 VPBuilder Builder(VectorPHVPBB, InsertPt);
5090
5091 // If the tail is to be folded by masking, round the number of iterations N
5092 // up to a multiple of Step instead of rounding down. This is done by first
5093 // adding Step-1 and then rounding down. Note that it's ok if this addition
5094 // overflows: the vector induction variable will eventually wrap to zero given
5095 // that it starts at zero and its Step is a power of two; the loop will then
5096 // exit, with the last early-exit vector comparison also producing all-true.
5097 if (TailByMasking) {
5098 TC = Builder.createAdd(
5099 TC, Builder.createSub(Step, Plan.getConstantInt(TCTy, 1)),
5100 DebugLoc::getCompilerGenerated(), "n.rnd.up");
5101 }
5102
5103 // Now we need to generate the expression for the part of the loop that the
5104 // vectorized body will execute. This is equal to N - (N % Step) if scalar
5105 // iterations are not required for correctness, or N - Step, otherwise. Step
5106 // is equal to the vectorization factor (number of SIMD elements) times the
5107 // unroll factor (number of SIMD instructions).
5108 VPValue *R =
5109 Builder.createNaryOp(Instruction::URem, {TC, Step},
5110 DebugLoc::getCompilerGenerated(), "n.mod.vf");
5111
5112 // There are cases where we *must* run at least one iteration in the remainder
5113 // loop. See the cost model for when this can happen. If the step evenly
5114 // divides the trip count, we set the remainder to be equal to the step. If
5115 // the step does not evenly divide the trip count, no adjustment is necessary
5116 // since there will already be scalar iterations. Note that the minimum
5117 // iterations check ensures that N >= Step.
5118 if (RequiresScalarEpilogue) {
5119 assert(!TailByMasking &&
5120 "requiring scalar epilogue is not supported with fail folding");
5121 VPValue *IsZero =
5122 Builder.createICmp(CmpInst::ICMP_EQ, R, Plan.getZero(TCTy));
5123 R = Builder.createSelect(IsZero, Step, R);
5124 }
5125
5126 VPValue *Res =
5127 Builder.createSub(TC, R, DebugLoc::getCompilerGenerated(), "n.vec");
5128 VectorTC.replaceAllUsesWith(Res);
5129}
5130
                                           // NOTE(review): the function-name
                                           // line is elided in this
                                           // extraction; the function
                                           // materializes the symbolic VF and
                                           // VF*UF values in the vector
                                           // preheader.
                                           ElementCount VFEC) {
  VPBuilder Builder(VectorPH, VectorPH->begin());
  Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
  VPValue &VF = Plan.getVF();
  VPValue &VFxUF = Plan.getVFxUF();
  // Note that after the transform, no further uses of Plan.getVF and
  // Plan.getVFxUF should be added.
  // TODO: Add assertions for this.

  // If there are no users of the runtime VF, compute VFxUF by constant folding
  // the multiplication of VF and UF.
  if (VF.getNumUsers() == 0) {
    VPValue *RuntimeVFxUF =
        Builder.createElementCount(TCTy, VFEC * Plan.getConcreteUF());
    VFxUF.replaceAllUsesWith(RuntimeVFxUF);
    return;
  }

  // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
  // vscale) * UF.
  VPValue *RuntimeVF = Builder.createElementCount(TCTy, VFEC);
  // NOTE(review): the guard around this broadcast (and the head of the
  // replaceUsesWithIf call) are elided in this extraction.
    VPValue *BC = Builder.createNaryOp(VPInstruction::Broadcast, RuntimeVF);
        BC, [&VF](VPUser &U, unsigned) { return !U.usesScalars(&VF); });
  }
  VF.replaceAllUsesWith(RuntimeVF);

  // VFxUF = RuntimeVF * UF, with nuw (first flag) set.
  VPValue *MulByUF = Builder.createOverflowingOp(
      Instruction::Mul,
      {RuntimeVF, Plan.getConstantInt(TCTy, Plan.getConcreteUF())},
      {true, false});
  VFxUF.replaceAllUsesWith(MulByUF);
}
5166
// NOTE(review): the function signature is elided in this extraction; the body
// expands all VPExpandSCEVRecipes in the entry block to IR and records the
// expanded values.
  SCEVExpander Expander(SE, "induction", /*PreserveLCSSA=*/false);

  auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());
  BasicBlock *EntryBB = Entry->getIRBasicBlock();
  DenseMap<const SCEV *, Value *> ExpandedSCEVs;
  for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
    // NOTE(review): the condition skipping non-expand recipes is elided in
    // this extraction.
      continue;
    auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
    if (!ExpSCEV)
      break;
    const SCEV *Expr = ExpSCEV->getSCEV();
    // Expand to IR just before the entry block's terminator.
    Value *Res =
        Expander.expandCodeFor(Expr, Expr->getType(), EntryBB->getTerminator());
    ExpandedSCEVs[ExpSCEV->getSCEV()] = Res;
    VPValue *Exp = Plan.getOrAddLiveIn(Res);
    ExpSCEV->replaceAllUsesWith(Exp);
    // Keep the plan's trip count pointing at the expanded live-in.
    if (Plan.getTripCount() == ExpSCEV)
      Plan.resetTripCount(Exp);
    ExpSCEV->eraseFromParent();
  }
  // NOTE(review): the assert head (condition) is elided in this extraction.
         "VPExpandSCEVRecipes must be at the beginning of the entry block, "
         "before any VPIRInstructions");
  // Add IR instructions in the entry basic block but not in the VPIRBasicBlock
  // to the VPIRBasicBlock.
  auto EI = Entry->begin();
  for (Instruction &I : drop_end(*EntryBB)) {
    if (EI != Entry->end() && isa<VPIRInstruction>(*EI) &&
        &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) {
      EI++;
      continue;
    }
    // NOTE(review): the statement wrapping I into the VPIRBasicBlock is
    // elided in this extraction.
  }

  return ExpandedSCEVs;
}
5207
5208/// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be
5209/// converted to a narrower recipe. \p V is used by a wide recipe that feeds a
5210/// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding
5211/// the same interleave group at index 0. A VPWidenLoadRecipe can be narrowed to
5212/// an index-independent load if it feeds all wide ops at all indices (\p OpV
5213/// must be the operand at index \p OpIdx for both the recipe at lane 0, \p
5214/// WideMember0). A VPInterleaveRecipe can be narrowed to a wide load, if \p V
5215/// is defined at \p Idx of a load interleave group.
5216static bool canNarrowLoad(VPSingleDefRecipe *WideMember0, unsigned OpIdx,
5217 VPValue *OpV, unsigned Idx) {
5218 VPValue *Member0Op = WideMember0->getOperand(OpIdx);
5219 VPRecipeBase *Member0OpR = Member0Op->getDefiningRecipe();
5220 if (!Member0OpR)
5221 return Member0Op == OpV;
5222 if (auto *W = dyn_cast<VPWidenLoadRecipe>(Member0OpR))
5223 return !W->getMask() && W->isConsecutive() && Member0Op == OpV;
5224 if (auto *IR = dyn_cast<VPInterleaveRecipe>(Member0OpR))
5225 return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
5226 return false;
5227}
5228
  // NOTE(review): the function signature (recursive helper taking the list
  // `Ops` of values feeding an interleave group) is elided in this
  // extraction.
  // All members must share the recipe kind/opcode of the member at index 0.
  auto *WideMember0 = dyn_cast<VPSingleDefRecipe>(Ops[0]);
  if (!WideMember0)
    return false;
  for (VPValue *V : Ops) {
    // NOTE(review): the recipe-kind check guarding this early exit is elided
    // in this extraction.
      return false;
    auto *R = cast<VPSingleDefRecipe>(V);
    if (getOpcodeOrIntrinsicID(R) != getOpcodeOrIntrinsicID(WideMember0))
      return false;
  }

  // Check each operand position: either the operands themselves can be
  // narrowed recursively, or each is a narrowable load (see canNarrowLoad).
  for (unsigned Idx = 0; Idx != WideMember0->getNumOperands(); ++Idx) {
    // NOTE(review): the declaration of OpsI (operands at index Idx) is elided
    // in this extraction.
    for (VPValue *Op : Ops)
      OpsI.push_back(Op->getDefiningRecipe()->getOperand(Idx));

    if (canNarrowOps(OpsI))
      continue;

    if (any_of(enumerate(OpsI), [WideMember0, Idx](const auto &P) {
          const auto &[OpIdx, OpV] = P;
          return !canNarrowLoad(WideMember0, Idx, OpV, OpIdx);
        }))
      return false;
  }

  return true;
}
5259
/// Returns VF from \p VFs if \p IR is a full interleave group with factor and
/// number of members both equal to VF. The interleave group must also access
/// the full vector width.
static std::optional<ElementCount> isConsecutiveInterleaveGroup(
    // NOTE(review): the first parameter line (the interleave recipe and the
    // VF list) is elided in this extraction.
    VPTypeAnalysis &TypeInfo, const TargetTransformInfo &TTI) {
  // Masked groups cannot be narrowed.
  if (!InterleaveR || InterleaveR->getMask())
    return std::nullopt;

  // All members (loaded or stored values) must share one element type.
  Type *GroupElementTy = nullptr;
  if (InterleaveR->getStoredValues().empty()) {
    GroupElementTy = TypeInfo.inferScalarType(InterleaveR->getVPValue(0));
    if (!all_of(InterleaveR->definedValues(),
                [&TypeInfo, GroupElementTy](VPValue *Op) {
                  return TypeInfo.inferScalarType(Op) == GroupElementTy;
                }))
      return std::nullopt;
  } else {
    GroupElementTy =
        TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
    if (!all_of(InterleaveR->getStoredValues(),
                [&TypeInfo, GroupElementTy](VPValue *Op) {
                  return TypeInfo.inferScalarType(Op) == GroupElementTy;
                }))
      return std::nullopt;
  }

  auto IG = InterleaveR->getInterleaveGroup();
  if (IG->getFactor() != IG->getNumMembers())
    return std::nullopt;

  auto GetVectorBitWidthForVF = [&TTI](ElementCount VF) {
    // NOTE(review): the register-kind arguments to getRegisterBitWidth are
    // elided in this extraction.
    TypeSize Size = TTI.getRegisterBitWidth(
    assert(Size.isScalable() == VF.isScalable() &&
           "if Size is scalable, VF must be scalable and vice versa");
    return Size.getKnownMinValue();
  };

  // Accept a VF where the group spans exactly one full vector register.
  for (ElementCount VF : VFs) {
    unsigned MinVal = VF.getKnownMinValue();
    unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * MinVal;
    if (IG->getFactor() == MinVal && GroupSize == GetVectorBitWidthForVF(VF))
      return {VF};
  }
  return std::nullopt;
}
5308
5309/// Returns true if \p VPValue is a narrow VPValue.
5310static bool isAlreadyNarrow(VPValue *VPV) {
5311 if (isa<VPIRValue>(VPV))
5312 return true;
5313 auto *RepR = dyn_cast<VPReplicateRecipe>(VPV);
5314 return RepR && RepR->isSingleScalar();
5315}
5316
5317// Convert a wide recipe defining a VPValue \p V feeding an interleave group to
5318// a narrow variant.
5319static VPValue *
5321 auto *R = V->getDefiningRecipe();
5322 if (!R || NarrowedOps.contains(V))
5323 return V;
5324
5325 if (isAlreadyNarrow(V))
5326 return V;
5327
5329 auto *WideMember0 = cast<VPSingleDefRecipe>(R);
5330 for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
5331 WideMember0->setOperand(
5332 Idx,
5333 narrowInterleaveGroupOp(WideMember0->getOperand(Idx), NarrowedOps));
5334 return V;
5335 }
5336
5337 if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
5338 // Narrow interleave group to wide load, as transformed VPlan will only
5339 // process one original iteration.
5340 auto *LI = cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
5341 auto *L = new VPWidenLoadRecipe(
5342 *LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
5343 /*Reverse=*/false, {}, LoadGroup->getDebugLoc());
5344 L->insertBefore(LoadGroup);
5345 NarrowedOps.insert(L);
5346 return L;
5347 }
5348
5349 if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
5350 assert(RepR->isSingleScalar() &&
5351 isa<LoadInst>(RepR->getUnderlyingInstr()) &&
5352 "must be a single scalar load");
5353 NarrowedOps.insert(RepR);
5354 return RepR;
5355 }
5356
5357 auto *WideLoad = cast<VPWidenLoadRecipe>(R);
5358 VPValue *PtrOp = WideLoad->getAddr();
5359 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
5360 PtrOp = VecPtr->getOperand(0);
5361 // Narrow wide load to uniform scalar load, as transformed VPlan will only
5362 // process one original iteration.
5363 auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
5364 /*IsUniform*/ true,
5365 /*Mask*/ nullptr, {}, *WideLoad);
5366 N->insertBefore(WideLoad);
5367 NarrowedOps.insert(N);
5368 return N;
5369}
5370
5371std::unique_ptr<VPlan>
5373 const TargetTransformInfo &TTI) {
5374 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
5375
5376 if (!VectorLoop)
5377 return nullptr;
5378
5379 // Only handle single-block loops for now.
5380 if (VectorLoop->getEntryBasicBlock() != VectorLoop->getExitingBasicBlock())
5381 return nullptr;
5382
5383 // Skip plans when we may not be able to properly narrow.
5384 VPBasicBlock *Exiting = VectorLoop->getExitingBasicBlock();
5385 if (!match(&Exiting->back(), m_BranchOnCount()))
5386 return nullptr;
5387
5388 assert(match(&Exiting->back(),
5390 m_Specific(&Plan.getVectorTripCount()))) &&
5391 "unexpected branch-on-count");
5392
5393 VPTypeAnalysis TypeInfo(Plan);
5395 std::optional<ElementCount> VFToOptimize;
5396 for (auto &R : *VectorLoop->getEntryBasicBlock()) {
5398 continue;
5399
5402 continue;
5403
5404 // Bail out on recipes not supported at the moment:
5405 // * phi recipes other than the canonical induction
5406 // * recipes writing to memory except interleave groups
5407 // Only support plans with a canonical induction phi.
5408 if (R.isPhi())
5409 return nullptr;
5410
5411 auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
5412 if (R.mayWriteToMemory() && !InterleaveR)
5413 return nullptr;
5414
5415 // All other ops are allowed, but we reject uses that cannot be converted
5416 // when checking all allowed consumers (store interleave groups) below.
5417 if (!InterleaveR)
5418 continue;
5419
5420 // Try to find a single VF, where all interleave groups are consecutive and
5421 // saturate the full vector width. If we already have a candidate VF, check
5422 // if it is applicable for the current InterleaveR, otherwise look for a
5423 // suitable VF across the Plan's VFs.
5425 VFToOptimize ? SmallVector<ElementCount>({*VFToOptimize})
5426 : to_vector(Plan.vectorFactors());
5427 std::optional<ElementCount> NarrowedVF =
5428 isConsecutiveInterleaveGroup(InterleaveR, VFs, TypeInfo, TTI);
5429 if (!NarrowedVF || (VFToOptimize && NarrowedVF != VFToOptimize))
5430 return nullptr;
5431 VFToOptimize = NarrowedVF;
5432
5433 // Skip read interleave groups.
5434 if (InterleaveR->getStoredValues().empty())
5435 continue;
5436
5437 // Narrow interleave groups, if all operands are already matching narrow
5438 // ops.
5439 auto *Member0 = InterleaveR->getStoredValues()[0];
5440 if (isAlreadyNarrow(Member0) &&
5441 all_of(InterleaveR->getStoredValues(), equal_to(Member0))) {
5442 StoreGroups.push_back(InterleaveR);
5443 continue;
5444 }
5445
5446 // For now, we only support full interleave groups storing load interleave
5447 // groups.
5448 if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
5449 VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
5450 if (!DefR)
5451 return false;
5452 auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
5453 return IR && IR->getInterleaveGroup()->isFull() &&
5454 IR->getVPValue(Op.index()) == Op.value();
5455 })) {
5456 StoreGroups.push_back(InterleaveR);
5457 continue;
5458 }
5459
5460 // Check if all values feeding InterleaveR are matching wide recipes, which
5461 // operands that can be narrowed.
5462 if (!canNarrowOps(InterleaveR->getStoredValues()))
5463 return nullptr;
5464 StoreGroups.push_back(InterleaveR);
5465 }
5466
5467 if (StoreGroups.empty())
5468 return nullptr;
5469
5470 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
5471 bool RequiresScalarEpilogue =
5472 MiddleVPBB->getNumSuccessors() == 1 &&
5473 MiddleVPBB->getSingleSuccessor() == Plan.getScalarPreheader();
5474 // Bail out for tail-folding (middle block with a single successor to exit).
5475 if (MiddleVPBB->getNumSuccessors() != 2 && !RequiresScalarEpilogue)
5476 return nullptr;
5477
5478 // All interleave groups in Plan can be narrowed for VFToOptimize. Split the
5479 // original Plan into 2: a) a new clone which contains all VFs of Plan, except
5480 // VFToOptimize, and b) the original Plan with VFToOptimize as single VF.
5481 // TODO: Handle cases where only some interleave groups can be narrowed.
5482 std::unique_ptr<VPlan> NewPlan;
5483 if (size(Plan.vectorFactors()) != 1) {
5484 NewPlan = std::unique_ptr<VPlan>(Plan.duplicate());
5485 Plan.setVF(*VFToOptimize);
5486 NewPlan->removeVF(*VFToOptimize);
5487 }
5488
5489 // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
5490 SmallPtrSet<VPValue *, 4> NarrowedOps;
5491 // Narrow operation tree rooted at store groups.
5492 for (auto *StoreGroup : StoreGroups) {
5493 VPValue *Res =
5494 narrowInterleaveGroupOp(StoreGroup->getStoredValues()[0], NarrowedOps);
5495 auto *SI =
5496 cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
5497 auto *S = new VPWidenStoreRecipe(
5498 *SI, StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
5499 /*Reverse=*/false, {}, StoreGroup->getDebugLoc());
5500 S->insertBefore(StoreGroup);
5501 StoreGroup->eraseFromParent();
5502 }
5503
5504 // Adjust induction to reflect that the transformed plan only processes one
5505 // original iteration.
5506 auto *CanIV = VectorLoop->getCanonicalIV();
5507 auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
5508 VPBasicBlock *VectorPH = Plan.getVectorPreheader();
5509 VPBuilder PHBuilder(VectorPH, VectorPH->begin());
5510
5511 VPValue *UF = &Plan.getUF();
5512 VPValue *Step;
5513 if (VFToOptimize->isScalable()) {
5514 VPValue *VScale = PHBuilder.createElementCount(
5516 Step = PHBuilder.createOverflowingOp(Instruction::Mul, {VScale, UF},
5517 {true, false});
5518 Plan.getVF().replaceAllUsesWith(VScale);
5519 } else {
5520 Step = UF;
5522 Plan.getConstantInt(CanIV->getScalarType(), 1));
5523 }
5524 // Materialize vector trip count with the narrowed step.
5525 materializeVectorTripCount(Plan, VectorPH, /*TailByMasking=*/false,
5526 RequiresScalarEpilogue, Step);
5527
5528 Inc->setOperand(1, Step);
5529 Plan.getVFxUF().replaceAllUsesWith(Step);
5530
5531 removeDeadRecipes(Plan);
5532 assert(none_of(*VectorLoop->getEntryBasicBlock(),
5534 "All VPVectorPointerRecipes should have been removed");
5535 return NewPlan;
5536}
5537
5538/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
5539/// BranchOnCond recipe.
5541 VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
5542 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
5543 auto *MiddleTerm =
5545 // Only add branch metadata if there is a (conditional) terminator.
5546 if (!MiddleTerm)
5547 return;
5548
5549 assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
5550 "must have a BranchOnCond");
5551 // Assume that `TripCount % VectorStep ` is equally distributed.
5552 unsigned VectorStep = Plan.getConcreteUF() * VF.getKnownMinValue();
5553 if (VF.isScalable() && VScaleForTuning.has_value())
5554 VectorStep *= *VScaleForTuning;
5555 assert(VectorStep > 0 && "trip count should not be zero");
5556 MDBuilder MDB(Plan.getContext());
5557 MDNode *BranchWeights =
5558 MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
5559 MiddleTerm->setMetadata(LLVMContext::MD_prof, BranchWeights);
5560}
5561
5563 VFRange &Range) {
5564 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
5565 auto *MiddleVPBB = Plan.getMiddleBlock();
5566 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
5567
5568 auto IsScalableOne = [](ElementCount VF) -> bool {
5569 return VF == ElementCount::getScalable(1);
5570 };
5571
5572 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
5573 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
5574 if (!FOR)
5575 continue;
5576
5577 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
5578 "Cannot handle loops with uncountable early exits");
5579
5580 // This is the second phase of vectorizing first-order recurrences, creating
5581 // extract for users outside the loop. An overview of the transformation is
5582 // described below. Suppose we have the following loop with some use after
5583 // the loop of the last a[i-1],
5584 //
5585 // for (int i = 0; i < n; ++i) {
5586 // t = a[i - 1];
5587 // b[i] = a[i] - t;
5588 // }
5589 // use t;
5590 //
5591 // There is a first-order recurrence on "a". For this loop, the shorthand
5592 // scalar IR looks like:
5593 //
5594 // scalar.ph:
5595 // s.init = a[-1]
5596 // br scalar.body
5597 //
5598 // scalar.body:
5599 // i = phi [0, scalar.ph], [i+1, scalar.body]
5600 // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
5601 // s2 = a[i]
5602 // b[i] = s2 - s1
5603 // br cond, scalar.body, exit.block
5604 //
5605 // exit.block:
5606 // use = lcssa.phi [s1, scalar.body]
5607 //
5608 // In this example, s1 is a recurrence because it's value depends on the
5609 // previous iteration. In the first phase of vectorization, we created a
5610 // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
5611 // for users in the scalar preheader and exit block.
5612 //
5613 // vector.ph:
5614 // v_init = vector(..., ..., ..., a[-1])
5615 // br vector.body
5616 //
5617 // vector.body
5618 // i = phi [0, vector.ph], [i+4, vector.body]
5619 // v1 = phi [v_init, vector.ph], [v2, vector.body]
5620 // v2 = a[i, i+1, i+2, i+3]
5621 // b[i] = v2 - v1
5622 // // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
5623 // b[i, i+1, i+2, i+3] = v2 - v1
5624 // br cond, vector.body, middle.block
5625 //
5626 // middle.block:
5627 // vector.recur.extract.for.phi = v2(2)
5628 // vector.recur.extract = v2(3)
5629 // br cond, scalar.ph, exit.block
5630 //
5631 // scalar.ph:
5632 // scalar.recur.init = phi [vector.recur.extract, middle.block],
5633 // [s.init, otherwise]
5634 // br scalar.body
5635 //
5636 // scalar.body:
5637 // i = phi [0, scalar.ph], [i+1, scalar.body]
5638 // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
5639 // s2 = a[i]
5640 // b[i] = s2 - s1
5641 // br cond, scalar.body, exit.block
5642 //
5643 // exit.block:
5644 // lo = lcssa.phi [s1, scalar.body],
5645 // [vector.recur.extract.for.phi, middle.block]
5646 //
5647 // Now update VPIRInstructions modeling LCSSA phis in the exit block.
5648 // Extract the penultimate value of the recurrence and use it as operand for
5649 // the VPIRInstruction modeling the phi.
5651 make_range(MiddleVPBB->getFirstNonPhi(), MiddleVPBB->end()))) {
5653 continue;
5654
5655 // For VF vscale x 1, if vscale = 1, we are unable to extract the
5656 // penultimate value of the recurrence. Instead we rely on the existing
5657 // extract of the last element from the result of
5658 // VPInstruction::FirstOrderRecurrenceSplice.
5659 // TODO: Consider vscale_range info and UF.
5661 Range))
5662 return;
5663 VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
5664 VPInstruction::ExtractPenultimateElement, FOR->getBackedgeValue(), {},
5665 "vector.recur.extract.for.phi");
5666 for (VPUser *U : to_vector(cast<VPInstruction>(&R)->users())) {
5667 auto *ExitPhi = dyn_cast<VPIRPhi>(U);
5668 if (!ExitPhi)
5669 continue;
5670 ExitPhi->replaceUsesOfWith(cast<VPInstruction>(&R), PenultimateElement);
5671 }
5672 }
5673 }
5674}
5675
                                        Loop &L) {
  // Rewrites conditional-select reduction phis into RecurKind::FindIV min/max
  // reductions: the selected IV values are reduced with SMax/UMax (positive
  // step) or SMin/UMin (negative step). A sentinel value outside the IV's
  // range detects "condition never true"; if no sentinel exists, an auxiliary
  // boolean AnyOf reduction tracks whether the condition fired.
  ScalarEvolution &SE = *PSE.getSE();
  VPRegionBlock *VectorLoopRegion = Plan.getVectorLoopRegion();

  // Helper lambda to check if the IV range excludes the sentinel value.
  // Returns the sentinel if it is provably never taken by the IV.
  auto CheckSentinel = [&SE](const SCEV *IVSCEV, bool UseMax,
                             bool Signed) -> std::optional<APInt> {
    unsigned BW = IVSCEV->getType()->getScalarSizeInBits();
    APInt Sentinel =
        UseMax

    ConstantRange IVRange =
        Signed ? SE.getSignedRange(IVSCEV) : SE.getUnsignedRange(IVSCEV);
    if (!IVRange.contains(Sentinel))
      return Sentinel;
    return std::nullopt;
  };

  VPValue *HeaderMask = vputils::findHeaderMask(Plan);
  for (VPRecipeBase &Phi :
       make_early_inc_range(VectorLoopRegion->getEntryBasicBlock()->phis())) {
    auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&Phi);
        PhiR->getRecurrenceKind()))
      continue;

    // Only integer IV reductions are handled; pointers and floats are skipped.
    Type *PhiTy = VPTypeAnalysis(Plan).inferScalarType(PhiR);
    if (PhiTy->isPointerTy() || PhiTy->isFloatingPointTy())
      continue;

    // If there's a header mask, the backedge select will not be the find-last
    // select.
    VPValue *BackedgeVal = PhiR->getBackedgeValue();
    VPValue *CondSelect = BackedgeVal;
    if (HeaderMask &&
        !match(BackedgeVal, m_Select(m_Specific(HeaderMask),
                                     m_VPValue(CondSelect), m_Specific(PhiR))))
      llvm_unreachable("expected header mask select");

    // Get the IV from the conditional select of the reduction phi.
    // The conditional select should be a select between the phi and the IV.
    VPValue *Cond, *TrueVal, *FalseVal;
    if (!match(CondSelect, m_Select(m_VPValue(Cond), m_VPValue(TrueVal),
                                    m_VPValue(FalseVal))))
      continue;

    // The non-phi operand of the select is the IV.
    assert(is_contained(CondSelect->getDefiningRecipe()->operands(), PhiR));
    VPValue *IV = TrueVal == PhiR ? FalseVal : TrueVal;

    // The IV must be an affine add-recurrence in the enclosing loop.
    const SCEV *IVSCEV = vputils::getSCEVExprForVPValue(IV, PSE, &L);
    const SCEV *Step;
    if (!match(IVSCEV, m_scev_AffineAddRec(m_SCEV(), m_SCEV(Step))))
      continue;

    // Determine direction from SCEV step.
    if (!SE.isKnownNonZero(Step))
      continue;

    // Positive step means we need UMax/SMax to find the last IV value, and
    // UMin/SMin otherwise.
    bool UseMax = SE.isKnownPositive(Step);
    bool UseSigned = true;
    std::optional<APInt> SentinelVal =
        CheckSentinel(IVSCEV, UseMax, /*IsSigned=*/true);
    if (!SentinelVal) {
      SentinelVal = CheckSentinel(IVSCEV, UseMax, /*IsSigned=*/false);
      UseSigned = false;
    }

    // If no sentinel was found, fall back to a boolean AnyOf reduction to track
    // if the condition was ever true. Requires the IV to not wrap, otherwise we
    // cannot use min/max.
    if (!SentinelVal) {
      auto *AR = cast<SCEVAddRecExpr>(IVSCEV);
      if (AR->hasNoSignedWrap())
        UseSigned = true;
      else if (AR->hasNoUnsignedWrap())
        UseSigned = false;
      else
        continue;
    }

        BackedgeVal,

    RecurKind MinMaxKind =
        UseMax ? (UseSigned ? RecurKind::SMax : RecurKind::UMax)
               : (UseSigned ? RecurKind::SMin : RecurKind::UMin);
    VPIRFlags Flags(MinMaxKind, /*IsOrdered=*/false, /*IsInLoop=*/false,
                    FastMathFlags());
    DebugLoc ExitDL = RdxResult->getDebugLoc();
    VPBuilder MiddleBuilder(RdxResult);
    VPValue *ReducedIV =
        RdxResult->getOperand(0), Flags, ExitDL);

    VPValue *NewRdxResult;
    VPValue *StartVPV = PhiR->getStartValue();
    if (SentinelVal) {
      // Sentinel-based approach: reduce IVs with min/max, compare against
      // sentinel to detect if condition was ever true, select accordingly.
      VPValue *Sentinel = Plan.getConstantInt(*SentinelVal);
      auto *Cmp = MiddleBuilder.createICmp(CmpInst::ICMP_NE, ReducedIV,
                                           Sentinel, ExitDL);
      NewRdxResult =
          MiddleBuilder.createSelect(Cmp, ReducedIV, StartVPV, ExitDL);
      StartVPV = Sentinel;
    } else {
      // Introduce a boolean AnyOf reduction to track if the condition was ever
      // true in the loop. Use it to select the initial start value, if it was
      // never true.
      auto *AnyOfPhi = new VPReductionPHIRecipe(
          /*Phi=*/nullptr, RecurKind::Or, *Plan.getFalse(), *Plan.getFalse(),
          RdxUnordered{1}, {}, /*HasUsesOutsideReductionChain=*/false);
      AnyOfPhi->insertAfter(PhiR);

      // Invert the condition when the select keeps the phi on 'true', so the
      // AnyOf phi records whenever the IV value is selected.
      VPBuilder LoopBuilder(BackedgeVal->getDefiningRecipe());
      VPValue *AnyOfCond = Cond;
      if (TrueVal == PhiR)
        AnyOfCond = LoopBuilder.createNot(Cond);
      VPValue *OrVal = LoopBuilder.createOr(AnyOfPhi, AnyOfCond);
      AnyOfPhi->setOperand(1, OrVal);

      NewRdxResult =
          {StartVPV, ReducedIV, OrVal}, {}, ExitDL);

      // Initialize the IV reduction phi with the neutral element, not the
      // original start value, to ensure correct min/max reduction results.
      StartVPV = Plan.getOrAddLiveIn(
          getRecurrenceIdentity(MinMaxKind, IVSCEV->getType(), {}));
    }
    RdxResult->replaceAllUsesWith(NewRdxResult);
    RdxResult->eraseFromParent();

    // Replace the original reduction phi with a FindIV reduction phi feeding
    // off the same in-loop select.
    auto *NewPhiR = new VPReductionPHIRecipe(
        cast<PHINode>(PhiR->getUnderlyingInstr()), RecurKind::FindIV, *StartVPV,
        *CondSelect, RdxUnordered{1}, {}, PhiR->hasUsesOutsideReductionChain());
    NewPhiR->insertBefore(PhiR);
    PhiR->replaceAllUsesWith(NewPhiR);
    PhiR->eraseFromParent();
  }
}
5825
5826namespace {
5827
/// A chain of recipes that form a partial reduction. Matches either
/// reduction_bin_op (extend (A), accumulator), or
/// reduction_bin_op (bin_op (extend (A), (extend (B))), accumulator).
struct VPPartialReductionChain {
  /// The top-level binary operation that forms the reduction to a scalar
  /// after the loop body.
  VPWidenRecipe *ReductionBinOp;
  /// The extension of each of the inner binary operation's operands.
  VPWidenCastRecipe *ExtendA;
  /// May be null if the second operand is a constant or there is no inner
  /// binary operation (see isValidPartialReduction).
  VPWidenCastRecipe *ExtendB;
  /// The user of the extends that is then reduced.
  VPWidenRecipe *BinOp;
  /// Ratio of the reduction PHI type size to the extend's input type size,
  /// i.e. the factor by which the accumulator VF is scaled down.
  unsigned ScaleFactor;
  /// The recurrence kind for the entire partial reduction chain.
  /// This allows distinguishing between Sub and AddWithSub recurrences,
  /// when the ReductionBinOp is a Instruction::Sub.
  RecurKind RK;
};
5846
5847static VPSingleDefRecipe *
5848optimizeExtendsForPartialReduction(VPSingleDefRecipe *BinOp,
5849 VPTypeAnalysis &TypeInfo) {
5850 // reduce.add(mul(ext(A), C))
5851 // -> reduce.add(mul(ext(A), ext(trunc(C))))
5852 const APInt *Const;
5853 if (match(BinOp, m_Mul(m_ZExtOrSExt(m_VPValue()), m_APInt(Const)))) {
5854 auto *ExtA = cast<VPWidenCastRecipe>(BinOp->getOperand(0));
5855 Instruction::CastOps ExtOpc = ExtA->getOpcode();
5856 Type *NarrowTy = TypeInfo.inferScalarType(ExtA->getOperand(0));
5857 if (!BinOp->hasOneUse() ||
5859 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
5860 return BinOp;
5861
5862 VPBuilder Builder(BinOp);
5863 auto *Trunc = Builder.createWidenCast(Instruction::CastOps::Trunc,
5864 BinOp->getOperand(1), NarrowTy);
5865 Type *WideTy = TypeInfo.inferScalarType(ExtA);
5866 BinOp->setOperand(1, Builder.createWidenCast(ExtOpc, Trunc, WideTy));
5867 return BinOp;
5868 }
5869
5870 // reduce.add(ext(mul(ext(A), ext(B))))
5871 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
5873 m_ZExtOrSExt(m_VPValue()))))) {
5874 auto *Ext = cast<VPWidenCastRecipe>(BinOp);
5875 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
5876 auto *MulLHS = cast<VPWidenCastRecipe>(Mul->getOperand(0));
5877 auto *MulRHS = cast<VPWidenCastRecipe>(Mul->getOperand(1));
5878 if (!Mul->hasOneUse() ||
5879 (Ext->getOpcode() != MulLHS->getOpcode() && MulLHS != MulRHS) ||
5880 MulLHS->getOpcode() != MulRHS->getOpcode())
5881 return BinOp;
5882 VPBuilder Builder(Mul);
5883 Mul->setOperand(0, Builder.createWidenCast(MulLHS->getOpcode(),
5884 MulLHS->getOperand(0),
5885 Ext->getResultType()));
5886 Mul->setOperand(1, MulLHS == MulRHS
5887 ? Mul->getOperand(0)
5888 : Builder.createWidenCast(MulRHS->getOpcode(),
5889 MulRHS->getOperand(0),
5890 Ext->getResultType()));
5891 return Mul;
5892 }
5893
5894 return BinOp;
5895}
5896
// Helper to transform a partial reduction chain into a partial reduction
// recipe. Assumes profitability has been checked.
// \p Chain describes the reduction bin-op, its extends and scale factor;
// \p RdxPhi is the reduction PHI the chain feeds.
static void transformToPartialReduction(const VPPartialReductionChain &Chain,
                                        VPTypeAnalysis &TypeInfo, VPlan &Plan,
                                        VPReductionPHIRecipe *RdxPhi) {
  VPWidenRecipe *WidenRecipe = Chain.ReductionBinOp;
  assert(WidenRecipe->getNumOperands() == 2 && "Expected binary operation");

  VPValue *BinOpVal = WidenRecipe->getOperand(0);
  VPValue *Accumulator = WidenRecipe->getOperand(1);

  // Swap if needed to ensure Accumulator is the PHI or partial reduction.
      isa<VPExpressionRecipe>(BinOpVal))
    std::swap(BinOpVal, Accumulator);
  auto *BinOp = cast<VPSingleDefRecipe>(BinOpVal->getDefiningRecipe());

  // Sub-reductions can be implemented in two ways:
  // (1) negate the operand in the vector loop (the default way).
  // (2) subtract the reduced value from the init value in the middle block.
  // Both ways keep the reduction itself as an 'add' reduction.
  //
  // The ISD nodes for partial reductions don't support folding the
  // sub/negation into its operands because the following is not a valid
  // transformation:
  //   sub(0, mul(ext(a), ext(b)))
  //     -> mul(ext(a), ext(sub(0, b)))
  //
  // It's therefore better to choose option (2) such that the partial
  // reduction is always positive (starting at '0') and to do a final
  // subtract in the middle block.
  if (WidenRecipe->getOpcode() == Instruction::Sub &&
      Chain.RK != RecurKind::Sub) {
    // Negate the operand in-loop (option (1)); only for non-Sub recurrence
    // kinds, Sub chains use the middle-block subtract below.
    VPBuilder Builder(WidenRecipe);
    Type *ElemTy = TypeInfo.inferScalarType(BinOp);
    auto *Zero = Plan.getZero(ElemTy);
    VPIRFlags Flags = WidenRecipe->getUnderlyingInstr()
                          ? VPIRFlags(*WidenRecipe->getUnderlyingInstr())
                          : VPIRFlags();
    auto *NegRecipe = new VPWidenRecipe(Instruction::Sub, {Zero, BinOp}, Flags,
    Builder.insert(NegRecipe);
    BinOp = NegRecipe;
  }

  // FIXME: Do these transforms before invoking the cost-model.
  BinOp = optimizeExtendsForPartialReduction(BinOp, TypeInfo);

  // Check if WidenRecipe is the final result of the reduction. If so look
  // through selects for predicated reductions.
  VPValue *Cond = nullptr;
      WidenRecipe,
      m_Select(m_VPValue(Cond), m_Specific(WidenRecipe), m_Specific(RdxPhi))));
  bool IsLastInChain = RdxPhi->getBackedgeValue() == WidenRecipe ||
                       RdxPhi->getBackedgeValue() == ExitValue;
  assert((!ExitValue || IsLastInChain) &&
         "if we found ExitValue, it must match RdxPhi's backedge value");

  // Create the partial reduction recipe, scaled by the chain's VF scale
  // factor, and replace the original widened bin-op.
  Type *PhiType = TypeInfo.inferScalarType(RdxPhi);
  RecurKind RdxKind =
  auto *PartialRed = new VPReductionRecipe(
      RdxKind,
      RdxKind == RecurKind::FAdd ? WidenRecipe->getFastMathFlags()
                                 : FastMathFlags(),
      WidenRecipe->getUnderlyingInstr(), Accumulator, BinOp, Cond,
      RdxUnordered{/*VFScaleFactor=*/Chain.ScaleFactor});
  PartialRed->insertBefore(WidenRecipe);

  if (Cond)
    ExitValue->replaceAllUsesWith(PartialRed);
  WidenRecipe->replaceAllUsesWith(PartialRed);

  // We only need to update the PHI node once, which is when we find the
  // last reduction in the chain.
  if (!IsLastInChain)
    return;

  // Scale the PHI and ReductionStartVector by the VFScaleFactor
  assert(RdxPhi->getVFScaleFactor() == 1 && "scale factor must not be set");
  RdxPhi->setVFScaleFactor(Chain.ScaleFactor);

  auto *StartInst = cast<VPInstruction>(RdxPhi->getStartValue());
  assert(StartInst->getOpcode() == VPInstruction::ReductionStartVector);
  auto *NewScaleFactor = Plan.getConstantInt(32, Chain.ScaleFactor);
  StartInst->setOperand(2, NewScaleFactor);

  // If this is the last value in a sub-reduction chain, then update the PHI
  // node to start at `0` and update the reduction-result to subtract from
  // the PHI's start value.
  if (Chain.RK != RecurKind::Sub)
    return;

  VPValue *OldStartValue = StartInst->getOperand(0);
  StartInst->setOperand(0, StartInst->getOperand(1));

  // Replace reduction_result by 'sub (startval, reductionresult)'.
  assert(RdxResult && "Could not find reduction result");

  VPBuilder Builder = VPBuilder::getToInsertAfter(RdxResult);
  constexpr unsigned SubOpc = Instruction::BinaryOps::Sub;
  VPInstruction *NewResult = Builder.createNaryOp(
      SubOpc, {OldStartValue, RdxResult}, VPIRFlags::getDefaultFlags(SubOpc),
      RdxPhi->getDebugLoc());
  // Redirect all users of the old result to the subtract, except the subtract
  // itself (it consumes the old result as an operand).
  RdxResult->replaceUsesWithIf(
      NewResult,
      [&NewResult](VPUser &U, unsigned Idx) { return &U != NewResult; });
}
6007
6008/// Check if a partial reduction chain is is supported by the target (i.e. does
6009/// not have an invalid cost) for the given VF range. Clamps the range and
6010/// returns true if profitable for any VF.
6011static bool isValidPartialReduction(const VPPartialReductionChain &Chain,
6012 Type *PhiType, VPCostContext &CostCtx,
6013 VFRange &Range) {
6014 auto GetExtInfo = [&CostCtx](VPWidenCastRecipe *Ext)
6015 -> std::pair<Type *, TargetTransformInfo::PartialReductionExtendKind> {
6016 if (!Ext)
6017 return {nullptr, TargetTransformInfo::PR_None};
6018 Type *ExtOpType = CostCtx.Types.inferScalarType(Ext->getOperand(0));
6020 static_cast<Instruction::CastOps>(Ext->getOpcode()));
6021 return {ExtOpType, ExtKind};
6022 };
6023 auto ExtInfoA = GetExtInfo(Chain.ExtendA);
6024 auto ExtInfoB = GetExtInfo(Chain.ExtendB);
6025 Type *ExtOpTypeA = ExtInfoA.first;
6026 Type *ExtOpTypeB = ExtInfoB.first;
6027 auto ExtKindA = ExtInfoA.second;
6028 auto ExtKindB = ExtInfoB.second;
6029
6030 // If ExtendB is nullptr but there's a separate BinOp, the second operand
6031 // was a constant that can use the same extend kind as the first.
6032 if (!Chain.ExtendB && Chain.BinOp && Chain.BinOp != Chain.ReductionBinOp) {
6033 const APInt *Const = nullptr;
6034 for (VPValue *Op : Chain.BinOp->operands()) {
6035 if (match(Op, m_APInt(Const)))
6036 break;
6037 }
6038 if (!Const || !canConstantBeExtended(Const, ExtOpTypeA, ExtKindA))
6039 return false;
6040 ExtOpTypeB = ExtOpTypeA;
6041 ExtKindB = ExtKindA;
6042 }
6043
6044 std::optional<unsigned> BinOpc =
6045 (Chain.BinOp && Chain.BinOp != Chain.ReductionBinOp)
6046 ? std::make_optional(Chain.BinOp->getOpcode())
6047 : std::nullopt;
6048 VPWidenRecipe *WidenRecipe = Chain.ReductionBinOp;
6050 [&](ElementCount VF) {
6051 return CostCtx.TTI
6053 WidenRecipe->getOpcode(), ExtOpTypeA, ExtOpTypeB, PhiType, VF,
6054 ExtKindA, ExtKindB, BinOpc, CostCtx.CostKind,
6055 PhiType->isFloatingPointTy()
6056 ? std::optional{WidenRecipe->getFastMathFlags()}
6057 : std::nullopt)
6058 .isValid();
6059 },
6060 Range);
6061}
6062
6063/// Examines reduction operations to see if the target can use a cheaper
6064/// operation with a wider per-iteration input VF and narrower PHI VF.
6065/// Recursively calls itself to identify chained scaled reductions.
6066/// Returns true if this invocation added an entry to Chains, otherwise false.
6067static bool
6068getScaledReductions(VPReductionPHIRecipe *RedPhiR, VPValue *PrevValue,
6070 VPCostContext &CostCtx, VFRange &Range) {
6071 auto *UpdateR = dyn_cast<VPWidenRecipe>(PrevValue);
6072 if (!UpdateR || !Instruction::isBinaryOp(UpdateR->getOpcode()))
6073 return false;
6074
6075 VPValue *Op = UpdateR->getOperand(0);
6076 VPValue *PhiOp = UpdateR->getOperand(1);
6077 if (Op == RedPhiR)
6078 std::swap(Op, PhiOp);
6079
6080 // If Op is an extend, then it's still a valid partial reduction if the
6081 // extended mul fulfills the other requirements.
6082 // For example, reduce.add(ext(mul(ext(A), ext(B)))) is still a valid partial
6083 // reduction since the inner extends will be widened. We already have oneUse
6084 // checks on the inner extends so widening them is safe.
6085 std::optional<TTI::PartialReductionExtendKind> OuterExtKind;
6088 auto *CastRecipe = dyn_cast<VPWidenCastRecipe>(Op);
6089 if (!CastRecipe)
6090 return false;
6091 auto CastOp = static_cast<Instruction::CastOps>(CastRecipe->getOpcode());
6092 OuterExtKind = TTI::getPartialReductionExtendKind(CastOp);
6093 Op = CastRecipe->getOperand(0);
6094 }
6095
6096 // Try and get a scaled reduction from the first non-phi operand.
6097 // If one is found, we use the discovered reduction instruction in
6098 // place of the accumulator for costing.
6099 if (getScaledReductions(RedPhiR, Op, Chains, CostCtx, Range)) {
6100 Op = UpdateR->getOperand(0);
6101 PhiOp = UpdateR->getOperand(1);
6102 if (Op == Chains.rbegin()->ReductionBinOp)
6103 std::swap(Op, PhiOp);
6104 assert(PhiOp == Chains.rbegin()->ReductionBinOp &&
6105 "PhiOp must be the chain value");
6106 assert(CostCtx.Types.inferScalarType(RedPhiR) ==
6107 CostCtx.Types.inferScalarType(PhiOp) &&
6108 "Unexpected type for chain values");
6109 } else if (RedPhiR != PhiOp) {
6110 // If neither operand of this instruction is the reduction PHI node or a
6111 // link in the reduction chain, then this is just an operand to the chain
6112 // and not a link in the chain itself.
6113 return false;
6114 }
6115
6116 // If the update is a binary op, check both of its operands to see if
6117 // they are extends. Otherwise, see if the update comes directly from an
6118 // extend.
6119 VPWidenCastRecipe *CastRecipes[2] = {nullptr};
6120
6121 // Match extends and populate CastRecipes. Returns false if matching fails.
6122 auto MatchExtends = [OuterExtKind,
6123 &CastRecipes](ArrayRef<VPValue *> Operands) {
6124 assert(Operands.size() <= 2 && "expected at most 2 operands");
6125
6126 for (const auto &[I, OpVal] : enumerate(Operands)) {
6127 // Allow constant as second operand - validation happens in
6128 // isValidPartialReduction.
6129 const APInt *Unused;
6130 if (I > 0 && CastRecipes[0] && match(OpVal, m_APInt(Unused)))
6131 continue;
6132
6133 VPValue *ExtInput;
6134 if (!match(OpVal, m_ZExtOrSExt(m_VPValue(ExtInput))) &&
6135 !match(OpVal, m_FPExt(m_VPValue(ExtInput))))
6136 return false;
6137
6138 CastRecipes[I] = dyn_cast<VPWidenCastRecipe>(OpVal);
6139 if (!CastRecipes[I])
6140 return false;
6141
6142 // The outer extend kind must match the inner extends for folding.
6143 if (OuterExtKind) {
6144 auto CastOp =
6145 static_cast<Instruction::CastOps>(CastRecipes[I]->getOpcode());
6146 if (*OuterExtKind != TTI::getPartialReductionExtendKind(CastOp))
6147 return false;
6148 }
6149 }
6150 return CastRecipes[0] != nullptr;
6151 };
6152
6153 // If Op is a binary operator, check both of its operands to see if they are
6154 // extends. Otherwise, see if the update comes directly from an extend.
6155 auto *BinOp = dyn_cast<VPWidenRecipe>(Op);
6156 if (BinOp && Instruction::isBinaryOp(BinOp->getOpcode())) {
6157 if (!BinOp->hasOneUse())
6158 return false;
6159
6160 // Handle neg(binop(ext, ext)) pattern.
6161 VPValue *OtherOp = nullptr;
6162 if (match(BinOp, m_Sub(m_ZeroInt(), m_VPValue(OtherOp))))
6163 BinOp = dyn_cast<VPWidenRecipe>(OtherOp);
6164
6165 if (!BinOp || !Instruction::isBinaryOp(BinOp->getOpcode()) ||
6166 !MatchExtends(BinOp->operands()))
6167 return false;
6168 } else if (match(UpdateR, m_Add(m_VPValue(), m_VPValue())) ||
6169 match(UpdateR, m_FAdd(m_VPValue(), m_VPValue()))) {
6170 // We already know the operands for Update are Op and PhiOp.
6171 if (!MatchExtends({Op}))
6172 return false;
6173 BinOp = UpdateR;
6174 } else {
6175 return false;
6176 }
6177
6178 Type *PhiType = CostCtx.Types.inferScalarType(RedPhiR);
6179 TypeSize PHISize = PhiType->getPrimitiveSizeInBits();
6180 Type *ExtOpType =
6181 CostCtx.Types.inferScalarType(CastRecipes[0]->getOperand(0));
6182 TypeSize ASize = ExtOpType->getPrimitiveSizeInBits();
6183 if (!PHISize.hasKnownScalarFactor(ASize))
6184 return false;
6185
6186 RecurKind RK = cast<VPReductionPHIRecipe>(RedPhiR)->getRecurrenceKind();
6187 VPPartialReductionChain Chain(
6188 {UpdateR, CastRecipes[0], CastRecipes[1], BinOp,
6189 static_cast<unsigned>(PHISize.getKnownScalarFactor(ASize)), RK});
6190 if (!isValidPartialReduction(Chain, PhiType, CostCtx, Range))
6191 return false;
6192
6193 Chains.push_back(Chain);
6194 return true;
6195}
6196} // namespace
6197
6199 VPCostContext &CostCtx,
6200 VFRange &Range) {
6201 // Find all possible valid partial reductions, grouping chains by their PHI.
6202 // This grouping allows invalidating the whole chain, if any link is not a
6203 // valid partial reduction.
6205 ChainsByPhi;
6206 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
6207 for (VPRecipeBase &R : HeaderVPBB->phis()) {
6208 auto *RedPhiR = dyn_cast<VPReductionPHIRecipe>(&R);
6209 if (!RedPhiR)
6210 continue;
6211
6212 // Get the backedge value from the reduction PHI and find the
6213 // ComputeReductionResult that uses it (directly or through a select for
6214 // predicated reductions).
6215 if (auto *RdxResult = vputils::findComputeReductionResult(RedPhiR)) {
6216 VPValue *ExitValue = RdxResult->getOperand(0);
6217 match(ExitValue,
6218 m_Select(m_VPValue(), m_VPValue(ExitValue), m_VPValue()));
6219 getScaledReductions(RedPhiR, ExitValue, ChainsByPhi[RedPhiR], CostCtx,
6220 Range);
6221 }
6222 }
6223
6224 if (ChainsByPhi.empty())
6225 return;
6226
6227 // Build set of partial reduction operations for extend user validation and
6228 // a map of reduction bin ops to their scale factors for scale validation.
6229 SmallPtrSet<VPRecipeBase *, 4> PartialReductionOps;
6230 DenseMap<VPSingleDefRecipe *, unsigned> ScaledReductionMap;
6231 for (const auto &[_, Chains] : ChainsByPhi)
6232 for (const VPPartialReductionChain &Chain : Chains) {
6233 PartialReductionOps.insert(Chain.BinOp);
6234 ScaledReductionMap[Chain.ReductionBinOp] = Chain.ScaleFactor;
6235 }
6236
6237 // A partial reduction is invalid if any of its extends are used by
6238 // something that isn't another partial reduction. This is because the
6239 // extends are intended to be lowered along with the reduction itself.
6240 auto ExtendUsersValid = [&](VPWidenCastRecipe *Ext) {
6241 return !Ext || all_of(Ext->users(), [&](VPUser *U) {
6242 return PartialReductionOps.contains(cast<VPRecipeBase>(U));
6243 });
6244 };
6245
6246 // Validate chains: check that extends are only used by partial reductions,
6247 // and that reduction bin ops are only used by other partial reductions with
6248 // matching scale factors, are outside the loop region or the select
6249 // introduced by tail-folding. Otherwise we would create users of scaled
6250 // reductions where the types of the other operands don't match.
6251 for (auto &[RedPhiR, Chains] : ChainsByPhi) {
6252 for (const VPPartialReductionChain &Chain : Chains) {
6253 if (!ExtendUsersValid(Chain.ExtendA) ||
6254 !ExtendUsersValid(Chain.ExtendB)) {
6255 Chains.clear();
6256 break;
6257 }
6258 auto UseIsValid = [&, RedPhiR = RedPhiR](VPUser *U) {
6259 if (auto *PhiR = dyn_cast<VPReductionPHIRecipe>(U))
6260 return PhiR == RedPhiR;
6261 auto *R = cast<VPSingleDefRecipe>(U);
6262 return Chain.ScaleFactor == ScaledReductionMap.lookup_or(R, 0) ||
6264 m_Specific(Chain.ReductionBinOp))) ||
6265 match(R, m_Select(m_VPValue(), m_Specific(Chain.ReductionBinOp),
6266 m_Specific(RedPhiR)));
6267 };
6268 if (!all_of(Chain.ReductionBinOp->users(), UseIsValid)) {
6269 Chains.clear();
6270 break;
6271 }
6272
6273 // Check if the compute-reduction-result is used by a sunk store.
6274 // TODO: Also form partial reductions in those cases.
6275 if (auto *RdxResult = vputils::findComputeReductionResult(RedPhiR)) {
6276 if (any_of(RdxResult->users(), [](VPUser *U) {
6277 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
6278 return RepR && isa<StoreInst>(RepR->getUnderlyingInstr());
6279 })) {
6280 Chains.clear();
6281 break;
6282 }
6283 }
6284 }
6285 }
6286
6287 for (auto &[Phi, Chains] : ChainsByPhi)
6288 for (const VPPartialReductionChain &Chain : Chains)
6289 transformToPartialReduction(Chain, CostCtx.Types, Plan, Phi);
6290}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU Register Bank Select
This file implements a class to represent arbitrary precision integral constant values and operations...
ReachingDefInfo InstSet & ToRemove
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
static const Function * getParent(const Value *V)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static bool isSentinel(const DWARFDebugNames::AttributeEncoding &AE)
@ Default
Hexagon Common GEP
#define _
iv Induction Variable Users
Definition IVUsers.cpp:48
iv users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
licm
Definition LICM.cpp:383
Legalize the Machine IR a function s Machine IR
Definition Legalizer.cpp:81
static bool mergeBlocksIntoPredecessors(Loop &L, DominatorTree &DT, LoopInfo &LI, MemorySSAUpdater *MSSAU, ScalarEvolution &SE)
#define I(x, y, z)
Definition MD5.cpp:57
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
This file provides utility analysis objects describing memory locations.
This file contains the declarations for metadata subclasses.
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
#define P(N)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
const SmallVectorImpl< MachineOperand > & Cond
This file contains some templates that are useful if you are working with the STL at all.
This is the interface for a metadata-based scoped no-alias analysis.
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file implements dominator tree analysis for a single level of a VPlan's H-CFG.
This file contains the declarations of different VPlan-related auxiliary helpers.
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectComplementaryPredicatedMemOps(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
static void removeCommonBlendMask(VPBlendRecipe *Blend)
Try to see if all of Blend's masks share a common value logically and'ed and remove it from the masks...
static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to create abstract recipes from the reduction recipe for following optimizations ...
static VPReplicateRecipe * findRecipeWithMinAlign(ArrayRef< VPReplicateRecipe * > Group)
static bool sinkScalarOperands(VPlan &Plan)
static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R)
Return true if we do not know how to (mechanically) hoist or sink R out of a loop region.
static bool canNarrowOps(ArrayRef< VPValue * > Ops)
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Try to simplify the branch condition of Plan.
static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo)
Try to simplify VPSingleDefRecipe Def.
static void removeRedundantInductionCasts(VPlan &Plan)
Remove redundant casts of inductions.
static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Return true if Cond is known to be true for given BestVF and BestUF.
static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, unsigned UF)
Try to replace multiple active lane masks used for control flow with a single, wide active lane mask ...
static std::optional< std::pair< bool, unsigned > > getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R)
Get any instruction opcode or intrinsic ID data embedded in recipe R.
static VPExpressionRecipe * tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static RemoveMask_match< Op0_t, Op1_t > m_RemoveMask(const Op0_t &In, Op1_t &Out)
Match a specific mask In, or a combination of it (logical-and In, Out).
static VPIRMetadata getCommonMetadata(ArrayRef< VPReplicateRecipe * > Recipes)
static VPValue * getPredicatedMask(VPRegionBlock *R)
If R is a region with a VPBranchOnMaskRecipe in the entry block, return the mask.
static bool sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Sink users of FOR after the recipe defining the previous value Previous of the recurrence.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan)
static VPScalarIVStepsRecipe * createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, Instruction::BinaryOps InductionOpcode, FPMathOperator *FPBinOp, Instruction *TruncI, VPIRValue *StartV, VPValue *Step, DebugLoc DL, VPBuilder &Builder)
static bool canNarrowLoad(VPSingleDefRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx)
Returns true if V is VPWidenLoadRecipe or VPInterleaveRecipe that can be converted to a narrower reci...
static VPWidenInductionRecipe * getOptimizableIVOf(VPValue *VPV, PredicatedScalarEvolution &PSE)
Check if VPV is an untruncated wide induction, either before or after the increment.
static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL)
After replacing the canonical IV with a EVL-based IV, fixup recipes that use VF to use the EVL instea...
static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R, VPTypeAnalysis &TypeInfo)
Expand a VPWidenPointerInductionRecipe into executable recipes, for the initial value,...
static std::optional< ElementCount > isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, ArrayRef< ElementCount > VFs, VPTypeAnalysis &TypeInfo, const TargetTransformInfo &TTI)
Returns VF from VFs if IR is a full interleave group with factor and number of members both equal to ...
static bool isDeadRecipe(VPRecipeBase &R)
Returns true if R is dead and can be removed.
static void legalizeAndOptimizeInductions(VPlan &Plan)
Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd (IndStart, ScalarIVSteps (0,...
static void addReplicateRegions(VPlan &Plan)
static VPIRValue * tryToFoldLiveIns(VPSingleDefRecipe &R, ArrayRef< VPValue * > Operands, const DataLayout &DL, VPTypeAnalysis &TypeInfo)
Try to fold R using InstSimplifyFolder.
static VPValue * tryToComputeEndValueForInduction(VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC)
Compute the end value for WideIV, unless it is truncated.
static void removeRedundantExpandSCEVRecipes(VPlan &Plan)
Remove redundant EpxandSCEVRecipes in Plan's entry block by replacing them with already existing reci...
static bool simplifyKnownEVL(VPlan &Plan, ElementCount VF, PredicatedScalarEvolution &PSE)
From the definition of llvm.experimental.get.vector.length, VPInstruction::ExplicitVectorLength(AVL) ...
static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Try to hoist Previous and its operands before all users of FOR.
static VPValue * scalarizeVPWidenPointerInduction(VPWidenPointerInductionRecipe *PtrIV, VPlan &Plan, VPBuilder &Builder)
Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd (IndStart,...
static SmallVector< VPUser * > collectUsersRecursively(VPValue *V)
static VPValue * optimizeEarlyExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the early exit block.
static void recursivelyDeleteDeadRecipes(VPValue *V)
static void reassociateHeaderMask(VPlan &Plan)
Reassociate (headermask && x) && y -> headermask && (x && y) to allow the header mask to be simplifie...
static bool canSinkStoreWithNoAliasCheck(ArrayRef< VPReplicateRecipe * > StoresToSink, PredicatedScalarEvolution &PSE, const Loop &L, VPTypeAnalysis &TypeInfo)
static VPActiveLaneMaskPHIRecipe * addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan)
static VPRegionBlock * createReplicateRegion(VPReplicateRecipe *PredRecipe, VPlan &Plan)
static VPBasicBlock * getPredicatedThenBlock(VPRegionBlock *R)
If R is a triangle region, return the 'then' block of the triangle.
static VPValue * narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl< VPValue * > &NarrowedOps)
static bool canHoistOrSinkWithNoAliasCheck(const MemoryLocation &MemLoc, VPBasicBlock *FirstBB, VPBasicBlock *LastBB, std::optional< SinkStoreInfo > SinkInfo={})
Check if a memory operation doesn't alias with memory operations in blocks between FirstBB and LastBB...
static void simplifyBlends(VPlan &Plan)
Normalize and simplify VPBlendRecipes.
static VPRecipeBase * optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, VPTypeAnalysis &TypeInfo, VPValue &EVL)
Try to optimize a CurRecipe masked by HeaderMask to a corresponding EVL-based recipe without the head...
static bool isAlreadyNarrow(VPValue *VPV)
Returns true if VPValue is a narrow VPValue.
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF)
Optimize the width of vector induction variables in Plan based on a known constant Trip Count,...
static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static void expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR, VPTypeAnalysis &TypeInfo)
Expand a VPWidenIntOrFpInduction into executable recipes, for the initial value, phi and backedge val...
static void removeRedundantCanonicalIVs(VPlan &Plan)
Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV recipe, if it exists.
static void narrowToSingleScalarRecipes(VPlan &Plan)
static VPValue * optimizeLatchExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, DenseMap< VPValue *, VPValue * > &EndValues, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the exit block coming from the l...
This file provides utility VPlan to VPlan transformations.
#define RUN_VPLAN_PASS(PASS,...)
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const X86InstrFMA3Group Groups[]
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Helper for extra no-alias checks via known-safe recipe and SCEV.
SinkStoreInfo(const SmallPtrSetImpl< VPRecipeBase * > &ExcludeRecipes, VPReplicateRecipe &GroupLeader, PredicatedScalarEvolution &PSE, const Loop &L, VPTypeAnalysis &TypeInfo)
bool shouldSkip(VPRecipeBase &R) const
Return true if R should be skipped during alias checking, either because it's in the exclude set or b...
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1023
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1527
static APInt getMaxValue(unsigned numBits)
Gets maximum unsigned value of APInt for specific bit width.
Definition APInt.h:207
APInt abs() const
Get the absolute value.
Definition APInt.h:1810
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1503
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition APInt.h:210
static APInt getMinValue(unsigned numBits)
Gets minimum unsigned value of APInt for a specific bit width.
Definition APInt.h:217
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition APInt.h:220
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:996
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228
@ NoAlias
The two locations do not alias at all.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & back() const
back - Get the last element.
Definition ArrayRef.h:151
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
iterator end() const
Definition ArrayRef.h:131
iterator begin() const
Definition ArrayRef.h:130
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this basic block belongs to.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
This class represents a function call, abstracting a target machine's calling convention.
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
This class represents a range of values.
LLVM_ABI bool contains(const APInt &Val) const
Return true if the specified value is in the set.
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
A debug info location.
Definition DebugLoc.h:123
static DebugLoc getCompilerGenerated()
Definition DebugLoc.h:162
static DebugLoc getUnknown()
Definition DebugLoc.h:161
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
ValueT lookup_or(const_arg_type_t< KeyT > Val, U &&Default) const
Definition DenseMap.h:215
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:200
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
Represents flags for the getelementptr instruction/expression.
GEPNoWrapFlags withoutNoUnsignedWrap() const
static GEPNoWrapFlags none()
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
A struct for saving information about induction variables.
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
bool isBinaryOp() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:318
The group of interleaved loads/stores sharing the same stride and close to each other.
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
uint32_t getNumMembers() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition VPlan.cpp:1574
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight, bool IsExpected=false)
Return metadata containing two branch weights.
Definition MDBuilder.cpp:38
Metadata node.
Definition Metadata.h:1080
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
bool empty() const
Definition MapVector.h:77
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:108
Representation for a specific memory location.
AAMDNodes AATags
The metadata nodes which describes the aliasing of the location (each member is null if that kind of ...
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
static bool isFindLastRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
RegionT * getParent() const
Get the parent of the Region.
Definition RegionInfo.h:362
This class uses information about analyze scalars to rewrite expressions in canonical form.
LLVM_ABI Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
static const SCEV * rewrite(const SCEV *Scev, ScalarEvolution &SE, ValueToSCEVMapTy &Map)
This class represents an analyzed expression in the program.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
const DataLayout & getDataLayout() const
Return the DataLayout associated with the module this SCEV instance is operating on.
LLVM_ABI const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
LLVM_ABI bool isKnownNonZero(const SCEV *S)
Test if the given expression is known to be non-zero.
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
ConstantRange getSignedRange(const SCEV *S)
Determine the signed range for a particular SCEV.
LLVM_ABI bool isKnownPositive(const SCEV *S)
Test if the given expression is known to be positive.
LLVM_ABI const SCEV * getUDivExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
ConstantRange getUnsignedRange(const SCEV *S)
Determine the unsigned range for a particular SCEV.
LLVM_ABI const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
static LLVM_ABI bool mayAliasInScopes(const MDNode *Scopes, const MDNode *NoAlias)
static LLVM_ABI AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB)
A vector that has set insertion semantics.
Definition SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, PartialReductionExtendKind OpAExtend, PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind, std::optional< FastMathFlags > FMF) const
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:89
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:98
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:294
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:261
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
op_range operands()
Definition User.h:267
A recipe for generating the active lane mask for the vector loop that is used to predicate the vector...
Definition VPlan.h:3856
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:4236
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition VPlan.h:4311
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:4263
iterator end()
Definition VPlan.h:4273
iterator begin()
Recipe iterator methods.
Definition VPlan.h:4271
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:4324
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:232
VPRegionBlock * getEnclosingLoopRegion()
Definition VPlan.cpp:598
VPBasicBlock * splitAt(iterator SplitAt)
Split current block at SplitAt by inserting a new block between the current block and its successors ...
Definition VPlan.cpp:565
const VPRecipeBase & front() const
Definition VPlan.h:4283
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition VPlan.cpp:644
const VPRecipeBase & back() const
Definition VPlan.h:4285
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition VPlan.h:2761
VPValue * getMask(unsigned Idx) const
Return mask number Idx.
Definition VPlan.h:2797
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2787
void setMask(unsigned Idx, VPValue *V)
Set mask number Idx to V.
Definition VPlan.h:2803
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2783
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:82
void setSuccessors(ArrayRef< VPBlockBase * > NewSuccs)
Set each VPBasicBlock in NewSuccs as successor of this VPBlockBase.
Definition VPlan.h:301
VPRegionBlock * getParent()
Definition VPlan.h:174
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:202
size_t getNumSuccessors() const
Definition VPlan.h:220
void setPredecessors(ArrayRef< VPBlockBase * > NewPreds)
Set each VPBasicBlock in NewPreds as predecessor of this VPBlockBase.
Definition VPlan.h:292
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:205
VPlan * getPlan()
Definition VPlan.cpp:177
const std::string & getName() const
Definition VPlan.h:165
void clearSuccessors()
Remove all the successors of this block.
Definition VPlan.h:311
VPBlockBase * getSinglePredecessor() const
Definition VPlan.h:216
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:182
VPBlockBase * getSingleHierarchicalPredecessor()
Definition VPlan.h:265
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:210
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:199
static auto blocksOnly(const T &Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:269
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition VPlanUtils.h:290
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBases IfTrue and IfFalse after BlockPtr.
Definition VPlanUtils.h:202
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:221
static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To)
Disconnect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:239
A recipe for generating conditional branches on the bits of a mask.
Definition VPlan.h:3265
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPInstruction * createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPValue * createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy, DebugLoc DL)
VPValue * createElementCount(Type *Ty, ElementCount EC)
VPInstruction * createNot(VPValue *Operand, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPInstruction * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL, const VPIRMetadata &Metadata={})
static VPBuilder getToInsertAfter(VPRecipeBase *R)
Create a VPBuilder to insert after R.
VPInstruction * createOverflowingOp(unsigned Opcode, ArrayRef< VPValue * > Operands, VPRecipeWithIRFlags::WrapFlagsTy WrapFlags={false, false}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", const VPIRFlags &Flags={})
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPIRValue *Start, VPValue *Current, VPValue *Step, const Twine &Name="")
Convert the input value Current to the corresponding value of an induction with Start and Step values...
VPInstruction * createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create a new ICmp VPInstruction with predicate Pred and operands A and B.
VPInstruction * createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", const VPIRFlags &Flags={})
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
Canonical scalar induction phi of the vector loop.
Definition VPlan.h:3798
A recipe for generating the phi node tracking the current scalar iteration index.
Definition VPlan.h:3888
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:427
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:400
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:412
ArrayRef< VPRecipeValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:422
A recipe for converting the input value IV value to the corresponding value of an IV with different s...
Definition VPlan.h:3968
Template specialization of the standard LLVM dominator tree utility for VPBlockBases.
bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B)
A recipe to combine multiple recipes into a single 'expression' recipe, which should be considered a ...
Definition VPlan.h:3310
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:2273
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2315
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2304
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:4389
BasicBlock * getIRBasicBlock() const
Definition VPlan.h:4413
Class to record and manage LLVM IR flags.
Definition VPlan.h:672
static VPIRFlags getDefaultFlags(unsigned Opcode)
Returns default flags for Opcode for opcodes that support it, asserts otherwise.
LLVM_ABI_FOR_TEST FastMathFlags getFastMathFlags() const
static LLVM_ABI_FOR_TEST VPIRInstruction * create(Instruction &I)
Create a new VPIRPhi for \p I, if it is a PHINode, otherwise create a VPIRInstruction.
Helper to manage IR metadata for recipes.
Definition VPlan.h:1138
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetadata object with MD, keeping only metadata nodes that are common to both.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:1193
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1295
@ ComputeAnyOfResult
Compute the final result of an AnyOf reduction with select(cmp(),x,y), where one of (x,...
Definition VPlan.h:1240
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1237
@ ReductionStartVector
Start vector for reductions with 3 operands: the original start value, the identity value for the red...
Definition VPlan.h:1289
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1232
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition VPlan.h:1229
@ CanonicalIVIncrementForPart
Definition VPlan.h:1213
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:2906
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:2898
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:2927
A recipe for interleaved memory operations with vector-predication intrinsics.
Definition VPlan.h:2979
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition VPlan.h:2937
VPValue * getIncomingValue(unsigned Idx) const
Returns the incoming VPValue with index Idx.
Definition VPlan.h:1560
VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when control converges back from ...
Definition VPlan.h:3452
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:388
VPRegionBlock * getRegion()
Definition VPlan.h:4541
VPBasicBlock * getParent()
Definition VPlan.h:463
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:537
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipies from IR instructions.
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
A recipe to represent inloop reduction operations with vector-predication intrinsics,...
Definition VPlan.h:3139
A recipe for handling reduction phis.
Definition VPlan.h:2667
void setVFScaleFactor(unsigned ScaleFactor)
Set the VFScaleFactor for this reduction phi.
Definition VPlan.h:2714
unsigned getVFScaleFactor() const
Get the factor that the VF of this recipe's output should be scaled by, or 1 if it isn't scaled.
Definition VPlan.h:2707
A recipe to represent inloop, ordered or partial reduction operations.
Definition VPlan.h:3030
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4424
const VPBlockBase * getEntry() const
Definition VPlan.h:4460
Type * getCanonicalIVType()
Return the type of the canonical IV for loop regions.
Definition VPlan.h:4535
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4492
void setExiting(VPBlockBase *ExitingBlock)
Set ExitingBlock as the exiting VPBlockBase of this VPRegionBlock.
Definition VPlan.h:4477
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the region.
Definition VPlan.h:4522
const VPBlockBase * getExiting() const
Definition VPlan.h:4472
VPBasicBlock * getPreheaderVPBB()
Returns the pre-header VPBasicBlock of the loop region.
Definition VPlan.h:4485
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:3184
bool isSingleScalar() const
Definition VPlan.h:3225
VPValue * getMask()
Return the mask of a predicated VPReplicateRecipe.
Definition VPlan.h:3249
A recipe for handling phi nodes of integer and floating-point inductions, producing their scalar valu...
Definition VPlan.h:4040
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition VPlan.h:589
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:657
VPSingleDefRecipe * clone() override=0
Clone the current recipe.
An analysis for type-inference for VPValues.
LLVMContext & getContext()
Return the LLVMContext used by the analysis.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:258
operand_range operands()
Definition VPlanValue.h:326
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:302
unsigned getNumOperands() const
Definition VPlanValue.h:296
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:297
void addOperand(VPValue *Operand)
Definition VPlanValue.h:291
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:46
Value * getLiveInIRValue() const
Return the underlying IR value for a VPIRValue.
Definition VPlan.cpp:137
bool isDefinedOutsideLoopRegions() const
Returns true if the VPValue is defined outside any loop.
Definition VPlan.cpp:1405
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:127
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:71
bool hasOneUse() const
Definition VPlanValue.h:142
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:172
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1408
unsigned getNumUsers() const
Definition VPlanValue.h:104
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition VPlan.cpp:1412
user_range users()
Definition VPlanValue.h:125
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:2121
A Recipe for widening the canonical induction variable of the vector loop.
Definition VPlan.h:3931
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1807
Instruction::CastOps getOpcode() const
Definition VPlan.h:1845
A recipe for handling GEP instructions.
Definition VPlan.h:2057
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition VPlan.h:2339
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2367
PHINode * getPHINode() const
Returns the underlying PHINode if one exists, or null otherwise.
Definition VPlan.h:2385
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2370
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition VPlan.h:2390
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2421
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2468
VPValue * getSplatVFValue() const
If the recipe has been unrolled, return the VPValue for the induction increment, otherwise return nul...
Definition VPlan.h:2472
VPValue * getLastUnrolledPartOperand()
Returns the VPValue representing the value of this induction at the last unrolled part,...
Definition VPlan.h:2499
A recipe for widening vector intrinsics.
Definition VPlan.h:1859
A common base class for widening memory operations.
Definition VPlan.h:3495
A recipe for widened phis.
Definition VPlan.h:2557
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1751
unsigned getOpcode() const
Definition VPlan.h:1788
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition VPlan.h:4554
VPIRValue * getLiveIn(Value *V) const
Return the live-in VPIRValue for V, if there is one or nullptr otherwise.
Definition VPlan.h:4858
bool hasVF(ElementCount VF) const
Definition VPlan.h:4763
LLVMContext & getContext() const
Definition VPlan.h:4745
VPBasicBlock * getEntry()
Definition VPlan.h:4646
bool hasScalableVF() const
Definition VPlan.h:4764
VPValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition VPlan.h:4743
VPValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4736
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4704
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition VPlan.h:4725
iterator_range< SmallSetVector< ElementCount, 2 >::iterator > vectorFactors() const
Returns an iterator range over all VFs of the plan.
Definition VPlan.h:4770
VPValue & getUF()
Returns the UF of the vector loop region.
Definition VPlan.h:4740
VPIRValue * getFalse()
Return a VPIRValue wrapping i1 false.
Definition VPlan.h:4829
VPIRValue * getAllOnesValue(Type *Ty)
Return a VPIRValue wrapping the AllOnes value of type Ty.
Definition VPlan.h:4835
VPRegionBlock * createReplicateRegion(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name="")
Create a new replicate region with Entry, Exiting and Name.
Definition VPlan.h:4906
auto getLiveIns() const
Return the list of live-in VPValues available in the VPlan.
Definition VPlan.h:4861
bool hasUF(unsigned UF) const
Definition VPlan.h:4781
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4694
VPSymbolicValue & getVectorTripCount()
The vector trip count.
Definition VPlan.h:4733
VPIRValue * getOrAddLiveIn(Value *V)
Gets the live-in VPIRValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:4806
VPIRValue * getZero(Type *Ty)
Return a VPIRValue wrapping the null value of type Ty.
Definition VPlan.h:4832
void setVF(ElementCount VF)
Definition VPlan.h:4751
bool isUnrolled() const
Returns true if the VPlan already has been unrolled, i.e.
Definition VPlan.h:4797
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1038
unsigned getConcreteUF() const
Returns the concrete UF of the plan, after unrolling.
Definition VPlan.h:4784
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4718
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4671
VPBasicBlock * createVPBasicBlock(const Twine &Name, VPRecipeBase *Recipe=nullptr)
Create a new VPBasicBlock with Name and containing Recipe if present.
Definition VPlan.h:4884
VPIRValue * getTrue()
Return a VPIRValue wrapping i1 true.
Definition VPlan.h:4826
bool hasScalarVFOnly() const
Definition VPlan.h:4774
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4685
VPIRBasicBlock * getScalarHeader() const
Return the VPIRBasicBlock wrapping the header of the scalar loop.
Definition VPlan.h:4690
VPBasicBlock * getVectorPreheader()
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4651
void setUF(unsigned UF)
Definition VPlan.h:4789
bool hasScalarTail() const
Returns true if the scalar tail may execute after the vector loop.
Definition VPlan.h:4938
LLVM_ABI_FOR_TEST VPlan * duplicate()
Clone the current VPlan, update all VPValues of the new VPlan and cloned recipes to refer to the clon...
Definition VPlan.cpp:1186
VPIRValue * getConstantInt(Type *Ty, uint64_t Val, bool IsSigned=false)
Return a VPIRValue wrapping a ConstantInt with the given type and value.
Definition VPlan.h:4840
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
iterator_range< user_iterator > users()
Definition Value.h:427
bool hasName() const
Definition Value.h:262
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
constexpr bool hasKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns true if there exists a value X where RHS.multiplyCoefficientBy(X) will result in a value whos...
Definition TypeSize.h:269
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr ScalarTy getKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns a value X where RHS.multiplyCoefficientBy(X) will result in a value whose quantity matches ou...
Definition TypeSize.h:277
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt RoundingUDiv(const APInt &A, const APInt &B, APInt::Rounding RM)
Return A unsign-divided by B, rounded by the given rounding mode.
Definition APInt.cpp:2774
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedStore(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedStore Intrinsic.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
LogicalOp_match< LHS, RHS, Instruction::And > m_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R either in the form of L & R or L ?
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, OpTy > m_ZExtOrSelf(const OpTy &Op)
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
auto match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedLoad(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedLoad Intrinsic.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
SpecificCmpClass_match< LHS, RHS, CmpInst > m_SpecificCmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
CastInst_match< OpTy, FPExtInst > m_FPExt(const OpTy &Op)
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::UDiv > m_UDiv(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::FAdd, true > m_c_FAdd(const LHS &L, const RHS &R)
Matches FAdd with LHS and RHS in either order.
LogicalOp_match< LHS, RHS, Instruction::And, true > m_c_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R with LHS and RHS in either order.
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
SCEVAffineAddRec_match< Op0_t, Op1_t, class_match< const Loop > > m_scev_AffineAddRec(const Op0_t &Op0, const Op1_t &Op1)
bool match(const SCEV *S, const Pattern &P)
class_match< const SCEV > m_SCEV()
VPInstruction_match< VPInstruction::ExtractLastLane, VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > > m_ExtractLastLaneOfLastPart(const Op0_t &Op0)
AllRecipe_commutative_match< Instruction::And, Op0_t, Op1_t > m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1)
Match a binary AND operation.
AllRecipe_match< Instruction::Or, Op0_t, Op1_t > m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
Match a binary OR operation.
VPInstruction_match< VPInstruction::AnyOf > m_AnyOf()
AllRecipe_commutative_match< Instruction::Or, Op0_t, Op1_t > m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ComputeReductionResult, Op0_t > m_ComputeReductionResult(const Op0_t &Op0)
VPInstruction_match< VPInstruction::StepVector > m_StepVector()
GEPLikeRecipe_match< Op0_t, Op1_t > m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::BranchOnTwoConds > m_BranchOnTwoConds()
AllRecipe_match< Opcode, Op0_t, Op1_t > m_Binary(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::LastActiveLane, Op0_t > m_LastActiveLane(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ExitingIVValue, Op0_t > m_ExitingIVValue(const Op0_t &Op0)
VPInstruction_match< Instruction::ExtractElement, Op0_t, Op1_t > m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_False()
VPDerivedIV_match< Op0_t, Op1_t, Op2_t > m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::ExtractLastLane, Op0_t > m_ExtractLastLane(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t > m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCount > m_BranchOnCount()
bind_ty< VPIRValue > m_VPIRValue(VPIRValue *&V)
Match a VPIRValue.
specific_intval< 1 > m_True()
VectorEndPointerRecipe_match< Op0_t, Op1_t > m_VecEndPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > m_ExtractLastPart(const Op0_t &Op0)
VPInstruction_match< VPInstruction::Broadcast, Op0_t > m_Broadcast(const Op0_t &Op0)
class_match< VPValue > m_VPValue()
Match an arbitrary VPValue and ignore it.
VPInstruction_match< VPInstruction::ExplicitVectorLength, Op0_t > m_EVL(const Op0_t &Op0)
VPInstruction_match< VPInstruction::BuildVector > m_BuildVector()
BuildVector is matches only its opcode, w/o matching its operands as the number of operands is not fi...
VPInstruction_match< VPInstruction::ExtractPenultimateElement, Op0_t > m_ExtractPenultimateElement(const Op0_t &Op0)
VPInstruction_match< VPInstruction::FirstActiveLane, Op0_t > m_FirstActiveLane(const Op0_t &Op0)
bind_ty< VPInstruction > m_VPInstruction(VPInstruction *&V)
Match a VPInstruction, capturing if we match.
VPInstruction_match< VPInstruction::BranchOnCond > m_BranchOnCond()
VPInstruction_match< VPInstruction::ExtractLane, Op0_t, Op1_t > m_ExtractLane(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::Reverse, Op0_t > m_Reverse(const Op0_t &Op0)
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
bool isUniformAcrossVFsAndUFs(VPValue *V)
Checks if V is uniform across all VF lanes and UF parts.
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
VPInstruction * findComputeReductionResult(VPReductionPHIRecipe *PhiR)
Find the ComputeReductionResult recipe for PhiR, looking through selects inserted for predicated redu...
std::optional< MemoryLocation > getMemoryLocation(const VPRecipeBase &R)
Return a MemoryLocation for R with noalias metadata populated from R, if the recipe is supported and ...
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
VPRecipeBase * findRecipe(VPValue *Start, PredT Pred)
Search Start's users for a recipe satisfying Pred, looking through recipes with definitions.
Definition VPlanUtils.h:111
VPSingleDefRecipe * findHeaderMask(VPlan &Plan)
Collect the header mask with the pattern: (ICMP_ULE, WideCanonicalIV, backedge-taken-count) TODO: Int...
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
static VPRecipeBase * findUserOf(VPValue *V, const MatchT &P)
If V is used by a recipe matching pattern P, return it.
Definition VPlanUtils.h:132
const SCEV * getSCEVExprForVPValue(const VPValue *V, PredicatedScalarEvolution &PSE, const Loop *L=nullptr)
Return the SCEV expression for V.
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:532
auto min_element(R &&Range)
Provide wrappers to std::min_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2078
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1669
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:841
DenseMap< const Value *, const SCEV * > ValueToSCEVMapTy
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
auto cast_or_null(const Y &Val)
Definition Casting.h:714
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:253
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:280
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2173
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1152
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
iterator_range< po_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_post_order_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in post order while traversing through ...
Definition VPlanCFG.h:273
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
LLVM_ABI_FOR_TEST cl::opt< bool > EnableWideActiveLaneMask
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition STLExtras.h:552
bool canConstantBeExtended(const APInt *C, Type *NarrowType, TTI::PartialReductionExtendKind ExtKind)
Check if a constant CI can be safely treated as having been extended from a narrower type with the gi...
Definition VPlan.cpp:1767
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
iterator_range< po_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_post_order_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in post order.
Definition VPlanCFG.h:266
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:323
TargetTransformInfo TTI
RecurKind
These are the kinds of recurrences that we support.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ FindIV
FindIV reduction with select(icmp(),x,y) where one of (x,y) is a loop induction variable (increasing ...
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
@ FAdd
Sum of floats.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
LLVM_ABI Value * getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF)
Given information about an recurrence kind, return the identity for the @llvm.vector....
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2012
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2088
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1772
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list.
Definition STLExtras.h:2166
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2146
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
RemoveMask_match(const Op0_t &In, Op1_t &Out)
bool match(OpTy *V) const
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition Metadata.h:763
MDNode * Scope
The tag for alias scope specification (used with noalias).
Definition Metadata.h:786
MDNode * NoAlias
The tag specifying the noalias scope.
Definition Metadata.h:789
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
An information struct used to provide DenseMap with the various necessary components for a given valu...
Incoming for lane maks phi as machine instruction, incoming register Reg and incoming block Block are...
This reduction is unordered with the partial result scaled down by some factor.
Definition VPlan.h:2649
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Struct to hold various analysis needed for cost computations.
TargetTransformInfo::TargetCostKind CostKind
VPTypeAnalysis Types
const TargetTransformInfo & TTI
A recipe for handling first-order recurrence phis.
Definition VPlan.h:2605
A VPValue representing a live-in from the input IR or a constant.
Definition VPlanValue.h:183
Type * getType() const
Returns the type of the underlying IR value.
Definition VPlan.cpp:141
A symbolic live-in VPValue, used for values like vector trip count, VF, and VFxUF.
Definition VPlanValue.h:223
A recipe for widening load operations with vector-predication intrinsics, using the address to load f...
Definition VPlan.h:3628
A recipe for widening load operations, using the address to load from and an optional mask.
Definition VPlan.h:3586
A recipe for widening store operations with vector-predication intrinsics, using the value to store,...
Definition VPlan.h:3713
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition VPlan.h:3669
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlan &Plan, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializePacksAndUnpacks(VPlan &Plan)
Add explicit Build[Struct]Vector recipes to Pack multiple scalar values into vectors and Unpack recip...
static void materializeFactors(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize UF, VF and VFxUF to be computed explicitly using VPInstructions.
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static void hoistInvariantLoads(VPlan &Plan)
Hoist single-scalar loads with invariant addresses out of the vector loop to the preheader,...
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static void dropPoisonGeneratingRecipes(VPlan &Plan, const std::function< bool(BasicBlock *)> &BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void createAndOptimizeReplicateRegions(VPlan &Plan)
Wrap predicated VPReplicateRecipes with a mask operand in an if-then region block and remove the mask...
static void convertToVariableLengthStep(VPlan &Plan)
Transform loops with variable-length stepping after region dissolution.
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed)
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static std::unique_ptr< VPlan > narrowInterleaveGroups(VPlan &Plan, const TargetTransformInfo &TTI)
Try to find a single VF among Plan's VFs for which all interleave groups (with known minimum VF eleme...
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand VPExpandSCEVRecipes in Plan's entry block.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static void expandBranchOnTwoConds(VPlan &Plan)
Expand BranchOnTwoConds instructions into explicit CFG with BranchOnCond instructions.
static void handleUncountableEarlyExits(VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VPBasicBlock *MiddleVPBB)
Update Plan to account for uncountable early exits by introducing appropriate branching logic in the ...
static void hoistPredicatedLoads(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Hoist predicated loads from the same address to the loop entry block, if they are guaranteed to execu...
static void optimizeFindIVReductions(VPlan &Plan, PredicatedScalarEvolution &PSE, Loop &L)
Optimize FindLast reductions selecting IVs (or expressions of IVs) by converting them to FindIV reduc...
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range)
Handle users in the exit block for first order reductions in the original exit block.
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPCurrentIterationPHIRecipe and related recipes to Plan and replaces all uses except the canoni...
static void optimizeEVLMasks(VPlan &Plan)
Optimize recipes which use an EVL-based header mask to VP intrinsics, for example:
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static void removeBranchOnConst(VPlan &Plan)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static void sinkPredicatedStores(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Sink predicated stores to the same address with complementary predicates (P and NOT P) to an uncondit...
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void optimizeInductionLiveOutUsers(VPlan &Plan, PredicatedScalarEvolution &PSE, bool FoldTail)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void createPartialReductions(VPlan &Plan, VPCostContext &CostCtx, VFRange &Range)
Detect and create partial reduction recipes for scaled reductions in Plan.
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue, VPValue *Step)
Materialize vector trip count computations to a set of VPInstructions.
static LLVM_ABI_FOR_TEST void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Try to have all users of fixed-order recurrences appear after the recipe defining their previous valu...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void convertEVLExitCond(VPlan &Plan)
Replaces the exit condition from (branch-on-cond eq CanonicalIVInc, VectorTripCount) to (branch-on-co...