LLVM 23.0.0git
VPlanTransforms.cpp
Go to the documentation of this file.
1//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements a set of utility VPlan to VPlan transformations.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPlanTransforms.h"
15#include "VPRecipeBuilder.h"
16#include "VPlan.h"
17#include "VPlanAnalysis.h"
18#include "VPlanCFG.h"
19#include "VPlanDominatorTree.h"
20#include "VPlanHelpers.h"
21#include "VPlanPatternMatch.h"
22#include "VPlanUtils.h"
23#include "VPlanVerifier.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/STLExtras.h"
28#include "llvm/ADT/SetVector.h"
30#include "llvm/ADT/TypeSwitch.h"
33#include "llvm/Analysis/Loads.h"
39#include "llvm/IR/Intrinsics.h"
40#include "llvm/IR/MDBuilder.h"
41#include "llvm/IR/Metadata.h"
46
47using namespace llvm;
48using namespace VPlanPatternMatch;
49using namespace SCEVPatternMatch;
50
// NOTE(review): this listing omits several original lines (the embedded
// numbering skips 51, 54, 56, 70, 89 and 138). Among them are the line carrying
// this function's name and return type, the block-traversal setup, and the
// definitions of `Inst` and `GEP` used below.
//
// Walks the visited blocks and replaces every recipe that has an underlying
// value with a widened counterpart: a widened phi, load, store, GEP, intrinsic
// call, cast, or a generic VPWidenRecipe. Returns false as soon as a call has
// no vector intrinsic ID or is llvm.experimental.noalias.scope.decl; returns
// true once all visited blocks were processed.
52 VPlan &Plan, const TargetLibraryInfo &TLI) {
53
55 Plan.getVectorLoopRegion());
57 // Skip blocks outside region
58 if (!VPBB->getParent())
59 break;
// Only the recipes in front of the terminator (if there is one) are visited.
60 VPRecipeBase *Term = VPBB->getTerminator();
61 auto EndIter = Term ? Term->getIterator() : VPBB->end();
62 // Introduce each ingredient into VPlan.
63 for (VPRecipeBase &Ingredient :
64 make_early_inc_range(make_range(VPBB->begin(), EndIter))) {
65
// Recipes without an underlying value are left untouched.
66 VPValue *VPV = Ingredient.getVPSingleValue();
67 if (!VPV->getUnderlyingValue())
68 continue;
69
71
72 VPRecipeBase *NewRecipe = nullptr;
73 if (auto *PhiR = dyn_cast<VPPhi>(&Ingredient)) {
74 auto *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
75 NewRecipe = new VPWidenPHIRecipe(PhiR->operands(), PhiR->getDebugLoc(),
76 Phi->getName());
77 } else if (auto *VPI = dyn_cast<VPInstruction>(&Ingredient)) {
78 assert(!isa<PHINode>(Inst) && "phis should be handled above");
79 // Create VPWidenMemoryRecipe for loads and stores.
80 if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
81 NewRecipe = new VPWidenLoadRecipe(
82 *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
83 false /*Consecutive*/, *VPI, Ingredient.getDebugLoc());
84 } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
// Operand 1 of the ingredient is passed before operand 0 here.
85 NewRecipe = new VPWidenStoreRecipe(
86 *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
87 nullptr /*Mask*/, false /*Consecutive*/, *VPI,
88 Ingredient.getDebugLoc());
// NOTE(review): the `else if` line that introduces `GEP` (89) is missing here.
90 NewRecipe = new VPWidenGEPRecipe(GEP->getSourceElementType(),
91 Ingredient.operands(), *VPI,
92 Ingredient.getDebugLoc(), GEP);
93 } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
// Calls without a vector intrinsic ID make the whole conversion fail.
94 Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
95 if (VectorID == Intrinsic::not_intrinsic)
96 return false;
97
98 // The noalias.scope.decl intrinsic declares a noalias scope that
99 // is valid for a single iteration. Emitting it as a single-scalar
100 // replicate would incorrectly extend the scope across multiple
101 // original iterations packed into one vector iteration.
102 // FIXME: If we want to vectorize this loop, then we have to drop
103 // all the associated !alias.scope and !noalias.
104 if (VectorID == Intrinsic::experimental_noalias_scope_decl)
105 return false;
106
107 // These intrinsics are recognized by getVectorIntrinsicIDForCall
108 // but are not widenable. Emit them as replicate instead of widening.
109 if (VectorID == Intrinsic::assume ||
110 VectorID == Intrinsic::lifetime_end ||
111 VectorID == Intrinsic::lifetime_start ||
112 VectorID == Intrinsic::sideeffect ||
113 VectorID == Intrinsic::pseudoprobe) {
114 // If the operand of llvm.assume holds before vectorization, it will
115 // also hold per lane.
116 // llvm.pseudoprobe requires to be duplicated per lane for accurate
117 // sample count.
118 const bool IsSingleScalar = VectorID != Intrinsic::assume &&
119 VectorID != Intrinsic::pseudoprobe;
120 NewRecipe = new VPReplicateRecipe(CI, Ingredient.operands(),
121 /*IsSingleScalar=*/IsSingleScalar,
122 /*Mask=*/nullptr, *VPI, *VPI,
123 Ingredient.getDebugLoc());
124 } else {
// drop_end() removes the last operand of the ingredient before widening the
// call.
125 NewRecipe = new VPWidenIntrinsicRecipe(
126 *CI, VectorID, drop_end(Ingredient.operands()), CI->getType(),
127 VPIRFlags(*CI), *VPI, CI->getDebugLoc());
128 }
129 } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
130 NewRecipe = new VPWidenCastRecipe(
131 CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), CI,
132 VPIRFlags(*CI), VPIRMetadata(*CI));
133 } else {
134 NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands(), *VPI,
135 *VPI, Ingredient.getDebugLoc());
136 }
137 } else {
// NOTE(review): the first line of this statement (138) is missing; only the
// message string of what is presumably an assert remains.
139 "inductions must be created earlier");
140 continue;
141 }
142
// Insert the replacement in front of the old recipe, forward the old value's
// uses to it (when it defines a value) and erase the old recipe.
143 NewRecipe->insertBefore(&Ingredient);
144 if (NewRecipe->getNumDefinedValues() == 1)
145 VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
146 else
147 assert(NewRecipe->getNumDefinedValues() == 0 &&
148 "Only recpies with zero or one defined values expected");
149 Ingredient.eraseFromParent();
150 }
151 }
152 return true;
153}
154
155/// Helper for extra no-alias checks via known-safe recipe and SCEV.
// NOTE(review): the embedded numbering skips 156, 159, 173, 193, 201 and 202,
// so the opening line of this type, one data member, the guard in front of the
// `return false` at 174, the definition of `MaxVF` and the start of the
// constructor are not visible in this listing.
157 const SmallPtrSetImpl<VPRecipeBase *> &ExcludeRecipes;
158 VPReplicateRecipe &GroupLeader;
160 const Loop &L;
161
162 // Return true if \p A and \p B are known to not alias for all VFs in the
163 // plan, checked via the distance between the accesses
164 bool isNoAliasViaDistance(VPReplicateRecipe *A, VPReplicateRecipe *B) const {
// Only pairs where both recipes have the Store opcode are handled.
165 if (A->getOpcode() != Instruction::Store ||
166 B->getOpcode() != Instruction::Store)
167 return false;
168
// Operand 1 of each recipe is used as its address.
169 VPValue *AddrA = A->getOperand(1);
170 const SCEV *SCEVA = vputils::getSCEVExprForVPValue(AddrA, PSE, &L);
171 VPValue *AddrB = B->getOperand(1);
172 const SCEV *SCEVB = vputils::getSCEVExprForVPValue(AddrB, PSE, &L);
174 return false;
175
// The difference of the two address SCEVs must match a constant APInt.
176 const APInt *Distance;
177 ScalarEvolution &SE = *PSE.getSE();
178 if (!match(SE.getMinusSCEV(SCEVA, SCEVB), m_scev_APInt(Distance)))
179 return false;
180
// Operand 0 of each recipe supplies the type whose store size is queried.
181 const DataLayout &DL = SE.getDataLayout();
182 Type *TyA = A->getOperand(0)->getScalarType();
183 uint64_t SizeA = DL.getTypeStoreSize(TyA);
184 Type *TyB = B->getOperand(0)->getScalarType();
185 uint64_t SizeB = DL.getTypeStoreSize(TyB);
186
187 // Use the maximum store size to ensure no overlap from either direction.
188 // Currently only handles fixed sizes, as it is only used for
189 // replicating VPReplicateRecipes.
190 uint64_t MaxStoreSize = std::max(SizeA, SizeB);
191
192 auto VFs = B->getParent()->getPlan()->vectorFactors();
194 if (MaxVF.isScalable())
195 return false;
// No alias is reported when |Distance| >= MaxVF * MaxStoreSize.
196 return Distance->abs().uge(
197 MaxVF.multiplyCoefficientBy(MaxStoreSize).getFixedValue());
198 }
199
200public:
203 const Loop &L)
204 : ExcludeRecipes(ExcludeRecipes), GroupLeader(GroupLeader), PSE(PSE),
205 L(L) {}
206
207 /// Return true if \p R should be skipped during alias checking, either
208 /// because it's in the exclude set or because no-alias can be proven via
209 /// SCEV.
210 bool shouldSkip(VPRecipeBase &R) const {
// The distance check only runs when \p R is a VPReplicateRecipe; it is compared
// against GroupLeader.
211 auto *Store = dyn_cast<VPReplicateRecipe>(&R);
212 return ExcludeRecipes.contains(&R) ||
213 (Store && isNoAliasViaDistance(Store, &GroupLeader));
214 }
215};
216
217/// Check if a memory operation doesn't alias with memory operations using
218/// scoped noalias metadata, in blocks in the single-successor chain between \p
219/// FirstBB and \p LastBB. If \p SinkInfo is std::nullopt, only recipes that may
220/// write to memory are checked (for load hoisting). Otherwise recipes that both
221/// read and write memory are checked, and SCEV is used to prove no-alias
222/// between the group leader and other replicate recipes (for store sinking).
// NOTE(review): the embedded numbering skips 224, 232, 241 and 247, so the line
// with the function name and the `MemLoc` parameter, the block range being
// iterated, the definition of `Loc` and the condition in front of the
// `return false` at 248 are not visible in this listing.
223static bool
225 VPBasicBlock *FirstBB, VPBasicBlock *LastBB,
226 std::optional<SinkStoreInfo> SinkInfo = {}) {
// Reads are only checked when a SinkStoreInfo was supplied.
227 bool CheckReads = SinkInfo.has_value();
// Bail out when MemLoc carries no scope metadata.
228 if (!MemLoc.AATags.Scope)
229 return false;
230
231 for (VPBasicBlock *VPBB :
233 for (VPRecipeBase &R : *VPBB) {
234 if (SinkInfo && SinkInfo->shouldSkip(R))
235 continue;
236
237 // Skip recipes that don't need checking.
238 if (!R.mayWriteToMemory() && !(CheckReads && R.mayReadFromMemory()))
239 continue;
240
242 if (!Loc)
243 // Conservatively assume aliasing for memory operations without
244 // location.
245 return false;
246
248 return false;
249 }
250 }
// No visited recipe triggered an early exit.
251 return true;
252}
253
254/// Get the value type of the replicate load or store. \p IsLoad indicates
255/// whether it is a load.
// NOTE(review): the declaration line (embedded number 256) is missing from this
// listing. When \p IsLoad is true the scalar type of \p R itself is returned;
// otherwise the scalar type of operand 0 of \p R.
257 return (IsLoad ? R : R->getOperand(0))->getScalarType();
258}
259
260/// Collect either replicated Loads or Stores grouped by their address SCEV and
261/// their load-store type, in a deep-traversal of the vector loop region in \p
262/// Plan.
// NOTE(review): the embedded numbering skips 264, 265, 271, 272, 274, 275 and
// 293, so the return type and function name, the declaration of the
// RecipesByAddressAndType map, the block traversal and the opening of the sort
// call are not visible in this listing.
//
// \p FilterFn is consulted for every VPReplicateRecipe with the requested
// opcode; recipes for which it returns false are ignored. Recipes whose
// address SCEV could not be computed are not grouped.
263template <unsigned Opcode>
266 VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L,
267 function_ref<bool(VPReplicateRecipe *)> FilterFn) {
268 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
269 "Only Load and Store opcodes supported");
270 constexpr bool IsLoad = (Opcode == Instruction::Load);
273 RecipesByAddressAndType;
276 for (VPRecipeBase &R : *VPBB) {
277 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
278 if (!RepR || RepR->getOpcode() != Opcode || !FilterFn(RepR))
279 continue;
280
281 // For loads, operand 0 is address; for stores, operand 1 is address.
282 VPValue *Addr = RepR->getOperand(IsLoad ? 0 : 1);
283 const Type *LoadStoreTy = getLoadStoreValueType(RepR, IsLoad);
284 const SCEV *AddrSCEV = vputils::getSCEVExprForVPValue(Addr, PSE, L);
// The (address SCEV, value type) pair is the grouping key.
285 if (!isa<SCEVCouldNotCompute>(AddrSCEV))
286 RecipesByAddressAndType[{AddrSCEV, LoadStoreTy}].push_back(RepR);
287 }
288 }
// Copy the groups out of the map, then order each group using a dominator tree
// built for the plan.
289 auto Groups = to_vector(RecipesByAddressAndType.values());
290 VPDominatorTree VPDT(Plan);
291 for (auto &Group : Groups) {
292 // Sort mem ops by dominance order, with earliest (most dominating) first.
294 return VPDT.properlyDominates(A, B);
295 });
296 }
297 return Groups;
298}
299
// Sinks operands of recipes located in the 'then' block of replicate regions
// into that block, and keeps following the operands of every recipe it moved.
// A candidate with users outside the target block is duplicated when those
// users only use its first lane. Returns true if at least one recipe was moved.
//
// NOTE(review): the embedded numbering skips 305, 315 and 331, so the
// declaration of `WorkList`, the recipe-kind check in front of the `return` at
// 316 and the loop header that introduces `VPR` are not visible in this
// listing.
300static bool sinkScalarOperands(VPlan &Plan) {
301 auto Iter = vp_depth_first_deep(Plan.getEntry());
302 bool ScalarVFOnly = Plan.hasScalarVFOnly();
303 bool Changed = false;
304
// Queues (SinkTo, defining recipe of Op) in WorkList if the recipe passes the
// checks below.
306 auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList](
307 VPBasicBlock *SinkTo, VPValue *Op) {
308 auto *Candidate =
309 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe());
310 if (!Candidate)
311 return;
312
313 // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes
314 // for now.
316 return;
317
// Nothing to do if the candidate already lives in SinkTo or must not be sunk.
318 if (Candidate->getParent() == SinkTo ||
319 vputils::cannotHoistOrSinkRecipe(*Candidate, /*Sinking=*/true))
320 return;
321
// Single-scalar replicate recipes are only queued for scalar-VF-only plans.
322 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate))
323 if (!ScalarVFOnly && RepR->isSingleScalar())
324 return;
325
326 WorkList.insert({SinkTo, Candidate});
327 };
328
329 // First, collect the operands of all recipes in replicate blocks as seeds for
330 // sinking.
332 VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
333 if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
334 continue;
// Only handle regions whose first entry successor leads directly to the
// region's exiting block.
335 VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front());
336 if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
337 continue;
338 for (auto &Recipe : *VPBB)
339 for (VPValue *Op : Recipe.operands())
340 InsertIfValidSinkCandidate(VPBB, Op);
341 }
342
343 // Try to sink each replicate or scalar IV steps recipe in the worklist.
// The index loop re-reads WorkList.size() on every iteration, so entries
// queued while sinking are processed too.
344 for (unsigned I = 0; I != WorkList.size(); ++I) {
345 VPBasicBlock *SinkTo;
346 VPSingleDefRecipe *SinkCandidate;
347 std::tie(SinkTo, SinkCandidate) = WorkList[I];
348
349 // All recipe users of SinkCandidate must be in the same block SinkTo or all
350 // users outside of SinkTo must only use the first lane of SinkCandidate. In
351 // the latter case, we need to duplicate SinkCandidate.
352 auto UsersOutsideSinkTo =
353 make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) {
354 return cast<VPRecipeBase>(U)->getParent() != SinkTo;
355 });
356 if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) {
357 return !U->usesFirstLaneOnly(SinkCandidate);
358 }))
359 continue;
360 bool NeedsDuplicating = !UsersOutsideSinkTo.empty();
361
362 if (NeedsDuplicating) {
// Candidates that would need a duplicate are skipped for scalar-VF-only plans.
363 if (ScalarVFOnly)
364 continue;
365 VPSingleDefRecipe *Clone;
366 if (auto *SinkCandidateRepR =
367 dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
368 // TODO: Handle converting to uniform recipes as separate transform,
369 // then cloning should be sufficient here.
370 Instruction *I = SinkCandidate->getUnderlyingInstr();
371 Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,
372 nullptr /*Mask*/, *SinkCandidateRepR,
373 *SinkCandidateRepR);
374 // TODO: add ".cloned" suffix to name of Clone's VPValue.
375 } else {
376 Clone = SinkCandidate->clone();
377 }
378
// The clone stays at the original position and takes over all uses outside
// SinkTo.
379 Clone->insertBefore(SinkCandidate);
380 SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
381 return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
382 });
383 }
// Move the candidate to the first non-phi position of SinkTo and queue its
// operands as new candidates.
384 SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
385 for (VPValue *Op : SinkCandidate->operands())
386 InsertIfValidSinkCandidate(SinkTo, Op);
387 Changed = true;
388 }
389 return Changed;
390}
391
392/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
393/// the mask.
// NOTE(review): the declaration line (embedded number 394) is missing from this
// listing. nullptr is returned unless the entry is a VPBasicBlock that holds
// exactly one recipe and that recipe is a VPBranchOnMaskRecipe; the mask is
// operand 0 of that recipe.
395 auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
396 if (!EntryBB || EntryBB->size() != 1 ||
397 !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
398 return nullptr;
399
400 return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
401}
402
403/// If \p R is a triangle region, return the 'then' block of the triangle.
// NOTE(review): the declaration line (embedded number 404) is missing from this
// listing. nullptr is returned when the entry block does not have exactly two
// successors, when either successor is not a VPBasicBlock, or when the two
// successors do not form a triangle as checked below.
405 auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
406 if (EntryBB->getNumSuccessors() != 2)
407 return nullptr;
408
409 auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
410 auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
411 if (!Succ0 || !Succ1)
412 return nullptr;
413
// The two successors together must have exactly one outgoing edge.
414 if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
415 return nullptr;
// The 'then' block is the successor whose single successor is the other one.
416 if (Succ0->getSingleSuccessor() == Succ1)
417 return Succ0;
418 if (Succ1->getSingleSuccessor() == Succ0)
419 return Succ1;
420 return nullptr;
421}
422
423// Merge replicate regions in their successor region, if a replicate region
424// is connected to a successor replicate region with the same predicate by a
425// single, empty VPBasicBlock.
// Returns true if at least one region was merged.
//
// NOTE(review): the embedded numbering skips 426, 432 and 433, so the
// declaration line, the declaration of `WorkList` and the head of the
// collection loop that introduces `Region1` are not visible in this listing.
427 SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
428
429 // Collect replicate regions followed by an empty block, followed by another
430 // replicate region with matching masks to process front. This is to avoid
431 // iterator invalidation issues while merging regions.
434 vp_depth_first_deep(Plan.getEntry()))) {
435 if (!Region1->isReplicator())
436 continue;
437 auto *MiddleBasicBlock =
438 dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
439 if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
440 continue;
441
442 auto *Region2 =
443 dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
444 if (!Region2 || !Region2->isReplicator())
445 continue;
446
// Both regions must branch on the very same mask value.
447 VPValue *Mask1 = getPredicatedMask(Region1);
448 VPValue *Mask2 = getPredicatedMask(Region2);
449 if (!Mask1 || Mask1 != Mask2)
450 continue;
451
452 assert(Mask1 && Mask2 && "both region must have conditions");
453 WorkList.push_back(Region1);
454 }
455
456 // Move recipes from Region1 to its successor region, if both are triangles.
457 for (VPRegionBlock *Region1 : WorkList) {
458 if (TransformedRegions.contains(Region1))
459 continue;
460 auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
461 auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
462
463 VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
464 VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
465 if (!Then1 || !Then2)
466 continue;
467
468 // Note: No fusion-preventing memory dependencies are expected in either
469 // region. Such dependencies should be rejected during earlier dependence
470 // checks, which guarantee accesses can be re-ordered for vectorization.
471 //
472 // Move recipes to the successor region.
// Then1 is walked in reverse and each recipe is placed at Then2's first
// non-phi position.
473 for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
474 ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());
475
476 auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
477 auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());
478
479 // Move VPPredInstPHIRecipes from the merge block to the successor region's
480 // merge block. Update all users inside the successor region to use the
481 // original values.
482 for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
483 VPValue *PredInst1 =
484 cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
485 VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
486 Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
487 return cast<VPRecipeBase>(&U)->getParent() == Then2;
488 });
489
490 // Remove phi recipes that are unused after merging the regions.
491 if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
492 Phi1ToMove.eraseFromParent();
493 continue;
494 }
495 Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
496 }
497
498 // Remove the dead recipes in Region1's entry block.
499 for (VPRecipeBase &R :
500 make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
501 R.eraseFromParent();
502
503 // Finally, remove the first region.
// Every predecessor of Region1 is reconnected to MiddleBasicBlock instead.
504 for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
505 VPBlockUtils::disconnectBlocks(Pred, Region1);
506 VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
507 }
508 VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
509 TransformedRegions.insert(Region1);
510 }
511
512 return !TransformedRegions.empty();
513}
514
// NOTE(review): the embedded numbering skips 515 and 538, so the line with the
// return type, function name and the `PredRecipe` parameter, as well as the
// line declaring `Region`, are not visible in this listing.
//
// Builds a replicate region named "pred.<opcode name>" for \p PredRecipe with
// three blocks: ".entry" (a VPBranchOnMaskRecipe on the recipe's mask), ".if"
// (an unmasked copy of the recipe) and ".continue". If \p PredRecipe has users
// they are redirected to a VPPredInstPHIRecipe appended to ".continue".
// \p PredRecipe is erased and the new region is returned.
516 VPRegionBlock *ParentRegion,
517 VPlan &Plan) {
518 Instruction *Instr = PredRecipe->getUnderlyingInstr();
519 // Build the triangular if-then region.
520 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
521 assert(Instr->getParent() && "Predicated instruction not in any basic block");
// The branch takes the debug location of the mask's defining recipe, or an
// unknown location when the mask has no defining recipe.
522 auto *BlockInMask = PredRecipe->getMask();
523 auto *MaskDef = BlockInMask->getDefiningRecipe();
524 auto *BOMRecipe = new VPBranchOnMaskRecipe(
525 BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
526 auto *Entry =
527 Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
528
529 // Replace predicated replicate recipe with a replicate recipe without a
530 // mask but in the replicate region.
// drop_end() removes the last operand of PredRecipe for the unmasked copy.
531 auto *RecipeWithoutMask = new VPReplicateRecipe(
532 PredRecipe->getUnderlyingInstr(), drop_end(PredRecipe->operands()),
533 PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe, *PredRecipe,
534 PredRecipe->getDebugLoc());
535 auto *Pred =
536 Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
537 auto *Exiting = Plan.createVPBasicBlock(Twine(RegionName) + ".continue");
539 Plan.createReplicateRegion(Entry, Exiting, RegionName);
540
541 // Note: first set Entry as region entry and then connect successors starting
542 // from it in order, to propagate the "parent" of each VPBasicBlock.
543 Region->setParent(ParentRegion);
544 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
545 VPBlockUtils::connectBlocks(Pred, Exiting);
546
// A phi in the exiting block is only created when the predicated value is used.
547 if (PredRecipe->getNumUsers() != 0) {
548 auto *PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
549 RecipeWithoutMask->getDebugLoc());
550 Exiting->appendRecipe(PHIRecipe);
551 PredRecipe->replaceAllUsesWith(PHIRecipe);
552 }
553 PredRecipe->eraseFromParent();
554 return Region;
555}
556
// Collects every predicated VPReplicateRecipe, then for each one splits its
// block at the recipe and calls createReplicateRegion() for it.
//
// NOTE(review): the embedded numbering skips 558, 559, 577 and 579, so the
// declaration of `WorkList`, the head of the collection loop, the line that
// declares `Region` and the statement after its creation are not visible in
// this listing.
557static void addReplicateRegions(VPlan &Plan) {
560 vp_depth_first_deep(Plan.getEntry()))) {
561 for (VPRecipeBase &R : *VPBB)
562 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
563 if (RepR->isPredicated())
564 WorkList.push_back(RepR);
565 }
566 }
567
// BBNum numbers the split blocks whose original IR block has a name.
568 unsigned BBNum = 0;
569 for (VPReplicateRecipe *RepR : WorkList) {
570 VPBasicBlock *CurrentBlock = RepR->getParent();
571 VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());
572
// The split block is named "<original block name>.<BBNum>", or gets an empty
// name when the original block is unnamed.
573 BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
574 SplitBlock->setName(
575 OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
576 // Record predicated instructions for above packing optimizations.
578 createReplicateRegion(RepR, CurrentBlock->getParent(), Plan);
580
// If CurrentBlock was the exiting block of the enclosing region, SplitBlock
// takes over that role.
581 VPRegionBlock *ParentRegion = Region->getParent();
582 if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
583 ParentRegion->setExiting(SplitBlock);
584 }
585}
586
// NOTE(review): the embedded numbering skips 587-589, so the declaration line,
// the declaration of `WorkList` and the head of the collection loop are not
// visible in this listing.
//
// Folds each collected VPBasicBlock into its single predecessor when that
// predecessor is a VPBasicBlock (but not a VPIRBasicBlock) with exactly one
// successor. Returns true if at least one block was collected.
590 vp_depth_first_deep(Plan.getEntry()))) {
591 // Don't fold the blocks in the skeleton of the Plan into their single
592 // predecessors for now.
593 // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
594 if (!VPBB->getParent())
595 continue;
596 auto *PredVPBB =
597 dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
598 if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
599 isa<VPIRBasicBlock>(PredVPBB))
600 continue;
601 WorkList.push_back(VPBB);
602 }
603
604 for (VPBasicBlock *VPBB : WorkList) {
605 VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
// Append all recipes of VPBB to the end of the predecessor.
606 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
607 R.moveBefore(*PredVPBB, PredVPBB->end());
608 VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
// If VPBB was the exiting block of its region, the predecessor becomes the
// exiting block.
609 auto *ParentRegion = VPBB->getParent();
610 if (ParentRegion && ParentRegion->getExiting() == VPBB)
611 ParentRegion->setExiting(PredVPBB);
612 VPBlockUtils::transferSuccessors(VPBB, PredVPBB);
613 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
614 }
615 return !WorkList.empty();
616}
617
// NOTE(review): the embedded numbering skips 618 and 620, so the declaration
// line and the statement that belongs to the comment at 619 are not visible in
// this listing.
619 // Convert masked VPReplicateRecipes to if-then region blocks.
621
// Re-run the three clean-up transforms until a full round changes nothing.
622 bool ShouldSimplify = true;
623 while (ShouldSimplify) {
624 ShouldSimplify = sinkScalarOperands(Plan);
625 ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
626 ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
627 }
628}
629
630/// Remove redundant casts of inductions.
631///
632/// Such redundant casts are casts of induction variables that can be ignored,
633/// because we already proved that the casted phi is equal to the uncasted phi
634/// in the vectorized loop. There is no need to vectorize the cast - the same
635/// value can be used for both the phi and casts in the vector loop.
// NOTE(review): the embedded numbering skips 636 and 638, so the declaration
// line and the line that defines `IV` from `Phi` are not visible in this
// listing.
637 for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
// Header phis that yield no `IV`, or whose IV has a truncate instruction, are
// skipped.
639 if (!IV || IV->getTruncInst())
640 continue;
641
642 // A sequence of IR Casts has potentially been recorded for IV, which
643 // *must be bypassed* when the IV is vectorized, because the vectorized IV
644 // will produce the desired casted value. This sequence forms a def-use
645 // chain and is provided in reverse order, ending with the cast that uses
646 // the IV phi. Search for the recipe of the last cast in the chain and
647 // replace it with the original IV. Note that only the final cast is
648 // expected to have users outside the cast-chain and the dead casts left
649 // over will be cleaned up later.
650 ArrayRef<Instruction *> Casts = IV->getInductionDescriptor().getCastInsts();
651 VPValue *FindMyCast = IV;
652 for (Instruction *IRCast : reverse(Casts)) {
// Look for the user of the current chain value whose underlying value is this
// IR cast.
653 VPSingleDefRecipe *FoundUserCast = nullptr;
654 for (auto *U : FindMyCast->users()) {
655 auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
656 if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
657 FoundUserCast = UserCast;
658 break;
659 }
660 }
661 // A cast recipe in the chain may have been removed by earlier DCE.
662 if (!FoundUserCast)
663 break;
664 FindMyCast = FoundUserCast;
665 }
// Only rewrite uses when at least one cast recipe of the chain was found.
666 if (FindMyCast != IV)
667 FindMyCast->replaceAllUsesWith(IV);
668 }
669}
670
// NOTE(review): the embedded numbering skips 671, 672 and 701, so the return
// type, the function name with its leading parameters (`Plan` and `Kind` are
// used below) and the initializer of `VecPreheader` are not visible in this
// listing.
//
// Creates a derived IV from the loop region's canonical IV, \p StartV and
// \p Step, truncates it to the type of \p TruncI if one is given, truncates
// \p Step to the resulting type if the types differ, and returns the scalar IV
// steps recipe built from both.
673 Instruction::BinaryOps InductionOpcode,
674 FPMathOperator *FPBinOp, Instruction *TruncI,
675 VPIRValue *StartV, VPValue *Step, DebugLoc DL,
676 VPBuilder &Builder) {
677 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
678 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
679 VPValue *CanonicalIV = LoopRegion->getCanonicalIV();
680 VPSingleDefRecipe *BaseIV =
681 Builder.createDerivedIV(Kind, FPBinOp, StartV, CanonicalIV, Step);
682
683 // Truncate base induction if needed.
684 Type *ResultTy = BaseIV->getScalarType();
685 if (TruncI) {
686 Type *TruncTy = TruncI->getType();
687 assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
688 "Not truncating.");
689 assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
690 BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
691 ResultTy = TruncTy;
692 }
693
694 // Truncate step if needed.
695 Type *StepTy = Step->getScalarType();
696 if (ResultTy != StepTy) {
697 assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
698 "Not truncating.");
699 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
700 auto *VecPreheader =
// The guard restores the builder's insert point when this scope ends; the step
// truncate itself is emitted at VecPreheader.
702 VPBuilder::InsertPointGuard Guard(Builder);
703 Builder.setInsertPoint(VecPreheader);
704 Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
705 }
706 return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
707 &Plan.getVF(), DL);
708}
709
// NOTE(review): the embedded numbering skips 710, 712, 719, 744, 758, 776 and
// 777, so the declaration line, the parameters that introduce `VF`, `UF` and
// `CostKind`, the lookup that initializes `WideCanIV`, the declaration of
// `WidenIV`, the shuffle-cost arguments and the definition of `ID` are not
// visible in this listing.
//
// Replaces the recipe found as `WideCanIV` by, in order of preference: scalar
// IV steps (scalar-VF-only plan or only the first lane used), an existing
// canonical widened induction in the header, or a newly created
// VPWidenIntOrFpInductionRecipe. Leaves the plan unchanged when none of the
// cases applies or one of the cost checks below bails out.
711 VPlan &Plan, ScalarEvolution &SE, const TargetTransformInfo &TTI,
713 const SmallPtrSetImpl<const Value *> &ValuesToIgnore) {
714 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
715 if (!LoopRegion)
716 return;
717
718 auto *WideCanIV =
720 if (!WideCanIV)
721 return;
722
723 Type *CanIVTy = LoopRegion->getCanonicalIVType();
724
725 // Replace the wide canonical IV with a scalar-iv-steps over the canonical
726 // IV.
727 if (Plan.hasScalarVFOnly() || vputils::onlyFirstLaneUsed(WideCanIV)) {
// The steps start at zero with a constant step of 1, both of the canonical IV
// type.
728 VPBuilder Builder(WideCanIV);
729 WideCanIV->replaceAllUsesWith(createScalarIVSteps(
730 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
731 nullptr, Plan.getZero(CanIVTy), Plan.getConstantInt(CanIVTy, 1),
732 WideCanIV->getDebugLoc(), Builder));
733 WideCanIV->eraseFromParent();
734 return;
735 }
736
// Nothing is changed when only scalar values of the wide canonical IV are used.
737 if (vputils::onlyScalarValuesUsed(WideCanIV))
738 return;
739
740 // If a canonical VPWidenIntOrFpInductionRecipe already produces vector lanes
741 // in the header, reuse it instead of introducing another wide induction phi.
742 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
743 for (VPRecipeBase &Phi : Header->phis()) {
745 if (!match(&Phi, m_CanonicalWidenIV(WidenIV)))
746 continue;
747 // The reused wide IV feeds the header mask, whose lanes may extend past
748 // the trip count; drop flags that only hold inside the scalar loop.
749 WidenIV->dropPoisonGeneratingFlags();
750 WideCanIV->replaceAllUsesWith(WidenIV);
751 WideCanIV->eraseFromParent();
752 return;
753 }
754
755 // Introduce a new VPWidenIntOrFpInductionRecipe if profitable.
756 auto *VecTy = VectorType::get(CanIVTy, VF);
757 InstructionCost BroadcastCost = TTI.getShuffleCost(
// Bail out when a phi costs more than the computed shuffle cost.
759 InstructionCost PHICost = TTI.getCFInstrCost(Instruction::PHI, CostKind);
760 if (PHICost > BroadcastCost)
761 return;
762
763 // Bail out if the additional wide induction phi increase the expected spill
764 // cost.
// The first register-usage result is scaled by UF, then compared with a copy
// that additionally accounts for one more value of type VecTy.
765 VPRegisterUsage UnrolledBase =
766 calculateRegisterUsageForPlan(Plan, VF, TTI, ValuesToIgnore)[0];
767 for (unsigned &NumUsers : make_second_range(UnrolledBase.MaxLocalUsers))
768 NumUsers *= UF;
769 unsigned RegClass = TTI.getRegisterClassForType(/*Vector=*/true, VecTy);
770 VPRegisterUsage Projected = UnrolledBase;
771 Projected.MaxLocalUsers[RegClass] += TTI.getRegUsageForType(VecTy);
772 if (Projected.spillCost(TTI, CostKind) >
773 UnrolledBase.spillCost(TTI, CostKind))
774 return;
775
// The new wide induction starts at zero with step 1, reuses the no-wrap flags
// and debug location of WideCanIV, and is inserted at the header's first
// non-phi position.
778 VPValue *StepV = Plan.getConstantInt(CanIVTy, 1);
779 auto *NewWideIV = new VPWidenIntOrFpInductionRecipe(
780 /*IV=*/nullptr, Plan.getZero(CanIVTy), StepV, &Plan.getVF(), ID,
781 WideCanIV->getNoWrapFlags(), WideCanIV->getDebugLoc());
782 NewWideIV->insertBefore(&*Header->getFirstNonPhi());
783 WideCanIV->replaceAllUsesWith(NewWideIV);
784 WideCanIV->eraseFromParent();
785}
786
787/// Returns true if \p R is dead and can be removed.
// NOTE(review): the embedded numbering skips 793, so the final operand of the
// `IsConditionalAssume` expression is not visible in this listing.
788static bool isDeadRecipe(VPRecipeBase &R) {
789 // Do remove conditional assume instructions as their conditions may be
790 // flattened.
791 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
792 bool IsConditionalAssume = RepR && RepR->isPredicated() &&
794 if (IsConditionalAssume)
795 return true;
796
// Any other recipe that may have side effects is never considered dead.
797 if (R.mayHaveSideEffects())
798 return false;
799
800 // Recipe is dead if no user keeps the recipe alive.
801 return all_of(R.definedValues(),
802 [](VPValue *V) { return V->getNumUsers() == 0; });
803}
804
// NOTE(review): the embedded numbering skips 805, 806 and 808, so the
// declaration line, the start of the traversal setup and the head of the block
// loop that introduces `VPBB` are not visible in this listing.
//
// Erases every recipe for which isDeadRecipe() returns true and, in addition,
// two-operand VPPhi recipes that form a dead cycle with their only user.
807 Plan.getEntry());
809 // The recipes in the block are processed in reverse order, to catch chains
810 // of dead recipes.
811 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
812 if (isDeadRecipe(R)) {
813 R.eraseFromParent();
814 continue;
815 }
816
817 // Check if R is a dead VPPhi <-> update cycle and remove it.
818 VPValue *Start, *Incoming;
819 if (!match(&R, m_VPPhi(m_VPValue(Start), m_VPValue(Incoming))))
820 continue;
821 auto *PhiR = cast<VPPhi>(&R);
822 VPUser *PhiUser = PhiR->getSingleUser();
823 if (!PhiUser)
824 continue;
// The cycle is dead when the phi's single user defines Incoming and Incoming
// has exactly one user.
825 if (PhiUser != Incoming->getDefiningRecipe() ||
826 Incoming->getNumUsers() != 1)
827 continue;
// Redirect the phi's uses to Start, then erase the phi and the recipe that
// defines Incoming.
828 PhiR->replaceAllUsesWith(Start);
829 PhiR->eraseFromParent();
830 Incoming->getDefiningRecipe()->eraseFromParent();
831 }
832 }
833}
834
// NOTE(review): the embedded numbering skips 835, 836 and 838, so the
// declaration line, the declaration and seeding of `Users` and the definition
// of `Cur` are not visible in this listing.
//
// The index loop re-reads Users.size() on every iteration, so users appended by
// insert_range() during the walk are visited as well. The collected users are
// returned as a vector.
837 for (unsigned I = 0; I != Users.size(); ++I) {
839 for (VPValue *V : Cur->definedValues())
840 Users.insert_range(V->users());
841 }
842 return Users.takeVector();
843}
844
845/// Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd
846/// (IndStart, ScalarIVSteps (0, Step)). This is used when the recipe only
847/// generates scalar values.
// NOTE(review): the embedded numbering skips 849, 851 and 854, so the function
// name with the `PtrIV` parameter, the definition of `ID` and the declaration
// of `Steps` are not visible in this listing.
//
// The scalar steps start at zero of the induction step's type and use operand 1
// of \p PtrIV as step. The returned PtrAdd is named "next.gep" and is based on
// the start value of \p PtrIV.
848static VPValue *
850 VPlan &Plan, VPBuilder &Builder) {
852 VPIRValue *StartV = Plan.getZero(ID.getStep()->getType());
853 VPValue *StepV = PtrIV->getOperand(1);
855 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
856 nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);
857
858 return Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
859 PtrIV->getDebugLoc(), "next.gep");
860}
861
862/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
863/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
864/// VPWidenPointerInductionRecipe will generate vectors only. If some users
865/// require vectors while other require scalars, the scalar uses need to extract
866/// the scalars from the generated vectors (Note that this is different to how
867/// int/fp inductions are handled). Legalize extract-from-ends using uniform
868/// VPReplicateRecipe of wide inductions to use regular VPReplicateRecipe, so
869/// the correct end value is available. Also optimize
870/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
871/// providing them scalar steps built on the canonical scalar IV and update the
872/// original IV's users. This is an optional optimization to reduce the needs of
873/// vector extracts.
// NOTE(review): the embedded numbering skips 874, 875, 898, 903 and 934, so the
// declaration line, the definition of `HeaderVPBB`, the condition in front of
// the `continue` at 899, the pattern passed to `match` at 902 and the
// declaration of `Steps` are not visible in this listing.
876 bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
// New recipes are created at the first non-phi position of the header.
877 VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
878 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
879 auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
880 if (!PhiR)
881 continue;
882
883 // Try to narrow wide and replicating recipes to uniform recipes, based on
884 // VPlan analysis.
885 // TODO: Apply to all recipes in the future, to replace legacy uniformity
886 // analysis.
887 auto Users = collectUsersRecursively(PhiR);
888 for (VPUser *U : reverse(Users)) {
889 auto *Def = dyn_cast<VPRecipeWithIRFlags>(U);
890 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
891 // Skip recipes that shouldn't be narrowed.
892 if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
893 Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
894 (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
895 continue;
896
897 // Skip recipes that may have other lanes than their first used.
899 continue;
900
901 // TODO: Support scalarizing ExtractValue.
902 if (match(Def,
904 continue;
905
// Replace Def by an unmasked VPReplicateRecipe created with IsUniform set to
// true and the same operands and flags.
906 auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
907 Def->operands(), /*IsUniform*/ true,
908 /*Mask*/ nullptr, /*Flags*/ *Def);
909 Clone->insertAfter(Def);
910 Def->replaceAllUsesWith(Clone);
911 }
912
913 // Replace wide pointer inductions which have only their scalars used by
914 // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
915 if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
// Pointer inductions are only scalarized for scalar-VF-only plans or when
// onlyScalarsGenerated() holds.
916 if (!Plan.hasScalarVFOnly() &&
917 !PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
918 continue;
919
920 VPValue *PtrAdd = scalarizeVPWidenPointerInduction(PtrIV, Plan, Builder);
921 PtrIV->replaceAllUsesWith(PtrAdd);
922 continue;
923 }
924
925 // Replace widened induction with scalar steps for users that only use
926 // scalars.
927 auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
928 if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
929 return U->usesScalars(WideIV);
930 }))
931 continue;
932
933 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
935 Plan, ID.getKind(), ID.getInductionOpcode(),
936 dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
937 WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
938 WideIV->getDebugLoc(), Builder);
939
940 // Update scalar users of IV to use Step instead.
941 if (!HasOnlyVectorVFs) {
942 assert(!Plan.hasScalableVF() &&
943 "plans containing a scalar VF cannot also include scalable VFs");
944 WideIV->replaceAllUsesWith(Steps);
945 } else {
// With a scalable VF only users of the first lane are redirected; otherwise
// every user that uses scalars of WideIV is.
946 bool HasScalableVF = Plan.hasScalableVF();
947 WideIV->replaceUsesWithIf(Steps,
948 [WideIV, HasScalableVF](VPUser &U, unsigned) {
949 if (HasScalableVF)
950 return U.usesFirstLaneOnly(WideIV);
951 return U.usesScalars(WideIV);
952 });
953 }
954 }
955}
956
957/// Check if \p VPV is an untruncated wide induction, either before or after the
958/// increment. If so return the header IV (before the increment), otherwise
959/// return null.
///
/// NOTE(review): the function signature (listing lines 960-961) is missing from
/// this extract; `VPV` and `PSE` used below are presumably its parameters.
962 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);
963 if (WideIV) {
964 // VPV itself is a wide induction, separately compute the end value for exit
965 // users if it is not a truncated IV.
966 auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
967 return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
968 }
969
970 // Check if VPV is an optimizable induction increment.
 // Only two-operand recipes with a wide induction as one of the operands are
 // candidates; operand 0 is checked first, then operand 1.
971 VPRecipeBase *Def = VPV->getDefiningRecipe();
972 if (!Def || Def->getNumOperands() != 2)
973 return nullptr;
974 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
975 if (!WideIV)
976 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
977 if (!WideIV)
978 return nullptr;
979
 // Returns true if VPV has the shape "WideIV <induction-op> step" for the
 // induction opcode recorded in WideIV's descriptor.
980 auto IsWideIVInc = [&]() {
981 auto &ID = WideIV->getInductionDescriptor();
982
983 // Check if VPV increments the induction by the induction step.
984 VPValue *IVStep = WideIV->getStepValue();
985 switch (ID.getInductionOpcode()) {
986 case Instruction::Add:
987 return match(VPV, m_c_Add(m_Specific(WideIV), m_Specific(IVStep)));
988 case Instruction::FAdd:
989 return match(VPV, m_c_FAdd(m_Specific(WideIV), m_Specific(IVStep)));
990 case Instruction::FSub:
991 return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
992 m_Specific(IVStep)));
993 case Instruction::Sub: {
994 // IVStep will be the negated step of the subtraction. Check if Step == -1
995 // * IVStep.
996 VPValue *Step;
997 if (!match(VPV, m_Sub(m_VPValue(), m_VPValue(Step))))
998 return false;
 // Compare via SCEV; bail out if either expression cannot be computed.
999 const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(IVStep, PSE);
1000 const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(Step, PSE);
1001 ScalarEvolution &SE = *PSE.getSE();
1002 return !isa<SCEVCouldNotCompute>(IVStepSCEV) &&
1003 !isa<SCEVCouldNotCompute>(StepSCEV) &&
1004 IVStepSCEV == SE.getNegativeSCEV(StepSCEV);
1005 }
1006 default:
 // Any other opcode is only accepted for pointer inductions incremented by
 // a GEP of the IV with its step.
1007 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
1008 match(VPV, m_GetElementPtr(m_Specific(WideIV),
1009 m_Specific(WideIV->getStepValue())));
1010 }
 // Every switch case above returns, so this point is not reached.
1011 llvm_unreachable("should have been covered by switch above");
1012 };
1013 return IsWideIVInc() ? WideIV : nullptr;
1014}
1015
1016/// Attempts to optimize the induction variable exit values for users in the
1017/// early exit block.
///
/// Returns the newly computed exit value, or nullptr if the operand does not
/// match the expected pattern or is not an optimizable, untruncated wide IV.
///
/// NOTE(review): the function signature (listing lines 1018-1019) and the head
/// of the first match (listing line 1021) are missing from this extract; `Plan`,
/// `Op` and `PSE` used below are presumably parameters.
1020 VPValue *Incoming, *Mask;
1022 m_VPValue(Incoming))))
1023 return nullptr;
1024
1025 auto *WideIV = getOptimizableIVOf(Incoming, PSE);
1026 if (!WideIV)
1027 return nullptr;
1028
 // Truncated int/fp inductions are not handled here.
1029 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
1030 if (WideIntOrFp && WideIntOrFp->getTruncInst())
1031 return nullptr;
1032
1033 // Calculate the final index.
1034 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
1035 auto *CanonicalIV = LoopRegion->getCanonicalIV();
1036 Type *CanonicalIVType = LoopRegion->getCanonicalIVType();
 // New recipes are created with a builder constructed from the recipe
 // defining Op.
1037 auto *ExtractR = cast<VPInstruction>(Op);
1038 VPBuilder B(ExtractR);
1039
1040 DebugLoc DL = ExtractR->getDebugLoc();
 // EndValue = CanonicalIV + first active lane of Mask, with the lane index
 // zero-extended or truncated to the canonical IV type.
1041 VPValue *FirstActiveLane = B.createFirstActiveLane(Mask, DL);
1042 FirstActiveLane = B.createScalarZExtOrTrunc(
1043 FirstActiveLane, CanonicalIVType, FirstActiveLane->getScalarType(), DL);
1044 VPValue *EndValue = B.createAdd(CanonicalIV, FirstActiveLane, DL);
1045
1046 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
1047 // changed it means the exit is using the incremented value, so we need to
1048 // add the step.
1049 if (Incoming != WideIV) {
1050 VPValue *One = Plan.getConstantInt(CanonicalIVType, 1);
1051 EndValue = B.createAdd(EndValue, One, DL);
1052 }
1053
 // For anything other than the canonical widened IV, map the index to the
 // induction's value with a derived IV built from its start and step.
1054 if (!match(WideIV, m_CanonicalWidenIV())) {
1055 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
1056 VPIRValue *Start = WideIV->getStartValue();
1057 VPValue *Step = WideIV->getStepValue();
1058 EndValue = B.createDerivedIV(
1059 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
1060 Start, EndValue, Step);
1061 }
1062
1063 return EndValue;
1064}
1065
1066/// Compute the end value for \p WideIV, unless it is truncated. Creates a
1067/// VPDerivedIVRecipe for non-canonical inductions.
///
/// Returns nullptr for truncated int/fp inductions. New recipes are created
/// through \p VectorPHBuilder; \p VectorTC is the index the end value is
/// derived from.
///
/// NOTE(review): the first line of the signature (listing line 1068) and the
/// line that introduces `ID` (listing line 1079) are missing from this extract.
1069 VPBuilder &VectorPHBuilder,
1070 VPValue *VectorTC) {
1071 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
1072 // Truncated wide inductions resume from the last lane of their vector value
1073 // in the last vector iteration which is handled elsewhere.
1074 if (WideIntOrFp && WideIntOrFp->getTruncInst())
1075 return nullptr;
1076
1077 VPIRValue *Start = WideIV->getStartValue();
1078 VPValue *Step = WideIV->getStepValue();
 // The canonical widened IV ends at VectorTC itself; any other induction needs
 // a derived IV computed from its start and step.
1080 VPValue *EndValue = VectorTC;
1081 if (!match(WideIV, m_CanonicalWidenIV())) {
1082 EndValue = VectorPHBuilder.createDerivedIV(
1083 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
1084 Start, VectorTC, Step);
1085 }
1086
1087 // EndValue is derived from the vector trip count (which has the same type as
1088 // the widest induction) and thus may be wider than the induction here.
1089 Type *ScalarTypeOfWideIV = WideIV->getScalarType();
1090 if (ScalarTypeOfWideIV != EndValue->getScalarType()) {
1091 EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
1092 ScalarTypeOfWideIV,
1093 WideIV->getDebugLoc());
1094 }
1095
1096 return EndValue;
1097}
1098
1099/// Attempts to optimize the induction variable exit values for users in the
1100/// exit block coming from the latch in the original scalar loop.
///
/// Returns the value exit users should use, or nullptr if the operand is not an
/// optimizable wide induction. The end value of the induction must already be
/// present in `EndValues`.
///
/// NOTE(review): the function name and parameter list (listing lines 1102-1104)
/// and the match guarding the first `return nullptr` (listing line 1106) are
/// missing from this extract; `Plan`, `Op`, `EndValues` and `PSE` are presumably
/// parameters.
1101static VPValue *
1105 VPValue *Incoming;
1107 return nullptr;
1108
1109 VPWidenInductionRecipe *WideIV = getOptimizableIVOf(Incoming, PSE);
1110 if (!WideIV)
1111 return nullptr;
1112
1113 VPValue *EndValue = EndValues.lookup(WideIV);
1114 assert(EndValue && "Must have computed the end value up front");
1115
1116 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
1117 // changed it means the exit is using the incremented value, so we don't
1118 // need to subtract the step.
1119 if (Incoming != WideIV)
1120 return EndValue;
1121
1122 // Otherwise, subtract the step from the EndValue.
1123 auto *ExtractR = cast<VPInstruction>(Op);
1124 VPBuilder B(ExtractR);
1125 VPValue *Step = WideIV->getStepValue();
1126 Type *ScalarTy = WideIV->getScalarType();
 // Integer induction: EndValue - Step.
1127 if (ScalarTy->isIntegerTy())
1128 return B.createSub(EndValue, Step, DebugLoc::getUnknown(), "ind.escape");
 // Pointer induction: step back by adding the negated step (0 - Step).
1129 if (ScalarTy->isPointerTy()) {
1130 Type *StepTy = Step->getScalarType();
1131 auto *Zero = Plan.getZero(StepTy);
1132 return B.createPtrAdd(EndValue, B.createSub(Zero, Step),
1133 DebugLoc::getUnknown(), "ind.escape");
1134 }
 // Floating-point induction: apply the inverse of the induction operation
 // (FSub for FAdd, FAdd otherwise), reusing its fast-math flags.
1135 if (ScalarTy->isFloatingPointTy()) {
1136 const auto &ID = WideIV->getInductionDescriptor();
1137 return B.createNaryOp(
1138 ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
1139 ? Instruction::FSub
1140 : Instruction::FAdd,
1141 {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
1142 }
1143 llvm_unreachable("all possible induction types must be handled");
1144 return nullptr;
1145}
1146
1148 VPlan &Plan, PredicatedScalarEvolution &PSE, bool FoldTail) {
 // Pre-computes the end value of every wide induction in the vector
 // preheader, replaces exiting-IV-value recipes in the middle block with those
 // end values, and then tries to rewrite induction operands of exit-block phis.
 //
 // NOTE(review): the first line of the signature (listing line 1147), the
 // declaration of `EndValues` (listing line 1153) and the heads of the two
 // calls assigning `Escape` (listing lines 1185 and 1188) are missing from
 // this extract.
1149 // Compute end values for all inductions.
1150 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1151 auto *VectorPH = cast<VPBasicBlock>(VectorRegion->getSinglePredecessor());
1152 VPBuilder VectorPHBuilder(VectorPH, VectorPH->begin());
 // When folding the tail, end values are based on the full trip count rather
 // than the vector trip count.
1154 VPValue *ResumeTC =
1155 FoldTail ? Plan.getTripCount() : &Plan.getVectorTripCount();
1156 for (auto &Phi : VectorRegion->getEntryBasicBlock()->phis()) {
1157 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(&Phi);
1158 if (!WideIV)
1159 continue;
1160 if (VPValue *EndValue =
1161 tryToComputeEndValueForInduction(WideIV, VectorPHBuilder, ResumeTC))
1162 EndValues[WideIV] = EndValue;
1163 }
1164
 // Replace and erase exiting-IV-value recipes in the middle block whose
 // induction has a pre-computed end value. Early-inc iteration is used because
 // recipes are erased while walking the block.
1165 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
1166 for (VPRecipeBase &R : make_early_inc_range(*MiddleVPBB)) {
1167 VPValue *Op;
1168 if (!match(&R, m_ExitingIVValue(m_VPValue(Op))))
1169 continue;
1170 auto *WideIV = cast<VPWidenInductionRecipe>(Op);
1171 if (VPValue *EndValue = EndValues.lookup(WideIV)) {
1172 R.getVPSingleValue()->replaceAllUsesWith(EndValue);
1173 R.eraseFromParent();
1174 }
1175 }
1176
1177 // Then, optimize exit block users.
1178 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
1179 for (VPRecipeBase &R : ExitVPBB->phis()) {
1180 auto *ExitIRI = cast<VPIRPhi>(&R);
1181
 // Each incoming operand is handled according to its predecessor: one
 // helper for the edge from the middle block (it takes EndValues), another
 // for all other predecessors.
1182 for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
1183 VPValue *Escape = nullptr;
1184 if (PredVPBB == MiddleVPBB)
1186 Plan, ExitIRI->getOperand(Idx), EndValues, PSE);
1187 else
1189 Plan, ExitIRI->getOperand(Idx), PSE);
 // Only rewrite the operand if an optimized value was produced.
1190 if (Escape)
1191 ExitIRI->setOperand(Idx, Escape);
1192 }
1193 }
1194 }
1195}
1196
1197/// Remove redundant ExpandSCEVRecipes in \p Plan's entry block by replacing
1198/// them with already existing recipes expanding the same SCEV expression.
///
/// NOTE(review): the function signature and the declaration of `SCEV2VPV`
/// (listing lines 1199-1200) as well as the range expression of the loop
/// (listing line 1203) are missing from this extract.
1201
1202 for (VPRecipeBase &R :
1204 auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
1205 if (!ExpR)
1206 continue;
1207
 // The first recipe seen for a given SCEV is recorded and kept.
1208 const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR);
1209 if (Inserted)
1210 continue;
1211
 // A later recipe expanding the same SCEV is redundant: forward its uses to
 // the recorded one, keeping the plan's trip count in sync if it was this
 // recipe, then erase it.
1212 ExpR->replaceAllUsesWith(V->second);
1213 if (ExpR == Plan.getTripCount())
1214 Plan.resetTripCount(V->second);
1215
1216 ExpR->eraseFromParent();
1217 }
1218}
1219
// Worklist-driven deletion: starting from V, erase its defining recipe if it
// is dead and then re-examine the recipe's operands, which may have become
// dead in turn.
//
// NOTE(review): the function signature (listing line 1220) and the declaration
// of `Seen` (listing line 1222) are missing from this extract; `V` is
// presumably the parameter.
1221 SmallVector<VPValue *> WorkList;
1223 WorkList.push_back(V);
1224
1225 while (!WorkList.empty()) {
1226 VPValue *Cur = WorkList.pop_back_val();
 // Visit each value at most once.
1227 if (!Seen.insert(Cur).second)
1228 continue;
 // Values without a defining recipe have nothing to delete.
1229 VPRecipeBase *R = Cur->getDefiningRecipe();
1230 if (!R)
1231 continue;
1232 if (!isDeadRecipe(*R))
1233 continue;
 // Queue the operands before erasing the recipe.
1234 append_range(WorkList, R->operands());
1235 R->eraseFromParent();
1236 }
1237}
1238
1239/// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
1240/// Returns an optional pair, where the first element indicates whether it is
1241/// an intrinsic ID.
///
/// Returns std::nullopt for recipe kinds not covered by any case below.
///
/// NOTE(review): the line with the function name and parameter (listing line
/// 1243) and the head of the first `.Case<...>` (listing lines 1246-1247) are
/// missing from this extract.
1242static std::optional<std::pair<bool, unsigned>>
1244 return TypeSwitch<const VPSingleDefRecipe *,
1245 std::optional<std::pair<bool, unsigned>>>(R)
 // Recipes that expose an opcode directly: report it as a non-intrinsic.
1248 [](auto *I) { return std::make_pair(false, I->getOpcode()); })
 // Widened intrinsic calls: report the vector intrinsic ID.
1249 .Case([](const VPWidenIntrinsicRecipe *I) {
1250 return std::make_pair(true, I->getVectorIntrinsicID());
1251 })
1252 .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe, VPScalarIVStepsRecipe>(
1253 [](auto *I) {
1254 // For recipes that do not directly map to LLVM IR instructions,
1255 // assign opcodes after the last VPInstruction opcode (which is also
1256 // after the last IR Instruction opcode), based on the VPRecipeID.
1257 return std::make_pair(false, VPInstruction::OpsEnd + 1 +
1258 I->getVPRecipeID());
1259 })
1260 .Default([](auto *) { return std::nullopt; });
1261}
1262
1263/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
1264/// non-nullptr VPValue for a handled opcode or intrinsic ID if corresponding \p
1265/// Operands are foldable live-ins.
///
/// Returns nullptr if \p R has no opcode/intrinsic ID, if any operand is not a
/// live-in with an underlying IR value, or if the folder produces no value.
///
/// NOTE(review): the first line of the signature (listing line 1266), the
/// declaration of `Ops` (listing line 1273) and several lines inside the
/// switch (listing lines 1299, 1301, 1304, 1317-1318: case labels and trailing
/// call arguments) are missing from this extract.
1267 ArrayRef<VPValue *> Operands,
1268 const DataLayout &DL, LLVMContext &Ctx) {
1269 auto OpcodeOrIID = getOpcodeOrIntrinsicID(&R);
1270 if (!OpcodeOrIID)
1271 return nullptr;
1272
 // Collect the underlying IR values of the operands; give up as soon as one
 // operand is not a live-in or has no underlying value.
1274 for (VPValue *Op : Operands) {
1275 if (!match(Op, m_LiveIn()))
1276 return nullptr;
1277 Value *V = Op->getUnderlyingValue();
1278 if (!V)
1279 return nullptr;
1280 Ops.push_back(V);
1281 }
1282
 // Dispatch on the opcode / intrinsic ID and let InstSimplifyFolder fold the
 // IR values. Returns nullptr when the operation is not handled.
1283 auto FoldToIRValue = [&]() -> Value * {
1284 InstSimplifyFolder Folder(DL);
1285 if (OpcodeOrIID->first) {
 // Intrinsics are only folded when the recipe has exactly two operands.
1286 if (R.getNumOperands() != 2)
1287 return nullptr;
1288 unsigned ID = OpcodeOrIID->second;
1289 return Folder.FoldBinaryIntrinsic(ID, Ops[0], Ops[1], R.getScalarType());
1290 }
1291 unsigned Opcode = OpcodeOrIID->second;
1292 if (Instruction::isBinaryOp(Opcode))
1293 return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode),
1294 Ops[0], Ops[1]);
1295 if (Instruction::isCast(Opcode))
1296 return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
1297 R.getVPSingleValue()->getScalarType());
1298 switch (Opcode) {
1300 return Folder.FoldSelect(Ops[0], Ops[1],
1302 case VPInstruction::Not:
1303 return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
1305 case Instruction::Select:
1306 return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
1307 case Instruction::ICmp:
1308 case Instruction::FCmp:
1309 return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
1310 Ops[1]);
1311 case Instruction::GetElementPtr: {
 // The source element type comes from the underlying IR GEP; the first
 // operand is the base pointer and the remaining ones are the indices.
1312 auto &RFlags = cast<VPRecipeWithIRFlags>(R);
1313 auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
1314 return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0],
1315 drop_begin(Ops), RFlags.getGEPNoWrapFlags());
1316 }
 // Folded as an i8-typed GEP of Ops[0] by Ops[1] (case labels missing from
 // this extract, see note above).
1319 return Folder.FoldGEP(IntegerType::getInt8Ty(Ctx), Ops[0], Ops[1],
1320 cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
1321 // An extract of a live-in is an extract of a broadcast, so return the
1322 // broadcasted element.
1323 case Instruction::ExtractElement:
1324 assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
1325 return Ops[0];
1326 }
1327 return nullptr;
1328 };
1329
 // Wrap a successfully folded IR value as a live-in of the plan.
1330 if (Value *V = FoldToIRValue())
1331 return R.getParent()->getPlan()->getOrAddLiveIn(V);
1332 return nullptr;
1333}
1334
1335/// Try to simplify logical and bitwise recipes in \p Def.
///
/// Returns true if one of the simplifications below was applied, false
/// otherwise. Folds that need a new recipe are only attempted when
/// \p CanCreateNewRecipe is set (with the exception of the
/// `x && (y && x)` fold, which creates one unconditionally).
///
/// NOTE(review): the first line of the signature (listing line 1336) and
/// several pattern lines (listing lines 1346-1347, 1397-1398, 1410, 1417,
/// 1452) are missing from this extract; `Builder` is presumably a parameter.
1337 bool CanCreateNewRecipe) {
1338 VPlan *Plan = Def->getParent()->getPlan();
1339
1340 // Simplify (X && Y) | (X && !Y) -> X.
1341 // TODO: Split up into simpler, modular combines: (X && Y) | (X && Z) into X
1342 // && (Y | Z) and (X | !X) into true. This requires queuing newly created
1343 // recipes to be visited during simplification.
1344 VPValue *X, *Y, *Z;
1345 if (match(Def,
 // Unlike the folds below, this one also erases Def.
1348 Def->replaceAllUsesWith(X);
1349 Def->eraseFromParent();
1350 return true;
1351 }
1352
1353 // x | AllOnes -> AllOnes
1354 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes()))) {
1355 Def->replaceAllUsesWith(Plan->getAllOnesValue(Def->getScalarType()));
1356 return true;
1357 }
1358
1359 // x | 0 -> x
1360 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_ZeroInt()))) {
1361 Def->replaceAllUsesWith(X);
1362 return true;
1363 }
1364
1365 // x | !x -> AllOnes
1366 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_Not(m_Deferred(X))))) {
1367 Def->replaceAllUsesWith(Plan->getAllOnesValue(Def->getScalarType()));
1368 return true;
1369 }
1370
1371 // x & 0 -> 0
1372 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt()))) {
1373 Def->replaceAllUsesWith(Plan->getZero(Def->getScalarType()));
1374 return true;
1375 }
1376
1377 // x & AllOnes -> x
1378 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_AllOnes()))) {
1379 Def->replaceAllUsesWith(X);
1380 return true;
1381 }
1382
1383 // x && false -> false
1384 if (match(Def, m_c_LogicalAnd(m_VPValue(X), m_False()))) {
1385 Def->replaceAllUsesWith(Plan->getFalse());
1386 return true;
1387 }
1388
1389 // x && true -> x
1390 if (match(Def, m_c_LogicalAnd(m_VPValue(X), m_True()))) {
1391 Def->replaceAllUsesWith(X);
1392 return true;
1393 }
1394
1395 // (x && y) | (x && z) -> x && (y | z)
1396 if (CanCreateNewRecipe &&
1399 // Simplify only if one of the operands has one use to avoid creating an
1400 // extra recipe.
1401 (!Def->getOperand(0)->hasMoreThanOneUniqueUser() ||
1402 !Def->getOperand(1)->hasMoreThanOneUniqueUser())) {
1403 Def->replaceAllUsesWith(
1404 Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));
1405 return true;
1406 }
1407
1408 // x && (x && y) -> x && y
 // The inner logical-and (operand 1) already is the simplified value.
1409 if (match(Def, m_LogicalAnd(m_VPValue(X),
1411 Def->replaceAllUsesWith(Def->getOperand(1));
1412 return true;
1413 }
1414
1415 // x && (y && x) -> x && y
1416 if (match(Def, m_LogicalAnd(m_VPValue(X),
1418 Def->replaceAllUsesWith(Builder.createLogicalAnd(X, Y));
1419 return true;
1420 }
1421
1422 // x && !x -> 0
1423 if (match(Def, m_LogicalAnd(m_VPValue(X), m_Not(m_Deferred(X))))) {
1424 Def->replaceAllUsesWith(Plan->getFalse());
1425 return true;
1426 }
1427
 // select c, x, x -> x
1428 if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X)))) {
1429 Def->replaceAllUsesWith(X);
1430 return true;
1431 }
1432
1433 // select c, false, true -> not c
1434 VPValue *C;
1435 if (CanCreateNewRecipe &&
1436 match(Def, m_Select(m_VPValue(C), m_False(), m_True()))) {
1437 Def->replaceAllUsesWith(Builder.createNot(C));
1438 return true;
1439 }
1440
1441 // select !c, x, y -> select c, y, x
 // Done in place by rewriting Def's operands; no new recipe is created.
1442 if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {
1443 Def->setOperand(0, C);
1444 Def->setOperand(1, Y);
1445 Def->setOperand(2, X);
1446 return true;
1447 }
1448
1449 // select x, (i1 y | z), y -> y | (x && z)
1450 if (CanCreateNewRecipe &&
1451 match(Def, m_Select(m_VPValue(X),
1453 m_Deferred(Y))) &&
1454 Y->getScalarType()->isIntegerTy(1)) {
1455 Def->replaceAllUsesWith(
1456 Builder.createOr(Y, Builder.createLogicalAnd(X, Z)));
1457 return true;
1458 }
1459
1460 return false;
1461}
1462
1463/// Try to simplify VPSingleDefRecipe \p Def.
///
/// Applies a sequence of pattern-based folds. Most folds redirect the users of
/// \p Def to a simpler value and return; the recipe itself is not erased here.
/// The folds after the `isUnrolled()` check only run on unrolled plans.
///
/// NOTE(review): the function signature (listing line 1464) and many pattern
/// lines inside the body (among others listing lines 1538, 1545, 1548, 1581,
/// 1608, 1613, 1638, 1640, 1652, 1663, 1665, 1673, 1689, 1697, 1705,
/// 1720-1721, 1731, 1745, 1758, 1763, 1788, 1816) are missing from this
/// extract, so several conditions below appear truncated.
1465 VPlan *Plan = Def->getParent()->getPlan();
1466
1467 // Simplification of live-in IR values for SingleDef recipes using
1468 // InstSimplifyFolder.
1469 const DataLayout &DL = Plan->getDataLayout();
1470 if (VPValue *V =
1471 tryToFoldLiveIns(*Def, Def->operands(), DL, Plan->getContext()))
1472 return Def->replaceAllUsesWith(V);
1473
1474 // Fold PredPHI LiveIn -> LiveIn.
1475 if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Def)) {
1476 VPValue *Op = PredPHI->getOperand(0);
1477 if (isa<VPIRValue>(Op))
1478 PredPHI->replaceAllUsesWith(Op);
1479 }
1480
1481 VPBuilder Builder(Def);
1482
1483 // Avoid replacing VPInstructions with underlying values with new
1484 // VPInstructions, as we would fail to create widen/replicate recipes from the
1485 // new VPInstructions without an underlying value, and miss out on some
1486 // transformations that only apply to widened/replicated recipes later, by
1487 // doing so.
1488 // TODO: We should also not replace non-VPInstructions like VPWidenRecipe with
1489 // VPInstructions without underlying values, as those will get skipped during
1490 // cost computation.
1491 bool CanCreateNewRecipe =
1492 !isa<VPInstruction>(Def) || !Def->getUnderlyingValue();
1493
 // trunc (zext/sext A): if the result type equals A's type both casts are
 // dropped; otherwise, for widened casts only, the pair is replaced by a
 // single extend or truncate of A to the result type.
1494 VPValue *A;
1495 if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
1496 Type *TruncTy = Def->getScalarType();
1497 Type *ATy = A->getScalarType();
1498 if (TruncTy == ATy) {
1499 Def->replaceAllUsesWith(A);
1500 } else {
1501 // Don't replace a non-widened cast recipe with a widened cast.
1502 if (!isa<VPWidenCastRecipe>(Def))
1503 return;
1504 if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1505
 // Keep the signedness of the original extend.
1506 unsigned ExtOpcode = match(Def->getOperand(0), m_SExt(m_VPValue()))
1507 ? Instruction::SExt
1508 : Instruction::ZExt;
1509 auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A,
1510 TruncTy);
1511 if (auto *UnderlyingExt = Def->getOperand(0)->getUnderlyingValue()) {
1512 // UnderlyingExt has distinct return type, used to retain legacy cost.
1513 Ext->setUnderlyingValue(UnderlyingExt);
1514 }
1515 Def->replaceAllUsesWith(Ext);
1516 } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
1517 auto *Trunc = Builder.createWidenCast(Instruction::Trunc, A, TruncTy);
1518 Def->replaceAllUsesWith(Trunc);
1519 }
1520 }
1521 }
1522
1523 if (simplifyLogicalRecipe(Def, Builder, CanCreateNewRecipe))
1524 return;
1525
1526 VPValue *X, *Y, *C;
 // A + 0 -> A
1527 if (match(Def, m_c_Add(m_VPValue(A), m_ZeroInt())))
1528 return Def->replaceAllUsesWith(A);
1529
 // A * 1 -> A
1530 if (match(Def, m_c_Mul(m_VPValue(A), m_One())))
1531 return Def->replaceAllUsesWith(A);
1532
 // A * 0 -> 0
1533 if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt())))
1534 return Def->replaceAllUsesWith(Plan->getZero(Def->getScalarType()));
1535
 // A * AllOnes (-1) -> 0 - A
1536 if (CanCreateNewRecipe && match(Def, m_c_Mul(m_VPValue(A), m_AllOnes()))) {
1537 // Preserve nsw from the Mul on the new Sub.
1539 false, cast<VPRecipeWithIRFlags>(Def)->hasNoSignedWrap()};
1540 return Def->replaceAllUsesWith(Builder.createSub(
1541 Plan->getZero(A->getScalarType()), A, Def->getDebugLoc(), "", NW));
1542 }
1543
1544 if (CanCreateNewRecipe &&
1546 // Preserve nsw from the Add and the Sub, if it's present on both, on the
1547 // new Sub.
1549 false,
1550 cast<VPRecipeWithIRFlags>(Def)->hasNoSignedWrap() &&
1551 cast<VPRecipeWithIRFlags>(Def->getOperand(Def->getOperand(0) == X))
1552 ->hasNoSignedWrap()};
1553 return Def->replaceAllUsesWith(
1554 Builder.createSub(X, Y, Def->getDebugLoc(), "", NW));
1555 }
1556
 // A * 2^k -> A << k
1557 const APInt *APC;
1558 if (CanCreateNewRecipe && match(Def, m_c_Mul(m_VPValue(A), m_APInt(APC))) &&
1559 APC->isPowerOf2())
1560 return Def->replaceAllUsesWith(Builder.createNaryOp(
1561 Instruction::Shl,
1562 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())},
1563 *cast<VPRecipeWithIRFlags>(Def), Def->getDebugLoc()));
1564
 // A udiv 2^k -> A lshr k
1565 if (CanCreateNewRecipe && match(Def, m_UDiv(m_VPValue(A), m_APInt(APC))) &&
1566 APC->isPowerOf2())
1567 return Def->replaceAllUsesWith(Builder.createNaryOp(
1568 Instruction::LShr,
1569 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())},
1570 *cast<VPRecipeWithIRFlags>(Def), Def->getDebugLoc()));
1571
1572 if (match(Def, m_Not(m_VPValue(A)))) {
 // not (not A) -> A
1573 if (match(A, m_Not(m_VPValue(A))))
1574 return Def->replaceAllUsesWith(A);
1575
1576 // Try to fold Not into compares by adjusting the predicate in-place.
1577 CmpPredicate Pred;
1578 if (match(A, m_Cmp(Pred, m_VPValue(), m_VPValue()))) {
1579 auto *Cmp = cast<VPRecipeWithIRFlags>(A);
1580 if (all_of(Cmp->users(),
1582 m_Not(m_Specific(Cmp)),
1583 m_Select(m_Specific(Cmp), m_VPValue(), m_VPValue()))))) {
1584 Cmp->setPredicate(CmpInst::getInversePredicate(Pred));
 // Iterate over a copy of the user list, since users are rewritten in
 // the loop.
1585 for (VPUser *U : to_vector(Cmp->users())) {
1586 auto *R = cast<VPSingleDefRecipe>(U);
1587 if (match(R, m_Select(m_Specific(Cmp), m_VPValue(X), m_VPValue(Y)))) {
1588 // select (cmp pred), x, y -> select (cmp inv_pred), y, x
1589 R->setOperand(1, Y);
1590 R->setOperand(2, X);
1591 } else {
1592 // not (cmp pred) -> cmp inv_pred
1593 assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
1594 R->replaceAllUsesWith(Cmp);
1595 }
1596 }
1597 // If Cmp doesn't have a debug location, use the one from the negation,
1598 // to preserve the location.
1599 if (!Cmp->getDebugLoc() && Def->getDebugLoc())
1600 Cmp->setDebugLoc(Def->getDebugLoc());
1601 }
1602 }
1603 }
1604
1605 // Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... ->
1606 // any-of (fcmp uno %A, %B), ...
1607 if (match(Def, m_AnyOf())) {
 // Operands that cannot be paired are kept as they are; pairable compares
 // are combined two at a time, and a leftover unpaired compare is re-added.
1609 VPRecipeBase *UnpairedCmp = nullptr;
1610 for (VPValue *Op : Def->operands()) {
1611 VPValue *X;
1612 if (Op->getNumUsers() > 1 ||
1614 m_Deferred(X)))) {
1615 NewOps.push_back(Op);
1616 } else if (!UnpairedCmp) {
1617 UnpairedCmp = Op->getDefiningRecipe();
1618 } else {
1619 NewOps.push_back(Builder.createFCmp(CmpInst::FCMP_UNO,
1620 UnpairedCmp->getOperand(0), X));
1621 UnpairedCmp = nullptr;
1622 }
1623 }
1624
1625 if (UnpairedCmp)
1626 NewOps.push_back(UnpairedCmp->getVPSingleValue());
1627
 // Only create a new any-of if pairing reduced the operand count.
1628 if (NewOps.size() < Def->getNumOperands()) {
1629 VPValue *NewAnyOf = Builder.createNaryOp(VPInstruction::AnyOf, NewOps);
1630 return Def->replaceAllUsesWith(NewAnyOf);
1631 }
1632 }
1633
1634 // Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y
1635 // This is useful for fmax/fmin without fast-math flags, where we need to
1636 // check if any operand is NaN.
1637 if (CanCreateNewRecipe &&
1639 m_Deferred(X)),
1641 m_Deferred(Y))))) {
1642 VPValue *NewCmp = Builder.createFCmp(CmpInst::FCMP_UNO, X, Y);
1643 return Def->replaceAllUsesWith(NewCmp);
1644 }
1645
1646 // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
1647 if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
1648 match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&
1649 Def->getOperand(1)->getScalarType() == Def->getScalarType())
1650 return Def->replaceAllUsesWith(Def->getOperand(1));
1651
1653 m_One()))) {
 // Replace Def by X, truncated to Def's scalar type if the types differ.
1654 Type *WideStepTy = Def->getScalarType();
1655 if (X->getScalarType() != WideStepTy)
1656 X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy);
1657 Def->replaceAllUsesWith(X);
1658 return;
1659 }
1660
1661 // For i1 vp.merges produced by AnyOf reductions:
1662 // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
1664 m_VPValue(X), m_VPValue())) &&
1666 Def->getScalarType()->isIntegerTy(1)) {
1667 Def->setOperand(1, Def->getOperand(0));
1668 Def->setOperand(0, Y);
1669 return;
1670 }
1671
1672 // Simplify MaskedCond with no block mask to its single operand.
1674 !cast<VPInstruction>(Def)->isMasked())
1675 return Def->replaceAllUsesWith(Def->getOperand(0));
1676
1677 // Look through ExtractLastLane.
1678 if (match(Def, m_ExtractLastLane(m_VPValue(A)))) {
 // The last lane of a BuildVector is its last operand.
1679 if (match(A, m_BuildVector())) {
1680 auto *BuildVector = cast<VPInstruction>(A);
1681 Def->replaceAllUsesWith(
1682 BuildVector->getOperand(BuildVector->getNumOperands() - 1));
1683 return;
1684 }
1685
 // Any lane of a broadcast is the broadcast scalar.
1686 if (match(A, m_Broadcast(m_VPValue(X))))
1687 return Def->replaceAllUsesWith(X);
1688
1690 return Def->replaceAllUsesWith(A);
1691
 // With only a scalar VF the extract is replaced by its operand.
1692 if (Plan->hasScalarVFOnly())
1693 return Def->replaceAllUsesWith(A);
1694 }
1695
1696 // Look through ExtractPenultimateElement (BuildVector ....).
1698 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1699 Def->replaceAllUsesWith(
1700 BuildVector->getOperand(BuildVector->getNumOperands() - 2));
1701 return;
1702 }
1703
1704 uint64_t Idx;
1706 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1707 Def->replaceAllUsesWith(BuildVector->getOperand(Idx));
1708 return;
1709 }
1710
 // A BuildVector whose operands are all equal is a broadcast of that operand.
1711 if (match(Def, m_BuildVector()) && all_equal(Def->operands())) {
1712 Def->replaceAllUsesWith(
1713 Builder.createNaryOp(VPInstruction::Broadcast, Def->getOperand(0)));
1714 return;
1715 }
1716
1717 // Look through broadcast of single-scalar when used as select conditions; in
1718 // that case the scalar condition can be used directly.
1719 if (match(Def,
1722 "broadcast operand must be single-scalar");
1723 Def->setOperand(0, C);
1724 return;
1725 }
1726
 // Users that only use scalars of a broadcast can use the broadcast operand
 // directly.
1727 if (match(Def, m_Broadcast(m_VPValue(X))))
1728 return Def->replaceUsesWithIf(
1729 X, [Def](const VPUser &U, unsigned) { return U.usesScalars(Def); });
1730
 // A recipe with a single operand is replaced by that operand; a first-order
 // recurrence phi whose incoming values are all equal is replaced by its
 // first operand. (The guarding condition is missing from this extract.)
1732 if (Def->getNumOperands() == 1) {
1733 Def->replaceAllUsesWith(Def->getOperand(0));
1734 return;
1735 }
1736 if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
1737 if (all_equal(Phi->incoming_values()))
1738 Phi->replaceAllUsesWith(Phi->getOperand(0));
1739 }
1740 return;
1741 }
1742
1743 VPIRValue *IRV;
1744 if (Def->getNumOperands() == 1 &&
1746 return Def->replaceAllUsesWith(IRV);
1747
1748 // Some simplifications can only be applied after unrolling. Perform them
1749 // below.
1750 if (!Plan->isUnrolled())
1751 return;
1752
1753 // After unrolling, extract-lane may be used to extract values from multiple
1754 // scalar sources. Only simplify when extracting from a single scalar source.
1755 VPValue *LaneToExtract;
1756 if (match(Def, m_ExtractLane(m_VPValue(LaneToExtract), m_VPValue(A)))) {
1757 // Simplify extract-lane(%lane_num, %scalar_val) -> %scalar_val.
1759 return Def->replaceAllUsesWith(A);
1760
1761 // Replace extract-lane(0, canonical-WIDEN-INDUCTION) with the region's
1762 // scalar canonical IV.
1764 if (match(LaneToExtract, m_ZeroInt()) &&
1765 match(A, m_CanonicalWidenIV(WidenIV)))
1766 return Def->replaceAllUsesWith(WidenIV->getRegion()->getCanonicalIV());
1767
1768 // Simplify extract-lane with single source to extract-element.
1769 Def->replaceAllUsesWith(Builder.createNaryOp(
1770 Instruction::ExtractElement, {A, LaneToExtract}, Def->getDebugLoc()));
1771 return;
1772 }
1773
1774 // Look for cycles where Def is of the form:
1775 // X = phi(0, IVInc) ; used only by IVInc, or by IVInc and Inc = X + Y
1776 // IVInc = X + Step ; used by X and Def
1777 // Def = IVInc + Y
1778 // Fold the increment Y into the phi's start value, replace Def with IVInc,
1779 // and if Inc exists, replace it with X.
1780 if (match(Def, m_Add(m_Add(m_VPValue(X), m_VPValue()), m_VPValue(Y))) &&
1781 isa<VPIRValue>(Y) &&
1782 match(X, m_VPPhi(m_ZeroInt(), m_Specific(Def->getOperand(0))))) {
1783 auto *Phi = cast<VPPhi>(X);
1784 auto *IVInc = Def->getOperand(0);
1785 if (IVInc->getNumUsers() == 2) {
1786 // If Phi has a second user (besides IVInc's defining recipe), it must
1787 // be Inc = Phi + Y for the fold to apply.
1789 findUserOf(Phi, m_Add(m_Specific(Phi), m_Specific(Y))));
1790 if (Phi->getNumUsers() == 1 || (Phi->getNumUsers() == 2 && Inc)) {
1791 Def->replaceAllUsesWith(IVInc);
1792 if (Inc)
1793 Inc->replaceAllUsesWith(Phi);
1794 Phi->setOperand(0, Y);
1795 return;
1796 }
1797 }
1798 }
1799
1800 // Simplify unrolled VectorPointer without offset, or with zero offset, to
1801 // just the pointer operand.
1802 if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(Def))
1803 if (!VPR->getVFxPart() || match(VPR->getVFxPart(), m_ZeroInt()))
1804 return VPR->replaceAllUsesWith(VPR->getOperand(0));
1805
1806 // VPScalarIVSteps after unrolling can be replaced by their start value, if
1807 // the start index is zero and only the first lane 0 is demanded.
1808 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
1809 if (!Steps->getStartIndex() && vputils::onlyFirstLaneUsed(Steps)) {
1810 Steps->replaceAllUsesWith(Steps->getOperand(0));
1811 return;
1812 }
1813 }
1814 // Simplify redundant ReductionStartVector recipes after unrolling.
1815 VPValue *StartV;
1817 m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
 // Only uses by in-loop reduction phis are redirected to the start value.
1818 Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
1819 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
1820 return PhiR && PhiR->isInLoop();
1821 });
1822 return;
1823 }
1824
 // With a concrete unroll factor of 1, the last part is the operand itself.
1825 if (Plan->getConcreteUF() == 1 && match(Def, m_ExtractLastPart(m_VPValue(A))))
1826 return Def->replaceAllUsesWith(A);
1827}
1828
1838
1839/// Reassociate (headermask && x) && y -> headermask && (x && y) to allow the
1840/// header mask to be simplified further when tail folding, e.g. in
1841/// optimizeEVLMasks.
///
/// Does nothing if \p Plan has no header mask.
1842static void reassociateHeaderMask(VPlan &Plan) {
1843 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
1844 if (!HeaderMask)
1845 return;
1846
 // Seed the worklist from the users of the header mask that have the form
 // (headermask && x).
 // NOTE(review): the statement controlled by the `if` below (listing line
 // 1850) is missing from this extract; presumably it adds U to Worklist. As
 // printed, the `if` would wrongly appear to govern the `while` loop.
1847 SmallVector<VPUser *> Worklist;
1848 for (VPUser *U : HeaderMask->users())
1849 if (match(U, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue())))
1851
1852 while (!Worklist.empty()) {
1853 auto *R = dyn_cast<VPSingleDefRecipe>(Worklist.pop_back_val());
1854 VPValue *X, *Y;
 // Only recipes of the form (headermask && X) && Y are rewritten.
1855 if (!R || !match(R, m_LogicalAnd(
1856 m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(X)),
1857 m_VPValue(Y))))
1858 continue;
 // Queue R's users before its uses are replaced, so that longer chains are
 // visited as well.
1859 append_range(Worklist, R->users());
1860 VPBuilder Builder(R);
1861 R->replaceAllUsesWith(
1862 Builder.createLogicalAnd(HeaderMask, Builder.createLogicalAnd(X, Y)));
1863 }
1864}
1865
/// Translate a masked integer division/remainder intrinsic ID into the plain
/// binary opcode it corresponds to: masked_udiv -> UDiv, masked_sdiv -> SDiv,
/// masked_urem -> URem, masked_srem -> SRem. Any other ID yields an empty
/// optional.
// NOTE(review): listing line 1867 (the declarator with the function name and
// its `ID` parameter) is missing from this extract; the call site in this file
// spells the name getUnmaskedDivRemOpcode and passes getVectorIntrinsicID().
1866static std::optional<Instruction::BinaryOps>
1868 switch (ID) {
1869 case Intrinsic::masked_udiv:
1870 return Instruction::UDiv;
1871 case Intrinsic::masked_sdiv:
1872 return Instruction::SDiv;
1873 case Intrinsic::masked_urem:
1874 return Instruction::URem;
1875 case Intrinsic::masked_srem:
1876 return Instruction::SRem;
1877 default:
// Not one of the four masked div/rem intrinsics.
1878 return {};
1879 }
1880}
1881
// NOTE(review): this extract omits listing lines 1882 (presumably the function
// signature), 1886 (the head of the loop over blocks), 1889-1890 (the
// condition guarding the first `continue`) and 1943-1945 (the opcode test
// inside IntroducesBCastOf). Restore them from the upstream file before
// relying on this text.
// Visible behavior: unless the plan only has a scalar VF, walk the recipes of
// every block in reverse order and replace recipes that only need to produce
// one scalar value by single-scalar VPReplicateRecipes (or, for masked
// div/rem intrinsics of which only the first lane is used, by the plain
// div/rem on a safe divisor).
1883 if (Plan.hasScalarVFOnly())
1884 return;
1885
1887 vp_depth_first_deep(Plan.getEntry()))) {
1888 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
1891 continue;
1892 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
// Replicate recipes that are already single-scalar, or that are predicated,
// are left alone.
1893 if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
1894 continue;
1895
1896 auto *RepOrWidenR = cast<VPRecipeWithIRFlags>(&R);
// A replicated store whose operand 1 is a single scalar is re-created as a
// single-scalar replicate. Its operand 0 becomes the last lane of the
// original operand 0 (taken from the last part when operand 1 is uniform
// across VFs and UFs).
1897 if (RepR && RepR->getOpcode() == Instruction::Store &&
1898 vputils::isSingleScalar(RepR->getOperand(1))) {
1899 auto *Clone = new VPReplicateRecipe(
1900 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1901 true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Flags*/,
1902 *RepR /*Metadata*/, RepR->getDebugLoc());
1903 Clone->insertBefore(RepOrWidenR);
1904 VPBuilder Builder(Clone);
1905 VPValue *ExtractOp = Clone->getOperand(0);
1906 if (vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1)))
1907 ExtractOp =
1908 Builder.createNaryOp(VPInstruction::ExtractLastPart, ExtractOp);
1909 ExtractOp =
1910 Builder.createNaryOp(VPInstruction::ExtractLastLane, ExtractOp);
1911 Clone->setOperand(0, ExtractOp);
1912 RepR->eraseFromParent();
1913 continue;
1914 }
1915
1916 // Narrow llvm.masked.{u,s}{div,rem} intrinsics with a safe divisor.
1917 if (auto *IntrR = dyn_cast<VPWidenIntrinsicRecipe>(RepOrWidenR)) {
1918 if (!vputils::onlyFirstLaneUsed(IntrR))
1919 continue;
1920 auto Opc = getUnmaskedDivRemOpcode(IntrR->getVectorIntrinsicID());
1921 if (!Opc)
1922 continue;
1923 VPBuilder Builder(IntrR);
// The select is built from operand 2, operand 1 and the constant 1, in that
// order (presumably condition, value-if-true, value-if-false), so the
// divisor falls back to 1 where operand 2 does not hold.
1924 VPValue *SafeDivisor = Builder.createSelect(
1925 IntrR->getOperand(2), IntrR->getOperand(1),
1926 Plan.getConstantInt(IntrR->getScalarType(), 1));
1927 VPValue *Clone = Builder.createNaryOp(
1928 *Opc, {IntrR->getOperand(0), SafeDivisor},
1929 VPIRFlags::getDefaultFlags(*Opc), IntrR->getDebugLoc());
1930 IntrR->replaceAllUsesWith(Clone);
1931 IntrR->eraseFromParent();
1932 continue;
1933 }
1934
1935 // Skip recipes that aren't single scalars.
1936 if (!vputils::isSingleScalar(RepOrWidenR))
1937 continue;
1938
1939 // Predicate to check if a user of Op introduces extra broadcasts.
1940 auto IntroducesBCastOf = [](const VPValue *Op) {
1941 return [Op](const VPUser *U) {
1942 if (auto *VPI = dyn_cast<VPInstruction>(U)) {
1946 VPI->getOpcode()))
1947 return false;
1948 }
1949 return !U->usesScalars(Op);
1950 };
1951 };
1952
// Do not narrow when some user of the recipe would need a broadcast of it
// and none of its operands satisfies the lambda below (another user of the
// operand needing a broadcast is disqualifying; a non-constant live-in or a
// single-scalar replicate operand qualifies).
1953 if (any_of(RepOrWidenR->users(), IntroducesBCastOf(RepOrWidenR)) &&
1954 none_of(RepOrWidenR->operands(), [&](VPValue *Op) {
1955 if (any_of(
1956 make_filter_range(Op->users(), not_equal_to(RepOrWidenR)),
1957 IntroducesBCastOf(Op)))
1958 return false;
1959 // Non-constant live-ins require broadcasts, while constants do not
1960 // need explicit broadcasts.
1961 auto *IRV = dyn_cast<VPIRValue>(Op);
1962 bool LiveInNeedsBroadcast = IRV && !isa<Constant>(IRV->getValue());
1963 auto *OpR = dyn_cast<VPReplicateRecipe>(Op);
1964 return LiveInNeedsBroadcast || (OpR && OpR->isSingleScalar());
1965 }))
1966 continue;
1967
// Replace the recipe by a single-scalar replicate with the same operands and
// flags; the original is erased only once it is dead.
1968 auto *Clone = new VPReplicateRecipe(
1969 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1970 true /*IsSingleScalar*/, nullptr, *RepOrWidenR);
1971 Clone->insertBefore(RepOrWidenR);
1972 RepOrWidenR->replaceAllUsesWith(Clone);
1973 if (isDeadRecipe(*RepOrWidenR))
1974 RepOrWidenR->eraseFromParent();
1975 }
1976 }
1977}
1978
1979/// Try to see if all of \p Blend's masks share a common value logically and'ed
1980/// and remove it from the masks.
// NOTE(review): listing line 1981 (the function signature) is missing from
// this extract; the caller in simplifyBlends invokes it as
// removeCommonBlendMask(Blend) with a VPBlendRecipe pointer.
// Normalized blends are left untouched.
1982 if (Blend->isNormalized())
1983 return;
// The first operand of mask 0's logical-and is the candidate common mask.
1984 VPValue *CommonEdgeMask;
1985 if (!match(Blend->getMask(0),
1986 m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue())))
1987 return;
// Give up unless every mask is a logical-and whose first operand is that
// same value.
1988 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1989 if (!match(Blend->getMask(I),
1990 m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue())))
1991 return;
// Strip the common part: each mask is replaced by operand 1 of its defining
// recipe.
1992 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1993 Blend->setMask(I, Blend->getMask(I)->getDefiningRecipe()->getOperand(1));
1994}
1995
1996/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
1997/// to make sure the masks are simplified.
// NOTE(review): this extract omits listing lines 1999-2000 (the head of the
// loop over blocks that defines VPBB) and 2058 (a statement between erasing
// the old blend and the Not-mask simplification; DeadMask, declared on line
// 2055, has no visible use, so that line presumably consumes it).
1998static void simplifyBlends(VPlan &Plan) {
2001 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2002 auto *Blend = dyn_cast<VPBlendRecipe>(&R);
2003 if (!Blend)
2004 continue;
2005
2006 removeCommonBlendMask(Blend);
2007
2008 // Try to remove redundant blend recipes.
// Collect the distinct incoming values whose mask is not known-false. For a
// normalized blend, incoming value 0 is always counted.
2009 SmallPtrSet<VPValue *, 4> UniqueValues;
2010 if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
2011 UniqueValues.insert(Blend->getIncomingValue(0));
2012 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
2013 if (!match(Blend->getMask(I), m_False()))
2014 UniqueValues.insert(Blend->getIncomingValue(I));
2015
// Only one value can ever be selected: the blend is redundant.
2016 if (UniqueValues.size() == 1) {
2017 Blend->replaceAllUsesWith(*UniqueValues.begin());
2018 Blend->eraseFromParent();
2019 continue;
2020 }
2021
2022 if (Blend->isNormalized())
2023 continue;
2024
2025 // Normalize the blend so its first incoming value is used as the initial
2026 // value with the others blended into it.
2027
2028 unsigned StartIndex = 0;
2029 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
2030 // If a value's mask is used only by the blend then it can be dead-coded.
2031 // TODO: Find the most expensive mask that can be deadcoded, or a mask
2032 // that's used by multiple blends where it can be removed from them all.
2033 VPValue *Mask = Blend->getMask(I);
2034 if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
2035 StartIndex = I;
2036 break;
2037 }
2038 }
2039
// New operand list: the start value first (without its mask), then every
// other incoming value followed by its mask.
2040 SmallVector<VPValue *, 4> OperandsWithMask;
2041 OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
2042
2043 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
2044 if (I == StartIndex)
2045 continue;
2046 OperandsWithMask.push_back(Blend->getIncomingValue(I));
2047 OperandsWithMask.push_back(Blend->getMask(I));
2048 }
2049
2050 auto *NewBlend =
2051 new VPBlendRecipe(cast_or_null<PHINode>(Blend->getUnderlyingValue()),
2052 OperandsWithMask, *Blend, Blend->getDebugLoc());
2053 NewBlend->insertBefore(&R);
2054
2055 VPValue *DeadMask = Blend->getMask(StartIndex);
2056 Blend->replaceAllUsesWith(NewBlend);
2057 Blend->eraseFromParent();
2059
2060 /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
2061 VPValue *NewMask;
2062 if (NewBlend->getNumOperands() == 3 &&
2063 match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
2064 VPValue *Inc0 = NewBlend->getOperand(0);
2065 VPValue *Inc1 = NewBlend->getOperand(1);
2066 VPValue *OldMask = NewBlend->getOperand(2);
2067 NewBlend->setOperand(0, Inc1);
2068 NewBlend->setOperand(1, Inc0);
2069 NewBlend->setOperand(2, NewMask);
// The Not recipe is erased once the blend was its last user.
2070 if (OldMask->getNumUsers() == 0)
2071 cast<VPInstruction>(OldMask)->eraseFromParent();
2072 }
2073 }
2074 }
2075}
2076
2077/// Optimize the width of vector induction variables in \p Plan based on a known
2078/// constant Trip Count, \p BestVF and \p BestUF.
/// Returns true if any induction was narrowed.
// NOTE(review): this extract omits listing lines 2079 (start of the
// signature), 2094-2095 (the initializer of AlignedTC), 2112 (the declaration
// of WideIV) and 2125 (the second operand of the ICmp pattern). Restore them
// from the upstream file before relying on this text.
2080 ElementCount BestVF,
2081 unsigned BestUF) {
2082 // Only proceed if we have not completely removed the vector region.
2083 if (!Plan.getVectorLoopRegion())
2084 return false;
2085
// Requires a fixed VF and a trip count that matches a constant APInt.
2086 const APInt *TC;
2087 if (!BestVF.isFixed() || !match(Plan.getTripCount(), m_APInt(TC)))
2088 return false;
2089
2090 // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
2091 // and UF. Returns at least 8.
2092 auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
2093 APInt AlignedTC =
2096 APInt MaxVal = AlignedTC - 1;
2097 return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
2098 };
2099 unsigned NewBitWidth =
2100 ComputeBitWidth(*TC, BestVF.getKnownMinValue() * BestUF);
2101
2102 LLVMContext &Ctx = Plan.getContext();
2103 auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
2104
2105 bool MadeChange = false;
2106
2107 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
2108 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
2109 // Currently only handle canonical IVs as it is trivial to replace the start
2110 // and stop values, and we currently only perform the optimization when the
2111 // IV has a single use.
2113 if (!match(&Phi, m_CanonicalWidenIV(WideIV)))
2114 continue;
// Skip IVs with several distinct users, and IVs that already have the
// target type.
2115 if (WideIV->hasMoreThanOneUniqueUser() ||
2116 NewIVTy == WideIV->getScalarType())
2117 continue;
2118
2119 // Currently only handle cases where the single user is a header-mask
2120 // comparison with the backedge-taken-count.
2121 VPUser *SingleUser = WideIV->getSingleUser();
2122 if (!SingleUser ||
2123 !match(SingleUser,
2124 m_ICmp(m_Specific(WideIV),
2126 continue;
2127
2128 // Update IV operands and comparison bound to use new narrower type.
2129 assert(!WideIV->getTruncInst() &&
2130 "canonical IV is not expected to have a truncation");
// The narrow IV starts at zero and steps by one in NewIVTy; it is inserted
// before the wide IV it stands in for.
2131 auto *NewWideIV = new VPWidenIntOrFpInductionRecipe(
2132 WideIV->getPHINode(), Plan.getZero(NewIVTy),
2133 Plan.getConstantInt(NewIVTy, 1), WideIV->getVFValue(),
2134 WideIV->getInductionDescriptor(), *WideIV, WideIV->getDebugLoc());
2135 NewWideIV->insertBefore(WideIV);
2136
// The backedge-taken count is truncated to the narrow type in the vector
// preheader.
2137 auto *NewBTC = new VPWidenCastRecipe(
2138 Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy,
2139 nullptr, VPIRFlags::getDefaultFlags(Instruction::Trunc));
2140 Plan.getVectorPreheader()->appendRecipe(NewBTC);
// Re-create the compare on the narrow operands, keeping its predicate.
2141 auto *Cmp = cast<VPInstruction>(WideIV->getSingleUser());
2142 Cmp->replaceAllUsesWith(
2143 VPBuilder(Cmp).createICmp(Cmp->getPredicate(), NewWideIV, NewBTC));
2144
2145 MadeChange = true;
2146 }
2147
2148 return MadeChange;
2149}
2150
2151/// Return true if \p Cond is known to be true for given \p BestVF and \p
2152/// BestUF.
// NOTE(review): this extract omits listing lines 2153 (start of the
// signature), 2155-2156 (the rest of the signature and the condition guarding
// the recursive any_of return), 2163-2164 (the start of the match that is
// completed on lines 2165-2166) and 2174 (the initializer of
// VectorTripCount). Restore them from the upstream file.
2154 ElementCount BestVF, unsigned BestUF,
2157 return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,
2158 &PSE](VPValue *C) {
2159 return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, PSE);
2160 });
2161
2162 auto *CanIV = Plan.getVectorLoopRegion()->getCanonicalIV();
2165 m_c_Add(m_Specific(CanIV), m_Specific(&Plan.getVFxUF())),
2166 m_Specific(&Plan.getVectorTripCount()))))
2167 return false;
2168
2169 // The compare checks CanIV + VFxUF == vector trip count. The vector trip
2170 // count is not conveniently available as SCEV so far, so we compare directly
2171 // against the original trip count. This is stricter than necessary, as we
2172 // will only return true if the trip count == vector trip count.
2173 const SCEV *VectorTripCount =
2175 if (isa<SCEVCouldNotCompute>(VectorTripCount))
2176 VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), PSE);
2177 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2178 "Trip count SCEV must be computable");
2179 ScalarEvolution &SE = *PSE.getSE();
// The condition holds when the count is provably equal to BestVF * BestUF
// elements.
2180 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
2181 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
2182 return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
2183}
2184
2185/// Try to replace multiple active lane masks used for control flow with
2186/// a single, wide active lane mask instruction followed by multiple
2187/// extract subvector intrinsics. This applies to the active lane mask
2188/// instructions both in the loop and in the preheader.
2189/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
2190/// new extracts from the first active lane mask, which has its last
2191/// operand (multiplier) set to UF.
/// Returns true if the plan was changed.
// NOTE(review): this extract omits listing lines 2192 (start of the
// signature), 2202 (start of the terminator match), 2213 (declaration of
// Ops), 2224 (declaration of Phis), 2226 (definition of Phi), 2231 (the
// pattern that binds Index), 2236 and 2243 (the patterns recognizing a
// per-part index). Restore them from the upstream file.
2193 unsigned UF) {
// Only applies when the feature is enabled, VF is a vector VF and the loop
// is unrolled.
2194 if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
2195 return false;
2196
2197 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2198 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
2199 auto *Term = &ExitingVPBB->back();
2200
2201 using namespace llvm::VPlanPatternMatch;
2203 m_VPValue(), m_VPValue(), m_VPValue())))))
2204 return false;
2205
2206 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
2207 LLVMContext &Ctx = Plan.getContext();
2208
// Helper: create one vector_extract of ALM per unroll part, at element
// offset VF.getKnownMinValue() * Part, and store it in Extracts[Part].
2209 auto ExtractFromALM = [&](VPInstruction *ALM,
2210 SmallVectorImpl<VPValue *> &Extracts) {
2211 DebugLoc DL = ALM->getDebugLoc();
2212 for (unsigned Part = 0; Part < UF; ++Part) {
2214 Ops.append({ALM, Plan.getConstantInt(64, VF.getKnownMinValue() * Part)});
2215 auto *Ext =
2216 new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
2217 IntegerType::getInt1Ty(Ctx), {}, {}, DL);
2218 Extracts[Part] = Ext;
2219 Ext->insertAfter(ALM);
2220 }
2221 };
2222
2223 // Create a list of each active lane mask phi, ordered by unroll part.
2225 for (VPRecipeBase &R : Header->phis()) {
2227 if (!Phi)
2228 continue;
2229 VPValue *Index = nullptr;
2230 match(Phi->getBackedgeValue(),
2232 assert(Index && "Expected index from ActiveLaneMask instruction");
2233
2234 uint64_t Part;
2235 if (match(Index,
2237 m_VPValue(), m_Mul(m_VPValue(), m_ConstantInt(Part)))))
2238 Phis[Part] = Phi;
2239 else {
2240 // Anything other than a CanonicalIVIncrementForPart is part 0
2241 assert(!match(
2242 Index,
2244 Phis[0] = Phi;
2245 }
2246 }
2247
2248 assert(all_of(Phis, not_equal_to(nullptr)) &&
2249 "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
2250
// The part-0 phi's incoming masks become the two wide masks.
2251 auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
2252 auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
2253
2254 assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
2255 LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
2256 "Expected incoming values of Phi to be ActiveLaneMasks");
2257
2258 // When using wide lane masks, the return type of the get.active.lane.mask
2259 // intrinsic is VF x UF (last operand).
2260 VPValue *ALMMultiplier = Plan.getConstantInt(64, UF);
2261 EntryALM->setOperand(2, ALMMultiplier);
2262 LoopALM->setOperand(2, ALMMultiplier);
2263
2264 // Create UF x extract vectors and insert into preheader.
2265 SmallVector<VPValue *> EntryExtracts(UF);
2266 ExtractFromALM(EntryALM, EntryExtracts);
2267
2268 // Create UF x extract vectors and insert before the loop compare & branch,
2269 // updating the compare to use the first extract.
2270 SmallVector<VPValue *> LoopExtracts(UF);
2271 ExtractFromALM(LoopALM, LoopExtracts);
2272 VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
2273 Not->setOperand(0, LoopExtracts[0]);
2274
2275 // Update the incoming values of active lane mask phis.
2276 for (unsigned Part = 0; Part < UF; ++Part) {
2277 Phis[Part]->setStartValue(EntryExtracts[Part]);
2278 Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
2279 }
2280
2281 return true;
2282}
2283
2284/// Try to simplify the branch condition of \p Plan. This may restrict the
2285/// resulting plan to \p BestVF and \p BestUF.
/// Returns true if the terminator of the exiting block was rewritten.
// NOTE(review): this extract omits listing lines 2286 and 2288 (parts of the
// signature), 2299 (start of the second terminator pattern), 2304 and 2307
// (the expressions assigned to VectorTripCount) and 2316 (the second
// alternative of the `else if` condition). Restore them from the upstream
// file before relying on this text.
2287 unsigned BestUF,
2289 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2290 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
2291 auto *Term = &ExitingVPBB->back();
2292 VPValue *Cond;
2293 auto m_CanIVInc = m_Add(m_VPValue(), m_Specific(&Plan.getVFxUF()));
2294 // Check if the branch condition compares the canonical IV increment (for main
2295 // loop), or the canonical IV increment plus an offset (for epilog loop).
2296 if (match(Term, m_BranchOnCount(
2297 m_CombineOr(m_CanIVInc, m_c_Add(m_CanIVInc, m_LiveIn())),
2298 m_VPValue())) ||
2300 m_VPValue(), m_VPValue(), m_VPValue()))))) {
2301 // Try to simplify the branch condition if VectorTC <= VF * UF when the
2302 // latch terminator is BranchOnCount or BranchOnCond(Not(ActiveLaneMask)).
2303 const SCEV *VectorTripCount =
2305 if (isa<SCEVCouldNotCompute>(VectorTripCount))
2306 VectorTripCount =
2308 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2309 "Trip count SCEV must be computable");
2310 ScalarEvolution &SE = *PSE.getSE();
2311 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
2312 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
2313 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, VectorTripCount, C))
2314 return false;
2315 } else if (match(Term, m_BranchOnCond(m_VPValue(Cond))) ||
2317 // For BranchOnCond, check if we can prove the condition to be true using VF
2318 // and UF.
2319 if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, PSE))
2320 return false;
2321 } else {
// Unrecognized terminator: leave the plan unchanged.
2322 return false;
2323 }
2324
2325 // The vector loop region only executes once. Convert terminator of the
2326 // exiting block to exit in the first iteration.
// A BranchOnTwoConds terminator is kept; only its operand 1 is forced to
// true.
2327 if (match(Term, m_BranchOnTwoConds())) {
2328 Term->setOperand(1, Plan.getTrue());
2329 return true;
2330 }
2331
// Any other terminator is replaced by a BranchOnCond on constant true.
2332 auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, Plan.getTrue(), {},
2333 {}, Term->getDebugLoc());
2334 ExitingVPBB->appendRecipe(BOC);
2335 Term->eraseFromParent();
2336
2337 return true;
2338}
2339
2340/// From the definition of llvm.experimental.get.vector.length,
2341/// VPInstruction::ExplicitVectorLength(%AVL) = %AVL when %AVL <= VF.
/// Returns true after replacing the first such EVL recipe found, false if
/// none qualifies.
// NOTE(review): this extract omits listing lines 2342-2344 (the function
// signature and the head of the loop over blocks) and 2359 (the start of the
// statement that defines Trunc). The function name is not visible here;
// restore these lines from the upstream file.
2345 vp_depth_first_deep(Plan.getEntry()))) {
2346 for (VPRecipeBase &R : *VPBB) {
2347 VPValue *AVL;
2348 if (!match(&R, m_EVL(m_VPValue(AVL))))
2349 continue;
2350
// Only proceed when AVL has a computable SCEV that is provably <= VF
// (unsigned).
2351 const SCEV *AVLSCEV = vputils::getSCEVExprForVPValue(AVL, PSE);
2352 if (isa<SCEVCouldNotCompute>(AVLSCEV))
2353 continue;
2354 ScalarEvolution &SE = *PSE.getSE();
2355 const SCEV *VFSCEV = SE.getElementCount(AVLSCEV->getType(), VF);
2356 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, AVLSCEV, VFSCEV))
2357 continue;
2358
2360 AVL, Type::getInt32Ty(Plan.getContext()), AVLSCEV->getType(),
2361 R.getDebugLoc());
// When a new recipe was created for the conversion, try to fold it if its
// operands are live-ins.
2362 if (Trunc != AVL) {
2363 auto *TruncR = cast<VPSingleDefRecipe>(Trunc);
2364 const DataLayout &DL = Plan.getDataLayout();
2365 if (VPValue *Folded = tryToFoldLiveIns(*TruncR, TruncR->operands(), DL,
2366 Plan.getContext()))
2367 Trunc = Folded;
2368 }
2369 R.getVPSingleValue()->replaceAllUsesWith(Trunc);
2370 return true;
2371 }
2372 }
2373 return false;
2374}
2375
// NOTE(review): listing lines 2376 and 2378 (the start and the end of the
// function signature) are missing from this extract, so the function name is
// not visible here; restore them from the upstream file.
// Runs the three VF/UF-specific simplifications below and, if any of them
// changed the plan, pins the plan to BestVF.
2377 unsigned BestUF,
2379 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
2380 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
2381
2382 bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
2383 MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
2384 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
2385
2386 if (MadeChange) {
2387 Plan.setVF(BestVF);
2388 assert(Plan.getConcreteUF() == BestUF && "BestUF must match the Plan's UF");
2389 }
2390}
2391
// NOTE(review): this extract omits listing lines 2392 (the function
// signature; its name is not visible here), 2394 (the range the loop iterates
// over) and 2400 (the remaining recurrence kinds of the filter below).
// Restore them from the upstream file.
// Visible behavior: for every reduction phi whose recurrence kind passes the
// filter, drop the poison-generating flags of all recipes with IR flags among
// its recursively collected users.
2393 for (VPRecipeBase &R :
2395 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
2396 if (!PhiR)
2397 continue;
2398 RecurKind RK = PhiR->getRecurrenceKind();
2399 if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
2401 continue;
2402
2403 for (VPUser *U : collectUsersRecursively(PhiR))
2404 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
2405 RecWithFlags->dropPoisonGeneratingFlags();
2406 }
2407 }
2408}
2409
2410namespace {
/// DenseMap traits used to key a map on VPSingleDefRecipe pointers by the
/// recipes' contents (recipe ID, opcode or intrinsic ID, GEP source element
/// type, scalar type, operands, predicate, induction opcode) instead of by
/// pointer identity.
// NOTE(review): this extract omits listing lines 2421 (start of the return
// expression in getGEPSourceElementType), 2457 (the remaining hash_combine
// arguments in getHashValue), 2471 and 2473 (two of the comparisons in
// isEqual) and 2476 (the start of the assert in isEqual). Restore them from
// the upstream file before relying on this text.
2411struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
/// Returns true if \p Def is the empty-key sentinel.
2412 static bool isSentinel(const VPSingleDefRecipe *Def) {
2413 return Def == getEmptyKey();
2414 }
2415
2416 /// If recipe \p R will lower to a GEP with a non-i8 source element type,
2417 /// return that source element type.
2418 static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) {
2419 // All VPInstructions that lower to GEPs must have the i8 source element
2420 // type (as they are PtrAdds), so we omit it.
2422 .Case([](const VPReplicateRecipe *I) -> Type * {
2423 if (auto *GEP = dyn_cast<GetElementPtrInst>(I->getUnderlyingValue()))
2424 return GEP->getSourceElementType();
2425 return nullptr;
2426 })
2427 .Case<VPVectorPointerRecipe, VPWidenGEPRecipe>(
2428 [](auto *I) { return I->getSourceElementType(); })
2429 .Default([](auto *) { return nullptr; });
2430 }
2431
2432 /// Returns true if recipe \p Def can be safely handled for CSE.
2433 static bool canHandle(const VPSingleDefRecipe *Def) {
2434 // We can extend the list of handled recipes in the future,
2435 // provided we account for the data embedded in them while checking for
2436 // equality or hashing.
2437 auto C = getOpcodeOrIntrinsicID(Def);
2438
2439 // The issue with (Insert|Extract)Value is that the index of the
2440 // insert/extract is not a proper operand in LLVM IR, and hence also not in
2441 // VPlan.
2442 if (!C || (!C->first && (C->second == Instruction::InsertValue ||
2443 C->second == Instruction::ExtractValue)))
2444 return false;
2445
2446 // During CSE, we can only handle recipes that don't read from memory: if
2447 // they read from memory, there could be an intervening write to memory
2448 // before the next instance is CSE'd, leading to an incorrect result.
2449 return !Def->mayReadFromMemory();
2450 }
2451
2452 /// Hash the underlying data of \p Def.
2453 static unsigned getHashValue(const VPSingleDefRecipe *Def) {
2454 hash_code Result = hash_combine(
2455 Def->getVPRecipeID(), getOpcodeOrIntrinsicID(Def),
2456 getGEPSourceElementType(Def), Def->getScalarType(),
// The compare predicate and the induction opcode are mixed in because
// isEqual compares them too.
2458 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def))
2459 if (RFlags->hasPredicate())
2460 return hash_combine(Result, RFlags->getPredicate());
2461 if (auto *SIVSteps = dyn_cast<VPScalarIVStepsRecipe>(Def))
2462 return hash_combine(Result, SIVSteps->getInductionOpcode());
2463 return Result;
2464 }
2465
2466 /// Check equality of underlying data of \p L and \p R.
2467 static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
// Sentinel keys only compare equal to themselves.
2468 if (isSentinel(L) || isSentinel(R))
2469 return L == R;
2470 if (L->getVPRecipeID() != R->getVPRecipeID() ||
2472 getGEPSourceElementType(L) != getGEPSourceElementType(R) ||
2474 !equal(L->operands(), R->operands()))
2475 return false;
2477 "must have valid opcode info for both recipes");
2478 if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
2479 if (LFlags->hasPredicate() &&
2480 LFlags->getPredicate() !=
2481 cast<VPRecipeWithIRFlags>(R)->getPredicate())
2482 return false;
2483 if (auto *LSIV = dyn_cast<VPScalarIVStepsRecipe>(L))
2484 if (LSIV->getInductionOpcode() !=
2485 cast<VPScalarIVStepsRecipe>(R)->getInductionOpcode())
2486 return false;
2487 // Recipes in replicate regions implicitly depend on predicate. If either
2488 // recipe is in a replicate region, only consider them equal if both have
2489 // the same parent.
2490 const VPRegionBlock *RegionL = L->getRegion();
2491 const VPRegionBlock *RegionR = R->getRegion();
2492 if (((RegionL && RegionL->isReplicator()) ||
2493 (RegionR && RegionR->isReplicator())) &&
2494 L->getParent() != R->getParent())
2495 return false;
2496 return L->getScalarType() == R->getScalarType();
2497 }
2498};
2499} // end anonymous namespace
2500
2501/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
2502/// Plan.
// NOTE(review): this extract omits listing lines 2503 (the function
// signature; its name is not visible here), 2505 (the declaration of CSEMap),
// 2507 (the start of the traversal object completed on line 2508) and 2509
// (the head of the loop over blocks). Restore them from the upstream file.
2504 VPDominatorTree VPDT(Plan);
2506
2508 Plan.getEntry());
2510 for (VPRecipeBase &R : *VPBB) {
2511 auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
2512 if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
2513 continue;
// An equivalent recipe V was recorded earlier; Def is left in place with
// its uses redirected to V.
2514 if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
2515 // V must dominate Def for a valid replacement.
2516 if (!VPDT.dominates(V->getParent(), VPBB))
2517 continue;
2518 // Only keep flags present on both V and Def.
2519 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
2520 RFlags->intersectFlags(*cast<VPRecipeWithIRFlags>(Def));
2521 Def->replaceAllUsesWith(V);
2522 continue;
2523 }
// First occurrence of this expression: record it as the representative.
2524 CSEMap[Def] = Def;
2525 }
2526 }
2527}
2528
2529/// Return true if we do not know how to (mechanically) hoist or sink a
2530/// non-memory or memory recipe \p R out of a loop region.
// NOTE(review): this extract omits listing lines 2531 (start of the
// signature; the caller in licm passes R, the region's entry block and its
// exiting block) and 2534 (the statement executed when R is not a replicate
// recipe or does not read memory, presumably a return). Restore them from the
// upstream file.
2532 VPBasicBlock *LastBB) {
2533 if (!isa<VPReplicateRecipe>(R) || !R.mayReadFromMemory())
2535
2536 // Check that the load doesn't alias with stores between FirstBB and LastBB.
// Without a known memory location the recipe is conservatively reported as
// not movable.
2537 auto MemLoc = vputils::getMemoryLocation(R);
2538 return !MemLoc || !canHoistOrSinkWithNoAliasCheck(*MemLoc, FirstBB, LastBB);
2539}
2540
2541/// Move loop-invariant recipes out of the vector loop region in \p Plan.
// NOTE(review): this extract omits listing lines 2553 (the head of the first
// loop over blocks), 2573 (the start of the traversal object completed on
// line 2574) and 2575 (the head of the second loop over blocks). Restore them
// from the upstream file before relying on this text.
2542static void licm(VPlan &Plan) {
2543 VPBasicBlock *Preheader = Plan.getVectorPreheader();
2544
2545 // Hoist any loop invariant recipes from the vector loop region to the
2546 // preheader. Perform a shallow traversal of the vector loop region, to
2547 // exclude recipes in replicate regions. Since the top-level blocks in the
2548 // vector loop region are guaranteed to execute if the vector pre-header is,
2549 // we don't need to check speculation safety.
2550 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2551 assert(Preheader->getSingleSuccessor() == LoopRegion &&
2552 "Expected vector prehader's successor to be the vector loop region");
2554 vp_depth_first_shallow(LoopRegion->getEntry()))) {
2555 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2556 if (cannotHoistOrSinkRecipe(R, LoopRegion->getEntryBasicBlock(),
2557 LoopRegion->getExitingBasicBlock()))
2558 continue;
// Every operand must be defined outside loop regions for R to be invariant.
2559 if (any_of(R.operands(), [](VPValue *Op) {
2560 return !Op->isDefinedOutsideLoopRegions();
2561 }))
2562 continue;
2563 R.moveBefore(*Preheader, Preheader->end());
2564 }
2565 }
2566
// The dominator tree is only needed by the assertion further down.
2567#ifndef NDEBUG
2568 VPDominatorTree VPDT(Plan);
2569#endif
2570 // Sink recipes with no users inside the vector loop region if all users are
2571 // in the same exit block of the region.
2572 // TODO: Extend to sink recipes from inner loops.
2574 LoopRegion->getEntry());
2576 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
2577 if (vputils::cannotHoistOrSinkRecipe(R, /*Sinking=*/true))
2578 continue;
2579
2580 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
2581 assert(!RepR->isPredicated() &&
2582 "Expected prior transformation of predicated replicates to "
2583 "replicate regions");
2584 // narrowToSingleScalarRecipes should have already maximally narrowed
2585 // replicates to single-scalar replicates.
2586 // TODO: When unrolling, replicateByVF doesn't handle sunk
2587 // non-single-scalar replicates correctly.
2588 if (!RepR->isSingleScalar())
2589 continue;
2590 }
2591
2592 // TODO: Use R.definedValues() instead of casting to VPSingleDefRecipe to
2593 // support recipes with multiple defined values (e.g., interleaved loads).
2594 auto *Def = cast<VPSingleDefRecipe>(&R);
2595
2596 // Cannot sink the recipe if the user is defined in a loop region or a
2597 // non-successor of the vector loop region. Cannot sink if user is a phi
2598 // either.
// The lambda also records the users' common block in SinkBB and rejects
// the recipe as soon as two users live in different blocks.
2599 VPBasicBlock *SinkBB = nullptr;
2600 if (any_of(Def->users(), [&SinkBB, &LoopRegion](VPUser *U) {
2601 auto *UserR = cast<VPRecipeBase>(U);
2602 VPBasicBlock *Parent = UserR->getParent();
2603 // TODO: Support sinking when users are in multiple blocks.
2604 if (SinkBB && SinkBB != Parent)
2605 return true;
2606 SinkBB = Parent;
2607 // TODO: If the user is a PHI node, we should check the block of
2608 // incoming value. Support PHI node users if needed.
2609 return UserR->isPhi() || Parent->getEnclosingLoopRegion() ||
2610 Parent->getSinglePredecessor() != LoopRegion;
2611 }))
2612 continue;
2613
// A recipe without users is sunk to the region's single successor.
2614 if (!SinkBB)
2615 SinkBB = cast<VPBasicBlock>(LoopRegion->getSingleSuccessor());
2616
2617 // TODO: This will need to be a check instead of an assert after
2618 // conditional branches in vectorized loops are supported.
2619 assert(VPDT.properlyDominates(VPBB, SinkBB) &&
2620 "Defining block must dominate sink block");
2621 // TODO: Clone the recipe if users are on multiple exit paths, instead of
2622 // just moving.
2623 Def->moveBefore(*SinkBB, SinkBB->getFirstNonPhi());
2624 }
2625 }
2626}
2627
// NOTE(review): this extract omits listing lines 2628 (start of the
// signature; the function name is not visible here), 2636 (the declaration of
// ProcessedTruncs), 2638-2639 (the head of the loop over blocks), 2641-2642
// (the condition guarding the first `continue`), 2655 (the condition guarding
// the second `continue`), 2678 (the opening of the block closed on line
// 2686), 2680 (the start of the statement defining Ext) and 2718 (the start
// of the builder expression creating the ZExt). Restore them from the
// upstream file before relying on this text.
// Visible behavior: for recipes whose underlying instruction has an entry in
// MinBWs, compute the result in the narrower integer type by truncating the
// operands, and zero-extend the result back to the original type for existing
// users.
2629 VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
2630 if (Plan.hasScalarVFOnly())
2631 return;
2632 // Keep track of created truncates, so they can be re-used. Note that we
2633 // cannot use RAUW after creating a new truncate, as this could make
2634 // other uses have different types for their operands, making them invalidly
2635 // typed.
2637 VPBasicBlock *PH = Plan.getVectorPreheader();
2640 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2643 continue;
2644
2645 VPValue *ResultVPV = R.getVPSingleValue();
2646 auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
// A lookup result of zero means no minimal bit width is recorded for this
// instruction.
2647 unsigned NewResSizeInBits = MinBWs.lookup(UI);
2648 if (!NewResSizeInBits)
2649 continue;
2650
2651 // If the value wasn't vectorized, we must maintain the original scalar
2652 // type. Skip those here, after incrementing NumProcessedRecipes. Also
2653 // skip casts which do not need to be handled explicitly here, as
2654 // redundant casts will be removed during recipe simplification.
2656 continue;
2657
2658 Type *OldResTy = ResultVPV->getScalarType();
2659 unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
2660 assert(OldResTy->isIntegerTy() && "only integer types supported");
2661 (void)OldResSizeInBits;
2662
2663 auto *NewResTy = IntegerType::get(Plan.getContext(), NewResSizeInBits);
2664
2665 // Any wrapping introduced by shrinking this operation shouldn't be
2666 // considered undefined behavior. So, we can't unconditionally copy
2667 // arithmetic wrapping flags to VPW.
2668 if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
2669 VPW->dropPoisonGeneratingFlags();
2670
2671 assert((OldResSizeInBits != NewResSizeInBits ||
2672 match(&R, m_ICmp(m_VPValue(), m_VPValue()))) &&
2673 "Only ICmps should not need extending the result.");
2674 assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
2675
2676 // For loads/intrinsics we don't recreate the recipe; just wrap the
2677 // original wide result in a ZExt to OldResTy.
2679 if (OldResSizeInBits != NewResSizeInBits) {
2681 Instruction::ZExt, ResultVPV, OldResTy);
// RAUW also rewrote Ext's own operand, so point it back at the original
// result.
2682 ResultVPV->replaceAllUsesWith(Ext);
2683 Ext->setOperand(0, ResultVPV);
2684 }
2685 continue;
2686 }
2687
2688 // Shrink operands by introducing truncates as needed.
// For a select, operand 0 is skipped and only operands 1 and 2 are
// narrowed.
2689 unsigned StartIdx =
2690 match(&R, m_Select(m_VPValue(), m_VPValue(), m_VPValue())) ? 1 : 0;
2691 SmallVector<VPValue *> NewOperands(R.operands());
2692 for (VPValue *&Op : drop_begin(NewOperands, StartIdx)) {
2693 unsigned OpSizeInBits = Op->getScalarType()->getScalarSizeInBits();
2694 if (OpSizeInBits == NewResSizeInBits)
2695 continue;
2696 assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
2697 auto [ProcessedIter, Inserted] = ProcessedTruncs.try_emplace(Op);
2698 if (Inserted) {
// First truncate of this operand: VPIRValue operands are truncated in the
// vector preheader, all others right before R.
2699 VPBuilder Builder;
2700 if (isa<VPIRValue>(Op))
2701 Builder.setInsertPoint(PH);
2702 else
2703 Builder.setInsertPoint(&R);
2704 ProcessedIter->second =
2705 Builder.createWidenCast(Instruction::Trunc, Op, NewResTy);
2706 }
2707 Op = ProcessedIter->second;
2708 }
2709
2710 auto *NWR = cast<VPWidenRecipe>(&R)->cloneWithOperands(NewOperands);
2711 NWR->insertBefore(&R);
2712
2713 // Wrap NWR in a ZExt to preserve the original wide type for downstream
2714 // users (unless this is an ICmp, which produces i1 regardless).
2715 VPValue *Replacement = NWR->getVPSingleValue();
2716 if (OldResSizeInBits != NewResSizeInBits)
2717 Replacement =
2719 .createWidenCast(Instruction::ZExt, Replacement, OldResTy)
2720 ->getVPSingleValue();
2721 ResultVPV->replaceAllUsesWith(Replacement);
2722 R.eraseFromParent();
2723 }
2724 }
2725}
2726
/// Fold BranchOnCond terminators whose condition matches m_True() or
/// m_False(): the phi incoming values for the never-taken edge are removed
/// from the dropped successor, the edge is disconnected and the terminator is
/// erased. Afterwards, blocks that are not in the reachable set are detached
/// from their successors and recipes of dead blocks are erased. When
/// \p OnlyLatches is set, only blocks for which VPBlockUtils::isLatch() holds
/// are considered for folding.
2727void VPlanTransforms::removeBranchOnConst(VPlan &Plan, bool OnlyLatches) {
 // The dominator tree is only constructed when the isLatch() query below
 // needs it.
2728 std::optional<VPDominatorTree> VPDT;
2729 if (OnlyLatches)
2730 VPDT.emplace(Plan);
2731
2732 // Collect all blocks before modifying the CFG so we can identify unreachable
2733 // ones after constant branch removal.
 // NOTE(review): the declaration of AllBlocks (listing line 2734) is not
 // visible in this listing - confirm against the upstream source.
2735
2736 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(AllBlocks)) {
2737 VPValue *Cond;
2738 // Skip blocks that are not terminated by BranchOnCond.
2739 if (VPBB->empty() || !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
2740 continue;
2741
2742 if (OnlyLatches && !VPBlockUtils::isLatch(VPBB, *VPDT))
2743 continue;
2744
2745 assert(VPBB->getNumSuccessors() == 2 &&
2746 "Two successors expected for BranchOnCond");
 // A true condition drops successor 1, a false condition drops successor 0;
 // any other condition leaves the branch untouched.
2747 unsigned RemovedIdx;
2748 if (match(Cond, m_True()))
2749 RemovedIdx = 1;
2750 else if (match(Cond, m_False()))
2751 RemovedIdx = 0;
2752 else
2753 continue;
2754
2755 VPBasicBlock *RemovedSucc =
2756 cast<VPBasicBlock>(VPBB->getSuccessors()[RemovedIdx]);
2757 assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
2758 "There must be a single edge between VPBB and its successor");
2759 // Values coming from VPBB into phi recipes of RemovedSucc are removed from
2760 // these recipes.
2761 for (VPRecipeBase &R : RemovedSucc->phis())
2762 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB);
2763
2764 // Disconnect blocks and remove the terminator.
2765 VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
2766 VPBB->back().eraseFromParent();
2767 }
2768
2769 // Compute which blocks are still reachable from the entry after constant
2770 // branch removal.
 // NOTE(review): the computation of Reachable (listing lines 2771-2772) is
 // not visible in this listing.
2773
2774 // Detach all unreachable blocks from their successors, removing their recipes
2775 // and incoming values from phi recipes.
 // Tmp is a placeholder: uses of values defined in dead blocks are redirected
 // to it before the defining recipes are erased.
2776 VPSymbolicValue Tmp(nullptr);
2777 for (VPBlockBase *B : AllBlocks) {
2778 if (Reachable.contains(B))
2779 continue;
 // Iterate over a copy of the successor list (to_vector).
 // NOTE(review): listing line 2784 is absent; presumably it disconnects B
 // from Succ, which would explain the copy - confirm.
2780 for (VPBlockBase *Succ : to_vector(B->successors())) {
2781 if (auto *SuccBB = dyn_cast<VPBasicBlock>(Succ))
2782 for (VPRecipeBase &R : SuccBB->phis())
2783 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(B);
2785 }
 // NOTE(review): the range expression of this loop (listing line 2787) is
 // not visible in this listing.
2786 for (VPBasicBlock *DeadBB :
2788 for (VPRecipeBase &R : make_early_inc_range(*DeadBB)) {
2789 for (VPValue *Def : R.definedValues())
2790 Def->replaceAllUsesWith(&Tmp);
2791 R.eraseFromParent();
2792 }
2793 }
2794 }
2795}
2796
2816
2817// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
2818// the loop terminator with a branch-on-cond recipe with the negated
2819// active-lane-mask as operand. Note that this turns the loop into an
2820// uncountable one. Only the existing terminator is replaced, all other existing
2821// recipes/users remain unchanged, except for poison-generating flags being
2822// dropped from the canonical IV increment. Return the created
2823// VPActiveLaneMaskPHIRecipe.
2824//
2825// The function adds the following recipes:
2826//
2827// vector.ph:
2828// %EntryInc = canonical-iv-increment-for-part CanonicalIVStart
2829// %EntryALM = active-lane-mask %EntryInc, TC
2830//
2831// vector.body:
2832// ...
2833// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
2834// ...
2835// %InLoopInc = canonical-iv-increment-for-part CanonicalIVIncrement
2836// %ALM = active-lane-mask %InLoopInc, TC
2837// %Negated = Not %ALM
2838// branch-on-cond %Negated
2839//
// NOTE(review): the function signature (listing lines 2840-2841) is not
// visible in this listing; the caller below invokes it as
// addVPLaneMaskPhiAndUpdateExitBranch(Plan).
2842 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
2843 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
 // The canonical IV start value is the zero constant of the canonical IV
 // type.
2844 VPValue *StartV = Plan.getZero(TopRegion->getCanonicalIVType());
2845 auto *CanonicalIVIncrement = TopRegion->getOrCreateCanonicalIVIncrement();
2846 // TODO: Check if dropping the flags is needed.
2847 TopRegion->clearCanonicalIVNUW(CanonicalIVIncrement);
 // All recipes created below reuse the debug location of the canonical IV
 // increment.
2848 DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
2849 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
2850 // we have to take unrolling into account. Each part needs to start at
2851 // Part * VF
2852 auto *VecPreheader = Plan.getVectorPreheader();
2853 VPBuilder Builder(VecPreheader);
2854
2855 // Create the ActiveLaneMask instruction using the correct start values.
2856 VPValue *TC = Plan.getTripCount();
2857 VPValue *VF = &Plan.getVF();
2858
 // NOTE(review): {false, false} presumably disables both wrap flags on the
 // created op - confirm against VPBuilder::createOverflowingOp.
2859 auto *EntryIncrement = Builder.createOverflowingOp(
2860 VPInstruction::CanonicalIVIncrementForPart, {StartV, VF}, {false, false},
2861 DL, "index.part.next");
2862
2863 // Create the active lane mask instruction in the VPlan preheader.
 // The third ActiveLaneMask operand is the constant 1 of the canonical IV
 // type; it is shared by the entry and in-loop masks.
2864 VPValue *ALMMultiplier =
2865 Plan.getConstantInt(TopRegion->getCanonicalIVType(), 1);
2866 auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2867 {EntryIncrement, TC, ALMMultiplier}, DL,
2868 "active.lane.mask.entry");
2869
2870 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
2871 // preheader ActiveLaneMask instruction.
 // NOTE(review): the initializer of LaneMaskPhi (listing line 2873) is not
 // visible in this listing.
2872 auto *LaneMaskPhi =
2874 auto *HeaderVPBB = TopRegion->getEntryBasicBlock();
 // The phi becomes the first recipe of the header block.
2875 LaneMaskPhi->insertBefore(*HeaderVPBB, HeaderVPBB->begin());
2876
2877 // Create the active lane mask for the next iteration of the loop before the
2878 // original terminator.
2879 VPRecipeBase *OriginalTerminator = EB->getTerminator();
2880 Builder.setInsertPoint(OriginalTerminator);
 // NOTE(review): the opcode argument (listing line 2882) is not visible in
 // this listing; presumably it mirrors the entry increment above - confirm.
2881 auto *InLoopIncrement = Builder.createOverflowingOp(
2883 {CanonicalIVIncrement, &Plan.getVF()}, {false, false}, DL);
2884 auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2885 {InLoopIncrement, TC, ALMMultiplier}, DL,
2886 "active.lane.mask.next");
2887 LaneMaskPhi->addBackedgeValue(ALM);
2888
2889 // Replace the original terminator with BranchOnCond. We have to invert the
2890 // mask here because a true condition means jumping to the exit block.
2891 auto *NotMask = Builder.createNot(ALM, DL);
2892 Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
2893 OriginalTerminator->eraseFromParent();
2894 return LaneMaskPhi;
2895}
2896
// Replace the plan's header mask with an active-lane-mask. With
// UseActiveLaneMaskForControlFlow set, the mask is the phi returned by
// addVPLaneMaskPhiAndUpdateExitBranch(); otherwise an ActiveLaneMask
// VPInstruction is created right after the widened canonical IV.
// NOTE(review): the first line of the signature (listing line 2897) is not
// visible in this listing.
2898 bool UseActiveLaneMaskForControlFlow) {
2899 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
 // NOTE(review): the initializer of WideCanonicalIV (listing line 2901) is
 // not visible in this listing.
2900 auto *WideCanonicalIV =
2902 assert(WideCanonicalIV &&
2903 "Must have widened canonical IV when tail folding!");
 // NOTE(review): HeaderMask is dereferenced below without a null check,
 // whereas fixupVFUsersForEVL guards the result of the same call - confirm
 // that a header mask always exists on this path.
2904 VPSingleDefRecipe *HeaderMask = vputils::findHeaderMask(Plan);
2905 VPSingleDefRecipe *LaneMask;
2906 if (UseActiveLaneMaskForControlFlow) {
2907 LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(Plan);
2908 } else {
 // Data-only mask: active-lane-mask(WideCanonicalIV, trip count, 1), without
 // a debug location.
2909 VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
2910 VPValue *ALMMultiplier =
2911 Plan.getConstantInt(LoopRegion->getCanonicalIVType(), 1);
2912 LaneMask =
2913 B.createNaryOp(VPInstruction::ActiveLaneMask,
2914 {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
2915 nullptr, "active.lane.mask");
2916 }
2917
2918 // Walk users of WideCanonicalIV and replace the header mask of the form
2919 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
2920 // removing the old one to ensure there is always only a single header mask.
2921 HeaderMask->replaceAllUsesWith(LaneMask);
2922 HeaderMask->eraseFromParent();
2923}
2924
/// Pattern matcher that accepts either the mask \p In itself, or a
/// logical-and whose first operand is \p In. The remaining operand of the
/// logical-and is written to \p Out; when the matched value is exactly \p In,
/// \p Out is set to nullptr.
template <typename Op0_t, typename Op1_t> struct RemoveMask_match {
  /// The mask to strip from the matched value.
  Op0_t In;
  /// Receives the remaining mask. Held by reference so that the const match()
  /// below can write the result back to the caller's variable.
  Op1_t &Out;

  RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {}

  /// Returns true if \p V is \p In or (logical-and \p In, X), updating Out as
  /// described on the struct.
  template <typename OpTy> bool match(OpTy *V) const {
    // Exact match: nothing is left of the mask once In is removed.
    if (m_Specific(In).match(V)) {
      Out = nullptr;
      return true;
    }
    // Otherwise capture the other logical-and operand in Out.
    return m_LogicalAnd(m_Specific(In), m_VPValue(Out)).match(V);
  }
};
2939
2940/// Match a specific mask \p In, or a combination of it (logical-and In, Out).
2941/// Returns the remaining part \p Out if so, or nullptr otherwise.
2942template <typename Op0_t, typename Op1_t>
2943static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In,
2944 Op1_t &Out) {
2945 return RemoveMask_match<Op0_t, Op1_t>(In, Out);
2946}
2947
2948static std::optional<Intrinsic::ID> getVPDivRemIntrinsic(Intrinsic::ID IntrID) {
2949 switch (IntrID) {
2950 case Intrinsic::masked_udiv:
2951 return Intrinsic::vp_udiv;
2952 case Intrinsic::masked_sdiv:
2953 return Intrinsic::vp_sdiv;
2954 case Intrinsic::masked_urem:
2955 return Intrinsic::vp_urem;
2956 case Intrinsic::masked_srem:
2957 return Intrinsic::vp_srem;
2958 default:
2959 return std::nullopt;
2960 }
2961}
2962
2963/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
2964/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
2965/// recipe could be created.
2966/// \p HeaderMask Header Mask.
2967/// \p CurRecipe Recipe to be transformed.
2968/// \p EVL The explicit vector length parameter of vector-predication
2969/// intrinsics.
// NOTE(review): the first line of the signature (listing line 2970) is not
// visible in this listing.
// Throughout this function, Mask is the part of a recipe's mask that remains
// after removing HeaderMask via m_RemoveMask; it is nullptr when the mask is
// exactly HeaderMask.
2971 VPRecipeBase &CurRecipe, VPValue &EVL) {
2972 VPlan *Plan = CurRecipe.getParent()->getPlan();
2973 DebugLoc DL = CurRecipe.getDebugLoc();
2974 VPValue *Addr, *Mask, *EndPtr;
2975
2976 /// Adjust any end pointers so that they point to the end of EVL lanes not VF.
 // Works on a clone inserted before CurRecipe; the original end pointer
 // recipe is left unmodified.
2977 auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) {
2978 auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(EndPtr)->clone();
2979 EVLEndPtr->insertBefore(&CurRecipe);
2980 // Cast EVL (i32) to match the VF operand's type.
 // NOTE(review): the trailing argument(s) of this call (listing line 2983)
 // are not visible in this listing.
2981 VPValue *EVLAsVF = VPBuilder(EVLEndPtr).createScalarZExtOrTrunc(
2982 &EVL, EVLEndPtr->getOperand(1)->getScalarType(), EVL.getScalarType(),
2984 EVLEndPtr->setOperand(1, EVLAsVF);
2985 return EVLEndPtr;
2986 };
2987
 // Emit an experimental_vp_reverse of V (all-true mask, length EVL) before
 // CurRecipe; a null V is passed through as nullptr.
 // NOTE(review): the rest of the capture list and the parameter list (listing
 // line 2989) are not visible in this listing.
2988 auto GetVPReverse = [&CurRecipe, &EVL, Plan,
2990 if (!V)
2991 return nullptr;
2992 auto *Reverse = new VPWidenIntrinsicRecipe(
2993 Intrinsic::experimental_vp_reverse, {V, Plan->getTrue(), &EVL},
2994 V->getScalarType(), {}, {}, DL);
2995 Reverse->insertBefore(&CurRecipe);
2996 return Reverse;
2997 };
2998
 // Masked load -> EVL load.
2999 if (match(&CurRecipe,
3000 m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))))
3001 return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr,
3002 EVL, Mask);
3003
 // Reverse of a masked load with a reversed mask and a VF-based end pointer:
 // emit an EVL load from the adjusted end pointer and return a vp.reverse of
 // the loaded value.
3004 VPValue *ReversedVal;
3005 if (match(&CurRecipe, m_Reverse(m_VPValue(ReversedVal))) &&
3006 match(ReversedVal,
3007 m_MaskedLoad(m_VPValue(EndPtr),
3008 m_Reverse(m_RemoveMask(HeaderMask, Mask)))) &&
3009 match(EndPtr, m_VecEndPtr(m_VPValue(), m_Specific(&Plan->getVF())))) {
3010 Mask = GetVPReverse(Mask);
3011 Addr = AdjustEndPtr(EndPtr);
3012 auto *LoadR = new VPWidenLoadEVLRecipe(
3013 *cast<VPWidenLoadRecipe>(ReversedVal), Addr, EVL, Mask);
3014 LoadR->insertBefore(&CurRecipe);
3015 return new VPWidenIntrinsicRecipe(Intrinsic::experimental_vp_reverse,
3016 {LoadR, Plan->getTrue(), &EVL},
3017 LoadR->getScalarType(), {}, {}, DL);
3018 }
3019
 // Memory intrinsic with a stride operand and a (possibly truncated) VF
 // operand: clone it with operand 2 set to the remaining mask (all-true if
 // none) and operand 3 set to EVL.
 // NOTE(review): the opening of this condition (listing line 3021) is not
 // visible in this listing.
3020 VPValue *Stride;
3022 m_VPValue(Addr), m_VPValue(Stride),
3023 m_RemoveMask(HeaderMask, Mask),
3024 m_TruncOrSelf(m_Specific(&Plan->getVF()))))) {
3025 if (!Mask)
3026 Mask = Plan->getTrue();
3027 auto *NewLoad = cast<VPWidenMemIntrinsicRecipe>(&CurRecipe)->clone();
3028 NewLoad->setOperand(2, Mask);
3029 NewLoad->setOperand(3, &EVL);
3030 return NewLoad;
3031 }
3032
 // Masked store -> EVL store.
3033 VPValue *StoredVal;
3034 if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(StoredVal),
3035 m_RemoveMask(HeaderMask, Mask))))
3036 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
3037 StoredVal, EVL, Mask);
3038
 // Masked store of a reversed value with a reversed mask: re-emit the
 // reverses as vp.reverse and store through the adjusted end pointer.
3039 if (match(&CurRecipe,
3040 m_MaskedStore(m_VPValue(EndPtr), m_Reverse(m_VPValue(ReversedVal)),
3041 m_Reverse(m_RemoveMask(HeaderMask, Mask)))) &&
3042 match(EndPtr, m_VecEndPtr(m_VPValue(), m_Specific(&Plan->getVF())))) {
3043 Mask = GetVPReverse(Mask);
3044 Addr = AdjustEndPtr(EndPtr);
3045 StoredVal = GetVPReverse(ReversedVal);
3046 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
3047 StoredVal, EVL, Mask);
3048 }
3049
 // Conditional reduction whose condition contains the header mask.
3050 if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe))
3051 if (Rdx->isConditional() &&
3052 match(Rdx->getCondOp(), m_RemoveMask(HeaderMask, Mask)))
3053 return new VPReductionEVLRecipe(*Rdx, EVL, Mask);
3054
 // Masked interleave group whose mask contains the header mask.
3055 if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(&CurRecipe))
3056 if (Interleave->getMask() &&
3057 match(Interleave->getMask(), m_RemoveMask(HeaderMask, Mask)))
3058 return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask);
3059
 // Select on the header mask -> vp.merge, using an all-true condition when
 // no mask remains.
3060 VPValue *LHS, *RHS;
3061 if (match(&CurRecipe, m_Select(m_RemoveMask(HeaderMask, Mask), m_VPValue(LHS),
3062 m_VPValue(RHS))))
3063 return new VPWidenIntrinsicRecipe(
3064 Intrinsic::vp_merge, {Mask ? Mask : Plan->getTrue(), LHS, RHS, &EVL},
3065 LHS->getScalarType(), {}, {}, DL);
3066
 // Last active lane of the header mask -> EVL (converted to the result type)
 // minus one.
3067 if (match(&CurRecipe, m_LastActiveLane(m_Specific(HeaderMask)))) {
3068 Type *Ty = CurRecipe.getVPSingleValue()->getScalarType();
3069 VPValue *ZExt =
3070 VPBuilder(&CurRecipe)
3071 .createScalarZExtOrTrunc(&EVL, Ty, EVL.getScalarType(), DL);
3072 return new VPInstruction(
3073 Instruction::Sub, {ZExt, Plan->getConstantInt(Ty, 1)},
3074 VPIRFlags::getDefaultFlags(Instruction::Sub), {}, DL);
3075 }
3076
3077 // lhs | (headermask && rhs) -> vp.merge rhs, true, lhs, evl
 // NOTE(review): the outer matcher binding LHS (listing line 3079) is not
 // visible in this listing.
3078 if (match(&CurRecipe,
3080 m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(RHS)))))
3081 return new VPWidenIntrinsicRecipe(Intrinsic::vp_merge,
3082 {RHS, Plan->getTrue(), LHS, &EVL},
3083 LHS->getScalarType(), {}, {}, DL);
3084
 // Masked div/rem intrinsic (mask is operand 2) -> corresponding vp.*
 // intrinsic with the remaining mask (all-true if none) and EVL.
3085 if (auto *IntrR = dyn_cast<VPWidenIntrinsicRecipe>(&CurRecipe))
3086 if (auto VPID = getVPDivRemIntrinsic(IntrR->getVectorIntrinsicID()))
3087 if (match(IntrR->getOperand(2), m_RemoveMask(HeaderMask, Mask)))
3088 return new VPWidenIntrinsicRecipe(*VPID,
3089 {IntrR->getOperand(0),
3090 IntrR->getOperand(1),
3091 Mask ? Mask : Plan->getTrue(), &EVL},
3092 IntrR->getScalarType(), {}, {}, DL);
3093
 // No EVL-based form for this recipe.
3094 return nullptr;
3095}
3096
3097/// Optimize away any EVL-based header masks to VP intrinsic based recipes.
3098/// The transforms here need to preserve the original semantics.
// NOTE(review): the function signature (listing line 3099) is not visible in
// this listing.
3100 // Find the EVL-based header mask if it exists: icmp ult step-vector, EVL
3101 VPValue *HeaderMask = nullptr, *EVL = nullptr;
 // NOTE(review): the loop header and the start of the match condition
 // (listing lines 3102-3103) are not visible in this listing.
3104 m_VPValue(EVL))) &&
3105 match(EVL, m_EVL(m_VPValue()))) {
3106 HeaderMask = R.getVPSingleValue();
3107 break;
3108 }
3109 }
 // Nothing to do when the plan has no EVL-based header mask.
3110 if (!HeaderMask)
3111 return;
3112
 // Replaced recipes are only recorded here; they are erased at the end, after
 // all users of the header mask have been visited.
3113 SmallVector<VPRecipeBase *> OldRecipes;
3114 for (VPUser *U : collectUsersRecursively(HeaderMask)) {
 // NOTE(review): the definition of R (listing line 3115) is not visible in
 // this listing.
3116 if (auto *NewR = optimizeMaskToEVL(HeaderMask, *R, *EVL)) {
3117 NewR->insertBefore(R);
 // zip_equal pairs each old defined value with the corresponding new one.
3118 for (auto [Old, New] :
3119 zip_equal(R->definedValues(), NewR->definedValues()))
3120 Old->replaceAllUsesWith(New);
3121 OldRecipes.push_back(R);
3122 }
3123 }
3124
3125 // Replace remaining (HeaderMask && Mask) with vp.merge (True, Mask,
3126 // False, EVL)
3127 for (VPUser *U : collectUsersRecursively(HeaderMask)) {
3128 VPValue *Mask;
3129 if (match(U, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(Mask)))) {
3130 auto *LogicalAnd = cast<VPInstruction>(U);
3131 auto *Merge = new VPWidenIntrinsicRecipe(
3132 Intrinsic::vp_merge, {Plan.getTrue(), Mask, Plan.getFalse(), EVL},
3133 Mask->getScalarType(), {}, {}, LogicalAnd->getDebugLoc());
3134 Merge->insertBefore(LogicalAnd);
3135 LogicalAnd->replaceAllUsesWith(Merge);
3136 OldRecipes.push_back(LogicalAnd);
3137 }
3138 }
3139
 // Erase the replaced recipes in reverse order of collection. Operands are
 // copied out first because they cannot be queried once R is erased.
3140 for (VPRecipeBase *R : reverse(OldRecipes)) {
3141 SmallVector<VPValue *> PossiblyDead(R->operands());
3142 R->eraseFromParent();
 // NOTE(review): the loop body (listing line 3144) is not visible in this
 // listing; presumably it cleans up operands that became dead - confirm.
3143 for (VPValue *Op : PossiblyDead)
3145 }
3146}
3147
3148/// After replacing the canonical IV with an EVL-based IV, fixup recipes that use
3149/// VF to use the EVL instead to avoid incorrect updates on the penultimate
3150/// iteration.
3151static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL) {
3152 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3153 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3154
3155 // EVL is i32 but VF/VFxUF are IdxTy. Convert as needed.
 // NOTE(review): the initializer of EVLAsIdx (listing lines 3157-3159) is not
 // visible in this listing.
3156 VPValue *EVLAsIdx =
3160
 // Every user of VF must be one of the recipe kinds listed below, or a trunc
 // of VF whose own users are all of those kinds.
3161 assert(all_of(Plan.getVF().users(),
3162 [&Plan](VPUser *U) {
3163 auto IsAllowedUser =
3164 IsaPred<VPVectorEndPointerRecipe, VPScalarIVStepsRecipe,
3165 VPWidenIntOrFpInductionRecipe,
3166 VPWidenMemIntrinsicRecipe>;
3167 if (match(U, m_Trunc(m_Specific(&Plan.getVF()))))
3168 return all_of(cast<VPSingleDefRecipe>(U)->users(),
3169 IsAllowedUser);
3170 return IsAllowedUser(U);
3171 }) &&
3172 "User of VF that we can't transform to EVL.");
 // NOTE(review): the body of this predicate (listing line 3174) is not
 // visible in this listing.
3173 Plan.getVF().replaceUsesWithIf(EVLAsIdx, [](VPUser &U, unsigned Idx) {
3175 });
3176
 // NOTE(review): parts of this assertion (listing lines 3178 and 3181) are
 // not visible in this listing.
3177 assert(all_of(Plan.getVFxUF().users(),
3179 m_c_Add(m_Specific(LoopRegion->getCanonicalIV()),
3180 m_Specific(&Plan.getVFxUF())),
3182 "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
3183 "increment of the canonical induction.");
3184 Plan.getVFxUF().replaceUsesWithIf(EVLAsIdx, [](VPUser &U, unsigned Idx) {
3185 // Only replace uses in VPWidenPointerInductionRecipe; The increment of the
3186 // canonical induction must not be updated.
 // NOTE(review): the return statement of this predicate (listing line 3187)
 // is not visible in this listing.
3188 });
3189
3190 // Create a scalar phi to track the previous EVL if fixed-order recurrence is
3191 // contained.
 // NOTE(review): the initializer of ContainsFORs (listing line 3193) is not
 // visible in this listing.
3192 bool ContainsFORs =
3194 if (ContainsFORs) {
3195 // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
3196 VPValue *MaxEVL = &Plan.getVF();
3197 // Emit VPScalarCastRecipe in preheader if VF is not a 32 bits integer.
3198 VPBuilder Builder(LoopRegion->getPreheaderVPBB());
 // NOTE(review): the trailing argument(s) of this call (listing line 3201)
 // are not visible in this listing.
3199 MaxEVL = Builder.createScalarZExtOrTrunc(
3200 MaxEVL, Type::getInt32Ty(Plan.getContext()), MaxEVL->getScalarType(),
3202
 // prev.evl starts at MaxEVL and takes the current EVL on the backedge.
3203 Builder.setInsertPoint(Header, Header->getFirstNonPhi());
3204 VPValue *PrevEVL = Builder.createScalarPhi(
3205 {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl");
3206
 // NOTE(review): the enclosing block traversal (listing lines 3207-3208), the
 // matched pattern (3212) and the construction of Imm and VPSplice
 // (3216-3217) are only partly visible in this listing.
3209 for (VPRecipeBase &R : *VPBB) {
3210 VPValue *V1, *V2;
3211 if (!match(&R,
3213 m_VPValue(V1), m_VPValue(V2))))
3214 continue;
3215 VPValue *Imm = Plan.getOrAddLiveIn(
3218 Intrinsic::experimental_vp_splice,
3219 {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},
3220 R.getVPSingleValue()->getScalarType(), {}, {}, R.getDebugLoc());
 // The matched recipe is not erased here; only its uses are redirected to
 // the splice.
3221 VPSplice->insertBefore(&R);
3222 R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
3223 }
3224 }
3225 }
3226
 // Without a header mask there is nothing left to rewrite.
3227 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
3228 if (!HeaderMask)
3229 return;
3230
3231 // Ensure that any reduction that uses a select to mask off tail lanes does so
3232 // in the vector loop, not the middle block, since EVL tail folding can have
3233 // tail elements in the penultimate iteration.
3234 assert(all_of(*Plan.getMiddleBlock(), [&Plan, HeaderMask](VPRecipeBase &R) {
3235 if (match(&R, m_ComputeReductionResult(m_Select(m_Specific(HeaderMask),
3236 m_VPValue(), m_VPValue()))))
3237 return R.getOperand(0)->getDefiningRecipe()->getRegion() ==
3238 Plan.getVectorLoopRegion();
3239 return true;
3240 }));
3241
3242 // Replace header masks with a mask equivalent to predicating by EVL:
3243 //
3244 // icmp ule widen-canonical-iv backedge-taken-count
3245 // ->
3246 // icmp ult step-vector, EVL
 // The new mask is created immediately after the recipe defining EVL.
3247 VPRecipeBase *EVLR = EVL.getDefiningRecipe();
3248 VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
3249 Type *EVLType = EVL.getScalarType();
 // NOTE(review): the predicate argument (listing line 3251) is not visible in
 // this listing; the comment above indicates an unsigned less-than compare.
3250 VPValue *EVLMask = Builder.createICmp(
3252 Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
3253 HeaderMask->replaceAllUsesWith(EVLMask);
3254}
3255
3256/// Converts a tail folded vector loop region to step by
3257/// VPInstruction::ExplicitVectorLength elements instead of VF elements each
3258/// iteration.
3259///
3260/// - Add a VPCurrentIterationPHIRecipe and related recipes to \p Plan and
3261/// replaces all uses of the canonical IV except for the canonical IV
3262/// increment with a VPCurrentIterationPHIRecipe. The canonical IV is used
3263/// only for loop iterations counting after this transformation.
3264///
3265/// - The header mask is replaced with a header mask based on the EVL.
3266///
3267/// - Plans with FORs have a new phi added to keep track of the EVL of the
3268/// previous iteration, and VPFirstOrderRecurrencePHIRecipes are replaced with
3269/// @llvm.vp.splice.
3270///
3271/// The function uses the following definitions:
3272/// %StartV is the canonical induction start value.
3273///
3274/// The function adds the following recipes:
3275///
3276/// vector.ph:
3277/// ...
3278///
3279/// vector.body:
3280/// ...
3281/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
3282/// [ %NextIter, %vector.body ]
3283/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3284/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
3285/// ...
3286/// %OpEVL = cast i32 %VPEVL to IVSize
3287/// %NextIter = add IVSize %OpEVL, %CurrentIter
3288/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3289/// ...
3290///
3291/// If MaxSafeElements is provided, the function adds the following recipes:
3292/// vector.ph:
3293/// ...
3294///
3295/// vector.body:
3296/// ...
3297/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
3298/// [ %NextIter, %vector.body ]
3299/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3300/// %cmp = cmp ult %AVL, MaxSafeElements
3301/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
3302/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
3303/// ...
3304/// %OpEVL = cast i32 %VPEVL to IVSize
3305/// %NextIter = add IVSize %OpEVL, %CurrentIter
3306/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3307/// ...
3308///
// NOTE(review): the first line of the signature (listing line 3309) is not
// visible in this listing.
3310 VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
 // Plans that only have a scalar VF are left unchanged.
3311 if (Plan.hasScalarVFOnly())
3312 return;
3313 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3314 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3315
3316 auto *CanonicalIV = LoopRegion->getCanonicalIV();
3317 auto *CanIVTy = LoopRegion->getCanonicalIVType();
3318 VPValue *StartV = Plan.getZero(CanIVTy);
3319 auto *CanonicalIVIncrement = LoopRegion->getOrCreateCanonicalIVIncrement();
3320
3321 // Create the CurrentIteration recipe in the vector loop.
 // NOTE(review): the initializer of CurrentIteration (listing line 3323) is
 // not visible in this listing.
3322 auto *CurrentIteration =
3324 CurrentIteration->insertBefore(*Header, Header->begin());
3325 VPBuilder Builder(Header, Header->getFirstNonPhi());
3326 // Create the AVL (application vector length), starting from TC -> 0 in steps
3327 // of EVL.
 // Only the incoming value from the preheader is set here; the backedge
 // value is added once NextAVL exists.
3328 VPPhi *AVLPhi = Builder.createScalarPhi(
3329 {Plan.getTripCount()}, DebugLoc::getCompilerGenerated(), "avl");
3330 VPValue *AVL = AVLPhi;
3331
3332 if (MaxSafeElements) {
3333 // Support for MaxSafeDist for correct loop emission.
 // Clamp the AVL passed to EXPLICIT-VECTOR-LENGTH to MaxSafeElements
 // (unsigned minimum expressed as compare + select).
3334 VPValue *AVLSafe = Plan.getConstantInt(CanIVTy, *MaxSafeElements);
3335 VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
3336 AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
3337 "safe_avl");
3338 }
3339 auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
3340 DebugLoc::getUnknown(), "evl");
3341
 // The remaining recipes are created right before the canonical IV
 // increment.
3342 Builder.setInsertPoint(CanonicalIVIncrement);
3343 VPValue *OpVPEVL = VPEVL;
3344
 // Convert the i32 EVL to the canonical IV type.
3345 auto *I32Ty = Type::getInt32Ty(Plan.getContext());
3346 OpVPEVL = Builder.createScalarZExtOrTrunc(
3347 OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
3348
 // The new increment reuses the debug location and no-wrap flags of the
 // canonical IV increment.
3349 auto *NextIter = Builder.createAdd(
3350 OpVPEVL, CurrentIteration, CanonicalIVIncrement->getDebugLoc(),
3351 "current.iteration.next", CanonicalIVIncrement->getNoWrapFlags());
3352 CurrentIteration->addBackedgeValue(NextIter);
3353
 // Note that the subtraction uses the unclamped AVLPhi, not the (possibly
 // clamped) AVL.
3354 VPValue *NextAVL =
3355 Builder.createSub(AVLPhi, OpVPEVL, DebugLoc::getCompilerGenerated(),
3356 "avl.next", {/*NUW=*/true, /*NSW=*/false});
3357 AVLPhi->addIncoming(NextAVL);
3358
3359 fixupVFUsersForEVL(Plan, *VPEVL);
3360 removeDeadRecipes(Plan);
3361
3362 // Replace all uses of the canonical IV with VPCurrentIterationPHIRecipe
3363 // except for the canonical IV increment.
 // replaceAllUsesWith also rewrites the increment, so its first operand is
 // restored to the canonical IV afterwards.
3364 CanonicalIV->replaceAllUsesWith(CurrentIteration);
3365 CanonicalIVIncrement->setOperand(0, CanonicalIV);
3366 // TODO: support unroll factor > 1.
3367 Plan.setUF(1);
3368}
3369
// Replace the VPCurrentIterationPHIRecipe of \p Plan (if any) with a concrete
// scalar phi, and replace the canonical IV increment (if found) with the
// current-iteration increment.
// NOTE(review): the function signature (listing line 3370) is not visible in
// this listing.
3371 // Find the vector loop entry by locating VPCurrentIterationPHIRecipe.
3372 // There should be only one VPCurrentIteration in the entire plan.
3373 VPCurrentIterationPHIRecipe *CurrentIteration = nullptr;
3374
 // NOTE(review): the block traversal that defines VPBB (listing lines
 // 3375-3376) is not visible in this listing.
3377 for (VPRecipeBase &R : VPBB->phis())
3378 if (auto *PhiR = dyn_cast<VPCurrentIterationPHIRecipe>(&R)) {
3379 assert(!CurrentIteration &&
3380 "Found multiple CurrentIteration. Only one expected");
3381 CurrentIteration = PhiR;
3382 }
3383
3384 // Early return if it is not variable-length stepping.
3385 if (!CurrentIteration)
3386 return;
3387
 // Capture the parent block and backedge value before the recipe is erased
 // below.
3388 VPBasicBlock *HeaderVPBB = CurrentIteration->getParent();
3389 VPValue *CurrentIterationIncr = CurrentIteration->getBackedgeValue();
3390
3391 // Convert CurrentIteration to concrete recipe.
 // NOTE(review): the builder call name (listing line 3394) is not visible in
 // this listing.
3392 auto *ScalarR =
3393 VPBuilder(CurrentIteration)
3395 {CurrentIteration->getStartValue(), CurrentIterationIncr},
3396 CurrentIteration->getDebugLoc(), "current.iteration.iv");
3397 CurrentIteration->replaceAllUsesWith(ScalarR);
3398 CurrentIteration->eraseFromParent();
3399
3400 // Replace CanonicalIVInc with CurrentIteration increment if it exists.
 // NOTE(review): this takes the first recipe of the header block to be the
 // canonical IV phi - confirm that ordering is guaranteed at this point.
3401 auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
3402 if (auto *CanIVInc = findUserOf(
3403 CanonicalIV, m_c_Add(m_VPValue(), m_Specific(&Plan.getVFxUF())))) {
3404 cast<VPInstruction>(CanIVInc)->replaceAllUsesWith(CurrentIterationIncr);
3405 CanIVInc->eraseFromParent();
3406 }
3407}
3408
// For an EVL tail-folded loop, rewrite the latch branch condition to compare
// the next AVL against zero. Returns without changes if the plan has no loop
// region, the header is empty or does not start with a
// VPCurrentIterationPHIRecipe, the backedge does not have the expected EVL
// form, or the latch already branches on a true condition.
// NOTE(review): the function signature (listing line 3409) is not visible in
// this listing.
3410 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3411 if (!LoopRegion)
3412 return;
3413 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3414 if (Header->empty())
3415 return;
3416 // The EVL IV is always at the beginning.
3417 auto *EVLPhi = dyn_cast<VPCurrentIterationPHIRecipe>(&Header->front());
3418 if (!EVLPhi)
3419 return;
3420
3421 // Bail if not an EVL tail folded loop.
 // The backedge must be EVLPhi + EVL(AVL), with the EVL optionally
 // zero-extended; the match binds AVL.
3422 VPValue *AVL;
3423 if (!match(EVLPhi->getBackedgeValue(),
3424 m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi))))
3425 return;
3426
3427 // The AVL may be capped to a safe distance.
 // NOTE(review): the opening of this matcher (listing line 3430) is not
 // visible in this listing.
3428 VPValue *SafeAVL, *UnsafeAVL;
3429 if (match(AVL,
3431 m_VPValue(SafeAVL)),
3432 m_Deferred(UnsafeAVL), m_Deferred(SafeAVL))))
3433 AVL = UnsafeAVL;
3434
 // NOTE(review): the start of this match expression (listing line 3437) is
 // not visible in this listing; it binds AVLNext alongside the trip count.
3435 VPValue *AVLNext;
3436 [[maybe_unused]] bool FoundAVLNext =
3438 m_Specific(Plan.getTripCount()), m_VPValue(AVLNext)));
3439 assert(FoundAVLNext && "Didn't find AVL backedge?");
3440
3441 VPBasicBlock *Latch = LoopRegion->getExitingBasicBlock();
3442 auto *LatchBr = cast<VPInstruction>(Latch->getTerminator());
 // A latch that unconditionally exits is left as is.
3443 if (match(LatchBr, m_BranchOnCond(m_True())))
3444 return;
3445
 // The match result is only consumed by the assertion below, which checks the
 // expected shape of the existing exit condition.
 // NOTE(review): the matcher binding CanIVInc (listing line 3449) is not
 // visible in this listing.
3446 VPValue *CanIVInc;
3447 [[maybe_unused]] bool FoundIncrement = match(
3448 LatchBr,
3450 m_Specific(&Plan.getVectorTripCount()))));
3451 assert(FoundIncrement &&
3452 match(CanIVInc, m_Add(m_Specific(LoopRegion->getCanonicalIV()),
3453 m_Specific(&Plan.getVFxUF()))) &&
3454 "Expected BranchOnCond with ICmp comparing CanIV + VFxUF with vector "
3455 "trip count");
3456
 // New exit condition: AVLNext == 0, built right before the latch branch.
3457 Type *AVLTy = AVLNext->getScalarType();
3458 VPBuilder Builder(LatchBr);
3459 LatchBr->setOperand(
3460 0, Builder.createICmp(CmpInst::ICMP_EQ, AVLNext, Plan.getZero(AVLTy)));
3461}
3462
// Replace uses of versioned symbolic strides (and of their sext/zext users)
// with the constant stride known to \p PSE, restricted to users dominated by
// the vector preheader, and rewrite VPExpandSCEVRecipes in the entry block
// accordingly.
// NOTE(review): the first line of the signature (listing line 3463) is not
// visible in this listing.
3464 VPlan &Plan, PredicatedScalarEvolution &PSE,
3465 const DenseMap<Value *, const SCEV *> &StridesMap) {
3466 // Replace VPValues for known constant strides guaranteed by predicated scalar
3467 // evolution that are guaranteed to be guarded by the runtime checks; that is,
3468 // blocks dominated by the vector preheader.
3469 assert(!Plan.getVectorLoopRegion() &&
3470 "expected to run before loop regions are created");
3471 VPDominatorTree VPDT(Plan);
 // NOTE(review): this takes the entry block's second successor to be the
 // vector preheader - confirm the successor order.
3472 VPBlockBase *Preheader = Plan.getEntry()->getSuccessors()[1];
 // Predicate for replaceUsesWithIf: only users in blocks dominated by
 // Preheader are rewritten.
3473 auto CanUseVersionedStride = [&VPDT, Preheader](VPUser &U, unsigned) {
3474 auto *R = cast<VPRecipeBase>(&U);
3475 VPBlockBase *Parent = R->getParent();
3476 return VPDT.dominates(Preheader, Parent);
3477 };
 // Records the handled strides for the SCEV rewrite of expand-SCEV recipes
 // below.
3478 ValueToSCEVMapTy RewriteMap;
3479 for (const SCEV *Stride : StridesMap.values()) {
3480 using namespace SCEVPatternMatch;
3481 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
3482 const APInt *StrideConst;
3483 if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst)))
3484 // Only handle constant strides for now.
3485 continue;
3486
3487 auto *CI = Plan.getConstantInt(*StrideConst);
3488 if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
3489 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3490
3491 // The versioned value may not be used in the loop directly but through a
3492 // sext/zext. Add new live-ins in those cases.
3493 for (Value *U : StrideV->users()) {
 // NOTE(review): the condition guarding this continue (listing line 3494) is
 // not visible in this listing; presumably it skips users that are neither
 // sext nor zext - confirm.
3495 continue;
3496 VPValue *StrideVPV = Plan.getLiveIn(U);
3497 if (!StrideVPV)
3498 continue;
 // Extend the constant to the width of the cast's result, using the same
 // kind of extension as the cast.
3499 unsigned BW = U->getType()->getScalarSizeInBits();
3500 APInt C =
3501 isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
3502 VPValue *CI = Plan.getConstantInt(C);
3503 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3504 }
3505 RewriteMap[StrideV] = PSE.getSCEV(StrideV);
3506 }
3507
 // Re-expand SCEVs in the entry block whose expression changes once the
 // strides are substituted; the plan's trip count is updated if it was the
 // rewritten recipe.
3508 for (VPRecipeBase &R : *Plan.getEntry()) {
3509 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
3510 if (!ExpSCEV)
3511 continue;
3512 const SCEV *ScevExpr = ExpSCEV->getSCEV();
3513 auto *NewSCEV =
3514 SCEVParameterRewriter::rewrite(ScevExpr, *PSE.getSE(), RewriteMap);
3515 if (NewSCEV != ScevExpr) {
3516 VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, NewSCEV);
3517 ExpSCEV->replaceAllUsesWith(NewExp);
3518 if (Plan.getTripCount() == ExpSCEV)
3519 Plan.resetTripCount(NewExp);
3520 }
3521 }
3522}
3523
// Drop poison-generating flags from recipes that feed the address of
// consecutive widened memory recipes and interleave recipes whose mask is
// present and is not the header mask.
// NOTE(review): the function signature (listing line 3524) is not visible in
// this listing.
3525 // Collect recipes in the backward slice of `Root` that may generate a poison
3526 // value that is used after vectorization.
 // NOTE(review): the declarations of Visited (listing line 3527) and Worklist
 // (3529) are not visible in this listing.
3528 auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
3530 Worklist.push_back(Root);
3531
3532 // Traverse the backward slice of Root through its use-def chain.
3533 while (!Worklist.empty()) {
3534 VPRecipeBase *CurRec = Worklist.pop_back_val();
3535
 // Each recipe is processed at most once.
3536 if (!Visited.insert(CurRec).second)
3537 continue;
3538
3539 // Prune search if we find another recipe generating a widen memory
3540 // instruction. Widen memory instructions involved in address computation
3541 // will lead to gather/scatter instructions, which don't need to be
3542 // handled.
 // NOTE(review): the start of this condition (listing line 3543) is not
 // visible in this listing.
3544 VPHeaderPHIRecipe>(CurRec))
3545 continue;
3546
3547 // This recipe contributes to the address computation of a widen
3548 // load/store. If the underlying instruction has poison-generating flags,
3549 // drop them directly.
3550 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
3551 VPValue *A, *B;
3552 // Dropping disjoint from an OR may yield incorrect results, as some
3553 // analysis may have converted it to an Add implicitly (e.g. SCEV used
3554 // for dependence analysis). Instead, replace it with an equivalent Add.
3555 // This is possible as all users of the disjoint OR only access lanes
3556 // where the operands are disjoint or poison otherwise.
3557 if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
3558 RecWithFlags->isDisjoint()) {
3559 VPBuilder Builder(RecWithFlags);
3560 VPInstruction *New =
3561 Builder.createAdd(A, B, RecWithFlags->getDebugLoc());
3562 New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
3563 RecWithFlags->replaceAllUsesWith(New);
3564 RecWithFlags->eraseFromParent();
 // Continue the traversal from the replacement, since the original recipe
 // has been erased.
3565 CurRec = New;
3566 } else
3567 RecWithFlags->dropPoisonGeneratingFlags();
3568 } else {
 // NOTE(review): the definition of Instr (listing lines 3569-3570) is not
 // visible in this listing.
3571 (void)Instr;
3572 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
3573 "found instruction with poison generating flags not covered by "
3574 "VPRecipeWithIRFlags");
3575 }
3576
3577 // Add new definitions to the worklist.
3578 for (VPValue *Operand : CurRec->operands())
3579 if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
3580 Worklist.push_back(OpDef);
3581 }
3582 });
3583
3584 // We want to exclude the tail folding case, as we don't need to drop flags
3585 // for operations computing the first lane in this case: the first lane of the
3586 // header mask must always be true.
 // Also false for a null mask, so unmasked recipes are not processed.
3587 auto IsNotHeaderMask = [&Plan](VPValue *Mask) {
3588 return Mask && !vputils::isHeaderMask(Mask, Plan);
3589 };
3590
3591 // Traverse all the recipes in the VPlan and collect the poison-generating
3592 // recipes in the backward slice starting at the address of a VPWidenRecipe or
3593 // VPInterleaveRecipe.
 // NOTE(review): the initializer of Iter and the block loop header (listing
 // lines 3595-3596) are not visible in this listing.
3594 auto Iter =
3597 for (VPRecipeBase &Recipe : *VPBB) {
3598 if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
3599 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
3600 if (AddrDef && WidenRec->isConsecutive() &&
3601 IsNotHeaderMask(WidenRec->getMask()))
3602 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3603 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
3604 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
3605 if (AddrDef && IsNotHeaderMask(InterleaveRec->getMask()))
3606 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3607 }
3608 }
3609 }
3610}
3611
3613 VPlan &Plan,
3615 &InterleaveGroups,
3616 const bool &EpilogueAllowed) {
     // Replace the widened memory recipes of every interleave group in
     // InterleaveGroups with a single VPInterleaveRecipe per group. Returns
     // immediately when InterleaveGroups is empty.
     // NOTE(review): this listing skips original lines 3612, 3614, 3620,
     // 3622-3623 and 3667 (see the embedded numbers). The function name, the
     // declaration of IRMemberToRecipe and the declaration of NW are presumably
     // on those lines -- confirm against the full file.
3617 if (InterleaveGroups.empty())
3618 return;
3619
     // Record, for each widened memory recipe visited, which recipe widens its
     // underlying IR instruction (keyed by the address of getIngredient()).
3621 for (VPBasicBlock *VPBB :
3624 for (VPRecipeBase &R : make_filter_range(*VPBB, [](VPRecipeBase &R) {
3625 return isa<VPWidenMemoryRecipe>(&R);
3626 })) {
3627 auto *MemR = cast<VPWidenMemoryRecipe>(&R);
3628 IRMemberToRecipe[&MemR->getIngredient()] = MemR;
3629 }
3630
3631 // Interleave memory: for each Interleave Group we marked earlier as relevant
3632 // for this VPlan, replace the Recipes widening its memory instructions with a
3633 // single VPInterleaveRecipe at its insertion point.
3634 VPDominatorTree VPDT(Plan);
3635 for (const auto *IG : InterleaveGroups) {
3636 // Skip interleave groups where members don't have recipes. This can happen
3637 // when removeDeadRecipes removes recipes that are part of interleave groups
3638 // but have no users.
3639 if (llvm::any_of(IG->members(), [&IRMemberToRecipe](Instruction *Member) {
3640 return !IRMemberToRecipe.contains(Member);
3641 }))
3642 continue;
3643
     // Start is the recipe of member 0. Its metadata seeds InterleaveMD, which is
     // then intersected with the metadata of every other present member.
3644 auto *Start = IRMemberToRecipe.lookup(IG->getMember(0));
3645 VPIRMetadata InterleaveMD(*Start);
     // StoredValues collects the stored value of each member that is a widened
     // store, in member order; it stays empty when no member is a store.
3646 SmallVector<VPValue *, 4> StoredValues;
3647 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Start->getAsRecipe()))
3648 StoredValues.push_back(StoreR->getStoredValue());
3649 for (unsigned I = 1; I < IG->getFactor(); ++I) {
3650 Instruction *MemberI = IG->getMember(I);
     // A null member is a gap in the group; nothing to collect for it.
3651 if (!MemberI)
3652 continue;
3653 VPWidenMemoryRecipe *MemoryR = IRMemberToRecipe.lookup(MemberI);
3654 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemoryR->getAsRecipe()))
3655 StoredValues.push_back(StoreR->getStoredValue());
3656 InterleaveMD.intersect(*MemoryR);
3657 }
3658
     // A gap mask is requested when the group requires a scalar epilogue that is
     // not allowed, or when stored values were collected and the group is not
     // full.
3659 bool NeedsMaskForGaps =
3660 (IG->requiresScalarEpilogue() && !EpilogueAllowed) ||
3661 (!StoredValues.empty() && !IG->isFull());
3662
3663 Instruction *IRInsertPos = IG->getInsertPos();
3664 auto *InsertPos = IRMemberToRecipe.lookup(IRInsertPos);
3665 VPRecipeBase *InsertPosR = InsertPos->getAsRecipe();
3666
     // NOTE(review): NW is declared on a line missing from this listing (3667).
     // When the pointer operand of the insert position (pointer casts stripped)
     // is a GEP, NW takes that GEP's no-wrap flags with no-unsigned-wrap removed.
3668 if (auto *Gep = dyn_cast<GetElementPtrInst>(
3669 getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
3670 NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();
3671
3672 // Get or create the start address for the interleave group.
3673 VPValue *Addr = Start->getAddr();
3674 VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
3675 if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPosR)) {
3676 // We cannot re-use the address of member zero because it does not
3677 // dominate the insert position. Instead, use the address of the insert
3678 // position and create a PtrAdd adjusting it to the address of member
3679 // zero.
3680 // TODO: Hoist Addr's defining recipe (and any operands as needed) to
3681 // InsertPos or sink loads above zero members to join it.
3682 assert(IG->getIndex(IRInsertPos) != 0 &&
3683 "index of insert position shouldn't be zero");
3684 auto &DL = IRInsertPos->getDataLayout();
     // Offset is a 32-bit signed APInt holding (alloc size of the accessed type)
     // * (index of the insert position in the group); it is negated below so the
     // PtrAdd steps back from the insert position's address.
3685 APInt Offset(32,
3686 DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
3687 IG->getIndex(IRInsertPos),
3688 /*IsSigned=*/true);
3689 VPValue *OffsetVPV = Plan.getConstantInt(-Offset);
3690 VPBuilder B(InsertPosR);
3691 Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
3692 }
3693 // If the group is reverse, adjust the index to refer to the last vector
3694 // lane instead of the first. We adjust the index from the first vector
3695 // lane, rather than directly getting the pointer for lane VF - 1, because
3696 // the pointer operand of the interleaved access is supposed to be uniform.
3697 if (IG->isReverse()) {
3698 auto *ReversePtr = new VPVectorEndPointerRecipe(
3699 Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
3700 -(int64_t)IG->getFactor(), NW, InsertPosR->getDebugLoc());
3701 ReversePtr->insertBefore(InsertPosR);
3702 Addr = ReversePtr;
3703 }
     // The interleave recipe takes the mask and debug location of the insert
     // position's recipe and is placed directly before it.
3704 auto *VPIG = new VPInterleaveRecipe(
3705 IG, Addr, StoredValues, InsertPos->getMask(), NeedsMaskForGaps,
3706 InterleaveMD, InsertPosR->getDebugLoc());
3707 VPIG->insertBefore(InsertPosR);
3708
     // Redirect users of each member to the matching result of VPIG and erase the
     // member recipes. J indexes VPIG's results and only advances for members
     // whose IR type is not void.
3709 unsigned J = 0;
3710 for (unsigned i = 0; i < IG->getFactor(); ++i)
3711 if (Instruction *Member = IG->getMember(i)) {
3712 VPRecipeBase *MemberR = IRMemberToRecipe.lookup(Member)->getAsRecipe();
3713 if (!Member->getType()->isVoidTy()) {
3714 VPValue *OriginalV = MemberR->getVPSingleValue();
3715 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
3716 J++;
3717 }
3718 MemberR->eraseFromParent();
3719 }
3720 }
3721}
3722
3723/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
3724/// value, phi and backedge value. In the following example:
3725///
3726/// vector.ph:
3727/// Successor(s): vector loop
3728///
3729/// <x1> vector loop: {
3730/// vector.body:
3731/// WIDEN-INDUCTION %i = phi %start, %step, %vf
3732/// ...
3733/// EMIT branch-on-count ...
3734/// No successors
3735/// }
3736///
3737/// WIDEN-INDUCTION will get expanded to:
3738///
3739/// vector.ph:
3740/// ...
3741/// vp<%induction.start> = ...
3742/// vp<%induction.increment> = ...
3743///
3744/// Successor(s): vector loop
3745///
3746/// <x1> vector loop: {
3747/// vector.body:
3748/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
3749/// ...
3750/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
3751/// EMIT branch-on-count ...
3752/// No successors
3753/// }
3754static void
3756 VPlan *Plan = WidenIVR->getParent()->getPlan();
3757 VPValue *Start = WidenIVR->getStartValue();
3758 VPValue *Step = WidenIVR->getStepValue();
3759 VPValue *VF = WidenIVR->getVFValue();
3760 DebugLoc DL = WidenIVR->getDebugLoc();
3761
     // NOTE(review): this listing skips original lines 3755, 3767-3768, 3790 and
     // 3833. The function name/parameter list, the declarations of AddOp and
     // MulOp, the initializer of IVIntTy and the definition of ExitingBB are
     // presumably on those lines -- confirm against the full file.
3762 // The value from the original loop to which we are mapping the new induction
3763 // variable.
3764 Type *Ty = WidenIVR->getScalarType();
3765
3766 const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
     // Flags are copied from the recipe and attached to every arithmetic
     // operation created below.
3769 VPIRFlags Flags = *WidenIVR;
     // Integer inductions use Add/Mul; any other kind uses the descriptor's
     // induction opcode together with FMul.
3770 if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
3771 AddOp = Instruction::Add;
3772 MulOp = Instruction::Mul;
3773 } else {
3774 AddOp = ID.getInductionOpcode();
3775 MulOp = Instruction::FMul;
3776 }
3777
3778 // If the phi is truncated, truncate the start and step values.
3779 VPBuilder Builder(Plan->getVectorPreheader());
3780 Type *StepTy = Step->getScalarType();
3781 if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
3782 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
3783 Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL);
3784 Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL);
3785 StepTy = Ty;
3786 }
3787
3788 // Construct the initial value of the vector IV in the vector loop preheader.
     // Init = splat(Start) AddOp (stepvector MulOp splat(Step)); the step vector
     // is converted with UIToFP first when the step type is floating point.
3789 Type *IVIntTy =
3791 VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);
3792 if (StepTy->isFloatingPointTy())
3793 Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);
3794
3795 VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start);
3796 VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
3797
3798 Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
3799 Init = Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags,
3800 DebugLoc::getUnknown(), "induction");
3801
3802 // Create the widened phi of the vector IV.
     // The phi is created with a builder positioned at WidenIVR and starts with
     // Init as its only incoming value; the backedge value is added further down.
3803 auto *WidePHI = VPBuilder(WidenIVR).createWidenPhi(
3804 Init, WidenIVR->getDebugLoc(), "vec.ind");
3805
3806 // Create the backedge value for the vector IV.
3807 VPValue *Inc;
3808 VPValue *Prev;
3809 // If unrolled, use the increment and prev value from the operands.
3810 if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
3811 Inc = SplatVF;
3812 Prev = WidenIVR->getLastUnrolledPartOperand();
3813 } else {
3814 // Move the insertion point after the VF definition when the VF is defined
3815 // inside a loop, such as for EVL tail-folding.
3816 if (VPRecipeBase *R = VF->getDefiningRecipe())
3817 if (R->getParent()->getEnclosingLoopRegion())
3818 Builder.setInsertPoint(R->getParent(), std::next(R->getIterator()));
3819
3820 // Multiply the vectorization factor by the step using integer or
3821 // floating-point arithmetic as appropriate.
3822 if (StepTy->isFloatingPointTy())
3823 VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
3824 DL);
3825 else
3826 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, VF->getScalarType(), DL);
3827
     // Inc = broadcast(Step MulOp VF); the previous value is the new phi itself.
3828 Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
3829 Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);
3830 Prev = WidePHI;
3831 }
3832
     // The backedge value "vec.ind.next" is emitted right before the terminator of
     // ExitingBB and becomes the phi's second incoming value.
3834 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3835 auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags,
3836 WidenIVR->getDebugLoc(), "vec.ind.next");
3837
3838 WidePHI->addIncoming(Next);
3839
     // Users are redirected to the new phi; WidenIVR itself is not erased here.
3840 WidenIVR->replaceAllUsesWith(WidePHI);
3841}
3842
3843/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
3844/// initial value, phi and backedge value. In the following example:
3845///
3846/// <x1> vector loop: {
3847/// vector.body:
3848/// EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
3849/// ...
3850/// EMIT branch-on-count ...
3851/// }
3852///
3853/// WIDEN-POINTER-INDUCTION will get expanded to:
3854///
3855/// <x1> vector loop: {
3856/// vector.body:
3857/// EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
3858/// EMIT %mul = mul %stepvector, %step
3859/// EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
3860/// ...
3861/// EMIT %ptr.ind = ptradd %pointer.phi, %vf
3862/// EMIT branch-on-count ...
3863/// }
3865 VPlan *Plan = R->getParent()->getPlan();
3866 VPValue *Start = R->getStartValue();
3867 VPValue *Step = R->getStepValue();
3868 VPValue *VF = R->getVFValue();
3869
     // NOTE(review): this listing skips original lines 3864, 3871 and 3894. The
     // function name/parameter list, the right-hand side of the first assert's
     // comparison and the definition of ExitingBB are presumably on those lines
     // -- confirm against the full file.
3870 assert(R->getInductionDescriptor().getKind() ==
3872 "Not a pointer induction according to InductionDescriptor!");
3873 assert(R->getScalarType()->isPointerTy() && "Unexpected type.");
     // Recipes that only generate scalars must have been replaced before this
     // function is called.
3874 assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
3875 "Recipe should have been replaced");
3876
3877 VPBuilder Builder(R);
3878 DebugLoc DL = R->getDebugLoc();
3879
3880 // Build a scalar pointer phi.
3881 VPPhi *ScalarPtrPhi = Builder.createScalarPhi(Start, DL, "pointer.phi");
3882
3883 // Create actual address geps that use the pointer phi as base and a
3884 // vectorized version of the step value (<step*0, ..., step*N>) as offset.
     // The offset computation is placed at the first non-phi position of R's
     // block.
3885 Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
3886 Type *StepTy = Step->getScalarType();
3887 VPValue *Offset = Builder.createNaryOp(VPInstruction::StepVector, {}, StepTy);
3888 Offset = Builder.createOverflowingOp(Instruction::Mul, {Offset, Step});
3889 VPValue *PtrAdd =
3890 Builder.createWidePtrAdd(ScalarPtrPhi, Offset, DL, "vector.gep");
     // Users are redirected to the wide ptradd; R itself is not erased here.
3891 R->replaceAllUsesWith(PtrAdd);
3892
3893 // Create the backedge value for the scalar pointer phi.
     // The increment is Step * VF (VF zero-extended or truncated to the step
     // type), emitted right before the terminator of ExitingBB.
3895 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3896 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, VF->getScalarType(), DL);
3897 VPValue *Inc = Builder.createOverflowingOp(Instruction::Mul, {Step, VF});
3898
3899 VPValue *InductionGEP =
3900 Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
3901 ScalarPtrPhi->addIncoming(InductionGEP);
3902}
3903
3904/// Expand a VPDerivedIVRecipe into executable recipes.
3906 VPBuilder Builder(R);
     // All uses of R are replaced by Start combined with Index * Step; the exact
     // operations depend on the induction kind. R itself is not erased here.
     // NOTE(review): this listing skips original lines 3905, 3916, 3918, 3924,
     // 3927 and 3939. The function name/parameter list, the tail of the
     // createScalarCast call and the switch's case labels are presumably on those
     // lines -- confirm against the full file.
3907 VPIRValue *Start = R->getStartValue();
3908 VPValue *Step = R->getStepValue();
3909 VPValue *Index = R->getIndex();
3910 Type *StepTy = Step->getScalarType();
3911 Type *IndexTy = Index->getScalarType();
     // Bring Index to the step's scalar type: sign-extend or truncate for integer
     // steps, SIToFP otherwise.
3912 Index = StepTy->isIntegerTy()
3913 ? Builder.createScalarSExtOrTrunc(
3914 Index, StepTy, IndexTy, DebugLoc::getCompilerGenerated())
3915 : Builder.createScalarCast(Instruction::SIToFP, Index, StepTy,
3917 switch (R->getInductionKind()) {
     // First visible case: Start + Index * Step as an integer add; the converted
     // index must already have the start value's scalar type.
3919 assert(Index->getScalarType() == Start->getScalarType() &&
3920 "Index type does not match StartValue type");
3921 return R->replaceAllUsesWith(Builder.createAdd(
3922 Start, Builder.createOverflowingOp(Instruction::Mul, {Index, Step})));
3923 }
     // Second visible case: Start advanced by Index * Step through a ptradd.
3925 return R->replaceAllUsesWith(Builder.createPtrAdd(
3926 Start, Builder.createOverflowingOp(Instruction::Mul, {Index, Step})));
     // Third visible case: floating-point step. The original FAdd/FSub is re-applied
     // to Start and Step * Index, both carrying the original fast-math flags.
3928 assert(StepTy->isFloatingPointTy() && "Expected FP Step value");
3929 const FPMathOperator *FPBinOp = R->getFPBinOp();
3930 assert(FPBinOp &&
3931 (FPBinOp->getOpcode() == Instruction::FAdd ||
3932 FPBinOp->getOpcode() == Instruction::FSub) &&
3933 "Original BinOp should be defined for FP induction");
3934 FastMathFlags FMF = FPBinOp->getFastMathFlags();
3935 VPValue *FMul = Builder.createNaryOp(Instruction::FMul, {Step, Index}, FMF);
3936 return R->replaceAllUsesWith(
3937 Builder.createNaryOp(FPBinOp->getOpcode(), {Start, FMul}, FMF));
3938 }
     // Last visible case: returns without replacing anything.
3940 return;
3941 }
3942 llvm_unreachable("Unhandled induction kind");
3943}
3944
3946 // Replace loop regions with explicit CFG.
     // NOTE(review): this listing skips original lines 3945 and 3948. The function
     // name/parameter list and the head of the for loop that defines R are
     // presumably on those lines -- confirm against the full file.
     // Non-replicator regions are collected first and dissolved in a second loop,
     // presumably so the deep traversal is not disturbed while the CFG changes.
3947 SmallVector<VPRegionBlock *> LoopRegions;
3949 vp_depth_first_deep(Plan.getEntry()))) {
3950 if (!R->isReplicator())
3951 LoopRegions.push_back(R);
3952 }
3953 for (VPRegionBlock *R : LoopRegions)
3954 R->dissolveToCFGLoop();
3955}
3956
// Lower every BranchOnTwoConds terminator into single-condition BranchOnCond
// branches.
// NOTE(review): this listing skips original lines 3957-3958, 3961-3962, 4011 and
// 4015. The function name/parameter list, the WorkList declaration, the block
// traversal header and the two BranchOnCond creations of the general case are
// presumably on those lines -- confirm against the full file.
3959 // The transform runs after dissolving loop regions, so all VPBasicBlocks
3960 // terminated with BranchOnTwoConds are reached via a shallow traversal.
3963 if (!VPBB->empty() && match(&VPBB->back(), m_BranchOnTwoConds()))
3964 WorkList.push_back(cast<VPInstruction>(&VPBB->back()));
3965 }
3966
3967 // Expand BranchOnTwoConds instructions into explicit CFG with two new
3968 // single-condition branches:
3969 // 1. A branch that replaces BranchOnTwoConds, jumps to the first successor if
3970 // the first condition is true, and otherwise jumps to a new interim block.
3971 // 2. A branch that ends the interim block, jumps to the second successor if
3972 // the second condition is true, and otherwise jumps to the third
3973 // successor.
3974 for (VPInstruction *Br : WorkList) {
3975 assert(Br->getNumOperands() == 2 &&
3976 "BranchOnTwoConds must have exactly 2 conditions");
3977 DebugLoc DL = Br->getDebugLoc();
3978 VPBasicBlock *BrOnTwoCondsBB = Br->getParent();
     // Work on a copy of the successor list: the block is disconnected from all
     // of its successors right below and reconnected afterwards.
3979 const auto Successors = to_vector(BrOnTwoCondsBB->getSuccessors());
3980 assert(Successors.size() == 3 &&
3981 "BranchOnTwoConds must have exactly 3 successors");
3982
3983 for (VPBlockBase *Succ : Successors)
3984 VPBlockUtils::disconnectBlocks(BrOnTwoCondsBB, Succ);
3985
3986 VPValue *Cond0 = Br->getOperand(0);
3987 VPValue *Cond1 = Br->getOperand(1);
3988 VPBlockBase *Succ0 = Successors[0];
3989 VPBlockBase *Succ1 = Successors[1];
3990 VPBlockBase *Succ2 = Successors[2];
3991
3992 // If the successor block for both conditions is the same, then combine the
3993 // two conditions and plant a single conditional branch.
3994 if (Succ0 == Succ1) {
3995 VPBuilder Builder(Br);
3996 VPValue *Combined = Builder.createOr(Cond0, Cond1, DL);
3997 Builder.createNaryOp(VPInstruction::BranchOnCond, {Combined}, DL);
     // Successor order after reconnecting: Succ0 first, then Succ2.
3998 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, Succ0);
3999 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, Succ2);
4000 Br->eraseFromParent();
4001 continue;
4002 }
4003
     // General case: none of the involved blocks may still live inside a region.
4004 assert(!Succ0->getParent() && !Succ1->getParent() && !Succ2->getParent() &&
4005 !BrOnTwoCondsBB->getParent() && "regions must already be dissolved");
4006
4007 VPBasicBlock *InterimBB =
4008 Plan.createVPBasicBlock(BrOnTwoCondsBB->getName() + ".interim");
4009
     // The original block now leads to Succ0 or InterimBB.
4010 VPBuilder(BrOnTwoCondsBB)
4012 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, Succ0);
4013 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, InterimBB);
4014
     // The interim block leads to Succ1 or Succ2.
4016 VPBlockUtils::connectBlocks(InterimBB, Succ1);
4017 VPBlockUtils::connectBlocks(InterimBB, Succ2);
4018 Br->eraseFromParent();
4019 }
4020}
4021
// Visit the recipes of each block reached by a deep depth-first walk from the
// plan's entry and lower abstract recipes into concrete ones. Except for
// VPVectorEndPointerRecipe (kept, offset materialized) and BranchOnCount (no
// result to replace), each handled recipe has its uses replaced and is then
// erased; make_early_inc_range keeps the iteration valid while the current
// recipe is erased.
// NOTE(review): this listing skips original lines 4022-4023, 4028, 4043, 4107,
// 4130, 4143, 4166 and 4174. The function name/parameter list, the two
// induction expansion calls, the NotMasks declaration, the MaskedCond and
// CanonicalIVIncrementForPart/WideIVStep match heads and the CastOp declaration
// are presumably on those lines -- confirm against the full file.
4024 vp_depth_first_deep(Plan.getEntry()))) {
4025 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
     // New recipes are inserted directly before the recipe being lowered.
4026 VPBuilder Builder(&R);
4027 if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
4029 WidenIVR->eraseFromParent();
4030 continue;
4031 }
4032
4033 if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
4034 // If the recipe only generates scalars, scalarize it instead of
4035 // expanding it.
4036 if (WidenIVR->onlyScalarsGenerated(Plan.hasScalableVF())) {
4037 VPValue *PtrAdd =
4038 scalarizeVPWidenPointerInduction(WidenIVR, Plan, Builder);
4039 WidenIVR->replaceAllUsesWith(PtrAdd);
4040 WidenIVR->eraseFromParent();
4041 continue;
4042 }
4044 WidenIVR->eraseFromParent();
4045 continue;
4046 }
4047
4048 if (auto *DerivedIVR = dyn_cast<VPDerivedIVRecipe>(&R)) {
4049 expandVPDerivedIV(DerivedIVR);
4050 DerivedIVR->eraseFromParent();
4051 continue;
4052 }
4053
     // Widened canonical IV: vec.iv = broadcast(CanIV) + (broadcast(Step) +
     // stepvector). A missing step is only expected for UF == 1 and is replaced
     // by zero.
4054 if (auto *WideCanIV = dyn_cast<VPWidenCanonicalIVRecipe>(&R)) {
4055 VPValue *CanIV = WideCanIV->getCanonicalIV();
4056 Type *CanIVTy = CanIV->getScalarType();
4057 VPValue *Step = WideCanIV->getStepValue();
4058 if (!Step) {
4059 assert(Plan.getConcreteUF() == 1 &&
4060 "Expected unroller to have materialized step for UF != 1");
4061 Step = Plan.getZero(CanIVTy);
4062 }
4063 CanIV = Builder.createNaryOp(VPInstruction::Broadcast, CanIV);
4064 Step = Builder.createNaryOp(VPInstruction::Broadcast, Step);
4065 Step = Builder.createAdd(
4066 Step, Builder.createNaryOp(VPInstruction::StepVector, {}, CanIVTy));
4067 VPValue *CanVecIV =
4068 Builder.createAdd(CanIV, Step, WideCanIV->getDebugLoc(), "vec.iv",
4069 WideCanIV->getNoWrapFlags());
4070 WideCanIV->replaceAllUsesWith(CanVecIV);
4071 WideCanIV->eraseFromParent();
4072 continue;
4073 }
4074
4075 // Expand VPBlendRecipe into VPInstruction::Select.
     // Starting from incoming value 0, each further incoming value I wraps the
     // running result in select(mask I, value I, running result).
4076 if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
4077 VPValue *Select = Blend->getIncomingValue(0);
4078 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
4079 Select = Builder.createSelect(Blend->getMask(I),
4080 Blend->getIncomingValue(I), Select,
4081 R.getDebugLoc(), "predphi", *Blend);
4082 Blend->replaceAllUsesWith(Select);
4083 Blend->eraseFromParent();
4084 continue;
4085 }
4086
     // Vector end pointers are kept; only a still-missing offset is materialized,
     // which is expected solely for UF == 1.
4087 if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(&R)) {
4088 if (!VEPR->getOffset()) {
4089 assert(Plan.getConcreteUF() == 1 &&
4090 "Expected unroller to have materialized offset for UF != 1");
4091 VEPR->materializeOffset();
4092 }
4093 continue;
4094 }
4095
     // Expression recipes are decomposed and then erased.
4096 if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
4097 Expr->decompose();
4098 Expr->eraseFromParent();
4099 continue;
4100 }
4101
4102 // Expand LastActiveLane into Not + FirstActiveLane + Sub.
4103 auto *LastActiveL = dyn_cast<VPInstruction>(&R);
4104 if (LastActiveL &&
4105 LastActiveL->getOpcode() == VPInstruction::LastActiveLane) {
4106 // Create Not(Mask) for all operands.
4108 for (VPValue *Op : LastActiveL->operands()) {
4109 VPValue *NotMask = Builder.createNot(Op, LastActiveL->getDebugLoc());
4110 NotMasks.push_back(NotMask);
4111 }
4112
4113 // Create FirstActiveLane on the inverted masks.
4114 VPValue *FirstInactiveLane = Builder.createFirstActiveLane(
4115 NotMasks, LastActiveL->getDebugLoc(), "first.inactive.lane");
4116
4117 // Subtract 1 to get the last active lane.
4118 VPValue *One =
4119 Plan.getConstantInt(FirstInactiveLane->getScalarType(), 1);
4120 VPValue *LastLane =
4121 Builder.createSub(FirstInactiveLane, One,
4122 LastActiveL->getDebugLoc(), "last.active.lane");
4123
4124 LastActiveL->replaceAllUsesWith(LastLane);
4125 LastActiveL->eraseFromParent();
4126 continue;
4127 }
4128
4129 // Lower MaskedCond with block mask to LogicalAnd.
     // The result is LogicalAnd(mask, operand 0); an unmasked MaskedCond is not
     // expected to reach this point.
4131 auto *VPI = cast<VPInstruction>(&R);
4132 assert(VPI->isMasked() &&
4133 "Unmasked MaskedCond should be simplified earlier");
4134 VPI->replaceAllUsesWith(Builder.createNaryOp(
4135 VPInstruction::LogicalAnd, {VPI->getMask(), VPI->getOperand(0)}));
4136 VPI->eraseFromParent();
4137 continue;
4138 }
4139
4140 // Lower CanonicalIVIncrementForPart to plain Add.
     // The add keeps the instruction's operands, no-wrap flags and debug location.
4141 if (match(
4142 &R,
4144 auto *VPI = cast<VPInstruction>(&R);
4145 VPValue *Add = Builder.createOverflowingOp(
4146 Instruction::Add, VPI->operands(), VPI->getNoWrapFlags(),
4147 VPI->getDebugLoc());
4148 VPI->replaceAllUsesWith(Add);
4149 VPI->eraseFromParent();
4150 continue;
4151 }
4152
4153 // Lower BranchOnCount to ICmp + BranchOnCond.
     // The new branch is taken on IV == TC.
4154 VPValue *IV, *TC;
4155 if (match(&R, m_BranchOnCount(m_VPValue(IV), m_VPValue(TC)))) {
4156 auto *BranchOnCountInst = cast<VPInstruction>(&R);
4157 DebugLoc DL = BranchOnCountInst->getDebugLoc();
4158 VPValue *Cond = Builder.createICmp(CmpInst::ICMP_EQ, IV, TC, DL);
4159 Builder.createNaryOp(VPInstruction::BranchOnCond, Cond, DL);
4160 BranchOnCountInst->eraseFromParent();
4161 continue;
4162 }
4163
     // Everything that does not match the remaining pattern is left untouched.
4164 VPValue *VectorStep;
4165 VPValue *ScalarStep;
4167 m_VPValue(VectorStep), m_VPValue(ScalarStep))))
4168 continue;
4169
4170 // Expand WideIVStep.
     // Both operands are first cast to the instruction's scalar type when their
     // types differ, then multiplied.
4171 auto *VPI = cast<VPInstruction>(&R);
4172 Type *IVTy = VPI->getScalarType();
4173 if (VectorStep->getScalarType() != IVTy) {
4175 ? Instruction::UIToFP
4176 : Instruction::Trunc;
4177 VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
4178 }
4179
4180 assert(!match(ScalarStep, m_One()) && "Expected non-unit scalar-step");
4181 if (ScalarStep->getScalarType() != IVTy) {
4182 ScalarStep =
4183 Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
4184 }
4185
     // Floating-point IVs use FMul with the instruction's fast-math flags;
     // otherwise Mul with the default flags for that opcode.
4186 VPIRFlags Flags;
4187 unsigned MulOpc;
4188 if (IVTy->isFloatingPointTy()) {
4189 MulOpc = Instruction::FMul;
4190 Flags = VPI->getFastMathFlags();
4191 } else {
4192 MulOpc = Instruction::Mul;
4193 Flags = VPIRFlags::getDefaultFlags(MulOpc);
4194 }
4195
4196 VPInstruction *Mul = Builder.createNaryOp(
4197 MulOpc, {VectorStep, ScalarStep}, Flags, R.getDebugLoc());
4198 VectorStep = Mul;
4199 VPI->replaceAllUsesWith(VectorStep);
4200 VPI->eraseFromParent();
4201 }
4202 }
4203}
4204
4210
4211/// Update \p Plan to mask memory operations in the loop based on whether the
4212/// early exit is taken or not.
4215 VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VPBasicBlock *MiddleVPBB,
4216 Loop *TheLoop, PredicatedScalarEvolution &PSE, DominatorTree &DT,
4217 AssumptionCache *AC, VPDominatorTree &VPDT) {
     // Returns true on success and false when an unsupported situation is found.
     // Several `return false` paths are reached after Plan has already been
     // modified (the early-exit branches are removed before the first check).
     // NOTE(review): this listing skips original lines 4213-4214, 4235, 4246,
     // 4261-4262, 4268, 4280 and 4329. The function name, the Plan and Exits
     // parameters, and the declarations of GEPs, Load, Predicates and Mask are
     // presumably on those lines -- confirm against the full file.
4218
4219 // Disconnect early exiting blocks from successors, remove branches. We
4220 // currently don't support multiple uses for recipes involved in creating
4221 // the uncountable exit condition.
4222 for (auto &Exit : Exits) {
     // Exits taken from the latch itself are left as they are.
4223 if (Exit.EarlyExitingVPBB == LatchVPBB)
4224 continue;
4225
4226 for (VPRecipeBase &R : Exit.EarlyExitVPBB->phis())
4227 cast<VPIRPhi>(&R)->removeIncomingValueFor(Exit.EarlyExitingVPBB);
4228 Exit.EarlyExitingVPBB->getTerminator()->eraseFromParent();
4229 VPBlockUtils::disconnectBlocks(Exit.EarlyExitingVPBB, Exit.EarlyExitVPBB);
4230 }
4231
4232 // We can abandon a VPlan entirely if we return false here, so we shouldn't
4233 // crash if some earlier assumptions on scalar IR don't hold for the vplan
4234 // version of the loop.
4236 SmallVector<VPInstruction *, 8> ConditionRecipes;
4237
     // ConditionRecipes and GEPs are filled by the helper; an empty optional means
     // the exit condition could not be handled.
4238 std::optional<VPValue *> Cond =
4239 vputils::getRecipesForUncountableExit(ConditionRecipes, GEPs, LatchVPBB);
4240 if (!Cond)
4241 return false;
4242
4243 // Find load contributing to condition.
     // NOTE(review): the condition selecting the load is on a missing line (4246).
4244 VPRecipeBase *CondLoad = nullptr;
4245 for (auto *Recipe : ConditionRecipes) {
4247 // TODO: Support more than one load. Needs legality updates too.
4248 assert(CondLoad == nullptr && "Too many condition loads");
4249 CondLoad = Recipe;
4250 }
4251 }
4252 assert(CondLoad && "Couldn't find load");
4253
4254 // Ensure that we are guaranteed to be able to dereference the memory used
4255 // for determining the uncountable exit for the maximum possible number of
4256 // scalar iterations of the loop.
4257 //
4258 // TODO: Support first-faulting loads in cases where we don't know whether
4259 // all possible addresses are dereferenceable.
4260 {
4263 VPValue *Ptr = Load->getOperand(0);
4264 const SCEV *PtrSCEV = vputils::getSCEVExprForVPValue(Ptr, PSE, TheLoop);
4265 const DataLayout &DL = Plan.getDataLayout();
     // EltSize is the store size of the loaded scalar type, expressed in the index
     // type width of the pointer.
4266 APInt EltSize(DL.getIndexTypeSizeInBits(Ptr->getScalarType()),
4267 DL.getTypeStoreSize(Load->getScalarType()).getFixedValue());
     // NOTE(review): the call receiving the arguments below is on a missing line
     // (4268); presumably the function bails out when that check fails.
4269 PtrSCEV, cast<LoadInst>(Load->getUnderlyingInstr())->getAlign(),
4270 PSE.getSE()->getConstant(EltSize), TheLoop, *PSE.getSE(), DT, AC,
4271 &Predicates))
4272 return false;
4273 }
4274
4275 // Check GEPs to see if we can link them to a widen IV recipe with a step of
4276 // 1; we're only interested in contiguous accesses for the condition load
4277 // right now.
4278 for (auto *GEP : GEPs) {
4279 VPValue *MaybeIV = nullptr;
4281 m_LiveIn(), m_VPValue(MaybeIV))))
4282 return false;
4283
4284 auto *WIV = dyn_cast<VPWidenInductionRecipe>(MaybeIV);
4285 if (!WIV)
4286 return false;
4287
     // Only inductions starting at 0 with step 1 are accepted.
4288 if (!match(WIV->getStartValue(), m_SpecificInt(0)) ||
4289 !match(WIV->getStepValue(), m_SpecificInt(1)))
4290 return false;
4291 }
4292
4293 // Find an insertion point. Default to the end of the header but override
4294 // if we find a memory op that needs masking before the condition load.
4295 auto InsertIt = HeaderVPBB->end();
4296 VPRecipeBase *CondR = (*Cond)->getDefiningRecipe();
4297 bool CondMoveNeeded = CondR->getParent() != HeaderVPBB;
     // Only the first memory-accessing recipe in the header other than CondLoad is
     // inspected; the loop stops right after it.
4298 for (VPRecipeBase &R : *HeaderVPBB) {
4299 if (&R == CondLoad)
4300 continue;
4301
4302 if (R.mayReadOrWriteMemory()) {
4303 if (!VPDT.properlyDominates(CondR, &R)) {
4304 CondMoveNeeded = true;
4305 InsertIt = R.getIterator();
4306 }
4307 break;
4308 }
4309 }
4310
4311 // If another memory operation would take place before the comparison to
4312 // determine whether to exit early or the comparison doesn't take place in
4313 // the header, move the comparison (and supporting recipes).
4314 if (CondMoveNeeded)
4315 for (auto *Recipe : reverse(ConditionRecipes))
4316 Recipe->moveBefore(*HeaderVPBB, InsertIt);
4317
4318 // Create a mask to represent all lanes that fully execute in the vector loop,
4319 // stopping short of any early exit.
4320 VPBuilder MaskBuilder(HeaderVPBB, InsertIt);
4321 VPValue *FirstActive = MaskBuilder.createFirstActiveLane(*Cond);
     // The first recipe of the header is used as the IV.
4322 VPValue *IV = cast<VPSingleDefRecipe>(&HeaderVPBB->front());
4323 Type *IVScalarTy = IV->getScalarType();
4324 Type *FirstActiveTy = FirstActive->getScalarType();
4325 VPValue *ALMMultiplier = Plan.getConstantInt(IVScalarTy, 1);
4326 VPValue *Zero = Plan.getZero(IVScalarTy);
     // FirstActive is brought to the IV's scalar type before it is used below.
4327 FirstActive = MaskBuilder.createScalarZExtOrTrunc(FirstActive, IVScalarTy,
4328 FirstActiveTy, DebugLoc());
4330 {Zero, FirstActive, ALMMultiplier},
4331 DebugLoc(), "uncountable.exit.mask");
4332
4333 // Convert all other memory operations to use the mask.
4334 for (VPBasicBlock *VPBB : vp_rpo_plain_cfg_loop_body(HeaderVPBB))
4335 for (VPRecipeBase &R : *VPBB)
4336 if (R.mayReadOrWriteMemory() && &R != CondLoad) {
4337 // TODO: Handle conditional memory operations in the loop.
4338 if (!VPDT.dominates(R.getParent(), LatchVPBB))
4339 return false;
4340 cast<VPInstruction>(&R)->addMask(Mask);
4341 }
4342
4343 // Update middle block branch to compare (IV + however many lanes were active)
4344 // against the full trip count, since we may be exiting the vector loop early.
4345 // If we didn't take an early exit, we should get the equivalent of VF from
4346 // the FirstActiveLane.
4347 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->end());
4348 VPValue *ScalarIV = MiddleBuilder.createNaryOp(VPInstruction::ExtractLane,
4349 {Zero, IV}, DebugLoc());
4350 VPValue *ExitIV = MiddleBuilder.createAdd(ScalarIV, FirstActive);
4351 VPValue *FullTC =
4352 MiddleBuilder.createICmp(CmpInst::ICMP_EQ, ExitIV, Plan.getTripCount());
4353 MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {FullTC});
4354
4355 // Update resume phi in scalar.ph.
4356 VPBasicBlock *ScalarPH = Plan.getScalarPreheader();
4357 auto Phis = ScalarPH->phis();
4358 // TODO: Handle more than one Phi; re-derive from IV.
4359 // TODO: Handle reductions.
4360 if (range_size(Phis) != 1)
4361 return false;
     // Operand 0 of the single resume phi is replaced by the exit IV.
4362 VPPhi *ContinueIV = cast<VPPhi>(Phis.begin());
4363 ContinueIV->setOperand(0, ExitIV);
4364 return true;
4365}
4366
4368 VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB,
4369 VPBasicBlock *MiddleVPBB, Loop *TheLoop, PredicatedScalarEvolution &PSE,
4371 VPDominatorTree VPDT(Plan);
4372 VPBuilder LatchBuilder(LatchVPBB->getTerminator());
4374 for (VPIRBasicBlock *ExitBlock : Plan.getExitBlocks()) {
4375 for (VPBlockBase *Pred : to_vector(ExitBlock->getPredecessors())) {
4376 if (Pred == MiddleVPBB)
4377 continue;
4378 // Collect condition for this early exit.
4379 auto *EarlyExitingVPBB = cast<VPBasicBlock>(Pred);
4380 VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
4381 VPValue *CondOfEarlyExitingVPBB;
4382 [[maybe_unused]] bool Matched =
4383 match(EarlyExitingVPBB->getTerminator(),
4384 m_BranchOnCond(m_VPValue(CondOfEarlyExitingVPBB)));
4385 assert(Matched && "Terminator must be BranchOnCond");
4386
4387 // Insert the MaskedCond in the EarlyExitingVPBB so the predicator adds
4388 // the correct block mask.
4389 VPBuilder EarlyExitingBuilder(EarlyExitingVPBB->getTerminator());
4390 auto *CondToEarlyExit = EarlyExitingBuilder.createNaryOp(
4392 TrueSucc == ExitBlock
4393 ? CondOfEarlyExitingVPBB
4394 : EarlyExitingBuilder.createNot(CondOfEarlyExitingVPBB));
4395 assert((isa<VPIRValue>(CondOfEarlyExitingVPBB) ||
4396 !VPDT.properlyDominates(EarlyExitingVPBB, LatchVPBB) ||
4397 VPDT.properlyDominates(
4398 CondOfEarlyExitingVPBB->getDefiningRecipe()->getParent(),
4399 LatchVPBB)) &&
4400 "exit condition must dominate the latch");
4401 Exits.push_back({
4402 EarlyExitingVPBB,
4403 ExitBlock,
4404 CondToEarlyExit,
4405 });
4406 }
4407 }
4408
4409 assert(!Exits.empty() && "must have at least one early exit");
4410 // Sort exits by RPO order to get correct program order. RPO gives a
4411 // topological ordering of the CFG, ensuring upstream exits are checked
4412 // before downstream exits in the dispatch chain.
4414 HeaderVPBB);
4416 for (const auto &[Num, VPB] : enumerate(RPOT))
4417 RPOIdx[VPB] = Num;
4418 llvm::sort(Exits, [&RPOIdx](const EarlyExitInfo &A, const EarlyExitInfo &B) {
4419 return RPOIdx[A.EarlyExitingVPBB] < RPOIdx[B.EarlyExitingVPBB];
4420 });
4421#ifndef NDEBUG
4422 // After RPO sorting, verify that for any pair where one exit dominates
4423 // another, the dominating exit comes first. This is guaranteed by RPO
4424 // (topological order) and is required for the dispatch chain correctness.
4425 for (unsigned I = 0; I + 1 < Exits.size(); ++I)
4426 for (unsigned J = I + 1; J < Exits.size(); ++J)
4427 assert(!VPDT.properlyDominates(Exits[J].EarlyExitingVPBB,
4428 Exits[I].EarlyExitingVPBB) &&
4429 "RPO sort must place dominating exits before dominated ones");
4430#endif
4431
4432 // Build the AnyOf condition for the latch terminator using logical OR
4433 // to avoid poison propagation from later exit conditions when an earlier
4434 // exit is taken.
4435 VPValue *Combined = Exits[0].CondToExit;
4436 for (const EarlyExitInfo &Info : drop_begin(Exits))
4437 Combined = LatchBuilder.createLogicalOr(Combined, Info.CondToExit);
4438
4439 VPValue *IsAnyExitTaken =
4440 LatchBuilder.createNaryOp(VPInstruction::AnyOf, {Combined});
4441
4442 // Create a comparison for the latch exit condition and replace the
4443 // BranchOnCond with a BranchOnTwoConds. The original BranchOnCond's condition
4444 // is used as the latch-exit condition; canonical IV recipes have not been
4445 // introduced yet, so there is no BranchOnCount to derive the condition from.
4446 auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
4447 assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCond &&
4448 "Unexpected terminator");
4449 VPValue *IsLatchExitTaken = LatchExitingBranch->getOperand(0);
4450 DebugLoc LatchDL = LatchExitingBranch->getDebugLoc();
4451 LatchExitingBranch->eraseFromParent();
4452 LatchBuilder.setInsertPoint(LatchVPBB);
4454 {IsAnyExitTaken, IsLatchExitTaken}, LatchDL);
4455 LatchVPBB->clearSuccessors();
4456
4458 // If handling the exiting lane in the scalar loop, combine the exit
4459 // conditions into a single BranchOnCond.
4460 LatchVPBB->setSuccessors({MiddleVPBB, MiddleVPBB, HeaderVPBB});
4461 MiddleVPBB->clearPredecessors();
4462 MiddleVPBB->setPredecessors({LatchVPBB, LatchVPBB});
4463 return handleUncountableExitsWithSideEffects(Plan, Exits, HeaderVPBB,
4464 LatchVPBB, MiddleVPBB, TheLoop,
4465 PSE, DT, AC, VPDT);
4466 }
4467
4468 // Create the vector.early.exit blocks.
4469 SmallVector<VPBasicBlock *> VectorEarlyExitVPBBs(Exits.size());
4470 for (unsigned Idx = 0; Idx != Exits.size(); ++Idx) {
4471 Twine BlockSuffix = Exits.size() == 1 ? "" : Twine(".") + Twine(Idx);
4472 VPBasicBlock *VectorEarlyExitVPBB =
4473 Plan.createVPBasicBlock("vector.early.exit" + BlockSuffix);
4474 VectorEarlyExitVPBBs[Idx] = VectorEarlyExitVPBB;
4475 }
4476
4477 // Create the dispatch block (or reuse the single exit block if only one
4478 // exit). The dispatch block computes the first active lane of the combined
4479 // condition and, for multiple exits, chains through conditions to determine
4480 // which exit to take.
4481 VPBasicBlock *DispatchVPBB =
4482 Exits.size() == 1 ? VectorEarlyExitVPBBs[0]
4483 : Plan.createVPBasicBlock("vector.early.exit.check");
4484 DispatchVPBB->setPredecessors({LatchVPBB});
4485 LatchVPBB->setSuccessors({DispatchVPBB, MiddleVPBB, HeaderVPBB});
4486 VPBuilder DispatchBuilder(DispatchVPBB, DispatchVPBB->begin());
4487 VPValue *FirstActiveLane = DispatchBuilder.createFirstActiveLane(
4488 {Combined}, DebugLoc::getUnknown(), "first.active.lane");
4489
4490 // For each early exit, disconnect the original exiting block
4491 // (early.exiting.I) from the exit block (ir-bb<exit.I>) and route through a
4492 // new vector.early.exit block. Update ir-bb<exit.I>'s phis to extract their
4493 // values at the first active lane:
4494 //
4495 // Input:
4496 // early.exiting.I:
4497 // ...
4498 // EMIT branch-on-cond vp<%cond.I>
4499 // Successor(s): in.loop.succ, ir-bb<exit.I>
4500 //
4501 // ir-bb<exit.I>:
4502 // IR %phi = phi [ vp<%incoming.I>, early.exiting.I ], ...
4503 //
4504 // Output:
4505 // early.exiting.I:
4506 // ...
4507 // Successor(s): in.loop.succ
4508 //
4509 // vector.early.exit.I:
4510 // EMIT vp<%exit.val> = extract-lane vp<%first.lane>, vp<%incoming.I>
4511 // Successor(s): ir-bb<exit.I>
4512 //
4513 // ir-bb<exit.I>:
4514 // IR %phi = phi ... (extra operand: vp<%exit.val> from
4515 // vector.early.exit.I)
4516 //
4517 for (auto [Exit, VectorEarlyExitVPBB] :
4518 zip_equal(Exits, VectorEarlyExitVPBBs)) {
4519 auto &[EarlyExitingVPBB, EarlyExitVPBB, _] = Exit;
4520 // Adjust the phi nodes in EarlyExitVPBB.
4521 // 1. remove incoming values from EarlyExitingVPBB,
4522 // 2. extract the incoming value at FirstActiveLane
4523 // 3. add back the extracts as last operands for the phis
4524 // Then adjust the CFG, removing the edge between EarlyExitingVPBB and
4525 // EarlyExitVPBB and adding a new edge between VectorEarlyExitVPBB and
4526 // EarlyExitVPBB. The extracts at FirstActiveLane are now the incoming
4527 // values from VectorEarlyExitVPBB.
4528 for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
4529 auto *ExitIRI = cast<VPIRPhi>(&R);
4530 VPValue *IncomingVal =
4531 ExitIRI->getIncomingValueForBlock(EarlyExitingVPBB);
4532 VPValue *NewIncoming = IncomingVal;
4533 if (!isa<VPIRValue>(IncomingVal)) {
4534 VPBuilder EarlyExitBuilder(VectorEarlyExitVPBB);
4535 NewIncoming = EarlyExitBuilder.createNaryOp(
4536 VPInstruction::ExtractLane, {FirstActiveLane, IncomingVal},
4537 DebugLoc::getUnknown(), "early.exit.value");
4538 }
4539 ExitIRI->removeIncomingValueFor(EarlyExitingVPBB);
4540 ExitIRI->addIncoming(NewIncoming);
4541 }
4542
4543 EarlyExitingVPBB->getTerminator()->eraseFromParent();
4544 VPBlockUtils::disconnectBlocks(EarlyExitingVPBB, EarlyExitVPBB);
4545 VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);
4546 }
4547
4548 // Chain through exits: for each exit, check if its condition is true at
4549 // the first active lane. If so, take that exit; otherwise, try the next.
4550 // The last exit needs no check since it must be taken if all others fail.
4551 //
4552 // For 3 exits (cond.0, cond.1, cond.2), this creates:
4553 //
4554 // latch:
4555 // ...
4556 // EMIT vp<%combined> = logical-or vp<%cond.0>, vp<%cond.1>, vp<%cond.2>
4557 // ...
4558 //
4559 // vector.early.exit.check:
4560 // EMIT vp<%first.lane> = first-active-lane vp<%combined>
4561 // EMIT vp<%at.cond.0> = extract-lane vp<%first.lane>, vp<%cond.0>
4562 // EMIT branch-on-cond vp<%at.cond.0>
4563 // Successor(s): vector.early.exit.0, vector.early.exit.check.0
4564 //
4565 // vector.early.exit.check.0:
4566 // EMIT vp<%at.cond.1> = extract-lane vp<%first.lane>, vp<%cond.1>
4567 // EMIT branch-on-cond vp<%at.cond.1>
4568 // Successor(s): vector.early.exit.1, vector.early.exit.2
4569 VPBasicBlock *CurrentBB = DispatchVPBB;
4570 for (auto [I, Exit] : enumerate(ArrayRef(Exits).drop_back())) {
4571 VPValue *LaneVal = DispatchBuilder.createNaryOp(
4572 VPInstruction::ExtractLane, {FirstActiveLane, Exit.CondToExit},
4573 DebugLoc::getUnknown(), "exit.cond.at.lane");
4574
4575 // For the last dispatch, branch directly to the last exit on false;
4576 // otherwise, create a new check block.
4577 bool IsLastDispatch = (I + 2 == Exits.size());
4578 VPBasicBlock *FalseBB =
4579 IsLastDispatch ? VectorEarlyExitVPBBs.back()
4580 : Plan.createVPBasicBlock(
4581 Twine("vector.early.exit.check.") + Twine(I));
4582
4583 DispatchBuilder.createNaryOp(VPInstruction::BranchOnCond, {LaneVal});
4584 CurrentBB->setSuccessors({VectorEarlyExitVPBBs[I], FalseBB});
4585 VectorEarlyExitVPBBs[I]->setPredecessors({CurrentBB});
4586 FalseBB->setPredecessors({CurrentBB});
4587
4588 CurrentBB = FalseBB;
4589 DispatchBuilder.setInsertPoint(CurrentBB);
4590 }
4591
4592 return true;
4593}
4594
4595/// This function tries to convert extended in-loop reductions to
4596/// VPExpressionRecipe and clamp the \p Range if it is beneficial and
4597/// valid. The created recipe must be decomposed to its constituent
4598/// recipes before execution.
/// Returns a newly allocated VPExpressionRecipe bundling the extend that feeds
/// \p Red with \p Red itself, or nullptr if no recipe was created. The
/// returned recipe is not inserted into any block here.
4599static VPExpressionRecipe *
4601 VFRange &Range) {
4602 Type *RedTy = Red->getScalarType();
4603 VPValue *VecOp = Red->getVecOp();
4604
4605 assert(!Red->isPartialReduction() &&
4606 "This path does not support partial reductions");
4607
4608 // Clamp the range if using extended-reduction is profitable.
// Opcode is the reduction opcode, ExtOpc the opcode of the extend feeding the
// reduction and SrcTy the scalar type of the extend's operand.
4609 auto IsExtendedRedValidAndClampRange =
4610 [&](unsigned Opcode, Instruction::CastOps ExtOpc, Type *SrcTy) -> bool {
4612 [&](ElementCount VF) {
4613 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4615
// Costs of the separate extend and reduction recipes, used as the baseline
// the fused extended-reduction cost is compared against.
4617 InstructionCost ExtCost =
4618 cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
4619 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4620
4621 assert(!RedTy->isFloatingPointTy() &&
4622 "getExtendedReductionCost only supports integer types");
4623 ExtRedCost = Ctx.TTI.getExtendedReductionCost(
4624 Opcode, ExtOpc == Instruction::CastOps::ZExt, RedTy, SrcVecTy,
4625 Red->getFastMathFlags(), CostKind);
// Profitable only if the target reports a valid cost that is strictly lower
// than the cost of the extend plus the reduction.
4626 return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
4627 },
4628 Range);
4629 };
4630
4631 VPValue *A;
4632 // Match reduce(ext(A)).
4634 IsExtendedRedValidAndClampRange(
4635 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
4636 cast<VPWidenCastRecipe>(VecOp)->getOpcode(), A->getScalarType()))
4637 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
4638
4639 return nullptr;
4640}
4641
4642/// This function tries to convert in-loop multiply-accumulate reductions to
4643/// VPExpressionRecipe and clamp the \p Range if it is beneficial
4644/// and valid. The created VPExpressionRecipe must be decomposed to its
4645/// constituent recipes before execution. Patterns of the
4646/// VPExpressionRecipe:
4647/// reduce.add(mul(...)),
4648/// reduce.add(mul(ext(A), ext(B))),
4649/// reduce.add(ext(mul(ext(A), ext(B)))).
4650/// reduce.fadd(fmul(ext(A), ext(B)))
/// Returns a newly allocated, not yet inserted VPExpressionRecipe, or nullptr
/// if no pattern matched or bundling is not profitable.
/// NOTE(review): reductions with a floating-point scalar type are rejected
/// below via RedTy->isFloatingPointTy(), so the reduce.fadd pattern does not
/// appear to be produced on this path - confirm.
4651static VPExpressionRecipe *
4653 VPCostContext &Ctx, VFRange &Range) {
4654 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
4655 if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
4656 Opcode != Instruction::FAdd)
4657 return nullptr;
4658
4659 assert(!Red->isPartialReduction() &&
4660 "This path does not support partial reductions");
4661 Type *RedTy = Red->getScalarType();
4662
4663 // Clamp the range if using multiply-accumulate-reduction is profitable.
// Ext0/Ext1 are the (possibly null) extends feeding the multiply and OuterExt
// the (possibly null) extend applied to the multiply's result.
4664 auto IsMulAccValidAndClampRange =
4666 VPWidenCastRecipe *OuterExt) -> bool {
4668 [&](ElementCount VF) {
// Without an inner extend the multiply operates on the reduction type.
4670 Type *SrcTy = Ext0 ? Ext0->getOperand(0)->getScalarType() : RedTy;
4671 InstructionCost MulAccCost;
4672
4673 // getMulAccReductionCost for in-loop reductions does not support
4674 // mixed or floating-point extends.
4675 if (Ext0 && Ext1 &&
4676 (Ext0->getOpcode() != Ext1->getOpcode() ||
4677 Ext0->getOpcode() == Instruction::CastOps::FPExt))
4678 return false;
4679
4680 bool IsZExt =
4681 !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt;
4682 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4683 MulAccCost = Ctx.TTI.getMulAccReductionCost(IsZExt, Opcode, RedTy,
4684 SrcVecTy, CostKind);
4685
// Baseline: the summed cost of the individual recipes that the fused
// multiply-accumulate reduction would replace.
4686 InstructionCost MulCost = Mul->computeCost(VF, Ctx);
4687 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4688 InstructionCost ExtCost = 0;
4689 if (Ext0)
4690 ExtCost += Ext0->computeCost(VF, Ctx);
4691 if (Ext1)
4692 ExtCost += Ext1->computeCost(VF, Ctx);
4693 if (OuterExt)
4694 ExtCost += OuterExt->computeCost(VF, Ctx);
4695
4696 return MulAccCost.isValid() &&
4697 MulAccCost < ExtCost + MulCost + RedCost;
4698 },
4699 Range);
4700 };
4701
4702 VPValue *VecOp = Red->getVecOp();
4703 VPRecipeBase *Sub = nullptr;
4704 VPValue *A, *B;
4705 VPValue *Tmp = nullptr;
4706
4707 if (RedTy->isFloatingPointTy())
4708 return nullptr;
4709
4710 // Sub reductions could have a sub between the add reduction and vec op.
// If so, remember the negation (0 - x) in Sub and continue matching on x.
4711 if (match(VecOp, m_Sub(m_ZeroInt(), m_VPValue(Tmp)))) {
4712 Sub = VecOp->getDefiningRecipe();
4713 VecOp = Tmp;
4714 }
4715
4716 // If ValB is a constant and can be safely extended, truncate it to the same
4717 // type as ExtA's operand, then extend it to the same type as ExtA. This
4718 // creates two uniform extends that can more easily be matched by the rest of
4719 // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
4720 // replaced with the new extend of the constant.
4721 auto ExtendAndReplaceConstantOp = [](VPWidenCastRecipe *ExtA,
4722 VPWidenCastRecipe *&ExtB, VPValue *&ValB,
4723 VPWidenRecipe *Mul) {
// Only applies when the first operand is an extend, the second is not, and
// the second operand is an IR live-in value.
4724 if (!ExtA || ExtB || !isa<VPIRValue>(ValB))
4725 return;
4726 Type *NarrowTy = ExtA->getOperand(0)->getScalarType();
4727 Instruction::CastOps ExtOpc = ExtA->getOpcode();
4728 const APInt *Const;
4729 if (!match(ValB, m_APInt(Const)) ||
4731 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
4732 return;
4733 // The truncate ensures that the type of each extended operand is the
4734 // same, and it's been proven that the constant can be extended from
4735 // NarrowTy safely. Necessary since ExtA's extended operand would be
4736 // e.g. an i8, while the const will likely be an i32. This will be
4737 // elided by later optimisations.
4738 VPBuilder Builder(Mul);
4739 auto *Trunc =
4740 Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy);
4741 Type *WideTy = ExtA->getScalarType();
4742 ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy);
4743 Mul->setOperand(1, ExtB);
4744 };
4745
4746 // Try to match reduce.add(mul(...)).
4747 if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
4748 auto *RecipeA = dyn_cast<VPWidenCastRecipe>(A);
4749 auto *RecipeB = dyn_cast<VPWidenCastRecipe>(B);
4750 auto *Mul = cast<VPWidenRecipe>(VecOp);
4751
4752 // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
4753 ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);
4754
4755 // Match reduce.add/sub(mul(ext, ext)).
4756 if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
4757 match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
4758 IsMulAccValidAndClampRange(Mul, RecipeA, RecipeB, nullptr)) {
// Include the negation in the bundle when one was peeled off above.
4759 if (Sub)
4760 return new VPExpressionRecipe(RecipeA, RecipeB, Mul,
4761 cast<VPWidenRecipe>(Sub), Red);
4762 return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
4763 }
4764 // TODO: Add an expression type for this variant with a negated mul
4765 if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
4766 return new VPExpressionRecipe(Mul, Red);
4767 }
4768 // TODO: Add an expression type for negated versions of other expression
4769 // variants.
4770 if (Sub)
4771 return nullptr;
4772
4773 // Match reduce.add(ext(mul(A, B))).
4774 if (match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) {
4775 auto *Ext = cast<VPWidenCastRecipe>(VecOp);
4776 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
4777 auto *Ext0 = dyn_cast<VPWidenCastRecipe>(A);
4778 auto *Ext1 = dyn_cast<VPWidenCastRecipe>(B);
4779
4780 // reduce.add(ext(mul(ext, const)))
4781 // -> reduce.add(ext(mul(ext, ext(const))))
4782 ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);
4783
4784 // reduce.add(ext(mul(ext(A), ext(B))))
4785 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
4786 // The inner extends must either have the same opcode as the outer extend or
4787 // be the same, in which case the multiply can never result in a negative
4788 // value and the outer extend can be folded away by doing wider
4789 // extends for the operands of the mul.
// NOTE(review): the cost/clamp check is evaluated before Mul->hasOneUse();
// if it clamps Range as its name suggests, Range may be narrowed even when
// the transform is then skipped - confirm this is intended.
4790 if (Ext0 && Ext1 &&
4791 (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
4792 Ext0->getOpcode() == Ext1->getOpcode() &&
4793 IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
// Re-create the inner extends so they extend directly to the outer
// extend's result type.
4794 auto *NewExt0 = new VPWidenCastRecipe(
4795 Ext0->getOpcode(), Ext0->getOperand(0), Ext->getScalarType(), nullptr,
4796 *Ext0, *Ext0, Ext0->getDebugLoc());
4797 NewExt0->insertBefore(Ext0);
4798
// When both multiply operands are the same extend, reuse the single new one.
4799 VPWidenCastRecipe *NewExt1 = NewExt0;
4800 if (Ext0 != Ext1) {
4801 NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
4802 Ext->getScalarType(), nullptr, *Ext1,
4803 *Ext1, Ext1->getDebugLoc());
4804 NewExt1->insertBefore(Ext1);
4805 }
// Replace the outer extend with a multiply of the widened operands, then
// drop the now-unused outer extend and original multiply.
4806 auto *NewMul = Mul->cloneWithOperands({NewExt0, NewExt1});
4807 NewMul->insertBefore(Mul);
4808 Ext->replaceAllUsesWith(NewMul);
4809 Ext->eraseFromParent();
4810 Mul->eraseFromParent();
4811 return new VPExpressionRecipe(NewExt0, NewExt1, NewMul, Red);
4812 }
4813 }
4814 return nullptr;
4815}
4816
4817/// This function tries to create abstract recipes from the reduction recipe for
4818/// following optimizations and cost estimation.
/// A multiply-accumulate bundle is tried first, then an extended-reduction
/// bundle. If either succeeds, the new recipe is inserted right after \p Red
/// and all uses of \p Red are redirected to it; otherwise nothing changes.
4820 VPCostContext &Ctx,
4821 VFRange &Range) {
4822 // Creation of VPExpressions for partial reductions is entirely handled in
4823 // transformToPartialReduction.
4824 assert(!Red->isPartialReduction() &&
4825 "This path does not support partial reductions");
4826
4827 VPExpressionRecipe *AbstractR = nullptr;
// Record the insertion position (just after Red) and its block up front,
// before the matchers below get a chance to add or erase recipes.
4828 auto IP = std::next(Red->getIterator());
4829 auto *VPBB = Red->getParent();
4830 if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
4831 AbstractR = MulAcc;
4832 else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
4833 AbstractR = ExtRed;
4834 // Cannot create abstract inloop reduction recipes.
4835 if (!AbstractR)
4836 return;
4837
4838 AbstractR->insertBefore(*VPBB, IP);
4839 Red->replaceAllUsesWith(AbstractR);
4840}
4841
4852
// Insert explicit Broadcast VPInstructions for values defined outside the
// loop (the backedge-taken count, the plan's live-ins and values defined in
// the plan's entry block) and redirect their non-scalar users to the
// broadcast. Nothing is done for plans that only have a scalar VF.
4854 if (Plan.hasScalarVFOnly())
4855 return;
4856
// The dominator tree is only needed by the assertion below.
4857#ifndef NDEBUG
4858 VPDominatorTree VPDT(Plan);
4859#endif
4860
// Collect all candidate values that may need broadcasting.
4861 SmallVector<VPValue *> VPValues;
4862 if (VPValue *BTC = Plan.getBackedgeTakenCount())
4863 VPValues.push_back(BTC);
4864 append_range(VPValues, Plan.getLiveIns());
4865 for (VPRecipeBase &R : *Plan.getEntry())
4866 append_range(VPValues, R.definedValues());
4867
4868 auto *VectorPreheader = Plan.getVectorPreheader();
4869 for (VPValue *VPV : VPValues) {
// Skipped values include IR live-ins that are constants.
4871 (isa<VPIRValue>(VPV) && isa<Constant>(VPV->getLiveInIRValue())))
4872 continue;
4873
4874 // Add explicit broadcast at the insert point that dominates all users.
// By default the broadcast goes at the end of the vector preheader; if a
// non-scalar user lives in the preheader itself, it goes at its beginning.
4875 VPBasicBlock *HoistBlock = VectorPreheader;
4876 VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
4877 for (VPUser *User : VPV->users()) {
4878 if (User->usesScalars(VPV))
4879 continue;
4880 if (cast<VPRecipeBase>(User)->getParent() == VectorPreheader)
4881 HoistPoint = HoistBlock->begin();
4882 else
4883 assert(VPDT.dominates(VectorPreheader,
4884 cast<VPRecipeBase>(User)->getParent()) &&
4885 "All users must be in the vector preheader or dominated by it");
4886 }
4887
4888 VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
4889 auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
// Rewrite only users that need the vector value; the broadcast itself must
// keep using the original scalar value.
4890 VPV->replaceUsesWithIf(Broadcast,
4891 [VPV, Broadcast](VPUser &U, unsigned Idx) {
4892 return Broadcast != &U && !U.usesScalars(VPV);
4893 });
4894 }
4895}
4896
4897// Collect common metadata from a group of replicate recipes by intersecting
4898// metadata from all recipes in the group.
// The result starts as a copy of the first recipe's metadata, so the group is
// expected to be non-empty (front() is read unconditionally).
4900 VPIRMetadata CommonMetadata = *Recipes.front();
4901 for (VPReplicateRecipe *Recipe : drop_begin(Recipes))
4902 CommonMetadata.intersect(*Recipe);
4903 return CommonMetadata;
4904}
4905
// Collect groups of predicated load or store replicate recipes (selected by
// the Opcode template argument) in which at least one pair of members has
// complementary masks. Returns the list of such groups.
4906template <unsigned Opcode>
4910 const Loop *L) {
4911 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
4912 "Only Load and Store opcodes supported");
4913 [[maybe_unused]] constexpr bool IsLoad = (Opcode == Instruction::Load);
4914
4915 // For each address, collect operations with the same or complementary masks.
// Only predicated replicate recipes are considered (see the filter lambda).
4918 Plan, PSE, L,
4919 [](VPReplicateRecipe *RepR) { return RepR->isPredicated(); });
4920 for (auto Recipes : Groups) {
// A single access cannot form a complementary pair.
4921 if (Recipes.size() < 2)
4922 continue;
4923
4925 map_range(Recipes, bind_back<getLoadStoreValueType>(IsLoad))) &&
4926 "Expected all recipes in group to have the same load-store type");
4927
4928 // Collect groups with the same or complementary masks.
// Entries of Recipes are nulled out once they were moved into a group.
4929 for (VPReplicateRecipe *&RecipeI : Recipes) {
4930 if (!RecipeI)
4931 continue;
4932
4933 VPValue *MaskI = RecipeI->getMask();
4935 Group.push_back(RecipeI);
4936 RecipeI = nullptr;
4937
4938 // Find all operations with the same or complementary masks.
// Every remaining recipe is appended to the group regardless of its mask;
// the masks only decide (via HasComplementaryMask) whether the group is
// kept.
4939 bool HasComplementaryMask = false;
4940 for (VPReplicateRecipe *&RecipeJ : Recipes) {
4941 if (!RecipeJ)
4942 continue;
4943
4944 VPValue *MaskJ = RecipeJ->getMask();
4945 // Check if any operation in the group has a complementary mask with
4946 // another, that is M1 == NOT(M2) or M2 == NOT(M1).
4947 HasComplementaryMask |= match(MaskI, m_Not(m_Specific(MaskJ))) ||
4948 match(MaskJ, m_Not(m_Specific(MaskI)));
4949 Group.push_back(RecipeJ);
4950 RecipeJ = nullptr;
4951 }
4952
4953 if (HasComplementaryMask) {
4954 assert(Group.size() >= 2 && "must have at least 2 entries");
4955 AllGroups.push_back(std::move(Group));
4956 }
4957 }
4958 }
4959
4960 return AllGroups;
4961}
4962
4963// Find the recipe with minimum alignment in the group.
// InstType is the IR instruction class the recipes' underlying instructions
// are cast to in order to query getAlign(); callers in this file pass LoadInst
// and StoreInst. The group must be non-empty, because the iterator returned by
// min_element is dereferenced unconditionally.
4964template <typename InstType>
4965static VPReplicateRecipe *
4967 return *min_element(Group, [](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4968 return cast<InstType>(A->getUnderlyingInstr())->getAlign() <
4969 cast<InstType>(B->getUnderlyingInstr())->getAlign();
4970 });
4971}
4972
// Replace each collected group of predicated loads with a single unpredicated
// load placed before the first load of the group, provided the alias check
// between the first and last member's blocks succeeds.
4975 const Loop *L) {
4976 auto Groups =
4978 if (Groups.empty())
4979 return;
4980
4981 // Process each group of loads.
4982 for (auto &Group : Groups) {
4983 // Try to use the earliest (most dominating) load to replace all others.
4984 VPReplicateRecipe *EarliestLoad = Group[0];
4985 VPBasicBlock *FirstBB = EarliestLoad->getParent();
4986 VPBasicBlock *LastBB = Group.back()->getParent();
4987
4988 // Check that the load doesn't alias with stores between first and last.
// Groups whose memory location cannot be determined are skipped as well.
4989 auto LoadLoc = vputils::getMemoryLocation(*EarliestLoad);
4990 if (!LoadLoc || !canHoistOrSinkWithNoAliasCheck(*LoadLoc, FirstBB, LastBB))
4991 continue;
4992
4993 // Collect common metadata from all loads in the group.
4994 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
4995
4996 // Find the load with minimum alignment to use.
// Its underlying instruction is used for the new load below.
4997 auto *LoadWithMinAlign = findRecipeWithMinAlign<LoadInst>(Group);
4998
4999 bool IsSingleScalar = EarliestLoad->isSingleScalar();
5000 assert(all_of(Group,
5001 [IsSingleScalar](VPReplicateRecipe *R) {
5002 return R->isSingleScalar() == IsSingleScalar;
5003 }) &&
5004 "all members in group must agree on IsSingleScalar");
5005
5006 // Create an unpredicated version of the earliest load with common
5007 // metadata.
// The address operand is taken from the earliest load; no mask is passed.
5008 auto *UnpredicatedLoad = new VPReplicateRecipe(
5009 LoadWithMinAlign->getUnderlyingInstr(), {EarliestLoad->getOperand(0)},
5010 IsSingleScalar, /*Mask=*/nullptr, *EarliestLoad, CommonMetadata);
5011
5012 UnpredicatedLoad->insertBefore(EarliestLoad);
5013
5014 // Replace all loads in the group with the unpredicated load.
5015 for (VPReplicateRecipe *Load : Group) {
5016 Load->replaceAllUsesWith(UnpredicatedLoad);
5017 Load->eraseFromParent();
5018 }
5019 }
5020}
5021
// Returns true if the group of stores can be sunk according to
// canHoistOrSinkWithNoAliasCheck, checked over the blocks from the first
// store's parent to the last store's parent. Returns false if the memory
// location of the first store cannot be determined or carries no alias scope
// metadata (AATags.Scope is null).
5022static bool
5024 PredicatedScalarEvolution &PSE, const Loop &L) {
5025 auto StoreLoc = vputils::getMemoryLocation(*StoresToSink.front());
5026 if (!StoreLoc || !StoreLoc->AATags.Scope)
5027 return false;
5028
5029 // When sinking a group of stores, all members of the group alias each other.
5030 // Skip them during the alias checks.
5031 SmallPtrSet<VPRecipeBase *, 4> StoresToSinkSet(StoresToSink.begin(),
5032 StoresToSink.end());
5033
5034 VPBasicBlock *FirstBB = StoresToSink.front()->getParent();
5035 VPBasicBlock *LastBB = StoresToSink.back()->getParent();
5036 SinkStoreInfo SinkInfo(StoresToSinkSet, *StoresToSink[0], PSE, L);
5037 return canHoistOrSinkWithNoAliasCheck(*StoreLoc, FirstBB, LastBB, SinkInfo);
5038}
5039
// Replace each collected group of predicated stores with a single
// unpredicated store at the position of the group's last store. The stored
// value is chosen by a chain of selects over the members' masks.
5042 const Loop *L) {
5043 auto Groups =
5045 if (Groups.empty())
5046 return;
5047
5048 for (auto &Group : Groups) {
5049 if (!canSinkStoreWithNoAliasCheck(Group, PSE, *L))
5050 continue;
5051
5052 // Use the last (most dominated) store's location for the unconditional
5053 // store.
5054 VPReplicateRecipe *LastStore = Group.back();
5055 VPBasicBlock *InsertBB = LastStore->getParent();
5056
5057 // Collect common alias metadata from all stores in the group.
5058 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
5059
5060 // Build select chain for stored values.
// Start with the first store's value; each later store's value replaces the
// running value when that store's mask is set.
5061 VPValue *SelectedValue = Group[0]->getOperand(0);
5062 VPBuilder Builder(InsertBB, LastStore->getIterator());
5063
5064 bool IsSingleScalar = Group[0]->isSingleScalar();
5065 for (unsigned I = 1; I < Group.size(); ++I) {
5066 assert(IsSingleScalar == Group[I]->isSingleScalar() &&
5067 "all members in group must agree on IsSingleScalar");
5068 VPValue *Mask = Group[I]->getMask();
5069 VPValue *Value = Group[I]->getOperand(0);
5070 SelectedValue = Builder.createSelect(Mask, Value, SelectedValue,
5071 Group[I]->getDebugLoc());
5072 }
5073
5074 // Find the store with minimum alignment to use.
5075 auto *StoreWithMinAlign = findRecipeWithMinAlign<StoreInst>(Group);
5076
5077 // Create unconditional store with selected value and common metadata.
// The address operand (operand 1) is taken from the last store.
5078 auto *UnpredicatedStore = new VPReplicateRecipe(
5079 StoreWithMinAlign->getUnderlyingInstr(),
5080 {SelectedValue, LastStore->getOperand(1)}, IsSingleScalar,
5081 /*Mask=*/nullptr, *LastStore, CommonMetadata);
5082 UnpredicatedStore->insertBefore(*InsertBB, LastStore->getIterator());
5083
5084 // Remove all predicated stores from the group.
5085 for (VPReplicateRecipe *Store : Group)
5086 Store->eraseFromParent();
5087 }
5088}
5089
// If the trip count is a constant and the vector trip count folds to a
// constant for the chosen \p BestVF and \p BestUF, set that constant as the
// underlying IR value of the plan's vector trip count. Otherwise the plan is
// left unchanged.
5091 VPlan &Plan, ElementCount BestVF, unsigned BestUF,
5093 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
5094 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
5095
// Nothing to do if the trip count is unused.
5096 VPValue *TC = Plan.getTripCount();
5097 if (TC->getNumUsers() == 0)
5098 return;
5099
5100 // Skip cases for which the trip count may be non-trivial to materialize.
5101 // I.e., when a scalar tail is absent - due to tail folding, or when a scalar
5102 // tail is required.
// Trip counts that are not IR live-in values are skipped as well.
5103 if (!Plan.hasScalarTail() ||
5105 Plan.getScalarPreheader() ||
5106 !isa<VPIRValue>(TC))
5107 return;
5108
5109 // Materialize vector trip counts for constants early if it can simply
5110 // be computed as (Original TC / VF * UF) * VF * UF.
5111 // TODO: Compute vector trip counts for loops requiring a scalar epilogue and
5112 // tail-folded loops.
5113 ScalarEvolution &SE = *PSE.getSE();
5114 auto *TCScev = SE.getSCEV(TC->getLiveInIRValue());
5115 if (!isa<SCEVConstant>(TCScev))
5116 return;
5117 const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF);
5118 auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF);
// Only record the result if SCEV folded the expression to a constant.
5119 if (auto *ConstVecTC = dyn_cast<SCEVConstant>(VecTCScev))
5120 Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
5121}
5122
// Materialize the backedge-taken count as "trip count - 1" at the beginning
// of \p VectorPH and replace all uses of BTC with it. Nothing is emitted when
// BTC has no users.
5124 VPBasicBlock *VectorPH) {
5126 if (BTC->getNumUsers() == 0)
5127 return;
5128
5129 VPBuilder Builder(VectorPH, VectorPH->begin());
5130 auto *TCTy = Plan.getTripCount()->getScalarType();
5131 auto *TCMO =
5132 Builder.createSub(Plan.getTripCount(), Plan.getConstantInt(TCTy, 1),
5133 DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
5134 BTC->replaceAllUsesWith(TCMO);
5135}
5136
// Make conversions between per-lane scalar values and vectors explicit:
// first insert BuildVector/BuildStructVector-style VPInstructions after
// definitions that have vector users, then insert Unpack VPInstructions for
// definitions inside the loop region that have scalar users. Nothing is done
// for plans that only have a scalar VF.
5138 if (Plan.hasScalarVFOnly())
5139 return;
5140
5141 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
5142 auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
5144 auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
5145 vp_depth_first_shallow(LoopRegion->getEntry()));
5146 // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes,
5147 // VPScalarIVStepsRecipe and VPInstructions, excluding ones in replicate
5148 // regions. Those are not materialized explicitly yet.
5149 // TODO: materialize build vectors for replicating recipes in replicating
5150 // regions.
5151 for (VPBasicBlock *VPBB :
5152 concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
5153 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
5155 continue;
5156 auto *DefR = cast<VPSingleDefRecipe>(&R);
// A user needs the packed value if it does not use DefR's scalars, or if it
// lives in a region other than the vector loop region.
5157 auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
5158 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
5159 return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
5160 };
// Skip single-scalar replicate recipes and definitions without any user
// that needs the packed value.
5161 if ((isa<VPReplicateRecipe>(DefR) &&
5162 cast<VPReplicateRecipe>(DefR)->isSingleScalar()) ||
5163 (isa<VPInstruction>(DefR) &&
5165 !cast<VPInstruction>(DefR)->doesGeneratePerAllLanes())) ||
5166 none_of(DefR->users(), UsesVectorOrInsideReplicateRegion))
5167 continue;
5168
// The opcode is chosen based on whether the scalar type is a struct.
5169 Type *ScalarTy = DefR->getScalarType();
5170 unsigned Opcode = ScalarTy->isStructTy()
5173 auto *BuildVector = new VPInstruction(Opcode, {DefR});
5174 BuildVector->insertAfter(DefR);
5175
// Redirect the qualifying users, but never the new instruction itself.
5176 DefR->replaceUsesWithIf(
5177 BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
5178 VPUser &U, unsigned) {
5179 return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
5180 });
5181 }
5182 }
5183
5184 // Create explicit VPInstructions to convert vectors to scalars. The current
5185 // implementation is conservative - it may miss some cases that may or may not
5186 // be vector values. TODO: introduce Unpacks speculatively - remove them later
5187 // if they are known to operate on scalar values.
5188 for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
5189 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
5191 VPDerivedIVRecipe>(&R))
5192 continue;
5193 for (VPValue *Def : R.definedValues()) {
5194 // Skip recipes that are single-scalar or only have their first lane
5195 // used.
5196 // TODO: The Defs skipped here may or may not be vector values.
5197 // Introduce Unpacks, and remove them later, if they are guaranteed to
5198 // produce scalar values.
5200 continue;
5201
5202 // At the moment, we create unpacks only for scalar users outside
5203 // replicate regions. Recipes inside replicate regions still extract the
5204 // required lanes implicitly.
5205 // TODO: Remove once replicate regions are unrolled completely.
5206 auto IsCandidateUnpackUser = [Def](VPUser *U) {
5207 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
5208 return U->usesScalars(Def) &&
5209 (!ParentRegion || !ParentRegion->isReplicator());
5210 };
5211 if (none_of(Def->users(), IsCandidateUnpackUser))
5212 continue;
5213
5214 auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
// For phi recipes the unpack is placed at the block's first non-phi
// position rather than directly after the phi.
5215 if (R.isPhi())
5216 Unpack->insertBefore(*VPBB, VPBB->getFirstNonPhi());
5217 else
5218 Unpack->insertAfter(&R);
5219 Def->replaceUsesWithIf(Unpack,
5220 [&IsCandidateUnpackUser](VPUser &U, unsigned) {
5221 return IsCandidateUnpackUser(&U);
5222 });
5223 }
5224 }
5225 }
5226}
5227
// Materialize the vector trip count of \p Plan in \p VectorPHVPBB and replace
// all uses of the plan's symbolic vector trip count with the result.
//  - TailByMasking: round the trip count up to a multiple of \p Step first.
//  - RequiresScalarEpilogue: make sure the remainder is never zero, so at
//    least one iteration is left for the remainder loop.
//  - Step: the number of iterations executed per vector loop iteration.
//  - MaxRuntimeStep: if set, the largest value \p Step can take at runtime;
//    used to skip the computation for suitable constant trip counts.
5229 VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking,
5230 bool RequiresScalarEpilogue, VPValue *Step,
5231 std::optional<uint64_t> MaxRuntimeStep) {
5232 VPSymbolicValue &VectorTC = Plan.getVectorTripCount();
5233 // There's nothing to do if there are no users of the vector trip count or its
5234 // IR value has already been set.
5235 if (VectorTC.getNumUsers() == 0 || VectorTC.getUnderlyingValue())
5236 return;
5237
5238 VPValue *TC = Plan.getTripCount();
5239 Type *TCTy = TC->getScalarType();
5240 VPBasicBlock::iterator InsertPt = VectorPHVPBB->begin();
5241 if (auto *StepR = Step->getDefiningRecipe()) {
5242 assert(VPDominatorTree(Plan).dominates(StepR->getParent(), VectorPHVPBB) &&
5243 "Step VPBB must dominate VectorPHVPBB");
5244 // Insert after Step's definition to maintain valid def-use ordering.
5245 InsertPt = std::next(StepR->getIterator());
5246 }
5247 VPBuilder Builder(VectorPHVPBB, InsertPt);
5248
5249 // For scalable steps, if TC is a constant and is divisible by the maximum
5250 // possible runtime step, then TC % Step == 0 for all valid vscale values
5251 // and the vector trip count equals TC directly.
5252 const APInt *TCVal;
5253 if (!RequiresScalarEpilogue && match(TC, m_APInt(TCVal)) && MaxRuntimeStep &&
5254 TCVal->getZExtValue() % *MaxRuntimeStep == 0) {
5255 VectorTC.replaceAllUsesWith(TC);
5256 return;
5257 }
5258
5259 // If the tail is to be folded by masking, round the number of iterations N
5260 // up to a multiple of Step instead of rounding down. This is done by first
5261 // adding Step-1 and then rounding down. Note that it's ok if this addition
5262 // overflows: the vector induction variable will eventually wrap to zero given
5263 // that it starts at zero and its Step is a power of two; the loop will then
5264 // exit, with the last early-exit vector comparison also producing all-true.
5265 if (TailByMasking) {
5266 TC = Builder.createAdd(
5267 TC, Builder.createSub(Step, Plan.getConstantInt(TCTy, 1)),
5268 DebugLoc::getCompilerGenerated(), "n.rnd.up");
5269 }
5270
5271 // Now we need to generate the expression for the part of the loop that the
5272 // vectorized body will execute. This is equal to N - (N % Step) if scalar
5273 // iterations are not required for correctness; if they are required and Step
5274 // evenly divides N, it is N - Step instead (see below). Step is equal to the
5275 // vectorization factor (number of SIMD elements) times the unroll factor
// (number of SIMD instructions).
5276 VPValue *R =
5277 Builder.createNaryOp(Instruction::URem, {TC, Step},
5278 DebugLoc::getCompilerGenerated(), "n.mod.vf");
5279
5280 // There are cases where we *must* run at least one iteration in the remainder
5281 // loop. See the cost model for when this can happen. If the step evenly
5282 // divides the trip count, we set the remainder to be equal to the step. If
5283 // the step does not evenly divide the trip count, no adjustment is necessary
5284 // since there will already be scalar iterations. Note that the minimum
5285 // iterations check ensures that N >= Step.
5286 if (RequiresScalarEpilogue) {
5287 assert(!TailByMasking &&
5288 "requiring scalar epilogue is not supported with tail folding");
5289 VPValue *IsZero =
5290 Builder.createICmp(CmpInst::ICMP_EQ, R, Plan.getZero(TCTy));
5291 R = Builder.createSelect(IsZero, Step, R);
5292 }
5293
// The vector trip count is the (possibly rounded-up) trip count minus the
// remainder computed above.
5294 VPValue *Res =
5295 Builder.createSub(TC, R, DebugLoc::getCompilerGenerated(), "n.vec");
5296 VectorTC.replaceAllUsesWith(Res);
5297}
5298
// Materialize the plan's symbolic VF and VFxUF values at the beginning of
// \p VectorPH for the element count \p VFEC and the plan's concrete UF, and
// replace all uses of the symbolic values with the materialized ones.
5300 ElementCount VFEC) {
5301 // If VF and VFxUF have already been materialized (no remaining users),
5302 // there's nothing more to do.
5303 if (Plan.getVF().isMaterialized()) {
5304 assert(Plan.getVFxUF().isMaterialized() &&
5305 "VF and VFxUF must be materialized together");
5306 return;
5307 }
5308
5309 VPBuilder Builder(VectorPH, VectorPH->begin());
5310 Type *TCTy = Plan.getTripCount()->getScalarType();
5311 VPValue &VF = Plan.getVF();
5312 VPValue &VFxUF = Plan.getVFxUF();
5313 // If there are no users of the runtime VF, compute VFxUF by constant folding
5314 // the multiplication of VF and UF.
5315 if (VF.getNumUsers() == 0) {
5316 VPValue *RuntimeVFxUF =
5317 Builder.createElementCount(TCTy, VFEC * Plan.getConcreteUF());
5318 VFxUF.replaceAllUsesWith(RuntimeVFxUF);
5319 return;
5320 }
5321
5322 // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
5323 // vscale) * UF.
5324 VPValue *RuntimeVF = Builder.createElementCount(TCTy, VFEC);
// Users that do not use VF's scalar value get a broadcast of the runtime VF.
5326 VPValue *BC = Builder.createNaryOp(VPInstruction::Broadcast, RuntimeVF);
5328 BC, [&VF](VPUser &U, unsigned) { return !U.usesScalars(&VF); });
5329 }
5330 VF.replaceAllUsesWith(RuntimeVF);
5331
// NOTE(review): the {true, false} argument presumably sets the wrap flags
// (no-unsigned-wrap, but not no-signed-wrap) on the multiply - confirm.
5332 VPValue *MulByUF = Builder.createOverflowingOp(
5333 Instruction::Mul,
5334 {RuntimeVF, Plan.getConstantInt(TCTy, Plan.getConcreteUF())},
5335 {true, false});
5336 VFxUF.replaceAllUsesWith(MulByUF);
5337}
5338
5340 VPSingleDefRecipe *HeaderMask = vputils::findHeaderMask(Plan);
5341 auto *HeaderMaskDef = HeaderMask->getDefiningRecipe();
5342 Type *I1Ty = IntegerType::getInt1Ty(Plan.getContext());
5343
5344 VPBuilder Builder(Plan.getVectorPreheader());
5345 auto *AliasMask = Builder.createNaryOp(
5346 VPInstruction::IncomingAliasMask, {}, nullptr, {}, {},
5347 DebugLoc::getUnknown(), "incoming.alias.mask", I1Ty);
5348
5349 if (HeaderMaskDef->isPhi())
5350 Builder = VPBuilder(&*HeaderMaskDef->getParent()->getFirstNonPhi());
5351 else
5352 Builder = VPBuilder::getToInsertAfter(HeaderMaskDef);
5353
5354 // Update all existing users of the header mask to "HeaderMask & AliasMask".
5355 auto *ClampedHeaderMask = Builder.createAnd(HeaderMask, AliasMask);
5356 HeaderMask->replaceUsesWithIf(ClampedHeaderMask, [&](VPUser &U, unsigned) {
5357 return &U != ClampedHeaderMask;
5358 });
5359}
5360
5361VPValue *
5363 ArrayRef<PointerDiffInfo> DiffChecks) {
5364 VPBuilder Builder(AliasCheckVPBB);
5365 Type *I1Ty = IntegerType::getInt1Ty(Plan.getContext());
5366
5367 VPValue *IncomingAliasMask = vputils::findIncomingAliasMask(Plan);
5368 assert(IncomingAliasMask && "Expected an alias mask!");
5369
5370 VPValue *AliasMask = nullptr;
5371 for (const PointerDiffInfo &Check : DiffChecks) {
5373 VPValue *Sink =
5375 Type *AddrType = Src->getScalarType();
5376
5377 // TODO: Only freeze the required pointer (not both src and sink).
5378 if (Check.NeedsFreeze) {
5379 Src = Builder.createScalarFreeze(Src, AddrType, DebugLoc::getUnknown());
5380 Sink = Builder.createScalarFreeze(Sink, AddrType, DebugLoc::getUnknown());
5381 }
5382
5383 // TODO: Generate loop_dependence_raw_mask when there's a read-after-write
5384 // dependency between the source and the sink. This is not necessary for
5385 // correctness of the mask, but using the "raw" variant prevents loads
5386 // depending on the completion of stores.
5387 VPWidenIntrinsicRecipe *WARMask = Builder.insert(new VPWidenIntrinsicRecipe(
5388 Intrinsic::loop_dependence_war_mask,
5389 {Src, Sink, Plan.getConstantInt(AddrType, Check.AccessSize)}, I1Ty));
5390
5391 if (AliasMask)
5392 AliasMask = Builder.createAnd(AliasMask, WARMask);
5393 else
5394 AliasMask = WARMask;
5395 }
5396
5398 Type *IndexTy = Plan.getDataLayout().getIndexType(Plan.getContext(), 0);
5399 VPValue *NumActive = Builder.createNaryOp(
5400 VPInstruction::NumActiveLanes, {AliasMask}, nullptr, {}, {},
5401 DebugLoc::getUnknown(), "num.active.lanes", IndexTy);
5402 VPValue *ClampedVF = Builder.createScalarZExtOrTrunc(
5403 NumActive, IVTy, IndexTy, DebugLoc::getCompilerGenerated());
5404
5405 IncomingAliasMask->replaceAllUsesWith(AliasMask);
5406
5407 return ClampedVF;
5408}
5409
5411 VPlan &Plan, ArrayRef<PointerDiffInfo> DiffChecks, bool HasBranchWeights) {
5412 VPBasicBlock *ClampedVFCheck =
5413 Plan.createVPBasicBlock("vector.clamped.vf.check");
5414
5415 VPValue *ClampedVF = materializeAliasMask(Plan, ClampedVFCheck, DiffChecks);
5416 VPBuilder Builder(ClampedVFCheck);
5418 Type *TCTy = Plan.getTripCount()->getScalarType();
5419
5420 // Check the "ClampedVF" from the alias mask is larger than one.
5421 VPValue *IsScalar =
5422 Builder.createICmp(CmpInst::ICMP_ULE, ClampedVF,
5423 Plan.getConstantInt(TCTy, 1), DL, "vf.is.scalar");
5424
5425 VPValue *TripCount = Plan.getTripCount();
5426 VPValue *MaxUIntTripCount =
5428 VPValue *DistanceToMax = Builder.createSub(MaxUIntTripCount, TripCount);
5429
5430 // For tail-folding: Don't execute the vector loop if (UMax - n) < ClampedVF.
5431 // Note: The ClampedVF may not be a power-of-two. This means the loop exit
5432 // condition (index.next == n.vec) may not be correct in the case of an
5433 // overflow. The issue is `n.vec` could be zero due to an overflow, but
5434 // index.next is not guaranteed to overflow to zero as the ClampedVF is not a
5435 // power-of-two.
5436 VPValue *TripCountCheck = Builder.createICmp(
5437 ICmpInst::ICMP_ULT, DistanceToMax, ClampedVF, DL, "vf.step.overflow");
5438
5439 VPValue *Cond = Builder.createOr(IsScalar, TripCountCheck, DL);
5440 attachVPCheckBlock(Plan, Cond, ClampedVFCheck, HasBranchWeights);
5441
5442 // Materialize the trip count early as this will add a use of (VFxUF) that
5443 // needs to be replaced with the ClampedVF.
5445 /*TailByMasking=*/true,
5446 /*RequiresScalarEpilogue=*/false,
5447 &Plan.getVFxUF());
5448
5449 assert(Plan.getConcreteUF() == 1 &&
5450 "Clamped VF not supported with interleaving");
5451 Plan.getVF().replaceAllUsesWith(ClampedVF);
5452 Plan.getVFxUF().replaceAllUsesWith(ClampedVF);
5453}
5454
5456 ScalarEvolution &SE) {
5457 auto *Entry = Plan.getEntry();
5458 VPBuilder Builder(Entry, Entry->begin());
5459 VPSCEVExpander Expander(Builder, SE);
5460
5461 // Expand VPExpandSCEVRecipes to VPInstructions using VPSCEVExpander. During
5462 // the transition, unsupported VPExpandSCEVRecipes are skipped and left for
5463 // late expansion.
5464 for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
5465 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
5466 if (!ExpSCEV || ExpSCEV->getNumUsers() == 0)
5467 continue;
5468 Builder.setInsertPoint(ExpSCEV);
5469 VPValue *Expanded = Expander.tryToExpand(ExpSCEV->getSCEV());
5470 if (!Expanded)
5471 continue;
5472 ExpSCEV->replaceAllUsesWith(Expanded);
5473 if (Plan.getTripCount() == ExpSCEV)
5474 Plan.resetTripCount(Expanded);
5475 ExpSCEV->eraseFromParent();
5476 }
5477}
5478
5481 SCEVExpander Expander(SE, "induction", /*PreserveLCSSA=*/false);
5482
5483 auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());
5484 BasicBlock *EntryBB = Entry->getIRBasicBlock();
5485 DenseMap<const SCEV *, Value *> ExpandedSCEVs;
5486 // Expand remaining VPExpandSCEVRecipes to IR instructions using SCEVExpander.
5487 for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
5488 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
5489 if (!ExpSCEV)
5490 continue;
5491 const SCEV *Expr = ExpSCEV->getSCEV();
5492 Value *Res =
5493 Expander.expandCodeFor(Expr, Expr->getType(), EntryBB->getTerminator());
5494 ExpandedSCEVs[Expr] = Res;
5495 VPValue *Exp = Plan.getOrAddLiveIn(Res);
5496 ExpSCEV->replaceAllUsesWith(Exp);
5497 if (Plan.getTripCount() == ExpSCEV)
5498 Plan.resetTripCount(Exp);
5499 ExpSCEV->eraseFromParent();
5500 }
5502 "all VPExpandSCEVRecipes must have been expanded");
5503 // Add IR instructions in the entry basic block but not in the VPIRBasicBlock
5504 // to the VPIRBasicBlock.
5505 auto EI = Entry->begin();
5506 for (Instruction &I : drop_end(*EntryBB)) {
5507 if (EI != Entry->end() && isa<VPIRInstruction>(*EI) &&
5508 &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) {
5509 EI++;
5510 continue;
5511 }
5513 }
5514
5515 return ExpandedSCEVs;
5516}
5517
5518/// Returns true if \p OpV, operand \p OpIdx of a wide recipe feeding a store
5519/// interleave group at index \p Idx, can be narrowed. \p WideMember0 is the
5520/// recipe feeding the same group at index 0; its operand at \p OpIdx drives the
5521/// check. Without a defining recipe, that operand must be \p OpV itself. A
5522/// VPWidenLoadRecipe can be narrowed to an index-independent load if \p
5523/// IsScalable is false, the load is unmasked and consecutive, and it is \p OpV.
5524/// A VPInterleaveRecipe can be narrowed to a wide load if its group is full and
5525/// \p OpV is the value it defines at \p Idx. Any other recipe is rejected.
5526static bool canNarrowLoad(VPSingleDefRecipe *WideMember0, unsigned OpIdx,
5527 VPValue *OpV, unsigned Idx, bool IsScalable) {
5528 VPValue *Member0Op = WideMember0->getOperand(OpIdx);
5529 VPRecipeBase *Member0OpR = Member0Op->getDefiningRecipe();
5530 if (!Member0OpR)
5531 return Member0Op == OpV; // No defining recipe: must be the very same value.
5532 if (auto *W = dyn_cast<VPWidenLoadRecipe>(Member0OpR))
5533 // For scalable VFs, the narrowed plan processes vscale iterations at once,
5534 // so a shared wide load cannot be narrowed to a uniform scalar; bail out.
5535 return !IsScalable && !W->getMask() && W->isConsecutive() &&
5536 Member0Op == OpV;
5537 if (auto *IR = dyn_cast<VPInterleaveRecipe>(Member0OpR))
5538 return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
5539 return false; // Operands defined by any other recipe kind are not handled.
5540}
5541
5542static bool canNarrowOps(ArrayRef<VPValue *> Ops, bool IsScalable) {
5544 auto *WideMember0 = dyn_cast<VPSingleDefRecipe>(Ops[0]);
5545 if (!WideMember0)
5546 return false;
5547 for (VPValue *V : Ops) {
5549 return false;
5550 auto *R = cast<VPSingleDefRecipe>(V);
5551 if (getOpcodeOrIntrinsicID(R) != getOpcodeOrIntrinsicID(WideMember0))
5552 return false;
5553 }
5554
5555 for (unsigned Idx = 0; Idx != WideMember0->getNumOperands(); ++Idx) {
5557 for (VPValue *Op : Ops)
5558 OpsI.push_back(Op->getDefiningRecipe()->getOperand(Idx));
5559
5560 if (canNarrowOps(OpsI, IsScalable))
5561 continue;
5562
5563 if (any_of(enumerate(OpsI), [WideMember0, Idx, IsScalable](const auto &P) {
5564 const auto &[OpIdx, OpV] = P;
5565 return !canNarrowLoad(WideMember0, Idx, OpV, OpIdx, IsScalable);
5566 }))
5567 return false;
5568 }
5569
5570 return true;
5571}
5572
5573/// Returns VF from \p VFs if \p InterleaveR is a full interleave group with
5574/// factor and number of members both equal to VF. The interleave group must
5575/// also access the full vector width.
5576static std::optional<ElementCount>
5579 const TargetTransformInfo &TTI) {
5580 if (!InterleaveR || InterleaveR->getMask())
5581 return std::nullopt;
5582
5583 Type *GroupElementTy = nullptr;
5584 if (InterleaveR->getStoredValues().empty()) {
5585 GroupElementTy = InterleaveR->getVPValue(0)->getScalarType();
5586 if (!all_of(InterleaveR->definedValues(), [GroupElementTy](VPValue *Op) {
5587 return Op->getScalarType() == GroupElementTy;
5588 }))
5589 return std::nullopt;
5590 } else {
5591 GroupElementTy = InterleaveR->getStoredValues()[0]->getScalarType();
5592 if (!all_of(InterleaveR->getStoredValues(), [GroupElementTy](VPValue *Op) {
5593 return Op->getScalarType() == GroupElementTy;
5594 }))
5595 return std::nullopt;
5596 }
5597
5598 auto IG = InterleaveR->getInterleaveGroup();
5599 if (IG->getFactor() != IG->getNumMembers())
5600 return std::nullopt;
5601
5602 auto GetVectorBitWidthForVF = [&TTI](ElementCount VF) {
5603 TypeSize Size = TTI.getRegisterBitWidth(
5606 assert(Size.isScalable() == VF.isScalable() &&
5607 "if Size is scalable, VF must be scalable and vice versa");
5608 return Size.getKnownMinValue();
5609 };
5610
5611 for (ElementCount VF : VFs) {
5612 unsigned MinVal = VF.getKnownMinValue();
5613 unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * MinVal;
5614 if (IG->getFactor() == MinVal && GroupSize == GetVectorBitWidthForVF(VF))
5615 return {VF};
5616 }
5617 return std::nullopt;
5618}
5619
5620/// Returns true if \p VPV is a VPIRValue or a single-scalar VPReplicateRecipe.
5621static bool isAlreadyNarrow(VPValue *VPV) {
5622 if (isa<VPIRValue>(VPV))
5623 return true;
5624 auto *RepR = dyn_cast<VPReplicateRecipe>(VPV);
5625 return RepR && RepR->isSingleScalar();
5626}
5627
5628// Convert the wide recipes defining the VPValues in \p Members feeding an
5629// interleave group to a single narrow variant. The first member is reused as
5630// the narrowed recipe.
5631static VPValue *
5633 SmallPtrSetImpl<VPValue *> &NarrowedOps) {
5634 VPValue *V = Members.front();
5635 auto *R = V->getDefiningRecipe();
5636 if (!R || NarrowedOps.contains(V))
5637 return V;
5638
5639 if (isAlreadyNarrow(V))
5640 return V;
5641
5643 auto *WideMember0 = cast<VPRecipeWithIRFlags>(R);
5644 for (VPValue *Member : Members.drop_front())
5645 WideMember0->intersectFlags(*cast<VPRecipeWithIRFlags>(Member));
5646 for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx) {
5648 for (VPValue *Member : Members)
5649 OpsI.push_back(Member->getDefiningRecipe()->getOperand(Idx));
5650 WideMember0->setOperand(Idx, narrowInterleaveGroupOp(OpsI, NarrowedOps));
5651 }
5652 return V;
5653 }
5654
5655 if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
5656 // Narrow interleave group to wide load, as transformed VPlan will only
5657 // process one original iteration.
5658 auto *LI = cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
5659 auto *L = new VPWidenLoadRecipe(*LI, LoadGroup->getAddr(),
5660 LoadGroup->getMask(), /*Consecutive=*/true,
5661 *LoadGroup, LoadGroup->getDebugLoc());
5662 L->insertBefore(LoadGroup);
5663 NarrowedOps.insert(L);
5664 return L;
5665 }
5666
5667 if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
5668 assert(RepR->isSingleScalar() && RepR->getOpcode() == Instruction::Load &&
5669 "must be a single scalar load");
5670 NarrowedOps.insert(RepR);
5671 return RepR;
5672 }
5673
5674 auto *WideLoad = cast<VPWidenLoadRecipe>(R);
5675 VPValue *PtrOp = WideLoad->getAddr();
5676 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
5677 PtrOp = VecPtr->getOperand(0);
5678 // Narrow wide load to uniform scalar load, as transformed VPlan will only
5679 // process one original iteration.
5680 auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
5681 /*IsUniform*/ true,
5682 /*Mask*/ nullptr, {}, *WideLoad);
5683 N->insertBefore(WideLoad);
5684 NarrowedOps.insert(N);
5685 return N;
5686}
5687
5688std::unique_ptr<VPlan>
5690 const TargetTransformInfo &TTI) {
5691 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
5692
5693 if (!VectorLoop)
5694 return nullptr;
5695
5696 // Only handle single-block loops for now.
5697 if (VectorLoop->getEntryBasicBlock() != VectorLoop->getExitingBasicBlock())
5698 return nullptr;
5699
5700 // Skip plans when we may not be able to properly narrow.
5701 VPBasicBlock *Exiting = VectorLoop->getExitingBasicBlock();
5702 if (!match(&Exiting->back(), m_BranchOnCount()))
5703 return nullptr;
5704
5705 assert(match(&Exiting->back(),
5707 m_Specific(&Plan.getVectorTripCount()))) &&
5708 "unexpected branch-on-count");
5709
5711 std::optional<ElementCount> VFToOptimize;
5712 for (auto &R : *VectorLoop->getEntryBasicBlock()) {
5715 continue;
5716
5717 // Bail out on recipes not supported at the moment:
5718 // * phi recipes other than the canonical induction
5719 // * recipes writing to memory except interleave groups
5720 // Only support plans with a canonical induction phi.
5721 if (R.isPhi())
5722 return nullptr;
5723
5724 auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
5725 if (R.mayWriteToMemory() && !InterleaveR)
5726 return nullptr;
5727
5728 // Bail out if any recipe defines a vector value used outside the
5729 // vector loop region.
5730 if (any_of(R.definedValues(), [&](VPValue *V) {
5731 return any_of(V->users(), [&](VPUser *U) {
5732 auto *UR = cast<VPRecipeBase>(U);
5733 return UR->getParent()->getParent() != VectorLoop;
5734 });
5735 }))
5736 return nullptr;
5737
5738 // All other ops are allowed, but we reject uses that cannot be converted
5739 // when checking all allowed consumers (store interleave groups) below.
5740 if (!InterleaveR)
5741 continue;
5742
5743 // Try to find a single VF, where all interleave groups are consecutive and
5744 // saturate the full vector width. If we already have a candidate VF, check
5745 // if it is applicable for the current InterleaveR, otherwise look for a
5746 // suitable VF across the Plan's VFs.
5748 VFToOptimize ? SmallVector<ElementCount>({*VFToOptimize})
5749 : to_vector(Plan.vectorFactors());
5750 std::optional<ElementCount> NarrowedVF =
5751 isConsecutiveInterleaveGroup(InterleaveR, VFs, TTI);
5752 if (!NarrowedVF || (VFToOptimize && NarrowedVF != VFToOptimize))
5753 return nullptr;
5754 VFToOptimize = NarrowedVF;
5755
5756 // Skip read interleave groups.
5757 if (InterleaveR->getStoredValues().empty())
5758 continue;
5759
5760 // Narrow interleave groups, if all operands are already matching narrow
5761 // ops.
5762 auto *Member0 = InterleaveR->getStoredValues()[0];
5763 if (isAlreadyNarrow(Member0) &&
5764 all_of(InterleaveR->getStoredValues(), equal_to(Member0))) {
5765 StoreGroups.push_back(InterleaveR);
5766 continue;
5767 }
5768
5769 // For now, we only support full interleave groups storing load interleave
5770 // groups.
5771 if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
5772 VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
5773 if (!DefR)
5774 return false;
5775 auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
5776 return IR && IR->getInterleaveGroup()->isFull() &&
5777 IR->getVPValue(Op.index()) == Op.value();
5778 })) {
5779 StoreGroups.push_back(InterleaveR);
5780 continue;
5781 }
5782
5783 // Check if all values feeding InterleaveR are matching wide recipes whose
5784 // operands can be narrowed.
5785 if (!canNarrowOps(InterleaveR->getStoredValues(),
5786 VFToOptimize->isScalable()))
5787 return nullptr;
5788 StoreGroups.push_back(InterleaveR);
5789 }
5790
5791 if (StoreGroups.empty())
5792 return nullptr;
5793
5794 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
5795 bool RequiresScalarEpilogue =
5796 MiddleVPBB->getNumSuccessors() == 1 &&
5797 MiddleVPBB->getSingleSuccessor() == Plan.getScalarPreheader();
5798 // Bail out for tail-folding (middle block with a single successor to exit).
5799 if (MiddleVPBB->getNumSuccessors() != 2 && !RequiresScalarEpilogue)
5800 return nullptr;
5801
5802 // All interleave groups in Plan can be narrowed for VFToOptimize. Split the
5803 // original Plan into 2: a) a new clone which contains all VFs of Plan, except
5804 // VFToOptimize, and b) the original Plan with VFToOptimize as single VF.
5805 // TODO: Handle cases where only some interleave groups can be narrowed.
5806 std::unique_ptr<VPlan> NewPlan;
5807 if (size(Plan.vectorFactors()) != 1) {
5808 NewPlan = std::unique_ptr<VPlan>(Plan.duplicate());
5809 Plan.setVF(*VFToOptimize);
5810 NewPlan->removeVF(*VFToOptimize);
5811 }
5812
5813 // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
5814 SmallPtrSet<VPValue *, 4> NarrowedOps;
5815 // Narrow operation tree rooted at store groups.
5816 for (auto *StoreGroup : StoreGroups) {
5817 VPValue *Res =
5818 narrowInterleaveGroupOp(StoreGroup->getStoredValues(), NarrowedOps);
5819 auto *SI =
5820 cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
5821 auto *S = new VPWidenStoreRecipe(*SI, StoreGroup->getAddr(), Res, nullptr,
5822 /*Consecutive=*/true, *StoreGroup,
5823 StoreGroup->getDebugLoc());
5824 S->insertBefore(StoreGroup);
5825 StoreGroup->eraseFromParent();
5826 }
5827
5828 // Adjust induction to reflect that the transformed plan only processes one
5829 // original iteration.
5831 Type *CanIVTy = VectorLoop->getCanonicalIVType();
5832 VPBasicBlock *VectorPH = Plan.getVectorPreheader();
5833 VPBuilder PHBuilder(VectorPH, VectorPH->begin());
5834
5835 VPValue *UF = &Plan.getUF();
5836 VPValue *Step;
5837 if (VFToOptimize->isScalable()) {
5838 VPValue *VScale =
5839 PHBuilder.createElementCount(CanIVTy, ElementCount::getScalable(1));
5840 Step = PHBuilder.createOverflowingOp(Instruction::Mul, {VScale, UF},
5841 {true, false});
5842 Plan.getVF().replaceAllUsesWith(VScale);
5843 } else {
5844 Step = UF;
5845 Plan.getVF().replaceAllUsesWith(Plan.getConstantInt(CanIVTy, 1));
5846 }
5847 // Materialize vector trip count with the narrowed step.
5848 materializeVectorTripCount(Plan, VectorPH, /*TailByMasking=*/false,
5849 RequiresScalarEpilogue, Step);
5850
5851 CanIVInc->setOperand(1, Step);
5852 Plan.getVFxUF().replaceAllUsesWith(Step);
5853
5854 removeDeadRecipes(Plan);
5855 assert(none_of(*VectorLoop->getEntryBasicBlock(),
5857 "All VPVectorPointerRecipes should have been removed");
5858 return NewPlan;
5859}
5860
5861/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
5862/// BranchOnCond recipe.
5864 VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
5865 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
5866 auto *MiddleTerm =
5868 // Only add branch metadata if there is a (conditional) terminator.
5869 if (!MiddleTerm)
5870 return;
5871
5872 assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
5873 "must have a BranchOnCond");
5874 // Assume that `TripCount % VectorStep ` is equally distributed.
5875 unsigned VectorStep = Plan.getConcreteUF() * VF.getKnownMinValue();
5876 if (VF.isScalable() && VScaleForTuning.has_value())
5877 VectorStep *= *VScaleForTuning;
5878 assert(VectorStep > 0 && "trip count should not be zero");
5879 MDBuilder MDB(Plan.getContext());
5880 MDNode *BranchWeights =
5881 MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
5882 MiddleTerm->setMetadata(LLVMContext::MD_prof, BranchWeights);
5883}
5884
5886 VFRange &Range) {
5887 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
5888 auto *MiddleVPBB = Plan.getMiddleBlock();
5889 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
5890
5891 auto IsScalableOne = [](ElementCount VF) -> bool {
5892 return VF == ElementCount::getScalable(1);
5893 };
5894
5895 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
5896 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
5897 if (!FOR)
5898 continue;
5899
5900 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
5901 "Cannot handle loops with uncountable early exits");
5902
5903 // Find the existing splice for this FOR, created in
5904 // createHeaderPhiRecipes. All uses of FOR have already been replaced with
5905 // RecurSplice there; only RecurSplice itself still references FOR.
5906 auto *RecurSplice =
5908 assert(RecurSplice && "expected FirstOrderRecurrenceSplice");
5909
5910 // For VF vscale x 1, if vscale = 1, we are unable to extract the
5911 // penultimate value of the recurrence. Instead we rely on the existing
5912 // extract of the last element from the result of
5913 // VPInstruction::FirstOrderRecurrenceSplice.
5914 // TODO: Consider vscale_range info and UF.
5915 if (any_of(RecurSplice->users(),
5916 [](VPUser *U) { return !cast<VPRecipeBase>(U)->getRegion(); }) &&
5918 Range))
5919 return;
5920
5921 // This is the second phase of vectorizing first-order recurrences, creating
5922 // extracts for users outside the loop. An overview of the transformation is
5923 // described below. Suppose we have the following loop with some use after
5924 // the loop of the last a[i-1],
5925 //
5926 // for (int i = 0; i < n; ++i) {
5927 // t = a[i - 1];
5928 // b[i] = a[i] - t;
5929 // }
5930 // use t;
5931 //
5932 // There is a first-order recurrence on "a". For this loop, the shorthand
5933 // scalar IR looks like:
5934 //
5935 // scalar.ph:
5936 // s.init = a[-1]
5937 // br scalar.body
5938 //
5939 // scalar.body:
5940 // i = phi [0, scalar.ph], [i+1, scalar.body]
5941 // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
5942 // s2 = a[i]
5943 // b[i] = s2 - s1
5944 // br cond, scalar.body, exit.block
5945 //
5946 // exit.block:
5947 // use = lcssa.phi [s1, scalar.body]
5948 //
5949 // In this example, s1 is a recurrence because its value depends on the
5950 // previous iteration. In the first phase of vectorization, we created a
5951 // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
5952 // for users in the scalar preheader and exit block.
5953 //
5954 // vector.ph:
5955 // v_init = vector(..., ..., ..., a[-1])
5956 // br vector.body
5957 //
5958 // vector.body
5959 // i = phi [0, vector.ph], [i+4, vector.body]
5960 // v1 = phi [v_init, vector.ph], [v2, vector.body]
5961 // v2 = a[i, i+1, i+2, i+3]
5962 // v1' = splice(v1(3), v2(0, 1, 2))
5963 // b[i, i+1, i+2, i+3] = v2 - v1'
5964 // br cond, vector.body, middle.block
5965 //
5966 // middle.block:
5967 // vector.recur.extract.for.phi = v2(2)
5968 // vector.recur.extract = v2(3)
5969 // br cond, scalar.ph, exit.block
5970 //
5971 // scalar.ph:
5972 // scalar.recur.init = phi [vector.recur.extract, middle.block],
5973 // [s.init, otherwise]
5974 // br scalar.body
5975 //
5976 // scalar.body:
5977 // i = phi [0, scalar.ph], [i+1, scalar.body]
5978 // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
5979 // s2 = a[i]
5980 // b[i] = s2 - s1
5981 // br cond, scalar.body, exit.block
5982 //
5983 // exit.block:
5984 // lo = lcssa.phi [s1, scalar.body],
5985 // [vector.recur.extract.for.phi, middle.block]
5986 //
5987 // Update extracts of the splice in the middle block: they extract the
5988 // penultimate element of the recurrence.
5990 make_range(MiddleVPBB->getFirstNonPhi(), MiddleVPBB->end()))) {
5991 if (!match(&R, m_ExtractLastLaneOfLastPart(m_Specific(RecurSplice))))
5992 continue;
5993
5994 auto *ExtractR = cast<VPInstruction>(&R);
5995 VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
5996 VPInstruction::ExtractPenultimateElement, RecurSplice->getOperand(1),
5997 {}, "vector.recur.extract.for.phi");
5998 for (VPUser *ExitU : to_vector(ExtractR->users())) {
5999 if (auto *ExitPhi = dyn_cast<VPIRPhi>(ExitU))
6000 ExitPhi->replaceUsesOfWith(ExtractR, PenultimateElement);
6001 }
6002 }
6003 }
6004}
6005
6006/// Check if \p V is a binary expression of a widened IV and a loop-invariant
6007/// value. Returns the widened IV if found, nullptr otherwise.
6009 auto *BinOp = dyn_cast<VPWidenRecipe>(V);
6010 if (!BinOp || !Instruction::isBinaryOp(BinOp->getOpcode()) ||
6011 Instruction::isIntDivRem(BinOp->getOpcode()))
6012 return nullptr;
6013
6014 VPValue *WidenIVCandidate = BinOp->getOperand(0);
6015 VPValue *InvariantCandidate = BinOp->getOperand(1);
6016 if (!isa<VPWidenIntOrFpInductionRecipe>(WidenIVCandidate))
6017 std::swap(WidenIVCandidate, InvariantCandidate);
6018
6019 if (!InvariantCandidate->isDefinedOutsideLoopRegions())
6020 return nullptr;
6021
6022 return dyn_cast<VPWidenIntOrFpInductionRecipe>(WidenIVCandidate);
6023}
6024
6025/// Create a scalar version of \p BinOp, with its \p WidenIV operand replaced
6026/// by \p ScalarIV, and place it after \p ScalarIV's defining recipe.
6030 BinOp->getNumOperands() == 2 && "BinOp must have 2 operands");
6031 auto *ClonedOp = BinOp->clone();
6032 if (ClonedOp->getOperand(0) == WidenIV) {
6033 ClonedOp->setOperand(0, ScalarIV);
6034 } else {
6035 assert(ClonedOp->getOperand(1) == WidenIV && "one operand must be WideIV");
6036 ClonedOp->setOperand(1, ScalarIV);
6037 }
6038 ClonedOp->insertAfter(ScalarIV->getDefiningRecipe());
6039 return ClonedOp;
6040}
6041
6044 Loop &L) {
6045 ScalarEvolution &SE = *PSE.getSE();
6046 VPRegionBlock *VectorLoopRegion = Plan.getVectorLoopRegion();
6047
6048 // Helper lambda to check if the IV range excludes the sentinel value. Try
6049 // signed first, then unsigned. Return an excluded sentinel if found,
6050 // otherwise return std::nullopt.
6051 auto CheckSentinel = [&SE](const SCEV *IVSCEV,
6052 bool UseMax) -> std::optional<APSInt> {
6053 unsigned BW = IVSCEV->getType()->getScalarSizeInBits();
6054 for (bool Signed : {true, false}) {
6055 APSInt Sentinel = UseMax ? APSInt::getMinValue(BW, /*Unsigned=*/!Signed)
6056 : APSInt::getMaxValue(BW, /*Unsigned=*/!Signed);
6057
6058 ConstantRange IVRange =
6059 Signed ? SE.getSignedRange(IVSCEV) : SE.getUnsignedRange(IVSCEV);
6060 if (!IVRange.contains(Sentinel))
6061 return Sentinel;
6062 }
6063 return std::nullopt;
6064 };
6065
6066 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
6067 for (VPRecipeBase &Phi :
6068 make_early_inc_range(VectorLoopRegion->getEntryBasicBlock()->phis())) {
6069 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&Phi);
6071 PhiR->getRecurrenceKind()))
6072 continue;
6073
6074 Type *PhiTy = PhiR->getScalarType();
6075 if (PhiTy->isPointerTy() || PhiTy->isFloatingPointTy())
6076 continue;
6077
6078 // If there's a header mask, the backedge select will not be the find-last
6079 // select.
6080 VPValue *BackedgeVal = PhiR->getBackedgeValue();
6081 auto *FindLastSelect = cast<VPSingleDefRecipe>(BackedgeVal);
6082 if (HeaderMask &&
6083 !match(BackedgeVal,
6084 m_Select(m_Specific(HeaderMask),
6085 m_VPSingleDefRecipe(FindLastSelect), m_Specific(PhiR))))
6086 continue;
6087
6088 // Get the find-last expression from the find-last select of the reduction
6089 // phi. The find-last select should be a select between the phi and the
6090 // find-last expression.
6091 VPValue *Cond, *FindLastExpression;
6092 if (!match(FindLastSelect, m_Select(m_VPValue(Cond), m_Specific(PhiR),
6093 m_VPValue(FindLastExpression))) &&
6094 !match(FindLastSelect,
6095 m_Select(m_VPValue(Cond), m_VPValue(FindLastExpression),
6096 m_Specific(PhiR))))
6097 continue;
6098
6099 // Check if FindLastExpression is a simple expression of a widened IV. If
6100 // so, we can track the underlying IV instead and sink the expression.
6101 auto *IVOfExpressionToSink = getExpressionIV(FindLastExpression);
6102 const SCEV *IVSCEV = vputils::getSCEVExprForVPValue(
6103 IVOfExpressionToSink ? IVOfExpressionToSink : FindLastExpression, PSE,
6104 &L);
6105 const SCEV *Step;
6106 if (!match(IVSCEV, m_scev_AffineAddRec(m_SCEV(), m_SCEV(Step)))) {
6107 assert(!match(vputils::getSCEVExprForVPValue(FindLastExpression, PSE, &L),
6109 "IVOfExpressionToSink not being an AddRec must imply "
6110 "FindLastExpression not being an AddRec.");
6111 continue;
6112 }
6113
6114 // Determine direction from SCEV step.
6115 if (!SE.isKnownNonZero(Step))
6116 continue;
6117
6118 // Positive step means we need UMax/SMax to find the last IV value, and
6119 // UMin/SMin otherwise.
6120 bool UseMax = SE.isKnownPositive(Step);
6121 std::optional<APSInt> SentinelVal = CheckSentinel(IVSCEV, UseMax);
6122 bool UseSigned = SentinelVal && SentinelVal->isSigned();
6123
6124 // Sinking an expression will disable epilogue vectorization. Only use it,
6125 // if FindLastExpression cannot be vectorized via a sentinel. Sinking may
6126 // also prevent vectorizing using a sentinel (e.g., if the expression is a
6127 // multiply or divide by large constant, respectively), which also makes
6128 // sinking undesirable.
6129 if (IVOfExpressionToSink) {
6130 const SCEV *FindLastExpressionSCEV =
6131 vputils::getSCEVExprForVPValue(FindLastExpression, PSE, &L);
6132 if (match(FindLastExpressionSCEV,
6133 m_scev_AffineAddRec(m_SCEV(), m_SCEV(Step)))) {
6134 bool NewUseMax = SE.isKnownPositive(Step);
6135 if (auto NewSentinel =
6136 CheckSentinel(FindLastExpressionSCEV, NewUseMax)) {
6137 // The original expression already has a sentinel, so prefer not
6138 // sinking to keep epilogue vectorization possible.
6139 SentinelVal = *NewSentinel;
6140 UseSigned = NewSentinel->isSigned();
6141 UseMax = NewUseMax;
6142 IVSCEV = FindLastExpressionSCEV;
6143 IVOfExpressionToSink = nullptr;
6144 }
6145 }
6146 }
6147
6148 // If no sentinel was found, fall back to a boolean AnyOf reduction to track
6149 // if the condition was ever true. Requires the IV to not wrap, otherwise we
6150 // cannot use min/max.
6151 if (!SentinelVal) {
6152 auto *AR = cast<SCEVAddRecExpr>(IVSCEV);
6153 if (AR->hasNoSignedWrap())
6154 UseSigned = true;
6155 else if (AR->hasNoUnsignedWrap())
6156 UseSigned = false;
6157 else
6158 continue;
6159 }
6160
6162 BackedgeVal,
6164
6165 VPValue *NewFindLastSelect = BackedgeVal;
6166 VPValue *SelectCond = Cond;
6167 if (!SentinelVal || IVOfExpressionToSink) {
6168 // When we need to create a new select, normalize the condition so that
6169 // PhiR is the last operand and include the header mask if needed.
6170 DebugLoc DL = FindLastSelect->getDefiningRecipe()->getDebugLoc();
6171 VPBuilder LoopBuilder(FindLastSelect->getDefiningRecipe());
6172 if (FindLastSelect->getDefiningRecipe()->getOperand(1) == PhiR)
6173 SelectCond = LoopBuilder.createNot(SelectCond);
6174
6175 // When tail folding, mask the condition with the header mask to prevent
6176 // propagating poison from inactive lanes in the last vector iteration.
6177 if (HeaderMask)
6178 SelectCond = LoopBuilder.createLogicalAnd(HeaderMask, SelectCond);
6179
6180 if (SelectCond != Cond || IVOfExpressionToSink) {
6181 NewFindLastSelect = LoopBuilder.createSelect(
6182 SelectCond,
6183 IVOfExpressionToSink ? IVOfExpressionToSink : FindLastExpression,
6184 PhiR, DL);
6185 }
6186 }
6187
6188 // Create the reduction result in the middle block using sentinel directly.
6189 RecurKind MinMaxKind =
6190 UseMax ? (UseSigned ? RecurKind::SMax : RecurKind::UMax)
6191 : (UseSigned ? RecurKind::SMin : RecurKind::UMin);
6192 VPIRFlags Flags(MinMaxKind, /*IsOrdered=*/false, /*IsInLoop=*/false,
6193 FastMathFlags());
6194 DebugLoc ExitDL = RdxResult->getDebugLoc();
6195 VPBuilder MiddleBuilder(RdxResult);
6196 VPValue *ReducedIV =
6198 NewFindLastSelect, Flags, ExitDL);
6199
6200 // If IVOfExpressionToSink is an expression to sink, sink it now.
6201 VPValue *VectorRegionExitingVal = ReducedIV;
6202 if (IVOfExpressionToSink)
6203 VectorRegionExitingVal =
6204 cloneBinOpForScalarIV(cast<VPWidenRecipe>(FindLastExpression),
6205 ReducedIV, IVOfExpressionToSink);
6206
6207 VPValue *NewRdxResult;
6208 VPValue *StartVPV = PhiR->getStartValue();
6209 if (SentinelVal) {
6210 // Sentinel-based approach: reduce IVs with min/max, compare against
6211 // sentinel to detect if condition was ever true, select accordingly.
6212 VPValue *Sentinel = Plan.getConstantInt(*SentinelVal);
6213 auto *Cmp = MiddleBuilder.createICmp(CmpInst::ICMP_NE, ReducedIV,
6214 Sentinel, ExitDL);
6215 NewRdxResult = MiddleBuilder.createSelect(Cmp, VectorRegionExitingVal,
6216 StartVPV, ExitDL);
6217 StartVPV = Sentinel;
6218 } else {
6219 // Introduce a boolean AnyOf reduction to track if the condition was ever
6220 // true in the loop. Use it to select the initial start value, if it was
6221 // never true.
6222 auto *AnyOfPhi = new VPReductionPHIRecipe(
6223 /*Phi=*/nullptr, RecurKind::Or, *Plan.getFalse(), *Plan.getFalse(),
6224 RdxUnordered{1}, {}, /*HasUsesOutsideReductionChain=*/false);
6225 AnyOfPhi->insertAfter(PhiR);
6226
6227 VPBuilder LoopBuilder(BackedgeVal->getDefiningRecipe());
6228 VPValue *OrVal = LoopBuilder.createOr(AnyOfPhi, SelectCond);
6229 AnyOfPhi->setOperand(1, OrVal);
6230
6231 NewRdxResult = MiddleBuilder.createAnyOfReduction(
6232 OrVal, VectorRegionExitingVal, StartVPV, ExitDL);
6233
6234 // Initialize the IV reduction phi with the neutral element, not the
6235 // original start value, to ensure correct min/max reduction results.
6236 StartVPV = Plan.getOrAddLiveIn(
6237 getRecurrenceIdentity(MinMaxKind, IVSCEV->getType(), {}));
6238 }
6239 RdxResult->replaceAllUsesWith(NewRdxResult);
6240 RdxResult->eraseFromParent();
6241
6242 auto *NewPhiR = new VPReductionPHIRecipe(
6243 cast<PHINode>(PhiR->getUnderlyingInstr()), RecurKind::FindIV, *StartVPV,
6244 *NewFindLastSelect, RdxUnordered{1}, {},
6245 PhiR->hasUsesOutsideReductionChain());
6246 NewPhiR->insertBefore(PhiR);
6247 PhiR->replaceAllUsesWith(NewPhiR);
6248 PhiR->eraseFromParent();
6249 }
6250}
6251
6252namespace {
6253
6254using ExtendKind = TTI::PartialReductionExtendKind;
6255struct ReductionExtend {
6256 Type *SrcType = nullptr;
6257 ExtendKind Kind = ExtendKind::PR_None;
6258};
6259
6260/// Describes the extends used to compute the extended reduction operand.
6261/// ExtendB is optional. If ExtendB is present, ExtendsUser is a binary
6262/// operation.
6263struct ExtendedReductionOperand {
6264 /// The recipe that consumes the extends.
6265 VPWidenRecipe *ExtendsUser = nullptr;
6266 /// Extend descriptions (inputs to getPartialReductionCost).
6267 ReductionExtend ExtendA, ExtendB;
6268};
6269
6270/// A chain of recipes that form a partial reduction. Matches either
6271/// reduction_bin_op (extended op, accumulator), or
6272/// reduction_bin_op (accumulator, extended op).
6273/// The possible forms of the "extended op" are listed in
6274/// matchExtendedReductionOperand.
6275struct VPPartialReductionChain {
6276 /// The top-level binary operation that forms the reduction to a scalar
6277 /// after the loop body.
6278 VPWidenRecipe *ReductionBinOp = nullptr;
6279 /// The user of the extends that is then reduced.
6280 ExtendedReductionOperand ExtendedOp;
6281 /// The recurrence kind for the entire partial reduction chain.
6282 /// This allows distinguishing between Sub and AddWithSub recurrences,
6283 /// when the ReductionBinOp is a Instruction::Sub.
6284 RecurKind RK;
6285 /// The index of the accumulator operand of ReductionBinOp. The extended op
6286 /// is `1 - AccumulatorOpIdx`.
6287 unsigned AccumulatorOpIdx;
6288 unsigned ScaleFactor;
6289};
6290
/// Rewrites the extended reduction operand \p Op into a shape better suited
/// for a partial reduction and returns the recipe to use in its place. Three
/// patterns are handled, each introduced by a comment in the body; any other
/// \p Op is returned unchanged.
///
/// \returns \p Op itself (possibly with its operand 1 replaced), or a newly
/// created recipe. In the ext(mul(ext, ext)) case \p Op and the inner mul are
/// erased, so the caller must continue with the returned value only.
///
/// NOTE(review): this text comes from a listing that dropped several source
/// lines (see the notes at each gap); restore them from upstream before
/// relying on it.
6291static VPSingleDefRecipe *
6292optimizeExtendsForPartialReduction(VPSingleDefRecipe *Op) {
6293 // reduce.add(mul(ext(A), C))
6294 // -> reduce.add(mul(ext(A), ext(trunc(C))))
6295 const APInt *Const;
6296 if (match(Op, m_Mul(m_ZExtOrSExt(m_VPValue()), m_APInt(Const)))) {
6297 auto *ExtA = cast<VPWidenCastRecipe>(Op->getOperand(0));
6298 Instruction::CastOps ExtOpc = ExtA->getOpcode();
6299 Type *NarrowTy = ExtA->getOperand(0)->getScalarType();
6300 if (!Op->hasOneUse() ||
      // NOTE(review): source line 6301 is missing here (numbering jumps from
      // 6300 to 6302). It holds the start of the second condition, i.e. the
      // callee that receives the arguments on the next line.
6302 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
6303 return Op;
6304
      // Replace the constant operand by ext(trunc(C)) so that both mul
      // operands are extends from NarrowTy using the same extend opcode.
6305 VPBuilder Builder(Op);
6306 auto *Trunc = Builder.createWidenCast(Instruction::CastOps::Trunc,
6307 Op->getOperand(1), NarrowTy);
6308 Type *WideTy = ExtA->getScalarType();
6309 Op->setOperand(1, Builder.createWidenCast(ExtOpc, Trunc, WideTy));
6310 return Op;
6311 }
6312
6313 // reduce.add(abs(sub(ext(A), ext(B))))
6314 // -> reduce.add(ext(absolute-difference(A, B)))
6315 VPValue *X, *Y;
      // NOTE(review): source lines 6316-6317 are missing here. They hold the
      // `if (match(...)) {` that opens this case and binds X and Y.
6318 auto *Sub = Op->getOperand(0)->getDefiningRecipe();
6319 auto *Ext = cast<VPWidenCastRecipe>(Sub->getOperand(0));
6320 assert(Ext->getOpcode() ==
6321 cast<VPWidenCastRecipe>(Sub->getOperand(1))->getOpcode() &&
6322 "Expected both the LHS and RHS extends to be the same");
6323 bool IsSigned = Ext->getOpcode() == Instruction::SExt;
6324 VPBuilder Builder(Op);
6325 Type *SrcTy = X->getScalarType();
      // The absolute difference is computed in the narrow source type as
      // max(X, Y) - min(X, Y), using the signed or unsigned min/max that
      // matches the original extend. X and Y are frozen first; presumably
      // because each value now feeds both the max and the min -- confirm.
6326 auto *FreezeX = Builder.insert(new VPWidenRecipe(Instruction::Freeze, {X}));
6327 auto *FreezeY = Builder.insert(new VPWidenRecipe(Instruction::Freeze, {Y}));
6328 auto *Max = Builder.insert(
6329 new VPWidenIntrinsicRecipe(IsSigned ? Intrinsic::smax : Intrinsic::umax,
6330 {FreezeX, FreezeY}, SrcTy));
6331 auto *Min = Builder.insert(
6332 new VPWidenIntrinsicRecipe(IsSigned ? Intrinsic::smin : Intrinsic::umin,
6333 {FreezeX, FreezeY}, SrcTy));
6334 auto *AbsDiff =
6335 Builder.insert(new VPWidenRecipe(Instruction::Sub, {Max, Min}));
      // The difference is widened back to Op's type with a zero-extend for
      // both the signed and the unsigned case. Op itself is neither replaced
      // nor erased here.
6336 return Builder.createWidenCast(Instruction::CastOps::ZExt, AbsDiff,
6337 Op->getScalarType());
6338 }
6339
6340 // reduce.add(ext(mul(ext(A), ext(B))))
6341 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
6342 // TODO: Support this optimization for float types.
      // NOTE(review): source line 6343 is missing here. It holds the start of
      // the `if (match(Op, ...` whose tail follows.
6344 m_ZExtOrSExt(m_VPValue()))))) {
6345 auto *Ext = cast<VPWidenCastRecipe>(Op);
6346 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
6347 auto *MulLHS = cast<VPWidenCastRecipe>(Mul->getOperand(0));
6348 auto *MulRHS = cast<VPWidenCastRecipe>(Mul->getOperand(1));
      // Give up when the mul has other users, when the two inner extends use
      // different opcodes, or when the outer extend's opcode differs from the
      // inner one -- unless both mul operands are the very same extend.
6349 if (!Mul->hasOneUse() ||
6350 (Ext->getOpcode() != MulLHS->getOpcode() && MulLHS != MulRHS) ||
6351 MulLHS->getOpcode() != MulRHS->getOpcode())
6352 return Op;
6353 VPBuilder Builder(Mul);
      // Re-create the inner extends so they extend directly to the outer
      // extend's type; a shared operand gets a single new extend.
6354 auto *NewLHS = Builder.createWidenCast(
6355 MulLHS->getOpcode(), MulLHS->getOperand(0), Ext->getScalarType());
6356 auto *NewRHS = MulLHS == MulRHS
6357 ? NewLHS
6358 : Builder.createWidenCast(MulRHS->getOpcode(),
6359 MulRHS->getOperand(0),
6360 Ext->getScalarType());
6361 auto *NewMul = Mul->cloneWithOperands({NewLHS, NewRHS});
6362 Builder.insert(NewMul);
6363 Op->replaceAllUsesWith(NewMul);
6364 Op->eraseFromParent();
6365 Mul->eraseFromParent();
6366 return NewMul;
6367 }
6368
6369 return Op;
6370}
6371
/// Creates the VPExpressionRecipe that bundles the partial reduction \p Red
/// with the recipes producing its vector operand: one or two extends and,
/// depending on the matched pattern, a multiply and/or a negation. The
/// supported operand shapes are listed in the comments in the body; any other
/// shape reaches llvm_unreachable.
///
/// \returns the new recipe. This function does not itself insert it.
///
/// NOTE(review): this text comes from a listing that dropped several source
/// lines (see the notes at each gap); restore them from upstream before
/// relying on it.
6372static VPExpressionRecipe *
6373createPartialReductionExpression(VPReductionRecipe *Red) {
6374 VPValue *VecOp = Red->getVecOp();
6375
6376 // reduce.[f]add(ext(op))
6377 // -> VPExpressionRecipe(op, red)
6378 if (match(VecOp, m_WidenAnyExtend(m_VPValue())))
6379 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
6380
6381 // reduce.[f]add(neg(ext(op)))
6382 // -> VPExpressionRecipe(op, sub/neg, red)
6383 if (match(VecOp, m_AnyNeg(m_WidenAnyExtend(m_VPValue())))) {
6384 auto *Neg = cast<VPWidenRecipe>(VecOp);
      // The extend is taken from the last operand of the negation;
      // presumably operand 1 of `sub 0, x` and operand 0 of `fneg x`.
6385 auto *Ext =
6386 cast<VPWidenCastRecipe>(Neg->getOperand(Neg->getNumOperands() - 1));
6387 return new VPExpressionRecipe(Ext, Neg, Red);
6388 }
6389
6390 // reduce.[f]add([f]mul(ext(a), ext(b)))
6391 // -> VPExpressionRecipe(a, b, mul, red)
6392 if (match(VecOp, m_FMul(m_FPExt(m_VPValue()), m_FPExt(m_VPValue()))) ||
6393 match(VecOp,
      // NOTE(review): source line 6394 is missing here. It holds the pattern
      // of this second match (the integer mul form) and the `{` opening the
      // body.
6395 auto *Mul = cast<VPWidenRecipe>(VecOp);
6396 auto *ExtA = cast<VPWidenCastRecipe>(Mul->getOperand(0));
6397 auto *ExtB = cast<VPWidenCastRecipe>(Mul->getOperand(1));
6398 return new VPExpressionRecipe(ExtA, ExtB, Mul, Red);
6399 }
6400
6401 // reduce.fadd(fneg(fmul(fpext(a), fpext(b))))
6402 // -> VPExpressionRecipe(a, b, fmul, fsub, red)
6403 if (match(VecOp,
      // NOTE(review): source line 6404 is missing here. It holds the pattern
      // being matched and the `{` opening the body.
6405 auto *FNeg = cast<VPWidenRecipe>(VecOp);
6406 auto *FMul = cast<VPWidenRecipe>(FNeg->getOperand(0));
6407 auto *ExtA = cast<VPWidenCastRecipe>(FMul->getOperand(0));
6408 auto *ExtB = cast<VPWidenCastRecipe>(FMul->getOperand(1));
6409 return new VPExpressionRecipe(ExtA, ExtB, FMul, FNeg, Red);
6410 }
6411
6412 // reduce.add(neg(mul(ext(a), ext(b))))
6413 // -> VPExpressionRecipe(a, b, mul, sub, red)
      // NOTE(review): source line 6414 is missing here. It holds the start of
      // the `if (match(VecOp, ...` whose tail follows.
6415 m_ZExtOrSExt(m_VPValue()))))) {
6416 auto *Sub = cast<VPWidenRecipe>(VecOp);
      // The mul is operand 1 of the sub, i.e. the value being negated.
6417 auto *Mul = cast<VPWidenRecipe>(Sub->getOperand(1));
6418 auto *ExtA = cast<VPWidenCastRecipe>(Mul->getOperand(0));
6419 auto *ExtB = cast<VPWidenCastRecipe>(Mul->getOperand(1));
6420 return new VPExpressionRecipe(ExtA, ExtB, Mul, Sub, Red);
6421 }
6422
6423 llvm_unreachable("Unsupported expression");
6424}
6425
6426// Helper to transform a partial reduction chain into a partial reduction
6427// recipe. Assumes profitability has been checked.
//
// \p Chain describes the single link being transformed and \p RdxPhi is the
// reduction PHI that link belongs to. In order, the function:
//  1. rewrites the extended operand via optimizeExtendsForPartialReduction;
//  2. negates the extended operand when the link is a sub/fsub but the
//     recurrence kind is not Sub/FSub;
//  3. creates a VPReductionRecipe scaled by Chain.ScaleFactor in front of the
//     original binary operation and redirects that operation's users to it;
//  4. wraps the new reduction in a VPExpressionRecipe;
//  5. for the last link of the chain only: sets the VF scale factor on the
//     PHI and its ReductionStartVector, and for Sub/FSub recurrences moves
//     the original start value into a final subtract after the reduction
//     result.
//
// NOTE(review): this text comes from a listing that dropped several source
// lines (see the notes at each gap); restore them from upstream before
// relying on it.
6428static void transformToPartialReduction(const VPPartialReductionChain &Chain,
6429 VPlan &Plan,
6430 VPReductionPHIRecipe *RdxPhi) {
6431 VPWidenRecipe *WidenRecipe = Chain.ReductionBinOp;
6432 assert(WidenRecipe->getNumOperands() == 2 && "Expected binary operation");
6433
6434 VPValue *Accumulator = WidenRecipe->getOperand(Chain.AccumulatorOpIdx);
6435 auto *ExtendedOp = cast<VPSingleDefRecipe>(
6436 WidenRecipe->getOperand(1 - Chain.AccumulatorOpIdx));
6437
6438 // FIXME: Do these transforms before invoking the cost-model.
6439 ExtendedOp = optimizeExtendsForPartialReduction(ExtendedOp);
6440
6441 // Sub-reductions can be implemented in two ways:
6442 // (1) negate the operand in the vector loop (the default way).
6443 // (2) subtract the reduced value from the init value in the middle block.
6444 // Both ways keep the reduction itself as an 'add' reduction.
6445 //
6446 // The ISD nodes for partial reductions don't support folding the
6447 // sub/negation into its operands because the following is not a valid
6448 // transformation:
6449 // sub(0, mul(ext(a), ext(b)))
6450 // -> mul(ext(a), ext(sub(0, b)))
6451 //
6452 // It's therefore better to choose option (2) such that the partial
6453 // reduction is always positive (starting at '0') and to do a final
6454 // subtract in the middle block.
      // The branch below implements option (1): it is taken for a sub/fsub
      // link whose recurrence kind is NOT Sub/FSub. Sub/FSub recurrences use
      // option (2), implemented at the end of this function.
6455 if ((WidenRecipe->getOpcode() == Instruction::Sub &&
6456 Chain.RK != RecurKind::Sub) ||
6457 (WidenRecipe->getOpcode() == Instruction::FSub &&
6458 Chain.RK != RecurKind::FSub)) {
6459 VPBuilder Builder(WidenRecipe);
6460 Type *ElemTy = ExtendedOp->getScalarType();
6461 VPWidenRecipe *NegRecipe;
6462 if (WidenRecipe->getOpcode() == Instruction::FSub) {
6463 NegRecipe =
6464 new VPWidenRecipe(Instruction::FNeg, {ExtendedOp}, VPIRFlags(),
      // NOTE(review): source line 6465 is missing here. It holds the
      // remaining constructor arguments and the closing `);`.
6466 } else {
6467 auto *Zero = Plan.getZero(ElemTy);
6468 NegRecipe =
6469 new VPWidenRecipe(Instruction::Sub, {Zero, ExtendedOp}, VPIRFlags(),
      // NOTE(review): source line 6470 is missing here. It holds the
      // remaining constructor arguments and the closing `);`.
6471 }
6472 Builder.insert(NegRecipe);
6473 ExtendedOp = NegRecipe;
6474 }
6475
6476 // Check if WidenRecipe is the final result of the reduction. If so look
6477 // through selects for predicated reductions.
6478 VPValue *Cond = nullptr;
      // NOTE(review): source line 6479 is missing here. It holds the start of
      // the declaration of `ExitValue`, which is initialized from the
      // findUserOf(...) call below.
6480 findUserOf(WidenRecipe, m_Select(m_VPValue(Cond), m_Specific(WidenRecipe),
6481 m_Specific(RdxPhi))));
6482 bool IsLastInChain = RdxPhi->getBackedgeValue() == WidenRecipe ||
6483 RdxPhi->getBackedgeValue() == ExitValue;
6484 assert((!ExitValue || IsLastInChain) &&
6485 "if we found ExitValue, it must match RdxPhi's backedge value");
6486
6487 Type *PhiType = RdxPhi->getScalarType();
6488 RecurKind RdxKind =
      // NOTE(review): source line 6489 is missing here. It holds the
      // initializer of RdxKind.
      // Fast-math flags are only carried over for FAdd reductions. Cond is
      // the select condition found above, or null for an unpredicated
      // reduction.
6490 auto *PartialRed = new VPReductionRecipe(
6491 RdxKind,
6492 RdxKind == RecurKind::FAdd ? WidenRecipe->getFastMathFlags()
6493 : FastMathFlags(),
6494 WidenRecipe->getUnderlyingInstr(), Accumulator, ExtendedOp, Cond,
6495 RdxUnordered{/*VFScaleFactor=*/Chain.ScaleFactor});
6496 PartialRed->insertBefore(WidenRecipe);
6497
      // Cond is only set when the select was found, so ExitValue is non-null
      // whenever this replacement runs.
6498 if (Cond)
6499 ExitValue->replaceAllUsesWith(PartialRed);
6500 WidenRecipe->replaceAllUsesWith(PartialRed);
6501
6502 // For cost-model purposes, fold this into a VPExpression.
6503 VPExpressionRecipe *E = createPartialReductionExpression(PartialRed);
6504 E->insertBefore(WidenRecipe);
6505 PartialRed->replaceAllUsesWith(E);
6506
6507 // We only need to update the PHI node once, which is when we find the
6508 // last reduction in the chain.
6509 if (!IsLastInChain)
6510 return;
6511
6512 // Scale the PHI and ReductionStartVector by the VFScaleFactor
6513 assert(RdxPhi->getVFScaleFactor() == 1 && "scale factor must not be set");
6514 RdxPhi->setVFScaleFactor(Chain.ScaleFactor);
6515
6516 auto *StartInst = cast<VPInstruction>(RdxPhi->getStartValue());
6517 assert(StartInst->getOpcode() == VPInstruction::ReductionStartVector);
      // Operand 2 of the ReductionStartVector holds the scale factor, written
      // as a 32-bit constant.
6518 auto *NewScaleFactor = Plan.getConstantInt(32, Chain.ScaleFactor);
6519 StartInst->setOperand(2, NewScaleFactor);
6520
6521 // If this is the last value in a sub-reduction chain, then update the PHI
6522 // node to start at `0` and update the reduction-result to subtract from
6523 // the PHI's start value.
6524 if (Chain.RK != RecurKind::Sub && Chain.RK != RecurKind::FSub)
6525 return;
6526
      // Operand 0 is overwritten with operand 1; going by the comment above,
      // operand 1 is presumably the neutral ('0') value -- confirm.
6527 VPValue *OldStartValue = StartInst->getOperand(0);
6528 StartInst->setOperand(0, StartInst->getOperand(1));
6529
6530 // Replace reduction_result by 'sub (startval, reductionresult)'.
      // NOTE(review): source line 6531 is missing here. It holds the
      // declaration of `RdxResult`.
6532 assert(RdxResult && "Could not find reduction result");
6533
6534 VPBuilder Builder = VPBuilder::getToInsertAfter(RdxResult);
6535 unsigned SubOpc = Chain.RK == RecurKind::FSub ? Instruction::BinaryOps::FSub
6536 : Instruction::BinaryOps::Sub;
6537 VPInstruction *NewResult = Builder.createNaryOp(
6538 SubOpc, {OldStartValue, RdxResult}, VPIRFlags::getDefaultFlags(SubOpc),
6539 RdxPhi->getDebugLoc());
      // Redirect every user of the reduction result to the subtract, except
      // the subtract itself, which must keep RdxResult as its operand.
6540 RdxResult->replaceUsesWithIf(
6541 NewResult,
6542 [&NewResult](VPUser &U, unsigned Idx) { return &U != NewResult; });
6543}
6544
6545/// Returns the cost of a link in a partial-reduction chain for a given VF.
6546static InstructionCost
6547getPartialReductionLinkCost(VPCostContext &CostCtx,
6548 const VPPartialReductionChain &Link,
6549 ElementCount VF) {
6550 Type *RdxType = Link.ReductionBinOp->getScalarType();
6551 const ExtendedReductionOperand &ExtendedOp = Link.ExtendedOp;
6552 std::optional<unsigned> BinOpc = std::nullopt;
6553 // If ExtendB is not none, then the "ExtendsUser" is the binary operation.
6554 if (ExtendedOp.ExtendB.Kind != ExtendKind::PR_None)
6555 BinOpc = ExtendedOp.ExtendsUser->getOpcode();
6556
6557 std::optional<llvm::FastMathFlags> Flags;
6558 if (RdxType->isFloatingPointTy())
6559 Flags = Link.ReductionBinOp->getFastMathFlags();
6560
6561 auto GetLinkOpcode = [&Link]() -> unsigned {
6562 switch (Link.RK) {
6563 case RecurKind::Sub:
6564 return Instruction::Add;
6565 case RecurKind::FSub:
6566 return Instruction::FAdd;
6567 default:
6568 return Link.ReductionBinOp->getOpcode();
6569 }
6570 };
6571
6572 return CostCtx.TTI.getPartialReductionCost(
6573 GetLinkOpcode(), ExtendedOp.ExtendA.SrcType, ExtendedOp.ExtendB.SrcType,
6574 RdxType, VF, ExtendedOp.ExtendA.Kind, ExtendedOp.ExtendB.Kind, BinOpc,
6575 CostCtx.CostKind, Flags);
6576}
6577
/// Returns the partial-reduction extend kind for the cast recipe \p Cast.
6578static ExtendKind getPartialReductionExtendKind(VPWidenCastRecipe *Cast) {
      // NOTE(review): source line 6579, the entire body, is missing from this
      // listing. Presumably it forwards Cast's opcode to
      // TTI::getPartialReductionExtendKind, as done elsewhere in this file --
      // restore from upstream.
6580}
6581
6582/// Checks if \p Op (which is an operand of \p UpdateR) is an extended reduction
6583/// operand. This is an operand where the source of the value (e.g. a load) has
6584/// been extended (sext, zext, or fpext) before it is used in the reduction.
6585///
6586/// Possible forms matched by this function:
6587/// - UpdateR(PrevValue, ext(...))
6588/// - UpdateR(PrevValue, mul(ext(...), ext(...)))
6589/// - UpdateR(PrevValue, mul(ext(...), Constant))
6590/// - UpdateR(PrevValue, ext(mul(ext(...), ext(...))))
6591/// - UpdateR(PrevValue, ext(mul(ext(...), Constant)))
6592/// - UpdateR(PrevValue, abs(sub(ext(...), ext(...))))
6593///
6594/// Note: The second operand of UpdateR corresponds to \p Op in the examples.
///
/// \returns a description of the extends and of the recipe consuming them, or
/// std::nullopt if \p Op has none of the forms above.
///
/// NOTE(review): this text comes from a listing that dropped several source
/// lines (see the notes at each gap); restore them from upstream before
/// relying on it.
6595static std::optional<ExtendedReductionOperand>
6596matchExtendedReductionOperand(VPWidenRecipe *UpdateR, VPValue *Op) {
6597 assert(is_contained(UpdateR->operands(), Op) &&
6598 "Op should be operand of UpdateR");
6599
6600 // Try matching an absolute difference operand of the form
6601 // `abs(sub(ext(A), ext(B)))`. This will be later transformed into
6602 // `ext(absolute-difference(A, B))`. This allows us to perform the absolute
6603 // difference on a wider type and get the extend for "free" from the partial
6604 // reduction.
6605 VPValue *X, *Y;
6606 if (Op->hasOneUse() &&
      // NOTE(review): source lines 6607-6609 are missing here. They hold the
      // rest of this condition (the pattern match binding X and Y) and the
      // `{` opening the body.
6610 auto *Abs = cast<VPWidenIntrinsicRecipe>(Op);
6611 auto *Sub = cast<VPWidenRecipe>(Abs->getOperand(0));
6612 auto *LHSExt = cast<VPWidenCastRecipe>(Sub->getOperand(0));
6613 auto *RHSExt = cast<VPWidenCastRecipe>(Sub->getOperand(1));
6614 Type *LHSInputType = X->getScalarType();
6615 Type *RHSInputType = Y->getScalarType();
      // Both inputs must have the same source type and use the same extend
      // opcode.
6616 if (LHSInputType != RHSInputType ||
6617 LHSExt->getOpcode() != RHSExt->getOpcode())
6618 return std::nullopt;
6619 // Note: This is essentially the same as matching ext(...) as we will
6620 // rewrite this operand to ext(absolute-difference(A, B)).
6621 return ExtendedReductionOperand{
6622 Sub,
6623 /*ExtendA=*/{LHSInputType, getPartialReductionExtendKind(LHSExt)},
6624 /*ExtendB=*/{}};
6625 }
6626
6627 std::optional<TTI::PartialReductionExtendKind> OuterExtKind;
      // NOTE(review): source line 6628 is missing here. It holds the `if`
      // that checks whether Op is itself an extend and opens the block below.
6629 auto *CastRecipe = cast<VPWidenCastRecipe>(Op);
6630 VPValue *CastSource = CastRecipe->getOperand(0);
6631 OuterExtKind = getPartialReductionExtendKind(CastRecipe);
6632 if (match(CastSource, m_Mul(m_VPValue(), m_VPValue())) ||
6633 match(CastSource, m_FMul(m_VPValue(), m_VPValue()))) {
6634 // Match: ext(mul(...))
6635 // Record the outer extend kind and set `Op` to the mul. We can then match
6636 // this as a binary operation. Note: We can optimize out the outer extend
6637 // by widening the inner extends to match it. See
6638 // optimizeExtendsForPartialReduction.
6639 Op = CastSource;
6640 } else {
      // Plain ext(...): the reduction update itself is the user of the extend
      // and there is no second extend.
6641 return ExtendedReductionOperand{
6642 UpdateR,
6643 /*ExtendA=*/{CastSource->getScalarType(), *OuterExtKind},
6644 /*ExtendB=*/{}};
6645 }
6646 }
6647
6648 if (!Op->hasOneUse())
6649 return std::nullopt;
6650
      // NOTE(review): source line 6651 is missing here. It holds the
      // declaration of `MulOp`, presumably derived from Op.
6652 if (!MulOp ||
6653 !is_contained({Instruction::Mul, Instruction::FMul}, MulOp->getOpcode()))
6654 return std::nullopt;
6655
6656 // The rest of the matching assumes `Op` is a (possibly extended) mul
6657 // operation.
6658
6659 VPValue *LHS = MulOp->getOperand(0);
6660 VPValue *RHS = MulOp->getOperand(1);
6661
6662 // The LHS of the operation must always be an extend.
      // NOTE(review): source line 6663 is missing here. It holds the `if`
      // condition guarding the return below.
6664 return std::nullopt;
6665
6666 auto *LHSCast = cast<VPWidenCastRecipe>(LHS);
6667 Type *LHSInputType = LHSCast->getOperand(0)->getScalarType();
6668 ExtendKind LHSExtendKind = getPartialReductionExtendKind(LHSCast);
6669
6670 // The RHS of the operation can be an extend or a constant integer.
6671 const APInt *RHSConst = nullptr;
6672 VPWidenCastRecipe *RHSCast = nullptr;
      // NOTE(review): source line 6673 is missing here. It holds the `if`
      // that checks whether RHS is an extend.
6674 RHSCast = cast<VPWidenCastRecipe>(RHS);
6675 else if (!match(RHS, m_APInt(RHSConst)) ||
6676 !canConstantBeExtended(RHSConst, LHSInputType, LHSExtendKind))
6677 return std::nullopt;
6678
6679 // The outer extend kind must match the inner extends for folding.
      // RHSCast is null for a constant RHS and is skipped by the null check.
6680 for (VPWidenCastRecipe *Cast : {LHSCast, RHSCast})
6681 if (Cast && OuterExtKind &&
6682 getPartialReductionExtendKind(Cast) != OuterExtKind)
6683 return std::nullopt;
6684
      // A constant RHS is described as if it were extended the same way as
      // the LHS.
6685 Type *RHSInputType = LHSInputType;
6686 ExtendKind RHSExtendKind = LHSExtendKind;
6687 if (RHSCast) {
6688 RHSInputType = RHSCast->getOperand(0)->getScalarType();
6689 RHSExtendKind = getPartialReductionExtendKind(RHSCast);
6690 }
6691
6692 return ExtendedReductionOperand{
6693 MulOp, {LHSInputType, LHSExtendKind}, {RHSInputType, RHSExtendKind}};
6694}
6695
6696/// Examines each operation in the reduction chain corresponding to \p RedPhiR,
6697/// and determines if the target can use a cheaper operation with a wider
6698/// per-iteration input VF and narrower PHI VF. If successful, returns the chain
6699/// of operations in the reduction.
///
/// \returns the links of the chain in program order, or std::nullopt if no
/// reduction result is found, if a link is not a binary VPWidenRecipe with an
/// extended operand, or if the PHI's bit width is not a known multiple of the
/// extend's source bit width.
///
/// NOTE(review): in the code visible here \p CostCtx and \p Range are never
/// used, so no target cost query happens in this function -- confirm against
/// upstream. This text comes from a listing that dropped source lines (see
/// the note at the gap).
6700static std::optional<SmallVector<VPPartialReductionChain>>
6701getScaledReductions(VPReductionPHIRecipe *RedPhiR, VPCostContext &CostCtx,
6702 VFRange &Range) {
6703 // Get the backedge value from the reduction PHI and find the
6704 // ComputeReductionResult that uses it (directly or through a select for
6705 // predicated reductions).
6706 auto *RdxResult = vputils::findComputeReductionResult(RedPhiR);
6707 if (!RdxResult)
6708 return std::nullopt;
6709 VPValue *ExitValue = RdxResult->getOperand(0);
      // If the exit value is a select, continue from its second operand (the
      // value chosen when the condition is true); the match result is
      // deliberately ignored, leaving ExitValue unchanged otherwise.
6710 match(ExitValue, m_Select(m_VPValue(), m_VPValue(ExitValue), m_VPValue()));
6711
      // NOTE(review): source line 6712 is missing here. It holds the
      // declaration of `Chain`, the container filled and returned below.
6713 RecurKind RK = RedPhiR->getRecurrenceKind();
6714 Type *PhiType = RedPhiR->getScalarType();
6715 TypeSize PHISize = PhiType->getPrimitiveSizeInBits();
6716
6717 // Work backwards from the ExitValue examining each reduction operation.
6718 VPValue *CurrentValue = ExitValue;
6719 while (CurrentValue != RedPhiR) {
6720 auto *UpdateR = dyn_cast<VPWidenRecipe>(CurrentValue);
6721 if (!UpdateR || !Instruction::isBinaryOp(UpdateR->getOpcode()))
6722 return std::nullopt;
6723
6724 VPValue *Op = UpdateR->getOperand(1);
6725 VPValue *PrevValue = UpdateR->getOperand(0);
6726
6727 // Find the extended operand. The other operand (PrevValue) is the next link
6728 // in the reduction chain.
      // Operand 1 is tried first; if it does not match, the roles of the two
      // operands are swapped.
6729 std::optional<ExtendedReductionOperand> ExtendedOp =
6730 matchExtendedReductionOperand(UpdateR, Op);
6731 if (!ExtendedOp) {
6732 ExtendedOp = matchExtendedReductionOperand(UpdateR, PrevValue);
6733 if (!ExtendedOp)
6734 return std::nullopt;
6735 std::swap(Op, PrevValue);
6736 }
6737
6738 Type *ExtSrcType = ExtendedOp->ExtendA.SrcType;
6739 TypeSize ExtSrcSize = ExtSrcType->getPrimitiveSizeInBits();
6740 if (!PHISize.hasKnownScalarFactor(ExtSrcSize))
6741 return std::nullopt;
6742
6743 // Check if a partial reduction chain is supported by the target (i.e. does
6744 // not have an invalid cost) for the given VF range. Clamps the range and
6745 // returns true if feasible for any VF.
      // NOTE(review): the comment above looks stale. The visible code only
      // records the link (accumulator operand index and scale factor) and
      // performs no cost check or range clamping -- confirm.
6746 VPPartialReductionChain Link(
6747 {UpdateR, *ExtendedOp, RK,
6748 PrevValue == UpdateR->getOperand(0) ? 0U : 1U,
6749 static_cast<unsigned>(PHISize.getKnownScalarFactor(ExtSrcSize))});
6750 Chain.push_back(Link);
6751 CurrentValue = PrevValue;
6752 }
6753
6754 // The chain links were collected by traversing backwards from the exit value.
6755 // Reverse the chains so they are in program order.
6756 std::reverse(Chain.begin(), Chain.end());
6757 return Chain;
6758}
6759} // namespace
6760
6762 VPCostContext &CostCtx,
6763 VFRange &Range) {
6764 // Find all possible valid partial reductions, grouping chains by their PHI.
6765 // This grouping allows invalidating the whole chain, if any link is not a
6766 // valid partial reduction.
6768 ChainsByPhi;
6769 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
6770 for (VPRecipeBase &R : HeaderVPBB->phis()) {
6771 auto *RedPhiR = dyn_cast<VPReductionPHIRecipe>(&R);
6772 if (!RedPhiR)
6773 continue;
6774
6775 if (auto Chains = getScaledReductions(RedPhiR, CostCtx, Range))
6776 ChainsByPhi.try_emplace(RedPhiR, std::move(*Chains));
6777 }
6778
6779 if (ChainsByPhi.empty())
6780 return;
6781
6782 // Build set of partial reduction operations for extend user validation and
6783 // a map of reduction bin ops to their scale factors for scale validation.
6784 SmallPtrSet<VPRecipeBase *, 4> PartialReductionOps;
6785 DenseMap<VPSingleDefRecipe *, unsigned> ScaledReductionMap;
6786 for (const auto &[_, Chains] : ChainsByPhi)
6787 for (const VPPartialReductionChain &Chain : Chains) {
6788 PartialReductionOps.insert(Chain.ExtendedOp.ExtendsUser);
6789 ScaledReductionMap[Chain.ReductionBinOp] = Chain.ScaleFactor;
6790 }
6791
6792 // A partial reduction is invalid if any of its extends are used by
6793 // something that isn't another partial reduction. This is because the
6794 // extends are intended to be lowered along with the reduction itself.
6795 auto ExtendUsersValid = [&](VPValue *Ext) {
6796 return !isa<VPWidenCastRecipe>(Ext) || all_of(Ext->users(), [&](VPUser *U) {
6797 return PartialReductionOps.contains(cast<VPRecipeBase>(U));
6798 });
6799 };
6800
6801 auto IsProfitablePartialReductionChainForVF =
6802 [&](ArrayRef<VPPartialReductionChain> Chain, ElementCount VF) -> bool {
6803 InstructionCost PartialCost = 0, RegularCost = 0;
6804
6805 // The chain is a profitable partial reduction chain if the cost of handling
6806 // the entire chain is cheaper when using partial reductions than when
6807 // handling the entire chain using regular reductions.
6808 for (const VPPartialReductionChain &Link : Chain) {
6809 const ExtendedReductionOperand &ExtendedOp = Link.ExtendedOp;
6810 InstructionCost LinkCost = getPartialReductionLinkCost(CostCtx, Link, VF);
6811 if (!LinkCost.isValid())
6812 return false;
6813
6814 PartialCost += LinkCost;
6815 RegularCost += Link.ReductionBinOp->computeCost(VF, CostCtx);
6816 // If ExtendB is not none, then the "ExtendsUser" is the binary operation.
6817 if (ExtendedOp.ExtendB.Kind != ExtendKind::PR_None)
6818 RegularCost += ExtendedOp.ExtendsUser->computeCost(VF, CostCtx);
6819 for (VPValue *Op : ExtendedOp.ExtendsUser->operands())
6820 if (auto *Extend = dyn_cast<VPWidenCastRecipe>(Op))
6821 RegularCost += Extend->computeCost(VF, CostCtx);
6822 }
6823 return PartialCost.isValid() && PartialCost < RegularCost;
6824 };
6825
6826 // Validate chains: check that extends are only used by partial reductions,
6827 // and that reduction bin ops are only used by other partial reductions with
6828 // matching scale factors, are outside the loop region or the select
6829 // introduced by tail-folding. Otherwise we would create users of scaled
6830 // reductions where the types of the other operands don't match.
6831 for (auto &[RedPhiR, Chains] : ChainsByPhi) {
6832 for (const VPPartialReductionChain &Chain : Chains) {
6833 if (!all_of(Chain.ExtendedOp.ExtendsUser->operands(), ExtendUsersValid)) {
6834 Chains.clear();
6835 break;
6836 }
6837 auto UseIsValid = [&, RedPhiR = RedPhiR](VPUser *U) {
6838 if (auto *PhiR = dyn_cast<VPReductionPHIRecipe>(U))
6839 return PhiR == RedPhiR;
6840 auto *R = cast<VPSingleDefRecipe>(U);
6841 return Chain.ScaleFactor == ScaledReductionMap.lookup_or(R, 0) ||
6843 m_Specific(Chain.ReductionBinOp))) ||
6844 match(R, m_Select(m_VPValue(), m_Specific(Chain.ReductionBinOp),
6845 m_Specific(RedPhiR)));
6846 };
6847 if (!all_of(Chain.ReductionBinOp->users(), UseIsValid)) {
6848 Chains.clear();
6849 break;
6850 }
6851
6852 // Check if the compute-reduction-result is used by a sunk store.
6853 // TODO: Also form partial reductions in those cases.
6854 if (auto *RdxResult = vputils::findComputeReductionResult(RedPhiR)) {
6855 if (any_of(RdxResult->users(), [](VPUser *U) {
6856 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
6857 return RepR && RepR->getOpcode() == Instruction::Store;
6858 })) {
6859 Chains.clear();
6860 break;
6861 }
6862 }
6863 }
6864
6865 // Clear the chain if it is not profitable.
6867 [&, &Chains = Chains](ElementCount VF) {
6868 return IsProfitablePartialReductionChainForVF(Chains, VF);
6869 },
6870 Range))
6871 Chains.clear();
6872 }
6873
6874 for (auto &[Phi, Chains] : ChainsByPhi)
6875 for (const VPPartialReductionChain &Chain : Chains)
6876 transformToPartialReduction(Chain, Plan, Phi);
6877}
6878
6880 VPlan &Plan, VFRange &Range, VPRecipeBuilder &RecipeBuilder) {
6881 // Collect all loads/stores first. We will start with ones having simpler
6882 // decisions followed by more complex ones that are potentially
6883 // guided/dependent on the simpler ones.
6885 for (VPBasicBlock *VPBB :
6888 for (VPRecipeBase &R : *VPBB) {
6889 auto *VPI = dyn_cast<VPInstruction>(&R);
6890 if (VPI && VPI->getUnderlyingValue() &&
6891 is_contained({Instruction::Load, Instruction::Store},
6892 VPI->getOpcode()))
6893 MemOps.push_back(VPI);
6894 }
6895 }
6896
6897 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
6898 VPBuilder FinalRedStoresBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
6899
6900 for (VPInstruction *VPI : MemOps) {
6901 auto ReplaceWith = [&](VPRecipeBase *New) {
6902 New->insertBefore(VPI);
6903 if (VPI->getOpcode() == Instruction::Load)
6904 VPI->replaceAllUsesWith(New->getVPSingleValue());
6905 VPI->eraseFromParent();
6906 };
6907
6908 // Note: we must do that for scalar VPlan as well.
6909 if (RecipeBuilder.replaceWithFinalIfReductionStore(VPI,
6910 FinalRedStoresBuilder))
6911 continue;
6912
6913 // Filter out scalar VPlan for the remaining memory operations.
6915 [](ElementCount VF) { return VF.isScalar(); }, Range))
6916 continue;
6917
6918 if (VPHistogramRecipe *Histogram = RecipeBuilder.widenIfHistogram(VPI)) {
6919 ReplaceWith(Histogram);
6920 continue;
6921 }
6922
6923 VPRecipeBase *Recipe = RecipeBuilder.tryToWidenMemory(VPI, Range);
6924 if (!Recipe)
6925 Recipe = RecipeBuilder.handleReplication(VPI, Range);
6926
6927 ReplaceWith(Recipe);
6928 }
6929}
6930
6933 [&](ElementCount VF) { return VF.isScalar(); }, Range))
6934 return;
6935
6937 Plan.getEntry());
6939 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
6940 auto *VPI = dyn_cast<VPInstruction>(&R);
6941 if (!VPI)
6942 continue;
6943
6944 auto *I = cast_or_null<Instruction>(VPI->getUnderlyingValue());
6945 // Wouldn't be able to create a `VPReplicateRecipe` anyway.
6946 if (!I)
6947 continue;
6948
6949 // If executing other lanes produces side-effects we can't avoid them.
6950 if (VPI->mayHaveSideEffects())
6951 continue;
6952
6953 // We want to drop the mask operand, verify we can safely do that.
6954 if (VPI->isMasked() && !VPI->isSafeToSpeculativelyExecute())
6955 continue;
6956
6957 // Avoid rewriting IV increment as that interferes with
6958 // `removeRedundantCanonicalIVs`.
6959 if (VPI->getOpcode() == Instruction::Add &&
6961 continue;
6962
6963 // Other lanes are needed - can't drop them.
6965 continue;
6966
6967 auto *Recipe = new VPReplicateRecipe(
6968 I, VPI->operandsWithoutMask(), /*IsSingleScalar=*/true,
6969 /*Mask=*/nullptr, *VPI, *VPI, VPI->getDebugLoc());
6970 Recipe->insertBefore(VPI);
6971 VPI->replaceAllUsesWith(Recipe);
6972 VPI->eraseFromParent();
6973 }
6974 }
6975}
6976
6977/// Returns true if \p Info's parameter kinds are compatible with \p Args.
6978static bool areVFParamsOk(const VFInfo &Info, ArrayRef<VPValue *> Args,
6979 PredicatedScalarEvolution &PSE, const Loop *L) {
6980 ScalarEvolution *SE = PSE.getSE();
6981 return all_of(Info.Shape.Parameters, [&](VFParameter Param) {
6982 switch (Param.ParamKind) {
6983 case VFParamKind::Vector:
6984 case VFParamKind::GlobalPredicate:
6985 return true;
6986 case VFParamKind::OMP_Uniform:
6987 return SE->isSCEVable(Args[Param.ParamPos]->getScalarType()) &&
6988 SE->isLoopInvariant(
6989 vputils::getSCEVExprForVPValue(Args[Param.ParamPos], PSE, L),
6990 L);
6991 case VFParamKind::OMP_Linear:
6992 return match(vputils::getSCEVExprForVPValue(Args[Param.ParamPos], PSE, L),
6993 m_scev_AffineAddRec(
6994 m_SCEV(), m_scev_SpecificSInt(Param.LinearStepOrPos),
6995 m_SpecificLoop(L)));
6996 default:
6997 return false;
6998 }
6999 });
7000}
7001
7002/// Find a vector variant of \p CI for \p VF, respecting \p MaskRequired.
7003/// Returns the variant function, or nullptr. Masked variants are assumed to
7004/// take the mask as a trailing parameter.
7006 ElementCount VF, bool MaskRequired,
7008 const Loop *L) {
7009 if (CI->isNoBuiltin())
7010 return nullptr;
7011 auto Mappings = VFDatabase::getMappings(*CI);
7012 const auto *It = find_if(Mappings, [&](const VFInfo &Info) {
7013 return Info.Shape.VF == VF && (!MaskRequired || Info.isMasked()) &&
7014 areVFParamsOk(Info, Args, PSE, L);
7015 });
7016 if (It == Mappings.end())
7017 return nullptr;
7018 return CI->getModule()->getFunction(It->VectorName);
7019}
7020
7021namespace {
7022/// The outcome of choosing how to widen a call at a given VF.
7023struct CallWideningDecision {
7024 enum class KindTy { Scalarize, Intrinsic, VectorVariant };
7025 CallWideningDecision(KindTy Kind, Function *Variant = nullptr)
7026 : Kind(Kind), Variant(Variant) {}
7027 KindTy Kind;
7028
7029 /// Set when Kind == VectorVariant.
7031
7032 bool operator==(const CallWideningDecision &Other) const {
7033 return Kind == Other.Kind && Variant == Other.Variant;
7034 }
7035};
7036} // namespace
7037
7038/// Pick the cheapest widening for the call \p VPI at \p VF among scalarization,
7039/// vector intrinsic, and vector library variant.
7040static CallWideningDecision decideCallWidening(VPInstruction &VPI,
7042 ElementCount VF,
7043 VPCostContext &CostCtx) {
7044 auto *CI = cast<CallInst>(VPI.getUnderlyingInstr());
7045
7046 // Scalar VFs and calls forced or known to scalarize always replicate.
7047 if (VF.isScalar() || CostCtx.willBeScalarized(CI, VF))
7048 return CallWideningDecision::KindTy::Scalarize;
7049
7050 auto *CalledFn = cast<Function>(
7052 Type *ResultTy = VPI.getScalarType();
7054 bool MaskRequired = CostCtx.isMaskRequired(CI);
7055
7056 // Pseudo intrinsics (assume, lifetime, ...) are always scalarized.
7058 return CallWideningDecision::KindTy::Scalarize;
7059
7060 InstructionCost ScalarCost =
7061 VPReplicateRecipe::computeCallCost(CalledFn, ResultTy, Ops,
7062 /*IsSingleScalar=*/false, VF, CostCtx);
7063
7064 Function *VecFunc =
7065 findVectorVariant(CI, Ops, VF, MaskRequired, CostCtx.PSE, CostCtx.L);
7067 if (VecFunc)
7068 VecCallCost = VPWidenCallRecipe::computeCallCost(VecFunc, CostCtx);
7069
7070 // Prefer the intrinsic if it is at least as cheap as scalarizing and any
7071 // available vector variant.
7072 if (ID) {
7075 if (IntrinsicCost.isValid() && ScalarCost >= IntrinsicCost &&
7076 (!VecFunc || VecCallCost >= IntrinsicCost))
7077 return CallWideningDecision::KindTy::Intrinsic;
7078 }
7079
7080 // Otherwise, use a vector library variant when it beats scalarizing.
7081 if (VecFunc && ScalarCost >= VecCallCost)
7082 return {CallWideningDecision::KindTy::VectorVariant, VecFunc};
7083
7084 return CallWideningDecision::KindTy::Scalarize;
7085}
7086
7088 VPRecipeBuilder &RecipeBuilder,
7089 VPCostContext &CostCtx) {
7092 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
7093 auto *VPI = dyn_cast<VPInstruction>(&R);
7094 if (!VPI || !VPI->getUnderlyingValue() ||
7095 VPI->getOpcode() != Instruction::Call)
7096 continue;
7097
7098 auto *CI = cast<CallInst>(VPI->getUnderlyingInstr());
7099 SmallVector<VPValue *, 4> Ops(VPI->op_begin(),
7100 VPI->op_begin() + CI->arg_size());
7101
7102 CallWideningDecision Decision =
7103 decideCallWidening(*VPI, Ops, Range.Start, CostCtx);
7105 [&](ElementCount VF) {
7106 return Decision == decideCallWidening(*VPI, Ops, VF, CostCtx);
7107 },
7108 Range);
7109
7110 VPSingleDefRecipe *Replacement = nullptr;
7111 switch (Decision.Kind) {
7112 case CallWideningDecision::KindTy::Intrinsic: {
7114 Type *ResultTy = VPI->getScalarType();
7115 Replacement = new VPWidenIntrinsicRecipe(*CI, ID, Ops, ResultTy, *VPI,
7116 *VPI, VPI->getDebugLoc());
7117 break;
7118 }
7119 case CallWideningDecision::KindTy::VectorVariant: {
7120 // Masked variants take the mask as a trailing parameter, so they have
7121 // one more parameter than the original call's arguments.
7122 if (Decision.Variant->arg_size() > Ops.size()) {
7123 VPValue *Mask = VPI->isMasked() ? VPI->getMask() : Plan.getTrue();
7124 Ops.push_back(Mask);
7125 }
7126 Ops.push_back(VPI->getOperand(VPI->getNumOperandsWithoutMask() - 1));
7127 Replacement = new VPWidenCallRecipe(CI, Decision.Variant, Ops, *VPI,
7128 *VPI, VPI->getDebugLoc());
7129 break;
7130 }
7131 case CallWideningDecision::KindTy::Scalarize:
7132 Replacement = RecipeBuilder.handleReplication(VPI, Range);
7133 break;
7134 }
7135
7136 Replacement->insertBefore(VPI);
7137 VPI->replaceAllUsesWith(Replacement);
7138 VPI->eraseFromParent();
7139 }
7140 }
7141}
7142
7145 Loop &L, VPCostContext &Ctx,
7146 VFRange &Range) {
7147 if (Plan.hasScalarVFOnly())
7148 return;
7149
7150 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
7151 VPValue *I32VF = nullptr;
7153 vp_depth_first_shallow(VectorLoop->getEntry()))) {
7154 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
7155 auto *LoadR = dyn_cast<VPWidenLoadRecipe>(&R);
7156 // TODO: Support strided store.
7157 // TODO: Transform reverse access into strided access with -1 stride.
7158 // TODO: Transform gather/scatter with uniform address into strided access
7159 // with 0 stride.
7160 // TODO: Transform interleave access into multiple strided accesses.
7161 if (!LoadR || LoadR->isConsecutive())
7162 continue;
7163
7164 auto *Ptr = dyn_cast<VPWidenGEPRecipe>(LoadR->getAddr());
7165 if (!Ptr)
7166 continue;
7167
7168 // Check if this is a strided access by analyzing the address SCEV for an
7169 // affine addRec.
7170 const SCEV *PtrSCEV = vputils::getSCEVExprForVPValue(Ptr, PSE, &L);
7171 const SCEV *Start;
7172 const SCEVConstant *Step;
7173 // TODO: Support non-constant loop invariant stride.
7174 if (!match(PtrSCEV,
7176 m_SpecificLoop(&L))))
7177 continue;
7178
7179 Type *LoadTy = LoadR->getScalarType();
7180 Align Alignment = LoadR->getAlign();
7181 auto IsProfitable = [&](ElementCount VF) {
7182 Type *DataTy = toVectorTy(LoadTy, VF);
7183 if (!Ctx.TTI.isLegalStridedLoadStore(DataTy, Alignment))
7184 return false;
7185 const InstructionCost CurrentCost = LoadR->computeCost(VF, Ctx);
7186 const InstructionCost StridedLoadStoreCost =
7188 Intrinsic::experimental_vp_strided_load, DataTy,
7189 LoadR->isMasked(), Alignment, Ctx);
7190 return StridedLoadStoreCost < CurrentCost;
7191 };
7192
7194 Range))
7195 continue;
7196
7197 // Invalidate the legacy widening decision so the cost of replaced load is
7198 // not counted during precomputeCosts.
7199 // TODO: Remove once the legacy exit cost computation is retired.
7200 for (ElementCount VF : Range)
7201 Ctx.invalidateWideningDecision(&LoadR->getIngredient(), VF);
7202
7203 // Get VF as i32 for the vector length operand.
7204 if (!I32VF) {
7205 VPBuilder Builder(Plan.getVectorPreheader());
7206 I32VF = Builder.createScalarZExtOrTrunc(
7207 &Plan.getVF(), Type::getInt32Ty(Plan.getContext()),
7209 }
7210
7211 VPBuilder Builder(LoadR);
7212 // Create the base pointer of strided access.
7213 VPValue *StartVPV = vputils::getOrCreateVPValueForSCEVExpr(Plan, Start);
7214 VPValue *StrideInBytes = Plan.getOrAddLiveIn(Step->getValue());
7215 Type *IndexTy = Plan.getDataLayout().getIndexType(Ptr->getScalarType());
7216 assert(IndexTy == StrideInBytes->getScalarType() &&
7217 "Stride type from SCEV must match the index type");
7218 VPValue *CanIVTyStride = Builder.createScalarSExtOrTrunc(
7219 StrideInBytes, VectorLoop->getCanonicalIVType(), IndexTy,
7221 auto *AddRecPtr = cast<SCEVAddRecExpr>(PtrSCEV);
7222 auto *Offset = Builder.createOverflowingOp(
7223 Instruction::Mul, {VectorLoop->getCanonicalIV(), CanIVTyStride},
7224 {AddRecPtr->hasNoUnsignedWrap(), AddRecPtr->hasNoSignedWrap()});
7225 auto *BasePtr = Builder.createNoWrapPtrAdd(
7226 StartVPV, Offset,
7227 AddRecPtr->hasNoUnsignedWrap() ? GEPNoWrapFlags::noUnsignedWrap()
7229
7230 // Create a new vector pointer for strided access.
7231 VPValue *NewPtr = Builder.createVectorPointer(
7232 BasePtr, Type::getInt8Ty(Plan.getContext()), StrideInBytes,
7233 Ptr->getGEPNoWrapFlags(), Ptr->getDebugLoc());
7234
7235 VPValue *Mask = LoadR->getMask();
7236 if (!Mask)
7237 Mask = Plan.getTrue();
7238 auto *StridedLoad = Builder.createWidenMemIntrinsic(
7239 Intrinsic::experimental_vp_strided_load,
7240 {NewPtr, StrideInBytes, Mask, I32VF}, LoadTy, Alignment, *LoadR,
7241 LoadR->getDebugLoc());
7242 LoadR->replaceAllUsesWith(StridedLoad);
7243 }
7244 }
7245}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU Register Bank Select
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
static const Function * getParent(const Value *V)
#define X(NUM, ENUM, NAME)
Definition ELF.h:853
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static cl::opt< IntrinsicCostStrategy > IntrinsicCost("intrinsic-cost-strategy", cl::desc("Costing strategy for intrinsic instructions"), cl::init(IntrinsicCostStrategy::InstructionCost), cl::values(clEnumValN(IntrinsicCostStrategy::InstructionCost, "instruction-cost", "Use TargetTransformInfo::getInstructionCost"), clEnumValN(IntrinsicCostStrategy::IntrinsicCost, "intrinsic-cost", "Use TargetTransformInfo::getIntrinsicInstrCost"), clEnumValN(IntrinsicCostStrategy::TypeBasedIntrinsicCost, "type-based-intrinsic-cost", "Calculate the intrinsic cost based only on argument types")))
static bool isSentinel(const DWARFDebugNames::AttributeEncoding &AE)
@ Default
Hexagon Common GEP
#define _
iv Induction Variable Users
Definition IVUsers.cpp:48
iv users
Definition IVUsers.cpp:48
static std::pair< Value *, APInt > getMask(Value *WideMask, unsigned Factor, ElementCount LeafValueEC)
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
licm
Definition LICM.cpp:383
Legalize the Machine IR a function's Machine IR
Definition Legalizer.cpp:81
#define I(x, y, z)
Definition MD5.cpp:57
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
This file provides utility analysis objects describing memory locations.
This file contains the declarations for metadata subclasses.
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
#define P(N)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
R600 Clause Merge
const SmallVectorImpl< MachineOperand > & Cond
static bool dominates(InstrPosIndexes &PosIndexes, const MachineInstr &A, const MachineInstr &B)
This file contains some templates that are useful if you are working with the STL at all.
This is the interface for a metadata-based scoped no-alias analysis.
This file defines generic set operations that may be used on sets of different types,...
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file implements dominator tree analysis for a single level of a VPlan's H-CFG.
This file contains the declarations of different VPlan-related auxiliary helpers.
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectComplementaryPredicatedMemOps(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
static void removeCommonBlendMask(VPBlendRecipe *Blend)
Try to see if all of Blend's masks share a common value logically and'ed and remove it from the masks...
static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to create abstract recipes from the reduction recipe for following optimizations ...
static VPReplicateRecipe * findRecipeWithMinAlign(ArrayRef< VPReplicateRecipe * > Group)
static CallWideningDecision decideCallWidening(VPInstruction &VPI, ArrayRef< VPValue * > Ops, ElementCount VF, VPCostContext &CostCtx)
Pick the cheapest widening for the call VPI at VF among scalarization, vector intrinsic,...
static bool areVFParamsOk(const VFInfo &Info, ArrayRef< VPValue * > Args, PredicatedScalarEvolution &PSE, const Loop *L)
Returns true if Info's parameter kinds are compatible with Args.
static bool simplifyLogicalRecipe(VPSingleDefRecipe *Def, VPBuilder &Builder, bool CanCreateNewRecipe)
Try to simplify logical and bitwise recipes in Def.
static bool sinkScalarOperands(VPlan &Plan)
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Try to simplify the branch condition of Plan.
static VPValue * cloneBinOpForScalarIV(VPWidenRecipe *BinOp, VPValue *ScalarIV, VPWidenIntOrFpInductionRecipe *WidenIV)
Create a scalar version of BinOp, with its WidenIV operand replaced by ScalarIV, and place it after S...
static VPWidenIntOrFpInductionRecipe * getExpressionIV(VPValue *V)
Check if V is a binary expression of a widened IV and a loop-invariant value.
static void removeRedundantInductionCasts(VPlan &Plan)
Remove redundant casts of inductions.
static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Return true if Cond is known to be true for given BestVF and BestUF.
static VPIRValue * tryToFoldLiveIns(VPSingleDefRecipe &R, ArrayRef< VPValue * > Operands, const DataLayout &DL, LLVMContext &Ctx)
Try to fold R using InstSimplifyFolder.
static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, unsigned UF)
Try to replace multiple active lane masks used for control flow with a single, wide active lane mask ...
static std::optional< std::pair< bool, unsigned > > getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R)
Get any instruction opcode or intrinsic ID data embedded in recipe R.
static VPExpressionRecipe * tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static RemoveMask_match< Op0_t, Op1_t > m_RemoveMask(const Op0_t &In, Op1_t &Out)
Match a specific mask In, or a combination of it (logical-and In, Out).
static std::optional< ElementCount > isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, ArrayRef< ElementCount > VFs, const TargetTransformInfo &TTI)
Returns VF from VFs if IR is a full interleave group with factor and number of members both equal to ...
static Type * getLoadStoreValueType(VPReplicateRecipe *R, bool IsLoad)
Get the value type of the replicate load or store.
static VPIRMetadata getCommonMetadata(ArrayRef< VPReplicateRecipe * > Recipes)
static VPValue * getPredicatedMask(VPRegionBlock *R)
If R is a region with a VPBranchOnMaskRecipe in the entry block, return the mask.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan)
static Function * findVectorVariant(CallInst *CI, ArrayRef< VPValue * > Args, ElementCount VF, bool MaskRequired, PredicatedScalarEvolution &PSE, const Loop *L)
Find a vector variant of CI for VF, respecting MaskRequired.
static VPScalarIVStepsRecipe * createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, Instruction::BinaryOps InductionOpcode, FPMathOperator *FPBinOp, Instruction *TruncI, VPIRValue *StartV, VPValue *Step, DebugLoc DL, VPBuilder &Builder)
static VPWidenInductionRecipe * getOptimizableIVOf(VPValue *VPV, PredicatedScalarEvolution &PSE)
Check if VPV is an untruncated wide induction, either before or after the increment.
static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL)
After replacing the canonical IV with an EVL-based IV, fix up recipes that use VF to use the EVL instea...
static bool handleUncountableExitsWithSideEffects(VPlan &Plan, SmallVectorImpl< EarlyExitInfo > &Exits, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VPBasicBlock *MiddleVPBB, Loop *TheLoop, PredicatedScalarEvolution &PSE, DominatorTree &DT, AssumptionCache *AC, VPDominatorTree &VPDT)
Update Plan to mask memory operations in the loop based on whether the early exit is taken or not.
static bool canNarrowLoad(VPSingleDefRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx, bool IsScalable)
Returns true if V is VPWidenLoadRecipe or VPInterleaveRecipe that can be converted to a narrower reci...
static void simplifyRecipe(VPSingleDefRecipe *Def)
Try to simplify VPSingleDefRecipe Def.
static bool isDeadRecipe(VPRecipeBase &R)
Returns true if R is dead and can be removed.
static void legalizeAndOptimizeInductions(VPlan &Plan)
Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd (IndStart, ScalarIVSteps (0,...
static void addReplicateRegions(VPlan &Plan)
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectGroupedReplicateMemOps(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L, function_ref< bool(VPReplicateRecipe *)> FilterFn)
Collect either replicated Loads or Stores grouped by their address SCEV and their load-store type,...
static VPValue * tryToComputeEndValueForInduction(VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, VPValue *VectorTC)
Compute the end value for WideIV, unless it is truncated.
static std::optional< Intrinsic::ID > getVPDivRemIntrinsic(Intrinsic::ID IntrID)
static void removeRedundantExpandSCEVRecipes(VPlan &Plan)
Remove redundant ExpandSCEVRecipes in Plan's entry block by replacing them with already existing reci...
static VPValue * optimizeEarlyExitInductionUser(VPlan &Plan, VPValue *Op, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the early exit block.
static VPValue * scalarizeVPWidenPointerInduction(VPWidenPointerInductionRecipe *PtrIV, VPlan &Plan, VPBuilder &Builder)
Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd (IndStart,...
static VPValue * narrowInterleaveGroupOp(ArrayRef< VPValue * > Members, SmallPtrSetImpl< VPValue * > &NarrowedOps)
static SmallVector< VPUser * > collectUsersRecursively(VPValue *V)
static VPValue * optimizeLatchExitInductionUser(VPlan &Plan, VPValue *Op, DenseMap< VPValue *, VPValue * > &EndValues, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the exit block coming from the l...
static void recursivelyDeleteDeadRecipes(VPValue *V)
static void reassociateHeaderMask(VPlan &Plan)
Reassociate (headermask && x) && y -> headermask && (x && y) to allow the header mask to be simplifie...
static VPActiveLaneMaskPHIRecipe * addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan)
static void expandVPDerivedIV(VPDerivedIVRecipe *R)
Expand a VPDerivedIVRecipe into executable recipes.
static VPBasicBlock * getPredicatedThenBlock(VPRegionBlock *R)
If R is a triangle region, return the 'then' block of the triangle.
static bool canHoistOrSinkWithNoAliasCheck(const MemoryLocation &MemLoc, VPBasicBlock *FirstBB, VPBasicBlock *LastBB, std::optional< SinkStoreInfo > SinkInfo={})
Check if a memory operation doesn't alias with memory operations using scoped noalias metadata,...
static VPRegionBlock * createReplicateRegion(VPReplicateRecipe *PredRecipe, VPRegionBlock *ParentRegion, VPlan &Plan)
static void simplifyBlends(VPlan &Plan)
Normalize and simplify VPBlendRecipes.
static std::optional< Instruction::BinaryOps > getUnmaskedDivRemOpcode(Intrinsic::ID ID)
static bool isAlreadyNarrow(VPValue *VPV)
Returns true if VPValue is a narrow VPValue.
static bool cannotHoistOrSinkRecipe(VPRecipeBase &R, VPBasicBlock *FirstBB, VPBasicBlock *LastBB)
Return true if we do not know how to (mechanically) hoist or sink a non-memory or memory recipe R out...
static bool canNarrowOps(ArrayRef< VPValue * > Ops, bool IsScalable)
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF)
Optimize the width of vector induction variables in Plan based on a known constant Trip Count,...
static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static bool canSinkStoreWithNoAliasCheck(ArrayRef< VPReplicateRecipe * > StoresToSink, PredicatedScalarEvolution &PSE, const Loop &L)
static void expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR)
Expand a VPWidenIntOrFpInduction into executable recipes, for the initial value, phi and backedge val...
static VPRecipeBase * optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, VPValue &EVL)
Try to optimize a CurRecipe masked by HeaderMask to a corresponding EVL-based recipe without the head...
static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R)
Expand a VPWidenPointerInductionRecipe into executable recipes, for the initial value,...
static void narrowToSingleScalarRecipes(VPlan &Plan)
This file provides utility VPlan to VPlan transformations.
#define RUN_VPLAN_PASS(PASS,...)
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const X86InstrFMA3Group Groups[]
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Helper for extra no-alias checks via known-safe recipe and SCEV.
SinkStoreInfo(const SmallPtrSetImpl< VPRecipeBase * > &ExcludeRecipes, VPReplicateRecipe &GroupLeader, PredicatedScalarEvolution &PSE, const Loop &L)
bool shouldSkip(VPRecipeBase &R) const
Return true if R should be skipped during alias checking, either because it's in the exclude set or b...
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1055
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1563
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1535
APInt abs() const
Get the absolute value.
Definition APInt.h:1818
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1511
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:1028
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228
An arbitrary precision integer that knows its signedness.
Definition APSInt.h:24
static APSInt getMinValue(uint32_t numBits, bool Unsigned)
Return the APSInt representing the minimum integer value with the given bit width and signedness.
Definition APSInt.h:310
static APSInt getMaxValue(uint32_t numBits, bool Unsigned)
Return the APSInt representing the maximum integer value with the given bit width and signedness.
Definition APSInt.h:302
@ NoAlias
The two locations do not alias at all.
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & back() const
Get the last element.
Definition ArrayRef.h:150
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition ArrayRef.h:194
const T & front() const
Get the first element.
Definition ArrayRef.h:144
iterator end() const
Definition ArrayRef.h:130
iterator begin() const
Definition ArrayRef.h:129
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
This class represents a function call, abstracting a target machine's calling convention.
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:765
@ ICMP_NE
not equal
Definition InstrTypes.h:762
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:766
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:750
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:852
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
This class represents a range of values.
LLVM_ABI bool contains(const APInt &Val) const
Return true if the specified value is in the set.
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
LLVM_ABI IntegerType * getIndexType(LLVMContext &C, unsigned AddressSpace) const
Returns the type of a GEP index in AddressSpace.
A debug info location.
Definition DebugLoc.h:124
static DebugLoc getCompilerGenerated()
Definition DebugLoc.h:152
static DebugLoc getUnknown()
Definition DebugLoc.h:151
ValueT lookup(const_arg_type_t< KeyT > Val) const
Return the entry for the specified key, or a default constructed value if no such entry exists.
Definition DenseMap.h:252
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:301
ValueT lookup_or(const_arg_type_t< KeyT > Val, U &&Default) const
Definition DenseMap.h:262
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:155
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:202
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
Definition Operator.h:291
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
size_t arg_size() const
Definition Function.h:901
Represents flags for the getelementptr instruction/expression.
static GEPNoWrapFlags noUnsignedWrap()
GEPNoWrapFlags withoutNoUnsignedWrap() const
static GEPNoWrapFlags none()
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
A struct for saving information about induction variables.
static LLVM_ABI InductionDescriptor getCanonicalIntInduction(Type *Ty, ScalarEvolution &SE)
Returns the canonical integer induction for type Ty with start = 0 and step = 1.
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_NoInduction
Not an induction variable.
@ IK_FpInduction
Floating point induction variable.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr if the function does not...
bool isBinaryOp() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
bool isIntDivRem() const
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:350
The group of interleaved loads/stores sharing the same stride and close to each other.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition VPlan.cpp:1666
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight, bool IsExpected=false)
Return metadata containing two branch weights.
Definition MDBuilder.cpp:38
Metadata node.
Definition Metadata.h:1075
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:38
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:110
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition MapVector.h:118
bool empty() const
Definition MapVector.h:79
Representation for a specific memory location.
AAMDNodes AATags
The metadata nodes which describes the aliasing of the location (each member is null if that kind of ...
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition Module.cpp:235
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition Operator.h:43
Post-order traversal of a graph.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
static bool isFindLastRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
RegionT * getParent() const
Get the parent of the Region.
Definition RegionInfo.h:362
This class represents a constant integer value.
ConstantInt * getValue() const
This class uses information about analyzed scalars to rewrite expressions in canonical form.
LLVM_ABI Value * expandCodeFor(SCEVUse SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
static const SCEV * rewrite(const SCEV *Scev, ScalarEvolution &SE, ValueToSCEVMapTy &Map)
This class represents an analyzed expression in the program.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
LLVM_ABI const SCEV * getUDivExpr(SCEVUse LHS, SCEVUse RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const DataLayout & getDataLayout() const
Return the DataLayout associated with the module this SCEV instance is operating on.
LLVM_ABI const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
LLVM_ABI bool isKnownNonZero(const SCEV *S)
Test if the given expression is known to be non-zero.
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(SCEVUse LHS, SCEVUse RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
ConstantRange getSignedRange(const SCEV *S)
Determine the signed range for a particular SCEV.
LLVM_ABI bool isKnownPositive(const SCEV *S)
Test if the given expression is known to be positive.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
ConstantRange getUnsignedRange(const SCEV *S)
Determine the unsigned range for a particular SCEV.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< SCEVUse > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, SCEVUse LHS, SCEVUse RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
static LLVM_ABI AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB)
A vector that has set insertion semantics.
Definition SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, PartialReductionExtendKind OpAExtend, PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind, std::optional< FastMathFlags > FMF) const
@ SK_Broadcast
Broadcast element 0 to all other elements.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:89
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:98
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:309
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:282
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:307
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:368
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:276
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:232
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:306
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
op_range operands()
Definition User.h:267
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition VectorUtils.h:76
A recipe for generating the active lane mask for the vector loop that is used to predicate the vector...
Definition VPlan.h:4044
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:4399
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition VPlan.h:4474
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:4426
iterator end()
Definition VPlan.h:4436
iterator begin()
Recipe iterator methods.
Definition VPlan.h:4434
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:4487
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:266
VPBasicBlock * splitAt(iterator SplitAt)
Split current block at SplitAt by inserting a new block between the current block and its successors ...
Definition VPlan.cpp:560
const VPRecipeBase & front() const
Definition VPlan.h:4446
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition VPlan.cpp:639
const VPRecipeBase & back() const
Definition VPlan.h:4448
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition VPlan.h:2963
VPValue * getMask(unsigned Idx) const
Return mask number Idx.
Definition VPlan.h:3013
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:3003
void setMask(unsigned Idx, VPValue *V)
Set mask number Idx to V.
Definition VPlan.h:3019
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2999
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:94
void setSuccessors(ArrayRef< VPBlockBase * > NewSuccs)
Set each VPBasicBlock in NewSuccs as successor of this VPBlockBase.
Definition VPlan.h:315
VPRegionBlock * getParent()
Definition VPlan.h:186
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:236
size_t getNumSuccessors() const
Definition VPlan.h:237
void setPredecessors(ArrayRef< VPBlockBase * > NewPreds)
Set each VPBasicBlock in NewPreds as predecessor of this VPBlockBase.
Definition VPlan.h:306
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:222
VPlan * getPlan()
Definition VPlan.cpp:211
const std::string & getName() const
Definition VPlan.h:177
void clearSuccessors()
Remove all the successors of this block.
Definition VPlan.h:325
VPBlockBase * getSinglePredecessor() const
Definition VPlan.h:233
void clearPredecessors()
Remove all the predecessors of this block.
Definition VPlan.h:322
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:216
VPBlockBase * getSingleHierarchicalPredecessor()
Definition VPlan.h:279
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:227
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:211
static auto blocksAs(T &&Range)
Return an iterator range over Range with each block cast to BlockTy.
Definition VPlanUtils.h:342
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition VPlanUtils.h:361
static bool isLatch(const VPBlockBase *VPB, const VPDominatorTree &VPDT)
Returns true if VPB is a loop latch, using isHeader().
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBases IfTrue and IfFalse after BlockPtr.
Definition VPlanUtils.h:251
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:269
static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To)
Disconnect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:287
static auto blocksOnly(T &&Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:323
static void transferSuccessors(VPBlockBase *Old, VPBlockBase *New)
Transfer successors from Old to New. New must have no successors.
Definition VPlanUtils.h:307
static SmallVector< VPBasicBlock * > blocksInSingleSuccessorChainBetween(VPBasicBlock *FirstBB, VPBasicBlock *LastBB)
Returns the blocks between FirstBB and LastBB, where FirstBB to LastBB forms a single-successor chain.
A recipe for generating conditional branches on the bits of a mask.
Definition VPlan.h:3495
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPInstruction * createFirstActiveLane(ArrayRef< VPValue * > Masks, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPInstruction * createAdd(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", VPRecipeWithIRFlags::WrapFlagsTy WrapFlags={false, false})
VPInstruction * createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPValue * createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy, DebugLoc DL)
VPInstruction * createLogicalOr(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPInstruction * createNot(VPValue *Operand, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPInstruction * createAnyOfReduction(VPValue *ChainOp, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL=DebugLoc::getUnknown())
Create an AnyOf reduction pattern: or-reduce ChainOp, freeze the result, then select between TrueVal ...
Definition VPlan.cpp:1653
VPInstruction * createLogicalAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPIRValue *Start, VPValue *Current, VPValue *Step)
Convert the input value Current to the corresponding value of an induction with Start and Step values...
VPInstruction * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL, const VPIRMetadata &Metadata={})
VPWidenPHIRecipe * createWidenPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
static VPBuilder getToInsertAfter(VPRecipeBase *R)
Create a VPBuilder to insert after R.
VPWidenCastRecipe * createWidenCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy)
VPInstruction * createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create a new ICmp VPInstruction with predicate Pred and operands A and B.
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", const VPIRFlags &Flags={}, Type *ResultTy=nullptr)
VPInstruction * createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", const VPIRFlags &Flags={})
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", Type *ResultTy=nullptr)
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
A recipe for generating the phi node tracking the current scalar iteration index.
Definition VPlan.h:4076
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:561
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:534
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:546
ArrayRef< VPRecipeValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:556
A recipe for converting the input value IV value to the corresponding value of an IV with different s...
Definition VPlan.h:4177
Template specialization of the standard LLVM dominator tree utility for VPBlockBases.
bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B)
A recipe to combine multiple recipes into a single 'expression' recipe, which should be considered a ...
Definition VPlan.h:3540
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:2437
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2484
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2473
A recipe representing a sequence of load -> update -> store as part of a histogram operation.
Definition VPlan.h:2156
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:4552
Class to record and manage LLVM IR flags.
Definition VPlan.h:694
static VPIRFlags getDefaultFlags(unsigned Opcode)
Returns default flags for Opcode for opcodes that support it, asserts otherwise.
LLVM_ABI_FOR_TEST FastMathFlags getFastMathFlags() const
void dropPoisonGeneratingFlags()
Drop all poison-generating flags.
Definition VPlan.h:891
static LLVM_ABI_FOR_TEST VPIRInstruction * create(Instruction &I)
Create a new VPIRPhi for I, if it is a PHINode, otherwise create a VPIRInstruction.
Helper to manage IR metadata for recipes.
Definition VPlan.h:1170
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetadata object with MD, keeping only metadata nodes that are common to both.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:1225
unsigned getNumOperandsWithoutMask() const
Returns the number of operands, excluding the mask if the VPInstruction is masked.
Definition VPlan.h:1472
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1318
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1268
@ ReductionStartVector
Start vector for reductions with 3 operands: the original start value, the identity value for the red...
Definition VPlan.h:1314
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1263
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed-width vectors each containing a ...
Definition VPlan.h:1260
@ CanonicalIVIncrementForPart
Definition VPlan.h:1244
@ ComputeReductionResult
Reduce the operands to the final reduction result using the operation specified via the operation's V...
Definition VPlan.h:1271
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:3115
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:3107
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:3136
A recipe for interleaved memory operations with vector-predication intrinsics.
Definition VPlan.h:3188
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition VPlan.h:3146
void addIncoming(VPValue *IncomingV)
Append IncomingV as an incoming value to the phi-like recipe.
Definition VPlan.h:1658
VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when control converges back from ...
Definition VPlan.h:3704
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:402
VPBasicBlock * getParent()
Definition VPlan.h:477
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:555
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipes from IR instructions.
VPHistogramRecipe * widenIfHistogram(VPInstruction *VPI)
If VPI represents a histogram operation (as determined by LoopVectorizationLegality) make that safe f...
VPRecipeBase * tryToWidenMemory(VPInstruction *VPI, VFRange &Range)
Check if the load or store instruction VPI should be widened for Range.Start and potentially masked.
bool replaceWithFinalIfReductionStore(VPInstruction *VPI, VPBuilder &FinalRedStoresBuilder)
If VPI is a store of a reduction into an invariant address, delete it.
VPReplicateRecipe * handleReplication(VPInstruction *VPI, VFRange &Range)
Build a VPReplicationRecipe for VPI.
Type * getScalarType() const
Returns the scalar type of this VPRecipeValue.
Definition VPlanValue.h:337
A recipe to represent inloop reduction operations with vector-predication intrinsics,...
Definition VPlan.h:3359
A recipe for handling reduction phis.
Definition VPlan.h:2865
void setVFScaleFactor(unsigned ScaleFactor)
Set the VFScaleFactor for this reduction phi.
Definition VPlan.h:2916
unsigned getVFScaleFactor() const
Get the factor that the VF of this recipe's output should be scaled by, or 1 if it isn't scaled.
Definition VPlan.h:2909
RecurKind getRecurrenceKind() const
Returns the recurrence kind of the reduction.
Definition VPlan.h:2927
A recipe to represent inloop, ordered or partial reduction operations.
Definition VPlan.h:3239
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4609
const VPBlockBase * getEntry() const
Definition VPlan.h:4653
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4685
VPInstruction * getOrCreateCanonicalIVIncrement()
Get the canonical IV increment instruction if it exists.
Definition VPlan.cpp:857
void setExiting(VPBlockBase *ExitingBlock)
Set ExitingBlock as the exiting VPBlockBase of this VPRegionBlock.
Definition VPlan.h:4670
Type * getCanonicalIVType() const
Return the type of the canonical IV for loop regions.
Definition VPlan.h:4729
void clearCanonicalIVNUW(VPInstruction *Increment)
Unsets NUW for the canonical IV increment Increment, for loop regions.
Definition VPlan.h:4737
VPRegionValue * getCanonicalIV()
Return the canonical induction variable of the region, null for replicating regions.
Definition VPlan.h:4721
const VPBlockBase * getExiting() const
Definition VPlan.h:4665
VPBasicBlock * getPreheaderVPBB()
Returns the pre-header VPBasicBlock of the loop region.
Definition VPlan.h:4678
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:3404
bool isSingleScalar() const
Definition VPlan.h:3460
static InstructionCost computeCallCost(Function *CalledFn, Type *ResultTy, ArrayRef< const VPValue * > ArgOps, bool IsSingleScalar, ElementCount VF, VPCostContext &Ctx)
Return the cost of scalarizing a call to CalledFn with argument operands ArgOps for a given VF.
bool isPredicated() const
Definition VPlan.h:3462
VPValue * getMask()
Return the mask of a predicated VPReplicateRecipe.
Definition VPlan.h:3479
Lightweight SCEV-to-VPlan expander.
Definition VPlanUtils.h:189
VPValue * tryToExpand(const SCEV *S)
Try to expand S into recipes and live-ins using the builder.
A recipe for handling phi nodes of integer and floating-point inductions, producing their scalar valu...
Definition VPlan.h:4244
VPSingleDefRecipe is a base class for recipes that model a sequence of one or more output IR that def...
Definition VPlan.h:608
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:679
VPSingleDefRecipe * clone() override=0
Clone the current recipe.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:384
operand_range operands()
Definition VPlanValue.h:457
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:430
unsigned getNumOperands() const
Definition VPlanValue.h:424
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:425
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:50
Type * getScalarType() const
Returns the scalar type of this VPValue, dispatching based on the concrete subclass.
Definition VPlan.cpp:149
Value * getLiveInIRValue() const
Return the underlying IR value for a VPIRValue.
Definition VPlan.cpp:143
bool isDefinedOutsideLoopRegions() const
Returns true if the VPValue is defined outside any loop.
Definition VPlan.cpp:1478
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:130
bool hasMoreThanOneUniqueUser() const
Returns true if the value has more than one unique user.
Definition VPlanValue.h:163
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:75
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:208
VPUser * getSingleUser()
Return the single user of this value, or nullptr if there is not exactly one user.
Definition VPlanValue.h:178
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1481
unsigned getNumUsers() const
Definition VPlanValue.h:115
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition VPlan.cpp:1487
user_range users()
Definition VPlanValue.h:157
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:2267
A recipe for widening Call instructions using library calls.
Definition VPlan.h:2090
static InstructionCost computeCallCost(Function *Variant, VPCostContext &Ctx)
Return the cost of widening a call using the vector function Variant.
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1871
Instruction::CastOps getOpcode() const
Definition VPlan.h:1907
A recipe for handling GEP instructions.
Definition VPlan.h:2199
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition VPlan.h:2517
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2565
PHINode * getPHINode() const
Returns the underlying PHINode if one exists, or null otherwise.
Definition VPlan.h:2583
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2568
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition VPlan.h:2588
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2624
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2671
VPValue * getSplatVFValue() const
If the recipe has been unrolled, return the VPValue for the induction increment, otherwise return nul...
Definition VPlan.h:2675
TruncInst * getTruncInst()
Returns the first defined value as TruncInst, if it is one or nullptr otherwise.
Definition VPlan.h:2686
VPValue * getLastUnrolledPartOperand()
Returns the VPValue representing the value of this induction at the last unrolled part,...
Definition VPlan.h:2697
A recipe for widening vector intrinsics.
Definition VPlan.h:1918
static InstructionCost computeCallCost(Intrinsic::ID ID, ArrayRef< const VPValue * > Operands, const VPRecipeWithIRFlags &R, ElementCount VF, VPCostContext &Ctx)
Compute the cost of a vector intrinsic with ID and Operands.
static InstructionCost computeMemIntrinsicCost(Intrinsic::ID IID, Type *Ty, bool IsMasked, Align Alignment, VPCostContext &Ctx)
Helper function for computing the cost of vector memory intrinsic.
A common mixin class for widening memory operations.
Definition VPlan.h:3740
virtual VPRecipeBase * getAsRecipe()=0
Return a VPRecipeBase* to the current object.
A recipe for widened phis.
Definition VPlan.h:2755
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1810
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenRecipe.
VPWidenRecipe * clone() override
Clone the current recipe.
Definition VPlan.h:1831
unsigned getOpcode() const
Definition VPlan.h:1850
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition VPlan.h:4757
VPIRValue * getLiveIn(Value *V) const
Return the live-in VPIRValue for V, if there is one or nullptr otherwise.
Definition VPlan.h:5082
bool hasVF(ElementCount VF) const
Definition VPlan.h:4980
const DataLayout & getDataLayout() const
Definition VPlan.h:4962
LLVMContext & getContext() const
Definition VPlan.h:4958
VPBasicBlock * getEntry()
Definition VPlan.h:4853
bool hasScalableVF() const
Definition VPlan.h:4981
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4916
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition VPlan.h:4937
iterator_range< SmallSetVector< ElementCount, 2 >::iterator > vectorFactors() const
Returns an iterator range over all VFs of the plan.
Definition VPlan.h:4987
VPIRValue * getFalse()
Return a VPIRValue wrapping i1 false.
Definition VPlan.h:5053
VPSymbolicValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition VPlan.h:4956
VPIRValue * getAllOnesValue(Type *Ty)
Return a VPIRValue wrapping the AllOnes value of type Ty.
Definition VPlan.h:5059
VPRegionBlock * createReplicateRegion(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name="")
Create a new replicate region with Entry, Exiting and Name.
Definition VPlan.h:5131
auto getLiveIns() const
Return the list of live-in VPValues available in the VPlan.
Definition VPlan.h:5085
bool hasUF(unsigned UF) const
Definition VPlan.h:5005
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4906
VPSymbolicValue & getVectorTripCount()
The vector trip count.
Definition VPlan.h:4946
VPValue * getBackedgeTakenCount() const
Definition VPlan.h:4943
VPIRValue * getOrAddLiveIn(Value *V)
Gets the live-in VPIRValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:5030
VPIRValue * getZero(Type *Ty)
Return a VPIRValue wrapping the null value of type Ty.
Definition VPlan.h:5056
void setVF(ElementCount VF)
Definition VPlan.h:4968
bool isUnrolled() const
Returns true if the VPlan already has been unrolled, i.e.
Definition VPlan.h:5021
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1068
unsigned getConcreteUF() const
Returns the concrete UF of the plan, after unrolling.
Definition VPlan.h:5008
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4930
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4882
VPBasicBlock * createVPBasicBlock(const Twine &Name, VPRecipeBase *Recipe=nullptr)
Create a new VPBasicBlock with Name and containing Recipe if present.
Definition VPlan.h:5108
VPIRValue * getTrue()
Return a VPIRValue wrapping i1 true.
Definition VPlan.h:5050
VPBasicBlock * getVectorPreheader() const
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4858
VPSymbolicValue & getUF()
Returns the UF of the vector loop region.
Definition VPlan.h:4953
bool hasScalarVFOnly() const
Definition VPlan.h:4998
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4896
VPSymbolicValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4949
void setUF(unsigned UF)
Definition VPlan.h:5013
bool hasScalarTail() const
Returns true if the scalar tail may execute after the vector loop, i.e.
Definition VPlan.h:5163
LLVM_ABI_FOR_TEST VPlan * duplicate()
Clone the current VPlan, update all VPValues of the new VPlan and cloned recipes to refer to the clon...
Definition VPlan.cpp:1224
VPIRValue * getConstantInt(Type *Ty, uint64_t Val, bool IsSigned=false)
Return a VPIRValue wrapping a ConstantInt with the given type and value.
Definition VPlan.h:5064
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
iterator_range< user_iterator > users()
Definition Value.h:426
bool hasName() const
Definition Value.h:261
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
constexpr bool hasKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns true if there exists a value X where RHS.multiplyCoefficientBy(X) will result in a value whos...
Definition TypeSize.h:269
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr ScalarTy getKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns a value X where RHS.multiplyCoefficientBy(X) will result in a value whose quantity matches ou...
Definition TypeSize.h:277
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
An efficient, type-erasing, non-owning reference to a callable.
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt RoundingUDiv(const APInt &A, const APInt &B, APInt::Rounding RM)
Return A unsign-divided by B, rounded by the given rounding mode.
Definition APInt.cpp:2815
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
std::variant< std::monostate, Loc::Single, Loc::Multi, Loc::MMI, Loc::EntryValue > Variant
Alias for the std::variant specialization base class of DbgVariable.
Definition DwarfDebug.h:190
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
match_combine_or< Ty... > m_CombineOr(const Ty &...Ps)
Combine pattern matchers matching any of Ps patterns.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
auto m_Cmp()
Matches any compare instruction and ignores it.
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedStore(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedStore Intrinsic.
match_combine_or< CastInst_match< OpTy, TruncInst >, OpTy > m_TruncOrSelf(const OpTy &Op)
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
LogicalOp_match< LHS, RHS, Instruction::And > m_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R either in the form of L & R or L ? R : false.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, OpTy > m_ZExtOrSelf(const OpTy &Op)
bool match(Val *V, const Pattern &P)
match_deferred< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
auto match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedLoad(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedLoad Intrinsic.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
SpecificCmpClass_match< LHS, RHS, CmpInst > m_SpecificCmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
CastInst_match< OpTy, FPExtInst > m_FPExt(const OpTy &Op)
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::UDiv > m_UDiv(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches an Add with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
BinaryOp_match< LHS, RHS, Instruction::FAdd, true > m_c_FAdd(const LHS &L, const RHS &R)
Matches FAdd with LHS and RHS in either order.
LogicalOp_match< LHS, RHS, Instruction::And, true > m_c_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R with LHS and RHS in either order.
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
auto m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
specificloop_ty m_SpecificLoop(const Loop *L)
bool match(const SCEV *S, const Pattern &P)
SCEVAffineAddRec_match< Op0_t, Op1_t, match_isa< const Loop > > m_scev_AffineAddRec(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastLane, VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > > m_ExtractLastLaneOfLastPart(const Op0_t &Op0)
AllRecipe_commutative_match< Instruction::And, Op0_t, Op1_t > m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1)
Match a binary AND operation.
AllRecipe_match< Instruction::Or, Op0_t, Op1_t > m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
Match a binary OR operation.
VPInstruction_match< VPInstruction::AnyOf > m_AnyOf()
AllRecipe_commutative_match< Instruction::Or, Op0_t, Op1_t > m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ComputeReductionResult, Op0_t > m_ComputeReductionResult(const Op0_t &Op0)
auto m_WidenAnyExtend(const Op0_t &Op0)
match_bind< VPIRValue > m_VPIRValue(VPIRValue *&V)
Match a VPIRValue.
VPInstruction_match< VPInstruction::StepVector > m_StepVector()
auto m_VPPhi(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::BranchOnTwoConds > m_BranchOnTwoConds()
AllRecipe_match< Opcode, Op0_t, Op1_t > m_Binary(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::LastActiveLane, Op0_t > m_LastActiveLane(const Op0_t &Op0)
auto m_WidenIntrinsic(const T &...Ops)
canonical_widen_iv_match m_CanonicalWidenIV()
VPInstruction_match< VPInstruction::ExitingIVValue, Op0_t > m_ExitingIVValue(const Op0_t &Op0)
VPInstruction_match< Instruction::ExtractElement, Op0_t, Op1_t > m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_False()
VPInstruction_match< VPInstruction::ExtractLastLane, Op0_t > m_ExtractLastLane(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t > m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
match_bind< VPSingleDefRecipe > m_VPSingleDefRecipe(VPSingleDefRecipe *&V)
Match a VPSingleDefRecipe, capturing if we match.
VPInstruction_match< VPInstruction::BranchOnCount > m_BranchOnCount()
auto m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_True()
auto m_VPValue()
Match an arbitrary VPValue and ignore it.
VectorEndPointerRecipe_match< Op0_t, Op1_t > m_VecEndPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > m_ExtractLastPart(const Op0_t &Op0)
VPInstruction_match< VPInstruction::Broadcast, Op0_t > m_Broadcast(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ExplicitVectorLength, Op0_t > m_EVL(const Op0_t &Op0)
VPInstruction_match< VPInstruction::BuildVector > m_BuildVector()
BuildVector matches only its opcode, w/o matching its operands as the number of operands is not fi...
VPInstruction_match< VPInstruction::ExtractPenultimateElement, Op0_t > m_ExtractPenultimateElement(const Op0_t &Op0)
match_bind< VPInstruction > m_VPInstruction(VPInstruction *&V)
Match a VPInstruction, capturing if we match.
VPInstruction_match< VPInstruction::FirstActiveLane, Op0_t > m_FirstActiveLane(const Op0_t &Op0)
auto m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCond > m_BranchOnCond()
static VPRecipeBase * findUserOf(VPValue *V, const MatchT &P)
If V is used by a recipe matching pattern P, return it.
VPInstruction_match< VPInstruction::ExtractLane, Op0_t, Op1_t > m_ExtractLane(const Op0_t &Op0, const Op1_t &Op1)
auto m_AnyNeg(const Op0_t &Op0)
VPInstruction_match< VPInstruction::Reverse, Op0_t > m_Reverse(const Op0_t &Op0)
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
bool cannotHoistOrSinkRecipe(const VPRecipeBase &R, bool Sinking=false)
Return true if we do not know how to (mechanically) hoist or sink R.
VPInstruction * findComputeReductionResult(VPReductionPHIRecipe *PhiR)
Find the ComputeReductionResult recipe for PhiR, looking through selects inserted for predicated redu...
VPInstruction * findCanonicalIVIncrement(VPlan &Plan)
Find the canonical IV increment of Plan's vector loop region.
std::optional< MemoryLocation > getMemoryLocation(const VPRecipeBase &R)
Return a MemoryLocation for R with noalias metadata populated from R, if the recipe is supported and ...
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
VPValue * findIncomingAliasMask(const VPlan &Plan)
Finds the incoming alias-mask within the vector preheader.
VPRecipeBase * findRecipe(VPValue *Start, PredT Pred)
Search Start's users for a recipe satisfying Pred, looking through recipes with definitions.
Definition VPlanUtils.h:139
VPSingleDefRecipe * findHeaderMask(VPlan &Plan)
Collect the header mask with the pattern: (ICMP_ULE, WideCanonicalIV, backedge-taken-count) Note: If ...
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
bool isUniformAcrossVFsAndUFs(const VPValue *V)
Checks if V is uniform across all VF lanes and UF parts.
LLVM_ABI_FOR_TEST std::optional< VPValue * > getRecipesForUncountableExit(SmallVectorImpl< VPInstruction * > &Recipes, SmallVectorImpl< VPInstruction * > &GEPs, VPBasicBlock *LatchVPBB)
Returns the VPValue representing the uncountable exit comparison used by AnyOf if the recipes it depe...
const SCEV * getSCEVExprForVPValue(const VPValue *V, PredicatedScalarEvolution &PSE, const Loop *L=nullptr)
Return the SCEV expression for V.
bool isHeaderMask(const VPValue *V, const VPlan &Plan)
Return true if V is a header mask in Plan.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
SmallVector< VPBasicBlock * > vp_rpo_plain_cfg_loop_body(VPBasicBlock *Header)
Returns the VPBasicBlocks forming the loop body of a plain (pre-region) VPlan in reverse post-order s...
Definition VPlanCFG.h:265
@ Offset
Definition DWP.cpp:558
constexpr auto not_equal_to(T &&Arg)
Functor variant of std::not_equal_to that can be used as a UnaryPredicate in functional algorithms li...
Definition STLExtras.h:2179
void stable_sort(R &&Range)
Definition STLExtras.h:2115
auto min_element(R &&Range)
Provide wrappers to std::min_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2077
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1668
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:840
DenseMap< const Value *, const SCEV * > ValueToSCEVMapTy
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr from_range_t from_range
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2207
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633
auto cast_or_null(const Y &Val)
Definition Casting.h:714
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:253
constexpr auto bind_back(FnT &&Fn, BindArgsT &&...BindArgs)
C++23 bind_back.
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:288
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2172
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
SmallVector< VPRegisterUsage, 8 > calculateRegisterUsageForPlan(VPlan &Plan, ArrayRef< ElementCount > VFs, const TargetTransformInfo &TTI, const SmallPtrSetImpl< const Value * > &ValuesToIgnore)
Estimate the register usage for Plan and vectorization factors in VFs by calculating the highest numb...
auto map_range(ContainerTy &&C, FuncTy F)
Return a range that applies F to the elements of C.
Definition STLExtras.h:365
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1151
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
constexpr size_t range_size(R &&Range)
Returns the size of the Range, i.e., the number of elements.
Definition STLExtras.h:1693
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1635
LLVM_ABI_FOR_TEST cl::opt< bool > EnableWideActiveLaneMask
UncountableExitStyle
Different methods of handling early exits.
Definition VPlan.h:79
@ MaskedHandleExitInScalarLoop
All memory operations other than the load(s) required to determine whether an uncountable exit occurr...
Definition VPlan.h:89
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1752
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and returns a new filter_iterator...
Definition STLExtras.h:551
bool canConstantBeExtended(const APInt *C, Type *NarrowType, TTI::PartialReductionExtendKind ExtKind)
Check if a constant CI can be safely treated as having been extended from a narrower type with the gi...
Definition VPlan.cpp:1850
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:322
@ Other
Any other memory.
Definition ModRef.h:68
TargetTransformInfo TTI
RecurKind
These are the kinds of recurrences that we support.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ FindIV
FindIV reduction with select(icmp(),x,y) where one of (x,y) is a loop induction variable (increasing ...
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ FSub
Subtraction of floats.
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
@ FAdd
Sum of floats.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
LLVM_ABI Value * getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF)
Given information about a recurrence kind, return the identity for the @llvm.vector....
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2011
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2087
ArrayRef(const T &OneElt) -> ArrayRef< T >
auto make_second_range(ContainerTy &&c)
Given a container of pairs, return a range over the second elements.
Definition STLExtras.h:1408
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1771
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition STLExtras.h:2165
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:325
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2145
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
LLVM_ABI bool isDereferenceableAndAlignedInLoop(LoadInst *LI, Loop *L, ScalarEvolution &SE, DominatorTree &DT, AssumptionCache *AC=nullptr, SmallVectorImpl< const SCEVPredicate * > *Predicates=nullptr)
Return true if we can prove that the given load (which is assumed to be within the specified loop) wo...
Definition Loads.cpp:311
@ Default
The result value is uniform if and only if all operands are uniform.
Definition Uniformity.h:20
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:305
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:863
#define N
VPBasicBlock * EarlyExitingVPBB
VPIRBasicBlock * EarlyExitVPBB
RemoveMask_match(const Op0_t &In, Op1_t &Out)
bool match(OpTy *V) const
MDNode * Scope
The tag for alias scope specification (used with noalias).
Definition Metadata.h:786
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
An information struct used to provide DenseMap with the various necessary components for a given valu...
This reduction is unordered with the partial result scaled down by some factor.
Definition VPlan.h:2847
Holds the VFShape for a specific scalar to vector function mapping.
Encapsulates information needed to describe a parameter.
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Struct to hold various analyses needed for cost computations.
static bool isFreeScalarIntrinsic(Intrinsic::ID ID)
Returns true if ID is a pseudo intrinsic that is dropped via scalarization rather than widened.
Definition VPlan.cpp:1946
bool isMaskRequired(Instruction *I) const
Forwards to LoopVectorizationCostModel::isMaskRequired.
PredicatedScalarEvolution & PSE
bool willBeScalarized(Instruction *I, ElementCount VF) const
Returns true if I is known to be scalarized at VF.
TargetTransformInfo::TargetCostKind CostKind
const TargetLibraryInfo & TLI
const TargetTransformInfo & TTI
A VPValue representing a live-in from the input IR or a constant.
Definition VPlanValue.h:246
Type * getType() const
Returns the type of the underlying IR value.
Definition VPlan.cpp:147
A struct that represents some properties of the register usage of a loop.
SmallMapVector< unsigned, unsigned, 4 > MaxLocalUsers
Holds the maximum number of concurrent live intervals in the loop.
InstructionCost spillCost(const TargetTransformInfo &TTI, TargetTransformInfo::TargetCostKind CostKind, unsigned OverrideMaxNumRegs=0) const
Calculate the estimated cost of any spills due to using more registers than the number available for ...
A symbolic live-in VPValue, used for values like vector trip count, VF, and VFxUF.
Definition VPlanValue.h:286
bool isMaterialized() const
Returns true if this symbolic value has been materialized.
Definition VPlanValue.h:297
A recipe for widening load operations with vector-predication intrinsics, using the address to load f...
Definition VPlan.h:3853
A recipe for widening load operations, using the address to load from and an optional mask.
Definition VPlan.h:3804
A recipe for widening store operations with vector-predication intrinsics, using the value to store,...
Definition VPlan.h:3955
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition VPlan.h:3902
static VPValue * materializeAliasMask(VPlan &Plan, VPBasicBlock *AliasCheckVPBB, ArrayRef< PointerDiffInfo > DiffChecks)
Materializes within the AliasCheckVPBB block.
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlan &Plan, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void makeMemOpWideningDecisions(VPlan &Plan, VFRange &Range, VPRecipeBuilder &RecipeBuilder)
Convert load/store VPInstructions in Plan into widened or replicate recipes.
static void expandSCEVsToVPInstructions(VPlan &Plan, ScalarEvolution &SE)
Try to expand VPExpandSCEVRecipes in Plan's entry block to VPInstructions.
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializePacksAndUnpacks(VPlan &Plan)
Add explicit Build[Struct]Vector recipes to Pack multiple scalar values into vectors and Unpack recip...
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, const bool &EpilogueAllowed)
static bool simplifyKnownEVL(VPlan &Plan, ElementCount VF, PredicatedScalarEvolution &PSE)
Try to simplify VPInstruction::ExplicitVectorLength recipes when the AVL is known to be <= VF,...
static void removeBranchOnConst(VPlan &Plan, bool OnlyLatches=false)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static void materializeFactors(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize UF, VF and VFxUF to be computed explicitly using VPInstructions.
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static void replaceWideCanonicalIVWithWideIV(VPlan &Plan, ScalarEvolution &SE, const TargetTransformInfo &TTI, TargetTransformInfo::TargetCostKind CostKind, ElementCount VF, unsigned UF, const SmallPtrSetImpl< const Value * > &ValuesToIgnore)
Replace a VPWidenCanonicalIVRecipe if it is present in Plan, with a VPWidenIntOrFpInductionRecipe,...
static void createAndOptimizeReplicateRegions(VPlan &Plan)
Wrap predicated VPReplicateRecipes with a mask operand in an if-then region block and remove the mask...
static void convertToVariableLengthStep(VPlan &Plan)
Transform loops with variable-length stepping after region dissolution.
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static std::unique_ptr< VPlan > narrowInterleaveGroups(VPlan &Plan, const TargetTransformInfo &TTI)
Try to find a single VF among Plan's VFs for which all interleave groups (with known minimum VF eleme...
static void materializeAliasMaskCheckBlock(VPlan &Plan, ArrayRef< PointerDiffInfo > DiffChecks, bool HasBranchWeights)
Materializes the alias mask within a check block before the loop.
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand remaining VPExpandSCEVRecipes in Plan's entry block using SCEVExpander.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static void expandBranchOnTwoConds(VPlan &Plan)
Expand BranchOnTwoConds instructions into explicit CFG with BranchOnCond instructions.
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue, VPValue *Step, std::optional< uint64_t > MaxRuntimeStep=std::nullopt)
Materialize vector trip count computations to a set of VPInstructions.
static void hoistPredicatedLoads(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Hoist predicated loads from the same address to the loop entry block, if they are guaranteed to execu...
static bool mergeBlocksIntoPredecessors(VPlan &Plan)
Remove redundant VPBasicBlocks by merging them into their single predecessor if the latter has a sing...
static void attachAliasMaskToHeaderMask(VPlan &Plan)
Attaches the alias-mask to the existing header-mask.
static void optimizeFindIVReductions(VPlan &Plan, PredicatedScalarEvolution &PSE, Loop &L)
Optimize FindLast reductions selecting IVs (or expressions of IVs) by converting them to FindIV reduc...
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static void makeScalarizationDecisions(VPlan &Plan, VFRange &Range)
Make VPlan-based scalarization decision prior to delegating to the ones made by the legacy CM.
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPCurrentIterationPHIRecipe and related recipes to Plan and replace all uses of the canonical ...
static void makeCallWideningDecisions(VPlan &Plan, VFRange &Range, VPRecipeBuilder &RecipeBuilder, VPCostContext &CostCtx)
Convert call VPInstructions in Plan into widened call, vector intrinsic or replicate recipes based on...
static void adjustFirstOrderRecurrenceMiddleUsers(VPlan &Plan, VFRange &Range)
Adjust first-order recurrence users in the middle block: create penultimate element extracts for LCSS...
static void optimizeEVLMasks(VPlan &Plan)
Optimize recipes which use an EVL-based header mask to VP intrinsics, for example:
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static void sinkPredicatedStores(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Sink predicated stores to the same address with complementary predicates (P and NOT P) to an uncondit...
static void convertToStridedAccesses(VPlan &Plan, PredicatedScalarEvolution &PSE, Loop &L, VPCostContext &Ctx, VFRange &Range)
Transform widen memory recipes into strided access recipes when legal and profitable.
static bool handleUncountableEarlyExits(VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VPBasicBlock *MiddleVPBB, Loop *TheLoop, PredicatedScalarEvolution &PSE, DominatorTree &DT, AssumptionCache *AC, UncountableExitStyle Style)
Update Plan to account for uncountable early exits by introducing appropriate branching logic in the ...
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void optimizeInductionLiveOutUsers(VPlan &Plan, PredicatedScalarEvolution &PSE, bool FoldTail)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void createPartialReductions(VPlan &Plan, VPCostContext &CostCtx, VFRange &Range)
Detect and create partial reduction recipes for scaled reductions in Plan.
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static void attachVPCheckBlock(VPlan &Plan, VPValue *Cond, VPBasicBlock *CheckBlock, bool AddBranchWeights)
Wrap runtime check block CheckBlock in a VPIRBB and Cond in a VPValue and connect the block to Plan,...
static LLVM_ABI_FOR_TEST void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static void dropPoisonGeneratingRecipes(VPlan &Plan)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void convertEVLExitCond(VPlan &Plan)
Replaces the exit condition from (branch-on-cond eq CanonicalIVInc, VectorTripCount) to (branch-on-co...