LLVM 23.0.0git
VPlanRecipes.cpp
Go to the documentation of this file.
1//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file contains implementations for different VPlan recipes.
11///
12//===----------------------------------------------------------------------===//
13
15#include "VPlan.h"
16#include "VPlanHelpers.h"
17#include "VPlanPatternMatch.h"
18#include "VPlanUtils.h"
19#include "llvm/ADT/STLExtras.h"
22#include "llvm/ADT/Twine.h"
27#include "llvm/IR/BasicBlock.h"
28#include "llvm/IR/IRBuilder.h"
29#include "llvm/IR/Instruction.h"
31#include "llvm/IR/Intrinsics.h"
32#include "llvm/IR/Type.h"
33#include "llvm/IR/Value.h"
36#include "llvm/Support/Debug.h"
40#include <cassert>
41
42using namespace llvm;
43using namespace llvm::VPlanPatternMatch;
44
46
47#define LV_NAME "loop-vectorize"
48#define DEBUG_TYPE LV_NAME
49
// NOTE(review): the signature line of this definition is missing from this
// extract (the embedded numbering jumps from 49 to 51). The assertion message
// near the end ("underlying instruction may write to memory") suggests this is
// the recipe's "may write to memory" query -- confirm against upstream.
//
// Returns a bool selected by switching on getVPRecipeID(). Recipe IDs that are
// not listed explicitly fall into the default case and yield true.
51 switch (getVPRecipeID()) {
52 case VPExpressionSC:
// Answered with the combined read-or-write query of the expression recipe.
53 return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
54 case VPInstructionSC: {
55 auto *VPI = cast<VPInstruction>(this);
56 // Loads read from memory but don't write to memory.
57 if (VPI->getOpcode() == Instruction::Load)
58 return false;
// Every other opcode defers to the combined read-or-write opcode query.
59 return VPI->opcodeMayReadOrWriteFromMemory();
60 }
61 case VPInterleaveEVLSC:
62 case VPInterleaveSC:
// True only when the interleave recipe has at least one store operand.
63 return cast<VPInterleaveBase>(this)->getNumStoreOperands() > 0;
64 case VPWidenStoreEVLSC:
65 case VPWidenStoreSC:
66 return true;
67 case VPReplicateSC:
// Forward the question to the underlying IR instruction. cast<> (not
// dyn_cast) is used, so the underlying value is expected to be an Instruction.
68 return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
69 ->mayWriteToMemory();
70 case VPWidenCallSC:
// True unless the called scalar function reports onlyReadsMemory().
71 return !cast<VPWidenCallRecipe>(this)
72 ->getCalledScalarFunction()
73 ->onlyReadsMemory();
74 case VPWidenMemIntrinsicSC:
75 case VPWidenIntrinsicSC:
76 return cast<VPWidenIntrinsicRecipe>(this)->mayWriteToMemory();
77 case VPActiveLaneMaskPHISC:
78 case VPCurrentIterationPHISC:
79 case VPBranchOnMaskSC:
80 case VPDerivedIVSC:
81 case VPFirstOrderRecurrencePHISC:
82 case VPReductionPHISC:
83 case VPScalarIVStepsSC:
84 case VPPredInstPHISC:
85 return false;
86 case VPBlendSC:
87 case VPReductionEVLSC:
88 case VPReductionSC:
89 case VPVectorPointerSC:
90 case VPWidenCanonicalIVSC:
91 case VPWidenCastSC:
92 case VPWidenGEPSC:
93 case VPWidenIntOrFpInductionSC:
94 case VPWidenLoadEVLSC:
95 case VPWidenLoadSC:
96 case VPWidenPHISC:
97 case VPWidenPointerInductionSC:
98 case VPWidenSC: {
// These recipes always return false. In asserts builds the underlying
// instruction, when there is one, is additionally checked to agree. The
// (void) cast keeps I referenced when the assert is compiled out.
99 const Instruction *I =
100 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
101 (void)I;
102 assert((!I || !I->mayWriteToMemory()) &&
103 "underlying instruction may write to memory");
104 return false;
105 }
106 default:
// Any recipe ID not handled above is reported as true.
107 return true;
108 }
109}
110
// NOTE(review): the signature line of this definition is missing from this
// extract (embedded line 111 is absent). The assertion message near the end
// ("underlying instruction may read from memory") suggests this is the
// recipe's "may read from memory" query -- confirm against upstream.
//
// Returns a bool selected by switching on getVPRecipeID(). Recipe IDs that are
// not listed explicitly fall into the default case and yield true.
112 switch (getVPRecipeID()) {
113 case VPExpressionSC:
// Answered with the combined read-or-write query of the expression recipe.
114 return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
115 case VPInstructionSC:
// Uses the combined read-or-write opcode query; no opcode is special-cased
// in this switch.
116 return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
117 case VPWidenLoadEVLSC:
118 case VPWidenLoadSC:
119 return true;
120 case VPReplicateSC:
// Forward the question to the underlying IR instruction. cast<> (not
// dyn_cast) is used, so the underlying value is expected to be an Instruction.
121 return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
122 ->mayReadFromMemory();
123 case VPWidenCallSC:
// True unless the called scalar function reports onlyWritesMemory().
124 return !cast<VPWidenCallRecipe>(this)
125 ->getCalledScalarFunction()
126 ->onlyWritesMemory();
127 case VPWidenMemIntrinsicSC:
128 case VPWidenIntrinsicSC:
129 return cast<VPWidenIntrinsicRecipe>(this)->mayReadFromMemory();
130 case VPBranchOnMaskSC:
131 case VPDerivedIVSC:
132 case VPCurrentIterationPHISC:
133 case VPFirstOrderRecurrencePHISC:
134 case VPReductionPHISC:
135 case VPPredInstPHISC:
136 case VPScalarIVStepsSC:
137 case VPWidenStoreEVLSC:
138 case VPWidenStoreSC:
139 return false;
140 case VPBlendSC:
141 case VPReductionEVLSC:
142 case VPReductionSC:
143 case VPVectorPointerSC:
144 case VPWidenCanonicalIVSC:
145 case VPWidenCastSC:
146 case VPWidenGEPSC:
147 case VPWidenIntOrFpInductionSC:
148 case VPWidenPHISC:
149 case VPWidenPointerInductionSC:
150 case VPWidenSC: {
// These recipes always return false. In asserts builds the underlying
// instruction, when there is one, is additionally checked to agree. The
// (void) cast keeps I referenced when the assert is compiled out.
151 const Instruction *I =
152 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
153 (void)I;
154 assert((!I || !I->mayReadFromMemory()) &&
155 "underlying instruction may read from memory");
156 return false;
157 }
158 default:
// Interleave recipe IDs have no case of their own in this switch, so they end
// up here and report true regardless of whether they load or store.
159 // FIXME: Return false if the recipe represents an interleaved store.
160 return true;
161 }
162}
163
// NOTE(review): the signature line of this definition is missing from this
// extract (embedded line 164 is absent). The assertion messages below
// ("underlying instruction has side-effects", "mayHaveSideffects result ...")
// suggest this is the recipe's "may have side effects" query -- confirm
// against upstream.
//
// Returns a bool selected by switching on getVPRecipeID(). Recipe IDs that are
// not listed explicitly fall into the default case and yield true.
165 switch (getVPRecipeID()) {
166 case VPExpressionSC:
167 return cast<VPExpressionRecipe>(this)->mayHaveSideEffects();
168 case VPActiveLaneMaskPHISC:
169 case VPDerivedIVSC:
170 case VPCurrentIterationPHISC:
171 case VPFirstOrderRecurrencePHISC:
172 case VPReductionPHISC:
173 case VPPredInstPHISC:
174 case VPVectorEndPointerSC:
175 return false;
176 case VPInstructionSC: {
// A VPInstruction has side effects when it may write to memory or when its
// opcode is one of the three branch opcodes tested below.
177 auto *VPI = cast<VPInstruction>(this);
178 return mayWriteToMemory() ||
179 VPI->getOpcode() == VPInstruction::BranchOnCount ||
180 VPI->getOpcode() == VPInstruction::BranchOnCond ||
181 VPI->getOpcode() == VPInstruction::BranchOnTwoConds;
182 }
183 case VPWidenCallSC: {
// A widened call has side effects when it may write to memory, or when the
// called scalar function may throw or may not return.
184 Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
185 return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
186 }
187 case VPWidenMemIntrinsicSC:
188 case VPWidenIntrinsicSC:
189 return cast<VPWidenIntrinsicRecipe>(this)->mayHaveSideEffects();
190 case VPBlendSC:
191 case VPReductionEVLSC:
192 case VPReductionSC:
193 case VPScalarIVStepsSC:
194 case VPVectorPointerSC:
195 case VPWidenCanonicalIVSC:
196 case VPWidenCastSC:
197 case VPWidenGEPSC:
198 case VPWidenIntOrFpInductionSC:
199 case VPWidenPHISC:
200 case VPWidenPointerInductionSC:
201 case VPWidenSC: {
// These recipes always return false. In asserts builds the underlying
// instruction, when there is one, is additionally checked to agree. The
// (void) cast keeps I referenced when the assert is compiled out.
202 const Instruction *I =
203 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
204 (void)I;
205 assert((!I || !I->mayHaveSideEffects()) &&
206 "underlying instruction has side-effects");
207 return false;
208 }
209 case VPInterleaveEVLSC:
210 case VPInterleaveSC:
211 return mayWriteToMemory();
212 case VPWidenLoadEVLSC:
213 case VPWidenLoadSC:
214 case VPWidenStoreEVLSC:
215 case VPWidenStoreSC:
// Widened loads/stores report side effects exactly when they may write to
// memory. The assert cross-checks against the ingredient instruction.
// NOTE(review): embedded line 218 (the operand that follows `==`) is missing
// from this extract, so the assert below is incomplete as shown.
216 assert(
217 cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
219 "mayHaveSideffects result for ingredient differs from this "
220 "implementation");
221 return mayWriteToMemory();
222 case VPReplicateSC: {
// Forward the question to the replicated underlying instruction.
223 auto *R = cast<VPReplicateRecipe>(this);
224 return R->getUnderlyingInstr()->mayHaveSideEffects();
225 }
226 default:
// Any recipe ID not handled above is reported as true.
227 return true;
228 }
229}
230
// NOTE(review): the signature line of this definition is missing from this
// extract (embedded line 231 is absent), so the function's name and exact
// purpose cannot be read from here -- confirm against upstream.
//
// As visible: returns true only for VPInstructionSC recipes whose opcode is a
// cast (Instruction::isCast) or one of Add, Sub, Mul, GetElementPtr. Every
// other recipe ID and every other opcode returns false.
232 switch (getVPRecipeID()) {
233 default:
234 return false;
235 case VPInstructionSC: {
236 unsigned Opcode = cast<VPInstruction>(this)->getOpcode();
// All cast opcodes are accepted up front, before the explicit opcode list.
237 if (Instruction::isCast(Opcode))
238 return true;
239
240 switch (Opcode) {
241 default:
242 return false;
243 case Instruction::Add:
244 case Instruction::Sub:
245 case Instruction::Mul:
246 case Instruction::GetElementPtr:
247 return true;
248 }
249 }
250 }
251}
252
// NOTE(review): the signature line of this definition is missing from this
// extract (embedded line 253 is absent); the body uses a pointer-like
// `InsertPos` recipe.
//
// Inserts this recipe into the block that contains InsertPos, passing
// InsertPos's own iterator as the insertion point. Preconditions (asserted):
// this recipe has no parent yet, and InsertPos is already in a VPBasicBlock.
254 assert(!Parent && "Recipe already in some VPBasicBlock");
255 assert(InsertPos->getParent() &&
256 "Insertion position not in any VPBasicBlock");
257 InsertPos->getParent()->insert(this, InsertPos->getIterator());
258}
259
// Inserts this recipe into \p BB at the position given by iterator `I`.
// NOTE(review): the second signature line (embedded line 261), which declares
// the iterator parameter `I`, is missing from this extract.
// Preconditions (asserted): this recipe has no parent yet, and `I` is either
// BB.end() or points at a recipe that lives in BB.
260void VPRecipeBase::insertBefore(VPBasicBlock &BB,
262 assert(!Parent && "Recipe already in some VPBasicBlock");
263 assert(I == BB.end() || I->getParent() == &BB);
264 BB.insert(this, I);
265}
266
// NOTE(review): the signature line of this definition is missing from this
// extract (embedded line 267 is absent).
//
// Inserts this recipe into the block that contains InsertPos, using the
// iterator one past InsertPos (std::next) as the insertion point.
// Preconditions (asserted): this recipe has no parent yet, and InsertPos is
// already in a VPBasicBlock.
268 assert(!Parent && "Recipe already in some VPBasicBlock");
269 assert(InsertPos->getParent() &&
270 "Insertion position not in any VPBasicBlock");
271 InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
272}
273
// NOTE(review): the signature (embedded line 274) and one body statement
// (embedded line 276) are missing from this extract; presumably line 276
// unlinks the recipe from its block's recipe list -- confirm against upstream.
//
// As visible: requires (asserted) that the recipe currently has a parent
// block, and finishes by clearing the Parent pointer.
275 assert(getParent() && "Recipe not in any VPBasicBlock");
277 Parent = nullptr;
278}
279
// NOTE(review): the signature (embedded line 280) and the only non-assert
// statement (embedded line 282) are missing from this extract, so the action
// performed cannot be read from here -- confirm against upstream.
//
// As visible: requires (asserted) that the recipe currently has a parent
// block.
281 assert(getParent() && "Recipe not in any VPBasicBlock");
283}
284
// NOTE(review): the signature and the first body statement (embedded lines
// 285-286) are missing from this extract; presumably the recipe is detached
// from its current block before being re-inserted -- confirm against upstream.
//
// As visible: ends by inserting this recipe after InsertPos.
287 insertAfter(InsertPos);
288}
289
295
// NOTE(review): the signature line of this definition is missing from this
// extract (embedded line 296 is absent). The body reads parameters named `VF`
// and `Ctx` and returns an InstructionCost.
//
// Computes the cost of this recipe for the given VF: zero when the cost
// context asks to skip the underlying instruction, otherwise the result of
// computeCost(), possibly overridden when ForceTargetInstructionCost was given
// on the command line. The final value is printed under LLVM_DEBUG.
297 // Get the underlying instruction for the recipe, if there is one. It is used
298 // to
299 // * decide if cost computation should be skipped for this recipe,
300 // * apply forced target instruction cost.
301 Instruction *UI = nullptr;
302 if (auto *S = dyn_cast<VPSingleDefRecipe>(this))
303 UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
304 else if (auto *IG = dyn_cast<VPInterleaveBase>(this))
305 UI = IG->getInsertPos();
306 else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(this))
307 UI = &WidenMem->getIngredient();
308
309 InstructionCost RecipeCost;
310 if (UI && Ctx.skipCostComputation(UI, VF.isVector())) {
311 RecipeCost = 0;
312 } else {
313 RecipeCost = computeCost(VF, Ctx);
// The override only applies when the option was explicitly passed and the
// computed cost is valid; an invalid cost is returned unchanged.
314 if (ForceTargetInstructionCost.getNumOccurrences() > 0 &&
315 RecipeCost.isValid()) {
// NOTE(review): embedded line 317 (the statement executed when UI is
// non-null) is missing from this extract. Without an underlying instruction
// the forced cost is 0.
316 if (UI)
318 else
319 RecipeCost = InstructionCost(0);
320 }
321 }
322
323 LLVM_DEBUG({
324 dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
325 dump();
326 });
327 return RecipeCost;
328}
329
// NOTE(review): the first signature line (embedded line 330) is missing from
// this extract; only the trailing `VPCostContext &Ctx` parameter is visible.
//
// Base implementation that must never be reached: it aborts via
// llvm_unreachable, because subclasses are expected to provide the cost.
331 VPCostContext &Ctx) const {
332 llvm_unreachable("subclasses should implement computeCost");
333}
334
// NOTE(review): the signature (embedded line 335) and the right-hand operand
// of the trailing `||` (embedded line 337) are missing from this extract.
//
// As visible: the first disjunct is true when the recipe ID lies in the
// inclusive range [VPFirstPHISC, VPLastPHISC].
336 return (getVPRecipeID() >= VPFirstPHISC && getVPRecipeID() <= VPLastPHISC) ||
338}
339
// NOTE(review): the signature line of this definition is missing from this
// extract (embedded line 340 is absent). The body takes another flags object
// named `Other`.
//
// Narrows this object's flags to those also set in Other (bitwise AND per
// flag), dispatching on OpType. Both objects must have the same OpType
// (asserted). Properties that are not droppable flags (compare predicate,
// recurrence kind, ordered/in-loop) are only asserted to be equal.
341 assert(OpType == Other.OpType && "OpType must match");
342 switch (OpType) {
343 case OperationType::OverflowingBinOp:
344 WrapFlags.HasNUW &= Other.WrapFlags.HasNUW;
345 WrapFlags.HasNSW &= Other.WrapFlags.HasNSW;
346 break;
347 case OperationType::Trunc:
348 TruncFlags.HasNUW &= Other.TruncFlags.HasNUW;
349 TruncFlags.HasNSW &= Other.TruncFlags.HasNSW;
350 break;
351 case OperationType::DisjointOp:
352 DisjointFlags.IsDisjoint &= Other.DisjointFlags.IsDisjoint;
353 break;
354 case OperationType::PossiblyExactOp:
355 ExactFlags.IsExact &= Other.ExactFlags.IsExact;
356 break;
357 case OperationType::GEPOp:
358 GEPFlagsStorage &= Other.GEPFlagsStorage;
359 break;
360 case OperationType::FPMathOp:
361 case OperationType::FCmp:
// For FCmp the predicate must already match; for FPMathOp the first half of
// the assert condition is trivially true.
362 assert((OpType != OperationType::FCmp ||
363 FCmpFlags.CmpPredStorage == Other.FCmpFlags.CmpPredStorage) &&
364 "Cannot drop CmpPredicate");
// Only NoNaNs and NoInfs are intersected here; the other fast-math bits of
// this object are left untouched by this function.
365 getFMFsRef().NoNaNs &= Other.getFMFsRef().NoNaNs;
366 getFMFsRef().NoInfs &= Other.getFMFsRef().NoInfs;
367 break;
368 case OperationType::NonNegOp:
369 NonNegFlags.NonNeg &= Other.NonNegFlags.NonNeg;
370 break;
371 case OperationType::Cmp:
// Nothing to intersect: a plain compare only carries its predicate.
372 assert(CmpPredStorage == Other.CmpPredStorage &&
373 "Cannot drop CmpPredicate");
374 break;
375 case OperationType::ReductionOp:
376 assert(ReductionFlags.Kind == Other.ReductionFlags.Kind &&
377 "Cannot change RecurKind");
378 assert(ReductionFlags.IsOrdered == Other.ReductionFlags.IsOrdered &&
379 "Cannot change IsOrdered");
380 assert(ReductionFlags.IsInLoop == Other.ReductionFlags.IsInLoop &&
381 "Cannot change IsInLoop");
// Same treatment as FPMathOp: only NoNaNs and NoInfs are intersected.
382 getFMFsRef().NoNaNs &= Other.getFMFsRef().NoNaNs;
383 getFMFsRef().NoInfs &= Other.getFMFsRef().NoInfs;
384 break;
385 case OperationType::Other:
386 break;
387 }
388}
389
// NOTE(review): the signature line of this definition is missing from this
// extract (embedded line 390 is absent).
//
// Converts the internally stored fast-math bits into an llvm::FastMathFlags
// value. Valid only for OpType FPMathOp, FCmp, ReductionOp or Other
// (asserted); for OpType Other a default-constructed FastMathFlags is
// returned without reading any stored bits.
391 assert((OpType == OperationType::FPMathOp || OpType == OperationType::FCmp ||
392 OpType == OperationType::ReductionOp ||
393 OpType == OperationType::Other) &&
394 "recipe doesn't have fast math flags");
395 if (OpType == OperationType::Other)
396 return FastMathFlags();
// Copy each of the seven stored bits into the result one by one.
397 const FastMathFlagsTy &F = getFMFsRef();
398 FastMathFlags Res;
399 Res.setAllowReassoc(F.AllowReassoc);
400 Res.setNoNaNs(F.NoNaNs);
401 Res.setNoInfs(F.NoInfs);
402 Res.setNoSignedZeros(F.NoSignedZeros);
403 Res.setAllowReciprocal(F.AllowReciprocal);
404 Res.setAllowContract(F.AllowContract);
405 Res.setApproxFunc(F.ApproxFunc);
406 return Res;
407}
408
409#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
411
// Prints this recipe to \p O: first the recipe-specific text via
// printRecipe(), then ", !dbg <loc>" when the recipe has a debug location,
// and finally a step that depends on the recipe also being a VPIRMetadata.
// Only compiled when NDEBUG is undefined or LLVM_ENABLE_DUMP is defined (see
// the surrounding #if).
412void VPRecipeBase::print(raw_ostream &O, const Twine &Indent,
413 VPSlotTracker &SlotTracker) const {
414 printRecipe(O, Indent, SlotTracker);
415 if (auto DL = getDebugLoc()) {
416 O << ", !dbg ";
417 DL.print(O);
418 }
419
// NOTE(review): embedded line 421 (the statement controlled by this `if`) is
// missing from this extract; presumably it prints the metadata -- confirm
// against upstream.
420 if (auto *Metadata = dyn_cast<VPIRMetadata>(this))
422}
423#endif
424
// NOTE(review): the constructor's signature line (embedded line 425) is
// missing from this extract; only the member-initializer list is visible.
//
// Initializes the VPSingleDefRecipe base with recipe ID VPExpandSCEVSC, no
// operands and Expr's type as the result type, and stores Expr. The
// constructor body is empty.
426 : VPSingleDefRecipe(VPRecipeBase::VPExpandSCEVSC, {}, Expr->getType()),
427 Expr(Expr) {}
428
429/// For call VPInstruction operands, return the operand index of the called
430/// function. The function is either the last operand (for unmasked calls) or
431/// the second-to-last operand (for masked calls).
// NOTE(review): the signature (embedded line 432) is missing from this
// extract. The code indexes Operands[NumOps - 1] unconditionally, so callers
// must pass a non-empty operand list; no check for that is visible here.
433 unsigned NumOps = Operands.size();
434 auto *LastOp = dyn_cast<VPIRValue>(Operands[NumOps - 1]);
435 if (LastOp && isa<Function>(LastOp->getValue()))
436 return NumOps - 1;
// NOTE(review): embedded line 437 (the start of an assert whose message is on
// the next line) is missing from this extract.
438 "expected function operand");
439 return NumOps - 2;
440}
441
442/// For call VPInstruction operands, return the called function.
// NOTE(review): the signature (embedded line 443) is missing from this
// extract. Both casts are unchecked cast<>: the operand at the computed index
// must be a VPIRValue wrapping a Function.
444 unsigned Idx = getCalledFnOperandIndex(Operands);
445 return cast<Function>(cast<VPIRValue>(Operands[Idx])->getValue());
446}
447
// NOTE(review): the first signature line (embedded line 448) is missing from
// this extract. The name computeScalarTypeForInstruction is taken from the
// call sites further down, which pass (Opcode, Operands).
//
// Infers the scalar result type of an instruction with the given Opcode from
// its operands. Operands must be non-empty (asserted). Opcodes handled in the
// first switch return directly; everything else returns the scalar type of
// operand 0, after optionally asserting that all operands share that type.
// Several `case` labels of the switch are missing from this extract; the gaps
// are marked below.
449 ArrayRef<VPValue *> Operands) {
450 assert(!Operands.empty() &&
451 "zero-operand VPInstruction opcodes must pass explicit ResultTy");
452 // Assert operand \p Idx (if present and typed) has type \p ExpectedTy.
453 [[maybe_unused]] auto AssertOperandType = [&Operands](unsigned Idx,
454 Type *ExpectedTy) {
455 if (!ExpectedTy || Operands.size() <= Idx)
456 return;
457 [[maybe_unused]] Type *OpTy = Operands[Idx]->getScalarType();
458 assert((!OpTy || OpTy == ExpectedTy) &&
459 "different types inferred for different operands");
460 };
461
// NOTE(review): Op0Ty is dereferenced on the next line without a null check,
// while later asserts allow for a null Op0Ty -- confirm whether operand 0 can
// be untyped here.
462 Type *Op0Ty = Operands[0]->getScalarType();
463 LLVMContext &Ctx = Op0Ty->getContext();
464 switch (Opcode) {
// NOTE(review): embedded lines 465-467 (further case labels of this group) are
// missing. The group yields the void type.
468 case Instruction::Store:
469 case Instruction::Switch:
470 return Type::getVoidTy(Ctx);
471 case Instruction::ICmp:
472 case Instruction::FCmp:
// NOTE(review): embedded line 473 is missing. Compares yield i1.
474 AssertOperandType(1, Op0Ty);
475 return IntegerType::get(Ctx, 1);
// NOTE(review): embedded lines 476-478 (case labels) are missing. This group
// expects i1 operands and yields i1.
479 assert((!Op0Ty || Op0Ty->isIntegerTy(1)) && "expected bool operand");
480 AssertOperandType(1, Op0Ty);
481 return IntegerType::get(Ctx, 1);
// NOTE(review): embedded line 482 (case label) is missing. Yields i32.
483 return IntegerType::get(Ctx, 32);
484 case Instruction::Select: {
// The result type is that of operand 1; operand 2 is asserted to match it.
485 Type *Op1Ty = Operands[1]->getScalarType();
486 AssertOperandType(2, Op1Ty);
487 return Op1Ty;
488 }
// NOTE(review): embedded line 489 (case label) is missing; the assert message
// names the opcode "ExtractLane". The result type is that of operand 1 and
// all later operands are asserted to match it.
490 assert(Operands.size() >= 2 && "ExtractLane requires a lane operand and "
491 "at least one source vector operand");
492 Type *Op1Ty = Operands[1]->getScalarType();
493 for (unsigned Idx = 2; Idx != Operands.size(); ++Idx)
494 AssertOperandType(Idx, Op1Ty);
495 return Op1Ty;
496 }
497 case Instruction::ExtractValue: {
// Only a single index is supported: operand 0 must be a struct and operand 1
// a constant integer selecting the field.
498 assert(Operands.size() == 2 && "expected single level extractvalue");
499 auto *StructTy = cast<StructType>(Op0Ty);
500 return StructTy->getTypeAtIndex(
501 cast<VPConstantInt>(Operands[1])->getZExtValue());
502 }
// NOTE(review): embedded lines 503-506 (further case labels of this group) are
// missing. For this group the type cannot be inferred from the operands, so
// reaching here is a programming error.
507 case Instruction::Load:
508 case Instruction::Alloca:
509 llvm_unreachable("type must be passed explicitly");
510 case Instruction::Call:
511 return getCalledFunction(Operands)->getReturnType();
512 default:
513 break;
514 }
515
516 // Opcodes that require all operands to share the same scalar type as the
517 // result.
// NOTE(review): embedded lines 520-524 (the remaining terms of this
// condition) are missing from this extract.
518 bool AllOperandsSameType =
519 Instruction::isBinaryOp(Opcode) ||
525 Opcode);
526 if (AllOperandsSameType)
527 for (unsigned Idx = 1; Idx != Operands.size(); ++Idx)
528 AssertOperandType(Idx, Op0Ty);
529
530 return Op0Ty;
531}
532
// NOTE(review): the first signature line (embedded line 533) is missing from
// this extract; the body uses an instruction pointer `I` and `Operands`.
//
// Variant that starts from an existing IR instruction: for casts,
// ExtractValue, Load and Alloca the instruction's own type is returned;
// for every other opcode the type is inferred from the operands via
// computeScalarTypeForInstruction(Opcode, Operands).
534 ArrayRef<VPValue *> Operands) {
535 unsigned Opcode = I->getOpcode();
536 if (Instruction::isCast(Opcode) ||
537 is_contained(ArrayRef<unsigned>({Instruction::ExtractValue,
538 Instruction::Load, Instruction::Alloca}),
539 Opcode))
540 return I->getType();
541 return computeScalarTypeForInstruction(Opcode, Operands);
542}
543
// NOTE(review): several lines of this constructor are missing from this
// extract: the first signature line (544), the start of the base-class
// initializer (547) and the opening lines of the three asserts in the body
// (553, 555, 557-559). Only their message strings remain.
//
// As visible: the base is initialized with recipe ID VPInstructionSC, the
// operands, and a result type that is ResultTy when non-null and otherwise
// inferred with computeScalarTypeForInstruction(Opcode, Operands). The
// metadata, opcode and name (converted from the Twine via str()) are stored.
// The body only contains assertions, whose messages concern flag support,
// required flags and operand count.
545 const VPIRFlags &Flags, const VPIRMetadata &MD,
546 DebugLoc DL, const Twine &Name, Type *ResultTy)
548 VPRecipeBase::VPInstructionSC, Operands,
549 ResultTy ? ResultTy
550 : computeScalarTypeForInstruction(Opcode, Operands),
551 Flags, DL),
552 VPIRMetadata(MD), Opcode(Opcode), Name(Name.str()) {
554 "Set flags not supported for the provided opcode");
556 "Opcode requires specific flags to be set");
560 "number of operands does not match opcode");
561}
562
// NOTE(review): the signature line (embedded line 563) is missing from this
// extract, and so are many `case` labels inside the switch (embedded lines
// 571-573, 579-591, 597-605, 609-610, 618-626) -- presumably the
// VPInstruction-specific opcodes. The function name used for this block is
// inferred from the body and should be confirmed against upstream.
//
// Maps an opcode to its expected operand count: 1 for unary ops and casts,
// 2 for binary ops, otherwise the per-opcode value from the switch. Opcodes
// whose operand count is not fixed return -1u.
564 if (Instruction::isUnaryOp(Opcode) || Instruction::isCast(Opcode))
565 return 1;
566
567 if (Instruction::isBinaryOp(Opcode))
568 return 2;
569
570 switch (Opcode) {
574 return 0;
575 case Instruction::Alloca:
576 case Instruction::ExtractValue:
577 case Instruction::Freeze:
578 case Instruction::Load:
592 return 1;
593 case Instruction::ICmp:
594 case Instruction::FCmp:
595 case Instruction::ExtractElement:
596 case Instruction::Store:
606 return 2;
607 case Instruction::InsertElement:
608 case Instruction::Select:
611 return 3;
612 case Instruction::Call:
// NOTE(review): embedded line 613 (the start of this return expression) is
// missing from this extract; only its trailing `1;` remains.
614 1;
615 case Instruction::GetElementPtr:
616 case Instruction::PHI:
617 case Instruction::Switch:
627 // Cannot determine the number of operands from the opcode.
628 return -1u;
629 }
630 llvm_unreachable("all cases should be handled above");
631}
632
636
// Returns whether this VPInstruction's opcode allows generating only a scalar
// value for the first lane (per the function name).
// NOTE(review): the conditions guarding the two early `return true`
// statements (embedded lines 638 and 640) and further case labels (embedded
// lines 647-655) are missing from this extract.
637bool VPInstruction::canGenerateScalarForFirstLane() const {
639 return true;
641 return true;
642 switch (Opcode) {
643 case Instruction::Freeze:
644 case Instruction::ICmp:
645 case Instruction::PHI:
646 case Instruction::Select:
656 return true;
657 default:
// Any opcode not listed above is rejected.
658 return false;
659 }
660}
661
// NOTE(review): the signature line (embedded line 662) is missing from this
// extract, so the function's real name is not visible here.
//
// Maps a subtracting recurrence kind to the corresponding adding opcode:
// RecurKind::Sub -> Instruction::Add, RecurKind::FSub -> Instruction::FAdd.
// Any other kind is a programming error (llvm_unreachable).
663 if (Kind == RecurKind::Sub)
664 return Instruction::Add;
665 if (Kind == RecurKind::FSub)
666 return Instruction::FAdd;
667 llvm_unreachable("RecurKind should be Sub/FSub.");
668}
669
// Emits the IR for this VPInstruction through State.Builder and returns the
// generated Value. The code first handles one opcode class with a generic
// CreateBinOp and then dispatches on getOpcode(). An opcode without a case
// reaches llvm_unreachable.
//
// NOTE(review): this extract lost the lines that were hyperlinks in the
// rendered page. Most VPInstruction-specific `case` labels and a handful of
// statements are therefore missing; the gaps are marked below, and the
// one-line summaries there describe only what the visible body does.
670Value *VPInstruction::generate(VPTransformState &State) {
671 IRBuilderBase &Builder = State.Builder;
672
// NOTE(review): embedded line 673 (the `if` opening this block) is missing;
// the body builds a binary operator and applies this recipe's IR flags.
674 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
675 Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
676 Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
677 auto *Res =
678 Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
// The builder result may not be an Instruction, hence the dyn_cast guard.
679 if (auto *I = dyn_cast<Instruction>(Res))
680 applyFlags(*I);
681 return Res;
682 }
683
684 switch (getOpcode()) {
685 case VPInstruction::Not: {
686 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
687 Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
688 return Builder.CreateNot(A, Name);
689 }
690 case Instruction::ExtractElement: {
691 assert(State.VF.isVector() && "Only extract elements from vectors");
// A constant index is served directly as a per-lane lookup in State; only a
// non-constant index needs an extractelement instruction.
692 if (auto *Idx = dyn_cast<VPConstantInt>(getOperand(1)))
693 return State.get(getOperand(0), VPLane(Idx->getZExtValue()));
694 Value *Vec = State.get(getOperand(0));
695 Value *Idx = State.get(getOperand(1), /*IsScalar=*/true);
696 return Builder.CreateExtractElement(Vec, Idx, Name);
697 }
698 case Instruction::InsertElement: {
699 assert(State.VF.isVector() && "Can only insert elements into vectors");
700 Value *Vec = State.get(getOperand(0), /*IsScalar=*/false);
701 Value *Elt = State.get(getOperand(1), /*IsScalar=*/true);
702 Value *Idx = State.get(getOperand(2), /*IsScalar=*/true);
703 return Builder.CreateInsertElement(Vec, Elt, Idx, Name);
704 }
705 case Instruction::Freeze: {
// NOTE(review): embedded line 706 (the definition of `Op`) is missing.
707 return Builder.CreateFreeze(Op, Name);
708 }
709 case Instruction::FCmp:
710 case Instruction::ICmp: {
711 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
712 Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
713 Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
714 return Builder.CreateCmp(getPredicate(), A, B, Name);
715 }
716 case Instruction::PHI: {
717 llvm_unreachable("should be handled by VPPhi::execute");
718 }
719 case Instruction::Select: {
720 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
// The condition is fetched as a scalar when only the first lane is used or
// when the condition itself is a single scalar.
721 Value *Cond =
722 State.get(getOperand(0),
723 OnlyFirstLaneUsed || vputils::isSingleScalar(getOperand(0)));
724 Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed);
725 Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed);
726 return Builder.CreateSelectFMF(Cond, Op1, Op2, getFastMathFlags(), Name);
727 }
// NOTE(review): embedded line 728 (case label) is missing. The body emits
// Intrinsic::get_active_lane_mask, or a plain ULT compare for scalar VF.
729 // Get first lane of vector induction variable.
730 Value *VIVElem0 = State.get(getOperand(0), VPLane(0));
731 // Get the original loop tripcount.
732 Value *ScalarTC = State.get(getOperand(1), VPLane(0));
733
734 // If this part of the active lane mask is scalar, generate the CMP directly
735 // to avoid unnecessary extracts.
736 if (State.VF.isScalar())
737 return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC,
738 Name);
739
// The mask has VF lanes multiplied by the constant held in operand 2.
740 ElementCount EC = State.VF.multiplyCoefficientBy(
741 cast<VPConstantInt>(getOperand(2))->getZExtValue());
742 auto *PredTy = VectorType::get(Builder.getInt1Ty(), EC);
743 return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
744 {PredTy, ScalarTC->getType()},
745 {VIVElem0, ScalarTC}, nullptr, Name);
746 }
// NOTE(review): embedded line 747 (case label) is missing. The body
// zero-extends an i1 vector to the recipe's scalar type and add-reduces it.
748 Value *Op = State.get(getOperand(0));
749 auto *VecTy = cast<VectorType>(Op->getType());
750 assert(VecTy->getScalarSizeInBits() == 1 &&
751 "NumActiveLanes only implemented for i1 vectors");
752
753 Type *Ty = getScalarType();
754 Value *ZExt = Builder.CreateCast(
755 Instruction::ZExt, Op, VectorType::get(Ty, VecTy->getElementCount()));
756 Value *NumActive =
757 Builder.CreateUnaryIntrinsic(Intrinsic::vector_reduce_add, ZExt);
758 return NumActive;
759 }
// NOTE(review): embedded line 760 (case label) is missing. The body splices
// operand 0 and operand 1 with an offset of 1.
761 // Generate code to combine the previous and current values in vector v3.
762 //
763 // vector.ph:
764 // v_init = vector(..., ..., ..., a[-1])
765 // br vector.body
766 //
767 // vector.body
768 // i = phi [0, vector.ph], [i+4, vector.body]
769 // v1 = phi [v_init, vector.ph], [v2, vector.body]
770 // v2 = a[i, i+1, i+2, i+3];
771 // v3 = vector(v1(3), v2(0, 1, 2))
772
773 auto *V1 = State.get(getOperand(0));
// With a non-vector first operand there is nothing to splice; it is returned
// unchanged.
774 if (!V1->getType()->isVectorTy())
775 return V1;
776 Value *V2 = State.get(getOperand(1));
777 return Builder.CreateVectorSpliceRight(V1, V2, 1, Name);
778 }
// NOTE(review): embedded line 779 (case label) and line 785 (the definition
// of `Zero`) are missing. The body computes TC > VFxUF ? TC - VFxUF : Zero
// using an unsigned comparison.
780 Value *ScalarTC = State.get(getOperand(0), VPLane(0));
781 Value *VFxUF = State.get(getOperand(1), VPLane(0));
782 Value *Sub = Builder.CreateSub(ScalarTC, VFxUF);
783 Value *Cmp =
784 Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, VFxUF);
786 return Builder.CreateSelect(Cmp, Sub, Zero);
787 }
// NOTE(review): embedded line 788 (case label) is missing. The body emits
// Intrinsic::experimental_get_vector_length and requires a scalable VF.
789 // TODO: Restructure this code with an explicit remainder loop, vsetvli can
790 // be outside of the main loop.
791 Value *AVL = State.get(getOperand(0), /*IsScalar*/ true);
792 // Compute EVL
793 assert(AVL->getType()->isIntegerTy() &&
794 "Requested vector length should be an integer.");
795
796 assert(State.VF.isScalable() && "Expected scalable vector factor.");
797 Value *VFArg = Builder.getInt32(State.VF.getKnownMinValue());
798
799 Value *EVL = Builder.CreateIntrinsic(
800 Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
801 {AVL, VFArg, Builder.getTrue()});
802 return EVL;
803 }
// NOTE(review): embedded line 804 (case label) and line 817 (one statement
// before applyMetadata) are missing. The body emits a conditional branch.
805 Value *Cond = State.get(getOperand(0), VPLane(0));
806 // Replace the temporary unreachable terminator with a new conditional
807 // branch, hooking it up to backward destination for latch blocks now, and
808 // to forward destination(s) later when they are created.
809 // Second successor may be backwards - iff it is already in VPBB2IRBB.
810 VPBasicBlock *SecondVPSucc =
811 cast<VPBasicBlock>(getParent()->getSuccessors()[1]);
812 BasicBlock *SecondIRSucc = State.CFG.VPBB2IRBB.lookup(SecondVPSucc);
813 BasicBlock *IRBB = State.CFG.VPBB2IRBB[getParent()];
// IRBB is only passed as the first successor to create the branch; it is
// reset to nullptr right after.
814 auto *Br = Builder.CreateCondBr(Cond, IRBB, SecondIRSucc);
815 // First successor is always forward, reset it to nullptr.
816 Br->setSuccessor(0, nullptr);
818 applyMetadata(*Br);
819 return Br;
820 }
// NOTE(review): embedded line 821 (case label) is missing. The body splats
// the scalar operand 0 across VF lanes.
822 return Builder.CreateVectorSplat(
823 State.VF, State.get(getOperand(0), /*IsScalar*/ true), "broadcast");
824 }
// NOTE(review): embedded line 825 (case label) is missing. Each operand
// supplies the struct value for one lane (operand index == lane index).
826 // For struct types, we need to build a new 'wide' struct type, where each
827 // element is widened, i.e., we create a struct of vectors.
828 auto *StructTy = cast<StructType>(getOperand(0)->getScalarType());
829 Value *Res = PoisonValue::get(toVectorizedTy(StructTy, State.VF));
830 for (const auto &[LaneIndex, Op] : enumerate(operands())) {
831 for (unsigned FieldIndex = 0; FieldIndex != StructTy->getNumElements();
832 FieldIndex++) {
833 Value *ScalarValue =
834 Builder.CreateExtractValue(State.get(Op, true), FieldIndex);
835 Value *VectorValue = Builder.CreateExtractValue(Res, FieldIndex);
836 VectorValue =
837 Builder.CreateInsertElement(VectorValue, ScalarValue, LaneIndex);
838 Res = Builder.CreateInsertValue(Res, VectorValue, FieldIndex);
839 }
840 }
841 return Res;
842 }
// NOTE(review): embedded line 843 (case label) is missing. The body builds a
// fixed-width vector with one element per operand, starting from poison.
844 auto *ScalarTy = getOperand(0)->getScalarType();
845 auto NumOfElements = ElementCount::getFixed(getNumOperands());
846 Value *Res = PoisonValue::get(toVectorizedTy(ScalarTy, NumOfElements));
847 for (const auto &[Idx, Op] : enumerate(operands()))
848 Res = Builder.CreateInsertElement(Res, State.get(Op, true),
849 Builder.getInt32(Idx));
850 return Res;
851 }
// NOTE(review): embedded line 852 (case label) and line 856 are missing. The
// body splats operand 1 and inserts operand 0 into lane 0. For a scalar VF
// operand 0 is returned as is.
853 if (State.VF.isScalar())
854 return State.get(getOperand(0), true);
855 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
857 // If this start vector is scaled then it should produce a vector with fewer
858 // elements than the VF.
859 ElementCount VF = State.VF.divideCoefficientBy(
860 cast<VPConstantInt>(getOperand(2))->getZExtValue());
861 auto *Iden = Builder.CreateVectorSplat(VF, State.get(getOperand(1), true));
862 return Builder.CreateInsertElement(Iden, State.get(getOperand(0), true),
863 Builder.getInt32(0));
864 }
// NOTE(review): embedded line 865 (case label), line 869 (start of an
// assert), line 879, line 889 (an `if` condition) and lines 894-896 (start of
// the Opcode definition) are missing. The body combines all operands into one
// value and, for vector VF outside the loop, reduces it to a scalar.
866 RecurKind RK = getRecurKind();
867 bool IsOrdered = isReductionOrdered();
868 bool IsInLoop = isReductionInLoop();
870 "FindIV should use min/max reduction kinds");
871
872 // The recipe may have multiple operands to be reduced together.
873 unsigned NumOperandsToReduce = getNumOperands();
874 VectorParts RdxParts(NumOperandsToReduce);
875 for (unsigned Part = 0; Part < NumOperandsToReduce; ++Part)
876 RdxParts[Part] = State.get(getOperand(Part), IsInLoop);
877
878 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
880
881 // Reduce multiple operands into one.
882 Value *ReducedPartRdx = RdxParts[0];
883 if (IsOrdered) {
// Ordered reductions simply take the last operand.
884 ReducedPartRdx = RdxParts[NumOperandsToReduce - 1];
885 } else {
886 // Floating-point operations should have some FMF to enable the reduction.
887 for (unsigned Part = 1; Part < NumOperandsToReduce; ++Part) {
888 Value *RdxPart = RdxParts[Part];
890 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
891 else {
892 // For sub-recurrences, each part's reduction variable is already
893 // negative, we need to do: reduce.add(-acc_uf0 + -acc_uf1)
897 : (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(RK);
898 ReducedPartRdx =
899 Builder.CreateBinOp(Opcode, RdxPart, ReducedPartRdx, "bin.rdx");
900 }
901 }
902 }
903
904 // Create the reduction after the loop. Note that inloop reductions create
905 // the target reduction in the loop using a Reduction recipe.
906 if (State.VF.isVector() && !IsInLoop) {
907 // TODO: Support in-order reductions based on the recurrence descriptor.
908 // All ops in the reduction inherit fast-math-flags from the recurrence
909 // descriptor.
910 ReducedPartRdx = createSimpleReduction(Builder, ReducedPartRdx, RK);
911 }
912
913 return ReducedPartRdx;
914 }
// NOTE(review): embedded lines 915-916 (case labels), line 918 (the value
// assigned to Offset) and line 930 (presumably a guard for setName) are
// missing. The body extracts the lane `Offset` positions from the end of
// operand 0.
917 unsigned Offset =
919 Value *Res;
920 if (State.VF.isVector()) {
921 assert(Offset <= State.VF.getKnownMinValue() &&
922 "invalid offset to extract from");
923 // Extract lane VF - Offset from the operand.
924 Res = State.get(getOperand(0), VPLane::getLaneFromEnd(State.VF, Offset));
925 } else {
926 // TODO: Remove ExtractLastLane for scalar VFs.
927 assert(Offset <= 1 && "invalid offset to extract from");
928 Res = State.get(getOperand(0));
929 }
931 Res->setName(Name);
932 return Res;
933 }
// NOTE(review): embedded line 934 (case label) is missing. Logical AND of the
// two operands.
935 Value *A = State.get(getOperand(0));
936 Value *B = State.get(getOperand(1));
937 return Builder.CreateLogicalAnd(A, B, Name);
938 }
// NOTE(review): embedded line 939 (case label) is missing. Logical OR of the
// two operands.
940 Value *A = State.get(getOperand(0));
941 Value *B = State.get(getOperand(1));
942 return Builder.CreateLogicalOr(A, B, Name);
943 }
// NOTE(review): embedded line 944 (case label) is missing. Pointer add that
// only produces the first lane.
945 assert((State.VF.isScalar() || vputils::onlyFirstLaneUsed(this)) &&
946 "can only generate first lane for PtrAdd");
947 Value *Ptr = State.get(getOperand(0), VPLane(0));
948 Value *Addend = State.get(getOperand(1), VPLane(0));
949 return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
950 }
// NOTE(review): embedded line 951 (case label) and line 953 (the initializer
// of Ptr) are missing. Pointer add whose addend is fetched without a lane.
952 Value *Ptr =
954 Value *Addend = State.get(getOperand(1));
955 return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
956 }
// NOTE(review): embedded line 957 (case label) is missing. The body freezes
// every operand, ORs them together and, for vector VF, or-reduces the result.
958 Value *Res = Builder.CreateFreeze(State.get(getOperand(0)));
959 for (VPValue *Op : drop_begin(operands()))
960 Res = Builder.CreateOr(Res, Builder.CreateFreeze(State.get(Op)));
961 return State.VF.isScalar() ? Res : Builder.CreateOrReduce(Res);
962 }
// NOTE(review): embedded line 963 (case label) is missing. Operand 0 is the
// lane to extract; operands 1..N are source vectors that are treated as one
// concatenated sequence of RuntimeVF-sized chunks.
964 assert(getNumOperands() != 2 && "ExtractLane from single source should be "
965 "simplified to ExtractElement.");
966 Value *LaneToExtract = State.get(getOperand(0), true);
967 Type *IdxTy = getOperand(0)->getScalarType();
968 Value *Res = nullptr;
969 Value *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
970
971 for (unsigned Idx = 1; Idx != getNumOperands(); ++Idx) {
// VectorStart is the global lane number at which source vector Idx begins.
972 Value *VectorStart =
973 Builder.CreateMul(RuntimeVF, ConstantInt::get(IdxTy, Idx - 1));
974 Value *VectorIdx = Idx == 1
975 ? LaneToExtract
976 : Builder.CreateSub(LaneToExtract, VectorStart);
977 Value *Ext = State.VF.isScalar()
978 ? State.get(getOperand(Idx))
979 : Builder.CreateExtractElement(
980 State.get(getOperand(Idx)), VectorIdx);
// A later source overrides the running result when the requested lane is at
// or beyond that source's start.
981 if (Res) {
982 Value *Cmp = Builder.CreateICmpUGE(LaneToExtract, VectorStart);
983 Res = Builder.CreateSelect(Cmp, Ext, Res);
984 } else {
985 Res = Ext;
986 }
987 }
988 return Res;
989 }
// NOTE(review): embedded line 990 (case label) and line 1010 (the call that
// starts the vector branch of the conditional below) are missing. The body
// counts trailing zero elements of the mask operand(s).
991 Type *Ty = this->getScalarType();
992 if (getNumOperands() == 1) {
993 Value *Mask = State.get(getOperand(0));
994 return Builder.CreateCountTrailingZeroElems(Ty, Mask,
995 /*ZeroIsPoison=*/false, Name);
996 }
997 // If there are multiple operands, create a chain of selects to pick the
998 // first operand with an active lane and add the number of lanes of the
999 // preceding operands.
1000 Value *RuntimeVF = getRuntimeVF(Builder, Ty, State.VF);
1001 unsigned LastOpIdx = getNumOperands() - 1;
1002 Value *Res = nullptr;
// Walks the operands from last to first, so the select for the first operand
// ends up outermost in the chain.
1003 for (int Idx = LastOpIdx; Idx >= 0; --Idx) {
1004 Value *TrailingZeros =
1005 State.VF.isScalar()
1006 ? Builder.CreateZExt(
1007 Builder.CreateICmpEQ(State.get(getOperand(Idx)),
1008 Builder.getFalse()),
1009 Ty)
1011 Ty, State.get(getOperand(Idx)),
1012 /*ZeroIsPoison=*/false, Name);
1013 Value *Current = Builder.CreateAdd(
1014 Builder.CreateMul(RuntimeVF, ConstantInt::get(Ty, Idx)),
1015 TrailingZeros);
1016 if (Res) {
// TrailingZeros == RuntimeVF means no lane of this operand is active, so the
// result computed for the later operands is kept.
1017 Value *Cmp = Builder.CreateICmpNE(TrailingZeros, RuntimeVF);
1018 Res = Builder.CreateSelect(Cmp, Current, Res);
1019 } else {
1020 Res = Current;
1021 }
1022 }
1023
1024 return Res;
1025 }
// NOTE(review): embedded lines 1026, 1028 and 1030 (three case labels) are
// missing. The first body returns operand 0 as a scalar, the second reverses
// the vector operand, the third starts the extract-last-active chain.
1027 return State.get(getOperand(0), true);
1029 return Builder.CreateVectorReverse(State.get(getOperand(0)), "reverse");
1031 Value *Result = State.get(getOperand(0), /*IsScalar=*/true);
// Operands after the first come in (Data, Mask) pairs; each pair refines the
// running Result.
1032 for (unsigned Idx = 1; Idx < getNumOperands(); Idx += 2) {
1033 Value *Data = State.get(getOperand(Idx));
1034 Value *Mask = State.get(getOperand(Idx + 1));
1035 Type *VTy = Data->getType();
1036
1037 if (State.VF.isScalar())
1038 Result = Builder.CreateSelect(Mask, Data, Result);
1039 else
1040 Result = Builder.CreateIntrinsic(
1041 Intrinsic::experimental_vector_extract_last_active, {VTy},
1042 {Data, Mask, Result});
1043 }
1044
1045 return Result;
1046 }
1047 default:
1048 llvm_unreachable("Unsupported opcode for instruction");
1049 }
1050}
1051
1053 unsigned Opcode, ElementCount VF, VPCostContext &Ctx) const {
1054 Type *ScalarTy = this->getScalarType();
1055 Type *ResultTy = VF.isVector() ? toVectorTy(ScalarTy, VF) : ScalarTy;
1056 switch (Opcode) {
1057 case Instruction::FNeg:
1058 return Ctx.TTI.getArithmeticInstrCost(Opcode, ResultTy, Ctx.CostKind);
1059 case Instruction::UDiv:
1060 case Instruction::SDiv:
1061 case Instruction::SRem:
1062 case Instruction::URem:
1063 case Instruction::Add:
1064 case Instruction::FAdd:
1065 case Instruction::Sub:
1066 case Instruction::FSub:
1067 case Instruction::Mul:
1068 case Instruction::FMul:
1069 case Instruction::FDiv:
1070 case Instruction::FRem:
1071 case Instruction::Shl:
1072 case Instruction::LShr:
1073 case Instruction::AShr:
1074 case Instruction::And:
1075 case Instruction::Or:
1076 case Instruction::Xor: {
1077 // Certain instructions can be cheaper if they have a constant second
1078 // operand. One example of this are shifts on x86.
1079 VPValue *RHS = getOperand(1);
1080 TargetTransformInfo::OperandValueInfo RHSInfo = Ctx.getOperandInfo(RHS);
1081
1082 if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
1085
1088 if (CtxI)
1089 Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
1090 return Ctx.TTI.getArithmeticInstrCost(
1091 Opcode, ResultTy, Ctx.CostKind,
1092 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
1093 RHSInfo, Operands, CtxI, &Ctx.TLI);
1094 }
1095 case Instruction::Freeze:
1096 // NOTE: The only way to ask for the cost is via getInstructionCost, which
1097 // requires the actual vector instruction. Instead, both here and in the
1098 // LoopVectorizationCostModel::getInstructionCost the costs mirror the
1099 // current behaviour in llvm/Analysis/TargetTransformInfoImpl.h to keep
1100 // them in sync.
1101 return TTI::TCC_Free;
1102 case Instruction::ExtractValue:
1103 return Ctx.TTI.getInsertExtractValueCost(Instruction::ExtractValue,
1104 Ctx.CostKind);
1105 case Instruction::ICmp:
1106 case Instruction::FCmp: {
1107 Type *ScalarOpTy = getOperand(0)->getScalarType();
1108 Type *OpTy = VF.isVector() ? toVectorTy(ScalarOpTy, VF) : ScalarOpTy;
1110 return Ctx.TTI.getCmpSelInstrCost(
1111 Opcode, OpTy, CmpInst::makeCmpResultType(OpTy), getPredicate(),
1112 Ctx.CostKind, {TTI::OK_AnyValue, TTI::OP_None},
1113 {TTI::OK_AnyValue, TTI::OP_None}, CtxI);
1114 }
1115 case Instruction::BitCast: {
1116 Type *ScalarTy = this->getScalarType();
1117 if (ScalarTy->isPointerTy())
1118 return 0;
1119 [[fallthrough]];
1120 }
1121 case Instruction::SExt:
1122 case Instruction::ZExt:
1123 case Instruction::FPToUI:
1124 case Instruction::FPToSI:
1125 case Instruction::FPExt:
1126 case Instruction::PtrToInt:
1127 case Instruction::PtrToAddr:
1128 case Instruction::IntToPtr:
1129 case Instruction::SIToFP:
1130 case Instruction::UIToFP:
1131 case Instruction::Trunc:
1132 case Instruction::FPTrunc:
1133 case Instruction::AddrSpaceCast: {
1134 // Computes the CastContextHint from a recipe that may access memory.
1135 auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint {
1136 if (isa<VPInterleaveBase>(R))
1138 if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(R)) {
1139 // Only compute CCH for memory operations, matching the legacy model
1140 // which only considers loads/stores for cast context hints.
1141 auto *UI = cast<Instruction>(ReplicateRecipe->getUnderlyingValue());
1142 if (!isa<LoadInst, StoreInst>(UI))
1144 return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked
1146 }
1147 const auto *WidenMemoryRecipe = dyn_cast<VPWidenMemoryRecipe>(R);
1148 if (WidenMemoryRecipe == nullptr)
1150 if (VF.isScalar())
1152 if (!WidenMemoryRecipe->isConsecutive())
1154 if (WidenMemoryRecipe->isMasked())
1157 };
1158
1159 VPValue *Operand = getOperand(0);
1161 bool IsReverse = false;
1162 // For Trunc/FPTrunc, get the context from the only user.
1163 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
1164 auto GetOnlyUser = [](const VPSingleDefRecipe *R) -> VPRecipeBase * {
1165 if (R->getNumUsers() == 0 || R->hasMoreThanOneUniqueUser())
1166 return nullptr;
1167 return dyn_cast<VPRecipeBase>(*R->user_begin());
1168 };
1169 if (VPRecipeBase *Recipe = GetOnlyUser(this)) {
1170 if (match(Recipe,
1174 Recipe = GetOnlyUser(cast<VPSingleDefRecipe>(Recipe));
1175 IsReverse = true;
1176 }
1177 if (Recipe)
1178 CCH = ComputeCCH(Recipe);
1179 }
1180 }
1181 // For ZExt/SExt/FPExt, get the context from the operand.
1182 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
1183 Opcode == Instruction::FPExt) {
1184 if (auto *Recipe = Operand->getDefiningRecipe()) {
1185 VPValue *ReverseOp;
1186 if (match(Recipe,
1187 m_CombineOr(m_Reverse(m_VPValue(ReverseOp)),
1189 m_VPValue(ReverseOp))))) {
1190 Recipe = ReverseOp->getDefiningRecipe();
1191 IsReverse = true;
1192 }
1193 if (Recipe)
1194 CCH = ComputeCCH(Recipe);
1195 }
1196 }
1197 if (IsReverse && CCH != TTI::CastContextHint::None)
1199
1200 auto *ScalarSrcTy = Operand->getScalarType();
1201 Type *SrcTy = VF.isVector() ? toVectorTy(ScalarSrcTy, VF) : ScalarSrcTy;
1202 // Arm TTI will use the underlying instruction to determine the cost.
1203 return Ctx.TTI.getCastInstrCost(
1204 Opcode, ResultTy, SrcTy, CCH, Ctx.CostKind,
1206 }
1207 case Instruction::Select: {
1209 bool IsScalarCond = getOperand(0)->isDefinedOutsideLoopRegions();
1210 Type *ScalarTy = this->getScalarType();
1211
1212 VPValue *Op0, *Op1;
1213 bool IsLogicalAnd =
1214 match(this, m_c_LogicalAnd(m_VPValue(Op0), m_VPValue(Op1)));
1215 bool IsLogicalOr =
1216 match(this, m_c_LogicalOr(m_VPValue(Op0), m_VPValue(Op1)));
1217 // Also match the inverted forms:
1218 // select x, false, y --> !x & y (still AND)
1219 // select x, y, true --> !x | y (still OR)
1220 IsLogicalAnd |=
1221 match(this, m_Select(m_VPValue(Op0), m_False(), m_VPValue(Op1)));
1222 IsLogicalOr |=
1223 match(this, m_Select(m_VPValue(Op0), m_VPValue(Op1), m_True()));
1224
1225 if (!IsScalarCond && ScalarTy->getScalarSizeInBits() == 1 &&
1226 (IsLogicalAnd || IsLogicalOr)) {
1227 // select x, y, false --> x & y
1228 // select x, true, y --> x | y
1229 const auto [Op1VK, Op1VP] = Ctx.getOperandInfo(Op0);
1230 const auto [Op2VK, Op2VP] = Ctx.getOperandInfo(Op1);
1231
1233 if (SI && all_of(operands(),
1234 [](VPValue *Op) { return Op->getUnderlyingValue(); }))
1235 append_range(Operands, SI->operands());
1236 return Ctx.TTI.getArithmeticInstrCost(
1237 IsLogicalOr ? Instruction::Or : Instruction::And, ResultTy,
1238 Ctx.CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, SI);
1239 }
1240
1241 Type *CondTy = getOperand(0)->getScalarType();
1242 if (!IsScalarCond && VF.isVector())
1243 CondTy = VectorType::get(CondTy, VF);
1244
1245 llvm::CmpPredicate Pred;
1246 if (!match(getOperand(0), m_Cmp(Pred, m_VPValue(), m_VPValue())))
1247 if (auto *CondIRV = dyn_cast<VPIRValue>(getOperand(0)))
1248 if (auto *Cmp = dyn_cast<CmpInst>(CondIRV->getValue()))
1249 Pred = Cmp->getPredicate();
1250 Type *VectorTy = toVectorTy(this->getScalarType(), VF);
1251 return Ctx.TTI.getCmpSelInstrCost(
1252 Instruction::Select, VectorTy, CondTy, Pred, Ctx.CostKind,
1253 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, SI);
1254 }
1255 }
1256 llvm_unreachable("called for unsupported opcode");
1257}
1258
1260 VPCostContext &Ctx) const {
1262 if (!getUnderlyingValue() && getOpcode() != Instruction::FMul) {
1263 // TODO: Compute cost for VPInstructions without underlying values once
1264 // the legacy cost model has been retired.
1265 return 0;
1266 }
1267
1269 "Should only generate a vector value or single scalar, not scalars "
1270 "for all lanes.");
1272 getOpcode(),
1274 }
1275
1276 switch (getOpcode()) {
1277 case Instruction::Select: {
1279 match(getOperand(0), m_Cmp(Pred, m_VPValue(), m_VPValue()));
1280 auto *CondTy = getOperand(0)->getScalarType();
1281 auto *VecTy = getOperand(1)->getScalarType();
1282 if (!vputils::onlyFirstLaneUsed(this)) {
1283 CondTy = toVectorTy(CondTy, VF);
1284 VecTy = toVectorTy(VecTy, VF);
1285 }
1286 return Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VecTy, CondTy, Pred,
1287 Ctx.CostKind);
1288 }
1289 case Instruction::ExtractElement:
1291 if (VF.isScalar()) {
1292 // ExtractLane with VF=1 takes care of handling extracting across multiple
1293 // parts.
1294 return 0;
1295 }
1296
1297 // Add on the cost of extracting the element.
1298 auto *VecTy = toVectorTy(getOperand(0)->getScalarType(), VF);
1299 return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
1300 Ctx.CostKind);
1301 }
1302 case VPInstruction::AnyOf: {
1303 auto *VecTy = toVectorTy(this->getScalarType(), VF);
1304 return Ctx.TTI.getArithmeticReductionCost(
1305 Instruction::Or, cast<VectorType>(VecTy), std::nullopt, Ctx.CostKind);
1306 }
1308 Type *Ty = this->getScalarType();
1309 Type *ScalarTy = getOperand(0)->getScalarType();
1310 if (VF.isScalar())
1311 return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
1313 CmpInst::ICMP_EQ, Ctx.CostKind);
1314 // Calculate the cost of determining the lane index.
1315 auto *PredTy = toVectorTy(ScalarTy, VF);
1316 IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts, Ty,
1317 {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
1318 return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
1319 }
1321 Type *Ty = this->getScalarType();
1322 Type *ScalarTy = getOperand(0)->getScalarType();
1323 if (VF.isScalar())
1324 return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
1326 CmpInst::ICMP_EQ, Ctx.CostKind);
1327 // Calculate the cost of determining the lane index: NOT + cttz_elts + SUB.
1328 auto *PredTy = toVectorTy(ScalarTy, VF);
1329 IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts, Ty,
1330 {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
1331 InstructionCost Cost = Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
1332 // Add cost of NOT operation on the predicate.
1333 Cost += Ctx.TTI.getArithmeticInstrCost(
1334 Instruction::Xor, PredTy, Ctx.CostKind,
1335 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
1336 {TargetTransformInfo::OK_UniformConstantValue,
1337 TargetTransformInfo::OP_None});
1338 // Add cost of SUB operation on the index.
1339 Cost += Ctx.TTI.getArithmeticInstrCost(Instruction::Sub, Ty, Ctx.CostKind);
1340 return Cost;
1341 }
1343 Type *ScalarTy = this->getScalarType();
1344 Type *VecTy = toVectorTy(ScalarTy, VF);
1345 Type *MaskTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF);
1347 Intrinsic::experimental_vector_extract_last_active, ScalarTy,
1348 {VecTy, MaskTy, ScalarTy});
1349 return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind);
1350 }
1352 assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?");
1353 Type *VectorTy = toVectorTy(this->getScalarType(), VF);
1354 return Ctx.TTI.getShuffleCost(
1356 cast<VectorType>(VectorTy), {}, Ctx.CostKind, -1);
1357 }
1359 Type *ArgTy = getOperand(0)->getScalarType();
1360 unsigned Multiplier = cast<VPConstantInt>(getOperand(2))->getZExtValue();
1361 Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF * Multiplier);
1362 IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy,
1363 {ArgTy, ArgTy});
1364 return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
1365 }
1367 Type *Arg0Ty = getOperand(0)->getScalarType();
1368 Type *I32Ty = Type::getInt32Ty(Ctx.LLVMCtx);
1369 Type *I1Ty = Type::getInt1Ty(Ctx.LLVMCtx);
1370 IntrinsicCostAttributes Attrs(Intrinsic::experimental_get_vector_length,
1371 I32Ty, {Arg0Ty, I32Ty, I1Ty});
1372 return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
1373 }
1375 assert(VF.isVector() && "Reverse operation must be vector type");
1376 Type *EltTy = this->getScalarType();
1377 // Skip the reverse operation cost for the mask.
1378 // FIXME: Remove this once redundant mask reverse operations can be
1379 // eliminated by VPlanTransforms::cse before cost computation.
1380 if (EltTy->isIntegerTy(1))
1381 return 0;
1382 auto *VectorTy = cast<VectorType>(toVectorTy(EltTy, VF));
1383 return Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
1384 VectorTy, /*Mask=*/{}, Ctx.CostKind,
1385 /*Index=*/0);
1386 }
1388 // Add on the cost of extracting the element.
1389 auto *VecTy = toVectorTy(getOperand(0)->getScalarType(), VF);
1390 return Ctx.TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement,
1391 VecTy, Ctx.CostKind, 0);
1392 }
1393 case VPInstruction::Not: {
1394 Type *ValTy = this->getScalarType();
1395 // InstCombine will fold `xor` to the conditional branch.
1396 if (auto *U = const_cast<VPUser *>(getSingleUser()))
1397 if (match(U, m_BranchOnCond(m_VPValue())))
1398 return 0;
1399 if (!vputils::onlyFirstLaneUsed(this))
1400 ValTy = toVectorTy(ValTy, VF);
1401 return Ctx.TTI.getArithmeticInstrCost(Instruction::Xor, ValTy,
1402 Ctx.CostKind);
1403 }
1405 // If TC <= VF then this is just a branch.
1406 // FIXME: Removing the branch happens in simplifyBranchConditionForVFAndUF
1407 // where it checks TC <= VF * UF, but we don't know UF yet. This means in
1408 // some cases we get a cost that's too high due to counting a cmp that
1409 // later gets removed.
1410 // FIXME: The compare could also be removed if TC = M * vscale,
1411 // VF = N * vscale, and M <= N. Detecting that would require having the
1412 // trip count as a SCEV though.
1415 if (TCConst && TCConst->getValue().ule(VF.getKnownMinValue()))
1416 return 0;
1417 // Otherwise BranchOnCount generates ICmpEQ followed by a branch.
1418 Type *ValTy = getOperand(0)->getScalarType();
1419 return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ValTy,
1421 CmpInst::ICMP_EQ, Ctx.CostKind);
1422 }
1423 case Instruction::FCmp:
1424 case Instruction::ICmp:
1426 getOpcode(),
1429 if (VF == ElementCount::getScalable(1))
1431 [[fallthrough]];
1432 default:
1433 // TODO: Compute cost for other VPInstructions once the legacy cost model
1434 // has been retired.
1436 "unexpected VPInstruction witht underlying value");
1437 return 0;
1438 }
1439}
1440
1453
1455 switch (getOpcode()) {
1456 case Instruction::Load:
1457 case Instruction::PHI:
1461 return true;
1462 default:
1464 }
1465}
1466
1468#ifndef NDEBUG
1469 Type *Ty = Op->getScalarType();
1470 switch (getOpcode()) {
1474 assert(Ty == getOperand(0)->getScalarType() &&
1475 "types of operand 0 and new operand must match");
1476 break;
1480 assert(Ty == getOperand(0)->getScalarType() &&
1481 "appended operand must match operand 0's scalar type");
1482 break;
1484 assert(Ty == getOperand(1)->getScalarType() &&
1485 "appended operand must match operand 1's scalar type");
1486 break;
1488 // The recipe is constructed with 3 operands (result, data, mask). Extra
1489 // operands beyond that are appended in (data, mask) pairs.
1490 constexpr unsigned NumInitialOperands = 3;
1491 assert(getNumOperands() >= NumInitialOperands &&
1492 "ExtractLastActive must have at least the initial 3 operands");
1493 bool IsMaskSlot = ((getNumOperands() - NumInitialOperands) & 1u) == 1u;
1494 assert((IsMaskSlot ? Ty->isIntegerTy(1)
1495 : Ty == getOperand(1)->getScalarType()) &&
1496 "ExtractLastActive expects alternating data/mask operands "
1497 "matching operand 1's type and i1, respectively");
1498 break;
1499 }
1500 default:
1501 llvm_unreachable("opcode does not support growing the operand list "
1502 "outside of construction");
1503 }
1504#endif
1506}
1507
1509 assert(!isMasked() && "cannot execute masked VPInstruction");
1510 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
1512 "Set flags not supported for the provided opcode");
1514 "Opcode requires specific flags to be set");
1515 if (hasFastMathFlags())
1516 State.Builder.setFastMathFlags(getFastMathFlags());
1517 Value *GeneratedValue = generate(State);
1518 if (!hasResult())
1519 return;
1520 assert(GeneratedValue && "generate must produce a value");
1521 bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
1524 assert((((GeneratedValue->getType()->isVectorTy() ||
1525 GeneratedValue->getType()->isStructTy()) ==
1526 !GeneratesPerFirstLaneOnly) ||
1527 State.VF.isScalar()) &&
1528 "scalar value but not only first lane defined");
1529 State.set(this, GeneratedValue,
1530 /*IsScalar*/ GeneratesPerFirstLaneOnly);
1532 // FIXME: This is a workaround to enable reliable updates of the scalar loop
1533 // resume phis, when vectorizing the epilogue. Must be removed once epilogue
1534 // vectorization explicitly connects VPlans.
1535 setUnderlyingValue(GeneratedValue);
1536 }
1537}
1538
1542 return false;
1543 switch (getOpcode()) {
1544 case Instruction::ExtractValue:
1545 case Instruction::InsertValue:
1546 case Instruction::GetElementPtr:
1547 case Instruction::ExtractElement:
1548 case Instruction::InsertElement:
1549 case Instruction::Freeze:
1550 case Instruction::FCmp:
1551 case Instruction::ICmp:
1552 case Instruction::Select:
1553 case Instruction::PHI:
1578 case VPInstruction::Not:
1587 return false;
1588 case Instruction::Call:
1591 default:
1592 return true;
1593 }
1594}
1595
1597 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1599 return vputils::onlyFirstLaneUsed(this);
1600
1601 switch (getOpcode()) {
1602 default:
1603 return false;
1604 case Instruction::ExtractElement:
1605 return Op == getOperand(1);
1606 case Instruction::InsertElement:
1607 return Op == getOperand(1) || Op == getOperand(2);
1608 case Instruction::PHI:
1609 return true;
1610 case Instruction::FCmp:
1611 case Instruction::ICmp:
1612 case Instruction::Select:
1613 case Instruction::Or:
1614 case Instruction::Freeze:
1615 case VPInstruction::Not:
1616 // TODO: Cover additional opcodes.
1617 return vputils::onlyFirstLaneUsed(this);
1618 case Instruction::Load:
1628 return true;
1631 // Before replicating by VF, Build(Struct)Vector uses all lanes of the
1632 // operand, after replicating its operands only the first lane is used.
1633 // Before replicating, it will have only a single operand.
1634 return getNumOperands() > 1;
1636 return Op == getOperand(0) || vputils::onlyFirstLaneUsed(this);
1638 // WidePtrAdd supports scalar and vector base addresses.
1639 return false;
1642 return Op == getOperand(0);
1643 };
1644 llvm_unreachable("switch should return");
1645}
1646
1648 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1650 return vputils::onlyFirstPartUsed(this);
1651
1652 switch (getOpcode()) {
1653 default:
1654 return false;
1655 case Instruction::FCmp:
1656 case Instruction::ICmp:
1657 case Instruction::Select:
1658 return vputils::onlyFirstPartUsed(this);
1663 return true;
1664 };
1665 llvm_unreachable("switch should return");
1666}
1667
1668#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1670 VPSlotTracker SlotTracker(getParent()->getPlan());
1672}
1673
1675 VPSlotTracker &SlotTracker) const {
1676 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
1677
1678 if (hasResult()) {
1680 O << " = ";
1681 }
1682
1683 switch (getOpcode()) {
1684 case VPInstruction::Not:
1685 O << "not";
1686 break;
1688 O << "active lane mask";
1689 break;
1691 O << "incoming-alias-mask";
1692 break;
1694 O << "EXPLICIT-VECTOR-LENGTH";
1695 break;
1697 O << "first-order splice";
1698 break;
1700 O << "branch-on-cond";
1701 break;
1703 O << "branch-on-two-conds";
1704 break;
1706 O << "TC > VF ? TC - VF : 0";
1707 break;
1709 O << "VF * Part +";
1710 break;
1712 O << "branch-on-count";
1713 break;
1715 O << "broadcast";
1716 break;
1718 O << "buildstructvector";
1719 break;
1721 O << "buildvector";
1722 break;
1724 O << "exiting-iv-value";
1725 break;
1727 O << "masked-cond";
1728 break;
1730 O << "extract-lane";
1731 break;
1733 O << "extract-last-lane";
1734 break;
1736 O << "extract-last-part";
1737 break;
1739 O << "extract-penultimate-element";
1740 break;
1742 O << "compute-reduction-result";
1743 break;
1745 O << "logical-and";
1746 break;
1748 O << "logical-or";
1749 break;
1751 O << "ptradd";
1752 break;
1754 O << "wide-ptradd";
1755 break;
1757 O << "any-of";
1758 break;
1760 O << "first-active-lane";
1761 break;
1763 O << "last-active-lane";
1764 break;
1766 O << "reduction-start-vector";
1767 break;
1769 O << "resume-for-epilogue";
1770 break;
1772 O << "reverse";
1773 break;
1775 O << "unpack";
1776 break;
1778 O << "extract-last-active";
1779 break;
1781 O << "num-active-lanes";
1782 break;
1783 default:
1785 }
1786
1787 printFlags(O);
1789}
1790#endif
1791
1793 Type *ResultTy = getResultType();
1795 Value *Op = State.get(getOperand(0), VPLane(0));
1796 Value *Cast = State.Builder.CreateCast(Instruction::CastOps(getOpcode()),
1797 Op, ResultTy);
1798 if (auto *CastOp = dyn_cast<Instruction>(Cast)) {
1799 applyFlags(*CastOp);
1800 applyMetadata(*CastOp);
1801 }
1802 State.set(this, Cast, VPLane(0));
1803 return;
1804 }
1805 switch (getOpcode()) {
1807 Value *StepVector =
1808 State.Builder.CreateStepVector(VectorType::get(ResultTy, State.VF));
1809 State.set(this, StepVector);
1810 break;
1811 }
1812 case VPInstruction::VScale: {
1813 Value *VScale = State.Builder.CreateVScale(ResultTy);
1814 State.set(this, VScale, true);
1815 break;
1816 }
1817
1818 default:
1819 llvm_unreachable("opcode not implemented yet");
1820 }
1821}
1822
1823#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1825 VPSlotTracker &SlotTracker) const {
1826 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
1828 O << " = ";
1829
1830 Type *ResultTy = getResultType();
1831 switch (getOpcode()) {
1833 O << "wide-iv-step ";
1835 break;
1837 O << "step-vector " << *ResultTy;
1838 break;
1840 O << "vscale " << *ResultTy;
1841 break;
1842 case Instruction::Load:
1843 O << "load ";
1845 break;
1846 default:
1847 assert(Instruction::isCast(getOpcode()) && "unhandled opcode");
1849 printFlags(O);
1851 O << " to " << *ResultTy;
1852 }
1853}
1854#endif
1855
// Emit one scalar IR phi of this recipe's scalar type and register it as the
// lane-0 value of the recipe. Only the incoming values that can already be
// looked up in State are attached here. The literal 2 passed to CreatePHI is
// presumably the reserved-incoming-count hint (see IRBuilder docs).
1857 PHINode *NewPhi = State.Builder.CreatePHI(getScalarType(), 2, getName());
1858 unsigned NumIncoming = getNumIncoming();
1859 // Detect header phis: the parent block dominates its second incoming block
1860 // (the latch). Those IR incoming values have not been generated yet and need
1861 // to be added after they have been executed.
1862 if (NumIncoming == 2 &&
1863 State.VPDT.dominates(getParent(), getIncomingBlock(1))) {
1864 NumIncoming = 1;
1865 }
// Wire up the selected incoming values: each one is the lane-0 IR value of
// the VPlan incoming value, paired with the IR block that State maps the
// VPlan incoming block to (.at() requires that mapping to exist already).
1866 for (unsigned Idx = 0; Idx != NumIncoming; ++Idx) {
1867 Value *IncV = State.get(getIncomingValue(Idx), VPLane(0));
1868 BasicBlock *PredBB = State.CFG.VPBB2IRBB.at(getIncomingBlock(Idx));
1869 NewPhi->addIncoming(IncV, PredBB);
1870 }
1871 State.set(this, NewPhi, VPLane(0));
1872}
1873
1874#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1875void VPPhi::printRecipe(raw_ostream &O, const Twine &Indent,
1876 VPSlotTracker &SlotTracker) const {
1877 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
1879 O << " = phi";
1880 printFlags(O);
1882}
1883#endif
1884
1885VPIRInstruction *VPIRInstruction ::create(Instruction &I) {
1886 if (auto *Phi = dyn_cast<PHINode>(&I))
1887 return new VPIRPhi(*Phi);
1888 return new VPIRInstruction(I);
1889}
1890
// No IR is generated for the wrapped instruction here; the only effect of this
// body is to reposition the builder. Phi wrappers and recipes that carry
// operands are rejected by the assert below.
1892 assert(!isa<VPIRPhi>(this) && getNumOperands() == 0 &&
1893 "PHINodes must be handled by VPIRPhi");
1894 // Advance the insert point after the wrapped IR instruction. This allows
1895 // interleaving VPIRInstructions and other recipes.
1896 State.Builder.SetInsertPoint(I.getParent(), std::next(I.getIterator()));
1897}
1898
1900 VPCostContext &Ctx) const {
1901 // The recipe wraps an existing IR instruction on the border of VPlan's scope,
1902 // hence it does not contribute to the cost-modeling for the VPlan.
1903 return 0;
1904}
1905
1906#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1908 VPSlotTracker &SlotTracker) const {
1909 O << Indent << "IR " << I;
1910}
1911#endif
1912
1914 PHINode *Phi = &getIRPhi();
1915 for (const auto &[Idx, Op] : enumerate(operands())) {
1916 VPValue *ExitValue = Op;
1917 auto Lane = vputils::isSingleScalar(ExitValue)
1919 : VPLane::getLastLaneForVF(State.VF);
1920 VPBlockBase *Pred = getParent()->getPredecessors()[Idx];
1921 auto *PredVPBB = Pred->getExitingBasicBlock();
1922 BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
1923 // Set insertion point in PredBB in case an extract needs to be generated.
1924 // TODO: Model extracts explicitly.
1925 State.Builder.SetInsertPoint(PredBB->getTerminator());
1926 Value *V = State.get(ExitValue, VPLane(Lane));
1927 // If there is no existing block for PredBB in the phi, add a new incoming
1928 // value. Otherwise update the existing incoming value for PredBB.
1929 if (Phi->getBasicBlockIndex(PredBB) == -1)
1930 Phi->addIncoming(V, PredBB);
1931 else
1932 Phi->setIncomingValueForBlock(PredBB, V);
1933 }
1934
1935 // Advance the insert point after the wrapped IR instruction. This allows
1936 // interleaving VPIRInstructions and other recipes.
1937 State.Builder.SetInsertPoint(Phi->getParent(), std::next(Phi->getIterator()));
1938}
1939
// Drop the incoming value that belongs to IncomingBlock. The operand to remove
// is found by position: it sits at the same index that IncomingBlock has in
// the parent block's predecessor list.
// The recipe is obtained through a const_cast because its operand list is
// modified below.
1941 VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
// The index lookup is only meaningful while operands and predecessors are in
// one-to-one correspondence.
1942 assert(R->getNumOperands() == R->getParent()->getNumPredecessors() &&
1943 "Number of phi operands must match number of predecessors");
1944 unsigned Position = R->getParent()->getIndexForPredecessor(IncomingBlock);
1945 R->removeOperand(Position);
1946}
1947
1948VPValue *
1950 VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
1951 return getIncomingValue(R->getParent()->getIndexForPredecessor(VPBB));
1952}
1953
1955 VPValue *V) const {
1956 VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
1957 R->setOperand(R->getParent()->getIndexForPredecessor(VPBB), V);
1958}
1959
1960#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1962 VPSlotTracker &SlotTracker) const {
1963 interleaveComma(enumerate(getAsRecipe()->operands()), O,
1964 [this, &O, &SlotTracker](auto Op) {
1965 O << "[ ";
1966 Op.value()->printAsOperand(O, SlotTracker);
1967 O << ", ";
1968 getIncomingBlock(Op.index())->printAsOperand(O);
1969 O << " ]";
1970 });
1971}
1972#endif
1973
1974#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1976 VPSlotTracker &SlotTracker) const {
1978
1979 if (getNumOperands() != 0) {
1980 O << " (extra operand" << (getNumOperands() > 1 ? "s" : "") << ": ";
1982 [&O, &SlotTracker](auto Op) {
1983 std::get<0>(Op)->printAsOperand(O, SlotTracker);
1984 O << " from ";
1985 std::get<1>(Op)->printAsOperand(O);
1986 });
1987 O << ")";
1988 }
1989}
1990#endif
1991
// Attach every stored (kind, node) metadata pair to I via setMetadata.
1993 for (const auto &[Kind, Node] : Metadata)
1994 I.setMetadata(Kind, Node);
1995}
1996
// Reduce Metadata to the entries that Other also holds: an entry survives only
// if Other has a pair with the same kind and the same node pointer. The
// relative order of the surviving entries is kept.
1998 SmallVector<std::pair<unsigned, MDNode *>> MetadataIntersection;
// Pairwise scan of both lists.
1999 for (const auto &[KindA, MDA] : Metadata) {
2000 for (const auto &[KindB, MDB] : Other.Metadata) {
2001 if (KindA == KindB && MDA == MDB) {
2002 MetadataIntersection.emplace_back(KindA, MDA);
// First match is enough; stop so the entry is recorded at most once.
2003 break;
2004 }
2005 }
2006 }
// Replace the stored list with the intersection.
2007 Metadata = std::move(MetadataIntersection);
2008}
2009
2010#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2012 const Module *M = SlotTracker.getModule();
2013 if (Metadata.empty() || !M)
2014 return;
2015
2016 ArrayRef<StringRef> MDNames = SlotTracker.getMDNames();
2017 O << " (";
2018 interleaveComma(Metadata, O, [&](const auto &KindNodePair) {
2019 auto [Kind, Node] = KindNodePair;
2020 assert(Kind < MDNames.size() && !MDNames[Kind].empty() &&
2021 "Unexpected unnamed metadata kind");
2022 O << "!" << MDNames[Kind] << " ";
2023 Node->printAsOperand(O, M);
2024 });
2025 O << ")";
2026}
2027#endif
2028
2030 assert(State.VF.isVector() && "not widening");
2031 assert(Variant != nullptr && "Can't create vector function.");
2032
2033 FunctionType *VFTy = Variant->getFunctionType();
2034 // Add return type if intrinsic is overloaded on it.
2036 for (const auto &I : enumerate(args())) {
2037 Value *Arg;
2038 // Some vectorized function variants may also take a scalar argument,
2039 // e.g. linear parameters for pointers. This needs to be the scalar value
2040 // from the start of the respective part when interleaving.
2041 if (!VFTy->getParamType(I.index())->isVectorTy())
2042 Arg = State.get(I.value(), VPLane(0));
2043 else
2044 Arg = State.get(I.value(), usesFirstLaneOnly(I.value()));
2045 Args.push_back(Arg);
2046 }
2047
2050 if (CI)
2051 CI->getOperandBundlesAsDefs(OpBundles);
2052
2053 CallInst *V = State.Builder.CreateCall(Variant, Args, OpBundles);
2054 applyFlags(*V);
2055 applyMetadata(*V);
2056 V->setCallingConv(Variant->getCallingConv());
2057
2058 if (!V->getType()->isVoidTy())
2059 State.set(this, V);
2060}
2061
2063 VPCostContext &Ctx) const {
2064 assert(getVectorizedTypeVF(Variant->getReturnType()) == VF &&
2065 "Variant return type must match VF");
2066 return computeCallCost(Variant, Ctx);
2067}
2068
2070 VPCostContext &Ctx) {
2071 return Ctx.TTI.getCallInstrCost(nullptr, Variant->getReturnType(),
2072 Variant->getFunctionType()->params(),
2073 Ctx.CostKind);
2074}
2075
// Returns true when, at every argument position where Op is passed, the
// corresponding parameter type of the variant's function type is an integer,
// floating-point, pointer or byte type. Argument positions holding a value
// other than Op do not affect the result.
2077 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
2078 assert(Variant && "Variant not set");
2079 FunctionType *VFTy = Variant->getFunctionType();
2080 return all_of(enumerate(args()), [VFTy, &Op](const auto &Arg) {
2081 auto [Idx, V] = Arg;
2082 Type *ArgTy = VFTy->getParamType(Idx);
// "V != Op" makes unrelated arguments pass trivially.
2083 return V != Op || ArgTy->isIntegerTy() || ArgTy->isFloatingPointTy() ||
2084 ArgTy->isPointerTy() || ArgTy->isByteTy();
2085 });
2086}
2087
2088#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2090 VPSlotTracker &SlotTracker) const {
2091 O << Indent << "WIDEN-CALL ";
2092
2093 Function *CalledFn = getCalledScalarFunction();
2094 if (CalledFn->getReturnType()->isVoidTy())
2095 O << "void ";
2096 else {
2098 O << " = ";
2099 }
2100
2101 O << "call";
2102 printFlags(O);
2103 O << " @" << CalledFn->getName() << "(";
2104 interleaveComma(args(), O, [&O, &SlotTracker](VPValue *Op) {
2105 Op->printAsOperand(O, SlotTracker);
2106 });
2107 O << ")";
2108
2109 O << " (using library function";
2110 if (Variant->hasName())
2111 O << ": " << Variant->getName();
2112 O << ")";
2113}
2114#endif
2115
2117 assert(State.VF.isVector() && "not widening");
2118
2119 SmallVector<Type *, 2> TysForDecl;
2120 // Add return type if intrinsic is overloaded on it.
2121 if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1,
2122 State.TTI)) {
2123 Type *RetTy = toVectorizedTy(getScalarType(), State.VF);
2124 ArrayRef<Type *> ContainedTys = getContainedTypes(RetTy);
2125 for (auto [Idx, Ty] : enumerate(ContainedTys)) {
2127 Idx, State.TTI))
2128 TysForDecl.push_back(Ty);
2129 }
2130 }
2132 for (const auto &I : enumerate(operands())) {
2133 // Some intrinsics have a scalar argument - don't replace it with a
2134 // vector.
2135 Value *Arg;
2136 if (isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index(),
2137 State.TTI))
2138 Arg = State.get(I.value(), VPLane(0));
2139 else
2140 Arg = State.get(I.value(), usesFirstLaneOnly(I.value()));
2141 if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index(),
2142 State.TTI))
2143 TysForDecl.push_back(Arg->getType());
2144 Args.push_back(Arg);
2145 }
2146
2147 // Use vector version of the intrinsic.
2148 Module *M = State.Builder.GetInsertBlock()->getModule();
2149 Function *VectorF =
2150 Intrinsic::getOrInsertDeclaration(M, VectorIntrinsicID, TysForDecl);
2151 assert(VectorF &&
2152 "Can't retrieve vector intrinsic or vector-predication intrinsics.");
2153
2156 if (CI)
2157 CI->getOperandBundlesAsDefs(OpBundles);
2158
2159 CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);
2160
2161 applyFlags(*V);
2162 applyMetadata(*V);
2163
2164 return V;
2165}
2166
// Emit the call through createVectorCall and record it as this recipe's value
// in State. A call returning void has no result, so nothing is recorded.
2168 CallInst *V = createVectorCall(State);
2169 if (!V->getType()->isVoidTy())
2170 State.set(this, V);
2171}
2172
2175 const VPRecipeWithIRFlags &R, ElementCount VF, VPCostContext &Ctx) {
2176 Type *ScalarRetTy = R.getScalarType();
2177 // Skip the reverse operation cost for the mask.
2178 // FIXME: Remove this once redundant mask reverse operations can be eliminated
2179 // by VPlanTransforms::cse before cost computation.
2180 if (ID == Intrinsic::experimental_vp_reverse && ScalarRetTy->isIntegerTy(1))
2181 return InstructionCost(0);
2182
2183 // Some backends analyze intrinsic arguments to determine cost. Use the
2184 // underlying value for the operand if it has one. Otherwise try to use the
2185 // operand of the underlying call instruction, if there is one. Otherwise
2186 // clear Arguments.
2187 // TODO: Rework TTI interface to be independent of concrete IR values.
2189 for (const auto &[Idx, Op] : enumerate(Operands)) {
2190 auto *V = Op->getUnderlyingValue();
2191 if (!V) {
2192 if (auto *UI = dyn_cast_or_null<CallBase>(R.getUnderlyingValue())) {
2193 Arguments.push_back(UI->getArgOperand(Idx));
2194 continue;
2195 }
2196 Arguments.clear();
2197 break;
2198 }
2199 Arguments.push_back(V);
2200 }
2201
2202 Type *RetTy = VF.isVector() ? toVectorizedTy(ScalarRetTy, VF) : ScalarRetTy;
2203 SmallVector<Type *> ParamTys =
2204 map_to_vector(Operands, [&](const VPValue *Op) {
2205 return toVectorTy(Op->getScalarType(), VF);
2206 });
2207
2208 // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
2209 IntrinsicCostAttributes CostAttrs(
2210 ID, RetTy, Arguments, ParamTys, R.getFastMathFlags(),
2211 dyn_cast_or_null<IntrinsicInst>(R.getUnderlyingValue()),
2213 return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, Ctx.CostKind);
2214}
2215
2217 VPCostContext &Ctx) const {
2219 return computeCallCost(VectorIntrinsicID, ArgOps, *this, VF, Ctx);
2220}
2221
2223 return Intrinsic::getBaseName(VectorIntrinsicID);
2224}
2225
2227 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
2228 return all_of(enumerate(operands()), [this, &Op](const auto &X) {
2229 auto [Idx, V] = X;
2231 Idx, nullptr);
2232 });
2233}
2234
2235#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2237 VPSlotTracker &SlotTracker) const {
2238 O << Indent << "WIDEN-INTRINSIC ";
2239 if (getScalarType()->isVoidTy()) {
2240 O << "void ";
2241 } else {
2243 O << " = ";
2244 }
2245
2246 O << "call";
2247 printFlags(O);
2248 O << getIntrinsicName() << "(";
2249
2251 Op->printAsOperand(O, SlotTracker);
2252 });
2253 O << ")";
2254}
2255#endif
2256
2258 CallInst *MemI = createVectorCall(State);
2259 MemI->addParamAttr(
2260 0, Attribute::getWithAlignment(MemI->getContext(), Alignment));
2261 State.set(this, MemI);
2262}
2263
2265 Intrinsic::ID IID, Type *Ty, bool IsMasked, Align Alignment,
2266 VPCostContext &Ctx) {
2267 return Ctx.TTI.getMemIntrinsicInstrCost(
2268 MemIntrinsicCostAttributes(IID, Ty, /*Ptr=*/nullptr, IsMasked, Alignment),
2269 Ctx.CostKind);
2270}
2271
2274 VPCostContext &Ctx) const {
2275 Type *Ty = toVectorTy(getScalarType(), VF);
2277 !match(getOperand(2), m_True()), Alignment,
2278 Ctx);
2279}
2280
2282 IRBuilderBase &Builder = State.Builder;
2283
2284 Value *Address = State.get(getOperand(0));
2285 Value *IncAmt = State.get(getOperand(1), /*IsScalar=*/true);
2286 VectorType *VTy = cast<VectorType>(Address->getType());
2287
2288 // The histogram intrinsic requires a mask even if the recipe doesn't;
2289 // if the mask operand was omitted then all lanes should be executed and
2290 // we just need to synthesize an all-true mask.
2291 Value *Mask = nullptr;
2292 if (VPValue *VPMask = getMask())
2293 Mask = State.get(VPMask);
2294 else
2295 Mask =
2296 Builder.CreateVectorSplat(VTy->getElementCount(), Builder.getInt1(1));
2297
2298 // If this is a subtract, we want to invert the increment amount. We may
2299 // add a separate intrinsic in future, but for now we'll try this.
2300 if (Opcode == Instruction::Sub)
2301 IncAmt = Builder.CreateNeg(IncAmt);
2302 else
2303 assert(Opcode == Instruction::Add && "only add or sub supported for now");
2304
2305 auto *HistogramInst = State.Builder.CreateIntrinsic(
2306 Intrinsic::experimental_vector_histogram_add, {VTy, IncAmt->getType()},
2307 {Address, IncAmt, Mask});
2308 applyMetadata(*HistogramInst);
2309}
2310
2312 VPCostContext &Ctx) const {
2313 // FIXME: Take the gather and scatter into account as well. For now we're
2314 // generating the same cost as the fallback path, but we'll likely
2315 // need to create a new TTI method for determining the cost, including
2316 // whether we can use base + vec-of-smaller-indices or just
2317 // vec-of-pointers.
2318 assert(VF.isVector() && "Invalid VF for histogram cost");
2319 Type *AddressTy = getOperand(0)->getScalarType();
2320 VPValue *IncAmt = getOperand(1);
2321 Type *IncTy = IncAmt->getScalarType();
2322 VectorType *VTy = VectorType::get(IncTy, VF);
2323
2324 // Assume that a non-constant update value (or a constant != 1) requires
2325 // a multiply, and add that into the cost.
2326 InstructionCost MulCost =
2327 Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VTy, Ctx.CostKind);
2328 if (match(IncAmt, m_One()))
2329 MulCost = TTI::TCC_Free;
2330
2331 // Find the cost of the histogram operation itself.
2332 Type *PtrTy = VectorType::get(AddressTy, VF);
2333 Type *MaskTy = VectorType::get(Type::getInt1Ty(Ctx.LLVMCtx), VF);
2334 IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
2335 Type::getVoidTy(Ctx.LLVMCtx),
2336 {PtrTy, IncTy, MaskTy});
2337
2338 // Add the costs together with the add/sub operation.
2339 return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind) + MulCost +
2340 Ctx.TTI.getArithmeticInstrCost(Opcode, VTy, Ctx.CostKind);
2341}
2342
2343#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2345 VPSlotTracker &SlotTracker) const {
2346 O << Indent << "WIDEN-HISTOGRAM buckets: ";
2348
2349 if (Opcode == Instruction::Sub)
2350 O << ", dec: ";
2351 else {
2352 assert(Opcode == Instruction::Add);
2353 O << ", inc: ";
2354 }
2356
2357 if (VPValue *Mask = getMask()) {
2358 O << ", mask: ";
2359 Mask->printAsOperand(O, SlotTracker);
2360 }
2361}
2362#endif
2363
2364VPIRFlags::FastMathFlagsTy::FastMathFlagsTy(const FastMathFlags &FMF) {
2365 AllowReassoc = FMF.allowReassoc();
2366 NoNaNs = FMF.noNaNs();
2367 NoInfs = FMF.noInfs();
2368 NoSignedZeros = FMF.noSignedZeros();
2369 AllowReciprocal = FMF.allowReciprocal();
2370 AllowContract = FMF.allowContract();
2371 ApproxFunc = FMF.approxFunc();
2372}
2373
2375 switch (Opcode) {
2376 case Instruction::Add:
2377 case Instruction::Sub:
2378 case Instruction::Mul:
2379 case Instruction::Shl:
2381 return WrapFlagsTy(false, false);
2382 case Instruction::Trunc:
2383 return TruncFlagsTy(false, false);
2384 case Instruction::Or:
2385 return DisjointFlagsTy(false);
2386 case Instruction::AShr:
2387 case Instruction::LShr:
2388 case Instruction::UDiv:
2389 case Instruction::SDiv:
2390 return ExactFlagsTy(false);
2391 case Instruction::GetElementPtr:
2394 return GEPNoWrapFlags::none();
2395 case Instruction::ZExt:
2396 case Instruction::UIToFP:
2397 return NonNegFlagsTy(false);
2398 case Instruction::FAdd:
2399 case Instruction::FSub:
2400 case Instruction::FMul:
2401 case Instruction::FDiv:
2402 case Instruction::FRem:
2403 case Instruction::FNeg:
2404 case Instruction::FPExt:
2405 case Instruction::FPTrunc:
2406 return FastMathFlags();
2407 case Instruction::ICmp:
2408 case Instruction::FCmp:
2410 llvm_unreachable("opcode requires explicit flags");
2411 default:
2412 return VPIRFlags();
2413 }
2414}
2415
2416#if !defined(NDEBUG)
/// Return true if \p Opcode is one of the opcodes accepted for the flag
/// category currently stored in OpType. Each case of the switch below lists
/// the opcodes accepted for that category; OperationType::Other accepts every
/// opcode. Falling out of the switch is treated as unreachable.
///
/// NOTE(review): in this listing the FPMathOp disjunction ends with a
/// trailing `||` and the ReductionOp case has no visible statement of its
/// own; confirm both against the original file before relying on them.
2417bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const {
2418 switch (OpType) {
// Wrap flags (see the nuw/nsw printing for this category elsewhere in the
// file) are accepted on add/sub/mul/shl and the canonical IV increment.
2419 case OperationType::OverflowingBinOp:
2420 return Opcode == Instruction::Add || Opcode == Instruction::Sub ||
2421 Opcode == Instruction::Mul || Opcode == Instruction::Shl ||
2422 Opcode == VPInstruction::VPInstruction::CanonicalIVIncrementForPart;
2423 case OperationType::Trunc:
2424 return Opcode == Instruction::Trunc;
2425 case OperationType::DisjointOp:
2426 return Opcode == Instruction::Or;
// The exact flag is accepted on right shifts and integer divisions.
2427 case OperationType::PossiblyExactOp:
2428 return Opcode == Instruction::AShr || Opcode == Instruction::LShr ||
2429 Opcode == Instruction::UDiv || Opcode == Instruction::SDiv;
// GEP flags are accepted on IR GEPs and on the two VPlan pointer-add opcodes.
2430 case OperationType::GEPOp:
2431 return Opcode == Instruction::GetElementPtr ||
2432 Opcode == VPInstruction::PtrAdd ||
2433 Opcode == VPInstruction::WidePtrAdd;
// Floating-point math category: FP arithmetic, FP casts, calls, phis and
// selects are listed here.
2434 case OperationType::FPMathOp:
2435 return Opcode == Instruction::Call || Opcode == Instruction::FAdd ||
2436 Opcode == Instruction::FMul || Opcode == Instruction::FSub ||
2437 Opcode == Instruction::FNeg || Opcode == Instruction::FDiv ||
2438 Opcode == Instruction::FRem || Opcode == Instruction::FPExt ||
2439 Opcode == Instruction::FPTrunc || Opcode == Instruction::PHI ||
2440 Opcode == Instruction::Select || Opcode == Instruction::SIToFP ||
2441 Opcode == Instruction::UIToFP ||
2442 Opcode == VPInstruction::WideIVStep ||
2444 case OperationType::FCmp:
2445 return Opcode == Instruction::FCmp;
2446 case OperationType::NonNegOp:
2447 return Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP;
// The generic compare category accepts both compare opcodes.
2448 case OperationType::Cmp:
2449 return Opcode == Instruction::FCmp || Opcode == Instruction::ICmp;
2450 case OperationType::ReductionOp:
2452 case OperationType::Other:
2453 return true;
2454 }
2455 llvm_unreachable("Unknown OperationType enum");
2456}
2457
/// Return true if the flag category stored in OpType is the one \p Opcode
/// requires. Compare opcodes are checked explicitly against the Cmp / FCmp
/// categories; for the remaining opcodes the category of
/// getDefaultFlags(Opcode) is used, and a default category of Other means no
/// particular category is required.
///
/// NOTE(review): the condition guarding the ReductionOp return below is not
/// visible in this listing; confirm it against the original file.
2458bool VPIRFlags::hasRequiredFlagsForOpcode(unsigned Opcode) const {
2459 // Handle opcodes without default flags.
2460 if (Opcode == Instruction::ICmp)
2461 return OpType == OperationType::Cmp;
2462 if (Opcode == Instruction::FCmp)
2463 return OpType == OperationType::FCmp;
2465 return OpType == OperationType::ReductionOp;
2466
// Every other opcode: compare against the category of its default flags.
2467 OperationType Required = getDefaultFlags(Opcode).OpType;
2468 return Required == OperationType::Other || Required == OpType;
2469}
2470#endif
2471
2472#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Write a short lowercase textual name for the recurrence kind \p Kind to
/// \p OS (for example "add", "smin", "fmuladd", "any-of"). Nothing else is
/// written: no separators and no trailing newline.
///
/// NOTE(review): several output statements below ("add-chain-with-subs",
/// "fadd-chain-with-subs", "fminimum", "fmaximum", "fminimumnum",
/// "fmaximumnum", "find-last") are not preceded by a visible case label in
/// this listing; confirm the labels against the original file.
2473static void printRecurrenceKind(raw_ostream &OS, const RecurKind &Kind) {
2474 switch (Kind) {
2475 case RecurKind::None:
2476 OS << "none";
2477 break;
// Integer arithmetic and bitwise kinds.
2478 case RecurKind::Add:
2479 OS << "add";
2480 break;
2481 case RecurKind::Sub:
2482 OS << "sub";
2483 break;
2485 OS << "add-chain-with-subs";
2486 break;
2487 case RecurKind::Mul:
2488 OS << "mul";
2489 break;
2490 case RecurKind::Or:
2491 OS << "or";
2492 break;
2493 case RecurKind::And:
2494 OS << "and";
2495 break;
2496 case RecurKind::Xor:
2497 OS << "xor";
2498 break;
// Integer min/max kinds.
2499 case RecurKind::SMin:
2500 OS << "smin";
2501 break;
2502 case RecurKind::SMax:
2503 OS << "smax";
2504 break;
2505 case RecurKind::UMin:
2506 OS << "umin";
2507 break;
2508 case RecurKind::UMax:
2509 OS << "umax";
2510 break;
// Floating-point arithmetic kinds.
2511 case RecurKind::FAdd:
2512 OS << "fadd";
2513 break;
2515 OS << "fadd-chain-with-subs";
2516 break;
2517 case RecurKind::FSub:
2518 OS << "fsub";
2519 break;
2520 case RecurKind::FMul:
2521 OS << "fmul";
2522 break;
// Floating-point min/max kinds.
2523 case RecurKind::FMin:
2524 OS << "fmin";
2525 break;
2526 case RecurKind::FMax:
2527 OS << "fmax";
2528 break;
2529 case RecurKind::FMinNum:
2530 OS << "fminnum";
2531 break;
2532 case RecurKind::FMaxNum:
2533 OS << "fmaxnum";
2534 break;
2536 OS << "fminimum";
2537 break;
2539 OS << "fmaximum";
2540 break;
2542 OS << "fminimumnum";
2543 break;
2545 OS << "fmaximumnum";
2546 break;
2547 case RecurKind::FMulAdd:
2548 OS << "fmuladd";
2549 break;
// Remaining kinds.
2550 case RecurKind::AnyOf:
2551 OS << "any-of";
2552 break;
2553 case RecurKind::FindIV:
2554 OS << "find-iv";
2555 break;
2557 OS << "find-last";
2558 break;
2559 }
2560}
2561
2563 switch (OpType) {
2564 case OperationType::Cmp:
2566 break;
2567 case OperationType::FCmp:
2570 break;
2571 case OperationType::DisjointOp:
2572 if (DisjointFlags.IsDisjoint)
2573 O << " disjoint";
2574 break;
2575 case OperationType::PossiblyExactOp:
2576 if (ExactFlags.IsExact)
2577 O << " exact";
2578 break;
2579 case OperationType::OverflowingBinOp:
2580 if (WrapFlags.HasNUW)
2581 O << " nuw";
2582 if (WrapFlags.HasNSW)
2583 O << " nsw";
2584 break;
2585 case OperationType::Trunc:
2586 if (TruncFlags.HasNUW)
2587 O << " nuw";
2588 if (TruncFlags.HasNSW)
2589 O << " nsw";
2590 break;
2591 case OperationType::FPMathOp:
2593 break;
2594 case OperationType::GEPOp: {
2596 if (Flags.isInBounds())
2597 O << " inbounds";
2598 else if (Flags.hasNoUnsignedSignedWrap())
2599 O << " nusw";
2600 if (Flags.hasNoUnsignedWrap())
2601 O << " nuw";
2602 break;
2603 }
2604 case OperationType::NonNegOp:
2605 if (NonNegFlags.NonNeg)
2606 O << " nneg";
2607 break;
2608 case OperationType::ReductionOp: {
2609 O << " (";
2611 if (isReductionInLoop())
2612 O << ", in-loop";
2613 if (isReductionOrdered())
2614 O << ", ordered";
2615 O << ")";
2617 break;
2618 }
2619 case OperationType::Other:
2620 break;
2621 }
2622 O << " ";
2623}
2624#endif
2625
2627 auto &Builder = State.Builder;
2628 switch (Opcode) {
2629 case Instruction::Call:
2630 case Instruction::UncondBr:
2631 case Instruction::CondBr:
2632 case Instruction::PHI:
2633 case Instruction::GetElementPtr:
2634 llvm_unreachable("This instruction is handled by a different recipe.");
2635 case Instruction::UDiv:
2636 case Instruction::SDiv:
2637 case Instruction::SRem:
2638 case Instruction::URem:
2639 case Instruction::Add:
2640 case Instruction::FAdd:
2641 case Instruction::Sub:
2642 case Instruction::FSub:
2643 case Instruction::FNeg:
2644 case Instruction::Mul:
2645 case Instruction::FMul:
2646 case Instruction::FDiv:
2647 case Instruction::FRem:
2648 case Instruction::Shl:
2649 case Instruction::LShr:
2650 case Instruction::AShr:
2651 case Instruction::And:
2652 case Instruction::Or:
2653 case Instruction::Xor: {
2654 // Just widen unops and binops.
2656 for (VPValue *VPOp : operands())
2657 Ops.push_back(State.get(VPOp));
2658
2659 Value *V = Builder.CreateNAryOp(Opcode, Ops);
2660
2661 if (auto *VecOp = dyn_cast<Instruction>(V)) {
2662 applyFlags(*VecOp);
2663 applyMetadata(*VecOp);
2664 }
2665
2666 // Use this vector value for all users of the original instruction.
2667 State.set(this, V);
2668 break;
2669 }
2670 case Instruction::ExtractValue: {
2671 assert(getNumOperands() == 2 && "expected single level extractvalue");
2672 Value *Op = State.get(getOperand(0));
2673 Value *Extract = Builder.CreateExtractValue(
2674 Op, cast<VPConstantInt>(getOperand(1))->getZExtValue());
2675 State.set(this, Extract);
2676 break;
2677 }
2678 case Instruction::Freeze: {
2679 Value *Op = State.get(getOperand(0));
2680 Value *Freeze = Builder.CreateFreeze(Op);
2681 State.set(this, Freeze);
2682 break;
2683 }
2684 case Instruction::ICmp:
2685 case Instruction::FCmp: {
2686 // Widen compares. Generate vector compares.
2687 bool FCmp = Opcode == Instruction::FCmp;
2688 Value *A = State.get(getOperand(0));
2689 Value *B = State.get(getOperand(1));
2690 Value *C = nullptr;
2691 if (FCmp) {
2692 C = Builder.CreateFCmp(getPredicate(), A, B);
2693 } else {
2694 C = Builder.CreateICmp(getPredicate(), A, B);
2695 }
2696 if (auto *I = dyn_cast<Instruction>(C)) {
2697 applyFlags(*I);
2698 applyMetadata(*I);
2699 }
2700 State.set(this, C);
2701 break;
2702 }
2703 case Instruction::Select: {
2704 VPValue *CondOp = getOperand(0);
2705 Value *Cond = State.get(CondOp, vputils::isSingleScalar(CondOp));
2706 Value *Op0 = State.get(getOperand(1));
2707 Value *Op1 = State.get(getOperand(2));
2708 Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
2709 State.set(this, Sel);
2710 if (auto *I = dyn_cast<Instruction>(Sel)) {
2712 applyFlags(*I);
2713 applyMetadata(*I);
2714 }
2715 break;
2716 }
2717 default:
2718 // This instruction is not vectorized by simple widening.
2719 LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
2720 << Instruction::getOpcodeName(Opcode));
2721 llvm_unreachable("Unhandled instruction!");
2722 } // end of switch.
2723
2724#if !defined(NDEBUG)
2725 // Verify that VPlan type inference results agree with the type of the
2726 // generated values.
2727 assert(VectorType::get(this->getScalarType(), State.VF) ==
2728 State.get(this)->getType() &&
2729 "inferred type and type from generated instructions do not match");
2730#endif
2731}
2732
2734 VPCostContext &Ctx) const {
2735 switch (Opcode) {
2736 case Instruction::UDiv:
2737 case Instruction::SDiv:
2738 case Instruction::SRem:
2739 case Instruction::URem:
2740 // If the div/rem operation isn't safe to speculate and requires
2741 // predication, then the only way we can even create a vplan is to insert
2742 // a select on the second input operand to ensure we use the value of 1
2743 // for the inactive lanes. The select will be costed separately.
2744 case Instruction::FNeg:
2745 case Instruction::Add:
2746 case Instruction::FAdd:
2747 case Instruction::Sub:
2748 case Instruction::FSub:
2749 case Instruction::Mul:
2750 case Instruction::FMul:
2751 case Instruction::FDiv:
2752 case Instruction::FRem:
2753 case Instruction::Shl:
2754 case Instruction::LShr:
2755 case Instruction::AShr:
2756 case Instruction::And:
2757 case Instruction::Or:
2758 case Instruction::Xor:
2759 case Instruction::Freeze:
2760 case Instruction::ExtractValue:
2761 case Instruction::ICmp:
2762 case Instruction::FCmp:
2763 case Instruction::Select:
2764 return getCostForRecipeWithOpcode(getOpcode(), VF, Ctx);
2765 default:
2766 llvm_unreachable("Unsupported opcode for instruction");
2767 }
2768}
2769
2770#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2772 VPSlotTracker &SlotTracker) const {
2773 O << Indent << "WIDEN ";
2775 O << " = " << Instruction::getOpcodeName(Opcode);
2776 printFlags(O);
2778}
2779#endif
2780
2782 auto &Builder = State.Builder;
2783 /// Vectorize casts.
2784 assert(State.VF.isVector() && "Not vectorizing?");
2785 Type *DestTy = VectorType::get(getScalarType(), State.VF);
2786 VPValue *Op = getOperand(0);
2787 Value *A = State.get(Op);
2788 Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
2789 State.set(this, Cast);
2790 if (auto *CastOp = dyn_cast<Instruction>(Cast)) {
2791 applyFlags(*CastOp);
2792 applyMetadata(*CastOp);
2793 }
2794}
2795
2800
2801#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2803 VPSlotTracker &SlotTracker) const {
2804 O << Indent << "WIDEN-CAST ";
2806 O << " = " << Instruction::getOpcodeName(Opcode);
2807 printFlags(O);
2809 O << " to " << *getScalarType();
2810}
2811#endif
2812
2814 VPCostContext &Ctx) const {
2815 return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
2816}
2817
2818#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2820 raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
2821 O << Indent;
2823 O << " = WIDEN-INDUCTION";
2824 printFlags(O);
2826
2827 if (auto *TI = getTruncInst())
2828 O << " (truncated to " << *TI->getType() << ")";
2829}
2830#endif
2831
2833 // The step may be defined by a recipe in the preheader (e.g. if it requires
2834 // SCEV expansion), but for the canonical induction the step is required to be
2835 // 1, which is represented as live-in.
2836 return match(getStartValue(), m_ZeroInt()) &&
2837 match(getStepValue(), m_One()) &&
2838 getScalarType() == getRegion()->getCanonicalIVType();
2839}
2840
2842 VPCostContext &Ctx) const {
2843 // The cost model for this is modelled on expandVPDerivedIV in
2844 // VPlanTransforms.cpp. In order to avoid overly pessimistic costs that can
2845 // negatively affect vectorization it takes into account any expected
2846 // simplifications that happen in simplifyRecipe.
2847 switch (getInductionKind()) {
2848 default:
2849 // TODO: Compute cost for remaining kinds.
2850 break;
2852 // There are currently no tests that expose a path where all lanes are
2853 // used, so it's better to bail out for now.
2854 if (!vputils::onlyFirstLaneUsed(this))
2855 break;
2856
2857 // Start off by assuming we need both mul and add, then refine this.
2858 bool NeedsMul = true, NeedsAdd = true, NeedsShl = false;
2859
2860 // If the start value is zero the add gets folded away.
2861 if (auto *VPV = dyn_cast<VPIRValue>(getStartValue()))
2862 if (auto *StartC = dyn_cast<ConstantInt>(VPV->getValue()))
2863 NeedsAdd = !StartC->isZero();
2864
2865 // For some values of step the arithmetic changes:
2866 // 1. A step of 1 requires no operation.
2867 // 2. A step of -1 requires a negate.
2868 // 3. A power-of-2 step will use a shl, instead of a mul.
2869 Type *StepTy = getStepValue()->getScalarType();
2871 if (auto *VPV = dyn_cast<VPIRValue>(getStepValue())) {
2872 if (auto *StepC = dyn_cast<ConstantInt>(VPV->getValue())) {
2873 if (StepC->isOne())
2874 NeedsMul = false;
2875 else if (StepC->isMinusOne()) {
2876 // This will most likely end up as a negate in simplifyRecipe, and
2877 // the negate will be combined with the add to make a sub.
2878 // NOTE: This is perhaps an invalid assumption that the cost of an
2879 // 'add' is the same as a 'sub'.
2880 NeedsMul = false;
2881 NeedsAdd = true;
2882 } else if (StepC->getValue().isPowerOf2()) {
2883 // This will most likely end up as a shift-left in simplifyRecipe
2884 NeedsMul = false;
2885 NeedsShl = true;
2886 }
2887 }
2888 }
2889
2890 // Add the cost of the conversion from index to step type if the index
2891 // will be used.
2892 Type *IndexTy = getIndex()->getScalarType();
2893 unsigned StepTySize = StepTy->getScalarSizeInBits();
2894 unsigned IndexTySize = IndexTy->getScalarSizeInBits();
2895 if ((NeedsAdd || NeedsMul || NeedsShl) && StepTySize != IndexTySize) {
2896 unsigned CastOpc =
2897 StepTySize < IndexTySize ? Instruction::Trunc : Instruction::SExt;
2898 Cost += Ctx.TTI.getCastInstrCost(
2899 CastOpc, StepTy, IndexTy, TTI::CastContextHint::None, Ctx.CostKind);
2900 }
2901
2902 if (NeedsMul)
2903 Cost += Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, StepTy,
2904 Ctx.CostKind);
2905 if (NeedsShl)
2906 Cost += Ctx.TTI.getArithmeticInstrCost(
2907 Instruction::Shl, StepTy, Ctx.CostKind,
2908 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
2909 {TargetTransformInfo::OK_UniformConstantValue,
2910 TargetTransformInfo::OP_None});
2911 if (NeedsAdd)
2912 Cost += Ctx.TTI.getArithmeticInstrCost(Instruction::Add, StepTy,
2913 Ctx.CostKind);
2914 return Cost;
2915 }
2916 }
2917
2918 return 0;
2919}
2920
2921#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2923 VPSlotTracker &SlotTracker) const {
2924 O << Indent;
2926 O << " = DERIVED-IV ";
2927 getStartValue()->printAsOperand(O, SlotTracker);
2928 O << " + ";
2929 getOperand(1)->printAsOperand(O, SlotTracker);
2930 O << " * ";
2931 getStepValue()->printAsOperand(O, SlotTracker);
2932}
2933#endif
2934
2936 // Fast-math-flags propagate from the original induction instruction.
2937 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
2938 State.Builder.setFastMathFlags(getFastMathFlags());
2939
2940 /// Compute scalar induction steps. \p ScalarIV is the scalar induction
2941 /// variable on which to base the steps, \p Step is the size of the step.
2942
2943 Value *BaseIV = State.get(getOperand(0), VPLane(0));
2944 Value *Step = State.get(getStepValue(), VPLane(0));
2945 IRBuilderBase &Builder = State.Builder;
2946
2947 // Ensure step has the same type as that of scalar IV.
2948 Type *BaseIVTy = BaseIV->getType()->getScalarType();
2949 assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");
2950
2951 // We build scalar steps for both integer and floating-point induction
2952 // variables. Here, we determine the kind of arithmetic we will perform.
2955 if (BaseIVTy->isIntegerTy()) {
2956 AddOp = Instruction::Add;
2957 MulOp = Instruction::Mul;
2958 } else {
2959 AddOp = InductionOpcode;
2960 MulOp = Instruction::FMul;
2961 }
2962
2963 // Determine the number of scalars we need to generate.
2964 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
2965 // Compute the scalar steps and save the results in State.
2966
2967 unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
2968 Value *StartIdx0 = getStartIndex() ? State.get(getStartIndex(), true)
2969 : Constant::getNullValue(BaseIVTy);
2970
2971 for (unsigned Lane = 0; Lane < EndLane; ++Lane) {
2972 // It is okay if the induction variable type cannot hold the lane number,
2973 // we expect truncation in this case.
2974 Constant *LaneValue =
2975 BaseIVTy->isIntegerTy()
2976 ? ConstantInt::get(BaseIVTy, Lane, /*IsSigned=*/false,
2977 /*ImplicitTrunc=*/true)
2978 : ConstantFP::get(BaseIVTy, Lane);
2979 Value *StartIdx = Builder.CreateBinOp(AddOp, StartIdx0, LaneValue);
2980 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
2981 "Expected StartIdx to be folded to a constant when VF is not "
2982 "scalable");
2983 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2984 auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
2985 State.set(this, Add, VPLane(Lane));
2986 }
2987}
2988
2989#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2991 VPSlotTracker &SlotTracker) const {
2992 O << Indent;
2994 O << " = SCALAR-STEPS ";
2996}
2997#endif
2998
3000 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
3002}
3003
3005 assert(State.VF.isVector() && "not widening");
3006 // Construct a vector GEP by widening the operands of the scalar GEP as
3007 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
3008 // results in a vector of pointers when at least one operand of the GEP
3009 // is vector-typed. Thus, to keep the representation compact, we only use
3010 // vector-typed operands for loop-varying values.
3011
3012 bool AllOperandsAreInvariant = all_of(operands(), [](VPValue *Op) {
3013 return Op->isDefinedOutsideLoopRegions();
3014 });
3015 if (AllOperandsAreInvariant) {
3016 // If we are vectorizing, but the GEP has only loop-invariant operands,
3017 // the GEP we build (by only using vector-typed operands for
3018 // loop-varying values) would be a scalar pointer. Thus, to ensure we
3019 // produce a vector of pointers, we need to either arbitrarily pick an
3020 // operand to broadcast, or broadcast a clone of the original GEP.
3021 // Here, we broadcast a clone of the original.
3022
3024 for (unsigned I = 0, E = getNumOperands(); I != E; I++)
3025 Ops.push_back(State.get(getOperand(I), VPLane(0)));
3026
3027 auto *NewGEP =
3028 State.Builder.CreateGEP(getSourceElementType(), Ops[0], drop_begin(Ops),
3029 "", getGEPNoWrapFlags());
3030 Value *Splat = State.Builder.CreateVectorSplat(State.VF, NewGEP);
3031 State.set(this, Splat);
3032 return;
3033 }
3034
3035 // If the GEP has at least one loop-varying operand, we are sure to
3036 // produce a vector of pointers unless VF is scalar.
3037 // The pointer operand of the new GEP. If it's loop-invariant, we
3038 // won't broadcast it.
3039 auto *Ptr = State.get(getOperand(0), isPointerLoopInvariant());
3040
3041 // Collect all the indices for the new GEP. If any index is
3042 // loop-invariant, we won't broadcast it.
3044 for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
3045 VPValue *Operand = getOperand(I);
3046 Indices.push_back(State.get(Operand, isIndexLoopInvariant(I - 1)));
3047 }
3048
3049 // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
3050 // but it should be a vector, otherwise.
3051 auto *NewGEP = State.Builder.CreateGEP(getSourceElementType(), Ptr, Indices,
3052 "", getGEPNoWrapFlags());
3053 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
3054 "NewGEP is not a pointer vector");
3055 State.set(this, NewGEP);
3056}
3057
3058#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3060 VPSlotTracker &SlotTracker) const {
3061 O << Indent << "WIDEN-GEP ";
3062 O << (isPointerLoopInvariant() ? "Inv" : "Var");
3063 for (size_t I = 0; I < getNumOperands() - 1; ++I)
3064 O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";
3065
3066 O << " ";
3068 O << " = getelementptr";
3069 printFlags(O);
3071}
3072#endif
3073
3075 assert(!getOffset() && "Unexpected offset operand");
3076 VPBuilder Builder(this);
3077 VPlan &Plan = *getParent()->getPlan();
3078 VPValue *VFVal = getVFValue();
3079 const DataLayout &DL = Plan.getDataLayout();
3080 Type *IndexTy = DL.getIndexType(this->getScalarType());
3081 VPValue *Stride =
3082 Plan.getConstantInt(IndexTy, getStride(), /*IsSigned=*/true);
3083 Type *VFTy = VFVal->getScalarType();
3084 VPValue *VF = Builder.createScalarZExtOrTrunc(VFVal, IndexTy, VFTy,
3086
3087 // Offset for Part0 = Offset0 = Stride * (VF - 1).
3088 VPInstruction *VFMinusOne =
3089 Builder.createSub(VF, Plan.getConstantInt(IndexTy, 1u),
3090 DebugLoc::getUnknown(), "", {true, true});
3091 VPInstruction *Offset0 =
3092 Builder.createOverflowingOp(Instruction::Mul, {VFMinusOne, Stride});
3093
3094 // Offset for PartN = Offset0 + Part * Stride * VF.
3095 VPValue *PartxStride =
3096 Plan.getConstantInt(IndexTy, Part * getStride(), /*IsSigned=*/true);
3097 VPValue *Offset = Builder.createAdd(
3098 Offset0,
3099 Builder.createOverflowingOp(Instruction::Mul, {PartxStride, VF}));
3101}
3102
3104 auto &Builder = State.Builder;
3105 assert(getOffset() && "Expected prior materialization of offset");
3106 Value *Ptr = State.get(getPointer(), true);
3107 Value *Offset = State.get(getOffset(), true);
3108 Value *ResultPtr = Builder.CreateGEP(getSourceElementType(), Ptr, Offset, "",
3110 State.set(this, ResultPtr, /*IsScalar*/ true);
3111}
3112
3113#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3115 VPSlotTracker &SlotTracker) const {
3116 O << Indent;
3118 O << " = vector-end-pointer";
3119 printFlags(O);
3121}
3122#endif
3123
3125 assert(getVFxPart() &&
3126 "Expected prior simplification of recipe without VFxPart");
3127
3128 auto &Builder = State.Builder;
3129 Value *Ptr = State.get(getOperand(0), VPLane(0));
3130 Value *Offset = State.get(getVFxPart(), true);
3131 // TODO: Expand to VPInstruction to support constant folding.
3132 if (!match(getStride(), m_One())) {
3133 Value *Stride = Builder.CreateZExtOrTrunc(State.get(getStride(), true),
3134 Offset->getType());
3135 Offset = Builder.CreateMul(Offset, Stride);
3136 }
3137 Value *ResultPtr = Builder.CreateGEP(getSourceElementType(), Ptr, Offset, "",
3139 State.set(this, ResultPtr, /*IsScalar*/ true);
3140}
3141
3142#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3144 VPSlotTracker &SlotTracker) const {
3145 O << Indent;
3147 O << " = vector-pointer";
3148 printFlags(O);
3150}
3151#endif
3152
3154 VPCostContext &Ctx) const {
3155 // A blend will be expanded to a select VPInstruction, which will generate a
3156 // scalar select if only the first lane is used.
3158 VF = ElementCount::getFixed(1);
3159
3160 Type *ResultTy = toVectorTy(this->getScalarType(), VF);
3161 Type *CmpTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF);
3162 return (getNumIncomingValues() - 1) *
3163 Ctx.TTI.getCmpSelInstrCost(Instruction::Select, ResultTy, CmpTy,
3164 CmpInst::BAD_ICMP_PREDICATE, Ctx.CostKind);
3165}
3166
3167#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3169 VPSlotTracker &SlotTracker) const {
3170 O << Indent << "BLEND ";
3172 O << " =";
3173 printFlags(O);
3174 if (getNumIncomingValues() == 1) {
3175 // Not a User of any mask: not really blending, this is a
3176 // single-predecessor phi.
3177 getIncomingValue(0)->printAsOperand(O, SlotTracker);
3178 } else {
3179 for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
3180 if (I != 0)
3181 O << " ";
3182 getIncomingValue(I)->printAsOperand(O, SlotTracker);
3183 if (I == 0 && isNormalized())
3184 continue;
3185 O << "/";
3186 getMask(I)->printAsOperand(O, SlotTracker);
3187 }
3188 }
3189}
3190#endif
3191
3195 "In-loop AnyOf reductions aren't currently supported");
3196 // Propagate the fast-math flags carried by the underlying instruction.
3197 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
3198 State.Builder.setFastMathFlags(getFastMathFlags());
3199 Value *NewVecOp = State.get(getVecOp());
3200 if (VPValue *Cond = getCondOp()) {
3201 Value *NewCond = State.get(Cond, State.VF.isScalar());
3202 VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
3203 Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
3204
3205 Value *Start = getRecurrenceIdentity(Kind, ElementTy, getFastMathFlags());
3206 if (State.VF.isVector())
3207 Start = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Start);
3208
3209 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Start);
3210 NewVecOp = Select;
3211 }
3212 Value *NewRed;
3213 Value *NextInChain;
3214 if (isOrdered()) {
3215 Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
3216 if (State.VF.isVector())
3217 NewRed =
3218 createOrderedReduction(State.Builder, Kind, NewVecOp, PrevInChain);
3219 else
3220 NewRed = State.Builder.CreateBinOp(
3222 PrevInChain, NewVecOp);
3223 PrevInChain = NewRed;
3224 NextInChain = NewRed;
3225 } else if (isPartialReduction()) {
3226 assert((Kind == RecurKind::Add || Kind == RecurKind::FAdd) &&
3227 "Unexpected partial reduction kind");
3228 Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ false);
3229 NewRed = State.Builder.CreateIntrinsic(
3230 PrevInChain->getType(),
3231 Kind == RecurKind::Add ? Intrinsic::vector_partial_reduce_add
3232 : Intrinsic::vector_partial_reduce_fadd,
3233 {PrevInChain, NewVecOp}, State.Builder.getFastMathFlags(),
3234 "partial.reduce");
3235 PrevInChain = NewRed;
3236 NextInChain = NewRed;
3237 } else {
3238 assert(isInLoop() &&
3239 "The reduction must either be ordered, partial or in-loop");
3240 Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
3241 NewRed = createSimpleReduction(State.Builder, NewVecOp, Kind);
3243 NextInChain = createMinMaxOp(State.Builder, Kind, NewRed, PrevInChain);
3244 else
3245 NextInChain = State.Builder.CreateBinOp(
3247 PrevInChain, NewRed);
3248 }
3249 State.set(this, NextInChain, /*IsScalar*/ !isPartialReduction());
3250}
3251
3253
// Emits the IR for a reduction that is predicated by a mask and an explicit
// vector length (EVL), and records the result as a scalar value for this
// recipe.
// NOTE(review): the signature line (listing line 3252) is not present in this
// extract; presumably VPReductionEVLRecipe::execute(VPTransformState &State)
// - confirm.
3254 auto &Builder = State.Builder;
3255 // Propagate the fast-math flags carried by the underlying instruction.
// The guard restores the builder's previous fast-math flags on scope exit.
3256 IRBuilderBase::FastMathFlagGuard FMFGuard(Builder);
3257 Builder.setFastMathFlags(getFastMathFlags());
3258
// NOTE(review): `Kind` is used below, but its declaration (listing line 3259)
// is absent from this extract.
// The chain value is fetched as a scalar, the reduced operand as a vector and
// the EVL from lane 0.
3260 Value *Prev = State.get(getChainOp(), /*IsScalar*/ true);
3261 Value *VecOp = State.get(getVecOp());
3262 Value *EVL = State.get(getEVL(), VPLane(0));
3263
// Use the recipe's condition as the mask when present; otherwise build an
// all-true splat mask.
3264 Value *Mask;
3265 if (VPValue *CondOp = getCondOp())
3266 Mask = State.get(CondOp);
3267 else
3268 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
3269
3270 Value *NewRed;
3271 if (isOrdered()) {
// Ordered reductions pass the previous chain value into the reduction helper.
3272 NewRed = createOrderedReduction(Builder, Kind, VecOp, Prev, Mask, EVL);
3273 } else {
// Otherwise reduce the vector first and then combine with the previous chain
// value, using either a min/max op or a binary op.
// NOTE(review): the `if` condition (listing line 3275) and the opcode
// argument of CreateBinOp (listing line 3279) are absent from this extract.
3274 NewRed = createSimpleReduction(Builder, VecOp, Kind, Mask, EVL);
3276 NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev);
3277 else
3278 NewRed = Builder.CreateBinOp(
3280 Prev);
3281 }
3282 State.set(this, NewRed, /*IsScalar*/ true);
3283}
3284
3286 VPCostContext &Ctx) const {
// Returns the TTI-based cost of this reduction for vectorization factor VF.
// Partial reductions are costed with getPartialReductionCost (plus a select
// cost when the reduction is conditional); the remaining visible paths use
// getMinMaxReductionCost or getArithmeticReductionCost.
// NOTE(review): the first signature line (listing line 3285) is not present in
// this extract; presumably VPReductionRecipe::computeCost(ElementCount VF, ...)
// - confirm.
3287 RecurKind RdxKind = getRecurrenceKind();
3288 Type *ElementTy = this->getScalarType();
3289 auto *VectorTy = cast<VectorType>(toVectorTy(ElementTy, VF));
3290 unsigned Opcode = RecurrenceDescriptor::getOpcode(RdxKind);
// NOTE(review): `FMFs` is used below, but its declaration (listing line 3291)
// is absent from this extract.
// Fast-math flags are only forwarded to TTI for floating-point element types.
3292 std::optional<FastMathFlags> OptionalFMF =
3293 ElementTy->isFloatingPointTy() ? std::make_optional(FMFs) : std::nullopt;
3294
3295 if (isPartialReduction()) {
3296 InstructionCost CondCost = 0;
3297 if (isConditional()) {
// A conditional partial reduction additionally pays for a vector select.
// NOTE(review): the definitions of `Pred` and the initializer of `CondTy`
// (listing lines 3298 and 3300) are absent from this extract.
3299 auto *CondTy =
3301 CondCost = Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VectorTy,
3302 CondTy, Pred, Ctx.CostKind);
3303 }
3304 return CondCost + Ctx.TTI.getPartialReductionCost(
3305 Opcode, ElementTy, ElementTy, ElementTy, VF,
3306 TTI::PR_None, TTI::PR_None, {}, Ctx.CostKind,
3307 OptionalFMF);
3308 }
3309
3310 // TODO: Support any-of reductions.
// NOTE(review): the first operand of the asserted condition (listing line
// 3312) is absent from this extract.
3311 assert(
3313 ForceTargetInstructionCost.getNumOccurrences() > 0) &&
3314 "Any-of reduction not implemented in VPlan-based cost model currently.");
3315
3316 // Note that TTI should model the cost of moving result to the scalar register
3317 // and the BinOp cost in the getMinMaxReductionCost().
// NOTE(review): the guarding `if` and the definition of `Id` (listing lines
// 3318-3319) are absent from this extract.
3320 return Ctx.TTI.getMinMaxReductionCost(Id, VectorTy, FMFs, Ctx.CostKind);
3321 }
3322
3323 // Note that TTI should model the cost of moving result to the scalar register
3324 // and the BinOp cost in the getArithmeticReductionCost().
3325 return Ctx.TTI.getArithmeticReductionCost(Opcode, VectorTy, OptionalFMF,
3326 Ctx.CostKind);
3327}
3328
// Builds an expression recipe that bundles \p ExpressionRecipes into a single
// recipe of kind \p ExpressionType. The constructor:
//  1. redirects users outside the bundle to clones of the bundled recipes,
//  2. removes the bundled recipes from their parent blocks,
//  3. adds every operand defined outside the bundle as an operand of this
//     recipe, and
//  4. rewires the bundled recipes to use placeholder values instead.
// The scalar type passed to the base class is taken from the chain operand of
// the last bundled recipe, which is cast to a VPReductionRecipe.
// NOTE(review): ExpressionRecipes.back() is evaluated in the initializer list,
// i.e. before the non-empty assertion in the body runs.
3329VPExpressionRecipe::VPExpressionRecipe(
3330 ExpressionTypes ExpressionType,
3331 ArrayRef<VPSingleDefRecipe *> ExpressionRecipes)
3332 : VPSingleDefRecipe(VPRecipeBase::VPExpressionSC, {},
3333 cast<VPReductionRecipe>(ExpressionRecipes.back())
3334 ->getChainOp()
3335 ->getScalarType()),
3336 ExpressionRecipes(ExpressionRecipes), ExpressionType(ExpressionType) {
3337 assert(!ExpressionRecipes.empty() && "Nothing to combine?");
3338 assert(
3339 none_of(ExpressionRecipes,
3340 [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
3341 "expression cannot contain recipes with side-effects");
3342
3343 // Maintain a copy of the expression recipes as a set of users.
3344 SmallPtrSet<VPUser *, 4> ExpressionRecipesAsSetOfUsers;
3345 for (auto *R : ExpressionRecipes)
3346 ExpressionRecipesAsSetOfUsers.insert(R);
3347
3348 // Recipes in the expression, except the last one, must only be used by
3349 // (other) recipes inside the expression. If there are other users, external
3350 // to the expression, use a clone of the recipe for external users.
3351 for (VPSingleDefRecipe *R : reverse(ExpressionRecipes)) {
3352 if (R != ExpressionRecipes.back() &&
3353 any_of(R->users(), [&ExpressionRecipesAsSetOfUsers](VPUser *U) {
3354 return !ExpressionRecipesAsSetOfUsers.contains(U);
3355 })) {
3356 // There are users outside of the expression. Clone the recipe and use the
3357 // clone for those external users.
3358 VPSingleDefRecipe *CopyForExtUsers = R->clone();
3359 R->replaceUsesWithIf(CopyForExtUsers, [&ExpressionRecipesAsSetOfUsers](
3360 VPUser &U, unsigned) {
3361 return !ExpressionRecipesAsSetOfUsers.contains(&U);
3362 });
// The clone is inserted before R is removed from its parent just below.
3363 CopyForExtUsers->insertBefore(R);
3364 }
// The list may contain duplicates, so a recipe may already have been removed.
3365 if (R->getParent())
3366 R->removeFromParent();
3367 }
3368
3369 // Internalize all external operands to the expression recipes. To do so,
3370 // create new temporary VPValues for all operands defined by a recipe outside
3371 // the expression. The original operands are added as operands of the
3372 // VPExpressionRecipe itself.
3373 for (auto *R : ExpressionRecipes) {
3374 for (const auto &[Idx, Op] : enumerate(R->operands())) {
3375 auto *Def = Op->getDefiningRecipe();
3376 if (Def && ExpressionRecipesAsSetOfUsers.contains(Def))
3377 continue;
// Operands and placeholders are appended in lockstep, so index I of
// operands() corresponds to index I of LiveInPlaceholders.
3378 addOperand(Op);
3379 LiveInPlaceholders.push_back(new VPSymbolicValue(Op->getScalarType()));
3380 }
3381 }
3382
3383 // Replace each external operand with the first one created for it in
3384 // LiveInPlaceholders.
3385 for (auto *R : ExpressionRecipes)
3386 for (auto const &[LiveIn, Tmp] : zip(operands(), LiveInPlaceholders))
3387 R->replaceUsesOfWith(LiveIn, Tmp);
3388}
3389
// Undoes the bundling: re-inserts the bundled recipes before this recipe,
// reconnects the placeholder values to this recipe's operands, forwards all
// users of this recipe to the last bundled recipe and clears the bundle.
// NOTE(review): the signature line (listing line 3390) is not present in this
// extract; presumably VPExpressionRecipe::decompose() - confirm.
3391 for (auto *R : ExpressionRecipes)
3392 // Since the list could contain duplicates, make sure the recipe hasn't
3393 // already been inserted.
3394 if (!R->getParent())
3395 R->insertBefore(this);
3396
// Placeholder I is replaced by operand I of this recipe.
3397 for (const auto &[Idx, Op] : enumerate(operands()))
3398 LiveInPlaceholders[Idx]->replaceAllUsesWith(Op);
3399
3400 replaceAllUsesWith(ExpressionRecipes.back());
3401 ExpressionRecipes.clear();
3402}
3403
3405 VPCostContext &Ctx) const {
// Returns the cost of the bundled expression by dispatching on ExpressionType
// to a TTI query: partial-reduction, extended-reduction or multiply-accumulate
// reduction cost. The negated variants first switch Add/FAdd to Sub/FSub and
// then fall through to the corresponding non-negated case.
// NOTE(review): the first signature line (listing line 3404) is not present in
// this extract; presumably VPExpressionRecipe::computeCost(ElementCount VF,
// ...) - confirm. The initializer of `SrcVecTy` (listing line 3408) is also
// absent.
3406 Type *RedTy = this->getScalarType();
3407 auto *SrcVecTy =
3409 unsigned Opcode = RecurrenceDescriptor::getOpcode(
3410 cast<VPReductionRecipe>(ExpressionRecipes.back())->getRecurrenceKind());
3411 switch (ExpressionType) {
3412 case ExpressionTypes::NegatedExtendedReduction:
3413 assert((Opcode == Instruction::Add || Opcode == Instruction::FAdd) &&
3414 "Unexpected opcode");
3415 Opcode = Opcode == Instruction::Add ? Instruction::Sub : Instruction::FSub;
3416 [[fallthrough]];
3417 case ExpressionTypes::ExtendedReduction: {
// The reduction is the last bundled recipe and the extend is the first.
3418 auto *RedR = cast<VPReductionRecipe>(ExpressionRecipes.back());
3419 auto *ExtR = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
3420
// NOTE(review): one argument line of the call below (listing line 3424) and
// the body of the final `else` (listing line 3434) are absent from this
// extract.
3421 if (RedR->isPartialReduction())
3422 return Ctx.TTI.getPartialReductionCost(
3423 Opcode, getOperand(0)->getScalarType(), nullptr, RedTy, VF,
3425 TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind,
3426 RedTy->isFloatingPointTy() ? std::optional{RedR->getFastMathFlags()}
3427 : std::nullopt);
3428 else if (!RedTy->isFloatingPointTy())
3429 // TTI::getExtendedReductionCost only supports integer types.
3430 return Ctx.TTI.getExtendedReductionCost(
3431 Opcode, ExtR->getOpcode() == Instruction::ZExt, RedTy, SrcVecTy,
3432 std::nullopt, Ctx.CostKind);
3433 else
3435 }
3436 case ExpressionTypes::MulAccReduction:
3437 return Ctx.TTI.getMulAccReductionCost(false, Opcode, RedTy, SrcVecTy,
3438 Ctx.CostKind);
3439
3440 case ExpressionTypes::ExtNegatedMulAccReduction:
3441 switch (Opcode) {
3442 case Instruction::Add:
3443 Opcode = Instruction::Sub;
3444 break;
3445 case Instruction::FAdd:
3446 Opcode = Instruction::FSub;
3447 break;
3448 default:
3449 llvm_unreachable("Unsupported opcode for ExtNegatedMulAccReduction");
3450 }
3451 [[fallthrough]];
3452 case ExpressionTypes::ExtMulAccReduction: {
3453 auto *RedR = cast<VPReductionRecipe>(ExpressionRecipes.back());
3454 if (RedR->isPartialReduction()) {
// For the partial case, bundle entries 0 and 1 are the extends and entry 2 is
// the multiply.
// NOTE(review): the leading parts of two arguments (listing lines 3461 and
// 3463) are absent from this extract.
3455 auto *Ext0R = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
3456 auto *Ext1R = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
3457 auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]);
3458 return Ctx.TTI.getPartialReductionCost(
3459 Opcode, getOperand(0)->getScalarType(),
3460 getOperand(1)->getScalarType(), RedTy, VF,
3462 Ext0R->getOpcode()),
3464 Ext1R->getOpcode()),
3465 Mul->getOpcode(), Ctx.CostKind,
3466 RedTy->isFloatingPointTy() ? std::optional{RedR->getFastMathFlags()}
3467 : std::nullopt);
3468 }
3469 assert(Opcode != Instruction::FSub && "Only integer types are supported");
// The first argument is true when the first bundled extend is a zero-extend.
3470 return Ctx.TTI.getMulAccReductionCost(
3471 cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() ==
3472 Instruction::ZExt,
3473 Opcode, RedTy, SrcVecTy, Ctx.CostKind);
3474 }
3475 }
3476 llvm_unreachable("Unknown VPExpressionRecipe::ExpressionTypes enum");
3477}
3478
// Returns true if any bundled recipe may read from or write to memory.
// NOTE(review): the signature line (listing line 3479) is not present in this
// extract; presumably VPExpressionRecipe::mayReadOrWriteMemory() - confirm.
3480 return any_of(ExpressionRecipes, [](VPSingleDefRecipe *R) {
3481 return R->mayReadFromMemory() || R->mayWriteToMemory();
3482 });
3483}
3484
// Always returns false; in assertion-enabled builds it first checks that no
// bundled recipe may have side effects.
// NOTE(review): the signature line (listing line 3485) is not present in this
// extract; presumably VPExpressionRecipe::mayHaveSideEffects() - confirm.
3486 assert(
3487 none_of(ExpressionRecipes,
3488 [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
3489 "expression cannot contain recipes with side-effects");
3490 return false;
3491}
3492
// Returns true when the last bundled recipe is a VPReductionRecipe that is not
// a partial reduction.
// NOTE(review): the signature line (listing line 3493) is not present in this
// extract; presumably VPExpressionRecipe::isSingleScalar() - confirm.
3494 auto *RR = dyn_cast<VPReductionRecipe>(ExpressionRecipes.back());
3495 return RR && !RR->isPartialReduction();
3496}
3497
3498#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3499
3501 VPSlotTracker &SlotTracker) const {
// Debug printer: writes "EXPRESSION " followed by a textual rendering of the
// bundled reduction, whose shape depends on ExpressionType.
// NOTE(review): the first signature line (listing line 3500) and several
// lines that presumably print operands (e.g. listing lines 3503, 3512, 3517,
// 3541, 3545, 3570, 3578) are not present in this extract - confirm against
// the full file.
3502 O << Indent << "EXPRESSION ";
3504 O << " = ";
// The reduction is always the last bundled recipe.
3505 auto *Red = cast<VPReductionRecipe>(ExpressionRecipes.back());
3506 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
3507
3508 switch (ExpressionType) {
3509 case ExpressionTypes::NegatedExtendedReduction:
3510 case ExpressionTypes::ExtendedReduction: {
3511 bool Negated = ExpressionType == ExpressionTypes::NegatedExtendedReduction;
3513 O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
3514 O << Instruction::getOpcodeName(Opcode) << " (";
// Negation is printed as "sub (0, ...)" for integer add and "fneg(...)"
// otherwise.
3515 if (Negated)
3516 O << (Opcode == Instruction::Add ? "sub (0, " : "fneg(");
3518 if (Negated)
3519 O << ")";
3520 Red->printFlags(O);
3521
3522 auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
3523 O << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
3524 << *Ext0->getScalarType();
3525 if (Red->isConditional()) {
3526 O << ", ";
3527 Red->getCondOp()->printAsOperand(O, SlotTracker);
3528 }
3529 O << ")";
3530 break;
3531 }
3532 case ExpressionTypes::ExtNegatedMulAccReduction: {
3534 O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
3536 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
3537 << " (sub (0, mul";
// Bundle entries 0 and 1 are the extends and entry 2 is the multiply.
3538 auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]);
3539 Mul->printFlags(O);
3540 O << "(";
3542 auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
3543 O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
3544 << *Ext0->getScalarType() << "), (";
3546 auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
3547 O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
3548 << *Ext1->getScalarType() << ")";
3549 if (Red->isConditional()) {
3550 O << ", ";
3551 Red->getCondOp()->printAsOperand(O, SlotTracker);
3552 }
3553 O << "))";
3554 break;
3555 }
3556 case ExpressionTypes::MulAccReduction:
3557 case ExpressionTypes::ExtMulAccReduction: {
3559 O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
3561 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
3562 << " (";
3563 O << "mul";
// With extends the multiply is bundle entry 2; without them it is entry 0.
3564 bool IsExtended = ExpressionType == ExpressionTypes::ExtMulAccReduction;
3565 auto *Mul = cast<VPWidenRecipe>(IsExtended ? ExpressionRecipes[2]
3566 : ExpressionRecipes[0]);
3567 Mul->printFlags(O);
3568 if (IsExtended)
3569 O << "(";
3571 if (IsExtended) {
3572 auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
3573 O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
3574 << *Ext0->getScalarType() << "), (";
3575 } else {
3576 O << ", ";
3577 }
3579 if (IsExtended) {
3580 auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
3581 O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
3582 << *Ext1->getScalarType() << ")";
3583 }
3584 if (Red->isConditional()) {
3585 O << ", ";
3586 Red->getCondOp()->printAsOperand(O, SlotTracker);
3587 }
3588 O << ")";
3589 break;
3590 }
3591 }
3592}
3593
3595 VPSlotTracker &SlotTracker) const {
// Debug printer: writes "PARTIAL-REDUCE " or "REDUCE " depending on
// isPartialReduction(), then the flags and a "reduce." expression.
// NOTE(review): the first signature line (listing line 3594) and the lines
// that presumably print the operands (listing lines 3600, 3602, 3606, 3608,
// 3611) are not present in this extract; presumably
// VPReductionRecipe::printRecipe - confirm.
3596 if (isPartialReduction())
3597 O << Indent << "PARTIAL-REDUCE ";
3598 else
3599 O << Indent << "REDUCE ";
3601 O << " = ";
3603 O << " +";
3604 printFlags(O);
3605 O << " reduce.";
3607 O << " (";
3609 if (isConditional()) {
3610 O << ", ";
3612 }
3613 O << ")";
3614}
3615
3617 VPSlotTracker &SlotTracker) const {
// Debug printer: writes "REDUCE ", the flags and a "vp.reduce." expression.
// NOTE(review): the first signature line (listing line 3616) and the lines
// that presumably print the operands (e.g. listing lines 3619, 3621,
// 3625-3626, 3628, 3630, 3633) are not present in this extract; presumably
// VPReductionEVLRecipe::printRecipe - confirm.
3618 O << Indent << "REDUCE ";
3620 O << " = ";
3622 O << " +";
3623 printFlags(O);
3624 O << " vp.reduce."
3627 << " (";
3629 O << ", ";
3631 if (isConditional()) {
3632 O << ", ";
3634 }
3635 O << ")";
3636}
3637
3638#endif
3639
// Emits a single scalar clone of the underlying instruction: clones it,
// adjusts its result type, flags, metadata and predicate, rewires its
// operands to the scalar values held in State, inserts it via the builder and
// records it as this recipe's scalar value.
// NOTE(review): the signature line (listing line 3640) is not present in this
// extract; presumably VPReplicateRecipe::execute(VPTransformState &State) -
// confirm.
3641 assert(IsSingleScalar &&
3642 "VPReplicateRecipes must be unrolled before ::execute");
3643 auto *Instr = getUnderlyingInstr();
3644 Instruction *Cloned = Instr->clone();
3645 Type *ResultTy = getScalarType();
// Void-typed instructions are neither named nor retyped.
3646 if (!ResultTy->isVoidTy()) {
3647 Cloned->setName(Instr->getName() + ".cloned");
3648 // The operands of the replicate recipe may have been narrowed, resulting in
3649 // a narrower result type. Update the type of the cloned instruction to the
3650 // correct type.
3651 if (ResultTy != Cloned->getType())
3652 Cloned->mutateType(ResultTy);
3653 }
3654
3655 applyFlags(*Cloned);
3656 applyMetadata(*Cloned);
3657
// When the recipe carries a predicate, the clone is treated as a compare and
// receives that predicate.
3658 if (hasPredicate())
3659 cast<CmpInst>(Cloned)->setPredicate(getPredicate());
3660
3661 // Replace the operands of the cloned instructions with their scalar
3662 // equivalents in the new loop.
3663 for (const auto &[Idx, V] : enumerate(operands()))
3664 Cloned->setOperand(Idx, State.get(V, true));
3665
3666 // Place the cloned scalar in the new loop.
3667 State.Builder.Insert(Cloned);
3668
3669 State.set(this, Cloned, true);
3670
3671 // If we just cloned a new assumption, add it to the assumption cache.
3672 if (auto *II = dyn_cast<AssumeInst>(Cloned))
3673 State.AC->registerAssumption(II);
3674}
3675
3676/// Returns a SCEV expression for \p Ptr if it is a pointer computation for
3677/// which the legacy cost model computes a SCEV expression when computing the
3678/// address cost. Computing SCEVs for VPValues is incomplete and returns
3679/// SCEVCouldNotCompute in cases the legacy cost model can compute SCEVs. In
3680/// those cases the SCEVCouldNotCompute expression itself is returned, so that
/// callers can fall back to the legacy cost model. Otherwise return nullptr.
// NOTE(review): the parameter line declaring `PSE` (listing line 3682) is not
// present in this extract.
3681static const SCEV *getAddressAccessSCEV(const VPValue *Ptr,
3683 const Loop *L) {
3684 const SCEV *Addr = vputils::getSCEVExprForVPValue(Ptr, PSE, L);
// Pass the could-not-compute marker through unchanged.
3685 if (isa<SCEVCouldNotCompute>(Addr))
3686 return Addr;
3687
3688 return vputils::isAddressSCEVForCost(Addr, *PSE.getSE(), L) ? Addr : nullptr;
3689}
3690
3692 VPCostContext &Ctx) const {
// Returns the cost of replicating the underlying instruction for VF,
// dispatching on its opcode. Opcodes without a case, and cases that `break`
// out of the switch, fall back to Ctx.getLegacyCost at the end.
// NOTE(review): the first signature line (listing line 3691), the declaration
// of `UI` (listing line 3693) and a number of other lines are not present in
// this extract; presumably VPReplicateRecipe::computeCost(ElementCount VF,
// ...) - confirm.
3694 // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan
3695 // transform, avoid computing their cost multiple times for now.
3696 Ctx.SkipCostComputation.insert(UI);
3697
// NOTE(review): the statement guarded by this `if` (listing line 3699) is
// absent from this extract.
3698 if (VF.isScalable() && !isSingleScalar())
3700
3701 switch (UI->getOpcode()) {
3702 case Instruction::Alloca:
// NOTE(review): the statement guarded by this `if` (listing line 3704) is
// absent from this extract.
3703 if (VF.isScalable())
3705 return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul,
3706 this->getScalarType(), Ctx.CostKind);
3707 case Instruction::GetElementPtr:
3708 // We mark this instruction as zero-cost because the cost of GEPs in
3709 // vectorized code depends on whether the corresponding memory instruction
3710 // is scalarized or not. Therefore, we handle GEPs with the memory
3711 // instruction cost.
3712 return 0;
3713 case Instruction::Call: {
// NOTE(review): the initializer of `CalledFn` and the definition of `ArgOps`
// (listing lines 3715 and 3717) are absent from this extract.
3714 auto *CalledFn =
3716 Type *ResultTy = this->getScalarType();
3718 return computeCallCost(CalledFn, ResultTy, ArgOps, isSingleScalar(), VF,
3719 Ctx);
3720 }
3721 case Instruction::Add:
3722 case Instruction::Sub:
3723 case Instruction::FAdd:
3724 case Instruction::FSub:
3725 case Instruction::Mul:
3726 case Instruction::FMul:
3727 case Instruction::FDiv:
3728 case Instruction::FRem:
3729 case Instruction::Shl:
3730 case Instruction::LShr:
3731 case Instruction::AShr:
3732 case Instruction::And:
3733 case Instruction::Or:
3734 case Instruction::Xor:
3735 case Instruction::ICmp:
3736 case Instruction::FCmp:
// The per-instruction cost is multiplied by 1 for single scalars and by the
// fixed VF otherwise.
// NOTE(review): the start of this return expression (listing line 3737) is
// absent from this extract.
3738 Ctx) *
3739 (isSingleScalar() ? 1 : VF.getFixedValue());
3740 case Instruction::SDiv:
3741 case Instruction::UDiv:
3742 case Instruction::SRem:
3743 case Instruction::URem: {
// NOTE(review): the initializer of `ScalarCost` (listing line 3745) is absent
// from this extract.
3744 InstructionCost ScalarCost =
3746 if (isSingleScalar())
3747 return ScalarCost;
3748
3749 // If any of the operands is from a different replicate region and has its
3750 // cost skipped, it may have been forced to scalar. Fall back to legacy cost
3751 // model to avoid cost mis-match.
3752 if (any_of(operands(), [&Ctx, VF](VPValue *Op) {
3753 auto *PredR = dyn_cast<VPPredInstPHIRecipe>(Op);
3754 if (!PredR)
3755 return false;
3756 return Ctx.skipCostComputation(
3758 PredR->getOperand(0)->getUnderlyingValue()),
3759 VF.isVector());
3760 }))
3761 break;
3762
3763 ScalarCost = ScalarCost * VF.getFixedValue() +
3764 Ctx.getScalarizationOverhead(this->getScalarType(),
3765 to_vector(operands()), VF);
3766 // If the recipe is not predicated (i.e. not in a replicate region), return
3767 // the scalar cost. Otherwise handle predicated cost.
3768 if (!getRegion()->isReplicator())
3769 return ScalarCost;
3770
3771 // Account for the phi nodes that we will create.
3772 ScalarCost += VF.getFixedValue() *
3773 Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
3774 // Scale the cost by the probability of executing the predicated blocks.
3775 // This assumes the predicated block for each vector lane is equally
3776 // likely.
3777 ScalarCost /= Ctx.getPredBlockCostDivisor(UI->getParent());
3778 return ScalarCost;
3779 }
3780 case Instruction::Load:
3781 case Instruction::Store: {
3782 bool IsLoad = UI->getOpcode() == Instruction::Load;
// The pointer is operand 0 for loads and operand 1 for stores.
3783 const VPValue *PtrOp = getOperand(!IsLoad);
3784 const SCEV *PtrSCEV = getAddressAccessSCEV(PtrOp, Ctx.PSE, Ctx.L);
// NOTE(review): the condition guarding this `break` (listing line 3785) is
// absent from this extract.
3786 break;
3787
3788 Type *ValTy = (IsLoad ? this : getOperand(0))->getScalarType();
3789 Type *ScalarPtrTy = PtrOp->getScalarType();
3790 const Align Alignment = getLoadStoreAlignment(UI);
3791 unsigned AS = cast<PointerType>(ScalarPtrTy)->getAddressSpace();
// NOTE(review): the definition of `OpInfo` (listing line 3792) is absent from
// this extract.
3793 bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing();
3794 bool UsedByLoadStoreAddress =
3795 !PreferVectorizedAddressing && vputils::isUsedByLoadStoreAddress(this);
3796 InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
3797 UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo,
3798 UsedByLoadStoreAddress ? UI : nullptr);
3799
// The address computation is costed on a scalar pointer for single scalars
// and on a vector of pointers otherwise.
3800 Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(ScalarPtrTy, VF);
3801 InstructionCost ScalarCost =
3802 ScalarMemOpCost +
3803 Ctx.TTI.getAddressComputationCost(
3804 PtrTy, UsedByLoadStoreAddress ? nullptr : Ctx.PSE.getSE(), PtrSCEV,
3805 Ctx.CostKind);
3806 if (isSingleScalar())
3807 return ScalarCost;
3808
3809 SmallVector<const VPValue *> OpsToScalarize;
3810 Type *ResultTy = Type::getVoidTy(PtrTy->getContext());
3811 // Set ResultTy and OpsToScalarize, if scalarization is needed. Currently we
3812 // don't assign scalarization overhead in general, if the target prefers
3813 // vectorized addressing or the loaded value is used as part of an address
3814 // of another load or store.
3815 if (!UsedByLoadStoreAddress) {
3816 bool EfficientVectorLoadStore =
3817 Ctx.TTI.supportsEfficientVectorElementLoadStore();
3818 if (!(IsLoad && !PreferVectorizedAddressing) &&
3819 !(!IsLoad && EfficientVectorLoadStore))
3820 append_range(OpsToScalarize, operands());
3821
3822 if (!EfficientVectorLoadStore)
3823 ResultTy = this->getScalarType();
3824 }
3825
// NOTE(review): the lines declaring `VIC` and `Cost` (listing lines 3826-3828)
// are absent from this extract.
3829 (ScalarCost * VF.getFixedValue()) +
3830 Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, VIC, true);
3831
3832 const VPRegionBlock *ParentRegion = getRegion();
3833 if (ParentRegion && ParentRegion->isReplicator()) {
// Without an address SCEV the predicated case uses the legacy cost instead.
3834 if (!PtrSCEV)
3835 break;
3836 Cost /= Ctx.getPredBlockCostDivisor(UI->getParent());
3837 Cost += Ctx.TTI.getCFInstrCost(Instruction::CondBr, Ctx.CostKind);
3838
// Add the cost of extracting every element of an i1 vector of width VF.
3839 auto *VecI1Ty = VectorType::get(
3840 IntegerType::getInt1Ty(Ctx.L->getHeader()->getContext()), VF);
3841 Cost += Ctx.TTI.getScalarizationOverhead(
3842 VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
3843 /*Insert=*/false, /*Extract=*/true, Ctx.CostKind);
3844
3845 if (Ctx.useEmulatedMaskMemRefHack(this, VF)) {
3846 // Artificially setting to a high enough value to practically disable
3847 // vectorization with such operations.
3848 return 3000000;
3849 }
3850 }
3851 return Cost;
3852 }
3853 case Instruction::SExt:
3854 case Instruction::ZExt:
3855 case Instruction::FPToUI:
3856 case Instruction::FPToSI:
3857 case Instruction::FPExt:
3858 case Instruction::PtrToInt:
3859 case Instruction::PtrToAddr:
3860 case Instruction::IntToPtr:
3861 case Instruction::SIToFP:
3862 case Instruction::UIToFP:
3863 case Instruction::Trunc:
3864 case Instruction::FPTrunc:
3865 case Instruction::Select:
3866 case Instruction::AddrSpaceCast: {
// NOTE(review): the start of this return expression (listing line 3867) is
// absent from this extract.
3868 Ctx) *
3869 (isSingleScalar() ? 1 : VF.getFixedValue());
3870 }
3871 case Instruction::ExtractValue:
3872 case Instruction::InsertValue:
3873 return Ctx.TTI.getInsertExtractValueCost(getOpcode(), Ctx.CostKind);
3874 }
3875
// Reached for opcodes without a case above and for the cases that `break`.
3876 return Ctx.getLegacyCost(UI, VF);
3877}
3878
3880 Function *CalledFn, Type *ResultTy, ArrayRef<const VPValue *> ArgOps,
3881 bool IsSingleScalar, ElementCount VF, VPCostContext &Ctx) {
// Returns the cost of a scalarized call to \p CalledFn with result type
// \p ResultTy and arguments \p ArgOps. Free scalar intrinsics cost 0; a single
// scalar call costs the cheaper of the call cost and the intrinsic cost;
// otherwise the scalar call cost is multiplied by the fixed VF and the
// scalarization overhead is added.
// NOTE(review): the first signature line (listing line 3879) and the start of
// the `Tys` definition (listing line 3882) are not present in this extract;
// presumably a computeCallCost helper - confirm.
3883 ArgOps, [&](const VPValue *Op) { return Op->getScalarType(); });
3884
3885 Intrinsic::ID IntrinID = CalledFn->getIntrinsicID();
3886 auto GetIntrinsicCost = [&] {
// NOTE(review): the statement guarded by this `if` (listing line 3888) is
// absent from this extract.
3887 if (!IntrinID)
3889 return Ctx.TTI.getIntrinsicInstrCost(
3890 IntrinsicCostAttributes(IntrinID, ResultTy, Tys), Ctx.CostKind);
3891 };
3892
3893 if (IntrinID && VPCostContext::isFreeScalarIntrinsic(IntrinID)) {
3894 assert(GetIntrinsicCost() == 0 && "scalarizing intrinsic should be free");
3895 return 0;
3896 }
3897
3898 InstructionCost ScalarCallCost =
3899 Ctx.TTI.getCallInstrCost(CalledFn, ResultTy, Tys, Ctx.CostKind);
3900 if (IsSingleScalar) {
3901 ScalarCallCost = std::min(ScalarCallCost, GetIntrinsicCost());
3902 return ScalarCallCost;
3903 }
3904
3905 // Scalarization overhead is undefined for scalable VFs.
// NOTE(review): the statement guarded by this `if` (listing line 3907) is
// absent from this extract.
3906 if (VF.isScalable())
3908
3909 return ScalarCallCost * VF.getFixedValue() +
3910 Ctx.getScalarizationOverhead(ResultTy, ArgOps, VF);
3911}
3912
3913#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3915 VPSlotTracker &SlotTracker) const {
// Debug printer: writes "CLONE " for single-scalar recipes and "REPLICATE "
// otherwise, followed by the call or instruction being replicated, and a
// trailing " (S->V)" marker when the scalar values are also packed.
// NOTE(review): the first signature line (listing line 3914) and several
// print lines (listing lines 3919, 3926, 3932, 3934) are not present in this
// extract; presumably VPReplicateRecipe::printRecipe - confirm.
3916 O << Indent << (IsSingleScalar ? "CLONE " : "REPLICATE ");
3917
3918 if (!getScalarType()->isVoidTy()) {
3920 O << " = ";
3921 }
3922 if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
3923 O << "call";
3924 printFlags(O);
3925 O << "@" << CB->getCalledFunction()->getName() << "(";
3927 O, [&O, &SlotTracker](VPValue *Op) {
3928 Op->printAsOperand(O, SlotTracker);
3929 });
3930 O << ")";
3931 } else {
3933 printFlags(O);
3935 }
3936
3937 // Find if the recipe is used by a widened recipe via an intervening
3938 // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
3939 if (any_of(users(), [](const VPUser *U) {
3940 if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
3941 return !vputils::onlyScalarValuesUsed(PredR);
3942 return false;
3943 }))
3944 O << " (S->V)";
3945}
3946#endif
3947
// Must never run: per the message, this recipe is expected to have been
// removed when its replicate region was dissolved.
// NOTE(review): the signature line (listing line 3948) is not present in this
// extract; presumably VPBranchOnMaskRecipe::execute - confirm.
3949 llvm_unreachable("recipe must be removed when dissolving replicate region");
3950}
3951
3953 VPCostContext &Ctx) const {
// Always returns a cost of 0 (see the rationale below).
// NOTE(review): the first signature line (listing line 3952) is not present in
// this extract; presumably VPBranchOnMaskRecipe::computeCost - confirm.
3954 // The legacy cost model doesn't assign costs to branches for individual
3955 // replicate regions. Match the current behavior in the VPlan cost model for
3956 // now.
3957 return 0;
3958}
3959
// Must never run: per the message, this recipe is expected to have been
// removed when its replicate region was dissolved.
// NOTE(review): the signature line (listing line 3960) is not present in this
// extract; presumably VPPredInstPHIRecipe::execute - confirm.
3961 llvm_unreachable("recipe must be removed when dissolving replicate region");
3962}
3963
3964#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3966 VPSlotTracker &SlotTracker) const {
// Debug printer: writes "PHI-PREDICATED-INSTRUCTION " and " = ".
// NOTE(review): the first signature line (listing line 3965) and the lines
// that presumably print the result and operands (listing lines 3968, 3970)
// are not present in this extract; presumably
// VPPredInstPHIRecipe::printRecipe - confirm.
3967 O << Indent << "PHI-PREDICATED-INSTRUCTION ";
3969 O << " = ";
3971}
3972#endif
3973
3975 VPCostContext &Ctx) const {
// Returns the cost of a widened load or store for VF. Non-consecutive
// accesses are costed as a gather/scatter intrinsic plus an address
// computation; consecutive accesses are costed as a masked load/store
// intrinsic when masked and as a plain memory operation otherwise.
// NOTE(review): the first signature line (listing line 3974) and the
// declaration of `IsLoad` (listing line 3977) are not present in this
// extract; presumably VPWidenMemoryRecipe::computeCost(ElementCount VF, ...)
// - confirm.
3976 const VPRecipeBase *R = getAsRecipe();
// Loads take the element type from the recipe's result, stores from
// operand 1.
3978 Type *ScalarTy = IsLoad ? cast<VPSingleDefRecipe>(R)->getScalarType()
3979 : R->getOperand(1)->getScalarType();
3980 Type *Ty = toVectorTy(ScalarTy, VF);
3981 unsigned AS =
3982 cast<PointerType>(getAddr()->getScalarType())->getAddressSpace();
3983 unsigned Opcode = IsLoad ? Instruction::Load : Instruction::Store;
3984
3985 if (!Consecutive) {
3986 // TODO: Using the original IR may not be accurate.
3987 // Currently, ARM will use the underlying IR to calculate gather/scatter
3988 // instruction cost.
// Assertion-only helper: reports whether the mask matches a reverse pattern.
// NOTE(review): listing lines 3994-3995 inside this lambda are absent from
// this extract.
3989 [[maybe_unused]] auto IsReverseMask = [this, R]() {
3990 VPValue *Mask = getMask();
3991 if (!Mask)
3992 return false;
3993
3996
3997 return match(Mask, m_Reverse(m_VPValue()));
3998 };
3999 assert(!IsReverseMask() &&
4000 "Inconsecutive memory access should not have reverse order");
4001 Type *PtrTy = getAddr()->getScalarType();
4002 const Value *Ptr = getAddr()->getUnderlyingValue();
4003
4004 // If the address value is uniform across all lanes, then the address can be
4005 // calculated with scalar type and broadcast.
// NOTE(review): the condition guarding the next statement (listing line 4006)
// is absent from this extract.
4007 PtrTy = toVectorTy(PtrTy, VF);
4008
// Select the gather/scatter intrinsic that matches the concrete recipe kind.
4009 unsigned IID = isa<VPWidenLoadRecipe>(R) ? Intrinsic::masked_gather
4010 : isa<VPWidenStoreRecipe>(R) ? Intrinsic::masked_scatter
4011 : isa<VPWidenLoadEVLRecipe>(R) ? Intrinsic::vp_gather
4012 : Intrinsic::vp_scatter;
// NOTE(review): the start of the cost-attributes argument (listing line 4016)
// is absent from this extract.
4013 return Ctx.TTI.getAddressComputationCost(PtrTy, nullptr, nullptr,
4014 Ctx.CostKind) +
4015 Ctx.TTI.getMemIntrinsicInstrCost(
4017 &Ingredient),
4018 Ctx.CostKind);
4019 }
4020
// NOTE(review): the declaration of `Cost` (listing line 4021) is absent from
// this extract.
4022 if (IsMasked) {
4023 unsigned IID = isa<VPWidenLoadRecipe>(R) ? Intrinsic::masked_load
4024 : Intrinsic::masked_store;
4025 Cost += Ctx.TTI.getMemIntrinsicInstrCost(
4026 MemIntrinsicCostAttributes(IID, Ty, Alignment, AS), Ctx.CostKind);
4027 } else {
// NOTE(review): the first alternative of the operand passed to getOperandInfo
// (listing line 4029) is absent from this extract.
4028 TTI::OperandValueInfo OpInfo = Ctx.getOperandInfo(
4030 : R->getOperand(1));
4031 Cost += Ctx.TTI.getMemoryOpCost(Opcode, Ty, Alignment, AS, Ctx.CostKind,
4032 OpInfo, &Ingredient);
4033 }
4034 return Cost;
4035}
4036
// Emits the widened load: a masked gather for non-consecutive accesses, a
// masked load when a mask is present, and a plain aligned load otherwise. The
// result is recorded as this recipe's vector value.
// NOTE(review): the signature line (listing line 4037) is not present in this
// extract; presumably VPWidenLoadRecipe::execute(VPTransformState &State) -
// confirm.
4038 Type *ScalarDataTy = getScalarType();
4039 auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
4040 bool CreateGather = !isConsecutive();
4041
4042 auto &Builder = State.Builder;
4043 Value *Mask = nullptr;
4044 if (auto *VPMask = getMask())
4045 Mask = State.get(VPMask);
4046
// A gather needs a vector of addresses; otherwise the address is fetched as a
// scalar.
4047 Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateGather);
4048 Value *NewLI;
4049 if (CreateGather) {
4050 NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
4051 "wide.masked.gather");
4052 } else if (Mask) {
// Masked-off lanes use poison as the pass-through value.
4053 NewLI =
4054 Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
4055 PoisonValue::get(DataTy), "wide.masked.load");
4056 } else {
4057 NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
4058 }
// NOTE(review): listing line 4059 is absent from this extract.
4060 State.set(this, NewLI);
4061}
4062
4063#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4065 VPSlotTracker &SlotTracker) const {
// Debug printer: writes "WIDEN " and " = load ".
// NOTE(review): the first signature line (listing line 4064) and the lines
// that presumably print the result and operands (listing lines 4067, 4069)
// are not present in this extract; presumably VPWidenLoadRecipe::printRecipe
// - confirm.
4066 O << Indent << "WIDEN ";
4068 O << " = load ";
4070}
4071#endif
4072
// Emits the EVL-predicated widened load as a vp.gather (non-consecutive) or
// vp.load (consecutive) intrinsic call and records it as this recipe's vector
// value.
// NOTE(review): the signature line (listing line 4073) is not present in this
// extract; presumably VPWidenLoadEVLRecipe::execute(VPTransformState &State)
// - confirm.
4074 Type *ScalarDataTy = getScalarType();
4075 auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
4076 bool CreateGather = !isConsecutive();
4077
4078 auto &Builder = State.Builder;
4079 CallInst *NewLI;
// The EVL is read from lane 0; the address is a vector only for gathers.
4080 Value *EVL = State.get(getEVL(), VPLane(0));
4081 Value *Addr = State.get(getAddr(), !CreateGather);
// The intrinsics always take a mask operand, so use an all-true splat when
// the recipe has none.
4082 Value *Mask = nullptr;
4083 if (VPValue *VPMask = getMask())
4084 Mask = State.get(VPMask);
4085 else
4086 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
4087
4088 if (CreateGather) {
4089 NewLI =
4090 Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
4091 nullptr, "wide.masked.gather");
4092 } else {
4093 NewLI = Builder.CreateIntrinsic(DataTy, Intrinsic::vp_load,
4094 {Addr, Mask, EVL}, nullptr, "vp.op.load");
4095 }
// NOTE(review): the arguments of addParamAttr (listing line 4097) are absent
// from this extract.
4096 NewLI->addParamAttr(
4098 applyMetadata(*NewLI);
4099 Instruction *Res = NewLI;
4100 State.set(this, Res);
4101}
4102
4104 VPCostContext &Ctx) const {
// Returns the cost of the EVL load for VF. Non-consecutive or masked accesses
// defer to VPWidenMemoryRecipe::computeCost; otherwise the access is costed
// as a vp.load memory intrinsic.
// NOTE(review): the first signature line (listing line 4103) is not present in
// this extract; presumably VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
// ...) - confirm.
4105 if (!Consecutive || IsMasked)
4106 return VPWidenMemoryRecipe::computeCost(VF, Ctx);
4107
4108 // We need to use the getMemIntrinsicInstrCost() instead of getMemoryOpCost()
4109 // here because the EVL recipes using EVL to replace the tail mask. But in the
4110 // legacy model, it will always calculate the cost of mask.
4111 // TODO: Using getMemoryOpCost() instead of getMemIntrinsicInstrCost when we
4112 // don't need to compare to the legacy cost model.
4113 Type *Ty = toVectorTy(getScalarType(), VF);
4114 unsigned AS =
4115 cast<PointerType>(getAddr()->getScalarType())->getAddressSpace();
4116 return Ctx.TTI.getMemIntrinsicInstrCost(
4117 MemIntrinsicCostAttributes(Intrinsic::vp_load, Ty, Alignment, AS),
4118 Ctx.CostKind);
4119}
4120
4121#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4123 VPSlotTracker &SlotTracker) const {
// Debug printer: writes "WIDEN " and " = vp.load ".
// NOTE(review): the first signature line (listing line 4122) and the lines
// that presumably print the result and operands (listing lines 4125, 4127)
// are not present in this extract; presumably
// VPWidenLoadEVLRecipe::printRecipe - confirm.
4124 O << Indent << "WIDEN ";
4126 O << " = vp.load ";
4128}
4129#endif
4130
// Emits the widened store: a masked scatter for non-consecutive accesses, a
// masked store when a mask is present, and a plain aligned store otherwise,
// then applies the recipe's metadata to the new instruction.
// NOTE(review): the signature line (listing line 4131) is not present in this
// extract; presumably VPWidenStoreRecipe::execute(VPTransformState &State) -
// confirm.
4132 VPValue *StoredVPValue = getStoredValue();
4133 bool CreateScatter = !isConsecutive();
4134
4135 auto &Builder = State.Builder;
4136
4137 Value *Mask = nullptr;
4138 if (auto *VPMask = getMask())
4139 Mask = State.get(VPMask);
4140
4141 Value *StoredVal = State.get(StoredVPValue);
// A scatter needs a vector of addresses; otherwise the address is fetched as
// a scalar.
4142 Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter);
4143 Instruction *NewSI = nullptr;
4144 if (CreateScatter)
4145 NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
4146 else if (Mask)
4147 NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
4148 else
4149 NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
4150 applyMetadata(*NewSI);
4151}
4152
4153#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4155 VPSlotTracker &SlotTracker) const {
// Debug printer: writes "WIDEN store ".
// NOTE(review): the first signature line (listing line 4154) and the line
// that presumably prints the operands (listing line 4157) are not present in
// this extract; presumably VPWidenStoreRecipe::printRecipe - confirm.
4156 O << Indent << "WIDEN store ";
4158}
4159#endif
4160
// Emits the EVL-predicated widened store as a vp.scatter (non-consecutive) or
// vp.store (consecutive) intrinsic call and applies the recipe's metadata.
// NOTE(review): the signature line (listing line 4161) is not present in this
// extract; presumably VPWidenStoreEVLRecipe::execute(VPTransformState &State)
// - confirm.
4162 VPValue *StoredValue = getStoredValue();
4163 bool CreateScatter = !isConsecutive();
4164
4165 auto &Builder = State.Builder;
4166
4167 CallInst *NewSI = nullptr;
4168 Value *StoredVal = State.get(StoredValue);
4169 Value *EVL = State.get(getEVL(), VPLane(0));
// The intrinsics always take a mask operand, so use an all-true splat when
// the recipe has none.
4170 Value *Mask = nullptr;
4171 if (VPValue *VPMask = getMask())
4172 Mask = State.get(VPMask);
4173 else
4174 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
4175
// The address is a vector only for scatters.
4176 Value *Addr = State.get(getAddr(), !CreateScatter);
4177 if (CreateScatter) {
4178 NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
4179 Intrinsic::vp_scatter,
4180 {StoredVal, Addr, Mask, EVL});
4181 } else {
4182 NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
4183 Intrinsic::vp_store,
4184 {StoredVal, Addr, Mask, EVL});
4185 }
// NOTE(review): the arguments of addParamAttr (listing line 4187) are absent
// from this extract.
4186 NewSI->addParamAttr(
4188 applyMetadata(*NewSI);
4189}
4190
4192 VPCostContext &Ctx) const {
// Returns the cost of the EVL store for VF. Non-consecutive or masked
// accesses defer to VPWidenMemoryRecipe::computeCost; otherwise the access is
// costed as a vp.store memory intrinsic on the stored value's vector type.
// NOTE(review): the first signature line (listing line 4191) is not present in
// this extract; presumably VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
// ...) - confirm.
4193 if (!Consecutive || IsMasked)
4194 return VPWidenMemoryRecipe::computeCost(VF, Ctx);
4195
4196 // We need to use the getMemIntrinsicInstrCost() instead of getMemoryOpCost()
4197 // here because the EVL recipes using EVL to replace the tail mask. But in the
4198 // legacy model, it will always calculate the cost of mask.
4199 // TODO: Using getMemoryOpCost() instead of getMemIntrinsicInstrCost when we
4200 // don't need to compare to the legacy cost model.
4201 Type *Ty = toVectorTy(getStoredValue()->getScalarType(), VF);
4202 unsigned AS =
4203 cast<PointerType>(getAddr()->getScalarType())->getAddressSpace();
4204 return Ctx.TTI.getMemIntrinsicInstrCost(
4205 MemIntrinsicCostAttributes(Intrinsic::vp_store, Ty, Alignment, AS),
4206 Ctx.CostKind);
4207}
4208
4209#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4211 VPSlotTracker &SlotTracker) const {
// Debug printer: writes "WIDEN vp.store ".
// NOTE(review): the first signature line (listing line 4210) and the line
// that presumably prints the operands (listing line 4213) are not present in
// this extract; presumably VPWidenStoreEVLRecipe::printRecipe - confirm.
4212 O << Indent << "WIDEN vp.store ";
4214}
4215#endif
4216
4218 VectorType *DstVTy, const DataLayout &DL) {
// Casts vector \p V to \p DstVTy, which must have the same element count and
// element bit width. A single bit-or-pointer cast is used when the element
// types allow it; otherwise the cast goes through an integer vector of the
// same element width.
// NOTE(review): the first signature line (listing line 4217), which declares
// `Builder` and `V`, is not present in this extract; presumably a
// createBitOrPointerCast helper - confirm.
4219 // Verify that V is a vector type with same number of elements as DstVTy.
4220 auto VF = DstVTy->getElementCount();
4221 auto *SrcVecTy = cast<VectorType>(V->getType());
4222 assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
4223 Type *SrcElemTy = SrcVecTy->getElementType();
4224 Type *DstElemTy = DstVTy->getElementType();
4225 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
4226 "Vector elements must have same size");
4227
4228 // Do a direct cast if element types are castable.
4229 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
4230 return Builder.CreateBitOrPointerCast(V, DstVTy);
4231 }
4232 // V cannot be directly casted to desired vector type.
4233 // May happen when V is a floating point vector but DstVTy is a vector of
4234 // pointers or vice-versa. Handle this using a two-step bitcast using an
4235 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
4236 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
4237 "Only one type should be a pointer type");
4238 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
4239 "Only one type should be a floating point type");
// The intermediate integer type has the same bit width as the source element.
4240 Type *IntTy =
4241 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
4242 auto *VecIntTy = VectorType::get(IntTy, VF);
4243 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
4244 return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
4245}
4246
4247/// Return a vector containing interleaved elements from multiple
4248/// smaller input vectors.
4250 const Twine &Name) {
// The interleave factor is the number of input vectors; at least two are
// required.
4251 unsigned Factor = Vals.size();
4252 assert(Factor > 1 && "Tried to interleave invalid number of vectors");
4253
4254 VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
// Debug builds only: every input must have the same type as the first one.
4255#ifndef NDEBUG
4256 for (Value *Val : Vals)
4257 assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
4258#endif
4259
4260 // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
4261 // must use intrinsics to interleave.
4262 if (VecTy->isScalableTy()) {
4263 assert(Factor <= 8 && "Unsupported interleave factor for scalable vectors");
4264 return Builder.CreateVectorInterleave(Vals, Name);
4265 }
4266
4267 // Fixed length. Start by concatenating all vectors into a wide vector.
4268 Value *WideVec = concatenateVectors(Builder, Vals);
4269
4270 // Interleave the elements into the wide vector.
4271 const unsigned NumElts = VecTy->getElementCount().getFixedValue();
4272 return Builder.CreateShuffleVector(
4273 WideVec, createInterleaveMask(NumElts, Factor), Name);
4274}
4275
4276// Try to vectorize the interleave group that \p Instr belongs to.
4277//
4278// E.g. Translate following interleaved load group (factor = 3):
4279// for (i = 0; i < N; i+=3) {
4280// R = Pic[i]; // Member of index 0
4281// G = Pic[i+1]; // Member of index 1
4282// B = Pic[i+2]; // Member of index 2
4283// ... // do something to R, G, B
4284// }
4285// To:
4286// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
4287// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
4288// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
4289// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
4290//
4291// Or translate following interleaved store group (factor = 3):
4292// for (i = 0; i < N; i+=3) {
4293// ... do something to R, G, B
4294// Pic[i] = R; // Member of index 0
4295// Pic[i+1] = G; // Member of index 1
4296// Pic[i+2] = B; // Member of index 2
4297// }
4298// To:
4299// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
4300// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
4301// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
4302// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
4303// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
4305 assert((!needsMaskForGaps() || !State.VF.isScalable()) &&
4306 "Masking gaps for scalable vectors is not yet supported.");
// NOTE(review): `Group` is used from here on, but its declaration is not
// visible in this listing.
4308 Instruction *Instr = Group->getInsertPos();
4309
4310 // Prepare for the vector type of the interleaved load/store.
4311 Type *ScalarTy = getLoadStoreType(Instr);
4312 unsigned InterleaveFactor = Group->getFactor();
4313 auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor);
4314
4315 VPValue *BlockInMask = getMask();
4316 VPValue *Addr = getAddr();
4317 Value *ResAddr = State.get(Addr, VPLane(0));
4318
// Builds the mask for the wide access. Scalable VF: the block mask is
// interleaved with itself InterleaveFactor times (no gap mask allowed).
// Fixed VF: the block mask is replicated per member and AND-ed with
// MaskForGaps when both exist; with no block mask, MaskForGaps is returned
// as is. Both call sites below only invoke this when BlockInMask or
// MaskForGaps is set.
4319 auto CreateGroupMask = [&BlockInMask, &State,
4320 &InterleaveFactor](Value *MaskForGaps) -> Value * {
4321 if (State.VF.isScalable()) {
4322 assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
4323 assert(InterleaveFactor <= 8 &&
4324 "Unsupported deinterleave factor for scalable vectors");
4325 auto *ResBlockInMask = State.get(BlockInMask);
4326 SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask);
4327 return interleaveVectors(State.Builder, Ops, "interleaved.mask");
4328 }
4329
4330 if (!BlockInMask)
4331 return MaskForGaps;
4332
4333 Value *ResBlockInMask = State.get(BlockInMask);
4334 Value *ShuffledMask = State.Builder.CreateShuffleVector(
4335 ResBlockInMask,
4336 createReplicatedMask(InterleaveFactor, State.VF.getFixedValue()),
4337 "interleaved.mask");
4338 return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And,
4339 ShuffledMask, MaskForGaps)
4340 : ShuffledMask;
4341 };
4342
4343 const DataLayout &DL = Instr->getDataLayout();
4344 // Vectorize the interleaved load group.
4345 if (isa<LoadInst>(Instr)) {
4346 Value *MaskForGaps = nullptr;
4347 if (needsMaskForGaps()) {
4348 MaskForGaps =
4349 createBitMaskForGaps(State.Builder, State.VF.getFixedValue(), *Group);
4350 assert(MaskForGaps && "Mask for Gaps is required but it is null");
4351 }
4352
// Emit a masked wide load when any mask applies, otherwise a plain aligned
// wide load.
4353 Instruction *NewLoad;
4354 if (BlockInMask || MaskForGaps) {
4355 Value *GroupMask = CreateGroupMask(MaskForGaps);
4356 Value *PoisonVec = PoisonValue::get(VecTy);
4357 NewLoad = State.Builder.CreateMaskedLoad(VecTy, ResAddr,
4358 Group->getAlign(), GroupMask,
4359 PoisonVec, "wide.masked.vec");
4360 } else
4361 NewLoad = State.Builder.CreateAlignedLoad(VecTy, ResAddr,
4362 Group->getAlign(), "wide.vec");
4363 applyMetadata(*NewLoad);
4364 // TODO: Also manage existing metadata using VPIRMetadata.
4365 Group->addMetadata(NewLoad);
4366
// NOTE(review): `VPDefs` is used in the member loop below, but its
// declaration is not visible in this listing.
4368 if (VecTy->isScalableTy()) {
4369 // Scalable vectors cannot use arbitrary shufflevectors (only splats),
4370 // so must use intrinsics to deinterleave.
4371 assert(InterleaveFactor <= 8 &&
4372 "Unsupported deinterleave factor for scalable vectors");
4373 NewLoad = State.Builder.CreateIntrinsic(
4374 Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor),
4375 NewLoad->getType(), NewLoad,
4376 /*FMFSource=*/nullptr, "strided.vec");
4377 }
4378
// Extracts the sub-vector for member `Index`: an extractvalue on the
// deinterleave result for scalable VFs, a strided shuffle of the wide load
// for fixed VFs.
4379 auto CreateStridedVector = [&InterleaveFactor, &State,
4380 &NewLoad](unsigned Index) -> Value * {
4381 assert(Index < InterleaveFactor && "Illegal group index");
4382 if (State.VF.isScalable())
4383 return State.Builder.CreateExtractValue(NewLoad, Index);
4384
4385 // For fixed length VF, use shuffle to extract the sub-vectors from the
4386 // wide load.
4387 auto StrideMask =
4388 createStrideMask(Index, InterleaveFactor, State.VF.getFixedValue());
4389 return State.Builder.CreateShuffleVector(NewLoad, StrideMask,
4390 "strided.vec");
4391 };
4392
// I walks all member slots (including gaps); J counts only the members that
// exist, so results land in consecutive VPDefs entries.
4393 for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
4394 Instruction *Member = Group->getMember(I);
4395
4396 // Skip the gaps in the group.
4397 if (!Member)
4398 continue;
4399
4400 Value *StridedVec = CreateStridedVector(I);
4401
4402 // If this member has different type, cast the result type.
4403 if (Member->getType() != ScalarTy) {
4404 VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
4405 StridedVec =
4406 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
4407 }
4408
4409 if (Group->isReverse())
4410 StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");
4411
4412 State.set(VPDefs[J], StridedVec);
4413 ++J;
4414 }
4415 return;
4416 }
4417
4418 // The sub vector type for current instruction.
4419 auto *SubVT = VectorType::get(ScalarTy, State.VF);
4420
4421 // Vectorize the interleaved store group.
4422 Value *MaskForGaps =
4423 createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group);
4424 assert(((MaskForGaps != nullptr) == needsMaskForGaps()) &&
4425 "Mismatch between NeedsMaskForGaps and MaskForGaps");
4426 ArrayRef<VPValue *> StoredValues = getStoredValues();
4427 // Collect the stored vector from each member.
4428 SmallVector<Value *, 4> StoredVecs;
// StoredIdx indexes StoredValues and only advances for members that exist.
4429 unsigned StoredIdx = 0;
4430 for (unsigned i = 0; i < InterleaveFactor; i++) {
4431 assert((Group->getMember(i) || MaskForGaps) &&
4432 "Fail to get a member from an interleaved store group");
4433 Instruction *Member = Group->getMember(i);
4434
4435 // Skip the gaps in the group.
4436 if (!Member) {
// A gap contributes a poison sub-vector to keep the slot positions aligned.
4437 Value *Undef = PoisonValue::get(SubVT);
4438 StoredVecs.push_back(Undef);
4439 continue;
4440 }
4441
4442 Value *StoredVec = State.get(StoredValues[StoredIdx]);
4443 ++StoredIdx;
4444
4445 if (Group->isReverse())
4446 StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse");
4447
4448 // If this member has different type, cast it to a unified type.
4449
4450 if (StoredVec->getType() != SubVT)
4451 StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
4452
4453 StoredVecs.push_back(StoredVec);
4454 }
4455
4456 // Interleave all the smaller vectors into one wider vector.
4457 Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
4458 Instruction *NewStoreInstr;
4459 if (BlockInMask || MaskForGaps) {
4460 Value *GroupMask = CreateGroupMask(MaskForGaps);
4461 NewStoreInstr = State.Builder.CreateMaskedStore(
4462 IVec, ResAddr, Group->getAlign(), GroupMask);
4463 } else
4464 NewStoreInstr =
4465 State.Builder.CreateAlignedStore(IVec, ResAddr, Group->getAlign());
4466
4467 applyMetadata(*NewStoreInstr);
4468 // TODO: Also manage existing metadata using VPIRMetadata.
4469 Group->addMetadata(NewStoreInstr);
4470}
4471
4472#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4474 VPSlotTracker &SlotTracker) const {
// Debug/dump-only printer. Emits a header with the group's factor, the mask
// operand when one is present, and then one line per existing group member.
// NOTE(review): `IG` is used below, but its declaration is not visible in this
// listing.
4476 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << ", ";
4478 VPValue *Mask = getMask();
4479 if (Mask) {
4480 O << ", ";
4481 Mask->printAsOperand(O, SlotTracker);
4482 }
4483
// OpIdx advances once per existing member; gaps (null members) are skipped.
4484 unsigned OpIdx = 0;
4485 for (unsigned i = 0; i < IG->getFactor(); ++i) {
4486 if (!IG->getMember(i))
4487 continue;
// A recipe with store operands prints "store ... to index"; otherwise it
// prints "... = load from index".
4488 if (getNumStoreOperands() > 0) {
4489 O << "\n" << Indent << " store ";
4491 O << " to index " << i;
4492 } else {
4493 O << "\n" << Indent << " ";
4495 O << " = load from index " << i;
4496 }
4497 ++OpIdx;
4498 }
4499}
4500#endif
4501
4503 assert(State.VF.isScalable() &&
4504 "Only support scalable VF for EVL tail-folding.");
4506 "Masking gaps for scalable vectors is not yet supported.");
// NOTE(review): `Group` is used from here on, but its declaration is not
// visible in this listing.
4508 Instruction *Instr = Group->getInsertPos();
4509
4510 // Prepare for the vector type of the interleaved load/store.
4511 Type *ScalarTy = getLoadStoreType(Instr);
4512 unsigned InterleaveFactor = Group->getFactor();
4513 assert(InterleaveFactor <= 8 &&
4514 "Unsupported deinterleave/interleave factor for scalable vectors");
4515 ElementCount WideVF = State.VF * InterleaveFactor;
4516 auto *VecTy = VectorType::get(ScalarTy, WideVF);
4517
4518 VPValue *Addr = getAddr();
4519 Value *ResAddr = State.get(Addr, VPLane(0));
4520 Value *EVL = State.get(getEVL(), VPLane(0));
// The wide access covers InterleaveFactor elements per original lane, so the
// EVL is scaled by the factor (multiply flagged both NUW and NSW).
4521 Value *InterleaveEVL = State.Builder.CreateMul(
4522 EVL, ConstantInt::get(EVL->getType(), InterleaveFactor), "interleave.evl",
4523 /* NUW= */ true, /* NSW= */ true);
4524 LLVMContext &Ctx = State.Builder.getContext();
4525
// With a block mask, interleave InterleaveFactor copies of it; without one,
// use an all-true splat of WideVF lanes.
4526 Value *GroupMask = nullptr;
4527 if (VPValue *BlockInMask = getMask()) {
4528 SmallVector<Value *> Ops(InterleaveFactor, State.get(BlockInMask));
4529 GroupMask = interleaveVectors(State.Builder, Ops, "interleaved.mask");
4530 } else {
4531 GroupMask =
4532 State.Builder.CreateVectorSplat(WideVF, State.Builder.getTrue());
4533 }
4534
4535 // Vectorize the interleaved load group.
4536 if (isa<LoadInst>(Instr)) {
4537 CallInst *NewLoad = State.Builder.CreateIntrinsic(
4538 VecTy, Intrinsic::vp_load, {ResAddr, GroupMask, InterleaveEVL}, nullptr,
4539 "wide.vp.load");
// Attach the group's alignment to the pointer argument (parameter 0).
4540 NewLoad->addParamAttr(0,
4541 Attribute::getWithAlignment(Ctx, Group->getAlign()));
4542
4543 applyMetadata(*NewLoad);
4544 // TODO: Also manage existing metadata using VPIRMetadata.
4545 Group->addMetadata(NewLoad);
4546
4547 // Scalable vectors cannot use arbitrary shufflevectors (only splats),
4548 // so must use intrinsics to deinterleave.
4549 NewLoad = State.Builder.CreateIntrinsic(
4550 Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor),
4551 NewLoad->getType(), NewLoad,
4552 /*FMFSource=*/nullptr, "strided.vec");
4553
4554 const DataLayout &DL = Instr->getDataLayout();
// I walks all member slots (including gaps); J counts only existing members
// and selects the defined value that receives the result.
4555 for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
4556 Instruction *Member = Group->getMember(I);
4557 // Skip the gaps in the group.
4558 if (!Member)
4559 continue;
4560
4561 Value *StridedVec = State.Builder.CreateExtractValue(NewLoad, I);
4562 // If this member has different type, cast the result type.
4563 if (Member->getType() != ScalarTy) {
4564 VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
4565 StridedVec =
4566 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
4567 }
4568
4569 State.set(getVPValue(J), StridedVec);
4570 ++J;
4571 }
4572 return;
4573 } // End for interleaved load.
4574
4575 // The sub vector type for current instruction.
4576 auto *SubVT = VectorType::get(ScalarTy, State.VF);
4577 // Vectorize the interleaved store group.
4578 ArrayRef<VPValue *> StoredValues = getStoredValues();
4579 // Collect the stored vector from each member.
4580 SmallVector<Value *, 4> StoredVecs;
4581 const DataLayout &DL = Instr->getDataLayout();
// StoredIdx indexes StoredValues and only advances for members that exist;
// gaps contribute a poison sub-vector instead.
4582 for (unsigned I = 0, StoredIdx = 0; I < InterleaveFactor; I++) {
4583 Instruction *Member = Group->getMember(I);
4584 // Skip the gaps in the group.
4585 if (!Member) {
4586 StoredVecs.push_back(PoisonValue::get(SubVT));
4587 continue;
4588 }
4589
4590 Value *StoredVec = State.get(StoredValues[StoredIdx]);
4591 // If this member has different type, cast it to a unified type.
4592 if (StoredVec->getType() != SubVT)
4593 StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
4594
4595 StoredVecs.push_back(StoredVec);
4596 ++StoredIdx;
4597 }
4598
4599 // Interleave all the smaller vectors into one wider vector.
4600 Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
4601 CallInst *NewStore =
4602 State.Builder.CreateIntrinsic(Type::getVoidTy(Ctx), Intrinsic::vp_store,
4603 {IVec, ResAddr, GroupMask, InterleaveEVL});
// Attach the group's alignment to the pointer argument (parameter 1).
4604 NewStore->addParamAttr(1,
4605 Attribute::getWithAlignment(Ctx, Group->getAlign()));
4606
4607 applyMetadata(*NewStore);
4608 // TODO: Also manage existing metadata using VPIRMetadata.
4609 Group->addMetadata(NewStore);
4610}
4611
4612#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4614 VPSlotTracker &SlotTracker) const {
// Debug/dump-only printer. Emits a header with the group's factor, the mask
// operand when one is present, and then one line per existing group member.
// NOTE(review): `IG` is used below, but its declaration is not visible in this
// listing.
4616 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << ", ";
4618 O << ", ";
4620 if (VPValue *Mask = getMask()) {
4621 O << ", ";
4622 Mask->printAsOperand(O, SlotTracker);
4623 }
4624
// OpIdx advances once per existing member; gaps (null members) are skipped.
4625 unsigned OpIdx = 0;
4626 for (unsigned i = 0; i < IG->getFactor(); ++i) {
4627 if (!IG->getMember(i))
4628 continue;
// A recipe with store operands prints "vp.store ... to index"; otherwise it
// prints "... = vp.load from index".
4629 if (getNumStoreOperands() > 0) {
4630 O << "\n" << Indent << " vp.store ";
4632 O << " to index " << i;
4633 } else {
4634 O << "\n" << Indent << " ";
4636 O << " = vp.load from index " << i;
4637 }
4638 ++OpIdx;
4639 }
4640}
4641#endif
4642
4644 VPCostContext &Ctx) const {
// Returns the cost of the whole interleave group for vectorization factor VF:
// the target's interleaved memory-op cost for the wide vector, plus one
// reverse shuffle per member when the group is reversed.
4645 Instruction *InsertPos = getInsertPos();
4646 // Find the VPValue index of the interleave group. We need to skip gaps.
4647 unsigned InsertPosIdx = 0;
// Bound the scan by the interleave factor so that it terminates even if
// InsertPos is not found among the members; a bare `IG->getFactor()` condition
// is always true and would rely solely on the `break` below.
4648 for (unsigned Idx = 0; Idx < IG->getFactor(); ++Idx)
4649 if (auto *Member = IG->getMember(Idx)) {
4650 if (Member == InsertPos)
4651 break;
4652 InsertPosIdx++;
4653 }
// Loads define values, stores only have stored operands; take the scalar type
// from whichever one corresponds to the insert position.
4654 const VPValue *ValV = getNumDefinedValues() > 0
4655 ? getVPValue(InsertPosIdx)
4656 : getStoredValues()[InsertPosIdx];
4657 Type *ValTy = ValV->getScalarType();
4658 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
4659 unsigned AS =
4660 cast<PointerType>(getAddr()->getScalarType())->getAddressSpace();
4661
4662 unsigned InterleaveFactor = IG->getFactor();
4663 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
4664
4665 // Holds the indices of existing members in the interleaved group.
// NOTE(review): `Indices` is filled below, but its declaration is not visible
// in this listing.
4667 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
4668 if (IG->getMember(IF))
4669 Indices.push_back(IF);
4670
4671 // Calculate the cost of the whole interleaved group.
4672 InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost(
4673 InsertPos->getOpcode(), WideVecTy, IG->getFactor(), Indices,
4674 IG->getAlign(), AS, Ctx.CostKind, getMask(), NeedsMaskForGaps);
4675
4676 if (!IG->isReverse())
4677 return Cost;
4678
// Reversed groups additionally pay one SK_Reverse shuffle per member.
4679 return Cost + IG->getNumMembers() *
4680 Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
4681 VectorTy, VectorTy, {}, Ctx.CostKind,
4682 0);
4683}
4684
// True when only scalar values of this recipe are used and, if IsScalable is
// set, only the first lane is used.
4686 return vputils::onlyScalarValuesUsed(this) &&
4687 (!IsScalable || vputils::onlyFirstLaneUsed(this));
4688}
4689
4690#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4692 raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
// Debug/dump-only printer for the "WIDEN-POINTER-INDUCTION" recipe. The recipe
// must have 3 or 5 operands; the extra separators are emitted only in the
// 5-operand form.
4693 assert((getNumOperands() == 3 || getNumOperands() == 5) &&
4694 "unexpected number of operands");
4695 O << Indent << "EMIT ";
4697 O << " = WIDEN-POINTER-INDUCTION ";
4699 O << ", ";
4701 O << ", ";
4703 if (getNumOperands() == 5) {
4704 O << ", ";
4706 O << ", ";
4708 }
4709}
4710
4712 VPSlotTracker &SlotTracker) const {
// Debug/dump-only printer: emits "EMIT ... = EXPAND SCEV " followed by the
// streamed form of *Expr.
4713 O << Indent << "EMIT ";
4715 O << " = EXPAND SCEV " << *Expr;
4716}
4717#endif
4718
4719#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4721 VPSlotTracker &SlotTracker) const {
// Debug/dump-only printer: emits "EMIT ... = WIDEN-CANONICAL-INDUCTION"
// followed by the recipe's flags.
4722 O << Indent << "EMIT ";
4724 O << " = WIDEN-CANONICAL-INDUCTION";
4725 printFlags(O);
4727}
4728#endif
4729
// Creates the "vector.recur" phi for a first-order recurrence and adds only
// its incoming value from the block that precedes the recipe's parent.
4731 auto &Builder = State.Builder;
4732 // Create a vector from the initial value.
4733 auto *VectorInit = getStartValue()->getLiveInIRValue();
4734
4735 Type *VecTy = State.VF.isScalar()
4736 ? VectorInit->getType()
4737 : VectorType::get(VectorInit->getType(), State.VF);
4738
4739 BasicBlock *VectorPH =
4740 State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
4741 if (State.VF.isVector()) {
// Place the scalar start value in the last lane (RuntimeVF - 1) of a poison
// vector. The insertelement is emitted before VectorPH's terminator; the
// guard restores the builder's insert point afterwards.
4742 auto *IdxTy = Builder.getInt32Ty();
4743 auto *One = ConstantInt::get(IdxTy, 1);
4744 IRBuilder<>::InsertPointGuard Guard(Builder);
4745 Builder.SetInsertPoint(VectorPH->getTerminator());
4746 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
4747 auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
4748 VectorInit = Builder.CreateInsertElement(
4749 PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
4750 }
4751
4752 // Create a phi node for the new recurrence.
4753 PHINode *Phi = PHINode::Create(VecTy, 2, "vector.recur");
4754 Phi->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
4755 Phi->addIncoming(VectorInit, VectorPH);
4756 State.set(this, Phi);
4757}
4758
4761 VPCostContext &Ctx) const {
// A scalar VF is charged the target's control-flow cost of a PHI; any other VF
// is costed as 0 here.
4762 if (VF.isScalar())
4763 return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
4764
4765 return 0;
4766}
4767
4768#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4770 raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
// Debug/dump-only printer: emits the "FIRST-ORDER-RECURRENCE-PHI " tag and the
// " = phi " separator.
4771 O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
4773 O << " = phi ";
4775}
4776#endif
4777
4779 // Reductions do not have to start at zero. They can start with
4780 // any loop invariant values.
4781 VPValue *StartVPV = getStartValue();
4782
4783 // In order to support recurrences we need to be able to vectorize Phi nodes.
4784 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4785 // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4786 // this value when we vectorize all of the instructions that use the PHI.
4787 BasicBlock *VectorPH =
4788 State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
// The phi is scalar when the VF is scalar or the reduction is in-loop; the
// start value is fetched in the matching (scalar or vector) form and its type
// becomes the phi's type.
4789 bool ScalarPHI = State.VF.isScalar() || isInLoop();
4790 Value *StartV = State.get(StartVPV, ScalarPHI);
4791 Type *VecTy = StartV->getType();
4792
4793 BasicBlock *HeaderBB = State.CFG.PrevBB;
4794 assert(State.CurrentParentLoop->getHeader() == HeaderBB &&
4795 "recipe must be in the vector loop header");
4796 auto *Phi = PHINode::Create(VecTy, 2, "vec.phi");
4797 Phi->insertBefore(HeaderBB->getFirstInsertionPt());
4798 State.set(this, Phi, isInLoop());
4799
// Only the incoming value from VectorPH is added here.
4800 Phi->addIncoming(StartV, VectorPH);
4801}
4802
4803#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4805 VPSlotTracker &SlotTracker) const {
// Debug/dump-only printer: emits the "WIDEN-REDUCTION-PHI " tag, the
// recurrence kind in parentheses, the flags, and a VF-scaling note when the
// scale factor is greater than 1.
4806 O << Indent << "WIDEN-REDUCTION-PHI ";
4807
4809 O << " = phi (";
4810 printRecurrenceKind(O, Kind);
4811 O << ")";
4812 printFlags(O);
4814 if (getVFScaleFactor() > 1)
4815 O << " (VF scaled by 1/" << getVFScaleFactor() << ")";
4816}
4817#endif
4818
// Op must be one of this recipe's operands. The result does not depend on
// which operand it is: it is whether only the first lane of this recipe is
// used.
4820 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
4821 return vputils::onlyFirstLaneUsed(this);
4822}
4823
// Creates a phi with the type of operand 0's generated value and room for two
// incoming values, and records it as this recipe's result. No incoming values
// are added here.
4825 Value *Op0 = State.get(getOperand(0));
4826 Type *VecTy = Op0->getType();
4827 Instruction *VecPhi = State.Builder.CreatePHI(VecTy, 2, Name);
4828 State.set(this, VecPhi);
4829}
4830
4832 VPCostContext &Ctx) const {
// Costed as the target's control-flow cost of a PHI, independent of VF.
4833 return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
4834}
4835
4836#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4838 VPSlotTracker &SlotTracker) const {
// Debug/dump-only printer: emits the "WIDEN-PHI " tag and the " = phi "
// separator.
4839 O << Indent << "WIDEN-PHI ";
4840
4842 O << " = phi ";
4844}
4845#endif
4846
// Creates the "active.lane.mask" phi with the type of operand 0's generated
// value and adds only the incoming value from the block that precedes the
// recipe's parent.
4848 BasicBlock *VectorPH =
4849 State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
4850 Value *StartMask = State.get(getOperand(0));
4851 PHINode *Phi =
4852 State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
4853 Phi->addIncoming(StartMask, VectorPH);
4854 State.set(this, Phi);
4855}
4856
4857#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4859 VPSlotTracker &SlotTracker) const {
// Debug/dump-only printer: emits the "ACTIVE-LANE-MASK-PHI " tag and the
// " = phi " separator.
4860 O << Indent << "ACTIVE-LANE-MASK-PHI ";
4861
4863 O << " = phi ";
4865}
4866#endif
4867
4868#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4870 raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
// Debug/dump-only printer: emits the "CURRENT-ITERATION-PHI " tag and the
// " = phi " separator.
4871 O << Indent << "CURRENT-ITERATION-PHI ";
4872
4874 O << " = phi ";
4876}
4877#endif
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static MCDisassembler::DecodeStatus addOperand(MCInst &Inst, const MCOperand &Opnd)
AMDGPU Lower Kernel Arguments
AMDGPU Register Bank Select
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static const Function * getParent(const Value *V)
#define X(NUM, ENUM, NAME)
Definition ELF.h:853
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static void replaceAllUsesWith(Value *Old, Value *New, SmallPtrSet< BasicBlock *, 32 > &FreshBBs, bool IsHuge)
Replace all old uses with new ones, and push the updated BBs into FreshBBs.
Value * getPointer(Value *Ptr)
iv users
Definition IVUsers.cpp:48
static constexpr Value * getValue(Ty &ValueOrUse)
static Value * getOpcode(Value &V, Type &Ty, InstrumentationConfig &IConf, InstrumentorIRBuilderTy &IIRB)
static std::pair< Value *, APInt > getMask(Value *WideMask, unsigned Factor, ElementCount LeafValueEC)
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
This file provides a LoopVectorizationPlanner class.
static const SCEV * getAddressAccessSCEV(Value *Ptr, PredicatedScalarEvolution &PSE, const Loop *TheLoop)
Gets the address access SCEV for Ptr, if it should be used for cost modeling according to isAddressSC...
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static const Function * getCalledFunction(const Value *V)
static bool isOrdered(const Instruction *I)
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
const SmallVectorImpl< MachineOperand > & Cond
This file contains some templates that are useful if you are working with the STL at all.
This file defines less commonly used SmallVector utilities.
This file defines the SmallVector class.
#define LLVM_DEBUG(...)
Definition Debug.h:119
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file contains the declarations of different VPlan-related auxiliary helpers.
static Value * interleaveVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vals, const Twine &Name)
Return a vector containing interleaved elements from multiple smaller input vectors.
static Value * createBitOrPointerCast(IRBuilderBase &Builder, Value *V, VectorType *DstVTy, const DataLayout &DL)
static Instruction::BinaryOps getSubRecurOpcode(RecurKind Kind)
SmallVector< Value *, 2 > VectorParts
static void printRecurrenceKind(raw_ostream &OS, const RecurKind &Kind)
static unsigned getCalledFnOperandIndex(ArrayRef< VPValue * > Operands)
For call VPInstruction operands, return the operand index of the called function.
This file contains the declarations of the Vectorization Plan base classes:
void printAsOperand(OutputBuffer &OB, Prec P=Prec::Default, bool StrictlyWorse=false) const
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition APInt.h:1157
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
Get the array size.
Definition ArrayRef.h:141
bool empty() const
Check if the array is empty.
Definition ArrayRef.h:136
static LLVM_ABI Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
This class represents a function call, abstracting a target machine's calling convention.
static LLVM_ABI bool isBitOrNoopPointerCastable(Type *SrcTy, Type *DestTy, const DataLayout &DL)
Check whether a bitcast, inttoptr, or ptrtoint cast between these types is valid and a no-op.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:740
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:763
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:765
static LLVM_ABI StringRef getPredicateName(Predicate P)
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
void setSuccessor(unsigned idx, BasicBlock *NewSucc)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:159
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
A debug info location.
Definition DebugLoc.h:124
static DebugLoc getUnknown()
Definition DebugLoc.h:151
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
LLVM_ABI void print(raw_ostream &O) const
Print fast-math flags to O.
Definition Operator.cpp:283
void setAllowContract(bool B=true)
Definition FMF.h:90
bool noSignedZeros() const
Definition FMF.h:67
bool noInfs() const
Definition FMF.h:66
void setAllowReciprocal(bool B=true)
Definition FMF.h:87
bool allowReciprocal() const
Definition FMF.h:68
void setNoSignedZeros(bool B=true)
Definition FMF.h:84
bool allowReassoc() const
Flag queries.
Definition FMF.h:64
bool approxFunc() const
Definition FMF.h:70
void setNoNaNs(bool B=true)
Definition FMF.h:78
void setAllowReassoc(bool B=true)
Flag setters.
Definition FMF.h:75
bool noNaNs() const
Definition FMF.h:65
void setApproxFunc(bool B=true)
Definition FMF.h:93
void setNoInfs(bool B=true)
Definition FMF.h:81
bool allowContract() const
Definition FMF.h:69
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
bool willReturn() const
Determine if the function will return.
Definition Function.h:669
Intrinsic::ID getIntrinsicID() const LLVM_READONLY
getIntrinsicID - This method returns the ID number of the specified function, or Intrinsic::not_intri...
Definition Function.h:246
bool doesNotThrow() const
Determine if the function cannot unwind.
Definition Function.h:602
bool doesNotAccessMemory() const
Determine if the function does not access memory.
Definition Function.cpp:867
Type * getReturnType() const
Returns the type of the ret val.
Definition Function.h:216
Represents flags for the getelementptr instruction/expression.
static GEPNoWrapFlags none()
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2637
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition IRBuilder.h:571
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition IRBuilder.h:2691
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2625
LLVM_ABI Value * CreateVectorSpliceRight(Value *V1, Value *V2, Value *Offset, const Twine &Name="")
Create a vector.splice.right intrinsic call, or a shufflevector that produces the same result if the ...
CondBrInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition IRBuilder.h:1238
LLVM_ABI Value * CreateSelectFMF(Value *C, Value *True, Value *False, FMFSource FMFSource, const Twine &Name="", Instruction *MDFrom=nullptr)
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains.
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition IRBuilder.h:2684
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > OverloadTypes, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="", ArrayRef< OperandBundleDef > OpBundles={})
Create a call to intrinsic ID with Args, mangled using OverloadTypes.
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition IRBuilder.h:2703
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition IRBuilder.h:586
Value * CreatePtrAdd(Value *Ptr, Value *Offset, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition IRBuilder.h:2101
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr, FMFSource FMFSource={})
Definition IRBuilder.h:2286
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition IRBuilder.h:352
LLVM_ABI Value * CreateVectorReverse(Value *V, const Twine &Name="")
Return a vector value that contains the vector V reversed.
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2388
LLVM_ABI CallInst * CreateOrReduce(Value *Src)
Create a vector int OR reduction intrinsic of the source vector.
Value * CreateLogicalAnd(Value *Cond1, Value *Cond2, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition IRBuilder.h:1792
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition IRBuilder.h:529
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2518
Value * CreateNot(Value *V, const Twine &Name="")
Definition IRBuilder.h:1876
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2384
Value * CreateCountTrailingZeroElems(Type *ResTy, Value *Mask, bool ZeroIsPoison=true, const Twine &Name="")
Create a call to llvm.experimental_cttz_elts.
Definition IRBuilder.h:1176
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1461
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition IRBuilder.h:2130
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1444
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition IRBuilder.h:514
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1753
Value * CreateICmpUGE(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2396
Value * CreateLogicalOr(Value *Cond1, Value *Cond2, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition IRBuilder.h:1800
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2494
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="", bool IsDisjoint=false)
Definition IRBuilder.h:1614
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1478
LLVM_ABI Value * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *Op, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
@ IK_IntInduction
Integer induction variable. Step = C.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
bool isBinaryOp() const
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
const char * getOpcodeName() const
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
bool isUnaryOp() const
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:350
The group of interleaved loads/stores sharing the same stride and close to each other.
uint32_t getFactor() const
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
bool isReverse() const
InstTy * getInsertPos() const
void addMetadata(InstTy *NewInst) const
Add metadata (e.g. alias info) from the instructions in this group to NewInst.
Align getAlign() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
Information for memory intrinsic cost model.
Root of the metadata hierarchy.
Definition Metadata.h:64
LLVM_ABI void print(raw_ostream &OS, const Module *M=nullptr, bool IsForDebug=false) const
Print.
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
static bool isAnyOfRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,y) is loop invariant.
static LLVM_ABI bool isSubRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is for a sub operation.
static bool isFindIVRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class represents an analyzed expression in the program.
This class represents the LLVM 'select' instruction.
This class provides computation of slot numbers for LLVM Assembly writing.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
reference emplace_back(ArgTypes &&... Args)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition StringRef.h:56
VectorInstrContext
Represents a hint about the context in which an insert/extract is used.
@ None
The insert/extract is not used with a load/store.
@ Load
The value being inserted comes from a load (InsertElement only).
@ Store
The extracted value is stored (ExtractElement only).
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
@ TCC_Free
Expected to fold away in lowering.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Reverse
Reverse the order of the vector.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ Normal
The cast is used with a normal load/store.
@ Interleave
The cast is used with an interleaved load/store.
@ GatherScatter
The cast is used with a gather/scatter.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
bool isByteTy() const
True if this is an instance of ByteType.
Definition Type.h:242
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:288
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:309
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:282
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:282
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:368
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:276
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:232
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:306
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:313
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:141
value_op_iterator value_op_end()
Definition User.h:288
void setOperand(unsigned i, Value *Val)
Definition User.h:212
Value * getOperand(unsigned i) const
Definition User.h:207
value_op_iterator value_op_begin()
Definition User.h:285
void execute(VPTransformState &State) override
Generate the active lane mask phi of the vector loop.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:4399
RecipeListTy & getRecipeList()
Returns a reference to the list of recipes.
Definition VPlan.h:4452
iterator end()
Definition VPlan.h:4436
void insert(VPRecipeBase *Recipe, iterator InsertPt)
Definition VPlan.h:4465
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPBlendRecipe.
VPValue * getIncomingValue(unsigned Idx) const
Return incoming value number Idx.
Definition VPlan.h:3008
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:3003
bool usesFirstLaneOnly(const VPValue *Op) const override
Returns true if the recipe only uses the first lane of operand Op.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2999
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:94
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:222
VPlan * getPlan()
Definition VPlan.cpp:211
void printAsOperand(raw_ostream &OS, bool PrintType=false) const
Definition VPlan.h:364
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPBranchOnMaskRecipe.
void execute(VPTransformState &State) override
Generate the extraction of the appropriate bit from the block mask and the conditional branch.
VPlan-based builder utility analogous to IRBuilder.
LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:561
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:534
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:546
ArrayRef< VPRecipeValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:556
InductionDescriptor::InductionKind getInductionKind() const
Definition VPlan.h:4220
VPValue * getIndex() const
Definition VPlan.h:4217
VPIRValue * getStartValue() const
Definition VPlan.h:4216
VPValue * getStepValue() const
Definition VPlan.h:4218
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPDerivedIVRecipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPExpandSCEVRecipe(const SCEV *Expr)
bool isVectorToScalar() const
Returns true if this VPExpressionRecipe produces a single scalar.
void decompose()
Insert the recipes of the expression back into the VPlan, directly before the current recipe.
bool mayHaveSideEffects() const
Returns true if this expression contains recipes that may have side effects.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Compute the cost of this recipe either using a recipe's specialized implementation or using the legac...
bool mayReadOrWriteMemory() const
Returns true if this expression contains recipes that may read from or write to memory.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this header phi recipe.
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2473
void execute(VPTransformState &State) override
Produce a vectorized histogram operation.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPHistogramRecipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPValue * getMask() const
Return the mask operand if one was provided, or a null pointer if all lanes should be executed uncond...
Definition VPlan.h:2186
Class to record and manage LLVM IR flags.
Definition VPlan.h:694
FastMathFlagsTy FMFs
Definition VPlan.h:782
ReductionFlagsTy ReductionFlags
Definition VPlan.h:784
LLVM_ABI_FOR_TEST bool hasRequiredFlagsForOpcode(unsigned Opcode) const
Returns true if Opcode has its required flags set.
LLVM_ABI_FOR_TEST bool flagsValidForOpcode(unsigned Opcode) const
Returns true if the set flags are valid for Opcode.
static VPIRFlags getDefaultFlags(unsigned Opcode)
Returns default flags for Opcode for opcodes that support it, asserts otherwise.
WrapFlagsTy WrapFlags
Definition VPlan.h:776
void printFlags(raw_ostream &O) const
bool hasFastMathFlags() const
Returns true if the recipe has fast-math flags.
Definition VPlan.h:999
LLVM_ABI_FOR_TEST FastMathFlags getFastMathFlags() const
bool isReductionOrdered() const
Definition VPlan.h:1063
TruncFlagsTy TruncFlags
Definition VPlan.h:777
CmpInst::Predicate getPredicate() const
Definition VPlan.h:971
ExactFlagsTy ExactFlags
Definition VPlan.h:779
void intersectFlags(const VPIRFlags &Other)
Only keep flags also present in Other.
uint8_t GEPFlagsStorage
Definition VPlan.h:780
GEPNoWrapFlags getGEPNoWrapFlags() const
Definition VPlan.h:989
bool hasPredicate() const
Returns true if the recipe has a comparison predicate.
Definition VPlan.h:994
DisjointFlagsTy DisjointFlags
Definition VPlan.h:778
FCmpFlagsTy FCmpFlags
Definition VPlan.h:783
NonNegFlagsTy NonNegFlags
Definition VPlan.h:781
bool isReductionInLoop() const
Definition VPlan.h:1069
void applyFlags(Instruction &I) const
Apply the IR flags to I.
Definition VPlan.h:928
uint8_t CmpPredStorage
Definition VPlan.h:775
RecurKind getRecurKind() const
Definition VPlan.h:1057
void execute(VPTransformState &State) override
The method which generates the output IR instructions that correspond to this VPRecipe, thereby "executing" the VPlan.
LLVM_ABI_FOR_TEST InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPIRInstruction.
VPIRInstruction(Instruction &I)
VPIRInstruction::create() should be used to create VPIRInstructions, as subclasses may need to be created, e.g. VPIRPhi.
Definition VPlan.h:1720
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetadata object with MD, keeping only metadata nodes that are common to both.
VPIRMetadata()=default
void print(raw_ostream &O, VPSlotTracker &SlotTracker) const
Print metadata with node IDs.
void applyMetadata(Instruction &I) const
Add all metadata to I.
Type * getResultType() const
Definition VPlan.h:1588
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the instruction.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:1225
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPInstruction.
VPInstruction(unsigned Opcode, ArrayRef< VPValue * > Operands, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", Type *ResultTy=nullptr)
bool doesGeneratePerAllLanes() const
Returns true if this VPInstruction generates scalar values for all lanes.
@ ExtractLastActive
Extracts the last active lane from a set of vectors.
Definition VPlan.h:1327
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1318
@ ExitingIVValue
Compute the exiting value of a wide induction after vectorization, that is the value of the last lane...
Definition VPlan.h:1331
@ WideIVStep
Scale the first operand (vector step) by the second operand (scalar-step).
Definition VPlan.h:1343
@ ResumeForEpilogue
Explicit user for the resume phi of the canonical induction in the main VPlan, used by the epilogue v...
Definition VPlan.h:1321
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1268
@ ReductionStartVector
Start vector for reductions with 3 operands: the original start value, the identity value for the red...
Definition VPlan.h:1314
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1263
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition VPlan.h:1260
@ VScale
Returns the value for vscale.
Definition VPlan.h:1347
@ CanonicalIVIncrementForPart
Definition VPlan.h:1244
@ ComputeReductionResult
Reduce the operands to the final reduction result using the operation specified via the operation's V...
Definition VPlan.h:1271
bool hasResult() const
Definition VPlan.h:1437
bool opcodeMayReadOrWriteFromMemory() const
Returns true if the underlying opcode may read from or write to memory.
LLVM_DUMP_METHOD void dump() const
Print the VPInstruction to dbgs() (for debugging).
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the VPInstruction to O.
StringRef getName() const
Returns the symbolic name assigned to the VPInstruction.
Definition VPlan.h:1519
unsigned getOpcode() const
Definition VPlan.h:1416
bool usesFirstLaneOnly(const VPValue *Op) const override
Returns true if the recipe only uses the first lane of operand Op.
void addOperand(VPValue *Op)
Add Op as operand of this VPInstruction.
bool isVectorToScalar() const
Returns true if this VPInstruction produces a scalar value from a vector, e.g. by performing a reduction or extracting a lane.
bool isSingleScalar() const
Returns true if this VPInstruction's operands are single scalars and the result is also a single scal...
unsigned getNumOperandsForOpcode() const
Return the number of operands determined by the opcode of the VPInstruction, excluding mask.
bool isMasked() const
Returns true if the VPInstruction has a mask operand.
Definition VPlan.h:1462
void execute(VPTransformState &State) override
Generate the instruction.
bool usesFirstPartOnly(const VPValue *Op) const override
Returns true if the recipe only uses the first part of operand Op.
bool needsMaskForGaps() const
Return true if the access needs a mask because of the gaps.
Definition VPlan.h:3113
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this recipe.
Instruction * getInsertPos() const
Definition VPlan.h:3117
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:3115
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:3107
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:3136
VPValue * getAddr() const
Return the address accessed by this recipe.
Definition VPlan.h:3101
VPValue * getEVL() const
The VPValue of the explicit vector length.
Definition VPlan.h:3210
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
unsigned getNumStoreOperands() const override
Returns the number of stored operands of this interleave group.
Definition VPlan.h:3223
void execute(VPTransformState &State) override
Generate the wide load or store, and shuffles.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
unsigned getNumStoreOperands() const override
Returns the number of stored operands of this interleave group.
Definition VPlan.h:3173
void execute(VPTransformState &State) override
Generate the wide load or store, and shuffles.
In what follows, the term "input IR" refers to code that is fed into the vectorizer whereas the term ...
static VPLane getLastLaneForVF(const ElementCount &VF)
static VPLane getLaneFromEnd(const ElementCount &VF, unsigned Offset)
static VPLane getFirstLane()
virtual const VPRecipeBase * getAsRecipe() const =0
Return a VPRecipeBase* to the current object.
VPValue * getIncomingValueForBlock(const VPBasicBlock *VPBB) const
Returns the incoming value for VPBB. VPBB must be an incoming block.
virtual unsigned getNumIncoming() const
Returns the number of incoming values, also number of incoming blocks.
Definition VPlan.h:1624
void removeIncomingValueFor(VPBlockBase *IncomingBlock) const
Removes the incoming value for IncomingBlock, which must be a predecessor.
const VPBasicBlock * getIncomingBlock(unsigned Idx) const
Returns the incoming block with index Idx.
Definition VPlan.h:4543
detail::zippy< llvm::detail::zip_first, VPUser::const_operand_range, const_incoming_blocks_range > incoming_values_and_blocks() const
Returns an iterator range over pairs of incoming values and corresponding incoming blocks.
Definition VPlan.h:1649
VPValue * getIncomingValue(unsigned Idx) const
Returns the incoming VPValue with index Idx.
Definition VPlan.h:1609
void printPhiOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const
Print the recipe.
void setIncomingValueForBlock(const VPBasicBlock *VPBB, VPValue *V) const
Sets the incoming value for VPBB to V.
void execute(VPTransformState &State) override
Generates phi nodes for live-outs (from a replicate region) as needed to retain SSA form.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:402
bool mayReadFromMemory() const
Returns true if the recipe may read from memory.
bool mayHaveSideEffects() const
Returns true if the recipe may have side-effects.
virtual void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const =0
Each concrete VPRecipe prints itself, without printing common information, like debug info or metadat...
VPRegionBlock * getRegion()
Definition VPlan.h:4744
LLVM_ABI_FOR_TEST void dump() const
Dump the recipe to stderr (for debugging).
Definition VPlan.cpp:117
bool isPhi() const
Returns true for PHI-like recipes.
bool mayWriteToMemory() const
Returns true if the recipe may write to memory.
virtual InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const
Compute the cost of this recipe either using a recipe's specialized implementation or using the legacy cost model and the underlying instructions.
VPBasicBlock * getParent()
Definition VPlan.h:477
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:555
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
bool isSafeToSpeculativelyExecute() const
Return true if we can safely execute this recipe unconditionally even if it is masked originally.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
InstructionCost cost(ElementCount VF, VPCostContext &Ctx)
Return the cost of this recipe, taking into account if the cost computation should be skipped and the...
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const
Print the recipe, delegating to printRecipe().
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
unsigned getVPRecipeID() const
Definition VPlan.h:523
void moveAfter(VPRecipeBase *MovePos)
Unlink this recipe from its current VPBasicBlock and insert it into the VPBasicBlock that MovePos liv...
VPRecipeBase(const unsigned char SC, ArrayRef< VPValue * > Operands, DebugLoc DL=DebugLoc::getUnknown())
Definition VPlan.h:467
Type * getScalarType() const
Returns the scalar type of this VPRecipeValue.
Definition VPlanValue.h:337
friend class VPValue
Definition VPlanValue.h:316
void execute(VPTransformState &State) override
Generate the reduction in the loop.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPValue * getEVL() const
The VPValue of the explicit vector length.
Definition VPlan.h:3382
unsigned getVFScaleFactor() const
Get the factor that the VF of this recipe's output should be scaled by, or 1 if it isn't scaled.
Definition VPlan.h:2909
bool isInLoop() const
Returns true if the phi is part of an in-loop reduction.
Definition VPlan.h:2933
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the phi/select nodes.
bool isConditional() const
Return true if the in-loop reduction is conditional.
Definition VPlan.h:3324
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of VPReductionRecipe.
VPValue * getVecOp() const
The VPValue of the vector value to be reduced.
Definition VPlan.h:3335
VPValue * getCondOp() const
The VPValue of the condition for the block.
Definition VPlan.h:3337
RecurKind getRecurrenceKind() const
Return the recurrence kind for the in-loop reduction.
Definition VPlan.h:3320
bool isPartialReduction() const
Returns true if the reduction outputs a vector with a scaled down VF.
Definition VPlan.h:3326
VPValue * getChainOp() const
The VPValue of the scalar Chain being accumulated.
Definition VPlan.h:3333
bool isInLoop() const
Returns true if the reduction is in-loop.
Definition VPlan.h:3328
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the reduction in the loop.
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4609
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4685
void execute(VPTransformState &State) override
Generate replicas of the desired Ingredient.
bool isSingleScalar() const
Definition VPlan.h:3460
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPReplicateRecipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
static Type * computeScalarType(const Instruction *I, ArrayRef< VPValue * > Operands)
Compute the scalar result type for a VPReplicateRecipe wrapping I with Operands (excluding any predic...
static InstructionCost computeCallCost(Function *CalledFn, Type *ResultTy, ArrayRef< const VPValue * > ArgOps, bool IsSingleScalar, ElementCount VF, VPCostContext &Ctx)
Return the cost of scalarizing a call to CalledFn with argument operands ArgOps for a given VF.
unsigned getOpcode() const
Definition VPlan.h:3484
VPValue * getStepValue() const
Definition VPlan.h:4288
VPValue * getStartIndex() const
Return the StartIndex, or null if known to be zero, valid only after unrolling.
Definition VPlan.h:4296
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the scalarized versions of the phi node as needed by their users.
VPSingleDefRecipe is a base class for recipes that model a sequence of one or more output IR that def...
Definition VPlan.h:608
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:679
LLVM_ABI_FOR_TEST LLVM_DUMP_METHOD void dump() const
Print this VPSingleDefRecipe to dbgs() (for debugging).
VPSingleDefRecipe(const unsigned char SC, ArrayRef< VPValue * > Operands, DebugLoc DL=DebugLoc::getUnknown())
Definition VPlan.h:610
This class can be used to assign names to VPValues.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:384
void printOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const
Print the operands to O.
Definition VPlan.cpp:1527
operand_range operands()
Definition VPlanValue.h:457
unsigned getNumOperands() const
Definition VPlanValue.h:424
operand_iterator op_end()
Definition VPlanValue.h:455
operand_iterator op_begin()
Definition VPlanValue.h:453
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:425
void addOperand(VPValue *Operand)
Definition VPlanValue.h:410
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:50
Type * getScalarType() const
Returns the scalar type of this VPValue, dispatching based on the concrete subclass.
Definition VPlan.cpp:149
Value * getLiveInIRValue() const
Return the underlying IR value for a VPIRValue.
Definition VPlan.cpp:143
bool isDefinedOutsideLoopRegions() const
Returns true if the VPValue is defined outside any loop.
Definition VPlan.cpp:1478
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:130
void printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const
Definition VPlan.cpp:1523
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:75
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:208
VPUser * getSingleUser()
Return the single user of this value, or nullptr if there is not exactly one user.
Definition VPlanValue.h:178
VPValue * getVFValue() const
Definition VPlan.h:2288
void execute(VPTransformState &State) override
The method which generates the output IR instructions that correspond to this VPRecipe,...
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
Type * getSourceElementType() const
Definition VPlan.h:2285
int64_t getStride() const
Definition VPlan.h:2286
void materializeOffset(unsigned Part=0)
Adds the offset operand to the recipe.
VPValue * getStride() const
Definition VPlan.h:2362
Type * getSourceElementType() const
Definition VPlan.h:2377
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
The method which generates the output IR instructions that correspond to this VPRecipe,...
VPValue * getVFxPart() const
Definition VPlan.h:2364
bool usesFirstLaneOnly(const VPValue *Op) const override
Returns true if the recipe only uses the first lane of operand Op.
operand_range args()
Definition VPlan.h:2137
Function * getCalledScalarFunction() const
Definition VPlan.h:2133
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenCallRecipe.
void execute(VPTransformState &State) override
Produce a widened version of the call instruction.
static InstructionCost computeCallCost(Function *Variant, VPCostContext &Ctx)
Return the cost of widening a call using the vector function Variant.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
LLVM_ABI_FOR_TEST void execute(VPTransformState &State) override
Produce widened copies of the cast.
LLVM_ABI_FOR_TEST InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenCastRecipe.
void execute(VPTransformState &State) override
Generate the gep nodes.
Type * getSourceElementType() const
Definition VPlan.h:2242
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
bool usesFirstLaneOnly(const VPValue *Op) const override
Returns true if the recipe only uses the first lane of operand Op.
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2565
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2568
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2671
TruncInst * getTruncInst()
Returns the first defined value as TruncInst, if it is one or nullptr otherwise.
Definition VPlan.h:2686
bool isCanonical() const
Returns true if the induction is canonical, i.e.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
CallInst * createVectorCall(VPTransformState &State)
Helper function to produce the widened intrinsic call.
Intrinsic::ID getVectorIntrinsicID() const
Return the ID of the intrinsic.
Definition VPlan.h:2022
LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
StringRef getIntrinsicName() const
Return the name of the intrinsic as a string.
static InstructionCost computeCallCost(Intrinsic::ID ID, ArrayRef< const VPValue * > Operands, const VPRecipeWithIRFlags &R, ElementCount VF, VPCostContext &Ctx)
Compute the cost of a vector intrinsic with ID and Operands.
LLVM_ABI_FOR_TEST bool usesFirstLaneOnly(const VPValue *Op) const override
Returns true if the VPUser only uses the first lane of operand Op.
LLVM_ABI_FOR_TEST void execute(VPTransformState &State) override
Produce a widened version of the vector intrinsic.
LLVM_ABI_FOR_TEST InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this vector intrinsic.
static InstructionCost computeMemIntrinsicCost(Intrinsic::ID IID, Type *Ty, bool IsMasked, Align Alignment, VPCostContext &Ctx)
Helper function for computing the cost of vector memory intrinsic.
void execute(VPTransformState &State) override
Produce a widened version of the vector memory intrinsic.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this vector memory intrinsic.
bool IsMasked
Whether the memory access is masked.
Definition VPlan.h:3751
bool isConsecutive() const
Return whether the loaded-from / stored-to addresses are consecutive.
Definition VPlan.h:3776
Instruction & Ingredient
Definition VPlan.h:3742
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const
Return the cost of this VPWidenMemoryRecipe.
bool Consecutive
Whether the accessed addresses are consecutive.
Definition VPlan.h:3748
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:3786
Align Alignment
Alignment information for this memory access.
Definition VPlan.h:3745
virtual VPRecipeBase * getAsRecipe()=0
Return a VPRecipeBase* to the current object.
VPValue * getAddr() const
Return the address accessed by this recipe.
Definition VPlan.h:3779
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenPHIRecipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the phi/select nodes.
bool onlyScalarsGenerated(bool IsScalable)
Returns true if only scalar values will be generated.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenRecipe.
void execute(VPTransformState &State) override
Produce a widened instruction using the opcode and operands of the recipe, processing State....
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition VPlan.h:4757
const DataLayout & getDataLayout() const
Definition VPlan.h:4962
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4916
VPIRValue * getConstantInt(Type *Ty, uint64_t Val, bool IsSigned=false)
Return a VPIRValue wrapping a ConstantInt with the given type and value.
Definition VPlan.h:5064
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition Value.cpp:393
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:258
void mutateType(Type *Ty)
Mutate the type of this Value to be of the specified type.
Definition Value.h:806
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition TypeSize.h:252
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
iterator erase(iterator where)
Definition ilist.h:204
pointer remove(iterator &IT)
Definition ilist.h:188
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
LLVM_ABI Intrinsic::ID getDeinterleaveIntrinsicID(unsigned Factor)
Returns the corresponding llvm.vector.deinterleaveN intrinsic for factor N.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
LLVM_ABI StringRef getBaseName(ID id)
Return the LLVM name for an intrinsic, without encoded types for overloading, such as "llvm....
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
match_combine_or< Ty... > m_CombineOr(const Ty &...Ps)
Combine pattern matchers matching any of Ps patterns.
auto m_Cmp()
Matches any compare instruction and ignores it.
bool match(Val *V, const Pattern &P)
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
LogicalOp_match< LHS, RHS, Instruction::And, true > m_c_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R with LHS and RHS in either order.
LogicalOp_match< LHS, RHS, Instruction::Or, true > m_c_LogicalOr(const LHS &L, const RHS &R)
Matches L || R with LHS and RHS in either order.
specific_intval< 1 > m_False()
specific_intval< 1 > m_True()
auto m_VPValue()
Match an arbitrary VPValue and ignore it.
VPInstruction_match< VPInstruction::BranchOnCond > m_BranchOnCond()
VPInstruction_match< VPInstruction::Reverse, Op0_t > m_Reverse(const Op0_t &Op0)
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
Definition BasicBlock.h:73
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
bool isAddressSCEVForCost(const SCEV *Addr, ScalarEvolution &SE, const Loop *L)
Returns true if Addr is an address SCEV that can be passed to TTI::getAddressComputationCost,...
bool onlyFirstPartUsed(const VPValue *Def)
Returns true if only the first part of Def is used.
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
bool isUsedByLoadStoreAddress(const VPValue *V)
Returns true if V is used as part of the address of another load or store.
const SCEV * getSCEVExprForVPValue(const VPValue *V, PredicatedScalarEvolution &PSE, const Loop *L=nullptr)
Return the SCEV expression for V.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
LLVM_ABI Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
@ Offset
Definition DWP.cpp:558
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
Definition STLExtras.h:830
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
InstructionCost Cost
@ Undef
Value of the register doesn't matter.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
auto map_to_vector(ContainerTy &&C, FuncTy &&F)
Map a range to a SmallVector with element types deduced from the mapping.
Value * getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF)
Return the runtime value for VF.
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2207
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
Definition STLExtras.h:2312
auto cast_or_null(const Y &Val)
Definition Casting.h:714
LLVM_ABI Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
Align getLoadStoreAlignment(const Value *I)
A helper function that returns the alignment of load or store instruction.
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
LLVM_ABI Value * createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left, Value *Right)
Returns a Min/Max operation corresponding to MinMaxRecurrenceKind.
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
static Error getOffset(const SymbolRef &Sym, SectionRef Sec, uint64_t &Result)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
LLVM_ABI Constant * createBitMaskForGaps(IRBuilderBase &Builder, unsigned VF, const InterleaveGroup< Instruction > &Group)
Create a mask that filters the members of an interleave group where there are gaps.
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
ElementCount getVectorizedTypeVF(Type *Ty)
Returns the number of vector elements for a vectorized type.
LLVM_ABI llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1752
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
Type * toVectorizedTy(Type *Ty, ElementCount EC)
A helper for converting to vectorized types.
cl::opt< unsigned > ForceTargetInstructionCost
LLVM_ABI Type * computeScalarTypeForInstruction(unsigned Opcode, ArrayRef< VPValue * > Operands)
Compute the scalar result type for an IR Opcode given Operands.
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:322
LLVM_ABI bool isVectorIntrinsicWithStructReturnOverloadAtField(Intrinsic::ID ID, int RetIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic that returns a struct is overloaded at the struct elem...
@ Other
Any other memory.
Definition ModRef.h:68
static const MachineInstrBuilder & addOffset(const MachineInstrBuilder &MIB, int Offset)
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:221
LLVM_ABI llvm::SmallVector< int, 16 > createInterleaveMask(unsigned VF, unsigned NumVecs)
Create an interleave shuffle mask.
RecurKind
These are the kinds of recurrences that we support.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ FMinimumNum
FP min with llvm.minimumnum semantics.
@ FindIV
FindIV reduction with select(icmp(),x,y) where one of (x,y) is a loop induction variable (increasing ...
@ Or
Bitwise or logical OR of integers.
@ FMinimum
FP min with llvm.minimum semantics.
@ FMaxNum
FP max with llvm.maxnum semantics including NaNs.
@ Mul
Product of integers.
@ FSub
Subtraction of floats.
@ FAddChainWithSubs
A chain of fadds and fsubs.
@ None
Not a recurrence.
@ AnyOf
AnyOf reduction with select(cmp(),x,y) where one of (x,y) is loop invariant, and both x and y are int...
@ Xor
Bitwise or logical XOR of integers.
@ FindLast
FindLast reduction with select(cmp(),x,y) where x and y.
@ FMax
FP max implemented in terms of select(cmp()).
@ FMaximum
FP max with llvm.maximum semantics.
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ FMin
FP min implemented in terms of select(cmp()).
@ FMinNum
FP min with llvm.minnum semantics including NaNs.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
@ FAdd
Sum of floats.
@ FMaximumNum
FP max with llvm.maximumnum semantics.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
LLVM_ABI Value * getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF)
Given information about a recurrence kind, return the identity for the @llvm.vector....
DWARFExpression::Operation Op
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
LLVM_ABI Value * createOrderedReduction(IRBuilderBase &B, RecurKind RdxKind, Value *Src, Value *Start)
Create an ordered reduction intrinsic using the given recurrence kind RdxKind.
ArrayRef< Type * > getContainedTypes(Type *const &Ty)
Returns the types contained in Ty.
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
LLVM_ABI bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Struct to hold various analysis needed for cost computations.
static bool isFreeScalarIntrinsic(Intrinsic::ID ID)
Returns true if ID is a pseudo intrinsic that is dropped via scalarization rather than widened.
Definition VPlan.cpp:1946
TargetTransformInfo::TargetCostKind CostKind
void execute(VPTransformState &State) override
Generate the phi nodes.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this first-order recurrence phi recipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
An overlay for VPIRInstructions wrapping PHI nodes enabling convenient use cast/dyn_cast/isa and exec...
Definition VPlan.h:1778
PHINode & getIRPhi()
Definition VPlan.h:1791
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
The method which generates the output IR instructions that correspond to this VPRecipe,...
void execute(VPTransformState &State) override
Generate the instruction.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
InstructionCost getCostForRecipeWithOpcode(unsigned Opcode, ElementCount VF, VPCostContext &Ctx) const
Compute the cost for this recipe for VF, using Opcode and Ctx.
VPRecipeWithIRFlags(const unsigned char SC, ArrayRef< VPValue * > Operands, const VPIRFlags &Flags, DebugLoc DL=DebugLoc::getUnknown())
Definition VPlan.h:1117
A symbolic live-in VPValue, used for values like vector trip count, VF, and VFxUF.
Definition VPlanValue.h:286
SmallDenseMap< const VPBasicBlock *, BasicBlock * > VPBB2IRBB
A mapping of each VPBasicBlock to the corresponding BasicBlock.
VPTransformState holds information passed down when "executing" a VPlan, needed for generating the ou...
struct llvm::VPTransformState::CFGState CFG
Value * get(const VPValue *Def, bool IsScalar=false)
Get the generated vector Value for a given VPValue Def if IsScalar is false, otherwise return the gen...
Definition VPlan.cpp:313
IRBuilderBase & Builder
Hold a reference to the IRBuilder used to generate output IR code.
ElementCount VF
The chosen Vectorization Factor of the loop being vectorized.
LLVM_ABI_FOR_TEST void execute(VPTransformState &State) override
Generate the wide load or gather.
LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
LLVM_ABI_FOR_TEST InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenLoadEVLRecipe.
VPValue * getEVL() const
Return the EVL operand.
Definition VPlan.h:3870
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate a wide load or gather.
VPValue * getStoredValue() const
Return the value stored by this recipe.
Definition VPlan.h:3971
LLVM_ABI_FOR_TEST void execute(VPTransformState &State) override
Generate the wide store or scatter.
LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
LLVM_ABI_FOR_TEST InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenStoreEVLRecipe.
VPValue * getEVL() const
Return the EVL operand.
Definition VPlan.h:3974
void execute(VPTransformState &State) override
Generate a wide store or scatter.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPValue * getStoredValue() const
Return the value stored by this recipe.
Definition VPlan.h:3920