LLVM 23.0.0git
RISCVTargetTransformInfo.cpp
Go to the documentation of this file.
1//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
11#include "llvm/ADT/STLExtras.h"
18#include "llvm/IR/IntrinsicsRISCV.h"
21#include <cmath>
22#include <optional>
23using namespace llvm;
24using namespace llvm::PatternMatch;
25
26#define DEBUG_TYPE "riscvtti"
27
29 "riscv-v-register-bit-width-lmul",
31 "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
32 "by autovectorized code. Fractional LMULs are not supported."),
34
36 "riscv-v-slp-max-vf",
38 "Overrides result used for getMaximumVF query which is used "
39 "exclusively by SLP vectorizer."),
41
43 RVVMinTripCount("riscv-v-min-trip-count",
44 cl::desc("Set the lower bound of a trip count to decide on "
45 "vectorization while tail-folding."),
47
48static cl::opt<bool> EnableOrLikeSelectOpt("enable-riscv-or-like-select",
49 cl::init(true), cl::Hidden);
50
52RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
54 // Check if the type is valid for all CostKind
55 if (!VT.isVector())
57 size_t NumInstr = OpCodes.size();
59 return NumInstr;
60 InstructionCost LMULCost = TLI->getLMULCost(VT);
62 return LMULCost * NumInstr;
63 InstructionCost Cost = 0;
64 for (auto Op : OpCodes) {
65 switch (Op) {
66 case RISCV::VRGATHER_VI:
67 Cost += TLI->getVRGatherVICost(VT);
68 break;
69 case RISCV::VRGATHER_VV:
70 Cost += TLI->getVRGatherVVCost(VT);
71 break;
72 case RISCV::VSLIDEUP_VI:
73 case RISCV::VSLIDEDOWN_VI:
74 Cost += TLI->getVSlideVICost(VT);
75 break;
76 case RISCV::VSLIDEUP_VX:
77 case RISCV::VSLIDEDOWN_VX:
78 Cost += TLI->getVSlideVXCost(VT);
79 break;
80 case RISCV::VREDMAX_VS:
81 case RISCV::VREDMIN_VS:
82 case RISCV::VREDMAXU_VS:
83 case RISCV::VREDMINU_VS:
84 case RISCV::VREDSUM_VS:
85 case RISCV::VREDAND_VS:
86 case RISCV::VREDOR_VS:
87 case RISCV::VREDXOR_VS:
88 case RISCV::VFREDMAX_VS:
89 case RISCV::VFREDMIN_VS:
90 case RISCV::VFREDUSUM_VS: {
91 unsigned VL = VT.getVectorMinNumElements();
92 if (!VT.isFixedLengthVector())
93 VL *= *getVScaleForTuning();
94 Cost += Log2_32_Ceil(VL);
95 break;
96 }
97 case RISCV::VFREDOSUM_VS: {
98 unsigned VL = VT.getVectorMinNumElements();
99 if (!VT.isFixedLengthVector())
100 VL *= *getVScaleForTuning();
101 Cost += VL;
102 break;
103 }
104 case RISCV::VMV_X_S:
105 case RISCV::VMV_S_X:
106 case RISCV::VFMV_F_S:
107 case RISCV::VFMV_S_F:
108 case RISCV::VMOR_MM:
109 case RISCV::VMXOR_MM:
110 case RISCV::VMAND_MM:
111 case RISCV::VMANDN_MM:
112 case RISCV::VMNAND_MM:
113 case RISCV::VCPOP_M:
114 case RISCV::VFIRST_M:
115 Cost += 1;
116 break;
117 case RISCV::VDIV_VV:
118 case RISCV::VREM_VV:
119 Cost += LMULCost * TTI::TCC_Expensive;
120 break;
121 default:
122 Cost += LMULCost;
123 }
124 }
125 return Cost;
126}
127
129 const RISCVSubtarget *ST,
130 const APInt &Imm, Type *Ty,
132 bool FreeZeroes) {
133 assert(Ty->isIntegerTy() &&
134 "getIntImmCost can only estimate cost of materialising integers");
135
136 // We have a Zero register, so 0 is always free.
137 if (Imm == 0)
138 return TTI::TCC_Free;
139
140 // Otherwise, we check how many instructions it will take to materialise.
141 return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), *ST,
142 /*CompressionCost=*/false, FreeZeroes);
143}
144
148 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind, false);
149}
150
151// Look for patterns of shift followed by AND that can be turned into a pair of
152// shifts. We won't need to materialize an immediate for the AND so these can
153// be considered free.
154static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
155 uint64_t Mask = Imm.getZExtValue();
156 auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
157 if (!BO || !BO->hasOneUse())
158 return false;
159
160 if (BO->getOpcode() != Instruction::Shl)
161 return false;
162
163 if (!isa<ConstantInt>(BO->getOperand(1)))
164 return false;
165
166 unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
167 // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
168 // is a mask shifted by c2 bits with c3 leading zeros.
169 if (isShiftedMask_64(Mask)) {
170 unsigned Trailing = llvm::countr_zero(Mask);
171 if (ShAmt == Trailing)
172 return true;
173 }
174
175 return false;
176}
177
178// If this is i64 AND is part of (X & -(1 << C1) & 0xffffffff) == C2 << C1),
179// DAGCombiner can convert this to (sraiw X, C1) == sext(C2) for RV64. On RV32,
180// the type will be split so only the lower 32 bits need to be compared using
181// (srai/srli X, C) == C2.
182static bool canUseShiftCmp(Instruction *Inst, const APInt &Imm) {
183 if (!Inst->hasOneUse())
184 return false;
185
186 // Look for equality comparison.
187 auto *Cmp = dyn_cast<ICmpInst>(*Inst->user_begin());
188 if (!Cmp || !Cmp->isEquality())
189 return false;
190
191 // Right hand side of comparison should be a constant.
192 auto *C = dyn_cast<ConstantInt>(Cmp->getOperand(1));
193 if (!C)
194 return false;
195
196 uint64_t Mask = Imm.getZExtValue();
197
198 // Mask should be of the form -(1 << C) in the lower 32 bits.
199 if (!isUInt<32>(Mask) || !isPowerOf2_32(-uint32_t(Mask)))
200 return false;
201
202 // Comparison constant should be a subset of Mask.
203 uint64_t CmpC = C->getZExtValue();
204 if ((CmpC & Mask) != CmpC)
205 return false;
206
207 // We'll need to sign extend the comparison constant and shift it right. Make
208 // sure the new constant can use addi/xori+seqz/snez.
209 unsigned ShiftBits = llvm::countr_zero(Mask);
210 int64_t NewCmpC = SignExtend64<32>(CmpC) >> ShiftBits;
211 return NewCmpC >= -2048 && NewCmpC <= 2048;
212}
213
215 const APInt &Imm, Type *Ty,
217 Instruction *Inst) const {
218 assert(Ty->isIntegerTy() &&
219 "getIntImmCost can only estimate cost of materialising integers");
220
221 // We have a Zero register, so 0 is always free.
222 if (Imm == 0)
223 return TTI::TCC_Free;
224
225 // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
226 // commutative, in others the immediate comes from a specific argument index.
227 bool Takes12BitImm = false;
228 unsigned ImmArgIdx = ~0U;
229
230 switch (Opcode) {
231 case Instruction::GetElementPtr:
232 // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
233 // split up large offsets in GEP into better parts than ConstantHoisting
234 // can.
235 return TTI::TCC_Free;
236 case Instruction::Store: {
237 // Use the materialization cost regardless of if it's the address or the
238 // value that is constant, except for if the store is misaligned and
239 // misaligned accesses are not legal (experience shows constant hoisting
240 // can sometimes be harmful in such cases).
241 if (Idx == 1 || !Inst)
242 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
243 /*FreeZeroes=*/true);
244
245 StoreInst *ST = cast<StoreInst>(Inst);
246 if (!getTLI()->allowsMemoryAccessForAlignment(
247 Ty->getContext(), DL, getTLI()->getValueType(DL, Ty),
248 ST->getPointerAddressSpace(), ST->getAlign()))
249 return TTI::TCC_Free;
250
251 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
252 /*FreeZeroes=*/true);
253 }
254 case Instruction::Load:
255 // If the address is a constant, use the materialization cost.
256 return getIntImmCost(Imm, Ty, CostKind);
257 case Instruction::And:
258 // zext.h
259 if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
260 return TTI::TCC_Free;
261 // zext.w
262 if (Imm == UINT64_C(0xffffffff) &&
263 ((ST->hasStdExtZba() && ST->isRV64()) || ST->isRV32()))
264 return TTI::TCC_Free;
265 // bclri
266 if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
267 return TTI::TCC_Free;
268 if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
269 canUseShiftPair(Inst, Imm))
270 return TTI::TCC_Free;
271 if (Inst && Idx == 1 && Imm.getBitWidth() == 64 &&
272 canUseShiftCmp(Inst, Imm))
273 return TTI::TCC_Free;
274 Takes12BitImm = true;
275 break;
276 case Instruction::Add:
277 Takes12BitImm = true;
278 break;
279 case Instruction::Or:
280 case Instruction::Xor:
281 // bseti/binvi
282 if (ST->hasStdExtZbs() && Imm.isPowerOf2())
283 return TTI::TCC_Free;
284 Takes12BitImm = true;
285 break;
286 case Instruction::Mul:
287 // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
288 if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
289 return TTI::TCC_Free;
290 // One more or less than a power of 2 can use SLLI+ADD/SUB.
291 if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2())
292 return TTI::TCC_Free;
293 // FIXME: There is no MULI instruction.
294 Takes12BitImm = true;
295 break;
296 case Instruction::Sub:
297 case Instruction::Shl:
298 case Instruction::LShr:
299 case Instruction::AShr:
300 Takes12BitImm = true;
301 ImmArgIdx = 1;
302 break;
303 default:
304 break;
305 }
306
307 if (Takes12BitImm) {
308 // Check immediate is the correct argument...
309 if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
310 // ... and fits into the 12-bit immediate.
311 if (Imm.getSignificantBits() <= 64 &&
312 getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
313 return TTI::TCC_Free;
314 }
315 }
316
317 // Otherwise, use the full materialisation cost.
318 return getIntImmCost(Imm, Ty, CostKind);
319 }
320
321 // By default, prevent hoisting.
322 return TTI::TCC_Free;
323}
324
327 const APInt &Imm, Type *Ty,
329 // Prevent hoisting in unknown cases.
330 return TTI::TCC_Free;
331}
332
334 return ST->hasVInstructions();
335}
336
338RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) const {
339 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
340 return ST->hasCPOPLike() ? TTI::PSK_FastHardware : TTI::PSK_Software;
341}
342
344 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
346 TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
347 TTI::TargetCostKind CostKind, std::optional<FastMathFlags> FMF) const {
348 if (Opcode == Instruction::FAdd)
350
351 // zve32x is broken for partial_reduce_umla, but let's make sure we
352 // don't generate them.
353 if (!ST->hasStdExtZvdot4a8i() || ST->getELen() < 64 ||
354 Opcode != Instruction::Add || !BinOp || *BinOp != Instruction::Mul ||
355 InputTypeA != InputTypeB || !InputTypeA->isIntegerTy(8) ||
356 !AccumType->isIntegerTy(32) || !VF.isKnownMultipleOf(4))
358
359 Type *Tp = VectorType::get(AccumType, VF.divideCoefficientBy(4));
360 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
361 // Note: Asuming all vdota4* variants are equal cost
362 return LT.first *
363 getRISCVInstructionCost(RISCV::VDOTA4_VV, LT.second, CostKind);
364}
365
367 // Currently, the ExpandReductions pass can't expand scalable-vector
368 // reductions, but we still request expansion as RVV doesn't support certain
369 // reductions and the SelectionDAG can't legalize them either.
370 switch (II->getIntrinsicID()) {
371 default:
372 return false;
373 // These reductions have no equivalent in RVV
374 case Intrinsic::vector_reduce_mul:
375 case Intrinsic::vector_reduce_fmul:
376 return true;
377 }
378}
379
380std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
381 if (ST->hasVInstructions())
382 return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock;
383 return BaseT::getMaxVScale();
384}
385
386std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
387 if (ST->hasVInstructions())
388 if (unsigned MinVLen = ST->getRealMinVLen();
389 MinVLen >= RISCV::RVVBitsPerBlock)
390 return MinVLen / RISCV::RVVBitsPerBlock;
392}
393
396 unsigned LMUL =
397 llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
398 switch (K) {
400 return TypeSize::getFixed(ST->getXLen());
402 return TypeSize::getFixed(
403 ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
406 (ST->hasVInstructions() &&
407 ST->getRealMinVLen() >= RISCV::RVVBitsPerBlock)
409 : 0);
410 }
411
412 llvm_unreachable("Unsupported register kind");
413}
414
415InstructionCost RISCVTTIImpl::getStaticDataAddrGenerationCost(
416 const TTI::TargetCostKind CostKind) const {
417 switch (CostKind) {
420 // Always 2 instructions
421 return 2;
422 case TTI::TCK_Latency:
424 // Depending on the memory model the address generation will
425 // require AUIPC + ADDI (medany) or LUI + ADDI (medlow). Don't
426 // have a way of getting this information here, so conservatively
427 // require both.
428 // In practice, these are generally implemented together.
429 return (ST->hasAUIPCADDIFusion() && ST->hasLUIADDIFusion()) ? 1 : 2;
430 }
431 llvm_unreachable("Unsupported cost kind");
432}
433
435RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty,
437 // Add a cost of address generation + the cost of the load. The address
438 // is expected to be a PC relative offset to a constant pool entry
439 // using auipc/addi.
440 return getStaticDataAddrGenerationCost(CostKind) +
441 getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
442 /*AddressSpace=*/0, CostKind);
443}
444
445static bool isRepeatedConcatMask(ArrayRef<int> Mask, int &SubVectorSize) {
446 unsigned Size = Mask.size();
447 if (!isPowerOf2_32(Size))
448 return false;
449 for (unsigned I = 0; I != Size; ++I) {
450 if (static_cast<unsigned>(Mask[I]) == I)
451 continue;
452 if (Mask[I] != 0)
453 return false;
454 if (Size % I != 0)
455 return false;
456 for (unsigned J = I + 1; J != Size; ++J)
457 // Check the pattern is repeated.
458 if (static_cast<unsigned>(Mask[J]) != J % I)
459 return false;
460 SubVectorSize = I;
461 return true;
462 }
463 // That means Mask is <0, 1, 2, 3>. This is not a concatenation.
464 return false;
465}
466
468 LLVMContext &C) {
469 assert((DataVT.getScalarSizeInBits() != 8 ||
470 DataVT.getVectorNumElements() <= 256) && "unhandled case in lowering");
471 MVT IndexVT = DataVT.changeTypeToInteger();
472 if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
473 IndexVT = IndexVT.changeVectorElementType(MVT::i16);
474 return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
475}
476
477/// Attempt to approximate the cost of a shuffle which will require splitting
478/// during legalization. Note that processShuffleMasks is not an exact proxy
479/// for the algorithm used in LegalizeVectorTypes, but hopefully it's a
480/// reasonably close upperbound.
482 MVT LegalVT, VectorType *Tp,
483 ArrayRef<int> Mask,
485 assert(LegalVT.isFixedLengthVector() && !Mask.empty() &&
486 "Expected fixed vector type and non-empty mask");
487 unsigned LegalNumElts = LegalVT.getVectorNumElements();
488 // Number of destination vectors after legalization:
489 unsigned NumOfDests = divideCeil(Mask.size(), LegalNumElts);
490 // We are going to permute multiple sources and the result will be in
491 // multiple destinations. Providing an accurate cost only for splits where
492 // the element type remains the same.
493 if (NumOfDests <= 1 ||
495 Tp->getElementType()->getPrimitiveSizeInBits() ||
496 LegalNumElts >= Tp->getElementCount().getFixedValue())
498
499 unsigned VecTySize = TTI.getDataLayout().getTypeStoreSize(Tp);
500 unsigned LegalVTSize = LegalVT.getStoreSize();
501 // Number of source vectors after legalization:
502 unsigned NumOfSrcs = divideCeil(VecTySize, LegalVTSize);
503
504 auto *SingleOpTy = FixedVectorType::get(Tp->getElementType(), LegalNumElts);
505
506 unsigned NormalizedVF = LegalNumElts * std::max(NumOfSrcs, NumOfDests);
507 unsigned NumOfSrcRegs = NormalizedVF / LegalNumElts;
508 unsigned NumOfDestRegs = NormalizedVF / LegalNumElts;
509 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
510 assert(NormalizedVF >= Mask.size() &&
511 "Normalized mask expected to be not shorter than original mask.");
512 copy(Mask, NormalizedMask.begin());
513 InstructionCost Cost = 0;
514 SmallDenseSet<std::pair<ArrayRef<int>, unsigned>> ReusedSingleSrcShuffles;
516 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
517 [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
518 if (ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size()))
519 return;
520 if (!ReusedSingleSrcShuffles.insert(std::make_pair(RegMask, SrcReg))
521 .second)
522 return;
523 Cost += TTI.getShuffleCost(
525 FixedVectorType::get(SingleOpTy->getElementType(), RegMask.size()),
526 SingleOpTy, RegMask, CostKind, 0, nullptr);
527 },
528 [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
529 Cost += TTI.getShuffleCost(
531 FixedVectorType::get(SingleOpTy->getElementType(), RegMask.size()),
532 SingleOpTy, RegMask, CostKind, 0, nullptr);
533 });
534 return Cost;
535}
536
537/// Try to perform better estimation of the permutation.
538/// 1. Split the source/destination vectors into real registers.
539/// 2. Do the mask analysis to identify which real registers are
540/// permuted. If more than 1 source registers are used for the
541/// destination register building, the cost for this destination register
542/// is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one
543/// source register is used, build mask and calculate the cost as a cost
544/// of PermuteSingleSrc.
545/// Also, for the single register permute we try to identify if the
546/// destination register is just a copy of the source register or the
547/// copy of the previous destination register (the cost is
548/// TTI::TCC_Basic). If the source register is just reused, the cost for
549/// this operation is 0.
550static InstructionCost
552 std::optional<unsigned> VLen, VectorType *Tp,
554 assert(LegalVT.isFixedLengthVector());
555 if (!VLen || Mask.empty())
557 MVT ElemVT = LegalVT.getVectorElementType();
558 unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits();
559 LegalVT = TTI.getTypeLegalizationCost(
560 FixedVectorType::get(Tp->getElementType(), ElemsPerVReg))
561 .second;
562 // Number of destination vectors after legalization:
563 InstructionCost NumOfDests =
564 divideCeil(Mask.size(), LegalVT.getVectorNumElements());
565 if (NumOfDests <= 1 ||
567 Tp->getElementType()->getPrimitiveSizeInBits() ||
568 LegalVT.getVectorNumElements() >= Tp->getElementCount().getFixedValue())
570
571 unsigned VecTySize = TTI.getDataLayout().getTypeStoreSize(Tp);
572 unsigned LegalVTSize = LegalVT.getStoreSize();
573 // Number of source vectors after legalization:
574 unsigned NumOfSrcs = divideCeil(VecTySize, LegalVTSize);
575
576 auto *SingleOpTy = FixedVectorType::get(Tp->getElementType(),
577 LegalVT.getVectorNumElements());
578
579 unsigned E = NumOfDests.getValue();
580 unsigned NormalizedVF =
581 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
582 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
583 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
584 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
585 assert(NormalizedVF >= Mask.size() &&
586 "Normalized mask expected to be not shorter than original mask.");
587 copy(Mask, NormalizedMask.begin());
588 InstructionCost Cost = 0;
589 int NumShuffles = 0;
590 SmallDenseSet<std::pair<ArrayRef<int>, unsigned>> ReusedSingleSrcShuffles;
592 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
593 [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
594 if (ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size()))
595 return;
596 if (!ReusedSingleSrcShuffles.insert(std::make_pair(RegMask, SrcReg))
597 .second)
598 return;
599 ++NumShuffles;
600 Cost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
601 SingleOpTy, RegMask, CostKind, 0, nullptr);
602 },
603 [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
604 Cost += TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
605 SingleOpTy, RegMask, CostKind, 0, nullptr);
606 NumShuffles += 2;
607 });
608 // Note: check that we do not emit too many shuffles here to prevent code
609 // size explosion.
610 // TODO: investigate, if it can be improved by extra analysis of the masks
611 // to check if the code is more profitable.
612 if ((NumOfDestRegs > 2 && NumShuffles <= static_cast<int>(NumOfDestRegs)) ||
613 (NumOfDestRegs <= 2 && NumShuffles < 4))
614 return Cost;
616}
617
618InstructionCost RISCVTTIImpl::getSlideCost(FixedVectorType *Tp,
619 ArrayRef<int> Mask,
621 // Avoid missing masks and length changing shuffles
622 if (Mask.size() <= 2 || Mask.size() != Tp->getNumElements())
624
625 int NumElts = Tp->getNumElements();
626 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
627 // Avoid scalarization cases
628 if (!LT.second.isFixedLengthVector())
630
631 // Requires moving elements between parts, which requires additional
632 // unmodeled instructions.
633 if (LT.first != 1)
635
636 auto GetSlideOpcode = [&](int SlideAmt) {
637 assert(SlideAmt != 0);
638 bool IsVI = isUInt<5>(std::abs(SlideAmt));
639 if (SlideAmt < 0)
640 return IsVI ? RISCV::VSLIDEDOWN_VI : RISCV::VSLIDEDOWN_VX;
641 return IsVI ? RISCV::VSLIDEUP_VI : RISCV::VSLIDEUP_VX;
642 };
643
644 std::array<std::pair<int, int>, 2> SrcInfo;
645 if (!isMaskedSlidePair(Mask, NumElts, SrcInfo))
647
648 if (SrcInfo[1].second == 0)
649 std::swap(SrcInfo[0], SrcInfo[1]);
650
651 InstructionCost FirstSlideCost = 0;
652 if (SrcInfo[0].second != 0) {
653 unsigned Opcode = GetSlideOpcode(SrcInfo[0].second);
654 FirstSlideCost = getRISCVInstructionCost(Opcode, LT.second, CostKind);
655 }
656
657 if (SrcInfo[1].first == -1)
658 return FirstSlideCost;
659
660 InstructionCost SecondSlideCost = 0;
661 if (SrcInfo[1].second != 0) {
662 unsigned Opcode = GetSlideOpcode(SrcInfo[1].second);
663 SecondSlideCost = getRISCVInstructionCost(Opcode, LT.second, CostKind);
664 } else {
665 SecondSlideCost =
666 getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second, CostKind);
667 }
668
669 auto EC = Tp->getElementCount();
670 VectorType *MaskTy =
672 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
673 return FirstSlideCost + SecondSlideCost + MaskCost;
674}
675
678 VectorType *SrcTy, ArrayRef<int> Mask,
679 TTI::TargetCostKind CostKind, int Index,
681 const Instruction *CxtI) const {
682 assert((Mask.empty() || DstTy->isScalableTy() ||
683 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
684 "Expected the Mask to match the return size if given");
685 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
686 "Expected the same scalar types");
687
688 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
689 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
690
691 // First, handle cases where having a fixed length vector enables us to
692 // give a more accurate cost than falling back to generic scalable codegen.
693 // TODO: Each of these cases hints at a modeling gap around scalable vectors.
694 if (auto *FVTp = dyn_cast<FixedVectorType>(SrcTy);
695 FVTp && ST->hasVInstructions() && LT.second.isFixedLengthVector()) {
697 *this, LT.second, ST->getRealVLen(),
698 Kind == TTI::SK_InsertSubvector ? DstTy : SrcTy, Mask, CostKind);
699 if (VRegSplittingCost.isValid())
700 return VRegSplittingCost;
701 switch (Kind) {
702 default:
703 break;
705 if (Mask.size() >= 2) {
706 MVT EltTp = LT.second.getVectorElementType();
707 // If the size of the element is < ELEN then shuffles of interleaves and
708 // deinterleaves of 2 vectors can be lowered into the following
709 // sequences
710 if (EltTp.getScalarSizeInBits() < ST->getELen()) {
711 // Example sequence:
712 // vsetivli zero, 4, e8, mf4, ta, ma (ignored)
713 // vwaddu.vv v10, v8, v9
714 // li a0, -1 (ignored)
715 // vwmaccu.vx v10, a0, v9
716 if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
717 return 2 * LT.first * TLI->getLMULCost(LT.second);
718
719 if (Mask[0] == 0 || Mask[0] == 1) {
720 auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
721 // Example sequence:
722 // vnsrl.wi v10, v8, 0
723 if (equal(DeinterleaveMask, Mask))
724 return LT.first * getRISCVInstructionCost(RISCV::VNSRL_WI,
725 LT.second, CostKind);
726 }
727 }
728 int SubVectorSize;
729 if (LT.second.getScalarSizeInBits() != 1 &&
730 isRepeatedConcatMask(Mask, SubVectorSize)) {
732 unsigned NumSlides = Log2_32(Mask.size() / SubVectorSize);
733 // The cost of extraction from a subvector is 0 if the index is 0.
734 for (unsigned I = 0; I != NumSlides; ++I) {
735 unsigned InsertIndex = SubVectorSize * (1 << I);
736 FixedVectorType *SubTp =
737 FixedVectorType::get(SrcTy->getElementType(), InsertIndex);
738 FixedVectorType *DestTp =
740 std::pair<InstructionCost, MVT> DestLT =
742 // Add the cost of whole vector register move because the
743 // destination vector register group for vslideup cannot overlap the
744 // source.
745 Cost += DestLT.first * TLI->getLMULCost(DestLT.second);
746 Cost += getShuffleCost(TTI::SK_InsertSubvector, DestTp, DestTp, {},
747 CostKind, InsertIndex, SubTp);
748 }
749 return Cost;
750 }
751 }
752
753 if (InstructionCost SlideCost = getSlideCost(FVTp, Mask, CostKind);
754 SlideCost.isValid())
755 return SlideCost;
756
757 // vrgather + cost of generating the mask constant.
758 // We model this for an unknown mask with a single vrgather.
759 if (LT.first == 1 && (LT.second.getScalarSizeInBits() != 8 ||
760 LT.second.getVectorNumElements() <= 256)) {
761 VectorType *IdxTy =
762 getVRGatherIndexType(LT.second, *ST, SrcTy->getContext());
763 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
764 return IndexCost +
765 getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
766 }
767 break;
768 }
771
772 if (InstructionCost SlideCost = getSlideCost(FVTp, Mask, CostKind);
773 SlideCost.isValid())
774 return SlideCost;
775
776 // 2 x (vrgather + cost of generating the mask constant) + cost of mask
777 // register for the second vrgather. We model this for an unknown
778 // (shuffle) mask.
779 if (LT.first == 1 && (LT.second.getScalarSizeInBits() != 8 ||
780 LT.second.getVectorNumElements() <= 256)) {
781 auto &C = SrcTy->getContext();
782 auto EC = SrcTy->getElementCount();
783 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
785 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
786 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
787 return 2 * IndexCost +
788 getRISCVInstructionCost({RISCV::VRGATHER_VV, RISCV::VRGATHER_VV},
789 LT.second, CostKind) +
790 MaskCost;
791 }
792 break;
793 }
794 }
795
796 auto shouldSplit = [](TTI::ShuffleKind Kind) {
797 switch (Kind) {
798 default:
799 return false;
803 return true;
804 }
805 };
806
807 if (!Mask.empty() && LT.first.isValid() && LT.first != 1 &&
808 shouldSplit(Kind)) {
809 InstructionCost SplitCost =
810 costShuffleViaSplitting(*this, LT.second, FVTp, Mask, CostKind);
811 if (SplitCost.isValid())
812 return SplitCost;
813 }
814 }
815
816 // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
817 switch (Kind) {
818 default:
819 // Fallthrough to generic handling.
820 // TODO: Most of these cases will return getInvalid in generic code, and
821 // must be implemented here.
822 break;
824 // Extract at zero is always a subregister extract
825 if (Index == 0)
826 return TTI::TCC_Free;
827
828 // If we're extracting a subvector of at most m1 size at a sub-register
829 // boundary - which unfortunately we need exact vlen to identify - this is
830 // a subregister extract at worst and thus won't require a vslidedown.
831 // TODO: Extend for aligned m2, m4 subvector extracts
832 // TODO: Extend for misalgined (but contained) extracts
833 // TODO: Extend for scalable subvector types
834 if (std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
835 SubLT.second.isValid() && SubLT.second.isFixedLengthVector()) {
836 if (std::optional<unsigned> VLen = ST->getRealVLen();
837 VLen && SubLT.second.getScalarSizeInBits() * Index % *VLen == 0 &&
838 SubLT.second.getSizeInBits() <= *VLen)
839 return TTI::TCC_Free;
840 }
841
842 // Example sequence:
843 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
844 // vslidedown.vi v8, v9, 2
845 return LT.first *
846 getRISCVInstructionCost(RISCV::VSLIDEDOWN_VI, LT.second, CostKind);
848 // Example sequence:
849 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
850 // vslideup.vi v8, v9, 2
851 LT = getTypeLegalizationCost(DstTy);
852 return LT.first *
853 getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind);
854 case TTI::SK_Select: {
855 // Example sequence:
856 // li a0, 90
857 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
858 // vmv.s.x v0, a0
859 // vmerge.vvm v8, v9, v8, v0
860 // We use 2 for the cost of the mask materialization as this is the true
861 // cost for small masks and most shuffles are small. At worst, this cost
862 // should be a very small constant for the constant pool load. As such,
863 // we may bias towards large selects slightly more than truly warranted.
864 return LT.first *
865 (1 + getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM},
866 LT.second, CostKind));
867 }
868 case TTI::SK_Broadcast: {
869 bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
870 Instruction::InsertElement);
871 if (LT.second.getScalarSizeInBits() == 1) {
872 if (HasScalar) {
873 // Example sequence:
874 // andi a0, a0, 1
875 // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
876 // vmv.v.x v8, a0
877 // vmsne.vi v0, v8, 0
878 return LT.first *
879 (1 + getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
880 LT.second, CostKind));
881 }
882 // Example sequence:
883 // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
884 // vmv.v.i v8, 0
885 // vmerge.vim v8, v8, 1, v0
886 // vmv.x.s a0, v8
887 // andi a0, a0, 1
888 // vmv.v.x v8, a0
889 // vmsne.vi v0, v8, 0
890
891 return LT.first *
892 (1 + getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM,
893 RISCV::VMV_X_S, RISCV::VMV_V_X,
894 RISCV::VMSNE_VI},
895 LT.second, CostKind));
896 }
897
898 if (HasScalar) {
899 // Example sequence:
900 // vmv.v.x v8, a0
901 return LT.first *
902 getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind);
903 }
904
905 // Example sequence:
906 // vrgather.vi v9, v8, 0
907 return LT.first *
908 getRISCVInstructionCost(RISCV::VRGATHER_VI, LT.second, CostKind);
909 }
910 case TTI::SK_Splice: {
911 // vslidedown+vslideup.
912 // TODO: Multiplying by LT.first implies this legalizes into multiple copies
913 // of similar code, but I think we expand through memory.
914 unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX};
915 if (Index >= 0 && Index < 32)
916 Opcodes[0] = RISCV::VSLIDEDOWN_VI;
917 else if (Index < 0 && Index > -32)
918 Opcodes[1] = RISCV::VSLIDEUP_VI;
919 return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
920 }
921 case TTI::SK_Reverse: {
922
923 if (!LT.second.isVector())
925
926 // TODO: Cases to improve here:
927 // * Illegal vector types
928 // * i64 on RV32
929 if (SrcTy->getElementType()->isIntegerTy(1)) {
930 VectorType *WideTy =
931 VectorType::get(IntegerType::get(SrcTy->getContext(), 8),
932 cast<VectorType>(SrcTy)->getElementCount());
933 return getCastInstrCost(Instruction::ZExt, WideTy, SrcTy,
935 getShuffleCost(TTI::SK_Reverse, WideTy, WideTy, {}, CostKind, 0,
936 nullptr) +
937 getCastInstrCost(Instruction::Trunc, SrcTy, WideTy,
939 }
940
941 MVT ContainerVT = LT.second;
942 if (LT.second.isFixedLengthVector())
943 ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
944 MVT M1VT = RISCVTargetLowering::getM1VT(ContainerVT);
945 if (ContainerVT.bitsLE(M1VT)) {
946 // Example sequence:
947 // csrr a0, vlenb
948 // srli a0, a0, 3
949 // addi a0, a0, -1
950 // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
951 // vid.v v9
952 // vrsub.vx v10, v9, a0
953 // vrgather.vv v9, v8, v10
954 InstructionCost LenCost = 3;
955 if (LT.second.isFixedLengthVector())
956 // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
957 LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
958 unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV};
959 if (LT.second.isFixedLengthVector() &&
960 isInt<5>(LT.second.getVectorNumElements() - 1))
961 Opcodes[1] = RISCV::VRSUB_VI;
962 InstructionCost GatherCost =
963 getRISCVInstructionCost(Opcodes, LT.second, CostKind);
964 return LT.first * (LenCost + GatherCost);
965 }
966
967 // At high LMUL, we split into a series of M1 reverses (see
968 // lowerVECTOR_REVERSE) and then do a single slide at the end to eliminate
969 // the resulting gap at the bottom (for fixed vectors only). The important
970 // bit is that the cost scales linearly, not quadratically with LMUL.
971 unsigned M1Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX};
972 InstructionCost FixedCost =
973 getRISCVInstructionCost(M1Opcodes, M1VT, CostKind) + 3;
974 unsigned Ratio =
976 InstructionCost GatherCost =
977 getRISCVInstructionCost({RISCV::VRGATHER_VV}, M1VT, CostKind) * Ratio;
978 InstructionCost SlideCost = !LT.second.isFixedLengthVector() ? 0 :
979 getRISCVInstructionCost({RISCV::VSLIDEDOWN_VX}, LT.second, CostKind);
980 return FixedCost + LT.first * (GatherCost + SlideCost);
981 }
982 }
983 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
984 SubTp);
985}
986
987static unsigned isM1OrSmaller(MVT VT) {
989 return (LMUL == RISCVVType::VLMUL::LMUL_F8 ||
993}
994
996 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
997 TTI::TargetCostKind CostKind, bool ForPoisonSrc, ArrayRef<Value *> VL,
998 TTI::VectorInstrContext VIC) const {
1001
1002 // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
1003 // For now, skip all fixed vector cost analysis when P extension is available
1004 // to avoid crashes in getMinRVVVectorSizeInBits()
1005 if (ST->hasStdExtP() && isa<FixedVectorType>(Ty)) {
1006 return 1; // Treat as single instruction cost for now
1007 }
1008
1009 // A build_vector (which is m1 sized or smaller) can be done in no
1010 // worse than one vslide1down.vx per element in the type. We could
1011 // in theory do an explode_vector in the inverse manner, but our
1012 // lowering today does not have a first class node for this pattern.
1014 Ty, DemandedElts, Insert, Extract, CostKind);
1015 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1016 if (Insert && !Extract && LT.first.isValid() && LT.second.isVector()) {
1017 if (Ty->getScalarSizeInBits() == 1) {
1018 auto *WideVecTy = cast<VectorType>(Ty->getWithNewBitWidth(8));
1019 // Note: Implicit scalar anyextend is assumed to be free since the i1
1020 // must be stored in a GPR.
1021 return getScalarizationOverhead(WideVecTy, DemandedElts, Insert, Extract,
1022 CostKind) +
1023 getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
1025 }
1026
1027 assert(LT.second.isFixedLengthVector());
1028 MVT ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
1029 if (isM1OrSmaller(ContainerVT)) {
1030 InstructionCost BV =
1031 cast<FixedVectorType>(Ty)->getNumElements() *
1032 getRISCVInstructionCost(RISCV::VSLIDE1DOWN_VX, LT.second, CostKind);
1033 if (BV < Cost)
1034 Cost = BV;
1035 }
1036 }
1037 return Cost;
1038}
1039
1043 Type *DataTy = MICA.getDataType();
1044 Align Alignment = MICA.getAlignment();
1045 switch (MICA.getID()) {
1046 case Intrinsic::vp_load_ff: {
1047 EVT DataTypeVT = TLI->getValueType(DL, DataTy);
1048 if (!TLI->isLegalFirstFaultLoad(DataTypeVT, Alignment))
1050
1051 unsigned AS = MICA.getAddressSpace();
1052 return getMemoryOpCost(Instruction::Load, DataTy, Alignment, AS, CostKind,
1053 {TTI::OK_AnyValue, TTI::OP_None}, nullptr);
1054 }
1055 case Intrinsic::experimental_vp_strided_load:
1056 case Intrinsic::experimental_vp_strided_store:
1057 return getStridedMemoryOpCost(MICA, CostKind);
1058 case Intrinsic::masked_compressstore:
1059 case Intrinsic::masked_expandload:
1061 case Intrinsic::vp_scatter:
1062 case Intrinsic::vp_gather:
1063 case Intrinsic::masked_scatter:
1064 case Intrinsic::masked_gather:
1065 return getGatherScatterOpCost(MICA, CostKind);
1066 case Intrinsic::vp_load:
1067 case Intrinsic::vp_store:
1068 case Intrinsic::masked_load:
1069 case Intrinsic::masked_store:
1070 return getMaskedMemoryOpCost(MICA, CostKind);
1071 }
1073}
1074
1078 unsigned Opcode = MICA.getID() == Intrinsic::masked_load ? Instruction::Load
1079 : Instruction::Store;
1080 Type *Src = MICA.getDataType();
1081 Align Alignment = MICA.getAlignment();
1082 unsigned AddressSpace = MICA.getAddressSpace();
1083
1084 if (!isLegalMaskedLoadStore(Src, Alignment) ||
1087
1088 return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
1089}
1090
1092 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1093 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1094 bool UseMaskForCond, bool UseMaskForGaps) const {
1095
1096 // The interleaved memory access pass will lower (de)interleave ops combined
1097 // with an adjacent appropriate memory to vlseg/vsseg intrinsics. vlseg/vsseg
1098 // only support masking per-iteration (i.e. condition), not per-segment (i.e.
1099 // gap).
1100 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
1101 auto *VTy = cast<VectorType>(VecTy);
1102 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VTy);
1103 // Need to make sure type has't been scalarized
1104 if (LT.second.isVector()) {
1105 auto *SubVecTy =
1106 VectorType::get(VTy->getElementType(),
1107 VTy->getElementCount().divideCoefficientBy(Factor));
1108 if (VTy->getElementCount().isKnownMultipleOf(Factor) &&
1109 TLI->isLegalInterleavedAccessType(SubVecTy, Factor, Alignment,
1110 AddressSpace, DL)) {
1111
1112 // Some processors optimize segment loads/stores as one wide memory op +
1113 // Factor * LMUL shuffle ops.
1114 if (ST->hasOptimizedSegmentLoadStore(Factor)) {
1116 getMemoryOpCost(Opcode, VTy, Alignment, AddressSpace, CostKind);
1117 MVT SubVecVT = getTLI()->getValueType(DL, SubVecTy).getSimpleVT();
1118 Cost += Factor * TLI->getLMULCost(SubVecVT);
1119 return LT.first * Cost;
1120 }
1121
1122 // Otherwise, the cost is proportional to the number of elements (VL *
1123 // Factor ops).
1124 InstructionCost MemOpCost =
1125 getMemoryOpCost(Opcode, VTy->getElementType(), Alignment, 0,
1126 CostKind, {TTI::OK_AnyValue, TTI::OP_None});
1127 unsigned NumLoads = getEstimatedVLFor(VTy);
1128 return NumLoads * MemOpCost;
1129 }
1130 }
1131 }
1132
1133 // TODO: Return the cost of interleaved accesses for scalable vector when
1134 // unable to convert to segment accesses instructions.
1135 if (isa<ScalableVectorType>(VecTy))
1137
1138 auto *FVTy = cast<FixedVectorType>(VecTy);
1139 InstructionCost MemCost =
1140 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
1141 unsigned VF = FVTy->getNumElements() / Factor;
1142
1143 // An interleaved load will look like this for Factor=3:
1144 // %wide.vec = load <12 x i32>, ptr %3, align 4
1145 // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1146 // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1147 // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1148 if (Opcode == Instruction::Load) {
1149 InstructionCost Cost = MemCost;
1150 for (unsigned Index : Indices) {
1151 FixedVectorType *VecTy =
1152 FixedVectorType::get(FVTy->getElementType(), VF * Factor);
1153 auto Mask = createStrideMask(Index, Factor, VF);
1154 Mask.resize(VF * Factor, -1);
1155 InstructionCost ShuffleCost =
1157 Mask, CostKind, 0, nullptr, {});
1158 Cost += ShuffleCost;
1159 }
1160 return Cost;
1161 }
1162
1163 // TODO: Model for NF > 2
1164 // We'll need to enhance getShuffleCost to model shuffles that are just
1165 // inserts and extracts into subvectors, since they won't have the full cost
1166 // of a vrgather.
1167 // An interleaved store for 3 vectors of 4 lanes will look like
1168 // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
1169 // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
1170 // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
1171 // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
1172 // store <12 x i32> %interleaved.vec, ptr %10, align 4
1173 if (Factor != 2)
1174 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1175 Alignment, AddressSpace, CostKind,
1176 UseMaskForCond, UseMaskForGaps);
1177
1178 assert(Opcode == Instruction::Store && "Opcode must be a store");
1179 // For an interleaving store of 2 vectors, we perform one large interleaving
1180 // shuffle that goes into the wide store
1181 auto Mask = createInterleaveMask(VF, Factor);
1182 InstructionCost ShuffleCost =
1184 CostKind, 0, nullptr, {});
1185 return MemCost + ShuffleCost;
1186}
1187
1191
1192 bool IsLoad = MICA.getID() == Intrinsic::masked_gather ||
1193 MICA.getID() == Intrinsic::vp_gather;
1194 unsigned Opcode = IsLoad ? Instruction::Load : Instruction::Store;
1195 Type *DataTy = MICA.getDataType();
1196 Align Alignment = MICA.getAlignment();
1199
1200 if ((Opcode == Instruction::Load &&
1201 !isLegalMaskedGather(DataTy, Align(Alignment))) ||
1202 (Opcode == Instruction::Store &&
1203 !isLegalMaskedScatter(DataTy, Align(Alignment))))
1205
1206 // Cost is proportional to the number of memory operations implied. For
1207 // scalable vectors, we use an estimate on that number since we don't
1208 // know exactly what VL will be.
1209 auto &VTy = *cast<VectorType>(DataTy);
1210 unsigned NumLoads = getEstimatedVLFor(&VTy);
1211 return NumLoads * TTI::TCC_Basic;
1212}
1213
1215 const MemIntrinsicCostAttributes &MICA,
1217 unsigned Opcode = MICA.getID() == Intrinsic::masked_expandload
1218 ? Instruction::Load
1219 : Instruction::Store;
1220 Type *DataTy = MICA.getDataType();
1221 bool VariableMask = MICA.getVariableMask();
1222 Align Alignment = MICA.getAlignment();
1223 bool IsLegal = (Opcode == Instruction::Store &&
1224 isLegalMaskedCompressStore(DataTy, Alignment)) ||
1225 (Opcode == Instruction::Load &&
1226 isLegalMaskedExpandLoad(DataTy, Alignment));
1227 if (!IsLegal || CostKind != TTI::TCK_RecipThroughput)
1229 // Example compressstore sequence:
1230 // vsetivli zero, 8, e32, m2, ta, ma (ignored)
1231 // vcompress.vm v10, v8, v0
1232 // vcpop.m a1, v0
1233 // vsetvli zero, a1, e32, m2, ta, ma
1234 // vse32.v v10, (a0)
1235 // Example expandload sequence:
1236 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
1237 // vcpop.m a1, v0
1238 // vsetvli zero, a1, e32, m2, ta, ma
1239 // vle32.v v10, (a0)
1240 // vsetivli zero, 8, e32, m2, ta, ma
1241 // viota.m v12, v0
1242 // vrgather.vv v8, v10, v12, v0.t
1243 auto MemOpCost =
1244 getMemoryOpCost(Opcode, DataTy, Alignment, /*AddressSpace*/ 0, CostKind);
1245 auto LT = getTypeLegalizationCost(DataTy);
1246 SmallVector<unsigned, 4> Opcodes{RISCV::VSETVLI};
1247 if (VariableMask)
1248 Opcodes.push_back(RISCV::VCPOP_M);
1249 if (Opcode == Instruction::Store)
1250 Opcodes.append({RISCV::VCOMPRESS_VM});
1251 else
1252 Opcodes.append({RISCV::VSETIVLI, RISCV::VIOTA_M, RISCV::VRGATHER_VV});
1253 return MemOpCost +
1254 LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1255}
1256
1260
1261 unsigned Opcode = MICA.getID() == Intrinsic::experimental_vp_strided_load
1262 ? Instruction::Load
1263 : Instruction::Store;
1264
1265 Type *DataTy = MICA.getDataType();
1266 Align Alignment = MICA.getAlignment();
1267 const Instruction *I = MICA.getInst();
1268
1269 if (!isLegalStridedLoadStore(DataTy, Alignment))
1271
1273 return TTI::TCC_Basic;
1274
1275 // Cost is proportional to the number of memory operations implied. For
1276 // scalable vectors, we use an estimate on that number since we don't
1277 // know exactly what VL will be.
1278 // FIXME: This will overcost for i64 on rv32 with +zve64x.
1279 auto &VTy = *cast<VectorType>(DataTy);
1280 InstructionCost MemOpCost =
1281 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
1282 {TTI::OK_AnyValue, TTI::OP_None}, I);
1283 unsigned NumLoads = getEstimatedVLFor(&VTy);
1284 return NumLoads * MemOpCost;
1285}
1286
1289 // FIXME: This is a property of the default vector convention, not
1290 // all possible calling conventions. Fixing that will require
1291 // some TTI API and SLP rework.
1294 for (auto *Ty : Tys) {
1295 if (!Ty->isVectorTy())
1296 continue;
1297 Align A = DL.getPrefTypeAlign(Ty);
1298 Cost += getMemoryOpCost(Instruction::Store, Ty, A, 0, CostKind) +
1299 getMemoryOpCost(Instruction::Load, Ty, A, 0, CostKind);
1300 }
1301 return Cost;
1302}
1303
1304// Currently, these represent both throughput and codesize costs
1305// for the respective intrinsics. The costs in this table are simply
1306// instruction counts with the following adjustments made:
1307// * One vsetvli is considered free.
1309 {Intrinsic::floor, MVT::f32, 9},
1310 {Intrinsic::floor, MVT::f64, 9},
1311 {Intrinsic::ceil, MVT::f32, 9},
1312 {Intrinsic::ceil, MVT::f64, 9},
1313 {Intrinsic::trunc, MVT::f32, 7},
1314 {Intrinsic::trunc, MVT::f64, 7},
1315 {Intrinsic::round, MVT::f32, 9},
1316 {Intrinsic::round, MVT::f64, 9},
1317 {Intrinsic::roundeven, MVT::f32, 9},
1318 {Intrinsic::roundeven, MVT::f64, 9},
1319 {Intrinsic::rint, MVT::f32, 7},
1320 {Intrinsic::rint, MVT::f64, 7},
1321 {Intrinsic::nearbyint, MVT::f32, 9},
1322 {Intrinsic::nearbyint, MVT::f64, 9},
1323 {Intrinsic::bswap, MVT::i16, 3},
1324 {Intrinsic::bswap, MVT::i32, 12},
1325 {Intrinsic::bswap, MVT::i64, 31},
1326 {Intrinsic::vp_bswap, MVT::i16, 3},
1327 {Intrinsic::vp_bswap, MVT::i32, 12},
1328 {Intrinsic::vp_bswap, MVT::i64, 31},
1329 {Intrinsic::vp_fshl, MVT::i8, 7},
1330 {Intrinsic::vp_fshl, MVT::i16, 7},
1331 {Intrinsic::vp_fshl, MVT::i32, 7},
1332 {Intrinsic::vp_fshl, MVT::i64, 7},
1333 {Intrinsic::vp_fshr, MVT::i8, 7},
1334 {Intrinsic::vp_fshr, MVT::i16, 7},
1335 {Intrinsic::vp_fshr, MVT::i32, 7},
1336 {Intrinsic::vp_fshr, MVT::i64, 7},
1337 {Intrinsic::bitreverse, MVT::i8, 17},
1338 {Intrinsic::bitreverse, MVT::i16, 24},
1339 {Intrinsic::bitreverse, MVT::i32, 33},
1340 {Intrinsic::bitreverse, MVT::i64, 52},
1341 {Intrinsic::vp_bitreverse, MVT::i8, 17},
1342 {Intrinsic::vp_bitreverse, MVT::i16, 24},
1343 {Intrinsic::vp_bitreverse, MVT::i32, 33},
1344 {Intrinsic::vp_bitreverse, MVT::i64, 52},
1345 {Intrinsic::ctpop, MVT::i8, 12},
1346 {Intrinsic::ctpop, MVT::i16, 19},
1347 {Intrinsic::ctpop, MVT::i32, 20},
1348 {Intrinsic::ctpop, MVT::i64, 21},
1349 {Intrinsic::ctlz, MVT::i8, 19},
1350 {Intrinsic::ctlz, MVT::i16, 28},
1351 {Intrinsic::ctlz, MVT::i32, 31},
1352 {Intrinsic::ctlz, MVT::i64, 35},
1353 {Intrinsic::cttz, MVT::i8, 16},
1354 {Intrinsic::cttz, MVT::i16, 23},
1355 {Intrinsic::cttz, MVT::i32, 24},
1356 {Intrinsic::cttz, MVT::i64, 25},
1357 {Intrinsic::vp_ctpop, MVT::i8, 12},
1358 {Intrinsic::vp_ctpop, MVT::i16, 19},
1359 {Intrinsic::vp_ctpop, MVT::i32, 20},
1360 {Intrinsic::vp_ctpop, MVT::i64, 21},
1361 {Intrinsic::vp_ctlz, MVT::i8, 19},
1362 {Intrinsic::vp_ctlz, MVT::i16, 28},
1363 {Intrinsic::vp_ctlz, MVT::i32, 31},
1364 {Intrinsic::vp_ctlz, MVT::i64, 35},
1365 {Intrinsic::vp_cttz, MVT::i8, 16},
1366 {Intrinsic::vp_cttz, MVT::i16, 23},
1367 {Intrinsic::vp_cttz, MVT::i32, 24},
1368 {Intrinsic::vp_cttz, MVT::i64, 25},
1369};
1370
1374 auto *RetTy = ICA.getReturnType();
1375 switch (ICA.getID()) {
1376 case Intrinsic::lrint:
1377 case Intrinsic::llrint:
1378 case Intrinsic::lround:
1379 case Intrinsic::llround: {
1380 auto LT = getTypeLegalizationCost(RetTy);
1381 Type *SrcTy = ICA.getArgTypes().front();
1382 auto SrcLT = getTypeLegalizationCost(SrcTy);
1383 if (ST->hasVInstructions() && LT.second.isVector()) {
1385 unsigned SrcEltSz = DL.getTypeSizeInBits(SrcTy->getScalarType());
1386 unsigned DstEltSz = DL.getTypeSizeInBits(RetTy->getScalarType());
1387 if (LT.second.getVectorElementType() == MVT::bf16) {
1388 if (!ST->hasVInstructionsBF16Minimal())
1390 if (DstEltSz == 32)
1391 Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFCVT_X_F_V};
1392 else
1393 Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVT_X_F_V};
1394 } else if (LT.second.getVectorElementType() == MVT::f16 &&
1395 !ST->hasVInstructionsF16()) {
1396 if (!ST->hasVInstructionsF16Minimal())
1398 if (DstEltSz == 32)
1399 Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFCVT_X_F_V};
1400 else
1401 Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_X_F_V};
1402
1403 } else if (SrcEltSz > DstEltSz) {
1404 Ops = {RISCV::VFNCVT_X_F_W};
1405 } else if (SrcEltSz < DstEltSz) {
1406 Ops = {RISCV::VFWCVT_X_F_V};
1407 } else {
1408 Ops = {RISCV::VFCVT_X_F_V};
1409 }
1410
1411 // We need to use the source LMUL in the case of a narrowing op, and the
1412 // destination LMUL otherwise.
1413 if (SrcEltSz > DstEltSz)
1414 return SrcLT.first *
1415 getRISCVInstructionCost(Ops, SrcLT.second, CostKind);
1416 return LT.first * getRISCVInstructionCost(Ops, LT.second, CostKind);
1417 }
1418 break;
1419 }
1420 case Intrinsic::ceil:
1421 case Intrinsic::floor:
1422 case Intrinsic::trunc:
1423 case Intrinsic::rint:
1424 case Intrinsic::round:
1425 case Intrinsic::roundeven: {
1426 // These all use the same code.
1427 auto LT = getTypeLegalizationCost(RetTy);
1428 if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
1429 return LT.first * 8;
1430 break;
1431 }
1432 case Intrinsic::umin:
1433 case Intrinsic::umax:
1434 case Intrinsic::smin:
1435 case Intrinsic::smax: {
1436 auto LT = getTypeLegalizationCost(RetTy);
1437 if (LT.second.isScalarInteger() && ST->hasStdExtZbb())
1438 return LT.first;
1439
1440 if (ST->hasVInstructions() && LT.second.isVector()) {
1441 unsigned Op;
1442 switch (ICA.getID()) {
1443 case Intrinsic::umin:
1444 Op = RISCV::VMINU_VV;
1445 break;
1446 case Intrinsic::umax:
1447 Op = RISCV::VMAXU_VV;
1448 break;
1449 case Intrinsic::smin:
1450 Op = RISCV::VMIN_VV;
1451 break;
1452 case Intrinsic::smax:
1453 Op = RISCV::VMAX_VV;
1454 break;
1455 }
1456 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1457 }
1458 break;
1459 }
1460 case Intrinsic::sadd_sat:
1461 case Intrinsic::ssub_sat:
1462 case Intrinsic::uadd_sat:
1463 case Intrinsic::usub_sat: {
1464 auto LT = getTypeLegalizationCost(RetTy);
1465 if (ST->hasVInstructions() && LT.second.isVector()) {
1466 unsigned Op;
1467 switch (ICA.getID()) {
1468 case Intrinsic::sadd_sat:
1469 Op = RISCV::VSADD_VV;
1470 break;
1471 case Intrinsic::ssub_sat:
1472 Op = RISCV::VSSUBU_VV;
1473 break;
1474 case Intrinsic::uadd_sat:
1475 Op = RISCV::VSADDU_VV;
1476 break;
1477 case Intrinsic::usub_sat:
1478 Op = RISCV::VSSUBU_VV;
1479 break;
1480 }
1481 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1482 }
1483 break;
1484 }
1485 case Intrinsic::fma:
1486 case Intrinsic::fmuladd: {
1487 // TODO: handle promotion with f16/bf16 with zvfhmin/zvfbfmin
1488 auto LT = getTypeLegalizationCost(RetTy);
1489 if (ST->hasVInstructions() && LT.second.isVector())
1490 return LT.first *
1491 getRISCVInstructionCost(RISCV::VFMADD_VV, LT.second, CostKind);
1492 break;
1493 }
1494 case Intrinsic::fabs: {
1495 auto LT = getTypeLegalizationCost(RetTy);
1496 if (ST->hasVInstructions() && LT.second.isVector()) {
1497 // lui a0, 8
1498 // addi a0, a0, -1
1499 // vsetvli a1, zero, e16, m1, ta, ma
1500 // vand.vx v8, v8, a0
1501 // f16 with zvfhmin and bf16 with zvfhbmin
1502 if (LT.second.getVectorElementType() == MVT::bf16 ||
1503 (LT.second.getVectorElementType() == MVT::f16 &&
1504 !ST->hasVInstructionsF16()))
1505 return LT.first * getRISCVInstructionCost(RISCV::VAND_VX, LT.second,
1506 CostKind) +
1507 2;
1508 else
1509 return LT.first *
1510 getRISCVInstructionCost(RISCV::VFSGNJX_VV, LT.second, CostKind);
1511 }
1512 break;
1513 }
1514 case Intrinsic::sqrt: {
1515 auto LT = getTypeLegalizationCost(RetTy);
1516 if (ST->hasVInstructions() && LT.second.isVector()) {
1519 MVT ConvType = LT.second;
1520 MVT FsqrtType = LT.second;
1521 // f16 with zvfhmin and bf16 with zvfbfmin and the type of nxv32[b]f16
1522 // will be spilt.
1523 if (LT.second.getVectorElementType() == MVT::bf16) {
1524 if (LT.second == MVT::nxv32bf16) {
1525 ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVTBF16_F_F_V,
1526 RISCV::VFNCVTBF16_F_F_W, RISCV::VFNCVTBF16_F_F_W};
1527 FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V};
1528 ConvType = MVT::nxv16f16;
1529 FsqrtType = MVT::nxv16f32;
1530 } else {
1531 ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFNCVTBF16_F_F_W};
1532 FsqrtOp = {RISCV::VFSQRT_V};
1533 FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, FsqrtType);
1534 }
1535 } else if (LT.second.getVectorElementType() == MVT::f16 &&
1536 !ST->hasVInstructionsF16()) {
1537 if (LT.second == MVT::nxv32f16) {
1538 ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_F_F_V,
1539 RISCV::VFNCVT_F_F_W, RISCV::VFNCVT_F_F_W};
1540 FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V};
1541 ConvType = MVT::nxv16f16;
1542 FsqrtType = MVT::nxv16f32;
1543 } else {
1544 ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFNCVT_F_F_W};
1545 FsqrtOp = {RISCV::VFSQRT_V};
1546 FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, FsqrtType);
1547 }
1548 } else {
1549 FsqrtOp = {RISCV::VFSQRT_V};
1550 }
1551
1552 return LT.first * (getRISCVInstructionCost(FsqrtOp, FsqrtType, CostKind) +
1553 getRISCVInstructionCost(ConvOp, ConvType, CostKind));
1554 }
1555 break;
1556 }
1557 case Intrinsic::cttz:
1558 case Intrinsic::ctlz:
1559 case Intrinsic::ctpop: {
1560 auto LT = getTypeLegalizationCost(RetTy);
1561 if (ST->hasStdExtZvbb() && LT.second.isVector()) {
1562 unsigned Op;
1563 switch (ICA.getID()) {
1564 case Intrinsic::cttz:
1565 Op = RISCV::VCTZ_V;
1566 break;
1567 case Intrinsic::ctlz:
1568 Op = RISCV::VCLZ_V;
1569 break;
1570 case Intrinsic::ctpop:
1571 Op = RISCV::VCPOP_V;
1572 break;
1573 }
1574 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1575 }
1576 break;
1577 }
1578 case Intrinsic::abs: {
1579 auto LT = getTypeLegalizationCost(RetTy);
1580 if (ST->hasVInstructions() && LT.second.isVector()) {
1581 // vabs.v v10, v8
1582 if (ST->hasStdExtZvabd())
1583 return LT.first *
1584 getRISCVInstructionCost({RISCV::VABS_V}, LT.second, CostKind);
1585
1586 // vrsub.vi v10, v8, 0
1587 // vmax.vv v8, v8, v10
1588 return LT.first *
1589 getRISCVInstructionCost({RISCV::VRSUB_VI, RISCV::VMAX_VV},
1590 LT.second, CostKind);
1591 }
1592 break;
1593 }
1594 case Intrinsic::fshl:
1595 case Intrinsic::fshr: {
1596 if (ICA.getArgs().empty())
1597 break;
1598
1599 // Funnel-shifts are ROTL/ROTR when the first and second operand are equal.
1600 // When Zbb/Zbkb is enabled we can use a single ROL(W)/ROR(I)(W)
1601 // instruction.
1602 if ((ST->hasStdExtZbb() || ST->hasStdExtZbkb()) && RetTy->isIntegerTy() &&
1603 ICA.getArgs()[0] == ICA.getArgs()[1] &&
1604 (RetTy->getIntegerBitWidth() == 32 ||
1605 RetTy->getIntegerBitWidth() == 64) &&
1606 RetTy->getIntegerBitWidth() <= ST->getXLen()) {
1607 return 1;
1608 }
1609 break;
1610 }
1611 case Intrinsic::get_active_lane_mask: {
1612 if (ST->hasVInstructions()) {
1613 Type *ExpRetTy = VectorType::get(
1614 ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
1615 auto LT = getTypeLegalizationCost(ExpRetTy);
1616
1617 // vid.v v8 // considered hoisted
1618 // vsaddu.vx v8, v8, a0
1619 // vmsltu.vx v0, v8, a1
1620 return LT.first *
1621 getRISCVInstructionCost({RISCV::VSADDU_VX, RISCV::VMSLTU_VX},
1622 LT.second, CostKind);
1623 }
1624 break;
1625 }
1626 // TODO: add more intrinsic
1627 case Intrinsic::stepvector: {
1628 auto LT = getTypeLegalizationCost(RetTy);
1629 // Legalisation of illegal types involves an `index' instruction plus
1630 // (LT.first - 1) vector adds.
1631 if (ST->hasVInstructions())
1632 return getRISCVInstructionCost(RISCV::VID_V, LT.second, CostKind) +
1633 (LT.first - 1) *
1634 getRISCVInstructionCost(RISCV::VADD_VX, LT.second, CostKind);
1635 return 1 + (LT.first - 1);
1636 }
1637 case Intrinsic::vector_splice_left:
1638 case Intrinsic::vector_splice_right: {
1639 auto LT = getTypeLegalizationCost(RetTy);
1640 // Constant offsets fall through to getShuffleCost.
1641 if (!ICA.isTypeBasedOnly() && isa<ConstantInt>(ICA.getArgs()[2]))
1642 break;
1643 if (ST->hasVInstructions() && LT.second.isVector()) {
1644 return LT.first *
1645 getRISCVInstructionCost({RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX},
1646 LT.second, CostKind);
1647 }
1648 break;
1649 }
1650 case Intrinsic::experimental_cttz_elts: {
1651 Type *ArgTy = ICA.getArgTypes()[0];
1652 EVT ArgType = TLI->getValueType(DL, ArgTy, true);
1653 if (getTLI()->shouldExpandCttzElements(ArgType))
1654 break;
1655 InstructionCost Cost = getRISCVInstructionCost(
1656 RISCV::VFIRST_M, getTypeLegalizationCost(ArgTy).second, CostKind);
1657
1658 // If zero_is_poison is false, then we will generate additional
1659 // cmp + select instructions to convert -1 to EVL.
1660 Type *BoolTy = Type::getInt1Ty(RetTy->getContext());
1661 if (ICA.getArgs().size() > 1 &&
1662 cast<ConstantInt>(ICA.getArgs()[1])->isZero())
1663 Cost += getCmpSelInstrCost(Instruction::ICmp, BoolTy, RetTy,
1665 getCmpSelInstrCost(Instruction::Select, RetTy, BoolTy,
1667
1668 return Cost;
1669 }
1670 case Intrinsic::experimental_vp_splice: {
1671 // To support type-based query from vectorizer, set the index to 0.
1672 // Note that index only change the cost from vslide.vx to vslide.vi and in
1673 // current implementations they have same costs.
1675 cast<VectorType>(ICA.getArgTypes()[0]), {}, CostKind,
1677 }
1678 case Intrinsic::fptoui_sat:
1679 case Intrinsic::fptosi_sat: {
1681 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
1682 Type *SrcTy = ICA.getArgTypes()[0];
1683
1684 auto SrcLT = getTypeLegalizationCost(SrcTy);
1685 auto DstLT = getTypeLegalizationCost(RetTy);
1686 if (!SrcTy->isVectorTy())
1687 break;
1688
1689 if (!SrcLT.first.isValid() || !DstLT.first.isValid())
1691
1692 Cost +=
1693 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
1694 RetTy, SrcTy, TTI::CastContextHint::None, CostKind);
1695
1696 // Handle NaN.
1697 // vmfne v0, v8, v8 # If v8[i] is NaN set v0[i] to 1.
1698 // vmerge.vim v8, v8, 0, v0 # Convert NaN to 0.
1699 Type *CondTy = RetTy->getWithNewBitWidth(1);
1700 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, SrcTy, CondTy,
1702 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1704 return Cost;
1705 }
1706 case Intrinsic::experimental_vector_extract_last_active: {
1707 auto *ValTy = cast<VectorType>(ICA.getArgTypes()[0]);
1708 auto *MaskTy = cast<VectorType>(ICA.getArgTypes()[1]);
1709
1710 auto ValLT = getTypeLegalizationCost(ValTy);
1711 auto MaskLT = getTypeLegalizationCost(MaskTy);
1712
1713 // TODO: Return cheaper cost when the entire lane is inactive.
1714 // The expected asm sequence is:
1715 // vcpop.m a0, v0
1716 // beqz a0, exit # Return passthru when the entire lane is inactive.
1717 // vid v10, v0.t
1718 // vredmaxu.vs v10, v10, v10
1719 // vmv.x.s a0, v10
1720 // zext.b a0, a0
1721 // vslidedown.vx v8, v8, a0
1722 // vmv.x.s a0, v8
1723 // exit:
1724 // ...
1725
1726 // Find a suitable type for a stepvector.
1727 ConstantRange VScaleRange(APInt(64, 1), APInt::getZero(64));
1728 unsigned EltWidth = getTLI()->getBitWidthForCttzElements(
1729 MaskTy->getScalarType(), MaskTy->getElementCount(),
1730 /*ZeroIsPoison=*/true, &VScaleRange);
1731 EltWidth = std::max(EltWidth, MaskTy->getScalarSizeInBits());
1732 Type *StepTy = Type::getIntNTy(MaskTy->getContext(), EltWidth);
1733 auto *StepVecTy = VectorType::get(StepTy, ValTy->getElementCount());
1734 auto StepLT = getTypeLegalizationCost(StepVecTy);
1736 unsigned Opcodes[] = {RISCV::VID_V, RISCV::VREDMAXU_VS, RISCV::VMV_X_S};
1737
1738 Cost += MaskLT.first *
1739 getRISCVInstructionCost(RISCV::VCPOP_M, MaskLT.second, CostKind);
1740 Cost += getCFInstrCost(Instruction::Br, CostKind, nullptr);
1741 Cost += StepLT.first *
1742 getRISCVInstructionCost(Opcodes, StepLT.second, CostKind);
1743 Cost += getCastInstrCost(Instruction::ZExt,
1744 Type::getInt64Ty(ValTy->getContext()), StepTy,
1746 Cost += ValLT.first *
1747 getRISCVInstructionCost({RISCV::VSLIDEDOWN_VI, RISCV::VMV_X_S},
1748 ValLT.second, CostKind);
1749 return Cost;
1750 }
1751 }
1752
1753 if (ST->hasVInstructions() && RetTy->isVectorTy()) {
1754 if (auto LT = getTypeLegalizationCost(RetTy);
1755 LT.second.isVector()) {
1756 MVT EltTy = LT.second.getVectorElementType();
1757 if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
1758 ICA.getID(), EltTy))
1759 return LT.first * Entry->Cost;
1760 }
1761 }
1762
1764}
1765
1768 const SCEV *Ptr,
1770 // Address computations for vector indexed load/store likely require an offset
1771 // and/or scaling.
1772 if (ST->hasVInstructions() && PtrTy->isVectorTy())
1773 return getArithmeticInstrCost(Instruction::Add, PtrTy, CostKind);
1774
1775 return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind);
1776}
1777
1779 Type *Src,
1782 const Instruction *I) const {
1783 bool IsVectorType = isa<VectorType>(Dst) && isa<VectorType>(Src);
1784 if (!IsVectorType)
1785 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1786
1787 // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
1788 // For now, skip all fixed vector cost analysis when P extension is available
1789 // to avoid crashes in getMinRVVVectorSizeInBits()
1790 if (ST->hasStdExtP() &&
1792 return 1; // Treat as single instruction cost for now
1793 }
1794
1795 // FIXME: Need to compute legalizing cost for illegal types. The current
1796 // code handles only legal types and those which can be trivially
1797 // promoted to legal.
1798 if (!ST->hasVInstructions() || Src->getScalarSizeInBits() > ST->getELen() ||
1799 Dst->getScalarSizeInBits() > ST->getELen())
1800 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1801
1802 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1803 assert(ISD && "Invalid opcode");
1804 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
1805 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);
1806
1807 // Handle i1 source and dest cases *before* calling logic in BasicTTI.
1808 // The shared implementation doesn't model vector widening during legalization
1809 // and instead assumes scalarization. In order to scalarize an <N x i1>
1810 // vector, we need to extend/trunc to/from i8. If we don't special case
1811 // this, we can get an infinite recursion cycle.
1812 switch (ISD) {
1813 default:
1814 break;
1815 case ISD::SIGN_EXTEND:
1816 case ISD::ZERO_EXTEND:
1817 if (Src->getScalarSizeInBits() == 1) {
1818 // We do not use vsext/vzext to extend from mask vector.
1819 // Instead we use the following instructions to extend from mask vector:
1820 // vmv.v.i v8, 0
1821 // vmerge.vim v8, v8, -1, v0 (repeated per split)
1822 return getRISCVInstructionCost(RISCV::VMV_V_I, DstLT.second, CostKind) +
1823 DstLT.first * getRISCVInstructionCost(RISCV::VMERGE_VIM,
1824 DstLT.second, CostKind) +
1825 DstLT.first - 1;
1826 }
1827 break;
1828 case ISD::TRUNCATE:
1829 if (Dst->getScalarSizeInBits() == 1) {
1830 // We do not use several vncvt to truncate to mask vector. So we could
1831 // not use PowDiff to calculate it.
1832 // Instead we use the following instructions to truncate to mask vector:
1833 // vand.vi v8, v8, 1
1834 // vmsne.vi v0, v8, 0
1835 return SrcLT.first *
1836 getRISCVInstructionCost({RISCV::VAND_VI, RISCV::VMSNE_VI},
1837 SrcLT.second, CostKind) +
1838 SrcLT.first - 1;
1839 }
1840 break;
1841 };
1842
1843 // Our actual lowering for the case where a wider legal type is available
1844 // uses promotion to the wider type. This is reflected in the result of
1845 // getTypeLegalizationCost, but BasicTTI assumes the widened cases are
1846 // scalarized if the legalized Src and Dst are not equal sized.
1847 const DataLayout &DL = this->getDataLayout();
1848 if (!SrcLT.second.isVector() || !DstLT.second.isVector() ||
1849 !SrcLT.first.isValid() || !DstLT.first.isValid() ||
1850 !TypeSize::isKnownLE(DL.getTypeSizeInBits(Src),
1851 SrcLT.second.getSizeInBits()) ||
1852 !TypeSize::isKnownLE(DL.getTypeSizeInBits(Dst),
1853 DstLT.second.getSizeInBits()) ||
1854 SrcLT.first > 1 || DstLT.first > 1)
1855 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1856
1857 // The split cost is handled by the base getCastInstrCost
1858 assert((SrcLT.first == 1) && (DstLT.first == 1) && "Illegal type");
1859
1860 int PowDiff = (int)Log2_32(DstLT.second.getScalarSizeInBits()) -
1861 (int)Log2_32(SrcLT.second.getScalarSizeInBits());
1862 switch (ISD) {
1863 case ISD::SIGN_EXTEND:
1864 case ISD::ZERO_EXTEND: {
1865 if ((PowDiff < 1) || (PowDiff > 3))
1866 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1867 unsigned SExtOp[] = {RISCV::VSEXT_VF2, RISCV::VSEXT_VF4, RISCV::VSEXT_VF8};
1868 unsigned ZExtOp[] = {RISCV::VZEXT_VF2, RISCV::VZEXT_VF4, RISCV::VZEXT_VF8};
1869 unsigned Op =
1870 (ISD == ISD::SIGN_EXTEND) ? SExtOp[PowDiff - 1] : ZExtOp[PowDiff - 1];
1871 return getRISCVInstructionCost(Op, DstLT.second, CostKind);
1872 }
1873 case ISD::TRUNCATE:
1874 case ISD::FP_EXTEND:
1875 case ISD::FP_ROUND: {
1876 // Counts of narrow/widen instructions.
1877 unsigned SrcEltSize = SrcLT.second.getScalarSizeInBits();
1878 unsigned DstEltSize = DstLT.second.getScalarSizeInBits();
1879
1880 unsigned Op = (ISD == ISD::TRUNCATE) ? RISCV::VNSRL_WI
1881 : (ISD == ISD::FP_EXTEND) ? RISCV::VFWCVT_F_F_V
1882 : RISCV::VFNCVT_F_F_W;
1884 for (; SrcEltSize != DstEltSize;) {
1885 MVT ElementMVT = (ISD == ISD::TRUNCATE)
1886 ? MVT::getIntegerVT(DstEltSize)
1887 : MVT::getFloatingPointVT(DstEltSize);
1888 MVT DstMVT = DstLT.second.changeVectorElementType(ElementMVT);
1889 DstEltSize =
1890 (DstEltSize > SrcEltSize) ? DstEltSize >> 1 : DstEltSize << 1;
1891 Cost += getRISCVInstructionCost(Op, DstMVT, CostKind);
1892 }
1893 return Cost;
1894 }
1895 case ISD::FP_TO_SINT:
1896 case ISD::FP_TO_UINT: {
1897 unsigned IsSigned = ISD == ISD::FP_TO_SINT;
1898 unsigned FCVT = IsSigned ? RISCV::VFCVT_RTZ_X_F_V : RISCV::VFCVT_RTZ_XU_F_V;
1899 unsigned FWCVT =
1900 IsSigned ? RISCV::VFWCVT_RTZ_X_F_V : RISCV::VFWCVT_RTZ_XU_F_V;
1901 unsigned FNCVT =
1902 IsSigned ? RISCV::VFNCVT_RTZ_X_F_W : RISCV::VFNCVT_RTZ_XU_F_W;
1903 unsigned SrcEltSize = Src->getScalarSizeInBits();
1904 unsigned DstEltSize = Dst->getScalarSizeInBits();
1906 if ((SrcEltSize == 16) &&
1907 (!ST->hasVInstructionsF16() || ((DstEltSize / 2) > SrcEltSize))) {
1908 // If the target only supports zvfhmin or it is fp16-to-i64 conversion
1909 // pre-widening to f32 and then convert f32 to integer
1910 VectorType *VecF32Ty =
1911 VectorType::get(Type::getFloatTy(Dst->getContext()),
1912 cast<VectorType>(Dst)->getElementCount());
1913 std::pair<InstructionCost, MVT> VecF32LT =
1914 getTypeLegalizationCost(VecF32Ty);
1915 Cost +=
1916 VecF32LT.first * getRISCVInstructionCost(RISCV::VFWCVT_F_F_V,
1917 VecF32LT.second, CostKind);
1918 Cost += getCastInstrCost(Opcode, Dst, VecF32Ty, CCH, CostKind, I);
1919 return Cost;
1920 }
1921 if (DstEltSize == SrcEltSize)
1922 Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
1923 else if (DstEltSize > SrcEltSize)
1924 Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
1925 else { // (SrcEltSize > DstEltSize)
1926 // First do a narrowing conversion to an integer half the size, then
1927 // truncate if needed.
1928 MVT ElementVT = MVT::getIntegerVT(SrcEltSize / 2);
1929 MVT VecVT = DstLT.second.changeVectorElementType(ElementVT);
1930 Cost += getRISCVInstructionCost(FNCVT, VecVT, CostKind);
1931 if ((SrcEltSize / 2) > DstEltSize) {
1932 Type *VecTy = EVT(VecVT).getTypeForEVT(Dst->getContext());
1933 Cost +=
1934 getCastInstrCost(Instruction::Trunc, Dst, VecTy, CCH, CostKind, I);
1935 }
1936 }
1937 return Cost;
1938 }
1939 case ISD::SINT_TO_FP:
1940 case ISD::UINT_TO_FP: {
1941 unsigned IsSigned = ISD == ISD::SINT_TO_FP;
1942 unsigned FCVT = IsSigned ? RISCV::VFCVT_F_X_V : RISCV::VFCVT_F_XU_V;
1943 unsigned FWCVT = IsSigned ? RISCV::VFWCVT_F_X_V : RISCV::VFWCVT_F_XU_V;
1944 unsigned FNCVT = IsSigned ? RISCV::VFNCVT_F_X_W : RISCV::VFNCVT_F_XU_W;
1945 unsigned SrcEltSize = Src->getScalarSizeInBits();
1946 unsigned DstEltSize = Dst->getScalarSizeInBits();
1947
1949 if ((DstEltSize == 16) &&
1950 (!ST->hasVInstructionsF16() || ((SrcEltSize / 2) > DstEltSize))) {
1951 // If the target only supports zvfhmin or it is i64-to-fp16 conversion
1952 // it is converted to f32 and then converted to f16
1953 VectorType *VecF32Ty =
1954 VectorType::get(Type::getFloatTy(Dst->getContext()),
1955 cast<VectorType>(Dst)->getElementCount());
1956 std::pair<InstructionCost, MVT> VecF32LT =
1957 getTypeLegalizationCost(VecF32Ty);
1958 Cost += getCastInstrCost(Opcode, VecF32Ty, Src, CCH, CostKind, I);
1959 Cost += VecF32LT.first * getRISCVInstructionCost(RISCV::VFNCVT_F_F_W,
1960 DstLT.second, CostKind);
1961 return Cost;
1962 }
1963
1964 if (DstEltSize == SrcEltSize)
1965 Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
1966 else if (DstEltSize > SrcEltSize) {
1967 if ((DstEltSize / 2) > SrcEltSize) {
1968 VectorType *VecTy =
1969 VectorType::get(IntegerType::get(Dst->getContext(), DstEltSize / 2),
1970 cast<VectorType>(Dst)->getElementCount());
1971 unsigned Op = IsSigned ? Instruction::SExt : Instruction::ZExt;
1972 Cost += getCastInstrCost(Op, VecTy, Src, CCH, CostKind, I);
1973 }
1974 Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
1975 } else
1976 Cost += getRISCVInstructionCost(FNCVT, DstLT.second, CostKind);
1977 return Cost;
1978 }
1979 }
1980 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1981}
1982
1983unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) const {
1984 if (isa<ScalableVectorType>(Ty)) {
1985 const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
1986 const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
1987 const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
1988 return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
1989 }
1990 return cast<FixedVectorType>(Ty)->getNumElements();
1991}
1992
1995 FastMathFlags FMF,
1997 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1998 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1999
2000 // Skip if scalar size of Ty is bigger than ELEN.
2001 if (Ty->getScalarSizeInBits() > ST->getELen())
2002 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
2003
2004 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2005 if (Ty->getElementType()->isIntegerTy(1)) {
2006 // SelectionDAGBuilder does following transforms:
2007 // vector_reduce_{smin,umax}(<n x i1>) --> vector_reduce_or(<n x i1>)
2008 // vector_reduce_{smax,umin}(<n x i1>) --> vector_reduce_and(<n x i1>)
2009 if (IID == Intrinsic::umax || IID == Intrinsic::smin)
2010 return getArithmeticReductionCost(Instruction::Or, Ty, FMF, CostKind);
2011 else
2012 return getArithmeticReductionCost(Instruction::And, Ty, FMF, CostKind);
2013 }
2014
2015 if (IID == Intrinsic::maximum || IID == Intrinsic::minimum) {
2017 InstructionCost ExtraCost = 0;
2018 switch (IID) {
2019 case Intrinsic::maximum:
2020 if (FMF.noNaNs()) {
2021 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
2022 } else {
2023 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMAX_VS,
2024 RISCV::VFMV_F_S};
2025 // Cost of Canonical Nan + branch
2026 // lui a0, 523264
2027 // fmv.w.x fa0, a0
2028 Type *DstTy = Ty->getScalarType();
2029 const unsigned EltTyBits = DstTy->getScalarSizeInBits();
2030 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
2031 ExtraCost = 1 +
2032 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
2034 getCFInstrCost(Instruction::Br, CostKind);
2035 }
2036 break;
2037
2038 case Intrinsic::minimum:
2039 if (FMF.noNaNs()) {
2040 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
2041 } else {
2042 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMIN_VS,
2043 RISCV::VFMV_F_S};
2044 // Cost of Canonical Nan + branch
2045 // lui a0, 523264
2046 // fmv.w.x fa0, a0
2047 Type *DstTy = Ty->getScalarType();
2048 const unsigned EltTyBits = DL.getTypeSizeInBits(DstTy);
2049 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
2050 ExtraCost = 1 +
2051 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
2053 getCFInstrCost(Instruction::Br, CostKind);
2054 }
2055 break;
2056 }
2057 return ExtraCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2058 }
2059
2060 // IR Reduction is composed by one rvv reduction instruction and vmv
2061 unsigned SplitOp;
2063 switch (IID) {
2064 default:
2065 llvm_unreachable("Unsupported intrinsic");
2066 case Intrinsic::smax:
2067 SplitOp = RISCV::VMAX_VV;
2068 Opcodes = {RISCV::VREDMAX_VS, RISCV::VMV_X_S};
2069 break;
2070 case Intrinsic::smin:
2071 SplitOp = RISCV::VMIN_VV;
2072 Opcodes = {RISCV::VREDMIN_VS, RISCV::VMV_X_S};
2073 break;
2074 case Intrinsic::umax:
2075 SplitOp = RISCV::VMAXU_VV;
2076 Opcodes = {RISCV::VREDMAXU_VS, RISCV::VMV_X_S};
2077 break;
2078 case Intrinsic::umin:
2079 SplitOp = RISCV::VMINU_VV;
2080 Opcodes = {RISCV::VREDMINU_VS, RISCV::VMV_X_S};
2081 break;
2082 case Intrinsic::maxnum:
2083 SplitOp = RISCV::VFMAX_VV;
2084 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
2085 break;
2086 case Intrinsic::minnum:
2087 SplitOp = RISCV::VFMIN_VV;
2088 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
2089 break;
2090 }
2091 // Add a cost for data larger than LMUL8
2092 InstructionCost SplitCost =
2093 (LT.first > 1) ? (LT.first - 1) *
2094 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
2095 : 0;
2096 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2097}
2098
2101 std::optional<FastMathFlags> FMF,
2103 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
2104 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2105
2106 // Skip if scalar size of Ty is bigger than ELEN.
2107 if (Ty->getScalarSizeInBits() > ST->getELen())
2108 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2109
2110 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2111 assert(ISD && "Invalid opcode");
2112
2113 if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
2114 ISD != ISD::FADD)
2115 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2116
2117 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2118 Type *ElementTy = Ty->getElementType();
2119 if (ElementTy->isIntegerTy(1)) {
2120 // Example sequences:
2121 // vfirst.m a0, v0
2122 // seqz a0, a0
2123 if (LT.second == MVT::v1i1)
2124 return getRISCVInstructionCost(RISCV::VFIRST_M, LT.second, CostKind) +
2125 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
2127
2128 if (ISD == ISD::AND) {
2129 // Example sequences:
2130 // vmand.mm v8, v9, v8 ; needed every time type is split
2131 // vmnot.m v8, v0 ; alias for vmnand
2132 // vcpop.m a0, v8
2133 // seqz a0, a0
2134
2135 // See the discussion: https://github.com/llvm/llvm-project/pull/119160
2136 // For LMUL <= 8, there is no splitting,
2137 // the sequences are vmnot, vcpop and seqz.
2138 // When LMUL > 8 and split = 1,
2139 // the sequences are vmnand, vcpop and seqz.
2140 // When LMUL > 8 and split > 1,
2141 // the sequences are (LT.first-2) * vmand, vmnand, vcpop and seqz.
2142 return ((LT.first > 2) ? (LT.first - 2) : 0) *
2143 getRISCVInstructionCost(RISCV::VMAND_MM, LT.second, CostKind) +
2144 getRISCVInstructionCost(RISCV::VMNAND_MM, LT.second, CostKind) +
2145 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
2146 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
2148 } else if (ISD == ISD::XOR || ISD == ISD::ADD) {
2149 // Example sequences:
2150 // vsetvli a0, zero, e8, mf8, ta, ma
2151 // vmxor.mm v8, v0, v8 ; needed every time type is split
2152 // vcpop.m a0, v8
2153 // andi a0, a0, 1
2154 return (LT.first - 1) *
2155 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind) +
2156 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) + 1;
2157 } else {
2158 assert(ISD == ISD::OR);
2159 // Example sequences:
2160 // vsetvli a0, zero, e8, mf8, ta, ma
2161 // vmor.mm v8, v9, v8 ; needed every time type is split
2162 // vcpop.m a0, v0
2163 // snez a0, a0
2164 return (LT.first - 1) *
2165 getRISCVInstructionCost(RISCV::VMOR_MM, LT.second, CostKind) +
2166 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
2167 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
2169 }
2170 }
2171
2172 // IR Reduction of or/and is composed by one vmv and one rvv reduction
2173 // instruction, and others is composed by two vmv and one rvv reduction
2174 // instruction
2175 unsigned SplitOp;
2177 switch (ISD) {
2178 case ISD::ADD:
2179 SplitOp = RISCV::VADD_VV;
2180 Opcodes = {RISCV::VMV_S_X, RISCV::VREDSUM_VS, RISCV::VMV_X_S};
2181 break;
2182 case ISD::OR:
2183 SplitOp = RISCV::VOR_VV;
2184 Opcodes = {RISCV::VREDOR_VS, RISCV::VMV_X_S};
2185 break;
2186 case ISD::XOR:
2187 SplitOp = RISCV::VXOR_VV;
2188 Opcodes = {RISCV::VMV_S_X, RISCV::VREDXOR_VS, RISCV::VMV_X_S};
2189 break;
2190 case ISD::AND:
2191 SplitOp = RISCV::VAND_VV;
2192 Opcodes = {RISCV::VREDAND_VS, RISCV::VMV_X_S};
2193 break;
2194 case ISD::FADD:
2195 // We can't promote f16/bf16 fadd reductions.
2196 if ((LT.second.getScalarType() == MVT::f16 && !ST->hasVInstructionsF16()) ||
2197 LT.second.getScalarType() == MVT::bf16)
2198 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2200 Opcodes.push_back(RISCV::VFMV_S_F);
2201 for (unsigned i = 0; i < LT.first.getValue(); i++)
2202 Opcodes.push_back(RISCV::VFREDOSUM_VS);
2203 Opcodes.push_back(RISCV::VFMV_F_S);
2204 return getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2205 }
2206 SplitOp = RISCV::VFADD_VV;
2207 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDUSUM_VS, RISCV::VFMV_F_S};
2208 break;
2209 }
2210 // Add a cost for data larger than LMUL8
2211 InstructionCost SplitCost =
2212 (LT.first > 1) ? (LT.first - 1) *
2213 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
2214 : 0;
2215 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2216}
2217
2219 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
2220 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
2221 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
2222 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2223 FMF, CostKind);
2224
2225 // Skip if scalar size of ResTy is bigger than ELEN.
2226 if (ResTy->getScalarSizeInBits() > ST->getELen())
2227 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2228 FMF, CostKind);
2229
2230 if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
2231 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2232 FMF, CostKind);
2233
2234 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2235
2236 if (IsUnsigned && Opcode == Instruction::Add &&
2237 LT.second.isFixedLengthVector() && LT.second.getScalarType() == MVT::i1) {
2238 // Represent vector_reduce_add(ZExt(<n x i1>)) as
2239 // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
2240 return LT.first *
2241 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind);
2242 }
2243
2244 if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
2245 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2246 FMF, CostKind);
2247
2248 return (LT.first - 1) +
2249 getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
2250}
2251
2255 assert(OpInfo.isConstant() && "non constant operand?");
2256 if (!isa<VectorType>(Ty))
2257 // FIXME: We need to account for immediate materialization here, but doing
2258 // a decent job requires more knowledge about the immediate than we
2259 // currently have here.
2260 return 0;
2261
2262 if (OpInfo.isUniform())
2263 // vmv.v.i, vmv.v.x, or vfmv.v.f
2264 // We ignore the cost of the scalar constant materialization to be consistent
2265 // with how we treat scalar constants themselves just above.
2266 return 1;
2267
2268 return getConstantPoolLoadCost(Ty, CostKind);
2269}
2270
2272 Align Alignment,
2273 unsigned AddressSpace,
2275 TTI::OperandValueInfo OpInfo,
2276 const Instruction *I) const {
2277 EVT VT = TLI->getValueType(DL, Src, true);
2278 // Type legalization can't handle structs
2279 if (VT == MVT::Other)
2280 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2281 CostKind, OpInfo, I);
2282
2284 if (Opcode == Instruction::Store && OpInfo.isConstant())
2285 Cost += getStoreImmCost(Src, OpInfo, CostKind);
2286
2287 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
2288
2289 InstructionCost BaseCost = [&]() {
2290 InstructionCost Cost = LT.first;
2292 return Cost;
2293
2294 // Our actual lowering for the case where a wider legal type is available
2295 // uses the a VL predicated load on the wider type. This is reflected in
2296 // the result of getTypeLegalizationCost, but BasicTTI assumes the
2297 // widened cases are scalarized.
2298 const DataLayout &DL = this->getDataLayout();
2299 if (Src->isVectorTy() && LT.second.isVector() &&
2300 TypeSize::isKnownLT(DL.getTypeStoreSizeInBits(Src),
2301 LT.second.getSizeInBits()))
2302 return Cost;
2303
2304 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2305 CostKind, OpInfo, I);
2306 }();
2307
2308 // Assume memory ops cost scale with the number of vector registers
2309 // possible accessed by the instruction. Note that BasicTTI already
2310 // handles the LT.first term for us.
2311 if (ST->hasVInstructions() && LT.second.isVector() &&
2313 BaseCost *= TLI->getLMULCost(LT.second);
2314 return Cost + BaseCost;
2315}
2316
2318 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
2320 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
2322 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2323 Op1Info, Op2Info, I);
2324
2325 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
2326 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2327 Op1Info, Op2Info, I);
2328
2329 // Skip if scalar size of ValTy is bigger than ELEN.
2330 if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen())
2331 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2332 Op1Info, Op2Info, I);
2333
2334 auto GetConstantMatCost =
2335 [&](TTI::OperandValueInfo OpInfo) -> InstructionCost {
2336 if (OpInfo.isUniform())
2337 // We return 0 we currently ignore the cost of materializing scalar
2338 // constants in GPRs.
2339 return 0;
2340
2341 return getConstantPoolLoadCost(ValTy, CostKind);
2342 };
2343
2344 InstructionCost ConstantMatCost;
2345 if (Op1Info.isConstant())
2346 ConstantMatCost += GetConstantMatCost(Op1Info);
2347 if (Op2Info.isConstant())
2348 ConstantMatCost += GetConstantMatCost(Op2Info);
2349
2350 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2351 if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
2352 if (CondTy->isVectorTy()) {
2353 if (ValTy->getScalarSizeInBits() == 1) {
2354 // vmandn.mm v8, v8, v9
2355 // vmand.mm v9, v0, v9
2356 // vmor.mm v0, v9, v8
2357 return ConstantMatCost +
2358 LT.first *
2359 getRISCVInstructionCost(
2360 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
2361 LT.second, CostKind);
2362 }
2363 // vselect and max/min are supported natively.
2364 return ConstantMatCost +
2365 LT.first * getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second,
2366 CostKind);
2367 }
2368
2369 if (ValTy->getScalarSizeInBits() == 1) {
2370 // vmv.v.x v9, a0
2371 // vmsne.vi v9, v9, 0
2372 // vmandn.mm v8, v8, v9
2373 // vmand.mm v9, v0, v9
2374 // vmor.mm v0, v9, v8
2375 MVT InterimVT = LT.second.changeVectorElementType(MVT::i8);
2376 return ConstantMatCost +
2377 LT.first *
2378 getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
2379 InterimVT, CostKind) +
2380 LT.first * getRISCVInstructionCost(
2381 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
2382 LT.second, CostKind);
2383 }
2384
2385 // vmv.v.x v10, a0
2386 // vmsne.vi v0, v10, 0
2387 // vmerge.vvm v8, v9, v8, v0
2388 return ConstantMatCost +
2389 LT.first * getRISCVInstructionCost(
2390 {RISCV::VMV_V_X, RISCV::VMSNE_VI, RISCV::VMERGE_VVM},
2391 LT.second, CostKind);
2392 }
2393
2394 if ((Opcode == Instruction::ICmp) && ValTy->isVectorTy() &&
2395 CmpInst::isIntPredicate(VecPred)) {
2396 // Use VMSLT_VV to represent VMSEQ, VMSNE, VMSLTU, VMSLEU, VMSLT, VMSLE
2397 // provided they incur the same cost across all implementations
2398 return ConstantMatCost + LT.first * getRISCVInstructionCost(RISCV::VMSLT_VV,
2399 LT.second,
2400 CostKind);
2401 }
2402
2403 if ((Opcode == Instruction::FCmp) && ValTy->isVectorTy() &&
2404 CmpInst::isFPPredicate(VecPred)) {
2405
2406 // Use VMXOR_MM and VMXNOR_MM to generate all true/false mask
2407 if ((VecPred == CmpInst::FCMP_FALSE) || (VecPred == CmpInst::FCMP_TRUE))
2408 return ConstantMatCost +
2409 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind);
2410
2411 // If we do not support the input floating point vector type, use the base
2412 // one which will calculate as:
2413 // ScalarizeCost + Num * Cost for fixed vector,
2414 // InvalidCost for scalable vector.
2415 if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
2416 (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
2417 (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
2418 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2419 Op1Info, Op2Info, I);
2420
2421 // Assuming vector fp compare and mask instructions are all the same cost
2422 // until a need arises to differentiate them.
2423 switch (VecPred) {
2424 case CmpInst::FCMP_ONE: // vmflt.vv + vmflt.vv + vmor.mm
2425 case CmpInst::FCMP_ORD: // vmfeq.vv + vmfeq.vv + vmand.mm
2426 case CmpInst::FCMP_UNO: // vmfne.vv + vmfne.vv + vmor.mm
2427 case CmpInst::FCMP_UEQ: // vmflt.vv + vmflt.vv + vmnor.mm
2428 return ConstantMatCost +
2429 LT.first * getRISCVInstructionCost(
2430 {RISCV::VMFLT_VV, RISCV::VMFLT_VV, RISCV::VMOR_MM},
2431 LT.second, CostKind);
2432
2433 case CmpInst::FCMP_UGT: // vmfle.vv + vmnot.m
2434 case CmpInst::FCMP_UGE: // vmflt.vv + vmnot.m
2435 case CmpInst::FCMP_ULT: // vmfle.vv + vmnot.m
2436 case CmpInst::FCMP_ULE: // vmflt.vv + vmnot.m
2437 return ConstantMatCost +
2438 LT.first *
2439 getRISCVInstructionCost({RISCV::VMFLT_VV, RISCV::VMNAND_MM},
2440 LT.second, CostKind);
2441
2442 case CmpInst::FCMP_OEQ: // vmfeq.vv
2443 case CmpInst::FCMP_OGT: // vmflt.vv
2444 case CmpInst::FCMP_OGE: // vmfle.vv
2445 case CmpInst::FCMP_OLT: // vmflt.vv
2446 case CmpInst::FCMP_OLE: // vmfle.vv
2447 case CmpInst::FCMP_UNE: // vmfne.vv
2448 return ConstantMatCost +
2449 LT.first *
2450 getRISCVInstructionCost(RISCV::VMFLT_VV, LT.second, CostKind);
2451 default:
2452 break;
2453 }
2454 }
2455
2456 // With ShortForwardBranchOpt or ConditionalMoveFusion, scalar icmp + select
2457 // instructions will lower to SELECT_CC and lower to PseudoCCMOVGPR which will
2458 // generate a conditional branch + mv. The cost of scalar (icmp + select) will
2459 // be (0 + select instr cost).
2460 if (ST->hasConditionalMoveFusion() && I && isa<ICmpInst>(I) &&
2461 ValTy->isIntegerTy() && !I->user_empty()) {
2462 if (all_of(I->users(), [&](const User *U) {
2463 return match(U, m_Select(m_Specific(I), m_Value(), m_Value())) &&
2464 U->getType()->isIntegerTy() &&
2465 !isa<ConstantData>(U->getOperand(1)) &&
2466 !isa<ConstantData>(U->getOperand(2));
2467 }))
2468 return 0;
2469 }
2470
2471 // TODO: Add cost for scalar type.
2472
2473 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2474 Op1Info, Op2Info, I);
2475}
2476
2479 const Instruction *I) const {
2481 return Opcode == Instruction::PHI ? 0 : 1;
2482 // Branches are assumed to be predicted.
2483 return 0;
2484}
2485
2487 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
2488 const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
2489 assert(Val->isVectorTy() && "This must be a vector type");
2490
2491 // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
2492 // For now, skip all fixed vector cost analysis when P extension is available
2493 // to avoid crashes in getMinRVVVectorSizeInBits()
2494 if (ST->hasStdExtP() && isa<FixedVectorType>(Val)) {
2495 return 1; // Treat as single instruction cost for now
2496 }
2497
2498 if (Opcode != Instruction::ExtractElement &&
2499 Opcode != Instruction::InsertElement)
2500 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1,
2501 VIC);
2502
2503 // Legalize the type.
2504 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
2505
2506 // This type is legalized to a scalar type.
2507 if (!LT.second.isVector()) {
2508 auto *FixedVecTy = cast<FixedVectorType>(Val);
2509 // If Index is a known constant, cost is zero.
2510 if (Index != -1U)
2511 return 0;
2512 // Extract/InsertElement with non-constant index is very costly when
2513 // scalarized; estimate cost of loads/stores sequence via the stack:
2514 // ExtractElement cost: store vector to stack, load scalar;
2515 // InsertElement cost: store vector to stack, store scalar, load vector.
2516 Type *ElemTy = FixedVecTy->getElementType();
2517 auto NumElems = FixedVecTy->getNumElements();
2518 auto Align = DL.getPrefTypeAlign(ElemTy);
2519 InstructionCost LoadCost =
2520 getMemoryOpCost(Instruction::Load, ElemTy, Align, 0, CostKind);
2521 InstructionCost StoreCost =
2522 getMemoryOpCost(Instruction::Store, ElemTy, Align, 0, CostKind);
2523 return Opcode == Instruction::ExtractElement
2524 ? StoreCost * NumElems + LoadCost
2525 : (StoreCost + LoadCost) * NumElems + StoreCost;
2526 }
2527
2528 // For unsupported scalable vector.
2529 if (LT.second.isScalableVector() && !LT.first.isValid())
2530 return LT.first;
2531
2532 // Mask vector extract/insert is expanded via e8.
2533 if (Val->getScalarSizeInBits() == 1) {
2534 VectorType *WideTy =
2536 cast<VectorType>(Val)->getElementCount());
2537 if (Opcode == Instruction::ExtractElement) {
2538 InstructionCost ExtendCost
2539 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
2541 InstructionCost ExtractCost
2542 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
2543 return ExtendCost + ExtractCost;
2544 }
2545 InstructionCost ExtendCost
2546 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
2548 InstructionCost InsertCost
2549 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
2550 InstructionCost TruncCost
2551 = getCastInstrCost(Instruction::Trunc, Val, WideTy,
2553 return ExtendCost + InsertCost + TruncCost;
2554 }
2555
2556
2557 // In RVV, we could use vslidedown + vmv.x.s to extract element from vector
2558 // and vslideup + vmv.s.x to insert element to vector.
2559 unsigned BaseCost = 1;
2560 // When insertelement we should add the index with 1 as the input of vslideup.
2561 unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
2562
2563 if (Index != -1U) {
2564 // The type may be split. For fixed-width vectors we can normalize the
2565 // index to the new type.
2566 if (LT.second.isFixedLengthVector()) {
2567 unsigned Width = LT.second.getVectorNumElements();
2568 Index = Index % Width;
2569 }
2570
2571 // If exact VLEN is known, we will insert/extract into the appropriate
2572 // subvector with no additional subvector insert/extract cost.
2573 if (auto VLEN = ST->getRealVLen()) {
2574 unsigned EltSize = LT.second.getScalarSizeInBits();
2575 unsigned M1Max = *VLEN / EltSize;
2576 Index = Index % M1Max;
2577 }
2578
2579 if (Index == 0)
2580 // We can extract/insert the first element without vslidedown/vslideup.
2581 SlideCost = 0;
2582 else if (ST->hasVendorXRivosVisni() && isUInt<5>(Index) &&
2583 Val->getScalarType()->isIntegerTy())
2584 SlideCost = 0; // With ri.vinsert/ri.vextract there is no slide needed
2585 else if (Opcode == Instruction::InsertElement)
2586 SlideCost = 1; // With a constant index, we do not need to use addi.
2587 }
2588
2589 // When the vector needs to split into multiple register groups and the index
2590 // exceeds single vector register group, we need to insert/extract the element
2591 // via stack.
2592 if (LT.first > 1 &&
2593 ((Index == -1U) || (Index >= LT.second.getVectorMinNumElements() &&
2594 LT.second.isScalableVector()))) {
2595 Type *ScalarType = Val->getScalarType();
2596 Align VecAlign = DL.getPrefTypeAlign(Val);
2597 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
2598 // Extra addi for unknown index.
2599 InstructionCost IdxCost = Index == -1U ? 1 : 0;
2600
2601 // Store all split vectors into stack and load the target element.
2602 if (Opcode == Instruction::ExtractElement)
2603 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
2604 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
2605 CostKind) +
2606 IdxCost;
2607
2608 // Store all split vectors into stack and store the target element and load
2609 // vectors back.
2610 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
2611 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind) +
2612 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
2613 CostKind) +
2614 IdxCost;
2615 }
2616
2617 // Extract i64 in the target that has XLEN=32 need more instruction.
2618 if (Val->getScalarType()->isIntegerTy() &&
2619 ST->getXLen() < Val->getScalarSizeInBits()) {
2620 // For extractelement, we need the following instructions:
2621 // vsetivli zero, 1, e64, m1, ta, mu (not count)
2622 // vslidedown.vx v8, v8, a0
2623 // vmv.x.s a0, v8
2624 // li a1, 32
2625 // vsrl.vx v8, v8, a1
2626 // vmv.x.s a1, v8
2627
2628 // For insertelement, we need the following instructions:
2629 // vsetivli zero, 2, e32, m4, ta, mu (not count)
2630 // vmv.v.i v12, 0
2631 // vslide1up.vx v16, v12, a1
2632 // vslide1up.vx v12, v16, a0
2633 // addi a0, a2, 1
2634 // vsetvli zero, a0, e64, m4, tu, mu (not count)
2635 // vslideup.vx v8, v12, a2
2636
2637 // TODO: should we count these special vsetvlis?
2638 BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
2639 }
2640 return BaseCost + SlideCost;
2641}
2642
2646 unsigned Index) const {
2647 if (isa<FixedVectorType>(Val))
2649 Index);
2650
2651 // TODO: This code replicates what LoopVectorize.cpp used to do when asking
2652 // for the cost of extracting the last lane of a scalable vector. It probably
2653 // needs a more accurate cost.
2654 ElementCount EC = cast<VectorType>(Val)->getElementCount();
2655 assert(Index < EC.getKnownMinValue() && "Unexpected reverse index");
2656 return getVectorInstrCost(Opcode, Val, CostKind,
2657 EC.getKnownMinValue() - 1 - Index, nullptr,
2658 nullptr);
2659}
2660
// RISCVTTIImpl::getArithmeticInstrCost — cost of a scalar/vector arithmetic
// instruction. Falls back to BasicTTIImpl for unsupported cost kinds,
// fixed-length vectors without RVV, and element types wider than ELEN;
// otherwise legalizes the type and prices the op via getRISCVInstructionCost.
// NOTE(review): extraction dropped several lines (doxygen 2661, 2663, 2667,
// 2709, 2716, 2720) — the full signature, the early cost-kind guard condition,
// the TargetLowering action enumerator compared on line 2708, and the CostKind
// arguments to the two getCastInstrCost calls. Diff against upstream before
// editing.
 2662 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
 2664 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
 2665
 2666 // TODO: Handle more cost kinds.
 2668 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
 2669 Args, CxtI);
 2670
 2671 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
 2672 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
 2673 Args, CxtI);
 2674
 2675 // Skip if scalar size of Ty is bigger than ELEN.
 2676 if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELen())
 2677 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
 2678 Args, CxtI);
 2679
 2680 // Legalize the type.
 2681 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
 2682 unsigned ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
 2683
 2684 // TODO: Handle scalar type.
 2685 if (!LT.second.isVector()) {
// Scalar div/rem are priced as "expensive" when the operation is legal or
// promotable; everything else defers to the base implementation.
 2686 static const CostTblEntry DivTbl[]{
 2687 {ISD::UDIV, MVT::i32, TTI::TCC_Expensive},
 2688 {ISD::UDIV, MVT::i64, TTI::TCC_Expensive},
 2689 {ISD::SDIV, MVT::i32, TTI::TCC_Expensive},
 2690 {ISD::SDIV, MVT::i64, TTI::TCC_Expensive},
 2691 {ISD::UREM, MVT::i32, TTI::TCC_Expensive},
 2692 {ISD::UREM, MVT::i64, TTI::TCC_Expensive},
 2693 {ISD::SREM, MVT::i32, TTI::TCC_Expensive},
 2694 {ISD::SREM, MVT::i64, TTI::TCC_Expensive}};
 2695 if (TLI->isOperationLegalOrPromote(ISDOpcode, LT.second))
 2696 if (const auto *Entry = CostTableLookup(DivTbl, ISDOpcode, LT.second))
 2697 return Entry->Cost * LT.first;
 2698
 2699 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
 2700 Args, CxtI);
 2701 }
 2702
 2703 // f16 with zvfhmin and bf16 will be promoted to f32.
 2704 // FIXME: nxv32[b]f16 will be custom lowered and split.
 2705 InstructionCost CastCost = 0;
 2706 if ((LT.second.getVectorElementType() == MVT::f16 ||
 2707 LT.second.getVectorElementType() == MVT::bf16) &&
 2708 TLI->getOperationAction(ISDOpcode, LT.second) ==
 2710 MVT PromotedVT = TLI->getTypeToPromoteTo(ISDOpcode, LT.second);
 2711 Type *PromotedTy = EVT(PromotedVT).getTypeForEVT(Ty->getContext());
 2712 Type *LegalTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
 2713 // Add cost of extending arguments
 2714 CastCost += LT.first * Args.size() *
 2715 getCastInstrCost(Instruction::FPExt, PromotedTy, LegalTy,
 2717 // Add cost of truncating result
 2718 CastCost +=
 2719 LT.first * getCastInstrCost(Instruction::FPTrunc, LegalTy, PromotedTy,
 2721 // Compute cost of op in promoted type
 2722 LT.second = PromotedVT;
 2723 }
 2724
// Constant operands that can be splatted (or folded into a 5-bit immediate)
// are treated as free; others pay a constant-pool load.
 2725 auto getConstantMatCost =
 2726 [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
 2727 if (OpInfo.isUniform() && canSplatOperand(Opcode, Operand))
 2728 // Two sub-cases:
 2729 // * Has a 5 bit immediate operand which can be splatted.
 2730 // * Has a larger immediate which must be materialized in scalar register
 2731 // We return 0 for both as we currently ignore the cost of materializing
 2732 // scalar constants in GPRs.
 2733 return 0;
 2734
 2735 return getConstantPoolLoadCost(Ty, CostKind);
 2736 };
 2737
 2738 // Add the cost of materializing any constant vectors required.
 2739 InstructionCost ConstantMatCost = 0;
 2740 if (Op1Info.isConstant())
 2741 ConstantMatCost += getConstantMatCost(0, Op1Info);
 2742 if (Op2Info.isConstant())
 2743 ConstantMatCost += getConstantMatCost(1, Op2Info);
 2744
// Map the ISD opcode to a representative RVV instruction used for pricing.
 2745 unsigned Op;
 2746 switch (ISDOpcode) {
 2747 case ISD::ADD:
 2748 case ISD::SUB:
 2749 Op = RISCV::VADD_VV;
 2750 break;
 2751 case ISD::SHL:
 2752 case ISD::SRL:
 2753 case ISD::SRA:
 2754 Op = RISCV::VSLL_VV;
 2755 break;
 2756 case ISD::AND:
 2757 case ISD::OR:
 2758 case ISD::XOR:
 2759 Op = (Ty->getScalarSizeInBits() == 1) ? RISCV::VMAND_MM : RISCV::VAND_VV;
 2760 break;
 2761 case ISD::MUL:
 2762 case ISD::MULHS:
 2763 case ISD::MULHU:
 2764 Op = RISCV::VMUL_VV;
 2765 break;
 2766 case ISD::SDIV:
 2767 case ISD::UDIV:
 2768 Op = RISCV::VDIV_VV;
 2769 break;
 2770 case ISD::SREM:
 2771 case ISD::UREM:
 2772 Op = RISCV::VREM_VV;
 2773 break;
 2774 case ISD::FADD:
 2775 case ISD::FSUB:
 2776 Op = RISCV::VFADD_VV;
 2777 break;
 2778 case ISD::FMUL:
 2779 Op = RISCV::VFMUL_VV;
 2780 break;
 2781 case ISD::FDIV:
 2782 Op = RISCV::VFDIV_VV;
 2783 break;
 2784 case ISD::FNEG:
 2785 Op = RISCV::VFSGNJN_VV;
 2786 break;
 2787 default:
 2788 // Assuming all other instructions have the same cost until a need arises to
 2789 // differentiate them.
 2790 return CastCost + ConstantMatCost +
 2791 BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
 2792 Args, CxtI);
 2793 }
 2794
 2795 InstructionCost InstrCost = getRISCVInstructionCost(Op, LT.second, CostKind);
 2796 // We use BasicTTIImpl to calculate scalar costs, which assumes floating point
 2797 // ops are twice as expensive as integer ops. Do the same for vectors so
 2798 // scalar floating point ops aren't cheaper than their vector equivalents.
 2799 if (Ty->isFPOrFPVectorTy())
 2800 InstrCost *= 2;
 2801 return CastCost + ConstantMatCost + LT.first * InstrCost;
 2802}
2803
 2804// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
// RISCVTTIImpl::getPointersChainCost — cost of a chain of related pointers.
// NOTE(review): extraction dropped the signature opener (doxygen 2805) and
// lines 2808-2809, which presumably include the CostKind parameter and the
// declaration/initialization of the `Cost` accumulator used below — confirm
// against upstream before editing.
 2806 ArrayRef<const Value *> Ptrs, const Value *Base,
 2807 const TTI::PointersChainInfo &Info, Type *AccessTy,
 2810 // In the basic model we take into account GEP instructions only
 2811 // (although here can come alloca instruction, a value, constants and/or
 2812 // constant expressions, PHIs, bitcasts ... whatever allowed to be used as a
 2813 // pointer). Typically, if Base is a not a GEP-instruction and all the
 2814 // pointers are relative to the same base address, all the rest are
 2815 // either GEP instructions, PHIs, bitcasts or constants. When we have same
 2816 // base, we just calculate cost of each non-Base GEP as an ADD operation if
 2817 // any their index is a non-const.
 2818 // If no known dependencies between the pointers cost is calculated as a sum
 2819 // of costs of GEP instructions.
 2820 for (auto [I, V] : enumerate(Ptrs)) {
 2821 const auto *GEP = dyn_cast<GetElementPtrInst>(V);
 2822 if (!GEP)
 2823 continue;
 2824 if (Info.isSameBase() && V != Base) {
 2825 if (GEP->hasAllConstantIndices())
 2826 continue;
 2827 // If the chain is unit-stride and BaseReg + stride*i is a legal
 2828 // addressing mode, then presume the base GEP is sitting around in a
 2829 // register somewhere and check if we can fold the offset relative to
 2830 // it.
 2831 unsigned Stride = DL.getTypeStoreSize(AccessTy);
 2832 if (Info.isUnitStride() &&
 2833 isLegalAddressingMode(AccessTy,
 2834 /* BaseGV */ nullptr,
 2835 /* BaseOffset */ Stride * I,
 2836 /* HasBaseReg */ true,
 2837 /* Scale */ 0,
 2838 GEP->getType()->getPointerAddressSpace()))
 2839 continue;
// Non-foldable same-base GEP: model it as a single scalar ADD.
 2840 Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
 2841 {TTI::OK_AnyValue, TTI::OP_None},
 2842 {TTI::OK_AnyValue, TTI::OP_None}, {});
 2843 } else {
 2844 SmallVector<const Value *> Indices(GEP->indices());
 2845 Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
 2846 Indices, AccessTy, CostKind);
 2847 }
 2848 }
 2849 return Cost;
 2850}
2851
// RISCVTTIImpl::getUnrollingPreferences — tune loop unrolling for RISC-V.
// Defers to the base implementation when the subtarget enables default
// unrolling; otherwise enables upper-bound/partial/runtime unrolling for
// small, call-free, non-vectorized loops.
// NOTE(review): extraction dropped the signature opener (doxygen 2852-2853)
// and lines 2868, 2892, 2913-2914 — line 2892 presumably declared the `Cost`
// accumulator summed below; confirm against upstream before editing.
 2854 OptimizationRemarkEmitter *ORE) const {
 2855 // TODO: More tuning on benchmarks and metrics with changes as needed
 2856 // would apply to all settings below to enable performance.
 2857
 2858
 2859 if (ST->enableDefaultUnroll())
 2860 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
 2861
 2862 // Enable Upper bound unrolling universally, not dependent upon the conditions
 2863 // below.
 2864 UP.UpperBound = true;
 2865
 2866 // Disable loop unrolling for Oz and Os.
 2867 UP.OptSizeThreshold = 0;
 2869 if (L->getHeader()->getParent()->hasOptSize())
 2870 return;
 2871
 2872 SmallVector<BasicBlock *, 4> ExitingBlocks;
 2873 L->getExitingBlocks(ExitingBlocks);
 2874 LLVM_DEBUG(dbgs() << "Loop has:\n"
 2875 << "Blocks: " << L->getNumBlocks() << "\n"
 2876 << "Exit blocks: " << ExitingBlocks.size() << "\n");
 2877
 2878 // Only allow another exit other than the latch. This acts as an early exit
 2879 // as it mirrors the profitability calculation of the runtime unroller.
 2880 if (ExitingBlocks.size() > 2)
 2881 return;
 2882
 2883 // Limit the CFG of the loop body for targets with a branch predictor.
 2884 // Allowing 4 blocks permits if-then-else diamonds in the body.
 2885 if (L->getNumBlocks() > 4)
 2886 return;
 2887
 2888 // Scan the loop: don't unroll loops with calls as this could prevent
 2889 // inlining. Don't unroll auto-vectorized loops either, though do allow
 2890 // unrolling of the scalar remainder.
 2891 bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
 2893 for (auto *BB : L->getBlocks()) {
 2894 for (auto &I : *BB) {
 2895 // Both auto-vectorized loops and the scalar remainder have the
 2896 // isvectorized attribute, so differentiate between them by the presence
 2897 // of vector instructions.
 2898 if (IsVectorized && (I.getType()->isVectorTy() ||
 2899 llvm::any_of(I.operand_values(), [](Value *V) {
 2900 return V->getType()->isVectorTy();
 2901 })))
 2902 return;
 2903
 2904 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
 2905 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
 2906 if (!isLoweredToCall(F))
 2907 continue;
 2908 }
 2909 return;
 2910 }
 2911
 2912 SmallVector<const Value *> Operands(I.operand_values());
 2913 Cost += getInstructionCost(&I, Operands,
 2915 }
 2916 }
 2917
 2918 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
 2919
 2920 UP.Partial = true;
 2921 UP.Runtime = true;
 2922 UP.UnrollRemainder = true;
 2923 UP.UnrollAndJam = true;
 2924
 2925 // Force unrolling small loops can be very useful because of the branch
 2926 // taken cost of the backedge.
 2927 if (Cost < 12)
 2928 UP.Force = true;
 2929}
2930
2935
// RISCVTTIImpl::getTgtMemIntrinsic — describe RVV load/store intrinsics
// (unit-stride, strided, and indexed, plus their segment/masked variants) for
// memory-instrumentation clients. Records the pointer operand, access type,
// alignment, mask, and EVL in Info.InterestingOperands; returns true when the
// intrinsic was recognized.
// NOTE(review): extraction dropped the signature opener (doxygen 2936) and the
// `Ty = VectorType::get(` openers on doxygen lines 3004, 3080 and 3199 —
// confirm against upstream before editing.
 2937 MemIntrinsicInfo &Info) const {
 2938 const DataLayout &DL = getDataLayout();
 2939 Intrinsic::ID IID = Inst->getIntrinsicID();
 2940 LLVMContext &C = Inst->getContext();
 2941 bool HasMask = false;
 2942
// Segment count: segment intrinsics carry it as the integer parameter of the
// riscv.vector.tuple TargetExtType on operand 0; non-segment ops count as 1.
 2943 auto getSegNum = [](const IntrinsicInst *II, unsigned PtrOperandNo,
 2944 bool IsWrite) -> int64_t {
 2945 if (auto *TarExtTy =
 2946 dyn_cast<TargetExtType>(II->getArgOperand(0)->getType()))
 2947 return TarExtTy->getIntParameter(0);
 2948
 2949 return 1;
 2950 };
 2951
 2952 switch (IID) {
 2953 case Intrinsic::riscv_vle_mask:
 2954 case Intrinsic::riscv_vse_mask:
 2955 case Intrinsic::riscv_vlseg2_mask:
 2956 case Intrinsic::riscv_vlseg3_mask:
 2957 case Intrinsic::riscv_vlseg4_mask:
 2958 case Intrinsic::riscv_vlseg5_mask:
 2959 case Intrinsic::riscv_vlseg6_mask:
 2960 case Intrinsic::riscv_vlseg7_mask:
 2961 case Intrinsic::riscv_vlseg8_mask:
 2962 case Intrinsic::riscv_vsseg2_mask:
 2963 case Intrinsic::riscv_vsseg3_mask:
 2964 case Intrinsic::riscv_vsseg4_mask:
 2965 case Intrinsic::riscv_vsseg5_mask:
 2966 case Intrinsic::riscv_vsseg6_mask:
 2967 case Intrinsic::riscv_vsseg7_mask:
 2968 case Intrinsic::riscv_vsseg8_mask:
 2969 HasMask = true;
 2970 [[fallthrough]];
 2971 case Intrinsic::riscv_vle:
 2972 case Intrinsic::riscv_vse:
 2973 case Intrinsic::riscv_vlseg2:
 2974 case Intrinsic::riscv_vlseg3:
 2975 case Intrinsic::riscv_vlseg4:
 2976 case Intrinsic::riscv_vlseg5:
 2977 case Intrinsic::riscv_vlseg6:
 2978 case Intrinsic::riscv_vlseg7:
 2979 case Intrinsic::riscv_vlseg8:
 2980 case Intrinsic::riscv_vsseg2:
 2981 case Intrinsic::riscv_vsseg3:
 2982 case Intrinsic::riscv_vsseg4:
 2983 case Intrinsic::riscv_vsseg5:
 2984 case Intrinsic::riscv_vsseg6:
 2985 case Intrinsic::riscv_vsseg7:
 2986 case Intrinsic::riscv_vsseg8: {
 2987 // Intrinsic interface:
 2988 // riscv_vle(merge, ptr, vl)
 2989 // riscv_vle_mask(merge, ptr, mask, vl, policy)
 2990 // riscv_vse(val, ptr, vl)
 2991 // riscv_vse_mask(val, ptr, mask, vl, policy)
 2992 // riscv_vlseg#(merge, ptr, vl, sew)
 2993 // riscv_vlseg#_mask(merge, ptr, mask, vl, policy, sew)
 2994 // riscv_vsseg#(val, ptr, vl, sew)
 2995 // riscv_vsseg#_mask(val, ptr, mask, vl, sew)
 2996 bool IsWrite = Inst->getType()->isVoidTy();
 2997 Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
 2998 // The results of segment loads are TargetExtType.
 2999 if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
 3000 unsigned SEW =
 3001 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
 3002 ->getZExtValue();
 3003 Ty = TarExtTy->getTypeParameter(0U);
 3005 IntegerType::get(C, SEW),
 3006 cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
 3007 }
// VL operand position comes from the intrinsics table; the pointer sits
// immediately before it (and before the mask, if present).
 3008 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
 3009 unsigned VLIndex = RVVIInfo->VLOperand;
 3010 unsigned PtrOperandNo = VLIndex - 1 - HasMask;
 3011 MaybeAlign Alignment =
 3012 Inst->getArgOperand(PtrOperandNo)->getPointerAlignment(DL);
 3013 Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
 3014 Value *Mask = ConstantInt::getTrue(MaskType);
 3015 if (HasMask)
 3016 Mask = Inst->getArgOperand(VLIndex - 1);
 3017 Value *EVL = Inst->getArgOperand(VLIndex);
 3018 unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
 3019 // RVV uses contiguous elements as a segment.
 3020 if (SegNum > 1) {
 3021 unsigned ElemSize = Ty->getScalarSizeInBits();
 3022 auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
 3023 Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
 3024 }
 3025 Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
 3026 Alignment, Mask, EVL);
 3027 return true;
 3028 }
 3029 case Intrinsic::riscv_vlse_mask:
 3030 case Intrinsic::riscv_vsse_mask:
 3031 case Intrinsic::riscv_vlsseg2_mask:
 3032 case Intrinsic::riscv_vlsseg3_mask:
 3033 case Intrinsic::riscv_vlsseg4_mask:
 3034 case Intrinsic::riscv_vlsseg5_mask:
 3035 case Intrinsic::riscv_vlsseg6_mask:
 3036 case Intrinsic::riscv_vlsseg7_mask:
 3037 case Intrinsic::riscv_vlsseg8_mask:
 3038 case Intrinsic::riscv_vssseg2_mask:
 3039 case Intrinsic::riscv_vssseg3_mask:
 3040 case Intrinsic::riscv_vssseg4_mask:
 3041 case Intrinsic::riscv_vssseg5_mask:
 3042 case Intrinsic::riscv_vssseg6_mask:
 3043 case Intrinsic::riscv_vssseg7_mask:
 3044 case Intrinsic::riscv_vssseg8_mask:
 3045 HasMask = true;
 3046 [[fallthrough]];
 3047 case Intrinsic::riscv_vlse:
 3048 case Intrinsic::riscv_vsse:
 3049 case Intrinsic::riscv_vlsseg2:
 3050 case Intrinsic::riscv_vlsseg3:
 3051 case Intrinsic::riscv_vlsseg4:
 3052 case Intrinsic::riscv_vlsseg5:
 3053 case Intrinsic::riscv_vlsseg6:
 3054 case Intrinsic::riscv_vlsseg7:
 3055 case Intrinsic::riscv_vlsseg8:
 3056 case Intrinsic::riscv_vssseg2:
 3057 case Intrinsic::riscv_vssseg3:
 3058 case Intrinsic::riscv_vssseg4:
 3059 case Intrinsic::riscv_vssseg5:
 3060 case Intrinsic::riscv_vssseg6:
 3061 case Intrinsic::riscv_vssseg7:
 3062 case Intrinsic::riscv_vssseg8: {
 3063 // Intrinsic interface:
 3064 // riscv_vlse(merge, ptr, stride, vl)
 3065 // riscv_vlse_mask(merge, ptr, stride, mask, vl, policy)
 3066 // riscv_vsse(val, ptr, stride, vl)
 3067 // riscv_vsse_mask(val, ptr, stride, mask, vl, policy)
 3068 // riscv_vlsseg#(merge, ptr, offset, vl, sew)
 3069 // riscv_vlsseg#_mask(merge, ptr, offset, mask, vl, policy, sew)
 3070 // riscv_vssseg#(val, ptr, offset, vl, sew)
 3071 // riscv_vssseg#_mask(val, ptr, offset, mask, vl, sew)
 3072 bool IsWrite = Inst->getType()->isVoidTy();
 3073 Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
 3074 // The results of segment loads are TargetExtType.
 3075 if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
 3076 unsigned SEW =
 3077 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
 3078 ->getZExtValue();
 3079 Ty = TarExtTy->getTypeParameter(0U);
 3081 IntegerType::get(C, SEW),
 3082 cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
 3083 }
 3084 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
 3085 unsigned VLIndex = RVVIInfo->VLOperand;
 3086 unsigned PtrOperandNo = VLIndex - 2 - HasMask;
 3087 MaybeAlign Alignment =
 3088 Inst->getArgOperand(PtrOperandNo)->getPointerAlignment(DL);
 3089
 3090 Value *Stride = Inst->getArgOperand(PtrOperandNo + 1);
 3091 // Use the pointer alignment as the element alignment if the stride is a
 3092 // multiple of the pointer alignment. Otherwise, the element alignment
 3093 // should be the greatest common divisor of pointer alignment and stride.
 3094 // For simplicity, just consider unalignment for elements.
 3095 unsigned PointerAlign = Alignment.valueOrOne().value();
 3096 if (!isa<ConstantInt>(Stride) ||
 3097 cast<ConstantInt>(Stride)->getZExtValue() % PointerAlign != 0)
 3098 Alignment = Align(1);
 3099
 3100 Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
 3101 Value *Mask = ConstantInt::getTrue(MaskType);
 3102 if (HasMask)
 3103 Mask = Inst->getArgOperand(VLIndex - 1);
 3104 Value *EVL = Inst->getArgOperand(VLIndex);
 3105 unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
 3106 // RVV uses contiguous elements as a segment.
 3107 if (SegNum > 1) {
 3108 unsigned ElemSize = Ty->getScalarSizeInBits();
 3109 auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
 3110 Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
 3111 }
 3112 Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
 3113 Alignment, Mask, EVL, Stride);
 3114 return true;
 3115 }
 3116 case Intrinsic::riscv_vloxei_mask:
 3117 case Intrinsic::riscv_vluxei_mask:
 3118 case Intrinsic::riscv_vsoxei_mask:
 3119 case Intrinsic::riscv_vsuxei_mask:
 3120 case Intrinsic::riscv_vloxseg2_mask:
 3121 case Intrinsic::riscv_vloxseg3_mask:
 3122 case Intrinsic::riscv_vloxseg4_mask:
 3123 case Intrinsic::riscv_vloxseg5_mask:
 3124 case Intrinsic::riscv_vloxseg6_mask:
 3125 case Intrinsic::riscv_vloxseg7_mask:
 3126 case Intrinsic::riscv_vloxseg8_mask:
 3127 case Intrinsic::riscv_vluxseg2_mask:
 3128 case Intrinsic::riscv_vluxseg3_mask:
 3129 case Intrinsic::riscv_vluxseg4_mask:
 3130 case Intrinsic::riscv_vluxseg5_mask:
 3131 case Intrinsic::riscv_vluxseg6_mask:
 3132 case Intrinsic::riscv_vluxseg7_mask:
 3133 case Intrinsic::riscv_vluxseg8_mask:
 3134 case Intrinsic::riscv_vsoxseg2_mask:
 3135 case Intrinsic::riscv_vsoxseg3_mask:
 3136 case Intrinsic::riscv_vsoxseg4_mask:
 3137 case Intrinsic::riscv_vsoxseg5_mask:
 3138 case Intrinsic::riscv_vsoxseg6_mask:
 3139 case Intrinsic::riscv_vsoxseg7_mask:
 3140 case Intrinsic::riscv_vsoxseg8_mask:
 3141 case Intrinsic::riscv_vsuxseg2_mask:
 3142 case Intrinsic::riscv_vsuxseg3_mask:
 3143 case Intrinsic::riscv_vsuxseg4_mask:
 3144 case Intrinsic::riscv_vsuxseg5_mask:
 3145 case Intrinsic::riscv_vsuxseg6_mask:
 3146 case Intrinsic::riscv_vsuxseg7_mask:
 3147 case Intrinsic::riscv_vsuxseg8_mask:
 3148 HasMask = true;
 3149 [[fallthrough]];
 3150 case Intrinsic::riscv_vloxei:
 3151 case Intrinsic::riscv_vluxei:
 3152 case Intrinsic::riscv_vsoxei:
 3153 case Intrinsic::riscv_vsuxei:
 3154 case Intrinsic::riscv_vloxseg2:
 3155 case Intrinsic::riscv_vloxseg3:
 3156 case Intrinsic::riscv_vloxseg4:
 3157 case Intrinsic::riscv_vloxseg5:
 3158 case Intrinsic::riscv_vloxseg6:
 3159 case Intrinsic::riscv_vloxseg7:
 3160 case Intrinsic::riscv_vloxseg8:
 3161 case Intrinsic::riscv_vluxseg2:
 3162 case Intrinsic::riscv_vluxseg3:
 3163 case Intrinsic::riscv_vluxseg4:
 3164 case Intrinsic::riscv_vluxseg5:
 3165 case Intrinsic::riscv_vluxseg6:
 3166 case Intrinsic::riscv_vluxseg7:
 3167 case Intrinsic::riscv_vluxseg8:
 3168 case Intrinsic::riscv_vsoxseg2:
 3169 case Intrinsic::riscv_vsoxseg3:
 3170 case Intrinsic::riscv_vsoxseg4:
 3171 case Intrinsic::riscv_vsoxseg5:
 3172 case Intrinsic::riscv_vsoxseg6:
 3173 case Intrinsic::riscv_vsoxseg7:
 3174 case Intrinsic::riscv_vsoxseg8:
 3175 case Intrinsic::riscv_vsuxseg2:
 3176 case Intrinsic::riscv_vsuxseg3:
 3177 case Intrinsic::riscv_vsuxseg4:
 3178 case Intrinsic::riscv_vsuxseg5:
 3179 case Intrinsic::riscv_vsuxseg6:
 3180 case Intrinsic::riscv_vsuxseg7:
 3181 case Intrinsic::riscv_vsuxseg8: {
 3182 // Intrinsic interface (only listed ordered version):
 3183 // riscv_vloxei(merge, ptr, index, vl)
 3184 // riscv_vloxei_mask(merge, ptr, index, mask, vl, policy)
 3185 // riscv_vsoxei(val, ptr, index, vl)
 3186 // riscv_vsoxei_mask(val, ptr, index, mask, vl, policy)
 3187 // riscv_vloxseg#(merge, ptr, index, vl, sew)
 3188 // riscv_vloxseg#_mask(merge, ptr, index, mask, vl, policy, sew)
 3189 // riscv_vsoxseg#(val, ptr, index, vl, sew)
 3190 // riscv_vsoxseg#_mask(val, ptr, index, mask, vl, sew)
 3191 bool IsWrite = Inst->getType()->isVoidTy();
 3192 Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
 3193 // The results of segment loads are TargetExtType.
 3194 if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
 3195 unsigned SEW =
 3196 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
 3197 ->getZExtValue();
 3198 Ty = TarExtTy->getTypeParameter(0U);
 3200 IntegerType::get(C, SEW),
 3201 cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
 3202 }
 3203 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
 3204 unsigned VLIndex = RVVIInfo->VLOperand;
 3205 unsigned PtrOperandNo = VLIndex - 2 - HasMask;
 3206 Value *Mask;
 3207 if (HasMask) {
 3208 Mask = Inst->getArgOperand(VLIndex - 1);
 3209 } else {
 3210 // Mask cannot be nullptr here: vector GEP produces <vscale x N x ptr>,
 3211 // and casting that to scalar i64 triggers a vector/scalar mismatch
 3212 // assertion in CreatePointerCast. Use an all-true mask so ASan lowers it
 3213 // via extractelement instead.
 3214 Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
 3215 Mask = ConstantInt::getTrue(MaskType);
 3216 }
 3217 Value *EVL = Inst->getArgOperand(VLIndex);
 3218 unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
 3219 // RVV uses contiguous elements as a segment.
 3220 if (SegNum > 1) {
 3221 unsigned ElemSize = Ty->getScalarSizeInBits();
 3222 auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
 3223 Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
 3224 }
 3225 Value *OffsetOp = Inst->getArgOperand(PtrOperandNo + 1);
 3226 Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
 3227 Align(1), Mask, EVL,
 3228 /* Stride */ nullptr, OffsetOp);
 3229 return true;
 3230 }
 3231 }
 3232 return false;
 3233}
3234
// RISCVTTIImpl::getRegUsageForType — number of vector register (LMUL) blocks
// a value of type Ty occupies; falls back to the base implementation for
// non-vector types.
// NOTE(review): the signature opener (doxygen 3235) was lost in extraction.
 3236 if (Ty->isVectorTy()) {
 3237 // f16 with only zvfhmin and bf16 will be promoted to f32
 3238 Type *EltTy = cast<VectorType>(Ty)->getElementType();
 3239 if ((EltTy->isHalfTy() && !ST->hasVInstructionsF16()) ||
 3240 EltTy->isBFloatTy())
 3241 Ty = VectorType::get(Type::getFloatTy(Ty->getContext()),
 3242 cast<VectorType>(Ty));
 3243
 3244 TypeSize Size = DL.getTypeSizeInBits(Ty);
// Scalable vectors: one register per RVVBitsPerBlock chunk (rounded up).
 3245 if (Size.isScalable() && ST->hasVInstructions())
 3246 return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);
 3247
 3248 if (ST->useRVVForFixedLengthVectors())
 3249 return divideCeil(Size, ST->getRealMinVLen());
 3250 }
 3251
 3252 return BaseT::getRegUsageForType(Ty);
 3253}
3254
// Maximum vectorization factor for the SLP vectorizer: either the value of
// the -riscv-v-slp-max-vf override, or how many ElemWidth-bit lanes fit in a
// vector register.
unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
 3256 if (SLPMaxVF.getNumOccurrences())
 3257 return SLPMaxVF;
 3258
 3259 // Return how many elements can fit in getRegisterBitwidth. This is the
 3260 // same routine as used in LoopVectorizer. We should probably be
 3261 // accounting for whether we actually have instructions with the right
 3262 // lane type, but we don't have enough information to do that without
 3263 // some additional plumbing which hasn't been justified yet.
// NOTE(review): doxygen line 3265 (the getRegisterBitWidth call initializing
// RegWidth) was lost in extraction — confirm against upstream.
 3264 TypeSize RegWidth =
 3266 // If no vector registers, or absurd element widths, disable
 3267 // vectorization by returning 1.
 3268 return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
 3269}
3270
 3274
// NOTE(review): extraction dropped this predicate's signature (doxygen 3275);
// only the body survives. It returns whether the subtarget permits unaligned
// vector memory access — identify the member name from the upstream file.
 3276 return ST->enableUnalignedVectorMem();
 3277}
3278
// RISCVTTIImpl::getPreferredAddressingMode — prefer post-indexed addressing
// on rv32 cores with the XCVmem vendor extension; otherwise fall through to
// the default (line with the base-class return, doxygen 3285, was lost in
// extraction along with the signature opener on 3279-3280).
 3281 ScalarEvolution *SE) const {
 3282 if (ST->hasVendorXCVmem() && !ST->is64Bit())
 3283 return TTI::AMK_PostIndexed;
 3284
 3286}
3287
// RISCVTTIImpl::isLSRCostLess — LSR cost comparison, instruction count first.
// NOTE(review): the signature opener (doxygen 3288) was lost in extraction.
 3289 const TargetTransformInfo::LSRCost &C2) const {
 3290 // RISC-V specific here are "instruction number 1st priority".
 3291 // If we need to emit adds inside the loop to add up base registers, then
 3292 // we need at least one extra temporary register.
 3293 unsigned C1NumRegs = C1.NumRegs + (C1.NumBaseAdds != 0);
 3294 unsigned C2NumRegs = C2.NumRegs + (C2.NumBaseAdds != 0);
// Lexicographic comparison: Insns dominates, then (adjusted) register count,
// then the remaining cost components.
 3295 return std::tie(C1.Insns, C1NumRegs, C1.AddRecCost,
 3296 C1.NumIVMuls, C1.NumBaseAdds,
 3297 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
 3298 std::tie(C2.Insns, C2NumRegs, C2.AddRecCost,
 3299 C2.NumIVMuls, C2.NumBaseAdds,
 3300 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
3302
// Legality predicate for a masked vector memory operation on fixed-length
// vectors only (scalable types are rejected). NOTE(review): the signature
// opener (doxygen 3303) was lost in extraction — identify which isLegal*
// hook this is from the upstream file.
 3304 Align Alignment) const {
 3305 auto *VTy = dyn_cast<VectorType>(DataTy);
 3306 if (!VTy || VTy->isScalableTy())
 3307 return false;
 3308
 3309 if (!isLegalMaskedLoadStore(DataTy, Alignment))
 3310 return false;
 3311
 3312 // FIXME: If it is an i8 vector and the element count exceeds 256, we should
 3313 // scalarize these types with LMUL >= maximum fixed-length LMUL.
 3314 if (VTy->getElementType()->isIntegerTy(8))
 3315 if (VTy->getElementCount().getFixedValue() > 256)
 3316 return VTy->getPrimitiveSizeInBits() / ST->getRealMinVLen() <
 3317 ST->getMaxLMULForFixedLengthVectors();
 3318 return true;
 3319}
3320
// Companion legality predicate: fixed-length vector type with a legal masked
// load/store. NOTE(review): the signature opener (doxygen 3321) was lost in
// extraction — identify which isLegal* hook this is from the upstream file.
 3322 Align Alignment) const {
 3323 auto *VTy = dyn_cast<VectorType>(DataTy);
 3324 if (!VTy || VTy->isScalableTy())
 3325 return false;
 3326
 3327 if (!isLegalMaskedLoadStore(DataTy, Alignment))
 3328 return false;
 3329 return true;
 3330}
3331
/// See if \p I should be considered for address type promotion. We check if \p
/// I is a sext with right type and used in memory accesses. If it used in a
/// "complex" getelementptr, we allow it to be promoted without finding other
/// sext instructions that sign extended the same initial value. A getelementptr
/// is considered as "complex" if it has more than 2 operands.
// NOTE(review): the signature opener (doxygen 3337) was lost in extraction.
 3338 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
 3339 bool Considerable = false;
 3340 AllowPromotionWithoutCommonHeader = false;
 3341 if (!isa<SExtInst>(&I))
 3342 return false;
// Only sext-to-i64 is interesting for address computation on RV64.
 3343 Type *ConsideredSExtType =
 3344 Type::getInt64Ty(I.getParent()->getParent()->getContext());
 3345 if (I.getType() != ConsideredSExtType)
 3346 return false;
 3347 // See if the sext is the one with the right type and used in at least one
 3348 // GetElementPtrInst.
 3349 for (const User *U : I.users()) {
 3350 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
 3351 Considerable = true;
 3352 // A getelementptr is considered as "complex" if it has more than 2
 3353 // operands. We will promote a SExt used in such complex GEP as we
 3354 // expect some computation to be merged if they are done on 64 bits.
 3355 if (GEPInst->getNumOperands() > 2) {
 3356 AllowPromotionWithoutCommonHeader = true;
 3357 break;
 3358 }
 3359 }
 3360 }
 3361 return Considerable;
 3362}
3363
3364bool RISCVTTIImpl::canSplatOperand(unsigned Opcode, int Operand) const {
3365 switch (Opcode) {
3366 case Instruction::Add:
3367 case Instruction::Sub:
3368 case Instruction::Mul:
3369 case Instruction::And:
3370 case Instruction::Or:
3371 case Instruction::Xor:
3372 case Instruction::FAdd:
3373 case Instruction::FSub:
3374 case Instruction::FMul:
3375 case Instruction::FDiv:
3376 case Instruction::ICmp:
3377 case Instruction::FCmp:
3378 return true;
3379 case Instruction::Shl:
3380 case Instruction::LShr:
3381 case Instruction::AShr:
3382 case Instruction::UDiv:
3383 case Instruction::SDiv:
3384 case Instruction::URem:
3385 case Instruction::SRem:
3386 case Instruction::Select:
3387 return Operand == 1;
3388 default:
3389 return false;
3390 }
3391}
3392
// Instruction-based canSplatOperand: defers to the opcode overload for plain
// instructions, then handles VP/saturating/min-max intrinsics by operand
// position. NOTE(review): the signature opener (doxygen 3393) was lost in
// extraction.
 3394 if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
 3395 return false;
 3396
 3397 if (canSplatOperand(I->getOpcode(), Operand))
 3398 return true;
 3399
 3400 auto *II = dyn_cast<IntrinsicInst>(I);
 3401 if (!II)
 3402 return false;
 3403
 3404 switch (II->getIntrinsicID()) {
 3405 case Intrinsic::fma:
 3406 case Intrinsic::vp_fma:
 3407 case Intrinsic::fmuladd:
 3408 case Intrinsic::vp_fmuladd:
 3409 return Operand == 0 || Operand == 1;
 3410 case Intrinsic::vp_shl:
 3411 case Intrinsic::vp_lshr:
 3412 case Intrinsic::vp_ashr:
 3413 case Intrinsic::vp_udiv:
 3414 case Intrinsic::vp_sdiv:
 3415 case Intrinsic::vp_urem:
 3416 case Intrinsic::vp_srem:
 3417 case Intrinsic::ssub_sat:
 3418 case Intrinsic::vp_ssub_sat:
 3419 case Intrinsic::usub_sat:
 3420 case Intrinsic::vp_usub_sat:
 3421 case Intrinsic::vp_select:
 3422 return Operand == 1;
 3423 // These intrinsics are commutative.
 3424 case Intrinsic::vp_add:
 3425 case Intrinsic::vp_mul:
 3426 case Intrinsic::vp_and:
 3427 case Intrinsic::vp_or:
 3428 case Intrinsic::vp_xor:
 3429 case Intrinsic::vp_fadd:
 3430 case Intrinsic::vp_fmul:
 3431 case Intrinsic::vp_icmp:
 3432 case Intrinsic::vp_fcmp:
 3433 case Intrinsic::smin:
 3434 case Intrinsic::vp_smin:
 3435 case Intrinsic::umin:
 3436 case Intrinsic::vp_umin:
 3437 case Intrinsic::smax:
 3438 case Intrinsic::vp_smax:
 3439 case Intrinsic::umax:
 3440 case Intrinsic::vp_umax:
 3441 case Intrinsic::sadd_sat:
 3442 case Intrinsic::vp_sadd_sat:
 3443 case Intrinsic::uadd_sat:
 3444 case Intrinsic::vp_uadd_sat:
 3445 // These intrinsics have 'vr' versions.
 3446 case Intrinsic::vp_sub:
 3447 case Intrinsic::vp_fsub:
 3448 case Intrinsic::vp_fdiv:
 3449 return Operand == 0 || Operand == 1;
 3450 default:
 3451 return false;
 3452 }
 3453}
3454
/// Check if sinking \p I's operands to I's basic block is profitable, because
/// the operands can be folded into a target instruction, e.g.
/// splats of scalars can fold into vector instructions.
// NOTE(review): extraction dropped the signature opener (doxygen 3458-3459)
// and the pattern-match openers on doxygen lines 3481 and 3516 (the
// m_Shuffle/m_InsertElt matches whose argument tails appear below) — confirm
// against upstream before editing.
 3460 using namespace llvm::PatternMatch;
 3461
 3462 if (I->isBitwiseLogicOp()) {
 3463 if (!I->getType()->isVectorTy()) {
 3464 if (ST->hasStdExtZbb() || ST->hasStdExtZbkb()) {
 3465 for (auto &Op : I->operands()) {
 3466 // (and/or/xor X, (not Y)) -> (andn/orn/xnor X, Y)
 3467 if (match(Op.get(), m_Not(m_Value()))) {
 3468 Ops.push_back(&Op);
 3469 return true;
 3470 }
 3471 }
 3472 }
 3473 } else if (I->getOpcode() == Instruction::And && ST->hasStdExtZvkb()) {
 3474 for (auto &Op : I->operands()) {
 3475 // (and X, (not Y)) -> (vandn.vv X, Y)
 3476 if (match(Op.get(), m_Not(m_Value()))) {
 3477 Ops.push_back(&Op);
 3478 return true;
 3479 }
 3480 // (and X, (splat (not Y))) -> (vandn.vx X, Y)
 3482 m_ZeroInt()),
 3483 m_Value(), m_ZeroMask()))) {
 3484 Use &InsertElt = cast<Instruction>(Op)->getOperandUse(0);
 3485 Use &Not = cast<Instruction>(InsertElt)->getOperandUse(1);
// Sink the not, the insertelement and the shuffle so ISel can form vandn.vx.
 3486 Ops.push_back(&Not);
 3487 Ops.push_back(&InsertElt);
 3488 Ops.push_back(&Op);
 3489 return true;
 3490 }
 3491 }
 3492 }
 3493 }
 3494
 3495 if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
 3496 return false;
 3497
 3498 // Don't sink splat operands if the target prefers it. Some targets requires
 3499 // S2V transfer buffers and we can run out of them copying the same value
 3500 // repeatedly.
 3501 // FIXME: It could still be worth doing if it would improve vector register
 3502 // pressure and prevent a vector spill.
 3503 if (!ST->sinkSplatOperands())
 3504 return false;
 3505
 3506 for (auto OpIdx : enumerate(I->operands())) {
 3507 if (!canSplatOperand(I, OpIdx.index()))
 3508 continue;
 3509
 3510 Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
 3511 // Make sure we are not already sinking this operand
 3512 if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
 3513 continue;
 3514
 3515 // We are looking for a splat that can be sunk.
 3517 m_Value(), m_ZeroMask())))
 3518 continue;
 3519
 3520 // Don't sink i1 splats.
 3521 if (cast<VectorType>(Op->getType())->getElementType()->isIntegerTy(1))
 3522 continue;
 3523
 3524 // All uses of the shuffle should be sunk to avoid duplicating it across gpr
 3525 // and vector registers
 3526 for (Use &U : Op->uses()) {
 3527 Instruction *Insn = cast<Instruction>(U.getUser());
 3528 if (!canSplatOperand(Insn, U.getOperandNo()))
 3529 return false;
 3530 }
 3531
 3532 // Sink any fpexts since they might be used in a widening fp pattern.
 3533 Use *InsertEltUse = &Op->getOperandUse(0);
 3534 auto *InsertElt = cast<InsertElementInst>(InsertEltUse);
 3535 if (isa<FPExtInst>(InsertElt->getOperand(1)))
 3536 Ops.push_back(&InsertElt->getOperandUse(1));
 3537 Ops.push_back(InsertEltUse);
 3538 Ops.push_back(&OpIdx.value());
 3539 }
 3540 return true;
 3541}
3542
// RISCVTTIImpl::enableMemCmpExpansion — configure inline memcmp expansion.
// Requires unaligned scalar access; equality comparisons may additionally use
// whole-register vector loads. NOTE(review): extraction dropped the return
// type on doxygen 3543 and the local `Options` declaration on 3545.
RISCVTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
 3546 // TODO: Enable expansion when unaligned access is not supported after we fix
 3547 // issues in ExpandMemcmp.
 3548 if (!ST->enableUnalignedScalarMem())
 3549 return Options;
 3550
 3551 if (!ST->hasStdExtZbb() && !ST->hasStdExtZbkb() && !IsZeroCmp)
 3552 return Options;
 3553
 3554 Options.AllowOverlappingLoads = true;
 3555 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
 3556 Options.NumLoadsPerBlock = Options.MaxNumLoads;
 3557 if (ST->is64Bit()) {
 3558 Options.LoadSizes = {8, 4, 2, 1};
 3559 Options.AllowedTailExpansions = {3, 5, 6};
 3560 } else {
 3561 Options.LoadSizes = {4, 2, 1};
 3562 Options.AllowedTailExpansions = {3};
 3563 }
 3564
 3565 if (IsZeroCmp && ST->hasVInstructions()) {
 3566 unsigned VLenB = ST->getRealMinVLen() / 8;
 3567 // The minimum size should be `XLen / 8 + 1`, and the maxinum size should be
 3568 // `VLenB * MaxLMUL` so that it fits in a single register group.
 3569 unsigned MinSize = ST->getXLen() / 8 + 1;
 3570 unsigned MaxSize = VLenB * ST->getMaxLMULForFixedLengthVectors();
// Prepend vector sizes so the expansion prefers the largest loads first.
 3571 for (unsigned Size = MinSize; Size <= MaxSize; Size++)
 3572 Options.LoadSizes.insert(Options.LoadSizes.begin(), Size);
 3573 }
 3574 return Options;
 3575}
3576
// Select-like treatment for or/add/sub instructions (guarded by the
// EnableOrLikeSelectOpt-style condition on the dropped doxygen line 3579).
// NOTE(review): the signature opener (3577), the enclosing if-condition
// (3579), and the final return (3593) were lost in extraction — confirm
// against upstream before editing.
 3578 const Instruction *I) const {
 3580 // For the binary operators (e.g. or) we need to be more careful than
 3581 // selects, here we only transform them if they are already at a natural
 3582 // break point in the code - the end of a block with an unconditional
 3583 // terminator.
 3584 if (I->getOpcode() == Instruction::Or &&
 3585 isa<BranchInst>(I->getNextNode()) &&
 3586 cast<BranchInst>(I->getNextNode())->isUnconditional())
 3587 return true;
 3588
 3589 if (I->getOpcode() == Instruction::Add ||
 3590 I->getOpcode() == Instruction::Sub)
 3591 return true;
 3592 }
 3594}
3595
// Attribute-compatibility check used when outlining: the "interrupt"
// attribute must not propagate to an outlined function. NOTE(review): the
// signature opener (doxygen 3596) and the final base-class return (3605)
// were lost in extraction — identify the exact hook from the upstream file.
 3597 const Function *Caller, const Attribute &Attr) const {
 3598 // "interrupt" controls the prolog/epilog of interrupt handlers (and includes
 3599 // restrictions on their signatures). We can outline from the bodies of these
 3600 // handlers, but when we do we need to make sure we don't mark the outlined
 3601 // function as an interrupt handler too.
 3602 if (Attr.isStringAttribute() && Attr.getKindAsString() == "interrupt")
 3603 return false;
 3604
 3606}
3607
std::optional<Instruction *>
// InstCombine hook: fold bitcast(riscv.vmv.v.x splat) by re-splatting a wider
// scalar so the bitcast disappears. Requires every user to be a bitcast to
// the same target vector type and the widened element type to be RVV-legal.
// NOTE(review): extraction dropped the signature opener (doxygen 3609), the
// match() opener on 3621, and the IRBuilder/intrinsic-call openers on
// 3651-3652 — confirm against upstream before editing.
 3610 // If all operands of a vmv.v.x are constant, fold a bitcast(vmv.v.x) to scale
 3611 // the vmv.v.x, enabling removal of the bitcast. The transform helps avoid
 3612 // creating redundant masks.
 3613 const DataLayout &DL = IC.getDataLayout();
 3614 if (II.user_empty())
 3615 return {};
 3616 auto *TargetVecTy = dyn_cast<ScalableVectorType>(II.user_back()->getType());
 3617 if (!TargetVecTy)
 3618 return {};
 3619 const APInt *Scalar;
 3620 uint64_t VL;
 3622 m_Poison(), m_APInt(Scalar), m_ConstantInt(VL))) ||
 3623 !all_of(II.users(), [TargetVecTy](User *U) {
 3624 return U->getType() == TargetVecTy && match(U, m_BitCast(m_Value()));
 3625 }))
 3626 return {};
 3627 auto *SourceVecTy = cast<ScalableVectorType>(II.getType());
 3628 unsigned TargetEltBW = DL.getTypeSizeInBits(TargetVecTy->getElementType());
 3629 unsigned SourceEltBW = DL.getTypeSizeInBits(SourceVecTy->getElementType());
// The target element width must be an exact multiple of the source's, and VL
// and the element count must divide evenly by the scale factor.
 3630 if (TargetEltBW % SourceEltBW)
 3631 return {};
 3632 unsigned TargetScale = TargetEltBW / SourceEltBW;
 3633 if (VL % TargetScale)
 3634 return {};
 3635 Type *VLTy = II.getOperand(2)->getType();
 3636 ElementCount SourceEC = SourceVecTy->getElementCount();
 3637 unsigned NewEltBW = SourceEltBW * TargetScale;
 3638 if (!SourceEC.isKnownMultipleOf(TargetScale) ||
 3639 !DL.fitsInLegalInteger(NewEltBW))
 3640 return {};
 3641 auto *NewEltTy = IntegerType::get(II.getContext(), NewEltBW);
 3642 if (!TLI->isLegalElementTypeForRVV(TLI->getValueType(DL, NewEltTy)))
 3643 return {};
 3644 ElementCount NewEC = SourceEC.divideCoefficientBy(TargetScale);
 3645 Type *RetTy = VectorType::get(NewEltTy, NewEC);
 3646 assert(SourceVecTy->canLosslesslyBitCastTo(RetTy) &&
 3647 "Lossless bitcast between types expected");
// Replicate the original scalar across the wider element, then emit the
// widened vmv.v.x with a proportionally reduced VL.
 3648 APInt NewScalar = APInt::getSplat(NewEltBW, *Scalar);
 3649 return IC.replaceInstUsesWith(
 3650 II,
 3653 RetTy, Intrinsic::riscv_vmv_v_x,
 3654 {PoisonValue::get(RetTy), ConstantInt::get(NewEltTy, NewScalar),
 3655 ConstantInt::get(VLTy, VL / TargetScale)}),
 3656 SourceVecTy));
 3657}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static cl::opt< bool > EnableOrLikeSelectOpt("enable-aarch64-or-like-select", cl::init(true), cl::Hidden)
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static bool shouldSplit(Instruction *InsertPoint, DenseSet< Value * > &PrevConditionValues, DenseSet< Value * > &ConditionValues, DominatorTree &DT, DenseSet< Instruction * > &Unhoistables)
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
Cost tables and simple lookup functions.
Hexagon Common GEP
static cl::opt< int > InstrCost("inline-instr-cost", cl::Hidden, cl::init(5), cl::desc("Cost of a single instruction when inlining"))
std::pair< Instruction::BinaryOps, Value * > OffsetOp
Find all possible pairs (BinOp, RHS) that BinOp V, RHS can be simplified.
This file provides the interface for the instcombine pass implementation.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static const Function * getCalledFunction(const Value *V)
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
static InstructionCost costShuffleViaVRegSplitting(const RISCVTTIImpl &TTI, MVT LegalVT, std::optional< unsigned > VLen, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind)
Try to perform better estimation of the permutation.
static InstructionCost costShuffleViaSplitting(const RISCVTTIImpl &TTI, MVT LegalVT, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind)
Attempt to approximate the cost of a shuffle which will require splitting during legalization.
static bool isRepeatedConcatMask(ArrayRef< int > Mask, int &SubVectorSize)
static unsigned isM1OrSmaller(MVT VT)
static cl::opt< bool > EnableOrLikeSelectOpt("enable-riscv-or-like-select", cl::init(true), cl::Hidden)
static cl::opt< unsigned > SLPMaxVF("riscv-v-slp-max-vf", cl::desc("Overrides result used for getMaximumVF query which is used " "exclusively by SLP vectorizer."), cl::Hidden)
static cl::opt< unsigned > RVVRegisterWidthLMUL("riscv-v-register-bit-width-lmul", cl::desc("The LMUL to use for getRegisterBitWidth queries. Affects LMUL used " "by autovectorized code. Fractional LMULs are not supported."), cl::init(2), cl::Hidden)
static cl::opt< unsigned > RVVMinTripCount("riscv-v-min-trip-count", cl::desc("Set the lower bound of a trip count to decide on " "vectorization while tail-folding."), cl::init(5), cl::Hidden)
static InstructionCost getIntImmCostImpl(const DataLayout &DL, const RISCVSubtarget *ST, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, bool FreeZeroes)
static VectorType * getVRGatherIndexType(MVT DataVT, const RISCVSubtarget &ST, LLVMContext &C)
static const CostTblEntry VectorIntrinsicCostTable[]
static bool canUseShiftPair(Instruction *Inst, const APInt &Imm)
static bool canUseShiftCmp(Instruction *Inst, const APInt &Imm)
This file defines a TargetTransformInfoImplBase conforming object specific to the RISC-V target machi...
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
This file contains some templates that are useful if you are working with the STL at all.
#define LLVM_DEBUG(...)
Definition Debug.h:114
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
Class for arbitrary precision integers.
Definition APInt.h:78
static LLVM_ABI APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition APInt.cpp:651
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:201
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
Functions, function parameters, and return types can have attributes to indicate how they should be t...
Definition Attributes.h:105
LLVM_ABI bool isStringAttribute() const
Return true if the attribute is a string (target-dependent) attribute.
LLVM_ABI StringRef getKindAsString() const
Return the attribute's kind as a string.
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType, TTI::TargetCostKind CostKind) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
std::optional< unsigned > getMaxVScale() const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
bool isLegalAddImmediate(int64_t imm) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
std::optional< unsigned > getVScaleForTuning() const override
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *, const SCEV *, TTI::TargetCostKind) const override
unsigned getRegUsageForType(Type *Ty) const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:679
@ FCMP_TRUE
1 1 1 1 Always true (always folded)
Definition InstrTypes.h:693
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:682
@ FCMP_ULE
1 1 0 1 True if unordered, less than, or equal
Definition InstrTypes.h:691
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:680
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition InstrTypes.h:681
@ FCMP_ULT
1 1 0 0 True if unordered or less than
Definition InstrTypes.h:690
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:684
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition InstrTypes.h:687
@ FCMP_UGT
1 0 1 0 True if unordered or greater than
Definition InstrTypes.h:688
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:683
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:685
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition InstrTypes.h:692
@ FCMP_UGE
1 0 1 1 True if unordered, greater than, or equal
Definition InstrTypes.h:689
@ FCMP_FALSE
0 0 0 0 Always false (always folded)
Definition InstrTypes.h:678
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
static bool isFPPredicate(Predicate P)
Definition InstrTypes.h:770
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:776
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
This class represents a range of values.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
bool noNaNs() const
Definition FMF.h:68
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static FixedVectorType * getDoubleElementsVectorType(FixedVectorType *VTy)
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:802
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2176
The core instruction combiner logic.
const DataLayout & getDataLayout() const
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
BuilderTy & Builder
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:318
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
MVT changeTypeToInteger()
Return the type converted to an equivalently sized integer or vector with integer element type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool bitsGT(MVT VT) const
Return true if this has more bits than VT.
bool isFixedLengthVector() const
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
MVT getVectorElementType() const
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
Information for memory intrinsic cost model.
const Instruction * getInst() const
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition Operator.h:43
The optimization diagnostic interface.
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
bool shouldCopyAttributeWhenOutliningFrom(const Function *Caller, const Attribute &Attr) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
bool isLegalMaskedExpandLoad(Type *DataType, Align Alignment) const override
InstructionCost getStridedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
bool isLegalMaskedLoadStore(Type *DataType, Align Alignment) const
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
unsigned getMinTripCountTailFoldingThreshold() const override
TTI::AddressingModeKind getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const override
InstructionCost getAddressComputationCost(Type *PTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const override
InstructionCost getStoreImmCost(Type *VecTy, TTI::OperandValueInfo OpInfo, TTI::TargetCostKind CostKind) const
Return the cost of materializing an immediate for a value operand of a store instruction.
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const override
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const override
bool hasActiveVectorLength() const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
Try to calculate op costs for min/max reduction operations.
bool canSplatOperand(Instruction *I, int Operand) const
Return true if the (vector) instruction I will be lowered to an instruction with a scalar splat opera...
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const override
bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const override
unsigned getRegUsageForType(Type *Ty) const override
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
bool isLegalMaskedScatter(Type *DataType, Align Alignment) const override
bool isLegalMaskedCompressStore(Type *DataTy, Align Alignment) const override
InstructionCost getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, TTI::PartialReductionExtendKind OpAExtend, TTI::PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind, std::optional< FastMathFlags > FMF) const override
bool shouldTreatInstructionLikeSelect(const Instruction *I) const override
InstructionCost getExpandCompressMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
bool preferAlternateOpcodeVectorization() const override
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded in...
std::optional< unsigned > getMaxVScale() const override
bool shouldExpandReduction(const IntrinsicInst *II) const override
std::optional< unsigned > getVScaleForTuning() const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
Get memory intrinsic cost based on arguments.
bool isLegalMaskedGather(Type *DataType, Align Alignment) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind) const override
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
Estimate the overhead of scalarizing an instruction.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpdInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
Get intrinsic cost based on arguments.
InstructionCost getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override
bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const override
See if I should be considered for address type promotion.
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
TargetTransformInfo::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override
static MVT getM1VT(MVT VT)
Given a vector (either fixed or scalable), return the scalable vector corresponding to a vector regis...
InstructionCost getVRGatherVVCost(MVT VT) const
Return the cost of a vrgather.vv instruction for the type VT.
InstructionCost getVRGatherVICost(MVT VT) const
Return the cost of a vrgather.vi (or vx) instruction for the type VT.
static unsigned computeVLMAX(unsigned VectorBits, unsigned EltSize, unsigned MinSize)
InstructionCost getLMULCost(MVT VT) const
Return the cost of LMUL for linear operations.
InstructionCost getVSlideVICost(MVT VT) const
Return the cost of a vslidedown.vi or vslideup.vi instruction for the type VT.
InstructionCost getVSlideVXCost(MVT VT) const
Return the cost of a vslidedown.vx or vslideup.vx instruction for the type VT.
static RISCVVType::VLMUL getLMUL(MVT VT)
This class represents an analyzed expression in the program.
static LLVM_ABI ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition Type.cpp:824
The main scalar evolution driver.
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:291
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
virtual const DataLayout & getDataLayout() const
virtual bool shouldTreatInstructionLikeSelect(const Instruction *I) const
virtual TTI::AddressingModeKind getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const
virtual bool shouldCopyAttributeWhenOutliningFrom(const Function *Caller, const Attribute &Attr) const
virtual bool isLoweredToCall(const Function *F) const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind) const override
VectorInstrContext
Represents a hint about the context in which an insert/extract is used.
@ None
The insert/extract is not used with a load/store.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Expensive
The cost of a 'div' instruction on x86.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
AddressingModeKind
Which addressing mode Loop Strength Reduction will try to generate.
@ AMK_PostIndexed
Prefer post-indexed addressing mode.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:297
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:61
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition Type.h:145
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:300
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
Definition Type.cpp:284
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:403
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:440
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:259
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:967
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
constexpr bool isKnownMultipleOf(ScalarTy RHS) const
This function tells the caller whether the element count is known at compile time to be a multiple of...
Definition TypeSize.h:180
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
static constexpr bool isKnownLE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:230
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition TypeSize.h:252
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:880
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:844
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:704
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:765
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:850
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:978
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:926
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:959
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:856
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
class_match< PoisonValue > m_Poison()
Match an arbitrary poison constant.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
bool match(Val *V, const Pattern &P)
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
int getIntMatCost(const APInt &Val, unsigned Size, const MCSubtargetInfo &STI, bool CompressionCost, bool FreeZeroes)
static constexpr unsigned RVVBitsPerBlock
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition MathExtras.h:344
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition CostTable.h:35
LLVM_ABI bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
InstructionCost Cost
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:202
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition MathExtras.h:273
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
constexpr int PoisonMaskElem
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
TargetTransformInfo TTI
LLVM_ABI bool isMaskedSlidePair(ArrayRef< int > Mask, int NumElts, std::array< std::pair< int, int >, 2 > &SrcInfo)
Does this shuffle mask represent either one slide shuffle or a pair of two slide shuffles,...
LLVM_ABI llvm::SmallVector< int, 16 > createInterleaveMask(unsigned VF, unsigned NumVecs)
Create an interleave shuffle mask.
DWARFExpression::Operation Op
CostTblEntryT< unsigned > CostTblEntry
Definition CostTable.h:30
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1885
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
constexpr int64_t SignExtend64(uint64_t x)
Sign-extend the number in the bottom B bits of X to a 64-bit integer.
Definition MathExtras.h:572
LLVM_ABI void processShuffleMasks(ArrayRef< int > Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs, unsigned NumOfUsedRegs, function_ref< void()> NoInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned)> SingleInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned, bool)> ManyInputsAction)
Splits and processes shuffle mask depending on the number of input and output registers.
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2146
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:330
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
Extended Value Type.
Definition ValueTypes.h:35
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
Align valueOrOne() const
For convenience, returns a valid alignment or 1 if undefined.
Definition Alignment.h:130
Information about a load/store intrinsic defined by the target.
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is.
Describe known properties for a set of pointers.
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).