LLVM 23.0.0git
AMDGPUTargetTransformInfo.cpp
Go to the documentation of this file.
1//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// \file
10// This file implements a TargetTransformInfo analysis pass specific to the
11// AMDGPU target machine. It uses the target's detailed information to provide
12// more precise answers to certain TTI queries, while letting the target
13// independent and default TTI implementations handle the rest.
14//
15//===----------------------------------------------------------------------===//
16
18#include "AMDGPUTargetMachine.h"
25#include "llvm/IR/Function.h"
26#include "llvm/IR/IRBuilder.h"
27#include "llvm/IR/IntrinsicsAMDGPU.h"
30#include <optional>
31
32using namespace llvm;
33
34#define DEBUG_TYPE "AMDGPUtti"
35
37 "amdgpu-unroll-threshold-private",
38 cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
39 cl::init(2700), cl::Hidden);
40
42 "amdgpu-unroll-threshold-local",
43 cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
44 cl::init(1000), cl::Hidden);
45
47 "amdgpu-unroll-threshold-if",
48 cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
49 cl::init(200), cl::Hidden);
50
52 "amdgpu-unroll-runtime-local",
53 cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
54 cl::init(true), cl::Hidden);
55
57 "amdgpu-unroll-max-block-to-analyze",
58 cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
59 cl::init(32), cl::Hidden);
60
61static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
62 cl::Hidden, cl::init(4000),
63 cl::desc("Cost of alloca argument"));
64
65// If the amount of scratch memory to eliminate exceeds our ability to allocate
66// it into registers we gain nothing by aggressively inlining functions for that
67// heuristic.
69 ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
70 cl::init(256),
71 cl::desc("Maximum alloca size to use for inline cost"));
72
73// Inliner constraint to achieve reasonable compilation time.
75 "amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
76 cl::desc("Maximum number of BBs allowed in a function after inlining"
77 " (compile time constraint)"));
78
79// This default unroll factor is based on microbenchmarks on gfx1030.
81 "amdgpu-memcpy-loop-unroll",
82 cl::desc("Unroll factor (affecting 4x32-bit operations) to use for memory "
83 "operations when lowering statically-sized memcpy, memmove, or"
84 "memset as a loop"),
85 cl::init(16), cl::Hidden);
86
87static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
88 unsigned Depth = 0) {
90 if (!I)
91 return false;
92
93 for (const Value *V : I->operand_values()) {
94 if (!L->contains(I))
95 continue;
96 if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
97 if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
98 return SubLoop->contains(PHI); }))
99 return true;
100 } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
101 return true;
102 }
103 return false;
104}
105
107 : BaseT(TM, F.getDataLayout()),
108 TargetTriple(TM->getTargetTriple()),
109 ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
110 TLI(ST->getTargetLowering()) {}
111
114 OptimizationRemarkEmitter *ORE) const {
115 const Function &F = *L->getHeader()->getParent();
116 UP.Threshold =
117 F.getFnAttributeAsParsedInteger("amdgpu-unroll-threshold", 300);
118 UP.MaxCount = std::numeric_limits<unsigned>::max();
119 UP.Partial = true;
120
121 // Conditional branch in a loop back edge needs 3 additional exec
 122 // manipulations on average.
123 UP.BEInsns += 3;
124
125 // We want to run unroll even for the loops which have been vectorized.
126 UP.UnrollVectorizedLoop = true;
127
128 // TODO: Do we want runtime unrolling?
129
 130 // Maximum alloca size that can fit in registers. Reserve 16 registers.
131 const unsigned MaxAlloca = (256 - 16) * 4;
132 unsigned ThresholdPrivate = UnrollThresholdPrivate;
133 unsigned ThresholdLocal = UnrollThresholdLocal;
134
135 // If this loop has the amdgpu.loop.unroll.threshold metadata we will use the
136 // provided threshold value as the default for Threshold
137 if (MDNode *LoopUnrollThreshold =
138 findOptionMDForLoop(L, "amdgpu.loop.unroll.threshold")) {
139 if (LoopUnrollThreshold->getNumOperands() == 2) {
141 LoopUnrollThreshold->getOperand(1));
142 if (MetaThresholdValue) {
143 // We will also use the supplied value for PartialThreshold for now.
144 // We may introduce additional metadata if it becomes necessary in the
145 // future.
146 UP.Threshold = MetaThresholdValue->getSExtValue();
148 ThresholdPrivate = std::min(ThresholdPrivate, UP.Threshold);
149 ThresholdLocal = std::min(ThresholdLocal, UP.Threshold);
150 }
151 }
152 }
153
154 unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
155 for (const BasicBlock *BB : L->getBlocks()) {
156 const DataLayout &DL = BB->getDataLayout();
157 unsigned LocalGEPsSeen = 0;
158
159 if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
160 return SubLoop->contains(BB); }))
161 continue; // Block belongs to an inner loop.
162
163 for (const Instruction &I : *BB) {
164 // Unroll a loop which contains an "if" statement whose condition
165 // defined by a PHI belonging to the loop. This may help to eliminate
166 // if region and potentially even PHI itself, saving on both divergence
167 // and registers used for the PHI.
168 // Add a small bonus for each of such "if" statements.
169 if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
170 if (UP.Threshold < MaxBoost && Br->isConditional()) {
171 BasicBlock *Succ0 = Br->getSuccessor(0);
172 BasicBlock *Succ1 = Br->getSuccessor(1);
173 if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
174 (L->contains(Succ1) && L->isLoopExiting(Succ1)))
175 continue;
176 if (dependsOnLocalPhi(L, Br->getCondition())) {
178 LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
179 << " for loop:\n"
180 << *L << " due to " << *Br << '\n');
181 if (UP.Threshold >= MaxBoost)
182 return;
183 }
184 }
185 continue;
186 }
187
189 if (!GEP)
190 continue;
191
192 unsigned AS = GEP->getAddressSpace();
193 unsigned Threshold = 0;
195 Threshold = ThresholdPrivate;
197 Threshold = ThresholdLocal;
198 else
199 continue;
200
201 if (UP.Threshold >= Threshold)
202 continue;
203
204 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
205 const Value *Ptr = GEP->getPointerOperand();
206 const AllocaInst *Alloca =
208 if (!Alloca || !Alloca->isStaticAlloca())
209 continue;
210 auto AllocaSize = Alloca->getAllocationSize(DL);
211 if (!AllocaSize || AllocaSize->getFixedValue() > MaxAlloca)
212 continue;
213 } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
215 LocalGEPsSeen++;
216 // Inhibit unroll for local memory if we have seen addressing not to
217 // a variable, most likely we will be unable to combine it.
218 // Do not unroll too deep inner loops for local memory to give a chance
219 // to unroll an outer loop for a more important reason.
220 if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
221 (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
222 !isa<Argument>(GEP->getPointerOperand())))
223 continue;
224 LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
225 << *L << " due to LDS use.\n");
227 }
228
229 // Check if GEP depends on a value defined by this loop itself.
230 bool HasLoopDef = false;
231 for (const Value *Op : GEP->operands()) {
232 const Instruction *Inst = dyn_cast<Instruction>(Op);
233 if (!Inst || L->isLoopInvariant(Op))
234 continue;
235
236 if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
237 return SubLoop->contains(Inst); }))
238 continue;
239 HasLoopDef = true;
240 break;
241 }
242 if (!HasLoopDef)
243 continue;
244
245 // We want to do whatever we can to limit the number of alloca
246 // instructions that make it through to the code generator. allocas
247 // require us to use indirect addressing, which is slow and prone to
248 // compiler bugs. If this loop does an address calculation on an
249 // alloca ptr, then we want to use a higher than normal loop unroll
250 // threshold. This will give SROA a better chance to eliminate these
251 // allocas.
252 //
253 // We also want to have more unrolling for local memory to let ds
254 // instructions with different offsets combine.
255 //
256 // Don't use the maximum allowed value here as it will make some
257 // programs way too big.
258 UP.Threshold = Threshold;
259 LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
260 << " for loop:\n"
261 << *L << " due to " << *GEP << '\n');
262 if (UP.Threshold >= MaxBoost)
263 return;
264 }
265
266 // If we got a GEP in a small BB from inner loop then increase max trip
267 // count to analyze for better estimation cost in unroll
268 if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze)
270 }
271}
272
277
281
282const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
283 // Codegen control options which don't matter.
284 AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
285 AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureUseFlatForGlobal,
286 AMDGPU::FeatureUnalignedScratchAccess, AMDGPU::FeatureUnalignedAccessMode,
287
288 AMDGPU::FeatureAutoWaitcntBeforeBarrier,
289
290 // Property of the kernel/environment which can't actually differ.
291 AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
292 AMDGPU::FeatureTrapHandler,
293
294 // The default assumption needs to be ecc is enabled, but no directly
295 // exposed operations depend on it, so it can be safely inlined.
296 AMDGPU::FeatureSRAMECC,
297
298 // Perf-tuning features
299 AMDGPU::FeatureFastFMAF32, AMDGPU::FeatureHalfRate64Ops};
300
302 : BaseT(TM, F.getDataLayout()),
303 ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
304 TLI(ST->getTargetLowering()), CommonTTI(TM, F),
305 IsGraphics(AMDGPU::isGraphics(F.getCallingConv())) {
307 HasFP32Denormals = Mode.FP32Denormals != DenormalMode::getPreserveSign();
308 HasFP64FP16Denormals =
309 Mode.FP64FP16Denormals != DenormalMode::getPreserveSign();
310}
311
313 return !F || !ST->isSingleLaneExecution(*F);
314}
315
316unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
317 // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
318 // registers. See getRegisterClassForType for the implementation.
319 // In this case vector registers are not vector in terms of
320 // VGPRs, but those which can hold multiple values.
321
322 // This is really the number of registers to fill when vectorizing /
323 // interleaving loops, so we lie to avoid trying to use all registers.
324 return 4;
325}
326
329 switch (K) {
331 return TypeSize::getFixed(32);
333 return TypeSize::getFixed(ST->hasPackedFP32Ops() ? 64 : 32);
335 return TypeSize::getScalable(0);
336 }
337 llvm_unreachable("Unsupported register kind");
338}
339
341 return 32;
342}
343
344unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
345 if (Opcode == Instruction::Load || Opcode == Instruction::Store)
346 return 32 * 4 / ElemWidth;
347 // For a given width return the max 0number of elements that can be combined
348 // into a wider bit value:
349 return (ElemWidth == 8 && ST->has16BitInsts()) ? 4
350 : (ElemWidth == 16 && ST->has16BitInsts()) ? 2
351 : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
352 : 1;
353}
354
355unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
356 unsigned ChainSizeInBytes,
357 VectorType *VecTy) const {
358 unsigned VecRegBitWidth = VF * LoadSize;
359 if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
360 // TODO: Support element-size less than 32bit?
361 return 128 / LoadSize;
362
363 return VF;
364}
365
366unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
367 unsigned ChainSizeInBytes,
368 VectorType *VecTy) const {
369 unsigned VecRegBitWidth = VF * StoreSize;
370 if (VecRegBitWidth > 128)
371 return 128 / StoreSize;
372
373 return VF;
374}
375
376unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
377 if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
378 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
380 AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
381 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
383 return 512;
384 }
385
386 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
387 return 8 * ST->getMaxPrivateElementSize();
388
389 // Common to flat, global, local and region. Assume for unknown addrspace.
390 return 128;
391}
392
393bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
394 Align Alignment,
395 unsigned AddrSpace) const {
396 // We allow vectorization of flat stores, even though we may need to decompose
397 // them later if they may access private memory. We don't have enough context
398 // here, and legalization can handle it.
399 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
400 return (Alignment >= 4 || ST->hasUnalignedScratchAccessEnabled()) &&
401 ChainSizeInBytes <= ST->getMaxPrivateElementSize();
402 }
403 return true;
404}
405
406bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
407 Align Alignment,
408 unsigned AddrSpace) const {
409 return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
410}
411
412bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
413 Align Alignment,
414 unsigned AddrSpace) const {
415 return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
416}
417
421
423 LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
424 unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
425 std::optional<uint32_t> AtomicElementSize) const {
426
427 if (AtomicElementSize)
428 return Type::getIntNTy(Context, *AtomicElementSize * 8);
429
430 // 16-byte accesses achieve the highest copy throughput.
431 // If the operation has a fixed known length that is large enough, it is
432 // worthwhile to return an even wider type and let legalization lower it into
433 // multiple accesses, effectively unrolling the memcpy loop.
434 // We also rely on legalization to decompose into smaller accesses for
435 // subtargets and address spaces where it is necessary.
436 //
437 // Don't unroll if Length is not a constant, since unrolling leads to worse
438 // performance for length values that are smaller or slightly larger than the
439 // total size of the type returned here. Mitigating that would require a more
440 // complex lowering for variable-length memcpy and memmove.
441 unsigned I32EltsInVector = 4;
444 MemcpyLoopUnroll * I32EltsInVector);
445
446 return FixedVectorType::get(Type::getInt32Ty(Context), I32EltsInVector);
447}
448
450 SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
451 unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
452 Align SrcAlign, Align DestAlign,
453 std::optional<uint32_t> AtomicCpySize) const {
454
455 if (AtomicCpySize)
457 OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
458 DestAlign, AtomicCpySize);
459
460 Type *I32x4Ty = FixedVectorType::get(Type::getInt32Ty(Context), 4);
461 while (RemainingBytes >= 16) {
462 OpsOut.push_back(I32x4Ty);
463 RemainingBytes -= 16;
464 }
465
466 Type *I64Ty = Type::getInt64Ty(Context);
467 while (RemainingBytes >= 8) {
468 OpsOut.push_back(I64Ty);
469 RemainingBytes -= 8;
470 }
471
472 Type *I32Ty = Type::getInt32Ty(Context);
473 while (RemainingBytes >= 4) {
474 OpsOut.push_back(I32Ty);
475 RemainingBytes -= 4;
476 }
477
478 Type *I16Ty = Type::getInt16Ty(Context);
479 while (RemainingBytes >= 2) {
480 OpsOut.push_back(I16Ty);
481 RemainingBytes -= 2;
482 }
483
484 Type *I8Ty = Type::getInt8Ty(Context);
485 while (RemainingBytes) {
486 OpsOut.push_back(I8Ty);
487 --RemainingBytes;
488 }
489}
490
492 // Disable unrolling if the loop is not vectorized.
493 // TODO: Enable this again.
494 if (VF.isScalar())
495 return 1;
496
497 return 8;
498}
499
501 MemIntrinsicInfo &Info) const {
502 switch (Inst->getIntrinsicID()) {
503 case Intrinsic::amdgcn_ds_ordered_add:
504 case Intrinsic::amdgcn_ds_ordered_swap: {
505 auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
506 auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
507 if (!Ordering || !Volatile)
508 return false; // Invalid.
509
510 unsigned OrderingVal = Ordering->getZExtValue();
511 if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
512 return false;
513
514 Info.PtrVal = Inst->getArgOperand(0);
515 Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
516 Info.ReadMem = true;
517 Info.WriteMem = true;
518 Info.IsVolatile = !Volatile->isZero();
519 return true;
520 }
521 default:
522 return false;
523 }
524}
525
527 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
529 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
530
531 // Legalize the type.
532 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
533 int ISD = TLI->InstructionOpcodeToISD(Opcode);
534
535 // Because we don't have any legal vector operations, but the legal types, we
536 // need to account for split vectors.
537 unsigned NElts = LT.second.isVector() ?
538 LT.second.getVectorNumElements() : 1;
539
540 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
541
542 switch (ISD) {
543 case ISD::SHL:
544 case ISD::SRL:
545 case ISD::SRA:
546 if (SLT == MVT::i64)
547 return get64BitInstrCost(CostKind) * LT.first * NElts;
548
549 if (ST->has16BitInsts() && SLT == MVT::i16)
550 NElts = (NElts + 1) / 2;
551
552 // i32
553 return getFullRateInstrCost() * LT.first * NElts;
554 case ISD::ADD:
555 case ISD::SUB:
556 case ISD::AND:
557 case ISD::OR:
558 case ISD::XOR:
559 if (SLT == MVT::i64) {
560 // and, or and xor are typically split into 2 VALU instructions.
561 return 2 * getFullRateInstrCost() * LT.first * NElts;
562 }
563
564 if (ST->has16BitInsts() && SLT == MVT::i16)
565 NElts = (NElts + 1) / 2;
566
567 return LT.first * NElts * getFullRateInstrCost();
568 case ISD::MUL: {
569 const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
570 if (SLT == MVT::i64) {
571 const int FullRateCost = getFullRateInstrCost();
572 return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
573 }
574
575 if (ST->has16BitInsts() && SLT == MVT::i16)
576 NElts = (NElts + 1) / 2;
577
578 // i32
579 return QuarterRateCost * NElts * LT.first;
580 }
581 case ISD::FMUL:
582 // Check possible fuse {fadd|fsub}(a,fmul(b,c)) and return zero cost for
583 // fmul(b,c) supposing the fadd|fsub will get estimated cost for the whole
584 // fused operation.
585 if (CxtI && CxtI->hasOneUse())
586 if (const auto *FAdd = dyn_cast<BinaryOperator>(*CxtI->user_begin())) {
587 const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());
588 if (OPC == ISD::FADD || OPC == ISD::FSUB) {
589 if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
591 if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
593
594 // Estimate all types may be fused with contract/unsafe flags
595 const TargetOptions &Options = TLI->getTargetMachine().Options;
596 if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
597 (FAdd->hasAllowContract() && CxtI->hasAllowContract()))
599 }
600 }
601 [[fallthrough]];
602 case ISD::FADD:
603 case ISD::FSUB:
604 if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
605 NElts = (NElts + 1) / 2;
606 if (ST->hasBF16PackedInsts() && SLT == MVT::bf16)
607 NElts = (NElts + 1) / 2;
608 if (SLT == MVT::f64)
609 return LT.first * NElts * get64BitInstrCost(CostKind);
610
611 if (ST->has16BitInsts() && SLT == MVT::f16)
612 NElts = (NElts + 1) / 2;
613
614 if (SLT == MVT::f32 || SLT == MVT::f16 || SLT == MVT::bf16)
615 return LT.first * NElts * getFullRateInstrCost();
616 break;
617 case ISD::FDIV:
618 case ISD::FREM:
619 // FIXME: frem should be handled separately. The fdiv in it is most of it,
620 // but the current lowering is also not entirely correct.
621 if (SLT == MVT::f64) {
622 int Cost = 7 * get64BitInstrCost(CostKind) +
623 getQuarterRateInstrCost(CostKind) +
624 3 * getHalfRateInstrCost(CostKind);
625 // Add cost of workaround.
626 if (!ST->hasUsableDivScaleConditionOutput())
627 Cost += 3 * getFullRateInstrCost();
628
629 return LT.first * Cost * NElts;
630 }
631
632 if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
633 // TODO: This is more complicated, unsafe flags etc.
634 if ((SLT == MVT::f32 && !HasFP32Denormals) ||
635 (SLT == MVT::f16 && ST->has16BitInsts())) {
636 return LT.first * getQuarterRateInstrCost(CostKind) * NElts;
637 }
638 }
639
640 if (SLT == MVT::f16 && ST->has16BitInsts()) {
641 // 2 x v_cvt_f32_f16
642 // f32 rcp
643 // f32 fmul
644 // v_cvt_f16_f32
645 // f16 div_fixup
646 int Cost =
647 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(CostKind);
648 return LT.first * Cost * NElts;
649 }
650
651 if (SLT == MVT::f32 && (CxtI && CxtI->hasApproxFunc())) {
652 // Fast unsafe fdiv lowering:
653 // f32 rcp
654 // f32 fmul
655 int Cost = getQuarterRateInstrCost(CostKind) + getFullRateInstrCost();
656 return LT.first * Cost * NElts;
657 }
658
659 if (SLT == MVT::f32 || SLT == MVT::f16) {
660 // 4 more v_cvt_* insts without f16 insts support
661 int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
662 1 * getQuarterRateInstrCost(CostKind);
663
664 if (!HasFP32Denormals) {
665 // FP mode switches.
666 Cost += 2 * getFullRateInstrCost();
667 }
668
669 return LT.first * NElts * Cost;
670 }
671 break;
672 case ISD::FNEG:
 673 // Use the backend's estimation. If fneg is not free each element will cost
674 // one additional instruction.
675 return TLI->isFNegFree(SLT) ? 0 : NElts;
676 default:
677 break;
678 }
679
680 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
681 Args, CxtI);
682}
683
684// Return true if there's a potential benefit from using v2f16/v2i16
685// instructions for an intrinsic, even if it requires nontrivial legalization.
687 switch (ID) {
688 case Intrinsic::fma:
689 case Intrinsic::fmuladd:
690 case Intrinsic::copysign:
691 case Intrinsic::minimumnum:
692 case Intrinsic::maximumnum:
693 case Intrinsic::canonicalize:
694 // There's a small benefit to using vector ops in the legalized code.
695 case Intrinsic::round:
696 case Intrinsic::uadd_sat:
697 case Intrinsic::usub_sat:
698 case Intrinsic::sadd_sat:
699 case Intrinsic::ssub_sat:
700 case Intrinsic::abs:
701 return true;
702 default:
703 return false;
704 }
705}
706
710 switch (ICA.getID()) {
711 case Intrinsic::fabs:
712 // Free source modifier in the common case.
713 return 0;
714 case Intrinsic::amdgcn_workitem_id_x:
715 case Intrinsic::amdgcn_workitem_id_y:
716 case Intrinsic::amdgcn_workitem_id_z:
717 // TODO: If hasPackedTID, or if the calling context is not an entry point
718 // there may be a bit instruction.
719 return 0;
720 case Intrinsic::amdgcn_workgroup_id_x:
721 case Intrinsic::amdgcn_workgroup_id_y:
722 case Intrinsic::amdgcn_workgroup_id_z:
723 case Intrinsic::amdgcn_lds_kernel_id:
724 case Intrinsic::amdgcn_dispatch_ptr:
725 case Intrinsic::amdgcn_dispatch_id:
726 case Intrinsic::amdgcn_implicitarg_ptr:
727 case Intrinsic::amdgcn_queue_ptr:
728 // Read from an argument register.
729 return 0;
730 default:
731 break;
732 }
733
734 Type *RetTy = ICA.getReturnType();
735
736 Intrinsic::ID IID = ICA.getID();
737 switch (IID) {
738 case Intrinsic::exp:
739 case Intrinsic::exp2:
740 case Intrinsic::exp10: {
741 // Legalize the type.
742 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
743 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
744 unsigned NElts =
745 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
746
747 if (SLT == MVT::f64) {
748 unsigned NumOps = 20;
749 if (IID == Intrinsic::exp)
750 ++NumOps;
751 else if (IID == Intrinsic::exp10)
752 NumOps += 3;
753
754 return LT.first * NElts * NumOps * get64BitInstrCost(CostKind);
755 }
756
757 if (SLT == MVT::f32) {
758 unsigned NumFullRateOps = 0;
759 // v_exp_f32 (quarter rate).
760 unsigned NumQuarterRateOps = 1;
761
762 if (!ICA.getFlags().approxFunc() && IID != Intrinsic::exp2) {
763 // Non-AFN exp/exp10: range reduction + v_exp_f32 + ldexp +
764 // overflow/underflow checks (lowerFEXP). Denorm is also handled.
765 // FMA preamble: ~13 full-rate ops; non-FMA: ~17.
766 NumFullRateOps = ST->hasFastFMAF32() ? 13 : 17;
767 } else {
768 if (IID == Intrinsic::exp) {
769 // lowerFEXPUnsafe: fmul (base conversion) + v_exp_f32.
770 NumFullRateOps = 1;
771 } else if (IID == Intrinsic::exp10) {
772 // lowerFEXP10Unsafe: 3 fmul + 2 v_exp_f32 (double-exp2).
773 NumFullRateOps = 3;
774 NumQuarterRateOps = 2;
775 }
776 // Denorm scaling adds setcc + select + fadd + select + fmul.
777 if (HasFP32Denormals)
778 NumFullRateOps += 5;
779 }
780
782 NumFullRateOps * getFullRateInstrCost() +
783 NumQuarterRateOps * getQuarterRateInstrCost(CostKind);
784 return LT.first * NElts * Cost;
785 }
786
787 break;
788 }
789 default:
790 break;
791 }
792
795
796 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
797 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
798 unsigned NElts = LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
799
800 if ((ST->hasVOP3PInsts() &&
801 (SLT == MVT::f16 || SLT == MVT::i16 ||
802 (SLT == MVT::bf16 && ST->hasBF16PackedInsts()))) ||
803 (ST->hasPackedFP32Ops() && SLT == MVT::f32))
804 NElts = (NElts + 1) / 2;
805
806 // TODO: Get more refined intrinsic costs?
807 unsigned InstRate = getQuarterRateInstrCost(CostKind);
808
809 switch (ICA.getID()) {
810 case Intrinsic::fma:
811 case Intrinsic::fmuladd:
812 if (SLT == MVT::f64) {
813 InstRate = get64BitInstrCost(CostKind);
814 break;
815 }
816
817 if ((SLT == MVT::f32 && ST->hasFastFMAF32()) || SLT == MVT::f16)
818 InstRate = getFullRateInstrCost();
819 else {
820 InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
821 : getQuarterRateInstrCost(CostKind);
822 }
823 break;
824 case Intrinsic::copysign:
825 return NElts * getFullRateInstrCost();
826 case Intrinsic::minimumnum:
827 case Intrinsic::maximumnum: {
 828 // Instruction + 2 canonicalizes. For cases that need type promotion, the
829 // promotion takes the place of the canonicalize.
830 unsigned NumOps = 3;
831 if (const IntrinsicInst *II = ICA.getInst()) {
832 // Directly legal with ieee=0
833 // TODO: Not directly legal with strictfp
835 NumOps = 1;
836 }
837
838 unsigned BaseRate =
839 SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();
840 InstRate = BaseRate * NumOps;
841 break;
842 }
843 case Intrinsic::canonicalize: {
844 InstRate =
845 SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();
846 break;
847 }
848 case Intrinsic::uadd_sat:
849 case Intrinsic::usub_sat:
850 case Intrinsic::sadd_sat:
851 case Intrinsic::ssub_sat: {
852 if (SLT == MVT::i16 || SLT == MVT::i32)
853 InstRate = getFullRateInstrCost();
854
855 static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
856 if (any_of(ValidSatTys, equal_to(LT.second)))
857 NElts = 1;
858 break;
859 }
860 case Intrinsic::abs:
861 // Expansion takes 2 instructions for VALU
862 if (SLT == MVT::i16 || SLT == MVT::i32)
863 InstRate = 2 * getFullRateInstrCost();
864 break;
865 default:
866 break;
867 }
868
869 return LT.first * NElts * InstRate;
870}
871
874 const Instruction *I) const {
875 assert((I == nullptr || I->getOpcode() == Opcode) &&
876 "Opcode should reflect passed instruction.");
877 const bool SCost =
879 const int CBrCost = SCost ? 5 : 7;
880 switch (Opcode) {
881 case Instruction::Br: {
882 // Branch instruction takes about 4 slots on gfx900.
883 const auto *BI = dyn_cast_or_null<BranchInst>(I);
884 if (BI && BI->isUnconditional())
885 return SCost ? 1 : 4;
886 // Suppose conditional branch takes additional 3 exec manipulations
887 // instructions in average.
888 return CBrCost;
889 }
890 case Instruction::Switch: {
891 const auto *SI = dyn_cast_or_null<SwitchInst>(I);
892 // Each case (including default) takes 1 cmp + 1 cbr instructions in
893 // average.
894 return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1);
895 }
896 case Instruction::Ret:
897 return SCost ? 1 : 10;
898 }
899 return BaseT::getCFInstrCost(Opcode, CostKind, I);
900}
901
904 std::optional<FastMathFlags> FMF,
907 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
908
909 EVT OrigTy = TLI->getValueType(DL, Ty);
910
911 // Computes cost on targets that have packed math instructions(which support
912 // 16-bit types only).
913 if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
914 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
915
916 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
917 return LT.first * getFullRateInstrCost();
918}
919
922 FastMathFlags FMF,
924 EVT OrigTy = TLI->getValueType(DL, Ty);
925
926 // Computes cost on targets that have packed math instructions(which support
927 // 16-bit types only).
928 if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
929 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
930
931 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
932 return LT.first * getHalfRateInstrCost(CostKind);
933}
934
936 unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, unsigned Index,
937 const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
938 switch (Opcode) {
939 case Instruction::ExtractElement:
940 case Instruction::InsertElement: {
941 unsigned EltSize
942 = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
943 if (EltSize < 32) {
944 if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
945 return 0;
946 return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1,
947 VIC);
948 }
949
950 // Extracts are just reads of a subregister, so are free. Inserts are
951 // considered free because we don't want to have any cost for scalarizing
952 // operations, and we don't have to copy into a different register class.
953
954 // Dynamic indexing isn't free and is best avoided.
955 return Index == ~0u ? 2 : 0;
956 }
957 default:
958 return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1,
959 VIC);
960 }
961}
962
963/// Analyze if the results of inline asm are divergent. If \p Indices is empty,
964/// this is analyzing the collective result of all output registers. Otherwise,
965/// this is only querying a specific result index if this returns multiple
966/// registers in a struct.
968 const CallInst *CI, ArrayRef<unsigned> Indices) const {
969 // TODO: Handle complex extract indices
970 if (Indices.size() > 1)
971 return true;
972
973 const DataLayout &DL = CI->getDataLayout();
974 const SIRegisterInfo *TRI = ST->getRegisterInfo();
975 TargetLowering::AsmOperandInfoVector TargetConstraints =
976 TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI);
977
978 const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];
979
980 int OutputIdx = 0;
981 for (auto &TC : TargetConstraints) {
982 if (TC.Type != InlineAsm::isOutput)
983 continue;
984
985 // Skip outputs we don't care about.
986 if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
987 continue;
988
989 TLI->ComputeConstraintToUse(TC, SDValue());
990
991 const TargetRegisterClass *RC = TLI->getRegForInlineAsmConstraint(
992 TRI, TC.ConstraintCode, TC.ConstraintVT).second;
993
994 // For AGPR constraints null is returned on subtargets without AGPRs, so
995 // assume divergent for null.
996 if (!RC || !TRI->isSGPRClass(RC))
997 return true;
998 }
999
1000 return false;
1001}
1002
// Decides whether the result of the intrinsic call \p ReadReg is treated as
// divergent, using the register-name string stored in the metadata node of its
// first argument together with the call's result type.
// NOTE(review): the opening line of this signature (return type and function
// name) is absent from this listing.
1004 const IntrinsicInst *ReadReg) const {
1005 Metadata *MD =
1006 cast<MetadataAsValue>(ReadReg->getArgOperand(0))->getMetadata();
 // NOTE(review): the line that declares RegName is absent from this listing;
 // the expression below is its initializer.
1008 cast<MDString>(cast<MDNode>(MD)->getOperand(0))->getString();
1009
1010 // Special case registers that look like VCC.
1011 MVT VT = MVT::getVT(ReadReg->getType());
1012 if (VT == MVT::i1)
1013 return true;
1014
1015 // Special case scalar registers that start with 'v'.
 // An empty name is also answered here, which keeps RegName[0] below in bounds.
1016 if (RegName.starts_with("vcc") || RegName.empty())
1017 return false;
1018
1019 // VGPR or AGPR is divergent. There aren't any specially named vector
1020 // registers.
1021 return RegName[0] == 'v' || RegName[0] == 'a';
1022}
1023
1024/// \returns true if the result of the value could potentially be
1025/// different across workitems in a wavefront.
1026bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
1027 if (const Argument *A = dyn_cast<Argument>(V))
 // NOTE(review): the statement guarded by this Argument check is absent from
 // this listing.
1029
1030 // Loads from the private and flat address spaces are divergent, because
1031 // threads can execute the load instruction with the same inputs and get
1032 // different results.
1033 //
1034 // All other loads are not divergent, because if threads issue loads with the
1035 // same arguments, they will always get the same result.
1036 if (const LoadInst *Load = dyn_cast<LoadInst>(V))
1037 return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
1038 Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
1039
1040 // Atomics are divergent because they are executed sequentially: when an
1041 // atomic operation refers to the same address in each thread, then each
1042 // thread after the first sees the value written by the previous thread as
1043 // original value.
 // NOTE(review): the condition guarding the return below is absent from this
 // listing; per the comment above it is expected to select atomic operations.
1045 return true;
1046
 // NOTE(review): the line that opens this scope and binds Intrinsic is absent
 // from this listing.
1048 Intrinsic::ID IID = Intrinsic->getIntrinsicID();
1049 switch (IID) {
1050 case Intrinsic::read_register:
 // NOTE(review): the statement for the read_register case is absent from this
 // listing.
1052 case Intrinsic::amdgcn_addrspacecast_nonnull: {
 // Divergent only for a private -> flat cast on subtargets that report
 // globally addressable scratch.
1053 unsigned SrcAS =
1054 Intrinsic->getOperand(0)->getType()->getPointerAddressSpace();
1055 unsigned DstAS = Intrinsic->getType()->getPointerAddressSpace();
1056 return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
1057 DstAS == AMDGPUAS::FLAT_ADDRESS &&
1058 ST->hasGloballyAddressableScratch();
1059 }
1060 case Intrinsic::amdgcn_workitem_id_y:
1061 case Intrinsic::amdgcn_workitem_id_z: {
 // The Y/Z workitem id is not divergent when the subtarget reports uniform
 // Y/Z for this function, or when this dimension's required work-group size
 // is known to be exactly 1.
1062 const Function *F = Intrinsic->getFunction();
1063 bool HasUniformYZ =
1064 ST->hasWavefrontsEvenlySplittingXDim(*F, /*REquiresUniformYZ=*/true);
1065 std::optional<unsigned> ThisDimSize = ST->getReqdWorkGroupSize(
1066 *F, IID == Intrinsic::amdgcn_workitem_id_y ? 1 : 2);
1067 return !HasUniformYZ && (!ThisDimSize || *ThisDimSize != 1);
1068 }
1069 default:
 // NOTE(review): the statement for the default case is absent from this
 // listing.
1071 }
1072 }
1073
1074 // Assume all function calls are a source of divergence.
1075 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
1076 if (CI->isInlineAsm())
 // NOTE(review): the statement taken for inline asm calls is absent from this
 // listing; every other call falls through to the return below.
1078 return true;
1079 }
1080
1081 // Invokes are likewise assumed to be a source of divergence.
1082 if (isa<InvokeInst>(V))
1083 return true;
1084
1085 // If the target supports globally addressable scratch, the mapping from
1086 // scratch memory to the flat aperture changes therefore an address space cast
1087 // is no longer uniform.
1088 if (auto *CastI = dyn_cast<AddrSpaceCastInst>(V)) {
1089 return CastI->getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS &&
1090 CastI->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
1091 ST->hasGloballyAddressableScratch();
1092 }
1093
 // Nothing above classified V as divergent.
1094 return false;
1095}
1096
// Returns true when \p V is known to hold the same value in every workitem of
// a wavefront. Intrinsic calls are answered by
// AMDGPU::isIntrinsicAlwaysUniform; the remaining cases are pattern matches on
// V and on extractvalue of amdgcn_if / amdgcn_else / inline asm results.
1097bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
1098 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
1099 return AMDGPU::isIntrinsicAlwaysUniform(Intrinsic->getIntrinsicID());
1100
1101 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
1102 if (CI->isInlineAsm())
 // NOTE(review): the statement taken for inline asm calls is absent from this
 // listing; every other call is reported as not always uniform.
1104 return false;
1105 }
1106
1107 // In most cases TID / wavefrontsize is uniform.
1108 //
1109 // However, if a kernel has uneven dimensions we can have a value of
1110 // workitem-id-x divided by the wavefrontsize non-uniform. For example
1111 // dimensions (65, 2) will have workitems with address (64, 0) and (0, 1)
1112 // packed into the same wave which gives 1 and 0 after the division by 64
1113 // respectively.
1114 //
1115 // The X dimension doesn't reset within a wave if either both the Y
1116 // and Z dimensions are of length 1, or if the X dimension's required
1117 // size is a power of 2. Note, however, if the X dimension's maximum
1118 // size is a power of 2 < the wavefront size, division by the wavefront
1119 // size is guaranteed to yield 0, so this is also a no-reset case.
1120 bool XDimDoesntResetWithinWaves = false;
1121 if (auto *I = dyn_cast<Instruction>(V)) {
1122 const Function *F = I->getFunction();
1123 XDimDoesntResetWithinWaves = ST->hasWavefrontsEvenlySplittingXDim(*F);
1124 }
1125 using namespace llvm::PatternMatch;
1126 uint64_t C;
 // NOTE(review): the two lines that begin the match expressions below are
 // absent from this listing; only the constant-operand halves survive. C is
 // the matched constant and is compared against log2(wavefront size).
1128 m_ConstantInt(C))) ||
1130 m_ConstantInt(C)))) {
1131 return C >= ST->getWavefrontSizeLog2() && XDimDoesntResetWithinWaves;
1132 }
1133
1134 Value *Mask;
 // NOTE(review): the line that begins this match expression is absent from
 // this listing. The visible part requires Mask to have at least
 // log2(wavefront size) known-zero trailing bits.
1136 m_Value(Mask)))) {
1137 return computeKnownBits(Mask, DL).countMinTrailingZeros() >=
1138 ST->getWavefrontSizeLog2() &&
1139 XDimDoesntResetWithinWaves;
1140 }
1141
 // Everything below only applies to extractvalue of a call result.
1142 const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
1143 if (!ExtValue)
1144 return false;
1145
1146 const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0));
1147 if (!CI)
1148 return false;
1149
1150 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {
1151 switch (Intrinsic->getIntrinsicID()) {
1152 default:
1153 return false;
1154 case Intrinsic::amdgcn_if:
1155 case Intrinsic::amdgcn_else: {
 // Only the struct member at index 1 is reported as always uniform.
1156 ArrayRef<unsigned> Indices = ExtValue->getIndices();
1157 return Indices.size() == 1 && Indices[0] == 1;
1158 }
1159 }
1160 }
1161
1162 // If we have inline asm returning mixed SGPR and VGPR results, we inferred
1163 // divergent for the overall struct return. We need to override it in the
1164 // case we're extracting an SGPR component here.
1165 if (CI->isInlineAsm())
1166 return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());
1167
1168 return false;
1169}
1170
// For the intrinsic IDs listed in the switch, appends operand index 0 to
// OpIndexes and returns true; returns false for every other ID without
// touching OpIndexes.
// NOTE(review): the opening line of this signature (return type, function name
// and the OpIndexes parameter) is absent from this listing.
1172 Intrinsic::ID IID) const {
1173 switch (IID) {
1174 case Intrinsic::amdgcn_is_shared:
1175 case Intrinsic::amdgcn_is_private:
1176 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1177 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1178 case Intrinsic::amdgcn_load_to_lds:
1179 case Intrinsic::amdgcn_make_buffer_rsrc:
1180 OpIndexes.push_back(0);
1181 return true;
1182 default:
1183 return false;
1184 }
1185}
1186
// Rewrites the intrinsic call II for a pointer operand replaced by \p NewV.
// Depending on the intrinsic this returns a constant, returns II after
// mutating it in place (operand 0 and called function replaced), or returns
// nullptr when no rewrite is performed. \p OldV is not read in the visible
// lines.
// NOTE(review): the opening line of this signature (return type, function name
// and the II parameter) is absent from this listing.
1188 Value *OldV,
1189 Value *NewV) const {
1190 auto IntrID = II->getIntrinsicID();
1191 switch (IntrID) {
1192 case Intrinsic::amdgcn_is_shared:
1193 case Intrinsic::amdgcn_is_private: {
 // NOTE(review): the line holding the two address-space alternatives of this
 // conditional is absent from this listing.
1194 unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
1196 unsigned NewAS = NewV->getType()->getPointerAddressSpace();
1197 LLVMContext &Ctx = NewV->getType()->getContext();
 // NOTE(review): the line holding the two constant alternatives of this
 // conditional is absent from this listing.
1198 ConstantInt *NewVal = (TrueAS == NewAS) ?
1200 return NewVal;
1201 }
1202 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1203 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
1204 Type *DestTy = II->getType();
1205 Type *SrcTy = NewV->getType();
1206 unsigned NewAS = SrcTy->getPointerAddressSpace();
 // NOTE(review): the condition on NewAS that guards this early exit is absent
 // from this listing.
1208 return nullptr;
1209 Module *M = II->getModule();
 // NOTE(review): the line that declares NewDecl and begins the call whose
 // arguments follow is absent from this listing.
1211 M, II->getIntrinsicID(), {DestTy, SrcTy, DestTy});
1212 II->setArgOperand(0, NewV);
1213 II->setCalledFunction(NewDecl);
1214 return II;
1215 }
1216 case Intrinsic::amdgcn_load_to_lds: {
 // Re-declare the intrinsic for the new pointer type, then retarget the call.
1217 Type *SrcTy = NewV->getType();
1218 Module *M = II->getModule();
1219 Function *NewDecl =
1220 Intrinsic::getOrInsertDeclaration(M, II->getIntrinsicID(), {SrcTy});
1221 II->setArgOperand(0, NewV);
1222 II->setCalledFunction(NewDecl);
1223 return II;
1224 }
1225 case Intrinsic::amdgcn_make_buffer_rsrc: {
1226 Type *SrcTy = NewV->getType();
1227 Type *DstTy = II->getType();
1228 Module *M = II->getModule();
 // NOTE(review): the line that declares NewDecl and begins the call whose
 // arguments follow is absent from this listing.
1230 M, II->getIntrinsicID(), {DstTy, SrcTy});
1231 II->setArgOperand(0, NewV);
1232 II->setCalledFunction(NewDecl);
1233 return II;
1234 }
1235 default:
1236 return nullptr;
1237 }
1238}
1239
// Estimates the cost of a vector shuffle. Non-fixed source vectors and every
// case not handled below are forwarded to BaseT::getShuffleCost. The
// target-specific estimates apply only to 8- and 16-bit elements on
// VOLCANIC_ISLANDS or later, and count instructions per 32-bit register.
// NOTE(review): several signature lines (return type, function name and the
// Kind, CostKind and one further parameter) are absent from this listing.
1241 VectorType *DstTy, VectorType *SrcTy,
1242 ArrayRef<int> Mask,
1244 int Index, VectorType *SubTp,
1246 const Instruction *CxtI) const {
1247 if (!isa<FixedVectorType>(SrcTy))
1248 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
1249 SubTp);
1250
 // Kind, Index and SubTp are passed on so they can be refined from the mask.
1251 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
1252
1253 unsigned ScalarSize = DL.getTypeSizeInBits(SrcTy->getElementType());
1254 if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1255 (ScalarSize == 16 || ScalarSize == 8)) {
1256 // Larger vector widths may require additional instructions, but are
1257 // typically cheaper than scalarized versions.
1258 //
1259 // We assume that shuffling at a register granularity can be done for free.
1260 // This is not true for vectors fed into memory instructions, but it is
1261 // effectively true for all other shuffling. The emphasis of the logic here
1262 // is to assist generic transform in cleaning up / canonicalizing those
1263 // shuffles.
1264
1265 // With op_sel VOP3P instructions freely can access the low half or high
1266 // half of a register, so any swizzle of two elements is free.
1267 if (auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcTy)) {
1268 unsigned NumSrcElts = SrcVecTy->getNumElements();
1269 if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumSrcElts == 2 &&
1270 (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Reverse ||
1271 Kind == TTI::SK_PermuteSingleSrc))
1272 return 0;
1273 }
1274
 // 2 for 16-bit elements, 4 for 8-bit elements.
1275 unsigned EltsPerReg = 32 / ScalarSize;
1276 switch (Kind) {
1277 case TTI::SK_Broadcast:
1278 // A single v_perm_b32 can be re-used for all destination registers.
1279 return 1;
1280 case TTI::SK_Reverse:
1281 // One instruction per register.
1282 if (auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy))
1283 return divideCeil(DstVecTy->getNumElements(), EltsPerReg);
 // NOTE(review): two lines are absent from this listing here - the fallback
 // for a non-fixed destination type and the label of the next case.
1286 if (Index % EltsPerReg == 0)
1287 return 0; // Shuffling at register granularity
1288 if (auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy))
1289 return divideCeil(DstVecTy->getNumElements(), EltsPerReg);
 // NOTE(review): two lines are absent from this listing here - the fallback
 // for a non-fixed destination type and the label opening the next case.
1292 auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy);
1293 if (!DstVecTy)
 // NOTE(review): the statement taken when DstTy is not a fixed vector is
 // absent from this listing.
1295 unsigned NumDstElts = DstVecTy->getNumElements();
1296 unsigned NumInsertElts = cast<FixedVectorType>(SubTp)->getNumElements();
1297 unsigned EndIndex = Index + NumInsertElts;
1298 unsigned BeginSubIdx = Index % EltsPerReg;
1299 unsigned EndSubIdx = EndIndex % EltsPerReg;
1300 unsigned Cost = 0;
1301
1302 if (BeginSubIdx != 0) {
1303 // Need to shift the inserted vector into place. The cost is the number
1304 // of destination registers overlapped by the inserted vector.
1305 Cost = divideCeil(EndIndex, EltsPerReg) - (Index / EltsPerReg);
1306 }
1307
1308 // If the last register overlap is partial, there may be three source
1309 // registers feeding into it; that takes an extra instruction.
1310 if (EndIndex < NumDstElts && BeginSubIdx < EndSubIdx)
1311 Cost += 1;
1312
1313 return Cost;
1314 }
1315 case TTI::SK_Splice: {
1316 auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy);
1317 if (!DstVecTy)
 // NOTE(review): the statement taken when DstTy is not a fixed vector is
 // absent from this listing.
1319 unsigned NumElts = DstVecTy->getNumElements();
1320 assert(NumElts == cast<FixedVectorType>(SrcTy)->getNumElements());
1321 // Determine the sub-region of the result vector that requires
1322 // sub-register shuffles / mixing.
1323 unsigned EltsFromLHS = NumElts - Index;
1324 bool LHSIsAligned = (Index % EltsPerReg) == 0;
1325 bool RHSIsAligned = (EltsFromLHS % EltsPerReg) == 0;
1326 if (LHSIsAligned && RHSIsAligned)
1327 return 0;
1328 if (LHSIsAligned && !RHSIsAligned)
1329 return divideCeil(NumElts, EltsPerReg) - (EltsFromLHS / EltsPerReg);
1330 if (!LHSIsAligned && RHSIsAligned)
1331 return divideCeil(EltsFromLHS, EltsPerReg);
1332 return divideCeil(NumElts, EltsPerReg);
1333 }
1334 default:
1335 break;
1336 }
1337
1338 if (!Mask.empty()) {
1339 unsigned NumSrcElts = cast<FixedVectorType>(SrcTy)->getNumElements();
1340
1341 // Generically estimate the cost by assuming that each destination
1342 // register is derived from sources via v_perm_b32 instructions if it
1343 // can't be copied as-is.
1344 //
1345 // For each destination register, derive the cost of obtaining it based
1346 // on the number of source registers that feed into it.
1347 unsigned Cost = 0;
1348 for (unsigned DstIdx = 0; DstIdx < Mask.size(); DstIdx += EltsPerReg) {
 // NOTE(review): the declaration of Regs (the per-destination-register list
 // of distinct source registers) is absent from this listing.
1350 bool Aligned = true;
1351 for (unsigned I = 0; I < EltsPerReg && DstIdx + I < Mask.size(); ++I) {
1352 int SrcIdx = Mask[DstIdx + I];
 // -1 mask entries are skipped and contribute no source register.
1353 if (SrcIdx == -1)
1354 continue;
1355 int Reg;
 // Indices below NumSrcElts select the first source vector; the rest select
 // the second one and are given register ids offset by NumSrcElts.
1356 if (SrcIdx < (int)NumSrcElts) {
1357 Reg = SrcIdx / EltsPerReg;
1358 if (SrcIdx % EltsPerReg != I)
1359 Aligned = false;
1360 } else {
1361 Reg = NumSrcElts + (SrcIdx - NumSrcElts) / EltsPerReg;
1362 if ((SrcIdx - NumSrcElts) % EltsPerReg != I)
1363 Aligned = false;
1364 }
1365 if (!llvm::is_contained(Regs, Reg))
1366 Regs.push_back(Reg);
1367 }
 // N distinct source registers cost N-1; a single misaligned source costs 1;
 // an aligned copy from one register is free.
1368 if (Regs.size() >= 2)
1369 Cost += Regs.size() - 1;
1370 else if (!Aligned)
1371 Cost += 1;
1372 }
1373 return Cost;
1374 }
1375 }
1376
1377 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
1378 SubTp);
1379}
1380
1381/// Whether it is profitable to sink the operands of an
1382/// Instruction I to the basic block of I.
1383/// This helps using several modifiers (like abs and neg) more often.
///
/// Operand uses judged worth sinking are appended to \p Ops.
/// \returns true if \p Ops is non-empty on exit.
// NOTE(review): the opening line of this signature (return type, function name
// and the I parameter) is absent from this listing.
1385 SmallVectorImpl<Use *> &Ops) const {
1386 using namespace PatternMatch;
1387
1388 for (auto &Op : I->operands()) {
1389 // Ensure we are not already sinking this operand.
1390 if (any_of(Ops, [&](Use *U) { return U->get() == Op.get(); }))
1391 continue;
1392
 // fabs and fneg operands are always collected.
1393 if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value()))) {
1394 Ops.push_back(&Op);
1395 continue;
1396 }
1397
1398 // Check for zero-cost multiple use InsertElement/ExtractElement
1399 // instructions
1400 if (Instruction *OpInst = dyn_cast<Instruction>(Op.get())) {
1401 if (OpInst->getType()->isVectorTy() && OpInst->getNumOperands() > 1) {
1402 Instruction *VecOpInst = dyn_cast<Instruction>(OpInst->getOperand(0));
1403 if (VecOpInst && VecOpInst->hasOneUse())
1404 continue;
1405
 // NOTE(review): one argument line of this call is absent from this listing.
1406 if (getVectorInstrCost(OpInst->getOpcode(), OpInst->getType(),
1408 OpInst->getOperand(0),
1409 OpInst->getOperand(1)) == 0) {
1410 Ops.push_back(&Op);
1411 continue;
1412 }
1413 }
1414 }
1415
1416 if (auto *Shuffle = dyn_cast<ShuffleVectorInst>(Op.get())) {
1417
1418 unsigned EltSize = DL.getTypeSizeInBits(
1419 cast<VectorType>(Shuffle->getType())->getElementType());
1420
1421 // For i32 (or greater) shufflevectors, these will be lowered into a
1422 // series of insert / extract elements, which will be coalesced away.
1423 if (EltSize < 16 || !ST->has16BitInsts())
1424 continue;
1425
1426 int NumSubElts, SubIndex;
1427 if (Shuffle->changesLength()) {
1428 if (Shuffle->increasesLength() && Shuffle->isIdentityWithPadding()) {
1429 Ops.push_back(&Op);
1430 continue;
1431 }
1432
 // Sub-vector extracts/inserts are collected only at an even SubIndex.
1433 if ((Shuffle->isExtractSubvectorMask(SubIndex) ||
1434 Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex)) &&
1435 !(SubIndex & 0x1)) {
1436 Ops.push_back(&Op);
1437 continue;
1438 }
1439 }
1440
1441 if (Shuffle->isReverse() || Shuffle->isZeroEltSplat() ||
1442 Shuffle->isSingleSource()) {
1443 Ops.push_back(&Op);
1444 continue;
1445 }
1446 }
1447 }
1448
1449 return !Ops.empty();
1450}
1451
// Decides whether Callee may be inlined into Caller. Inlining is refused when
// the callee needs subtarget features the caller lacks (ignoring
// InlineFeatureIgnoreList) or when the two mode-register defaults are not
// inline compatible. Otherwise it is allowed, subject to the basic-block
// budget InlineMaxBB unless the callee is marked alwaysinline or inlinehint.
// NOTE(review): the opening line of this signature (return type, function name
// and the Caller parameter) is absent from this listing.
1453 const Function *Callee) const {
1454 const TargetMachine &TM = getTLI()->getTargetMachine();
1455 const GCNSubtarget *CallerST
1456 = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
1457 const GCNSubtarget *CalleeST
1458 = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));
1459
1460 const FeatureBitset &CallerBits = CallerST->getFeatureBits();
1461 const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();
1462
 // The callee's remaining features must be a subset of the caller's.
1463 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
1464 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
1465 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
1466 return false;
1467
1468 // FIXME: dx10_clamp can just take the caller setting, but there seems to be
1469 // no way to support merge for backend defined attributes.
1470 SIModeRegisterDefaults CallerMode(*Caller, *CallerST);
1471 SIModeRegisterDefaults CalleeMode(*Callee, *CalleeST);
1472 if (!CallerMode.isInlineCompatible(CalleeMode))
1473 return false;
1474
1475 if (Callee->hasFnAttribute(Attribute::AlwaysInline) ||
1476 Callee->hasFnAttribute(Attribute::InlineHint))
1477 return true;
1478
1479 // Hack to make compile times reasonable.
 // A value of 0 for InlineMaxBB disables the limit.
1480 if (InlineMaxBB) {
1481 // Single BB does not increase total BB amount.
1482 if (Callee->size() == 1)
1483 return true;
1484 size_t BBSize = Caller->size() + Callee->size() - 1;
1485 return BBSize <= InlineMaxBB;
1486 }
1487
1488 return true;
1489}
1490
// Computes an inlining-threshold increase for the call CB from the number of
// argument registers that exceed fixed SGPR/VGPR budgets. Each excess register
// is charged ArgStackCost (one instruction plus an i32 store and an i32 load
// as costed by TTIImpl), scaled by InlineConstants::getInstrCost().
// NOTE(review): the opening line of this signature (storage class, return type,
// function name and the CB parameter) is absent from this listing.
1492 const SITargetLowering *TLI,
1493 const GCNTTIImpl *TTIImpl) {
 // Register budgets above which arguments are assumed to be passed on the
 // stack.
1494 const int NrOfSGPRUntilSpill = 26;
1495 const int NrOfVGPRUntilSpill = 32;
1496
1497 const DataLayout &DL = TTIImpl->getDataLayout();
1498
1499 unsigned adjustThreshold = 0;
1500 int SGPRsInUse = 0;
1501 int VGPRsInUse = 0;
1502 for (const Use &A : CB->args()) {
1503 SmallVector<EVT, 4> ValueVTs;
1504 ComputeValueVTs(*TLI, DL, A.get()->getType(), ValueVTs);
1505 for (auto ArgVT : ValueVTs) {
1506 unsigned CCRegNum = TLI->getNumRegistersForCallingConv(
1507 CB->getContext(), CB->getCallingConv(), ArgVT);
 // NOTE(review): the condition choosing between the SGPR and VGPR counters is
 // absent from this listing.
1509 SGPRsInUse += CCRegNum;
1510 else
1511 VGPRsInUse += CCRegNum;
1512 }
1513 }
1514
1515 // The cost of passing function arguments through the stack:
1516 // 1 instruction to put a function argument on the stack in the caller.
1517 // 1 instruction to take a function argument from the stack in callee.
1518 // 1 instruction to explicitly take care of data dependencies in callee
1519 // function.
1520 InstructionCost ArgStackCost(1);
 // NOTE(review): the final argument line of each getMemoryOpCost call below is
 // absent from this listing.
1521 ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
1522 Instruction::Store, Type::getInt32Ty(CB->getContext()), Align(4),
1524 ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
1525 Instruction::Load, Type::getInt32Ty(CB->getContext()), Align(4),
1527
1528 // The penalty cost is computed relative to the cost of instructions and does
1529 // not model any storage costs.
 // std::max(0, ...) keeps the adjustment at zero while under budget.
1530 adjustThreshold += std::max(0, SGPRsInUse - NrOfSGPRUntilSpill) *
1531 ArgStackCost.getValue() * InlineConstants::getInstrCost();
1532 adjustThreshold += std::max(0, VGPRsInUse - NrOfVGPRUntilSpill) *
1533 ArgStackCost.getValue() * InlineConstants::getInstrCost();
1534 return adjustThreshold;
1535}
1536
// Sums the allocation sizes of the distinct static allocas reached through the
// flat- or private-address-space pointer arguments of the call CB. Arguments
// that are not pointers, or point into other address spaces, are ignored, as
// are allocas whose size getAllocationSize cannot provide.
1537static unsigned getCallArgsTotalAllocaSize(const CallBase *CB,
1538 const DataLayout &DL) {
1539 // If we have a pointer to a private array passed into a function
1540 // it will not be optimized out, leaving scratch usage.
1541 // This function calculates the total size in bytes of the memory that would
1542 // end in scratch if the call was not inlined.
1543 unsigned AllocaSize = 0;
 // NOTE(review): the declaration of AIVisited (the set used below to count
 // each alloca once) is absent from this listing.
1545 for (Value *PtrArg : CB->args()) {
1546 PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
1547 if (!Ty)
1548 continue;
1549
1550 unsigned AddrSpace = Ty->getAddressSpace();
1551 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS &&
1552 AddrSpace != AMDGPUAS::PRIVATE_ADDRESS)
1553 continue;
1554
 // NOTE(review): the line that derives AI from PtrArg is absent from this
 // listing.
 // insert(...).second is false for an alloca that was already counted.
1556 if (!AI || !AI->isStaticAlloca() || !AIVisited.insert(AI).second)
1557 continue;
1558
1559 if (auto Size = AI->getAllocationSize(DL))
1560 AllocaSize += Size->getFixedValue();
1561 }
1562 return AllocaSize;
1563}
1564
1569
 // Returns the threshold adjustment for the call CB: the register-pressure
 // adjustment from adjustInliningThresholdUsingCallee, plus ArgAllocaCost when
 // the call passes any counted alloca.
 // NOTE(review): the signature line of this function is absent from this
 // listing.
1571 unsigned Threshold = adjustInliningThresholdUsingCallee(CB, TLI, this);
1572
1573 // Private object passed as arguments may end up in scratch usage if the call
1574 // is not inlined. Increase the inline threshold to promote inlining.
1575 unsigned AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
1576 if (AllocaSize > 0)
1577 Threshold += ArgAllocaCost;
1578 return Threshold;
1579}
1580
// Returns the share of the alloca-argument threshold bonus attributed to the
// alloca \p AI for the call CB. The result is 0 when the total alloca size of
// the call is at or below ArgAllocaCutoff, or when the size of \p AI is not
// available; otherwise it is proportional to AI's size.
// NOTE(review): the opening line of this signature (return type, function name
// and the CB parameter) is absent from this listing.
1582 const AllocaInst *AI) const {
1583
1584 // Below the cutoff, assume that the private memory objects would be
1585 // optimized
1586 auto AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
1587 if (AllocaSize <= ArgAllocaCutoff)
1588 return 0;
1589
1590 // Above the cutoff, we give a cost to each private memory object
1591 // depending on its size. If the array can be optimized by SROA this cost is
1592 // not added to the total-cost in the inliner cost analysis.
1593 //
1594 // We choose the total cost of the alloca such that their sum cancels the
1595 // bonus given in the threshold (ArgAllocaCost).
1596 //
1597 // Cost_Alloca_0 + ... + Cost_Alloca_N == ArgAllocaCost
1598 //
1599 // Awkwardly, the ArgAllocaCost bonus is multiplied by threshold-multiplier,
1600 // the single-bb bonus and the vector-bonus.
1601 //
1602 // We compensate the first two multipliers, by repeating logic from the
1603 // inliner-cost in here. The vector-bonus is 0 on AMDGPU.
1604 static_assert(InlinerVectorBonusPercent == 0, "vector bonus assumed to be 0");
1605 unsigned Threshold = ArgAllocaCost * getInliningThresholdMultiplier();
1606
 // SingleBB is true when no block of the callee ends in a terminator with
 // more than one successor.
 // NOTE(review): getCalledFunction() is dereferenced without a null check -
 // confirm callers only reach this with direct calls.
1607 bool SingleBB = none_of(*CB->getCalledFunction(), [](const BasicBlock &BB) {
1608 return BB.getTerminator()->getNumSuccessors() > 1;
1609 });
1610 if (SingleBB) {
 // Apply the same +50% as the inliner's single-bb bonus described above.
1611 Threshold += Threshold / 2;
1612 }
1613
1614 auto ArgAllocaSize = AI->getAllocationSize(DL);
1615 if (!ArgAllocaSize)
1616 return 0;
1617
1618 // Attribute the bonus proportionally to the alloca size
 // AllocaSize is non-zero here: it exceeded ArgAllocaCutoff above.
1619 unsigned AllocaThresholdBonus =
1620 (Threshold * ArgAllocaSize->getFixedValue()) / AllocaSize;
1621
1622 return AllocaThresholdBonus;
1623}
1624
// Forwards the unrolling-preference query unchanged to CommonTTI.
// NOTE(review): the opening signature lines (return type, function name and the
// L, SE and UP parameters) are absent from this listing.
1627 OptimizationRemarkEmitter *ORE) const {
1628 CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);
1629}
1630
// Forwards the peeling-preference query unchanged to CommonTTI.
// NOTE(review): the opening signature line (return type, function name and the
// L and SE parameters) is absent from this listing.
1632 TTI::PeelingPreferences &PP) const {
1633 CommonTTI.getPeelingPreferences(L, SE, PP);
1634}
1635
1636int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
1637 return ST->hasFullRate64Ops()
1638 ? getFullRateInstrCost()
1639 : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
1640 : getQuarterRateInstrCost(CostKind);
1641}
1642
1643std::pair<InstructionCost, MVT>
1644GCNTTIImpl::getTypeLegalizationCost(Type *Ty) const {
1645 std::pair<InstructionCost, MVT> Cost = BaseT::getTypeLegalizationCost(Ty);
1646 auto Size = DL.getTypeSizeInBits(Ty);
1647 // Maximum load or store can handle 8 dwords for scalar and 4 for
1648 // vector ALU. Let's assume anything above 8 dwords is expensive
1649 // even if legal.
1650 if (Size <= 256)
1651 return Cost;
1652
1653 Cost.first += (Size + 255) / 256;
1654 return Cost;
1655}
1656
 // Returns 128 when the subtarget reports prefetch support and 0 otherwise.
 // NOTE(review): the signature line of this function is absent from this
 // listing, so the unit of the returned value cannot be confirmed here.
1658 return ST->hasPrefetch() ? 128 : 0;
1659}
1660
1663}
1664
// Appends seven (name, value) pairs describing the launch bounds of \p F to
// \p LB: the three max-num-workgroups entries, the flat work-group size
// pair, and the waves-per-EU pair, all as reported by the subtarget.
// NOTE(review): the opening line of this signature (return type and function
// name) is absent from this listing.
1666 const Function &F,
1667 SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const {
 // Indices 0..2 are read unconditionally, so the returned vector is assumed to
 // hold at least three entries - TODO confirm against getMaxNumWorkGroups.
1668 SmallVector<unsigned> MaxNumWorkgroups = ST->getMaxNumWorkGroups(F);
1669 LB.push_back({"amdgpu-max-num-workgroups[0]", MaxNumWorkgroups[0]});
1670 LB.push_back({"amdgpu-max-num-workgroups[1]", MaxNumWorkgroups[1]});
1671 LB.push_back({"amdgpu-max-num-workgroups[2]", MaxNumWorkgroups[2]});
1672 std::pair<unsigned, unsigned> FlatWorkGroupSize =
1673 ST->getFlatWorkGroupSizes(F);
1674 LB.push_back({"amdgpu-flat-work-group-size[0]", FlatWorkGroupSize.first});
1675 LB.push_back({"amdgpu-flat-work-group-size[1]", FlatWorkGroupSize.second});
1676 std::pair<unsigned, unsigned> WavesPerEU = ST->getWavesPerEU(F);
1677 LB.push_back({"amdgpu-waves-per-eu[0]", WavesPerEU.first});
1678 LB.push_back({"amdgpu-waves-per-eu[1]", WavesPerEU.second});
1679}
1680
 // Reports what is known about the IEEE mode for the instruction I, as a
 // KnownIEEEMode value. Subtargets without FeatureDX10ClampAndIEEEMode always
 // answer On; otherwise the enclosing function's "amdgpu-ieee" attribute and
 // calling convention are consulted.
 // NOTE(review): the signature lines of this function are absent from this
 // listing.
1683 if (!ST->hasFeature(AMDGPU::FeatureDX10ClampAndIEEEMode))
1684 return KnownIEEEMode::On; // Only mode on gfx1170+
1685
1686 const Function *F = I.getFunction();
1687 if (!F)
 // NOTE(review): the statement taken when I has no parent function is absent
 // from this listing.
1689
1690 Attribute IEEEAttr = F->getFnAttribute("amdgpu-ieee");
1691 if (IEEEAttr.isValid())
 // NOTE(review): the statement taken when the attribute is present is absent
 // from this listing.
1693
 // Without the attribute, shader calling conventions answer Off.
 // NOTE(review): the alternative for non-shader calling conventions is on a
 // line absent from this listing.
1694 return AMDGPU::isShader(F->getCallingConv()) ? KnownIEEEMode::Off
1696}
1697
// Returns the cost of a memory operation. Loads and stores of vectors with
// i8 elements get a target-specific estimate derived from the vector's size
// in bits; everything else is forwarded to BaseT::getMemoryOpCost.
// NOTE(review): the opening signature line (return type, function name and the
// Opcode and Src parameters) and the CostKind parameter line are absent from
// this listing.
1699 Align Alignment,
1700 unsigned AddressSpace,
1702 TTI::OperandValueInfo OpInfo,
1703 const Instruction *I) const {
1704 if (VectorType *VecTy = dyn_cast<VectorType>(Src)) {
1705 if ((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
1706 VecTy->getElementType()->isIntegerTy(8)) {
 // NOTE(review): the divisor argument of divideCeil is on a line absent from
 // this listing. The numerator is the bit size minus one - confirm that this
 // rounding is intended.
1707 return divideCeil(DL.getTypeSizeInBits(VecTy) - 1,
1709 }
1710 }
1711 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind,
1712 OpInfo, I);
1713}
1714
 // Returns the number of parts for the type Tp. Vectors with i8 elements are
 // answered as divideCeil(ElementCount - 1, 4); every other type is forwarded
 // to BaseT::getNumberOfParts.
 // NOTE(review): the signature line of this function is absent from this
 // listing.
1716 if (VectorType *VecTy = dyn_cast<VectorType>(Tp)) {
1717 if (VecTy->getElementType()->isIntegerTy(8)) {
1718 unsigned ElementCount = VecTy->getElementCount().getFixedValue();
 // NOTE(review): with the "- 1" a single-element vector yields 0 parts and a
 // five-element vector yields 1 - confirm this rounding is intended.
1719 return divideCeil(ElementCount - 1, 4);
1720 }
1721 }
1722 return BaseT::getNumberOfParts(Tp);
1723}
1724
1727 if (isAlwaysUniform(V))
1729
1730 if (isSourceOfDivergence(V))
1732
1734}
return SDValue()
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
aarch64 promote const
Provides AMDGPU specific target descriptions.
Rewrite undef for PHI
The AMDGPU TargetMachine interface definition for hw codegen targets.
static cl::opt< unsigned > MemcpyLoopUnroll("amdgpu-memcpy-loop-unroll", cl::desc("Unroll factor (affecting 4x32-bit operations) to use for memory " "operations when lowering statically-sized memcpy, memmove, or" "memset as a loop"), cl::init(16), cl::Hidden)
static cl::opt< unsigned > UnrollThresholdIf("amdgpu-unroll-threshold-if", cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"), cl::init(200), cl::Hidden)
static cl::opt< unsigned > ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(4000), cl::desc("Cost of alloca argument"))
static bool dependsOnLocalPhi(const Loop *L, const Value *Cond, unsigned Depth=0)
static cl::opt< bool > UnrollRuntimeLocal("amdgpu-unroll-runtime-local", cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"), cl::init(true), cl::Hidden)
static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB, const SITargetLowering *TLI, const GCNTTIImpl *TTIImpl)
static cl::opt< unsigned > ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256), cl::desc("Maximum alloca size to use for inline cost"))
static cl::opt< size_t > InlineMaxBB("amdgpu-inline-max-bb", cl::Hidden, cl::init(1100), cl::desc("Maximum number of BBs allowed in a function after inlining" " (compile time constraint)"))
static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID)
static cl::opt< unsigned > UnrollMaxBlockToAnalyze("amdgpu-unroll-max-block-to-analyze", cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"), cl::init(32), cl::Hidden)
static unsigned getCallArgsTotalAllocaSize(const CallBase *CB, const DataLayout &DL)
static cl::opt< unsigned > UnrollThresholdPrivate("amdgpu-unroll-threshold-private", cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"), cl::init(2700), cl::Hidden)
static cl::opt< unsigned > UnrollThresholdLocal("amdgpu-unroll-threshold-local", cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"), cl::init(1000), cl::Hidden)
This file a TargetTransformInfoImplBase conforming object specific to the AMDGPU target machine.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
Hexagon Common GEP
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define RegName(no)
static LVOptions Options
Definition LVOptions.cpp:25
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Register const TargetRegisterInfo * TRI
uint64_t IntrinsicInst * II
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
static unsigned getNumElements(Type *Ty)
#define LLVM_DEBUG(...)
Definition Debug.h:114
std::optional< unsigned > getReqdWorkGroupSize(const Function &F, unsigned Dim) const
bool hasWavefrontsEvenlySplittingXDim(const Function &F, bool REquiresUniformYZ=false) const
uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override
AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
an instruction to allocate memory on the stack
LLVM_ABI bool isStaticAlloca() const
Return true if this alloca is in the entry block of the function and is a constant size.
LLVM_ABI std::optional< TypeSize > getAllocationSize(const DataLayout &DL) const
Get allocation size in bytes.
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
Functions, function parameters, and return types can have attributes to indicate how they should be t...
Definition Attributes.h:105
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
bool isValid() const
Return true if the attribute is any kind of attribute.
Definition Attributes.h:261
LLVM Basic Block Representation.
Definition BasicBlock.h:62
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
unsigned getNumberOfParts(Type *Tp) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
Conditional or Unconditional Branch instruction.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
bool isInlineAsm() const
Check if this call is an inline asm statement.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
CallingConv::ID getCallingConv() const
Value * getArgOperand(unsigned i) const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned getArgOperandNo(const Use *U) const
Given a use for an arg operand, get the arg operand number that corresponds to it.
This class represents a function call, abstracting a target machine's calling convention.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition Constants.h:174
A parsed version of the target data layout string, and methods for querying it.
Definition DataLayout.h:64
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
ArrayRef< unsigned > getIndices() const
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
bool approxFunc() const
Definition FMF.h:73
Container class for subtarget features.
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:802
GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
Account for loads of i8 vector types to have reduced cost.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
void collectKernelLaunchBounds(const Function &F, SmallVectorImpl< std::pair< StringRef, int64_t > > &LB) const override
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const override
bool isInlineAsmSourceOfDivergence(const CallInst *CI, ArrayRef< unsigned > Indices={}) const
Analyze if the results of inline asm are divergent.
bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override
unsigned getNumberOfRegisters(unsigned RCID) const override
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const override
unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned ChainSizeInBytes, VectorType *VecTy) const override
bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
bool shouldPrefetchAddressSpace(unsigned AS) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
bool hasBranchDivergence(const Function *F=nullptr) const override
Value * rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const override
unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const override
unsigned getMaxInterleaveFactor(ElementCount VF) const override
void getMemcpyLoopResidualLoweringType(SmallVectorImpl< Type * > &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicCpySize) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
Get intrinsic cost based on arguments.
unsigned getInliningThresholdMultiplier() const override
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, unsigned ChainSizeInBytes, VectorType *VecTy) const override
unsigned getPrefetchDistance() const override
How much before a load we should place the prefetch instruction.
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
KnownIEEEMode fpenvIEEEMode(const Instruction &I) const
Return KnownIEEEMode::On if we know if the use context can assume "amdgpu-ieee"="true" and KnownIEEEM...
unsigned adjustInliningThreshold(const CallBase *CB) const override
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Whether it is profitable to sink the operands of an Instruction I to the basic block of I.
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
Try to calculate op costs for min/max reduction operations.
int getInliningLastCallToStaticBonus() const override
bool collectFlatAddressOperands(SmallVectorImpl< int > &OpIndexes, Intrinsic::ID IID) const override
unsigned getNumberOfParts(Type *Tp) const override
When counting parts on AMD GPUs, account for i8s being grouped together under a single i32 value.
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
unsigned getMinVectorRegisterBitWidth() const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
Type * getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicElementSize) const override
uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override
InstructionUniformity getInstructionUniformity(const Value *V) const override
An instruction for type-safe pointer arithmetic to access elements of arrays and structs.
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI bool hasApproxFunc() const LLVM_READONLY
Determine whether the approximate-math-functions flag is set.
LLVM_ABI bool hasAllowContract() const LLVM_READONLY
Determine whether the allow-contract flag is set.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
const IntrinsicInst * getInst() const
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
Metadata node.
Definition Metadata.h:1080
Machine Value Type.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Root of the metadata hierarchy.
Definition Metadata.h:64
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
The optimization diagnostic interface.
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
The main scalar evolution driver.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
std::vector< AsmOperandInfo > AsmOperandInfoVector
Primary interface to the complete machine description for the target machine.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
virtual const DataLayout & getDataLayout() const
virtual void getMemcpyLoopResidualLoweringType(SmallVectorImpl< Type * > &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicCpySize) const
VectorInstrContext
Represents a hint about the context in which an insert/extract is used.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
@ TCC_Free
Expected to fold away in lowering.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector. Index indicates start offset.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:297
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:294
static LLVM_ABI IntegerType * getInt16Ty(LLVMContext &C)
Definition Type.cpp:295
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:300
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:403
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:440
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:259
Base class of all SIMD vector types.
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool isFlatGlobalAddrSpace(unsigned AS)
bool isArgPassedInSGPR(const Argument *A)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
bool isExtendedGlobalAddrSpace(unsigned AS)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:765
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
LLVM_ABI int getInstrCost()
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
BinaryOp_match< LHS, RHS, Instruction::AShr > m_AShr(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
bool match(Val *V, const Pattern &P)
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
specific_fpval m_FPOne()
Match a float 1.0 or vector with all elements equal to 1.0.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::LShr > m_LShr(const LHS &L, const RHS &R)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
m_Intrinsic_Ty< Opnd0 >::Ty m_FAbs(const Opnd0 &Op0)
initializer< Ty > init(const Ty &Val)
std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > extract_or_null(Y &&MD)
Extract a Value from Metadata, allowing null.
Definition Metadata.h:683
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
@ Length
Definition DWP.cpp:532
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
InstructionCost Cost
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs=nullptr, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition Analysis.cpp:119
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI MDNode * findOptionMDForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for a loop.
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2173
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
AtomicOrdering
Atomic ordering for LLVM's memory model.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
@ FAdd
Sum of floats.
DWARFExpression::Operation Op
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
InstructionUniformity
Enum describing how instructions behave with respect to uniformity and divergence,...
Definition Uniformity.h:18
@ AlwaysUniform
The result values are always uniform.
Definition Uniformity.h:23
@ NeverUniform
The result values can never be assumed to be uniform.
Definition Uniformity.h:26
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
static constexpr DenormalMode getPreserveSign()
Extended Value Type.
Definition ValueTypes.h:35
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:393
Information about a load/store intrinsic defined by the target.
bool isInlineCompatible(SIModeRegisterDefaults CalleeMode) const
Parameters that control the generic loop unrolling transformation.
unsigned Threshold
The cost threshold for the unrolled loop.
bool UnrollVectorizedLoop
Disable runtime unrolling by default for vectorized loops.
unsigned MaxIterationsCountToAnalyze
Don't allow loop unrolling to simulate more than this number of iterations when checking full unroll ...
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...