27#include "llvm/IR/IntrinsicsAMDGPU.h"
34#define DEBUG_TYPE "AMDGPUtti"
37 "amdgpu-unroll-threshold-private",
38 cl::desc(
"Unroll threshold for AMDGPU if private memory used in a loop"),
42 "amdgpu-unroll-threshold-local",
43 cl::desc(
"Unroll threshold for AMDGPU if local memory used in a loop"),
47 "amdgpu-unroll-threshold-if",
48 cl::desc(
"Unroll threshold increment for AMDGPU for each if statement inside loop"),
52 "amdgpu-unroll-runtime-local",
53 cl::desc(
"Allow runtime unroll for AMDGPU if local memory used in a loop"),
57 "amdgpu-unroll-max-block-to-analyze",
58 cl::desc(
"Inner loop block size threshold to analyze in unroll for AMDGPU"),
63 cl::desc(
"Cost of alloca argument"));
71 cl::desc(
"Maximum alloca size to use for inline cost"));
76 cl::desc(
"Maximum number of BBs allowed in a function after inlining"
77 " (compile time constraint)"));
81 "amdgpu-memcpy-loop-unroll",
82 cl::desc(
"Unroll factor (affecting 4x32-bit operations) to use for memory "
83 "operations when lowering statically-sized memcpy, memmove, or"
93 for (
const Value *V :
I->operand_values()) {
98 return SubLoop->contains(PHI); }))
108 TargetTriple(TM->getTargetTriple()),
110 TLI(ST->getTargetLowering()) {}
115 const Function &
F = *L->getHeader()->getParent();
117 F.getFnAttributeAsParsedInteger(
"amdgpu-unroll-threshold", 300);
118 UP.
MaxCount = std::numeric_limits<unsigned>::max();
131 const unsigned MaxAlloca = (256 - 16) * 4;
137 if (
MDNode *LoopUnrollThreshold =
139 if (LoopUnrollThreshold->getNumOperands() == 2) {
141 LoopUnrollThreshold->getOperand(1));
142 if (MetaThresholdValue) {
148 ThresholdPrivate = std::min(ThresholdPrivate, UP.
Threshold);
149 ThresholdLocal = std::min(ThresholdLocal, UP.
Threshold);
154 unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
157 unsigned LocalGEPsSeen = 0;
160 return SubLoop->contains(BB); }))
170 if (UP.
Threshold < MaxBoost && Br->isConditional()) {
173 if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
174 (L->contains(Succ1) && L->isLoopExiting(Succ1)))
180 << *L <<
" due to " << *Br <<
'\n');
192 unsigned AS =
GEP->getAddressSpace();
193 unsigned Threshold = 0;
195 Threshold = ThresholdPrivate;
197 Threshold = ThresholdLocal;
205 const Value *Ptr =
GEP->getPointerOperand();
211 if (!AllocaSize || AllocaSize->getFixedValue() > MaxAlloca)
220 if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
225 << *L <<
" due to LDS use.\n");
230 bool HasLoopDef =
false;
233 if (!Inst || L->isLoopInvariant(
Op))
237 return SubLoop->contains(Inst); }))
261 << *L <<
" due to " << *
GEP <<
'\n');
284 AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
285 AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureUseFlatForGlobal,
286 AMDGPU::FeatureUnalignedScratchAccess, AMDGPU::FeatureUnalignedAccessMode,
288 AMDGPU::FeatureAutoWaitcntBeforeBarrier,
291 AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
292 AMDGPU::FeatureTrapHandler,
296 AMDGPU::FeatureSRAMECC,
299 AMDGPU::FeatureFastFMAF32, AMDGPU::FeatureHalfRate64Ops};
304 TLI(ST->getTargetLowering()), CommonTTI(TM,
F),
305 IsGraphics(
AMDGPU::isGraphics(
F.getCallingConv())) {
308 HasFP64FP16Denormals =
313 return !
F || !ST->isSingleLaneExecution(*
F);
345 if (Opcode == Instruction::Load || Opcode == Instruction::Store)
346 return 32 * 4 / ElemWidth;
349 return (ElemWidth == 8 && ST->has16BitInsts()) ? 4
350 : (ElemWidth == 16 && ST->has16BitInsts()) ? 2
351 : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
356 unsigned ChainSizeInBytes,
358 unsigned VecRegBitWidth = VF * LoadSize;
361 return 128 / LoadSize;
367 unsigned ChainSizeInBytes,
369 unsigned VecRegBitWidth = VF * StoreSize;
370 if (VecRegBitWidth > 128)
371 return 128 / StoreSize;
387 return 8 * ST->getMaxPrivateElementSize();
395 unsigned AddrSpace)
const {
400 return (Alignment >= 4 || ST->hasUnalignedScratchAccessEnabled()) &&
401 ChainSizeInBytes <= ST->getMaxPrivateElementSize();
408 unsigned AddrSpace)
const {
414 unsigned AddrSpace)
const {
424 unsigned DestAddrSpace,
Align SrcAlign,
Align DestAlign,
425 std::optional<uint32_t> AtomicElementSize)
const {
427 if (AtomicElementSize)
441 unsigned I32EltsInVector = 4;
451 unsigned RemainingBytes,
unsigned SrcAddrSpace,
unsigned DestAddrSpace,
453 std::optional<uint32_t> AtomicCpySize)
const {
457 OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
458 DestAlign, AtomicCpySize);
461 while (RemainingBytes >= 16) {
463 RemainingBytes -= 16;
467 while (RemainingBytes >= 8) {
473 while (RemainingBytes >= 4) {
479 while (RemainingBytes >= 2) {
485 while (RemainingBytes) {
503 case Intrinsic::amdgcn_ds_ordered_add:
504 case Intrinsic::amdgcn_ds_ordered_swap: {
507 if (!Ordering || !Volatile)
510 unsigned OrderingVal = Ordering->getZExtValue();
517 Info.WriteMem =
true;
518 Info.IsVolatile = !Volatile->isZero();
532 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
533 int ISD = TLI->InstructionOpcodeToISD(Opcode);
537 unsigned NElts = LT.second.isVector() ?
538 LT.second.getVectorNumElements() : 1;
547 return get64BitInstrCost(
CostKind) * LT.first * NElts;
549 if (ST->has16BitInsts() && SLT == MVT::i16)
550 NElts = (NElts + 1) / 2;
553 return getFullRateInstrCost() * LT.first * NElts;
559 if (SLT == MVT::i64) {
561 return 2 * getFullRateInstrCost() * LT.first * NElts;
564 if (ST->has16BitInsts() && SLT == MVT::i16)
565 NElts = (NElts + 1) / 2;
567 return LT.first * NElts * getFullRateInstrCost();
569 const int QuarterRateCost = getQuarterRateInstrCost(
CostKind);
570 if (SLT == MVT::i64) {
571 const int FullRateCost = getFullRateInstrCost();
572 return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
575 if (ST->has16BitInsts() && SLT == MVT::i16)
576 NElts = (NElts + 1) / 2;
579 return QuarterRateCost * NElts * LT.first;
587 const int OPC = TLI->InstructionOpcodeToISD(
FAdd->getOpcode());
589 if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
591 if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
604 if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
605 NElts = (NElts + 1) / 2;
606 if (ST->hasBF16PackedInsts() && SLT == MVT::bf16)
607 NElts = (NElts + 1) / 2;
609 return LT.first * NElts * get64BitInstrCost(
CostKind);
611 if (ST->has16BitInsts() && SLT == MVT::f16)
612 NElts = (NElts + 1) / 2;
614 if (SLT == MVT::f32 || SLT == MVT::f16 || SLT == MVT::bf16)
615 return LT.first * NElts * getFullRateInstrCost();
621 if (SLT == MVT::f64) {
626 if (!ST->hasUsableDivScaleConditionOutput())
627 Cost += 3 * getFullRateInstrCost();
629 return LT.first *
Cost * NElts;
634 if ((SLT == MVT::f32 && !HasFP32Denormals) ||
635 (SLT == MVT::f16 && ST->has16BitInsts())) {
636 return LT.first * getQuarterRateInstrCost(
CostKind) * NElts;
640 if (SLT == MVT::f16 && ST->has16BitInsts()) {
647 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(
CostKind);
648 return LT.first *
Cost * NElts;
655 int Cost = getQuarterRateInstrCost(
CostKind) + getFullRateInstrCost();
656 return LT.first *
Cost * NElts;
659 if (SLT == MVT::f32 || SLT == MVT::f16) {
661 int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
662 1 * getQuarterRateInstrCost(
CostKind);
664 if (!HasFP32Denormals) {
666 Cost += 2 * getFullRateInstrCost();
669 return LT.first * NElts *
Cost;
675 return TLI->isFNegFree(SLT) ? 0 : NElts;
689 case Intrinsic::fmuladd:
690 case Intrinsic::copysign:
691 case Intrinsic::minimumnum:
692 case Intrinsic::maximumnum:
693 case Intrinsic::canonicalize:
695 case Intrinsic::round:
696 case Intrinsic::uadd_sat:
697 case Intrinsic::usub_sat:
698 case Intrinsic::sadd_sat:
699 case Intrinsic::ssub_sat:
710 switch (ICA.
getID()) {
711 case Intrinsic::fabs:
714 case Intrinsic::amdgcn_workitem_id_x:
715 case Intrinsic::amdgcn_workitem_id_y:
716 case Intrinsic::amdgcn_workitem_id_z:
720 case Intrinsic::amdgcn_workgroup_id_x:
721 case Intrinsic::amdgcn_workgroup_id_y:
722 case Intrinsic::amdgcn_workgroup_id_z:
723 case Intrinsic::amdgcn_lds_kernel_id:
724 case Intrinsic::amdgcn_dispatch_ptr:
725 case Intrinsic::amdgcn_dispatch_id:
726 case Intrinsic::amdgcn_implicitarg_ptr:
727 case Intrinsic::amdgcn_queue_ptr:
739 case Intrinsic::exp2:
740 case Intrinsic::exp10: {
742 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
745 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
747 if (SLT == MVT::f64) {
749 if (IID == Intrinsic::exp)
751 else if (IID == Intrinsic::exp10)
757 if (SLT == MVT::f32) {
758 unsigned NumFullRateOps = 0;
760 unsigned NumQuarterRateOps = 1;
766 NumFullRateOps = ST->hasFastFMAF32() ? 13 : 17;
768 if (IID == Intrinsic::exp) {
771 }
else if (IID == Intrinsic::exp10) {
774 NumQuarterRateOps = 2;
777 if (HasFP32Denormals)
782 NumFullRateOps * getFullRateInstrCost() +
783 NumQuarterRateOps * getQuarterRateInstrCost(
CostKind);
784 return LT.first * NElts *
Cost;
796 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
798 unsigned NElts = LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
800 if ((ST->hasVOP3PInsts() &&
801 (SLT == MVT::f16 || SLT == MVT::i16 ||
802 (SLT == MVT::bf16 && ST->hasBF16PackedInsts()))) ||
803 (ST->hasPackedFP32Ops() && SLT == MVT::f32))
804 NElts = (NElts + 1) / 2;
807 unsigned InstRate = getQuarterRateInstrCost(
CostKind);
809 switch (ICA.
getID()) {
811 case Intrinsic::fmuladd:
812 if (SLT == MVT::f64) {
813 InstRate = get64BitInstrCost(
CostKind);
817 if ((SLT == MVT::f32 && ST->hasFastFMAF32()) || SLT == MVT::f16)
818 InstRate = getFullRateInstrCost();
820 InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(
CostKind)
821 : getQuarterRateInstrCost(
CostKind);
824 case Intrinsic::copysign:
825 return NElts * getFullRateInstrCost();
826 case Intrinsic::minimumnum:
827 case Intrinsic::maximumnum: {
839 SLT == MVT::f64 ? get64BitInstrCost(
CostKind) : getFullRateInstrCost();
840 InstRate = BaseRate *
NumOps;
843 case Intrinsic::canonicalize: {
845 SLT == MVT::f64 ? get64BitInstrCost(
CostKind) : getFullRateInstrCost();
848 case Intrinsic::uadd_sat:
849 case Intrinsic::usub_sat:
850 case Intrinsic::sadd_sat:
851 case Intrinsic::ssub_sat: {
852 if (SLT == MVT::i16 || SLT == MVT::i32)
853 InstRate = getFullRateInstrCost();
855 static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
862 if (SLT == MVT::i16 || SLT == MVT::i32)
863 InstRate = 2 * getFullRateInstrCost();
869 return LT.first * NElts * InstRate;
875 assert((
I ==
nullptr ||
I->getOpcode() == Opcode) &&
876 "Opcode should reflect passed instruction.");
879 const int CBrCost = SCost ? 5 : 7;
881 case Instruction::Br: {
884 if (BI && BI->isUnconditional())
885 return SCost ? 1 : 4;
890 case Instruction::Switch: {
894 return (
SI ? (
SI->getNumCases() + 1) : 4) * (CBrCost + 1);
896 case Instruction::Ret:
897 return SCost ? 1 : 10;
904 std::optional<FastMathFlags> FMF,
909 EVT OrigTy = TLI->getValueType(
DL, Ty);
916 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
917 return LT.first * getFullRateInstrCost();
924 EVT OrigTy = TLI->getValueType(
DL, Ty);
931 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
932 return LT.first * getHalfRateInstrCost(
CostKind);
939 case Instruction::ExtractElement:
940 case Instruction::InsertElement: {
944 if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
955 return Index == ~0u ? 2 : 0;
970 if (Indices.
size() > 1)
976 TLI->ParseConstraints(
DL, ST->getRegisterInfo(), *CI);
978 const int TargetOutputIdx = Indices.
empty() ? -1 : Indices[0];
981 for (
auto &TC : TargetConstraints) {
986 if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
989 TLI->ComputeConstraintToUse(TC,
SDValue());
992 TRI, TC.ConstraintCode, TC.ConstraintVT).second;
996 if (!RC || !
TRI->isSGPRClass(RC))
1026bool GCNTTIImpl::isSourceOfDivergence(
const Value *V)
const {
1050 case Intrinsic::read_register:
1052 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1054 Intrinsic->getOperand(0)->getType()->getPointerAddressSpace();
1055 unsigned DstAS =
Intrinsic->getType()->getPointerAddressSpace();
1058 ST->hasGloballyAddressableScratch();
1060 case Intrinsic::amdgcn_workitem_id_y:
1061 case Intrinsic::amdgcn_workitem_id_z: {
1066 *
F, IID == Intrinsic::amdgcn_workitem_id_y ? 1 : 2);
1067 return !HasUniformYZ && (!ThisDimSize || *ThisDimSize != 1);
1076 if (CI->isInlineAsm())
1091 ST->hasGloballyAddressableScratch();
1097bool GCNTTIImpl::isAlwaysUniform(
const Value *V)
const {
1102 if (CI->isInlineAsm())
1120 bool XDimDoesntResetWithinWaves =
false;
1123 XDimDoesntResetWithinWaves = ST->hasWavefrontsEvenlySplittingXDim(*
F);
1125 using namespace llvm::PatternMatch;
1131 return C >= ST->getWavefrontSizeLog2() && XDimDoesntResetWithinWaves;
1138 ST->getWavefrontSizeLog2() &&
1139 XDimDoesntResetWithinWaves;
1154 case Intrinsic::amdgcn_if:
1155 case Intrinsic::amdgcn_else: {
1156 ArrayRef<unsigned> Indices = ExtValue->
getIndices();
1157 return Indices.
size() == 1 && Indices[0] == 1;
1174 case Intrinsic::amdgcn_is_shared:
1175 case Intrinsic::amdgcn_is_private:
1176 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1177 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1178 case Intrinsic::amdgcn_load_to_lds:
1179 case Intrinsic::amdgcn_make_buffer_rsrc:
1189 Value *NewV)
const {
1190 auto IntrID =
II->getIntrinsicID();
1192 case Intrinsic::amdgcn_is_shared:
1193 case Intrinsic::amdgcn_is_private: {
1194 unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
1202 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1203 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
1204 Type *DestTy =
II->getType();
1211 M,
II->getIntrinsicID(), {DestTy, SrcTy, DestTy});
1212 II->setArgOperand(0, NewV);
1213 II->setCalledFunction(NewDecl);
1216 case Intrinsic::amdgcn_load_to_lds: {
1221 II->setArgOperand(0, NewV);
1222 II->setCalledFunction(NewDecl);
1225 case Intrinsic::amdgcn_make_buffer_rsrc: {
1227 Type *DstTy =
II->getType();
1230 M,
II->getIntrinsicID(), {DstTy, SrcTy});
1231 II->setArgOperand(0, NewV);
1232 II->setCalledFunction(NewDecl);
1253 unsigned ScalarSize =
DL.getTypeSizeInBits(SrcTy->getElementType());
1255 (ScalarSize == 16 || ScalarSize == 8)) {
1268 unsigned NumSrcElts = SrcVecTy->getNumElements();
1269 if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumSrcElts == 2 &&
1275 unsigned EltsPerReg = 32 / ScalarSize;
1283 return divideCeil(DstVecTy->getNumElements(), EltsPerReg);
1286 if (Index % EltsPerReg == 0)
1289 return divideCeil(DstVecTy->getNumElements(), EltsPerReg);
1295 unsigned NumDstElts = DstVecTy->getNumElements();
1297 unsigned EndIndex = Index + NumInsertElts;
1298 unsigned BeginSubIdx = Index % EltsPerReg;
1299 unsigned EndSubIdx = EndIndex % EltsPerReg;
1302 if (BeginSubIdx != 0) {
1310 if (EndIndex < NumDstElts && BeginSubIdx < EndSubIdx)
1319 unsigned NumElts = DstVecTy->getNumElements();
1323 unsigned EltsFromLHS = NumElts - Index;
1324 bool LHSIsAligned = (Index % EltsPerReg) == 0;
1325 bool RHSIsAligned = (EltsFromLHS % EltsPerReg) == 0;
1326 if (LHSIsAligned && RHSIsAligned)
1328 if (LHSIsAligned && !RHSIsAligned)
1329 return divideCeil(NumElts, EltsPerReg) - (EltsFromLHS / EltsPerReg);
1330 if (!LHSIsAligned && RHSIsAligned)
1338 if (!Mask.empty()) {
1348 for (
unsigned DstIdx = 0; DstIdx < Mask.size(); DstIdx += EltsPerReg) {
1351 for (
unsigned I = 0;
I < EltsPerReg && DstIdx +
I < Mask.size(); ++
I) {
1352 int SrcIdx = Mask[DstIdx +
I];
1356 if (SrcIdx < (
int)NumSrcElts) {
1357 Reg = SrcIdx / EltsPerReg;
1358 if (SrcIdx % EltsPerReg !=
I)
1361 Reg = NumSrcElts + (SrcIdx - NumSrcElts) / EltsPerReg;
1362 if ((SrcIdx - NumSrcElts) % EltsPerReg !=
I)
1368 if (Regs.
size() >= 2)
1388 for (
auto &
Op :
I->operands()) {
1401 if (OpInst->getType()->isVectorTy() && OpInst->getNumOperands() > 1) {
1403 if (VecOpInst && VecOpInst->
hasOneUse())
1408 OpInst->getOperand(0),
1409 OpInst->getOperand(1)) == 0) {
1418 unsigned EltSize =
DL.getTypeSizeInBits(
1423 if (EltSize < 16 || !ST->has16BitInsts())
1426 int NumSubElts, SubIndex;
1427 if (Shuffle->changesLength()) {
1428 if (Shuffle->increasesLength() && Shuffle->isIdentityWithPadding()) {
1433 if ((Shuffle->isExtractSubvectorMask(SubIndex) ||
1434 Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex)) &&
1435 !(SubIndex & 0x1)) {
1441 if (Shuffle->isReverse() || Shuffle->isZeroEltSplat() ||
1442 Shuffle->isSingleSource()) {
1449 return !
Ops.empty();
1460 const FeatureBitset &CallerBits = CallerST->getFeatureBits();
1461 const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();
1463 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
1464 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
1465 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
1475 if (Callee->hasFnAttribute(Attribute::AlwaysInline) ||
1476 Callee->hasFnAttribute(Attribute::InlineHint))
1482 if (Callee->size() == 1)
1484 size_t BBSize = Caller->size() + Callee->size() - 1;
1494 const int NrOfSGPRUntilSpill = 26;
1495 const int NrOfVGPRUntilSpill = 32;
1499 unsigned adjustThreshold = 0;
1505 for (
auto ArgVT : ValueVTs) {
1509 SGPRsInUse += CCRegNum;
1511 VGPRsInUse += CCRegNum;
1521 ArgStackCost +=
const_cast<GCNTTIImpl *
>(TTIImpl)->getMemoryOpCost(
1524 ArgStackCost +=
const_cast<GCNTTIImpl *
>(TTIImpl)->getMemoryOpCost(
1530 adjustThreshold += std::max(0, SGPRsInUse - NrOfSGPRUntilSpill) *
1532 adjustThreshold += std::max(0, VGPRsInUse - NrOfVGPRUntilSpill) *
1534 return adjustThreshold;
1543 unsigned AllocaSize = 0;
1550 unsigned AddrSpace = Ty->getAddressSpace();
1560 AllocaSize +=
Size->getFixedValue();
1604 static_assert(InlinerVectorBonusPercent == 0,
"vector bonus assumed to be 0");
1608 return BB.getTerminator()->getNumSuccessors() > 1;
1611 Threshold += Threshold / 2;
1619 unsigned AllocaThresholdBonus =
1620 (Threshold * ArgAllocaSize->getFixedValue()) / AllocaSize;
1622 return AllocaThresholdBonus;
1628 CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);
1633 CommonTTI.getPeelingPreferences(L, SE, PP);
1637 return ST->hasFullRate64Ops()
1638 ? getFullRateInstrCost()
1639 : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(
CostKind)
1640 : getQuarterRateInstrCost(
CostKind);
1643std::pair<InstructionCost, MVT>
1644GCNTTIImpl::getTypeLegalizationCost(
Type *Ty)
const {
1646 auto Size =
DL.getTypeSizeInBits(Ty);
1653 Cost.first += (
Size + 255) / 256;
1658 return ST->hasPrefetch() ? 128 : 0;
1669 LB.
push_back({
"amdgpu-max-num-workgroups[0]", MaxNumWorkgroups[0]});
1670 LB.push_back({
"amdgpu-max-num-workgroups[1]", MaxNumWorkgroups[1]});
1671 LB.push_back({
"amdgpu-max-num-workgroups[2]", MaxNumWorkgroups[2]});
1672 std::pair<unsigned, unsigned> FlatWorkGroupSize =
1673 ST->getFlatWorkGroupSizes(
F);
1674 LB.push_back({
"amdgpu-flat-work-group-size[0]", FlatWorkGroupSize.first});
1675 LB.push_back({
"amdgpu-flat-work-group-size[1]", FlatWorkGroupSize.second});
1676 std::pair<unsigned, unsigned> WavesPerEU = ST->getWavesPerEU(
F);
1677 LB.push_back({
"amdgpu-waves-per-eu[0]", WavesPerEU.first});
1678 LB.push_back({
"amdgpu-waves-per-eu[1]", WavesPerEU.second});
1683 if (!ST->hasFeature(AMDGPU::FeatureDX10ClampAndIEEEMode))
1690 Attribute IEEEAttr =
F->getFnAttribute(
"amdgpu-ieee");
1705 if ((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
1706 VecTy->getElementType()->isIntegerTy(8)) {
1717 if (VecTy->getElementType()->isIntegerTy(8)) {
1727 if (isAlwaysUniform(V))
1730 if (isSourceOfDivergence(V))
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
Register const TargetRegisterInfo * TRI
uint64_t IntrinsicInst * II
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
static unsigned getNumElements(Type *Ty)
std::optional< unsigned > getReqdWorkGroupSize(const Function &F, unsigned Dim) const
bool hasWavefrontsEvenlySplittingXDim(const Function &F, bool REquiresUniformYZ=false) const
uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override
AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
an instruction to allocate memory on the stack
LLVM_ABI bool isStaticAlloca() const
Return true if this alloca is in the entry block of the function and is a constant size.
LLVM_ABI std::optional< TypeSize > getAllocationSize(const DataLayout &DL) const
Get allocation size in bytes.
This class represents an incoming formal argument to a Function.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
Functions, function parameters, and return types can have attributes to indicate how they should be t...
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
bool isValid() const
Return true if the attribute is any kind of attribute.
LLVM Basic Block Representation.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
unsigned getNumberOfParts(Type *Tp) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
Conditional or Unconditional Branch instruction.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
bool isInlineAsm() const
Check if this call is an inline asm statement.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
CallingConv::ID getCallingConv() const
Value * getArgOperand(unsigned i) const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned getArgOperandNo(const Use *U) const
Given a use for an arg operand, get the arg operand number that corresponds to it.
This class represents a function call, abstracting a target machine's calling convention.
This is the shared class of boolean and integer constants.
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
A parsed version of the target data layout string, and methods for querying it.
constexpr bool isScalar() const
Exactly one element.
Convenience struct for specifying and reasoning about fast-math flags.
Container class for subtarget features.
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
Account for loads of i8 vector types to have reduced cost.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
void collectKernelLaunchBounds(const Function &F, SmallVectorImpl< std::pair< StringRef, int64_t > > &LB) const override
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const override
bool isInlineAsmSourceOfDivergence(const CallInst *CI, ArrayRef< unsigned > Indices={}) const
Analyze if the results of inline asm are divergent.
bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override
unsigned getNumberOfRegisters(unsigned RCID) const override
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const override
unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned ChainSizeInBytes, VectorType *VecTy) const override
bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
bool shouldPrefetchAddressSpace(unsigned AS) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
bool hasBranchDivergence(const Function *F=nullptr) const override
Value * rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const override
unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const override
unsigned getMaxInterleaveFactor(ElementCount VF) const override
void getMemcpyLoopResidualLoweringType(SmallVectorImpl< Type * > &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicCpySize) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
Get intrinsic cost based on arguments.
unsigned getInliningThresholdMultiplier() const override
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, unsigned ChainSizeInBytes, VectorType *VecTy) const override
unsigned getPrefetchDistance() const override
How much before a load we should place the prefetch instruction.
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
KnownIEEEMode fpenvIEEEMode(const Instruction &I) const
Return KnownIEEEMode::On if we know that the use context can assume "amdgpu-ieee"="true" and KnownIEEEM...
unsigned adjustInliningThreshold(const CallBase *CB) const override
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Whether it is profitable to sink the operands of an Instruction I to the basic block of I.
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
Try to calculate op costs for min/max reduction operations.
int getInliningLastCallToStaticBonus() const override
bool collectFlatAddressOperands(SmallVectorImpl< int > &OpIndexes, Intrinsic::ID IID) const override
unsigned getNumberOfParts(Type *Tp) const override
When counting parts on AMD GPUs, account for i8s being grouped together under a single i32 value.
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
unsigned getMinVectorRegisterBitWidth() const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
Type * getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicElementSize) const override
uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override
InstructionUniformity getInstructionUniformity(const Value *V) const override
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI bool hasApproxFunc() const LLVM_READONLY
Determine whether the approximate-math-functions flag is set.
LLVM_ABI bool hasAllowContract() const LLVM_READONLY
Determine whether the allow-contract flag is set.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
FastMathFlags getFlags() const
Type * getReturnType() const
const IntrinsicInst * getInst() const
Intrinsic::ID getID() const
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
This is an important class for using LLVM in a threaded context.
An instruction for reading from memory.
Represents a single loop in the control flow graph.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
A Module instance is used to store all the information related to an LLVM module.
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
The main scalar evolution driver.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
std::vector< AsmOperandInfo > AsmOperandInfoVector
Primary interface to the complete machine description for the target machine.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInfo-derived member variable.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
The instances of the Type class are immutable: once they are created, they are never changed.
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
static LLVM_ABI IntegerType * getInt16Ty(LLVMContext &C)
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
A Use represents the edge between a Value definition and its users.
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
user_iterator user_begin()
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVMContext & getContext() const
All values hold a context through their type.
Base class of all SIMD vector types.
constexpr ScalarTy getFixedValue() const
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool isFlatGlobalAddrSpace(unsigned AS)
bool isArgPassedInSGPR(const Argument *A)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
bool isExtendedGlobalAddrSpace(unsigned AS)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
@ C
The default llvm calling convention, compatible with C.
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types and value types.
@ ADD
Simple integer binary arithmetic operators.
@ FADD
Simple binary floating point operators.
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ SHL
Shift and rotation operations.
@ AND
Bitwise operators - logical and, logical or, logical xor.
LLVM_ABI int getInstrCost()
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
BinaryOp_match< LHS, RHS, Instruction::AShr > m_AShr(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
bool match(Val *V, const Pattern &P)
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
specific_fpval m_FPOne()
Match a float 1.0 or vector with all elements equal to 1.0.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::LShr > m_LShr(const LHS &L, const RHS &R)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
m_Intrinsic_Ty< Opnd0 >::Ty m_FAbs(const Opnd0 &Op0)
initializer< Ty > init(const Ty &Val)
std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > extract_or_null(Y &&MD)
Extract a Value from Metadata, allowing null.
This is an optimization pass for GlobalISel generic memory operations.
FunctionAddr VTableAddr Value
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs=nullptr, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual underlying non-aggregate types that comprise it.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI MDNode * findOptionMDForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for a loop.
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like all_of.
auto dyn_cast_or_null(const Y &Val)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOne bit sets.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
AtomicOrdering
Atomic ordering for LLVM's memory model.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
DWARFExpression::Operation Op
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal.address from the specified value V, returning the original object being addressed.
InstructionUniformity
Enum describing how instructions behave with respect to uniformity and divergence, to answer the question: if the source values are uniform, will the result value be uniform?
@ AlwaysUniform
The result values are always uniform.
@ NeverUniform
The result values can never be assumed to be uniform.
@ Default
The result values are uniform if and only if all operands are uniform.
This struct is a compact representation of a valid (non-zero power of two) alignment.
static constexpr DenormalMode getPreserveSign()
uint64_t getScalarSizeInBits() const
Information about a load/store intrinsic defined by the target.
bool isInlineCompatible(SIModeRegisterDefaults CalleeMode) const