17#include "llvm/IR/IntrinsicsAMDGPU.h"
18#include "llvm/IR/IntrinsicsR600.h"
22#define DEBUG_TYPE "amdgpu-attributor"
27 "amdgpu-indirect-call-specialization-threshold",
29 "A threshold controls whether an indirect call will be specialized"),
32#define AMDGPU_ATTRIBUTE(Name, Str) Name##_POS,
35#include "AMDGPUAttributes.def"
39#define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS,
43#include "AMDGPUAttributes.def"
48#define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str},
49static constexpr std::pair<ImplicitArgumentMask, StringLiteral>
51#include "AMDGPUAttributes.def"
61 bool HasApertureRegs,
bool SupportsGetDoorBellID,
62 unsigned CodeObjectVersion) {
64 case Intrinsic::amdgcn_workitem_id_x:
67 case Intrinsic::amdgcn_workgroup_id_x:
69 return WORKGROUP_ID_X;
70 case Intrinsic::amdgcn_workitem_id_y:
71 case Intrinsic::r600_read_tidig_y:
73 case Intrinsic::amdgcn_workitem_id_z:
74 case Intrinsic::r600_read_tidig_z:
76 case Intrinsic::amdgcn_workgroup_id_y:
77 case Intrinsic::r600_read_tgid_y:
78 return WORKGROUP_ID_Y;
79 case Intrinsic::amdgcn_workgroup_id_z:
80 case Intrinsic::r600_read_tgid_z:
81 return WORKGROUP_ID_Z;
82 case Intrinsic::amdgcn_cluster_id_x:
85 case Intrinsic::amdgcn_cluster_id_y:
87 case Intrinsic::amdgcn_cluster_id_z:
89 case Intrinsic::amdgcn_lds_kernel_id:
91 case Intrinsic::amdgcn_dispatch_ptr:
93 case Intrinsic::amdgcn_dispatch_id:
95 case Intrinsic::amdgcn_implicitarg_ptr:
96 return IMPLICIT_ARG_PTR;
99 case Intrinsic::amdgcn_queue_ptr:
102 case Intrinsic::amdgcn_is_shared:
103 case Intrinsic::amdgcn_is_private:
111 case Intrinsic::trap:
112 case Intrinsic::debugtrap:
113 case Intrinsic::ubsantrap:
114 if (SupportsGetDoorBellID)
138 return F.hasFnAttribute(Attribute::SanitizeAddress) ||
139 F.hasFnAttribute(Attribute::SanitizeThread) ||
140 F.hasFnAttribute(Attribute::SanitizeMemory) ||
141 F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
142 F.hasFnAttribute(Attribute::SanitizeMemTag);
148 AMDGPUInformationCache(
const Module &M, AnalysisGetter &AG,
150 SetVector<Function *> *
CGSCC, TargetMachine &TM)
156 enum ConstantStatus : uint8_t {
159 ADDR_SPACE_CAST_PRIVATE_TO_FLAT = 1 << 1,
160 ADDR_SPACE_CAST_LOCAL_TO_FLAT = 1 << 2,
161 ADDR_SPACE_CAST_BOTH_TO_FLAT =
162 ADDR_SPACE_CAST_PRIVATE_TO_FLAT | ADDR_SPACE_CAST_LOCAL_TO_FLAT
166 bool hasApertureRegs(Function &
F) {
167 const GCNSubtarget &
ST = TM.getSubtarget<GCNSubtarget>(
F);
168 return ST.hasApertureRegs();
172 bool supportsGetDoorbellID(Function &
F) {
173 const GCNSubtarget &
ST = TM.getSubtarget<GCNSubtarget>(
F);
174 return ST.supportsGetDoorbellID();
177 std::optional<std::pair<unsigned, unsigned>>
178 getFlatWorkGroupSizeAttr(
const Function &
F)
const {
182 return std::make_pair(
R->first, *(
R->second));
185 std::pair<unsigned, unsigned>
186 getDefaultFlatWorkGroupSize(
const Function &
F)
const {
187 const GCNSubtarget &
ST = TM.getSubtarget<GCNSubtarget>(
F);
188 return ST.getDefaultFlatWorkGroupSize(
F.getCallingConv());
191 std::pair<unsigned, unsigned>
192 getMaximumFlatWorkGroupRange(
const Function &
F) {
193 const GCNSubtarget &
ST = TM.getSubtarget<GCNSubtarget>(
F);
194 return {
ST.getMinFlatWorkGroupSize(),
ST.getMaxFlatWorkGroupSize()};
197 SmallVector<unsigned> getMaxNumWorkGroups(
const Function &
F) {
198 const GCNSubtarget &
ST = TM.getSubtarget<GCNSubtarget>(
F);
199 return ST.getMaxNumWorkGroups(
F);
203 unsigned getCodeObjectVersion()
const {
return CodeObjectVersion; }
205 std::optional<std::pair<unsigned, unsigned>>
206 getWavesPerEUAttr(
const Function &
F) {
212 const GCNSubtarget &
ST = TM.getSubtarget<GCNSubtarget>(
F);
213 Val->second =
ST.getMaxWavesPerEU();
215 return std::make_pair(Val->first, *(Val->second));
219 const GCNSubtarget &
ST = TM.getSubtarget<GCNSubtarget>(
F);
220 return ST.getMaxWavesPerEU();
223 unsigned getMaxAddrSpace()
const override {
230 static uint8_t visitConstExpr(
const ConstantExpr *CE) {
231 uint8_t Status = NONE;
233 if (
CE->getOpcode() == Instruction::AddrSpaceCast) {
234 unsigned SrcAS =
CE->getOperand(0)->getType()->getPointerAddressSpace();
236 Status |= ADDR_SPACE_CAST_PRIVATE_TO_FLAT;
238 Status |= ADDR_SPACE_CAST_LOCAL_TO_FLAT;
245 uint8_t getConstantAccess(
const Constant *
C,
246 SmallPtrSetImpl<const Constant *> &Visited) {
247 auto It = ConstantStatus.find(
C);
248 if (It != ConstantStatus.end())
256 Result |= visitConstExpr(CE);
258 for (
const Use &U :
C->operands()) {
260 if (!OpC || !Visited.
insert(OpC).second)
263 Result |= getConstantAccess(OpC, Visited);
270 bool needsQueuePtr(
const Constant *
C, Function &Fn) {
272 bool HasAperture = hasApertureRegs(Fn);
275 if (!IsNonEntryFunc && HasAperture)
278 SmallPtrSet<const Constant *, 8> Visited;
279 uint8_t
Access = getConstantAccess(
C, Visited);
282 if (IsNonEntryFunc && (
Access & DS_GLOBAL))
285 return !HasAperture && (
Access & ADDR_SPACE_CAST_BOTH_TO_FLAT);
288 bool checkConstForAddrSpaceCastFromPrivate(
const Constant *
C) {
289 SmallPtrSet<const Constant *, 8> Visited;
290 uint8_t
Access = getConstantAccess(
C, Visited);
291 return Access & ADDR_SPACE_CAST_PRIVATE_TO_FLAT;
296 DenseMap<const Constant *, uint8_t> ConstantStatus;
297 const unsigned CodeObjectVersion;
300struct AAAMDAttributes
301 :
public StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
303 using Base = StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
306 AAAMDAttributes(
const IRPosition &IRP, Attributor &
A) : Base(IRP) {}
309 static AAAMDAttributes &createForPosition(
const IRPosition &IRP,
313 StringRef
getName()
const override {
return "AAAMDAttributes"; }
316 const char *getIdAddr()
const override {
return &ID; }
320 static bool classof(
const AbstractAttribute *AA) {
325 static const char ID;
327const char AAAMDAttributes::ID = 0;
329struct AAUniformWorkGroupSize
330 :
public StateWrapper<BooleanState, AbstractAttribute> {
331 using Base = StateWrapper<BooleanState, AbstractAttribute>;
332 AAUniformWorkGroupSize(
const IRPosition &IRP, Attributor &
A) : Base(IRP) {}
335 static AAUniformWorkGroupSize &createForPosition(
const IRPosition &IRP,
339 StringRef
getName()
const override {
return "AAUniformWorkGroupSize"; }
342 const char *getIdAddr()
const override {
return &ID; }
346 static bool classof(
const AbstractAttribute *AA) {
351 static const char ID;
353const char AAUniformWorkGroupSize::ID = 0;
355struct AAUniformWorkGroupSizeFunction :
public AAUniformWorkGroupSize {
356 AAUniformWorkGroupSizeFunction(
const IRPosition &IRP, Attributor &
A)
357 : AAUniformWorkGroupSize(IRP,
A) {}
361 CallingConv::ID CC =
F->getCallingConv();
363 if (CC != CallingConv::AMDGPU_KERNEL)
366 bool InitialValue =
F->hasFnAttribute(
"uniform-work-group-size");
369 indicateOptimisticFixpoint();
371 indicatePessimisticFixpoint();
377 auto CheckCallSite = [&](AbstractCallSite CS) {
380 <<
"->" << getAssociatedFunction()->
getName() <<
"\n");
382 const auto *CallerInfo =
A.getAAFor<AAUniformWorkGroupSize>(
384 if (!CallerInfo || !CallerInfo->isValidState())
388 CallerInfo->getState());
393 bool AllCallSitesKnown =
true;
394 if (!
A.checkForAllCallSites(CheckCallSite, *
this,
true, AllCallSitesKnown))
395 return indicatePessimisticFixpoint();
402 return ChangeStatus::UNCHANGED;
404 LLVMContext &Ctx = getAssociatedFunction()->getContext();
405 return A.manifestAttrs(getIRPosition(),
406 {Attribute::get(Ctx,
"uniform-work-group-size")},
410 bool isValidState()
const override {
415 const std::string getAsStr(Attributor *)
const override {
416 return "AMDWorkGroupSize[" + std::to_string(getAssumed()) +
"]";
420 void trackStatistics()
const override {}
423AAUniformWorkGroupSize &
424AAUniformWorkGroupSize::createForPosition(
const IRPosition &IRP,
427 return *
new (
A.Allocator) AAUniformWorkGroupSizeFunction(IRP,
A);
429 "AAUniformWorkGroupSize is only valid for function position");
432struct AAAMDAttributesFunction :
public AAAMDAttributes {
433 AAAMDAttributesFunction(
const IRPosition &IRP, Attributor &
A)
434 : AAAMDAttributes(IRP,
A) {}
446 if (HasSanitizerAttrs) {
447 removeAssumedBits(IMPLICIT_ARG_PTR);
448 removeAssumedBits(HOSTCALL_PTR);
449 removeAssumedBits(FLAT_SCRATCH_INIT);
453 if (HasSanitizerAttrs &&
454 (Attr.first == IMPLICIT_ARG_PTR || Attr.first == HOSTCALL_PTR ||
455 Attr.first == FLAT_SCRATCH_INIT))
458 if (
F->hasFnAttribute(Attr.second))
459 addKnownBits(Attr.first);
462 if (
F->isDeclaration())
468 indicatePessimisticFixpoint();
476 auto OrigAssumed = getAssumed();
479 const AACallEdges *AAEdges =
A.getAAFor<AACallEdges>(
480 *
this, this->getIRPosition(), DepClassTy::REQUIRED);
483 return indicatePessimisticFixpoint();
487 bool NeedsImplicit =
false;
488 auto &InfoCache =
static_cast<AMDGPUInformationCache &
>(
A.getInfoCache());
489 bool HasApertureRegs = InfoCache.hasApertureRegs(*
F);
490 bool SupportsGetDoorbellID = InfoCache.supportsGetDoorbellID(*
F);
491 unsigned COV = InfoCache.getCodeObjectVersion();
496 const AAAMDAttributes *AAAMD =
A.getAAFor<AAAMDAttributes>(
498 if (!AAAMD || !AAAMD->isValidState())
499 return indicatePessimisticFixpoint();
504 bool NonKernelOnly =
false;
507 HasApertureRegs, SupportsGetDoorbellID, COV);
518 if (!
Callee->hasFnAttribute(Attribute::NoCallback))
519 return indicatePessimisticFixpoint();
524 if ((IsNonEntryFunc || !NonKernelOnly))
525 removeAssumedBits(AttrMask);
531 removeAssumedBits(IMPLICIT_ARG_PTR);
533 if (isAssumed(QUEUE_PTR) && checkForQueuePtr(
A)) {
537 removeAssumedBits(IMPLICIT_ARG_PTR);
539 removeAssumedBits(QUEUE_PTR);
542 if (funcRetrievesMultigridSyncArg(
A, COV)) {
543 assert(!isAssumed(IMPLICIT_ARG_PTR) &&
544 "multigrid_sync_arg needs implicitarg_ptr");
545 removeAssumedBits(MULTIGRID_SYNC_ARG);
548 if (funcRetrievesHostcallPtr(
A, COV)) {
549 assert(!isAssumed(IMPLICIT_ARG_PTR) &&
"hostcall needs implicitarg_ptr");
550 removeAssumedBits(HOSTCALL_PTR);
553 if (funcRetrievesHeapPtr(
A, COV)) {
554 assert(!isAssumed(IMPLICIT_ARG_PTR) &&
"heap_ptr needs implicitarg_ptr");
555 removeAssumedBits(HEAP_PTR);
558 if (isAssumed(QUEUE_PTR) && funcRetrievesQueuePtr(
A, COV)) {
559 assert(!isAssumed(IMPLICIT_ARG_PTR) &&
"queue_ptr needs implicitarg_ptr");
560 removeAssumedBits(QUEUE_PTR);
563 if (isAssumed(LDS_KERNEL_ID) && funcRetrievesLDSKernelId(
A)) {
564 removeAssumedBits(LDS_KERNEL_ID);
567 if (isAssumed(DEFAULT_QUEUE) && funcRetrievesDefaultQueue(
A, COV))
568 removeAssumedBits(DEFAULT_QUEUE);
570 if (isAssumed(COMPLETION_ACTION) && funcRetrievesCompletionAction(
A, COV))
571 removeAssumedBits(COMPLETION_ACTION);
573 if (isAssumed(FLAT_SCRATCH_INIT) && needFlatScratchInit(
A))
574 removeAssumedBits(FLAT_SCRATCH_INIT);
576 return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED
577 : ChangeStatus::UNCHANGED;
582 LLVMContext &Ctx = getAssociatedFunction()->getContext();
585 if (isKnown(Attr.first))
586 AttrList.
push_back(Attribute::get(Ctx, Attr.second));
589 return A.manifestAttrs(getIRPosition(), AttrList,
593 const std::string getAsStr(Attributor *)
const override {
595 raw_string_ostream OS(Str);
598 if (isAssumed(Attr.first))
599 OS <<
' ' << Attr.second;
605 void trackStatistics()
const override {}
608 bool checkForQueuePtr(Attributor &
A) {
612 auto &InfoCache =
static_cast<AMDGPUInformationCache &
>(
A.getInfoCache());
614 bool NeedsQueuePtr =
false;
617 unsigned SrcAS =
static_cast<AddrSpaceCastInst &
>(
I).getSrcAddressSpace();
619 NeedsQueuePtr =
true;
625 bool HasApertureRegs = InfoCache.hasApertureRegs(*
F);
631 if (!HasApertureRegs) {
632 bool UsedAssumedInformation =
false;
633 A.checkForAllInstructions(CheckAddrSpaceCasts, *
this,
634 {Instruction::AddrSpaceCast},
635 UsedAssumedInformation);
642 if (!IsNonEntryFunc && HasApertureRegs)
645 for (BasicBlock &BB : *
F) {
646 for (Instruction &
I : BB) {
647 for (
const Use &U :
I.operands()) {
649 if (InfoCache.needsQueuePtr(
C, *
F))
659 bool funcRetrievesMultigridSyncArg(Attributor &
A,
unsigned COV) {
661 AA::RangeTy
Range(Pos, 8);
662 return funcRetrievesImplicitKernelArg(
A,
Range);
665 bool funcRetrievesHostcallPtr(Attributor &
A,
unsigned COV) {
667 AA::RangeTy
Range(Pos, 8);
668 return funcRetrievesImplicitKernelArg(
A,
Range);
671 bool funcRetrievesDefaultQueue(Attributor &
A,
unsigned COV) {
673 AA::RangeTy
Range(Pos, 8);
674 return funcRetrievesImplicitKernelArg(
A,
Range);
677 bool funcRetrievesCompletionAction(Attributor &
A,
unsigned COV) {
679 AA::RangeTy
Range(Pos, 8);
680 return funcRetrievesImplicitKernelArg(
A,
Range);
683 bool funcRetrievesHeapPtr(Attributor &
A,
unsigned COV) {
687 return funcRetrievesImplicitKernelArg(
A,
Range);
690 bool funcRetrievesQueuePtr(Attributor &
A,
unsigned COV) {
694 return funcRetrievesImplicitKernelArg(
A,
Range);
697 bool funcRetrievesImplicitKernelArg(Attributor &
A, AA::RangeTy
Range) {
709 const auto *PointerInfoAA =
A.getAAFor<AAPointerInfo>(
711 if (!PointerInfoAA || !PointerInfoAA->getState().isValidState())
714 return PointerInfoAA->forallInterferingAccesses(
715 Range, [](
const AAPointerInfo::Access &Acc,
bool IsExact) {
720 bool UsedAssumedInformation =
false;
721 return !
A.checkForAllCallLikeInstructions(DoesNotLeadToKernelArgLoc, *
this,
722 UsedAssumedInformation);
725 bool funcRetrievesLDSKernelId(Attributor &
A) {
730 bool UsedAssumedInformation =
false;
731 return !
A.checkForAllCallLikeInstructions(DoesNotRetrieve, *
this,
732 UsedAssumedInformation);
737 bool needFlatScratchInit(Attributor &
A) {
738 assert(isAssumed(FLAT_SCRATCH_INIT));
747 bool UsedAssumedInformation =
false;
748 if (!
A.checkForAllInstructions(AddrSpaceCastNotFromPrivate, *
this,
749 {Instruction::AddrSpaceCast},
750 UsedAssumedInformation))
754 auto &InfoCache =
static_cast<AMDGPUInformationCache &
>(
A.getInfoCache());
758 for (
const Use &U :
I.operands()) {
760 if (InfoCache.checkConstForAddrSpaceCastFromPrivate(
C))
782 return Callee->getIntrinsicID() !=
783 Intrinsic::amdgcn_addrspacecast_nonnull;
786 UsedAssumedInformation =
false;
790 return !
A.checkForAllCallLikeInstructions(CheckForNoFlatScratchInit, *
this,
791 UsedAssumedInformation);
795AAAMDAttributes &AAAMDAttributes::createForPosition(
const IRPosition &IRP,
798 return *
new (
A.Allocator) AAAMDAttributesFunction(IRP,
A);
803struct AAAMDSizeRangeAttribute
804 :
public StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t> {
805 using Base = StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t>;
809 AAAMDSizeRangeAttribute(
const IRPosition &IRP, Attributor &
A,
811 :
Base(IRP, 32), AttrName(AttrName) {}
814 void trackStatistics()
const override {}
816 template <
class AttributeImpl>
ChangeStatus updateImplImpl(Attributor &
A) {
819 auto CheckCallSite = [&](AbstractCallSite CS) {
822 <<
"->" << getAssociatedFunction()->
getName() <<
'\n');
824 const auto *CallerInfo =
A.getAAFor<AttributeImpl>(
826 if (!CallerInfo || !CallerInfo->isValidState())
835 bool AllCallSitesKnown =
true;
836 if (!
A.checkForAllCallSites(CheckCallSite, *
this,
839 return indicatePessimisticFixpoint();
847 emitAttributeIfNotDefaultAfterClamp(Attributor &
A,
848 std::pair<unsigned, unsigned>
Default) {
850 unsigned Lower = getAssumed().getLower().getZExtValue();
851 unsigned Upper = getAssumed().getUpper().getZExtValue();
861 return ChangeStatus::UNCHANGED;
864 LLVMContext &Ctx =
F->getContext();
865 SmallString<10> Buffer;
866 raw_svector_ostream OS(Buffer);
868 return A.manifestAttrs(getIRPosition(),
869 {Attribute::get(Ctx, AttrName, OS.str())},
873 const std::string getAsStr(Attributor *)
const override {
875 raw_string_ostream OS(Str);
877 OS << getAssumed().getLower() <<
',' << getAssumed().getUpper() - 1;
884struct AAAMDFlatWorkGroupSize :
public AAAMDSizeRangeAttribute {
885 AAAMDFlatWorkGroupSize(
const IRPosition &IRP, Attributor &
A)
886 : AAAMDSizeRangeAttribute(IRP,
A,
"amdgpu-flat-work-group-size") {}
890 auto &InfoCache =
static_cast<AMDGPUInformationCache &
>(
A.getInfoCache());
892 bool HasAttr =
false;
893 auto Range = InfoCache.getDefaultFlatWorkGroupSize(*
F);
894 auto MaxRange = InfoCache.getMaximumFlatWorkGroupRange(*
F);
896 if (
auto Attr = InfoCache.getFlatWorkGroupSizeAttr(*
F)) {
900 if (*Attr != MaxRange) {
908 if (
Range == MaxRange)
912 ConstantRange CR(APInt(32, Min), APInt(32, Max + 1));
913 IntegerRangeState IRS(CR);
917 indicateOptimisticFixpoint();
921 return updateImplImpl<AAAMDFlatWorkGroupSize>(
A);
925 static AAAMDFlatWorkGroupSize &createForPosition(
const IRPosition &IRP,
930 auto &InfoCache =
static_cast<AMDGPUInformationCache &
>(
A.getInfoCache());
931 return emitAttributeIfNotDefaultAfterClamp(
932 A, InfoCache.getMaximumFlatWorkGroupRange(*
F));
936 StringRef
getName()
const override {
return "AAAMDFlatWorkGroupSize"; }
939 const char *getIdAddr()
const override {
return &
ID; }
943 static bool classof(
const AbstractAttribute *AA) {
948 static const char ID;
951const char AAAMDFlatWorkGroupSize::ID = 0;
953AAAMDFlatWorkGroupSize &
954AAAMDFlatWorkGroupSize::createForPosition(
const IRPosition &IRP,
957 return *
new (
A.Allocator) AAAMDFlatWorkGroupSize(IRP,
A);
959 "AAAMDFlatWorkGroupSize is only valid for function position");
962struct TupleDecIntegerRangeState :
public AbstractState {
963 DecIntegerState<uint32_t>
X,
Y, Z;
965 bool isValidState()
const override {
966 return X.isValidState() &&
Y.isValidState() &&
Z.isValidState();
969 bool isAtFixpoint()
const override {
970 return X.isAtFixpoint() &&
Y.isAtFixpoint() &&
Z.isAtFixpoint();
974 return X.indicateOptimisticFixpoint() |
Y.indicateOptimisticFixpoint() |
975 Z.indicateOptimisticFixpoint();
979 return X.indicatePessimisticFixpoint() |
Y.indicatePessimisticFixpoint() |
980 Z.indicatePessimisticFixpoint();
983 TupleDecIntegerRangeState
operator^=(
const TupleDecIntegerRangeState &
Other) {
994 TupleDecIntegerRangeState &getAssumed() {
return *
this; }
995 const TupleDecIntegerRangeState &getAssumed()
const {
return *
this; }
998using AAAMDMaxNumWorkgroupsState =
999 StateWrapper<TupleDecIntegerRangeState, AbstractAttribute, uint32_t>;
1002struct AAAMDMaxNumWorkgroups
1003 :
public StateWrapper<TupleDecIntegerRangeState, AbstractAttribute> {
1004 using Base = StateWrapper<TupleDecIntegerRangeState, AbstractAttribute>;
1006 AAAMDMaxNumWorkgroups(
const IRPosition &IRP, Attributor &
A) :
Base(IRP) {}
1010 auto &InfoCache =
static_cast<AMDGPUInformationCache &
>(
A.getInfoCache());
1012 SmallVector<unsigned> MaxNumWorkgroups = InfoCache.getMaxNumWorkGroups(*
F);
1014 X.takeKnownMinimum(MaxNumWorkgroups[0]);
1015 Y.takeKnownMinimum(MaxNumWorkgroups[1]);
1016 Z.takeKnownMinimum(MaxNumWorkgroups[2]);
1019 indicatePessimisticFixpoint();
1025 auto CheckCallSite = [&](AbstractCallSite CS) {
1028 <<
"->" << getAssociatedFunction()->
getName() <<
'\n');
1030 const auto *CallerInfo =
A.getAAFor<AAAMDMaxNumWorkgroups>(
1032 if (!CallerInfo || !CallerInfo->isValidState())
1040 bool AllCallSitesKnown =
true;
1041 if (!
A.checkForAllCallSites(CheckCallSite, *
this,
1044 return indicatePessimisticFixpoint();
1050 static AAAMDMaxNumWorkgroups &createForPosition(
const IRPosition &IRP,
1055 LLVMContext &Ctx =
F->getContext();
1056 SmallString<32> Buffer;
1057 raw_svector_ostream OS(Buffer);
1058 OS <<
X.getAssumed() <<
',' <<
Y.getAssumed() <<
',' <<
Z.getAssumed();
1062 return A.manifestAttrs(
1064 {Attribute::get(Ctx,
"amdgpu-max-num-workgroups", OS.str())},
1068 StringRef
getName()
const override {
return "AAAMDMaxNumWorkgroups"; }
1070 const std::string getAsStr(Attributor *)
const override {
1071 std::string Buffer =
"AAAMDMaxNumWorkgroupsState[";
1072 raw_string_ostream OS(Buffer);
1073 OS <<
X.getAssumed() <<
',' <<
Y.getAssumed() <<
',' <<
Z.getAssumed()
1078 const char *getIdAddr()
const override {
return &
ID; }
1082 static bool classof(
const AbstractAttribute *AA) {
1086 void trackStatistics()
const override {}
1089 static const char ID;
1092const char AAAMDMaxNumWorkgroups::ID = 0;
1094AAAMDMaxNumWorkgroups &
1095AAAMDMaxNumWorkgroups::createForPosition(
const IRPosition &IRP, Attributor &
A) {
1097 return *
new (
A.Allocator) AAAMDMaxNumWorkgroups(IRP,
A);
1098 llvm_unreachable(
"AAAMDMaxNumWorkgroups is only valid for function position");
1102struct AAAMDWavesPerEU :
public AAAMDSizeRangeAttribute {
1103 AAAMDWavesPerEU(
const IRPosition &IRP, Attributor &
A)
1104 : AAAMDSizeRangeAttribute(IRP,
A,
"amdgpu-waves-per-eu") {}
1108 auto &InfoCache =
static_cast<AMDGPUInformationCache &
>(
A.getInfoCache());
1111 if (
auto Attr = InfoCache.getWavesPerEUAttr(*
F)) {
1112 std::pair<unsigned, unsigned> MaxWavesPerEURange{
1113 1U, InfoCache.getMaxWavesPerEU(*
F)};
1114 if (*Attr != MaxWavesPerEURange) {
1115 auto [Min,
Max] = *Attr;
1116 ConstantRange
Range(APInt(32, Min), APInt(32, Max + 1));
1117 IntegerRangeState RangeState(
Range);
1118 this->getState() = RangeState;
1119 indicateOptimisticFixpoint();
1125 indicatePessimisticFixpoint();
1131 auto CheckCallSite = [&](AbstractCallSite CS) {
1135 <<
"->" <<
Func->getName() <<
'\n');
1138 const auto *CallerAA =
A.getAAFor<AAAMDWavesPerEU>(
1140 if (!CallerAA || !CallerAA->isValidState())
1143 ConstantRange Assumed = getAssumed();
1145 CallerAA->getAssumed().getLower().getZExtValue());
1147 CallerAA->getAssumed().getUpper().getZExtValue());
1148 ConstantRange
Range(APInt(32, Min), APInt(32, Max));
1149 IntegerRangeState RangeState(
Range);
1150 getState() = RangeState;
1151 Change |= getState() == Assumed ? ChangeStatus::UNCHANGED
1152 : ChangeStatus::CHANGED;
1157 bool AllCallSitesKnown =
true;
1158 if (!
A.checkForAllCallSites(CheckCallSite, *
this,
true, AllCallSitesKnown))
1159 return indicatePessimisticFixpoint();
1165 static AAAMDWavesPerEU &createForPosition(
const IRPosition &IRP,
1170 auto &InfoCache =
static_cast<AMDGPUInformationCache &
>(
A.getInfoCache());
1171 return emitAttributeIfNotDefaultAfterClamp(
1172 A, {1U, InfoCache.getMaxWavesPerEU(*
F)});
1176 StringRef
getName()
const override {
return "AAAMDWavesPerEU"; }
1179 const char *getIdAddr()
const override {
return &
ID; }
1183 static bool classof(
const AbstractAttribute *AA) {
1188 static const char ID;
1191const char AAAMDWavesPerEU::ID = 0;
1193AAAMDWavesPerEU &AAAMDWavesPerEU::createForPosition(
const IRPosition &IRP,
1196 return *
new (
A.Allocator) AAAMDWavesPerEU(IRP,
A);
1201static unsigned inlineAsmGetNumRequiredAGPRs(
const InlineAsm *IA,
1202 const CallBase &
Call) {
1205 unsigned AGPRDefCount = 0;
1206 unsigned AGPRUseCount = 0;
1207 unsigned MaxPhysReg = 0;
1211 for (
const InlineAsm::ConstraintInfo &CI :
IA->ParseConstraints()) {
1217 Ty = STy->getElementType(ResNo);
1232 for (StringRef Code : CI.Codes) {
1233 unsigned RegCount = 0;
1234 if (
Code.starts_with(
"a")) {
1245 MaxPhysReg = std::max(MaxPhysReg, std::min(RegIdx + NumRegs, 256u));
1255 AGPRDefCount =
alignTo(AGPRDefCount, RegCount);
1257 AGPRDefCount += RegCount;
1258 if (CI.isEarlyClobber) {
1259 AGPRUseCount =
alignTo(AGPRUseCount, RegCount);
1260 AGPRUseCount += RegCount;
1263 AGPRUseCount =
alignTo(AGPRUseCount, RegCount);
1264 AGPRUseCount += RegCount;
1269 unsigned MaxVirtReg = std::max(AGPRUseCount, AGPRDefCount);
1274 return std::min(MaxVirtReg + MaxPhysReg, 256u);
1277struct AAAMDGPUMinAGPRAlloc
1278 :
public StateWrapper<DecIntegerState<>, AbstractAttribute> {
1279 using Base = StateWrapper<DecIntegerState<>, AbstractAttribute>;
1280 AAAMDGPUMinAGPRAlloc(
const IRPosition &IRP, Attributor &
A) :
Base(IRP) {}
1282 static AAAMDGPUMinAGPRAlloc &createForPosition(
const IRPosition &IRP,
1285 return *
new (
A.Allocator) AAAMDGPUMinAGPRAlloc(IRP,
A);
1287 "AAAMDGPUMinAGPRAlloc is only valid for function position");
1292 auto [MinNumAGPR, MaxNumAGPR] =
1295 if (MinNumAGPR == 0)
1296 indicateOptimisticFixpoint();
1299 const std::string getAsStr(Attributor *
A)
const override {
1300 std::string Str =
"amdgpu-agpr-alloc=";
1301 raw_string_ostream OS(Str);
1306 void trackStatistics()
const override {}
1309 DecIntegerState<> Maximum;
1316 const Value *CalleeOp = CB.getCalledOperand();
1321 unsigned NumRegs = inlineAsmGetNumRequiredAGPRs(IA, CB);
1325 switch (CB.getIntrinsicID()) {
1328 case Intrinsic::write_register:
1329 case Intrinsic::read_register:
1330 case Intrinsic::read_volatile_register: {
1335 auto [
Kind, RegIdx, NumRegs] =
1349 case Intrinsic::trap:
1350 case Intrinsic::debugtrap:
1351 case Intrinsic::ubsantrap:
1352 return CB.hasFnAttr(Attribute::NoCallback) ||
1353 !CB.hasFnAttr(
"trap-func-name");
1359 return CB.hasFnAttr(Attribute::NoCallback);
1363 auto *CBEdges =
A.getAAFor<AACallEdges>(
1365 if (!CBEdges || CBEdges->hasUnknownCallee()) {
1370 for (
const Function *PossibleCallee : CBEdges->getOptimisticEdges()) {
1371 const auto *CalleeInfo =
A.getAAFor<AAAMDGPUMinAGPRAlloc>(
1373 if (!CalleeInfo || !CalleeInfo->isValidState()) {
1384 bool UsedAssumedInformation =
false;
1385 if (!
A.checkForAllCallLikeInstructions(CheckForMinAGPRAllocs, *
this,
1386 UsedAssumedInformation))
1387 return indicatePessimisticFixpoint();
1393 LLVMContext &Ctx = getAssociatedFunction()->getContext();
1394 SmallString<4> Buffer;
1395 raw_svector_ostream OS(Buffer);
1398 return A.manifestAttrs(
1399 getIRPosition(), {Attribute::get(Ctx,
"amdgpu-agpr-alloc", OS.str())});
1402 StringRef
getName()
const override {
return "AAAMDGPUMinAGPRAlloc"; }
1403 const char *getIdAddr()
const override {
return &
ID; }
1407 static bool classof(
const AbstractAttribute *AA) {
1411 static const char ID;
1414const char AAAMDGPUMinAGPRAlloc::ID = 0;
1418struct AAAMDGPUClusterDims
1419 :
public StateWrapper<BooleanState, AbstractAttribute> {
1420 using Base = StateWrapper<BooleanState, AbstractAttribute>;
1421 AAAMDGPUClusterDims(
const IRPosition &IRP, Attributor &
A) :
Base(IRP) {}
1424 static AAAMDGPUClusterDims &createForPosition(
const IRPosition &IRP,
1428 StringRef
getName()
const override {
return "AAAMDGPUClusterDims"; }
1431 const char *getIdAddr()
const override {
return &
ID; }
1435 static bool classof(
const AbstractAttribute *AA) {
1439 virtual const AMDGPU::ClusterDimsAttr &getClusterDims()
const = 0;
1442 static const char ID;
1445const char AAAMDGPUClusterDims::ID = 0;
1447struct AAAMDGPUClusterDimsFunction :
public AAAMDGPUClusterDims {
1448 AAAMDGPUClusterDimsFunction(
const IRPosition &IRP, Attributor &
A)
1449 : AAAMDGPUClusterDims(IRP,
A) {}
1453 assert(
F &&
"empty associated function");
1460 indicatePessimisticFixpoint();
1462 indicateOptimisticFixpoint();
1466 const std::string getAsStr(Attributor *
A)
const override {
1476 void trackStatistics()
const override {}
1479 auto OldState = Attr;
1481 auto CheckCallSite = [&](AbstractCallSite CS) {
1482 const auto *CallerAA =
A.getAAFor<AAAMDGPUClusterDims>(
1484 DepClassTy::REQUIRED);
1485 if (!CallerAA || !CallerAA->isValidState())
1488 return merge(CallerAA->getClusterDims());
1491 bool UsedAssumedInformation =
false;
1492 if (!
A.checkForAllCallSites(CheckCallSite, *
this,
1494 UsedAssumedInformation))
1495 return indicatePessimisticFixpoint();
1497 return OldState == Attr ? ChangeStatus::UNCHANGED : ChangeStatus::CHANGED;
1502 return ChangeStatus::UNCHANGED;
1503 return A.manifestAttrs(
1505 {Attribute::get(getAssociatedFunction()->
getContext(), AttrName,
1510 const AMDGPU::ClusterDimsAttr &getClusterDims()
const override {
1515 bool merge(
const AMDGPU::ClusterDimsAttr &
Other) {
1530 if (
Other.isUnknown())
1555 AMDGPU::ClusterDimsAttr Attr;
1557 static constexpr char AttrName[] =
"amdgpu-cluster-dims";
1560AAAMDGPUClusterDims &
1561AAAMDGPUClusterDims::createForPosition(
const IRPosition &IRP, Attributor &
A) {
1563 return *
new (
A.Allocator) AAAMDGPUClusterDimsFunction(IRP,
A);
1564 llvm_unreachable(
"AAAMDGPUClusterDims is only valid for function position");
1567static bool runImpl(SetVector<Function *> &Functions,
bool IsModulePass,
1568 bool DeleteFns,
Module &M, AnalysisGetter &AG,
1569 TargetMachine &TM, AMDGPUAttributorOptions
Options,
1572 CallGraphUpdater CGUpdater;
1574 AMDGPUInformationCache InfoCache(M, AG,
Allocator,
nullptr, TM);
1575 DenseSet<const char *>
Allowed(
1576 {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
1578 &AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID,
1584 AttributorConfig AC(CGUpdater);
1585 AC.IsClosedWorldModule =
Options.IsClosedWorld;
1587 AC.IsModulePass = IsModulePass;
1588 AC.DeleteFns = DeleteFns;
1589 AC.DefaultInitializeLiveInternals =
false;
1590 AC.IndirectCalleeSpecializationCallback =
1591 [](Attributor &
A,
const AbstractAttribute &AA, CallBase &CB,
1596 AC.IPOAmendableCB = [](
const Function &
F) {
1597 return F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
1600 Attributor
A(Functions, InfoCache, AC);
1603 StringRef LTOPhaseStr =
to_string(LTOPhase);
1604 dbgs() <<
"[AMDGPUAttributor] Running at phase " << LTOPhaseStr <<
'\n'
1605 <<
"[AMDGPUAttributor] Module " <<
M.getName() <<
" is "
1606 << (AC.IsClosedWorldModule ?
"" :
"not ")
1607 <<
"assumed to be a closed world.\n";
1610 for (
auto *
F : Functions) {
1614 CallingConv::ID CC =
F->getCallingConv();
1621 if (!
F->isDeclaration() &&
ST.hasClusters())
1624 if (
ST.hasGFX90AInsts())
1628 Value *Ptr =
nullptr;
1630 Ptr = LI->getPointerOperand();
1632 Ptr =
SI->getPointerOperand();
1634 Ptr = RMW->getPointerOperand();
1636 Ptr = CmpX->getPointerOperand();
1642 if (
II->getIntrinsicID() == Intrinsic::amdgcn_make_buffer_rsrc)
1649 return A.run() == ChangeStatus::CHANGED;
1662 if (!
F.isIntrinsic())
1663 Functions.insert(&
F);
1667 return runImpl(Functions,
true,
true, M, AG,
1668 TM, Options, LTOPhase)
1685 if (!
F->isIntrinsic())
1686 Functions.insert(
F);
1690 Module *M =
C.begin()->getFunction().getParent();
1693 return runImpl(Functions,
false,
false, *M, AG,
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static bool isDSAddress(const Constant *C)
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
static cl::opt< unsigned > IndirectCallSpecializationThreshold("amdgpu-indirect-call-specialization-threshold", cl::desc("A threshold controls whether an indirect call will be specialized"), cl::init(3))
static ImplicitArgumentMask intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit, bool HasApertureRegs, bool SupportsGetDoorBellID, unsigned CodeObjectVersion)
static bool hasSanitizerAttributes(const Function &F)
Returns true if sanitizer attributes are present on a function.
ImplicitArgumentPositions
static bool castRequiresQueuePtr(unsigned SrcAS)
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Expand Atomic instructions
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static bool runImpl(Function &F, const TargetLowering &TLI, const LibcallLoweringInfo &Libcalls, AssumptionCache *AC)
AMD GCN specific subclass of TargetSubtarget.
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
Machine Check Debug Module
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
FunctionAnalysisManager FAM
static StringRef getName(Value *V)
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T, const llvm::StringTable &StandardNames, VectorLibrary VecLib)
Initialize the set of available library functions based on the specified target triple.
PreservedAnalyses run(LazyCallGraph::SCC &C, CGSCCAnalysisManager &AM, LazyCallGraph &CG, CGSCCUpdateResult &UR)
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
static ClusterDimsAttr get(const Function &F)
std::string to_string() const
bool isVariableDims() const
uint64_t getZExtValue() const
Get zero extended value.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Value * getArgOperand(unsigned i) const
LLVM_ABI Intrinsic::ID getIntrinsicID() const
Returns the intrinsic ID of the intrinsic called or Intrinsic::not_intrinsic if the called function i...
const APInt & getLower() const
Return the lower value for this range.
const APInt & getUpper() const
Return the upper value for this range.
This is an important base class in LLVM.
A proxy from a FunctionAnalysisManager to an SCC.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this function.
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
A node in the call graph.
An SCC of the call graph.
A lazily constructed view of the call graph of a module.
A Module instance is used to store all the information related to an LLVM module.
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
A vector that has set insertion semantics.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
void push_back(const T &Elt)
std::string str() const
str - Get the contents as an std::string.
const STC & getSubtarget(const Function &F) const
This method returns a pointer to the specified type of TargetSubtargetInfo.
LLVM_ABI bool isDroppable() const
A droppable user is a user for which uses can be dropped without affecting correctness and should be dropped rather than preventing a transformation from happening.
Type * getType() const
All values are typed, get the type of this value.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ PRIVATE_ADDRESS
Address space for private memory.
unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI)
unsigned getAMDHSACodeObjectVersion(const Module &M)
unsigned getDefaultQueueImplicitArgPosition(unsigned CodeObjectVersion)
std::tuple< char, unsigned, unsigned > parseAsmPhysRegName(StringRef RegName)
Returns a valid charcode or 0 in the first entry if this is a valid physical register name.
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid charcode or 0 in the first entry if this is a valid physical register constraint.
unsigned getHostcallImplicitArgPosition(unsigned CodeObjectVersion)
unsigned getCompletionActionImplicitArgPosition(unsigned CodeObjectVersion)
std::pair< unsigned, unsigned > getIntegerPairAttribute(const Function &F, StringRef Name, std::pair< unsigned, unsigned > Default, bool OnlyFirstRequired)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
unsigned getMultigridSyncArgImplicitArgPosition(unsigned CodeObjectVersion)
E & operator^=(E &LHS, E RHS)
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
@ C
The default llvm calling convention, compatible with C.
@ CE
Windows NT (Windows on ARM)
initializer< Ty > init(const Ty &Val)
NodeAddr< CodeNode * > Code
NodeAddr< FuncNode * > Func
Context & getContext() const
friend class Instruction
Iterator for Instructions in a BasicBlock.
This is an optimization pass for GlobalISel generic memory operations.
FunctionAddr VTableAddr Value
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
InnerAnalysisManagerProxy< FunctionAnalysisManager, Module > FunctionAnalysisManagerModuleProxy
Provide the FunctionAnalysisManager to Module proxy.
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
AnalysisManager< LazyCallGraph::SCC, LazyCallGraph & > CGSCCAnalysisManager
The CGSCC analysis manager.
ThinOrFullLTOPhase
This enumerates the LLVM full LTO or ThinLTO optimization phases.
@ None
No LTO/ThinLTO behavior needed.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
const char * to_string(ThinOrFullLTOPhase Phase)
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
ChangeStatus clampStateAndIndicateChange(StateType &S, const StateType &R)
Helper function to clamp a state S of type StateType with the information in R and indicate/return if...
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
BumpPtrAllocatorImpl<> BumpPtrAllocator
The standard BumpPtrAllocator which just uses the default template parameters.
AnalysisManager< Module > ModuleAnalysisManager
Convenience typedef for the Module analysis manager.
static LLVM_ABI const char ID
Unique ID (due to the unique address)
static LLVM_ABI const char ID
Unique ID (due to the unique address)
virtual const SetVector< Function * > & getOptimisticEdges() const =0
Get the optimistic edges.
static LLVM_ABI const char ID
Unique ID (due to the unique address)
virtual bool hasNonAsmUnknownCallee() const =0
Is there any call with a unknown callee, excluding any inline asm.
static LLVM_ABI const char ID
Unique ID (due to the unique address)
static LLVM_ABI const char ID
Unique ID (due to the unique address)
Instruction * getRemoteInst() const
Return the actual instruction that causes the access.
static LLVM_ABI const char ID
Unique ID (due to the unique address)
static LLVM_ABI const char ID
Unique ID (due to the unique address)
static LLVM_ABI const char ID
Unique ID (due to the unique address)
static LLVM_ABI const char ID
Unique ID (due to the unique address)
virtual const char * getIdAddr() const =0
This function should return the address of the ID of the AbstractAttribute.
Wrapper for FunctionAnalysisManager.
The fixpoint analysis framework that orchestrates the attribute deduction.
Support structure for SCC passes to communicate updates the call graph back to the CGSCC pass manager...
DecIntegerState & takeAssumedMaximum(base_t Value)
Take maximum of assumed and Value.
Helper to describe and deal with positions in the LLVM-IR.
static const IRPosition callsite_returned(const CallBase &CB)
Create a position describing the returned value of CB.
static const IRPosition value(const Value &V, const CallBaseContext *CBContext=nullptr)
Create a position describing the value of V.
@ IRP_FUNCTION
An attribute for a function (scope).
static const IRPosition function(const Function &F, const CallBaseContext *CBContext=nullptr)
Create a position describing the function scope of F.
Kind getPositionKind() const
Return the associated position kind.
static const IRPosition callsite_function(const CallBase &CB)
Create a position describing the function scope of CB.
bool isValidState() const override
See AbstractState::isValidState() NOTE: For now we simply pretend that the worst possible state is invalid.
ChangeStatus indicatePessimisticFixpoint() override
See AbstractState::indicatePessimisticFixpoint(...)
Helper to tie a abstract state implementation to an abstract attribute.