47#define DEBUG_TYPE "si-insert-waitcnts"
50 "Force emit s_waitcnt expcnt(0) instrs");
52 "Force emit s_waitcnt lgkmcnt(0) instrs");
54 "Force emit s_waitcnt vmcnt(0) instrs");
58 cl::desc(
"Force all waitcnt instrs to be emitted as "
59 "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
63 "amdgpu-waitcnt-load-forcezero",
64 cl::desc(
"Force all waitcnt load counters to wait until 0"),
68 "amdgpu-expert-scheduling-mode",
69 cl::desc(
"Enable expert scheduling mode 2 for all functions (GFX12+ only)"),
117 TRACKINGID_RANGE_LEN = (1 << 16),
122 REGUNITS_END = REGUNITS_BEGIN + TRACKINGID_RANGE_LEN,
127 NUM_LDSDMA = TRACKINGID_RANGE_LEN,
128 LDSDMA_BEGIN = REGUNITS_END,
129 LDSDMA_END = LDSDMA_BEGIN + NUM_LDSDMA,
133static constexpr VMEMID toVMEMID(MCRegUnit RU) {
134 return static_cast<unsigned>(RU);
137#define AMDGPU_DECLARE_WAIT_EVENTS(DECL) \
139 DECL(VMEM_SAMPLER_READ_ACCESS) \
140 DECL(VMEM_BVH_READ_ACCESS) \
141 DECL(GLOBAL_INV_ACCESS) \
142 DECL(VMEM_WRITE_ACCESS) \
143 DECL(SCRATCH_WRITE_ACCESS) \
153 DECL(EXP_POS_ACCESS) \
154 DECL(EXP_PARAM_ACCESS) \
156 DECL(EXP_LDS_ACCESS) \
157 DECL(VGPR_CSMACC_WRITE) \
158 DECL(VGPR_DPMACC_WRITE) \
159 DECL(VGPR_TRANS_WRITE) \
160 DECL(VGPR_XDL_WRITE) \
161 DECL(VGPR_LDS_READ) \
162 DECL(VGPR_FLAT_READ) \
166#define AMDGPU_EVENT_ENUM(Name) Name,
171#undef AMDGPU_EVENT_ENUM
185auto wait_events(WaitEventType MaxEvent = NUM_WAIT_EVENTS) {
186 return enum_seq(VMEM_ACCESS, MaxEvent);
189#define AMDGPU_EVENT_NAME(Name) #Name,
193#undef AMDGPU_EVENT_NAME
194static constexpr StringLiteral getWaitEventTypeName(WaitEventType Event) {
195 return WaitEventTypeName[
Event];
218 AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT,
219 AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
220 AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT};
234 assert(updateVMCntOnly(Inst));
236 return VMEM_NOSAMPLER;
250 return VMEM_NOSAMPLER;
264 WaitEventSet() =
default;
265 explicit constexpr WaitEventSet(WaitEventType Event) {
266 static_assert(NUM_WAIT_EVENTS <=
sizeof(Mask) * 8,
267 "Not enough bits in Mask for all the events");
270 constexpr WaitEventSet(std::initializer_list<WaitEventType> Events) {
271 for (
auto &
E : Events) {
275 void insert(
const WaitEventType &Event) { Mask |= 1 <<
Event; }
276 void remove(
const WaitEventType &Event) { Mask &= ~(1 <<
Event); }
277 void remove(
const WaitEventSet &
Other) { Mask &= ~Other.Mask; }
278 bool contains(
const WaitEventType &Event)
const {
279 return Mask & (1 <<
Event);
283 return (~Mask &
Other.Mask) == 0;
308 return Mask ==
Other.Mask;
311 bool empty()
const {
return Mask == 0; }
313 bool twoOrMore()
const {
return Mask & (Mask - 1); }
314 operator bool()
const {
return !
empty(); }
315 void print(raw_ostream &OS)
const {
316 ListSeparator
LS(
", ");
317 for (WaitEventType Event : wait_events()) {
319 OS <<
LS << getWaitEventTypeName(Event);
325void WaitEventSet::dump()
const {
330class WaitcntBrackets;
338class WaitcntGenerator {
340 const GCNSubtarget &ST;
341 const SIInstrInfo &
TII;
342 AMDGPU::IsaVersion
IV;
345 bool ExpandWaitcntProfiling =
false;
346 const AMDGPU::HardwareLimits *Limits =
nullptr;
349 WaitcntGenerator() =
delete;
350 WaitcntGenerator(
const WaitcntGenerator &) =
delete;
351 WaitcntGenerator(
const MachineFunction &MF,
InstCounterType MaxCounter,
352 const AMDGPU::HardwareLimits *Limits)
353 :
ST(MF.getSubtarget<GCNSubtarget>()),
TII(*
ST.getInstrInfo()),
357 ExpandWaitcntProfiling(
358 MF.
getFunction().hasFnAttribute(
"amdgpu-expand-waitcnt-profiling")),
363 bool isOptNone()
const {
return OptNone; }
365 const AMDGPU::HardwareLimits &getLimits()
const {
return *Limits; }
379 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
380 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &
Wait,
384 bool promoteSoftWaitCnt(MachineInstr *Waitcnt)
const;
389 virtual bool createNewWaitcnt(MachineBasicBlock &
Block,
391 AMDGPU::Waitcnt
Wait,
392 const WaitcntBrackets &ScoreBrackets) = 0;
408 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(
bool IncludeVSCnt)
const = 0;
410 virtual ~WaitcntGenerator() =
default;
413class WaitcntGeneratorPreGFX12 final :
public WaitcntGenerator {
414 static constexpr const WaitEventSet
417 {VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS}),
418 WaitEventSet({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
419 WaitEventSet({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK,
420 EXP_PARAM_ACCESS, EXP_POS_ACCESS, EXP_LDS_ACCESS}),
421 WaitEventSet({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
430 using WaitcntGenerator::WaitcntGenerator;
432 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
433 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &
Wait,
436 bool createNewWaitcnt(MachineBasicBlock &
Block,
438 AMDGPU::Waitcnt
Wait,
439 const WaitcntBrackets &ScoreBrackets)
override;
442 return WaitEventMaskForInstPreGFX12[
T];
445 AMDGPU::Waitcnt getAllZeroWaitcnt(
bool IncludeVSCnt)
const override;
448class WaitcntGeneratorGFX12Plus final :
public WaitcntGenerator {
451 static constexpr const WaitEventSet
453 WaitEventSet({VMEM_ACCESS, GLOBAL_INV_ACCESS}),
454 WaitEventSet({LDS_ACCESS, GDS_ACCESS}),
455 WaitEventSet({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK,
456 EXP_PARAM_ACCESS, EXP_POS_ACCESS, EXP_LDS_ACCESS}),
457 WaitEventSet({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
458 WaitEventSet({VMEM_SAMPLER_READ_ACCESS}),
459 WaitEventSet({VMEM_BVH_READ_ACCESS}),
460 WaitEventSet({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}),
461 WaitEventSet({VMEM_GROUP, SMEM_GROUP}),
462 WaitEventSet({VGPR_CSMACC_WRITE, VGPR_DPMACC_WRITE, VGPR_TRANS_WRITE,
464 WaitEventSet({VGPR_LDS_READ, VGPR_FLAT_READ, VGPR_VMEM_READ})};
467 WaitcntGeneratorGFX12Plus() =
delete;
468 WaitcntGeneratorGFX12Plus(
const MachineFunction &MF,
470 const AMDGPU::HardwareLimits *Limits,
472 : WaitcntGenerator(MF, MaxCounter, Limits), IsExpertMode(IsExpertMode) {}
475 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
476 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &
Wait,
479 bool createNewWaitcnt(MachineBasicBlock &
Block,
481 AMDGPU::Waitcnt
Wait,
482 const WaitcntBrackets &ScoreBrackets)
override;
485 return WaitEventMaskForInstGFX12Plus[
T];
488 AMDGPU::Waitcnt getAllZeroWaitcnt(
bool IncludeVSCnt)
const override;
492struct PreheaderFlushFlags {
493 bool FlushVmCnt =
false;
494 bool FlushDsCnt =
false;
497class SIInsertWaitcnts {
499 const GCNSubtarget *
ST;
500 const SIInstrInfo *
TII =
nullptr;
501 const SIRegisterInfo *
TRI =
nullptr;
502 const MachineRegisterInfo *MRI =
nullptr;
505 bool IsExpertMode =
false;
508 DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
509 DenseMap<MachineBasicBlock *, PreheaderFlushFlags> PreheadersToFlush;
510 MachineLoopInfo *MLI;
511 MachinePostDominatorTree *PDT;
515 std::unique_ptr<WaitcntBrackets> Incoming;
519 MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
523 std::unique_ptr<WaitcntGenerator> WCG;
526 DenseSet<MachineInstr *> CallInsts;
527 DenseSet<MachineInstr *> ReturnInsts;
532 DenseMap<MachineInstr *, bool> EndPgmInsts;
534 AMDGPU::HardwareLimits Limits;
537 SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT,
539 : MLI(MLI), PDT(PDT), AA(AA) {
540 (void)ForceExpCounter;
541 (void)ForceLgkmCounter;
542 (void)ForceVMCounter;
545 const AMDGPU::HardwareLimits &getLimits()
const {
return Limits; }
547 PreheaderFlushFlags getPreheaderFlushFlags(MachineLoop *
ML,
548 const WaitcntBrackets &Brackets);
549 PreheaderFlushFlags isPreheaderToFlush(MachineBasicBlock &
MBB,
550 const WaitcntBrackets &ScoreBrackets);
551 bool isVMEMOrFlatVMEM(
const MachineInstr &
MI)
const;
552 bool isDSRead(
const MachineInstr &
MI)
const;
553 bool mayStoreIncrementingDSCNT(
const MachineInstr &
MI)
const;
554 bool run(MachineFunction &MF);
556 void setForceEmitWaitcnt() {
562 ForceEmitWaitcnt[
EXP_CNT] =
true;
564 ForceEmitWaitcnt[
EXP_CNT] =
false;
569 ForceEmitWaitcnt[
DS_CNT] =
true;
570 ForceEmitWaitcnt[
KM_CNT] =
true;
572 ForceEmitWaitcnt[
DS_CNT] =
false;
573 ForceEmitWaitcnt[
KM_CNT] =
false;
580 ForceEmitWaitcnt[
BVH_CNT] =
true;
584 ForceEmitWaitcnt[
BVH_CNT] =
false;
587 ForceEmitWaitcnt[
VA_VDST] =
false;
588 ForceEmitWaitcnt[
VM_VSRC] =
false;
594 WaitEventType getVmemWaitEventType(
const MachineInstr &Inst)
const {
597 case AMDGPU::GLOBAL_INV:
598 return GLOBAL_INV_ACCESS;
600 case AMDGPU::GLOBAL_WB:
601 case AMDGPU::GLOBAL_WBINV:
602 return VMEM_WRITE_ACCESS;
608 static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
609 VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
618 if (
TII->mayAccessScratch(Inst))
619 return SCRATCH_WRITE_ACCESS;
620 return VMEM_WRITE_ACCESS;
624 return VmemReadMapping[getVmemType(Inst)];
627 std::optional<WaitEventType>
628 getExpertSchedulingEventType(
const MachineInstr &Inst)
const;
630 bool isAsync(
const MachineInstr &
MI)
const {
635 const MachineOperand *
Async =
636 TII->getNamedOperand(
MI, AMDGPU::OpName::IsAsync);
640 bool isNonAsyncLdsDmaWrite(
const MachineInstr &
MI)
const {
644 bool isAsyncLdsDmaWrite(
const MachineInstr &
MI)
const {
648 bool isVmemAccess(
const MachineInstr &
MI)
const;
649 bool generateWaitcntInstBefore(MachineInstr &
MI,
650 WaitcntBrackets &ScoreBrackets,
651 MachineInstr *OldWaitcntInstr,
652 PreheaderFlushFlags FlushFlags);
653 bool generateWaitcnt(AMDGPU::Waitcnt
Wait,
655 MachineBasicBlock &
Block, WaitcntBrackets &ScoreBrackets,
656 MachineInstr *OldWaitcntInstr);
658 WaitEventSet getEventsFor(
const MachineInstr &Inst)
const;
659 void updateEventWaitcntAfter(MachineInstr &Inst,
660 WaitcntBrackets *ScoreBrackets);
662 MachineBasicBlock *
Block)
const;
663 bool insertForcedWaitAfter(MachineInstr &Inst, MachineBasicBlock &
Block,
664 WaitcntBrackets &ScoreBrackets);
665 bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &
Block,
666 WaitcntBrackets &ScoreBrackets);
669 bool removeRedundantSoftXcnts(MachineBasicBlock &
Block);
671 bool ExpertMode)
const;
673 return WCG->getWaitEvents(
T);
676 return WCG->getCounterFromEvent(
E);
688class WaitcntBrackets {
696 unsigned NumUnusedVmem = 0, NumUnusedSGPRs = 0;
697 for (
auto &[
ID, Val] : VMem) {
701 for (
auto &[
ID, Val] : SGPRs) {
706 if (NumUnusedVmem || NumUnusedSGPRs) {
707 errs() <<
"WaitcntBracket had unused entries at destruction time: "
708 << NumUnusedVmem <<
" VMem and " << NumUnusedSGPRs
709 <<
" SGPR unused entries\n";
720 assert(isSmemCounter(
T) &&
"Invalid SMEM counter");
721 return T ==
X_CNT ? 1 : 0;
725 return ScoreUBs[
T] - ScoreLBs[
T];
729 return getVMemScore(
ID,
T) > getScoreLB(
T);
747 return getScoreUB(
T) - getScoreLB(
T);
751 auto It = SGPRs.find(RU);
752 return It != SGPRs.end() ? It->second.Scores[getSgprScoresIdx(
T)] : 0;
756 auto It = VMem.find(TID);
757 return It != VMem.end() ? It->second.Scores[
T] : 0;
764 void simplifyWaitcnt(AMDGPU::Waitcnt &
Wait)
const {
767 void simplifyWaitcnt(
const AMDGPU::Waitcnt &CheckWait,
768 AMDGPU::Waitcnt &UpdateWait)
const;
771 void simplifyXcnt(
const AMDGPU::Waitcnt &CheckWait,
772 AMDGPU::Waitcnt &UpdateWait)
const;
773 void simplifyVmVsrc(
const AMDGPU::Waitcnt &CheckWait,
774 AMDGPU::Waitcnt &UpdateWait)
const;
777 AMDGPU::Waitcnt &
Wait)
const;
779 AMDGPU::Waitcnt &
Wait)
const;
780 AMDGPU::Waitcnt determineAsyncWait(
unsigned N);
781 void tryClearSCCWriteEvent(MachineInstr *Inst);
783 void applyWaitcnt(
const AMDGPU::Waitcnt &
Wait);
786 void updateByEvent(WaitEventType
E, MachineInstr &
MI);
787 void recordAsyncMark(MachineInstr &
MI);
789 bool hasPendingEvent()
const {
return !PendingEvents.empty(); }
790 bool hasPendingEvent(WaitEventType
E)
const {
791 return PendingEvents.contains(
E);
794 bool HasPending = PendingEvents &
Context->getWaitEvents(
T);
796 "Expected pending events iff scoreboard is not empty");
801 WaitEventSet Events = PendingEvents &
Context->getWaitEvents(
T);
803 return Events.twoOrMore();
806 bool hasPendingFlat()
const {
813 void setPendingFlat() {
818 bool hasPendingGDS()
const {
819 return LastGDS > ScoreLBs[
DS_CNT] && LastGDS <= ScoreUBs[
DS_CNT];
822 unsigned getPendingGDSWait()
const {
823 return std::min(getScoreUB(
DS_CNT) - LastGDS,
827 void setPendingGDS() { LastGDS = ScoreUBs[
DS_CNT]; }
831 bool hasOtherPendingVmemTypes(
MCPhysReg Reg, VmemType V)
const {
832 for (MCRegUnit RU : regunits(
Reg)) {
833 auto It = VMem.find(toVMEMID(RU));
834 if (It != VMem.end() && (It->second.VMEMTypes & ~(1 << V)))
841 for (MCRegUnit RU : regunits(
Reg)) {
842 if (
auto It = VMem.find(toVMEMID(RU)); It != VMem.end()) {
843 It->second.VMEMTypes = 0;
844 if (It->second.empty())
850 void setStateOnFunctionEntryOrReturn() {
856 ArrayRef<const MachineInstr *> getLDSDMAStores()
const {
860 bool hasPointSampleAccel(
const MachineInstr &
MI)
const;
861 bool hasPointSamplePendingVmemTypes(
const MachineInstr &
MI,
864 void print(raw_ostream &)
const;
869 void purgeEmptyTrackingData();
879 using CounterValueArray = std::array<unsigned, NUM_INST_CNTS>;
882 AMDGPU::Waitcnt &
Wait)
const;
884 static bool mergeScore(
const MergeInfo &M,
unsigned &Score,
885 unsigned OtherScore);
890 assert(
Reg != AMDGPU::SCC &&
"Shouldn't be used on SCC");
893 const TargetRegisterClass *RC =
Context->TRI->getPhysRegBaseClass(
Reg);
894 unsigned Size =
Context->TRI->getRegSizeInBits(*RC);
895 if (
Size == 16 &&
Context->ST->hasD16Writes32BitVgpr())
919 if (
Reg == AMDGPU::SCC) {
922 for (MCRegUnit RU : regunits(
Reg))
923 VMem[toVMEMID(RU)].Scores[
T] = Val;
925 auto STy = getSgprScoresIdx(
T);
926 for (MCRegUnit RU : regunits(
Reg))
927 SGPRs[RU].Scores[STy] = Val;
934 VMem[TID].Scores[
T] = Val;
940 const SIInsertWaitcnts *
Context;
944 WaitEventSet PendingEvents;
948 unsigned LastGDS = 0;
965 CounterValueArray Scores{};
967 unsigned VMEMTypes = 0;
977 std::array<unsigned, 2> Scores = {0};
979 bool empty()
const {
return !Scores[0] && !Scores[1]; }
982 DenseMap<VMEMID, VMEMInfo> VMem;
983 DenseMap<MCRegUnit, SGPRInfo> SGPRs;
986 unsigned SCCScore = 0;
988 const MachineInstr *PendingSCCWrite =
nullptr;
992 SmallVector<const MachineInstr *> LDSDMAStores;
1001 static constexpr unsigned MaxAsyncMarks = 16;
1005 CounterValueArray AsyncScore{};
1008class SIInsertWaitcntsLegacy :
public MachineFunctionPass {
1011 SIInsertWaitcntsLegacy() : MachineFunctionPass(
ID) {}
1013 bool runOnMachineFunction(MachineFunction &MF)
override;
1015 StringRef getPassName()
const override {
1016 return "SI insert wait instructions";
1019 void getAnalysisUsage(AnalysisUsage &AU)
const override {
1022 AU.
addRequired<MachinePostDominatorTreeWrapperPass>();
1031void WaitcntBrackets::setScoreByOperand(
const MachineOperand &
Op,
1033 setRegScore(
Op.getReg().asMCReg(), CntTy, Score);
1041bool WaitcntBrackets::hasPointSampleAccel(
const MachineInstr &
MI)
const {
1046 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
1056bool WaitcntBrackets::hasPointSamplePendingVmemTypes(
const MachineInstr &
MI,
1058 if (!hasPointSampleAccel(
MI))
1061 return hasOtherPendingVmemTypes(
Reg, VMEM_NOSAMPLER);
1064void WaitcntBrackets::updateByEvent(WaitEventType
E, MachineInstr &Inst) {
1068 unsigned UB = getScoreUB(
T);
1069 unsigned CurrScore = UB + 1;
1075 PendingEvents.insert(
E);
1076 setScoreUB(
T, CurrScore);
1079 const MachineRegisterInfo *MRI =
Context->MRI;
1088 if (
const auto *AddrOp =
TII->getNamedOperand(Inst, AMDGPU::OpName::addr))
1089 setScoreByOperand(*AddrOp,
EXP_CNT, CurrScore);
1092 if (
const auto *Data0 =
1093 TII->getNamedOperand(Inst, AMDGPU::OpName::data0))
1094 setScoreByOperand(*Data0,
EXP_CNT, CurrScore);
1095 if (
const auto *Data1 =
1096 TII->getNamedOperand(Inst, AMDGPU::OpName::data1))
1097 setScoreByOperand(*Data1,
EXP_CNT, CurrScore);
1099 Inst.
getOpcode() != AMDGPU::DS_APPEND &&
1100 Inst.
getOpcode() != AMDGPU::DS_CONSUME &&
1101 Inst.
getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
1102 for (
const MachineOperand &
Op : Inst.
all_uses()) {
1103 if (
TRI->isVectorRegister(*MRI,
Op.getReg()))
1104 setScoreByOperand(
Op,
EXP_CNT, CurrScore);
1107 }
else if (
TII->isFLAT(Inst)) {
1109 setScoreByOperand(*
TII->getNamedOperand(Inst, AMDGPU::OpName::data),
1112 setScoreByOperand(*
TII->getNamedOperand(Inst, AMDGPU::OpName::data),
1115 }
else if (
TII->isMIMG(Inst)) {
1119 setScoreByOperand(*
TII->getNamedOperand(Inst, AMDGPU::OpName::data),
1122 }
else if (
TII->isMTBUF(Inst)) {
1125 }
else if (
TII->isMUBUF(Inst)) {
1129 setScoreByOperand(*
TII->getNamedOperand(Inst, AMDGPU::OpName::data),
1132 }
else if (
TII->isLDSDIR(Inst)) {
1134 setScoreByOperand(*
TII->getNamedOperand(Inst, AMDGPU::OpName::vdst),
1137 if (
TII->isEXP(Inst)) {
1142 for (MachineOperand &DefMO : Inst.
all_defs()) {
1143 if (
TRI->isVGPR(*MRI, DefMO.getReg())) {
1144 setScoreByOperand(DefMO,
EXP_CNT, CurrScore);
1148 for (
const MachineOperand &
Op : Inst.
all_uses()) {
1149 if (
TRI->isVectorRegister(*MRI,
Op.getReg()))
1150 setScoreByOperand(
Op,
EXP_CNT, CurrScore);
1154 WaitEventType OtherEvent =
E == SMEM_GROUP ? VMEM_GROUP : SMEM_GROUP;
1155 if (PendingEvents.contains(OtherEvent)) {
1160 setScoreLB(
T, getScoreUB(
T) - 1);
1161 PendingEvents.remove(OtherEvent);
1163 for (
const MachineOperand &
Op : Inst.
all_uses())
1164 setScoreByOperand(
Op,
T, CurrScore);
1168 for (
const MachineOperand &
Op : Inst.
operands()) {
1173 setScoreByOperand(
Op,
T, CurrScore);
1185 for (
const MachineOperand &
Op : Inst.
defs()) {
1187 if (!
TRI->isVectorRegister(*MRI,
Op.getReg()))
1189 if (updateVMCntOnly(Inst)) {
1194 VmemType
V = getVmemType(Inst);
1195 unsigned char TypesMask = 1 <<
V;
1198 if (hasPointSampleAccel(Inst))
1199 TypesMask |= 1 << VMEM_NOSAMPLER;
1200 for (MCRegUnit RU : regunits(
Op.getReg().asMCReg()))
1201 VMem[toVMEMID(RU)].VMEMTypes |= TypesMask;
1204 setScoreByOperand(
Op,
T, CurrScore);
1207 (
TII->isDS(Inst) ||
Context->isNonAsyncLdsDmaWrite(Inst))) {
1216 if (!MemOp->isStore() ||
1221 auto AAI = MemOp->getAAInfo();
1227 if (!AAI || !AAI.Scope)
1229 for (
unsigned I = 0,
E = LDSDMAStores.
size();
I !=
E && !Slot; ++
I) {
1230 for (
const auto *MemOp : LDSDMAStores[
I]->memoperands()) {
1231 if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
1246 setVMemScore(LDSDMA_BEGIN,
T, CurrScore);
1247 if (Slot && Slot < NUM_LDSDMA)
1248 setVMemScore(LDSDMA_BEGIN + Slot,
T, CurrScore);
1256 "unexpected GFX1250 instruction");
1257 AsyncScore[
T] = CurrScore;
1261 setRegScore(AMDGPU::SCC,
T, CurrScore);
1262 PendingSCCWrite = &Inst;
1267void WaitcntBrackets::recordAsyncMark(MachineInstr &Inst) {
1273 AsyncMarks.push_back(AsyncScore);
1276 dbgs() <<
"recordAsyncMark:\n" << Inst;
1277 for (
const auto &Mark : AsyncMarks) {
1284void WaitcntBrackets::print(raw_ostream &OS)
const {
1288 unsigned SR = getScoreRange(
T);
1291 OS <<
" " << (
ST->hasExtendedWaitCounts() ?
"LOAD" :
"VM") <<
"_CNT("
1295 OS <<
" " << (
ST->hasExtendedWaitCounts() ?
"DS" :
"LGKM") <<
"_CNT("
1299 OS <<
" EXP_CNT(" << SR <<
"):";
1302 OS <<
" " << (
ST->hasExtendedWaitCounts() ?
"STORE" :
"VS") <<
"_CNT("
1306 OS <<
" SAMPLE_CNT(" << SR <<
"):";
1309 OS <<
" BVH_CNT(" << SR <<
"):";
1312 OS <<
" KM_CNT(" << SR <<
"):";
1315 OS <<
" X_CNT(" << SR <<
"):";
1318 OS <<
" VA_VDST(" << SR <<
"): ";
1321 OS <<
" VM_VSRC(" << SR <<
"): ";
1324 OS <<
" UNKNOWN(" << SR <<
"):";
1330 unsigned LB = getScoreLB(
T);
1333 sort(SortedVMEMIDs);
1335 for (
auto ID : SortedVMEMIDs) {
1336 unsigned RegScore = VMem.at(
ID).Scores[
T];
1339 unsigned RelScore = RegScore - LB - 1;
1340 if (
ID < REGUNITS_END) {
1341 OS <<
' ' << RelScore <<
":vRU" <<
ID;
1343 assert(
ID >= LDSDMA_BEGIN &&
ID < LDSDMA_END &&
1344 "Unhandled/unexpected ID value!");
1345 OS <<
' ' << RelScore <<
":LDSDMA" <<
ID;
1350 if (isSmemCounter(
T)) {
1352 sort(SortedSMEMIDs);
1353 for (
auto ID : SortedSMEMIDs) {
1354 unsigned RegScore = SGPRs.at(
ID).Scores[getSgprScoresIdx(
T)];
1357 unsigned RelScore = RegScore - LB - 1;
1358 OS <<
' ' << RelScore <<
":sRU" <<
static_cast<unsigned>(
ID);
1362 if (
T ==
KM_CNT && SCCScore > 0)
1363 OS <<
' ' << SCCScore <<
":scc";
1368 OS <<
"Pending Events: ";
1369 if (hasPendingEvent()) {
1371 for (
unsigned I = 0;
I != NUM_WAIT_EVENTS; ++
I) {
1372 if (hasPendingEvent((WaitEventType)
I)) {
1373 OS <<
LS << WaitEventTypeName[
I];
1381 OS <<
"Async score: ";
1382 if (AsyncScore.empty())
1388 OS <<
"Async marks: " << AsyncMarks.size() <<
'\n';
1390 for (
const auto &Mark : AsyncMarks) {
1392 unsigned MarkedScore = Mark[
T];
1395 OS <<
" " << (
ST->hasExtendedWaitCounts() ?
"LOAD" :
"VM")
1396 <<
"_CNT: " << MarkedScore;
1399 OS <<
" " << (
ST->hasExtendedWaitCounts() ?
"DS" :
"LGKM")
1400 <<
"_CNT: " << MarkedScore;
1403 OS <<
" EXP_CNT: " << MarkedScore;
1406 OS <<
" " << (
ST->hasExtendedWaitCounts() ?
"STORE" :
"VS")
1407 <<
"_CNT: " << MarkedScore;
1410 OS <<
" SAMPLE_CNT: " << MarkedScore;
1413 OS <<
" BVH_CNT: " << MarkedScore;
1416 OS <<
" KM_CNT: " << MarkedScore;
1419 OS <<
" X_CNT: " << MarkedScore;
1422 OS <<
" UNKNOWN: " << MarkedScore;
1433void WaitcntBrackets::simplifyWaitcnt(
const AMDGPU::Waitcnt &CheckWait,
1434 AMDGPU::Waitcnt &UpdateWait)
const {
1435 simplifyWaitcnt(UpdateWait,
LOAD_CNT);
1436 simplifyWaitcnt(UpdateWait,
EXP_CNT);
1437 simplifyWaitcnt(UpdateWait,
DS_CNT);
1440 simplifyWaitcnt(UpdateWait,
BVH_CNT);
1441 simplifyWaitcnt(UpdateWait,
KM_CNT);
1442 simplifyXcnt(CheckWait, UpdateWait);
1443 simplifyWaitcnt(UpdateWait,
VA_VDST);
1444 simplifyVmVsrc(CheckWait, UpdateWait);
1448 unsigned &
Count)
const {
1452 if (
Count >= getScoreRange(
T))
1457 unsigned Cnt =
Wait.get(
T);
1458 simplifyWaitcnt(
T, Cnt);
1462void WaitcntBrackets::simplifyXcnt(
const AMDGPU::Waitcnt &CheckWait,
1463 AMDGPU::Waitcnt &UpdateWait)
const {
1472 if (CheckWait.
get(
KM_CNT) == 0 && hasPendingEvent(SMEM_GROUP))
1477 if (CheckWait.
get(
LOAD_CNT) != ~0u && hasPendingEvent(VMEM_GROUP) &&
1481 simplifyWaitcnt(UpdateWait,
X_CNT);
1484void WaitcntBrackets::simplifyVmVsrc(
const AMDGPU::Waitcnt &CheckWait,
1485 AMDGPU::Waitcnt &UpdateWait)
const {
1490 std::min({CheckWait.get(LOAD_CNT), CheckWait.get(STORE_CNT),
1491 CheckWait.get(SAMPLE_CNT), CheckWait.get(BVH_CNT),
1492 CheckWait.get(DS_CNT)}))
1494 simplifyWaitcnt(UpdateWait,
VM_VSRC);
1497void WaitcntBrackets::purgeEmptyTrackingData() {
1509 unsigned ScoreToWait,
1510 AMDGPU::Waitcnt &
Wait)
const {
1511 const unsigned LB = getScoreLB(
T);
1512 const unsigned UB = getScoreUB(
T);
1515 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
1517 !
Context->ST->hasFlatLgkmVMemCountInOrder()) {
1521 addWait(
Wait,
T, 0);
1522 }
else if (counterOutOfOrder(
T)) {
1526 addWait(
Wait,
T, 0);
1530 unsigned NeededWait = std::min(
1531 UB - ScoreToWait, getWaitCountMax(
Context->getLimits(),
T) - 1);
1532 addWait(
Wait,
T, NeededWait);
1537AMDGPU::Waitcnt WaitcntBrackets::determineAsyncWait(
unsigned N) {
1539 dbgs() <<
"Need " <<
N <<
" async marks. Found " << AsyncMarks.size()
1541 for (
const auto &Mark : AsyncMarks) {
1547 if (AsyncMarks.size() == MaxAsyncMarks) {
1552 LLVM_DEBUG(
dbgs() <<
"Possible truncation. Ensuring a non-trivial wait.\n");
1553 N = std::min(
N, (
unsigned)MaxAsyncMarks - 1);
1556 AMDGPU::Waitcnt
Wait;
1557 if (AsyncMarks.size() <=
N) {
1562 size_t MarkIndex = AsyncMarks.size() -
N - 1;
1563 const auto &RequiredMark = AsyncMarks[MarkIndex];
1565 determineWaitForScore(
T, RequiredMark[
T],
Wait);
1571 dbgs() <<
"Removing " << (MarkIndex + 1)
1572 <<
" async marks after determining wait\n";
1574 AsyncMarks.erase(AsyncMarks.begin(), AsyncMarks.begin() + MarkIndex + 1);
1581 AMDGPU::Waitcnt &
Wait)
const {
1582 if (
Reg == AMDGPU::SCC) {
1583 determineWaitForScore(
T, SCCScore,
Wait);
1586 for (MCRegUnit RU : regunits(
Reg))
1587 determineWaitForScore(
1588 T, IsVGPR ? getVMemScore(toVMEMID(RU),
T) : getSGPRScore(RU,
T),
1594 AMDGPU::Waitcnt &
Wait)
const {
1595 assert(TID >= LDSDMA_BEGIN && TID < LDSDMA_END);
1596 determineWaitForScore(
T, getVMemScore(TID,
T),
Wait);
1599void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {
1602 if (PendingSCCWrite &&
1603 PendingSCCWrite->
getOpcode() == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM &&
1605 WaitEventSet SCC_WRITE_PendingEvent(SCC_WRITE);
1608 SCC_WRITE_PendingEvent) {
1612 PendingEvents.remove(SCC_WRITE_PendingEvent);
1613 PendingSCCWrite =
nullptr;
1617void WaitcntBrackets::applyWaitcnt(
const AMDGPU::Waitcnt &
Wait) {
1619 applyWaitcnt(
Wait,
T);
1623 const unsigned UB = getScoreUB(
T);
1627 if (counterOutOfOrder(
T))
1629 setScoreLB(
T, std::max(getScoreLB(
T), UB -
Count));
1632 PendingEvents.remove(
Context->getWaitEvents(
T));
1635 if (
T ==
KM_CNT &&
Count == 0 && hasPendingEvent(SMEM_GROUP)) {
1636 if (!hasMixedPendingEvents(
X_CNT))
1637 applyWaitcnt(
X_CNT, 0);
1639 PendingEvents.remove(SMEM_GROUP);
1641 if (
T ==
LOAD_CNT && hasPendingEvent(VMEM_GROUP) &&
1643 if (!hasMixedPendingEvents(
X_CNT))
1645 else if (
Count == 0)
1646 PendingEvents.remove(VMEM_GROUP);
1651 unsigned Cnt =
Wait.get(
T);
1652 applyWaitcnt(
T, Cnt);
1659 if ((
T ==
Context->SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) ||
1660 (
T ==
X_CNT && hasPendingEvent(SMEM_GROUP)))
1667 unsigned Events = hasPendingEvent(
T);
1670 Events &= ~(1 << GLOBAL_INV_ACCESS);
1673 return Events & (Events - 1);
1676 return hasMixedPendingEvents(
T);
1686char SIInsertWaitcntsLegacy::
ID = 0;
1691 return new SIInsertWaitcntsLegacy();
1696 int OpIdx = AMDGPU::getNamedOperandIdx(
MI.getOpcode(),
OpName);
1701 if (NewEnc == MO.
getImm())
1712 case AMDGPU::S_WAIT_LOADCNT:
1714 case AMDGPU::S_WAIT_EXPCNT:
1716 case AMDGPU::S_WAIT_STORECNT:
1718 case AMDGPU::S_WAIT_SAMPLECNT:
1720 case AMDGPU::S_WAIT_BVHCNT:
1722 case AMDGPU::S_WAIT_DSCNT:
1724 case AMDGPU::S_WAIT_KMCNT:
1726 case AMDGPU::S_WAIT_XCNT:
1733bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt)
const {
1747bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
1748 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1750 assert(isNormalMode(MaxCounter));
1753 MachineInstr *WaitcntInstr =
nullptr;
1754 MachineInstr *WaitcntVsCntInstr =
nullptr;
1757 dbgs() <<
"PreGFX12::applyPreexistingWaitcnt at: ";
1759 dbgs() <<
"end of block\n";
1767 if (
II.isMetaInstruction()) {
1773 bool TrySimplify = Opcode !=
II.getOpcode() && !OptNone;
1777 if (Opcode == AMDGPU::S_WAITCNT) {
1778 unsigned IEnc =
II.getOperand(0).getImm();
1781 ScoreBrackets.simplifyWaitcnt(OldWait);
1785 if (WaitcntInstr || (!
Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
1786 II.eraseFromParent();
1790 }
else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1793 <<
"Before: " <<
Wait <<
'\n';);
1794 ScoreBrackets.determineWaitForLDSDMA(
LOAD_CNT, LDSDMA_BEGIN,
Wait);
1803 II.eraseFromParent();
1804 }
else if (Opcode == AMDGPU::WAIT_ASYNCMARK) {
1805 unsigned N =
II.getOperand(0).getImm();
1807 AMDGPU::Waitcnt OldWait = ScoreBrackets.determineAsyncWait(
N);
1810 assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
1811 assert(
II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1814 TII.getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
1816 ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt);
1819 if (WaitcntVsCntInstr || (!
Wait.hasWaitStoreCnt() && TrySimplify)) {
1820 II.eraseFromParent();
1823 WaitcntVsCntInstr = &
II;
1830 Modified |= promoteSoftWaitCnt(WaitcntInstr);
1839 LLVM_DEBUG(It.isEnd() ?
dbgs() <<
"applied pre-existing waitcnt\n"
1840 <<
"New Instr at block end: "
1841 << *WaitcntInstr <<
'\n'
1842 :
dbgs() <<
"applied pre-existing waitcnt\n"
1843 <<
"Old Instr: " << *It
1844 <<
"New Instr: " << *WaitcntInstr <<
'\n');
1847 if (WaitcntVsCntInstr) {
1849 *WaitcntVsCntInstr, AMDGPU::OpName::simm16,
Wait.get(
STORE_CNT));
1850 Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);
1856 ?
dbgs() <<
"applied pre-existing waitcnt\n"
1857 <<
"New Instr at block end: " << *WaitcntVsCntInstr
1859 :
dbgs() <<
"applied pre-existing waitcnt\n"
1860 <<
"Old Instr: " << *It
1861 <<
"New Instr: " << *WaitcntVsCntInstr <<
'\n');
1869bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
1871 AMDGPU::Waitcnt
Wait,
const WaitcntBrackets &ScoreBrackets) {
1872 assert(isNormalMode(MaxCounter));
1880 auto EmitExpandedWaitcnt = [&](
unsigned Outstanding,
unsigned Target,
1883 EmitWaitcnt(--Outstanding);
1884 }
while (Outstanding > Target);
1890 if (
Wait.hasWaitExceptStoreCnt()) {
1892 if (ExpandWaitcntProfiling) {
1896 bool AnyOutOfOrder =
false;
1898 unsigned WaitCnt =
Wait.get(CT);
1899 if (WaitCnt != ~0u && ScoreBrackets.counterOutOfOrder(CT)) {
1900 AnyOutOfOrder =
true;
1905 if (AnyOutOfOrder) {
1913 unsigned WaitCnt =
Wait.get(CT);
1917 unsigned Outstanding = std::min(ScoreBrackets.getOutstanding(CT),
1918 getWaitCountMax(getLimits(), CT) - 1);
1919 EmitExpandedWaitcnt(Outstanding, WaitCnt, [&](
unsigned Count) {
1930 [[maybe_unused]]
auto SWaitInst =
1935 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
1936 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
1940 if (
Wait.hasWaitStoreCnt()) {
1944 !ScoreBrackets.counterOutOfOrder(
STORE_CNT)) {
1946 unsigned Outstanding =
1947 std::min(ScoreBrackets.getOutstanding(
STORE_CNT),
1948 getWaitCountMax(getLimits(),
STORE_CNT) - 1);
1949 EmitExpandedWaitcnt(
1951 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_VSCNT))
1952 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1956 [[maybe_unused]]
auto SWaitInst =
1958 .
addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1963 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
1964 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
1972WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(
bool IncludeVSCnt)
const {
1973 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt &&
ST.hasVscnt() ? 0 : ~0u);
1977WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(
bool IncludeVSCnt)
const {
1978 unsigned ExpertVal = IsExpertMode ? 0 : ~0
u;
1979 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,
1980 ~0u , ExpertVal, ExpertVal);
1987bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
1988 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1990 assert(!isNormalMode(MaxCounter));
1993 MachineInstr *CombinedLoadDsCntInstr =
nullptr;
1994 MachineInstr *CombinedStoreDsCntInstr =
nullptr;
1995 MachineInstr *WaitcntDepctrInstr =
nullptr;
1999 dbgs() <<
"GFX12Plus::applyPreexistingWaitcnt at: ";
2001 dbgs() <<
"end of block\n";
2007 AMDGPU::Waitcnt RequiredWait;
2012 if (
II.isMetaInstruction()) {
2021 bool TrySimplify = Opcode !=
II.getOpcode() && !OptNone;
2025 if (Opcode == AMDGPU::S_WAITCNT)
2028 if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
2030 TII.getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
2035 RequiredWait = RequiredWait.combined(OldWait);
2037 if (CombinedLoadDsCntInstr ==
nullptr) {
2038 CombinedLoadDsCntInstr = &
II;
2040 II.eraseFromParent();
2043 }
else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
2045 TII.getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
2050 RequiredWait = RequiredWait.combined(OldWait);
2052 if (CombinedStoreDsCntInstr ==
nullptr) {
2053 CombinedStoreDsCntInstr = &
II;
2055 II.eraseFromParent();
2058 }
else if (Opcode == AMDGPU::S_WAITCNT_DEPCTR) {
2060 TII.getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
2061 AMDGPU::Waitcnt OldWait;
2065 ScoreBrackets.simplifyWaitcnt(OldWait);
2067 if (WaitcntDepctrInstr ==
nullptr) {
2068 WaitcntDepctrInstr = &
II;
2077 TII.getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
2085 II.eraseFromParent();
2089 }
else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
2092 II.eraseFromParent();
2094 }
else if (Opcode == AMDGPU::WAIT_ASYNCMARK) {
2100 TII.getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
2102 addWait(
Wait, CT.value(), OldCnt);
2104 addWait(RequiredWait, CT.value(), OldCnt);
2106 if (WaitInstrs[CT.value()] ==
nullptr) {
2107 WaitInstrs[CT.value()] = &
II;
2109 II.eraseFromParent();
2115 ScoreBrackets.simplifyWaitcnt(
Wait.combined(RequiredWait),
Wait);
2116 Wait =
Wait.combined(RequiredWait);
2118 if (CombinedLoadDsCntInstr) {
2134 AMDGPU::OpName::simm16, NewEnc);
2135 Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
2141 LLVM_DEBUG(It.isEnd() ?
dbgs() <<
"applied pre-existing waitcnt\n"
2142 <<
"New Instr at block end: "
2143 << *CombinedLoadDsCntInstr <<
'\n'
2144 :
dbgs() <<
"applied pre-existing waitcnt\n"
2145 <<
"Old Instr: " << *It <<
"New Instr: "
2146 << *CombinedLoadDsCntInstr <<
'\n');
2153 if (CombinedStoreDsCntInstr) {
2158 AMDGPU::OpName::simm16, NewEnc);
2159 Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
2165 LLVM_DEBUG(It.isEnd() ?
dbgs() <<
"applied pre-existing waitcnt\n"
2166 <<
"New Instr at block end: "
2167 << *CombinedStoreDsCntInstr <<
'\n'
2168 :
dbgs() <<
"applied pre-existing waitcnt\n"
2169 <<
"Old Instr: " << *It <<
"New Instr: "
2170 << *CombinedStoreDsCntInstr <<
'\n');
2200 for (MachineInstr **WI : WaitsToErase) {
2204 (*WI)->eraseFromParent();
2211 if (!WaitInstrs[CT])
2214 unsigned NewCnt =
Wait.get(CT);
2215 if (NewCnt != ~0u) {
2217 AMDGPU::OpName::simm16, NewCnt);
2218 Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);
2220 ScoreBrackets.applyWaitcnt(CT, NewCnt);
2221 setNoWait(
Wait, CT);
2224 ?
dbgs() <<
"applied pre-existing waitcnt\n"
2225 <<
"New Instr at block end: " << *WaitInstrs[CT]
2227 :
dbgs() <<
"applied pre-existing waitcnt\n"
2228 <<
"Old Instr: " << *It
2229 <<
"New Instr: " << *WaitInstrs[CT] <<
'\n');
2236 if (WaitcntDepctrInstr) {
2240 TII.getNamedOperand(*WaitcntDepctrInstr, AMDGPU::OpName::simm16)
2255 AMDGPU::OpName::simm16, Enc);
2257 <<
"New Instr at block end: "
2258 << *WaitcntDepctrInstr <<
'\n'
2259 :
dbgs() <<
"applyPreexistingWaitcnt\n"
2260 <<
"Old Instr: " << *It <<
"New Instr: "
2261 << *WaitcntDepctrInstr <<
'\n');
2272bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
2274 AMDGPU::Waitcnt
Wait,
const WaitcntBrackets &ScoreBrackets) {
2275 assert(!isNormalMode(MaxCounter));
2281 auto EmitExpandedWaitcnt = [&](
unsigned Outstanding,
unsigned Target,
2283 for (
unsigned I = Outstanding - 1;
I >
Target &&
I != ~0
u; --
I)
2285 EmitWaitcnt(Target);
2291 if (ExpandWaitcntProfiling) {
2298 if (ScoreBrackets.counterOutOfOrder(CT)) {
2305 unsigned Outstanding = std::min(ScoreBrackets.getOutstanding(CT),
2306 getWaitCountMax(getLimits(), CT) - 1);
2307 EmitExpandedWaitcnt(Outstanding,
Count, [&](
unsigned Val) {
2318 MachineInstr *SWaitInst =
nullptr;
2342 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
2343 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
2355 [[maybe_unused]]
auto SWaitInst =
2362 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
2363 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
2366 if (
Wait.hasWaitDepctr()) {
2371 [[maybe_unused]]
auto SWaitInst =
2377 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
2378 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
2397bool SIInsertWaitcnts::generateWaitcntInstBefore(
2398 MachineInstr &
MI, WaitcntBrackets &ScoreBrackets,
2399 MachineInstr *OldWaitcntInstr, PreheaderFlushFlags FlushFlags) {
2401 setForceEmitWaitcnt();
2405 AMDGPU::Waitcnt
Wait;
2406 const unsigned Opc =
MI.getOpcode();
2409 case AMDGPU::BUFFER_WBINVL1:
2410 case AMDGPU::BUFFER_WBINVL1_SC:
2411 case AMDGPU::BUFFER_WBINVL1_VOL:
2412 case AMDGPU::BUFFER_GL0_INV:
2413 case AMDGPU::BUFFER_GL1_INV: {
2421 case AMDGPU::SI_RETURN_TO_EPILOG:
2422 case AMDGPU::SI_RETURN:
2423 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN:
2424 case AMDGPU::S_SETPC_B64_return: {
2429 AMDGPU::Waitcnt AllZeroWait =
2430 WCG->getAllZeroWaitcnt(
false);
2435 if (
ST->hasExtendedWaitCounts() &&
2436 !ScoreBrackets.hasPendingEvent(VMEM_ACCESS))
2441 case AMDGPU::S_ENDPGM:
2442 case AMDGPU::S_ENDPGM_SAVED: {
2451 EndPgmInsts[&
MI] = !ScoreBrackets.empty(
STORE_CNT) &&
2452 !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS);
2455 case AMDGPU::S_SENDMSG:
2456 case AMDGPU::S_SENDMSGHALT: {
2457 if (
ST->hasLegacyGeometry() &&
2472 if (
MI.modifiesRegister(AMDGPU::EXEC,
TRI)) {
2475 if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
2476 ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
2477 ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
2478 ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
2485 if (
TII->isAlwaysGDS(
Opc) && ScoreBrackets.hasPendingGDS())
2486 addWait(
Wait,
DS_CNT, ScoreBrackets.getPendingGDSWait());
2493 Wait = AMDGPU::Waitcnt();
2495 const MachineOperand &CallAddrOp =
TII->getCalleeOperand(
MI);
2496 if (CallAddrOp.
isReg()) {
2497 ScoreBrackets.determineWaitForPhysReg(
2500 if (
const auto *RtnAddrOp =
2501 TII->getNamedOperand(
MI, AMDGPU::OpName::dst)) {
2502 ScoreBrackets.determineWaitForPhysReg(
2503 SmemAccessCounter, RtnAddrOp->getReg().asMCReg(),
Wait);
2506 }
else if (
Opc == AMDGPU::S_BARRIER_WAIT) {
2507 ScoreBrackets.tryClearSCCWriteEvent(&
MI);
2523 for (
const MachineMemOperand *Memop :
MI.memoperands()) {
2524 const Value *Ptr = Memop->getValue();
2525 if (Memop->isStore()) {
2526 if (
auto It = SLoadAddresses.
find(Ptr); It != SLoadAddresses.
end()) {
2527 addWait(
Wait, SmemAccessCounter, 0);
2529 SLoadAddresses.
erase(It);
2532 unsigned AS = Memop->getAddrSpace();
2536 if (
TII->mayWriteLDSThroughDMA(
MI))
2540 unsigned TID = LDSDMA_BEGIN;
2541 if (Ptr && Memop->getAAInfo()) {
2542 const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
2543 for (
unsigned I = 0,
E = LDSDMAStores.size();
I !=
E; ++
I) {
2544 if (
MI.mayAlias(AA, *LDSDMAStores[
I],
true)) {
2545 if ((
I + 1) >= NUM_LDSDMA) {
2548 ScoreBrackets.determineWaitForLDSDMA(
LOAD_CNT, TID,
Wait);
2552 ScoreBrackets.determineWaitForLDSDMA(
LOAD_CNT, TID +
I + 1,
Wait);
2556 ScoreBrackets.determineWaitForLDSDMA(
LOAD_CNT, TID,
Wait);
2558 if (Memop->isStore()) {
2559 ScoreBrackets.determineWaitForLDSDMA(
EXP_CNT, TID,
Wait);
2564 for (
const MachineOperand &
Op :
MI.operands()) {
2569 if (
Op.isTied() &&
Op.isUse() &&
TII->doesNotReadTiedSource(
MI))
2574 const bool IsVGPR =
TRI->isVectorRegister(*MRI,
Op.getReg());
2581 if (
Op.isImplicit() &&
MI.mayLoadOrStore())
2593 if (
Op.isUse() || !updateVMCntOnly(
MI) ||
2594 ScoreBrackets.hasOtherPendingVmemTypes(
Reg, getVmemType(
MI)) ||
2595 ScoreBrackets.hasPointSamplePendingVmemTypes(
MI,
Reg) ||
2596 !
ST->hasVmemWriteVgprInOrder()) {
2600 ScoreBrackets.clearVgprVmemTypes(
Reg);
2603 if (
Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
2607 }
else if (
Op.getReg() == AMDGPU::SCC) {
2610 ScoreBrackets.determineWaitForPhysReg(SmemAccessCounter,
Reg,
Wait);
2613 if (
ST->hasWaitXcnt() &&
Op.isDef())
2614 ScoreBrackets.determineWaitForPhysReg(
X_CNT,
Reg,
Wait);
2632 if (
Opc == AMDGPU::S_BARRIER && !
ST->hasAutoWaitcntBeforeBarrier() &&
2633 !
ST->hasBackOffBarrier()) {
2634 Wait =
Wait.combined(WCG->getAllZeroWaitcnt(
true));
2641 ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
2646 ScoreBrackets.simplifyWaitcnt(
Wait);
2652 if (
TII->isVALU(
MI))
2659 ScoreBrackets.applyWaitcnt(
Wait,
X_CNT);
2666 Wait = WCG->getAllZeroWaitcnt(
false);
2670 if (!ForceEmitWaitcnt[
T])
2675 if (FlushFlags.FlushVmCnt) {
2680 if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(
DS_CNT))
2686 return generateWaitcnt(
Wait,
MI.getIterator(), *
MI.getParent(), ScoreBrackets,
2690bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt
Wait,
2692 MachineBasicBlock &
Block,
2693 WaitcntBrackets &ScoreBrackets,
2694 MachineInstr *OldWaitcntInstr) {
2697 if (OldWaitcntInstr)
2701 WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr,
Wait, It);
2706 MachineOperand *WaitExp =
2707 TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
2717 <<
"Update Instr: " << *It);
2720 if (WCG->createNewWaitcnt(
Block, It,
Wait, ScoreBrackets))
2725 ScoreBrackets.applyWaitcnt(
Wait);
2730std::optional<WaitEventType>
2731SIInsertWaitcnts::getExpertSchedulingEventType(
const MachineInstr &Inst)
const {
2732 if (
TII->isVALU(Inst)) {
2737 if (
TII->isXDL(Inst))
2738 return VGPR_XDL_WRITE;
2740 if (
TII->isTRANS(Inst))
2741 return VGPR_TRANS_WRITE;
2744 return VGPR_DPMACC_WRITE;
2746 return VGPR_CSMACC_WRITE;
2753 if (
TII->isFLAT(Inst))
2754 return VGPR_FLAT_READ;
2756 if (
TII->isDS(Inst))
2757 return VGPR_LDS_READ;
2759 if (
TII->isVMEM(Inst) ||
TII->isVIMAGE(Inst) ||
TII->isVSAMPLE(Inst))
2760 return VGPR_VMEM_READ;
2767bool SIInsertWaitcnts::isVmemAccess(
const MachineInstr &
MI)
const {
2768 return (
TII->isFLAT(
MI) &&
TII->mayAccessVMEMThroughFlat(
MI)) ||
2775 MachineBasicBlock *
Block)
const {
2776 auto BlockEnd =
Block->getParent()->end();
2777 auto BlockIter =
Block->getIterator();
2781 if (++BlockIter != BlockEnd) {
2782 It = BlockIter->instr_begin();
2789 if (!It->isMetaInstruction())
2797 return It->getOpcode() == AMDGPU::S_ENDPGM;
2801bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
2802 MachineBasicBlock &
Block,
2803 WaitcntBrackets &ScoreBrackets) {
2804 AMDGPU::Waitcnt
Wait;
2805 bool NeedsEndPGMCheck =
false;
2813 NeedsEndPGMCheck =
true;
2816 ScoreBrackets.simplifyWaitcnt(
Wait);
2819 bool Result = generateWaitcnt(
Wait, SuccessorIt,
Block, ScoreBrackets,
2822 if (Result && NeedsEndPGMCheck && isNextENDPGM(SuccessorIt, &
Block)) {
2830WaitEventSet SIInsertWaitcnts::getEventsFor(
const MachineInstr &Inst)
const {
2831 WaitEventSet Events;
2833 if (
const auto ET = getExpertSchedulingEventType(Inst))
2837 if (
TII->isDS(Inst) &&
TII->usesLGKM_CNT(Inst)) {
2839 TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
2840 Events.insert(GDS_ACCESS);
2841 Events.insert(GDS_GPR_LOCK);
2843 Events.insert(LDS_ACCESS);
2845 }
else if (
TII->isFLAT(Inst)) {
2847 Events.insert(getVmemWaitEventType(Inst));
2850 if (
TII->mayAccessVMEMThroughFlat(Inst)) {
2851 if (
ST->hasWaitXcnt())
2852 Events.insert(VMEM_GROUP);
2853 Events.insert(getVmemWaitEventType(Inst));
2855 if (
TII->mayAccessLDSThroughFlat(Inst))
2856 Events.insert(LDS_ACCESS);
2860 Inst.
getOpcode() == AMDGPU::BUFFER_WBL2)) {
2864 if (
ST->hasWaitXcnt())
2865 Events.insert(VMEM_GROUP);
2866 Events.insert(getVmemWaitEventType(Inst));
2867 if (
ST->vmemWriteNeedsExpWaitcnt() &&
2869 Events.insert(VMW_GPR_LOCK);
2871 }
else if (
TII->isSMRD(Inst)) {
2872 if (
ST->hasWaitXcnt())
2873 Events.insert(SMEM_GROUP);
2874 Events.insert(SMEM_ACCESS);
2876 Events.insert(EXP_LDS_ACCESS);
2878 unsigned Imm =
TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
2880 Events.insert(EXP_PARAM_ACCESS);
2882 Events.insert(EXP_POS_ACCESS);
2884 Events.insert(EXP_GPR_LOCK);
2886 Events.insert(SCC_WRITE);
2889 case AMDGPU::S_SENDMSG:
2890 case AMDGPU::S_SENDMSG_RTN_B32:
2891 case AMDGPU::S_SENDMSG_RTN_B64:
2892 case AMDGPU::S_SENDMSGHALT:
2893 Events.insert(SQ_MESSAGE);
2895 case AMDGPU::S_MEMTIME:
2896 case AMDGPU::S_MEMREALTIME:
2897 case AMDGPU::S_GET_BARRIER_STATE_M0:
2898 case AMDGPU::S_GET_BARRIER_STATE_IMM:
2899 Events.insert(SMEM_ACCESS);
2906void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
2907 WaitcntBrackets *ScoreBrackets) {
2909 WaitEventSet InstEvents = getEventsFor(Inst);
2910 for (WaitEventType
E : wait_events()) {
2911 if (InstEvents.contains(
E))
2912 ScoreBrackets->updateByEvent(
E, Inst);
2915 if (
TII->isDS(Inst) &&
TII->usesLGKM_CNT(Inst)) {
2917 TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
2918 ScoreBrackets->setPendingGDS();
2920 }
else if (
TII->isFLAT(Inst)) {
2928 ScoreBrackets->setPendingFlat();
2929 }
else if (Inst.
isCall()) {
2931 ScoreBrackets->applyWaitcnt(WCG->getAllZeroWaitcnt(
false));
2932 ScoreBrackets->setStateOnFunctionEntryOrReturn();
2933 }
else if (
TII->isVINTERP(Inst)) {
2934 int64_t
Imm =
TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
2939bool WaitcntBrackets::mergeScore(
const MergeInfo &M,
unsigned &Score,
2940 unsigned OtherScore) {
2941 unsigned MyShifted = Score <=
M.OldLB ? 0 : Score +
M.MyShift;
2942 unsigned OtherShifted =
2943 OtherScore <=
M.OtherLB ? 0 : OtherScore +
M.OtherShift;
2944 Score = std::max(MyShifted, OtherShifted);
2945 return OtherShifted > MyShifted;
2950 bool StrictDom =
false;
2954 if (AsyncMarks.empty() && OtherMarks.
empty()) {
2961 auto MaxSize = (unsigned)std::max(AsyncMarks.size(), OtherMarks.
size());
2962 MaxSize = std::min(MaxSize, MaxAsyncMarks);
2965 if (AsyncMarks.size() > MaxSize)
2966 AsyncMarks.erase(AsyncMarks.begin(),
2967 AsyncMarks.begin() + (AsyncMarks.size() - MaxSize));
2973 constexpr CounterValueArray ZeroMark{};
2974 AsyncMarks.insert(AsyncMarks.begin(), MaxSize - AsyncMarks.size(), ZeroMark);
2977 dbgs() <<
"Before merge:\n";
2978 for (
const auto &Mark : AsyncMarks) {
2982 dbgs() <<
"Other marks:\n";
2983 for (
const auto &Mark : OtherMarks) {
2992 unsigned OtherSize = OtherMarks.size();
2993 unsigned OurSize = AsyncMarks.size();
2994 unsigned MergeCount = std::min(OtherSize, OurSize);
2997 StrictDom |= mergeScore(MergeInfos[
T], AsyncMarks[OurSize - Idx][
T],
2998 OtherMarks[OtherSize - Idx][
T]);
3003 dbgs() <<
"After merge:\n";
3004 for (
const auto &Mark : AsyncMarks) {
3018bool WaitcntBrackets::merge(
const WaitcntBrackets &
Other) {
3019 bool StrictDom =
false;
3023 for (
auto K :
Other.VMem.keys())
3024 VMem.try_emplace(K);
3025 for (
auto K :
Other.SGPRs.keys())
3026 SGPRs.try_emplace(K);
3033 const WaitEventSet &EventsForT =
Context->getWaitEvents(
T);
3034 const WaitEventSet OldEvents = PendingEvents & EventsForT;
3035 const WaitEventSet OtherEvents =
Other.PendingEvents & EventsForT;
3036 if (!OldEvents.contains(OtherEvents))
3038 PendingEvents |= OtherEvents;
3041 const unsigned MyPending = ScoreUBs[
T] - ScoreLBs[
T];
3042 const unsigned OtherPending =
Other.ScoreUBs[
T] -
Other.ScoreLBs[
T];
3043 const unsigned NewUB = ScoreLBs[
T] + std::max(MyPending, OtherPending);
3044 if (NewUB < ScoreLBs[
T])
3047 MergeInfo &
M = MergeInfos[
T];
3048 M.OldLB = ScoreLBs[
T];
3049 M.OtherLB =
Other.ScoreLBs[
T];
3050 M.MyShift = NewUB - ScoreUBs[
T];
3051 M.OtherShift = NewUB -
Other.ScoreUBs[
T];
3053 ScoreUBs[
T] = NewUB;
3055 StrictDom |= mergeScore(M, LastFlat[
T],
Other.LastFlat[
T]);
3058 StrictDom |= mergeScore(M, LastGDS,
Other.LastGDS);
3061 StrictDom |= mergeScore(M, SCCScore,
Other.SCCScore);
3062 if (
Other.hasPendingEvent(SCC_WRITE)) {
3063 if (!OldEvents.contains(SCC_WRITE)) {
3064 PendingSCCWrite =
Other.PendingSCCWrite;
3065 }
else if (PendingSCCWrite !=
Other.PendingSCCWrite) {
3066 PendingSCCWrite =
nullptr;
3071 for (
auto &[RegID, Info] : VMem)
3072 StrictDom |= mergeScore(M,
Info.Scores[
T],
Other.getVMemScore(RegID,
T));
3074 if (isSmemCounter(
T)) {
3075 unsigned Idx = getSgprScoresIdx(
T);
3076 for (
auto &[RegID, Info] : SGPRs) {
3077 auto It =
Other.SGPRs.find(RegID);
3078 unsigned OtherScore =
3079 (It !=
Other.SGPRs.end()) ? It->second.Scores[Idx] : 0;
3080 StrictDom |= mergeScore(M,
Info.Scores[Idx], OtherScore);
3085 for (
auto &[TID, Info] : VMem) {
3086 if (
auto It =
Other.VMem.find(TID); It !=
Other.VMem.end()) {
3087 unsigned char NewVmemTypes =
Info.VMEMTypes | It->second.VMEMTypes;
3088 StrictDom |= NewVmemTypes !=
Info.VMEMTypes;
3089 Info.VMEMTypes = NewVmemTypes;
3093 StrictDom |= mergeAsyncMarks(MergeInfos,
Other.AsyncMarks);
3095 StrictDom |= mergeScore(MergeInfos[
T], AsyncScore[
T],
Other.AsyncScore[
T]);
3097 purgeEmptyTrackingData();
3103 return Opcode == AMDGPU::S_WAITCNT ||
3106 Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
3107 Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
3108 Opcode == AMDGPU::S_WAITCNT_lds_direct ||
3109 Opcode == AMDGPU::WAIT_ASYNCMARK ||
3113void SIInsertWaitcnts::setSchedulingMode(MachineBasicBlock &
MBB,
3115 bool ExpertMode)
const {
3119 .
addImm(ExpertMode ? 2 : 0)
3137class VCCZWorkaround {
3138 const WaitcntBrackets &ScoreBrackets;
3139 const GCNSubtarget &
ST;
3140 const SIInstrInfo &
TII;
3141 const SIRegisterInfo &
TRI;
3142 bool VCCZCorruptionBug =
false;
3143 bool VCCZNotUpdatedByPartialWrites =
false;
3146 bool MustRecomputeVCCZ =
true;
3149 VCCZWorkaround(
const WaitcntBrackets &ScoreBrackets,
const GCNSubtarget &ST,
3150 const SIInstrInfo &
TII,
const SIRegisterInfo &
TRI)
3152 VCCZCorruptionBug =
ST.hasReadVCCZBug();
3153 VCCZNotUpdatedByPartialWrites = !
ST.partialVCCWritesUpdateVCCZ();
3160 bool tryRecomputeVCCZ(MachineInstr &
MI) {
3162 if (!VCCZCorruptionBug && !VCCZNotUpdatedByPartialWrites)
3172 MustRecomputeVCCZ |= VCCZCorruptionBug &&
TII.isSMRD(
MI);
3178 std::optional<bool> PartiallyWritesToVCCOpt;
3179 auto PartiallyWritesToVCC = [](MachineInstr &
MI) {
3180 return MI.definesRegister(AMDGPU::VCC_LO,
nullptr) ||
3181 MI.definesRegister(AMDGPU::VCC_HI,
nullptr);
3183 if (VCCZNotUpdatedByPartialWrites) {
3184 PartiallyWritesToVCCOpt = PartiallyWritesToVCC(
MI);
3187 MustRecomputeVCCZ |= *PartiallyWritesToVCCOpt;
3193 if (!ScoreBrackets.hasPendingEvent(SMEM_ACCESS) || !VCCZCorruptionBug) {
3195 if (!PartiallyWritesToVCCOpt)
3196 PartiallyWritesToVCCOpt = PartiallyWritesToVCC(
MI);
3197 bool FullyWritesToVCC = !*PartiallyWritesToVCCOpt &&
3198 MI.definesRegister(AMDGPU::VCC,
nullptr);
3201 bool UpdatesVCCZ = FullyWritesToVCC || (!VCCZNotUpdatedByPartialWrites &&
3202 *PartiallyWritesToVCCOpt);
3204 MustRecomputeVCCZ =
false;
3214 TII.get(
ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
3217 MustRecomputeVCCZ =
false;
3227bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
3228 MachineBasicBlock &
Block,
3229 WaitcntBrackets &ScoreBrackets) {
3233 dbgs() <<
"*** Begin Block: ";
3235 ScoreBrackets.dump();
3237 VCCZWorkaround VCCZW(ScoreBrackets, *ST, *
TII, *
TRI);
3240 MachineInstr *OldWaitcntInstr =
nullptr;
3245 Iter !=
E; ++Iter) {
3246 MachineInstr &Inst = *Iter;
3252 (IsExpertMode && Inst.
getOpcode() == AMDGPU::S_WAITCNT_DEPCTR)) {
3253 if (!OldWaitcntInstr)
3254 OldWaitcntInstr = &Inst;
3258 PreheaderFlushFlags FlushFlags;
3259 if (
Block.getFirstTerminator() == Inst)
3260 FlushFlags = isPreheaderToFlush(
Block, ScoreBrackets);
3263 Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
3265 OldWaitcntInstr =
nullptr;
3267 if (Inst.
getOpcode() == AMDGPU::ASYNCMARK) {
3273 assert(
ST->getGeneration() < AMDGPUSubtarget::GFX12);
3274 ScoreBrackets.recordAsyncMark(Inst);
3278 if (
TII->isSMRD(Inst)) {
3279 for (
const MachineMemOperand *Memop : Inst.
memoperands()) {
3282 if (!Memop->isInvariant()) {
3283 const Value *Ptr = Memop->getValue();
3289 updateEventWaitcntAfter(Inst, &ScoreBrackets);
3293 Modified |= insertForcedWaitAfter(Inst,
Block, ScoreBrackets);
3297 ScoreBrackets.dump();
3302 Modified |= VCCZW.tryRecomputeVCCZ(Inst);
3307 AMDGPU::Waitcnt
Wait;
3308 if (
Block.getFirstTerminator() ==
Block.end()) {
3309 PreheaderFlushFlags FlushFlags = isPreheaderToFlush(
Block, ScoreBrackets);
3310 if (FlushFlags.FlushVmCnt) {
3311 if (ScoreBrackets.hasPendingEvent(
LOAD_CNT))
3313 if (ScoreBrackets.hasPendingEvent(
SAMPLE_CNT))
3315 if (ScoreBrackets.hasPendingEvent(
BVH_CNT))
3318 if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(
DS_CNT))
3327 dbgs() <<
"*** End Block: ";
3329 ScoreBrackets.dump();
3335bool SIInsertWaitcnts::removeRedundantSoftXcnts(MachineBasicBlock &
Block) {
3336 if (
Block.size() <= 1)
3344 MachineInstr *LastAtomicWithSoftXcnt =
nullptr;
3349 TII->isDS(
MI) || (
TII->isFLAT(
MI) &&
TII->mayAccessLDSThroughFlat(
MI));
3350 if (!IsLDS && (
MI.mayLoad() ^
MI.mayStore()))
3351 LastAtomicWithSoftXcnt =
nullptr;
3354 MI.mayLoad() &&
MI.mayStore();
3355 MachineInstr &PrevMI = *
MI.getPrevNode();
3357 if (PrevMI.
getOpcode() == AMDGPU::S_WAIT_XCNT_soft && IsAtomicRMW) {
3360 if (LastAtomicWithSoftXcnt) {
3364 LastAtomicWithSoftXcnt = &
MI;
3372SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &
MBB,
3373 const WaitcntBrackets &ScoreBrackets) {
3374 auto [Iterator, IsInserted] =
3377 return Iterator->second;
3381 return PreheaderFlushFlags();
3385 return PreheaderFlushFlags();
3388 Iterator->second = getPreheaderFlushFlags(Loop, ScoreBrackets);
3389 return Iterator->second;
3392 return PreheaderFlushFlags();
3395bool SIInsertWaitcnts::isVMEMOrFlatVMEM(
const MachineInstr &
MI)
const {
3397 return TII->mayAccessVMEMThroughFlat(
MI);
3401bool SIInsertWaitcnts::isDSRead(
const MachineInstr &
MI)
const {
3407bool SIInsertWaitcnts::mayStoreIncrementingDSCNT(
const MachineInstr &
MI)
const {
3436SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *
ML,
3437 const WaitcntBrackets &Brackets) {
3438 PreheaderFlushFlags
Flags;
3439 bool HasVMemLoad =
false;
3440 bool HasVMemStore =
false;
3441 bool UsesVgprVMEMLoadedOutside =
false;
3442 bool UsesVgprDSReadOutside =
false;
3443 bool VMemInvalidated =
false;
3447 bool TrackSimpleDSOpt =
ST->hasExtendedWaitCounts();
3448 DenseSet<MCRegUnit> VgprUse;
3449 DenseSet<MCRegUnit> VgprDefVMEM;
3450 DenseSet<MCRegUnit> VgprDefDS;
3456 DenseMap<MCRegUnit, unsigned> LastDSReadPositionMap;
3457 unsigned DSReadPosition = 0;
3458 bool IsSingleBlock =
ML->getNumBlocks() == 1;
3459 bool TrackDSFlushPoint =
ST->hasExtendedWaitCounts() && IsSingleBlock;
3460 unsigned LastDSFlushPosition = 0;
3462 for (MachineBasicBlock *
MBB :
ML->blocks()) {
3463 for (MachineInstr &
MI : *
MBB) {
3464 if (isVMEMOrFlatVMEM(
MI)) {
3465 HasVMemLoad |=
MI.mayLoad();
3466 HasVMemStore |=
MI.mayStore();
3470 if (mayStoreIncrementingDSCNT(
MI)) {
3473 if (VMemInvalidated)
3475 TrackSimpleDSOpt =
false;
3476 TrackDSFlushPoint =
false;
3478 bool IsDSRead = isDSRead(
MI);
3483 auto updateDSReadFlushTracking = [&](MCRegUnit RU) {
3484 if (!TrackDSFlushPoint)
3486 if (
auto It = LastDSReadPositionMap.
find(RU);
3487 It != LastDSReadPositionMap.
end()) {
3491 LastDSFlushPosition = std::max(LastDSFlushPosition, It->second);
3495 for (
const MachineOperand &
Op :
MI.all_uses()) {
3496 if (
Op.isDebug() || !
TRI->isVectorRegister(*MRI,
Op.getReg()))
3499 for (MCRegUnit RU :
TRI->regunits(
Op.getReg().asMCReg())) {
3503 VMemInvalidated =
true;
3507 TrackSimpleDSOpt =
false;
3510 if (VMemInvalidated && !TrackSimpleDSOpt && !TrackDSFlushPoint)
3514 updateDSReadFlushTracking(RU);
3519 VMEMID
ID = toVMEMID(RU);
3523 UsesVgprVMEMLoadedOutside =
true;
3527 else if (Brackets.hasPendingVMEM(
ID,
DS_CNT))
3528 UsesVgprDSReadOutside =
true;
3533 if (isVMEMOrFlatVMEM(
MI) &&
MI.mayLoad()) {
3534 for (
const MachineOperand &
Op :
MI.all_defs()) {
3535 for (MCRegUnit RU :
TRI->regunits(
Op.getReg().asMCReg())) {
3539 VMemInvalidated =
true;
3544 if (VMemInvalidated && !TrackSimpleDSOpt && !TrackDSFlushPoint)
3555 if (IsDSRead || TrackDSFlushPoint) {
3556 for (
const MachineOperand &
Op :
MI.all_defs()) {
3557 if (!
TRI->isVectorRegister(*MRI,
Op.getReg()))
3559 for (MCRegUnit RU :
TRI->regunits(
Op.getReg().asMCReg())) {
3562 updateDSReadFlushTracking(RU);
3565 if (TrackDSFlushPoint)
3566 LastDSReadPositionMap[RU] = DSReadPosition;
3575 if (!VMemInvalidated && UsesVgprVMEMLoadedOutside &&
3576 ((!
ST->hasVscnt() && HasVMemStore && !HasVMemLoad) ||
3577 (HasVMemLoad &&
ST->hasVmemWriteVgprInOrder())))
3578 Flags.FlushVmCnt =
true;
3584 bool SimpleDSOpt = TrackSimpleDSOpt && UsesVgprDSReadOutside;
3587 bool HasUnflushedDSReads = DSReadPosition > LastDSFlushPosition;
3588 bool DSFlushPointPrefetch =
3589 TrackDSFlushPoint && UsesVgprDSReadOutside && HasUnflushedDSReads;
3591 if (SimpleDSOpt || DSFlushPointPrefetch)
3592 Flags.FlushDsCnt =
true;
3597bool SIInsertWaitcntsLegacy::runOnMachineFunction(MachineFunction &MF) {
3598 auto *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
3600 &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
3602 if (
auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
3603 AA = &AAR->getAAResults();
3605 return SIInsertWaitcnts(MLI, PDT, AA).run(MF);
3617 if (!SIInsertWaitcnts(MLI, PDT,
AA).
run(MF))
3622 .preserve<AAManager>();
3627 TII = ST->getInstrInfo();
3628 TRI = &
TII->getRegisterInfo();
3637 if (ST->hasExtendedWaitCounts()) {
3638 IsExpertMode = ST->hasExpertSchedulingMode() &&
3646 WCG = std::make_unique<WaitcntGeneratorGFX12Plus>(MF, MaxCounter, &Limits,
3656 ForceEmitWaitcnt[
T] =
false;
3658 SmemAccessCounter = getCounterFromEvent(SMEM_ACCESS);
3663 MachineBasicBlock &EntryBB = MF.
front();
3673 while (
I != EntryBB.
end() &&
I->isMetaInstruction())
3676 if (
ST->hasExtendedWaitCounts()) {
3683 if (!
ST->hasImageInsts() &&
3688 TII->get(instrsForExtendedCounterTypes[CT]))
3701 auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(
this);
3702 NonKernelInitialState->setStateOnFunctionEntryOrReturn();
3703 BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
  // Keep iterating over the blocks in reverse post order, inserting and
  // updating s_waitcnt where needed, until a fix point is reached.
  for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
    BlockInfos.try_emplace(MBB);

  std::unique_ptr<WaitcntBrackets> Brackets;
  bool Repeat;
  do {
    Repeat = false;

    for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
         ++BII) {
      MachineBasicBlock *MBB = BII->first;
      BlockInfo &BI = BII->second;
      if (!BI.Dirty)
        continue;

      if (BI.Incoming) {
        if (!Brackets)
          Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
        else
          *Brackets = *BI.Incoming;
      } else {
        if (!Brackets) {
          Brackets = std::make_unique<WaitcntBrackets>(this);
        } else {
          // Reinitialize in-place; the WaitcntBrackets class is large, so
          // avoid copying it through a temporary.
          Brackets->~WaitcntBrackets();
          new (Brackets.get()) WaitcntBrackets(this);
        }
      }

      if (ST->hasWaitXcnt())
        /* ... */;

      Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets);
      BI.Dirty = false;

      if (Brackets->hasPendingEvent()) {
        BlockInfo *MoveBracketsToSucc = nullptr;
        for (MachineBasicBlock *Succ : MBB->successors()) {
          auto *SuccBII = BlockInfos.find(Succ);
          BlockInfo &SuccBI = SuccBII->second;
          if (!SuccBI.Incoming) {
            SuccBI.Dirty = true;
            if (SuccBII <= BII) {
              // ...
              Repeat = true;
            }
            if (!MoveBracketsToSucc) {
              MoveBracketsToSucc = &SuccBI;
            } else {
              SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
            }
          } else {
            LLVM_DEBUG({
              dbgs() << "Try to merge ";
              // ...
            });
            if (SuccBI.Incoming->merge(*Brackets)) {
              SuccBI.Dirty = true;
              if (SuccBII <= BII) {
                // ...
                Repeat = true;
              }
            }
          }
        }
        if (MoveBracketsToSucc)
          MoveBracketsToSucc->Incoming = std::move(Brackets);
      }
    }
  } while (Repeat);
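// The loop above is a standard forward dataflow fixed point: blocks are
// visited in reverse post order, each dirty block is rescanned with its
// incoming bracket state, and any successor whose merged state grows is
// re-marked dirty. Repeat is only set when such a successor sits at or
// before the current iterator, i.e. across a backedge, so an acyclic CFG
// converges in a single sweep and loops iterate until the brackets
// stabilize.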
  if (ST->hasScalarStores()) {
    SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
    bool HaveScalarStores = false;

    for (MachineBasicBlock &MBB : MF) {
      for (MachineInstr &MI : MBB) {
        if (!HaveScalarStores && TII->isScalarStore(MI))
          HaveScalarStores = true;

        if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
            MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
          EndPgmBlocks.push_back(&MBB);
      }
    }

    if (HaveScalarStores) {
      // If scalar writes are used, the cache must be flushed or else the next
      // wave to reuse the same scratch memory can be clobbered.
      // ...
      for (MachineBasicBlock *MBB : EndPgmBlocks) {
        bool SeenDCacheWB = false;

        for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
             I != E; ++I) {
          if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
            SeenDCacheWB = true;
          else if (TII->isScalarStore(*I))
            SeenDCacheWB = false;

          // ...
          if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
               I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
              !SeenDCacheWB) {
            Modified = true;
            BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
          }
        }
      }
    }
  }
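// Sketch of the resulting code shape for a wave that performed scalar
// stores (illustrative, not taken from this file):
//
//   s_store_dword s0, s[4:5], 0x0
//   ...
//   s_dcache_wb                     ; inserted only if no flush was seen
//   s_endpgm
//
// An s_dcache_wb already sitting between the last scalar store and the
// terminator suppresses the insertion via SeenDCacheWB.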
  if (IsExpertMode) {
    // ...
    I = EntryBB.begin();
    while (I != EntryBB.end() && I->isMetaInstruction())
      ++I;
    setSchedulingMode(EntryBB, I, true);

    for (MachineInstr *MI : CallInsts) {
      MachineBasicBlock &MBB = *MI->getParent();
      setSchedulingMode(MBB, MI, false);
      setSchedulingMode(MBB, std::next(MI->getIterator()), true);
    }

    for (MachineInstr *MI : ReturnInsts)
      setSchedulingMode(*MI->getParent(), MI, false);
  }
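// Net effect of the code above: expert scheduling is enabled from the first
// real instruction of the entry block, disabled immediately before each
// call and re-enabled right after it, and disabled again before every
// return, so a callee never starts executing in the caller's mode.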
  // Deallocate the VGPRs at the previously identified program-end points.
  // ...
  if (/* ... dynamic VGPR mode ... */) {
    for (auto [MI, _] : EndPgmInsts) {
      // ...
      BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
              TII->get(AMDGPU::S_ALLOC_VGPR))
          .addImm(0);
      // ...
    }
  } else if (!WCG->isOptNone() &&
             ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
             (MF.getFrameInfo().hasCalls() ||
              ST->getOccupancyWithNumVGPRs(
                  TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass),
                  /* ... */) < AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) {
    for (auto [MI, Flag] : EndPgmInsts) {
      // ...
      if (ST->requiresNopBeforeDeallocVGPRs()) {
        BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
                TII->get(AMDGPU::S_NOP))
            .addImm(0);
      }
      BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
              TII->get(AMDGPU::S_SENDMSG))
          .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
      // ...
    }
  }

  CallInsts.clear();
  ReturnInsts.clear();
  EndPgmInsts.clear();
  PreheadersToFlush.clear();
  SLoadAddresses.clear();
  // ...

  return Modified;
}
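// For the sendmsg path the rewritten epilogue looks roughly like this on
// GFX11+ (a sketch):
//
//   s_nop 0                                ; only if the subtarget requires
//                                          ; a NOP before the dealloc message
//   s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
//   s_endpgm
//
// which hands the wave's VGPRs back to the allocator slightly before the
// wave itself retires.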