31#define DEBUG_TYPE "igrouplp"
37 cl::desc(
"Whether to use the exponential time solver to fit "
38 "the instructions to the pipeline as closely as "
44 cl::desc(
"The maximum number of scheduling group conflicts "
45 "which we attempt to solve with the exponential time "
46 "exact solver. Problem sizes greater than this will"
47 "be solved by the less accurate greedy algorithm. Selecting "
48 "solver by size is superseded by manually selecting "
49 "the solver (e.g. by amdgpu-igrouplp-exact-solver"));
53 cl::desc(
"The amount of branches that we are willing to explore with"
54 "the exact algorithm before giving up."));
58 cl::desc(
"Whether to use the cost heuristic to make choices as we "
59 "traverse the search space using the exact solver. Defaulted "
60 "to on, and if turned off, we will use the node order -- "
61 "attempting to put the later nodes in the later sched groups. "
62 "Experimentally, results are mixed, so this should be set on a "
63 "case-by-case basis."));
67enum class SchedGroupMask {
80 ALL = ALU | VALU | SALU |
MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS |
81 DS_READ | DS_WRITE | TRANS,
90class InstructionRule {
96 std::optional<SmallVector<SUnit *, 4>> Cache;
106 bool NeedsCache =
false)
113 virtual ~InstructionRule() =
default;
126 SchedGroupMask SGMask;
129 std::optional<unsigned> MaxSize;
// Monotonically increasing counter used to hand out a unique SGID to every
// SchedGroup (see the constructors: SGID = NumSchedGroups++). Being static,
// IDs are unique across all groups, not just within one sync pipeline.
 142 static unsigned NumSchedGroups;
159 bool canAddSU(
SUnit &SU)
const;
164 void link(
SUnit &SU,
bool MakePred =
false);
168 int link(
SUnit &SU,
bool MakePred,
169 std::list<std::pair<SUnit *, SUnit *>> &AddedEdges);
178 void link(SchedGroup &OtherGroup);
181 bool isFull()
const {
return MaxSize && Collection.
size() >= *MaxSize; }
187 void addRule(std::shared_ptr<InstructionRule> NewRule) {
192 bool allowedByRules(
const SUnit *SU,
194 for (
auto &Rule : Rules) {
195 if (!Rule->apply(SU, Collection, SyncPipe))
202 void add(
SUnit &SU) {
204 <<
format_hex((
int)SGMask, 10,
true) <<
" adding "
210 void pop() { Collection.
pop_back(); }
213 void findCandidateSUnits(
T Begin,
T End,
214 SUnitsToCandidateSGsMap &SyncedInstrs);
219 void findCandidateSUnits(SUnitsToCandidateSGsMap &SyncedInstrs);
221 int getSyncID() {
return SyncID; }
223 int getSGID() {
return SGID; }
225 SchedGroupMask
getMask() {
return SGMask; }
227 SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize,
229 : SGMask(SGMask), MaxSize(MaxSize), DAG(DAG),
TII(
TII) {
230 SGID = NumSchedGroups++;
233 SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize,
int SyncID,
235 : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), DAG(DAG),
TII(
TII) {
236 SGID = NumSchedGroups++;
// Pairs an instruction (SUnit) with the IDs of the SchedGroups it is a
// candidate for; the second element is iterated as CandSGID values by the
// solver (see populateReadyList / greedyFind).
 240using SUToCandSGsPair = std::pair<SUnit *, SmallVector<int, 4>>;
252class PipelineSolver {
265 bool NeedsSolver =
false;
269 unsigned computeProblemSize();
280 int CurrConflInstNo = 0;
282 int CurrSyncGroupIdx = 0;
284 int BeginSyncGroupIdx = 0;
290 bool IsBottomUp =
true;
293 void advancePosition();
296 void retreatPosition();
305 template <
typename T>
306 void greedyFind(std::list<std::pair<SUnit *, SUnit *>> &AddedEdges,
T I,
T E);
311 template <
typename T>
318 template <
typename T>
void linkSchedGroups(
T I,
T E);
322 std::list<std::pair<SUnit *, SUnit *>> &AddedEdges);
326 template <
typename T>
327 int linkSUnit(
SUnit *SU,
int SGID,
328 std::list<std::pair<SUnit *, SUnit *>> &AddedEdges,
T I,
T E);
330 void removeEdges(
const std::list<std::pair<SUnit *, SUnit *>> &AddedEdges);
332 void convertSyncMapsToArrays();
344 : DAG(DAG), SyncedInstrs(SyncedInstrs),
345 SyncedSchedGroups(SyncedSchedGroups), IsBottomUp(IsBottomUp) {
347 for (
auto &PipelineInstrs : SyncedInstrs) {
348 if (PipelineInstrs.second.
size() > 0) {
357 convertSyncMapsToArrays();
359 CurrPipeline = BestPipeline;
361 while (
static_cast<size_t>(BeginSyncGroupIdx) < PipelineInstrs.
size() &&
362 PipelineInstrs[BeginSyncGroupIdx].
size() == 0)
365 if (
static_cast<size_t>(BeginSyncGroupIdx) >= PipelineInstrs.
size())
370void PipelineSolver::reset() {
372 for (
auto &SyncPipeline : CurrPipeline) {
373 for (
auto &SG : SyncPipeline) {
375 SG.Collection.
clear();
379 if (SchedBarr != TempCollection.
end())
380 SG.Collection.push_back(*SchedBarr);
384 CurrSyncGroupIdx = BeginSyncGroupIdx;
389void PipelineSolver::convertSyncMapsToArrays() {
390 for (
auto &SyncPipe : SyncedSchedGroups) {
391 BestPipeline.insert(BestPipeline.begin(), SyncPipe.second);
394 int PipelineIDx = SyncedInstrs.size() - 1;
395 PipelineInstrs.resize(SyncedInstrs.size());
396 for (
auto &SyncInstrMap : SyncedInstrs) {
397 for (
auto &SUsToCandSGs : SyncInstrMap.second) {
398 if (PipelineInstrs[PipelineIDx].
size() == 0) {
399 PipelineInstrs[PipelineIDx].push_back(
400 std::pair(SUsToCandSGs.first, SUsToCandSGs.second));
403 auto *SortPosition = PipelineInstrs[PipelineIDx].begin();
406 while (SortPosition != PipelineInstrs[PipelineIDx].end() &&
407 SUsToCandSGs.first->NodeNum > SortPosition->first->NodeNum)
409 PipelineInstrs[PipelineIDx].insert(
410 SortPosition, std::pair(SUsToCandSGs.first, SUsToCandSGs.second));
416template <
typename T>
void PipelineSolver::linkSchedGroups(
T I,
T E) {
417 for (;
I !=
E; ++
I) {
419 for (
auto J = std::next(
I); J !=
E; ++J) {
426void PipelineSolver::makePipeline() {
428 for (
auto &SyncPipeline : BestPipeline) {
430 for (
auto &SG : SyncPipeline) {
433 SUnit *SGBarr =
nullptr;
434 for (
auto &SU : SG.Collection) {
435 if (SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
442 SG.link(*SGBarr,
false);
446 for (
auto &SyncPipeline : BestPipeline) {
447 IsBottomUp ? linkSchedGroups(SyncPipeline.rbegin(), SyncPipeline.rend())
448 : linkSchedGroups(SyncPipeline.begin(), SyncPipeline.end());
453int PipelineSolver::linkSUnit(
454 SUnit *SU,
int SGID, std::list<std::pair<SUnit *, SUnit *>> &AddedEdges,
456 bool MakePred =
false;
459 if (
I->getSGID() == SGID) {
464 AddedCost += Group.link(*SU, MakePred, AddedEdges);
470int PipelineSolver::addEdges(
472 std::list<std::pair<SUnit *, SUnit *>> &AddedEdges) {
482 return IsBottomUp ? linkSUnit(SU, SGID, AddedEdges, SyncPipeline.
rbegin(),
484 : linkSUnit(SU, SGID, AddedEdges, SyncPipeline.
begin(),
488void PipelineSolver::removeEdges(
489 const std::list<std::pair<SUnit *, SUnit *>> &EdgesToRemove) {
492 for (
auto &PredSuccPair : EdgesToRemove) {
493 SUnit *Pred = PredSuccPair.first;
494 SUnit *Succ = PredSuccPair.second;
497 Succ->
Preds, [&Pred](
SDep &
P) { return P.getSUnit() == Pred; });
498 if (Match != Succ->
Preds.end()) {
499 assert(Match->isArtificial());
505void PipelineSolver::advancePosition() {
508 if (
static_cast<size_t>(CurrConflInstNo) >=
509 PipelineInstrs[CurrSyncGroupIdx].
size()) {
513 while (
static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size() &&
514 PipelineInstrs[CurrSyncGroupIdx].size() == 0)
519void PipelineSolver::retreatPosition() {
520 assert(CurrConflInstNo >= 0);
521 assert(CurrSyncGroupIdx >= 0);
523 if (CurrConflInstNo > 0) {
528 if (CurrConflInstNo == 0) {
531 if (CurrSyncGroupIdx == BeginSyncGroupIdx)
536 while (PipelineInstrs[CurrSyncGroupIdx].
size() == 0)
539 CurrConflInstNo = PipelineInstrs[CurrSyncGroupIdx].size() - 1;
543bool PipelineSolver::checkOptimal() {
544 if (
static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) {
545 if (BestCost == -1 || CurrCost < BestCost) {
546 BestPipeline = CurrPipeline;
553 bool DoneExploring =
false;
554 if (MaxBranchesExplored > 0 && BranchesExplored >= MaxBranchesExplored)
555 DoneExploring =
true;
557 return (DoneExploring || BestCost == 0);
561void PipelineSolver::populateReadyList(
563 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
564 auto SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
565 assert(CurrSU.second.size() >= 1);
567 for (;
I !=
E; ++
I) {
568 std::list<std::pair<SUnit *, SUnit *>> AddedEdges;
570 SchedGroup *Match =
llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) {
571 return SG.getSGID() == CandSGID;
576 if (Match->isFull()) {
577 ReadyList.push_back(std::pair(*
I, MissPenalty));
581 int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
582 ReadyList.push_back(std::pair(*
I, TempCost));
583 removeEdges(AddedEdges);
585 ReadyList.push_back(std::pair(*
I, -1));
591 assert(ReadyList.size() == CurrSU.second.size());
594bool PipelineSolver::solveExact() {
598 if (
static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size())
601 assert(
static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size());
602 assert(
static_cast<size_t>(CurrConflInstNo) <
603 PipelineInstrs[CurrSyncGroupIdx].
size());
604 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
606 <<
") in Pipeline # " << CurrSyncGroupIdx <<
"\n");
611 IsBottomUp ? populateReadyList(ReadyList, CurrSU.second.
rbegin(),
612 CurrSU.second.rend())
613 : populateReadyList(ReadyList, CurrSU.second.
begin(),
614 CurrSU.second.end());
616 auto *
I = ReadyList.
begin();
617 auto *
E = ReadyList.
end();
618 for (;
I !=
E; ++
I) {
622 if (BestCost != -1 && (CurrCost +
I->second > BestCost))
625 int CandSGID =
I->first;
627 std::list<std::pair<SUnit *, SUnit *>> AddedEdges;
628 auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
630 for (
auto &SG : SyncPipeline) {
631 if (SG.getSGID() == CandSGID)
638 if (!Match->allowedByRules(CurrSU.first, SyncPipeline))
642 << (
int)Match->getMask() <<
"and ID " << CandSGID
644 Match->add(*CurrSU.first);
645 AddedCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
646 LLVM_DEBUG(
dbgs() <<
"Cost of Assignment: " << AddedCost <<
"\n");
647 CurrCost += AddedCost;
650 bool FinishedExploring =
false;
653 if (CurrCost < BestCost || BestCost == -1) {
655 FinishedExploring = BestCost != 0;
656 if (!FinishedExploring)
662 CurrCost -= AddedCost;
663 removeEdges(AddedEdges);
665 CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
666 if (FinishedExploring)
673 CurrCost += MissPenalty;
676 LLVM_DEBUG(
dbgs() <<
"NOT Assigned (" << CurrSU.first->NodeNum <<
")\n");
678 bool FinishedExploring =
false;
679 if (CurrCost < BestCost || BestCost == -1) {
681 bool FinishedExploring = BestCost != 0;
682 if (!FinishedExploring)
688 CurrCost -= MissPenalty;
689 return FinishedExploring;
693void PipelineSolver::greedyFind(
694 std::list<std::pair<SUnit *, SUnit *>> &AddedEdges,
T I,
T E) {
695 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
696 int BestNodeCost = -1;
698 SchedGroup *BestGroup =
nullptr;
699 int BestGroupID = -1;
700 std::list<std::pair<SUnit *, SUnit *>> BestEdges;
701 auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
703 <<
") in Pipeline # " << CurrSyncGroupIdx <<
"\n");
709 for (;
I !=
E; ++
I) {
711 SchedGroup *Match =
llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) {
712 return SG.getSGID() == CandSGID;
716 LLVM_DEBUG(
dbgs() <<
"Trying SGID # " << CandSGID <<
" with Mask "
717 << (
int)Match->getMask() <<
"\n");
719 if (Match->isFull()) {
723 if (!Match->allowedByRules(CurrSU.first, SyncPipeline)) {
724 LLVM_DEBUG(
dbgs() <<
"SGID # " << CandSGID <<
" has conflicting rule\n");
728 std::list<std::pair<SUnit *, SUnit *>> TempEdges;
729 TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, TempEdges);
732 if (TempCost < BestNodeCost || BestNodeCost == -1) {
733 BestEdges = TempEdges;
735 BestNodeCost = TempCost;
736 BestGroupID = CandSGID;
738 if (BestNodeCost == 0)
742 removeEdges(TempEdges);
745 if (BestGroupID != -1) {
746 BestGroup->add(*CurrSU.first);
747 if (AddedEdges.empty())
748 AddedEdges = BestEdges;
750 AddedEdges.splice(std::prev(AddedEdges.cend()), BestEdges);
752 for (
const std::pair<SUnit *, SUnit *> &
E : BestEdges) {
753 if (!BestGroup->tryAddEdge(
E.first,
E.second))
757 LLVM_DEBUG(
dbgs() <<
"Best Group has ID: " << BestGroupID <<
" and Mask"
758 << (
int)BestGroup->getMask() <<
"\n");
759 BestCost += TempCost;
761 BestCost += MissPenalty;
763 CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
766bool PipelineSolver::solveGreedy() {
768 std::list<std::pair<SUnit *, SUnit *>> AddedEdges;
770 while (
static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) {
771 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
773 ? greedyFind(AddedEdges, CurrSU.second.rbegin(), CurrSU.second.rend())
774 : greedyFind(AddedEdges, CurrSU.second.begin(), CurrSU.second.end());
777 BestPipeline = CurrPipeline;
778 removeEdges(AddedEdges);
// Totals the number of conflicted instructions across every sync pipeline.
// The result drives solver selection (compared against CutoffForExact) and
// the MissPenalty computation in solve().
 782unsigned PipelineSolver::computeProblemSize() {
 783 unsigned ProblemSize = 0;
 784 for (
auto &PipeConflicts : PipelineInstrs) {
 785 ProblemSize += PipeConflicts.size();
791void PipelineSolver::solve() {
795 unsigned ProblemSize = computeProblemSize();
798 bool BelowCutoff = (CutoffForExact > 0) && ProblemSize <= CutoffForExact;
799 MissPenalty = (ProblemSize / 2) + 1;
802 if (EnableExactSolver || BelowCutoff) {
806 LLVM_DEBUG(
dbgs() <<
"Greedy produced best cost of " << BestCost <<
"\n");
810 LLVM_DEBUG(
dbgs() <<
"Exact produced best cost of " << BestCost <<
"\n");
// Identifiers selecting which IGLP strategy to apply. NOTE(review): the
// explicit `= N` assignments suggest these values form an external contract
// (e.g. an intrinsic operand) — confirm before renumbering or reordering.
 822enum IGLPStrategyID :
int {
 823 MFMASmallGemmOptID = 0,
 824 MFMASmallGemmSingleWaveOptID = 1,
 825 MFMAExpInterleaveID = 2,
 826 MFMAExpSimpleInterleaveID = 3
838 virtual bool applyIGLPStrategy(
847 bool IsBottomUp =
true;
852 virtual ~IGLPStrategy() =
default;
855class MFMASmallGemmOpt final :
public IGLPStrategy {
858 bool applyIGLPStrategy(
869 : IGLPStrategy(DAG,
TII) {
874bool MFMASmallGemmOpt::applyIGLPStrategy(
879 unsigned MFMACount = 0;
881 if (
TII->isMFMAorWMMA(
I))
884 const unsigned PipelineSyncID = 0;
885 SchedGroup *SG =
nullptr;
886 for (
unsigned I = 0;
I < MFMACount * 3; ++
I) {
887 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
888 SchedGroupMask::DS, 2, PipelineSyncID, DAG,
TII);
889 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
891 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
892 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG,
TII);
893 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
899class MFMAExpInterleaveOpt final :
public IGLPStrategy {
902 static unsigned TransPipeCount;
904 static unsigned MFMAPipeCount;
906 static unsigned AddPipeCount;
908 static unsigned MFMAEnablement;
910 static unsigned ExpRequirement;
912 static unsigned MFMAChains;
914 static unsigned MFMAChainLength;
919 static bool HasChainBetweenCvt;
921 static std::optional<unsigned> FirstPipeDSR;
930 class IsPipeExp final :
public InstructionRule {
935 auto *DAG = SyncPipe[0].DAG;
937 if (Cache->empty()) {
938 auto I = DAG->SUnits.rbegin();
939 auto E = DAG->SUnits.rend();
940 for (;
I !=
E;
I++) {
941 if (
TII->isMFMAorWMMA(*
I->getInstr()))
942 Cache->push_back(&*
I);
948 auto Reaches =
any_of(*Cache, [&SU, &DAG](
SUnit *TargetSU) {
949 return DAG->IsReachable(TargetSU,
const_cast<SUnit *
>(SU));
954 IsPipeExp(
const SIInstrInfo *
TII,
unsigned SGID,
bool NeedsCache =
false)
955 : InstructionRule(
TII, SGID, NeedsCache) {}
960 class EnablesNthMFMA final :
public InstructionRule {
967 bool FoundTrans =
false;
968 unsigned Counter = 1;
969 auto *DAG = SyncPipe[0].DAG;
971 if (Cache->empty()) {
972 auto I = DAG->SUnits.begin();
973 auto E = DAG->SUnits.end();
974 for (;
I !=
E;
I++) {
975 if (FoundTrans &&
TII->isMFMAorWMMA(*
I->getInstr())) {
977 Cache->push_back(&*
I);
982 if (!FoundTrans &&
TII->isTRANS(
I->getInstr()->getOpcode()))
989 return DAG->IsReachable((*Cache)[0],
const_cast<SUnit *
>(SU));
993 bool NeedsCache =
false)
999 class EnablesNthMFMAInChain final :
public InstructionRule {
1007 auto *DAG = SyncPipe[0].DAG;
1009 if (!SU || !
TII->isMFMAorWMMA(*ChainSeed->
getInstr()))
1012 if (Cache->empty()) {
1013 auto *TempSU = ChainSeed;
1018 for (
auto &Succ : TempSU->Succs) {
1019 if (
TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr())) {
1020 TempSU = Succ.getSUnit();
1029 Cache->push_back(TempSU);
1035 return DAG->IsReachable((*Cache)[0],
const_cast<SUnit *
>(SU));
1038 EnablesNthMFMAInChain(
unsigned Number,
SUnit *ChainSeed,
1040 bool NeedsCache =
false)
1042 ChainSeed(ChainSeed) {}
1048 class LessThanNSuccs final :
public InstructionRule {
1051 bool HasIntermediary =
false;
1056 if (!SyncPipe.
size())
1060 return Succ.getKind() == SDep::Data;
1062 if (SuccSize >=
Size)
1065 if (HasIntermediary) {
1066 for (
auto Succ : SU->
Succs) {
1069 return SuccSucc.getKind() == SDep::Data;
1071 if (SuccSize >=
Size)
1079 bool HasIntermediary =
false,
bool NeedsCache =
false)
1080 : InstructionRule(
TII, SGID, NeedsCache),
Size(
Size),
1081 HasIntermediary(HasIntermediary) {}
1088 class GreaterThanOrEqualToNSuccs final :
public InstructionRule {
1091 bool HasIntermediary =
false;
1096 if (!SyncPipe.
size())
1100 return Succ.getKind() == SDep::Data;
1102 if (SuccSize >=
Size)
1105 if (HasIntermediary) {
1106 for (
auto Succ : SU->
Succs) {
1109 return SuccSucc.getKind() == SDep::Data;
1111 if (SuccSize >=
Size)
1119 unsigned SGID,
bool HasIntermediary =
false,
1120 bool NeedsCache =
false)
1121 : InstructionRule(
TII, SGID, NeedsCache),
Size(
Size),
1122 HasIntermediary(HasIntermediary) {}
1126 class IsCvt final :
public InstructionRule {
1131 return Opc == AMDGPU::V_CVT_F16_F32_e32 ||
1132 Opc == AMDGPU::V_CVT_I32_F32_e32;
1134 IsCvt(
const SIInstrInfo *
TII,
unsigned SGID,
bool NeedsCache =
false)
1135 : InstructionRule(
TII, SGID, NeedsCache) {}
1139 class IsFMA final :
public InstructionRule {
1146 IsFMA(
const SIInstrInfo *
TII,
unsigned SGID,
bool NeedsCache =
false)
1147 : InstructionRule(
TII, SGID, NeedsCache) {}
1151 class IsPipeAdd final :
public InstructionRule {
1157 IsPipeAdd(
const SIInstrInfo *
TII,
unsigned SGID,
bool NeedsCache =
false)
1158 : InstructionRule(
TII, SGID, NeedsCache) {}
1163 class IsSuccOfPrevNthGroup final :
public InstructionRule {
1165 unsigned Distance = 1;
1170 SchedGroup *OtherGroup =
nullptr;
1171 if (!SyncPipe.
size())
1174 for (
auto &PipeSG : SyncPipe) {
1175 if ((
unsigned)PipeSG.getSGID() == SGID - Distance)
1176 OtherGroup = &PipeSG;
1181 if (!OtherGroup->Collection.size())
1184 for (
auto &OtherEle : OtherGroup->Collection) {
1185 for (
auto &Succ : OtherEle->Succs) {
1186 if (Succ.getSUnit() == SU && Succ.getKind() ==
SDep::Data)
1194 unsigned SGID,
bool NeedsCache =
false)
1195 : InstructionRule(
TII, SGID, NeedsCache), Distance(Distance) {}
1200 class IsReachableFromPrevNthGroup final :
public InstructionRule {
1202 unsigned Distance = 1;
1207 SchedGroup *OtherGroup =
nullptr;
1208 if (!SyncPipe.
size())
1211 for (
auto &PipeSG : SyncPipe) {
1212 if ((
unsigned)PipeSG.getSGID() == SGID - Distance)
1213 OtherGroup = &PipeSG;
1218 if (!OtherGroup->Collection.size())
1221 auto *DAG = SyncPipe[0].DAG;
1223 for (
auto &OtherEle : OtherGroup->Collection)
1224 if (DAG->IsReachable(
const_cast<SUnit *
>(SU), OtherEle))
1229 IsReachableFromPrevNthGroup(
unsigned Distance,
const SIInstrInfo *
TII,
1230 unsigned SGID,
bool NeedsCache =
false)
1231 : InstructionRule(
TII, SGID, NeedsCache), Distance(Distance) {}
1235 class OccursAtOrAfterNode final :
public InstructionRule {
1246 bool NeedsCache =
false)
1252 class IsExactMFMA final :
public InstructionRule {
1260 if (!SU || !
TII->isMFMAorWMMA(*ChainSeed->
getInstr()))
1263 if (Cache->empty()) {
1264 auto *TempSU = ChainSeed;
1269 for (
auto &Succ : TempSU->Succs) {
1270 if (
TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr())) {
1271 TempSU = Succ.getSUnit();
1280 Cache->push_back(TempSU);
1286 return (*Cache)[0] == SU;
1290 unsigned SGID,
bool NeedsCache =
false)
1292 ChainSeed(ChainSeed) {}
1298 class OccursAfterExp final :
public InstructionRule {
1303 auto *DAG = SyncPipe[0].DAG;
1304 if (Cache->empty()) {
1305 for (
auto &SU : DAG->SUnits)
1307 Cache->push_back(&SU);
1314 return SU->
NodeNum > (*Cache)[0]->NodeNum;
1318 bool NeedsCache =
false)
1319 : InstructionRule(
TII, SGID, NeedsCache) {}
1323 bool applyIGLPStrategy(
1332 : IGLPStrategy(DAG,
TII) {
// Out-of-line definitions of MFMAExpInterleaveOpt's static analysis state,
// zero/empty-initialized here and recomputed by the strategy's per-DAG
// analysis (e.g. FirstPipeDSR is assigned a DS-load predecessor's NodeNum
// during the scan of MFMAChainSeeds[0]->Preds).
 1337unsigned MFMAExpInterleaveOpt::TransPipeCount = 0;
 1338unsigned MFMAExpInterleaveOpt::MFMAPipeCount = 0;
 1339unsigned MFMAExpInterleaveOpt::AddPipeCount = 0;
 1340unsigned MFMAExpInterleaveOpt::MFMAEnablement = 0;
 1341unsigned MFMAExpInterleaveOpt::ExpRequirement = 0;
 1342unsigned MFMAExpInterleaveOpt::MFMAChains = 0;
 1343unsigned MFMAExpInterleaveOpt::MFMAChainLength = 0;
// Flags describing the presence/topology of V_CVT conversions in the pipe.
 1344bool MFMAExpInterleaveOpt::HasCvt =
false;
 1345bool MFMAExpInterleaveOpt::HasChainBetweenCvt =
false;
// nullopt until a qualifying DS-read predecessor is found.
 1346std::optional<unsigned> MFMAExpInterleaveOpt::FirstPipeDSR = std::nullopt;
1355 auto isBitPack = [](
unsigned Opc) {
1356 return Opc == AMDGPU::V_PACK_B32_F16_e64 ||
Opc == AMDGPU::V_PERM_B32_e64;
1359 auto isCvt = [](
unsigned Opc) {
1360 return Opc == AMDGPU::V_CVT_F16_F32_e32 ||
Opc == AMDGPU::V_CVT_I32_F32_e32;
1363 auto isAdd = [](
unsigned Opc) {
return Opc == AMDGPU::V_ADD_F32_e32; };
1366 for (
SUnit &SU : DAG->SUnits) {
1370 if (SU.
Succs.size() >= 7)
1372 for (
auto &Succ : SU.
Succs) {
1373 if (Succ.getSUnit()->Succs.size() >= 7)
1392 if (!(PackSUs.
size() && MFMAPipeCands.
size() && ExpPipeCands.
size()))
1397 std::optional<SUnit *> TempMFMA;
1398 std::optional<SUnit *> TempExp;
1400 for (
auto &PredSU : ExpPipeCands) {
1401 for (
auto &SuccSU : MFMAPipeCands) {
1402 if (DAG->IsReachable(SuccSU, PredSU)) {
1414 if (!(TempExp && TempMFMA))
1417 HasChainBetweenCvt =
none_of((*TempExp)->Succs, [&isCvt](
SDep &Succ) {
1418 return isCvt(Succ.getSUnit()->getInstr()->getOpcode());
1422 for (
auto &SuccSU : MFMAPipeCands) {
1423 if (MFMAPipeSUs.
size() &&
1424 any_of(MFMAPipeSUs, [&SuccSU](
SUnit *PotentialMatch) {
1425 return PotentialMatch->
NodeNum == SuccSU->NodeNum;
1429 for (
auto &PredSU : ExpPipeCands) {
1430 if (DAG->IsReachable(SuccSU, PredSU)) {
1437 MFMAPipeCount = MFMAPipeSUs.
size();
1439 assert(TempExp && TempMFMA);
1440 assert(MFMAPipeCount > 0);
1442 std::optional<SUnit *> TempCvt;
1443 for (
auto &SuccSU : CvtSUs) {
1444 if (DAG->IsReachable(SuccSU, *TempExp)) {
1451 if (TempCvt.has_value()) {
1452 for (
auto &SuccSU : MFMAPipeSUs) {
1453 if (DAG->IsReachable(SuccSU, *TempCvt)) {
1461 for (
auto &MFMAPipeSU : MFMAPipeSUs) {
1465 return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr());
1467 MFMAChainSeeds.push_back(MFMAPipeSU);
1475 for (
auto Pred : MFMAChainSeeds[0]->Preds) {
1476 if (
TII->isDS(Pred.getSUnit()->getInstr()->getOpcode()) &&
1477 Pred.getSUnit()->getInstr()->mayLoad())
1478 FirstPipeDSR = Pred.getSUnit()->NodeNum;
1481 MFMAChainLength = MFMAPipeCount / MFMAChains;
1484 unsigned PackSuccCount =
1486 return DAG->IsReachable(VPack, *TempExp);
1490 unsigned PackPredCount =
1492 auto Opc = Pred.getSUnit()->getInstr()->getOpcode();
1493 return isBitPack(Opc);
1497 auto Opc = Pred.getSUnit()->getInstr()->getOpcode();
1498 return isBitPack(Opc);
1501 if (PackPred == (*TempMFMA)->Preds.end())
1509 return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr());
1513 MFMAEnablement *= PackSuccCount;
1518 return DAG->IsReachable(PackPred->getSUnit(), ExpBase);
1521 ExpRequirement *= PackPredCount;
1531 MFMAChainSeeds.clear();
1538bool MFMAExpInterleaveOpt::applyIGLPStrategy(
1543 bool IsSmallKernelType =
1544 MFMAEnablement == 2 && ExpRequirement == 4 && TransPipeCount == 32;
1545 bool IsLargeKernelType =
1546 MFMAEnablement == 4 && ExpRequirement == 4 && TransPipeCount == 64;
1548 if (!(IsSmallKernelType || IsLargeKernelType))
1554 unsigned PipelineSyncID = 0;
1555 SchedGroup *SG =
nullptr;
1557 unsigned MFMAChain = 0;
1558 unsigned PositionInChain = 0;
1559 unsigned CurrMFMAForTransPosition = 0;
1561 auto incrementTransPosition = [&MFMAChain, &PositionInChain,
1562 &CurrMFMAForTransPosition]() {
1563 CurrMFMAForTransPosition += MFMAEnablement;
1564 PositionInChain = (CurrMFMAForTransPosition / MFMAChains);
1565 MFMAChain = CurrMFMAForTransPosition % MFMAChains;
1568 auto getNextTransPositionInChain = [&CurrMFMAForTransPosition]() {
1569 auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement;
1570 return (TempMFMAForTrans / MFMAChains);
1573 auto getNextTransMFMAChain = [&CurrMFMAForTransPosition]() {
1574 auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement;
1575 return TempMFMAForTrans % MFMAChains;
1578 unsigned CurrMFMAPosition = 0;
1579 unsigned MFMAChainForMFMA = 0;
1580 unsigned PositionInChainForMFMA = 0;
1582 auto incrementMFMAPosition = [&CurrMFMAPosition, &MFMAChainForMFMA,
1583 &PositionInChainForMFMA]() {
1585 MFMAChainForMFMA = CurrMFMAPosition % MFMAChains;
1586 PositionInChainForMFMA = CurrMFMAPosition / MFMAChains;
1590 assert(IsPostRA || MFMAChainSeeds.size() == MFMAChains);
1592 bool UsesFMA = IsSmallKernelType || !IsPostRA;
1593 bool UsesDSRead = IsLargeKernelType && !IsPostRA && FirstPipeDSR;
1594 bool UsesCvt = HasCvt && (IsSmallKernelType || !IsPostRA);
1595 bool UsesVALU = IsSmallKernelType;
1600 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1601 SchedGroupMask::VALU, ExpRequirement, PipelineSyncID, DAG,
TII);
1602 if (!IsPostRA && MFMAChains) {
1603 SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1604 PositionInChain, MFMAChainSeeds[MFMAChain],
TII, SG->getSGID(),
1608 std::make_shared<EnablesNthMFMA>(1,
TII, SG->getSGID(),
true));
1609 SG->addRule(std::make_shared<IsFMA>(
TII, SG->getSGID()));
1610 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
1613 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1614 SchedGroupMask::VALU, ExpRequirement, PipelineSyncID, DAG,
TII);
1615 if (!IsPostRA && MFMAChains) {
1616 SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1617 getNextTransPositionInChain(),
1618 MFMAChainSeeds[getNextTransMFMAChain()],
TII, SG->getSGID(),
true));
1620 SG->addRule(std::make_shared<EnablesNthMFMA>(MFMAEnablement + 1,
TII,
1621 SG->getSGID(),
true));
1622 SG->addRule(std::make_shared<IsFMA>(
TII, SG->getSGID()));
1623 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
1627 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1628 SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG,
TII);
1629 SG->addRule(std::make_shared<OccursAtOrAfterNode>(*FirstPipeDSR,
TII,
1631 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
1635 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1636 SchedGroupMask::TRANS, ExpRequirement, PipelineSyncID, DAG,
TII);
1637 if (!IsPostRA && MFMAChains)
1638 SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1639 PositionInChain, MFMAChainSeeds[MFMAChain],
TII, SG->getSGID(),
true));
1641 SG->addRule(std::make_shared<EnablesNthMFMA>(1,
TII, SG->getSGID(),
true));
1642 SG->addRule(std::make_shared<IsPipeExp>(
TII, SG->getSGID(),
true));
1643 SG->addRule(std::make_shared<LessThanNSuccs>(8,
TII, SG->getSGID(),
1644 HasChainBetweenCvt));
1645 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
1647 incrementTransPosition();
1650 for (
unsigned I = 0;
I < ExpRequirement;
I++) {
1653 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1654 SchedGroupMask::VALU, 1, PipelineSyncID, DAG,
TII);
1655 SG->addRule(std::make_shared<IsCvt>(
TII, SG->getSGID()));
1656 if (HasChainBetweenCvt)
1657 SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>(
1658 1 + (2 + UsesFMA) *
I,
TII, SG->getSGID()));
1660 SG->addRule(std::make_shared<IsSuccOfPrevNthGroup>(
1661 1 + (2 + UsesFMA) *
I,
TII, SG->getSGID()));
1662 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
1667 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1668 SchedGroupMask::VALU, 1, PipelineSyncID, DAG,
TII);
1669 if (!IsPostRA && MFMAChains) {
1670 SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1671 getNextTransPositionInChain(),
1672 MFMAChainSeeds[getNextTransMFMAChain()],
TII, SG->getSGID(),
true));
1674 SG->addRule(std::make_shared<EnablesNthMFMA>(2 * MFMAEnablement + 1,
1675 TII, SG->getSGID(),
true));
1676 SG->addRule(std::make_shared<IsFMA>(
TII, SG->getSGID()));
1677 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
1681 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1682 SchedGroupMask::TRANS, 1, PipelineSyncID, DAG,
TII);
1683 if (!IsPostRA && MFMAChains)
1684 SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1685 PositionInChain, MFMAChainSeeds[MFMAChain],
TII, SG->getSGID(),
1688 SG->addRule(std::make_shared<EnablesNthMFMA>(MFMAEnablement + 1,
TII,
1689 SG->getSGID(),
true));
1690 SG->addRule(std::make_shared<IsPipeExp>(
TII, SG->getSGID(),
true));
1691 SG->addRule(std::make_shared<LessThanNSuccs>(8,
TII, SG->getSGID(),
1692 HasChainBetweenCvt));
1693 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
1698 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1699 SchedGroupMask::TRANS, 1, PipelineSyncID, DAG,
TII);
1700 SG->addRule(std::make_shared<IsPipeExp>(
TII, SG->getSGID(),
true));
1701 SG->addRule(std::make_shared<GreaterThanOrEqualToNSuccs>(
1702 8,
TII, SG->getSGID(), HasChainBetweenCvt));
1703 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
1708 unsigned MFMARatio =
1709 MFMAEnablement > ExpRequirement ? MFMAEnablement / ExpRequirement : 1;
1712 MFMAEnablement > ExpRequirement ? 1 : ExpRequirement / MFMAEnablement;
1714 unsigned RemainingExp = TransPipeCount > (2 * ExpRequirement)
1715 ? TransPipeCount - (2 * ExpRequirement)
1717 unsigned ExpLoopCount = RemainingExp / ExpRatio;
1719 unsigned MFMAInLoop = MFMAPipeCount > (MFMAEnablement * 2)
1720 ? MFMAPipeCount - (MFMAEnablement * 2)
1722 unsigned MFMALoopCount = MFMAInLoop / MFMARatio;
1724 AddPipeCount < MFMAPipeCount ? 1 : AddPipeCount / MFMAPipeCount;
1725 unsigned LoopSize = std::min(ExpLoopCount, MFMALoopCount);
1727 for (
unsigned I = 0;
I < LoopSize;
I++) {
1728 if (!(
I * ExpRatio % ExpRequirement))
1729 incrementTransPosition();
1732 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1733 SchedGroupMask::MFMA, MFMARatio, PipelineSyncID, DAG,
TII);
1734 if (!IsPostRA && MFMAChains)
1735 SG->addRule(std::make_shared<IsExactMFMA>(
1736 PositionInChainForMFMA, MFMAChainSeeds[MFMAChainForMFMA],
TII,
1737 SG->getSGID(),
true));
1739 SG->addRule(std::make_shared<OccursAfterExp>(
TII, SG->getSGID(),
true));
1740 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
1741 incrementMFMAPosition();
1744 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1745 SchedGroupMask::VALU, VALUOps, PipelineSyncID, DAG,
TII);
1746 SG->addRule(std::make_shared<IsPipeAdd>(
TII, SG->getSGID()));
1747 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
1750 if (UsesDSRead && !(
I % 4)) {
1751 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1752 SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG,
TII);
1753 SG->addRule(std::make_shared<OccursAtOrAfterNode>(*FirstPipeDSR,
TII,
1755 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
1759 for (
unsigned J = 0; J < ExpRatio; J++) {
1760 auto MFMAOffset = (1 + UsesVALU) * MFMARatio * (
I + 1);
1761 auto MaxMFMAOffset =
1762 (1 + UsesVALU) * ExpRequirement * MFMARatio / ExpRatio;
1766 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1767 SchedGroupMask::VALU, 1, PipelineSyncID, DAG,
TII);
1768 SG->addRule(std::make_shared<IsCvt>(
TII, SG->getSGID()));
1769 auto BaseDiff = (2 + UsesFMA) * (ExpRequirement - 1) + 1;
1770 auto DSROffset =
I / 4 + 1;
1771 auto MaxDSROffset = MaxMFMAOffset / 4;
1773 auto ExpOffset =
I * ExpRatio + J >= ExpRequirement ? 0 : 1;
1774 auto CurrentOffset = UsesDSRead * std::min(MaxDSROffset, DSROffset) +
1775 std::min(MaxMFMAOffset, MFMAOffset) + BaseDiff +
1777 if (HasChainBetweenCvt)
1778 SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>(
1779 CurrentOffset,
TII, SG->getSGID()));
1781 SG->addRule(std::make_shared<IsSuccOfPrevNthGroup>(CurrentOffset,
TII,
1783 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
1788 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1789 SchedGroupMask::VALU, 1, PipelineSyncID, DAG,
TII);
1790 if (!IsPostRA && MFMAChains)
1791 SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1792 getNextTransPositionInChain(),
1793 MFMAChainSeeds[getNextTransMFMAChain()],
TII, SG->getSGID(),
1796 SG->addRule(std::make_shared<EnablesNthMFMA>(
1797 (((
I * ExpRatio + J) / ExpRequirement) + 3) * MFMAEnablement + 1,
1798 TII, SG->getSGID(),
true));
1799 SG->addRule(std::make_shared<IsFMA>(
TII, SG->getSGID()));
1800 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
1804 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1805 SchedGroupMask::TRANS, 1, PipelineSyncID, DAG,
TII);
1806 if (!IsPostRA && MFMAChains)
1807 SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1808 PositionInChain, MFMAChainSeeds[MFMAChain],
TII, SG->getSGID(),
1811 SG->addRule(std::make_shared<EnablesNthMFMA>(
1812 (((
I * ExpRatio + J) / ExpRequirement) + 2) * MFMAEnablement + 1,
1813 TII, SG->getSGID(),
true));
1814 SG->addRule(std::make_shared<IsPipeExp>(
TII, SG->getSGID(),
true));
1815 SG->addRule(std::make_shared<LessThanNSuccs>(8,
TII, SG->getSGID(),
1816 HasChainBetweenCvt));
1817 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
1822 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1823 SchedGroupMask::MFMA, MFMAEnablement * 2, PipelineSyncID, DAG,
TII);
1824 SG->addRule(std::make_shared<OccursAfterExp>(
TII, SG->getSGID(),
true));
1825 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
1829class MFMAExpSimpleInterleaveOpt final :
public IGLPStrategy {
1831 bool applyIGLPStrategy(
1842 : IGLPStrategy(DAG,
TII) {
1847bool MFMAExpSimpleInterleaveOpt::applyIGLPStrategy(
1852 unsigned MFMACount = 0;
1854 if (
TII->isMFMAorWMMA(
I))
1857 const unsigned PipelineSyncID = 0;
1858 for (
unsigned I = 0;
I < MFMACount * 3; ++
I) {
1859 SchedGroup *SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1860 SchedGroupMask::TRANS, 1, PipelineSyncID, DAG,
TII);
1861 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
1863 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1864 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG,
TII);
1865 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
1871class MFMASmallGemmSingleWaveOpt final :
public IGLPStrategy {
1874 class EnablesInitialMFMA final :
public InstructionRule {
1878 if (!SyncPipe.
size())
1881 if (!Cache->size()) {
1882 for (
auto &Elt : SyncPipe[0].DAG->
SUnits) {
1883 if (
TII->isMFMAorWMMA(*Elt.getInstr())) {
1887 Cache->push_back(&Elt);
1892 auto *DAG = SyncPipe[0].DAG;
1893 for (
auto &Elt : *Cache) {
1901 bool NeedsCache =
false)
1902 : InstructionRule(
TII, SGID, NeedsCache) {}
1906 class IsPermForDSW final :
public InstructionRule {
1911 if (
MI->getOpcode() != AMDGPU::V_PERM_B32_e64)
1914 bool FitsInGroup =
false;
1916 if (!Collection.
size()) {
1917 for (
auto &Succ : SU->
Succs) {
1918 SUnit *SuccUnit = Succ.getSUnit();
1921 Cache->push_back(SuccUnit);
1932 return ThisSucc.getSUnit() == Elt;
1937 IsPermForDSW(
const SIInstrInfo *
TII,
unsigned SGID,
bool NeedsCache =
false)
1938 : InstructionRule(
TII, SGID, NeedsCache) {}
1942 class IsSuccOfPrevGroup final :
public InstructionRule {
1946 SchedGroup *OtherGroup =
nullptr;
1947 for (
auto &PipeSG : SyncPipe) {
1948 if ((
unsigned)PipeSG.getSGID() == SGID - 1) {
1949 OtherGroup = &PipeSG;
1955 if (!OtherGroup->Collection.size())
1959 return any_of(OtherGroup->Collection, [&SU](
SUnit *Elt) {
1960 return any_of(Elt->Succs,
1961 [&SU](SDep &Succ) { return Succ.getSUnit() == SU; });
1965 bool NeedsCache =
false)
1966 : InstructionRule(
TII, SGID, NeedsCache) {}
1970 class VMEMSize final :
public InstructionRule {
1975 if (
MI->getOpcode() == TargetOpcode::BUNDLE)
1977 if (!Collection.
size())
1982 auto TRI =
TII->getRegisterInfo();
1983 auto &MRI =
MI->getMF()->getRegInfo();
1984 for (
auto &Elt : Collection) {
1985 auto Op = Elt->getInstr()->getOperand(0);
1987 TRI.getRegSizeInBits(*
TRI.getRegClassForOperandReg(MRI,
Op));
1991 if (NumBits < 128) {
1993 if (NumBits +
TRI.getRegSizeInBits(*
TRI.getRegClassForOperandReg(
1994 MRI,
MI->getOperand(0))) <=
2002 VMEMSize(
const SIInstrInfo *
TII,
unsigned SGID,
bool NeedsCache =
false)
2003 : InstructionRule(
TII, SGID, NeedsCache) {}
2008 class SharesPredWithPrevNthGroup final :
public InstructionRule {
2010 unsigned Distance = 1;
2015 SchedGroup *OtherGroup =
nullptr;
2016 if (!SyncPipe.
size())
2019 if (!Cache->size()) {
2021 for (
auto &PipeSG : SyncPipe) {
2022 if ((
unsigned)PipeSG.getSGID() == SGID - Distance) {
2023 OtherGroup = &PipeSG;
2029 if (!OtherGroup->Collection.size())
2032 for (
auto &OtherEle : OtherGroup->Collection) {
2033 for (
auto &Pred : OtherEle->Preds) {
2034 if (Pred.getSUnit()->getInstr()->getOpcode() ==
2035 AMDGPU::V_PERM_B32_e64)
2036 Cache->push_back(Pred.getSUnit());
2045 auto *DAG = SyncPipe[0].DAG;
2052 SharesPredWithPrevNthGroup(
unsigned Distance,
const SIInstrInfo *
TII,
2053 unsigned SGID,
bool NeedsCache =
false)
2054 : InstructionRule(
TII, SGID, NeedsCache), Distance(Distance) {}
2058 bool applyIGLPStrategy(
2069 : IGLPStrategy(DAG,
TII) {
2074static unsigned DSWCount = 0;
2075static unsigned DSWWithPermCount = 0;
2076static unsigned DSWWithSharedVMEMCount = 0;
2078bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
2079 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
2082 unsigned MFMACount = 0;
2083 unsigned DSRCount = 0;
2085 bool IsInitial =
Phase == AMDGPU::SchedulingPhase::Initial;
2087 assert((!IsInitial || (DSWCount == 0 && DSWWithPermCount == 0 &&
2088 DSWWithSharedVMEMCount == 0)) &&
2089 "DSWCounters should be zero in pre-RA scheduling!");
2091 for (
auto &SU : DAG->SUnits) {
2092 auto *
I = SU.getInstr();
2093 if (
TII->isMFMAorWMMA(*
I))
2095 else if (
TII->isDS(*
I)) {
2098 else if (
I->mayStore() && IsInitial) {
2100 for (
auto Pred : SU.Preds) {
2101 if (Pred.getSUnit()->getInstr()->getOpcode() ==
2102 AMDGPU::V_PERM_B32_e64) {
2112 DSWWithPermCount = DSWithPerms.
size();
2113 auto *
I = DSWithPerms.
begin();
2114 auto *
E = DSWithPerms.
end();
2122 DenseMap<MachineInstr *, SUnit *> VMEMLookup;
2124 for (;
I !=
E;
I++) {
2125 SUnit *Cand =
nullptr;
2126 bool MissedAny =
false;
2127 for (
auto &Pred : (*I)->Preds) {
2128 if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64)
2134 for (
auto &Succ : Pred.getSUnit()->Succs) {
2135 auto *
MI = Succ.getSUnit()->getInstr();
2136 if (!
TII->isVMEM(*
MI) || !
MI->mayLoad())
2139 if (MissedAny || !VMEMLookup.
size()) {
2141 VMEMLookup[
MI] = *
I;
2158 if (!MissedAny && Cand) {
2159 DSWWithSharedVMEMCount += 2;
2166 assert(DSWWithSharedVMEMCount <= DSWWithPermCount);
2168 unsigned PipelineSyncID = 0;
2170 if (DSWWithPermCount) {
2171 for (
unsigned I = 0;
I < MFMACount;
I++) {
2172 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2173 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG,
TII);
2174 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2176 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2177 SchedGroupMask::VALU, 2, PipelineSyncID, DAG,
TII);
2178 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2188 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2189 SchedGroupMask::DS_READ, 4, PipelineSyncID, DAG,
TII);
2190 SG->addRule(std::make_shared<EnablesInitialMFMA>(
TII, SG->getSGID(),
true));
2191 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2193 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2194 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG,
TII);
2195 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2198 for (
unsigned I = 4;
I < DSRCount; ++
I) {
2199 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2200 SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG,
TII);
2201 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2203 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2204 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG,
TII);
2205 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2211 for (
unsigned I = DSWWithSharedVMEMCount;
I < DSWWithPermCount; ++
I) {
2212 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2213 SchedGroupMask::VALU, 4, PipelineSyncID, DAG,
TII);
2214 SG->addRule(std::make_shared<IsPermForDSW>(
TII, SG->getSGID(),
true));
2215 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2217 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2218 SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG,
TII);
2219 SG->addRule(std::make_shared<IsSuccOfPrevGroup>(
TII, SG->getSGID()));
2220 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2222 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2223 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG,
TII);
2224 SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
2225 1,
TII, SG->getSGID(),
true));
2226 SG->addRule(std::make_shared<VMEMSize>(
TII, SG->getSGID()));
2227 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2229 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2230 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG,
TII);
2231 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2233 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2234 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG,
TII);
2235 SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
2236 3,
TII, SG->getSGID(),
true));
2237 SG->addRule(std::make_shared<VMEMSize>(
TII, SG->getSGID()));
2238 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2240 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2241 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG,
TII);
2242 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2248 for (
unsigned I = DSWWithPermCount;
I < DSWCount;
I++) {
2249 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2250 SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG,
TII);
2251 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2253 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2254 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG,
TII);
2255 SG->addRule(std::make_shared<VMEMSize>(
TII, SG->getSGID()));
2256 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2258 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2259 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG,
TII);
2260 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2268 for (
unsigned I = 0;
I < DSWWithSharedVMEMCount; ++
I) {
2269 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2270 SchedGroupMask::VALU, 4, PipelineSyncID, DAG,
TII);
2271 SG->addRule(std::make_shared<IsPermForDSW>(
TII, SG->getSGID(),
true));
2272 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2274 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2275 SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG,
TII);
2276 SG->addRule(std::make_shared<IsSuccOfPrevGroup>(
TII, SG->getSGID()));
2277 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2279 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2280 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG,
TII);
2281 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2283 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2284 SchedGroupMask::VALU, 4, PipelineSyncID, DAG,
TII);
2285 SG->addRule(std::make_shared<IsPermForDSW>(
TII, SG->getSGID(),
true));
2286 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2288 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2289 SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG,
TII);
2290 SG->addRule(std::make_shared<IsSuccOfPrevGroup>(
TII, SG->getSGID()));
2291 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2293 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2294 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG,
TII);
2295 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2297 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2298 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG,
TII);
2299 SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
2300 2,
TII, SG->getSGID(),
true));
2301 SG->addRule(std::make_shared<VMEMSize>(
TII, SG->getSGID()));
2302 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2304 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2305 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG,
TII);
2306 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2308 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2309 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG,
TII);
2310 SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
2311 4,
TII, SG->getSGID(),
true));
2312 SG->addRule(std::make_shared<VMEMSize>(
TII, SG->getSGID()));
2313 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2315 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2316 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG,
TII);
2317 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2323static std::unique_ptr<IGLPStrategy>
2324createIGLPStrategy(IGLPStrategyID
ID, ScheduleDAGInstrs *DAG,
2325 const SIInstrInfo *
TII) {
2327 case MFMASmallGemmOptID:
2328 return std::make_unique<MFMASmallGemmOpt>(DAG,
TII);
2329 case MFMASmallGemmSingleWaveOptID:
2330 return std::make_unique<MFMASmallGemmSingleWaveOpt>(DAG,
TII);
2331 case MFMAExpInterleaveID:
2332 return std::make_unique<MFMAExpInterleaveOpt>(DAG,
TII);
2333 case MFMAExpSimpleInterleaveID:
2334 return std::make_unique<MFMAExpSimpleInterleaveOpt>(DAG,
TII);
2340class IGroupLPDAGMutation :
public ScheduleDAGMutation {
2342 const SIInstrInfo *
TII;
2349 DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups;
2352 DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs;
2355 void addSchedBarrierEdges(SUnit &SU);
2366 SchedGroupMask invertSchedBarrierMask(SchedGroupMask Mask)
const;
2369 void initSchedGroupBarrierPipelineStage(
2370 std::vector<SUnit>::reverse_iterator RIter);
2372 bool initIGLPOpt(SUnit &SU);
2375 void apply(ScheduleDAGInstrs *DAGInstrs)
override;
2382 bool IsBottomUp =
true;
2387 IGroupLPDAGMutation() =
default;
2391unsigned SchedGroup::NumSchedGroups = 0;
2393bool SchedGroup::tryAddEdge(SUnit *
A, SUnit *
B) {
2397bool SchedGroup::canAddMI(
const MachineInstr &
MI)
const {
2399 if (
MI.isMetaInstruction())
2402 else if (
MI.isInlineAsm()) {
2404 auto &MRI =
MI.getParent()->getParent()->getRegInfo();
2405 bool SGPR_used =
false, SGPR_big_def =
false, VGPR_used =
false,
2406 VMFMA_used =
false, VReg32_used =
false,
MayLoad =
MI.mayLoad(),
2408 for (
const MachineOperand &Operand :
MI.operands())
2409 if (Operand.isReg()) {
2410 const TargetRegisterClass &RegClass =
2411 *
TRI.getRegClassForOperandReg(MRI, Operand);
2412 if (
TRI.hasVGPRs(&RegClass)) {
2414 if (Operand.isUse() &&
TRI.getRegSizeInBits(RegClass) == 32)
2420 if (
TRI.hasAGPRs(&RegClass) ||
TRI.getRegSizeInBits(RegClass) > 128)
2422 if (
TRI.hasSGPRs(&RegClass))
2424 if (
TRI.getRegSizeInBits(RegClass) > 64 && Operand.isDef())
2425 SGPR_big_def =
true;
2428 typedef std::underlying_type_t<SchedGroupMask> SGMask_t;
2429 SGMask_t InlineAsmMask = 0;
2430 if (VGPR_used && !VMFMA_used && !MayLoad && !MayStore)
2431 InlineAsmMask |= (SGMask_t)SchedGroupMask::VALU;
2432 if (SGPR_used && !VGPR_used && !MayLoad && !MayStore)
2433 InlineAsmMask |= (SGMask_t)SchedGroupMask::SALU;
2435 InlineAsmMask |= (SGMask_t)SchedGroupMask::MFMA;
2436 if (VGPR_used && MayLoad)
2437 InlineAsmMask |= (SGMask_t)(VReg32_used ? SchedGroupMask::DS_READ
2438 : SchedGroupMask::VMEM_READ);
2439 if (VGPR_used && MayStore)
2440 InlineAsmMask |= (SGMask_t)(VReg32_used ? SchedGroupMask::DS_WRITE
2441 : SchedGroupMask::VMEM_WRITE);
2443 InlineAsmMask |= (SGMask_t)SchedGroupMask::DS_READ;
2444 if (InlineAsmMask & (SGMask_t)SchedGroupMask::VALU ||
2445 InlineAsmMask & (SGMask_t)SchedGroupMask::SALU)
2446 InlineAsmMask |= (SGMask_t)SchedGroupMask::ALU;
2447 if (InlineAsmMask & (SGMask_t)SchedGroupMask::DS_READ ||
2448 InlineAsmMask & (SGMask_t)SchedGroupMask::DS_WRITE)
2449 InlineAsmMask |= (SGMask_t)SchedGroupMask::DS;
2450 if (InlineAsmMask & (SGMask_t)SchedGroupMask::VMEM_READ ||
2451 InlineAsmMask & (SGMask_t)SchedGroupMask::VMEM_WRITE)
2452 InlineAsmMask |= (SGMask_t)SchedGroupMask::VMEM;
2454 Result = ((SGMask_t)SGMask & InlineAsmMask) != 0;
2457 else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&
2462 else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) &&
2470 else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) &&
2474 else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) &&
2475 TII->isMFMAorWMMA(
MI))
2478 else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) &&
2482 else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) &&
2483 MI.mayLoad() &&
TII->isVMEM(
MI))
2486 else if (((SGMask & SchedGroupMask::VMEM_WRITE) != SchedGroupMask::NONE) &&
2487 MI.mayStore() &&
TII->isVMEM(
MI))
2490 else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) &&
2494 else if (((SGMask & SchedGroupMask::DS_READ) != SchedGroupMask::NONE) &&
2495 MI.mayLoad() &&
TII->isDS(
MI))
2498 else if (((SGMask & SchedGroupMask::DS_WRITE) != SchedGroupMask::NONE) &&
2499 MI.mayStore() &&
TII->isDS(
MI))
2502 else if (((SGMask & SchedGroupMask::TRANS) != SchedGroupMask::NONE) &&
2507 dbgs() <<
"For SchedGroup with mask " <<
format_hex((
int)SGMask, 10,
true)
2508 << (Result ?
" could classify " :
" unable to classify ") <<
MI);
2513int SchedGroup::link(SUnit &SU,
bool MakePred,
2514 std::list<std::pair<SUnit *, SUnit *>> &AddedEdges) {
2515 int MissedEdges = 0;
2516 for (
auto *
A : Collection) {
2518 if (
A ==
B ||
A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
2528 bool Added = tryAddEdge(
A,
B);
2530 AddedEdges.emplace_back(
A,
B);
2538void SchedGroup::link(SUnit &SU,
bool MakePred) {
2539 for (
auto *
A : Collection) {
2541 if (
A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
2550void SchedGroup::link(SUnit &SU,
2551 function_ref<
bool(
const SUnit *
A,
const SUnit *
B)>
P) {
2552 for (
auto *
A : Collection) {
2561void SchedGroup::link(SchedGroup &OtherGroup) {
2562 for (
auto *
B : OtherGroup.Collection)
2566bool SchedGroup::canAddSU(SUnit &SU)
const {
2568 if (
MI.getOpcode() != TargetOpcode::BUNDLE)
2569 return canAddMI(
MI);
2572 const MachineBasicBlock *
MBB =
MI.getParent();
2574 while (
E !=
MBB->
end() &&
E->isBundledWithPred())
2578 return std::all_of(
B,
E, [
this](MachineInstr &
MI) {
return canAddMI(
MI); });
2582void SchedGroup::findCandidateSUnits(
T Begin,
T End,
2583 SUnitsToCandidateSGsMap &SyncedInstrs) {
2586 SyncedInstrs[&SU].push_back(SGID);
2590void SchedGroup::findCandidateSUnits(SUnitsToCandidateSGsMap &SyncedInstrs) {
2591 findCandidateSUnits(DAG->
SUnits.rbegin(), DAG->
SUnits.rend(), SyncedInstrs);
2594void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
2595 const TargetSchedModel *TSchedModel = DAGInstrs->
getSchedModel();
2596 if (!TSchedModel || DAGInstrs->
SUnits.empty())
2601 TII =
ST.getInstrInfo();
2602 DAG =
static_cast<ScheduleDAGMI *
>(DAGInstrs);
2603 SyncedSchedGroups.clear();
2604 SyncedInstrs.clear();
2605 bool FoundSB =
false;
2606 bool FoundIGLP =
false;
2607 bool ShouldApplyIGLP =
false;
2608 for (
auto R = DAG->
SUnits.rbegin(),
E = DAG->
SUnits.rend(); R !=
E; ++R) {
2609 unsigned Opc =
R->getInstr()->getOpcode();
2611 if (
Opc == AMDGPU::SCHED_BARRIER) {
2612 addSchedBarrierEdges(*R);
2614 }
else if (
Opc == AMDGPU::SCHED_GROUP_BARRIER) {
2615 initSchedGroupBarrierPipelineStage(R);
2617 }
else if (
Opc == AMDGPU::IGLP_OPT) {
2618 if (!FoundSB && !FoundIGLP) {
2620 ShouldApplyIGLP = initIGLPOpt(*R);
2625 if (FoundSB || (FoundIGLP && ShouldApplyIGLP)) {
2626 PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG, IsBottomUp);
2634void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
2636 assert(
MI.getOpcode() == AMDGPU::SCHED_BARRIER);
2637 LLVM_DEBUG(
dbgs() <<
"Building SchedGroup for SchedBarrier with Mask: "
2638 <<
MI.getOperand(0).getImm() <<
"\n");
2640 invertSchedBarrierMask((SchedGroupMask)
MI.getOperand(0).getImm());
2641 SchedGroup SG(InvertedMask, std::nullopt, DAG,
TII);
2643 for (SUnit &SU : DAG->
SUnits)
2644 if (SG.canAddSU(SU))
2650 (function_ref<
bool(
const SUnit *
A,
const SUnit *
B)>)[](
2651 const SUnit *
A,
const SUnit *
B) {
return A->NodeNum >
B->NodeNum; });
2655IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask)
const {
2658 SchedGroupMask InvertedMask = ~Mask;
2661 if ((InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE)
2662 InvertedMask &= ~SchedGroupMask::VALU & ~SchedGroupMask::SALU &
2663 ~SchedGroupMask
::MFMA & ~SchedGroupMask::TRANS;
2665 else if ((InvertedMask & SchedGroupMask::VALU) == SchedGroupMask::NONE ||
2666 (InvertedMask & SchedGroupMask::SALU) == SchedGroupMask::NONE ||
2667 (InvertedMask & SchedGroupMask::MFMA) == SchedGroupMask::NONE ||
2668 (InvertedMask & SchedGroupMask::TRANS) == SchedGroupMask::NONE)
2669 InvertedMask &= ~SchedGroupMask::ALU;
2672 if ((InvertedMask & SchedGroupMask::VMEM) == SchedGroupMask::NONE)
2673 InvertedMask &= ~SchedGroupMask::VMEM_READ & ~SchedGroupMask::VMEM_WRITE;
2675 else if ((InvertedMask & SchedGroupMask::VMEM_READ) == SchedGroupMask::NONE ||
2676 (InvertedMask & SchedGroupMask::VMEM_WRITE) == SchedGroupMask::NONE)
2677 InvertedMask &= ~SchedGroupMask::VMEM;
2680 if ((InvertedMask & SchedGroupMask::DS) == SchedGroupMask::NONE)
2681 InvertedMask &= ~SchedGroupMask::DS_READ & ~SchedGroupMask::DS_WRITE;
2683 else if ((InvertedMask & SchedGroupMask::DS_READ) == SchedGroupMask::NONE ||
2684 (InvertedMask & SchedGroupMask::DS_WRITE) == SchedGroupMask::NONE)
2685 InvertedMask &= ~SchedGroupMask::DS;
2687 LLVM_DEBUG(
dbgs() <<
"After Inverting, SchedGroup Mask: " << (
int)InvertedMask
2690 return InvertedMask;
2693void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage(
2694 std::vector<SUnit>::reverse_iterator RIter) {
2695 MachineInstr &SGB = *RIter->getInstr();
2702 auto &SG = SyncedSchedGroups[SyncID].emplace_back((SchedGroupMask)SGMask,
2705 SG.findCandidateSUnits(RIter, SG.DAG->
SUnits.rend(),
2706 SyncedInstrs[SG.getSyncID()]);
2709bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
2710 IGLPStrategyID StrategyID =
2712 auto S = createIGLPStrategy(StrategyID, DAG,
TII);
2713 if (!S->shouldApplyStrategy(DAG,
Phase))
2716 IsBottomUp = S->IsBottomUp;
2717 return S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups,
Phase);
2727std::unique_ptr<ScheduleDAGMutation>
2729 return std::make_unique<IGroupLPDAGMutation>(
Phase);
aarch64 falkor hwpf fix Falkor HW Prefetch Fix Late Phase
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
AMDGPU Rewrite AGPR Copy MFMA
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
This file defines the DenseMap class.
const HexagonInstrInfo * TII
static std::pair< Value *, APInt > getMask(Value *WideMask, unsigned Factor, ElementCount LeafValueEC)
Register const TargetRegisterInfo * TRI
Interface definition for SIInstrInfo.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
const HexagonRegisterInfo & getRegisterInfo() const
Instructions::iterator instr_iterator
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const MachineOperand & getOperand(unsigned i) const
@ Data
Regular data dependence (aka true-dependence).
@ Artificial
Arbitrary strong DAG edge (no real dependence).
Scheduling unit. This is a node in the scheduling DAG.
unsigned NodeNum
Entry # of node in the node vector.
LLVM_ABI void removePred(const SDep &D)
Removes the specified edge as a pred of the current node if it exists.
SmallVector< SDep, 4 > Succs
All sunit successors.
SmallVector< SDep, 4 > Preds
All sunit predecessors.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
A ScheduleDAG for scheduling lists of MachineInstr.
const TargetSchedModel * getSchedModel() const
Gets the machine model for instruction scheduling.
bool addEdge(SUnit *SuccSU, const SDep &PredDep)
Add a DAG edge to the given SU with the given predecessor dependence data.
bool IsReachable(SUnit *SU, SUnit *TargetSU)
IsReachable - Checks if SU is reachable from TargetSU.
void dump() const override
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
std::vector< SUnit > SUnits
The scheduling units.
MachineFunction & MF
Machine function.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
reverse_iterator rbegin()
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An efficient, type-erasing, non-owning reference to a callable.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
void apply(Opt *O, const Mod &M, const Mods &... Ms)
initializer< Ty > init(const Ty &Val)
LLVM_ABI void link(std::unique_ptr< LinkGraph > G, std::unique_ptr< JITLinkContext > Ctx)
Link the given graph.
This is an optimization pass for GlobalISel generic memory operations.
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
@ LLVM_MARK_AS_BITMASK_ENUM
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
std::unique_ptr< ScheduleDAGMutation > createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase)
Phase specifes whether or not this is a reentry into the IGroupLPDAGMutation.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
FormattedNumber format_hex(uint64_t N, unsigned Width, bool Upper=false)
format_hex - Output N as a fixed width hexadecimal.
DWARFExpression::Operation Op
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Function object to check whether the second component of a container supported by std::get (like std:...