// LLVM 23.0.0git — AMDGPUIGroupLP.cpp (documentation-page header residue;
// the real file header comment follows below).
1//===--- AMDGPUIGroupLP.cpp - AMDGPU IGroupLP ------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// \file This file defines a set of schedule DAG mutations that can be used to
10// override default scheduler behavior to enforce specific scheduling patterns.
11// They should be used in cases where runtime performance considerations such as
12// inter-wavefront interactions, mean that compile-time heuristics cannot
13// predict the optimal instruction ordering, or in kernels where optimum
14// instruction scheduling is important enough to warrant manual intervention.
15//
16//===----------------------------------------------------------------------===//
17
18#include "AMDGPUIGroupLP.h"
20#include "SIInstrInfo.h"
23#include "llvm/ADT/DenseMap.h"
26
27#include <type_traits>
28
29using namespace llvm;
30
31#define DEBUG_TYPE "igrouplp"
32
33namespace {
34
// Forces use of the exponential-time exact pipeline solver instead of the
// default greedy heuristic (see PipelineSolver below). Off by default.
static cl::opt<bool> EnableExactSolver(
    "amdgpu-igrouplp-exact-solver", cl::Hidden,
    cl::desc("Whether to use the exponential time solver to fit "
             "the instructions to the pipeline as closely as "
             "possible."),
    cl::init(false));
41
42static cl::opt<unsigned> CutoffForExact(
43 "amdgpu-igrouplp-exact-solver-cutoff", cl::init(0), cl::Hidden,
44 cl::desc("The maximum number of scheduling group conflicts "
45 "which we attempt to solve with the exponential time "
46 "exact solver. Problem sizes greater than this will"
47 "be solved by the less accurate greedy algorithm. Selecting "
48 "solver by size is superseded by manually selecting "
49 "the solver (e.g. by amdgpu-igrouplp-exact-solver"));
50
51static cl::opt<uint64_t> MaxBranchesExplored(
52 "amdgpu-igrouplp-exact-solver-max-branches", cl::init(0), cl::Hidden,
53 cl::desc("The amount of branches that we are willing to explore with"
54 "the exact algorithm before giving up."));
55
// Selects between cost-heuristic ordering and plain node order when the
// exact solver walks the search space (see populateReadyList()).
static cl::opt<bool> UseCostHeur(
    "amdgpu-igrouplp-exact-solver-cost-heur", cl::init(true), cl::Hidden,
    cl::desc("Whether to use the cost heuristic to make choices as we "
             "traverse the search space using the exact solver. Defaulted "
             "to on, and if turned off, we will use the node order -- "
             "attempting to put the later nodes in the later sched groups. "
             "Experimentally, results are mixed, so this should be set on a "
             "case-by-case basis."));
64
// Components of the mask that determines which instruction types may be
// classified into a SchedGroup.
enum class SchedGroupMask {
  NONE = 0u,
  ALU = 1u << 0,
  VALU = 1u << 1,
  SALU = 1u << 2,
  MFMA = 1u << 3,
  VMEM = 1u << 4,
  VMEM_READ = 1u << 5,
  VMEM_WRITE = 1u << 6,
  DS = 1u << 7,
  DS_READ = 1u << 8,
  DS_WRITE = 1u << 9,
  TRANS = 1u << 10,
  // Union of every instruction-type bit above.
  ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS |
        DS_READ | DS_WRITE | TRANS,
  // Enables bitwise operators (|, &, ...) on this enum class.
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};
84
85class SchedGroup;
86
87// InstructionRule class is used to enact a filter which determines whether or
88// not an SU maps to a given SchedGroup. It contains complementary data
89// structures (e.g Cache) to help those filters.
class InstructionRule {
protected:
  const SIInstrInfo *TII;
  // ID of the SchedGroup this rule is attached to.
  unsigned SGID;
  // A cache made available to the Filter to store SUnits for subsequent
  // invocations of the Filter
  std::optional<SmallVector<SUnit *, 4>> Cache;

public:
  // Returns true if the SU satisfies this rule. The default implementation
  // accepts everything; concrete rules (e.g. IsPipeExp) override it.
  virtual bool
  apply(const SUnit *, const ArrayRef<SUnit *>,
  // NOTE(review): a parameter (SmallVectorImpl<SchedGroup> &, per the
  // overrides below) and the opening "{" appear to have been dropped from
  // this declaration during extraction -- TODO restore from upstream.
    return true;
  };

  // \p NeedsCache engages the optional SUnit cache above for rules that
  // memoize DAG scans across invocations.
  InstructionRule(const SIInstrInfo *TII, unsigned SGID,
                  bool NeedsCache = false)
      : TII(TII), SGID(SGID) {
    if (NeedsCache) {
      Cache = SmallVector<SUnit *, 4>();
    }
  }

  virtual ~InstructionRule() = default;
};
115
// Maps an SUnit to the IDs of the SchedGroups it could be assigned to.
using SUnitsToCandidateSGsMap = DenseMap<SUnit *, SmallVector<int, 4>>;
117
118// Classify instructions into groups to enable fine tuned control over the
119// scheduler. These groups may be more specific than current SchedModel
120// instruction classes.
class SchedGroup {
private:
  // Mask that defines which instruction types can be classified into this
  // SchedGroup. The instruction types correspond to the mask from SCHED_BARRIER
  // and SCHED_GROUP_BARRIER.
  SchedGroupMask SGMask;

  // Maximum number of SUnits that can be added to this group.
  std::optional<unsigned> MaxSize;

  // SchedGroups will only synchronize with other SchedGroups that have the same
  // SyncID.
  int SyncID = 0;

  // SGID is used to map instructions to candidate SchedGroups
  unsigned SGID;

  // The different rules each instruction in this SchedGroup must conform to
  // NOTE(review): the Rules container declaration (used by addRule() and
  // allowedByRules() below) appears to have been dropped here during
  // extraction -- TODO restore from upstream.

  // Count of the number of created SchedGroups, used to initialize SGID.
  static unsigned NumSchedGroups;

  // Use SGMask to determine whether we can classify MI as a member of this
  // SchedGroup object.
  bool canAddMI(const MachineInstr &MI) const;

public:
  // Collection of SUnits that are classified as members of this group.
  SmallVector<SUnit *, 32> Collection;

  // NOTE(review): a "ScheduleDAGInstrs *DAG;" member appears to have been
  // dropped here during extraction -- both constructors initialize DAG(DAG).
  const SIInstrInfo *TII;

  // Try to add an edge from SU A to SU B.
  bool tryAddEdge(SUnit *A, SUnit *B);

  // Returns true if SU can be added to this SchedGroup.
  bool canAddSU(SUnit &SU) const;

  // Add DAG dependencies from all SUnits in this SchedGroup and this SU. If
  // MakePred is true, SU will be a predecessor of the SUnits in this
  // SchedGroup, otherwise SU will be a successor.
  void link(SUnit &SU, bool MakePred = false);

  // Add DAG dependencies and track which edges are added, and the count of
  // missed edges
  int link(SUnit &SU, bool MakePred,
           std::list<std::pair<SUnit *, SUnit *>> &AddedEdges);

  // Add DAG dependencies from all SUnits in this SchedGroup and this SU.
  // Use the predicate to determine whether SU should be a predecessor (P =
  // true) or a successor (P = false) of this SchedGroup.
  void link(SUnit &SU, function_ref<bool(const SUnit *A, const SUnit *B)> P);

  // Add DAG dependencies such that SUnits in this group shall be ordered
  // before SUnits in OtherGroup.
  void link(SchedGroup &OtherGroup);

  // Returns true if no more instructions may be added to this group.
  bool isFull() const { return MaxSize && Collection.size() >= *MaxSize; }

  // Append a constraint that SUs must meet in order to fit into this
  // SchedGroup. Since many rules involve the relationship between a SchedGroup
  // and the SUnits in other SchedGroups, rules are checked at Pipeline Solve
  // time (rather than SchedGroup init time.)
  void addRule(std::shared_ptr<InstructionRule> NewRule) {
    Rules.push_back(NewRule);
  }

  // Returns true if the SU matches all rules
  bool allowedByRules(const SUnit *SU,
                      SmallVectorImpl<SchedGroup> &SyncPipe) const {
    for (auto &Rule : Rules) {
      if (!Rule->apply(SU, Collection, SyncPipe))
        return false;
    }
    return true;
  }

  // Add SU to the SchedGroup.
  void add(SUnit &SU) {
    LLVM_DEBUG(dbgs() << "For SchedGroup with mask "
                      << format_hex((int)SGMask, 10, true) << " adding "
                      << *SU.getInstr());
    Collection.push_back(&SU);
  }

  // Remove last element in the SchedGroup
  void pop() { Collection.pop_back(); }

  // Range-based variant of findCandidateSUnits over [Begin, End).
  template <class T>
  void findCandidateSUnits(T Begin, T End,
                           SUnitsToCandidateSGsMap &SyncedInstrs);

  /// Find each SUnit in the DAG that could potentially be added to
  /// this SchedGroup and add the SGID to the candidate SchedGroups
  /// for SU in \p SyncedInstrs.
  void findCandidateSUnits(SUnitsToCandidateSGsMap &SyncedInstrs);

  int getSyncID() { return SyncID; }

  int getSGID() { return SGID; }

  SchedGroupMask getMask() { return SGMask; }

  SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize,
             ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
      : SGMask(SGMask), MaxSize(MaxSize), DAG(DAG), TII(TII) {
    // SGIDs are globally unique across all groups ever created.
    SGID = NumSchedGroups++;
  }

  SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize, int SyncID,
             ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
      : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), DAG(DAG), TII(TII) {
    SGID = NumSchedGroups++;
  }
};
239
// An SUnit together with the IDs of the SchedGroups it could be assigned to.
using SUToCandSGsPair = std::pair<SUnit *, SmallVector<int, 4>>;
// All conflicted SUnits (with their candidate groups) of one synced pipeline.
using SUsToCandSGsVec = SmallVector<SUToCandSGsPair, 4>;
242
243// The PipelineSolver is used to assign SUnits to SchedGroups in a pipeline
244// in non-trivial cases. For example, if the requested pipeline is
245// {VMEM_READ, VALU, MFMA, VMEM_READ} and we encounter a VMEM_READ instruction
246// in the DAG, then we will have an instruction that can not be trivially
247// assigned to a SchedGroup. The PipelineSolver class implements two algorithms
248// to find a good solution to the pipeline -- a greedy algorithm and an exact
249// algorithm. The exact algorithm has an exponential time complexity and should
250// only be used for small sized problems or medium sized problems where an exact
251// solution is highly desired.
class PipelineSolver {
  [[maybe_unused]] ScheduleDAGMI *DAG;

  // Instructions that can be assigned to multiple SchedGroups
  // NOTE(review): the SyncedInstrs / SyncedSchedGroups reference members that
  // the constructor initializes appear to have been dropped here during
  // extraction -- TODO restore from upstream.
  SmallVector<SUsToCandSGsVec, 4> PipelineInstrs;
  // The current working pipeline
  // NOTE(review): the CurrPipeline declaration (assigned in the constructor
  // below) appears to have been dropped here.
  // The pipeline that has the best solution found so far
  // NOTE(review): the BestPipeline declaration appears to have been dropped
  // here as well.

  // Whether or not we actually have any SyncedInstrs to try to solve.
  bool NeedsSolver = false;

  // Compute an estimate of the size of search tree -- the true size is
  // the product of each conflictedInst.Matches.size() across all SyncPipelines
  unsigned computeProblemSize();

  // The cost penalty of not assigning a SU to a SchedGroup
  int MissPenalty = 0;

  // Costs in terms of the number of edges we are unable to add
  int BestCost = -1;
  int CurrCost = 0;

  // Index pointing to the conflicting instruction that is currently being
  // fitted
  int CurrConflInstNo = 0;
  // Index to the pipeline that is currently being fitted
  int CurrSyncGroupIdx = 0;
  // The first non trivial pipeline
  int BeginSyncGroupIdx = 0;

  // How many branches we have explored
  uint64_t BranchesExplored = 0;

  // The direction in which we process the candidate SchedGroups per SU
  bool IsBottomUp = true;

  // Update indices to fit next conflicting instruction
  void advancePosition();
  // Recede indices to attempt to find better fit for previous conflicting
  // instruction
  void retreatPosition();

  // The exponential time algorithm which finds the provably best fit
  bool solveExact();
  // The polynomial time algorithm which attempts to find a good fit
  bool solveGreedy();
  // Find the best SchedGroup for the current SU using the heuristic given all
  // current information. One step in the greedy algorithm. Templated against
  // the SchedGroup iterator (either reverse or forward).
  template <typename T>
  void greedyFind(std::list<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E);
  // Whether or not the current solution is optimal
  bool checkOptimal();
  // Populate the ready list, prioritizing fewest missed edges first
  // Templated against the SchedGroup iterator (either reverse or forward).
  template <typename T>
  void populateReadyList(SmallVectorImpl<std::pair<int, int>> &ReadyList, T I,
                         T E);
  // Add edges corresponding to the SchedGroups as assigned by solver
  void makePipeline();
  // Link the SchedGroups in the best found pipeline.
  // Templated against the SchedGroup iterator (either reverse or forward).
  template <typename T> void linkSchedGroups(T I, T E);
  // Add the edges from the SU to the other SchedGroups in pipeline, and
  // return the number of edges missed.
  int addEdges(SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
               std::list<std::pair<SUnit *, SUnit *>> &AddedEdges);
  /// Link the pipeline as if \p SU was in the SchedGroup with ID \p SGID. It
  /// returns the cost (in terms of missed pipeline edges), and tracks the edges
  /// added in \p AddedEdges
  template <typename T>
  int linkSUnit(SUnit *SU, int SGID,
                std::list<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E);
  /// Remove the edges passed via \p AddedEdges
  void removeEdges(const std::list<std::pair<SUnit *, SUnit *>> &AddedEdges);
  // Convert the passed in maps to arrays for bidirectional iterators
  void convertSyncMapsToArrays();

  void reset();

public:
  // Invoke the solver to map instructions to instruction groups. Heuristic &&
  // command-line-option determines to use exact or greedy algorithm.
  void solve();

  PipelineSolver(DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
  // NOTE(review): the SyncedInstrs reference parameter (used in the
  // initializer list and loop below) appears to have been dropped from this
  // signature during extraction.
                 ScheduleDAGMI *DAG, bool IsBottomUp = true)
      : DAG(DAG), SyncedInstrs(SyncedInstrs),
        SyncedSchedGroups(SyncedSchedGroups), IsBottomUp(IsBottomUp) {

    // The solver only runs if at least one pipeline has conflicted SUnits.
    for (auto &PipelineInstrs : SyncedInstrs) {
      if (PipelineInstrs.second.size() > 0) {
        NeedsSolver = true;
        break;
      }
    }

    if (!NeedsSolver)
      return;

    convertSyncMapsToArrays();

    CurrPipeline = BestPipeline;

    // Skip leading pipelines with no conflicted instructions.
    while (static_cast<size_t>(BeginSyncGroupIdx) < PipelineInstrs.size() &&
           PipelineInstrs[BeginSyncGroupIdx].size() == 0)
      ++BeginSyncGroupIdx;

    if (static_cast<size_t>(BeginSyncGroupIdx) >= PipelineInstrs.size())
      return;
  }
};
369
370void PipelineSolver::reset() {
371
372 for (auto &SyncPipeline : CurrPipeline) {
373 for (auto &SG : SyncPipeline) {
374 SmallVector<SUnit *, 32> TempCollection = SG.Collection;
375 SG.Collection.clear();
376 auto *SchedBarr = llvm::find_if(TempCollection, [](SUnit *SU) {
377 return SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER;
378 });
379 if (SchedBarr != TempCollection.end())
380 SG.Collection.push_back(*SchedBarr);
381 }
382 }
383
384 CurrSyncGroupIdx = BeginSyncGroupIdx;
385 CurrConflInstNo = 0;
386 CurrCost = 0;
387}
388
389void PipelineSolver::convertSyncMapsToArrays() {
390 for (auto &SyncPipe : SyncedSchedGroups) {
391 BestPipeline.insert(BestPipeline.begin(), SyncPipe.second);
392 }
393
394 int PipelineIDx = SyncedInstrs.size() - 1;
395 PipelineInstrs.resize(SyncedInstrs.size());
396 for (auto &SyncInstrMap : SyncedInstrs) {
397 for (auto &SUsToCandSGs : SyncInstrMap.second) {
398 if (PipelineInstrs[PipelineIDx].size() == 0) {
399 PipelineInstrs[PipelineIDx].push_back(
400 std::pair(SUsToCandSGs.first, SUsToCandSGs.second));
401 continue;
402 }
403 auto *SortPosition = PipelineInstrs[PipelineIDx].begin();
404 // Insert them in sorted order -- this allows for good parsing order in
405 // the greedy algorithm
406 while (SortPosition != PipelineInstrs[PipelineIDx].end() &&
407 SUsToCandSGs.first->NodeNum > SortPosition->first->NodeNum)
408 ++SortPosition;
409 PipelineInstrs[PipelineIDx].insert(
410 SortPosition, std::pair(SUsToCandSGs.first, SUsToCandSGs.second));
411 }
412 --PipelineIDx;
413 }
414}
415
416template <typename T> void PipelineSolver::linkSchedGroups(T I, T E) {
417 for (; I != E; ++I) {
418 auto &GroupA = *I;
419 for (auto J = std::next(I); J != E; ++J) {
420 auto &GroupB = *J;
421 GroupA.link(GroupB);
422 }
423 }
424}
425
426void PipelineSolver::makePipeline() {
427 // Preserve the order of barrier for subsequent SchedGroupBarrier mutations
428 for (auto &SyncPipeline : BestPipeline) {
429 LLVM_DEBUG(dbgs() << "Printing SchedGroups\n");
430 for (auto &SG : SyncPipeline) {
431 LLVM_DEBUG(dbgs() << "SchedGroup with SGID " << SG.getSGID()
432 << " has: \n");
433 SUnit *SGBarr = nullptr;
434 for (auto &SU : SG.Collection) {
435 if (SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
436 SGBarr = SU;
437 LLVM_DEBUG(dbgs() << "SU(" << SU->NodeNum << ")\n");
438 }
439 // Command line requested IGroupLP doesn't have SGBarr
440 if (!SGBarr)
441 continue;
442 SG.link(*SGBarr, false);
443 }
444 }
445
446 for (auto &SyncPipeline : BestPipeline) {
447 IsBottomUp ? linkSchedGroups(SyncPipeline.rbegin(), SyncPipeline.rend())
448 : linkSchedGroups(SyncPipeline.begin(), SyncPipeline.end());
449 }
450}
451
template <typename T>
int PipelineSolver::linkSUnit(
    SUnit *SU, int SGID, std::list<std::pair<SUnit *, SUnit *>> &AddedEdges,
    T I, T E) {
  // Before reaching the group with \p SGID, SU is linked as a successor of
  // each group; after passing it, SU is linked as a predecessor.
  bool MakePred = false;
  int AddedCost = 0;
  for (; I < E; ++I) {
    // Skip the candidate group itself and flip the link direction for the
    // remaining groups.
    if (I->getSGID() == SGID) {
      MakePred = true;
      continue;
    }
    // NOTE(review): this copies the SchedGroup before calling link() --
    // presumably harmless since link() operates through the DAG, but a
    // reference would avoid the copy if link() does not mutate the group;
    // verify before changing.
    auto Group = *I;
    AddedCost += Group.link(*SU, MakePred, AddedEdges);
    assert(AddedCost >= 0);
  }
  // Accumulated cost, i.e. number of missed pipeline edges (see declaration).
  return AddedCost;
}
469
470int PipelineSolver::addEdges(
471 SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
472 std::list<std::pair<SUnit *, SUnit *>> &AddedEdges) {
473
474 // For IsBottomUp, the first SchedGroup in SyncPipeline contains the
475 // instructions that are the ultimate successors in the resultant mutation.
476 // Therefore, in such a configuration, the SchedGroups occurring before the
477 // candidate SGID are successors of the candidate SchedGroup, thus the current
478 // SU should be linked as a predecessor to SUs in those SchedGroups. The
479 // opposite is true if !IsBottomUp. IsBottomUp occurs in the case of multiple
480 // SCHED_GROUP_BARRIERS, or if a user specifies IGLP_OPT SchedGroups using
481 // IsBottomUp (in reverse).
482 return IsBottomUp ? linkSUnit(SU, SGID, AddedEdges, SyncPipeline.rbegin(),
483 SyncPipeline.rend())
484 : linkSUnit(SU, SGID, AddedEdges, SyncPipeline.begin(),
485 SyncPipeline.end());
486}
487
488void PipelineSolver::removeEdges(
489 const std::list<std::pair<SUnit *, SUnit *>> &EdgesToRemove) {
490 // Only remove the edges that we have added when testing
491 // the fit.
492 for (auto &PredSuccPair : EdgesToRemove) {
493 SUnit *Pred = PredSuccPair.first;
494 SUnit *Succ = PredSuccPair.second;
495
496 auto *Match = llvm::find_if(
497 Succ->Preds, [&Pred](SDep &P) { return P.getSUnit() == Pred; });
498 if (Match != Succ->Preds.end()) {
499 assert(Match->isArtificial());
500 Succ->removePred(*Match);
501 }
502 }
503}
504
505void PipelineSolver::advancePosition() {
506 ++CurrConflInstNo;
507
508 if (static_cast<size_t>(CurrConflInstNo) >=
509 PipelineInstrs[CurrSyncGroupIdx].size()) {
510 CurrConflInstNo = 0;
511 ++CurrSyncGroupIdx;
512 // Advance to next non-trivial pipeline
513 while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size() &&
514 PipelineInstrs[CurrSyncGroupIdx].size() == 0)
515 ++CurrSyncGroupIdx;
516 }
517}
518
519void PipelineSolver::retreatPosition() {
520 assert(CurrConflInstNo >= 0);
521 assert(CurrSyncGroupIdx >= 0);
522
523 if (CurrConflInstNo > 0) {
524 --CurrConflInstNo;
525 return;
526 }
527
528 if (CurrConflInstNo == 0) {
529 // If we return to the starting position, we have explored
530 // the entire tree
531 if (CurrSyncGroupIdx == BeginSyncGroupIdx)
532 return;
533
534 --CurrSyncGroupIdx;
535 // Go to previous non-trivial pipeline
536 while (PipelineInstrs[CurrSyncGroupIdx].size() == 0)
537 --CurrSyncGroupIdx;
538
539 CurrConflInstNo = PipelineInstrs[CurrSyncGroupIdx].size() - 1;
540 }
541}
542
543bool PipelineSolver::checkOptimal() {
544 if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) {
545 if (BestCost == -1 || CurrCost < BestCost) {
546 BestPipeline = CurrPipeline;
547 BestCost = CurrCost;
548 LLVM_DEBUG(dbgs() << "Found Fit with cost " << BestCost << "\n");
549 }
550 assert(BestCost >= 0);
551 }
552
553 bool DoneExploring = false;
554 if (MaxBranchesExplored > 0 && BranchesExplored >= MaxBranchesExplored)
555 DoneExploring = true;
556
557 return (DoneExploring || BestCost == 0);
558}
559
template <typename T>
void PipelineSolver::populateReadyList(
    SmallVectorImpl<std::pair<int, int>> &ReadyList, T I, T E) {
  // The conflicted SU currently being fitted, with its candidate SG IDs.
  SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
  // NOTE(review): this copies the sync pipeline by value -- presumably so
  // the trial linking below cannot disturb CurrPipeline; verify before
  // changing to a reference.
  auto SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
  assert(CurrSU.second.size() >= 1);

  for (; I != E; ++I) {
    std::list<std::pair<SUnit *, SUnit *>> AddedEdges;
    int CandSGID = *I;
    SchedGroup *Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) {
      return SG.getSGID() == CandSGID;
    });
    assert(Match);

    if (UseCostHeur) {
      // A full group can only take the SU at the miss penalty.
      if (Match->isFull()) {
        ReadyList.push_back(std::pair(*I, MissPenalty));
        continue;
      }

      // Tentatively link, record the resulting cost, then undo the edges.
      int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
      ReadyList.push_back(std::pair(*I, TempCost));
      removeEdges(AddedEdges);
    } else
      ReadyList.push_back(std::pair(*I, -1));
  }

  // Cheapest assignments are tried first by the exact solver.
  if (UseCostHeur)
    std::sort(ReadyList.begin(), ReadyList.end(), llvm::less_second());

  assert(ReadyList.size() == CurrSU.second.size());
}
593
594bool PipelineSolver::solveExact() {
595 if (checkOptimal())
596 return true;
597
598 if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size())
599 return false;
600
601 assert(static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size());
602 assert(static_cast<size_t>(CurrConflInstNo) <
603 PipelineInstrs[CurrSyncGroupIdx].size());
604 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
605 LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
606 << ") in Pipeline # " << CurrSyncGroupIdx << "\n");
607
608 // SchedGroup -> Cost pairs
610 // Prioritize the candidate sched groups in terms of lowest cost first
611 IsBottomUp ? populateReadyList(ReadyList, CurrSU.second.rbegin(),
612 CurrSU.second.rend())
613 : populateReadyList(ReadyList, CurrSU.second.begin(),
614 CurrSU.second.end());
615
616 auto *I = ReadyList.begin();
617 auto *E = ReadyList.end();
618 for (; I != E; ++I) {
619 // If we are trying SGs in least cost order, and the current SG is cost
620 // infeasible, then all subsequent SGs will also be cost infeasible, so we
621 // can prune.
622 if (BestCost != -1 && (CurrCost + I->second > BestCost))
623 return false;
624
625 int CandSGID = I->first;
626 int AddedCost = 0;
627 std::list<std::pair<SUnit *, SUnit *>> AddedEdges;
628 auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
629 SchedGroup *Match;
630 for (auto &SG : SyncPipeline) {
631 if (SG.getSGID() == CandSGID)
632 Match = &SG;
633 }
634
635 if (Match->isFull())
636 continue;
637
638 if (!Match->allowedByRules(CurrSU.first, SyncPipeline))
639 continue;
640
641 LLVM_DEBUG(dbgs() << "Assigning to SchedGroup with Mask "
642 << (int)Match->getMask() << "and ID " << CandSGID
643 << "\n");
644 Match->add(*CurrSU.first);
645 AddedCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
646 LLVM_DEBUG(dbgs() << "Cost of Assignment: " << AddedCost << "\n");
647 CurrCost += AddedCost;
648 advancePosition();
649 ++BranchesExplored;
650 bool FinishedExploring = false;
651 // If the Cost after adding edges is greater than a known solution,
652 // backtrack
653 if (CurrCost < BestCost || BestCost == -1) {
654 if (solveExact()) {
655 FinishedExploring = BestCost != 0;
656 if (!FinishedExploring)
657 return true;
658 }
659 }
660
661 retreatPosition();
662 CurrCost -= AddedCost;
663 removeEdges(AddedEdges);
664 Match->pop();
665 CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
666 if (FinishedExploring)
667 return true;
668 }
669
670 // Try the pipeline where the current instruction is omitted
671 // Potentially if we omit a problematic instruction from the pipeline,
672 // all the other instructions can nicely fit.
673 CurrCost += MissPenalty;
674 advancePosition();
675
676 LLVM_DEBUG(dbgs() << "NOT Assigned (" << CurrSU.first->NodeNum << ")\n");
677
678 bool FinishedExploring = false;
679 if (CurrCost < BestCost || BestCost == -1) {
680 if (solveExact()) {
681 bool FinishedExploring = BestCost != 0;
682 if (!FinishedExploring)
683 return true;
684 }
685 }
686
687 retreatPosition();
688 CurrCost -= MissPenalty;
689 return FinishedExploring;
690}
691
692template <typename T>
693void PipelineSolver::greedyFind(
694 std::list<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E) {
695 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
696 int BestNodeCost = -1;
697 int TempCost;
698 SchedGroup *BestGroup = nullptr;
699 int BestGroupID = -1;
700 std::list<std::pair<SUnit *, SUnit *>> BestEdges;
701 auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
702 LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
703 << ") in Pipeline # " << CurrSyncGroupIdx << "\n");
704
705 // Since we have added the potential SchedGroups from bottom up, but
706 // traversed the DAG from top down, parse over the groups from last to
707 // first. If we fail to do this for the greedy algorithm, the solution will
708 // likely not be good in more complex cases.
709 for (; I != E; ++I) {
710 int CandSGID = *I;
711 SchedGroup *Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) {
712 return SG.getSGID() == CandSGID;
713 });
714 assert(Match);
715
716 LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask "
717 << (int)Match->getMask() << "\n");
718
719 if (Match->isFull()) {
720 LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " is full\n");
721 continue;
722 }
723 if (!Match->allowedByRules(CurrSU.first, SyncPipeline)) {
724 LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " has conflicting rule\n");
725 continue;
726 }
727
728 std::list<std::pair<SUnit *, SUnit *>> TempEdges;
729 TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, TempEdges);
730 LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << "\n");
731
732 if (TempCost < BestNodeCost || BestNodeCost == -1) {
733 BestEdges = TempEdges;
734 BestGroup = Match;
735 BestNodeCost = TempCost;
736 BestGroupID = CandSGID;
737
738 if (BestNodeCost == 0)
739 break;
740 }
741
742 removeEdges(TempEdges);
743 }
744
745 if (BestGroupID != -1) {
746 BestGroup->add(*CurrSU.first);
747 if (AddedEdges.empty())
748 AddedEdges = BestEdges;
749 else
750 AddedEdges.splice(std::prev(AddedEdges.cend()), BestEdges);
751
752 for (const std::pair<SUnit *, SUnit *> &E : BestEdges) {
753 if (!BestGroup->tryAddEdge(E.first, E.second))
754 llvm_unreachable("Edges known to be insertable.");
755 }
756
757 LLVM_DEBUG(dbgs() << "Best Group has ID: " << BestGroupID << " and Mask"
758 << (int)BestGroup->getMask() << "\n");
759 BestCost += TempCost;
760 } else
761 BestCost += MissPenalty;
762
763 CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
764}
765
766bool PipelineSolver::solveGreedy() {
767 BestCost = 0;
768 std::list<std::pair<SUnit *, SUnit *>> AddedEdges;
769
770 while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) {
771 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
772 IsBottomUp
773 ? greedyFind(AddedEdges, CurrSU.second.rbegin(), CurrSU.second.rend())
774 : greedyFind(AddedEdges, CurrSU.second.begin(), CurrSU.second.end());
775 advancePosition();
776 }
777 BestPipeline = CurrPipeline;
778 removeEdges(AddedEdges);
779 return false;
780}
781
782unsigned PipelineSolver::computeProblemSize() {
783 unsigned ProblemSize = 0;
784 for (auto &PipeConflicts : PipelineInstrs) {
785 ProblemSize += PipeConflicts.size();
786 }
787
788 return ProblemSize;
789}
790
791void PipelineSolver::solve() {
792 if (!NeedsSolver)
793 return;
794
795 unsigned ProblemSize = computeProblemSize();
796 assert(ProblemSize > 0);
797
798 bool BelowCutoff = (CutoffForExact > 0) && ProblemSize <= CutoffForExact;
799 MissPenalty = (ProblemSize / 2) + 1;
800
801 LLVM_DEBUG(DAG->dump());
802 if (EnableExactSolver || BelowCutoff) {
803 LLVM_DEBUG(dbgs() << "Starting Greedy pipeline solver\n");
804 solveGreedy();
805 reset();
806 LLVM_DEBUG(dbgs() << "Greedy produced best cost of " << BestCost << "\n");
807 if (BestCost > 0) {
808 LLVM_DEBUG(dbgs() << "Starting EXACT pipeline solver\n");
809 solveExact();
810 LLVM_DEBUG(dbgs() << "Exact produced best cost of " << BestCost << "\n");
811 }
812 } else { // Use the Greedy Algorithm by default
813 LLVM_DEBUG(dbgs() << "Starting GREEDY pipeline solver\n");
814 solveGreedy();
815 }
816
817 makePipeline();
818 LLVM_DEBUG(dbgs() << "After applying mutation\n");
819 LLVM_DEBUG(DAG->dump());
820}
821
// Identifiers for the concrete IGLP scheduling strategies implemented below.
enum IGLPStrategyID : int {
  MFMASmallGemmOptID = 0,
  MFMASmallGemmSingleWaveOptID = 1,
  MFMAExpInterleaveID = 2,
  MFMAExpSimpleInterleaveID = 3
};
828
829// Implement a IGLP scheduling strategy.
class IGLPStrategy {
protected:
  // NOTE(review): a "ScheduleDAGInstrs *DAG;" member appears to have been
  // dropped here during extraction -- the constructor initializes DAG(DAG).

  const SIInstrInfo *TII;

public:
  /// Add SchedGroups to \p SyncedSchedGroups to implement this Strategy.
  virtual bool applyIGLPStrategy(
      DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
  // NOTE(review): the remaining parameter(s) and the pure-specifier of this
  // declaration appear truncated (extraction artifact) -- TODO restore from
  // upstream.

  // Returns true if this strategy should be applied to a ScheduleDAG.
  virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
  // NOTE(review): this declaration appears truncated here as well.

  // Direction hint consumed by PipelineSolver; subclasses may override it in
  // their constructors.
  bool IsBottomUp = true;

  IGLPStrategy(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
      : DAG(DAG), TII(TII) {}

  virtual ~IGLPStrategy() = default;
};
854
// Strategy interleaving DS operations with MFMAs for small GEMM kernels
// (see applyIGLPStrategy definition below).
class MFMASmallGemmOpt final : public IGLPStrategy {
private:
public:
  bool applyIGLPStrategy(
      DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
  // NOTE(review): the remainder of this declaration appears truncated
  // (extraction artifact) -- TODO restore from upstream.

  bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
  // NOTE(review): parameter list / opening brace truncated here; the visible
  // body unconditionally accepts the DAG.
    return true;
  }

  MFMASmallGemmOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
      : IGLPStrategy(DAG, TII) {
    // This strategy builds its pipeline bottom-up.
    IsBottomUp = true;
  }
};
873
bool MFMASmallGemmOpt::applyIGLPStrategy(
    DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
// NOTE(review): the SyncedInstrs parameter (used below) and the rest of this
// signature appear truncated (extraction artifact) -- TODO restore from
// upstream.
  // Count the number of MFMA instructions.
  unsigned MFMACount = 0;
  for (const MachineInstr &I : *DAG)
    if (TII->isMFMAorWMMA(I))
      ++MFMACount;

  // Request MFMACount * 3 repetitions of {DS x2, MFMA x1} in one synced
  // pipeline (SyncID 0).
  const unsigned PipelineSyncID = 0;
  SchedGroup *SG = nullptr;
  for (unsigned I = 0; I < MFMACount * 3; ++I) {
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::DS, 2, PipelineSyncID, DAG, TII);
    SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
  }

  return true;
}
898
899class MFMAExpInterleaveOpt final : public IGLPStrategy {
900private:
901 // The count of TRANS SUs involved in the interleaved pipeline
902 static unsigned TransPipeCount;
903 // The count of MFMA SUs involved in the interleaved pipeline
904 static unsigned MFMAPipeCount;
905 // The count of Add SUs involved in the interleaved pipeline
906 static unsigned AddPipeCount;
907 // The number of transitive MFMA successors for each TRANS SU
908 static unsigned MFMAEnablement;
909 // The number of transitive TRANS predecessors for each MFMA SU
910 static unsigned ExpRequirement;
911 // The count of independent "chains" of MFMA instructions in the pipeline
912 static unsigned MFMAChains;
913 // The length of each independent "chain" of MFMA instructions
914 static unsigned MFMAChainLength;
915 // Whether or not the pipeline has V_CVT instructions
916 static bool HasCvt;
917 // Whether or not there are instructions between the TRANS instruction and
918 // V_CVT
919 static bool HasChainBetweenCvt;
  // The first occurring DS_READ which feeds an MFMA chain
921 static std::optional<unsigned> FirstPipeDSR;
922 // The MFMAPipe SUs with no MFMA predecessors
923 SmallVector<SUnit *, 4> MFMAChainSeeds;
924 // Compute the heuristics for the pipeline, returning whether or not the DAG
925 // is well formatted for the mutation
926 bool analyzeDAG(const SIInstrInfo *TII);
927
928 /// Whether or not the instruction is a transitive predecessor of an MFMA
929 /// instruction
  class IsPipeExp final : public InstructionRule {
  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {

      auto *DAG = SyncPipe[0].DAG;

      // Lazily collect every MFMA/WMMA SUnit (in reverse SUnit order) into
      // the rule's cache on the first invocation.
      if (Cache->empty()) {
        auto I = DAG->SUnits.rbegin();
        auto E = DAG->SUnits.rend();
        for (; I != E; I++) {
          if (TII->isMFMAorWMMA(*I->getInstr()))
            Cache->push_back(&*I);
        }
        // No MFMAs in the DAG means nothing can be a pipe-exp predecessor.
        if (Cache->empty())
          return false;
      }

      // SU qualifies if any cached MFMA is transitively reachable from it.
      auto Reaches = any_of(*Cache, [&SU, &DAG](SUnit *TargetSU) {
        return DAG->IsReachable(TargetSU, const_cast<SUnit *>(SU));
      });

      return Reaches;
    }
    IsPipeExp(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}
  };
957
958 /// Whether or not the instruction is a transitive predecessor of the
959 /// \p Number th MFMA of the MFMAs occuring after a TRANS instruction
960 class EnablesNthMFMA final : public InstructionRule {
961 private:
962 unsigned Number = 1;
963
964 public:
965 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
966 SmallVectorImpl<SchedGroup> &SyncPipe) override {
967 bool FoundTrans = false;
968 unsigned Counter = 1;
969 auto *DAG = SyncPipe[0].DAG;
970
971 if (Cache->empty()) {
972 auto I = DAG->SUnits.begin();
973 auto E = DAG->SUnits.end();
974 for (; I != E; I++) {
975 if (FoundTrans && TII->isMFMAorWMMA(*I->getInstr())) {
976 if (Counter == Number) {
977 Cache->push_back(&*I);
978 break;
979 }
980 ++Counter;
981 }
982 if (!FoundTrans && TII->isTRANS(I->getInstr()->getOpcode()))
983 FoundTrans = true;
984 }
985 if (Cache->empty())
986 return false;
987 }
988
989 return DAG->IsReachable((*Cache)[0], const_cast<SUnit *>(SU));
990 }
991
992 EnablesNthMFMA(unsigned Number, const SIInstrInfo *TII, unsigned SGID,
993 bool NeedsCache = false)
994 : InstructionRule(TII, SGID, NeedsCache), Number(Number) {}
995 };
996
997 /// Whether or not the instruction enables the exact MFMA that is the \p
998 /// Number th MFMA in the chain starting with \p ChainSeed
999 class EnablesNthMFMAInChain final : public InstructionRule {
1000 private:
1001 unsigned Number = 1;
1002 SUnit *ChainSeed;
1003
1004 public:
1005 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1006 SmallVectorImpl<SchedGroup> &SyncPipe) override {
1007 auto *DAG = SyncPipe[0].DAG;
1008
1009 if (!SU || !TII->isMFMAorWMMA(*ChainSeed->getInstr()))
1010 return false;
1011
1012 if (Cache->empty()) {
1013 auto *TempSU = ChainSeed;
1014 auto Depth = Number;
1015 while (Depth > 0) {
1016 --Depth;
1017 bool Found = false;
1018 for (auto &Succ : TempSU->Succs) {
1019 if (TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr())) {
1020 TempSU = Succ.getSUnit();
1021 Found = true;
1022 break;
1023 }
1024 }
1025 if (!Found)
1026 return false;
1027 }
1028
1029 Cache->push_back(TempSU);
1030 }
1031 // If we failed to find the instruction to be placed into the cache, we
1032 // would have already exited.
1033 assert(!Cache->empty());
1034
1035 return DAG->IsReachable((*Cache)[0], const_cast<SUnit *>(SU));
1036 }
1037
1038 EnablesNthMFMAInChain(unsigned Number, SUnit *ChainSeed,
1039 const SIInstrInfo *TII, unsigned SGID,
1040 bool NeedsCache = false)
1041 : InstructionRule(TII, SGID, NeedsCache), Number(Number),
1042 ChainSeed(ChainSeed) {}
1043 };
1044
1045 /// Whether or not the instruction has less than \p Size immediate successors.
1046 /// If \p HasIntermediary is true, this tests also whether all successors of
1047 /// the SUnit have less than \p Size successors.
1048 class LessThanNSuccs final : public InstructionRule {
1049 private:
1050 unsigned Size = 1;
1051 bool HasIntermediary = false;
1052
1053 public:
1054 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1055 SmallVectorImpl<SchedGroup> &SyncPipe) override {
1056 if (!SyncPipe.size())
1057 return false;
1058
1059 unsigned SuccSize = llvm::count_if(SU->Succs, [](const SDep &Succ) {
1060 return Succ.getKind() == SDep::Data;
1061 });
1062 if (SuccSize >= Size)
1063 return false;
1064
1065 if (HasIntermediary) {
1066 for (auto Succ : SU->Succs) {
1067 unsigned SuccSize =
1068 llvm::count_if(Succ.getSUnit()->Succs, [](const SDep &SuccSucc) {
1069 return SuccSucc.getKind() == SDep::Data;
1070 });
1071 if (SuccSize >= Size)
1072 return false;
1073 }
1074 }
1075
1076 return true;
1077 }
1078 LessThanNSuccs(unsigned Size, const SIInstrInfo *TII, unsigned SGID,
1079 bool HasIntermediary = false, bool NeedsCache = false)
1080 : InstructionRule(TII, SGID, NeedsCache), Size(Size),
1081 HasIntermediary(HasIntermediary) {}
1082 };
1083
1084 /// Whether or not the instruction has greater than or equal to \p Size
1085 /// immediate successors. If \p HasIntermediary is true, this tests also
1086 /// whether all successors of the SUnit have greater than or equal to \p Size
1087 /// successors.
1088 class GreaterThanOrEqualToNSuccs final : public InstructionRule {
1089 private:
1090 unsigned Size = 1;
1091 bool HasIntermediary = false;
1092
1093 public:
1094 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1095 SmallVectorImpl<SchedGroup> &SyncPipe) override {
1096 if (!SyncPipe.size())
1097 return false;
1098
1099 unsigned SuccSize = llvm::count_if(SU->Succs, [](const SDep &Succ) {
1100 return Succ.getKind() == SDep::Data;
1101 });
1102 if (SuccSize >= Size)
1103 return true;
1104
1105 if (HasIntermediary) {
1106 for (auto Succ : SU->Succs) {
1107 unsigned SuccSize =
1108 llvm::count_if(Succ.getSUnit()->Succs, [](const SDep &SuccSucc) {
1109 return SuccSucc.getKind() == SDep::Data;
1110 });
1111 if (SuccSize >= Size)
1112 return true;
1113 }
1114 }
1115
1116 return false;
1117 }
1118 GreaterThanOrEqualToNSuccs(unsigned Size, const SIInstrInfo *TII,
1119 unsigned SGID, bool HasIntermediary = false,
1120 bool NeedsCache = false)
1121 : InstructionRule(TII, SGID, NeedsCache), Size(Size),
1122 HasIntermediary(HasIntermediary) {}
1123 };
1124
1125 // Whether or not the instruction is a relevant V_CVT instruction.
1126 class IsCvt final : public InstructionRule {
1127 public:
1128 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1129 SmallVectorImpl<SchedGroup> &SyncPipe) override {
1130 auto Opc = SU->getInstr()->getOpcode();
1131 return Opc == AMDGPU::V_CVT_F16_F32_e32 ||
1132 Opc == AMDGPU::V_CVT_I32_F32_e32;
1133 }
1134 IsCvt(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
1135 : InstructionRule(TII, SGID, NeedsCache) {}
1136 };
1137
1138 // Whether or not the instruction is FMA_F32.
1139 class IsFMA final : public InstructionRule {
1140 public:
1141 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1142 SmallVectorImpl<SchedGroup> &SyncPipe) override {
1143 return SU->getInstr()->getOpcode() == AMDGPU::V_FMA_F32_e64 ||
1144 SU->getInstr()->getOpcode() == AMDGPU::V_PK_FMA_F32;
1145 }
1146 IsFMA(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
1147 : InstructionRule(TII, SGID, NeedsCache) {}
1148 };
1149
1150 // Whether or not the instruction is a V_ADD_F32 instruction.
1151 class IsPipeAdd final : public InstructionRule {
1152 public:
1153 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1154 SmallVectorImpl<SchedGroup> &SyncPipe) override {
1155 return SU->getInstr()->getOpcode() == AMDGPU::V_ADD_F32_e32;
1156 }
1157 IsPipeAdd(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
1158 : InstructionRule(TII, SGID, NeedsCache) {}
1159 };
1160
1161 /// Whether or not the instruction is an immediate RAW successor
1162 /// of the SchedGroup \p Distance steps before.
1163 class IsSuccOfPrevNthGroup final : public InstructionRule {
1164 private:
1165 unsigned Distance = 1;
1166
1167 public:
1168 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1169 SmallVectorImpl<SchedGroup> &SyncPipe) override {
1170 SchedGroup *OtherGroup = nullptr;
1171 if (!SyncPipe.size())
1172 return false;
1173
1174 for (auto &PipeSG : SyncPipe) {
1175 if ((unsigned)PipeSG.getSGID() == SGID - Distance)
1176 OtherGroup = &PipeSG;
1177 }
1178
1179 if (!OtherGroup)
1180 return false;
1181 if (!OtherGroup->Collection.size())
1182 return true;
1183
1184 for (auto &OtherEle : OtherGroup->Collection) {
1185 for (auto &Succ : OtherEle->Succs) {
1186 if (Succ.getSUnit() == SU && Succ.getKind() == SDep::Data)
1187 return true;
1188 }
1189 }
1190
1191 return false;
1192 }
1193 IsSuccOfPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
1194 unsigned SGID, bool NeedsCache = false)
1195 : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}
1196 };
1197
1198 /// Whether or not the instruction is a transitive successor of any
1199 /// instruction the the SchedGroup \p Distance steps before.
1200 class IsReachableFromPrevNthGroup final : public InstructionRule {
1201 private:
1202 unsigned Distance = 1;
1203
1204 public:
1205 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1206 SmallVectorImpl<SchedGroup> &SyncPipe) override {
1207 SchedGroup *OtherGroup = nullptr;
1208 if (!SyncPipe.size())
1209 return false;
1210
1211 for (auto &PipeSG : SyncPipe) {
1212 if ((unsigned)PipeSG.getSGID() == SGID - Distance)
1213 OtherGroup = &PipeSG;
1214 }
1215
1216 if (!OtherGroup)
1217 return false;
1218 if (!OtherGroup->Collection.size())
1219 return true;
1220
1221 auto *DAG = SyncPipe[0].DAG;
1222
1223 for (auto &OtherEle : OtherGroup->Collection)
1224 if (DAG->IsReachable(const_cast<SUnit *>(SU), OtherEle))
1225 return true;
1226
1227 return false;
1228 }
1229 IsReachableFromPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
1230 unsigned SGID, bool NeedsCache = false)
1231 : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}
1232 };
1233
1234 /// Whether or not the instruction occurs after the SU with NodeNUm \p Number
1235 class OccursAtOrAfterNode final : public InstructionRule {
1236 private:
1237 unsigned Number = 1;
1238
1239 public:
1240 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1241 SmallVectorImpl<SchedGroup> &SyncPipe) override {
1242
1243 return SU->NodeNum >= Number;
1244 }
1245 OccursAtOrAfterNode(unsigned Number, const SIInstrInfo *TII, unsigned SGID,
1246 bool NeedsCache = false)
1247 : InstructionRule(TII, SGID, NeedsCache), Number(Number) {}
1248 };
1249
1250 /// Whether or not the SU is exactly the \p Number th MFMA in the chain
1251 /// starting with \p ChainSeed
1252 class IsExactMFMA final : public InstructionRule {
1253 private:
1254 unsigned Number = 1;
1255 SUnit *ChainSeed;
1256
1257 public:
1258 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1259 SmallVectorImpl<SchedGroup> &SyncPipe) override {
1260 if (!SU || !TII->isMFMAorWMMA(*ChainSeed->getInstr()))
1261 return false;
1262
1263 if (Cache->empty()) {
1264 auto *TempSU = ChainSeed;
1265 auto Depth = Number;
1266 while (Depth > 0) {
1267 --Depth;
1268 bool Found = false;
1269 for (auto &Succ : TempSU->Succs) {
1270 if (TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr())) {
1271 TempSU = Succ.getSUnit();
1272 Found = true;
1273 break;
1274 }
1275 }
1276 if (!Found) {
1277 return false;
1278 }
1279 }
1280 Cache->push_back(TempSU);
1281 }
1282 // If we failed to find the instruction to be placed into the cache, we
1283 // would have already exited.
1284 assert(!Cache->empty());
1285
1286 return (*Cache)[0] == SU;
1287 }
1288
1289 IsExactMFMA(unsigned Number, SUnit *ChainSeed, const SIInstrInfo *TII,
1290 unsigned SGID, bool NeedsCache = false)
1291 : InstructionRule(TII, SGID, NeedsCache), Number(Number),
1292 ChainSeed(ChainSeed) {}
1293 };
1294
1295 // Whether the instruction occurs after the first TRANS instruction. This
1296 // implies the instruction can not be a predecessor of the first TRANS
1297 // insruction
1298 class OccursAfterExp final : public InstructionRule {
1299 public:
1300 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1301 SmallVectorImpl<SchedGroup> &SyncPipe) override {
1302
1303 auto *DAG = SyncPipe[0].DAG;
1304 if (Cache->empty()) {
1305 for (auto &SU : DAG->SUnits)
1306 if (TII->isTRANS(SU.getInstr()->getOpcode())) {
1307 Cache->push_back(&SU);
1308 break;
1309 }
1310 if (Cache->empty())
1311 return false;
1312 }
1313
1314 return SU->NodeNum > (*Cache)[0]->NodeNum;
1315 }
1316
1317 OccursAfterExp(const SIInstrInfo *TII, unsigned SGID,
1318 bool NeedsCache = false)
1319 : InstructionRule(TII, SGID, NeedsCache) {}
1320 };
1321
1322public:
1323 bool applyIGLPStrategy(
1325 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
1327
1328 bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
1330
1331 MFMAExpInterleaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
1332 : IGLPStrategy(DAG, TII) {
1333 IsBottomUp = false;
1334 }
1335};
1336
// Out-of-line definitions of MFMAExpInterleaveOpt's static heuristic state;
// the values are recomputed per scheduling region by analyzeDAG().
unsigned MFMAExpInterleaveOpt::TransPipeCount = 0;
unsigned MFMAExpInterleaveOpt::MFMAPipeCount = 0;
unsigned MFMAExpInterleaveOpt::AddPipeCount = 0;
unsigned MFMAExpInterleaveOpt::MFMAEnablement = 0;
unsigned MFMAExpInterleaveOpt::ExpRequirement = 0;
unsigned MFMAExpInterleaveOpt::MFMAChains = 0;
unsigned MFMAExpInterleaveOpt::MFMAChainLength = 0;
bool MFMAExpInterleaveOpt::HasCvt = false;
bool MFMAExpInterleaveOpt::HasChainBetweenCvt = false;
std::optional<unsigned> MFMAExpInterleaveOpt::FirstPipeDSR = std::nullopt;
1347
// Scan the region's DAG and derive the pipeline heuristics (TransPipeCount,
// MFMAPipeCount, MFMAEnablement, ExpRequirement, MFMAChains, ...). Returns
// false when the DAG does not match the interleaved EXP->pack->MFMA shape.
bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
  SmallVector<SUnit *, 10> ExpPipeCands;
  SmallVector<SUnit *, 10> MFMAPipeCands;
  SmallVector<SUnit *, 10> MFMAPipeSUs;
  // NOTE(review): the declarations of PackSUs and CvtSUs (used below) appear
  // to be missing from this extraction of the file; verify upstream.

  auto isBitPack = [](unsigned Opc) {
    return Opc == AMDGPU::V_PACK_B32_F16_e64 || Opc == AMDGPU::V_PERM_B32_e64;
  };

  auto isCvt = [](unsigned Opc) {
    return Opc == AMDGPU::V_CVT_F16_F32_e32 || Opc == AMDGPU::V_CVT_I32_F32_e32;
  };

  auto isAdd = [](unsigned Opc) { return Opc == AMDGPU::V_ADD_F32_e32; };

  // Classify every SU in the region into the candidate buckets.
  AddPipeCount = 0;
  for (SUnit &SU : DAG->SUnits) {
    auto Opc = SU.getInstr()->getOpcode();
    if (TII->isTRANS(Opc)) {
      // Avoid counting a potential bonus V_EXP which all the MFMA depend on
      if (SU.Succs.size() >= 7)
        continue;
      // NOTE(review): this loop has no observable effect — the 'continue'
      // only skips an iteration of this inner loop. It presumably intended
      // to skip pushing SU when a successor fans out to >= 7 successors;
      // confirm intent before changing.
      for (auto &Succ : SU.Succs) {
        if (Succ.getSUnit()->Succs.size() >= 7)
          continue;
      }
      ExpPipeCands.push_back(&SU);
    }

    if (TII->isMFMAorWMMA(*SU.getInstr()))
      MFMAPipeCands.push_back(&SU);

    if (isBitPack(Opc))
      PackSUs.push_back(&SU);

    if (isCvt(Opc))
      CvtSUs.push_back(&SU);

    if (isAdd(Opc))
      ++AddPipeCount;
  }

  if (!(PackSUs.size() && MFMAPipeCands.size() && ExpPipeCands.size()))
    return false;

  TransPipeCount = 0;

  std::optional<SUnit *> TempMFMA;
  std::optional<SUnit *> TempExp;
  // Count the number of EXPs that reach an MFMA
  for (auto &PredSU : ExpPipeCands) {
    for (auto &SuccSU : MFMAPipeCands) {
      if (DAG->IsReachable(SuccSU, PredSU)) {
        if (!TempExp) {
          // Remember one representative EXP/MFMA pair for later probing.
          TempExp = PredSU;
          TempMFMA = SuccSU;
        }
        MFMAPipeSUs.push_back(SuccSU);
        ++TransPipeCount;
        break;
      }
    }
  }

  if (!(TempExp && TempMFMA))
    return false;

  // True when no immediate successor of the representative EXP is a V_CVT,
  // i.e. there are intervening instructions between EXP and CVT.
  HasChainBetweenCvt = none_of((*TempExp)->Succs, [&isCvt](SDep &Succ) {
    return isCvt(Succ.getSUnit()->getInstr()->getOpcode());
  });

  // Count the number of MFMAs that are reached by an EXP
  for (auto &SuccSU : MFMAPipeCands) {
    // Skip MFMAs already recorded in the first pass.
    if (MFMAPipeSUs.size() &&
        any_of(MFMAPipeSUs, [&SuccSU](SUnit *PotentialMatch) {
          return PotentialMatch->NodeNum == SuccSU->NodeNum;
        }))
      continue;

    for (auto &PredSU : ExpPipeCands) {
      if (DAG->IsReachable(SuccSU, PredSU)) {
        MFMAPipeSUs.push_back(SuccSU);
        break;
      }
    }
  }

  MFMAPipeCount = MFMAPipeSUs.size();

  assert(TempExp && TempMFMA);
  assert(MFMAPipeCount > 0);

  // Find a V_CVT reachable from the representative EXP, if any.
  std::optional<SUnit *> TempCvt;
  for (auto &SuccSU : CvtSUs) {
    if (DAG->IsReachable(SuccSU, *TempExp)) {
      TempCvt = SuccSU;
      break;
    }
  }

  // HasCvt: the pipeline routes EXP results through a V_CVT into an MFMA.
  HasCvt = false;
  if (TempCvt.has_value()) {
    for (auto &SuccSU : MFMAPipeSUs) {
      if (DAG->IsReachable(SuccSU, *TempCvt)) {
        HasCvt = true;
        break;
      }
    }
  }

  // Chain seeds are pipe MFMAs with no MFMA predecessor: each one starts an
  // independent MFMA chain.
  MFMAChains = 0;
  for (auto &MFMAPipeSU : MFMAPipeSUs) {
    if (is_contained(MFMAChainSeeds, MFMAPipeSU))
      continue;
    if (none_of(MFMAPipeSU->Preds, [&TII](SDep &Succ) {
          return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr());
        })) {
      MFMAChainSeeds.push_back(MFMAPipeSU);
      ++MFMAChains;
    }
  }

  if (!MFMAChains)
    return false;

  // Record the DS load (if any) feeding the first chain seed.
  for (auto Pred : MFMAChainSeeds[0]->Preds) {
    if (TII->isDS(Pred.getSUnit()->getInstr()->getOpcode()) &&
        Pred.getSUnit()->getInstr()->mayLoad())
      FirstPipeDSR = Pred.getSUnit()->NodeNum;
  }

  MFMAChainLength = MFMAPipeCount / MFMAChains;

  // The number of bit pack operations that depend on a single V_EXP
  unsigned PackSuccCount =
      llvm::count_if(PackSUs, [this, &TempExp](SUnit *VPack) {
        return DAG->IsReachable(VPack, *TempExp);
      });

  // The number of bit pack operations an MFMA depends on
  unsigned PackPredCount =
      llvm::count_if((*TempMFMA)->Preds, [&isBitPack](SDep &Pred) {
        auto Opc = Pred.getSUnit()->getInstr()->getOpcode();
        return isBitPack(Opc);
      });

  auto *PackPred = llvm::find_if((*TempMFMA)->Preds, [&isBitPack](SDep &Pred) {
    auto Opc = Pred.getSUnit()->getInstr()->getOpcode();
    return isBitPack(Opc);
  });

  if (PackPred == (*TempMFMA)->Preds.end())
    return false;

  MFMAEnablement = 0;
  ExpRequirement = 0;
  // How many MFMAs depend on a single bit pack operation
  MFMAEnablement =
      llvm::count_if(PackPred->getSUnit()->Succs, [&TII](SDep &Succ) {
        return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr());
      });

  // The number of MFMAs that depend on a single V_EXP
  MFMAEnablement *= PackSuccCount;

  // The number of V_EXPs required to resolve all dependencies for an MFMA
  ExpRequirement =
      llvm::count_if(ExpPipeCands, [this, &PackPred](SUnit *ExpBase) {
        return DAG->IsReachable(PackPred->getSUnit(), ExpBase);
      });

  ExpRequirement *= PackPredCount;
  return true;
}
1524
// Decide whether the interleave strategy applies: pre-RA phases re-run the
// DAG analysis and reject regions that do not fit the pipeline shape.
// NOTE(review): a parameter line of the signature and the guard line above
// the clear() call appear truncated in this extraction; verify upstream.
bool MFMAExpInterleaveOpt::shouldApplyStrategy(ScheduleDAGInstrs *DAG,
  const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();

    MFMAChainSeeds.clear();
  if (Phase != AMDGPU::SchedulingPhase::PostRA && !analyzeDAG(TII))
    return false;

  return true;
}
1537
// Build the interleaved pipeline as a sequence of SchedGroups in three
// phases: (1) a "prefetch" prologue of FMA/DS_READ/EXP groups, (2) a main
// loop interleaving MFMA with CVT/EXP/FMA, (3) the remaining MFMAs.
// NOTE(review): two parameter lines of the signature appear truncated in
// this extraction; verify upstream.
bool MFMAExpInterleaveOpt::applyIGLPStrategy(
    DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,

  // Only two known kernel shapes are handled; everything else bails out.
  bool IsSmallKernelType =
      MFMAEnablement == 2 && ExpRequirement == 4 && TransPipeCount == 32;
  bool IsLargeKernelType =
      MFMAEnablement == 4 && ExpRequirement == 4 && TransPipeCount == 64;

  if (!(IsSmallKernelType || IsLargeKernelType))
    return false;

  const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();

  unsigned PipelineSyncID = 0;
  SchedGroup *SG = nullptr;

  // Cursor state tracking which chain / position the next TRANS group targets.
  unsigned MFMAChain = 0;
  unsigned PositionInChain = 0;
  unsigned CurrMFMAForTransPosition = 0;

  auto incrementTransPosition = [&MFMAChain, &PositionInChain,
                                 &CurrMFMAForTransPosition]() {
    CurrMFMAForTransPosition += MFMAEnablement;
    PositionInChain = (CurrMFMAForTransPosition / MFMAChains);
    MFMAChain = CurrMFMAForTransPosition % MFMAChains;
  };

  auto getNextTransPositionInChain = [&CurrMFMAForTransPosition]() {
    auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement;
    return (TempMFMAForTrans / MFMAChains);
  };

  auto getNextTransMFMAChain = [&CurrMFMAForTransPosition]() {
    auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement;
    return TempMFMAForTrans % MFMAChains;
  };

  // Cursor state tracking which chain / position the next MFMA group targets.
  unsigned CurrMFMAPosition = 0;
  unsigned MFMAChainForMFMA = 0;
  unsigned PositionInChainForMFMA = 0;

  auto incrementMFMAPosition = [&CurrMFMAPosition, &MFMAChainForMFMA,
                                &PositionInChainForMFMA]() {
    ++CurrMFMAPosition;
    MFMAChainForMFMA = CurrMFMAPosition % MFMAChains;
    PositionInChainForMFMA = CurrMFMAPosition / MFMAChains;
  };

  bool IsPostRA = Phase == AMDGPU::SchedulingPhase::PostRA;
  assert(IsPostRA || MFMAChainSeeds.size() == MFMAChains);

  // Which optional group kinds participate for this kernel shape / phase.
  bool UsesFMA = IsSmallKernelType || !IsPostRA;
  bool UsesDSRead = IsLargeKernelType && !IsPostRA && FirstPipeDSR;
  bool UsesCvt = HasCvt && (IsSmallKernelType || !IsPostRA);
  bool UsesVALU = IsSmallKernelType;

  // PHASE 1: "Prefetch"
  if (UsesFMA) {
    // First Round FMA
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VALU, ExpRequirement, PipelineSyncID, DAG, TII);
    if (!IsPostRA && MFMAChains) {
      SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
          PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(),
          true));
    } else
      SG->addRule(
          std::make_shared<EnablesNthMFMA>(1, TII, SG->getSGID(), true));
    SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));
    SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);

    // Second Round FMA
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VALU, ExpRequirement, PipelineSyncID, DAG, TII);
    if (!IsPostRA && MFMAChains) {
      SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
          getNextTransPositionInChain(),
          MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(), true));
    } else
      SG->addRule(std::make_shared<EnablesNthMFMA>(MFMAEnablement + 1, TII,
                                                   SG->getSGID(), true));
    SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));
    SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
  }

  if (UsesDSRead) {
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<OccursAtOrAfterNode>(*FirstPipeDSR, TII,
                                                      SG->getSGID()));
    SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
  }

  // First Round EXP
  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
      SchedGroupMask::TRANS, ExpRequirement, PipelineSyncID, DAG, TII);
  if (!IsPostRA && MFMAChains)
    SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
        PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(), true));
  else
    SG->addRule(std::make_shared<EnablesNthMFMA>(1, TII, SG->getSGID(), true));
  SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
  SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(),
                                               HasChainBetweenCvt));
  SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);

  incrementTransPosition();

  // First Round CVT, Third Round FMA, Second Round EXP; interleaved
  for (unsigned I = 0; I < ExpRequirement; I++) {
    // First Round CVT
    if (UsesCvt) {
      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
          SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
      SG->addRule(std::make_shared<IsCvt>(TII, SG->getSGID()));
      if (HasChainBetweenCvt)
        SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>(
            1 + (2 + UsesFMA) * I, TII, SG->getSGID()));
      else
        SG->addRule(std::make_shared<IsSuccOfPrevNthGroup>(
            1 + (2 + UsesFMA) * I, TII, SG->getSGID()));
      SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
    }

    // Third Round FMA
    if (UsesFMA) {
      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
          SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
      if (!IsPostRA && MFMAChains) {
        SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
            getNextTransPositionInChain(),
            MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(), true));
      } else
        SG->addRule(std::make_shared<EnablesNthMFMA>(2 * MFMAEnablement + 1,
                                                     TII, SG->getSGID(), true));
      SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));
      SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
    }

    // Second Round EXP
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
    if (!IsPostRA && MFMAChains)
      SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
          PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(),
          true));
    else
      SG->addRule(std::make_shared<EnablesNthMFMA>(MFMAEnablement + 1, TII,
                                                   SG->getSGID(), true));
    SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
    SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(),
                                                 HasChainBetweenCvt));
    SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
  }

  // The "extra" EXP which enables all MFMA
  // TODO: UsesExtraExp
  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
      SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
  SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
  SG->addRule(std::make_shared<GreaterThanOrEqualToNSuccs>(
      8, TII, SG->getSGID(), HasChainBetweenCvt));
  SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);

  // PHASE 2: Main Interleave Loop

  // The number of MFMAs per iteration
  unsigned MFMARatio =
      MFMAEnablement > ExpRequirement ? MFMAEnablement / ExpRequirement : 1;
  // The number of Exps per iteration
  unsigned ExpRatio =
      MFMAEnablement > ExpRequirement ? 1 : ExpRequirement / MFMAEnablement;
  // The remaining Exps
  unsigned RemainingExp = TransPipeCount > (2 * ExpRequirement)
                              ? TransPipeCount - (2 * ExpRequirement)
                              : 0;
  unsigned ExpLoopCount = RemainingExp / ExpRatio;
  // In loop MFMAs
  unsigned MFMAInLoop = MFMAPipeCount > (MFMAEnablement * 2)
                            ? MFMAPipeCount - (MFMAEnablement * 2)
                            : 0;
  unsigned MFMALoopCount = MFMAInLoop / MFMARatio;
  unsigned VALUOps =
      AddPipeCount < MFMAPipeCount ? 1 : AddPipeCount / MFMAPipeCount;
  unsigned LoopSize = std::min(ExpLoopCount, MFMALoopCount);

  for (unsigned I = 0; I < LoopSize; I++) {
    // Advance the TRANS cursor once per ExpRequirement worth of EXPs.
    if (!(I * ExpRatio % ExpRequirement))
      incrementTransPosition();

    // Round N MFMA
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, MFMARatio, PipelineSyncID, DAG, TII);
    if (!IsPostRA && MFMAChains)
      SG->addRule(std::make_shared<IsExactMFMA>(
          PositionInChainForMFMA, MFMAChainSeeds[MFMAChainForMFMA], TII,
          SG->getSGID(), true));
    else
      SG->addRule(std::make_shared<OccursAfterExp>(TII, SG->getSGID(), true));
    SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
    incrementMFMAPosition();

    if (UsesVALU) {
      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
          SchedGroupMask::VALU, VALUOps, PipelineSyncID, DAG, TII);
      SG->addRule(std::make_shared<IsPipeAdd>(TII, SG->getSGID()));
      SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
    }

    // DS reads are issued every fourth iteration.
    if (UsesDSRead && !(I % 4)) {
      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
          SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG, TII);
      SG->addRule(std::make_shared<OccursAtOrAfterNode>(*FirstPipeDSR, TII,
                                                        SG->getSGID()));
      SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
    }

    // CVT, EXP, FMA Interleaving
    for (unsigned J = 0; J < ExpRatio; J++) {
      auto MFMAOffset = (1 + UsesVALU) * MFMARatio * (I + 1);
      auto MaxMFMAOffset =
          (1 + UsesVALU) * ExpRequirement * MFMARatio / ExpRatio;

      // Round N + 1 CVT
      if (UsesCvt) {
        SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
            SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
        SG->addRule(std::make_shared<IsCvt>(TII, SG->getSGID()));
        // Offset back to the EXP group this CVT must consume from; accounts
        // for the optional FMA/DS_READ/VALU groups emitted in between.
        auto BaseDiff = (2 + UsesFMA) * (ExpRequirement - 1) + 1;
        auto DSROffset = I / 4 + 1;
        auto MaxDSROffset = MaxMFMAOffset / 4;
        // TODO: UsesExtraExp
        auto ExpOffset = I * ExpRatio + J >= ExpRequirement ? 0 : 1;
        auto CurrentOffset = UsesDSRead * std::min(MaxDSROffset, DSROffset) +
                             std::min(MaxMFMAOffset, MFMAOffset) + BaseDiff +
                             ExpOffset;
        if (HasChainBetweenCvt)
          SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>(
              CurrentOffset, TII, SG->getSGID()));
        else
          SG->addRule(std::make_shared<IsSuccOfPrevNthGroup>(CurrentOffset, TII,
                                                             SG->getSGID()));
        SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
      }

      // Round N + 3 FMA
      if (UsesFMA) {
        SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
            SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
        if (!IsPostRA && MFMAChains)
          SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
              getNextTransPositionInChain(),
              MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(),
              true));
        else
          SG->addRule(std::make_shared<EnablesNthMFMA>(
              (((I * ExpRatio + J) / ExpRequirement) + 3) * MFMAEnablement + 1,
              TII, SG->getSGID(), true));
        SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));
        SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
      }

      // Round N + 2 Exp
      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
          SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
      if (!IsPostRA && MFMAChains)
        SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
            PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(),
            true));
      else
        SG->addRule(std::make_shared<EnablesNthMFMA>(
            (((I * ExpRatio + J) / ExpRequirement) + 2) * MFMAEnablement + 1,
            TII, SG->getSGID(), true));
      SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
      SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(),
                                                   HasChainBetweenCvt));
      SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
    }
  }

  // PHASE 3: Remaining MFMAs
  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
      SchedGroupMask::MFMA, MFMAEnablement * 2, PipelineSyncID, DAG, TII);
  SG->addRule(std::make_shared<OccursAfterExp>(TII, SG->getSGID(), true));
  SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
  return true;
}
1828
/// Simple strategy that alternates single TRANS and single MFMA groups,
/// applied unconditionally (shouldApplyStrategy always returns true).
class MFMAExpSimpleInterleaveOpt final : public IGLPStrategy {
public:
  // NOTE(review): the parameter lines of this declaration appear truncated
  // in this extraction (SyncedInstrs / Phase missing); verify upstream.
  bool applyIGLPStrategy(
      DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,

  bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
                           AMDGPU::SchedulingPhase Phase) override {
    return true;
  }

  MFMAExpSimpleInterleaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
      : IGLPStrategy(DAG, TII) {
    // This strategy schedules groups bottom-up.
    IsBottomUp = true;
  }
};
1846
// Emit 3 * MFMACount alternating TRANS/MFMA singleton groups so the
// scheduler interleaves transcendental and matrix instructions.
// NOTE(review): two parameter lines of the signature appear truncated in
// this extraction; verify upstream.
bool MFMAExpSimpleInterleaveOpt::applyIGLPStrategy(
    DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
  // Count the number of MFMA instructions.
  unsigned MFMACount = 0;
  for (const MachineInstr &I : *DAG)
    if (TII->isMFMAorWMMA(I))
      ++MFMACount;

  const unsigned PipelineSyncID = 0;
  for (unsigned I = 0; I < MFMACount * 3; ++I) {
    SchedGroup *SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
    SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
  }

  return true;
}
1870
1871class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
1872private:
1873 // Whether the DS_READ is a predecessor of first four MFMA in region
1874 class EnablesInitialMFMA final : public InstructionRule {
1875 public:
1876 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1877 SmallVectorImpl<SchedGroup> &SyncPipe) override {
1878 if (!SyncPipe.size())
1879 return false;
1880 int MFMAsFound = 0;
1881 if (!Cache->size()) {
1882 for (auto &Elt : SyncPipe[0].DAG->SUnits) {
1883 if (TII->isMFMAorWMMA(*Elt.getInstr())) {
1884 ++MFMAsFound;
1885 if (MFMAsFound > 4)
1886 break;
1887 Cache->push_back(&Elt);
1888 }
1889 }
1890 }
1891
1892 auto *DAG = SyncPipe[0].DAG;
1893 for (auto &Elt : *Cache) {
1894 if (DAG->IsReachable(Elt, const_cast<SUnit *>(SU)))
1895 return true;
1896 }
1897 return false;
1898 }
1899
1900 EnablesInitialMFMA(const SIInstrInfo *TII, unsigned SGID,
1901 bool NeedsCache = false)
1902 : InstructionRule(TII, SGID, NeedsCache) {}
1903 };
1904
1905 // Whether the MI is a V_PERM and is a predecessor of a common DS_WRITE
1906 class IsPermForDSW final : public InstructionRule {
1907 public:
1908 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1909 SmallVectorImpl<SchedGroup> &SyncPipe) override {
1910 auto *MI = SU->getInstr();
1911 if (MI->getOpcode() != AMDGPU::V_PERM_B32_e64)
1912 return false;
1913
1914 bool FitsInGroup = false;
1915 // Does the VALU have a DS_WRITE successor
1916 if (!Collection.size()) {
1917 for (auto &Succ : SU->Succs) {
1918 SUnit *SuccUnit = Succ.getSUnit();
1919 if (TII->isDS(*SuccUnit->getInstr()) &&
1920 SuccUnit->getInstr()->mayStore()) {
1921 Cache->push_back(SuccUnit);
1922 FitsInGroup = true;
1923 }
1924 }
1925 return FitsInGroup;
1926 }
1927
1928 // Does the VALU have a DS_WRITE successor that is the same as other
1929 // VALU already in the group. The V_PERMs will all share 1 DS_W succ
1930 return llvm::any_of(*Cache, [&SU](SUnit *Elt) {
1931 return llvm::any_of(SU->Succs, [&Elt](const SDep &ThisSucc) {
1932 return ThisSucc.getSUnit() == Elt;
1933 });
1934 });
1935 }
1936
1937 IsPermForDSW(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
1938 : InstructionRule(TII, SGID, NeedsCache) {}
1939 };
1940
1941 // Whether the SU is a successor of any element in previous SchedGroup
1942 class IsSuccOfPrevGroup final : public InstructionRule {
1943 public:
1944 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1945 SmallVectorImpl<SchedGroup> &SyncPipe) override {
1946 SchedGroup *OtherGroup = nullptr;
1947 for (auto &PipeSG : SyncPipe) {
1948 if ((unsigned)PipeSG.getSGID() == SGID - 1) {
1949 OtherGroup = &PipeSG;
1950 }
1951 }
1952
1953 if (!OtherGroup)
1954 return false;
1955 if (!OtherGroup->Collection.size())
1956 return true;
1957
1958 // Does the previous VALU have this DS_Write as a successor
1959 return any_of(OtherGroup->Collection, [&SU](SUnit *Elt) {
1960 return any_of(Elt->Succs,
1961 [&SU](SDep &Succ) { return Succ.getSUnit() == SU; });
1962 });
1963 }
1964 IsSuccOfPrevGroup(const SIInstrInfo *TII, unsigned SGID,
1965 bool NeedsCache = false)
1966 : InstructionRule(TII, SGID, NeedsCache) {}
1967 };
1968
1969 // Whether the combined load width of group is 128 bits
1970 class VMEMSize final : public InstructionRule {
1971 public:
1972 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1973 SmallVectorImpl<SchedGroup> &SyncPipe) override {
1974 auto *MI = SU->getInstr();
1975 if (MI->getOpcode() == TargetOpcode::BUNDLE)
1976 return false;
1977 if (!Collection.size())
1978 return true;
1979
1980 int NumBits = 0;
1981
1982 auto TRI = TII->getRegisterInfo();
1983 auto &MRI = MI->getMF()->getRegInfo();
1984 for (auto &Elt : Collection) {
1985 auto Op = Elt->getInstr()->getOperand(0);
1986 auto Size =
1987 TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(MRI, Op));
1988 NumBits += Size;
1989 }
1990
1991 if (NumBits < 128) {
1992 assert(TII->isVMEM(*MI) && MI->mayLoad());
1993 if (NumBits + TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(
1994 MRI, MI->getOperand(0))) <=
1995 128)
1996 return true;
1997 }
1998
1999 return false;
2000 }
2001
2002 VMEMSize(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
2003 : InstructionRule(TII, SGID, NeedsCache) {}
2004 };
2005
2006 /// Whether the SU shares a V_PERM predecessor with any SU in the SchedGroup
2007 /// that is \p Distance steps away
2008 class SharesPredWithPrevNthGroup final : public InstructionRule {
2009 private:
2010 unsigned Distance = 1;
2011
2012 public:
2013 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
2014 SmallVectorImpl<SchedGroup> &SyncPipe) override {
2015 SchedGroup *OtherGroup = nullptr;
2016 if (!SyncPipe.size())
2017 return false;
2018
2019 if (!Cache->size()) {
2020
2021 for (auto &PipeSG : SyncPipe) {
2022 if ((unsigned)PipeSG.getSGID() == SGID - Distance) {
2023 OtherGroup = &PipeSG;
2024 }
2025 }
2026
2027 if (!OtherGroup)
2028 return false;
2029 if (!OtherGroup->Collection.size())
2030 return true;
2031
2032 for (auto &OtherEle : OtherGroup->Collection) {
2033 for (auto &Pred : OtherEle->Preds) {
2034 if (Pred.getSUnit()->getInstr()->getOpcode() ==
2035 AMDGPU::V_PERM_B32_e64)
2036 Cache->push_back(Pred.getSUnit());
2037 }
2038 }
2039
2040 // If the other group has no PERM preds, then this group won't share any
2041 if (!Cache->size())
2042 return false;
2043 }
2044
2045 auto *DAG = SyncPipe[0].DAG;
2046 // Does the previous DS_WRITE share a V_PERM predecessor with this
2047 // VMEM_READ
2048 return llvm::any_of(*Cache, [&SU, &DAG](SUnit *Elt) {
2049 return DAG->IsReachable(const_cast<SUnit *>(SU), Elt);
2050 });
2051 }
2052 SharesPredWithPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
2053 unsigned SGID, bool NeedsCache = false)
2054 : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}
2055 };
2056
2057public:
2058 bool applyIGLPStrategy(
2060 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
2062
2063 bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
2064 AMDGPU::SchedulingPhase Phase) override {
2065 return true;
2066 }
2067
2068 MFMASmallGemmSingleWaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
2069 : IGLPStrategy(DAG, TII) {
2070 IsBottomUp = false;
2071 }
2072};
2073
// Analysis results computed during the initial (pre-RA) scheduling phase and
// reused on re-entry in later phases (see the IsInitial handling in
// MFMASmallGemmSingleWaveOpt::applyIGLPStrategy). File-scope statics:
// asserted to be zero at the start of pre-RA scheduling.
static unsigned DSWCount = 0;
static unsigned DSWWithPermCount = 0;
static unsigned DSWWithSharedVMEMCount = 0;
2077
2078bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
2079 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
2080 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
2082 unsigned MFMACount = 0;
2083 unsigned DSRCount = 0;
2084
2085 bool IsInitial = Phase == AMDGPU::SchedulingPhase::Initial;
2086
2087 assert((!IsInitial || (DSWCount == 0 && DSWWithPermCount == 0 &&
2088 DSWWithSharedVMEMCount == 0)) &&
2089 "DSWCounters should be zero in pre-RA scheduling!");
2090 SmallVector<SUnit *, 6> DSWithPerms;
2091 for (auto &SU : DAG->SUnits) {
2092 auto *I = SU.getInstr();
2093 if (TII->isMFMAorWMMA(*I))
2094 ++MFMACount;
2095 else if (TII->isDS(*I)) {
2096 if (I->mayLoad())
2097 ++DSRCount;
2098 else if (I->mayStore() && IsInitial) {
2099 ++DSWCount;
2100 for (auto Pred : SU.Preds) {
2101 if (Pred.getSUnit()->getInstr()->getOpcode() ==
2102 AMDGPU::V_PERM_B32_e64) {
2103 DSWithPerms.push_back(&SU);
2104 break;
2105 }
2106 }
2107 }
2108 }
2109 }
2110
2111 if (IsInitial) {
2112 DSWWithPermCount = DSWithPerms.size();
2113 auto *I = DSWithPerms.begin();
2114 auto *E = DSWithPerms.end();
2115
2116 // Get the count of DS_WRITES with V_PERM predecessors which
2117 // have loop carried dependencies (WAR) on the same VMEM_READs.
2118 // We consider partial overlap as a miss -- in other words,
2119 // for a given DS_W, we only consider another DS_W as matching
2120 // if there is a corresponding (in terms of the VMEM_R it uses) V_PERM pred
2121 // for every V_PERM pred of this DS_W.
2122 DenseMap<MachineInstr *, SUnit *> VMEMLookup;
2124 for (; I != E; I++) {
2125 SUnit *Cand = nullptr;
2126 bool MissedAny = false;
2127 for (auto &Pred : (*I)->Preds) {
2128 if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64)
2129 continue;
2130
2131 if (Cand && llvm::is_contained(Counted, Cand))
2132 break;
2133
2134 for (auto &Succ : Pred.getSUnit()->Succs) {
2135 auto *MI = Succ.getSUnit()->getInstr();
2136 if (!TII->isVMEM(*MI) || !MI->mayLoad())
2137 continue;
2138
2139 if (MissedAny || !VMEMLookup.size()) {
2140 MissedAny = true;
2141 VMEMLookup[MI] = *I;
2142 continue;
2143 }
2144
2145 auto [It, Inserted] = VMEMLookup.try_emplace(MI, *I);
2146 if (Inserted) {
2147 MissedAny = true;
2148 continue;
2149 }
2150
2151 Cand = It->second;
2152 if (llvm::is_contained(Counted, Cand)) {
2153 MissedAny = true;
2154 break;
2155 }
2156 }
2157 }
2158 if (!MissedAny && Cand) {
2159 DSWWithSharedVMEMCount += 2;
2160 Counted.push_back(Cand);
2161 Counted.push_back(*I);
2162 }
2163 }
2164 }
2165
2166 assert(DSWWithSharedVMEMCount <= DSWWithPermCount);
2167 SchedGroup *SG;
2168 unsigned PipelineSyncID = 0;
2169 // For kernels with V_PERM, there are enough VALU to mix in between MFMAs
2170 if (DSWWithPermCount) {
2171 for (unsigned I = 0; I < MFMACount; I++) {
2172 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2173 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2174 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2175
2176 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2177 SchedGroupMask::VALU, 2, PipelineSyncID, DAG, TII);
2178 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2179 }
2180 }
2181
2182 PipelineSyncID = 1;
2183 // Phase 1: Break up DS_READ and MFMA clusters.
2184 // First DS_READ to make ready initial MFMA, then interleave MFMA with DS_READ
2185 // prefetch
2186
2187 // Make ready initial MFMA
2188 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2189 SchedGroupMask::DS_READ, 4, PipelineSyncID, DAG, TII);
2190 SG->addRule(std::make_shared<EnablesInitialMFMA>(TII, SG->getSGID(), true));
2191 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2192
2193 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2194 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2195 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2196
2197 // Interleave MFMA with DS_READ prefetch
2198 for (unsigned I = 4; I < DSRCount; ++I) {
2199 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2200 SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG, TII);
2201 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2202
2203 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2204 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2205 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2206 }
2207
2208 // Phase 2a: Loop carried dependency with V_PERM
2209 // Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they
2210 // depend on. Interleave MFMA to keep XDL unit busy throughout.
2211 for (unsigned I = DSWWithSharedVMEMCount; I < DSWWithPermCount; ++I) {
2212 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2213 SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
2214 SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
2215 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2216
2217 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2218 SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
2219 SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID()));
2220 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2221
2222 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2223 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
2224 SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
2225 1, TII, SG->getSGID(), true));
2226 SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
2227 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2228
2229 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2230 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2231 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2232
2233 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2234 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
2235 SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
2236 3, TII, SG->getSGID(), true));
2237 SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
2238 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2239
2240 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2241 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2242 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2243 }
2244
2245 // Phase 2b: Loop carried dependency without V_PERM
2246 // Schedule DS_WRITE as closely as possible to the VMEM_READ they depend on.
2247 // Interleave MFMA to keep XDL unit busy throughout.
2248 for (unsigned I = DSWWithPermCount; I < DSWCount; I++) {
2249 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2250 SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
2251 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2252
2253 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2254 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
2255 SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
2256 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2257
2258 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2259 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2260 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2261 }
2262
2263 // Phase 2c: Loop carried dependency with V_PERM, VMEM_READs are
2264 // ultimately used by two DS_WRITE
2265 // Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they
2266 // depend on. Interleave MFMA to keep XDL unit busy throughout.
2267
2268 for (unsigned I = 0; I < DSWWithSharedVMEMCount; ++I) {
2269 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2270 SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
2271 SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
2272 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2273
2274 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2275 SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
2276 SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID()));
2277 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2278
2279 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2280 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2281 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2282
2283 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2284 SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
2285 SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
2286 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2287
2288 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2289 SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
2290 SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID()));
2291 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2292
2293 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2294 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2295 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2296
2297 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2298 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
2299 SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
2300 2, TII, SG->getSGID(), true));
2301 SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
2302 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2303
2304 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2305 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2306 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2307
2308 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2309 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
2310 SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
2311 4, TII, SG->getSGID(), true));
2312 SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
2313 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2314
2315 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2316 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2317 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2318 }
2319
2320 return true;
2321}
2322
2323static std::unique_ptr<IGLPStrategy>
2324createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG,
2325 const SIInstrInfo *TII) {
2326 switch (ID) {
2327 case MFMASmallGemmOptID:
2328 return std::make_unique<MFMASmallGemmOpt>(DAG, TII);
2329 case MFMASmallGemmSingleWaveOptID:
2330 return std::make_unique<MFMASmallGemmSingleWaveOpt>(DAG, TII);
2331 case MFMAExpInterleaveID:
2332 return std::make_unique<MFMAExpInterleaveOpt>(DAG, TII);
2333 case MFMAExpSimpleInterleaveID:
2334 return std::make_unique<MFMAExpSimpleInterleaveOpt>(DAG, TII);
2335 }
2336
2337 llvm_unreachable("Unknown IGLPStrategyID");
2338}
2339
2340class IGroupLPDAGMutation : public ScheduleDAGMutation {
2341private:
2342 const SIInstrInfo *TII;
2343
2344 ScheduleDAGMI *DAG;
2345
2346 // Organize lists of SchedGroups by their SyncID. SchedGroups /
2347 // SCHED_GROUP_BARRIERs with different SyncIDs will have no edges added
2348 // between then.
2349 DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups;
2350
2351 // Used to track instructions that can be mapped to multiple sched groups
2352 DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs;
2353
2354 // Add DAG edges that enforce SCHED_BARRIER ordering.
2355 void addSchedBarrierEdges(SUnit &SU);
2356
2357 // Use a SCHED_BARRIER's mask to identify instruction SchedGroups that should
2358 // not be reordered accross the SCHED_BARRIER. This is used for the base
2359 // SCHED_BARRIER, and not SCHED_GROUP_BARRIER. The difference is that
2360 // SCHED_BARRIER will always block all instructions that can be classified
2361 // into a particular SchedClass, whereas SCHED_GROUP_BARRIER has a fixed size
2362 // and may only synchronize with some SchedGroups. Returns the inverse of
2363 // Mask. SCHED_BARRIER's mask describes which instruction types should be
2364 // allowed to be scheduled across it. Invert the mask to get the
2365 // SchedGroupMask of instructions that should be barred.
2366 SchedGroupMask invertSchedBarrierMask(SchedGroupMask Mask) const;
2367
2368 // Create SchedGroups for a SCHED_GROUP_BARRIER.
2369 void initSchedGroupBarrierPipelineStage(
2370 std::vector<SUnit>::reverse_iterator RIter);
2371
2372 bool initIGLPOpt(SUnit &SU);
2373
2374public:
2375 void apply(ScheduleDAGInstrs *DAGInstrs) override;
2376
2377 // The order in which the PipelineSolver should process the candidate
2378 // SchedGroup for a PipelineInstr. BOTTOM_UP will try to add SUs to the last
2379 // created SchedGroup first, and will consider that as the ultimate
2380 // predecessor group when linking. TOP_DOWN instead links and processes the
2381 // first created SchedGroup first.
2382 bool IsBottomUp = true;
2383
2384 // The scheduling phase this application of IGLP corresponds with.
2385 AMDGPU::SchedulingPhase Phase = AMDGPU::SchedulingPhase::Initial;
2386
2387 IGroupLPDAGMutation() = default;
2388 IGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) : Phase(Phase) {}
2389};
2390
2391unsigned SchedGroup::NumSchedGroups = 0;
2392
2393bool SchedGroup::tryAddEdge(SUnit *A, SUnit *B) {
2394 return A != B && DAG->addEdge(B, SDep(A, SDep::Artificial));
2395}
2396
2397bool SchedGroup::canAddMI(const MachineInstr &MI) const {
2398 bool Result = false;
2399 if (MI.isMetaInstruction())
2400 Result = false;
2401
2402 else if (MI.isInlineAsm()) {
2403 const SIRegisterInfo &TRI = TII->getRegisterInfo();
2404 auto &MRI = MI.getParent()->getParent()->getRegInfo();
2405 bool SGPR_used = false, SGPR_big_def = false, VGPR_used = false,
2406 VMFMA_used = false, VReg32_used = false, MayLoad = MI.mayLoad(),
2407 MayStore = MI.mayStore();
2408 for (const MachineOperand &Operand : MI.operands())
2409 if (Operand.isReg()) {
2410 const TargetRegisterClass &RegClass =
2411 *TRI.getRegClassForOperandReg(MRI, Operand);
2412 if (TRI.hasVGPRs(&RegClass)) {
2413 VGPR_used = true;
2414 if (Operand.isUse() && TRI.getRegSizeInBits(RegClass) == 32)
2415 VReg32_used = true;
2416 }
2417 // > 128 bit registers are usually only used by MFMA instructions, so
2418 // we're using that as a heuristic to guess the schedule group mask of
2419 // the inline asm.
2420 if (TRI.hasAGPRs(&RegClass) || TRI.getRegSizeInBits(RegClass) > 128)
2421 VMFMA_used = true;
2422 if (TRI.hasSGPRs(&RegClass))
2423 SGPR_used = true;
2424 if (TRI.getRegSizeInBits(RegClass) > 64 && Operand.isDef())
2425 SGPR_big_def = true;
2426 }
2427
2428 typedef std::underlying_type_t<SchedGroupMask> SGMask_t;
2429 SGMask_t InlineAsmMask = 0;
2430 if (VGPR_used && !VMFMA_used && !MayLoad && !MayStore)
2431 InlineAsmMask |= (SGMask_t)SchedGroupMask::VALU;
2432 if (SGPR_used && !VGPR_used && !MayLoad && !MayStore)
2433 InlineAsmMask |= (SGMask_t)SchedGroupMask::SALU;
2434 if (VMFMA_used)
2435 InlineAsmMask |= (SGMask_t)SchedGroupMask::MFMA;
2436 if (VGPR_used && MayLoad)
2437 InlineAsmMask |= (SGMask_t)(VReg32_used ? SchedGroupMask::DS_READ
2438 : SchedGroupMask::VMEM_READ);
2439 if (VGPR_used && MayStore)
2440 InlineAsmMask |= (SGMask_t)(VReg32_used ? SchedGroupMask::DS_WRITE
2441 : SchedGroupMask::VMEM_WRITE);
2442 if (SGPR_big_def)
2443 InlineAsmMask |= (SGMask_t)SchedGroupMask::DS_READ;
2444 if (InlineAsmMask & (SGMask_t)SchedGroupMask::VALU ||
2445 InlineAsmMask & (SGMask_t)SchedGroupMask::SALU)
2446 InlineAsmMask |= (SGMask_t)SchedGroupMask::ALU;
2447 if (InlineAsmMask & (SGMask_t)SchedGroupMask::DS_READ ||
2448 InlineAsmMask & (SGMask_t)SchedGroupMask::DS_WRITE)
2449 InlineAsmMask |= (SGMask_t)SchedGroupMask::DS;
2450 if (InlineAsmMask & (SGMask_t)SchedGroupMask::VMEM_READ ||
2451 InlineAsmMask & (SGMask_t)SchedGroupMask::VMEM_WRITE)
2452 InlineAsmMask |= (SGMask_t)SchedGroupMask::VMEM;
2453
2454 Result = ((SGMask_t)SGMask & InlineAsmMask) != 0;
2455 }
2456
2457 else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&
2458 (TII->isVALU(MI) || TII->isMFMAorWMMA(MI) || TII->isSALU(MI) ||
2459 TII->isTRANS(MI)))
2460 Result = !MI.mayLoadOrStore();
2461
2462 else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) &&
2463 TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI)) {
2464 // Some memory instructions may be marked as VALU (e.g. BUFFER_LOAD_*_LDS).
2465 // For our purposes, these shall not be classified as VALU as this results
2466 // in unexpected behavior.
2467 Result = !MI.mayLoadOrStore();
2468 }
2469
2470 else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) &&
2471 TII->isSALU(MI))
2472 Result = !MI.mayLoadOrStore();
2473
2474 else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) &&
2475 TII->isMFMAorWMMA(MI))
2476 Result = true;
2477
2478 else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) &&
2479 TII->isVMEM(MI))
2480 Result = true;
2481
2482 else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) &&
2483 MI.mayLoad() && TII->isVMEM(MI))
2484 Result = true;
2485
2486 else if (((SGMask & SchedGroupMask::VMEM_WRITE) != SchedGroupMask::NONE) &&
2487 MI.mayStore() && TII->isVMEM(MI))
2488 Result = true;
2489
2490 else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) &&
2491 TII->isDS(MI))
2492 Result = true;
2493
2494 else if (((SGMask & SchedGroupMask::DS_READ) != SchedGroupMask::NONE) &&
2495 MI.mayLoad() && TII->isDS(MI))
2496 Result = true;
2497
2498 else if (((SGMask & SchedGroupMask::DS_WRITE) != SchedGroupMask::NONE) &&
2499 MI.mayStore() && TII->isDS(MI))
2500 Result = true;
2501
2502 else if (((SGMask & SchedGroupMask::TRANS) != SchedGroupMask::NONE) &&
2503 TII->isTRANS(MI))
2504 Result = true;
2505
2506 LLVM_DEBUG(
2507 dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true)
2508 << (Result ? " could classify " : " unable to classify ") << MI);
2509
2510 return Result;
2511}
2512
2513int SchedGroup::link(SUnit &SU, bool MakePred,
2514 std::list<std::pair<SUnit *, SUnit *>> &AddedEdges) {
2515 int MissedEdges = 0;
2516 for (auto *A : Collection) {
2517 SUnit *B = &SU;
2518 if (A == B || A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
2519 continue;
2520 if (MakePred)
2521 std::swap(A, B);
2522
2523 if (DAG->IsReachable(B, A))
2524 continue;
2525
2526 // tryAddEdge returns false if there is a dependency that makes adding
2527 // the A->B edge impossible, otherwise it returns true;
2528 bool Added = tryAddEdge(A, B);
2529 if (Added)
2530 AddedEdges.emplace_back(A, B);
2531 else
2532 ++MissedEdges;
2533 }
2534
2535 return MissedEdges;
2536}
2537
2538void SchedGroup::link(SUnit &SU, bool MakePred) {
2539 for (auto *A : Collection) {
2540 SUnit *B = &SU;
2541 if (A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
2542 continue;
2543 if (MakePred)
2544 std::swap(A, B);
2545
2546 tryAddEdge(A, B);
2547 }
2548}
2549
2550void SchedGroup::link(SUnit &SU,
2551 function_ref<bool(const SUnit *A, const SUnit *B)> P) {
2552 for (auto *A : Collection) {
2553 SUnit *B = &SU;
2554 if (P(A, B))
2555 std::swap(A, B);
2556
2557 tryAddEdge(A, B);
2558 }
2559}
2560
2561void SchedGroup::link(SchedGroup &OtherGroup) {
2562 for (auto *B : OtherGroup.Collection)
2563 link(*B);
2564}
2565
2566bool SchedGroup::canAddSU(SUnit &SU) const {
2567 MachineInstr &MI = *SU.getInstr();
2568 if (MI.getOpcode() != TargetOpcode::BUNDLE)
2569 return canAddMI(MI);
2570
2571 // Special case for bundled MIs.
2572 const MachineBasicBlock *MBB = MI.getParent();
2573 MachineBasicBlock::instr_iterator B = MI.getIterator(), E = ++B;
2574 while (E != MBB->end() && E->isBundledWithPred())
2575 ++E;
2576
2577 // Return true if all of the bundled MIs can be added to this group.
2578 return std::all_of(B, E, [this](MachineInstr &MI) { return canAddMI(MI); });
2579}
2580
2581template <class T>
2582void SchedGroup::findCandidateSUnits(T Begin, T End,
2583 SUnitsToCandidateSGsMap &SyncedInstrs) {
2584 for (SUnit &SU : make_range(Begin, End)) {
2585 if (canAddSU(SU))
2586 SyncedInstrs[&SU].push_back(SGID);
2587 }
2588}
2589
2590void SchedGroup::findCandidateSUnits(SUnitsToCandidateSGsMap &SyncedInstrs) {
2591 findCandidateSUnits(DAG->SUnits.rbegin(), DAG->SUnits.rend(), SyncedInstrs);
2592}
2593
2594void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
2595 const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
2596 if (!TSchedModel || DAGInstrs->SUnits.empty())
2597 return;
2598
2599 LLVM_DEBUG(dbgs() << "Applying IGroupLPDAGMutation...\n");
2600 const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
2601 TII = ST.getInstrInfo();
2602 DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
2603 SyncedSchedGroups.clear();
2604 SyncedInstrs.clear();
2605 bool FoundSB = false;
2606 bool FoundIGLP = false;
2607 bool ShouldApplyIGLP = false;
2608 for (auto R = DAG->SUnits.rbegin(), E = DAG->SUnits.rend(); R != E; ++R) {
2609 unsigned Opc = R->getInstr()->getOpcode();
2610 // SCHED_[GROUP_]BARRIER and IGLP are mutually exclusive.
2611 if (Opc == AMDGPU::SCHED_BARRIER) {
2612 addSchedBarrierEdges(*R);
2613 FoundSB = true;
2614 } else if (Opc == AMDGPU::SCHED_GROUP_BARRIER) {
2615 initSchedGroupBarrierPipelineStage(R);
2616 FoundSB = true;
2617 } else if (Opc == AMDGPU::IGLP_OPT) {
2618 if (!FoundSB && !FoundIGLP) {
2619 FoundIGLP = true;
2620 ShouldApplyIGLP = initIGLPOpt(*R);
2621 }
2622 }
2623 }
2624
2625 if (FoundSB || (FoundIGLP && ShouldApplyIGLP)) {
2626 PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG, IsBottomUp);
2627 // PipelineSolver performs the mutation by adding the edges it
2628 // determined as the best
2629 PS.solve();
2630 return;
2631 }
2632}
2633
2634void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
2635 MachineInstr &MI = *SchedBarrier.getInstr();
2636 assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER);
2637 LLVM_DEBUG(dbgs() << "Building SchedGroup for SchedBarrier with Mask: "
2638 << MI.getOperand(0).getImm() << "\n");
2639 auto InvertedMask =
2640 invertSchedBarrierMask((SchedGroupMask)MI.getOperand(0).getImm());
2641 SchedGroup SG(InvertedMask, std::nullopt, DAG, TII);
2642
2643 for (SUnit &SU : DAG->SUnits)
2644 if (SG.canAddSU(SU))
2645 SG.add(SU);
2646
2647 // Preserve original instruction ordering relative to the SCHED_BARRIER.
2648 SG.link(
2649 SchedBarrier,
2650 (function_ref<bool(const SUnit *A, const SUnit *B)>)[](
2651 const SUnit *A, const SUnit *B) { return A->NodeNum > B->NodeNum; });
2652}
2653
2654SchedGroupMask
2655IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const {
2656 // Invert mask and erase bits for types of instructions that are implied to be
2657 // allowed past the SCHED_BARRIER.
2658 SchedGroupMask InvertedMask = ~Mask;
2659
2660 // ALU implies VALU, SALU, MFMA, TRANS.
2661 if ((InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE)
2662 InvertedMask &= ~SchedGroupMask::VALU & ~SchedGroupMask::SALU &
2663 ~SchedGroupMask::MFMA & ~SchedGroupMask::TRANS;
2664 // VALU, SALU, MFMA, TRANS implies ALU.
2665 else if ((InvertedMask & SchedGroupMask::VALU) == SchedGroupMask::NONE ||
2666 (InvertedMask & SchedGroupMask::SALU) == SchedGroupMask::NONE ||
2667 (InvertedMask & SchedGroupMask::MFMA) == SchedGroupMask::NONE ||
2668 (InvertedMask & SchedGroupMask::TRANS) == SchedGroupMask::NONE)
2669 InvertedMask &= ~SchedGroupMask::ALU;
2670
2671 // VMEM implies VMEM_READ, VMEM_WRITE.
2672 if ((InvertedMask & SchedGroupMask::VMEM) == SchedGroupMask::NONE)
2673 InvertedMask &= ~SchedGroupMask::VMEM_READ & ~SchedGroupMask::VMEM_WRITE;
2674 // VMEM_READ, VMEM_WRITE implies VMEM.
2675 else if ((InvertedMask & SchedGroupMask::VMEM_READ) == SchedGroupMask::NONE ||
2676 (InvertedMask & SchedGroupMask::VMEM_WRITE) == SchedGroupMask::NONE)
2677 InvertedMask &= ~SchedGroupMask::VMEM;
2678
2679 // DS implies DS_READ, DS_WRITE.
2680 if ((InvertedMask & SchedGroupMask::DS) == SchedGroupMask::NONE)
2681 InvertedMask &= ~SchedGroupMask::DS_READ & ~SchedGroupMask::DS_WRITE;
2682 // DS_READ, DS_WRITE implies DS.
2683 else if ((InvertedMask & SchedGroupMask::DS_READ) == SchedGroupMask::NONE ||
2684 (InvertedMask & SchedGroupMask::DS_WRITE) == SchedGroupMask::NONE)
2685 InvertedMask &= ~SchedGroupMask::DS;
2686
2687 LLVM_DEBUG(dbgs() << "After Inverting, SchedGroup Mask: " << (int)InvertedMask
2688 << "\n");
2689
2690 return InvertedMask;
2691}
2692
2693void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage(
2694 std::vector<SUnit>::reverse_iterator RIter) {
2695 MachineInstr &SGB = *RIter->getInstr();
2696 assert(SGB.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER);
2697 int32_t SGMask = SGB.getOperand(0).getImm();
2698 int32_t Size = SGB.getOperand(1).getImm();
2699 int32_t SyncID = SGB.getOperand(2).getImm();
2700
2701 Size++; // Make room for the SCHED_GROUP_BARRIER instruction
2702 auto &SG = SyncedSchedGroups[SyncID].emplace_back((SchedGroupMask)SGMask,
2703 Size, SyncID, DAG, TII);
2704 SG.add(*RIter);
2705 SG.findCandidateSUnits(RIter, SG.DAG->SUnits.rend(),
2706 SyncedInstrs[SG.getSyncID()]);
2707}
2708
2709bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
2710 IGLPStrategyID StrategyID =
2711 (IGLPStrategyID)SU.getInstr()->getOperand(0).getImm();
2712 auto S = createIGLPStrategy(StrategyID, DAG, TII);
2713 if (!S->shouldApplyStrategy(DAG, Phase))
2714 return false;
2715
2716 IsBottomUp = S->IsBottomUp;
2717 return S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups, Phase);
2718}
2719
2720} // namespace
2721
2722/// \p Phase specifes whether or not this is a reentry into the
2723/// IGroupLPDAGMutation. Since there may be multiple scheduling passes on the
2724/// same scheduling region (e.g. pre and post-RA scheduling / multiple
2725/// scheduling "phases"), we can reenter this mutation framework more than once
2726/// for a given region.
2727std::unique_ptr<ScheduleDAGMutation>
2729 return std::make_unique<IGroupLPDAGMutation>(Phase);
2730}
aarch64 falkor hwpf fix Falkor HW Prefetch Fix Late Phase
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
AMDGPU Rewrite AGPR Copy MFMA
MachineBasicBlock & MBB
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
This file defines the DenseMap class.
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
static std::pair< Value *, APInt > getMask(Value *WideMask, unsigned Factor, ElementCount LeafValueEC)
#define I(x, y, z)
Definition MD5.cpp:57
Register const TargetRegisterInfo * TRI
#define T
#define P(N)
Interface definition for SIInstrInfo.
#define LLVM_DEBUG(...)
Definition Debug.h:114
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
unsigned size() const
Definition DenseMap.h:110
const HexagonRegisterInfo & getRegisterInfo() const
Instructions::iterator instr_iterator
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const MachineOperand & getOperand(unsigned i) const
int64_t getImm() const
Scheduling dependency.
Definition ScheduleDAG.h:51
SUnit * getSUnit() const
@ Data
Regular data dependence (aka true-dependence).
Definition ScheduleDAG.h:55
@ Artificial
Arbitrary strong DAG edge (no real dependence).
Definition ScheduleDAG.h:74
Scheduling unit. This is a node in the scheduling DAG.
unsigned NodeNum
Entry # of node in the node vector.
LLVM_ABI void removePred(const SDep &D)
Removes the specified edge as a pred of the current node if it exists.
SmallVector< SDep, 4 > Succs
All sunit successors.
SmallVector< SDep, 4 > Preds
All sunit predecessors.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
A ScheduleDAG for scheduling lists of MachineInstr.
const TargetSchedModel * getSchedModel() const
Gets the machine model for instruction scheduling.
bool addEdge(SUnit *SuccSU, const SDep &PredDep)
Add a DAG edge to the given SU with the given predecessor dependence data.
bool IsReachable(SUnit *SU, SUnit *TargetSU)
IsReachable - Checks if SU is reachable from TargetSU.
void dump() const override
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
std::vector< SUnit > SUnits
The scheduling units.
MachineFunction & MF
Machine function.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An efficient, type-erasing, non-owning reference to a callable.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
void apply(Opt *O, const Mod &M, const Mods &... Ms)
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1669
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
std::unique_ptr< ScheduleDAGMutation > createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase)
Phase specifes whether or not this is a reentry into the IGroupLPDAGMutation.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
FormattedNumber format_hex(uint64_t N, unsigned Width, bool Upper=false)
format_hex - Output N as a fixed width hexadecimal.
Definition Format.h:191
DWARFExpression::Operation Op
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:2019
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1772
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
Function object to check whether the second component of a container supported by std::get (like std:...
Definition STLExtras.h:1448