LLVM 23.0.0git
GCNSchedStrategy.cpp
Go to the documentation of this file.
1//===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This contains a MachineSchedStrategy implementation for maximizing wave
11/// occupancy on GCN hardware.
12///
13/// This pass will apply multiple scheduling stages to the same function.
14/// Regions are first recorded in GCNScheduleDAGMILive::schedule. The actual
15/// entry point for the scheduling of those regions is
16/// GCNScheduleDAGMILive::runSchedStages.
17
18/// Generally, the reason for having multiple scheduling stages is to account
19/// for the kernel-wide effect of register usage on occupancy. Usually, only a
20/// few scheduling regions will have register pressure high enough to limit
21/// occupancy for the kernel, so constraints can be relaxed to improve ILP in
22/// other regions.
23///
24//===----------------------------------------------------------------------===//
25
26#include "GCNSchedStrategy.h"
27#include "AMDGPUIGroupLP.h"
28#include "GCNHazardRecognizer.h"
29#include "GCNRegPressure.h"
32#include "llvm/ADT/BitVector.h"
33#include "llvm/ADT/STLExtras.h"
41#include "llvm/MC/LaneBitmask.h"
42#include "llvm/MC/MCSchedule.h"
45
46#define DEBUG_TYPE "machine-scheduler"
47
48using namespace llvm;
49
51 "amdgpu-disable-unclustered-high-rp-reschedule", cl::Hidden,
52 cl::desc("Disable unclustered high register pressure "
53 "reduction scheduling stage."),
54 cl::init(false));
55
57 "amdgpu-disable-clustered-low-occupancy-reschedule", cl::Hidden,
58 cl::desc("Disable clustered low occupancy "
59 "rescheduling for ILP scheduling stage."),
60 cl::init(false));
61
63 "amdgpu-schedule-metric-bias", cl::Hidden,
65 "Sets the bias which adds weight to occupancy vs latency. Set it to "
66 "100 to chase the occupancy only."),
67 cl::init(10));
68
69static cl::opt<bool>
70 RelaxedOcc("amdgpu-schedule-relaxed-occupancy", cl::Hidden,
71 cl::desc("Relax occupancy targets for kernels which are memory "
72 "bound (amdgpu-membound-threshold), or "
73 "Wave Limited (amdgpu-limit-wave-threshold)."),
74 cl::init(false));
75
77 "amdgpu-use-amdgpu-trackers", cl::Hidden,
78 cl::desc("Use the AMDGPU specific RPTrackers during scheduling"),
79 cl::init(false));
80
82 "amdgpu-scheduler-pending-queue-limit", cl::Hidden,
84 "Max (Available+Pending) size to inspect pending queue (0 disables)"),
85 cl::init(256));
86
87#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
88#define DUMP_MAX_REG_PRESSURE
90 "amdgpu-print-max-reg-pressure-regusage-before-scheduler", cl::Hidden,
91 cl::desc("Print a list of live registers along with their def/uses at the "
92 "point of maximum register pressure before scheduling."),
93 cl::init(false));
94
96 "amdgpu-print-max-reg-pressure-regusage-after-scheduler", cl::Hidden,
97 cl::desc("Print a list of live registers along with their def/uses at the "
98 "point of maximum register pressure after scheduling."),
99 cl::init(false));
100#endif
101
103 "amdgpu-disable-rewrite-mfma-form-sched-stage", cl::Hidden,
104 cl::desc("Disable rewrite mfma rewrite scheduling stage"), cl::init(true));
105
106const unsigned ScheduleMetrics::ScaleFactor = 100;
107
114
117
118 MF = &DAG->MF;
119
120 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
121
123 Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass);
125 Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass);
126
128 // Set the initial TargetOccupnacy to the maximum occupancy that we can
129 // achieve for this function. This effectively sets a lower bound on the
130 // 'Critical' register limits in the scheduler.
131 // Allow for lower occupancy targets if kernel is wave limited or memory
132 // bound, and using the relaxed occupancy feature.
136 std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit);
137
138 if (!KnownExcessRP) {
139 VGPRCriticalLimit = std::min(
140 ST.getMaxNumVGPRs(TargetOccupancy, MFI.getDynamicVGPRBlockSize()),
142 } else {
143 // This is similar to ST.getMaxNumVGPRs(TargetOccupancy) result except
144 // returns a reasonably small number for targets with lots of VGPRs, such
145 // as GFX10 and GFX11.
146 LLVM_DEBUG(dbgs() << "Region is known to spill, use alternative "
147 "VGPRCriticalLimit calculation method.\n");
148 unsigned DynamicVGPRBlockSize = MFI.getDynamicVGPRBlockSize();
149 unsigned Granule =
150 AMDGPU::IsaInfo::getVGPRAllocGranule(ST, DynamicVGPRBlockSize);
151 unsigned Addressable =
152 AMDGPU::IsaInfo::getAddressableNumVGPRs(ST, DynamicVGPRBlockSize);
153 unsigned VGPRBudget = alignDown(Addressable / TargetOccupancy, Granule);
154 VGPRBudget = std::max(VGPRBudget, Granule);
155 VGPRCriticalLimit = std::min(VGPRBudget, VGPRExcessLimit);
156 }
157
158 // Subtract error margin and bias from register limits and avoid overflow.
163 LLVM_DEBUG(dbgs() << "VGPRCriticalLimit = " << VGPRCriticalLimit
164 << ", VGPRExcessLimit = " << VGPRExcessLimit
165 << ", SGPRCriticalLimit = " << SGPRCriticalLimit
166 << ", SGPRExcessLimit = " << SGPRExcessLimit << "\n\n");
167}
168
169/// Checks whether \p SU can use the cached DAG pressure diffs to compute the
170/// current register pressure.
171///
172/// This works for the common case, but it has a few exceptions that have been
173/// observed through trial and error:
174/// - Explicit physical register operands
175/// - Subregister definitions
176///
177/// In both of those cases, PressureDiff doesn't represent the actual pressure,
178/// and querying LiveIntervals through the RegPressureTracker is needed to get
179/// an accurate value.
180///
181/// We should eventually only use PressureDiff for maximum performance, but this
182/// already allows 80% of SUs to take the fast path without changing scheduling
183/// at all. Further changes would either change scheduling, or require a lot
184/// more logic to recover an accurate pressure estimate from the PressureDiffs.
185static bool canUsePressureDiffs(const SUnit &SU) {
186 if (!SU.isInstr())
187 return false;
188
189 // Cannot use pressure diffs for subregister defs or with physregs, it's
190 // imprecise in both cases.
191 for (const auto &Op : SU.getInstr()->operands()) {
192 if (!Op.isReg() || Op.isImplicit())
193 continue;
194 if (Op.getReg().isPhysical() ||
195 (Op.isDef() && Op.getSubReg() != AMDGPU::NoSubRegister))
196 return false;
197 }
198 return true;
199}
200
202 bool AtTop, const RegPressureTracker &RPTracker, SUnit *SU,
203 std::vector<unsigned> &Pressure, std::vector<unsigned> &MaxPressure,
205 ScheduleDAGMI *DAG, const SIRegisterInfo *SRI) {
206 // getDownwardPressure() and getUpwardPressure() make temporary changes to
207 // the tracker, so we need to pass those function a non-const copy.
208 RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker);
209 if (!useGCNTrackers()) {
210 AtTop
211 ? TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure)
212 : TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
213
214 return;
215 }
216
217 // GCNTrackers
218 Pressure.resize(4, 0);
219 MachineInstr *MI = SU->getInstr();
220 GCNRegPressure NewPressure;
221 if (AtTop) {
222 GCNDownwardRPTracker TempDownwardTracker(DownwardTracker);
223 NewPressure = TempDownwardTracker.bumpDownwardPressure(MI, SRI);
224 } else {
225 GCNUpwardRPTracker TempUpwardTracker(UpwardTracker);
226 TempUpwardTracker.recede(*MI);
227 NewPressure = TempUpwardTracker.getPressure();
228 }
229 Pressure[AMDGPU::RegisterPressureSets::SReg_32] = NewPressure.getSGPRNum();
230 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
231 NewPressure.getArchVGPRNum();
232 Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum();
233}
234
236 SUnit *SU) const {
237 // Only implemented for top-down scheduling currently.
238 if (!Zone.isTop() || !SU)
239 return 0;
240
241 MachineInstr *MI = SU->getInstr();
242 unsigned CurrCycle = Zone.getCurrCycle();
243 unsigned Stall = 0;
244
245 // Query SchedModel for resource stalls (unbuffered resources).
246 if (SchedModel->hasInstrSchedModel() && SU->hasReservedResource) {
247 const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
248 for (const MCWriteProcResEntry &PE :
249 make_range(SchedModel->getWriteProcResBegin(SC),
250 SchedModel->getWriteProcResEnd(SC))) {
251 unsigned NextAvail =
252 Zone.getNextResourceCycle(SC, PE.ProcResourceIdx, PE.ReleaseAtCycle,
253 PE.AcquireAtCycle)
254 .first;
255 if (NextAvail > CurrCycle)
256 Stall = std::max(Stall, NextAvail - CurrCycle);
257 }
258 }
259
260 // Query HazardRecognizer for sequence-dependent hazard penalties.
261 // AMDGPU currently installs GCNHazardRecognizer for MI scheduling only in
262 // the post-RA configuration without vreg liveness.
263 if (!DAG->hasVRegLiveness() && Zone.HazardRec &&
264 Zone.HazardRec->isEnabled()) {
265 auto *HR = static_cast<GCNHazardRecognizer *>(Zone.HazardRec);
266 Stall = std::max(Stall, HR->getHazardWaitStates(MI));
267 }
268
269 return Stall;
270}
271
273 bool AtTop,
274 const RegPressureTracker &RPTracker,
275 const SIRegisterInfo *SRI,
276 unsigned SGPRPressure,
277 unsigned VGPRPressure, bool IsBottomUp) {
278 Cand.SU = SU;
279 Cand.AtTop = AtTop;
280
281 if (!DAG->isTrackingPressure())
282 return;
283
284 Pressure.clear();
285 MaxPressure.clear();
286
287 // We try to use the cached PressureDiffs in the ScheduleDAG whenever
288 // possible over querying the RegPressureTracker.
289 //
290 // RegPressureTracker will make a lot of LIS queries which are very
291 // expensive, it is considered a slow function in this context.
292 //
293 // PressureDiffs are precomputed and cached, and getPressureDiff is just a
294 // trivial lookup into an array. It is pretty much free.
295 //
296 // In EXPENSIVE_CHECKS, we always query RPTracker to verify the results of
297 // PressureDiffs.
298 if (AtTop || !canUsePressureDiffs(*SU) || useGCNTrackers()) {
299 getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure,
301 } else {
302 // Reserve 4 slots.
303 Pressure.resize(4, 0);
304 Pressure[AMDGPU::RegisterPressureSets::SReg_32] = SGPRPressure;
305 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = VGPRPressure;
306
307 for (const auto &Diff : DAG->getPressureDiff(SU)) {
308 if (!Diff.isValid())
309 continue;
310 // PressureDiffs is always bottom-up so if we're working top-down we need
311 // to invert its sign.
312 Pressure[Diff.getPSet()] +=
313 (IsBottomUp ? Diff.getUnitInc() : -Diff.getUnitInc());
314 }
315
316#ifdef EXPENSIVE_CHECKS
317 std::vector<unsigned> CheckPressure, CheckMaxPressure;
318 getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure,
320 if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] !=
321 CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] ||
322 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] !=
323 CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32]) {
324 errs() << "Register Pressure is inaccurate when calculated through "
325 "PressureDiff\n"
326 << "SGPR got " << Pressure[AMDGPU::RegisterPressureSets::SReg_32]
327 << ", expected "
328 << CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] << "\n"
329 << "VGPR got " << Pressure[AMDGPU::RegisterPressureSets::VGPR_32]
330 << ", expected "
331 << CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n";
332 report_fatal_error("inaccurate register pressure calculation");
333 }
334#endif
335 }
336
337 unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
338 unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
339
340 // If two instructions increase the pressure of different register sets
341 // by the same amount, the generic scheduler will prefer to schedule the
342 // instruction that increases the set with the least amount of registers,
343 // which in our case would be SGPRs. This is rarely what we want, so
344 // when we report excess/critical register pressure, we do it either
345 // only for VGPRs or only for SGPRs.
346
347 // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs.
348 const unsigned MaxVGPRPressureInc = 16;
349 bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
350 bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;
351
352 // FIXME: We have to enter REG-EXCESS before we reach the actual threshold
353 // to increase the likelihood we don't go over the limits. We should improve
354 // the analysis to look through dependencies to find the path with the least
355 // register pressure.
356
357 // We only need to update the RPDelta for instructions that increase register
358 // pressure. Instructions that decrease or keep reg pressure the same will be
359 // marked as RegExcess in tryCandidate() when they are compared with
360 // instructions that increase the register pressure.
361 if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
362 HasHighPressure = true;
363 Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
364 Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
365 }
366
367 if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
368 HasHighPressure = true;
369 Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
370 Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit);
371 }
372
373 // Register pressure is considered 'CRITICAL' if it is approaching a value
374 // that would reduce the wave occupancy for the execution unit. When
375 // register pressure is 'CRITICAL', increasing SGPR and VGPR pressure both
376 // has the same cost, so we don't need to prefer one over the other.
377
378 int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
379 int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;
380
381 if (SGPRDelta >= 0 || VGPRDelta >= 0) {
382 HasHighPressure = true;
383 if (SGPRDelta > VGPRDelta) {
384 Cand.RPDelta.CriticalMax =
385 PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
386 Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
387 } else {
388 Cand.RPDelta.CriticalMax =
389 PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
390 Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta);
391 }
392 }
393}
394
396 const TargetSchedModel *SchedModel) {
397 bool HasBufferedModel =
398 SchedModel->hasInstrSchedModel() && SchedModel->getMicroOpBufferSize();
399 unsigned Combined = Zone.Available.size() + Zone.Pending.size();
400 return Combined <= PendingQueueLimit && HasBufferedModel;
401}
402
404 const TargetSchedModel *SchedModel) {
405 // pickOnlyChoice() releases pending instructions and checks for new hazards.
406 SUnit *OnlyChoice = Zone.pickOnlyChoice();
407 if (!shouldCheckPending(Zone, SchedModel) || Zone.Pending.empty())
408 return OnlyChoice;
409
410 return nullptr;
411}
412
414 const SchedCandidate &Preferred) {
415 LLVM_DEBUG({
416 dbgs() << "Prefer:\t\t";
417 DAG->dumpNode(*Preferred.SU);
418
419 if (Current.SU) {
420 dbgs() << "Not:\t";
421 DAG->dumpNode(*Current.SU);
422 }
423
424 dbgs() << "Reason:\t\t";
425 traceCandidate(Preferred);
426 });
427}
428
429// This function is mostly cut and pasted from
430// GenericScheduler::pickNodeFromQueue()
432 const CandPolicy &ZonePolicy,
433 const RegPressureTracker &RPTracker,
434 SchedCandidate &Cand, bool &IsPending,
435 bool IsBottomUp) {
436 const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
438 unsigned SGPRPressure = 0;
439 unsigned VGPRPressure = 0;
440 IsPending = false;
441 if (DAG->isTrackingPressure()) {
442 if (!useGCNTrackers()) {
443 SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
444 VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
445 } else {
446 GCNRPTracker *T = IsBottomUp
447 ? static_cast<GCNRPTracker *>(&UpwardTracker)
448 : static_cast<GCNRPTracker *>(&DownwardTracker);
449 SGPRPressure = T->getPressure().getSGPRNum();
450 VGPRPressure = T->getPressure().getArchVGPRNum();
451 }
452 }
453 LLVM_DEBUG(dbgs() << "Available Q:\n");
454 ReadyQueue &AQ = Zone.Available;
455 for (SUnit *SU : AQ) {
456
457 SchedCandidate TryCand(ZonePolicy);
458 initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
459 VGPRPressure, IsBottomUp);
460 // Pass SchedBoundary only when comparing nodes from the same boundary.
461 SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
462 tryCandidate(Cand, TryCand, ZoneArg);
463 if (TryCand.Reason != NoCand) {
464 // Initialize resource delta if needed in case future heuristics query it.
465 if (TryCand.ResDelta == SchedResourceDelta())
466 TryCand.initResourceDelta(Zone.DAG, SchedModel);
467 LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
468 Cand.setBest(TryCand);
469 } else {
470 printCandidateDecision(TryCand, Cand);
471 }
472 }
473
474 if (!shouldCheckPending(Zone, SchedModel))
475 return;
476
477 LLVM_DEBUG(dbgs() << "Pending Q:\n");
478 ReadyQueue &PQ = Zone.Pending;
479 for (SUnit *SU : PQ) {
480
481 SchedCandidate TryCand(ZonePolicy);
482 initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
483 VGPRPressure, IsBottomUp);
484 // Pass SchedBoundary only when comparing nodes from the same boundary.
485 SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
486 tryPendingCandidate(Cand, TryCand, ZoneArg);
487 if (TryCand.Reason != NoCand) {
488 // Initialize resource delta if needed in case future heuristics query it.
489 if (TryCand.ResDelta == SchedResourceDelta())
490 TryCand.initResourceDelta(Zone.DAG, SchedModel);
491 LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
492 IsPending = true;
493 Cand.setBest(TryCand);
494 } else {
495 printCandidateDecision(TryCand, Cand);
496 }
497 }
498}
499
500// This function is mostly cut and pasted from
501// GenericScheduler::pickNodeBidirectional()
503 bool &PickedPending) {
504 // Schedule as far as possible in the direction of no choice. This is most
505 // efficient, but also provides the best heuristics for CriticalPSets.
506 if (SUnit *SU = pickOnlyChoice(Bot, SchedModel)) {
507 IsTopNode = false;
508 return SU;
509 }
510 if (SUnit *SU = pickOnlyChoice(Top, SchedModel)) {
511 IsTopNode = true;
512 return SU;
513 }
514 // Set the bottom-up policy based on the state of the current bottom zone
515 // and the instructions outside the zone, including the top zone.
516 CandPolicy BotPolicy;
517 setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);
518 // Set the top-down policy based on the state of the current top zone and
519 // the instructions outside the zone, including the bottom zone.
520 CandPolicy TopPolicy;
521 setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);
522
523 bool BotPending = false;
524 // See if BotCand is still valid (because we previously scheduled from Top).
525 LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
526 if (!BotCand.isValid() || BotCand.SU->isScheduled ||
527 BotCand.Policy != BotPolicy) {
528 BotCand.reset(CandPolicy());
529 pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand,
530 BotPending,
531 /*IsBottomUp=*/true);
532 assert(BotCand.Reason != NoCand && "failed to find the first candidate");
533 } else {
535#ifndef NDEBUG
536 if (VerifyScheduling) {
537 SchedCandidate TCand;
538 TCand.reset(CandPolicy());
539 pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand,
540 BotPending,
541 /*IsBottomUp=*/true);
542 assert(TCand.SU == BotCand.SU &&
543 "Last pick result should correspond to re-picking right now");
544 }
545#endif
546 }
547
548 bool TopPending = false;
549 // Check if the top Q has a better candidate.
550 LLVM_DEBUG(dbgs() << "Picking from Top:\n");
551 if (!TopCand.isValid() || TopCand.SU->isScheduled ||
552 TopCand.Policy != TopPolicy) {
553 TopCand.reset(CandPolicy());
554 pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand,
555 TopPending,
556 /*IsBottomUp=*/false);
557 assert(TopCand.Reason != NoCand && "failed to find the first candidate");
558 } else {
560#ifndef NDEBUG
561 if (VerifyScheduling) {
562 SchedCandidate TCand;
563 TCand.reset(CandPolicy());
564 pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand,
565 TopPending,
566 /*IsBottomUp=*/false);
567 assert(TCand.SU == TopCand.SU &&
568 "Last pick result should correspond to re-picking right now");
569 }
570#endif
571 }
572
573 // Pick best from BotCand and TopCand.
574 LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
575 dbgs() << "Bot Cand: "; traceCandidate(BotCand););
576 SchedCandidate Cand = BotPending ? TopCand : BotCand;
577 SchedCandidate TryCand = BotPending ? BotCand : TopCand;
578 PickedPending = BotPending && TopPending;
579
580 TryCand.Reason = NoCand;
581 if (BotPending || TopPending) {
582 PickedPending |= tryPendingCandidate(Cand, TopCand, nullptr);
583 } else {
584 tryCandidate(Cand, TryCand, nullptr);
585 }
586
587 if (TryCand.Reason != NoCand) {
588 Cand.setBest(TryCand);
589 }
590
591 LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););
592
593 IsTopNode = Cand.AtTop;
594 return Cand.SU;
595}
596
597// This function is mostly cut and pasted from
598// GenericScheduler::pickNode()
600 if (DAG->top() == DAG->bottom()) {
601 assert(Top.Available.empty() && Top.Pending.empty() &&
602 Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
603 return nullptr;
604 }
605 bool PickedPending;
606 SUnit *SU;
607 do {
608 PickedPending = false;
609 if (RegionPolicy.OnlyTopDown) {
611 if (!SU) {
612 CandPolicy NoPolicy;
613 TopCand.reset(NoPolicy);
614 pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand,
615 PickedPending,
616 /*IsBottomUp=*/false);
617 assert(TopCand.Reason != NoCand && "failed to find a candidate");
618 SU = TopCand.SU;
619 }
620 IsTopNode = true;
621 } else if (RegionPolicy.OnlyBottomUp) {
623 if (!SU) {
624 CandPolicy NoPolicy;
625 BotCand.reset(NoPolicy);
626 pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand,
627 PickedPending,
628 /*IsBottomUp=*/true);
629 assert(BotCand.Reason != NoCand && "failed to find a candidate");
630 SU = BotCand.SU;
631 }
632 IsTopNode = false;
633 } else {
634 SU = pickNodeBidirectional(IsTopNode, PickedPending);
635 }
636 } while (SU->isScheduled);
637
638 if (PickedPending) {
639 unsigned ReadyCycle = IsTopNode ? SU->TopReadyCycle : SU->BotReadyCycle;
640 SchedBoundary &Zone = IsTopNode ? Top : Bot;
641 unsigned CurrentCycle = Zone.getCurrCycle();
642 if (ReadyCycle > CurrentCycle)
643 Zone.bumpCycle(ReadyCycle);
644
645 // FIXME: checkHazard() doesn't give information about which cycle the
646 // hazard will resolve so just keep bumping the cycle by 1. This could be
647 // made more efficient if checkHazard() returned more details.
648 while (Zone.checkHazard(SU))
649 Zone.bumpCycle(Zone.getCurrCycle() + 1);
650
651 Zone.releasePending();
652 }
653
654 if (SU->isTopReady())
655 Top.removeReady(SU);
656 if (SU->isBottomReady())
657 Bot.removeReady(SU);
658
659 LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
660 << *SU->getInstr());
661 return SU;
662}
663
664void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
665 if (useGCNTrackers()) {
666 MachineInstr *MI = SU->getInstr();
667 IsTopNode ? (void)DownwardTracker.advance(MI, false)
668 : UpwardTracker.recede(*MI);
669 }
670
671 return GenericScheduler::schedNode(SU, IsTopNode);
672}
673
678
681 if (!CurrentStage)
682 CurrentStage = SchedStages.begin();
683 else
684 CurrentStage++;
685
686 return CurrentStage != SchedStages.end();
687}
688
691 return std::next(CurrentStage) != SchedStages.end();
692}
693
695 assert(CurrentStage && std::next(CurrentStage) != SchedStages.end());
696 return *std::next(CurrentStage);
697}
698
700 SchedCandidate &TryCand,
701 SchedBoundary *Zone) const {
702 // Initialize the candidate if needed.
703 if (!Cand.isValid()) {
704 TryCand.Reason = NodeOrder;
705 return true;
706 }
707
708 // Bias PhysReg Defs and copies to their uses and defined respectively.
709 if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
710 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
711 return TryCand.Reason != NoCand;
712
713 // Avoid exceeding the target's limit.
714 if (DAG->isTrackingPressure() &&
715 tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
716 RegExcess, TRI, DAG->MF))
717 return TryCand.Reason != NoCand;
718
719 // Avoid increasing the max critical pressure in the scheduled region.
720 if (DAG->isTrackingPressure() &&
722 TryCand, Cand, RegCritical, TRI, DAG->MF))
723 return TryCand.Reason != NoCand;
724
725 bool SameBoundary = Zone != nullptr;
726 if (SameBoundary) {
729 TryCand, Cand, ResourceReduce))
730 return TryCand.Reason != NoCand;
732 Cand.ResDelta.DemandedResources, TryCand, Cand,
734 return TryCand.Reason != NoCand;
735 }
736
737 return false;
738}
739
752
757
759 SchedCandidate &TryCand,
760 SchedBoundary *Zone) const {
761 // Initialize the candidate if needed.
762 if (!Cand.isValid()) {
763 TryCand.Reason = NodeOrder;
764 return true;
765 }
766
767 // Avoid spilling by exceeding the register limit.
768 if (DAG->isTrackingPressure() &&
769 tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
770 RegExcess, TRI, DAG->MF))
771 return TryCand.Reason != NoCand;
772
773 // Bias PhysReg Defs and copies to their uses and defined respectively.
774 if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
775 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
776 return TryCand.Reason != NoCand;
777
778 bool SameBoundary = Zone != nullptr;
779 if (SameBoundary) {
780 // Prioritize instructions that read unbuffered resources by stall cycles.
781 if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
782 Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
783 return TryCand.Reason != NoCand;
784
785 // Avoid critical resource consumption and balance the schedule.
788 TryCand, Cand, ResourceReduce))
789 return TryCand.Reason != NoCand;
791 Cand.ResDelta.DemandedResources, TryCand, Cand,
793 return TryCand.Reason != NoCand;
794
795 // Unconditionally try to reduce latency.
796 if (tryLatency(TryCand, Cand, *Zone))
797 return TryCand.Reason != NoCand;
798
799 // Weak edges are for clustering and other constraints.
800 if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
801 getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
802 return TryCand.Reason != NoCand;
803 }
804
805 // Keep clustered nodes together to encourage downstream peephole
806 // optimizations which may reduce resource requirements.
807 //
808 // This is a best effort to set things up for a post-RA pass. Optimizations
809 // like generating loads of multiple registers should ideally be done within
810 // the scheduler pass by combining the loads during DAG postprocessing.
811 unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
812 unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
813 bool CandIsClusterSucc =
814 isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx);
815 bool TryCandIsClusterSucc =
816 isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx);
817 if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand,
818 Cluster))
819 return TryCand.Reason != NoCand;
820
821 // Avoid increasing the max critical pressure in the scheduled region.
822 if (DAG->isTrackingPressure() &&
824 TryCand, Cand, RegCritical, TRI, DAG->MF))
825 return TryCand.Reason != NoCand;
826
827 // Avoid increasing the max pressure of the entire region.
828 if (DAG->isTrackingPressure() &&
829 tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
830 Cand, RegMax, TRI, DAG->MF))
831 return TryCand.Reason != NoCand;
832
833 if (SameBoundary) {
834 // Fall through to original instruction order.
835 if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
836 (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
837 TryCand.Reason = NodeOrder;
838 return true;
839 }
840 }
841 return false;
842}
843
849
850/// GCNMaxMemoryClauseSchedStrategy tries best to clause memory instructions as
851/// much as possible. This is achieved by:
852// 1. Prioritize clustered operations before stall latency heuristic.
853// 2. Prioritize long-latency-load before stall latency heuristic.
854///
855/// \param Cand provides the policy and current best candidate.
856/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
857/// \param Zone describes the scheduled zone that we are extending, or nullptr
858/// if Cand is from a different zone than TryCand.
859/// \return \c true if TryCand is better than Cand (Reason is NOT NoCand)
861 SchedCandidate &TryCand,
862 SchedBoundary *Zone) const {
863 // Initialize the candidate if needed.
864 if (!Cand.isValid()) {
865 TryCand.Reason = NodeOrder;
866 return true;
867 }
868
869 // Bias PhysReg Defs and copies to their uses and defined respectively.
870 if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
871 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
872 return TryCand.Reason != NoCand;
873
874 if (DAG->isTrackingPressure()) {
875 // Avoid exceeding the target's limit.
876 if (tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
877 RegExcess, TRI, DAG->MF))
878 return TryCand.Reason != NoCand;
879
880 // Avoid increasing the max critical pressure in the scheduled region.
882 TryCand, Cand, RegCritical, TRI, DAG->MF))
883 return TryCand.Reason != NoCand;
884 }
885
886 // MaxMemoryClause-specific: We prioritize clustered instructions as we would
887 // get more benefit from clausing these memory instructions.
888 unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
889 unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
890 bool CandIsClusterSucc =
891 isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx);
892 bool TryCandIsClusterSucc =
893 isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx);
894 if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand,
895 Cluster))
896 return TryCand.Reason != NoCand;
897
898 // We only compare a subset of features when comparing nodes between
899 // Top and Bottom boundary. Some properties are simply incomparable, in many
900 // other instances we should only override the other boundary if something
901 // is a clear good pick on one boundary. Skip heuristics that are more
902 // "tie-breaking" in nature.
903 bool SameBoundary = Zone != nullptr;
904 if (SameBoundary) {
905 // For loops that are acyclic path limited, aggressively schedule for
906 // latency. Within an single cycle, whenever CurrMOps > 0, allow normal
907 // heuristics to take precedence.
908 if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() &&
909 tryLatency(TryCand, Cand, *Zone))
910 return TryCand.Reason != NoCand;
911
912 // MaxMemoryClause-specific: Prioritize long latency memory load
913 // instructions in top-bottom order to hide more latency. The mayLoad check
914 // is used to exclude store-like instructions, which we do not want to
915 // scheduler them too early.
916 bool TryMayLoad =
917 TryCand.SU->isInstr() && TryCand.SU->getInstr()->mayLoad();
918 bool CandMayLoad = Cand.SU->isInstr() && Cand.SU->getInstr()->mayLoad();
919
920 if (TryMayLoad || CandMayLoad) {
921 bool TryLongLatency =
922 TryCand.SU->Latency > 10 * Cand.SU->Latency && TryMayLoad;
923 bool CandLongLatency =
924 10 * TryCand.SU->Latency < Cand.SU->Latency && CandMayLoad;
925
926 if (tryGreater(Zone->isTop() ? TryLongLatency : CandLongLatency,
927 Zone->isTop() ? CandLongLatency : TryLongLatency, TryCand,
928 Cand, Stall))
929 return TryCand.Reason != NoCand;
930 }
931 // Prioritize instructions that read unbuffered resources by stall cycles.
932 if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
933 Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
934 return TryCand.Reason != NoCand;
935 }
936
937 if (SameBoundary) {
938 // Weak edges are for clustering and other constraints.
939 if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
940 getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
941 return TryCand.Reason != NoCand;
942 }
943
944 // Avoid increasing the max pressure of the entire region.
945 if (DAG->isTrackingPressure() &&
946 tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
947 Cand, RegMax, TRI, DAG->MF))
948 return TryCand.Reason != NoCand;
949
950 if (SameBoundary) {
951 // Avoid critical resource consumption and balance the schedule.
954 TryCand, Cand, ResourceReduce))
955 return TryCand.Reason != NoCand;
957 Cand.ResDelta.DemandedResources, TryCand, Cand,
959 return TryCand.Reason != NoCand;
960
961 // Avoid serializing long latency dependence chains.
962 // For acyclic path limited loops, latency was already checked above.
963 if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
964 !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
965 return TryCand.Reason != NoCand;
966
967 // Fall through to original instruction order.
968 if (Zone->isTop() == (TryCand.SU->NodeNum < Cand.SU->NodeNum)) {
969 assert(TryCand.SU->NodeNum != Cand.SU->NodeNum);
970 TryCand.Reason = NodeOrder;
971 return true;
972 }
973 }
974
975 return false;
976}
977
979 MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
980 : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
981 MFI(*MF.getInfo<SIMachineFunctionInfo>()),
982 StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy),
983 RegionLiveOuts(this, /*IsLiveOut=*/true) {
984
985 // We want regions with a single MI to be scheduled so that we can reason
986 // about them correctly during scheduling stages that move MIs between regions
987 // (e.g., rematerialization).
989 LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
990 if (RelaxedOcc) {
991 MinOccupancy = std::min(MFI.getMinAllowedOccupancy(), StartingOccupancy);
992 if (MinOccupancy != StartingOccupancy)
993 LLVM_DEBUG(dbgs() << "Allowing Occupancy drops to " << MinOccupancy
994 << ".\n");
995 }
996}
997
998std::unique_ptr<GCNSchedStage>
999GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
1000 switch (SchedStageID) {
1002 return std::make_unique<OccInitialScheduleStage>(SchedStageID, *this);
1004 return std::make_unique<RewriteMFMAFormStage>(SchedStageID, *this);
1006 return std::make_unique<UnclusteredHighRPStage>(SchedStageID, *this);
1008 return std::make_unique<ClusteredLowOccStage>(SchedStageID, *this);
1010 return std::make_unique<PreRARematStage>(SchedStageID, *this);
1012 return std::make_unique<ILPInitialScheduleStage>(SchedStageID, *this);
1014 return std::make_unique<MemoryClauseInitialScheduleStage>(SchedStageID,
1015 *this);
1016 }
1017
1018 llvm_unreachable("Unknown SchedStageID.");
1019}
1020
1022 // Collect all scheduling regions. The actual scheduling is performed in
1023 // GCNScheduleDAGMILive::finalizeSchedule.
1024 Regions.push_back(std::pair(RegionBegin, RegionEnd));
1025}
1026
1028GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
1029 if (Regions[RegionIdx].first == Regions[RegionIdx].second)
1030 return llvm::getRegPressure(MRI, LiveIns[RegionIdx]);
1032 RPTracker.advance(Regions[RegionIdx].first, Regions[RegionIdx].second,
1033 &LiveIns[RegionIdx]);
1034 return RPTracker.moveMaxPressure();
1035}
1036
1038 MachineBasicBlock::iterator RegionEnd) {
1039 assert(RegionBegin != RegionEnd && "Region must not be empty");
1040 return &*skipDebugInstructionsBackward(std::prev(RegionEnd), RegionBegin);
1041}
1042
1043void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx,
1044 const MachineBasicBlock *MBB) {
1045 GCNDownwardRPTracker RPTracker(*LIS);
1046
1047 // If the block has the only successor then live-ins of that successor are
1048 // live-outs of the current block. We can reuse calculated live set if the
1049 // successor will be sent to scheduling past current block.
1050
1051 // However, due to the bug in LiveInterval analysis it may happen that two
1052 // predecessors of the same successor block have different lane bitmasks for
1053 // a live-out register. Workaround that by sticking to one-to-one relationship
1054 // i.e. one predecessor with one successor block.
1055 const MachineBasicBlock *OnlySucc = nullptr;
1056 if (MBB->succ_size() == 1) {
1057 auto *Candidate = *MBB->succ_begin();
1058 if (!Candidate->empty() && Candidate->pred_size() == 1) {
1059 SlotIndexes *Ind = LIS->getSlotIndexes();
1060 if (Ind->getMBBStartIdx(MBB) < Ind->getMBBStartIdx(Candidate))
1061 OnlySucc = Candidate;
1062 }
1063 }
1064
1065 // Scheduler sends regions from the end of the block upwards.
1066 size_t CurRegion = RegionIdx;
1067 for (size_t E = Regions.size(); CurRegion != E; ++CurRegion)
1068 if (Regions[CurRegion].first->getParent() != MBB)
1069 break;
1070 --CurRegion;
1071
1072 auto I = MBB->begin();
1073 auto LiveInIt = MBBLiveIns.find(MBB);
1074 auto &Rgn = Regions[CurRegion];
1075 auto *NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
1076 if (LiveInIt != MBBLiveIns.end()) {
1077 auto LiveIn = std::move(LiveInIt->second);
1078 RPTracker.reset(*MBB->begin(), MBB->end(), &LiveIn);
1079 MBBLiveIns.erase(LiveInIt);
1080 } else {
1081 I = Rgn.first;
1082 auto LRS = BBLiveInMap.lookup(NonDbgMI);
1083#ifdef EXPENSIVE_CHECKS
1084 assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS));
1085#endif
1086 RPTracker.reset(*I, I->getParent()->end(), &LRS);
1087 }
1088
1089 for (;;) {
1090 I = RPTracker.getNext();
1091
1092 if (Regions[CurRegion].first == I || NonDbgMI == I) {
1093 LiveIns[CurRegion] = RPTracker.getLiveRegs();
1094 RPTracker.clearMaxPressure();
1095 }
1096
1097 if (Regions[CurRegion].second == I) {
1098 Pressure[CurRegion] = RPTracker.moveMaxPressure();
1099 if (CurRegion-- == RegionIdx)
1100 break;
1101 auto &Rgn = Regions[CurRegion];
1102 NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
1103 }
1104 RPTracker.advanceBeforeNext();
1105 RPTracker.advanceToNext();
1106 }
1107
1108 if (OnlySucc) {
1109 if (I != MBB->end()) {
1110 RPTracker.advanceBeforeNext();
1111 RPTracker.advanceToNext();
1112 RPTracker.advance(MBB->end());
1113 }
1114 MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs();
1115 }
1116}
1117
1119GCNScheduleDAGMILive::getRegionLiveInMap() const {
1120 assert(!Regions.empty());
1121 std::vector<MachineInstr *> RegionFirstMIs;
1122 RegionFirstMIs.reserve(Regions.size());
1123 for (auto &[RegionBegin, RegionEnd] : reverse(Regions))
1124 RegionFirstMIs.push_back(
1126
1127 return getLiveRegMap(RegionFirstMIs, /*After=*/false, *LIS);
1128}
1129
1131GCNScheduleDAGMILive::getRegionLiveOutMap() const {
1132 assert(!Regions.empty());
1133 std::vector<MachineInstr *> RegionLastMIs;
1134 RegionLastMIs.reserve(Regions.size());
1135 for (auto &[RegionBegin, RegionEnd] : reverse(Regions)) {
1136 // Skip empty regions.
1137 if (RegionBegin == RegionEnd)
1138 continue;
1139 RegionLastMIs.push_back(getLastMIForRegion(RegionBegin, RegionEnd));
1140 }
1141 return getLiveRegMap(RegionLastMIs, /*After=*/true, *LIS);
1142}
1143
1145 IdxToInstruction.clear();
1146
1147 RegionLiveRegMap =
1148 IsLiveOut ? DAG->getRegionLiveOutMap() : DAG->getRegionLiveInMap();
1149 for (unsigned I = 0; I < DAG->Regions.size(); I++) {
1150 auto &[RegionBegin, RegionEnd] = DAG->Regions[I];
1151 // Skip empty regions.
1152 if (RegionBegin == RegionEnd)
1153 continue;
1154 MachineInstr *RegionKey =
1155 IsLiveOut ? getLastMIForRegion(RegionBegin, RegionEnd) : &*RegionBegin;
1156 IdxToInstruction[I] = RegionKey;
1157 }
1158}
1159
1161 // Start actual scheduling here. This function is called by the base
1162 // MachineScheduler after all regions have been recorded by
1163 // GCNScheduleDAGMILive::schedule().
1164 LiveIns.resize(Regions.size());
1165 Pressure.resize(Regions.size());
1166 RegionsWithHighRP.resize(Regions.size());
1167 RegionsWithExcessRP.resize(Regions.size());
1168 RegionsWithIGLPInstrs.resize(Regions.size());
1169 RegionsWithHighRP.reset();
1170 RegionsWithExcessRP.reset();
1171 RegionsWithIGLPInstrs.reset();
1172
1173 runSchedStages();
1174}
1175
1176void GCNScheduleDAGMILive::runSchedStages() {
1177 LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
1178
1179 GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
1180 if (!Regions.empty()) {
1181 BBLiveInMap = getRegionLiveInMap();
1182 if (S.useGCNTrackers())
1183 RegionLiveOuts.buildLiveRegMap();
1184 }
1185
1186#ifdef DUMP_MAX_REG_PRESSURE
1190 LIS->dump();
1191 }
1192#endif
1193
1194 while (S.advanceStage()) {
1195 auto Stage = createSchedStage(S.getCurrentStage());
1196 if (!Stage->initGCNSchedStage())
1197 continue;
1198
1199 for (auto Region : Regions) {
1200 RegionBegin = Region.first;
1201 RegionEnd = Region.second;
1202 // Setup for scheduling the region and check whether it should be skipped.
1203 if (!Stage->initGCNRegion()) {
1204 Stage->advanceRegion();
1205 exitRegion();
1206 continue;
1207 }
1208
1209 if (S.useGCNTrackers()) {
1210 const unsigned RegionIdx = Stage->getRegionIdx();
1211 S.getDownwardTracker()->reset(MRI, LiveIns[RegionIdx]);
1213 MRI, RegionLiveOuts.getLiveRegsForRegionIdx(RegionIdx));
1214 }
1215
1217 Stage->finalizeGCNRegion();
1218 Stage->advanceRegion();
1219 exitRegion();
1220 }
1221
1222 Stage->finalizeGCNSchedStage();
1223 }
1224
1225#ifdef DUMP_MAX_REG_PRESSURE
1229 LIS->dump();
1230 }
1231#endif
1232}
1233
1234#ifndef NDEBUG
1236 switch (StageID) {
1238 OS << "Max Occupancy Initial Schedule";
1239 break;
1241 OS << "Instruction Rewriting Reschedule";
1242 break;
1244 OS << "Unclustered High Register Pressure Reschedule";
1245 break;
1247 OS << "Clustered Low Occupancy Reschedule";
1248 break;
1250 OS << "Pre-RA Rematerialize";
1251 break;
1253 OS << "Max ILP Initial Schedule";
1254 break;
1256 OS << "Max memory clause Initial Schedule";
1257 break;
1258 }
1259
1260 return OS;
1261}
1262#endif
1263
1267
1269 if (!DAG.LIS)
1270 return false;
1271
1272 LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n");
1273 return true;
1274}
1275
1276void RewriteMFMAFormStage::findReachingDefs(
1277 MachineOperand &UseMO, LiveIntervals *LIS,
1278 SmallVectorImpl<SlotIndex> &DefIdxs) {
1279 MachineInstr *UseMI = UseMO.getParent();
1280 LiveInterval &UseLI = LIS->getInterval(UseMO.getReg());
1281 VNInfo *VNI = UseLI.getVNInfoAt(LIS->getInstructionIndex(*UseMI));
1282
1283 // If the def is not a PHI, then it must be the only reaching def.
1284 if (!VNI->isPHIDef()) {
1285 DefIdxs.push_back(VNI->def);
1286 return;
1287 }
1288
1289 SmallPtrSet<MachineBasicBlock *, 8> Visited = {UseMI->getParent()};
1291
1292 // Mark the predecessor blocks for traversal
1293 for (MachineBasicBlock *PredMBB : UseMI->getParent()->predecessors()) {
1294 Worklist.push_back(PredMBB);
1295 Visited.insert(PredMBB);
1296 }
1297
1298 while (!Worklist.empty()) {
1299 MachineBasicBlock *CurrMBB = Worklist.pop_back_val();
1300
1301 SlotIndex CurrMBBEnd = LIS->getMBBEndIdx(CurrMBB);
1302 VNInfo *VNI = UseLI.getVNInfoAt(CurrMBBEnd.getPrevSlot());
1303
1304 MachineBasicBlock *DefMBB = LIS->getMBBFromIndex(VNI->def);
1305
1306 // If there is a def in this block, then add it to the list. This is the
1307 // reaching def of this path.
1308 if (!VNI->isPHIDef()) {
1309 DefIdxs.push_back(VNI->def);
1310 continue;
1311 }
1312
1313 for (MachineBasicBlock *PredMBB : DefMBB->predecessors()) {
1314 if (Visited.insert(PredMBB).second)
1315 Worklist.push_back(PredMBB);
1316 }
1317 }
1318}
1319
1320void RewriteMFMAFormStage::findReachingUses(
1322 SmallVectorImpl<MachineOperand *> &ReachingUses) {
1323 SlotIndex DefIdx = LIS->getInstructionIndex(*DefMI);
1324 for (MachineOperand &UseMO :
1325 DAG.MRI.use_nodbg_operands(DefMI->getOperand(0).getReg())) {
1326 SmallVector<SlotIndex, 8> ReachingDefIndexes;
1327 findReachingDefs(UseMO, LIS, ReachingDefIndexes);
1328
1329 // If we find a use that contains this DefMI in its reachingDefs, then it is
1330 // a reaching use.
1331 if (any_of(ReachingDefIndexes, [DefIdx](SlotIndex RDIdx) {
1332 return SlotIndex::isSameInstr(RDIdx, DefIdx);
1333 }))
1334 ReachingUses.push_back(&UseMO);
1335 }
1336}
1337
1339 // We only need to run this pass if the architecture supports AGPRs.
1340 // Additionally, we don't use AGPRs at occupancy levels above 1 so there
1341 // is no need for this pass in that case, either.
1342 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1343 if (!ST.hasGFX90AInsts() || MFI.getMinWavesPerEU() > 1)
1344 return false;
1345
1346 RegionsWithExcessArchVGPR.resize(DAG.Regions.size());
1347 RegionsWithExcessArchVGPR.reset();
1348 for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
1350 if (PressureBefore.getArchVGPRNum() > ST.getAddressableNumArchVGPRs())
1351 RegionsWithExcessArchVGPR[Region] = true;
1352 }
1353
1354 if (RegionsWithExcessArchVGPR.none())
1355 return false;
1356
1357 TII = ST.getInstrInfo();
1358 SRI = ST.getRegisterInfo();
1359
1360 std::vector<std::pair<MachineInstr *, unsigned>> RewriteCands;
1363
1364 if (!initHeuristics(RewriteCands, CopyForUse, CopyForDef))
1365 return false;
1366
1367 int64_t Cost = getRewriteCost(RewriteCands, CopyForUse, CopyForDef);
1368
1369 // If we haven't found the beneficial conditions, prefer the VGPR form which
1370 // may result in less cross RC copies.
1371 if (Cost > 0)
1372 return false;
1373
1374 return rewrite(RewriteCands);
1375}
1376
1379 return false;
1380
1382 return false;
1383
1384 if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessRP.none())
1385 return false;
1386
1387 SavedMutations.swap(DAG.Mutations);
1388 DAG.addMutation(
1390
1391 InitialOccupancy = DAG.MinOccupancy;
1392 // Aggressively try to reduce register pressure in the unclustered high RP
1393 // stage. Temporarily increase occupancy target in the region.
1394 TempTargetOccupancy = MFI.getMaxWavesPerEU() > DAG.MinOccupancy
1395 ? InitialOccupancy + 1
1396 : InitialOccupancy;
1397 IsAnyRegionScheduled = false;
1398 S.SGPRLimitBias = S.HighRPSGPRBias;
1399 S.VGPRLimitBias = S.HighRPVGPRBias;
1400
1401 LLVM_DEBUG(
1402 dbgs()
1403 << "Retrying function scheduling without clustering. "
1404 "Aggressively try to reduce register pressure to achieve occupancy "
1405 << TempTargetOccupancy << ".\n");
1406
1407 return true;
1408}
1409
1412 return false;
1413
1415 return false;
1416
1417 // Don't bother trying to improve ILP in lower RP regions if occupancy has not
1418 // been dropped. All regions will have already been scheduled with the ideal
1419 // occupancy targets.
1420 if (DAG.StartingOccupancy <= DAG.MinOccupancy)
1421 return false;
1422
1423 LLVM_DEBUG(
1424 dbgs() << "Retrying function scheduling with lowest recorded occupancy "
1425 << DAG.MinOccupancy << ".\n");
1426 return true;
1427}
1428
1429/// Allows to easily filter for this stage's debug output.
1430#define REMAT_PREFIX "[PreRARemat] "
1431#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << REMAT_PREFIX; X;)
1432
1433#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1434Printable PreRARematStage::ScoredRemat::print() const {
1435 return Printable([&](raw_ostream &OS) {
1436 OS << '(' << MaxFreq << ", " << FreqDiff << ", " << RegionImpact << ')';
1437 });
1438}
1439#endif
1440
1442 // FIXME: This pass will invalidate cached BBLiveInMap and MBBLiveIns for
1443 // regions inbetween the defs and region we sinked the def to. Will need to be
1444 // fixed if there is another pass after this pass.
1445 assert(!S.hasNextStage());
1446
1447 if (!GCNSchedStage::initGCNSchedStage() || DAG.Regions.size() <= 1)
1448 return false;
1449
1450#ifndef NDEBUG
1451 auto PrintTargetRegions = [&]() -> void {
1452 if (TargetRegions.none()) {
1453 dbgs() << REMAT_PREFIX << "No target regions\n";
1454 return;
1455 }
1456 dbgs() << REMAT_PREFIX << "Target regions:\n";
1457 for (unsigned I : TargetRegions.set_bits())
1458 dbgs() << REMAT_PREFIX << " [" << I << "] " << RPTargets[I] << '\n';
1459 };
1460#endif
1461
1462 // Set an objective for the stage based on current RP in each region.
1463 REMAT_DEBUG({
1464 dbgs() << "Analyzing ";
1465 MF.getFunction().printAsOperand(dbgs(), false);
1466 dbgs() << ": ";
1467 });
1468 if (!setObjective()) {
1469 LLVM_DEBUG(dbgs() << "no objective to achieve, occupancy is maximal at "
1470 << MFI.getMaxWavesPerEU() << '\n');
1471 return false;
1472 }
1473 LLVM_DEBUG({
1474 if (TargetOcc) {
1475 dbgs() << "increase occupancy from " << *TargetOcc - 1 << '\n';
1476 } else {
1477 dbgs() << "reduce spilling (minimum target occupancy is "
1478 << MFI.getMinWavesPerEU() << ")\n";
1479 }
1480 PrintTargetRegions();
1481 });
1482
1483 // We need up-to-date live-out info. to query live-out register masks in
1484 // regions containing rematerializable instructions.
1485 DAG.RegionLiveOuts.buildLiveRegMap();
1486
1487 if (!Remater.analyze()) {
1488 REMAT_DEBUG(dbgs() << "No rematerializable registers\n");
1489 return false;
1490 }
1491 const ScoredRemat::FreqInfo FreqInfo(MF, DAG);
1492
1493 // Set of registers already marked for potential remterialization; used to
1494 // avoid rematerialization chains.
1495 SmallSet<Register, 4> MarkedRegs;
1496
1497 // Collect candidates. We have more restrictions on what we can track here
1498 // compared to the rematerializer.
1499 SmallVector<ScoredRemat, 8> Candidates;
1500 SmallVector<unsigned> CandidateOrder;
1501 for (unsigned RegIdx = 0, E = Remater.getNumRegs(); RegIdx < E; ++RegIdx) {
1502 const Rematerializer::Reg &CandReg = Remater.getReg(RegIdx);
1503
1504 // Single user only.
1505 unsigned NumUsers = 0;
1506 for (const auto &[_, RegionUses] : CandReg.Uses)
1507 NumUsers += RegionUses.size();
1508 if (NumUsers != 1)
1509 continue;
1510
1511 // We further filter the registers that we can rematerialize based on our
1512 // current tracking capabilities in the stage. The user cannot itself be
1513 // marked rematerializable, and no register operand of the defining MI can
1514 // be marked rematerializable.
1515 MachineInstr *UseMI = *CandReg.Uses.begin()->getSecond().begin();
1516 const MachineOperand &UseMO = UseMI->getOperand(0);
1517 if (UseMO.isReg() && MarkedRegs.contains(UseMO.getReg()))
1518 continue;
1519 if (llvm::any_of(CandReg.DefMI->all_uses(),
1520 [&MarkedRegs](const MachineOperand &MO) {
1521 return MarkedRegs.contains(MO.getReg());
1522 }))
1523 continue;
1524
1525 // Do not rematerialize an instruction if it uses registers that aren't
1526 // available at its use. This ensures that we are not extending any live
1527 // range while rematerializing.
1528 SlotIndex UseIdx = DAG.LIS->getInstructionIndex(*UseMI).getRegSlot(true);
1529 if (!VirtRegAuxInfo::allUsesAvailableAt(CandReg.DefMI, UseIdx, *DAG.LIS,
1530 DAG.MRI, *DAG.TII))
1531 continue;
1532
1533 MarkedRegs.insert(CandReg.getDefReg());
1534 ScoredRemat &Cand = Candidates.emplace_back();
1535 Cand.init(RegIdx, FreqInfo, Remater, DAG);
1536 Cand.update(TargetRegions, RPTargets, FreqInfo, !TargetOcc);
1537 if (!Cand.hasNullScore())
1538 CandidateOrder.push_back(Candidates.size() - 1);
1539 }
1540
1541 if (TargetOcc) {
1542 // Every rematerialization we do here is likely to move the instruction
1543 // into a higher frequency region, increasing the total sum latency of the
1544 // instruction itself. This is acceptable if we are eliminating a spill in
1545 // the process, but when the goal is increasing occupancy we get nothing
1546 // out of rematerialization if occupancy is not increased in the end; in
1547 // such cases we want to roll back the rematerialization.
1548 Rollback = std::make_unique<RollbackSupport>(Remater);
1549 }
1550
1551 // Rematerialize registers in successive rounds until all RP targets are
1552 // satisifed or until we run out of rematerialization candidates.
1553 BitVector RecomputeRP(DAG.Regions.size());
1554 for (;;) {
1555 RecomputeRP.reset();
1556
1557 // Sort candidates in increasing score order.
1558 sort(CandidateOrder, [&](unsigned LHSIndex, unsigned RHSIndex) {
1559 return Candidates[LHSIndex] < Candidates[RHSIndex];
1560 });
1561
1562 REMAT_DEBUG({
1563 dbgs() << "==== NEW REMAT ROUND ====\n"
1564 << REMAT_PREFIX
1565 << "Candidates with non-null score, in rematerialization order:\n";
1566 for (const ScoredRemat &Cand : reverse(Candidates)) {
1567 dbgs() << REMAT_PREFIX << " " << Cand.print() << " | "
1568 << Remater.printRematReg(Cand.RegIdx) << '\n';
1569 }
1570 PrintTargetRegions();
1571 });
1572
1573 // Rematerialize registers in decreasing score order until we estimate
1574 // that all RP targets are satisfied or until rematerialization candidates
1575 // are no longer useful to decrease RP.
1576 while (!CandidateOrder.empty()) {
1577 const ScoredRemat &Cand = Candidates[CandidateOrder.back()];
1578 const Rematerializer::Reg &Reg = Remater.getReg(Cand.RegIdx);
1579
1580 // When previous rematerializations in this round have already satisfied
1581 // RP targets in all regions this rematerialization can impact, we have a
1582 // good indication that our scores have diverged significantly from
1583 // reality, in which case we interrupt this round and re-score. This also
1584 // ensures that every rematerialization we perform is possibly impactful
1585 // in at least one target region.
1586 if (!Cand.maybeBeneficial(TargetRegions, RPTargets)) {
1587 REMAT_DEBUG(dbgs() << "Interrupt round on stale score for "
1588 << Cand.print() << " | "
1589 << Remater.printRematReg(Cand.RegIdx));
1590 break;
1591 }
1592 CandidateOrder.pop_back();
1593
1594#ifdef EXPENSIVE_CHECKS
1595 // All uses are known to be available / live at the remat point. Thus,
1596 // the uses should already be live in to the using region.
1597 for (MachineOperand &MO : Reg.DefMI->operands()) {
1598 if (!MO.isReg() || !MO.getReg() || !MO.readsReg())
1599 continue;
1600
1601 Register UseReg = MO.getReg();
1602 if (!UseReg.isVirtual())
1603 continue;
1604
1605 LiveInterval &LI = DAG.LIS->getInterval(UseReg);
1606 LaneBitmask LM = DAG.MRI.getMaxLaneMaskForVReg(MO.getReg());
1607 if (LI.hasSubRanges() && MO.getSubReg())
1608 LM = DAG.TRI->getSubRegIndexLaneMask(MO.getSubReg());
1609
1610 const unsigned UseRegion = Reg.Uses.begin()->first;
1611 LaneBitmask LiveInMask = DAG.LiveIns[UseRegion].at(UseReg);
1612 LaneBitmask UncoveredLanes = LM & ~(LiveInMask & LM);
1613 // If this register has lanes not covered by the LiveIns, be sure they
1614 // do not map to any subrange. ref:
1615 // machine-scheduler-sink-trivial-remats.mir::omitted_subrange
1616 if (UncoveredLanes.any()) {
1617 assert(LI.hasSubRanges());
1618 for (LiveInterval::SubRange &SR : LI.subranges())
1619 assert((SR.LaneMask & UncoveredLanes).none());
1620 }
1621 }
1622#endif
1623
1624 // Remove the register from all regions where it is a live-in or live-out,
1625 // then rematerialize the register.
1626 REMAT_DEBUG(dbgs() << "** REMAT " << Remater.printRematReg(Cand.RegIdx)
1627 << '\n');
1628 removeFromLiveMaps(Reg.getDefReg(), Cand.LiveIn, Cand.LiveOut);
1629 if (Rollback) {
1630 Rollback->LiveMapUpdates.emplace_back(Cand.RegIdx, Cand.LiveIn,
1631 Cand.LiveOut);
1632 }
1633 Cand.rematerialize(Remater);
1634
1635 // Adjust RP targets. The save is guaranteed in regions in which the
1636 // register is live-through and unused but optimistic in all other regions
1637 // where the register is live.
1638 updateRPTargets(Cand.Live, Cand.RPSave);
1639 RecomputeRP |= Cand.UnpredictableRPSave;
1640 RescheduleRegions |= Cand.Live;
1641 if (!TargetRegions.any()) {
1642 REMAT_DEBUG(dbgs() << "All targets cleared, verifying...\n");
1643 break;
1644 }
1645 }
1646
1647 if (!updateAndVerifyRPTargets(RecomputeRP) && !TargetRegions.any()) {
1648 REMAT_DEBUG(dbgs() << "Objectives achieved!\n");
1649 break;
1650 }
1651
1652 // Update the score of remaining candidates and filter out those that have
1653 // become useless from the vector. Candidates never become useful after
1654 // having been useless for a round, so we can freely drop them without
1655 // losing any future rematerialization opportunity.
1656 unsigned NumUsefulCandidates = 0;
1657 for (unsigned CandIdx : CandidateOrder) {
1658 ScoredRemat &Candidate = Candidates[CandIdx];
1659 Candidate.update(TargetRegions, RPTargets, FreqInfo, !TargetOcc);
1660 if (!Candidate.hasNullScore())
1661 CandidateOrder[NumUsefulCandidates++] = CandIdx;
1662 }
1663 if (NumUsefulCandidates == 0) {
1664 REMAT_DEBUG(dbgs() << "Stop on exhausted rematerialization candidates\n");
1665 break;
1666 }
1667 CandidateOrder.truncate(NumUsefulCandidates);
1668 }
1669
1670 if (RescheduleRegions.none())
1671 return false;
1672
1673 // Commit all pressure changes to the DAG and compute minimum achieved
1674 // occupancy in impacted regions.
1675 REMAT_DEBUG(dbgs() << "==== REMAT RESULTS ====\n");
1676 unsigned DynamicVGPRBlockSize = MFI.getDynamicVGPRBlockSize();
1677 for (unsigned I : RescheduleRegions.set_bits()) {
1678 DAG.Pressure[I] = RPTargets[I].getCurrentRP();
1679 REMAT_DEBUG(dbgs() << '[' << I << "] Achieved occupancy "
1680 << DAG.Pressure[I].getOccupancy(ST, DynamicVGPRBlockSize)
1681 << " (" << RPTargets[I] << ")\n");
1682 }
1683 AchievedOcc = MFI.getMaxWavesPerEU();
1684 for (const GCNRegPressure &RP : DAG.Pressure) {
1685 AchievedOcc =
1686 std::min(AchievedOcc, RP.getOccupancy(ST, DynamicVGPRBlockSize));
1687 }
1688
1689 REMAT_DEBUG({
1690 dbgs() << "Retrying function scheduling with new min. occupancy of "
1691 << AchievedOcc << " from rematerializing (original was "
1692 << DAG.MinOccupancy;
1693 if (TargetOcc)
1694 dbgs() << ", target was " << *TargetOcc;
1695 dbgs() << ")\n";
1696 });
1697
1698 DAG.setTargetOccupancy(getStageTargetOccupancy());
1699 return true;
1700}
1701
1703 DAG.finishBlock();
1704 LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n");
1705}
1706
1708 SavedMutations.swap(DAG.Mutations);
1709 S.SGPRLimitBias = S.VGPRLimitBias = 0;
1710 if (DAG.MinOccupancy > InitialOccupancy) {
1711 assert(IsAnyRegionScheduled);
1713 << " stage successfully increased occupancy to "
1714 << DAG.MinOccupancy << '\n');
1715 } else if (!IsAnyRegionScheduled) {
1716 assert(DAG.MinOccupancy == InitialOccupancy);
1718 << ": No regions scheduled, min occupancy stays at "
1719 << DAG.MinOccupancy << ", MFI occupancy stays at "
1720 << MFI.getOccupancy() << ".\n");
1721 }
1722
1724}
1725
1727 // Skip empty scheduling region.
1728 if (DAG.begin() == DAG.end())
1729 return false;
1730
1731 // Check whether this new region is also a new block.
1732 if (DAG.RegionBegin->getParent() != CurrentMBB)
1733 setupNewBlock();
1734
1735 unsigned NumRegionInstrs = std::distance(DAG.begin(), DAG.end());
1736 DAG.enterRegion(CurrentMBB, DAG.begin(), DAG.end(), NumRegionInstrs);
1737
1738 // Skip regions with 1 schedulable instruction.
1739 if (DAG.begin() == std::prev(DAG.end()))
1740 return false;
1741
1742 LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
1743 LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*CurrentMBB)
1744 << " " << CurrentMBB->getName()
1745 << "\n From: " << *DAG.begin() << " To: ";
1746 if (DAG.RegionEnd != CurrentMBB->end()) dbgs() << *DAG.RegionEnd;
1747 else dbgs() << "End";
1748 dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
1749
1750 // Save original instruction order before scheduling for possible revert.
1751 Unsched.clear();
1752 Unsched.reserve(DAG.NumRegionInstrs);
1755 const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG.TII);
1756 for (auto &I : DAG) {
1757 Unsched.push_back(&I);
1758 if (SII->isIGLPMutationOnly(I.getOpcode()))
1759 DAG.RegionsWithIGLPInstrs[RegionIdx] = true;
1760 }
1761 } else {
1762 for (auto &I : DAG)
1763 Unsched.push_back(&I);
1764 }
1765
1766 PressureBefore = DAG.Pressure[RegionIdx];
1767
1768 LLVM_DEBUG(
1769 dbgs() << "Pressure before scheduling:\nRegion live-ins:"
1770 << print(DAG.LiveIns[RegionIdx], DAG.MRI)
1771 << "Region live-in pressure: "
1772 << print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx]))
1773 << "Region register pressure: " << print(PressureBefore));
1774
1775 S.HasHighPressure = false;
1776 S.KnownExcessRP = isRegionWithExcessRP();
1777
1778 if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
1780 SavedMutations.clear();
1781 SavedMutations.swap(DAG.Mutations);
1782 bool IsInitialStage = StageID == GCNSchedStageID::OccInitialSchedule ||
1784 DAG.addMutation(createIGroupLPDAGMutation(
1785 IsInitialStage ? AMDGPU::SchedulingPhase::Initial
1787 }
1788
1789 return true;
1790}
1791
1793 // Only reschedule regions that have excess register pressure (i.e. spilling)
1794 // or had minimum occupancy at the beginning of the stage (as long as
1795 // rescheduling of previous regions did not make occupancy drop back down to
1796 // the initial minimum).
1797 unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();
1798 // If no region has been scheduled yet, the DAG has not yet been updated with
1799 // the occupancy target. So retrieve it from the temporary.
1800 unsigned CurrentTargetOccupancy =
1801 IsAnyRegionScheduled ? DAG.MinOccupancy : TempTargetOccupancy;
1802 if (!DAG.RegionsWithExcessRP[RegionIdx] &&
1803 (CurrentTargetOccupancy <= InitialOccupancy ||
1804 DAG.Pressure[RegionIdx].getOccupancy(ST, DynamicVGPRBlockSize) !=
1805 InitialOccupancy))
1806 return false;
1807
1808 bool IsSchedulingThisRegion = GCNSchedStage::initGCNRegion();
1809 // If this is the first region scheduled during this stage, make the target
1810 // occupancy changes in the DAG and MFI.
1811 if (!IsAnyRegionScheduled && IsSchedulingThisRegion) {
1812 IsAnyRegionScheduled = true;
1813 if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy)
1814 DAG.setTargetOccupancy(TempTargetOccupancy);
1815 }
1816 return IsSchedulingThisRegion;
1817}
1818
1820 // We may need to reschedule this region if it wasn't rescheduled in the last
1821 // stage, or if we found it was testing critical register pressure limits in
1822 // the unclustered reschedule stage. The later is because we may not have been
1823 // able to raise the min occupancy in the previous stage so the region may be
1824 // overly constrained even if it was already rescheduled.
1825 if (!DAG.RegionsWithHighRP[RegionIdx])
1826 return false;
1827
1829}
1830
1832 return !RevertAllRegions && RescheduleRegions[RegionIdx] &&
1834}
1835
1837 if (CurrentMBB)
1838 DAG.finishBlock();
1839
1840 CurrentMBB = DAG.RegionBegin->getParent();
1841 DAG.startBlock(CurrentMBB);
1842 // Get real RP for the region if it hasn't be calculated before. After the
1843 // initial schedule stage real RP will be collected after scheduling.
1847 DAG.computeBlockPressure(RegionIdx, CurrentMBB);
1848}
1849
1851 DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
1852 if (S.HasHighPressure)
1853 DAG.RegionsWithHighRP[RegionIdx] = true;
1854
1855 // Revert scheduling if we have dropped occupancy or there is some other
1856 // reason that the original schedule is better.
1858
1859 if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
1861 SavedMutations.swap(DAG.Mutations);
1862}
1863
1866 // When the goal is to increase occupancy, all regions must reach the target
1867 // occupancy for rematerializations to be possibly useful, otherwise we will
1868 // just hurt latency for no benefit. If minimum occupancy drops below the
1869 // target there is no point in trying to re-schedule further regions.
1870 if (!TargetOcc)
1871 return;
1872 RegionReverts.emplace_back(RegionIdx, Unsched, PressureBefore);
1873 if (DAG.MinOccupancy < *TargetOcc) {
1874 REMAT_DEBUG(dbgs() << "Region " << RegionIdx
1875 << " cannot meet occupancy target, interrupting "
1876 "re-scheduling in all regions\n");
1877 RevertAllRegions = true;
1878 }
1879}
1880
1882 // Check the results of scheduling.
1883 PressureAfter = DAG.getRealRegPressure(RegionIdx);
1884
1885 LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter));
1886 LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");
1887
1888 unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();
1889
1890 if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
1891 PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
1892 DAG.Pressure[RegionIdx] = PressureAfter;
1893
1894 // Early out if we have achieved the occupancy target.
1895 LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
1896 return;
1897 }
1898
1899 unsigned TargetOccupancy = std::min(
1900 S.getTargetOccupancy(), ST.getOccupancyWithWorkGroupSizes(MF).second);
1901 unsigned WavesAfter = std::min(
1902 TargetOccupancy, PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize));
1903 unsigned WavesBefore = std::min(
1904 TargetOccupancy, PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize));
1905 LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
1906 << ", after " << WavesAfter << ".\n");
1907
1908 // We may not be able to keep the current target occupancy because of the just
1909 // scheduled region. We might still be able to revert scheduling if the
1910 // occupancy before was higher, or if the current schedule has register
1911 // pressure higher than the excess limits which could lead to more spilling.
1912 unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);
1913
1914 // Allow memory bound functions to drop to 4 waves if not limited by an
1915 // attribute.
1916 if (WavesAfter < WavesBefore && WavesAfter < DAG.MinOccupancy &&
1917 WavesAfter >= MFI.getMinAllowedOccupancy()) {
1918 LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to "
1919 << MFI.getMinAllowedOccupancy() << " waves\n");
1920 NewOccupancy = WavesAfter;
1921 }
1922
1923 if (NewOccupancy < DAG.MinOccupancy) {
1924 DAG.MinOccupancy = NewOccupancy;
1925 MFI.limitOccupancy(DAG.MinOccupancy);
1926 LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
1927 << DAG.MinOccupancy << ".\n");
1928 }
1929 // The maximum number of arch VGPR on non-unified register file, or the
1930 // maximum VGPR + AGPR in the unified register file case.
1931 unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
1932 // The maximum number of arch VGPR for both unified and non-unified register
1933 // file.
1934 unsigned MaxArchVGPRs = std::min(MaxVGPRs, ST.getAddressableNumArchVGPRs());
1935 unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
1936
1937 if (PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) > MaxVGPRs ||
1938 PressureAfter.getArchVGPRNum() > MaxArchVGPRs ||
1939 PressureAfter.getAGPRNum() > MaxArchVGPRs ||
1940 PressureAfter.getSGPRNum() > MaxSGPRs) {
1941 DAG.RegionsWithHighRP[RegionIdx] = true;
1942 DAG.RegionsWithExcessRP[RegionIdx] = true;
1943 }
1944
1945 // Revert if this region's schedule would cause a drop in occupancy or
1946 // spilling.
1947 if (shouldRevertScheduling(WavesAfter)) {
1949 std::tie(DAG.RegionBegin, DAG.RegionEnd) = DAG.Regions[RegionIdx];
1950 } else {
1951 DAG.Pressure[RegionIdx] = PressureAfter;
1952 }
1953}
1954
1955unsigned
1956GCNSchedStage::computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle,
1957 DenseMap<unsigned, unsigned> &ReadyCycles,
1958 const TargetSchedModel &SM) {
1959 unsigned ReadyCycle = CurrCycle;
1960 for (auto &D : SU.Preds) {
1961 if (D.isAssignedRegDep()) {
1962 MachineInstr *DefMI = D.getSUnit()->getInstr();
1963 unsigned Latency = SM.computeInstrLatency(DefMI);
1964 unsigned DefReady = ReadyCycles[DAG.getSUnit(DefMI)->NodeNum];
1965 ReadyCycle = std::max(ReadyCycle, DefReady + Latency);
1966 }
1967 }
1968 ReadyCycles[SU.NodeNum] = ReadyCycle;
1969 return ReadyCycle;
1970}
1971
1972#ifndef NDEBUG
1974 bool operator()(std::pair<MachineInstr *, unsigned> A,
1975 std::pair<MachineInstr *, unsigned> B) const {
1976 return A.second < B.second;
1977 }
1978};
1979
1980static void printScheduleModel(std::set<std::pair<MachineInstr *, unsigned>,
1981 EarlierIssuingCycle> &ReadyCycles) {
1982 if (ReadyCycles.empty())
1983 return;
1984 unsigned BBNum = ReadyCycles.begin()->first->getParent()->getNumber();
1985 dbgs() << "\n################## Schedule time ReadyCycles for MBB : " << BBNum
1986 << " ##################\n# Cycle #\t\t\tInstruction "
1987 " "
1988 " \n";
1989 unsigned IPrev = 1;
1990 for (auto &I : ReadyCycles) {
1991 if (I.second > IPrev + 1)
1992 dbgs() << "****************************** BUBBLE OF " << I.second - IPrev
1993 << " CYCLES DETECTED ******************************\n\n";
1994 dbgs() << "[ " << I.second << " ] : " << *I.first << "\n";
1995 IPrev = I.second;
1996 }
1997}
1998#endif
1999
2000ScheduleMetrics
2001GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) {
2002#ifndef NDEBUG
2003 std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
2004 ReadyCyclesSorted;
2005#endif
2006 const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
2007 unsigned SumBubbles = 0;
2008 DenseMap<unsigned, unsigned> ReadyCycles;
2009 unsigned CurrCycle = 0;
2010 for (auto &SU : InputSchedule) {
2011 unsigned ReadyCycle =
2012 computeSUnitReadyCycle(SU, CurrCycle, ReadyCycles, SM);
2013 SumBubbles += ReadyCycle - CurrCycle;
2014#ifndef NDEBUG
2015 ReadyCyclesSorted.insert(std::make_pair(SU.getInstr(), ReadyCycle));
2016#endif
2017 CurrCycle = ++ReadyCycle;
2018 }
2019#ifndef NDEBUG
2020 LLVM_DEBUG(
2021 printScheduleModel(ReadyCyclesSorted);
2022 dbgs() << "\n\t"
2023 << "Metric: "
2024 << (SumBubbles
2025 ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
2026 : 1)
2027 << "\n\n");
2028#endif
2029
2030 return ScheduleMetrics(CurrCycle, SumBubbles);
2031}
2032
2035#ifndef NDEBUG
2036 std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
2037 ReadyCyclesSorted;
2038#endif
2039 const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
2040 unsigned SumBubbles = 0;
2041 DenseMap<unsigned, unsigned> ReadyCycles;
2042 unsigned CurrCycle = 0;
2043 for (auto &MI : DAG) {
2044 SUnit *SU = DAG.getSUnit(&MI);
2045 if (!SU)
2046 continue;
2047 unsigned ReadyCycle =
2048 computeSUnitReadyCycle(*SU, CurrCycle, ReadyCycles, SM);
2049 SumBubbles += ReadyCycle - CurrCycle;
2050#ifndef NDEBUG
2051 ReadyCyclesSorted.insert(std::make_pair(SU->getInstr(), ReadyCycle));
2052#endif
2053 CurrCycle = ++ReadyCycle;
2054 }
2055#ifndef NDEBUG
2056 LLVM_DEBUG(
2057 printScheduleModel(ReadyCyclesSorted);
2058 dbgs() << "\n\t"
2059 << "Metric: "
2060 << (SumBubbles
2061 ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
2062 : 1)
2063 << "\n\n");
2064#endif
2065
2066 return ScheduleMetrics(CurrCycle, SumBubbles);
2067}
2068
2069bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
2070 if (WavesAfter < DAG.MinOccupancy)
2071 return true;
2072
2073 // For dynamic VGPR mode, we don't want to waste any VGPR blocks.
2074 if (DAG.MFI.isDynamicVGPREnabled()) {
2075 unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
2076 ST, DAG.MFI.getDynamicVGPRBlockSize(),
2077 PressureBefore.getVGPRNum(false));
2078 unsigned BlocksAfter = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
2079 ST, DAG.MFI.getDynamicVGPRBlockSize(), PressureAfter.getVGPRNum(false));
2080 if (BlocksAfter > BlocksBefore)
2081 return true;
2082 }
2083
2084 return false;
2085}
2086
2089 return false;
2090
2092 return true;
2093
2094 if (mayCauseSpilling(WavesAfter))
2095 return true;
2096
2097 return false;
2098}
2099
2101 // If RP is not reduced in the unclustered reschedule stage, revert to the
2102 // old schedule.
2103 if ((WavesAfter <=
2104 PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()) &&
2105 mayCauseSpilling(WavesAfter)) ||
2107 LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
2108 return true;
2109 }
2110
2111 // Do not attempt to relax schedule even more if we are already spilling.
2113 return false;
2114
2115 LLVM_DEBUG(
2116 dbgs()
2117 << "\n\t *** In shouldRevertScheduling ***\n"
2118 << " *********** BEFORE UnclusteredHighRPStage ***********\n");
2119 ScheduleMetrics MBefore = getScheduleMetrics(DAG.SUnits);
2120 LLVM_DEBUG(
2121 dbgs()
2122 << "\n *********** AFTER UnclusteredHighRPStage ***********\n");
2124 unsigned OldMetric = MBefore.getMetric();
2125 unsigned NewMetric = MAfter.getMetric();
2126 unsigned WavesBefore = std::min(
2127 S.getTargetOccupancy(),
2128 PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()));
2129 unsigned Profit =
2130 ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore *
2132 NewMetric) /
2134 LLVM_DEBUG(dbgs() << "\tMetric before " << MBefore << "\tMetric after "
2135 << MAfter << "Profit: " << Profit << "\n");
2136 return Profit < ScheduleMetrics::ScaleFactor;
2137}
2138
2141 return false;
2142
2144 return true;
2145
2146 if (mayCauseSpilling(WavesAfter))
2147 return true;
2148
2149 return false;
2150}
2151
2153 // When trying to increase occupancy (TargetOcc == true) the stage manages
2154 // region reverts globally (all or none), so we always return false here.
2155 return !TargetOcc && mayCauseSpilling(WavesAfter);
2156}
2157
2159 if (mayCauseSpilling(WavesAfter))
2160 return true;
2161
2162 return false;
2163}
2164
2166 unsigned WavesAfter) {
2167 return mayCauseSpilling(WavesAfter);
2168}
2169
2170bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
2171 if (WavesAfter <= MFI.getMinWavesPerEU() && isRegionWithExcessRP() &&
2173 LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
2174 return true;
2175 }
2176
2177 return false;
2178}
2179
2181 ArrayRef<MachineInstr *> MIOrder) {
2182 assert(static_cast<size_t>(std::distance(DAG.Regions[RegionIdx].first,
2183 DAG.Regions[RegionIdx].second)) ==
2184 MIOrder.size() &&
2185 "instruction number mismatch");
2186 if (MIOrder.empty())
2187 return;
2188
2189 LLVM_DEBUG(dbgs() << "Reverting scheduling for region " << RegionIdx << '\n');
2190
2191 // Reconstruct MI sequence by moving instructions in desired order before
2192 // the current region's start.
2193 MachineBasicBlock::iterator RegionEnd = DAG.Regions[RegionIdx].first;
2194 MachineBasicBlock *MBB = MIOrder.front()->getParent();
2195 for (MachineInstr *MI : MIOrder) {
2196 // Either move the next MI in order before the end of the region or move the
2197 // region end past the MI if it is at the correct position.
2198 MachineBasicBlock::iterator MII = MI->getIterator();
2199 if (MII != RegionEnd) {
2200 // Will subsequent splice move MI up past a non-debug instruction?
2201 bool NonDebugReordered =
2202 !MI->isDebugInstr() &&
2203 skipDebugInstructionsForward(RegionEnd, MII) != MII;
2204 MBB->splice(RegionEnd, MBB, MI);
2205 // Only update LiveIntervals information if non-debug instructions are
2206 // reordered. Otherwise debug instructions could cause code generation to
2207 // change.
2208 if (NonDebugReordered)
2209 DAG.LIS->handleMove(*MI, true);
2210 } else {
2211 // MI is already at the expected position. However, earlier splices in
2212 // this loop may have changed neighboring slot indices, so this MI's
2213 // slot index can become non-monotonic w.r.t. the physical MBB order.
2214 // Only re-seat when monotonicity is actually violated to avoid
2215 // unnecessary LiveInterval changes that could perturb scheduling.
2216 if (!MI->isDebugInstr()) {
2217 SlotIndex MIIdx = DAG.LIS->getInstructionIndex(*MI);
2218 SlotIndex PrevIdx = DAG.LIS->getSlotIndexes()->getIndexBefore(*MI);
2219 if (PrevIdx >= MIIdx)
2220 DAG.LIS->handleMove(*MI, true);
2221 }
2222 ++RegionEnd;
2223 }
2224 if (MI->isDebugInstr()) {
2225 LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
2226 continue;
2227 }
2228
2229 // Reset read-undef flags and update them later.
2230 for (MachineOperand &Op : MI->all_defs())
2231 Op.setIsUndef(false);
2232 RegisterOperands RegOpers;
2233 RegOpers.collect(*MI, *DAG.TRI, DAG.MRI, DAG.ShouldTrackLaneMasks, false);
2234 if (DAG.ShouldTrackLaneMasks) {
2235 // Adjust liveness and add missing dead+read-undef flags.
2236 SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(*MI).getRegSlot();
2237 RegOpers.adjustLaneLiveness(*DAG.LIS, DAG.MRI, SlotIdx, MI);
2238 } else {
2239 // Adjust for missing dead-def flags.
2240 RegOpers.detectDeadDefs(*MI, *DAG.LIS);
2241 }
2242 LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
2243 }
2244
2245 // The region end doesn't change throughout scheduling since it itself is
2246 // outside the region (whether that is a MBB end or a terminator MI).
2247 assert(RegionEnd == DAG.Regions[RegionIdx].second && "region end mismatch");
2248 DAG.Regions[RegionIdx].first = MIOrder.front();
2249}
2250
2251/// Returns true when \p RD will already be in AGPR-form after the rewrite, so
2252/// no bridge copy is needed at this reaching definition.
2254 const DenseSet<Register> &CandSrc2Regs,
2255 const SIInstrInfo &TII) {
2256 if (TII.isMAI(*RD))
2257 return true;
2258 if (RD->getOpcode() == AMDGPU::AV_MOV_B32_IMM_PSEUDO ||
2259 RD->getOpcode() == AMDGPU::AV_MOV_B64_IMM_PSEUDO)
2260 return true;
2261 if (RD->isCopy() && CandSrc2Regs.contains(RD->getOperand(1).getReg()))
2262 return true;
2263 return false;
2264}
2265
2266void RewriteMFMAFormStage::resetRewriteCandsToVGPR(
2267 ArrayRef<std::pair<MachineInstr *, unsigned>> RewriteCands) {
2268 for (auto [MI, OriginalOpcode] : RewriteCands) {
2269 assert(TII->isMAI(*MI));
2270 const TargetRegisterClass *ADefRC =
2271 DAG.MRI.getRegClass(MI->getOperand(0).getReg());
2272 const TargetRegisterClass *VDefRC = SRI->getEquivalentVGPRClass(ADefRC);
2273 DAG.MRI.setRegClass(MI->getOperand(0).getReg(), VDefRC);
2274 MI->setDesc(TII->get(OriginalOpcode));
2275
2276 MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
2277 if (!Src2->isReg())
2278 continue;
2279
2280 // Have to get src types separately since subregs may cause C and D
2281 // registers to be different types even though the actual operand is
2282 // the same size.
2283 const TargetRegisterClass *AUseRC = DAG.MRI.getRegClass(Src2->getReg());
2284 const TargetRegisterClass *VUseRC = SRI->getEquivalentVGPRClass(AUseRC);
2285 DAG.MRI.setRegClass(Src2->getReg(), VUseRC);
2286 }
2287}
2288
2289bool RewriteMFMAFormStage::isRewriteCandidate(MachineInstr *MI) const {
2290 if (!static_cast<const SIInstrInfo *>(DAG.TII)->isMAI(*MI))
2291 return false;
2292 if (AMDGPU::getMFMASrcCVDstAGPROp(MI->getOpcode()) == -1)
2293 return false;
2294 // Reject candidates whose users force an unavoidable bridge copy.
2295 Register DstReg = MI->getOperand(0).getReg();
2296 for (const MachineOperand &Use : DAG.MRI.use_nodbg_operands(DstReg)) {
2297 if (!TII->isMAI(*Use.getParent()) && !Use.getParent()->isCopy())
2298 return false;
2299 }
2300 return true;
2301}
2302
2303bool RewriteMFMAFormStage::initHeuristics(
2304 std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
2305 DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
2306 SmallPtrSetImpl<MachineInstr *> &CopyForDef) {
2307 bool Changed = false;
2308
2309 // Collect the candidate group, its members share AGPR-form operands
2310 // post-rewrite, so reaching defs feeding any member don't need bridge copy.
2311 SmallPtrSet<MachineInstr *, 16> RewriteSet;
2312 DenseSet<Register> CandSrc2Regs;
2313 for (MachineBasicBlock &MBB : MF) {
2314 for (MachineInstr &MI : MBB) {
2315 if (!isRewriteCandidate(&MI))
2316 continue;
2317 RewriteSet.insert(&MI);
2318 MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
2319 if (Src2 && Src2->isReg())
2320 CandSrc2Regs.insert(Src2->getReg());
2321 }
2322 }
2323
2324 // Prepare for the heuristics
2325 for (MachineBasicBlock &MBB : MF) {
2326 for (MachineInstr &MI : MBB) {
2327 if (!isRewriteCandidate(&MI))
2328 continue;
2329
2330 int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode());
2331 assert(ReplacementOp != -1);
2332
2333 RewriteCands.push_back({&MI, MI.getOpcode()});
2334 MI.setDesc(TII->get(ReplacementOp));
2335
2336 MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
2337 if (Src2->isReg()) {
2338 SmallVector<SlotIndex, 8> Src2ReachingDefs;
2339 findReachingDefs(*Src2, DAG.LIS, Src2ReachingDefs);
2340
2341 for (SlotIndex RDIdx : Src2ReachingDefs) {
2342 MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIdx);
2343 if (isReachingDefAGPRForm(RD, CandSrc2Regs, *TII))
2344 continue;
2345 CopyForDef.insert(RD);
2346 }
2347 }
2348
2349 MachineOperand &Dst = MI.getOperand(0);
2350 SmallVector<MachineOperand *, 8> DstReachingUses;
2351
2352 findReachingUses(&MI, DAG.LIS, DstReachingUses);
2353
2354 for (MachineOperand *RUOp : DstReachingUses) {
2355 MachineInstr *UserMI = RUOp->getParent();
2356 // Group members read the AGPR result directly.
2357 if (TII->isMAI(*UserMI) && RewriteSet.contains(UserMI))
2358 continue;
2359
2360 // For any user of the result of the MFMA which is not an MFMA, we
2361 // insert a copy. For a given register, we will only insert one copy
2362 // per user block.
2363 CopyForUse[UserMI->getParent()].insert(RUOp->getReg());
2364
2365 if (TII->isMAI(*UserMI))
2366 continue;
2367
2368 SmallVector<SlotIndex, 8> DstUsesReachingDefs;
2369 findReachingDefs(*RUOp, DAG.LIS, DstUsesReachingDefs);
2370
2371 for (SlotIndex RDIndex : DstUsesReachingDefs) {
2372 MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIndex);
2373 if (TII->isMAI(*RD))
2374 continue;
2375
2376 // For any definition of the user of the MFMA which is not an MFMA,
2377 // we insert a copy. We do this to transform all the reaching defs
2378 // of this use to AGPR. By doing this, we can insert a copy from
2379 // AGPR to VGPR at the user rather than after the MFMA.
2380 CopyForDef.insert(RD);
2381 }
2382 }
2383
2384 // Do the rewrite to allow for updated RP calculation.
2385 const TargetRegisterClass *VDefRC = DAG.MRI.getRegClass(Dst.getReg());
2386 const TargetRegisterClass *ADefRC = SRI->getEquivalentAGPRClass(VDefRC);
2387 DAG.MRI.setRegClass(Dst.getReg(), ADefRC);
2388 if (Src2->isReg()) {
2389 // Have to get src types separately since subregs may cause C and D
2390 // registers to be different types even though the actual operand is
2391 // the same size.
2392 const TargetRegisterClass *VUseRC = DAG.MRI.getRegClass(Src2->getReg());
2393 const TargetRegisterClass *AUseRC = SRI->getEquivalentAGPRClass(VUseRC);
2394 DAG.MRI.setRegClass(Src2->getReg(), AUseRC);
2395 }
2396 Changed = true;
2397 }
2398 }
2399
2400 return Changed;
2401}
2402
2403int64_t RewriteMFMAFormStage::getRewriteCost(
2404 ArrayRef<std::pair<MachineInstr *, unsigned>> RewriteCands,
2405 const DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
2406 const SmallPtrSetImpl<MachineInstr *> &CopyForDef) {
2407 MachineBlockFrequencyInfo *MBFI = DAG.MBFI;
2408
2409 int64_t BestSpillCost = 0;
2410 int64_t Cost = 0;
2411 uint64_t EntryFreq = MBFI->getEntryFreq().getFrequency();
2412
2413 std::pair<unsigned, unsigned> MaxVectorRegs =
2414 ST.getMaxNumVectorRegs(MF.getFunction());
2415 unsigned ArchVGPRThreshold = MaxVectorRegs.first;
2416 unsigned AGPRThreshold = MaxVectorRegs.second;
2417 unsigned CombinedThreshold = ST.getMaxNumVGPRs(MF);
2418
2419 for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
2420 if (!RegionsWithExcessArchVGPR[Region])
2421 continue;
2422
2423 GCNRegPressure &PressureBefore = DAG.Pressure[Region];
2424 unsigned SpillCostBefore = PressureBefore.getVGPRSpills(
2425 MF, ArchVGPRThreshold, AGPRThreshold, CombinedThreshold);
2426
2427 // For the cases we care about (i.e. ArchVGPR usage is greater than the
2428 // addressable limit), rewriting alone should bring pressure to manageable
2429 // level. If we find any such region, then the rewrite is potentially
2430 // beneficial.
2431 GCNRegPressure PressureAfter = DAG.getRealRegPressure(Region);
2432 unsigned SpillCostAfter = PressureAfter.getVGPRSpills(
2433 MF, ArchVGPRThreshold, AGPRThreshold, CombinedThreshold);
2434
2435 uint64_t BlockFreq =
2436 MBFI->getBlockFreq(DAG.Regions[Region].first->getParent())
2437 .getFrequency();
2438
2439 bool RelativeFreqIsDenom = EntryFreq > BlockFreq;
2440 uint64_t RelativeFreq = EntryFreq && BlockFreq
2441 ? (RelativeFreqIsDenom ? EntryFreq / BlockFreq
2442 : BlockFreq / EntryFreq)
2443 : 1;
2444
2445 // This assumes perfect spilling / splitting -- using one spill / copy
2446 // instruction and one restoreFrom / copy for each excess register,
2447 int64_t SpillCost = ((int)SpillCostAfter - (int)SpillCostBefore) * 2;
2448
2449 // Also account for the block frequency.
2450 if (RelativeFreqIsDenom)
2451 SpillCost /= (int64_t)RelativeFreq;
2452 else
2453 SpillCost *= (int64_t)RelativeFreq;
2454
2455 // If we have increased spilling in any block, just bail.
2456 if (SpillCost > 0) {
2457 resetRewriteCandsToVGPR(RewriteCands);
2458 return SpillCost;
2459 }
2460
2461 if (SpillCost < BestSpillCost)
2462 BestSpillCost = SpillCost;
2463 }
2464
2465 // Set the cost to the largest decrease in spill cost in order to not double
2466 // count spill reductions.
2467 Cost = BestSpillCost;
2468 assert(Cost <= 0);
2469
2470 unsigned CopyCost = 0;
2471
2472 // For each CopyForDef, increase the cost by the register size while
2473 // accounting for block frequency.
2474 for (MachineInstr *DefMI : CopyForDef) {
2475 Register DefReg = DefMI->getOperand(0).getReg();
2476 uint64_t DefFreq =
2477 EntryFreq
2478 ? MBFI->getBlockFreq(DefMI->getParent()).getFrequency() / EntryFreq
2479 : 1;
2480
2481 const TargetRegisterClass *RC = DAG.MRI.getRegClass(DefReg);
2482 CopyCost += RC->getCopyCost() * DefFreq;
2483 }
2484
2485 // Account for CopyForUse copies in each block that the register is used.
2486 for (auto &[UseBlock, UseRegs] : CopyForUse) {
2487 uint64_t UseFreq =
2488 EntryFreq ? MBFI->getBlockFreq(UseBlock).getFrequency() / EntryFreq : 1;
2489
2490 for (Register UseReg : UseRegs) {
2491 const TargetRegisterClass *RC = DAG.MRI.getRegClass(UseReg);
2492 CopyCost += RC->getCopyCost() * UseFreq;
2493 }
2494 }
2495
2496 // Reset the classes that were changed to AGPR for better register bank
2497 // analysis. We must do rewriting after copy-insertion, as some defs of the
2498 // register may require VGPR. Additionally, if we bail out and don't perform
2499 // the rewrite then these need to be restored anyway.
2500 resetRewriteCandsToVGPR(RewriteCands);
2501
2502 return Cost + CopyCost;
2503}
2504
2505bool RewriteMFMAFormStage::rewrite(
2506 ArrayRef<std::pair<MachineInstr *, unsigned>> RewriteCands) {
2507 DenseMap<MachineInstr *, unsigned> FirstMIToRegion;
2508 DenseMap<MachineInstr *, unsigned> LastMIToRegion;
2509
2510 for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
2511 RegionBoundaries Entry = DAG.Regions[Region];
2512 if (Entry.first == Entry.second)
2513 continue;
2514
2515 FirstMIToRegion[&*Entry.first] = Region;
2516 if (Entry.second != Entry.first->getParent()->end())
2517 LastMIToRegion[&*Entry.second] = Region;
2518 }
2519
2520 // Rewrite the MFMAs to AGPR, and insert any copies as needed.
2521 // The general assumption of the algorithm (and the previous cost calculation)
2522 // is that it is better to insert the copies in the MBB of the def of the src2
2523 // operands, and in the MBB of the user of the dest operands. This is based on
2524 // the assumption that the MFMAs are likely to appear in loop bodies, while
2525 // the src2 and dest operands are live-in / live-out of the loop. Due to this
2526 // design, the algorithm for finding copy insertion points is more
2527 // complicated.
2528 //
2529 // There are three main cases to handle: 1. the reaching defs of the src2
2530 // operands, 2. the reaching uses of the dst operands, and 3. the reaching
2531 // defs of the reaching uses of the dst operand.
2532 //
2533 // In the first case, we simply insert copies after each of the reaching
2534 // definitions. In the second case, we collect all the uses of a given dest
2535 // and organize them by MBB. Then, we insert 1 copy for each MBB before the
2536 // earliest use. Since the use may have multiple reaching defs, and since we
2537 // want to replace the register it is using with the result of the copy, we
2538 // must handle case 3. In the third case, we simply insert a copy after each
2539 // of the reaching defs to connect to the copy of the reaching uses of the dst
2540 // reg. This allows us to avoid inserting copies next to the MFMAs.
2541 //
2542 // While inserting the copies, we maintain a map of operands which will use
2543 // different regs (i.e. the result of the copies). For example, a case 1 src2
2544 // operand will use the register result of the copies after the reaching defs,
2545 // as opposed to the original register. Now that we have completed our copy
2546 // analysis and placement, we can bulk update the registers. We do this
2547 // separately as to avoid complicating the reachingDef and reachingUse
2548 // queries.
2549 //
2550 // While inserting the copies, we also maintain a list or registers which we
2551 // will want to reclassify as AGPR. After doing the copy insertion and the
2552 // register replacement, we can finally do the reclassification. This uses the
2553 // redef map, as the registers we are interested in reclassifying may be
2554 // replaced by the result of a copy. We must do this after the copy analysis
2555 // and placement as we must have an accurate redef map -- otherwise we may end
2556 // up creating illegal instructions.
2557
2558 // The original registers of the MFMA that need to be reclassified as AGPR.
2559 DenseSet<Register> RewriteRegs;
2560 // The map of an original register in the MFMA to a new register (result of a
2561 // copy) that it should be replaced with.
2562 DenseMap<Register, Register> RedefMap;
2563 // The map of the original MFMA registers to the relevant MFMA operands.
2564 DenseMap<Register, DenseSet<MachineOperand *>> ReplaceMap;
2565 // The map of reaching defs for a given register -- to avoid duplicate copies.
2566 DenseMap<Register, SmallPtrSet<MachineInstr *, 8>> ReachingDefCopyMap;
2567 // The map of reaching uses for a given register by basic block -- to avoid
2568 // duplicate copies and to calculate per MBB insert pts.
2569 DenseMap<unsigned, DenseMap<Register, SmallPtrSet<MachineOperand *, 8>>>
2570 ReachingUseTracker;
2571
2572 // Collect the candidate group; its members share AGPR-form operands
2573 // post-rewrite, so reaching defs feeding any member need no bridge copy.
2574 SmallPtrSet<MachineInstr *, 16> RewriteCandsSet;
2575 DenseSet<Register> RewriteSrc2Regs;
2576 for (auto &[MI, OriginalOpcode] : RewriteCands) {
2577 RewriteCandsSet.insert(MI);
2578 MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
2579 if (Src2 && Src2->isReg())
2580 RewriteSrc2Regs.insert(Src2->getReg());
2581 }
2582
2583 for (auto &[MI, OriginalOpcode] : RewriteCands) {
2584 int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(MI->getOpcode());
2585 if (ReplacementOp == -1)
2586 continue;
2587 MI->setDesc(TII->get(ReplacementOp));
2588
2589 // Case 1: insert copies for the reaching defs of the Src2Reg.
2590 MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
2591 if (Src2->isReg()) {
2592 Register Src2Reg = Src2->getReg();
2593 if (!Src2Reg.isVirtual())
2594 return false;
2595
2596 Register MappedReg = Src2->getReg();
2597 SmallVector<SlotIndex, 8> Src2ReachingDefs;
2598 findReachingDefs(*Src2, DAG.LIS, Src2ReachingDefs);
2599 SmallSetVector<MachineInstr *, 8> Src2DefsReplace;
2600
2601 for (SlotIndex RDIndex : Src2ReachingDefs) {
2602 MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIndex);
2603 if (isReachingDefAGPRForm(RD, RewriteSrc2Regs, *TII))
2604 continue;
2605
2606 Src2DefsReplace.insert(RD);
2607 }
2608
2609 if (!Src2DefsReplace.empty()) {
2610 auto RI = RedefMap.find(Src2Reg);
2611 if (RI != RedefMap.end()) {
2612 MappedReg = RI->second;
2613 } else {
2614 assert(!ReachingDefCopyMap.contains(Src2Reg));
2615 const TargetRegisterClass *Src2RC = DAG.MRI.getRegClass(Src2Reg);
2616 const TargetRegisterClass *VGPRRC =
2617 SRI->getEquivalentVGPRClass(Src2RC);
2618
2619 // Track the mapping of the original register to the new register.
2620 MappedReg = DAG.MRI.createVirtualRegister(VGPRRC);
2621 RedefMap[Src2Reg] = MappedReg;
2622 }
2623
2624 // If none exists, create a copy from this reaching def.
2625 // We may have inserted a copy already in an earlier iteration.
2626 for (MachineInstr *RD : Src2DefsReplace) {
2627 // Do not create redundant copies.
2628 if (ReachingDefCopyMap[Src2Reg].insert(RD).second) {
2629 MachineInstrBuilder VGPRCopy =
2630 BuildMI(*RD->getParent(), std::next(RD->getIterator()),
2631 RD->getDebugLoc(), TII->get(TargetOpcode::COPY))
2632 .addDef(MappedReg, {}, 0)
2633 .addUse(Src2Reg, {}, 0);
2634 DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);
2635
2636 // If this reaching def was the last MI in the region, update the
2637 // region boundaries.
2638 if (LastMIToRegion.contains(RD)) {
2639 unsigned UpdateRegion = LastMIToRegion[RD];
2640 DAG.Regions[UpdateRegion].second = VGPRCopy;
2641 LastMIToRegion.erase(RD);
2642 }
2643 }
2644 }
2645 }
2646
2647 // Track the register for reclassification
2648 RewriteRegs.insert(Src2Reg);
2649
2650 // Always insert the operand for replacement. If this corresponds with a
2651 // chain of tied-def we may not see the VGPR requirement until later.
2652 ReplaceMap[Src2Reg].insert(Src2);
2653 }
2654
2655 // Case 2 and Case 3: insert copies before the reaching uses of the dsts,
2656 // and after the reaching defs of the reaching uses of the dsts.
2657
2658 MachineOperand *Dst = &MI->getOperand(0);
2659 Register DstReg = Dst->getReg();
2660 if (!DstReg.isVirtual())
2661 return false;
2662
2663 Register MappedReg = DstReg;
2664 SmallVector<MachineOperand *, 8> DstReachingUses;
2665
2666 SmallVector<MachineOperand *, 8> DstReachingUseCopies;
2667 SmallVector<MachineInstr *, 8> DstUseDefsReplace;
2668
2669 findReachingUses(MI, DAG.LIS, DstReachingUses);
2670
2671 for (MachineOperand *RUOp : DstReachingUses) {
2672 MachineInstr *UserMI = RUOp->getParent();
2673 // Group members read the AGPR result directly.
2674 if (TII->isMAI(*UserMI) && RewriteCandsSet.contains(UserMI))
2675 continue;
2676
2677 // If there is a non mai reaching use, then we need a copy.
2678 if (find(DstReachingUseCopies, RUOp) == DstReachingUseCopies.end())
2679 DstReachingUseCopies.push_back(RUOp);
2680
2681 // Non-rewritten MAI: its defs aren't being reclassified.
2682 if (TII->isMAI(*UserMI))
2683 continue;
2684
2685 SmallVector<SlotIndex, 8> DstUsesReachingDefs;
2686 findReachingDefs(*RUOp, DAG.LIS, DstUsesReachingDefs);
2687
2688 for (SlotIndex RDIndex : DstUsesReachingDefs) {
2689 MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIndex);
2690 if (TII->isMAI(*RD))
2691 continue;
2692
2693 // If there is a non mai reaching def of this reaching use, then we will
2694 // need a copy.
2695 if (find(DstUseDefsReplace, RD) == DstUseDefsReplace.end())
2696 DstUseDefsReplace.push_back(RD);
2697 }
2698 }
2699
2700 if (!DstUseDefsReplace.empty()) {
2701 auto RI = RedefMap.find(DstReg);
2702 if (RI != RedefMap.end()) {
2703 MappedReg = RI->second;
2704 } else {
2705 assert(!ReachingDefCopyMap.contains(DstReg));
2706 const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(DstReg);
2707 const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(DstRC);
2708
2709 // Track the mapping of the original register to the new register.
2710 MappedReg = DAG.MRI.createVirtualRegister(VGPRRC);
2711 RedefMap[DstReg] = MappedReg;
2712 }
2713
2714 // If none exists, create a copy from this reaching def.
2715 // We may have inserted a copy already in an earlier iteration.
2716 for (MachineInstr *RD : DstUseDefsReplace) {
2717 // Do not create reundant copies.
2718 if (ReachingDefCopyMap[DstReg].insert(RD).second) {
2719 MachineInstrBuilder VGPRCopy =
2720 BuildMI(*RD->getParent(), std::next(RD->getIterator()),
2721 RD->getDebugLoc(), TII->get(TargetOpcode::COPY))
2722 .addDef(MappedReg, {}, 0)
2723 .addUse(DstReg, {}, 0);
2724 DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);
2725
2726 // If this reaching def was the last MI in the region, update the
2727 // region boundaries.
2728 auto LMI = LastMIToRegion.find(RD);
2729 if (LMI != LastMIToRegion.end()) {
2730 unsigned UpdateRegion = LMI->second;
2731 DAG.Regions[UpdateRegion].second = VGPRCopy;
2732 LastMIToRegion.erase(RD);
2733 }
2734 }
2735 }
2736 }
2737
2738 DenseSet<MachineOperand *> &DstRegSet = ReplaceMap[DstReg];
2739 for (MachineOperand *RU : DstReachingUseCopies) {
2740 MachineBasicBlock *RUBlock = RU->getParent()->getParent();
2741 // Just keep track of the reaching use of this register by block. After we
2742 // have scanned all the MFMAs we can find optimal insert pts.
2743 if (RUBlock != MI->getParent()) {
2744 ReachingUseTracker[RUBlock->getNumber()][DstReg].insert(RU);
2745 continue;
2746 }
2747
2748 // Special case, the use is in the same block as the MFMA. Insert the copy
2749 // just before the use.
2750 const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(DstReg);
2751 const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(DstRC);
2752 Register NewUseReg = DAG.MRI.createVirtualRegister(VGPRRC);
2753 MachineInstr *UseInst = RU->getParent();
2754 MachineInstrBuilder VGPRCopy =
2755 BuildMI(*UseInst->getParent(), UseInst->getIterator(),
2756 UseInst->getDebugLoc(), TII->get(TargetOpcode::COPY))
2757 .addDef(NewUseReg, {}, 0)
2758 .addUse(DstReg, {}, 0);
2759 DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);
2760 // Since we know this use has only one reaching def, we can replace the
2761 // use reg.
2762 RU->setReg(NewUseReg);
2763 // Track the copy source operand for r eplacement.
2764 DstRegSet.insert(&VGPRCopy->getOperand(1));
2765 }
2766
2767 // Track the register for reclassification
2768 RewriteRegs.insert(DstReg);
2769
2770 // Insert the dst operand for replacement. If this dst is in a chain of
2771 // tied-def MFMAs, and the first src2 needs to be replaced with a new reg,
2772 // all the correspond operands need to be replaced.
2773 DstRegSet.insert(Dst);
2774 }
2775
2776 // Handle the copies for dst uses.
2777 using RUBType =
2778 std::pair<unsigned, DenseMap<Register, SmallPtrSet<MachineOperand *, 8>>>;
2779 for (RUBType RUBlockEntry : ReachingUseTracker) {
2780 using RUDType = std::pair<Register, SmallPtrSet<MachineOperand *, 8>>;
2781 for (RUDType RUDst : RUBlockEntry.second) {
2782 MachineOperand *OpBegin = *RUDst.second.begin();
2783 SlotIndex InstPt = DAG.LIS->getInstructionIndex(*OpBegin->getParent());
2784
2785 // Find the earliest use in this block.
2786 for (MachineOperand *User : RUDst.second) {
2787 SlotIndex NewInstPt = DAG.LIS->getInstructionIndex(*User->getParent());
2788 if (SlotIndex::isEarlierInstr(NewInstPt, InstPt))
2789 InstPt = NewInstPt;
2790 }
2791
2792 const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(RUDst.first);
2793 const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(DstRC);
2794 Register NewUseReg = DAG.MRI.createVirtualRegister(VGPRRC);
2795 MachineInstr *UseInst = DAG.LIS->getInstructionFromIndex(InstPt);
2796
2797 MachineInstrBuilder VGPRCopy =
2798 BuildMI(*UseInst->getParent(), UseInst->getIterator(),
2799 UseInst->getDebugLoc(), TII->get(TargetOpcode::COPY))
2800 .addDef(NewUseReg, {}, 0)
2801 .addUse(RUDst.first, {}, 0);
2802 DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);
2803
2804 // If this UseInst was the first MI in the region, update the region
2805 // boundaries.
2806 auto FI = FirstMIToRegion.find(UseInst);
2807 if (FI != FirstMIToRegion.end()) {
2808 unsigned UpdateRegion = FI->second;
2809 DAG.Regions[UpdateRegion].first = VGPRCopy;
2810 FirstMIToRegion.erase(UseInst);
2811 }
2812
2813 // Replace the operand for all users.
2814 for (MachineOperand *User : RUDst.second) {
2815 User->setReg(NewUseReg);
2816 }
2817
2818 // Track the copy source operand for replacement.
2819 ReplaceMap[RUDst.first].insert(&VGPRCopy->getOperand(1));
2820 }
2821 }
2822
2823 // We may have needed to insert copies after the reaching defs of the MFMAs.
2824 // Replace the original register with the result of the copy for all relevant
2825 // operands.
2826 for (std::pair<Register, Register> NewDef : RedefMap) {
2827 Register OldReg = NewDef.first;
2828 Register NewReg = NewDef.second;
2829
2830 // Replace the register for any associated operand in the MFMA chain.
2831 for (MachineOperand *ReplaceOp : ReplaceMap[OldReg])
2832 ReplaceOp->setReg(NewReg);
2833 }
2834
2835 // Finally, do the reclassification of the MFMA registers.
2836 for (Register RewriteReg : RewriteRegs) {
2837 Register RegToRewrite = RewriteReg;
2838
2839 // Be sure to update the replacement register and not the original.
2840 auto RI = RedefMap.find(RewriteReg);
2841 if (RI != RedefMap.end())
2842 RegToRewrite = RI->second;
2843
2844 const TargetRegisterClass *CurrRC = DAG.MRI.getRegClass(RegToRewrite);
2845 const TargetRegisterClass *AGPRRC = SRI->getEquivalentAGPRClass(CurrRC);
2846
2847 DAG.MRI.setRegClass(RegToRewrite, AGPRRC);
2848 }
2849
2850 // Bulk update the LIS.
2851 DAG.LIS->reanalyze(DAG.MF);
2852 // Liveins may have been modified for cross RC copies
2853 RegionPressureMap LiveInUpdater(&DAG, false);
2854 LiveInUpdater.buildLiveRegMap();
2855
2856 for (unsigned Region = 0; Region < DAG.Regions.size(); Region++)
2857 DAG.LiveIns[Region] = LiveInUpdater.getLiveRegsForRegionIdx(Region);
2858
2859 DAG.Pressure[RegionIdx] = DAG.getRealRegPressure(RegionIdx);
2860
2861 return true;
2862}
2863
2864unsigned PreRARematStage::getStageTargetOccupancy() const {
2865 return TargetOcc ? *TargetOcc : MFI.getMinWavesPerEU();
2866}
2867
2868bool PreRARematStage::setObjective() {
2869 const Function &F = MF.getFunction();
2870
2871 // Set up "spilling targets" for all regions.
2872 unsigned MaxSGPRs = ST.getMaxNumSGPRs(F);
2873 unsigned MaxVGPRs = ST.getMaxNumVGPRs(F);
2874 bool HasVectorRegisterExcess = false;
2875 for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
2876 const GCNRegPressure &RP = DAG.Pressure[I];
2877 GCNRPTarget &Target = RPTargets.emplace_back(MaxSGPRs, MaxVGPRs, MF, RP);
2878 if (!Target.satisfied())
2879 TargetRegions.set(I);
2880 HasVectorRegisterExcess |= Target.hasVectorRegisterExcess();
2881 }
2882
2883 if (HasVectorRegisterExcess || DAG.MinOccupancy >= MFI.getMaxWavesPerEU()) {
2884 // In addition to register usage being above addressable limits, occupancy
2885 // below the minimum is considered like "spilling" as well.
2886 TargetOcc = std::nullopt;
2887 } else {
2888 // There is no spilling and room to improve occupancy; set up "increased
2889 // occupancy targets" for all regions.
2890 TargetOcc = DAG.MinOccupancy + 1;
2891 const unsigned VGPRBlockSize = MFI.getDynamicVGPRBlockSize();
2892 MaxSGPRs = ST.getMaxNumSGPRs(*TargetOcc, false);
2893 MaxVGPRs = ST.getMaxNumVGPRs(*TargetOcc, VGPRBlockSize);
2894 for (auto [I, Target] : enumerate(RPTargets)) {
2895 Target.setTarget(MaxSGPRs, MaxVGPRs);
2896 if (!Target.satisfied())
2897 TargetRegions.set(I);
2898 }
2899 }
2900
2901 return TargetRegions.any();
2902}
2903
2904bool PreRARematStage::ScoredRemat::maybeBeneficial(
2905 const BitVector &TargetRegions, ArrayRef<GCNRPTarget> RPTargets) const {
2906 for (unsigned I : TargetRegions.set_bits()) {
2907 if (Live[I] && RPTargets[I].isSaveBeneficial(RPSave))
2908 return true;
2909 }
2910 return false;
2911}
2912
2915 assert(DAG.MLI && "MLI not defined in DAG");
2917 MachineBlockFrequencyInfo MBFI(MF, MBPI, *DAG.MLI);
2918
2919 const unsigned NumRegions = DAG.Regions.size();
2921 MaxFreq = 0;
2922 Regions.reserve(NumRegions);
2923 for (unsigned I = 0; I < NumRegions; ++I) {
2924 MachineBasicBlock *MBB = DAG.Regions[I].first->getParent();
2925 uint64_t BlockFreq = MBFI.getBlockFreq(MBB).getFrequency();
2926 Regions.push_back(BlockFreq);
2927 if (BlockFreq && BlockFreq < MinFreq)
2928 MinFreq = BlockFreq;
2929 else if (BlockFreq > MaxFreq)
2930 MaxFreq = BlockFreq;
2931 }
2932 if (!MinFreq)
2933 return;
2934
2935 // Scale everything down if frequencies are high.
2936 if (MinFreq >= ScaleFactor * ScaleFactor) {
2937 for (uint64_t &Freq : Regions)
2938 Freq /= ScaleFactor;
2939 MinFreq /= ScaleFactor;
2940 MaxFreq /= ScaleFactor;
2941 }
2942}
2943
2944void PreRARematStage::ScoredRemat::init(RegisterIdx RegIdx,
2945 const FreqInfo &Freq,
2946 const Rematerializer &Remater,
2948 this->RegIdx = RegIdx;
2949 const unsigned NumRegions = DAG.Regions.size();
2950 LiveIn.resize(NumRegions);
2951 LiveOut.resize(NumRegions);
2952 Live.resize(NumRegions);
2953 UnpredictableRPSave.resize(NumRegions);
2954
2955 const Rematerializer::Reg &Reg = Remater.getReg(RegIdx);
2956 Register DefReg = Reg.getDefReg();
2957 assert(Reg.Uses.size() == 1 && "expected users in single region");
2958 const unsigned UseRegion = Reg.Uses.begin()->first;
2959
2960 // Mark regions in which the rematerializable register is live.
2961 for (unsigned I = 0, E = NumRegions; I != E; ++I) {
2962 if (DAG.LiveIns[I].contains(DefReg))
2963 LiveIn.set(I);
2964 if (DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I).contains(DefReg))
2965 LiveOut.set(I);
2966
2967 // If the register is both unused and live-through in the region, the
2968 // latter's RP is guaranteed to decrease.
2969 if (!LiveIn[I] || !LiveOut[I] || I == UseRegion)
2970 UnpredictableRPSave.set(I);
2971 }
2972 Live |= LiveIn;
2973 Live |= LiveOut;
2974 RPSave.inc(DefReg, LaneBitmask::getNone(), Reg.Mask, DAG.MRI);
2975
2976 // Get frequencies of defining and using regions. A rematerialization from the
2977 // least frequent region to the most frequent region will yield the greatest
2978 // in order to penalize rematerializations from or into regions whose
2979 int64_t DefOrMin = std::max(Freq.Regions[Reg.DefRegion], Freq.MinFreq);
2980 int64_t UseOrMax = Freq.Regions[UseRegion];
2981 if (!UseOrMax)
2982 UseOrMax = Freq.MaxFreq;
2983 FreqDiff = DefOrMin - UseOrMax;
2984}
2985
2986void PreRARematStage::ScoredRemat::update(const BitVector &TargetRegions,
2987 ArrayRef<GCNRPTarget> RPTargets,
2988 const FreqInfo &FreqInfo,
2989 bool ReduceSpill) {
2990 MaxFreq = 0;
2991 RegionImpact = 0;
2992 for (unsigned I : TargetRegions.set_bits()) {
2993 if (!Live[I])
2994 continue;
2995
2996 // The rematerialization must contribute positively in at least one
2997 // register class with usage above the RP target for this region to
2998 // contribute to the score.
2999 const GCNRPTarget &RegionTarget = RPTargets[I];
3000 const unsigned NumRegsBenefit = RegionTarget.getNumRegsBenefit(RPSave);
3001 if (!NumRegsBenefit)
3002 continue;
3003
3004 // Regions in which RP is guaranteed to decrease have more weight.
3005 RegionImpact += (UnpredictableRPSave[I] ? 1 : 2) * NumRegsBenefit;
3006
3007 if (ReduceSpill) {
3008 uint64_t Freq = FreqInfo.Regions[I];
3009 if (UnpredictableRPSave[I]) {
3010 // Apply a frequency penalty in regions in which we are not sure that RP
3011 // will decrease.
3012 Freq /= 2;
3013 }
3014 MaxFreq = std::max(MaxFreq, Freq);
3015 }
3016 }
3017}
3018
3019void PreRARematStage::ScoredRemat::rematerialize(
3020 Rematerializer &Remater) const {
3021 const Rematerializer::Reg &Reg = Remater.getReg(RegIdx);
3022 Rematerializer::DependencyReuseInfo DRI;
3023 for (const Rematerializer::Reg::Dependency &Dep : Reg.Dependencies)
3024 DRI.reuse(Dep.RegIdx);
3025 unsigned UseRegion = Reg.Uses.begin()->first;
3026 Remater.rematerializeToRegion(RegIdx, UseRegion, DRI);
3027}
3028
3029void PreRARematStage::updateRPTargets(const BitVector &Regions,
3030 const GCNRegPressure &RPSave) {
3031 for (unsigned I : Regions.set_bits()) {
3032 RPTargets[I].saveRP(RPSave);
3033 if (TargetRegions[I] && RPTargets[I].satisfied()) {
3034 REMAT_DEBUG(dbgs() << " [" << I << "] Target reached!\n");
3035 TargetRegions.reset(I);
3036 }
3037 }
3038}
3039
3040bool PreRARematStage::updateAndVerifyRPTargets(const BitVector &Regions) {
3041 bool TooOptimistic = false;
3042 for (unsigned I : Regions.set_bits()) {
3043 GCNRPTarget &Target = RPTargets[I];
3044 Target.setRP(DAG.getRealRegPressure(I));
3045
3046 // Since we were optimistic in assessing RP decreases in these regions, we
3047 // may need to remark the target as a target region if RP didn't decrease
3048 // as expected.
3049 if (!TargetRegions[I] && !Target.satisfied()) {
3050 REMAT_DEBUG(dbgs() << " [" << I << "] Incorrect RP estimation\n");
3051 TooOptimistic = true;
3052 TargetRegions.set(I);
3053 }
3054 }
3055 return TooOptimistic;
3056}
3057
3058void PreRARematStage::removeFromLiveMaps(Register Reg, const BitVector &LiveIn,
3059 const BitVector &LiveOut) {
3060 assert(LiveIn.size() == DAG.Regions.size() &&
3061 LiveOut.size() == DAG.Regions.size() && "region num mismatch");
3062 for (unsigned I : LiveIn.set_bits())
3063 DAG.LiveIns[I].erase(Reg);
3064 for (unsigned I : LiveOut.set_bits())
3065 DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I).erase(Reg);
3066}
3067
3068void PreRARematStage::addToLiveMaps(Register Reg, LaneBitmask Mask,
3069 const BitVector &LiveIn,
3070 const BitVector &LiveOut) {
3071 assert(LiveIn.size() == DAG.Regions.size() &&
3072 LiveOut.size() == DAG.Regions.size() && "region num mismatch");
3073 std::pair<Register, LaneBitmask> LiveReg(Reg, Mask);
3074 for (unsigned I : LiveIn.set_bits())
3075 DAG.LiveIns[I].insert(LiveReg);
3076 for (unsigned I : LiveOut.set_bits())
3077 DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I).insert(LiveReg);
3078}
3079
3081 // We consider that reducing spilling is always beneficial so we never
3082 // rollback rematerializations or revert scheduling in such cases.
3083 if (!TargetOcc)
3084 return;
3085
3086 // When increasing occupancy, it is possible that re-scheduling is not able to
3087 // achieve the target occupancy in all regions, in which case re-scheduling in
3088 // all regions should be reverted.
3089 if (DAG.MinOccupancy >= *TargetOcc)
3090 return;
3091
3092 // Revert re-scheduling in all affected regions.
3093 for (const auto &[RegionIdx, OrigMIOrder, MaxPressure] : RegionReverts) {
3094 REMAT_DEBUG(dbgs() << "Reverting re-scheduling in region " << RegionIdx
3095 << '\n');
3096 DAG.Pressure[RegionIdx] = MaxPressure;
3097 modifyRegionSchedule(RegionIdx, OrigMIOrder);
3098 }
3099
3100 // It is possible that re-scheduling lowers occupancy over the one achieved
3101 // just through rematerializations, in which case we revert re-scheduling in
3102 // all regions but do not roll back rematerializations.
3103 if (AchievedOcc >= *TargetOcc) {
3104 DAG.setTargetOccupancy(AchievedOcc);
3105 return;
3106 }
3107
3108 // Reset the target occupancy to what it was pre-rematerialization.
3109 DAG.setTargetOccupancy(*TargetOcc - 1);
3110
3111 // Roll back changes made by the stage, then recompute pressure in all
3112 // affected regions.
3113 REMAT_DEBUG(dbgs() << "==== ROLLBACK ====\n");
3114 assert(Rollback && "rollbacker should be defined");
3115 Rollback->Listener.rollback(Remater);
3116 for (const auto &[RegIdx, LiveIn, LiveOut] : Rollback->LiveMapUpdates) {
3117 const Rematerializer::Reg &Reg = Remater.getReg(RegIdx);
3118 addToLiveMaps(Reg.getDefReg(), Reg.Mask, LiveIn, LiveOut);
3119 }
3120
3121#ifdef EXPENSIVE_CHECKS
3122 // In particular, we want to check for coherent MI/slot order in regions in
3123 // which reverts and/or rollbacks may have happened.
3124 MF.verify();
3125#endif
3126 for (unsigned I : RescheduleRegions.set_bits())
3127 DAG.Pressure[I] = DAG.getRealRegPressure(I);
3128
3130}
3131
3132void GCNScheduleDAGMILive::setTargetOccupancy(unsigned TargetOccupancy) {
3133 MinOccupancy = TargetOccupancy;
3134 if (MFI.getOccupancy() < TargetOccupancy)
3135 MFI.increaseOccupancy(MF, MinOccupancy);
3136 else
3137 MFI.limitOccupancy(MinOccupancy);
3138}
3139
3141 const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG->TII);
3142 return any_of(*DAG, [SII](MachineBasicBlock::iterator MI) {
3143 return SII->isIGLPMutationOnly(MI->getOpcode());
3144 });
3145}
3146
3151
3153 HasIGLPInstrs = hasIGLPInstrs(this);
3154 if (HasIGLPInstrs) {
3155 SavedMutations.clear();
3156 SavedMutations.swap(Mutations);
3158 }
3159
3161}
3162
3164 if (HasIGLPInstrs)
3165 SavedMutations.swap(Mutations);
3166
3168}
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static SUnit * pickOnlyChoice(SchedBoundary &Zone)
MachineBasicBlock & MBB
This file implements the BitVector class.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
This file defines the GCNRegPressure class, which tracks registry pressure by bookkeeping number of S...
static cl::opt< bool > GCNTrackers("amdgpu-use-amdgpu-trackers", cl::Hidden, cl::desc("Use the AMDGPU specific RPTrackers during scheduling"), cl::init(false))
static cl::opt< bool > DisableClusteredLowOccupancy("amdgpu-disable-clustered-low-occupancy-reschedule", cl::Hidden, cl::desc("Disable clustered low occupancy " "rescheduling for ILP scheduling stage."), cl::init(false))
#define REMAT_PREFIX
Allows to easily filter for this stage's debug output.
static MachineInstr * getLastMIForRegion(MachineBasicBlock::iterator RegionBegin, MachineBasicBlock::iterator RegionEnd)
static bool shouldCheckPending(SchedBoundary &Zone, const TargetSchedModel *SchedModel)
static cl::opt< bool > RelaxedOcc("amdgpu-schedule-relaxed-occupancy", cl::Hidden, cl::desc("Relax occupancy targets for kernels which are memory " "bound (amdgpu-membound-threshold), or " "Wave Limited (amdgpu-limit-wave-threshold)."), cl::init(false))
#define REMAT_DEBUG(X)
static cl::opt< bool > DisableUnclusterHighRP("amdgpu-disable-unclustered-high-rp-reschedule", cl::Hidden, cl::desc("Disable unclustered high register pressure " "reduction scheduling stage."), cl::init(false))
static void printScheduleModel(std::set< std::pair< MachineInstr *, unsigned >, EarlierIssuingCycle > &ReadyCycles)
static cl::opt< bool > PrintMaxRPRegUsageAfterScheduler("amdgpu-print-max-reg-pressure-regusage-after-scheduler", cl::Hidden, cl::desc("Print a list of live registers along with their def/uses at the " "point of maximum register pressure after scheduling."), cl::init(false))
static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG)
static bool isReachingDefAGPRForm(MachineInstr *RD, const DenseSet< Register > &CandSrc2Regs, const SIInstrInfo &TII)
Returns true when RD will already be in AGPR-form after the rewrite, so no bridge copy is needed at t...
static cl::opt< bool > DisableRewriteMFMAFormSchedStage("amdgpu-disable-rewrite-mfma-form-sched-stage", cl::Hidden, cl::desc("Disable rewrite mfma rewrite scheduling stage"), cl::init(true))
static bool canUsePressureDiffs(const SUnit &SU)
Checks whether SU can use the cached DAG pressure diffs to compute the current register pressure.
static cl::opt< unsigned > PendingQueueLimit("amdgpu-scheduler-pending-queue-limit", cl::Hidden, cl::desc("Max (Available+Pending) size to inspect pending queue (0 disables)"), cl::init(256))
static cl::opt< bool > PrintMaxRPRegUsageBeforeScheduler("amdgpu-print-max-reg-pressure-regusage-before-scheduler", cl::Hidden, cl::desc("Print a list of live registers along with their def/uses at the " "point of maximum register pressure before scheduling."), cl::init(false))
static cl::opt< unsigned > ScheduleMetricBias("amdgpu-schedule-metric-bias", cl::Hidden, cl::desc("Sets the bias which adds weight to occupancy vs latency. Set it to " "100 to chase the occupancy only."), cl::init(10))
static Register UseReg(const MachineOperand &MO)
const HexagonInstrInfo * TII
#define _
static constexpr std::pair< StringLiteral, StringLiteral > ReplaceMap[]
IRTranslator LLVM IR MI
A common definition of LaneBitmask for use in TableGen and CodeGen.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Register Reg
Promote Memory to Register
Definition Mem2Reg.cpp:110
#define T
static constexpr unsigned SM(unsigned Version)
if(PassOpts->AAPipeline)
MIR-level target-independent rematerialization helpers.
This file contains some templates that are useful if you are working with the STL at all.
#define LLVM_DEBUG(...)
Definition Debug.h:119
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
Get the first element.
Definition ArrayRef.h:144
size_t size() const
Get the array size.
Definition ArrayRef.h:141
bool empty() const
Check if the array is empty.
Definition ArrayRef.h:136
BitVector & reset()
Reset all bits in the bitvector.
Definition BitVector.h:409
iterator_range< const_set_bits_iterator > set_bits() const
Definition BitVector.h:159
size_type size() const
Returns the number of bits in this bitvector.
Definition BitVector.h:178
uint64_t getFrequency() const
Returns the frequency as a fixpoint number scaled by the entry frequency.
bool shouldRevertScheduling(unsigned WavesAfter) override
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:225
bool erase(const KeyT &Val)
Definition DenseMap.h:379
iterator end()
Definition DenseMap.h:143
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:216
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:286
Implements a dense probed hash-table based set.
Definition DenseSet.h:289
bool reset(const MachineInstr &MI, MachineBasicBlock::const_iterator End, const LiveRegSet *LiveRegs=nullptr)
Reset tracker to the point before the MI filling LiveRegs upon this point using LIS.
GCNRegPressure bumpDownwardPressure(const MachineInstr *MI, const SIRegisterInfo *TRI) const
Mostly copy/paste from CodeGen/RegisterPressure.cpp Calculate the impact MI will have on CurPressure ...
GCNMaxILPSchedStrategy(const MachineSchedContext *C)
bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, SchedBoundary *Zone) const override
Apply a set of heuristics to a new candidate.
bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, SchedBoundary *Zone) const override
GCNMaxMemoryClauseSchedStrategy tries best to clause memory instructions as much as possible.
GCNMaxMemoryClauseSchedStrategy(const MachineSchedContext *C)
GCNMaxOccupancySchedStrategy(const MachineSchedContext *C, bool IsLegacyScheduler=false)
void finalizeSchedule() override
Allow targets to perform final scheduling actions at the level of the whole MachineFunction.
void schedule() override
Orders nodes according to selected style.
GCNPostScheduleDAGMILive(MachineSchedContext *C, std::unique_ptr< MachineSchedStrategy > S, bool RemoveKillFlags)
Models a register pressure target, allowing to evaluate and track register savings against that targe...
unsigned getNumRegsBenefit(const GCNRegPressure &SaveRP) const
Returns the benefit towards achieving the RP target that saving SaveRP represents,...
GCNRegPressure getPressure() const
GCNSchedStrategy & S
GCNRegPressure PressureBefore
bool isRegionWithExcessRP() const
void modifyRegionSchedule(unsigned RegionIdx, ArrayRef< MachineInstr * > MIOrder)
Sets the schedule of region RegionIdx to MIOrder.
bool mayCauseSpilling(unsigned WavesAfter)
ScheduleMetrics getScheduleMetrics(const std::vector< SUnit > &InputSchedule)
GCNScheduleDAGMILive & DAG
const GCNSchedStageID StageID
std::vector< MachineInstr * > Unsched
GCNRegPressure PressureAfter
MachineFunction & MF
virtual void finalizeGCNRegion()
SIMachineFunctionInfo & MFI
unsigned computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle, DenseMap< unsigned, unsigned > &ReadyCycles, const TargetSchedModel &SM)
virtual void finalizeGCNSchedStage()
virtual bool initGCNSchedStage()
virtual bool shouldRevertScheduling(unsigned WavesAfter)
std::vector< std::unique_ptr< ScheduleDAGMutation > > SavedMutations
GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
MachineBasicBlock * CurrentMBB
const GCNSubtarget & ST
This is a minimal scheduler strategy.
GCNDownwardRPTracker DownwardTracker
void getRegisterPressures(bool AtTop, const RegPressureTracker &RPTracker, SUnit *SU, std::vector< unsigned > &Pressure, std::vector< unsigned > &MaxPressure, GCNDownwardRPTracker &DownwardTracker, GCNUpwardRPTracker &UpwardTracker, ScheduleDAGMI *DAG, const SIRegisterInfo *SRI)
GCNSchedStrategy(const MachineSchedContext *C)
SmallVector< GCNSchedStageID, 4 > SchedStages
std::vector< unsigned > MaxPressure
SUnit * pickNodeBidirectional(bool &IsTopNode, bool &PickedPending)
GCNSchedStageID getCurrentStage()
bool tryPendingCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, SchedBoundary *Zone) const
Evaluates instructions in the pending queue using a subset of scheduling heuristics.
SmallVectorImpl< GCNSchedStageID >::iterator CurrentStage
void schedNode(SUnit *SU, bool IsTopNode) override
Notify MachineSchedStrategy that ScheduleDAGMI has scheduled an instruction and updated scheduled/rem...
std::optional< bool > GCNTrackersOverride
GCNDownwardRPTracker * getDownwardTracker()
std::vector< unsigned > Pressure
void initialize(ScheduleDAGMI *DAG) override
Initialize the strategy after building the DAG for a new region.
GCNUpwardRPTracker UpwardTracker
void printCandidateDecision(const SchedCandidate &Current, const SchedCandidate &Preferred)
void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy, const RegPressureTracker &RPTracker, SchedCandidate &Cand, bool &IsPending, bool IsBottomUp)
unsigned getStructuralStallCycles(SchedBoundary &Zone, SUnit *SU) const
Estimate how many cycles SU must wait due to structural hazards at the current boundary cycle.
void initCandidate(SchedCandidate &Cand, SUnit *SU, bool AtTop, const RegPressureTracker &RPTracker, const SIRegisterInfo *SRI, unsigned SGPRPressure, unsigned VGPRPressure, bool IsBottomUp)
SUnit * pickNode(bool &IsTopNode) override
Pick the next node to schedule, or return NULL.
GCNUpwardRPTracker * getUpwardTracker()
GCNSchedStageID getNextStage() const
void finalizeSchedule() override
Allow targets to perform final scheduling actions at the level of the whole MachineFunction.
void schedule() override
Orders nodes according to selected style.
GCNScheduleDAGMILive(MachineSchedContext *C, std::unique_ptr< MachineSchedStrategy > S)
void recede(const MachineInstr &MI)
Move to the state of RP just before the MI .
void reset(const MachineInstr &MI)
Resets tracker to the point just after MI (in program order), which can be a debug instruction.
void traceCandidate(const SchedCandidate &Cand)
LLVM_ABI void setPolicy(CandPolicy &Policy, bool IsPostRA, SchedBoundary &CurrZone, SchedBoundary *OtherZone)
Set the CandPolicy given a scheduling zone given the current resources and latencies inside and outsi...
MachineSchedPolicy RegionPolicy
const TargetSchedModel * SchedModel
const MachineSchedContext * Context
const TargetRegisterInfo * TRI
SchedCandidate BotCand
Candidate last picked from Bot boundary.
SchedCandidate TopCand
Candidate last picked from Top boundary.
virtual bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, SchedBoundary *Zone) const
Apply a set of heuristics to a new candidate.
ScheduleDAGMILive * DAG
void initialize(ScheduleDAGMI *dag) override
Initialize the strategy after building the DAG for a new region.
void schedNode(SUnit *SU, bool IsTopNode) override
Update the scheduler's state after scheduling a node.
GenericScheduler(const MachineSchedContext *C)
bool shouldRevertScheduling(unsigned WavesAfter) override
A live range for subregisters.
LiveInterval - This class represents the liveness of a register, or stack slot.
bool hasSubRanges() const
Returns true if subregister liveness information is available.
iterator_range< subrange_iterator > subranges()
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
SlotIndex getMBBEndIdx(const MachineBasicBlock *mbb) const
Return the last index in the given basic block.
LiveInterval & getInterval(Register Reg)
LLVM_ABI void dump() const
MachineBasicBlock * getMBBFromIndex(SlotIndex index) const
VNInfo * getVNInfoAt(SlotIndex Idx) const
getVNInfoAt - Return the VNInfo that is live at Idx, or NULL.
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they're not in a MachineFuncti...
iterator_range< pred_iterator > predecessors()
MachineInstrBundleIterator< MachineInstr > iterator
MachineBlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate machine basic b...
LLVM_ABI BlockFrequency getBlockFreq(const MachineBasicBlock *MBB) const
getblockFreq - Return block frequency.
LLVM_ABI BlockFrequency getEntryFreq() const
Divide a block's BlockFrequency::getFrequency() value by this value to obtain the entry block - relat...
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool isCopy() const
const MachineBasicBlock * getParent() const
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
mop_range operands()
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
filtered_mop_range all_uses()
Returns an iterator range over all operands that are (explicit or implicit) register uses.
const MachineOperand & getOperand(unsigned i) const
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
Register getReg() const
getReg - Returns the register number.
bool shouldRevertScheduling(unsigned WavesAfter) override
bool shouldRevertScheduling(unsigned WavesAfter) override
bool shouldRevertScheduling(unsigned WavesAfter) override
void finalizeGCNRegion() override
bool initGCNSchedStage() override
Capture a change in pressure for a single pressure set.
Simple wrapper around std::function<void(raw_ostream&)>.
Definition Printable.h:38
Helpers for implementing custom MachineSchedStrategy classes.
unsigned size() const
Track the current register pressure at some position in the instruction stream, and remember the high...
LLVM_ABI void advance()
Advance across the current instruction.
LLVM_ABI void getDownwardPressure(const MachineInstr *MI, std::vector< unsigned > &PressureResult, std::vector< unsigned > &MaxPressureResult)
Get the pressure of each PSet after traversing this instruction top-down.
const std::vector< unsigned > & getRegSetPressureAtPos() const
Get the register set pressure at the current position, which may be less than the pressure across the...
LLVM_ABI void getUpwardPressure(const MachineInstr *MI, std::vector< unsigned > &PressureResult, std::vector< unsigned > &MaxPressureResult)
Get the pressure of each PSet after traversing this instruction bottom-up.
List of registers defined and used by a machine instruction.
LLVM_ABI void collect(const MachineInstr &MI, const TargetRegisterInfo &TRI, const MachineRegisterInfo &MRI, bool TrackLaneMasks, bool IgnoreDead)
Analyze the given instruction MI and fill in the Uses, Defs and DeadDefs list based on the MachineOpe...
LLVM_ABI void adjustLaneLiveness(const LiveIntervals &LIS, const MachineRegisterInfo &MRI, SlotIndex Pos, MachineInstr *AddFlagsMI=nullptr)
Use liveness information to find out which uses/defs are partially undefined/dead and adjust the VReg...
LLVM_ABI void detectDeadDefs(const MachineInstr &MI, const LiveIntervals &LIS)
Use liveness information to find dead defs not marked with a dead flag and move them to the DeadDefs ...
Wrapper class representing virtual and physical registers.
Definition Register.h:20
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
MIR-level target-independent rematerializer.
bool isIGLPMutationOnly(unsigned Opcode) const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
Scheduling unit. This is a node in the scheduling DAG.
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
unsigned TopReadyCycle
Cycle relative to start when node is ready.
unsigned NodeNum
Entry # of node in the node vector.
unsigned short Latency
Node latency.
bool isScheduled
True once scheduled.
unsigned ParentClusterIdx
The parent cluster id.
unsigned BotReadyCycle
Cycle relative to end when node is ready.
bool hasReservedResource
Uses a reserved resource.
bool isBottomReady() const
bool isTopReady() const
SmallVector< SDep, 4 > Preds
All sunit predecessors.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
Each Scheduling boundary is associated with ready queues.
LLVM_ABI void releasePending()
Release pending ready nodes in to the available queue.
LLVM_ABI unsigned getLatencyStallCycles(SUnit *SU)
Get the difference between the given SUnit's ready time and the current cycle.
LLVM_ABI SUnit * pickOnlyChoice()
Call this before applying any other heuristics to the Available queue.
ScheduleHazardRecognizer * HazardRec
LLVM_ABI void bumpCycle(unsigned NextCycle)
Move the boundary of scheduled code by one cycle.
unsigned getCurrMOps() const
Micro-ops issued in the current cycle.
unsigned getCurrCycle() const
Number of cycles to issue the instructions scheduled in this zone.
LLVM_ABI bool checkHazard(SUnit *SU)
Does this SU have a hazard within the current instruction group.
LLVM_ABI std::pair< unsigned, unsigned > getNextResourceCycle(const MCSchedClassDesc *SC, unsigned PIdx, unsigned ReleaseAtCycle, unsigned AcquireAtCycle)
Compute the next cycle at which the given processor resource can be scheduled.
A ScheduleDAG for scheduling lists of MachineInstr.
bool ScheduleSingleMIRegions
True if regions with a single MI should be scheduled.
MachineBasicBlock::iterator RegionEnd
The end of the range to be scheduled.
virtual void finalizeSchedule()
Allow targets to perform final scheduling actions at the level of the whole MachineFunction.
virtual void exitRegion()
Called when the scheduler has finished scheduling the current region.
const MachineLoopInfo * MLI
bool RemoveKillFlags
True if the DAG builder should remove kill flags (in preparation for rescheduling).
MachineBasicBlock::iterator RegionBegin
The beginning of the range to be scheduled.
void schedule() override
Implement ScheduleDAGInstrs interface for scheduling a sequence of reorderable instructions.
ScheduleDAGMILive(MachineSchedContext *C, std::unique_ptr< MachineSchedStrategy > S)
RegPressureTracker RPTracker
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
void addMutation(std::unique_ptr< ScheduleDAGMutation > Mutation)
Add a postprocessing step to the DAG builder.
void schedule() override
Implement ScheduleDAGInstrs interface for scheduling a sequence of reorderable instructions.
ScheduleDAGMI(MachineSchedContext *C, std::unique_ptr< MachineSchedStrategy > S, bool RemoveKillFlags)
std::vector< std::unique_ptr< ScheduleDAGMutation > > Mutations
Ordered list of DAG postprocessing steps.
MachineRegisterInfo & MRI
Virtual/real register map.
const TargetInstrInfo * TII
Target instruction information.
MachineFunction & MF
Machine function.
static const unsigned ScaleFactor
unsigned getMetric() const
bool empty() const
Determine if the SetVector is empty or not.
Definition SetVector.h:100
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
SlotIndex - An opaque wrapper around machine indexes.
Definition SlotIndexes.h:66
static bool isSameInstr(SlotIndex A, SlotIndex B)
isSameInstr - Return true if A and B refer to the same instruction.
static bool isEarlierInstr(SlotIndex A, SlotIndex B)
isEarlierInstr - Return true if A refers to an instruction earlier than B.
SlotIndex getPrevSlot() const
Returns the previous slot in the index list.
SlotIndex getMBBStartIdx(unsigned Num) const
Returns the first index in the given basic block number.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:134
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition SmallSet.h:229
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:184
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
uint8_t getCopyCost() const
Return the cost of copying a value between two registers in this class.
Provide an instruction scheduling machine model to CodeGen passes.
LLVM_ABI bool hasInstrSchedModel() const
Return true if this machine model includes an instruction-level scheduling model.
unsigned getMicroOpBufferSize() const
Number of micro-ops that may be buffered for OOO execution.
bool shouldRevertScheduling(unsigned WavesAfter) override
VNInfo - Value Number Information.
SlotIndex def
The index of the defining instruction.
bool isPHIDef() const
Returns true if this value is defined by a PHI instruction (or was, PHI instructions may have been el...
static LLVM_ABI bool allUsesAvailableAt(const MachineInstr *MI, SlotIndex UseIdx, const LiveIntervals &LIS, const MachineRegisterInfo &MRI, const TargetInstrInfo &TII)
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:212
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:185
self_iterator getIterator()
Definition ilist_node.h:123
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned getAddressableNumVGPRs(const MCSubtargetInfo &STI, unsigned DynamicVGPRBlockSize)
unsigned getAllocatedNumVGPRBlocks(const MCSubtargetInfo &STI, unsigned NumVGPRs, unsigned DynamicVGPRBlockSize, std::optional< bool > EnableWavefrontSize32)
unsigned getVGPRAllocGranule(const MCSubtargetInfo &STI, unsigned DynamicVGPRBlockSize, std::optional< bool > EnableWavefrontSize32)
LLVM_READONLY int32_t getMFMASrcCVDstAGPROp(uint32_t Opcode)
@ Entry
Definition COFF.h:862
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
initializer< Ty > init(const Ty &Val)
@ User
could "use" a pointer
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
This is an optimization pass for GlobalISel generic memory operations.
LLVM_ABI int biasPhysReg(const SUnit *SU, bool isTop, bool BiasPRegsExtra=false)
Minimize physical register live ranges.
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1764
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
LLVM_ABI unsigned getWeakLeft(const SUnit *SU, bool isTop)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
InstructionCost Cost
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
GCNRegPressure getRegPressure(const MachineRegisterInfo &MRI, Range &&LiveRegs)
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
std::unique_ptr< ScheduleDAGMutation > createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase)
Phase specifes whether or not this is a reentry into the IGroupLPDAGMutation.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
Definition MathExtras.h:546
std::pair< MachineBasicBlock::iterator, MachineBasicBlock::iterator > RegionBoundaries
A region's boundaries i.e.
IterT skipDebugInstructionsForward(IterT It, IterT End, bool SkipPseudoOp=true)
Increment It until it points to a non-debug instruction or to End and return the resulting iterator.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
LLVM_ABI bool tryPressure(const PressureChange &TryP, const PressureChange &CandP, GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, GenericSchedulerBase::CandReason Reason, const TargetRegisterInfo *TRI, const MachineFunction &MF)
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1635
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
LLVM_ABI cl::opt< bool > VerifyScheduling
LLVM_ABI bool tryLatency(GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, SchedBoundary &Zone)
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
IterT skipDebugInstructionsBackward(IterT It, IterT Begin, bool SkipPseudoOp=true)
Decrement It until it points to a non-debug instruction or to Begin and return the resulting iterator...
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
bool isTheSameCluster(unsigned A, unsigned B)
Return whether the input cluster ID's are the same and valid.
DWARFExpression::Operation Op
LLVM_ABI bool tryGreater(int TryVal, int CandVal, GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, GenericSchedulerBase::CandReason Reason)
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
ArrayRef(const T &OneElt) -> ArrayRef< T >
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1916
DenseMap< MachineInstr *, GCNRPTracker::LiveRegSet > getLiveRegMap(Range &&R, bool After, LiveIntervals &LIS)
creates a map MachineInstr -> LiveRegSet R - range of iterators on instructions After - upon entry or...
GCNRPTracker::LiveRegSet getLiveRegsBefore(const MachineInstr &MI, const LiveIntervals &LIS)
LLVM_ABI bool tryLess(int TryVal, int CandVal, GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, GenericSchedulerBase::CandReason Reason)
Return true if this heuristic determines order.
LLVM_ABI void dumpMaxRegPressure(MachineFunction &MF, GCNRegPressure::RegKind Kind, LiveIntervals &LIS, const MachineLoopInfo *MLI)
LLVM_ABI Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
Implement std::hash so that hash_code can be used in STL containers.
Definition BitVector.h:861
bool operator()(std::pair< MachineInstr *, unsigned > A, std::pair< MachineInstr *, unsigned > B) const
unsigned getArchVGPRNum() const
unsigned getAGPRNum() const
unsigned getSGPRNum() const
Policy for scheduling the next instruction in the candidate's zone.
Store the state used by GenericScheduler heuristics, required for the lifetime of one invocation of p...
void reset(const CandPolicy &NewPolicy)
LLVM_ABI void initResourceDelta(const ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel)
Status of an instruction's critical resource consumption.
constexpr bool any() const
Definition LaneBitmask.h:53
static constexpr LaneBitmask getNone()
Definition LaneBitmask.h:81
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition MCSchedule.h:129
Identify one of the processor resource kinds consumed by a particular scheduling class for the specif...
Definition MCSchedule.h:74
MachineSchedContext provides enough context from the MachineScheduler pass for the target to instanti...
Execution frequency information required by scoring heuristics.
SmallVector< uint64_t > Regions
Per-region execution frequencies. 0 when unknown.
uint64_t MinFreq
Minimum and maximum observed frequencies.
FreqInfo(MachineFunction &MF, const GCNScheduleDAGMILive &DAG)
DependencyReuseInfo & reuse(RegisterIdx DepIdx)
RegisterIdx RegIdx
The corresponding register's index in the rematerializer.
A rematerializable register defined by a single machine instruction.
MachineInstr * DefMI
Single MI defining the rematerializable register.
SmallDenseMap< unsigned, RegionUsers, 2 > Uses
Uses of the register, mapped by region.
Register getDefReg() const
Returns the rematerializable register from its defining instruction.