LLVM 23.0.0git
GCNHazardRecognizer.cpp
Go to the documentation of this file.
1//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements hazard recognizers for scheduling on GCN processors.
10//
11//===----------------------------------------------------------------------===//
12
13#include "GCNHazardRecognizer.h"
14#include "GCNSubtarget.h"
17#include "llvm/ADT/Statistic.h"
22#include "llvm/Support/Debug.h"
24
25using namespace llvm;
26
27#define DEBUG_TYPE "gcn-hazard-recognizer"
28
29STATISTIC(NumWMMANopsHoisted,
30 "Number of WMMA hazard V_NOPs hoisted from loops");
31STATISTIC(NumWMMAHoistingBailed,
32 "Number of WMMA hazards where V_NOP hoisting was not possible");
33
34namespace {
35
36struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
37 MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}
38
39 bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
40 if (Arg.getAsInteger(0, Value))
41 return O.error("'" + Arg + "' value invalid for uint argument!");
42
43 if (Value > 100)
44 return O.error("'" + Arg + "' value must be in the range [0, 100]!");
45
46 return false;
47 }
48};
49
50} // end anonymous namespace
51
53 MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
54 cl::desc("Fill a percentage of the latency between "
55 "neighboring MFMA with s_nops."));
56
57// This is intended for debugging purposes only.
59 NopPadding("amdgpu-snop-padding", cl::init(0), cl::Hidden,
60 cl::desc("Insert a s_nop x before every instruction"));
61
63 "amdgpu-wmma-vnop-hoisting", cl::init(true), cl::Hidden,
64 cl::desc("Hoist WMMA hazard V_NOPs from loops to preheaders"));
65
66//===----------------------------------------------------------------------===//
67// Hazard Recognizer Implementation
68//===----------------------------------------------------------------------===//
69
71 const GCNSubtarget &ST);
72
74 MachineLoopInfo *MLI)
75 : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF),
76 ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
77 TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()), MLI(MLI),
78 ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
79 MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
80 RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
81}
82
84 EmittedInstrs.clear();
85}
86
90
92 CurrCycleInstr = MI;
93}
94
95static bool isDivFMas(unsigned Opcode) {
96 return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
97}
98
99static bool isSGetReg(unsigned Opcode) {
100 return Opcode == AMDGPU::S_GETREG_B32 || Opcode == AMDGPU::S_GETREG_B32_const;
101}
102
103static bool isSSetReg(unsigned Opcode) {
104 switch (Opcode) {
105 case AMDGPU::S_SETREG_B32:
106 case AMDGPU::S_SETREG_B32_mode:
107 case AMDGPU::S_SETREG_IMM32_B32:
108 case AMDGPU::S_SETREG_IMM32_B32_mode:
109 return true;
110 }
111 return false;
112}
113
114static bool isRWLane(unsigned Opcode) {
115 return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
116}
117
118static bool isRFE(unsigned Opcode) {
119 return Opcode == AMDGPU::S_RFE_B64;
120}
121
122static bool isSMovRel(unsigned Opcode) {
123 switch (Opcode) {
124 case AMDGPU::S_MOVRELS_B32:
125 case AMDGPU::S_MOVRELS_B64:
126 case AMDGPU::S_MOVRELD_B32:
127 case AMDGPU::S_MOVRELD_B64:
128 return true;
129 default:
130 return false;
131 }
132}
133
135 const MachineInstr &MI) {
136 if (TII.isAlwaysGDS(MI.getOpcode()))
137 return true;
138
139 switch (MI.getOpcode()) {
140 case AMDGPU::S_SENDMSG:
141 case AMDGPU::S_SENDMSGHALT:
142 case AMDGPU::S_TTRACEDATA:
143 return true;
144 // These DS opcodes don't support GDS.
145 case AMDGPU::DS_NOP:
146 case AMDGPU::DS_PERMUTE_B32:
147 case AMDGPU::DS_BPERMUTE_B32:
148 return false;
149 default:
150 if (TII.isDS(MI.getOpcode())) {
151 int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
152 AMDGPU::OpName::gds);
153 if (MI.getOperand(GDS).getImm())
154 return true;
155 }
156 return false;
157 }
158}
159
160static bool isPermlane(const MachineInstr &MI) {
161 unsigned Opcode = MI.getOpcode();
162 return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
163 Opcode == AMDGPU::V_PERMLANE64_B32 ||
164 Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
165 Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
166 Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64 ||
167 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 ||
168 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 ||
169 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 ||
170 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64 ||
171 Opcode == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
172 Opcode == AMDGPU::V_PERMLANE_UP_B32_e64 ||
173 Opcode == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
174 Opcode == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
175 Opcode == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64;
176}
177
178static bool isLdsDma(const MachineInstr &MI) {
179 return SIInstrInfo::isVALU(MI) &&
181}
182
183static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
184 const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
185 AMDGPU::OpName::simm16);
186 return std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm()));
187}
188
191 MachineInstr *MI = SU->getInstr();
192 // If we are not in "HazardRecognizerMode" and therefore not being run from
193 // the scheduler, track possible stalls from hazards but don't insert noops.
194 auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;
195
196 if (MI->isBundle())
197 return NoHazard;
198
199 if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
200 return HazardType;
201
202 if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
203 return HazardType;
204
205 if (checkFPAtomicToDenormModeHazard(MI) > 0)
206 return HazardType;
207
208 // Hazards which cannot be mitigated with S_NOPs.
209 if (!IsHazardRecognizerMode) {
210 if (checkWMMACoexecutionHazards(MI) > 0)
211 return Hazard;
212 }
213
214 if (ST.hasNoDataDepHazard())
215 return NoHazard;
216
217 if (SIInstrInfo::isVMEM(*MI) && checkVMEMHazards(MI) > 0)
218 return HazardType;
219
220 if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
221 return HazardType;
222
223 if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
224 return HazardType;
225
226 if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
227 return HazardType;
228
229 if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
230 return HazardType;
231
234 checkMAIVALUHazards(MI) > 0)
235 return HazardType;
236
237 if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
238 return HazardType;
239
240 if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
241 return HazardType;
242
243 if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
244 return HazardType;
245
246 if (((ST.hasReadM0MovRelInterpHazard() &&
247 (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
248 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
249 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
250 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
251 (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
252 (ST.hasReadM0LdsDirectHazard() &&
253 MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) &&
254 checkReadM0Hazards(MI) > 0)
255 return HazardType;
256
257 if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
258 return HazardType;
259
261 checkMAILdStHazards(MI) > 0)
262 return HazardType;
263
264 if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
265 return HazardType;
266
267 return NoHazard;
268}
269
271 unsigned Quantity) {
272 while (Quantity > 0) {
273 unsigned Arg = std::min(Quantity, 8u);
274 Quantity -= Arg;
275 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
276 .addImm(Arg - 1);
277 }
278}
279
280unsigned
281GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
282 const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
283 assert(TSchedModel.getWriteProcResBegin(SC) !=
284 TSchedModel.getWriteProcResEnd(SC));
285 return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
286}
287
// Walk every instruction inside the bundle headed by CurrCycleInstr, checking
// each for hazards, fixing/padding them when in hazard-recognizer mode, and
// recording them (plus their wait states) in EmittedInstrs.
void GCNHazardRecognizer::processBundle() {
  MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
  MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
  // Check bundled MachineInstr's for hazards.
  for (; MI != E && MI->isInsideBundle(); ++MI) {
    CurrCycleInstr = &*MI;
    unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);

    if (IsHazardRecognizerMode) {
      fixHazards(CurrCycleInstr);

      insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
    }

    // It’s unnecessary to track more than MaxLookAhead instructions. Since we
    // include the bundled MI directly after, only add a maximum of
    // (MaxLookAhead - 1) noops to EmittedInstrs.
    for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
      EmittedInstrs.push_front(nullptr);

    EmittedInstrs.push_front(CurrCycleInstr);
    // Keep the history bounded; getWaitStatesSince never looks further back.
    EmittedInstrs.resize(MaxLookAhead);
  }
  CurrCycleInstr = nullptr;
}
313
314void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
315 assert(IsHazardRecognizerMode);
316
317 unsigned NumPreNoops = PreEmitNoops(MI);
318 EmitNoops(NumPreNoops);
319 if (MI->isInsideBundle())
320 insertNoopsInBundle(MI, TII, NumPreNoops);
321 else
322 TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI),
323 NumPreNoops);
325 AdvanceCycle();
326}
327
329 IsHazardRecognizerMode = true;
330 CurrCycleInstr = MI;
331 unsigned W = PreEmitNoopsCommon(MI);
332 fixHazards(MI);
333 CurrCycleInstr = nullptr;
334 return std::max(W, NopPadding.getValue());
335}
336
338 if (MI->isBundle())
339 return 0;
340
341 int WaitStates = 0;
342
344 return std::max(WaitStates, checkSMRDHazards(MI));
345
346 if (ST.hasNSAtoVMEMBug())
347 WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
348
349 WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
350
351 if (ST.hasNoDataDepHazard())
352 return WaitStates;
353
355 WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
356
358 WaitStates = std::max(WaitStates, checkVALUHazards(MI));
359
361 WaitStates = std::max(WaitStates, checkDPPHazards(MI));
362
363 if (isDivFMas(MI->getOpcode()))
364 WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
365
366 if (isRWLane(MI->getOpcode()))
367 WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
368
371 checkMAIVALUHazards(MI) > 0)
372 WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));
373
374 if (MI->isInlineAsm())
375 return std::max(WaitStates, checkInlineAsmHazards(MI));
376
377 if (isSGetReg(MI->getOpcode()))
378 return std::max(WaitStates, checkGetRegHazards(MI));
379
380 if (isSSetReg(MI->getOpcode()))
381 return std::max(WaitStates, checkSetRegHazards(MI));
382
383 if (isRFE(MI->getOpcode()))
384 return std::max(WaitStates, checkRFEHazards(MI));
385
386 if ((ST.hasReadM0MovRelInterpHazard() &&
387 (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
388 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
389 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
390 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
391 (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
392 (ST.hasReadM0LdsDirectHazard() &&
393 MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr)))
394 return std::max(WaitStates, checkReadM0Hazards(MI));
395
397 return std::max(WaitStates, checkMAIHazards(MI));
398
400 return std::max(WaitStates, checkMAILdStHazards(MI));
401
402 if (ST.hasGFX950Insts() && isPermlane(*MI))
403 return std::max(WaitStates, checkPermlaneHazards(MI));
404
405 return WaitStates;
406}
407
409 EmittedInstrs.push_front(nullptr);
410}
411
413 // When the scheduler detects a stall, it will call AdvanceCycle() without
414 // emitting any instructions.
415 if (!CurrCycleInstr) {
416 EmittedInstrs.push_front(nullptr);
417 return;
418 }
419
420 if (CurrCycleInstr->isBundle()) {
421 processBundle();
422 return;
423 }
424
425 unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
426 if (!NumWaitStates) {
427 CurrCycleInstr = nullptr;
428 return;
429 }
430
431 // Keep track of emitted instructions
432 EmittedInstrs.push_front(CurrCycleInstr);
433
434 // Add a nullptr for each additional wait state after the first. Make sure
435 // not to add more than getMaxLookAhead() items to the list, since we
436 // truncate the list to that size right after this loop.
437 for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
438 i < e; ++i) {
439 EmittedInstrs.push_front(nullptr);
440 }
441
442 // getMaxLookahead() is the largest number of wait states we will ever need
443 // to insert, so there is no point in keeping track of more than that many
444 // wait states.
445 EmittedInstrs.resize(getMaxLookAhead());
446
447 CurrCycleInstr = nullptr;
448}
449
451 assert(!IsHazardRecognizerMode &&
452 "Bottom-up scheduling shouldn't run in hazard recognizer mode");
453}
454
455//===----------------------------------------------------------------------===//
456// Helper Functions
457//===----------------------------------------------------------------------===//
458
460
461// Search for a hazard in a block and its predecessors.
462template <typename StateT>
463static bool
464hasHazard(StateT InitialState,
465 function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
466 function_ref<void(StateT &, const MachineInstr &)> UpdateState,
467 const MachineBasicBlock *InitialMBB,
469 struct StateMapKey {
471 unsigned Idx;
472 static bool isEqual(const StateMapKey &LHS, const StateMapKey &RHS) {
473 return LHS.States == RHS.States && LHS.Idx == RHS.Idx;
474 }
475 };
476 struct StateMapKeyTraits : DenseMapInfo<StateMapKey> {
477 static inline StateMapKey getEmptyKey() {
478 return {static_cast<SmallVectorImpl<StateT> *>(
481 }
482 static inline StateMapKey getTombstoneKey() {
483 return {static_cast<SmallVectorImpl<StateT> *>(
486 }
487 static unsigned getHashValue(const StateMapKey &Key) {
488 return StateT::getHashValue((*Key.States)[Key.Idx]);
489 }
490 static unsigned getHashValue(const StateT &State) {
491 return StateT::getHashValue(State);
492 }
493 static bool isEqual(const StateMapKey &LHS, const StateMapKey &RHS) {
494 const auto EKey = getEmptyKey();
495 const auto TKey = getTombstoneKey();
496 if (StateMapKey::isEqual(LHS, EKey) || StateMapKey::isEqual(RHS, EKey) ||
497 StateMapKey::isEqual(LHS, TKey) || StateMapKey::isEqual(RHS, TKey))
498 return StateMapKey::isEqual(LHS, RHS);
499 return StateT::isEqual((*LHS.States)[LHS.Idx], (*RHS.States)[RHS.Idx]);
500 }
501 static bool isEqual(const StateT &LHS, const StateMapKey &RHS) {
502 if (StateMapKey::isEqual(RHS, getEmptyKey()) ||
503 StateMapKey::isEqual(RHS, getTombstoneKey()))
504 return false;
505 return StateT::isEqual(LHS, (*RHS.States)[RHS.Idx]);
506 }
507 };
508
511
513 const MachineBasicBlock *MBB = InitialMBB;
514 StateT State = InitialState;
515
517 unsigned WorkIdx = 0;
518 for (;;) {
519 bool Expired = false;
520 for (auto E = MBB->instr_rend(); I != E; ++I) {
521 // No need to look at parent BUNDLE instructions.
522 if (I->isBundle())
523 continue;
524
525 auto Result = IsHazard(State, *I);
526 if (Result == HazardFound)
527 return true;
528 if (Result == HazardExpired) {
529 Expired = true;
530 break;
531 }
532
533 if (I->isInlineAsm() || I->isMetaInstruction())
534 continue;
535
536 UpdateState(State, *I);
537 }
538
539 if (!Expired) {
540 unsigned StateIdx = States.size();
541 StateMapKey Key = {&States, StateIdx};
542 auto Insertion = StateMap.insert_as(std::pair(Key, StateIdx), State);
543 if (Insertion.second) {
544 States.emplace_back(State);
545 } else {
546 StateIdx = Insertion.first->second;
547 }
548 for (MachineBasicBlock *Pred : MBB->predecessors())
549 Worklist.insert(std::pair(Pred, StateIdx));
550 }
551
552 if (WorkIdx == Worklist.size())
553 break;
554
555 unsigned StateIdx;
556 std::tie(MBB, StateIdx) = Worklist[WorkIdx++];
557 State = States[StateIdx];
558 I = MBB->instr_rbegin();
559 }
560
561 return false;
562}
563
564// Returns a minimum wait states since \p I walking all predecessors.
565// Only scans until \p IsExpired does not return true.
566// Can only be run in a hazard recognizer mode.
567static int
569 const MachineBasicBlock *MBB,
571 int WaitStates, GCNHazardRecognizer::IsExpiredFn IsExpired,
575 for (auto E = MBB->instr_rend(); I != E; ++I) {
576 // Don't add WaitStates for parent BUNDLE instructions.
577 if (I->isBundle())
578 continue;
579
580 if (IsHazard(*I))
581 return WaitStates;
582
583 if (I->isInlineAsm())
584 continue;
585
586 WaitStates += GetNumWaitStates(*I);
587
588 if (IsExpired(*I, WaitStates))
589 return std::numeric_limits<int>::max();
590 }
591
592 int MinWaitStates = std::numeric_limits<int>::max();
593 for (MachineBasicBlock *Pred : MBB->predecessors()) {
594 if (!Visited.insert(Pred).second)
595 continue;
596
597 int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
598 IsExpired, Visited, GetNumWaitStates);
599
600 MinWaitStates = std::min(MinWaitStates, W);
601 }
602
603 return MinWaitStates;
604}
605
606static int
608 const MachineInstr *MI,
613 return getWaitStatesSince(IsHazard, MI->getParent(),
614 std::next(MI->getReverseIterator()), 0, IsExpired,
615 Visited, GetNumWaitStates);
616}
617
618int GCNHazardRecognizer::getWaitStatesSince(
619 IsHazardFn IsHazard, int Limit, GetNumWaitStatesFn GetNumWaitStates) const {
620 if (IsHazardRecognizerMode) {
621 auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
622 return WaitStates >= Limit;
623 };
624 return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn,
625 GetNumWaitStates);
626 }
627
628 int WaitStates = 0;
629 for (MachineInstr *MI : EmittedInstrs) {
630 if (MI) {
631 if (IsHazard(*MI))
632 return WaitStates;
633
634 if (MI->isInlineAsm())
635 continue;
636 }
637 WaitStates += MI ? GetNumWaitStates(*MI) : 1;
638
639 if (WaitStates >= Limit)
640 break;
641 }
642 return std::numeric_limits<int>::max();
643}
644
// Convenience overload: walk back using the default per-instruction
// wait-state count from SIInstrInfo.
int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard,
                                            int Limit) const {
  return getWaitStatesSince(IsHazard, Limit, SIInstrInfo::getNumWaitStates);
}
649
650int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
651 IsHazardFn IsHazardDef,
652 int Limit) const {
653 const SIRegisterInfo *TRI = ST.getRegisterInfo();
654
655 auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
656 return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
657 };
658
659 return getWaitStatesSince(IsHazardFn, Limit);
660}
661
662int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
663 int Limit) const {
664 auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
665 return isSSetReg(MI.getOpcode()) && IsHazard(MI);
666 };
667
668 return getWaitStatesSince(IsHazardFn, Limit);
669}
670
671//===----------------------------------------------------------------------===//
672// No-op Hazard Detection
673//===----------------------------------------------------------------------===//
674
675static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
676 MCRegister Reg) {
677 for (MCRegUnit Unit : TRI.regunits(Reg))
678 BV.set(static_cast<unsigned>(Unit));
679}
680
681static void addRegsToSet(const SIRegisterInfo &TRI,
683 BitVector &DefSet, BitVector &UseSet) {
684 for (const MachineOperand &Op : Ops) {
685 if (Op.isReg())
686 addRegUnits(TRI, Op.isDef() ? DefSet : UseSet, Op.getReg().asMCReg());
687 }
688}
689
// Accumulate \p MI's register operands into the current soft-clause
// def/use register-unit sets.
void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) const {
  addRegsToSet(TRI, MI.operands(), ClauseDefs, ClauseUses);
}
693
695 return !SIInstrInfo::isSMRD(*MI);
696}
697
699 return !SIInstrInfo::isVMEM(*MI);
700}
701
702int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) const {
703 // SMEM soft clause are only present on VI+, and only matter if xnack is
704 // enabled.
705 if (!ST.isXNACKEnabled())
706 return 0;
707
708 bool IsSMRD = TII.isSMRD(*MEM);
709
710 resetClause();
711
712 // A soft-clause is any group of consecutive SMEM instructions. The
713 // instructions in this group may return out of order and/or may be
714 // replayed (i.e. the same instruction issued more than once).
715 //
716 // In order to handle these situations correctly we need to make sure that
717 // when a clause has more than one instruction, no instruction in the clause
718 // writes to a register that is read by another instruction in the clause
719 // (including itself). If we encounter this situation, we need to break the
720 // clause by inserting a non SMEM instruction.
721
722 for (MachineInstr *MI : EmittedInstrs) {
723 // When we hit a non-SMEM instruction then we have passed the start of the
724 // clause and we can stop.
725 if (!MI)
726 break;
727
729 break;
730
731 addClauseInst(*MI);
732 }
733
734 if (ClauseDefs.none())
735 return 0;
736
737 // We need to make sure not to put loads and stores in the same clause if they
738 // use the same address. For now, just start a new clause whenever we see a
739 // store.
740 if (MEM->mayStore())
741 return 1;
742
743 addClauseInst(*MEM);
744
745 // If the set of defs and uses intersect then we cannot add this instruction
746 // to the clause, so we have a hazard.
747 return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
748}
749
// Returns the number of wait states \p SMRD needs before it can issue:
// soft-clause breaks plus (on SI only) the SGPR-read-after-VALU-write hazard.
int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) const {
  int WaitStatesNeeded = 0;

  WaitStatesNeeded = checkSoftClauseHazards(SMRD);

  // This SMRD hazard only affects SI.
  if (!ST.hasSMRDReadVALUDefHazard())
    return WaitStatesNeeded;

  // A read of an SGPR by SMRD instruction requires 4 wait states when the
  // SGPR was written by a VALU instruction.
  int SmrdSgprWaitStates = 4;
  auto IsHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isVALU(MI);
  };
  auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isSALU(MI);
  };

  bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);

  // Check every register use against recent VALU (and, for buffer SMRDs,
  // SALU) definitions; the largest remaining gap wins.
  for (const MachineOperand &Use : SMRD->uses()) {
    if (!Use.isReg())
      continue;
    int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   SmrdSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    // This fixes what appears to be undocumented hardware behavior in SI where
    // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
    // needs some number of nops in between. We don't know how many we need, but
    // let's use 4. This wasn't discovered before probably because the only
    // case when this happens is when we expand a 64-bit pointer into a full
    // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
    // probably never encountered in the closed-source land.
    if (IsBufferSMRD) {
      int WaitStatesNeededForUse =
          SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
                                                     IsBufferHazardDefFn,
                                                     SmrdSgprWaitStates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
    }
  }

  return WaitStatesNeeded;
}
797
798int GCNHazardRecognizer::checkVMEMHazards(MachineInstr *VMEM) const {
799 if (!ST.hasVMEMReadSGPRVALUDefHazard())
800 return 0;
801
802 int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
803
804 // A read of an SGPR by a VMEM instruction requires 5 wait states when the
805 // SGPR was written by a VALU Instruction.
806 const int VmemSgprWaitStates = 5;
807 auto IsHazardDefFn = [this](const MachineInstr &MI) {
808 return TII.isVALU(MI);
809 };
810 for (const MachineOperand &Use : VMEM->uses()) {
811 if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
812 continue;
813
814 int WaitStatesNeededForUse =
815 VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
816 VmemSgprWaitStates);
817 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
818 }
819 return WaitStatesNeeded;
820}
821
// Returns the number of wait states \p DPP needs before it can issue,
// covering VGPR reads after any recent VGPR write and reads after an EXEC
// write by a VALU.
int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) const {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  // Check for DPP VGPR read after VALU VGPR write and EXEC write.
  int DppVgprWaitStates = 2;
  int DppExecWaitStates = 5;
  int WaitStatesNeeded = 0;
  auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    return TII->isVALU(MI);
  };

  for (const MachineOperand &Use : DPP->uses()) {
    if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
      continue;
    // Any prior def of the VGPR counts, hence the always-true predicate.
    int WaitStatesNeededForUse =
        DppVgprWaitStates - getWaitStatesSinceDef(
                                Use.getReg(),
                                [](const MachineInstr &) { return true; },
                                DppVgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  WaitStatesNeeded = std::max(
      WaitStatesNeeded,
      DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
                                                DppExecWaitStates));

  return WaitStatesNeeded;
}
852
853int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) const {
854 const SIInstrInfo *TII = ST.getInstrInfo();
855
856 // v_div_fmas requires 4 wait states after a write to vcc from a VALU
857 // instruction.
858 const int DivFMasWaitStates = 4;
859 auto IsHazardDefFn = [TII](const MachineInstr &MI) {
860 return TII->isVALU(MI);
861 };
862 int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
863 DivFMasWaitStates);
864
865 return DivFMasWaitStates - WaitStatesNeeded;
866}
867
868int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) const {
869 const SIInstrInfo *TII = ST.getInstrInfo();
870 unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
871
872 const int GetRegWaitStates = 2;
873 auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
874 return GetRegHWReg == getHWReg(TII, MI);
875 };
876 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
877
878 return GetRegWaitStates - WaitStatesNeeded;
879}
880
881int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) const {
882 const SIInstrInfo *TII = ST.getInstrInfo();
883 unsigned HWReg = getHWReg(TII, *SetRegInstr);
884
885 const int SetRegWaitStates = ST.getSetRegWaitStates();
886 auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
887 return HWReg == getHWReg(TII, MI);
888 };
889 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
890 return SetRegWaitStates - WaitStatesNeeded;
891}
892
// If \p MI is a store whose store-data operand participates in the
// VALU-write-after-store-data hazard, return the index of that vdata
// operand; otherwise return -1.
int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) const {
  // Only stores can expose this hazard.
  if (!MI.mayStore())
    return -1;

  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned Opcode = MI.getOpcode();
  const MCInstrDesc &Desc = MI.getDesc();

  // Look up the store-data operand and its register-class width (if any).
  int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
  int VDataRCID = -1;
  if (VDataIdx != -1)
    VDataRCID = TII->getOpRegClassID(Desc.operands()[VDataIdx]);

  if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
    // There is no hazard if the instruction does not use vector regs
    // (like wbinvl1)
    if (VDataIdx == -1)
      return -1;
    // For MUBUF/MTBUF instructions this hazard only exists if the
    // instruction is not using a register in the soffset field.
    const MachineOperand *SOffset =
        TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
    // If we have no soffset operand, then assume this field has been
    // hardcoded to zero.
    if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
        (!SOffset || !SOffset->isReg()))
      return VDataIdx;
  }

  // MIMG instructions create a hazard if they don't use a 256-bit T# and
  // the store size is greater than 8 bytes and they have more than two bits
  // of their dmask set.
  // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
  if (TII->isMIMG(MI)) {
    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
    assert(SRsrcIdx != -1 && AMDGPU::getRegBitWidth(TII->getOpRegClassID(
                                 Desc.operands()[SRsrcIdx])) == 256);
    (void)SRsrcIdx;
  }

  if (TII->isFLAT(MI)) {
    // There is no hazard if the instruction does not use vector regs
    if (VDataIdx == -1)
      return -1;

    // Wide (> 64-bit) store data is the hazardous case.
    if (AMDGPU::getRegBitWidth(VDataRCID) > 64)
      return VDataIdx;
  }

  return -1;
}
944
// Returns the number of wait states needed before a VALU def \p Def, to
// avoid clobbering the store data of a recent wide VMEM store.
int GCNHazardRecognizer::checkVALUHazardsHelper(
    const MachineOperand &Def, const MachineRegisterInfo &MRI) const {
  // Helper to check for the hazard where VMEM instructions that store more
  // than 8 bytes can have their store data overwritten by the next
  // instruction.
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
  int WaitStatesNeeded = 0;

  // Only vector-register defs can overwrite VMEM store data.
  if (!TRI->isVectorRegister(MRI, Def.getReg()))
    return WaitStatesNeeded;
  Register Reg = Def.getReg();
  // Hazard exists if a recent store's vdata operand overlaps this def.
  auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
    int DataIdx = createsVALUHazard(MI);
    return DataIdx >= 0 &&
           TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
  };

  int WaitStatesNeededForDef =
      VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  return WaitStatesNeeded;
}
969
970/// Dest sel forwarding issue occurs if additional logic is needed to swizzle /
971/// pack the computed value into correct bit position of the dest register. This
972/// occurs if we have SDWA with dst_sel != DWORD or if we have op_sel with
973/// dst_sel that is not aligned to the register. This function analayzes the \p
974/// MI and \returns an operand with dst forwarding issue, or nullptr if
975/// none exists.
976static const MachineOperand *
979 return nullptr;
980
981 const SIInstrInfo *TII = ST.getInstrInfo();
982
983 unsigned Opcode = MI.getOpcode();
984
985 // There are three different types of instructions
986 // which produce forwarded dest: 1. SDWA with dst_sel != DWORD, 2. VOP3
987 // which write hi bits (e.g. op_sel[3] == 1), and 3. FP8DstSelInst
988 // (instructions with dest byte sel, e.g. CVT_SR_BF8_F32) and
989 // op_sel[3:2]
990 // != 0
991 if (SIInstrInfo::isSDWA(MI)) {
992 // Type 1: SDWA with dst_sel != DWORD
993 if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
994 if (DstSel->getImm() != AMDGPU::SDWA::DWORD)
995 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
996 }
997
998 AMDGPU::FPType IsFP4OrFP8ConvOpc = AMDGPU::getFPDstSelType(Opcode);
999 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel)) {
1000 // Type 2: VOP3 which write the hi bits
1001 if (TII->getNamedImmOperand(MI, AMDGPU::OpName::src0_modifiers) &
1003 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
1004
1005 // Type 3: FP8DstSelInst with op_sel[3:2] != 0)
1006 if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP8 &&
1007 (TII->getNamedImmOperand(MI, AMDGPU::OpName::src2_modifiers) &
1009 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
1010 }
1011
1012 // Special case: nop is required for all the opsel values for fp4 sr variant
1013 // cvt scale instructions
1014 if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP4)
1015 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
1016
1017 return nullptr;
1018}
1019
1020/// Checks whether the provided \p MI "consumes" the operand with a Dest sel
1021/// fowarding issue \p Dst . We may "consume" the Dst via a standard explicit
1022/// RAW, or through irregular ways (e.g implicit RAW, certain types of WAW)
1024 const MachineOperand *Dst,
1025 const SIRegisterInfo *TRI) {
1026 // We must consider implicit reads of the VALU. SDWA with dst_sel and
1027 // UNUSED_PRESERVE will implicitly read the result from forwarded dest,
1028 // and we must account for that hazard.
1029 // We also must account for WAW hazards. In particular, WAW with dest
1030 // preserve semantics (e.g. VOP3 with op_sel, VOP2 &&
1031 // !zeroesHigh16BitsOfDest) will read the forwarded dest for parity
1032 // check for ECC. Without accounting for this hazard, the ECC will be
1033 // wrong.
1034 // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e.
1035 // complete zeroesHigh16BitsOfDest)
1036 for (auto &Operand : VALU->operands()) {
1037 if (Operand.isReg() && TRI->regsOverlap(Dst->getReg(), Operand.getReg())) {
1038 return true;
1039 }
1040 }
1041 return false;
1042}
1043
1044int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) const {
1045 int WaitStatesNeeded = 0;
1046
1047 if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(*VALU)) {
1048 const int TransDefWaitstates = 1;
1049
1050 auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
1052 return false;
1053 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1054 const SIInstrInfo *TII = ST.getInstrInfo();
1055 Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();
1056
1057 for (const MachineOperand &Use : VALU->explicit_uses()) {
1058 if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
1059 return true;
1060 }
1061
1062 return false;
1063 };
1064
1065 int WaitStatesNeededForDef =
1066 TransDefWaitstates -
1067 getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
1068 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1069 }
1070
1071 if (ST.hasDstSelForwardingHazard() || ST.hasCvtScaleForwardingHazard()) {
1072 const int Shift16DefWaitstates = 1;
1073
1074 auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) {
1075 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1076 const MachineOperand *ForwardedDst =
1077 getDstSelForwardingOperand(ProducerMI, ST);
1078 if (ForwardedDst) {
1079 return consumesDstSelForwardingOperand(VALU, ForwardedDst, TRI);
1080 }
1081
1082 if (ProducerMI.isInlineAsm()) {
1083 // Assume inline asm has dst forwarding hazard
1084 for (auto &Def : ProducerMI.all_defs()) {
1085 if (consumesDstSelForwardingOperand(VALU, &Def, TRI))
1086 return true;
1087 }
1088 }
1089
1090 return false;
1091 };
1092
1093 int WaitStatesNeededForDef =
1094 Shift16DefWaitstates -
1095 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1096 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1097 }
1098
1099 if (ST.hasVDecCoExecHazard()) {
1100 const int VALUWriteSGPRVALUReadWaitstates = 2;
1101 const int VALUWriteEXECRWLane = 4;
1102 const int VALUWriteVGPRReadlaneRead = 1;
1103
1104 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1105 const MachineRegisterInfo &MRI = MF.getRegInfo();
1107 auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
1108 if (!SIInstrInfo::isVALU(MI))
1109 return false;
1110 return MI.modifiesRegister(UseReg, TRI);
1111 };
1112
1113 for (const MachineOperand &Use : VALU->explicit_uses()) {
1114 if (!Use.isReg())
1115 continue;
1116
1117 UseReg = Use.getReg();
1118 if (TRI->isSGPRReg(MRI, UseReg)) {
1119 int WaitStatesNeededForDef =
1120 VALUWriteSGPRVALUReadWaitstates -
1121 getWaitStatesSince(IsVALUDefSGPRFn,
1122 VALUWriteSGPRVALUReadWaitstates);
1123 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1124 }
1125 }
1126
1127 if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
1128 UseReg = AMDGPU::VCC;
1129 int WaitStatesNeededForDef =
1130 VALUWriteSGPRVALUReadWaitstates -
1131 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
1132 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1133 }
1134
1135 switch (VALU->getOpcode()) {
1136 case AMDGPU::V_READLANE_B32:
1137 case AMDGPU::V_READFIRSTLANE_B32: {
1138 MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
1139 UseReg = Src->getReg();
1140 int WaitStatesNeededForDef =
1141 VALUWriteVGPRReadlaneRead -
1142 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
1143 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1144 }
1145 [[fallthrough]];
1146 case AMDGPU::V_WRITELANE_B32: {
1147 UseReg = AMDGPU::EXEC;
1148 int WaitStatesNeededForDef =
1149 VALUWriteEXECRWLane -
1150 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
1151 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1152 break;
1153 }
1154 default:
1155 break;
1156 }
1157 }
1158
1159 // This checks for the hazard where VMEM instructions that store more than
1160 // 8 bytes can have there store data over written by the next instruction.
1161 if (!ST.has12DWordStoreHazard())
1162 return WaitStatesNeeded;
1163
1164 const MachineRegisterInfo &MRI = MF.getRegInfo();
1165
1166 for (const MachineOperand &Def : VALU->defs()) {
1167 WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
1168 }
1169
1170 return WaitStatesNeeded;
1171}
1172
1173int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) const {
1174 // This checks for hazards associated with inline asm statements.
1175 // Since inline asms can contain just about anything, we use this
1176 // to call/leverage other check*Hazard routines. Note that
1177 // this function doesn't attempt to address all possible inline asm
1178 // hazards (good luck), but is a collection of what has been
1179 // problematic thus far.
1180
1181 // see checkVALUHazards()
1182 if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard() &&
1183 !ST.hasCvtScaleForwardingHazard())
1184 return 0;
1185
1186 const MachineRegisterInfo &MRI = MF.getRegInfo();
1187 int WaitStatesNeeded = 0;
1188
1189 for (const MachineOperand &Op :
1191 if (Op.isReg() && Op.isDef()) {
1192 if (!TRI.isVectorRegister(MRI, Op.getReg()))
1193 continue;
1194
1195 if (ST.has12DWordStoreHazard()) {
1196 WaitStatesNeeded =
1197 std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
1198 }
1199 }
1200 }
1201
1202 if (ST.hasDstSelForwardingHazard()) {
1203 const int Shift16DefWaitstates = 1;
1204
1205 auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) {
1206 const MachineOperand *Dst = getDstSelForwardingOperand(ProducerMI, ST);
1207 // Assume inline asm reads the dst
1208 if (Dst)
1209 return IA->modifiesRegister(Dst->getReg(), &TRI) ||
1210 IA->readsRegister(Dst->getReg(), &TRI);
1211
1212 if (ProducerMI.isInlineAsm()) {
1213 // If MI is inline asm, assume it has dst forwarding hazard
1214 for (auto &Def : ProducerMI.all_defs()) {
1215 if (IA->modifiesRegister(Def.getReg(), &TRI) ||
1216 IA->readsRegister(Def.getReg(), &TRI)) {
1217 return true;
1218 }
1219 }
1220 }
1221
1222 return false;
1223 };
1224
1225 int WaitStatesNeededForDef =
1226 Shift16DefWaitstates -
1227 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1228 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1229 }
1230
1231 return WaitStatesNeeded;
1232}
1233
1234int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) const {
1235 const SIInstrInfo *TII = ST.getInstrInfo();
1236 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1237 const MachineRegisterInfo &MRI = MF.getRegInfo();
1238
1239 const MachineOperand *LaneSelectOp =
1240 TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
1241
1242 if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
1243 return 0;
1244
1245 Register LaneSelectReg = LaneSelectOp->getReg();
1246 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };
1247
1248 const int RWLaneWaitStates = 4;
1249 int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
1250 RWLaneWaitStates);
1251 return RWLaneWaitStates - WaitStatesSince;
1252}
1253
1254int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) const {
1255 if (!ST.hasRFEHazards())
1256 return 0;
1257
1258 const SIInstrInfo *TII = ST.getInstrInfo();
1259
1260 const int RFEWaitStates = 1;
1261
1262 auto IsHazardFn = [TII](const MachineInstr &MI) {
1263 return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
1264 };
1265 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
1266 return RFEWaitStates - WaitStatesNeeded;
1267}
1268
1269int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) const {
1270 const SIInstrInfo *TII = ST.getInstrInfo();
1271 const int ReadM0WaitStates = 1;
1272 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
1273 return ReadM0WaitStates -
1274 getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
1275}
1276
1277void GCNHazardRecognizer::emitVNops(MachineBasicBlock &MBB,
1279 int WaitStatesNeeded, bool IsHoisting) {
1280 const DebugLoc &DL = IsHoisting ? DebugLoc() : InsertPt->getDebugLoc();
1281 for (int I = 0; I < WaitStatesNeeded; ++I)
1282 BuildMI(MBB, InsertPt, DL, TII.get(AMDGPU::V_NOP_e32));
1283}
1284
void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
  // Dispatch MI through every fixup routine; each one inserts the waits/NOPs
  // it needs (gated, where visible here, on subtarget feature checks).
  fixVMEMtoScalarWriteHazards(MI);
  fixVcmpxPermlaneHazards(MI);
  fixSMEMtoVectorWriteHazards(MI);
  fixVcmpxExecWARHazard(MI);
  fixLdsBranchVmemWARHazard(MI);
  if (ST.hasLdsDirect()) {
    // LDSDIR hazards only exist on subtargets with LDS-direct loads.
    fixLdsDirectVALUHazard(MI);
    fixLdsDirectVMEMHazard(MI);
  }
  fixVALUPartialForwardingHazard(MI);
  fixVALUTransUseHazard(MI);
  fixVALUTransCoexecutionHazards(MI);
  fixWMMAHazards(MI); // fall-through if co-execution is enabled.
  fixWMMACoexecutionHazards(MI);
  fixShift64HighRegBug(MI);
  fixVALUMaskWriteHazard(MI);
  fixRequiredExportPriority(MI);
  if (ST.requiresWaitIdleBeforeGetReg())
    fixGetRegWaitIdle(MI);
  if (ST.hasDsAtomicAsyncBarrierArriveB64PipeBug())
    fixDsAtomicAsyncBarrierArriveB64(MI);
  if (ST.hasScratchBaseForwardingHazard())
    fixScratchBaseForwardingHazard(MI);
  if (ST.setRegModeNeedsVNOPs())
    fixSetRegMode(MI);
}
1312
1314 const MachineInstr &MI) {
1315 return (TII.isVOPC(MI) ||
1316 (MI.isCompare() && (TII.isVOP3(MI) || TII.isSDWA(MI)))) &&
1317 MI.modifiesRegister(AMDGPU::EXEC, &TRI);
1318}
1319
1320bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
1321 if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
1322 return false;
1323
1324 const SIInstrInfo *TII = ST.getInstrInfo();
1325 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1326 auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
1327 return isVCmpXWritesExec(*TII, *TRI, MI);
1328 };
1329
1330 auto IsExpiredFn = [](const MachineInstr &MI, int) {
1331 unsigned Opc = MI.getOpcode();
1332 return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
1333 Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
1334 };
1335
1336 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1337 std::numeric_limits<int>::max())
1338 return false;
1339
1340 // V_NOP will be discarded by SQ.
1341 // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
1342 // which is always a VGPR and available.
1343 auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
1344 Register Reg = Src0->getReg();
1345 bool IsUndef = Src0->isUndef();
1346 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1347 TII->get(AMDGPU::V_MOV_B32_e32))
1350
1351 return true;
1352}
1353
1354bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
1355 if (!ST.hasVMEMtoScalarWriteHazard())
1356 return false;
1357 assert(!ST.hasExtendedWaitCounts());
1358
1360 return false;
1361
1362 if (MI->getNumDefs() == 0)
1363 return false;
1364
1365 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1366
1367 auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
1369 return false;
1370
1371 for (const MachineOperand &Def : MI->defs()) {
1372 const MachineOperand *Op =
1373 I.findRegisterUseOperand(Def.getReg(), TRI, false);
1374 if (!Op)
1375 continue;
1376 return true;
1377 }
1378 return false;
1379 };
1380
1381 auto IsExpiredFn = [](const MachineInstr &MI, int) {
1382 return SIInstrInfo::isVALU(MI) ||
1383 (MI.getOpcode() == AMDGPU::S_WAITCNT &&
1384 !MI.getOperand(0).getImm()) ||
1385 (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1386 AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0);
1387 };
1388
1389 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1390 std::numeric_limits<int>::max())
1391 return false;
1392
1393 const SIInstrInfo *TII = ST.getInstrInfo();
1394 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1395 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1397 return true;
1398}
1399
bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
  if (!ST.hasSMEMtoVectorWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  // Identify which operand of MI carries the scalar destination: readlane
  // variants name it vdst, everything else sdst.
  AMDGPU::OpName SDSTName;
  switch (MI->getOpcode()) {
  case AMDGPU::V_READLANE_B32:
  case AMDGPU::V_READFIRSTLANE_B32:
    SDSTName = AMDGPU::OpName::vdst;
    break;
  default:
    SDSTName = AMDGPU::OpName::sdst;
    break;
  }

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
  const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
  if (!SDST) {
    // No named scalar dst: fall back to the first implicit SGPR def (e.g.
    // VCC from a carry-writing instruction).
    for (const auto &MO : MI->implicit_operands()) {
      if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
        SDST = &MO;
        break;
      }
    }
  }

  if (!SDST)
    return false;

  // Hazard: an earlier SMEM load still reads the SGPR this VALU writes.
  const Register SDSTReg = SDST->getReg();
  auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
    return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
  };

  auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
    if (TII->isSALU(MI)) {
      switch (MI.getOpcode()) {
      case AMDGPU::S_SETVSKIP:
      case AMDGPU::S_VERSION:
      case AMDGPU::S_WAITCNT_VSCNT:
      case AMDGPU::S_WAITCNT_VMCNT:
      case AMDGPU::S_WAITCNT_EXPCNT:
        // These instructions cannot mitigate the hazard.
        return false;
      case AMDGPU::S_WAITCNT_LGKMCNT:
        // Reducing lgkmcnt count to 0 always mitigates the hazard.
        return (MI.getOperand(1).getImm() == 0) &&
               (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
      case AMDGPU::S_WAITCNT: {
        const int64_t Imm = MI.getOperand(0).getImm();
        AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
        // DsCnt corresponds to LGKMCnt here.
        return Decoded.get(AMDGPU::DS_CNT) == 0;
      }
      default:
        assert((!SIInstrInfo::isWaitcnt(MI.getOpcode()) ||
                MI.getOpcode() == AMDGPU::S_WAIT_IDLE) &&
               "unexpected wait count instruction");
        // SOPP instructions cannot mitigate the hazard.
        if (TII->isSOPP(MI))
          return false;
        // At this point the SALU can be assumed to mitigate the hazard
        // because either:
        // (a) it is independent of the at risk SMEM (breaking chain),
        // or
        // (b) it is dependent on the SMEM, in which case an appropriate
        //     s_waitcnt lgkmcnt _must_ exist between it and the at risk
        //     SMEM instruction.
        return true;
      }
    }
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  // Insert a benign SALU (s_mov to null) to break the SMEM->SGPR-write chain.
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
      .addImm(0);
  return true;
}
1489
1490bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1491 if (!ST.hasVcmpxExecWARHazard())
1492 return false;
1493 assert(!ST.hasExtendedWaitCounts());
1494
1495 if (!SIInstrInfo::isVALU(*MI))
1496 return false;
1497
1498 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1499 if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
1500 return false;
1501
1502 auto IsHazardFn = [TRI](const MachineInstr &I) {
1504 return false;
1505 return I.readsRegister(AMDGPU::EXEC, TRI);
1506 };
1507
1508 const SIInstrInfo *TII = ST.getInstrInfo();
1509 auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
1510 if (SIInstrInfo::isVALU(MI)) {
1511 if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
1512 return true;
1513 for (auto MO : MI.implicit_operands())
1514 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
1515 return true;
1516 }
1517 if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1518 AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0)
1519 return true;
1520 return false;
1521 };
1522
1523 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1524 std::numeric_limits<int>::max())
1525 return false;
1526
1527 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1528 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1530 return true;
1531}
1532
1534 const GCNSubtarget &ST) {
1535 if (!ST.hasLdsBranchVmemWARHazard())
1536 return false;
1537
1538 // Check if the necessary condition for the hazard is met: both LDS and VMEM
1539 // instructions need to appear in the same function.
1540 bool HasLds = false;
1541 bool HasVmem = false;
1542 for (auto &MBB : MF) {
1543 for (auto &MI : MBB) {
1545 HasVmem |= SIInstrInfo::isVMEM(MI);
1546 if (HasLds && HasVmem)
1547 return true;
1548 }
1549 }
1550 return false;
1551}
1552
1554 return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1555 I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1556 !I.getOperand(1).getImm();
1557}
1558
1559bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1560 if (!RunLdsBranchVmemWARHazardFixup)
1561 return false;
1562
1563 assert(ST.hasLdsBranchVmemWARHazard());
1564 assert(!ST.hasExtendedWaitCounts());
1565
1566 auto IsHazardInst = [](const MachineInstr &MI) {
1568 return 1;
1570 return 2;
1571 return 0;
1572 };
1573
1574 auto InstType = IsHazardInst(*MI);
1575 if (!InstType)
1576 return false;
1577
1578 auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
1579 return IsHazardInst(I) || isStoreCountWaitZero(I);
1580 };
1581
1582 auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
1583 if (!I.isBranch())
1584 return false;
1585
1586 auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
1587 auto InstType2 = IsHazardInst(I);
1588 return InstType2 && InstType != InstType2;
1589 };
1590
1591 auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
1592 auto InstType2 = IsHazardInst(I);
1593 if (InstType == InstType2)
1594 return true;
1595
1596 return isStoreCountWaitZero(I);
1597 };
1598
1599 return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
1600 std::numeric_limits<int>::max();
1601 };
1602
1603 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1604 std::numeric_limits<int>::max())
1605 return false;
1606
1607 const SIInstrInfo *TII = ST.getInstrInfo();
1608 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1609 TII->get(AMDGPU::S_WAITCNT_VSCNT))
1610 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1611 .addImm(0);
1612
1613 return true;
1614}
1615
1616bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
1618 return false;
1619
1620 const int NoHazardWaitStates = 15;
1621 const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1622 const Register VDSTReg = VDST->getReg();
1623
1624 bool VisitedTrans = false;
1625 auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
1626 if (!SIInstrInfo::isVALU(I))
1627 return false;
1628 VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I);
1629 // Cover both WAR and WAW
1630 return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1631 };
1632 auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
1633 if (WaitStates >= NoHazardWaitStates)
1634 return true;
1635 // Instructions which cause va_vdst==0 expire hazard
1638 };
1639 auto GetWaitStatesFn = [](const MachineInstr &MI) {
1640 return SIInstrInfo::isVALU(MI) ? 1 : 0;
1641 };
1642
1643 DenseSet<const MachineBasicBlock *> Visited;
1644 auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
1645 std::next(MI->getReverseIterator()), 0,
1646 IsExpiredFn, Visited, GetWaitStatesFn);
1647
1648 // Transcendentals can execute in parallel to other VALUs.
1649 // This makes va_vdst count unusable with a mixture of VALU and TRANS.
1650 if (VisitedTrans)
1651 Count = 0;
1652
1653 MachineOperand *WaitVdstOp =
1654 TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
1655 WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));
1656
1657 return true;
1658}
1659
1660bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
1662 return false;
1663
1664 const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1665 const Register VDSTReg = VDST->getReg();
1666
1667 auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
1669 return false;
1670 return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1671 };
1672 bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
1673 // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT
1674 // according to the type of VMEM instruction.
1675 auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {
1677 (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
1678 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1679 AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0) ||
1680 (LdsdirCanWait && SIInstrInfo::isLDSDIR(I) &&
1681 !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm());
1682 };
1683
1684 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1685 std::numeric_limits<int>::max())
1686 return false;
1687
1688 if (LdsdirCanWait) {
1689 TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0);
1690 } else {
1691 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1692 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1694 }
1695
1696 return true;
1697}
1698
1699bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
1700 if (!ST.hasVALUPartialForwardingHazard())
1701 return false;
1702 assert(!ST.hasExtendedWaitCounts());
1703
1704 if (!ST.isWave64() || !SIInstrInfo::isVALU(*MI))
1705 return false;
1706
1707 SmallSetVector<Register, 4> SrcVGPRs;
1708
1709 for (const MachineOperand &Use : MI->explicit_uses()) {
1710 if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1711 SrcVGPRs.insert(Use.getReg());
1712 }
1713
1714 // Only applies with >= 2 unique VGPR sources
1715 if (SrcVGPRs.size() <= 1)
1716 return false;
1717
1718 // Look for the following pattern:
1719 // Va <- VALU [PreExecPos]
1720 // intv1
1721 // Exec <- SALU [ExecPos]
1722 // intv2
1723 // Vb <- VALU [PostExecPos]
1724 // intv3
1725 // MI Va, Vb (WaitState = 0)
1726 //
1727 // Where:
1728 // intv1 + intv2 <= 2 VALUs
1729 // intv3 <= 4 VALUs
1730 //
1731 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1732
1733 const int Intv1plus2MaxVALUs = 2;
1734 const int Intv3MaxVALUs = 4;
1735 const int IntvMaxVALUs = 6;
1736 const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
1737
1738 struct StateType {
1739 SmallDenseMap<Register, int, 4> DefPos;
1740 int ExecPos = std::numeric_limits<int>::max();
1741 int VALUs = 0;
1742
1743 static unsigned getHashValue(const StateType &State) {
1744 return hash_combine(State.ExecPos, State.VALUs,
1745 hash_combine_range(State.DefPos));
1746 }
1747 static bool isEqual(const StateType &LHS, const StateType &RHS) {
1748 return LHS.DefPos == RHS.DefPos && LHS.ExecPos == RHS.ExecPos &&
1749 LHS.VALUs == RHS.VALUs;
1750 }
1751 };
1752
1753 StateType State;
1754
1755 // This overloads expiry testing with all the hazard detection
1756 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1757 // Too many VALU states have passed
1758 if (State.VALUs > NoHazardVALUWaitStates)
1759 return HazardExpired;
1760
1761 // Instructions which cause va_vdst==0 expire hazard
1764 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1765 AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
1766 return HazardExpired;
1767
1768 // Track registers writes
1769 bool Changed = false;
1770 if (SIInstrInfo::isVALU(I)) {
1771 for (Register Src : SrcVGPRs) {
1772 if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
1773 State.DefPos[Src] = State.VALUs;
1774 Changed = true;
1775 }
1776 }
1777 } else if (SIInstrInfo::isSALU(I)) {
1778 if (State.ExecPos == std::numeric_limits<int>::max()) {
1779 if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
1780 State.ExecPos = State.VALUs;
1781 Changed = true;
1782 }
1783 }
1784 }
1785
1786 // Early expiration: too many VALUs in intv3
1787 if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
1788 return HazardExpired;
1789
1790 // Only evaluate state if something changed
1791 if (!Changed)
1792 return NoHazardFound;
1793
1794 // Determine positions of VALUs pre/post exec change
1795 if (State.ExecPos == std::numeric_limits<int>::max())
1796 return NoHazardFound;
1797
1798 int PreExecPos = std::numeric_limits<int>::max();
1799 int PostExecPos = std::numeric_limits<int>::max();
1800
1801 for (auto Entry : State.DefPos) {
1802 int DefVALUs = Entry.second;
1803 if (DefVALUs != std::numeric_limits<int>::max()) {
1804 if (DefVALUs >= State.ExecPos)
1805 PreExecPos = std::min(PreExecPos, DefVALUs);
1806 else
1807 PostExecPos = std::min(PostExecPos, DefVALUs);
1808 }
1809 }
1810
1811 // Need a VALUs post exec change
1812 if (PostExecPos == std::numeric_limits<int>::max())
1813 return NoHazardFound;
1814
1815 // Too many VALUs in intv3?
1816 int Intv3VALUs = PostExecPos;
1817 if (Intv3VALUs > Intv3MaxVALUs)
1818 return HazardExpired;
1819
1820 // Too many VALUs in intv2?
1821 int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
1822 if (Intv2VALUs > Intv1plus2MaxVALUs)
1823 return HazardExpired;
1824
1825 // Need a VALUs pre exec change
1826 if (PreExecPos == std::numeric_limits<int>::max())
1827 return NoHazardFound;
1828
1829 // Too many VALUs in intv1?
1830 int Intv1VALUs = PreExecPos - State.ExecPos;
1831 if (Intv1VALUs > Intv1plus2MaxVALUs)
1832 return HazardExpired;
1833
1834 // Too many VALUs in intv1 + intv2
1835 if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
1836 return HazardExpired;
1837
1838 return HazardFound;
1839 };
1840 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1842 State.VALUs += 1;
1843 };
1844
1845 if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1846 std::next(MI->getReverseIterator())))
1847 return false;
1848
1849 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1850 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1852
1853 return true;
1854}
1855
1856bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
1857 if (!ST.hasVALUTransUseHazard())
1858 return false;
1859 assert(!ST.hasExtendedWaitCounts());
1860
1861 if (!SIInstrInfo::isVALU(*MI))
1862 return false;
1863
1864 SmallSet<Register, 4> SrcVGPRs;
1865
1866 for (const MachineOperand &Use : MI->explicit_uses()) {
1867 if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1868 SrcVGPRs.insert(Use.getReg());
1869 }
1870
1871 // Look for the following pattern:
1872 // Va <- TRANS VALU
1873 // intv
1874 // MI Va (WaitState = 0)
1875 //
1876 // Where:
1877 // intv <= 5 VALUs / 1 TRANS
1878 //
1879 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1880
1881 const int IntvMaxVALUs = 5;
1882 const int IntvMaxTRANS = 1;
1883
1884 struct StateType {
1885 int VALUs = 0;
1886 int TRANS = 0;
1887
1888 static unsigned getHashValue(const StateType &State) {
1889 return hash_combine(State.VALUs, State.TRANS);
1890 }
1891 static bool isEqual(const StateType &LHS, const StateType &RHS) {
1892 return LHS.VALUs == RHS.VALUs && LHS.TRANS == RHS.TRANS;
1893 }
1894 };
1895
1896 StateType State;
1897
1898 // This overloads expiry testing with all the hazard detection
1899 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1900 // Too many VALU states have passed
1901 if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
1902 return HazardExpired;
1903
1904 // Instructions which cause va_vdst==0 expire hazard
1907 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1908 AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
1909 return HazardExpired;
1910
1911 // Track registers writes
1912 if (SIInstrInfo::isTRANS(I)) {
1913 for (Register Src : SrcVGPRs) {
1914 if (I.modifiesRegister(Src, &TRI)) {
1915 return HazardFound;
1916 }
1917 }
1918 }
1919
1920 return NoHazardFound;
1921 };
1922 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1924 State.VALUs += 1;
1926 State.TRANS += 1;
1927 };
1928
1929 if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1930 std::next(MI->getReverseIterator())))
1931 return false;
1932
1933 // Hazard is observed - insert a wait on va_dst counter to ensure hazard is
1934 // avoided.
1935 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1936 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1938
1939 return true;
1940}
1941
1942bool GCNHazardRecognizer::fixVALUTransCoexecutionHazards(MachineInstr *MI) {
1943 if (!ST.hasGFX1250Insts() || // Coexecution disabled.
1945 return false;
1946
1947 const SIInstrInfo *TII = ST.getInstrInfo();
1948 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1949
1950 auto IsTransHazardFn = [MI, TII, TRI](const MachineInstr &I) {
1951 if (!SIInstrInfo::isTRANS(I))
1952 return false;
1953
1954 // RAW: Trans(I) writes, VALU(MI) reads.
1955 Register TransDef = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
1956 for (const MachineOperand &ValuUse : MI->explicit_uses()) {
1957 if (ValuUse.isReg() && TRI->regsOverlap(TransDef, ValuUse.getReg()))
1958 return true;
1959 }
1960
1961 auto *ValuDst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst);
1962 if (!ValuDst || !ValuDst->isReg())
1963 return false;
1964
1965 // WAR: Trans(I) reads, VALU(MI) writes.
1966 Register ValuDef = ValuDst->getReg();
1967 for (const MachineOperand &TransUse : I.explicit_uses()) {
1968 if (TransUse.isReg() && TRI->regsOverlap(ValuDef, TransUse.getReg()))
1969 return true;
1970 }
1971
1972 return false;
1973 };
1974
1975 auto IsExpiredFn = [](const MachineInstr &I, int) {
1976 return SIInstrInfo::isVALU(I);
1977 };
1978
1979 const int HasVALU = std::numeric_limits<int>::max();
1980 if (::getWaitStatesSince(IsTransHazardFn, MI, IsExpiredFn) == HasVALU)
1981 return false;
1982
1983 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
1984 return true;
1985}
1986
// Pre-GFX1250 WMMA read-after-write hazard: insert a V_NOP when the current
// WMMA/SWMMAC reads a register written by a still-in-flight previous WMMA.
// Returns true if a V_NOP was inserted before MI.
// NOTE(review): this listing is an extraction with dropped lines — embedded
// line 1988 (presumably the "is MI a WMMA/SWMMAC?" early-out guard) and 1995
// (presumably the matching check on the candidate instruction I inside
// IsHazardFn) are missing; confirm against the upstream file.
1987bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
1989    return false;
1990
1991  const SIInstrInfo *TII = ST.getInstrInfo();
1992  const SIRegisterInfo *TRI = ST.getRegisterInfo();
1993
// IsHazardFn: does the earlier instruction I write a register the current
// WMMA (MI) consumes?
1994  auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
1996      return false;
1997
1998    // Src0(matrix A) or Src1(matrix B) of the current wmma instruction overlaps
1999    // with the dest(matrix D) of the previous wmma.
2000    const Register CurSrc0Reg =
2001        TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
2002    const Register CurSrc1Reg =
2003        TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
2004
2005    const Register PrevDstReg =
2006        TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
2007
2008    if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
2009        TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
2010      return true;
2011    }
2012
2013    // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall)
2014    // but Index can't overlap with PrevDstReg.
2015    if (AMDGPU::isGFX12Plus(ST)) {
2016      if (SIInstrInfo::isSWMMAC(*MI)) {
2017        const Register CurIndex =
2018            TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
2019        if (TRI->regsOverlap(PrevDstReg, CurIndex))
2020          return true;
2021      }
2022      return false;
2023    }
2024
2025    return false;
2026  };
2027
// Any intervening VALU breaks the hazard window, so the backwards search can
// stop at the first VALU seen.
2028  auto IsExpiredFn = [](const MachineInstr &I, int) {
2029    return SIInstrInfo::isVALU(I);
2030  };
2031
// getWaitStatesSince returns INT_MAX when no hazard was found before expiry.
2032  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
2033      std::numeric_limits<int>::max())
2034    return false;
2035
2036  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
2037
2038  return true;
2039}
2040
// NOTE(review): extraction fragment — the signature and the first part of the
// return expression (embedded lines 2041-2042, presumably
// "static bool isCoexecutableVALUInst(const MachineInstr &MI) { return
// SIInstrInfo::isVALU(MI) && ...") were dropped from this listing; only the
// tail of the predicate is visible. Confirm against the upstream file.
2043         !SIInstrInfo::isWMMA(MI) && !SIInstrInfo::isSWMMAC(MI); // What else?
2044}
2045
// Classify an XDL WMMA instruction into one of the four hazard categories
// used by checkWMMACoexecutionHazards(), based on its pipeline latency and
// whether it is a dense WMMA or a sparse SWMMAC. Returns true when MI belongs
// to the queried Category.
// NOTE(review): the signature line (embedded line 2046, presumably
// "static bool IsWMMAHazardInstInCategory(const MachineInstr &MI,") was
// dropped from this listing — confirm against the upstream file.
2047                                        const SIInstrInfo *TII, unsigned Latency,
2048                                        unsigned Category) {
// Latency doubles as the category discriminator; the assert guards against a
// future latency change silently breaking this classification.
2049  assert(TII->isXDLWMMA(MI) && (Latency == 8 || Latency == 16) &&
2050         "Handle me if the xdl wmma instruction latency changes");
2051
2052  switch (Category) {
2053  case 0: // Dense WMMA Instructions:
2054          //   WMMA_*F16, WMMA_*BF16
2055          //   WMMA_*FP8FP8
2056          //   WMMA_*FP8BF8
2057          //   WMMA_*BF8FP8
2058          //   WMMA_*BF8BF8
2059          //   WMMA_*F8F6F4 if SRCA & SRCB != F8
2060    return Latency == 8 && SIInstrInfo::isWMMA(MI);
2061
2062  case 1: // Dense WMMA Instructions:
2063          //   WMMA_IU8
2064          //   WMMA_IU4
2065          //   WMMA_*F8F6F4 if SRCA OR SRCB == F8
2066    return Latency == 16 && SIInstrInfo::isWMMA(MI);
2067
2068  case 2: // Dense SWMMAC Instructions
2069          //   SWMMAC_*F16, SWMMAC_*BF16,
2070          //   SWMMAC_*FP8FP8
2071          //   SWMMAC_*BF8FP8
2072          //   SWMMAC_*FP8BF8
2073          //   SWMMAC_*BF8BF8
2074    return Latency == 8 && SIInstrInfo::isSWMMAC(MI);
2075
2076  case 3: // Sparse WMMA Instructions:
2077          //   SWMMAC_IU8
2078          //   SWMMAC_IU4
2079    return Latency == 16 && SIInstrInfo::isSWMMAC(MI);
2080  default:
2081    break;
2082  } // end switch.
2083
2084  return false;
2085}
2086
2087int GCNHazardRecognizer::checkWMMACoexecutionHazards(MachineInstr *MI) const {
2088 if (!ST.hasGFX1250Insts())
2089 return 0;
2090
2091 const SIInstrInfo *TII = ST.getInstrInfo();
2092 if (!TII->isXDLWMMA(*MI) && !isCoexecutableVALUInst(*MI))
2093 return 0;
2094
2095 // WaitStates here is the number of V_NOPs or unrelated VALU instructions must
2096 // be in between the first WMMA and the second instruction to cover the hazard
2097 // (WMMAWaitStates if the second is also a WMMA, VALUWaitStates if the second
2098 // is a VALU). Refer to SPG 4.6.12.1. "Requirements for WMMA data hazards" for
2099 // numbers, which depends on the category of the first WMMA.
2100 const int WMMAWaitStates[] = {5, 9, 3, 5};
2101 const int VALUWaitStates[] = {4, 8, 2, 4};
2102 unsigned Category = 0;
2103
2104 auto IsWMMAHazardFn = [MI, TII, &Category, this](const MachineInstr &I) {
2105 if (!TII->isXDLWMMA(I))
2106 return false;
2107
2108 unsigned Latency = TSchedModel.computeInstrLatency(&I);
2109 if (!IsWMMAHazardInstInCategory(I, TII, Latency, Category))
2110 return false;
2111
2112 return hasWMMAToWMMARegOverlap(I, *MI);
2113 };
2114
2115 auto IsVALUHazardFn = [MI, TII, &Category, this](const MachineInstr &I) {
2116 if (!TII->isXDLWMMA(I))
2117 return false;
2118
2119 unsigned Latency = TSchedModel.computeInstrLatency(&I);
2120 if (!IsWMMAHazardInstInCategory(I, TII, Latency, Category))
2121 return false;
2122
2123 return hasWMMAToVALURegOverlap(I, *MI);
2124 };
2125
2126 int Limit = 0;
2127
2128 auto GetWaitStatesFn = [](const MachineInstr &I) {
2129 return SIInstrInfo::isVALU(I) ? 1 : 0;
2130 };
2131
2132 int WaitStatesNeeded = -1;
2133 if (TII->isXDLWMMA(*MI)) {
2134 for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
2135 Limit = WMMAWaitStates[Category]; // for IsExpiredFn.
2136 // 'getWaitStatesSince' returns the number of VALUs in between if hazard
2137 // exists, and INT_MAX if there is no hazard. As a result, a negative
2138 // WaitStatesNeeded here means no hazard, and we will continue to search
2139 // for other categories.
2140 WaitStatesNeeded =
2141 Limit - getWaitStatesSince(IsWMMAHazardFn, Limit, GetWaitStatesFn);
2142 }
2143 } else { // Must be a co-executable VALU.
2144 for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
2145 Limit = VALUWaitStates[Category]; // for IsExpiredFn.
2146 // 'getWaitStatesSince' returns the number of VALUs in between if hazard
2147 // exists, and INT_MAX if there is no hazard. As a result, a negative
2148 // WaitStatesNeeded here means no hazard, and we will continue to search
2149 // for other categories.
2150 WaitStatesNeeded =
2151 Limit - getWaitStatesSince(IsVALUHazardFn, Limit, GetWaitStatesFn);
2152 }
2153 }
2154
2155 return WaitStatesNeeded;
2156}
2157
// Returns true when the destination (matrix D) of the earlier WMMA overlaps a
// source register of the later WMMA MI (A/B operands, plus the SWMMAC index).
// NOTE(review): embedded line 2168 was dropped from this listing — presumably
// the "if (SIInstrInfo::isSWMMAC(MI)) {" guard that opens the block closed at
// line 2172. Confirm against the upstream file.
2158bool GCNHazardRecognizer::hasWMMAToWMMARegOverlap(
2159    const MachineInstr &WMMA, const MachineInstr &MI) const {
2160  Register D0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::vdst)->getReg();
2161  Register A1 = TII.getNamedOperand(MI, AMDGPU::OpName::src0)->getReg();
2162  Register B1 = TII.getNamedOperand(MI, AMDGPU::OpName::src1)->getReg();
2163
2164  // WMMA0 writes (D0), WMMA1 reads (A1/B1/Idx1).
2165  if (TRI.regsOverlap(D0, A1) || TRI.regsOverlap(D0, B1))
2166    return true;
2167
2169    Register Idx1 = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
2170    if (TRI.regsOverlap(D0, Idx1))
2171      return true;
2172  }
2173  return false;
2174}
2175
2176bool GCNHazardRecognizer::hasWMMAToVALURegOverlap(
2177 const MachineInstr &WMMA, const MachineInstr &MI) const {
2178 // WMMA writes, VALU reads.
2179 Register D0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::vdst)->getReg();
2180 for (const MachineOperand &ValuUse : MI.explicit_uses()) {
2181 if (ValuUse.isReg() && TRI.regsOverlap(D0, ValuUse.getReg()))
2182 return true;
2183 }
2184
2185 // WMMA reads or writes, VALU writes.
2186 Register A0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::src0)->getReg();
2187 Register B0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::src1)->getReg();
2188 SmallVector<Register, 4> WMMARegs({D0, A0, B0});
2189
2190 if (SIInstrInfo::isSWMMAC(WMMA)) {
2191 Register Idx0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::src2)->getReg();
2192 WMMARegs.push_back(Idx0);
2193 }
2194
2195 for (const MachineOperand &ValuDef : MI.defs()) {
2196 Register VDstReg = ValuDef.getReg();
2197 for (Register WMMAReg : WMMARegs) {
2198 if (TRI.regsOverlap(VDstReg, WMMAReg))
2199 return true;
2200 }
2201 }
2202 return false;
2203}
2204
// Returns true when instruction I (a potential XDL WMMA) creates a
// co-execution hazard for MI, dispatching to the WMMA->WMMA or WMMA->VALU
// overlap check depending on what MI is.
// NOTE(review): embedded line 2215 was dropped from this listing — presumably
// a guard such as "if (SIInstrInfo::isVALU(MI))" before the VALU-overlap
// return. Confirm against the upstream file.
2205bool GCNHazardRecognizer::isCoexecutionHazardFor(const MachineInstr &I,
2206                                                 const MachineInstr &MI) const {
2207  // I is the potential WMMA hazard source, MI is the instruction being checked
2208  // for hazard.
2209  if (!TII.isXDLWMMA(I))
2210    return false;
2211
2212  // Dispatch based on MI type
2213  if (TII.isXDLWMMA(MI))
2214    return hasWMMAToWMMARegOverlap(I, MI);
2216    return hasWMMAToVALURegOverlap(I, MI);
2217
2218  return false;
2219}
2220
2221bool GCNHazardRecognizer::hasWMMAHazardInLoop(MachineLoop *L, MachineInstr *MI,
2222 bool IncludeSubloops) {
2223 // Scan loop for any WMMA that hazards MI.
2224 // TODO: Avoid full loop scan when WMMA is beyond VALU distance.
2225 for (MachineBasicBlock *MBB : L->getBlocks()) {
2226 if (!IncludeSubloops && MLI->getLoopFor(MBB) != L)
2227 continue;
2228 for (MachineInstr &I : *MBB) {
2229 if (&I == MI)
2230 continue;
2231 if (isCoexecutionHazardFor(I, *MI))
2232 return true;
2233 }
2234 }
2235 return false;
2236}
2237
2238bool GCNHazardRecognizer::tryHoistWMMAVnopsFromLoop(MachineInstr *MI,
2239 int WaitStatesNeeded) {
2240 if (!MLI)
2241 return false;
2242
2243 MachineLoop *L = MLI->getLoopFor(MI->getParent());
2244 if (!L) {
2245 ++NumWMMAHoistingBailed;
2246 return false;
2247 }
2248
2249 // If innermost loop has WMMA hazard, we can't hoist at all
2250 if (hasWMMAHazardInLoop(L, MI)) {
2251 ++NumWMMAHoistingBailed;
2252 return false;
2253 }
2254
2255 // Find outermost loop with no internal hazard
2256 MachineLoop *TargetLoop = L;
2257 while (MachineLoop *Parent = TargetLoop->getParentLoop()) {
2258 if (hasWMMAHazardInLoop(Parent, MI, false))
2259 break; // Parent has hazard in its own blocks, stop here
2260 TargetLoop = Parent; // Safe to hoist further out
2261 }
2262
2263 // Need valid preheader to insert V_NOPs
2264 MachineBasicBlock *Preheader = TargetLoop->getLoopPreheader();
2265 if (!Preheader) {
2266 ++NumWMMAHoistingBailed;
2267 return false;
2268 }
2269
2270 LLVM_DEBUG(dbgs() << "WMMA V_NOP Hoisting: Moving " << WaitStatesNeeded
2271 << " V_NOPs from loop to " << printMBBReference(*Preheader)
2272 << "\n");
2273
2274 emitVNops(*Preheader, Preheader->getFirstTerminator(), WaitStatesNeeded,
2275 /*IsHoisting=*/true);
2276 NumWMMANopsHoisted += WaitStatesNeeded;
2277 return true;
2278}
2279
2280bool GCNHazardRecognizer::fixWMMACoexecutionHazards(MachineInstr *MI) {
2281 int WaitStatesNeeded = checkWMMACoexecutionHazards(MI);
2282 if (WaitStatesNeeded <= 0)
2283 return false;
2284
2285 if (EnableWMMAVnopHoisting && tryHoistWMMAVnopsFromLoop(MI, WaitStatesNeeded))
2286 return true;
2287
2288 emitVNops(*MI->getParent(), MI->getIterator(), WaitStatesNeeded);
2289 return true;
2290}
2291
// Work around the 64-bit shift high-register hardware bug: when the shift
// amount of a 64-bit V_LSH*/V_ASHR lives in the last VGPR of an allocation
// block (reg index % 8 == 7) and the following VGPR is unused, the shift must
// not read the amount from that register directly. The fix either copies the
// amount into Dst.sub0 (when Dst != Src1), or temporarily V_SWAPs the amount
// (and, if Dst overlaps the amount, the whole 64-bit pair) into a free VGPR
// around the shift. Returns true when the instruction was rewritten.
// The exact ordering of the S_WAITCNT, the pre-swap(s), the shift, and the
// post-swap(s) is load-bearing — do not reorder.
2292bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
2293  if (!ST.hasShift64HighRegBug())
2294    return false;
2295  assert(!ST.hasExtendedWaitCounts());
2296
2297  switch (MI->getOpcode()) {
2298  default:
2299    return false;
2300  case AMDGPU::V_LSHLREV_B64_e64:
2301  case AMDGPU::V_LSHRREV_B64_e64:
2302  case AMDGPU::V_ASHRREV_I64_e64:
2303    break;
2304  }
2305
// For the *REV shift forms, src0 is the shift amount.
2306  MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);
2307  if (!Amt->isReg())
2308    return false;
2309
2310  Register AmtReg = Amt->getReg();
2311  const MachineRegisterInfo &MRI = MF.getRegInfo();
2312  // Check if this is a last VGPR in the allocation block.
2313  if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
2314    return false;
2315
// Only hazardous when the next VGPR is unused (VGPR255 has no successor).
2316  if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
2317    return false;
2318
2319  assert(ST.needsAlignedVGPRs());
2320  static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
2321
2322  const DebugLoc &DL = MI->getDebugLoc();
2323  MachineBasicBlock *MBB = MI->getParent();
2324  MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
2325
2326  // In:
2327  //
2328  //   Dst = shiftrev64 Amt, Src1
2329  //
2330  // if Dst!=Src1 then avoid the bug with:
2331  //
2332  //   Dst.sub0 = Amt
2333  //   Dst = shift64 Dst.sub0, Src1
2334
2335  Register DstReg = MI->getOperand(0).getReg();
2336  if (!Src1->isReg() || Src1->getReg() != DstReg) {
2337    Register DstLo = TRI.getSubReg(DstReg, AMDGPU::sub0);
// The inserted V_MOV itself needs hazard processing, hence runOnInstruction.
2338    runOnInstruction(
2339        BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), DstLo).add(*Amt));
2340    Amt->setReg(DstLo);
2341    Amt->setIsKill(true);
2342    return true;
2343  }
2344
// Dst == Src1: find a scratch register not touched by MI. If Dst overlaps the
// amount register, we need a free aligned 64-bit pair; otherwise a single
// VGPR suffices.
2345  bool Overlapped = MI->modifiesRegister(AmtReg, &TRI);
2346  Register NewReg;
2347  for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
2348                                   : AMDGPU::VGPR_32RegClass) {
2349    if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
2350      NewReg = Reg;
2351      break;
2352    }
2353  }
2354
// In the overlapped case the amount is swapped into the high half (sub1) so
// that NewReg's index does not fall on a block boundary itself.
2355  Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
2356                               : NewReg;
2357  Register NewAmtLo;
2358
2359  if (Overlapped)
2360    NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
2361
2362  // Insert a full wait count because found register might be pending a wait.
2363  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
2364      .addImm(0);
2365
2366  // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them.
2367  if (Overlapped)
2368    runOnInstruction(
2369        BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
2370            .addDef(AmtReg - 1)
2371            .addReg(AmtReg - 1, RegState::Undef)
2372            .addReg(NewAmtLo, RegState::Undef));
2373  runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
2374                       .addDef(AmtReg)
2375                       .addReg(AmtReg, RegState::Undef)
2376                       .addReg(NewAmt, RegState::Undef));
2377
2378  // Instructions emitted after the current instruction will be processed by the
2379  // parent loop of the hazard recognizer in a natural way.
2380  BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
2381          AmtReg)
2382      .addDef(NewAmt)
2383      .addReg(NewAmt)
2384      .addReg(AmtReg);
2385  if (Overlapped)
2386    BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
2387            AmtReg - 1)
2388        .addDef(NewAmtLo)
2389        .addReg(NewAmtLo)
2390        .addReg(AmtReg - 1);
2391
2392  // Re-running hazard recognizer on the modified instruction is not necessary,
2393  // inserted V_SWAP_B32 has already both read and write new registers so
2394  // hazards related to these register has already been handled.
2395  Amt->setReg(NewAmt);
2396  Amt->setIsKill(false);
2397  // We do not update liveness, so verifier may see it as undef.
2398  Amt->setIsUndef();
2399  if (Overlapped) {
2400    MI->getOperand(0).setReg(NewReg);
2401    Src1->setReg(NewReg);
2402    Src1->setIsKill(false);
2403    Src1->setIsUndef();
2404  }
2405
2406  return true;
2407}
2408
// GFX10 NSA->VMEM hazard: a VMEM instruction with certain offset bits set,
// issued right after a large (>= 16 byte) NSA-encoded MIMG instruction, needs
// one wait state. Returns the number of wait states still required (<= 0
// means none).
// NOTE(review): embedded line 2415 was dropped from this listing — presumably
// the early-out selecting which instructions carry this hazard (a
// VMEM/MUBUF-style check on *MI). Confirm against the upstream file.
2409int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) const {
2410  int NSAtoVMEMWaitStates = 1;
2411
2412  if (!ST.hasNSAtoVMEMBug())
2413    return 0;
2414
2416    return 0;
2417
2418  const SIInstrInfo *TII = ST.getInstrInfo();
// Only offsets with bits 1-2 set trigger the hazard.
2419  const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
2420  if (!Offset || (Offset->getImm() & 6) == 0)
2421    return 0;
2422
// Hazard source: a GFX10 NSA-encoded MIMG whose encoded size is >= 16 bytes.
2423  auto IsHazardFn = [TII](const MachineInstr &I) {
2424    if (!SIInstrInfo::isMIMG(I))
2425      return false;
2426    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
2427    return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
2428           TII->getInstSizeInBytes(I) >= 16;
2429  };
2430
2431  return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
2432}
2433
2434int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(
2435 MachineInstr *MI) const {
2436 int FPAtomicToDenormModeWaitStates = 3;
2437
2438 if (!ST.hasFPAtomicToDenormModeHazard())
2439 return 0;
2440 assert(!ST.hasExtendedWaitCounts());
2441
2442 if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
2443 return 0;
2444
2445 auto IsHazardFn = [](const MachineInstr &I) {
2446 if (!SIInstrInfo::isVMEM(I))
2447 return false;
2448 return SIInstrInfo::isFPAtomic(I);
2449 };
2450
2451 auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
2452 if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
2453 return true;
2454
2455 return SIInstrInfo::isWaitcnt(MI.getOpcode());
2456 };
2457
2458 return FPAtomicToDenormModeWaitStates -
2459 ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
2460}
2461
// Dispatch MAI (matrix) hazard checking to the subtarget-appropriate routine:
// gfx90a+ rules vs the original gfx908 rules.
// NOTE(review): embedded line 2463 was dropped from this listing — presumably
// an assertion that MI is an MAI instruction. Confirm against the upstream
// file.
2462int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) const {
2464
2465  return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
2466}
2467
2468int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) const {
2469 // Early exit if no padding is requested.
2470 if (MFMAPaddingRatio == 0)
2471 return 0;
2472
2473 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2474 if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2)
2475 return 0;
2476
2477 int NeighborMFMALatency = 0;
2478 auto IsNeighboringMFMA = [&NeighborMFMALatency,
2479 this](const MachineInstr &MI) {
2480 if (!SIInstrInfo::isMFMA(MI))
2481 return false;
2482
2483 NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
2484 return true;
2485 };
2486
2487 const int MaxMFMAPipelineWaitStates = 16;
2488 int WaitStatesSinceNeighborMFMA =
2489 getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);
2490
2491 int NeighborMFMAPaddingNeeded =
2492 (NeighborMFMALatency * MFMAPaddingRatio / 100) -
2493 WaitStatesSinceNeighborMFMA;
2494
2495 return std::max(0, NeighborMFMAPaddingNeeded);
2496}
2497
// gfx908 MAI hazard rules. Covers, in order: VALU writes to EXEC or to VGPR
// sources before an MFMA/v_accvgpr_write; MFMA results feeding AGPR operands
// of a following MFMA / accvgpr read / accvgpr write (distance depends on the
// producer's latency, i.e. 4x4 / 16x16 / 32x32 shape); v_accvgpr_write
// feeding MFMA srcs or accvgpr reads; and MFMA srcC read vs a following
// v_accvgpr_write. Returns the maximum number of wait states required, also
// folding in the optional MFMA padding.
2498int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) const {
2499  int WaitStatesNeeded = 0;
2500  unsigned Opc = MI->getOpcode();
2501
// Inline asm is conservatively treated as a VALU hazard source here.
2502  auto IsVALUFn = [](const MachineInstr &MI) {
2503    return SIInstrInfo::isVALU(MI) || MI.isInlineAsm();
2504  };
2505
2506  if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
2507    const int LegacyVALUWritesVGPRWaitStates = 2;
2508    const int VALUWritesExecWaitStates = 4;
2509    const int MaxWaitStates = 4;
2510
2511    int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2512      getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
2513    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2514
2515    if (WaitStatesNeeded < MaxWaitStates) {
2516      for (const MachineOperand &Use : MI->explicit_uses()) {
// Note: this inner MaxWaitStates intentionally shadows the outer one.
2517        const int MaxWaitStates = 2;
2518
2519        if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
2520          continue;
2521
2522        int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
2523          getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
2524        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2525
2526        if (WaitStatesNeeded == MaxWaitStates)
2527          break;
2528      }
2529    }
2530  }
2531
// AGPR operand hazards: for each AGPR the instruction reads (or the AGPR it
// writes, for v_accvgpr_write), find the nearest MFMA or v_accvgpr_write
// producer and require the shape/latency-dependent distance.
2532  for (const MachineOperand &Op : MI->explicit_operands()) {
2533    if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
2534      continue;
2535
2536    if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2537      continue;
2538
2539    const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
2540    const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
2541    const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
2542    const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
2543    const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
2544    const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
2545    const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
2546    const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
2547    const int MaxWaitStates = 18;
2548    Register Reg = Op.getReg();
2549    unsigned HazardDefLatency = 0;
2550
// Matches an MFMA whose destination overlaps (but is not identical to) Reg;
// records the largest producer latency seen as a side effect.
2551    auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
2552                               this](const MachineInstr &MI) {
2553      if (!SIInstrInfo::isMFMA(MI))
2554        return false;
2555      Register DstReg = MI.getOperand(0).getReg();
2556      if (DstReg == Reg)
2557        return false;
2558      HazardDefLatency =
2559          std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2560      return TRI.regsOverlap(DstReg, Reg);
2561    };
2562
2563    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
2564                                                   MaxWaitStates);
2565    int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
2566    int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2567    int OpNo = Op.getOperandNo();
2568    if (OpNo == SrcCIdx) {
2569      NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
2570    } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
// Producer latency encodes the MFMA shape: 2 => 4x4, 8 => 16x16, 16 => 32x32.
2571      switch (HazardDefLatency) {
2572      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
2573               break;
2574      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
2575               break;
2576      case 16: [[fallthrough]];
2577      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
2578               break;
2579      }
2580    } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2581      switch (HazardDefLatency) {
2582      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
2583               break;
2584      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
2585               break;
2586      case 16: [[fallthrough]];
2587      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
2588               break;
2589      }
2590    }
2591
2592    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2593    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2594
2595    if (WaitStatesNeeded == MaxWaitStates)
2596      return WaitStatesNeeded; // Early exit.
2597
// Second producer kind for the same operand: a v_accvgpr_write to Reg.
2598    auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
2599      if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2600        return false;
2601      Register DstReg = MI.getOperand(0).getReg();
2602      return TRI.regsOverlap(Reg, DstReg);
2603    };
2604
2605    const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
2606    const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
2607    const int AccVGPRWriteAccVgprReadWaitStates = 3;
2608    NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
2609    if (OpNo == SrcCIdx)
2610      NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
2611    else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
2612      NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
2613
2614    WaitStatesNeededForUse = NeedWaitStates -
2615      getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
2616    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2617
2618    if (WaitStatesNeeded == MaxWaitStates)
2619      return WaitStatesNeeded; // Early exit.
2620  }
2621
// v_accvgpr_write after an MFMA that reads the same register as srcC.
2622  if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2623    const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
2624    const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
2625    const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
2626    const int MaxWaitStates = 13;
2627    Register DstReg = MI->getOperand(0).getReg();
2628    unsigned HazardDefLatency = 0;
2629
2630    auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
2631                         this](const MachineInstr &MI) {
2632      if (!SIInstrInfo::isMFMA(MI))
2633        return false;
2634      Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
2635      HazardDefLatency =
2636          std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2637      return TRI.regsOverlap(Reg, DstReg);
2638    };
2639
2640    int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
2641    int NeedWaitStates;
2642    switch (HazardDefLatency) {
2643    case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
2644             break;
2645    case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
2646             break;
2647    case 16: [[fallthrough]];
2648    default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
2649             break;
2650    }
2651
2652    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
2653    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2654  }
2655
2656  // Pad neighboring MFMA with noops for better inter-wave performance.
2657  WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2658
2659  return WaitStatesNeeded;
2660}
2661
// Wait-state helper: XDL MFMA result consumed where the gfx950 variant needs
// one extra cycle at every pass count.
// NOTE(review): the function-name line (embedded line 2663) was dropped from
// this listing — confirm the identifier against the upstream file.
2662static int
2664                                             bool IsGFX950) {
2665  // xdl def cycles | gfx940 | gfx950
2666  // 2 pass |  3        4
2667  // 4 pass |  5        6
2668  // 8 pass |  9        10
2669  // 16 pass |  17        18
2670  return NumPasses + 1 + IsGFX950;
2671}
2672
// Wait-state helper: like the previous table but the gfx950 extra cycle does
// not apply to the 2-pass case.
// NOTE(review): the function-name line (embedded line 2674) was dropped from
// this listing — confirm the identifier against the upstream file.
2673static int
2675                                                 bool IsGFX950) {
2676  // xdl def cycles | gfx940 | gfx950
2677  // 2 pass |  3        3
2678  // 4 pass |  5        6
2679  // 8 pass |  9        10
2680  // 16 pass |  17        18
2681  return NumPasses + 1 + (NumPasses != 2 && IsGFX950);
2682}
2683
// Wait-state helper: distance equals the producer's pass count.
// NOTE(review): the function-name/parameter line (embedded line 2685) was
// dropped from this listing — confirm the identifier against the upstream
// file.
2684static int
2686  // 2 pass -> 2
2687  // 4 pass -> 4
2688  // 8 pass -> 8
2689  // 16 pass -> 16
2690  return NumPasses;
2691}
2692
// Wait-state helper: distance equals the producer's pass count plus two.
// NOTE(review): the function-name/parameter line (embedded line 2694) was
// dropped from this listing — confirm the identifier against the upstream
// file.
2693static int
2695  // 2 pass -> 4
2696  // 4 pass -> 6
2697  // 8 pass -> 10
2698  // 16 pass -> 18
2699  return NumPasses + 2;
2700}
2701
// Wait-state helper: pass count plus three, with the gfx950 extra cycle for
// all but the 2-pass case.
// NOTE(review): the signature line (embedded line 2702, presumably
// "static int NAME(int NumPasses,") was dropped from this listing — confirm
// against the upstream file.
2703                                                        bool IsGFX950) {
2704  // xdl def cycles | gfx942 | gfx950
2705  // 2 pass |  5        5
2706  // 4 pass |  7        8
2707  // 8 pass |  11        12
2708  // 16 pass |  19        20
2709  return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
2710}
2711
// gfx90a+ MAI hazard rules for an MFMA: VALU writes to EXEC, plus producer
// MFMA -> consumer MFMA distances that depend on whether either side is a
// DGEMM (F64), an XDL op, the producer's pass count, whether the consumer
// operand is srcC or srcA/B, and full-register vs partial overlap.
// NOTE(review): this listing is an extraction with dropped lines — the bodies
// of IsLegacyVALUFn (2717) and IsLegacyVALUNotDotFn (2721-2722), the callee
// names of three wait-state helper calls (2825, 2827, 2829 and 2879, 2881),
// and three isDGEMM(Opc1) conditions (2837, 2843, 2849) are missing. Confirm
// against the upstream file before relying on exact behavior.
2712int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) const {
2713  int WaitStatesNeeded = 0;
2714  unsigned Opc = MI->getOpcode();
2715
2716  auto IsLegacyVALUFn = [](const MachineInstr &MI) {
2718  };
2719
2720  auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
2723  };
2724
2725  if (!SIInstrInfo::isMFMA(*MI))
2726    return WaitStatesNeeded;
2727
2728  const int VALUWritesExecWaitStates = 4;
2729  int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2730    getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
2731                          VALUWritesExecWaitStates);
2732  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2733
2734  int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2735
2736  // Loop for both DGEMM and S/HGEMM 2nd instruction.
2737  for (const MachineOperand &Use : MI->explicit_uses()) {
2738    const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
2739    const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
2740    const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
2741    const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
2742    const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
2743    const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
2744    const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
2745    const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
2746    const int GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 17;
2747    const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
2748    const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
2749    const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
2750    const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
2751    const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
2752    const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
2753    const int GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 19;
2754    const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
2755    const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
2756    const int MaxWaitStates = 19;
2757
2758    if (!Use.isReg())
2759      continue;
2760    Register Reg = Use.getReg();
2761    bool FullReg;
2762    const MachineInstr *MI1;
2763
// Matches an MFMA producer whose dst overlaps Reg; records the producer (MI1)
// and whether the overlap is an exact full-register match.
2764    auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
2765                               this](const MachineInstr &MI) {
2766      if (!SIInstrInfo::isMFMA(MI))
2767        return false;
2768      Register DstReg = MI.getOperand(0).getReg();
2769      FullReg = (DstReg == Reg);
2770      MI1 = &MI;
2771      return TRI.regsOverlap(DstReg, Reg);
2772    };
2773
2774    WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
2775      getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
2776    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2777
2778    int NumWaitStates =
2779        getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
2780    if (NumWaitStates == std::numeric_limits<int>::max())
2781      continue;
2782
2783    int OpNo = Use.getOperandNo();
2784    unsigned Opc1 = MI1->getOpcode();
2785    int NeedWaitStates = 0;
2786    if (OpNo == SrcCIdx) {
2787      if (!SIInstrInfo::isDGEMM(Opc) &&
2788          (!ST.hasGFX940Insts() && SIInstrInfo::isDGEMM(Opc1))) {
2789        NeedWaitStates = 0;
2790      } else if (FullReg) {
2791        if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2792             Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
2793            (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2794             Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
2795          NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
2796        else if (ST.hasGFX940Insts() &&
2797                 TSchedModel.computeInstrLatency(MI1) == 2)
2798          NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
2799      } else {
2800        switch (Opc1) {
2801        case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2802        case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2803        case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2804        case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2805          if (!TII.isXDL(*MI))
2806            NeedWaitStates =
2807                ST.hasGFX950Insts()
2808                    ? GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates
2809                    : DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
2810          break;
2811        case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2812        case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2813          if (!TII.isXDL(*MI))
2814            NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
2815          break;
2816        default:
2817          int NumPasses = TSchedModel.computeInstrLatency(MI1);
2818          if (ST.hasGFX940Insts()) {
2819            if (TII.isXDL(*MI) && !TII.isXDL(*MI1))
2820              break;
2821
// NOTE(review): the helper names on the dropped lines 2825/2827/2829 select
// between the XDL-writes-XDL, XDL-writes-non-XDL, and non-XDL wait-state
// helpers defined above.
2822            NeedWaitStates =
2823                TII.isXDL(*MI1)
2824                    ? (TII.isXDL(*MI)
2826                               NumPasses, ST.hasGFX950Insts())
2828                               NumPasses, ST.hasGFX950Insts()))
2830                          NumPasses);
2831            break;
2832          }
2833
2834          switch (NumPasses) {
2835          case 2:
2836            NeedWaitStates =
2838                    ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
2839                    : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
2840            break;
2841          case 8:
2842            NeedWaitStates =
2844                    ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
2845                    : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
2846            break;
2847          case 16:
2848            NeedWaitStates =
2850                    ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
2851                    : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
2852            break;
2853          default:
2854            llvm_unreachable("unexpected number of passes");
2855          }
2856        }
2857      }
2858    } else {
2859      switch (Opc1) {
2860      case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2861      case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2862      case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2863      case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2864        NeedWaitStates =
2865            ST.hasGFX950Insts()
2866                ? GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates
2867                : DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
2868        break;
2869      case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2870      case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2871        NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
2872        break;
2873      default:
2874        int NumPasses = TSchedModel.computeInstrLatency(MI1);
2875
2876        if (ST.hasGFX940Insts()) {
2877          NeedWaitStates =
2878              TII.isXDL(*MI1)
2880                         NumPasses, ST.hasGFX950Insts())
2882                         NumPasses);
2883          break;
2884        }
2885
2886        switch (NumPasses) {
2887        case 2:
2888          NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
2889          break;
2890        case 4:
2891          llvm_unreachable("unexpected number of passes for mfma");
2892        case 8:
2893          NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
2894          break;
2895        case 16:
2896        default:
2897          NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
2898        }
2899      }
2900    }
2901    if (WaitStatesNeeded >= NeedWaitStates)
2902      continue;
2903
2904    WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
2905    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2906
2907    if (WaitStatesNeeded == MaxWaitStates)
2908      break;
2909  }
2910
2911  // Pad neighboring MFMA with noops for better inter-wave performance.
2912  WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2913
2914  return WaitStatesNeeded;
2915}
2916
// gfx908-only load/store hazards: a VGPR use recently defined by
// v_accvgpr_read needs wait states, and one more case where that
// accvgpr read/write was itself dependent on a recent VALU def.
// NOTE(review): the body line of the inner IsVALUFn lambda (embedded line
// 2950, presumably "return SIInstrInfo::isVALU(MI);") was dropped from this
// listing — confirm against the upstream file.
2917int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) const {
2918  // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards()
2919  if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
2920    return 0;
2921
2922  int WaitStatesNeeded = 0;
2923
2924  auto IsAccVgprReadFn = [](const MachineInstr &MI) {
2925    return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
2926  };
2927
2928  for (const MachineOperand &Op : MI->explicit_uses()) {
2929    if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
2930      continue;
2931
2932    Register Reg = Op.getReg();
2933
2934    const int AccVgprReadLdStWaitStates = 2;
2935    const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
2936    const int MaxWaitStates = 2;
2937
2938    int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
2939      getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
2940    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2941
2942    if (WaitStatesNeeded == MaxWaitStates)
2943      return WaitStatesNeeded; // Early exit.
2944
// Nested query: matches an accvgpr read/write that itself had a VALU def of
// Reg within 2 wait states.
2945    auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
2946      if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
2947          MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2948        return false;
2949      auto IsVALUFn = [](const MachineInstr &MI) {
2951      };
2952      return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
2953             std::numeric_limits<int>::max();
2954    };
2955
2956    WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
2957      getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
2958    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2959  }
2960
2961  return WaitStatesNeeded;
2962}
2963
2964int GCNHazardRecognizer::checkPermlaneHazards(MachineInstr *MI) const {
2965 assert(!ST.hasVcmpxPermlaneHazard() &&
2966 "this is a different vcmpx+permlane hazard");
2967 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2968 const SIInstrInfo *TII = ST.getInstrInfo();
2969
2970 auto IsVCmpXWritesExecFn = [TII, TRI](const MachineInstr &MI) {
2971 return isVCmpXWritesExec(*TII, *TRI, MI);
2972 };
2973
2974 auto IsVALUFn = [](const MachineInstr &MI) {
2975 return SIInstrInfo::isVALU(MI);
2976 };
2977
2978 const int VCmpXWritesExecWaitStates = 4;
2979 const int VALUWritesVDstWaitStates = 2;
2980 int WaitStatesNeeded = 0;
2981
2982 for (const MachineOperand &Op : MI->explicit_uses()) {
2983 if (!Op.isReg() || !TRI->isVGPR(MF.getRegInfo(), Op.getReg()))
2984 continue;
2985 Register Reg = Op.getReg();
2986
2987 int WaitStatesSinceDef =
2988 VALUWritesVDstWaitStates -
2989 getWaitStatesSinceDef(Reg, IsVALUFn,
2990 /*MaxWaitStates=*/VALUWritesVDstWaitStates);
2991 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesSinceDef);
2992 if (WaitStatesNeeded >= VALUWritesVDstWaitStates)
2993 break;
2994 }
2995
2996 int VCmpXHazardWaits =
2997 VCmpXWritesExecWaitStates -
2998 getWaitStatesSince(IsVCmpXWritesExecFn, VCmpXWritesExecWaitStates);
2999
3000 WaitStatesNeeded = std::max(WaitStatesNeeded, VCmpXHazardWaits);
3001 return WaitStatesNeeded;
3002}
3003
// GFX940+: wait states required between an SMFMA VGPR write and a read of
// that VGPR (used for the VALU/VMEM/export/LDS read-after-MFMA-write hazard),
// keyed by the MFMA's pass count. Signature restored per the function index;
// the returned value follows the table below.
static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
  // 2 pass -> 4
  // 4 pass -> 6
  // 8 pass -> 10
  // 16 pass -> 18
  return NumPasses + 2;
}
3011
// GFX940+: wait states required between an XDL MFMA VGPR write and a
// VALU/VMEM/export/LDS read of that VGPR, keyed by pass count. GFX950 needs
// one extra wait state for everything except the 2-pass variant, per the
// table below. First signature line restored per the function index.
static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses,
                                                              bool IsGFX950) {
  // xdl def cycles | gfx942 | gfx950
  // 2 pass | 5 5
  // 4 pass | 7 8
  // 8 pass | 11 12
  // 16 pass | 19 20
  return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
}
3021
// GFX940+: wait states required between an XDL MFMA VGPR write and a VALU
// write to the same VGPR (write-after-write), keyed by pass count. GFX950
// needs one extra wait state for everything except the 2-pass variant.
// First signature line restored per the function index.
static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses,
                                                       bool IsGFX950) {
  // xdl def cycles | gfx942 | gfx950
  // 2 pass | 5 5
  // 4 pass | 7 8
  // 8 pass | 11 12
  // 16 pass | 19 20
  return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
}
3031
// GFX940+: wait states required between an SMFMA VGPR write and a VALU write
// to the same VGPR (write-after-write), keyed by the MFMA's pass count.
// Signature restored per the function index.
static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
  // 2 pass -> 4
  // 4 pass -> 6
  // 8 pass -> 10
  // 16 pass -> 18
  return NumPasses + 2;
}
3039
3040int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) const {
3041 if (!ST.hasGFX90AInsts())
3042 return 0;
3043
3044 auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
3045 return SIInstrInfo::isDGEMM(MI.getOpcode());
3046 };
3047
3048 // This is checked in checkMAIHazards90A()
3049 if (SIInstrInfo::isMFMA(*MI))
3050 return 0;
3051
3052 const MachineRegisterInfo &MRI = MF.getRegInfo();
3053
3054 int WaitStatesNeeded = 0;
3055
3056 bool IsMem = SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isDS(*MI);
3057 bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI);
3058 bool IsVALU = SIInstrInfo::isVALU(*MI);
3059
3060 const MachineInstr *MFMA = nullptr;
3061 unsigned Reg;
3062 auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
3063 if (!SIInstrInfo::isMFMA(MI) ||
3064 !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
3065 return false;
3066 MFMA = &MI;
3067 return true;
3068 };
3069
3070 const MachineInstr *DOT = nullptr;
3071 auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
3072 if (!SIInstrInfo::isDOT(MI) ||
3073 !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
3074 return false;
3075 DOT = &MI;
3076 return true;
3077 };
3078
3079 bool DGEMMAfterVALUWrite = false;
3080 auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
3081 // Found DGEMM on reverse traversal to def.
3082 if (SIInstrInfo::isDGEMM(MI.getOpcode()))
3083 DGEMMAfterVALUWrite = true;
3084
3085 // Only hazard if register is defined by a VALU and a DGEMM is found after
3086 // after the def.
3087 if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
3088 return false;
3089
3090 return true;
3091 };
3092
3093 int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
3094 AMDGPU::OpName::src2);
3095
3096 if (IsMemOrExport || IsVALU) {
3097 const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
3098 const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
3099 const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
3100 const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
3101 const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
3102 const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
3103 const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
3104 const int GFX950_DMFMA16x16WriteVgprVALUReadWaitStates = 19;
3105 const int DotWriteSameDotReadSrcAB = 3;
3106 const int DotWriteDifferentVALURead = 3;
3107 const int DMFMABetweenVALUWriteVMEMRead = 2;
3108 const int MaxWaitStates = 19;
3109
3110 for (const MachineOperand &Use : MI->explicit_uses()) {
3111 if (!Use.isReg())
3112 continue;
3113 Reg = Use.getReg();
3114
3115 DOT = nullptr;
3116 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
3117 MaxWaitStates);
3118 if (DOT) {
3119 int NeedWaitStates = 0;
3120 if (DOT->getOpcode() == MI->getOpcode()) {
3121 if (&Use - &MI->getOperand(0) != SrcCIdx)
3122 NeedWaitStates = DotWriteSameDotReadSrcAB;
3123 } else {
3124 NeedWaitStates = DotWriteDifferentVALURead;
3125 }
3126
3127 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3128 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3129 }
3130
3131 // Workaround for HW data hazard bug observed only in GFX90A. When there
3132 // is a DGEMM instruction in-between a VALU and a VMEM instruction it
3133 // causes the SQ to incorrectly not insert two wait states between the two
3134 // instructions needed to avoid data hazard.
3135 if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
3136 DGEMMAfterVALUWrite = false;
3137 if (TRI.isVectorRegister(MRI, Reg)) {
3138 int WaitStatesNeededForUse =
3139 DMFMABetweenVALUWriteVMEMRead -
3140 getWaitStatesSinceDef(Reg, IsDGEMMHazard,
3141 DMFMABetweenVALUWriteVMEMRead);
3142
3143 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3144 }
3145 }
3146
3147 MFMA = nullptr;
3148 WaitStatesSinceDef =
3149 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
3150 if (!MFMA)
3151 continue;
3152
3153 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
3154 int NumPasses = HazardDefLatency;
3155 int NeedWaitStates = MaxWaitStates;
3156
3157 if (SIInstrInfo::isDGEMM(MFMA->getOpcode())) {
3158 switch (HazardDefLatency) {
3159 case 4:
3160 NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
3161 : DMFMA4x4WriteVgprVALUReadWaitStates;
3162 break;
3163 case 8:
3164 case 16:
3165 NeedWaitStates =
3166 IsMemOrExport
3167 ? DMFMA16x16WriteVgprMemExpReadWaitStates
3168 : (ST.hasGFX950Insts()
3169 ? GFX950_DMFMA16x16WriteVgprVALUReadWaitStates
3170 : DMFMA16x16WriteVgprVALUReadWaitStates);
3171 break;
3172 default:
3173 llvm_unreachable("unexpected dgemm");
3174 }
3175 } else if (ST.hasGFX940Insts()) {
3176 NeedWaitStates =
3177 TII.isXDL(*MFMA)
3179 NumPasses, ST.hasGFX950Insts())
3181 NumPasses);
3182 } else {
3183 switch (HazardDefLatency) {
3184 case 2:
3185 NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
3186 break;
3187 case 8:
3188 NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
3189 break;
3190 case 16:
3191 NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
3192 break;
3193 default:
3194 llvm_unreachable("unexpected number of passes for mfma");
3195 }
3196 }
3197
3198 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3199 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3200
3201 if (WaitStatesNeeded == MaxWaitStates)
3202 break;
3203 }
3204 }
3205
3206 unsigned Opc = MI->getOpcode();
3207 const int DMFMAToFMA64WaitStates = 2;
3208 if ((Opc == AMDGPU::V_FMA_F64_e64 ||
3209 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
3210 Opc == AMDGPU::V_FMAC_F64_dpp) &&
3211 WaitStatesNeeded < DMFMAToFMA64WaitStates) {
3212 int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
3213 getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
3214 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3215 }
3216
3217 if (!IsVALU && !IsMemOrExport)
3218 return WaitStatesNeeded;
3219
3220 for (const MachineOperand &Def : MI->defs()) {
3221 const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
3222 const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
3223 const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
3224 const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
3225 const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
3226 const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
3227 const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
3228 const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
3229 const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
3230 const int DotWriteDifferentVALUWrite = 3;
3231 const int MaxWaitStates = 19;
3232 const int MaxWarWaitStates = 15;
3233
3234 Reg = Def.getReg();
3235
3236 DOT = nullptr;
3237 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
3238 MaxWaitStates);
3239 if (DOT && DOT->getOpcode() != MI->getOpcode())
3240 WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
3241 WaitStatesSinceDef);
3242
3243 MFMA = nullptr;
3244 WaitStatesSinceDef =
3245 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
3246 if (MFMA) {
3247 int NeedWaitStates = MaxWaitStates;
3248 int NumPasses = TSchedModel.computeInstrLatency(MFMA);
3249
3250 if (SIInstrInfo::isDGEMM(MFMA->getOpcode())) {
3251 switch (NumPasses) {
3252 case 4:
3253 NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
3254 break;
3255 case 8:
3256 case 16:
3257 NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
3258 break;
3259 default:
3260 llvm_unreachable("unexpected number of cycles for dgemm");
3261 }
3262 } else if (ST.hasGFX940Insts()) {
3263 NeedWaitStates =
3264 TII.isXDL(*MFMA)
3266 NumPasses, ST.hasGFX950Insts())
3268 } else {
3269 switch (NumPasses) {
3270 case 2:
3271 NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
3272 break;
3273 case 8:
3274 NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
3275 break;
3276 case 16:
3277 NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
3278 break;
3279 default:
3280 llvm_unreachable("Unexpected number of passes for mfma");
3281 }
3282 }
3283
3284 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3285 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3286
3287 if (WaitStatesNeeded == MaxWaitStates)
3288 break;
3289 }
3290
3291 auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
3292 if (!SIInstrInfo::isMFMA(MI) || SIInstrInfo::isDGEMM(MI.getOpcode()) ||
3293 !MI.readsRegister(Reg, &TRI))
3294 return false;
3295
3296 if (ST.hasGFX940Insts() && !TII.isXDL(MI))
3297 return false;
3298
3299 const MachineOperand *SrcC =
3300 TII.getNamedOperand(MI, AMDGPU::OpName::src2);
3301 assert(SrcC);
3302 if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
3303 return false;
3304
3305 MFMA = &MI;
3306 return true;
3307 };
3308
3309 MFMA = nullptr;
3310 int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
3311 MaxWarWaitStates);
3312 if (!MFMA)
3313 continue;
3314
3315 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
3316 int NeedWaitStates = MaxWaitStates;
3317 switch (HazardDefLatency) {
3318 case 2: NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
3319 break;
3320 case 4: assert(ST.hasGFX940Insts());
3321 NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
3322 break;
3323 case 8: NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
3324 break;
3325 case 16: [[fallthrough]];
3326 default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
3327 break;
3328 }
3329
3330 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
3331 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3332 }
3333
3334 return WaitStatesNeeded;
3335}
3336
3338 if (!SU->isInstr())
3339 return false;
3340
3341 const MachineInstr *MAI = nullptr;
3342
3343 auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
3344 MAI = nullptr;
3346 MAI = &MI;
3347 return MAI != nullptr;
3348 };
3349
3350 MachineInstr *MI = SU->getInstr();
3351 if (IsMFMAFn(*MI)) {
3352 int W = getWaitStatesSince(IsMFMAFn, 16);
3353 if (MAI)
3354 return W < (int)TSchedModel.computeInstrLatency(MAI);
3355 }
3356
3357 return false;
3358}
3359
3360// Adjust global offsets for instructions bundled with S_GETPC_B64 after
3361// insertion of a new instruction.
3362static void updateGetPCBundle(MachineInstr *NewMI) {
3363 if (!NewMI->isBundled())
3364 return;
3365
3366 // Find start of bundle.
3367 auto I = NewMI->getIterator();
3368 while (I->isBundledWithPred())
3369 I--;
3370 if (I->isBundle())
3371 I++;
3372
3373 // Bail if this is not an S_GETPC bundle.
3374 if (I->getOpcode() != AMDGPU::S_GETPC_B64)
3375 return;
3376
3377 // Update offsets of any references in the bundle.
3378 const unsigned NewBytes = 4;
3379 assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
3380 "Unexpected instruction insertion in bundle");
3381 auto NextMI = std::next(NewMI->getIterator());
3382 auto End = NewMI->getParent()->end();
3383 while (NextMI != End && NextMI->isBundledWithPred()) {
3384 for (auto &Operand : NextMI->operands()) {
3385 if (Operand.isGlobal())
3386 Operand.setOffset(Operand.getOffset() + NewBytes);
3387 }
3388 NextMI++;
3389 }
3390}
3391
// Mitigate the wave64 VALU mask-write hazard: after an SGPR used as a VALU
// mask is rewritten, insert an s_waitcnt_depctr so later SGPR reads see the
// new value. Also merges any compatible earlier waits into the new one.
// Returns true if an instruction was inserted.
bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
  if (!ST.hasVALUMaskWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  // Hazard only exists for wave64 (SGPR pair used as a 64-bit lane mask).
  if (!ST.isWave64())
    return false;

  const bool IsSALU = SIInstrInfo::isSALU(*MI);
  const bool IsVALU = SIInstrInfo::isVALU(*MI);
  if (!IsSALU && !IsVALU)
    return false;

  // The hazard sequence is three instructions:
  //   1. VALU reads SGPR as mask
  //   2. VALU/SALU writes SGPR
  //   3. VALU/SALU reads SGPR
  // The hazard can expire if the distance between 2 and 3 is sufficient,
  // or (2) is VALU and (3) is SALU.
  // In practice this happens <10% of the time, hence always assume the hazard
  // exists if (1) and (2) are present to avoid searching all SGPR reads.

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  // SGPRs that can never carry the hazard.
  auto IgnoreableSGPR = [](const Register Reg) {
    switch (Reg) {
    case AMDGPU::EXEC:
    case AMDGPU::EXEC_LO:
    case AMDGPU::EXEC_HI:
    case AMDGPU::M0:
    case AMDGPU::SGPR_NULL:
    case AMDGPU::SGPR_NULL64:
    case AMDGPU::SCC:
      return true;
    default:
      return false;
    }
  };
  auto IsVCC = [](const Register Reg) {
    return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI;
  };

  // Search state: the set of SGPRs whose pending write is still hazardous.
  struct StateType {
    SmallSet<Register, 2> HazardSGPRs;

    static unsigned getHashValue(const StateType &State) {
      return hash_combine_range(State.HazardSGPRs);
    }
    static bool isEqual(const StateType &LHS, const StateType &RHS) {
      return LHS.HazardSGPRs == RHS.HazardSGPRs;
    }
  };

  SmallVector<const MachineInstr *> WaitInstrs;
  bool HasSGPRRead = false;
  StateType InitialState;

  // Look for SGPR write.
  MachineOperand *HazardDef = nullptr;
  for (MachineOperand &Op : MI->operands()) {
    if (!Op.isReg())
      continue;
    if (Op.isDef() && HazardDef)
      continue;

    Register Reg = Op.getReg();
    if (IgnoreableSGPR(Reg))
      continue;
    if (!IsVCC(Reg)) {
      // Non-VCC: only explicit SGPR operands matter.
      if (Op.isImplicit())
        continue;
      if (!TRI->isSGPRReg(MRI, Reg))
        continue;
    }
    // Also check for SGPR reads.
    if (Op.isUse()) {
      HasSGPRRead = true;
      continue;
    }

    assert(!HazardDef);
    HazardDef = &Op;
  }

  if (!HazardDef)
    return false;

  // Setup to track writes to individual SGPRs
  const Register HazardReg = HazardDef->getReg();
  if (AMDGPU::SReg_32RegClass.contains(HazardReg)) {
    InitialState.HazardSGPRs.insert(HazardReg);
  } else {
    // 64-bit SGPR def: track both 32-bit halves individually.
    assert(AMDGPU::SReg_64RegClass.contains(HazardReg));
    InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub0));
    InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub1));
  }

  // Scanning backwards: returns HazardFound when an earlier VALU read the
  // hazard register as a mask, HazardExpired once no tracked SGPRs remain.
  auto IsHazardFn = [&](StateType &State, const MachineInstr &I) {
    if (State.HazardSGPRs.empty())
      return HazardExpired;

    switch (I.getOpcode()) {
    case AMDGPU::V_ADDC_U32_e32:
    case AMDGPU::V_ADDC_U32_dpp:
    case AMDGPU::V_CNDMASK_B16_t16_e32:
    case AMDGPU::V_CNDMASK_B16_fake16_e32:
    case AMDGPU::V_CNDMASK_B16_t16_dpp:
    case AMDGPU::V_CNDMASK_B16_fake16_dpp:
    case AMDGPU::V_CNDMASK_B32_e32:
    case AMDGPU::V_CNDMASK_B32_dpp:
    case AMDGPU::V_DIV_FMAS_F32_e64:
    case AMDGPU::V_DIV_FMAS_F64_e64:
    case AMDGPU::V_SUBB_U32_e32:
    case AMDGPU::V_SUBB_U32_dpp:
    case AMDGPU::V_SUBBREV_U32_e32:
    case AMDGPU::V_SUBBREV_U32_dpp: {
      // These implicitly read VCC as mask source.
      return IsVCC(HazardReg) ? HazardFound : NoHazardFound;
    }
    case AMDGPU::V_ADDC_U32_e64:
    case AMDGPU::V_ADDC_U32_e64_dpp:
    case AMDGPU::V_CNDMASK_B16_t16_e64:
    case AMDGPU::V_CNDMASK_B16_fake16_e64:
    case AMDGPU::V_CNDMASK_B16_t16_e64_dpp:
    case AMDGPU::V_CNDMASK_B16_fake16_e64_dpp:
    case AMDGPU::V_CNDMASK_B32_e64:
    case AMDGPU::V_CNDMASK_B32_e64_dpp:
    case AMDGPU::V_SUBB_U32_e64:
    case AMDGPU::V_SUBB_U32_e64_dpp:
    case AMDGPU::V_SUBBREV_U32_e64:
    case AMDGPU::V_SUBBREV_U32_e64_dpp: {
      // Only check mask register overlaps.
      const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2);
      assert(SSRCOp);
      bool Result = TRI->regsOverlap(SSRCOp->getReg(), HazardReg);
      return Result ? HazardFound : NoHazardFound;
    }
    default:
      return NoHazardFound;
    }
  };

  // NOTE(review): the inner encodeField* call (original line 3536) was lost
  // in extraction — confirm the nesting against upstream source.
  const unsigned ConstantMaskBits = AMDGPU::DepCtr::encodeFieldSaSdst(
      0),
      0);
  auto UpdateStateFn = [&](StateType &State, const MachineInstr &I) {
    switch (I.getOpcode()) {
    case AMDGPU::S_WAITCNT_DEPCTR:
      // Record mergable waits within region of instructions free of SGPR reads.
      if (!HasSGPRRead && I.getParent() == MI->getParent() && !I.isBundled() &&
          (I.getOperand(0).getImm() & ConstantMaskBits) == ConstantMaskBits)
        WaitInstrs.push_back(&I);
      break;
    default:
      // Update tracking of SGPR reads and writes.
      for (auto &Op : I.operands()) {
        if (!Op.isReg())
          continue;

        Register Reg = Op.getReg();
        if (IgnoreableSGPR(Reg))
          continue;
        if (!IsVCC(Reg)) {
          if (Op.isImplicit())
            continue;
          if (!TRI->isSGPRReg(MRI, Reg))
            continue;
        }
        if (Op.isUse()) {
          HasSGPRRead = true;
          continue;
        }

        // Stop tracking any SGPRs with writes on the basis that they will
        // already have an appropriate wait inserted afterwards.
        // NOTE(review): the declaration of `Found` (original line 3569,
        // presumably a small vector of Registers) was lost in extraction.
        for (Register SGPR : State.HazardSGPRs) {
          if (Reg == SGPR || TRI->regsOverlap(Reg, SGPR))
            Found.push_back(SGPR);
        }
        for (Register SGPR : Found)
          State.HazardSGPRs.erase(SGPR);
      }
      break;
    }
  };

  // Check for hazard
  if (!hasHazard<StateType>(InitialState, IsHazardFn, UpdateStateFn,
                            MI->getParent(),
                            std::next(MI->getReverseIterator())))
    return false;

  // Compute counter mask
  unsigned DepCtr =
      IsVALU ? (IsVCC(HazardReg) ? AMDGPU::DepCtr::encodeFieldVaVcc(0, ST)
                                 : AMDGPU::DepCtr::encodeFieldVaSdst(0, ST))
             : AMDGPU::DepCtr::encodeFieldSaSdst(0, ST);

  // Try to merge previous waits into this one for regions with no SGPR reads.
  if (!WaitInstrs.empty()) {
    // Note: WaitInstrs contains const pointers, so walk backward from MI to
    // obtain a mutable pointer to each instruction to be merged.
    // This is expected to be a very short walk within the same block.
    SmallVector<MachineInstr *> ToErase;
    unsigned Found = 0;
    for (MachineBasicBlock::reverse_iterator It = MI->getReverseIterator(),
                                             End = MI->getParent()->rend();
         Found < WaitInstrs.size() && It != End; ++It) {
      MachineInstr *WaitMI = &*It;
      // Find next wait instruction.
      if (std::as_const(WaitMI) != WaitInstrs[Found])
        continue;
      Found++;
      // Fold the old wait's field values into DepCtr (minimum of each field).
      unsigned WaitMask = WaitMI->getOperand(0).getImm();
      assert((WaitMask & ConstantMaskBits) == ConstantMaskBits);
      DepCtr = AMDGPU::DepCtr::encodeFieldSaSdst(
          DepCtr, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(WaitMask),
                           AMDGPU::DepCtr::decodeFieldSaSdst(DepCtr)));
      DepCtr = AMDGPU::DepCtr::encodeFieldVaSdst(
          DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaSdst(WaitMask),
                           AMDGPU::DepCtr::decodeFieldVaSdst(DepCtr)));
      DepCtr = AMDGPU::DepCtr::encodeFieldVaVcc(
          DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaVcc(WaitMask),
                           AMDGPU::DepCtr::decodeFieldVaVcc(DepCtr)));
      ToErase.push_back(WaitMI);
    }
    assert(Found == WaitInstrs.size());
    for (MachineInstr *WaitMI : ToErase)
      WaitMI->eraseFromParent();
  }

  // Add s_waitcnt_depctr after SGPR write.
  auto NextMI = std::next(MI->getIterator());
  auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
                       TII.get(AMDGPU::S_WAITCNT_DEPCTR))
                   .addImm(DepCtr);

  // SALU write may be s_getpc in a bundle.
  updateGetPCBundle(NewMI);

  return true;
}
3637
3638static bool ensureEntrySetPrio(MachineFunction *MF, int Priority,
3639 const SIInstrInfo &TII) {
3640 MachineBasicBlock &EntryMBB = MF->front();
3641 if (EntryMBB.begin() != EntryMBB.end()) {
3642 auto &EntryMI = *EntryMBB.begin();
3643 if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
3644 EntryMI.getOperand(0).getImm() >= Priority)
3645 return false;
3646 }
3647
3648 BuildMI(EntryMBB, EntryMBB.begin(), DebugLoc(), TII.get(AMDGPU::S_SETPRIO))
3649 .addImm(Priority);
3650 return true;
3651}
3652
// Workaround requiring exports to run at lowered priority: around the last
// export of a sequence, drop priority, wait for exports, pad with S_NOPs, and
// restore priority; also adjusts S_SETPRIO instructions and the entry-block
// priority. Returns true if any instruction was changed or inserted.
bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
  if (!ST.hasRequiredExportPriority())
    return false;

  // Assume the following shader types will never have exports,
  // and avoid adding or adjusting S_SETPRIO.
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MBB->getParent();
  auto CC = MF->getFunction().getCallingConv();
  switch (CC) {
  // NOTE(review): the case labels listing the exempt calling conventions
  // (original lines 3663-3666) were lost in extraction — confirm against
  // upstream source.
    return false;
  default:
    break;
  }

  const int MaxPriority = 3;
  const int NormalPriority = 2;
  const int PostExportPriority = 0;

  auto It = MI->getIterator();
  switch (MI->getOpcode()) {
  case AMDGPU::S_ENDPGM:
  case AMDGPU::S_ENDPGM_SAVED:
  case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
  case AMDGPU::SI_RETURN_TO_EPILOG:
    // Ensure shader with calls raises priority at entry.
    // This ensures correct priority if exports exist in callee.
    if (MF->getFrameInfo().hasCalls())
      return ensureEntrySetPrio(MF, NormalPriority, TII);
    return false;
  case AMDGPU::S_SETPRIO: {
    // Raise minimum priority unless in workaround.
    auto &PrioOp = MI->getOperand(0);
    int Prio = PrioOp.getImm();
    // In-workaround pattern: priority 0 immediately following an export.
    bool InWA = (Prio == PostExportPriority) &&
                (It != MBB->begin() && TII.isEXP(*std::prev(It)));
    if (InWA || Prio >= NormalPriority)
      return false;
    PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));
    return true;
  }
  default:
    if (!TII.isEXP(*MI))
      return false;
    break;
  }

  // Check entry priority at each export (as there will only be a few).
  // Note: amdgpu_gfx can only be a callee, so defer to caller setprio.
  bool Changed = false;
  // NOTE(review): a guard condition (original line 3707) before this call was
  // lost in extraction — confirm against upstream source.
  Changed = ensureEntrySetPrio(MF, NormalPriority, TII);

  auto NextMI = std::next(It);
  bool EndOfShader = false;
  if (NextMI != MBB->end()) {
    // Only need WA at end of sequence of exports.
    if (TII.isEXP(*NextMI))
      return Changed;
    // Assume appropriate S_SETPRIO after export means WA already applied.
    if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
        NextMI->getOperand(0).getImm() == PostExportPriority)
      return Changed;
    EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
  }

  const DebugLoc &DL = MI->getDebugLoc();

  // Lower priority.
  BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
      .addImm(PostExportPriority);

  if (!EndOfShader) {
    // Wait for exports to complete.
    BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT))
        .addReg(AMDGPU::SGPR_NULL)
        .addImm(0);
  }

  BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
  BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);

  if (!EndOfShader) {
    // Return to normal (higher) priority.
    BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
        .addImm(NormalPriority);
  }

  return true;
}
3747
// Insert an s_waitcnt_depctr with immediate 0 (all dependency counters idle)
// before an s_getreg that reads one of the hazard-prone hardware registers.
// Returns true if the wait was inserted.
bool GCNHazardRecognizer::fixGetRegWaitIdle(MachineInstr *MI) {
  if (!isSGetReg(MI->getOpcode()))
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  switch (getHWReg(TII, *MI)) {
  default:
    return false;
  // NOTE(review): the case labels naming which hardware registers require the
  // idle wait (original lines 3756-3759) were lost in extraction — confirm
  // against upstream source.
    break;
  }

  // Immediate 0 requests a wait until every depctr field reaches zero.
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(0);
  return true;
}
3768
// Bracket DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 with s_waitcnt_depctr
// instructions: one inserted immediately before it and one immediately after.
// Returns true when the instruction matched and the waits were inserted.
bool GCNHazardRecognizer::fixDsAtomicAsyncBarrierArriveB64(MachineInstr *MI) {
  if (MI->getOpcode() != AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  // Wait inserted before the barrier-arrive.
  // NOTE(review): the .addImm(...) operand line (original 3776) was lost in
  // extraction — confirm the encoded depctr mask against upstream source.
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
  // Wait inserted after the barrier-arrive.
  // NOTE(review): the .addImm(...) operand line (original 3779) was lost in
  // extraction — confirm against upstream source.
  BuildMI(*MI->getParent(), std::next(MI->getIterator()), MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))

  return true;
}
3783
// Mitigate the scratch-base forwarding hazard: a read of the flat scratch
// base (SRC_FLAT_SCRATCH_BASE_LO/HI, aliased to SGPR102/103) too soon after a
// SALU/VALU write of the aliased SGPR requires an s_waitcnt_depctr. Returns
// true if the wait was inserted.
bool GCNHazardRecognizer::fixScratchBaseForwardingHazard(MachineInstr *MI) {
  // No reason to check this in pre-RA scheduling, SGPRs have to be allocated
  // for hazard to trigger.
  if (!IsHazardRecognizerMode)
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  // Hazard expires after 10 SGPR writes by SALU or 8 SGPR writes by VALU.
  const int FlatScrBaseWaitStates = 10;

  bool ReadsFlatScrLo =
      MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, TRI);
  bool ReadsFlatScrHi =
      MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, TRI);
  if (isSGetReg(MI->getOpcode())) {
    switch (getHWReg(TII, *MI)) {
    default:
      break;
    // NOTE(review): the case labels selecting the LO/HI hardware-register ids
    // (original lines 3803 and 3806) were lost in extraction — confirm
    // against upstream source.
      ReadsFlatScrLo = true;
      break;
      ReadsFlatScrHi = true;
      break;
    }
  }

  const MachineRegisterInfo &MRI = MF.getRegInfo();

  // True when Reg was modified within the hazard window before MI.
  auto IsRegDefHazard = [&](Register Reg) -> bool {
    DenseSet<const MachineBasicBlock *> Visited;
    auto IsHazardFn = [TRI, Reg](const MachineInstr &MI) {
      return MI.modifiesRegister(Reg, TRI);
    };

    // This literally abuses the idea of waitstates. Instead of waitstates it
    // returns 1 for SGPR written and 0 otherwise.
    auto IsSGPRDef = [TII, TRI, &MRI](const MachineInstr &MI) -> unsigned {
      if (!TII->isSALU(MI) && !TII->isVALU(MI))
        return 0;
      for (const MachineOperand &MO : MI.all_defs()) {
        if (TRI->isSGPRReg(MRI, MO.getReg()))
          return 1;
      }
      return 0;
    };

    auto IsExpiredFn = [=](const MachineInstr &MI, int SgprWrites) {
      if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) {
        unsigned Wait = MI.getOperand(0).getImm();
        // NOTE(review): the condition decoding `Wait` before this return
        // (original lines 3835-3836) was lost in extraction — confirm
        // against upstream source.
          return true;
      }
      return SgprWrites >= FlatScrBaseWaitStates;
    };

    return ::getWaitStatesSince(
        IsHazardFn, MI->getParent(), std::next(MI->getReverseIterator()),
        0, IsExpiredFn, Visited, IsSGPRDef) < FlatScrBaseWaitStates;
  };

  // SGPR102/103 alias the flat scratch base; constant physregs cannot have
  // been rewritten, so no hazard in that case.
  if ((!ReadsFlatScrLo || MRI.isConstantPhysReg(AMDGPU::SGPR102) ||
       !IsRegDefHazard(AMDGPU::SGPR102)) &&
      (!ReadsFlatScrHi || MRI.isConstantPhysReg(AMDGPU::SGPR103) ||
       !IsRegDefHazard(AMDGPU::SGPR103)))
    return false;

  // NOTE(review): the .addImm(...) operand lines for this wait (original
  // lines 3855-3856) were lost in extraction — confirm against upstream.
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
  return true;
}
3859
3860bool GCNHazardRecognizer::fixSetRegMode(MachineInstr *MI) {
3861 if (!isSSetReg(MI->getOpcode()) ||
3862 MI->getOperand(1).getImm() != AMDGPU::Hwreg::ID_MODE)
3863 return false;
3864
3865 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
3866 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
3867 return true;
3868}
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
AMDGPU Rewrite AGPR Copy MFMA
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static cl::opt< unsigned, false, MFMAPaddingRatioParser > MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden, cl::desc("Fill a percentage of the latency between " "neighboring MFMA with s_nops."))
static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF, const GCNSubtarget &ST)
static cl::opt< bool > EnableWMMAVnopHoisting("amdgpu-wmma-vnop-hoisting", cl::init(true), cl::Hidden, cl::desc("Hoist WMMA hazard V_NOPs from loops to preheaders"))
static bool consumesDstSelForwardingOperand(const MachineInstr *VALU, const MachineOperand *Dst, const SIRegisterInfo *TRI)
Checks whether the provided MI "consumes" the operand with a Dest sel fowarding issue Dst .
static bool isSGetReg(unsigned Opcode)
static bool breaksSMEMSoftClause(MachineInstr *MI)
static bool isLdsDma(const MachineInstr &MI)
static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses, bool IsGFX950)
static bool isRFE(unsigned Opcode)
static bool isRWLane(unsigned Opcode)
static bool isSMovRel(unsigned Opcode)
static const MachineOperand * getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST)
Dest sel forwarding issue occurs if additional logic is needed to swizzle / pack the computed value i...
static int GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(int NumPasses, bool IsGFX950)
static void updateGetPCBundle(MachineInstr *NewMI)
static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses, bool IsGFX950)
static bool isStoreCountWaitZero(const MachineInstr &I)
static bool breaksVMEMSoftClause(MachineInstr *MI)
static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI, const MachineInstr &MI)
static bool isSSetReg(unsigned Opcode)
static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV, MCRegister Reg)
static bool IsWMMAHazardInstInCategory(const MachineInstr &MI, const SIInstrInfo *TII, unsigned Latency, unsigned Category)
static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr)
static bool isDivFMas(unsigned Opcode)
static bool hasHazard(StateT InitialState, function_ref< HazardFnResult(StateT &, const MachineInstr &)> IsHazard, function_ref< void(StateT &, const MachineInstr &)> UpdateState, const MachineBasicBlock *InitialMBB, MachineBasicBlock::const_reverse_instr_iterator InitialI)
static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB, MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates, GCNHazardRecognizer::IsExpiredFn IsExpired, DenseSet< const MachineBasicBlock * > &Visited, GCNHazardRecognizer::GetNumWaitStatesFn GetNumWaitStates=SIInstrInfo::getNumWaitStates)
static int GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses)
static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses, bool IsGFX950)
static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses)
static int GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses)
static bool isCoexecutableVALUInst(const MachineInstr &MI)
static bool ensureEntrySetPrio(MachineFunction *MF, int Priority, const SIInstrInfo &TII)
static void addRegsToSet(const SIRegisterInfo &TRI, iterator_range< MachineInstr::const_mop_iterator > Ops, BitVector &DefSet, BitVector &UseSet)
static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII, unsigned Quantity)
static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII, const MachineInstr &MI)
static cl::opt< unsigned > NopPadding("amdgpu-snop-padding", cl::init(0), cl::Hidden, cl::desc("Insert a s_nop x before every instruction"))
static bool isPermlane(const MachineInstr &MI)
static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses)
static int GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(int NumPasses, bool IsGFX950)
AMD GCN specific subclass of TargetSubtarget.
static Register UseReg(const MachineOperand &MO)
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static llvm::Error parse(DataExtractor &Data, uint64_t BaseAddr, LineEntryCallback const &Callback)
Definition LineTable.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:487
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition blake3_impl.h:83
unsigned get(InstCounterType T) const
BitVector & set()
Definition BitVector.h:370
A debug info location.
Definition DebugLoc.h:123
std::pair< iterator, bool > insert_as(std::pair< KeyT, ValueT > &&KV, const LookupKeyT &Val)
Alternate version of insert() which allows a different, and possibly less expensive,...
Definition DenseMap.h:274
Implements a dense probed hash-table based set.
Definition DenseSet.h:279
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:272
unsigned PreEmitNoopsCommon(MachineInstr *) const
void EmitNoop() override
EmitNoop - This callback is invoked when a noop was added to the instruction stream.
void Reset() override
Reset - This callback is invoked when a new block of instructions is about to be scheduled.
unsigned PreEmitNoops(MachineInstr *) override
This overload will be used when the hazard recognizer is being used by a non-scheduling pass,...
void EmitInstruction(SUnit *SU) override
EmitInstruction - This callback is invoked when an instruction is emitted, to advance the hazard stat...
function_ref< bool(const MachineInstr &)> IsHazardFn
void AdvanceCycle() override
AdvanceCycle - This callback is invoked whenever the next top-down instruction to be scheduled cannot...
function_ref< unsigned int(const MachineInstr &)> GetNumWaitStatesFn
bool ShouldPreferAnother(SUnit *SU) const override
ShouldPreferAnother - This callback may be invoked if getHazardType returns NoHazard.
function_ref< bool(const MachineInstr &, int WaitStates)> IsExpiredFn
GCNHazardRecognizer(const MachineFunction &MF, MachineLoopInfo *MLI=nullptr)
HazardType getHazardType(SUnit *SU, int Stalls) override
getHazardType - Return the hazard type of emitting this node.
void RecedeCycle() override
RecedeCycle - This callback is invoked whenever the next bottom-up instruction to be scheduled cannot...
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
LoopT * getParentLoop() const
Return the parent loop if it exists or nullptr for top level loops.
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
Instructions::const_reverse_iterator const_reverse_instr_iterator
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
Instructions::iterator instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineBasicBlock & front() const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineBasicBlock * getParent() const
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
bool isBundled() const
Return true if this instruction part of a bundle.
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
void setIsKill(bool Val=true)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool isConstantPhysReg(MCRegister PhysReg) const
Returns true if PhysReg is unallocatable and constant throughout the function.
LLVM_ABI bool isPhysRegUsed(MCRegister PhysReg, bool SkipRegMaskTest=false) const
Return true if the specified register is modified or read in this function.
static bool isDS(const MachineInstr &MI)
static bool isVMEM(const MachineInstr &MI)
static bool isSMRD(const MachineInstr &MI)
static bool isMTBUF(const MachineInstr &MI)
static bool isDGEMM(unsigned Opcode)
static bool isEXP(const MachineInstr &MI)
static bool isSALU(const MachineInstr &MI)
static bool isSDWA(const MachineInstr &MI)
static bool isDOT(const MachineInstr &MI)
static bool isSWMMAC(const MachineInstr &MI)
static bool isLDSDIR(const MachineInstr &MI)
static bool isTRANS(const MachineInstr &MI)
static bool isMUBUF(const MachineInstr &MI)
static bool isWaitcnt(unsigned Opcode)
static bool isDPP(const MachineInstr &MI)
static bool isMFMA(const MachineInstr &MI)
static bool isMAI(const MCInstrDesc &Desc)
static bool isFPAtomic(const MachineInstr &MI)
static bool isMIMG(const MachineInstr &MI)
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
static bool isWMMA(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isVALU(const MachineInstr &MI)
static bool isLDSDMA(const MachineInstr &MI)
Scheduling unit. This is a node in the scheduling DAG.
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
unsigned MaxLookAhead
MaxLookAhead - Indicate the number of cycles in the scoreboard state.
virtual void EmitNoops(unsigned Quantity)
EmitNoops - This callback is invoked when noops were added to the instruction stream.
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
A SetVector that performs no allocations if smaller than a certain size.
Definition SetVector.h:339
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:184
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
bool getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
Definition StringRef.h:490
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
An efficient, type-erasing, non-owning reference to a callable.
self_iterator getIterator()
Definition ilist_node.h:123
A range adaptor for a pair of iterators.
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned encodeFieldVaVcc(unsigned Encoded, unsigned VaVcc)
unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst)
unsigned decodeFieldSaSdst(unsigned Encoded)
unsigned decodeFieldVaSdst(unsigned Encoded)
unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc)
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
unsigned decodeFieldVaVdst(unsigned Encoded)
unsigned decodeFieldVmVsrc(unsigned Encoded)
unsigned encodeFieldVaSdst(unsigned Encoded, unsigned VaSdst)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt)
Decodes Vmcnt, Expcnt and Lgkmcnt from given Waitcnt for given isa Version, and writes decoded values...
FPType getFPDstSelType(unsigned Opc)
bool isGFX12Plus(const MCSubtargetInfo &STI)
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
@ Entry
Definition COFF.h:862
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
This namespace contains all of the command line option processing machinery.
Definition CommandLine.h:52
initializer< Ty > init(const Ty &Val)
constexpr double e
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:532
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ Define
Register definition.
@ Wait
Definition Threading.h:60
constexpr RegState getDeadRegState(bool B)
Op::Description Desc
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
DWARFExpression::Operation Op
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
LLVM_ABI Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
An information struct used to provide DenseMap with the various necessary components for a given valu...