LLVM 23.0.0git
GCNHazardRecognizer.cpp
Go to the documentation of this file.
1//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements hazard recognizers for scheduling on GCN processors.
10//
11//===----------------------------------------------------------------------===//
12
13#include "GCNHazardRecognizer.h"
14#include "AMDGPUWaitcntUtils.h"
15#include "GCNSubtarget.h"
18#include "llvm/ADT/Statistic.h"
23#include "llvm/Support/Debug.h"
25
26using namespace llvm;
27
28#define DEBUG_TYPE "gcn-hazard-recognizer"
29
30STATISTIC(NumWMMANopsHoisted,
31 "Number of WMMA hazard V_NOPs hoisted from loops");
32STATISTIC(NumWMMAHoistingBailed,
33 "Number of WMMA hazards where V_NOP hoisting was not possible");
34
35namespace {
36
37struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
38 MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}
39
40 bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
41 if (Arg.getAsInteger(0, Value))
42 return O.error("'" + Arg + "' value invalid for uint argument!");
43
44 if (Value > 100)
45 return O.error("'" + Arg + "' value must be in the range [0, 100]!");
46
47 return false;
48 }
49};
50
51} // end anonymous namespace
52
54 MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
55 cl::desc("Fill a percentage of the latency between "
56 "neighboring MFMA with s_nops."));
57
58// This is intended for debugging purposes only.
60 NopPadding("amdgpu-snop-padding", cl::init(0), cl::Hidden,
61 cl::desc("Insert a s_nop x before every instruction"));
62
64 "amdgpu-wmma-vnop-hoisting", cl::init(true), cl::Hidden,
65 cl::desc("Hoist WMMA hazard V_NOPs from loops to preheaders"));
66
67//===----------------------------------------------------------------------===//
68// Hazard Recognizer Implementation
69//===----------------------------------------------------------------------===//
70
72 const GCNSubtarget &ST);
73
75 MachineLoopInfo *MLI)
76 : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF),
77 ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
78 TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()), MLI(MLI),
79 ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
80 MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
81 RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
82}
83
85 EmittedInstrs.clear();
86}
87
91
93 CurrCycleInstr = MI;
94}
95
96static bool isDivFMas(unsigned Opcode) {
97 return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
98}
99
100static bool isSGetReg(unsigned Opcode) {
101 return Opcode == AMDGPU::S_GETREG_B32 || Opcode == AMDGPU::S_GETREG_B32_const;
102}
103
104static bool isSSetReg(unsigned Opcode) {
105 switch (Opcode) {
106 case AMDGPU::S_SETREG_B32:
107 case AMDGPU::S_SETREG_B32_mode:
108 case AMDGPU::S_SETREG_IMM32_B32:
109 case AMDGPU::S_SETREG_IMM32_B32_mode:
110 return true;
111 }
112 return false;
113}
114
115static bool isRWLane(unsigned Opcode) {
116 return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
117}
118
119static bool isRFE(unsigned Opcode) {
120 return Opcode == AMDGPU::S_RFE_B64;
121}
122
123static bool isSMovRel(unsigned Opcode) {
124 switch (Opcode) {
125 case AMDGPU::S_MOVRELS_B32:
126 case AMDGPU::S_MOVRELS_B64:
127 case AMDGPU::S_MOVRELD_B32:
128 case AMDGPU::S_MOVRELD_B64:
129 return true;
130 default:
131 return false;
132 }
133}
134
136 const MachineInstr &MI) {
137 if (TII.isAlwaysGDS(MI.getOpcode()))
138 return true;
139
140 switch (MI.getOpcode()) {
141 case AMDGPU::S_SENDMSG:
142 case AMDGPU::S_SENDMSGHALT:
143 case AMDGPU::S_TTRACEDATA:
144 return true;
145 // These DS opcodes don't support GDS.
146 case AMDGPU::DS_NOP:
147 case AMDGPU::DS_PERMUTE_B32:
148 case AMDGPU::DS_BPERMUTE_B32:
149 return false;
150 default:
151 if (TII.isDS(MI.getOpcode())) {
152 int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
153 AMDGPU::OpName::gds);
154 if (MI.getOperand(GDS).getImm())
155 return true;
156 }
157 return false;
158 }
159}
160
161static bool isPermlane(const MachineInstr &MI) {
162 unsigned Opcode = MI.getOpcode();
163 return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
164 Opcode == AMDGPU::V_PERMLANE64_B32 ||
165 Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
166 Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
167 Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64 ||
168 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 ||
169 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 ||
170 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 ||
171 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64 ||
172 Opcode == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
173 Opcode == AMDGPU::V_PERMLANE_UP_B32_e64 ||
174 Opcode == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
175 Opcode == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
176 Opcode == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64;
177}
178
179static bool isLdsDma(const MachineInstr &MI) {
180 return SIInstrInfo::isVALU(MI) &&
182}
183
184static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
185 const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
186 AMDGPU::OpName::simm16);
187 return std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm()));
188}
189
192 MachineInstr *MI = SU->getInstr();
193 // If we are not in "HazardRecognizerMode" and therefore not being run from
194 // the scheduler, track possible stalls from hazards but don't insert noops.
195 auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;
196
197 if (MI->isBundle())
198 return NoHazard;
199
200 if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
201 return HazardType;
202
203 if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
204 return HazardType;
205
206 if (checkFPAtomicToDenormModeHazard(MI) > 0)
207 return HazardType;
208
209 // Hazards which cannot be mitigated with S_NOPs.
210 if (!IsHazardRecognizerMode) {
211 if (checkWMMACoexecutionHazards(MI) > 0)
212 return Hazard;
213 }
214
215 if (ST.hasNoDataDepHazard())
216 return NoHazard;
217
218 if (SIInstrInfo::isVMEM(*MI) && checkVMEMHazards(MI) > 0)
219 return HazardType;
220
221 if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
222 return HazardType;
223
224 if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
225 return HazardType;
226
227 if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
228 return HazardType;
229
230 if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
231 return HazardType;
232
235 checkMAIVALUHazards(MI) > 0)
236 return HazardType;
237
238 if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
239 return HazardType;
240
241 if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
242 return HazardType;
243
244 if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
245 return HazardType;
246
247 if (((ST.hasReadM0MovRelInterpHazard() &&
248 (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
249 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
250 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
251 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
252 (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
253 (ST.hasReadM0LdsDirectHazard() &&
254 MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) &&
255 checkReadM0Hazards(MI) > 0)
256 return HazardType;
257
258 if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
259 return HazardType;
260
262 checkMAILdStHazards(MI) > 0)
263 return HazardType;
264
265 if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
266 return HazardType;
267
268 return NoHazard;
269}
270
272 unsigned Quantity) {
273 while (Quantity > 0) {
274 unsigned Arg = std::min(Quantity, 8u);
275 Quantity -= Arg;
276 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
277 .addImm(Arg - 1);
278 }
279}
280
281unsigned
282GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
283 const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
284 assert(TSchedModel.getWriteProcResBegin(SC) !=
285 TSchedModel.getWriteProcResEnd(SC));
286 return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
287}
288
289void GCNHazardRecognizer::processBundle() {
290 MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
291 MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
292 // Check bundled MachineInstr's for hazards.
293 for (; MI != E && MI->isInsideBundle(); ++MI) {
294 CurrCycleInstr = &*MI;
295 unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
296
297 if (IsHazardRecognizerMode) {
298 fixHazards(CurrCycleInstr);
299
300 insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
301 }
302
303 // It’s unnecessary to track more than MaxLookAhead instructions. Since we
304 // include the bundled MI directly after, only add a maximum of
305 // (MaxLookAhead - 1) noops to EmittedInstrs.
306 for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
307 EmittedInstrs.push_front(nullptr);
308
309 EmittedInstrs.push_front(CurrCycleInstr);
310 EmittedInstrs.resize(MaxLookAhead);
311 }
312 CurrCycleInstr = nullptr;
313}
314
315void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
316 assert(IsHazardRecognizerMode);
317
318 unsigned NumPreNoops = PreEmitNoops(MI);
319 EmitNoops(NumPreNoops);
320 if (MI->isInsideBundle())
321 insertNoopsInBundle(MI, TII, NumPreNoops);
322 else
323 TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI),
324 NumPreNoops);
326 AdvanceCycle();
327}
328
330 IsHazardRecognizerMode = true;
331 CurrCycleInstr = MI;
332 unsigned W = PreEmitNoopsCommon(MI);
333 fixHazards(MI);
334 CurrCycleInstr = nullptr;
335 return std::max(W, NopPadding.getValue());
336}
337
341
343 if (MI->isBundle())
344 return 0;
345
346 int WaitStates = 0;
347
349 return std::max(WaitStates, checkSMRDHazards(MI));
350
351 if (ST.hasNSAtoVMEMBug())
352 WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
353
354 WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
355
356 if (ST.hasNoDataDepHazard())
357 return WaitStates;
358
360 WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
361
363 WaitStates = std::max(WaitStates, checkVALUHazards(MI));
364
366 WaitStates = std::max(WaitStates, checkDPPHazards(MI));
367
368 if (isDivFMas(MI->getOpcode()))
369 WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
370
371 if (isRWLane(MI->getOpcode()))
372 WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
373
376 checkMAIVALUHazards(MI) > 0)
377 WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));
378
379 if (MI->isInlineAsm())
380 return std::max(WaitStates, checkInlineAsmHazards(MI));
381
382 if (isSGetReg(MI->getOpcode()))
383 return std::max(WaitStates, checkGetRegHazards(MI));
384
385 if (isSSetReg(MI->getOpcode()))
386 return std::max(WaitStates, checkSetRegHazards(MI));
387
388 if (isRFE(MI->getOpcode()))
389 return std::max(WaitStates, checkRFEHazards(MI));
390
391 if ((ST.hasReadM0MovRelInterpHazard() &&
392 (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
393 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
394 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
395 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
396 (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
397 (ST.hasReadM0LdsDirectHazard() &&
398 MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr)))
399 return std::max(WaitStates, checkReadM0Hazards(MI));
400
402 return std::max(WaitStates, checkMAIHazards(MI));
403
405 return std::max(WaitStates, checkMAILdStHazards(MI));
406
407 if (ST.hasGFX950Insts() && isPermlane(*MI))
408 return std::max(WaitStates, checkPermlaneHazards(MI));
409
410 return WaitStates;
411}
412
414 EmittedInstrs.push_front(nullptr);
415}
416
418 // When the scheduler detects a stall, it will call AdvanceCycle() without
419 // emitting any instructions.
420 if (!CurrCycleInstr) {
421 EmittedInstrs.push_front(nullptr);
422 return;
423 }
424
425 if (CurrCycleInstr->isBundle()) {
426 processBundle();
427 return;
428 }
429
430 unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
431 if (!NumWaitStates) {
432 CurrCycleInstr = nullptr;
433 return;
434 }
435
436 // Keep track of emitted instructions
437 EmittedInstrs.push_front(CurrCycleInstr);
438
439 // Add a nullptr for each additional wait state after the first. Make sure
440 // not to add more than getMaxLookAhead() items to the list, since we
441 // truncate the list to that size right after this loop.
442 for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
443 i < e; ++i) {
444 EmittedInstrs.push_front(nullptr);
445 }
446
447 // getMaxLookahead() is the largest number of wait states we will ever need
448 // to insert, so there is no point in keeping track of more than that many
449 // wait states.
450 EmittedInstrs.resize(getMaxLookAhead());
451
452 CurrCycleInstr = nullptr;
453}
454
456 assert(!IsHazardRecognizerMode &&
457 "Bottom-up scheduling shouldn't run in hazard recognizer mode");
458}
459
460//===----------------------------------------------------------------------===//
461// Helper Functions
462//===----------------------------------------------------------------------===//
463
465
466// Search for a hazard in a block and its predecessors.
467template <typename StateT>
468static bool
469hasHazard(StateT InitialState,
470 function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
471 function_ref<void(StateT &, const MachineInstr &)> UpdateState,
472 const MachineBasicBlock *InitialMBB,
474 struct StateMapKey {
476 unsigned Idx;
477 static bool isEqual(const StateMapKey &LHS, const StateMapKey &RHS) {
478 return LHS.States == RHS.States && LHS.Idx == RHS.Idx;
479 }
480 };
481 struct StateMapKeyTraits : DenseMapInfo<StateMapKey> {
482 static inline StateMapKey getEmptyKey() {
483 return {static_cast<SmallVectorImpl<StateT> *>(
486 }
487 static unsigned getHashValue(const StateMapKey &Key) {
488 return StateT::getHashValue((*Key.States)[Key.Idx]);
489 }
490 static unsigned getHashValue(const StateT &State) {
491 return StateT::getHashValue(State);
492 }
493 static bool isEqual(const StateMapKey &LHS, const StateMapKey &RHS) {
494 const auto EKey = getEmptyKey();
495 if (StateMapKey::isEqual(LHS, EKey) || StateMapKey::isEqual(RHS, EKey))
496 return StateMapKey::isEqual(LHS, RHS);
497 return StateT::isEqual((*LHS.States)[LHS.Idx], (*RHS.States)[RHS.Idx]);
498 }
499 static bool isEqual(const StateT &LHS, const StateMapKey &RHS) {
500 if (StateMapKey::isEqual(RHS, getEmptyKey()))
501 return false;
502 return StateT::isEqual(LHS, (*RHS.States)[RHS.Idx]);
503 }
504 };
505
508
510 const MachineBasicBlock *MBB = InitialMBB;
511 StateT State = InitialState;
512
514 unsigned WorkIdx = 0;
515 for (;;) {
516 bool Expired = false;
517 for (auto E = MBB->instr_rend(); I != E; ++I) {
518 // No need to look at parent BUNDLE instructions.
519 if (I->isBundle())
520 continue;
521
522 auto Result = IsHazard(State, *I);
523 if (Result == HazardFound)
524 return true;
525 if (Result == HazardExpired) {
526 Expired = true;
527 break;
528 }
529
530 if (I->isInlineAsm() || I->isMetaInstruction())
531 continue;
532
533 UpdateState(State, *I);
534 }
535
536 if (!Expired) {
537 unsigned StateIdx = States.size();
538 StateMapKey Key = {&States, StateIdx};
539 auto Insertion = StateMap.insert_as(std::pair(Key, StateIdx), State);
540 if (Insertion.second) {
541 States.emplace_back(State);
542 } else {
543 StateIdx = Insertion.first->second;
544 }
545 for (MachineBasicBlock *Pred : MBB->predecessors())
546 Worklist.insert(std::pair(Pred, StateIdx));
547 }
548
549 if (WorkIdx == Worklist.size())
550 break;
551
552 unsigned StateIdx;
553 std::tie(MBB, StateIdx) = Worklist[WorkIdx++];
554 State = States[StateIdx];
555 I = MBB->instr_rbegin();
556 }
557
558 return false;
559}
560
561// Returns the minimum number of wait states since \p I, walking all predecessors.
562// Scanning stops as soon as \p IsExpired returns true.
563// Can only be run in a hazard recognizer mode.
564static int
566 const MachineBasicBlock *MBB,
568 int WaitStates, GCNHazardRecognizer::IsExpiredFn IsExpired,
572 for (auto E = MBB->instr_rend(); I != E; ++I) {
573 // Don't add WaitStates for parent BUNDLE instructions.
574 if (I->isBundle())
575 continue;
576
577 if (IsHazard(*I))
578 return WaitStates;
579
580 if (I->isInlineAsm())
581 continue;
582
583 WaitStates += GetNumWaitStates(*I);
584
585 if (IsExpired(*I, WaitStates))
586 return std::numeric_limits<int>::max();
587 }
588
589 int MinWaitStates = std::numeric_limits<int>::max();
590 for (MachineBasicBlock *Pred : MBB->predecessors()) {
591 if (!Visited.insert(Pred).second)
592 continue;
593
594 int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
595 IsExpired, Visited, GetNumWaitStates);
596
597 MinWaitStates = std::min(MinWaitStates, W);
598 }
599
600 return MinWaitStates;
601}
602
603static int
605 const MachineInstr *MI,
610 return getWaitStatesSince(IsHazard, MI->getParent(),
611 std::next(MI->getReverseIterator()), 0, IsExpired,
612 Visited, GetNumWaitStates);
613}
614
615int GCNHazardRecognizer::getWaitStatesSince(
616 IsHazardFn IsHazard, int Limit, GetNumWaitStatesFn GetNumWaitStates) const {
617 if (IsHazardRecognizerMode) {
618 auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
619 return WaitStates >= Limit;
620 };
621 return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn,
622 GetNumWaitStates);
623 }
624
625 int WaitStates = 0;
626 for (MachineInstr *MI : EmittedInstrs) {
627 if (MI) {
628 if (IsHazard(*MI))
629 return WaitStates;
630
631 if (MI->isInlineAsm())
632 continue;
633 }
634 WaitStates += MI ? GetNumWaitStates(*MI) : 1;
635
636 if (WaitStates >= Limit)
637 break;
638 }
639 return std::numeric_limits<int>::max();
640}
641
642int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard,
643 int Limit) const {
644 return getWaitStatesSince(IsHazard, Limit, SIInstrInfo::getNumWaitStates);
645}
646
647int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
648 IsHazardFn IsHazardDef,
649 int Limit) const {
650 const SIRegisterInfo *TRI = ST.getRegisterInfo();
651
652 auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
653 return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
654 };
655
656 return getWaitStatesSince(IsHazardFn, Limit);
657}
658
659int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
660 int Limit) const {
661 auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
662 return isSSetReg(MI.getOpcode()) && IsHazard(MI);
663 };
664
665 return getWaitStatesSince(IsHazardFn, Limit);
666}
667
668//===----------------------------------------------------------------------===//
669// No-op Hazard Detection
670//===----------------------------------------------------------------------===//
671
672static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
673 MCRegister Reg) {
674 for (MCRegUnit Unit : TRI.regunits(Reg))
675 BV.set(static_cast<unsigned>(Unit));
676}
677
678static void addRegsToSet(const SIRegisterInfo &TRI,
680 BitVector &DefSet, BitVector &UseSet) {
681 for (const MachineOperand &Op : Ops) {
682 if (Op.isReg())
683 addRegUnits(TRI, Op.isDef() ? DefSet : UseSet, Op.getReg().asMCReg());
684 }
685}
686
687void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) const {
688 addRegsToSet(TRI, MI.operands(), ClauseDefs, ClauseUses);
689}
690
692 return !SIInstrInfo::isSMRD(*MI);
693}
694
696 return !SIInstrInfo::isVMEM(*MI);
697}
698
699int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) const {
700 // SMEM soft clause are only present on VI+, and only matter if xnack is
701 // enabled.
702 if (!ST.isXNACKEnabled())
703 return 0;
704
705 bool IsSMRD = TII.isSMRD(*MEM);
706
707 resetClause();
708
709 // A soft-clause is any group of consecutive SMEM instructions. The
710 // instructions in this group may return out of order and/or may be
711 // replayed (i.e. the same instruction issued more than once).
712 //
713 // In order to handle these situations correctly we need to make sure that
714 // when a clause has more than one instruction, no instruction in the clause
715 // writes to a register that is read by another instruction in the clause
716 // (including itself). If we encounter this situation, we need to break the
717 // clause by inserting a non SMEM instruction.
718
719 for (MachineInstr *MI : EmittedInstrs) {
720 // When we hit a non-SMEM instruction then we have passed the start of the
721 // clause and we can stop.
722 if (!MI)
723 break;
724
726 break;
727
728 addClauseInst(*MI);
729 }
730
731 if (ClauseDefs.none())
732 return 0;
733
734 // We need to make sure not to put loads and stores in the same clause if they
735 // use the same address. For now, just start a new clause whenever we see a
736 // store.
737 if (MEM->mayStore())
738 return 1;
739
740 addClauseInst(*MEM);
741
742 // If the set of defs and uses intersect then we cannot add this instruction
743 // to the clause, so we have a hazard.
744 return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
745}
746
747int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) const {
748 int WaitStatesNeeded = 0;
749
750 WaitStatesNeeded = checkSoftClauseHazards(SMRD);
751
752 // This SMRD hazard only affects SI.
753 if (!ST.hasSMRDReadVALUDefHazard())
754 return WaitStatesNeeded;
755
756 // A read of an SGPR by SMRD instruction requires 4 wait states when the
757 // SGPR was written by a VALU instruction.
758 int SmrdSgprWaitStates = 4;
759 auto IsHazardDefFn = [this](const MachineInstr &MI) {
760 return TII.isVALU(MI);
761 };
762 auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
763 return TII.isSALU(MI);
764 };
765
766 bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
767
768 for (const MachineOperand &Use : SMRD->uses()) {
769 if (!Use.isReg())
770 continue;
771 int WaitStatesNeededForUse =
772 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
773 SmrdSgprWaitStates);
774 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
775
776 // This fixes what appears to be undocumented hardware behavior in SI where
777 // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
778 // needs some number of nops in between. We don't know how many we need, but
779 // let's use 4. This wasn't discovered before probably because the only
780 // case when this happens is when we expand a 64-bit pointer into a full
781 // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
782 // probably never encountered in the closed-source land.
783 if (IsBufferSMRD) {
784 int WaitStatesNeededForUse =
785 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
786 IsBufferHazardDefFn,
787 SmrdSgprWaitStates);
788 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
789 }
790 }
791
792 return WaitStatesNeeded;
793}
794
795int GCNHazardRecognizer::checkVMEMHazards(MachineInstr *VMEM) const {
796 if (!ST.hasVMEMReadSGPRVALUDefHazard())
797 return 0;
798
799 int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
800
801 // A read of an SGPR by a VMEM instruction requires 5 wait states when the
802 // SGPR was written by a VALU Instruction.
803 const int VmemSgprWaitStates = 5;
804 auto IsHazardDefFn = [this](const MachineInstr &MI) {
805 return TII.isVALU(MI);
806 };
807 for (const MachineOperand &Use : VMEM->uses()) {
808 if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
809 continue;
810
811 int WaitStatesNeededForUse =
812 VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
813 VmemSgprWaitStates);
814 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
815 }
816 return WaitStatesNeeded;
817}
818
819int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) const {
820 const SIRegisterInfo *TRI = ST.getRegisterInfo();
821 const SIInstrInfo *TII = ST.getInstrInfo();
822
823 // Check for DPP VGPR read after VALU VGPR write and EXEC write.
824 int DppVgprWaitStates = 2;
825 int DppExecWaitStates = 5;
826 int WaitStatesNeeded = 0;
827 auto IsHazardDefFn = [TII](const MachineInstr &MI) {
828 return TII->isVALU(MI);
829 };
830
831 for (const MachineOperand &Use : DPP->uses()) {
832 if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
833 continue;
834 int WaitStatesNeededForUse =
835 DppVgprWaitStates - getWaitStatesSinceDef(
836 Use.getReg(),
837 [](const MachineInstr &) { return true; },
838 DppVgprWaitStates);
839 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
840 }
841
842 WaitStatesNeeded = std::max(
843 WaitStatesNeeded,
844 DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
845 DppExecWaitStates));
846
847 return WaitStatesNeeded;
848}
849
850int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) const {
851 const SIInstrInfo *TII = ST.getInstrInfo();
852
853 // v_div_fmas requires 4 wait states after a write to vcc from a VALU
854 // instruction.
855 const int DivFMasWaitStates = 4;
856 auto IsHazardDefFn = [TII](const MachineInstr &MI) {
857 return TII->isVALU(MI);
858 };
859 int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
860 DivFMasWaitStates);
861
862 return DivFMasWaitStates - WaitStatesNeeded;
863}
864
865int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) const {
866 const SIInstrInfo *TII = ST.getInstrInfo();
867 unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
868
869 const int GetRegWaitStates = 2;
870 auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
871 return GetRegHWReg == getHWReg(TII, MI);
872 };
873 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
874
875 return GetRegWaitStates - WaitStatesNeeded;
876}
877
878int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) const {
879 const SIInstrInfo *TII = ST.getInstrInfo();
880 unsigned HWReg = getHWReg(TII, *SetRegInstr);
881
882 const int SetRegWaitStates = ST.getSetRegWaitStates();
883 auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
884 return HWReg == getHWReg(TII, MI);
885 };
886 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
887 return SetRegWaitStates - WaitStatesNeeded;
888}
889
890int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) const {
891 if (!MI.mayStore())
892 return -1;
893
894 const SIInstrInfo *TII = ST.getInstrInfo();
895 unsigned Opcode = MI.getOpcode();
896 const MCInstrDesc &Desc = MI.getDesc();
897
898 int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
899 int VDataRCID = -1;
900 if (VDataIdx != -1)
901 VDataRCID = TII->getOpRegClassID(Desc.operands()[VDataIdx]);
902
903 if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
904 // There is no hazard if the instruction does not use vector regs
905 // (like wbinvl1)
906 if (VDataIdx == -1)
907 return -1;
908 if (AMDGPU::getRegBitWidth(VDataRCID) > 64) {
909 // On gfx940-family the BUFFER_STORE source-vgpr WAR hazard exists for
910 // every SOFFSET shape; the wait-state count differs by SOFFSET, and is
911 // computed in checkVALUHazardsHelper. Pre-gfx940 the hazard only exists
912 // if soffset is not an SGPR.
913 if (ST.hasGFX940Insts())
914 return VDataIdx;
915 const MachineOperand *SOffset =
916 TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
917 if (!SOffset || !SOffset->isReg())
918 return VDataIdx;
919 }
920 }
921
922 // MIMG instructions create a hazard if they don't use a 256-bit T# and
923 // the store size is greater than 8 bytes and they have more than two bits
924 // of their dmask set.
925 // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
926 if (TII->isMIMG(MI)) {
927 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
928 assert(SRsrcIdx != -1 && AMDGPU::getRegBitWidth(TII->getOpRegClassID(
929 Desc.operands()[SRsrcIdx])) == 256);
930 (void)SRsrcIdx;
931 }
932
933 if (TII->isFLAT(MI)) {
934 // There is no hazard if the instruction does not use vector regs
935 if (VDataIdx == -1)
936 return -1;
937
938 if (AMDGPU::getRegBitWidth(VDataRCID) > 64)
939 return VDataIdx;
940 }
941
942 return -1;
943}
944
945int GCNHazardRecognizer::checkVALUHazardsHelper(
946 const MachineOperand &Def, const MachineRegisterInfo &MRI) const {
947 // Helper to check for the hazard where VMEM instructions that store more
948 // than 8 bytes can have their store data overwritten by the next
949 // instruction. On gfx940-family the window depends on the producer's
950 // SOFFSET shape:
951 // - MUBUF/MTBUF wide store with sgpr SOFFSET: 1 wait state.
952 // - MUBUF/MTBUF wide store with literal/absent SOFFSET, and FLAT wide
953 // store: 2 wait states.
954 // Pre-gfx940 keeps a single 1-wait-state window. The 1-cycle sgpr-SOFFSET
955 // window was measured on gfx950 (MI350X); the same gate is applied to the
956 // rest of the gfx940 family to match the existing rule's granularity.
957 const SIRegisterInfo *TRI = ST.getRegisterInfo();
958 const SIInstrInfo *TII = ST.getInstrInfo();
959
960 int WaitStatesNeeded = 0;
961 if (!TRI->isVectorRegister(MRI, Def.getReg()))
962 return WaitStatesNeeded;
963 const Register Reg = Def.getReg();
964
965 const int MaxWaitStates = ST.hasGFX940Insts() ? 2 : 1;
966
967 // Per-producer required wait-state window. On pre-gfx940 every producer
968 // uses 1; on gfx940-family MUBUF/MTBUF stores with an SGPR SOFFSET use 1
969 // and everything else (literal/absent SOFFSET, FLAT) uses 2.
970 auto WindowFor = [this, TII](const MachineInstr &MI) -> int {
971 if (!ST.hasGFX940Insts())
972 return 1;
973 if (TII->isBUF(MI)) {
974 const MachineOperand *SOffset =
975 TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
976 if (SOffset && SOffset->isReg())
977 return 1;
978 }
979 return 2;
980 };
981
982 // For each hazard producer reached, accumulate the wait states still
983 // needed using that producer's own window. The predicate always returns
984 // false so the walk runs to MaxWaitStates.
985 int Distance = 0;
986 auto Counter = [&](const MachineInstr &MI) {
987 int DataIdx = createsVALUHazard(MI);
988 if (DataIdx >= 0 &&
989 TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg)) {
990 int Need = WindowFor(MI) - Distance;
991 WaitStatesNeeded = std::max(WaitStatesNeeded, Need);
992 }
993 // Mirror getWaitStatesSince's accounting, which does not count inline asm
994 // towards the wait-state distance.
995 if (!MI.isInlineAsm())
997 return false;
998 };
999 getWaitStatesSince(Counter, MaxWaitStates);
1000
1001 return WaitStatesNeeded;
1002}
1003
1004/// Dest sel forwarding issue occurs if additional logic is needed to swizzle /
1005/// pack the computed value into correct bit position of the dest register. This
1006/// occurs if we have SDWA with dst_sel != DWORD or if we have op_sel with
1007/// dst_sel that is not aligned to the register. This function analyzes the \p
1008/// MI and \returns an operand with dst forwarding issue, or nullptr if
1009/// none exists.
1010static const MachineOperand *
1012 if (!SIInstrInfo::isVALU(MI))
1013 return nullptr;
1014
1015 const SIInstrInfo *TII = ST.getInstrInfo();
1016
1017 unsigned Opcode = MI.getOpcode();
1018
1019 // There are three different types of instructions
1020 // which produce forwarded dest: 1. SDWA with dst_sel != DWORD, 2. VOP3
1021 // which write hi bits (e.g. op_sel[3] == 1), and 3. FP8DstSelInst
1022 // (instructions with dest byte sel, e.g. CVT_SR_BF8_F32) and
1023 // op_sel[3:2]
1024 // != 0
1025 if (SIInstrInfo::isSDWA(MI)) {
1026 // Type 1: SDWA with dst_sel != DWORD
1027 if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
1028 if (DstSel->getImm() != AMDGPU::SDWA::DWORD)
1029 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
1030 }
1031
1032 AMDGPU::FPType IsFP4OrFP8ConvOpc = AMDGPU::getFPDstSelType(Opcode);
1033 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel)) {
1034 // Type 2: VOP3 which write the hi bits
1035 if (TII->getNamedImmOperand(MI, AMDGPU::OpName::src0_modifiers) &
1037 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
1038
1039 // Type 3: FP8DstSelInst with op_sel[3:2] != 0)
1040 if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP8 &&
1041 (TII->getNamedImmOperand(MI, AMDGPU::OpName::src2_modifiers) &
1043 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
1044 }
1045
1046 // Special case: nop is required for all the opsel values for fp4 sr variant
1047 // cvt scale instructions
1048 if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP4)
1049 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
1050
1051 return nullptr;
1052}
1053
1054 /// Checks whether the provided \p MI "consumes" the operand with a Dest sel
1055 /// forwarding issue \p Dst . We may "consume" the Dst via a standard explicit
1056 /// RAW, or through irregular ways (e.g implicit RAW, certain types of WAW)
// NOTE(review): listing line 1057 is absent from this copy; it presumably
// holds the return type, function name and first parameter (the body uses
// VALU) -- confirm against the upstream file.
1058 const MachineOperand *Dst,
1059 const SIRegisterInfo *TRI) {
1060 // We must consider implicit reads of the VALU. SDWA with dst_sel and
1061 // UNUSED_PRESERVE will implicitly read the result from forwarded dest,
1062 // and we must account for that hazard.
1063 // We also must account for WAW hazards. In particular, WAW with dest
1064 // preserve semantics (e.g. VOP3 with op_sel, VOP2 &&
1065 // !zeroesHigh16BitsOfDest) will read the forwarded dest for parity
1066 // check for ECC. Without accounting for this hazard, the ECC will be
1067 // wrong.
1068 // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e.
1069 // complete zeroesHigh16BitsOfDest)
// Any register operand of VALU (the loop does not distinguish uses from defs)
// that overlaps Dst counts as consuming it.
1070 for (auto &Operand : VALU->operands()) {
1071 if (Operand.isReg() && TRI->regsOverlap(Dst->getReg(), Operand.getReg())) {
1072 return true;
1073 }
1074 }
1075 return false;
1076}
1077
// Returns the number of wait states that must precede \p VALU. The result is
// the maximum over several independent checks, each gated on a subtarget
// query; every check computes "required window - wait states since the
// matching producer" and folds it in with std::max.
1078 int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) const {
1079 int WaitStatesNeeded = 0;
1080
// Check 1: only applies when VALU itself is not a TRANS instruction.
1081 if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(*VALU)) {
1082 const int TransDefWaitstates = 1;
1083
// NOTE(review): listing line 1085 (the condition guarding the first
// `return false` of this lambda) is absent from this copy.
1084 auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
1086 return false;
1087 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1088 const SIInstrInfo *TII = ST.getInstrInfo();
1089 Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();
1090
// Hazard when MI's vdst overlaps any explicit register use of VALU.
1091 for (const MachineOperand &Use : VALU->explicit_uses()) {
1092 if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
1093 return true;
1094 }
1095
1096 return false;
1097 };
1098
1099 int WaitStatesNeededForDef =
1100 TransDefWaitstates -
1101 getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
1102 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1103 }
1104
// Check 2: dst-sel / cvt-scale forwarding, one wait state.
1105 if (ST.hasDstSelForwardingHazard() || ST.hasCvtScaleForwardingHazard()) {
1106 const int Shift16DefWaitstates = 1;
1107
1108 auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) {
1109 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1110 const MachineOperand *ForwardedDst =
1111 getDstSelForwardingOperand(ProducerMI, ST);
1112 if (ForwardedDst) {
1113 return consumesDstSelForwardingOperand(VALU, ForwardedDst, TRI);
1114 }
1115
1116 if (ProducerMI.isInlineAsm()) {
1117 // Assume inline asm has dst forwarding hazard
1118 for (auto &Def : ProducerMI.all_defs()) {
1119 if (consumesDstSelForwardingOperand(VALU, &Def, TRI))
1120 return true;
1121 }
1122 }
1123
1124 return false;
1125 };
1126
1127 int WaitStatesNeededForDef =
1128 Shift16DefWaitstates -
1129 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1130 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1131 }
1132
// Check 3: VALU writes followed by VALU reads of the same register, with
// different windows depending on what is read.
1133 if (ST.hasVDecCoExecHazard()) {
1134 const int VALUWriteSGPRVALUReadWaitstates = 2;
1135 const int VALUWriteEXECRWLane = 4;
1136 const int VALUWriteVGPRReadlaneRead = 1;
1137
1138 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1139 const MachineRegisterInfo &MRI = MF.getRegInfo();
// NOTE(review): listing line 1140 is absent from this copy; it presumably
// declares UseReg, which the lambda below captures by reference and which is
// reassigned before each query.
1141 auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
1142 if (!SIInstrInfo::isVALU(MI))
1143 return false;
1144 return MI.modifiesRegister(UseReg, TRI);
1145 };
1146
// Explicit SGPR uses of VALU.
1147 for (const MachineOperand &Use : VALU->explicit_uses()) {
1148 if (!Use.isReg())
1149 continue;
1150
1151 UseReg = Use.getReg();
1152 if (TRI->isSGPRReg(MRI, UseReg)) {
1153 int WaitStatesNeededForDef =
1154 VALUWriteSGPRVALUReadWaitstates -
1155 getWaitStatesSince(IsVALUDefSGPRFn,
1156 VALUWriteSGPRVALUReadWaitstates);
1157 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1158 }
1159 }
1160
// VCC is queried through readsRegister, separately from the explicit uses
// scanned above.
1161 if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
1162 UseReg = AMDGPU::VCC;
1163 int WaitStatesNeededForDef =
1164 VALUWriteSGPRVALUReadWaitstates -
1165 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
1166 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1167 }
1168
// Lane-access opcodes: the two read variants first check src0, then fall
// through to share the EXEC check with V_WRITELANE_B32.
1169 switch (VALU->getOpcode()) {
1170 case AMDGPU::V_READLANE_B32:
1171 case AMDGPU::V_READFIRSTLANE_B32: {
1172 MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
1173 UseReg = Src->getReg();
1174 int WaitStatesNeededForDef =
1175 VALUWriteVGPRReadlaneRead -
1176 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
1177 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1178 }
1179 [[fallthrough]];
1180 case AMDGPU::V_WRITELANE_B32: {
1181 UseReg = AMDGPU::EXEC;
1182 int WaitStatesNeededForDef =
1183 VALUWriteEXECRWLane -
1184 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
1185 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1186 break;
1187 }
1188 default:
1189 break;
1190 }
1191 }
1192
1193 // This checks for the hazard where VMEM instructions that store more than
1194 // 8 bytes can have their store data overwritten by the next instruction.
1195 if (!ST.has12DWordStoreHazard())
1196 return WaitStatesNeeded;
1197
1198 const MachineRegisterInfo &MRI = MF.getRegInfo();
1199
// Every def of VALU is run through the shared helper.
1200 for (const MachineOperand &Def : VALU->defs()) {
1201 WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
1202 }
1203
1204 return WaitStatesNeeded;
1205}
1206
// Returns the number of wait states that must precede the inline asm
// statement \p IA, based on the vector-register defs it declares and on the
// registers it reads or modifies.
1207 int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) const {
1208 // This checks for hazards associated with inline asm statements.
1209 // Since inline asms can contain just about anything, we use this
1210 // to call/leverage other check*Hazard routines. Note that
1211 // this function doesn't attempt to address all possible inline asm
1212 // hazards (good luck), but is a collection of what has been
1213 // problematic thus far.
1214
1215 // see checkVALUHazards()
1216 if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard() &&
1217 !ST.hasCvtScaleForwardingHazard())
1218 return 0;
1219
1220 const MachineRegisterInfo &MRI = MF.getRegInfo();
1221 int WaitStatesNeeded = 0;
1222
// NOTE(review): listing line 1224 (the range expression of this loop and its
// opening brace) is absent from this copy.
1223 for (const MachineOperand &Op :
1225 if (Op.isReg() && Op.isDef()) {
// Only vector-register defs are passed to the store-hazard helper.
1226 if (!TRI.isVectorRegister(MRI, Op.getReg()))
1227 continue;
1228
1229 if (ST.has12DWordStoreHazard()) {
1230 WaitStatesNeeded =
1231 std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
1232 }
1233 }
1234 }
1235
// NOTE(review): the early exit above also considers
// hasCvtScaleForwardingHazard(), but this check is gated on
// hasDstSelForwardingHazard() alone -- confirm that is intended.
1236 if (ST.hasDstSelForwardingHazard()) {
1237 const int Shift16DefWaitstates = 1;
1238
1239 auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) {
1240 const MachineOperand *Dst = getDstSelForwardingOperand(ProducerMI, ST);
1241 // Assume inline asm reads the dst
1242 if (Dst)
1243 return IA->modifiesRegister(Dst->getReg(), &TRI) ||
1244 IA->readsRegister(Dst->getReg(), &TRI);
1245
1246 if (ProducerMI.isInlineAsm()) {
1247 // If MI is inline asm, assume it has dst forwarding hazard
1248 for (auto &Def : ProducerMI.all_defs()) {
1249 if (IA->modifiesRegister(Def.getReg(), &TRI) ||
1250 IA->readsRegister(Def.getReg(), &TRI)) {
1251 return true;
1252 }
1253 }
1254 }
1255
1256 return false;
1257 };
1258
1259 int WaitStatesNeededForDef =
1260 Shift16DefWaitstates -
1261 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1262 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1263 }
1264
1265 return WaitStatesNeeded;
1266}
1267
1268int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) const {
1269 const SIInstrInfo *TII = ST.getInstrInfo();
1270 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1271 const MachineRegisterInfo &MRI = MF.getRegInfo();
1272
1273 const MachineOperand *LaneSelectOp =
1274 TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
1275
1276 if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
1277 return 0;
1278
1279 Register LaneSelectReg = LaneSelectOp->getReg();
1280 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };
1281
1282 const int RWLaneWaitStates = 4;
1283 int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
1284 RWLaneWaitStates);
1285 return RWLaneWaitStates - WaitStatesSince;
1286}
1287
1288int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) const {
1289 if (!ST.hasRFEHazards())
1290 return 0;
1291
1292 const SIInstrInfo *TII = ST.getInstrInfo();
1293
1294 const int RFEWaitStates = 1;
1295
1296 auto IsHazardFn = [TII](const MachineInstr &MI) {
1297 return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
1298 };
1299 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
1300 return RFEWaitStates - WaitStatesNeeded;
1301}
1302
1303int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) const {
1304 const SIInstrInfo *TII = ST.getInstrInfo();
1305 const int ReadM0WaitStates = 1;
1306 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
1307 return ReadM0WaitStates -
1308 getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
1309}
1310
// Inserts WaitStatesNeeded V_NOP_e32 instructions into MBB before InsertPt.
// When IsHoisting is set the nops get an empty DebugLoc; otherwise they reuse
// the debug location of the instruction at InsertPt.
// NOTE(review): listing line 1312 (presumably the InsertPt parameter) is
// absent from this copy.
1311 void GCNHazardRecognizer::emitVNops(MachineBasicBlock &MBB,
1313 int WaitStatesNeeded, bool IsHoisting) {
1314 const DebugLoc &DL = IsHoisting ? DebugLoc() : InsertPt->getDebugLoc();
// A non-positive count emits nothing.
1315 for (int I = 0; I < WaitStatesNeeded; ++I)
1316 BuildMI(MBB, InsertPt, DL, TII.get(AMDGPU::V_NOP_e32));
1317}
1318
// Runs every fix* routine on \p MI in a fixed order. Most are called
// unconditionally; the LDS-direct pair and the last four are only called when
// the corresponding subtarget query returns true. The boolean results of the
// individual routines are ignored here.
1319 void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
1320 fixVMEMtoScalarWriteHazards(MI);
1321 fixVcmpxPermlaneHazards(MI);
1322 fixSMEMtoVectorWriteHazards(MI);
1323 fixVcmpxExecWARHazard(MI);
1324 fixLdsBranchVmemWARHazard(MI);
1325 if (ST.hasLdsDirect()) {
1326 fixLdsDirectVALUHazard(MI);
1327 fixLdsDirectVMEMHazard(MI);
1328 }
1329 fixVALUPartialForwardingHazard(MI);
1330 fixVALUTransUseHazard(MI);
1331 fixVALUTransCoexecutionHazards(MI);
1332 fixWMMAHazards(MI); // fall-through if co-execution is enabled.
1333 fixWMMACoexecutionHazards(MI);
1334 fixShift64HighRegBug(MI);
1335 fixVALUMaskWriteHazard(MI);
1336 fixRequiredExportPriority(MI);
1337 if (ST.requiresWaitIdleBeforeGetReg())
1338 fixGetRegWaitIdle(MI);
1339 if (ST.hasDsAtomicAsyncBarrierArriveB64PipeBug())
1340 fixDsAtomicAsyncBarrierArriveB64(MI);
1341 if (ST.hasScratchBaseForwardingHazard())
1342 fixScratchBaseForwardingHazard(MI);
1343 if (ST.setRegModeNeedsVNOPs())
1344 fixSetRegMode(MI);
1345}
1346
// Returns true when MI modifies EXEC and is either a VOPC instruction or a
// compare encoded as VOP3 or SDWA.
// NOTE(review): listing line 1347 (the start of the signature; the body uses
// TII and TRI by reference) is absent from this copy.
1348 const MachineInstr &MI) {
1349 return (TII.isVOPC(MI) ||
1350 (MI.isCompare() && (TII.isVOP3(MI) || TII.isSDWA(MI)))) &&
1351 MI.modifiesRegister(AMDGPU::EXEC, &TRI);
1352}
1353
// Inserts a V_MOV_B32 before a permlane instruction \p MI when a preceding
// compare that writes EXEC (see isVCmpXWritesExec) is reachable without an
// intervening non-nop VALU. Returns true if an instruction was inserted.
1354 bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
1355 if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
1356 return false;
1357
1358 const SIInstrInfo *TII = ST.getInstrInfo();
1359 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1360 auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
1361 return isVCmpXWritesExec(*TII, *TRI, MI);
1362 };
1363
// Any VALU other than the three V_NOP encodings ends the search.
1364 auto IsExpiredFn = [](const MachineInstr &MI, int) {
1365 unsigned Opc = MI.getOpcode();
1366 return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
1367 Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
1368 };
1369
// numeric_limits<int>::max() is the "no hazard found" result of the search.
1370 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1371 std::numeric_limits<int>::max())
1372 return false;
1373
1374 // V_NOP will be discarded by SQ.
1375 // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
1376 // which is always a VGPR and available.
1377 auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
1378 Register Reg = Src0->getReg();
1379 bool IsUndef = Src0->isUndef();
// NOTE(review): listing lines 1382-1383 (the operands appended to this
// BuildMI, which presumably consume Reg and IsUndef) are absent from this
// copy.
1380 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1381 TII->get(AMDGPU::V_MOV_B32_e32))
1384
1385 return true;
1386}
1387
// Inserts an S_WAITCNT_DEPCTR before \p MI when one of MI's defs is still
// used by an earlier instruction matched by IsHazardFn and nothing in between
// has expired the hazard. Returns true if an instruction was inserted.
1388 bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
1389 if (!ST.hasVMEMtoScalarWriteHazard())
1390 return false;
1391 assert(!ST.hasExtendedWaitCounts());
1392
// NOTE(review): listing line 1393 (the condition guarding this early return)
// is absent from this copy.
1394 return false;
1395
1396 if (MI->getNumDefs() == 0)
1397 return false;
1398
1399 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1400
// NOTE(review): listing line 1402 (the filter on I guarding the first
// `return false`) is absent from this copy.
1401 auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
1403 return false;
1404
// Hazard when I has a use operand for any register MI defines.
1405 for (const MachineOperand &Def : MI->defs()) {
1406 const MachineOperand *Op =
1407 I.findRegisterUseOperand(Def.getReg(), TRI, false);
1408 if (!Op)
1409 continue;
1410 return true;
1411 }
1412 return false;
1413 };
1414
// The search stops at any VALU, at an S_WAITCNT whose immediate is zero, or
// at an S_WAITCNT_DEPCTR whose decoded vm_vsrc field is zero.
1415 auto IsExpiredFn = [](const MachineInstr &MI, int) {
1416 return SIInstrInfo::isVALU(MI) ||
1417 (MI.getOpcode() == AMDGPU::S_WAITCNT &&
1418 !MI.getOperand(0).getImm()) ||
1419 (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1420 AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0);
1421 };
1422
1423 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1424 std::numeric_limits<int>::max())
1425 return false;
1426
1427 const SIInstrInfo *TII = ST.getInstrInfo();
// NOTE(review): listing line 1430 (the immediate operand of the inserted
// S_WAITCNT_DEPCTR) is absent from this copy.
1428 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1429 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1431 return true;
1432}
1433
1434bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
1435 if (!ST.hasSMEMtoVectorWriteHazard())
1436 return false;
1437 assert(!ST.hasExtendedWaitCounts());
1438
1439 if (!SIInstrInfo::isVALU(*MI))
1440 return false;
1441
1442 AMDGPU::OpName SDSTName;
1443 switch (MI->getOpcode()) {
1444 case AMDGPU::V_READLANE_B32:
1445 case AMDGPU::V_READFIRSTLANE_B32:
1446 SDSTName = AMDGPU::OpName::vdst;
1447 break;
1448 default:
1449 SDSTName = AMDGPU::OpName::sdst;
1450 break;
1451 }
1452
1453 const SIInstrInfo *TII = ST.getInstrInfo();
1454 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1455 const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
1456 const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
1457 if (!SDST) {
1458 for (const auto &MO : MI->implicit_operands()) {
1459 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
1460 SDST = &MO;
1461 break;
1462 }
1463 }
1464 }
1465
1466 if (!SDST)
1467 return false;
1468
1469 const Register SDSTReg = SDST->getReg();
1470 auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
1471 return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
1472 };
1473
1474 auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
1475 if (TII->isSALU(MI)) {
1476 switch (MI.getOpcode()) {
1477 case AMDGPU::S_SETVSKIP:
1478 case AMDGPU::S_VERSION:
1479 case AMDGPU::S_WAITCNT_VSCNT:
1480 case AMDGPU::S_WAITCNT_VMCNT:
1481 case AMDGPU::S_WAITCNT_EXPCNT:
1482 // These instructions cannot not mitigate the hazard.
1483 return false;
1484 case AMDGPU::S_WAITCNT_LGKMCNT:
1485 // Reducing lgkmcnt count to 0 always mitigates the hazard.
1486 return (MI.getOperand(1).getImm() == 0) &&
1487 (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1488 case AMDGPU::S_WAITCNT: {
1489 const int64_t Imm = MI.getOperand(0).getImm();
1490 AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
1491 // DsCnt corresponds to LGKMCnt here.
1492 return Decoded.get(AMDGPU::DS_CNT) == 0;
1493 }
1494 default:
1495 assert((!SIInstrInfo::isWaitcnt(MI.getOpcode()) ||
1496 MI.getOpcode() == AMDGPU::S_WAIT_IDLE) &&
1497 "unexpected wait count instruction");
1498 // SOPP instructions cannot mitigate the hazard.
1499 if (TII->isSOPP(MI))
1500 return false;
1501 // At this point the SALU can be assumed to mitigate the hazard
1502 // because either:
1503 // (a) it is independent of the at risk SMEM (breaking chain),
1504 // or
1505 // (b) it is dependent on the SMEM, in which case an appropriate
1506 // s_waitcnt lgkmcnt _must_ exist between it and the at risk
1507 // SMEM instruction.
1508 return true;
1509 }
1510 }
1511 return false;
1512 };
1513
1514 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1515 std::numeric_limits<int>::max())
1516 return false;
1517
1518 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1519 TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1520 .addImm(0);
1521 return true;
1522}
1523
// Inserts an S_WAITCNT_DEPCTR before a VALU \p MI that modifies EXEC when an
// earlier instruction matched by IsHazardFn reads EXEC and nothing in between
// has expired the hazard. Returns true if an instruction was inserted.
1524 bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1525 if (!ST.hasVcmpxExecWARHazard())
1526 return false;
1527 assert(!ST.hasExtendedWaitCounts());
1528
1529 if (!SIInstrInfo::isVALU(*MI))
1530 return false;
1531
1532 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1533 if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
1534 return false;
1535
// NOTE(review): listing line 1537 (the filter on I guarding the first
// `return false`) is absent from this copy.
1536 auto IsHazardFn = [TRI](const MachineInstr &I) {
1538 return false;
1539 return I.readsRegister(AMDGPU::EXEC, TRI);
1540 };
1541
1542 const SIInstrInfo *TII = ST.getInstrInfo();
// The search stops at a VALU that has an sdst operand or an implicit def in
// an SGPR class, or at an S_WAITCNT_DEPCTR whose decoded sa_sdst field is
// zero.
1543 auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
1544 if (SIInstrInfo::isVALU(MI)) {
1545 if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
1546 return true;
1547 for (auto MO : MI.implicit_operands())
1548 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
1549 return true;
1550 }
1551 if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1552 AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0)
1553 return true;
1554 return false;
1555 };
1556
1557 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1558 std::numeric_limits<int>::max())
1559 return false;
1560
// NOTE(review): listing line 1563 (the immediate operand of the inserted
// S_WAITCNT_DEPCTR) is absent from this copy.
1561 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1562 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1564 return true;
1565}
1566
// Scans the whole function and returns true once both HasLds and HasVmem have
// been set; returns false immediately when the subtarget lacks the hazard.
// NOTE(review): listing line 1567 (the start of the signature; the body
// iterates MF) is absent from this copy.
1568 const GCNSubtarget &ST) {
1569 if (!ST.hasLdsBranchVmemWARHazard())
1570 return false;
1571
1572 // Check if the necessary condition for the hazard is met: both LDS and VMEM
1573 // instructions need to appear in the same function.
1574 bool HasLds = false;
1575 bool HasVmem = false;
1576 for (auto &MBB : MF) {
1577 for (auto &MI : MBB) {
// NOTE(review): listing line 1578 (presumably the update of HasLds) is absent
// from this copy.
1579 HasVmem |= SIInstrInfo::isVMEM(MI);
1580 if (HasLds && HasVmem)
1581 return true;
1582 }
1583 }
1584 return false;
1585}
1586
// Returns true when I is an S_WAITCNT_VSCNT whose register operand is
// SGPR_NULL and whose immediate is zero.
// NOTE(review): listing line 1587 (the signature; the body uses I) is absent
// from this copy.
1588 return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1589 I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1590 !I.getOperand(1).getImm();
1591}
1592
// Inserts `S_WAITCNT_VSCNT null, 0` before \p MI when MI is one of the two
// instruction kinds classified by IsHazardInst and, looking backwards, a
// branch is found that is itself preceded by an instruction of the other
// kind, with no intervening expiry. Returns true if an instruction was
// inserted.
1593 bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1594 if (!RunLdsBranchVmemWARHazardFixup)
1595 return false;
1596
1597 assert(ST.hasLdsBranchVmemWARHazard());
1598 assert(!ST.hasExtendedWaitCounts());
1599
// Classifies an instruction as kind 1, kind 2, or 0 (not involved).
// NOTE(review): listing lines 1601 and 1603 (the two conditions selecting
// kind 1 and kind 2) are absent from this copy.
1600 auto IsHazardInst = [](const MachineInstr &MI) {
1602 return 1;
1604 return 2;
1605 return 0;
1606 };
1607
1608 auto InstType = IsHazardInst(*MI);
1609 if (!InstType)
1610 return false;
1611
// Outer search expiry: any classified instruction or a zero store-count wait.
1612 auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
1613 return IsHazardInst(I) || isStoreCountWaitZero(I);
1614 };
1615
// Outer search hazard: a branch for which a nested backwards search (started
// at that branch) finds an instruction of the other kind.
1616 auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
1617 if (!I.isBranch())
1618 return false;
1619
1620 auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
1621 auto InstType2 = IsHazardInst(I);
1622 return InstType2 && InstType != InstType2;
1623 };
1624
// Nested search expiry: an instruction of the same kind as MI, or a zero
// store-count wait.
1625 auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
1626 auto InstType2 = IsHazardInst(I);
1627 if (InstType == InstType2)
1628 return true;
1629
1630 return isStoreCountWaitZero(I);
1631 };
1632
1633 return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
1634 std::numeric_limits<int>::max();
1635 };
1636
1637 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1638 std::numeric_limits<int>::max())
1639 return false;
1640
1641 const SIInstrInfo *TII = ST.getInstrInfo();
1642 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1643 TII->get(AMDGPU::S_WAITCNT_VSCNT))
1644 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1645 .addImm(0);
1646
1647 return true;
1648}
1649
// Sets the waitvdst operand of \p MI from the number of VALU instructions
// between MI and the nearest earlier VALU that reads or writes MI's vdst,
// capped at NoHazardWaitStates. Returns true when the operand was updated.
1650 bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
// NOTE(review): listing line 1651 (the condition guarding this early return)
// is absent from this copy.
1652 return false;
1653
1654 const int NoHazardWaitStates = 15;
1655 const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1656 const Register VDSTReg = VDST->getReg();
1657
// VisitedTrans is a side effect of the hazard predicate: it records whether
// any TRANS instruction was seen during the backwards walk.
1658 bool VisitedTrans = false;
1659 auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
1660 if (!SIInstrInfo::isVALU(I))
1661 return false;
1662 VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I);
1663 // Cover both WAR and WAW
1664 return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1665 };
1666 auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
1667 if (WaitStates >= NoHazardWaitStates)
1668 return true;
1669 // Instructions which cause va_vdst==0 expire hazard
// NOTE(review): listing lines 1670-1671 (the return expression of this
// lambda) are absent from this copy.
1672 };
// Only VALU instructions advance the wait-state count.
1673 auto GetWaitStatesFn = [](const MachineInstr &MI) {
1674 return SIInstrInfo::isVALU(MI) ? 1 : 0;
1675 };
1676
1677 DenseSet<const MachineBasicBlock *> Visited;
1678 auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
1679 std::next(MI->getReverseIterator()), 0,
1680 IsExpiredFn, Visited, GetWaitStatesFn);
1681
1682 // Transcendentals can execute in parallel to other VALUs.
1683 // This makes va_vdst count unusable with a mixture of VALU and TRANS.
1684 if (VisitedTrans)
1685 Count = 0;
1686
1687 MachineOperand *WaitVdstOp =
1688 TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
1689 WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));
1690
1691 return true;
1692}
1693
// Resolves a hazard between \p MI and an earlier instruction (matched by
// IsHazardFn) that reads or writes MI's vdst. When the subtarget reports
// hasLdsWaitVMSRC() the fix is to zero MI's waitvsrc operand; otherwise an
// S_WAITCNT_DEPCTR is inserted before MI. Returns true if a fix was applied.
1694 bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
// NOTE(review): listing line 1695 (the condition guarding this early return)
// is absent from this copy.
1696 return false;
1697
1698 const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1699 const Register VDSTReg = VDST->getReg();
1700
// NOTE(review): listing line 1702 (the filter on I guarding the first
// `return false`) is absent from this copy.
1701 auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
1703 return false;
1704 return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1705 };
1706 bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
1707 // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT
1708 // according to the type of VMEM instruction.
// NOTE(review): listing line 1710 (the first term of this expiry expression)
// is absent from this copy. The visible terms expire the hazard on an
// S_WAITCNT with a zero immediate, an S_WAITCNT_DEPCTR whose decoded vm_vsrc
// field is zero, or (when LdsdirCanWait) an LDSDIR with waitvsrc == 0.
1709 auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {
1711 (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
1712 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1713 AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0) ||
1714 (LdsdirCanWait && SIInstrInfo::isLDSDIR(I) &&
1715 !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm());
1716 };
1717
1718 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1719 std::numeric_limits<int>::max())
1720 return false;
1721
1722 if (LdsdirCanWait) {
1723 TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0);
1724 } else {
// NOTE(review): listing line 1727 (the immediate operand of the inserted
// S_WAITCNT_DEPCTR) is absent from this copy.
1725 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1726 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1728 }
1729
1730 return true;
1731}
1732
// Inserts an S_WAITCNT_DEPCTR before a wave64 VALU \p MI that reads at least
// two distinct VGPRs when the producer/EXEC-write pattern described in the
// body is found by a stateful backwards search. Returns true if an
// instruction was inserted.
1733 bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
1734 if (!ST.hasVALUPartialForwardingHazard())
1735 return false;
1736 assert(!ST.hasExtendedWaitCounts());
1737
1738 if (!ST.isWave64() || !SIInstrInfo::isVALU(*MI))
1739 return false;
1740
// Collect the distinct VGPRs among MI's explicit uses.
1741 SmallSetVector<Register, 4> SrcVGPRs;
1742
1743 for (const MachineOperand &Use : MI->explicit_uses()) {
1744 if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1745 SrcVGPRs.insert(Use.getReg());
1746 }
1747
1748 // Only applies with >= 2 unique VGPR sources
1749 if (SrcVGPRs.size() <= 1)
1750 return false;
1751
1752 // Look for the following pattern:
1753 // Va <- VALU [PreExecPos]
1754 // intv1
1755 // Exec <- SALU [ExecPos]
1756 // intv2
1757 // Vb <- VALU [PostExecPos]
1758 // intv3
1759 // MI Va, Vb (WaitState = 0)
1760 //
1761 // Where:
1762 // intv1 + intv2 <= 2 VALUs
1763 // intv3 <= 4 VALUs
1764 //
1765 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1766
1767 const int Intv1plus2MaxVALUs = 2;
1768 const int Intv3MaxVALUs = 4;
1769 const int IntvMaxVALUs = 6;
1770 const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
1771
// Search state: for each source VGPR, the VALU count at which its defining
// VALU was seen (DefPos); the VALU count at which the EXEC write was seen
// (ExecPos, max() while unseen); and the running VALU count. The hash and
// equality helpers cover all three fields.
1772 struct StateType {
1773 SmallDenseMap<Register, int, 4> DefPos;
1774 int ExecPos = std::numeric_limits<int>::max();
1775 int VALUs = 0;
1776
1777 static unsigned getHashValue(const StateType &State) {
1778 return hash_combine(State.ExecPos, State.VALUs,
1779 hash_combine_range(State.DefPos));
1780 }
1781 static bool isEqual(const StateType &LHS, const StateType &RHS) {
1782 return LHS.DefPos == RHS.DefPos && LHS.ExecPos == RHS.ExecPos &&
1783 LHS.VALUs == RHS.VALUs;
1784 }
1785 };
1786
1787 StateType State;
1788
1789 // This overloads expiry testing with all the hazard detection
1790 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1791 // Too many VALU states have passed
1792 if (State.VALUs > NoHazardVALUWaitStates)
1793 return HazardExpired;
1794
1795 // Instructions which cause va_vdst==0 expire hazard
// NOTE(review): listing lines 1796-1797 (the start of this condition) are
// absent from this copy.
1798 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1799 AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
1800 return HazardExpired;
1801
1802 // Track registers writes
1803 bool Changed = false;
1804 if (SIInstrInfo::isVALU(I)) {
// Only the first definition encountered for each source (the nearest one to
// MI, since the walk is backwards) is recorded.
1805 for (Register Src : SrcVGPRs) {
1806 if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
1807 State.DefPos[Src] = State.VALUs;
1808 Changed = true;
1809 }
1810 }
1811 } else if (SIInstrInfo::isSALU(I)) {
// An EXEC write is only recorded once, and only after at least one source
// definition has already been seen.
1812 if (State.ExecPos == std::numeric_limits<int>::max()) {
1813 if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
1814 State.ExecPos = State.VALUs;
1815 Changed = true;
1816 }
1817 }
1818 }
1819
1820 // Early expiration: too many VALUs in intv3
1821 if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
1822 return HazardExpired;
1823
1824 // Only evaluate state if something changed
1825 if (!Changed)
1826 return NoHazardFound;
1827
1828 // Determine positions of VALUs pre/post exec change
1829 if (State.ExecPos == std::numeric_limits<int>::max())
1830 return NoHazardFound;
1831
1832 int PreExecPos = std::numeric_limits<int>::max();
1833 int PostExecPos = std::numeric_limits<int>::max();
1834
// Positions count VALUs backwards from MI, so a definition at or beyond
// ExecPos precedes the EXEC write in program order ("pre"), and a smaller
// position follows it ("post").
1835 for (auto Entry : State.DefPos) {
1836 int DefVALUs = Entry.second;
1837 if (DefVALUs != std::numeric_limits<int>::max()) {
1838 if (DefVALUs >= State.ExecPos)
1839 PreExecPos = std::min(PreExecPos, DefVALUs);
1840 else
1841 PostExecPos = std::min(PostExecPos, DefVALUs);
1842 }
1843 }
1844
1845 // Need a VALUs post exec change
1846 if (PostExecPos == std::numeric_limits<int>::max())
1847 return NoHazardFound;
1848
1849 // Too many VALUs in intv3?
1850 int Intv3VALUs = PostExecPos;
1851 if (Intv3VALUs > Intv3MaxVALUs)
1852 return HazardExpired;
1853
1854 // Too many VALUs in intv2?
1855 int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
1856 if (Intv2VALUs > Intv1plus2MaxVALUs)
1857 return HazardExpired;
1858
1859 // Need a VALUs pre exec change
1860 if (PreExecPos == std::numeric_limits<int>::max())
1861 return NoHazardFound;
1862
1863 // Too many VALUs in intv1?
1864 int Intv1VALUs = PreExecPos - State.ExecPos;
1865 if (Intv1VALUs > Intv1plus2MaxVALUs)
1866 return HazardExpired;
1867
1868 // Too many VALUs in intv1 + intv2
1869 if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
1870 return HazardExpired;
1871
1872 return HazardFound;
1873 };
// NOTE(review): listing line 1875 (presumably the condition under which the
// VALU counter is advanced) is absent from this copy.
1874 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1876 State.VALUs += 1;
1877 };
1878
1879 if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1880 std::next(MI->getReverseIterator())))
1881 return false;
1882
// NOTE(review): listing line 1885 (the immediate operand of the inserted
// S_WAITCNT_DEPCTR) is absent from this copy.
1883 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1884 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1886
1887 return true;
1888}
1889
// Inserts an S_WAITCNT_DEPCTR before a VALU \p MI when a TRANS instruction
// that writes one of MI's explicit VGPR sources is found within the window
// described in the body. Returns true if an instruction was inserted.
1890 bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
1891 if (!ST.hasVALUTransUseHazard())
1892 return false;
1893 assert(!ST.hasExtendedWaitCounts());
1894
1895 if (!SIInstrInfo::isVALU(*MI))
1896 return false;
1897
// Collect the VGPRs among MI's explicit uses.
1898 SmallSet<Register, 4> SrcVGPRs;
1899
1900 for (const MachineOperand &Use : MI->explicit_uses()) {
1901 if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1902 SrcVGPRs.insert(Use.getReg());
1903 }
1904
1905 // Look for the following pattern:
1906 // Va <- TRANS VALU
1907 // intv
1908 // MI Va (WaitState = 0)
1909 //
1910 // Where:
1911 // intv <= 5 VALUs / 1 TRANS
1912 //
1913 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1914
1915 const int IntvMaxVALUs = 5;
1916 const int IntvMaxTRANS = 1;
1917
// Search state: running counts of VALU and TRANS instructions passed so far,
// with hash/equality helpers over both fields.
1918 struct StateType {
1919 int VALUs = 0;
1920 int TRANS = 0;
1921
1922 static unsigned getHashValue(const StateType &State) {
1923 return hash_combine(State.VALUs, State.TRANS);
1924 }
1925 static bool isEqual(const StateType &LHS, const StateType &RHS) {
1926 return LHS.VALUs == RHS.VALUs && LHS.TRANS == RHS.TRANS;
1927 }
1928 };
1929
1930 StateType State;
1931
1932 // This overloads expiry testing with all the hazard detection
1933 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1934 // Too many VALU states have passed
1935 if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
1936 return HazardExpired;
1937
1938 // Instructions which cause va_vdst==0 expire hazard
// NOTE(review): listing lines 1939-1940 (the start of this condition) are
// absent from this copy.
1941 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1942 AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
1943 return HazardExpired;
1944
1945 // Track registers writes
1946 if (SIInstrInfo::isTRANS(I)) {
1947 for (Register Src : SrcVGPRs) {
1948 if (I.modifiesRegister(Src, &TRI)) {
1949 return HazardFound;
1950 }
1951 }
1952 }
1953
1954 return NoHazardFound;
1955 };
// NOTE(review): listing lines 1957 and 1959 (presumably the conditions under
// which each counter is advanced) are absent from this copy.
1956 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1958 State.VALUs += 1;
1960 State.TRANS += 1;
1961 };
1962
1963 if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1964 std::next(MI->getReverseIterator())))
1965 return false;
1966
1967 // Hazard is observed - insert a wait on va_dst counter to ensure hazard is
1968 // avoided.
// NOTE(review): listing line 1971 (the immediate operand of the inserted
// S_WAITCNT_DEPCTR) is absent from this copy.
1969 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1970 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1972
1973 return true;
1974}
1975
// Inserts a V_NOP_e32 before \p MI when the nearest preceding VALU is a TRANS
// instruction with a RAW or WAR register dependency on MI. Returns true if a
// nop was inserted.
// NOTE(review): listing line 1978 (the remainder of the early-exit condition)
// is absent from this copy.
1976 bool GCNHazardRecognizer::fixVALUTransCoexecutionHazards(MachineInstr *MI) {
1977 if (!ST.hasGFX1250Insts() || // Coexecution disabled.
1979 return false;
1980
1981 const SIInstrInfo *TII = ST.getInstrInfo();
1982 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1983
1984 auto IsTransHazardFn = [MI, TII, TRI](const MachineInstr &I) {
1985 if (!SIInstrInfo::isTRANS(I))
1986 return false;
1987
1988 // RAW: Trans(I) writes, VALU(MI) reads.
1989 Register TransDef = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
1990 for (const MachineOperand &ValuUse : MI->explicit_uses()) {
1991 if (ValuUse.isReg() && TRI->regsOverlap(TransDef, ValuUse.getReg()))
1992 return true;
1993 }
1994
// Without a register vdst on MI there is nothing to check for WAR.
1995 auto *ValuDst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst);
1996 if (!ValuDst || !ValuDst->isReg())
1997 return false;
1998
1999 // WAR: Trans(I) reads, VALU(MI) writes.
2000 Register ValuDef = ValuDst->getReg();
2001 for (const MachineOperand &TransUse : I.explicit_uses()) {
2002 if (TransUse.isReg() && TRI->regsOverlap(ValuDef, TransUse.getReg()))
2003 return true;
2004 }
2005
2006 return false;
2007 };
2008
// Any VALU ends the search, so only the closest preceding VALU can match.
2009 auto IsExpiredFn = [](const MachineInstr &I, int) {
2010 return SIInstrInfo::isVALU(I);
2011 };
2012
// HasVALU is the search's "no hazard found" sentinel value.
2013 const int HasVALU = std::numeric_limits<int>::max();
2014 if (::getWaitStatesSince(IsTransHazardFn, MI, IsExpiredFn) == HasVALU)
2015 return false;
2016
2017 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
2018 return true;
2019}
2020
// Inserts a V_NOP_e32 before \p MI when the nearest preceding VALU is an
// instruction matched by IsHazardFn whose vdst overlaps MI's src0 or src1
// (or, on GFX12+ for SWMMAC, MI's src2). Returns true if a nop was inserted.
2021 bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
// NOTE(review): listing line 2022 (the condition guarding this early return)
// is absent from this copy.
2023 return false;
2024
2025 const SIInstrInfo *TII = ST.getInstrInfo();
2026 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2027
// NOTE(review): listing line 2029 (the filter on I guarding the first
// `return false`) is absent from this copy.
2028 auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
2030 return false;
2031
2032 // Src0(matrix A) or Src1(matrix B) of the current wmma instruction overlaps
2033 // with the dest(matrix D) of the previous wmma.
2034 const Register CurSrc0Reg =
2035 TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
2036 const Register CurSrc1Reg =
2037 TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
2038
2039 const Register PrevDstReg =
2040 TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
2041
2042 if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
2043 TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
2044 return true;
2045 }
2046
2047 // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall)
2048 // but Index can't overlap with PrevDstReg.
2049 if (AMDGPU::isGFX12Plus(ST)) {
2050 if (SIInstrInfo::isSWMMAC(*MI)) {
2051 const Register CurIndex =
2052 TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
2053 if (TRI->regsOverlap(PrevDstReg, CurIndex))
2054 return true;
2055 }
2056 return false;
2057 }
2058
2059 return false;
2060 };
2061
// Any VALU ends the search, so only the closest preceding VALU can match.
2062 auto IsExpiredFn = [](const MachineInstr &I, int) {
2063 return SIInstrInfo::isVALU(I);
2064 };
2065
2066 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
2067 std::numeric_limits<int>::max())
2068 return false;
2069
2070 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
2071
2072 return true;
2073}
2074
2079
2080// Classify XDL WMMA instructions into co-execution hazard categories
2081// (Refer to SPG 4.6.12.1), mainly based on instruction latency.
2082//
2083// Category 0: WMMA with Latency 8
2084// WMMA_*F16, WMMA_*BF16
2085// WMMA_*FP8FP8
2086// WMMA_*FP8BF8
2087// WMMA_*BF8FP8
2088// WMMA_*BF8BF8
2089// WMMA_*F8F6F4 if SRCA & SRCB != F8
2090//
2091// Category 1: WMMA Latency 16
2092// WMMA_IU8
2093// WMMA_*F8F6F4 if SRCA OR SRCB == F8
2094//
2095// Category 2: SWMMAC with Latency 8
2096// SWMMAC_*F16, SWMMAC_*BF16,
2097// SWMMAC_*FP8FP8
2098// SWMMAC_*BF8FP8
2099// SWMMAC_*FP8BF8
2100// SWMMAC_*BF8BF8
2101//
2102// Category 3: SWMMAC with Latency 16
2103// SWMMAC_IU8
//
// Returns the category index (0-3). The index is derived from two inputs only:
// the latency reported by SchedModel.computeInstrLatency for MI (8 or 16) and
// whether MI is an SWMMAC. MI must satisfy TII->isXDLWMMA (asserted).
// NOTE(review): listing line 2105 (the function name and the first part of the
// parameter list) is absent from this copy of the source; restore it from
// upstream before building.
2104static unsigned
2106 const TargetSchedModel &SchedModel) {
2107 assert(TII->isXDLWMMA(MI) && "must be xdl wmma");
2108 bool IsSWMMAC = SIInstrInfo::isSWMMAC(MI);
2109 unsigned Category = 0;
2110
2111 unsigned Latency = SchedModel.computeInstrLatency(&MI);
2112 switch (Latency) {
2113 case 8:
2114 Category = IsSWMMAC ? 2 : 0;
2115 break;
2116 case 16:
2117 Category = IsSWMMAC ? 3 : 1;
2118 break;
2119 default:
// Any latency other than 8 or 16 is treated as a programming error.
2120 llvm_unreachable("unexpected xdl wmma latency");
2121 } // end switch.
2122
2123 return Category;
2124}
2125
2126int GCNHazardRecognizer::checkWMMACoexecutionHazards(MachineInstr *MI) const {
2127 if (!ST.hasGFX1250Insts())
2128 return 0;
2129
2130 const SIInstrInfo *TII = ST.getInstrInfo();
2131 if (!TII->isXDLWMMA(*MI) && !isCoexecutableVALUInst(*MI))
2132 return 0;
2133
2134 // WaitStates here is the number of V_NOPs or unrelated VALU instructions must
2135 // be in between the first WMMA and the second instruction to cover the hazard
2136 // (WMMAWaitStates if the second is also a WMMA, VALUWaitStates if the second
2137 // is a VALU). Refer to SPG 4.6.12.1. "Requirements for WMMA data hazards" for
2138 // numbers, which depends on the category of the first WMMA.
2139 const int WMMAWaitStates[] = {5, 9, 3, 5};
2140 const int VALUWaitStates[] = {4, 8, 2, 4};
2141 unsigned Category = 0;
2142
2143 auto IsWMMAHazardFn = [MI, TII, &Category, this](const MachineInstr &I) {
2144 if (!TII->isXDLWMMA(I))
2145 return false;
2146
2147 Category = getWMMAHazardInstInCategory(I, TII, TSchedModel);
2148 return hasWMMAToWMMARegOverlap(I, *MI);
2149 };
2150
2151 auto IsVALUHazardFn = [MI, TII, &Category, this](const MachineInstr &I) {
2152 if (!TII->isXDLWMMA(I))
2153 return false;
2154
2155 Category = getWMMAHazardInstInCategory(I, TII, TSchedModel);
2156 return hasWMMAToVALURegOverlap(I, *MI);
2157 };
2158
2159 auto GetWaitStatesFn = [](const MachineInstr &I) {
2160 return SIInstrInfo::isVALU(I) ? 1 : 0;
2161 };
2162
2163 int WaitStatesNeeded = -1;
2164 int ExistingVALUs = 0; // Existing number of VALU ops in between.
2165
2166 // getWaitStatesSince checks for a hazard between instruction 'I' and 'MI':
2167 // - If a hazard exists: returns the number of VALUs in between and sets
2168 // 'Category' via IsWMMAHazardFn/IsVALUHazardFn for instruction 'I'.
2169 // - If no hazard exists: returns INT_MAX, making WaitStatesNeeded negative,
2170 // so no V_NOP insertion is needed.
2171 if (TII->isXDLWMMA(*MI)) {
2172 const int WMMAWaitsLimit = 9; // Maximum of WMMAWaitStates
2173 ExistingVALUs =
2174 getWaitStatesSince(IsWMMAHazardFn, WMMAWaitsLimit, GetWaitStatesFn);
2175 WaitStatesNeeded = WMMAWaitStates[Category] - ExistingVALUs;
2176 } else { // Must be a co-executable VALU.
2177 const int VALUWaitsLimit = 8; // Maximum of VALUWaitStates
2178 ExistingVALUs =
2179 getWaitStatesSince(IsVALUHazardFn, VALUWaitsLimit, GetWaitStatesFn);
2180 WaitStatesNeeded = VALUWaitStates[Category] - ExistingVALUs;
2181 }
2182
2183 return WaitStatesNeeded;
2184}
2185
// Returns true when the destination (vdst) of WMMA overlaps a register that MI
// reads as src0 or src1, or as the src2 operand handled in the trailing block
// (named Idx1 here).
// NOTE(review): listing line 2196 (the statement opening the block that checks
// Idx1) is absent from this copy of the source; restore it from upstream
// before building.
2186bool GCNHazardRecognizer::hasWMMAToWMMARegOverlap(
2187 const MachineInstr &WMMA, const MachineInstr &MI) const {
2188 Register D0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::vdst)->getReg();
2189 Register A1 = TII.getNamedOperand(MI, AMDGPU::OpName::src0)->getReg();
2190 Register B1 = TII.getNamedOperand(MI, AMDGPU::OpName::src1)->getReg();
2191
2192 // WMMA0 writes (D0), WMMA1 reads (A1/B1/Idx1).
2193 if (TRI.regsOverlap(D0, A1) || TRI.regsOverlap(D0, B1))
2194 return true;
2195
2197 Register Idx1 = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
2198 if (TRI.regsOverlap(D0, Idx1))
2199 return true;
2200 }
2201 return false;
2202}
2203
2204bool GCNHazardRecognizer::hasWMMAToVALURegOverlap(
2205 const MachineInstr &WMMA, const MachineInstr &MI) const {
2206 // WMMA writes, VALU reads.
2207 Register D0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::vdst)->getReg();
2208 for (const MachineOperand &ValuUse : MI.explicit_uses()) {
2209 if (ValuUse.isReg() && TRI.regsOverlap(D0, ValuUse.getReg()))
2210 return true;
2211 }
2212
2213 // WMMA reads or writes, VALU writes.
2214 Register A0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::src0)->getReg();
2215 Register B0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::src1)->getReg();
2216 SmallVector<Register, 4> WMMARegs({D0, A0, B0});
2217
2218 if (SIInstrInfo::isSWMMAC(WMMA)) {
2219 Register Idx0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::src2)->getReg();
2220 WMMARegs.push_back(Idx0);
2221 }
2222
2223 for (const MachineOperand &ValuDef : MI.defs()) {
2224 Register VDstReg = ValuDef.getReg();
2225 for (Register WMMAReg : WMMARegs) {
2226 if (TRI.regsOverlap(VDstReg, WMMAReg))
2227 return true;
2228 }
2229 }
2230 return false;
2231}
2232
// Returns true when I (which must be an XDL WMMA to qualify) has a register
// overlap with MI, using the WMMA-to-WMMA check when MI is itself an XDL WMMA
// and the WMMA-to-VALU check on the second dispatch path.
// NOTE(review): listing line 2243 (the condition guarding the
// hasWMMAToVALURegOverlap call) is absent from this copy of the source;
// restore it from upstream before building.
2233bool GCNHazardRecognizer::isCoexecutionHazardFor(const MachineInstr &I,
2234 const MachineInstr &MI) const {
2235 // I is the potential WMMA hazard source, MI is the instruction being checked
2236 // for hazard.
2237 if (!TII.isXDLWMMA(I))
2238 return false;
2239
2240 // Dispatch based on MI type
2241 if (TII.isXDLWMMA(MI))
2242 return hasWMMAToWMMARegOverlap(I, MI);
2244 return hasWMMAToVALURegOverlap(I, MI);
2245
2246 return false;
2247}
2248
2249bool GCNHazardRecognizer::hasWMMAHazardInLoop(MachineLoop *L, MachineInstr *MI,
2250 bool IncludeSubloops) {
2251 // Scan loop for any WMMA that hazards MI.
2252 // TODO: Avoid full loop scan when WMMA is beyond VALU distance.
2253 for (MachineBasicBlock *MBB : L->getBlocks()) {
2254 if (!IncludeSubloops && MLI->getLoopFor(MBB) != L)
2255 continue;
2256 for (MachineInstr &I : *MBB) {
2257 if (&I == MI)
2258 continue;
2259 if (isCoexecutionHazardFor(I, *MI))
2260 return true;
2261 }
2262 }
2263 return false;
2264}
2265
2266bool GCNHazardRecognizer::tryHoistWMMAVnopsFromLoop(MachineInstr *MI,
2267 int WaitStatesNeeded) {
2268 if (!MLI)
2269 return false;
2270
2271 MachineLoop *L = MLI->getLoopFor(MI->getParent());
2272 if (!L) {
2273 ++NumWMMAHoistingBailed;
2274 return false;
2275 }
2276
2277 // If innermost loop has WMMA hazard, we can't hoist at all
2278 if (hasWMMAHazardInLoop(L, MI)) {
2279 ++NumWMMAHoistingBailed;
2280 return false;
2281 }
2282
2283 // Find outermost loop with no internal hazard
2284 MachineLoop *TargetLoop = L;
2285 while (MachineLoop *Parent = TargetLoop->getParentLoop()) {
2286 if (hasWMMAHazardInLoop(Parent, MI, false))
2287 break; // Parent has hazard in its own blocks, stop here
2288 TargetLoop = Parent; // Safe to hoist further out
2289 }
2290
2291 // Need valid preheader to insert V_NOPs
2292 MachineBasicBlock *Preheader = TargetLoop->getLoopPreheader();
2293 if (!Preheader) {
2294 ++NumWMMAHoistingBailed;
2295 return false;
2296 }
2297
2298 LLVM_DEBUG(dbgs() << "WMMA V_NOP Hoisting: Moving " << WaitStatesNeeded
2299 << " V_NOPs from loop to " << printMBBReference(*Preheader)
2300 << "\n");
2301
2302 emitVNops(*Preheader, Preheader->getFirstTerminator(), WaitStatesNeeded,
2303 /*IsHoisting=*/true);
2304 NumWMMANopsHoisted += WaitStatesNeeded;
2305 return true;
2306}
2307
2308bool GCNHazardRecognizer::fixWMMACoexecutionHazards(MachineInstr *MI) {
2309 int WaitStatesNeeded = checkWMMACoexecutionHazards(MI);
2310 if (WaitStatesNeeded <= 0)
2311 return false;
2312
2313 if (EnableWMMAVnopHoisting && tryHoistWMMAVnopsFromLoop(MI, WaitStatesNeeded))
2314 return true;
2315
2316 emitVNops(*MI->getParent(), MI->getIterator(), WaitStatesNeeded);
2317 return true;
2318}
2319
// Works around the 64-bit shift bug reported by ST.hasShift64HighRegBug().
// The workaround applies to V_LSHLREV_B64_e64 / V_LSHRREV_B64_e64 /
// V_ASHRREV_I64_e64 whose shift amount (src0) is a VGPR with index % 8 == 7,
// provided that register is VGPR255 or the following physical register is
// unused. Two rewrites are used:
//  - Src1 is not the destination register: the amount is copied into the low
//    half of the destination with V_MOV_B32 and read from there.
//  - otherwise: the amount (and, if MI also modifies it, the 64-bit pair) is
//    swapped into registers MI neither reads nor modifies using V_SWAP_B32
//    before MI, and swapped back after MI.
// Returns true when MI was rewritten.
2320bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
2321 if (!ST.hasShift64HighRegBug())
2322 return false;
2323 assert(!ST.hasExtendedWaitCounts());
2324
2325 switch (MI->getOpcode()) {
2326 default:
2327 return false;
2328 case AMDGPU::V_LSHLREV_B64_e64:
2329 case AMDGPU::V_LSHRREV_B64_e64:
2330 case AMDGPU::V_ASHRREV_I64_e64:
2331 break;
2332 }
2333
2334 MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);
2335 if (!Amt->isReg())
2336 return false;
2337
2338 Register AmtReg = Amt->getReg();
2339 const MachineRegisterInfo &MRI = MF.getRegInfo();
2340 // Check if this is a last VGPR in the allocation block.
2341 if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
2342 return false;
2343
// Nothing to do when the register following AmtReg is in use.
2344 if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
2345 return false;
2346
2347 assert(ST.needsAlignedVGPRs());
// The AmtReg +/- 1 arithmetic in this function relies on consecutive VGPR
// enumerators.
2348 static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
2349
2350 const DebugLoc &DL = MI->getDebugLoc();
2351 MachineBasicBlock *MBB = MI->getParent();
2352 MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
2353
2354 // In:
2355 //
2356 // Dst = shiftrev64 Amt, Src1
2357 //
2358 // if Dst!=Src1 then avoid the bug with:
2359 //
2360 // Dst.sub0 = Amt
2361 // Dst = shift64 Dst.sub0, Src1
2362
2363 Register DstReg = MI->getOperand(0).getReg();
2364 if (!Src1->isReg() || Src1->getReg() != DstReg) {
2365 Register DstLo = TRI.getSubReg(DstReg, AMDGPU::sub0);
// The inserted V_MOV is itself run through the hazard recognizer.
2366 runOnInstruction(
2367 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), DstLo).add(*Amt));
2368 Amt->setReg(DstLo);
2369 Amt->setIsKill(true);
2370 return true;
2371 }
2372
// Overlapped: MI also modifies AmtReg, so a whole aligned 64-bit pair is
// searched for instead of a single 32-bit register.
2373 bool Overlapped = MI->modifiesRegister(AmtReg, &TRI);
2374 Register NewReg;
// Pick the first register of the class that MI neither modifies nor reads.
// NOTE(review): NewReg stays default-constructed if no register qualifies;
// confirm that this cannot happen.
2375 for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
2376 : AMDGPU::VGPR_32RegClass) {
2377 if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
2378 NewReg = Reg;
2379 break;
2380 }
2381 }
2382
2383 Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
2384 : NewReg;
2385 Register NewAmtLo;
2386
2387 if (Overlapped)
2388 NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
2389
2390 // Insert a full wait count because found register might be pending a wait.
2391 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
2392 .addImm(0);
2393
2394 // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them.
2395 if (Overlapped)
2396 runOnInstruction(
2397 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
2398 .addDef(AmtReg - 1)
2399 .addReg(AmtReg - 1, RegState::Undef)
2400 .addReg(NewAmtLo, RegState::Undef));
2401 runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
2402 .addDef(AmtReg)
2403 .addReg(AmtReg, RegState::Undef)
2404 .addReg(NewAmt, RegState::Undef));
2405
2406 // Instructions emitted after the current instruction will be processed by the
2407 // parent loop of the hazard recognizer in a natural way.
2408 BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
2409 AmtReg)
2410 .addDef(NewAmt)
2411 .addReg(NewAmt)
2412 .addReg(AmtReg);
2413 if (Overlapped)
2414 BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
2415 AmtReg - 1)
2416 .addDef(NewAmtLo)
2417 .addReg(NewAmtLo)
2418 .addReg(AmtReg - 1);
2419
2420 // Re-running hazard recognizer on the modified instruction is not necessary,
2421 // inserted V_SWAP_B32 has already both read and write new registers so
2422 // hazards related to these register has already been handled.
2423 Amt->setReg(NewAmt);
2424 Amt->setIsKill(false);
2425 // We do not update liveness, so verifier may see it as undef.
2426 Amt->setIsUndef();
// In the overlapped case the destination and Src1 move to the new pair too.
2427 if (Overlapped) {
2428 MI->getOperand(0).setReg(NewReg);
2429 Src1->setReg(NewReg);
2430 Src1->setIsKill(false);
2431 Src1->setIsUndef();
2432 }
2433
2434 return true;
2435}
2436
// Returns the number of wait states still needed in front of MI for the
// NSA-to-VMEM hazard, or 0 when the subtarget does not have the bug or MI does
// not qualify. The hazard source is a MIMG instruction using the GFX10 NSA
// encoding whose size is at least 16 bytes; it is only searched for within
// one wait state.
// NOTE(review): listing line 2443 (the condition guarding the second
// "return 0") is absent from this copy of the source; restore it from
// upstream before building.
2437int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) const {
2438 int NSAtoVMEMWaitStates = 1;
2439
2440 if (!ST.hasNSAtoVMEMBug())
2441 return 0;
2442
2444 return 0;
2445
2446 const SIInstrInfo *TII = ST.getInstrInfo();
2447 const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
// MI qualifies only if it has an offset operand with bit 1 or bit 2 set.
2448 if (!Offset || (Offset->getImm() & 6) == 0)
2449 return 0;
2450
2451 auto IsHazardFn = [TII](const MachineInstr &I) {
2452 if (!SIInstrInfo::isMIMG(I))
2453 return false;
2454 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
2455 return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
2456 TII->getInstSizeInBytes(I) >= 16;
2457 };
2458
2459 return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
2460}
2461
2462int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(
2463 MachineInstr *MI) const {
2464 int FPAtomicToDenormModeWaitStates = 3;
2465
2466 if (!ST.hasFPAtomicToDenormModeHazard())
2467 return 0;
2468 assert(!ST.hasExtendedWaitCounts());
2469
2470 if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
2471 return 0;
2472
2473 auto IsHazardFn = [](const MachineInstr &I) {
2474 if (!SIInstrInfo::isVMEM(I))
2475 return false;
2476 return SIInstrInfo::isFPAtomic(I);
2477 };
2478
2479 auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
2480 if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
2481 return true;
2482
2483 return SIInstrInfo::isWaitcnt(MI.getOpcode());
2484 };
2485
2486 return FPAtomicToDenormModeWaitStates -
2487 ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
2488}
2489
// Dispatches to the GFX90A or the GFX908 MAI hazard check, selected by
// ST.hasGFX90AInsts(), and returns that check's wait-state count.
// NOTE(review): listing line 2491 is absent from this copy of the source;
// restore it from upstream before building.
2490int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) const {
2492
2493 return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
2494}
2495
2496int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) const {
2497 // Early exit if no padding is requested.
2498 if (MFMAPaddingRatio == 0)
2499 return 0;
2500
2501 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2502 if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2)
2503 return 0;
2504
2505 int NeighborMFMALatency = 0;
2506 auto IsNeighboringMFMA = [&NeighborMFMALatency,
2507 this](const MachineInstr &MI) {
2508 if (!SIInstrInfo::isMFMA(MI))
2509 return false;
2510
2511 NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
2512 return true;
2513 };
2514
2515 const int MaxMFMAPipelineWaitStates = 16;
2516 int WaitStatesSinceNeighborMFMA =
2517 getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);
2518
2519 int NeighborMFMAPaddingNeeded =
2520 (NeighborMFMALatency * MFMAPaddingRatio / 100) -
2521 WaitStatesSinceNeighborMFMA;
2522
2523 return std::max(0, NeighborMFMAPaddingNeeded);
2524}
2525
// Returns the number of wait states needed in front of MI for the MAI hazards
// handled on the non-GFX90A path. Four groups of checks are combined with max:
//  1. (MI is not v_accvgpr_read) a VALU write of EXEC, or of a VGPR MI reads.
//  2. For every AGPR operand considered: an earlier MFMA whose destination
//     overlaps it, and an earlier v_accvgpr_write that overlaps it.
//  3. (MI is v_accvgpr_write) an earlier MFMA whose src2 overlaps MI's
//     destination.
//  4. Optional MFMA padding from checkMFMAPadding().
2526int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) const {
2527 int WaitStatesNeeded = 0;
2528 unsigned Opc = MI->getOpcode();
2529
// Inline asm is treated like a VALU here.
2530 auto IsVALUFn = [](const MachineInstr &MI) {
2531 return SIInstrInfo::isVALU(MI) || MI.isInlineAsm();
2532 };
2533
2534 if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
2535 const int LegacyVALUWritesVGPRWaitStates = 2;
2536 const int VALUWritesExecWaitStates = 4;
2537 const int MaxWaitStates = 4;
2538
2539 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2540 getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
2541 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2542
// The per-operand VGPR check cannot raise the result beyond the EXEC maximum,
// so it is skipped once that maximum is reached.
2543 if (WaitStatesNeeded < MaxWaitStates) {
2544 for (const MachineOperand &Use : MI->explicit_uses()) {
2545 const int MaxWaitStates = 2;
2546
2547 if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
2548 continue;
2549
2550 int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
2551 getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
2552 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2553
2554 if (WaitStatesNeeded == MaxWaitStates)
2555 break;
2556 }
2557 }
2558 }
2559
2560 for (const MachineOperand &Op : MI->explicit_operands()) {
2561 if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
2562 continue;
2563
// Definitions are only checked when MI is a v_accvgpr_write.
2564 if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2565 continue;
2566
2567 const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
2568 const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
2569 const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
2570 const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
2571 const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
2572 const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
2573 const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
2574 const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
2575 const int MaxWaitStates = 18;
2576 Register Reg = Op.getReg();
2577 unsigned HazardDefLatency = 0;
2578
// Matches an MFMA whose destination overlaps Reg without being exactly Reg,
// and records the largest latency among the MFMAs that reach the latency
// update (including ones that turn out not to overlap).
2579 auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
2580 this](const MachineInstr &MI) {
2581 if (!SIInstrInfo::isMFMA(MI))
2582 return false;
2583 Register DstReg = MI.getOperand(0).getReg();
2584 if (DstReg == Reg)
2585 return false;
2586 HazardDefLatency =
2587 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2588 return TRI.regsOverlap(DstReg, Reg);
2589 };
2590
2591 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
2592 MaxWaitStates);
// The required distance depends on how MI uses the operand (src2, or an
// accvgpr read/write) and on the recorded MFMA latency (2, 8, otherwise the
// 32x32 value).
2593 int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
2594 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2595 int OpNo = Op.getOperandNo();
2596 if (OpNo == SrcCIdx) {
2597 NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
2598 } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
2599 switch (HazardDefLatency) {
2600 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
2601 break;
2602 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
2603 break;
2604 case 16: [[fallthrough]];
2605 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
2606 break;
2607 }
2608 } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2609 switch (HazardDefLatency) {
2610 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
2611 break;
2612 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
2613 break;
2614 case 16: [[fallthrough]];
2615 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
2616 break;
2617 }
2618 }
2619
2620 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2621 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2622
2623 if (WaitStatesNeeded == MaxWaitStates)
2624 return WaitStatesNeeded; // Early exit.
2625
// Second check for the same operand: an earlier v_accvgpr_write that overlaps
// Reg.
2626 auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
2627 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2628 return false;
2629 Register DstReg = MI.getOperand(0).getReg();
2630 return TRI.regsOverlap(Reg, DstReg);
2631 };
2632
2633 const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
2634 const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
2635 const int AccVGPRWriteAccVgprReadWaitStates = 3;
2636 NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
2637 if (OpNo == SrcCIdx)
2638 NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
2639 else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
2640 NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
2641
2642 WaitStatesNeededForUse = NeedWaitStates -
2643 getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
2644 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2645
2646 if (WaitStatesNeeded == MaxWaitStates)
2647 return WaitStatesNeeded; // Early exit.
2648 }
2649
// WAR check: MI (v_accvgpr_write) overwrites a register that an earlier MFMA
// reads as src2.
2650 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2651 const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
2652 const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
2653 const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
2654 const int MaxWaitStates = 13;
2655 Register DstReg = MI->getOperand(0).getReg();
2656 unsigned HazardDefLatency = 0;
2657
2658 auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
2659 this](const MachineInstr &MI) {
2660 if (!SIInstrInfo::isMFMA(MI))
2661 return false;
2662 Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
2663 HazardDefLatency =
2664 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2665 return TRI.regsOverlap(Reg, DstReg);
2666 };
2667
2668 int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
2669 int NeedWaitStates;
2670 switch (HazardDefLatency) {
2671 case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
2672 break;
2673 case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
2674 break;
2675 case 16: [[fallthrough]];
2676 default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
2677 break;
2678 }
2679
2680 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
2681 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2682 }
2683
2684 // Pad neighboring MFMA with noops for better inter-wave performance.
2685 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2686
2687 return WaitStatesNeeded;
2688}
2689
// Wait-state helper: returns NumPasses + 1, plus one more when IsGFX950 is
// set, matching the table below.
// NOTE(review): listing line 2691 (the function name and first parameter) is
// absent from this copy of the source; restore it from upstream before
// building.
2690static int
2692 bool IsGFX950) {
2693 // xdl def cycles | gfx940 | gfx950
2694 // 2 pass | 3 4
2695 // 4 pass | 5 6
2696 // 8 pass | 9 10
2697 // 16 pass | 17 18
2698 return NumPasses + 1 + IsGFX950;
2699}
2700
// Wait-state helper: returns NumPasses + 1, plus one more when IsGFX950 is
// set and NumPasses is not 2, matching the table below.
// NOTE(review): listing line 2702 (the function name and first parameter) is
// absent from this copy of the source; restore it from upstream before
// building.
2701static int
2703 bool IsGFX950) {
2704 // xdl def cycles | gfx940 | gfx950
2705 // 2 pass | 3 3
2706 // 4 pass | 5 6
2707 // 8 pass | 9 10
2708 // 16 pass | 17 18
2709 return NumPasses + 1 + (NumPasses != 2 && IsGFX950);
2710}
2711
// Wait-state helper: the required wait states equal NumPasses.
// NOTE(review): listing line 2713 (the function name and parameter list) is
// absent from this copy of the source; restore it from upstream before
// building.
2712static int
2714 // 2 pass -> 2
2715 // 4 pass -> 4
2716 // 8 pass -> 8
2717 // 16 pass -> 16
2718 return NumPasses;
2719}
2720
// Wait-state helper: the required wait states are NumPasses + 2.
// NOTE(review): listing line 2722 (the function name and parameter list) is
// absent from this copy of the source; restore it from upstream before
// building.
2721static int
2723 // 2 pass -> 4
2724 // 4 pass -> 6
2725 // 8 pass -> 10
2726 // 16 pass -> 18
2727 return NumPasses + 2;
2728}
2729
2731 bool IsGFX950) {
// Wait-state helper: returns NumPasses + 3, plus one more when IsGFX950 is
// set and NumPasses is not 2, matching the table below.
// NOTE(review): listing line 2730 (return type, function name and first
// parameter) is absent from this copy of the source; restore it from upstream
// before building.
2732 // xdl def cycles | gfx942 | gfx950
2733 // 2 pass | 5 5
2734 // 4 pass | 7 8
2735 // 8 pass | 11 12
2736 // 16 pass | 19 20
2737 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
2738}
2739
// Returns the number of wait states needed in front of the MFMA MI on the
// GFX90A+ path (0 when MI is not an MFMA). It combines, with max:
//  - a VALU write of EXEC (4 wait states),
//  - for each register use: a legacy non-dot VALU write of that register, and
//    an earlier MFMA (MI1) whose destination overlaps it, where the required
//    distance depends on whether the use is src2, on the opcodes of MI and
//    MI1, and on the subtarget generation,
//  - optional MFMA padding from checkMFMAPadding().
// NOTE(review): several listing lines are absent from this copy of the source
// (2745, 2749-2750, 2853, 2855, 2857, 2865, 2871, 2877, 2907, 2909): the
// bodies of the two VALU predicates and parts of the wait-state selection
// expressions. Restore them from upstream before building.
2740int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) const {
2741 int WaitStatesNeeded = 0;
2742 unsigned Opc = MI->getOpcode();
2743
2744 auto IsLegacyVALUFn = [](const MachineInstr &MI) {
2746 };
2747
2748 auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
2751 };
2752
2753 if (!SIInstrInfo::isMFMA(*MI))
2754 return WaitStatesNeeded;
2755
2756 const int VALUWritesExecWaitStates = 4;
2757 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2758 getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
2759 VALUWritesExecWaitStates);
2760 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2761
2762 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2763
2764 // Loop for both DGEMM and S/HGEMM 2nd instruction.
2765 for (const MachineOperand &Use : MI->explicit_uses()) {
2766 const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
2767 const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
2768 const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
2769 const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
2770 const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
2771 const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
2772 const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
2773 const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
2774 const int GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 17;
2775 const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
2776 const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
2777 const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
2778 const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
2779 const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
2780 const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
2781 const int GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 19;
2782 const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
2783 const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
2784 const int MaxWaitStates = 19;
2785
2786 if (!Use.isReg())
2787 continue;
2788 Register Reg = Use.getReg();
// FullReg and MI1 are outputs of IsOverlappedMFMAFn; they are only read after
// the search below reported a match.
2789 bool FullReg;
2790 const MachineInstr *MI1;
2791
2792 auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
2793 this](const MachineInstr &MI) {
2794 if (!SIInstrInfo::isMFMA(MI))
2795 return false;
2796 Register DstReg = MI.getOperand(0).getReg();
2797 FullReg = (DstReg == Reg);
2798 MI1 = &MI;
2799 return TRI.regsOverlap(DstReg, Reg);
2800 };
2801
2802 WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
2803 getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
2804 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2805
2806 int NumWaitStates =
2807 getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
// INT_MAX means no overlapping MFMA definition was found for this use.
2808 if (NumWaitStates == std::numeric_limits<int>::max())
2809 continue;
2810
2811 int OpNo = Use.getOperandNo();
2812 unsigned Opc1 = MI1->getOpcode();
2813 int NeedWaitStates = 0;
// Use is src2 (SrcC) of MI.
2814 if (OpNo == SrcCIdx) {
2815 if (!SIInstrInfo::isDGEMM(Opc) &&
2816 (!ST.hasGFX940Insts() && SIInstrInfo::isDGEMM(Opc1))) {
2817 NeedWaitStates = 0;
2818 } else if (FullReg) {
2819 if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2820 Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
2821 (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2822 Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
2823 NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
2824 else if (ST.hasGFX940Insts() &&
2825 TSchedModel.computeInstrLatency(MI1) == 2)
2826 NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
2827 } else {
2828 switch (Opc1) {
2829 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2830 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2831 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2832 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2833 if (!TII.isXDL(*MI))
2834 NeedWaitStates =
2835 ST.hasGFX950Insts()
2836 ? GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates
2837 : DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
2838 break;
2839 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2840 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2841 if (!TII.isXDL(*MI))
2842 NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
2843 break;
2844 default:
2845 int NumPasses = TSchedModel.computeInstrLatency(MI1);
2846 if (ST.hasGFX940Insts()) {
2847 if (TII.isXDL(*MI) && !TII.isXDL(*MI1))
2848 break;
2849
2850 NeedWaitStates =
2851 TII.isXDL(*MI1)
2852 ? (TII.isXDL(*MI)
2854 NumPasses, ST.hasGFX950Insts())
2856 NumPasses, ST.hasGFX950Insts()))
2858 NumPasses);
2859 break;
2860 }
2861
2862 switch (NumPasses) {
2863 case 2:
2864 NeedWaitStates =
2866 ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
2867 : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
2868 break;
2869 case 8:
2870 NeedWaitStates =
2872 ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
2873 : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
2874 break;
2875 case 16:
2876 NeedWaitStates =
2878 ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
2879 : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
2880 break;
2881 default:
2882 llvm_unreachable("unexpected number of passes");
2883 }
2884 }
2885 }
// Use is any operand other than src2.
2886 } else {
2887 switch (Opc1) {
2888 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2889 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2890 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2891 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2892 NeedWaitStates =
2893 ST.hasGFX950Insts()
2894 ? GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates
2895 : DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
2896 break;
2897 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2898 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2899 NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
2900 break;
2901 default:
2902 int NumPasses = TSchedModel.computeInstrLatency(MI1);
2903
2904 if (ST.hasGFX940Insts()) {
2905 NeedWaitStates =
2906 TII.isXDL(*MI1)
2908 NumPasses, ST.hasGFX950Insts())
2910 NumPasses);
2911 break;
2912 }
2913
2914 switch (NumPasses) {
2915 case 2:
2916 NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
2917 break;
2918 case 4:
2919 llvm_unreachable("unexpected number of passes for mfma");
2920 case 8:
2921 NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
2922 break;
2923 case 16:
2924 default:
2925 NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
2926 }
2927 }
2928 }
// Nothing to gain if the result already covers this requirement.
2929 if (WaitStatesNeeded >= NeedWaitStates)
2930 continue;
2931
2932 WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
2933 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2934
2935 if (WaitStatesNeeded == MaxWaitStates)
2936 break;
2937 }
2938
2939 // Pad neighboring MFMA with noops for better inter-wave performance.
2940 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2941
2942 return WaitStatesNeeded;
2943}
2944
// Returns the number of wait states needed in front of MI for VGPR operands
// that were recently written around v_accvgpr_read / v_accvgpr_write. Returns
// 0 on subtargets without MAI instructions or with GFX90A instructions. For
// each VGPR use two cases are checked:
//  - the VGPR was defined by a v_accvgpr_read (2 wait states);
//  - an earlier v_accvgpr_read/write is found and Reg has a definition
//    accepted by the nested IsVALUFn within 2 wait states (1 wait state).
// NOTE(review): listing line 2978 (the body of the nested IsVALUFn lambda) is
// absent from this copy of the source; restore it from upstream before
// building.
2945int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) const {
2946 // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards()
2947 if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
2948 return 0;
2949
2950 int WaitStatesNeeded = 0;
2951
2952 auto IsAccVgprReadFn = [](const MachineInstr &MI) {
2953 return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
2954 };
2955
2956 for (const MachineOperand &Op : MI->explicit_uses()) {
2957 if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
2958 continue;
2959
2960 Register Reg = Op.getReg();
2961
2962 const int AccVgprReadLdStWaitStates = 2;
2963 const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
2964 const int MaxWaitStates = 2;
2965
2966 int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
2967 getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
2968 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2969
2970 if (WaitStatesNeeded == MaxWaitStates)
2971 return WaitStatesNeeded; // Early exit.
2972
// Matches a v_accvgpr_read/write only when the nested search for a definition
// of Reg succeeds (i.e. does not return INT_MAX).
2973 auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
2974 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
2975 MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2976 return false;
2977 auto IsVALUFn = [](const MachineInstr &MI) {
2979 };
2980 return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
2981 std::numeric_limits<int>::max();
2982 };
2983
2984 WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
2985 getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
2986 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2987 }
2988
2989 return WaitStatesNeeded;
2990}
2991
2992int GCNHazardRecognizer::checkPermlaneHazards(MachineInstr *MI) const {
2993 assert(!ST.hasVcmpxPermlaneHazard() &&
2994 "this is a different vcmpx+permlane hazard");
2995 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2996 const SIInstrInfo *TII = ST.getInstrInfo();
2997
2998 auto IsVCmpXWritesExecFn = [TII, TRI](const MachineInstr &MI) {
2999 return isVCmpXWritesExec(*TII, *TRI, MI);
3000 };
3001
3002 auto IsVALUFn = [](const MachineInstr &MI) {
3003 return SIInstrInfo::isVALU(MI);
3004 };
3005
3006 const int VCmpXWritesExecWaitStates = 4;
3007 const int VALUWritesVDstWaitStates = 2;
3008 int WaitStatesNeeded = 0;
3009
3010 for (const MachineOperand &Op : MI->explicit_uses()) {
3011 if (!Op.isReg() || !TRI->isVGPR(MF.getRegInfo(), Op.getReg()))
3012 continue;
3013 Register Reg = Op.getReg();
3014
3015 int WaitStatesSinceDef =
3016 VALUWritesVDstWaitStates -
3017 getWaitStatesSinceDef(Reg, IsVALUFn,
3018 /*MaxWaitStates=*/VALUWritesVDstWaitStates);
3019 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesSinceDef);
3020 if (WaitStatesNeeded >= VALUWritesVDstWaitStates)
3021 break;
3022 }
3023
3024 int VCmpXHazardWaits =
3025 VCmpXWritesExecWaitStates -
3026 getWaitStatesSince(IsVCmpXWritesExecFn, VCmpXWritesExecWaitStates);
3027
3028 WaitStatesNeeded = std::max(WaitStatesNeeded, VCmpXHazardWaits);
3029 return WaitStatesNeeded;
3030}
3031
// Wait states required after an N-pass SMFMA before the overlapping register
// may be written again: always NumPasses + 2.
// The signature line was missing from this listing; it is restored from the
// file's own declaration index.
static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
  // 2 pass -> 4
  // 4 pass -> 6
  // 8 pass -> 10
  // 16 pass -> 18
  return NumPasses + 2;
}
3039
// Wait states required after an N-pass XDL instruction before the overlapping
// register may be written again. gfx950 needs one extra wait state for every
// pass count except 2.
// The first line of the signature was missing from this listing; it is
// restored from the file's own declaration index.
static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses,
                                                       bool IsGFX950) {
  // xdl def cycles | gfx942 | gfx950
  // 2 pass         |  5        5
  // 4 pass         |  7        8
  // 8 pass         |  11       12
  // 16 pass        |  19       20
  return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
}
3049
// Wait states required after an N-pass XDL instruction before its result may
// be read by a VALU, memory or export instruction. gfx950 needs one extra wait
// state for every pass count except 2.
// The first line of the signature was missing from this listing; it is
// restored from the file's own declaration index.
static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses,
                                                              bool IsGFX950) {
  // xdl def cycles | gfx942 | gfx950
  // 2 pass         |  5        5
  // 4 pass         |  7        8
  // 8 pass         |  11       12
  // 16 pass        |  19       20
  return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
}
3059
// Wait states required after an N-pass SMFMA before its result may be read by
// a VALU, memory or export instruction: always NumPasses + 2.
// The signature line was missing from this listing; it is restored from the
// file's own declaration index.
static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
  // 2 pass -> 4
  // 4 pass -> 6
  // 8 pass -> 10
  // 16 pass -> 18
  return NumPasses + 2;
}
3067
// Returns the number of wait states still required before \p MI on subtargets
// with the gfx90a instruction set, for hazards against earlier MFMA and DOT
// instructions. MFMA instructions themselves yield 0 here. Three groups of
// checks run in order and their results are combined with max():
//   1. explicit register uses of a VALU, VMEM/DS or export instruction that
//      overlap the destination of an earlier DOT or MFMA, plus the workaround
//      for a DGEMM placed between a VALU write and a VMEM/DS read;
//   2. a 64-bit FMA/FMAC issued within DMFMAToFMA64WaitStates of a DGEMM;
//   3. definitions of a VALU, VMEM/DS or export instruction that overlap the
//      destination of an earlier DOT/MFMA, or the src2 operand of an earlier
//      non-DGEMM MFMA.
3068int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) const {
3069 if (!ST.hasGFX90AInsts())
3070 return 0;
3071
3072 auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
3073 return SIInstrInfo::isDGEMM(MI.getOpcode());
3074 };
3075
3076 // This is checked in checkMAIHazards90A()
3077 if (SIInstrInfo::isMFMA(*MI))
3078 return 0;
3079
3080 const MachineRegisterInfo &MRI = MF.getRegInfo();
3081
3082 int WaitStatesNeeded = 0;
3083
3084 bool IsMem = SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isDS(*MI);
3085 bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI);
3086 bool IsVALU = SIInstrInfo::isVALU(*MI);
3087
     // Search state shared with the predicates below by reference: Reg is the
     // register currently being examined and must be assigned before each
     // search; MFMA and DOT receive the instruction that matched.
3088 const MachineInstr *MFMA = nullptr;
3089 unsigned Reg;
3090 auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
3091 if (!SIInstrInfo::isMFMA(MI) ||
3092 !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
3093 return false;
3094 MFMA = &MI;
3095 return true;
3096 };
3097
3098 const MachineInstr *DOT = nullptr;
3099 auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
3100 if (!SIInstrInfo::isDOT(MI) ||
3101 !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
3102 return false;
3103 DOT = &MI;
3104 return true;
3105 };
3106
     // Stateful predicate: DGEMMAfterVALUWrite must be reset to false before
     // every search that uses it.
3107 bool DGEMMAfterVALUWrite = false;
3108 auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
3109 // Found DGEMM on reverse traversal to def.
3110 if (SIInstrInfo::isDGEMM(MI.getOpcode()))
3111 DGEMMAfterVALUWrite = true;
3112
3113 // Only hazard if register is defined by a VALU and a DGEMM is found after
3114 // the def.
3115 if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
3116 return false;
3117
3118 return true;
3119 };
3120
3121 int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
3122 AMDGPU::OpName::src2);
3123
     // Group 1: hazards on the registers MI reads.
3124 if (IsMemOrExport || IsVALU) {
3125 const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
3126 const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
3127 const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
3128 const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
3129 const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
3130 const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
3131 const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
3132 const int GFX950_DMFMA16x16WriteVgprVALUReadWaitStates = 19;
3133 const int DotWriteSameDotReadSrcAB = 3;
3134 const int DotWriteDifferentVALURead = 3;
3135 const int DMFMABetweenVALUWriteVMEMRead = 2;
3136 const int MaxWaitStates = 19;
3137
3138 for (const MachineOperand &Use : MI->explicit_uses()) {
3139 if (!Use.isReg())
3140 continue;
3141 Reg = Use.getReg();
3142
3143 DOT = nullptr;
3144 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
3145 MaxWaitStates);
3146 if (DOT) {
     // When the defining DOT has the same opcode as MI, a wait is only
     // requested if this use is not the src2 operand; for any other opcode
     // DotWriteDifferentVALURead applies.
3147 int NeedWaitStates = 0;
3148 if (DOT->getOpcode() == MI->getOpcode()) {
3149 if (&Use - &MI->getOperand(0) != SrcCIdx)
3150 NeedWaitStates = DotWriteSameDotReadSrcAB;
3151 } else {
3152 NeedWaitStates = DotWriteDifferentVALURead;
3153 }
3154
3155 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3156 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3157 }
3158
3159 // Workaround for HW data hazard bug observed only in GFX90A. When there
3160 // is a DGEMM instruction in-between a VALU and a VMEM instruction it
3161 // causes the SQ to incorrectly not insert two wait states between the two
3162 // instructions needed to avoid data hazard.
3163 if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
3164 DGEMMAfterVALUWrite = false;
3165 if (TRI.isVectorRegister(MRI, Reg)) {
3166 int WaitStatesNeededForUse =
3167 DMFMABetweenVALUWriteVMEMRead -
3168 getWaitStatesSinceDef(Reg, IsDGEMMHazard,
3169 DMFMABetweenVALUWriteVMEMRead);
3170
3171 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3172 }
3173 }
3174
3175 MFMA = nullptr;
3176 WaitStatesSinceDef =
3177 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
3178 if (!MFMA)
3179 continue;
3180
     // The scheduling-model latency of the defining MFMA is used both as the
     // switch key and as the pass count handed to the GFX940 helpers.
3181 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
3182 int NumPasses = HazardDefLatency;
3183 int NeedWaitStates = MaxWaitStates;
3184
3185 if (SIInstrInfo::isDGEMM(MFMA->getOpcode())) {
3186 switch (HazardDefLatency) {
3187 case 4:
3188 NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
3189 : DMFMA4x4WriteVgprVALUReadWaitStates;
3190 break;
3191 case 8:
3192 case 16:
3193 NeedWaitStates =
3194 IsMemOrExport
3195 ? DMFMA16x16WriteVgprMemExpReadWaitStates
3196 : (ST.hasGFX950Insts()
3197 ? GFX950_DMFMA16x16WriteVgprVALUReadWaitStates
3198 : DMFMA16x16WriteVgprVALUReadWaitStates);
3199 break;
3200 default:
3201 llvm_unreachable("unexpected dgemm");
3202 }
3203 } else if (ST.hasGFX940Insts()) {
     // NOTE(review): the two lines naming the callees of this conditional
     // appear to have been dropped from this listing (the embedded numbering
     // skips 3206 and 3208). Presumably they are the XDL and SMFMA
     // "...WriteVgprVALUMemExpReadWaitStates" helpers; confirm against the
     // upstream file.
3204 NeedWaitStates =
3205 TII.isXDL(*MFMA)
3207 NumPasses, ST.hasGFX950Insts())
3209 NumPasses);
3210 } else {
3211 switch (HazardDefLatency) {
3212 case 2:
3213 NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
3214 break;
3215 case 8:
3216 NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
3217 break;
3218 case 16:
3219 NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
3220 break;
3221 default:
3222 llvm_unreachable("unexpected number of passes for mfma");
3223 }
3224 }
3225
3226 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3227 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3228
     // MaxWaitStates is the largest constant used in this group, so the
     // remaining uses are not inspected.
3229 if (WaitStatesNeeded == MaxWaitStates)
3230 break;
3231 }
3232 }
3233
     // Group 2: a 64-bit FMA/FMAC shortly after a DGEMM. Skipped when the
     // result so far already covers DMFMAToFMA64WaitStates.
3234 unsigned Opc = MI->getOpcode();
3235 const int DMFMAToFMA64WaitStates = 2;
3236 if ((Opc == AMDGPU::V_FMA_F64_e64 ||
3237 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
3238 Opc == AMDGPU::V_FMAC_F64_dpp) &&
3239 WaitStatesNeeded < DMFMAToFMA64WaitStates) {
3240 int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
3241 getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
3242 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3243 }
3244
3245 if (!IsVALU && !IsMemOrExport)
3246 return WaitStatesNeeded;
3247
     // Group 3: hazards on the registers MI defines.
3248 for (const MachineOperand &Def : MI->defs()) {
3249 const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
3250 const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
3251 const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
3252 const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
3253 const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
3254 const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
3255 const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
3256 const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
3257 const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
3258 const int DotWriteDifferentVALUWrite = 3;
3259 const int MaxWaitStates = 19;
3260 const int MaxWarWaitStates = 15;
3261
3262 Reg = Def.getReg();
3263
     // Overwriting the destination of an earlier DOT with a different opcode.
3264 DOT = nullptr;
3265 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
3266 MaxWaitStates);
3267 if (DOT && DOT->getOpcode() != MI->getOpcode())
3268 WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
3269 WaitStatesSinceDef);
3270
     // Overwriting the destination of an earlier MFMA.
3271 MFMA = nullptr;
3272 WaitStatesSinceDef =
3273 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
3274 if (MFMA) {
3275 int NeedWaitStates = MaxWaitStates;
3276 int NumPasses = TSchedModel.computeInstrLatency(MFMA);
3277
3278 if (SIInstrInfo::isDGEMM(MFMA->getOpcode())) {
3279 switch (NumPasses) {
3280 case 4:
3281 NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
3282 break;
3283 case 8:
3284 case 16:
3285 NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
3286 break;
3287 default:
3288 llvm_unreachable("unexpected number of cycles for dgemm");
3289 }
3290 } else if (ST.hasGFX940Insts()) {
     // NOTE(review): the callee line of the true branch and the whole false
     // branch of this conditional appear to have been dropped from this
     // listing (the embedded numbering skips 3293 and 3295). Presumably they
     // are the XDL and SMFMA "...WriteVgprVALUWawWaitStates" helpers; confirm
     // against the upstream file.
3291 NeedWaitStates =
3292 TII.isXDL(*MFMA)
3294 NumPasses, ST.hasGFX950Insts())
3296 } else {
3297 switch (NumPasses) {
3298 case 2:
3299 NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
3300 break;
3301 case 8:
3302 NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
3303 break;
3304 case 16:
3305 NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
3306 break;
3307 default:
3308 llvm_unreachable("Unexpected number of passes for mfma");
3309 }
3310 }
3311
3312 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3313 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3314
3315 if (WaitStatesNeeded == MaxWaitStates)
3316 break;
3317 }
3318
     // Matches an earlier non-DGEMM MFMA (XDL only when the subtarget has
     // gfx940 instructions) whose src2 operand overlaps Reg, and records it
     // in MFMA.
3319 auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
3320 if (!SIInstrInfo::isMFMA(MI) || SIInstrInfo::isDGEMM(MI.getOpcode()) ||
3321 !MI.readsRegister(Reg, &TRI))
3322 return false;
3323
3324 if (ST.hasGFX940Insts() && !TII.isXDL(MI))
3325 return false;
3326
3327 const MachineOperand *SrcC =
3328 TII.getNamedOperand(MI, AMDGPU::OpName::src2);
3329 assert(SrcC);
3330 if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
3331 return false;
3332
3333 MFMA = &MI;
3334 return true;
3335 };
3336
3337 MFMA = nullptr;
3338 int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
3339 MaxWarWaitStates);
3340 if (!MFMA)
3341 continue;
3342
     // Unlike the switches above, an unlisted latency is not treated as
     // unreachable here; it shares the 16-pass value.
3343 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
3344 int NeedWaitStates = MaxWaitStates;
3345 switch (HazardDefLatency) {
3346 case 2: NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
3347 break;
3348 case 4: assert(ST.hasGFX940Insts());
3349 NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
3350 break;
3351 case 8: NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
3352 break;
3353 case 16: [[fallthrough]];
3354 default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
3355 break;
3356 }
3357
3358 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
3359 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3360 }
3361
3362 return WaitStatesNeeded;
3363}
3364
3366 if (!SU->isInstr())
3367 return false;
3368
3369 const MachineInstr *MAI = nullptr;
3370
3371 auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
3372 MAI = nullptr;
3374 MAI = &MI;
3375 return MAI != nullptr;
3376 };
3377
3378 MachineInstr *MI = SU->getInstr();
3379 if (IsMFMAFn(*MI)) {
3380 int W = getWaitStatesSince(IsMFMAFn, 16);
3381 if (MAI)
3382 return W < (int)TSchedModel.computeInstrLatency(MAI);
3383 }
3384
3385 return false;
3386}
3387
3388// Adjust global offsets for instructions bundled with S_GETPC_B64 after
3389// insertion of a new instruction.
3390static void updateGetPCBundle(MachineInstr *NewMI) {
3391 if (!NewMI->isBundled())
3392 return;
3393
3394 // Find start of bundle.
3395 auto I = NewMI->getIterator();
3396 while (I->isBundledWithPred())
3397 I--;
3398 if (I->isBundle())
3399 I++;
3400
3401 // Bail if this is not an S_GETPC bundle.
3402 if (I->getOpcode() != AMDGPU::S_GETPC_B64)
3403 return;
3404
3405 // Update offsets of any references in the bundle.
3406 const unsigned NewBytes = 4;
3407 assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
3408 "Unexpected instruction insertion in bundle");
3409 auto NextMI = std::next(NewMI->getIterator());
3410 auto End = NewMI->getParent()->end();
3411 while (NextMI != End && NextMI->isBundledWithPred()) {
3412 for (auto &Operand : NextMI->operands()) {
3413 if (Operand.isGlobal())
3414 Operand.setOffset(Operand.getOffset() + NewBytes);
3415 }
3416 NextMI++;
3417 }
3418}
3419
// Inserts an S_WAITCNT_DEPCTR after \p MI when MI is a SALU or VALU that
// writes an SGPR (or VCC) which an earlier VALU reads as a mask source.
// Applies only to wave64 on subtargets that report hasVALUMaskWriteHazard().
// Earlier S_WAITCNT_DEPCTR instructions collected during the backward search
// are folded into the new one and erased. Returns true if an instruction was
// inserted.
3420bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
3421 if (!ST.hasVALUMaskWriteHazard())
3422 return false;
3423 assert(!ST.hasExtendedWaitCounts());
3424
3425 if (!ST.isWave64())
3426 return false;
3427
3428 const bool IsSALU = SIInstrInfo::isSALU(*MI);
3429 const bool IsVALU = SIInstrInfo::isVALU(*MI);
3430 if (!IsSALU && !IsVALU)
3431 return false;
3432
3433 // The hazard sequence is three instructions:
3434 // 1. VALU reads SGPR as mask
3435 // 2. VALU/SALU writes SGPR
3436 // 3. VALU/SALU reads SGPR
3437 // The hazard can expire if the distance between 2 and 3 is sufficient,
3438 // or (2) is VALU and (3) is SALU.
3439 // In practice this happens <10% of the time, hence always assume the hazard
3440 // exists if (1) and (2) are present to avoid searching all SGPR reads.
3441
3442 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3443 const MachineRegisterInfo &MRI = MF.getRegInfo();
3444
     // Registers that are never treated as a hazard source or destination.
3445 auto IgnoreableSGPR = [](const Register Reg) {
3446 switch (Reg) {
3447 case AMDGPU::EXEC:
3448 case AMDGPU::EXEC_LO:
3449 case AMDGPU::EXEC_HI:
3450 case AMDGPU::M0:
3451 case AMDGPU::SGPR_NULL:
3452 case AMDGPU::SGPR_NULL64:
3453 case AMDGPU::SCC:
3454 return true;
3455 default:
3456 return false;
3457 }
3458 };
3459 auto IsVCC = [](const Register Reg) {
3460 return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI;
3461 };
3462
     // Search state passed to hasHazard(): the SGPRs still being tracked.
     // getHashValue/isEqual compare states by that set alone.
3463 struct StateType {
3464 SmallSet<Register, 2> HazardSGPRs;
3465
3466 static unsigned getHashValue(const StateType &State) {
3467 return hash_combine_range(State.HazardSGPRs);
3468 }
3469 static bool isEqual(const StateType &LHS, const StateType &RHS) {
3470 return LHS.HazardSGPRs == RHS.HazardSGPRs;
3471 }
3472 };
3473
     // WaitInstrs collects earlier waits that may be merged; HasSGPRRead
     // disables collection once any tracked SGPR read has been seen, starting
     // with MI's own operands.
3474 SmallVector<const MachineInstr *> WaitInstrs;
3475 bool HasSGPRRead = false;
3476 StateType InitialState;
3477
3478 // Look for SGPR write.
     // Only the first qualifying def is kept; VCC operands are considered even
     // when implicit, other registers only when explicit SGPRs.
3479 MachineOperand *HazardDef = nullptr;
3480 for (MachineOperand &Op : MI->operands()) {
3481 if (!Op.isReg())
3482 continue;
3483 if (Op.isDef() && HazardDef)
3484 continue;
3485
3486 Register Reg = Op.getReg();
3487 if (IgnoreableSGPR(Reg))
3488 continue;
3489 if (!IsVCC(Reg)) {
3490 if (Op.isImplicit())
3491 continue;
3492 if (!TRI->isSGPRReg(MRI, Reg))
3493 continue;
3494 }
3495 // Also check for SGPR reads.
3496 if (Op.isUse()) {
3497 HasSGPRRead = true;
3498 continue;
3499 }
3500
3501 assert(!HazardDef);
3502 HazardDef = &Op;
3503 }
3504
3505 if (!HazardDef)
3506 return false;
3507
3508 // Setup to track writes to individual SGPRs
     // A 64-bit register is tracked as its sub0 and sub1 halves.
3509 const Register HazardReg = HazardDef->getReg();
3510 if (AMDGPU::SReg_32RegClass.contains(HazardReg)) {
3511 InitialState.HazardSGPRs.insert(HazardReg);
3512 } else {
3513 assert(AMDGPU::SReg_64RegClass.contains(HazardReg));
3514 InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub0));
3515 InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub1));
3516 }
3517
     // Classifies an earlier instruction: the search expires once no SGPR is
     // left to track; otherwise the listed VALU opcodes are hazards when their
     // mask source (implicit VCC or the src2 operand) is or overlaps HazardReg.
3518 auto IsHazardFn = [&](StateType &State, const MachineInstr &I) {
3519 if (State.HazardSGPRs.empty())
3520 return HazardExpired;
3521
3522 switch (I.getOpcode()) {
3523 case AMDGPU::V_ADDC_U32_e32:
3524 case AMDGPU::V_ADDC_U32_dpp:
3525 case AMDGPU::V_CNDMASK_B16_t16_e32:
3526 case AMDGPU::V_CNDMASK_B16_fake16_e32:
3527 case AMDGPU::V_CNDMASK_B16_t16_dpp:
3528 case AMDGPU::V_CNDMASK_B16_fake16_dpp:
3529 case AMDGPU::V_CNDMASK_B32_e32:
3530 case AMDGPU::V_CNDMASK_B32_dpp:
3531 case AMDGPU::V_DIV_FMAS_F32_e64:
3532 case AMDGPU::V_DIV_FMAS_F64_e64:
3533 case AMDGPU::V_SUBB_U32_e32:
3534 case AMDGPU::V_SUBB_U32_dpp:
3535 case AMDGPU::V_SUBBREV_U32_e32:
3536 case AMDGPU::V_SUBBREV_U32_dpp: {
3537 // These implicitly read VCC as mask source.
3538 return IsVCC(HazardReg) ? HazardFound : NoHazardFound;
3539 }
3540 case AMDGPU::V_ADDC_U32_e64:
3541 case AMDGPU::V_ADDC_U32_e64_dpp:
3542 case AMDGPU::V_CNDMASK_B16_t16_e64:
3543 case AMDGPU::V_CNDMASK_B16_fake16_e64:
3544 case AMDGPU::V_CNDMASK_B16_t16_e64_dpp:
3545 case AMDGPU::V_CNDMASK_B16_fake16_e64_dpp:
3546 case AMDGPU::V_CNDMASK_B32_e64:
3547 case AMDGPU::V_CNDMASK_B32_e64_dpp:
3548 case AMDGPU::V_SUBB_U32_e64:
3549 case AMDGPU::V_SUBB_U32_e64_dpp:
3550 case AMDGPU::V_SUBBREV_U32_e64:
3551 case AMDGPU::V_SUBBREV_U32_e64_dpp: {
3552 // Only check mask register overlaps.
3553 const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2);
3554 assert(SSRCOp);
3555 bool Result = TRI->regsOverlap(SSRCOp->getReg(), HazardReg);
3556 return Result ? HazardFound : NoHazardFound;
3557 }
3558 default:
3559 return NoHazardFound;
3560 }
3561 };
3562
     // NOTE(review): one line of this initializer appears to have been dropped
     // from this listing (the embedded numbering skips 3564), so the inner
     // encode calls are not visible here; restore it from the upstream file.
3563 const unsigned ConstantMaskBits = AMDGPU::DepCtr::encodeFieldSaSdst(
3565 0),
3566 0);
     // Per-instruction state update during the backward search: remembers
     // mergeable waits, notes SGPR reads, and drops tracked SGPRs that are
     // written again.
3567 auto UpdateStateFn = [&](StateType &State, const MachineInstr &I) {
3568 switch (I.getOpcode()) {
3569 case AMDGPU::S_WAITCNT_DEPCTR:
3570 // Record mergable waits within region of instructions free of SGPR reads.
3571 if (!HasSGPRRead && I.getParent() == MI->getParent() && !I.isBundled() &&
3572 (I.getOperand(0).getImm() & ConstantMaskBits) == ConstantMaskBits)
3573 WaitInstrs.push_back(&I);
3574 break;
3575 default:
3576 // Update tracking of SGPR reads and writes.
3577 for (auto &Op : I.operands()) {
3578 if (!Op.isReg())
3579 continue;
3580
3581 Register Reg = Op.getReg();
3582 if (IgnoreableSGPR(Reg))
3583 continue;
3584 if (!IsVCC(Reg)) {
3585 if (Op.isImplicit())
3586 continue;
3587 if (!TRI->isSGPRReg(MRI, Reg))
3588 continue;
3589 }
3590 if (Op.isUse()) {
3591 HasSGPRRead = true;
3592 continue;
3593 }
3594
3595 // Stop tracking any SGPRs with writes on the basis that they will
3596 // already have an appropriate wait inserted afterwards.
     // NOTE(review): the declaration of `Found` appears to have been dropped
     // from this listing (the embedded numbering skips 3597); it is used as a
     // scratch list so the set is not modified while being iterated.
3598 for (Register SGPR : State.HazardSGPRs) {
3599 if (Reg == SGPR || TRI->regsOverlap(Reg, SGPR))
3600 Found.push_back(SGPR);
3601 }
3602 for (Register SGPR : Found)
3603 State.HazardSGPRs.erase(SGPR);
3604 }
3605 break;
3606 }
3607 };
3608
3609 // Check for hazard
     // The search starts at the instruction immediately before MI.
3610 if (!hasHazard<StateType>(InitialState, IsHazardFn, UpdateStateFn,
3611 MI->getParent(),
3612 std::next(MI->getReverseIterator())))
3613 return false;
3614
3615 // Compute counter mask
     // VALU writers wait on va_vcc or va_sdst depending on whether the written
     // register is VCC; SALU writers wait on sa_sdst.
3616 unsigned DepCtr =
3617 IsVALU ? (IsVCC(HazardReg) ? AMDGPU::DepCtr::encodeFieldVaVcc(0, ST)
3618 : AMDGPU::DepCtr::encodeFieldVaSdst(0, ST))
3619 : AMDGPU::DepCtr::encodeFieldSaSdst(0, ST);
3620
3621 // Try to merge previous waits into this one for regions with no SGPR reads.
3622 if (!WaitInstrs.empty()) {
3623 // Note: WaitInstrs contains const pointers, so walk backward from MI to
3624 // obtain a mutable pointer to each instruction to be merged.
3625 // This is expected to be a very short walk within the same block.
3626 SmallVector<MachineInstr *> ToErase;
3627 unsigned Found = 0;
3628 for (MachineBasicBlock::reverse_iterator It = MI->getReverseIterator(),
3629 End = MI->getParent()->rend();
3630 Found < WaitInstrs.size() && It != End; ++It) {
3631 MachineInstr *WaitMI = &*It;
3632 // Find next wait instruction.
3633 if (std::as_const(WaitMI) != WaitInstrs[Found])
3634 continue;
3635 Found++;
     // For each of the three fields keep the smaller of the two values.
3636 unsigned WaitMask = WaitMI->getOperand(0).getImm();
3637 assert((WaitMask & ConstantMaskBits) == ConstantMaskBits);
3638 DepCtr = AMDGPU::DepCtr::encodeFieldSaSdst(
3639 DepCtr, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(WaitMask),
3640 AMDGPU::DepCtr::decodeFieldSaSdst(DepCtr)));
3641 DepCtr = AMDGPU::DepCtr::encodeFieldVaSdst(
3642 DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaSdst(WaitMask),
3643 AMDGPU::DepCtr::decodeFieldVaSdst(DepCtr)));
3644 DepCtr = AMDGPU::DepCtr::encodeFieldVaVcc(
3645 DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaVcc(WaitMask),
3646 AMDGPU::DepCtr::decodeFieldVaVcc(DepCtr)));
3647 ToErase.push_back(WaitMI);
3648 }
3649 assert(Found == WaitInstrs.size());
3650 for (MachineInstr *WaitMI : ToErase)
3651 WaitMI->eraseFromParent();
3652 }
3653
3654 // Add s_waitcnt_depctr after SGPR write.
3655 auto NextMI = std::next(MI->getIterator());
3656 auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
3657 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
3658 .addImm(DepCtr);
3659
3660 // SALU write may be s_getpc in a bundle.
3661 updateGetPCBundle(NewMI);
3662
3663 return true;
3664}
3665
3666static bool ensureEntrySetPrio(MachineFunction *MF, int Priority,
3667 const SIInstrInfo &TII) {
3668 MachineBasicBlock &EntryMBB = MF->front();
3669 if (EntryMBB.begin() != EntryMBB.end()) {
3670 auto &EntryMI = *EntryMBB.begin();
3671 if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
3672 EntryMI.getOperand(0).getImm() >= Priority)
3673 return false;
3674 }
3675
3676 BuildMI(EntryMBB, EntryMBB.begin(), DebugLoc(), TII.get(AMDGPU::S_SETPRIO))
3677 .addImm(Priority);
3678 return true;
3679}
3680
// Applies the export-priority workaround on subtargets that report
// hasRequiredExportPriority(). Depending on \p MI:
//  - program-end opcodes: if the function has calls, make sure the entry
//    block raises priority to NormalPriority;
//  - S_SETPRIO: raise the requested priority by NormalPriority (capped at
//    MaxPriority) unless it is already at least NormalPriority or is the
//    workaround's own post-export "s_setprio 0";
//  - the last export of a run of exports: append
//    s_setprio 0, [s_waitcnt_expcnt null, 0], s_nop 0, s_nop 0, [s_setprio 2],
//    where the bracketed instructions are omitted when the next instruction
//    is S_ENDPGM.
// Returns true if anything was inserted or modified.
3681bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
3682 if (!ST.hasRequiredExportPriority())
3683 return false;
3684
3685 // Assume the following shader types will never have exports,
3686 // and avoid adding or adjusting S_SETPRIO.
3687 MachineBasicBlock *MBB = MI->getParent();
3688 MachineFunction *MF = MBB->getParent();
3689 auto CC = MF->getFunction().getCallingConv();
     // NOTE(review): the case labels of this switch appear to have been
     // dropped from this listing (the embedded numbering skips 3691-3694), so
     // the excluded calling conventions are not visible here; restore them
     // from the upstream file.
3690 switch (CC) {
3695 return false;
3696 default:
3697 break;
3698 }
3699
3700 const int MaxPriority = 3;
3701 const int NormalPriority = 2;
3702 const int PostExportPriority = 0;
3703
3704 auto It = MI->getIterator();
3705 switch (MI->getOpcode()) {
3706 case AMDGPU::S_ENDPGM:
3707 case AMDGPU::S_ENDPGM_SAVED:
3708 case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
3709 case AMDGPU::SI_RETURN_TO_EPILOG:
3710 // Ensure shader with calls raises priority at entry.
3711 // This ensures correct priority if exports exist in callee.
3712 if (MF->getFrameInfo().hasCalls())
3713 return ensureEntrySetPrio(MF, NormalPriority, TII);
3714 return false;
3715 case AMDGPU::S_SETPRIO: {
3716 // Raise minimum priority unless in workaround.
     // "In workaround" means: priority 0 directly after an export.
3717 auto &PrioOp = MI->getOperand(0);
3718 int Prio = PrioOp.getImm();
3719 bool InWA = (Prio == PostExportPriority) &&
3720 (It != MBB->begin() && TII.isEXP(*std::prev(It)));
3721 if (InWA || Prio >= NormalPriority)
3722 return false;
3723 PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));
3724 return true;
3725 }
3726 default:
3727 if (!TII.isEXP(*MI))
3728 return false;
3729 break;
3730 }
3731
     // From here on MI is an export.
3732 // Check entry priority at each export (as there will only be a few).
3733 // Note: amdgpu_gfx can only be a callee, so defer to caller setprio.
3734 bool Changed = false;
     // NOTE(review): the condition guarding the next statement appears to have
     // been dropped from this listing (the embedded numbering skips 3735);
     // judging by the comment above it presumably tests the calling
     // convention. Confirm against the upstream file.
3736 Changed = ensureEntrySetPrio(MF, NormalPriority, TII);
3737
3738 auto NextMI = std::next(It);
3739 bool EndOfShader = false;
3740 if (NextMI != MBB->end()) {
3741 // Only need WA at end of sequence of exports.
3742 if (TII.isEXP(*NextMI))
3743 return Changed;
3744 // Assume appropriate S_SETPRIO after export means WA already applied.
3745 if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
3746 NextMI->getOperand(0).getImm() == PostExportPriority)
3747 return Changed;
3748 EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
3749 }
3750
3751 const DebugLoc &DL = MI->getDebugLoc();
3752
     // Everything below is inserted before NextMI, i.e. directly after the
     // export, in program order.
3753 // Lower priority.
3754 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
3755 .addImm(PostExportPriority);
3756
3757 if (!EndOfShader) {
3758 // Wait for exports to complete.
3759 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT))
3760 .addReg(AMDGPU::SGPR_NULL)
3761 .addImm(0);
3762 }
3763
3764 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
3765 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
3766
3767 if (!EndOfShader) {
3768 // Return to normal (higher) priority.
3769 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
3770 .addImm(NormalPriority);
3771 }
3772
3773 return true;
3774}
3775
// Inserts "S_WAITCNT_DEPCTR 0" immediately before \p MI when MI is an
// s_getreg instruction reading one of a fixed set of hardware registers.
// Returns true if the wait was inserted.
3776bool GCNHazardRecognizer::fixGetRegWaitIdle(MachineInstr *MI) {
3777 if (!isSGetReg(MI->getOpcode()))
3778 return false;
3779
3780 const SIInstrInfo *TII = ST.getInstrInfo();
3781 switch (getHWReg(TII, *MI)) {
3782 default:
3783 return false;
     // NOTE(review): the case labels selecting the affected hardware registers
     // appear to have been dropped from this listing (the embedded numbering
     // skips 3784-3787); restore them from the upstream file.
3788 break;
3789 }
3790
3791 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3792 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3793 .addImm(0);
3794 return true;
3795}
3796
// Brackets a DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 with two S_WAITCNT_DEPCTR
// instructions: one immediately before \p MI and one immediately after it.
// Returns true if MI is that opcode (and the waits were inserted), false
// otherwise.
3797bool GCNHazardRecognizer::fixDsAtomicAsyncBarrierArriveB64(MachineInstr *MI) {
3798 if (MI->getOpcode() != AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
3799 return false;
3800
3801 const SIInstrInfo *TII = ST.getInstrInfo();
     // NOTE(review): the line that completes each of the two BuildMI
     // statements below (presumably the immediate operand of the wait) appears
     // to have been dropped from this listing (the embedded numbering skips
     // 3804 and 3807); restore both from the upstream file.
3802 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3803 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3805 BuildMI(*MI->getParent(), std::next(MI->getIterator()), MI->getDebugLoc(),
3806 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3808
3809 return true;
3810}
3811
// Inserts an S_WAITCNT_DEPCTR before \p MI when MI reads
// SRC_FLAT_SCRATCH_BASE_LO / _HI (directly, or through an s_getreg of the
// corresponding hardware register) and the backing SGPR (SGPR102 for LO,
// SGPR103 for HI) is not a constant physical register and was modified
// recently enough that the hazard has not expired. Only active in hazard
// recognizer mode. Returns true if the wait was inserted.
3812bool GCNHazardRecognizer::fixScratchBaseForwardingHazard(MachineInstr *MI) {
3813 // No reason to check this in pre-RA scheduling, SGPRs have to be allocated
3814 // for hazard to trigger.
3815 if (!IsHazardRecognizerMode)
3816 return false;
3817
3818 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3819 const SIInstrInfo *TII = ST.getInstrInfo();
3820 // Hazard expires after 10 SGPR writes by SALU or 8 SGPR writes by VALU.
3821 const int FlatScrBaseWaitStates = 10;
3822
3823 bool ReadsFlatScrLo =
3824 MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, TRI);
3825 bool ReadsFlatScrHi =
3826 MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, TRI);
3827 if (isSGetReg(MI->getOpcode())) {
     // NOTE(review): the two case labels of this switch appear to have been
     // dropped from this listing (the embedded numbering skips 3831 and 3834);
     // presumably they name the hardware registers for the low and high
     // halves. Confirm against the upstream file.
3828 switch (getHWReg(TII, *MI)) {
3829 default:
3830 break;
3832 ReadsFlatScrLo = true;
3833 break;
3835 ReadsFlatScrHi = true;
3836 break;
3837 }
3838 }
3839
3840 const MachineRegisterInfo &MRI = MF.getRegInfo();
3841
     // Returns true if Reg was modified by an earlier instruction and fewer
     // than FlatScrBaseWaitStates SGPR-writing SALU/VALU instructions have
     // been counted since, with no expiring S_WAITCNT_DEPCTR in between.
3842 auto IsRegDefHazard = [&](Register Reg) -> bool {
3843 DenseSet<const MachineBasicBlock *> Visited;
3844 auto IsHazardFn = [TRI, Reg](const MachineInstr &MI) {
3845 return MI.modifiesRegister(Reg, TRI);
3846 };
3847
3848 // This literally abuses the idea of waitstates. Instead of waitstates it
3849 // returns 1 for SGPR written and 0 otherwise.
3850 auto IsSGPRDef = [TII, TRI, &MRI](const MachineInstr &MI) -> unsigned {
3851 if (!TII->isSALU(MI) && !TII->isVALU(MI))
3852 return 0;
3853 for (const MachineOperand &MO : MI.all_defs()) {
3854 if (TRI->isSGPRReg(MRI, MO.getReg()))
3855 return 1;
3856 }
3857 return 0;
3858 };
3859
3860 auto IsExpiredFn = [=](const MachineInstr &MI, int SgprWrites) {
3861 if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) {
3862 unsigned Wait = MI.getOperand(0).getImm();
     // NOTE(review): the condition on `Wait` that guards the next return
     // appears to have been dropped from this listing (the embedded numbering
     // skips 3863-3864); restore it from the upstream file.
3865 return true;
3866 }
3867 return SgprWrites >= FlatScrBaseWaitStates;
3868 };
3869
     // Uses the file-local getWaitStatesSince overload, starting at the
     // instruction immediately before MI, with IsSGPRDef as the counter.
3870 return ::getWaitStatesSince(
3871 IsHazardFn, MI->getParent(), std::next(MI->getReverseIterator()),
3872 0, IsExpiredFn, Visited, IsSGPRDef) < FlatScrBaseWaitStates;
3873 };
3874
     // No fix is needed when, for each half that MI reads, the backing SGPR is
     // constant or has no recent definition.
3875 if ((!ReadsFlatScrLo || MRI.isConstantPhysReg(AMDGPU::SGPR102) ||
3876 !IsRegDefHazard(AMDGPU::SGPR102)) &&
3877 (!ReadsFlatScrHi || MRI.isConstantPhysReg(AMDGPU::SGPR103) ||
3878 !IsRegDefHazard(AMDGPU::SGPR103)))
3879 return false;
3880
     // NOTE(review): the operand of this S_WAITCNT_DEPCTR appears to have been
     // dropped from this listing (the embedded numbering skips 3883-3884);
     // restore it from the upstream file.
3881 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3882 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3885 return true;
3886}
3887
3888bool GCNHazardRecognizer::fixSetRegMode(MachineInstr *MI) {
3889 if (!isSSetReg(MI->getOpcode()) ||
3890 MI->getOperand(1).getImm() != AMDGPU::Hwreg::ID_MODE)
3891 return false;
3892
3893 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
3894 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
3895 return true;
3896}
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
AMDGPU Rewrite AGPR Copy MFMA
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static cl::opt< unsigned, false, MFMAPaddingRatioParser > MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden, cl::desc("Fill a percentage of the latency between " "neighboring MFMA with s_nops."))
static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF, const GCNSubtarget &ST)
static cl::opt< bool > EnableWMMAVnopHoisting("amdgpu-wmma-vnop-hoisting", cl::init(true), cl::Hidden, cl::desc("Hoist WMMA hazard V_NOPs from loops to preheaders"))
static bool consumesDstSelForwardingOperand(const MachineInstr *VALU, const MachineOperand *Dst, const SIRegisterInfo *TRI)
Checks whether the provided MI "consumes" the operand with a Dest sel fowarding issue Dst .
static bool isSGetReg(unsigned Opcode)
static bool breaksSMEMSoftClause(MachineInstr *MI)
static bool isLdsDma(const MachineInstr &MI)
static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses, bool IsGFX950)
static bool isRFE(unsigned Opcode)
static bool isRWLane(unsigned Opcode)
static bool isSMovRel(unsigned Opcode)
static const MachineOperand * getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST)
Dest sel forwarding issue occurs if additional logic is needed to swizzle / pack the computed value i...
static int GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(int NumPasses, bool IsGFX950)
static void updateGetPCBundle(MachineInstr *NewMI)
static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses, bool IsGFX950)
static bool isStoreCountWaitZero(const MachineInstr &I)
static bool breaksVMEMSoftClause(MachineInstr *MI)
static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI, const MachineInstr &MI)
static bool isSSetReg(unsigned Opcode)
static unsigned getWMMAHazardInstInCategory(const MachineInstr &MI, const SIInstrInfo *TII, const TargetSchedModel &SchedModel)
static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV, MCRegister Reg)
static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr)
static bool isDivFMas(unsigned Opcode)
static bool hasHazard(StateT InitialState, function_ref< HazardFnResult(StateT &, const MachineInstr &)> IsHazard, function_ref< void(StateT &, const MachineInstr &)> UpdateState, const MachineBasicBlock *InitialMBB, MachineBasicBlock::const_reverse_instr_iterator InitialI)
static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB, MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates, GCNHazardRecognizer::IsExpiredFn IsExpired, DenseSet< const MachineBasicBlock * > &Visited, GCNHazardRecognizer::GetNumWaitStatesFn GetNumWaitStates=SIInstrInfo::getNumWaitStates)
static int GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses)
static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses, bool IsGFX950)
static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses)
static int GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses)
static bool isCoexecutableVALUInst(const MachineInstr &MI)
static bool ensureEntrySetPrio(MachineFunction *MF, int Priority, const SIInstrInfo &TII)
static void addRegsToSet(const SIRegisterInfo &TRI, iterator_range< MachineInstr::const_mop_iterator > Ops, BitVector &DefSet, BitVector &UseSet)
static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII, unsigned Quantity)
static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII, const MachineInstr &MI)
static cl::opt< unsigned > NopPadding("amdgpu-snop-padding", cl::init(0), cl::Hidden, cl::desc("Insert a s_nop x before every instruction"))
static bool isPermlane(const MachineInstr &MI)
static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses)
static int GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(int NumPasses, bool IsGFX950)
AMD GCN specific subclass of TargetSubtarget.
static Register UseReg(const MachineOperand &MO)
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static llvm::Error parse(GsymDataExtractor &Data, uint64_t BaseAddr, LineEntryCallback const &Callback)
Definition LineTable.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:483
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:119
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition blake3_impl.h:83
unsigned get(InstCounterType T) const
BitVector & set()
Set all bits in the bitvector.
Definition BitVector.h:366
A debug info location.
Definition DebugLoc.h:124
std::pair< iterator, bool > insert_as(std::pair< KeyT, ValueT > &&KV, const LookupKeyT &Val)
Alternate version of insert() which allows a different, and possibly less expensive,...
Definition DenseMap.h:319
Implements a dense probed hash-table based set.
Definition DenseSet.h:289
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this functio...
Definition Function.h:272
unsigned getHazardWaitStates(MachineInstr *MI) const
Returns the number of wait states until all hazards for MI are resolved.
unsigned PreEmitNoopsCommon(MachineInstr *) const
void EmitNoop() override
EmitNoop - This callback is invoked when a noop was added to the instruction stream.
void Reset() override
Reset - This callback is invoked when a new block of instructions is about to be scheduled.
unsigned PreEmitNoops(MachineInstr *) override
This overload will be used when the hazard recognizer is being used by a non-scheduling pass,...
void EmitInstruction(SUnit *SU) override
EmitInstruction - This callback is invoked when an instruction is emitted, to advance the hazard stat...
function_ref< bool(const MachineInstr &)> IsHazardFn
void AdvanceCycle() override
AdvanceCycle - This callback is invoked whenever the next top-down instruction to be scheduled cannot...
function_ref< unsigned int(const MachineInstr &)> GetNumWaitStatesFn
bool ShouldPreferAnother(SUnit *SU) const override
ShouldPreferAnother - This callback may be invoked if getHazardType returns NoHazard.
function_ref< bool(const MachineInstr &, int WaitStates)> IsExpiredFn
GCNHazardRecognizer(const MachineFunction &MF, MachineLoopInfo *MLI=nullptr)
HazardType getHazardType(SUnit *SU, int Stalls) override
getHazardType - Return the hazard type of emitting this node.
void RecedeCycle() override
RecedeCycle - This callback is invoked whenever the next bottom-up instruction to be scheduled cannot...
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
LoopT * getParentLoop() const
Return the parent loop if it exists or nullptr for top level loops.
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
Instructions::const_reverse_iterator const_reverse_instr_iterator
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
Instructions::iterator instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineBasicBlock & front() const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineBasicBlock * getParent() const
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
bool isBundled() const
Return true if this instruction is part of a bundle.
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
void setIsKill(bool Val=true)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool isConstantPhysReg(MCRegister PhysReg) const
Returns true if PhysReg is unallocatable and constant throughout the function.
LLVM_ABI bool isPhysRegUsed(MCRegister PhysReg, bool SkipRegMaskTest=false) const
Return true if the specified register is modified or read in this function.
static bool isDS(const MachineInstr &MI)
static bool isVMEM(const MachineInstr &MI)
static bool isSMRD(const MachineInstr &MI)
static bool isMTBUF(const MachineInstr &MI)
static bool isDGEMM(unsigned Opcode)
static bool isEXP(const MachineInstr &MI)
static bool isSALU(const MachineInstr &MI)
static bool isSDWA(const MachineInstr &MI)
static bool isDOT(const MachineInstr &MI)
static bool isSWMMAC(const MachineInstr &MI)
static bool isLDSDIR(const MachineInstr &MI)
static bool isTRANS(const MachineInstr &MI)
static bool isMUBUF(const MachineInstr &MI)
static bool isWaitcnt(unsigned Opcode)
static bool isDPP(const MachineInstr &MI)
static bool isMFMA(const MachineInstr &MI)
static bool isMAI(const MCInstrDesc &Desc)
static bool isFPAtomic(const MachineInstr &MI)
static bool isMIMG(const MachineInstr &MI)
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
static bool isWMMA(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isVALU(const MachineInstr &MI)
static bool isLDSDMA(const MachineInstr &MI)
Scheduling unit. This is a node in the scheduling DAG.
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
unsigned MaxLookAhead
MaxLookAhead - Indicate the number of cycles in the scoreboard state.
virtual void EmitNoops(unsigned Quantity)
EmitNoops - This callback is invoked when noops were added to the instruction stream.
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
A SetVector that performs no allocations if smaller than a certain size.
Definition SetVector.h:339
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:184
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
bool getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
Definition StringRef.h:490
Provide an instruction scheduling machine model to CodeGen passes.
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:212
An efficient, type-erasing, non-owning reference to a callable.
self_iterator getIterator()
Definition ilist_node.h:123
A range adaptor for a pair of iterators.
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned encodeFieldVaVcc(unsigned Encoded, unsigned VaVcc)
unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst)
unsigned decodeFieldSaSdst(unsigned Encoded)
unsigned decodeFieldVaSdst(unsigned Encoded)
unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc)
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
unsigned decodeFieldVaVdst(unsigned Encoded)
unsigned decodeFieldVmVsrc(unsigned Encoded)
unsigned encodeFieldVaSdst(unsigned Encoded, unsigned VaSdst)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
FPType getFPDstSelType(unsigned Opc)
bool isGFX12Plus(const MCSubtargetInfo &STI)
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
Waitcnt decodeWaitcnt(const IsaVersion &Version, unsigned Encoded)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
@ Entry
Definition COFF.h:862
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
This namespace contains all of the command line option processing machinery.
Definition MCSchedule.h:35
initializer< Ty > init(const Ty &Val)
constexpr double e
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
@ Offset
Definition DWP.cpp:558
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ Define
Register definition.
@ Wait
Definition Threading.h:60
constexpr RegState getDeadRegState(bool B)
Op::Description Desc
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
DWARFExpression::Operation Op
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:325
LLVM_ABI Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:305
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
An information struct used to provide DenseMap with the various necessary components for a given valu...