LLVM 23.0.0git
SIInsertWaitcnts.cpp
Go to the documentation of this file.
1//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Insert wait instructions for memory reads and writes.
11///
12/// Memory reads and writes are issued asynchronously, so we need to insert
13/// S_WAITCNT instructions when we want to access any of their results or
14/// overwrite any register that's used asynchronously.
15///
16/// TODO: This pass currently keeps one timeline per hardware counter. A more
17/// finely-grained approach that keeps one timeline per event type could
18/// sometimes get away with generating weaker s_waitcnt instructions. For
19/// example, when both SMEM and LDS are in flight and we need to wait for
20/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
21/// but the pass will currently generate a conservative lgkmcnt(0) because
22/// multiple event types are in flight.
23//
24//===----------------------------------------------------------------------===//
25
26#include "AMDGPU.h"
27#include "AMDGPUWaitcntUtils.h"
28#include "GCNSubtarget.h"
32#include "llvm/ADT/MapVector.h"
34#include "llvm/ADT/Sequence.h"
40#include "llvm/IR/Dominators.h"
44
45using namespace llvm;
46
47#define DEBUG_TYPE "si-insert-waitcnts"
48
49DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE "-forceexp",
50 "Force emit s_waitcnt expcnt(0) instrs");
51DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE "-forcelgkm",
52 "Force emit s_waitcnt lgkmcnt(0) instrs");
53DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE "-forcevm",
54 "Force emit s_waitcnt vmcnt(0) instrs");
55
56static cl::opt<bool>
57 ForceEmitZeroFlag("amdgpu-waitcnt-forcezero",
58 cl::desc("Force all waitcnt instrs to be emitted as "
59 "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
60 cl::init(false), cl::Hidden);
61
63 "amdgpu-waitcnt-load-forcezero",
64 cl::desc("Force all waitcnt load counters to wait until 0"),
65 cl::init(false), cl::Hidden);
66
68 "amdgpu-expert-scheduling-mode",
69 cl::desc("Enable expert scheduling mode 2 for all functions (GFX12+ only)"),
70 cl::init(false), cl::Hidden);
71
72namespace {
73// Get the maximum wait count value for a given counter type.
74static unsigned getWaitCountMax(const AMDGPU::HardwareLimits &Limits,
76 switch (T) {
78 return Limits.LoadcntMax;
79 case AMDGPU::DS_CNT:
80 return Limits.DscntMax;
81 case AMDGPU::EXP_CNT:
82 return Limits.ExpcntMax;
84 return Limits.StorecntMax;
86 return Limits.SamplecntMax;
87 case AMDGPU::BVH_CNT:
88 return Limits.BvhcntMax;
89 case AMDGPU::KM_CNT:
90 return Limits.KmcntMax;
91 case AMDGPU::X_CNT:
92 return Limits.XcntMax;
93 case AMDGPU::VA_VDST:
94 return Limits.VaVdstMax;
95 case AMDGPU::VM_VSRC:
96 return Limits.VmVsrcMax;
97 default:
98 return 0;
99 }
100}
101
102/// Integer IDs used to track vector memory locations we may have to wait on.
103/// Encoded as u16 chunks:
104///
105/// [0, REGUNITS_END ): MCRegUnit
106/// [LDSDMA_BEGIN, LDSDMA_END ) : LDS DMA IDs
107///
108/// NOTE: The choice of encoding these as "u16 chunks" is arbitrary.
109/// It gives (1 << 16) entries per category, which is more than enough
110/// for all register units. MCPhysReg is u16 so we don't even support >u16
111/// physical register numbers at this time, let alone >u16 register units.
112/// In any case, an assertion in "WaitcntBrackets" ensures REGUNITS_END
113/// is enough for all register units.
using VMEMID = uint32_t;

enum : VMEMID {
  // Number of IDs reserved for each tracked category.
  TRACKINGID_RANGE_LEN = VMEMID{1} << 16,

  // Register units occupy the first range. Important: they must always be
  // tracked starting from 0, because a MCRegUnit and a VMEMID need to be
  // freely convertible into each other.
  REGUNITS_BEGIN = 0,
  REGUNITS_END = REGUNITS_BEGIN + TRACKINGID_RANGE_LEN,

  // LDS DMA IDs follow directly after the register units. LDSDMA_BEGIN is the
  // "common" entry, which is updated for all LDS DMA operations encountered;
  // specific LDS DMA IDs start at LDSDMA_BEGIN + 1.
  NUM_LDSDMA = TRACKINGID_RANGE_LEN,
  LDSDMA_BEGIN = REGUNITS_END,
  LDSDMA_END = LDSDMA_BEGIN + NUM_LDSDMA,
};
131
132/// Convert a MCRegUnit to a VMEMID.
133static constexpr VMEMID toVMEMID(MCRegUnit RU) {
134 return static_cast<unsigned>(RU);
135}
136
// X-macro listing every wait event exactly once, in enumeration order. The
// argument is a one-parameter macro that is applied to each event name.
#define AMDGPU_DECLARE_WAIT_EVENTS(EVENT)                                      \
  EVENT(VMEM_ACCESS) /* vmem read & write (pre-gfx10), vmem read (gfx10+) */   \
  EVENT(VMEM_SAMPLER_READ_ACCESS) /* vmem SAMPLER read (gfx12+ only) */        \
  EVENT(VMEM_BVH_READ_ACCESS)     /* vmem BVH read (gfx12+ only) */            \
  EVENT(GLOBAL_INV_ACCESS)        /* GLOBAL_INV (gfx12+ only) */               \
  EVENT(VMEM_WRITE_ACCESS)        /* vmem write that is not scratch */         \
  EVENT(SCRATCH_WRITE_ACCESS)     /* vmem write that may be scratch */         \
  EVENT(VMEM_GROUP)               /* vmem group */                             \
  EVENT(LDS_ACCESS)               /* lds read & write */                       \
  EVENT(GDS_ACCESS)               /* gds read & write */                       \
  EVENT(SQ_MESSAGE)               /* send message */                           \
  EVENT(SCC_WRITE)                /* write to SCC from barrier */              \
  EVENT(SMEM_ACCESS)              /* scalar-memory read & write */             \
  EVENT(SMEM_GROUP)               /* scalar-memory group */                    \
  EVENT(EXP_GPR_LOCK)             /* export holding on its data src */         \
  EVENT(GDS_GPR_LOCK)             /* GDS holding on its data and addr src */   \
  EVENT(EXP_POS_ACCESS)           /* write to export position */               \
  EVENT(EXP_PARAM_ACCESS)         /* write to export parameter */              \
  EVENT(VMW_GPR_LOCK)             /* vmem write holding on its data src */     \
  EVENT(EXP_LDS_ACCESS)           /* read by ldsdir counting as export */      \
  EVENT(VGPR_CSMACC_WRITE) /* write VGPR dest in Core/Side-MACC VALU */        \
  EVENT(VGPR_DPMACC_WRITE)        /* write VGPR dest in DPMACC VALU */         \
  EVENT(VGPR_TRANS_WRITE)         /* write VGPR dest in TRANS VALU */          \
  EVENT(VGPR_XDL_WRITE)           /* write VGPR dest in XDL VALU */            \
  EVENT(VGPR_LDS_READ)            /* read VGPR source in LDS */                \
  EVENT(VGPR_FLAT_READ)           /* read VGPR source in FLAT */               \
  EVENT(VGPR_VMEM_READ)           /* read VGPR source in other VMEM */         \
  EVENT(ASYNC_ACCESS)             /* access that uses ASYNC_CNT */             \
  EVENT(TENSOR_ACCESS)            /* access that uses TENSOR_CNT */
166
167// clang-format off
168#define AMDGPU_EVENT_ENUM(Name) Name,
169enum WaitEventType {
171 NUM_WAIT_EVENTS
172};
173#undef AMDGPU_EVENT_ENUM
174} // namespace
175
176namespace llvm {
177template <> struct enum_iteration_traits<WaitEventType> {
178 static constexpr bool is_iterable = true;
179};
180} // namespace llvm
181
182namespace {
183
184/// Return an iterator over all events between VMEM_ACCESS (the first event)
185/// and \c MaxEvent (exclusive, default value yields an enumeration over
186/// all counters).
187auto wait_events(WaitEventType MaxEvent = NUM_WAIT_EVENTS) {
188 return enum_seq(VMEM_ACCESS, MaxEvent);
189}
190
191#define AMDGPU_EVENT_NAME(Name) #Name,
192static constexpr StringLiteral WaitEventTypeName[] = {
194};
195#undef AMDGPU_EVENT_NAME
196static constexpr StringLiteral getWaitEventTypeName(WaitEventType Event) {
197 return WaitEventTypeName[Event];
198}
199// clang-format on
200
// Kinds of result-returning VMEM operations. s_waitcnt orders all of them
// through the single vmcnt counter, but without an s_waitcnt only
// instructions of the same VmemType are guaranteed to write their results in
// order. Consequently no s_waitcnt is needed between two instructions of the
// same type that write the same vgpr.
enum VmemType {
  VMEM_NOSAMPLER, // BUF instructions and MIMG instructions without a sampler.
  VMEM_SAMPLER,   // MIMG instructions with a sampler.
  VMEM_BVH,       // BVH instructions.
  NUM_VMEM_TYPES  // Number of kinds above; not a kind itself.
};
215
216// Maps values of InstCounterType to the instruction that waits on that
217// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
218// returns true, and does not cover VA_VDST or VM_VSRC.
219static const unsigned
220 instrsForExtendedCounterTypes[AMDGPU::NUM_EXTENDED_INST_CNTS] = {
221 AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT,
222 AMDGPU::S_WAIT_EXPCNT, AMDGPU::S_WAIT_STORECNT,
223 AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
224 AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT,
225 AMDGPU::S_WAIT_ASYNCCNT, AMDGPU::S_WAIT_TENSORCNT};
226
227// ASYNCMARK and WAIT_ASYNCMARK are meta instructions that emit no hardware
228// code but still need to be processed by this pass for async vmcnt tracking.
229static bool isNonWaitcntMetaInst(const MachineInstr &MI) {
230 switch (MI.getOpcode()) {
231 case AMDGPU::ASYNCMARK:
232 case AMDGPU::WAIT_ASYNCMARK:
233 return false;
234 default:
235 return MI.isMetaInstruction();
236 }
237}
238
239static bool updateVMCntOnly(const MachineInstr &Inst) {
240 return (SIInstrInfo::isVMEM(Inst) && !SIInstrInfo::isFLAT(Inst)) ||
242}
243
244#ifndef NDEBUG
245static bool isNormalMode(AMDGPU::InstCounterType MaxCounter) {
246 return MaxCounter == AMDGPU::NUM_NORMAL_INST_CNTS;
247}
248#endif // NDEBUG
249
250VmemType getVmemType(const MachineInstr &Inst) {
251 assert(updateVMCntOnly(Inst));
252 if (!SIInstrInfo::isImage(Inst))
253 return VMEM_NOSAMPLER;
254 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
255 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
256 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
257
258 if (BaseInfo->BVH)
259 return VMEM_BVH;
260
261 // We have to make an additional check for isVSAMPLE here since some
262 // instructions don't have a sampler, but are still classified as sampler
263 // instructions for the purposes of e.g. waitcnt.
264 if (BaseInfo->Sampler || BaseInfo->MSAA || SIInstrInfo::isVSAMPLE(Inst))
265 return VMEM_SAMPLER;
266
267 return VMEM_NOSAMPLER;
268}
269
270void addWait(AMDGPU::Waitcnt &Wait, AMDGPU::InstCounterType T, unsigned Count) {
271 Wait.set(T, std::min(Wait.get(T), Count));
272}
273
275 Wait.set(T, ~0u);
276}
277
278/// A small set of events.
279class WaitEventSet {
280 unsigned Mask = 0;
281
282public:
283 WaitEventSet() = default;
284 explicit constexpr WaitEventSet(WaitEventType Event) {
285 static_assert(NUM_WAIT_EVENTS <= sizeof(Mask) * 8,
286 "Not enough bits in Mask for all the events");
287 Mask |= 1 << Event;
288 }
289 constexpr WaitEventSet(std::initializer_list<WaitEventType> Events) {
290 for (auto &E : Events) {
291 Mask |= 1 << E;
292 }
293 }
294 void insert(const WaitEventType &Event) { Mask |= 1 << Event; }
295 void remove(const WaitEventType &Event) { Mask &= ~(1 << Event); }
296 void remove(const WaitEventSet &Other) { Mask &= ~Other.Mask; }
297 bool contains(const WaitEventType &Event) const {
298 return Mask & (1 << Event);
299 }
300 /// \Returns true if this set contains all elements of \p Other.
301 bool contains(const WaitEventSet &Other) const {
302 return (~Mask & Other.Mask) == 0;
303 }
304 /// \Returns the intersection of this and \p Other.
305 WaitEventSet operator&(const WaitEventSet &Other) const {
306 auto Copy = *this;
307 Copy.Mask &= Other.Mask;
308 return Copy;
309 }
310 /// \Returns the union of this and \p Other.
311 WaitEventSet operator|(const WaitEventSet &Other) const {
312 auto Copy = *this;
313 Copy.Mask |= Other.Mask;
314 return Copy;
315 }
316 /// This set becomes the union of this and \p Other.
317 WaitEventSet &operator|=(const WaitEventSet &Other) {
318 Mask |= Other.Mask;
319 return *this;
320 }
321 /// This set becomes the intersection of this and \p Other.
322 WaitEventSet &operator&=(const WaitEventSet &Other) {
323 Mask &= Other.Mask;
324 return *this;
325 }
326 bool operator==(const WaitEventSet &Other) const {
327 return Mask == Other.Mask;
328 }
329 bool operator!=(const WaitEventSet &Other) const { return !(*this == Other); }
330 bool empty() const { return Mask == 0; }
331 /// \Returns true if the set contains more than one element.
332 bool twoOrMore() const { return Mask & (Mask - 1); }
333 operator bool() const { return !empty(); }
334 void print(raw_ostream &OS) const {
335 ListSeparator LS(", ");
336 for (WaitEventType Event : wait_events()) {
337 if (contains(Event))
338 OS << LS << getWaitEventTypeName(Event);
339 }
340 }
341 LLVM_DUMP_METHOD void dump() const;
342};
343
344void WaitEventSet::dump() const {
345 print(dbgs());
346 dbgs() << "\n";
347}
348
349class WaitcntBrackets;
350
351// This abstracts the logic for generating and updating S_WAIT* instructions
352// away from the analysis that determines where they are needed. This was
353// done because the set of counters and instructions for waiting on them
354// underwent a major shift with gfx12, sufficiently so that having this
355// abstraction allows the main analysis logic to be simpler than it would
356// otherwise have had to become.
357class WaitcntGenerator {
358protected:
359 const GCNSubtarget &ST;
360 const SIInstrInfo &TII;
361 AMDGPU::IsaVersion IV;
362 AMDGPU::InstCounterType MaxCounter;
363 bool OptNone;
364 bool ExpandWaitcntProfiling = false;
365 const AMDGPU::HardwareLimits &Limits;
366
367public:
368 WaitcntGenerator() = delete;
369 WaitcntGenerator(const WaitcntGenerator &) = delete;
370 WaitcntGenerator(const MachineFunction &MF,
371 AMDGPU::InstCounterType MaxCounter,
372 const AMDGPU::HardwareLimits &Limits)
373 : ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
374 IV(AMDGPU::getIsaVersion(ST.getCPU())), MaxCounter(MaxCounter),
375 OptNone(MF.getFunction().hasOptNone() ||
376 MF.getTarget().getOptLevel() == CodeGenOptLevel::None),
377 ExpandWaitcntProfiling(
378 MF.getFunction().hasFnAttribute("amdgpu-expand-waitcnt-profiling")),
379 Limits(Limits) {}
380
381 // Return true if the current function should be compiled with no
382 // optimization.
383 bool isOptNone() const { return OptNone; }
384
385 const AMDGPU::HardwareLimits &getLimits() const { return Limits; }
386
387 // Edits an existing sequence of wait count instructions according
388 // to an incoming Waitcnt value, which is itself updated to reflect
389 // any new wait count instructions which may need to be generated by
390 // WaitcntGenerator::createNewWaitcnt(). It will return true if any edits
391 // were made.
392 //
393 // This editing will usually be merely updated operands, but it may also
394 // delete instructions if the incoming Wait value indicates they are not
395 // needed. It may also remove existing instructions for which a wait
396 // is needed if it can be determined that it is better to generate new
397 // instructions later, as can happen on gfx12.
398 virtual bool
399 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
400 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
402
403 // Transform a soft waitcnt into a normal one.
404 bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;
405
406 // Generates new wait count instructions according to the value of
407 // Wait, returning true if any new instructions were created.
408 // ScoreBrackets is used for profiling expansion.
409 virtual bool createNewWaitcnt(MachineBasicBlock &Block,
411 AMDGPU::Waitcnt Wait,
412 const WaitcntBrackets &ScoreBrackets) = 0;
413
414 // Returns the WaitEventSet that corresponds to counter \p T.
415 virtual const WaitEventSet &
416 getWaitEvents(AMDGPU::InstCounterType T) const = 0;
417
418 /// \returns the counter that corresponds to event \p E.
419 AMDGPU::InstCounterType getCounterFromEvent(WaitEventType E) const {
420 for (auto T : AMDGPU::inst_counter_types()) {
421 if (getWaitEvents(T).contains(E))
422 return T;
423 }
424 llvm_unreachable("event type has no associated counter");
425 }
426
427 // Returns a new waitcnt with all counters except VScnt set to 0. If
428 // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
429 // AsyncCnt and TensorCnt always default to ~0u (don't wait for it). They
430 // are only updated when a call to @llvm.amdgcn.wait.asyncmark() is
431 // processed.
432 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;
433
434 virtual ~WaitcntGenerator() = default;
435};
436
437class WaitcntGeneratorPreGFX12 final : public WaitcntGenerator {
438 static constexpr const WaitEventSet
439 WaitEventMaskForInstPreGFX12[AMDGPU::NUM_INST_CNTS] = {
440 WaitEventSet(
441 {VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS}),
442 WaitEventSet({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
443 WaitEventSet({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK,
444 EXP_PARAM_ACCESS, EXP_POS_ACCESS, EXP_LDS_ACCESS}),
445 WaitEventSet({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
446 WaitEventSet(),
447 WaitEventSet(),
448 WaitEventSet(),
449 WaitEventSet(),
450 WaitEventSet(),
451 WaitEventSet(),
452 WaitEventSet(),
453 WaitEventSet()};
454
455public:
456 using WaitcntGenerator::WaitcntGenerator;
457 bool
458 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
459 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
460 MachineBasicBlock::instr_iterator It) const override;
461
462 bool createNewWaitcnt(MachineBasicBlock &Block,
464 AMDGPU::Waitcnt Wait,
465 const WaitcntBrackets &ScoreBrackets) override;
466
467 const WaitEventSet &getWaitEvents(AMDGPU::InstCounterType T) const override {
468 return WaitEventMaskForInstPreGFX12[T];
469 }
470
471 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
472};
473
474class WaitcntGeneratorGFX12Plus final : public WaitcntGenerator {
475protected:
476 bool IsExpertMode;
477 static constexpr const WaitEventSet
478 WaitEventMaskForInstGFX12Plus[AMDGPU::NUM_INST_CNTS] = {
479 WaitEventSet({VMEM_ACCESS, GLOBAL_INV_ACCESS}),
480 WaitEventSet({LDS_ACCESS, GDS_ACCESS}),
481 WaitEventSet({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK,
482 EXP_PARAM_ACCESS, EXP_POS_ACCESS, EXP_LDS_ACCESS}),
483 WaitEventSet({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
484 WaitEventSet({VMEM_SAMPLER_READ_ACCESS}),
485 WaitEventSet({VMEM_BVH_READ_ACCESS}),
486 WaitEventSet({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}),
487 WaitEventSet({VMEM_GROUP, SMEM_GROUP}),
488 WaitEventSet({ASYNC_ACCESS}),
489 WaitEventSet({TENSOR_ACCESS}),
490 WaitEventSet({VGPR_CSMACC_WRITE, VGPR_DPMACC_WRITE, VGPR_TRANS_WRITE,
491 VGPR_XDL_WRITE}),
492 WaitEventSet({VGPR_LDS_READ, VGPR_FLAT_READ, VGPR_VMEM_READ})};
493
494public:
495 WaitcntGeneratorGFX12Plus() = delete;
496 WaitcntGeneratorGFX12Plus(const MachineFunction &MF,
497 AMDGPU::InstCounterType MaxCounter,
498 const AMDGPU::HardwareLimits &Limits,
499 bool IsExpertMode)
500 : WaitcntGenerator(MF, MaxCounter, Limits), IsExpertMode(IsExpertMode) {}
501
502 bool
503 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
504 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
505 MachineBasicBlock::instr_iterator It) const override;
506
507 bool createNewWaitcnt(MachineBasicBlock &Block,
509 AMDGPU::Waitcnt Wait,
510 const WaitcntBrackets &ScoreBrackets) override;
511
512 const WaitEventSet &getWaitEvents(AMDGPU::InstCounterType T) const override {
513 return WaitEventMaskForInstGFX12Plus[T];
514 }
515
516 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
517};
518
// Per-counter flags telling which counters should be flushed in a loop
// preheader. Both flags default to "do not flush".
struct PreheaderFlushFlags {
  bool FlushVmCnt{false};
  bool FlushDsCnt{false};
};
524
525class SIInsertWaitcnts {
526 DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
527 DenseMap<MachineBasicBlock *, PreheaderFlushFlags> PreheadersToFlush;
528 MachineLoopInfo &MLI;
529 MachinePostDominatorTree &PDT;
530 AliasAnalysis *AA = nullptr;
531 MachineFunction &MF;
532
533 struct BlockInfo {
534 std::unique_ptr<WaitcntBrackets> Incoming;
535 bool Dirty = true;
536 BlockInfo() = default;
537 BlockInfo(BlockInfo &&) = default;
538 BlockInfo &operator=(BlockInfo &&) = default;
539 ~BlockInfo();
540 };
541
542 MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
543
544 bool ForceEmitWaitcnt[AMDGPU::NUM_INST_CNTS] = {};
545
546 std::unique_ptr<WaitcntGenerator> WCG;
547
548 // Remember call and return instructions in the function.
549 DenseSet<MachineInstr *> CallInsts;
550 DenseSet<MachineInstr *> ReturnInsts;
551
552 // Remember all S_ENDPGM instructions. The boolean flag is true if there might
553 // be outstanding stores but definitely no outstanding scratch stores, to help
554 // with insertion of DEALLOC_VGPRS messages.
555 DenseMap<MachineInstr *, bool> EndPgmInsts;
556
557 AMDGPU::HardwareLimits Limits;
558
559public:
560 const GCNSubtarget &ST;
561 const SIInstrInfo &TII;
562 const SIRegisterInfo &TRI;
563 const MachineRegisterInfo &MRI;
564 AMDGPU::InstCounterType SmemAccessCounter;
565 AMDGPU::InstCounterType MaxCounter;
566 bool IsExpertMode = false;
567
568 SIInsertWaitcnts(MachineLoopInfo &MLI, MachinePostDominatorTree &PDT,
569 AliasAnalysis *AA, MachineFunction &MF)
570 : MLI(MLI), PDT(PDT), AA(AA), MF(MF), ST(MF.getSubtarget<GCNSubtarget>()),
571 TII(*ST.getInstrInfo()), TRI(TII.getRegisterInfo()),
572 MRI(MF.getRegInfo()) {
573 (void)ForceExpCounter;
574 (void)ForceLgkmCounter;
575 (void)ForceVMCounter;
576 }
577
578 const AMDGPU::HardwareLimits &getLimits() const { return Limits; }
579
580 PreheaderFlushFlags getPreheaderFlushFlags(MachineLoop *ML,
581 const WaitcntBrackets &Brackets);
582 PreheaderFlushFlags isPreheaderToFlush(MachineBasicBlock &MBB,
583 const WaitcntBrackets &ScoreBrackets);
584 bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
585 bool isDSRead(const MachineInstr &MI) const;
586 bool mayStoreIncrementingDSCNT(const MachineInstr &MI) const;
587 bool run();
588
589 void setForceEmitWaitcnt() {
590// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
591// For debug builds, get the debug counter info and adjust if need be
592#ifndef NDEBUG
593 if (DebugCounter::isCounterSet(ForceExpCounter) &&
594 DebugCounter::shouldExecute(ForceExpCounter)) {
595 ForceEmitWaitcnt[AMDGPU::EXP_CNT] = true;
596 } else {
597 ForceEmitWaitcnt[AMDGPU::EXP_CNT] = false;
598 }
599
600 if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
601 DebugCounter::shouldExecute(ForceLgkmCounter)) {
602 ForceEmitWaitcnt[AMDGPU::DS_CNT] = true;
603 ForceEmitWaitcnt[AMDGPU::KM_CNT] = true;
604 } else {
605 ForceEmitWaitcnt[AMDGPU::DS_CNT] = false;
606 ForceEmitWaitcnt[AMDGPU::KM_CNT] = false;
607 }
608
609 if (DebugCounter::isCounterSet(ForceVMCounter) &&
610 DebugCounter::shouldExecute(ForceVMCounter)) {
611 ForceEmitWaitcnt[AMDGPU::LOAD_CNT] = true;
612 ForceEmitWaitcnt[AMDGPU::SAMPLE_CNT] = true;
613 ForceEmitWaitcnt[AMDGPU::BVH_CNT] = true;
614 } else {
615 ForceEmitWaitcnt[AMDGPU::LOAD_CNT] = false;
616 ForceEmitWaitcnt[AMDGPU::SAMPLE_CNT] = false;
617 ForceEmitWaitcnt[AMDGPU::BVH_CNT] = false;
618 }
619
620 ForceEmitWaitcnt[AMDGPU::VA_VDST] = false;
621 ForceEmitWaitcnt[AMDGPU::VM_VSRC] = false;
622#endif // NDEBUG
623 }
624
625 // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM
626 // instruction.
627 WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
628 switch (Inst.getOpcode()) {
629 // FIXME: GLOBAL_INV needs to be tracked with xcnt too.
630 case AMDGPU::GLOBAL_INV:
631 return GLOBAL_INV_ACCESS; // tracked using loadcnt, but doesn't write
632 // VGPRs
633 case AMDGPU::GLOBAL_WB:
634 case AMDGPU::GLOBAL_WBINV:
635 return VMEM_WRITE_ACCESS; // tracked using storecnt
636 default:
637 break;
638 }
639
640 // Maps VMEM access types to their corresponding WaitEventType.
641 static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
642 VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
643
645 // LDS DMA loads are also stores, but on the LDS side. On the VMEM side
646 // these should use VM_CNT.
647 if (!ST.hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst))
648 return VMEM_ACCESS;
649 if (Inst.mayStore() &&
650 (!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst))) {
651 if (TII.mayAccessScratch(Inst))
652 return SCRATCH_WRITE_ACCESS;
653 return VMEM_WRITE_ACCESS;
654 }
655 if (!ST.hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst))
656 return VMEM_ACCESS;
657 return VmemReadMapping[getVmemType(Inst)];
658 }
659
660 std::optional<WaitEventType>
661 getExpertSchedulingEventType(const MachineInstr &Inst) const;
662
663 bool isAsync(const MachineInstr &MI) const {
665 return false;
667 return true;
668 const MachineOperand *Async =
669 TII.getNamedOperand(MI, AMDGPU::OpName::IsAsync);
670 return Async && (Async->getImm());
671 }
672
673 bool isNonAsyncLdsDmaWrite(const MachineInstr &MI) const {
674 return SIInstrInfo::mayWriteLDSThroughDMA(MI) && !isAsync(MI);
675 }
676
677 bool isAsyncLdsDmaWrite(const MachineInstr &MI) const {
678 return SIInstrInfo::mayWriteLDSThroughDMA(MI) && isAsync(MI);
679 }
680
681 bool shouldUpdateAsyncMark(const MachineInstr &MI,
684 return T == AMDGPU::TENSOR_CNT;
685 if (!isAsyncLdsDmaWrite(MI))
686 return false;
688 return T == AMDGPU::ASYNC_CNT;
689 return T == AMDGPU::LOAD_CNT;
690 }
691
692 bool isVmemAccess(const MachineInstr &MI) const;
693 bool generateWaitcntInstBefore(MachineInstr &MI,
694 WaitcntBrackets &ScoreBrackets,
695 MachineInstr *OldWaitcntInstr,
696 PreheaderFlushFlags FlushFlags);
697 bool generateWaitcnt(AMDGPU::Waitcnt Wait,
699 MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
700 MachineInstr *OldWaitcntInstr);
701 /// \returns all events that correspond to \p Inst.
702 WaitEventSet getEventsFor(const MachineInstr &Inst) const;
703 void updateEventWaitcntAfter(MachineInstr &Inst,
704 WaitcntBrackets *ScoreBrackets);
705 bool isNextENDPGM(MachineBasicBlock::instr_iterator It,
706 MachineBasicBlock *Block) const;
707 bool insertForcedWaitAfter(MachineInstr &Inst, MachineBasicBlock &Block,
708 WaitcntBrackets &ScoreBrackets);
709 bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
710 WaitcntBrackets &ScoreBrackets);
711 /// Removes redundant Soft Xcnt Waitcnts in \p Block emitted by the Memory
712 /// Legalizer. Returns true if block was modified.
713 bool removeRedundantSoftXcnts(MachineBasicBlock &Block);
714 void setSchedulingMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
715 bool ExpertMode) const;
716 const WaitEventSet &getWaitEvents(AMDGPU::InstCounterType T) const {
717 return WCG->getWaitEvents(T);
718 }
719 AMDGPU::InstCounterType getCounterFromEvent(WaitEventType E) const {
720 return WCG->getCounterFromEvent(E);
721 }
722};
723
724// This object maintains the current score brackets of each wait counter, and
725// a per-register scoreboard for each wait counter.
726//
727// We also maintain the latest score for every event type that can change the
728// waitcnt in order to know if there are multiple types of events within
729// the brackets. When multiple types of event happen in the bracket,
730// wait count may get decreased out of order, therefore we need to put in
731// "s_waitcnt 0" before use.
732class WaitcntBrackets {
733public:
734 WaitcntBrackets(const SIInsertWaitcnts *Context) : Context(Context) {
735 assert(Context->TRI.getNumRegUnits() < REGUNITS_END);
736 }
737
738#ifndef NDEBUG
739 ~WaitcntBrackets() {
740 unsigned NumUnusedVmem = 0, NumUnusedSGPRs = 0;
741 for (auto &[ID, Val] : VMem) {
742 if (Val.empty())
743 ++NumUnusedVmem;
744 }
745 for (auto &[ID, Val] : SGPRs) {
746 if (Val.empty())
747 ++NumUnusedSGPRs;
748 }
749
750 if (NumUnusedVmem || NumUnusedSGPRs) {
751 errs() << "WaitcntBracket had unused entries at destruction time: "
752 << NumUnusedVmem << " VMem and " << NumUnusedSGPRs
753 << " SGPR unused entries\n";
754 std::abort();
755 }
756 }
757#endif
758
759 bool isSmemCounter(AMDGPU::InstCounterType T) const {
760 return T == Context->SmemAccessCounter || T == AMDGPU::X_CNT;
761 }
762
763 unsigned getOutstanding(AMDGPU::InstCounterType T) const {
764 return ScoreUBs[T] - ScoreLBs[T];
765 }
766
767 bool hasPendingVMEM(VMEMID ID, AMDGPU::InstCounterType T) const {
768 return getVMemScore(ID, T) > getScoreLB(T);
769 }
770
771 /// \Return true if we have no score entries for counter \p T.
772 bool empty(AMDGPU::InstCounterType T) const { return getScoreRange(T) == 0; }
773
774private:
775 unsigned getScoreLB(AMDGPU::InstCounterType T) const {
777 return ScoreLBs[T];
778 }
779
780 unsigned getScoreUB(AMDGPU::InstCounterType T) const {
782 return ScoreUBs[T];
783 }
784
785 unsigned getScoreRange(AMDGPU::InstCounterType T) const {
786 return getScoreUB(T) - getScoreLB(T);
787 }
788
789 unsigned getSGPRScore(MCRegUnit RU, AMDGPU::InstCounterType T) const {
790 auto It = SGPRs.find(RU);
791 return It != SGPRs.end() ? It->second.get(T) : 0;
792 }
793
794 unsigned getVMemScore(VMEMID TID, AMDGPU::InstCounterType T) const {
795 auto It = VMem.find(TID);
796 return It != VMem.end() ? It->second.Scores[T] : 0;
797 }
798
799public:
800 bool merge(const WaitcntBrackets &Other);
801
802 bool counterOutOfOrder(AMDGPU::InstCounterType T) const;
803 void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
804 simplifyWaitcnt(Wait, Wait);
805 }
806 void simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait,
807 AMDGPU::Waitcnt &UpdateWait) const;
808 void simplifyWaitcnt(AMDGPU::InstCounterType T, unsigned &Count) const;
809 void simplifyWaitcnt(AMDGPU::Waitcnt &Wait, AMDGPU::InstCounterType T) const;
810 void simplifyXcnt(const AMDGPU::Waitcnt &CheckWait,
811 AMDGPU::Waitcnt &UpdateWait) const;
812 void simplifyVmVsrc(const AMDGPU::Waitcnt &CheckWait,
813 AMDGPU::Waitcnt &UpdateWait) const;
814
815 void determineWaitForPhysReg(AMDGPU::InstCounterType T, MCPhysReg Reg,
816 AMDGPU::Waitcnt &Wait,
817 const MachineInstr &MI) const;
818 MCPhysReg determineVGPR16Dependency(const MachineInstr &MI,
820 MCPhysReg Reg) const;
821 void determineWaitForLDSDMA(AMDGPU::InstCounterType T, VMEMID TID,
822 AMDGPU::Waitcnt &Wait) const;
823 AMDGPU::Waitcnt determineAsyncWait(unsigned N);
824 void tryClearSCCWriteEvent(MachineInstr *Inst);
825
826 void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
827 void applyWaitcnt(AMDGPU::InstCounterType T, unsigned Count);
828 void applyWaitcnt(const AMDGPU::Waitcnt &Wait, AMDGPU::InstCounterType T);
829 void updateByEvent(WaitEventType E, MachineInstr &MI);
830 void recordAsyncMark(MachineInstr &MI);
831
832 bool hasPendingEvent() const { return !PendingEvents.empty(); }
833 bool hasPendingEvent(WaitEventType E) const {
834 return PendingEvents.contains(E);
835 }
836 bool hasPendingEvent(AMDGPU::InstCounterType T) const {
837 bool HasPending = PendingEvents & Context->getWaitEvents(T);
838 assert(HasPending == !empty(T) &&
839 "Expected pending events iff scoreboard is not empty");
840 return HasPending;
841 }
842
843 bool hasMixedPendingEvents(AMDGPU::InstCounterType T) const {
844 WaitEventSet Events = PendingEvents & Context->getWaitEvents(T);
845 // Return true if more than one bit is set in Events.
846 return Events.twoOrMore();
847 }
848
849 bool hasPendingFlat() const {
850 return ((LastFlatDsCnt > ScoreLBs[AMDGPU::DS_CNT] &&
851 LastFlatDsCnt <= ScoreUBs[AMDGPU::DS_CNT]) ||
852 (LastFlatLoadCnt > ScoreLBs[AMDGPU::LOAD_CNT] &&
853 LastFlatLoadCnt <= ScoreUBs[AMDGPU::LOAD_CNT]));
854 }
855
856 void setPendingFlat() {
857 LastFlatLoadCnt = ScoreUBs[AMDGPU::LOAD_CNT];
858 LastFlatDsCnt = ScoreUBs[AMDGPU::DS_CNT];
859 }
860
861 bool hasPendingGDS() const {
862 return LastGDS > ScoreLBs[AMDGPU::DS_CNT] &&
863 LastGDS <= ScoreUBs[AMDGPU::DS_CNT];
864 }
865
866 unsigned getPendingGDSWait() const {
867 return std::min(getScoreUB(AMDGPU::DS_CNT) - LastGDS,
868 getWaitCountMax(Context->getLimits(), AMDGPU::DS_CNT) - 1);
869 }
870
871 void setPendingGDS() { LastGDS = ScoreUBs[AMDGPU::DS_CNT]; }
872
873 // Return true if there might be pending writes to the vgpr-interval by VMEM
874 // instructions with types different from V.
875 bool hasOtherPendingVmemTypes(MCPhysReg Reg, VmemType V) const {
876 for (MCRegUnit RU : regunits(Reg)) {
877 auto It = VMem.find(toVMEMID(RU));
878 if (It != VMem.end() && (It->second.VMEMTypes & ~(1 << V)))
879 return true;
880 }
881 return false;
882 }
883
884 void clearVgprVmemTypes(MCPhysReg Reg) {
885 for (MCRegUnit RU : regunits(Reg)) {
886 if (auto It = VMem.find(toVMEMID(RU)); It != VMem.end()) {
887 It->second.VMEMTypes = 0;
888 if (It->second.empty())
889 VMem.erase(It);
890 }
891 }
892 }
893
894 void setStateOnFunctionEntryOrReturn() {
895 setScoreUB(AMDGPU::STORE_CNT,
896 getScoreUB(AMDGPU::STORE_CNT) +
897 getWaitCountMax(Context->getLimits(), AMDGPU::STORE_CNT));
898 PendingEvents |= Context->getWaitEvents(AMDGPU::STORE_CNT);
899 }
900
901 ArrayRef<const MachineInstr *> getLDSDMAStores() const {
902 return LDSDMAStores;
903 }
904
905 bool hasPointSampleAccel(const MachineInstr &MI) const;
906 bool hasPointSamplePendingVmemTypes(const MachineInstr &MI,
907 MCPhysReg RU) const;
908
909 void print(raw_ostream &) const;
910 void dump() const { print(dbgs()); }
911
912 // Free up memory by removing empty entries from the DenseMap that track event
913 // scores.
914 void purgeEmptyTrackingData();
915
916private:
// Bundle of four unsigned values handed to mergeScore() and, one per counter
// via an ArrayRef, to mergeAsyncMarks().
// NOTE(review): presumably the lower bounds of the two bracket sets being
// merged and the shifts applied to each side's scores - confirm against the
// merge implementation, which is outside this excerpt.
917 struct MergeInfo {
918 unsigned OldLB;
919 unsigned OtherLB;
920 unsigned MyShift;
921 unsigned OtherShift;
922 };
923
924 using CounterValueArray = std::array<unsigned, AMDGPU::NUM_INST_CNTS>;
925
926 void determineWaitForScore(AMDGPU::InstCounterType T, unsigned Score,
927 AMDGPU::Waitcnt &Wait) const;
928
929 static bool mergeScore(const MergeInfo &M, unsigned &Score,
930 unsigned OtherScore);
931 bool mergeAsyncMarks(ArrayRef<MergeInfo> MergeInfos,
932 ArrayRef<CounterValueArray> OtherMarks);
933
// NOTE(review): the line carrying this helper's signature is absent from this
// listing; only the body is visible.
// Must not be called with SCC (asserted).
935 assert(Reg != AMDGPU::SCC && "Shouldn't be used on SCC");
// Registers outside every allocatable class yield an empty range.
936 if (!Context->TRI.isInAllocatableClass(Reg))
937 return {{}, {}};
// Otherwise defer to the target register info for the register's units.
938 return Context->TRI.regunits(Reg);
939 }
940
// Set the lower bound of the score bracket for counter T to Val.
// NOTE(review): one line of the body is absent from this listing.
941 void setScoreLB(AMDGPU::InstCounterType T, unsigned Val) {
943 ScoreLBs[T] = Val;
944 }
945
// Set the upper bound of the score bracket for counter T to Val.
// For EXP_CNT only, the lower bound is also raised when needed so that the
// bracket never spans more than the counter's maximum value.
// NOTE(review): one line of the body is absent from this listing.
946 void setScoreUB(AMDGPU::InstCounterType T, unsigned Val) {
948 ScoreUBs[T] = Val;
949
// Only EXP_CNT needs the clamping below.
950 if (T != AMDGPU::EXP_CNT)
951 return;
952
// Bracket wider than the counter can represent: pull LB up to UB - max.
953 if (getScoreRange(AMDGPU::EXP_CNT) >
954 getWaitCountMax(Context->getLimits(), AMDGPU::EXP_CNT))
955 ScoreLBs[AMDGPU::EXP_CNT] =
956 ScoreUBs[AMDGPU::EXP_CNT] -
957 getWaitCountMax(Context->getLimits(), AMDGPU::EXP_CNT);
958 }
959
960 void setRegScore(MCPhysReg Reg, AMDGPU::InstCounterType T, unsigned Val) {
961 const SIRegisterInfo &TRI = Context->TRI;
962 if (Reg == AMDGPU::SCC) {
963 SCCScore = Val;
964 } else if (TRI.isVectorRegister(Context->MRI, Reg)) {
965 for (MCRegUnit RU : regunits(Reg))
966 VMem[toVMEMID(RU)].Scores[T] = Val;
967 } else if (TRI.isSGPRReg(Context->MRI, Reg)) {
968 for (MCRegUnit RU : regunits(Reg))
969 SGPRs[RU].get(T) = Val;
970 } else {
971 llvm_unreachable("Register cannot be tracked/unknown register!");
972 }
973 }
974
975 void setVMemScore(VMEMID TID, AMDGPU::InstCounterType T, unsigned Val) {
976 VMem[TID].Scores[T] = Val;
977 }
978
979 void setScoreByOperand(const MachineOperand &Op,
980 AMDGPU::InstCounterType CntTy, unsigned Val);
981
982 const SIInsertWaitcnts *Context;
983
984 unsigned ScoreLBs[AMDGPU::NUM_INST_CNTS] = {0};
985 unsigned ScoreUBs[AMDGPU::NUM_INST_CNTS] = {0};
986 WaitEventSet PendingEvents;
987 // Remember the last flat memory operation.
988 unsigned LastFlatDsCnt = 0;
989 unsigned LastFlatLoadCnt = 0;
990 // Remember the last GDS operation.
991 unsigned LastGDS = 0;
992
993 // The score tracking logic is fragmented as follows:
994 // - VMem: VGPR RegUnits and LDS DMA IDs, see the VMEMID encoding.
995 // - SGPRs: SGPR RegUnits
996 // - SCC: Non-allocatable and not general purpose: not a SGPR.
997 //
998 // For the VMem case, if the key is within the range of LDS DMA IDs,
999 // then the corresponding index into the `LDSDMAStores` vector below is:
1000 // Key - LDSDMA_BEGIN - 1
1001 // This is because LDSDMA_BEGIN is a generic entry and does not have an
1002 // associated MachineInstr.
1003 //
1004 // TODO: Could we track SCC alongside SGPRs so it's no longer a special case?
1005
1006 struct VMEMInfo {
1007 // Scores for all instruction counters. Zero-initialized.
1008 CounterValueArray Scores{};
1009 // Bitmask of the VmemTypes of VMEM instructions for this VGPR.
1010 unsigned VMEMTypes = 0;
1011
1012 bool empty() const { return all_of(Scores, equal_to(0)) && !VMEMTypes; }
1013 };
1014
1015 /// Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt
1016 /// pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant.
1017 class SGPRInfo {
1018 /// Either DS_CNT or KM_CNT score.
1019 unsigned ScoreDsKmCnt = 0;
1020 unsigned ScoreXCnt = 0;
1021
1022 public:
1023 unsigned get(AMDGPU::InstCounterType T) const {
1024 assert(
1025 (T == AMDGPU::DS_CNT || T == AMDGPU::KM_CNT || T == AMDGPU::X_CNT) &&
1026 "Invalid counter");
1027 return T == AMDGPU::X_CNT ? ScoreXCnt : ScoreDsKmCnt;
1028 }
1029 unsigned &get(AMDGPU::InstCounterType T) {
1030 assert(
1031 (T == AMDGPU::DS_CNT || T == AMDGPU::KM_CNT || T == AMDGPU::X_CNT) &&
1032 "Invalid counter");
1033 return T == AMDGPU::X_CNT ? ScoreXCnt : ScoreDsKmCnt;
1034 }
1035
1036 bool empty() const { return !ScoreDsKmCnt && !ScoreXCnt; }
1037 };
1038
1039 DenseMap<VMEMID, VMEMInfo> VMem; // VGPR + LDS DMA
1040 DenseMap<MCRegUnit, SGPRInfo> SGPRs;
1041
1042 // Reg score for SCC.
1043 unsigned SCCScore = 0;
1044 // The unique instruction that has an SCC write pending, if there is one.
1045 const MachineInstr *PendingSCCWrite = nullptr;
1046
1047 // Store representative LDS DMA operations. The only useful info here is
1048 // alias info. One store is kept per unique AAInfo.
1049 SmallVector<const MachineInstr *> LDSDMAStores;
1050
1051 // State of all counters at each async mark encountered so far.
1053
1054 // But in the rare pathological case, a nest of loops that pushes marks
1055 // without waiting on any mark can cause AsyncMarks to grow very large. We cap
1056 // it to a reasonable limit. We can tune this later or potentially introduce a
1057 // user option to control the value.
1058 static constexpr unsigned MaxAsyncMarks = 16;
1059
1060 // Track the upper bound score for async operations that are not part of a
1061 // mark yet. Initialized to all zeros.
1062 CounterValueArray AsyncScore{};
1063};
1064
1065SIInsertWaitcnts::BlockInfo::~BlockInfo() = default;
1066
// Legacy pass manager wrapper for the waitcnt insertion pass.
// NOTE(review): one line inside getAnalysisUsage() is absent from this
// listing.
1067 class SIInsertWaitcntsLegacy : public MachineFunctionPass {
1068 public:
// Pass identification; its address is what identifies the pass.
1069 static char ID;
1070 SIInsertWaitcntsLegacy() : MachineFunctionPass(ID) {}
1071
// Entry point invoked per machine function; defined out of line.
1072 bool runOnMachineFunction(MachineFunction &MF) override;
1073
1074 StringRef getPassName() const override {
1075 return "SI insert wait instructions";
1076 }
1077
// The pass keeps the CFG intact, requires loop info and the post-dominator
// tree, and uses (and preserves) alias analysis when it is available.
1078 void getAnalysisUsage(AnalysisUsage &AU) const override {
1079 AU.setPreservesCFG();
1080 AU.addRequired<MachineLoopInfoWrapperPass>();
1081 AU.addRequired<MachinePostDominatorTreeWrapperPass>();
1082 AU.addUsedIfAvailable<AAResultsWrapperPass>();
1083 AU.addPreserved<AAResultsWrapperPass>();
1085 }
1086 };
1087
1088} // end anonymous namespace
1089
// Record Score for counter CntTy on the physical register held by operand Op;
// a thin forwarder to setRegScore().
// NOTE(review): the middle line of the parameter list is absent from this
// listing.
1090 void WaitcntBrackets::setScoreByOperand(const MachineOperand &Op,
1092 unsigned Score) {
1093 setRegScore(Op.getReg().asMCReg(), CntTy, Score);
1094 }
1095
1096 // Return true if the subtarget is one that enables Point Sample Acceleration
1097 // and the MachineInstr passed in is one to which it might be applied (the
1098 // hardware makes this decision based on several factors, but we can't determine
1099 // this at compile time, so we have to assume it might be applied if the
1100 // instruction supports it).
// NOTE(review): the line that initializes BaseInfo is absent from this
// listing.
1101 bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const {
// Only MIMG instructions on subtargets with the feature can qualify.
1102 if (!Context->ST.hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI))
1103 return false;
1104
1105 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
1106 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
1108 return BaseInfo->PointSampleAccel;
1109 }
1110
1111// Return true if the subtarget enables Point Sample Acceleration, the supplied
1112// MachineInstr is one to which it might be applied and the supplied interval is
1113// one that has outstanding writes to vmem-types different than VMEM_NOSAMPLER
1114// (this is the type that a point sample accelerated instruction effectively
1115// becomes)
1116bool WaitcntBrackets::hasPointSamplePendingVmemTypes(const MachineInstr &MI,
1117 MCPhysReg Reg) const {
1118 if (!hasPointSampleAccel(MI))
1119 return false;
1120
1121 return hasOtherPendingVmemTypes(Reg, VMEM_NOSAMPLER);
1122}
1123
// Account for instruction Inst having issued wait event E.
//
// The upper bound of the counter that tracks E is advanced, E is added to the
// pending set, and the new score is stamped on the registers (or LDS DMA
// slots) that the instruction reads or writes, depending on the counter.
//
// NOTE(review): two `if` header lines (before the VA_VDST double increment and
// before the SCC score update near the end) are absent from this listing.
1124 void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
1125 AMDGPU::InstCounterType T = Context->getCounterFromEvent(E);
1126 assert(T < Context->MaxCounter);
1127
1128 unsigned UB = getScoreUB(T);
1129 unsigned Increment = 1;
1131 // V_WMMA_SCALE instructions use VOP3PX2 encoding. Hardware treats this as
1132 // two VOP3P instructions and increments VA_VDST twice.
1133 Increment = 2;
1134 }
1135 unsigned CurrScore = UB + Increment;
// NOTE(review): with Increment == 2 an unsigned wrap can land on 1 instead of
// 0 and slip past this check; comparing CurrScore < UB would catch both.
1136 if (CurrScore == 0)
1137 report_fatal_error("InsertWaitcnt score wraparound");
1138 // PendingEvents and ScoreUB need to be updated regardless of whether this
1139 // event changes the score of a register or not.
1140 // Examples include vm_cnt for buffer-store or lgkm_cnt for send-message.
1141 PendingEvents.insert(E);
1142 setScoreUB(T, CurrScore);
1143
1144 const SIRegisterInfo &TRI = Context->TRI;
1145 const MachineRegisterInfo &MRI = Context->MRI;
1146 const SIInstrInfo &TII = Context->TII;
1147
1148 if (T == AMDGPU::EXP_CNT) {
1149 // Put score on the source vgprs. If this is a store, just use those
1150 // specific register(s).
1151 if (TII.isDS(Inst) && Inst.mayLoadOrStore()) {
1152 // All GDS operations must protect their address register (same as
1153 // export.)
1154 if (const auto *AddrOp = TII.getNamedOperand(Inst, AMDGPU::OpName::addr))
1155 setScoreByOperand(*AddrOp, AMDGPU::EXP_CNT, CurrScore);
1156
1157 if (Inst.mayStore()) {
1158 if (const auto *Data0 =
1159 TII.getNamedOperand(Inst, AMDGPU::OpName::data0))
1160 setScoreByOperand(*Data0, AMDGPU::EXP_CNT, CurrScore);
1161 if (const auto *Data1 =
1162 TII.getNamedOperand(Inst, AMDGPU::OpName::data1))
1163 setScoreByOperand(*Data1, AMDGPU::EXP_CNT, CurrScore);
1164 } else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) &&
1165 Inst.getOpcode() != AMDGPU::DS_APPEND &&
1166 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
1167 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
1168 for (const MachineOperand &Op : Inst.all_uses()) {
1169 if (TRI.isVectorRegister(MRI, Op.getReg()))
1170 setScoreByOperand(Op, AMDGPU::EXP_CNT, CurrScore);
1171 }
1172 }
1173 } else if (TII.isFLAT(Inst)) {
1174 if (Inst.mayStore()) {
1175 setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::data),
1176 AMDGPU::EXP_CNT, CurrScore);
1177 } else if (SIInstrInfo::isAtomicRet(Inst)) {
1178 setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::data),
1179 AMDGPU::EXP_CNT, CurrScore);
1180 }
1181 } else if (TII.isMIMG(Inst)) {
1182 if (Inst.mayStore()) {
1183 setScoreByOperand(Inst.getOperand(0), AMDGPU::EXP_CNT, CurrScore);
1184 } else if (SIInstrInfo::isAtomicRet(Inst)) {
1185 setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::data),
1186 AMDGPU::EXP_CNT, CurrScore);
1187 }
1188 } else if (TII.isMTBUF(Inst)) {
1189 if (Inst.mayStore())
1190 setScoreByOperand(Inst.getOperand(0), AMDGPU::EXP_CNT, CurrScore);
1191 } else if (TII.isMUBUF(Inst)) {
1192 if (Inst.mayStore()) {
1193 setScoreByOperand(Inst.getOperand(0), AMDGPU::EXP_CNT, CurrScore);
1194 } else if (SIInstrInfo::isAtomicRet(Inst)) {
1195 setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::data),
1196 AMDGPU::EXP_CNT, CurrScore);
1197 }
1198 } else if (TII.isLDSDIR(Inst)) {
1199 // LDSDIR instructions attach the score to the destination.
1200 setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::vdst),
1201 AMDGPU::EXP_CNT, CurrScore);
1202 } else {
1203 if (TII.isEXP(Inst)) {
1204 // For export the destination registers are really temps that
1205 // can be used as the actual source after export patching, so
1206 // we need to treat them like sources and set the EXP_CNT
1207 // score.
1208 for (MachineOperand &DefMO : Inst.all_defs()) {
1209 if (TRI.isVGPR(MRI, DefMO.getReg())) {
1210 setScoreByOperand(DefMO, AMDGPU::EXP_CNT, CurrScore);
1211 }
1212 }
1213 }
1214 for (const MachineOperand &Op : Inst.all_uses()) {
1215 if (TRI.isVectorRegister(MRI, Op.getReg()))
1216 setScoreByOperand(Op, AMDGPU::EXP_CNT, CurrScore);
1217 }
1218 }
1219 } else if (T == AMDGPU::X_CNT) {
1220 WaitEventType OtherEvent = E == SMEM_GROUP ? VMEM_GROUP : SMEM_GROUP;
1221 if (PendingEvents.contains(OtherEvent)) {
1222 // Hardware inserts an implicit xcnt between interleaved
1223 // SMEM and VMEM operations. So there will never be
1224 // outstanding address translations for both SMEM and
1225 // VMEM at the same time.
1226 setScoreLB(T, getScoreUB(T) - 1);
1227 PendingEvents.remove(OtherEvent);
1228 }
1229 for (const MachineOperand &Op : Inst.all_uses())
1230 setScoreByOperand(Op, T, CurrScore);
1231 } else if (T == AMDGPU::VA_VDST || T == AMDGPU::VM_VSRC) {
1232 // Match the score to the VGPR destination or source registers as
1233 // appropriate
1234 for (const MachineOperand &Op : Inst.operands()) {
1235 if (!Op.isReg() || (T == AMDGPU::VA_VDST && Op.isUse()) ||
1236 (T == AMDGPU::VM_VSRC && Op.isDef()))
1237 continue;
1238 if (TRI.isVectorRegister(Context->MRI, Op.getReg()))
1239 setScoreByOperand(Op, T, CurrScore);
1240 }
1241 } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
1242 // Match the score to the destination registers.
1243 //
1244 // Check only explicit operands. Stores, especially spill stores, include
1245 // implicit uses and defs of their super registers which would create an
1246 // artificial dependency, while these are there only for register liveness
1247 // accounting purposes.
1248 //
1249 // Special cases where implicit register defs exists, such as M0 or VCC,
1250 // but none with memory instructions.
1251 for (const MachineOperand &Op : Inst.defs()) {
1252 if (T == AMDGPU::LOAD_CNT || T == AMDGPU::SAMPLE_CNT ||
1253 T == AMDGPU::BVH_CNT) {
1254 if (!TRI.isVectorRegister(MRI, Op.getReg())) // TODO: add wrapper
1255 continue;
1256 if (updateVMCntOnly(Inst)) {
1257 // updateVMCntOnly should only leave us with VGPRs
1258 // MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have VGPR/AGPR
1259 // defs. That's required for a sane index into `VgprMemTypes` below
1260 assert(TRI.isVectorRegister(MRI, Op.getReg()));
1261 VmemType V = getVmemType(Inst);
1262 unsigned char TypesMask = 1 << V;
1263 // If instruction can have Point Sample Accel applied, we have to flag
1264 // this with another potential dependency
1265 if (hasPointSampleAccel(Inst))
1266 TypesMask |= 1 << VMEM_NOSAMPLER;
1267 for (MCRegUnit RU : regunits(Op.getReg().asMCReg()))
1268 VMem[toVMEMID(RU)].VMEMTypes |= TypesMask;
1269 }
1270 }
1271 setScoreByOperand(Op, T, CurrScore);
1272 }
1273 if (Inst.mayStore() &&
1274 (TII.isDS(Inst) || Context->isNonAsyncLdsDmaWrite(Inst))) {
1275 // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS
1276 // written can be accessed. A load from LDS to VMEM does not need a wait.
1277 //
1278 // The "Slot" is the offset from LDSDMA_BEGIN. If it's non-zero, then
1279 // there is a MachineInstr in LDSDMAStores used to track this LDSDMA
1280 // store. The "Slot" is the index into LDSDMAStores + 1.
1281 unsigned Slot = 0;
1282 for (const auto *MemOp : Inst.memoperands()) {
1283 if (!MemOp->isStore() ||
1284 MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
1285 continue;
1286 // Comparing just AA info does not guarantee memoperands are equal
1287 // in general, but this is so for LDS DMA in practice.
1288 auto AAI = MemOp->getAAInfo();
1289 // Alias scope information gives a way to definitely identify an
1290 // original memory object and practically produced in the module LDS
1291 // lowering pass. If there is no scope available we will not be able
1292 // to disambiguate LDS aliasing as after the module lowering all LDS
1293 // is squashed into a single big object.
1294 if (!AAI || !AAI.Scope)
1295 break;
// Look for an already recorded store with identical AA info.
// NOTE(review): the inner MemOp below shadows the outer loop variable.
1296 for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
1297 for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
1298 if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
1299 Slot = I + 1;
1300 break;
1301 }
1302 }
1303 }
1304 if (Slot)
1305 break;
1306 // The slot may not be valid because it can be >= NUM_LDSDMA which
1307 // means the scoreboard cannot track it. We still want to preserve the
1308 // MI in order to check alias information, though.
1309 LDSDMAStores.push_back(&Inst);
1310 Slot = LDSDMAStores.size();
1311 break;
1312 }
// The generic LDS DMA entry is always updated; the specific slot only when it
// fits inside the tracked range.
1313 setVMemScore(LDSDMA_BEGIN, T, CurrScore);
1314 if (Slot && Slot < NUM_LDSDMA)
1315 setVMemScore(LDSDMA_BEGIN + Slot, T, CurrScore);
1316 }
1317
1318 if (Context->shouldUpdateAsyncMark(Inst, T)) {
1319 AsyncScore[T] = CurrScore;
1320 }
1321
1323 setRegScore(AMDGPU::SCC, T, CurrScore);
1324 PendingSCCWrite = &Inst;
1325 }
1326 }
1327 }
1328
1329void WaitcntBrackets::recordAsyncMark(MachineInstr &Inst) {
1330 // In the absence of loops, AsyncMarks can grow linearly with the program
1331 // until we encounter an ASYNCMARK_WAIT. We could drop the oldest mark above a
1332 // limit every time we push a new mark, but that seems like unnecessary work
1333 // in practical cases. We do separately truncate the array when processing a
1334 // loop, which should be sufficient.
1335 AsyncMarks.push_back(AsyncScore);
1336 AsyncScore = {};
1337 LLVM_DEBUG({
1338 dbgs() << "recordAsyncMark:\n" << Inst;
1339 for (const auto &Mark : AsyncMarks) {
1340 llvm::interleaveComma(Mark, dbgs());
1341 dbgs() << '\n';
1342 }
1343 });
1344}
1345
1346void WaitcntBrackets::print(raw_ostream &OS) const {
1347 const GCNSubtarget &ST = Context->ST;
1348
1349 for (auto T : inst_counter_types(Context->MaxCounter)) {
1350 unsigned SR = getScoreRange(T);
1351 switch (T) {
1352 case AMDGPU::LOAD_CNT:
1353 OS << " " << (ST.hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
1354 << SR << "):";
1355 break;
1356 case AMDGPU::DS_CNT:
1357 OS << " " << (ST.hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
1358 << SR << "):";
1359 break;
1360 case AMDGPU::EXP_CNT:
1361 OS << " EXP_CNT(" << SR << "):";
1362 break;
1363 case AMDGPU::STORE_CNT:
1364 OS << " " << (ST.hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
1365 << SR << "):";
1366 break;
1367 case AMDGPU::SAMPLE_CNT:
1368 OS << " SAMPLE_CNT(" << SR << "):";
1369 break;
1370 case AMDGPU::BVH_CNT:
1371 OS << " BVH_CNT(" << SR << "):";
1372 break;
1373 case AMDGPU::KM_CNT:
1374 OS << " KM_CNT(" << SR << "):";
1375 break;
1376 case AMDGPU::X_CNT:
1377 OS << " X_CNT(" << SR << "):";
1378 break;
1379 case AMDGPU::ASYNC_CNT:
1380 OS << " ASYNC_CNT(" << SR << "):";
1381 break;
1382 case AMDGPU::VA_VDST:
1383 OS << " VA_VDST(" << SR << "): ";
1384 break;
1385 case AMDGPU::VM_VSRC:
1386 OS << " VM_VSRC(" << SR << "): ";
1387 break;
1388 default:
1389 OS << " UNKNOWN(" << SR << "):";
1390 break;
1391 }
1392
1393 if (SR != 0) {
1394 // Print vgpr scores.
1395 unsigned LB = getScoreLB(T);
1396
1397 SmallVector<VMEMID> SortedVMEMIDs(VMem.keys());
1398 sort(SortedVMEMIDs);
1399
1400 for (auto ID : SortedVMEMIDs) {
1401 unsigned RegScore = VMem.at(ID).Scores[T];
1402 if (RegScore <= LB)
1403 continue;
1404 unsigned RelScore = RegScore - LB - 1;
1405 if (ID < REGUNITS_END) {
1406 OS << ' ' << RelScore << ":vRU" << ID;
1407 } else {
1408 assert(ID >= LDSDMA_BEGIN && ID < LDSDMA_END &&
1409 "Unhandled/unexpected ID value!");
1410 OS << ' ' << RelScore << ":LDSDMA" << ID;
1411 }
1412 }
1413
1414 // Also need to print sgpr scores for lgkm_cnt or xcnt.
1415 if (isSmemCounter(T)) {
1416 SmallVector<MCRegUnit> SortedSMEMIDs(SGPRs.keys());
1417 sort(SortedSMEMIDs);
1418 for (auto ID : SortedSMEMIDs) {
1419 unsigned RegScore = SGPRs.at(ID).get(T);
1420 if (RegScore <= LB)
1421 continue;
1422 unsigned RelScore = RegScore - LB - 1;
1423 OS << ' ' << RelScore << ":sRU" << static_cast<unsigned>(ID);
1424 }
1425 }
1426
1427 if (T == AMDGPU::KM_CNT && SCCScore > 0)
1428 OS << ' ' << SCCScore << ":scc";
1429 }
1430 OS << '\n';
1431 }
1432
1433 OS << "Pending Events: ";
1434 if (hasPendingEvent()) {
1435 ListSeparator LS;
1436 for (unsigned I = 0; I != NUM_WAIT_EVENTS; ++I) {
1437 if (hasPendingEvent((WaitEventType)I)) {
1438 OS << LS << WaitEventTypeName[I];
1439 }
1440 }
1441 } else {
1442 OS << "none";
1443 }
1444 OS << '\n';
1445
1446 OS << "Async score: ";
1447 if (AsyncScore.empty())
1448 OS << "none";
1449 else
1450 llvm::interleaveComma(AsyncScore, OS);
1451 OS << '\n';
1452
1453 OS << "Async marks: " << AsyncMarks.size() << '\n';
1454
1455 for (const auto &Mark : AsyncMarks) {
1456 for (auto T : AMDGPU::inst_counter_types()) {
1457 unsigned MarkedScore = Mark[T];
1458 switch (T) {
1459 case AMDGPU::LOAD_CNT:
1460 OS << " " << (ST.hasExtendedWaitCounts() ? "LOAD" : "VM")
1461 << "_CNT: " << MarkedScore;
1462 break;
1463 case AMDGPU::DS_CNT:
1464 OS << " " << (ST.hasExtendedWaitCounts() ? "DS" : "LGKM")
1465 << "_CNT: " << MarkedScore;
1466 break;
1467 case AMDGPU::EXP_CNT:
1468 OS << " EXP_CNT: " << MarkedScore;
1469 break;
1470 case AMDGPU::STORE_CNT:
1471 OS << " " << (ST.hasExtendedWaitCounts() ? "STORE" : "VS")
1472 << "_CNT: " << MarkedScore;
1473 break;
1474 case AMDGPU::SAMPLE_CNT:
1475 OS << " SAMPLE_CNT: " << MarkedScore;
1476 break;
1477 case AMDGPU::BVH_CNT:
1478 OS << " BVH_CNT: " << MarkedScore;
1479 break;
1480 case AMDGPU::KM_CNT:
1481 OS << " KM_CNT: " << MarkedScore;
1482 break;
1483 case AMDGPU::X_CNT:
1484 OS << " X_CNT: " << MarkedScore;
1485 break;
1486 case AMDGPU::ASYNC_CNT:
1487 OS << " ASYNC_CNT: " << MarkedScore;
1488 break;
1489 default:
1490 OS << " UNKNOWN: " << MarkedScore;
1491 break;
1492 }
1493 }
1494 OS << '\n';
1495 }
1496 OS << '\n';
1497}
1498
1499/// Simplify \p UpdateWait by removing waits that are redundant based on the
1500/// current WaitcntBrackets and any other waits specified in \p CheckWait.
1501void WaitcntBrackets::simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait,
1502 AMDGPU::Waitcnt &UpdateWait) const {
1503 simplifyWaitcnt(UpdateWait, AMDGPU::LOAD_CNT);
1504 simplifyWaitcnt(UpdateWait, AMDGPU::EXP_CNT);
1505 simplifyWaitcnt(UpdateWait, AMDGPU::DS_CNT);
1506 simplifyWaitcnt(UpdateWait, AMDGPU::STORE_CNT);
1507 simplifyWaitcnt(UpdateWait, AMDGPU::SAMPLE_CNT);
1508 simplifyWaitcnt(UpdateWait, AMDGPU::BVH_CNT);
1509 simplifyWaitcnt(UpdateWait, AMDGPU::KM_CNT);
1510 simplifyXcnt(CheckWait, UpdateWait);
1511 simplifyWaitcnt(UpdateWait, AMDGPU::VA_VDST);
1512 simplifyVmVsrc(CheckWait, UpdateWait);
1513 simplifyWaitcnt(UpdateWait, AMDGPU::ASYNC_CNT);
1514}
1515
1516void WaitcntBrackets::simplifyWaitcnt(AMDGPU::InstCounterType T,
1517 unsigned &Count) const {
1518 // The number of outstanding events for this type, T, can be calculated
1519 // as (UB - LB). If the current Count is greater than or equal to the number
1520 // of outstanding events, then the wait for this counter is redundant.
1521 if (Count >= getScoreRange(T))
1522 Count = ~0u;
1523}
1524
1525void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait,
1526 AMDGPU::InstCounterType T) const {
1527 unsigned Cnt = Wait.get(T);
1528 simplifyWaitcnt(T, Cnt);
1529 Wait.set(T, Cnt);
1530}
1531
1532void WaitcntBrackets::simplifyXcnt(const AMDGPU::Waitcnt &CheckWait,
1533 AMDGPU::Waitcnt &UpdateWait) const {
1534 // Try to simplify xcnt further by checking for joint kmcnt and loadcnt
1535 // optimizations. On entry to a block with multiple predescessors, there may
1536 // be pending SMEM and VMEM events active at the same time.
1537 // In such cases, only clear one active event at a time.
1538 // TODO: Revisit xcnt optimizations for gfx1250.
1539 // Wait on XCNT is redundant if we are already waiting for a load to complete.
1540 // SMEM can return out of order, so only omit XCNT wait if we are waiting till
1541 // zero.
1542 if (CheckWait.get(AMDGPU::KM_CNT) == 0 && hasPendingEvent(SMEM_GROUP))
1543 UpdateWait.set(AMDGPU::X_CNT, ~0u);
1544 // If we have pending store we cannot optimize XCnt because we do not wait for
1545 // stores. VMEM loads retun in order, so if we only have loads XCnt is
1546 // decremented to the same number as LOADCnt.
1547 if (CheckWait.get(AMDGPU::LOAD_CNT) != ~0u && hasPendingEvent(VMEM_GROUP) &&
1548 !hasPendingEvent(AMDGPU::STORE_CNT) &&
1549 CheckWait.get(AMDGPU::X_CNT) >= CheckWait.get(AMDGPU::LOAD_CNT))
1550 UpdateWait.set(AMDGPU::X_CNT, ~0u);
1551 simplifyWaitcnt(UpdateWait, AMDGPU::X_CNT);
1552}
1553
1554void WaitcntBrackets::simplifyVmVsrc(const AMDGPU::Waitcnt &CheckWait,
1555 AMDGPU::Waitcnt &UpdateWait) const {
1556 // Waiting for some counters implies waiting for VM_VSRC, since an
1557 // instruction that decrements a counter on completion would have
1558 // decremented VM_VSRC once its VGPR operands had been read.
1559 if (CheckWait.get(AMDGPU::VM_VSRC) >=
1560 std::min({CheckWait.get(AMDGPU::LOAD_CNT),
1561 CheckWait.get(AMDGPU::STORE_CNT),
1562 CheckWait.get(AMDGPU::SAMPLE_CNT),
1563 CheckWait.get(AMDGPU::BVH_CNT), CheckWait.get(AMDGPU::DS_CNT)}))
1564 UpdateWait.set(AMDGPU::VM_VSRC, ~0u);
1565 simplifyWaitcnt(UpdateWait, AMDGPU::VM_VSRC);
1566}
1567
1568void WaitcntBrackets::purgeEmptyTrackingData() {
1569 VMem.remove_if([](const auto &P) { return P.second.empty(); });
1570 SGPRs.remove_if([](const auto &P) { return P.second.empty(); });
1571}
1572
1573void WaitcntBrackets::determineWaitForScore(AMDGPU::InstCounterType T,
1574 unsigned ScoreToWait,
1575 AMDGPU::Waitcnt &Wait) const {
1576 const unsigned LB = getScoreLB(T);
1577 const unsigned UB = getScoreUB(T);
1578
1579 // If the score falls within the bracket, we need a waitcnt.
1580 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
1581 if ((T == AMDGPU::LOAD_CNT || T == AMDGPU::DS_CNT) && hasPendingFlat() &&
1582 !Context->ST.hasFlatLgkmVMemCountInOrder()) {
1583 // If there is a pending FLAT operation, and this is a VMem or LGKM
1584 // waitcnt and the target can report early completion, then we need
1585 // to force a waitcnt 0.
1586 addWait(Wait, T, 0);
1587 } else if (counterOutOfOrder(T)) {
1588 // Counter can get decremented out-of-order when there
1589 // are multiple types event in the bracket. Also emit an s_wait counter
1590 // with a conservative value of 0 for the counter.
1591 addWait(Wait, T, 0);
1592 } else {
1593 // If a counter has been maxed out avoid overflow by waiting for
1594 // MAX(CounterType) - 1 instead.
1595 unsigned NeededWait = std::min(
1596 UB - ScoreToWait, getWaitCountMax(Context->getLimits(), T) - 1);
1597 addWait(Wait, T, NeededWait);
1598 }
1599 }
1600}
1601
// Compute the wait needed so that at most N of the most recent async marks
// remain outstanding, and drop the waited mark together with all older ones.
// Returns a default-constructed Waitcnt when no more than N marks are
// recorded.
// NOTE(review): the loop header that supplies T to determineWaitForScore() is
// absent from this listing.
1602 AMDGPU::Waitcnt WaitcntBrackets::determineAsyncWait(unsigned N) {
1603 LLVM_DEBUG({
1604 dbgs() << "Need " << N << " async marks. Found " << AsyncMarks.size()
1605 << ":\n";
1606 for (const auto &Mark : AsyncMarks) {
1607 llvm::interleaveComma(Mark, dbgs());
1608 dbgs() << '\n';
1609 }
1610 });
1611
1612 if (AsyncMarks.size() == MaxAsyncMarks) {
1613 // Enforcing MaxAsyncMarks here is unnecessary work because the size of
1614 // AsyncMarks is linear when traversing straightline code. But we do
1615 // need to check if truncation may have occurred at a merge, and adjust N
1616 // to ensure that a wait is generated.
1617 LLVM_DEBUG(dbgs() << "Possible truncation. Ensuring a non-trivial wait.\n");
1618 N = std::min(N, (unsigned)MaxAsyncMarks - 1);
1619 }
1620
1621 AMDGPU::Waitcnt Wait;
1622 if (AsyncMarks.size() <= N) {
1623 LLVM_DEBUG(dbgs() << "No additional wait for async mark.\n");
1624 return Wait;
1625 }
1626
// The mark to wait for is the (N+1)-th most recent one.
1627 size_t MarkIndex = AsyncMarks.size() - N - 1;
1628 const auto &RequiredMark = AsyncMarks[MarkIndex];
1630 determineWaitForScore(T, RequiredMark[T], Wait);
1631
1632 // Immediately remove the waited mark and all older ones
1633 // This happens BEFORE the wait is actually inserted, which is fine
1634 // because we've already extracted the wait requirements
1635 LLVM_DEBUG({
1636 dbgs() << "Removing " << (MarkIndex + 1)
1637 << " async marks after determining wait\n";
1638 });
1639 AsyncMarks.erase(AsyncMarks.begin(), AsyncMarks.begin() + MarkIndex + 1);
1640
1641 LLVM_DEBUG(dbgs() << "Waits to add: " << Wait);
1642 return Wait;
1643 }
1644
1645 // With D16Write32BitVgpr, D16 inst might be clobbered by events running on the
1646 // other half 16bit.
1647 //
1648 // Replace VGPR16 to VGPR32 for wait check if:
1649 // 1. MI is a VALU, and there is a wait event on the other half
1650 // 2. MI is a LdSt, and there is a wait event on the other half from different
1651 // order group
//
// Returns Reg unchanged when it is not a 16-bit register, when the subtarget
// lacks D16Writes32BitVgpr, or when the other half needs no wait.
// NOTE(review): the middle line of the parameter list (which declares T) is
// absent from this listing.
1652 MCPhysReg WaitcntBrackets::determineVGPR16Dependency(const MachineInstr &MI,
1654 MCPhysReg Reg) const {
1655 const TargetRegisterClass *RC = Context->TRI.getPhysRegBaseClass(Reg);
1656 unsigned Size = Context->TRI.getRegSizeInBits(*RC);
1657
1658 if (Size != 16 || !Context->ST.hasD16Writes32BitVgpr())
1659 return Reg;
1660
1661 // With D16Writes32BitVgpr, D16 Inst might clobber the whole vgpr32
1662 // check dependency on the other half
1663 Register Reg32 = Context->TRI.get32BitRegister(Reg);
1664 Register OtherHalf = Context->TRI.getSubReg(
1665 Reg32,
1666 AMDGPU::isHi16Reg(Reg, Context->TRI) ? AMDGPU::lo16 : AMDGPU::hi16);
1667
// Probe with a scratch Waitcnt: it is only used to learn whether the other
// half would require any wait on counter T.
1668 AMDGPU::Waitcnt Wait;
1669 for (MCRegUnit RU : regunits(OtherHalf))
1670 determineWaitForScore(T, getVMemScore(toVMEMID(RU), T), Wait);
1671
1672 // No wait on otherhalf
1673 if (!Wait.hasWait())
1674 return Reg;
1675
1676 if (Context->TII.isVALU(MI))
1677 return Reg32;
1678
1679 // If hi/lo16 mixed events
1680 WaitEventSet MIEvents = Context->getEventsFor(MI);
1681 WaitEventSet OtherHalfEvents = Context->getWaitEvents(T);
1682 WaitEventSet Events = MIEvents & OtherHalfEvents;
1683 if (Events.twoOrMore())
1684 return Reg32;
1685 return Reg;
1686 }
1687
1688void WaitcntBrackets::determineWaitForPhysReg(AMDGPU::InstCounterType T,
1689 MCPhysReg Reg,
1690 AMDGPU::Waitcnt &Wait,
1691 const MachineInstr &MI) const {
1692 if (Reg == AMDGPU::SCC) {
1693 determineWaitForScore(T, SCCScore, Wait);
1694 } else {
1695 bool IsVGPR = Context->TRI.isVectorRegister(Context->MRI, Reg);
1696 if (IsVGPR)
1697 Reg = determineVGPR16Dependency(MI, T, Reg);
1698 for (MCRegUnit RU : regunits(Reg))
1699 determineWaitForScore(
1700 T, IsVGPR ? getVMemScore(toVMEMID(RU), T) : getSGPRScore(RU, T),
1701 Wait);
1702 }
1703}
1704
1705void WaitcntBrackets::determineWaitForLDSDMA(AMDGPU::InstCounterType T,
1706 VMEMID TID,
1707 AMDGPU::Waitcnt &Wait) const {
1708 assert(TID >= LDSDMA_BEGIN && TID < LDSDMA_END);
1709 determineWaitForScore(T, getVMemScore(TID, T), Wait);
1710}
1711
1712void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {
1713 // S_BARRIER_WAIT on the same barrier guarantees that the pending write to
1714 // SCC has landed
1715 if (PendingSCCWrite &&
1716 PendingSCCWrite->getOpcode() == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM &&
1717 PendingSCCWrite->getOperand(0).getImm() == Inst->getOperand(0).getImm()) {
1718 WaitEventSet SCC_WRITE_PendingEvent(SCC_WRITE);
1719 // If this SCC_WRITE is the only pending KM_CNT event, clear counter.
1720 if ((PendingEvents & Context->getWaitEvents(AMDGPU::KM_CNT)) ==
1721 SCC_WRITE_PendingEvent) {
1722 setScoreLB(AMDGPU::KM_CNT, getScoreUB(AMDGPU::KM_CNT));
1723 }
1724
1725 PendingEvents.remove(SCC_WRITE_PendingEvent);
1726 PendingSCCWrite = nullptr;
1727 }
1728}
1729
// Apply every counter entry of Wait to the bracket state by forwarding to the
// per-counter overload.
// NOTE(review): the loop header that supplies T is absent from this listing.
1730 void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
1732 applyWaitcnt(Wait, T);
1733 }
1734
1735void WaitcntBrackets::applyWaitcnt(AMDGPU::InstCounterType T, unsigned Count) {
1736 const unsigned UB = getScoreUB(T);
1737 if (Count >= UB)
1738 return;
1739 if (Count != 0) {
1740 if (counterOutOfOrder(T))
1741 return;
1742 setScoreLB(T, std::max(getScoreLB(T), UB - Count));
1743 } else {
1744 setScoreLB(T, UB);
1745 PendingEvents.remove(Context->getWaitEvents(T));
1746 }
1747
1748 if (T == AMDGPU::KM_CNT && Count == 0 && hasPendingEvent(SMEM_GROUP)) {
1749 if (!hasMixedPendingEvents(AMDGPU::X_CNT))
1750 applyWaitcnt(AMDGPU::X_CNT, 0);
1751 else
1752 PendingEvents.remove(SMEM_GROUP);
1753 }
1754 if (T == AMDGPU::LOAD_CNT && hasPendingEvent(VMEM_GROUP) &&
1755 !hasPendingEvent(AMDGPU::STORE_CNT)) {
1756 if (!hasMixedPendingEvents(AMDGPU::X_CNT))
1757 applyWaitcnt(AMDGPU::X_CNT, Count);
1758 else if (Count == 0)
1759 PendingEvents.remove(VMEM_GROUP);
1760 }
1761}
1762
// Read the value stored for counter `T` in \p Wait and apply it through the
// (counter, count) overload.
// NOTE(review): the declaration of the second parameter (embedded line 1764)
// is absent from this listing; presumably it is the counter type `T` used
// below -- confirm against the upstream file.
1763void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait,
1765 unsigned Cnt = Wait.get(T);
1766 applyWaitcnt(T, Cnt);
1767}
1768
1769// Where there are multiple types of event in the bracket of a counter,
1770// the decrement may go out of order.
1771bool WaitcntBrackets::counterOutOfOrder(AMDGPU::InstCounterType T) const {
1772 // Scalar memory read always can go out of order.
1773 if ((T == Context->SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) ||
1774 (T == AMDGPU::X_CNT && hasPendingEvent(SMEM_GROUP)))
1775 return true;
1776
1777 // GLOBAL_INV completes in-order with other LOAD_CNT events (VMEM_ACCESS),
1778 // so having GLOBAL_INV_ACCESS mixed with other LOAD_CNT events doesn't cause
1779 // out-of-order completion.
1780 if (T == AMDGPU::LOAD_CNT) {
1781 WaitEventSet Events = PendingEvents & Context->getWaitEvents(T);
1782 // Remove GLOBAL_INV_ACCESS from the event mask before checking for mixed
1783 // events
1784 Events.remove(GLOBAL_INV_ACCESS);
1785 // Return true only if there are still multiple event types after removing
1786 // GLOBAL_INV
1787 return Events.twoOrMore();
1788 }
1789
1790 return hasMixedPendingEvents(T);
1791}
1792
// Registration of SIInsertWaitcntsLegacy through the INITIALIZE_PASS_* macros,
// followed by its ID and the factory that creates it.
// NOTE(review): embedded lines 1795-1796, 1798 and 1804 are absent from this
// listing (presumably the pass dependencies, the trailing macro arguments and
// the factory's signature) -- confirm against the upstream file.
1793INITIALIZE_PASS_BEGIN(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1794 false, false)
1797INITIALIZE_PASS_END(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1799
// Storage for the pass ID.
1800char SIInsertWaitcntsLegacy::ID = 0;
1801
// Exported reference that aliases the pass ID above.
1802char &llvm::SIInsertWaitcntsID = SIInsertWaitcntsLegacy::ID;
1803
// Factory body: hands a newly allocated SIInsertWaitcntsLegacy to the caller.
1805 return new SIInsertWaitcntsLegacy();
1806}
1807
1808static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName,
1809 unsigned NewEnc) {
1810 int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
1811 assert(OpIdx >= 0);
1812
1813 MachineOperand &MO = MI.getOperand(OpIdx);
1814
1815 if (NewEnc == MO.getImm())
1816 return false;
1817
1818 MO.setImm(NewEnc);
1819 return true;
1820}
1821
1822/// Determine if \p MI is a gfx12+ single-counter S_WAIT_*CNT instruction,
1823/// and if so, which counter it is waiting on.
1824static std::optional<AMDGPU::InstCounterType>
1825counterTypeForInstr(unsigned Opcode) {
1826 switch (Opcode) {
1827 case AMDGPU::S_WAIT_LOADCNT:
1828 return AMDGPU::LOAD_CNT;
1829 case AMDGPU::S_WAIT_EXPCNT:
1830 return AMDGPU::EXP_CNT;
1831 case AMDGPU::S_WAIT_STORECNT:
1832 return AMDGPU::STORE_CNT;
1833 case AMDGPU::S_WAIT_SAMPLECNT:
1834 return AMDGPU::SAMPLE_CNT;
1835 case AMDGPU::S_WAIT_BVHCNT:
1836 return AMDGPU::BVH_CNT;
1837 case AMDGPU::S_WAIT_DSCNT:
1838 return AMDGPU::DS_CNT;
1839 case AMDGPU::S_WAIT_KMCNT:
1840 return AMDGPU::KM_CNT;
1841 case AMDGPU::S_WAIT_XCNT:
1842 return AMDGPU::X_CNT;
1843 case AMDGPU::S_WAIT_ASYNCCNT:
1844 return AMDGPU::ASYNC_CNT;
1845 case AMDGPU::S_WAIT_TENSORCNT:
1846 return AMDGPU::TENSOR_CNT;
1847 default:
1848 return {};
1849 }
1850}
1851
1852bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
1853 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Waitcnt->getOpcode());
1854 if (Opcode == Waitcnt->getOpcode())
1855 return false;
1856
1857 Waitcnt->setDesc(TII.get(Opcode));
1858 return true;
1859}
1860
1861/// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that
1862/// precede \p It and follow \p OldWaitcntInstr and apply any extra waits
1863/// from \p Wait that were added by previous passes. Currently this pass
1864/// conservatively assumes that these preexisting waits are required for
1865/// correctness.
///
/// On return, every counter that was folded into a surviving instruction has
/// been applied to \p ScoreBrackets and reset to ~0u in \p Wait.
/// \returns true if any instruction was erased, rewritten or promoted.
// NOTE(review): embedded lines 1937 and 1950 are absent from this listing
// (presumably the start of the STORE_CNT update of `Wait` and the encoded
// value passed to the S_WAITCNT operand update) -- confirm against the
// upstream file. All remaining tokens are unchanged.
1866bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
1867 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1868 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1869 assert(isNormalMode(MaxCounter));
1870
1871 bool Modified = false;
// First S_WAITCNT / S_WAITCNT_VSCNT kept from the scanned range; any later
// instruction of the same kind is erased in the loop below.
1872 MachineInstr *WaitcntInstr = nullptr;
1873 MachineInstr *WaitcntVsCntInstr = nullptr;
1874
1875 LLVM_DEBUG({
1876 dbgs() << "PreGFX12::applyPreexistingWaitcnt at: ";
1877 if (It.isEnd())
1878 dbgs() << "end of block\n";
1879 else
1880 dbgs() << *It;
1881 });
1882
// early-inc iteration: the loop body may erase the instruction it is visiting.
1883 for (auto &II :
1884 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
1885 LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
1886 if (isNonWaitcntMetaInst(II)) {
1887 LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
1888 continue;
1889 }
1890
// An instruction is "soft" when its opcode differs from its non-soft form;
// only soft waits are simplified, and never when OptNone is set.
1891 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
1892 bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1893
1894 // Update required wait count. If this is a soft waitcnt (= it was added
1895 // by an earlier pass), it may be entirely removed.
1896 if (Opcode == AMDGPU::S_WAITCNT) {
1897 unsigned IEnc = II.getOperand(0).getImm();
1898 AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
1899 if (TrySimplify)
1900 ScoreBrackets.simplifyWaitcnt(OldWait);
1901 Wait = Wait.combined(OldWait);
1902
1903 // Merge consecutive waitcnt of the same type by erasing multiples.
1904 if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
1905 II.eraseFromParent();
1906 Modified = true;
1907 } else
1908 WaitcntInstr = &II;
1909 } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1910 assert(ST.hasVMemToLDSLoad());
1911 LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_lds_direct: " << II
1912 << "Before: " << Wait << '\n';);
1913 ScoreBrackets.determineWaitForLDSDMA(AMDGPU::LOAD_CNT, LDSDMA_BEGIN,
1914 Wait);
1915 LLVM_DEBUG(dbgs() << "After: " << Wait << '\n';);
1916
1917 // It is possible (but unlikely) that this is the only wait instruction,
1918 // in which case, we exit this loop without a WaitcntInstr to consume
1919 // `Wait`. But that works because `Wait` was passed in by reference, and
1920 // the caller eventually calls createNewWaitcnt on it. We test this
1921 // possibility in an artificial MIR test since such a situation cannot be
1922 // recreated by running the memory legalizer.
1923 II.eraseFromParent();
1924 } else if (Opcode == AMDGPU::WAIT_ASYNCMARK) {
// The WAIT_ASYNCMARK instruction itself is left in place; only its implied
// wait is merged into `Wait`.
1925 unsigned N = II.getOperand(0).getImm();
1926 LLVM_DEBUG(dbgs() << "Processing WAIT_ASYNCMARK: " << II << '\n';);
1927 AMDGPU::Waitcnt OldWait = ScoreBrackets.determineAsyncWait(N);
1928 Wait = Wait.combined(OldWait);
1929 } else {
1930 assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
1931 assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1932
1933 unsigned OldVSCnt =
1934 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1935 if (TrySimplify)
1936 ScoreBrackets.simplifyWaitcnt(AMDGPU::STORE_CNT, OldVSCnt);
1938 std::min(Wait.get(AMDGPU::STORE_CNT), OldVSCnt));
1939
1940 if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && TrySimplify)) {
1941 II.eraseFromParent();
1942 Modified = true;
1943 } else
1944 WaitcntVsCntInstr = &II;
1945 }
1946 }
1947
// Rewrite the surviving S_WAITCNT, record its LOAD/EXP/DS counts in the
// brackets, and reset those counters to ~0u in `Wait`.
1948 if (WaitcntInstr) {
1949 Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
1951 Modified |= promoteSoftWaitCnt(WaitcntInstr);
1952
1953 ScoreBrackets.applyWaitcnt(Wait, AMDGPU::LOAD_CNT);
1954 ScoreBrackets.applyWaitcnt(Wait, AMDGPU::EXP_CNT);
1955 ScoreBrackets.applyWaitcnt(Wait, AMDGPU::DS_CNT);
1956 Wait.set(AMDGPU::LOAD_CNT, ~0u);
1957 Wait.set(AMDGPU::EXP_CNT, ~0u);
1958 Wait.set(AMDGPU::DS_CNT, ~0u);
1959
1960 LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
1961 << "New Instr at block end: "
1962 << *WaitcntInstr << '\n'
1963 : dbgs() << "applied pre-existing waitcnt\n"
1964 << "Old Instr: " << *It
1965 << "New Instr: " << *WaitcntInstr << '\n');
1966 }
1967
// Same treatment for the surviving S_WAITCNT_VSCNT and the STORE_CNT counter.
1968 if (WaitcntVsCntInstr) {
1969 Modified |=
1970 updateOperandIfDifferent(*WaitcntVsCntInstr, AMDGPU::OpName::simm16,
1971 Wait.get(AMDGPU::STORE_CNT));
1972 Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);
1973
1974 ScoreBrackets.applyWaitcnt(AMDGPU::STORE_CNT, Wait.get(AMDGPU::STORE_CNT));
1975 Wait.set(AMDGPU::STORE_CNT, ~0u);
1976
1977 LLVM_DEBUG(It.isEnd()
1978 ? dbgs() << "applied pre-existing waitcnt\n"
1979 << "New Instr at block end: " << *WaitcntVsCntInstr
1980 << '\n'
1981 : dbgs() << "applied pre-existing waitcnt\n"
1982 << "Old Instr: " << *It
1983 << "New Instr: " << *WaitcntVsCntInstr << '\n');
1984 }
1985
1986 return Modified;
1987}
1988
1989/// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any
1990/// required counters in \p Wait
///
/// New instructions are built in \p Block before \p It. \p ScoreBrackets is
/// consulted only when ExpandWaitcntProfiling is set.
/// \returns true if at least one instruction was created.
// NOTE(review): embedded lines 2045 and 2081 are absent from this listing
// (presumably the immediate operands appended to the two BuildMI chains) --
// confirm against the upstream file. All remaining tokens are unchanged.
1991bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
1992 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
1993 AMDGPU::Waitcnt Wait, const WaitcntBrackets &ScoreBrackets) {
1994 assert(isNormalMode(MaxCounter));
1995
1996 bool Modified = false;
1997 const DebugLoc &DL = Block.findDebugLoc(It);
1998
1999 // Helper to emit expanded waitcnt sequence for profiling.
2000 // Emits waitcnts from (Outstanding-1) down to Target.
2001 // The EmitWaitcnt callback emits a single waitcnt.
// NOTE(review): the do-while always emits at least once. If it is entered
// with Outstanding <= Target it emits a wait for Outstanding-1, and the
// decrement wraps when Outstanding is 0. The GFX12+ helper guards this case
// explicitly -- confirm callers guarantee Outstanding > Target here.
2002 auto EmitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target,
2003 auto EmitWaitcnt) {
2004 do {
2005 EmitWaitcnt(--Outstanding);
2006 } while (Outstanding > Target);
2007 Modified = true;
2008 };
2009
2010 // Waits for VMcnt, LGKMcnt and/or EXPcnt are encoded together into a
2011 // single instruction while VScnt has its own instruction.
2012 if (Wait.hasWaitExceptStoreCnt()) {
2013 // If profiling expansion is enabled, emit an expanded sequence
2014 if (ExpandWaitcntProfiling) {
2015 // Check if any of the counters to be waited on are out-of-order.
2016 // If so, fall back to normal (non-expanded) behavior since expansion
2017 // would provide misleading profiling information.
2018 bool AnyOutOfOrder = false;
2019 for (auto CT : {AMDGPU::LOAD_CNT, AMDGPU::DS_CNT, AMDGPU::EXP_CNT}) {
2020 unsigned WaitCnt = Wait.get(CT);
2021 if (WaitCnt != ~0u && ScoreBrackets.counterOutOfOrder(CT)) {
2022 AnyOutOfOrder = true;
2023 break;
2024 }
2025 }
2026
2027 if (AnyOutOfOrder) {
2028 // Fall back to non-expanded wait
2029 unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
2030 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT)).addImm(Enc);
2031 Modified = true;
2032 } else {
2033 // All counters are in-order, safe to expand
2034 for (auto CT : {AMDGPU::LOAD_CNT, AMDGPU::DS_CNT, AMDGPU::EXP_CNT}) {
2035 unsigned WaitCnt = Wait.get(CT);
2036 if (WaitCnt == ~0u)
2037 continue;
2038
// Start the countdown no higher than the counter's maximum value minus one.
2039 unsigned Outstanding = std::min(ScoreBrackets.getOutstanding(CT),
2040 getWaitCountMax(getLimits(), CT) - 1);
2041 EmitExpandedWaitcnt(Outstanding, WaitCnt, [&](unsigned Count) {
// Each step waits on a single counter: W carries only CT.
2042 AMDGPU::Waitcnt W;
2043 W.set(CT, Count);
2044 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT))
2046 });
2047 }
2048 }
2049 } else {
2050 // Normal behavior: emit single combined waitcnt
2051 unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
2052 [[maybe_unused]] auto SWaitInst =
2053 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT)).addImm(Enc);
2054 Modified = true;
2055
2056 LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
2057 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2058 dbgs() << "New Instr: " << *SWaitInst << '\n');
2059 }
2060 }
2061
2062 if (Wait.hasWaitStoreCnt()) {
2063 assert(ST.hasVscnt());
2064
2065 if (ExpandWaitcntProfiling && Wait.get(AMDGPU::STORE_CNT) != ~0u &&
2066 !ScoreBrackets.counterOutOfOrder(AMDGPU::STORE_CNT)) {
2067 // Only expand if counter is not out-of-order
2068 unsigned Outstanding =
2069 std::min(ScoreBrackets.getOutstanding(AMDGPU::STORE_CNT),
2070 getWaitCountMax(getLimits(), AMDGPU::STORE_CNT) - 1);
2071 EmitExpandedWaitcnt(
2072 Outstanding, Wait.get(AMDGPU::STORE_CNT), [&](unsigned Count) {
2073 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_VSCNT))
2074 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
2075 .addImm(Count);
2076 });
2077 } else {
2078 [[maybe_unused]] auto SWaitInst =
2079 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_VSCNT))
2080 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
2082 Modified = true;
2083
2084 LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
2085 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2086 dbgs() << "New Instr: " << *SWaitInst << '\n');
2087 }
2088 }
2089
2090 return Modified;
2091}
2092
2093AMDGPU::Waitcnt
2094WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
2095 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST.hasVscnt() ? 0 : ~0u);
2096}
2097
2098AMDGPU::Waitcnt
2099WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
2100 unsigned ExpertVal = IsExpertMode ? 0 : ~0u;
2101 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,
2102 ~0u /* XCNT */, ~0u /* ASYNC_CNT */,
2103 ~0u /* TENSOR_CNT */, ExpertVal, ExpertVal);
2104}
2105
2106/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
2107/// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
2108/// were added by previous passes. Currently this pass conservatively
2109/// assumes that these preexisting waits are required for correctness.
///
/// On return, every counter that was folded into a surviving instruction has
/// been applied to \p ScoreBrackets and reset in \p Wait.
/// \returns true if any instruction was erased, rewritten or promoted.
// NOTE(review): embedded lines 2185-2186, 2313, 2337 and 2369-2370 are absent
// from this listing (presumably the DEPCTR field decoding, the declaration of
// `WaitsToErase`, the loop header that introduces `CT`, and the DEPCTR field
// re-encoding) -- confirm against the upstream file. All remaining tokens are
// unchanged.
2110bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
2111 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
2112 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
2113 assert(!isNormalMode(MaxCounter));
2114
2115 bool Modified = false;
// First instruction of each kind kept from the scanned range; later ones of
// the same kind are erased (or, for DEPCTR, possibly rewritten) in the loop.
2116 MachineInstr *CombinedLoadDsCntInstr = nullptr;
2117 MachineInstr *CombinedStoreDsCntInstr = nullptr;
2118 MachineInstr *WaitcntDepctrInstr = nullptr;
// Indexed by counter type: the kept single-counter wait for that counter.
2119 MachineInstr *WaitInstrs[AMDGPU::NUM_EXTENDED_INST_CNTS] = {};
2120
2121 LLVM_DEBUG({
2122 dbgs() << "GFX12Plus::applyPreexistingWaitcnt at: ";
2123 if (It.isEnd())
2124 dbgs() << "end of block\n";
2125 else
2126 dbgs() << *It;
2127 });
2128
2129 // Accumulate waits that should not be simplified.
2130 AMDGPU::Waitcnt RequiredWait;
2131
// early-inc iteration: the loop body may erase the instruction it is visiting.
2132 for (auto &II :
2133 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
2134 LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
2135 if (isNonWaitcntMetaInst(II)) {
2136 LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
2137 continue;
2138 }
2139
2140 // Update required wait count. If this is a soft waitcnt (= it was added
2141 // by an earlier pass), it may be entirely removed.
2142
// Soft waits go into `Wait` (simplifiable); everything else goes into
// `RequiredWait`.
2143 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
2144 bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
2145
2146 // Don't crash if the programmer used legacy waitcnt intrinsics, but don't
2147 // attempt to do more than that either.
2148 if (Opcode == AMDGPU::S_WAITCNT)
2149 continue;
2150
2151 if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
2152 unsigned OldEnc =
2153 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
2154 AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(IV, OldEnc);
2155 if (TrySimplify)
2156 Wait = Wait.combined(OldWait);
2157 else
2158 RequiredWait = RequiredWait.combined(OldWait);
2159 // Keep the first wait_loadcnt, erase the rest.
2160 if (CombinedLoadDsCntInstr == nullptr) {
2161 CombinedLoadDsCntInstr = &II;
2162 } else {
2163 II.eraseFromParent();
2164 Modified = true;
2165 }
2166 } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
2167 unsigned OldEnc =
2168 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
2169 AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(IV, OldEnc);
2170 if (TrySimplify)
2171 Wait = Wait.combined(OldWait);
2172 else
2173 RequiredWait = RequiredWait.combined(OldWait);
2174 // Keep the first wait_storecnt, erase the rest.
2175 if (CombinedStoreDsCntInstr == nullptr) {
2176 CombinedStoreDsCntInstr = &II;
2177 } else {
2178 II.eraseFromParent();
2179 Modified = true;
2180 }
2181 } else if (Opcode == AMDGPU::S_WAITCNT_DEPCTR) {
2182 unsigned OldEnc =
2183 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
2184 AMDGPU::Waitcnt OldWait;
2187 if (TrySimplify)
2188 ScoreBrackets.simplifyWaitcnt(OldWait);
2189 Wait = Wait.combined(OldWait);
2190 if (WaitcntDepctrInstr == nullptr) {
2191 WaitcntDepctrInstr = &II;
2192 } else {
2193 // S_WAITCNT_DEPCTR requires special care. Don't remove a
2194 // duplicate if it is waiting on things other than VA_VDST or
2195 // VM_VSRC. If that is the case, just make sure the VA_VDST and
2196 // VM_VSRC subfields of the operand are set to the "no wait"
2197 // values.
2198
2199 unsigned Enc =
2200 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
2201 Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Enc, ~0u);
2202 Enc = AMDGPU::DepCtr::encodeFieldVaVdst(Enc, ~0u);
2203
// If the result differs from the subtarget's default DEPCTR encoding, the
// duplicate still waits on some other field and is kept with the new operand.
2204 if (Enc != (unsigned)AMDGPU::DepCtr::getDefaultDepCtrEncoding(ST)) {
2205 Modified |= updateOperandIfDifferent(II, AMDGPU::OpName::simm16, Enc);
2206 Modified |= promoteSoftWaitCnt(&II);
2207 } else {
2208 II.eraseFromParent();
2209 Modified = true;
2210 }
2211 }
2212 } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
2213 // Architectures higher than GFX10 do not have direct loads to
2214 // LDS, so no work required here yet.
2215 II.eraseFromParent();
2216 Modified = true;
2217 } else if (Opcode == AMDGPU::WAIT_ASYNCMARK) {
2218 // Update the Waitcnt, but don't erase the wait.asyncmark() itself. It
2219 // shows up in the assembly as a comment with the original parameter N.
2220 unsigned N = II.getOperand(0).getImm();
2221 AMDGPU::Waitcnt OldWait = ScoreBrackets.determineAsyncWait(N);
2222 Wait = Wait.combined(OldWait);
2223 } else {
// Anything left must be a single-counter S_WAIT_*CNT (asserted below).
2224 std::optional<AMDGPU::InstCounterType> CT = counterTypeForInstr(Opcode);
2225 assert(CT.has_value());
2226 unsigned OldCnt =
2227 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
2228 if (TrySimplify)
2229 addWait(Wait, CT.value(), OldCnt);
2230 else
2231 addWait(RequiredWait, CT.value(), OldCnt);
2232 // Keep the first wait of its kind, erase the rest.
2233 if (WaitInstrs[CT.value()] == nullptr) {
2234 WaitInstrs[CT.value()] = &II;
2235 } else {
2236 II.eraseFromParent();
2237 Modified = true;
2238 }
2239 }
2240 }
2241
// NOTE(review): presumably this simplifies `Wait` against the combined
// requirement before the non-simplifiable waits are folded back in -- confirm
// against the two-argument simplifyWaitcnt overload.
2242 ScoreBrackets.simplifyWaitcnt(Wait.combined(RequiredWait), Wait);
2243 Wait = Wait.combined(RequiredWait);
2244
2245 if (CombinedLoadDsCntInstr) {
2246 // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
2247 // to be waited for. Otherwise, let the instruction be deleted so
2248 // the appropriate single counter wait instruction can be inserted
2249 // instead, when new S_WAIT_*CNT instructions are inserted by
2250 // createNewWaitcnt(). As a side effect, resetting the wait counts will
2251 // cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by
2252 // the loop below that deals with single counter instructions.
2253 //
2254 // A wait for LOAD_CNT or DS_CNT implies a wait for VM_VSRC, since
2255 // instructions that have decremented LOAD_CNT or DS_CNT on completion
2256 // will have needed to wait for their register sources to be available
2257 // first.
2258 if (Wait.get(AMDGPU::LOAD_CNT) != ~0u && Wait.get(AMDGPU::DS_CNT) != ~0u) {
2259 unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
2260 Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr,
2261 AMDGPU::OpName::simm16, NewEnc);
2262 Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
2263 ScoreBrackets.applyWaitcnt(AMDGPU::LOAD_CNT, Wait.get(AMDGPU::LOAD_CNT));
2264 ScoreBrackets.applyWaitcnt(AMDGPU::DS_CNT, Wait.get(AMDGPU::DS_CNT));
2265 Wait.set(AMDGPU::LOAD_CNT, ~0u);
2266 Wait.set(AMDGPU::DS_CNT, ~0u);
2267
2268 LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
2269 << "New Instr at block end: "
2270 << *CombinedLoadDsCntInstr << '\n'
2271 : dbgs() << "applied pre-existing waitcnt\n"
2272 << "Old Instr: " << *It << "New Instr: "
2273 << *CombinedLoadDsCntInstr << '\n');
2274 } else {
2275 CombinedLoadDsCntInstr->eraseFromParent();
2276 Modified = true;
2277 }
2278 }
2279
2280 if (CombinedStoreDsCntInstr) {
2281 // Similarly for S_WAIT_STORECNT_DSCNT.
2282 if (Wait.get(AMDGPU::STORE_CNT) != ~0u && Wait.get(AMDGPU::DS_CNT) != ~0u) {
2283 unsigned NewEnc = AMDGPU::encodeStorecntDscnt(IV, Wait);
2284 Modified |= updateOperandIfDifferent(*CombinedStoreDsCntInstr,
2285 AMDGPU::OpName::simm16, NewEnc);
2286 Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
2287 ScoreBrackets.applyWaitcnt(Wait, AMDGPU::STORE_CNT);
2288 ScoreBrackets.applyWaitcnt(Wait, AMDGPU::DS_CNT);
2289 Wait.set(AMDGPU::STORE_CNT, ~0u);
2290 Wait.set(AMDGPU::DS_CNT, ~0u);
2291
2292 LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
2293 << "New Instr at block end: "
2294 << *CombinedStoreDsCntInstr << '\n'
2295 : dbgs() << "applied pre-existing waitcnt\n"
2296 << "Old Instr: " << *It << "New Instr: "
2297 << *CombinedStoreDsCntInstr << '\n');
2298 } else {
2299 CombinedStoreDsCntInstr->eraseFromParent();
2300 Modified = true;
2301 }
2302 }
2303
2304 // Look for an opportunity to convert existing S_WAIT_LOADCNT,
2305 // S_WAIT_STORECNT and S_WAIT_DSCNT into new S_WAIT_LOADCNT_DSCNT
2306 // or S_WAIT_STORECNT_DSCNT. This is achieved by selectively removing
2307 // instructions so that createNewWaitcnt() will create new combined
2308 // instructions to replace them.
2309
2310 if (Wait.get(AMDGPU::DS_CNT) != ~0u) {
2311 // This is a vector of addresses in WaitInstrs pointing to instructions
2312 // that should be removed if they are present.
2314
2315 // If it's known that both DScnt and either LOADcnt or STOREcnt (but not
2316 // both) need to be waited for, ensure that there are no existing
2317 // individual wait count instructions for these.
2318
2319 if (Wait.get(AMDGPU::LOAD_CNT) != ~0u) {
2320 WaitsToErase.push_back(&WaitInstrs[AMDGPU::LOAD_CNT]);
2321 WaitsToErase.push_back(&WaitInstrs[AMDGPU::DS_CNT]);
2322 } else if (Wait.get(AMDGPU::STORE_CNT) != ~0u) {
2323 WaitsToErase.push_back(&WaitInstrs[AMDGPU::STORE_CNT]);
2324 WaitsToErase.push_back(&WaitInstrs[AMDGPU::DS_CNT]);
2325 }
2326
// Erase through the slot address so the WaitInstrs entry is nulled as well and
// the per-counter pass below skips it.
2327 for (MachineInstr **WI : WaitsToErase) {
2328 if (!*WI)
2329 continue;
2330
2331 (*WI)->eraseFromParent();
2332 *WI = nullptr;
2333 Modified = true;
2334 }
2335 }
2336
// Per-counter pass: a kept single-counter wait is either rewritten to the
// final count, or erased when that counter no longer needs a wait.
2338 if (!WaitInstrs[CT])
2339 continue;
2340
2341 unsigned NewCnt = Wait.get(CT);
2342 if (NewCnt != ~0u) {
2343 Modified |= updateOperandIfDifferent(*WaitInstrs[CT],
2344 AMDGPU::OpName::simm16, NewCnt);
2345 Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);
2346
2347 ScoreBrackets.applyWaitcnt(CT, NewCnt);
2348 setNoWait(Wait, CT);
2349
2350 LLVM_DEBUG(It.isEnd()
2351 ? dbgs() << "applied pre-existing waitcnt\n"
2352 << "New Instr at block end: " << *WaitInstrs[CT]
2353 << '\n'
2354 : dbgs() << "applied pre-existing waitcnt\n"
2355 << "Old Instr: " << *It
2356 << "New Instr: " << *WaitInstrs[CT] << '\n');
2357 } else {
2358 WaitInstrs[CT]->eraseFromParent();
2359 Modified = true;
2360 }
2361 }
2362
2363 if (WaitcntDepctrInstr) {
2364 // Get the encoded Depctr immediate and override the VA_VDST and VM_VSRC
2365 // subfields with the new required values.
2366 unsigned Enc =
2367 TII.getNamedOperand(*WaitcntDepctrInstr, AMDGPU::OpName::simm16)
2368 ->getImm();
2371
2372 ScoreBrackets.applyWaitcnt(AMDGPU::VA_VDST, Wait.get(AMDGPU::VA_VDST));
2373 ScoreBrackets.applyWaitcnt(AMDGPU::VM_VSRC, Wait.get(AMDGPU::VM_VSRC));
2374 Wait.set(AMDGPU::VA_VDST, ~0u);
2375 Wait.set(AMDGPU::VM_VSRC, ~0u);
2376
2377 // If that new encoded Depctr immediate would actually still wait
2378 // for anything, update the instruction's operand. Otherwise it can
2379 // just be deleted.
2380 if (Enc != (unsigned)AMDGPU::DepCtr::getDefaultDepCtrEncoding(ST)) {
2381 Modified |= updateOperandIfDifferent(*WaitcntDepctrInstr,
2382 AMDGPU::OpName::simm16, Enc);
2383 LLVM_DEBUG(It.isEnd() ? dbgs() << "applyPreexistingWaitcnt\n"
2384 << "New Instr at block end: "
2385 << *WaitcntDepctrInstr << '\n'
2386 : dbgs() << "applyPreexistingWaitcnt\n"
2387 << "Old Instr: " << *It << "New Instr: "
2388 << *WaitcntDepctrInstr << '\n');
2389 } else {
2390 WaitcntDepctrInstr->eraseFromParent();
2391 Modified = true;
2392 }
2393 }
2394
2395 return Modified;
2396}
2397
2398/// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
///
/// New instructions are built in \p Block before \p It. \p ScoreBrackets is
/// consulted only when ExpandWaitcntProfiling is set.
/// \returns true if at least one instruction was created.
// NOTE(review): embedded lines 2419, 2477 and 2496-2497 are absent from this
// listing (presumably the two loop headers that introduce `CT` and the
// computation of the DEPCTR encoding) -- confirm against the upstream file.
// All remaining tokens are unchanged.
2399bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
2400 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
2401 AMDGPU::Waitcnt Wait, const WaitcntBrackets &ScoreBrackets) {
2402 assert(!isNormalMode(MaxCounter));
2403
2404 bool Modified = false;
2405 const DebugLoc &DL = Block.findDebugLoc(It);
2406
2407 // Helper to emit expanded waitcnt sequence for profiling.
// Emits waits for Outstanding-1 down to Target+1, then always one for Target.
// The `I != ~0u` test stops the loop if Outstanding-1 wrapped around.
2408 auto EmitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target,
2409 auto EmitWaitcnt) {
2410 for (unsigned I = Outstanding - 1; I > Target && I != ~0u; --I)
2411 EmitWaitcnt(I);
2412 EmitWaitcnt(Target);
2413 Modified = true;
2414 };
2415
2416 // For GFX12+, we use separate wait instructions, which makes expansion
2417 // simpler
2418 if (ExpandWaitcntProfiling) {
2420 unsigned Count = Wait.get(CT);
2421 if (Count == ~0u)
2422 continue;
2423
2424 // Skip expansion for out-of-order counters - emit normal wait instead
2425 if (ScoreBrackets.counterOutOfOrder(CT)) {
2426 BuildMI(Block, It, DL, TII.get(instrsForExtendedCounterTypes[CT]))
2427 .addImm(Count);
2428 Modified = true;
2429 continue;
2430 }
2431
// Start the countdown no higher than the counter's maximum value minus one.
2432 unsigned Outstanding = std::min(ScoreBrackets.getOutstanding(CT),
2433 getWaitCountMax(getLimits(), CT) - 1);
2434 EmitExpandedWaitcnt(Outstanding, Count, [&](unsigned Val) {
2435 BuildMI(Block, It, DL, TII.get(instrsForExtendedCounterTypes[CT]))
2436 .addImm(Val);
2437 });
2438 }
// NOTE(review): this early return also skips the S_WAITCNT_DEPCTR emission at
// the end of the function -- confirm that is intended when profiling
// expansion is enabled.
2439 return Modified;
2440 }
2441
2442 // Normal behavior (no expansion)
2443 // Check for opportunities to use combined wait instructions.
2444 if (Wait.get(AMDGPU::DS_CNT) != ~0u) {
2445 MachineInstr *SWaitInst = nullptr;
2446
// LOAD_CNT takes priority over STORE_CNT for pairing with DS_CNT; the paired
// counters are reset so the per-counter loop below does not emit them again.
2447 if (Wait.get(AMDGPU::LOAD_CNT) != ~0u) {
2448 unsigned Enc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
2449
2450 SWaitInst = BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
2451 .addImm(Enc);
2452
2453 Wait.set(AMDGPU::LOAD_CNT, ~0u);
2454 Wait.set(AMDGPU::DS_CNT, ~0u);
2455 } else if (Wait.get(AMDGPU::STORE_CNT) != ~0u) {
2456 unsigned Enc = AMDGPU::encodeStorecntDscnt(IV, Wait);
2457
2458 SWaitInst = BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAIT_STORECNT_DSCNT))
2459 .addImm(Enc);
2460
2461 Wait.set(AMDGPU::STORE_CNT, ~0u);
2462 Wait.set(AMDGPU::DS_CNT, ~0u);
2463 }
2464
2465 if (SWaitInst) {
2466 Modified = true;
2467
2468 LLVM_DEBUG(dbgs() << "GFX12Plus::createNewWaitcnt\n";
2469 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2470 dbgs() << "New Instr: " << *SWaitInst << '\n');
2471 }
2472 }
2473
2474 // Generate an instruction for any remaining counter that needs
2475 // waiting for.
2476
2478 unsigned Count = Wait.get(CT);
2479 if (Count == ~0u)
2480 continue;
2481
2482 [[maybe_unused]] auto SWaitInst =
2483 BuildMI(Block, It, DL, TII.get(instrsForExtendedCounterTypes[CT]))
2484 .addImm(Count);
2485
2486 Modified = true;
2487
2488 LLVM_DEBUG(dbgs() << "GFX12Plus::createNewWaitcnt\n";
2489 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2490 dbgs() << "New Instr: " << *SWaitInst << '\n');
2491 }
2492
// A DEPCTR wait is only expected in expert mode (asserted below).
2493 if (Wait.hasWaitDepctr()) {
2494 assert(IsExpertMode);
2495 unsigned Enc =
2498
2499 [[maybe_unused]] auto SWaitInst =
2500 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_DEPCTR)).addImm(Enc);
2501
2502 Modified = true;
2503
2504 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
2505 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2506 dbgs() << "New Instr: " << *SWaitInst << '\n');
2507 }
2508
2509 return Modified;
2510}
2511
2512/// Generate s_waitcnt instruction to be placed before cur_Inst.
2513/// Instructions of a given type are returned in order,
2514/// but instructions of different types can complete out of order.
2515/// We rely on this in-order completion
2516/// and simply assign a score to the memory access instructions.
2517/// We keep track of the active "score bracket" to determine
2518/// if an access of a memory read requires an s_waitcnt
2519/// and if so what the value of each counter is.
2520/// The "score bracket" is bound by the lower bound and upper bound
2521/// scores (*_score_LB and *_score_ub respectively).
2522/// If FlushFlags.FlushVmCnt is true, we want to flush the vmcnt counter here.
2523/// If FlushFlags.FlushDsCnt is true, we want to flush the dscnt counter here
2524/// (GFX12+ only, where DS_CNT is a separate counter).
2525bool SIInsertWaitcnts::generateWaitcntInstBefore(
2526 MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
2527 MachineInstr *OldWaitcntInstr, PreheaderFlushFlags FlushFlags) {
2528 LLVM_DEBUG(dbgs() << "\n*** GenerateWaitcntInstBefore: "; MI.print(dbgs()););
2529 setForceEmitWaitcnt();
2530
2531 assert(!isNonWaitcntMetaInst(MI));
2532
2533 AMDGPU::Waitcnt Wait;
2534 const unsigned Opc = MI.getOpcode();
2535
2536 switch (Opc) {
2537 case AMDGPU::BUFFER_WBINVL1:
2538 case AMDGPU::BUFFER_WBINVL1_SC:
2539 case AMDGPU::BUFFER_WBINVL1_VOL:
2540 case AMDGPU::BUFFER_GL0_INV:
2541 case AMDGPU::BUFFER_GL1_INV: {
2542 // FIXME: This should have already been handled by the memory legalizer.
2543 // Removing this currently doesn't affect any lit tests, but we need to
2544 // verify that nothing was relying on this. The number of buffer invalidates
2545 // being handled here should not be expanded.
2546 Wait.set(AMDGPU::LOAD_CNT, 0);
2547 break;
2548 }
2549 case AMDGPU::SI_RETURN_TO_EPILOG:
2550 case AMDGPU::SI_RETURN:
2551 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN:
2552 case AMDGPU::S_SETPC_B64_return: {
2553 // All waits must be resolved at call return.
2554 // NOTE: this could be improved with knowledge of all call sites or
2555 // with knowledge of the called routines.
2556 ReturnInsts.insert(&MI);
2557 AMDGPU::Waitcnt AllZeroWait =
2558 WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
2559 // On GFX12+, if LOAD_CNT is pending but no VGPRs are waiting for loads
2560 // (e.g., only GLOBAL_INV is pending), we can skip waiting on loadcnt.
2561 // GLOBAL_INV increments loadcnt but doesn't write to VGPRs, so there's
2562 // no need to wait for it at function boundaries.
2563 if (ST.hasExtendedWaitCounts() &&
2564 !ScoreBrackets.hasPendingEvent(VMEM_ACCESS))
2565 AllZeroWait.set(AMDGPU::LOAD_CNT, ~0u);
2566 Wait = AllZeroWait;
2567 break;
2568 }
2569 case AMDGPU::S_ENDPGM:
2570 case AMDGPU::S_ENDPGM_SAVED: {
2571 // In dynamic VGPR mode, we want to release the VGPRs before the wave exits.
2572 // Technically the hardware will do this on its own if we don't, but that
2573 // might cost extra cycles compared to doing it explicitly.
2574 // When not in dynamic VGPR mode, identify S_ENDPGM instructions which may
2575 // have to wait for outstanding VMEM stores. In this case it can be useful
2576 // to send a message to explicitly release all VGPRs before the stores have
2577 // completed, but it is only safe to do this if there are no outstanding
2578 // scratch stores.
2579 EndPgmInsts[&MI] = !ScoreBrackets.empty(AMDGPU::STORE_CNT) &&
2580 !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS);
2581 break;
2582 }
2583 case AMDGPU::S_SENDMSG:
2584 case AMDGPU::S_SENDMSGHALT: {
2585 if (ST.hasLegacyGeometry() &&
2586 ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
2588 // Resolve vm waits before gs-done.
2589 Wait.set(AMDGPU::LOAD_CNT, 0);
2590 break;
2591 }
2592 [[fallthrough]];
2593 }
2594 default: {
2595
2596 // Export & GDS instructions do not read the EXEC mask until after the
2597 // export is granted (which can occur well after the instruction is issued).
2598 // The shader program must flush all EXP operations on the export-count
2599 // before overwriting the EXEC mask.
2600 if (MI.modifiesRegister(AMDGPU::EXEC, &TRI)) {
2601 // Export and GDS are tracked individually, either may trigger a waitcnt
2602 // for EXEC.
2603 if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
2604 ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
2605 ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
2606 ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
2607 Wait.set(AMDGPU::EXP_CNT, 0);
2608 }
2609 }
2610
2611 // Wait for any pending GDS instruction to complete before any
2612 // "Always GDS" instruction.
2613 if (TII.isAlwaysGDS(Opc) && ScoreBrackets.hasPendingGDS())
2614 addWait(Wait, AMDGPU::DS_CNT, ScoreBrackets.getPendingGDSWait());
2615
2616 if (MI.isCall()) {
2617 // The function is going to insert a wait on everything in its prolog.
2618 // This still needs to be careful if the call target is a load (e.g. a GOT
2619 // load). We also need to check WAW dependency with saved PC.
2620 CallInsts.insert(&MI);
2621 Wait = AMDGPU::Waitcnt();
2622
2623 const MachineOperand &CallAddrOp = TII.getCalleeOperand(MI);
2624 if (CallAddrOp.isReg()) {
2625 ScoreBrackets.determineWaitForPhysReg(
2626 SmemAccessCounter, CallAddrOp.getReg().asMCReg(), Wait, MI);
2627
2628 if (const auto *RtnAddrOp =
2629 TII.getNamedOperand(MI, AMDGPU::OpName::dst)) {
2630 ScoreBrackets.determineWaitForPhysReg(
2631 SmemAccessCounter, RtnAddrOp->getReg().asMCReg(), Wait, MI);
2632 }
2633 }
2634 } else if (Opc == AMDGPU::S_BARRIER_WAIT) {
2635 ScoreBrackets.tryClearSCCWriteEvent(&MI);
2636 } else {
2637 // FIXME: Should not be relying on memoperands.
2638 // Look at the source operands of every instruction to see if
2639 // any of them results from a previous memory operation that affects
2640 // its current usage. If so, an s_waitcnt instruction needs to be
2641 // emitted.
2642 // If the source operand was defined by a load, add the s_waitcnt
2643 // instruction.
2644 //
2645 // Two cases are handled for destination operands:
2646 // 1) If the destination operand was defined by a load, add the s_waitcnt
2647 // instruction to guarantee the right WAW order.
2648 // 2) If a destination operand that was used by a recent export/store ins,
2649 // add s_waitcnt on exp_cnt to guarantee the WAR order.
2650
2651 for (const MachineMemOperand *Memop : MI.memoperands()) {
2652 const Value *Ptr = Memop->getValue();
2653 if (Memop->isStore()) {
2654 if (auto It = SLoadAddresses.find(Ptr); It != SLoadAddresses.end()) {
2655 addWait(Wait, SmemAccessCounter, 0);
2656 if (PDT.dominates(MI.getParent(), It->second))
2657 SLoadAddresses.erase(It);
2658 }
2659 }
2660 unsigned AS = Memop->getAddrSpace();
2662 continue;
2663 // No need to wait before load from VMEM to LDS.
2664 if (TII.mayWriteLDSThroughDMA(MI))
2665 continue;
2666
2667 // LOAD_CNT is only relevant to vgpr or LDS.
2668 unsigned TID = LDSDMA_BEGIN;
2669 if (Ptr && Memop->getAAInfo()) {
2670 const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
2671 for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
2672 if (MI.mayAlias(AA, *LDSDMAStores[I], true)) {
2673 if ((I + 1) >= NUM_LDSDMA) {
2674 // We didn't have enough slot to track this LDS DMA store, it
2675 // has been tracked using the common RegNo (FIRST_LDS_VGPR).
2676 ScoreBrackets.determineWaitForLDSDMA(AMDGPU::LOAD_CNT, TID,
2677 Wait);
2678 break;
2679 }
2680
2681 ScoreBrackets.determineWaitForLDSDMA(AMDGPU::LOAD_CNT,
2682 TID + I + 1, Wait);
2683 }
2684 }
2685 } else {
2686 ScoreBrackets.determineWaitForLDSDMA(AMDGPU::LOAD_CNT, TID, Wait);
2687 }
2688 if (Memop->isStore()) {
2689 ScoreBrackets.determineWaitForLDSDMA(AMDGPU::EXP_CNT, TID, Wait);
2690 }
2691 }
2692
2693 // Loop over use and def operands.
2694 for (const MachineOperand &Op : MI.operands()) {
2695 if (!Op.isReg())
2696 continue;
2697
2698 // If the instruction does not read tied source, skip the operand.
2699 if (Op.isTied() && Op.isUse() && TII.doesNotReadTiedSource(MI))
2700 continue;
2701
2702 MCPhysReg Reg = Op.getReg().asMCReg();
2703
2704 const bool IsVGPR = TRI.isVectorRegister(MRI, Op.getReg());
2705 if (IsVGPR) {
2706 // Implicit VGPR defs and uses are never a part of the memory
2707 // instructions description and usually present to account for
2708 // super-register liveness.
2709 // TODO: Most of the other instructions also have implicit uses
2710 // for the liveness accounting only.
2711 if (Op.isImplicit() && MI.mayLoadOrStore())
2712 continue;
2713
2714 ScoreBrackets.determineWaitForPhysReg(AMDGPU::VA_VDST, Reg, Wait, MI);
2715 if (Op.isDef())
2716 ScoreBrackets.determineWaitForPhysReg(AMDGPU::VM_VSRC, Reg, Wait,
2717 MI);
2718 // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
2719 // previous write and this write are the same type of VMEM
2720 // instruction, in which case they are (in some architectures)
2721 // guaranteed to write their results in order anyway.
2722 // Additionally check instructions where Point Sample Acceleration
2723 // might be applied.
2724 if (Op.isUse() || !updateVMCntOnly(MI) ||
2725 ScoreBrackets.hasOtherPendingVmemTypes(Reg, getVmemType(MI)) ||
2726 ScoreBrackets.hasPointSamplePendingVmemTypes(MI, Reg) ||
2727 !ST.hasVmemWriteVgprInOrder()) {
2728 ScoreBrackets.determineWaitForPhysReg(AMDGPU::LOAD_CNT, Reg, Wait,
2729 MI);
2730 ScoreBrackets.determineWaitForPhysReg(AMDGPU::SAMPLE_CNT, Reg, Wait,
2731 MI);
2732 ScoreBrackets.determineWaitForPhysReg(AMDGPU::BVH_CNT, Reg, Wait,
2733 MI);
2734 ScoreBrackets.clearVgprVmemTypes(Reg);
2735 }
2736
2737 if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
2738 ScoreBrackets.determineWaitForPhysReg(AMDGPU::EXP_CNT, Reg, Wait,
2739 MI);
2740 }
2741 ScoreBrackets.determineWaitForPhysReg(AMDGPU::DS_CNT, Reg, Wait, MI);
2742 } else if (Op.getReg() == AMDGPU::SCC) {
2743 ScoreBrackets.determineWaitForPhysReg(AMDGPU::KM_CNT, Reg, Wait, MI);
2744 } else {
2745 ScoreBrackets.determineWaitForPhysReg(SmemAccessCounter, Reg, Wait,
2746 MI);
2747 }
2748
2749 if (ST.hasWaitXcnt() && Op.isDef())
2750 ScoreBrackets.determineWaitForPhysReg(AMDGPU::X_CNT, Reg, Wait, MI);
2751 }
2752 }
2753 }
2754 }
2755
2756 // Ensure safety against exceptions from outstanding memory operations while
2757 // waiting for a barrier:
2758 //
2759 // * Some subtargets safely handle backing off the barrier in hardware
2760 // when an exception occurs.
2761 // * Some subtargets have an implicit S_WAITCNT 0 before barriers, so that
2762 // there can be no outstanding memory operations during the wait.
2763 // * Subtargets with split barriers don't need to back off the barrier; it
2764 // is up to the trap handler to preserve the user barrier state correctly.
2765 //
2766 // In all other cases, ensure safety by ensuring that there are no outstanding
2767 // memory operations.
2768 if (Opc == AMDGPU::S_BARRIER && !ST.hasAutoWaitcntBeforeBarrier() &&
2769 !ST.hasBackOffBarrier()) {
2770 Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
2771 }
2772
2773 // TODO: Remove this work-around, enable the assert for Bug 457939
2774 // after fixing the scheduler. Also, the Shader Compiler code is
2775 // independent of target.
2776 if (SIInstrInfo::isCBranchVCCZRead(MI) && ST.hasReadVCCZBug() &&
2777 ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
2778 Wait.set(AMDGPU::DS_CNT, 0);
2779 }
2780
2781 // Verify that the wait is actually needed.
2782 ScoreBrackets.simplifyWaitcnt(Wait);
2783
2784 // It is only necessary to insert an S_WAITCNT_DEPCTR instruction that
2785 // waits on VA_VDST if the instruction it would precede is not a VALU
2786 // instruction, since hardware handles VALU->VGPR->VALU hazards in
2787 // expert scheduling mode.
2788 if (TII.isVALU(MI))
2789 Wait.set(AMDGPU::VA_VDST, ~0u);
2790
2791 // Since the translation for VMEM addresses occur in-order, we can apply the
2792 // XCnt if the current instruction is of VMEM type and has a memory
2793 // dependency with another VMEM instruction in flight.
2794 if (Wait.get(AMDGPU::X_CNT) != ~0u && isVmemAccess(MI)) {
2795 ScoreBrackets.applyWaitcnt(Wait, AMDGPU::X_CNT);
2796 Wait.set(AMDGPU::X_CNT, ~0u);
2797 }
2798
2799 // When forcing emit, we need to skip terminators because that would break the
2800 // terminators of the MBB if we emit a waitcnt between terminators.
2801 if (ForceEmitZeroFlag && !MI.isTerminator())
2802 Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
2803
2804 // If we force waitcnt then update Wait accordingly.
2806 if (!ForceEmitWaitcnt[T])
2807 continue;
2808 Wait.set(T, 0);
2809 }
2810
2811 if (FlushFlags.FlushVmCnt) {
2814 Wait.set(T, 0);
2815 }
2816
2817 if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(AMDGPU::DS_CNT))
2818 Wait.set(AMDGPU::DS_CNT, 0);
2819
2820 if (ForceEmitZeroLoadFlag && Wait.get(AMDGPU::LOAD_CNT) != ~0u)
2821 Wait.set(AMDGPU::LOAD_CNT, 0);
2822
2823 return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
2824 OldWaitcntInstr);
2825}
2826
/// Materialize the wait described by \p Wait at position `It` in \p Block and
/// record it in \p ScoreBrackets.
///
/// The required counts are satisfied in up to three steps: by updating the
/// pre-existing wait instruction(s) starting at \p OldWaitcntInstr (when
/// non-null), by lowering EXP_CNT into the `waitexp` operand of `*It`, and by
/// asking the waitcnt generator to create new instructions for what is left.
///
/// \returns true if any of these steps reported a modification.
2827bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
     // NOTE(review): the declaration of the `It` parameter used below is not
     // visible in this listing.
2829 MachineBasicBlock &Block,
2830 WaitcntBrackets &ScoreBrackets,
2831 MachineInstr *OldWaitcntInstr) {
2832 bool Modified = false;
2833
2834 if (OldWaitcntInstr)
2835 // Try to merge the required wait with preexisting waitcnt instructions.
2836 // Also erase redundant waitcnt.
2837 Modified =
2838 WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
2839
2840 // ExpCnt can be merged into VINTERP.
2841 if (Wait.get(AMDGPU::EXP_CNT) != ~0u && It != Block.instr_end() &&
     // NOTE(review): the remainder of this condition is not visible in this
     // listing; the body below relies on *It having a `waitexp` operand.
2843 MachineOperand *WaitExp = TII.getNamedOperand(*It, AMDGPU::OpName::waitexp);
     // Only ever lower the immediate; an already smaller value is kept.
2844 if (Wait.get(AMDGPU::EXP_CNT) < WaitExp->getImm()) {
2845 WaitExp->setImm(Wait.get(AMDGPU::EXP_CNT));
2846 Modified = true;
2847 }
2848 // Apply ExpCnt before resetting it, so applyWaitcnt below sees all counts.
2849 ScoreBrackets.applyWaitcnt(Wait, AMDGPU::EXP_CNT);
2850 Wait.set(AMDGPU::EXP_CNT, ~0u);
2851
2852 LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
2853 << "Update Instr: " << *It);
2854 }
2855
     // Emit new wait instruction(s) for whatever is still required.
2856 if (WCG->createNewWaitcnt(Block, It, Wait, ScoreBrackets))
2857 Modified = true;
2858
2859 // Any counts that could have been applied to any existing waitcnt
2860 // instructions will have been done so, now deal with any remaining.
2861 ScoreBrackets.applyWaitcnt(Wait);
2862
2863 return Modified;
2864}
2865
/// Classify \p Inst into the wait event type used for expert scheduling mode.
///
/// VALU instructions map to one of the VGPR_*_WRITE events; FLAT, DS and
/// VMEM/VIMAGE/VSAMPLE instructions map to one of the VGPR_*_READ events.
/// \returns an empty optional when \p Inst falls into none of these classes.
2866std::optional<WaitEventType>
2867SIInsertWaitcnts::getExpertSchedulingEventType(const MachineInstr &Inst) const {
2868 if (TII.isVALU(Inst)) {
2869 // Core/Side-, DP-, XDL- and TRANS-MACC VALU instructions complete
2870 // out-of-order with respect to each other, so each of these classes
2871 // has its own event.
2872
2873 if (TII.isXDL(Inst))
2874 return VGPR_XDL_WRITE;
2875
2876 if (TII.isTRANS(Inst))
2877 return VGPR_TRANS_WRITE;
2878
     // NOTE(review): the condition guarding the following return is not
     // visible in this listing.
2880 return VGPR_DPMACC_WRITE;
2881
     // Every remaining VALU instruction is reported as VGPR_CSMACC_WRITE.
2882 return VGPR_CSMACC_WRITE;
2883 }
2884
2885 // FLAT and LDS instructions may read their VGPR sources out-of-order
2886 // with respect to each other and all other VMEM instructions, so
2887 // each of these also has a separate event.
2888
     // FLAT is tested before DS and VMEM, so it wins if several predicates hold.
2889 if (TII.isFLAT(Inst))
2890 return VGPR_FLAT_READ;
2891
2892 if (TII.isDS(Inst))
2893 return VGPR_LDS_READ;
2894
2895 if (TII.isVMEM(Inst) || TII.isVIMAGE(Inst) || TII.isVSAMPLE(Inst))
2896 return VGPR_VMEM_READ;
2897
2898 // Otherwise, no hazard.
2899
2900 return {};
2901}
2902
2903bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
2904 return (TII.isFLAT(MI) && TII.mayAccessVMEMThroughFlat(MI)) ||
2905 (TII.isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode()));
2906}
2907
2908// Return true if the next instruction is S_ENDPGM, following fallthrough
2909// blocks if necessary.
2910bool SIInsertWaitcnts::isNextENDPGM(MachineBasicBlock::instr_iterator It,
2911 MachineBasicBlock *Block) const {
2912 auto BlockEnd = Block->getParent()->end();
2913 auto BlockIter = Block->getIterator();
2914
2915 while (true) {
2916 if (It.isEnd()) {
2917 if (++BlockIter != BlockEnd) {
2918 It = BlockIter->instr_begin();
2919 continue;
2920 }
2921
2922 return false;
2923 }
2924
2925 if (!It->isMetaInstruction())
2926 break;
2927
2928 It++;
2929 }
2930
2931 assert(!It.isEnd());
2932
2933 return It->getOpcode() == AMDGPU::S_ENDPGM;
2934}
2935
2936// Add a wait after an instruction if architecture requirements mandate one.
//
// Two cases are handled: with precise memory enabled, every instruction that
// may load or store is followed by an all-zero wait; and every "always GDS"
// instruction is followed by a DS_CNT(0) wait. The wait is simplified against
// \p ScoreBrackets before being emitted right after \p Inst.
//
// Returns the result of generateWaitcnt() for that wait.
2937bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
2938 MachineBasicBlock &Block,
2939 WaitcntBrackets &ScoreBrackets) {
2940 AMDGPU::Waitcnt Wait;
2941 bool NeedsEndPGMCheck = false;
2942
2943 if (ST.isPreciseMemoryEnabled() && Inst.mayLoadOrStore())
2944 Wait = WCG->getAllZeroWaitcnt(Inst.mayStore() &&
     // NOTE(review): the rest of the argument expression above is not visible
     // in this listing.
2946
2947 if (TII.isAlwaysGDS(Inst.getOpcode())) {
2948 Wait.set(AMDGPU::DS_CNT, 0);
2949 NeedsEndPGMCheck = true;
2950 }
2951
     // Drop the counts that ScoreBrackets shows to be unnecessary.
2952 ScoreBrackets.simplifyWaitcnt(Wait);
2953
2954 auto SuccessorIt = std::next(Inst.getIterator());
2955 bool Result = generateWaitcnt(Wait, SuccessorIt, Block, ScoreBrackets,
2956 /*OldWaitcntInstr=*/nullptr);
2957
     // If a wait was emitted after an always-GDS instruction and the next real
     // instruction is S_ENDPGM, put an S_NOP 0 in front of SuccessorIt.
2958 if (Result && NeedsEndPGMCheck && isNextENDPGM(SuccessorIt, &Block)) {
2959 BuildMI(Block, SuccessorIt, Inst.getDebugLoc(), TII.get(AMDGPU::S_NOP))
2960 .addImm(0);
2961 }
2962
2963 return Result;
2964}
2965
/// Compute the set of wait events that \p Inst produces.
///
/// In expert mode the event from getExpertSchedulingEventType() is added
/// first. After that exactly one branch of the classification chain below
/// (DS, FLAT, VMEM, SMRD, LDSDIR, EXP, SCC-writing barrier, or selected scalar
/// opcodes) contributes its events. The returned set may be empty.
2966WaitEventSet SIInsertWaitcnts::getEventsFor(const MachineInstr &Inst) const {
2967 WaitEventSet Events;
2968 if (IsExpertMode) {
2969 if (const auto ET = getExpertSchedulingEventType(Inst))
2970 Events.insert(*ET);
2971 }
2972
2973 if (TII.isDS(Inst) && TII.usesLGKM_CNT(Inst)) {
     // A DS instruction is a GDS access if it is "always GDS" or has its gds
     // modifier set; otherwise it is an LDS access.
2974 if (TII.isAlwaysGDS(Inst.getOpcode()) ||
2975 TII.hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
2976 Events.insert(GDS_ACCESS);
2977 Events.insert(GDS_GPR_LOCK);
2978 } else {
2979 Events.insert(LDS_ACCESS);
2980 }
2981 } else if (TII.isFLAT(Inst)) {
     // NOTE(review): the condition opening the first FLAT sub-case is not
     // visible in this listing.
2983 Events.insert(getVmemWaitEventType(Inst));
2984 } else {
2985 assert(Inst.mayLoadOrStore());
     // A FLAT access may contribute VMEM events, an LDS event, or both.
2986 if (TII.mayAccessVMEMThroughFlat(Inst)) {
2987 if (ST.hasWaitXcnt())
2988 Events.insert(VMEM_GROUP);
2989 Events.insert(getVmemWaitEventType(Inst));
2990 }
2991 if (TII.mayAccessLDSThroughFlat(Inst))
2992 Events.insert(LDS_ACCESS);
2993 }
2994 } else if (SIInstrInfo::isVMEM(Inst) &&
     // NOTE(review): the first part of the parenthesized condition is not
     // visible in this listing.
2996 Inst.getOpcode() == AMDGPU::BUFFER_WBL2)) {
2997 // BUFFER_WBL2 is included here because, unlike invalidates, it has to be
2998 // followed by an "S_WAITCNT vmcnt(0)" to ensure that the writeback has
2999 // completed.
3000 if (ST.hasWaitXcnt())
3001 Events.insert(VMEM_GROUP);
3002 Events.insert(getVmemWaitEventType(Inst));
3003 if (ST.vmemWriteNeedsExpWaitcnt() &&
3004 (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
3005 Events.insert(VMW_GPR_LOCK);
3006 }
3007 } else if (TII.isSMRD(Inst)) {
3008 if (ST.hasWaitXcnt())
3009 Events.insert(SMEM_GROUP);
3010 Events.insert(SMEM_ACCESS);
3011 } else if (SIInstrInfo::isLDSDIR(Inst)) {
3012 Events.insert(EXP_LDS_ACCESS);
3013 } else if (SIInstrInfo::isEXP(Inst)) {
     // The export target selects between param, position and generic events.
3014 unsigned Imm = TII.getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
     // NOTE(review): the condition selecting EXP_PARAM_ACCESS is not visible
     // in this listing.
3016 Events.insert(EXP_PARAM_ACCESS);
3017 else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
3018 Events.insert(EXP_POS_ACCESS);
3019 else
3020 Events.insert(EXP_GPR_LOCK);
3021 } else if (SIInstrInfo::isSBarrierSCCWrite(Inst.getOpcode())) {
3022 Events.insert(SCC_WRITE);
3023 } else {
     // Remaining scalar opcodes that are tracked as messages or as SMEM
     // accesses; any other opcode adds nothing here.
3024 switch (Inst.getOpcode()) {
3025 case AMDGPU::S_SENDMSG:
3026 case AMDGPU::S_SENDMSG_RTN_B32:
3027 case AMDGPU::S_SENDMSG_RTN_B64:
3028 case AMDGPU::S_SENDMSGHALT:
3029 Events.insert(SQ_MESSAGE);
3030 break;
3031 case AMDGPU::S_MEMTIME:
3032 case AMDGPU::S_MEMREALTIME:
3033 case AMDGPU::S_GET_BARRIER_STATE_M0:
3034 case AMDGPU::S_GET_BARRIER_STATE_IMM:
3035 Events.insert(SMEM_ACCESS);
3036 break;
3037 }
3038 }
3039 return Events;
3040}
3041
3042void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
3043 WaitcntBrackets *ScoreBrackets) {
3044
3045 WaitEventSet InstEvents = getEventsFor(Inst);
3046 for (WaitEventType E : wait_events()) {
3047 if (InstEvents.contains(E))
3048 ScoreBrackets->updateByEvent(E, Inst);
3049 }
3050
3051 if (TII.isDS(Inst) && TII.usesLGKM_CNT(Inst)) {
3052 if (TII.isAlwaysGDS(Inst.getOpcode()) ||
3053 TII.hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
3054 ScoreBrackets->setPendingGDS();
3055 }
3056 } else if (TII.isFLAT(Inst)) {
3057 if (Inst.mayLoadOrStore() && TII.mayAccessVMEMThroughFlat(Inst) &&
3058 TII.mayAccessLDSThroughFlat(Inst) && !SIInstrInfo::isLDSDMA(Inst)) {
3059 // Async/LDSDMA operations have FLAT encoding but do not actually use flat
3060 // pointers. They do have two operands that each access global and LDS,
3061 // thus making it appear at this point that they are using a flat pointer.
3062 // Filter them out, and for the rest, generate a dependency on flat
3063 // pointers so that both VM and LGKM counters are flushed.
3064 ScoreBrackets->setPendingFlat();
3065 }
3066 if (SIInstrInfo::usesASYNC_CNT(Inst)) {
3067 ScoreBrackets->updateByEvent(ASYNC_ACCESS, Inst);
3068 }
3069 } else if (SIInstrInfo::usesTENSOR_CNT(Inst)) {
3070 ScoreBrackets->updateByEvent(TENSOR_ACCESS, Inst);
3071 } else if (Inst.isCall()) {
3072 // Act as a wait on everything, but AsyncCnt and TensorCnt are never
3073 // included in such blanket waits.
3074 ScoreBrackets->applyWaitcnt(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
3075 ScoreBrackets->setStateOnFunctionEntryOrReturn();
3076 } else if (TII.isVINTERP(Inst)) {
3077 int64_t Imm = TII.getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
3078 ScoreBrackets->applyWaitcnt(AMDGPU::EXP_CNT, Imm);
3079 }
3080
3081 // Set XCNT to zero in the bracket for instructions that implicitly drain
3082 // XCNT.
3083 if (ST.hasWaitXcnt() && SIInstrInfo::isXcntDrain(Inst))
3084 ScoreBrackets->applyWaitcnt(AMDGPU::X_CNT, 0);
3085}
3086
3087bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
3088 unsigned OtherScore) {
3089 unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
3090 unsigned OtherShifted =
3091 OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
3092 Score = std::max(MyShifted, OtherShifted);
3093 return OtherShifted > MyShifted;
3094}
3095
3096bool WaitcntBrackets::mergeAsyncMarks(ArrayRef<MergeInfo> MergeInfos,
3097 ArrayRef<CounterValueArray> OtherMarks) {
3098 bool StrictDom = false;
3099
3100 LLVM_DEBUG(dbgs() << "Merging async marks ...");
3101 // Early exit: nothing to merge when both sides are empty.
3102 if (AsyncMarks.empty() && OtherMarks.empty()) {
3103 LLVM_DEBUG(dbgs() << " nothing to merge\n");
3104 return false;
3105 }
3106 LLVM_DEBUG(dbgs() << '\n');
3107
3108 // Determine maximum length needed after merging
3109 auto MaxSize = (unsigned)std::max(AsyncMarks.size(), OtherMarks.size());
3110 MaxSize = std::min(MaxSize, MaxAsyncMarks);
3111
3112 // Keep only the most recent marks within our limit.
3113 if (AsyncMarks.size() > MaxSize)
3114 AsyncMarks.erase(AsyncMarks.begin(),
3115 AsyncMarks.begin() + (AsyncMarks.size() - MaxSize));
3116
3117 // Pad with zero-filled marks if our list is shorter. Zero represents "no
3118 // pending async operations at this checkpoint" and acts as the identity
3119 // element for max() during merging. We pad at the beginning since the marks
3120 // need to be aligned in most-recent order.
3121 constexpr CounterValueArray ZeroMark{};
3122 AsyncMarks.insert(AsyncMarks.begin(), MaxSize - AsyncMarks.size(), ZeroMark);
3123
3124 LLVM_DEBUG({
3125 dbgs() << "Before merge:\n";
3126 for (const auto &Mark : AsyncMarks) {
3127 llvm::interleaveComma(Mark, dbgs());
3128 dbgs() << '\n';
3129 }
3130 dbgs() << "Other marks:\n";
3131 for (const auto &Mark : OtherMarks) {
3132 llvm::interleaveComma(Mark, dbgs());
3133 dbgs() << '\n';
3134 }
3135 });
3136
3137 // Merge element-wise using the existing mergeScore function and the
3138 // appropriate MergeInfo for each counter type. Iterate only while we have
3139 // elements in both vectors.
3140 unsigned OtherSize = OtherMarks.size();
3141 unsigned OurSize = AsyncMarks.size();
3142 unsigned MergeCount = std::min(OtherSize, OurSize);
3143 // OtherMarks is empty -> OtherSize == 0 -> MergeCount == 0.
3144 // Our existing marks are the conservative result; return early to avoid
3145 // passing MergeCount == 0 to seq_inclusive which asserts Begin <= End.
3146 if (MergeCount == 0)
3147 return StrictDom;
3148 for (auto Idx : seq_inclusive<unsigned>(1, MergeCount)) {
3149 for (auto T : inst_counter_types(Context->MaxCounter)) {
3150 StrictDom |= mergeScore(MergeInfos[T], AsyncMarks[OurSize - Idx][T],
3151 OtherMarks[OtherSize - Idx][T]);
3152 }
3153 }
3154
3155 LLVM_DEBUG({
3156 dbgs() << "After merge:\n";
3157 for (const auto &Mark : AsyncMarks) {
3158 llvm::interleaveComma(Mark, dbgs());
3159 dbgs() << '\n';
3160 }
3161 });
3162
3163 return StrictDom;
3164}
3165
3166/// Merge the pending events and associated score brackets of \p Other into
3167/// this brackets status.
3168///
3169/// Returns whether the merge resulted in a change that requires tighter waits
3170/// (i.e. the merged brackets strictly dominate the original brackets).
///
/// For every counter type the pending range is widened to the larger of the
/// two pending counts, and every tracked score (flat/GDS/SCC markers, per
/// register VMEM and SGPR scores, async marks and async scores) is rebased
/// onto the new upper bound via mergeScore().
3171bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
3172 bool StrictDom = false;
3173
3174 // Check if "other" has keys we don't have, and create default entries for
3175 // those. If they remain empty after merging, we will clean it up after.
3176 for (auto K : Other.VMem.keys())
3177 VMem.try_emplace(K);
3178 for (auto K : Other.SGPRs.keys())
3179 SGPRs.try_emplace(K);
3180
3181 // Array to store MergeInfo for each counter type
3182 MergeInfo MergeInfos[AMDGPU::NUM_INST_CNTS];
3183
3184 for (auto T : inst_counter_types(Context->MaxCounter)) {
3185 // Merge event flags for this counter
3186 const WaitEventSet &EventsForT = Context->getWaitEvents(T);
3187 const WaitEventSet OldEvents = PendingEvents & EventsForT;
3188 const WaitEventSet OtherEvents = Other.PendingEvents & EventsForT;
     // An event pending only in Other means the merged state is stricter.
3189 if (!OldEvents.contains(OtherEvents))
3190 StrictDom = true;
3191 PendingEvents |= OtherEvents;
3192
3193 // Merge scores for this counter
     // The merged range keeps our lower bound and spans the larger of the two
     // pending counts.
3194 const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
3195 const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
3196 const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
     // Unsigned wrap-around of the new upper bound is fatal.
3197 if (NewUB < ScoreLBs[T])
3198 report_fatal_error("waitcnt score overflow");
3199
     // Each side's shift is the distance from its old upper bound to NewUB, so
     // that shifted scores from both sides are comparable.
3200 MergeInfo &M = MergeInfos[T];
3201 M.OldLB = ScoreLBs[T];
3202 M.OtherLB = Other.ScoreLBs[T];
3203 M.MyShift = NewUB - ScoreUBs[T];
3204 M.OtherShift = NewUB - Other.ScoreUBs[T];
3205
3206 ScoreUBs[T] = NewUB;
3207
3208 if (T == AMDGPU::LOAD_CNT)
3209 StrictDom |= mergeScore(M, LastFlatLoadCnt, Other.LastFlatLoadCnt);
3210
3211 if (T == AMDGPU::DS_CNT) {
3212 StrictDom |= mergeScore(M, LastFlatDsCnt, Other.LastFlatDsCnt);
3213 StrictDom |= mergeScore(M, LastGDS, Other.LastGDS);
3214 }
3215
3216 if (T == AMDGPU::KM_CNT) {
3217 StrictDom |= mergeScore(M, SCCScore, Other.SCCScore);
     // If only Other had a pending SCC write, adopt its instruction; if both
     // had one but they differ, forget the instruction.
3218 if (Other.hasPendingEvent(SCC_WRITE)) {
3219 if (!OldEvents.contains(SCC_WRITE)) {
3220 PendingSCCWrite = Other.PendingSCCWrite;
3221 } else if (PendingSCCWrite != Other.PendingSCCWrite) {
3222 PendingSCCWrite = nullptr;
3223 }
3224 }
3225 }
3226
3227 for (auto &[RegID, Info] : VMem)
3228 StrictDom |= mergeScore(M, Info.Scores[T], Other.getVMemScore(RegID, T));
3229
3230 if (isSmemCounter(T)) {
3231 for (auto &[RegID, Info] : SGPRs) {
     // Registers that Other does not track contribute a score of 0.
3232 auto It = Other.SGPRs.find(RegID);
3233 unsigned OtherScore = (It != Other.SGPRs.end()) ? It->second.get(T) : 0;
3234 StrictDom |= mergeScore(M, Info.get(T), OtherScore);
3235 }
3236 }
3237 }
3238
     // Union the VMEM type masks; a newly set bit counts as a change.
3239 for (auto &[TID, Info] : VMem) {
3240 if (auto It = Other.VMem.find(TID); It != Other.VMem.end()) {
3241 unsigned char NewVmemTypes = Info.VMEMTypes | It->second.VMEMTypes;
3242 StrictDom |= NewVmemTypes != Info.VMEMTypes;
3243 Info.VMEMTypes = NewVmemTypes;
3244 }
3245 }
3246
3247 StrictDom |= mergeAsyncMarks(MergeInfos, Other.AsyncMarks);
3248 for (auto T : inst_counter_types(Context->MaxCounter))
3249 StrictDom |= mergeScore(MergeInfos[T], AsyncScore[T], Other.AsyncScore[T]);
3250
     // Remove the placeholder entries created above that are still empty.
3251 purgeEmptyTrackingData();
3252 return StrictDom;
3253}
3254
3255static bool isWaitInstr(MachineInstr &Inst) {
3256 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Inst.getOpcode());
3257 return Opcode == AMDGPU::S_WAITCNT ||
3258 (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(0).isReg() &&
3259 Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
3260 Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
3261 Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
3262 Opcode == AMDGPU::S_WAITCNT_lds_direct ||
3263 Opcode == AMDGPU::WAIT_ASYNCMARK ||
3264 counterTypeForInstr(Opcode).has_value();
3265}
3266
/// Insert an S_SETREG_IMM32_B32 into \p MBB at position `I` that writes the
/// immediate 2 when \p ExpertMode is true and 0 otherwise to the hardware
/// register field encoded in EncodedReg.
3267void SIInsertWaitcnts::setSchedulingMode(MachineBasicBlock &MBB,
     // NOTE(review): the declaration of the `I` parameter used below is not
     // visible in this listing.
3269 bool ExpertMode) const {
3270 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
     // NOTE(review): the arguments passed to encode() are not visible in this
     // listing.
3272 BuildMI(MBB, I, DebugLoc(), TII.get(AMDGPU::S_SETREG_IMM32_B32))
3273 .addImm(ExpertMode ? 2 : 0)
3274 .addImm(EncodedReg);
3275}
3276
3277namespace {
3278// TODO: Remove this work-around after fixing the scheduler.
3279// There are two reasons why vccz might be incorrect; see ST.hasReadVCCZBug()
3280// and ST.partialVCCWritesUpdateVCCZ().
3281// i. VCCZBug: There is a hardware bug on CI/SI where SMRD instruction may
3282// corrupt vccz bit, so when we detect that an instruction may read from
3283// a corrupt vccz bit, we need to:
3284// 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
3285// operations to complete.
3286// 2. Recompute the correct value of vccz by writing the current value
3287// of vcc back to vcc.
3288// ii. Partial writes to vcc don't update vccz, so we need to recompute the
3289// correct value of vccz by reading vcc and writing it back to vcc.
3290// No waitcnt is needed in this case.
3291class VCCZWorkaround {
3292 const WaitcntBrackets &ScoreBrackets;
3293 const GCNSubtarget &ST;
3294 const SIInstrInfo &TII;
3295 const SIRegisterInfo &TRI;
3296 bool VCCZCorruptionBug = false;
3297 bool VCCZNotUpdatedByPartialWrites = false;
3298 /// vccz could be incorrect at a basic block boundary if a predecessor wrote
3299 /// to vcc and then issued an smem load, so initialize to true.
3300 bool MustRecomputeVCCZ = true;
3301
3302public:
3303 VCCZWorkaround(const WaitcntBrackets &ScoreBrackets, const GCNSubtarget &ST,
3304 const SIInstrInfo &TII, const SIRegisterInfo &TRI)
3305 : ScoreBrackets(ScoreBrackets), ST(ST), TII(TII), TRI(TRI) {
3306 VCCZCorruptionBug = ST.hasReadVCCZBug();
3307 VCCZNotUpdatedByPartialWrites = !ST.partialVCCWritesUpdateVCCZ();
3308 }
3309 /// If \p MI reads vccz and we must recompute it based on MustRecomputeVCCZ,
3310 /// then emit a vccz recompute instruction before \p MI. This needs to be
3311 /// called on every instruction in the basic block because it also tracks the
3312 /// state and updates MustRecomputeVCCZ accordingly. Returns true if it
3313 /// modified the IR.
3314 bool tryRecomputeVCCZ(MachineInstr &MI) {
3315 // No need to run this if neither bug is present.
3316 if (!VCCZCorruptionBug && !VCCZNotUpdatedByPartialWrites)
3317 return false;
3318
3319 // If MI is an SMEM and it can corrupt vccz on this target, then we need
3320 // both to emit a waitcnt and to recompute vccz.
3321 // But we don't actually emit a waitcnt here. This is done in
3322 // generateWaitcntInstBefore() because it tracks all the necessary waitcnt
3323 // state, and can either skip emitting a waitcnt if there is already one in
3324 // the IR, or emit an "optimized" combined waitcnt.
3325 // If this is an smem read, it could complete and clobber vccz at any time.
3326 MustRecomputeVCCZ |= VCCZCorruptionBug && TII.isSMRD(MI);
3327
3328 // If the target partial vcc writes don't update vccz, and MI is such an
3329 // instruction then we must recompute vccz.
3330 // Note: We are using PartiallyWritesToVCCOpt optional to avoid calling
3331 // `definesRegister()` more than needed, because it's not very cheap.
3332 std::optional<bool> PartiallyWritesToVCCOpt;
3333 auto PartiallyWritesToVCC = [](MachineInstr &MI) {
3334 return MI.definesRegister(AMDGPU::VCC_LO, /*TRI=*/nullptr) ||
3335 MI.definesRegister(AMDGPU::VCC_HI, /*TRI=*/nullptr);
3336 };
3337 if (VCCZNotUpdatedByPartialWrites) {
3338 PartiallyWritesToVCCOpt = PartiallyWritesToVCC(MI);
3339 // If this is a partial VCC write but won't update vccz, then we must
3340 // recompute vccz.
3341 MustRecomputeVCCZ |= *PartiallyWritesToVCCOpt;
3342 }
3343
3344 // If MI is a vcc write with no pending smem, or there is a pending smem
3345 // but the target does not suffer from the vccz corruption bug, then we
3346 // don't need to recompute vccz as this write will recompute it anyway.
3347 if (!ScoreBrackets.hasPendingEvent(SMEM_ACCESS) || !VCCZCorruptionBug) {
3348 // Compute PartiallyWritesToVCCOpt if we haven't done so already.
3349 if (!PartiallyWritesToVCCOpt)
3350 PartiallyWritesToVCCOpt = PartiallyWritesToVCC(MI);
3351 bool FullyWritesToVCC = !*PartiallyWritesToVCCOpt &&
3352 MI.definesRegister(AMDGPU::VCC, /*TRI=*/nullptr);
3353 // If we write to the full vcc or we write partially and the target
3354 // updates vccz on partial writes, then vccz will be updated correctly.
3355 bool UpdatesVCCZ = FullyWritesToVCC || (!VCCZNotUpdatedByPartialWrites &&
3356 *PartiallyWritesToVCCOpt);
3357 if (UpdatesVCCZ)
3358 MustRecomputeVCCZ = false;
3359 }
3360
3361 // If MI is a branch that reads VCCZ then emit a waitcnt and a vccz
3362 // restore instruction if either is needed.
3363 if (SIInstrInfo::isCBranchVCCZRead(MI) && MustRecomputeVCCZ) {
3364 // Recompute the vccz bit. Any time a value is written to vcc, the vccz
3365 // bit is updated, so we can restore the bit by reading the value of vcc
3366 // and then writing it back to the register.
3367 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
3368 TII.get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
3369 TRI.getVCC())
3370 .addReg(TRI.getVCC());
3371 MustRecomputeVCCZ = false;
3372 return true;
3373 }
3374 return false;
3375 }
3376};
3377
3378} // namespace
3379
3380// Generate s_waitcnt instructions where needed.
//
// Walks the instructions of \p Block in order. For each instruction that is
// neither a skipped meta instruction nor a wait instruction, the required wait
// is generated in front of it, \p ScoreBrackets is updated to account for it,
// and forced waits and the vccz recompute are inserted where needed. A final
// wait may be generated at the end of the block.
//
// Returns true if the block was modified.
3381bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
3382 MachineBasicBlock &Block,
3383 WaitcntBrackets &ScoreBrackets) {
3384 bool Modified = false;
3385
3386 LLVM_DEBUG({
3387 dbgs() << "*** Begin Block: ";
3388 Block.printName(dbgs());
3389 ScoreBrackets.dump();
3390 });
3391 VCCZWorkaround VCCZW(ScoreBrackets, ST, TII, TRI);
3392
3393 // Walk over the instructions.
     // First wait instruction of the run seen since the last processed
     // instruction, or null if there is none.
3394 MachineInstr *OldWaitcntInstr = nullptr;
3395
3396 // NOTE: We may append instrs after Inst while iterating.
3397 for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
3398 E = Block.instr_end();
3399 Iter != E; ++Iter) {
3400 MachineInstr &Inst = *Iter;
3401 if (isNonWaitcntMetaInst(Inst))
3402 continue;
3403 // Track pre-existing waitcnts that were added in earlier iterations or by
3404 // the memory legalizer.
3405 if (isWaitInstr(Inst) ||
3406 (IsExpertMode && Inst.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR)) {
     // Only the first wait instruction of a run is remembered.
3407 if (!OldWaitcntInstr)
3408 OldWaitcntInstr = &Inst;
3409 continue;
3410 }
3411
     // Flush flags are only computed for the block's first terminator; for
     // every other instruction they stay default-constructed.
3412 PreheaderFlushFlags FlushFlags;
3413 if (Block.getFirstTerminator() == Inst)
3414 FlushFlags = isPreheaderToFlush(Block, ScoreBrackets);
3415
3416 // Generate an s_waitcnt instruction to be placed before Inst, if needed.
3417 Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
3418 FlushFlags);
3419 OldWaitcntInstr = nullptr;
3420
3421 if (Inst.getOpcode() == AMDGPU::ASYNCMARK) {
3422 // Asyncmarks record the current wait state and so should not allow
3423 // waitcnts that occur after them to be merged into waitcnts that occur
3424 // before.
3425 ScoreBrackets.recordAsyncMark(Inst);
3426 continue;
3427 }
3428
     // Remember the address and parent block of every non-invariant scalar
     // load.
3429 if (TII.isSMRD(Inst)) {
3430 for (const MachineMemOperand *Memop : Inst.memoperands()) {
3431 // No need to handle invariant loads when avoiding WAR conflicts, as
3432 // there cannot be a vector store to the same memory location.
3433 if (!Memop->isInvariant()) {
3434 const Value *Ptr = Memop->getValue();
3435 SLoadAddresses.insert(std::pair(Ptr, Inst.getParent()));
3436 }
3437 }
3438 }
3439
3440 updateEventWaitcntAfter(Inst, &ScoreBrackets);
3441
3442 // Note: insertForcedWaitAfter() may add instrs after Iter that need to be
3443 // visited by the loop.
3444 Modified |= insertForcedWaitAfter(Inst, Block, ScoreBrackets);
3445
3446 LLVM_DEBUG({
3447 Inst.print(dbgs());
3448 ScoreBrackets.dump();
3449 });
3450
3451 // If the target suffers from the vccz bugs, this may emit the necessary
3452 // vccz recompute instruction before \p Inst if needed.
3453 Modified |= VCCZW.tryRecomputeVCCZ(Inst);
3454 }
3455
3456 // Flush counters at the end of the block if needed (for preheaders with no
3457 // terminator).
3458 AMDGPU::Waitcnt Wait;
3459 if (Block.getFirstTerminator() == Block.end()) {
3460 PreheaderFlushFlags FlushFlags = isPreheaderToFlush(Block, ScoreBrackets);
     // Only counters that actually have something pending are waited on.
3461 if (FlushFlags.FlushVmCnt) {
3462 if (ScoreBrackets.hasPendingEvent(AMDGPU::LOAD_CNT))
3463 Wait.set(AMDGPU::LOAD_CNT, 0);
3464 if (ScoreBrackets.hasPendingEvent(AMDGPU::SAMPLE_CNT))
3465 Wait.set(AMDGPU::SAMPLE_CNT, 0);
3466 if (ScoreBrackets.hasPendingEvent(AMDGPU::BVH_CNT))
3467 Wait.set(AMDGPU::BVH_CNT, 0);
3468 }
3469 if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(AMDGPU::DS_CNT))
3470 Wait.set(AMDGPU::DS_CNT, 0);
3471 }
3472
3473 // Combine or remove any redundant waitcnts at the end of the block.
3474 Modified |= generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
3475 OldWaitcntInstr);
3476
3477 LLVM_DEBUG({
3478 dbgs() << "*** End Block: ";
3479 Block.printName(dbgs());
3480 ScoreBrackets.dump();
3481 });
3482
3483 return Modified;
3484}
3485
3486bool SIInsertWaitcnts::removeRedundantSoftXcnts(MachineBasicBlock &Block) {
3487 if (Block.size() <= 1)
3488 return false;
3489 // The Memory Legalizer conservatively inserts a soft xcnt before each
3490 // atomic RMW operation. However, for sequences of back-to-back atomic
3491 // RMWs, only the first s_wait_xcnt insertion is necessary. Optimize away
3492 // the redundant soft xcnts.
3493 bool Modified = false;
3494 // Remember the last atomic with a soft xcnt right before it.
3495 MachineInstr *LastAtomicWithSoftXcnt = nullptr;
3496
3497 for (MachineInstr &MI : drop_begin(Block)) {
3498 // Ignore last atomic if non-LDS VMEM and SMEM.
3499 bool IsLDS =
3500 TII.isDS(MI) || (TII.isFLAT(MI) && TII.mayAccessLDSThroughFlat(MI));
3501 if (!IsLDS && (MI.mayLoad() ^ MI.mayStore()))
3502 LastAtomicWithSoftXcnt = nullptr;
3503
3504 bool IsAtomicRMW = (MI.getDesc().TSFlags & SIInstrFlags::maybeAtomic) &&
3505 MI.mayLoad() && MI.mayStore();
3506 MachineInstr &PrevMI = *MI.getPrevNode();
3507 // This is an atomic with a soft xcnt.
3508 if (PrevMI.getOpcode() == AMDGPU::S_WAIT_XCNT_soft && IsAtomicRMW) {
3509 // If we have already found an atomic with a soft xcnt, remove this soft
3510 // xcnt as it's redundant.
3511 if (LastAtomicWithSoftXcnt) {
3512 PrevMI.eraseFromParent();
3513 Modified = true;
3514 }
3515 LastAtomicWithSoftXcnt = &MI;
3516 }
3517 }
3518 return Modified;
3519}
3520
3521// Return flags indicating which counters should be flushed in the preheader.
3522PreheaderFlushFlags
3523SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
3524 const WaitcntBrackets &ScoreBrackets) {
3525 auto [Iterator, IsInserted] =
3526 PreheadersToFlush.try_emplace(&MBB, PreheaderFlushFlags());
3527 if (!IsInserted)
3528 return Iterator->second;
3529
3530 MachineBasicBlock *Succ = MBB.getSingleSuccessor();
3531 if (!Succ)
3532 return PreheaderFlushFlags();
3533
3534 MachineLoop *Loop = MLI.getLoopFor(Succ);
3535 if (!Loop)
3536 return PreheaderFlushFlags();
3537
3538 if (Loop->getLoopPreheader() == &MBB) {
3539 Iterator->second = getPreheaderFlushFlags(Loop, ScoreBrackets);
3540 return Iterator->second;
3541 }
3542
3543 return PreheaderFlushFlags();
3544}
3545
3546bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
3548 return TII.mayAccessVMEMThroughFlat(MI);
3549 return SIInstrInfo::isVMEM(MI);
3550}
3551
3552bool SIInsertWaitcnts::isDSRead(const MachineInstr &MI) const {
3553 return SIInstrInfo::isDS(MI) && MI.mayLoad() && !MI.mayStore();
3554}
3555
3556// Check if instruction is a store to LDS that is counted via DSCNT
3557// (where that counter exists).
3558bool SIInsertWaitcnts::mayStoreIncrementingDSCNT(const MachineInstr &MI) const {
3559 return MI.mayStore() && SIInstrInfo::isDS(MI);
3560}
3561
3562// Return flags indicating which counters should be flushed in the preheader of
3563// the given loop. We currently decide to flush in the following situations:
3564// For VMEM (FlushVmCnt):
3565// 1. The loop contains vmem store(s), no vmem load and at least one use of a
3566// vgpr containing a value that is loaded outside of the loop. (Only on
3567// targets with no vscnt counter).
3568// 2. The loop contains vmem load(s), but the loaded values are not used in the
3569// loop, and at least one use of a vgpr containing a value that is loaded
3570// outside of the loop.
3571// For DS (FlushDsCnt, GFX12+ only):
3572// 3. The loop contains no DS reads, and at least one use of a vgpr containing
3573// a value that is DS read outside of the loop.
3574// 4. The loop contains DS read(s), loaded values are not used in the same
3575// iteration but in the next iteration (prefetch pattern), and at least one
3576// use of a vgpr containing a value that is DS read outside of the loop.
3577// Flushing in preheader reduces wait overhead if the wait requirement in
3578// iteration 1 would otherwise be more strict (but unfortunately preheader
3579// flush decision is taken before knowing that).
3580// 5. (Single-block loops only) The loop has DS prefetch reads with flush point
3581// tracking. Some DS reads may be used in the same iteration (creating
3582// "flush points"), but others remain unflushed at the backedge. When a DS
3583// read is consumed in the same iteration, it and all prior reads are
3584// "flushed" (FIFO order). No DS writes are allowed in the loop.
3585// TODO: Find a way to extend to multi-block loops.
3586PreheaderFlushFlags
3587SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *ML,
3588 const WaitcntBrackets &Brackets) {
3589 PreheaderFlushFlags Flags;
3590 bool HasVMemLoad = false;
3591 bool HasVMemStore = false;
3592 bool UsesVgprVMEMLoadedOutside = false;
3593 bool UsesVgprDSReadOutside = false;
3594 bool VMemInvalidated = false;
3595 // DS optimization only applies to GFX12+ where DS_CNT is separate.
3596 // Tracking status for "no DS read in loop" or "pure DS prefetch
3597 // (use only in next iteration)".
3598 bool TrackSimpleDSOpt = ST.hasExtendedWaitCounts();
3599 DenseSet<MCRegUnit> VgprUse;
3600 DenseSet<MCRegUnit> VgprDefVMEM;
3601 DenseSet<MCRegUnit> VgprDefDS;
3602
3603 // Track DS reads for prefetch pattern with flush points (single-block only).
3604 // Keeps track of the last DS read (position counted from the top of the loop)
3605 // to each VGPR. Read is considered consumed (and thus needs flushing) if
3606 // the dest register has a use or is overwritten (by any later opertions).
3607 DenseMap<MCRegUnit, unsigned> LastDSReadPositionMap;
3608 unsigned DSReadPosition = 0;
3609 bool IsSingleBlock = ML->getNumBlocks() == 1;
3610 bool TrackDSFlushPoint = ST.hasExtendedWaitCounts() && IsSingleBlock;
3611 unsigned LastDSFlushPosition = 0;
3612
3613 for (MachineBasicBlock *MBB : ML->blocks()) {
3614 for (MachineInstr &MI : *MBB) {
3615 if (isVMEMOrFlatVMEM(MI)) {
3616 HasVMemLoad |= MI.mayLoad();
3617 HasVMemStore |= MI.mayStore();
3618 }
3619 // TODO: Can we relax DSStore check? There may be cases where
3620 // these DS stores are drained prior to the end of MBB (or loop).
3621 if (mayStoreIncrementingDSCNT(MI)) {
3622 // Early exit if none of the optimizations are feasible.
3623 // Otherwise, set tracking status appropriately and continue.
3624 if (VMemInvalidated)
3625 return Flags;
3626 TrackSimpleDSOpt = false;
3627 TrackDSFlushPoint = false;
3628 }
3629 bool IsDSRead = isDSRead(MI);
3630 if (IsDSRead)
3631 ++DSReadPosition;
3632
3633 // Helper: if RU has a pending DS read, update LastDSFlushPosition
3634 auto updateDSReadFlushTracking = [&](MCRegUnit RU) {
3635 if (!TrackDSFlushPoint)
3636 return;
3637 if (auto It = LastDSReadPositionMap.find(RU);
3638 It != LastDSReadPositionMap.end()) {
3639 // RU defined by DSRead is used or overwritten. Need to complete
3640 // the read, if not already implied by a later DSRead (to any RU)
3641 // needing to complete in FIFO order.
3642 LastDSFlushPosition = std::max(LastDSFlushPosition, It->second);
3643 }
3644 };
3645
3646 for (const MachineOperand &Op : MI.all_uses()) {
3647 if (Op.isDebug() || !TRI.isVectorRegister(MRI, Op.getReg()))
3648 continue;
3649 // Vgpr use
3650 for (MCRegUnit RU : TRI.regunits(Op.getReg().asMCReg())) {
3651 // If we find a register that is loaded inside the loop, 1. and 2.
3652 // are invalidated.
3653 if (VgprDefVMEM.contains(RU))
3654 VMemInvalidated = true;
3655
3656 // Check for DS reads used inside the loop
3657 if (VgprDefDS.contains(RU))
3658 TrackSimpleDSOpt = false;
3659
3660 // Early exit if all optimizations are invalidated
3661 if (VMemInvalidated && !TrackSimpleDSOpt && !TrackDSFlushPoint)
3662 return Flags;
3663
3664 // Check for flush points (DS read used in same iteration)
3665 updateDSReadFlushTracking(RU);
3666
3667 VgprUse.insert(RU);
3668 // Check if this register has a pending VMEM load from outside the
3669 // loop (value loaded outside and used inside).
3670 VMEMID ID = toVMEMID(RU);
3671 if (Brackets.hasPendingVMEM(ID, AMDGPU::LOAD_CNT) ||
3672 Brackets.hasPendingVMEM(ID, AMDGPU::SAMPLE_CNT) ||
3673 Brackets.hasPendingVMEM(ID, AMDGPU::BVH_CNT))
3674 UsesVgprVMEMLoadedOutside = true;
3675 // Check if loaded outside the loop via DS (not VMEM/FLAT).
3676 // Only consider it a DS read if there's no pending VMEM load for
3677 // this register, since FLAT can set both counters.
3678 else if (Brackets.hasPendingVMEM(ID, AMDGPU::DS_CNT))
3679 UsesVgprDSReadOutside = true;
3680 }
3681 }
3682
3683 // VMem load vgpr def
3684 if (isVMEMOrFlatVMEM(MI) && MI.mayLoad()) {
3685 for (const MachineOperand &Op : MI.all_defs()) {
3686 for (MCRegUnit RU : TRI.regunits(Op.getReg().asMCReg())) {
3687 // If we find a register that is loaded inside the loop, 1. and 2.
3688 // are invalidated.
3689 if (VgprUse.contains(RU))
3690 VMemInvalidated = true;
3691 VgprDefVMEM.insert(RU);
3692 }
3693 }
3694 // Early exit if all optimizations are invalidated
3695 if (VMemInvalidated && !TrackSimpleDSOpt && !TrackDSFlushPoint)
3696 return Flags;
3697 }
3698
3699 // DS read vgpr def
3700 // Note: Unlike VMEM, we DON'T invalidate when VgprUse.contains(RegNo).
3701 // If USE comes before DEF, it's the prefetch pattern (use value from
3702 // previous iteration, read for next iteration). We should still flush
3703 // in preheader so iteration 1 doesn't need to wait inside the loop.
3704 // Only invalidate when DEF comes before USE (same-iteration consumption,
3705 // checked above when processing uses).
3706 if (IsDSRead || TrackDSFlushPoint) {
3707 for (const MachineOperand &Op : MI.all_defs()) {
3708 if (!TRI.isVectorRegister(MRI, Op.getReg()))
3709 continue;
3710 for (MCRegUnit RU : TRI.regunits(Op.getReg().asMCReg())) {
3711 // Check for overwrite of pending DS read (flush point) by any
3712 // instruction
3713 updateDSReadFlushTracking(RU);
3714 if (IsDSRead) {
3715 VgprDefDS.insert(RU);
3716 if (TrackDSFlushPoint)
3717 LastDSReadPositionMap[RU] = DSReadPosition;
3718 }
3719 }
3720 }
3721 }
3722 }
3723 }
3724
3725 // VMEM flush decision
3726 if (!VMemInvalidated && UsesVgprVMEMLoadedOutside &&
3727 ((!ST.hasVscnt() && HasVMemStore && !HasVMemLoad) ||
3728 (HasVMemLoad && ST.hasVmemWriteVgprInOrder())))
3729 Flags.FlushVmCnt = true;
3730
3731 // DS flush decision:
3732 // Simple DS Opt: flush if loop uses DS read values from outside
3733 // and either has no DS reads in the loop, or DS reads whose results
3734 // are not used in the loop.
3735 bool SimpleDSOpt = TrackSimpleDSOpt && UsesVgprDSReadOutside;
3736 // Prefetch with flush points: some DS reads used in same iteration,
3737 // but unflushed reads remain at backedge
3738 bool HasUnflushedDSReads = DSReadPosition > LastDSFlushPosition;
3739 bool DSFlushPointPrefetch =
3740 TrackDSFlushPoint && UsesVgprDSReadOutside && HasUnflushedDSReads;
3741
3742 if (SimpleDSOpt || DSFlushPointPrefetch)
3743 Flags.FlushDsCnt = true;
3744
3745 return Flags;
3746}
3747
3748bool SIInsertWaitcntsLegacy::runOnMachineFunction(MachineFunction &MF) {
3749 auto &MLI = getAnalysis<MachineLoopInfoWrapperPass>().getLI();
3750 auto &PDT =
3751 getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
3752 AliasAnalysis *AA = nullptr;
3753 if (auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
3754 AA = &AAR->getAAResults();
3755
3756 return SIInsertWaitcnts(MLI, PDT, AA, MF).run();
3757}
3758
3759PreservedAnalyses
3762 auto &MLI = MFAM.getResult<MachineLoopAnalysis>(MF);
3763 auto &PDT = MFAM.getResult<MachinePostDominatorTreeAnalysis>(MF);
3765 .getManager()
3766 .getCachedResult<AAManager>(MF.getFunction());
3767
3768 if (!SIInsertWaitcnts(MLI, PDT, AA, MF).run())
3769 return PreservedAnalyses::all();
3770
3773 .preserve<AAManager>();
3774}
3775
3776bool SIInsertWaitcnts::run() {
3778
3780
3781 // Initialize hardware limits first, as they're needed by the generators.
3782 Limits = AMDGPU::HardwareLimits(IV);
3783
3784 if (ST.hasExtendedWaitCounts()) {
3785 IsExpertMode = ST.hasExpertSchedulingMode() &&
3786 (ExpertSchedulingModeFlag.getNumOccurrences()
3788 : MF.getFunction()
3789 .getFnAttribute("amdgpu-expert-scheduling-mode")
3790 .getValueAsBool());
3791 MaxCounter = IsExpertMode ? AMDGPU::NUM_EXPERT_INST_CNTS
3793 // Initialize WCG per MF. It contains state that depends on MF attributes.
3794 WCG = std::make_unique<WaitcntGeneratorGFX12Plus>(MF, MaxCounter, Limits,
3795 IsExpertMode);
3796 } else {
3797 MaxCounter = AMDGPU::NUM_NORMAL_INST_CNTS;
3798 // Initialize WCG per MF. It contains state that depends on MF attributes.
3799 WCG = std::make_unique<WaitcntGeneratorPreGFX12>(
3800 MF, AMDGPU::NUM_NORMAL_INST_CNTS, Limits);
3801 }
3802
3803 SmemAccessCounter = getCounterFromEvent(SMEM_ACCESS);
3804
3805 bool Modified = false;
3806
3807 MachineBasicBlock &EntryBB = MF.front();
3808
3809 if (!MFI->isEntryFunction() &&
3810 !MF.getFunction().hasFnAttribute(Attribute::Naked)) {
3811 // Wait for any outstanding memory operations that the input registers may
3812 // depend on. We can't track them and it's better to do the wait after the
3813 // costly call sequence.
3814
3815 // TODO: Could insert earlier and schedule more liberally with operations
3816 // that only use caller preserved registers.
3818 while (I != EntryBB.end() && I->isMetaInstruction())
3819 ++I;
3820
3821 if (ST.hasExtendedWaitCounts()) {
3822 BuildMI(EntryBB, I, DebugLoc(), TII.get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
3823 .addImm(0);
3825 if (CT == AMDGPU::LOAD_CNT || CT == AMDGPU::DS_CNT ||
3826 CT == AMDGPU::STORE_CNT || CT == AMDGPU::X_CNT ||
3828 continue;
3829
3830 if (!ST.hasImageInsts() &&
3831 (CT == AMDGPU::EXP_CNT || CT == AMDGPU::SAMPLE_CNT ||
3832 CT == AMDGPU::BVH_CNT))
3833 continue;
3834
3835 BuildMI(EntryBB, I, DebugLoc(),
3836 TII.get(instrsForExtendedCounterTypes[CT]))
3837 .addImm(0);
3838 }
3839 if (IsExpertMode) {
3840 unsigned Enc = AMDGPU::DepCtr::encodeFieldVaVdst(0, ST);
3842 BuildMI(EntryBB, I, DebugLoc(), TII.get(AMDGPU::S_WAITCNT_DEPCTR))
3843 .addImm(Enc);
3844 }
3845 } else {
3846 BuildMI(EntryBB, I, DebugLoc(), TII.get(AMDGPU::S_WAITCNT)).addImm(0);
3847 }
3848
3849 auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(this);
3850 NonKernelInitialState->setStateOnFunctionEntryOrReturn();
3851 BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
3852
3853 Modified = true;
3854 }
3855
3856 // Keep iterating over the blocks in reverse post order, inserting and
3857 // updating s_waitcnt where needed, until a fix point is reached.
3858 for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
3859 BlockInfos.try_emplace(MBB);
3860
3861 std::unique_ptr<WaitcntBrackets> Brackets;
3862 bool Repeat;
3863 do {
3864 Repeat = false;
3865
3866 for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
3867 ++BII) {
3868 MachineBasicBlock *MBB = BII->first;
3869 BlockInfo &BI = BII->second;
3870 if (!BI.Dirty)
3871 continue;
3872
3873 if (BI.Incoming) {
3874 if (!Brackets)
3875 Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
3876 else
3877 *Brackets = *BI.Incoming;
3878 } else {
3879 if (!Brackets) {
3880 Brackets = std::make_unique<WaitcntBrackets>(this);
3881 } else {
3882 // Reinitialize in-place. N.B. do not do this by assigning from a
3883 // temporary because the WaitcntBrackets class is large and it could
3884 // cause this function to use an unreasonable amount of stack space.
3885 Brackets->~WaitcntBrackets();
3886 new (Brackets.get()) WaitcntBrackets(this);
3887 }
3888 }
3889
3890 if (ST.hasWaitXcnt())
3891 Modified |= removeRedundantSoftXcnts(*MBB);
3892 Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets);
3893 BI.Dirty = false;
3894
3895 if (Brackets->hasPendingEvent()) {
3896 BlockInfo *MoveBracketsToSucc = nullptr;
3897 for (MachineBasicBlock *Succ : MBB->successors()) {
3898 auto *SuccBII = BlockInfos.find(Succ);
3899 BlockInfo &SuccBI = SuccBII->second;
3900 if (!SuccBI.Incoming) {
3901 SuccBI.Dirty = true;
3902 if (SuccBII <= BII) {
3903 LLVM_DEBUG(dbgs() << "Repeat on backedge without merge\n");
3904 Repeat = true;
3905 }
3906 if (!MoveBracketsToSucc) {
3907 MoveBracketsToSucc = &SuccBI;
3908 } else {
3909 SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
3910 }
3911 } else {
3912 LLVM_DEBUG({
3913 dbgs() << "Try to merge ";
3914 MBB->printName(dbgs());
3915 dbgs() << " into ";
3916 Succ->printName(dbgs());
3917 dbgs() << '\n';
3918 });
3919 if (SuccBI.Incoming->merge(*Brackets)) {
3920 SuccBI.Dirty = true;
3921 if (SuccBII <= BII) {
3922 LLVM_DEBUG(dbgs() << "Repeat on backedge with merge\n");
3923 Repeat = true;
3924 }
3925 }
3926 }
3927 }
3928 if (MoveBracketsToSucc)
3929 MoveBracketsToSucc->Incoming = std::move(Brackets);
3930 }
3931 }
3932 } while (Repeat);
3933
3934 if (ST.hasScalarStores()) {
3935 SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
3936 bool HaveScalarStores = false;
3937
3938 for (MachineBasicBlock &MBB : MF) {
3939 for (MachineInstr &MI : MBB) {
3940 if (!HaveScalarStores && TII.isScalarStore(MI))
3941 HaveScalarStores = true;
3942
3943 if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
3944 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
3945 EndPgmBlocks.push_back(&MBB);
3946 }
3947 }
3948
3949 if (HaveScalarStores) {
3950 // If scalar writes are used, the cache must be flushed or else the next
3951 // wave to reuse the same scratch memory can be clobbered.
3952 //
3953 // Insert s_dcache_wb at wave termination points if there were any scalar
3954 // stores, and only if the cache hasn't already been flushed. This could
3955 // be improved by looking across blocks for flushes in postdominating
3956 // blocks from the stores but an explicitly requested flush is probably
3957 // very rare.
3958 for (MachineBasicBlock *MBB : EndPgmBlocks) {
3959 bool SeenDCacheWB = false;
3960
3961 for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
3962 I != E; ++I) {
3963 if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
3964 SeenDCacheWB = true;
3965 else if (TII.isScalarStore(*I))
3966 SeenDCacheWB = false;
3967
3968 // FIXME: It would be better to insert this before a waitcnt if any.
3969 if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
3970 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
3971 !SeenDCacheWB) {
3972 Modified = true;
3973 BuildMI(*MBB, I, I->getDebugLoc(), TII.get(AMDGPU::S_DCACHE_WB));
3974 }
3975 }
3976 }
3977 }
3978 }
3979
3980 if (IsExpertMode) {
3981 // Enable expert scheduling on function entry. To satisfy ABI requirements
3982 // and to allow calls between function with different expert scheduling
3983 // settings, disable it around calls and before returns.
3984
3986 while (I != EntryBB.end() && I->isMetaInstruction())
3987 ++I;
3988 setSchedulingMode(EntryBB, I, true);
3989
3990 for (MachineInstr *MI : CallInsts) {
3991 MachineBasicBlock &MBB = *MI->getParent();
3992 setSchedulingMode(MBB, MI, false);
3993 setSchedulingMode(MBB, std::next(MI->getIterator()), true);
3994 }
3995
3996 for (MachineInstr *MI : ReturnInsts)
3997 setSchedulingMode(*MI->getParent(), MI, false);
3998
3999 Modified = true;
4000 }
4001
4002 // Deallocate the VGPRs before previously identified S_ENDPGM instructions.
4003 // This is done in different ways depending on how the VGPRs were allocated
4004 // (i.e. whether we're in dynamic VGPR mode or not).
4005 // Skip deallocation if kernel is waveslot limited vs VGPR limited. A short
4006 // waveslot limited kernel runs slower with the deallocation.
4007 if (!WCG->isOptNone() && MFI->isDynamicVGPREnabled()) {
4008 for (auto [MI, _] : EndPgmInsts) {
4009 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
4010 TII.get(AMDGPU::S_ALLOC_VGPR))
4011 .addImm(0);
4012 Modified = true;
4013 }
4014 } else if (!WCG->isOptNone() &&
4015 ST.getGeneration() >= AMDGPUSubtarget::GFX11 &&
4016 (MF.getFrameInfo().hasCalls() ||
4017 ST.getOccupancyWithNumVGPRs(
4018 TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass),
4019 /*IsDynamicVGPR=*/false) <
4021 for (auto [MI, Flag] : EndPgmInsts) {
4022 if (Flag) {
4023 if (ST.requiresNopBeforeDeallocVGPRs()) {
4024 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
4025 TII.get(AMDGPU::S_NOP))
4026 .addImm(0);
4027 }
4028 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
4029 TII.get(AMDGPU::S_SENDMSG))
4031 Modified = true;
4032 }
4033 }
4034 }
4035
4036 return Modified;
4037}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition Compiler.h:661
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
AMD GCN specific subclass of TargetSubtarget.
#define DEBUG_TYPE
const HexagonInstrInfo * TII
static bool isOptNone(const MachineFunction &MF)
#define _
IRTranslator LLVM IR MI
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
#define I(x, y, z)
Definition MD5.cpp:57
Register Reg
Register const TargetRegisterInfo * TRI
This file implements a map that provides insertion order iteration.
Promote Memory to Register
Definition Mem2Reg.cpp:110
#define T
static bool isReg(const MCInst &MI, unsigned OpNo)
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define P(N)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
static cl::opt< bool > ForceEmitZeroLoadFlag("amdgpu-waitcnt-load-forcezero", cl::desc("Force all waitcnt load counters to wait until 0"), cl::init(false), cl::Hidden)
#define AMDGPU_EVENT_NAME(Name)
static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName, unsigned NewEnc)
static std::optional< AMDGPU::InstCounterType > counterTypeForInstr(unsigned Opcode)
Determine if MI is a gfx12+ single-counter S_WAIT_*CNT instruction, and if so, which counter it is wa...
static bool isWaitInstr(MachineInstr &Inst)
static cl::opt< bool > ExpertSchedulingModeFlag("amdgpu-expert-scheduling-mode", cl::desc("Enable expert scheduling mode 2 for all functions (GFX12+ only)"), cl::init(false), cl::Hidden)
static cl::opt< bool > ForceEmitZeroFlag("amdgpu-waitcnt-forcezero", cl::desc("Force all waitcnt instrs to be emitted as " "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden)
#define AMDGPU_DECLARE_WAIT_EVENTS(DECL)
#define AMDGPU_EVENT_ENUM(Name)
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:483
Provides some synthesis utilities to produce sequences of values.
#define LLVM_DEBUG(...)
Definition Debug.h:119
static Function * getFunction(FunctionType *Ty, const Twine &Name, Module *M)
static const uint32_t IV[8]
Definition blake3_impl.h:83
A manager for alias analyses.
Represents the counter values to wait for in an s_waitcnt instruction.
unsigned get(InstCounterType T) const
void set(InstCounterType T, unsigned Val)
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
AnalysisUsage & addUsedIfAvailable()
Add the specified Pass class to the set of analyses used by this pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition Pass.cpp:270
size_t size() const
Get the array size.
Definition ArrayRef.h:141
bool empty() const
Check if the array is empty.
Definition ArrayRef.h:136
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
static bool shouldExecute(CounterInfo &Counter)
static bool isCounterSet(CounterInfo &Info)
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:225
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:301
bool erase(const KeyT &Val)
Definition DenseMap.h:379
iterator end()
Definition DenseMap.h:143
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:286
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:763
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition Function.cpp:728
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
LLVM_ABI const MachineBasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
Instructions::iterator instr_iterator
iterator_range< succ_iterator > successors()
LLVM_ABI void printName(raw_ostream &os, unsigned printNameFlags=PrintNameIr, ModuleSlotTracker *moduleSlotTracker=nullptr) const
Print the basic block's name as:
MachineInstrBundleIterator< MachineInstr > iterator
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineBasicBlock & front() const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
Representation of each machine instruction.
mop_range defs()
Returns all explicit operands that are register definitions.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
const MachineBasicBlock * getParent() const
filtered_mop_range all_defs()
Returns an iterator range over all operands that are (explicit or implicit) register defs.
bool isCall(QueryType Type=AnyInBundle) const
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
mop_range operands()
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
LLVM_ABI void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
filtered_mop_range all_uses()
Returns an iterator range over all operands that are (explicit or implicit) register uses.
const MachineOperand & getOperand(unsigned i) const
LLVM_ABI MachineInstrBundleIterator< MachineInstr > eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
Analysis pass that exposes the MachineLoopInfo for a machine function.
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
iterator begin()
Definition MapVector.h:67
iterator find(const KeyT &Key)
Definition MapVector.h:156
iterator end()
Definition MapVector.h:69
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition MapVector.h:118
virtual void print(raw_ostream &OS, const Module *M) const
print - Print out the internal state of the pass.
Definition Pass.cpp:140
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:107
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
static bool isCBranchVCCZRead(const MachineInstr &MI)
static bool isDS(const MachineInstr &MI)
static bool isVMEM(const MachineInstr &MI)
static bool isFLATScratch(const MachineInstr &MI)
static bool isEXP(const MachineInstr &MI)
static bool isXcntDrain(const MachineInstr &MI)
True if MI implicitly drains XCNT.
static bool mayWriteLDSThroughDMA(const MachineInstr &MI)
static bool usesTENSOR_CNT(const MachineInstr &MI)
static bool isLDSDIR(const MachineInstr &MI)
static bool isGWS(const MachineInstr &MI)
static bool isFLATGlobal(const MachineInstr &MI)
static bool isVSAMPLE(const MachineInstr &MI)
static bool isAtomicRet(const MachineInstr &MI)
static bool isImage(const MachineInstr &MI)
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
static bool isVINTERP(const MachineInstr &MI)
static bool isGFX12CacheInvOrWBInst(unsigned Opc)
static bool isSBarrierSCCWrite(unsigned Opcode)
static bool isMIMG(const MachineInstr &MI)
static bool usesASYNC_CNT(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isLDSDMA(const MachineInstr &MI)
static bool isAtomicNoRet(const MachineInstr &MI)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void push_back(const T &Elt)
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition StringRef.h:882
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:212
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:185
self_iterator getIterator()
Definition ilist_node.h:123
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Abstract Attribute helper functions.
Definition Attributor.h:165
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst)
unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc)
unsigned decodeFieldVaVdst(unsigned Encoded)
int getDefaultDepCtrEncoding(const MCSubtargetInfo &STI)
unsigned decodeFieldVmVsrc(unsigned Encoded)
unsigned getMaxWavesPerEU(const MCSubtargetInfo &STI)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isDPMACCInstruction(unsigned Opc)
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
iota_range< InstCounterType > inst_counter_types(InstCounterType MaxCounter)
unsigned encodeLoadcntDscnt(const IsaVersion &Version, const Waitcnt &Decoded)
bool getHasMatrixScale(unsigned Opc)
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
Waitcnt decodeWaitcnt(const IsaVersion &Version, unsigned Encoded)
unsigned encodeWaitcnt(const IsaVersion &Version, const Waitcnt &Decoded)
Waitcnt decodeStorecntDscnt(const IsaVersion &Version, unsigned StorecntDscnt)
Waitcnt decodeLoadcntDscnt(const IsaVersion &Version, unsigned LoadcntDscnt)
unsigned encodeStorecntDscnt(const IsaVersion &Version, const Waitcnt &Decoded)
bool getMUBUFIsBufferInv(unsigned Opc)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
initializer< Ty > init(const Ty &Val)
DXILDebugInfoMap run(Module &M)
bool empty() const
Definition BasicBlock.h:101
LLVM_ABI std::error_code remove(const Twine &path, bool IgnoreNonExisting=true)
Remove path.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
auto seq_inclusive(T Begin, T End)
Iterate over an integral type from Begin to End inclusive.
Definition Sequence.h:325
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
APInt operator&(APInt a, const APInt &b)
Definition APInt.h:2152
auto enum_seq(EnumT Begin, EnumT End)
Iterate over an enum type from Begin up to - but not including - End.
Definition Sequence.h:337
@ Wait
Definition Threading.h:60
static StringRef getCPU(StringRef CPU)
Processes a CPU name.
bool operator!=(uint64_t V1, const APInt &V2)
Definition APInt.h:2142
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
Definition STLExtras.h:2312
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2172
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
char & SIInsertWaitcntsID
@ Async
"Asynchronous" unwind tables (instr precise)
Definition CodeGen.h:157
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1635
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Other
Any other memory.
Definition ModRef.h:68
bool operator&=(SparseBitVector< ElementSize > *LHS, const SparseBitVector< ElementSize > &RHS)
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
bool operator|=(SparseBitVector< ElementSize > &LHS, const SparseBitVector< ElementSize > *RHS)
APInt operator|(APInt a, const APInt &b)
Definition APInt.h:2172
@ Increment
Incrementally increasing token ID.
Definition AllocToken.h:26
FunctionPass * createSIInsertWaitcntsPass()
AAResults AliasAnalysis
Temporary typedef for legacy code that uses a generic AliasAnalysis pointer or reference.
#define N
static constexpr ValueType Default
static constexpr uint64_t encode(Fields... Values)
Represents the hardware counter limits for different wait count types.
Instruction set architecture version.