LLVM 23.0.0git
GCNSubtarget.h
Go to the documentation of this file.
1//=====-- GCNSubtarget.h - Define GCN Subtarget for AMDGPU ------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//==-----------------------------------------------------------------------===//
8//
9/// \file
10/// AMD GCN specific subclass of TargetSubtarget.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
15#define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
16
17#include "AMDGPUCallLowering.h"
19#include "AMDGPUSubtarget.h"
20#include "SIFrameLowering.h"
21#include "SIISelLowering.h"
22#include "SIInstrInfo.h"
26
27#define GET_SUBTARGETINFO_HEADER
28#include "AMDGPUGenSubtargetInfo.inc"
29
30namespace llvm {
31
32class GCNTargetMachine;
33
34/// Module flag names controlling out-of-bounds buffer access semantics.
35/// Each flag is an i32 with Module::Max merge behaviour and tri-state values:
36/// 0 = any (absent/default - backend currently treats as strict)
37/// 1 = relaxed
38/// 2 = strict
39namespace AMDGPUOOBMode {
/// Name of the "amdgpu.buffer.oob.mode" module flag.
40inline constexpr StringLiteral BufferFlag("amdgpu.buffer.oob.mode");
/// Name of the "amdgpu.tbuffer.oob.mode" module flag (presumably the
/// counterpart of BufferFlag for tbuffer accesses - confirm against users).
41inline constexpr StringLiteral TBufferFlag("amdgpu.tbuffer.oob.mode");
42} // namespace AMDGPUOOBMode
43
45 public AMDGPUSubtarget {
46public:
48
49 // Following 2 enums are documented at:
50 // - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
51 enum class TrapHandlerAbi {
52 NONE = 0x00,
53 AMDHSA = 0x01,
54 };
55
56 enum class TrapID {
59 };
60
61private:
62 /// SelectionDAGISel related APIs.
63 std::unique_ptr<const SelectionDAGTargetInfo> TSInfo;
64
65 /// GlobalISel related APIs.
66 std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
67 std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo;
68 std::unique_ptr<InstructionSelector> InstSelector;
69 std::unique_ptr<LegalizerInfo> Legalizer;
70 std::unique_ptr<AMDGPURegisterBankInfo> RegBankInfo;
71
72protected:
73 // Basic subtarget description.
75 unsigned Gen = INVALID;
77 int LDSBankCount = 0;
79
80 // Instruction cache line size in bytes; set from TableGen subtarget features.
81 unsigned InstCacheLineSize = 0;
82
83 // Dynamically set bits that enable features.
84 bool DynamicVGPR = false;
86 bool ScalarizeGlobal = false;
87 const bool BufferOOBRelaxed;
89
90 /// The maximum number of instructions that may be placed within an S_CLAUSE,
91 /// which is one greater than the maximum argument to S_CLAUSE. A value of 0
92 /// indicates a lack of S_CLAUSE support.
93 unsigned MaxHardClauseLength = 0;
94
95#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \
96 bool ATTRIBUTE = DEFAULT;
97#include "AMDGPUGenSubtargetInfo.inc"
98
99private:
100 SIInstrInfo InstrInfo;
101 SITargetLowering TLInfo;
102 SIFrameLowering FrameLowering;
103
104 /// Get the register that represents the actual dependency between the
105 /// definition and the use. The definition might only affect a subregister
106 /// that is not actually used. Works for both virtual and physical registers.
107 /// Note: Currently supports VOP3P instructions (without WMMA and SWMMAC).
108 /// Returns the definition register if there is a real dependency and no
109 /// better match is found.
110 Register getRealSchedDependency(const MachineInstr &DefI, int DefOpIdx,
111 const MachineInstr &UseI, int UseOpIdx) const;
112
113public:
114 GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
115 const GCNTargetMachine &TM, bool BufferOOBRelaxed = false,
116 bool TBufferOOBRelaxed = false);
117 ~GCNSubtarget() override;
118
120 StringRef FS);
121
122 /// Diagnose inconsistent subtarget features before attempting to codegen
123 /// function \p F.
124 void checkSubtargetFeatures(const Function &F) const;
125
126 const SIInstrInfo *getInstrInfo() const override { return &InstrInfo; }
127
128 const SIFrameLowering *getFrameLowering() const override {
129 return &FrameLowering;
130 }
131
132 const SITargetLowering *getTargetLowering() const override { return &TLInfo; }
133
134 const SIRegisterInfo *getRegisterInfo() const override {
135 return &InstrInfo.getRegisterInfo();
136 }
137
138 const SelectionDAGTargetInfo *getSelectionDAGInfo() const override;
139
140 const CallLowering *getCallLowering() const override {
141 return CallLoweringInfo.get();
142 }
143
144 const InlineAsmLowering *getInlineAsmLowering() const override {
145 return InlineAsmLoweringInfo.get();
146 }
147
149 return InstSelector.get();
150 }
151
152 const LegalizerInfo *getLegalizerInfo() const override {
153 return Legalizer.get();
154 }
155
156 const AMDGPURegisterBankInfo *getRegBankInfo() const override {
157 return RegBankInfo.get();
158 }
159
161 return TargetID;
162 }
163
165 return &InstrItins;
166 }
167
169
171
172 bool isGFX11Plus() const { return getGeneration() >= GFX11; }
173
174#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \
175 bool GETTER() const override { return ATTRIBUTE; }
176#include "AMDGPUGenSubtargetInfo.inc"
177
178 unsigned getMaxWaveScratchSize() const {
179 // See COMPUTE_TMPRING_SIZE.WAVESIZE.
180 if (getGeneration() >= GFX12) {
181 // 18-bit field in units of 64-dword.
182 return (64 * 4) * ((1 << 18) - 1);
183 }
184 if (getGeneration() == GFX11) {
185 // 15-bit field in units of 64-dword.
186 return (64 * 4) * ((1 << 15) - 1);
187 }
188 // 13-bit field in units of 256-dword.
189 return (256 * 4) * ((1 << 13) - 1);
190 }
191
192 /// Return the number of high bits known to be zero for a frame index.
196
197 int getLDSBankCount() const { return LDSBankCount; }
198
199 /// Instruction cache line size in bytes (64 for pre-GFX11, 128 for GFX11+).
200 unsigned getInstCacheLineSize() const { return InstCacheLineSize; }
201
202 unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const {
203 return (ForBufferRSrc || !hasFlatScratchEnabled()) ? MaxPrivateElementSize
204 : 16;
205 }
206
207 unsigned getConstantBusLimit(unsigned Opcode) const;
208
209 /// Returns if the result of this instruction with a 16-bit result returned in
210 /// a 32-bit register implicitly zeroes the high 16-bits, rather than preserve
211 /// the original value.
212 bool zeroesHigh16BitsOfDest(unsigned Opcode) const;
213
214 bool supportsWGP() const {
215 if (HasGFX1250Insts)
216 return false;
217 return getGeneration() >= GFX10;
218 }
219
220 bool hasHWFP64() const { return HasFP64; }
221
222 bool hasAddr64() const {
224 }
225
226 bool hasFlat() const {
228 }
229
230 // Return true if the target only has the reverse operand versions of VALU
231 // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32).
232 bool hasOnlyRevVALUShifts() const {
234 }
235
236 bool hasFractBug() const { return getGeneration() == SOUTHERN_ISLANDS; }
237
238 bool hasMed3_16() const { return getGeneration() >= AMDGPUSubtarget::GFX9; }
239
240 bool hasMin3Max3_16() const {
242 }
243
244 bool hasSwap() const { return HasGFX9Insts; }
245
246 bool hasScalarPackInsts() const { return HasGFX9Insts; }
247
248 bool hasScalarMulHiInsts() const { return HasGFX9Insts; }
249
250 bool hasScalarSubwordLoads() const { return getGeneration() >= GFX12; }
251
252 bool hasAsyncMark() const { return hasVMemToLDSLoad() || HasAsynccnt; }
253
257
259 // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets.
260 return getGeneration() >= GFX9;
261 }
262
263 /// True if the offset field of DS instructions works as expected. On SI, the
264 /// offset uses a 16-bit adder and does not always wrap properly.
265 bool hasUsableDSOffset() const { return getGeneration() >= SEA_ISLANDS; }
266
268 return EnableUnsafeDSOffsetFolding;
269 }
270
271 /// Condition output from div_scale is usable.
275
276 /// Extra wait hazard is needed in some cases before
277 /// s_cbranch_vccnz/s_cbranch_vccz.
278 bool hasReadVCCZBug() const { return getGeneration() <= SEA_ISLANDS; }
279
280 /// Writes to VCC_LO/VCC_HI update the VCCZ flag.
281 bool partialVCCWritesUpdateVCCZ() const { return getGeneration() >= GFX10; }
282
283 /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
284 /// was written by a VALU instruction.
287 }
288
289 /// A read of an SGPR by a VMEM instruction requires 5 wait states when the
290 /// SGPR was written by a VALU instruction.
293 }
294
295 bool hasRFEHazards() const { return getGeneration() >= VOLCANIC_ISLANDS; }
296
297 /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
298 unsigned getSetRegWaitStates() const {
299 return getGeneration() <= SEA_ISLANDS ? 1 : 2;
300 }
301
302 /// Return the amount of LDS that can be used that will not restrict the
303 /// occupancy lower than WaveCount.
304 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
305 const Function &) const;
306
309 }
310
311 /// \returns If target supports S_DENORM_MODE.
312 bool hasDenormModeInst() const {
314 }
315
316 /// \returns If target supports ds_read/write_b128 and user enables generation
317 /// of ds_read/write_b128.
318 bool useDS128() const { return HasCIInsts && EnableDS128; }
319
320 /// \return If target supports ds_read/write_b96/128.
321 bool hasDS96AndDS128() const { return HasCIInsts; }
322
323 /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64
324 bool haveRoundOpsF64() const { return HasCIInsts; }
325
326 /// \returns If MUBUF instructions always perform range checking, even for
327 /// buffer resources used for private memory access.
331
332 /// \returns If target requires PRT Strict NULL support (zero result registers
333 /// for sparse texture support).
334 bool usePRTStrictNull() const { return EnablePRTStrictNull; }
335
337 return HasUnalignedBufferAccess && HasUnalignedAccessMode;
338 }
339
341 return HasUnalignedDSAccess && HasUnalignedAccessMode;
342 }
343
345 return HasUnalignedScratchAccess && HasUnalignedAccessMode;
346 }
347
348 bool isXNACKEnabled() const { return TargetID.isXnackOnOrAny(); }
349
350 bool isTgSplitEnabled() const { return EnableTgSplit; }
351
354
355 bool isCuModeEnabled() const { return EnableCuMode; }
356
357 bool isPreciseMemoryEnabled() const { return EnablePreciseMemory; }
358
359 bool hasFlatScrRegister() const { return hasFlatAddressSpace(); }
360
361 // Check if target supports ST addressing mode with FLAT scratch instructions.
362 // The ST addressing mode means no registers are used, either VGPR or SGPR,
363 // but only immediate offset is swizzled and added to the FLAT scratch base.
364 bool hasFlatScratchSTMode() const {
365 return hasFlatScratchInsts() && (hasGFX10_3Insts() || hasGFX940Insts());
366 }
367
368 bool hasFlatScratchSVSMode() const { return HasGFX940Insts || HasGFX11Insts; }
369
371 return hasArchitectedFlatScratch() ||
372 (EnableFlatScratch && hasFlatScratchInsts());
373 }
374
375 bool hasGlobalAddTidInsts() const { return HasGFX10_BEncoding; }
376
377 bool hasAtomicCSub() const { return HasGFX10_BEncoding; }
378
379 bool hasExportInsts() const {
380 return !hasGFX940Insts() && !hasGFX1250Insts();
381 }
382
383 bool hasVINTERPEncoding() const {
384 return HasGFX11Insts && !hasGFX1250Insts();
385 }
386
387 // DS_ADD_F64/DS_ADD_RTN_F64
388 bool hasLdsAtomicAddF64() const {
389 return hasGFX90AInsts() || hasGFX1250Insts();
390 }
391
393 return getGeneration() >= GFX9;
394 }
395
396 bool hasFlatLgkmVMemCountInOrder() const { return getGeneration() > GFX9; }
397
398 bool hasD16LoadStore() const { return getGeneration() >= GFX9; }
399
401 return hasD16LoadStore() && !TargetID.isSramEccOnOrAny();
402 }
403
404 bool hasD16Images() const { return getGeneration() >= VOLCANIC_ISLANDS; }
405
406 /// Return if most LDS instructions have an m0 use that require m0 to be
407 /// initialized.
408 bool ldsRequiresM0Init() const { return getGeneration() < GFX9; }
409
410 // True if the hardware rewinds and replays GWS operations if a wave is
411 // preempted.
412 //
413 // If this is false, a GWS operation requires testing if a nack set the
414 // MEM_VIOL bit, and repeating if so.
415 bool hasGWSAutoReplay() const { return getGeneration() >= GFX9; }
416
417 /// \returns if target has ds_gws_sema_release_all instruction.
418 bool hasGWSSemaReleaseAll() const { return HasCIInsts; }
419
420 bool hasScalarAddSub64() const { return getGeneration() >= GFX12; }
421
422 bool hasScalarSMulU64() const { return getGeneration() >= GFX12; }
423
424 // Covers VS/PS/CS graphics shaders
425 bool isMesaGfxShader(const Function &F) const {
426 return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv());
427 }
428
429 bool hasMad64_32() const { return getGeneration() >= SEA_ISLANDS; }
430
431 bool hasAtomicFaddInsts() const {
432 return HasAtomicFaddRtnInsts || HasAtomicFaddNoRtnInsts;
433 }
434
436 return getGeneration() < SEA_ISLANDS;
437 }
438
439 bool hasInstPrefetch() const {
440 return getGeneration() == GFX10 || getGeneration() == GFX11;
441 }
442
443 bool hasPrefetch() const { return HasGFX12Insts; }
444
445 bool hasInstPrefSize() const { return isGFX11Plus(); }
446
447 void getInstPrefSizeArgs(uint32_t &Mask, uint32_t &Shift, uint32_t &Width,
448 uint32_t &CacheLineSize) const {
451 if (getGeneration() == GFX11) {
452 Mask = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE;
453 Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_SHIFT;
454 Width = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_WIDTH;
455 } else {
456 Mask = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE;
457 Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_SHIFT;
458 Width = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_WIDTH;
459 }
460 }
461
462 // Has s_cmpk_* instructions.
463 bool hasSCmpK() const { return getGeneration() < GFX12; }
464
465 // Scratch is allocated in 256 dword per wave blocks for the entire
466 // wavefront. When viewed from the perspective of an arbitrary workitem, this
467 // is 4-byte aligned.
468 //
469 // Only 4-byte alignment is really needed to access anything. Transformations
470 // on the pointer value itself may rely on the alignment / known low bits of
471 // the pointer. Set this to something above the minimum to avoid needing
472 // dynamic realignment in common cases.
473 Align getStackAlignment() const { return Align(16); }
474
475 bool enableMachineScheduler() const override { return true; }
476
477 bool useAA() const override;
478
479 bool enableSubRegLiveness() const override { return true; }
480
483
484 // static wrappers
485 static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);
486
487 // XXX - Why is this here if it isn't in the default pass set?
488 bool enableEarlyIfConversion() const override { return true; }
489
491 const SchedRegion &Region) const override;
492
494 const SchedRegion &Region) const override;
495
496 void mirFileLoaded(MachineFunction &MF) const override;
497
498 unsigned getMaxNumUserSGPRs() const {
499 return AMDGPU::getMaxNumUserSGPRs(*this);
500 }
501
502 bool useVGPRIndexMode() const;
503
504 bool hasScalarCompareEq64() const {
506 }
507
508 bool hasLDSFPAtomicAddF32() const { return HasGFX8Insts; }
509 bool hasLDSFPAtomicAddF64() const {
510 return HasGFX90AInsts || HasGFX1250Insts;
511 }
512
513 /// \returns true if the subtarget has the v_permlane64_b32 instruction.
514 bool hasPermLane64() const { return getGeneration() >= GFX11; }
515
516 /// \returns true if the subtarget supports the ds_swizzle rotate and FFT
517 /// swizzle modes (GFX9+).
518 bool hasDsSwizzleRotateMode() const { return getGeneration() >= GFX9; }
519
520 bool hasDPPRowShare() const {
521 return HasDPP && (HasGFX90AInsts || getGeneration() >= GFX10);
522 }
523
524 // Has V_PK_MOV_B32 opcode
525 bool hasPkMovB32() const { return HasGFX90AInsts; }
526
528 return getGeneration() >= GFX10 || hasGFX940Insts();
529 }
530
531 bool hasFmaakFmamkF64Insts() const { return hasGFX1250Insts(); }
532
533 bool hasNonNSAEncoding() const { return getGeneration() < GFX12; }
534
535 unsigned getNSAMaxSize(bool HasSampler = false) const {
536 return AMDGPU::getNSAMaxSize(*this, HasSampler);
537 }
538
539 bool hasMadF16() const;
540
541 // Scalar and global loads support scale_offset bit.
542 bool hasScaleOffset() const { return HasGFX1250Insts; }
543
544 // FLAT GLOBAL VOffset is signed
545 bool hasSignedGVSOffset() const { return HasGFX1250Insts; }
546
548
550 return HasUserSGPRInit16Bug && isWave32();
551 }
552
556
557 // \returns true if the subtarget supports DWORDX3 load/store instructions.
558 bool hasDwordx3LoadStores() const { return HasCIInsts; }
559
563
568
571 }
572
575 }
576
578 return HasLDSMisalignedBug && !EnableCuMode;
579 }
580
581 // Shift amount of a 64 bit shift cannot be a highest allocated register
582 // if also at the end of the allocation block.
583 bool hasShift64HighRegBug() const { return HasGFX90AInsts; }
584
585 // Has one cycle hazard on transcendental instruction feeding a
586 // non transcendental VALU.
587 bool hasTransForwardingHazard() const { return HasGFX940Insts; }
588
589 // Has one cycle hazard on a VALU instruction partially writing dst with
590 // a shift of result bits feeding another VALU instruction.
591 bool hasDstSelForwardingHazard() const { return HasGFX940Insts; }
592
593 // Cannot use op_sel with v_dot instructions.
594 bool hasDOTOpSelHazard() const { return HasGFX940Insts || HasGFX11Insts; }
595
596 // Does not have HW interlocks for VALU writing and then reading SGPRs.
597 bool hasVDecCoExecHazard() const { return HasGFX940Insts; }
598
599 bool hasHardClauses() const { return MaxHardClauseLength > 0; }
600
602 return getGeneration() == GFX10;
603 }
604
605 bool hasVOP3DPP() const { return getGeneration() >= GFX11; }
606
607 bool hasLdsDirect() const { return getGeneration() >= GFX11; }
608
609 bool hasLdsWaitVMSRC() const { return getGeneration() >= GFX12; }
610
612 return getGeneration() == GFX11;
613 }
614
615 bool hasCvtScaleForwardingHazard() const { return HasGFX950Insts; }
616
617 // All GFX9 targets experience a fetch delay when an instruction at the start
618 // of a loop header is split by a 32-byte fetch window boundary, but GFX950
619 // is uniquely sensitive to this: the delay triggers further performance
620 // degradation beyond the fetch latency itself.
621 bool hasLoopHeadInstSplitSensitivity() const { return HasGFX950Insts; }
622
623 bool requiresCodeObjectV6() const { return RequiresCOV6; }
624
625 bool useVGPRBlockOpsForCSR() const { return UseBlockVGPROpsForCSR; }
626
627 bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }
628
630 return HasGFX12Insts && !HasGFX1250Insts;
631 }
632
633 bool setRegModeNeedsVNOPs() const {
634 return HasGFX1250Insts && getGeneration() == GFX12;
635 }
636
637 /// Return if operations acting on VGPR tuples require even alignment.
638 bool needsAlignedVGPRs() const { return RequiresAlignVGPR; }
639
640 /// Return true if the target has the S_PACK_HL_B32_B16 instruction.
641 bool hasSPackHL() const { return HasGFX11Insts; }
642
643 /// Return true if the target's EXP instruction has the COMPR flag, which
644 /// affects the meaning of the EN (enable) bits.
645 bool hasCompressedExport() const { return !HasGFX11Insts; }
646
647 /// Return true if the target's EXP instruction supports the NULL export
648 /// target.
649 bool hasNullExportTarget() const { return !HasGFX11Insts; }
650
651 bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; }
652
653 /// Return true if the target has the S_DELAY_ALU instruction.
654 bool hasDelayAlu() const { return HasGFX11Insts; }
655
656 /// Returns true if the target supports
657 /// global_load_lds_dwordx3/global_load_lds_dwordx4 or
658 /// buffer_load_dwordx3/buffer_load_dwordx4 with the lds bit.
659 bool hasLDSLoadB96_B128() const { return hasGFX950Insts(); }
660
661 /// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt
662 /// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively.
663 bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; }
664
665 /// \returns true if the target has packed f32 instructions that only read 32
666 /// bits from a scalar operand (SGPR or literal) and replicates the bits to
667 /// both channels.
669 return getGeneration() == GFX12 && HasGFX1250Insts;
670 }
671
672 bool hasAddPC64Inst() const { return HasGFX1250Insts; }
673
674 /// \returns true if the target supports expert scheduling mode 2 which relies
675 /// on the compiler to insert waits to avoid hazards between VMEM and VALU
676 /// instructions in some instances.
677 bool hasExpertSchedulingMode() const { return getGeneration() >= GFX12; }
678
679 /// \returns The maximum number of instructions that can be enclosed in an
680 /// S_CLAUSE on the given subtarget, or 0 for targets that do not support that
681 /// instruction.
682 unsigned maxHardClauseLength() const { return MaxHardClauseLength; }
683
684 /// Return the maximum number of waves per SIMD for kernels using \p SGPRs
685 /// SGPRs
686 unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
687
688 /// Return the maximum number of waves per SIMD for kernels using \p VGPRs
689 /// VGPRs
690 unsigned getOccupancyWithNumVGPRs(unsigned VGPRs,
691 unsigned DynamicVGPRBlockSize) const;
692
693 /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
694 /// be achieved when the only function running on a CU is \p F, each workgroup
695 /// uses \p LDSSize bytes of LDS, and each wave uses \p NumSGPRs SGPRs and \p
696 /// NumVGPRs VGPRs. The flat workgroup sizes associated to the function are a
697 /// range, so this returns a range as well.
698 ///
699 /// Note that occupancy can be affected by the scratch allocation as well, but
700 /// we do not have enough information to compute it.
701 std::pair<unsigned, unsigned> computeOccupancy(const Function &F,
702 unsigned LDSSize = 0,
703 unsigned NumSGPRs = 0,
704 unsigned NumVGPRs = 0) const;
705
706 /// \returns true if the flat_scratch register should be initialized with the
707 /// pointer to the wave's scratch memory rather than a size and offset.
708 bool flatScratchIsPointer() const {
710 }
711
712 /// \returns true if the machine has merged shaders in which s0-s7 are
713 /// reserved by the hardware and user SGPRs start at s8
714 bool hasMergedShaders() const { return getGeneration() >= GFX9; }
715
716 // \returns true if the target supports the pre-NGG legacy geometry path.
717 bool hasLegacyGeometry() const { return getGeneration() < GFX11; }
718
719 // \returns true if the target has split barriers feature
720 bool hasSplitBarriers() const { return getGeneration() >= GFX12; }
721
722 // \returns true if the target has WG_RR_MODE kernel descriptor mode bit
723 bool hasRrWGMode() const { return getGeneration() >= GFX12; }
724
725 /// \returns true if VADDR and SADDR fields in VSCRATCH can use negative
726 /// values.
727 bool hasSignedScratchOffsets() const { return getGeneration() >= GFX12; }
728
729 bool hasINVWBL2WaitCntRequirement() const { return HasGFX1250Insts; }
730
731 bool hasVOPD3() const { return HasGFX1250Insts; }
732
733 // \returns true if the target has V_PK_{MIN|MAX}3_{I|U}16 instructions.
734 bool hasPkMinMax3Insts() const { return HasGFX1250Insts; }
735
736 // \returns true if target has S_GET_SHADER_CYCLES_U64 instruction.
737 bool hasSGetShaderCyclesInst() const { return HasGFX1250Insts; }
738
739 // \returns true if S_GETPC_B64 zero-extends the result from 48 bits instead
740 // of sign-extending. Note that GFX1250 has not only fixed the bug but also
741 // extended VA to 57 bits.
743 return HasGFX12Insts && !HasGFX1250Insts;
744 }
745
746 // \returns true if the target needs to create a prolog for backward
747 // compatibility when preloading kernel arguments.
749 return hasKernargPreload() && !HasGFX1250Insts;
750 }
751
752 bool hasCondSubInsts() const { return HasGFX12Insts; }
753
754 bool hasSubClampInsts() const { return hasGFX10_3Insts(); }
755
756 bool hasFmaLegacy32Insts() const { return hasGFX10_3Insts(); }
757
758 /// \returns SGPR allocation granularity supported by the subtarget.
759 unsigned getSGPRAllocGranule() const {
761 }
762
763 /// \returns SGPR encoding granularity supported by the subtarget.
764 unsigned getSGPREncodingGranule() const {
766 }
767
768 /// \returns Total number of SGPRs supported by the subtarget.
769 unsigned getTotalNumSGPRs() const {
771 }
772
773 /// \returns Addressable number of SGPRs supported by the subtarget.
774 unsigned getAddressableNumSGPRs() const {
776 }
777
778 /// \returns Minimum number of SGPRs that meets the given number of waves per
779 /// execution unit requirement supported by the subtarget.
780 unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
781 return AMDGPU::IsaInfo::getMinNumSGPRs(*this, WavesPerEU);
782 }
783
784 /// \returns Maximum number of SGPRs that meets the given number of waves per
785 /// execution unit requirement supported by the subtarget.
786 unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
787 return AMDGPU::IsaInfo::getMaxNumSGPRs(*this, WavesPerEU, Addressable);
788 }
789
790 /// \returns Reserved number of SGPRs. This is common
791 /// utility function called by MachineFunction and
792 /// Function variants of getReservedNumSGPRs.
793 unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const;
794 /// \returns Reserved number of SGPRs for given machine function \p MF.
795 unsigned getReservedNumSGPRs(const MachineFunction &MF) const;
796
797 /// \returns Reserved number of SGPRs for given function \p F.
798 unsigned getReservedNumSGPRs(const Function &F) const;
799
800 /// \returns Maximum number of preloaded SGPRs for the subtarget.
801 unsigned getMaxNumPreloadedSGPRs() const;
802
803 /// \returns max num SGPRs. This is the common utility
804 /// function called by MachineFunction and Function
805 /// variants of getMaxNumSGPRs.
806 unsigned getBaseMaxNumSGPRs(const Function &F,
807 std::pair<unsigned, unsigned> WavesPerEU,
808 unsigned PreloadedSGPRs,
809 unsigned ReservedNumSGPRs) const;
810
811 /// \returns Maximum number of SGPRs that meets number of waves per execution
812 /// unit requirement for function \p MF, or number of SGPRs explicitly
813 /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
814 ///
815 /// \returns Value that meets number of waves per execution unit requirement
816 /// if explicitly requested value cannot be converted to integer, violates
817 /// subtarget's specifications, or does not meet number of waves per execution
818 /// unit requirement.
819 unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
820
821 /// \returns Maximum number of SGPRs that meets number of waves per execution
822 /// unit requirement for function \p F, or number of SGPRs explicitly
823 /// requested using "amdgpu-num-sgpr" attribute attached to function \p F.
824 ///
825 /// \returns Value that meets number of waves per execution unit requirement
826 /// if explicitly requested value cannot be converted to integer, violates
827 /// subtarget's specifications, or does not meet number of waves per execution
828 /// unit requirement.
829 unsigned getMaxNumSGPRs(const Function &F) const;
830
831 /// \returns VGPR allocation granularity supported by the subtarget.
832 unsigned getVGPRAllocGranule(unsigned DynamicVGPRBlockSize) const {
833 return AMDGPU::IsaInfo::getVGPRAllocGranule(*this, DynamicVGPRBlockSize);
834 }
835
836 /// \returns VGPR encoding granularity supported by the subtarget.
837 unsigned getVGPREncodingGranule() const {
839 }
840
841 /// \returns Total number of VGPRs supported by the subtarget.
842 unsigned getTotalNumVGPRs() const {
844 }
845
846 /// \returns Addressable number of architectural VGPRs supported by the
847 /// subtarget.
851
852 /// \returns Addressable number of VGPRs supported by the subtarget.
853 unsigned getAddressableNumVGPRs(unsigned DynamicVGPRBlockSize) const {
854 return AMDGPU::IsaInfo::getAddressableNumVGPRs(*this, DynamicVGPRBlockSize);
855 }
856
857 /// \returns the minimum number of VGPRs that will prevent achieving more than
858 /// the specified number of waves \p WavesPerEU.
859 unsigned getMinNumVGPRs(unsigned WavesPerEU,
860 unsigned DynamicVGPRBlockSize) const {
861 return AMDGPU::IsaInfo::getMinNumVGPRs(*this, WavesPerEU,
862 DynamicVGPRBlockSize);
863 }
864
865 /// \returns the maximum number of VGPRs that can be used and still achieved
866 /// at least the specified number of waves \p WavesPerEU.
867 unsigned getMaxNumVGPRs(unsigned WavesPerEU,
868 unsigned DynamicVGPRBlockSize) const {
869 return AMDGPU::IsaInfo::getMaxNumVGPRs(*this, WavesPerEU,
870 DynamicVGPRBlockSize);
871 }
872
873 /// \returns max num VGPRs. This is the common utility function
874 /// called by MachineFunction and Function variants of getMaxNumVGPRs.
875 unsigned
877 std::pair<unsigned, unsigned> NumVGPRBounds) const;
878
879 /// \returns Maximum number of VGPRs that meets number of waves per execution
880 /// unit requirement for function \p F, or number of VGPRs explicitly
881 /// requested using "amdgpu-num-vgpr" attribute attached to function \p F.
882 ///
883 /// \returns Value that meets number of waves per execution unit requirement
884 /// if explicitly requested value cannot be converted to integer, violates
885 /// subtarget's specifications, or does not meet number of waves per execution
886 /// unit requirement.
887 unsigned getMaxNumVGPRs(const Function &F) const;
888
889 unsigned getMaxNumAGPRs(const Function &F) const { return getMaxNumVGPRs(F); }
890
891 /// Return a pair of maximum numbers of VGPRs and AGPRs that meet the number
892 /// of waves per execution unit required for the function \p F.
893 std::pair<unsigned, unsigned> getMaxNumVectorRegs(const Function &F) const;
894
895 /// \returns Maximum number of VGPRs that meets number of waves per execution
896 /// unit requirement for function \p MF, or number of VGPRs explicitly
897 /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
898 ///
899 /// \returns Value that meets number of waves per execution unit requirement
900 /// if explicitly requested value cannot be converted to integer, violates
901 /// subtarget's specifications, or does not meet number of waves per execution
902 /// unit requirement.
903 unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
904
905 bool supportsWave32() const { return getGeneration() >= GFX10; }
906
907 bool supportsWave64() const { return !hasGFX1250Insts() || HasGFX13Insts; }
908
909 bool isWave32() const { return getWavefrontSize() == 32; }
910
911 bool isWave64() const { return getWavefrontSize() == 64; }
912
913 /// Returns if the wavesize of this subtarget is known reliable. This is false
914 /// only for the a default target-cpu that does not have an explicit
915 /// +wavefrontsize target feature.
916 bool isWaveSizeKnown() const {
917 return hasFeature(AMDGPU::FeatureWavefrontSize32) ||
918 hasFeature(AMDGPU::FeatureWavefrontSize64);
919 }
920
922 return getRegisterInfo()->getBoolRC();
923 }
924
925 /// \returns Maximum number of work groups per compute unit supported by the
926 /// subtarget and limited by given \p FlatWorkGroupSize.
927 unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
928 return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(*this, FlatWorkGroupSize);
929 }
930
931 /// \returns Minimum flat work group size supported by the subtarget.
932 unsigned getMinFlatWorkGroupSize() const override {
934 }
935
936 /// \returns Maximum flat work group size supported by the subtarget.
937 unsigned getMaxFlatWorkGroupSize() const override {
939 }
940
941 /// \returns Number of waves per execution unit required to support the given
942 /// \p FlatWorkGroupSize.
943 unsigned
944 getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override {
945 return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(*this, FlatWorkGroupSize);
946 }
947
948 /// \returns Minimum number of waves per execution unit supported by the
949 /// subtarget.
950 unsigned getMinWavesPerEU() const override {
952 }
953
/// Override of the base-class scheduling-dependency hook; only the
/// declaration appears here (no body at this point in the header).
/// Takes the defining and using scheduling units with their operand indices,
/// the dependency \p Dep by mutable reference, and the scheduling model.
/// NOTE(review): presumably updates \p Dep in place - confirm against the
/// definition.
954 void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
955 SDep &Dep,
956 const TargetSchedModel *SchedModel) const override;
957
958 // \returns true if it's beneficial on this subtarget for the scheduler to
959 // cluster stores as well as loads.
960 bool shouldClusterStores() const { return getGeneration() >= GFX11; }
961
962 // \returns the number of address arguments from which to enable MIMG NSA
963 // on supported architectures.
// Declared here without a body; the value is computed per function \p MF.
964 unsigned getNSAThreshold(const MachineFunction &MF) const;
965
966 // \returns true if the subtarget has a hazard requiring an "s_nop 0"
967 // instruction before "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)".
968 bool requiresNopBeforeDeallocVGPRs() const { return !HasGFX1250Insts; }
969
970 // \returns true if the subtarget needs S_WAIT_ALU 0 before S_GETREG_B32 on
971 // STATUS, STATE_PRIV, EXCP_FLAG_PRIV, or EXCP_FLAG_USER.
972 bool requiresWaitIdleBeforeGetReg() const { return HasGFX1250Insts; }
973
974 bool isDynamicVGPREnabled() const { return DynamicVGPR; }
975 unsigned getDynamicVGPRBlockSize() const {
976 return DynamicVGPRBlockSize32 ? 32 : 16;
977 }
978
980 // AMDGPU doesn't care if early-clobber and undef operands are allocated
981 // to the same register.
982 return false;
983 }
984
985 // DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 shall not be claused with anything
986 // and must be surrounded by S_WAIT_ALU(0xFFE3).
988 return getGeneration() == GFX12;
989 }
990
991 // Requires s_wait_alu(0) after s102/s103 write and src_flat_scratch_base
992 // read.
994 return HasGFX1250Insts && getGeneration() == GFX12;
995 }
996
997 // src_flat_scratch_hi cannot be used as a source in SALU producing a 64-bit
998 // result.
1000 return HasGFX1250Insts && getGeneration() == GFX12;
1001 }
1002
1003 /// \returns true if the subtarget requires a wait for xcnt before VMEM
1004 /// accesses that must never be repeated in the event of a page fault/re-try.
1005 /// Atomic stores/rmw and all volatile accesses fall under this criterion.
1007 return HasGFX1250Insts;
1008 }
1009
1010 /// \returns the number of significant bits in the immediate field of the
1011 /// S_NOP instruction.
1012 unsigned getSNopBits() const {
1014 return 7;
1016 return 4;
1017 return 3;
1018 }
1019
1023
1025 return (getGeneration() <= AMDGPUSubtarget::GFX9 ||
1027 isWave32();
1028 }
1029
1030 /// Return true if real (non-fake) variants of True16 instructions using
1031 /// 16-bit registers should be code-generated. Fake True16 instructions are
1032 /// identical to non-fake ones except that they take 32-bit registers as
1033 /// operands and always use their low halves.
1034 // TODO: Remove and use hasTrue16BitInsts() instead once True16 is fully
1035 // supported and the support for fake True16 instructions is removed.
1036 bool useRealTrue16Insts() const {
1037 return hasTrue16BitInsts() && EnableRealTrue16Insts;
1038 }
1039
1041 return getGeneration() >= GFX10 || isTgSplitEnabled();
1042 }
1043};
1044
1046public:
1047 bool hasImplicitBufferPtr() const { return ImplicitBufferPtr; }
1048
1049 bool hasPrivateSegmentBuffer() const { return PrivateSegmentBuffer; }
1050
1051 bool hasDispatchPtr() const { return DispatchPtr; }
1052
1053 bool hasQueuePtr() const { return QueuePtr; }
1054
1055 bool hasKernargSegmentPtr() const { return KernargSegmentPtr; }
1056
1057 bool hasDispatchID() const { return DispatchID; }
1058
1059 bool hasFlatScratchInit() const { return FlatScratchInit; }
1060
1061 bool hasPrivateSegmentSize() const { return PrivateSegmentSize; }
1062
1063 unsigned getNumKernargPreloadSGPRs() const { return NumKernargPreloadSGPRs; }
1064
1065 unsigned getNumUsedUserSGPRs() const { return NumUsedUserSGPRs; }
1066
/// Declared here without a body. Unlike the accessors above it, this member
/// is not const-qualified.
/// NOTE(review): presumably the number of user SGPRs still available -
/// confirm against the definition.
1067 unsigned getNumFreeUserSGPRs();
1068
/// Declared here without a body; non-const, so it may modify this object.
/// NOTE(review): presumably accounts \p NumSGPRs towards the kernarg preload
/// and used user SGPR counters - confirm against the definition.
1069 void allocKernargPreloadSGPRs(unsigned NumSGPRs);
1070
1081
1082 // Returns the size in number of SGPRs for preload user SGPR field.
1084 switch (ID) {
1086 return 2;
1088 return 4;
1089 case DispatchPtrID:
1090 return 2;
1091 case QueuePtrID:
1092 return 2;
1094 return 2;
1095 case DispatchIdID:
1096 return 2;
1097 case FlatScratchInitID:
1098 return 2;
1100 return 1;
1101 }
1102 llvm_unreachable("Unknown UserSGPRID.");
1103 }
1104
/// Constructs the usage info from a function \p F and a subtarget \p ST.
/// Declared here without a body.
/// NOTE(review): presumably \p ST initializes the ST reference member and the
/// flags are derived from \p F - confirm against the definition.
1105 GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST);
1106
1107private:
// Reference to a GCNSubtarget; presumably the one handed to the constructor
// (confirm against the definition).
1108 const GCNSubtarget &ST;
1109
1110 // Private memory buffer
1111 // Compute directly in sgpr[0:1]
1112 // Other shaders indirect 64-bits at sgpr[0:1]
// Defaults to false; read through hasImplicitBufferPtr().
1113 bool ImplicitBufferPtr = false;
1114
// Defaults to false; read through hasPrivateSegmentBuffer().
1115 bool PrivateSegmentBuffer = false;
1116
// Defaults to false; read through hasDispatchPtr().
1117 bool DispatchPtr = false;
1118
// Defaults to false; read through hasQueuePtr().
1119 bool QueuePtr = false;
1120
// Defaults to false; read through hasKernargSegmentPtr().
1121 bool KernargSegmentPtr = false;
1122
// Defaults to false; read through hasDispatchID().
1123 bool DispatchID = false;
1124
// Defaults to false; read through hasFlatScratchInit().
1125 bool FlatScratchInit = false;
1126
// Defaults to false; read through hasPrivateSegmentSize().
1127 bool PrivateSegmentSize = false;
1128
// Defaults to 0; read through getNumKernargPreloadSGPRs().
1129 unsigned NumKernargPreloadSGPRs = 0;
1130
// Defaults to 0; read through getNumUsedUserSGPRs().
1131 unsigned NumUsedUserSGPRs = 0;
1132};
1133
1134} // end namespace llvm
1135
1136#endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static cl::opt< bool > EnableLoadStoreOpt("aarch64-enable-ldst-opt", cl::desc("Enable the load/store pair" " optimization pass"), cl::init(true), cl::Hidden)
This file describes how to lower LLVM calls to machine code calls.
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
Base class for AMDGPU specific classes of TargetSubtarget.
AMDHSA kernel descriptor definitions.
#define F(x, y, z)
Definition MD5.cpp:54
Promote Memory to Register
Definition Mem2Reg.cpp:110
SI DAG Lowering interface definition.
Interface definition for SIInstrInfo.
static cl::opt< unsigned > CacheLineSize("cache-line-size", cl::init(0), cl::Hidden, cl::desc("Use this to override the target cache line size when " "specified by the user."))
unsigned getWavefrontSizeLog2() const
AMDGPUSubtarget(const Triple &TT)
unsigned getMaxWavesPerEU() const
unsigned getWavefrontSize() const
bool hasPrefetch() const
bool hasFlat() const
bool hasD16Images() const
InstrItineraryData InstrItins
bool useVGPRIndexMode() const
bool partialVCCWritesUpdateVCCZ() const
Writes to VCC_LO/VCC_HI update the VCCZ flag.
bool hasSwap() const
bool hasPkMinMax3Insts() const
bool hasD16LoadStore() const
bool hasMergedShaders() const
bool hasRrWGMode() const
bool hasScalarCompareEq64() const
int getLDSBankCount() const
bool hasOnlyRevVALUShifts() const
bool hasNonNSAEncoding() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
bool hasExpertSchedulingMode() const
void mirFileLoaded(MachineFunction &MF) const override
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
bool loadStoreOptEnabled() const
bool enableSubRegLiveness() const override
unsigned getSGPRAllocGranule() const
bool hasLdsAtomicAddF64() const
bool hasFlatLgkmVMemCountInOrder() const
bool flatScratchIsPointer() const
bool requiresWaitOnWorkgroupReleaseFence() const
bool hasShift64HighRegBug() const
unsigned MaxPrivateElementSize
bool unsafeDSOffsetFoldingEnabled() const
bool hasFPAtomicToDenormModeHazard() const
unsigned getAddressableNumArchVGPRs() const
bool vmemWriteNeedsExpWaitcnt() const
bool shouldClusterStores() const
unsigned getMinNumSGPRs(unsigned WavesPerEU) const
bool hasUserSGPRInit16BugInWave32() const
unsigned getSGPREncodingGranule() const
void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS)
bool hasCompressedExport() const
Return true if the target's EXP instruction has the COMPR flag, which affects the meaning of the EN (...
bool hasFlatScratchHiInB64InstHazard() const
bool hasDstSelForwardingHazard() const
void setScalarizeGlobalBehavior(bool b)
bool hasFlatScratchEnabled() const
bool hasRelaxedBufferOOBMode() const
unsigned getSNopBits() const
bool hasLDSLoadB96_B128() const
Returns true if the target supports global_load_lds_dwordx3/global_load_lds_dwordx4 or buffer_load_dw...
bool hasMultiDwordFlatScratchAddressing() const
bool hasFmaakFmamkF64Insts() const
bool hasDsSwizzleRotateMode() const
bool hasHWFP64() const
bool hasScaleOffset() const
bool hasDenormModeInst() const
bool hasCvtScaleForwardingHazard() const
unsigned getTotalNumVGPRs() const
unsigned getMinWavesPerEU() const override
bool hasUnalignedDSAccessEnabled() const
GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, const GCNTargetMachine &TM, bool BufferOOBRelaxed=false, bool TBufferOOBRelaxed=false)
const SIInstrInfo * getInstrInfo() const override
unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override
unsigned getConstantBusLimit(unsigned Opcode) const
bool hasVALUMaskWriteHazard() const
bool hasCondSubInsts() const
const InlineAsmLowering * getInlineAsmLowering() const override
unsigned getTotalNumSGPRs() const
const InstrItineraryData * getInstrItineraryData() const override
void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep, const TargetSchedModel *SchedModel) const override
void overridePostRASchedPolicy(MachineSchedPolicy &Policy, const SchedRegion &Region) const override
unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, const Function &) const
Return the amount of LDS that can be used that will not restrict the occupancy lower than WaveCount.
bool hasPkMovB32() const
bool needsAlignedVGPRs() const
Return if operations acting on VGPR tuples require even alignment.
Align getStackAlignment() const
bool privateMemoryResourceIsRangeChecked() const
bool hasScalarSubwordLoads() const
const bool BufferOOBRelaxed
bool hasMadF16() const
bool hasDsAtomicAsyncBarrierArriveB64PipeBug() const
unsigned getInstCacheLineSize() const
Instruction cache line size in bytes (64 for pre-GFX11, 128 for GFX11+).
bool hasLoopHeadInstSplitSensitivity() const
bool hasDwordx3LoadStores() const
bool hasSignedScratchOffsets() const
bool hasGlobalAddTidInsts() const
bool hasFlatScrRegister() const
bool hasGetPCZeroExtension() const
bool hasPermLane64() const
bool requiresNopBeforeDeallocVGPRs() const
unsigned getMinNumVGPRs(unsigned WavesPerEU, unsigned DynamicVGPRBlockSize) const
bool supportsGetDoorbellID() const
bool supportsWave32() const
bool isTgSplitEnabled() const
unsigned getMaxNumAGPRs(const Function &F) const
bool hasReadM0MovRelInterpHazard() const
bool isDynamicVGPREnabled() const
bool hasInstPrefSize() const
const SIRegisterInfo * getRegisterInfo() const override
bool hasDOTOpSelHazard() const
bool hasLdsWaitVMSRC() const
const TargetRegisterClass * getBoolRC() const
unsigned getBaseMaxNumVGPRs(const Function &F, std::pair< unsigned, unsigned > NumVGPRBounds) const
bool hasFmaakFmamkF32Insts() const
bool hasMad64_32() const
InstructionSelector * getInstructionSelector() const override
unsigned getVGPREncodingGranule() const
bool hasHardClauses() const
bool useDS128() const
bool hasExtendedWaitCounts() const
bool d16PreservesUnusedBits() const
bool hasInstPrefetch() const
bool hasAddPC64Inst() const
unsigned maxHardClauseLength() const
bool isMesaGfxShader(const Function &F) const
bool hasExportInsts() const
bool hasVINTERPEncoding() const
const AMDGPURegisterBankInfo * getRegBankInfo() const override
bool hasLegacyGeometry() const
TrapHandlerAbi getTrapHandlerAbi() const
bool isCuModeEnabled() const
const SIFrameLowering * getFrameLowering() const override
bool hasDPPRowShare() const
bool zeroesHigh16BitsOfDest(unsigned Opcode) const
Returns if the result of this instruction with a 16-bit result returned in a 32-bit register implicit...
unsigned getBaseMaxNumSGPRs(const Function &F, std::pair< unsigned, unsigned > WavesPerEU, unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const
const AMDGPU::IsaInfo::AMDGPUTargetID & getTargetID() const
unsigned getMaxNumPreloadedSGPRs() const
GCNSubtarget & initializeSubtargetDependencies(const Triple &TT, StringRef GPU, StringRef FS)
bool has12DWordStoreHazard() const
bool hasVALUPartialForwardingHazard() const
void overrideSchedPolicy(MachineSchedPolicy &Policy, const SchedRegion &Region) const override
bool useVGPRBlockOpsForCSR() const
std::pair< unsigned, unsigned > computeOccupancy(const Function &F, unsigned LDSSize=0, unsigned NumSGPRs=0, unsigned NumVGPRs=0) const
Subtarget's minimum/maximum occupancy, in number of waves per EU, that can be achieved when the only ...
bool needsKernArgPreloadProlog() const
bool hasMin3Max3_16() const
unsigned getMaxNumVGPRs(unsigned WavesPerEU, unsigned DynamicVGPRBlockSize) const
unsigned getVGPRAllocGranule(unsigned DynamicVGPRBlockSize) const
unsigned getSetRegWaitStates() const
Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
const SITargetLowering * getTargetLowering() const override
bool hasTransForwardingHazard() const
bool enableMachineScheduler() const override
bool hasLDSFPAtomicAddF64() const
unsigned getNSAThreshold(const MachineFunction &MF) const
bool getScalarizeGlobalBehavior() const
bool hasPKF32InstsReplicatingLower32BitsOfScalarInput() const
bool hasReadM0LdsDmaHazard() const
bool hasScalarSMulU64() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasScratchBaseForwardingHazard() const
bool hasRelaxedTBufferOOBMode() const
bool hasScalarPackInsts() const
bool requiresDisjointEarlyClobberAndUndef() const override
bool hasVALUReadSGPRHazard() const
bool usePRTStrictNull() const
unsigned getAddressableNumVGPRs(unsigned DynamicVGPRBlockSize) const
bool supportsWaveWideBPermute() const
bool hasMed3_16() const
unsigned getReservedNumSGPRs(const MachineFunction &MF) const
bool hasUnalignedScratchAccessEnabled() const
bool hasNullExportTarget() const
Return true if the target's EXP instruction supports the NULL export target.
bool ldsRequiresM0Init() const
Return if most LDS instructions have an m0 use that require m0 to be initialized.
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
const bool TBufferOOBRelaxed
bool useAA() const override
bool isWave32() const
bool isGFX11Plus() const
unsigned getOccupancyWithNumVGPRs(unsigned VGPRs, unsigned DynamicVGPRBlockSize) const
Return the maximum number of waves per SIMD for kernels using VGPRs VGPRs.
bool hasUnalignedBufferAccessEnabled() const
bool isWaveSizeKnown() const
Returns if the wavesize of this subtarget is known reliable.
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
unsigned getMinFlatWorkGroupSize() const override
bool hasAsyncMark() const
bool hasSPackHL() const
Return true if the target has the S_PACK_HL_B32_B16 instruction.
bool supportsMinMaxDenormModes() const
bool supportsWave64() const
bool supportsBPermute() const
bool hasFlatScratchSVSMode() const
unsigned InstCacheLineSize
bool supportsWGP() const
bool hasAtomicFaddInsts() const
bool hasSubClampInsts() const
bool requiresWaitXCntForSingleAccessInstructions() const
unsigned getNSAMaxSize(bool HasSampler=false) const
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const
Return the maximum number of waves per SIMD for kernels using SGPRs SGPRs.
bool hasVOP3DPP() const
void getInstPrefSizeArgs(uint32_t &Mask, uint32_t &Shift, uint32_t &Width, uint32_t &CacheLineSize) const
unsigned getMaxFlatWorkGroupSize() const override
unsigned getMaxNumUserSGPRs() const
unsigned MaxHardClauseLength
The maximum number of instructions that may be placed within an S_CLAUSE, which is one greater than t...
bool hasFlatScratchSVSSwizzleBug() const
bool hasVDecCoExecHazard() const
bool hasSignedGVSOffset() const
bool hasLDSFPAtomicAddF32() const
unsigned getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override
bool haveRoundOpsF64() const
Have v_trunc_f64, v_ceil_f64, v_rndne_f64.
bool hasDelayAlu() const
Return true if the target has the S_DELAY_ALU instruction.
bool hasReadM0SendMsgHazard() const
bool hasScalarMulHiInsts() const
bool hasSCmpK() const
const LegalizerInfo * getLegalizerInfo() const override
bool requiresWaitIdleBeforeGetReg() const
bool hasDS96AndDS128() const
bool hasFmaLegacy32Insts() const
bool hasReadM0LdsDirectHazard() const
static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI)
Generation getGeneration() const
unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const
std::pair< unsigned, unsigned > getMaxNumVectorRegs(const Function &F) const
Return a pair of maximum numbers of VGPRs and AGPRs that meet the number of waves per execution unit ...
bool isXNACKEnabled() const
bool hasScalarAddSub64() const
bool hasSplitBarriers() const
bool enableEarlyIfConversion() const override
bool hasSMRDReadVALUDefHazard() const
A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR was written by a VALU inst...
bool hasSGetShaderCyclesInst() const
bool hasINVWBL2WaitCntRequirement() const
bool hasRFEHazards() const
bool hasVMEMReadSGPRVALUDefHazard() const
A read of an SGPR by a VMEM instruction requires 5 wait states when the SGPR was written by a VALU In...
bool hasFlatScratchSTMode() const
unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const
bool hasGWSSemaReleaseAll() const
bool hasAddr64() const
unsigned getAddressableNumSGPRs() const
bool hasReadVCCZBug() const
Extra wait hazard is needed in some cases before s_cbranch_vccnz/s_cbranch_vccz.
bool isWave64() const
unsigned getDynamicVGPRBlockSize() const
bool setRegModeNeedsVNOPs() const
bool hasFractBug() const
bool isPreciseMemoryEnabled() const
unsigned getMaxWaveScratchSize() const
bool hasLDSMisalignedBugInWGPMode() const
void checkSubtargetFeatures(const Function &F) const
Diagnose inconsistent subtarget features before attempting to codegen function F.
~GCNSubtarget() override
const SelectionDAGTargetInfo * getSelectionDAGInfo() const override
bool hasVOPD3() const
bool hasAtomicCSub() const
AMDGPU::IsaInfo::AMDGPUTargetID TargetID
bool requiresCodeObjectV6() const
const CallLowering * getCallLowering() const override
bool hasLdsDirect() const
bool hasGWSAutoReplay() const
static unsigned getNumUserSGPRForField(UserSGPRID ID)
void allocKernargPreloadSGPRs(unsigned NumSGPRs)
bool hasPrivateSegmentBuffer() const
unsigned getNumKernargPreloadSGPRs() const
unsigned getNumUsedUserSGPRs() const
GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST)
Itinerary data supplied by a subtarget to be used by a target.
Scheduling dependency.
Definition ScheduleDAG.h:51
const TargetRegisterClass * getBoolRC() const
Scheduling unit. This is a node in the scheduling DAG.
Targets can subclass this to parameterize the SelectionDAG lowering and instruction selection process...
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition StringRef.h:882
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
Provide an instruction scheduling machine model to CodeGen passes.
TargetSubtargetInfo - Generic base class for all target subtargets.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Module flag names controlling out-of-bounds buffer access semantics.
constexpr StringLiteral BufferFlag("amdgpu.buffer.oob.mode")
constexpr StringLiteral TBufferFlag("amdgpu.tbuffer.oob.mode")
unsigned getSGPRAllocGranule(const MCSubtargetInfo &STI)
unsigned getAddressableNumArchVGPRs(const MCSubtargetInfo &STI)
unsigned getTotalNumSGPRs(const MCSubtargetInfo &STI)
unsigned getAddressableNumSGPRs(const MCSubtargetInfo &STI)
unsigned getMinFlatWorkGroupSize(const MCSubtargetInfo &STI)
unsigned getVGPREncodingGranule(const MCSubtargetInfo &STI, std::optional< bool > EnableWavefrontSize32)
unsigned getMaxWorkGroupsPerCU(const MCSubtargetInfo &STI, unsigned FlatWorkGroupSize)
unsigned getMinNumSGPRs(const MCSubtargetInfo &STI, unsigned WavesPerEU)
unsigned getMaxNumSGPRs(const MCSubtargetInfo &STI, unsigned WavesPerEU, bool Addressable)
unsigned getWavesPerEUForWorkGroup(const MCSubtargetInfo &STI, unsigned FlatWorkGroupSize)
constexpr unsigned getMaxFlatWorkGroupSize()
unsigned getSGPREncodingGranule(const MCSubtargetInfo &STI)
unsigned getTotalNumVGPRs(const MCSubtargetInfo &STI)
unsigned getMinNumVGPRs(const MCSubtargetInfo &STI, unsigned WavesPerEU, unsigned DynamicVGPRBlockSize)
unsigned getAddressableNumVGPRs(const MCSubtargetInfo &STI, unsigned DynamicVGPRBlockSize)
unsigned getMaxNumVGPRs(const MCSubtargetInfo &STI, unsigned WavesPerEU, unsigned DynamicVGPRBlockSize)
unsigned getVGPRAllocGranule(const MCSubtargetInfo &STI, unsigned DynamicVGPRBlockSize, std::optional< bool > EnableWavefrontSize32)
unsigned getMinWavesPerEU(const MCSubtargetInfo &STI)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
unsigned getMaxNumUserSGPRs(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
This is an optimization pass for GlobalISel generic memory operations.
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition bit.h:263
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Define a generic scheduling policy for targets that don't provide their own MachineSchedStrategy.
A region of an MBB for scheduling.