LLVM 23.0.0git
GCNSubtarget.cpp
Go to the documentation of this file.
//===-- GCNSubtarget.cpp - GCN Subtarget Information ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the GCN specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//
13
14#include "GCNSubtarget.h"
15#include "AMDGPUCallLowering.h"
17#include "AMDGPULegalizerInfo.h"
20#include "AMDGPUTargetMachine.h"
28#include "llvm/IR/MDBuilder.h"
29#include <algorithm>
30
31using namespace llvm;
32
33#define DEBUG_TYPE "gcn-subtarget"
34
35#define GET_SUBTARGETINFO_TARGET_DESC
36#define GET_SUBTARGETINFO_CTOR
37#define AMDGPUSubtarget GCNSubtarget
38#include "AMDGPUGenSubtargetInfo.inc"
39#undef AMDGPUSubtarget
40
42 "amdgpu-vgpr-index-mode",
43 cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
44 cl::init(false));
45
46static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
47 cl::desc("Enable the use of AA during codegen."),
48 cl::init(true));
49
51 NSAThreshold("amdgpu-nsa-threshold",
52 cl::desc("Number of addresses from which to enable MIMG NSA."),
54
56
// NOTE(review): this listing is a doxygen scrape — every line carries a fused
// source line number and several source lines were dropped outright. The code
// below is preserved byte-for-byte; each elision is flagged inline. The
// function header itself (source line 57,
// "GCNSubtarget &GCNSubtarget::initializeSubtargetDependencies(const Triple
// &TT," per the cross-reference index) was dropped. The function builds the
// full feature string from defaults + FS, parses it, then patches up
// inter-dependent defaults, and returns *this so it can be used in the
// constructor's member-init list.
58 StringRef GPU,
59 StringRef FS) {
60 // Determine default and user-specified characteristics
61 //
62 // We want to be able to turn these off, but making this a subtarget feature
63 // for SI has the unhelpful behavior that it unsets everything else if you
64 // disable it.
65 //
66 // Similarly we want enable-prt-strict-null to be on by default and not to
67 // unset everything else if it is disabled
68
69 SmallString<256> FullFS("+load-store-opt,+enable-ds128,");
70
71 // Turn on features that HSA ABI requires. Also turn on FlatForGlobal by
72 // default
73 if (isAmdHsaOS())
74 FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";
75
76 FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
77
78 // Disable mutually exclusive bits.
79 if (FS.contains_insensitive("+wavefrontsize")) {
80 if (!FS.contains_insensitive("wavefrontsize16"))
81 FullFS += "-wavefrontsize16,";
82 if (!FS.contains_insensitive("wavefrontsize32"))
83 FullFS += "-wavefrontsize32,";
84 if (!FS.contains_insensitive("wavefrontsize64"))
85 FullFS += "-wavefrontsize64,";
86 }
87
// User FS is appended last so explicit user features override the defaults
// seeded above.
88 FullFS += FS;
89
90 ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);
91
92 // Implement the "generic" processors, which acts as the default when no
93 // generation features are enabled (e.g for -mcpu=''). HSA OS defaults to
94 // the first amdgcn target that supports flat addressing. Other OSes defaults
95 // to the first amdgcn target.
// NOTE(review): source lines 96-98 were dropped by the extraction here
// (presumably the branch selecting a default generation for the generic
// processor) — restore from upstream.
99 // Assume wave64 for the unknown target, if not explicitly set.
100 if (getWavefrontSizeLog2() == 0)
// NOTE(review): source line 101 was dropped (the guarded wave64 default
// assignment) — restore from upstream.
102 } else if (!hasFeature(AMDGPU::FeatureWavefrontSize32) &&
103 !hasFeature(AMDGPU::FeatureWavefrontSize64)) {
104 // If there is no default wave size it must be a generation before gfx10,
105 // these have FeatureWavefrontSize64 in their definition already. For gfx10+
106 // set wave32 as a default.
107 ToggleFeature(AMDGPU::FeatureWavefrontSize32);
// NOTE(review): source line 108 was dropped here — restore from upstream.
109 }
110
111 // We don't support FP64 for EG/NI atm.
// NOTE(review): source line 112 was dropped (presumably an assertion about
// FP64 support) — restore from upstream.
113
114 // Targets must either support 64-bit offsets for MUBUF instructions, and/or
115 // support flat operations, otherwise they cannot access a 64-bit global
116 // address space
117 assert(hasAddr64() || hasFlat());
118 // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
119 // that do not support ADDR64 variants of MUBUF instructions. Such targets
120 // cannot use a 64 bit offset with a MUBUF instruction to access the global
121 // address space
122 if (!hasAddr64() && !FS.contains("flat-for-global") && !UseFlatForGlobal) {
123 ToggleFeature(AMDGPU::FeatureUseFlatForGlobal);
124 UseFlatForGlobal = true;
125 }
126 // Unless +-flat-for-global is specified, use MUBUF instructions for global
127 // address space access if flat operations are not available.
128 if (!hasFlat() && !FS.contains("flat-for-global") && UseFlatForGlobal) {
129 ToggleFeature(AMDGPU::FeatureUseFlatForGlobal);
130 UseFlatForGlobal = false;
131 }
132
133 // Set defaults if needed.
134 if (MaxPrivateElementSize == 0)
// NOTE(review): source line 135 was dropped (the MaxPrivateElementSize
// default assignment guarded by the 'if' above) — restore from upstream.
136
137 if (LDSBankCount == 0)
138 LDSBankCount = 32;
139
// NOTE(review): source lines 140-141 were dropped here — restore upstream.
142
143 if (FlatOffsetBitWidth == 0)
// NOTE(review): source line 144 was dropped (the FlatOffsetBitWidth default
// assignment guarded by the 'if' above) — restore from upstream.
145
// NOTE(review): source line 146 was dropped here — restore from upstream.
147
// NOTE(review): source lines 148-149 were dropped here — restore upstream.
150
// Derive the xnack/sramecc target-ID settings from the raw feature string.
151 TargetID.setTargetIDFromFeaturesString(FS);
152
153 LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
154 << TargetID.getXnackSetting() << '\n');
155 LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
156 << TargetID.getSramEccSetting() << '\n');
157
158 return *this;
159}
160
162 LLVMContext &Ctx = F.getContext();
163 if (hasFeature(AMDGPU::FeatureWavefrontSize32) &&
164 hasFeature(AMDGPU::FeatureWavefrontSize64)) {
165 Ctx.diagnose(DiagnosticInfoUnsupported(
166 F, "must specify exactly one of wavefrontsize32 and wavefrontsize64"));
167 }
168}
169
171 const GCNTargetMachine &TM)
172 : // clang-format off
173 AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
174 AMDGPUSubtarget(TT),
175 TargetID(*this),
176 InstrItins(getInstrItineraryForCPU(GPU)),
177 InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
178 TLInfo(TM, *this),
179 FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
180 // clang-format on
183
184 TSInfo = std::make_unique<AMDGPUSelectionDAGInfo>();
185
186 CallLoweringInfo = std::make_unique<AMDGPUCallLowering>(*getTargetLowering());
187 InlineAsmLoweringInfo =
188 std::make_unique<InlineAsmLowering>(getTargetLowering());
189 Legalizer = std::make_unique<AMDGPULegalizerInfo>(*this, TM);
190 RegBankInfo = std::make_unique<AMDGPURegisterBankInfo>(*this);
191 InstSelector =
192 std::make_unique<AMDGPUInstructionSelector>(*this, *RegBankInfo, TM);
193}
194
196 return TSInfo.get();
197}
198
199unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
200 if (getGeneration() < GFX10)
201 return 1;
202
203 switch (Opcode) {
204 case AMDGPU::V_LSHLREV_B64_e64:
205 case AMDGPU::V_LSHLREV_B64_gfx10:
206 case AMDGPU::V_LSHLREV_B64_e64_gfx11:
207 case AMDGPU::V_LSHLREV_B64_e32_gfx12:
208 case AMDGPU::V_LSHLREV_B64_e64_gfx12:
209 case AMDGPU::V_LSHL_B64_e64:
210 case AMDGPU::V_LSHRREV_B64_e64:
211 case AMDGPU::V_LSHRREV_B64_gfx10:
212 case AMDGPU::V_LSHRREV_B64_e64_gfx11:
213 case AMDGPU::V_LSHRREV_B64_e64_gfx12:
214 case AMDGPU::V_LSHR_B64_e64:
215 case AMDGPU::V_ASHRREV_I64_e64:
216 case AMDGPU::V_ASHRREV_I64_gfx10:
217 case AMDGPU::V_ASHRREV_I64_e64_gfx11:
218 case AMDGPU::V_ASHRREV_I64_e64_gfx12:
219 case AMDGPU::V_ASHR_I64_e64:
220 return 1;
221 }
222
223 return 2;
224}
225
226/// This list was mostly derived from experimentation.
227bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
228 switch (Opcode) {
229 case AMDGPU::V_CVT_F16_F32_e32:
230 case AMDGPU::V_CVT_F16_F32_e64:
231 case AMDGPU::V_CVT_F16_U16_e32:
232 case AMDGPU::V_CVT_F16_U16_e64:
233 case AMDGPU::V_CVT_F16_I16_e32:
234 case AMDGPU::V_CVT_F16_I16_e64:
235 case AMDGPU::V_RCP_F16_e64:
236 case AMDGPU::V_RCP_F16_e32:
237 case AMDGPU::V_RSQ_F16_e64:
238 case AMDGPU::V_RSQ_F16_e32:
239 case AMDGPU::V_SQRT_F16_e64:
240 case AMDGPU::V_SQRT_F16_e32:
241 case AMDGPU::V_LOG_F16_e64:
242 case AMDGPU::V_LOG_F16_e32:
243 case AMDGPU::V_EXP_F16_e64:
244 case AMDGPU::V_EXP_F16_e32:
245 case AMDGPU::V_SIN_F16_e64:
246 case AMDGPU::V_SIN_F16_e32:
247 case AMDGPU::V_COS_F16_e64:
248 case AMDGPU::V_COS_F16_e32:
249 case AMDGPU::V_FLOOR_F16_e64:
250 case AMDGPU::V_FLOOR_F16_e32:
251 case AMDGPU::V_CEIL_F16_e64:
252 case AMDGPU::V_CEIL_F16_e32:
253 case AMDGPU::V_TRUNC_F16_e64:
254 case AMDGPU::V_TRUNC_F16_e32:
255 case AMDGPU::V_RNDNE_F16_e64:
256 case AMDGPU::V_RNDNE_F16_e32:
257 case AMDGPU::V_FRACT_F16_e64:
258 case AMDGPU::V_FRACT_F16_e32:
259 case AMDGPU::V_FREXP_MANT_F16_e64:
260 case AMDGPU::V_FREXP_MANT_F16_e32:
261 case AMDGPU::V_FREXP_EXP_I16_F16_e64:
262 case AMDGPU::V_FREXP_EXP_I16_F16_e32:
263 case AMDGPU::V_LDEXP_F16_e64:
264 case AMDGPU::V_LDEXP_F16_e32:
265 case AMDGPU::V_LSHLREV_B16_e64:
266 case AMDGPU::V_LSHLREV_B16_e32:
267 case AMDGPU::V_LSHRREV_B16_e64:
268 case AMDGPU::V_LSHRREV_B16_e32:
269 case AMDGPU::V_ASHRREV_I16_e64:
270 case AMDGPU::V_ASHRREV_I16_e32:
271 case AMDGPU::V_ADD_U16_e64:
272 case AMDGPU::V_ADD_U16_e32:
273 case AMDGPU::V_SUB_U16_e64:
274 case AMDGPU::V_SUB_U16_e32:
275 case AMDGPU::V_SUBREV_U16_e64:
276 case AMDGPU::V_SUBREV_U16_e32:
277 case AMDGPU::V_MUL_LO_U16_e64:
278 case AMDGPU::V_MUL_LO_U16_e32:
279 case AMDGPU::V_ADD_F16_e64:
280 case AMDGPU::V_ADD_F16_e32:
281 case AMDGPU::V_SUB_F16_e64:
282 case AMDGPU::V_SUB_F16_e32:
283 case AMDGPU::V_SUBREV_F16_e64:
284 case AMDGPU::V_SUBREV_F16_e32:
285 case AMDGPU::V_MUL_F16_e64:
286 case AMDGPU::V_MUL_F16_e32:
287 case AMDGPU::V_MAX_F16_e64:
288 case AMDGPU::V_MAX_F16_e32:
289 case AMDGPU::V_MIN_F16_e64:
290 case AMDGPU::V_MIN_F16_e32:
291 case AMDGPU::V_MAX_U16_e64:
292 case AMDGPU::V_MAX_U16_e32:
293 case AMDGPU::V_MIN_U16_e64:
294 case AMDGPU::V_MIN_U16_e32:
295 case AMDGPU::V_MAX_I16_e64:
296 case AMDGPU::V_MAX_I16_e32:
297 case AMDGPU::V_MIN_I16_e64:
298 case AMDGPU::V_MIN_I16_e32:
299 case AMDGPU::V_MAD_F16_e64:
300 case AMDGPU::V_MAD_U16_e64:
301 case AMDGPU::V_MAD_I16_e64:
302 case AMDGPU::V_FMA_F16_e64:
303 case AMDGPU::V_DIV_FIXUP_F16_e64:
304 // On gfx10, all 16-bit instructions preserve the high bits.
306 case AMDGPU::V_MADAK_F16:
307 case AMDGPU::V_MADMK_F16:
308 case AMDGPU::V_MAC_F16_e64:
309 case AMDGPU::V_MAC_F16_e32:
310 case AMDGPU::V_FMAMK_F16:
311 case AMDGPU::V_FMAAK_F16:
312 case AMDGPU::V_FMAC_F16_e64:
313 case AMDGPU::V_FMAC_F16_e32:
314 // In gfx9, the preferred handling of the unused high 16-bits changed. Most
315 // instructions maintain the legacy behavior of 0ing. Some instructions
316 // changed to preserving the high bits.
318 case AMDGPU::V_MAD_MIXLO_F16:
319 case AMDGPU::V_MAD_MIXHI_F16:
320 default:
321 return false;
322 }
323}
324
326 const SchedRegion &Region) const {
327 // Track register pressure so the scheduler can try to decrease
328 // pressure once register usage is above the threshold defined by
329 // SIRegisterInfo::getRegPressureSetLimit()
330 Policy.ShouldTrackPressure = true;
331
332 // Enabling both top down and bottom up scheduling seems to give us less
333 // register spills than just using one of these approaches on its own.
334 Policy.OnlyTopDown = false;
335 Policy.OnlyBottomUp = false;
336
337 // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
338 if (!enableSIScheduler())
339 Policy.ShouldTrackLaneMasks = true;
340}
341
343 const SchedRegion &Region) const {
344 const Function &F = Region.RegionBegin->getMF()->getFunction();
345 Attribute PostRADirectionAttr = F.getFnAttribute("amdgpu-post-ra-direction");
346 if (!PostRADirectionAttr.isValid())
347 return;
348
349 StringRef PostRADirectionStr = PostRADirectionAttr.getValueAsString();
350 if (PostRADirectionStr == "topdown") {
351 Policy.OnlyTopDown = true;
352 Policy.OnlyBottomUp = false;
353 } else if (PostRADirectionStr == "bottomup") {
354 Policy.OnlyTopDown = false;
355 Policy.OnlyBottomUp = true;
356 } else if (PostRADirectionStr == "bidirectional") {
357 Policy.OnlyTopDown = false;
358 Policy.OnlyBottomUp = false;
359 } else {
361 F, F.getSubprogram(), "invalid value for postRA direction attribute");
362 F.getContext().diagnose(Diag);
363 }
364
365 LLVM_DEBUG({
366 const char *DirStr = "default";
367 if (Policy.OnlyTopDown && !Policy.OnlyBottomUp)
368 DirStr = "topdown";
369 else if (!Policy.OnlyTopDown && Policy.OnlyBottomUp)
370 DirStr = "bottomup";
371 else if (!Policy.OnlyTopDown && !Policy.OnlyBottomUp)
372 DirStr = "bidirectional";
373
374 dbgs() << "Post-MI-sched direction (" << F.getName() << "): " << DirStr
375 << '\n';
376 });
377}
378
380 if (isWave32()) {
381 // Fix implicit $vcc operands after MIParser has verified that they match
382 // the instruction definitions.
383 for (auto &MBB : MF) {
384 for (auto &MI : MBB)
385 InstrInfo.fixImplicitOperands(MI);
386 }
387 }
388}
389
391 return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
392}
393
395 return hasVGPRIndexMode() && (!hasMovrel() || EnableVGPRIndexMode);
396}
397
398bool GCNSubtarget::useAA() const { return UseAA; }
399
404
405unsigned
407 unsigned DynamicVGPRBlockSize) const {
409 DynamicVGPRBlockSize);
410}
411
412unsigned
413GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
415 return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
416
417 if (HasFlatScratch || HasArchitectedFlatScratch) {
419 return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
421 return 4; // FLAT_SCRATCH, VCC (in that order).
422 }
423
424 if (isXNACKEnabled())
425 return 4; // XNACK, VCC (in that order).
426 return 2; // VCC.
427}
428
433
435 // In principle we do not need to reserve SGPR pair used for flat_scratch if
436 // we know flat instructions do not access the stack anywhere in the
437 // program. For now assume it's needed if we have flat instructions.
438 const bool KernelUsesFlatScratch = hasFlatAddressSpace();
439 return getBaseReservedNumSGPRs(KernelUsesFlatScratch);
440}
441
442std::pair<unsigned, unsigned>
444 unsigned NumSGPRs, unsigned NumVGPRs) const {
445 unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
446 // Temporarily check both the attribute and the subtarget feature until the
447 // latter is removed.
448 if (DynamicVGPRBlockSize == 0 && isDynamicVGPREnabled())
449 DynamicVGPRBlockSize = getDynamicVGPRBlockSize();
450
451 auto [MinOcc, MaxOcc] = getOccupancyWithWorkGroupSizes(LDSSize, F);
452 unsigned SGPROcc = getOccupancyWithNumSGPRs(NumSGPRs);
453 unsigned VGPROcc = getOccupancyWithNumVGPRs(NumVGPRs, DynamicVGPRBlockSize);
454
455 // Maximum occupancy may be further limited by high SGPR/VGPR usage.
456 MaxOcc = std::min(MaxOcc, std::min(SGPROcc, VGPROcc));
457 return {std::min(MinOcc, MaxOcc), MaxOcc};
458}
459
461 const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
462 unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
463 // Compute maximum number of SGPRs function can use using default/requested
464 // minimum number of waves per execution unit.
465 unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
466 unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
467
468 // Check if maximum number of SGPRs was explicitly requested using
469 // "amdgpu-num-sgpr" attribute.
470 unsigned Requested =
471 F.getFnAttributeAsParsedInteger("amdgpu-num-sgpr", MaxNumSGPRs);
472
473 if (Requested != MaxNumSGPRs) {
474 // Make sure requested value does not violate subtarget's specifications.
475 if (Requested && (Requested <= ReservedNumSGPRs))
476 Requested = 0;
477
478 // If more SGPRs are required to support the input user/system SGPRs,
479 // increase to accommodate them.
480 //
481 // FIXME: This really ends up using the requested number of SGPRs + number
482 // of reserved special registers in total. Theoretically you could re-use
483 // the last input registers for these special registers, but this would
484 // require a lot of complexity to deal with the weird aliasing.
485 unsigned InputNumSGPRs = PreloadedSGPRs;
486 if (Requested && Requested < InputNumSGPRs)
487 Requested = InputNumSGPRs;
488
489 // Make sure requested value is compatible with values implied by
490 // default/requested minimum/maximum number of waves per execution unit.
491 if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
492 Requested = 0;
493 if (WavesPerEU.second && Requested &&
494 Requested < getMinNumSGPRs(WavesPerEU.second))
495 Requested = 0;
496
497 if (Requested)
498 MaxNumSGPRs = Requested;
499 }
500
501 if (hasSGPRInitBug())
503
504 return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
505}
506
508 const Function &F = MF.getFunction();
512}
513
515 using USI = GCNUserSGPRUsageInfo;
516 // Max number of user SGPRs
517 const unsigned MaxUserSGPRs =
518 USI::getNumUserSGPRForField(USI::PrivateSegmentBufferID) +
519 USI::getNumUserSGPRForField(USI::DispatchPtrID) +
520 USI::getNumUserSGPRForField(USI::QueuePtrID) +
521 USI::getNumUserSGPRForField(USI::KernargSegmentPtrID) +
522 USI::getNumUserSGPRForField(USI::DispatchIdID) +
523 USI::getNumUserSGPRForField(USI::FlatScratchInitID) +
524 USI::getNumUserSGPRForField(USI::ImplicitBufferPtrID);
525
526 // Max number of system SGPRs
527 const unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
528 1 + // WorkGroupIDY
529 1 + // WorkGroupIDZ
530 1 + // WorkGroupInfo
531 1; // private segment wave byte offset
532
533 // Max number of synthetic SGPRs
534 const unsigned SyntheticSGPRs = 1; // LDSKernelId
535
536 return MaxUserSGPRs + MaxSystemSGPRs + SyntheticSGPRs;
537}
538
543
545 const Function &F, std::pair<unsigned, unsigned> NumVGPRBounds) const {
546 const auto [Min, Max] = NumVGPRBounds;
547
548 // Check if maximum number of VGPRs was explicitly requested using
549 // "amdgpu-num-vgpr" attribute.
550
551 unsigned Requested = F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", Max);
552 if (Requested != Max && hasGFX90AInsts())
553 Requested *= 2;
554
555 // Make sure requested value is inside the range of possible VGPR usage.
556 return std::clamp(Requested, Min, Max);
557}
558
560 // Temporarily check both the attribute and the subtarget feature, until the
561 // latter is removed.
562 unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
563 if (DynamicVGPRBlockSize == 0 && isDynamicVGPREnabled())
564 DynamicVGPRBlockSize = getDynamicVGPRBlockSize();
565
566 std::pair<unsigned, unsigned> Waves = getWavesPerEU(F);
567 return getBaseMaxNumVGPRs(
568 F, {getMinNumVGPRs(Waves.second, DynamicVGPRBlockSize),
569 getMaxNumVGPRs(Waves.first, DynamicVGPRBlockSize)});
570}
571
573 return getMaxNumVGPRs(MF.getFunction());
574}
575
576std::pair<unsigned, unsigned>
578 const unsigned MaxVectorRegs = getMaxNumVGPRs(F);
579
580 unsigned MaxNumVGPRs = MaxVectorRegs;
581 unsigned MaxNumAGPRs = 0;
582 unsigned NumArchVGPRs = getAddressableNumArchVGPRs();
583
584 // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically,
585 // a wave may have up to 512 total vector registers combining together both
586 // VGPRs and AGPRs. Hence, in an entry function without calls and without
587 // AGPRs used within it, it is possible to use the whole vector register
588 // budget for VGPRs.
589 //
590 // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and split
591 // register file accordingly.
592 if (hasGFX90AInsts()) {
593 unsigned MinNumAGPRs = 0;
594 const unsigned TotalNumAGPRs = AMDGPU::AGPR_32RegClass.getNumRegs();
595
596 const std::pair<unsigned, unsigned> DefaultNumAGPR = {~0u, ~0u};
597
598 // TODO: The lower bound should probably force the number of required
599 // registers up, overriding amdgpu-waves-per-eu.
600 std::tie(MinNumAGPRs, MaxNumAGPRs) =
601 AMDGPU::getIntegerPairAttribute(F, "amdgpu-agpr-alloc", DefaultNumAGPR,
602 /*OnlyFirstRequired=*/true);
603
604 if (MinNumAGPRs == DefaultNumAGPR.first) {
605 // Default to splitting half the registers if AGPRs are required.
606 MinNumAGPRs = MaxNumAGPRs = MaxVectorRegs / 2;
607 } else {
608 // Align to accum_offset's allocation granularity.
609 MinNumAGPRs = alignTo(MinNumAGPRs, 4);
610
611 MinNumAGPRs = std::min(MinNumAGPRs, TotalNumAGPRs);
612 }
613
614 // Clamp values to be inbounds of our limits, and ensure min <= max.
615
616 MaxNumAGPRs = std::min(std::max(MinNumAGPRs, MaxNumAGPRs), MaxVectorRegs);
617 MinNumAGPRs = std::min(std::min(MinNumAGPRs, TotalNumAGPRs), MaxNumAGPRs);
618
619 MaxNumVGPRs = std::min(MaxVectorRegs - MinNumAGPRs, NumArchVGPRs);
620 MaxNumAGPRs = std::min(MaxVectorRegs - MaxNumVGPRs, MaxNumAGPRs);
621
622 assert(MaxNumVGPRs + MaxNumAGPRs <= MaxVectorRegs &&
623 MaxNumAGPRs <= TotalNumAGPRs && MaxNumVGPRs <= NumArchVGPRs &&
624 "invalid register counts");
625 } else if (hasMAIInsts()) {
626 // On gfx908 the number of AGPRs always equals the number of VGPRs.
627 MaxNumAGPRs = MaxNumVGPRs = MaxVectorRegs;
628 }
629
630 return std::pair(MaxNumVGPRs, MaxNumAGPRs);
631}
632
634 SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep,
635 const TargetSchedModel *SchedModel) const {
636 if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() || !Def->isInstr() ||
637 !Use->isInstr())
638 return;
639
640 MachineInstr *DefI = Def->getInstr();
641 MachineInstr *UseI = Use->getInstr();
642
643 if (DefI->isBundle()) {
645 auto Reg = Dep.getReg();
648 unsigned Lat = 0;
649 for (++I; I != E && I->isBundledWithPred(); ++I) {
650 if (I->isMetaInstruction())
651 continue;
652 if (I->modifiesRegister(Reg, TRI))
653 Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
654 else if (Lat)
655 --Lat;
656 }
657 Dep.setLatency(Lat);
658 } else if (UseI->isBundle()) {
660 auto Reg = Dep.getReg();
663 unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
664 for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
665 if (I->isMetaInstruction())
666 continue;
667 if (I->readsRegister(Reg, TRI))
668 break;
669 --Lat;
670 }
671 Dep.setLatency(Lat);
672 } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
673 // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
674 // implicit operands which come from the MCInstrDesc, which can fool
675 // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
676 // pseudo operands.
677 Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
678 DefI, DefOpIdx, UseI, UseOpIdx));
679 }
680}
681
684 return 0; // Not MIMG encoding.
685
686 if (NSAThreshold.getNumOccurrences() > 0)
687 return std::max(NSAThreshold.getValue(), 2u);
688
690 "amdgpu-nsa-threshold", -1);
691 if (Value > 0)
692 return std::max(Value, 2);
693
694 return NSAThreshold;
695}
696
698 const GCNSubtarget &ST)
699 : ST(ST) {
700 const CallingConv::ID CC = F.getCallingConv();
701 const bool IsKernel =
703
704 if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0))
705 KernargSegmentPtr = true;
706
707 bool IsAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
708 if (IsAmdHsaOrMesa && !ST.hasFlatScratchEnabled())
709 PrivateSegmentBuffer = true;
710 else if (ST.isMesaGfxShader(F))
711 ImplicitBufferPtr = true;
712
713 if (!AMDGPU::isGraphics(CC)) {
714 if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr"))
715 DispatchPtr = true;
716
717 // FIXME: Can this always be disabled with < COv5?
718 if (!F.hasFnAttribute("amdgpu-no-queue-ptr"))
719 QueuePtr = true;
720
721 if (!F.hasFnAttribute("amdgpu-no-dispatch-id"))
722 DispatchID = true;
723 }
724
725 if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) &&
726 (IsAmdHsaOrMesa || ST.hasFlatScratchEnabled()) &&
727 // FlatScratchInit cannot be true for graphics CC if
728 // hasFlatScratchEnabled() is false.
729 (ST.hasFlatScratchEnabled() ||
730 (!AMDGPU::isGraphics(CC) &&
731 !F.hasFnAttribute("amdgpu-no-flat-scratch-init"))) &&
732 !ST.hasArchitectedFlatScratch()) {
733 FlatScratchInit = true;
734 }
735
737 NumUsedUserSGPRs += getNumUserSGPRForField(ImplicitBufferPtrID);
738
741
742 if (hasDispatchPtr())
743 NumUsedUserSGPRs += getNumUserSGPRForField(DispatchPtrID);
744
745 if (hasQueuePtr())
746 NumUsedUserSGPRs += getNumUserSGPRForField(QueuePtrID);
747
749 NumUsedUserSGPRs += getNumUserSGPRForField(KernargSegmentPtrID);
750
751 if (hasDispatchID())
752 NumUsedUserSGPRs += getNumUserSGPRForField(DispatchIdID);
753
754 if (hasFlatScratchInit())
755 NumUsedUserSGPRs += getNumUserSGPRForField(FlatScratchInitID);
756
758 NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentSizeID);
759}
760
762 assert(NumKernargPreloadSGPRs + NumSGPRs <= AMDGPU::getMaxNumUserSGPRs(ST));
763 NumKernargPreloadSGPRs += NumSGPRs;
764 NumUsedUserSGPRs += NumSGPRs;
765}
766
768 return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs;
769}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static cl::opt< bool > UseAA("aarch64-use-aa", cl::init(true), cl::desc("Enable the use of AA during codegen."))
This file describes how to lower LLVM calls to machine code calls.
This file declares the targeting of the InstructionSelector class for AMDGPU.
This file declares the targeting of the Machinelegalizer class for AMDGPU.
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock & MBB
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static cl::opt< unsigned > NSAThreshold("amdgpu-nsa-threshold", cl::desc("Number of addresses from which to enable MIMG NSA."), cl::init(2), cl::Hidden)
static cl::opt< bool > EnableVGPRIndexMode("amdgpu-vgpr-index-mode", cl::desc("Use GPR indexing mode instead of movrel for vector indexing"), cl::init(false))
static cl::opt< bool > UseAA("amdgpu-use-aa-in-codegen", cl::desc("Enable the use of AA during codegen."), cl::init(true))
AMD GCN specific subclass of TargetSubtarget.
IRTranslator LLVM IR MI
This file describes how to lower LLVM inline asm to machine code INLINEASM.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Register const TargetRegisterInfo * TRI
if(PassOpts->AAPipeline)
This file defines the SmallString class.
#define LLVM_DEBUG(...)
Definition Debug.h:114
std::pair< unsigned, unsigned > getWavesPerEU(const Function &F) const
std::pair< unsigned, unsigned > getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const
Subtarget's minimum/maximum occupancy, in number of waves per EU, that can be achieved when the only ...
unsigned getWavefrontSizeLog2() const
AMDGPUSubtarget(const Triple &TT)
unsigned AddressableLocalMemorySize
Functions, function parameters, and return types can have attributes to indicate how they should be t...
Definition Attributes.h:105
LLVM_ABI StringRef getValueAsString() const
Return the attribute's value as a string.
bool isValid() const
Return true if the attribute is any kind of attribute.
Definition Attributes.h:261
Diagnostic information for optimization failures.
Diagnostic information for unsupported feature in backend.
uint64_t getFnAttributeAsParsedInteger(StringRef Kind, uint64_t Default=0) const
For a string attribute Kind, parse attribute as an integer.
Definition Function.cpp:776
bool hasFlat() const
InstrItineraryData InstrItins
bool useVGPRIndexMode() const
void mirFileLoaded(MachineFunction &MF) const override
unsigned MaxPrivateElementSize
unsigned getAddressableNumArchVGPRs() const
unsigned getMinNumSGPRs(unsigned WavesPerEU) const
void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS)
unsigned getConstantBusLimit(unsigned Opcode) const
const InstrItineraryData * getInstrItineraryData() const override
void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep, const TargetSchedModel *SchedModel) const override
void overridePostRASchedPolicy(MachineSchedPolicy &Policy, const SchedRegion &Region) const override
Align getStackAlignment() const
bool hasMadF16() const
unsigned getMinNumVGPRs(unsigned WavesPerEU, unsigned DynamicVGPRBlockSize) const
bool isDynamicVGPREnabled() const
const SIRegisterInfo * getRegisterInfo() const override
unsigned getBaseMaxNumVGPRs(const Function &F, std::pair< unsigned, unsigned > NumVGPRBounds) const
bool zeroesHigh16BitsOfDest(unsigned Opcode) const
Returns if the result of this instruction with a 16-bit result returned in a 32-bit register implicit...
unsigned getBaseMaxNumSGPRs(const Function &F, std::pair< unsigned, unsigned > WavesPerEU, unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const
unsigned getMaxNumPreloadedSGPRs() const
GCNSubtarget & initializeSubtargetDependencies(const Triple &TT, StringRef GPU, StringRef FS)
void overrideSchedPolicy(MachineSchedPolicy &Policy, const SchedRegion &Region) const override
std::pair< unsigned, unsigned > computeOccupancy(const Function &F, unsigned LDSSize=0, unsigned NumSGPRs=0, unsigned NumVGPRs=0) const
Subtarget's minimum/maximum occupancy, in number of waves per EU, that can be achieved when the only ...
unsigned getMaxNumVGPRs(unsigned WavesPerEU, unsigned DynamicVGPRBlockSize) const
const SITargetLowering * getTargetLowering() const override
unsigned getNSAThreshold(const MachineFunction &MF) const
unsigned getReservedNumSGPRs(const MachineFunction &MF) const
bool useAA() const override
bool isWave32() const
unsigned getOccupancyWithNumVGPRs(unsigned VGPRs, unsigned DynamicVGPRBlockSize) const
Return the maximum number of waves per SIMD for kernels using VGPRs VGPRs.
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const
Return the maximum number of waves per SIMD for kernels using SGPRs SGPRs.
unsigned getMaxWavesPerEU() const
Generation getGeneration() const
GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, const GCNTargetMachine &TM)
unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const
std::pair< unsigned, unsigned > getMaxNumVectorRegs(const Function &F) const
Return a pair of maximum numbers of VGPRs and AGPRs that meet the number of waves per execution unit ...
bool isXNACKEnabled() const
unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const
bool hasAddr64() const
unsigned getDynamicVGPRBlockSize() const
void checkSubtargetFeatures(const Function &F) const
Diagnose inconsistent subtarget features before attempting to codegen function F.
~GCNSubtarget() override
const SelectionDAGTargetInfo * getSelectionDAGInfo() const override
AMDGPU::IsaInfo::AMDGPUTargetID TargetID
static unsigned getNumUserSGPRForField(UserSGPRID ID)
void allocKernargPreloadSGPRs(unsigned NumSGPRs)
bool hasPrivateSegmentBuffer() const
GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST)
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
Instructions::const_iterator const_instr_iterator
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Representation of each machine instruction.
const MachineBasicBlock * getParent() const
bool isBundle() const
Scheduling dependency.
Definition ScheduleDAG.h:51
Kind getKind() const
Returns an enum value representing the kind of the dependence.
@ Data
Regular data dependence (aka true-dependence).
Definition ScheduleDAG.h:55
void setLatency(unsigned Lat)
Sets the latency for this edge.
unsigned getLatency() const
Returns the latency value for this edge, which roughly means the minimum number of cycles that must e...
Register getReg() const
Returns the register associated with this edge.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
std::pair< unsigned, unsigned > getWavesPerEU() const
GCNUserSGPRUsageInfo & getUserSGPRInfo()
Scheduling unit. This is a node in the scheduling DAG.
Targets can subclass this to parameterize the SelectionDAG lowering and instruction selection process...
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
Information about stack frame layout on the target.
Provide an instruction scheduling machine model to CodeGen passes.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM Value Representation.
Definition Value.h:75
self_iterator getIterator()
Definition ilist_node.h:123
unsigned getNumWavesPerEUWithNumVGPRs(const MCSubtargetInfo *STI, unsigned NumVGPRs, unsigned DynamicVGPRBlockSize)
unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI)
unsigned getLocalMemorySize(const MCSubtargetInfo *STI)
unsigned getEUsPerCU(const MCSubtargetInfo *STI)
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs, unsigned MaxWaves, AMDGPUSubtarget::Generation Gen)
unsigned getMaxNumUserSGPRs(const MCSubtargetInfo &STI)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
unsigned getDynamicVGPRBlockSize(const Function &F)
std::pair< unsigned, unsigned > getIntegerPairAttribute(const Function &F, StringRef Name, std::pair< unsigned, unsigned > Default, bool OnlyFirstRequired)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ SPIR_KERNEL
Used for SPIR kernel functions.
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
Define a generic scheduling policy for targets that don't provide their own MachineSchedStrategy.
bool ShouldTrackLaneMasks
Track LaneMasks to allow reordering of independent subregister writes of the same vreg.
A region of an MBB for scheduling.