LLVM 23.0.0git
AMDGPUAsmPrinter.cpp
Go to the documentation of this file.
1//===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10///
11/// The AMDGPUAsmPrinter is used to print both assembly string and also binary
12/// code. When passed an MCAsmStreamer it prints assembly and when passed
13/// an MCObjectStreamer it outputs binary code.
14//
15//===----------------------------------------------------------------------===//
16//
17
18#include "AMDGPUAsmPrinter.h"
19#include "AMDGPU.h"
23#include "AMDGPUTargetMachine.h"
24#include "GCNSubtarget.h"
29#include "R600AsmPrinter.h"
35#include "llvm/ADT/StringSet.h"
43#include "llvm/MC/MCAssembler.h"
44#include "llvm/MC/MCContext.h"
46#include "llvm/MC/MCStreamer.h"
47#include "llvm/MC/MCValue.h"
54
55using namespace llvm;
56using namespace llvm::AMDGPU;
57
58// This should get the default rounding mode from the kernel. We just set the
59// default here, but this could change if the OpenCL rounding mode pragmas are
60// used.
61//
// The denormal mode here should match what is reported by the OpenCL runtime
// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
65//
66// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
67// precision, and leaves single precision to flush all and does not report
68// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
69// CL_FP_DENORM for both.
70//
71// FIXME: It seems some instructions do not support single precision denormals
72// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,
73// and sin_f32, cos_f32 on most parts).
74
75// We want to use these instructions, and using fp32 denormals also causes
76// instructions to run at the double precision rate for the device so it's
77// probably best to just report no single precision denormals.
84
85static AsmPrinter *
87 std::unique_ptr<MCStreamer> &&Streamer) {
88 return new AMDGPUAsmPrinter(tm, std::move(Streamer));
89}
90
98
99namespace {
100class AMDGPUAsmPrinterHandler : public AsmPrinterHandler {
101protected:
102 AMDGPUAsmPrinter *Asm;
103
104public:
105 AMDGPUAsmPrinterHandler(AMDGPUAsmPrinter *A) : Asm(A) {}
106
107 void beginFunction(const MachineFunction *MF) override {}
108
109 void endFunction(const MachineFunction *MF) override { Asm->endFunction(MF); }
110
111 void endModule() override {}
112};
113} // End anonymous namespace
114
116 std::unique_ptr<MCStreamer> Streamer)
117 : AsmPrinter(TM, std::move(Streamer)) {
118 assert(OutStreamer && "AsmPrinter constructed without streamer");
119}
120
122 return "AMDGPU Assembly Printer";
123}
124
126 return &TM.getMCSubtargetInfo();
127}
128
130 if (!OutStreamer)
131 return nullptr;
132 return static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
133}
134
138
139void AMDGPUAsmPrinter::initTargetStreamer(Module &M) {
141
142 // TODO: Which one is called first, emitStartOfAsmFile or
143 // emitFunctionBodyStart?
144 if (getTargetStreamer() && !getTargetStreamer()->getTargetID())
145 initializeTargetID(M);
146
149 return;
150
152
155 CodeObjectVersion);
156 HSAMetadataStream->begin(M, *getTargetStreamer()->getTargetID());
157 }
158
161}
162
164 // Init target streamer if it has not yet happened
166 initTargetStreamer(M);
167
168 if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
170
171 // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA).
172 // Emit HSA Metadata (NT_AMD_HSA_METADATA).
173 if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
174 HSAMetadataStream->end();
175 bool Success = HSAMetadataStream->emitTo(*getTargetStreamer());
176 (void)Success;
177 assert(Success && "Malformed HSA Metadata");
178 }
179}
180
182 const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
183 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
184 const Function &F = MF->getFunction();
185
186 // TODO: We're checking this late, would be nice to check it earlier.
187 if (STM.requiresCodeObjectV6() && CodeObjectVersion < AMDGPU::AMDHSA_COV6) {
189 STM.getCPU() + " is only available on code object version 6 or better");
190 }
191
192 // TODO: Which one is called first, emitStartOfAsmFile or
193 // emitFunctionBodyStart?
194 if (!getTargetStreamer()->getTargetID())
195 initializeTargetID(*F.getParent());
196
197 const auto &FunctionTargetID = STM.getTargetID();
198 // Make sure function's xnack settings are compatible with module's
199 // xnack settings.
200 if (FunctionTargetID.isXnackSupported() &&
201 FunctionTargetID.getXnackSetting() != IsaInfo::TargetIDSetting::Any &&
202 FunctionTargetID.getXnackSetting() !=
203 getTargetStreamer()->getTargetID()->getXnackSetting()) {
204 OutContext.reportError(
205 {}, "xnack setting of '" + Twine(MF->getName()) +
206 "' function does not match module xnack setting");
207 return;
208 }
209 // Make sure function's sramecc settings are compatible with module's
210 // sramecc settings.
211 if (FunctionTargetID.isSramEccSupported() &&
212 FunctionTargetID.getSramEccSetting() != IsaInfo::TargetIDSetting::Any &&
213 FunctionTargetID.getSramEccSetting() !=
214 getTargetStreamer()->getTargetID()->getSramEccSetting()) {
215 OutContext.reportError(
216 {}, "sramecc setting of '" + Twine(MF->getName()) +
217 "' function does not match module sramecc setting");
218 return;
219 }
220
221 if (!MFI.isEntryFunction())
222 return;
223
224 if (STM.isMesaKernel(F) &&
225 (F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
226 F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
227 AMDGPUMCKernelCodeT KernelCode;
228 getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
229 KernelCode.validate(&STM, MF->getContext());
231 }
232
233 if (STM.isAmdHsaOS())
234 HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo);
235}
236
237/// Set bits in a kernel descriptor MCExpr field:
238/// return ((Dst & ~Mask) | (Value << Shift))
239static const MCExpr *setBits(const MCExpr *Dst, const MCExpr *Value,
240 uint32_t Mask, uint32_t Shift, MCContext &Ctx) {
241 const auto *Shft = MCConstantExpr::create(Shift, Ctx);
242 const auto *Msk = MCConstantExpr::create(Mask, Ctx);
243 Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
245 Ctx);
246 return Dst;
247}
248
250 const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
251 if (!MFI.isEntryFunction())
252 return;
253
254 assert(TM.getTargetTriple().getOS() == Triple::AMDHSA);
255
256 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
257 MCContext &Ctx = MF->getContext();
258
260 getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo);
261
262 // Compute inst_pref_size using MCExpr label subtraction for exact code
263 // size. At this point .Lfunc_end has been emitted (by the base AsmPrinter)
264 // right after the function code, so (Lfunc_end - func_sym) gives the
265 // exact function code size in bytes.
266 if (STM.hasInstPrefSize()) {
267 const MCExpr *CodeSizeExpr = MCBinaryExpr::createSub(
270
271 uint32_t Mask, Shift, Width, CacheLineSize;
272 STM.getInstPrefSizeArgs(Mask, Shift, Width, CacheLineSize);
273 const MCExpr *InstPrefSize =
274 AMDGPUMCExpr::createInstPrefSize(CodeSizeExpr, Ctx);
276 setBits(KD.compute_pgm_rsrc3, InstPrefSize, Mask, Shift, Ctx);
277 }
278
279 auto &Streamer = getTargetStreamer()->getStreamer();
280 auto &Context = Streamer.getContext();
281 auto &ObjectFileInfo = *Context.getObjectFileInfo();
282 auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();
283
284 Streamer.pushSection();
285 Streamer.switchSection(&ReadOnlySection);
286
287 // CP microcode requires the kernel descriptor to be allocated on 64 byte
288 // alignment.
289 Streamer.emitValueToAlignment(Align(64), 0, 1, 0);
290 ReadOnlySection.ensureMinAlignment(Align(64));
291
292 SmallString<128> KernelName;
293 getNameWithPrefix(KernelName, &MF->getFunction());
295 STM, KernelName, KD, CurrentProgramInfo.NumVGPRsForWavesPerEU,
297 CurrentProgramInfo.NumSGPRsForWavesPerEU,
299 CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
300 getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Context),
301 Context),
302 CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed);
303
304 Streamer.popSection();
305}
306
308 Register RegNo = MI->getOperand(0).getReg();
309
311 raw_svector_ostream OS(Str);
312 OS << "implicit-def: "
313 << printReg(RegNo, MF->getSubtarget().getRegisterInfo());
314
315 if (MI->getAsmPrinterFlags() & AMDGPU::SGPR_SPILL)
316 OS << " : SGPR spill to VGPR lane";
317
318 OutStreamer->AddComment(OS.str());
319 OutStreamer->addBlankLine();
320}
321
323 if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
325 return;
326 }
327
328 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
329 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
330 if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(MF->getFunction())) {
331 SmallString<128> SymbolName;
332 getNameWithPrefix(SymbolName, &MF->getFunction()),
335 }
336 if (DumpCodeInstEmitter) {
337 // Disassemble function name label to text.
338 DisasmLines.push_back(MF->getName().str() + ":");
339 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
340 HexLines.emplace_back("");
341 }
342
344}
345
347 if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) {
348 // Write a line for the basic block label if it is not only fallthrough.
349 DisasmLines.push_back((Twine("BB") + Twine(getFunctionNumber()) + "_" +
350 Twine(MBB.getNumber()) + ":")
351 .str());
352 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
353 HexLines.emplace_back("");
354 }
356}
357
360 if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) {
361 OutContext.reportError({},
362 Twine(GV->getName()) +
363 ": unsupported initializer for address space");
364 return;
365 }
366
367 const Triple::OSType OS = TM.getTargetTriple().getOS();
368 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
370 return;
371 // With object linking, LDS definitions should have been externalized
372 // by earlier passes (e.g. LDS lowering, named barrier lowering).
373 // Only declarations reach here, emitted as SHN_AMDGPU_LDS symbols
374 // so the linker can assign their offsets.
375 assert(GV->isDeclaration() &&
376 "LDS definitions should have been externalized when object "
377 "linking is enabled");
378 }
379
380 MCSymbol *GVSym = getSymbol(GV);
381
382 GVSym->redefineIfPossible();
383 if (GVSym->isDefined() || GVSym->isVariable())
384 report_fatal_error("symbol '" + Twine(GVSym->getName()) +
385 "' is already defined");
386
387 const DataLayout &DL = GV->getDataLayout();
389 Align Alignment = GV->getAlign().value_or(Align(4));
390
391 emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
392 emitLinkage(GV, GVSym);
393 auto *TS = getTargetStreamer();
394 TS->emitAMDGPULDS(GVSym, Size, Alignment);
395 return;
396 }
397
399}
400
402 CodeObjectVersion = AMDGPU::getAMDHSACodeObjectVersion(M);
403
404 if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
405 switch (CodeObjectVersion) {
407 HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV4>();
408 break;
410 HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV5>();
411 break;
413 HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV6>();
414 break;
415 default:
416 reportFatalUsageError("unsupported code object version");
417 }
418
419 addAsmPrinterHandler(std::make_unique<AMDGPUAsmPrinterHandler>(this));
420 }
421
423}
424
425/// Mimics GCNSubtarget::computeOccupancy for MCExpr.
426///
/// Remove dependency on GCNSubtarget and depend only on the necessary values
/// for said occupancy computation. Should match computeOccupancy implementation
/// without passing \p STM on.
430const AMDGPUMCExpr *createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs,
431 const MCExpr *NumVGPRs,
432 unsigned DynamicVGPRBlockSize,
433 const GCNSubtarget &STM, MCContext &Ctx) {
434 unsigned MaxWaves = IsaInfo::getMaxWavesPerEU(STM);
435 unsigned Granule = IsaInfo::getVGPRAllocGranule(STM, DynamicVGPRBlockSize);
436 unsigned TargetTotalNumVGPRs = IsaInfo::getTotalNumVGPRs(STM);
437 unsigned Generation = STM.getGeneration();
438
439 auto CreateExpr = [&Ctx](unsigned Value) {
440 return MCConstantExpr::create(Value, Ctx);
441 };
442
444 {CreateExpr(MaxWaves), CreateExpr(Granule),
445 CreateExpr(TargetTotalNumVGPRs),
446 CreateExpr(Generation), CreateExpr(InitOcc),
447 NumSGPRs, NumVGPRs},
448 Ctx);
449}
450
451void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) {
452 if (F.isDeclaration() || !AMDGPU::isModuleEntryFunctionCC(F.getCallingConv()))
453 return;
454
456 const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
457 MCSymbol *FnSym = TM.getSymbol(&F);
458
459 auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {
460 int64_t Val;
461 if (Value->evaluateAsAbsolute(Val)) {
462 Res = Val;
463 return true;
464 }
465 return false;
466 };
467
468 const uint64_t MaxScratchPerWorkitem =
470 MCSymbol *ScratchSizeSymbol =
471 RI.getSymbol(FnSym->getName(), RIK::RIK_PrivateSegSize, OutContext);
472 uint64_t ScratchSize;
473 if (ScratchSizeSymbol->isVariable() &&
474 TryGetMCExprValue(ScratchSizeSymbol->getVariableValue(), ScratchSize) &&
475 ScratchSize > MaxScratchPerWorkitem) {
476 DiagnosticInfoStackSize DiagStackSize(F, ScratchSize, MaxScratchPerWorkitem,
477 DS_Error);
478 F.getContext().diagnose(DiagStackSize);
479 }
480
481 // Validate addressable scalar registers (i.e., prior to added implicit
482 // SGPRs).
483 MCSymbol *NumSGPRSymbol =
484 RI.getSymbol(FnSym->getName(), RIK::RIK_NumSGPR, OutContext);
486 !STM.hasSGPRInitBug()) {
487 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
488 uint64_t NumSgpr;
489 if (NumSGPRSymbol->isVariable() &&
490 TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) &&
491 NumSgpr > MaxAddressableNumSGPRs) {
492 F.getContext().diagnose(DiagnosticInfoResourceLimit(
493 F, "addressable scalar registers", NumSgpr, MaxAddressableNumSGPRs,
495 return;
496 }
497 }
498
499 MCSymbol *VCCUsedSymbol =
500 RI.getSymbol(FnSym->getName(), RIK::RIK_UsesVCC, OutContext);
501 MCSymbol *FlatUsedSymbol =
502 RI.getSymbol(FnSym->getName(), RIK::RIK_UsesFlatScratch, OutContext);
503 uint64_t VCCUsed, FlatUsed, NumSgpr;
504
505 if (NumSGPRSymbol->isVariable() && VCCUsedSymbol->isVariable() &&
506 FlatUsedSymbol->isVariable() &&
507 TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) &&
508 TryGetMCExprValue(VCCUsedSymbol->getVariableValue(), VCCUsed) &&
509 TryGetMCExprValue(FlatUsedSymbol->getVariableValue(), FlatUsed)) {
510
511 // Recomputes NumSgprs + implicit SGPRs but all symbols should now be
512 // resolvable.
513 NumSgpr += IsaInfo::getNumExtraSGPRs(
514 STM, VCCUsed, FlatUsed,
515 getTargetStreamer()->getTargetID()->isXnackOnOrAny());
517 STM.hasSGPRInitBug()) {
518 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
519 if (NumSgpr > MaxAddressableNumSGPRs) {
520 F.getContext().diagnose(DiagnosticInfoResourceLimit(
521 F, "scalar registers", NumSgpr, MaxAddressableNumSGPRs, DS_Error,
523 return;
524 }
525 }
526
527 MCSymbol *NumVgprSymbol =
528 RI.getSymbol(FnSym->getName(), RIK::RIK_NumVGPR, OutContext);
529 MCSymbol *NumAgprSymbol =
530 RI.getSymbol(FnSym->getName(), RIK::RIK_NumAGPR, OutContext);
531 uint64_t NumVgpr, NumAgpr;
532
533 MachineModuleInfo &MMI =
535 MachineFunction *MF = MMI.getMachineFunction(F);
536 if (MF && NumVgprSymbol->isVariable() && NumAgprSymbol->isVariable() &&
537 TryGetMCExprValue(NumVgprSymbol->getVariableValue(), NumVgpr) &&
538 TryGetMCExprValue(NumAgprSymbol->getVariableValue(), NumAgpr)) {
539 const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
540 unsigned MaxWaves = MFI.getMaxWavesPerEU();
541 uint64_t TotalNumVgpr =
542 getTotalNumVGPRs(STM.hasGFX90AInsts(), NumAgpr, NumVgpr);
543 uint64_t NumVGPRsForWavesPerEU =
544 std::max({TotalNumVgpr, (uint64_t)1,
545 (uint64_t)STM.getMinNumVGPRs(
546 MaxWaves, MFI.getDynamicVGPRBlockSize())});
547 uint64_t NumSGPRsForWavesPerEU = std::max(
548 {NumSgpr, (uint64_t)1, (uint64_t)STM.getMinNumSGPRs(MaxWaves)});
549 const MCExpr *OccupancyExpr = createOccupancy(
550 STM.getOccupancyWithWorkGroupSizes(*MF).second,
551 MCConstantExpr::create(NumSGPRsForWavesPerEU, OutContext),
552 MCConstantExpr::create(NumVGPRsForWavesPerEU, OutContext),
554 uint64_t Occupancy;
555
556 const auto [MinWEU, MaxWEU] = AMDGPU::getIntegerPairAttribute(
557 F, "amdgpu-waves-per-eu", {0, 0}, true);
558
559 if (TryGetMCExprValue(OccupancyExpr, Occupancy) && Occupancy < MinWEU) {
560 DiagnosticInfoOptimizationFailure Diag(
561 F, F.getSubprogram(),
562 "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
563 "'" +
564 F.getName() + "': desired occupancy was " + Twine(MinWEU) +
565 ", final occupancy is " + Twine(Occupancy));
566 F.getContext().diagnose(Diag);
567 return;
568 }
569 }
570 }
571}
572
573static void appendTypeEncoding(std::string &Enc, Type *Ty, const DataLayout &DL,
574 bool IsReturnType) {
575 if (Ty->isVoidTy()) {
576 Enc += 'v';
577 return;
578 }
579 unsigned Bits = DL.getTypeSizeInBits(Ty);
580 // Zero-sized non-void types (e.g. `{}` or `[0 x i8]`) consume no ABI
581 // registers. For returns, emit the same no-result marker as void so the
582 // parameter encoding still has an explicit return-type prefix.
583 if (Bits == 0) {
584 if (IsReturnType)
585 Enc += 'v';
586 return;
587 }
588 if (Bits <= 32)
589 Enc += 'i';
590 else if (Bits <= 64)
591 Enc += 'l';
592 else
593 Enc.append(divideCeil(Bits, 32), 'i');
594}
595
596static std::string computeTypeId(const FunctionType *FTy,
597 const DataLayout &DL) {
598 std::string Enc;
599 appendTypeEncoding(Enc, FTy->getReturnType(), DL, /*IsReturnType=*/true);
600 for (Type *ParamTy : FTy->params())
601 appendTypeEncoding(Enc, ParamTy, DL, /*IsReturnType=*/false);
602 return Enc;
603}
604
605void AMDGPUAsmPrinter::collectCallEdge(const MachineInstr &MI) {
607 return;
608 const SIInstrInfo *TII = MF->getSubtarget<GCNSubtarget>().getInstrInfo();
609 const MachineOperand *Callee =
610 TII->getNamedOperand(MI, AMDGPU::OpName::callee);
611 if (!Callee || !Callee->isGlobal())
612 return;
613 DirectCallEdges.insert(
614 {getSymbol(&MF->getFunction()), getSymbol(Callee->getGlobal())});
615}
616
617void AMDGPUAsmPrinter::emitAMDGPUInfo(Module &M) {
619 return;
620
621 const NamedMDNode *LDSMD = M.getNamedMetadata("amdgpu.lds.uses");
622 bool HasLDSUses = LDSMD && LDSMD->getNumOperands() > 0;
623
624 const NamedMDNode *BarMD = M.getNamedMetadata("amdgpu.named_barrier.uses");
625 bool HasNamedBarriers = BarMD && BarMD->getNumOperands() > 0;
626
627 // Collect address-taken functions (with type IDs) and indirect call sites.
628 DenseMap<const Function *, std::string> AddrTakenTypeIds;
629 using IndirectCallInfo = std::pair<const Function *, std::string>;
631
632 for (const Function &F : M) {
633 bool IsKernel = AMDGPU::isKernel(F.getCallingConv());
634
635 if (!IsKernel && F.hasAddressTaken(/*PutOffender=*/nullptr,
636 /*IgnoreCallbackUses=*/false,
637 /*IgnoreAssumeLikeCalls=*/true,
638 /*IgnoreLLVMUsed=*/true)) {
639 AddrTakenTypeIds[&F] =
640 computeTypeId(F.getFunctionType(), M.getDataLayout());
641 }
642
643 if (F.isDeclaration())
644 continue;
645
646 StringSet<> SeenTypeIds;
647 for (const BasicBlock &BB : F) {
648 for (const Instruction &I : BB) {
649 const auto *CB = dyn_cast<CallBase>(&I);
650 if (!CB || !CB->isIndirectCall())
651 continue;
652 std::string TId =
653 computeTypeId(CB->getFunctionType(), M.getDataLayout());
654 if (SeenTypeIds.insert(TId).second)
655 IndirectCalls.push_back({&F, std::move(TId)});
656 }
657 }
658 }
659
660 if (FunctionInfos.empty() && DirectCallEdges.empty() && !HasLDSUses &&
661 !HasNamedBarriers && AddrTakenTypeIds.empty() && IndirectCalls.empty())
662 return;
663
664 AMDGPU::InfoSectionData Data;
665 Data.Funcs = std::move(FunctionInfos);
666
667 for (auto &[F, TypeId] : AddrTakenTypeIds) {
668 MCSymbol *Sym = getSymbol(F);
669 Data.TypeIds.push_back({Sym, TypeId});
670 }
671
672 for (auto &[CallerSym, CalleeSym] : DirectCallEdges)
673 Data.Calls.push_back({CallerSym, CalleeSym});
674 DirectCallEdges.clear();
675
676 if (HasLDSUses) {
677 for (const MDNode *N : LDSMD->operands()) {
678 auto *Func = mdconst::extract<Function>(N->getOperand(0));
679 auto *LdsVar = mdconst::extract<GlobalVariable>(N->getOperand(1));
680 Data.Uses.push_back({getSymbol(Func), getSymbol(LdsVar)});
681 }
682 }
683
684 if (HasNamedBarriers) {
685 for (const MDNode *N : BarMD->operands()) {
686 auto *BarVar = mdconst::extract<GlobalVariable>(N->getOperand(0));
687 MCSymbol *BarSym = getSymbol(BarVar);
688 for (unsigned I = 1, E = N->getNumOperands(); I < E; ++I) {
689 auto *Func = mdconst::extract<Function>(N->getOperand(I));
690 Data.Uses.push_back({getSymbol(Func), BarSym});
691 }
692 }
693 }
694
695 for (auto &[Caller, Enc] : IndirectCalls) {
696 MCSymbol *CallerSym = getSymbol(Caller);
697 Data.IndirectCalls.push_back({CallerSym, Enc});
698 }
699
701}
702
704 // Pad with s_code_end to help tools and guard against instruction prefetch
705 // causing stale data in caches. Arguably this should be done by the linker,
706 // which is why this isn't done for Mesa.
707 // Don't do it if there is no code.
708 const MCSubtargetInfo &STI = *getGlobalSTI();
709 if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) &&
713 if (TextSect->hasInstructions()) {
714 OutStreamer->switchSection(TextSect);
716 }
717 }
718
719 // Emit the unified .amdgpu.info section (per-function resources, call graph,
720 // LDS/named-barrier use edges, indirect calls, and address-taken type IDs).
721 emitAMDGPUInfo(M);
722
723 // Assign expressions which can only be resolved when all other functions are
724 // known.
725 RI.finalize(OutContext);
726
727 // Switch section and emit all GPR maximums within the processed module.
728 OutStreamer->pushSection();
729 MCSectionELF *MaxGPRSection =
730 OutContext.getELFSection(".AMDGPU.gpr_maximums", ELF::SHT_PROGBITS, 0);
731 OutStreamer->switchSection(MaxGPRSection);
733 RI.getMaxVGPRSymbol(OutContext), RI.getMaxAGPRSymbol(OutContext),
734 RI.getMaxSGPRSymbol(OutContext), RI.getMaxNamedBarrierSymbol(OutContext));
735 OutStreamer->popSection();
736
737 // In the object-linking pipeline per-function resource MCExprs reference
738 // external callee symbols that cannot be evaluated here, so cross-TU limit
739 // checks would silently no-op for every non-leaf function. Defer resource
740 // sanity checking to the linker, which re-validates against the aggregated
741 // call graph in the combined .amdgpu.info metadata.
743 for (Function &F : M.functions())
744 validateMCResourceInfo(F);
745 }
746
747 RI.reset();
748
750}
751
752SmallString<128> AMDGPUAsmPrinter::getMCExprStr(const MCExpr *Value) {
754 raw_svector_ostream OSS(Str);
755 auto &Streamer = getTargetStreamer()->getStreamer();
756 auto &Context = Streamer.getContext();
757 const MCExpr *New = foldAMDGPUMCExpr(Value, Context);
758 printAMDGPUMCExpr(New, OSS, &MAI);
759 return Str;
760}
761
762// Print comments that apply to both callable functions and entry points.
763void AMDGPUAsmPrinter::emitCommonFunctionComments(
764 const MCExpr *NumVGPR, const MCExpr *NumAGPR, const MCExpr *TotalNumVGPR,
765 const MCExpr *NumSGPR, const MCExpr *ScratchSize, uint64_t CodeSize,
766 const AMDGPUMachineFunctionInfo *MFI) {
767 OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
768 OutStreamer->emitRawComment(" TotalNumSgprs: " + getMCExprStr(NumSGPR),
769 false);
770 OutStreamer->emitRawComment(" NumVgprs: " + getMCExprStr(NumVGPR), false);
771 if (NumAGPR && TotalNumVGPR) {
772 OutStreamer->emitRawComment(" NumAgprs: " + getMCExprStr(NumAGPR), false);
773 OutStreamer->emitRawComment(" TotalNumVgprs: " + getMCExprStr(TotalNumVGPR),
774 false);
775 }
776 OutStreamer->emitRawComment(" ScratchSize: " + getMCExprStr(ScratchSize),
777 false);
778 OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
779 false);
780}
781
782const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
783 const MachineFunction &MF) const {
784 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
785 MCContext &Ctx = MF.getContext();
786 uint16_t KernelCodeProperties = 0;
787 const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo();
788
789 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
790 KernelCodeProperties |=
791 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
792 }
793 if (UserSGPRInfo.hasDispatchPtr()) {
794 KernelCodeProperties |=
795 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
796 }
797 if (UserSGPRInfo.hasQueuePtr()) {
798 KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
799 }
800 if (UserSGPRInfo.hasKernargSegmentPtr()) {
801 KernelCodeProperties |=
802 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
803 }
804 if (UserSGPRInfo.hasDispatchID()) {
805 KernelCodeProperties |=
806 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
807 }
808 if (UserSGPRInfo.hasFlatScratchInit()) {
809 KernelCodeProperties |=
810 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
811 }
812 if (UserSGPRInfo.hasPrivateSegmentSize()) {
813 KernelCodeProperties |=
814 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;
815 }
816 if (MF.getSubtarget<GCNSubtarget>().isWave32()) {
817 KernelCodeProperties |=
818 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
819 }
820
821 // CurrentProgramInfo.DynamicCallStack is a MCExpr and could be
822 // un-evaluatable at this point so it cannot be conditionally checked here.
823 // Instead, we'll directly shift the possibly unknown MCExpr into its place
824 // and bitwise-or it into KernelCodeProperties.
825 const MCExpr *KernelCodePropExpr =
826 MCConstantExpr::create(KernelCodeProperties, Ctx);
827 const MCExpr *OrValue = MCConstantExpr::create(
828 amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK_SHIFT, Ctx);
829 OrValue = MCBinaryExpr::createShl(CurrentProgramInfo.DynamicCallStack,
830 OrValue, Ctx);
831 KernelCodePropExpr = MCBinaryExpr::createOr(KernelCodePropExpr, OrValue, Ctx);
832
833 return KernelCodePropExpr;
834}
835
836MCKernelDescriptor
837AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
838 const SIProgramInfo &PI) const {
839 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
840 const Function &F = MF.getFunction();
841 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
842 MCContext &Ctx = MF.getContext();
843
844 MCKernelDescriptor KernelDescriptor;
845
846 KernelDescriptor.group_segment_fixed_size =
848 KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;
849
850 Align MaxKernArgAlign;
851 KernelDescriptor.kernarg_size = MCConstantExpr::create(
852 STM.getKernArgSegmentSize(F, MaxKernArgAlign), Ctx);
853
854 KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(STM, Ctx);
855 KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2(STM, Ctx);
856 KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);
857
858 int64_t PGM_Rsrc3 = 1;
859 bool EvaluatableRsrc3 =
860 CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(PGM_Rsrc3);
861 (void)PGM_Rsrc3;
862 (void)EvaluatableRsrc3;
864 STM.hasGFX90AInsts() || STM.hasGFX1250Insts() || !EvaluatableRsrc3 ||
865 static_cast<uint64_t>(PGM_Rsrc3) == 0);
866 KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3;
867
868 KernelDescriptor.kernarg_preload = MCConstantExpr::create(
869 AMDGPU::hasKernargPreload(STM) ? Info->getNumKernargPreloadedSGPRs() : 0,
870 Ctx);
871
872 return KernelDescriptor;
873}
874
876 // Init target streamer lazily on the first function so that previous passes
877 // can set metadata.
879 initTargetStreamer(*MF.getFunction().getParent());
880
881 ResourceUsage =
883 CurrentProgramInfo.reset(MF);
884
885 const AMDGPUMachineFunctionInfo *MFI =
886 MF.getInfo<AMDGPUMachineFunctionInfo>();
887 MCContext &Ctx = MF.getContext();
888
889 // The starting address of all shader programs must be 256 bytes aligned.
890 // Regular functions just need the basic required instruction alignment.
891 MF.ensureAlignment(MFI->isEntryFunction() ? Align(256) : Align(4));
892
894
895 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
897 // FIXME: This should be an explicit check for Mesa.
898 if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
899 MCSectionELF *ConfigSection =
900 Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
901 OutStreamer->switchSection(ConfigSection);
902 }
903
904 RI.gatherResourceInfo(MF, *ResourceUsage, OutContext);
905
908 *ResourceUsage;
909 FunctionInfos.push_back(
910 {/*NumSGPR=*/static_cast<uint32_t>(RU.NumExplicitSGPR),
911 /*NumArchVGPR=*/static_cast<uint32_t>(RU.NumVGPR),
912 /*NumAccVGPR=*/static_cast<uint32_t>(RU.NumAGPR),
913 /*PrivateSegmentSize=*/static_cast<uint32_t>(RU.PrivateSegmentSize),
914 /*UsesVCC=*/RU.UsesVCC,
915 /*UsesFlatScratch=*/RU.UsesFlatScratch,
916 /*HasDynStack=*/RU.HasDynamicallySizedStack,
917 /*Sym=*/getSymbol(&MF.getFunction())});
918 }
919
920 if (MFI->isModuleEntryFunction()) {
921 getSIProgramInfo(CurrentProgramInfo, MF);
922 }
923
924 if (STM.isAmdPalOS()) {
925 if (MFI->isEntryFunction())
926 EmitPALMetadata(MF, CurrentProgramInfo);
927 else if (MFI->isModuleEntryFunction())
928 emitPALFunctionMetadata(MF);
929 } else if (!STM.isAmdHsaOS()) {
930 EmitProgramInfoSI(MF, CurrentProgramInfo);
931 }
932
933 DumpCodeInstEmitter = nullptr;
934 if (STM.dumpCode()) {
935 // For -dumpcode, get the assembler out of the streamer. This only works
936 // with -filetype=obj.
937 MCAssembler *Assembler = OutStreamer->getAssemblerPtr();
938 if (Assembler)
939 DumpCodeInstEmitter = Assembler->getEmitterPtr();
940 }
941
942 DisasmLines.clear();
943 HexLines.clear();
945
947
948 emitResourceUsageRemarks(MF, CurrentProgramInfo, MFI->isModuleEntryFunction(),
949 STM.hasMAIInsts());
950
951 {
954 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumVGPR, OutContext),
955 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumAGPR, OutContext),
956 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumSGPR, OutContext),
957 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumNamedBarrier,
958 OutContext),
959 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize,
960 OutContext),
961 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_UsesVCC, OutContext),
962 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_UsesFlatScratch,
963 OutContext),
964 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasDynSizedStack,
965 OutContext),
966 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasRecursion,
967 OutContext),
968 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasIndirectCall,
969 OutContext));
970 }
971
972 // Emit _dvgpr$ symbol when appropriate.
973 emitDVgprSymbol(MF);
974
975 if (isVerbose()) {
976 MCSectionELF *CommentSection =
977 Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
978 OutStreamer->switchSection(CommentSection);
979
980 if (!MFI->isEntryFunction()) {
982 OutStreamer->emitRawComment(" Function info:", false);
983
984 emitCommonFunctionComments(
985 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumVGPR, OutContext)
986 ->getVariableValue(),
987 STM.hasMAIInsts() ? RI.getSymbol(CurrentFnSym->getName(),
988 RIK::RIK_NumAGPR, OutContext)
989 ->getVariableValue()
990 : nullptr,
991 RI.createTotalNumVGPRs(MF, Ctx),
992 RI.createTotalNumSGPRs(
993 MF,
994 MF.getSubtarget<GCNSubtarget>().getTargetID().isXnackOnOrAny(),
995 Ctx),
996 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize,
998 ->getVariableValue(),
999 CurrentProgramInfo.getFunctionCodeSize(MF), MFI);
1000 return false;
1001 }
1002
1003 OutStreamer->emitRawComment(" Kernel info:", false);
1004 emitCommonFunctionComments(
1005 CurrentProgramInfo.NumArchVGPR,
1006 STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR : nullptr,
1007 CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR,
1008 CurrentProgramInfo.ScratchSize,
1009 CurrentProgramInfo.getFunctionCodeSize(MF), MFI);
1010
1011 OutStreamer->emitRawComment(
1012 " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
1013 OutStreamer->emitRawComment(
1014 " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
1015 OutStreamer->emitRawComment(
1016 " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
1017 " bytes/workgroup (compile time only)",
1018 false);
1019
1020 OutStreamer->emitRawComment(
1021 " SGPRBlocks: " + getMCExprStr(CurrentProgramInfo.SGPRBlocks), false);
1022
1023 OutStreamer->emitRawComment(
1024 " VGPRBlocks: " + getMCExprStr(CurrentProgramInfo.VGPRBlocks), false);
1025
1026 OutStreamer->emitRawComment(
1027 " NumSGPRsForWavesPerEU: " +
1028 getMCExprStr(CurrentProgramInfo.NumSGPRsForWavesPerEU),
1029 false);
1030 OutStreamer->emitRawComment(
1031 " NumVGPRsForWavesPerEU: " +
1032 getMCExprStr(CurrentProgramInfo.NumVGPRsForWavesPerEU),
1033 false);
1034
1035 if (STM.hasGFX90AInsts()) {
1036 const MCExpr *AdjustedAccum = MCBinaryExpr::createAdd(
1037 CurrentProgramInfo.AccumOffset, MCConstantExpr::create(1, Ctx), Ctx);
1038 AdjustedAccum = MCBinaryExpr::createMul(
1039 AdjustedAccum, MCConstantExpr::create(4, Ctx), Ctx);
1040 OutStreamer->emitRawComment(
1041 " AccumOffset: " + getMCExprStr(AdjustedAccum), false);
1042 }
1043
1044 if (STM.hasGFX1250Insts())
1045 OutStreamer->emitRawComment(
1046 " NamedBarCnt: " + getMCExprStr(CurrentProgramInfo.NamedBarCnt),
1047 false);
1048
1049 OutStreamer->emitRawComment(
1050 " Occupancy: " + getMCExprStr(CurrentProgramInfo.Occupancy), false);
1051
1052 OutStreamer->emitRawComment(
1053 " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
1054
1055 OutStreamer->emitRawComment(
1056 " COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
1057 getMCExprStr(CurrentProgramInfo.ScratchEnable),
1058 false);
1059 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
1060 Twine(CurrentProgramInfo.UserSGPR),
1061 false);
1062 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
1063 Twine(CurrentProgramInfo.TrapHandlerEnable),
1064 false);
1065 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
1066 Twine(CurrentProgramInfo.TGIdXEnable),
1067 false);
1068 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
1069 Twine(CurrentProgramInfo.TGIdYEnable),
1070 false);
1071 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
1072 Twine(CurrentProgramInfo.TGIdZEnable),
1073 false);
1074 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
1075 Twine(CurrentProgramInfo.TIdIGCompCount),
1076 false);
1077
1078 [[maybe_unused]] int64_t PGMRSrc3;
1080 STM.hasGFX90AInsts() || STM.hasGFX1250Insts() ||
1081 (CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(PGMRSrc3) &&
1082 static_cast<uint64_t>(PGMRSrc3) == 0));
1083 if (STM.hasGFX90AInsts()) {
1084 OutStreamer->emitRawComment(
1085 " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
1086 getMCExprStr(MCKernelDescriptor::bits_get(
1087 CurrentProgramInfo.ComputePGMRSrc3,
1088 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT,
1089 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, Ctx)),
1090 false);
1091 OutStreamer->emitRawComment(
1092 " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
1093 getMCExprStr(MCKernelDescriptor::bits_get(
1094 CurrentProgramInfo.ComputePGMRSrc3,
1095 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT,
1096 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, Ctx)),
1097 false);
1098 }
1099 }
1100
1101 if (DumpCodeInstEmitter) {
1102
1103 OutStreamer->switchSection(
1104 Context.getELFSection(".AMDGPU.disasm", ELF::SHT_PROGBITS, 0));
1105
1106 for (size_t i = 0; i < DisasmLines.size(); ++i) {
1107 std::string Comment = "\n";
1108 if (!HexLines[i].empty()) {
1109 Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
1110 Comment += " ; " + HexLines[i] + "\n";
1111 }
1112
1113 OutStreamer->emitBytes(StringRef(DisasmLines[i]));
1114 OutStreamer->emitBytes(StringRef(Comment));
1115 }
1116 }
1117
1118 return false;
1119}
1120
1121// When appropriate, add a _dvgpr$ symbol, with the value of the function
1122// symbol, plus an offset encoding one less than the number of VGPR blocks used
1123// by the function in bits 5..3 of the symbol value. A "VGPR block" can be
1124// either 16 VGPRs (for a max of 128), or 32 VGPRs (for a max of 256). This is
1125// used by a front-end to have functions that are chained rather than called,
1126// and a dispatcher that dynamically resizes the VGPR count before dispatching
1127// to a function.
//
// The block count is derived from CurrentProgramInfo.NumVGPRsForWavesPerEU and
// the dynamic VGPR block size reported by the function info. When the VGPR
// count already folds to a constant the encoding is computed directly;
// otherwise an equivalent symbolic MCExpr is built. The new symbol is given
// the same visibility and linkage as the function.
//
// NOTE(review): this listing omits some original lines (embedded numbers 1129,
// 1131, 1147, 1150 and 1168 are absent), including the declaration of MFI and
// the second half of the guarding condition. Consult the full source before
// relying on the text below.
1128void AMDGPUAsmPrinter::emitDVgprSymbol(MachineFunction &MF) {
1130 if (MFI.isDynamicVGPREnabled() &&
1132 MCContext &Ctx = MF.getContext();
1133 unsigned BlockSize = MFI.getDynamicVGPRBlockSize();
1134
 // Offset that is added to the function symbol further down.
1135 const MCExpr *EncodedBlocks;
1136 MCValue NumVGPRs;
1137 if (CurrentProgramInfo.NumVGPRsForWavesPerEU->evaluateAsRelocatable(
1138 NumVGPRs, nullptr) &&
1139 NumVGPRs.isAbsolute()) {
1140
1141 // Calculate number of VGPR blocks.
1142 // Treat 0 VGPRs as 1 VGPR to avoid underflowing.
1143 unsigned NumBlocks =
1144 divideCeil(std::max(unsigned(NumVGPRs.getConstant()), 1U), BlockSize);
1145
 // Too many blocks to encode: report it and emit no symbol at all.
 // NOTE(review): the message below spells the symbol __dvgpr$ (two leading
 // underscores) while the symbol created at the end of this function is
 // _dvgpr$ -- confirm which spelling is intended.
1146 if (NumBlocks > AMDGPU::IsaInfo::MaxDynamicVGPRBlocks) {
1148 {}, "DVGPR block count " + Twine(NumBlocks) +
1149 " exceeds maximum of " +
1151 " for __dvgpr$ symbol for '" +
1152 Twine(CurrentFnSym->getName()) + "'");
1153 return;
1154 }
 // One less than the block count, placed starting at bit 3.
1155 unsigned EncodedNumBlocks = (NumBlocks - 1) << 3;
1156 EncodedBlocks = MCConstantExpr::create(EncodedNumBlocks, Ctx);
1157 } else {
1158 // Value not yet available so build a symbolic MCExpr:
1159 // ((alignTo(max(NumVGPRs, 1), BlockSize) / BlockSize) - 1) << 3
1160 const MCExpr *One = MCConstantExpr::create(1, Ctx);
1161 const MCExpr *BlockSizeConst = MCConstantExpr::create(BlockSize, Ctx);
1162 const MCExpr *MaxVGPRs = AMDGPUMCExpr::createMax(
1163 {CurrentProgramInfo.NumVGPRsForWavesPerEU, One}, Ctx);
1164 const MCExpr *NumBlocks = MCBinaryExpr::createDiv(
1165 AMDGPUMCExpr::createAlignTo(MaxVGPRs, BlockSizeConst, Ctx),
1166 BlockSizeConst, Ctx);
1167 EncodedBlocks =
1169 MCConstantExpr::create(3, Ctx), Ctx);
1170 }
1171
1172 // Add to function symbol to create _dvgpr$ symbol.
1173 const MCExpr *DVgprFuncVal = MCBinaryExpr::createAdd(
1174 MCSymbolRefExpr::create(CurrentFnSym, Ctx), EncodedBlocks, Ctx);
1175 MCSymbol *DVgprFuncSym =
1176 Ctx.getOrCreateSymbol(Twine("_dvgpr$") + CurrentFnSym->getName());
1177 OutStreamer->emitAssignment(DVgprFuncSym, DVgprFuncVal);
1178 emitVisibility(DVgprFuncSym, MF.getFunction().getVisibility());
1179 emitLinkage(&MF.getFunction(), DVgprFuncSym);
1180 }
1181}
1182
1183// TODO: Fold this into emitFunctionBodyStart.
//
// Settles the XNACK and SRAMECC settings of the target streamer target ID for
// module M. For each feature that is supported and still set to Any, the
// setting is copied from the subtarget of the functions in M, visiting them in
// module order until neither feature is left undecided.
//
// NOTE(review): the statement that performs the initial setup starts on a line
// missing from this listing (embedded number 1187); only its last argument is
// visible below.
1184void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
1185 // In the beginning all features are either 'Any' or 'NotSupported',
1186 // depending on global target features. This will cover empty modules.
1188 getGlobalSTI()->getFeatureString());
1189
1190 // If module is empty, we are done.
1191 if (M.empty())
1192 return;
1193
1194 // If module is not empty, need to find first 'Off' or 'On' feature
1195 // setting per feature from functions in module.
1196 for (auto &F : M) {
1197 auto &TSTargetID = getTargetStreamer()->getTargetID();
 // Stop once each feature is either unsupported or already On/Off.
1198 if ((!TSTargetID->isXnackSupported() || TSTargetID->isXnackOnOrOff()) &&
1199 (!TSTargetID->isSramEccSupported() || TSTargetID->isSramEccOnOrOff()))
1200 break;
1201
1202 const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
1203 const IsaInfo::AMDGPUTargetID &STMTargetID = STM.getTargetID();
 // Only a setting that is still Any is overwritten with the one from F.
1204 if (TSTargetID->isXnackSupported())
1205 if (TSTargetID->getXnackSetting() == IsaInfo::TargetIDSetting::Any)
1206 TSTargetID->setXnackSetting(STMTargetID.getXnackSetting());
1207 if (TSTargetID->isSramEccSupported())
1208 if (TSTargetID->getSramEccSetting() == IsaInfo::TargetIDSetting::Any)
1209 TSTargetID->setSramEccSetting(STMTargetID.getSramEccSetting());
1210 }
1211}
1212
1213// AccumOffset computed for the MCExpr equivalent of:
1214// alignTo(std::max(1, NumVGPR), 4) / 4 - 1;
1215static const MCExpr *computeAccumOffset(const MCExpr *NumVGPR, MCContext &Ctx) {
1216 const MCExpr *ConstFour = MCConstantExpr::create(4, Ctx);
1217 const MCExpr *ConstOne = MCConstantExpr::create(1, Ctx);
1218
1219 // Can't be lower than 1 for subsequent alignTo.
1220 const MCExpr *MaximumTaken =
1221 AMDGPUMCExpr::createMax({ConstOne, NumVGPR}, Ctx);
1222
1223 // Practically, it's computing divideCeil(MaximumTaken, 4).
1224 const MCExpr *DivCeil = MCBinaryExpr::createDiv(
1225 AMDGPUMCExpr::createAlignTo(MaximumTaken, ConstFour, Ctx), ConstFour,
1226 Ctx);
1227
1228 return MCBinaryExpr::createSub(DivCeil, ConstOne, Ctx);
1229}
1230
// Populates ProgInfo for MF.
//
// Register counts, scratch size and the VCC / flat-scratch / dynamic-stack
// flags are taken as references to the per-function resource-info symbols, so
// most derived fields are MCExprs that may only fold to constants later.
// Float mode, LDS, spill and user-SGPR fields are read directly from the
// SIMachineFunctionInfo and the subtarget.
//
// Diagnostics are reported through the function LLVMContext when the
// addressable SGPR count, the user SGPR count or the local memory size exceeds
// the subtarget limit, and when the resolved occupancy is below the minimum
// requested by the amdgpu-waves-per-eu attribute.
//
// NOTE(review): several original lines are absent from this listing (the
// embedded numbering skips, e.g. 1255, 1258, 1286, 1312, 1320, 1323, 1340), so
// some statements below appear truncated. Consult the full source.
1231void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
1232 const MachineFunction &MF) {
1233 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1234 MCContext &Ctx = MF.getContext();
1235
 // Wraps an integer in an MCConstantExpr.
1236 auto CreateExpr = [&Ctx](int64_t Value) {
1237 return MCConstantExpr::create(Value, Ctx);
1238 };
1239
 // Writes Res and returns true only when Value folds to an absolute constant.
1240 auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {
1241 int64_t Val;
1242 if (Value->evaluateAsAbsolute(Val)) {
1243 Res = Val;
1244 return true;
1245 }
1246 return false;
1247 };
1248
 // Returns a reference to the resource-info symbol of the given kind for the
 // current function.
1249 auto GetSymRefExpr =
1250 [&](MCResourceInfo::ResourceInfoKind RIK) -> const MCExpr * {
1251 MCSymbol *Sym = RI.getSymbol(CurrentFnSym->getName(), RIK, OutContext);
1252 return MCSymbolRefExpr::create(Sym, Ctx);
1253 };
1254
1256 ProgInfo.NumArchVGPR = GetSymRefExpr(RIK::RIK_NumVGPR);
1257 ProgInfo.NumAccVGPR = GetSymRefExpr(RIK::RIK_NumAGPR);
1259 ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
1260
1261 ProgInfo.AccumOffset = computeAccumOffset(ProgInfo.NumArchVGPR, Ctx);
1262 ProgInfo.TgSplit = STM.isTgSplitEnabled();
1263 ProgInfo.NumSGPR = GetSymRefExpr(RIK::RIK_NumSGPR);
1264 ProgInfo.ScratchSize = GetSymRefExpr(RIK::RIK_PrivateSegSize);
1265 ProgInfo.VCCUsed = GetSymRefExpr(RIK::RIK_UsesVCC);
1266 ProgInfo.FlatUsed = GetSymRefExpr(RIK::RIK_UsesFlatScratch);
 // Dynamic call stack is flagged when either source symbol is set.
1267 ProgInfo.DynamicCallStack =
1268 MCBinaryExpr::createOr(GetSymRefExpr(RIK::RIK_HasDynSizedStack),
1269 GetSymRefExpr(RIK::RIK_HasRecursion), Ctx);
1270
 // Named barrier count is the symbol value rounded up to a multiple of 4 and
 // then divided by 4.
1271 const MCExpr *BarBlkConst = MCConstantExpr::create(4, Ctx);
1272 const MCExpr *AlignToBlk = AMDGPUMCExpr::createAlignTo(
1273 GetSymRefExpr(RIK::RIK_NumNamedBarrier), BarBlkConst, Ctx);
1274 ProgInfo.NamedBarCnt = MCBinaryExpr::createDiv(AlignToBlk, BarBlkConst, Ctx);
1275
1276 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1277
1278 // The calculations related to SGPR/VGPR blocks are
1279 // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
1280 // unified.
1281 const MCExpr *ExtraSGPRs = AMDGPUMCExpr::createExtraSGPRs(
1282 ProgInfo.VCCUsed, ProgInfo.FlatUsed,
1283 getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Ctx);
1284
1285 // Check the addressable register limit before we add ExtraSGPRs.
 // NOTE(review): the first half of this condition (embedded line 1286) is
 // missing from the listing.
1287 !STM.hasSGPRInitBug()) {
1288 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
1289 uint64_t NumSgpr;
 // The check only fires when NumSGPR already folds to a constant.
1290 if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
1291 NumSgpr > MaxAddressableNumSGPRs) {
1292 // This can happen due to a compiler bug or when using inline asm.
 // Note: this LLVMContext Ctx shadows the outer MCContext Ctx inside this
 // block; CreateExpr still uses the MCContext it captured.
1293 LLVMContext &Ctx = MF.getFunction().getContext();
1294 Ctx.diagnose(DiagnosticInfoResourceLimit(
1295 MF.getFunction(), "addressable scalar registers", NumSgpr,
1296 MaxAddressableNumSGPRs, DS_Error, DK_ResourceLimit));
1297 ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs - 1);
1298 }
1299 }
1300
1301 // Account for extra SGPRs and VGPRs reserved for debugger use.
1302 ProgInfo.NumSGPR = MCBinaryExpr::createAdd(ProgInfo.NumSGPR, ExtraSGPRs, Ctx);
1303
1304 const Function &F = MF.getFunction();
1305
1306 // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
1307 // dispatch registers as function args.
1308 unsigned WaveDispatchNumSGPR = MFI->getNumWaveDispatchSGPRs(),
1309 WaveDispatchNumVGPR = MFI->getNumWaveDispatchVGPRs();
1310
1311 if (WaveDispatchNumSGPR) {
1313 {ProgInfo.NumSGPR,
1314 MCBinaryExpr::createAdd(CreateExpr(WaveDispatchNumSGPR), ExtraSGPRs,
1315 Ctx)},
1316 Ctx);
1317 }
1318
1319 if (WaveDispatchNumVGPR) {
1321 {ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx);
1322
1324 ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
1325 }
1326
1327 // Adjust number of registers used to meet default/requested minimum/maximum
1328 // number of waves per execution unit request.
1329 unsigned MaxWaves = MFI->getMaxWavesPerEU();
1330 ProgInfo.NumSGPRsForWavesPerEU =
1331 AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, CreateExpr(1ul),
1332 CreateExpr(STM.getMinNumSGPRs(MaxWaves))},
1333 Ctx);
1334 ProgInfo.NumVGPRsForWavesPerEU =
1335 AMDGPUMCExpr::createMax({ProgInfo.NumVGPR, CreateExpr(1ul),
1336 CreateExpr(STM.getMinNumVGPRs(
1337 MaxWaves, MFI->getDynamicVGPRBlockSize()))},
1338 Ctx);
1339
 // NOTE(review): the first half of this condition (embedded line 1340) is
 // missing from the listing.
1341 STM.hasSGPRInitBug()) {
1342 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
1343 uint64_t NumSgpr;
1344 if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
1345 NumSgpr > MaxAddressableNumSGPRs) {
1346 // This can happen due to a compiler bug or when using inline asm to use
1347 // the registers which are usually reserved for vcc etc.
1348 LLVMContext &Ctx = MF.getFunction().getContext();
1349 Ctx.diagnose(DiagnosticInfoResourceLimit(
1350 MF.getFunction(), "scalar registers", NumSgpr, MaxAddressableNumSGPRs,
1352 ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs);
1353 ProgInfo.NumSGPRsForWavesPerEU = CreateExpr(MaxAddressableNumSGPRs);
1354 }
1355 }
1356
 // NOTE(review): the right-hand sides of both assignments below (embedded
 // lines 1359 and 1361) are missing from the listing.
1357 if (STM.hasSGPRInitBug()) {
1358 ProgInfo.NumSGPR =
1360 ProgInfo.NumSGPRsForWavesPerEU =
1362 }
1363
1364 if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
1365 LLVMContext &Ctx = MF.getFunction().getContext();
1366 Ctx.diagnose(DiagnosticInfoResourceLimit(
1367 MF.getFunction(), "user SGPRs", MFI->getNumUserSGPRs(),
1369 }
1370
1371 if (MFI->getLDSSize() > STM.getAddressableLocalMemorySize()) {
1372 LLVMContext &Ctx = MF.getFunction().getContext();
1373 Ctx.diagnose(DiagnosticInfoResourceLimit(
1374 MF.getFunction(), "local memory", MFI->getLDSSize(),
1376 }
1377 // The MCExpr equivalent of getNumSGPRBlocks/getNumVGPRBlocks:
1378 // (alignTo(max(1u, NumGPR), GPREncodingGranule) / GPREncodingGranule) - 1
1379 auto GetNumGPRBlocks = [&CreateExpr, &Ctx](const MCExpr *NumGPR,
1380 unsigned Granule) {
1381 const MCExpr *OneConst = CreateExpr(1ul);
1382 const MCExpr *GranuleConst = CreateExpr(Granule);
1383 const MCExpr *MaxNumGPR = AMDGPUMCExpr::createMax({NumGPR, OneConst}, Ctx);
1384 const MCExpr *AlignToGPR =
1385 AMDGPUMCExpr::createAlignTo(MaxNumGPR, GranuleConst, Ctx);
1386 const MCExpr *DivGPR =
1387 MCBinaryExpr::createDiv(AlignToGPR, GranuleConst, Ctx);
1388 const MCExpr *SubGPR = MCBinaryExpr::createSub(DivGPR, OneConst, Ctx);
1389 return SubGPR;
1390 };
1391 // GFX10+ will always allocate 128 SGPRs and this field must be 0
1393 ProgInfo.SGPRBlocks = CreateExpr(0ul);
1394 } else {
1395 ProgInfo.SGPRBlocks = GetNumGPRBlocks(ProgInfo.NumSGPRsForWavesPerEU,
1397 }
1398 ProgInfo.VGPRBlocks = GetNumGPRBlocks(ProgInfo.NumVGPRsForWavesPerEU,
1400
1401 const SIModeRegisterDefaults Mode = MFI->getMode();
1402
1403 // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
1404 // register.
1405 ProgInfo.FloatMode = getFPMode(Mode);
1406
1407 ProgInfo.IEEEMode = Mode.IEEE;
1408
1409 // Make clamp modifier on NaN input returns 0.
1410 ProgInfo.DX10Clamp = Mode.DX10Clamp;
1411
 // Pick the shift used to round LDSSize up to whole blocks, keyed on the LDS
 // granularity reported for the subtarget.
1412 unsigned LDSAlignShift = 8;
1413 switch (getLdsDwGranularity(STM)) {
1414 case 512:
1415 case 320:
1416 LDSAlignShift = 11;
1417 break;
1418 case 128:
1419 LDSAlignShift = 9;
1420 break;
1421 case 64:
1422 LDSAlignShift = 8;
1423 break;
1424 default:
1425 llvm_unreachable("invald LDS block size");
1426 }
1427
1428 ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
1429 ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs();
1430
1431 ProgInfo.LDSSize = MFI->getLDSSize();
 // Number of blocks of (1 << LDSAlignShift) needed to hold LDSSize.
1432 ProgInfo.LDSBlocks =
1433 alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
1434
1435 // The MCExpr equivalent of divideCeil.
1436 auto DivideCeil = [&Ctx](const MCExpr *Numerator, const MCExpr *Denominator) {
1437 const MCExpr *Ceil =
1438 AMDGPUMCExpr::createAlignTo(Numerator, Denominator, Ctx);
1439 return MCBinaryExpr::createDiv(Ceil, Denominator, Ctx);
1440 };
1441
1442 // Scratch is allocated in 64-dword or 256-dword blocks.
1443 unsigned ScratchAlignShift =
1444 STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10;
1445 // We need to program the hardware with the amount of scratch memory that
1446 // is used by the entire wave. ProgInfo.ScratchSize is the amount of
1447 // scratch memory used per thread.
1448 ProgInfo.ScratchBlocks = DivideCeil(
1450 CreateExpr(STM.getWavefrontSize()), Ctx),
1451 CreateExpr(1ULL << ScratchAlignShift));
1452
1453 if (STM.supportsWGP()) {
1454 ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
1455 }
1456
1457 if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
1458 ProgInfo.MemOrdered = 1;
1459 ProgInfo.FwdProgress = !F.hasFnAttribute("amdgpu-no-fwd-progress");
1460 }
1461
1462 // 0 = X, 1 = XY, 2 = XYZ
1463 unsigned TIDIGCompCnt = 0;
1464 if (MFI->hasWorkItemIDZ())
1465 TIDIGCompCnt = 2;
1466 else if (MFI->hasWorkItemIDY())
1467 TIDIGCompCnt = 1;
1468
1469 // The private segment wave byte offset is the last of the system SGPRs. We
1470 // initially assumed it was allocated, and may have used it. It shouldn't harm
1471 // anything to disable it if we know the stack isn't used here. We may still
1472 // have emitted code reading it to initialize scratch, but if that's unused
1473 // reading garbage should be OK.
1476 MCConstantExpr::create(0, Ctx), Ctx),
1477 ProgInfo.DynamicCallStack, Ctx);
1478
1479 ProgInfo.UserSGPR = MFI->getNumUserSGPRs();
1480 // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
1481 ProgInfo.TrapHandlerEnable = STM.isAmdHsaOS() ? 0 : STM.hasTrapHandler();
1482 ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX();
1483 ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY();
1484 ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ();
1485 ProgInfo.TGSizeEnable = MFI->hasWorkGroupInfo();
1486 ProgInfo.TIdIGCompCount = TIDIGCompCnt;
1487 ProgInfo.EXCPEnMSB = 0;
1488 // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
1489 ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;
1490 ProgInfo.EXCPEnable = 0;
1491
 // Fold the accumulation offset and TG split flag into COMPUTE_PGM_RSRC3.
1492 if (STM.hasGFX90AInsts()) {
1493 ProgInfo.ComputePGMRSrc3 =
1494 setBits(ProgInfo.ComputePGMRSrc3, ProgInfo.AccumOffset,
1495 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
1496 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT, Ctx);
1497 ProgInfo.ComputePGMRSrc3 =
1498 setBits(ProgInfo.ComputePGMRSrc3, CreateExpr(ProgInfo.TgSplit),
1499 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
1500 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT, Ctx);
1501 }
1502
 // Fold the named barrier count into COMPUTE_PGM_RSRC3.
1503 if (STM.hasGFX1250Insts())
1504 ProgInfo.ComputePGMRSrc3 =
1505 setBits(ProgInfo.ComputePGMRSrc3, ProgInfo.NamedBarCnt,
1506 amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT,
1507 amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT_SHIFT, Ctx);
1508
1509 ProgInfo.Occupancy = createOccupancy(
1510 STM.computeOccupancy(F, ProgInfo.LDSSize).second,
1512 MFI->getDynamicVGPRBlockSize(), STM, Ctx);
1513
 // Report a missed occupancy target, but only when the occupancy expression
 // already folds to a constant.
1514 const auto [MinWEU, MaxWEU] =
1515 AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
1516 uint64_t Occupancy;
1517 if (TryGetMCExprValue(ProgInfo.Occupancy, Occupancy) && Occupancy < MinWEU) {
1518 DiagnosticInfoOptimizationFailure Diag(
1519 F, F.getSubprogram(),
1520 "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
1521 "'" +
1522 F.getName() + "': desired occupancy was " + Twine(MinWEU) +
1523 ", final occupancy is " + Twine(Occupancy));
1524 F.getContext().diagnose(Diag);
1525 }
1526}
1527
// Maps a calling convention to the unsigned register identifier that
// EmitProgramInfoSI writes in front of the non-compute resource word.
//
// NOTE(review): the case labels and return statements of this switch
// (embedded numbers 1532-1545) are missing from this listing. Only the default
// label is visible, and it falls through to the first omitted case.
1528static unsigned getRsrcReg(CallingConv::ID CallConv) {
1529 switch (CallConv) {
1530 default:
1531 [[fallthrough]];
1546 }
1547}
1548
// Writes the program info of MF to OutStreamer as a sequence of 32-bit values:
// the resource words, the scratch wave-size bits, the pixel shader inputs
// (AMDGPU_PS calling convention only) and finally the spilled SGPR and VGPR
// counts, each preceded by its R_SPILLED_* identifier.
//
// Values held as MCExprs are emitted as plain integers when they already fold
// to a constant and as expressions otherwise.
//
// NOTE(review): several original lines are absent from this listing (embedded
// numbers 1560, 1573, 1578, 1582, 1610, 1630, 1635, 1637), most of them
// emitInt32 calls for register identifiers. Consult the full source.
1549void AMDGPUAsmPrinter::EmitProgramInfoSI(
1550 const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo) {
1551 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1552 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1553 unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
1554 MCContext &Ctx = MF.getContext();
1555
1556 // (((Value) & Mask) << Shift)
1557 auto SetBits = [&Ctx](const MCExpr *Value, uint32_t Mask, uint32_t Shift) {
1558 const MCExpr *msk = MCConstantExpr::create(Mask, Ctx);
1559 const MCExpr *shft = MCConstantExpr::create(Shift, Ctx);
1561 shft, Ctx);
1562 };
1563
 // Emits Value as a Size-byte integer when it folds to a constant, otherwise
 // as an expression.
1564 auto EmitResolvedOrExpr = [this](const MCExpr *Value, unsigned Size) {
1565 int64_t Val;
1566 if (Value->evaluateAsAbsolute(Val))
1567 OutStreamer->emitIntValue(static_cast<uint64_t>(Val), Size);
1568 else
1569 OutStreamer->emitValue(Value, Size);
1570 };
1571
1572 if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
1574
1575 EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx),
1576 /*Size=*/4);
1577
1579 EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc2(STM, Ctx),
1580 /*Size=*/4);
1581
1583
1584 // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
1585 // appropriate generation.
1586 if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
1587 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1588 /*Mask=*/0x3FFFF, /*Shift=*/12),
1589 /*Size=*/4);
1590 } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
1591 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1592 /*Mask=*/0x7FFF, /*Shift=*/12),
1593 /*Size=*/4);
1594 } else {
1595 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1596 /*Mask=*/0x1FFF, /*Shift=*/12),
1597 /*Size=*/4);
1598 }
1599
1600 // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
1601 // 0" comment but I don't see a corresponding field in the register spec.
1602 } else {
1603 OutStreamer->emitInt32(RsrcReg);
1604
 // VGPR block count occupies bits 5..0, SGPR block count bits 9..6.
1605 const MCExpr *GPRBlocks = MCBinaryExpr::createOr(
1606 SetBits(CurrentProgramInfo.VGPRBlocks, /*Mask=*/0x3F, /*Shift=*/0),
1607 SetBits(CurrentProgramInfo.SGPRBlocks, /*Mask=*/0x0F, /*Shift=*/6),
1608 MF.getContext());
1609 EmitResolvedOrExpr(GPRBlocks, /*Size=*/4);
1611
1612 // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
1613 // appropriate generation.
1614 if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
1615 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1616 /*Mask=*/0x3FFFF, /*Shift=*/12),
1617 /*Size=*/4);
1618 } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
1619 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1620 /*Mask=*/0x7FFF, /*Shift=*/12),
1621 /*Size=*/4);
1622 } else {
1623 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1624 /*Mask=*/0x1FFF, /*Shift=*/12),
1625 /*Size=*/4);
1626 }
1627 }
1628
1629 if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
 // From GFX11 on the LDS block count is halved (rounded up) before it is
 // encoded as the extra LDS size.
1631 unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1632 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
1633 : CurrentProgramInfo.LDSBlocks;
1634 OutStreamer->emitInt32(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
1636 OutStreamer->emitInt32(MFI->getPSInputEnable());
1638 OutStreamer->emitInt32(MFI->getPSInputAddr());
1639 }
1640
 // Spill counts are always emitted, each after its identifier.
1641 OutStreamer->emitInt32(R_SPILLED_SGPRS);
1642 OutStreamer->emitInt32(MFI->getNumSpilledSGPRs());
1643 OutStreamer->emitInt32(R_SPILLED_VGPRS);
1644 OutStreamer->emitInt32(MFI->getNumSpilledVGPRs());
1645}
1646
1647// Helper function to add common PAL Metadata 3.0+
//
// Records hardware-stage entries for calling convention CC on MD:
//   .ieee_mode         only when ST has FeatureDX10ClampAndIEEEMode
//   .wgp_mode, .mem_ordered, .forward_progress
//   .trap_present, .excp_en   compute calling conventions only
//   .lds_size
// For compute calling conventions with a non-zero DynamicVGPRBlockSize the
// compute register .dynamic_vgpr_en is also set to true.
//
// NOTE(review): the first line of the signature (embedded number 1648, which
// carries the function name and the MD parameter) and the start of the final
// call (1668) are missing from this listing. The name EmitPALMetadataCommon is
// taken from the call sites elsewhere in this file -- confirm against the full
// source.
1649 const SIProgramInfo &CurrentProgramInfo,
1650 CallingConv::ID CC, const GCNSubtarget &ST,
1651 unsigned DynamicVGPRBlockSize) {
1652 if (ST.hasFeature(AMDGPU::FeatureDX10ClampAndIEEEMode))
1653 MD->setHwStage(CC, ".ieee_mode", (bool)CurrentProgramInfo.IEEEMode);
1654
1655 MD->setHwStage(CC, ".wgp_mode", (bool)CurrentProgramInfo.WgpMode);
1656 MD->setHwStage(CC, ".mem_ordered", (bool)CurrentProgramInfo.MemOrdered);
1657 MD->setHwStage(CC, ".forward_progress", (bool)CurrentProgramInfo.FwdProgress);
1658
1659 if (AMDGPU::isCompute(CC)) {
1660 MD->setHwStage(CC, ".trap_present",
1661 (bool)CurrentProgramInfo.TrapHandlerEnable);
1662 MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable);
1663
1664 if (DynamicVGPRBlockSize != 0)
1665 MD->setComputeRegisters(".dynamic_vgpr_en", true);
1666 }
1667
 // LdsSize is scaled by the LDS granularity and by sizeof(uint32_t);
 // presumably this converts a block count into bytes -- TODO confirm.
1669 CC, ".lds_size",
1670 (unsigned)(CurrentProgramInfo.LdsSize * getLdsDwGranularity(ST) *
1671 sizeof(uint32_t)));
1672}
1673
1674// This is the equivalent of EmitProgramInfoSI above, but for when the OS type
1675// is AMDPAL. It stores each compute/SPI register setting and other PAL
1676// metadata items into the PALMD::Metadata, combining with any provided by the
1677// frontend as LLVM metadata. Once all functions are written, the PAL metadata
1678// is then written as a single block in the .note section.
//
// Two layouts are produced depending on MD->getPALMajorVersion(): below 3 the
// values are packed into rsrc1/rsrc2 and the SPI PS input registers, otherwise
// they are stored as named hardware-stage and graphics-register entries.
//
// NOTE(review): several original lines are absent from this listing (embedded
// numbers 1693, 1695, 1719, 1736), so a few statements below appear truncated.
1679void AMDGPUAsmPrinter::EmitPALMetadata(
1680 const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo) {
1681 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1682 auto CC = MF.getFunction().getCallingConv();
1683 auto *MD = getTargetStreamer()->getPALMetadata();
1684 auto &Ctx = MF.getContext();
1685
1686 MD->setEntryPoint(CC, MF.getFunction().getName());
1687 MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx);
1688
1689 // For targets that support dynamic VGPRs, set the number of saved dynamic
1690 // VGPRs (if any) in the PAL metadata.
1691 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1692 if (MFI->isDynamicVGPREnabled() &&
1694 MD->setHwStage(CC, ".dynamic_vgpr_saved_count",
1696
1697 // Only set AGPRs for supported devices
1698 if (STM.hasMAIInsts()) {
1699 MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR);
1700 }
1701
1702 MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx);
1703 if (MD->getPALMajorVersion() < 3) {
1704 MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC, STM, Ctx), Ctx);
1705 if (AMDGPU::isCompute(CC)) {
1706 MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2(STM, Ctx), Ctx);
1707 } else {
 // Non-compute: rsrc2 only carries the scratch-enable bit, set when any
 // scratch blocks are used.
1708 const MCExpr *HasScratchBlocks =
1709 MCBinaryExpr::createGT(CurrentProgramInfo.ScratchBlocks,
1710 MCConstantExpr::create(0, Ctx), Ctx);
1711 auto [Shift, Mask] = getShiftMask(C_00B84C_SCRATCH_EN);
1712 MD->setRsrc2(CC, maskShiftSet(HasScratchBlocks, Mask, Shift, Ctx), Ctx);
1713 }
1714 } else {
1715 MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);
1716 MD->setHwStage(CC, ".scratch_en", msgpack::Type::Boolean,
1717 CurrentProgramInfo.ScratchEnable);
1718 EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM,
1720 }
1721
1722 // ScratchSize is in bytes, 16 aligned.
1723 MD->setScratchSize(
1724 CC,
1725 AMDGPUMCExpr::createAlignTo(CurrentProgramInfo.ScratchSize,
1726 MCConstantExpr::create(16, Ctx), Ctx),
1727 Ctx);
1728
1729 if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
 // From GFX11 on the LDS block count is halved (rounded up) first.
1730 unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1731 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
1732 : CurrentProgramInfo.LDSBlocks;
1733 if (MD->getPALMajorVersion() < 3) {
1734 MD->setRsrc2(
1735 CC,
1737 Ctx);
1738 MD->setSpiPsInputEna(MFI->getPSInputEnable());
1739 MD->setSpiPsInputAddr(MFI->getPSInputAddr());
1740 } else {
1741 // Graphics registers
1742 const unsigned ExtraLdsDwGranularity =
1743 STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 256 : 128;
1744 MD->setGraphicsRegisters(
1745 ".ps_extra_lds_size",
1746 (unsigned)(ExtraLDSSize * ExtraLdsDwGranularity * sizeof(uint32_t)));
1747
1748 // Set PsInputEna and PsInputAddr .spi_ps_input_ena and .spi_ps_input_addr
1749 static StringLiteral const PsInputFields[] = {
1750 ".persp_sample_ena", ".persp_center_ena",
1751 ".persp_centroid_ena", ".persp_pull_model_ena",
1752 ".linear_sample_ena", ".linear_center_ena",
1753 ".linear_centroid_ena", ".line_stipple_tex_ena",
1754 ".pos_x_float_ena", ".pos_y_float_ena",
1755 ".pos_z_float_ena", ".pos_w_float_ena",
1756 ".front_face_ena", ".ancillary_ena",
1757 ".sample_coverage_ena", ".pos_fixed_pt_ena"};
1758 unsigned PSInputEna = MFI->getPSInputEnable();
1759 unsigned PSInputAddr = MFI->getPSInputAddr();
 // Bit Idx of each mask is published under the Idx-th field name.
1760 for (auto [Idx, Field] : enumerate(PsInputFields)) {
1761 MD->setGraphicsRegisters(".spi_ps_input_ena", Field,
1762 (bool)((PSInputEna >> Idx) & 1));
1763 MD->setGraphicsRegisters(".spi_ps_input_addr", Field,
1764 (bool)((PSInputAddr >> Idx) & 1));
1765 }
1766 }
1767 }
1768
1769 // For version 3 and above the wave front size is already set in the metadata
1770 if (MD->getPALMajorVersion() < 3 && STM.isWave32())
1771 MD->setWave32(MF.getFunction().getCallingConv());
1772}
1773
// Records per-function PAL metadata for MF, keyed by the function name: the
// stack size as the function scratch size, then the LDS size and the VGPR and
// SGPR counts taken from CurrentProgramInfo. Register settings are stored
// under the AMDGPU_CS calling convention, as rsrc1/rsrc2 for PAL major
// versions below 3 and through the common PAL 3.0+ helper otherwise.
//
// NOTE(review): two original lines are absent from this listing (embedded
// numbers 1785 and 1790), so the setRsrc1 call and the call in the else branch
// appear truncated.
1774void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
1775 auto *MD = getTargetStreamer()->getPALMetadata();
1776 const MachineFrameInfo &MFI = MF.getFrameInfo();
1777 StringRef FnName = MF.getFunction().getName();
1778 MD->setFunctionScratchSize(FnName, MFI.getStackSize());
1779 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1780 MCContext &Ctx = MF.getContext();
1781
1782 if (MD->getPALMajorVersion() < 3) {
1783 // Set compute registers
1784 MD->setRsrc1(
1786 CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS, ST, Ctx), Ctx);
1787 MD->setRsrc2(CallingConv::AMDGPU_CS,
1788 CurrentProgramInfo.getComputePGMRSrc2(ST, Ctx), Ctx);
1789 } else {
1791 MD, CurrentProgramInfo, CallingConv::AMDGPU_CS, ST,
1792 MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize());
1793 }
1794
1795 // Set optional info
1796 MD->setFunctionLdsSize(FnName, CurrentProgramInfo.LDSSize);
1797 MD->setFunctionNumUsedVgprs(FnName, CurrentProgramInfo.NumVGPRsForWavesPerEU);
1798 MD->setFunctionNumUsedSgprs(FnName, CurrentProgramInfo.NumSGPRsForWavesPerEU);
1799}
1800
1801// This is supposed to be log2(Size)
//
// Translates a private element size of 4, 8 or 16 into the matching
// AMD_ELEMENT_*_BYTES enumerator. Any other size hits llvm_unreachable.
//
// NOTE(review): the signature line (embedded number 1802) is missing from this
// listing. The name getElementByteSizeValue is taken from the call site in
// getAmdKernelCode -- confirm the exact signature against the full source.
1803 switch (Size) {
1804 case 4:
1805 return AMD_ELEMENT_4_BYTES;
1806 case 8:
1807 return AMD_ELEMENT_8_BYTES;
1808 case 16:
1809 return AMD_ELEMENT_16_BYTES;
1810 default:
1811 llvm_unreachable("invalid private_element_size");
1812 }
1813}
1814
// Fills Out for the kernel MF from CurrentProgramInfo, the function info and
// the subtarget. The function must use the AMDGPU_KERNEL or SPIR_KERNEL
// calling convention (asserted). Out is first reset with initDefault and then
// receives the resource words, the dynamic call stack flag, the user SGPR
// enables, the kernarg segment size and alignment, and the register, scratch
// and LDS sizes.
//
// NOTE(review): many original lines are absent from this listing (embedded
// numbers 1828, 1830, 1832, 1836, 1841, 1845, 1848, 1851, 1854, 1857, 1860,
// 1863). The left-hand sides of several assignments and the bodies of all the
// UserSGPRInfo / XNACK checks are therefore not visible below.
1815void AMDGPUAsmPrinter::getAmdKernelCode(AMDGPUMCKernelCodeT &Out,
1816 const SIProgramInfo &CurrentProgramInfo,
1817 const MachineFunction &MF) const {
1818 const Function &F = MF.getFunction();
1819 assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
1820 F.getCallingConv() == CallingConv::SPIR_KERNEL);
1821
1822 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1823 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1824 MCContext &Ctx = MF.getContext();
1825
1826 Out.initDefault(STM, Ctx, /*InitMCExpr=*/false);
1827
1829 CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx);
1831 CurrentProgramInfo.getComputePGMRSrc2(STM, Ctx);
1833
1834 Out.is_dynamic_callstack = CurrentProgramInfo.DynamicCallStack;
1835
1837 getElementByteSizeValue(STM.getMaxPrivateElementSize(true)));
1838
 // One check per user SGPR kind; the statement each one guards is missing
 // from this listing.
1839 const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI->getUserSGPRInfo();
1840 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
1842 }
1843
1844 if (UserSGPRInfo.hasDispatchPtr())
1846
1847 if (UserSGPRInfo.hasQueuePtr())
1849
1850 if (UserSGPRInfo.hasKernargSegmentPtr())
1852
1853 if (UserSGPRInfo.hasDispatchID())
1855
1856 if (UserSGPRInfo.hasFlatScratchInit())
1858
1859 if (UserSGPRInfo.hasPrivateSegmentSize())
1861
1862 if (STM.isXNACKEnabled())
1864
 // getKernArgSegmentSize also reports the maximum argument alignment through
 // MaxKernArgAlign, which is used for kernarg_segment_alignment below.
1865 Align MaxKernArgAlign;
1866 Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
1867 Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
1868 Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
1869 Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
1870 Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
1871
1872 // kernarg_segment_alignment is specified as log of the alignment.
1873 // The minimum alignment is 16.
1874 // FIXME: The metadata treats the minimum as 4?
1875 Out.kernarg_segment_alignment = Log2(std::max(Align(16), MaxKernArgAlign));
1876}
1877
// Prints operand `OpNo` of the inline-asm instruction `MI` to `O`.
//
// The generic AsmPrinter implementation is tried first. If it declines, the
// only modifier accepted here is the single character 'r'; any other modifier
// is rejected. Register and immediate operands are then printed.
//
// Returns false when the operand was printed, and true for an unknown
// modifier or an operand that is neither a register nor an immediate.
//
// NOTE(review): this text is a numbered listing with gaps. Listing line 1878
// (the first line of the signature, carrying the function name and the `MI`
// and `OpNo` parameters) is absent, as are listing lines 1899 and 1905.
1879 const char *ExtraCode, raw_ostream &O) {
1880 // First try the generic code, which knows about modifiers like 'c' and 'n'.
1881 if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O))
1882 return false;
1883
1884 if (ExtraCode && ExtraCode[0]) {
1885 if (ExtraCode[1] != 0)
1886 return true; // Unknown modifier.
1887
1888 switch (ExtraCode[0]) {
1889 case 'r':
1890 break;
1891 default:
1892 return true;
1893 }
1894 }
1895
1896 // TODO: Should be able to support other operand types like globals.
1897 const MachineOperand &MO = MI->getOperand(OpNo);
1898 if (MO.isReg()) {
// NOTE(review): the callee and leading arguments of this call are on absent
// listing line 1899; only the trailing register-info argument is visible.
1900 *MF->getSubtarget().getRegisterInfo());
1901 return false;
1902 }
1903 if (MO.isImm()) {
1904 int64_t Val = MO.getImm();
// NOTE(review): the condition guarding the plain `O << Val` form is on absent
// listing line 1905. The visible branches print values that fit in 16 or 32
// unsigned bits as hex of that width, and everything else as 64-bit hex.
1906 O << Val;
1907 } else if (isUInt<16>(Val)) {
1908 O << format("0x%" PRIx16, static_cast<uint16_t>(Val));
1909 } else if (isUInt<32>(Val)) {
1910 O << format("0x%" PRIx32, static_cast<uint32_t>(Val));
1911 } else {
1912 O << format("0x%" PRIx64, static_cast<uint64_t>(Val));
1913 }
1914 return false;
1915 }
1916 return true;
1917}
1918
1926
// Emits the "kernel-resource-usage" analysis remarks for `MF`, one remark per
// resource, through the optimization remark emitter `ORE`.
//
// \param MF                    Function the remarks are attached to.
// \param CurrentProgramInfo    Source of the register counts, scratch size,
//                              dynamic-stack expression, occupancy, spill
//                              counts and LDS size that are reported.
// \param isModuleEntryFunction When true, the LDS size remark is also emitted.
// \param hasMAIInsts           When true, the AGPR count remark is also emitted.
//
// Nothing is emitted when `ORE` is null or when the remark is not enabled in
// the diagnostic handler.
//
// NOTE(review): this text is a numbered listing with gaps. Listing lines 1937,
// 1942 and 1956 are absent.
1927void AMDGPUAsmPrinter::emitResourceUsageRemarks(
1928 const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo,
1929 bool isModuleEntryFunction, bool hasMAIInsts) {
1930 if (!ORE)
1931 return;
1932
1933 const char *Name = "kernel-resource-usage";
1934 const char *Indent = " ";
1935
1936 // If the remark is not specifically enabled, do not output to yaml
// NOTE(review): the declaration of `Ctx` is on absent listing line 1937.
1938 if (!Ctx.getDiagHandlerPtr()->isAnalysisRemarkEnabled(Name))
1939 return;
1940
1941 // Currently non-kernel functions have no resources to emit.
// NOTE(review): the condition guarding this return is on absent listing line
// 1942; per the comment above it presumably tests for a non-kernel function.
1943 return;
1944
// Helper that emits one remark of the form "<label>: <value>". It is generic
// over the value type (`auto Argument`), so it accepts both strings and
// numeric fields.
1945 auto EmitResourceUsageRemark = [&](StringRef RemarkName,
1946 StringRef RemarkLabel, auto Argument) {
1947 // Add an indent for every line besides the line with the kernel name. This
1948 // makes it easier to tell which resource usage go with which kernel since
1949 // the kernel name will always be displayed first.
1950 std::string LabelStr = RemarkLabel.str() + ": ";
1951 if (RemarkName != "FunctionName")
1952 LabelStr = Indent + LabelStr;
1953
1954 ORE->emit([&]() {
// NOTE(review): one constructor argument (absent listing line 1956) sits
// between `RemarkName` and `&MF.front()`.
1955 return MachineOptimizationRemarkAnalysis(Name, RemarkName,
1957 &MF.front())
1958 << LabelStr << ore::NV(RemarkName, Argument);
1959 });
1960 };
1961
1962 // FIXME: Formatting here is pretty nasty because clang does not accept
1963 // newlines from diagnostics. This forces us to emit multiple diagnostic
1964 // remarks to simulate newlines. If and when clang does accept newlines, this
1965 // formatting should be aggregated into one remark with newlines to avoid
1966 // printing multiple diagnostic location and diag opts.
1967 EmitResourceUsageRemark("FunctionName", "Function Name",
1968 MF.getFunction().getName());
1969 EmitResourceUsageRemark("NumSGPR", "TotalSGPRs",
1970 getMCExprStr(CurrentProgramInfo.NumSGPR));
1971 EmitResourceUsageRemark("NumVGPR", "VGPRs",
1972 getMCExprStr(CurrentProgramInfo.NumArchVGPR));
1973 if (hasMAIInsts) {
1974 EmitResourceUsageRemark("NumAGPR", "AGPRs",
1975 getMCExprStr(CurrentProgramInfo.NumAccVGPR));
1976 }
1977 EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]",
1978 getMCExprStr(CurrentProgramInfo.ScratchSize));
// "True" is reported only when the dynamic-call-stack expression evaluates to
// an absolute, non-zero value; an expression that cannot be evaluated is
// reported as "False".
1979 int64_t DynStack;
1980 bool DynStackEvaluatable =
1981 CurrentProgramInfo.DynamicCallStack->evaluateAsAbsolute(DynStack);
1982 StringRef DynamicStackStr =
1983 DynStackEvaluatable && DynStack ? "True" : "False";
1984 EmitResourceUsageRemark("DynamicStack", "Dynamic Stack", DynamicStackStr);
1985 EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]",
1986 getMCExprStr(CurrentProgramInfo.Occupancy));
1987 EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill",
1988 CurrentProgramInfo.SGPRSpill);
1989 EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill",
1990 CurrentProgramInfo.VGPRSpill);
1991 if (isModuleEntryFunction)
1992 EmitResourceUsageRemark("BytesLDS", "LDS Size [bytes/block]",
1993 CurrentProgramInfo.LDSSize);
1994}
1995
// Pass boilerplate: defines the static `ID` member of AMDGPUAsmPrinter and
// invokes INITIALIZE_PASS with the argument string "amdgpu-asm-printer", the
// display name "AMDGPU Assembly Printer", and both trailing flags (cfg,
// analysis) set to false.
1996char AMDGPUAsmPrinter::ID = 0;
1997
1998INITIALIZE_PASS(AMDGPUAsmPrinter, "amdgpu-asm-printer",
1999 "AMDGPU Assembly Printer", false, false)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD, const SIProgramInfo &CurrentProgramInfo, CallingConv::ID CC, const GCNSubtarget &ST, unsigned DynamicVGPRBlockSize)
const AMDGPUMCExpr * createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs, const MCExpr *NumVGPRs, unsigned DynamicVGPRBlockSize, const GCNSubtarget &STM, MCContext &Ctx)
Mimics GCNSubtarget::computeOccupancy for MCExpr.
static unsigned getRsrcReg(CallingConv::ID CallConv)
LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUAsmPrinter()
static amd_element_byte_size_t getElementByteSizeValue(unsigned Size)
static const MCExpr * setBits(const MCExpr *Dst, const MCExpr *Value, uint32_t Mask, uint32_t Shift, MCContext &Ctx)
Set bits in a kernel descriptor MCExpr field: return ((Dst & ~Mask) | (Value << Shift))
static uint32_t getFPMode(SIModeRegisterDefaults Mode)
static std::string computeTypeId(const FunctionType *FTy, const DataLayout &DL)
static const MCExpr * computeAccumOffset(const MCExpr *NumVGPR, MCContext &Ctx)
static void appendTypeEncoding(std::string &Enc, Type *Ty, const DataLayout &DL, bool IsReturnType)
static AsmPrinter * createAMDGPUAsmPrinterPass(TargetMachine &tm, std::unique_ptr< MCStreamer > &&Streamer)
AMDGPU Assembly printer class.
AMDGPU HSA Metadata Streamer.
AMDHSA kernel descriptor MCExpr struct for use in MC layer.
MC infrastructure to propagate the function level resource usage info.
Analyzes how many registers and other resources are used by functions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
AMDHSA kernel descriptor definitions.
MC layer struct for AMDGPUMCKernelCodeT, provides MCExpr functionality where required.
amd_element_byte_size_t
The values used to define the number of bytes to use for the swizzle element size.
@ AMD_ELEMENT_8_BYTES
@ AMD_ELEMENT_16_BYTES
@ AMD_ELEMENT_4_BYTES
#define AMD_HSA_BITS_SET(dst, mask, val)
@ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID
@ AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE
@ AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR
@ AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR
@ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE
@ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER
@ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR
@ AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED
@ AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT
@ AMD_CODE_PROPERTY_IS_PTR64
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_ABI
Definition Compiler.h:213
#define LLVM_EXTERNAL_VISIBILITY
Definition Compiler.h:132
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
===- MachineOptimizationRemarkEmitter.h - Opt Diagnostics -*- C++ -*-—===//
OptimizedStructLayoutField Field
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition PassSupport.h:56
R600 Assembly printer class.
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
#define R_00B028_SPI_SHADER_PGM_RSRC1_PS
Definition SIDefines.h:1146
#define R_0286E8_SPI_TMPRING_SIZE
Definition SIDefines.h:1288
#define FP_ROUND_MODE_DP(x)
Definition SIDefines.h:1270
#define C_00B84C_SCRATCH_EN
Definition SIDefines.h:1182
#define FP_ROUND_ROUND_TO_NEAREST
Definition SIDefines.h:1262
#define R_0286D0_SPI_PS_INPUT_ADDR
Definition SIDefines.h:1221
#define R_00B860_COMPUTE_TMPRING_SIZE
Definition SIDefines.h:1283
#define R_00B428_SPI_SHADER_PGM_RSRC1_HS
Definition SIDefines.h:1169
#define R_00B328_SPI_SHADER_PGM_RSRC1_ES
Definition SIDefines.h:1168
#define R_00B528_SPI_SHADER_PGM_RSRC1_LS
Definition SIDefines.h:1177
#define R_0286CC_SPI_PS_INPUT_ENA
Definition SIDefines.h:1220
#define R_00B128_SPI_SHADER_PGM_RSRC1_VS
Definition SIDefines.h:1155
#define FP_DENORM_MODE_DP(x)
Definition SIDefines.h:1281
#define R_00B848_COMPUTE_PGM_RSRC1
Definition SIDefines.h:1223
#define R_SPILLED_SGPRS
Definition SIDefines.h:1302
#define FP_ROUND_MODE_SP(x)
Definition SIDefines.h:1269
#define FP_DENORM_MODE_SP(x)
Definition SIDefines.h:1280
#define R_00B228_SPI_SHADER_PGM_RSRC1_GS
Definition SIDefines.h:1160
#define R_SPILLED_VGPRS
Definition SIDefines.h:1303
#define S_00B02C_EXTRA_LDS_SIZE(x)
Definition SIDefines.h:1154
#define R_00B84C_COMPUTE_PGM_RSRC2
Definition SIDefines.h:1179
#define R_00B02C_SPI_SHADER_PGM_RSRC2_PS
Definition SIDefines.h:1153
StringSet - A set-like wrapper for the StringMap.
static const int BlockSize
Definition TarWriter.cpp:33
static cl::opt< unsigned > CacheLineSize("cache-line-size", cl::init(0), cl::Hidden, cl::desc("Use this to override the target cache line size when " "specified by the user."))
void emitFunctionEntryLabel() override
EmitFunctionEntryLabel - Emit the label that is the entrypoint for the function.
const MCSubtargetInfo * getGlobalSTI() const
void emitImplicitDef(const MachineInstr *MI) const override
Targets can override this to customize the output of IMPLICIT_DEF instructions in verbose mode.
std::vector< std::string > DisasmLines
void emitStartOfAsmFile(Module &M) override
This virtual method can be overridden by targets that want to emit something at the start of their fi...
void endFunction(const MachineFunction *MF)
StringRef getPassName() const override
getPassName - Return a nice clean name for a pass.
std::vector< std::string > HexLines
void emitGlobalVariable(const GlobalVariable *GV) override
Emit the specified global variable to the .s file.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - This function should be overridden by passes that need analysis information to do t...
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &O) override
Print the specified operand of MI, an INLINEASM instruction, using the specified assembler variant.
bool runOnMachineFunction(MachineFunction &MF) override
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
bool doFinalization(Module &M) override
doFinalization - Virtual method overridden by subclasses to do any necessary clean up after all passes...
void emitEndOfAsmFile(Module &M) override
This virtual method can be overridden by targets that want to emit something at the end of their file...
AMDGPUAsmPrinter(TargetMachine &TM, std::unique_ptr< MCStreamer > Streamer)
bool doInitialization(Module &M) override
doInitialization - Virtual method overridden by subclasses to do any necessary initialization before ...
void emitFunctionBodyStart() override
Targets can override this to emit stuff before the first basic block in the function.
void emitBasicBlockStart(const MachineBasicBlock &MBB) override
Targets can override this to emit stuff at the start of a basic block.
AMDGPUTargetStreamer * getTargetStreamer() const
static void printRegOperand(MCRegister Reg, raw_ostream &O, const MCRegisterInfo &MRI)
AMDGPU target specific MCExpr operations.
static const AMDGPUMCExpr * createInstPrefSize(const MCExpr *CodeSizeBytes, MCContext &Ctx)
Create an expression for instruction prefetch size computation: min(divideCeil(CodeSizeBytes,...
static const AMDGPUMCExpr * createMax(ArrayRef< const MCExpr * > Args, MCContext &Ctx)
static const AMDGPUMCExpr * createTotalNumVGPR(const MCExpr *NumAGPR, const MCExpr *NumVGPR, MCContext &Ctx)
static const AMDGPUMCExpr * create(VariantKind Kind, ArrayRef< const MCExpr * > Args, MCContext &Ctx)
static const AMDGPUMCExpr * createExtraSGPRs(const MCExpr *VCCUsed, const MCExpr *FlatScrUsed, bool XNACKUsed, MCContext &Ctx)
Allow delayed MCExpr resolve of ExtraSGPRs (in case VCCUsed or FlatScrUsed are unresolvable but neede...
static const AMDGPUMCExpr * createAlignTo(const MCExpr *Value, const MCExpr *Align, MCContext &Ctx)
void setHwStage(unsigned CC, StringRef field, unsigned Val)
void updateHwStageMaximum(unsigned CC, StringRef field, unsigned Val)
void setComputeRegisters(StringRef field, unsigned Val)
std::pair< unsigned, unsigned > getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const
Subtarget's minimum/maximum occupancy, in number of waves per EU, that can be achieved when the only ...
unsigned getAddressableLocalMemorySize() const
Return the maximum number of bytes of LDS that can be allocated to a single workgroup.
unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const
unsigned getWavefrontSize() const
virtual void EmitAmdhsaKernelDescriptor(const MCSubtargetInfo &STI, StringRef KernelName, const AMDGPU::MCKernelDescriptor &KernelDescriptor, const MCExpr *NextVGPR, const MCExpr *NextSGPR, const MCExpr *ReserveVCC, const MCExpr *ReserveFlatScr)
virtual void emitAMDGPUInfo(const AMDGPU::InfoSectionData &Data)
AMDGPUPALMetadata * getPALMetadata()
virtual void EmitDirectiveAMDHSACodeObjectVersion(unsigned COV)
void initializeTargetID(const MCSubtargetInfo &STI)
virtual void EmitMCResourceInfo(const MCSymbol *NumVGPR, const MCSymbol *NumAGPR, const MCSymbol *NumExplicitSGPR, const MCSymbol *NumNamedBarrier, const MCSymbol *PrivateSegmentSize, const MCSymbol *UsesVCC, const MCSymbol *UsesFlatScratch, const MCSymbol *HasDynamicallySizedStack, const MCSymbol *HasRecursion, const MCSymbol *HasIndirectCall)
virtual bool EmitCodeEnd(const MCSubtargetInfo &STI)
virtual void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type)
virtual void EmitAMDKernelCodeT(AMDGPU::AMDGPUMCKernelCodeT &Header)
const std::optional< AMDGPU::IsaInfo::AMDGPUTargetID > & getTargetID() const
virtual void EmitMCResourceMaximums(const MCSymbol *MaxVGPR, const MCSymbol *MaxAGPR, const MCSymbol *MaxSGPR, const MCSymbol *MaxNamedBarrier)
void setXnackSetting(TargetIDSetting NewXnackSetting)
Sets xnack setting to NewXnackSetting.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
Collects and handles AsmPrinter objects required to build debug or EH information.
This class is intended to be used as a driving class for all asm writers.
Definition AsmPrinter.h:91
const TargetLoweringObjectFile & getObjFileLowering() const
Return information about object file lowering.
MCSymbol * getSymbol(const GlobalValue *GV) const
virtual void emitGlobalVariable(const GlobalVariable *GV)
Emit the specified global variable to the .s file.
TargetMachine & TM
Target machine description.
Definition AsmPrinter.h:94
MachineFunction * MF
The current machine function.
Definition AsmPrinter.h:109
virtual void SetupMachineFunction(MachineFunction &MF)
This should be called when a new MachineFunction is being processed from runOnMachineFunction.
void emitFunctionBody()
This method emits the body and trailer for a function.
virtual bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const
Return true if the basic block has exactly one predecessor and the control transfer mechanism between...
bool doInitialization(Module &M) override
Set up the AsmPrinter when we are working on a new module.
virtual void emitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const
This emits linkage information about GVSym based on GV, if this is supported by the target.
void getAnalysisUsage(AnalysisUsage &AU) const override
Record analysis usage.
unsigned getFunctionNumber() const
Return a unique ID for the current function.
MachineOptimizationRemarkEmitter * ORE
Optimization remark emitter.
Definition AsmPrinter.h:121
AsmPrinter(TargetMachine &TM, std::unique_ptr< MCStreamer > Streamer, char &ID=AsmPrinter::ID)
MCSymbol * CurrentFnSym
The symbol for the current function.
Definition AsmPrinter.h:128
MachineModuleInfo * MMI
This is a pointer to the current MachineModuleInfo.
Definition AsmPrinter.h:112
MCContext & OutContext
This is the context for the output file that we are streaming.
Definition AsmPrinter.h:101
bool doFinalization(Module &M) override
Shut down the asmprinter.
virtual void emitBasicBlockStart(const MachineBasicBlock &MBB)
Targets can override this to emit stuff at the start of a basic block.
void emitVisibility(MCSymbol *Sym, unsigned Visibility, bool IsDefinition=true) const
This emits visibility information about symbol, if this is supported by the target.
std::unique_ptr< MCStreamer > OutStreamer
This is the MCStreamer object for the file we are generating.
Definition AsmPrinter.h:106
const MCAsmInfo & MAI
Target Asm Printer information.
Definition AsmPrinter.h:97
bool isVerbose() const
Return true if assembly output should contain comments.
Definition AsmPrinter.h:310
MCSymbol * getFunctionEnd() const
Definition AsmPrinter.h:320
void getNameWithPrefix(SmallVectorImpl< char > &Name, const GlobalValue *GV) const
virtual void emitFunctionEntryLabel()
EmitFunctionEntryLabel - Emit the label that is the entrypoint for the function.
void addAsmPrinterHandler(std::unique_ptr< AsmPrinterHandler > Handler)
virtual bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &OS)
Print the specified operand of MI, an INLINEASM instruction, using the specified assembler variant.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
bool empty() const
Definition DenseMap.h:173
DISubprogram * getSubprogram() const
Get the attached subprogram.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:272
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
unsigned getMinNumSGPRs(unsigned WavesPerEU) const
unsigned getMinNumVGPRs(unsigned WavesPerEU, unsigned DynamicVGPRBlockSize) const
bool isTgSplitEnabled() const
bool hasInstPrefSize() const
bool isCuModeEnabled() const
const AMDGPU::IsaInfo::AMDGPUTargetID & getTargetID() const
std::pair< unsigned, unsigned > computeOccupancy(const Function &F, unsigned LDSSize=0, unsigned NumSGPRs=0, unsigned NumVGPRs=0) const
Subtarget's minimum/maximum occupancy, in number of waves per EU, that can be achieved when the only ...
bool isWave32() const
bool supportsWGP() const
void getInstPrefSizeArgs(uint32_t &Mask, uint32_t &Shift, uint32_t &Width, uint32_t &CacheLineSize) const
unsigned getMaxNumUserSGPRs() const
Generation getGeneration() const
unsigned getAddressableNumSGPRs() const
unsigned getMaxWaveScratchSize() const
bool hasPrivateSegmentBuffer() const
VisibilityTypes getVisibility() const
LLVM_ABI bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
Definition Globals.cpp:337
unsigned getAddressSpace() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Definition Globals.cpp:141
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
bool hasInitializer() const
Definitions have initializers, declarations don't.
MaybeAlign getAlign() const
Returns the alignment of the given variable.
LLVM_ABI uint64_t getGlobalSize(const DataLayout &DL) const
Get the size of this global variable in bytes.
Definition Globals.cpp:569
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
MCCodeEmitter * getEmitterPtr() const
static const MCBinaryExpr * createAdd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition MCExpr.h:343
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:348
static const MCBinaryExpr * createOr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:408
static const MCBinaryExpr * createLOr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:378
static const MCBinaryExpr * createMul(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:398
static const MCBinaryExpr * createGT(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:363
static const MCBinaryExpr * createDiv(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:353
static const MCBinaryExpr * createShl(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:413
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:428
static LLVM_ABI const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition MCExpr.cpp:212
Context object for machine code objects.
Definition MCContext.h:83
const MCObjectFileInfo * getObjectFileInfo() const
Definition MCContext.h:413
LLVM_ABI void reportError(SMLoc L, const Twine &Msg)
LLVM_ABI MCSymbol * getOrCreateSymbol(const Twine &Name)
Lookup the symbol inside with the specified Name.
Base class for the full range of assembler expressions which are needed for parsing.
Definition MCExpr.h:34
LLVM_ABI bool evaluateAsRelocatable(MCValue &Res, const MCAssembler *Asm) const
Try to evaluate the expression to a relocatable value, i.e.
Definition MCExpr.cpp:450
MCSection * getReadOnlySection() const
MCSection * getTextSection() const
MCContext & getContext() const
This represents a section on linux, lots of unix variants and some bare metal systems.
Instances of this class represent a uniqued identifier for a section in the current translation unit.
Definition MCSection.h:573
void ensureMinAlignment(Align MinAlignment)
Makes sure that Alignment is at least MinAlignment.
Definition MCSection.h:661
bool hasInstructions() const
Definition MCSection.h:669
MCContext & getContext() const
Definition MCStreamer.h:323
Generic base class for all target subtargets.
const Triple & getTargetTriple() const
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition MCExpr.h:214
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition MCSymbol.h:42
bool isDefined() const
isDefined - Check if this symbol is defined (i.e., it has an address).
Definition MCSymbol.h:233
StringRef getName() const
getName - Get the symbol name.
Definition MCSymbol.h:188
bool isVariable() const
isVariable - Check if this is a variable symbol.
Definition MCSymbol.h:267
void redefineIfPossible()
Prepare this symbol to be redefined.
Definition MCSymbol.h:212
const MCExpr * getVariableValue() const
Get the expression of the variable symbol.
Definition MCSymbol.h:270
MCStreamer & getStreamer()
Definition MCStreamer.h:103
static const MCUnaryExpr * createNot(const MCExpr *Expr, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition MCExpr.h:273
uint64_t getStackSize() const
Return the number of bytes that must be allocated to hold all of the fixed size frame objects.
MCContext & getContext() const
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineBasicBlock & front() const
Representation of each machine instruction.
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
Register getReg() const
getReg - Returns the register number.
Diagnostic information for optimization analysis remarks.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Emit an optimization remark.
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
LLVM_ABI unsigned getNumOperands() const
iterator_range< op_iterator > operands()
Definition Metadata.h:1851
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
Wrapper class representing virtual and physical registers.
Definition Register.h:20
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
GCNUserSGPRUsageInfo & getUserSGPRInfo()
SIModeRegisterDefaults getMode() const
unsigned getScratchReservedForDynamicVGPRs() const
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
void push_back(const T &Elt)
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
std::pair< typename Base::iterator, bool > insert(StringRef key)
Definition StringSet.h:39
Primary interface to the complete machine description for the target machine.
const Triple & getTargetTriple() const
OSType getOS() const
Get the parsed operating system type of this triple.
Definition Triple.h:445
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
LLVM Value Representation.
Definition Value.h:75
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
A raw_ostream that writes to an SmallVector or SmallString.
StringRef str() const
Return a StringRef for the vector contents.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ LOCAL_ADDRESS
Address space for local memory.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
unsigned getVGPREncodingGranule(const MCSubtargetInfo &STI, std::optional< bool > EnableWavefrontSize32)
static constexpr unsigned MaxDynamicVGPRBlocks
Maximum number of VGPR blocks that can be allocated in dynamic VGPR mode.
unsigned getSGPREncodingGranule(const MCSubtargetInfo &STI)
unsigned getTotalNumVGPRs(const MCSubtargetInfo &STI)
unsigned getMaxWavesPerEU(const MCSubtargetInfo &STI)
unsigned getNumExtraSGPRs(const MCSubtargetInfo &STI, bool VCCUsed, bool FlatScrUsed, bool XNACKUsed)
unsigned getVGPRAllocGranule(const MCSubtargetInfo &STI, unsigned DynamicVGPRBlockSize, std::optional< bool > EnableWavefrontSize32)
int32_t getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR, int32_t ArgNumVGPR)
void printAMDGPUMCExpr(const MCExpr *Expr, raw_ostream &OS, const MCAsmInfo *MAI)
LLVM_READNONE constexpr bool isModuleEntryFunctionCC(CallingConv::ID CC)
unsigned getLdsDwGranularity(const MCSubtargetInfo &ST)
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
const MCExpr * maskShiftSet(const MCExpr *Val, uint32_t Mask, uint32_t Shift, MCContext &Ctx)
Provided with the MCExpr * Val, uint32 Mask and Shift, will return the masked and left shifted,...
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isGFX90A(const MCSubtargetInfo &STI)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool hasMAIInsts(const MCSubtargetInfo &STI)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
const MCExpr * foldAMDGPUMCExpr(const MCExpr *Expr, MCContext &Ctx)
bool isGFX10Plus(const MCSubtargetInfo &STI)
constexpr std::pair< unsigned, unsigned > getShiftMask(unsigned Value)
Deduce the least significant bit aligned shift and mask values for a binary Complement Value (as they...
unsigned hasKernargPreload(const MCSubtargetInfo &STI)
std::pair< unsigned, unsigned > getIntegerPairAttribute(const Function &F, StringRef Name, std::pair< unsigned, unsigned > Default, bool OnlyFirstRequired)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
@ SHT_PROGBITS
Definition ELF.h:1150
@ STT_AMDGPU_HSA_KERNEL
Definition ELF.h:1433
std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > extract(Y &&MD)
Extract a Value from Metadata.
Definition Metadata.h:668
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< FuncNode * > Func
Definition RDFGraph.h:393
This is an optimization pass for GlobalISel generic memory operations.
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1668
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
static StringRef getCPU(StringRef CPU)
Processes a CPU name.
Target & getTheR600Target()
The target for R600 GPUs.
@ DK_ResourceLimit
AsmPrinter * createR600AsmPrinterPass(TargetMachine &TM, std::unique_ptr< MCStreamer > &&Streamer)
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition Format.h:129
@ Success
The lock was released successfully.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:221
Target & getTheGCNTarget()
The target for GCN GPUs.
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1916
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition Alignment.h:197
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
Definition Error.cpp:177
Implement std::hash so that hash_code can be used in STL containers.
Definition BitVector.h:861
#define N
AMDGPUResourceUsageAnalysisImpl::SIFunctionResourceInfo FunctionResourceInfo
void initDefault(const MCSubtargetInfo &STI, MCContext &Ctx, bool InitMCExpr=true)
void validate(const MCSubtargetInfo *STI, MCContext &Ctx)
static const MCExpr * bits_get(const MCExpr *Src, uint32_t Shift, uint32_t Mask, MCContext &Ctx)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Track resource usage for kernels / entry functions.
const MCExpr * NumSGPR
const MCExpr * NumArchVGPR
const MCExpr * VGPRBlocks
const MCExpr * ScratchBlocks
const MCExpr * ComputePGMRSrc3
const MCExpr * getComputePGMRSrc1(const GCNSubtarget &ST, MCContext &Ctx) const
Compute the value of the ComputePGMRsrc1 register.
const MCExpr * VCCUsed
const MCExpr * FlatUsed
const MCExpr * NamedBarCnt
const MCExpr * ScratchEnable
const MCExpr * AccumOffset
const MCExpr * NumAccVGPR
const MCExpr * DynamicCallStack
const MCExpr * SGPRBlocks
const MCExpr * NumVGPRsForWavesPerEU
const MCExpr * NumVGPR
const MCExpr * Occupancy
const MCExpr * ScratchSize
const MCExpr * NumSGPRsForWavesPerEU
const MCExpr * getComputePGMRSrc2(const GCNSubtarget &ST, MCContext &Ctx) const
Compute the value of the ComputePGMRsrc2 register.
static void RegisterAsmPrinter(Target &T, Target::AsmPrinterCtorTy Fn)
RegisterAsmPrinter - Register an AsmPrinter implementation for the given target.