LLVM 23.0.0git
SIMachineFunctionInfo.cpp
Go to the documentation of this file.
1//===- SIMachineFunctionInfo.cpp - SI Machine Function Info ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
10#include "AMDGPUSubtarget.h"
11#include "GCNSubtarget.h"
13#include "SIRegisterInfo.h"
21#include "llvm/IR/CallingConv.h"
23#include "llvm/IR/Function.h"
24#include <cassert>
25#include <optional>
26#include <vector>
27
// NOTE(review): presumably the wave64 lane count, used to bound per-lane
// spill tables -- confirm against SIMachineFunctionInfo.h usage.
28enum { MAX_LANES = 64 };
29
30using namespace llvm;
31
32// TODO -- delete this flag once we have more robust mechanisms to allocate the
33// optimal RC for Opc and Dest of MFMA. In particular, there are high RP cases
34// where it is better to produce the VGPR form (e.g. if there are VGPR users
35// of the MFMA result).
// NOTE(review): the cl::opt declaration below is truncated in this extract;
// per the cross-reference text it is
//   static cl::opt<bool, true> MFMAVGPRFormOpt("amdgpu-mfma-vgpr-form", ...,
//       cl::location(SIMachineFunctionInfo::MFMAVGPRForm), cl::init(true),
//       cl::Hidden)
// -- confirm against the upstream source before relying on it.
37 "amdgpu-mfma-vgpr-form",
38 cl::desc("Whether to force use VGPR for Opc and Dest of MFMA. If "
39 "unspecified, default to compiler heuristics"),
42
// Recover the GCNTargetMachine owning this subtarget by going through the
// subtarget's SITargetLowering object.
// NOTE(review): the opening signature line is missing from this extract;
// the cross-reference text gives it as
//   const GCNTargetMachine &getTM(const GCNSubtarget *STI)
44 const SITargetLowering *TLI = STI->getTargetLowering();
45 return static_cast<const GCNTargetMachine &>(TLI->getTargetMachine());
46}
47
49
// SIMachineFunctionInfo constructor: derive per-function state (work-group /
// work-item ID usage, stack/frame registers, scratch resource register,
// occupancy, dynamic-VGPR configuration, GIT pointer bits) from the IR
// function's calling convention, string attributes, and subtarget features.
// NOTE(review): this listing is a garbled extract -- the first signature line
// and several interior lines (e.g. line 59: the Gfx_WholeWave RHS of the
// comparison; line 78: the second kernel-CC operand; lines 98-99, 116, 120,
// 125, 167) are missing. The code text below is kept byte-identical to the
// extract; reconcile with the upstream file before editing.
51 const GCNSubtarget *STI)
52 : AMDGPUMachineFunction(F, *STI), Mode(F, *STI), GWSResourcePSV(getTM(STI)),
53 UserSGPRInfo(F, *STI), WorkGroupIDX(false), WorkGroupIDY(false),
54 WorkGroupIDZ(false), WorkGroupInfo(false), LDSKernelId(false),
55 PrivateSegmentWaveByteOffset(false), WorkItemIDX(false),
56 WorkItemIDY(false), WorkItemIDZ(false), ImplicitArgPtr(false),
57 GITPtrHigh(0xffffffff), HighBitsOf32BitAddress(0),
58 IsWholeWaveFunction(F.getCallingConv() ==
// NOTE(review): RHS of the comparison above is missing from the extract.
60 const GCNSubtarget &ST = *STI;
61 FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
62 WavesPerEU = ST.getWavesPerEU(F);
63 MaxNumWorkGroups = ST.getMaxNumWorkGroups(F);
64 assert(MaxNumWorkGroups.size() == 3);
65
66 // Temporarily check both the attribute and the subtarget feature, until the
67 // latter is completely removed.
68 DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
69 if (DynamicVGPRBlockSize == 0 && ST.isDynamicVGPREnabled())
70 DynamicVGPRBlockSize = ST.getDynamicVGPRBlockSize();
71
72 Occupancy = ST.computeOccupancy(F, getLDSSize()).second;
73 CallingConv::ID CC = F.getCallingConv();
74
75 VRegFlags.reserve(1024);
76
77 const bool IsKernel = CC == CallingConv::AMDGPU_KERNEL ||
79
80 if (IsKernel) {
81 WorkGroupIDX = true;
82 WorkItemIDX = true;
83 } else if (CC == CallingConv::AMDGPU_PS) {
84 PSInputAddr = AMDGPU::getInitialPSInputAddr(F);
85 }
86
87 if (ST.hasGFX90AInsts()) {
88 // FIXME: Extract logic out of getMaxNumVectorRegs; we need to apply the
89 // allocation granule and clamping.
90 auto [MinNumAGPRAttr, MaxNumAGPRAttr] =
91 AMDGPU::getIntegerPairAttribute(F, "amdgpu-agpr-alloc", {~0u, ~0u},
92 /*OnlyFirstRequired=*/true);
93 MinNumAGPRs = MinNumAGPRAttr;
94 }
95
96 if (!isEntryFunction()) {
97 if (CC != CallingConv::AMDGPU_Gfx &&
100
101 FrameOffsetReg = AMDGPU::SGPR33;
102 StackPtrOffsetReg = AMDGPU::SGPR32;
103
104 if (!ST.hasFlatScratchEnabled()) {
105 // Non-entry functions have no special inputs for now, other registers
106 // required for scratch access.
107 ScratchRSrcReg = AMDGPU::isChainCC(CC)
108 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
109 : ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
110
111 ArgInfo.PrivateSegmentBuffer =
112 ArgDescriptor::createRegister(ScratchRSrcReg);
113 }
114
115 if (!F.hasFnAttribute("amdgpu-no-implicitarg-ptr") &&
117 ImplicitArgPtr = true;
118 } else {
119 ImplicitArgPtr = false;
121 std::max(ST.getAlignmentForImplicitArgPtr(), MaxKernArgAlign);
122 }
123
124 if (!AMDGPU::isGraphics(CC) ||
126 ST.hasArchitectedSGPRs())) {
127 if (IsKernel || !F.hasFnAttribute("amdgpu-no-workgroup-id-x") ||
128 !F.hasFnAttribute("amdgpu-no-cluster-id-x"))
129 WorkGroupIDX = true;
130
131 if (!F.hasFnAttribute("amdgpu-no-workgroup-id-y") ||
132 !F.hasFnAttribute("amdgpu-no-cluster-id-y"))
133 WorkGroupIDY = true;
134
135 if (!F.hasFnAttribute("amdgpu-no-workgroup-id-z") ||
136 !F.hasFnAttribute("amdgpu-no-cluster-id-z"))
137 WorkGroupIDZ = true;
138 }
139
140 if (!AMDGPU::isGraphics(CC)) {
141 if (IsKernel || !F.hasFnAttribute("amdgpu-no-workitem-id-x"))
142 WorkItemIDX = true;
143
144 if (!F.hasFnAttribute("amdgpu-no-workitem-id-y") &&
145 ST.getMaxWorkitemID(F, 1) != 0)
146 WorkItemIDY = true;
147
148 if (!F.hasFnAttribute("amdgpu-no-workitem-id-z") &&
149 ST.getMaxWorkitemID(F, 2) != 0)
150 WorkItemIDZ = true;
151
152 if (!IsKernel && !F.hasFnAttribute("amdgpu-no-lds-kernel-id"))
153 LDSKernelId = true;
154 }
155
156 if (isEntryFunction()) {
157 // X, XY, and XYZ are the only supported combinations, so make sure Y is
158 // enabled if Z is.
159 if (WorkItemIDZ)
160 WorkItemIDY = true;
161
162 if (!ST.hasArchitectedFlatScratch()) {
163 PrivateSegmentWaveByteOffset = true;
164
165 // HS and GS always have the scratch wave offset in SGPR5 on GFX9.
166 if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 &&
168 ArgInfo.PrivateSegmentWaveByteOffset =
169 ArgDescriptor::createRegister(AMDGPU::SGPR5);
170 }
171 }
172
173 Attribute A = F.getFnAttribute("amdgpu-git-ptr-high");
174 StringRef S = A.getValueAsString();
175 if (!S.empty())
176 S.consumeInteger(0, GITPtrHigh);
177
178 A = F.getFnAttribute("amdgpu-32bit-address-high-bits");
179 S = A.getValueAsString();
180 if (!S.empty())
181 S.consumeInteger(0, HighBitsOf32BitAddress);
182
183 MaxMemoryClusterDWords = F.getFnAttributeAsParsedInteger(
184 "amdgpu-max-memory-cluster-dwords", DefaultMemoryClusterDWordsLimit);
185
186 // On GFX908, in order to guarantee copying between AGPRs, we need a scratch
187 // VGPR available at all times. For now, reserve highest available VGPR. After
188 // RA, shift it to the lowest available unused VGPR if the one exist.
189 if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
190 VGPRForAGPRCopy =
191 AMDGPU::VGPR_32RegClass.getRegister(ST.getMaxNumVGPRs(F) - 1);
192 }
193
194 ClusterDims = AMDGPU::ClusterDimsAttr::get(F);
195}
196
203
// void SIMachineFunctionInfo::limitOccupancy(const MachineFunction &MF):
// clamp the cached occupancy to the value allowed by the work-group sizes.
// NOTE(review): the signature line is missing from this extract; recursion
// here is into the limitOccupancy(uint32_t) overload declared in the header.
206 const GCNSubtarget& ST = MF.getSubtarget<GCNSubtarget>();
207 limitOccupancy(ST.getOccupancyWithWorkGroupSizes(MF).second);
208}
209
// Register SIMachineFunctionInfo::addPrivateSegmentBuffer: allocate the next
// aligned 4-SGPR tuple (SGPR_128) for the private segment buffer user
// argument and advance the user-SGPR count by 4.
// NOTE(review): the signature's first line is missing from this extract.
211 const SIRegisterInfo &TRI) {
212 ArgInfo.PrivateSegmentBuffer =
213 ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
214 getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SGPR_128RegClass));
215 NumUserSGPRs += 4;
216 return ArgInfo.PrivateSegmentBuffer.getRegister();
217}
218
// Register SIMachineFunctionInfo::addDispatchPtr(const SIRegisterInfo &TRI):
// allocate an SGPR pair for the dispatch pointer user argument (2 SGPRs).
// NOTE(review): the signature line is missing from this extract.
220 ArgInfo.DispatchPtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
221 getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
222 NumUserSGPRs += 2;
223 return ArgInfo.DispatchPtr.getRegister();
224}
225
// Register SIMachineFunctionInfo::addQueuePtr(const SIRegisterInfo &TRI):
// allocate an SGPR pair for the queue pointer user argument (2 SGPRs).
// NOTE(review): the signature line is missing from this extract.
227 ArgInfo.QueuePtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
228 getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
229 NumUserSGPRs += 2;
230 return ArgInfo.QueuePtr.getRegister();
231}
232
// Register SIMachineFunctionInfo::addKernargSegmentPtr: allocate an SGPR pair
// for the kernarg segment pointer user argument (2 SGPRs).
// NOTE(review): the signature line is missing from this extract.
234 ArgInfo.KernargSegmentPtr
235 = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
236 getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
237 NumUserSGPRs += 2;
238 return ArgInfo.KernargSegmentPtr.getRegister();
239}
240
// Register SIMachineFunctionInfo::addDispatchID(const SIRegisterInfo &TRI):
// allocate an SGPR pair for the dispatch ID user argument (2 SGPRs).
// NOTE(review): the signature line is missing from this extract.
242 ArgInfo.DispatchID = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
243 getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass))
244 NumUserSGPRs += 2;
245 return ArgInfo.DispatchID.getRegister();
246}
247
// Register SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &):
// allocate an SGPR pair for the flat scratch init user argument (2 SGPRs).
// NOTE(review): the signature line is missing from this extract.
249 ArgInfo.FlatScratchInit = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
250 getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
251 NumUserSGPRs += 2;
252 return ArgInfo.FlatScratchInit.getRegister();
253}
254
// Register SIMachineFunctionInfo::addPrivateSegmentSize(const SIRegisterInfo &):
// allocate a single SGPR for the private segment size user argument.
// NOTE(review): the signature line is missing from this extract.
256 ArgInfo.PrivateSegmentSize = ArgDescriptor::createRegister(getNextUserSGPR());
257 NumUserSGPRs += 1;
258 return ArgInfo.PrivateSegmentSize.getRegister();
259}
260
// Register SIMachineFunctionInfo::addImplicitBufferPtr(const SIRegisterInfo &):
// allocate an SGPR pair for the implicit buffer pointer user argument.
// NOTE(review): the signature line is missing from this extract.
262 ArgInfo.ImplicitBufferPtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
263 getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
264 NumUserSGPRs += 2;
265 return ArgInfo.ImplicitBufferPtr.getRegister();
266}
267
// Register SIMachineFunctionInfo::addLDSKernelId(): allocate a single SGPR
// for the LDS kernel ID user argument.
// NOTE(review): the signature line is missing from this extract.
269 ArgInfo.LDSKernelId = ArgDescriptor::createRegister(getNextUserSGPR());
270 NumUserSGPRs += 1;
271 return ArgInfo.LDSKernelId.getRegister();
272}
273
// SmallVectorImpl<MCRegister> *SIMachineFunctionInfo::addPreloadedKernArg:
// reserve user SGPRs into which the hardware preloads kernel argument
// KernArgIdx. Uses an aligned super-register tuple when one matches RC,
// otherwise falls back to a run of individual SGPRs to be merged later.
// Returns the list of registers backing this preloaded argument.
// NOTE(review): the signature's first line is missing from this extract.
275 const SIRegisterInfo &TRI, const TargetRegisterClass *RC,
276 unsigned AllocSizeDWord, int KernArgIdx, int PaddingSGPRs) {
277 auto [It, Inserted] = ArgInfo.PreloadKernArgs.try_emplace(KernArgIdx);
278 assert(Inserted && "Preload kernel argument allocated twice.");
279 NumUserSGPRs += PaddingSGPRs;
280 // If the available register tuples are aligned with the kernarg to be
281 // preloaded use that register, otherwise we need to use a set of SGPRs and
282 // merge them.
283 if (!ArgInfo.FirstKernArgPreloadReg)
284 ArgInfo.FirstKernArgPreloadReg = getNextUserSGPR();
285 Register PreloadReg =
286 TRI.getMatchingSuperReg(getNextUserSGPR(), AMDGPU::sub0, RC);
287 auto &Regs = It->second.Regs;
288 if (PreloadReg &&
289 (RC == &AMDGPU::SReg_32RegClass || RC == &AMDGPU::SReg_64RegClass)) {
290 Regs.push_back(PreloadReg);
291 NumUserSGPRs += AllocSizeDWord;
292 } else {
293 Regs.reserve(AllocSizeDWord);
294 for (unsigned I = 0; I < AllocSizeDWord; ++I) {
295 Regs.push_back(getNextUserSGPR());
296 NumUserSGPRs++;
297 }
298 }
299
300 // Track the actual number of SGPRs that HW will preload to.
301 UserSGPRInfo.allocKernargPreloadSGPRs(AllocSizeDWord + PaddingSGPRs);
302 return &Regs;
303}
304
// void SIMachineFunctionInfo::allocateWWMSpill(MachineFunction &MF,
//     Register VGPR, uint64_t Size, Align Alignment):
// create a spill stack object for a whole-wave-mode VGPR and record it in
// WWMSpills, unless this is an entry function, the register is already
// tracked, or the chain-function exception below applies.
// NOTE(review): the signature's first line and the continuation of the
// isChainFunction() condition (original lines 321-322) are missing from this
// extract.
306 uint64_t Size, Align Alignment) {
307 // Skip if it is an entry function or the register is already added.
308 if (isEntryFunction() || WWMSpills.count(VGPR))
309 return;
310
311 // Skip if this is a function with the amdgpu_cs_chain or
312 // amdgpu_cs_chain_preserve calling convention and this is a scratch register.
313 // We never need to allocate a spill for these because we don't even need to
314 // restore the inactive lanes for them (they're scratchier than the usual
315 // scratch registers). We only need to do this if we have calls to
316 // llvm.amdgcn.cs.chain (otherwise there's no one to save them for, since
317 // chain functions do not return) and the function did not contain a call to
318 // llvm.amdgcn.init.whole.wave (since in that case there are no inactive lanes
319 // when entering the function).
320 if (isChainFunction() &&
323 return;
324
325 WWMSpills.insert(std::make_pair(
326 VGPR, MF.getFrameInfo().CreateSpillStackObject(Size, Alignment)));
327}
328
329// Separate out the callee-saved and scratch registers.
// void SIMachineFunctionInfo::splitWWMSpillRegisters: partition the recorded
// WWM spills into CalleeSavedRegs and ScratchRegs based on membership in the
// target's callee-saved register list.
// NOTE(review): the signature's first line is missing from this extract.
331 MachineFunction &MF,
332 SmallVectorImpl<std::pair<Register, int>> &CalleeSavedRegs,
333 SmallVectorImpl<std::pair<Register, int>> &ScratchRegs) const {
334 const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
335 for (auto &Reg : WWMSpills) {
336 if (isCalleeSavedReg(CSRegs, Reg.first))
337 CalleeSavedRegs.push_back(Reg);
338 else
339 ScratchRegs.push_back(Reg);
340 }
341}
342
// bool SIMachineFunctionInfo::isCalleeSavedReg(const MCPhysReg *CSRegs,
//     MCPhysReg Reg) const:
// linear scan of the null-terminated callee-saved register list for Reg.
// NOTE(review): the signature's first line is missing from this extract.
344 MCPhysReg Reg) const {
345 for (unsigned I = 0; CSRegs[I]; ++I) {
346 if (CSRegs[I] == Reg)
347 return true;
348 }
349
350 return false;
351}
352
// void SIMachineFunctionInfo::shiftWwmVGPRsToLowestRange(MachineFunction &MF,
//     SmallVectorImpl<Register> &WWMVGPRs, BitVector &SavedVGPRs):
// after RA, move each WWM VGPR down to the lowest unused VGPR, updating the
// reserved-register set, SpillPhysVGPRs, the callee-saved bit vector, and
// block live-in lists to match.
// NOTE(review): the signature's first line and the declaration of MRI
// (original line 357, presumably MF.getRegInfo()) are missing from this
// extract.
355 BitVector &SavedVGPRs) {
356 const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
358 for (unsigned I = 0, E = WWMVGPRs.size(); I < E; ++I) {
359 Register Reg = WWMVGPRs[I];
360 Register NewReg =
361 TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
// Stop once no lower register is available; remaining entries can only be
// at or above their current position.
362 if (!NewReg || NewReg >= Reg)
363 break;
364
365 MRI.replaceRegWith(Reg, NewReg);
366
367 // Update various tables with the new VGPR.
368 WWMVGPRs[I] = NewReg;
369 WWMReservedRegs.remove(Reg);
370 WWMReservedRegs.insert(NewReg);
371 MRI.reserveReg(NewReg, TRI);
372
373 // Replace the register in SpillPhysVGPRs. This is needed to look for free
374 // lanes while spilling special SGPRs like FP, BP, etc. during PEI.
375 auto *RegItr = llvm::find(SpillPhysVGPRs, Reg);
376 if (RegItr != SpillPhysVGPRs.end()) {
377 unsigned Idx = std::distance(SpillPhysVGPRs.begin(), RegItr);
378 SpillPhysVGPRs[Idx] = NewReg;
379 }
380
381 // The generic `determineCalleeSaves` might have set the old register if it
382 // is in the CSR range.
383 SavedVGPRs.reset(Reg);
384
385 for (MachineBasicBlock &MBB : MF) {
386 MBB.removeLiveIn(Reg);
387 MBB.sortUniqueLiveIns();
388 }
389
390 Reg = NewReg;
391 }
392}
393
// Map one SGPR spill lane of frame index FI onto a virtual VGPR lane: lane 0
// creates a fresh virtual VGPR; subsequent lanes reuse the most recent one.
// Always succeeds (virtual registers are unlimited).
// NOTE(review): the declaration of MRI (original line 396, presumably
// MF.getRegInfo()) is missing from this extract.
394bool SIMachineFunctionInfo::allocateVirtualVGPRForSGPRSpills(
395 MachineFunction &MF, int FI, unsigned LaneIndex) {
397 Register LaneVGPR;
398 if (!LaneIndex) {
399 LaneVGPR = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
400 SpillVGPRs.push_back(LaneVGPR);
401 } else {
402 LaneVGPR = SpillVGPRs.back();
403 }
404
405 SGPRSpillsToVirtualVGPRLanes[FI].emplace_back(LaneVGPR, LaneIndex);
406 return true;
407}
408
// Map one SGPR spill lane of frame index FI onto a physical VGPR lane.
// Lane 0 grabs an unused physical VGPR (searching from the top outside
// prolog/epilog so low registers stay free for RA), reserves it as a WWM
// register, and makes it live-in everywhere; later lanes reuse it. Returns
// false (and drops FI's partial entry) if no VGPR is available.
// NOTE(review): original line 434 (presumably MBB.sortUniqueLiveIns()) is
// missing from this extract.
409bool SIMachineFunctionInfo::allocatePhysicalVGPRForSGPRSpills(
410 MachineFunction &MF, int FI, unsigned LaneIndex, bool IsPrologEpilog) {
411 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
412 const SIRegisterInfo *TRI = ST.getRegisterInfo();
413 MachineRegisterInfo &MRI = MF.getRegInfo();
414 Register LaneVGPR;
415 if (!LaneIndex) {
416 // Find the highest available register if called before RA to ensure the
417 // lowest registers are available for allocation. The LaneVGPR, in that
418 // case, will be shifted back to the lowest range after VGPR allocation.
419 LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF,
420 !IsPrologEpilog);
421 if (LaneVGPR == AMDGPU::NoRegister) {
422 // We have no VGPRs left for spilling SGPRs. Reset because we will not
423 // partially spill the SGPR to VGPRs.
424 SGPRSpillsToPhysicalVGPRLanes.erase(FI);
425 return false;
426 }
427
428 if (IsPrologEpilog)
429 allocateWWMSpill(MF, LaneVGPR);
430
431 reserveWWMRegister(LaneVGPR);
432 for (MachineBasicBlock &MBB : MF) {
433 MBB.addLiveIn(LaneVGPR);
435 }
436 SpillPhysVGPRs.push_back(LaneVGPR);
437 } else {
438 LaneVGPR = SpillPhysVGPRs.back();
439 }
440
441 SGPRSpillsToPhysicalVGPRLanes[FI].emplace_back(LaneVGPR, LaneIndex);
442 return true;
443}
444
// bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(MachineFunction &MF,
//     int FI, bool SpillToPhysVGPRLane, bool IsPrologEpilog):
// allocate one VGPR lane (physical or virtual, per SpillToPhysVGPRLane) for
// each 4-byte piece of SGPR spill slot FI. Returns true if the whole slot
// fits; on failure the partially-consumed lane count is rolled back.
// NOTE(review): the signature's first line is missing from this extract.
446 MachineFunction &MF, int FI, bool SpillToPhysVGPRLane,
447 bool IsPrologEpilog) {
448 std::vector<SIRegisterInfo::SpilledReg> &SpillLanes =
449 SpillToPhysVGPRLane ? SGPRSpillsToPhysicalVGPRLanes[FI]
450 : SGPRSpillsToVirtualVGPRLanes[FI];
451
452 // This has already been allocated.
453 if (!SpillLanes.empty())
454 return true;
455
456 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
457 MachineFrameInfo &FrameInfo = MF.getFrameInfo();
458 unsigned WaveSize = ST.getWavefrontSize();
459
460 unsigned Size = FrameInfo.getObjectSize(FI);
461 unsigned NumLanes = Size / 4;
462
// A single VGPR only has WaveSize lanes; larger slots cannot be handled here.
463 if (NumLanes > WaveSize)
464 return false;
465
466 assert(Size >= 4 && "invalid sgpr spill size");
467 assert(ST.getRegisterInfo()->spillSGPRToVGPR() &&
468 "not spilling SGPRs to VGPRs");
469
470 unsigned &NumSpillLanes = SpillToPhysVGPRLane ? NumPhysicalVGPRSpillLanes
471 : NumVirtualVGPRSpillLanes;
472
473 for (unsigned I = 0; I < NumLanes; ++I, ++NumSpillLanes) {
474 unsigned LaneIndex = (NumSpillLanes % WaveSize);
475
476 bool Allocated = SpillToPhysVGPRLane
477 ? allocatePhysicalVGPRForSGPRSpills(MF, FI, LaneIndex,
478 IsPrologEpilog)
479 : allocateVirtualVGPRForSGPRSpills(MF, FI, LaneIndex);
480 if (!Allocated) {
// Undo the lanes consumed for this FI so the counter stays consistent.
481 NumSpillLanes -= I;
482 return false;
483 }
484 }
485
486 return true;
487}
488
489/// Reserve AGPRs or VGPRs to support spilling for FrameIndex \p FI.
490/// Either AGPR is spilled to VGPR or vice versa.
491/// Returns true if a \p FI can be eliminated completely.
// NOTE(review): the signature's first line
// (bool SIMachineFunctionInfo::allocateVGPRSpillToAGPR(MachineFunction &MF,)
// and the declaration of MRI (original line 495, presumably MF.getRegInfo())
// are missing from this extract.
493 int FI,
494 bool isAGPRtoVGPR) {
496 MachineFrameInfo &FrameInfo = MF.getFrameInfo();
497 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
498
499 assert(ST.hasMAIInsts() && FrameInfo.isSpillSlotObjectIndex(FI));
500
501 auto &Spill = VGPRToAGPRSpills[FI];
502
503 // This has already been allocated.
504 if (!Spill.Lanes.empty())
505 return Spill.FullyAllocated;
506
507 unsigned Size = FrameInfo.getObjectSize(FI);
508 unsigned NumLanes = Size / 4;
509 Spill.Lanes.resize(NumLanes, AMDGPU::NoRegister);
510
// Spilling AGPRs needs VGPR scratch registers and vice versa.
511 const TargetRegisterClass &RC =
512 isAGPRtoVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::AGPR_32RegClass;
513 auto Regs = RC.getRegisters();
514
515 auto &SpillRegs = isAGPRtoVGPR ? SpillAGPR : SpillVGPR;
516 const SIRegisterInfo *TRI = ST.getRegisterInfo();
517 Spill.FullyAllocated = true;
518
519 // FIXME: Move allocation logic out of MachineFunctionInfo and initialize
520 // once.
521 BitVector OtherUsedRegs;
522 OtherUsedRegs.resize(TRI->getNumRegs());
523
524 const uint32_t *CSRMask =
525 TRI->getCallPreservedMask(MF, MF.getFunction().getCallingConv());
526 if (CSRMask)
527 OtherUsedRegs.setBitsInMask(CSRMask);
528
529 // TODO: Should include register tuples, but doesn't matter with current
530 // usage.
531 for (MCPhysReg Reg : SpillAGPR)
532 OtherUsedRegs.set(Reg);
533 for (MCPhysReg Reg : SpillVGPR)
534 OtherUsedRegs.set(Reg);
535
536 SmallVectorImpl<MCPhysReg>::const_iterator NextSpillReg = Regs.begin();
537 for (int I = NumLanes - 1; I >= 0; --I) {
538 NextSpillReg = std::find_if(
539 NextSpillReg, Regs.end(), [&MRI, &OtherUsedRegs](MCPhysReg Reg) {
540 return MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg) &&
541 !OtherUsedRegs[Reg];
542 });
543
544 if (NextSpillReg == Regs.end()) { // Registers exhausted
545 Spill.FullyAllocated = false;
546 break;
547 }
548
549 OtherUsedRegs.set(*NextSpillReg);
550 SpillRegs.push_back(*NextSpillReg);
551 MRI.reserveReg(*NextSpillReg, TRI);
552 Spill.Lanes[I] = *NextSpillReg++;
553 }
554
555 return Spill.FullyAllocated;
556}
557
// bool SIMachineFunctionInfo::removeDeadFrameIndices(MachineFrameInfo &MFI,
//     bool ResetSGPRSpillStackIDs):
// drop frame indices made dead by SGPR->VGPR / VGPR->AGPR spilling, and
// optionally reset remaining SGPR-spill stack IDs to the default stack.
// Returns true if any SGPR still has to be spilled to memory.
// NOTE(review): the signature's first line and the interior of the stack-ID
// reset loop (original lines 586-588, presumably the getStackID ==
// SGPRSpill check plus setStackID(I, TargetStackID::Default)) are missing
// from this extract.
559 MachineFrameInfo &MFI, bool ResetSGPRSpillStackIDs) {
560 // Remove dead frame indices from function frame, however keep FP & BP since
561 // spills for them haven't been inserted yet. And also make sure to remove the
562 // frame indices from `SGPRSpillsToVirtualVGPRLanes` data structure,
563 // otherwise, it could result in an unexpected side effect and bug, in case of
564 // any re-mapping of freed frame indices by later pass(es) like "stack slot
565 // coloring".
566 for (auto &R : make_early_inc_range(SGPRSpillsToVirtualVGPRLanes)) {
567 MFI.RemoveStackObject(R.first);
568 SGPRSpillsToVirtualVGPRLanes.erase(R.first);
569 }
570
571 // Remove the dead frame indices of CSR SGPRs which are spilled to physical
572 // VGPR lanes during SILowerSGPRSpills pass.
573 if (!ResetSGPRSpillStackIDs) {
574 for (auto &R : make_early_inc_range(SGPRSpillsToPhysicalVGPRLanes)) {
575 MFI.RemoveStackObject(R.first);
576 SGPRSpillsToPhysicalVGPRLanes.erase(R.first);
577 }
578 }
579 bool HaveSGPRToMemory = false;
580
581 if (ResetSGPRSpillStackIDs) {
582 // All other SGPRs must be allocated on the default stack, so reset the
583 // stack ID.
584 for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); I != E;
585 ++I) {
589 HaveSGPRToMemory = true;
590 }
591 }
592 }
593 }
594
595 for (auto &R : VGPRToAGPRSpills) {
596 if (R.second.IsDead)
597 MFI.RemoveStackObject(R.first);
598 }
599
600 return HaveSGPRToMemory;
601}
602
// int SIMachineFunctionInfo::getScavengeFI(MachineFrameInfo &MFI,
//     const SIRegisterInfo &TRI):
// lazily create (and then cache in ScavengeFI) a stack object sized/aligned
// for one SGPR, for use by the register scavenger.
// NOTE(review): the signature's first line is missing from this extract.
604 const SIRegisterInfo &TRI) {
605 if (ScavengeFI)
606 return *ScavengeFI;
607
608 ScavengeFI =
609 MFI.CreateStackObject(TRI.getSpillSize(AMDGPU::SGPR_32RegClass),
610 TRI.getSpillAlign(AMDGPU::SGPR_32RegClass), false);
611 return *ScavengeFI;
612}
613
614MCPhysReg SIMachineFunctionInfo::getNextUserSGPR() const {
615 assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs");
616 return AMDGPU::SGPR0 + NumUserSGPRs;
617}
618
619MCPhysReg SIMachineFunctionInfo::getNextSystemSGPR() const {
620 return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs;
621}
622
623void SIMachineFunctionInfo::MRI_NoteNewVirtualRegister(Register Reg) {
624 VRegFlags.grow(Reg);
625}
626
627void SIMachineFunctionInfo::MRI_NoteCloneVirtualRegister(Register NewReg,
628 Register SrcReg) {
629 VRegFlags.grow(NewReg);
630 VRegFlags[NewReg] = VRegFlags[SrcReg];
631}
632
// Register SIMachineFunctionInfo::getGITPtrLoReg(const MachineFunction &MF):
// return the SGPR carrying the low half of the global information table
// address on amdpal (s0, or s8 for merged HS/GS shaders on gfx9+), or an
// invalid Register elsewhere.
// NOTE(review): the signature line and the switch's case labels (original
// lines 641-642, presumably AMDGPU_HS / AMDGPU_GS) are missing from this
// extract.
635 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
636 if (!ST.isAmdPalOS())
637 return Register();
638 Register GitPtrLo = AMDGPU::SGPR0; // Low GIT address passed in
639 if (ST.hasMergedShaders()) {
640 switch (MF.getFunction().getCallingConv()) {
643 // Low GIT address is passed in s8 rather than s0 for an LS+HS or
644 // ES+GS merged shader on gfx9+.
645 GitPtrLo = AMDGPU::SGPR8;
646 return GitPtrLo;
647 default:
648 return GitPtrLo;
649 }
650 }
651 return GitPtrLo;
652}
653
// static yaml::StringValue regToString(Register Reg,
//     const TargetRegisterInfo &TRI):
// render a register to its printable name for MIR YAML serialization.
// NOTE(review): the signature's first line and the declaration of Dest
// (original line 656, presumably yaml::StringValue Dest;) are missing from
// this extract.
655 const TargetRegisterInfo &TRI) {
657 {
658 raw_string_ostream OS(Dest.Value);
659 OS << printReg(Reg, &TRI);
660 }
661 return Dest;
662}
663
// Convert the in-memory AMDGPUFunctionArgInfo into its YAML (MIR) form.
// Returns std::nullopt when no argument descriptor is set, so the field is
// omitted from the serialized output entirely.
// NOTE(review): this extract is missing the signature's second line, the
// declarations of AI (yaml::SIArgumentInfo) and SA/OS inside the lambda, and
// a few statements around FirstKernArgPreloadReg serialization -- reconcile
// with the upstream file before editing.
664static std::optional<yaml::SIArgumentInfo>
666 const TargetRegisterInfo &TRI) {
668
// Helper: copy one ArgDescriptor into its optional YAML slot; returns true
// if the descriptor was present.
669 auto convertArg = [&](std::optional<yaml::SIArgument> &A,
670 const ArgDescriptor &Arg) {
671 if (!Arg)
672 return false;
673
674 // Create a register or stack argument.
676 if (Arg.isRegister()) {
678 OS << printReg(Arg.getRegister(), &TRI);
679 } else
680 SA.StackOffset = Arg.getStackOffset();
681 // Check and update the optional mask.
682 if (Arg.isMasked())
683 SA.Mask = Arg.getMask();
684
685 A = std::move(SA);
686 return true;
687 };
688
689 bool Any = false;
690 Any |= convertArg(AI.PrivateSegmentBuffer, ArgInfo.PrivateSegmentBuffer);
691 Any |= convertArg(AI.DispatchPtr, ArgInfo.DispatchPtr);
692 Any |= convertArg(AI.QueuePtr, ArgInfo.QueuePtr);
693 Any |= convertArg(AI.KernargSegmentPtr, ArgInfo.KernargSegmentPtr);
694 Any |= convertArg(AI.DispatchID, ArgInfo.DispatchID);
695 Any |= convertArg(AI.FlatScratchInit, ArgInfo.FlatScratchInit);
696 Any |= convertArg(AI.LDSKernelId, ArgInfo.LDSKernelId);
697 Any |= convertArg(AI.PrivateSegmentSize, ArgInfo.PrivateSegmentSize);
698 Any |= convertArg(AI.WorkGroupIDX, ArgInfo.WorkGroupIDX);
699 Any |= convertArg(AI.WorkGroupIDY, ArgInfo.WorkGroupIDY);
700 Any |= convertArg(AI.WorkGroupIDZ, ArgInfo.WorkGroupIDZ);
701 Any |= convertArg(AI.WorkGroupInfo, ArgInfo.WorkGroupInfo);
702 Any |= convertArg(AI.PrivateSegmentWaveByteOffset,
703 ArgInfo.PrivateSegmentWaveByteOffset);
704 Any |= convertArg(AI.ImplicitArgPtr, ArgInfo.ImplicitArgPtr);
705 Any |= convertArg(AI.ImplicitBufferPtr, ArgInfo.ImplicitBufferPtr);
706 Any |= convertArg(AI.WorkItemIDX, ArgInfo.WorkItemIDX);
707 Any |= convertArg(AI.WorkItemIDY, ArgInfo.WorkItemIDY);
708 Any |= convertArg(AI.WorkItemIDZ, ArgInfo.WorkItemIDZ);
709
710 // Write FirstKernArgPreloadReg separately, since it's a Register,
711 // not ArgDescriptor.
712 if (ArgInfo.FirstKernArgPreloadReg) {
713 Register Reg = ArgInfo.FirstKernArgPreloadReg;
714 assert(Reg.isPhysical() &&
715 "FirstKernArgPreloadReg must be a physical register");
716
719 OS << printReg(Reg, &TRI);
720
722 Any = true;
723 }
724
725 if (Any)
726 return AI;
727
728 return std::nullopt;
729}
730
// yaml::SIMachineFunctionInfo constructor: snapshot the live
// SIMachineFunctionInfo into its YAML mirror for MIR serialization; register
// fields are stringified via regToString and argument info via
// convertArgumentInfo.
// NOTE(review): this extract is missing the signature's first lines and the
// assignment bodies under the LongBranchReservedReg / VGPRForAGPRCopy /
// SGPRForEXECCopy / ScavengeFI conditionals (original lines 765, 767, 770,
// 774) -- reconcile with the upstream file before editing.
733 const llvm::MachineFunction &MF)
734 : ExplicitKernArgSize(MFI.getExplicitKernArgSize()),
735 MaxKernArgAlign(MFI.getMaxKernArgAlign()), LDSSize(MFI.getLDSSize()),
736 GDSSize(MFI.getGDSSize()), DynLDSAlign(MFI.getDynLDSAlign()),
737 IsEntryFunction(MFI.isEntryFunction()), MemoryBound(MFI.isMemoryBound()),
738 WaveLimiter(MFI.needsWaveLimiter()),
739 HasSpilledSGPRs(MFI.hasSpilledSGPRs()),
740 HasSpilledVGPRs(MFI.hasSpilledVGPRs()),
741 NumWaveDispatchSGPRs(MFI.getNumWaveDispatchSGPRs()),
742 NumWaveDispatchVGPRs(MFI.getNumWaveDispatchVGPRs()),
743 HighBitsOf32BitAddress(MFI.get32BitAddressHighBits()),
744 Occupancy(MFI.getOccupancy()),
745 ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)),
746 FrameOffsetReg(regToString(MFI.getFrameOffsetReg(), TRI)),
747 StackPtrOffsetReg(regToString(MFI.getStackPtrOffsetReg(), TRI)),
748 BytesInStackArgArea(MFI.getBytesInStackArgArea()),
749 ReturnsVoid(MFI.returnsVoid()),
750 ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)),
751 PSInputAddr(MFI.getPSInputAddr()), PSInputEnable(MFI.getPSInputEnable()),
752 MaxMemoryClusterDWords(MFI.getMaxMemoryClusterDWords()),
753 Mode(MFI.getMode()), HasInitWholeWave(MFI.hasInitWholeWave()),
754 IsWholeWaveFunction(MFI.isWholeWaveFunction()),
755 DynamicVGPRBlockSize(MFI.getDynamicVGPRBlockSize()),
756 ScratchReservedForDynamicVGPRs(MFI.getScratchReservedForDynamicVGPRs()),
757 NumKernargPreloadSGPRs(MFI.getNumKernargPreloadedSGPRs()) {
758 for (Register Reg : MFI.getSGPRSpillPhysVGPRs())
759 SpillPhysVGPRS.push_back(regToString(Reg, TRI));
760
761 for (Register Reg : MFI.getWWMReservedRegs())
762 WWMReservedRegs.push_back(regToString(Reg, TRI));
763
764 if (MFI.getLongBranchReservedReg())
766 if (MFI.getVGPRForAGPRCopy())
768
769 if (MFI.getSGPRForEXECCopy())
771
772 auto SFI = MFI.getOptionalScavengeFI();
773 if (SFI)
775}
776
780
// bool SIMachineFunctionInfo::initializeBaseYamlFields(...): populate this
// machine function info from its parsed YAML mirror during MIR deserialization.
// Returns true on error, after filling Error/SourceRange with a diagnostic
// for an unresolvable scavenge frame index.
// NOTE(review): this extract is missing the signature's first/last lines and
// two assignments (original lines 783-785 and 794, e.g. the
// ExplicitKernArgSize / IsEntryFunction fields) -- reconcile with the
// upstream file before editing.
782 const yaml::SIMachineFunctionInfo &YamlMFI, const MachineFunction &MF,
786 LDSSize = YamlMFI.LDSSize;
787 GDSSize = YamlMFI.GDSSize;
788 DynLDSAlign = YamlMFI.DynLDSAlign;
789 PSInputAddr = YamlMFI.PSInputAddr;
790 PSInputEnable = YamlMFI.PSInputEnable;
791 MaxMemoryClusterDWords = YamlMFI.MaxMemoryClusterDWords;
792 HighBitsOf32BitAddress = YamlMFI.HighBitsOf32BitAddress;
793 Occupancy = YamlMFI.Occupancy;
795 MemoryBound = YamlMFI.MemoryBound;
796 WaveLimiter = YamlMFI.WaveLimiter;
797 HasSpilledSGPRs = YamlMFI.HasSpilledSGPRs;
798 HasSpilledVGPRs = YamlMFI.HasSpilledVGPRs;
799 NumWaveDispatchSGPRs = YamlMFI.NumWaveDispatchSGPRs;
800 NumWaveDispatchVGPRs = YamlMFI.NumWaveDispatchVGPRs;
801 BytesInStackArgArea = YamlMFI.BytesInStackArgArea;
802 ReturnsVoid = YamlMFI.ReturnsVoid;
803 IsWholeWaveFunction = YamlMFI.IsWholeWaveFunction;
804
805 UserSGPRInfo.allocKernargPreloadSGPRs(YamlMFI.NumKernargPreloadSGPRs);
806
807 if (YamlMFI.ScavengeFI) {
808 auto FIOrErr = YamlMFI.ScavengeFI->getFI(MF.getFrameInfo());
809 if (!FIOrErr) {
// Create a diagnostic for the frame index.
811 const MemoryBuffer &Buffer =
812 *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
813
814 Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1, 1,
815 SourceMgr::DK_Error, toString(FIOrErr.takeError()),
816 "", {}, {});
817 SourceRange = YamlMFI.ScavengeFI->SourceRange;
818 return true;
819 }
820 ScavengeFI = *FIOrErr;
821 } else {
822 ScavengeFI = std::nullopt;
823 }
824 return false;
825}
826
// bool SIMachineFunctionInfo::mayUseAGPRs(const Function &F) const:
// true unless the "amdgpu-agpr-alloc" attribute explicitly requests a
// minimum of zero AGPRs (attribute absent => {~0u, ~0u} => true).
// NOTE(review): the signature line is missing from this extract.
828 auto [MinNumAGPR, MaxNumAGPR] =
829 AMDGPU::getIntegerPairAttribute(F, "amdgpu-agpr-alloc", {~0u, ~0u},
830 /*OnlyFirstRequired=*/true);
831 return MinNumAGPR != 0u;
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
Base class for AMDGPU specific classes of TargetSubtarget.
MachineBasicBlock & MBB
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
AMD GCN specific subclass of TargetSubtarget.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
if(PassOpts->AAPipeline)
const GCNTargetMachine & getTM(const GCNSubtarget *STI)
static cl::opt< bool, true > MFMAVGPRFormOpt("amdgpu-mfma-vgpr-form", cl::desc("Whether to force use VGPR for Opc and Dest of MFMA. If " "unspecified, default to compiler heuristics"), cl::location(SIMachineFunctionInfo::MFMAVGPRForm), cl::init(true), cl::Hidden)
static std::optional< yaml::SIArgumentInfo > convertArgumentInfo(const AMDGPUFunctionArgInfo &ArgInfo, const TargetRegisterInfo &TRI)
static yaml::StringValue regToString(Register Reg, const TargetRegisterInfo &TRI)
Interface definition for SIRegisterInfo.
AMDGPUMachineFunction(const Function &F, const AMDGPUSubtarget &ST)
Align DynLDSAlign
Align for dynamic shared memory if any.
uint32_t LDSSize
Number of bytes in the LDS that are being used.
static ClusterDimsAttr get(const Function &F)
Functions, function parameters, and return types can have attributes to indicate how they should be t...
Definition Attributes.h:105
BitVector & reset()
Definition BitVector.h:411
void resize(unsigned N, bool t=false)
resize - Grow or shrink the bitvector.
Definition BitVector.h:360
BitVector & set()
Definition BitVector.h:370
void setBitsInMask(const uint32_t *Mask, unsigned MaskWords=~0u)
setBitsInMask - Add '1' bits from Mask to this vector.
Definition BitVector.h:726
void push_back(bool Val)
Definition BitVector.h:485
Lightweight error class with error context and mandatory checking.
Definition Error.h:159
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:272
const SITargetLowering * getTargetLowering() const override
LLVM_ABI void sortUniqueLiveIns()
Sorts and uniques the LiveIns vector.
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
LLVM_ABI int CreateSpillStackObject(uint64_t Size, Align Alignment)
Create a new statically sized stack object that represents a spill slot, returning a nonnegative iden...
void setStackID(int ObjectIdx, uint8_t ID)
bool hasTailCall() const
Returns true if the function contains a tail call.
void RemoveStackObject(int ObjectIdx)
Remove or mark dead a statically sized stack object.
int getObjectIndexEnd() const
Return one past the maximum frame object index.
uint8_t getStackID(int ObjectIdx) const
int getObjectIndexBegin() const
Return the minimum frame object index.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * cloneInfo(const Ty &Old)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLVM_ABI const MCPhysReg * getCalleeSavedRegs() const
Returns list of callee saved registers.
void reserveReg(MCRegister PhysReg, const TargetRegisterInfo *TRI)
reserveReg – Mark a register as reserved so checks like isAllocatable will not suggest using it.
LLVM_ABI void replaceRegWith(Register FromReg, Register ToReg)
replaceRegWith - Replace all instances of FromReg with ToReg in the machine function.
This interface provides simple read-only access to a block of memory, and provides simple methods for...
virtual StringRef getBufferIdentifier() const
Return an identifier for this buffer, typically the filename it was read from.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool initializeBaseYamlFields(const yaml::SIMachineFunctionInfo &YamlMFI, const MachineFunction &MF, PerFunctionMIParsingState &PFS, SMDiagnostic &Error, SMRange &SourceRange)
void shiftWwmVGPRsToLowestRange(MachineFunction &MF, SmallVectorImpl< Register > &WWMVGPRs, BitVector &SavedVGPRs)
Register addPrivateSegmentSize(const SIRegisterInfo &TRI)
void allocateWWMSpill(MachineFunction &MF, Register VGPR, uint64_t Size=4, Align Alignment=Align(4))
Register addDispatchPtr(const SIRegisterInfo &TRI)
Register addFlatScratchInit(const SIRegisterInfo &TRI)
ArrayRef< Register > getSGPRSpillPhysVGPRs() const
int getScavengeFI(MachineFrameInfo &MFI, const SIRegisterInfo &TRI)
Register addQueuePtr(const SIRegisterInfo &TRI)
SIMachineFunctionInfo(const SIMachineFunctionInfo &MFI)=default
Register getGITPtrLoReg(const MachineFunction &MF) const
bool allocateVGPRSpillToAGPR(MachineFunction &MF, int FI, bool isAGPRtoVGPR)
Reserve AGPRs or VGPRs to support spilling for FrameIndex FI.
void splitWWMSpillRegisters(MachineFunction &MF, SmallVectorImpl< std::pair< Register, int > > &CalleeSavedRegs, SmallVectorImpl< std::pair< Register, int > > &ScratchRegs) const
bool mayUseAGPRs(const Function &F) const
bool isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg Reg) const
bool allocateSGPRSpillToVGPRLane(MachineFunction &MF, int FI, bool SpillToPhysVGPRLane=false, bool IsPrologEpilog=false)
Register addKernargSegmentPtr(const SIRegisterInfo &TRI)
Register addDispatchID(const SIRegisterInfo &TRI)
bool removeDeadFrameIndices(MachineFrameInfo &MFI, bool ResetSGPRSpillStackIDs)
If ResetSGPRSpillStackIDs is true, reset the stack ID from sgpr-spill to the default stack.
MachineFunctionInfo * clone(BumpPtrAllocator &Allocator, MachineFunction &DestMF, const DenseMap< MachineBasicBlock *, MachineBasicBlock * > &Src2DstMBB) const override
Make a functionally equivalent copy of this MachineFunctionInfo in MF.
bool checkIndexInPrologEpilogSGPRSpills(int FI) const
Register addPrivateSegmentBuffer(const SIRegisterInfo &TRI)
const ReservedRegSet & getWWMReservedRegs() const
std::optional< int > getOptionalScavengeFI() const
Register addImplicitBufferPtr(const SIRegisterInfo &TRI)
void limitOccupancy(const MachineFunction &MF)
SmallVectorImpl< MCRegister > * addPreloadedKernArg(const SIRegisterInfo &TRI, const TargetRegisterClass *RC, unsigned AllocSizeDWord, int KernArgIdx, int PaddingSGPRs)
static bool isChainScratchRegister(Register VGPR)
Instances of this class encapsulate one diagnostic report, allowing printing to a raw_ostream as a ca...
Definition SourceMgr.h:297
Represents a location in source code.
Definition SMLoc.h:22
Represents a range in source code.
Definition SMLoc.h:47
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
typename SuperClass::const_iterator const_iterator
unsigned getMainFileID() const
Definition SourceMgr.h:148
const MemoryBuffer * getMemoryBuffer(unsigned i) const
Definition SourceMgr.h:141
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
bool consumeInteger(unsigned Radix, T &Result)
Parse the current string as an integer of the specified radix.
Definition StringRef.h:519
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:140
const TargetMachine & getTargetMachine() const
ArrayRef< MCPhysReg > getRegisters() const
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
A raw_ostream that writes to an std::string.
unsigned getInitialPSInputAddr(const Function &F)
unsigned getDynamicVGPRBlockSize(const Function &F)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
std::pair< unsigned, unsigned > getIntegerPairAttribute(const Function &F, StringRef Name, std::pair< unsigned, unsigned > Default, bool OnlyFirstRequired)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
CallingConv Namespace - This namespace contains an enum with a value for the well-known calling conve...
Definition CallingConv.h:21
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ SPIR_KERNEL
Used for SPIR kernel functions.
initializer< Ty > init(const Ty &Val)
LocationClass< Ty > location(Ty &L)
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1765
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
std::string toString(const APInt &I, unsigned Radix, bool Signed, bool formatAsCLiteral=false, bool UpperCase=true, bool InsertSeparators=false)
constexpr unsigned DefaultMemoryClusterDWordsLimit
Definition SIInstrInfo.h:40
BumpPtrAllocatorImpl<> BumpPtrAllocator
The standard BumpPtrAllocator which just uses the default template parameters.
Definition Allocator.h:383
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Definition SCCPSolver.h:42
MachineFunctionInfo - This class can be derived from and used by targets to hold private target-speci...
A serializaable representation of a reference to a stack object or fixed stack object.
This class should be specialized by any type that needs to be converted to/from a YAML mapping.
Definition YAMLTraits.h:62
std::optional< SIArgument > PrivateSegmentWaveByteOffset
std::optional< SIArgument > WorkGroupIDY
std::optional< SIArgument > FlatScratchInit
std::optional< SIArgument > DispatchPtr
std::optional< SIArgument > DispatchID
std::optional< SIArgument > WorkItemIDY
std::optional< SIArgument > WorkGroupIDX
std::optional< SIArgument > ImplicitArgPtr
std::optional< SIArgument > QueuePtr
std::optional< SIArgument > WorkGroupInfo
std::optional< SIArgument > LDSKernelId
std::optional< SIArgument > ImplicitBufferPtr
std::optional< SIArgument > WorkItemIDX
std::optional< SIArgument > KernargSegmentPtr
std::optional< SIArgument > WorkItemIDZ
std::optional< SIArgument > PrivateSegmentSize
std::optional< SIArgument > PrivateSegmentBuffer
std::optional< SIArgument > FirstKernArgPreloadReg
std::optional< SIArgument > WorkGroupIDZ
std::optional< unsigned > Mask
static SIArgument createArgument(bool IsReg)
SmallVector< StringValue > WWMReservedRegs
void mappingImpl(yaml::IO &YamlIO) override
std::optional< SIArgumentInfo > ArgInfo
SmallVector< StringValue, 2 > SpillPhysVGPRS
std::optional< FrameIndex > ScavengeFI
A wrapper around std::string which contains a source range that's being set during parsing.