LLVM 23.0.0git
SIRegisterInfo.cpp
Go to the documentation of this file.
1//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI implementation of the TargetRegisterInfo class.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPU.h"
16#include "GCNSubtarget.h"
20#include "SIRegisterInfo.h"
26
27using namespace llvm;
28
29#define GET_REGINFO_TARGET_DESC
30#include "AMDGPUGenRegisterInfo.inc"
31
33 "amdgpu-spill-sgpr-to-vgpr",
34 cl::desc("Enable spilling SGPRs to VGPRs"),
36 cl::init(true));
37
// Out-of-line storage for the static lookup tables declared in the class;
// both are populated exactly once by the SIRegisterInfo constructor (guarded
// by llvm::call_once).
// RegSplitParts[N-1][P] holds the subregister index of the P-th part of size
// N*16 bits (see InitializeRegSplitPartsOnce).
std::array<std::vector<int16_t>, 32> SIRegisterInfo::RegSplitParts;
// SubRegFromChannelTable[W][C] holds the subregister index covering channel
// (DWORD) C for a table width slot W (see getSubRegFromChannel and
// InitializeSubRegFromChannelTableOnce).
std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable;

// Map numbers of DWORDs to indexes in SubRegFromChannelTable.
// Valid indexes are shifted 1, such that a 0 mapping means unsupported.
// e.g. for 8 DWORDs (256-bit), SubRegFromChannelTableWidthMap[8] = 8,
// meaning index 7 in SubRegFromChannelTable.
static const std::array<unsigned, 17> SubRegFromChannelTableWidthMap = {
    0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 9};
47
48static void emitUnsupportedError(const Function &Fn, const MachineInstr &MI,
49 const Twine &ErrMsg) {
51 DiagnosticInfoUnsupported(Fn, ErrMsg, MI.getDebugLoc()));
52}
53
54namespace llvm {
55
56// A temporary struct to spill SGPRs.
57// This is mostly to spill SGPRs to memory. Spilling SGPRs into VGPR lanes emits
58// just v_writelane and v_readlane.
59//
60// When spilling to memory, the SGPRs are written into VGPR lanes and the VGPR
61// is saved to scratch (or the other way around for loads).
62// For this, a VGPR is required where the needed lanes can be clobbered. The
63// RegScavenger can provide a VGPR where currently active lanes can be
64// clobbered, but we still need to save inactive lanes.
65// The high-level steps are:
66// - Try to scavenge SGPR(s) to save exec
67// - Try to scavenge VGPR
68// - Save needed, all or inactive lanes of a TmpVGPR
69// - Spill/Restore SGPRs using TmpVGPR
70// - Restore TmpVGPR
71//
72// To save all lanes of TmpVGPR, exec needs to be saved and modified. If we
73// cannot scavenge temporary SGPRs to save exec, we use the following code:
74// buffer_store_dword TmpVGPR ; only if active lanes need to be saved
75// s_not exec, exec
76// buffer_store_dword TmpVGPR ; save inactive lanes
77// s_not exec, exec
79 struct PerVGPRData {
80 unsigned PerVGPR;
81 unsigned NumVGPRs;
82 int64_t VGPRLanes;
83 };
84
85 // The SGPR to save
89 unsigned NumSubRegs;
90 bool IsKill;
91 const DebugLoc &DL;
92
93 /* When spilling to stack */
94 // The SGPRs are written into this VGPR, which is then written to scratch
95 // (or vice versa for loads).
96 Register TmpVGPR = AMDGPU::NoRegister;
97 // Temporary spill slot to save TmpVGPR to.
98 int TmpVGPRIndex = 0;
99 // If TmpVGPR is live before the spill or if it is scavenged.
100 bool TmpVGPRLive = false;
101 // Scavenged SGPR to save EXEC.
102 Register SavedExecReg = AMDGPU::NoRegister;
103 // Stack index to write the SGPRs to.
104 int Index;
105 unsigned EltSize = 4;
106
115 unsigned MovOpc;
116 unsigned NotOpc;
117
121 : SGPRSpillBuilder(TRI, TII, IsWave32, MI, MI->getOperand(0).getReg(),
122 MI->getOperand(0).isKill(), Index, RS) {}
123
126 bool IsKill, int Index, RegScavenger *RS)
127 : SuperReg(Reg), MI(MI), IsKill(IsKill), DL(MI->getDebugLoc()),
128 Index(Index), RS(RS), MBB(MI->getParent()), MF(*MBB->getParent()),
129 MFI(*MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
131 const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg);
132 SplitParts = TRI.getRegSplitParts(RC, EltSize);
133 NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
134
135 if (IsWave32) {
136 ExecReg = AMDGPU::EXEC_LO;
137 MovOpc = AMDGPU::S_MOV_B32;
138 NotOpc = AMDGPU::S_NOT_B32;
139 } else {
140 ExecReg = AMDGPU::EXEC;
141 MovOpc = AMDGPU::S_MOV_B64;
142 NotOpc = AMDGPU::S_NOT_B64;
143 }
144
145 assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
146 assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
147 SuperReg != AMDGPU::EXEC && "exec should never spill");
148 }
149
152 Data.PerVGPR = IsWave32 ? 32 : 64;
153 Data.NumVGPRs = (NumSubRegs + (Data.PerVGPR - 1)) / Data.PerVGPR;
154 Data.VGPRLanes = (1LL << std::min(Data.PerVGPR, NumSubRegs)) - 1LL;
155 return Data;
156 }
157
158 // Tries to scavenge SGPRs to save EXEC and a VGPR. Uses v0 if no VGPR is
159 // free.
160 // Writes these instructions if an SGPR can be scavenged:
161 // s_mov_b64 s[6:7], exec ; Save exec
162 // s_mov_b64 exec, 3 ; Wanted lanemask
163 // buffer_store_dword v1 ; Write scavenged VGPR to emergency slot
164 //
165 // Writes these instructions if no SGPR can be scavenged:
166 // buffer_store_dword v0 ; Only if no free VGPR was found
167 // s_not_b64 exec, exec
168 // buffer_store_dword v0 ; Save inactive lanes
169 // ; exec stays inverted, it is flipped back in
170 // ; restore.
171 void prepare() {
172 // Scavenged temporary VGPR to use. It must be scavenged once for any number
173 // of spilled subregs.
174 // FIXME: The liveness analysis is limited and does not tell if a register
175 // is in use in lanes that are currently inactive. We can never be sure if
176 // a register as actually in use in another lane, so we need to save all
177 // used lanes of the chosen VGPR.
178 assert(RS && "Cannot spill SGPR to memory without RegScavenger");
179 TmpVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false,
180 0, false);
181
182 // Reserve temporary stack slot
183 TmpVGPRIndex = MFI.getScavengeFI(MF.getFrameInfo(), TRI);
184 if (TmpVGPR) {
185 // Found a register that is dead in the currently active lanes, we only
186 // need to spill inactive lanes.
187 TmpVGPRLive = false;
188 } else {
189 // Pick v0 because it doesn't make a difference.
190 TmpVGPR = AMDGPU::VGPR0;
191 TmpVGPRLive = true;
192 }
193
194 if (TmpVGPRLive) {
195 // We need to inform the scavenger that this index is already in use until
196 // we're done with the custom emergency spill.
197 RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR);
198 }
199
200 // We may end up recursively calling the scavenger, and don't want to re-use
201 // the same register.
202 RS->setRegUsed(TmpVGPR);
203
204 // Try to scavenge SGPRs to save exec
205 assert(!SavedExecReg && "Exec is already saved, refuse to save again");
206 const TargetRegisterClass &RC =
207 IsWave32 ? AMDGPU::SGPR_32RegClass : AMDGPU::SGPR_64RegClass;
208 RS->setRegUsed(SuperReg);
209 SavedExecReg = RS->scavengeRegisterBackwards(RC, MI, false, 0, false);
210
211 int64_t VGPRLanes = getPerVGPRData().VGPRLanes;
212
213 if (SavedExecReg) {
214 RS->setRegUsed(SavedExecReg);
215 // Set exec to needed lanes
217 auto I =
218 BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg).addImm(VGPRLanes);
219 if (!TmpVGPRLive)
221 // Spill needed lanes
222 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
223 } else {
224 // The modify and restore of exec clobber SCC, which we would have to save
225 // and restore. FIXME: We probably would need to reserve a register for
226 // this.
227 if (RS->isRegUsed(AMDGPU::SCC))
228 emitUnsupportedError(MF.getFunction(), *MI,
229 "unhandled SGPR spill to memory");
230
231 // Spill active lanes
232 if (TmpVGPRLive)
233 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false,
234 /*IsKill*/ false);
235 // Spill inactive lanes
236 auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
237 if (!TmpVGPRLive)
239 I->getOperand(2).setIsDead(); // Mark SCC as dead.
240 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
241 }
242 }
243
244 // Writes these instructions if an SGPR can be scavenged:
245 // buffer_load_dword v1 ; Write scavenged VGPR to emergency slot
246 // s_waitcnt vmcnt(0) ; If a free VGPR was found
247 // s_mov_b64 exec, s[6:7] ; Save exec
248 //
249 // Writes these instructions if no SGPR can be scavenged:
250 // buffer_load_dword v0 ; Restore inactive lanes
251 // s_waitcnt vmcnt(0) ; If a free VGPR was found
252 // s_not_b64 exec, exec
253 // buffer_load_dword v0 ; Only if no free VGPR was found
254 void restore() {
255 if (SavedExecReg) {
256 // Restore used lanes
257 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
258 /*IsKill*/ false);
259 // Restore exec
260 auto I = BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg)
262 // Add an implicit use of the load so it is not dead.
263 // FIXME This inserts an unnecessary waitcnt
264 if (!TmpVGPRLive) {
266 }
267 } else {
268 // Restore inactive lanes
269 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
270 /*IsKill*/ false);
271 auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
272 if (!TmpVGPRLive)
274 I->getOperand(2).setIsDead(); // Mark SCC as dead.
275
276 // Restore active lanes
277 if (TmpVGPRLive)
278 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true);
279 }
280
281 // Inform the scavenger where we're releasing our custom scavenged register.
282 if (TmpVGPRLive) {
283 MachineBasicBlock::iterator RestorePt = std::prev(MI);
284 RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR, &*RestorePt);
285 }
286 }
287
288 // Write TmpVGPR to memory or read TmpVGPR from memory.
289 // Either using a single buffer_load/store if exec is set to the needed mask
290 // or using
291 // buffer_load
292 // s_not exec, exec
293 // buffer_load
294 // s_not exec, exec
295 void readWriteTmpVGPR(unsigned Offset, bool IsLoad) {
296 if (SavedExecReg) {
297 // Spill needed lanes
298 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
299 } else {
300 // The modify and restore of exec clobber SCC, which we would have to save
301 // and restore. FIXME: We probably would need to reserve a register for
302 // this.
303 if (RS->isRegUsed(AMDGPU::SCC))
304 emitUnsupportedError(MF.getFunction(), *MI,
305 "unhandled SGPR spill to memory");
306
307 // Spill active lanes
308 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad,
309 /*IsKill*/ false);
310 // Spill inactive lanes
311 auto Not0 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
312 Not0->getOperand(2).setIsDead(); // Mark SCC as dead.
313 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
314 auto Not1 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
315 Not1->getOperand(2).setIsDead(); // Mark SCC as dead.
316 }
317 }
318
320 assert(MBB->getParent() == &MF);
321 MI = NewMI;
322 MBB = NewMBB;
323 }
324};
325
326} // namespace llvm
327
329 : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour(),
330 ST.getAMDGPUDwarfFlavour(),
331 /*PC=*/0,
332 ST.getHwMode(MCSubtargetInfo::HwMode_RegInfo)),
333 ST(ST), SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) {
334
335 assert(getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() == 3 &&
336 getSubRegIndexLaneMask(AMDGPU::sub31).getAsInteger() == (3ULL << 62) &&
337 (getSubRegIndexLaneMask(AMDGPU::lo16) |
338 getSubRegIndexLaneMask(AMDGPU::hi16)).getAsInteger() ==
339 getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() &&
340 "getNumCoveredRegs() will not work with generated subreg masks!");
341
342 RegPressureIgnoredUnits.resize(getNumRegUnits());
343 RegPressureIgnoredUnits.set(
344 static_cast<unsigned>(*regunits(MCRegister::from(AMDGPU::M0)).begin()));
345 for (auto Reg : AMDGPU::VGPR_16RegClass) {
346 if (AMDGPU::isHi16Reg(Reg, *this))
347 RegPressureIgnoredUnits.set(
348 static_cast<unsigned>(*regunits(Reg).begin()));
349 }
350
351 // HACK: Until this is fully tablegen'd.
352 static llvm::once_flag InitializeRegSplitPartsFlag;
353
354 static auto InitializeRegSplitPartsOnce = [this]() {
355 for (unsigned Idx = 1, E = getNumSubRegIndices() - 1; Idx < E; ++Idx) {
356 unsigned Size = getSubRegIdxSize(Idx);
357 if (Size & 15)
358 continue;
359 std::vector<int16_t> &Vec = RegSplitParts[Size / 16 - 1];
360 unsigned Pos = getSubRegIdxOffset(Idx);
361 if (Pos % Size)
362 continue;
363 Pos /= Size;
364 if (Vec.empty()) {
365 unsigned MaxNumParts = 1024 / Size; // Maximum register is 1024 bits.
366 Vec.resize(MaxNumParts);
367 }
368 Vec[Pos] = Idx;
369 }
370 };
371
372 static llvm::once_flag InitializeSubRegFromChannelTableFlag;
373
374 static auto InitializeSubRegFromChannelTableOnce = [this]() {
375 for (auto &Row : SubRegFromChannelTable)
376 Row.fill(AMDGPU::NoSubRegister);
377 for (unsigned Idx = 1; Idx < getNumSubRegIndices(); ++Idx) {
378 unsigned Width = getSubRegIdxSize(Idx) / 32;
379 unsigned Offset = getSubRegIdxOffset(Idx) / 32;
381 Width = SubRegFromChannelTableWidthMap[Width];
382 if (Width == 0)
383 continue;
384 unsigned TableIdx = Width - 1;
385 assert(TableIdx < SubRegFromChannelTable.size());
386 assert(Offset < SubRegFromChannelTable[TableIdx].size());
387 SubRegFromChannelTable[TableIdx][Offset] = Idx;
388 }
389 };
390
391 llvm::call_once(InitializeRegSplitPartsFlag, InitializeRegSplitPartsOnce);
392 llvm::call_once(InitializeSubRegFromChannelTableFlag,
393 InitializeSubRegFromChannelTableOnce);
394}
395
396void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved,
397 MCRegister Reg) const {
398 for (MCRegAliasIterator R(Reg, this, true); R.isValid(); ++R)
399 Reserved.set(*R);
400}
401
402// Forced to be here by one .inc
404 const MachineFunction *MF) const {
406 switch (CC) {
407 case CallingConv::C:
410 return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_SaveList
411 : CSR_AMDGPU_SaveList;
414 return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_SaveList
415 : CSR_AMDGPU_SI_Gfx_SaveList;
417 return CSR_AMDGPU_CS_ChainPreserve_SaveList;
418 default: {
419 // Dummy to not crash RegisterClassInfo.
420 static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister;
421 return &NoCalleeSavedReg;
422 }
423 }
424}
425
426const MCPhysReg *
428 return nullptr;
429}
430
432 CallingConv::ID CC) const {
433 switch (CC) {
434 case CallingConv::C:
437 return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_RegMask
438 : CSR_AMDGPU_RegMask;
441 return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_RegMask
442 : CSR_AMDGPU_SI_Gfx_RegMask;
445 // Calls to these functions never return, so we can pretend everything is
446 // preserved.
447 return AMDGPU_AllVGPRs_RegMask;
448 default:
449 return nullptr;
450 }
451}
452
454 return CSR_AMDGPU_NoRegs_RegMask;
455}
456
458 return VGPR >= AMDGPU::VGPR0 && VGPR < AMDGPU::VGPR8;
459}
460
463 const MachineFunction &MF) const {
464 // FIXME: Should have a helper function like getEquivalentVGPRClass to get the
465 // equivalent AV class. If used one, the verifier will crash after
466 // RegBankSelect in the GISel flow. The aligned regclasses are not fully given
467 // until Instruction selection.
468 if (ST.hasMAIInsts() && (isVGPRClass(RC) || isAGPRClass(RC))) {
469 if (RC == &AMDGPU::VGPR_32RegClass || RC == &AMDGPU::AGPR_32RegClass)
470 return &AMDGPU::AV_32RegClass;
471 if (RC == &AMDGPU::VReg_64RegClass || RC == &AMDGPU::AReg_64RegClass)
472 return &AMDGPU::AV_64RegClass;
473 if (RC == &AMDGPU::VReg_64_Align2RegClass ||
474 RC == &AMDGPU::AReg_64_Align2RegClass)
475 return &AMDGPU::AV_64_Align2RegClass;
476 if (RC == &AMDGPU::VReg_96RegClass || RC == &AMDGPU::AReg_96RegClass)
477 return &AMDGPU::AV_96RegClass;
478 if (RC == &AMDGPU::VReg_96_Align2RegClass ||
479 RC == &AMDGPU::AReg_96_Align2RegClass)
480 return &AMDGPU::AV_96_Align2RegClass;
481 if (RC == &AMDGPU::VReg_128RegClass || RC == &AMDGPU::AReg_128RegClass)
482 return &AMDGPU::AV_128RegClass;
483 if (RC == &AMDGPU::VReg_128_Align2RegClass ||
484 RC == &AMDGPU::AReg_128_Align2RegClass)
485 return &AMDGPU::AV_128_Align2RegClass;
486 if (RC == &AMDGPU::VReg_160RegClass || RC == &AMDGPU::AReg_160RegClass)
487 return &AMDGPU::AV_160RegClass;
488 if (RC == &AMDGPU::VReg_160_Align2RegClass ||
489 RC == &AMDGPU::AReg_160_Align2RegClass)
490 return &AMDGPU::AV_160_Align2RegClass;
491 if (RC == &AMDGPU::VReg_192RegClass || RC == &AMDGPU::AReg_192RegClass)
492 return &AMDGPU::AV_192RegClass;
493 if (RC == &AMDGPU::VReg_192_Align2RegClass ||
494 RC == &AMDGPU::AReg_192_Align2RegClass)
495 return &AMDGPU::AV_192_Align2RegClass;
496 if (RC == &AMDGPU::VReg_256RegClass || RC == &AMDGPU::AReg_256RegClass)
497 return &AMDGPU::AV_256RegClass;
498 if (RC == &AMDGPU::VReg_256_Align2RegClass ||
499 RC == &AMDGPU::AReg_256_Align2RegClass)
500 return &AMDGPU::AV_256_Align2RegClass;
501 if (RC == &AMDGPU::VReg_512RegClass || RC == &AMDGPU::AReg_512RegClass)
502 return &AMDGPU::AV_512RegClass;
503 if (RC == &AMDGPU::VReg_512_Align2RegClass ||
504 RC == &AMDGPU::AReg_512_Align2RegClass)
505 return &AMDGPU::AV_512_Align2RegClass;
506 if (RC == &AMDGPU::VReg_1024RegClass || RC == &AMDGPU::AReg_1024RegClass)
507 return &AMDGPU::AV_1024RegClass;
508 if (RC == &AMDGPU::VReg_1024_Align2RegClass ||
509 RC == &AMDGPU::AReg_1024_Align2RegClass)
510 return &AMDGPU::AV_1024_Align2RegClass;
511 }
512
514}
515
517 const SIFrameLowering *TFI = ST.getFrameLowering();
519
520 // During ISel lowering we always reserve the stack pointer in entry and chain
521 // functions, but never actually want to reference it when accessing our own
522 // frame. If we need a frame pointer we use it, but otherwise we can just use
523 // an immediate "0" which we represent by returning NoRegister.
524 if (FuncInfo->isBottomOfStack()) {
525 return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() : Register();
526 }
527 return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg()
528 : FuncInfo->getStackPtrOffsetReg();
529}
530
532 // When we need stack realignment, we can't reference off of the
533 // stack pointer, so we reserve a base pointer.
534 return shouldRealignStack(MF);
535}
536
537Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; }
538
540 return AMDGPU_AllVGPRs_RegMask;
541}
542
544 return AMDGPU_AllAGPRs_RegMask;
545}
546
548 return AMDGPU_AllVectorRegs_RegMask;
549}
550
552 return AMDGPU_AllAllocatableSRegs_RegMask;
553}
554
555unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel,
556 unsigned NumRegs) {
557 assert(NumRegs < SubRegFromChannelTableWidthMap.size());
558 unsigned NumRegIndex = SubRegFromChannelTableWidthMap[NumRegs];
559 assert(NumRegIndex && "Not implemented");
560 assert(Channel < SubRegFromChannelTable[NumRegIndex - 1].size());
561 return SubRegFromChannelTable[NumRegIndex - 1][Channel];
562}
563
566 const unsigned Align,
567 const TargetRegisterClass *RC) const {
568 unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), Align) - Align;
569 MCRegister BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
570 return getMatchingSuperReg(BaseReg, AMDGPU::sub0, RC);
571}
572
574 const MachineFunction &MF) const {
575 return getAlignedHighSGPRForRC(MF, /*Align=*/4, &AMDGPU::SGPR_128RegClass);
576}
577
579 BitVector Reserved(getNumRegs());
580 Reserved.set(AMDGPU::MODE);
581
583
584 // Reserve special purpose registers.
585 //
586 // EXEC_LO and EXEC_HI could be allocated and used as regular register, but
587 // this seems likely to result in bugs, so I'm marking them as reserved.
588 reserveRegisterTuples(Reserved, AMDGPU::EXEC);
589 reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);
590
591 // M0 has to be reserved so that llvm accepts it as a live-in into a block.
592 reserveRegisterTuples(Reserved, AMDGPU::M0);
593
594 // Reserve src_vccz, src_execz, src_scc.
595 reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ);
596 reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ);
597 reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC);
598
599 // Reserve the memory aperture registers
600 reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
601 reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
602 reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
603 reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);
604 reserveRegisterTuples(Reserved, AMDGPU::SRC_FLAT_SCRATCH_BASE_LO);
605 reserveRegisterTuples(Reserved, AMDGPU::SRC_FLAT_SCRATCH_BASE_HI);
606
607 // Reserve async counters pseudo registers
608 reserveRegisterTuples(Reserved, AMDGPU::ASYNCcnt);
609 reserveRegisterTuples(Reserved, AMDGPU::TENSORcnt);
610
611 // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen.
612 reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID);
613
614 // Reserve xnack_mask registers - support is not implemented in Codegen.
615 reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK);
616
617 // Reserve lds_direct register - support is not implemented in Codegen.
618 reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT);
619
620 // Reserve Trap Handler registers - support is not implemented in Codegen.
621 reserveRegisterTuples(Reserved, AMDGPU::TBA);
622 reserveRegisterTuples(Reserved, AMDGPU::TMA);
623 reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1);
624 reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3);
625 reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5);
626 reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7);
627 reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
628 reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
629 reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13);
630 reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15);
631
632 // Reserve null register - it shall never be allocated
633 reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL64);
634
635 // Reserve SGPRs.
636 //
637 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
638 unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
639 for (const TargetRegisterClass *RC : regclasses()) {
640 if (RC->isBaseClass() && isSGPRClass(RC)) {
641 unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
642 for (MCPhysReg Reg : *RC) {
643 unsigned Index = getHWRegIndex(Reg);
644 if (Index + NumRegs > MaxNumSGPRs && Index < TotalNumSGPRs)
645 Reserved.set(Reg);
646 }
647 }
648 }
649
650 Register ScratchRSrcReg = MFI->getScratchRSrcReg();
651 if (ScratchRSrcReg != AMDGPU::NoRegister) {
652 // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we
653 // need to spill.
654 // TODO: May need to reserve a VGPR if doing LDS spilling.
655 reserveRegisterTuples(Reserved, ScratchRSrcReg);
656 }
657
658 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
659 if (LongBranchReservedReg)
660 reserveRegisterTuples(Reserved, LongBranchReservedReg);
661
662 // We have to assume the SP is needed in case there are calls in the function,
663 // which is detected after the function is lowered. If we aren't really going
664 // to need SP, don't bother reserving it.
665 MCRegister StackPtrReg = MFI->getStackPtrOffsetReg();
666 if (StackPtrReg) {
667 reserveRegisterTuples(Reserved, StackPtrReg);
668 assert(!isSubRegister(ScratchRSrcReg, StackPtrReg));
669 }
670
671 MCRegister FrameReg = MFI->getFrameOffsetReg();
672 if (FrameReg) {
673 reserveRegisterTuples(Reserved, FrameReg);
674 assert(!isSubRegister(ScratchRSrcReg, FrameReg));
675 }
676
677 if (hasBasePointer(MF)) {
678 MCRegister BasePtrReg = getBaseRegister();
679 reserveRegisterTuples(Reserved, BasePtrReg);
680 assert(!isSubRegister(ScratchRSrcReg, BasePtrReg));
681 }
682
683 // FIXME: Use same reserved register introduced in D149775
684 // SGPR used to preserve EXEC MASK around WWM spill/copy instructions.
685 Register ExecCopyReg = MFI->getSGPRForEXECCopy();
686 if (ExecCopyReg)
687 reserveRegisterTuples(Reserved, ExecCopyReg);
688
689 // Reserve VGPRs/AGPRs.
690 //
691 auto [MaxNumVGPRs, MaxNumAGPRs] = ST.getMaxNumVectorRegs(MF.getFunction());
692
693 for (const TargetRegisterClass *RC : regclasses()) {
694 if (RC->isBaseClass() && isVGPRClass(RC)) {
695 unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
696 for (MCPhysReg Reg : *RC) {
697 unsigned Index = getHWRegIndex(Reg);
698 if (Index + NumRegs > MaxNumVGPRs)
699 Reserved.set(Reg);
700 }
701 }
702 }
703
704 // Reserve all the AGPRs if there are no instructions to use it.
705 if (!ST.hasMAIInsts())
706 MaxNumAGPRs = 0;
707 for (const TargetRegisterClass *RC : regclasses()) {
708 if (RC->isBaseClass() && isAGPRClass(RC)) {
709 unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
710 for (MCPhysReg Reg : *RC) {
711 unsigned Index = getHWRegIndex(Reg);
712 if (Index + NumRegs > MaxNumAGPRs)
713 Reserved.set(Reg);
714 }
715 }
716 }
717
718 // On GFX908, in order to guarantee copying between AGPRs, we need a scratch
719 // VGPR available at all times.
720 if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
721 reserveRegisterTuples(Reserved, MFI->getVGPRForAGPRCopy());
722 }
723
724 // During wwm-regalloc, reserve the registers for perlane VGPR allocation. The
725 // MFI->getNonWWMRegMask() field will have a valid bitmask only during
726 // wwm-regalloc and it would be empty otherwise.
727 BitVector NonWWMRegMask = MFI->getNonWWMRegMask();
728 if (!NonWWMRegMask.empty()) {
729 for (unsigned RegI = AMDGPU::VGPR0, RegE = AMDGPU::VGPR0 + MaxNumVGPRs;
730 RegI < RegE; ++RegI) {
731 if (NonWWMRegMask.test(RegI))
732 reserveRegisterTuples(Reserved, RegI);
733 }
734 }
735
736 for (Register Reg : MFI->getWWMReservedRegs())
737 reserveRegisterTuples(Reserved, Reg);
738
739 // FIXME: Stop using reserved registers for this.
740 for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
741 reserveRegisterTuples(Reserved, Reg);
742
743 for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs())
744 reserveRegisterTuples(Reserved, Reg);
745
746 return Reserved;
747}
748
750 MCRegister PhysReg) const {
751 return !MF.getRegInfo().isReserved(PhysReg);
752}
753
756 // On entry or in chain functions, the base address is 0, so it can't possibly
757 // need any more alignment.
758
759 // FIXME: Should be able to specify the entry frame alignment per calling
760 // convention instead.
761 if (Info->isBottomOfStack())
762 return false;
763
765}
766
769 if (Info->isEntryFunction()) {
770 const MachineFrameInfo &MFI = Fn.getFrameInfo();
771 return MFI.hasStackObjects() || MFI.hasCalls();
772 }
773
774 // May need scavenger for dealing with callee saved registers.
775 return true;
776}
777
779 const MachineFunction &MF) const {
780 // Do not use frame virtual registers. They used to be used for SGPRs, but
781 // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the
782 // scavenger fails, we can increment/decrement the necessary SGPRs to avoid a
783 // spill.
784 return false;
785}
786
788 const MachineFunction &MF) const {
789 const MachineFrameInfo &MFI = MF.getFrameInfo();
790 return MFI.hasStackObjects();
791}
792
794 const MachineFunction &) const {
795 // There are no special dedicated stack or frame pointers.
796 return true;
797}
798
801
802 int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
803 AMDGPU::OpName::offset);
804 return MI->getOperand(OffIdx).getImm();
805}
806
808 int Idx) const {
809 switch (MI->getOpcode()) {
810 case AMDGPU::V_ADD_U32_e32:
811 case AMDGPU::V_ADD_U32_e64:
812 case AMDGPU::V_ADD_CO_U32_e32: {
813 int OtherIdx = Idx == 1 ? 2 : 1;
814 const MachineOperand &OtherOp = MI->getOperand(OtherIdx);
815 return OtherOp.isImm() ? OtherOp.getImm() : 0;
816 }
817 case AMDGPU::V_ADD_CO_U32_e64: {
818 int OtherIdx = Idx == 2 ? 3 : 2;
819 const MachineOperand &OtherOp = MI->getOperand(OtherIdx);
820 return OtherOp.isImm() ? OtherOp.getImm() : 0;
821 }
822 default:
823 break;
824 }
825
827 return 0;
828
829 assert((Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
830 AMDGPU::OpName::vaddr) ||
831 (Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
832 AMDGPU::OpName::saddr))) &&
833 "Should never see frame index on non-address operand");
834
836}
837
839 const MachineInstr &MI) {
840 assert(MI.getDesc().isAdd());
841 const MachineOperand &Src0 = MI.getOperand(1);
842 const MachineOperand &Src1 = MI.getOperand(2);
843
844 if (Src0.isFI()) {
845 return Src1.isImm() || (Src1.isReg() && TRI.isVGPR(MI.getMF()->getRegInfo(),
846 Src1.getReg()));
847 }
848
849 if (Src1.isFI()) {
850 return Src0.isImm() || (Src0.isReg() && TRI.isVGPR(MI.getMF()->getRegInfo(),
851 Src0.getReg()));
852 }
853
854 return false;
855}
856
858 // TODO: Handle v_add_co_u32, v_or_b32, v_and_b32 and scalar opcodes.
859 switch (MI->getOpcode()) {
860 case AMDGPU::V_ADD_U32_e32: {
861 // TODO: We could handle this but it requires work to avoid violating
862 // operand restrictions.
863 if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e32) < 2 &&
864 !isFIPlusImmOrVGPR(*this, *MI))
865 return false;
866 [[fallthrough]];
867 }
868 case AMDGPU::V_ADD_U32_e64:
869 // FIXME: This optimization is barely profitable hasFlatScratchEnabled
870 // as-is.
871 //
872 // Much of the benefit with the MUBUF handling is we avoid duplicating the
873 // shift of the frame register, which isn't needed with scratch.
874 //
875 // materializeFrameBaseRegister doesn't know the register classes of the
876 // uses, and unconditionally uses an s_add_i32, which will end up using a
877 // copy for the vector uses.
878 return !ST.hasFlatScratchEnabled();
879 case AMDGPU::V_ADD_CO_U32_e32:
880 if (ST.getConstantBusLimit(AMDGPU::V_ADD_CO_U32_e32) < 2 &&
881 !isFIPlusImmOrVGPR(*this, *MI))
882 return false;
883 // We can't deal with the case where the carry out has a use (though this
884 // should never happen)
885 return MI->getOperand(3).isDead();
886 case AMDGPU::V_ADD_CO_U32_e64:
887 // TODO: Should we check use_empty instead?
888 return MI->getOperand(1).isDead();
889 default:
890 break;
891 }
892
894 return false;
895
896 int64_t FullOffset = Offset + getScratchInstrOffset(MI);
897
898 const SIInstrInfo *TII = ST.getInstrInfo();
900 return !TII->isLegalMUBUFImmOffset(FullOffset);
901
902 return !TII->isLegalFLATOffset(FullOffset, AMDGPUAS::PRIVATE_ADDRESS,
904}
905
907 int FrameIdx,
908 int64_t Offset) const {
909 MachineBasicBlock::iterator Ins = MBB->begin();
910 DebugLoc DL; // Defaults to "unknown"
911
912 if (Ins != MBB->end())
913 DL = Ins->getDebugLoc();
914
915 MachineFunction *MF = MBB->getParent();
916 const SIInstrInfo *TII = ST.getInstrInfo();
917 MachineRegisterInfo &MRI = MF->getRegInfo();
918 unsigned MovOpc =
919 ST.hasFlatScratchEnabled() ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
920
921 Register BaseReg = MRI.createVirtualRegister(
922 ST.hasFlatScratchEnabled() ? &AMDGPU::SReg_32_XEXEC_HIRegClass
923 : &AMDGPU::VGPR_32RegClass);
924
925 if (Offset == 0) {
926 BuildMI(*MBB, Ins, DL, TII->get(MovOpc), BaseReg)
927 .addFrameIndex(FrameIdx);
928 return BaseReg;
929 }
930
931 Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
932
933 Register FIReg = MRI.createVirtualRegister(ST.hasFlatScratchEnabled()
934 ? &AMDGPU::SReg_32_XM0RegClass
935 : &AMDGPU::VGPR_32RegClass);
936
937 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
938 .addImm(Offset);
939 BuildMI(*MBB, Ins, DL, TII->get(MovOpc), FIReg)
940 .addFrameIndex(FrameIdx);
941
942 if (ST.hasFlatScratchEnabled()) {
943 // FIXME: Make sure scc isn't live in.
944 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_I32), BaseReg)
945 .addReg(OffsetReg, RegState::Kill)
946 .addReg(FIReg)
947 .setOperandDead(3); // scc
948 return BaseReg;
949 }
950
951 TII->getAddNoCarry(*MBB, Ins, DL, BaseReg)
952 .addReg(OffsetReg, RegState::Kill)
953 .addReg(FIReg)
954 .addImm(0); // clamp bit
955
956 return BaseReg;
957}
958
// Rewrite MI's frame-index operand so the address is expressed as BaseReg
// plus the extra byte Offset.
// NOTE(review): the line carrying this method's name (source line 959) is
// elided in this rendering; from the body it is the frame-index resolution
// hook taking (MI, BaseReg, Offset) — confirm against the full source.
//
// Three instruction shapes are handled:
//  * VOP2 v_add (e32): fold Offset into the immediate operand; a zero
//    total offset degenerates the add into a COPY of BaseReg.
//  * VOP3 v_add (e64): same folding, locating src0 after the explicit defs.
//  * Everything else must be a MUBUF or flat-scratch access: patch the
//    vaddr/saddr operand and the immediate offset field in place.
960 int64_t Offset) const {
961 const SIInstrInfo *TII = ST.getInstrInfo();
962
963 switch (MI.getOpcode()) {
964 case AMDGPU::V_ADD_U32_e32:
965 case AMDGPU::V_ADD_CO_U32_e32: {
966 MachineOperand *FIOp = &MI.getOperand(2);
967 MachineOperand *ImmOp = &MI.getOperand(1);
 // The frame index may sit in either source operand; canonicalize so
 // FIOp points at the FI and ImmOp at the other source.
968 if (!FIOp->isFI())
969 std::swap(FIOp, ImmOp);
970
 // The other operand is not an immediate: nothing to fold. Substitute
 // BaseReg for the FI and let the VOP2 operand legalizer fix up any
 // operands that became illegal.
971 if (!ImmOp->isImm()) {
972 assert(Offset == 0);
973 FIOp->ChangeToRegister(BaseReg, false);
974 TII->legalizeOperandsVOP2(MI.getMF()->getRegInfo(), MI);
975 return;
976 }
977
978 int64_t TotalOffset = ImmOp->getImm() + Offset;
 // Adding a zero total offset: the add is now just a copy of BaseReg.
979 if (TotalOffset == 0) {
980 MI.setDesc(TII->get(AMDGPU::COPY));
981 for (unsigned I = MI.getNumOperands() - 1; I != 1; --I)
982 MI.removeOperand(I);
983
984 MI.getOperand(1).ChangeToRegister(BaseReg, false);
985 return;
986 }
987
988 ImmOp->setImm(TotalOffset);
989
990 MachineBasicBlock *MBB = MI.getParent();
991 MachineFunction *MF = MBB->getParent();
992 MachineRegisterInfo &MRI = MF->getRegInfo();
993
994 // FIXME: materializeFrameBaseRegister does not know the register class of
995 // the uses of the frame index, and assumes SGPR for hasFlatScratchEnabled.
996 // Emit a copy so we have a legal operand and hope the register coalescer
997 // can clean it up.
998 if (isSGPRReg(MRI, BaseReg)) {
999 Register BaseRegVGPR =
1000 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1001 BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), BaseRegVGPR)
1002 .addReg(BaseReg);
1003 MI.getOperand(2).ChangeToRegister(BaseRegVGPR, false);
1004 } else {
1005 MI.getOperand(2).ChangeToRegister(BaseReg, false);
1006 }
1007 return;
1008 }
1009 case AMDGPU::V_ADD_U32_e64:
1010 case AMDGPU::V_ADD_CO_U32_e64: {
 // VOP3 form: src0 follows the explicit defs (vdst, and carry-out for
 // the CO variant).
1011 int Src0Idx = MI.getNumExplicitDefs();
1012 MachineOperand *FIOp = &MI.getOperand(Src0Idx);
1013 MachineOperand *ImmOp = &MI.getOperand(Src0Idx + 1);
1014 if (!FIOp->isFI())
1015 std::swap(FIOp, ImmOp);
1016
1017 if (!ImmOp->isImm()) {
1018 FIOp->ChangeToRegister(BaseReg, false);
1019 TII->legalizeOperandsVOP3(MI.getMF()->getRegInfo(), MI);
1020 return;
1021 }
1022
1023 int64_t TotalOffset = ImmOp->getImm() + Offset;
 // As in the e32 case: a zero total offset turns the add into a COPY.
1024 if (TotalOffset == 0) {
1025 MI.setDesc(TII->get(AMDGPU::COPY));
1026
1027 for (unsigned I = MI.getNumOperands() - 1; I != 1; --I)
1028 MI.removeOperand(I);
1029
1030 MI.getOperand(1).ChangeToRegister(BaseReg, false);
1031 } else {
1032 FIOp->ChangeToRegister(BaseReg, false);
1033 ImmOp->setImm(TotalOffset);
1034 }
1035
1036 return;
1037 }
1038 default:
1039 break;
1040 }
1041
1042 bool IsFlat = TII->isFLATScratch(MI);
1043
1044#ifndef NDEBUG
1045 // FIXME: Is it possible to be storing a frame index to itself?
1046 bool SeenFI = false;
1047 for (const MachineOperand &MO: MI.operands()) {
1048 if (MO.isFI()) {
1049 if (SeenFI)
1050 llvm_unreachable("should not see multiple frame indices");
1051
1052 SeenFI = true;
1053 }
1054 }
1055#endif
1056
 // Locate the address operand holding the FI: saddr for flat scratch,
 // vaddr for MUBUF.
1057 MachineOperand *FIOp =
1058 TII->getNamedOperand(MI, IsFlat ? AMDGPU::OpName::saddr
1059 : AMDGPU::OpName::vaddr);
1060
1061 MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
1062 int64_t NewOffset = OffsetOp->getImm() + Offset;
1063
1064 assert(FIOp && FIOp->isFI() && "frame index must be address operand");
1065 assert(TII->isMUBUF(MI) || TII->isFLATScratch(MI));
1066
1067 if (IsFlat) {
 // NOTE(review): the flag argument of this assert (source line 1069) is
 // elided in this rendering; confirm against the full source.
1068 assert(TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
1070 "offset should be legal");
1071 FIOp->ChangeToRegister(BaseReg, false);
1072 OffsetOp->setImm(NewOffset);
1073 return;
1074 }
1075
1076#ifndef NDEBUG
1077 MachineOperand *SOffset = TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
1078 assert(SOffset->isImm() && SOffset->getImm() == 0);
1079#endif
1080
1081 assert(TII->isLegalMUBUFImmOffset(NewOffset) && "offset should be legal");
1082
1083 FIOp->ChangeToRegister(BaseReg, false);
1084 OffsetOp->setImm(NewOffset);
1085}
1086
// Return whether the frame index referenced by MI could legally be replaced
// with BaseReg plus the given byte Offset without rewriting the instruction
// into a different form.
// NOTE(review): the method-name line (source line 1087) is elided in this
// rendering — presumably the isFrameOffsetLegal hook; confirm against the
// full source.
1088 Register BaseReg,
1089 int64_t Offset) const {
1090
 // The v_add pseudos can always take a register base; the VOP3 (e64) form
 // additionally requires the offset to be encodable as a literal or an
 // inlinable integer immediate.
1091 switch (MI->getOpcode()) {
1092 case AMDGPU::V_ADD_U32_e32:
1093 case AMDGPU::V_ADD_CO_U32_e32:
1094 return true;
1095 case AMDGPU::V_ADD_U32_e64:
1096 case AMDGPU::V_ADD_CO_U32_e64:
1097 return ST.hasVOP3Literal() || AMDGPU::isInlinableIntLiteral(Offset);
1098 default:
1099 break;
1100 }
1101
 // NOTE(review): the guard condition for this early return (source line
 // 1102) is elided in this rendering — it appears to reject instructions
 // that are neither MUBUF nor flat-scratch; confirm against the full
 // source.
1103 return false;
1104
1105 int64_t NewOffset = Offset + getScratchInstrOffset(MI);
1106
1107 const SIInstrInfo *TII = ST.getInstrInfo();
 // NOTE(review): the condition selecting the MUBUF path (source line 1108)
 // is elided in this rendering; confirm against the full source.
1109 return TII->isLegalMUBUFImmOffset(NewOffset);
1110
 // NOTE(review): the trailing flag argument of this call (source line
 // 1112) is elided in this rendering; confirm against the full source.
1111 return TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
1113}
1114
// Register class to use for values that hold a pointer/address.
// NOTE(review): the signature line (source line 1116) is elided in this
// rendering — presumably the getPointerRegClass hook; confirm against the
// full source.
1115const TargetRegisterClass *
1117 // This is inaccurate. It depends on the instruction and address space. The
1118 // only place where we should hit this is for dealing with frame indexes /
1119 // private accesses, so this is correct in that case.
1120 return &AMDGPU::VGPR_32RegClass;
1121}
1122
// Register class to route cross-class copies through.
// NOTE(review): the signature line (source line 1124) is elided in this
// rendering — presumably the getCrossCopyRegClass hook taking RC; confirm
// against the full source.
1123const TargetRegisterClass *
 // SCC cannot be copied directly; copies of it go through a 32-bit SGPR.
 // Every other class copies as itself.
1125 return RC == &AMDGPU::SCC_CLASSRegClass ? &AMDGPU::SReg_32RegClass : RC;
1126}
1127
// Map a spill save/restore pseudo-instruction to the number of 32-bit
// registers (DWORD lanes) it covers. Block spills derive the count from
// their lane-mask operand; all other spill pseudos encode the width in the
// opcode (bit width / 32).
// NOTE(review): the signature line (source line 1128) is elided in this
// rendering; the visible parameters are (… MachineInstr &MI,
// const SIInstrInfo *TII) — confirm against the full source.
1129 const SIInstrInfo *TII) {
1130
1131 unsigned Op = MI.getOpcode();
1132 switch (Op) {
1133 case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE:
1134 case AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE:
1135 // FIXME: This assumes the mask is statically known and not computed at
1136 // runtime. However, some ABIs may want to compute the mask dynamically and
1137 // this will need to be updated.
 // One lane is spilled per set bit in the block mask.
1138 return llvm::popcount(
1139 (uint64_t)TII->getNamedOperand(MI, AMDGPU::OpName::mask)->getImm());
 // Non-block spills: the count is the register width in DWORDs.
1140 case AMDGPU::SI_SPILL_S1024_SAVE:
1141 case AMDGPU::SI_SPILL_S1024_RESTORE:
1142 case AMDGPU::SI_SPILL_V1024_SAVE:
1143 case AMDGPU::SI_SPILL_V1024_RESTORE:
1144 case AMDGPU::SI_SPILL_A1024_SAVE:
1145 case AMDGPU::SI_SPILL_A1024_RESTORE:
1146 case AMDGPU::SI_SPILL_AV1024_SAVE:
1147 case AMDGPU::SI_SPILL_AV1024_RESTORE:
1148 return 32;
1149 case AMDGPU::SI_SPILL_S512_SAVE:
1150 case AMDGPU::SI_SPILL_S512_RESTORE:
1151 case AMDGPU::SI_SPILL_V512_SAVE:
1152 case AMDGPU::SI_SPILL_V512_RESTORE:
1153 case AMDGPU::SI_SPILL_A512_SAVE:
1154 case AMDGPU::SI_SPILL_A512_RESTORE:
1155 case AMDGPU::SI_SPILL_AV512_SAVE:
1156 case AMDGPU::SI_SPILL_AV512_RESTORE:
1157 return 16;
1158 case AMDGPU::SI_SPILL_S384_SAVE:
1159 case AMDGPU::SI_SPILL_S384_RESTORE:
1160 case AMDGPU::SI_SPILL_V384_SAVE:
1161 case AMDGPU::SI_SPILL_V384_RESTORE:
1162 case AMDGPU::SI_SPILL_A384_SAVE:
1163 case AMDGPU::SI_SPILL_A384_RESTORE:
1164 case AMDGPU::SI_SPILL_AV384_SAVE:
1165 case AMDGPU::SI_SPILL_AV384_RESTORE:
1166 return 12;
1167 case AMDGPU::SI_SPILL_S352_SAVE:
1168 case AMDGPU::SI_SPILL_S352_RESTORE:
1169 case AMDGPU::SI_SPILL_V352_SAVE:
1170 case AMDGPU::SI_SPILL_V352_RESTORE:
1171 case AMDGPU::SI_SPILL_A352_SAVE:
1172 case AMDGPU::SI_SPILL_A352_RESTORE:
1173 case AMDGPU::SI_SPILL_AV352_SAVE:
1174 case AMDGPU::SI_SPILL_AV352_RESTORE:
1175 return 11;
1176 case AMDGPU::SI_SPILL_S320_SAVE:
1177 case AMDGPU::SI_SPILL_S320_RESTORE:
1178 case AMDGPU::SI_SPILL_V320_SAVE:
1179 case AMDGPU::SI_SPILL_V320_RESTORE:
1180 case AMDGPU::SI_SPILL_A320_SAVE:
1181 case AMDGPU::SI_SPILL_A320_RESTORE:
1182 case AMDGPU::SI_SPILL_AV320_SAVE:
1183 case AMDGPU::SI_SPILL_AV320_RESTORE:
1184 return 10;
1185 case AMDGPU::SI_SPILL_S288_SAVE:
1186 case AMDGPU::SI_SPILL_S288_RESTORE:
1187 case AMDGPU::SI_SPILL_V288_SAVE:
1188 case AMDGPU::SI_SPILL_V288_RESTORE:
1189 case AMDGPU::SI_SPILL_A288_SAVE:
1190 case AMDGPU::SI_SPILL_A288_RESTORE:
1191 case AMDGPU::SI_SPILL_AV288_SAVE:
1192 case AMDGPU::SI_SPILL_AV288_RESTORE:
1193 return 9;
1194 case AMDGPU::SI_SPILL_S256_SAVE:
1195 case AMDGPU::SI_SPILL_S256_RESTORE:
1196 case AMDGPU::SI_SPILL_V256_SAVE:
1197 case AMDGPU::SI_SPILL_V256_RESTORE:
1198 case AMDGPU::SI_SPILL_A256_SAVE:
1199 case AMDGPU::SI_SPILL_A256_RESTORE:
1200 case AMDGPU::SI_SPILL_AV256_SAVE:
1201 case AMDGPU::SI_SPILL_AV256_RESTORE:
1202 return 8;
1203 case AMDGPU::SI_SPILL_S224_SAVE:
1204 case AMDGPU::SI_SPILL_S224_RESTORE:
1205 case AMDGPU::SI_SPILL_V224_SAVE:
1206 case AMDGPU::SI_SPILL_V224_RESTORE:
1207 case AMDGPU::SI_SPILL_A224_SAVE:
1208 case AMDGPU::SI_SPILL_A224_RESTORE:
1209 case AMDGPU::SI_SPILL_AV224_SAVE:
1210 case AMDGPU::SI_SPILL_AV224_RESTORE:
1211 return 7;
1212 case AMDGPU::SI_SPILL_S192_SAVE:
1213 case AMDGPU::SI_SPILL_S192_RESTORE:
1214 case AMDGPU::SI_SPILL_V192_SAVE:
1215 case AMDGPU::SI_SPILL_V192_RESTORE:
1216 case AMDGPU::SI_SPILL_A192_SAVE:
1217 case AMDGPU::SI_SPILL_A192_RESTORE:
1218 case AMDGPU::SI_SPILL_AV192_SAVE:
1219 case AMDGPU::SI_SPILL_AV192_RESTORE:
1220 return 6;
1221 case AMDGPU::SI_SPILL_S160_SAVE:
1222 case AMDGPU::SI_SPILL_S160_RESTORE:
1223 case AMDGPU::SI_SPILL_V160_SAVE:
1224 case AMDGPU::SI_SPILL_V160_RESTORE:
1225 case AMDGPU::SI_SPILL_A160_SAVE:
1226 case AMDGPU::SI_SPILL_A160_RESTORE:
1227 case AMDGPU::SI_SPILL_AV160_SAVE:
1228 case AMDGPU::SI_SPILL_AV160_RESTORE:
1229 return 5;
1230 case AMDGPU::SI_SPILL_S128_SAVE:
1231 case AMDGPU::SI_SPILL_S128_RESTORE:
1232 case AMDGPU::SI_SPILL_V128_SAVE:
1233 case AMDGPU::SI_SPILL_V128_RESTORE:
1234 case AMDGPU::SI_SPILL_A128_SAVE:
1235 case AMDGPU::SI_SPILL_A128_RESTORE:
1236 case AMDGPU::SI_SPILL_AV128_SAVE:
1237 case AMDGPU::SI_SPILL_AV128_RESTORE:
1238 return 4;
1239 case AMDGPU::SI_SPILL_S96_SAVE:
1240 case AMDGPU::SI_SPILL_S96_RESTORE:
1241 case AMDGPU::SI_SPILL_V96_SAVE:
1242 case AMDGPU::SI_SPILL_V96_RESTORE:
1243 case AMDGPU::SI_SPILL_A96_SAVE:
1244 case AMDGPU::SI_SPILL_A96_RESTORE:
1245 case AMDGPU::SI_SPILL_AV96_SAVE:
1246 case AMDGPU::SI_SPILL_AV96_RESTORE:
1247 return 3;
1248 case AMDGPU::SI_SPILL_S64_SAVE:
1249 case AMDGPU::SI_SPILL_S64_RESTORE:
1250 case AMDGPU::SI_SPILL_V64_SAVE:
1251 case AMDGPU::SI_SPILL_V64_RESTORE:
1252 case AMDGPU::SI_SPILL_A64_SAVE:
1253 case AMDGPU::SI_SPILL_A64_RESTORE:
1254 case AMDGPU::SI_SPILL_AV64_SAVE:
1255 case AMDGPU::SI_SPILL_AV64_RESTORE:
1256 return 2;
 // Single-DWORD spills, including WWM and 16-bit VGPR spills (a 16-bit
 // spill still occupies one lane).
1257 case AMDGPU::SI_SPILL_S32_SAVE:
1258 case AMDGPU::SI_SPILL_S32_RESTORE:
1259 case AMDGPU::SI_SPILL_V32_SAVE:
1260 case AMDGPU::SI_SPILL_V32_RESTORE:
1261 case AMDGPU::SI_SPILL_A32_SAVE:
1262 case AMDGPU::SI_SPILL_A32_RESTORE:
1263 case AMDGPU::SI_SPILL_AV32_SAVE:
1264 case AMDGPU::SI_SPILL_AV32_RESTORE:
1265 case AMDGPU::SI_SPILL_WWM_V32_SAVE:
1266 case AMDGPU::SI_SPILL_WWM_V32_RESTORE:
1267 case AMDGPU::SI_SPILL_WWM_AV32_SAVE:
1268 case AMDGPU::SI_SPILL_WWM_AV32_RESTORE:
1269 case AMDGPU::SI_SPILL_V16_SAVE:
1270 case AMDGPU::SI_SPILL_V16_RESTORE:
1271 return 1;
1272 default: llvm_unreachable("Invalid spill opcode");
1273 }
1274}
1275
1276static int getOffsetMUBUFStore(unsigned Opc) {
1277 switch (Opc) {
1278 case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
1279 return AMDGPU::BUFFER_STORE_DWORD_OFFSET;
1280 case AMDGPU::BUFFER_STORE_BYTE_OFFEN:
1281 return AMDGPU::BUFFER_STORE_BYTE_OFFSET;
1282 case AMDGPU::BUFFER_STORE_SHORT_OFFEN:
1283 return AMDGPU::BUFFER_STORE_SHORT_OFFSET;
1284 case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
1285 return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
1286 case AMDGPU::BUFFER_STORE_DWORDX3_OFFEN:
1287 return AMDGPU::BUFFER_STORE_DWORDX3_OFFSET;
1288 case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN:
1289 return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
1290 case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN:
1291 return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET;
1292 case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN:
1293 return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET;
1294 default:
1295 return -1;
1296 }
1297}
1298
1299static int getOffsetMUBUFLoad(unsigned Opc) {
1300 switch (Opc) {
1301 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
1302 return AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
1303 case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN:
1304 return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET;
1305 case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN:
1306 return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET;
1307 case AMDGPU::BUFFER_LOAD_USHORT_OFFEN:
1308 return AMDGPU::BUFFER_LOAD_USHORT_OFFSET;
1309 case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN:
1310 return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET;
1311 case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN:
1312 return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
1313 case AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN:
1314 return AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET;
1315 case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN:
1316 return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET;
1317 case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN:
1318 return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET;
1319 case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN:
1320 return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET;
1321 case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN:
1322 return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET;
1323 case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN:
1324 return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET;
1325 case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN:
1326 return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET;
1327 case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN:
1328 return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET;
1329 default:
1330 return -1;
1331 }
1332}
1333
1334static int getOffenMUBUFStore(unsigned Opc) {
1335 switch (Opc) {
1336 case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
1337 return AMDGPU::BUFFER_STORE_DWORD_OFFEN;
1338 case AMDGPU::BUFFER_STORE_BYTE_OFFSET:
1339 return AMDGPU::BUFFER_STORE_BYTE_OFFEN;
1340 case AMDGPU::BUFFER_STORE_SHORT_OFFSET:
1341 return AMDGPU::BUFFER_STORE_SHORT_OFFEN;
1342 case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET:
1343 return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN;
1344 case AMDGPU::BUFFER_STORE_DWORDX3_OFFSET:
1345 return AMDGPU::BUFFER_STORE_DWORDX3_OFFEN;
1346 case AMDGPU::BUFFER_STORE_DWORDX4_OFFSET:
1347 return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN;
1348 case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET:
1349 return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN;
1350 case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET:
1351 return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN;
1352 default:
1353 return -1;
1354 }
1355}
1356
1357static int getOffenMUBUFLoad(unsigned Opc) {
1358 switch (Opc) {
1359 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
1360 return AMDGPU::BUFFER_LOAD_DWORD_OFFEN;
1361 case AMDGPU::BUFFER_LOAD_UBYTE_OFFSET:
1362 return AMDGPU::BUFFER_LOAD_UBYTE_OFFEN;
1363 case AMDGPU::BUFFER_LOAD_SBYTE_OFFSET:
1364 return AMDGPU::BUFFER_LOAD_SBYTE_OFFEN;
1365 case AMDGPU::BUFFER_LOAD_USHORT_OFFSET:
1366 return AMDGPU::BUFFER_LOAD_USHORT_OFFEN;
1367 case AMDGPU::BUFFER_LOAD_SSHORT_OFFSET:
1368 return AMDGPU::BUFFER_LOAD_SSHORT_OFFEN;
1369 case AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET:
1370 return AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
1371 case AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET:
1372 return AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN;
1373 case AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET:
1374 return AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
1375 case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET:
1376 return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN;
1377 case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET:
1378 return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN;
1379 case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET:
1380 return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN;
1381 case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET:
1382 return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN;
1383 case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET:
1384 return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN;
1385 case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET:
1386 return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN;
1387 default:
1388 return -1;
1389 }
1390}
1391
// Try to satisfy one 32-bit lane of a VGPR spill/restore at (Index, Lane)
// with the AGPR (or VGPR) that was reserved for it, instead of a memory
// access. Returns a null MachineInstrBuilder when no register was reserved
// for that slot, in which case the caller must emit a real load/store.
// NOTE(review): the leading signature lines and the declaration of MFI
// (source lines 1392-1394 and 1398) are elided in this rendering; confirm
// against the full source.
1395 int Index, unsigned Lane,
1396 unsigned ValueReg, bool IsKill) {
1397 MachineFunction *MF = MBB.getParent();
1399 const SIInstrInfo *TII = ST.getInstrInfo();
1400
1401 MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane);
1402
 // No register reserved for this lane: signal the caller to use memory.
1403 if (Reg == AMDGPU::NoRegister)
1404 return MachineInstrBuilder();
1405
1406 bool IsStore = MI->mayStore();
1407 MachineRegisterInfo &MRI = MF->getRegInfo();
1408 auto *TRI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
1409
 // A store moves the value into the reserved register; a restore moves it
 // back out.
1410 unsigned Dst = IsStore ? Reg : ValueReg;
1411 unsigned Src = IsStore ? ValueReg : Reg;
1412 bool IsVGPR = TRI->isVGPR(MRI, Reg);
1413 const DebugLoc &DL = MI->getDebugLoc();
1414 if (IsVGPR == TRI->isVGPR(MRI, ValueReg)) {
1415 // Spiller during regalloc may restore a spilled register to its superclass.
1416 // It could result in AGPR spills restored to VGPRs or the other way around,
1417 // making the src and dst with identical regclasses at this point. It just
1418 // needs a copy in such cases.
1419 auto CopyMIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), Dst)
1420 .addReg(Src, getKillRegState(IsKill));
 // NOTE(review): a statement following the copy (source line 1421) is
 // elided in this rendering; confirm against the full source.
1422 return CopyMIB;
1423 }
 // Registers are in different banks: pick the accvgpr move direction.
 // WRITE moves a VGPR into an AGPR, READ the reverse.
1424 unsigned Opc = (IsStore ^ IsVGPR) ? AMDGPU::V_ACCVGPR_WRITE_B32_e64
1425 : AMDGPU::V_ACCVGPR_READ_B32_e64;
1426
1427 auto MIB = BuildMI(MBB, MI, DL, TII->get(Opc), Dst)
1428 .addReg(Src, getKillRegState(IsKill));
 // NOTE(review): a statement following the move (source line 1429) is
 // elided in this rendering; confirm against the full source.
1430 return MIB;
1431}
1432
1433// This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
1434// need to handle the case where an SGPR may need to be spilled while spilling.
// Rewrite an OFFEN MUBUF scratch access as its immediate-OFFSET form (or as
// an AGPR move if one was reserved for the slot). Returns false when the
// opcode has no OFFSET counterpart, leaving MI untouched.
// NOTE(review): the name/signature lines of this function (source lines
// 1435 and 1437) are elided in this rendering; confirm against the full
// source.
1436 MachineFrameInfo &MFI,
1438 int Index,
1439 int64_t Offset) {
1440 const SIInstrInfo *TII = ST.getInstrInfo();
1441 MachineBasicBlock *MBB = MI->getParent();
1442 const DebugLoc &DL = MI->getDebugLoc();
1443 bool IsStore = MI->mayStore();
1444
1445 unsigned Opc = MI->getOpcode();
1446 int LoadStoreOp = IsStore ?
 // NOTE(review): the arms of this conditional (source line 1447,
 // presumably getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc)) are
 // elided in this rendering; confirm against the full source.
1448 if (LoadStoreOp == -1)
1449 return false;
1450
 // Prefer turning the whole access into an AGPR move when a register was
 // reserved for this spill slot.
1451 const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
1452 if (spillVGPRtoAGPR(ST, *MBB, MI, Index, 0, Reg->getReg(), false).getInstr())
1453 return true;
1454
 // Rebuild the access in OFFSET form, carrying over the original resource
 // descriptor, scalar offset, and memory operands.
1455 MachineInstrBuilder NewMI =
1456 BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
1457 .add(*Reg)
1458 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
1459 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
1460 .addImm(Offset)
1461 .addImm(0) // cpol
1462 .addImm(0) // swz
1463 .cloneMemRefs(*MI);
1464
1465 const MachineOperand *VDataIn = TII->getNamedOperand(*MI,
1466 AMDGPU::OpName::vdata_in);
1467 if (VDataIn)
1468 NewMI.add(*VDataIn);
1469 return true;
1470}
1471
// Select the flat-scratch (SCRATCH_*) opcode for a spill chunk of EltSize
// bytes (4-16), then convert the SADDR form to the SV or ST addressing
// variant so the operand list matches the original opcode. Block
// load/stores are returned unchanged.
// NOTE(review): the first signature line (source line 1472) is elided in
// this rendering; the visible parameters are (… TII, LoadStoreOp, EltSize)
// — confirm against the full source.
1473 unsigned LoadStoreOp,
1474 unsigned EltSize) {
1475 bool IsStore = TII->get(LoadStoreOp).mayStore();
 // Record which address operands the original opcode carries so the same
 // addressing variant can be restored after picking the sized opcode.
1476 bool HasVAddr = AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::vaddr);
1477 bool UseST =
1478 !HasVAddr && !AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::saddr);
1479
1480 // Handle block load/store first.
1481 if (TII->isBlockLoadStore(LoadStoreOp))
1482 return LoadStoreOp;
1483
 // Pick the SADDR opcode matching the chunk size in bytes.
1484 switch (EltSize) {
1485 case 4:
1486 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
1487 : AMDGPU::SCRATCH_LOAD_DWORD_SADDR;
1488 break;
1489 case 8:
1490 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX2_SADDR
1491 : AMDGPU::SCRATCH_LOAD_DWORDX2_SADDR;
1492 break;
1493 case 12:
1494 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX3_SADDR
1495 : AMDGPU::SCRATCH_LOAD_DWORDX3_SADDR;
1496 break;
1497 case 16:
1498 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX4_SADDR
1499 : AMDGPU::SCRATCH_LOAD_DWORDX4_SADDR;
1500 break;
1501 default:
1502 llvm_unreachable("Unexpected spill load/store size!");
1503 }
1504
 // Restore the original addressing variant.
1505 if (HasVAddr)
1506 LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp);
1507 else if (UseST)
1508 LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
1509
1510 return LoadStoreOp;
1511}
1512
1515 unsigned LoadStoreOp, int Index, Register ValueReg, bool IsKill,
1516 MCRegister ScratchOffsetReg, int64_t InstOffset, MachineMemOperand *MMO,
1517 RegScavenger *RS, LiveRegUnits *LiveUnits) const {
1518 assert((!RS || !LiveUnits) && "Only RS or LiveUnits can be set but not both");
1519
1520 MachineFunction *MF = MBB.getParent();
1521 const SIInstrInfo *TII = ST.getInstrInfo();
1522 const MachineFrameInfo &MFI = MF->getFrameInfo();
1523 const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
1524
1525 const MCInstrDesc *Desc = &TII->get(LoadStoreOp);
1526 bool IsStore = Desc->mayStore();
1527 bool IsFlat = TII->isFLATScratch(LoadStoreOp);
1528 bool IsBlock = TII->isBlockLoadStore(LoadStoreOp);
1529
1530 bool CanClobberSCC = false;
1531 bool Scavenged = false;
1532 MCRegister SOffset = ScratchOffsetReg;
1533
1534 const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
1535 // On gfx90a+ AGPR is a regular VGPR acceptable for loads and stores.
1536 const bool IsAGPR = !ST.hasGFX90AInsts() && isAGPRClass(RC);
1537 const unsigned RegWidth = AMDGPU::getRegBitWidth(*RC) / 8;
1538
1539 // On targets with register tuple alignment requirements,
1540 // for unaligned tuples, break the spill into 32-bit pieces.
1541 // TODO: Optimize misaligned spills by using larger aligned chunks instead of
1542 // 32-bit splits.
1543 bool IsRegMisaligned = false;
1544 if (!IsBlock && RegWidth > 4) {
1545 unsigned SpillOpcode =
1546 getFlatScratchSpillOpcode(TII, LoadStoreOp, std::min(RegWidth, 16u));
1547 int VDataIdx =
1548 IsStore ? AMDGPU::getNamedOperandIdx(SpillOpcode, AMDGPU::OpName::vdata)
1549 : 0; // Restore Ops have data reg as the first (output) operand.
1550 const TargetRegisterClass *ExpectedRC =
1551 TII->getRegClass(TII->get(SpillOpcode), VDataIdx);
1552 if (!ExpectedRC->contains(ValueReg)) {
1553 unsigned NumRegs = std::min(AMDGPU::getRegBitWidth(*ExpectedRC) / 4, 4u);
1554 unsigned SubIdx = getSubRegFromChannel(0, NumRegs);
1555 const TargetRegisterClass *MatchRC =
1556 getMatchingSuperRegClass(RC, ExpectedRC, SubIdx);
1557 if (!MatchRC || !MatchRC->contains(ValueReg))
1558 IsRegMisaligned = true;
1559 }
1560 }
1561 // Always use 4 byte operations for AGPRs because we need to scavenge
1562 // a temporary VGPR.
1563 // If we're using a block operation, the element should be the whole block.
1564 // For misaligned registers, use 4-byte elements to avoid alignment errors.
1565 unsigned EltSize = IsBlock ? RegWidth
1566 : (IsFlat && !IsAGPR && !IsRegMisaligned)
1567 ? std::min(RegWidth, 16u)
1568 : 4u;
1569 unsigned NumSubRegs = RegWidth / EltSize;
1570 unsigned Size = NumSubRegs * EltSize;
1571 unsigned RemSize = RegWidth - Size;
1572 unsigned NumRemSubRegs = RemSize ? 1 : 0;
1573 int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
1574 int64_t MaterializedOffset = Offset;
1575
1576 // Maxoffset is the starting offset for the last chunk to be spilled.
1577 // In case of non-zero remainder element, max offset will be the
1578 // last address(offset + Size) after spilling all the EltSize chunks.
1579 int64_t MaxOffset = Offset + Size - (RemSize ? 0 : EltSize);
1580 int64_t ScratchOffsetRegDelta = 0;
1581
1582 if (IsFlat && EltSize > 4) {
1583 LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
1584 Desc = &TII->get(LoadStoreOp);
1585 }
1586
1587 Align Alignment = MFI.getObjectAlign(Index);
1588 const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();
1589
1590 assert((IsFlat || ((Offset % EltSize) == 0)) &&
1591 "unexpected VGPR spill offset");
1592
1593 // Track a VGPR to use for a constant offset we need to materialize.
1594 Register TmpOffsetVGPR;
1595
1596 // Track a VGPR to use as an intermediate value.
1597 Register TmpIntermediateVGPR;
1598 bool UseVGPROffset = false;
1599
1600 // Materialize a VGPR offset required for the given SGPR/VGPR/Immediate
1601 // combination.
1602 auto MaterializeVOffset = [&](Register SGPRBase, Register TmpVGPR,
1603 int64_t VOffset) {
1604 // We are using a VGPR offset
1605 if (IsFlat && SGPRBase) {
1606 // We only have 1 VGPR offset, or 1 SGPR offset. We don't have a free
1607 // SGPR, so perform the add as vector.
1608 // We don't need a base SGPR in the kernel.
1609
1610 if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) >= 2) {
1611 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), TmpVGPR)
1612 .addReg(SGPRBase)
1613 .addImm(VOffset)
1614 .addImm(0); // clamp
1615 } else {
1616 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
1617 .addReg(SGPRBase);
1618 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e32), TmpVGPR)
1619 .addImm(VOffset)
1620 .addReg(TmpOffsetVGPR);
1621 }
1622 } else {
1623 assert(TmpOffsetVGPR);
1624 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
1625 .addImm(VOffset);
1626 }
1627 };
1628
1629 bool IsOffsetLegal =
1630 IsFlat ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
1632 : TII->isLegalMUBUFImmOffset(MaxOffset);
1633 if (!IsOffsetLegal || (IsFlat && !SOffset && !ST.hasFlatScratchSTMode())) {
1634 SOffset = MCRegister();
1635
1636 // We don't have access to the register scavenger if this function is called
1637 // during PEI::scavengeFrameVirtualRegs() so use LiveUnits in this case.
1638 // TODO: Clobbering SCC is not necessary for scratch instructions in the
1639 // entry.
1640 if (RS) {
1641 SOffset = RS->scavengeRegisterBackwards(AMDGPU::SGPR_32RegClass, MI, false, 0, false);
1642
1643 // Piggy back on the liveness scan we just did see if SCC is dead.
1644 CanClobberSCC = !RS->isRegUsed(AMDGPU::SCC);
1645 } else if (LiveUnits) {
1646 CanClobberSCC = LiveUnits->available(AMDGPU::SCC);
1647 for (MCRegister Reg : AMDGPU::SGPR_32RegClass) {
1648 if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(Reg)) {
1649 SOffset = Reg;
1650 break;
1651 }
1652 }
1653 }
1654
1655 if (ScratchOffsetReg != AMDGPU::NoRegister && !CanClobberSCC)
1656 SOffset = Register();
1657
1658 if (!SOffset) {
1659 UseVGPROffset = true;
1660
1661 if (RS) {
1662 TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0);
1663 } else {
1664 assert(LiveUnits);
1665 for (MCRegister Reg : AMDGPU::VGPR_32RegClass) {
1666 if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(Reg)) {
1667 TmpOffsetVGPR = Reg;
1668 break;
1669 }
1670 }
1671 }
1672
1673 assert(TmpOffsetVGPR);
1674 } else if (!SOffset && CanClobberSCC) {
1675 // There are no free SGPRs, and since we are in the process of spilling
1676 // VGPRs too. Since we need a VGPR in order to spill SGPRs (this is true
1677 // on SI/CI and on VI it is true until we implement spilling using scalar
1678 // stores), we have no way to free up an SGPR. Our solution here is to
1679 // add the offset directly to the ScratchOffset or StackPtrOffset
1680 // register, and then subtract the offset after the spill to return the
1681 // register to it's original value.
1682
1683 // TODO: If we don't have to do an emergency stack slot spill, converting
1684 // to use the VGPR offset is fewer instructions.
1685 if (!ScratchOffsetReg)
1686 ScratchOffsetReg = FuncInfo->getStackPtrOffsetReg();
1687 SOffset = ScratchOffsetReg;
1688 ScratchOffsetRegDelta = Offset;
1689 } else {
1690 Scavenged = true;
1691 }
1692
1693 // We currently only support spilling VGPRs to EltSize boundaries, meaning
1694 // we can simplify the adjustment of Offset here to just scale with
1695 // WavefrontSize.
1696 if (!IsFlat && !UseVGPROffset)
1697 Offset *= ST.getWavefrontSize();
1698
1699 if (!UseVGPROffset && !SOffset)
1700 report_fatal_error("could not scavenge SGPR to spill in entry function");
1701
1702 if (UseVGPROffset) {
1703 // We are using a VGPR offset
1704 MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, Offset);
1705 } else if (ScratchOffsetReg == AMDGPU::NoRegister) {
1706 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset).addImm(Offset);
1707 } else {
1708 assert(Offset != 0);
1709 auto Add = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
1710 .addReg(ScratchOffsetReg)
1711 .addImm(Offset);
1712 Add->getOperand(3).setIsDead(); // Mark SCC as dead.
1713 }
1714
1715 Offset = 0;
1716 }
1717
1718 if (IsFlat && SOffset == AMDGPU::NoRegister) {
1719 assert(AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0
1720 && "Unexpected vaddr for flat scratch with a FI operand");
1721
1722 if (UseVGPROffset) {
1723 LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp);
1724 } else {
1725 assert(ST.hasFlatScratchSTMode());
1726 assert(!TII->isBlockLoadStore(LoadStoreOp) && "Block ops don't have ST");
1727 LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
1728 }
1729
1730 Desc = &TII->get(LoadStoreOp);
1731 }
1732
1733 for (unsigned i = 0, e = NumSubRegs + NumRemSubRegs, RegOffset = 0; i != e;
1734 ++i, RegOffset += EltSize) {
1735 if (i == NumSubRegs) {
1736 EltSize = RemSize;
1737 LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
1738 }
1739 Desc = &TII->get(LoadStoreOp);
1740
1741 if (!IsFlat && UseVGPROffset) {
1742 int NewLoadStoreOp = IsStore ? getOffenMUBUFStore(LoadStoreOp)
1743 : getOffenMUBUFLoad(LoadStoreOp);
1744 Desc = &TII->get(NewLoadStoreOp);
1745 }
1746
1747 if (UseVGPROffset && TmpOffsetVGPR == TmpIntermediateVGPR) {
1748 // If we are spilling an AGPR beyond the range of the memory instruction
1749 // offset and need to use a VGPR offset, we ideally have at least 2
1750 // scratch VGPRs. If we don't have a second free VGPR without spilling,
1751 // recycle the VGPR used for the offset which requires resetting after
1752 // each subregister.
1753
1754 MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, MaterializedOffset);
1755 }
1756
1757 unsigned NumRegs = EltSize / 4;
1758 Register SubReg = e == 1
1759 ? ValueReg
1760 : Register(getSubReg(ValueReg,
1761 getSubRegFromChannel(RegOffset / 4, NumRegs)));
1762
1763 RegState SOffsetRegState = {};
1764 RegState SrcDstRegState = getDefRegState(!IsStore);
1765 const bool IsLastSubReg = i + 1 == e;
1766 const bool IsFirstSubReg = i == 0;
1767 if (IsLastSubReg) {
1768 SOffsetRegState |= getKillRegState(Scavenged);
1769 // The last implicit use carries the "Kill" flag.
1770 SrcDstRegState |= getKillRegState(IsKill);
1771 }
1772
1773 // Make sure the whole register is defined if there are undef components by
1774 // adding an implicit def of the super-reg on the first instruction.
1775 bool NeedSuperRegDef = e > 1 && IsStore && IsFirstSubReg;
1776 bool NeedSuperRegImpOperand = e > 1;
1777
1778 // Remaining element size to spill into memory after some parts of it
1779 // spilled into either AGPRs or VGPRs.
1780 unsigned RemEltSize = EltSize;
1781
1782 // AGPRs to spill VGPRs and vice versa are allocated in a reverse order,
1783 // starting from the last lane. In case if a register cannot be completely
1784 // spilled into another register that will ensure its alignment does not
1785 // change. For targets with VGPR alignment requirement this is important
1786 // in case of flat scratch usage as we might get a scratch_load or
1787 // scratch_store of an unaligned register otherwise.
1788 for (int LaneS = (RegOffset + EltSize) / 4 - 1, Lane = LaneS,
1789 LaneE = RegOffset / 4;
1790 Lane >= LaneE; --Lane) {
1791 bool IsSubReg = e > 1 || EltSize > 4;
1792 Register Sub = IsSubReg
1793 ? Register(getSubReg(ValueReg, getSubRegFromChannel(Lane)))
1794 : ValueReg;
1795 auto MIB = spillVGPRtoAGPR(ST, MBB, MI, Index, Lane, Sub, IsKill);
1796 if (!MIB.getInstr())
1797 break;
1798 if (NeedSuperRegDef || (IsSubReg && IsStore && Lane == LaneS && IsFirstSubReg)) {
1799 MIB.addReg(ValueReg, RegState::ImplicitDefine);
1800 NeedSuperRegDef = false;
1801 }
1802 if ((IsSubReg || NeedSuperRegImpOperand) && (IsFirstSubReg || IsLastSubReg)) {
1803 NeedSuperRegImpOperand = true;
1804 RegState State = SrcDstRegState;
1805 if (!IsLastSubReg || (Lane != LaneE))
1806 State &= ~RegState::Kill;
1807 if (!IsFirstSubReg || (Lane != LaneS))
1808 State &= ~RegState::Define;
1809 MIB.addReg(ValueReg, RegState::Implicit | State);
1810 }
1811 RemEltSize -= 4;
1812 }
1813
1814 if (!RemEltSize) // Fully spilled into AGPRs.
1815 continue;
1816
1817 if (RemEltSize != EltSize) { // Partially spilled to AGPRs
1818 assert(IsFlat && EltSize > 4);
1819
1820 unsigned NumRegs = RemEltSize / 4;
1821 SubReg = Register(getSubReg(ValueReg,
1822 getSubRegFromChannel(RegOffset / 4, NumRegs)));
1823 unsigned Opc = getFlatScratchSpillOpcode(TII, LoadStoreOp, RemEltSize);
1824 Desc = &TII->get(Opc);
1825 }
1826
1827 unsigned FinalReg = SubReg;
1828
1829 if (IsAGPR) {
1830 assert(EltSize == 4);
1831
1832 if (!TmpIntermediateVGPR) {
1833 TmpIntermediateVGPR = FuncInfo->getVGPRForAGPRCopy();
1834 assert(MF->getRegInfo().isReserved(TmpIntermediateVGPR));
1835 }
1836 if (IsStore) {
1837 auto AccRead = BuildMI(MBB, MI, DL,
1838 TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64),
1839 TmpIntermediateVGPR)
1840 .addReg(SubReg, getKillRegState(IsKill));
1841 if (NeedSuperRegDef)
1842 AccRead.addReg(ValueReg, RegState::ImplicitDefine);
1843 if (NeedSuperRegImpOperand && (IsFirstSubReg || IsLastSubReg))
1844 AccRead.addReg(ValueReg, RegState::Implicit);
1846 }
1847 SubReg = TmpIntermediateVGPR;
1848 } else if (UseVGPROffset) {
1849 if (!TmpOffsetVGPR) {
1850 TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
1851 MI, false, 0);
1852 RS->setRegUsed(TmpOffsetVGPR);
1853 }
1854 }
1855
1856 Register FinalValueReg = ValueReg;
1857 if (LoadStoreOp == AMDGPU::SCRATCH_LOAD_USHORT_SADDR) {
1858 // If we are loading 16-bit value with SRAMECC endabled we need a temp
1859 // 32-bit VGPR to load and extract 16-bits into the final register.
1860 ValueReg =
1861 RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0);
1862 SubReg = ValueReg;
1863 IsKill = false;
1864 }
1865
1866 // Create the MMO, additional set the NonVolatile flag as scratch memory
1867 // used for spills will not be used outside the thread.
1868 MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RegOffset);
1870 PInfo, MMO->getFlags() | MOThreadPrivate, RemEltSize,
1871 commonAlignment(Alignment, RegOffset));
1872
1873 auto MIB =
1874 BuildMI(MBB, MI, DL, *Desc)
1875 .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill));
1876
1877 if (UseVGPROffset) {
1878 // For an AGPR spill, we reuse the same temp VGPR for the offset and the
1879 // intermediate accvgpr_write.
1880 MIB.addReg(TmpOffsetVGPR, getKillRegState(IsLastSubReg && !IsAGPR));
1881 }
1882
1883 if (!IsFlat)
1884 MIB.addReg(FuncInfo->getScratchRSrcReg());
1885
1886 if (SOffset == AMDGPU::NoRegister) {
1887 if (!IsFlat) {
1888 if (UseVGPROffset && ScratchOffsetReg) {
1889 MIB.addReg(ScratchOffsetReg);
1890 } else {
1891 assert(FuncInfo->isBottomOfStack());
1892 MIB.addImm(0);
1893 }
1894 }
1895 } else {
1896 MIB.addReg(SOffset, SOffsetRegState);
1897 }
1898
1899 MIB.addImm(Offset + RegOffset);
1900
1901 bool LastUse = MMO->getFlags() & MOLastUse;
1902 MIB.addImm(LastUse ? AMDGPU::CPol::TH_LU : 0); // cpol
1903
1904 if (!IsFlat)
1905 MIB.addImm(0); // swz
1906 MIB.addMemOperand(NewMMO);
1907
1908 if (FinalValueReg != ValueReg) {
1909 // Extract 16-bit from the loaded 32-bit value.
1910 ValueReg = getSubReg(ValueReg, AMDGPU::lo16);
1911 MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B16_t16_e64))
1912 .addReg(FinalValueReg, getDefRegState(true))
1913 .addImm(0)
1914 .addReg(ValueReg, getKillRegState(true))
1915 .addImm(0);
1916 ValueReg = FinalValueReg;
1917 }
1918
1919 if (!IsAGPR && NeedSuperRegDef)
1920 MIB.addReg(ValueReg, RegState::ImplicitDefine);
1921
1922 if (!IsStore && IsAGPR && TmpIntermediateVGPR != AMDGPU::NoRegister) {
1923 MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64),
1924 FinalReg)
1925 .addReg(TmpIntermediateVGPR, RegState::Kill);
1927 }
1928
1929 bool IsSrcDstDef = hasRegState(SrcDstRegState, RegState::Define);
1930 bool PartialReloadCopy = (RemEltSize != EltSize) && !IsStore;
1931 if (NeedSuperRegImpOperand &&
1932 (IsFirstSubReg || (IsLastSubReg && !IsSrcDstDef))) {
1933 MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
1934 if (PartialReloadCopy)
1935 MIB.addReg(ValueReg, RegState::Implicit);
1936 }
1937
1938 // The epilog restore of a wwm-scratch register can cause undesired
1939 // optimization during machine-cp post PrologEpilogInserter if the same
1940 // register was assigned for return value ABI lowering with a COPY
1941 // instruction. As given below, with the epilog reload, the earlier COPY
1942 // appeared to be dead during machine-cp.
1943 // ...
1944 // v0 in WWM operation, needs the WWM spill at prolog/epilog.
1945 // $vgpr0 = V_WRITELANE_B32 $sgpr20, 0, $vgpr0
1946 // ...
1947 // Epilog block:
1948 // $vgpr0 = COPY $vgpr1 // outgoing value moved to v0
1949 // ...
1950 // WWM spill restore to preserve the inactive lanes of v0.
1951 // $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1
1952 // $vgpr0 = BUFFER_LOAD $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0
1953 // $exec = S_MOV_B64 killed $sgpr4_sgpr5
1954 // ...
1955 // SI_RETURN implicit $vgpr0
1956 // ...
1957 // To fix it, mark the same reg as a tied op for such restore instructions
1958 // so that it marks a usage for the preceding COPY.
1959 if (!IsStore && MI != MBB.end() && MI->isReturn() &&
1960 MI->readsRegister(SubReg, this)) {
1961 MIB.addReg(SubReg, RegState::Implicit);
1962 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1963 }
1964
1965 // If we're building a block load, we should add artificial uses for the
1966 // CSR VGPRs that are *not* being transferred. This is because liveness
1967 // analysis is not aware of the mask, so we need to somehow inform it that
1968 // those registers are not available before the load and they should not be
1969 // scavenged.
1970 if (!IsStore && TII->isBlockLoadStore(LoadStoreOp))
1971 addImplicitUsesForBlockCSRLoad(MIB, ValueReg);
1972 }
1973
1974 if (ScratchOffsetRegDelta != 0) {
1975 // Subtract the offset we added to the ScratchOffset register.
1976 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
1977 .addReg(SOffset)
1978 .addImm(-ScratchOffsetRegDelta);
1979 }
1980}
1981
// NOTE(review): the declarator line for this function was dropped by the
// extraction; from the body this is presumably
// SIRegisterInfo::addImplicitUsesForBlockCSRLoad(MachineInstrBuilder &MIB,
// Register BlockReg) — confirm against SIRegisterInfo.h.
//
// For a VGPR block load, add artificial implicit uses for every callee-saved
// VGPR in the 32-register block that is *not* covered by the block's transfer
// mask. Liveness analysis does not understand the mask, so without these uses
// it could consider those registers free and scavenge them across the load
// (see the explanatory comment at the call site in buildSpillLoadStore).
 1983 Register BlockReg) const {
 1984 const MachineFunction *MF = MIB->getMF();
 1985 const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
// Bit i of Mask set => register (BaseVGPR + i) is actually transferred by the
// block op and therefore must NOT get an artificial use.
 1986 uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(BlockReg);
 1987 Register BaseVGPR = getSubReg(BlockReg, AMDGPU::sub0);
// Offset 0 (the base VGPR itself) is skipped; only untouched callee-saved
// registers inside the 32-wide block receive the implicit use.
 1988 for (unsigned RegOffset = 1; RegOffset < 32; ++RegOffset)
 1989 if (!(Mask & (1 << RegOffset)) &&
 1990 isCalleeSavedPhysReg(BaseVGPR + RegOffset, *MF))
 1991 MIB.addUse(BaseVGPR + RegOffset, RegState::Implicit);
 1992}
1993
// NOTE(review): the declarator line was dropped by the extraction; from the
// SGPRSpillBuilder parameter usage this is presumably
// SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index,
// int Offset, bool IsLoad, bool IsKill) — confirm against SIRegisterInfo.h.
//
// Emit a single-DWORD load or store of SB.TmpVGPR to/from the stack slot
// `Index` at lane-offset `Offset` (scaled by SB.EltSize). Used by the
// SGPR-to-memory spill path to move the staging VGPR through scratch.
1995 int Offset, bool IsLoad,
 1996 bool IsKill) const {
 1997 // Load/store VGPR
 1998 MachineFrameInfo &FrameInfo = SB.MF.getFrameInfo();
// SGPR spill slots are handled by spillSGPR/restoreSGPR, never here.
 1999 assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill);
 2000
// Fixed objects are addressed off the base pointer when one exists; everything
// else goes through the frame register.
 2001 Register FrameReg =
 2002 FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(SB.MF)
 2003 ? getBaseRegister()
 2004 : getFrameRegister(SB.MF);
 2005
 2006 Align Alignment = FrameInfo.getObjectAlign(Index);
// NOTE(review): the lines creating the MachineMemOperand `MMO` (original
// lines 2007-2009) were lost in extraction; only its trailing arguments
// remain below — restore from upstream SIRegisterInfo.cpp.
 2010 SB.EltSize, Alignment);
 2011
 2012 if (IsLoad) {
// Pick the flat-scratch or MUBUF DWORD opcode depending on subtarget.
 2013 unsigned Opc = ST.hasFlatScratchEnabled()
 2014 ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
 2015 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
 2016 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, false,
 2017 FrameReg, (int64_t)Offset * SB.EltSize, MMO, SB.RS)
 2018 } else {
 2019 unsigned Opc = ST.hasFlatScratchEnabled()
 2020 ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
 2021 : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
 2022 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, IsKill,
 2023 FrameReg, (int64_t)Offset * SB.EltSize, MMO, SB.RS);
 2024 // This only ever adds one VGPR spill
 2025 SB.MFI.addToSpilledVGPRs(1);
 2026 }
 2027}
2028
// NOTE(review): the declarator line was dropped by the extraction; from the
// callers at L2197/L2253 this is presumably
// SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index,
// RegScavenger *RS, ...) — confirm against SIRegisterInfo.h.
//
// Lower an SI_SPILL_S*_SAVE pseudo. Two strategies:
//  * SpillToVGPR: write each 32-bit sub-register of the spilled SGPR into a
//    pre-allocated VGPR lane with SI_SPILL_S32_TO_VGPR (no memory traffic).
//  * Otherwise: pack sub-registers into SB.TmpVGPR lane by lane and store the
//    VGPR to the stack slot, one VGPR-full (PVD.PerVGPR lanes) at a time.
// Returns false only when OnlyToVGPR is set but no VGPR lanes were assigned;
// on success the pseudo is erased.
2030 RegScavenger *RS, SlotIndexes *Indexes,
 2031 LiveIntervals *LIS, bool OnlyToVGPR,
 2032 bool SpillToPhysVGPRLane) const {
 2033 assert(!MI->getOperand(0).isUndef() &&
 2034 "undef spill should have been deleted earlier");
 2035
 2036 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
 2037
// Lanes may live in dedicated physical VGPRs or in virtual-lane bookkeeping.
// NOTE(review): the ':' alternative of this ternary (original line 2040,
// presumably getSGPRSpillToVGPRLanes(Index)) was lost in extraction.
 2038 ArrayRef<SpilledReg> VGPRSpills =
 2039 SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index)
 2041 bool SpillToVGPR = !VGPRSpills.empty();
 2042 if (OnlyToVGPR && !SpillToVGPR)
 2043 return false;
 2044
// Spilling the SP/FP SGPRs to memory would itself need a frame reference.
 2045 assert(SpillToVGPR || (SB.SuperReg != SB.MFI.getStackPtrOffsetReg() &&
 2046 SB.SuperReg != SB.MFI.getFrameOffsetReg()));
 2047
 2048 if (SpillToVGPR) {
 2049
 2050 // Since stack slot coloring pass is trying to optimize SGPR spills,
 2051 // VGPR lanes (mapped from spill stack slot) may be shared for SGPR
 2052 // spills of different sizes. This accounts for number of VGPR lanes alloted
 2053 // equal to the largest SGPR being spilled in them.
 2054 assert(SB.NumSubRegs <= VGPRSpills.size() &&
 2055 "Num of SGPRs spilled should be less than or equal to num of "
 2056 "the VGPR lanes.");
 2057
 2058 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
// A single-sub-reg spill uses the register directly; multi-reg spills peel
// off 32-bit pieces via the precomputed SplitParts sub-register indices.
 2059 Register SubReg =
 2060 SB.NumSubRegs == 1
 2061 ? SB.SuperReg
 2062 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
 2063 SpilledReg Spill = VGPRSpills[i];
 2064
 2065 bool IsFirstSubreg = i == 0;
 2066 bool IsLastSubreg = i == SB.NumSubRegs - 1;
// Only the last write may kill the source super-register.
 2067 bool UseKill = SB.IsKill && IsLastSubreg;
 2068
 2069
 2070 // Mark the "old value of vgpr" input undef only if this is the first sgpr
 2071 // spill to this specific vgpr in the first basic block.
 2072 auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
 2073 SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), Spill.VGPR)
 2074 .addReg(SubReg, getKillRegState(UseKill))
 2075 .addImm(Spill.Lane)
 2076 .addReg(Spill.VGPR);
// Keep SlotIndexes consistent: the first new instruction replaces the pseudo
// in the maps, subsequent ones are inserted fresh.
 2077 if (Indexes) {
 2078 if (IsFirstSubreg)
 2079 Indexes->replaceMachineInstrInMaps(*MI, *MIB);
 2080 else
 2081 Indexes->insertMachineInstrInMaps(*MIB);
 2082 }
 2083
 2084 if (IsFirstSubreg && SB.NumSubRegs > 1) {
 2085 // We may be spilling a super-register which is only partially defined,
 2086 // and need to ensure later spills think the value is defined.
 2087 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
 2088 }
 2089
// NOTE(review): the statement body of this 'if' (original line 2091,
// presumably MIB.addReg(SB.SuperReg, RegState::Implicit | ...)) was lost in
// extraction — restore from upstream before compiling.
 2090 if (SB.NumSubRegs > 1 && (IsFirstSubreg || IsLastSubreg))
 2092
 2093 // FIXME: Since this spills to another register instead of an actual
 2094 // frame index, we should delete the frame index when all references to
 2095 // it are fixed.
 2096 }
 2097 } else {
// Memory path: prepare() scavenges/sets up SB.TmpVGPR and saves exec state.
 2098 SB.prepare();
 2099
 2100 // SubReg carries the "Kill" flag when SubReg == SB.SuperReg.
 2101 RegState SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill);
 2102
 2103 // Per VGPR helper data
 2104 auto PVD = SB.getPerVGPRData();
 2105
 2106 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
// First writelane into a fresh TmpVGPR may treat the old value as undef.
 2107 RegState TmpVGPRFlags = RegState::Undef;
 2108
 2109 // Write sub registers into the VGPR
 2110 for (unsigned i = Offset * PVD.PerVGPR,
 2111 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
 2112 i < e; ++i) {
 2113 Register SubReg =
 2114 SB.NumSubRegs == 1
 2115 ? SB.SuperReg
 2116 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
 2117
 2118 MachineInstrBuilder WriteLane =
 2119 BuildMI(*SB.MBB, MI, SB.DL,
 2120 SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), SB.TmpVGPR)
 2121 .addReg(SubReg, SubKillState)
 2122 .addImm(i % PVD.PerVGPR)
 2123 .addReg(SB.TmpVGPR, TmpVGPRFlags);
// After the first lane write the TmpVGPR is no longer undef.
 2124 TmpVGPRFlags = {};
 2125
 2126 if (Indexes) {
 2127 if (i == 0)
 2128 Indexes->replaceMachineInstrInMaps(*MI, *WriteLane);
 2129 else
 2130 Indexes->insertMachineInstrInMaps(*WriteLane);
 2131 }
 2132
 2133 // There could be undef components of a spilled super register.
 2134 // TODO: Can we detect this and skip the spill?
 2135 if (SB.NumSubRegs > 1) {
 2136 // The last implicit use of the SB.SuperReg carries the "Kill" flag.
 2137 RegState SuperKillState = {};
 2138 if (i + 1 == SB.NumSubRegs)
 2139 SuperKillState |= getKillRegState(SB.IsKill);
 2140 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState);
 2141 }
 2142 }
 2143
 2144 // Write out VGPR
 2145 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ false);
 2146 }
 2147
// Undo prepare(): restore exec and release the temporary VGPR.
 2148 SB.restore();
 2149 }
 2150
 2151 MI->eraseFromParent();
// NOTE(review): original lines 2152 and 2155 (presumably an
// SB.MFI.addToSpilledSGPRs(...) call and the LIS update, e.g.
// LIS->removeAllRegUnitsForPhysReg(SB.SuperReg)) were lost in extraction.
 2153
 2154 if (LIS)
 2156
 2157 return true;
 2158}
2159
// NOTE(review): the declarator line was dropped by the extraction; from the
// callers at L2212/L2271 this is presumably
// SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, int Index,
// RegScavenger *RS, ...) — confirm against SIRegisterInfo.h.
//
// Lower an SI_SPILL_S*_RESTORE pseudo — the mirror of spillSGPR:
//  * SpillToVGPR: read each 32-bit sub-register back out of its VGPR lane via
//    SI_RESTORE_S32_FROM_VGPR.
//  * Otherwise: load the staging VGPR from the stack slot and unpack the
//    lanes, one VGPR-full (PVD.PerVGPR lanes) at a time.
// Returns false only when OnlyToVGPR is set but no VGPR lanes were assigned;
// on success the pseudo is erased.
2161 RegScavenger *RS, SlotIndexes *Indexes,
 2162 LiveIntervals *LIS, bool OnlyToVGPR,
 2163 bool SpillToPhysVGPRLane) const {
 2164 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
 2165
// NOTE(review): the ':' alternative of this ternary (original line 2168,
// presumably getSGPRSpillToVGPRLanes(Index)) was lost in extraction.
 2166 ArrayRef<SpilledReg> VGPRSpills =
 2167 SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index)
 2169 bool SpillToVGPR = !VGPRSpills.empty();
 2170 if (OnlyToVGPR && !SpillToVGPR)
 2171 return false;
 2172
 2173 if (SpillToVGPR) {
 2174 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
// Single sub-reg restores target the register directly; wider restores write
// each 32-bit piece selected by SplitParts.
 2175 Register SubReg =
 2176 SB.NumSubRegs == 1
 2177 ? SB.SuperReg
 2178 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
 2179
 2180 SpilledReg Spill = VGPRSpills[i];
 2181 auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
 2182 SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
 2183 .addReg(Spill.VGPR)
 2184 .addImm(Spill.Lane);
// NOTE(review): the statement body of this 'if' (original line 2186,
// presumably MIB.addReg(SB.SuperReg, RegState::ImplicitDefine)) was lost in
// extraction — restore from upstream before compiling.
 2185 if (SB.NumSubRegs > 1 && i == 0)
 2187 if (Indexes) {
// The final instruction replaces the pseudo in the slot-index maps.
 2188 if (i == e - 1)
 2189 Indexes->replaceMachineInstrInMaps(*MI, *MIB);
 2190 else
 2191 Indexes->insertMachineInstrInMaps(*MIB);
 2192 }
 2193 }
 2194 } else {
// Memory path: prepare() sets up the staging VGPR and exec state.
 2195 SB.prepare();
 2196
 2197 // Per VGPR helper data
 2198 auto PVD = SB.getPerVGPRData();
 2199
 2200 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
 2201 // Load in VGPR data
 2202 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ true);
 2203
 2204 // Unpack lanes
 2205 for (unsigned i = Offset * PVD.PerVGPR,
 2206 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
 2207 i < e; ++i) {
 2208 Register SubReg =
 2209 SB.NumSubRegs == 1
 2210 ? SB.SuperReg
 2211 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
 2212
// Only the last readlane from this VGPR-full may kill the staging VGPR.
 2213 bool LastSubReg = (i + 1 == e);
 2214 auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
 2215 SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
 2216 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
 2217 .addImm(i);
// NOTE(review): the statement body of this 'if' (original line 2219,
// presumably MIB.addReg(SB.SuperReg, RegState::ImplicitDefine)) was lost in
// extraction — restore from upstream before compiling.
 2218 if (SB.NumSubRegs > 1 && i == 0)
 2220 if (Indexes) {
 2221 if (i == e - 1)
 2222 Indexes->replaceMachineInstrInMaps(*MI, *MIB);
 2223 else
 2224 Indexes->insertMachineInstrInMaps(*MIB);
 2225 }
 2226 }
 2227 }
 2228
 2229 SB.restore();
 2230 }
 2231
 2232 MI->eraseFromParent();
 2233
// NOTE(review): the body of this 'if' (original line 2235, presumably
// LIS->removeAllRegUnitsForPhysReg(SB.SuperReg)) was lost in extraction.
 2234 if (LIS)
 2236
 2237 return true;
 2238}
2239
// NOTE(review): the declarator line was dropped by the extraction; from the
// parameters this is presumably
// SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI,
// MachineBasicBlock &RestoreMBB, Register SGPR, RegScavenger *RS) — confirm
// against SIRegisterInfo.h.
//
// Spill an SGPR at MI using raw V_WRITELANE_B32 into SB.TmpVGPR (no memory
// traffic, and no SI_SPILL_* pseudos since this runs outside the normal spill
// pipeline), then emit the matching V_READLANE_B32 restore sequence at the
// end of RestoreMBB. Unlike spillSGPR this does not erase any instruction and
// always returns false.
2241 MachineBasicBlock &RestoreMBB,
 2242 Register SGPR, RegScavenger *RS) const {
 2243 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, SGPR, false, 0,
 2244 RS);
 2245 SB.prepare();
 2246 // Generate the spill of SGPR to SB.TmpVGPR.
 2247 RegState SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill);
 2248 auto PVD = SB.getPerVGPRData();
 2249 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
// First writelane may treat the old TmpVGPR value as undef.
 2250 RegState TmpVGPRFlags = RegState::Undef;
 2251 // Write sub registers into the VGPR
 2252 for (unsigned i = Offset * PVD.PerVGPR,
 2253 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
 2254 i < e; ++i) {
 2255 Register SubReg =
 2256 SB.NumSubRegs == 1
 2257 ? SB.SuperReg
 2258 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
 2259
 2260 MachineInstrBuilder WriteLane =
 2261 BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32),
 2262 SB.TmpVGPR)
 2263 .addReg(SubReg, SubKillState)
 2264 .addImm(i % PVD.PerVGPR)
 2265 .addReg(SB.TmpVGPR, TmpVGPRFlags);
 2266 TmpVGPRFlags = {};
 2267 // There could be undef components of a spilled super register.
 2268 // TODO: Can we detect this and skip the spill?
 2269 if (SB.NumSubRegs > 1) {
 2270 // The last implicit use of the SB.SuperReg carries the "Kill" flag.
 2271 RegState SuperKillState = {};
 2272 if (i + 1 == SB.NumSubRegs)
 2273 SuperKillState |= getKillRegState(SB.IsKill);
 2274 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState);
 2275 }
 2276 }
 2277 // Don't need to write VGPR out.
 2278 }
 2279
 2280 // Restore clobbered registers in the specified restore block.
 2281 MI = RestoreMBB.end();
 2282 SB.setMI(&RestoreMBB, MI);
 2283 // Generate the restore of SGPR from SB.TmpVGPR.
 2284 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
 2285 // Don't need to load VGPR in.
 2286 // Unpack lanes
 2287 for (unsigned i = Offset * PVD.PerVGPR,
 2288 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
 2289 i < e; ++i) {
 2290 Register SubReg =
 2291 SB.NumSubRegs == 1
 2292 ? SB.SuperReg
 2293 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
 2294
 2295 assert(SubReg.isPhysical());
// Only the last readlane may kill the staging VGPR.
 2296 bool LastSubReg = (i + 1 == e);
 2297 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32),
 2298 SubReg)
 2299 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
 2300 .addImm(i);
// NOTE(review): the statement body of this 'if' (original line 2302,
// presumably MIB.addReg(SB.SuperReg, RegState::ImplicitDefine)) was lost in
// extraction — restore from upstream before compiling.
 2301 if (SB.NumSubRegs > 1 && i == 0)
 2303 }
 2304 }
 2305 SB.restore();
 2306
// NOTE(review): original line 2307 (presumably SB.MFI.addToSpilledSGPRs(
// SB.NumSubRegs)) was lost in extraction.
 2308 return false;
 2309}
2310
2311/// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to
2312/// a VGPR and the stack slot can be safely eliminated when all other users are
2313/// handled.
// NOTE(review): the declarator lines (original 2314-2315, presumably
// bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
//     MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, ...)
// were lost in extraction — confirm against SIRegisterInfo.h.
//
// Dispatch on the pseudo's opcode: every SI_SPILL_S*_SAVE width goes to
// spillSGPR, every SI_SPILL_S*_RESTORE width goes to restoreSGPR, both with
// OnlyToVGPR=true so the call fails (returns false) rather than falling back
// to a memory spill. Any other opcode is a caller bug.
 2316 SlotIndexes *Indexes, LiveIntervals *LIS, bool SpillToPhysVGPRLane) const {
 2317 switch (MI->getOpcode()) {
 2318 case AMDGPU::SI_SPILL_S1024_SAVE:
 2319 case AMDGPU::SI_SPILL_S512_SAVE:
 2320 case AMDGPU::SI_SPILL_S384_SAVE:
 2321 case AMDGPU::SI_SPILL_S352_SAVE:
 2322 case AMDGPU::SI_SPILL_S320_SAVE:
 2323 case AMDGPU::SI_SPILL_S288_SAVE:
 2324 case AMDGPU::SI_SPILL_S256_SAVE:
 2325 case AMDGPU::SI_SPILL_S224_SAVE:
 2326 case AMDGPU::SI_SPILL_S192_SAVE:
 2327 case AMDGPU::SI_SPILL_S160_SAVE:
 2328 case AMDGPU::SI_SPILL_S128_SAVE:
 2329 case AMDGPU::SI_SPILL_S96_SAVE:
 2330 case AMDGPU::SI_SPILL_S64_SAVE:
 2331 case AMDGPU::SI_SPILL_S32_SAVE:
// 'true' here is the OnlyToVGPR flag: never fall back to a memory spill.
 2332 return spillSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane);
 2333 case AMDGPU::SI_SPILL_S1024_RESTORE:
 2334 case AMDGPU::SI_SPILL_S512_RESTORE:
 2335 case AMDGPU::SI_SPILL_S384_RESTORE:
 2336 case AMDGPU::SI_SPILL_S352_RESTORE:
 2337 case AMDGPU::SI_SPILL_S320_RESTORE:
 2338 case AMDGPU::SI_SPILL_S288_RESTORE:
 2339 case AMDGPU::SI_SPILL_S256_RESTORE:
 2340 case AMDGPU::SI_SPILL_S224_RESTORE:
 2341 case AMDGPU::SI_SPILL_S192_RESTORE:
 2342 case AMDGPU::SI_SPILL_S160_RESTORE:
 2343 case AMDGPU::SI_SPILL_S128_RESTORE:
 2344 case AMDGPU::SI_SPILL_S96_RESTORE:
 2345 case AMDGPU::SI_SPILL_S64_RESTORE:
 2346 case AMDGPU::SI_SPILL_S32_RESTORE:
 2347 return restoreSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane);
 2348 default:
 2349 llvm_unreachable("not an SGPR spill instruction");
 2350 }
 2351}
2352
2354 int SPAdj, unsigned FIOperandNum,
2355 RegScavenger *RS) const {
2356 MachineFunction *MF = MI->getMF();
2357 MachineBasicBlock *MBB = MI->getParent();
2359 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
2360 const SIInstrInfo *TII = ST.getInstrInfo();
2361 const DebugLoc &DL = MI->getDebugLoc();
2362
2363 assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?");
2364
2366 "unreserved scratch RSRC register");
2367
2368 MachineOperand *FIOp = &MI->getOperand(FIOperandNum);
2369 int Index = MI->getOperand(FIOperandNum).getIndex();
2370
2371 Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
2372 ? getBaseRegister()
2373 : getFrameRegister(*MF);
2374
2375 switch (MI->getOpcode()) {
2376 // SGPR register spill
2377 case AMDGPU::SI_SPILL_S1024_SAVE:
2378 case AMDGPU::SI_SPILL_S512_SAVE:
2379 case AMDGPU::SI_SPILL_S384_SAVE:
2380 case AMDGPU::SI_SPILL_S352_SAVE:
2381 case AMDGPU::SI_SPILL_S320_SAVE:
2382 case AMDGPU::SI_SPILL_S288_SAVE:
2383 case AMDGPU::SI_SPILL_S256_SAVE:
2384 case AMDGPU::SI_SPILL_S224_SAVE:
2385 case AMDGPU::SI_SPILL_S192_SAVE:
2386 case AMDGPU::SI_SPILL_S160_SAVE:
2387 case AMDGPU::SI_SPILL_S128_SAVE:
2388 case AMDGPU::SI_SPILL_S96_SAVE:
2389 case AMDGPU::SI_SPILL_S64_SAVE:
2390 case AMDGPU::SI_SPILL_S32_SAVE: {
2391 return spillSGPR(MI, Index, RS);
2392 }
2393
2394 // SGPR register restore
2395 case AMDGPU::SI_SPILL_S1024_RESTORE:
2396 case AMDGPU::SI_SPILL_S512_RESTORE:
2397 case AMDGPU::SI_SPILL_S384_RESTORE:
2398 case AMDGPU::SI_SPILL_S352_RESTORE:
2399 case AMDGPU::SI_SPILL_S320_RESTORE:
2400 case AMDGPU::SI_SPILL_S288_RESTORE:
2401 case AMDGPU::SI_SPILL_S256_RESTORE:
2402 case AMDGPU::SI_SPILL_S224_RESTORE:
2403 case AMDGPU::SI_SPILL_S192_RESTORE:
2404 case AMDGPU::SI_SPILL_S160_RESTORE:
2405 case AMDGPU::SI_SPILL_S128_RESTORE:
2406 case AMDGPU::SI_SPILL_S96_RESTORE:
2407 case AMDGPU::SI_SPILL_S64_RESTORE:
2408 case AMDGPU::SI_SPILL_S32_RESTORE: {
2409 return restoreSGPR(MI, Index, RS);
2410 }
2411
2412 // VGPR register spill
2413 case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE: {
2414 // Put mask into M0.
2415 BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
2416 AMDGPU::M0)
2417 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::mask));
2418 [[fallthrough]];
2419 }
2420 case AMDGPU::SI_SPILL_V1024_SAVE:
2421 case AMDGPU::SI_SPILL_V512_SAVE:
2422 case AMDGPU::SI_SPILL_V384_SAVE:
2423 case AMDGPU::SI_SPILL_V352_SAVE:
2424 case AMDGPU::SI_SPILL_V320_SAVE:
2425 case AMDGPU::SI_SPILL_V288_SAVE:
2426 case AMDGPU::SI_SPILL_V256_SAVE:
2427 case AMDGPU::SI_SPILL_V224_SAVE:
2428 case AMDGPU::SI_SPILL_V192_SAVE:
2429 case AMDGPU::SI_SPILL_V160_SAVE:
2430 case AMDGPU::SI_SPILL_V128_SAVE:
2431 case AMDGPU::SI_SPILL_V96_SAVE:
2432 case AMDGPU::SI_SPILL_V64_SAVE:
2433 case AMDGPU::SI_SPILL_V32_SAVE:
2434 case AMDGPU::SI_SPILL_V16_SAVE:
2435 case AMDGPU::SI_SPILL_A1024_SAVE:
2436 case AMDGPU::SI_SPILL_A512_SAVE:
2437 case AMDGPU::SI_SPILL_A384_SAVE:
2438 case AMDGPU::SI_SPILL_A352_SAVE:
2439 case AMDGPU::SI_SPILL_A320_SAVE:
2440 case AMDGPU::SI_SPILL_A288_SAVE:
2441 case AMDGPU::SI_SPILL_A256_SAVE:
2442 case AMDGPU::SI_SPILL_A224_SAVE:
2443 case AMDGPU::SI_SPILL_A192_SAVE:
2444 case AMDGPU::SI_SPILL_A160_SAVE:
2445 case AMDGPU::SI_SPILL_A128_SAVE:
2446 case AMDGPU::SI_SPILL_A96_SAVE:
2447 case AMDGPU::SI_SPILL_A64_SAVE:
2448 case AMDGPU::SI_SPILL_A32_SAVE:
2449 case AMDGPU::SI_SPILL_AV1024_SAVE:
2450 case AMDGPU::SI_SPILL_AV512_SAVE:
2451 case AMDGPU::SI_SPILL_AV384_SAVE:
2452 case AMDGPU::SI_SPILL_AV352_SAVE:
2453 case AMDGPU::SI_SPILL_AV320_SAVE:
2454 case AMDGPU::SI_SPILL_AV288_SAVE:
2455 case AMDGPU::SI_SPILL_AV256_SAVE:
2456 case AMDGPU::SI_SPILL_AV224_SAVE:
2457 case AMDGPU::SI_SPILL_AV192_SAVE:
2458 case AMDGPU::SI_SPILL_AV160_SAVE:
2459 case AMDGPU::SI_SPILL_AV128_SAVE:
2460 case AMDGPU::SI_SPILL_AV96_SAVE:
2461 case AMDGPU::SI_SPILL_AV64_SAVE:
2462 case AMDGPU::SI_SPILL_AV32_SAVE:
2463 case AMDGPU::SI_SPILL_WWM_V32_SAVE:
2464 case AMDGPU::SI_SPILL_WWM_AV32_SAVE: {
2465 const MachineOperand *VData = TII->getNamedOperand(*MI,
2466 AMDGPU::OpName::vdata);
2467 if (VData->isUndef()) {
2468 MI->eraseFromParent();
2469 return true;
2470 }
2471
2472 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
2473 MFI->getStackPtrOffsetReg());
2474
2475 unsigned Opc;
2476 if (MI->getOpcode() == AMDGPU::SI_SPILL_V16_SAVE) {
2477 assert(ST.hasFlatScratchEnabled() && "Flat Scratch is not enabled!");
2478 Opc = AMDGPU::SCRATCH_STORE_SHORT_SADDR_t16;
2479 } else {
2480 Opc = MI->getOpcode() == AMDGPU::SI_BLOCK_SPILL_V1024_SAVE
2481 ? AMDGPU::SCRATCH_STORE_BLOCK_SADDR
2482 : ST.hasFlatScratchEnabled() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
2483 : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
2484 }
2485
2486 auto *MBB = MI->getParent();
2487 bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode());
2488 if (IsWWMRegSpill) {
2489 TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(),
2490 RS->isRegUsed(AMDGPU::SCC));
2491 }
2493 *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
2494 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
2495 *MI->memoperands_begin(), RS);
2497 if (IsWWMRegSpill)
2498 TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy());
2499
2500 MI->eraseFromParent();
2501 return true;
2502 }
2503 case AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE: {
2504 // Put mask into M0.
2505 BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
2506 AMDGPU::M0)
2507 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::mask));
2508 [[fallthrough]];
2509 }
2510 case AMDGPU::SI_SPILL_V16_RESTORE:
2511 case AMDGPU::SI_SPILL_V32_RESTORE:
2512 case AMDGPU::SI_SPILL_V64_RESTORE:
2513 case AMDGPU::SI_SPILL_V96_RESTORE:
2514 case AMDGPU::SI_SPILL_V128_RESTORE:
2515 case AMDGPU::SI_SPILL_V160_RESTORE:
2516 case AMDGPU::SI_SPILL_V192_RESTORE:
2517 case AMDGPU::SI_SPILL_V224_RESTORE:
2518 case AMDGPU::SI_SPILL_V256_RESTORE:
2519 case AMDGPU::SI_SPILL_V288_RESTORE:
2520 case AMDGPU::SI_SPILL_V320_RESTORE:
2521 case AMDGPU::SI_SPILL_V352_RESTORE:
2522 case AMDGPU::SI_SPILL_V384_RESTORE:
2523 case AMDGPU::SI_SPILL_V512_RESTORE:
2524 case AMDGPU::SI_SPILL_V1024_RESTORE:
2525 case AMDGPU::SI_SPILL_A32_RESTORE:
2526 case AMDGPU::SI_SPILL_A64_RESTORE:
2527 case AMDGPU::SI_SPILL_A96_RESTORE:
2528 case AMDGPU::SI_SPILL_A128_RESTORE:
2529 case AMDGPU::SI_SPILL_A160_RESTORE:
2530 case AMDGPU::SI_SPILL_A192_RESTORE:
2531 case AMDGPU::SI_SPILL_A224_RESTORE:
2532 case AMDGPU::SI_SPILL_A256_RESTORE:
2533 case AMDGPU::SI_SPILL_A288_RESTORE:
2534 case AMDGPU::SI_SPILL_A320_RESTORE:
2535 case AMDGPU::SI_SPILL_A352_RESTORE:
2536 case AMDGPU::SI_SPILL_A384_RESTORE:
2537 case AMDGPU::SI_SPILL_A512_RESTORE:
2538 case AMDGPU::SI_SPILL_A1024_RESTORE:
2539 case AMDGPU::SI_SPILL_AV32_RESTORE:
2540 case AMDGPU::SI_SPILL_AV64_RESTORE:
2541 case AMDGPU::SI_SPILL_AV96_RESTORE:
2542 case AMDGPU::SI_SPILL_AV128_RESTORE:
2543 case AMDGPU::SI_SPILL_AV160_RESTORE:
2544 case AMDGPU::SI_SPILL_AV192_RESTORE:
2545 case AMDGPU::SI_SPILL_AV224_RESTORE:
2546 case AMDGPU::SI_SPILL_AV256_RESTORE:
2547 case AMDGPU::SI_SPILL_AV288_RESTORE:
2548 case AMDGPU::SI_SPILL_AV320_RESTORE:
2549 case AMDGPU::SI_SPILL_AV352_RESTORE:
2550 case AMDGPU::SI_SPILL_AV384_RESTORE:
2551 case AMDGPU::SI_SPILL_AV512_RESTORE:
2552 case AMDGPU::SI_SPILL_AV1024_RESTORE:
2553 case AMDGPU::SI_SPILL_WWM_V32_RESTORE:
2554 case AMDGPU::SI_SPILL_WWM_AV32_RESTORE: {
2555 const MachineOperand *VData = TII->getNamedOperand(*MI,
2556 AMDGPU::OpName::vdata);
2557 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
2558 MFI->getStackPtrOffsetReg());
2559
2560 unsigned Opc;
2561 if (MI->getOpcode() == AMDGPU::SI_SPILL_V16_RESTORE) {
2562 assert(ST.hasFlatScratchEnabled() && "Flat Scratch is not enabled!");
2563 Opc = ST.d16PreservesUnusedBits()
2564 ? AMDGPU::SCRATCH_LOAD_SHORT_D16_SADDR_t16
2565 : AMDGPU::SCRATCH_LOAD_USHORT_SADDR;
2566 } else {
2567 Opc = MI->getOpcode() == AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE
2568 ? AMDGPU::SCRATCH_LOAD_BLOCK_SADDR
2569 : ST.hasFlatScratchEnabled() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
2570 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
2571 }
2572
2573 auto *MBB = MI->getParent();
2574 bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode());
2575 if (IsWWMRegSpill) {
2576 TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(),
2577 RS->isRegUsed(AMDGPU::SCC));
2578 }
2579
2581 *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
2582 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
2583 *MI->memoperands_begin(), RS);
2584
2585 if (IsWWMRegSpill)
2586 TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy());
2587
2588 MI->eraseFromParent();
2589 return true;
2590 }
2591 case AMDGPU::V_ADD_U32_e32:
2592 case AMDGPU::V_ADD_U32_e64:
2593 case AMDGPU::V_ADD_CO_U32_e32:
2594 case AMDGPU::V_ADD_CO_U32_e64: {
2595 // TODO: Handle sub, and, or.
2596 unsigned NumDefs = MI->getNumExplicitDefs();
2597 unsigned Src0Idx = NumDefs;
2598
2599 bool HasClamp = false;
2600 MachineOperand *VCCOp = nullptr;
2601
2602 switch (MI->getOpcode()) {
2603 case AMDGPU::V_ADD_U32_e32:
2604 break;
2605 case AMDGPU::V_ADD_U32_e64:
2606 HasClamp = MI->getOperand(3).getImm();
2607 break;
2608 case AMDGPU::V_ADD_CO_U32_e32:
2609 VCCOp = &MI->getOperand(3);
2610 break;
2611 case AMDGPU::V_ADD_CO_U32_e64:
2612 VCCOp = &MI->getOperand(1);
2613 HasClamp = MI->getOperand(4).getImm();
2614 break;
2615 default:
2616 break;
2617 }
2618 bool DeadVCC = !VCCOp || VCCOp->isDead();
2619 MachineOperand &DstOp = MI->getOperand(0);
2620 Register DstReg = DstOp.getReg();
2621
2622 unsigned OtherOpIdx =
2623 FIOperandNum == Src0Idx ? FIOperandNum + 1 : Src0Idx;
2624 MachineOperand *OtherOp = &MI->getOperand(OtherOpIdx);
2625
2626 unsigned Src1Idx = Src0Idx + 1;
2627 Register MaterializedReg = FrameReg;
2628 Register ScavengedVGPR;
2629
2630 int64_t Offset = FrameInfo.getObjectOffset(Index);
2631 // For the non-immediate case, we could fall through to the default
2632 // handling, but we do an in-place update of the result register here to
2633 // avoid scavenging another register.
2634 if (OtherOp->isImm()) {
2635 int64_t TotalOffset = OtherOp->getImm() + Offset;
2636
2637 if (!ST.hasVOP3Literal() && SIInstrInfo::isVOP3(*MI) &&
2638 !AMDGPU::isInlinableIntLiteral(TotalOffset)) {
2639 // If we can't support a VOP3 literal in the VALU instruction, we
2640 // can't specially fold into the add.
2641 // TODO: Handle VOP3->VOP2 shrink to support the fold.
2642 break;
2643 }
2644
2645 OtherOp->setImm(TotalOffset);
2646 Offset = 0;
2647 }
2648
2649 if (FrameReg && !ST.hasFlatScratchEnabled()) {
2650 // We should just do an in-place update of the result register. However,
2651 // the value there may also be used by the add, in which case we need a
2652 // temporary register.
2653 //
2654 // FIXME: The scavenger is not finding the result register in the
2655 // common case where the add does not read the register.
2656
2657 ScavengedVGPR = RS->scavengeRegisterBackwards(
2658 AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false, /*SPAdj=*/0);
2659
2660 // TODO: If we have a free SGPR, it's sometimes better to use a scalar
2661 // shift.
2662 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64))
2663 .addDef(ScavengedVGPR, RegState::Renamable)
2664 .addImm(ST.getWavefrontSizeLog2())
2665 .addReg(FrameReg);
2666 MaterializedReg = ScavengedVGPR;
2667 }
2668
2669 if ((!OtherOp->isImm() || OtherOp->getImm() != 0) && MaterializedReg) {
2670 if (ST.hasFlatScratchEnabled() &&
2671 !TII->isOperandLegal(*MI, Src1Idx, OtherOp)) {
2672 // We didn't need the shift above, so we have an SGPR for the frame
2673 // register, but may have a VGPR only operand.
2674 //
2675 // TODO: On gfx10+, we can easily change the opcode to the e64 version
2676 // and use the higher constant bus restriction to avoid this copy.
2677
2678 if (!ScavengedVGPR) {
2679 ScavengedVGPR = RS->scavengeRegisterBackwards(
2680 AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
2681 /*SPAdj=*/0);
2682 }
2683
2684 assert(ScavengedVGPR != DstReg);
2685
2686 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
2687 .addReg(MaterializedReg,
2688 getKillRegState(MaterializedReg != FrameReg));
2689 MaterializedReg = ScavengedVGPR;
2690 }
2691
2692 // TODO: In the flat scratch case, if this is an add of an SGPR, and SCC
2693 // is not live, we could use a scalar add + vector add instead of 2
2694 // vector adds.
2695 auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(MI->getOpcode()))
2696 .addDef(DstReg, RegState::Renamable);
2697 if (NumDefs == 2)
2698 AddI32.add(MI->getOperand(1));
2699
2700 RegState MaterializedRegFlags =
2701 getKillRegState(MaterializedReg != FrameReg);
2702
2703 if (isVGPRClass(getPhysRegBaseClass(MaterializedReg))) {
2704 // If we know we have a VGPR already, it's more likely the other
2705 // operand is a legal vsrc0.
2706 AddI32
2707 .add(*OtherOp)
2708 .addReg(MaterializedReg, MaterializedRegFlags);
2709 } else {
2710 // Commute operands to avoid violating VOP2 restrictions. This will
2711 // typically happen when using scratch.
2712 AddI32
2713 .addReg(MaterializedReg, MaterializedRegFlags)
2714 .add(*OtherOp);
2715 }
2716
2717 if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
2718 MI->getOpcode() == AMDGPU::V_ADD_U32_e64)
2719 AddI32.addImm(0); // clamp
2720
2721 if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e32)
2722 AddI32.setOperandDead(3); // Dead vcc
2723
2724 MaterializedReg = DstReg;
2725
2726 OtherOp->ChangeToRegister(MaterializedReg, false);
2727 OtherOp->setIsKill(true);
2729 Offset = 0;
2730 } else if (Offset != 0) {
2731 assert(!MaterializedReg);
2733 Offset = 0;
2734 } else {
2735 if (DeadVCC && !HasClamp) {
2736 assert(Offset == 0);
2737
2738 // TODO: Losing kills and implicit operands. Just mutate to copy and
2739 // let lowerCopy deal with it?
2740 if (OtherOp->isReg() && OtherOp->getReg() == DstReg) {
2741 // Folded to an identity copy.
2742 MI->eraseFromParent();
2743 return true;
2744 }
2745
2746 // The immediate value should be in OtherOp
2747 MI->setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
2748 MI->removeOperand(FIOperandNum);
2749
2750 unsigned NumOps = MI->getNumOperands();
2751 for (unsigned I = NumOps - 2; I >= NumDefs + 1; --I)
2752 MI->removeOperand(I);
2753
2754 if (NumDefs == 2)
2755 MI->removeOperand(1);
2756
2757 // The code below can't deal with a mov.
2758 return true;
2759 }
2760
2761 // This folded to a constant, but we have to keep the add around for
2762 // pointless implicit defs or clamp modifier.
2763 FIOp->ChangeToImmediate(0);
2764 }
2765
2766 // Try to improve legality by commuting.
2767 if (!TII->isOperandLegal(*MI, Src1Idx) && TII->commuteInstruction(*MI)) {
2768 std::swap(FIOp, OtherOp);
2769 std::swap(FIOperandNum, OtherOpIdx);
2770 }
2771
2772 // We need at most one mov to satisfy the operand constraints. Prefer to
2773 // move the FI operand first, as it may be a literal in a VOP3
2774 // instruction.
2775 for (unsigned SrcIdx : {FIOperandNum, OtherOpIdx}) {
2776 if (!TII->isOperandLegal(*MI, SrcIdx)) {
2777 // If commuting didn't make the operands legal, we need to materialize
2778 // in a register.
2779 // TODO: Can use SGPR on gfx10+ in some cases.
2780 if (!ScavengedVGPR) {
2781 ScavengedVGPR = RS->scavengeRegisterBackwards(
2782 AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
2783 /*SPAdj=*/0);
2784 }
2785
2786 assert(ScavengedVGPR != DstReg);
2787
2788 MachineOperand &Src = MI->getOperand(SrcIdx);
2789 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
2790 .add(Src);
2791
2792 Src.ChangeToRegister(ScavengedVGPR, false);
2793 Src.setIsKill(true);
2794 break;
2795 }
2796 }
2797
2798 // Fold out add of 0 case that can appear in kernels.
2799 if (FIOp->isImm() && FIOp->getImm() == 0 && DeadVCC && !HasClamp) {
2800 if (OtherOp->isReg() && OtherOp->getReg() != DstReg) {
2801 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::COPY), DstReg).add(*OtherOp);
2802 }
2803
2804 MI->eraseFromParent();
2805 }
2806
2807 return true;
2808 }
2809 case AMDGPU::S_ADD_I32:
2810 case AMDGPU::S_ADD_U32: {
2811 // TODO: Handle s_or_b32, s_and_b32.
2812 unsigned OtherOpIdx = FIOperandNum == 1 ? 2 : 1;
2813 MachineOperand &OtherOp = MI->getOperand(OtherOpIdx);
2814
2815 assert(FrameReg || MFI->isBottomOfStack());
2816
2817 MachineOperand &DstOp = MI->getOperand(0);
2818 const DebugLoc &DL = MI->getDebugLoc();
2819 Register MaterializedReg = FrameReg;
2820
2821 // Defend against live scc, which should never happen in practice.
2822 bool DeadSCC = MI->getOperand(3).isDead();
2823
2824 Register TmpReg;
2825
2826 // FIXME: Scavenger should figure out that the result register is
2827 // available. Also should do this for the v_add case.
2828 if (OtherOp.isReg() && OtherOp.getReg() != DstOp.getReg())
2829 TmpReg = DstOp.getReg();
2830
2831 if (FrameReg && !ST.hasFlatScratchEnabled()) {
2832 // FIXME: In the common case where the add does not also read its result
2833 // (i.e. this isn't a reg += fi), it's not finding the dest reg as
2834 // available.
2835 if (!TmpReg)
2836 TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
2837 MI, /*RestoreAfter=*/false, 0,
2838 /*AllowSpill=*/false);
2839 if (TmpReg) {
2840 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::S_LSHR_B32))
2841 .addDef(TmpReg, RegState::Renamable)
2842 .addReg(FrameReg)
2843 .addImm(ST.getWavefrontSizeLog2())
2844 .setOperandDead(3); // Set SCC dead
2845 }
2846 MaterializedReg = TmpReg;
2847 }
2848
2849 int64_t Offset = FrameInfo.getObjectOffset(Index);
2850
2851 // For the non-immediate case, we could fall through to the default
2852 // handling, but we do an in-place update of the result register here to
2853 // avoid scavenging another register.
2854 if (OtherOp.isImm()) {
2855 OtherOp.setImm(OtherOp.getImm() + Offset);
2856 Offset = 0;
2857
2858 if (MaterializedReg)
2859 FIOp->ChangeToRegister(MaterializedReg, false);
2860 else
2861 FIOp->ChangeToImmediate(0);
2862 } else if (MaterializedReg) {
2863 // If we can't fold the other operand, do another increment.
2864 Register DstReg = DstOp.getReg();
2865
2866 if (!TmpReg && MaterializedReg == FrameReg) {
2867 TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
2868 MI, /*RestoreAfter=*/false, 0,
2869 /*AllowSpill=*/false);
2870 DstReg = TmpReg;
2871 }
2872
2873 if (TmpReg) {
2874 auto AddI32 = BuildMI(*MBB, *MI, DL, MI->getDesc())
2875 .addDef(DstReg, RegState::Renamable)
2876 .addReg(MaterializedReg, RegState::Kill)
2877 .add(OtherOp);
2878 if (DeadSCC)
2879 AddI32.setOperandDead(3);
2880
2881 MaterializedReg = DstReg;
2882
2883 OtherOp.ChangeToRegister(MaterializedReg, false);
2884 OtherOp.setIsKill(true);
2885 OtherOp.setIsRenamable(true);
2886 }
2888 } else {
2889 // If we don't have any other offset to apply, we can just directly
2890 // interpret the frame index as the offset.
2892 }
2893
2894 if (DeadSCC && OtherOp.isImm() && OtherOp.getImm() == 0) {
2895 assert(Offset == 0);
2896 MI->removeOperand(3);
2897 MI->removeOperand(OtherOpIdx);
2898 MI->setDesc(TII->get(FIOp->isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
2899 } else if (DeadSCC && FIOp->isImm() && FIOp->getImm() == 0) {
2900 assert(Offset == 0);
2901 MI->removeOperand(3);
2902 MI->removeOperand(FIOperandNum);
2903 MI->setDesc(
2904 TII->get(OtherOp.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
2905 }
2906
2907 assert(!FIOp->isFI());
2908 return true;
2909 }
2910 default: {
2911 break;
2912 }
2913 }
2914
2915 int64_t Offset = FrameInfo.getObjectOffset(Index);
2916 if (ST.hasFlatScratchEnabled()) {
2917 if (TII->isFLATScratch(*MI)) {
2918 assert(
2919 (int16_t)FIOperandNum ==
2920 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::saddr));
2921
2922 // The offset is always swizzled, just replace it
2923 if (FrameReg)
2924 FIOp->ChangeToRegister(FrameReg, false);
2925
2927 TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
2928 int64_t NewOffset = Offset + OffsetOp->getImm();
2929 if (TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
2931 OffsetOp->setImm(NewOffset);
2932 if (FrameReg)
2933 return false;
2934 Offset = 0;
2935 }
2936
2937 if (!Offset) {
2938 unsigned Opc = MI->getOpcode();
2939 int NewOpc = -1;
2940 if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr)) {
2942 } else if (ST.hasFlatScratchSTMode()) {
2943 // On GFX10 we have ST mode to use no registers for an address.
2944 // Otherwise we need to materialize 0 into an SGPR.
2946 }
2947
2948 if (NewOpc != -1) {
2949 // removeOperand doesn't fixup tied operand indexes as it goes, so
2950 // it asserts. Untie vdst_in for now and retie them afterwards.
2951 int VDstIn =
2952 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
2953 bool TiedVDst = VDstIn != -1 && MI->getOperand(VDstIn).isReg() &&
2954 MI->getOperand(VDstIn).isTied();
2955 if (TiedVDst)
2956 MI->untieRegOperand(VDstIn);
2957
2958 MI->removeOperand(
2959 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr));
2960
2961 if (TiedVDst) {
2962 int NewVDst =
2963 AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
2964 int NewVDstIn =
2965 AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst_in);
2966 assert(NewVDst != -1 && NewVDstIn != -1 && "Must be tied!");
2967 MI->tieOperands(NewVDst, NewVDstIn);
2968 }
2969 MI->setDesc(TII->get(NewOpc));
2970 return false;
2971 }
2972 }
2973 }
2974
2975 if (!FrameReg) {
2977 if (TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp))
2978 return false;
2979 }
2980
2981 // We need to use register here. Check if we can use an SGPR or need
2982 // a VGPR.
2983 FIOp->ChangeToRegister(AMDGPU::M0, false);
2984 bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, FIOp);
2985
2986 if (!Offset && FrameReg && UseSGPR) {
2987 FIOp->setReg(FrameReg);
2988 return false;
2989 }
2990
2991 const TargetRegisterClass *RC =
2992 UseSGPR ? &AMDGPU::SReg_32_XM0RegClass : &AMDGPU::VGPR_32RegClass;
2993
2994 Register TmpReg =
2995 RS->scavengeRegisterBackwards(*RC, MI, false, 0, !UseSGPR);
2996 FIOp->setReg(TmpReg);
2997 FIOp->setIsKill();
2998
2999 if ((!FrameReg || !Offset) && TmpReg) {
3000 unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
3001 auto MIB = BuildMI(*MBB, MI, DL, TII->get(Opc), TmpReg);
3002 if (FrameReg)
3003 MIB.addReg(FrameReg);
3004 else
3005 MIB.addImm(Offset);
3006
3007 return false;
3008 }
3009
3010 bool NeedSaveSCC = RS->isRegUsed(AMDGPU::SCC) &&
3011 !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr);
3012
3013 Register TmpSReg =
3014 UseSGPR ? TmpReg
3015 : RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
3016 MI, false, 0, !UseSGPR);
3017
3018 if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR)) {
3019 int SVOpcode = AMDGPU::getFlatScratchInstSVfromSS(MI->getOpcode());
3020 if (ST.hasFlatScratchSVSMode() && SVOpcode != -1) {
3021 Register TmpVGPR = RS->scavengeRegisterBackwards(
3022 AMDGPU::VGPR_32RegClass, MI, false, 0, /*AllowSpill=*/true);
3023
3024 // Materialize the frame register.
3025 auto MIB =
3026 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR);
3027 if (FrameReg)
3028 MIB.addReg(FrameReg);
3029 else
3030 MIB.addImm(Offset);
3031
3032 // Add the offset to the frame register.
3033 if (FrameReg && Offset)
3034 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e32), FrameReg)
3035 .addReg(FrameReg, RegState::Kill)
3036 .addImm(Offset);
3037
3038 BuildMI(*MBB, MI, DL, TII->get(SVOpcode))
3039 .add(MI->getOperand(0)) // $vdata
3040 .addReg(TmpVGPR) // $vaddr
3041 .addImm(0) // Offset
3042 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::cpol));
3043 MI->eraseFromParent();
3044 return true;
3045 }
3046 report_fatal_error("Cannot scavenge register in FI elimination!");
3047 }
3048
3049 if (!TmpSReg) {
3050 // Use frame register and restore it after.
3051 TmpSReg = FrameReg;
3052 FIOp->setReg(FrameReg);
3053 FIOp->setIsKill(false);
3054 }
3055
3056 if (NeedSaveSCC) {
3057 assert(!(Offset & 0x1) && "Flat scratch offset must be aligned!");
3058 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADDC_U32), TmpSReg)
3059 .addReg(FrameReg)
3060 .addImm(Offset);
3061 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITCMP1_B32))
3062 .addReg(TmpSReg)
3063 .addImm(0);
3064 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITSET0_B32), TmpSReg)
3065 .addImm(0)
3066 .addReg(TmpSReg);
3067 } else {
3068 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpSReg)
3069 .addReg(FrameReg)
3070 .addImm(Offset);
3071 }
3072
3073 if (!UseSGPR)
3074 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
3075 .addReg(TmpSReg, RegState::Kill);
3076
3077 if (TmpSReg == FrameReg) {
3078 // Undo frame register modification.
3079 if (NeedSaveSCC &&
3080 !MI->registerDefIsDead(AMDGPU::SCC, /*TRI=*/nullptr)) {
3082 BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADDC_U32),
3083 TmpSReg)
3084 .addReg(FrameReg)
3085 .addImm(-Offset);
3086 I = BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITCMP1_B32))
3087 .addReg(TmpSReg)
3088 .addImm(0);
3089 BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITSET0_B32),
3090 TmpSReg)
3091 .addImm(0)
3092 .addReg(TmpSReg);
3093 } else {
3094 BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADD_I32),
3095 FrameReg)
3096 .addReg(FrameReg)
3097 .addImm(-Offset);
3098 }
3099 }
3100
3101 return false;
3102 }
3103
3104 bool IsMUBUF = TII->isMUBUF(*MI);
3105
3106 if (!IsMUBUF && !MFI->isBottomOfStack()) {
3107 // Convert to a swizzled stack address by scaling by the wave size.
3108 // In an entry function/kernel the offset is already swizzled.
3109 bool IsSALU = isSGPRClass(TII->getRegClass(MI->getDesc(), FIOperandNum));
3110 bool LiveSCC = RS->isRegUsed(AMDGPU::SCC) &&
3111 !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr);
3112 const TargetRegisterClass *RC = IsSALU && !LiveSCC
3113 ? &AMDGPU::SReg_32RegClass
3114 : &AMDGPU::VGPR_32RegClass;
3115 bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32 ||
3116 MI->getOpcode() == AMDGPU::V_MOV_B32_e64 ||
3117 MI->getOpcode() == AMDGPU::S_MOV_B32;
3118 Register ResultReg =
3119 IsCopy ? MI->getOperand(0).getReg()
3120 : RS->scavengeRegisterBackwards(*RC, MI, false, 0);
3121
3122 int64_t Offset = FrameInfo.getObjectOffset(Index);
3123 if (Offset == 0) {
3124 unsigned OpCode =
3125 IsSALU && !LiveSCC ? AMDGPU::S_LSHR_B32 : AMDGPU::V_LSHRREV_B32_e64;
3126 Register TmpResultReg = ResultReg;
3127 if (IsSALU && LiveSCC) {
3128 TmpResultReg = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
3129 MI, false, 0);
3130 }
3131
3132 auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), TmpResultReg);
3133 if (OpCode == AMDGPU::V_LSHRREV_B32_e64)
3134 // For V_LSHRREV, the operands are reversed (the shift count goes
3135 // first).
3136 Shift.addImm(ST.getWavefrontSizeLog2()).addReg(FrameReg);
3137 else
3138 Shift.addReg(FrameReg).addImm(ST.getWavefrontSizeLog2());
3139 if (IsSALU && !LiveSCC)
3140 Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead.
3141 if (IsSALU && LiveSCC) {
3142 Register NewDest;
3143 if (IsCopy) {
3144 assert(ResultReg.isPhysical());
3145 NewDest = ResultReg;
3146 } else {
3147 NewDest = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
3148 Shift, false, 0);
3149 }
3150 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), NewDest)
3151 .addReg(TmpResultReg);
3152 ResultReg = NewDest;
3153 }
3154 } else {
3156 if (!IsSALU) {
3157 if ((MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) !=
3158 nullptr) {
3159 // Reuse ResultReg in intermediate step.
3160 Register ScaledReg = ResultReg;
3161
3162 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
3163 ScaledReg)
3164 .addImm(ST.getWavefrontSizeLog2())
3165 .addReg(FrameReg);
3166
3167 const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32;
3168
3169 // TODO: Fold if use instruction is another add of a constant.
3170 if (IsVOP2 ||
3171 AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
3172 // FIXME: This can fail
3173 MIB.addImm(Offset);
3174 MIB.addReg(ScaledReg, RegState::Kill);
3175 if (!IsVOP2)
3176 MIB.addImm(0); // clamp bit
3177 } else {
3178 assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 &&
3179 "Need to reuse carry out register");
3180
3181 // Use scavenged unused carry out as offset register.
3182 Register ConstOffsetReg;
3183 if (!isWave32)
3184 ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0);
3185 else
3186 ConstOffsetReg = MIB.getReg(1);
3187
3188 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32),
3189 ConstOffsetReg)
3190 .addImm(Offset);
3191 MIB.addReg(ConstOffsetReg, RegState::Kill);
3192 MIB.addReg(ScaledReg, RegState::Kill);
3193 MIB.addImm(0); // clamp bit
3194 }
3195 }
3196 }
3197 if (!MIB || IsSALU) {
3198 // We have to produce a carry out, and there isn't a free SGPR pair
3199 // for it. We can keep the whole computation on the SALU to avoid
3200 // clobbering an additional register at the cost of an extra mov.
3201
3202 // We may have 1 free scratch SGPR even though a carry out is
3203 // unavailable. Only one additional mov is needed.
3204 Register TmpScaledReg = IsCopy && IsSALU
3205 ? ResultReg
3206 : RS->scavengeRegisterBackwards(
3207 AMDGPU::SReg_32_XM0RegClass, MI,
3208 false, 0, /*AllowSpill=*/false);
3209 Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
3210 Register TmpResultReg = ScaledReg;
3211
3212 if (!LiveSCC) {
3213 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), TmpResultReg)
3214 .addReg(FrameReg)
3215 .addImm(ST.getWavefrontSizeLog2());
3216 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpResultReg)
3217 .addReg(TmpResultReg, RegState::Kill)
3218 .addImm(Offset);
3219 } else {
3220 TmpResultReg = RS->scavengeRegisterBackwards(
3221 AMDGPU::VGPR_32RegClass, MI, false, 0, /*AllowSpill=*/true);
3222
3224 if ((Add = TII->getAddNoCarry(*MBB, MI, DL, TmpResultReg, *RS))) {
3225 BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
3226 TmpResultReg)
3227 .addImm(ST.getWavefrontSizeLog2())
3228 .addReg(FrameReg);
3229 if (Add->getOpcode() == AMDGPU::V_ADD_CO_U32_e64) {
3230 BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::S_MOV_B32), ResultReg)
3231 .addImm(Offset);
3232 Add.addReg(ResultReg, RegState::Kill)
3233 .addReg(TmpResultReg, RegState::Kill)
3234 .addImm(0);
3235 } else
3236 Add.addImm(Offset).addReg(TmpResultReg, RegState::Kill);
3237 } else {
3238 assert(Offset > 0 && isUInt<24>(2 * ST.getMaxWaveScratchSize()) &&
3239 "offset is unsafe for v_mad_u32_u24");
3240
3241 // We start with a frame pointer with a wave space value, and
3242 // an offset in lane-space. We are materializing a lane space
3243 // value. We can either do a right shift of the frame pointer
3244 // to get to lane space, or a left shift of the offset to get
3245 // to wavespace. We can right shift after the computation to
3246 // get back to the desired per-lane value. We are using the
3247 // mad_u32_u24 primarily as an add with no carry out clobber.
3248 bool IsInlinableLiteral =
3249 AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm());
3250 if (!IsInlinableLiteral) {
3251 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32),
3252 TmpResultReg)
3253 .addImm(Offset);
3254 }
3255
3256 Add = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MAD_U32_U24_e64),
3257 TmpResultReg);
3258
3259 if (!IsInlinableLiteral) {
3260 Add.addReg(TmpResultReg, RegState::Kill);
3261 } else {
3262 // We fold the offset into mad itself if its inlinable.
3263 Add.addImm(Offset);
3264 }
3265 Add.addImm(ST.getWavefrontSize()).addReg(FrameReg).addImm(0);
3266 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
3267 TmpResultReg)
3268 .addImm(ST.getWavefrontSizeLog2())
3269 .addReg(TmpResultReg);
3270 }
3271
3272 Register NewDest;
3273 if (IsCopy) {
3274 NewDest = ResultReg;
3275 } else {
3276 NewDest = RS->scavengeRegisterBackwards(
3277 AMDGPU::SReg_32_XM0RegClass, *Add, false, 0,
3278 /*AllowSpill=*/true);
3279 }
3280
3281 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
3282 NewDest)
3283 .addReg(TmpResultReg);
3284 ResultReg = NewDest;
3285 }
3286 if (!IsSALU)
3287 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
3288 .addReg(TmpResultReg, RegState::Kill);
3289 // If there were truly no free SGPRs, we need to undo everything.
3290 if (!TmpScaledReg.isValid()) {
3291 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
3292 .addReg(ScaledReg, RegState::Kill)
3293 .addImm(-Offset);
3294 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg)
3295 .addReg(FrameReg)
3296 .addImm(ST.getWavefrontSizeLog2());
3297 }
3298 }
3299 }
3300
3301 // Don't introduce an extra copy if we're just materializing in a mov.
3302 if (IsCopy) {
3303 MI->eraseFromParent();
3304 return true;
3305 }
3306 FIOp->ChangeToRegister(ResultReg, false, false, true);
3307 return false;
3308 }
3309
3310 if (IsMUBUF) {
3311 // Disable offen so we don't need a 0 vgpr base.
3312 assert(
3313 static_cast<int>(FIOperandNum) ==
3314 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr));
3315
3316 auto &SOffset = *TII->getNamedOperand(*MI, AMDGPU::OpName::soffset);
3317 assert((SOffset.isImm() && SOffset.getImm() == 0));
3318
3319 if (FrameReg != AMDGPU::NoRegister)
3320 SOffset.ChangeToRegister(FrameReg, false);
3321
3322 int64_t Offset = FrameInfo.getObjectOffset(Index);
3323 int64_t OldImm =
3324 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
3325 int64_t NewOffset = OldImm + Offset;
3326
3327 if (TII->isLegalMUBUFImmOffset(NewOffset) &&
3328 buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) {
3329 MI->eraseFromParent();
3330 return true;
3331 }
3332 }
3333
3334 // If the offset is simply too big, don't convert to a scratch wave offset
3335 // relative index.
3336
3338 if (!TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp)) {
3339 Register TmpReg =
3340 RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0);
3341 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
3342 .addImm(Offset);
3343 FIOp->ChangeToRegister(TmpReg, false, false, true);
3344 }
3345
3346 return false;
3347}
3348
3352
3354 return getEncodingValue(Reg) & AMDGPU::HWEncoding::REG_IDX_MASK;
3355}
3356
// NOTE(review): signature elided by extraction — presumably takes a
// TargetRegisterClass &RC; forwards to AMDGPU::getRegBitWidth on the class ID.
3358 return getRegBitWidth(RC.getID());
3359}
3360
// Map an exact bit width to the corresponding unaligned VGPR register class,
// or nullptr if no class of that width exists. (NOTE(review): the function
// name line was lost in extraction; presumably getAnyVGPRClassForBitWidth —
// confirm against the original file.)
3361 static const TargetRegisterClass *
3363 if (BitWidth == 64)
3364 return &AMDGPU::VReg_64RegClass;
3365 if (BitWidth == 96)
3366 return &AMDGPU::VReg_96RegClass;
3367 if (BitWidth == 128)
3368 return &AMDGPU::VReg_128RegClass;
3369 if (BitWidth == 160)
3370 return &AMDGPU::VReg_160RegClass;
3371 if (BitWidth == 192)
3372 return &AMDGPU::VReg_192RegClass;
3373 if (BitWidth == 224)
3374 return &AMDGPU::VReg_224RegClass;
3375 if (BitWidth == 256)
3376 return &AMDGPU::VReg_256RegClass;
3377 if (BitWidth == 288)
3378 return &AMDGPU::VReg_288RegClass;
3379 if (BitWidth == 320)
3380 return &AMDGPU::VReg_320RegClass;
3381 if (BitWidth == 352)
3382 return &AMDGPU::VReg_352RegClass;
3383 if (BitWidth == 384)
3384 return &AMDGPU::VReg_384RegClass;
3385 if (BitWidth == 512)
3386 return &AMDGPU::VReg_512RegClass;
3387 if (BitWidth == 1024)
3388 return &AMDGPU::VReg_1024RegClass;
3389
3390 return nullptr;
3391}
3392
// Map an exact bit width to the corresponding register-pair-aligned (Align2)
// VGPR class, or nullptr for unsupported widths. (NOTE(review): name line
// elided by extraction — presumably getAlignedVGPRClassForBitWidth.)
3393 static const TargetRegisterClass *
3395 if (BitWidth == 64)
3396 return &AMDGPU::VReg_64_Align2RegClass;
3397 if (BitWidth == 96)
3398 return &AMDGPU::VReg_96_Align2RegClass;
3399 if (BitWidth == 128)
3400 return &AMDGPU::VReg_128_Align2RegClass;
3401 if (BitWidth == 160)
3402 return &AMDGPU::VReg_160_Align2RegClass;
3403 if (BitWidth == 192)
3404 return &AMDGPU::VReg_192_Align2RegClass;
3405 if (BitWidth == 224)
3406 return &AMDGPU::VReg_224_Align2RegClass;
3407 if (BitWidth == 256)
3408 return &AMDGPU::VReg_256_Align2RegClass;
3409 if (BitWidth == 288)
3410 return &AMDGPU::VReg_288_Align2RegClass;
3411 if (BitWidth == 320)
3412 return &AMDGPU::VReg_320_Align2RegClass;
3413 if (BitWidth == 352)
3414 return &AMDGPU::VReg_352_Align2RegClass;
3415 if (BitWidth == 384)
3416 return &AMDGPU::VReg_384_Align2RegClass;
3417 if (BitWidth == 512)
3418 return &AMDGPU::VReg_512_Align2RegClass;
3419 if (BitWidth == 1024)
3420 return &AMDGPU::VReg_1024_Align2RegClass;
3421
3422 return nullptr;
3423}
3424
// Select a VGPR class for the given bit width: 1/16/32-bit cases are handled
// directly, wider cases dispatch on ST.needsAlignedVGPRs() to the aligned
// helper. (NOTE(review): the name line and the ':' branch of the ternary were
// elided by extraction — verify against the original file.)
3425const TargetRegisterClass *
3427 if (BitWidth == 1)
3428 return &AMDGPU::VReg_1RegClass;
3429 if (BitWidth == 16)
3430 return &AMDGPU::VGPR_16RegClass;
3431 if (BitWidth == 32)
3432 return &AMDGPU::VGPR_32RegClass;
3433 return ST.needsAlignedVGPRs() ? getAlignedVGPRClassForBitWidth(BitWidth)
3435}
3436
// Map a bit width (rounded up via <=) to the aligned VGPR class restricted to
// the low 256 registers, or nullptr above 1024 bits. Name inferred from the
// caller in getEquivalentVGPRClass: getAlignedLo256VGPRClassForBitWidth.
// (NOTE(review): the name line itself was elided by extraction.)
3437const TargetRegisterClass *
3439 if (BitWidth <= 32)
3440 return &AMDGPU::VGPR_32_Lo256RegClass;
3441 if (BitWidth <= 64)
3442 return &AMDGPU::VReg_64_Lo256_Align2RegClass;
3443 if (BitWidth <= 96)
3444 return &AMDGPU::VReg_96_Lo256_Align2RegClass;
3445 if (BitWidth <= 128)
3446 return &AMDGPU::VReg_128_Lo256_Align2RegClass;
3447 if (BitWidth <= 160)
3448 return &AMDGPU::VReg_160_Lo256_Align2RegClass;
3449 if (BitWidth <= 192)
3450 return &AMDGPU::VReg_192_Lo256_Align2RegClass;
3451 if (BitWidth <= 224)
3452 return &AMDGPU::VReg_224_Lo256_Align2RegClass;
3453 if (BitWidth <= 256)
3454 return &AMDGPU::VReg_256_Lo256_Align2RegClass;
3455 if (BitWidth <= 288)
3456 return &AMDGPU::VReg_288_Lo256_Align2RegClass;
3457 if (BitWidth <= 320)
3458 return &AMDGPU::VReg_320_Lo256_Align2RegClass;
3459 if (BitWidth <= 352)
3460 return &AMDGPU::VReg_352_Lo256_Align2RegClass;
3461 if (BitWidth <= 384)
3462 return &AMDGPU::VReg_384_Lo256_Align2RegClass;
3463 if (BitWidth <= 512)
3464 return &AMDGPU::VReg_512_Lo256_Align2RegClass;
3465 if (BitWidth <= 1024)
3466 return &AMDGPU::VReg_1024_Lo256_Align2RegClass;
3467
3468 return nullptr;
3469}
3470
// Map an exact bit width to the corresponding unaligned AGPR class, or
// nullptr for unsupported widths. (NOTE(review): name line elided by
// extraction — presumably getAnyAGPRClassForBitWidth.)
3471 static const TargetRegisterClass *
3473 if (BitWidth == 64)
3474 return &AMDGPU::AReg_64RegClass;
3475 if (BitWidth == 96)
3476 return &AMDGPU::AReg_96RegClass;
3477 if (BitWidth == 128)
3478 return &AMDGPU::AReg_128RegClass;
3479 if (BitWidth == 160)
3480 return &AMDGPU::AReg_160RegClass;
3481 if (BitWidth == 192)
3482 return &AMDGPU::AReg_192RegClass;
3483 if (BitWidth == 224)
3484 return &AMDGPU::AReg_224RegClass;
3485 if (BitWidth == 256)
3486 return &AMDGPU::AReg_256RegClass;
3487 if (BitWidth == 288)
3488 return &AMDGPU::AReg_288RegClass;
3489 if (BitWidth == 320)
3490 return &AMDGPU::AReg_320RegClass;
3491 if (BitWidth == 352)
3492 return &AMDGPU::AReg_352RegClass;
3493 if (BitWidth == 384)
3494 return &AMDGPU::AReg_384RegClass;
3495 if (BitWidth == 512)
3496 return &AMDGPU::AReg_512RegClass;
3497 if (BitWidth == 1024)
3498 return &AMDGPU::AReg_1024RegClass;
3499
3500 return nullptr;
3501}
3502
// Map an exact bit width to the corresponding pair-aligned (Align2) AGPR
// class, or nullptr for unsupported widths. (NOTE(review): name line elided
// by extraction — presumably getAlignedAGPRClassForBitWidth.)
3503 static const TargetRegisterClass *
3505 if (BitWidth == 64)
3506 return &AMDGPU::AReg_64_Align2RegClass;
3507 if (BitWidth == 96)
3508 return &AMDGPU::AReg_96_Align2RegClass;
3509 if (BitWidth == 128)
3510 return &AMDGPU::AReg_128_Align2RegClass;
3511 if (BitWidth == 160)
3512 return &AMDGPU::AReg_160_Align2RegClass;
3513 if (BitWidth == 192)
3514 return &AMDGPU::AReg_192_Align2RegClass;
3515 if (BitWidth == 224)
3516 return &AMDGPU::AReg_224_Align2RegClass;
3517 if (BitWidth == 256)
3518 return &AMDGPU::AReg_256_Align2RegClass;
3519 if (BitWidth == 288)
3520 return &AMDGPU::AReg_288_Align2RegClass;
3521 if (BitWidth == 320)
3522 return &AMDGPU::AReg_320_Align2RegClass;
3523 if (BitWidth == 352)
3524 return &AMDGPU::AReg_352_Align2RegClass;
3525 if (BitWidth == 384)
3526 return &AMDGPU::AReg_384_Align2RegClass;
3527 if (BitWidth == 512)
3528 return &AMDGPU::AReg_512_Align2RegClass;
3529 if (BitWidth == 1024)
3530 return &AMDGPU::AReg_1024_Align2RegClass;
3531
3532 return nullptr;
3533}
3534
// Select an AGPR class for the given bit width; 16/32-bit handled directly,
// wider widths dispatch on ST.needsAlignedVGPRs() to the aligned helper.
// (NOTE(review): name line and the ':' branch of the ternary were elided by
// extraction — verify against the original file.)
3535const TargetRegisterClass *
3537 if (BitWidth == 16)
3538 return &AMDGPU::AGPR_LO16RegClass;
3539 if (BitWidth == 32)
3540 return &AMDGPU::AGPR_32RegClass;
3541 return ST.needsAlignedVGPRs() ? getAlignedAGPRClassForBitWidth(BitWidth)
3543}
3544
// Map an exact bit width to the unaligned AV (VGPR-or-AGPR superclass)
// register class, or nullptr for unsupported widths. (NOTE(review): name line
// elided by extraction — presumably getAnyVectorSuperClassForBitWidth.)
3545 static const TargetRegisterClass *
3547 if (BitWidth == 64)
3548 return &AMDGPU::AV_64RegClass;
3549 if (BitWidth == 96)
3550 return &AMDGPU::AV_96RegClass;
3551 if (BitWidth == 128)
3552 return &AMDGPU::AV_128RegClass;
3553 if (BitWidth == 160)
3554 return &AMDGPU::AV_160RegClass;
3555 if (BitWidth == 192)
3556 return &AMDGPU::AV_192RegClass;
3557 if (BitWidth == 224)
3558 return &AMDGPU::AV_224RegClass;
3559 if (BitWidth == 256)
3560 return &AMDGPU::AV_256RegClass;
3561 if (BitWidth == 288)
3562 return &AMDGPU::AV_288RegClass;
3563 if (BitWidth == 320)
3564 return &AMDGPU::AV_320RegClass;
3565 if (BitWidth == 352)
3566 return &AMDGPU::AV_352RegClass;
3567 if (BitWidth == 384)
3568 return &AMDGPU::AV_384RegClass;
3569 if (BitWidth == 512)
3570 return &AMDGPU::AV_512RegClass;
3571 if (BitWidth == 1024)
3572 return &AMDGPU::AV_1024RegClass;
3573
3574 return nullptr;
3575}
3576
// Map an exact bit width to the pair-aligned (Align2) AV superclass, or
// nullptr for unsupported widths. (NOTE(review): name line elided by
// extraction — presumably getAlignedVectorSuperClassForBitWidth.)
3577 static const TargetRegisterClass *
3579 if (BitWidth == 64)
3580 return &AMDGPU::AV_64_Align2RegClass;
3581 if (BitWidth == 96)
3582 return &AMDGPU::AV_96_Align2RegClass;
3583 if (BitWidth == 128)
3584 return &AMDGPU::AV_128_Align2RegClass;
3585 if (BitWidth == 160)
3586 return &AMDGPU::AV_160_Align2RegClass;
3587 if (BitWidth == 192)
3588 return &AMDGPU::AV_192_Align2RegClass;
3589 if (BitWidth == 224)
3590 return &AMDGPU::AV_224_Align2RegClass;
3591 if (BitWidth == 256)
3592 return &AMDGPU::AV_256_Align2RegClass;
3593 if (BitWidth == 288)
3594 return &AMDGPU::AV_288_Align2RegClass;
3595 if (BitWidth == 320)
3596 return &AMDGPU::AV_320_Align2RegClass;
3597 if (BitWidth == 352)
3598 return &AMDGPU::AV_352_Align2RegClass;
3599 if (BitWidth == 384)
3600 return &AMDGPU::AV_384_Align2RegClass;
3601 if (BitWidth == 512)
3602 return &AMDGPU::AV_512_Align2RegClass;
3603 if (BitWidth == 1024)
3604 return &AMDGPU::AV_1024_Align2RegClass;
3605
3606 return nullptr;
3607}
3608
// Select an AV superclass for the given bit width; 32-bit handled directly,
// wider widths dispatch on ST.needsAlignedVGPRs(). (NOTE(review): name line
// and both ternary operand lines were elided by extraction — verify against
// the original file.)
3609const TargetRegisterClass *
3611 if (BitWidth == 32)
3612 return &AMDGPU::AV_32RegClass;
3613 return ST.needsAlignedVGPRs()
3616}
3617
// Chooses between the AV superclass (gfx90a+) and a fallback class for the
// given width; the fallback branch of the ternary was elided by extraction.
// (NOTE(review): function name line also elided — verify against the
// original file.)
3618const TargetRegisterClass *
3620 // TODO: In principle this should use AV classes for gfx908 too. This is
3621 // limited to 90a+ to avoid regressing special case copy optimizations which
3622 // need new handling. The core issue is that it's not possible to directly
3623 // copy between AGPRs on gfx908, and the current optimizations around that
3624 // expect to see copies to VGPR.
3625 return ST.hasGFX90AInsts() ? getVectorSuperClassForBitWidth(BitWidth)
3627}
3628
// Map an exact bit width to the corresponding SGPR register class (16-bit
// values share SReg_32), or nullptr for unsupported widths. (NOTE(review):
// name line elided by extraction — presumably getSGPRClassForBitWidth.)
3629const TargetRegisterClass *
3631 if (BitWidth == 16 || BitWidth == 32)
3632 return &AMDGPU::SReg_32RegClass;
3633 if (BitWidth == 64)
3634 return &AMDGPU::SReg_64RegClass;
3635 if (BitWidth == 96)
3636 return &AMDGPU::SGPR_96RegClass;
3637 if (BitWidth == 128)
3638 return &AMDGPU::SGPR_128RegClass;
3639 if (BitWidth == 160)
3640 return &AMDGPU::SGPR_160RegClass;
3641 if (BitWidth == 192)
3642 return &AMDGPU::SGPR_192RegClass;
3643 if (BitWidth == 224)
3644 return &AMDGPU::SGPR_224RegClass;
3645 if (BitWidth == 256)
3646 return &AMDGPU::SGPR_256RegClass;
3647 if (BitWidth == 288)
3648 return &AMDGPU::SGPR_288RegClass;
3649 if (BitWidth == 320)
3650 return &AMDGPU::SGPR_320RegClass;
3651 if (BitWidth == 352)
3652 return &AMDGPU::SGPR_352RegClass;
3653 if (BitWidth == 384)
3654 return &AMDGPU::SGPR_384RegClass;
3655 if (BitWidth == 512)
3656 return &AMDGPU::SGPR_512RegClass;
3657 if (BitWidth == 1024)
3658 return &AMDGPU::SGPR_1024RegClass;
3659
3660 return nullptr;
3661}
3662
// Returns true if Reg's register class (virtual: from MRI; physical: the
// phys-reg base class) is an SGPR class. (NOTE(review): the first signature
// line was elided by extraction — presumably SIRegisterInfo::isSGPRReg.)
3664 Register Reg) const {
3665 const TargetRegisterClass *RC;
3666 if (Reg.isVirtual())
3667 RC = MRI.getRegClass(Reg);
3668 else
3669 RC = getPhysRegBaseClass(Reg);
3670 return RC && isSGPRClass(RC);
3671}
3672
// Returns the VGPR class of the same size as *SRC; the Lo256 VS classes map
// to the Lo256-restricted aligned VGPR classes. (NOTE(review): name line
// elided by extraction — presumably getEquivalentVGPRClass.)
3673const TargetRegisterClass *
3675 unsigned Size = getRegSizeInBits(*SRC);
3676
3677 switch (SRC->getID()) {
3678 default:
3679 break;
3680 case AMDGPU::VS_32_Lo256RegClassID:
3681 case AMDGPU::VS_64_Lo256RegClassID:
3682 return getAllocatableClass(getAlignedLo256VGPRClassForBitWidth(Size));
3683 }
3684
3685 const TargetRegisterClass *VRC =
3686 getAllocatableClass(getVGPRClassForBitWidth(Size));
3687 assert(VRC && "Invalid register class size");
3688 return VRC;
3689}
3690
// Returns the same-size AGPR class for *SRC. (NOTE(review): the name line and
// the line that initializes ARC were elided by extraction — verify against
// the original file.)
3691const TargetRegisterClass *
3693 unsigned Size = getRegSizeInBits(*SRC);
3695 assert(ARC && "Invalid register class size");
3696 return ARC;
3697}
3698
// Returns the same-size AV superclass for *SRC. (NOTE(review): the name line
// and the line that initializes ARC were elided by extraction — verify
// against the original file.)
3699const TargetRegisterClass *
3701 unsigned Size = getRegSizeInBits(*SRC);
3703 assert(ARC && "Invalid register class size");
3704 return ARC;
3705}
3706
// Returns the same-size SGPR class for *VRC; 32-bit maps directly to
// SGPR_32. (NOTE(review): the name line and the line that initializes SRC
// were elided by extraction — verify against the original file.)
3707const TargetRegisterClass *
3709 unsigned Size = getRegSizeInBits(*VRC);
3710 if (Size == 32)
3711 return &AMDGPU::SGPR_32RegClass;
3713 assert(SRC && "Invalid register class size");
3714 return SRC;
3715}
3716
// Returns the matching super-register class for (SuperRC, SubRC, SubIdx) only
// when it is a subclass of SuperRC (i.e. the subreg index is aligned in the
// super register); otherwise nullptr. (NOTE(review): the name/first-parameter
// line was elided by extraction.)
3717const TargetRegisterClass *
3719 const TargetRegisterClass *SubRC,
3720 unsigned SubIdx) const {
3721 // Ensure this subregister index is aligned in the super register.
3722 const TargetRegisterClass *MatchRC =
3723 getMatchingSuperRegClass(SuperRC, SubRC, SubIdx);
3724 return MatchRC && MatchRC->hasSubClassEq(SuperRC) ? MatchRC : nullptr;
3726
// Whether an operand of the given type can take an inline constant. An early
// return covers a subtarget with the MFMA inline-literal bug; otherwise the
// type must fall in [OPERAND_SRC_FIRST, OPERAND_SRC_LAST]. (NOTE(review): the
// condition guarding the first return was elided by extraction — verify.)
3727bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
3730 return !ST.hasMFMAInlineLiteralBug();
3731
3732 return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
3733 OpType <= AMDGPU::OPERAND_SRC_LAST;
3734}
3735
// Whether an operand of the given type can take a literal constant: the type
// must be at least OPERAND_REG_IMM_FIRST. (NOTE(review): the second half of
// the range check was elided by extraction — verify against the original.)
3736bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const {
3737 // TODO: 64-bit operands have extending behavior from 32-bit literal.
3738 return OpType >= AMDGPU::OPERAND_REG_IMM_FIRST &&
3740}
3741
3742/// Returns a lowest register that is not used at any point in the function.
3743/// If all registers are used, then this function will return
3744/// AMDGPU::NoRegister. If \p ReserveHighestRegister = true, then return
3745/// highest unused register.
// NOTE(review): the function-name line (presumably
// SIRegisterInfo::findUnusedRegister) was elided by extraction.
3747 const MachineRegisterInfo &MRI, const TargetRegisterClass *RC,
3748 const MachineFunction &MF, bool ReserveHighestRegister) const {
// Scan the class forwards (lowest-first) or backwards (highest-first) and
// return the first allocatable register that is never used in the function.
3749 if (ReserveHighestRegister) {
3750 for (MCRegister Reg : reverse(*RC))
3751 if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
3752 return Reg;
3753 } else {
3754 for (MCRegister Reg : *RC)
3755 if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
3756 return Reg;
3757 }
// No unused register in the class: default-constructed MCRegister (NoRegister).
3758 return MCRegister();
3759}
3760
// True when Reg's register bank is known and is not divergent. (NOTE(review):
// the first signature line was elided by extraction — presumably
// SIRegisterInfo::isUniformReg(const MachineRegisterInfo &MRI, ...).)
3762 const RegisterBankInfo &RBI,
3763 Register Reg) const {
3764 auto *RB = RBI.getRegBank(Reg, MRI, *MRI.getTargetRegisterInfo());
3765 if (!RB)
3766 return false;
3767
3768 return !RBI.isDivergentRegBank(RB);
3769}
3770
// Returns the precomputed subreg-index table for splitting a register of
// class *RC into EltSize-byte elements. Sizes are counted in 16-bit halves;
// the table row is selected by element size and truncated to the number of
// parts that fit in the register. (NOTE(review): first signature line elided
// by extraction — presumably SIRegisterInfo::getRegSplitParts.)
3772 unsigned EltSize) const {
3773 const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC);
3774 assert(RegBitWidth >= 32 && RegBitWidth <= 1024 && EltSize >= 2);
3775
3776 const unsigned RegHalves = RegBitWidth / 16;
3777 const unsigned EltHalves = EltSize / 2;
3778 assert(RegSplitParts.size() + 1 >= EltHalves);
3779
3780 const std::vector<int16_t> &Parts = RegSplitParts[EltHalves - 1];
3781 const unsigned NumParts = RegHalves / EltHalves;
3782
3783 return ArrayRef(Parts.data(), NumParts);
3784}
3785
// Returns Reg's class: from MRI for virtual registers, the phys-reg base
// class otherwise. (NOTE(review): signature lines elided by extraction.)
3788 Register Reg) const {
3789 return Reg.isVirtual() ? MRI.getRegClass(Reg) : getPhysRegBaseClass(Reg);
3790}
3791
// Returns the register class of MO's register narrowed to MO's subregister
// index. (NOTE(review): the name line was elided by extraction.)
3792const TargetRegisterClass *
3794 const MachineOperand &MO) const {
3795 const TargetRegisterClass *SrcRC = getRegClassForReg(MRI, MO.getReg());
3796 return getSubRegisterClass(SrcRC, MO.getSubReg());
3797}
3798
// True when Reg's class is a VGPR class. (NOTE(review): first signature line
// elided by extraction — presumably SIRegisterInfo::isVGPR.)
3800 Register Reg) const {
3801 const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
3802 // Registers without classes are unaddressable, SGPR-like registers.
3803 return RC && isVGPRClass(RC);
3804}
3805
// True when Reg's class is an AGPR class. (NOTE(review): first signature line
// elided by extraction — presumably SIRegisterInfo::isAGPR.)
3807 Register Reg) const {
3808 const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
3809
3810 // Registers without classes are unaddressable, SGPR-like registers.
3811 return RC && isAGPRClass(RC);
3812}
3813
// Register pressure limit for a class: VGPR_32 and SGPR_32 are capped by the
// minimum of the occupancy-derived maximum and the per-function maximum; all
// other classes use the TableGen default. (NOTE(review): the first signature
// line and one argument line of the VGPR getMaxNumVGPRs call were elided by
// extraction — verify against the original file.)
3815 MachineFunction &MF) const {
3816 unsigned MinOcc = ST.getOccupancyWithWorkGroupSizes(MF).first;
3817 switch (RC->getID()) {
3818 default:
3819 return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF);
3820 case AMDGPU::VGPR_32RegClassID:
3821 return std::min(
3822 ST.getMaxNumVGPRs(
3823 MinOcc,
3825 ST.getMaxNumVGPRs(MF));
3826 case AMDGPU::SGPR_32RegClassID:
3827 case AMDGPU::SGPR_LO16RegClassID:
3828 return std::min(ST.getMaxNumSGPRs(MinOcc, true), ST.getMaxNumSGPRs(MF));
3829 }
3830}
3831
3833 unsigned Idx) const {
3834 switch (static_cast<AMDGPU::RegisterPressureSets>(Idx)) {
3835 case AMDGPU::RegisterPressureSets::VGPR_32:
3836 case AMDGPU::RegisterPressureSets::AGPR_32:
3837 return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
3838 const_cast<MachineFunction &>(MF));
3839 case AMDGPU::RegisterPressureSets::SReg_32:
3840 return getRegPressureLimit(&AMDGPU::SGPR_32RegClass,
3841 const_cast<MachineFunction &>(MF));
3842 }
3843
3844 llvm_unreachable("Unexpected register pressure set!");
3845}
3846
3847const int *SIRegisterInfo::getRegUnitPressureSets(MCRegUnit RegUnit) const {
3848 static const int Empty[] = { -1 };
3849
3850 if (RegPressureIgnoredUnits[static_cast<unsigned>(RegUnit)])
3851 return Empty;
3852
3853 return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit);
3854}
3855
3857 ArrayRef<MCPhysReg> Order,
3859 const MachineFunction &MF,
3860 const VirtRegMap *VRM,
3861 const LiveRegMatrix *Matrix) const {
3862
3863 const MachineRegisterInfo &MRI = MF.getRegInfo();
3864 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3865
3866 std::pair<unsigned, Register> Hint = MRI.getRegAllocationHint(VirtReg);
3867
3868 switch (Hint.first) {
3869 case AMDGPURI::Size32: {
3870 Register Paired = Hint.second;
3871 assert(Paired);
3872 Register PairedPhys;
3873 if (Paired.isPhysical()) {
3874 PairedPhys =
3875 getMatchingSuperReg(Paired, AMDGPU::lo16, &AMDGPU::VGPR_32RegClass);
3876 } else if (VRM && VRM->hasPhys(Paired)) {
3877 PairedPhys = getMatchingSuperReg(VRM->getPhys(Paired), AMDGPU::lo16,
3878 &AMDGPU::VGPR_32RegClass);
3879 }
3880
3881 // Prefer the paired physreg.
3882 if (PairedPhys)
3883 // isLo(Paired) is implicitly true here from the API of
3884 // getMatchingSuperReg.
3885 Hints.push_back(PairedPhys);
3886 return false;
3887 }
3888 case AMDGPURI::Size16: {
3889 Register Paired = Hint.second;
3890 assert(Paired);
3891 Register PairedPhys;
3892 if (Paired.isPhysical()) {
3893 PairedPhys = TRI->getSubReg(Paired, AMDGPU::lo16);
3894 } else if (VRM && VRM->hasPhys(Paired)) {
3895 PairedPhys = TRI->getSubReg(VRM->getPhys(Paired), AMDGPU::lo16);
3896 }
3897
3898 // First prefer the paired physreg.
3899 if (PairedPhys)
3900 Hints.push_back(PairedPhys);
3901 else {
3902 // Add all the lo16 physregs.
3903 // When the Paired operand has not yet been assigned a physreg it is
3904 // better to try putting VirtReg in a lo16 register, because possibly
3905 // later Paired can be assigned to the overlapping register and the COPY
3906 // can be eliminated.
3907 for (MCPhysReg PhysReg : Order) {
3908 if (PhysReg == PairedPhys || AMDGPU::isHi16Reg(PhysReg, *this))
3909 continue;
3910 if (AMDGPU::VGPR_16RegClass.contains(PhysReg) &&
3911 !MRI.isReserved(PhysReg))
3912 Hints.push_back(PhysReg);
3913 }
3914 }
3915 return false;
3916 }
3917 default:
3918 return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
3919 VRM);
3920 }
3921}
3922
3924 // Not a callee saved register.
3925 return AMDGPU::SGPR30_SGPR31;
3926}
3927
3928const TargetRegisterClass *
3930 const RegisterBank &RB) const {
3931 switch (RB.getID()) {
3932 case AMDGPU::VGPRRegBankID:
3934 std::max(ST.useRealTrue16Insts() ? 16u : 32u, Size));
3935 case AMDGPU::VCCRegBankID:
3936 assert(Size == 1);
3937 return getWaveMaskRegClass();
3938 case AMDGPU::SGPRRegBankID:
3939 return getSGPRClassForBitWidth(std::max(32u, Size));
3940 case AMDGPU::AGPRRegBankID:
3941 return getAGPRClassForBitWidth(std::max(32u, Size));
3942 default:
3943 llvm_unreachable("unknown register bank");
3944 }
3945}
3946
3947const TargetRegisterClass *
3949 const MachineRegisterInfo &MRI) const {
3950 const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg());
3951 if (const RegisterBank *RB = dyn_cast<const RegisterBank *>(RCOrRB))
3952 return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB);
3953
3954 if (const auto *RC = dyn_cast<const TargetRegisterClass *>(RCOrRB))
3955 return getAllocatableClass(RC);
3956
3957 return nullptr;
3958}
3959
3961 return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
3962}
3963
3965 return isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
3966}
3967
3969 // VGPR tuples have an alignment requirement on gfx90a variants.
3970 return ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass
3971 : &AMDGPU::VReg_64RegClass;
3972}
3973
3974// Find reaching register definition
3978 LiveIntervals *LIS) const {
3979 auto &MDT = LIS->getDomTree();
3980 SlotIndex UseIdx = LIS->getInstructionIndex(Use);
3981 SlotIndex DefIdx;
3982
3983 if (Reg.isVirtual()) {
3984 if (!LIS->hasInterval(Reg))
3985 return nullptr;
3986 LiveInterval &LI = LIS->getInterval(Reg);
3987 LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg)
3988 : MRI.getMaxLaneMaskForVReg(Reg);
3989 VNInfo *V = nullptr;
3990 if (LI.hasSubRanges()) {
3991 for (auto &S : LI.subranges()) {
3992 if ((S.LaneMask & SubLanes) == SubLanes) {
3993 V = S.getVNInfoAt(UseIdx);
3994 break;
3995 }
3996 }
3997 } else {
3998 V = LI.getVNInfoAt(UseIdx);
3999 }
4000 if (!V)
4001 return nullptr;
4002 DefIdx = V->def;
4003 } else {
4004 // Find last def.
4005 for (MCRegUnit Unit : regunits(Reg.asMCReg())) {
4006 LiveRange &LR = LIS->getRegUnit(Unit);
4007 if (VNInfo *V = LR.getVNInfoAt(UseIdx)) {
4008 if (!DefIdx.isValid() ||
4009 MDT.dominates(LIS->getInstructionFromIndex(DefIdx),
4010 LIS->getInstructionFromIndex(V->def)))
4011 DefIdx = V->def;
4012 } else {
4013 return nullptr;
4014 }
4015 }
4016 }
4017
4018 MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx);
4019
4020 if (!Def || !MDT.dominates(Def, &Use))
4021 return nullptr;
4022
4023 assert(Def->modifiesRegister(Reg, this));
4024
4025 return Def;
4026}
4027
4029 assert(getRegSizeInBits(*getPhysRegBaseClass(Reg)) <= 32);
4030
4031 for (const TargetRegisterClass &RC : { AMDGPU::VGPR_32RegClass,
4032 AMDGPU::SReg_32RegClass,
4033 AMDGPU::AGPR_32RegClass } ) {
4034 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::lo16, &RC))
4035 return Super;
4036 }
4037 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::hi16,
4038 &AMDGPU::VGPR_32RegClass)) {
4039 return Super;
4040 }
4041
4042 return AMDGPU::NoRegister;
4043}
4044
4046 if (!ST.needsAlignedVGPRs())
4047 return true;
4048
4049 if (isVGPRClass(&RC))
4050 return RC.hasSuperClassEq(getVGPRClassForBitWidth(getRegSizeInBits(RC)));
4051 if (isAGPRClass(&RC))
4052 return RC.hasSuperClassEq(getAGPRClassForBitWidth(getRegSizeInBits(RC)));
4053 if (isVectorSuperClass(&RC))
4054 return RC.hasSuperClassEq(
4055 getVectorSuperClassForBitWidth(getRegSizeInBits(RC)));
4056
4057 assert(&RC != &AMDGPU::VS_64RegClass);
4058
4059 return true;
4060}
4061
4064 return ArrayRef(AMDGPU::SGPR_128RegClass.begin(), ST.getMaxNumSGPRs(MF) / 4);
4065}
4066
4069 return ArrayRef(AMDGPU::SGPR_64RegClass.begin(), ST.getMaxNumSGPRs(MF) / 2);
4070}
4071
4074 return ArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF));
4075}
4076
4077unsigned
4079 unsigned SubReg) const {
4080 switch (RC->TSFlags & SIRCFlags::RegKindMask) {
4081 case SIRCFlags::HasSGPR:
4082 return std::min(128u, getSubRegIdxSize(SubReg));
4083 case SIRCFlags::HasAGPR:
4084 case SIRCFlags::HasVGPR:
4086 return std::min(32u, getSubRegIdxSize(SubReg));
4087 default:
4088 break;
4089 }
4090 return 0;
4091}
4092
4094 const TargetRegisterClass &RC,
4095 bool IncludeCalls) const {
4096 unsigned NumArchVGPRs = ST.getAddressableNumArchVGPRs();
4098 (RC.getID() == AMDGPU::VGPR_32RegClassID)
4099 ? RC.getRegisters().take_front(NumArchVGPRs)
4100 : RC.getRegisters();
4101 for (MCPhysReg Reg : reverse(Registers))
4102 if (MRI.isPhysRegUsed(Reg, /*SkipRegMaskTest=*/!IncludeCalls))
4103 return getHWRegIndex(Reg) + 1;
4104 return 0;
4105}
4106
4109 const MachineFunction &MF) const {
4111 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
4112 if (FuncInfo->checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
4113 RegFlags.push_back("WWM_REG");
4114 return RegFlags;
4115}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static const Function * getParent(const Value *V)
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Instruction::BinaryOps, Value * > OffsetOp
Find all possible pairs (BinOp, RHS) that BinOp V, RHS can be simplified.
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
Live Register Matrix
A set of register units.
#define I(x, y, z)
Definition MD5.cpp:57
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
if(PassOpts->AAPipeline)
This file declares the machine register scavenger class.
SI Pre allocate WWM Registers
static int getOffenMUBUFStore(unsigned Opc)
static const TargetRegisterClass * getAnyAGPRClassForBitWidth(unsigned BitWidth)
static int getOffsetMUBUFLoad(unsigned Opc)
static const std::array< unsigned, 17 > SubRegFromChannelTableWidthMap
static unsigned getNumSubRegsForSpillOp(const MachineInstr &MI, const SIInstrInfo *TII)
static void emitUnsupportedError(const Function &Fn, const MachineInstr &MI, const Twine &ErrMsg)
static const TargetRegisterClass * getAlignedAGPRClassForBitWidth(unsigned BitWidth)
static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST, MachineFrameInfo &MFI, MachineBasicBlock::iterator MI, int Index, int64_t Offset)
static unsigned getFlatScratchSpillOpcode(const SIInstrInfo *TII, unsigned LoadStoreOp, unsigned EltSize)
static const TargetRegisterClass * getAlignedVGPRClassForBitWidth(unsigned BitWidth)
static int getOffsetMUBUFStore(unsigned Opc)
static const TargetRegisterClass * getAnyVGPRClassForBitWidth(unsigned BitWidth)
static cl::opt< bool > EnableSpillSGPRToVGPR("amdgpu-spill-sgpr-to-vgpr", cl::desc("Enable spilling SGPRs to VGPRs"), cl::ReallyHidden, cl::init(true))
static const TargetRegisterClass * getAlignedVectorSuperClassForBitWidth(unsigned BitWidth)
static const TargetRegisterClass * getAnyVectorSuperClassForBitWidth(unsigned BitWidth)
static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, int Index, unsigned Lane, unsigned ValueReg, bool IsKill)
static bool isFIPlusImmOrVGPR(const SIRegisterInfo &TRI, const MachineInstr &MI)
static int getOffenMUBUFLoad(unsigned Opc)
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:487
LocallyHashedType DenseMapInfo< LocallyHashedType >::Empty
static const char * getRegisterName(MCRegister Reg)
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
bool test(unsigned Idx) const
Definition BitVector.h:480
bool empty() const
empty - Tests whether there are no bits in this bitvector.
Definition BitVector.h:175
A debug info location.
Definition DebugLoc.h:123
Diagnostic information for unsupported feature in backend.
Register getReg() const
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:272
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LiveInterval - This class represents the liveness of a register, or stack slot.
bool hasSubRanges() const
Returns true if subregister liveness information is available.
iterator_range< subrange_iterator > subranges()
void removeAllRegUnitsForPhysReg(MCRegister Reg)
Remove associated live ranges for the register units associated with Reg.
bool hasInterval(Register Reg) const
MachineInstr * getInstructionFromIndex(SlotIndex index) const
Returns the instruction associated with the given index.
MachineDominatorTree & getDomTree()
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
LiveRange & getRegUnit(MCRegUnit Unit)
Return the live range for register unit Unit.
This class represents the liveness of a register, stack slot, etc.
VNInfo * getVNInfoAt(SlotIndex Idx) const
getVNInfoAt - Return the VNInfo that is live at Idx, or NULL.
A set of register units used to track register liveness.
bool available(MCRegister Reg) const
Returns true if no part of physical register Reg is live.
Describe properties that are true of each instruction in the target description file.
MCRegAliasIterator enumerates all registers aliasing Reg.
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
static MCRegister from(unsigned Val)
Check the provided unsigned value is a valid MCRegister.
Definition MCRegister.h:77
Generic base class for all target subtargets.
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
bool hasCalls() const
Return true if the current function has any function calls.
Align getObjectAlign(int ObjectIdx) const
Return the alignment of the specified stack object.
bool hasStackObjects() const
Return true if there are any stack objects in this function.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addUse(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
void setAsmPrinterFlag(uint8_t Flag)
Set a flag for the AsmPrinter.
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
void setImm(int64_t immVal)
int64_t getImm() const
LLVM_ABI void setIsRenamable(bool Val=true)
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
LLVM_ABI void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
void setIsKill(bool Val=true)
LLVM_ABI void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
Register getReg() const
getReg - Returns the register number.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
const RegClassOrRegBank & getRegClassOrRegBank(Register Reg) const
Return the register bank or register class of Reg.
bool isReserved(MCRegister PhysReg) const
isReserved - Returns true when PhysReg is a reserved register.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
bool isAllocatable(MCRegister PhysReg) const
isAllocatable - Returns true when PhysReg belongs to an allocatable register class and it hasn't been...
std::pair< unsigned, Register > getRegAllocationHint(Register VReg) const
getRegAllocationHint - Return the register allocation hint for the specified virtual register.
const TargetRegisterInfo * getTargetRegisterInfo() const
LLVM_ABI LaneBitmask getMaxLaneMaskForVReg(Register Reg) const
Returns a mask covering all bits that can appear in lane masks of subregisters of the virtual registe...
LLVM_ABI bool isPhysRegUsed(MCRegister PhysReg, bool SkipRegMaskTest=false) const
Return true if the specified register is modified or read in this function.
Holds all the information related to register banks.
virtual bool isDivergentRegBank(const RegisterBank *RB) const
Returns true if the register bank is considered divergent.
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
unsigned getID() const
Get the identifier of this register bank.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
static bool isFLATScratch(const MachineInstr &MI)
static bool isMUBUF(const MachineInstr &MI)
static bool isVOP3(const MCInstrDesc &Desc)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
ArrayRef< MCPhysReg > getAGPRSpillVGPRs() const
MCPhysReg getVGPRToAGPRSpill(int FrameIndex, unsigned Lane) const
Register getScratchRSrcReg() const
Returns the physical register reserved for use as the resource descriptor for scratch accesses.
ArrayRef< MCPhysReg > getVGPRSpillAGPRs() const
ArrayRef< SIRegisterInfo::SpilledReg > getSGPRSpillToVirtualVGPRLanes(int FrameIndex) const
uint32_t getMaskForVGPRBlockOps(Register RegisterBlock) const
ArrayRef< SIRegisterInfo::SpilledReg > getSGPRSpillToPhysicalVGPRLanes(int FrameIndex) const
bool checkFlag(Register Reg, uint8_t Flag) const
const ReservedRegSet & getWWMReservedRegs() const
Register materializeFrameBaseRegister(MachineBasicBlock *MBB, int FrameIdx, int64_t Offset) const override
int64_t getScratchInstrOffset(const MachineInstr *MI) const
bool isFrameOffsetLegal(const MachineInstr *MI, Register BaseReg, int64_t Offset) const override
const TargetRegisterClass * getCompatibleSubRegClass(const TargetRegisterClass *SuperRC, const TargetRegisterClass *SubRC, unsigned SubIdx) const
Returns a register class which is compatible with SuperRC, such that a subregister exists with class ...
ArrayRef< MCPhysReg > getAllSGPR64(const MachineFunction &MF) const
Return all SGPR64 which satisfy the waves per execution unit requirement of the subtarget.
MCRegister findUnusedRegister(const MachineRegisterInfo &MRI, const TargetRegisterClass *RC, const MachineFunction &MF, bool ReserveHighestVGPR=false) const
Returns a lowest register that is not used at any point in the function.
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
MCPhysReg get32BitRegister(MCPhysReg Reg) const
const uint32_t * getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override
bool requiresFrameIndexReplacementScavenging(const MachineFunction &MF) const override
bool shouldRealignStack(const MachineFunction &MF) const override
bool restoreSGPR(MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, SlotIndexes *Indexes=nullptr, LiveIntervals *LIS=nullptr, bool OnlyToVGPR=false, bool SpillToPhysVGPRLane=false) const
bool isProperlyAlignedRC(const TargetRegisterClass &RC) const
const TargetRegisterClass * getEquivalentVGPRClass(const TargetRegisterClass *SRC) const
Register getFrameRegister(const MachineFunction &MF) const override
LLVM_READONLY const TargetRegisterClass * getVectorSuperClassForBitWidth(unsigned BitWidth) const
bool spillEmergencySGPR(MachineBasicBlock::iterator MI, MachineBasicBlock &RestoreMBB, Register SGPR, RegScavenger *RS) const
SIRegisterInfo(const GCNSubtarget &ST)
const uint32_t * getAllVGPRRegMask() const
MCRegister getReturnAddressReg(const MachineFunction &MF) const
const MCPhysReg * getCalleeSavedRegs(const MachineFunction *MF) const override
bool hasBasePointer(const MachineFunction &MF) const
const TargetRegisterClass * getCrossCopyRegClass(const TargetRegisterClass *RC) const override
Returns a legal register class to copy a register in the specified class to or from.
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
ArrayRef< MCPhysReg > getAllSGPR32(const MachineFunction &MF) const
Return all SGPR32 which satisfy the waves per execution unit requirement of the subtarget.
const TargetRegisterClass * getLargestLegalSuperClass(const TargetRegisterClass *RC, const MachineFunction &MF) const override
MCRegister reservedPrivateSegmentBufferReg(const MachineFunction &MF) const
Return the end register initially reserved for the scratch buffer in case spilling is needed.
bool eliminateSGPRToVGPRSpillFrameIndex(MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, SlotIndexes *Indexes=nullptr, LiveIntervals *LIS=nullptr, bool SpillToPhysVGPRLane=false) const
Special case of eliminateFrameIndex.
bool isVGPR(const MachineRegisterInfo &MRI, Register Reg) const
void buildSpillLoadStore(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, unsigned LoadStoreOp, int Index, Register ValueReg, bool ValueIsKill, MCRegister ScratchOffsetReg, int64_t InstrOffset, MachineMemOperand *MMO, RegScavenger *RS, LiveRegUnits *LiveUnits=nullptr) const
bool isAsmClobberable(const MachineFunction &MF, MCRegister PhysReg) const override
LLVM_READONLY const TargetRegisterClass * getAGPRClassForBitWidth(unsigned BitWidth) const
static bool isChainScratchRegister(Register VGPR)
bool requiresRegisterScavenging(const MachineFunction &Fn) const override
bool opCanUseInlineConstant(unsigned OpType) const
const TargetRegisterClass * getRegClassForSizeOnBank(unsigned Size, const RegisterBank &Bank) const
const TargetRegisterClass * getConstrainedRegClassForOperand(const MachineOperand &MO, const MachineRegisterInfo &MRI) const override
bool isUniformReg(const MachineRegisterInfo &MRI, const RegisterBankInfo &RBI, Register Reg) const override
const uint32_t * getNoPreservedMask() const override
StringRef getRegAsmName(MCRegister Reg) const override
const uint32_t * getAllAllocatableSRegMask() const
MCRegister getAlignedHighSGPRForRC(const MachineFunction &MF, const unsigned Align, const TargetRegisterClass *RC) const
Return the largest available SGPR aligned to Align for the register class RC.
const TargetRegisterClass * getRegClassForReg(const MachineRegisterInfo &MRI, Register Reg) const
unsigned getHWRegIndex(MCRegister Reg) const
const MCPhysReg * getCalleeSavedRegsViaCopy(const MachineFunction *MF) const
const uint32_t * getAllVectorRegMask() const
const TargetRegisterClass * getEquivalentAGPRClass(const TargetRegisterClass *SRC) const
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
const TargetRegisterClass * getPointerRegClass(unsigned Kind=0) const override
const TargetRegisterClass * getRegClassForTypeOnBank(LLT Ty, const RegisterBank &Bank) const
bool opCanUseLiteralConstant(unsigned OpType) const
Register getBaseRegister() const
bool getRegAllocationHints(Register VirtReg, ArrayRef< MCPhysReg > Order, SmallVectorImpl< MCPhysReg > &Hints, const MachineFunction &MF, const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const override
LLVM_READONLY const TargetRegisterClass * getAlignedLo256VGPRClassForBitWidth(unsigned BitWidth) const
LLVM_READONLY const TargetRegisterClass * getVGPRClassForBitWidth(unsigned BitWidth) const
const TargetRegisterClass * getEquivalentAVClass(const TargetRegisterClass *SRC) const
bool requiresFrameIndexScavenging(const MachineFunction &MF) const override
static bool isVGPRClass(const TargetRegisterClass *RC)
MachineInstr * findReachingDef(Register Reg, unsigned SubReg, MachineInstr &Use, MachineRegisterInfo &MRI, LiveIntervals *LIS) const
bool isSGPRReg(const MachineRegisterInfo &MRI, Register Reg) const
const TargetRegisterClass * getEquivalentSGPRClass(const TargetRegisterClass *VRC) const
SmallVector< StringLiteral > getVRegFlagsOfReg(Register Reg, const MachineFunction &MF) const override
LLVM_READONLY const TargetRegisterClass * getDefaultVectorSuperClassForBitWidth(unsigned BitWidth) const
unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override
ArrayRef< MCPhysReg > getAllSGPR128(const MachineFunction &MF) const
Return all SGPR128 which satisfy the waves per execution unit requirement of the subtarget.
unsigned getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx) const override
BitVector getReservedRegs(const MachineFunction &MF) const override
bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override
const TargetRegisterClass * getRegClassForOperandReg(const MachineRegisterInfo &MRI, const MachineOperand &MO) const
void addImplicitUsesForBlockCSRLoad(MachineInstrBuilder &MIB, Register BlockReg) const
unsigned getNumUsedPhysRegs(const MachineRegisterInfo &MRI, const TargetRegisterClass &RC, bool IncludeCalls=true) const
const uint32_t * getAllAGPRRegMask() const
const int * getRegUnitPressureSets(MCRegUnit RegUnit) const override
bool isAGPR(const MachineRegisterInfo &MRI, Register Reg) const
bool eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, unsigned FIOperandNum, RegScavenger *RS) const override
bool spillSGPR(MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, SlotIndexes *Indexes=nullptr, LiveIntervals *LIS=nullptr, bool OnlyToVGPR=false, bool SpillToPhysVGPRLane=false) const
If OnlyToVGPR is true, this will only succeed if this manages to find a free VGPR lane to spill.
MCRegister getExec() const
MCRegister getVCC() const
int64_t getFrameIndexInstrOffset(const MachineInstr *MI, int Idx) const override
bool isVectorSuperClass(const TargetRegisterClass *RC) const
const TargetRegisterClass * getWaveMaskRegClass() const
unsigned getSubRegAlignmentNumBits(const TargetRegisterClass *RC, unsigned SubReg) const
void resolveFrameIndex(MachineInstr &MI, Register BaseReg, int64_t Offset) const override
bool requiresVirtualBaseRegisters(const MachineFunction &Fn) const override
const TargetRegisterClass * getVGPR64Class() const
void buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index, int Offset, bool IsLoad, bool IsKill=true) const
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
SlotIndex - An opaque wrapper around machine indexes.
Definition SlotIndexes.h:66
bool isValid() const
Returns true if this is a valid index.
SlotIndexes pass.
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
SlotIndex replaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
ReplaceMachineInstrInMaps - Replacing a machine instr with a new one in maps used by register allocat...
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
bool hasFP(const MachineFunction &MF) const
hasFP - Return true if the specified function should have a dedicated frame pointer register.
const uint8_t TSFlags
Configurable target specific flags.
ArrayRef< MCPhysReg > getRegisters() const
unsigned getID() const
Return the register class ID number.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
virtual const TargetRegisterClass * getLargestLegalSuperClass(const TargetRegisterClass *RC, const MachineFunction &) const
Returns the largest super class of RC that is legal to use in the current sub-target and has the same...
virtual bool shouldRealignStack(const MachineFunction &MF) const
True if storage within the function requires the stack pointer to be aligned more than the normal cal...
virtual bool getRegAllocationHints(Register VirtReg, ArrayRef< MCPhysReg > Order, SmallVectorImpl< MCPhysReg > &Hints, const MachineFunction &MF, const VirtRegMap *VRM=nullptr, const LiveRegMatrix *Matrix=nullptr) const
Get a list of 'hint' registers that the register allocator should try first when allocating a physica...
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
VNInfo - Value Number Information.
MCRegister getPhys(Register virtReg) const
returns the physical register mapped to the specified virtual register
Definition VirtRegMap.h:91
bool hasPhys(Register virtReg) const
returns true if the specified virtual register is mapped to a physical register
Definition VirtRegMap.h:87
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ PRIVATE_ADDRESS
Address space for private memory.
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
@ OPERAND_REG_IMM_FIRST
Definition SIDefines.h:253
@ OPERAND_REG_INLINE_AC_FIRST
Definition SIDefines.h:259
@ OPERAND_REG_INLINE_AC_LAST
Definition SIDefines.h:260
@ OPERAND_REG_IMM_LAST
Definition SIDefines.h:254
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
LLVM_READONLY int32_t getFlatScratchInstSVfromSVS(uint32_t Opcode)
LLVM_READONLY int32_t getFlatScratchInstSVfromSS(uint32_t Opcode)
LLVM_READONLY int32_t getFlatScratchInstSTfromSS(uint32_t Opcode)
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ Cold
Attempts to make code in the caller as efficient as possible under the assumption that the call is no...
Definition CallingConv.h:47
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
@ Offset
Definition DWP.cpp:532
PointerUnion< const TargetRegisterClass *, const RegisterBank * > RegClassOrRegBank
Convenient type to represent either a register class or a register bank.
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1669
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
RegState
Flags to represent properties of register accesses.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ Define
Register definition.
@ Renamable
Register that may be renamed.
constexpr RegState getKillRegState(bool B)
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
Definition MathExtras.h:546
Op::Description Desc
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
@ HasSGPR
Definition SIDefines.h:26
@ HasVGPR
Definition SIDefines.h:24
@ RegKindMask
Definition SIDefines.h:29
@ HasAGPR
Definition SIDefines.h:25
constexpr RegState getDefRegState(bool B)
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
constexpr bool hasRegState(RegState Value, RegState Test)
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
void call_once(once_flag &flag, Function &&F, Args &&... ArgList)
Execute the function specified as a parameter once.
Definition Threading.h:86
constexpr unsigned BitWidth
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
static const MachineMemOperand::Flags MOThreadPrivate
Mark the MMO of accesses to memory locations that are never written to by other threads.
Definition SIInstrInfo.h:57
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
This class contains a discriminated union of information about pointers in memory operands,...
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
void setMI(MachineBasicBlock *NewMBB, MachineBasicBlock::iterator NewMI)
ArrayRef< int16_t > SplitParts
SIMachineFunctionInfo & MFI
SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII, bool IsWave32, MachineBasicBlock::iterator MI, int Index, RegScavenger *RS)
SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII, bool IsWave32, MachineBasicBlock::iterator MI, Register Reg, bool IsKill, int Index, RegScavenger *RS)
MachineBasicBlock::iterator MI
void readWriteTmpVGPR(unsigned Offset, bool IsLoad)
const SIRegisterInfo & TRI
MachineBasicBlock * MBB
const SIInstrInfo & TII
The llvm::once_flag structure.
Definition Threading.h:67