1//===-- AMDGPURegBankLegalizeHelper.cpp -----------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// Implements actual lowering algorithms for each ID that can be used in
10/// Rule.OperandMapping. Similar to legalizer helper but with register banks.
11//
12//===----------------------------------------------------------------------===//
13
16#include "AMDGPUInstrInfo.h"
19#include "GCNSubtarget.h"
26#include "llvm/IR/IntrinsicsAMDGPU.h"
27
28#define DEBUG_TYPE "amdgpu-regbanklegalize"
29
30using namespace llvm;
31using namespace AMDGPU;
32
33RegBankLegalizeHelper::RegBankLegalizeHelper(
34    MachineIRBuilder &B, const MachineUniformityInfo &MUI,
35    const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
36 : MF(B.getMF()), ST(MF.getSubtarget<GCNSubtarget>()), B(B),
37 MRI(*B.getMRI()), MUI(MUI), RBI(RBI), MORE(MF, nullptr),
38 RBLRules(RBLRules), IsWave32(ST.isWave32()),
39 SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)),
40 VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
41 VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}
42
43bool RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) {
44  const SetOfRulesForOpcode *RuleSet = RBLRules.getRulesForOpc(MI);
45 if (!RuleSet) {
46 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
47 "No AMDGPU RegBankLegalize rules defined for opcode",
48 MI);
49 return false;
50 }
51
52 const RegBankLLTMapping *Mapping = RuleSet->findMappingForMI(MI, MRI, MUI);
53 if (!Mapping) {
54 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
55 "AMDGPU RegBankLegalize: none of the rules defined with "
56 "'Any' for MI's opcode matched MI",
57 MI);
58 return false;
59 }
60
61 WaterfallInfo WFI;
62 unsigned OpIdx = 0;
63 if (Mapping->DstOpMapping.size() > 0) {
64 B.setInsertPt(*MI.getParent(), std::next(MI.getIterator()));
65 if (!applyMappingDst(MI, OpIdx, Mapping->DstOpMapping))
66 return false;
67 }
68 if (Mapping->SrcOpMapping.size() > 0) {
69 B.setInstr(MI);
70 if (!applyMappingSrc(MI, OpIdx, Mapping->SrcOpMapping, WFI))
71 return false;
72 }
73
74 if (!lower(MI, *Mapping, WFI))
75 return false;
76
77 return true;
78}
79
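// Wrap the instruction range [WFI.Start, WFI.End) in a waterfall loop. Each
// register recorded in WFI.SgprWaterfallOperandRegs is read with
// readfirstlane, and the loop body re-executes until every distinct value of
// those registers across the active lanes has been handled (see the block
// diagram below).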
80bool RegBankLegalizeHelper::executeInWaterfallLoop(MachineIRBuilder &B,
81 const WaterfallInfo &WFI) {
82 assert(WFI.Start.isValid() && WFI.End.isValid() &&
83 "Waterfall range not initialized");
84
85 // Track use registers which have already been expanded with a readfirstlane
86 // sequence. Such a register can have multiple uses when a whole sequence is moved.
87 DenseMap<Register, Register> WaterfalledRegMap;
88
89 MachineBasicBlock &MBB = B.getMBB();
90 MachineFunction &MF = B.getMF();
91
92  MachineBasicBlock::iterator BeginIt = WFI.Start;
93  MachineBasicBlock::iterator EndIt = WFI.End;
94
95  const SIRegisterInfo *TRI = ST.getRegisterInfo();
96  const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
97 unsigned MovExecOpc, MovExecTermOpc, XorTermOpc, AndSaveExecOpc, ExecReg;
98 if (IsWave32) {
99 MovExecOpc = AMDGPU::S_MOV_B32;
100 MovExecTermOpc = AMDGPU::S_MOV_B32_term;
101 XorTermOpc = AMDGPU::S_XOR_B32_term;
102 AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
103 ExecReg = AMDGPU::EXEC_LO;
104 } else {
105 MovExecOpc = AMDGPU::S_MOV_B64;
106 MovExecTermOpc = AMDGPU::S_MOV_B64_term;
107 XorTermOpc = AMDGPU::S_XOR_B64_term;
108 AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
109 ExecReg = AMDGPU::EXEC;
110 }
111
112#ifndef NDEBUG
113 const int OrigRangeSize = std::distance(BeginIt, EndIt);
114#endif
115
116 MachineRegisterInfo &MRI = *B.getMRI();
117 Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
118 Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
119
120 // Don't bother using generic instructions/registers for the exec mask.
121 B.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(InitSaveExecReg);
122
123 Register SavedExec = MRI.createVirtualRegister(WaveRC);
124
125 // To insert the loop we need to split the block. Move everything before
126 // this point to a new block, and insert a new empty block before this
127 // instruction.
128  MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
129  MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
130  MachineBasicBlock *RestoreExecBB = MF.CreateMachineBasicBlock();
131  MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
132  MachineFunction::iterator MBBI(MBB);
133  ++MBBI;
134 MF.insert(MBBI, LoopBB);
135 MF.insert(MBBI, BodyBB);
136 MF.insert(MBBI, RestoreExecBB);
137 MF.insert(MBBI, RemainderBB);
138
139 LoopBB->addSuccessor(BodyBB);
140 BodyBB->addSuccessor(RestoreExecBB);
141 BodyBB->addSuccessor(LoopBB);
142
143 // Move the rest of the block into a new block.
144  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
145  RemainderBB->splice(RemainderBB->begin(), &MBB, EndIt, MBB.end());
146
147 MBB.addSuccessor(LoopBB);
148 RestoreExecBB->addSuccessor(RemainderBB);
149
150 B.setInsertPt(*LoopBB, LoopBB->end());
151
152 // +-MBB:------------+
153 // | ... |
154 // | %0 = G_INST_1 |
155 // | %Dst = MI %Vgpr |
156 // | %1 = G_INST_2 |
157 // | ... |
158 // +-----------------+
159 // ->
160 // +-MBB-------------------------------+
161 // | ... |
162 // | %0 = G_INST_1 |
163 // | %SaveExecReg = S_MOV_B32 $exec_lo |
164 // +----------------|------------------+
165 // | /------------------------------|
166 // V V |
167 // +-LoopBB---------------------------------------------------------------+ |
168 // | %CurrentLaneReg:sgpr(s32) = READFIRSTLANE %Vgpr | |
169 // | instead of executing for each lane, see if other lanes had | |
170 // | same value for %Vgpr and execute for them also. | |
171 // | %CondReg:vcc(s1) = G_ICMP eq %CurrentLaneReg, %Vgpr | |
172 // | %CondRegLM:sreg_32 = ballot %CondReg // copy vcc to sreg32 lane mask | |
173 // | %SavedExec = S_AND_SAVEEXEC_B32 %CondRegLM | |
174 // | exec is active for lanes with the same "CurrentLane value" in Vgpr | |
175 // +----------------|-----------------------------------------------------+ |
176 // V |
177 // +-BodyBB------------------------------------------------------------+ |
178 // | %Dst = MI %CurrentLaneReg:sgpr(s32) | |
179 // | executed only for active lanes and written to Dst | |
180 // | $exec = S_XOR_B32 $exec, %SavedExec | |
181 // | set active lanes to 0 in SavedExec, lanes that did not write to | |
182 // | Dst yet, and set this as new exec (for READFIRSTLANE and ICMP) | |
183 // | SI_WATERFALL_LOOP LoopBB |-----|
184 // +----------------|--------------------------------------------------+
185 // V
186 // +-RestoreExecBB--------------------------+
187 // | $exec_lo = S_MOV_B32_term %SaveExecReg |
188 // +----------------|-----------------------+
189 // V
190 // +-RemainderBB:----------------------+
191 // | %1 = G_INST_2 |
192 // | ... |
193 // +-----------------------------------+
194
195  // Move the instruction into the loop body. Note we already moved everything
196  // after EndIt into a new block, so EndIt is no longer valid here.
197 BodyBB->splice(BodyBB->end(), &MBB, BeginIt, MBB.end());
198
199 // Figure out the iterator range after splicing the instructions.
200 MachineBasicBlock::iterator NewBegin = BeginIt;
201 auto NewEnd = BodyBB->end();
202 assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
203
204 B.setMBB(*LoopBB);
205 Register CondReg;
206
207 for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
208 for (MachineOperand &Op : MI.all_uses()) {
209 Register OldReg = Op.getReg();
210 if (!WFI.SgprWaterfallOperandRegs.count(OldReg))
211 continue;
212
213 // See if we already processed this register in another instruction in
214 // the sequence.
215 auto OldVal = WaterfalledRegMap.find(OldReg);
216 if (OldVal != WaterfalledRegMap.end()) {
217 Op.setReg(OldVal->second);
218 continue;
219 }
220
221 Register OpReg = Op.getReg();
222 LLT OpTy = MRI.getType(OpReg);
223
224 // TODO: support for agpr
225 assert(MRI.getRegBank(OpReg) == VgprRB);
226 Register CurrentLaneReg = MRI.createVirtualRegister({SgprRB, OpTy});
227 buildReadFirstLane(B, CurrentLaneReg, OpReg, RBI);
228
229 // Build the comparison(s), CurrentLaneReg == OpReg.
230 unsigned OpSize = OpTy.getSizeInBits();
231 unsigned PartSize = (OpSize % 64 == 0) ? 64 : 32;
232 LLT PartTy = LLT::scalar(PartSize);
233 unsigned NumParts = OpSize / PartSize;
234      SmallVector<Register, 8> OpParts;
235      SmallVector<Register, 8> CurrentLaneParts;
236
237 if (NumParts == 1) {
238 OpParts.push_back(OpReg);
239 CurrentLaneParts.push_back(CurrentLaneReg);
240 } else {
241 auto UnmergeOp = B.buildUnmerge({VgprRB, PartTy}, OpReg);
242 auto UnmergeCurrLane = B.buildUnmerge({SgprRB, PartTy}, CurrentLaneReg);
243 for (unsigned i = 0; i < NumParts; ++i) {
244 OpParts.push_back(UnmergeOp.getReg(i));
245 CurrentLaneParts.push_back(UnmergeCurrLane.getReg(i));
246 }
247 }
248
249 for (unsigned i = 0; i < NumParts; ++i) {
250 Register CmpReg = MRI.createVirtualRegister(VccRB_S1);
251 B.buildICmp(CmpInst::ICMP_EQ, CmpReg, CurrentLaneParts[i], OpParts[i]);
252
253 if (!CondReg)
254 CondReg = CmpReg;
255 else
256 CondReg = B.buildAnd(VccRB_S1, CondReg, CmpReg).getReg(0);
257 }
258
259 Op.setReg(CurrentLaneReg);
260
261 // Make sure we don't re-process this register again.
262 WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
263 }
264 }
265
266 // Copy vcc to sgpr32/64, ballot becomes a no-op during instruction selection.
267 Register CondRegLM =
268 MRI.createVirtualRegister({WaveRC, LLT::scalar(IsWave32 ? 32 : 64)});
269 B.buildIntrinsic(Intrinsic::amdgcn_ballot, CondRegLM).addReg(CondReg);
270
271 // Update EXEC, save the original EXEC value to SavedExec.
272 B.buildInstr(AndSaveExecOpc)
273 .addDef(SavedExec)
274 .addReg(CondRegLM, RegState::Kill);
275 MRI.setSimpleHint(SavedExec, CondRegLM);
276
277 B.setInsertPt(*BodyBB, BodyBB->end());
278
279 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
280 B.buildInstr(XorTermOpc).addDef(ExecReg).addReg(ExecReg).addReg(SavedExec);
281
282 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
283 // s_cbranch_scc0?
284
285 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
286 B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
287
288 // Save the EXEC mask before the loop.
289 B.setInsertPt(MBB, MBB.end());
290 B.buildInstr(MovExecOpc).addDef(SaveExecReg).addReg(ExecReg);
291
292 // Restore the EXEC mask after the loop.
293 B.setInsertPt(*RestoreExecBB, RestoreExecBB->begin());
294 B.buildInstr(MovExecTermOpc).addDef(ExecReg).addReg(SaveExecReg);
295
296 // Set the insert point after the original instruction, so any new
297 // instructions will be in the remainder.
298 B.setInsertPt(*RemainderBB, RemainderBB->begin());
299
300 return true;
301}
302
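// Split a wide load into the parts listed in LLTBreakdown. Each part loads
// from Base plus the accumulated byte offset with a narrowed MMO. If the
// parts are not all the same size, MergeTy gives the common piece type they
// are unmerged into before Dst is rebuilt, e.g. {S64, S32} merged through S32.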
303bool RegBankLegalizeHelper::splitLoad(MachineInstr &MI,
304 ArrayRef<LLT> LLTBreakdown, LLT MergeTy) {
305 MachineFunction &MF = B.getMF();
306 assert(MI.getNumMemOperands() == 1);
307 MachineMemOperand &BaseMMO = **MI.memoperands_begin();
308 Register Dst = MI.getOperand(0).getReg();
309 const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
310 Register Base = MI.getOperand(1).getReg();
311 LLT PtrTy = MRI.getType(Base);
312 const RegisterBank *PtrRB = MRI.getRegBankOrNull(Base);
313 LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());
314 SmallVector<Register, 4> LoadPartRegs;
315
316 unsigned ByteOffset = 0;
317 for (LLT PartTy : LLTBreakdown) {
318 Register BasePlusOffset;
319 if (ByteOffset == 0) {
320 BasePlusOffset = Base;
321 } else {
322 auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset);
323 BasePlusOffset =
324 B.buildObjectPtrOffset({PtrRB, PtrTy}, Base, Offset).getReg(0);
325 }
326 auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy);
327 auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO);
328 LoadPartRegs.push_back(LoadPart.getReg(0));
329 ByteOffset += PartTy.getSizeInBytes();
330 }
331
332 if (!MergeTy.isValid()) {
333    // Loads are all the same size; concat or merge them together.
334 B.buildMergeLikeInstr(Dst, LoadPartRegs);
335 } else {
336    // Loads are not all the same size; unmerge them into smaller pieces of
337    // MergeTy type, then merge the pieces into Dst.
338 SmallVector<Register, 4> MergeTyParts;
339 for (Register Reg : LoadPartRegs) {
340 if (MRI.getType(Reg) == MergeTy) {
341 MergeTyParts.push_back(Reg);
342 } else {
343 auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, Reg);
344 for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i)
345 MergeTyParts.push_back(Unmerge.getReg(i));
346 }
347 }
348 B.buildMergeLikeInstr(Dst, MergeTyParts);
349 }
350 MI.eraseFromParent();
351 return true;
352}
353
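// Widen the load to WideTy, then cut the result back down to the original
// type: scalars are truncated, vectors are unmerged into MergeTy pieces and
// only the pieces covering Dst are merged back (e.g. V3S32 loaded as V4S32).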
354bool RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy,
355 LLT MergeTy) {
356 MachineFunction &MF = B.getMF();
357 assert(MI.getNumMemOperands() == 1);
358 MachineMemOperand &BaseMMO = **MI.memoperands_begin();
359 Register Dst = MI.getOperand(0).getReg();
360 const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
361 Register Base = MI.getOperand(1).getReg();
362
363 MachineMemOperand *WideMMO = MF.getMachineMemOperand(&BaseMMO, 0, WideTy);
364 auto WideLoad = B.buildLoad({DstRB, WideTy}, Base, *WideMMO);
365
366 if (WideTy.isScalar()) {
367 B.buildTrunc(Dst, WideLoad);
368 } else {
369 SmallVector<Register, 4> MergeTyParts;
370 auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, WideLoad);
371
372 LLT DstTy = MRI.getType(Dst);
373 unsigned NumElts = DstTy.getSizeInBits() / MergeTy.getSizeInBits();
374 for (unsigned i = 0; i < NumElts; ++i) {
375 MergeTyParts.push_back(Unmerge.getReg(i));
376 }
377 B.buildMergeLikeInstr(Dst, MergeTyParts);
378 }
379 MI.eraseFromParent();
380 return true;
381}
382
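// Rewrite a sub-32-bit scalar (ext)load to use a full 32-bit MMO. G_LOAD can
// use the wide result directly; for G_ZEXTLOAD the loaded bits are masked and
// for G_SEXTLOAD they are sign-extended in register, preserving the original
// extending-load semantics.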
383bool RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &MI) const {
384 Register Dst = MI.getDstReg();
385 Register Ptr = MI.getPointerReg();
386 MachineMemOperand &MMO = MI.getMMO();
387 unsigned MemSize = 8 * MMO.getSize().getValue();
388
389 MachineMemOperand *WideMMO = B.getMF().getMachineMemOperand(&MMO, 0, S32);
390
391 if (MI.getOpcode() == G_LOAD) {
392 B.buildLoad(Dst, Ptr, *WideMMO);
393 } else {
394 auto Load = B.buildLoad(SgprRB_S32, Ptr, *WideMMO);
395
396 if (MI.getOpcode() == G_ZEXTLOAD) {
397 APInt Mask = APInt::getLowBitsSet(S32.getSizeInBits(), MemSize);
398 auto MaskCst = B.buildConstant(SgprRB_S32, Mask);
399 B.buildAnd(Dst, Load, MaskCst);
400 } else {
401 assert(MI.getOpcode() == G_SEXTLOAD);
402 B.buildSExtInReg(Dst, Load, MemSize);
403 }
404 }
405
406 MI.eraseFromParent();
407 return true;
408}
409
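// Extend a vcc (lane mask) source by selecting between the extension constant
// (-1 for sext, 1 for zext/anyext) and 0 in each lane. 64-bit results are
// built from a 32-bit select plus a copied (sext), zero (zext) or undef
// (anyext) high half.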
410bool RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) {
411 Register Dst = MI.getOperand(0).getReg();
412 LLT Ty = MRI.getType(Dst);
413 Register Src = MI.getOperand(1).getReg();
414 unsigned Opc = MI.getOpcode();
415 int TrueExtCst = Opc == G_SEXT ? -1 : 1;
416 if (Ty == S32 || Ty == S16) {
417 auto True = B.buildConstant({VgprRB, Ty}, TrueExtCst);
418 auto False = B.buildConstant({VgprRB, Ty}, 0);
419 B.buildSelect(Dst, Src, True, False);
420 } else if (Ty == S64) {
421 auto True = B.buildConstant({VgprRB_S32}, TrueExtCst);
422 auto False = B.buildConstant({VgprRB_S32}, 0);
423 auto Lo = B.buildSelect({VgprRB_S32}, Src, True, False);
424 MachineInstrBuilder Hi;
425 switch (Opc) {
426 case G_SEXT:
427 Hi = Lo;
428 break;
429 case G_ZEXT:
430 Hi = False;
431 break;
432 case G_ANYEXT:
433 Hi = B.buildUndef({VgprRB_S32});
434 break;
435 default:
436      reportGISelFailure(
437          MF, MORE, "amdgpu-regbanklegalize",
438 "AMDGPU RegBankLegalize: lowerVccExtToSel, Opcode not supported", MI);
439 return false;
440 }
441
442 B.buildMergeValues(Dst, {Lo.getReg(0), Hi.getReg(0)});
443 } else {
444    reportGISelFailure(
445        MF, MORE, "amdgpu-regbanklegalize",
446 "AMDGPU RegBankLegalize: lowerVccExtToSel, Type not supported", MI);
447 return false;
448 }
449
450 MI.eraseFromParent();
451 return true;
452}
453
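// The unpack{Z|S|A}Ext helpers bitcast a uniform V2S16 value to S32 and
// return its two 16-bit halves widened to S32 with the corresponding
// extension; the low half of the any-extending variant is simply the packed
// S32 value itself.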
454std::pair<Register, Register> RegBankLegalizeHelper::unpackZExt(Register Reg) {
455 auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
456 auto Mask = B.buildConstant(SgprRB_S32, 0x0000ffff);
457 auto Lo = B.buildAnd(SgprRB_S32, PackedS32, Mask);
458 auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
459 return {Lo.getReg(0), Hi.getReg(0)};
460}
461
462std::pair<Register, Register> RegBankLegalizeHelper::unpackSExt(Register Reg) {
463 auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
464 auto Lo = B.buildSExtInReg(SgprRB_S32, PackedS32, 16);
465 auto Hi = B.buildAShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
466 return {Lo.getReg(0), Hi.getReg(0)};
467}
468
469std::pair<Register, Register> RegBankLegalizeHelper::unpackAExt(Register Reg) {
470 auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
471 auto Lo = PackedS32;
472 auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
473 return {Lo.getReg(0), Hi.getReg(0)};
474}
475
476std::pair<Register, Register>
477RegBankLegalizeHelper::unpackAExtTruncS16(Register Reg) {
478 auto [Lo32, Hi32] = unpackAExt(Reg);
479 return {B.buildTrunc(SgprRB_S16, Lo32).getReg(0),
480 B.buildTrunc(SgprRB_S16, Hi32).getReg(0)};
481}
482
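// Uniform V2S16 shifts are lowered to two S32 shifts on the unpacked halves.
// The unpack extension matches the shift kind: any-extend for shl,
// zero-extend for lshr and sign-extend for ashr, so stray high bits cannot
// leak into the 16-bit results repacked by G_BUILD_VECTOR_TRUNC.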
483bool RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) {
484 Register Lo, Hi;
485 switch (MI.getOpcode()) {
486 case AMDGPU::G_SHL: {
487 auto [Val0, Val1] = unpackAExt(MI.getOperand(1).getReg());
488 auto [Amt0, Amt1] = unpackAExt(MI.getOperand(2).getReg());
489 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
490 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
491 break;
492 }
493 case AMDGPU::G_LSHR: {
494 auto [Val0, Val1] = unpackZExt(MI.getOperand(1).getReg());
495 auto [Amt0, Amt1] = unpackZExt(MI.getOperand(2).getReg());
496 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
497 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
498 break;
499 }
500 case AMDGPU::G_ASHR: {
501 auto [Val0, Val1] = unpackSExt(MI.getOperand(1).getReg());
502 auto [Amt0, Amt1] = unpackSExt(MI.getOperand(2).getReg());
503 Lo = B.buildAShr(SgprRB_S32, Val0, Amt0).getReg(0);
504 Hi = B.buildAShr(SgprRB_S32, Val1, Amt1).getReg(0);
505 break;
506 }
507 default:
508    reportGISelFailure(
509        MF, MORE, "amdgpu-regbanklegalize",
510 "AMDGPU RegBankLegalize: lowerUnpackBitShift, case not implemented",
511 MI);
512 return false;
513 }
514 B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
515 MI.eraseFromParent();
516 return true;
517}
518
519bool RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) {
520 Register Lo, Hi;
521 switch (MI.getOpcode()) {
522 case AMDGPU::G_SMIN:
523 case AMDGPU::G_SMAX: {
524 // For signed operations, use sign extension
525 auto [Val0_Lo, Val0_Hi] = unpackSExt(MI.getOperand(1).getReg());
526 auto [Val1_Lo, Val1_Hi] = unpackSExt(MI.getOperand(2).getReg());
527 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
528 .getReg(0);
529 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
530 .getReg(0);
531 break;
532 }
533 case AMDGPU::G_UMIN:
534 case AMDGPU::G_UMAX: {
535 // For unsigned operations, use zero extension
536 auto [Val0_Lo, Val0_Hi] = unpackZExt(MI.getOperand(1).getReg());
537 auto [Val1_Lo, Val1_Hi] = unpackZExt(MI.getOperand(2).getReg());
538 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
539 .getReg(0);
540 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
541 .getReg(0);
542 break;
543 }
544 default:
545    reportGISelFailure(
546        MF, MORE, "amdgpu-regbanklegalize",
547 "AMDGPU RegBankLegalize: lowerUnpackMinMax, case not implemented", MI);
548 return false;
549 }
550 B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
551 MI.eraseFromParent();
552 return true;
553}
554
555bool RegBankLegalizeHelper::lowerUnpackAExt(MachineInstr &MI) {
556 auto [Op1Lo, Op1Hi] = unpackAExt(MI.getOperand(1).getReg());
557 auto [Op2Lo, Op2Hi] = unpackAExt(MI.getOperand(2).getReg());
558 auto ResLo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Lo, Op2Lo});
559 auto ResHi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Hi, Op2Hi});
560 B.buildBuildVectorTrunc(MI.getOperand(0).getReg(),
561 {ResLo.getReg(0), ResHi.getReg(0)});
562 MI.eraseFromParent();
563 return true;
564}
565
566static bool isSignedBFE(MachineInstr &MI) {
567  if (GIntrinsic *GI = dyn_cast<GIntrinsic>(&MI))
568    return (GI->is(Intrinsic::amdgcn_sbfe));
569
570 return MI.getOpcode() == AMDGPU::G_SBFX;
571}
572
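// Divergent 64-bit bitfield extract. The field is first shifted down to bit 0
// and, for a constant Width, finished with a 32-bit {S|U}BFX on the low or
// high half; for a variable Width it is isolated with a shl/shr pair by
// (64 - Width).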
573bool RegBankLegalizeHelper::lowerV_BFE(MachineInstr &MI) {
574 Register Dst = MI.getOperand(0).getReg();
575 assert(MRI.getType(Dst) == LLT::scalar(64));
576 bool Signed = isSignedBFE(MI);
577 unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
578  // Extract a bitfield from Src; LSBit is the least-significant bit of the
579  // extraction (field offset) and Width is the size of the bitfield.
580 Register Src = MI.getOperand(FirstOpnd).getReg();
581 Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
582 Register Width = MI.getOperand(FirstOpnd + 2).getReg();
583  // Comments below are for signed bitfield extract; unsigned is similar. x is the
584  // sign bit, s is sign, l is the LSB and y are the remaining bits to extract.
585
586 // Src >> LSBit Hi|Lo: x?????syyyyyyl??? -> xxxx?????syyyyyyl
587 unsigned SHROpc = Signed ? AMDGPU::G_ASHR : AMDGPU::G_LSHR;
588 auto SHRSrc = B.buildInstr(SHROpc, {{VgprRB, S64}}, {Src, LSBit});
589
590 auto ConstWidth = getIConstantVRegValWithLookThrough(Width, MRI);
591
592 // Expand to Src >> LSBit << (64 - Width) >> (64 - Width)
593 // << (64 - Width): Hi|Lo: xxxx?????syyyyyyl -> syyyyyyl000000000
594 // >> (64 - Width): Hi|Lo: syyyyyyl000000000 -> ssssssssssyyyyyyl
595 if (!ConstWidth) {
596 auto Amt = B.buildSub(VgprRB_S32, B.buildConstant(SgprRB_S32, 64), Width);
597 auto SignBit = B.buildShl({VgprRB, S64}, SHRSrc, Amt);
598 B.buildInstr(SHROpc, {Dst}, {SignBit, Amt});
599 MI.eraseFromParent();
600 return true;
601 }
602
603 uint64_t WidthImm = ConstWidth->Value.getZExtValue();
604 auto UnmergeSHRSrc = B.buildUnmerge(VgprRB_S32, SHRSrc);
605 Register SHRSrcLo = UnmergeSHRSrc.getReg(0);
606 Register SHRSrcHi = UnmergeSHRSrc.getReg(1);
607 auto Zero = B.buildConstant({VgprRB, S32}, 0);
608 unsigned BFXOpc = Signed ? AMDGPU::G_SBFX : AMDGPU::G_UBFX;
609
610 if (WidthImm <= 32) {
611 // SHRSrc Hi|Lo: ????????|???syyyl -> ????????|ssssyyyl
612 auto Lo = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcLo, Zero, Width});
613 MachineInstrBuilder Hi;
614 if (Signed) {
615 // SHRSrc Hi|Lo: ????????|ssssyyyl -> ssssssss|ssssyyyl
616 Hi = B.buildAShr(VgprRB_S32, Lo, B.buildConstant(VgprRB_S32, 31));
617 } else {
618 // SHRSrc Hi|Lo: ????????|000syyyl -> 00000000|000syyyl
619 Hi = Zero;
620 }
621 B.buildMergeLikeInstr(Dst, {Lo, Hi});
622 } else {
623 auto Amt = B.buildConstant(VgprRB_S32, WidthImm - 32);
624 // SHRSrc Hi|Lo: ??????sy|yyyyyyyl -> sssssssy|yyyyyyyl
625 auto Hi = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcHi, Zero, Amt});
626 B.buildMergeLikeInstr(Dst, {SHRSrcLo, Hi});
627 }
628
629 MI.eraseFromParent();
630 return true;
631}
632
633bool RegBankLegalizeHelper::lowerS_BFE(MachineInstr &MI) {
634 Register DstReg = MI.getOperand(0).getReg();
635 LLT Ty = MRI.getType(DstReg);
636 bool Signed = isSignedBFE(MI);
637 unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
638 Register Src = MI.getOperand(FirstOpnd).getReg();
639 Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
640 Register Width = MI.getOperand(FirstOpnd + 2).getReg();
641  // For uniform bitfield extract there are 4 available instructions, but
642  // LSBit (field offset) and Width (size of bitfield) need to be packed into an
643  // S32: field offset in the low and size in the high 16 bits.
644
645 // Src1 Hi16|Lo16 = Size|FieldOffset
646 auto Mask = B.buildConstant(SgprRB_S32, maskTrailingOnes<unsigned>(6));
647 auto FieldOffset = B.buildAnd(SgprRB_S32, LSBit, Mask);
648 auto Size = B.buildShl(SgprRB_S32, Width, B.buildConstant(SgprRB_S32, 16));
649 auto Src1 = B.buildOr(SgprRB_S32, FieldOffset, Size);
650 unsigned Opc32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
651 unsigned Opc64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
652 unsigned Opc = Ty == S32 ? Opc32 : Opc64;
653
654  // Build the target machine instruction directly; because its operands get
655  // constrained to register classes, insert copies to/from the register banks.
656 auto S_BFE = B.buildInstr(Opc, {{SgprRB, Ty}},
657 {B.buildCopy(Ty, Src), B.buildCopy(S32, Src1)});
658 constrainSelectedInstRegOperands(*S_BFE, *ST.getInstrInfo(),
659 *ST.getRegisterInfo(), RBI);
660
661 B.buildCopy(DstReg, S_BFE->getOperand(0).getReg());
662 MI.eraseFromParent();
663 return true;
664}
665
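// Split a divergent S64/V2S32/V4S16 two-source operation into two 32-bit
// halves (V2S16 halves for V4S16), e.g. for G_AND on S64:
//   %dst:vgpr(s64) = G_AND %a:vgpr(s64), %b:vgpr(s64)
// becomes
//   %alo:vgpr(s32), %ahi:vgpr(s32) = G_UNMERGE_VALUES %a
//   %blo:vgpr(s32), %bhi:vgpr(s32) = G_UNMERGE_VALUES %b
//   %lo:vgpr(s32) = G_AND %alo, %blo
//   %hi:vgpr(s32) = G_AND %ahi, %bhi
//   %dst:vgpr(s64) = G_MERGE_VALUES %lo, %hi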
666bool RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) {
667 Register Dst = MI.getOperand(0).getReg();
668 LLT DstTy = MRI.getType(Dst);
669 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64);
670 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
671 auto Op1 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(1).getReg());
672 auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
673 unsigned Opc = MI.getOpcode();
674 auto Flags = MI.getFlags();
675 auto Lo =
676 B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(0), Op2.getReg(0)}, Flags);
677 auto Hi =
678 B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(1), Op2.getReg(1)}, Flags);
679 B.buildMergeLikeInstr(Dst, {Lo, Hi});
680 MI.eraseFromParent();
681 return true;
682}
683
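// Divergent S64 multiply built from 32-bit pieces:
//   Lo = mul_lo(a0, b0)
//   Hi = mul_hi(a0, b0) + mul_lo(a0, b1) + mul_lo(a1, b0)
// where a0/b0 are the low and a1/b1 the high 32 bits of the operands.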
684bool RegBankLegalizeHelper::lowerSplitTo32Mul(MachineInstr &MI) {
685 Register Dst = MI.getOperand(0).getReg();
686 assert(MRI.getType(Dst) == S64);
687 auto Op1 = B.buildUnmerge({VgprRB_S32}, MI.getOperand(1).getReg());
688 auto Op2 = B.buildUnmerge({VgprRB_S32}, MI.getOperand(2).getReg());
689
690 // TODO: G_AMDGPU_MAD_* optimizations for G_MUL divergent S64 operation to
691 // match GlobalISel with old regbankselect.
692 auto Lo = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(0));
693 auto Carry = B.buildUMulH(VgprRB_S32, Op1.getReg(0), Op2.getReg(0));
694 auto MulLo0Hi1 = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(1));
695 auto MulHi0Lo1 = B.buildMul(VgprRB_S32, Op1.getReg(1), Op2.getReg(0));
696 auto Sum = B.buildAdd(VgprRB_S32, MulLo0Hi1, MulHi0Lo1);
697 auto Hi = B.buildAdd(VgprRB_S32, Sum, Carry);
698
699 B.buildMergeLikeInstr(Dst, {Lo, Hi});
700 MI.eraseFromParent();
701 return true;
702}
703
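// Scalarize a uniform V2S16 operation with 1 to 3 source operands: unpack
// each operand into two S16 halves (via S32 any-extend + trunc), apply the
// opcode per half, and repack the two S16 results.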
704bool RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) {
705 Register Dst = MI.getOperand(0).getReg();
706 assert(MRI.getType(Dst) == V2S16);
707 unsigned Opc = MI.getOpcode();
708 unsigned NumOps = MI.getNumOperands();
709 auto Flags = MI.getFlags();
710
711 auto [Op1Lo, Op1Hi] = unpackAExtTruncS16(MI.getOperand(1).getReg());
712
713 if (NumOps == 2) {
714 auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo}, Flags);
715 auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi}, Flags);
716 B.buildMergeLikeInstr(Dst, {Lo, Hi});
717 MI.eraseFromParent();
718 return true;
719 }
720
721 auto [Op2Lo, Op2Hi] = unpackAExtTruncS16(MI.getOperand(2).getReg());
722
723 if (NumOps == 3) {
724 auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo}, Flags);
725 auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi}, Flags);
726 B.buildMergeLikeInstr(Dst, {Lo, Hi});
727 MI.eraseFromParent();
728 return true;
729 }
730
731 assert(NumOps == 4);
732 auto [Op3Lo, Op3Hi] = unpackAExtTruncS16(MI.getOperand(3).getReg());
733 auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo, Op3Lo}, Flags);
734 auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi, Op3Hi}, Flags);
735 B.buildMergeLikeInstr(Dst, {Lo, Hi});
736 MI.eraseFromParent();
737 return true;
738}
739
740bool RegBankLegalizeHelper::lowerUniMAD64(MachineInstr &MI) {
741 Register Dst0 = MI.getOperand(0).getReg();
742 Register Dst1 = MI.getOperand(1).getReg();
743 Register Src0 = MI.getOperand(2).getReg();
744 Register Src1 = MI.getOperand(3).getReg();
745 Register Src2 = MI.getOperand(4).getReg();
746
747 const GCNSubtarget &ST = B.getMF().getSubtarget<GCNSubtarget>();
748
749 // Keep the multiplication on the SALU.
750 Register DstLo = B.buildMul(SgprRB_S32, Src0, Src1).getReg(0);
751 Register DstHi = MRI.createVirtualRegister(SgprRB_S32);
752 if (ST.hasScalarMulHiInsts()) {
753 B.buildInstr(AMDGPU::G_UMULH, {{DstHi}}, {Src0, Src1});
754 } else {
755 auto VSrc0 = B.buildCopy(VgprRB_S32, Src0);
756 auto VSrc1 = B.buildCopy(VgprRB_S32, Src1);
757 auto MulHi = B.buildInstr(AMDGPU::G_UMULH, {VgprRB_S32}, {VSrc0, VSrc1});
758 buildReadAnyLane(B, DstHi, MulHi.getReg(0), RBI);
759 }
760
761 // Accumulate and produce the "carry-out" bit.
762
763 // The "carry-out" is defined as bit 64 of the result when computed as a
764 // big integer. For unsigned multiply-add, this matches the usual
765 // definition of carry-out.
766 if (mi_match(Src2, MRI, MIPatternMatch::m_ZeroInt())) {
767 // No accumulate: result is just the multiplication, carry is 0.
768 B.buildMergeLikeInstr(Dst0, {DstLo, DstHi});
769 B.buildConstant(Dst1, 0);
770 } else {
771 // Accumulate: add Src2 to the multiplication result with carry chain.
772 Register Src2Lo = MRI.createVirtualRegister(SgprRB_S32);
773 Register Src2Hi = MRI.createVirtualRegister(SgprRB_S32);
774 B.buildUnmerge({Src2Lo, Src2Hi}, Src2);
775
776 auto AddLo = B.buildUAddo(SgprRB_S32, SgprRB_S32, DstLo, Src2Lo);
777 auto AddHi =
778 B.buildUAdde(SgprRB_S32, SgprRB_S32, DstHi, Src2Hi, AddLo.getReg(1));
779 B.buildMergeLikeInstr(Dst0, {AddLo.getReg(0), AddHi.getReg(0)});
780 B.buildCopy(Dst1, AddHi.getReg(1));
781 }
782
783 MI.eraseFromParent();
784 return true;
785}
786
787bool RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) {
788 Register Dst = MI.getOperand(0).getReg();
789 LLT DstTy = MRI.getType(Dst);
790 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64 ||
791 (DstTy.isPointer() && DstTy.getSizeInBits() == 64));
792 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
793 auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
794 auto Op3 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(3).getReg());
795 Register Cond = MI.getOperand(1).getReg();
796 auto Flags = MI.getFlags();
797 auto Lo =
798 B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(0), Op3.getReg(0), Flags);
799 auto Hi =
800 B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(1), Op3.getReg(1), Flags);
801
802 B.buildMergeLikeInstr(Dst, {Lo, Hi});
803 MI.eraseFromParent();
804 return true;
805}
806
807bool RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &MI) {
808 auto Op1 = B.buildUnmerge(VgprRB_S32, MI.getOperand(1).getReg());
809 int Amt = MI.getOperand(2).getImm();
810 Register Lo, Hi;
811 // Hi|Lo: s sign bit, ?/x bits changed/not changed by sign-extend
812 if (Amt <= 32) {
813 auto Freeze = B.buildFreeze(VgprRB_S32, Op1.getReg(0));
814 if (Amt == 32) {
815 // Hi|Lo: ????????|sxxxxxxx -> ssssssss|sxxxxxxx
816 Lo = Freeze.getReg(0);
817 } else {
818 // Hi|Lo: ????????|???sxxxx -> ssssssss|ssssxxxx
819 Lo = B.buildSExtInReg(VgprRB_S32, Freeze, Amt).getReg(0);
820 }
821
822 auto SignExtCst = B.buildConstant(SgprRB_S32, 31);
823 Hi = B.buildAShr(VgprRB_S32, Lo, SignExtCst).getReg(0);
824 } else {
825 // Hi|Lo: ?????sxx|xxxxxxxx -> ssssssxx|xxxxxxxx
826 Lo = Op1.getReg(0);
827 Hi = B.buildSExtInReg(VgprRB_S32, Op1.getReg(1), Amt - 32).getReg(0);
828 }
829
830 B.buildMergeLikeInstr(MI.getOperand(0).getReg(), {Lo, Hi});
831 MI.eraseFromParent();
832 return true;
833}
834
835bool RegBankLegalizeHelper::lower(MachineInstr &MI,
836 const RegBankLLTMapping &Mapping,
837 WaterfallInfo &WFI) {
838
839 switch (Mapping.LoweringMethod) {
840 case DoNotLower:
841 break;
842 case VccExtToSel:
843 return lowerVccExtToSel(MI);
844 case UniExtToSel: {
845 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
846 auto True = B.buildConstant({SgprRB, Ty},
847 MI.getOpcode() == AMDGPU::G_SEXT ? -1 : 1);
848 auto False = B.buildConstant({SgprRB, Ty}, 0);
849    // The input to G_{Z|S}EXT is a 'Legalizer legal' S1, most commonly a compare.
850    // We lower it to a select here; the S1 condition was already any-extended to
851    // S32 and ANDed with 1 to clear the high bits by Sgpr32AExtBoolInReg.
852 B.buildSelect(MI.getOperand(0).getReg(), MI.getOperand(1).getReg(), True,
853 False);
854 MI.eraseFromParent();
855 return true;
856 }
857 case UnpackBitShift:
858 return lowerUnpackBitShift(MI);
859 case UnpackMinMax:
860 return lowerUnpackMinMax(MI);
861 case ScalarizeToS16:
862 return lowerSplitTo16(MI);
863 case Ext32To64: {
864 const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
865 MachineInstrBuilder Hi;
866 switch (MI.getOpcode()) {
867 case AMDGPU::G_ZEXT: {
868 Hi = B.buildConstant({RB, S32}, 0);
869 break;
870 }
871 case AMDGPU::G_SEXT: {
872 // Replicate sign bit from 32-bit extended part.
873 auto ShiftAmt = B.buildConstant({RB, S32}, 31);
874 Hi = B.buildAShr({RB, S32}, MI.getOperand(1).getReg(), ShiftAmt);
875 break;
876 }
877 case AMDGPU::G_ANYEXT: {
878 Hi = B.buildUndef({RB, S32});
879 break;
880 }
881 default:
882 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
883 "AMDGPU RegBankLegalize: Ext32To64, unsuported opcode",
884 MI);
885 return false;
886 }
887
888 B.buildMergeLikeInstr(MI.getOperand(0).getReg(),
889 {MI.getOperand(1).getReg(), Hi});
890 MI.eraseFromParent();
891 return true;
892 }
893 case UniCstExt: {
894 uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue();
895 B.buildConstant(MI.getOperand(0).getReg(), ConstVal);
896
897 MI.eraseFromParent();
898 return true;
899 }
900 case VgprToVccCopy: {
901 Register Src = MI.getOperand(1).getReg();
902 LLT Ty = MRI.getType(Src);
903    // Take the lowest bit from each lane and put it in a lane mask.
904    // Lower via compare, but clear the high bits first since the compare
905    // looks at all bits in the register.
906 Register BoolSrc = MRI.createVirtualRegister({VgprRB, Ty});
907 if (Ty == S64) {
908 auto Src64 = B.buildUnmerge(VgprRB_S32, Src);
909 auto One = B.buildConstant(VgprRB_S32, 1);
910 auto AndLo = B.buildAnd(VgprRB_S32, Src64.getReg(0), One);
911 auto Zero = B.buildConstant(VgprRB_S32, 0);
912 auto AndHi = B.buildAnd(VgprRB_S32, Src64.getReg(1), Zero);
913 B.buildMergeLikeInstr(BoolSrc, {AndLo, AndHi});
914 } else {
915 assert(Ty == S32 || Ty == S16);
916 auto One = B.buildConstant({VgprRB, Ty}, 1);
917 B.buildAnd(BoolSrc, Src, One);
918 }
919 auto Zero = B.buildConstant({VgprRB, Ty}, 0);
920 B.buildICmp(CmpInst::ICMP_NE, MI.getOperand(0).getReg(), BoolSrc, Zero);
921 MI.eraseFromParent();
922 return true;
923 }
924 case V_BFE:
925 return lowerV_BFE(MI);
926 case S_BFE:
927 return lowerS_BFE(MI);
928 case UniMAD64:
929 return lowerUniMAD64(MI);
930 case UniMul64: {
931 B.buildMul(MI.getOperand(0), MI.getOperand(1), MI.getOperand(2));
932 MI.eraseFromParent();
933 return true;
934 }
935 case DivSMulToMAD: {
936 auto Op1 = B.buildTrunc(VgprRB_S32, MI.getOperand(1));
937 auto Op2 = B.buildTrunc(VgprRB_S32, MI.getOperand(2));
938 auto Zero = B.buildConstant({VgprRB, S64}, 0);
939
940 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_AMDGPU_S_MUL_U64_U32
941 ? AMDGPU::G_AMDGPU_MAD_U64_U32
942 : AMDGPU::G_AMDGPU_MAD_I64_I32;
943
944 B.buildInstr(NewOpc, {MI.getOperand(0).getReg(), {SgprRB, S32}},
945 {Op1, Op2, Zero});
946 MI.eraseFromParent();
947 return true;
948 }
949 case SplitTo32:
950 return lowerSplitTo32(MI);
951 case SplitTo32Mul:
952 return lowerSplitTo32Mul(MI);
953 case SplitTo32Select:
954 return lowerSplitTo32Select(MI);
955  case SplitTo32SExtInReg:
956    return lowerSplitTo32SExtInReg(MI);
957 case SplitLoad: {
958 LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
959 unsigned Size = DstTy.getSizeInBits();
960 // Even split to 128-bit loads
961 if (Size > 128) {
962 LLT B128;
963 if (DstTy.isVector()) {
964 LLT EltTy = DstTy.getElementType();
965 B128 = LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
966 } else {
967 B128 = LLT::scalar(128);
968 }
969 if (Size / 128 == 2)
970 splitLoad(MI, {B128, B128});
971 else if (Size / 128 == 4)
972 splitLoad(MI, {B128, B128, B128, B128});
973 else {
974 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
975 "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
976 MI);
977 return false;
978 }
979 }
980    // Split into 64-bit and 32-bit loads
981 else if (DstTy == S96)
982 splitLoad(MI, {S64, S32}, S32);
983 else if (DstTy == V3S32)
984 splitLoad(MI, {V2S32, S32}, S32);
985 else if (DstTy == V6S16)
986 splitLoad(MI, {V4S16, V2S16}, V2S16);
987 else {
988 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
989 "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
990 MI);
991 return false;
992 }
993 return true;
994 }
995 case WidenLoad: {
996 LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
997 if (DstTy == S96)
998 widenLoad(MI, S128);
999 else if (DstTy == V3S32)
1000 widenLoad(MI, V4S32, S32);
1001 else if (DstTy == V6S16)
1002 widenLoad(MI, V8S16, V2S16);
1003 else {
1004 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
1005 "AMDGPU RegBankLegalize: WidenLoad, unsuported type",
1006 MI);
1007 return false;
1008 }
1009 return true;
1010 }
1011 case UnpackAExt:
1012 return lowerUnpackAExt(MI);
1013 case WidenMMOToS32:
1014 return widenMMOToS32(cast<GAnyLoad>(MI));
1015 case VerifyAllSgpr: {
1016 assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
1017 return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
1018 }));
1019 return true;
1020 }
1021 case ApplyAllVgpr: {
1022 assert(llvm::all_of(MI.defs(), [&](const MachineOperand &Op) {
1023 return MRI.getRegBankOrNull(Op.getReg()) == VgprRB;
1024 }));
1025 B.setInstrAndDebugLoc(MI);
1026 for (unsigned i = MI.getNumDefs(); i < MI.getNumOperands(); ++i) {
1027 Register Reg = MI.getOperand(i).getReg();
1028 if (MRI.getRegBank(Reg) != VgprRB) {
1029 auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
1030 MI.getOperand(i).setReg(Copy.getReg(0));
1031 }
1032 }
1033 return true;
1034 }
1035 case UnmergeToShiftTrunc: {
1036 GUnmerge *Unmerge = dyn_cast<GUnmerge>(&MI);
1037 LLT Ty = MRI.getType(Unmerge->getSourceReg());
1038 if (Ty.getSizeInBits() % 32 != 0) {
1039 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
1040 "AMDGPU RegBankLegalize: unmerge not multiple of 32",
1041 MI);
1042 return false;
1043 }
1044
1045 B.setInstrAndDebugLoc(MI);
1046 if (Ty.getSizeInBits() > 32) {
1047 auto UnmergeV2S16 =
1048 B.buildUnmerge({SgprRB, V2S16}, Unmerge->getSourceReg());
1049 for (unsigned i = 0; i < UnmergeV2S16->getNumDefs(); ++i) {
1050 auto [Dst0S32, Dst1S32] =
1051 unpackAExt(UnmergeV2S16->getOperand(i).getReg());
1052 B.buildTrunc(MI.getOperand(i * 2).getReg(), Dst0S32);
1053 B.buildTrunc(MI.getOperand(i * 2 + 1).getReg(), Dst1S32);
1054 }
1055 } else {
1056 auto [Dst0S32, Dst1S32] = unpackAExt(MI.getOperand(2).getReg());
1057 B.buildTrunc(MI.getOperand(0).getReg(), Dst0S32);
1058 B.buildTrunc(MI.getOperand(1).getReg(), Dst1S32);
1059 }
1060
1061 MI.eraseFromParent();
1062 return true;
1063 }
1065 Register Dst = MI.getOperand(0).getReg();
1066 Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
1067 B.setInsertPt(*MI.getParent(), MI.getParent()->getFirstNonPHI());
1068 MI.getOperand(0).setReg(NewDst);
1069 B.buildTrunc(Dst, NewDst);
1070
1071 for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
1072 Register UseReg = MI.getOperand(i).getReg();
1073
1074 auto DefMI = MRI.getVRegDef(UseReg)->getIterator();
1075 MachineBasicBlock *DefMBB = DefMI->getParent();
1076
1077 B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI)));
1078
1079 auto NewUse = B.buildAnyExt(SgprRB_S32, UseReg);
1080 MI.getOperand(i).setReg(NewUse.getReg(0));
1081 }
1082 break;
1083 }
1084 case VerifyAllSgprGPHI: {
1085 assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
1086 if (Op.isMBB())
1087 return true;
1088 return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
1089 }));
1090 return true;
1091 }
1093 assert(MRI.getRegBankOrNull(MI.getOperand(0).getReg()) == VgprRB);
1094 assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
1095 if (Op.isMBB())
1096 return true;
1097 const RegisterBank *RB = MRI.getRegBankOrNull(Op.getReg());
1098 return RB == VgprRB || RB == SgprRB;
1099 }));
1100 return true;
1101 }
1102 case ApplyINTRIN_IMAGE:
1103 return applyRegisterBanksINTRIN_IMAGE(MI);
1104 }
1105
1106 if (!WFI.SgprWaterfallOperandRegs.empty()) {
1107 if (!executeInWaterfallLoop(B, WFI))
1108 return false;
1109 }
1110 return true;
1111}
1112
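// Return the exact LLT a mapping ID requires, or an invalid LLT for IDs that
// do not imply a single fixed type (e.g. the B-type IDs handled below).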
1113LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
1114 switch (ID) {
1115 case Vcc:
1116 case UniInVcc:
1117 return LLT::scalar(1);
1118 case Sgpr16:
1119 case Vgpr16:
1120 case UniInVgprS16:
1121 return LLT::scalar(16);
1122 case Sgpr32:
1123 case Sgpr32_WF:
1124 case Sgpr32Trunc:
1125 case Sgpr32AExt:
1126  case Sgpr32AExtBoolInReg:
1127  case Sgpr32SExt:
1128 case Sgpr32ZExt:
1129 case UniInVgprS32:
1130 case Vgpr32:
1131 case Vgpr32AExt:
1132 case Vgpr32SExt:
1133 case Vgpr32ZExt:
1134 return LLT::scalar(32);
1135 case Sgpr64:
1136 case Vgpr64:
1137 case UniInVgprS64:
1138 return LLT::scalar(64);
1139 case Sgpr128:
1140 case Vgpr128:
1141 return LLT::scalar(128);
1142 case SgprP0:
1143 case SgprP0Call_WF:
1144 case VgprP0:
1145 return LLT::pointer(0, 64);
1146 case SgprP1:
1147 case VgprP1:
1148 return LLT::pointer(1, 64);
1149 case SgprP2:
1150 case VgprP2:
1151 return LLT::pointer(2, 32);
1152 case SgprP3:
1153 case VgprP3:
1154 return LLT::pointer(3, 32);
1155 case SgprP4:
1156 case SgprP4Call_WF:
1157 case VgprP4:
1158 return LLT::pointer(4, 64);
1159 case SgprP5:
1160 case VgprP5:
1161 return LLT::pointer(5, 32);
1162 case SgprP8:
1163 return LLT::pointer(8, 128);
1164 case SgprV2S16:
1165 case VgprV2S16:
1166 case UniInVgprV2S16:
1167 return LLT::fixed_vector(2, 16);
1168 case SgprV2S32:
1169 case VgprV2S32:
1170 case UniInVgprV2S32:
1171 return LLT::fixed_vector(2, 32);
1172 case VgprV3S32:
1173 return LLT::fixed_vector(3, 32);
1174 case SgprV4S32:
1175 case SgprV4S32_WF:
1176 case VgprV4S32:
1177 case UniInVgprV4S32:
1178 return LLT::fixed_vector(4, 32);
1179 case VgprV2S64:
1180 case UniInVgprV2S64:
1181 return LLT::fixed_vector(2, 64);
1182 default:
1183 return LLT();
1184 }
1185}
1186
1187LLT RegBankLegalizeHelper::getBTyFromID(RegBankLLTMappingApplyID ID, LLT Ty) {
1188 switch (ID) {
1189 case SgprB32:
1190 case VgprB32:
1191 case SgprB32_M0:
1192 case UniInVgprB32:
1193 if (Ty == LLT::scalar(32) || Ty == LLT::fixed_vector(2, 16) ||
1194 isAnyPtr(Ty, 32))
1195 return Ty;
1196 return LLT();
1197 case SgprPtr32:
1198 case VgprPtr32:
1199 return isAnyPtr(Ty, 32) ? Ty : LLT();
1200 case SgprPtr64:
1201 case VgprPtr64:
1202 return isAnyPtr(Ty, 64) ? Ty : LLT();
1203 case SgprPtr128:
1204 case VgprPtr128:
1205 return isAnyPtr(Ty, 128) ? Ty : LLT();
1206 case SgprB64:
1207 case VgprB64:
1208 case UniInVgprB64:
1209 if (Ty == LLT::scalar(64) || Ty == LLT::fixed_vector(2, 32) ||
1210 Ty == LLT::fixed_vector(4, 16) || isAnyPtr(Ty, 64))
1211 return Ty;
1212 return LLT();
1213 case SgprB96:
1214 case VgprB96:
1215 case UniInVgprB96:
1216 if (Ty == LLT::scalar(96) || Ty == LLT::fixed_vector(3, 32) ||
1217 Ty == LLT::fixed_vector(6, 16))
1218 return Ty;
1219 return LLT();
1220 case SgprB128:
1221 case VgprB128:
1222 case UniInVgprB128:
1223 if (Ty == LLT::scalar(128) || Ty == LLT::fixed_vector(4, 32) ||
1224 Ty == LLT::fixed_vector(2, 64) || Ty == LLT::fixed_vector(8, 16) ||
1225 isAnyPtr(Ty, 128))
1226 return Ty;
1227 return LLT();
1228 case VgprB160:
1229 case UniInVgprB160:
1230 if (Ty.getSizeInBits() == 160)
1231 return Ty;
1232 return LLT();
1233 case SgprB256:
1234 case VgprB256:
1235 case UniInVgprB256:
1236 if (Ty == LLT::scalar(256) || Ty == LLT::fixed_vector(8, 32) ||
1237 Ty == LLT::fixed_vector(4, 64) || Ty == LLT::fixed_vector(16, 16))
1238 return Ty;
1239 return LLT();
1240 case SgprB512:
1241 case VgprB512:
1242 case UniInVgprB512:
1243 if (Ty == LLT::scalar(512) || Ty == LLT::fixed_vector(16, 32) ||
1244 Ty == LLT::fixed_vector(8, 64))
1245 return Ty;
1246 return LLT();
1247 case SgprBRC: {
1248 const SIRegisterInfo *TRI =
1249 static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1250 unsigned LLTSize = Ty.getSizeInBits();
1251 if (LLTSize >= 32 && TRI->getSGPRClassForBitWidth(LLTSize))
1252 return Ty;
1253 return LLT();
1254 }
1255 case VgprBRC: {
1256 const SIRegisterInfo *TRI =
1257 static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1258 if (TRI->getSGPRClassForBitWidth(Ty.getSizeInBits()))
1259 return Ty;
1260 return LLT();
1261 }
1262 default:
1263 return LLT();
1264 }
1265}
1266
1267const RegisterBank *
1268RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
1269 switch (ID) {
1270 case Vcc:
1271 return VccRB;
1272 case Sgpr16:
1273 case Sgpr32:
1274 case Sgpr32_WF:
1275 case Sgpr64:
1276 case Sgpr128:
1277 case SgprP0:
1278 case SgprP0Call_WF:
1279 case SgprP1:
1280 case SgprP2:
1281 case SgprP3:
1282 case SgprP4:
1283 case SgprP4Call_WF:
1284 case SgprP5:
1285 case SgprP8:
1286 case SgprPtr32:
1287 case SgprPtr64:
1288 case SgprPtr128:
1289 case SgprV2S16:
1290 case SgprV2S32:
1291 case SgprV4S32:
1292 case SgprV4S32_WF:
1293 case SgprB32:
1294 case SgprB64:
1295 case SgprB96:
1296 case SgprB128:
1297 case SgprB256:
1298 case SgprB512:
1299 case SgprBRC:
1300 case UniInVcc:
1301 case UniInVgprS16:
1302 case UniInVgprS32:
1303 case UniInVgprS64:
1304 case UniInVgprV2S16:
1305 case UniInVgprV2S32:
1306 case UniInVgprV4S32:
1307 case UniInVgprV2S64:
1308 case UniInVgprB32:
1309 case UniInVgprB64:
1310 case UniInVgprB96:
1311 case UniInVgprB128:
1312 case UniInVgprB160:
1313 case UniInVgprB256:
1314 case UniInVgprB512:
1315 case Sgpr32Trunc:
1316 case Sgpr32AExt:
1317  case Sgpr32AExtBoolInReg:
1318  case Sgpr32SExt:
1319 case Sgpr32ZExt:
1320 return SgprRB;
1321 case Vgpr16:
1322 case Vgpr32:
1323 case Vgpr64:
1324 case Vgpr128:
1325 case VgprP0:
1326 case VgprP1:
1327 case VgprP2:
1328 case VgprP3:
1329 case VgprP4:
1330 case VgprP5:
1331 case VgprPtr32:
1332 case VgprPtr64:
1333 case VgprPtr128:
1334 case VgprV2S16:
1335 case VgprV2S32:
1336 case VgprV2S64:
1337 case VgprV3S32:
1338 case VgprV4S32:
1339 case VgprB32:
1340 case VgprB64:
1341 case VgprB96:
1342 case VgprB128:
1343 case VgprB160:
1344 case VgprB256:
1345 case VgprB512:
1346 case VgprBRC:
1347 case Vgpr32AExt:
1348 case Vgpr32SExt:
1349 case Vgpr32ZExt:
1350 return VgprRB;
1351 default:
1352 return nullptr;
1353 }
1354}
1355
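// Rewrite def operands according to MethodIDs: plain vcc/sgpr/vgpr IDs only
// assert that the existing bank and type already match, while UniInVcc,
// UniInVgpr* and Sgpr32Trunc retarget the def to a temporary register and
// emit the copy-from-vcc, readanylane or trunc needed to produce the original
// uniform result.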
1356bool RegBankLegalizeHelper::applyMappingDst(
1357 MachineInstr &MI, unsigned &OpIdx,
1358 const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs) {
1359 // Defs start from operand 0
1360 for (; OpIdx < MethodIDs.size(); ++OpIdx) {
1361 if (MethodIDs[OpIdx] == None)
1362 continue;
1363 MachineOperand &Op = MI.getOperand(OpIdx);
1364 Register Reg = Op.getReg();
1365 LLT Ty = MRI.getType(Reg);
1366 [[maybe_unused]] const RegisterBank *RB = MRI.getRegBank(Reg);
1367
1368 switch (MethodIDs[OpIdx]) {
1369 // vcc, sgpr and vgpr scalars, pointers and vectors
1370 case Vcc:
1371 case Sgpr16:
1372 case Sgpr32:
1373 case Sgpr64:
1374 case Sgpr128:
1375 case SgprP0:
1376 case SgprP1:
1377 case SgprP3:
1378 case SgprP4:
1379 case SgprP5:
1380 case SgprP8:
1381 case SgprV2S16:
1382 case SgprV2S32:
1383 case SgprV4S32:
1384 case Vgpr16:
1385 case Vgpr32:
1386 case Vgpr64:
1387 case Vgpr128:
1388 case VgprP0:
1389 case VgprP1:
1390 case VgprP2:
1391 case VgprP3:
1392 case VgprP4:
1393 case VgprP5:
1394 case VgprV2S16:
1395 case VgprV2S32:
1396 case VgprV2S64:
1397 case VgprV3S32:
1398 case VgprV4S32: {
1399 assert(Ty == getTyFromID(MethodIDs[OpIdx]));
1400 assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
1401 break;
1402 }
1403 // sgpr and vgpr B-types
1404 case SgprB32:
1405 case SgprB64:
1406 case SgprB96:
1407 case SgprB128:
1408 case SgprB256:
1409 case SgprB512:
1410 case SgprBRC:
1411 case SgprPtr32:
1412 case SgprPtr64:
1413 case SgprPtr128:
1414 case VgprB32:
1415 case VgprB64:
1416 case VgprB96:
1417 case VgprB128:
1418 case VgprB160:
1419 case VgprB256:
1420 case VgprB512:
1421 case VgprBRC:
1422 case VgprPtr32:
1423 case VgprPtr64:
1424 case VgprPtr128: {
1425 assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
1426 assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
1427 break;
1428 }
1429 // uniform in vcc/vgpr: scalars, vectors and B-types
1430 case UniInVcc: {
1431 assert(Ty == S1);
1432 assert(RB == SgprRB);
1433 Register NewDst = MRI.createVirtualRegister(VccRB_S1);
1434 Op.setReg(NewDst);
1435 if (!MRI.use_empty(Reg)) {
1436 auto CopyS32_Vcc =
1437 B.buildInstr(AMDGPU::G_AMDGPU_COPY_SCC_VCC, {SgprRB_S32}, {NewDst});
1438 B.buildTrunc(Reg, CopyS32_Vcc);
1439 }
1440 break;
1441 }
1442 case UniInVgprS16: {
1443 assert(Ty == getTyFromID(MethodIDs[OpIdx]));
1444 assert(RB == SgprRB);
1445 Register NewVgprDstS16 = MRI.createVirtualRegister({VgprRB, S16});
1446 Register NewVgprDstS32 = MRI.createVirtualRegister({VgprRB, S32});
1447 Register NewSgprDstS32 = MRI.createVirtualRegister({SgprRB, S32});
1448 Op.setReg(NewVgprDstS16);
1449 B.buildAnyExt(NewVgprDstS32, NewVgprDstS16);
1450 buildReadAnyLane(B, NewSgprDstS32, NewVgprDstS32, RBI);
1451 B.buildTrunc(Reg, NewSgprDstS32);
1452 break;
1453 }
1454 case UniInVgprS32:
1455 case UniInVgprS64:
1456 case UniInVgprV2S16:
1457 case UniInVgprV2S32:
1458 case UniInVgprV4S32:
1459 case UniInVgprV2S64: {
1460 assert(Ty == getTyFromID(MethodIDs[OpIdx]));
1461 assert(RB == SgprRB);
1462 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
1463 Op.setReg(NewVgprDst);
1464 buildReadAnyLane(B, Reg, NewVgprDst, RBI);
1465 break;
1466 }
1467 case UniInVgprB32:
1468 case UniInVgprB64:
1469 case UniInVgprB96:
1470 case UniInVgprB128:
1471 case UniInVgprB160:
1472 case UniInVgprB256:
1473 case UniInVgprB512: {
1474 assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
1475 assert(RB == SgprRB);
1476 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
1477 Op.setReg(NewVgprDst);
1478 AMDGPU::buildReadAnyLane(B, Reg, NewVgprDst, RBI);
1479 break;
1480 }
1481 // sgpr trunc
1482 case Sgpr32Trunc: {
1483 assert(Ty.getSizeInBits() < 32);
1484 assert(RB == SgprRB);
1485 Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
1486 Op.setReg(NewDst);
1487 if (!MRI.use_empty(Reg))
1488 B.buildTrunc(Reg, NewDst);
1489 break;
1490 }
1491 case InvalidMapping: {
1492      reportGISelFailure(
1493          MF, MORE, "amdgpu-regbanklegalize",
1494 "AMDGPU RegBankLegalize: missing fast rule ('Div' or 'Uni') for", MI);
1495 return false;
1496 }
1497 default:
1498      reportGISelFailure(
1499          MF, MORE, "amdgpu-regbanklegalize",
1500 "AMDGPU RegBankLegalize: applyMappingDst, ID not supported", MI);
1501 return false;
1502 }
1503 }
1504
1505 return true;
1506}
1507
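// Rewrite use operands: insert copies to vgpr where a vgpr ID requires one,
// any/sign/zero-extend sub-32-bit sgpr sources, readfirstlane for SgprB32_M0,
// and record operands that will need a waterfall loop for the *_WF IDs.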
1508bool RegBankLegalizeHelper::applyMappingSrc(
1509 MachineInstr &MI, unsigned &OpIdx,
1510 const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs,
1511 WaterfallInfo &WFI) {
1512 for (unsigned i = 0; i < MethodIDs.size(); ++OpIdx, ++i) {
1513 if (MethodIDs[i] == None || MethodIDs[i] == IntrId || MethodIDs[i] == Imm)
1514 continue;
1515
1516 MachineOperand &Op = MI.getOperand(OpIdx);
1517 Register Reg = Op.getReg();
1518 LLT Ty = MRI.getType(Reg);
1519 const RegisterBank *RB = MRI.getRegBank(Reg);
1520
1521 switch (MethodIDs[i]) {
1522 case Vcc: {
1523 assert(Ty == S1);
1524 assert(RB == VccRB || RB == SgprRB);
1525 if (RB == SgprRB) {
1526 auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
1527 auto CopyVcc_Scc =
1528 B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {VccRB_S1}, {Aext});
1529 Op.setReg(CopyVcc_Scc.getReg(0));
1530 }
1531 break;
1532 }
1533 // sgpr scalars, pointers and vectors
1534 case Sgpr16:
1535 case Sgpr32:
1536 case Sgpr64:
1537 case Sgpr128:
1538 case SgprP0:
1539 case SgprP1:
1540 case SgprP3:
1541 case SgprP4:
1542 case SgprP5:
1543 case SgprP8:
1544 case SgprV2S16:
1545 case SgprV2S32:
1546 case SgprV4S32: {
1547 assert(Ty == getTyFromID(MethodIDs[i]));
1548 assert(RB == getRegBankFromID(MethodIDs[i]));
1549 break;
1550 }
1551 // sgpr B-types
1552 case SgprB32:
1553 case SgprB64:
1554 case SgprB96:
1555 case SgprB128:
1556 case SgprB256:
1557 case SgprB512:
1558 case SgprBRC:
1559 case SgprPtr32:
1560 case SgprPtr64:
1561 case SgprPtr128: {
1562 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
1563 assert(RB == getRegBankFromID(MethodIDs[i]));
1564 break;
1565 }
1566 // vgpr scalars, pointers and vectors
1567 case Vgpr16:
1568 case Vgpr32:
1569 case Vgpr64:
1570 case Vgpr128:
1571 case VgprP0:
1572 case VgprP1:
1573 case VgprP2:
1574 case VgprP3:
1575 case VgprP4:
1576 case VgprP5:
1577 case VgprV2S16:
1578 case VgprV2S32:
1579 case VgprV2S64:
1580 case VgprV3S32:
1581 case VgprV4S32: {
1582 assert(Ty == getTyFromID(MethodIDs[i]));
1583 if (RB != VgprRB) {
1584 auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
1585 Op.setReg(CopyToVgpr.getReg(0));
1586 }
1587 break;
1588 }
1589 // vgpr B-types
1590 case VgprB32:
1591 case VgprB64:
1592 case VgprB96:
1593 case VgprB128:
1594 case VgprB160:
1595 case VgprB256:
1596 case VgprB512:
1597 case VgprBRC:
1598 case VgprPtr32:
1599 case VgprPtr64:
1600 case VgprPtr128: {
1601 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
1602 if (RB != VgprRB) {
1603 auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
1604 Op.setReg(CopyToVgpr.getReg(0));
1605 }
1606 break;
1607 }
1608 // sgpr waterfall, scalars, and vectors
1609 case Sgpr32_WF:
1610 case SgprV4S32_WF: {
1611 assert(Ty == getTyFromID(MethodIDs[i]));
1612 if (RB != SgprRB) {
1613 WFI.SgprWaterfallOperandRegs.insert(Reg);
1614 if (!WFI.Start.isValid()) {
1615 WFI.Start = MI.getIterator();
1616 WFI.End = std::next(MI.getIterator());
1617 }
1618 }
1619 break;
1620 }
1621 case SgprP0Call_WF:
1622 case SgprP4Call_WF: {
1623 assert(Ty == getTyFromID(MethodIDs[i]));
1624 if (RB != SgprRB) {
1625 WFI.SgprWaterfallOperandRegs.insert(Reg);
1626
1627 // Find the ADJCALLSTACKUP before the call.
1628 MachineBasicBlock::iterator Start = MI.getIterator();
1629 while (Start->getOpcode() != AMDGPU::ADJCALLSTACKUP)
1630 --Start;
1631
1632 // Find the ADJCALLSTACKDOWN after the call (include it in range).
1633 MachineBasicBlock::iterator End = MI.getIterator();
1634 while (End->getOpcode() != AMDGPU::ADJCALLSTACKDOWN)
1635 ++End;
1636 ++End;
1637
1638 B.setInsertPt(*MI.getParent(), Start);
1639 WFI.Start = Start;
1640 WFI.End = End;
1641 }
1642 break;
1643 }
1644 case SgprB32_M0: {
1645 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
1646 if (RB == SgprRB)
1647 break;
1648 assert(RB == VgprRB);
1649 Register NewSGPR32 = MRI.createVirtualRegister({SgprRB, Ty});
1650 buildReadFirstLane(B, NewSGPR32, Op.getReg(), RBI);
1651 Op.setReg(NewSGPR32);
1652 break;
1653 }
1654 // sgpr and vgpr scalars with extend
1655 case Sgpr32AExt: {
1656 // Note: this ext allows S1, and it is meant to be combined away.
1657 assert(Ty.getSizeInBits() < 32);
1658 assert(RB == SgprRB);
1659 auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
1660 Op.setReg(Aext.getReg(0));
1661 break;
1662 }
1663 case Sgpr32AExtBoolInReg: {
1664 // Note: this ext allows S1, and it is meant to be combined away.
1665 assert(Ty.getSizeInBits() == 1);
1666 assert(RB == SgprRB);
1667 auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
1668      // Zext of SgprS1 is not legal; AND with 1 instead. This instruction is most
1669      // of the time meant to be combined away in AMDGPURegBankCombiner.
1670 auto Cst1 = B.buildConstant(SgprRB_S32, 1);
1671 auto BoolInReg = B.buildAnd(SgprRB_S32, Aext, Cst1);
1672 Op.setReg(BoolInReg.getReg(0));
1673 break;
1674 }
1675 case Sgpr32SExt: {
1676 assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
1677 assert(RB == SgprRB);
1678 auto Sext = B.buildSExt(SgprRB_S32, Reg);
1679 Op.setReg(Sext.getReg(0));
1680 break;
1681 }
1682 case Sgpr32ZExt: {
1683 assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
1684 assert(RB == SgprRB);
1685 auto Zext = B.buildZExt({SgprRB, S32}, Reg);
1686 Op.setReg(Zext.getReg(0));
1687 break;
1688 }
1689 case Vgpr32AExt: {
1690 assert(Ty.getSizeInBits() < 32);
1691 assert(RB == VgprRB);
1692 auto Aext = B.buildAnyExt({VgprRB, S32}, Reg);
1693 Op.setReg(Aext.getReg(0));
1694 break;
1695 }
1696 case Vgpr32SExt: {
1697 // Note this ext allows S1, and it is meant to be combined away.
1698 assert(Ty.getSizeInBits() < 32);
1699 assert(RB == VgprRB);
1700 auto Sext = B.buildSExt({VgprRB, S32}, Reg);
1701 Op.setReg(Sext.getReg(0));
1702 break;
1703 }
1704 case Vgpr32ZExt: {
1705 // Note this ext allows S1, and it is meant to be combined away.
1706 assert(Ty.getSizeInBits() < 32);
1707 assert(RB == VgprRB);
1708 auto Zext = B.buildZExt({VgprRB, S32}, Reg);
1709 Op.setReg(Zext.getReg(0));
1710 break;
1711 }
1712 default:
1713      reportGISelFailure(
1714          MF, MORE, "amdgpu-regbanklegalize",
1715 "AMDGPU RegBankLegalize: applyMappingSrc, ID not supported", MI);
1716 return false;
1717 }
1718 }
1719 return true;
1720}
1721
1722[[maybe_unused]] static bool verifyRegBankOnOperands(MachineInstr &MI,
1723 const RegisterBank *RB,
1724                                                      MachineRegisterInfo &MRI,
1725                                                      unsigned StartOpIdx,
1726 unsigned EndOpIdx) {
1727 for (unsigned i = StartOpIdx; i <= EndOpIdx; ++i) {
1728 if (MRI.getRegBankOrNull(MI.getOperand(i).getReg()) != RB)
1729 return false;
1730 }
1731 return true;
1732}
1733
1734void RegBankLegalizeHelper::applyMappingPHI(MachineInstr &MI) {
1735  const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
1736 // Put RB on all registers
1737 unsigned NumDefs = MI.getNumDefs();
1738 unsigned NumOperands = MI.getNumOperands();
1739
1740 assert(verifyRegBankOnOperands(MI, RB, MRI, 0, NumDefs - 1));
1741 if (RB == SgprRB)
1742 assert(verifyRegBankOnOperands(MI, RB, MRI, NumDefs, NumOperands - 1));
1743
1744 if (RB == VgprRB) {
1745 B.setInstr(MI);
1746 for (unsigned i = NumDefs; i < NumOperands; ++i) {
1747 Register Reg = MI.getOperand(i).getReg();
1748 if (MRI.getRegBank(Reg) != RB) {
1749 auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
1750 MI.getOperand(i).setReg(Copy.getReg(0));
1751 }
1752 }
1753 }
1754}
1755
1756bool RegBankLegalizeHelper::applyRegisterBanksINTRIN_IMAGE(MachineInstr &MI) {
1757  const AMDGPU::RsrcIntrinsic *RSrcIntrin =
1758      AMDGPU::lookupRsrcIntrinsic(AMDGPU::getIntrinsicID(MI));
1759  assert(RSrcIntrin && RSrcIntrin->IsImage);
1760
1761 unsigned RsrcIdx = RSrcIntrin->RsrcArg;
1762 const unsigned NumDefs = MI.getNumExplicitDefs();
1763
1764 // The reported argument index is relative to the IR intrinsic call arguments,
1765 // so we need to shift by the number of defs and the intrinsic ID.
1766 RsrcIdx += NumDefs + 1;
1767
1768 MachineBasicBlock *MBB = MI.getParent();
1769 B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(MI.getIterator())));
1770
1771  // Defs (for image loads with return) are vgpr.
1772 for (unsigned i = 0; i < NumDefs; ++i) {
1773 const RegisterBank *RB = MRI.getRegBank(MI.getOperand(i).getReg());
1774 if (RB == VgprRB)
1775 continue;
1776
1777 Register Reg = MI.getOperand(i).getReg();
1778 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, MRI.getType(Reg)});
1779 MI.getOperand(i).setReg(NewVgprDst);
1780 buildReadAnyLane(B, Reg, NewVgprDst, RBI);
1781 }
1782
1783 B.setInstrAndDebugLoc(MI);
1784
1785  // Register uses (before RsrcIdx) are vgpr.
1786 for (unsigned i = 1; i < RsrcIdx; ++i) {
1787 MachineOperand &Op = MI.getOperand(i);
1788 if (!Op.isReg())
1789 continue;
1790
1791 Register Reg = Op.getReg();
1792 if (!Reg.isVirtual())
1793 continue;
1794
1795 if (MRI.getRegBank(Reg) == VgprRB)
1796 continue;
1797
1798 auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
1799 Op.setReg(Copy.getReg(0));
1800 }
1801
1802 SmallSet<Register, 4> OpsToWaterfall;
1803
1804  // Register use RsrcIdx (and RsrcIdx+1 in some cases) is sgpr.
1805 for (unsigned i = RsrcIdx; i < MI.getNumOperands(); ++i) {
1806 MachineOperand &Op = MI.getOperand(i);
1807 if (!Op.isReg())
1808 continue;
1809
1810 Register Reg = Op.getReg();
1811 if (MRI.getRegBank(Reg) != SgprRB)
1812 OpsToWaterfall.insert(Reg);
1813 }
1814
1815 if (!OpsToWaterfall.empty()) {
1816 MachineBasicBlock::iterator MII = MI.getIterator();
1817 executeInWaterfallLoop(B, {OpsToWaterfall, MII, std::next(MII)});
1818 }
1819
1820 return true;
1821}
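// Illustrative sketch of the function above (hypothetical MIR): for an image
// intrinsic such as
//   %d:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD ..., %coord, %rsrc, %samp
// uses before RsrcIdx are copied to vgpr when needed, and any %rsrc/%samp
// that is not already sgpr is collected into OpsToWaterfall so the
// instruction is wrapped in a waterfall loop by executeInWaterfallLoop.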