LLVM 23.0.0git
AMDGPUInstructionSelector.cpp
Go to the documentation of this file.
1//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the InstructionSelector class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
15#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
19#include "AMDGPUTargetMachine.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
30#include <optional>
31
32#define DEBUG_TYPE "amdgpu-isel"
33
34using namespace llvm;
35using namespace MIPatternMatch;
36
37#define GET_GLOBALISEL_IMPL
38#define AMDGPUSubtarget GCNSubtarget
39#include "AMDGPUGenGlobalISel.inc"
40#undef GET_GLOBALISEL_IMPL
41#undef AMDGPUSubtarget
42
// Constructor parameter list, member initializers and (empty) body.
// The initializer list caches the instruction info and register info obtained
// from STI and stores the register-bank info, target machine and subtarget.
// NOTE(review): this extract skips the line that names the constructor and
// several lines around the two .inc includes below (presumably the macro
// guards selecting which part of the generated file is pulled in) -- confirm
// against the full source.
44 const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
45 const AMDGPUTargetMachine &TM)
46 : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
47 STI(STI),
49#include "AMDGPUGenGlobalISel.inc"
52#include "AMDGPUGenGlobalISel.inc"
54{
55}
56
57const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
58
69
70// Return the wave level SGPR base address if this is a wave address.
// When Def is a G_AMDGPU_WAVE_ADDRESS the register of its operand 1 is
// returned; for any other opcode a default-constructed Register is returned.
// NOTE(review): the signature line (numbered 71) is missing from this extract.
72 return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
73 ? Def->getOperand(1).getReg()
74 : Register();
75}
76
// Report an "intrinsic not supported on subtarget" error for instruction I.
// The diagnostic is attached to the function containing I and carries I's
// debug location; it is emitted through the function's LLVMContext.
// NOTE(review): the signature line (numbered 77) is missing from this extract.
78 const Function &F = I.getMF()->getFunction();
79 F.getContext().diagnose(DiagnosticInfoUnsupported(
80 F, "intrinsic not supported on subtarget", I.getDebugLoc(), DS_Error));
81}
82
// Decide whether Reg should be treated as a VCC-bank boolean.
//
// - Physical registers are never reported as VCC.
// - If Reg already has a register class, it is VCC only when its type is a
//   valid 1-bit type, its defining instruction is not G_TRUNC, and
//   RC->hasSuperClassEq(TRI.getBoolRC()) holds.
// - Otherwise the decision is made purely on the register bank ID.
83bool AMDGPUInstructionSelector::isVCC(Register Reg,
84 const MachineRegisterInfo &MRI) const {
85 // The verifier is oblivious to s1 being a valid value for wavesize registers.
86 if (Reg.isPhysical())
87 return false;
88
89 auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
// NOTE(review): the line numbered 91 (the initializer of RC) is missing from
// this extract, so the declaration below is incomplete as shown.
90 const TargetRegisterClass *RC =
92 if (RC) {
93 const LLT Ty = MRI.getType(Reg);
94 if (!Ty.isValid() || Ty.getSizeInBits() != 1)
95 return false;
96 // G_TRUNC s1 result is never vcc.
97 return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
98 RC->hasSuperClassEq(TRI.getBoolRC());
99 }
100
// No register class assigned: the union holds a register bank.
101 const RegisterBank *RB = cast<const RegisterBank *>(RegClassOrBank);
102 return RB->getID() == AMDGPU::VCCRegBankID;
103}
104
105bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
106 unsigned NewOpc) const {
107 MI.setDesc(TII.get(NewOpc));
108 MI.removeOperand(1); // Remove intrinsic ID.
109 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
110
111 MachineOperand &Dst = MI.getOperand(0);
112 MachineOperand &Src = MI.getOperand(1);
113
114 // TODO: This should be legalized to s32 if needed
115 if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
116 return false;
117
118 const TargetRegisterClass *DstRC
119 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
120 const TargetRegisterClass *SrcRC
121 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
122 if (!DstRC || DstRC != SrcRC)
123 return false;
124
125 if (!RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) ||
126 !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
127 return false;
128 const MCInstrDesc &MCID = MI.getDesc();
129 if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) {
130 MI.getOperand(0).setIsEarlyClobber(true);
131 }
132 return true;
133}
134
// Select a copy instruction I (operand 0 = destination, operand 1 = source).
//
// The descriptor of I is switched to TargetOpcode::COPY up front. Three cases
// follow:
//  - VCC destination, source is physical SCC: only the destination is
//    constrained and the COPY is kept.
//  - VCC destination, non-VCC source: I is erased and replaced either by a
//    move of 0 / -1 (constant source) or by an AND-with-1 followed by a
//    compare-not-equal-zero that writes the destination.
//  - Everything else: the COPY is kept and its virtual register operands are
//    constrained where a class can be determined.
//
// Returns false only when a required register constraint fails.
135bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
136 const DebugLoc &DL = I.getDebugLoc();
137 MachineBasicBlock *BB = I.getParent();
138 I.setDesc(TII.get(TargetOpcode::COPY));
139
140 const MachineOperand &Src = I.getOperand(1);
141 MachineOperand &Dst = I.getOperand(0);
142 Register DstReg = Dst.getReg();
143 Register SrcReg = Src.getReg();
144
145 if (isVCC(DstReg, *MRI)) {
// Copy from SCC: succeed even if no class can be derived for the destination.
146 if (SrcReg == AMDGPU::SCC) {
147 const TargetRegisterClass *RC
148 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
149 if (!RC)
150 return true;
151 return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
152 }
153
154 if (!isVCC(SrcReg, *MRI)) {
155 // TODO: Should probably leave the copy and let copyPhysReg expand it.
156 if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
157 return false;
158
159 const TargetRegisterClass *SrcRC
160 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
161
// Constant source: materialize all-ones (-1) or zero directly, using the
// 64-bit or 32-bit scalar move depending on the wave size.
162 std::optional<ValueAndVReg> ConstVal =
163 getIConstantVRegValWithLookThrough(SrcReg, *MRI, true);
164 if (ConstVal) {
165 unsigned MovOpc =
166 STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
167 BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
168 .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
169 } else {
170 Register MaskedReg = MRI->createVirtualRegister(SrcRC);
171
172 // We can't trust the high bits at this point, so clear them.
173
174 // TODO: Skip masking high bits if def is known boolean.
175
// 16-bit source class: use the true16 AND / compare forms, which take
// explicit modifier operands (all zero here).
176 if (AMDGPU::getRegBitWidth(SrcRC->getID()) == 16) {
177 assert(Subtarget->useRealTrue16Insts());
178 const int64_t NoMods = 0;
179 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_AND_B16_t16_e64), MaskedReg)
180 .addImm(NoMods)
181 .addImm(1)
182 .addImm(NoMods)
183 .addReg(SrcReg)
184 .addImm(NoMods);
185 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U16_t16_e64), DstReg)
186 .addImm(NoMods)
187 .addImm(0)
188 .addImm(NoMods)
189 .addReg(MaskedReg)
190 .addImm(NoMods);
191 } else {
// Otherwise mask with a scalar or vector 32-bit AND, chosen by the source
// class, then compare the masked value against zero.
192 bool IsSGPR = TRI.isSGPRClass(SrcRC);
193 unsigned AndOpc = IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
194 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
195 .addImm(1)
196 .addReg(SrcReg);
197 if (IsSGPR)
198 And.setOperandDead(3); // Dead scc
199
200 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
201 .addImm(0)
202 .addReg(MaskedReg);
203 }
204 }
205
// Give the source a concrete class if it does not have one yet, then drop the
// original copy, which has been fully replaced.
206 if (!MRI->getRegClassOrNull(SrcReg))
207 MRI->setRegClass(SrcReg, SrcRC);
208 I.eraseFromParent();
209 return true;
210 }
211
// VCC-to-VCC copy: keep the COPY and constrain the destination if possible.
212 const TargetRegisterClass *RC =
213 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
214 if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
215 return false;
216
217 return true;
218 }
219
// Non-VCC destination: best-effort constraint of every virtual register
// operand; the result of constrainGenericRegister is deliberately not checked.
220 for (const MachineOperand &MO : I.operands()) {
221 if (MO.getReg().isPhysical())
222 continue;
223
224 const TargetRegisterClass *RC =
225 TRI.getConstrainedRegClassForOperand(MO, *MRI);
226 if (!RC)
227 continue;
228 RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
229 }
230 return true;
231}
232
233bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const {
234 const DebugLoc &DL = I.getDebugLoc();
235 MachineBasicBlock *BB = I.getParent();
236 Register VCCReg = I.getOperand(1).getReg();
237 MachineInstr *Cmp;
238
239 // Set SCC as a side effect with S_CMP or S_OR.
240 if (STI.hasScalarCompareEq64()) {
241 unsigned CmpOpc =
242 STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
243 Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc)).addReg(VCCReg).addImm(0);
244 } else {
245 Register DeadDst = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
246 Cmp = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_OR_B64), DeadDst)
247 .addReg(VCCReg)
248 .addReg(VCCReg);
249 }
250
251 constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI);
252
253 Register DstReg = I.getOperand(0).getReg();
254 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(AMDGPU::SCC);
255
256 I.eraseFromParent();
257 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
258}
259
// Select a copy from an SCC-style boolean (operand 1) into a VCC value
// (operand 0). I is erased in every path.
//
//  - Constant source 0: the destination is set with a scalar move of 0 sized
//    by the wave size.
//  - Constant source 1 (asserted): the destination is a COPY of the exec
//    register returned by TRI.getExec().
//  - Otherwise: the source is copied into SCC and a scalar conditional select
//    between exec and 0 writes the destination.
260bool AMDGPUInstructionSelector::selectCOPY_VCC_SCC(MachineInstr &I) const {
261 const DebugLoc &DL = I.getDebugLoc();
262 MachineBasicBlock *BB = I.getParent();
263
264 Register DstReg = I.getOperand(0).getReg();
265 Register SrcReg = I.getOperand(1).getReg();
266 std::optional<ValueAndVReg> Arg =
267 getIConstantVRegValWithLookThrough(I.getOperand(1).getReg(), *MRI);
268
269 if (Arg) {
270 const int64_t Value = Arg->Value.getZExtValue();
271 if (Value == 0) {
272 unsigned Opcode = STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
273 BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
274 } else {
275 assert(Value == 1);
276 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(TRI.getExec());
277 }
278 I.eraseFromParent();
279 return RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI);
280 }
281
282 // RegBankLegalize ensures that SrcReg is bool in reg (high bits are 0).
283 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC).addReg(SrcReg);
284
285 unsigned SelectOpcode =
286 STI.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
287 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
288 .addReg(TRI.getExec())
289 .addImm(0);
290
291 I.eraseFromParent();
// NOTE(review): the line numbered 292 is missing from this extract; as shown,
// Select is never used after being built -- confirm against the full source.
293 return true;
294}
295
296bool AMDGPUInstructionSelector::selectReadAnyLane(MachineInstr &I) const {
297 Register DstReg = I.getOperand(0).getReg();
298 Register SrcReg = I.getOperand(1).getReg();
299
300 const DebugLoc &DL = I.getDebugLoc();
301 MachineBasicBlock *BB = I.getParent();
302
303 auto RFL = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
304 .addReg(SrcReg);
305
306 I.eraseFromParent();
307 constrainSelectedInstRegOperands(*RFL, TII, TRI, RBI);
308 return true;
309}
310
// Select a G_PHI by turning it into TargetOpcode::PHI and assigning register
// classes to its result and to every incoming value that still only has a
// register bank.
//
// Returns false for s1 results, for a result without a valid type, when no
// register class exists for the result's type/bank, or when any register
// cannot be constrained.
311bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
312 const Register DefReg = I.getOperand(0).getReg();
313 const LLT DefTy = MRI->getType(DefReg);
314
315 // S1 G_PHIs should not be selected in instruction-select, instead:
316 // - divergent S1 G_PHI should go through lane mask merging algorithm
317 // and be fully inst-selected in AMDGPUGlobalISelDivergenceLowering
318 // - uniform S1 G_PHI should be lowered into S32 G_PHI in AMDGPURegBankSelect
319 if (DefTy == LLT::scalar(1))
320 return false;
321
322 // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
323
324 const RegClassOrRegBank &RegClassOrBank =
325 MRI->getRegClassOrRegBank(DefReg);
326
// NOTE(review): the line numbered 328 (the initializer of DefRC) is missing
// from this extract, so the declaration below is incomplete as shown.
327 const TargetRegisterClass *DefRC =
// No class on the result yet: derive one from its type and register bank.
329 if (!DefRC) {
330 if (!DefTy.isValid()) {
331 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
332 return false;
333 }
334
335 const RegisterBank &RB = *cast<const RegisterBank *>(RegClassOrBank);
336 DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
337 if (!DefRC) {
338 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
339 return false;
340 }
341 }
342
343 // If inputs have register bank, assign corresponding reg class.
344 // Note: registers don't need to have the same reg bank.
// Incoming values sit at the odd operand indices (step of 2 from index 1).
345 for (unsigned i = 1; i != I.getNumOperands(); i += 2) {
346 const Register SrcReg = I.getOperand(i).getReg();
347
348 const RegisterBank *RB = MRI->getRegBankOrNull(SrcReg);
349 if (RB) {
350 const LLT SrcTy = MRI->getType(SrcReg);
// NOTE(review): unlike DefRC above, SrcRC is dereferenced without a null
// check -- confirm getRegClassForTypeOnBank cannot fail for these inputs.
351 const TargetRegisterClass *SrcRC =
352 TRI.getRegClassForTypeOnBank(SrcTy, *RB);
353 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
354 return false;
355 }
356 }
357
358 I.setDesc(TII.get(TargetOpcode::PHI));
359 return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
360}
361
// Build an operand describing one 32-bit half (selected by SubIdx) of MO.
//
//  - Register operand: a COPY of the requested subregister (SubIdx composed
//    with MO's own subregister index) into a fresh virtual register of class
//    SubRC is inserted before MO's parent instruction; the returned operand
//    refers to that new register and carries MO's flags.
//  - Immediate operand: returns the low (sub0) or high (sub1) 32 bits of the
//    immediate, sign-extended. Any other SubIdx is unreachable.
//
// NOTE(review): the return-type line (numbered 362) is missing from this
// extract. A virtual register is created even on the immediate path.
363AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
364 const TargetRegisterClass &SubRC,
365 unsigned SubIdx) const {
366
367 MachineInstr *MI = MO.getParent();
368 MachineBasicBlock *BB = MO.getParent()->getParent();
369 Register DstReg = MRI->createVirtualRegister(&SubRC);
370
371 if (MO.isReg()) {
372 unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
373 Register Reg = MO.getReg();
374 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
375 .addReg(Reg, {}, ComposedSubIdx);
376
// Mirror MO's flags on the new operand; the subregister index is 0 because
// the copy above already extracted the requested half.
377 return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
378 MO.isKill(), MO.isDead(), MO.isUndef(),
379 MO.isEarlyClobber(), 0, MO.isDebug(),
380 MO.isInternalRead());
381 }
382
383 assert(MO.isImm());
384
385 APInt Imm(64, MO.getImm());
386
387 switch (SubIdx) {
388 default:
389 llvm_unreachable("do not know to split immediate with this sub index.");
390 case AMDGPU::sub0:
391 return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
392 case AMDGPU::sub1:
393 return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
394 }
395}
396
397static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
398 switch (Opc) {
399 case AMDGPU::G_AND:
400 return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
401 case AMDGPU::G_OR:
402 return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
403 case AMDGPU::G_XOR:
404 return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
405 default:
406 llvm_unreachable("not a bit op");
407 }
408}
409
410bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
411 Register DstReg = I.getOperand(0).getReg();
412 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
413
414 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
415 if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
416 DstRB->getID() != AMDGPU::VCCRegBankID)
417 return false;
418
419 bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
420 STI.isWave64());
421 I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
422
423 // Dead implicit-def of scc
424 I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
425 true, // isImp
426 false, // isKill
427 true)); // isDead
428 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
429 return true;
430}
431
// Select scalar G_ADD / G_SUB.
//
//  - Vector types are rejected.
//  - 32-bit, SGPR bank: S_ADD_U32 / S_SUB_U32 with SCC marked dead.
//  - 32-bit, other bank: the no-carry VALU form is used by mutating I in place
//    when STI.hasAddNoCarryInsts(); otherwise the carry-out VALU form is built
//    with a dead carry result.
//  - Any other size is expected to be an add only (asserted) and is split
//    into sub0/sub1 halves: an add of the low halves, an add-with-carry of
//    the high halves, and a REG_SEQUENCE recombining them.
//
// Returns false for vectors or when the final result cannot be constrained.
432bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
433 MachineBasicBlock *BB = I.getParent();
434 MachineFunction *MF = BB->getParent();
435 Register DstReg = I.getOperand(0).getReg();
436 const DebugLoc &DL = I.getDebugLoc();
437 LLT Ty = MRI->getType(DstReg);
438 if (Ty.isVector())
439 return false;
440
441 unsigned Size = Ty.getSizeInBits();
442 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
443 const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
444 const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
445
446 if (Size == 32) {
447 if (IsSALU) {
448 const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
449 MachineInstr *Add =
450 BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
451 .add(I.getOperand(1))
452 .add(I.getOperand(2))
453 .setOperandDead(3); // Dead scc
454 I.eraseFromParent();
455 constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
456 return true;
457 }
458
// In-place rewrite: append an immediate 0 operand and an implicit EXEC use to
// match the e64 no-carry form.
459 if (STI.hasAddNoCarryInsts()) {
460 const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
461 I.setDesc(TII.get(Opc));
462 I.addOperand(*MF, MachineOperand::CreateImm(0));
463 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
464 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
465 return true;
466 }
467
468 const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
469
// The carry-out is required by the instruction but unused, so it is defined
// into a scratch wave-mask register and marked dead.
470 Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
471 MachineInstr *Add
472 = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
473 .addDef(UnusedCarry, RegState::Dead)
474 .add(I.getOperand(1))
475 .add(I.getOperand(2))
476 .addImm(0);
477 I.eraseFromParent();
478 constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
479 return true;
480 }
481
482 assert(!Sub && "illegal sub should not reach here");
483
484 const TargetRegisterClass &RC
485 = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
486 const TargetRegisterClass &HalfRC
487 = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
488
// Split both sources into low (sub0) and high (sub1) halves.
489 MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
490 MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
491 MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
492 MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
493
494 Register DstLo = MRI->createVirtualRegister(&HalfRC);
495 Register DstHi = MRI->createVirtualRegister(&HalfRC);
496
497 if (IsSALU) {
// The low add leaves its SCC def live (it is not marked dead) so that the
// following S_ADDC_U32 can consume it; only the final SCC def is dead.
498 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
499 .add(Lo1)
500 .add(Lo2);
501 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
502 .add(Hi1)
503 .add(Hi2)
504 .setOperandDead(3); // Dead scc
505 } else {
// VALU: the carry travels through an explicit wave-mask register that is
// killed by the add-with-carry; the second carry-out is dead.
506 const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
507 Register CarryReg = MRI->createVirtualRegister(CarryRC);
508 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
509 .addDef(CarryReg)
510 .add(Lo1)
511 .add(Lo2)
512 .addImm(0);
513 MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
514 .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
515 .add(Hi1)
516 .add(Hi2)
517 .addReg(CarryReg, RegState::Kill)
518 .addImm(0);
519
520 constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI);
521 }
522
// Recombine the two halves into the full-width result.
523 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
524 .addReg(DstLo)
525 .addImm(AMDGPU::sub0)
526 .addReg(DstHi)
527 .addImm(AMDGPU::sub1);
528
529
530 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
531 return false;
532
533 I.eraseFromParent();
534 return true;
535}
536
537bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
538 MachineInstr &I) const {
539 MachineBasicBlock *BB = I.getParent();
540 MachineFunction *MF = BB->getParent();
541 const DebugLoc &DL = I.getDebugLoc();
542 Register Dst0Reg = I.getOperand(0).getReg();
543 Register Dst1Reg = I.getOperand(1).getReg();
544 const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
545 I.getOpcode() == AMDGPU::G_UADDE;
546 const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
547 I.getOpcode() == AMDGPU::G_USUBE;
548
549 if (isVCC(Dst1Reg, *MRI)) {
550 unsigned NoCarryOpc =
551 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
552 unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
553 I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
554 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
555 I.addOperand(*MF, MachineOperand::CreateImm(0));
556 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
557 return true;
558 }
559
560 Register Src0Reg = I.getOperand(2).getReg();
561 Register Src1Reg = I.getOperand(3).getReg();
562
563 if (HasCarryIn) {
564 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
565 .addReg(I.getOperand(4).getReg());
566 }
567
568 unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
569 unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
570
571 auto CarryInst = BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
572 .add(I.getOperand(2))
573 .add(I.getOperand(3));
574
575 if (MRI->use_nodbg_empty(Dst1Reg)) {
576 CarryInst.setOperandDead(3); // Dead scc
577 } else {
578 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
579 .addReg(AMDGPU::SCC);
580 if (!MRI->getRegClassOrNull(Dst1Reg))
581 MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
582 }
583
584 if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
585 !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
586 !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
587 return false;
588
589 if (HasCarryIn &&
590 !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
591 AMDGPU::SReg_32RegClass, *MRI))
592 return false;
593
594 I.eraseFromParent();
595 return true;
596}
597
598bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
599 MachineInstr &I) const {
600 MachineBasicBlock *BB = I.getParent();
601 MachineFunction *MF = BB->getParent();
602 const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
603 bool UseNoCarry = Subtarget->hasMadNC64_32Insts() &&
604 MRI->use_nodbg_empty(I.getOperand(1).getReg());
605
606 unsigned Opc;
607 if (Subtarget->hasMADIntraFwdBug())
608 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
609 : AMDGPU::V_MAD_I64_I32_gfx11_e64;
610 else if (UseNoCarry)
611 Opc = IsUnsigned ? AMDGPU::V_MAD_NC_U64_U32_e64
612 : AMDGPU::V_MAD_NC_I64_I32_e64;
613 else
614 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
615
616 if (UseNoCarry)
617 I.removeOperand(1);
618
619 I.setDesc(TII.get(Opc));
620 I.addOperand(*MF, MachineOperand::CreateImm(0));
621 I.addImplicitDefUseOperands(*MF);
622 I.getOperand(0).setIsEarlyClobber(true);
623 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
624 return true;
625}
626
627// TODO: We should probably legalize these to only using 32-bit results.
628bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
629 MachineBasicBlock *BB = I.getParent();
630 Register DstReg = I.getOperand(0).getReg();
631 Register SrcReg = I.getOperand(1).getReg();
632 LLT DstTy = MRI->getType(DstReg);
633 LLT SrcTy = MRI->getType(SrcReg);
634 const unsigned SrcSize = SrcTy.getSizeInBits();
635 unsigned DstSize = DstTy.getSizeInBits();
636
637 // TODO: Should handle any multiple of 32 offset.
638 unsigned Offset = I.getOperand(2).getImm();
639 if (Offset % 32 != 0 || DstSize > 128)
640 return false;
641
642 // 16-bit operations really use 32-bit registers.
643 // FIXME: Probably should not allow 16-bit G_EXTRACT results.
644 if (DstSize == 16)
645 DstSize = 32;
646
647 const TargetRegisterClass *DstRC =
648 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
649 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
650 return false;
651
652 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
653 const TargetRegisterClass *SrcRC =
654 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
655 if (!SrcRC)
656 return false;
657 unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
658 DstSize / 32);
659 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
660 if (!SrcRC)
661 return false;
662
663 SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
664 *SrcRC, I.getOperand(1));
665 const DebugLoc &DL = I.getDebugLoc();
666 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
667 .addReg(SrcReg, {}, SubReg);
668
669 I.eraseFromParent();
670 return true;
671}
672
// Select an instruction that packs two sources (operands 1 and 2) into the
// low and high 16 bits of a 32-bit result (operand 0).
//
//  - VGPR result, both sources s16 in VGPRs: REG_SEQUENCE over lo16/hi16.
//  - VGPR result otherwise: mask source 0 with 0xFFFF, then V_LSHL_OR_B32
//    shifts source 1 left by 16 and ORs in the masked value.
//  - Non-VGPR result: MI is mutated into one of the S_PACK_*_B32_B16 forms,
//    folding single-use logical right shifts by 16 on the sources into the
//    choice of opcode.
//
// Returns false only when the REG_SEQUENCE result cannot be constrained.
673bool AMDGPUInstructionSelector::selectS16MergeToS32(MachineInstr &MI) const {
674 Register Dst = MI.getOperand(0).getReg();
675 Register Src0 = MI.getOperand(1).getReg();
676 Register Src1 = MI.getOperand(2).getReg();
677
678 LLT Src0Ty = MRI->getType(Src0);
679 LLT Src1Ty = MRI->getType(Src1);
680
681 const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
682 const RegisterBank *Src0Bank = RBI.getRegBank(Src0, *MRI, TRI);
683 const RegisterBank *Src1Bank = RBI.getRegBank(Src1, *MRI, TRI);
684 const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;
685
// Filled in by the mi_match calls further down when a source is a shift.
686 Register ShiftSrc0;
687 Register ShiftSrc1;
688
689 const DebugLoc &DL = MI.getDebugLoc();
690 MachineBasicBlock *BB = MI.getParent();
691
692 // VGPR case
693 if (IsVector) {
694 // If source are both VGPR16, use REG_SEQUENCE with lo16/hi16 subregisters
695 if (Src0Bank->getID() == AMDGPU::VGPRRegBankID &&
696 Src1Bank->getID() == AMDGPU::VGPRRegBankID &&
697 Src0Ty == LLT::scalar(16) && Src1Ty == LLT::scalar(16)) {
698 BuildMI(*BB, MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), Dst)
699 .addReg(Src0)
700 .addImm(AMDGPU::lo16)
701 .addReg(Src1)
702 .addImm(AMDGPU::hi16);
703
704 if (!RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI))
705 return false;
706
707 MI.eraseFromParent();
708 return true;
709 }
710
711 // Otherwise, use V_LSHL_OR_B32_e64
712 Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
713 auto MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
714 .addImm(0xFFFF)
715 .addReg(Src0);
716 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
717
718 MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
719 .addReg(Src1)
720 .addImm(16)
721 .addReg(TmpReg);
722 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
723
724 MI.eraseFromParent();
725 return true;
726 }
727
728 // SGPR case -> S_PACK_*_B32_B16
729 // With multiple uses of the shift, this will duplicate the shift and
730 // increase register pressure.
731 //
732 // (merge (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16)
733 // => (S_PACK_HH_B32_B16 $src0, $src1)
734 // (merge (lshr_oneuse SReg_32:$src0, 16), $src1)
735 // => (S_PACK_HL_B32_B16 $src0, $src1)
736 // (merge $src0, (lshr_oneuse SReg_32:$src1, 16))
737 // => (S_PACK_LH_B32_B16 $src0, $src1)
738 // (merge $src0, $src1)
739 // => (S_PACK_LL_B32_B16 $src0, $src1)
740
741 bool Shift0 = mi_match(
742 Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));
743
744 bool Shift1 = mi_match(
745 Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));
746
// Default: neither source is a foldable shift, so pack the two low halves.
747 unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
748 if (Shift0 && Shift1) {
749 Opc = AMDGPU::S_PACK_HH_B32_B16;
750 MI.getOperand(1).setReg(ShiftSrc0);
751 MI.getOperand(2).setReg(ShiftSrc1);
752 } else if (Shift1) {
753 Opc = AMDGPU::S_PACK_LH_B32_B16;
754 MI.getOperand(2).setReg(ShiftSrc1);
755 } else if (Shift0) {
756 auto ConstSrc1 =
757 getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
758 if (ConstSrc1 && ConstSrc1->Value == 0) {
759 // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
760 auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
761 .addReg(ShiftSrc0)
762 .addImm(16)
763 .setOperandDead(3); // Dead scc
764
765 MI.eraseFromParent();
766 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
767 return true;
768 }
// Without S_PACK_HL support the shift is left in place and the default
// S_PACK_LL opcode is used on the original operands.
769 if (STI.hasSPackHL()) {
770 Opc = AMDGPU::S_PACK_HL_B32_B16;
771 MI.getOperand(1).setReg(ShiftSrc0);
772 }
773 }
774
775 MI.setDesc(TII.get(Opc));
776 constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
777 return true;
778}
779
780bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
781 MachineBasicBlock *BB = MI.getParent();
782 Register DstReg = MI.getOperand(0).getReg();
783 LLT DstTy = MRI->getType(DstReg);
784 LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
785
786 const unsigned SrcSize = SrcTy.getSizeInBits();
787 if (SrcSize < 32) {
788 // Handle s32 <- G_MERGE_VALUES s16, s16
789 if (SrcSize == 16 && DstTy.getSizeInBits() == 32 &&
790 MI.getNumOperands() == 3) {
791 return selectS16MergeToS32(MI);
792 }
793 return selectImpl(MI, *CoverageInfo);
794 }
795
796 const DebugLoc &DL = MI.getDebugLoc();
797 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
798 const unsigned DstSize = DstTy.getSizeInBits();
799 const TargetRegisterClass *DstRC =
800 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
801 if (!DstRC)
802 return false;
803
804 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
805 MachineInstrBuilder MIB =
806 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
807 for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
808 MachineOperand &Src = MI.getOperand(I + 1);
809 MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
810 MIB.addImm(SubRegs[I]);
811
812 const TargetRegisterClass *SrcRC
813 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
814 if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
815 return false;
816 }
817
818 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
819 return false;
820
821 MI.eraseFromParent();
822 return true;
823}
824
825bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
826 MachineBasicBlock *BB = MI.getParent();
827 const int NumDst = MI.getNumOperands() - 1;
828
829 MachineOperand &Src = MI.getOperand(NumDst);
830
831 Register SrcReg = Src.getReg();
832 Register DstReg0 = MI.getOperand(0).getReg();
833 LLT DstTy = MRI->getType(DstReg0);
834 LLT SrcTy = MRI->getType(SrcReg);
835
836 const unsigned DstSize = DstTy.getSizeInBits();
837 const unsigned SrcSize = SrcTy.getSizeInBits();
838 const DebugLoc &DL = MI.getDebugLoc();
839 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
840
841 const TargetRegisterClass *SrcRC =
842 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
843 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
844 return false;
845
846 // Note we could have mixed SGPR and VGPR destination banks for an SGPR
847 // source, and this relies on the fact that the same subregister indices are
848 // used for both.
849 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
850 for (int I = 0, E = NumDst; I != E; ++I) {
851 MachineOperand &Dst = MI.getOperand(I);
852 // hi16:sreg_32 is not allowed so explicitly shift upper 16-bits.
853 if (SrcBank->getID() == AMDGPU::SGPRRegBankID &&
854 SubRegs[I] == AMDGPU::hi16) {
855 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst.getReg())
856 .addReg(SrcReg)
857 .addImm(16);
858 } else {
859 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
860 .addReg(SrcReg, {}, SubRegs[I]);
861 }
862
863 // Make sure the subregister index is valid for the source register.
864 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
865 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
866 return false;
867
868 const TargetRegisterClass *DstRC =
869 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
870 if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
871 return false;
872 }
873
874 MI.eraseFromParent();
875 return true;
876}
877
878bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
879 assert(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
880 MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);
881
882 Register Src0 = MI.getOperand(1).getReg();
883 Register Src1 = MI.getOperand(2).getReg();
884 LLT SrcTy = MRI->getType(Src0);
885 const unsigned SrcSize = SrcTy.getSizeInBits();
886
887 // BUILD_VECTOR with >=32 bits source is handled by MERGE_VALUE.
888 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
889 return selectG_MERGE_VALUES(MI);
890 }
891
892 // Selection logic below is for V2S16 only.
893 // For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
894 Register Dst = MI.getOperand(0).getReg();
895 if (MRI->getType(Dst) != LLT::fixed_vector(2, 16) ||
896 (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
897 SrcTy != LLT::scalar(32)))
898 return selectImpl(MI, *CoverageInfo);
899
900 const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
901 if (DstBank->getID() == AMDGPU::AGPRRegBankID)
902 return false;
903
904 assert(DstBank->getID() == AMDGPU::SGPRRegBankID ||
905 DstBank->getID() == AMDGPU::VGPRRegBankID);
906 const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;
907
908 const DebugLoc &DL = MI.getDebugLoc();
909 MachineBasicBlock *BB = MI.getParent();
910
911 // First, before trying TableGen patterns, check if both sources are
912 // constants. In those cases, we can trivially compute the final constant
913 // and emit a simple move.
914 auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
915 if (ConstSrc1) {
916 auto ConstSrc0 =
917 getAnyConstantVRegValWithLookThrough(Src0, *MRI, true, true);
918 if (ConstSrc0) {
919 const int64_t K0 = ConstSrc0->Value.getSExtValue();
920 const int64_t K1 = ConstSrc1->Value.getSExtValue();
921 uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
922 uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
923 uint32_t Imm = Lo16 | (Hi16 << 16);
924
925 // VALU
926 if (IsVector) {
927 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), Dst).addImm(Imm);
928 MI.eraseFromParent();
929 return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI);
930 }
931
932 // SALU
933 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst).addImm(Imm);
934 MI.eraseFromParent();
935 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
936 }
937 }
938
939 // Now try TableGen patterns.
940 if (selectImpl(MI, *CoverageInfo))
941 return true;
942
943 // TODO: This should probably be a combine somewhere
944 // (build_vector $src0, undef) -> copy $src0
945 MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
946 if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
947 MI.setDesc(TII.get(AMDGPU::COPY));
948 MI.removeOperand(2);
949 const auto &RC =
950 IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
951 return RBI.constrainGenericRegister(Dst, RC, *MRI) &&
952 RBI.constrainGenericRegister(Src0, RC, *MRI);
953 }
954
955 return selectS16MergeToS32(MI);
956}
957
958bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
959 const MachineOperand &MO = I.getOperand(0);
960
961 // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
962 // regbank check here is to know why getConstrainedRegClassForOperand failed.
963 const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
964 if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
965 (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
966 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
967 return true;
968 }
969
970 return false;
971}
972
973bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
974 MachineBasicBlock *BB = I.getParent();
975
976 Register DstReg = I.getOperand(0).getReg();
977 Register Src0Reg = I.getOperand(1).getReg();
978 Register Src1Reg = I.getOperand(2).getReg();
979 LLT Src1Ty = MRI->getType(Src1Reg);
980
981 unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
982 unsigned InsSize = Src1Ty.getSizeInBits();
983
984 int64_t Offset = I.getOperand(3).getImm();
985
986 // FIXME: These cases should have been illegal and unnecessary to check here.
987 if (Offset % 32 != 0 || InsSize % 32 != 0)
988 return false;
989
990 // Currently not handled by getSubRegFromChannel.
991 if (InsSize > 128)
992 return false;
993
994 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
995 if (SubReg == AMDGPU::NoSubRegister)
996 return false;
997
998 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
999 const TargetRegisterClass *DstRC =
1000 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
1001 if (!DstRC)
1002 return false;
1003
1004 const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
1005 const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
1006 const TargetRegisterClass *Src0RC =
1007 TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
1008 const TargetRegisterClass *Src1RC =
1009 TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);
1010
1011 // Deal with weird cases where the class only partially supports the subreg
1012 // index.
1013 Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
1014 if (!Src0RC || !Src1RC)
1015 return false;
1016
1017 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
1018 !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
1019 !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
1020 return false;
1021
1022 const DebugLoc &DL = I.getDebugLoc();
1023 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
1024 .addReg(Src0Reg)
1025 .addReg(Src1Reg)
1026 .addImm(SubReg);
1027
1028 I.eraseFromParent();
1029 return true;
1030}
1031
1032bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
1033 Register DstReg = MI.getOperand(0).getReg();
1034 Register SrcReg = MI.getOperand(1).getReg();
1035 Register OffsetReg = MI.getOperand(2).getReg();
1036 Register WidthReg = MI.getOperand(3).getReg();
1037
1038 assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
1039 "scalar BFX instructions are expanded in regbankselect");
1040 assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
1041 "64-bit vector BFX instructions are expanded in regbankselect");
1042
1043 const DebugLoc &DL = MI.getDebugLoc();
1044 MachineBasicBlock *MBB = MI.getParent();
1045
1046 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
1047 unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
1048 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
1049 .addReg(SrcReg)
1050 .addReg(OffsetReg)
1051 .addReg(WidthReg);
1052 MI.eraseFromParent();
1053 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1054 return true;
1055}
1056
1057bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
1058 if (STI.getLDSBankCount() != 16)
1059 return selectImpl(MI, *CoverageInfo);
1060
1061 Register Dst = MI.getOperand(0).getReg();
1062 Register Src0 = MI.getOperand(2).getReg();
1063 Register M0Val = MI.getOperand(6).getReg();
1064 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
1065 !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
1066 !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
1067 return false;
1068
1069 // This requires 2 instructions. It is possible to write a pattern to support
1070 // this, but the generated isel emitter doesn't correctly deal with multiple
1071 // output instructions using the same physical register input. The copy to m0
1072 // is incorrectly placed before the second instruction.
1073 //
1074 // TODO: Match source modifiers.
1075
1076 Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1077 const DebugLoc &DL = MI.getDebugLoc();
1078 MachineBasicBlock *MBB = MI.getParent();
1079
1080 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1081 .addReg(M0Val);
1082 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
1083 .addImm(2)
1084 .addImm(MI.getOperand(4).getImm()) // $attr
1085 .addImm(MI.getOperand(3).getImm()); // $attrchan
1086
1087 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
1088 .addImm(0) // $src0_modifiers
1089 .addReg(Src0) // $src0
1090 .addImm(MI.getOperand(4).getImm()) // $attr
1091 .addImm(MI.getOperand(3).getImm()) // $attrchan
1092 .addImm(0) // $src2_modifiers
1093 .addReg(InterpMov) // $src2 - 2 f16 values selected by high
1094 .addImm(MI.getOperand(5).getImm()) // $high
1095 .addImm(0) // $clamp
1096 .addImm(0); // $omod
1097
1098 MI.eraseFromParent();
1099 return true;
1100}
1101
1102// Writelane is special in that it can use SGPR and M0 (which would normally
1103// count as using the constant bus twice - but in this case it is allowed since
1104// the lane selector doesn't count as a use of the constant bus). However, it is
1105// still required to abide by the 1 SGPR rule. Fix this up if we might have
1106// multiple SGPRs.
1107bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
1108 // With a constant bus limit of at least 2, there's no issue.
1109 if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
1110 return selectImpl(MI, *CoverageInfo);
1111
1112 MachineBasicBlock *MBB = MI.getParent();
1113 const DebugLoc &DL = MI.getDebugLoc();
1114 Register VDst = MI.getOperand(0).getReg();
1115 Register Val = MI.getOperand(2).getReg();
1116 Register LaneSelect = MI.getOperand(3).getReg();
1117 Register VDstIn = MI.getOperand(4).getReg();
1118
1119 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
1120
1121 std::optional<ValueAndVReg> ConstSelect =
1122 getIConstantVRegValWithLookThrough(LaneSelect, *MRI);
1123 if (ConstSelect) {
1124 // The selector has to be an inline immediate, so we can use whatever for
1125 // the other operands.
1126 MIB.addReg(Val);
1127 MIB.addImm(ConstSelect->Value.getSExtValue() &
1128 maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
1129 } else {
1130 std::optional<ValueAndVReg> ConstVal =
1132
1133 // If the value written is an inline immediate, we can get away without a
1134 // copy to m0.
1135 if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
1136 STI.hasInv2PiInlineImm())) {
1137 MIB.addImm(ConstVal->Value.getSExtValue());
1138 MIB.addReg(LaneSelect);
1139 } else {
1140 MIB.addReg(Val);
1141
1142 // If the lane selector was originally in a VGPR and copied with
1143 // readfirstlane, there's a hazard to read the same SGPR from the
1144 // VALU. Constrain to a different SGPR to help avoid needing a nop later.
1145 RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);
1146
1147 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1148 .addReg(LaneSelect);
1149 MIB.addReg(AMDGPU::M0);
1150 }
1151 }
1152
1153 MIB.addReg(VDstIn);
1154
1155 MI.eraseFromParent();
1156 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1157 return true;
1158}
1159
1160// We need to handle this here because tablegen doesn't support matching
1161// instructions with multiple outputs.
1162bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
1163 Register Dst0 = MI.getOperand(0).getReg();
1164 Register Dst1 = MI.getOperand(1).getReg();
1165
1166 LLT Ty = MRI->getType(Dst0);
1167 unsigned Opc;
1168 if (Ty == LLT::scalar(32))
1169 Opc = AMDGPU::V_DIV_SCALE_F32_e64;
1170 else if (Ty == LLT::scalar(64))
1171 Opc = AMDGPU::V_DIV_SCALE_F64_e64;
1172 else
1173 return false;
1174
1175 // TODO: Match source modifiers.
1176
1177 const DebugLoc &DL = MI.getDebugLoc();
1178 MachineBasicBlock *MBB = MI.getParent();
1179
1180 Register Numer = MI.getOperand(3).getReg();
1181 Register Denom = MI.getOperand(4).getReg();
1182 unsigned ChooseDenom = MI.getOperand(5).getImm();
1183
1184 Register Src0 = ChooseDenom != 0 ? Numer : Denom;
1185
1186 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
1187 .addDef(Dst1)
1188 .addImm(0) // $src0_modifiers
1189 .addUse(Src0) // $src0
1190 .addImm(0) // $src1_modifiers
1191 .addUse(Denom) // $src1
1192 .addImm(0) // $src2_modifiers
1193 .addUse(Numer) // $src2
1194 .addImm(0) // $clamp
1195 .addImm(0); // $omod
1196
1197 MI.eraseFromParent();
1198 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1199 return true;
1200}
1201
// Select a G_INTRINSIC by dispatching on its intrinsic ID.
//
// A handful of intrinsics are routed to dedicated manual selectors; a few are
// rejected (return false) when the subtarget lacks the required feature; all
// remaining ones fall through to the generated selectImpl patterns.
1202bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
1203 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
1204 switch (IntrinsicID) {
1205 case Intrinsic::amdgcn_if_break: {
1206 MachineBasicBlock *BB = I.getParent();
1207
1208 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1209 // SelectionDAG uses for wave32 vs wave64.
1210 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
1211 .add(I.getOperand(0))
1212 .add(I.getOperand(2))
1213 .add(I.getOperand(3));
1214
// Capture the registers before the original instruction is erased.
1215 Register DstReg = I.getOperand(0).getReg();
1216 Register Src0Reg = I.getOperand(2).getReg();
1217 Register Src1Reg = I.getOperand(3).getReg();
1218
1219 I.eraseFromParent();
1220
// All three registers get the wave mask register class.
1221 for (Register Reg : { DstReg, Src0Reg, Src1Reg })
1222 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1223
1224 return true;
1225 }
1226 case Intrinsic::amdgcn_interp_p1_f16:
1227 return selectInterpP1F16(I);
1228 case Intrinsic::amdgcn_wqm:
1229 return constrainCopyLikeIntrin(I, AMDGPU::WQM);
1230 case Intrinsic::amdgcn_softwqm:
1231 return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
// amdgcn_wwm shares the STRICT_WWM selection with amdgcn_strict_wwm.
1232 case Intrinsic::amdgcn_strict_wwm:
1233 case Intrinsic::amdgcn_wwm:
1234 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
1235 case Intrinsic::amdgcn_strict_wqm:
1236 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
1237 case Intrinsic::amdgcn_writelane:
1238 return selectWritelane(I);
1239 case Intrinsic::amdgcn_div_scale:
1240 return selectDivScale(I);
// Try the generated patterns first; fall back to manual selection.
1241 case Intrinsic::amdgcn_icmp:
1242 case Intrinsic::amdgcn_fcmp:
1243 if (selectImpl(I, *CoverageInfo))
1244 return true;
1245 return selectIntrinsicCmp(I);
1246 case Intrinsic::amdgcn_ballot:
1247 return selectBallot(I);
1248 case Intrinsic::amdgcn_reloc_constant:
1249 return selectRelocConstant(I);
1250 case Intrinsic::amdgcn_groupstaticsize:
1251 return selectGroupStaticSize(I);
1252 case Intrinsic::returnaddress:
1253 return selectReturnAddress(I);
// Every smfmac variant goes through the same manual selector.
1254 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
1255 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
1256 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
1257 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
1258 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
1259 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
1260 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
1261 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
1262 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
1263 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
1264 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
1265 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
1266 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
1267 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
1268 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
1269 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
1270 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
1271 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
1272 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
1273 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
1274 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
1275 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
1276 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
1277 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
1278 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
1279 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
1280 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
1281 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
1282 return selectSMFMACIntrin(I);
1283 case Intrinsic::amdgcn_permlane16_swap:
1284 case Intrinsic::amdgcn_permlane32_swap:
1285 return selectPermlaneSwapIntrin(I, IntrinsicID);
1286 case Intrinsic::amdgcn_wave_shuffle:
1287 return selectWaveShuffleIntrin(I);
// The remaining explicit cases are feature-gated: selection fails when the
// subtarget query returns false, otherwise the generated patterns are used.
// NOTE(review): the listing numbers embedded in this text skip one line
// inside each of these branches (e.g. 1289 -> 1291), so a statement --
// presumably a diagnostic for the unsupported intrinsic -- appears to have
// been lost in extraction. Confirm against the upstream file.
1288 case Intrinsic::amdgcn_fma_legacy:
1289 if (!STI.hasFmaLegacy32Insts()) {
1291 return false;
1292 }
1293 return selectImpl(I, *CoverageInfo);
1294 case Intrinsic::amdgcn_sudot4:
1295 case Intrinsic::amdgcn_sudot8:
1296 if (!STI.hasDot8Insts()) {
1298 return false;
1299 }
1300 return selectImpl(I, *CoverageInfo);
1301 case Intrinsic::amdgcn_permlane16:
1302 case Intrinsic::amdgcn_permlanex16:
1303 if (!STI.hasPermlane16Insts()) {
1305 return false;
1306 }
1307 return selectImpl(I, *CoverageInfo);
1308 case Intrinsic::amdgcn_mov_dpp8:
1309 if (!STI.hasDPP8()) {
1311 return false;
1312 }
1313 return selectImpl(I, *CoverageInfo);
1314 case Intrinsic::amdgcn_tanh:
1315 if (!STI.hasTanhInsts()) {
1317 return false;
1318 }
1319 return selectImpl(I, *CoverageInfo);
1320 default:
1321 return selectImpl(I, *CoverageInfo);
1322 }
1323}
1324
1326 const GCNSubtarget &ST) {
1327 if (Size != 16 && Size != 32 && Size != 64)
1328 return -1;
1329
1330 if (Size == 16 && !ST.has16BitInsts())
1331 return -1;
1332
1333 const auto Select = [&](unsigned S16Opc, unsigned TrueS16Opc,
1334 unsigned FakeS16Opc, unsigned S32Opc,
1335 unsigned S64Opc) {
1336 if (Size == 16)
1337 return ST.hasTrue16BitInsts()
1338 ? ST.useRealTrue16Insts() ? TrueS16Opc : FakeS16Opc
1339 : S16Opc;
1340 if (Size == 32)
1341 return S32Opc;
1342 return S64Opc;
1343 };
1344
1345 switch (P) {
1346 default:
1347 llvm_unreachable("Unknown condition code!");
1348 case CmpInst::ICMP_NE:
1349 return Select(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64,
1350 AMDGPU::V_CMP_NE_U16_fake16_e64, AMDGPU::V_CMP_NE_U32_e64,
1351 AMDGPU::V_CMP_NE_U64_e64);
1352 case CmpInst::ICMP_EQ:
1353 return Select(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64,
1354 AMDGPU::V_CMP_EQ_U16_fake16_e64, AMDGPU::V_CMP_EQ_U32_e64,
1355 AMDGPU::V_CMP_EQ_U64_e64);
1356 case CmpInst::ICMP_SGT:
1357 return Select(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64,
1358 AMDGPU::V_CMP_GT_I16_fake16_e64, AMDGPU::V_CMP_GT_I32_e64,
1359 AMDGPU::V_CMP_GT_I64_e64);
1360 case CmpInst::ICMP_SGE:
1361 return Select(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64,
1362 AMDGPU::V_CMP_GE_I16_fake16_e64, AMDGPU::V_CMP_GE_I32_e64,
1363 AMDGPU::V_CMP_GE_I64_e64);
1364 case CmpInst::ICMP_SLT:
1365 return Select(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64,
1366 AMDGPU::V_CMP_LT_I16_fake16_e64, AMDGPU::V_CMP_LT_I32_e64,
1367 AMDGPU::V_CMP_LT_I64_e64);
1368 case CmpInst::ICMP_SLE:
1369 return Select(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64,
1370 AMDGPU::V_CMP_LE_I16_fake16_e64, AMDGPU::V_CMP_LE_I32_e64,
1371 AMDGPU::V_CMP_LE_I64_e64);
1372 case CmpInst::ICMP_UGT:
1373 return Select(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64,
1374 AMDGPU::V_CMP_GT_U16_fake16_e64, AMDGPU::V_CMP_GT_U32_e64,
1375 AMDGPU::V_CMP_GT_U64_e64);
1376 case CmpInst::ICMP_UGE:
1377 return Select(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64,
1378 AMDGPU::V_CMP_GE_U16_fake16_e64, AMDGPU::V_CMP_GE_U32_e64,
1379 AMDGPU::V_CMP_GE_U64_e64);
1380 case CmpInst::ICMP_ULT:
1381 return Select(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64,
1382 AMDGPU::V_CMP_LT_U16_fake16_e64, AMDGPU::V_CMP_LT_U32_e64,
1383 AMDGPU::V_CMP_LT_U64_e64);
1384 case CmpInst::ICMP_ULE:
1385 return Select(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64,
1386 AMDGPU::V_CMP_LE_U16_fake16_e64, AMDGPU::V_CMP_LE_U32_e64,
1387 AMDGPU::V_CMP_LE_U64_e64);
1388
1389 case CmpInst::FCMP_OEQ:
1390 return Select(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64,
1391 AMDGPU::V_CMP_EQ_F16_fake16_e64, AMDGPU::V_CMP_EQ_F32_e64,
1392 AMDGPU::V_CMP_EQ_F64_e64);
1393 case CmpInst::FCMP_OGT:
1394 return Select(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64,
1395 AMDGPU::V_CMP_GT_F16_fake16_e64, AMDGPU::V_CMP_GT_F32_e64,
1396 AMDGPU::V_CMP_GT_F64_e64);
1397 case CmpInst::FCMP_OGE:
1398 return Select(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64,
1399 AMDGPU::V_CMP_GE_F16_fake16_e64, AMDGPU::V_CMP_GE_F32_e64,
1400 AMDGPU::V_CMP_GE_F64_e64);
1401 case CmpInst::FCMP_OLT:
1402 return Select(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64,
1403 AMDGPU::V_CMP_LT_F16_fake16_e64, AMDGPU::V_CMP_LT_F32_e64,
1404 AMDGPU::V_CMP_LT_F64_e64);
1405 case CmpInst::FCMP_OLE:
1406 return Select(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64,
1407 AMDGPU::V_CMP_LE_F16_fake16_e64, AMDGPU::V_CMP_LE_F32_e64,
1408 AMDGPU::V_CMP_LE_F64_e64);
1409 case CmpInst::FCMP_ONE:
1410 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1411 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1412 AMDGPU::V_CMP_NEQ_F64_e64);
1413 case CmpInst::FCMP_ORD:
1414 return Select(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64,
1415 AMDGPU::V_CMP_O_F16_fake16_e64, AMDGPU::V_CMP_O_F32_e64,
1416 AMDGPU::V_CMP_O_F64_e64);
1417 case CmpInst::FCMP_UNO:
1418 return Select(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64,
1419 AMDGPU::V_CMP_U_F16_fake16_e64, AMDGPU::V_CMP_U_F32_e64,
1420 AMDGPU::V_CMP_U_F64_e64);
1421 case CmpInst::FCMP_UEQ:
1422 return Select(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64,
1423 AMDGPU::V_CMP_NLG_F16_fake16_e64, AMDGPU::V_CMP_NLG_F32_e64,
1424 AMDGPU::V_CMP_NLG_F64_e64);
1425 case CmpInst::FCMP_UGT:
1426 return Select(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64,
1427 AMDGPU::V_CMP_NLE_F16_fake16_e64, AMDGPU::V_CMP_NLE_F32_e64,
1428 AMDGPU::V_CMP_NLE_F64_e64);
1429 case CmpInst::FCMP_UGE:
1430 return Select(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64,
1431 AMDGPU::V_CMP_NLT_F16_fake16_e64, AMDGPU::V_CMP_NLT_F32_e64,
1432 AMDGPU::V_CMP_NLT_F64_e64);
1433 case CmpInst::FCMP_ULT:
1434 return Select(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64,
1435 AMDGPU::V_CMP_NGE_F16_fake16_e64, AMDGPU::V_CMP_NGE_F32_e64,
1436 AMDGPU::V_CMP_NGE_F64_e64);
1437 case CmpInst::FCMP_ULE:
1438 return Select(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64,
1439 AMDGPU::V_CMP_NGT_F16_fake16_e64, AMDGPU::V_CMP_NGT_F32_e64,
1440 AMDGPU::V_CMP_NGT_F64_e64);
1441 case CmpInst::FCMP_UNE:
1442 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1443 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1444 AMDGPU::V_CMP_NEQ_F64_e64);
1445 case CmpInst::FCMP_TRUE:
1446 return Select(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64,
1447 AMDGPU::V_CMP_TRU_F16_fake16_e64, AMDGPU::V_CMP_TRU_F32_e64,
1448 AMDGPU::V_CMP_TRU_F64_e64);
1450 return Select(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64,
1451 AMDGPU::V_CMP_F_F16_fake16_e64, AMDGPU::V_CMP_F_F32_e64,
1452 AMDGPU::V_CMP_F_F64_e64);
1453 }
1454}
1455
1456int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
1457 unsigned Size) const {
1458 if (Size == 64) {
1459 if (!STI.hasScalarCompareEq64())
1460 return -1;
1461
1462 switch (P) {
1463 case CmpInst::ICMP_NE:
1464 return AMDGPU::S_CMP_LG_U64;
1465 case CmpInst::ICMP_EQ:
1466 return AMDGPU::S_CMP_EQ_U64;
1467 default:
1468 return -1;
1469 }
1470 }
1471
1472 if (Size == 32) {
1473 switch (P) {
1474 case CmpInst::ICMP_NE:
1475 return AMDGPU::S_CMP_LG_U32;
1476 case CmpInst::ICMP_EQ:
1477 return AMDGPU::S_CMP_EQ_U32;
1478 case CmpInst::ICMP_SGT:
1479 return AMDGPU::S_CMP_GT_I32;
1480 case CmpInst::ICMP_SGE:
1481 return AMDGPU::S_CMP_GE_I32;
1482 case CmpInst::ICMP_SLT:
1483 return AMDGPU::S_CMP_LT_I32;
1484 case CmpInst::ICMP_SLE:
1485 return AMDGPU::S_CMP_LE_I32;
1486 case CmpInst::ICMP_UGT:
1487 return AMDGPU::S_CMP_GT_U32;
1488 case CmpInst::ICMP_UGE:
1489 return AMDGPU::S_CMP_GE_U32;
1490 case CmpInst::ICMP_ULT:
1491 return AMDGPU::S_CMP_LT_U32;
1492 case CmpInst::ICMP_ULE:
1493 return AMDGPU::S_CMP_LE_U32;
1494 case CmpInst::FCMP_OEQ:
1495 return AMDGPU::S_CMP_EQ_F32;
1496 case CmpInst::FCMP_OGT:
1497 return AMDGPU::S_CMP_GT_F32;
1498 case CmpInst::FCMP_OGE:
1499 return AMDGPU::S_CMP_GE_F32;
1500 case CmpInst::FCMP_OLT:
1501 return AMDGPU::S_CMP_LT_F32;
1502 case CmpInst::FCMP_OLE:
1503 return AMDGPU::S_CMP_LE_F32;
1504 case CmpInst::FCMP_ONE:
1505 return AMDGPU::S_CMP_LG_F32;
1506 case CmpInst::FCMP_ORD:
1507 return AMDGPU::S_CMP_O_F32;
1508 case CmpInst::FCMP_UNO:
1509 return AMDGPU::S_CMP_U_F32;
1510 case CmpInst::FCMP_UEQ:
1511 return AMDGPU::S_CMP_NLG_F32;
1512 case CmpInst::FCMP_UGT:
1513 return AMDGPU::S_CMP_NLE_F32;
1514 case CmpInst::FCMP_UGE:
1515 return AMDGPU::S_CMP_NLT_F32;
1516 case CmpInst::FCMP_ULT:
1517 return AMDGPU::S_CMP_NGE_F32;
1518 case CmpInst::FCMP_ULE:
1519 return AMDGPU::S_CMP_NGT_F32;
1520 case CmpInst::FCMP_UNE:
1521 return AMDGPU::S_CMP_NEQ_F32;
1522 default:
1523 llvm_unreachable("Unknown condition code!");
1524 }
1525 }
1526
1527 if (Size == 16) {
1528 if (!STI.hasSALUFloatInsts())
1529 return -1;
1530
1531 switch (P) {
1532 case CmpInst::FCMP_OEQ:
1533 return AMDGPU::S_CMP_EQ_F16;
1534 case CmpInst::FCMP_OGT:
1535 return AMDGPU::S_CMP_GT_F16;
1536 case CmpInst::FCMP_OGE:
1537 return AMDGPU::S_CMP_GE_F16;
1538 case CmpInst::FCMP_OLT:
1539 return AMDGPU::S_CMP_LT_F16;
1540 case CmpInst::FCMP_OLE:
1541 return AMDGPU::S_CMP_LE_F16;
1542 case CmpInst::FCMP_ONE:
1543 return AMDGPU::S_CMP_LG_F16;
1544 case CmpInst::FCMP_ORD:
1545 return AMDGPU::S_CMP_O_F16;
1546 case CmpInst::FCMP_UNO:
1547 return AMDGPU::S_CMP_U_F16;
1548 case CmpInst::FCMP_UEQ:
1549 return AMDGPU::S_CMP_NLG_F16;
1550 case CmpInst::FCMP_UGT:
1551 return AMDGPU::S_CMP_NLE_F16;
1552 case CmpInst::FCMP_UGE:
1553 return AMDGPU::S_CMP_NLT_F16;
1554 case CmpInst::FCMP_ULT:
1555 return AMDGPU::S_CMP_NGE_F16;
1556 case CmpInst::FCMP_ULE:
1557 return AMDGPU::S_CMP_NGT_F16;
1558 case CmpInst::FCMP_UNE:
1559 return AMDGPU::S_CMP_NEQ_F16;
1560 default:
1561 llvm_unreachable("Unknown condition code!");
1562 }
1563 }
1564
1565 return -1;
1566}
1567
1568bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const {
1569
1570 MachineBasicBlock *BB = I.getParent();
1571 const DebugLoc &DL = I.getDebugLoc();
1572
1573 Register SrcReg = I.getOperand(2).getReg();
1574 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1575
1576 auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
1577
1578 Register CCReg = I.getOperand(0).getReg();
1579 if (!isVCC(CCReg, *MRI)) {
1580 int Opcode = getS_CMPOpcode(Pred, Size);
1581 if (Opcode == -1)
1582 return false;
1583 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
1584 .add(I.getOperand(2))
1585 .add(I.getOperand(3));
1586 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
1587 .addReg(AMDGPU::SCC);
1588 constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1589 bool Ret =
1590 RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
1591 I.eraseFromParent();
1592 return Ret;
1593 }
1594
1595 if (I.getOpcode() == AMDGPU::G_FCMP)
1596 return false;
1597
1598 int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1599 if (Opcode == -1)
1600 return false;
1601
1602 MachineInstrBuilder ICmp;
1603 // t16 instructions
1604 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers)) {
1605 ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), I.getOperand(0).getReg())
1606 .addImm(0)
1607 .add(I.getOperand(2))
1608 .addImm(0)
1609 .add(I.getOperand(3))
1610 .addImm(0); // op_sel
1611 } else {
1612 ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), I.getOperand(0).getReg())
1613 .add(I.getOperand(2))
1614 .add(I.getOperand(3));
1615 }
1616
1617 RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
1618 *TRI.getBoolRC(), *MRI);
1619 constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1620 I.eraseFromParent();
1621 return true;
1622}
1623
1624bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
1625 Register Dst = I.getOperand(0).getReg();
1626 if (isVCC(Dst, *MRI))
1627 return false;
1628
1629 LLT DstTy = MRI->getType(Dst);
1630 if (DstTy.getSizeInBits() != STI.getWavefrontSize())
1631 return false;
1632
1633 MachineBasicBlock *BB = I.getParent();
1634 const DebugLoc &DL = I.getDebugLoc();
1635 Register SrcReg = I.getOperand(2).getReg();
1636 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1637
1638 // i1 inputs are not supported in GlobalISel.
1639 if (Size == 1)
1640 return false;
1641
1642 auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1643 if (!CmpInst::isIntPredicate(Pred) && !CmpInst::isFPPredicate(Pred)) {
1644 BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst);
1645 I.eraseFromParent();
1646 return RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1647 }
1648
1649 const int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1650 if (Opcode == -1)
1651 return false;
1652
1653 MachineInstrBuilder SelectedMI;
1654 MachineOperand &LHS = I.getOperand(2);
1655 MachineOperand &RHS = I.getOperand(3);
1656 auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS.getReg());
1657 auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS.getReg());
1658 Register Src0Reg =
1659 copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
1660 Register Src1Reg =
1661 copyToVGPRIfSrcFolded(Src1, Src1Mods, RHS, &I, /*ForceVGPR*/ true);
1662 SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst);
1663 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers))
1664 SelectedMI.addImm(Src0Mods);
1665 SelectedMI.addReg(Src0Reg);
1666 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src1_modifiers))
1667 SelectedMI.addImm(Src1Mods);
1668 SelectedMI.addReg(Src1Reg);
1669 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::clamp))
1670 SelectedMI.addImm(0); // clamp
1671 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel))
1672 SelectedMI.addImm(0); // op_sel
1673
1674 RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1675 constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI);
1676
1677 I.eraseFromParent();
1678 return true;
1679}
1680
1681// Ballot has to zero bits in input lane-mask that are zero in current exec,
1682// Done as AND with exec. For inputs that are results of instruction that
1683// implicitly use same exec, for example compares in same basic block or SCC to
1684// VCC copy, use copy.
1687 MachineInstr *MI = MRI.getVRegDef(Reg);
1688 if (MI->getParent() != MBB)
1689 return false;
1690
1691 // Lane mask generated by SCC to VCC copy.
1692 if (MI->getOpcode() == AMDGPU::COPY) {
1693 auto DstRB = MRI.getRegBankOrNull(MI->getOperand(0).getReg());
1694 auto SrcRB = MRI.getRegBankOrNull(MI->getOperand(1).getReg());
1695 if (DstRB && SrcRB && DstRB->getID() == AMDGPU::VCCRegBankID &&
1696 SrcRB->getID() == AMDGPU::SGPRRegBankID)
1697 return true;
1698 }
1699
1700 // Lane mask generated using compare with same exec.
1701 if (isa<GAnyCmp>(MI))
1702 return true;
1703
1704 Register LHS, RHS;
1705 // Look through AND.
1706 if (mi_match(Reg, MRI, m_GAnd(m_Reg(LHS), m_Reg(RHS))))
1707 return isLaneMaskFromSameBlock(LHS, MRI, MBB) ||
1709
1710 return false;
1711}
1712
1713bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1714 MachineBasicBlock *BB = I.getParent();
1715 const DebugLoc &DL = I.getDebugLoc();
1716 Register DstReg = I.getOperand(0).getReg();
1717 Register SrcReg = I.getOperand(2).getReg();
1718 const unsigned BallotSize = MRI->getType(DstReg).getSizeInBits();
1719 const unsigned WaveSize = STI.getWavefrontSize();
1720
1721 // In the common case, the return type matches the wave size.
1722 // However we also support emitting i64 ballots in wave32 mode.
1723 if (BallotSize != WaveSize && (BallotSize != 64 || WaveSize != 32))
1724 return false;
1725
1726 std::optional<ValueAndVReg> Arg =
1728
1729 Register Dst = DstReg;
1730 // i64 ballot on Wave32: new Dst(i32) for WaveSize ballot.
1731 if (BallotSize != WaveSize) {
1732 Dst = MRI->createVirtualRegister(TRI.getBoolRC());
1733 }
1734
1735 if (Arg) {
1736 const int64_t Value = Arg->Value.getZExtValue();
1737 if (Value == 0) {
1738 // Dst = S_MOV 0
1739 unsigned Opcode = WaveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1740 BuildMI(*BB, &I, DL, TII.get(Opcode), Dst).addImm(0);
1741 } else {
1742 // Dst = COPY EXEC
1743 assert(Value == 1);
1744 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(TRI.getExec());
1745 }
1746 if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1747 return false;
1748 } else {
1749 if (isLaneMaskFromSameBlock(SrcReg, *MRI, BB)) {
1750 // Dst = COPY SrcReg
1751 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(SrcReg);
1752 if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1753 return false;
1754 } else {
1755 // Dst = S_AND SrcReg, EXEC
1756 unsigned AndOpc = WaveSize == 64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
1757 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), Dst)
1758 .addReg(SrcReg)
1759 .addReg(TRI.getExec())
1760 .setOperandDead(3); // Dead scc
1761 constrainSelectedInstRegOperands(*And, TII, TRI, RBI);
1762 }
1763 }
1764
1765 // i64 ballot on Wave32: zero-extend i32 ballot to i64.
1766 if (BallotSize != WaveSize) {
1767 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1768 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
1769 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1770 .addReg(Dst)
1771 .addImm(AMDGPU::sub0)
1772 .addReg(HiReg)
1773 .addImm(AMDGPU::sub1);
1774 }
1775
1776 I.eraseFromParent();
1777 return true;
1778}
1779
1780bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1781 Register DstReg = I.getOperand(0).getReg();
1782 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1783 const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank);
1784 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1785 return false;
1786
1787 const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1788
1789 Module *M = MF->getFunction().getParent();
1790 const MDNode *Metadata = I.getOperand(2).getMetadata();
1791 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1792 auto *RelocSymbol = cast<GlobalVariable>(
1793 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1794
1795 MachineBasicBlock *BB = I.getParent();
1796 BuildMI(*BB, &I, I.getDebugLoc(),
1797 TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1799
1800 I.eraseFromParent();
1801 return true;
1802}
1803
1804bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1805 Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1806
1807 Register DstReg = I.getOperand(0).getReg();
1808 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1809 unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1810 AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1811
1812 MachineBasicBlock *MBB = I.getParent();
1813 const DebugLoc &DL = I.getDebugLoc();
1814
1815 auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);
1816
1817 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1818 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1819 MIB.addImm(MFI->getLDSSize());
1820 } else {
1821 Module *M = MF->getFunction().getParent();
1822 const GlobalValue *GV =
1823 Intrinsic::getOrInsertDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
1825 }
1826
1827 I.eraseFromParent();
1828 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1829 return true;
1830}
1831
1832bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1833 MachineBasicBlock *MBB = I.getParent();
1834 MachineFunction &MF = *MBB->getParent();
1835 const DebugLoc &DL = I.getDebugLoc();
1836
1837 MachineOperand &Dst = I.getOperand(0);
1838 Register DstReg = Dst.getReg();
1839 unsigned Depth = I.getOperand(2).getImm();
1840
1841 const TargetRegisterClass *RC
1842 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1843 if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1844 !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1845 return false;
1846
1847 // Check for kernel and shader functions
1848 if (Depth != 0 ||
1849 MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1850 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1851 .addImm(0);
1852 I.eraseFromParent();
1853 return true;
1854 }
1855
1856 MachineFrameInfo &MFI = MF.getFrameInfo();
1857 // There is a call to @llvm.returnaddress in this function
1858 MFI.setReturnAddressIsTaken(true);
1859
1860 // Get the return address reg and mark it as an implicit live-in
1861 Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1862 Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1863 AMDGPU::SReg_64RegClass, DL);
1864 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1865 .addReg(LiveIn);
1866 I.eraseFromParent();
1867 return true;
1868}
1869
1870bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1871 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1872 // SelectionDAG uses for wave32 vs wave64.
1873 MachineBasicBlock *BB = MI.getParent();
1874 BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1875 .add(MI.getOperand(1));
1876
1877 Register Reg = MI.getOperand(1).getReg();
1878 MI.eraseFromParent();
1879
1880 if (!MRI->getRegClassOrNull(Reg))
1881 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1882 return true;
1883}
1884
1885bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1886 MachineInstr &MI, Intrinsic::ID IntrID) const {
1887 MachineBasicBlock *MBB = MI.getParent();
1888 MachineFunction *MF = MBB->getParent();
1889 const DebugLoc &DL = MI.getDebugLoc();
1890
1891 unsigned IndexOperand = MI.getOperand(7).getImm();
1892 bool WaveRelease = MI.getOperand(8).getImm() != 0;
1893 bool WaveDone = MI.getOperand(9).getImm() != 0;
1894
1895 if (WaveDone && !WaveRelease) {
1896 // TODO: Move this to IR verifier
1897 const Function &Fn = MF->getFunction();
1898 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1899 Fn, "ds_ordered_count: wave_done requires wave_release", DL));
1900 }
1901
1902 unsigned OrderedCountIndex = IndexOperand & 0x3f;
1903 IndexOperand &= ~0x3f;
1904 unsigned CountDw = 0;
1905
1906 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1907 CountDw = (IndexOperand >> 24) & 0xf;
1908 IndexOperand &= ~(0xf << 24);
1909
1910 if (CountDw < 1 || CountDw > 4) {
1911 const Function &Fn = MF->getFunction();
1912 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1913 Fn, "ds_ordered_count: dword count must be between 1 and 4", DL));
1914 CountDw = 1;
1915 }
1916 }
1917
1918 if (IndexOperand) {
1919 const Function &Fn = MF->getFunction();
1920 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1921 Fn, "ds_ordered_count: bad index operand", DL));
1922 }
1923
1924 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1925 unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1926
1927 unsigned Offset0 = OrderedCountIndex << 2;
1928 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
1929
1930 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1931 Offset1 |= (CountDw - 1) << 6;
1932
1933 if (STI.getGeneration() < AMDGPUSubtarget::GFX11)
1934 Offset1 |= ShaderType << 2;
1935
1936 unsigned Offset = Offset0 | (Offset1 << 8);
1937
1938 Register M0Val = MI.getOperand(2).getReg();
1939 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1940 .addReg(M0Val);
1941
1942 Register DstReg = MI.getOperand(0).getReg();
1943 Register ValReg = MI.getOperand(3).getReg();
1944 MachineInstrBuilder DS =
1945 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1946 .addReg(ValReg)
1947 .addImm(Offset)
1948 .cloneMemRefs(MI);
1949
1950 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1951 return false;
1952
1953 constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1954 MI.eraseFromParent();
1955 return true;
1956}
1957
1958static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1959 switch (IntrID) {
1960 case Intrinsic::amdgcn_ds_gws_init:
1961 return AMDGPU::DS_GWS_INIT;
1962 case Intrinsic::amdgcn_ds_gws_barrier:
1963 return AMDGPU::DS_GWS_BARRIER;
1964 case Intrinsic::amdgcn_ds_gws_sema_v:
1965 return AMDGPU::DS_GWS_SEMA_V;
1966 case Intrinsic::amdgcn_ds_gws_sema_br:
1967 return AMDGPU::DS_GWS_SEMA_BR;
1968 case Intrinsic::amdgcn_ds_gws_sema_p:
1969 return AMDGPU::DS_GWS_SEMA_P;
1970 case Intrinsic::amdgcn_ds_gws_sema_release_all:
1971 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1972 default:
1973 llvm_unreachable("not a gws intrinsic");
1974 }
1975}
1976
1977bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1978 Intrinsic::ID IID) const {
1979 if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1980 !STI.hasGWSSemaReleaseAll()))
1981 return false;
1982
1983 // intrinsic ID, vsrc, offset
1984 const bool HasVSrc = MI.getNumOperands() == 3;
1985 assert(HasVSrc || MI.getNumOperands() == 2);
1986
1987 Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1988 const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1989 if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1990 return false;
1991
1992 MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1993 unsigned ImmOffset;
1994
1995 MachineBasicBlock *MBB = MI.getParent();
1996 const DebugLoc &DL = MI.getDebugLoc();
1997
1998 MachineInstr *Readfirstlane = nullptr;
1999
2000 // If we legalized the VGPR input, strip out the readfirstlane to analyze the
2001 // incoming offset, in case there's an add of a constant. We'll have to put it
2002 // back later.
2003 if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
2004 Readfirstlane = OffsetDef;
2005 BaseOffset = OffsetDef->getOperand(1).getReg();
2006 OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
2007 }
2008
2009 if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
2010 // If we have a constant offset, try to use the 0 in m0 as the base.
2011 // TODO: Look into changing the default m0 initialization value. If the
2012 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
2013 // the immediate offset.
2014
2015 ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
2016 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2017 .addImm(0);
2018 } else {
2019 std::tie(BaseOffset, ImmOffset) =
2020 AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, VT);
2021
2022 if (Readfirstlane) {
2023 // We have the constant offset now, so put the readfirstlane back on the
2024 // variable component.
2025 if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
2026 return false;
2027
2028 Readfirstlane->getOperand(1).setReg(BaseOffset);
2029 BaseOffset = Readfirstlane->getOperand(0).getReg();
2030 } else {
2031 if (!RBI.constrainGenericRegister(BaseOffset,
2032 AMDGPU::SReg_32RegClass, *MRI))
2033 return false;
2034 }
2035
2036 Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2037 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
2038 .addReg(BaseOffset)
2039 .addImm(16)
2040 .setOperandDead(3); // Dead scc
2041
2042 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2043 .addReg(M0Base);
2044 }
2045
2046 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
2047 // offset field) % 64. Some versions of the programming guide omit the m0
2048 // part, or claim it's from offset 0.
2049
2050 unsigned Opc = gwsIntrinToOpcode(IID);
2051 const MCInstrDesc &InstrDesc = TII.get(Opc);
2052
2053 if (HasVSrc) {
2054 Register VSrc = MI.getOperand(1).getReg();
2055
2056 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
2057 const TargetRegisterClass *DataRC = TII.getRegClass(InstrDesc, Data0Idx);
2058 const TargetRegisterClass *SubRC =
2059 TRI.getSubRegisterClass(DataRC, AMDGPU::sub0);
2060
2061 if (!SubRC) {
2062 // 32-bit normal case.
2063 if (!RBI.constrainGenericRegister(VSrc, *DataRC, *MRI))
2064 return false;
2065
2066 BuildMI(*MBB, &MI, DL, InstrDesc)
2067 .addReg(VSrc)
2068 .addImm(ImmOffset)
2069 .cloneMemRefs(MI);
2070 } else {
2071 // Requires even register alignment, so create 64-bit value and pad the
2072 // top half with undef.
2073 Register DataReg = MRI->createVirtualRegister(DataRC);
2074 if (!RBI.constrainGenericRegister(VSrc, *SubRC, *MRI))
2075 return false;
2076
2077 Register UndefReg = MRI->createVirtualRegister(SubRC);
2078 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2079 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), DataReg)
2080 .addReg(VSrc)
2081 .addImm(AMDGPU::sub0)
2082 .addReg(UndefReg)
2083 .addImm(AMDGPU::sub1);
2084
2085 BuildMI(*MBB, &MI, DL, InstrDesc)
2086 .addReg(DataReg)
2087 .addImm(ImmOffset)
2088 .cloneMemRefs(MI);
2089 }
2090 } else {
2091 BuildMI(*MBB, &MI, DL, InstrDesc)
2092 .addImm(ImmOffset)
2093 .cloneMemRefs(MI);
2094 }
2095
2096 MI.eraseFromParent();
2097 return true;
2098}
2099
2100bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
2101 bool IsAppend) const {
2102 Register PtrBase = MI.getOperand(2).getReg();
2103 LLT PtrTy = MRI->getType(PtrBase);
2104 bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
2105
2106 unsigned Offset;
2107 std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
2108
2109 // TODO: Should this try to look through readfirstlane like GWS?
2110 if (!isDSOffsetLegal(PtrBase, Offset)) {
2111 PtrBase = MI.getOperand(2).getReg();
2112 Offset = 0;
2113 }
2114
2115 MachineBasicBlock *MBB = MI.getParent();
2116 const DebugLoc &DL = MI.getDebugLoc();
2117 const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
2118
2119 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2120 .addReg(PtrBase);
2121 if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
2122 return false;
2123
2124 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
2125 .addImm(Offset)
2126 .addImm(IsGDS ? -1 : 0)
2127 .cloneMemRefs(MI);
2128 MI.eraseFromParent();
2129 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2130 return true;
2131}
2132
2133bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
2134 MachineFunction *MF = MI.getMF();
2135 SIMachineFunctionInfo *MFInfo = MF->getInfo<SIMachineFunctionInfo>();
2136
2137 MFInfo->setInitWholeWave();
2138 return selectImpl(MI, *CoverageInfo);
2139}
2140
/// Decode a texfailctrl immediate.
///
/// Bit 0 is reported through \p TFE and bit 1 through \p LWE. \p IsTexFail is
/// set to true when \p TexFailCtrl is non-zero and is left untouched
/// otherwise.
///
/// \returns true if no bits other than bit 0 and bit 1 are set.
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
                         bool &IsTexFail) {
  constexpr uint64_t TFEBit = 0x1;
  constexpr uint64_t LWEBit = 0x2;

  if (TexFailCtrl != 0)
    IsTexFail = true;

  TFE = (TexFailCtrl & TFEBit) != 0;
  LWE = (TexFailCtrl & LWEBit) != 0;

  const uint64_t UnknownBits = TexFailCtrl & ~(TFEBit | LWEBit);
  return UnknownBits == 0;
}
2153
2154bool AMDGPUInstructionSelector::selectImageIntrinsic(
2155 MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
2156 MachineBasicBlock *MBB = MI.getParent();
2157 const DebugLoc &DL = MI.getDebugLoc();
2158 unsigned IntrOpcode = Intr->BaseOpcode;
2159
2160 // For image atomic: use no-return opcode if result is unused.
2161 if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode) {
2162 Register ResultDef = MI.getOperand(0).getReg();
2163 if (MRI->use_nodbg_empty(ResultDef))
2164 IntrOpcode = Intr->AtomicNoRetBaseOpcode;
2165 }
2166
2167 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
2169
2170 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
2171 const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
2172 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
2173 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
2174 const bool IsGFX13Plus = AMDGPU::isGFX13Plus(STI);
2175
2176 const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
2177
2178 Register VDataIn = AMDGPU::NoRegister;
2179 Register VDataOut = AMDGPU::NoRegister;
2180 LLT VDataTy;
2181 int NumVDataDwords = -1;
2182 bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
2183 MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;
2184
2185 bool Unorm;
2186 if (!BaseOpcode->Sampler)
2187 Unorm = true;
2188 else
2189 Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;
2190
2191 bool TFE;
2192 bool LWE;
2193 bool IsTexFail = false;
2194 if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
2195 TFE, LWE, IsTexFail))
2196 return false;
2197
2198 const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
2199 const bool IsA16 = (Flags & 1) != 0;
2200 const bool IsG16 = (Flags & 2) != 0;
2201
2202 // A16 implies 16 bit gradients if subtarget doesn't support G16
2203 if (IsA16 && !STI.hasG16() && !IsG16)
2204 return false;
2205
2206 unsigned DMask = 0;
2207 unsigned DMaskLanes = 0;
2208
2209 if (BaseOpcode->Atomic) {
2210 if (!BaseOpcode->NoReturn)
2211 VDataOut = MI.getOperand(0).getReg();
2212 VDataIn = MI.getOperand(2).getReg();
2213 LLT Ty = MRI->getType(VDataIn);
2214
2215 // Be careful to allow atomic swap on 16-bit element vectors.
2216 const bool Is64Bit = BaseOpcode->AtomicX2 ?
2217 Ty.getSizeInBits() == 128 :
2218 Ty.getSizeInBits() == 64;
2219
2220 if (BaseOpcode->AtomicX2) {
2221 assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
2222
2223 DMask = Is64Bit ? 0xf : 0x3;
2224 NumVDataDwords = Is64Bit ? 4 : 2;
2225 } else {
2226 DMask = Is64Bit ? 0x3 : 0x1;
2227 NumVDataDwords = Is64Bit ? 2 : 1;
2228 }
2229 } else {
2230 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
2231 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
2232
2233 if (BaseOpcode->Store) {
2234 VDataIn = MI.getOperand(1).getReg();
2235 VDataTy = MRI->getType(VDataIn);
2236 NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
2237 } else if (BaseOpcode->NoReturn) {
2238 NumVDataDwords = 0;
2239 } else {
2240 VDataOut = MI.getOperand(0).getReg();
2241 VDataTy = MRI->getType(VDataOut);
2242 NumVDataDwords = DMaskLanes;
2243
2244 if (IsD16 && !STI.hasUnpackedD16VMem())
2245 NumVDataDwords = (DMaskLanes + 1) / 2;
2246 }
2247 }
2248
2249 // Set G16 opcode
2250 if (Subtarget->hasG16() && IsG16) {
2251 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
2253 assert(G16MappingInfo);
2254 IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
2255 }
2256
2257 // TODO: Check this in verifier.
2258 assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
2259
2260 unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
2261 // Keep GLC only when the atomic's result is actually used.
2262 if (BaseOpcode->Atomic && !BaseOpcode->NoReturn)
2264 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
2266 return false;
2267
2268 int NumVAddrRegs = 0;
2269 int NumVAddrDwords = 0;
2270 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
2271 // Skip the $noregs and 0s inserted during legalization.
2272 MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
2273 if (!AddrOp.isReg())
2274 continue; // XXX - Break?
2275
2276 Register Addr = AddrOp.getReg();
2277 if (!Addr)
2278 break;
2279
2280 ++NumVAddrRegs;
2281 NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
2282 }
2283
2284 // The legalizer preprocessed the intrinsic arguments. If we aren't using
2285 // NSA, these should have been packed into a single value in the first
2286 // address register
2287 const bool UseNSA =
2288 NumVAddrRegs != 1 &&
2289 (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs
2290 : NumVAddrDwords == NumVAddrRegs);
2291 if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
2292 LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
2293 return false;
2294 }
2295
2296 if (IsTexFail)
2297 ++NumVDataDwords;
2298
2299 int Opcode = -1;
2300 if (IsGFX13Plus) {
2301 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx13,
2302 NumVDataDwords, NumVAddrDwords);
2303 } else if (IsGFX12Plus) {
2304 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
2305 NumVDataDwords, NumVAddrDwords);
2306 } else if (IsGFX11Plus) {
2307 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
2308 UseNSA ? AMDGPU::MIMGEncGfx11NSA
2309 : AMDGPU::MIMGEncGfx11Default,
2310 NumVDataDwords, NumVAddrDwords);
2311 } else if (IsGFX10Plus) {
2312 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
2313 UseNSA ? AMDGPU::MIMGEncGfx10NSA
2314 : AMDGPU::MIMGEncGfx10Default,
2315 NumVDataDwords, NumVAddrDwords);
2316 } else {
2317 if (Subtarget->hasGFX90AInsts()) {
2318 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
2319 NumVDataDwords, NumVAddrDwords);
2320 if (Opcode == -1) {
2321 LLVM_DEBUG(
2322 dbgs()
2323 << "requested image instruction is not supported on this GPU\n");
2324 return false;
2325 }
2326 }
2327 if (Opcode == -1 &&
2328 STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
2329 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
2330 NumVDataDwords, NumVAddrDwords);
2331 if (Opcode == -1)
2332 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
2333 NumVDataDwords, NumVAddrDwords);
2334 }
2335 if (Opcode == -1)
2336 return false;
2337
2338 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
2339 .cloneMemRefs(MI);
2340
2341 if (VDataOut) {
2342 if (BaseOpcode->AtomicX2) {
2343 const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
2344
2345 Register TmpReg = MRI->createVirtualRegister(
2346 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2347 unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
2348
2349 MIB.addDef(TmpReg);
2350 if (!MRI->use_empty(VDataOut)) {
2351 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
2352 .addReg(TmpReg, RegState::Kill, SubReg);
2353 }
2354
2355 } else {
2356 MIB.addDef(VDataOut); // vdata output
2357 }
2358 }
2359
2360 if (VDataIn)
2361 MIB.addReg(VDataIn); // vdata input
2362
2363 for (int I = 0; I != NumVAddrRegs; ++I) {
2364 MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
2365 if (SrcOp.isReg()) {
2366 assert(SrcOp.getReg() != 0);
2367 MIB.addReg(SrcOp.getReg());
2368 }
2369 }
2370
2371 MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
2372 if (BaseOpcode->Sampler)
2373 MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());
2374
2375 MIB.addImm(DMask); // dmask
2376
2377 if (IsGFX10Plus)
2378 MIB.addImm(DimInfo->Encoding);
2379 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::unorm))
2380 MIB.addImm(Unorm);
2381
2382 MIB.addImm(CPol);
2383 MIB.addImm(IsA16 && // a16 or r128
2384 STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
2385 if (IsGFX10Plus)
2386 MIB.addImm(IsA16 ? -1 : 0);
2387
2388 if (!Subtarget->hasGFX90AInsts()) {
2389 MIB.addImm(TFE); // tfe
2390 } else if (TFE) {
2391 LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n");
2392 return false;
2393 }
2394
2395 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::lwe))
2396 MIB.addImm(LWE); // lwe
2397 if (!IsGFX10Plus)
2398 MIB.addImm(DimInfo->DA ? -1 : 0);
2399 if (BaseOpcode->HasD16)
2400 MIB.addImm(IsD16 ? -1 : 0);
2401
2402 MI.eraseFromParent();
2403 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2404 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr);
2405 return true;
2406}
2407
2408// We need to handle this here because tablegen doesn't support matching
2409// instructions with multiple outputs.
2410bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
2411 MachineInstr &MI) const {
2412 Register Dst0 = MI.getOperand(0).getReg();
2413 Register Dst1 = MI.getOperand(1).getReg();
2414
2415 const DebugLoc &DL = MI.getDebugLoc();
2416 MachineBasicBlock *MBB = MI.getParent();
2417
2418 Register Addr = MI.getOperand(3).getReg();
2419 Register Data0 = MI.getOperand(4).getReg();
2420 Register Data1 = MI.getOperand(5).getReg();
2421 unsigned Offset = MI.getOperand(6).getImm();
2422
2423 unsigned Opc;
2424 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
2425 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2426 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2427 Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2428 break;
2429 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2430 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
2431 break;
2432 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2433 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
2434 break;
2435 }
2436
2437 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
2438 .addDef(Dst1)
2439 .addUse(Addr)
2440 .addUse(Data0)
2441 .addUse(Data1)
2442 .addImm(Offset)
2443 .cloneMemRefs(MI);
2444
2445 MI.eraseFromParent();
2446 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2447 return true;
2448}
2449
2450bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
2451 MachineInstr &I) const {
2452 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
2453 switch (IntrinsicID) {
2454 case Intrinsic::amdgcn_end_cf:
2455 return selectEndCfIntrinsic(I);
2456 case Intrinsic::amdgcn_ds_ordered_add:
2457 case Intrinsic::amdgcn_ds_ordered_swap:
2458 return selectDSOrderedIntrinsic(I, IntrinsicID);
2459 case Intrinsic::amdgcn_ds_gws_init:
2460 case Intrinsic::amdgcn_ds_gws_barrier:
2461 case Intrinsic::amdgcn_ds_gws_sema_v:
2462 case Intrinsic::amdgcn_ds_gws_sema_br:
2463 case Intrinsic::amdgcn_ds_gws_sema_p:
2464 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2465 return selectDSGWSIntrinsic(I, IntrinsicID);
2466 case Intrinsic::amdgcn_ds_append:
2467 return selectDSAppendConsume(I, true);
2468 case Intrinsic::amdgcn_ds_consume:
2469 return selectDSAppendConsume(I, false);
2470 case Intrinsic::amdgcn_init_whole_wave:
2471 return selectInitWholeWave(I);
2472 case Intrinsic::amdgcn_raw_buffer_load_lds:
2473 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
2474 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
2475 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
2476 case Intrinsic::amdgcn_struct_buffer_load_lds:
2477 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
2478 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
2479 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds:
2480 return selectBufferLoadLds(I);
2481 // Until we can store both the address space of the global and the LDS
2482 // arguments by having tto MachineMemOperands on an intrinsic, we just trust
2483 // that the argument is a global pointer (buffer pointers have been handled by
2484 // a LLVM IR-level lowering).
2485 case Intrinsic::amdgcn_load_to_lds:
2486 case Intrinsic::amdgcn_load_async_to_lds:
2487 case Intrinsic::amdgcn_global_load_lds:
2488 case Intrinsic::amdgcn_global_load_async_lds:
2489 return selectGlobalLoadLds(I);
2490 case Intrinsic::amdgcn_tensor_load_to_lds:
2491 case Intrinsic::amdgcn_tensor_store_from_lds:
2492 return selectTensorLoadStore(I, IntrinsicID);
2493 case Intrinsic::amdgcn_asyncmark:
2494 case Intrinsic::amdgcn_wait_asyncmark:
2495 if (!Subtarget->hasAsyncMark())
2496 return false;
2497 break;
2498 case Intrinsic::amdgcn_exp_compr:
2499 if (!STI.hasCompressedExport()) {
2501 return false;
2502 }
2503 break;
2504 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2505 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2506 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2507 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2508 return selectDSBvhStackIntrinsic(I);
2509 case Intrinsic::amdgcn_s_alloc_vgpr: {
2510 // S_ALLOC_VGPR doesn't have a destination register, it just implicitly sets
2511 // SCC. We then need to COPY it into the result vreg.
2512 MachineBasicBlock *MBB = I.getParent();
2513 const DebugLoc &DL = I.getDebugLoc();
2514
2515 Register ResReg = I.getOperand(0).getReg();
2516
2517 MachineInstr *AllocMI = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_ALLOC_VGPR))
2518 .add(I.getOperand(2));
2519 (void)BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), ResReg)
2520 .addReg(AMDGPU::SCC);
2521 I.eraseFromParent();
2522 constrainSelectedInstRegOperands(*AllocMI, TII, TRI, RBI);
2523 return RBI.constrainGenericRegister(ResReg, AMDGPU::SReg_32RegClass, *MRI);
2524 }
2525 case Intrinsic::amdgcn_s_barrier_init:
2526 case Intrinsic::amdgcn_s_barrier_signal_var:
2527 return selectNamedBarrierInit(I, IntrinsicID);
2528 case Intrinsic::amdgcn_s_wakeup_barrier: {
2529 if (!STI.hasSWakeupBarrier()) {
2531 return false;
2532 }
2533 return selectNamedBarrierInst(I, IntrinsicID);
2534 }
2535 case Intrinsic::amdgcn_s_barrier_join:
2536 case Intrinsic::amdgcn_s_get_named_barrier_state:
2537 return selectNamedBarrierInst(I, IntrinsicID);
2538 case Intrinsic::amdgcn_s_get_barrier_state:
2539 return selectSGetBarrierState(I, IntrinsicID);
2540 case Intrinsic::amdgcn_s_barrier_signal_isfirst:
2541 return selectSBarrierSignalIsfirst(I, IntrinsicID);
2542 }
2543 return selectImpl(I, *CoverageInfo);
2544}
2545
2546bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
2547 if (selectImpl(I, *CoverageInfo))
2548 return true;
2549
2550 MachineBasicBlock *BB = I.getParent();
2551 const DebugLoc &DL = I.getDebugLoc();
2552
2553 Register DstReg = I.getOperand(0).getReg();
2554 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
2555 assert(Size <= 32 || Size == 64);
2556 const MachineOperand &CCOp = I.getOperand(1);
2557 Register CCReg = CCOp.getReg();
2558 if (!isVCC(CCReg, *MRI)) {
2559 unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
2560 AMDGPU::S_CSELECT_B32;
2561 MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
2562 .addReg(CCReg);
2563
2564 // The generic constrainSelectedInstRegOperands doesn't work for the scc register
2565 // bank, because it does not cover the register class that we used to represent
2566 // for it. So we need to manually set the register class here.
2567 if (!MRI->getRegClassOrNull(CCReg))
2568 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
2569 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
2570 .add(I.getOperand(2))
2571 .add(I.getOperand(3));
2572
2574 constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
2575 I.eraseFromParent();
2576 return true;
2577 }
2578
2579 // Wide VGPR select should have been split in RegBankSelect.
2580 if (Size > 32)
2581 return false;
2582
2583 MachineInstr *Select =
2584 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2585 .addImm(0)
2586 .add(I.getOperand(3))
2587 .addImm(0)
2588 .add(I.getOperand(2))
2589 .add(I.getOperand(1));
2590
2592 I.eraseFromParent();
2593 return true;
2594}
2595
2596bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
2597 Register DstReg = I.getOperand(0).getReg();
2598 Register SrcReg = I.getOperand(1).getReg();
2599 const LLT DstTy = MRI->getType(DstReg);
2600 const LLT SrcTy = MRI->getType(SrcReg);
2601 const LLT S1 = LLT::scalar(1);
2602
2603 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2604 const RegisterBank *DstRB;
2605 if (DstTy == S1) {
2606 // This is a special case. We don't treat s1 for legalization artifacts as
2607 // vcc booleans.
2608 DstRB = SrcRB;
2609 } else {
2610 DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2611 if (SrcRB != DstRB)
2612 return false;
2613 }
2614
2615 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2616
2617 unsigned DstSize = DstTy.getSizeInBits();
2618 unsigned SrcSize = SrcTy.getSizeInBits();
2619
2620 const TargetRegisterClass *SrcRC =
2621 TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB);
2622 const TargetRegisterClass *DstRC =
2623 TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
2624 if (!SrcRC || !DstRC)
2625 return false;
2626
2627 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2628 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
2629 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
2630 return false;
2631 }
2632
2633 if (DstRC == &AMDGPU::VGPR_16RegClass && SrcSize == 32) {
2634 assert(STI.useRealTrue16Insts());
2635 const DebugLoc &DL = I.getDebugLoc();
2636 MachineBasicBlock *MBB = I.getParent();
2637 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), DstReg)
2638 .addReg(SrcReg, {}, AMDGPU::lo16);
2639 I.eraseFromParent();
2640 return true;
2641 }
2642
2643 if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
2644 MachineBasicBlock *MBB = I.getParent();
2645 const DebugLoc &DL = I.getDebugLoc();
2646
2647 Register LoReg = MRI->createVirtualRegister(DstRC);
2648 Register HiReg = MRI->createVirtualRegister(DstRC);
2649 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
2650 .addReg(SrcReg, {}, AMDGPU::sub0);
2651 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
2652 .addReg(SrcReg, {}, AMDGPU::sub1);
2653
2654 if (IsVALU && STI.hasSDWA()) {
2655 // Write the low 16-bits of the high element into the high 16-bits of the
2656 // low element.
2657 MachineInstr *MovSDWA =
2658 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2659 .addImm(0) // $src0_modifiers
2660 .addReg(HiReg) // $src0
2661 .addImm(0) // $clamp
2662 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
2663 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2664 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
2665 .addReg(LoReg, RegState::Implicit);
2666 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2667 } else {
2668 Register TmpReg0 = MRI->createVirtualRegister(DstRC);
2669 Register TmpReg1 = MRI->createVirtualRegister(DstRC);
2670 Register ImmReg = MRI->createVirtualRegister(DstRC);
2671 if (IsVALU) {
2672 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
2673 .addImm(16)
2674 .addReg(HiReg);
2675 } else {
2676 BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
2677 .addReg(HiReg)
2678 .addImm(16)
2679 .setOperandDead(3); // Dead scc
2680 }
2681
2682 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2683 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2684 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
2685
2686 BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
2687 .addImm(0xffff);
2688 auto And = BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
2689 .addReg(LoReg)
2690 .addReg(ImmReg);
2691 auto Or = BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
2692 .addReg(TmpReg0)
2693 .addReg(TmpReg1);
2694
2695 if (!IsVALU) {
2696 And.setOperandDead(3); // Dead scc
2697 Or.setOperandDead(3); // Dead scc
2698 }
2699 }
2700
2701 I.eraseFromParent();
2702 return true;
2703 }
2704
2705 if (!DstTy.isScalar())
2706 return false;
2707
2708 if (SrcSize > 32) {
2709 unsigned SubRegIdx = DstSize < 32
2710 ? static_cast<unsigned>(AMDGPU::sub0)
2711 : TRI.getSubRegFromChannel(0, DstSize / 32);
2712 if (SubRegIdx == AMDGPU::NoSubRegister)
2713 return false;
2714
2715 // Deal with weird cases where the class only partially supports the subreg
2716 // index.
2717 const TargetRegisterClass *SrcWithSubRC
2718 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
2719 if (!SrcWithSubRC)
2720 return false;
2721
2722 if (SrcWithSubRC != SrcRC) {
2723 if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
2724 return false;
2725 }
2726
2727 I.getOperand(1).setSubReg(SubRegIdx);
2728 }
2729
2730 I.setDesc(TII.get(TargetOpcode::COPY));
2731 return true;
2732}
2733
/// Compute the bitmask covering the low \p Size bits and report whether that
/// mask will be an inline immediate.
///
/// \param Size number of low bits the mask should keep.
/// \param [out] Mask receives the mask with the low \p Size bits set (all
/// 32 bits set when \p Size is 32 or larger).
/// \returns true if \p Mask, reinterpreted as a signed 32-bit integer, lies in
/// the range [-16, 64].
static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
  // Build the mask without ever shifting by the full register width, which
  // would be undefined behaviour for Size == 0 or Size >= 32.
  if (Size == 0)
    Mask = 0;
  else if (Size >= 32)
    Mask = ~0u;
  else
    Mask = ~0u >> (32 - Size);

  const int SignedMask = static_cast<int>(Mask);
  return SignedMask >= -16 && SignedMask <= 64;
}
2740
2741// Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
2742const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
2743 Register Reg, const MachineRegisterInfo &MRI,
2744 const TargetRegisterInfo &TRI) const {
2745 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
2746 if (auto *RB = dyn_cast<const RegisterBank *>(RegClassOrBank))
2747 return RB;
2748
2749 // Ignore the type, since we don't use vcc in artifacts.
2750 if (auto *RC = dyn_cast<const TargetRegisterClass *>(RegClassOrBank))
2751 return &RBI.getRegBankFromRegClass(*RC, LLT());
2752 return nullptr;
2753}
2754
2755bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
2756 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
2757 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
2758 const DebugLoc &DL = I.getDebugLoc();
2759 MachineBasicBlock &MBB = *I.getParent();
2760 const Register DstReg = I.getOperand(0).getReg();
2761 const Register SrcReg = I.getOperand(1).getReg();
2762
2763 const LLT DstTy = MRI->getType(DstReg);
2764 const LLT SrcTy = MRI->getType(SrcReg);
2765 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
2766 I.getOperand(2).getImm() : SrcTy.getSizeInBits();
2767 const unsigned DstSize = DstTy.getSizeInBits();
2768 if (!DstTy.isScalar())
2769 return false;
2770
2771 // Artifact casts should never use vcc.
2772 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
2773
2774 // FIXME: This should probably be illegal and split earlier.
2775 if (I.getOpcode() == AMDGPU::G_ANYEXT) {
2776 if (DstSize <= 32)
2777 return selectCOPY(I);
2778
2779 const TargetRegisterClass *SrcRC =
2780 TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank);
2781 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
2782 const TargetRegisterClass *DstRC =
2783 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
2784
2785 Register UndefReg = MRI->createVirtualRegister(SrcRC);
2786 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2787 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2788 .addReg(SrcReg)
2789 .addImm(AMDGPU::sub0)
2790 .addReg(UndefReg)
2791 .addImm(AMDGPU::sub1);
2792 I.eraseFromParent();
2793
2794 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
2795 RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
2796 }
2797
2798 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2799 // 64-bit should have been split up in RegBankSelect
2800
2801 // Try to use an and with a mask if it will save code size.
2802 unsigned Mask;
2803 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2804 MachineInstr *ExtI =
2805 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
2806 .addImm(Mask)
2807 .addReg(SrcReg);
2808 I.eraseFromParent();
2809 constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2810 return true;
2811 }
2812
2813 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2814 MachineInstr *ExtI =
2815 BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
2816 .addReg(SrcReg)
2817 .addImm(0) // Offset
2818 .addImm(SrcSize); // Width
2819 I.eraseFromParent();
2820 constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2821 return true;
2822 }
2823
2824 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2825 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2826 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2827 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
2828 return false;
2829
2830 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2831 const unsigned SextOpc = SrcSize == 8 ?
2832 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2833 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
2834 .addReg(SrcReg);
2835 I.eraseFromParent();
2836 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2837 }
2838
2839 // Using a single 32-bit SALU to calculate the high half is smaller than
2840 // S_BFE with a literal constant operand.
2841 if (DstSize > 32 && SrcSize == 32) {
2842 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2843 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2844 if (Signed) {
2845 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg)
2846 .addReg(SrcReg, {}, SubReg)
2847 .addImm(31)
2848 .setOperandDead(3); // Dead scc
2849 } else {
2850 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg)
2851 .addImm(0);
2852 }
2853 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2854 .addReg(SrcReg, {}, SubReg)
2855 .addImm(AMDGPU::sub0)
2856 .addReg(HiReg)
2857 .addImm(AMDGPU::sub1);
2858 I.eraseFromParent();
2859 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass,
2860 *MRI);
2861 }
2862
2863 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2864 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2865
2866 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width.
2867 if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2868 // We need a 64-bit register source, but the high bits don't matter.
2869 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2870 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2871 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2872
2873 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2874 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
2875 .addReg(SrcReg, {}, SubReg)
2876 .addImm(AMDGPU::sub0)
2877 .addReg(UndefReg)
2878 .addImm(AMDGPU::sub1);
2879
2880 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
2881 .addReg(ExtReg)
2882 .addImm(SrcSize << 16);
2883
2884 I.eraseFromParent();
2885 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
2886 }
2887
2888 unsigned Mask;
2889 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2890 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
2891 .addReg(SrcReg)
2892 .addImm(Mask)
2893 .setOperandDead(3); // Dead scc
2894 } else {
2895 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
2896 .addReg(SrcReg)
2897 .addImm(SrcSize << 16);
2898 }
2899
2900 I.eraseFromParent();
2901 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2902 }
2903
2904 return false;
2905}
2906
2910
2912 Register BitcastSrc;
2913 if (mi_match(Reg, MRI, m_GBitcast(m_Reg(BitcastSrc))))
2914 Reg = BitcastSrc;
2915 return Reg;
2916}
2917
2919 Register &Out) {
2920 // When unmerging a register that is composed of 2 x 16-bit values allow to
2921 // use an extract hi instruction for the upper 16 bits. We only need to check
2922 // the size of `In` as all defs are guaranteed to be the same type for
2923 // GUnmerge.
2924 if (auto *Unmerge = dyn_cast<GUnmerge>(MRI.getVRegDef(In))) {
2925 if (Unmerge->getNumDefs() == 2 && Unmerge->getOperand(1).getReg() == In &&
2926 MRI.getType(In).getSizeInBits() == 16) {
2927 Out = Unmerge->getSourceReg();
2928 return true;
2929 }
2930 }
2931
2932 Register Trunc;
2933 if (!mi_match(In, MRI, m_GTrunc(m_Reg(Trunc))))
2934 return false;
2935
2936 Register LShlSrc;
2937 Register Cst;
2938 if (mi_match(Trunc, MRI, m_GLShr(m_Reg(LShlSrc), m_Reg(Cst)))) {
2939 Cst = stripCopy(Cst, MRI);
2940 if (mi_match(Cst, MRI, m_SpecificICst(16))) {
2941 Out = stripBitCast(LShlSrc, MRI);
2942 return true;
2943 }
2944 }
2945
2946 MachineInstr *Shuffle = MRI.getVRegDef(Trunc);
2947 if (Shuffle->getOpcode() != AMDGPU::G_SHUFFLE_VECTOR)
2948 return false;
2949
2950 assert(MRI.getType(Shuffle->getOperand(0).getReg()) ==
2951 LLT::fixed_vector(2, 16));
2952
2953 ArrayRef<int> Mask = Shuffle->getOperand(3).getShuffleMask();
2954 assert(Mask.size() == 2);
2955
2956 if (Mask[0] == 1 && Mask[1] <= 1) {
2957 Out = Shuffle->getOperand(0).getReg();
2958 return true;
2959 }
2960
2961 return false;
2962}
2963
2964bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
2965 if (!Subtarget->hasSALUFloatInsts())
2966 return false;
2967
2968 Register Dst = I.getOperand(0).getReg();
2969 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2970 if (DstRB->getID() != AMDGPU::SGPRRegBankID)
2971 return false;
2972
2973 Register Src = I.getOperand(1).getReg();
2974
2975 if (MRI->getType(Dst) == LLT::scalar(32) &&
2976 MRI->getType(Src) == LLT::scalar(16)) {
2977 if (isExtractHiElt(*MRI, Src, Src)) {
2978 MachineBasicBlock *BB = I.getParent();
2979 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst)
2980 .addUse(Src);
2981 I.eraseFromParent();
2982 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
2983 }
2984 }
2985
2986 return false;
2987}
2988
2989bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2990 // Only manually handle the f64 SGPR case.
2991 //
2992 // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2993 // the bit ops theoretically have a second result due to the implicit def of
2994 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2995 // that is easy by disabling the check. The result works, but uses a
2996 // nonsensical sreg32orlds_and_sreg_1 regclass.
2997 //
2998 // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to
2999 // the variadic REG_SEQUENCE operands.
3000
3001 Register Dst = MI.getOperand(0).getReg();
3002 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
3003 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
3004 MRI->getType(Dst) != LLT::scalar(64))
3005 return false;
3006
3007 Register Src = MI.getOperand(1).getReg();
3008 MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
3009 if (Fabs)
3010 Src = Fabs->getOperand(1).getReg();
3011
3012 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
3013 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
3014 return false;
3015
3016 MachineBasicBlock *BB = MI.getParent();
3017 const DebugLoc &DL = MI.getDebugLoc();
3018 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3019 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3020 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3021 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3022
3023 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
3024 .addReg(Src, {}, AMDGPU::sub0);
3025 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
3026 .addReg(Src, {}, AMDGPU::sub1);
3027 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
3028 .addImm(0x80000000);
3029
3030 // Set or toggle sign bit.
3031 unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
3032 BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
3033 .addReg(HiReg)
3034 .addReg(ConstReg)
3035 .setOperandDead(3); // Dead scc
3036 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
3037 .addReg(LoReg)
3038 .addImm(AMDGPU::sub0)
3039 .addReg(OpReg)
3040 .addImm(AMDGPU::sub1);
3041 MI.eraseFromParent();
3042 return true;
3043}
3044
3045// FIXME: This is a workaround for the same tablegen problems as G_FNEG
3046bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
3047 Register Dst = MI.getOperand(0).getReg();
3048 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
3049 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
3050 MRI->getType(Dst) != LLT::scalar(64))
3051 return false;
3052
3053 Register Src = MI.getOperand(1).getReg();
3054 MachineBasicBlock *BB = MI.getParent();
3055 const DebugLoc &DL = MI.getDebugLoc();
3056 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3057 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3058 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3059 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3060
3061 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
3062 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
3063 return false;
3064
3065 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
3066 .addReg(Src, {}, AMDGPU::sub0);
3067 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
3068 .addReg(Src, {}, AMDGPU::sub1);
3069 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
3070 .addImm(0x7fffffff);
3071
3072 // Clear sign bit.
3073 // TODO: Should this used S_BITSET0_*?
3074 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
3075 .addReg(HiReg)
3076 .addReg(ConstReg)
3077 .setOperandDead(3); // Dead scc
3078 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
3079 .addReg(LoReg)
3080 .addImm(AMDGPU::sub0)
3081 .addReg(OpReg)
3082 .addImm(AMDGPU::sub1);
3083
3084 MI.eraseFromParent();
3085 return true;
3086}
3087
3088static bool isConstant(const MachineInstr &MI) {
3089 return MI.getOpcode() == TargetOpcode::G_CONSTANT;
3090}
3091
3092void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
3093 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
3094
3095 unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1;
3096 const MachineInstr *PtrMI =
3097 MRI.getUniqueVRegDef(Load.getOperand(OpNo).getReg());
3098
3099 assert(PtrMI);
3100
3101 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
3102 return;
3103
3104 GEPInfo GEPInfo;
3105
3106 for (unsigned i = 1; i != 3; ++i) {
3107 const MachineOperand &GEPOp = PtrMI->getOperand(i);
3108 const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
3109 assert(OpDef);
3110 if (i == 2 && isConstant(*OpDef)) {
3111 // TODO: Could handle constant base + variable offset, but a combine
3112 // probably should have commuted it.
3113 assert(GEPInfo.Imm == 0);
3114 GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
3115 continue;
3116 }
3117 const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
3118 if (OpBank->getID() == AMDGPU::SGPRRegBankID)
3119 GEPInfo.SgprParts.push_back(GEPOp.getReg());
3120 else
3121 GEPInfo.VgprParts.push_back(GEPOp.getReg());
3122 }
3123
3124 AddrInfo.push_back(GEPInfo);
3125 getAddrModeInfo(*PtrMI, MRI, AddrInfo);
3126}
3127
3128bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
3129 return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
3130}
3131
3132bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
3133 if (!MI.hasOneMemOperand())
3134 return false;
3135
3136 const MachineMemOperand *MMO = *MI.memoperands_begin();
3137 const Value *Ptr = MMO->getValue();
3138
3139 // UndefValue means this is a load of a kernel input. These are uniform.
3140 // Sometimes LDS instructions have constant pointers.
3141 // If Ptr is null, then that means this mem operand contains a
3142 // PseudoSourceValue like GOT.
3144 return true;
3145
3147 return true;
3148
3149 if (MI.getOpcode() == AMDGPU::G_PREFETCH)
3150 return RBI.getRegBank(MI.getOperand(0).getReg(), *MRI, TRI)->getID() ==
3151 AMDGPU::SGPRRegBankID;
3152
3153 const Instruction *I = dyn_cast<Instruction>(Ptr);
3154 return I && I->getMetadata("amdgpu.uniform");
3155}
3156
3157bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
3158 for (const GEPInfo &GEPInfo : AddrInfo) {
3159 if (!GEPInfo.VgprParts.empty())
3160 return true;
3161 }
3162 return false;
3163}
3164
3165void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
3166 const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
3167 unsigned AS = PtrTy.getAddressSpace();
3169 STI.ldsRequiresM0Init()) {
3170 MachineBasicBlock *BB = I.getParent();
3171
3172 // If DS instructions require M0 initialization, insert it before selecting.
3173 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
3174 .addImm(-1);
3175 }
3176}
3177
3178bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
3179 MachineInstr &I) const {
3180 initM0(I);
3181 return selectImpl(I, *CoverageInfo);
3182}
3183
3185 if (Reg.isPhysical())
3186 return false;
3187
3189 const unsigned Opcode = MI.getOpcode();
3190
3191 if (Opcode == AMDGPU::COPY)
3192 return isVCmpResult(MI.getOperand(1).getReg(), MRI);
3193
3194 if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
3195 Opcode == AMDGPU::G_XOR)
3196 return isVCmpResult(MI.getOperand(1).getReg(), MRI) &&
3197 isVCmpResult(MI.getOperand(2).getReg(), MRI);
3198
3199 if (auto *GI = dyn_cast<GIntrinsic>(&MI))
3200 return GI->is(Intrinsic::amdgcn_class);
3201
3202 return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
3203}
3204
3205bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
3206 MachineBasicBlock *BB = I.getParent();
3207 MachineOperand &CondOp = I.getOperand(0);
3208 Register CondReg = CondOp.getReg();
3209 const DebugLoc &DL = I.getDebugLoc();
3210
3211 unsigned BrOpcode;
3212 Register CondPhysReg;
3213 const TargetRegisterClass *ConstrainRC;
3214
3215 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
3216 // whether the branch is uniform when selecting the instruction. In
3217 // GlobalISel, we should push that decision into RegBankSelect. Assume for now
3218 // RegBankSelect knows what it's doing if the branch condition is scc, even
3219 // though it currently does not.
3220 if (!isVCC(CondReg, *MRI)) {
3221 if (MRI->getType(CondReg) != LLT::scalar(32))
3222 return false;
3223
3224 CondPhysReg = AMDGPU::SCC;
3225 BrOpcode = AMDGPU::S_CBRANCH_SCC1;
3226 ConstrainRC = &AMDGPU::SReg_32RegClass;
3227 } else {
3228 // FIXME: Should scc->vcc copies and with exec?
3229
3230 // Unless the value of CondReg is a result of a V_CMP* instruction then we
3231 // need to insert an and with exec.
3232 if (!isVCmpResult(CondReg, *MRI)) {
3233 const bool Is64 = STI.isWave64();
3234 const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
3235 const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
3236
3237 Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC());
3238 BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg)
3239 .addReg(CondReg)
3240 .addReg(Exec)
3241 .setOperandDead(3); // Dead scc
3242 CondReg = TmpReg;
3243 }
3244
3245 CondPhysReg = TRI.getVCC();
3246 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
3247 ConstrainRC = TRI.getBoolRC();
3248 }
3249
3250 if (!MRI->getRegClassOrNull(CondReg))
3251 MRI->setRegClass(CondReg, ConstrainRC);
3252
3253 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
3254 .addReg(CondReg);
3255 BuildMI(*BB, &I, DL, TII.get(BrOpcode))
3256 .addMBB(I.getOperand(1).getMBB());
3257
3258 I.eraseFromParent();
3259 return true;
3260}
3261
3262bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
3263 MachineInstr &I) const {
3264 Register DstReg = I.getOperand(0).getReg();
3265 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3266 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
3267 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
3268 if (IsVGPR)
3269 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
3270
3271 return RBI.constrainGenericRegister(
3272 DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
3273}
3274
3275bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
3276 Register DstReg = I.getOperand(0).getReg();
3277 Register SrcReg = I.getOperand(1).getReg();
3278 Register MaskReg = I.getOperand(2).getReg();
3279 LLT Ty = MRI->getType(DstReg);
3280 LLT MaskTy = MRI->getType(MaskReg);
3281 MachineBasicBlock *BB = I.getParent();
3282 const DebugLoc &DL = I.getDebugLoc();
3283
3284 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3285 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3286 const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
3287 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
3288 if (DstRB != SrcRB) // Should only happen for hand written MIR.
3289 return false;
3290
3291 // Try to avoid emitting a bit operation when we only need to touch half of
3292 // the 64-bit pointer.
3293 APInt MaskOnes = VT->getKnownOnes(MaskReg).zext(64);
3294 const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
3295 const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
3296
3297 const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
3298 const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;
3299
3300 if (!IsVGPR && Ty.getSizeInBits() == 64 &&
3301 !CanCopyLow32 && !CanCopyHi32) {
3302 auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg)
3303 .addReg(SrcReg)
3304 .addReg(MaskReg)
3305 .setOperandDead(3); // Dead scc
3306 I.eraseFromParent();
3307 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3308 return true;
3309 }
3310
3311 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
3312 const TargetRegisterClass &RegRC
3313 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3314
3315 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB);
3316 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB);
3317 const TargetRegisterClass *MaskRC =
3318 TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB);
3319
3320 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3321 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3322 !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
3323 return false;
3324
3325 if (Ty.getSizeInBits() == 32) {
3326 assert(MaskTy.getSizeInBits() == 32 &&
3327 "ptrmask should have been narrowed during legalize");
3328
3329 auto NewOp = BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
3330 .addReg(SrcReg)
3331 .addReg(MaskReg);
3332
3333 if (!IsVGPR)
3334 NewOp.setOperandDead(3); // Dead scc
3335 I.eraseFromParent();
3336 return true;
3337 }
3338
3339 Register HiReg = MRI->createVirtualRegister(&RegRC);
3340 Register LoReg = MRI->createVirtualRegister(&RegRC);
3341
3342 // Extract the subregisters from the source pointer.
3343 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
3344 .addReg(SrcReg, {}, AMDGPU::sub0);
3345 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
3346 .addReg(SrcReg, {}, AMDGPU::sub1);
3347
3348 Register MaskedLo, MaskedHi;
3349
3350 if (CanCopyLow32) {
3351 // If all the bits in the low half are 1, we only need a copy for it.
3352 MaskedLo = LoReg;
3353 } else {
3354 // Extract the mask subregister and apply the and.
3355 Register MaskLo = MRI->createVirtualRegister(&RegRC);
3356 MaskedLo = MRI->createVirtualRegister(&RegRC);
3357
3358 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
3359 .addReg(MaskReg, {}, AMDGPU::sub0);
3360 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
3361 .addReg(LoReg)
3362 .addReg(MaskLo);
3363 }
3364
3365 if (CanCopyHi32) {
3366 // If all the bits in the high half are 1, we only need a copy for it.
3367 MaskedHi = HiReg;
3368 } else {
3369 Register MaskHi = MRI->createVirtualRegister(&RegRC);
3370 MaskedHi = MRI->createVirtualRegister(&RegRC);
3371
3372 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
3373 .addReg(MaskReg, {}, AMDGPU::sub1);
3374 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
3375 .addReg(HiReg)
3376 .addReg(MaskHi);
3377 }
3378
3379 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
3380 .addReg(MaskedLo)
3381 .addImm(AMDGPU::sub0)
3382 .addReg(MaskedHi)
3383 .addImm(AMDGPU::sub1);
3384 I.eraseFromParent();
3385 return true;
3386}
3387
3388/// Return the register to use for the index value, and the subregister to use
3389/// for the indirectly accessed register.
3390static std::pair<Register, unsigned>
3392 const TargetRegisterClass *SuperRC, Register IdxReg,
3393 unsigned EltSize, GISelValueTracking &ValueTracking) {
3394 Register IdxBaseReg;
3395 int Offset;
3396
3397 std::tie(IdxBaseReg, Offset) =
3398 AMDGPU::getBaseWithConstantOffset(MRI, IdxReg, &ValueTracking);
3399 if (IdxBaseReg == AMDGPU::NoRegister) {
3400 // This will happen if the index is a known constant. This should ordinarily
3401 // be legalized out, but handle it as a register just in case.
3402 assert(Offset == 0);
3403 IdxBaseReg = IdxReg;
3404 }
3405
3406 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
3407
3408 // Skip out of bounds offsets, or else we would end up using an undefined
3409 // register.
3410 if (static_cast<unsigned>(Offset) >= SubRegs.size())
3411 return std::pair(IdxReg, SubRegs[0]);
3412 return std::pair(IdxBaseReg, SubRegs[Offset]);
3413}
3414
3415bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
3416 MachineInstr &MI) const {
3417 Register DstReg = MI.getOperand(0).getReg();
3418 Register SrcReg = MI.getOperand(1).getReg();
3419 Register IdxReg = MI.getOperand(2).getReg();
3420
3421 LLT DstTy = MRI->getType(DstReg);
3422 LLT SrcTy = MRI->getType(SrcReg);
3423
3424 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3425 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3426 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3427
3428 // The index must be scalar. If it wasn't RegBankSelect should have moved this
3429 // into a waterfall loop.
3430 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3431 return false;
3432
3433 const TargetRegisterClass *SrcRC =
3434 TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB);
3435 const TargetRegisterClass *DstRC =
3436 TRI.getRegClassForTypeOnBank(DstTy, *DstRB);
3437 if (!SrcRC || !DstRC)
3438 return false;
3439 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3440 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3441 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3442 return false;
3443
3444 MachineBasicBlock *BB = MI.getParent();
3445 const DebugLoc &DL = MI.getDebugLoc();
3446 const bool Is64 = DstTy.getSizeInBits() == 64;
3447
3448 unsigned SubReg;
3449 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(
3450 *MRI, TRI, SrcRC, IdxReg, DstTy.getSizeInBits() / 8, *VT);
3451
3452 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
3453 if (DstTy.getSizeInBits() != 32 && !Is64)
3454 return false;
3455
3456 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3457 .addReg(IdxReg);
3458
3459 unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
3460 BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
3461 .addReg(SrcReg, {}, SubReg)
3462 .addReg(SrcReg, RegState::Implicit);
3463 MI.eraseFromParent();
3464 return true;
3465 }
3466
3467 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
3468 return false;
3469
3470 if (!STI.useVGPRIndexMode()) {
3471 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3472 .addReg(IdxReg);
3473 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
3474 .addReg(SrcReg, {}, SubReg)
3475 .addReg(SrcReg, RegState::Implicit);
3476 MI.eraseFromParent();
3477 return true;
3478 }
3479
3480 const MCInstrDesc &GPRIDXDesc =
3481 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
3482 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3483 .addReg(SrcReg)
3484 .addReg(IdxReg)
3485 .addImm(SubReg);
3486
3487 MI.eraseFromParent();
3488 return true;
3489}
3490
3491// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
3492bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
3493 MachineInstr &MI) const {
3494 Register DstReg = MI.getOperand(0).getReg();
3495 Register VecReg = MI.getOperand(1).getReg();
3496 Register ValReg = MI.getOperand(2).getReg();
3497 Register IdxReg = MI.getOperand(3).getReg();
3498
3499 LLT VecTy = MRI->getType(DstReg);
3500 LLT ValTy = MRI->getType(ValReg);
3501 unsigned VecSize = VecTy.getSizeInBits();
3502 unsigned ValSize = ValTy.getSizeInBits();
3503
3504 const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
3505 const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
3506 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3507
3508 assert(VecTy.getElementType() == ValTy);
3509
3510 // The index must be scalar. If it wasn't RegBankSelect should have moved this
3511 // into a waterfall loop.
3512 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3513 return false;
3514
3515 const TargetRegisterClass *VecRC =
3516 TRI.getRegClassForTypeOnBank(VecTy, *VecRB);
3517 const TargetRegisterClass *ValRC =
3518 TRI.getRegClassForTypeOnBank(ValTy, *ValRB);
3519
3520 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
3521 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
3522 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
3523 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3524 return false;
3525
3526 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
3527 return false;
3528
3529 unsigned SubReg;
3530 std::tie(IdxReg, SubReg) =
3531 computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, ValSize / 8, *VT);
3532
3533 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
3534 STI.useVGPRIndexMode();
3535
3536 MachineBasicBlock *BB = MI.getParent();
3537 const DebugLoc &DL = MI.getDebugLoc();
3538
3539 if (!IndexMode) {
3540 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3541 .addReg(IdxReg);
3542
3543 const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
3544 VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
3545 BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
3546 .addReg(VecReg)
3547 .addReg(ValReg)
3548 .addImm(SubReg);
3549 MI.eraseFromParent();
3550 return true;
3551 }
3552
3553 const MCInstrDesc &GPRIDXDesc =
3554 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3555 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3556 .addReg(VecReg)
3557 .addReg(ValReg)
3558 .addReg(IdxReg)
3559 .addImm(SubReg);
3560
3561 MI.eraseFromParent();
3562 return true;
3563}
3564
3565static bool isAsyncLDSDMA(Intrinsic::ID Intr) {
3566 switch (Intr) {
3567 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
3568 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
3569 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
3570 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds:
3571 case Intrinsic::amdgcn_load_async_to_lds:
3572 case Intrinsic::amdgcn_global_load_async_lds:
3573 return true;
3574 }
3575 return false;
3576}
3577
/// Select a buffer-load-to-LDS intrinsic into one of the BUFFER_LOAD_*_LDS_*
/// machine instructions.
///
/// The opcode is chosen from the transfer size in bytes (operand 3: 1, 2, 4,
/// 12 or 16) and from which of VIndex / VOffset are present. Operand 2 is
/// copied into M0 before the load is emitted.
///
/// \returns false if the subtarget has no VMEM-to-LDS loads, if the size is
/// not one of the handled values, or if a 12/16-byte transfer is requested
/// without hasLDSLoadB96_B128(); true once the instruction has been built and
/// \p MI erased.
3578bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
3579 if (!Subtarget->hasVMemToLDSLoad())
3580 return false;
3581 unsigned Opc;
3582 unsigned Size = MI.getOperand(3).getImm();
3583 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
3584
3585 // The struct intrinsic variants add one additional operand over raw.
     // That extra operand (index 4) is VIndex; every later operand index is
     // shifted by OpOffset.
3586 const bool HasVIndex = MI.getNumOperands() == 9;
3587 Register VIndex;
3588 int OpOffset = 0;
3589 if (HasVIndex) {
3590 VIndex = MI.getOperand(4).getReg();
3591 OpOffset = 1;
3592 }
3593
3594 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3595 std::optional<ValueAndVReg> MaybeVOffset =
     // NOTE(review): the initializer of MaybeVOffset (listing line 3596) is
     // absent from this extract; restore it from upstream before building.
     // VOffset is treated as present unless it is a known constant zero.
3597 const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();
3598
     // Addressing-mode suffix: BOTHEN = VIndex and VOffset, IDXEN = VIndex
     // only, OFFEN = VOffset only, OFFSET = neither.
3599 switch (Size) {
3600 default:
3601 return false;
3602 case 1:
3603 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
3604 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
3605 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
3606 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
3607 break;
3608 case 2:
3609 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
3610 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
3611 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
3612 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
3613 break;
3614 case 4:
3615 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
3616 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
3617 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
3618 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
3619 break;
3620 case 12:
     // 12- and 16-byte transfers are gated on a separate subtarget feature.
3621 if (!Subtarget->hasLDSLoadB96_B128())
3622 return false;
3623
3624 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
3625 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
3626 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
3627 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
3628 break;
3629 case 16:
3630 if (!Subtarget->hasLDSLoadB96_B128())
3631 return false;
3632
3633 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
3634 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
3635 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
3636 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
3637 break;
3638 }
3639
3640 MachineBasicBlock *MBB = MI.getParent();
3641 const DebugLoc &DL = MI.getDebugLoc();
     // Operand 2 is copied into M0 ahead of the load (presumably the LDS
     // destination base; confirm against the intrinsic definition).
3642 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3643 .add(MI.getOperand(2));
3644
3645 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc));
3646
     // With both VIndex and VOffset, pack them into one 64-bit VGPR pair
     // (sub0 = VIndex, sub1 = VOffset), inserted before the load itself.
3647 if (HasVIndex && HasVOffset) {
3648 Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
3649 BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
3650 .addReg(VIndex)
3651 .addImm(AMDGPU::sub0)
3652 .addReg(VOffset)
3653 .addImm(AMDGPU::sub1);
3654
3655 MIB.addReg(IdxReg);
3656 } else if (HasVIndex) {
3657 MIB.addReg(VIndex);
3658 } else if (HasVOffset) {
3659 MIB.addReg(VOffset);
3660 }
3661
3662 MIB.add(MI.getOperand(1)); // rsrc
3663 MIB.add(MI.getOperand(5 + OpOffset)); // soffset
3664 MIB.add(MI.getOperand(6 + OpOffset)); // imm offset
     // The aux immediate carries both the cache-policy bits and the swizzle
     // bit; the masks used to split them differ between GFX12+ and earlier.
3665 bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
3666 unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
3667 MIB.addImm(Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL
3668 : AMDGPU::CPol::ALL_pregfx12)); // cpol
3669 MIB.addImm(
3670 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
3671 ? 1
3672 : 0); // swz
     // Final immediate: 1 if the intrinsic is one of the async LDS-DMA
     // variants, 0 otherwise.
3673 MIB.addImm(isAsyncLDSDMA(IntrinsicID));
3674
3675 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3676 // Don't set the offset value here because the pointer points to the base of
3677 // the buffer.
3678 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3679
3680 MachinePointerInfo StorePtrI = LoadPtrI;
3681 LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(),
     // NOTE(review): listing lines 3682-3684 (the rest of this statement and
     // whatever follows it) are absent from this extract.
3685
3686 auto F = LoadMMO->getFlags() &
     // NOTE(review): the right-hand side of this mask (listing line 3687) is
     // absent from this extract.
     // Split the original memory operand into a load part of Size bytes and a
     // separate store part, and attach both to the new instruction.
3688 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3689 Size, LoadMMO->getBaseAlign());
3690
3691 MachineMemOperand *StoreMMO =
3692 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3693 sizeof(int32_t), LoadMMO->getBaseAlign());
3694
3695 MIB.setMemRefs({LoadMMO, StoreMMO});
3696
     // The intrinsic is erased first; register operands of the replacement
     // are constrained afterwards.
3697 MI.eraseFromParent();
3698 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3699 return true;
3700}
3701
3702/// Match a zero extend from a 32-bit value to 64-bits.
3703Register AMDGPUInstructionSelector::matchZeroExtendFromS32(Register Reg) const {
3704 Register ZExtSrc;
3705 if (mi_match(Reg, *MRI, m_GZExt(m_Reg(ZExtSrc))))
3706 return MRI->getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3707
3708 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3709 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3710 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3711 return Register();
3712
3713 assert(Def->getNumOperands() == 3 &&
3714 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3715 if (mi_match(Def->getOperand(2).getReg(), *MRI, m_ZeroInt())) {
3716 return Def->getOperand(1).getReg();
3717 }
3718
3719 return Register();
3720}
3721
3722/// Match a sign extend from a 32-bit value to 64-bits.
3723Register AMDGPUInstructionSelector::matchSignExtendFromS32(Register Reg) const {
3724 Register SExtSrc;
3725 if (mi_match(Reg, *MRI, m_GSExt(m_Reg(SExtSrc))))
3726 return MRI->getType(SExtSrc) == LLT::scalar(32) ? SExtSrc : Register();
3727
3728 // Match legalized form %sext = G_MERGE_VALUES (s32 %x), G_ASHR((S32 %x, 31))
3729 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3730 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3731 return Register();
3732
3733 assert(Def->getNumOperands() == 3 &&
3734 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3735 if (mi_match(Def->getOperand(2).getReg(), *MRI,
3736 m_GAShr(m_SpecificReg(Def->getOperand(1).getReg()),
3737 m_SpecificICst(31))))
3738 return Def->getOperand(1).getReg();
3739
3740 if (VT->signBitIsZero(Reg))
3741 return matchZeroExtendFromS32(Reg);
3742
3743 return Register();
3744}
3745
3746/// Match a zero extend from a 32-bit value to 64-bits, or \p Reg itself if it
3747/// is 32-bit.
3749AMDGPUInstructionSelector::matchZeroExtendFromS32OrS32(Register Reg) const {
3750 return MRI->getType(Reg) == LLT::scalar(32) ? Reg
3751 : matchZeroExtendFromS32(Reg);
3752}
3753
3754/// Match a sign extend from a 32-bit value to 64-bits, or \p Reg itself if it
3755/// is 32-bit.
3757AMDGPUInstructionSelector::matchSignExtendFromS32OrS32(Register Reg) const {
3758 return MRI->getType(Reg) == LLT::scalar(32) ? Reg
3759 : matchSignExtendFromS32(Reg);
3760}
3761
3763AMDGPUInstructionSelector::matchExtendFromS32OrS32(Register Reg,
3764 bool IsSigned) const {
3765 if (IsSigned)
3766 return matchSignExtendFromS32OrS32(Reg);
3767
3768 return matchZeroExtendFromS32OrS32(Reg);
3769}
3770
3771Register AMDGPUInstructionSelector::matchAnyExtendFromS32(Register Reg) const {
3772 Register AnyExtSrc;
3773 if (mi_match(Reg, *MRI, m_GAnyExt(m_Reg(AnyExtSrc))))
3774 return MRI->getType(AnyExtSrc) == LLT::scalar(32) ? AnyExtSrc : Register();
3775
3776 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 G_IMPLICIT_DEF)
3777 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3778 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3779 return Register();
3780
3781 assert(Def->getNumOperands() == 3 &&
3782 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3783
3784 if (mi_match(Def->getOperand(2).getReg(), *MRI, m_GImplicitDef()))
3785 return Def->getOperand(1).getReg();
3786
3787 return Register();
3788}
3789
/// Select a global-load-to-LDS intrinsic into one of the GLOBAL_LOAD_LDS_*
/// machine instructions.
///
/// The opcode is chosen from the transfer size in bytes (operand 3: 1, 2, 4,
/// 12 or 16). Operand 2 is copied into M0, and the address in operand 1 is
/// split into an SGPR base plus a 32-bit VGPR offset where possible.
///
/// \returns false if the subtarget has no VMEM-to-LDS loads, if the size is
/// not one of the handled values, or if a 12/16-byte transfer is requested
/// without hasLDSLoadB96_B128(); true once the instruction has been built and
/// \p MI erased.
3790bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
3791 if (!Subtarget->hasVMemToLDSLoad())
3792 return false;
3793
3794 unsigned Opc;
3795 unsigned Size = MI.getOperand(3).getImm();
3796 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
3797
3798 switch (Size) {
3799 default:
3800 return false;
3801 case 1:
3802 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
3803 break;
3804 case 2:
3805 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
3806 break;
3807 case 4:
3808 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
3809 break;
3810 case 12:
     // 12- and 16-byte transfers are gated on a separate subtarget feature.
3811 if (!Subtarget->hasLDSLoadB96_B128())
3812 return false;
3813 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
3814 break;
3815 case 16:
3816 if (!Subtarget->hasLDSLoadB96_B128())
3817 return false;
3818 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
3819 break;
3820 }
3821
3822 MachineBasicBlock *MBB = MI.getParent();
3823 const DebugLoc &DL = MI.getDebugLoc();
     // Operand 2 is copied into M0 ahead of the load (presumably the LDS
     // destination base; confirm against the intrinsic definition).
3824 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3825 .add(MI.getOperand(2));
3826
3827 Register Addr = MI.getOperand(1).getReg();
3828 Register VOffset;
3829 // Try to split SAddr and VOffset. Global and LDS pointers share the same
3830 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
     // Two shapes are accepted: the address is an SGPR once copies are looked
     // through, or it is G_PTR_ADD(SGPR base, zext(s32 offset)).
3831 if (!isSGPR(Addr)) {
3832 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3833 if (isSGPR(AddrDef->Reg)) {
3834 Addr = AddrDef->Reg;
3835 } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3836 Register SAddr =
3837 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
3838 if (isSGPR(SAddr)) {
3839 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
3840 if (Register Off = matchZeroExtendFromS32(PtrBaseOffset)) {
3841 Addr = SAddr;
3842 VOffset = Off;
3843 }
3844 }
3845 }
3846 }
3847
3848 if (isSGPR(Addr)) {
     // NOTE(review): listing line 3849 is absent from this extract; confirm
     // against upstream what is done here before the VOffset handling.
     // An SGPR base is always followed by a VOffset operand below, so
     // materialize a zero in a VGPR when no offset was matched.
3850 if (!VOffset) {
3851 VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3852 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
3853 .addImm(0);
3854 }
3855 }
3856
3857 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
3858 .addReg(Addr);
3859
3860 if (isSGPR(Addr))
3861 MIB.addReg(VOffset);
3862
3863 MIB.add(MI.getOperand(4)); // offset
3864
3865 unsigned Aux = MI.getOperand(5).getImm();
3866 MIB.addImm(Aux & ~AMDGPU::CPol::VIRTUAL_BITS); // cpol
     // Final immediate: 1 if the intrinsic is one of the async LDS-DMA
     // variants, 0 otherwise.
3867 MIB.addImm(isAsyncLDSDMA(IntrinsicID));
3868
3869 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3870 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
     // Unlike the buffer variant, the immediate offset is recorded in the
     // pointer info; StorePtrI is copied after the offset is set.
3871 LoadPtrI.Offset = MI.getOperand(4).getImm();
3872 MachinePointerInfo StorePtrI = LoadPtrI;
3873 LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(),
     // NOTE(review): listing lines 3874-3876 (the rest of this statement and
     // whatever follows it) are absent from this extract.
3877 auto F = LoadMMO->getFlags() &
     // NOTE(review): the right-hand side of this mask (listing line 3878) is
     // absent from this extract.
     // Split the original memory operand into a load part of Size bytes and a
     // separate 4-byte-aligned store part, and attach both.
3879 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3880 Size, LoadMMO->getBaseAlign());
3881 MachineMemOperand *StoreMMO =
3882 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3883 sizeof(int32_t), Align(4));
3884
3885 MIB.setMemRefs({LoadMMO, StoreMMO});
3886
     // The intrinsic is erased first; register operands of the replacement
     // are constrained afterwards.
3887 MI.eraseFromParent();
3888 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3889 return true;
3890}
3891
3892bool AMDGPUInstructionSelector::selectTensorLoadStore(MachineInstr &MI,
3893 Intrinsic::ID IID) const {
3894 bool IsLoad = IID == Intrinsic::amdgcn_tensor_load_to_lds;
3895 unsigned Opc =
3896 IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d4 : AMDGPU::TENSOR_STORE_FROM_LDS_d4;
3897 int NumGroups = 4;
3898
3899 // A lamda function to check whether an operand is a vector of all 0s.
3900 const auto isAllZeros = [&](MachineOperand &Opnd) {
3901 const MachineInstr *DefMI = MRI->getVRegDef(Opnd.getReg());
3902 if (!DefMI)
3903 return false;
3904 return llvm::isBuildVectorAllZeros(*DefMI, *MRI, true);
3905 };
3906
3907 // Use _D2 version if both group 2 and 3 are zero-initialized.
3908 if (isAllZeros(MI.getOperand(3)) && isAllZeros(MI.getOperand(4))) {
3909 NumGroups = 2;
3910 Opc = IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d2
3911 : AMDGPU::TENSOR_STORE_FROM_LDS_d2;
3912 }
3913
3914 // TODO: Handle the fifth group: MI.getOpetand(5), which is silently ignored
3915 // for now because all existing targets only support up to 4 groups.
3916 MachineBasicBlock *MBB = MI.getParent();
3917 auto MIB = BuildMI(*MBB, &MI, MI.getDebugLoc(), TII.get(Opc))
3918 .add(MI.getOperand(1)) // D# group 0
3919 .add(MI.getOperand(2)); // D# group 1
3920
3921 if (NumGroups >= 4) { // Has at least 4 groups
3922 MIB.add(MI.getOperand(3)) // D# group 2
3923 .add(MI.getOperand(4)); // D# group 3
3924 }
3925
3926 MIB.addImm(0) // r128
3927 .add(MI.getOperand(6)); // cpol
3928
3929 MI.eraseFromParent();
3930 return true;
3931}
3932
3933bool AMDGPUInstructionSelector::selectBVHIntersectRayIntrinsic(
3934 MachineInstr &MI) const {
3935 unsigned OpcodeOpIdx =
3936 MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY ? 1 : 3;
3937 MI.setDesc(TII.get(MI.getOperand(OpcodeOpIdx).getImm()));
3938 MI.removeOperand(OpcodeOpIdx);
3939 MI.addImplicitDefUseOperands(*MI.getMF());
3940 constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
3941 return true;
3942}
3943
3944// FIXME: This should be removed and let the patterns select. We just need the
3945// AGPR/VGPR combination versions.
3946bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
3947 unsigned Opc;
3948 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
3949 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
3950 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
3951 break;
3952 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
3953 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
3954 break;
3955 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
3956 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
3957 break;
3958 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
3959 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
3960 break;
3961 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
3962 Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
3963 break;
3964 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
3965 Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
3966 break;
3967 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
3968 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
3969 break;
3970 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
3971 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
3972 break;
3973 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
3974 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
3975 break;
3976 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
3977 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
3978 break;
3979 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
3980 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
3981 break;
3982 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
3983 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
3984 break;
3985 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
3986 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
3987 break;
3988 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
3989 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
3990 break;
3991 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
3992 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_F16_e64;
3993 break;
3994 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
3995 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_F16_e64;
3996 break;
3997 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
3998 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF16_e64;
3999 break;
4000 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
4001 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF16_e64;
4002 break;
4003 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
4004 Opc = AMDGPU::V_SMFMAC_I32_16X16X128_I8_e64;
4005 break;
4006 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
4007 Opc = AMDGPU::V_SMFMAC_I32_32X32X64_I8_e64;
4008 break;
4009 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
4010 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_BF8_e64;
4011 break;
4012 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
4013 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_FP8_e64;
4014 break;
4015 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
4016 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_BF8_e64;
4017 break;
4018 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
4019 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_FP8_e64;
4020 break;
4021 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
4022 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_BF8_e64;
4023 break;
4024 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
4025 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_FP8_e64;
4026 break;
4027 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
4028 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_BF8_e64;
4029 break;
4030 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
4031 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_FP8_e64;
4032 break;
4033 default:
4034 llvm_unreachable("unhandled smfmac intrinsic");
4035 }
4036
4037 auto VDst_In = MI.getOperand(4);
4038
4039 MI.setDesc(TII.get(Opc));
4040 MI.removeOperand(4); // VDst_In
4041 MI.removeOperand(1); // Intrinsic ID
4042 MI.addOperand(VDst_In); // Readd VDst_In to the end
4043 MI.addImplicitDefUseOperands(*MI.getMF());
4044 const MCInstrDesc &MCID = MI.getDesc();
4045 if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) {
4046 MI.getOperand(0).setIsEarlyClobber(true);
4047 }
4048 return true;
4049}
4050
/// Select amdgcn_permlane16_swap / amdgcn_permlane32_swap by rewriting \p MI
/// in place into V_PERMLANE16_SWAP_B32_e64 or V_PERMLANE32_SWAP_B32_e64.
///
/// \returns false when the subtarget lacks the instruction matching
/// \p IntrID; true otherwise.
4051bool AMDGPUInstructionSelector::selectPermlaneSwapIntrin(
4052 MachineInstr &MI, Intrinsic::ID IntrID) const {
4053 if (IntrID == Intrinsic::amdgcn_permlane16_swap &&
4054 !Subtarget->hasPermlane16Swap())
4055 return false;
4056 if (IntrID == Intrinsic::amdgcn_permlane32_swap &&
4057 !Subtarget->hasPermlane32Swap())
4058 return false;
4059
     // Any ID other than permlane16_swap maps to the permlane32 opcode.
4060 unsigned Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
4061 ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
4062 : AMDGPU::V_PERMLANE32_SWAP_B32_e64;
4063
     // Drop operand 2 (presumably the intrinsic ID; confirm), switch the
     // descriptor, then append an implicit use of EXEC.
4064 MI.removeOperand(2);
4065 MI.setDesc(TII.get(Opcode));
4066 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
4067
4068 MachineOperand &FI = MI.getOperand(4);
     // NOTE(review): listing line 4069, which presumably uses FI, is absent
     // from this extract; as shown, FI is never read or written.
4070
4071 constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
4072 return true;
4073}
4074
4075bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
4076 Register DstReg = MI.getOperand(0).getReg();
4077 Register SrcReg = MI.getOperand(1).getReg();
4078 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
4079 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
4080 MachineBasicBlock *MBB = MI.getParent();
4081 const DebugLoc &DL = MI.getDebugLoc();
4082
4083 if (IsVALU) {
4084 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
4085 .addImm(Subtarget->getWavefrontSizeLog2())
4086 .addReg(SrcReg);
4087 } else {
4088 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
4089 .addReg(SrcReg)
4090 .addImm(Subtarget->getWavefrontSizeLog2())
4091 .setOperandDead(3); // Dead scc
4092 }
4093
4094 const TargetRegisterClass &RC =
4095 IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
4096 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
4097 return false;
4098
4099 MI.eraseFromParent();
4100 return true;
4101}
4102
/// Select the wave shuffle intrinsic using DS_BPERMUTE_B32.
///
/// Operand 0 is the result, operand 2 the value and operand 3 the index.
/// When the subtarget can bpermute across the whole wave a single
/// DS_BPERMUTE_B32 is emitted. Otherwise two bpermutes are issued, one on
/// the original value and one on the value after V_PERMLANE64_B32, and
/// V_CNDMASK_B32 picks between them per lane.
///
/// \returns false if the result type is not s32 or the subtarget does not
/// support bpermute; true once \p MI has been erased.
4103bool AMDGPUInstructionSelector::selectWaveShuffleIntrin(
4104 MachineInstr &MI) const {
4105 assert(MI.getNumOperands() == 4);
4106 MachineBasicBlock *MBB = MI.getParent();
4107 const DebugLoc &DL = MI.getDebugLoc();
4108
4109 Register DstReg = MI.getOperand(0).getReg();
4110 Register ValReg = MI.getOperand(2).getReg();
4111 Register IdxReg = MI.getOperand(3).getReg();
4112
4113 const LLT DstTy = MRI->getType(DstReg);
4114 unsigned DstSize = DstTy.getSizeInBits();
4115 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
     // DstRC is reused for every temporary created below.
4116 const TargetRegisterClass *DstRC =
4117 TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
4118
4119 if (DstTy != LLT::scalar(32))
4120 return false;
4121
4122 if (!Subtarget->supportsBPermute())
4123 return false;
4124
4125 // If we can bpermute across the whole wave, then just do that
4126 if (Subtarget->supportsWaveWideBPermute()) {
     // Shift the index left by 2 first, as in the fallback path below
     // (ds_bpermute requires index to be multiplied by 4).
4127 Register ShiftIdxReg = MRI->createVirtualRegister(DstRC);
4128 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), ShiftIdxReg)
4129 .addImm(2)
4130 .addReg(IdxReg);
4131
4132 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), DstReg)
4133 .addReg(ShiftIdxReg)
4134 .addReg(ValReg)
4135 .addImm(0);
4136 } else {
4137 // Otherwise, we need to make use of whole wave mode
4138 assert(Subtarget->isWave64());
4139
4140 // Set inactive lanes to poison
     // Both IMPLICIT_DEF registers below are shared by the two
     // V_SET_INACTIVE_B32 instructions (value and index).
4141 Register UndefValReg =
4142 MRI->createVirtualRegister(TRI.getRegClass(AMDGPU::SReg_32RegClassID));
4143 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefValReg);
4144
4145 Register UndefExecReg = MRI->createVirtualRegister(
4146 TRI.getRegClass(AMDGPU::SReg_64_XEXECRegClassID));
4147 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefExecReg);
4148
4149 Register PoisonValReg = MRI->createVirtualRegister(DstRC);
4150 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SET_INACTIVE_B32), PoisonValReg)
4151 .addImm(0)
4152 .addReg(ValReg)
4153 .addImm(0)
4154 .addReg(UndefValReg)
4155 .addReg(UndefExecReg);
4156
4157 // ds_bpermute requires index to be multiplied by 4
4158 Register ShiftIdxReg = MRI->createVirtualRegister(DstRC);
4159 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), ShiftIdxReg)
4160 .addImm(2)
4161 .addReg(IdxReg);
4162
4163 Register PoisonIdxReg = MRI->createVirtualRegister(DstRC);
4164 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SET_INACTIVE_B32), PoisonIdxReg)
4165 .addImm(0)
4166 .addReg(ShiftIdxReg)
4167 .addImm(0)
4168 .addReg(UndefValReg)
4169 .addReg(UndefExecReg);
4170
4171 // Get permutation of each half, then we'll select which one to use
4172 Register SameSidePermReg = MRI->createVirtualRegister(DstRC);
4173 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), SameSidePermReg)
4174 .addReg(PoisonIdxReg)
4175 .addReg(PoisonValReg)
4176 .addImm(0);
4177
     // The second bpermute reads the value after V_PERMLANE64_B32
     // (presumably the two 32-lane halves exchanged; confirm).
4178 Register SwappedValReg = MRI->createVirtualRegister(DstRC);
4179 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_PERMLANE64_B32), SwappedValReg)
4180 .addReg(PoisonValReg);
4181
4182 Register OppSidePermReg = MRI->createVirtualRegister(DstRC);
4183 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), OppSidePermReg)
4184 .addReg(PoisonIdxReg)
4185 .addReg(SwappedValReg)
4186 .addImm(0);
4187
4188 Register WWMSwapPermReg = MRI->createVirtualRegister(DstRC);
4189 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::STRICT_WWM), WWMSwapPermReg)
4190 .addReg(OppSidePermReg);
4191
4192 // Select which side to take the permute from
4193 // We can get away with only using mbcnt_lo here since we're only
4194 // trying to detect which side of 32 each lane is on, and mbcnt_lo
4195 // returns 32 for lanes 32-63.
4196 Register ThreadIDReg = MRI->createVirtualRegister(DstRC);
4197 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MBCNT_LO_U32_B32_e64), ThreadIDReg)
4198 .addImm(-1)
4199 .addImm(0);
4200
     // NOTE(review): PoisonIdxReg is derived from ShiftIdxReg (IdxReg << 2),
     // while the mask below tests bit 5 (value 32). Bit 5 of the shifted
     // index is bit 3 of the original index, so this compares the lane's
     // half bit against index bit 3 rather than index bit 5. Confirm that
     // this is intended.
4201 Register XORReg = MRI->createVirtualRegister(DstRC);
4202 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_XOR_B32_e64), XORReg)
4203 .addReg(ThreadIDReg)
4204 .addReg(PoisonIdxReg);
4205
4206 Register ANDReg = MRI->createVirtualRegister(DstRC);
4207 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_AND_B32_e64), ANDReg)
4208 .addReg(XORReg)
4209 .addImm(32);
4210
4211 Register CompareReg = MRI->createVirtualRegister(
4212 TRI.getRegClass(AMDGPU::SReg_64_XEXECRegClassID));
4213 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), CompareReg)
4214 .addReg(ANDReg)
4215 .addImm(0);
4216
4217 // Finally do the selection
4218 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
4219 .addImm(0)
4220 .addReg(WWMSwapPermReg)
4221 .addImm(0)
4222 .addReg(SameSidePermReg)
4223 .addReg(CompareReg);
4224 }
4225
4226 MI.eraseFromParent();
4227 return true;
4228}
4229
4230// Match BITOP3 operation and return a number of matched instructions plus
4231// truth table.
// R must be defined by G_AND, G_OR or G_XOR, otherwise {0, 0} is returned.
// Src collects at most three distinct leaf registers; each leaf is assigned
// one column of the 8-entry truth table (0xf0, 0xcc, 0xaa). All-ones and
// all-zeros constants map to 0xff and 0 without using a Src slot. If either
// operand of R cannot be mapped, Src is restored to its state on entry and
// {0, 0} is returned.
4232static std::pair<unsigned, uint8_t> BitOp3_Op(Register R,
     // NOTE(review): the parameter declaring Src (listing line 4233) is
     // absent from this extract. The body uses it as a mutable vector of
     // Register that is passed through to the recursive calls.
4234 const MachineRegisterInfo &MRI) {
4235 unsigned NumOpcodes = 0;
4236 uint8_t LHSBits, RHSBits;
4237
     // Computes the truth-table column for Op into Bits, growing or patching
     // Src as needed. Returns false when Op is a new leaf and Src is full.
4238 auto getOperandBits = [&Src, R, &MRI](Register Op, uint8_t &Bits) -> bool {
4239 // Define truth table given Src0, Src1, Src2 bits permutations:
4240 // 0 0 0
4241 // 0 0 1
4242 // 0 1 0
4243 // 0 1 1
4244 // 1 0 0
4245 // 1 0 1
4246 // 1 1 0
4247 // 1 1 1
4248 const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
4249
4250 if (mi_match(Op, MRI, m_AllOnesInt())) {
4251 Bits = 0xff;
4252 return true;
4253 }
4254 if (mi_match(Op, MRI, m_ZeroInt())) {
4255 Bits = 0;
4256 return true;
4257 }
4258
4259 for (unsigned I = 0; I < Src.size(); ++I) {
4260 // Try to find existing reused operand
4261 if (Src[I] == Op) {
4262 Bits = SrcBits[I];
4263 return true;
4264 }
4265 // Try to replace parent operator
     // The slot that held R (the instruction being expanded) is handed to
     // this operand, so expanding a node does not use up a new slot.
4266 if (Src[I] == R) {
4267 Bits = SrcBits[I];
4268 Src[I] = Op;
4269 return true;
4270 }
4271 }
4272
4273 if (Src.size() == 3) {
4274 // No room left for operands. Try one last time, there can be a 'not' of
4275 // one of our source operands. In this case we can compute the bits
4276 // without growing Src vector.
4277 Register LHS;
4278 if (mi_match(Op, MRI, m_Not(m_Reg(LHS)))) {
     // NOTE(review): listing line 4279 is absent from this extract;
     // confirm against upstream what is done with LHS before the search.
4280 for (unsigned I = 0; I < Src.size(); ++I) {
4281 if (Src[I] == LHS) {
4282 Bits = ~SrcBits[I];
4283 return true;
4284 }
4285 }
4286 }
4287
4288 return false;
4289 }
4290
     // New leaf with room to spare: it takes the next free column.
4291 Bits = SrcBits[Src.size()];
4292 Src.push_back(Op);
4293 return true;
4294 };
4295
4296 MachineInstr *MI = MRI.getVRegDef(R);
4297 switch (MI->getOpcode()) {
4298 case TargetOpcode::G_AND:
4299 case TargetOpcode::G_OR:
4300 case TargetOpcode::G_XOR: {
4301 Register LHS = getSrcRegIgnoringCopies(MI->getOperand(1).getReg(), MRI);
4302 Register RHS = getSrcRegIgnoringCopies(MI->getOperand(2).getReg(), MRI);
4303
     // getOperandBits may modify Src before failing; keep a copy so the
     // caller's vector is left untouched when this node is rejected.
4304 SmallVector<Register, 3> Backup(Src.begin(), Src.end());
4305 if (!getOperandBits(LHS, LHSBits) ||
4306 !getOperandBits(RHS, RHSBits)) {
4307 Src = std::move(Backup);
4308 return std::make_pair(0, 0);
4309 }
4310
4311 // Recursion is naturally limited by the size of the operand vector.
     // When an operand itself matches, its computed truth table replaces
     // the leaf column assigned above and its opcode count is added.
4312 auto Op = BitOp3_Op(LHS, Src, MRI);
4313 if (Op.first) {
4314 NumOpcodes += Op.first;
4315 LHSBits = Op.second;
4316 }
4317
4318 Op = BitOp3_Op(RHS, Src, MRI);
4319 if (Op.first) {
4320 NumOpcodes += Op.first;
4321 RHSBits = Op.second;
4322 }
4323 break;
4324 }
4325 default:
4326 return std::make_pair(0, 0);
4327 }
4328
     // Combine the operand tables with this node's own operation. Only the
     // three opcodes accepted by the switch above can reach this point.
4329 uint8_t TTbl;
4330 switch (MI->getOpcode()) {
4331 case TargetOpcode::G_AND:
4332 TTbl = LHSBits & RHSBits;
4333 break;
4334 case TargetOpcode::G_OR:
4335 TTbl = LHSBits | RHSBits;
4336 break;
4337 case TargetOpcode::G_XOR:
4338 TTbl = LHSBits ^ RHSBits;
4339 break;
4340 default:
4341 break;
4342 }
4343
     // The +1 counts this instruction itself.
4344 return std::make_pair(NumOpcodes + 1, TTbl);
4345}
4346
/// Try to select a tree of G_AND / G_OR / G_XOR rooted at \p MI as a single
/// V_BITOP3 instruction.
///
/// BitOp3_Op() supplies the number of matched operations, the truth table
/// and up to three source registers. The match is rejected when it is too
/// small to be worthwhile (see the thresholds below).
///
/// \returns false if the subtarget has no BITOP3 instructions, the result is
/// not on the VGPR bank, or the match is rejected; true once the new
/// instruction has been built and \p MI erased.
4347bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const {
4348 if (!Subtarget->hasBitOp3Insts())
4349 return false;
4350
4351 Register DstReg = MI.getOperand(0).getReg();
4352 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
4353 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
4354 if (!IsVALU)
4355 return false;
4356
     // NOTE(review): the declaration of Src (listing line 4357) is absent
     // from this extract; it is used below as a vector of Register.
4358 uint8_t TTbl;
4359 unsigned NumOpcodes;
4360
4361 std::tie(NumOpcodes, TTbl) = BitOp3_Op(DstReg, Src, *MRI);
4362
4363 // Src.empty() case can happen if all operands are all zero or all ones.
4364 // Normally it shall be optimized out before reaching this.
4365 if (NumOpcodes < 2 || Src.empty())
4366 return false;
4367
4368 const bool IsB32 = MRI->getType(DstReg) == LLT::scalar(32);
4369 if (NumOpcodes == 2 && IsB32) {
4370 // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
4371 // asm more readable. This cannot be modeled with AddedComplexity because
4372 // selector does not know how many operations we matched.
4373 if (mi_match(MI, *MRI, m_GXor(m_GXor(m_Reg(), m_Reg()), m_Reg())) ||
4374 mi_match(MI, *MRI, m_GOr(m_GOr(m_Reg(), m_Reg()), m_Reg())) ||
4375 mi_match(MI, *MRI, m_GOr(m_GAnd(m_Reg(), m_Reg()), m_Reg())))
4376 return false;
4377 } else if (NumOpcodes < 4) {
4378 // For a uniform case threshold should be higher to account for moves
4379 // between VGPRs and SGPRs. It needs one operand in a VGPR, rest two can be
4380 // in SGPRs and a readfirstlane after.
4381 return false;
4382 }
4383
     // Any non-s32 result uses a B16 opcode; with true16 instructions the
     // gfx1250 t16 or fake16 variant replaces the default.
4384 unsigned Opc = IsB32 ? AMDGPU::V_BITOP3_B32_e64 : AMDGPU::V_BITOP3_B16_e64;
4385 if (!IsB32 && STI.hasTrue16BitInsts())
4386 Opc = STI.useRealTrue16Insts() ? AMDGPU::V_BITOP3_B16_gfx1250_t16_e64
4387 : AMDGPU::V_BITOP3_B16_gfx1250_fake16_e64;
4388 unsigned CBL = STI.getConstantBusLimit(Opc);
4389 MachineBasicBlock *MBB = MI.getParent();
4390 const DebugLoc &DL = MI.getDebugLoc();
4391
     // SGPR sources are used directly until the constant bus limit is used
     // up; any further SGPR source is first copied into a new VGPR.
4392 for (unsigned I = 0; I < Src.size(); ++I) {
4393 const RegisterBank *RB = RBI.getRegBank(Src[I], *MRI, TRI);
4394 if (RB->getID() != AMDGPU::SGPRRegBankID)
4395 continue;
4396 if (CBL > 0) {
4397 --CBL;
4398 continue;
4399 }
4400 Register NewReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4401 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), NewReg)
4402 .addReg(Src[I]);
4403 Src[I] = NewReg;
4404 }
4405
4406 // Last operand can be ignored, turning a ternary operation into a binary.
4407 // For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
4408 // 'c' with 'a' here without changing the answer. In some pathological
4409 // cases it should be possible to get an operation with a single operand
4410 // too if optimizer would not catch it.
4411 while (Src.size() < 3)
4412 Src.push_back(Src[0]);
4413
     // The B16 forms take a source-modifier immediate before each source and
     // an op_sel immediate after the truth table; the B32 form has neither.
4414 auto MIB = BuildMI(*MBB, MI, DL, TII.get(Opc), DstReg);
4415 if (!IsB32)
4416 MIB.addImm(0); // src_mod0
4417 MIB.addReg(Src[0]);
4418 if (!IsB32)
4419 MIB.addImm(0); // src_mod1
4420 MIB.addReg(Src[1]);
4421 if (!IsB32)
4422 MIB.addImm(0); // src_mod2
4423 MIB.addReg(Src[2])
4424 .addImm(TTbl);
4425 if (!IsB32)
4426 MIB.addImm(0); // op_sel
4427
4428 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
4429 MI.eraseFromParent();
4430
4431 return true;
4432}
4433
4434bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
4435 Register SrcReg = MI.getOperand(0).getReg();
4436 if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI))
4437 return false;
4438
4439 MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
4440 Register SP =
4441 Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore();
4442 Register WaveAddr = getWaveAddress(DefMI);
4443 MachineBasicBlock *MBB = MI.getParent();
4444 const DebugLoc &DL = MI.getDebugLoc();
4445
4446 if (!WaveAddr) {
4447 WaveAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4448 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), WaveAddr)
4449 .addReg(SrcReg)
4450 .addImm(Subtarget->getWavefrontSizeLog2())
4451 .setOperandDead(3); // Dead scc
4452 }
4453
4454 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), SP)
4455 .addReg(WaveAddr);
4456
4457 MI.eraseFromParent();
4458 return true;
4459}
4460
4462
4463 if (!I.isPreISelOpcode()) {
4464 if (I.isCopy())
4465 return selectCOPY(I);
4466 return true;
4467 }
4468
4469 switch (I.getOpcode()) {
4470 case TargetOpcode::G_AND:
4471 case TargetOpcode::G_OR:
4472 case TargetOpcode::G_XOR:
4473 if (selectBITOP3(I))
4474 return true;
4475 if (selectImpl(I, *CoverageInfo))
4476 return true;
4477 return selectG_AND_OR_XOR(I);
4478 case TargetOpcode::G_ADD:
4479 case TargetOpcode::G_SUB:
4480 case TargetOpcode::G_PTR_ADD:
4481 if (selectImpl(I, *CoverageInfo))
4482 return true;
4483 return selectG_ADD_SUB(I);
4484 case TargetOpcode::G_UADDO:
4485 case TargetOpcode::G_USUBO:
4486 case TargetOpcode::G_UADDE:
4487 case TargetOpcode::G_USUBE:
4488 return selectG_UADDO_USUBO_UADDE_USUBE(I);
4489 case AMDGPU::G_AMDGPU_MAD_U64_U32:
4490 case AMDGPU::G_AMDGPU_MAD_I64_I32:
4491 return selectG_AMDGPU_MAD_64_32(I);
4492 case TargetOpcode::G_INTTOPTR:
4493 case TargetOpcode::G_BITCAST:
4494 case TargetOpcode::G_PTRTOINT:
4495 case TargetOpcode::G_FREEZE:
4496 return selectCOPY(I);
4497 case TargetOpcode::G_FNEG:
4498 if (selectImpl(I, *CoverageInfo))
4499 return true;
4500 return selectG_FNEG(I);
4501 case TargetOpcode::G_FABS:
4502 if (selectImpl(I, *CoverageInfo))
4503 return true;
4504 return selectG_FABS(I);
4505 case TargetOpcode::G_EXTRACT:
4506 return selectG_EXTRACT(I);
4507 case TargetOpcode::G_MERGE_VALUES:
4508 case TargetOpcode::G_CONCAT_VECTORS:
4509 return selectG_MERGE_VALUES(I);
4510 case TargetOpcode::G_UNMERGE_VALUES:
4511 return selectG_UNMERGE_VALUES(I);
4512 case TargetOpcode::G_BUILD_VECTOR:
4513 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
4514 return selectG_BUILD_VECTOR(I);
4515 case TargetOpcode::G_IMPLICIT_DEF:
4516 return selectG_IMPLICIT_DEF(I);
4517 case TargetOpcode::G_INSERT:
4518 return selectG_INSERT(I);
4519 case TargetOpcode::G_INTRINSIC:
4520 case TargetOpcode::G_INTRINSIC_CONVERGENT:
4521 return selectG_INTRINSIC(I);
4522 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
4523 case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
4524 return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
4525 case TargetOpcode::G_ICMP:
4526 case TargetOpcode::G_FCMP:
4527 if (selectG_ICMP_or_FCMP(I))
4528 return true;
4529 return selectImpl(I, *CoverageInfo);
4530 case TargetOpcode::G_LOAD:
4531 case TargetOpcode::G_ZEXTLOAD:
4532 case TargetOpcode::G_SEXTLOAD:
4533 case TargetOpcode::G_STORE:
4534 case TargetOpcode::G_ATOMIC_CMPXCHG:
4535 case TargetOpcode::G_ATOMICRMW_XCHG:
4536 case TargetOpcode::G_ATOMICRMW_ADD:
4537 case TargetOpcode::G_ATOMICRMW_SUB:
4538 case TargetOpcode::G_ATOMICRMW_AND:
4539 case TargetOpcode::G_ATOMICRMW_OR:
4540 case TargetOpcode::G_ATOMICRMW_XOR:
4541 case TargetOpcode::G_ATOMICRMW_MIN:
4542 case TargetOpcode::G_ATOMICRMW_MAX:
4543 case TargetOpcode::G_ATOMICRMW_UMIN:
4544 case TargetOpcode::G_ATOMICRMW_UMAX:
4545 case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
4546 case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
4547 case TargetOpcode::G_ATOMICRMW_USUB_COND:
4548 case TargetOpcode::G_ATOMICRMW_USUB_SAT:
4549 case TargetOpcode::G_ATOMICRMW_FADD:
4550 case TargetOpcode::G_ATOMICRMW_FMIN:
4551 case TargetOpcode::G_ATOMICRMW_FMAX:
4552 return selectG_LOAD_STORE_ATOMICRMW(I);
4553 case TargetOpcode::G_SELECT:
4554 return selectG_SELECT(I);
4555 case TargetOpcode::G_TRUNC:
4556 return selectG_TRUNC(I);
4557 case TargetOpcode::G_SEXT:
4558 case TargetOpcode::G_ZEXT:
4559 case TargetOpcode::G_ANYEXT:
4560 case TargetOpcode::G_SEXT_INREG:
4561 // This is a workaround. For extension from type i1, `selectImpl()` uses
4562 // patterns from TD file and generates an illegal VGPR to SGPR COPY as type
4563 // i1 can only be hold in a SGPR class.
4564 if (MRI->getType(I.getOperand(1).getReg()) != LLT::scalar(1) &&
4565 selectImpl(I, *CoverageInfo))
4566 return true;
4567 return selectG_SZA_EXT(I);
4568 case TargetOpcode::G_FPEXT:
4569 if (selectG_FPEXT(I))
4570 return true;
4571 return selectImpl(I, *CoverageInfo);
4572 case TargetOpcode::G_BRCOND:
4573 return selectG_BRCOND(I);
4574 case TargetOpcode::G_GLOBAL_VALUE:
4575 return selectG_GLOBAL_VALUE(I);
4576 case TargetOpcode::G_PTRMASK:
4577 return selectG_PTRMASK(I);
4578 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
4579 return selectG_EXTRACT_VECTOR_ELT(I);
4580 case TargetOpcode::G_INSERT_VECTOR_ELT:
4581 return selectG_INSERT_VECTOR_ELT(I);
4582 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
4583 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
4584 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
4585 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
4586 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
4587 const AMDGPU::ImageDimIntrinsicInfo *Intr =
4589 assert(Intr && "not an image intrinsic with image pseudo");
4590 return selectImageIntrinsic(I, Intr);
4591 }
4592 case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY:
4593 case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY:
4594 case AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY:
4595 return selectBVHIntersectRayIntrinsic(I);
4596 case AMDGPU::G_SBFX:
4597 case AMDGPU::G_UBFX:
4598 return selectG_SBFX_UBFX(I);
4599 case AMDGPU::G_SI_CALL:
4600 I.setDesc(TII.get(AMDGPU::SI_CALL));
4601 return true;
4602 case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
4603 return selectWaveAddress(I);
4604 case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN: {
4605 I.setDesc(TII.get(AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN));
4606 return true;
4607 }
4608 case AMDGPU::G_STACKRESTORE:
4609 return selectStackRestore(I);
4610 case AMDGPU::G_PHI:
4611 return selectPHI(I);
4612 case AMDGPU::G_AMDGPU_COPY_SCC_VCC:
4613 return selectCOPY_SCC_VCC(I);
4614 case AMDGPU::G_AMDGPU_COPY_VCC_SCC:
4615 return selectCOPY_VCC_SCC(I);
4616 case AMDGPU::G_AMDGPU_READANYLANE:
4617 return selectReadAnyLane(I);
4618 case TargetOpcode::G_CONSTANT:
4619 case TargetOpcode::G_FCONSTANT:
4620 default:
4621 return selectImpl(I, *CoverageInfo);
4622 }
4623 return false;
4624}
4625
4627AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
4628 return {{
4629 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
4630 }};
4631
4632}
4633
4634std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
4635 Register Src, bool IsCanonicalizing, bool AllowAbs, bool OpSel) const {
4636 unsigned Mods = 0;
4637 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
4638
4639 if (MI->getOpcode() == AMDGPU::G_FNEG) {
4640 Src = MI->getOperand(1).getReg();
4641 Mods |= SISrcMods::NEG;
4642 MI = getDefIgnoringCopies(Src, *MRI);
4643 } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) {
4644 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
4645 // denormal mode, but we're implicitly canonicalizing in a source operand.
4646 const ConstantFP *LHS =
4647 getConstantFPVRegVal(MI->getOperand(1).getReg(), *MRI);
4648 if (LHS && LHS->isZero()) {
4649 Mods |= SISrcMods::NEG;
4650 Src = MI->getOperand(2).getReg();
4651 }
4652 }
4653
4654 if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) {
4655 Src = MI->getOperand(1).getReg();
4656 Mods |= SISrcMods::ABS;
4657 }
4658
4659 if (OpSel)
4660 Mods |= SISrcMods::OP_SEL_0;
4661
4662 return std::pair(Src, Mods);
4663}
4664
4665std::pair<Register, unsigned>
4666AMDGPUInstructionSelector::selectVOP3PModsF32Impl(Register Src) const {
4667 unsigned Mods;
4668 std::tie(Src, Mods) = selectVOP3ModsImpl(Src);
4669 Mods |= SISrcMods::OP_SEL_1;
4670 return std::pair(Src, Mods);
4671}
4672
4673Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
4674 Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt,
4675 bool ForceVGPR) const {
4676 if ((Mods != 0 || ForceVGPR) &&
4677 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
4678
4679 // If we looked through copies to find source modifiers on an SGPR operand,
4680 // we now have an SGPR register source. To avoid potentially violating the
4681 // constant bus restriction, we need to insert a copy to a VGPR.
4682 Register VGPRSrc = MRI->cloneVirtualRegister(Root.getReg());
4683 BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(),
4684 TII.get(AMDGPU::COPY), VGPRSrc)
4685 .addReg(Src);
4686 Src = VGPRSrc;
4687 }
4688
4689 return Src;
4690}
4691
4692///
4693/// This will select either an SGPR or VGPR operand and will save us from
4694/// having to write an extra tablegen pattern.
4696AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
4697 return {{
4698 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
4699 }};
4700}
4701
4703AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
4704 Register Src;
4705 unsigned Mods;
4706 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4707
4708 return {{
4709 [=](MachineInstrBuilder &MIB) {
4710 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4711 },
4712 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4713 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4714 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4715 }};
4716}
4717
4719AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
4720 Register Src;
4721 unsigned Mods;
4722 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
4723 /*IsCanonicalizing=*/true,
4724 /*AllowAbs=*/false);
4725
4726 return {{
4727 [=](MachineInstrBuilder &MIB) {
4728 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4729 },
4730 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4731 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4732 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4733 }};
4734}
4735
4737AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
4738 return {{
4739 [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
4740 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4741 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4742 }};
4743}
4744
4746AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
4747 Register Src;
4748 unsigned Mods;
4749 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4750
4751 return {{
4752 [=](MachineInstrBuilder &MIB) {
4753 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4754 },
4755 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4756 }};
4757}
4758
4760AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
4761 MachineOperand &Root) const {
4762 Register Src;
4763 unsigned Mods;
4764 std::tie(Src, Mods) =
4765 selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/false);
4766
4767 return {{
4768 [=](MachineInstrBuilder &MIB) {
4769 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4770 },
4771 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4772 }};
4773}
4774
4776AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
4777 Register Src;
4778 unsigned Mods;
4779 std::tie(Src, Mods) =
4780 selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/true,
4781 /*AllowAbs=*/false);
4782
4783 return {{
4784 [=](MachineInstrBuilder &MIB) {
4785 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4786 },
4787 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4788 }};
4789}
4790
4792AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
4793 Register Reg = Root.getReg();
4794 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
4795 if (Def->getOpcode() == AMDGPU::G_FNEG || Def->getOpcode() == AMDGPU::G_FABS)
4796 return {};
4797 return {{
4798 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4799 }};
4800}
4801
4802enum class SrcStatus {
4807 // This means current op = [op_upper, op_lower] and src = -op_lower.
4810 // This means current op = [op_upper, op_lower] and src = [op_upper,
4811 // -op_lower].
4819};
4820/// Test if the MI is truncating to half, such as `%reg0:n = G_TRUNC %reg1:2n`
4821static bool isTruncHalf(const MachineInstr *MI,
4822 const MachineRegisterInfo &MRI) {
4823 if (MI->getOpcode() != AMDGPU::G_TRUNC)
4824 return false;
4825
4826 unsigned DstSize = MRI.getType(MI->getOperand(0).getReg()).getSizeInBits();
4827 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4828 return DstSize * 2 == SrcSize;
4829}
4830
4831/// Test if the MI is logic shift right with half bits,
4832/// such as `%reg0:2n =G_LSHR %reg1:2n, CONST(n)`
4833static bool isLshrHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
4834 if (MI->getOpcode() != AMDGPU::G_LSHR)
4835 return false;
4836
4837 Register ShiftSrc;
4838 std::optional<ValueAndVReg> ShiftAmt;
4839 if (mi_match(MI->getOperand(0).getReg(), MRI,
4840 m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {
4841 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4842 unsigned Shift = ShiftAmt->Value.getZExtValue();
4843 return Shift * 2 == SrcSize;
4844 }
4845 return false;
4846}
4847
4848/// Test if the MI is shift left with half bits,
4849/// such as `%reg0:2n =G_SHL %reg1:2n, CONST(n)`
4850static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
4851 if (MI->getOpcode() != AMDGPU::G_SHL)
4852 return false;
4853
4854 Register ShiftSrc;
4855 std::optional<ValueAndVReg> ShiftAmt;
4856 if (mi_match(MI->getOperand(0).getReg(), MRI,
4857 m_GShl(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {
4858 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4859 unsigned Shift = ShiftAmt->Value.getZExtValue();
4860 return Shift * 2 == SrcSize;
4861 }
4862 return false;
4863}
4864
4865/// Test function, if the MI is `%reg0:n, %reg1:n = G_UNMERGE_VALUES %reg2:2n`
4866static bool isUnmergeHalf(const MachineInstr *MI,
4867 const MachineRegisterInfo &MRI) {
4868 if (MI->getOpcode() != AMDGPU::G_UNMERGE_VALUES)
4869 return false;
4870 return MI->getNumOperands() == 3 && MI->getOperand(0).isDef() &&
4871 MI->getOperand(1).isDef() && !MI->getOperand(2).isDef();
4872}
4873
4875
4877 const MachineRegisterInfo &MRI) {
4878 LLT OpTy = MRI.getType(Reg);
4879 if (OpTy.isScalar())
4880 return TypeClass::SCALAR;
4881 if (OpTy.isVector() && OpTy.getNumElements() == 2)
4884}
4885
4887 const MachineRegisterInfo &MRI) {
4888 TypeClass NegType = isVectorOfTwoOrScalar(Reg, MRI);
4889 if (NegType != TypeClass::VECTOR_OF_TWO && NegType != TypeClass::SCALAR)
4890 return SrcStatus::INVALID;
4891
4892 switch (S) {
4893 case SrcStatus::IS_SAME:
4894 if (NegType == TypeClass::VECTOR_OF_TWO) {
4895 // Vector of 2:
4896 // [SrcHi, SrcLo] = [CurrHi, CurrLo]
4897 // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type)
4898 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4899 // [SrcHi, SrcLo] = [-OpHi, -OpLo]
4901 }
4902 if (NegType == TypeClass::SCALAR) {
4903 // Scalar:
4904 // [SrcHi, SrcLo] = [CurrHi, CurrLo]
4905 // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type)
4906 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4907 // [SrcHi, SrcLo] = [-OpHi, OpLo]
4908 return SrcStatus::IS_HI_NEG;
4909 }
4910 break;
4912 if (NegType == TypeClass::VECTOR_OF_TWO) {
4913 // Vector of 2:
4914 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4915 // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type)
4916 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4917 // [SrcHi, SrcLo] = [-(-OpHi), -OpLo] = [OpHi, -OpLo]
4918 return SrcStatus::IS_LO_NEG;
4919 }
4920 if (NegType == TypeClass::SCALAR) {
4921 // Scalar:
4922 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4923 // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type)
4924 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4925 // [SrcHi, SrcLo] = [-(-OpHi), OpLo] = [OpHi, OpLo]
4926 return SrcStatus::IS_SAME;
4927 }
4928 break;
4930 if (NegType == TypeClass::VECTOR_OF_TWO) {
4931 // Vector of 2:
4932 // [SrcHi, SrcLo] = [CurrHi, -CurrLo]
4933 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
4934 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4935 // [SrcHi, SrcLo] = [-OpHi, -(-OpLo)] = [-OpHi, OpLo]
4936 return SrcStatus::IS_HI_NEG;
4937 }
4938 if (NegType == TypeClass::SCALAR) {
4939 // Scalar:
4940 // [SrcHi, SrcLo] = [CurrHi, -CurrLo]
4941 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
4942 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4943 // [SrcHi, SrcLo] = [-OpHi, -OpLo]
4945 }
4946 break;
4948 if (NegType == TypeClass::VECTOR_OF_TWO) {
4949 // Vector of 2:
4950 // [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
4951 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
4952 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4953 // [SrcHi, SrcLo] = [OpHi, OpLo]
4954 return SrcStatus::IS_SAME;
4955 }
4956 if (NegType == TypeClass::SCALAR) {
4957 // Scalar:
4958 // [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
4959 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
4960 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4961 // [SrcHi, SrcLo] = [OpHi, -OpLo]
4962 return SrcStatus::IS_LO_NEG;
4963 }
4964 break;
4966 // Vector of 2:
4967 // Src = CurrUpper
4968 // Curr = [CurrUpper, CurrLower]
4969 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4970 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4971 // Src = -OpUpper
4972 //
4973 // Scalar:
4974 // Src = CurrUpper
4975 // Curr = [CurrUpper, CurrLower]
4976 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4977 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4978 // Src = -OpUpper
4981 if (NegType == TypeClass::VECTOR_OF_TWO) {
4982 // Vector of 2:
4983 // Src = CurrLower
4984 // Curr = [CurrUpper, CurrLower]
4985 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4986 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4987 // Src = -OpLower
4989 }
4990 if (NegType == TypeClass::SCALAR) {
4991 // Scalar:
4992 // Src = CurrLower
4993 // Curr = [CurrUpper, CurrLower]
4994 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4995 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4996 // Src = OpLower
4998 }
4999 break;
5001 // Vector of 2:
5002 // Src = -CurrUpper
5003 // Curr = [CurrUpper, CurrLower]
5004 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
5005 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
5006 // Src = -(-OpUpper) = OpUpper
5007 //
5008 // Scalar:
5009 // Src = -CurrUpper
5010 // Curr = [CurrUpper, CurrLower]
5011 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
5012 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
5013 // Src = -(-OpUpper) = OpUpper
5016 if (NegType == TypeClass::VECTOR_OF_TWO) {
5017 // Vector of 2:
5018 // Src = -CurrLower
5019 // Curr = [CurrUpper, CurrLower]
5020 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
5021 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
5022 // Src = -(-OpLower) = OpLower
5024 }
5025 if (NegType == TypeClass::SCALAR) {
5026 // Scalar:
5027 // Src = -CurrLower
5028 // Curr = [CurrUpper, CurrLower]
5029 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
5030 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
5031 // Src = -OpLower
5033 }
5034 break;
5035 default:
5036 break;
5037 }
5038 llvm_unreachable("unexpected SrcStatus & NegType combination");
5039}
5040
/// Take one step up the def chain from \p Curr.
///
/// \p Curr pairs a register with the status relating it to the value being
/// searched for. The result pairs an operand of the instruction defining
/// Curr.first with the correspondingly updated status, or is std::nullopt
/// when no rule below applies.
5041static std::optional<std::pair<Register, SrcStatus>>
5042calcNextStatus(std::pair<Register, SrcStatus> Curr,
5043 const MachineRegisterInfo &MRI) {
5044 const MachineInstr *MI = MRI.getVRegDef(Curr.first);
5045
5046 unsigned Opc = MI->getOpcode();
5047
5048 // Handle general Opc cases.
// G_BITCAST and COPY forward the status unchanged to their source (a COPY
// from a physical register ends the walk); G_FNEG maps the status through
// getNegStatus and ends the walk if that yields INVALID.
5049 switch (Opc) {
5050 case AMDGPU::G_BITCAST:
5051 return std::optional<std::pair<Register, SrcStatus>>(
5052 {MI->getOperand(1).getReg(), Curr.second});
5053 case AMDGPU::COPY:
5054 if (MI->getOperand(1).getReg().isPhysical())
5055 return std::nullopt;
5056 return std::optional<std::pair<Register, SrcStatus>>(
5057 {MI->getOperand(1).getReg(), Curr.second});
5058 case AMDGPU::G_FNEG: {
5059 SrcStatus Stat = getNegStatus(Curr.first, Curr.second, MRI);
5060 if (Stat == SrcStatus::INVALID)
5061 return std::nullopt;
5062 return std::optional<std::pair<Register, SrcStatus>>(
5063 {MI->getOperand(1).getReg(), Stat});
5064 }
5065 default:
5066 break;
5067 }
5068
5069 // Calc next Stat from current Stat.
5070 switch (Curr.second) {
5071 case SrcStatus::IS_SAME:
// A half-width trunc selects the lower half of its source. For a two-result
// unmerge, result operand 0 is the lower half and the other result the
// upper half of the unmerged source (operand 2).
5072 if (isTruncHalf(MI, MRI))
5073 return std::optional<std::pair<Register, SrcStatus>>(
5074 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF});
5075 else if (isUnmergeHalf(MI, MRI)) {
5076 if (Curr.first == MI->getOperand(0).getReg())
5077 return std::optional<std::pair<Register, SrcStatus>>(
5078 {MI->getOperand(2).getReg(), SrcStatus::IS_LOWER_HALF});
5079 return std::optional<std::pair<Register, SrcStatus>>(
5080 {MI->getOperand(2).getReg(), SrcStatus::IS_UPPER_HALF});
5081 }
5082 break;
5084 if (isTruncHalf(MI, MRI)) {
5085 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
5086 // [CurrHi, CurrLo] = trunc [OpUpper, OpLower] = OpLower
5087 // = [OpLowerHi, OpLowerLo]
5088 // Src = [SrcHi, SrcLo] = [-CurrHi, CurrLo]
5089 // = [-OpLowerHi, OpLowerLo]
5090 // = -OpLower
5091 return std::optional<std::pair<Register, SrcStatus>>(
5092 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
5093 }
5094 if (isUnmergeHalf(MI, MRI)) {
5095 if (Curr.first == MI->getOperand(0).getReg())
5096 return std::optional<std::pair<Register, SrcStatus>>(
5097 {MI->getOperand(2).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
5098 return std::optional<std::pair<Register, SrcStatus>>(
5099 {MI->getOperand(2).getReg(), SrcStatus::IS_UPPER_HALF_NEG});
5100 }
5101 break;
5103 if (isShlHalf(MI, MRI))
5104 return std::optional<std::pair<Register, SrcStatus>>(
5105 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF});
5106 break;
5108 if (isLshrHalf(MI, MRI))
5109 return std::optional<std::pair<Register, SrcStatus>>(
5110 {MI->getOperand(1).getReg(), SrcStatus::IS_UPPER_HALF});
5111 break;
5113 if (isShlHalf(MI, MRI))
5114 return std::optional<std::pair<Register, SrcStatus>>(
5115 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
5116 break;
5118 if (isLshrHalf(MI, MRI))
5119 return std::optional<std::pair<Register, SrcStatus>>(
5120 {MI->getOperand(1).getReg(), SrcStatus::IS_UPPER_HALF_NEG});
5121 break;
5122 default:
5123 break;
5124 }
// No rule matched for this opcode/status combination: stop the walk.
5125 return std::nullopt;
5126}
5127
5128/// This is used to control valid status that current MI supports. For example,
5129/// non floating point intrinsic such as @llvm.amdgcn.sdot2 does not support NEG
5130/// bit on VOP3P.
5131/// The class can be further extended to recognize support on SEL, NEG, ABS bit
5132/// for different MI on different arch
5134private:
// Whether statuses in the NEG_START..NEG_END range are accepted.
5135 bool HasNeg = false;
5136 // Assume all complex pattern of VOP3P have opsel.
// Whether statuses in the HALF_START..HALF_END range are accepted.
5137 bool HasOpsel = true;
5138
5139public:
// Constructor body: HasNeg is derived from the instruction that defines Reg.
5141 const MachineInstr *MI = MRI.getVRegDef(Reg);
5142 unsigned Opc = MI->getOpcode();
5143
5144 if (Opc == TargetOpcode::G_INTRINSIC) {
5145 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(*MI).getIntrinsicID();
5146 // Only float point intrinsic has neg & neg_hi bits.
5147 if (IntrinsicID == Intrinsic::amdgcn_fdot2)
5148 HasNeg = true;
5150 // Keep same for generic op.
5151 HasNeg = true;
5152 }
5153 }
/// Return false if \p Stat needs a capability that is disabled: a status in
/// NEG_START..NEG_END without HasNeg, or a status in HALF_START..HALF_END
/// without HasOpsel. Return true otherwise.
5154 bool checkOptions(SrcStatus Stat) const {
5155 if (!HasNeg &&
5156 (Stat >= SrcStatus::NEG_START && Stat <= SrcStatus::NEG_END)) {
5157 return false;
5158 }
5159 if (!HasOpsel &&
5160 (Stat >= SrcStatus::HALF_START && Stat <= SrcStatus::HALF_END)) {
5161 return false;
5162 }
5163 return true;
5164 }
5165};
5166
// Walks up the def chain of Reg with calcNextStatus, starting from status
// IS_SAME, and returns (in walk order) every (register, status) pair that
// SO.checkOptions accepts. Pairs rejected by checkOptions are skipped, but the
// walk still continues through them.
// NOTE(review): the loop condition is `Depth <= MaxDepth`, so up to
// MaxDepth + 1 steps are examined - confirm this is the intended bound.
5169 int MaxDepth = 3) {
5170 int Depth = 0;
5171 auto Curr = calcNextStatus({Reg, SrcStatus::IS_SAME}, MRI);
5173
5174 while (Depth <= MaxDepth && Curr.has_value()) {
5175 Depth++;
5176 if (SO.checkOptions(Curr.value().second))
5177 Statlist.push_back(Curr.value());
5178 Curr = calcNextStatus(Curr.value(), MRI);
5179 }
5180
5181 return Statlist;
5182}
5183
// Walks up the def chain of Reg with calcNextStatus and returns the last
// visited (register, status) pair that SO.checkOptions accepts and whose
// status passes the test below (IS_SAME, IS_HI_NEG, ...). If no step
// qualifies, the starting pair {Reg, IS_SAME} is returned.
// NOTE(review): the loop condition is `Depth <= MaxDepth`, so up to
// MaxDepth + 1 steps are examined - confirm this is the intended bound.
5184static std::pair<Register, SrcStatus>
5186 int MaxDepth = 3) {
5187 int Depth = 0;
5188 std::pair<Register, SrcStatus> LastSameOrNeg = {Reg, SrcStatus::IS_SAME};
5189 auto Curr = calcNextStatus(LastSameOrNeg, MRI);
5190
5191 while (Depth <= MaxDepth && Curr.has_value()) {
5192 Depth++;
5193 SrcStatus Stat = Curr.value().second;
5194 if (SO.checkOptions(Stat)) {
5195 if (Stat == SrcStatus::IS_SAME || Stat == SrcStatus::IS_HI_NEG ||
5197 LastSameOrNeg = Curr.value();
5198 }
// Keep walking even when this step was not recorded.
5199 Curr = calcNextStatus(Curr.value(), MRI);
5200 }
5201
5202 return LastSameOrNeg;
5203}
5204
5205static bool isSameBitWidth(Register Reg1, Register Reg2,
5206 const MachineRegisterInfo &MRI) {
5207 unsigned Width1 = MRI.getType(Reg1).getSizeInBits();
5208 unsigned Width2 = MRI.getType(Reg2).getSizeInBits();
5209 return Width1 == Width2;
5210}
5211
5212static unsigned updateMods(SrcStatus HiStat, SrcStatus LoStat, unsigned Mods) {
5213 // SrcStatus::IS_LOWER_HALF remain 0.
5214 if (HiStat == SrcStatus::IS_UPPER_HALF_NEG) {
5215 Mods ^= SISrcMods::NEG_HI;
5216 Mods |= SISrcMods::OP_SEL_1;
5217 } else if (HiStat == SrcStatus::IS_UPPER_HALF)
5218 Mods |= SISrcMods::OP_SEL_1;
5219 else if (HiStat == SrcStatus::IS_LOWER_HALF_NEG)
5220 Mods ^= SISrcMods::NEG_HI;
5221 else if (HiStat == SrcStatus::IS_HI_NEG)
5222 Mods ^= SISrcMods::NEG_HI;
5223
5224 if (LoStat == SrcStatus::IS_UPPER_HALF_NEG) {
5225 Mods ^= SISrcMods::NEG;
5226 Mods |= SISrcMods::OP_SEL_0;
5227 } else if (LoStat == SrcStatus::IS_UPPER_HALF)
5228 Mods |= SISrcMods::OP_SEL_0;
5229 else if (LoStat == SrcStatus::IS_LOWER_HALF_NEG)
5230 Mods |= SISrcMods::NEG;
5231 else if (LoStat == SrcStatus::IS_HI_NEG)
5232 Mods ^= SISrcMods::NEG;
5233
5234 return Mods;
5235}
5236
/// Return true if the hi/lo lane sources can be replaced by the single
/// register \p NewReg: \p NewReg must have the same bit width as \p RootReg
/// and both \p LoStat and \p HiStat must satisfy IsHalfState.
/// \p TII is not used by the code below.
5237static bool isValidToPack(SrcStatus HiStat, SrcStatus LoStat, Register NewReg,
5238 Register RootReg, const SIInstrInfo &TII,
5239 const MachineRegisterInfo &MRI) {
5240 auto IsHalfState = [](SrcStatus S) {
5243 };
5244 return isSameBitWidth(NewReg, RootReg, MRI) && IsHalfState(LoStat) &&
5245 IsHalfState(HiStat);
5246}
5247
/// Compute the source register and modifier bits for a VOP3P operand.
///
/// \param RootReg Register of the operand being selected.
/// \param MRI     Register info used to follow definitions.
/// \param IsDOT   Set for DOT instructions; together with the subtarget's
///                hasDOTOpSelHazard() it disables the per-lane packing search.
/// \returns {register to use, modifier bits}. Unless a packing of both lanes
///          into one register is found, the result has OP_SEL_1 set.
5248std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3PModsImpl(
5249 Register RootReg, const MachineRegisterInfo &MRI, bool IsDOT) const {
5250 unsigned Mods = 0;
5251 // No modification if Root type is not form of <2 x Type>.
5252 if (isVectorOfTwoOrScalar(RootReg, MRI) != TypeClass::VECTOR_OF_TWO) {
5253 Mods |= SISrcMods::OP_SEL_1;
5254 return {RootReg, Mods};
5255 }
5256
5257 SearchOptions SO(RootReg, MRI);
5258
// Look through whole-register negations on the way up from RootReg and turn
// the resulting status into negate modifier bits.
5259 std::pair<Register, SrcStatus> Stat = getLastSameOrNeg(RootReg, MRI, SO);
5260
5261 if (Stat.second == SrcStatus::IS_BOTH_NEG)
5263 else if (Stat.second == SrcStatus::IS_HI_NEG)
5264 Mods ^= SISrcMods::NEG_HI;
5265 else if (Stat.second == SrcStatus::IS_LO_NEG)
5266 Mods ^= SISrcMods::NEG;
5267
5268 MachineInstr *MI = MRI.getVRegDef(Stat.first);
5269
// Per-lane analysis only applies to a G_BUILD_VECTOR with three operands
// (one def, two sources), and is skipped for DOT instructions on subtargets
// with the DOT op_sel hazard.
5270 if (MI->getOpcode() != AMDGPU::G_BUILD_VECTOR || MI->getNumOperands() != 3 ||
5271 (IsDOT && Subtarget->hasDOTOpSelHazard())) {
5272 Mods |= SISrcMods::OP_SEL_1;
5273 return {Stat.first, Mods};
5274 }
5275
// Candidate (register, status) pairs for the high lane (operand 2).
5277 getSrcStats(MI->getOperand(2).getReg(), MRI, SO);
5278
5279 if (StatlistHi.empty()) {
5280 Mods |= SISrcMods::OP_SEL_1;
5281 return {Stat.first, Mods};
5282 }
5283
// Candidate (register, status) pairs for the low lane (operand 1).
5285 getSrcStats(MI->getOperand(1).getReg(), MRI, SO);
5286
5287 if (StatlistLo.empty()) {
5288 Mods |= SISrcMods::OP_SEL_1;
5289 return {Stat.first, Mods};
5290 }
5291
// Search both lists from their last entries backwards for a register that
// appears in both and is valid to pack; use it with lane-specific modifiers.
5292 for (int I = StatlistHi.size() - 1; I >= 0; I--) {
5293 for (int J = StatlistLo.size() - 1; J >= 0; J--) {
5294 if (StatlistHi[I].first == StatlistLo[J].first &&
5295 isValidToPack(StatlistHi[I].second, StatlistLo[J].second,
5296 StatlistHi[I].first, RootReg, TII, MRI))
5297 return {StatlistHi[I].first,
5298 updateMods(StatlistHi[I].second, StatlistLo[J].second, Mods)};
5299 }
5300 }
5301 // Packed instructions do not have abs modifiers.
5302 Mods |= SISrcMods::OP_SEL_1;
5303
5304 return {Stat.first, Mods};
5305}
5306
5307// Removed unused function `getAllKindImm` to eliminate dead code.
5308
5309static bool checkRB(Register Reg, unsigned int RBNo,
5310 const AMDGPURegisterBankInfo &RBI,
5311 const MachineRegisterInfo &MRI,
5312 const TargetRegisterInfo &TRI) {
5313 const RegisterBank *RB = RBI.getRegBank(Reg, MRI, TRI);
5314 return RB->getID() == RBNo;
5315}
5316
5317// This function is used to get the correct register bank for returned reg.
5318// Assume:
5319// 1. VOP3P is always legal for VGPR.
5320// 2. RootOp's regbank is legal.
5321// Thus
5322// 1. If RootOp is SGPR, then NewOp can be SGPR or VGPR.
5323// 2. If RootOp is VGPR, then NewOp must be VGPR.
// Returns NewReg, RootReg, or the result of a newly inserted COPY of NewReg,
// as decided below.
5325 const AMDGPURegisterBankInfo &RBI,
5327 const TargetRegisterInfo &TRI,
5328 const SIInstrInfo &TII) {
5329 // RootOp can only be VGPR or SGPR (some hand written cases such as.
5330 // inst-select-ashr.v2s16.mir::ashr_v2s16_vs).
// NewReg can be used directly if the root is an SGPR or NewReg is a VGPR.
5331 if (checkRB(RootReg, AMDGPU::SGPRRegBankID, RBI, MRI, TRI) ||
5332 checkRB(NewReg, AMDGPU::VGPRRegBankID, RBI, MRI, TRI))
5333 return NewReg;
5334
5335 MachineInstr *MI = MRI.getVRegDef(RootReg);
5336 if (MI->getOpcode() == AMDGPU::COPY && NewReg == MI->getOperand(1).getReg()) {
5337 // RootOp is VGPR, NewOp is not VGPR, but RootOp = COPY NewOp.
5338 return RootReg;
5339 }
5340
// Otherwise copy NewReg into a fresh register cloned from RootReg; the COPY
// is inserted before the instruction that defines RootReg.
5341 MachineBasicBlock *BB = MI->getParent();
5342 Register DstReg = MRI.cloneVirtualRegister(RootReg);
5343
5345 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
5346 .addReg(NewReg);
5347
5348 // Only accept VGPR.
5349 return MIB->getOperand(0).getReg();
5350}
5351
5353AMDGPUInstructionSelector::selectVOP3PRetHelper(MachineOperand &Root,
5354 bool IsDOT) const {
5355 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5356 Register Reg;
5357 unsigned Mods;
5358 std::tie(Reg, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, IsDOT);
5359
5360 Reg = getLegalRegBank(Reg, Root.getReg(), RBI, MRI, TRI, TII);
5361 return {{
5362 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
5363 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5364 }};
5365}
5366
5368AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
5369
5370 return selectVOP3PRetHelper(Root);
5371}
5372
5374AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
5375
5376 return selectVOP3PRetHelper(Root, true);
5377}
5378
5380AMDGPUInstructionSelector::selectVOP3PNoModsDOT(MachineOperand &Root) const {
5381 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5382 Register Src;
5383 unsigned Mods;
5384 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, true /*IsDOT*/);
5385 if (Mods != SISrcMods::OP_SEL_1)
5386 return {};
5387
5388 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }}};
5389}
5390
5392AMDGPUInstructionSelector::selectVOP3PModsF32(MachineOperand &Root) const {
5393 Register Src;
5394 unsigned Mods;
5395 std::tie(Src, Mods) = selectVOP3PModsF32Impl(Root.getReg());
5396
5397 return {{
5398 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5399 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5400 }};
5401}
5402
5404AMDGPUInstructionSelector::selectVOP3PNoModsF32(MachineOperand &Root) const {
5405 Register Src;
5406 unsigned Mods;
5407 std::tie(Src, Mods) = selectVOP3PModsF32Impl(Root.getReg());
5408 if (Mods != SISrcMods::OP_SEL_1)
5409 return {};
5410
5411 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }}};
5412}
5413
5415AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
5416 MachineOperand &Root) const {
5417 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
5418 "expected i1 value");
5419 unsigned Mods = SISrcMods::OP_SEL_1;
5420 if (Root.getImm() != 0)
5421 Mods |= SISrcMods::OP_SEL_0;
5422
5423 return {{
5424 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5425 }};
5426}
5427
// Builds a REG_SEQUENCE from the registers in Elts using a MachineIRBuilder
// positioned at InsertPt, and returns the register it defines. The result
// register class is chosen from the element count: 2 -> VReg_64,
// 4 -> VReg_128, 8 -> VReg_256. Any other count is llvm_unreachable.
5429 MachineInstr *InsertPt,
5430 MachineRegisterInfo &MRI) {
5431 const TargetRegisterClass *DstRegClass;
5432 switch (Elts.size()) {
5433 case 8:
5434 DstRegClass = &AMDGPU::VReg_256RegClass;
5435 break;
5436 case 4:
5437 DstRegClass = &AMDGPU::VReg_128RegClass;
5438 break;
5439 case 2:
5440 DstRegClass = &AMDGPU::VReg_64RegClass;
5441 break;
5442 default:
5443 llvm_unreachable("unhandled Reg sequence size");
5444 }
5445
5446 MachineIRBuilder B(*InsertPt);
5447 auto MIB = B.buildInstr(AMDGPU::REG_SEQUENCE)
5448 .addDef(MRI.createVirtualRegister(DstRegClass));
5449 for (unsigned i = 0; i < Elts.size(); ++i) {
5450 MIB.addReg(Elts[i]);
5452 }
// Operand 0 is the def that was added right after creating the instruction.
5453 return MIB->getOperand(0).getReg();
5454}
5455
/// Update \p Mods and \p Src for a WMMA operand whose elements were all
/// defined by \p ModOpcode (G_FNEG or G_FABS, asserted below).
///
/// \p Elts holds the inputs of those per-element modifier instructions. Src is
/// replaced by a REG_SEQUENCE (built at \p InsertPt) of the unmodified
/// elements:
///  - G_FNEG: sets NEG. If every element in Elts is itself a G_FABS, NEG_HI
///    is set as well and the sequence is built from the fabs inputs.
///  - G_FABS: sets NEG_HI.
5456static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
5458 MachineInstr *InsertPt,
5459 MachineRegisterInfo &MRI) {
5460 if (ModOpcode == TargetOpcode::G_FNEG) {
5461 Mods |= SISrcMods::NEG;
5462 // Check if all elements also have abs modifier
5463 SmallVector<Register, 8> NegAbsElts;
5464 for (auto El : Elts) {
5465 Register FabsSrc;
5466 if (!mi_match(El, MRI, m_GFabs(m_Reg(FabsSrc))))
5467 break;
5468 NegAbsElts.push_back(FabsSrc);
5469 }
// The sizes differ exactly when the loop above stopped early at an element
// that is not a G_FABS.
5470 if (Elts.size() != NegAbsElts.size()) {
5471 // Neg
5472 Src = buildRegSequence(Elts, InsertPt, MRI);
5473 } else {
5474 // Neg and Abs
5475 Mods |= SISrcMods::NEG_HI;
5476 Src = buildRegSequence(NegAbsElts, InsertPt, MRI);
5477 }
5478 } else {
5479 assert(ModOpcode == TargetOpcode::G_FABS);
5480 // Abs
5481 Mods |= SISrcMods::NEG_HI;
5482 Src = buildRegSequence(Elts, InsertPt, MRI);
5483 }
5484}
5485
// Matches a WMMA operand and folds a uniform per-element fneg/fabs into the
// modifier bits. Folding happens only if the operand is defined by a
// build-vector and every source is defined by the same opcode (G_FNEG or
// G_FABS); otherwise the root register is used with OP_SEL_1 only.
// Renderers, in order: source register, modifier immediate.
5487AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const {
5488 Register Src = Root.getReg();
5489 unsigned Mods = SISrcMods::OP_SEL_1;
5491
5492 if (GBuildVector *BV = dyn_cast<GBuildVector>(MRI->getVRegDef(Src))) {
5493 assert(BV->getNumSources() > 0);
5494 // Based on first element decide which mod we match, neg or abs
5495 MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(0));
5496 unsigned ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG)
5497 ? AMDGPU::G_FNEG
5498 : AMDGPU::G_FABS;
// Collect the modifier inputs; stop at the first source whose defining
// opcode differs from ModOpcode.
5499 for (unsigned i = 0; i < BV->getNumSources(); ++i) {
5500 ElF32 = MRI->getVRegDef(BV->getSourceReg(i));
5501 if (ElF32->getOpcode() != ModOpcode)
5502 break;
5503 EltsF32.push_back(ElF32->getOperand(1).getReg());
5504 }
5505
5506 // All elements had ModOpcode modifier
5507 if (BV->getNumSources() == EltsF32.size()) {
5508 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, Root.getParent(),
5509 *MRI);
5510 }
5511 }
5512
5513 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5514 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5515}
5516
5518AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const {
5519 Register Src = Root.getReg();
5520 unsigned Mods = SISrcMods::OP_SEL_1;
5521 SmallVector<Register, 8> EltsV2F16;
5522
5523 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
5524 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
5525 Register FNegSrc;
5526 if (!mi_match(CV->getSourceReg(i), *MRI, m_GFNeg(m_Reg(FNegSrc))))
5527 break;
5528 EltsV2F16.push_back(FNegSrc);
5529 }
5530
5531 // All elements had ModOpcode modifier
5532 if (CV->getNumSources() == EltsV2F16.size()) {
5533 Mods |= SISrcMods::NEG;
5534 Mods |= SISrcMods::NEG_HI;
5535 Src = buildRegSequence(EltsV2F16, Root.getParent(), *MRI);
5536 }
5537 }
5538
5539 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5540 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5541}
5542
5544AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const {
5545 Register Src = Root.getReg();
5546 unsigned Mods = SISrcMods::OP_SEL_1;
5547 SmallVector<Register, 8> EltsV2F16;
5548
5549 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
5550 assert(CV->getNumSources() > 0);
5551 MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(0));
5552 // Based on first element decide which mod we match, neg or abs
5553 unsigned ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG)
5554 ? AMDGPU::G_FNEG
5555 : AMDGPU::G_FABS;
5556
5557 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
5558 ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i));
5559 if (ElV2F16->getOpcode() != ModOpcode)
5560 break;
5561 EltsV2F16.push_back(ElV2F16->getOperand(1).getReg());
5562 }
5563
5564 // All elements had ModOpcode modifier
5565 if (CV->getNumSources() == EltsV2F16.size()) {
5566 MachineIRBuilder B(*Root.getParent());
5567 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, Root.getParent(),
5568 *MRI);
5569 }
5570 }
5571
5572 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5573 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5574}
5575
5577AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const {
5578 std::optional<FPValueAndVReg> FPValReg;
5579 if (mi_match(Root.getReg(), *MRI, m_GFCstOrSplat(FPValReg))) {
5580 if (TII.isInlineConstant(FPValReg->Value)) {
5581 return {{[=](MachineInstrBuilder &MIB) {
5582 MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue());
5583 }}};
5584 }
5585 // Non-inlineable splat floats should not fall-through for integer immediate
5586 // checks.
5587 return {};
5588 }
5589
5590 APInt ICst;
5591 if (mi_match(Root.getReg(), *MRI, m_ICstOrSplat(ICst))) {
5592 if (TII.isInlineConstant(ICst)) {
5593 return {
5594 {[=](MachineInstrBuilder &MIB) { MIB.addImm(ICst.getSExtValue()); }}};
5595 }
5596 }
5597
5598 return {};
5599}
5600
5602AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const {
5603 Register Src =
5604 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
5605 unsigned Key = 0;
5606
5607 Register ShiftSrc;
5608 std::optional<ValueAndVReg> ShiftAmt;
5609 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
5610 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
5611 ShiftAmt->Value.getZExtValue() % 8 == 0) {
5612 Key = ShiftAmt->Value.getZExtValue() / 8;
5613 Src = ShiftSrc;
5614 }
5615
5616 return {{
5617 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5618 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
5619 }};
5620}
5621
5623AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const {
5624
5625 Register Src =
5626 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
5627 unsigned Key = 0;
5628
5629 Register ShiftSrc;
5630 std::optional<ValueAndVReg> ShiftAmt;
5631 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
5632 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
5633 ShiftAmt->Value.getZExtValue() == 16) {
5634 Src = ShiftSrc;
5635 Key = 1;
5636 }
5637
5638 return {{
5639 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5640 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
5641 }};
5642}
5643
5645AMDGPUInstructionSelector::selectSWMMACIndex32(MachineOperand &Root) const {
5646 Register Src =
5647 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
5648 unsigned Key = 0;
5649
5650 Register S32 = matchZeroExtendFromS32(Src);
5651 if (!S32)
5652 S32 = matchAnyExtendFromS32(Src);
5653
5654 if (S32) {
5655 const MachineInstr *Def = getDefIgnoringCopies(S32, *MRI);
5656 if (Def->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) {
5657 assert(Def->getNumOperands() == 3);
5658 Register DstReg1 = Def->getOperand(1).getReg();
5659 if (mi_match(S32, *MRI,
5660 m_any_of(m_SpecificReg(DstReg1), m_Copy(m_Reg(DstReg1))))) {
5661 Src = Def->getOperand(2).getReg();
5662 Key = 1;
5663 }
5664 }
5665 }
5666
5667 return {{
5668 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5669 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
5670 }};
5671}
5672
5674AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
5675 Register Src;
5676 unsigned Mods;
5677 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
5678
5679 // FIXME: Handle op_sel
5680 return {{
5681 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5682 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5683 }};
5684}
5685
5686// FIXME-TRUE16 remove when fake16 is removed
5688AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
5689 Register Src;
5690 unsigned Mods;
5691 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
5692 /*IsCanonicalizing=*/true,
5693 /*AllowAbs=*/false,
5694 /*OpSel=*/false);
5695
5696 return {{
5697 [=](MachineInstrBuilder &MIB) {
5698 MIB.addReg(
5699 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
5700 },
5701 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
5702 }};
5703}
5704
5706AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
5707 Register Src;
5708 unsigned Mods;
5709 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
5710 /*IsCanonicalizing=*/true,
5711 /*AllowAbs=*/false,
5712 /*OpSel=*/true);
5713
5714 return {{
5715 [=](MachineInstrBuilder &MIB) {
5716 MIB.addReg(
5717 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
5718 },
5719 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
5720 }};
5721}
5722
// Returns false up front when the subtarget reports no scale-offset support or
// when the first memory operand of Root's parent has no known size. Otherwise
// the offset (after looking through an s32 extension and copies) is matched
// against several "value scaled by the access size" forms; on a match Offset
// is replaced by the unscaled operand.
// NOTE(review): this listing skips its own lines 5727, 5753-5754, 5756 and
// 5770, so the declaration of `Offset` and parts of the match expression are
// not visible here. Confirm against the upstream file before editing.
5723 // Given \p Offset and load specified by the \p Root operand check if \p Offset
5724 // is a multiple of the load byte size. If it is update \p Offset to a
5725 // pre-scaled value and return true.
5726bool AMDGPUInstructionSelector::selectScaleOffset(MachineOperand &Root,
5728 bool IsSigned) const {
5729 if (!Subtarget->hasScaleOffset())
5730 return false;
5731
5732 const MachineInstr &MI = *Root.getParent();
// Only the first memory operand is consulted for the access size.
5733 MachineMemOperand *MMO = *MI.memoperands_begin();
5734
5735 if (!MMO->getSize().hasValue())
5736 return false;
5737
5738 uint64_t Size = MMO->getSize().getValue();
5739
// Prefer the 32-bit value underneath an extension; fall back to Offset itself.
5740 Register OffsetReg = matchExtendFromS32OrS32(Offset, IsSigned);
5741 if (!OffsetReg)
5742 OffsetReg = Offset;
5743
// Look through copies to the defining register.
5744 if (auto Def = getDefSrcRegIgnoringCopies(OffsetReg, *MRI))
5745 OffsetReg = Def->Reg;
5746
5747 Register Op0;
5748 MachineInstr *Mul;
// The shift form is only considered for power-of-two sizes. For a signed
// request the unsigned MAD opcode is also accepted when operand 2's sign bit
// is known to be zero.
5749 bool ScaleOffset =
5750 (isPowerOf2_64(Size) &&
5751 mi_match(OffsetReg, *MRI,
5752 m_GShl(m_Reg(Op0),
5755 mi_match(OffsetReg, *MRI,
5757 m_Copy(m_SpecificICst(Size))))) ||
5758 mi_match(
5759 OffsetReg, *MRI,
5760 m_BinOp(IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO : AMDGPU::S_MUL_U64,
5761 m_Reg(Op0), m_SpecificICst(Size))) ||
5762 // Match G_AMDGPU_MAD_U64_U32 offset, c, 0
5763 (mi_match(OffsetReg, *MRI, m_MInstr(Mul)) &&
5764 (Mul->getOpcode() == (IsSigned ? AMDGPU::G_AMDGPU_MAD_I64_I32
5765 : AMDGPU::G_AMDGPU_MAD_U64_U32) ||
5766 (IsSigned && Mul->getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32 &&
5767 VT->signBitIsZero(Mul->getOperand(2).getReg()))) &&
5768 mi_match(Mul->getOperand(4).getReg(), *MRI, m_ZeroInt()) &&
5769 mi_match(Mul->getOperand(3).getReg(), *MRI,
5771 m_Copy(m_SpecificICst(Size))))) &&
5772 mi_match(Mul->getOperand(2).getReg(), *MRI, m_Reg(Op0)));
5773
// Offset is only modified on success.
5774 if (ScaleOffset)
5775 Offset = Op0;
5776
5777 return ScaleOffset;
5778}
5779
5780bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
5781 Register &Base,
5782 Register *SOffset,
5783 int64_t *Offset,
5784 bool *ScaleOffset) const {
5785 MachineInstr *MI = Root.getParent();
5786 MachineBasicBlock *MBB = MI->getParent();
5787
5788 // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
5789 // then we can select all ptr + 32-bit offsets.
5790 SmallVector<GEPInfo, 4> AddrInfo;
5791 getAddrModeInfo(*MI, *MRI, AddrInfo);
5792
5793 if (AddrInfo.empty())
5794 return false;
5795
5796 const GEPInfo &GEPI = AddrInfo[0];
5797 std::optional<int64_t> EncodedImm;
5798
5799 if (ScaleOffset)
5800 *ScaleOffset = false;
5801
5802 if (SOffset && Offset) {
5803 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
5804 /*HasSOffset=*/true);
5805 if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
5806 AddrInfo.size() > 1) {
5807 const GEPInfo &GEPI2 = AddrInfo[1];
5808 if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
5809 Register OffsetReg = GEPI2.SgprParts[1];
5810 if (ScaleOffset)
5811 *ScaleOffset =
5812 selectScaleOffset(Root, OffsetReg, false /* IsSigned */);
5813 OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg);
5814 if (OffsetReg) {
5815 Base = GEPI2.SgprParts[0];
5816 *SOffset = OffsetReg;
5817 *Offset = *EncodedImm;
5818 if (*Offset >= 0 || !AMDGPU::hasSMRDSignedImmOffset(STI))
5819 return true;
5820
5821 // For unbuffered smem loads, it is illegal for the Immediate Offset
5822 // to be negative if the resulting (Offset + (M0 or SOffset or zero)
5823 // is negative. Handle the case where the Immediate Offset + SOffset
5824 // is negative.
5825 auto SKnown = VT->getKnownBits(*SOffset);
5826 if (*Offset + SKnown.getMinValue().getSExtValue() < 0)
5827 return false;
5828
5829 return true;
5830 }
5831 }
5832 }
5833 return false;
5834 }
5835
5836 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
5837 /*HasSOffset=*/false);
5838 if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
5839 Base = GEPI.SgprParts[0];
5840 *Offset = *EncodedImm;
5841 return true;
5842 }
5843
5844 // SGPR offset is unsigned.
5845 if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(GEPI.Imm) &&
5846 GEPI.Imm != 0) {
5847 // If we make it this far we have a load with an 32-bit immediate offset.
5848 // It is OK to select this using a sgpr offset, because we have already
5849 // failed trying to select this load into one of the _IMM variants since
5850 // the _IMM Patterns are considered before the _SGPR patterns.
5851 Base = GEPI.SgprParts[0];
5852 *SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5853 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset)
5854 .addImm(GEPI.Imm);
5855 return true;
5856 }
5857
5858 if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) {
5859 Register OffsetReg = GEPI.SgprParts[1];
5860 if (ScaleOffset)
5861 *ScaleOffset = selectScaleOffset(Root, OffsetReg, false /* IsSigned */);
5862 OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg);
5863 if (OffsetReg) {
5864 Base = GEPI.SgprParts[0];
5865 *SOffset = OffsetReg;
5866 return true;
5867 }
5868 }
5869
5870 return false;
5871}
5872
5874AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
5875 Register Base;
5876 int64_t Offset;
5877 if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset,
5878 /* ScaleOffset */ nullptr))
5879 return std::nullopt;
5880
5881 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5882 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
5883}
5884
5886AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
5887 SmallVector<GEPInfo, 4> AddrInfo;
5888 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
5889
5890 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
5891 return std::nullopt;
5892
5893 const GEPInfo &GEPInfo = AddrInfo[0];
5894 Register PtrReg = GEPInfo.SgprParts[0];
5895 std::optional<int64_t> EncodedImm =
5896 AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
5897 if (!EncodedImm)
5898 return std::nullopt;
5899
5900 return {{
5901 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
5902 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
5903 }};
5904}
5905
5907AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
5908 Register Base, SOffset;
5909 bool ScaleOffset;
5910 if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr,
5911 &ScaleOffset))
5912 return std::nullopt;
5913
5914 unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
5915 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5916 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
5917 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}};
5918}
5919
5921AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
5922 Register Base, SOffset;
5923 int64_t Offset;
5924 bool ScaleOffset;
5925 if (!selectSmrdOffset(Root, Base, &SOffset, &Offset, &ScaleOffset))
5926 return std::nullopt;
5927
5928 unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
5929 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5930 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
5931 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
5932 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}};
5933}
5934
5935std::pair<Register, int> AMDGPUInstructionSelector::selectFlatOffsetImpl(
5936 MachineOperand &Root, AMDGPU::FlatAddrSpace FlatVariant) const {
5937 MachineInstr *MI = Root.getParent();
5938
5939 auto Default = std::pair(Root.getReg(), 0);
5940
5941 if (!STI.hasFlatInstOffsets())
5942 return Default;
5943
5944 Register PtrBase;
5945 int64_t ConstOffset;
5946 bool IsInBounds;
5947 std::tie(PtrBase, ConstOffset, IsInBounds) =
5948 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
5949
5950 // Adding the offset to the base address with an immediate in a FLAT
5951 // instruction must not change the memory aperture in which the address falls.
5952 // Therefore we can only fold offsets from inbounds GEPs into FLAT
5953 // instructions.
5954 if (ConstOffset == 0 ||
5955 (FlatVariant == AMDGPU::FlatAddrSpace::FlatScratch &&
5956 !isFlatScratchBaseLegal(Root.getReg())) ||
5957 (FlatVariant == AMDGPU::FlatAddrSpace::FLAT && !IsInBounds))
5958 return Default;
5959
5960 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
5961 if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
5962 return Default;
5963
5964 return std::pair(PtrBase, ConstOffset);
5965}
5966
5968AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
5969 auto PtrWithOffset = selectFlatOffsetImpl(Root, AMDGPU::FlatAddrSpace::FLAT);
5970
5971 return {{
5972 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5973 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5974 }};
5975}
5976
5978AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
5979 auto PtrWithOffset =
5980 selectFlatOffsetImpl(Root, AMDGPU::FlatAddrSpace::FlatGlobal);
5981
5982 return {{
5983 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5984 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5985 }};
5986}
5987
5989AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
5990 auto PtrWithOffset =
5991 selectFlatOffsetImpl(Root, AMDGPU::FlatAddrSpace::FlatScratch);
5992
5993 return {{
5994 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5995 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5996 }};
5997}
5998
// Selects the saddr/voffset[/offset]/cpol operands of a global access.
// CPolBits is OR'ed into the emitted cpol immediate; CPol::SCAL is added when
// the variable offset was recognised as scaled. NeedIOffset controls whether a
// constant offset may be folded into an immediate and whether an `offset`
// renderer is emitted (4 renderers when true, 3 when false).
// Returns std::nullopt when the constant-bus heuristic below prefers not to
// fold, or when the address is an implicit def, a constant, or not an SGPR.
// NOTE(review): this listing skips its own lines 6000, 6017 and 6033 (the
// return type and the trailing argument of two TII calls). Confirm against
// the upstream file before editing.
5999 // Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
6001AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
6002 unsigned CPolBits,
6003 bool NeedIOffset) const {
6004 Register Addr = Root.getReg();
6005 Register PtrBase;
6006 int64_t ConstOffset;
6007 int64_t ImmOffset = 0;
6008
6009 // Match the immediate offset first, which canonically is moved as low as
6010 // possible.
6011 std::tie(PtrBase, ConstOffset, std::ignore) =
6012 getPtrBaseWithConstantOffset(Addr, *MRI);
6013
6014 if (ConstOffset != 0) {
// Legal immediate: strip it from the address and keep matching on the base.
6015 if (NeedIOffset &&
6016 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
6018 Addr = PtrBase;
6019 ImmOffset = ConstOffset;
6020 } else {
6021 auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
6022 if (isSGPR(PtrBaseDef->Reg)) {
6023 if (ConstOffset > 0) {
6024 // Offset is too large.
6025 //
6026 // saddr + large_offset -> saddr +
6027 // (voffset = large_offset & ~MaxOffset) +
6028 // (large_offset & MaxOffset);
// Without an immediate field the whole constant goes into voffset.
6029 int64_t SplitImmOffset = 0, RemainderOffset = ConstOffset;
6030 if (NeedIOffset) {
6031 std::tie(SplitImmOffset, RemainderOffset) =
6032 TII.splitFlatOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
6034 }
6035
// The remainder must fit in 32 bits, signed or unsigned depending on the
// subtarget's hasSignedGVSOffset().
6036 if (Subtarget->hasSignedGVSOffset() ? isInt<32>(RemainderOffset)
6037 : isUInt<32>(RemainderOffset)) {
6038 MachineInstr *MI = Root.getParent();
6039 MachineBasicBlock *MBB = MI->getParent();
6040 Register HighBits =
6041 MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6042
// Emitted immediately, before the renderers run.
6043 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
6044 HighBits)
6045 .addImm(RemainderOffset);
6046
6047 if (NeedIOffset)
6048 return {{
6049 [=](MachineInstrBuilder &MIB) {
6050 MIB.addReg(PtrBase);
6051 }, // saddr
6052 [=](MachineInstrBuilder &MIB) {
6053 MIB.addReg(HighBits);
6054 }, // voffset
6055 [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
6056 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
6057 }};
6058 return {{
6059 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
6060 [=](MachineInstrBuilder &MIB) {
6061 MIB.addReg(HighBits);
6062 }, // voffset
6063 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
6064 }};
6065 }
6066 }
6067
6068 // We are adding a 64 bit SGPR and a constant. If constant bus limit
6069 // is 1 we would need to perform 1 or 2 extra moves for each half of
6070 // the constant and it is better to do a scalar add and then issue a
6071 // single VALU instruction to materialize zero. Otherwise it is less
6072 // instructions to perform VALU adds with immediates or inline literals.
// Counts how many 32-bit halves of the constant are not inline constants.
6073 unsigned NumLiterals =
6074 !TII.isInlineConstant(APInt(32, Lo_32(ConstOffset))) +
6075 !TII.isInlineConstant(APInt(32, Hi_32(ConstOffset)));
6076 if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
6077 return std::nullopt;
6078 }
6079 }
6080 }
6081
6082 // Match the variable offset.
6083 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
6084 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
6085 // Look through the SGPR->VGPR copy.
6086 Register SAddr =
6087 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
6088
6089 if (isSGPR(SAddr)) {
6090 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
6091
6092 // It's possible voffset is an SGPR here, but the copy to VGPR will be
6093 // inserted later.
// selectScaleOffset receives PtrBaseOffset and may rewrite it before the
// extension match below.
6094 bool ScaleOffset = selectScaleOffset(Root, PtrBaseOffset,
6095 Subtarget->hasSignedGVSOffset());
6096 if (Register VOffset = matchExtendFromS32OrS32(
6097 PtrBaseOffset, Subtarget->hasSignedGVSOffset())) {
6098 if (NeedIOffset)
6099 return {{[=](MachineInstrBuilder &MIB) { // saddr
6100 MIB.addReg(SAddr);
6101 },
6102 [=](MachineInstrBuilder &MIB) { // voffset
6103 MIB.addReg(VOffset);
6104 },
6105 [=](MachineInstrBuilder &MIB) { // offset
6106 MIB.addImm(ImmOffset);
6107 },
6108 [=](MachineInstrBuilder &MIB) { // cpol
6109 MIB.addImm(CPolBits |
6110 (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
6111 }}};
6112 return {{[=](MachineInstrBuilder &MIB) { // saddr
6113 MIB.addReg(SAddr);
6114 },
6115 [=](MachineInstrBuilder &MIB) { // voffset
6116 MIB.addReg(VOffset);
6117 },
6118 [=](MachineInstrBuilder &MIB) { // cpol
6119 MIB.addImm(CPolBits |
6120 (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
6121 }}};
6122 }
6123 }
6124 }
6125
6126 // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
6127 // drop this.
6128 if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
6129 AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
6130 return std::nullopt;
6131
6132 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
6133 // moves required to copy a 64-bit SGPR to VGPR.
6134 MachineInstr *MI = Root.getParent();
6135 MachineBasicBlock *MBB = MI->getParent();
6136 Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6137
6138 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
6139 .addImm(0);
6140
6141 if (NeedIOffset)
6142 return {{
6143 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
6144 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
6145 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
6146 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); } // cpol
6147 }};
6148 return {{
6149 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
6150 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
6151 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); } // cpol
6152 }};
6153}
6154
6156AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
6157 return selectGlobalSAddr(Root, 0);
6158}
6159
6161AMDGPUInstructionSelector::selectGlobalSAddrCPol(MachineOperand &Root) const {
6162 const MachineInstr &I = *Root.getParent();
6163
6164 // We are assuming CPol is always the last operand of the intrinsic.
6165 auto PassedCPol =
6166 I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
6167 return selectGlobalSAddr(Root, PassedCPol);
6168}
6169
6171AMDGPUInstructionSelector::selectGlobalSAddrCPolM0(MachineOperand &Root) const {
6172 const MachineInstr &I = *Root.getParent();
6173
6174 // We are assuming CPol is second from last operand of the intrinsic.
6175 auto PassedCPol =
6176 I.getOperand(I.getNumOperands() - 2).getImm() & ~AMDGPU::CPol::SCAL;
6177 return selectGlobalSAddr(Root, PassedCPol);
6178}
6179
6181AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const {
6182 return selectGlobalSAddr(Root, AMDGPU::CPol::GLC);
6183}
6184
6186AMDGPUInstructionSelector::selectGlobalSAddrNoIOffset(
6187 MachineOperand &Root) const {
6188 const MachineInstr &I = *Root.getParent();
6189
6190 // We are assuming CPol is always the last operand of the intrinsic.
6191 auto PassedCPol =
6192 I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
6193 return selectGlobalSAddr(Root, PassedCPol, false);
6194}
6195
6197AMDGPUInstructionSelector::selectGlobalSAddrNoIOffsetM0(
6198 MachineOperand &Root) const {
6199 const MachineInstr &I = *Root.getParent();
6200
6201 // We are assuming CPol is second from last operand of the intrinsic.
6202 auto PassedCPol =
6203 I.getOperand(I.getNumOperands() - 2).getImm() & ~AMDGPU::CPol::SCAL;
6204 return selectGlobalSAddr(Root, PassedCPol, false);
6205}
6206
// Selects saddr + immediate offset for a scratch access. The saddr is either
// a frame index, an SGPR, or a new SGPR holding "frame index + SGPR" computed
// with S_ADD_I32. Returns std::nullopt when the resulting saddr is not an
// SGPR.
// NOTE(review): this listing skips its own lines 6207 and 6221 (the return
// type and the trailing argument of the isLegalFLATOffset call). Confirm
// against the upstream file before editing.
6208AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
6209 Register Addr = Root.getReg();
6210 Register PtrBase;
6211 int64_t ConstOffset;
6212 int64_t ImmOffset = 0;
6213
6214 // Match the immediate offset first, which canonically is moved as low as
6215 // possible.
6216 std::tie(PtrBase, ConstOffset, std::ignore) =
6217 getPtrBaseWithConstantOffset(Addr, *MRI);
6218
// Fold the constant only if the base is legal for scratch and the offset is
// legal for the private address space.
6219 if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) &&
6220 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
6222 Addr = PtrBase;
6223 ImmOffset = ConstOffset;
6224 }
6225
6226 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
// A bare frame index is rendered directly as the saddr operand.
6227 if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
6228 int FI = AddrDef->MI->getOperand(1).getIndex();
6229 return {{
6230 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
6231 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
6232 }};
6233 }
6234
6235 Register SAddr = AddrDef->Reg;
6236
6237 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
6238 Register LHS = AddrDef->MI->getOperand(1).getReg();
6239 Register RHS = AddrDef->MI->getOperand(2).getReg();
6240 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
6241 auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);
6242
// frame index + SGPR: emit the scalar add now and use its result as saddr.
6243 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
6244 isSGPR(RHSDef->Reg)) {
6245 int FI = LHSDef->MI->getOperand(1).getIndex();
6246 MachineInstr &I = *Root.getParent();
6247 MachineBasicBlock *BB = I.getParent();
6248 const DebugLoc &DL = I.getDebugLoc();
6249 SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6250
6251 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
6252 .addFrameIndex(FI)
6253 .addReg(RHSDef->Reg)
6254 .setOperandDead(3); // Dead scc
6255 }
6256 }
6257
6258 if (!isSGPR(SAddr))
6259 return std::nullopt;
6260
6261 return {{
6262 [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
6263 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
6264 }};
6265}
6266
6267// Check whether the flat scratch SVS swizzle bug affects this access.
6268bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
6269 Register VAddr, Register SAddr, uint64_t ImmOffset) const {
6270 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
6271 return false;
6272
6273 // The bug affects the swizzling of SVS accesses if there is any carry out
6274 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
6275 // voffset to (soffset + inst_offset).
6276 auto VKnown = VT->getKnownBits(VAddr);
6277 auto SKnown = KnownBits::add(VT->getKnownBits(SAddr),
6278 KnownBits::makeConstant(APInt(32, ImmOffset)));
6279 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
6280 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
6281 return (VMax & 3) + (SMax & 3) >= 4;
6282}
6283
// Selects vaddr + saddr + immediate offset + cpol for a scratch access whose
// address is a G_PTR_ADD with a VGPR-bank right-hand side. The saddr is either
// a frame index or an SGPR. Returns std::nullopt when the address does not
// have that shape, the base is not legal for scratch, or the SVS swizzle bug
// check fires.
// NOTE(review): this listing skips its own lines 6284, 6299 and 6327 (the
// return type, the trailing argument of the isLegalFLATOffset call, and the
// true-arm of the CPol conditional). Confirm against the upstream file before
// editing.
6285AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
6286 Register Addr = Root.getReg();
6287 Register PtrBase;
6288 int64_t ConstOffset;
6289 int64_t ImmOffset = 0;
6290
6291 // Match the immediate offset first, which canonically is moved as low as
6292 // possible.
6293 std::tie(PtrBase, ConstOffset, std::ignore) =
6294 getPtrBaseWithConstantOffset(Addr, *MRI);
6295
// OrigAddr remembers the unsplit address so the legality check below can tell
// whether an immediate was folded.
6296 Register OrigAddr = Addr;
6297 if (ConstOffset != 0 &&
6298 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
6300 Addr = PtrBase;
6301 ImmOffset = ConstOffset;
6302 }
6303
6304 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
6305 if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
6306 return std::nullopt;
6307
// The right-hand operand becomes vaddr, so it must live in the VGPR bank.
6308 Register RHS = AddrDef->MI->getOperand(2).getReg();
6309 if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
6310 return std::nullopt;
6311
6312 Register LHS = AddrDef->MI->getOperand(1).getReg();
6313 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
6314
// Use the SGPR + VGPR + Imm check when an immediate was folded, the
// SGPR + VGPR check otherwise.
6315 if (OrigAddr != Addr) {
6316 if (!isFlatScratchBaseLegalSVImm(OrigAddr))
6317 return std::nullopt;
6318 } else {
6319 if (!isFlatScratchBaseLegalSV(OrigAddr))
6320 return std::nullopt;
6321 }
6322
6323 if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
6324 return std::nullopt;
6325
// selectScaleOffset receives RHS and may rewrite it to the unscaled value.
6326 unsigned CPol = selectScaleOffset(Root, RHS, true /* IsSigned */)
6328 : 0;
6329
6330 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
6331 int FI = LHSDef->MI->getOperand(1).getIndex();
6332 return {{
6333 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
6334 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
6335 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
6336 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); } // cpol
6337 }};
6338 }
6339
// If LHS itself is not an SGPR, retry with the register found behind copies.
6340 if (!isSGPR(LHS))
6341 if (auto Def = getDefSrcRegIgnoringCopies(LHS, *MRI))
6342 LHS = Def->Reg;
6343
6344 if (!isSGPR(LHS))
6345 return std::nullopt;
6346
6347 return {{
6348 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
6349 [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr
6350 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
6351 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); } // cpol
6352 }};
6353}
6354
// Selects the rsrc / vaddr / soffset / offset operands of a scratch MUBUF
// access. rsrc is the function's scratch resource register and soffset is
// always rendered as immediate 0. A constant address is split between a
// V_MOV_B32 into vaddr and the instruction's offset field; otherwise a frame
// index and/or a legal constant offset is folded where possible.
// NOTE(review): this listing skips its own lines 6355 and 6364 (the return
// type and the rest of the constant-address condition). Confirm against the
// upstream file before editing.
6356AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
6357 MachineInstr *MI = Root.getParent();
6358 MachineBasicBlock *MBB = MI->getParent();
6359 MachineFunction *MF = MBB->getParent();
6360 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
6361
6362 int64_t Offset = 0;
6363 if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
6365 Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6366
6367 // TODO: Should this be inside the render function? The iterator seems to
6368 // move.
// Bits above the maximum MUBUF immediate go to the VGPR; the rest go into the
// offset field below.
6369 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
6370 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
6371 HighBits)
6372 .addImm(Offset & ~MaxOffset);
6373
6374 return {{[=](MachineInstrBuilder &MIB) { // rsrc
6375 MIB.addReg(Info->getScratchRSrcReg());
6376 },
6377 [=](MachineInstrBuilder &MIB) { // vaddr
6378 MIB.addReg(HighBits);
6379 },
6380 [=](MachineInstrBuilder &MIB) { // soffset
6381 // Use constant zero for soffset and rely on eliminateFrameIndex
6382 // to choose the appropriate frame register if need be.
6383 MIB.addImm(0);
6384 },
6385 [=](MachineInstrBuilder &MIB) { // offset
6386 MIB.addImm(Offset & MaxOffset);
6387 }}};
6388 }
6389
// NOTE(review): presumably these are the only values Offset can hold when the
// branch above is not taken; the deciding condition is on the missing line.
6390 assert(Offset == 0 || Offset == -1);
6391
6392 // Try to fold a frame index directly into the MUBUF vaddr field, and any
6393 // offsets.
6394 std::optional<int> FI;
6395 Register VAddr = Root.getReg();
6396
6397 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
6398 Register PtrBase;
6399 int64_t ConstOffset;
6400 std::tie(PtrBase, ConstOffset, std::ignore) =
6401 getPtrBaseWithConstantOffset(VAddr, *MRI);
6402 if (ConstOffset != 0) {
// When private memory is range checked, the base must additionally have a
// known-zero sign bit for the fold to be taken.
6403 if (TII.isLegalMUBUFImmOffset(ConstOffset) &&
6404 (!STI.privateMemoryResourceIsRangeChecked() ||
6405 VT->signBitIsZero(PtrBase))) {
6406 const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
6407 if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
6408 FI = PtrBaseDef->getOperand(1).getIndex();
6409 else
6410 VAddr = PtrBase;
6411 Offset = ConstOffset;
6412 }
6413 } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
6414 FI = RootDef->getOperand(1).getIndex();
6415 }
6416
6417 return {{[=](MachineInstrBuilder &MIB) { // rsrc
6418 MIB.addReg(Info->getScratchRSrcReg());
6419 },
6420 [=](MachineInstrBuilder &MIB) { // vaddr
6421 if (FI)
6422 MIB.addFrameIndex(*FI);
6423 else
6424 MIB.addReg(VAddr);
6425 },
6426 [=](MachineInstrBuilder &MIB) { // soffset
6427 // Use constant zero for soffset and rely on eliminateFrameIndex
6428 // to choose the appropriate frame register if need be.
6429 MIB.addImm(0);
6430 },
6431 [=](MachineInstrBuilder &MIB) { // offset
6432 MIB.addImm(Offset);
6433 }}};
6434}
6435
6436bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
6437 int64_t Offset) const {
6438 if (!isUInt<16>(Offset))
6439 return false;
6440
6441 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
6442 return true;
6443
6444 // On Southern Islands instruction with a negative base value and an offset
6445 // don't seem to work.
6446 return VT->signBitIsZero(Base);
6447}
6448
6449bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
6450 int64_t Offset1,
6451 unsigned Size) const {
6452 if (Offset0 % Size != 0 || Offset1 % Size != 0)
6453 return false;
6454 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
6455 return false;
6456
6457 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
6458 return true;
6459
6460 // On Southern Islands instruction with a negative base value and an offset
6461 // don't seem to work.
6462 return VT->signBitIsZero(Base);
6463}
6464
// A G_OR is always accepted; a G_PTR_ADD is accepted subject to a further
// condition.
// NOTE(review): this listing skips its own line 6469, which holds that
// further condition and the end of the return statement. Confirm against the
// upstream file before editing.
6465 // Return whether the operation has NoUnsignedWrap property.
6466static bool isNoUnsignedWrap(MachineInstr *Addr) {
6467 return Addr->getOpcode() == TargetOpcode::G_OR ||
6468 (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&
6470}
6471
// Accepts the address outright when its defining instruction passes
// isNoUnsignedWrap or the subtarget has signed scratch offsets. Otherwise a
// G_PTR_ADD with a small negative constant right-hand side is accepted, and
// the final fallback requires the left operand's sign bit to be known zero.
// NOTE(review): this listing skips its own line 6491, which holds the
// initialiser of RhsValReg. Confirm against the upstream file before editing.
6472 // Check that the base address of flat scratch load/store in the form of `base +
6473 // offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
6474 // requirement). We always treat the first operand as the base address here.
6475bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const {
6476 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
6477
6478 if (isNoUnsignedWrap(AddrMI))
6479 return true;
6480
6481 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6482 // values.
6483 if (STI.hasSignedScratchOffsets())
6484 return true;
6485
// Operands 1 and 2 are read before the opcode check below.
6486 Register LHS = AddrMI->getOperand(1).getReg();
6487 Register RHS = AddrMI->getOperand(2).getReg();
6488
6489 if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
6490 std::optional<ValueAndVReg> RhsValReg =
6492 // If the immediate offset is negative and within certain range, the base
6493 // address cannot also be negative. If the base is also negative, the sum
6494 // would be either negative or much larger than the valid range of scratch
6495 // memory a thread can access.
6496 if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 &&
6497 RhsValReg->Value.getSExtValue() > -0x40000000)
6498 return true;
6499 }
6500
6501 return VT->signBitIsZero(LHS);
6502}
6503
6504// Check address value in SGPR/VGPR are legal for flat scratch in the form
6505// of: SGPR + VGPR.
6506bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
6507 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
6508
6509 if (isNoUnsignedWrap(AddrMI))
6510 return true;
6511
6512 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6513 // values.
6514 if (STI.hasSignedScratchOffsets())
6515 return true;
6516
6517 Register LHS = AddrMI->getOperand(1).getReg();
6518 Register RHS = AddrMI->getOperand(2).getReg();
6519 return VT->signBitIsZero(RHS) && VT->signBitIsZero(LHS);
6520}
6521
6522// Check address value in SGPR/VGPR are legal for flat scratch in the form
6523// of: SGPR + VGPR + Imm.
6524bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm(
6525 Register Addr) const {
6526 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6527 // values.
6528 if (STI.hasSignedScratchOffsets())
6529 return true;
6530
6531 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
6532 Register Base = AddrMI->getOperand(1).getReg();
6533 std::optional<DefinitionAndSourceRegister> BaseDef =
6535 std::optional<ValueAndVReg> RHSOffset =
6537 assert(RHSOffset);
6538
6539 // If the immediate offset is negative and within certain range, the base
6540 // address cannot also be negative. If the base is also negative, the sum
6541 // would be either negative or much larger than the valid range of scratch
6542 // memory a thread can access.
6543 if (isNoUnsignedWrap(BaseDef->MI) &&
6544 (isNoUnsignedWrap(AddrMI) ||
6545 (RHSOffset->Value.getSExtValue() < 0 &&
6546 RHSOffset->Value.getSExtValue() > -0x40000000)))
6547 return true;
6548
6549 Register LHS = BaseDef->MI->getOperand(1).getReg();
6550 Register RHS = BaseDef->MI->getOperand(2).getReg();
6551 return VT->signBitIsZero(RHS) && VT->signBitIsZero(LHS);
6552}
6553
6554bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
6555 unsigned ShAmtBits) const {
6556 assert(MI.getOpcode() == TargetOpcode::G_AND);
6557
6558 std::optional<APInt> RHS =
6559 getIConstantVRegVal(MI.getOperand(2).getReg(), *MRI);
6560 if (!RHS)
6561 return false;
6562
6563 if (RHS->countr_one() >= ShAmtBits)
6564 return true;
6565
6566 const APInt &LHSKnownZeros = VT->getKnownZeroes(MI.getOperand(1).getReg());
6567 return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits;
6568}
6569
6571AMDGPUInstructionSelector::selectMUBUFScratchOffset(
6572 MachineOperand &Root) const {
6573 Register Reg = Root.getReg();
6574 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
6575
6576 std::optional<DefinitionAndSourceRegister> Def =
6578 assert(Def && "this shouldn't be an optional result");
6579 Reg = Def->Reg;
6580
6581 if (Register WaveBase = getWaveAddress(Def->MI)) {
6582 return {{
6583 [=](MachineInstrBuilder &MIB) { // rsrc
6584 MIB.addReg(Info->getScratchRSrcReg());
6585 },
6586 [=](MachineInstrBuilder &MIB) { // soffset
6587 MIB.addReg(WaveBase);
6588 },
6589 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset
6590 }};
6591 }
6592
6593 int64_t Offset = 0;
6594
6595 // FIXME: Copy check is a hack
6597 if (mi_match(Reg, *MRI,
6598 m_GPtrAdd(m_Reg(BasePtr),
6600 if (!TII.isLegalMUBUFImmOffset(Offset))
6601 return {};
6602 MachineInstr *BasePtrDef = getDefIgnoringCopies(BasePtr, *MRI);
6603 Register WaveBase = getWaveAddress(BasePtrDef);
6604 if (!WaveBase)
6605 return {};
6606
6607 return {{
6608 [=](MachineInstrBuilder &MIB) { // rsrc
6609 MIB.addReg(Info->getScratchRSrcReg());
6610 },
6611 [=](MachineInstrBuilder &MIB) { // soffset
6612 MIB.addReg(WaveBase);
6613 },
6614 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
6615 }};
6616 }
6617
6618 if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
6619 !TII.isLegalMUBUFImmOffset(Offset))
6620 return {};
6621
6622 return {{
6623 [=](MachineInstrBuilder &MIB) { // rsrc
6624 MIB.addReg(Info->getScratchRSrcReg());
6625 },
6626 [=](MachineInstrBuilder &MIB) { // soffset
6627 MIB.addImm(0);
6628 },
6629 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
6630 }};
6631}
6632
6633std::pair<Register, unsigned>
6634AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
6635 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
6636 int64_t ConstAddr = 0;
6637
6638 Register PtrBase;
6639 int64_t Offset;
6640 std::tie(PtrBase, Offset, std::ignore) =
6641 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
6642
6643 if (Offset) {
6644 if (isDSOffsetLegal(PtrBase, Offset)) {
6645 // (add n0, c0)
6646 return std::pair(PtrBase, Offset);
6647 }
6648 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
6649 // TODO
6650
6651
6652 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
6653 // TODO
6654
6655 }
6656
6657 return std::pair(Root.getReg(), 0);
6658}
6659
6661AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
6662 Register Reg;
6663 unsigned Offset;
6664 std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
6665 return {{
6666 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
6667 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
6668 }};
6669}
6670
6672AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
6673 return selectDSReadWrite2(Root, 4);
6674}
6675
6677AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
6678 return selectDSReadWrite2(Root, 8);
6679}
6680
6682AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
6683 unsigned Size) const {
6684 Register Reg;
6685 unsigned Offset;
6686 std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
6687 return {{
6688 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
6689 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
6690 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
6691 }};
6692}
6693
6694std::pair<Register, unsigned>
6695AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
6696 unsigned Size) const {
6697 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
6698 int64_t ConstAddr = 0;
6699
6700 Register PtrBase;
6701 int64_t Offset;
6702 std::tie(PtrBase, Offset, std::ignore) =
6703 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
6704
6705 if (Offset) {
6706 int64_t OffsetValue0 = Offset;
6707 int64_t OffsetValue1 = Offset + Size;
6708 if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
6709 // (add n0, c0)
6710 return std::pair(PtrBase, OffsetValue0 / Size);
6711 }
6712 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
6713 // TODO
6714
6715 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
6716 // TODO
6717
6718 }
6719
6720 return std::pair(Root.getReg(), 0);
6721}
6722
6723/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
6724/// the base value with the constant offset, and if the offset computation is
6725/// known to be inbounds. There may be intervening copies between \p Root and
6726/// the identified constant. Returns \p Root, 0, false if this does not match
6727/// the pattern.
6728std::tuple<Register, int64_t, bool>
6729AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
6730 Register Root, const MachineRegisterInfo &MRI) const {
6731 MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
6732 if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
6733 return {Root, 0, false};
6734
6735 MachineOperand &RHS = RootI->getOperand(2);
6736 std::optional<ValueAndVReg> MaybeOffset =
6738 if (!MaybeOffset)
6739 return {Root, 0, false};
6740 bool IsInBounds = RootI->getFlag(MachineInstr::MIFlag::InBounds);
6741 return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue(),
6742 IsInBounds};
6743}
6744
6746 MIB.addImm(0);
6747}
6748
6749/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
6750/// BasePtr is not valid, a null base pointer will be used.
6752 uint32_t FormatLo, uint32_t FormatHi,
6753 Register BasePtr) {
6754 Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6755 Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6756 Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6757 Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
6758
6759 B.buildInstr(AMDGPU::S_MOV_B32)
6760 .addDef(RSrc2)
6761 .addImm(FormatLo);
6762 B.buildInstr(AMDGPU::S_MOV_B32)
6763 .addDef(RSrc3)
6764 .addImm(FormatHi);
6765
6766 // Build the half of the subregister with the constants before building the
6767 // full 128-bit register. If we are building multiple resource descriptors,
6768 // this will allow CSEing of the 2-component register.
6769 B.buildInstr(AMDGPU::REG_SEQUENCE)
6770 .addDef(RSrcHi)
6771 .addReg(RSrc2)
6772 .addImm(AMDGPU::sub0)
6773 .addReg(RSrc3)
6774 .addImm(AMDGPU::sub1);
6775
6776 Register RSrcLo = BasePtr;
6777 if (!BasePtr) {
6778 RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6779 B.buildInstr(AMDGPU::S_MOV_B64)
6780 .addDef(RSrcLo)
6781 .addImm(0);
6782 }
6783
6784 B.buildInstr(AMDGPU::REG_SEQUENCE)
6785 .addDef(RSrc)
6786 .addReg(RSrcLo)
6787 .addImm(AMDGPU::sub0_sub1)
6788 .addReg(RSrcHi)
6789 .addImm(AMDGPU::sub2_sub3);
6790
6791 return RSrc;
6792}
6793
6795 const SIInstrInfo &TII, Register BasePtr) {
6796 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
6797
6798 // FIXME: Why are half the "default" bits ignored based on the addressing
6799 // mode?
6800 return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
6801}
6802
6804 const SIInstrInfo &TII, Register BasePtr) {
6805 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
6806
6807 // FIXME: Why are half the "default" bits ignored based on the addressing
6808 // mode?
6809 return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
6810}
6811
6812AMDGPUInstructionSelector::MUBUFAddressData
6813AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
6814 MUBUFAddressData Data;
6815 Data.N0 = Src;
6816
6817 Register PtrBase;
6818 int64_t Offset;
6819
6820 std::tie(PtrBase, Offset, std::ignore) =
6821 getPtrBaseWithConstantOffset(Src, *MRI);
6822 if (isUInt<32>(Offset)) {
6823 Data.N0 = PtrBase;
6824 Data.Offset = Offset;
6825 }
6826
6827 if (MachineInstr *InputAdd
6828 = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
6829 Data.N2 = InputAdd->getOperand(1).getReg();
6830 Data.N3 = InputAdd->getOperand(2).getReg();
6831
6832 // FIXME: Need to fix extra SGPR->VGPRcopies inserted
6833 // FIXME: Don't know this was defined by operand 0
6834 //
6835 // TODO: Remove this when we have copy folding optimizations after
6836 // RegBankSelect.
6837 Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
6838 Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
6839 }
6840
6841 return Data;
6842}
6843
6844/// Return if the addr64 mubuf mode should be used for the given address.
6845bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
6846 // (ptr_add N2, N3) -> addr64, or
6847 // (ptr_add (ptr_add N2, N3), C1) -> addr64
6848 if (Addr.N2)
6849 return true;
6850
6851 const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
6852 return N0Bank->getID() == AMDGPU::VGPRRegBankID;
6853}
6854
6855/// Split an immediate offset \p ImmOffset depending on whether it fits in the
6856/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
6857/// component.
6858void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
6859 MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
6860 if (TII.isLegalMUBUFImmOffset(ImmOffset))
6861 return;
6862
6863 // Illegal offset, store it in soffset.
6864 SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6865 B.buildInstr(AMDGPU::S_MOV_B32)
6866 .addDef(SOffset)
6867 .addImm(ImmOffset);
6868 ImmOffset = 0;
6869}
6870
6871bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
6872 MachineOperand &Root, Register &VAddr, Register &RSrcReg,
6873 Register &SOffset, int64_t &Offset) const {
6874 // FIXME: Predicates should stop this from reaching here.
6875 // addr64 bit was removed for volcanic islands.
6876 if (!STI.hasAddr64() || STI.useFlatForGlobal())
6877 return false;
6878
6879 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
6880 if (!shouldUseAddr64(AddrData))
6881 return false;
6882
6883 Register N0 = AddrData.N0;
6884 Register N2 = AddrData.N2;
6885 Register N3 = AddrData.N3;
6886 Offset = AddrData.Offset;
6887
6888 // Base pointer for the SRD.
6889 Register SRDPtr;
6890
6891 if (N2) {
6892 if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6893 assert(N3);
6894 if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6895 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
6896 // addr64, and construct the default resource from a 0 address.
6897 VAddr = N0;
6898 } else {
6899 SRDPtr = N3;
6900 VAddr = N2;
6901 }
6902 } else {
6903 // N2 is not divergent.
6904 SRDPtr = N2;
6905 VAddr = N3;
6906 }
6907 } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6908 // Use the default null pointer in the resource
6909 VAddr = N0;
6910 } else {
6911 // N0 -> offset, or
6912 // (N0 + C1) -> offset
6913 SRDPtr = N0;
6914 }
6915
6916 MachineIRBuilder B(*Root.getParent());
6917 RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
6918 splitIllegalMUBUFOffset(B, SOffset, Offset);
6919 return true;
6920}
6921
6922bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
6923 MachineOperand &Root, Register &RSrcReg, Register &SOffset,
6924 int64_t &Offset) const {
6925
6926 // FIXME: Pattern should not reach here.
6927 if (STI.useFlatForGlobal())
6928 return false;
6929
6930 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
6931 if (shouldUseAddr64(AddrData))
6932 return false;
6933
6934 // N0 -> offset, or
6935 // (N0 + C1) -> offset
6936 Register SRDPtr = AddrData.N0;
6937 Offset = AddrData.Offset;
6938
6939 // TODO: Look through extensions for 32-bit soffset.
6940 MachineIRBuilder B(*Root.getParent());
6941
6942 RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
6943 splitIllegalMUBUFOffset(B, SOffset, Offset);
6944 return true;
6945}
6946
6948AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
6949 Register VAddr;
6950 Register RSrcReg;
6951 Register SOffset;
6952 int64_t Offset = 0;
6953
6954 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
6955 return {};
6956
6957 // FIXME: Use defaulted operands for trailing 0s and remove from the complex
6958 // pattern.
6959 return {{
6960 [=](MachineInstrBuilder &MIB) { // rsrc
6961 MIB.addReg(RSrcReg);
6962 },
6963 [=](MachineInstrBuilder &MIB) { // vaddr
6964 MIB.addReg(VAddr);
6965 },
6966 [=](MachineInstrBuilder &MIB) { // soffset
6967 if (SOffset)
6968 MIB.addReg(SOffset);
6969 else if (STI.hasRestrictedSOffset())
6970 MIB.addReg(AMDGPU::SGPR_NULL);
6971 else
6972 MIB.addImm(0);
6973 },
6974 [=](MachineInstrBuilder &MIB) { // offset
6975 MIB.addImm(Offset);
6976 },
6977 addZeroImm, // cpol
6978 addZeroImm, // tfe
6979 addZeroImm // swz
6980 }};
6981}
6982
6984AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
6985 Register RSrcReg;
6986 Register SOffset;
6987 int64_t Offset = 0;
6988
6989 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
6990 return {};
6991
6992 return {{
6993 [=](MachineInstrBuilder &MIB) { // rsrc
6994 MIB.addReg(RSrcReg);
6995 },
6996 [=](MachineInstrBuilder &MIB) { // soffset
6997 if (SOffset)
6998 MIB.addReg(SOffset);
6999 else if (STI.hasRestrictedSOffset())
7000 MIB.addReg(AMDGPU::SGPR_NULL);
7001 else
7002 MIB.addImm(0);
7003 },
7004 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
7005 addZeroImm, // cpol
7006 addZeroImm, // tfe
7007 addZeroImm, // swz
7008 }};
7009}
7010
7012AMDGPUInstructionSelector::selectBUFSOffset(MachineOperand &Root) const {
7013
7014 Register SOffset = Root.getReg();
7015
7016 if (STI.hasRestrictedSOffset() && mi_match(SOffset, *MRI, m_ZeroInt()))
7017 SOffset = AMDGPU::SGPR_NULL;
7018
7019 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
7020}
7021
7022/// Get an immediate that must be 32-bits, and treated as zero extended.
7023static std::optional<uint64_t>
7025 // getIConstantVRegVal sexts any values, so see if that matters.
7026 std::optional<int64_t> OffsetVal = getIConstantVRegSExtVal(Reg, MRI);
7027 if (!OffsetVal || !isInt<32>(*OffsetVal))
7028 return std::nullopt;
7029 return Lo_32(*OffsetVal);
7030}
7031
7033AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
7034 std::optional<uint64_t> OffsetVal =
7035 Root.isImm() ? Root.getImm() : getConstantZext32Val(Root.getReg(), *MRI);
7036 if (!OffsetVal)
7037 return {};
7038
7039 std::optional<int64_t> EncodedImm =
7040 AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
7041 if (!EncodedImm)
7042 return {};
7043
7044 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
7045}
7046
7048AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
7049 assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
7050
7051 std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
7052 if (!OffsetVal)
7053 return {};
7054
7055 std::optional<int64_t> EncodedImm =
7057 if (!EncodedImm)
7058 return {};
7059
7060 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
7061}
7062
7064AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
7065 // Match the (soffset + offset) pair as a 32-bit register base and
7066 // an immediate offset.
7067 Register SOffset;
7068 unsigned Offset;
7069 std::tie(SOffset, Offset) = AMDGPU::getBaseWithConstantOffset(
7070 *MRI, Root.getReg(), VT, /*CheckNUW*/ true);
7071 if (!SOffset)
7072 return std::nullopt;
7073
7074 std::optional<int64_t> EncodedOffset =
7075 AMDGPU::getSMRDEncodedOffset(STI, Offset, /* IsBuffer */ true);
7076 if (!EncodedOffset)
7077 return std::nullopt;
7078
7079 assert(MRI->getType(SOffset) == LLT::scalar(32));
7080 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
7081 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
7082}
7083
7084std::pair<Register, unsigned>
7085AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
7086 bool &Matched) const {
7087 Matched = false;
7088
7089 Register Src;
7090 unsigned Mods;
7091 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
7092
7093 if (mi_match(Src, *MRI, m_GFPExt(m_Reg(Src)))) {
7094 assert(MRI->getType(Src) == LLT::scalar(16));
7095
7096 // Only change Src if src modifier could be gained. In such cases new Src
7097 // could be sgpr but this does not violate constant bus restriction for
7098 // instruction that is being selected.
7099 Src = stripBitCast(Src, *MRI);
7100
7101 const auto CheckAbsNeg = [&]() {
7102 // Be careful about folding modifiers if we already have an abs. fneg is
7103 // applied last, so we don't want to apply an earlier fneg.
7104 if ((Mods & SISrcMods::ABS) == 0) {
7105 unsigned ModsTmp;
7106 std::tie(Src, ModsTmp) = selectVOP3ModsImpl(Src);
7107
7108 if ((ModsTmp & SISrcMods::NEG) != 0)
7109 Mods ^= SISrcMods::NEG;
7110
7111 if ((ModsTmp & SISrcMods::ABS) != 0)
7112 Mods |= SISrcMods::ABS;
7113 }
7114 };
7115
7116 CheckAbsNeg();
7117
7118 // op_sel/op_sel_hi decide the source type and source.
7119 // If the source's op_sel_hi is set, it indicates to do a conversion from
7120 // fp16. If the sources's op_sel is set, it picks the high half of the
7121 // source register.
7122
7123 Mods |= SISrcMods::OP_SEL_1;
7124
7125 if (isExtractHiElt(*MRI, Src, Src)) {
7126 Mods |= SISrcMods::OP_SEL_0;
7127 CheckAbsNeg();
7128 }
7129
7130 Matched = true;
7131 }
7132
7133 return {Src, Mods};
7134}
7135
7137AMDGPUInstructionSelector::selectVOP3PMadMixModsExt(
7138 MachineOperand &Root) const {
7139 Register Src;
7140 unsigned Mods;
7141 bool Matched;
7142 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
7143 if (!Matched)
7144 return {};
7145
7146 return {{
7147 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
7148 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
7149 }};
7150}
7151
7153AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
7154 Register Src;
7155 unsigned Mods;
7156 bool Matched;
7157 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
7158
7159 return {{
7160 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
7161 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
7162 }};
7163}
7164
7165bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
7166 MachineInstr &I, Intrinsic::ID IntrID) const {
7167 MachineBasicBlock *MBB = I.getParent();
7168 const DebugLoc &DL = I.getDebugLoc();
7169 Register CCReg = I.getOperand(0).getReg();
7170
7171 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
7172 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_CMP_EQ_U32)).addImm(0).addImm(0);
7173
7174 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
7175 .addImm(I.getOperand(2).getImm());
7176
7177 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
7178
7179 I.eraseFromParent();
7180 return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
7181 *MRI);
7182}
7183
7184bool AMDGPUInstructionSelector::selectSGetBarrierState(
7185 MachineInstr &I, Intrinsic::ID IntrID) const {
7186 MachineBasicBlock *MBB = I.getParent();
7187 const DebugLoc &DL = I.getDebugLoc();
7188 const MachineOperand &BarOp = I.getOperand(2);
7189 std::optional<int64_t> BarValImm =
7190 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
7191
7192 if (!BarValImm) {
7193 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
7194 .addReg(BarOp.getReg());
7195 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
7196 }
7197 MachineInstrBuilder MIB;
7198 unsigned Opc = BarValImm ? AMDGPU::S_GET_BARRIER_STATE_IMM
7199 : AMDGPU::S_GET_BARRIER_STATE_M0;
7200 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
7201
7202 auto DstReg = I.getOperand(0).getReg();
7203 const TargetRegisterClass *DstRC =
7204 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
7205 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
7206 return false;
7207 MIB.addDef(DstReg);
7208 if (BarValImm) {
7209 MIB.addImm(*BarValImm);
7210 }
7211 I.eraseFromParent();
7212 return true;
7213}
7214
7215unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
7216 if (HasInlineConst) {
7217 switch (IntrID) {
7218 default:
7219 llvm_unreachable("not a named barrier op");
7220 case Intrinsic::amdgcn_s_barrier_join:
7221 return AMDGPU::S_BARRIER_JOIN_IMM;
7222 case Intrinsic::amdgcn_s_wakeup_barrier:
7223 return AMDGPU::S_WAKEUP_BARRIER_IMM;
7224 case Intrinsic::amdgcn_s_get_named_barrier_state:
7225 return AMDGPU::S_GET_BARRIER_STATE_IMM;
7226 };
7227 } else {
7228 switch (IntrID) {
7229 default:
7230 llvm_unreachable("not a named barrier op");
7231 case Intrinsic::amdgcn_s_barrier_join:
7232 return AMDGPU::S_BARRIER_JOIN_M0;
7233 case Intrinsic::amdgcn_s_wakeup_barrier:
7234 return AMDGPU::S_WAKEUP_BARRIER_M0;
7235 case Intrinsic::amdgcn_s_get_named_barrier_state:
7236 return AMDGPU::S_GET_BARRIER_STATE_M0;
7237 };
7238 }
7239}
7240
7241bool AMDGPUInstructionSelector::selectNamedBarrierInit(
7242 MachineInstr &I, Intrinsic::ID IntrID) const {
7243 MachineBasicBlock *MBB = I.getParent();
7244 const DebugLoc &DL = I.getDebugLoc();
7245 const MachineOperand &BarOp = I.getOperand(1);
7246 const MachineOperand &CntOp = I.getOperand(2);
7247
7248 // A member count of 0 means "keep existing member count". That plus a known
7249 // constant value for the barrier ID lets us use the immarg form.
7250 if (IntrID == Intrinsic::amdgcn_s_barrier_signal_var) {
7251 std::optional<int64_t> CntImm =
7252 getIConstantVRegSExtVal(CntOp.getReg(), *MRI);
7253 if (CntImm && *CntImm == 0) {
7254 std::optional<int64_t> BarValImm =
7255 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
7256 if (BarValImm) {
7257 auto BarID = ((*BarValImm) >> 4) & 0x3F;
7258 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
7259 .addImm(BarID);
7260 I.eraseFromParent();
7261 return true;
7262 }
7263 }
7264 }
7265
7266 // BarID = (BarOp >> 4) & 0x3F
7267 Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7268 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
7269 .add(BarOp)
7270 .addImm(4u)
7271 .setOperandDead(3); // Dead scc
7272
7273 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7274 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
7275 .addReg(TmpReg0)
7276 .addImm(0x3F)
7277 .setOperandDead(3); // Dead scc
7278
7279 // MO = ((CntOp & 0x3F) << shAmt) | BarID
7280 Register TmpReg2 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7281 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg2)
7282 .add(CntOp)
7283 .addImm(0x3F)
7284 .setOperandDead(3); // Dead scc
7285
7286 Register TmpReg3 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7287 constexpr unsigned ShAmt = 16;
7288 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg3)
7289 .addReg(TmpReg2)
7290 .addImm(ShAmt)
7291 .setOperandDead(3); // Dead scc
7292
7293 Register TmpReg4 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7294 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg4)
7295 .addReg(TmpReg1)
7296 .addReg(TmpReg3)
7297 .setOperandDead(3); // Dead scc;
7298
7299 auto CopyMIB =
7300 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(TmpReg4);
7301 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
7302
7303 unsigned Opc = IntrID == Intrinsic::amdgcn_s_barrier_init
7304 ? AMDGPU::S_BARRIER_INIT_M0
7305 : AMDGPU::S_BARRIER_SIGNAL_M0;
7306 MachineInstrBuilder MIB;
7307 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
7308
7309 I.eraseFromParent();
7310 return true;
7311}
7312
7313bool AMDGPUInstructionSelector::selectNamedBarrierInst(
7314 MachineInstr &I, Intrinsic::ID IntrID) const {
7315 MachineBasicBlock *MBB = I.getParent();
7316 const DebugLoc &DL = I.getDebugLoc();
7317 MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_named_barrier_state
7318 ? I.getOperand(2)
7319 : I.getOperand(1);
7320 std::optional<int64_t> BarValImm =
7321 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
7322
7323 if (!BarValImm) {
7324 // BarID = (BarOp >> 4) & 0x3F
7325 Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7326 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
7327 .addReg(BarOp.getReg())
7328 .addImm(4u)
7329 .setOperandDead(3); // Dead scc;
7330
7331 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7332 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
7333 .addReg(TmpReg0)
7334 .addImm(0x3F)
7335 .setOperandDead(3); // Dead scc;
7336
7337 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
7338 .addReg(TmpReg1);
7339 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
7340 }
7341
7342 MachineInstrBuilder MIB;
7343 unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID);
7344 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
7345
7346 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
7347 auto DstReg = I.getOperand(0).getReg();
7348 const TargetRegisterClass *DstRC =
7349 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
7350 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
7351 return false;
7352 MIB.addDef(DstReg);
7353 }
7354
7355 if (BarValImm) {
7356 auto BarId = ((*BarValImm) >> 4) & 0x3F;
7357 MIB.addImm(BarId);
7358 }
7359
7360 I.eraseFromParent();
7361 return true;
7362}
7363
7364void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
7365 const MachineInstr &MI,
7366 int OpIdx) const {
7367 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7368 "Expected G_CONSTANT");
7369 MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
7370}
7371
7372void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
7373 const MachineInstr &MI,
7374 int OpIdx) const {
7375 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7376 "Expected G_CONSTANT");
7377 MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
7378}
7379
7380void AMDGPUInstructionSelector::renderBitcastFPImm(MachineInstrBuilder &MIB,
7381 const MachineInstr &MI,
7382 int OpIdx) const {
7383 const MachineOperand &Op = MI.getOperand(1);
7384 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1);
7385 MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
7386}
7387
7388void AMDGPUInstructionSelector::renderCountTrailingOnesImm(
7389 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7390 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7391 "Expected G_CONSTANT");
7392 MIB.addImm(MI.getOperand(1).getCImm()->getValue().countTrailingOnes());
7393}
7394
7395/// This only really exists to satisfy DAG type checking machinery, so is a
7396/// no-op here.
7397void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
7398 const MachineInstr &MI,
7399 int OpIdx) const {
7400 const MachineOperand &Op = MI.getOperand(OpIdx);
7401 int64_t Imm;
7402 if (Op.isReg() && mi_match(Op.getReg(), *MRI, m_ICst(Imm)))
7403 MIB.addImm(Imm);
7404 else
7405 MIB.addImm(Op.getImm());
7406}
7407
7408void AMDGPUInstructionSelector::renderZextBoolTImm(MachineInstrBuilder &MIB,
7409 const MachineInstr &MI,
7410 int OpIdx) const {
7411 MIB.addImm(MI.getOperand(OpIdx).getImm() != 0);
7412}
7413
7414void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB,
7415 const MachineInstr &MI,
7416 int OpIdx) const {
7417 assert(OpIdx >= 0 && "expected to match an immediate operand");
7418 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7419}
7420
7421void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_0(
7422 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7423 assert(OpIdx >= 0 && "expected to match an immediate operand");
7424 MIB.addImm(
7425 (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7426}
7427
7428void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_1(
7429 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7430 assert(OpIdx >= 0 && "expected to match an immediate operand");
7431 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1)
7433 : (int64_t)SISrcMods::DST_OP_SEL);
7434}
7435
7436void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_0(
7437 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7438 assert(OpIdx >= 0 && "expected to match an immediate operand");
7439 MIB.addImm(
7440 (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7441}
7442
7443void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_1(
7444 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7445 assert(OpIdx >= 0 && "expected to match an immediate operand");
7446 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
7447 ? (int64_t)(SISrcMods::OP_SEL_0)
7448 : 0);
7449}
7450
7451void AMDGPUInstructionSelector::renderDstSelToOpSelXForm(
7452 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7453 assert(OpIdx >= 0 && "expected to match an immediate operand");
7454 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::DST_OP_SEL)
7455 : 0);
7456}
7457
7458void AMDGPUInstructionSelector::renderSrcSelToOpSelXForm(
7459 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7460 assert(OpIdx >= 0 && "expected to match an immediate operand");
7461 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::OP_SEL_0)
7462 : 0);
7463}
7464
7465void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_2_0(
7466 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7467 assert(OpIdx >= 0 && "expected to match an immediate operand");
7468 MIB.addImm(
7469 (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7470}
7471
7472void AMDGPUInstructionSelector::renderDstSelToOpSel3XFormXForm(
7473 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7474 assert(OpIdx >= 0 && "expected to match an immediate operand");
7475 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
7476 ? (int64_t)SISrcMods::DST_OP_SEL
7477 : 0);
7478}
7479
7480void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
7481 const MachineInstr &MI,
7482 int OpIdx) const {
7483 assert(OpIdx >= 0 && "expected to match an immediate operand");
7484 MIB.addImm(MI.getOperand(OpIdx).getImm() &
7487}
7488
7489void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
7490 const MachineInstr &MI,
7491 int OpIdx) const {
7492 assert(OpIdx >= 0 && "expected to match an immediate operand");
7493 const bool Swizzle = MI.getOperand(OpIdx).getImm() &
7496 MIB.addImm(Swizzle);
7497}
7498
7499void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
7500 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7501 assert(OpIdx >= 0 && "expected to match an immediate operand");
7502 const uint32_t Cpol = MI.getOperand(OpIdx).getImm() &
7505 MIB.addImm(Cpol | AMDGPU::CPol::GLC);
7506}
7507
7508void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
7509 const MachineInstr &MI,
7510 int OpIdx) const {
7511 MIB.addFrameIndex(MI.getOperand(1).getIndex());
7512}
7513
7514void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
7515 const MachineInstr &MI,
7516 int OpIdx) const {
7517 const APFloat &APF = MI.getOperand(1).getFPImm()->getValueAPF();
7518 int ExpVal = APF.getExactLog2Abs();
7519 assert(ExpVal != INT_MIN);
7520 MIB.addImm(ExpVal);
7521}
7522
7523void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB,
7524 const MachineInstr &MI,
7525 int OpIdx) const {
7526 // "round.towardzero" -> TowardZero 0 -> FP_ROUND_ROUND_TO_ZERO 3
7527 // "round.tonearest" -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0
7528 // "round.upward" -> TowardPositive 2 -> FP_ROUND_ROUND_TO_INF 1
7529 // "round.downward -> TowardNegative 3 -> FP_ROUND_ROUND_TO_NEGINF 2
7530 MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4);
7531}
7532
7533void AMDGPUInstructionSelector::renderVOP3PModsNeg(MachineInstrBuilder &MIB,
7534 const MachineInstr &MI,
7535 int OpIdx) const {
7536 unsigned Mods = SISrcMods::OP_SEL_1;
7537 if (MI.getOperand(OpIdx).getImm())
7538 Mods ^= SISrcMods::NEG;
7539 MIB.addImm((int64_t)Mods);
7540}
7541
7542void AMDGPUInstructionSelector::renderVOP3PModsNegs(MachineInstrBuilder &MIB,
7543 const MachineInstr &MI,
7544 int OpIdx) const {
7545 unsigned Mods = SISrcMods::OP_SEL_1;
7546 if (MI.getOperand(OpIdx).getImm())
7548 MIB.addImm((int64_t)Mods);
7549}
7550
7551void AMDGPUInstructionSelector::renderVOP3PModsNegAbs(MachineInstrBuilder &MIB,
7552 const MachineInstr &MI,
7553 int OpIdx) const {
7554 unsigned Val = MI.getOperand(OpIdx).getImm();
7555 unsigned Mods = SISrcMods::OP_SEL_1; // default: none
7556 if (Val == 1) // neg
7557 Mods ^= SISrcMods::NEG;
7558 if (Val == 2) // abs
7559 Mods ^= SISrcMods::ABS;
7560 if (Val == 3) // neg and abs
7561 Mods ^= (SISrcMods::NEG | SISrcMods::ABS);
7562 MIB.addImm((int64_t)Mods);
7563}
7564
7565void AMDGPUInstructionSelector::renderPrefetchLoc(MachineInstrBuilder &MIB,
7566 const MachineInstr &MI,
7567 int OpIdx) const {
7568 uint32_t V = MI.getOperand(2).getImm();
7571 if (!Subtarget->hasSafeCUPrefetch())
7572 V = std::max(V, (uint32_t)AMDGPU::CPol::SCOPE_SE); // CU scope is unsafe
7573 MIB.addImm(V);
7574}
7575
7576/// Convert from 2-bit value to enum values used for op_sel* source modifiers.
7577void AMDGPUInstructionSelector::renderScaledMAIIntrinsicOperand(
7578 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7579 unsigned Val = MI.getOperand(OpIdx).getImm();
7580 unsigned New = 0;
7581 if (Val & 0x1)
7583 if (Val & 0x2)
7585 MIB.addImm(New);
7586}
7587
7588bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const {
7589 return TII.isInlineConstant(Imm);
7590}
7591
7592bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
7593 return TII.isInlineConstant(Imm);
7594}
MachineInstrBuilder MachineInstrBuilder & DefMI
static unsigned getIntrinsicID(const SDNode *N)
#define GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static Register getLegalRegBank(Register NewReg, Register RootReg, const AMDGPURegisterBankInfo &RBI, MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, const SIInstrInfo &TII)
static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI)
Test if the MI is shift left with half bits, such as reg0:2n =G_SHL reg1:2n, CONST(n)
static bool isNoUnsignedWrap(MachineInstr *Addr)
static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI, const SIInstrInfo &TII, Register BasePtr)
unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID)
static bool checkRB(Register Reg, unsigned int RBNo, const AMDGPURegisterBankInfo &RBI, const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI)
static unsigned updateMods(SrcStatus HiStat, SrcStatus LoStat, unsigned Mods)
static bool isTruncHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI)
Test if the MI is truncating to half, such as reg0:n = G_TRUNC reg1:2n
static Register getWaveAddress(const MachineInstr *Def)
static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In, Register &Out)
static bool shouldUseAndMask(unsigned Size, unsigned &Mask)
static std::pair< unsigned, uint8_t > BitOp3_Op(Register R, SmallVectorImpl< Register > &Src, const MachineRegisterInfo &MRI)
static TypeClass isVectorOfTwoOrScalar(Register Reg, const MachineRegisterInfo &MRI)
static bool isLaneMaskFromSameBlock(Register Reg, MachineRegisterInfo &MRI, MachineBasicBlock *MBB)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void addZeroImm(MachineInstrBuilder &MIB)
static unsigned gwsIntrinToOpcode(unsigned IntrID)
static bool isConstant(const MachineInstr &MI)
static bool isSameBitWidth(Register Reg1, Register Reg2, const MachineRegisterInfo &MRI)
static Register buildRegSequence(SmallVectorImpl< Register > &Elts, MachineInstr *InsertPt, MachineRegisterInfo &MRI)
static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI, uint32_t FormatLo, uint32_t FormatHi, Register BasePtr)
Return a resource descriptor for use with an arbitrary 64-bit pointer.
static bool isAsyncLDSDMA(Intrinsic::ID Intr)
static void diagnoseUnsupportedIntrinsic(const MachineInstr &I)
static std::pair< Register, unsigned > computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, Register IdxReg, unsigned EltSize, GISelValueTracking &ValueTracking)
Return the register to use for the index value, and the subregister to use for the indirectly accesse...
static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64)
static std::pair< Register, SrcStatus > getLastSameOrNeg(Register Reg, const MachineRegisterInfo &MRI, SearchOptions SO, int MaxDepth=3)
static Register stripCopy(Register Reg, MachineRegisterInfo &MRI)
static std::optional< std::pair< Register, SrcStatus > > calcNextStatus(std::pair< Register, SrcStatus > Curr, const MachineRegisterInfo &MRI)
static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI)
static std::optional< uint64_t > getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI)
Get an immediate that must be 32-bits, and treated as zero extended.
static bool isValidToPack(SrcStatus HiStat, SrcStatus LoStat, Register NewReg, Register RootReg, const SIInstrInfo &TII, const MachineRegisterInfo &MRI)
static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size, const GCNSubtarget &ST)
static SmallVector< std::pair< Register, SrcStatus > > getSrcStats(Register Reg, const MachineRegisterInfo &MRI, SearchOptions SO, int MaxDepth=3)
static bool isUnmergeHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI)
Test function, if the MI is reg0:n, reg1:n = G_UNMERGE_VALUES reg2:2n
static SrcStatus getNegStatus(Register Reg, SrcStatus S, const MachineRegisterInfo &MRI)
static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI)
static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI, const SIInstrInfo &TII, Register BasePtr)
static bool isLshrHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI)
Test if the MI is logic shift right with half bits, such as reg0:2n =G_LSHR reg1:2n,...
static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods, SmallVectorImpl< Register > &Elts, Register &Src, MachineInstr *InsertPt, MachineRegisterInfo &MRI)
This file declares the targeting of the InstructionSelector class for AMDGPU.
constexpr LLT S1
constexpr LLT S32
AMDGPU Register Bank Select
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static bool isAllZeros(StringRef Arr)
Return true if the array is empty or all zeros.
dxil translate DXIL Translate Metadata
Provides analysis for querying information about KnownBits during GISel passes.
#define DEBUG_TYPE
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Contains matchers for matching SSA Machine Instructions.
Machine Check Debug Module
This file declares the MachineIRBuilder class.
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
MachineInstr unsigned OpIdx
#define P(N)
static std::vector< std::pair< int, unsigned > > Swizzle(std::vector< std::pair< int, unsigned > > Src, R600InstrInfo::BankSwizzle Swz)
#define LLVM_DEBUG(...)
Definition Debug.h:119
Value * RHS
Value * LHS
This is used to control valid status that current MI supports.
bool checkOptions(SrcStatus Stat) const
SearchOptions(Register Reg, const MachineRegisterInfo &MRI)
AMDGPUInstructionSelector(const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI, const AMDGPUTargetMachine &TM)
static const char * getName()
bool select(MachineInstr &I) override
Select the (possibly generic) instruction I to only use target-specific opcodes.
void setupMF(MachineFunction &MF, GISelValueTracking *VT, CodeGenCoverage *CoverageInfo, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) override
Setup per-MF executor state.
LLVM_READONLY int getExactLog2Abs() const
Definition APFloat.h:1594
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1585
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
Get the array size.
Definition ArrayRef.h:141
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:740
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:743
@ FCMP_TRUE
1 1 1 1 Always true (always folded)
Definition InstrTypes.h:757
@ ICMP_SLT
signed less than
Definition InstrTypes.h:769
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:770
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:746
@ FCMP_ULE
1 1 0 1 True if unordered, less than, or equal
Definition InstrTypes.h:755
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:744
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition InstrTypes.h:745
@ ICMP_UGE
unsigned greater or equal
Definition InstrTypes.h:764
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:763
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:767
@ FCMP_ULT
1 1 0 0 True if unordered or less than
Definition InstrTypes.h:754
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:748
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition InstrTypes.h:751
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:765
@ FCMP_UGT
1 0 1 0 True if unordered or greater than
Definition InstrTypes.h:752
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:747
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:749
@ ICMP_NE
not equal
Definition InstrTypes.h:762
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:768
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition InstrTypes.h:756
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:766
@ FCMP_UGE
1 0 1 1 True if unordered, greater than, or equal
Definition InstrTypes.h:753
@ FCMP_FALSE
0 0 0 0 Always false (always folded)
Definition InstrTypes.h:742
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:750
bool isFPPredicate() const
Definition InstrTypes.h:845
bool isIntPredicate() const
Definition InstrTypes.h:846
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition Constants.h:174
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:168
DILocation * get() const
Get the underlying DILocation.
Definition DebugLoc.h:218
Diagnostic information for unsupported feature in backend.
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
void checkSubtargetFeatures(const Function &F) const
Diagnose inconsistent subtarget features before attempting to codegen function F.
std::optional< SmallVector< std::function< void(MachineInstrBuilder &)>, 4 > > ComplexRendererFns
virtual void setupMF(MachineFunction &mf, GISelValueTracking *vt, CodeGenCoverage *covinfo=nullptr, ProfileSummaryInfo *psi=nullptr, BlockFrequencyInfo *bfi=nullptr)
Setup per-MF executor state.
constexpr bool isScalar() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr bool isValid() const
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr unsigned getAddressSpace() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
bool hasValue() const
TypeSize getValue() const
int getOperandConstraint(unsigned OpNum, MCOI::OperandConstraint Constraint) const
Returns the value of the specified operand constraint if it is present.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void setReturnAddressIsTaken(bool s)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Helper class to build MachineInstr.
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addUse(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineBasicBlock * getParent() const
bool getFlag(MIFlag Flag) const
Return whether an MI flag is set.
unsigned getNumOperands() const
Returns the total number of operands.
LLVM_ABI void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
const MachineOperand & getOperand(unsigned i) const
LocationSize getSize() const
Return the size in bytes of the memory reference.
unsigned getAddrSpace() const
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
const Value * getValue() const
Return the base address of the memory access.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
const ConstantInt * getCImm() const
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
ArrayRef< int > getShuffleMask() const
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
static MachineOperand CreateImm(int64_t Val)
bool isEarlyClobber() const
Register getReg() const
getReg - Returns the register number.
bool isInternalRead() const
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
const RegisterBank * getRegBankOrNull(Register Reg) const
Return the register bank of Reg, or null if Reg has not been assigned a register bank or has been ass...
LLVM_ABI Register cloneVirtualRegister(Register VReg, StringRef Name="")
Create and return a new virtual register in the function with the same attributes as the given regist...
LLVM_ABI MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Analysis providing profile information.
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
unsigned getID() const
Get the identifier of this register bank.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
static bool isGenericOpcode(unsigned Opc)
unsigned getID() const
Return the register class ID number.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:309
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
std::optional< int64_t > getSMRDEncodedLiteralOffset32(const MCSubtargetInfo &ST, int64_t ByteOffset)
bool isGFX12Plus(const MCSubtargetInfo &STI)
constexpr int64_t getNullPointerValue(unsigned AS)
Get the null pointer value for the given address space.
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST)
LLVM_READONLY int32_t getGlobalSaddrOp(uint32_t Opcode)
bool isGFX13Plus(const MCSubtargetInfo &STI)
bool isGFX11Plus(const MCSubtargetInfo &STI)
bool isGFX10Plus(const MCSubtargetInfo &STI)
std::optional< int64_t > getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset, bool IsBuffer, bool HasSOffset)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
Intrinsic::ID getIntrinsicID(const MachineInstr &I)
Return the intrinsic ID for opcodes with the G_AMDGPU_INTRIN_ prefix.
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelValueTracking *ValueTracking=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
IndexMode
ARM Index Modes.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
operand_type_match m_Reg()
SpecificConstantMatch m_SpecificICst(const APInt &RequestedValue)
Matches a constant equal to RequestedValue.
GCstAndRegMatch m_GCst(std::optional< ValueAndVReg > &ValReg)
UnaryOp_match< SrcTy, TargetOpcode::COPY > m_Copy(SrcTy &&Src)
UnaryOp_match< SrcTy, TargetOpcode::G_ZEXT > m_GZExt(const SrcTy &Src)
BinaryOp_match< LHS, RHS, TargetOpcode::G_XOR, true > m_GXor(const LHS &L, const RHS &R)
UnaryOp_match< SrcTy, TargetOpcode::G_SEXT > m_GSExt(const SrcTy &Src)
UnaryOp_match< SrcTy, TargetOpcode::G_FPEXT > m_GFPExt(const SrcTy &Src)
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
ConstantMatch< APInt > m_ICst(APInt &Cst)
SpecificConstantMatch m_AllOnesInt()
BinaryOp_match< LHS, RHS, TargetOpcode::G_OR, true > m_GOr(const LHS &L, const RHS &R)
ICstOrSplatMatch< APInt > m_ICstOrSplat(APInt &Cst)
ImplicitDefMatch m_GImplicitDef()
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
BinaryOp_match< LHS, RHS, TargetOpcode::G_ASHR, false > m_GAShr(const LHS &L, const RHS &R)
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
BinaryOp_match< LHS, RHS, TargetOpcode::G_PTR_ADD, false > m_GPtrAdd(const LHS &L, const RHS &R)
SpecificRegisterMatch m_SpecificReg(Register RequestedReg)
Matches a register only if it is equal to RequestedReg.
BinaryOp_match< LHS, RHS, TargetOpcode::G_SHL, false > m_GShl(const LHS &L, const RHS &R)
Or< Preds... > m_any_of(Preds &&... preds)
BinaryOp_match< LHS, RHS, TargetOpcode::G_AND, true > m_GAnd(const LHS &L, const RHS &R)
UnaryOp_match< SrcTy, TargetOpcode::G_BITCAST > m_GBitcast(const SrcTy &Src)
bind_ty< MachineInstr * > m_MInstr(MachineInstr *&MI)
UnaryOp_match< SrcTy, TargetOpcode::G_FNEG > m_GFNeg(const SrcTy &Src)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
UnaryOp_match< SrcTy, TargetOpcode::G_FABS > m_GFabs(const SrcTy &Src)
BinaryOp_match< LHS, RHS, TargetOpcode::G_LSHR, false > m_GLShr(const LHS &L, const RHS &R)
UnaryOp_match< SrcTy, TargetOpcode::G_ANYEXT > m_GAnyExt(const SrcTy &Src)
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
BinaryOp_match< LHS, RHS, TargetOpcode::G_MUL, true > m_GMul(const LHS &L, const RHS &R)
UnaryOp_match< SrcTy, TargetOpcode::G_TRUNC > m_GTrunc(const SrcTy &Src)
auto m_BinOp()
Match an arbitrary binary operation and ignore it.
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
LLVM_ABI Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
Definition Utils.cpp:858
@ Offset
Definition DWP.cpp:558
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
LLVM_ABI bool isBuildVectorAllZeros(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndef=false)
Return true if the specified instruction is a G_BUILD_VECTOR or G_BUILD_VECTOR_TRUNC where all of the...
Definition Utils.cpp:1444
LLVM_ABI Register constrainOperandRegClass(const MachineFunction &MF, const TargetRegisterInfo &TRI, MachineRegisterInfo &MRI, const TargetInstrInfo &TII, const RegisterBankInfo &RBI, MachineInstr &InsertPt, const TargetRegisterClass &RegClass, MachineOperand &RegMO)
Constrain the Register operand OpIdx, so that it is now constrained to the TargetRegisterClass passed...
Definition Utils.cpp:57
LLVM_ABI MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by a single def instruction that is Opcode.
Definition Utils.cpp:653
PointerUnion< const TargetRegisterClass *, const RegisterBank * > RegClassOrRegBank
Convenient type to represent either a register class or a register bank.
LLVM_ABI const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
Definition Utils.cpp:461
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
LLVM_ABI std::optional< APInt > getIConstantVRegVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT, return the corresponding value.
Definition Utils.cpp:294
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Kill
The last use of a register.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI void constrainSelectedInstRegOperands(MachineInstr &I, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI)
Mutate the newly-selected instruction I to constrain its (possibly generic) virtual register operands...
Definition Utils.cpp:156
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
LLVM_ABI MachineInstr * getDefIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the def instruction for Reg, folding away any trivial copies.
Definition Utils.cpp:494
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:156
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:337
LLVM_ABI std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT that fits in int64_t, returns it.
Definition Utils.cpp:314
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
LLVM_ABI std::optional< ValueAndVReg > getAnyConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true, bool LookThroughAnyExt=false)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT or G_FCONST...
Definition Utils.cpp:439
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:221
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
DWARFExpression::Operation Op
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:433
LLVM_ABI std::optional< DefinitionAndSourceRegister > getDefSrcRegIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the def instruction for Reg, and underlying value Register folding away any copies.
Definition Utils.cpp:469
LLVM_ABI Register getSrcRegIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the source register for Reg, folding away any trivial copies.
Definition Utils.cpp:501
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
constexpr RegState getUndefRegState(bool B)
@ Default
The result value is uniform if and only if all operands are uniform.
Definition Uniformity.h:20
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition KnownBits.h:315
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false, bool SelfAdd=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:361
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.