LLVM 23.0.0git
AMDGPUISelDAGToDAG.cpp
Go to the documentation of this file.
1//===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//==-----------------------------------------------------------------------===//
8//
9/// \file
10/// Defines an instruction selector for the AMDGPU target.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPUISelDAGToDAG.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUSubtarget.h"
18#include "AMDGPUTargetMachine.h"
21#include "R600RegisterInfo.h"
22#include "SIISelLowering.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
32
33#ifdef EXPENSIVE_CHECKS
35#include "llvm/IR/Dominators.h"
36#endif
37
38#define DEBUG_TYPE "amdgpu-isel"
39
40using namespace llvm;
41
42//===----------------------------------------------------------------------===//
43// Instruction Selector Implementation
44//===----------------------------------------------------------------------===//
45
46namespace {
47static SDValue stripBitcast(SDValue Val) {
48 return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
49}
50
51// Figure out if this is really an extract of the high 16-bits of a dword.
52static bool isExtractHiElt(SDValue In, SDValue &Out) {
53 In = stripBitcast(In);
54
55 if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
56 if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
57 if (!Idx->isOne())
58 return false;
59 Out = In.getOperand(0);
60 return true;
61 }
62 }
63
64 if (In.getOpcode() != ISD::TRUNCATE)
65 return false;
66
67 SDValue Srl = In.getOperand(0);
68 if (Srl.getOpcode() == ISD::SRL) {
69 if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
70 if (ShiftAmt->getZExtValue() == 16) {
71 Out = stripBitcast(Srl.getOperand(0));
72 return true;
73 }
74 }
75 }
76
77 return false;
78}
79
80static SDValue createVOP3PSrc32FromLo16(SDValue Lo, SDValue Src,
81 llvm::SelectionDAG *CurDAG,
82 const GCNSubtarget *Subtarget) {
83 if (!Subtarget->useRealTrue16Insts()) {
84 return Lo;
85 }
86
87 SDValue NewSrc;
88 SDLoc SL(Lo);
89
90 if (Lo->isDivergent()) {
91 SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
92 SL, Lo.getValueType()),
93 0);
94 const SDValue Ops[] = {
95 CurDAG->getTargetConstant(AMDGPU::VGPR_32RegClassID, SL, MVT::i32), Lo,
96 CurDAG->getTargetConstant(AMDGPU::lo16, SL, MVT::i16), Undef,
97 CurDAG->getTargetConstant(AMDGPU::hi16, SL, MVT::i16)};
98
99 NewSrc = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
100 Src.getValueType(), Ops),
101 0);
102 } else {
103 // the S_MOV is needed since the Lo could still be a VGPR16.
104 // With S_MOV, isel insert a "sgpr32 = copy vgpr16" and we reply on
105 // the fixvgpr2sgprcopy pass to legalize it
106 NewSrc = SDValue(
107 CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, Src.getValueType(), Lo),
108 0);
109 }
110
111 return NewSrc;
112}
113
114// Look through operations that obscure just looking at the low 16-bits of the
115// same register.
116static SDValue stripExtractLoElt(SDValue In) {
117 if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
118 SDValue Idx = In.getOperand(1);
119 if (isNullConstant(Idx) && In.getValueSizeInBits() <= 32)
120 return In.getOperand(0);
121 }
122
123 if (In.getOpcode() == ISD::TRUNCATE) {
124 SDValue Src = In.getOperand(0);
125 if (Src.getValueType().getSizeInBits() == 32)
126 return stripBitcast(Src);
127 }
128
129 return In;
130}
131
132static SDValue emitRegSequence(llvm::SelectionDAG &CurDAG, unsigned DstRegClass,
133 EVT DstTy, ArrayRef<SDValue> Elts,
134 ArrayRef<unsigned> SubRegClass,
135 const SDLoc &DL) {
136 assert(Elts.size() == SubRegClass.size() && "array size mismatch");
137 unsigned NumElts = Elts.size();
138 SmallVector<SDValue, 17> Ops(2 * NumElts + 1);
139 Ops[0] = (CurDAG.getTargetConstant(DstRegClass, DL, MVT::i32));
140 for (unsigned i = 0; i < NumElts; ++i) {
141 Ops[2 * i + 1] = Elts[i];
142 Ops[2 * i + 2] = CurDAG.getTargetConstant(SubRegClass[i], DL, MVT::i32);
143 }
144 return SDValue(
145 CurDAG.getMachineNode(TargetOpcode::REG_SEQUENCE, DL, DstTy, Ops), 0);
146}
147
148} // end anonymous namespace
149
151 "AMDGPU DAG->DAG Pattern Instruction Selection", false,
152 false)
153INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysisLegacy)
155#ifdef EXPENSIVE_CHECKS
158#endif
160 "AMDGPU DAG->DAG Pattern Instruction Selection", false,
161 false)
162
163/// This pass converts a legalized DAG into an AMDGPU-specific
164/// DAG, ready for instruction scheduling.
166 CodeGenOptLevel OptLevel) {
167 return new AMDGPUDAGToDAGISelLegacy(TM, OptLevel);
168}
169
173
175 Subtarget = &MF.getSubtarget<GCNSubtarget>();
176 Subtarget->checkSubtargetFeatures(MF.getFunction());
177 Mode = SIModeRegisterDefaults(MF.getFunction(), *Subtarget);
179}
180
// Classify opcode \p Opc by whether an operation of that kind is treated as
// zeroing the high bits of its result (per the comments below, the bits above
// a 16-bit value).
//
// For the large group of opcodes ending at RCP_IFLAG the answer depends on the
// subtarget generation: true up to and including GFX9, false afterwards. The
// default case answers false.
//
// NOTE(review): this listing is missing lines 189, 232 and 236 (see the gaps
// in the embedded numbering), so what the FP_ROUND and FMA/FMAD/DIV_FIXUP
// groups return is not visible here -- confirm against the full source.
181bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
182 // XXX - only need to list legal operations.
183 switch (Opc) {
184 case ISD::FADD:
185 case ISD::FSUB:
186 case ISD::FMUL:
187 case ISD::FDIV:
188 case ISD::FREM:
 // NOTE(review): listing line 189 is absent here.
190 case ISD::UINT_TO_FP:
191 case ISD::SINT_TO_FP:
192 case ISD::FABS:
193 // Fabs is lowered to a bit operation, but it's an and which will clear the
194 // high bits anyway.
195 case ISD::FSQRT:
196 case ISD::FSIN:
197 case ISD::FCOS:
198 case ISD::FPOWI:
199 case ISD::FPOW:
200 case ISD::FLOG:
201 case ISD::FLOG2:
202 case ISD::FLOG10:
203 case ISD::FEXP:
204 case ISD::FEXP2:
205 case ISD::FCEIL:
206 case ISD::FTRUNC:
207 case ISD::FRINT:
208 case ISD::FNEARBYINT:
209 case ISD::FROUNDEVEN:
210 case ISD::FROUND:
211 case ISD::FFLOOR:
212 case ISD::FMINNUM:
213 case ISD::FMAXNUM:
214 case ISD::FLDEXP:
215 case AMDGPUISD::FRACT:
216 case AMDGPUISD::CLAMP:
217 case AMDGPUISD::COS_HW:
218 case AMDGPUISD::SIN_HW:
219 case AMDGPUISD::FMIN3:
220 case AMDGPUISD::FMAX3:
221 case AMDGPUISD::FMED3:
222 case AMDGPUISD::FMAD_FTZ:
223 case AMDGPUISD::RCP:
224 case AMDGPUISD::RSQ:
225 case AMDGPUISD::RCP_IFLAG:
226 // On gfx10, all 16-bit instructions preserve the high bits.
227 return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9;
228 case ISD::FP_ROUND:
229 // We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the
230 // high bits on gfx9.
231 // TODO: If we had the source node we could see if the source was fma/mad
 // NOTE(review): listing line 232 is absent here.
233 case ISD::FMA:
234 case ISD::FMAD:
235 case AMDGPUISD::DIV_FIXUP:
 // NOTE(review): listing line 236 is absent here.
237 default:
238 // fcopysign, select and others may be lowered to 32-bit bit operations
239 // which don't zero the high bits.
240 return false;
241 }
242}
243
245#ifdef EXPENSIVE_CHECKS
247 LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
248 for (auto &L : LI->getLoopsInPreorder()) {
249 assert(L->isLCSSAForm(DT));
250 }
251#endif
253}
254
263
265 assert(Subtarget->d16PreservesUnusedBits());
266 MVT VT = N->getValueType(0).getSimpleVT();
267 if (VT != MVT::v2i16 && VT != MVT::v2f16)
268 return false;
269
270 SDValue Lo = N->getOperand(0);
271 SDValue Hi = N->getOperand(1);
272
273 LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));
274
275 // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
276 // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
277 // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo
278
279 // Need to check for possible indirect dependencies on the other half of the
280 // vector to avoid introducing a cycle.
281 if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
282 SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
283
284 SDValue TiedIn = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Lo);
285 SDValue Ops[] = {
286 LdHi->getChain(), LdHi->getBasePtr(), TiedIn
287 };
288
289 unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
290 if (LdHi->getMemoryVT() == MVT::i8) {
291 LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
292 AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
293 } else {
294 assert(LdHi->getMemoryVT() == MVT::i16);
295 }
296
297 SDValue NewLoadHi =
298 CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
299 Ops, LdHi->getMemoryVT(),
300 LdHi->getMemOperand());
301
302 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
303 CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
304 return true;
305 }
306
307 // build_vector (load ptr), hi -> load_d16_lo ptr, hi
308 // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
309 // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
310 LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
311 if (LdLo && Lo.hasOneUse()) {
312 SDValue TiedIn = getHi16Elt(Hi);
313 if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
314 return false;
315
316 SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
317 unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
318 if (LdLo->getMemoryVT() == MVT::i8) {
319 LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
320 AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
321 } else {
322 assert(LdLo->getMemoryVT() == MVT::i16);
323 }
324
325 TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);
326
327 SDValue Ops[] = {
328 LdLo->getChain(), LdLo->getBasePtr(), TiedIn
329 };
330
331 SDValue NewLoadLo =
332 CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
333 Ops, LdLo->getMemoryVT(),
334 LdLo->getMemOperand());
335
336 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
337 CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
338 return true;
339 }
340
341 return false;
342}
343
345 if (!Subtarget->d16PreservesUnusedBits())
346 return;
347
348 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
349
350 bool MadeChange = false;
351 while (Position != CurDAG->allnodes_begin()) {
352 SDNode *N = &*--Position;
353 if (N->use_empty())
354 continue;
355
356 switch (N->getOpcode()) {
358 // TODO: Match load d16 from shl (extload:i16), 16
359 MadeChange |= matchLoadD16FromBuildVector(N);
360 break;
361 default:
362 break;
363 }
364 }
365
366 if (MadeChange) {
367 CurDAG->RemoveDeadNodes();
368 LLVM_DEBUG(dbgs() << "After PreProcess:\n";
369 CurDAG->dump(););
370 }
371}
372
// Decide whether node \p N can be used as an inline immediate.
//
// Undef nodes always qualify. Otherwise the decision is delegated to
// SIInstrInfo::isInlineConstant, once with an APInt value and once with an
// APFloat value. Anything else does not qualify.
//
// NOTE(review): listing lines 378 and 381 are absent from this extract; they
// presumably hold the conditions that introduce `C` for the integer and
// floating-point cases -- confirm against the full source.
373bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
374 if (N->isUndef())
375 return true;
376
377 const SIInstrInfo *TII = Subtarget->getInstrInfo();
 // NOTE(review): listing line 378 is absent here.
379 return TII->isInlineConstant(C->getAPIntValue());
380
 // NOTE(review): listing line 381 is absent here.
382 return TII->isInlineConstant(C->getValueAPF());
383
384 return false;
385}
386
387/// Determine the register class for \p OpNo
388/// \returns The register class of the virtual register that will be used for
389/// the given operand number \p OpNo or NULL if the register class cannot be
390/// determined.
// Three situations are handled:
//  - \p N is not yet a machine node: only ISD::CopyToReg is understood, and
//    the class is derived from the destination register (operand 1).
//  - \p N is a REG_SEQUENCE: operand 0 names the super register class and the
//    operand following \p OpNo holds the sub-register index.
//  - Any other machine node: the class comes from the instruction description.
391const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
392 unsigned OpNo) const {
393 if (!N->isMachineOpcode()) {
394 if (N->getOpcode() == ISD::CopyToReg) {
395 Register Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
396 if (Reg.isVirtual()) {
 // NOTE(review): listing line 397 is absent here; it presumably declares
 // `MRI` -- confirm against the full source.
398 return MRI.getRegClass(Reg);
399 }
400
 // Physical register: use the register's base class.
401 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
402 return TRI->getPhysRegBaseClass(Reg);
403 }
404
 // No other target-independent node is handled.
405 return nullptr;
406 }
407
408 switch (N->getMachineOpcode()) {
409 default: {
410 const SIInstrInfo *TII = Subtarget->getInstrInfo();
411 const MCInstrDesc &Desc = TII->get(N->getMachineOpcode());
 // The description's operand list starts with the defs, so the node operand
 // index is shifted by the number of defs.
412 unsigned OpIdx = Desc.getNumDefs() + OpNo;
413 if (OpIdx >= Desc.getNumOperands())
414 return nullptr;
415
 // -1 is treated as "no register class for this operand".
416 int16_t RegClass = TII->getOpRegClassID(Desc.operands()[OpIdx]);
417 if (RegClass == -1)
418 return nullptr;
419
420 return Subtarget->getRegisterInfo()->getRegClass(RegClass);
421 }
422 case AMDGPU::REG_SEQUENCE: {
423 unsigned RCID = N->getConstantOperandVal(0);
424 const TargetRegisterClass *SuperRC =
425 Subtarget->getRegisterInfo()->getRegClass(RCID);
426
 // The sub-register index is read from the operand right after OpNo.
427 SDValue SubRegOp = N->getOperand(OpNo + 1);
428 unsigned SubRegIdx = SubRegOp->getAsZExtVal();
429 return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
430 SubRegIdx);
431 }
432 }
433}
434
// Rewrite \p N in place so that its first operand becomes \p NewChain and
// \p Glue is appended as a trailing operand. Operands 1..end of \p N are
// carried over unchanged; the opcode and value-type list are preserved.
//
// \returns the node produced by SelectionDAG::MorphNodeTo.
435SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
436 SDValue Glue) const {
 // NOTE(review): listing line 437 is absent here; it presumably declares the
 // `Ops` container -- confirm against the full source.
438 Ops.push_back(NewChain); // Replace the chain.
439 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
440 Ops.push_back(N->getOperand(i));
441
442 Ops.push_back(Glue);
443 return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
444}
445
446SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
447 const SITargetLowering& Lowering =
448 *static_cast<const SITargetLowering*>(getTargetLowering());
449
450 assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");
451
452 SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), Val);
453 return glueCopyToOp(N, M0, M0.getValue(1));
454}
455
456SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
457 unsigned AS = cast<MemSDNode>(N)->getAddressSpace();
458 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
459 if (Subtarget->ldsRequiresM0Init())
460 return glueCopyToM0(
461 N, CurDAG->getSignedTargetConstant(-1, SDLoc(N), MVT::i32));
462 } else if (AS == AMDGPUAS::REGION_ADDRESS) {
463 MachineFunction &MF = CurDAG->getMachineFunction();
464 unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
465 return
466 glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
467 }
468 return N;
469}
470
471MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
472 EVT VT) const {
473 SDNode *Lo = CurDAG->getMachineNode(
474 AMDGPU::S_MOV_B32, DL, MVT::i32,
475 CurDAG->getTargetConstant(Lo_32(Imm), DL, MVT::i32));
476 SDNode *Hi = CurDAG->getMachineNode(
477 AMDGPU::S_MOV_B32, DL, MVT::i32,
478 CurDAG->getTargetConstant(Hi_32(Imm), DL, MVT::i32));
479 const SDValue Ops[] = {
480 CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
481 SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
482 SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
483
484 return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
485}
486
487SDNode *AMDGPUDAGToDAGISel::packConstantV2I16(const SDNode *N,
488 SelectionDAG &DAG) const {
489 // TODO: Handle undef as zero
490
491 assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2);
492 uint32_t LHSVal, RHSVal;
493 if (getConstantValue(N->getOperand(0), LHSVal) &&
494 getConstantValue(N->getOperand(1), RHSVal)) {
495 SDLoc SL(N);
496 uint32_t K = (LHSVal & 0xffff) | (RHSVal << 16);
497 return DAG.getMachineNode(
498 isVGPRImm(N) ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32, SL,
499 N->getValueType(0), DAG.getTargetConstant(K, SL, MVT::i32));
500 }
501
502 return nullptr;
503}
504
// Select a vector-building node \p N into register class \p RegClassID.
//
// Strategy, in order:
//  1. A one-element vector becomes a COPY_TO_REGCLASS of its only operand.
//  2. On AMDGCN subtargets with 64-bit literals, a 64-bit all-constant vector
//     is packed into one immediate and emitted as S_MOV_B64_IMM_PSEUDO
//     followed by COPY_TO_REGCLASS.
//  3. Otherwise a REG_SEQUENCE is built with one (value, sub-register) pair
//     per element; missing trailing elements of a SCALAR_TO_VECTOR are filled
//     with a shared IMPLICIT_DEF.
//
// NOTE(review): listing lines 531, 564 and 575 are absent from this extract
// (see the gaps in the embedded numbering) -- confirm the constant-FP test and
// the non-GCN sub-register expressions against the full source.
505void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
506 EVT VT = N->getValueType(0);
507 unsigned NumVectorElts = VT.getVectorNumElements();
508 EVT EltVT = VT.getVectorElementType();
509 SDLoc DL(N);
510 SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
511
512 if (NumVectorElts == 1) {
513 CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
514 RegClass);
515 return;
516 }
517
518 bool IsGCN = CurDAG->getSubtarget().getTargetTriple().isAMDGCN();
519 if (IsGCN && Subtarget->has64BitLiterals() && VT.getSizeInBits() == 64 &&
520 CurDAG->isConstantValueOfAnyType(SDValue(N, 0))) {
 // Accumulate the element bit patterns into one 64-bit literal; element I
 // lands at bit offset EltSize * I.
521 uint64_t C = 0;
522 bool AllConst = true;
523 unsigned EltSize = EltVT.getSizeInBits();
524 for (unsigned I = 0; I < NumVectorElts; ++I) {
525 SDValue Op = N->getOperand(I);
 // An undef element abandons the literal path.
526 if (Op.isUndef()) {
527 AllConst = false;
528 break;
529 }
530 uint64_t Val;
 // NOTE(review): listing line 531 is absent here; it presumably tests for a
 // floating-point constant and binds `CF`.
532 Val = CF->getValueAPF().bitcastToAPInt().getZExtValue();
533 } else
534 Val = cast<ConstantSDNode>(Op)->getZExtValue();
535 C |= Val << (EltSize * I);
536 }
537 if (AllConst) {
538 SDValue CV = CurDAG->getTargetConstant(C, DL, MVT::i64);
539 MachineSDNode *Copy =
540 CurDAG->getMachineNode(AMDGPU::S_MOV_B64_IMM_PSEUDO, DL, VT, CV);
541 CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, VT, SDValue(Copy, 0),
542 RegClass);
543 return;
544 }
545 }
546
547 assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
548 "supported yet");
549 // 32 = Max Num Vector Elements
550 // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
551 // 1 = Vector Register Class
552 SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
553
554 RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
555 bool IsRegSeq = true;
556 unsigned NOps = N->getNumOperands();
557 for (unsigned i = 0; i < NOps; i++) {
558 // XXX: Why is this here?
559 if (isa<RegisterSDNode>(N->getOperand(i))) {
560 IsRegSeq = false;
561 break;
562 }
563 unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
 // NOTE(review): listing line 564 is absent here (the non-GCN alternative of
 // the conditional expression).
565 RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
566 RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
567 }
568 if (NOps != NumVectorElts) {
569 // Fill in the missing undef elements if this was a scalar_to_vector.
570 assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
571 MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
572 DL, EltVT);
573 for (unsigned i = NOps; i < NumVectorElts; ++i) {
574 unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
 // NOTE(review): listing line 575 is absent here (the non-GCN alternative of
 // the conditional expression).
576 RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
577 RegSeqArgs[1 + (2 * i) + 1] =
578 CurDAG->getTargetConstant(Sub, DL, MVT::i32);
579 }
580 }
581
 // NOTE(review): there is no return after SelectCode(N), so SelectNodeTo below
 // also runs when IsRegSeq is false -- confirm this is intended.
582 if (!IsRegSeq)
583 SelectCode(N);
584 CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
585}
586
588 EVT VT = N->getValueType(0);
589 EVT EltVT = VT.getVectorElementType();
590
591 // TODO: Handle 16-bit element vectors with even aligned masks.
592 if (!Subtarget->hasPkMovB32() || !EltVT.bitsEq(MVT::i32) ||
593 VT.getVectorNumElements() != 2) {
594 SelectCode(N);
595 return;
596 }
597
598 auto *SVN = cast<ShuffleVectorSDNode>(N);
599
600 SDValue Src0 = SVN->getOperand(0);
601 SDValue Src1 = SVN->getOperand(1);
602 ArrayRef<int> Mask = SVN->getMask();
603 SDLoc DL(N);
604
605 assert(Src0.getValueType().getVectorNumElements() == 2 && Mask.size() == 2 &&
606 Mask[0] < 4 && Mask[1] < 4);
607
608 SDValue VSrc0 = Mask[0] < 2 ? Src0 : Src1;
609 SDValue VSrc1 = Mask[1] < 2 ? Src0 : Src1;
610 unsigned Src0SubReg = Mask[0] & 1 ? AMDGPU::sub1 : AMDGPU::sub0;
611 unsigned Src1SubReg = Mask[1] & 1 ? AMDGPU::sub1 : AMDGPU::sub0;
612
613 if (Mask[0] < 0) {
614 Src0SubReg = Src1SubReg;
615 MachineSDNode *ImpDef =
616 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
617 VSrc0 = SDValue(ImpDef, 0);
618 }
619
620 if (Mask[1] < 0) {
621 Src1SubReg = Src0SubReg;
622 MachineSDNode *ImpDef =
623 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
624 VSrc1 = SDValue(ImpDef, 0);
625 }
626
627 // SGPR case needs to lower to copies.
628 //
629 // Also use subregister extract when we can directly blend the registers with
630 // a simple subregister copy.
631 //
632 // TODO: Maybe we should fold this out earlier
633 if (N->isDivergent() && Src0SubReg == AMDGPU::sub1 &&
634 Src1SubReg == AMDGPU::sub0) {
635 // The low element of the result always comes from src0.
636 // The high element of the result always comes from src1.
637 // op_sel selects the high half of src0.
638 // op_sel_hi selects the high half of src1.
639
640 unsigned Src0OpSel =
641 Src0SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
642 unsigned Src1OpSel =
643 Src1SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
644
645 // Enable op_sel_hi to avoid printing it. This should have no effect on the
646 // result.
647 Src0OpSel |= SISrcMods::OP_SEL_1;
648 Src1OpSel |= SISrcMods::OP_SEL_1;
649
650 SDValue Src0OpSelVal = CurDAG->getTargetConstant(Src0OpSel, DL, MVT::i32);
651 SDValue Src1OpSelVal = CurDAG->getTargetConstant(Src1OpSel, DL, MVT::i32);
652 SDValue ZeroMods = CurDAG->getTargetConstant(0, DL, MVT::i32);
653
654 CurDAG->SelectNodeTo(N, AMDGPU::V_PK_MOV_B32, N->getVTList(),
655 {Src0OpSelVal, VSrc0, Src1OpSelVal, VSrc1,
656 ZeroMods, // clamp
657 ZeroMods, // op_sel
658 ZeroMods, // op_sel_hi
659 ZeroMods, // neg_lo
660 ZeroMods}); // neg_hi
661 return;
662 }
663
664 SDValue ResultElt0 =
665 CurDAG->getTargetExtractSubreg(Src0SubReg, DL, EltVT, VSrc0);
666 SDValue ResultElt1 =
667 CurDAG->getTargetExtractSubreg(Src1SubReg, DL, EltVT, VSrc1);
668
669 const SDValue Ops[] = {
670 CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
671 ResultElt0, CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
672 ResultElt1, CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
673 CurDAG->SelectNodeTo(N, TargetOpcode::REG_SEQUENCE, VT, Ops);
674}
675
677 unsigned int Opc = N->getOpcode();
678 if (N->isMachineOpcode()) {
679 N->setNodeId(-1);
680 return; // Already selected.
681 }
682
683 // isa<MemSDNode> almost works but is slightly too permissive for some DS
684 // intrinsics.
685 if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N)) {
686 N = glueCopyToM0LDSInit(N);
687 SelectCode(N);
688 return;
689 }
690
691 switch (Opc) {
692 default:
693 break;
694 // We are selecting i64 ADD here instead of custom lower it during
695 // DAG legalization, so we can fold some i64 ADDs used for address
696 // calculation into the LOAD and STORE instructions.
697 case ISD::ADDC:
698 case ISD::ADDE:
699 case ISD::SUBC:
700 case ISD::SUBE: {
701 if (N->getValueType(0) != MVT::i64)
702 break;
703
704 SelectADD_SUB_I64(N);
705 return;
706 }
707 case ISD::UADDO_CARRY:
708 case ISD::USUBO_CARRY:
709 if (N->getValueType(0) != MVT::i32)
710 break;
711
712 SelectAddcSubb(N);
713 return;
714 case ISD::UADDO:
715 case ISD::USUBO: {
716 SelectUADDO_USUBO(N);
717 return;
718 }
719 case AMDGPUISD::FMUL_W_CHAIN: {
720 SelectFMUL_W_CHAIN(N);
721 return;
722 }
723 case AMDGPUISD::FMA_W_CHAIN: {
724 SelectFMA_W_CHAIN(N);
725 return;
726 }
727
729 case ISD::BUILD_VECTOR: {
730 EVT VT = N->getValueType(0);
731 unsigned NumVectorElts = VT.getVectorNumElements();
732 if (VT.getScalarSizeInBits() == 16) {
733 if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
734 if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
735 ReplaceNode(N, Packed);
736 return;
737 }
738 }
739
740 break;
741 }
742
743 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
744 assert(VT.getVectorElementType().bitsEq(MVT::i32));
745 const TargetRegisterClass *RegClass =
746 N->isDivergent()
747 ? TRI->getDefaultVectorSuperClassForBitWidth(NumVectorElts * 32)
748 : SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32);
749
750 SelectBuildVector(N, RegClass->getID());
751 return;
752 }
755 return;
756 case ISD::BUILD_PAIR: {
757 SDValue RC, SubReg0, SubReg1;
758 SDLoc DL(N);
759 if (N->getValueType(0) == MVT::i128) {
760 RC = CurDAG->getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32);
761 SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
762 SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
763 } else if (N->getValueType(0) == MVT::i64) {
764 RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
765 SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
766 SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
767 } else {
768 llvm_unreachable("Unhandled value type for BUILD_PAIR");
769 }
770 const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
771 N->getOperand(1), SubReg1 };
772 ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
773 N->getValueType(0), Ops));
774 return;
775 }
776
777 case ISD::Constant:
778 case ISD::ConstantFP: {
779 if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N) ||
780 Subtarget->has64BitLiterals())
781 break;
782
783 uint64_t Imm;
785 Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
786 if (AMDGPU::isValid32BitLiteral(Imm, true))
787 break;
788 } else {
790 Imm = C->getZExtValue();
791 if (AMDGPU::isValid32BitLiteral(Imm, false))
792 break;
793 }
794
795 SDLoc DL(N);
796 ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
797 return;
798 }
799 case AMDGPUISD::BFE_I32:
800 case AMDGPUISD::BFE_U32: {
801 // There is a scalar version available, but unlike the vector version which
802 // has a separate operand for the offset and width, the scalar version packs
803 // the width and offset into a single operand. Try to move to the scalar
804 // version if the offsets are constant, so that we can try to keep extended
805 // loads of kernel arguments in SGPRs.
806
807 // TODO: Technically we could try to pattern match scalar bitshifts of
808 // dynamic values, but it's probably not useful.
810 if (!Offset)
811 break;
812
813 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
814 if (!Width)
815 break;
816
817 bool Signed = Opc == AMDGPUISD::BFE_I32;
818
819 uint32_t OffsetVal = Offset->getZExtValue();
820 uint32_t WidthVal = Width->getZExtValue();
821
822 ReplaceNode(N, getBFE32(Signed, SDLoc(N), N->getOperand(0), OffsetVal,
823 WidthVal));
824 return;
825 }
826 case AMDGPUISD::DIV_SCALE: {
827 SelectDIV_SCALE(N);
828 return;
829 }
832 SelectMAD_64_32(N);
833 return;
834 }
835 case ISD::SMUL_LOHI:
836 case ISD::UMUL_LOHI:
837 return SelectMUL_LOHI(N);
838 case ISD::CopyToReg: {
840 *static_cast<const SITargetLowering*>(getTargetLowering());
841 N = Lowering.legalizeTargetIndependentNode(N, *CurDAG);
842 break;
843 }
844 case ISD::AND:
845 case ISD::SRL:
846 case ISD::SRA:
848 if (N->getValueType(0) != MVT::i32)
849 break;
850
851 SelectS_BFE(N);
852 return;
853 case ISD::BRCOND:
854 SelectBRCOND(N);
855 return;
856 case ISD::FP_EXTEND:
857 SelectFP_EXTEND(N);
858 return;
859 case AMDGPUISD::CVT_PKRTZ_F16_F32:
860 case AMDGPUISD::CVT_PKNORM_I16_F32:
861 case AMDGPUISD::CVT_PKNORM_U16_F32:
862 case AMDGPUISD::CVT_PK_U16_U32:
863 case AMDGPUISD::CVT_PK_I16_I32: {
864 // Hack around using a legal type if f16 is illegal.
865 if (N->getValueType(0) == MVT::i32) {
866 MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
867 N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
868 { N->getOperand(0), N->getOperand(1) });
869 SelectCode(N);
870 return;
871 }
872
873 break;
874 }
876 SelectINTRINSIC_W_CHAIN(N);
877 return;
878 }
880 SelectINTRINSIC_WO_CHAIN(N);
881 return;
882 }
883 case ISD::INTRINSIC_VOID: {
884 SelectINTRINSIC_VOID(N);
885 return;
886 }
888 SelectWAVE_ADDRESS(N);
889 return;
890 }
891 case ISD::STACKRESTORE: {
892 SelectSTACKRESTORE(N);
893 return;
894 }
895 }
896
897 SelectCode(N);
898}
899
901 if (!Subtarget->hasSDWA())
902 return false;
903
904 if (N->getOpcode() == ISD::SIGN_EXTEND_INREG) {
905 EVT VT = cast<VTSDNode>(N->getOperand(1))->getVT();
906 return VT.getScalarSizeInBits() == 8 || VT.getScalarSizeInBits() == 16;
907 }
908
909 if (N->getOpcode() == ISD::AND)
910 if (auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1)))
911 return RHS->getZExtValue() == 0xFF || RHS->getZExtValue() == 0xFFFF;
912
913 if (N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL)
914 if (auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1)))
915 return (RHS->getZExtValue() % 8) == 0;
916
917 return false;
918}
919
920bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
921 const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
922 const Instruction *Term = BB->getTerminator();
923 return Term->getMetadata("amdgpu.uniform") ||
924 Term->getMetadata("structurizecfg.uniform");
925}
926
927bool AMDGPUDAGToDAGISel::isUnneededShiftMask(const SDNode *N,
928 unsigned ShAmtBits) const {
929 assert(N->getOpcode() == ISD::AND);
930
931 const APInt &RHS = N->getConstantOperandAPInt(1);
932 if (RHS.countr_one() >= ShAmtBits)
933 return true;
934
935 const APInt &LHSKnownZeros = CurDAG->computeKnownBits(N->getOperand(0)).Zero;
936 return (LHSKnownZeros | RHS).countr_one() >= ShAmtBits;
937}
938
940 SDValue &N0, SDValue &N1) {
941 if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
943 // As we split 64-bit `or` earlier, it's complicated pattern to match, i.e.
944 // (i64 (bitcast (v2i32 (build_vector
945 // (or (extract_vector_elt V, 0), OFFSET),
946 // (extract_vector_elt V, 1)))))
947 SDValue Lo = Addr.getOperand(0).getOperand(0);
948 if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Lo)) {
949 SDValue BaseLo = Lo.getOperand(0);
950 SDValue BaseHi = Addr.getOperand(0).getOperand(1);
951 // Check that split base (Lo and Hi) are extracted from the same one.
952 if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
954 BaseLo.getOperand(0) == BaseHi.getOperand(0) &&
955 // Lo is statically extracted from index 0.
956 isa<ConstantSDNode>(BaseLo.getOperand(1)) &&
957 BaseLo.getConstantOperandVal(1) == 0 &&
958 // Hi is statically extracted from index 1.
959 isa<ConstantSDNode>(BaseHi.getOperand(1)) &&
960 BaseHi.getConstantOperandVal(1) == 1) {
961 N0 = BaseLo.getOperand(0).getOperand(0);
962 N1 = Lo.getOperand(1);
963 return true;
964 }
965 }
966 }
967 return false;
968}
969
// Try to split \p Addr into a base (\p LHS) and a constant offset (\p RHS).
//
// The first attempt defers to SelectionDAG::isBaseWithConstantOffset and, on
// success, returns the two operands of \p Addr.
//
// NOTE(review): listing lines 978-979 are absent from this extract; they hold
// the condition of the second attempt (the block ending at line 981) --
// confirm against the full source.
//
// \returns true if a split was found, false otherwise.
970bool AMDGPUDAGToDAGISel::isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
971 SDValue &RHS) const {
972 if (CurDAG->isBaseWithConstantOffset(Addr)) {
973 LHS = Addr.getOperand(0);
974 RHS = Addr.getOperand(1);
975 return true;
976 }
977
 // NOTE(review): listing lines 978-979 are absent here.
980 return true;
981 }
982
983 return false;
984}
985
987 return "AMDGPU DAG->DAG Pattern Instruction Selection";
988}
989
993
997#ifdef EXPENSIVE_CHECKS
999 .getManager();
1000 auto &F = MF.getFunction();
1001 DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
1002 LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
1003 for (auto &L : LI.getLoopsInPreorder())
1004 assert(L->isLCSSAForm(DT) && "Loop is not in LCSSA form!");
1005#endif
1006 return SelectionDAGISelPass::run(MF, MFAM);
1007}
1008
1009//===----------------------------------------------------------------------===//
1010// Complex Patterns
1011//===----------------------------------------------------------------------===//
1012
1013bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
1014 SDValue &Offset) {
1015 return false;
1016}
1017
// Split \p Addr into a \p Base and a constant \p Offset for indirect
// addressing. Always succeeds (returns true).
//
// Cases, tried in order:
//  - \p Addr is a constant: base is the R600::INDIRECT_BASE_ADDR register,
//    offset is the constant.
//  - \p Addr is DWORDADDR of a constant: same base register, offset is that
//    constant.
//  - \p Addr is ADD/OR with a constant right operand: base is the left
//    operand, offset is the constant.
//  - Otherwise: base is \p Addr itself, offset is 0.
1018bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
1019 SDValue &Offset) {
 // NOTE(review): listing line 1020 is absent here; it presumably declares the
 // ConstantSDNode pointer `C` used below -- confirm against the full source.
1021 SDLoc DL(Addr);
1022
1023 if ((C = dyn_cast<ConstantSDNode>(Addr))) {
1024 Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
1025 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
1026 } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
1027 (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
1028 Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
1029 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
1030 } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
1031 (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
1032 Base = Addr.getOperand(0);
1033 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
1034 } else {
1035 Base = Addr;
1036 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1037 }
1038
1039 return true;
1040}
1041
1042SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
1043 const SDLoc &DL) const {
1044 SDNode *Mov = CurDAG->getMachineNode(
1045 AMDGPU::S_MOV_B32, DL, MVT::i32,
1046 CurDAG->getTargetConstant(Val, DL, MVT::i32));
1047 return SDValue(Mov, 0);
1048}
1049
1050// FIXME: Should only handle uaddo_carry/usubo_carry
1051void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
1052 SDLoc DL(N);
1053 SDValue LHS = N->getOperand(0);
1054 SDValue RHS = N->getOperand(1);
1055
1056 unsigned Opcode = N->getOpcode();
1057 bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE);
1058 bool ProduceCarry =
1059 ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC;
1060 bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE;
1061
1062 SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
1063 SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
1064
1065 SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1066 DL, MVT::i32, LHS, Sub0);
1067 SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1068 DL, MVT::i32, LHS, Sub1);
1069
1070 SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1071 DL, MVT::i32, RHS, Sub0);
1072 SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1073 DL, MVT::i32, RHS, Sub1);
1074
1075 SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);
1076
1077 static const unsigned OpcMap[2][2][2] = {
1078 {{AMDGPU::S_SUB_U32, AMDGPU::S_ADD_U32},
1079 {AMDGPU::V_SUB_CO_U32_e32, AMDGPU::V_ADD_CO_U32_e32}},
1080 {{AMDGPU::S_SUBB_U32, AMDGPU::S_ADDC_U32},
1081 {AMDGPU::V_SUBB_U32_e32, AMDGPU::V_ADDC_U32_e32}}};
1082
1083 unsigned Opc = OpcMap[0][N->isDivergent()][IsAdd];
1084 unsigned CarryOpc = OpcMap[1][N->isDivergent()][IsAdd];
1085
1086 SDNode *AddLo;
1087 if (!ConsumeCarry) {
1088 SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
1089 AddLo = CurDAG->getMachineNode(Opc, DL, VTList, Args);
1090 } else {
1091 SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2) };
1092 AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
1093 }
1094 SDValue AddHiArgs[] = {
1095 SDValue(Hi0, 0),
1096 SDValue(Hi1, 0),
1097 SDValue(AddLo, 1)
1098 };
1099 SDNode *AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, AddHiArgs);
1100
1101 SDValue RegSequenceArgs[] = {
1102 CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
1103 SDValue(AddLo,0),
1104 Sub0,
1105 SDValue(AddHi,0),
1106 Sub1,
1107 };
1108 SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
1109 MVT::i64, RegSequenceArgs);
1110
1111 if (ProduceCarry) {
1112 // Replace the carry-use
1113 ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1));
1114 }
1115
1116 // Replace the remaining uses.
1117 ReplaceNode(N, RegSequence);
1118}
1119
1120void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
1121 SDValue LHS = N->getOperand(0);
1122 SDValue RHS = N->getOperand(1);
1123 SDValue CI = N->getOperand(2);
1124
1125 if (N->isDivergent()) {
1126 unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::V_ADDC_U32_e64
1127 : AMDGPU::V_SUBB_U32_e64;
1128 CurDAG->SelectNodeTo(
1129 N, Opc, N->getVTList(),
1130 {LHS, RHS, CI,
1131 CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
1132 } else {
1133 unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::S_ADD_CO_PSEUDO
1134 : AMDGPU::S_SUB_CO_PSEUDO;
1135 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), {LHS, RHS, CI});
1136 }
1137}
1138
1139void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
1140 // The name of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
1141 // carry out despite the _i32 name. These were renamed in VI to _U32.
1142 // FIXME: We should probably rename the opcodes here.
1143 bool IsAdd = N->getOpcode() == ISD::UADDO;
1144 bool IsVALU = N->isDivergent();
1145
1146 for (SDNode::user_iterator UI = N->user_begin(), E = N->user_end(); UI != E;
1147 ++UI)
1148 if (UI.getUse().getResNo() == 1) {
1149 if (UI->isMachineOpcode()) {
1150 if (UI->getMachineOpcode() !=
1151 (IsAdd ? AMDGPU::S_ADD_CO_PSEUDO : AMDGPU::S_SUB_CO_PSEUDO)) {
1152 IsVALU = true;
1153 break;
1154 }
1155 } else {
1156 if (UI->getOpcode() != (IsAdd ? ISD::UADDO_CARRY : ISD::USUBO_CARRY)) {
1157 IsVALU = true;
1158 break;
1159 }
1160 }
1161 }
1162
1163 if (IsVALU) {
1164 unsigned Opc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
1165
1166 CurDAG->SelectNodeTo(
1167 N, Opc, N->getVTList(),
1168 {N->getOperand(0), N->getOperand(1),
1169 CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
1170 } else {
1171 unsigned Opc = IsAdd ? AMDGPU::S_UADDO_PSEUDO : AMDGPU::S_USUBO_PSEUDO;
1172
1173 CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
1174 {N->getOperand(0), N->getOperand(1)});
1175 }
1176}
1177
1178void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
1179 // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
1180 SDValue Ops[10];
1181
1182 SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
1183 SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
1184 SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
1185 Ops[8] = N->getOperand(0);
1186 Ops[9] = N->getOperand(4);
1187
1188 // If there are no source modifiers, prefer fmac over fma because it can use
1189 // the smaller VOP2 encoding.
1190 bool UseFMAC = Subtarget->hasDLInsts() &&
1191 cast<ConstantSDNode>(Ops[0])->isZero() &&
1192 cast<ConstantSDNode>(Ops[2])->isZero() &&
1193 cast<ConstantSDNode>(Ops[4])->isZero();
1194 unsigned Opcode = UseFMAC ? AMDGPU::V_FMAC_F32_e64 : AMDGPU::V_FMA_F32_e64;
1195 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), Ops);
1196}
1197
1198void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
1199 // src0_modifiers, src0, src1_modifiers, src1, clamp, omod
1200 SDValue Ops[8];
1201
1202 SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
1203 SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
1204 Ops[6] = N->getOperand(0);
1205 Ops[7] = N->getOperand(3);
1206
1207 CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
1208}
1209
1210// We need to handle this here because tablegen doesn't support matching
1211// instructions with multiple outputs.
1212void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
1213 EVT VT = N->getValueType(0);
1214
1215 assert(VT == MVT::f32 || VT == MVT::f64);
1216
1217 unsigned Opc
1218 = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64 : AMDGPU::V_DIV_SCALE_F32_e64;
1219
1220 // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
1221 // omod
1222 SDValue Ops[8];
1223 SelectVOP3BMods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
1224 SelectVOP3BMods(N->getOperand(1), Ops[3], Ops[2]);
1225 SelectVOP3BMods(N->getOperand(2), Ops[5], Ops[4]);
1226 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
1227}
1228
1229// We need to handle this here because tablegen doesn't support matching
1230// instructions with multiple outputs.
1231void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
1232 SDLoc SL(N);
1233 bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
1234 unsigned Opc;
1235 bool UseNoCarry = Subtarget->hasMadNC64_32Insts() && !N->hasAnyUseOfValue(1);
1236 if (Subtarget->hasMADIntraFwdBug())
1237 Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
1238 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
1239 else if (UseNoCarry)
1240 Opc = Signed ? AMDGPU::V_MAD_NC_I64_I32_e64 : AMDGPU::V_MAD_NC_U64_U32_e64;
1241 else
1242 Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
1243
1244 SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
1245 SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
1246 Clamp };
1247
1248 if (UseNoCarry) {
1249 MachineSDNode *Mad = CurDAG->getMachineNode(Opc, SL, MVT::i64, Ops);
1250 ReplaceUses(SDValue(N, 0), SDValue(Mad, 0));
1251 CurDAG->RemoveDeadNode(N);
1252 return;
1253 }
1254
1255 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
1256}
1257
1258// We need to handle this here because tablegen doesn't support matching
1259// instructions with multiple outputs.
1260void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) {
1261 SDLoc SL(N);
1262 bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
1263 SDVTList VTList;
1264 unsigned Opc;
1265 if (Subtarget->hasMadNC64_32Insts()) {
1266 VTList = CurDAG->getVTList(MVT::i64);
1267 Opc = Signed ? AMDGPU::V_MAD_NC_I64_I32_e64 : AMDGPU::V_MAD_NC_U64_U32_e64;
1268 } else {
1269 VTList = CurDAG->getVTList(MVT::i64, MVT::i1);
1270 if (Subtarget->hasMADIntraFwdBug()) {
1271 Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
1272 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
1273 } else {
1274 Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
1275 }
1276 }
1277
1278 SDValue Zero = CurDAG->getTargetConstant(0, SL, MVT::i64);
1279 SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
1280 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), Zero, Clamp};
1281 SDNode *Mad = CurDAG->getMachineNode(Opc, SL, VTList, Ops);
1282 if (!SDValue(N, 0).use_empty()) {
1283 SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32);
1284 SDNode *Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
1285 MVT::i32, SDValue(Mad, 0), Sub0);
1286 ReplaceUses(SDValue(N, 0), SDValue(Lo, 0));
1287 }
1288 if (!SDValue(N, 1).use_empty()) {
1289 SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32);
1290 SDNode *Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
1291 MVT::i32, SDValue(Mad, 0), Sub1);
1292 ReplaceUses(SDValue(N, 1), SDValue(Hi, 0));
1293 }
1294 CurDAG->RemoveDeadNode(N);
1295}
1296
1297bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const {
1298 if (!isUInt<16>(Offset))
1299 return false;
1300
1301 if (!Base || Subtarget->hasUsableDSOffset() ||
1302 Subtarget->unsafeDSOffsetFoldingEnabled())
1303 return true;
1304
1305 // On Southern Islands instruction with a negative base value and an offset
1306 // don't seem to work.
1307 return CurDAG->SignBitIsZero(Base);
1308}
1309
1310bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
1311 SDValue &Offset) const {
1312 SDLoc DL(Addr);
1313 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1314 SDValue N0 = Addr.getOperand(0);
1315 SDValue N1 = Addr.getOperand(1);
1316 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1317 if (isDSOffsetLegal(N0, C1->getSExtValue())) {
1318 // (add n0, c0)
1319 Base = N0;
1320 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
1321 return true;
1322 }
1323 } else if (Addr.getOpcode() == ISD::SUB) {
1324 // sub C, x -> add (sub 0, x), C
1325 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
1326 int64_t ByteOffset = C->getSExtValue();
1327 if (isDSOffsetLegal(SDValue(), ByteOffset)) {
1328 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1329
1330 // XXX - This is kind of hacky. Create a dummy sub node so we can check
1331 // the known bits in isDSOffsetLegal. We need to emit the selected node
1332 // here, so this is thrown away.
1333 SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
1334 Zero, Addr.getOperand(1));
1335
1336 if (isDSOffsetLegal(Sub, ByteOffset)) {
1338 Opnds.push_back(Zero);
1339 Opnds.push_back(Addr.getOperand(1));
1340
1341 // FIXME: Select to VOP3 version for with-carry.
1342 unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
1343 if (Subtarget->hasAddNoCarryInsts()) {
1344 SubOp = AMDGPU::V_SUB_U32_e64;
1345 Opnds.push_back(
1346 CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
1347 }
1348
1349 MachineSDNode *MachineSub =
1350 CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);
1351
1352 Base = SDValue(MachineSub, 0);
1353 Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
1354 return true;
1355 }
1356 }
1357 }
1358 } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1359 // If we have a constant address, prefer to put the constant into the
1360 // offset. This can save moves to load the constant address since multiple
1361 // operations can share the zero base address register, and enables merging
1362 // into read2 / write2 instructions.
1363
1364 SDLoc DL(Addr);
1365
1366 if (isDSOffsetLegal(SDValue(), CAddr->getZExtValue())) {
1367 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1368 MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
1369 DL, MVT::i32, Zero);
1370 Base = SDValue(MovZero, 0);
1371 Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
1372 return true;
1373 }
1374 }
1375
1376 // default case
1377 Base = Addr;
1378 Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16);
1379 return true;
1380}
1381
1382bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
1383 unsigned Offset1,
1384 unsigned Size) const {
1385 if (Offset0 % Size != 0 || Offset1 % Size != 0)
1386 return false;
1387 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
1388 return false;
1389
1390 if (!Base || Subtarget->hasUsableDSOffset() ||
1391 Subtarget->unsafeDSOffsetFoldingEnabled())
1392 return true;
1393
1394 // On Southern Islands instruction with a negative base value and an offset
1395 // don't seem to work.
1396 return CurDAG->SignBitIsZero(Base);
1397}
1398
1399// Return whether the operation has NoUnsignedWrap property.
1400static bool isNoUnsignedWrap(SDValue Addr) {
1401 return (Addr.getOpcode() == ISD::ADD &&
1402 Addr->getFlags().hasNoUnsignedWrap()) ||
1403 Addr->getOpcode() == ISD::OR;
1404}
1405
1406// Check that the base address of flat scratch load/store in the form of `base +
1407// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
1408// requirement). We always treat the first operand as the base address here.
1409bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr) const {
1410 if (isNoUnsignedWrap(Addr))
1411 return true;
1412
1413 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1414 // values.
1415 if (Subtarget->hasSignedScratchOffsets())
1416 return true;
1417
1418 auto LHS = Addr.getOperand(0);
1419 auto RHS = Addr.getOperand(1);
1420
1421 // If the immediate offset is negative and within certain range, the base
1422 // address cannot also be negative. If the base is also negative, the sum
1423 // would be either negative or much larger than the valid range of scratch
1424 // memory a thread can access.
1425 ConstantSDNode *ImmOp = nullptr;
1426 if (Addr.getOpcode() == ISD::ADD && (ImmOp = dyn_cast<ConstantSDNode>(RHS))) {
1427 if (ImmOp->getSExtValue() < 0 && ImmOp->getSExtValue() > -0x40000000)
1428 return true;
1429 }
1430
1431 return CurDAG->SignBitIsZero(LHS);
1432}
1433
1434// Check address value in SGPR/VGPR are legal for flat scratch in the form
1435// of: SGPR + VGPR.
1436bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSV(SDValue Addr) const {
1437 if (isNoUnsignedWrap(Addr))
1438 return true;
1439
1440 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1441 // values.
1442 if (Subtarget->hasSignedScratchOffsets())
1443 return true;
1444
1445 auto LHS = Addr.getOperand(0);
1446 auto RHS = Addr.getOperand(1);
1447 return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
1448}
1449
1450// Check address value in SGPR/VGPR are legal for flat scratch in the form
1451// of: SGPR + VGPR + Imm.
1452bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSVImm(SDValue Addr) const {
1453 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1454 // values.
1455 if (AMDGPU::isGFX12Plus(*Subtarget))
1456 return true;
1457
1458 auto Base = Addr.getOperand(0);
1459 auto *RHSImm = cast<ConstantSDNode>(Addr.getOperand(1));
1460 // If the immediate offset is negative and within certain range, the base
1461 // address cannot also be negative. If the base is also negative, the sum
1462 // would be either negative or much larger than the valid range of scratch
1463 // memory a thread can access.
1464 if (isNoUnsignedWrap(Base) &&
1465 (isNoUnsignedWrap(Addr) ||
1466 (RHSImm->getSExtValue() < 0 && RHSImm->getSExtValue() > -0x40000000)))
1467 return true;
1468
1469 auto LHS = Base.getOperand(0);
1470 auto RHS = Base.getOperand(1);
1471 return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
1472}
1473
1474// TODO: If offset is too big, put low 16-bit into offset.
1475bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
1476 SDValue &Offset0,
1477 SDValue &Offset1) const {
1478 return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 4);
1479}
1480
1481bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base,
1482 SDValue &Offset0,
1483 SDValue &Offset1) const {
1484 return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 8);
1485}
1486
1487bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
1488 SDValue &Offset0, SDValue &Offset1,
1489 unsigned Size) const {
1490 SDLoc DL(Addr);
1491
1492 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1493 SDValue N0 = Addr.getOperand(0);
1494 SDValue N1 = Addr.getOperand(1);
1495 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1496 unsigned OffsetValue0 = C1->getZExtValue();
1497 unsigned OffsetValue1 = OffsetValue0 + Size;
1498
1499 // (add n0, c0)
1500 if (isDSOffset2Legal(N0, OffsetValue0, OffsetValue1, Size)) {
1501 Base = N0;
1502 Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
1503 Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
1504 return true;
1505 }
1506 } else if (Addr.getOpcode() == ISD::SUB) {
1507 // sub C, x -> add (sub 0, x), C
1508 if (const ConstantSDNode *C =
1510 unsigned OffsetValue0 = C->getZExtValue();
1511 unsigned OffsetValue1 = OffsetValue0 + Size;
1512
1513 if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
1514 SDLoc DL(Addr);
1515 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1516
1517 // XXX - This is kind of hacky. Create a dummy sub node so we can check
1518 // the known bits in isDSOffsetLegal. We need to emit the selected node
1519 // here, so this is thrown away.
1520 SDValue Sub =
1521 CurDAG->getNode(ISD::SUB, DL, MVT::i32, Zero, Addr.getOperand(1));
1522
1523 if (isDSOffset2Legal(Sub, OffsetValue0, OffsetValue1, Size)) {
1525 Opnds.push_back(Zero);
1526 Opnds.push_back(Addr.getOperand(1));
1527 unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
1528 if (Subtarget->hasAddNoCarryInsts()) {
1529 SubOp = AMDGPU::V_SUB_U32_e64;
1530 Opnds.push_back(
1531 CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
1532 }
1533
1534 MachineSDNode *MachineSub = CurDAG->getMachineNode(
1535 SubOp, DL, MVT::getIntegerVT(Size * 8), Opnds);
1536
1537 Base = SDValue(MachineSub, 0);
1538 Offset0 =
1539 CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
1540 Offset1 =
1541 CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
1542 return true;
1543 }
1544 }
1545 }
1546 } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1547 unsigned OffsetValue0 = CAddr->getZExtValue();
1548 unsigned OffsetValue1 = OffsetValue0 + Size;
1549
1550 if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
1551 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1552 MachineSDNode *MovZero =
1553 CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);
1554 Base = SDValue(MovZero, 0);
1555 Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
1556 Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
1557 return true;
1558 }
1559 }
1560
1561 // default case
1562
1563 Base = Addr;
1564 Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i32);
1565 Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i32);
1566 return true;
1567}
1568
1569bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
1570 SDValue &SOffset, SDValue &Offset,
1571 SDValue &Offen, SDValue &Idxen,
1572 SDValue &Addr64) const {
1573 // Subtarget prefers to use flat instruction
1574 // FIXME: This should be a pattern predicate and not reach here
1575 if (Subtarget->useFlatForGlobal())
1576 return false;
1577
1578 SDLoc DL(Addr);
1579
1580 Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
1581 Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
1582 Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
1583 SOffset = Subtarget->hasRestrictedSOffset()
1584 ? CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32)
1585 : CurDAG->getTargetConstant(0, DL, MVT::i32);
1586
1587 ConstantSDNode *C1 = nullptr;
1588 SDValue N0 = Addr;
1589 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1590 C1 = cast<ConstantSDNode>(Addr.getOperand(1));
1591 if (isUInt<32>(C1->getZExtValue()))
1592 N0 = Addr.getOperand(0);
1593 else
1594 C1 = nullptr;
1595 }
1596
1597 if (N0->isAnyAdd()) {
1598 // (add N2, N3) -> addr64, or
1599 // (add (add N2, N3), C1) -> addr64
1600 SDValue N2 = N0.getOperand(0);
1601 SDValue N3 = N0.getOperand(1);
1602 Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
1603
1604 if (N2->isDivergent()) {
1605 if (N3->isDivergent()) {
1606 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
1607 // addr64, and construct the resource from a 0 address.
1608 Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
1609 VAddr = N0;
1610 } else {
1611 // N2 is divergent, N3 is not.
1612 Ptr = N3;
1613 VAddr = N2;
1614 }
1615 } else {
1616 // N2 is not divergent.
1617 Ptr = N2;
1618 VAddr = N3;
1619 }
1620 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1621 } else if (N0->isDivergent()) {
1622 // N0 is divergent. Use it as the addr64, and construct the resource from a
1623 // 0 address.
1624 Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
1625 VAddr = N0;
1626 Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
1627 } else {
1628 // N0 -> offset, or
1629 // (N0 + C1) -> offset
1630 VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
1631 Ptr = N0;
1632 }
1633
1634 if (!C1) {
1635 // No offset.
1636 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1637 return true;
1638 }
1639
1640 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1641 if (TII->isLegalMUBUFImmOffset(C1->getZExtValue())) {
1642 // Legal offset for instruction.
1643 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
1644 return true;
1645 }
1646
1647 // Illegal offset, store it in soffset.
1648 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1649 SOffset =
1650 SDValue(CurDAG->getMachineNode(
1651 AMDGPU::S_MOV_B32, DL, MVT::i32,
1652 CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
1653 0);
1654 return true;
1655}
1656
1657bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
1658 SDValue &VAddr, SDValue &SOffset,
1659 SDValue &Offset) const {
1660 SDValue Ptr, Offen, Idxen, Addr64;
1661
1662 // addr64 bit was removed for volcanic islands.
1663 // FIXME: This should be a pattern predicate and not reach here
1664 if (!Subtarget->hasAddr64())
1665 return false;
1666
1667 if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
1668 return false;
1669
1670 ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
1671 if (C->getSExtValue()) {
1672 SDLoc DL(Addr);
1673
1674 const SITargetLowering& Lowering =
1675 *static_cast<const SITargetLowering*>(getTargetLowering());
1676
1677 SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
1678 return true;
1679 }
1680
1681 return false;
1682}
1683
1684std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
1685 SDLoc DL(N);
1686
1687 auto *FI = dyn_cast<FrameIndexSDNode>(N);
1688 SDValue TFI =
1689 FI ? CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)) : N;
1690
1691 // We rebase the base address into an absolute stack address and hence
1692 // use constant 0 for soffset. This value must be retained until
1693 // frame elimination and eliminateFrameIndex will choose the appropriate
1694 // frame register if need be.
1695 return std::pair(TFI, CurDAG->getTargetConstant(0, DL, MVT::i32));
1696}
1697
// Match a scratch (private) address for a MUBUF access that uses a vector
// address operand. Always returns true.
//
// Rsrc is set to the scratch resource register recorded in
// SIMachineFunctionInfo. The remaining outputs depend on the address shape:
//  - a constant address other than NullPtr: the bits outside the maximum MUBUF
//    immediate go into VAddr through a V_MOV_B32, the rest into ImmOffset,
//    and SOffset is 0;
//  - (base + constant) with a legal immediate (and, where private memory is
//    range checked, a base whose sign bit is known zero): the base goes
//    through foldFrameIndex and the constant becomes ImmOffset;
//  - anything else: the whole address goes through foldFrameIndex with an
//    ImmOffset of 0.
// Parent is not referenced in this function.
1698bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
1699 SDValue Addr, SDValue &Rsrc,
1700 SDValue &VAddr, SDValue &SOffset,
1701 SDValue &ImmOffset) const {
1702
1703 SDLoc DL(Addr);
1704 MachineFunction &MF = CurDAG->getMachineFunction();
1705 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1706
1707 Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1708
1709 if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1710 int64_t Imm = CAddr->getSExtValue();
1711 const int64_t NullPtr =
// NOTE(review): the initializer of NullPtr (listing line 1712) is missing
// from this listing. Presumably it is the null pointer value of the private
// address space - confirm against the original source.
1713 // Don't fold null pointer.
1714 if (Imm != NullPtr) {
1715 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
// Split the constant: bits above the immediate range are materialized into
// VAddr, the low bits stay in the immediate field.
1716 SDValue HighBits =
1717 CurDAG->getTargetConstant(Imm & ~MaxOffset, DL, MVT::i32);
1718 MachineSDNode *MovHighBits = CurDAG->getMachineNode(
1719 AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
1720 VAddr = SDValue(MovHighBits, 0);
1721
1722 SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1723 ImmOffset = CurDAG->getTargetConstant(Imm & MaxOffset, DL, MVT::i32);
1724 return true;
1725 }
1726 }
1727
1728 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1729 // (add n0, c1)
1730
1731 SDValue N0 = Addr.getOperand(0);
1732 uint64_t C1 = Addr.getConstantOperandVal(1);
1733
1734 // Offsets in vaddr must be positive if range checking is enabled.
1735 //
1736 // The total computation of vaddr + soffset + offset must not overflow. If
1737 // vaddr is negative, even if offset is 0 the sgpr offset add will end up
1738 // overflowing.
1739 //
1740 // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
1741 // always perform a range check. If a negative vaddr base index was used,
1742 // this would fail the range check. The overall address computation would
1743 // compute a valid address, but this doesn't happen due to the range
1744 // check. For out-of-bounds MUBUF loads, a 0 is returned.
1745 //
1746 // Therefore it should be safe to fold any VGPR offset on gfx9 into the
1747 // MUBUF vaddr, but not on older subtargets which can only do this if the
1748 // sign bit is known 0.
1749 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1750 if (TII->isLegalMUBUFImmOffset(C1) &&
1751 (!Subtarget->privateMemoryResourceIsRangeChecked() ||
1752 CurDAG->SignBitIsZero(N0))) {
1753 std::tie(VAddr, SOffset) = foldFrameIndex(N0);
1754 ImmOffset = CurDAG->getTargetConstant(C1, DL, MVT::i32);
1755 return true;
1756 }
1757 }
1758
1759 // (node)
1760 std::tie(VAddr, SOffset) = foldFrameIndex(Addr);
1761 ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1762 return true;
1763}
1764
1765static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) {
1766 if (Val.getOpcode() != ISD::CopyFromReg)
1767 return false;
1768 auto Reg = cast<RegisterSDNode>(Val.getOperand(1))->getReg();
1769 if (!Reg.isPhysical())
1770 return false;
1771 const auto *RC = TRI.getPhysRegBaseClass(Reg);
1772 return RC && TRI.isSGPRClass(RC);
1773}
1774
1775bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
1776 SDValue Addr,
1777 SDValue &SRsrc,
1778 SDValue &SOffset,
1779 SDValue &Offset) const {
1780 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
1781 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1782 MachineFunction &MF = CurDAG->getMachineFunction();
1783 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1784 SDLoc DL(Addr);
1785
1786 // CopyFromReg <sgpr>
1787 if (IsCopyFromSGPR(*TRI, Addr)) {
1788 SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1789 SOffset = Addr;
1790 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1791 return true;
1792 }
1793
1794 ConstantSDNode *CAddr;
1795 if (Addr.getOpcode() == ISD::ADD) {
1796 // Add (CopyFromReg <sgpr>) <constant>
1797 CAddr = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
1798 if (!CAddr || !TII->isLegalMUBUFImmOffset(CAddr->getZExtValue()))
1799 return false;
1800 if (!IsCopyFromSGPR(*TRI, Addr.getOperand(0)))
1801 return false;
1802
1803 SOffset = Addr.getOperand(0);
1804 } else if ((CAddr = dyn_cast<ConstantSDNode>(Addr)) &&
1805 TII->isLegalMUBUFImmOffset(CAddr->getZExtValue())) {
1806 // <constant>
1807 SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1808 } else {
1809 return false;
1810 }
1811
1812 SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1813
1814 Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i32);
1815 return true;
1816}
1817
1818bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
1819 SDValue &SOffset, SDValue &Offset
1820 ) const {
1821 SDValue Ptr, VAddr, Offen, Idxen, Addr64;
1822 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1823
1824 if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
1825 return false;
1826
1827 if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
1828 !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
1829 !cast<ConstantSDNode>(Addr64)->getSExtValue()) {
1830 uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
1831 maskTrailingOnes<uint64_t>(32); // Size
1832 SDLoc DL(Addr);
1833
1834 const SITargetLowering& Lowering =
1835 *static_cast<const SITargetLowering*>(getTargetLowering());
1836
1837 SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
1838 return true;
1839 }
1840 return false;
1841}
1842
1843bool AMDGPUDAGToDAGISel::SelectBUFSOffset(SDValue ByteOffsetNode,
1844 SDValue &SOffset) const {
1845 if (Subtarget->hasRestrictedSOffset() && isNullConstant(ByteOffsetNode)) {
1846 SOffset = CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32);
1847 return true;
1848 }
1849
1850 SOffset = ByteOffsetNode;
1851 return true;
1852}
1853
1854// Find a load or store from corresponding pattern root.
1855// Roots may be build_vector, bitconvert or their combinations.
1858 if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
1859 return MN;
1861 for (SDValue V : N->op_values())
1862 if (MemSDNode *MN =
1864 return MN;
1865 llvm_unreachable("cannot find MemSDNode in the pattern!");
1866}
1867
1868bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(
1869 SDNode *N, SDValue Addr, SDValue &VAddr, SDValue &Offset,
1870 AMDGPU::FlatAddrSpace FlatVariant) const {
1872 int64_t OffsetVal = 0;
1873
1874 unsigned AS = findMemSDNode(N)->getAddressSpace();
1875
1876 bool CanHaveFlatSegmentOffsetBug =
1877 Subtarget->hasFlatSegmentOffsetBug() &&
1878 FlatVariant == FlatAddrSpace::FLAT &&
1880
1881 if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
1882 SDValue N0, N1;
1883 if (isBaseWithConstantOffset64(Addr, N0, N1) &&
1884 (FlatVariant != FlatAddrSpace::FlatScratch ||
1885 isFlatScratchBaseLegal(Addr))) {
1886 int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
1887
1888 // Adding the offset to the base address in a FLAT instruction must not
1889 // change the memory aperture in which the address falls. Therefore we can
1890 // only fold offsets from inbounds GEPs into FLAT instructions.
1891 bool IsInBounds =
1892 Addr.getOpcode() == ISD::PTRADD && Addr->getFlags().hasInBounds();
1893 if (COffsetVal == 0 || FlatVariant != FlatAddrSpace::FLAT || IsInBounds) {
1894 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1895 if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
1896 Addr = N0;
1897 OffsetVal = COffsetVal;
1898 } else {
1899 // If the offset doesn't fit, put the low bits into the offset field
1900 // and add the rest.
1901 //
1902 // For a FLAT instruction the hardware decides whether to access
1903 // global/scratch/shared memory based on the high bits of vaddr,
1904 // ignoring the offset field, so we have to ensure that when we add
1905 // remainder to vaddr it still points into the same underlying object.
1906 // The easiest way to do that is to make sure that we split the offset
1907 // into two pieces that are both >= 0 or both <= 0.
1908
1909 SDLoc DL(N);
1910 uint64_t RemainderOffset;
1911
1912 std::tie(OffsetVal, RemainderOffset) =
1913 TII->splitFlatOffset(COffsetVal, AS, FlatVariant);
1914
1915 SDValue AddOffsetLo =
1916 getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
1917 SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
1918
1919 if (Addr.getValueType().getSizeInBits() == 32) {
1921 Opnds.push_back(N0);
1922 Opnds.push_back(AddOffsetLo);
1923 unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
1924 if (Subtarget->hasAddNoCarryInsts()) {
1925 AddOp = AMDGPU::V_ADD_U32_e64;
1926 Opnds.push_back(Clamp);
1927 }
1928 Addr =
1929 SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
1930 } else {
1931 // TODO: Should this try to use a scalar add pseudo if the base
1932 // address is uniform and saddr is usable?
1933 SDValue Sub0 =
1934 CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
1935 SDValue Sub1 =
1936 CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
1937
1938 SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1939 DL, MVT::i32, N0, Sub0);
1940 SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1941 DL, MVT::i32, N0, Sub1);
1942
1943 SDValue AddOffsetHi =
1944 getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
1945
1946 SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
1947
1948 SDNode *Add =
1949 CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
1950 {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
1951
1952 SDNode *Addc = CurDAG->getMachineNode(
1953 AMDGPU::V_ADDC_U32_e64, DL, VTs,
1954 {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
1955
1956 SDValue RegSequenceArgs[] = {
1957 CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL,
1958 MVT::i32),
1959 SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
1960
1961 Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
1962 MVT::i64, RegSequenceArgs),
1963 0);
1964 }
1965 }
1966 }
1967 }
1968 }
1969
1970 VAddr = Addr;
1971 Offset = CurDAG->getSignedTargetConstant(OffsetVal, SDLoc(), MVT::i32);
1972 return true;
1973}
1974
1975bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue Addr,
1976 SDValue &VAddr,
1977 SDValue &Offset) const {
1978 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
1980}
1981
1982bool AMDGPUDAGToDAGISel::SelectGlobalOffset(SDNode *N, SDValue Addr,
1983 SDValue &VAddr,
1984 SDValue &Offset) const {
1985 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
1987}
1988
1989bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr,
1990 SDValue &VAddr,
1991 SDValue &Offset) const {
1992 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
1994}
1995
1996// If this matches *_extend i32:x, return x
1997// Otherwise if the value is I32 returns x.
1999 const SelectionDAG *DAG) {
2000 if (Op.getValueType() == MVT::i32)
2001 return Op;
2002
2003 if (Op.getOpcode() != (IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND) &&
2004 Op.getOpcode() != ISD::ANY_EXTEND &&
2005 !(DAG->SignBitIsZero(Op) &&
2006 Op.getOpcode() == (IsSigned ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND)))
2007 return SDValue();
2008
2009 SDValue ExtSrc = Op.getOperand(0);
2010 return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
2011}
2012
2013// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
2014// or (64-bit SGPR base) + (sext vgpr offset) + sext(imm offset)
//
// On success SAddr receives the uniform base, VOffset the 32-bit variable
// offset (or a V_MOV_B32 materializing a constant/zero), and Offset the
// immediate as an i32 target constant. ScaleOffset is reset to false on entry
// and set from SelectScaleOffset / the MAD match below. When NeedIOffset is
// false no immediate is folded: ImmOffset and SplitImmOffset both stay 0.
2015bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr,
2016 SDValue &SAddr, SDValue &VOffset,
2017 SDValue &Offset, bool &ScaleOffset,
2018 bool NeedIOffset) const {
2020 int64_t ImmOffset = 0;
2021 ScaleOffset = false;
2022
2023 // Match the immediate offset first, which canonically is moved as low as
2024 // possible.
2025
2026 SDValue LHS, RHS;
2027 if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
2028 int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
2029 const SIInstrInfo *TII = Subtarget->getInstrInfo();
2030
2031 if (NeedIOffset &&
2032 TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
2033 FlatAddrSpace::FlatGlobal)) {
 // The whole constant fits the immediate field: peel it off and keep
 // matching the remaining base below.
2034 Addr = LHS;
2035 ImmOffset = COffsetVal;
2036 } else if (!LHS->isDivergent()) {
 // Uniform base plus a constant that was not folded as an immediate.
2037 if (COffsetVal > 0) {
2038 SDLoc SL(N);
2039 // saddr + large_offset -> saddr +
2040 // (voffset = large_offset & ~MaxOffset) +
2041 // (large_offset & MaxOffset);
2042 int64_t SplitImmOffset = 0, RemainderOffset = COffsetVal;
2043 if (NeedIOffset) {
2044 std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
2045 COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, FlatAddrSpace::FlatGlobal);
2046 }
2047
 // The remainder must be representable in the 32-bit voffset, signed or
 // unsigned depending on the subtarget.
2048 if (Subtarget->hasSignedGVSOffset() ? isInt<32>(RemainderOffset)
2049 : isUInt<32>(RemainderOffset)) {
2050 SDNode *VMov = CurDAG->getMachineNode(
2051 AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
2052 CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
2053 VOffset = SDValue(VMov, 0);
2054 SAddr = LHS;
2055 Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
2056 return true;
2057 }
2058 }
2059
2060 // We are adding a 64 bit SGPR and a constant. If constant bus limit
2061 // is 1 we would need to perform 1 or 2 extra moves for each half of
2062 // the constant and it is better to do a scalar add and then issue a
2063 // single VALU instruction to materialize zero. Otherwise it is less
2064 // instructions to perform VALU adds with immediates or inline literals.
 // NumLiterals counts the 32-bit halves of the constant that are not
 // inline constants (0, 1 or 2).
2065 unsigned NumLiterals =
2066 !TII->isInlineConstant(APInt(32, Lo_32(COffsetVal))) +
2067 !TII->isInlineConstant(APInt(32, Hi_32(COffsetVal)));
2068 if (Subtarget->getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
2069 return false;
2070 }
2071 }
2072
2073 // Match the variable offset.
2074 if (Addr->isAnyAdd()) {
2075 LHS = Addr.getOperand(0);
2076
2077 if (!LHS->isDivergent()) {
2078 // add (i64 sgpr), (*_extend (i32 vgpr))
2079 RHS = Addr.getOperand(1);
 // SelectScaleOffset may rewrite RHS to the pre-scaled value.
2080 ScaleOffset = SelectScaleOffset(N, RHS, Subtarget->hasSignedGVSOffset());
2081 if (SDValue ExtRHS = matchExtFromI32orI32(
2082 RHS, Subtarget->hasSignedGVSOffset(), CurDAG)) {
2083 SAddr = LHS;
2084 VOffset = ExtRHS;
2085 }
2086 }
2087
 // Retry with the operands swapped if the first orientation did not match.
 // RHS is re-read from Addr, discarding any rewrite made above.
2088 RHS = Addr.getOperand(1);
2089 if (!SAddr && !RHS->isDivergent()) {
2090 // add (*_extend (i32 vgpr)), (i64 sgpr)
2091 ScaleOffset = SelectScaleOffset(N, LHS, Subtarget->hasSignedGVSOffset());
2092 if (SDValue ExtLHS = matchExtFromI32orI32(
2093 LHS, Subtarget->hasSignedGVSOffset(), CurDAG)) {
2094 SAddr = RHS;
2095 VOffset = ExtLHS;
2096 }
2097 }
2098
2099 if (SAddr) {
2100 Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
2101 return true;
2102 }
2103 }
2104
 // Scaled offset expressed as a multiply-add with a divergent first operand
 // and a uniform addend.
 // NOTE(review): part of this condition is not visible in this listing;
 // confirm against the upstream file before editing it.
2105 if (Subtarget->hasScaleOffset() &&
2106 (Addr.getOpcode() == (Subtarget->hasSignedGVSOffset()
2109 (Addr.getOpcode() == AMDGPUISD::MAD_U64_U32 &&
2110 CurDAG->SignBitIsZero(Addr.getOperand(0)))) &&
2111 Addr.getOperand(0)->isDivergent() &&
2113 !Addr.getOperand(2)->isDivergent()) {
2114 // mad_u64_u32 (i32 vgpr), (i32 c), (i64 sgpr)
2115 unsigned Size =
2116 (unsigned)cast<MemSDNode>(N)->getMemoryVT().getFixedSizeInBits() / 8;
 // Only usable when the multiplier equals the access size in bytes.
2117 ScaleOffset = Addr.getConstantOperandVal(1) == Size;
2118 if (ScaleOffset) {
2119 SAddr = Addr.getOperand(2);
2120 VOffset = Addr.getOperand(0);
 // NOTE(review): ImmOffset is a signed int64_t; the other return paths
 // that use it call getSignedTargetConstant. Confirm whether a negative
 // ImmOffset can reach this point.
2121 Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
2122 return true;
2123 }
2124 }
2125
 // Nothing left to use as a uniform base.
2126 if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
2127 isa<ConstantSDNode>(Addr))
2128 return false;
2129
2130 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
2131 // moves required to copy a 64-bit SGPR to VGPR.
2132 SAddr = Addr;
2133 SDNode *VMov =
2134 CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
2135 CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
2136 VOffset = SDValue(VMov, 0);
2137 Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
2138 return true;
2139}
2140
2141bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr,
2142 SDValue &SAddr, SDValue &VOffset,
2143 SDValue &Offset,
2144 SDValue &CPol) const {
2145 bool ScaleOffset;
2146 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2147 return false;
2148
2149 CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2150 SDLoc(), MVT::i32);
2151 return true;
2152}
2153
2154bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPol(SDNode *N, SDValue Addr,
2155 SDValue &SAddr, SDValue &VOffset,
2156 SDValue &Offset,
2157 SDValue &CPol) const {
2158 bool ScaleOffset;
2159 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2160 return false;
2161
2162 // We are assuming CPol is always the last operand of the intrinsic.
2163 auto PassedCPol =
2164 N->getConstantOperandVal(N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL;
2165 CPol = CurDAG->getTargetConstant(
2166 (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
2167 return true;
2168}
2169
2170bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPolM0(SDNode *N, SDValue Addr,
2171 SDValue &SAddr,
2172 SDValue &VOffset,
2173 SDValue &Offset,
2174 SDValue &CPol) const {
2175 bool ScaleOffset;
2176 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2177 return false;
2178
2179 // We are assuming CPol is second from last operand of the intrinsic.
2180 auto PassedCPol =
2181 N->getConstantOperandVal(N->getNumOperands() - 2) & ~AMDGPU::CPol::SCAL;
2182 CPol = CurDAG->getTargetConstant(
2183 (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
2184 return true;
2185}
2186
2187bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr,
2188 SDValue &SAddr, SDValue &VOffset,
2189 SDValue &Offset,
2190 SDValue &CPol) const {
2191 bool ScaleOffset;
2192 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2193 return false;
2194
2195 unsigned CPolVal = (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | AMDGPU::CPol::GLC;
2196 CPol = CurDAG->getTargetConstant(CPolVal, SDLoc(), MVT::i32);
2197 return true;
2198}
2199
2200bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffset(SDNode *N, SDValue Addr,
2201 SDValue &SAddr,
2202 SDValue &VOffset,
2203 SDValue &CPol) const {
2204 bool ScaleOffset;
2205 SDValue DummyOffset;
2206 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, DummyOffset, ScaleOffset,
2207 false))
2208 return false;
2209
2210 // We are assuming CPol is always the last operand of the intrinsic.
2211 auto PassedCPol =
2212 N->getConstantOperandVal(N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL;
2213 CPol = CurDAG->getTargetConstant(
2214 (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
2215 return true;
2216}
2217
2218bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffsetM0(SDNode *N, SDValue Addr,
2219 SDValue &SAddr,
2220 SDValue &VOffset,
2221 SDValue &CPol) const {
2222 bool ScaleOffset;
2223 SDValue DummyOffset;
2224 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, DummyOffset, ScaleOffset,
2225 false))
2226 return false;
2227
2228 // We are assuming CPol is second from last operand of the intrinsic.
2229 auto PassedCPol =
2230 N->getConstantOperandVal(N->getNumOperands() - 2) & ~AMDGPU::CPol::SCAL;
2231 CPol = CurDAG->getTargetConstant(
2232 (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
2233 return true;
2234}
2235
2237 if (auto *FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
2238 SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
2239 } else if (SAddr.getOpcode() == ISD::ADD &&
2241 // Materialize this into a scalar move for scalar address to avoid
2242 // readfirstlane.
2243 auto *FI = cast<FrameIndexSDNode>(SAddr.getOperand(0));
2244 SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
2245 FI->getValueType(0));
2246 SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, SDLoc(SAddr),
2247 MVT::i32, TFI, SAddr.getOperand(1)),
2248 0);
2249 }
2250
2251 return SAddr;
2252}
2253
2254// Match (32-bit SGPR base) + sext(imm offset)
//
// Fails only for a divergent Addr. Otherwise SAddr receives the (possibly
// frame-index-rewritten) base and Offset the immediate as a signed i32 target
// constant. Parent is not referenced in the lines visible here.
2255bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
2256 SDValue &SAddr,
2257 SDValue &Offset) const {
2259 if (Addr->isDivergent())
2260 return false;
2261
2262 SDLoc DL(Addr);
2263
2264 int64_t COffsetVal = 0;
2265
 // Peel a constant offset off the address only when the resulting base is
 // legal for flat scratch; otherwise use the whole address as the base.
2266 if (CurDAG->isBaseWithConstantOffset(Addr) && isFlatScratchBaseLegal(Addr)) {
2267 COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
2268 SAddr = Addr.getOperand(0);
2269 } else {
2270 SAddr = Addr;
2271 }
2272
2273 SAddr = SelectSAddrFI(CurDAG, SAddr);
2274
2275 const SIInstrInfo *TII = Subtarget->getInstrInfo();
2276
 // If the constant does not fit the immediate field, keep the part that
 // does as the immediate and add the remainder to the base with S_ADD_I32.
2277 if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
2278 FlatAddrSpace::FlatScratch)) {
2279 int64_t SplitImmOffset, RemainderOffset;
2280 std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
2281 COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, FlatAddrSpace::FlatScratch);
2282
2283 COffsetVal = SplitImmOffset;
2284
 // NOTE(review): the condition of this conditional expression is not
 // visible in this listing; confirm against the upstream file.
2285 SDValue AddOffset =
2287 ? getMaterializedScalarImm32(Lo_32(RemainderOffset), DL)
2288 : CurDAG->getSignedTargetConstant(RemainderOffset, DL, MVT::i32);
2289 SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, DL, MVT::i32,
2290 SAddr, AddOffset),
2291 0);
2292 }
2293
2294 Offset = CurDAG->getSignedTargetConstant(COffsetVal, DL, MVT::i32);
2295
2296 return true;
2297}
2298
2299// Check whether the flat scratch SVS swizzle bug affects this access.
2300bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug(
2301 SDValue VAddr, SDValue SAddr, uint64_t ImmOffset) const {
2302 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
2303 return false;
2304
2305 // The bug affects the swizzling of SVS accesses if there is any carry out
2306 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
2307 // voffset to (soffset + inst_offset).
2308 KnownBits VKnown = CurDAG->computeKnownBits(VAddr);
2309 KnownBits SKnown =
2310 KnownBits::add(CurDAG->computeKnownBits(SAddr),
2311 KnownBits::makeConstant(APInt(32, ImmOffset,
2312 /*isSigned=*/true)));
2313 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
2314 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
2315 return (VMax & 3) + (SMax & 3) >= 4;
2316}
2317
// Match a scratch address as (divergent vaddr) + (uniform saddr) + immediate.
// On success VAddr, SAddr, Offset and CPol are all set; CPol carries SCAL
// when SelectScaleOffset reports a scaled vaddr, otherwise 0.
2318bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
2319 SDValue &VAddr, SDValue &SAddr,
2320 SDValue &Offset,
2321 SDValue &CPol) const {
2322 int64_t ImmOffset = 0;
2323
2324 SDValue LHS, RHS;
 // Remember the unpeeled address: the legality checks below need to know
 // whether an immediate was split off.
2325 SDValue OrigAddr = Addr;
2326 if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
2327 int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
2328 const SIInstrInfo *TII = Subtarget->getInstrInfo();
2329
 // NOTE(review): the trailing argument of this call is not visible in this
 // listing; confirm against the upstream file.
2330 if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
2332 Addr = LHS;
2333 ImmOffset = COffsetVal;
2334 } else if (!LHS->isDivergent() && COffsetVal > 0) {
2335 SDLoc SL(N);
2336 // saddr + large_offset -> saddr + (vaddr = large_offset & ~MaxOffset) +
2337 // (large_offset & MaxOffset);
2338 int64_t SplitImmOffset, RemainderOffset;
 // NOTE(review): the trailing argument of this call is not visible in this
 // listing either.
2339 std::tie(SplitImmOffset, RemainderOffset) =
2340 TII->splitFlatOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
2342
2343 if (isUInt<32>(RemainderOffset)) {
2344 SDNode *VMov = CurDAG->getMachineNode(
2345 AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
2346 CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
 // VAddr and SAddr are written before the checks below, so they are
 // already modified if either check makes this function return false.
2347 VAddr = SDValue(VMov, 0);
2348 SAddr = LHS;
2349 if (!isFlatScratchBaseLegal(Addr))
2350 return false;
2351 if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
2352 return false;
2353 Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
2354 CPol = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
2355 return true;
2356 }
2357 }
2358 }
2359
2360 if (Addr.getOpcode() != ISD::ADD)
2361 return false;
2362
2363 LHS = Addr.getOperand(0);
2364 RHS = Addr.getOperand(1);
2365
 // Exactly one operand must be uniform (saddr) and the other divergent
 // (vaddr); either order is accepted.
2366 if (!LHS->isDivergent() && RHS->isDivergent()) {
2367 SAddr = LHS;
2368 VAddr = RHS;
2369 } else if (!RHS->isDivergent() && LHS->isDivergent()) {
2370 SAddr = RHS;
2371 VAddr = LHS;
2372 } else {
2373 return false;
2374 }
2375
 // OrigAddr != Addr means an immediate was peeled off above, which selects
 // the SV+imm legality check instead of the plain SV one.
2376 if (OrigAddr != Addr) {
2377 if (!isFlatScratchBaseLegalSVImm(OrigAddr))
2378 return false;
2379 } else {
2380 if (!isFlatScratchBaseLegalSV(OrigAddr))
2381 return false;
2382 }
2383
2384 if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
2385 return false;
2386 SAddr = SelectSAddrFI(CurDAG, SAddr);
2387 Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
2388
 // SelectScaleOffset may rewrite VAddr to the pre-scaled value.
2389 bool ScaleOffset = SelectScaleOffset(N, VAddr, true /* IsSigned */);
2390 CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2391 SDLoc(), MVT::i32);
2392 return true;
2393}
2394
2395// For unbuffered smem loads, it is illegal for the Immediate Offset to be
2396// negative if the resulting (Offset + (M0 or SOffset or zero) is negative.
2397// Handle the case where the Immediate Offset + SOffset is negative.
2398bool AMDGPUDAGToDAGISel::isSOffsetLegalWithImmOffset(SDValue *SOffset,
2399 bool Imm32Only,
2400 bool IsBuffer,
2401 int64_t ImmOffset) const {
2402 if (!IsBuffer && !Imm32Only && ImmOffset < 0 &&
2403 AMDGPU::hasSMRDSignedImmOffset(*Subtarget)) {
2404 KnownBits SKnown = CurDAG->computeKnownBits(*SOffset);
2405 if (ImmOffset + SKnown.getMinValue().getSExtValue() < 0)
2406 return false;
2407 }
2408
2409 return true;
2410}
2411
2412// Given \p Offset and load node \p N check if an \p Offset is a multiple of
2413// the load byte size. If it is update \p Offset to a pre-scaled value and
2414// return true.
2415bool AMDGPUDAGToDAGISel::SelectScaleOffset(SDNode *N, SDValue &Offset,
2416 bool IsSigned) const {
2417 bool ScaleOffset = false;
2418 if (!Subtarget->hasScaleOffset() || !Offset)
2419 return false;
2420
2421 unsigned Size =
2422 (unsigned)cast<MemSDNode>(N)->getMemoryVT().getFixedSizeInBits() / 8;
2423
2424 SDValue Off = Offset;
2425 if (SDValue Ext = matchExtFromI32orI32(Offset, IsSigned, CurDAG))
2426 Off = Ext;
2427
2428 if (isPowerOf2_32(Size) && Off.getOpcode() == ISD::SHL) {
2429 if (auto *C = dyn_cast<ConstantSDNode>(Off.getOperand(1)))
2430 ScaleOffset = C->getZExtValue() == Log2_32(Size);
2431 } else if (Offset.getOpcode() == ISD::MUL ||
2432 (IsSigned && Offset.getOpcode() == AMDGPUISD::MUL_I24) ||
2433 Offset.getOpcode() == AMDGPUISD::MUL_U24 ||
2434 (Offset.isMachineOpcode() &&
2435 Offset.getMachineOpcode() ==
2436 (IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO
2437 : AMDGPU::S_MUL_U64_U32_PSEUDO))) {
2438 if (auto *C = dyn_cast<ConstantSDNode>(Offset.getOperand(1)))
2439 ScaleOffset = C->getZExtValue() == Size;
2440 }
2441
2442 if (ScaleOffset)
2443 Offset = Off.getOperand(0);
2444
2445 return ScaleOffset;
2446}
2447
2448// Match an immediate (if Offset is not null) or an SGPR (if SOffset is
2449// not null) offset. If Imm32Only is true, match only 32-bit immediate
2450// offsets available on CI.
2451bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDNode *N, SDValue ByteOffsetNode,
2452 SDValue *SOffset, SDValue *Offset,
2453 bool Imm32Only, bool IsBuffer,
2454 bool HasSOffset, int64_t ImmOffset,
2455 bool *ScaleOffset) const {
2456 assert((!SOffset || !Offset) &&
2457 "Cannot match both soffset and offset at the same time!");
2458
2459 if (ScaleOffset) {
2460 assert(N && SOffset);
2461
2462 *ScaleOffset = SelectScaleOffset(N, ByteOffsetNode, false /* IsSigned */);
2463 }
2464
2465 ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
2466 if (!C) {
2467 if (!SOffset)
2468 return false;
2469
2470 if (ByteOffsetNode.getValueType().isScalarInteger() &&
2471 ByteOffsetNode.getValueType().getSizeInBits() == 32) {
2472 *SOffset = ByteOffsetNode;
2473 return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
2474 ImmOffset);
2475 }
2476 if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
2477 if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
2478 *SOffset = ByteOffsetNode.getOperand(0);
2479 return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
2480 ImmOffset);
2481 }
2482 }
2483 return false;
2484 }
2485
2486 SDLoc SL(ByteOffsetNode);
2487
2488 // GFX9 and GFX10 have signed byte immediate offsets. The immediate
2489 // offset for S_BUFFER instructions is unsigned.
2490 int64_t ByteOffset = IsBuffer ? C->getZExtValue() : C->getSExtValue();
2491 std::optional<int64_t> EncodedOffset = AMDGPU::getSMRDEncodedOffset(
2492 *Subtarget, ByteOffset, IsBuffer, HasSOffset);
2493 if (EncodedOffset && Offset && !Imm32Only) {
2494 *Offset = CurDAG->getSignedTargetConstant(*EncodedOffset, SL, MVT::i32);
2495 return true;
2496 }
2497
2498 // SGPR and literal offsets are unsigned.
2499 if (ByteOffset < 0)
2500 return false;
2501
2502 EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset);
2503 if (EncodedOffset && Offset && Imm32Only) {
2504 *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
2505 return true;
2506 }
2507
2508 if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset))
2509 return false;
2510
2511 if (SOffset) {
2512 SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
2513 *SOffset = SDValue(
2514 CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0);
2515 return true;
2516 }
2517
2518 return false;
2519}
2520
2521SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
2522 if (Addr.getValueType() != MVT::i32)
2523 return Addr;
2524
2525 // Zero-extend a 32-bit address.
2526 SDLoc SL(Addr);
2527
2528 const MachineFunction &MF = CurDAG->getMachineFunction();
2529 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2530 unsigned AddrHiVal = Info->get32BitAddressHighBits();
2531 SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);
2532
2533 const SDValue Ops[] = {
2534 CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
2535 Addr,
2536 CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
2537 SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
2538 0),
2539 CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
2540 };
2541
2542 return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
2543 Ops), 0);
2544}
2545
2546// Match a base and an immediate (if Offset is not null) or an SGPR (if
2547// SOffset is not null) or an immediate+SGPR offset. If Imm32Only is
2548// true, match only 32-bit immediate offsets available on CI.
2549bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDNode *N, SDValue Addr,
2550 SDValue &SBase, SDValue *SOffset,
2551 SDValue *Offset, bool Imm32Only,
2552 bool IsBuffer, bool HasSOffset,
2553 int64_t ImmOffset,
2554 bool *ScaleOffset) const {
2555 if (SOffset && Offset) {
2556 assert(!Imm32Only && !IsBuffer);
2557 SDValue B;
2558
2559 if (!SelectSMRDBaseOffset(N, Addr, B, nullptr, Offset, false, false, true))
2560 return false;
2561
2562 int64_t ImmOff = 0;
2563 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(*Offset))
2564 ImmOff = C->getSExtValue();
2565
2566 return SelectSMRDBaseOffset(N, B, SBase, SOffset, nullptr, false, false,
2567 true, ImmOff, ScaleOffset);
2568 }
2569
2570 // A 32-bit (address + offset) should not cause unsigned 32-bit integer
2571 // wraparound, because s_load instructions perform the addition in 64 bits.
2572 if (Addr.getValueType() == MVT::i32 && Addr.getOpcode() == ISD::ADD &&
2573 !Addr->getFlags().hasNoUnsignedWrap())
2574 return false;
2575
2576 SDValue N0, N1;
2577 // Extract the base and offset if possible.
2578 if (Addr->isAnyAdd() || CurDAG->isADDLike(Addr)) {
2579 N0 = Addr.getOperand(0);
2580 N1 = Addr.getOperand(1);
2581 } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
2582 assert(N0 && N1 && isa<ConstantSDNode>(N1));
2583 }
2584 if (!N0 || !N1)
2585 return false;
2586
2587 if (SelectSMRDOffset(N, N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
2588 ImmOffset, ScaleOffset)) {
2589 SBase = N0;
2590 return true;
2591 }
2592 if (SelectSMRDOffset(N, N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
2593 ImmOffset, ScaleOffset)) {
2594 SBase = N1;
2595 return true;
2596 }
2597 return false;
2598}
2599
2600bool AMDGPUDAGToDAGISel::SelectSMRD(SDNode *N, SDValue Addr, SDValue &SBase,
2601 SDValue *SOffset, SDValue *Offset,
2602 bool Imm32Only, bool *ScaleOffset) const {
2603 if (SelectSMRDBaseOffset(N, Addr, SBase, SOffset, Offset, Imm32Only,
2604 /* IsBuffer */ false, /* HasSOffset */ false,
2605 /* ImmOffset */ 0, ScaleOffset)) {
2606 SBase = Expand32BitAddress(SBase);
2607 return true;
2608 }
2609
2610 if (Addr.getValueType() == MVT::i32 && Offset && !SOffset) {
2611 SBase = Expand32BitAddress(Addr);
2612 *Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
2613 return true;
2614 }
2615
2616 return false;
2617}
2618
2619bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
2620 SDValue &Offset) const {
2621 return SelectSMRD(/* N */ nullptr, Addr, SBase, /* SOffset */ nullptr,
2622 &Offset);
2623}
2624
2625bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
2626 SDValue &Offset) const {
2627 assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2628 return SelectSMRD(/* N */ nullptr, Addr, SBase, /* SOffset */ nullptr,
2629 &Offset, /* Imm32Only */ true);
2630}
2631
2632bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDNode *N, SDValue Addr, SDValue &SBase,
2633 SDValue &SOffset, SDValue &CPol) const {
2634 bool ScaleOffset;
2635 if (!SelectSMRD(N, Addr, SBase, &SOffset, /* Offset */ nullptr,
2636 /* Imm32Only */ false, &ScaleOffset))
2637 return false;
2638
2639 CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2640 SDLoc(N), MVT::i32);
2641 return true;
2642}
2643
2644bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDNode *N, SDValue Addr,
2645 SDValue &SBase, SDValue &SOffset,
2646 SDValue &Offset,
2647 SDValue &CPol) const {
2648 bool ScaleOffset;
2649 if (!SelectSMRD(N, Addr, SBase, &SOffset, &Offset, false, &ScaleOffset))
2650 return false;
2651
2652 CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2653 SDLoc(N), MVT::i32);
2654 return true;
2655}
2656
2657bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue N, SDValue &Offset) const {
2658 return SelectSMRDOffset(/* N */ nullptr, N, /* SOffset */ nullptr, &Offset,
2659 /* Imm32Only */ false, /* IsBuffer */ true);
2660}
2661
2662bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue N,
2663 SDValue &Offset) const {
2664 assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2665 return SelectSMRDOffset(/* N */ nullptr, N, /* SOffset */ nullptr, &Offset,
2666 /* Imm32Only */ true, /* IsBuffer */ true);
2667}
2668
2669bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
2670 SDValue &Offset) const {
2671 // Match the (soffset + offset) pair as a 32-bit register base and
2672 // an immediate offset.
2673 return N.getValueType() == MVT::i32 &&
2674 SelectSMRDBaseOffset(/* N */ nullptr, N, /* SBase */ SOffset,
2675 /* SOffset*/ nullptr, &Offset,
2676 /* Imm32Only */ false, /* IsBuffer */ true);
2677}
2678
2679bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
2680 SDValue &Base,
2681 SDValue &Offset) const {
2682 SDLoc DL(Index);
2683
2684 if (CurDAG->isBaseWithConstantOffset(Index)) {
2685 SDValue N0 = Index.getOperand(0);
2686 SDValue N1 = Index.getOperand(1);
2687 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
2688
2689 // (add n0, c0)
2690 // Don't peel off the offset (c0) if doing so could possibly lead
2691 // the base (n0) to be negative.
2692 // (or n0, |c0|) can never change a sign given isBaseWithConstantOffset.
2693 if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0) ||
2694 (Index->getOpcode() == ISD::OR && C1->getSExtValue() >= 0)) {
2695 Base = N0;
2696 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
2697 return true;
2698 }
2699 }
2700
2701 if (isa<ConstantSDNode>(Index))
2702 return false;
2703
2704 Base = Index;
2705 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
2706 return true;
2707}
2708
2709SDNode *AMDGPUDAGToDAGISel::getBFE32(bool IsSigned, const SDLoc &DL,
2710 SDValue Val, uint32_t Offset,
2711 uint32_t Width) {
2712 if (Val->isDivergent()) {
2713 unsigned Opcode = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2714 SDValue Off = CurDAG->getTargetConstant(Offset, DL, MVT::i32);
2715 SDValue W = CurDAG->getTargetConstant(Width, DL, MVT::i32);
2716
2717 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, Off, W);
2718 }
2719 unsigned Opcode = IsSigned ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2720 // Transformation function, pack the offset and width of a BFE into
2721 // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
2722 // source, bits [5:0] contain the offset and bits [22:16] the width.
2723 uint32_t PackedVal = Offset | (Width << 16);
2724 SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32);
2725
2726 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst);
2727}
2728
2729void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
2730 // "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c)
2731 // "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c)
2732 // Predicate: 0 < b <= c < 32
2733
2734 const SDValue &Shl = N->getOperand(0);
2735 ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1));
2736 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
2737
2738 if (B && C) {
2739 uint32_t BVal = B->getZExtValue();
2740 uint32_t CVal = C->getZExtValue();
2741
2742 if (0 < BVal && BVal <= CVal && CVal < 32) {
2743 bool Signed = N->getOpcode() == ISD::SRA;
2744 ReplaceNode(N, getBFE32(Signed, SDLoc(N), Shl.getOperand(0), CVal - BVal,
2745 32 - CVal));
2746 return;
2747 }
2748 }
2749 SelectCode(N);
2750}
2751
2752void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
2753 switch (N->getOpcode()) {
2754 case ISD::AND:
2755 if (N->getOperand(0).getOpcode() == ISD::SRL) {
2756 // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
2757 // Predicate: isMask(mask)
2758 const SDValue &Srl = N->getOperand(0);
2759 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1));
2760 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));
2761
2762 if (Shift && Mask) {
2763 uint32_t ShiftVal = Shift->getZExtValue();
2764 uint32_t MaskVal = Mask->getZExtValue();
2765
2766 if (isMask_32(MaskVal)) {
2767 uint32_t WidthVal = llvm::popcount(MaskVal);
2768 ReplaceNode(N, getBFE32(false, SDLoc(N), Srl.getOperand(0), ShiftVal,
2769 WidthVal));
2770 return;
2771 }
2772 }
2773 }
2774 break;
2775 case ISD::SRL:
2776 if (N->getOperand(0).getOpcode() == ISD::AND) {
2777 // "(a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)"
2778 // Predicate: isMask(mask >> b)
2779 const SDValue &And = N->getOperand(0);
2780 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1));
2781 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1));
2782
2783 if (Shift && Mask) {
2784 uint32_t ShiftVal = Shift->getZExtValue();
2785 uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;
2786
2787 if (isMask_32(MaskVal)) {
2788 uint32_t WidthVal = llvm::popcount(MaskVal);
2789 ReplaceNode(N, getBFE32(false, SDLoc(N), And.getOperand(0), ShiftVal,
2790 WidthVal));
2791 return;
2792 }
2793 }
2794 } else if (N->getOperand(0).getOpcode() == ISD::SHL) {
2795 SelectS_BFEFromShifts(N);
2796 return;
2797 }
2798 break;
2799 case ISD::SRA:
2800 if (N->getOperand(0).getOpcode() == ISD::SHL) {
2801 SelectS_BFEFromShifts(N);
2802 return;
2803 }
2804 break;
2805
2807 // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
2808 SDValue Src = N->getOperand(0);
2809 if (Src.getOpcode() != ISD::SRL)
2810 break;
2811
2812 const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1));
2813 if (!Amt)
2814 break;
2815
2816 unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
2817 ReplaceNode(N, getBFE32(true, SDLoc(N), Src.getOperand(0),
2818 Amt->getZExtValue(), Width));
2819 return;
2820 }
2821 }
2822
2823 SelectCode(N);
2824}
2825
2826bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
2827 assert(N->getOpcode() == ISD::BRCOND);
2828 if (!N->hasOneUse())
2829 return false;
2830
2831 SDValue Cond = N->getOperand(1);
2832 if (Cond.getOpcode() == ISD::CopyToReg)
2833 Cond = Cond.getOperand(2);
2834
2835 if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
2836 return false;
2837
2838 MVT VT = Cond.getOperand(0).getSimpleValueType();
2839 if (VT == MVT::i32)
2840 return true;
2841
2842 if (VT == MVT::i64) {
2843 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
2844 return (CC == ISD::SETEQ || CC == ISD::SETNE) &&
2845 Subtarget->hasScalarCompareEq64();
2846 }
2847
2848 if ((VT == MVT::f16 || VT == MVT::f32) && Subtarget->hasSALUFloatInsts())
2849 return true;
2850
2851 return false;
2852}
2853
2854static SDValue combineBallotPattern(SDValue VCMP, bool &Negate) {
2855 assert(VCMP->getOpcode() == AMDGPUISD::SETCC);
2856 // Special case for amdgcn.ballot:
2857 // %Cond = i1 (and/or combination of i1 ISD::SETCCs)
2858 // %VCMP = i(WaveSize) AMDGPUISD::SETCC (ext %Cond), 0, setne/seteq
2859 // =>
2860 // Use i1 %Cond value instead of i(WaveSize) %VCMP.
2861 // This is possible because divergent ISD::SETCC is selected as V_CMP and
2862 // Cond becomes a i(WaveSize) full mask value.
2863 // Note that ballot doesn't use SETEQ condition but its easy to support it
2864 // here for completeness, so in this case Negate is set true on return.
2865 auto VCMP_CC = cast<CondCodeSDNode>(VCMP.getOperand(2))->get();
2866 if ((VCMP_CC == ISD::SETEQ || VCMP_CC == ISD::SETNE) &&
2867 isNullConstant(VCMP.getOperand(1))) {
2868
2869 auto Cond = VCMP.getOperand(0);
2870 if (ISD::isExtOpcode(Cond->getOpcode())) // Skip extension.
2871 Cond = Cond.getOperand(0);
2872
2873 if (isBoolSGPR(Cond)) {
2874 Negate = VCMP_CC == ISD::SETEQ;
2875 return Cond;
2876 }
2877 }
2878 return SDValue();
2879}
2880
2881void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
2882 SDValue Cond = N->getOperand(1);
2883
2884 if (Cond.isUndef()) {
2885 CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
2886 N->getOperand(2), N->getOperand(0));
2887 return;
2888 }
2889
2890 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2891
2892 bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
2893 bool AndExec = !UseSCCBr;
2894 bool Negate = false;
2895
2896 if (Cond.getOpcode() == ISD::SETCC &&
2897 Cond->getOperand(0)->getOpcode() == AMDGPUISD::SETCC) {
2898 SDValue VCMP = Cond->getOperand(0);
2899 auto CC = cast<CondCodeSDNode>(Cond->getOperand(2))->get();
2900 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
2901 isNullConstant(Cond->getOperand(1)) &&
2902 // We may encounter ballot.i64 in wave32 mode on -O0.
2903 VCMP.getValueType().getSizeInBits() == Subtarget->getWavefrontSize()) {
2904 // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
2905 // %C = i1 ISD::SETCC %VCMP, 0, setne/seteq
2906 // BRCOND i1 %C, %BB
2907 // =>
2908 // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
2909 // VCC = COPY i(WaveSize) %VCMP
2910 // S_CBRANCH_VCCNZ/VCCZ %BB
2911 Negate = CC == ISD::SETEQ;
2912 bool NegatedBallot = false;
2913 if (auto BallotCond = combineBallotPattern(VCMP, NegatedBallot)) {
2914 Cond = BallotCond;
2915 UseSCCBr = !BallotCond->isDivergent();
2916 Negate = Negate ^ NegatedBallot;
2917 } else {
2918 // TODO: don't use SCC here assuming that AMDGPUISD::SETCC is always
2919 // selected as V_CMP, but this may change for uniform condition.
2920 Cond = VCMP;
2921 UseSCCBr = false;
2922 }
2923 }
2924 // Cond is either V_CMP resulted from AMDGPUISD::SETCC or a combination of
2925 // V_CMPs resulted from ballot or ballot has uniform condition and SCC is
2926 // used.
2927 AndExec = false;
2928 }
2929
2930 unsigned BrOp =
2931 UseSCCBr ? (Negate ? AMDGPU::S_CBRANCH_SCC0 : AMDGPU::S_CBRANCH_SCC1)
2932 : (Negate ? AMDGPU::S_CBRANCH_VCCZ : AMDGPU::S_CBRANCH_VCCNZ);
2933 Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC();
2934 SDLoc SL(N);
2935
2936 if (AndExec) {
2937 // This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not
2938 // analyzed what generates the vcc value, so we do not know whether vcc
2939 // bits for disabled lanes are 0. Thus we need to mask out bits for
2940 // disabled lanes.
2941 //
2942 // For the case that we select S_CBRANCH_SCC1 and it gets
2943 // changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
2944 // SIInstrInfo::moveToVALU which inserts the S_AND).
2945 //
2946 // We could add an analysis of what generates the vcc value here and omit
2947 // the S_AND when is unnecessary. But it would be better to add a separate
2948 // pass after SIFixSGPRCopies to do the unnecessary S_AND removal, so it
2949 // catches both cases.
2950 Cond = SDValue(
2951 CurDAG->getMachineNode(
2952 Subtarget->isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64, SL,
2953 MVT::i1,
2954 CurDAG->getRegister(Subtarget->isWave32() ? AMDGPU::EXEC_LO
2955 : AMDGPU::EXEC,
2956 MVT::i1),
2957 Cond),
2958 0);
2959 }
2960
2961 SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond);
2962 CurDAG->SelectNodeTo(N, BrOp, MVT::Other,
2963 N->getOperand(2), // Basic Block
2964 VCC.getValue(0));
2965}
2966
2967void AMDGPUDAGToDAGISel::SelectFP_EXTEND(SDNode *N) {
2968 if (Subtarget->hasSALUFloatInsts() && N->getValueType(0) == MVT::f32 &&
2969 !N->isDivergent()) {
2970 SDValue Src = N->getOperand(0);
2971 if (Src.getValueType() == MVT::f16) {
2972 if (isExtractHiElt(Src, Src)) {
2973 CurDAG->SelectNodeTo(N, AMDGPU::S_CVT_HI_F32_F16, N->getVTList(),
2974 {Src});
2975 return;
2976 }
2977 }
2978 }
2979
2980 SelectCode(N);
2981}
2982
2983void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
2984 // The address is assumed to be uniform, so if it ends up in a VGPR, it will
2985 // be copied to an SGPR with readfirstlane.
2986 unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
2987 AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
2988
2989 SDValue Chain = N->getOperand(0);
2990 SDValue Ptr = N->getOperand(2);
2991 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2992 MachineMemOperand *MMO = M->getMemOperand();
2993 bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
2994
2996 if (CurDAG->isBaseWithConstantOffset(Ptr)) {
2997 SDValue PtrBase = Ptr.getOperand(0);
2998 SDValue PtrOffset = Ptr.getOperand(1);
2999
3000 const APInt &OffsetVal = PtrOffset->getAsAPIntVal();
3001 if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue())) {
3002 N = glueCopyToM0(N, PtrBase);
3003 Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
3004 }
3005 }
3006
3007 if (!Offset) {
3008 N = glueCopyToM0(N, Ptr);
3009 Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
3010 }
3011
3012 SDValue Ops[] = {
3013 Offset,
3014 CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
3015 Chain,
3016 N->getOperand(N->getNumOperands() - 1) // New glue
3017 };
3018
3019 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
3020 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
3021}
3022
3023// We need to handle this here because tablegen doesn't support matching
3024// instructions with multiple outputs.
3025void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N, unsigned IntrID) {
3026 unsigned Opc;
3027 switch (IntrID) {
3028 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
3029 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
3030 Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
3031 break;
3032 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
3033 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
3034 break;
3035 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
3036 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
3037 break;
3038 }
3039 SDValue Ops[] = {N->getOperand(2), N->getOperand(3), N->getOperand(4),
3040 N->getOperand(5), N->getOperand(0)};
3041
3042 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
3043 MachineMemOperand *MMO = M->getMemOperand();
3044 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
3045 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
3046}
3047
3048void AMDGPUDAGToDAGISel::SelectTensorLoadStore(SDNode *N, unsigned IntrID) {
3049 bool IsLoad = IntrID == Intrinsic::amdgcn_tensor_load_to_lds;
3050 unsigned Opc =
3051 IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d4 : AMDGPU::TENSOR_STORE_FROM_LDS_d4;
3052
3053 SmallVector<SDValue, 7> TensorOps;
3054 // First two groups
3055 TensorOps.push_back(N->getOperand(2)); // D# group 0
3056 TensorOps.push_back(N->getOperand(3)); // D# group 1
3057
3058 // Use _D2 version if both group 2 and 3 are zero-initialized.
3059 SDValue Group2 = N->getOperand(4);
3060 SDValue Group3 = N->getOperand(5);
3061 if (ISD::isBuildVectorAllZeros(Group2.getNode()) &&
3063 Opc = IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d2
3064 : AMDGPU::TENSOR_STORE_FROM_LDS_d2;
3065 } else { // Has at least 4 groups
3066 TensorOps.push_back(Group2); // D# group 2
3067 TensorOps.push_back(Group3); // D# group 3
3068 }
3069
3070 // TODO: Handle the fifth group: N->getOperand(6), which is silently ignored
3071 // for now because all existing targets only support up to 4 groups.
3072 TensorOps.push_back(CurDAG->getTargetConstant(0, SDLoc(N), MVT::i1)); // r128
3073 TensorOps.push_back(N->getOperand(7)); // cache policy
3074 TensorOps.push_back(N->getOperand(0)); // chain
3075
3076 (void)CurDAG->SelectNodeTo(N, Opc, MVT::Other, TensorOps);
3077}
3078
3079static unsigned gwsIntrinToOpcode(unsigned IntrID) {
3080 switch (IntrID) {
3081 case Intrinsic::amdgcn_ds_gws_init:
3082 return AMDGPU::DS_GWS_INIT;
3083 case Intrinsic::amdgcn_ds_gws_barrier:
3084 return AMDGPU::DS_GWS_BARRIER;
3085 case Intrinsic::amdgcn_ds_gws_sema_v:
3086 return AMDGPU::DS_GWS_SEMA_V;
3087 case Intrinsic::amdgcn_ds_gws_sema_br:
3088 return AMDGPU::DS_GWS_SEMA_BR;
3089 case Intrinsic::amdgcn_ds_gws_sema_p:
3090 return AMDGPU::DS_GWS_SEMA_P;
3091 case Intrinsic::amdgcn_ds_gws_sema_release_all:
3092 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
3093 default:
3094 llvm_unreachable("not a gws intrinsic");
3095 }
3096}
3097
3098void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
3099 if (!Subtarget->hasGWS() ||
3100 (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
3101 !Subtarget->hasGWSSemaReleaseAll())) {
3102 // Let this error.
3103 SelectCode(N);
3104 return;
3105 }
3106
3107 // Chain, intrinsic ID, vsrc, offset
3108 const bool HasVSrc = N->getNumOperands() == 4;
3109 assert(HasVSrc || N->getNumOperands() == 3);
3110
3111 SDLoc SL(N);
3112 SDValue BaseOffset = N->getOperand(HasVSrc ? 3 : 2);
3113 int ImmOffset = 0;
3114 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
3115 MachineMemOperand *MMO = M->getMemOperand();
3116
3117 // Don't worry if the offset ends up in a VGPR. Only one lane will have
3118 // effect, so SIFixSGPRCopies will validly insert readfirstlane.
3119
3120 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
3121 // offset field) % 64. Some versions of the programming guide omit the m0
3122 // part, or claim it's from offset 0.
3123 if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
3124 // If we have a constant offset, try to use the 0 in m0 as the base.
3125 // TODO: Look into changing the default m0 initialization value. If the
3126 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
3127 // the immediate offset.
3128 glueCopyToM0(N, CurDAG->getTargetConstant(0, SL, MVT::i32));
3129 ImmOffset = ConstOffset->getZExtValue();
3130 } else {
3131 if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
3132 ImmOffset = BaseOffset.getConstantOperandVal(1);
3133 BaseOffset = BaseOffset.getOperand(0);
3134 }
3135
3136 // Prefer to do the shift in an SGPR since it should be possible to use m0
3137 // as the result directly. If it's already an SGPR, it will be eliminated
3138 // later.
3139 SDNode *SGPROffset
3140 = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
3141 BaseOffset);
3142 // Shift to offset in m0
3143 SDNode *M0Base
3144 = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
3145 SDValue(SGPROffset, 0),
3146 CurDAG->getTargetConstant(16, SL, MVT::i32));
3147 glueCopyToM0(N, SDValue(M0Base, 0));
3148 }
3149
3150 SDValue Chain = N->getOperand(0);
3151 SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);
3152
3153 const unsigned Opc = gwsIntrinToOpcode(IntrID);
3154
3155 const MCInstrDesc &InstrDesc = TII->get(Opc);
3156 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
3157
3158 const TargetRegisterClass *DataRC = TII->getRegClass(InstrDesc, Data0Idx);
3159
3161 if (HasVSrc) {
3162 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3163
3164 SDValue Data = N->getOperand(2);
3165 MVT DataVT = Data.getValueType().getSimpleVT();
3166 if (TRI->isTypeLegalForClass(*DataRC, DataVT)) {
3167 // Normal 32-bit case.
3168 Ops.push_back(N->getOperand(2));
3169 } else {
3170 // Operand is really 32-bits, but requires 64-bit alignment, so use the
3171 // even aligned 64-bit register class.
3172 const SDValue RegSeqOps[] = {
3173 CurDAG->getTargetConstant(DataRC->getID(), SL, MVT::i32), Data,
3174 CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
3175 SDValue(
3176 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL, MVT::i32),
3177 0),
3178 CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32)};
3179
3180 Ops.push_back(SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE,
3181 SL, MVT::v2i32, RegSeqOps),
3182 0));
3183 }
3184 }
3185
3186 Ops.push_back(OffsetField);
3187 Ops.push_back(Chain);
3188
3189 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
3190 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
3191}
3192
3193void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
3194 if (Subtarget->getLDSBankCount() != 16) {
3195 // This is a single instruction with a pattern.
3196 SelectCode(N);
3197 return;
3198 }
3199
3200 SDLoc DL(N);
3201
3202 // This requires 2 instructions. It is possible to write a pattern to support
3203 // this, but the generated isel emitter doesn't correctly deal with multiple
3204 // output instructions using the same physical register input. The copy to m0
3205 // is incorrectly placed before the second instruction.
3206 //
3207 // TODO: Match source modifiers.
3208 //
3209 // def : Pat <
3210 // (int_amdgcn_interp_p1_f16
3211 // (VOP3Mods f32:$src0, i32:$src0_modifiers),
3212 // (i32 timm:$attrchan), (i32 timm:$attr),
3213 // (i1 timm:$high), M0),
3214 // (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr,
3215 // timm:$attrchan, 0,
3216 // (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> {
3217 // let Predicates = [has16BankLDS];
3218 // }
3219
3220 // 16 bank LDS
3221 SDValue ToM0 = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, AMDGPU::M0,
3222 N->getOperand(5), SDValue());
3223
3224 SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other);
3225
3226 SDNode *InterpMov =
3227 CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, {
3228 CurDAG->getTargetConstant(2, DL, MVT::i32), // P0
3229 N->getOperand(3), // Attr
3230 N->getOperand(2), // Attrchan
3231 ToM0.getValue(1) // In glue
3232 });
3233
3234 SDNode *InterpP1LV =
3235 CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, {
3236 CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
3237 N->getOperand(1), // Src0
3238 N->getOperand(3), // Attr
3239 N->getOperand(2), // Attrchan
3240 CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
3241 SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
3242 N->getOperand(4), // high
3243 CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp
3244 CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod
3245 SDValue(InterpMov, 1)
3246 });
3247
3248 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(InterpP1LV, 0));
3249}
3250
3251void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
3252 unsigned IntrID = N->getConstantOperandVal(1);
3253 switch (IntrID) {
3254 case Intrinsic::amdgcn_ds_append:
3255 case Intrinsic::amdgcn_ds_consume: {
3256 if (N->getValueType(0) != MVT::i32)
3257 break;
3258 SelectDSAppendConsume(N, IntrID);
3259 return;
3260 }
3261 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
3262 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
3263 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
3264 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
3265 SelectDSBvhStackIntrinsic(N, IntrID);
3266 return;
3267 case Intrinsic::amdgcn_init_whole_wave:
3268 CurDAG->getMachineFunction()
3269 .getInfo<SIMachineFunctionInfo>()
3270 ->setInitWholeWave();
3271 break;
3272 }
3273
3274 SelectCode(N);
3275}
3276
3277void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
3278 unsigned IntrID = N->getConstantOperandVal(0);
3279 unsigned Opcode = AMDGPU::INSTRUCTION_LIST_END;
3280 SDNode *ConvGlueNode = N->getGluedNode();
3281 if (ConvGlueNode) {
3282 // FIXME: Possibly iterate over multiple glue nodes?
3283 assert(ConvGlueNode->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
3284 ConvGlueNode = ConvGlueNode->getOperand(0).getNode();
3285 ConvGlueNode =
3286 CurDAG->getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, {},
3287 MVT::Glue, SDValue(ConvGlueNode, 0));
3288 } else {
3289 ConvGlueNode = nullptr;
3290 }
3291 switch (IntrID) {
3292 case Intrinsic::amdgcn_wqm:
3293 Opcode = AMDGPU::WQM;
3294 break;
3295 case Intrinsic::amdgcn_softwqm:
3296 Opcode = AMDGPU::SOFT_WQM;
3297 break;
3298 case Intrinsic::amdgcn_wwm:
3299 case Intrinsic::amdgcn_strict_wwm:
3300 Opcode = AMDGPU::STRICT_WWM;
3301 break;
3302 case Intrinsic::amdgcn_strict_wqm:
3303 Opcode = AMDGPU::STRICT_WQM;
3304 break;
3305 case Intrinsic::amdgcn_interp_p1_f16:
3306 SelectInterpP1F16(N);
3307 return;
3308 case Intrinsic::amdgcn_permlane16_swap:
3309 case Intrinsic::amdgcn_permlane32_swap: {
3310 if ((IntrID == Intrinsic::amdgcn_permlane16_swap &&
3311 !Subtarget->hasPermlane16Swap()) ||
3312 (IntrID == Intrinsic::amdgcn_permlane32_swap &&
3313 !Subtarget->hasPermlane32Swap())) {
3314 SelectCode(N); // Hit the default error
3315 return;
3316 }
3317
3318 Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
3319 ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
3320 : AMDGPU::V_PERMLANE32_SWAP_B32_e64;
3321
3322 SmallVector<SDValue, 4> NewOps(N->op_begin() + 1, N->op_end());
3323 if (ConvGlueNode)
3324 NewOps.push_back(SDValue(ConvGlueNode, 0));
3325
3326 bool FI = N->getConstantOperandVal(3);
3327 NewOps[2] = CurDAG->getTargetConstant(
3328 FI ? AMDGPU::DPP::DPP_FI_1 : AMDGPU::DPP::DPP_FI_0, SDLoc(), MVT::i32);
3329
3330 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), NewOps);
3331 return;
3332 }
3333 default:
3334 SelectCode(N);
3335 break;
3336 }
3337
3338 if (Opcode != AMDGPU::INSTRUCTION_LIST_END) {
3339 SDValue Src = N->getOperand(1);
3340 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src});
3341 }
3342
3343 if (ConvGlueNode) {
3344 SmallVector<SDValue, 4> NewOps(N->ops());
3345 NewOps.push_back(SDValue(ConvGlueNode, 0));
3346 CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), NewOps);
3347 }
3348}
3349
3350void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
3351 unsigned IntrID = N->getConstantOperandVal(1);
3352 switch (IntrID) {
3353 case Intrinsic::amdgcn_ds_gws_init:
3354 case Intrinsic::amdgcn_ds_gws_barrier:
3355 case Intrinsic::amdgcn_ds_gws_sema_v:
3356 case Intrinsic::amdgcn_ds_gws_sema_br:
3357 case Intrinsic::amdgcn_ds_gws_sema_p:
3358 case Intrinsic::amdgcn_ds_gws_sema_release_all:
3359 SelectDS_GWS(N, IntrID);
3360 return;
3361 case Intrinsic::amdgcn_tensor_load_to_lds:
3362 case Intrinsic::amdgcn_tensor_store_from_lds:
3363 SelectTensorLoadStore(N, IntrID);
3364 return;
3365 default:
3366 break;
3367 }
3368
3369 SelectCode(N);
3370}
3371
3372void AMDGPUDAGToDAGISel::SelectWAVE_ADDRESS(SDNode *N) {
3373 SDValue Log2WaveSize =
3374 CurDAG->getTargetConstant(Subtarget->getWavefrontSizeLog2(), SDLoc(N), MVT::i32);
3375 CurDAG->SelectNodeTo(N, AMDGPU::S_LSHR_B32, N->getVTList(),
3376 {N->getOperand(0), Log2WaveSize});
3377}
3378
3379void AMDGPUDAGToDAGISel::SelectSTACKRESTORE(SDNode *N) {
3380 SDValue SrcVal = N->getOperand(1);
3381 if (SrcVal.getValueType() != MVT::i32) {
3382 SelectCode(N); // Emit default error
3383 return;
3384 }
3385
3386 SDValue CopyVal;
3387 Register SP = TLI->getStackPointerRegisterToSaveRestore();
3388 SDLoc SL(N);
3389
3390 if (SrcVal.getOpcode() == AMDGPUISD::WAVE_ADDRESS) {
3391 CopyVal = SrcVal.getOperand(0);
3392 } else {
3393 SDValue Log2WaveSize = CurDAG->getTargetConstant(
3394 Subtarget->getWavefrontSizeLog2(), SL, MVT::i32);
3395
3396 if (N->isDivergent()) {
3397 SrcVal = SDValue(CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL,
3398 MVT::i32, SrcVal),
3399 0);
3400 }
3401
3402 CopyVal = SDValue(CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
3403 {SrcVal, Log2WaveSize}),
3404 0);
3405 }
3406
3407 SDValue CopyToSP = CurDAG->getCopyToReg(N->getOperand(0), SL, SP, CopyVal);
3408 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), CopyToSP);
3409}
3410
3411bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
3412 unsigned &Mods,
3413 bool IsCanonicalizing,
3414 bool AllowAbs) const {
3415 Mods = SISrcMods::NONE;
3416 Src = In;
3417
3418 if (Src.getOpcode() == ISD::FNEG) {
3419 Mods |= SISrcMods::NEG;
3420 Src = Src.getOperand(0);
3421 } else if (Src.getOpcode() == ISD::FSUB && IsCanonicalizing) {
3422 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
3423 // denormal mode, but we're implicitly canonicalizing in a source operand.
3424 auto *LHS = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
3425 if (LHS && LHS->isZero()) {
3426 Mods |= SISrcMods::NEG;
3427 Src = Src.getOperand(1);
3428 }
3429 }
3430
3431 if (AllowAbs && Src.getOpcode() == ISD::FABS) {
3432 Mods |= SISrcMods::ABS;
3433 Src = Src.getOperand(0);
3434 }
3435
3436 if (Mods != SISrcMods::NONE)
3437 return true;
3438
3439 // Convert various sign-bit masks on integers to src mods. Currently disabled
3440 // for 16-bit types as the codegen replaces the operand without adding a
3441 // srcmod. This is intentionally finding the cases where we are performing
3442 // float neg and abs on int types, the goal is not to obtain two's complement
3443 // neg or abs. Limit converison to select operands via the nonCanonalizing
3444 // pattern.
3445 // TODO: Add 16-bit support.
3446 if (IsCanonicalizing)
3447 return true;
3448
3449 // v2i32 xor/or/and are legal. A vselect using these instructions as operands
3450 // is scalarised into two selects with EXTRACT_VECTOR_ELT operands. Peek
3451 // through the extract to the bitwise op.
3452 SDValue PeekSrc =
3453 Src->getOpcode() == ISD::EXTRACT_VECTOR_ELT ? Src->getOperand(0) : Src;
3454 // Convert various sign-bit masks to src mods. Currently disabled for 16-bit
3455 // types as the codegen replaces the operand without adding a srcmod.
3456 // This is intentionally finding the cases where we are performing float neg
3457 // and abs on int types, the goal is not to obtain two's complement neg or
3458 // abs.
3459 // TODO: Add 16-bit support.
3460 unsigned Opc = PeekSrc.getOpcode();
3461 EVT VT = Src.getValueType();
3462 if ((Opc != ISD::AND && Opc != ISD::OR && Opc != ISD::XOR) ||
3463 (VT != MVT::i32 && VT != MVT::v2i32 && VT != MVT::i64))
3464 return true;
3465
3466 ConstantSDNode *CRHS = isConstOrConstSplat(PeekSrc->getOperand(1));
3467 if (!CRHS)
3468 return true;
3469
3470 auto ReplaceSrc = [&]() -> SDValue {
3471 if (Src->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
3472 return Src.getOperand(0);
3473
3474 SDValue LHS = PeekSrc->getOperand(0);
3475 SDValue Index = Src->getOperand(1);
3476 return CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Src),
3477 Src.getValueType(), LHS, Index);
3478 };
3479
3480 // Recognise Srcmods:
3481 // (xor a, 0x80000000) or v2i32 (xor a, {0x80000000,0x80000000}) as NEG.
3482 // (and a, 0x7fffffff) or v2i32 (and a, {0x7fffffff,0x7fffffff}) as ABS.
3483 // (or a, 0x80000000) or v2i32 (or a, {0x80000000,0x80000000}) as NEG+ABS
3484 // SrcModifiers.
3485 if (Opc == ISD::XOR && CRHS->getAPIntValue().isSignMask()) {
3486 Mods |= SISrcMods::NEG;
3487 Src = ReplaceSrc();
3488 } else if (Opc == ISD::AND && AllowAbs &&
3489 CRHS->getAPIntValue().isMaxSignedValue()) {
3490 Mods |= SISrcMods::ABS;
3491 Src = ReplaceSrc();
3492 } else if (Opc == ISD::OR && AllowAbs && CRHS->getAPIntValue().isSignMask()) {
3494 Src = ReplaceSrc();
3495 }
3496
3497 return true;
3498}
3499
3500bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
3501 SDValue &SrcMods) const {
3502 unsigned Mods;
3503 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/true,
3504 /*AllowAbs=*/true)) {
3505 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3506 return true;
3507 }
3508
3509 return false;
3510}
3511
3512bool AMDGPUDAGToDAGISel::SelectVOP3ModsNonCanonicalizing(
3513 SDValue In, SDValue &Src, SDValue &SrcMods) const {
3514 unsigned Mods;
3515 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/false,
3516 /*AllowAbs=*/true)) {
3517 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3518 return true;
3519 }
3520
3521 return false;
3522}
3523
3524bool AMDGPUDAGToDAGISel::SelectVOP3BMods(SDValue In, SDValue &Src,
3525 SDValue &SrcMods) const {
3526 unsigned Mods;
3527 if (SelectVOP3ModsImpl(In, Src, Mods,
3528 /*IsCanonicalizing=*/true,
3529 /*AllowAbs=*/false)) {
3530 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3531 return true;
3532 }
3533
3534 return false;
3535}
3536
3537bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
3538 if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
3539 return false;
3540
3541 Src = In;
3542 return true;
3543}
3544
3545bool AMDGPUDAGToDAGISel::SelectVINTERPModsImpl(SDValue In, SDValue &Src,
3546 SDValue &SrcMods,
3547 bool OpSel) const {
3548 unsigned Mods;
3549 if (SelectVOP3ModsImpl(In, Src, Mods,
3550 /*IsCanonicalizing=*/true,
3551 /*AllowAbs=*/false)) {
3552 if (OpSel)
3553 Mods |= SISrcMods::OP_SEL_0;
3554 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3555 return true;
3556 }
3557
3558 return false;
3559}
3560
3561bool AMDGPUDAGToDAGISel::SelectVINTERPMods(SDValue In, SDValue &Src,
3562 SDValue &SrcMods) const {
3563 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ false);
3564}
3565
3566bool AMDGPUDAGToDAGISel::SelectVINTERPModsHi(SDValue In, SDValue &Src,
3567 SDValue &SrcMods) const {
3568 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ true);
3569}
3570
3571bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
3572 SDValue &SrcMods, SDValue &Clamp,
3573 SDValue &Omod) const {
3574 SDLoc DL(In);
3575 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
3576 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
3577
3578 return SelectVOP3Mods(In, Src, SrcMods);
3579}
3580
3581bool AMDGPUDAGToDAGISel::SelectVOP3BMods0(SDValue In, SDValue &Src,
3582 SDValue &SrcMods, SDValue &Clamp,
3583 SDValue &Omod) const {
3584 SDLoc DL(In);
3585 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
3586 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
3587
3588 return SelectVOP3BMods(In, Src, SrcMods);
3589}
3590
3591bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
3592 SDValue &Clamp, SDValue &Omod) const {
3593 Src = In;
3594
3595 SDLoc DL(In);
3596 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
3597 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
3598
3599 return true;
3600}
3601
3602bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
3603 SDValue &SrcMods, bool IsDOT) const {
3604 unsigned Mods = SISrcMods::NONE;
3605 Src = In;
3606
3607 // TODO: Handle G_FSUB 0 as fneg
3608 if (Src.getOpcode() == ISD::FNEG) {
3610 Src = Src.getOperand(0);
3611 }
3612
3613 if (Src.getOpcode() == ISD::BUILD_VECTOR && Src.getNumOperands() == 2 &&
3614 (!IsDOT || !Subtarget->hasDOTOpSelHazard())) {
3615 unsigned VecMods = Mods;
3616
3617 SDValue Lo = stripBitcast(Src.getOperand(0));
3618 SDValue Hi = stripBitcast(Src.getOperand(1));
3619
3620 if (Lo.getOpcode() == ISD::FNEG) {
3621 Lo = stripBitcast(Lo.getOperand(0));
3622 Mods ^= SISrcMods::NEG;
3623 }
3624
3625 if (Hi.getOpcode() == ISD::FNEG) {
3626 Hi = stripBitcast(Hi.getOperand(0));
3627 Mods ^= SISrcMods::NEG_HI;
3628 }
3629
3630 if (isExtractHiElt(Lo, Lo))
3631 Mods |= SISrcMods::OP_SEL_0;
3632
3633 if (isExtractHiElt(Hi, Hi))
3634 Mods |= SISrcMods::OP_SEL_1;
3635
3636 unsigned VecSize = Src.getValueSizeInBits();
3637 Lo = stripExtractLoElt(Lo);
3638 Hi = stripExtractLoElt(Hi);
3639
3640 if (Lo.getValueSizeInBits() > VecSize) {
3641 Lo = CurDAG->getTargetExtractSubreg(
3642 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
3643 MVT::getIntegerVT(VecSize), Lo);
3644 }
3645
3646 if (Hi.getValueSizeInBits() > VecSize) {
3647 Hi = CurDAG->getTargetExtractSubreg(
3648 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
3649 MVT::getIntegerVT(VecSize), Hi);
3650 }
3651
3652 assert(Lo.getValueSizeInBits() <= VecSize &&
3653 Hi.getValueSizeInBits() <= VecSize);
3654
3655 if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
3656 // Really a scalar input. Just select from the low half of the register to
3657 // avoid packing.
3658
3659 if (VecSize == Lo.getValueSizeInBits()) {
3660 Src = Lo;
3661 } else if (VecSize == 32) {
3662 Src = createVOP3PSrc32FromLo16(Lo, Src, CurDAG, Subtarget);
3663 } else {
3664 assert(Lo.getValueSizeInBits() == 32 && VecSize == 64);
3665
3666 SDLoc SL(In);
3668 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL,
3669 Lo.getValueType()), 0);
3670 auto RC = Lo->isDivergent() ? AMDGPU::VReg_64RegClassID
3671 : AMDGPU::SReg_64RegClassID;
3672 const SDValue Ops[] = {
3673 CurDAG->getTargetConstant(RC, SL, MVT::i32),
3674 Lo, CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
3675 Undef, CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32) };
3676
3677 Src = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
3678 Src.getValueType(), Ops), 0);
3679 }
3680 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3681 return true;
3682 }
3683
3684 if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Lo)) {
3685 uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF()
3686 .bitcastToAPInt().getZExtValue();
3687 if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) {
3688 Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);
3689 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3690 return true;
3691 }
3692 }
3693
3694 Mods = VecMods;
3695 } else if (Src.getOpcode() == ISD::VECTOR_SHUFFLE &&
3696 Src.getNumOperands() == 2) {
3697
3698 // TODO: We should repeat the build_vector source check above for the
3699 // vector_shuffle for negates and casts of individual elements.
3700
3701 auto *SVN = cast<ShuffleVectorSDNode>(Src);
3702 ArrayRef<int> Mask = SVN->getMask();
3703
3704 if (Mask[0] < 2 && Mask[1] < 2) {
3705 // src1 should be undef.
3706 SDValue ShuffleSrc = SVN->getOperand(0);
3707
3708 if (ShuffleSrc.getOpcode() == ISD::FNEG) {
3709 ShuffleSrc = ShuffleSrc.getOperand(0);
3711 }
3712
3713 if (Mask[0] == 1)
3714 Mods |= SISrcMods::OP_SEL_0;
3715 if (Mask[1] == 1)
3716 Mods |= SISrcMods::OP_SEL_1;
3717
3718 Src = ShuffleSrc;
3719 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3720 return true;
3721 }
3722 }
3723
3724 // Packed instructions do not have abs modifiers.
3725 Mods |= SISrcMods::OP_SEL_1;
3726
3727 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3728 return true;
3729}
3730
3731bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src,
3732 SDValue &SrcMods) const {
3733 return SelectVOP3PMods(In, Src, SrcMods, true);
3734}
3735
3736bool AMDGPUDAGToDAGISel::SelectVOP3PNoModsDOT(SDValue In, SDValue &Src) const {
3737 SDValue SrcTmp, SrcModsTmp;
3738 SelectVOP3PMods(In, SrcTmp, SrcModsTmp, true);
3739 if (cast<ConstantSDNode>(SrcModsTmp)->getZExtValue() == SISrcMods::OP_SEL_1) {
3740 Src = SrcTmp;
3741 return true;
3742 }
3743
3744 return false;
3745}
3746
3747bool AMDGPUDAGToDAGISel::SelectVOP3PModsF32(SDValue In, SDValue &Src,
3748 SDValue &SrcMods) const {
3749 SelectVOP3Mods(In, Src, SrcMods);
3750 unsigned Mods = SISrcMods::OP_SEL_1;
3751 Mods |= cast<ConstantSDNode>(SrcMods)->getZExtValue();
3752 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3753 return true;
3754}
3755
3756bool AMDGPUDAGToDAGISel::SelectVOP3PNoModsF32(SDValue In, SDValue &Src) const {
3757 SDValue SrcTmp, SrcModsTmp;
3758 SelectVOP3PModsF32(In, SrcTmp, SrcModsTmp);
3759 if (cast<ConstantSDNode>(SrcModsTmp)->getZExtValue() == SISrcMods::OP_SEL_1) {
3760 Src = SrcTmp;
3761 return true;
3762 }
3763
3764 return false;
3765}
3766
3767bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In,
3768 SDValue &Src) const {
3769 const ConstantSDNode *C = cast<ConstantSDNode>(In);
3770 assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
3771
3772 unsigned Mods = SISrcMods::OP_SEL_1;
3773 unsigned SrcVal = C->getZExtValue();
3774 if (SrcVal == 1)
3775 Mods |= SISrcMods::OP_SEL_0;
3776
3777 Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3778 return true;
3779}
3780
3782AMDGPUDAGToDAGISel::buildRegSequence32(SmallVectorImpl<SDValue> &Elts,
3783 const SDLoc &DL) const {
3784 unsigned DstRegClass;
3785 EVT DstTy;
3786 switch (Elts.size()) {
3787 case 8:
3788 DstRegClass = AMDGPU::VReg_256RegClassID;
3789 DstTy = MVT::v8i32;
3790 break;
3791 case 4:
3792 DstRegClass = AMDGPU::VReg_128RegClassID;
3793 DstTy = MVT::v4i32;
3794 break;
3795 case 2:
3796 DstRegClass = AMDGPU::VReg_64RegClassID;
3797 DstTy = MVT::v2i32;
3798 break;
3799 default:
3800 llvm_unreachable("unhandled Reg sequence size");
3801 }
3802
3804 Ops.push_back(CurDAG->getTargetConstant(DstRegClass, DL, MVT::i32));
3805 for (unsigned i = 0; i < Elts.size(); ++i) {
3806 Ops.push_back(Elts[i]);
3807 Ops.push_back(CurDAG->getTargetConstant(
3809 }
3810 return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, DstTy, Ops);
3811}
3812
3814AMDGPUDAGToDAGISel::buildRegSequence16(SmallVectorImpl<SDValue> &Elts,
3815 const SDLoc &DL) const {
3816 SmallVector<SDValue, 8> PackedElts;
3817 assert("unhandled Reg sequence size" &&
3818 (Elts.size() == 8 || Elts.size() == 16));
3819
3820 // Pack 16-bit elements in pairs into 32-bit register. If both elements are
3821 // unpacked from 32-bit source use it, otherwise pack them using v_perm.
3822 for (unsigned i = 0; i < Elts.size(); i += 2) {
3823 SDValue LoSrc = stripExtractLoElt(stripBitcast(Elts[i]));
3824 SDValue HiSrc;
3825 if (isExtractHiElt(Elts[i + 1], HiSrc) && LoSrc == HiSrc) {
3826 PackedElts.push_back(HiSrc);
3827 } else {
3828 if (Subtarget->useRealTrue16Insts()) {
3829 // FIXME-TRUE16. For now pack VGPR_32 for 16-bit source before
3830 // passing to v_perm_b32. Eventually we should use replace v_perm_b32
3831 // by reg_sequence.
3833 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i16),
3834 0);
3835 Elts[i] =
3836 emitRegSequence(*CurDAG, AMDGPU::VGPR_32RegClassID, MVT::i32,
3837 {Elts[i], Undef}, {AMDGPU::lo16, AMDGPU::hi16}, DL);
3838 Elts[i + 1] = emitRegSequence(*CurDAG, AMDGPU::VGPR_32RegClassID,
3839 MVT::i32, {Elts[i + 1], Undef},
3840 {AMDGPU::lo16, AMDGPU::hi16}, DL);
3841 }
3842 SDValue PackLoLo = CurDAG->getTargetConstant(0x05040100, DL, MVT::i32);
3843 MachineSDNode *Packed =
3844 CurDAG->getMachineNode(AMDGPU::V_PERM_B32_e64, DL, MVT::i32,
3845 {Elts[i + 1], Elts[i], PackLoLo});
3846 PackedElts.push_back(SDValue(Packed, 0));
3847 }
3848 }
3849 return buildRegSequence32(PackedElts, DL);
3850}
3851
3853AMDGPUDAGToDAGISel::buildRegSequence(SmallVectorImpl<SDValue> &Elts,
3854 const SDLoc &DL,
3855 unsigned ElementSize) const {
3856 if (ElementSize == 16)
3857 return buildRegSequence16(Elts, DL);
3858 if (ElementSize == 32)
3859 return buildRegSequence32(Elts, DL);
3860 llvm_unreachable("Unhandled element size");
3861}
3862
3863void AMDGPUDAGToDAGISel::selectWMMAModsNegAbs(unsigned ModOpcode,
3864 unsigned &Mods,
3866 SDValue &Src, const SDLoc &DL,
3867 unsigned ElementSize) const {
3868 if (ModOpcode == ISD::FNEG) {
3869 Mods |= SISrcMods::NEG;
3870 // Check if all elements also have abs modifier
3871 SmallVector<SDValue, 8> NegAbsElts;
3872 for (auto El : Elts) {
3873 if (El.getOpcode() != ISD::FABS)
3874 break;
3875 NegAbsElts.push_back(El->getOperand(0));
3876 }
3877 if (Elts.size() != NegAbsElts.size()) {
3878 // Neg
3879 Src = SDValue(buildRegSequence(Elts, DL, ElementSize), 0);
3880 } else {
3881 // Neg and Abs
3882 Mods |= SISrcMods::NEG_HI;
3883 Src = SDValue(buildRegSequence(NegAbsElts, DL, ElementSize), 0);
3884 }
3885 } else {
3886 assert(ModOpcode == ISD::FABS);
3887 // Abs
3888 Mods |= SISrcMods::NEG_HI;
3889 Src = SDValue(buildRegSequence(Elts, DL, ElementSize), 0);
3890 }
3891}
3892
3893// Check all f16 elements for modifiers while looking through b32 and v2b16
3894// build vector, stop if element does not satisfy ModifierCheck.
3895static void
3897 std::function<bool(SDValue)> ModifierCheck) {
3898 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3899 if (auto *F16Pair =
3900 dyn_cast<BuildVectorSDNode>(stripBitcast(BV->getOperand(i)))) {
3901 for (unsigned i = 0; i < F16Pair->getNumOperands(); ++i) {
3902 SDValue ElF16 = stripBitcast(F16Pair->getOperand(i));
3903 if (!ModifierCheck(ElF16))
3904 break;
3905 }
3906 }
3907 }
3908}
3909
3910bool AMDGPUDAGToDAGISel::SelectWMMAModsF16Neg(SDValue In, SDValue &Src,
3911 SDValue &SrcMods) const {
3912 Src = In;
3913 unsigned Mods = SISrcMods::OP_SEL_1;
3914
3915 // mods are on f16 elements
3916 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3918
3919 checkWMMAElementsModifiersF16(BV, [&](SDValue Element) -> bool {
3920 if (Element.getOpcode() != ISD::FNEG)
3921 return false;
3922 EltsF16.push_back(Element.getOperand(0));
3923 return true;
3924 });
3925
3926 // All elements have neg modifier
3927 if (BV->getNumOperands() * 2 == EltsF16.size()) {
3928 Src = SDValue(buildRegSequence16(EltsF16, SDLoc(In)), 0);
3929 Mods |= SISrcMods::NEG;
3930 Mods |= SISrcMods::NEG_HI;
3931 }
3932 }
3933
3934 // mods are on v2f16 elements
3935 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3936 SmallVector<SDValue, 8> EltsV2F16;
3937 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3938 SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
3939 // Based on first element decide which mod we match, neg or abs
3940 if (ElV2f16.getOpcode() != ISD::FNEG)
3941 break;
3942 EltsV2F16.push_back(ElV2f16.getOperand(0));
3943 }
3944
3945 // All pairs of elements have neg modifier
3946 if (BV->getNumOperands() == EltsV2F16.size()) {
3947 Src = SDValue(buildRegSequence32(EltsV2F16, SDLoc(In)), 0);
3948 Mods |= SISrcMods::NEG;
3949 Mods |= SISrcMods::NEG_HI;
3950 }
3951 }
3952
3953 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3954 return true;
3955}
3956
3957bool AMDGPUDAGToDAGISel::SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src,
3958 SDValue &SrcMods) const {
3959 Src = In;
3960 unsigned Mods = SISrcMods::OP_SEL_1;
3961 unsigned ModOpcode;
3962
3963 // mods are on f16 elements
3964 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3966 checkWMMAElementsModifiersF16(BV, [&](SDValue ElF16) -> bool {
3967 // Based on first element decide which mod we match, neg or abs
3968 if (EltsF16.empty())
3969 ModOpcode = (ElF16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3970 if (ElF16.getOpcode() != ModOpcode)
3971 return false;
3972 EltsF16.push_back(ElF16.getOperand(0));
3973 return true;
3974 });
3975
3976 // All elements have ModOpcode modifier
3977 if (BV->getNumOperands() * 2 == EltsF16.size())
3978 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF16, Src, SDLoc(In), 16);
3979 }
3980
3981 // mods are on v2f16 elements
3982 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3983 SmallVector<SDValue, 8> EltsV2F16;
3984
3985 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3986 SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
3987 // Based on first element decide which mod we match, neg or abs
3988 if (EltsV2F16.empty())
3989 ModOpcode = (ElV2f16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3990 if (ElV2f16->getOpcode() != ModOpcode)
3991 break;
3992 EltsV2F16.push_back(ElV2f16->getOperand(0));
3993 }
3994
3995 // All elements have ModOpcode modifier
3996 if (BV->getNumOperands() == EltsV2F16.size())
3997 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, SDLoc(In), 32);
3998 }
3999
4000 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4001 return true;
4002}
4003
4004bool AMDGPUDAGToDAGISel::SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src,
4005 SDValue &SrcMods) const {
4006 Src = In;
4007 unsigned Mods = SISrcMods::OP_SEL_1;
4009
4010 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
4011 assert(BV->getNumOperands() > 0);
4012 // Based on first element decide which mod we match, neg or abs
4013 SDValue ElF32 = stripBitcast(BV->getOperand(0));
4014 unsigned ModOpcode =
4015 (ElF32.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
4016 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
4017 SDValue ElF32 = stripBitcast(BV->getOperand(i));
4018 if (ElF32.getOpcode() != ModOpcode)
4019 break;
4020 EltsF32.push_back(ElF32.getOperand(0));
4021 }
4022
4023 // All elements had ModOpcode modifier
4024 if (BV->getNumOperands() == EltsF32.size())
4025 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, SDLoc(In), 32);
4026 }
4027
4028 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4029 return true;
4030}
4031
4032bool AMDGPUDAGToDAGISel::SelectWMMAVISrc(SDValue In, SDValue &Src) const {
4033 if (auto *BV = dyn_cast<BuildVectorSDNode>(In)) {
4034 BitVector UndefElements;
4035 if (SDValue Splat = BV->getSplatValue(&UndefElements))
4036 if (isInlineImmediate(Splat.getNode())) {
4037 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat)) {
4038 unsigned Imm = C->getAPIntValue().getSExtValue();
4039 Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
4040 return true;
4041 }
4042 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat)) {
4043 unsigned Imm = C->getValueAPF().bitcastToAPInt().getSExtValue();
4044 Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
4045 return true;
4046 }
4047 llvm_unreachable("unhandled Constant node");
4048 }
4049 }
4050
4051 // 16 bit splat
4052 SDValue SplatSrc32 = stripBitcast(In);
4053 if (auto *SplatSrc32BV = dyn_cast<BuildVectorSDNode>(SplatSrc32))
4054 if (SDValue Splat32 = SplatSrc32BV->getSplatValue()) {
4055 SDValue SplatSrc16 = stripBitcast(Splat32);
4056 if (auto *SplatSrc16BV = dyn_cast<BuildVectorSDNode>(SplatSrc16))
4057 if (SDValue Splat = SplatSrc16BV->getSplatValue()) {
4058 const SIInstrInfo *TII = Subtarget->getInstrInfo();
4059 std::optional<APInt> RawValue;
4060 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat))
4061 RawValue = C->getValueAPF().bitcastToAPInt();
4062 else if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat))
4063 RawValue = C->getAPIntValue();
4064
4065 if (RawValue.has_value()) {
4066 EVT VT = In.getValueType().getScalarType();
4067 if (VT.getSimpleVT() == MVT::f16 || VT.getSimpleVT() == MVT::bf16) {
4068 APFloat FloatVal(VT.getSimpleVT() == MVT::f16
4071 RawValue.value());
4072 if (TII->isInlineConstant(FloatVal)) {
4073 Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
4074 MVT::i16);
4075 return true;
4076 }
4077 } else if (VT.getSimpleVT() == MVT::i16) {
4078 if (TII->isInlineConstant(RawValue.value())) {
4079 Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
4080 MVT::i16);
4081 return true;
4082 }
4083 } else
4084 llvm_unreachable("unknown 16-bit type");
4085 }
4086 }
4087 }
4088
4089 return false;
4090}
4091
4092bool AMDGPUDAGToDAGISel::SelectSWMMACIndex8(SDValue In, SDValue &Src,
4093 SDValue &IndexKey) const {
4094 unsigned Key = 0;
4095 Src = In;
4096
4097 if (In.getOpcode() == ISD::SRL) {
4098 const llvm::SDValue &ShiftSrc = In.getOperand(0);
4099 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
4100 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
4101 ShiftAmt->getZExtValue() % 8 == 0) {
4102 Key = ShiftAmt->getZExtValue() / 8;
4103 Src = ShiftSrc;
4104 }
4105 }
4106
4107 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
4108 return true;
4109}
4110
4111bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src,
4112 SDValue &IndexKey) const {
4113 unsigned Key = 0;
4114 Src = In;
4115
4116 if (In.getOpcode() == ISD::SRL) {
4117 const llvm::SDValue &ShiftSrc = In.getOperand(0);
4118 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
4119 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
4120 ShiftAmt->getZExtValue() == 16) {
4121 Key = 1;
4122 Src = ShiftSrc;
4123 }
4124 }
4125
4126 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
4127 return true;
4128}
4129
4130bool AMDGPUDAGToDAGISel::SelectSWMMACIndex32(SDValue In, SDValue &Src,
4131 SDValue &IndexKey) const {
4132 unsigned Key = 0;
4133 Src = In;
4134
4135 SDValue InI32;
4136
4137 if (In.getOpcode() == ISD::ANY_EXTEND || In.getOpcode() == ISD::ZERO_EXTEND) {
4138 const SDValue &ExtendSrc = In.getOperand(0);
4139 if (ExtendSrc.getValueSizeInBits() == 32)
4140 InI32 = ExtendSrc;
4141 } else if (In->getOpcode() == ISD::BITCAST) {
4142 const SDValue &CastSrc = In.getOperand(0);
4143 if (CastSrc.getOpcode() == ISD::BUILD_VECTOR &&
4144 CastSrc.getOperand(0).getValueSizeInBits() == 32) {
4145 ConstantSDNode *Zero = dyn_cast<ConstantSDNode>(CastSrc.getOperand(1));
4146 if (Zero && Zero->getZExtValue() == 0)
4147 InI32 = CastSrc.getOperand(0);
4148 }
4149 }
4150
4151 if (InI32 && InI32.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
4152 const SDValue &ExtractVecEltSrc = InI32.getOperand(0);
4153 ConstantSDNode *EltIdx = dyn_cast<ConstantSDNode>(InI32.getOperand(1));
4154 if (ExtractVecEltSrc.getValueSizeInBits() == 64 && EltIdx &&
4155 EltIdx->getZExtValue() == 1) {
4156 Key = 1;
4157 Src = ExtractVecEltSrc;
4158 }
4159 }
4160
4161 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
4162 return true;
4163}
4164
4165bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
4166 SDValue &SrcMods) const {
4167 Src = In;
4168 // FIXME: Handle op_sel
4169 SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
4170 return true;
4171}
4172
4173bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
4174 SDValue &SrcMods) const {
4175 // FIXME: Handle op_sel
4176 return SelectVOP3Mods(In, Src, SrcMods);
4177}
4178
4179// Match lowered fpext from bf16 to f32. This is a bit operation extending
4180// a 16-bit value with 16-bit of zeroes at LSB:
4181//
4182// 1. (f32 (bitcast (build_vector (i16 0), (i16 (bitcast bf16:val)))))
4183// 2. (f32 (bitcast (and i32:val, 0xffff0000))) -> IsExtractHigh = true
4184// 3. (f32 (bitcast (shl i32:va, 16) -> IsExtractHigh = false
4185static SDValue matchBF16FPExtendLike(SDValue Op, bool &IsExtractHigh) {
4186 if (Op.getValueType() != MVT::f32 || Op.getOpcode() != ISD::BITCAST)
4187 return SDValue();
4188 Op = Op.getOperand(0);
4189
4190 IsExtractHigh = false;
4191 if (Op.getValueType() == MVT::v2i16 && Op.getOpcode() == ISD::BUILD_VECTOR) {
4192 auto Low16 = dyn_cast<ConstantSDNode>(Op.getOperand(0));
4193 if (!Low16 || !Low16->isZero())
4194 return SDValue();
4195 Op = stripBitcast(Op.getOperand(1));
4196 if (Op.getValueType() != MVT::bf16)
4197 return SDValue();
4198 return Op;
4199 }
4200
4201 if (Op.getValueType() != MVT::i32)
4202 return SDValue();
4203
4204 if (Op.getOpcode() == ISD::AND) {
4205 if (auto Mask = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
4206 if (Mask->getZExtValue() == 0xffff0000) {
4207 IsExtractHigh = true;
4208 return Op.getOperand(0);
4209 }
4210 }
4211 return SDValue();
4212 }
4213
4214 if (Op.getOpcode() == ISD::SHL) {
4215 if (auto Amt = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
4216 if (Amt->getZExtValue() == 16)
4217 return Op.getOperand(0);
4218 }
4219 }
4220
4221 return SDValue();
4222}
4223
4224// The return value is not whether the match is possible (which it always is),
4225// but whether or not it a conversion is really used.
4226bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
4227 unsigned &Mods,
4228 MVT VT) const {
4229 Mods = 0;
4230 SelectVOP3ModsImpl(In, Src, Mods);
4231
4232 bool IsExtractHigh = false;
4233 if (Src.getOpcode() == ISD::FP_EXTEND) {
4234 Src = Src.getOperand(0);
4235 } else if (VT == MVT::bf16) {
4236 SDValue B16 = matchBF16FPExtendLike(Src, IsExtractHigh);
4237 if (!B16)
4238 return false;
4239 Src = B16;
4240 } else
4241 return false;
4242
4243 if (Src.getValueType() != VT &&
4244 (VT != MVT::bf16 || Src.getValueType() != MVT::i32))
4245 return false;
4246
4247 Src = stripBitcast(Src);
4248
4249 // Be careful about folding modifiers if we already have an abs. fneg is
4250 // applied last, so we don't want to apply an earlier fneg.
4251 if ((Mods & SISrcMods::ABS) == 0) {
4252 unsigned ModsTmp;
4253 SelectVOP3ModsImpl(Src, Src, ModsTmp);
4254
4255 if ((ModsTmp & SISrcMods::NEG) != 0)
4256 Mods ^= SISrcMods::NEG;
4257
4258 if ((ModsTmp & SISrcMods::ABS) != 0)
4259 Mods |= SISrcMods::ABS;
4260 }
4261
4262 // op_sel/op_sel_hi decide the source type and source.
4263 // If the source's op_sel_hi is set, it indicates to do a conversion from
4264 // fp16. If the sources's op_sel is set, it picks the high half of the source
4265 // register.
4266
4267 Mods |= SISrcMods::OP_SEL_1;
4268 if (Src.getValueSizeInBits() == 16) {
4269 if (isExtractHiElt(Src, Src)) {
4270 Mods |= SISrcMods::OP_SEL_0;
4271
4272 // TODO: Should we try to look for neg/abs here?
4273 return true;
4274 }
4275
4276 if (Src.getOpcode() == ISD::TRUNCATE &&
4277 Src.getOperand(0).getValueType() == MVT::i32) {
4278 Src = Src.getOperand(0);
4279 return true;
4280 }
4281
4282 if (Subtarget->useRealTrue16Insts())
4283 // In true16 mode, pack src to a 32bit
4284 Src = createVOP3PSrc32FromLo16(Src, In, CurDAG, Subtarget);
4285 } else if (IsExtractHigh)
4286 Mods |= SISrcMods::OP_SEL_0;
4287
4288 return true;
4289}
4290
4291bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
4292 SDValue &SrcMods) const {
4293 unsigned Mods = 0;
4294 if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16))
4295 return false;
4296 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4297 return true;
4298}
4299
4300bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
4301 SDValue &SrcMods) const {
4302 unsigned Mods = 0;
4303 SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16);
4304 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4305 return true;
4306}
4307
4308bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16ModsExt(SDValue In, SDValue &Src,
4309 SDValue &SrcMods) const {
4310 unsigned Mods = 0;
4311 if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::bf16))
4312 return false;
4313 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4314 return true;
4315}
4316
4317bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16Mods(SDValue In, SDValue &Src,
4318 SDValue &SrcMods) const {
4319 unsigned Mods = 0;
4320 SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::bf16);
4321 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4322 return true;
4323}
4324
4325// Match BITOP3 operation and return a number of matched instructions plus
4326// truth table.
4327static std::pair<unsigned, uint8_t> BitOp3_Op(SDValue In,
4329 unsigned NumOpcodes = 0;
4330 uint8_t LHSBits, RHSBits;
4331
4332 auto getOperandBits = [&Src, In](SDValue Op, uint8_t &Bits) -> bool {
4333 // Define truth table given Src0, Src1, Src2 bits permutations:
4334 // 0 0 0
4335 // 0 0 1
4336 // 0 1 0
4337 // 0 1 1
4338 // 1 0 0
4339 // 1 0 1
4340 // 1 1 0
4341 // 1 1 1
4342 const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
4343
4344 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
4345 if (C->isAllOnes()) {
4346 Bits = 0xff;
4347 return true;
4348 }
4349 if (C->isZero()) {
4350 Bits = 0;
4351 return true;
4352 }
4353 }
4354
4355 for (unsigned I = 0; I < Src.size(); ++I) {
4356 // Try to find existing reused operand
4357 if (Src[I] == Op) {
4358 Bits = SrcBits[I];
4359 return true;
4360 }
4361 // Try to replace parent operator
4362 if (Src[I] == In) {
4363 Bits = SrcBits[I];
4364 Src[I] = Op;
4365 return true;
4366 }
4367 }
4368
4369 if (Src.size() == 3) {
4370 // No room left for operands. Try one last time, there can be a 'not' of
4371 // one of our source operands. In this case we can compute the bits
4372 // without growing Src vector.
4373 if (Op.getOpcode() == ISD::XOR) {
4374 if (auto *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
4375 if (C->isAllOnes()) {
4376 SDValue LHS = Op.getOperand(0);
4377 for (unsigned I = 0; I < Src.size(); ++I) {
4378 if (Src[I] == LHS) {
4379 Bits = ~SrcBits[I];
4380 return true;
4381 }
4382 }
4383 }
4384 }
4385 }
4386
4387 return false;
4388 }
4389
4390 Bits = SrcBits[Src.size()];
4391 Src.push_back(Op);
4392 return true;
4393 };
4394
4395 switch (In.getOpcode()) {
4396 case ISD::AND:
4397 case ISD::OR:
4398 case ISD::XOR: {
4399 SDValue LHS = In.getOperand(0);
4400 SDValue RHS = In.getOperand(1);
4401
4402 SmallVector<SDValue, 3> Backup(Src.begin(), Src.end());
4403 if (!getOperandBits(LHS, LHSBits) ||
4404 !getOperandBits(RHS, RHSBits)) {
4405 Src = std::move(Backup);
4406 return std::make_pair(0, 0);
4407 }
4408
4409 // Recursion is naturally limited by the size of the operand vector.
4410 auto Op = BitOp3_Op(LHS, Src);
4411 if (Op.first) {
4412 NumOpcodes += Op.first;
4413 LHSBits = Op.second;
4414 }
4415
4416 Op = BitOp3_Op(RHS, Src);
4417 if (Op.first) {
4418 NumOpcodes += Op.first;
4419 RHSBits = Op.second;
4420 }
4421 break;
4422 }
4423 default:
4424 return std::make_pair(0, 0);
4425 }
4426
4427 uint8_t TTbl;
4428 switch (In.getOpcode()) {
4429 case ISD::AND:
4430 TTbl = LHSBits & RHSBits;
4431 break;
4432 case ISD::OR:
4433 TTbl = LHSBits | RHSBits;
4434 break;
4435 case ISD::XOR:
4436 TTbl = LHSBits ^ RHSBits;
4437 break;
4438 default:
4439 break;
4440 }
4441
4442 return std::make_pair(NumOpcodes + 1, TTbl);
4443}
4444
4445bool AMDGPUDAGToDAGISel::SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1,
4446 SDValue &Src2, SDValue &Tbl) const {
4448 uint8_t TTbl;
4449 unsigned NumOpcodes;
4450
4451 std::tie(NumOpcodes, TTbl) = BitOp3_Op(In, Src);
4452
4453 // Src.empty() case can happen if all operands are all zero or all ones.
4454 // Normally it shall be optimized out before reaching this.
4455 if (NumOpcodes < 2 || Src.empty())
4456 return false;
4457
4458 // For a uniform case threshold should be higher to account for moves between
4459 // VGPRs and SGPRs. It needs one operand in a VGPR, rest two can be in SGPRs
4460 // and a readtfirstlane after.
4461 if (NumOpcodes < 4 && !In->isDivergent())
4462 return false;
4463
4464 if (NumOpcodes == 2 && In.getValueType() == MVT::i32) {
4465 // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
4466 // asm more readable. This cannot be modeled with AddedComplexity because
4467 // selector does not know how many operations did we match.
4468 if ((In.getOpcode() == ISD::XOR || In.getOpcode() == ISD::OR) &&
4469 (In.getOperand(0).getOpcode() == In.getOpcode() ||
4470 In.getOperand(1).getOpcode() == In.getOpcode()))
4471 return false;
4472
4473 if (In.getOpcode() == ISD::OR &&
4474 (In.getOperand(0).getOpcode() == ISD::AND ||
4475 In.getOperand(1).getOpcode() == ISD::AND))
4476 return false;
4477 }
4478
4479 // Last operand can be ignored, turning a ternary operation into a binary.
4480 // For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
4481 // 'c' with 'a' here without changing the answer. In some pathological
4482 // cases it should be possible to get an operation with a single operand
4483 // too if optimizer would not catch it.
4484 while (Src.size() < 3)
4485 Src.push_back(Src[0]);
4486
4487 Src0 = Src[0];
4488 Src1 = Src[1];
4489 Src2 = Src[2];
4490
4491 Tbl = CurDAG->getTargetConstant(TTbl, SDLoc(In), MVT::i32);
4492 return true;
4493}
4494
4495SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
4496 if (In.isUndef())
4497 return CurDAG->getUNDEF(MVT::i32);
4498
4499 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
4500 SDLoc SL(In);
4501 return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
4502 }
4503
4504 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
4505 SDLoc SL(In);
4506 return CurDAG->getConstant(
4507 C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
4508 }
4509
4510 SDValue Src;
4511 if (isExtractHiElt(In, Src))
4512 return Src;
4513
4514 return SDValue();
4515}
4516
4517bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
4518 assert(CurDAG->getTarget().getTargetTriple().isAMDGCN());
4519
4520 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
4521 const SIInstrInfo *SII = Subtarget->getInstrInfo();
4522
4523 unsigned Limit = 0;
4524 bool AllUsesAcceptSReg = true;
4525 for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
4526 Limit < 10 && U != E; ++U, ++Limit) {
4527 const TargetRegisterClass *RC =
4528 getOperandRegClass(U->getUser(), U->getOperandNo());
4529
4530 // If the register class is unknown, it could be an unknown
4531 // register class that needs to be an SGPR, e.g. an inline asm
4532 // constraint
4533 if (!RC || SIRI->isSGPRClass(RC))
4534 return false;
4535
4536 if (RC != &AMDGPU::VS_32RegClass && RC != &AMDGPU::VS_64RegClass &&
4537 RC != &AMDGPU::VS_64_Align2RegClass) {
4538 AllUsesAcceptSReg = false;
4539 SDNode *User = U->getUser();
4540 if (User->isMachineOpcode()) {
4541 unsigned Opc = User->getMachineOpcode();
4542 const MCInstrDesc &Desc = SII->get(Opc);
4543 if (Desc.isCommutable()) {
4544 unsigned OpIdx = Desc.getNumDefs() + U->getOperandNo();
4545 unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
4546 if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
4547 unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
4548 const TargetRegisterClass *CommutedRC =
4549 getOperandRegClass(U->getUser(), CommutedOpNo);
4550 if (CommutedRC == &AMDGPU::VS_32RegClass ||
4551 CommutedRC == &AMDGPU::VS_64RegClass ||
4552 CommutedRC == &AMDGPU::VS_64_Align2RegClass)
4553 AllUsesAcceptSReg = true;
4554 }
4555 }
4556 }
4557 // If "AllUsesAcceptSReg == false" so far we haven't succeeded
4558 // commuting current user. This means have at least one use
4559 // that strictly require VGPR. Thus, we will not attempt to commute
4560 // other user instructions.
4561 if (!AllUsesAcceptSReg)
4562 break;
4563 }
4564 }
4565 return !AllUsesAcceptSReg && (Limit < 10);
4566}
4567
4568bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
4569 const auto *Ld = cast<LoadSDNode>(N);
4570 const MachineMemOperand *MMO = Ld->getMemOperand();
4571
4572 // FIXME: We ought to able able to take the direct isDivergent result. We
4573 // cannot rely on the MMO for a uniformity check, and should stop using
4574 // it. This is a hack for 2 ways that the IR divergence analysis is superior
4575 // to the DAG divergence: Recognizing shift-of-workitem-id as always
4576 // uniform, and isSingleLaneExecution. These should be handled in the DAG
4577 // version, and then this can be dropped.
4578 if (Ld->isDivergent() && !AMDGPU::isUniformMMO(MMO))
4579 return false;
4580
4581 return MMO->getSize().hasValue() &&
4582 Ld->getAlign() >=
4583 Align(std::min(MMO->getSize().getValue().getKnownMinValue(),
4584 uint64_t(4))) &&
4585 (MMO->isInvariant() ||
4586 (Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
4587 Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
4588 (Subtarget->getScalarizeGlobalBehavior() &&
4589 Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
4590 Ld->isSimple() &&
4591 static_cast<const SITargetLowering *>(getTargetLowering())
4592 ->isMemOpHasNoClobberedMemOperand(N)));
4593}
4594
4597 *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
4598 bool IsModified = false;
4599 do {
4600 IsModified = false;
4601
4602 // Go over all selected nodes and try to fold them a bit more
4603 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
4604 while (Position != CurDAG->allnodes_end()) {
4605 SDNode *Node = &*Position++;
4607 if (!MachineNode)
4608 continue;
4609
4610 SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
4611 if (ResNode != Node) {
4612 if (ResNode)
4613 ReplaceUses(Node, ResNode);
4614 IsModified = true;
4615 }
4616 }
4617 CurDAG->RemoveDeadNodes();
4618 } while (IsModified);
4619}
4620
4625
return SDValue()
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr, SDValue &N0, SDValue &N1)
static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr)
static SDValue matchExtFromI32orI32(SDValue Op, bool IsSigned, const SelectionDAG *DAG)
static MemSDNode * findMemSDNode(SDNode *N)
static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val)
static SDValue combineBallotPattern(SDValue VCMP, bool &Negate)
static SDValue matchBF16FPExtendLike(SDValue Op, bool &IsExtractHigh)
static void checkWMMAElementsModifiersF16(BuildVectorSDNode *BV, std::function< bool(SDValue)> ModifierCheck)
Defines an instruction selector for the AMDGPU target.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In, Register &Out)
static std::pair< unsigned, uint8_t > BitOp3_Op(Register R, SmallVectorImpl< Register > &Src, const MachineRegisterInfo &MRI)
static unsigned gwsIntrinToOpcode(unsigned IntrID)
Provides AMDGPU specific target descriptions.
Base class for AMDGPU specific classes of TargetSubtarget.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
const HexagonInstrInfo * TII
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
MachineInstr unsigned OpIdx
FunctionAnalysisManager FAM
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
Provides R600 specific target descriptions.
Interface definition for R600RegisterInfo.
const SmallVectorImpl< MachineOperand > & Cond
SI DAG Lowering interface definition.
#define LLVM_DEBUG(...)
Definition Debug.h:119
LLVM IR instance of the generic uniformity analysis.
Value * RHS
Value * LHS
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
AMDGPUDAGToDAGISelLegacy(TargetMachine &TM, CodeGenOptLevel OptLevel)
bool runOnMachineFunction(MachineFunction &MF) override
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
StringRef getPassName() const override
getPassName - Return a nice clean name for a pass.
AMDGPU specific code to select AMDGPU machine instructions for SelectionDAG operations.
bool isSDWAOperand(const SDNode *N) const
void SelectBuildVector(SDNode *N, unsigned RegClassID)
void Select(SDNode *N) override
Main hook for targets to transform nodes into machine nodes.
bool runOnMachineFunction(MachineFunction &MF) override
void PreprocessISelDAG() override
PreprocessISelDAG - This hook allows targets to hack on the graph before instruction selection starts...
void PostprocessISelDAG() override
PostprocessISelDAG() - This hook allows the target to hack on the graph right after selection.
bool matchLoadD16FromBuildVector(SDNode *N) const
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
AMDGPUISelDAGToDAGPass(TargetMachine &TM)
static SDValue stripBitcast(SDValue Val)
static const fltSemantics & BFloat()
Definition APFloat.h:295
static const fltSemantics & IEEEhalf()
Definition APFloat.h:294
Class for arbitrary precision integers.
Definition APInt.h:78
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1563
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition APInt.h:467
bool isMaxSignedValue() const
Determine if this is the largest signed value.
Definition APInt.h:406
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1585
unsigned countr_one() const
Count the number of trailing one bits.
Definition APInt.h:1679
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
Get the array size.
Definition ArrayRef.h:141
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
A "pseudo-class" with methods for operating on BUILD_VECTORs.
LLVM_ABI SDValue getSplatValue(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted value or a null value if this is not a splat.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
int64_t getSExtValue() const
Analysis pass which computes a DominatorTree.
Definition Dominators.h:274
Legacy analysis pass which computes a DominatorTree.
Definition Dominators.h:310
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:155
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
const SIInstrInfo * getInstrInfo() const override
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
Generation getGeneration() const
void checkSubtargetFeatures(const Function &F) const
Diagnose inconsistent subtarget features before attempting to codegen function F.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
bool hasValue() const
TypeSize getValue() const
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:587
SmallVector< LoopT *, 4 > getLoopsInPreorder() const
Return all of the loops in the function in preorder across the loop nests, with siblings in forward p...
The legacy pass manager's analysis pass to compute loop information.
Definition LoopInfo.h:612
Machine Value Type.
static MVT getIntegerVT(unsigned BitWidth)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
LocationSize getSize() const
Return the size in bytes of the memory reference.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
MachineMemOperand * getMemOperand() const
Return the unique MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
Wrapper class representing virtual and physical registers.
Definition Register.h:20
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumOperands() const
Return the number of values used by this operation.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
bool isPredecessorOf(const SDNode *N) const
Return true if this node is a predecessor of N.
bool isAnyAdd() const
Returns true if the node type is ADD or PTRADD.
static use_iterator use_end()
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
uint64_t getConstantOperandVal(unsigned i) const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isSGPRClass(const TargetRegisterClass *RC)
bool runOnMachineFunction(MachineFunction &MF) override
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
SelectionDAGISelLegacy(char &ID, std::unique_ptr< SelectionDAGISel > S)
SelectionDAGISelPass(std::unique_ptr< SelectionDAGISel > Selector)
LLVM_ABI PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
std::unique_ptr< FunctionLoweringInfo > FuncInfo
const TargetLowering * TLI
const TargetInstrInfo * TII
void ReplaceUses(SDValue F, SDValue T)
ReplaceUses - replace all uses of the old node F with the use of the new node T.
void ReplaceNode(SDNode *F, SDNode *T)
Replace all uses of F with T, then remove F from the DAG.
SelectionDAGISel(TargetMachine &tm, CodeGenOptLevel OL=CodeGenOptLevel::Default)
virtual bool runOnMachineFunction(MachineFunction &mf)
const TargetLowering * getTargetLowering() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
SDValue getTargetFrameIndex(int FI, EVT VT)
LLVM_ABI bool SignBitIsZero(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
MachineFunction & getMachineFunction() const
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
ilist< SDNode >::iterator allnodes_iterator
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
static const unsigned CommuteAnyOperandIndex
Primary interface to the complete machine description for the target machine.
unsigned getID() const
Return the register class ID number.
Legacy analysis pass which computes a CycleInfo.
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
std::optional< int64_t > getSMRDEncodedLiteralOffset32(const MCSubtargetInfo &ST, int64_t ByteOffset)
bool isGFX12Plus(const MCSubtargetInfo &STI)
constexpr int64_t getNullPointerValue(unsigned AS)
Get the null pointer value for the given address space.
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST)
std::optional< int64_t > getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset, bool IsBuffer, bool HasSOffset)
bool isUniformMMO(const MachineMemOperand *MMO)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:823
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to; it returns an output chain.
@ PTRADD
PTRADD represents pointer arithmetic semantics, for targets that opt in using shouldPreservePtrArith(...
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:275
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:294
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:522
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:857
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:518
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:220
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:884
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition ISDOpcodes.h:997
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:254
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ CONVERGENCECTRL_GLUE
This does not correspond to any convergence control intrinsic.
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:848
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:665
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:541
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:233
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:230
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:769
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:649
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:576
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:224
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:854
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values, following IEEE-754 definition...
@ TargetFrameIndex
Definition ISDOpcodes.h:187
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:892
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:982
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:328
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:205
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:304
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:963
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:860
@ BRCOND
BRCOND - Conditional branch.
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:213
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:556
bool isExtOpcode(unsigned Opcode)
LLVM_ABI bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
@ User
could "use" a pointer
This is an optimization pass for GlobalISel generic memory operations.
@ Offset
Definition DWP.cpp:558
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
@ Undef
Value of the register doesn't matter.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
constexpr bool isMask_32(uint32_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition MathExtras.h:255
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
Op::Description Desc
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:156
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
static bool getConstantValue(SDValue N, uint32_t &Out)
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:221
FunctionPass * createAMDGPUISelDag(TargetMachine &TM, CodeGenOptLevel OptLevel)
This pass converts a legalized DAG into an AMDGPU-specific DAG, ready for instruction scheduling.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition VE.h:376
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
Implement std::hash so that hash_code can be used in STL containers.
Definition BitVector.h:861
#define N
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:396
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:408
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:339
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:279
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:351
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:165
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:359
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition KnownBits.h:315
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false, bool SelfAdd=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:361
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition KnownBits.h:146
APInt getMinValue() const
Return the minimal unsigned value possible given these KnownBits.
Definition KnownBits.h:130
static unsigned getSubRegFromChannel(unsigned Channel)
bool hasNoUnsignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.