//===-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp - Call lowering -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements the lowering of LLVM calls to machine code calls for
/// GlobalISel.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUCallLowering.h"
#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-call-lowering"

using namespace llvm;
namespace {

/// Wrapper around extendRegister to ensure we extend to a full 32-bit register.
static Register extendRegisterMin32(CallLowering::ValueHandler &Handler,
                                    Register ValVReg, const CCValAssign &VA) {
  if (VA.getLocVT().getSizeInBits() < 32) {
    // 16-bit types are reported as legal for 32-bit registers. We need to
    // extend and do a 32-bit copy to avoid the verifier complaining about it.
    return Handler.MIRBuilder.buildAnyExt(LLT::scalar(32), ValVReg).getReg(0);
  }

  return Handler.extendRegister(ValVReg, VA);
}

struct AMDGPUOutgoingValueHandler : public CallLowering::OutgoingValueHandler {
  AMDGPUOutgoingValueHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                             MachineInstrBuilder MIB)
      : OutgoingValueHandler(B, MRI), MIB(MIB) {}

  MachineInstrBuilder MIB;
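  // Note: the conventions handled by this handler assign return values
  // entirely in registers, so the stack-addressing hooks below should never
  // be reached.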
  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO,
                           ISD::ArgFlagsTy Flags) override {
    llvm_unreachable("not implemented");
  }

  void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
                            const MachinePointerInfo &MPO,
                            const CCValAssign &VA) override {
    llvm_unreachable("not implemented");
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        const CCValAssign &VA,
                        ISD::ArgFlagsTy Flags = {}) override {
    Register ExtReg = extendRegisterMin32(*this, ValVReg, VA);

    // If this is a scalar return, insert a readfirstlane just in case the
    // value ends up in a VGPR.
    // FIXME: Assert this is a shader return.
    const SIRegisterInfo *TRI
      = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
    if (TRI->isSGPRReg(MRI, PhysReg)) {
      LLT Ty = MRI.getType(ExtReg);
      LLT S32 = LLT::scalar(32);
      if (Ty != S32) {
        // FIXME: We should probably support readfirstlane intrinsics with all
        // legal 32-bit types.
        assert(Ty.getSizeInBits() == 32);
        if (Ty.isPointer())
          ExtReg = MIRBuilder.buildPtrToInt(S32, ExtReg).getReg(0);
        else
          ExtReg = MIRBuilder.buildBitcast(S32, ExtReg).getReg(0);
      }

      auto ToSGPR = MIRBuilder
                        .buildIntrinsic(Intrinsic::amdgcn_readfirstlane,
                                        {MRI.getType(ExtReg)})
                        .addReg(ExtReg);
      ExtReg = ToSGPR.getReg(0);
    }
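    // Copy the value into the ABI return register and record the register as
    // an implicit use of the return instruction so it stays live until the
    // return executes.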
    MIRBuilder.buildCopy(PhysReg, ExtReg);
    MIB.addUse(PhysReg, RegState::Implicit);
  }
};

struct AMDGPUIncomingArgHandler : public CallLowering::IncomingValueHandler {
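  // Byte size of the incoming stack argument area touched so far; updated as
  // fixed stack objects are created for stack-passed arguments.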
  uint64_t StackUsed = 0;

  AMDGPUIncomingArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI)
      : IncomingValueHandler(B, MRI) {}

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO,
                           ISD::ArgFlagsTy Flags) override {
    auto &MFI = MIRBuilder.getMF().getFrameInfo();

    // Byval is assumed to be writable memory, but other stack-passed arguments
    // are not.
    const bool IsImmutable = !Flags.isByVal();
    int FI = MFI.CreateFixedObject(Size, Offset, IsImmutable);
    MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
    auto AddrReg = MIRBuilder.buildFrameIndex(
        LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32), FI);
    StackUsed = std::max(StackUsed, Size + Offset);
    return AddrReg.getReg(0);
  }

  void copyToReg(Register ValVReg, Register PhysReg, const CCValAssign &VA) {
    if (VA.getLocVT().getSizeInBits() < 32) {
      // 16-bit types are reported as legal for 32-bit registers. We need to
      // do a 32-bit copy, and truncate to avoid the verifier complaining
      // about it.
      auto Copy = MIRBuilder.buildCopy(LLT::scalar(32), PhysReg);

      // If we have signext/zeroext, it applies to the whole 32-bit register
      // before truncation.
      auto Extended =
          buildExtensionHint(VA, Copy.getReg(0), LLT(VA.getLocVT()));
      MIRBuilder.buildTrunc(ValVReg, Extended);
      return;
    }

    IncomingValueHandler::assignValueToReg(ValVReg, PhysReg, VA);
  }

  void readLaneToSGPR(Register ValVReg, Register PhysReg,
                      const CCValAssign &VA) {
    // Handle inreg parameters passed through VGPRs due to SGPR exhaustion.
    // When SGPRs are exhausted, the calling convention may allocate inreg
    // parameters to VGPRs. We insert readfirstlane to move the value from
    // VGPR to SGPR, as required by the inreg ABI.
    //
    // FIXME: This may increase instruction count in some cases. If the
    // readfirstlane result is subsequently copied back to a VGPR, we cannot
    // optimize away the unnecessary VGPR->SGPR->VGPR sequence in later passes
    // because the inreg attribute information is not preserved in MIR. We could
    // use WWM_COPY (or similar instructions) and mark it as foldable to enable
    // later optimization passes to eliminate the redundant readfirstlane.
    auto Copy = MIRBuilder.buildCopy(LLT::scalar(32), PhysReg);
    if (VA.getLocVT().getSizeInBits() < 32) {
      auto ToSGPR = MIRBuilder
                        .buildIntrinsic(Intrinsic::amdgcn_readfirstlane,
                                        {MRI.getType(Copy.getReg(0))})
                        .addReg(Copy.getReg(0));
      auto Extended =
          buildExtensionHint(VA, ToSGPR.getReg(0), LLT(VA.getLocVT()));
      MIRBuilder.buildTrunc(ValVReg, Extended);
      return;
    }

    MIRBuilder.buildIntrinsic(Intrinsic::amdgcn_readfirstlane, ValVReg)
        .addReg(Copy.getReg(0));
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        const CCValAssign &VA,
                        ISD::ArgFlagsTy Flags = {}) override {
    markPhysRegUsed(PhysReg);

    const SIRegisterInfo *TRI =
        static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

    // The inreg flag should be the same across all SplitArg[i].
    if (Flags.isInReg() && TRI->isVGPR(MRI, PhysReg))
      readLaneToSGPR(ValVReg, PhysReg, VA);
    else
      copyToReg(ValVReg, PhysReg, VA);
  }

  void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
                            const MachinePointerInfo &MPO,
                            const CCValAssign &VA) override {
    MachineFunction &MF = MIRBuilder.getMF();

    auto *MMO = MF.getMachineMemOperand(
        MPO, MachineMemOperand::MOLoad, MemTy,
        inferAlignFromPtrInfo(MF, MPO));
    MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
  }

  /// How the physical register gets marked varies between formal
  /// parameters (it's a basic-block live-in), and a call instruction
  /// (it's an implicit-def of the call).
  virtual void markPhysRegUsed(unsigned PhysReg) = 0;
};

struct FormalArgHandler : public AMDGPUIncomingArgHandler {
  FormalArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI)
      : AMDGPUIncomingArgHandler(B, MRI) {}

  void markPhysRegUsed(unsigned PhysReg) override {
    MIRBuilder.getMBB().addLiveIn(PhysReg);
  }
};

struct CallReturnHandler : public AMDGPUIncomingArgHandler {
  CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
                    MachineInstrBuilder MIB)
      : AMDGPUIncomingArgHandler(MIRBuilder, MRI), MIB(MIB) {}

  void markPhysRegUsed(unsigned PhysReg) override {
    MIB.addDef(PhysReg, RegState::Implicit);
  }

  MachineInstrBuilder MIB;
};

struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler {
  /// For tail calls, the byte offset of the call's argument area from the
  /// callee's. Unused elsewhere.
  int FPDiff;

  // Cache the SP register vreg if we need it more than once in this call site.
  Register SPReg;

  bool IsTailCall;

  AMDGPUOutgoingArgHandler(MachineIRBuilder &MIRBuilder,
                           MachineRegisterInfo &MRI, MachineInstrBuilder MIB,
                           bool IsTailCall = false, int FPDiff = 0)
      : AMDGPUOutgoingValueHandler(MIRBuilder, MRI, MIB), FPDiff(FPDiff),
        IsTailCall(IsTailCall) {}

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO,
                           ISD::ArgFlagsTy Flags) override {
    MachineFunction &MF = MIRBuilder.getMF();
    const LLT PtrTy = LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32);
    const LLT S32 = LLT::scalar(32);

    if (IsTailCall) {
      Offset += FPDiff;
      int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true);
      auto FIReg = MIRBuilder.buildFrameIndex(PtrTy, FI);
      MPO = MachinePointerInfo::getFixedStack(MF, FI);
      return FIReg.getReg(0);
    }

    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

    if (!SPReg) {
      const GCNSubtarget &ST = MIRBuilder.getMF().getSubtarget<GCNSubtarget>();
      if (ST.hasFlatScratchEnabled()) {
        // The stack is accessed unswizzled, so we can use a regular copy.
        SPReg = MIRBuilder.buildCopy(PtrTy,
                                     MFI->getStackPtrOffsetReg()).getReg(0);
      } else {
        // The address we produce here, without knowing the use context, is
        // going to be interpreted as a vector address, so we need to convert
        // to a swizzled address.
        SPReg = MIRBuilder.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {PtrTy},
                                      {MFI->getStackPtrOffsetReg()}).getReg(0);
      }
    }

    auto OffsetReg = MIRBuilder.buildConstant(S32, Offset);

    auto AddrReg = MIRBuilder.buildPtrAdd(PtrTy, SPReg, OffsetReg);
    MPO = MachinePointerInfo::getStack(MF, Offset);
    return AddrReg.getReg(0);
  }

  void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
                            const MachinePointerInfo &MPO,
                            const CCValAssign &VA) override {
    MachineFunction &MF = MIRBuilder.getMF();
    uint64_t LocMemOffset = VA.getLocMemOffset();
    const auto &ST = MF.getSubtarget<GCNSubtarget>();

    auto *MMO = MF.getMachineMemOperand(
        MPO, MachineMemOperand::MOStore, MemTy,
        commonAlignment(ST.getStackAlignment(), LocMemOffset));
    MIRBuilder.buildStore(ValVReg, Addr, *MMO);
  }

  void assignValueToAddress(const CallLowering::ArgInfo &Arg,
                            unsigned ValRegIndex, Register Addr, LLT MemTy,
                            const MachinePointerInfo &MPO,
                            const CCValAssign &VA) override {
    Register ValVReg = VA.getLocInfo() != CCValAssign::LocInfo::FullValue
                           ? extendRegister(Arg.Regs[ValRegIndex], VA)
                           : Arg.Regs[ValRegIndex];
    assignValueToAddress(ValVReg, Addr, MemTy, MPO, VA);
  }
};
} // anonymous namespace

AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
    : CallLowering(&TLI) {
}

// FIXME: Compatibility shim
static ISD::NodeType extOpcodeToISDExtOpcode(unsigned MIOpc) {
  switch (MIOpc) {
  case TargetOpcode::G_SEXT:
    return ISD::SIGN_EXTEND;
  case TargetOpcode::G_ZEXT:
    return ISD::ZERO_EXTEND;
  case TargetOpcode::G_ANYEXT:
    return ISD::ANY_EXTEND;
  default:
    llvm_unreachable("not an extend opcode");
  }
}

bool AMDGPUCallLowering::canLowerReturn(MachineFunction &MF,
                                        CallingConv::ID CallConv,
                                        SmallVectorImpl<BaseArgInfo> &Outs,
                                        bool IsVarArg) const {
  // For shaders. Vector types should be explicitly handled by CC.
  if (AMDGPU::isEntryFunctionCC(CallConv))
    return true;

  SmallVector<CCValAssign, 16> ArgLocs;
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs,
                 MF.getFunction().getContext());

  return checkReturn(CCInfo, Outs, TLI.CCAssignFnForReturn(CallConv, IsVarArg));
}

/// Lower the return value for the already existing \p Ret. This assumes that
/// \p B's insertion point is correct.
bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B,
                                        const Value *Val,
                                        ArrayRef<Register> VRegs,
                                        MachineInstrBuilder &Ret) const {
  if (!Val)
    return true;

  auto &MF = B.getMF();
  const auto &F = MF.getFunction();
  const DataLayout &DL = MF.getDataLayout();
  MachineRegisterInfo *MRI = B.getMRI();
  LLVMContext &Ctx = F.getContext();

  CallingConv::ID CC = F.getCallingConv();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();

  SmallVector<EVT, 8> SplitEVTs;
  ComputeValueVTs(TLI, DL, Val->getType(), SplitEVTs);
  assert(VRegs.size() == SplitEVTs.size() &&
         "For each split Type there should be exactly one VReg.");

  SmallVector<ArgInfo, 8> SplitRetInfos;

  for (unsigned i = 0; i < SplitEVTs.size(); ++i) {
    EVT VT = SplitEVTs[i];
    Register Reg = VRegs[i];
    ArgInfo RetInfo(Reg, VT.getTypeForEVT(Ctx), 0);
    setArgFlags(RetInfo, AttributeList::ReturnIndex, DL, F);

    if (VT.isScalarInteger()) {
      unsigned ExtendOp = TargetOpcode::G_ANYEXT;
      if (RetInfo.Flags[0].isSExt()) {
        assert(RetInfo.Regs.size() == 1 && "expect only simple return values");
        ExtendOp = TargetOpcode::G_SEXT;
      } else if (RetInfo.Flags[0].isZExt()) {
        assert(RetInfo.Regs.size() == 1 && "expect only simple return values");
        ExtendOp = TargetOpcode::G_ZEXT;
      }

      EVT ExtVT = TLI.getTypeForExtReturn(Ctx, VT,
                                          extOpcodeToISDExtOpcode(ExtendOp));
      if (ExtVT != VT) {
        RetInfo.Ty = ExtVT.getTypeForEVT(Ctx);
        LLT ExtTy = getLLTForType(*RetInfo.Ty, DL);
        Reg = B.buildInstr(ExtendOp, {ExtTy}, {Reg}).getReg(0);
      }
    }

    if (Reg != RetInfo.Regs[0]) {
      RetInfo.Regs[0] = Reg;
      // Reset the arg flags after modifying Reg.
      setArgFlags(RetInfo, AttributeList::ReturnIndex, DL, F);
    }

    splitToValueTypes(RetInfo, SplitRetInfos, DL, CC);
  }

  CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(CC, F.isVarArg());

  OutgoingValueAssigner Assigner(AssignFn);
  AMDGPUOutgoingValueHandler RetHandler(B, *MRI, Ret);
  return determineAndHandleAssignments(RetHandler, Assigner, SplitRetInfos, B,
                                       CC, F.isVarArg());
}

bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val,
                                     ArrayRef<Register> VRegs,
                                     FunctionLoweringInfo &FLI) const {

  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MFI->setIfReturnsVoid(!Val);

  assert(!Val == VRegs.empty() && "Return value without a vreg");

  CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
  const bool IsShader = AMDGPU::isShader(CC);
  const bool IsWaveEnd =
      (IsShader && MFI->returnsVoid()) || AMDGPU::isKernel(CC);
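  // Shaders that return void and kernels have no caller to return to; the
  // wave simply terminates with S_ENDPGM instead of a return instruction.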
  if (IsWaveEnd) {
    B.buildInstr(AMDGPU::S_ENDPGM)
      .addImm(0);
    return true;
  }

  const bool IsWholeWave = MFI->isWholeWaveFunction();
  unsigned ReturnOpc = IsWholeWave ? AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN
                       : IsShader  ? AMDGPU::SI_RETURN_TO_EPILOG
                                   : AMDGPU::SI_RETURN;
  auto Ret = B.buildInstrNoInsert(ReturnOpc);

  if (!FLI.CanLowerReturn)
    insertSRetStores(B, Val->getType(), VRegs, FLI.DemoteRegister);
  else if (!lowerReturnVal(B, Val, VRegs, Ret))
    return false;

  if (IsWholeWave)
    addOriginalExecToReturn(B.getMF(), Ret);

  // TODO: Handle CalleeSavedRegsViaCopy.

  B.insertInstr(Ret);
  return true;
}

void AMDGPUCallLowering::lowerParameterPtr(Register DstReg, MachineIRBuilder &B,
                                           uint64_t Offset) const {
  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  Register KernArgSegmentPtr =
      Info->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  Register KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr);

  auto OffsetReg = B.buildConstant(LLT::scalar(64), Offset);

  B.buildPtrAdd(DstReg, KernArgSegmentVReg, OffsetReg);
}

void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, ArgInfo &OrigArg,
                                        uint64_t Offset,
                                        Align Alignment) const {
  MachineFunction &MF = B.getMF();
  const Function &F = MF.getFunction();
  const DataLayout &DL = F.getDataLayout();
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);

  SmallVector<ArgInfo, 32> SplitArgs;
  SmallVector<TypeSize> FieldOffsets;
  splitToValueTypes(OrigArg, SplitArgs, DL, F.getCallingConv(), &FieldOffsets);

  unsigned Idx = 0;
  for (ArgInfo &SplitArg : SplitArgs) {
    Register PtrReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
    lowerParameterPtr(PtrReg, B, Offset + FieldOffsets[Idx]);

    LLT ArgTy = getLLTForType(*SplitArg.Ty, DL);
    if (SplitArg.Flags[0].isPointer()) {
      // Compensate for losing pointeriness in splitValueTypes.
      LLT PtrTy = LLT::pointer(SplitArg.Flags[0].getPointerAddrSpace(),
                               ArgTy.getScalarSizeInBits());
      ArgTy = ArgTy.isVector() ? LLT::vector(ArgTy.getElementCount(), PtrTy)
                               : PtrTy;
    }

    MachineMemOperand *MMO = MF.getMachineMemOperand(
        PtrInfo,
        MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
            MachineMemOperand::MOInvariant,
        ArgTy, commonAlignment(Alignment, FieldOffsets[Idx]));

    assert(SplitArg.Regs.size() == 1);

    B.buildLoad(SplitArg.Regs[0], PtrReg, *MMO);
    ++Idx;
  }
}

// Allocate special inputs passed in user SGPRs.
static void allocateHSAUserSGPRs(CCState &CCInfo,
                                 MachineIRBuilder &B,
                                 MachineFunction &MF,
                                 const SIRegisterInfo &TRI,
                                 SIMachineFunctionInfo &Info) {
  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
  const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
  if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
    Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
    CCInfo.AllocateReg(PrivateSegmentBufferReg);
  }

  if (UserSGPRInfo.hasDispatchPtr()) {
    Register DispatchPtrReg = Info.addDispatchPtr(TRI);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchPtrReg);
  }

  if (UserSGPRInfo.hasQueuePtr()) {
    Register QueuePtrReg = Info.addQueuePtr(TRI);
    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(QueuePtrReg);
  }

  if (UserSGPRInfo.hasKernargSegmentPtr()) {
    MachineRegisterInfo &MRI = MF.getRegInfo();
    Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
    const LLT P4 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
    Register VReg = MRI.createGenericVirtualRegister(P4);
    MRI.addLiveIn(InputPtrReg, VReg);
    B.getMBB().addLiveIn(InputPtrReg);
    B.buildCopy(VReg, InputPtrReg);
    CCInfo.AllocateReg(InputPtrReg);
  }

  if (UserSGPRInfo.hasDispatchID()) {
    Register DispatchIDReg = Info.addDispatchID(TRI);
    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchIDReg);
  }

  if (UserSGPRInfo.hasFlatScratchInit()) {
    Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(FlatScratchInitReg);
  }

  if (UserSGPRInfo.hasPrivateSegmentSize()) {
    Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
    MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentSizeReg);
  }

  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
  // these from the dispatch pointer.
}

bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
    MachineIRBuilder &B, const Function &F,
    ArrayRef<ArrayRef<Register>> VRegs) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  const DataLayout &DL = F.getDataLayout();

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());

  allocateHSAUserSGPRs(CCInfo, B, MF, *TRI, *Info);

  unsigned i = 0;
  const Align KernArgBaseAlign(16);
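  // The kernarg segment base is assumed to be at least 16-byte aligned, so
  // each argument's effective alignment can be derived from this base
  // alignment and the argument's offset within the segment.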
  const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset();
  uint64_t ExplicitArgOffset = 0;

  // TODO: Align down to dword alignment and extract bits for extending loads.
  for (auto &Arg : F.args()) {
    // TODO: Add support for kernarg preload.
    if (Arg.hasAttribute("amdgpu-hidden-argument")) {
      LLVM_DEBUG(dbgs() << "Preloading hidden arguments is not supported\n");
      return false;
    }

    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
    if (AllocSize == 0)
      continue;

    MaybeAlign ParamAlign = IsByRef ? Arg.getParamAlign() : std::nullopt;
    Align ABIAlign = DL.getValueOrABITypeAlignment(ParamAlign, ArgTy);

    uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;
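    // Note: ArgOffset includes BaseOffset for addressing, while
    // ExplicitArgOffset tracks only the layout of the explicit arguments.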

    if (Arg.use_empty()) {
      ++i;
      continue;
    }

    Align Alignment = commonAlignment(KernArgBaseAlign, ArgOffset);

    if (IsByRef) {
      unsigned ByRefAS = cast<PointerType>(Arg.getType())->getAddressSpace();

      assert(VRegs[i].size() == 1 &&
             "expected only one register for byval pointers");
      if (ByRefAS == AMDGPUAS::CONSTANT_ADDRESS) {
        lowerParameterPtr(VRegs[i][0], B, ArgOffset);
      } else {
        const LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
        Register PtrReg = MRI.createGenericVirtualRegister(ConstPtrTy);
        lowerParameterPtr(PtrReg, B, ArgOffset);

        B.buildAddrSpaceCast(VRegs[i][0], PtrReg);
      }
    } else {
      ArgInfo OrigArg(VRegs[i], Arg, i);
      const unsigned OrigArgIdx = i + AttributeList::FirstArgIndex;
      setArgFlags(OrigArg, OrigArgIdx, DL, F);
      lowerParameter(B, OrigArg, ArgOffset, Alignment);
    }

    ++i;
  }

  if (Info->getNumKernargPreloadedSGPRs())
    Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());

  TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
  TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false);
  return true;
}

bool AMDGPUCallLowering::lowerFormalArguments(
    MachineIRBuilder &B, const Function &F, ArrayRef<ArrayRef<Register>> VRegs,
    FunctionLoweringInfo &FLI) const {
  CallingConv::ID CC = F.getCallingConv();

  // The infrastructure for normal calling convention lowering is essentially
  // useless for kernels. We want to avoid any kind of legalization or argument
  // splitting.
  if (CC == CallingConv::AMDGPU_KERNEL)
    return lowerFormalArgumentsKernel(B, F, VRegs);

  const bool IsGraphics = AMDGPU::isGraphics(CC);
  const bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC);

  MachineFunction &MF = B.getMF();
  MachineBasicBlock &MBB = B.getMBB();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
  const DataLayout &DL = F.getDataLayout();

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());
  const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();

  if (UserSGPRInfo.hasImplicitBufferPtr()) {
    Register ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);
    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(ImplicitBufferPtrReg);
  }

  // FIXME: This probably isn't defined for mesa
  if (UserSGPRInfo.hasFlatScratchInit() && !Subtarget.isAmdPalOS()) {
    Register FlatScratchInitReg = Info->addFlatScratchInit(*TRI);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(FlatScratchInitReg);
  }

  SmallVector<ArgInfo, 32> SplitArgs;
  unsigned Idx = 0;
  unsigned PSInputNum = 0;

  // Insert the hidden sret parameter if the return value won't fit in the
  // return registers.
  if (!FLI.CanLowerReturn)
    insertSRetIncomingArgument(F, SplitArgs, FLI.DemoteRegister, MRI, DL);

  for (auto &Arg : F.args()) {
    if (DL.getTypeStoreSize(Arg.getType()) == 0)
      continue;

    if (Info->isWholeWaveFunction() && Idx == 0) {
      assert(VRegs[Idx].size() == 1 && "Expected only one register");

      // The first argument for whole wave functions is the original EXEC value.
      B.buildInstr(AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
          .addDef(VRegs[Idx][0]);

      ++Idx;
      continue;
    }

    const bool InReg = Arg.hasAttribute(Attribute::InReg);

    if (Arg.hasAttribute(Attribute::SwiftSelf) ||
        Arg.hasAttribute(Attribute::SwiftError) ||
        Arg.hasAttribute(Attribute::Nest))
      return false;

    if (CC == CallingConv::AMDGPU_PS && !InReg && PSInputNum <= 15) {
      const bool ArgUsed = !Arg.use_empty();
      bool SkipArg = !ArgUsed && !Info->isPSInputAllocated(PSInputNum);

      if (!SkipArg) {
        Info->markPSInputAllocated(PSInputNum);
        if (ArgUsed)
          Info->markPSInputEnabled(PSInputNum);
      }

      ++PSInputNum;

      if (SkipArg) {
        for (Register R : VRegs[Idx])
          B.buildUndef(R);

        ++Idx;
        continue;
      }
    }

    ArgInfo OrigArg(VRegs[Idx], Arg, Idx);
    const unsigned OrigArgIdx = Idx + AttributeList::FirstArgIndex;
    setArgFlags(OrigArg, OrigArgIdx, DL, F);

    splitToValueTypes(OrigArg, SplitArgs, DL, CC);
    ++Idx;
  }

  // At least one interpolation mode must be enabled or else the GPU will
  // hang.
  //
  // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
  // set PSInputAddr, the user wants to enable some bits after the compilation
  // based on run-time states. Since we can't know what the final PSInputEna
  // will look like, we shouldn't do anything here and the user should take
  // responsibility for the correct programming.
  //
  // Otherwise, the following restrictions apply:
  // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
  // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
  //   enabled too.
  if (CC == CallingConv::AMDGPU_PS) {
    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 &&
         Info->isPSInputAllocated(11))) {
      CCInfo.AllocateReg(AMDGPU::VGPR0);
      CCInfo.AllocateReg(AMDGPU::VGPR1);
      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);
    }

    if (Subtarget.isAmdPalOS()) {
      // For isAmdPalOS, the user does not enable some bits after compilation
      // based on run-time states; the register values being generated here are
      // the final ones set in hardware. Therefore we need to apply the
      // workaround to PSInputAddr and PSInputEnable together. (The case where
      // a bit is set in PSInputAddr but not PSInputEnable is where the frontend
      // set up an input arg for a particular interpolation mode, but nothing
      // uses that input arg. Really we should have an earlier pass that removes
      // such an arg.)
      unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
      if ((PsInputBits & 0x7F) == 0 ||
          ((PsInputBits & 0xF) == 0 &&
           (PsInputBits >> 11 & 1)))
        Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
    }
  }

  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CC, F.isVarArg());

  if (!MBB.empty())
    B.setInstr(*MBB.begin());

  if (!IsEntryFunc && !IsGraphics) {
    // For the fixed ABI, pass workitem IDs in the last argument register.
    TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);

    if (!Subtarget.hasFlatScratchEnabled())
      CCInfo.AllocateReg(Info->getScratchRSrcReg());
    TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
  }

  IncomingValueAssigner Assigner(AssignFn);
  if (!determineAssignments(Assigner, SplitArgs, CCInfo))
    return false;

  if (IsEntryFunc) {
    // This assumes the registers are allocated by CCInfo in ascending order
    // with no gaps.
    Info->setNumWaveDispatchSGPRs(
        CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters()));
    Info->setNumWaveDispatchVGPRs(
        CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters()));
  }

  FormalArgHandler Handler(B, MRI);
  if (!handleAssignments(Handler, SplitArgs, CCInfo, ArgLocs, B))
    return false;

  uint64_t StackSize = Assigner.StackSize;

  // Start adding system SGPRs.
  if (IsEntryFunc)
    TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsGraphics);

  // When we tail call, we need to check if the callee's arguments will fit on
  // the caller's stack. So, whenever we lower formal arguments, we should keep
  // track of this information, since we might lower a tail call in this
  // function later.
  Info->setBytesInStackArgArea(StackSize);

  // Move back to the end of the basic block.
  B.setMBB(MBB);

  return true;
}

bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder,
    CCState &CCInfo,
    SmallVectorImpl<std::pair<MCRegister, Register>> &ArgRegs,
    CallLoweringInfo &Info) const {
  MachineFunction &MF = MIRBuilder.getMF();

  // If there's no call site, this doesn't correspond to a call from the IR and
  // doesn't need implicit inputs.
  if (!Info.CB)
    return true;

  const AMDGPUFunctionArgInfo &CalleeArgInfo =
      AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const AMDGPUFunctionArgInfo &CallerArgInfo = MFI->getArgInfo();

  // TODO: Unify with private memory register handling. This is complicated by
  // the fact that at least in kernels, the input argument is not necessarily
  // in the same location as the input.
  static constexpr AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
      AMDGPUFunctionArgInfo::DISPATCH_PTR,
      AMDGPUFunctionArgInfo::QUEUE_PTR,
      AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR,
      AMDGPUFunctionArgInfo::DISPATCH_ID,
      AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
      AMDGPUFunctionArgInfo::LDS_KERNEL_ID,
  };

  static constexpr StringLiteral ImplicitAttrNames[][2] = {
      {"amdgpu-no-dispatch-ptr", ""},
      {"amdgpu-no-queue-ptr", ""},
      {"amdgpu-no-implicitarg-ptr", ""},
      {"amdgpu-no-dispatch-id", ""},
      {"amdgpu-no-workgroup-id-x", "amdgpu-no-cluster-id-x"},
      {"amdgpu-no-workgroup-id-y", "amdgpu-no-cluster-id-y"},
      {"amdgpu-no-workgroup-id-z", "amdgpu-no-cluster-id-z"},
      {"amdgpu-no-lds-kernel-id", ""},
  };
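  // Each ImplicitAttrNames row lists the "amdgpu-no-*" attribute(s) that,
  // when all present on the call site, prove the corresponding InputRegs
  // entry is unused. The two arrays must be kept in the same order.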

  MachineRegisterInfo &MRI = MF.getRegInfo();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI
    = static_cast<const AMDGPULegalizerInfo*>(ST.getLegalizerInfo());

  unsigned I = 0;
  for (auto InputID : InputRegs) {
    const ArgDescriptor *OutgoingArg;
    const TargetRegisterClass *ArgRC;
    LLT ArgTy;

    // If the callee does not use the attribute value, skip copying the value.
    if (all_of(ImplicitAttrNames[I++], [&](StringRef AttrName) {
          return AttrName.empty() || Info.CB->hasFnAttr(AttrName);
        }))
      continue;

    std::tie(OutgoingArg, ArgRC, ArgTy) =
        CalleeArgInfo.getPreloadedValue(InputID);
    if (!OutgoingArg)
      continue;

    const ArgDescriptor *IncomingArg;
    const TargetRegisterClass *IncomingArgRC;
    std::tie(IncomingArg, IncomingArgRC, ArgTy) =
        CallerArgInfo.getPreloadedValue(InputID);
    assert(IncomingArgRC == ArgRC);

    Register InputReg = MRI.createGenericVirtualRegister(ArgTy);

    if (IncomingArg) {
      LI->buildLoadInputValue(InputReg, MIRBuilder, IncomingArg, ArgRC, ArgTy);
    } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
      LI->getImplicitArgPtr(InputReg, MRI, MIRBuilder);
    } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
      std::optional<uint32_t> Id =
          AMDGPUMachineFunction::getLDSKernelIdMetadata(MF.getFunction());
      if (Id) {
        MIRBuilder.buildConstant(InputReg, *Id);
      } else {
        MIRBuilder.buildUndef(InputReg);
      }
    } else {
      // We may have proven the input wasn't needed, although the ABI is
      // requiring it. We just need to allocate the register appropriately.
      MIRBuilder.buildUndef(InputReg);
    }

    if (OutgoingArg->isRegister()) {
      ArgRegs.emplace_back(OutgoingArg->getRegister(), InputReg);
      if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
        report_fatal_error("failed to allocate implicit input argument");
    } else {
      LLVM_DEBUG(dbgs() << "Unhandled stack passed implicit input argument\n");
      return false;
    }
  }

  // Pack workitem IDs into a single register, or pass them as-is if already
  // packed.
  const ArgDescriptor *OutgoingArg;
  const TargetRegisterClass *ArgRC;
  LLT ArgTy;

  std::tie(OutgoingArg, ArgRC, ArgTy) =
      CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  if (!OutgoingArg)
    std::tie(OutgoingArg, ArgRC, ArgTy) =
        CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  if (!OutgoingArg)
    std::tie(OutgoingArg, ArgRC, ArgTy) =
        CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  if (!OutgoingArg)
    return false;

  auto WorkitemIDX =
      CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  auto WorkitemIDY =
      CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  auto WorkitemIDZ =
      CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);

  const ArgDescriptor *IncomingArgX = std::get<0>(WorkitemIDX);
  const ArgDescriptor *IncomingArgY = std::get<0>(WorkitemIDY);
  const ArgDescriptor *IncomingArgZ = std::get<0>(WorkitemIDZ);
  const LLT S32 = LLT::scalar(32);

  const bool NeedWorkItemIDX = !Info.CB->hasFnAttr("amdgpu-no-workitem-id-x");
  const bool NeedWorkItemIDY = !Info.CB->hasFnAttr("amdgpu-no-workitem-id-y");
  const bool NeedWorkItemIDZ = !Info.CB->hasFnAttr("amdgpu-no-workitem-id-z");

  // If incoming ids are not packed, we need to pack them.
  // FIXME: Should consider known workgroup size to eliminate known 0 cases.
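  // The packed encoding places X in bits [9:0], Y in bits [19:10], and Z in
  // bits [29:20] of a single VGPR; the shifts by 10 and 20 below construct
  // this layout.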
  Register InputReg;
  if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo.WorkItemIDX &&
      NeedWorkItemIDX) {
    if (ST.getMaxWorkitemID(MF.getFunction(), 0) != 0) {
      InputReg = MRI.createGenericVirtualRegister(S32);
      LI->buildLoadInputValue(InputReg, MIRBuilder, IncomingArgX,
                              std::get<1>(WorkitemIDX),
                              std::get<2>(WorkitemIDX));
    } else {
      InputReg = MIRBuilder.buildConstant(S32, 0).getReg(0);
    }
  }

  if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo.WorkItemIDY &&
      NeedWorkItemIDY && ST.getMaxWorkitemID(MF.getFunction(), 1) != 0) {
    Register Y = MRI.createGenericVirtualRegister(S32);
    LI->buildLoadInputValue(Y, MIRBuilder, IncomingArgY,
                            std::get<1>(WorkitemIDY), std::get<2>(WorkitemIDY));

    Y = MIRBuilder.buildShl(S32, Y, MIRBuilder.buildConstant(S32, 10)).getReg(0);
    InputReg = InputReg ? MIRBuilder.buildOr(S32, InputReg, Y).getReg(0) : Y;
  }

  if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo.WorkItemIDZ &&
      NeedWorkItemIDZ && ST.getMaxWorkitemID(MF.getFunction(), 2) != 0) {
    Register Z = MRI.createGenericVirtualRegister(S32);
    LI->buildLoadInputValue(Z, MIRBuilder, IncomingArgZ,
                            std::get<1>(WorkitemIDZ), std::get<2>(WorkitemIDZ));

    Z = MIRBuilder.buildShl(S32, Z, MIRBuilder.buildConstant(S32, 20)).getReg(0);
    InputReg = InputReg ? MIRBuilder.buildOr(S32, InputReg, Z).getReg(0) : Z;
  }

  if (!InputReg &&
      (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
    InputReg = MRI.createGenericVirtualRegister(S32);
    if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
      // We're in a situation where the outgoing function requires the workitem
      // ID, but the calling function does not have it (e.g. a graphics function
      // calling a C calling convention function). This is illegal, but we need
      // to produce something.
      MIRBuilder.buildUndef(InputReg);
    } else {
      // Workitem ids are already packed; any of the present incoming arguments
      // will carry all required fields.
      ArgDescriptor IncomingArg = ArgDescriptor::createArg(
        IncomingArgX ? *IncomingArgX :
        IncomingArgY ? *IncomingArgY : *IncomingArgZ, ~0u);
      LI->buildLoadInputValue(InputReg, MIRBuilder, &IncomingArg,
                              &AMDGPU::VGPR_32RegClass, S32);
    }
  }

  if (OutgoingArg->isRegister()) {
    if (InputReg)
      ArgRegs.emplace_back(OutgoingArg->getRegister(), InputReg);

    if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
      report_fatal_error("failed to allocate implicit input argument");
  } else {
    LLVM_DEBUG(dbgs() << "Unhandled stack passed implicit input argument\n");
    return false;
  }

  return true;
}

/// Returns a pair containing the fixed CCAssignFn and the vararg CCAssignFn
/// for CC.
static std::pair<CCAssignFn *, CCAssignFn *>
getAssignFnsForCC(CallingConv::ID CC, const SITargetLowering &TLI) {
  return {TLI.CCAssignFnForCall(CC, false), TLI.CCAssignFnForCall(CC, true)};
}

static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,
                              bool IsTailCall, bool IsWave32,
                              CallingConv::ID CC,
                              bool IsDynamicVGPRChainCall = false) {
  // For calls to amdgpu_cs_chain functions, the address is known to be uniform.
  assert((AMDGPU::isChainCC(CC) || !IsIndirect || !IsTailCall) &&
         "Indirect calls can't be tail calls, "
         "because the address can be divergent");
  if (!IsTailCall)
    return AMDGPU::G_SI_CALL;

  if (AMDGPU::isChainCC(CC)) {
    if (IsDynamicVGPRChainCall)
      return IsWave32 ? AMDGPU::SI_CS_CHAIN_TC_W32_DVGPR
                      : AMDGPU::SI_CS_CHAIN_TC_W64_DVGPR;
    return IsWave32 ? AMDGPU::SI_CS_CHAIN_TC_W32 : AMDGPU::SI_CS_CHAIN_TC_W64;
  }

  if (CallerF.getFunction().getCallingConv() ==
      CallingConv::AMDGPU_Gfx_WholeWave)
    return AMDGPU::SI_TCRETURN_GFX_WholeWave;

  if (AMDGPU::isGraphics(CC))
    return AMDGPU::SI_TCRETURN_GFX;

  return AMDGPU::SI_TCRETURN;
}

// Add operands to call instruction to track the callee.
static bool addCallTargetOperands(MachineInstrBuilder &CallInst,
                                  MachineIRBuilder &MIRBuilder,
                                  AMDGPUCallLowering::CallLoweringInfo &Info,
                                  bool IsDynamicVGPRChainCall = false) {
  if (Info.Callee.isReg()) {
    CallInst.addReg(Info.Callee.getReg());
    CallInst.addImm(0);
  } else if (Info.Callee.isGlobal() && Info.Callee.getOffset() == 0) {
    // The call lowering lightly assumed we can directly encode a call target in
    // the instruction, which is not the case. Materialize the address here.
    const GlobalValue *GV = Info.Callee.getGlobal();
    auto Ptr = MIRBuilder.buildGlobalValue(
        LLT::pointer(GV->getAddressSpace(), 64), GV);
    CallInst.addReg(Ptr.getReg(0));

    if (IsDynamicVGPRChainCall) {
      // DynamicVGPR chain calls are always indirect.
      CallInst.addImm(0);
    } else
      CallInst.add(Info.Callee);
  } else
    return false;

  return true;
}

bool AMDGPUCallLowering::doCallerAndCalleePassArgsTheSameWay(
    CallLoweringInfo &Info, MachineFunction &MF,
    SmallVectorImpl<ArgInfo> &InArgs) const {
  const Function &CallerF = MF.getFunction();
  CallingConv::ID CalleeCC = Info.CallConv;
  CallingConv::ID CallerCC = CallerF.getCallingConv();

  // If the calling conventions match, then everything must be the same.
  if (CalleeCC == CallerCC)
    return true;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  // Make sure that the caller and callee preserve all of the same registers.
  const auto *TRI = ST.getRegisterInfo();

  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
  const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
  if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
    return false;

  // Check if the caller and callee will handle arguments in the same way.
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  CCAssignFn *CalleeAssignFnFixed;
  CCAssignFn *CalleeAssignFnVarArg;
  std::tie(CalleeAssignFnFixed, CalleeAssignFnVarArg) =
      getAssignFnsForCC(CalleeCC, TLI);

  CCAssignFn *CallerAssignFnFixed;
  CCAssignFn *CallerAssignFnVarArg;
  std::tie(CallerAssignFnFixed, CallerAssignFnVarArg) =
      getAssignFnsForCC(CallerCC, TLI);

  // FIXME: We are not accounting for potential differences in implicitly passed
  // inputs, but only the fixed ABI is supported now anyway.
  IncomingValueAssigner CalleeAssigner(CalleeAssignFnFixed,
                                       CalleeAssignFnVarArg);
  IncomingValueAssigner CallerAssigner(CallerAssignFnFixed,
                                       CallerAssignFnVarArg);
  return resultsCompatible(Info, MF, InArgs, CalleeAssigner, CallerAssigner);
}

bool AMDGPUCallLowering::areCalleeOutgoingArgsTailCallable(
    CallLoweringInfo &Info, MachineFunction &MF,
    SmallVectorImpl<ArgInfo> &OutArgs) const {
  // If there are no outgoing arguments, then we are done.
  if (OutArgs.empty())
    return true;

  const Function &CallerF = MF.getFunction();
  CallingConv::ID CalleeCC = Info.CallConv;
  CallingConv::ID CallerCC = CallerF.getCallingConv();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();

  CCAssignFn *AssignFnFixed;
  CCAssignFn *AssignFnVarArg;
  std::tie(AssignFnFixed, AssignFnVarArg) = getAssignFnsForCC(CalleeCC, TLI);

  // We have outgoing arguments. Make sure that we can tail call with them.
  SmallVector<CCValAssign, 16> OutLocs;
  CCState OutInfo(CalleeCC, false, MF, OutLocs, CallerF.getContext());
  OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg);

  if (!determineAssignments(Assigner, OutArgs, OutInfo)) {
    LLVM_DEBUG(dbgs() << "... Could not analyze call operands.\n");
    return false;
  }

  // Make sure that they can fit on the caller's stack.
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (OutInfo.getStackSize() > FuncInfo->getBytesInStackArgArea()) {
    LLVM_DEBUG(dbgs() << "... Cannot fit call operands on caller's stack.\n");
    return false;
  }

  // Verify that the parameters in callee-saved registers match.
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const uint32_t *CallerPreservedMask = TRI->getCallPreservedMask(MF, CallerCC);
  MachineRegisterInfo &MRI = MF.getRegInfo();
  return parametersInCSRMatch(MRI, CallerPreservedMask, OutLocs, OutArgs);
}

bool AMDGPUCallLowering::isEligibleForTailCallOptimization(
    MachineIRBuilder &B, CallLoweringInfo &Info,
    SmallVectorImpl<ArgInfo> &InArgs, SmallVectorImpl<ArgInfo> &OutArgs) const {
  // Must pass all target-independent checks in order to tail call optimize.
  if (!Info.IsTailCall)
    return false;

  // Indirect calls can't be tail calls, because the address can be divergent.
  // TODO Check divergence info if the call really is divergent.
  if (Info.Callee.isReg())
    return false;

  MachineFunction &MF = B.getMF();
  const Function &CallerF = MF.getFunction();
  CallingConv::ID CalleeCC = Info.CallConv;
  CallingConv::ID CallerCC = CallerF.getCallingConv();

  const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
  // Kernels aren't callable, and don't have a live in return address so it
  // doesn't make sense to do a tail call with entry functions.
  if (!CallerPreserved)
    return false;

  if (!AMDGPU::mayTailCallThisCC(CalleeCC)) {
    LLVM_DEBUG(dbgs() << "... Calling convention cannot be tail called.\n");
    return false;
  }

  if (any_of(CallerF.args(), [](const Argument &A) {
        return A.hasByValAttr() || A.hasSwiftErrorAttr();
      })) {
    LLVM_DEBUG(dbgs() << "... Cannot tail call from callers with byval "
                         "or swifterror arguments\n");
    return false;
  }

  // If we have -tailcallopt, then we're done.
  if (MF.getTarget().Options.GuaranteedTailCallOpt) {
    return AMDGPU::canGuaranteeTCO(CalleeCC) &&
           CalleeCC == CallerF.getCallingConv();
  }

  // Verify that the incoming and outgoing arguments from the callee are
  // safe to tail call.
  if (!doCallerAndCalleePassArgsTheSameWay(Info, MF, InArgs)) {
    LLVM_DEBUG(
        dbgs()
        << "... Caller and callee have incompatible calling conventions.\n");
    return false;
  }

  // FIXME: We need to check if any arguments passed in SGPR are uniform. If
  // they are not, this cannot be a tail call. If they are uniform, but may be
  // VGPR, we need to insert readfirstlanes.
  if (!areCalleeOutgoingArgsTailCallable(Info, MF, OutArgs))
    return false;

  LLVM_DEBUG(dbgs() << "... Call is eligible for tail call optimization.\n");
  return true;
}

// Insert outgoing implicit arguments for a call, by inserting copies to the
// implicit argument registers and adding the necessary implicit uses to the
// call instruction.
void AMDGPUCallLowering::handleImplicitCallArguments(
    MachineIRBuilder &MIRBuilder, MachineInstrBuilder &CallInst,
    const GCNSubtarget &ST, const SIMachineFunctionInfo &FuncInfo,
    CallingConv::ID CalleeCC,
    ArrayRef<std::pair<MCRegister, Register>> ImplicitArgRegs) const {
  if (!ST.hasFlatScratchEnabled()) {
    // Insert copies for the SRD. In the HSA case, this should be an identity
    // copy.
    auto ScratchRSrcReg = MIRBuilder.buildCopy(LLT::fixed_vector(4, 32),
                                               FuncInfo.getScratchRSrcReg());

    auto CalleeRSrcReg = AMDGPU::isChainCC(CalleeCC)
                             ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
                             : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;

    MIRBuilder.buildCopy(CalleeRSrcReg, ScratchRSrcReg);
    CallInst.addReg(CalleeRSrcReg, RegState::Implicit);
  }

  for (std::pair<MCRegister, Register> ArgReg : ImplicitArgRegs) {
    MIRBuilder.buildCopy((Register)ArgReg.first, ArgReg.second);
    CallInst.addReg(ArgReg.first, RegState::Implicit);
  }
}

namespace {
// Chain calls have special arguments that we need to handle. These have the
// same index as they do in the llvm.amdgcn.cs.chain intrinsic.
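// Operand 0 of the intrinsic is the callee, and operands 2 and 3 are the SGPR
// and VGPR argument aggregates; see lowerChainCall below for how they are
// consumed.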
enum ChainCallArgIdx {
  Exec = 1,
  Flags = 4,
  NumVGPRs = 5,
  FallbackExec = 6,
  FallbackCallee = 7,
};
} // anonymous namespace

bool AMDGPUCallLowering::lowerTailCall(
    MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info,
    SmallVectorImpl<ArgInfo> &OutArgs) const {
  MachineFunction &MF = MIRBuilder.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  const Function &F = MF.getFunction();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  // True when we're tail calling, but without -tailcallopt.
  bool IsSibCall = !MF.getTarget().Options.GuaranteedTailCallOpt;

  // Find out which ABI gets to decide where things go.
  CallingConv::ID CalleeCC = Info.CallConv;
  CCAssignFn *AssignFnFixed;
  CCAssignFn *AssignFnVarArg;
  std::tie(AssignFnFixed, AssignFnVarArg) = getAssignFnsForCC(CalleeCC, TLI);

  MachineInstrBuilder CallSeqStart;
  if (!IsSibCall)
    CallSeqStart = MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKUP);

  bool IsChainCall = AMDGPU::isChainCC(Info.CallConv);
  bool IsDynamicVGPRChainCall = false;

  if (IsChainCall) {
    ArgInfo FlagsArg = Info.OrigArgs[ChainCallArgIdx::Flags];
    const APInt &FlagsValue = cast<ConstantInt>(FlagsArg.OrigValue)->getValue();
    if (FlagsValue.isZero()) {
      if (Info.OrigArgs.size() != 5) {
        LLVM_DEBUG(dbgs() << "No additional args allowed if flags == 0\n");
        return false;
      }
    } else if (FlagsValue.isOneBitSet(0)) {
      IsDynamicVGPRChainCall = true;

      if (Info.OrigArgs.size() != 8) {
        LLVM_DEBUG(dbgs() << "Expected 3 additional args\n");
        return false;
      }

      // On GFX12, we can only change the VGPR allocation for wave32.
      if (!ST.isWave32()) {
        F.getContext().diagnose(DiagnosticInfoUnsupported(
            F, "dynamic VGPR mode is only supported for wave32"));
        return false;
      }

      ArgInfo FallbackExecArg = Info.OrigArgs[ChainCallArgIdx::FallbackExec];
      assert(FallbackExecArg.Regs.size() == 1 &&
             "Expected single register for fallback EXEC");
      if (!FallbackExecArg.Ty->isIntegerTy(ST.getWavefrontSize())) {
        LLVM_DEBUG(dbgs() << "Bad type for fallback EXEC\n");
        return false;
      }
    }
  }

  unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), /*IsTailCall*/ true,
                               ST.isWave32(), CalleeCC, IsDynamicVGPRChainCall);
  auto MIB = MIRBuilder.buildInstrNoInsert(Opc);

  if (FuncInfo->isWholeWaveFunction())
    addOriginalExecToReturn(MF, MIB);

  // Keep track of the index of the next operand to be added to the call.
  unsigned CalleeIdx = MIB->getNumOperands();

  if (!addCallTargetOperands(MIB, MIRBuilder, Info, IsDynamicVGPRChainCall))
    return false;

  // Byte offset for the tail call. When we are sibcalling, this will always
  // be 0.
  MIB.addImm(0);

  // If this is a chain call, we need to pass in the EXEC mask as well as any
  // other special args.
  if (IsChainCall) {
    auto AddRegOrImm = [&](const ArgInfo &Arg) {
      if (auto CI = dyn_cast<ConstantInt>(Arg.OrigValue)) {
        MIB.addImm(CI->getSExtValue());
      } else {
        MIB.addReg(Arg.Regs[0]);
        unsigned Idx = MIB->getNumOperands() - 1;
        MIB->getOperand(Idx).setReg(constrainOperandRegClass(
            MF, *TRI, MRI, *TII, *ST.getRegBankInfo(), *MIB, MIB->getDesc(),
            MIB->getOperand(Idx), Idx));
      }
    };

    ArgInfo ExecArg = Info.OrigArgs[ChainCallArgIdx::Exec];
    assert(ExecArg.Regs.size() == 1 && "Too many regs for EXEC");

    if (!ExecArg.Ty->isIntegerTy(ST.getWavefrontSize())) {
      LLVM_DEBUG(dbgs() << "Bad type for EXEC\n");
      return false;
    }

    AddRegOrImm(ExecArg);
    if (IsDynamicVGPRChainCall)
      std::for_each(Info.OrigArgs.begin() + ChainCallArgIdx::NumVGPRs,
                    Info.OrigArgs.end(), AddRegOrImm);
  }

  // Tell the call which registers are clobbered.
  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CalleeCC);
  MIB.addRegMask(Mask);

  // FPDiff is the byte offset of the call's argument area from the callee's.
  // Stores to callee stack arguments will be placed in FixedStackSlots offset
  // by this amount for a tail call. In a sibling call it must be 0 because the
  // caller will deallocate the entire stack and the callee still expects its
  // arguments to begin at SP+0.
  int FPDiff = 0;

  // This will be 0 for sibcalls, potentially nonzero for tail calls produced
  // by -tailcallopt. For sibcalls, the memory operands for the call are
  // already available in the caller's incoming argument space.
  unsigned NumBytes = 0;
  if (!IsSibCall) {
    // We aren't sibcalling, so we need to compute FPDiff. We need to do this
    // before handling assignments, because FPDiff must be known for memory
    // arguments.
    unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
    SmallVector<CCValAssign, 16> OutLocs;
    CCState OutInfo(CalleeCC, false, MF, OutLocs, F.getContext());

    // FIXME: Not accounting for callee implicit inputs
    OutgoingValueAssigner CalleeAssigner(AssignFnFixed, AssignFnVarArg);
    if (!determineAssignments(CalleeAssigner, OutArgs, OutInfo))
      return false;

    // The callee will pop the argument stack as a tail call. Thus, we must
    // keep it 16-byte aligned.
    NumBytes = alignTo(OutInfo.getStackSize(), ST.getStackAlignment());

    // FPDiff will be negative if this tail call requires more space than we
    // would automatically have in our incoming argument space. Positive if we
    // actually shrink the stack.
    FPDiff = NumReusableBytes - NumBytes;

    // The stack pointer must be 16-byte aligned at all times it's used for a
    // memory operation, which in practice means at *all* times and in
    // particular across call boundaries. Therefore our own arguments started at
    // a 16-byte aligned SP and the delta applied for the tail call should
    // satisfy the same constraint.
    assert(isAligned(ST.getStackAlignment(), FPDiff) &&
           "unaligned stack on tail call");
  }

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(Info.CallConv, Info.IsVarArg, MF, ArgLocs, F.getContext());

  // We could pass MIB and directly add the implicit uses to the call
  // now. However, as an aesthetic choice, place implicit argument operands
  // after the ordinary user argument registers.
  SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;

  if (Info.CallConv != CallingConv::AMDGPU_Gfx &&
      Info.CallConv != CallingConv::AMDGPU_Gfx_WholeWave &&
      !AMDGPU::isChainCC(Info.CallConv)) {
    // With a fixed ABI, allocate fixed registers before user arguments.
    if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
      return false;
  }

  // Mark the scratch resource descriptor as allocated so the CC analysis
  // does not assign user arguments to these registers, matching the callee.
  if (!ST.hasFlatScratchEnabled())
    CCInfo.AllocateReg(FuncInfo->getScratchRSrcReg());

  OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg);

  if (!determineAssignments(Assigner, OutArgs, CCInfo))
    return false;

  // Do the actual argument marshalling.
  AMDGPUOutgoingArgHandler Handler(MIRBuilder, MRI, MIB, true, FPDiff);
  if (!handleAssignments(Handler, OutArgs, CCInfo, ArgLocs, MIRBuilder))
    return false;

  if (Info.ConvergenceCtrlToken) {
    MIB.addUse(Info.ConvergenceCtrlToken, RegState::Implicit);
  }
  handleImplicitCallArguments(MIRBuilder, MIB, ST, *FuncInfo, CalleeCC,
                              ImplicitArgRegs);

  // If we have -tailcallopt, we need to adjust the stack. We'll do the call
  // sequence start and end here.
  if (!IsSibCall) {
    MIB->getOperand(CalleeIdx + 1).setImm(FPDiff);
    CallSeqStart.addImm(NumBytes).addImm(0);
    // End the call sequence *before* emitting the call. Normally, we would
    // tidy the frame up after the call. However, here, we've laid out the
    // parameters so that when SP is reset, they will be in the correct
    // location.
    MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKDOWN).addImm(NumBytes).addImm(0);
  }

  // Now we can add the actual call instruction to the correct basic block.
  MIRBuilder.insertInstr(MIB);

  // If this is a whole wave tail call, we need to constrain the register for
  // the original EXEC.
  if (MIB->getOpcode() == AMDGPU::SI_TCRETURN_GFX_WholeWave) {
    MIB->getOperand(0).setReg(
        constrainOperandRegClass(MF, *TRI, MRI, *TII, *ST.getRegBankInfo(),
                                 *MIB, MIB->getDesc(), MIB->getOperand(0), 0));
  }

  // If Callee is a reg, since it is used by a target specific
  // instruction, it must have a register class matching the
  // constraint of that instruction.

  // FIXME: We should define regbankselectable call instructions to handle
  // divergent call targets.
  if (MIB->getOperand(CalleeIdx).isReg()) {
    MIB->getOperand(CalleeIdx).setReg(constrainOperandRegClass(
        MF, *TRI, MRI, *TII, *ST.getRegBankInfo(), *MIB, MIB->getDesc(),
        MIB->getOperand(CalleeIdx), CalleeIdx));
  }

  MF.getFrameInfo().setHasTailCall();
  Info.LoweredTailCall = true;
  return true;
}

/// Lower a call to the @llvm.amdgcn.cs.chain intrinsic.
bool AMDGPUCallLowering::lowerChainCall(MachineIRBuilder &MIRBuilder,
                                        CallLoweringInfo &Info) const {
  ArgInfo Callee = Info.OrigArgs[0];
  ArgInfo SGPRArgs = Info.OrigArgs[2];
  ArgInfo VGPRArgs = Info.OrigArgs[3];

  MachineFunction &MF = MIRBuilder.getMF();
  const Function &F = MF.getFunction();
  const DataLayout &DL = F.getDataLayout();

  // The function to jump to is actually the first argument, so we'll change the
  // Callee and other info to match that before using our existing helper.
  const Value *CalleeV = Callee.OrigValue->stripPointerCasts();
  if (const Function *F = dyn_cast<Function>(CalleeV)) {
    Info.Callee = MachineOperand::CreateGA(F, 0);
    Info.CallConv = F->getCallingConv();
  } else {
    assert(Callee.Regs.size() == 1 && "Too many regs for the callee");
    Info.Callee = MachineOperand::CreateReg(Callee.Regs[0], false);
    Info.CallConv = CallingConv::AMDGPU_CS_Chain; // amdgpu_cs_chain_preserve
                                                  // behaves the same here.
  }

  // The function that we're calling cannot be vararg (only the intrinsic is).
  Info.IsVarArg = false;

  assert(
      all_of(SGPRArgs.Flags, [](ISD::ArgFlagsTy F) { return F.isInReg(); }) &&
      "SGPR arguments should be marked inreg");
  assert(
      none_of(VGPRArgs.Flags, [](ISD::ArgFlagsTy F) { return F.isInReg(); }) &&
      "VGPR arguments should not be marked inreg");

  SmallVector<ArgInfo, 8> OutArgs;
  splitToValueTypes(SGPRArgs, OutArgs, DL, Info.CallConv);
  splitToValueTypes(VGPRArgs, OutArgs, DL, Info.CallConv);

  Info.IsMustTailCall = true;
  return lowerTailCall(MIRBuilder, Info, OutArgs);
}
1547
1549 CallLoweringInfo &Info) const {
1550 if (Function *F = Info.CB->getCalledFunction())
1551 if (F->isIntrinsic()) {
1552 switch (F->getIntrinsicID()) {
1553 case Intrinsic::amdgcn_cs_chain:
1554 return lowerChainCall(MIRBuilder, Info);
1555 case Intrinsic::amdgcn_call_whole_wave:
1556 Info.CallConv = CallingConv::AMDGPU_Gfx_WholeWave;
1557
1558 // Get the callee from the original instruction, so it doesn't look like
1559 // this is an indirect call.
1560 Info.Callee = MachineOperand::CreateGA(
1561 cast<GlobalValue>(Info.CB->getOperand(0)), /*Offset=*/0);
1562 Info.OrigArgs.erase(Info.OrigArgs.begin());
1563 Info.IsVarArg = false;
1564 break;
1565 default:
1566 llvm_unreachable("Unexpected intrinsic call");
1567 }
1568 }
1569
1570 if (Info.IsVarArg) {
1571 LLVM_DEBUG(dbgs() << "Variadic functions not implemented\n");
1572 return false;
1573 }
1574
1575 MachineFunction &MF = MIRBuilder.getMF();
1576 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1577 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1578
1579 const Function &F = MF.getFunction();
1580 MachineRegisterInfo &MRI = MF.getRegInfo();
1582 const DataLayout &DL = F.getDataLayout();
1583
1585 for (auto &OrigArg : Info.OrigArgs)
1586 splitToValueTypes(OrigArg, OutArgs, DL, Info.CallConv);
1587
1589 if (Info.CanLowerReturn && !Info.OrigRet.Ty->isVoidTy())
1590 splitToValueTypes(Info.OrigRet, InArgs, DL, Info.CallConv);
1591
1592 // If we can lower as a tail call, do that instead.
1593 bool CanTailCallOpt =
1594 isEligibleForTailCallOptimization(MIRBuilder, Info, InArgs, OutArgs);
1595
1596 // We must emit a tail call if we have musttail.
1597 if (Info.IsMustTailCall && !CanTailCallOpt) {
1598 LLVM_DEBUG(dbgs() << "Failed to lower musttail call as tail call\n");
1599 return false;
1600 }
1601
1602 Info.IsTailCall = CanTailCallOpt;
1603 if (CanTailCallOpt)
1604 return lowerTailCall(MIRBuilder, Info, OutArgs);
1605
1606 // Find out which ABI gets to decide where things go.
1607 CCAssignFn *AssignFnFixed;
1608 CCAssignFn *AssignFnVarArg;
1609 std::tie(AssignFnFixed, AssignFnVarArg) =
1610 getAssignFnsForCC(Info.CallConv, TLI);
1611
1612 MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKUP)
1613 .addImm(0)
1614 .addImm(0);
1615
1616 // Create a temporarily-floating call instruction so we can add the implicit
1617 // uses of arg registers.
1618 unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false, ST.isWave32(),
1619 Info.CallConv);
1620
1621 auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
1622 MIB.addDef(TRI->getReturnAddressReg(MF));
1623
1624 if (!Info.IsConvergent)
1626
1627 if (!addCallTargetOperands(MIB, MIRBuilder, Info))
1628 return false;
1629
1630 // Tell the call which registers are clobbered.
1631 const uint32_t *Mask = TRI->getCallPreservedMask(MF, Info.CallConv);
1632 MIB.addRegMask(Mask);
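// Annotation: any register absent from the preserved mask is treated as
// clobbered by the call; the mask is chosen per calling convention by
// getCallPreservedMask.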
1633
1634 SmallVector<CCValAssign, 16> ArgLocs;
1635 CCState CCInfo(Info.CallConv, Info.IsVarArg, MF, ArgLocs, F.getContext());
1636
1637 // We could pass MIB and directly add the implicit uses to the call
1638 // now. However, as an aesthetic choice, place implicit argument operands
1639 // after the ordinary user argument registers.
1640 SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;
1641
1642 if (Info.CallConv != CallingConv::AMDGPU_Gfx &&
1643 Info.CallConv != CallingConv::AMDGPU_Gfx_WholeWave) {
1644 // With a fixed ABI, allocate fixed registers before user arguments.
1645 if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
1646 return false;
1647 }
1648
1649 // Mark the scratch resource descriptor as allocated so the CC analysis
1650 // does not assign user arguments to these registers, matching the callee.
1651 if (!ST.hasFlatScratchEnabled()) {
1652 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1653 CCInfo.AllocateReg(FuncInfo->getScratchRSrcReg());
1654 }
1655
1656 // Do the actual argument marshalling.
1657 OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg);
1658 if (!determineAssignments(Assigner, OutArgs, CCInfo))
1659 return false;
1660
1661 AMDGPUOutgoingArgHandler Handler(MIRBuilder, MRI, MIB, false);
1662 if (!handleAssignments(Handler, OutArgs, CCInfo, ArgLocs, MIRBuilder))
1663 return false;
1664
1665 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1666
1667 if (Info.ConvergenceCtrlToken) {
1668 MIB.addUse(Info.ConvergenceCtrlToken, RegState::Implicit);
1669 }
1670 handleImplicitCallArguments(MIRBuilder, MIB, ST, *MFI, Info.CallConv,
1671 ImplicitArgRegs);
1672
1673 // Get a count of how many bytes are to be pushed on the stack.
1674 unsigned NumBytes = CCInfo.getStackSize();
1675
1676 // If Callee is a reg, since it is used by a target specific
1677 // instruction, it must have a register class matching the
1678 // constraint of that instruction.
1679
1680 // FIXME: We should define regbankselectable call instructions to handle
1681 // divergent call targets.
1682 if (MIB->getOperand(1).isReg()) {
1683 MIB->getOperand(1).setReg(constrainOperandRegClass(
1684 MF, *TRI, MRI, *ST.getInstrInfo(),
1685 *ST.getRegBankInfo(), *MIB, MIB->getDesc(), MIB->getOperand(1),
1686 1));
1687 }
1688
1689 // Now we can add the actual call instruction to the correct position.
1690 MIRBuilder.insertInstr(MIB);
1691
1692 // Finally we can copy the returned value back into its virtual-register. In
1693 // symmetry with the arguments, the physical register must be an
1694 // implicit-define of the call instruction.
1695 if (Info.CanLowerReturn && !Info.OrigRet.Ty->isVoidTy()) {
1696 CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(Info.CallConv,
1697 Info.IsVarArg);
1698 IncomingValueAssigner Assigner(RetAssignFn);
1699 CallReturnHandler Handler(MIRBuilder, MRI, MIB);
1700 if (!determineAndHandleAssignments(Handler, Assigner, InArgs, MIRBuilder,
1701 Info.CallConv, Info.IsVarArg))
1702 return false;
1703 }
1704
1705 uint64_t CalleePopBytes = NumBytes;
1706
1707 MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKDOWN)
1708 .addImm(0)
1709 .addImm(CalleePopBytes);
1710
1711 if (!Info.CanLowerReturn) {
1712 insertSRetLoads(MIRBuilder, Info.OrigRet.Ty, Info.OrigRet.Regs,
1713 Info.DemoteRegister, Info.DemoteStackIndex);
1714 }
1715
1716 return true;
1717}
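// For orientation, a simple non-tail call typically lowers to a shape like
// the following (schematic MIR only; the opcode, register assignments, and
// regmask all vary with subtarget and calling convention):
//
//   ADJCALLSTACKUP 0, 0
//   $vgpr0 = COPY %arg0
//   $sgpr30_sgpr31 = SI_CALL @callee, <regmask>, implicit $vgpr0,
//       implicit-def $vgpr0
//   %ret0 = COPY $vgpr0
//   ADJCALLSTACKDOWN 0, 0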
1718
1719void AMDGPUCallLowering::addOriginalExecToReturn(
1720 MachineFunction &MF, MachineInstrBuilder &Ret) const {
1721 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1722 const SIInstrInfo *TII = ST.getInstrInfo();
1723 const MachineInstr *Setup = TII->getWholeWaveFunctionSetup(MF);
1724 Ret.addReg(Setup->getOperand(0).getReg());
1725}