LLVM 23.0.0git
NVPTXTargetTransformInfo.cpp
Go to the documentation of this file.
1//===-- NVPTXTargetTransformInfo.cpp - NVPTX specific TTI -----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
10#include "NVVMProperties.h"
11#include "llvm/ADT/STLExtras.h"
17#include "llvm/IR/Constants.h"
19#include "llvm/IR/Intrinsics.h"
20#include "llvm/IR/IntrinsicsNVPTX.h"
21#include "llvm/IR/Value.h"
26#include <optional>
27using namespace llvm;
28
29#define DEBUG_TYPE "NVPTXtti"
30
31// Whether the given intrinsic reads threadIdx.x/y/z.
32static bool readsThreadIndex(const IntrinsicInst *II) {
33 switch (II->getIntrinsicID()) {
34 default: return false;
35 case Intrinsic::nvvm_read_ptx_sreg_tid_x:
36 case Intrinsic::nvvm_read_ptx_sreg_tid_y:
37 case Intrinsic::nvvm_read_ptx_sreg_tid_z:
38 return true;
39 }
40}
41
42static bool readsLaneId(const IntrinsicInst *II) {
43 return II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_laneid;
44}
45
46bool NVPTXTTIImpl::isSourceOfDivergence(const Value *V) const {
47 // Without inter-procedural analysis, we conservatively assume that arguments
48 // to __device__ functions are divergent.
49 if (const Argument *Arg = dyn_cast<Argument>(V))
50 return !isKernelFunction(*Arg->getParent());
51
52 if (const Instruction *I = dyn_cast<Instruction>(V)) {
53 // Without pointer analysis, we conservatively assume values loaded from
54 // generic or local address space are divergent.
55 if (const LoadInst *LI = dyn_cast<LoadInst>(I)) {
56 unsigned AS = LI->getPointerAddressSpace();
57 return AS == ADDRESS_SPACE_GENERIC || AS == ADDRESS_SPACE_LOCAL;
58 }
59 // Atomic instructions may cause divergence. Atomic instructions are
60 // executed sequentially across all threads in a warp. Therefore, an earlier
61 // executed thread may see different memory inputs than a later executed
62 // thread. For example, suppose *a = 0 initially.
63 //
64 // atom.global.add.s32 d, [a], 1
65 //
66 // returns 0 for the first thread that enters the critical region, and 1 for
67 // the second thread.
68 if (I->isAtomic())
69 return true;
70 if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
71 // Instructions that read threadIdx are obviously divergent.
73 return true;
74 }
75 // Conservatively consider the return value of function calls as divergent.
76 // We could analyze callees with bodies more precisely using
77 // inter-procedural analysis.
78 if (isa<CallInst>(I))
79 return true;
80 }
81
82 return false;
83}
84
85// Convert NVVM intrinsics to target-generic LLVM code where possible.
88 // Each NVVM intrinsic we can simplify can be replaced with one of:
89 //
90 // * an LLVM intrinsic,
91 // * an LLVM cast operation,
92 // * an LLVM binary operation, or
93 // * ad-hoc LLVM IR for the particular operation.
94
95 // Some transformations are only valid when the module's
96 // flush-denormals-to-zero (ftz) setting is true/false, whereas other
97 // transformations are valid regardless of the module's ftz setting.
98 enum FtzRequirementTy {
99 FTZ_Any, // Any ftz setting is ok.
100 FTZ_MustBeOn, // Transformation is valid only if ftz is on.
101 FTZ_MustBeOff, // Transformation is valid only if ftz is off.
102 };
103 // Classes of NVVM intrinsics that can't be replaced one-to-one with a
104 // target-generic intrinsic, cast op, or binary op but that we can nonetheless
105 // simplify.
106 enum SpecialCase {
107 SPC_Reciprocal,
108 SCP_FunnelShiftClamp,
109 };
110
111 // SimplifyAction is a poor-man's variant (plus an additional flag) that
112 // represents how to replace an NVVM intrinsic with target-generic LLVM IR.
113 struct SimplifyAction {
114 // Invariant: At most one of these Optionals has a value.
115 std::optional<Intrinsic::ID> IID;
116 std::optional<Instruction::CastOps> CastOp;
117 std::optional<Instruction::BinaryOps> BinaryOp;
118 std::optional<SpecialCase> Special;
119
120 FtzRequirementTy FtzRequirement = FTZ_Any;
121 // Denormal handling is guarded by different attributes depending on the
122 // type (denormal-fp-math vs denormal-fp-math-f32), take note of halfs.
123 bool IsHalfTy = false;
124
125 SimplifyAction() = default;
126
127 SimplifyAction(Intrinsic::ID IID, FtzRequirementTy FtzReq,
128 bool IsHalfTy = false)
129 : IID(IID), FtzRequirement(FtzReq), IsHalfTy(IsHalfTy) {}
130
131 // Cast operations don't have anything to do with FTZ, so we skip that
132 // argument.
133 SimplifyAction(Instruction::CastOps CastOp) : CastOp(CastOp) {}
134
135 SimplifyAction(Instruction::BinaryOps BinaryOp, FtzRequirementTy FtzReq)
136 : BinaryOp(BinaryOp), FtzRequirement(FtzReq) {}
137
138 SimplifyAction(SpecialCase Special, FtzRequirementTy FtzReq)
139 : Special(Special), FtzRequirement(FtzReq) {}
140 };
141
142 // Try to generate a SimplifyAction describing how to replace our
143 // IntrinsicInstr with target-generic LLVM IR.
144 const SimplifyAction Action = [II]() -> SimplifyAction {
145 switch (II->getIntrinsicID()) {
146 // NVVM intrinsics that map directly to LLVM intrinsics.
147 case Intrinsic::nvvm_ceil_d:
148 return {Intrinsic::ceil, FTZ_Any};
149 case Intrinsic::nvvm_ceil_f:
150 return {Intrinsic::ceil, FTZ_MustBeOff};
151 case Intrinsic::nvvm_ceil_ftz_f:
152 return {Intrinsic::ceil, FTZ_MustBeOn};
153 case Intrinsic::nvvm_floor_d:
154 return {Intrinsic::floor, FTZ_Any};
155 case Intrinsic::nvvm_floor_f:
156 return {Intrinsic::floor, FTZ_MustBeOff};
157 case Intrinsic::nvvm_floor_ftz_f:
158 return {Intrinsic::floor, FTZ_MustBeOn};
159 case Intrinsic::nvvm_fma_rn_d:
160 return {Intrinsic::fma, FTZ_Any};
161 case Intrinsic::nvvm_fma_rn_f:
162 return {Intrinsic::fma, FTZ_MustBeOff};
163 case Intrinsic::nvvm_fma_rn_ftz_f:
164 return {Intrinsic::fma, FTZ_MustBeOn};
165 case Intrinsic::nvvm_fma_rn_f16:
166 return {Intrinsic::fma, FTZ_MustBeOff, true};
167 case Intrinsic::nvvm_fma_rn_ftz_f16:
168 return {Intrinsic::fma, FTZ_MustBeOn, true};
169 case Intrinsic::nvvm_fma_rn_f16x2:
170 return {Intrinsic::fma, FTZ_MustBeOff, true};
171 case Intrinsic::nvvm_fma_rn_ftz_f16x2:
172 return {Intrinsic::fma, FTZ_MustBeOn, true};
173 case Intrinsic::nvvm_fma_rn_bf16:
174 return {Intrinsic::fma, FTZ_MustBeOff, true};
175 case Intrinsic::nvvm_fma_rn_bf16x2:
176 return {Intrinsic::fma, FTZ_MustBeOff, true};
177 case Intrinsic::nvvm_fmax_d:
178 return {Intrinsic::maximumnum, FTZ_Any};
179 case Intrinsic::nvvm_fmax_f:
180 return {Intrinsic::maximumnum, FTZ_MustBeOff};
181 case Intrinsic::nvvm_fmax_ftz_f:
182 return {Intrinsic::maximumnum, FTZ_MustBeOn};
183 case Intrinsic::nvvm_fmax_nan_f:
184 return {Intrinsic::maximum, FTZ_MustBeOff};
185 case Intrinsic::nvvm_fmax_ftz_nan_f:
186 return {Intrinsic::maximum, FTZ_MustBeOn};
187 case Intrinsic::nvvm_fmax_f16:
188 return {Intrinsic::maximumnum, FTZ_MustBeOff, true};
189 case Intrinsic::nvvm_fmax_ftz_f16:
190 return {Intrinsic::maximumnum, FTZ_MustBeOn, true};
191 case Intrinsic::nvvm_fmax_f16x2:
192 return {Intrinsic::maximumnum, FTZ_MustBeOff, true};
193 case Intrinsic::nvvm_fmax_ftz_f16x2:
194 return {Intrinsic::maximumnum, FTZ_MustBeOn, true};
195 case Intrinsic::nvvm_fmax_nan_f16:
196 return {Intrinsic::maximum, FTZ_MustBeOff, true};
197 case Intrinsic::nvvm_fmax_ftz_nan_f16:
198 return {Intrinsic::maximum, FTZ_MustBeOn, true};
199 case Intrinsic::nvvm_fmax_nan_f16x2:
200 return {Intrinsic::maximum, FTZ_MustBeOff, true};
201 case Intrinsic::nvvm_fmax_ftz_nan_f16x2:
202 return {Intrinsic::maximum, FTZ_MustBeOn, true};
203 case Intrinsic::nvvm_fmin_d:
204 return {Intrinsic::minimumnum, FTZ_Any};
205 case Intrinsic::nvvm_fmin_f:
206 return {Intrinsic::minimumnum, FTZ_MustBeOff};
207 case Intrinsic::nvvm_fmin_ftz_f:
208 return {Intrinsic::minimumnum, FTZ_MustBeOn};
209 case Intrinsic::nvvm_fmin_nan_f:
210 return {Intrinsic::minimum, FTZ_MustBeOff};
211 case Intrinsic::nvvm_fmin_ftz_nan_f:
212 return {Intrinsic::minimum, FTZ_MustBeOn};
213 case Intrinsic::nvvm_fmin_f16:
214 return {Intrinsic::minimumnum, FTZ_MustBeOff, true};
215 case Intrinsic::nvvm_fmin_ftz_f16:
216 return {Intrinsic::minimumnum, FTZ_MustBeOn, true};
217 case Intrinsic::nvvm_fmin_f16x2:
218 return {Intrinsic::minimumnum, FTZ_MustBeOff, true};
219 case Intrinsic::nvvm_fmin_ftz_f16x2:
220 return {Intrinsic::minimumnum, FTZ_MustBeOn, true};
221 case Intrinsic::nvvm_fmin_nan_f16:
222 return {Intrinsic::minimum, FTZ_MustBeOff, true};
223 case Intrinsic::nvvm_fmin_ftz_nan_f16:
224 return {Intrinsic::minimum, FTZ_MustBeOn, true};
225 case Intrinsic::nvvm_fmin_nan_f16x2:
226 return {Intrinsic::minimum, FTZ_MustBeOff, true};
227 case Intrinsic::nvvm_fmin_ftz_nan_f16x2:
228 return {Intrinsic::minimum, FTZ_MustBeOn, true};
229 case Intrinsic::nvvm_sqrt_rn_d:
230 return {Intrinsic::sqrt, FTZ_Any};
231 case Intrinsic::nvvm_sqrt_f:
232 // nvvm_sqrt_f is a special case. For most intrinsics, foo_ftz_f is the
233 // ftz version, and foo_f is the non-ftz version. But nvvm_sqrt_f adopts
234 // the ftz-ness of the surrounding code. sqrt_rn_f and sqrt_rn_ftz_f are
235 // the versions with explicit ftz-ness.
236 return {Intrinsic::sqrt, FTZ_Any};
237 case Intrinsic::nvvm_trunc_d:
238 return {Intrinsic::trunc, FTZ_Any};
239 case Intrinsic::nvvm_trunc_f:
240 return {Intrinsic::trunc, FTZ_MustBeOff};
241 case Intrinsic::nvvm_trunc_ftz_f:
242 return {Intrinsic::trunc, FTZ_MustBeOn};
243
244 // NVVM intrinsics that map to LLVM cast operations.
245 // Note - we cannot map intrinsics like nvvm_d2ll_rz to LLVM's
246 // FPToSI, as NaN to int conversion with FPToSI is considered UB and is
247 // eliminated. NVVM conversion intrinsics are translated to PTX cvt
248 // instructions which define the outcome for NaN rather than leaving as UB.
249 // Therefore, translate NVVM intrinsics to sitofp/uitofp, but not to
250 // fptosi/fptoui.
251 case Intrinsic::nvvm_i2d_rn:
252 case Intrinsic::nvvm_i2f_rn:
253 case Intrinsic::nvvm_ll2d_rn:
254 case Intrinsic::nvvm_ll2f_rn:
255 return {Instruction::SIToFP};
256 case Intrinsic::nvvm_ui2d_rn:
257 case Intrinsic::nvvm_ui2f_rn:
258 case Intrinsic::nvvm_ull2d_rn:
259 case Intrinsic::nvvm_ull2f_rn:
260 return {Instruction::UIToFP};
261
262 // NVVM intrinsics that map to LLVM binary ops.
263 case Intrinsic::nvvm_div_rn_d:
264 return {Instruction::FDiv, FTZ_Any};
265
266 // The remainder of cases are NVVM intrinsics that map to LLVM idioms, but
267 // need special handling.
268 //
269 // We seem to be missing intrinsics for rcp.approx.{ftz.}f32, which is just
270 // as well.
271 case Intrinsic::nvvm_rcp_rn_d:
272 return {SPC_Reciprocal, FTZ_Any};
273
274 case Intrinsic::nvvm_fshl_clamp:
275 case Intrinsic::nvvm_fshr_clamp:
276 return {SCP_FunnelShiftClamp, FTZ_Any};
277
278 // We do not currently simplify intrinsics that give an approximate
279 // answer. These include:
280 //
281 // - nvvm_cos_approx_{f,ftz_f}
282 // - nvvm_ex2_approx(_ftz)
283 // - nvvm_lg2_approx_{d,f,ftz_f}
284 // - nvvm_sin_approx_{f,ftz_f}
285 // - nvvm_sqrt_approx_{f,ftz_f}
286 // - nvvm_rsqrt_approx_{d,f,ftz_f}
287 // - nvvm_div_approx_{ftz_d,ftz_f,f}
288 // - nvvm_rcp_approx_ftz_d
289 //
290 // Ideally we'd encode them as e.g. "fast call @llvm.cos", where "fast"
291 // means that fastmath is enabled in the intrinsic. Unfortunately only
292 // binary operators (currently) have a fastmath bit in SelectionDAG, so
293 // this information gets lost and we can't select on it.
294 //
295 // TODO: div and rcp are lowered to a binary op, so these we could in
296 // theory lower them to "fast fdiv".
297
298 default:
299 return {};
300 }
301 }();
302
303 // If Action.FtzRequirementTy is not satisfied by the module's ftz state, we
304 // can bail out now. (Notice that in the case that IID is not an NVVM
305 // intrinsic, we don't have to look up any module metadata, as
306 // FtzRequirementTy will be FTZ_Any.)
307 if (Action.FtzRequirement != FTZ_Any) {
308 // FIXME: Broken for f64
309 DenormalMode Mode = II->getFunction()->getDenormalMode(
310 Action.IsHalfTy ? APFloat::IEEEhalf() : APFloat::IEEEsingle());
311 bool FtzEnabled = Mode.Output == DenormalMode::PreserveSign;
312
313 if (FtzEnabled != (Action.FtzRequirement == FTZ_MustBeOn))
314 return nullptr;
315 }
316
317 // Simplify to target-generic intrinsic.
318 if (Action.IID) {
319 SmallVector<Value *, 4> Args(II->args());
320 // All the target-generic intrinsics currently of interest to us have one
321 // type argument, equal to that of the nvvm intrinsic's argument.
322 Type *Tys[] = {II->getArgOperand(0)->getType()};
323 return CallInst::Create(
324 Intrinsic::getOrInsertDeclaration(II->getModule(), *Action.IID, Tys),
325 Args);
326 }
327
328 // Simplify to target-generic binary op.
329 if (Action.BinaryOp)
330 return BinaryOperator::Create(*Action.BinaryOp, II->getArgOperand(0),
331 II->getArgOperand(1), II->getName());
332
333 // Simplify to target-generic cast op.
334 if (Action.CastOp)
335 return CastInst::Create(*Action.CastOp, II->getArgOperand(0), II->getType(),
336 II->getName());
337
338 // All that's left are the special cases.
339 if (!Action.Special)
340 return nullptr;
341
342 switch (*Action.Special) {
343 case SPC_Reciprocal:
344 // Simplify reciprocal.
346 Instruction::FDiv, ConstantFP::get(II->getArgOperand(0)->getType(), 1),
347 II->getArgOperand(0), II->getName());
348
349 case SCP_FunnelShiftClamp: {
350 // Canonicalize a clamping funnel shift to the generic llvm funnel shift
351 // when possible, as this is easier for llvm to optimize further.
352 if (const auto *ShiftConst = dyn_cast<ConstantInt>(II->getArgOperand(2))) {
353 const bool IsLeft = II->getIntrinsicID() == Intrinsic::nvvm_fshl_clamp;
354 if (ShiftConst->getZExtValue() >= II->getType()->getIntegerBitWidth())
355 return IC.replaceInstUsesWith(*II, II->getArgOperand(IsLeft ? 1 : 0));
356
357 const unsigned FshIID = IsLeft ? Intrinsic::fshl : Intrinsic::fshr;
359 II->getModule(), FshIID, II->getType()),
360 SmallVector<Value *, 3>(II->args()));
361 }
362 return nullptr;
363 }
364 }
365 llvm_unreachable("All SpecialCase enumerators should be handled in switch.");
366}
367
368// Returns true/false when we know the answer, nullopt otherwise.
369static std::optional<bool> evaluateIsSpace(Intrinsic::ID IID, unsigned AS) {
372 return std::nullopt; // Got to check at run-time.
373 switch (IID) {
374 case Intrinsic::nvvm_isspacep_global:
376 case Intrinsic::nvvm_isspacep_local:
377 return AS == NVPTXAS::ADDRESS_SPACE_LOCAL;
378 case Intrinsic::nvvm_isspacep_shared:
379 // If shared cluster this can't be evaluated at compile time.
381 return std::nullopt;
383 case Intrinsic::nvvm_isspacep_shared_cluster:
386 case Intrinsic::nvvm_isspacep_const:
387 return AS == NVPTXAS::ADDRESS_SPACE_CONST;
388 default:
389 llvm_unreachable("Unexpected intrinsic");
390 }
391}
392
393// Returns an instruction pointer (may be nullptr if we do not know the answer).
394// Returns nullopt if `II` is not one of the `isspacep` intrinsics.
395//
396// TODO: If InferAddressSpaces were run early enough in the pipeline this could
397// be removed in favor of the constant folding that occurs there through
398// rewriteIntrinsicWithAddressSpace
399static std::optional<Instruction *>
401
402 switch (auto IID = II.getIntrinsicID()) {
403 case Intrinsic::nvvm_isspacep_global:
404 case Intrinsic::nvvm_isspacep_local:
405 case Intrinsic::nvvm_isspacep_shared:
406 case Intrinsic::nvvm_isspacep_shared_cluster:
407 case Intrinsic::nvvm_isspacep_const: {
408 Value *Op0 = II.getArgOperand(0);
409 unsigned AS = Op0->getType()->getPointerAddressSpace();
410 // Peek through ASC to generic AS.
411 // TODO: we could dig deeper through both ASCs and GEPs.
413 if (auto *ASCO = dyn_cast<AddrSpaceCastOperator>(Op0))
414 AS = ASCO->getOperand(0)->getType()->getPointerAddressSpace();
415
416 if (std::optional<bool> Answer = evaluateIsSpace(IID, AS))
417 return IC.replaceInstUsesWith(II,
418 ConstantInt::get(II.getType(), *Answer));
419 return nullptr; // Don't know the answer, got to check at run time.
420 }
421 default:
422 return std::nullopt;
423 }
424}
425
426std::optional<Instruction *>
428 if (std::optional<Instruction *> I = handleSpaceCheckIntrinsics(IC, II))
429 return *I;
431 return I;
432
433 return std::nullopt;
434}
435
440 if (const auto *CI = dyn_cast<CallInst>(U))
441 if (const auto *IA = dyn_cast<InlineAsm>(CI->getCalledOperand())) {
442 // Without this implementation getCallCost() would return the number
443 // of arguments+1 as the cost. Because the cost-model assumes it is a call
444 // since it is classified as a call in the IR. A better cost model would
445 // be to return the number of asm instructions embedded in the asm
446 // string.
447 StringRef AsmStr = IA->getAsmString();
448 const unsigned InstCount =
449 count_if(split(AsmStr, ';'), [](StringRef AsmInst) {
450 // Trim off scopes denoted by '{' and '}' as these can be ignored
451 AsmInst = AsmInst.trim().ltrim("{} \t\n\v\f\r");
452 // This is pretty coarse but does a reasonably good job of
453 // identifying things that look like instructions, possibly with a
454 // predicate ("@").
455 return !AsmInst.empty() &&
456 (AsmInst[0] == '@' || isAlpha(AsmInst[0]) ||
457 AsmInst.contains(".pragma"));
458 });
459 return InstCount * TargetTransformInfo::TCC_Basic;
460 }
461
462 return BaseT::getInstructionCost(U, Operands, CostKind);
463}
464
466 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
468 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
469 // Legalize the type.
470 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
471
472 int ISD = TLI->InstructionOpcodeToISD(Opcode);
473
474 switch (ISD) {
475 default:
476 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
477 Op2Info);
478 case ISD::ADD:
479 case ISD::MUL:
480 case ISD::XOR:
481 case ISD::OR:
482 case ISD::AND:
483 // The machine code (SASS) simulates an i64 with two i32. Therefore, we
484 // estimate that arithmetic operations on i64 are twice as expensive as
485 // those on types that can fit into one machine register.
486 if (LT.second.SimpleTy == MVT::i64)
487 return 2 * LT.first;
488 // Delegate other cases to the basic TTI.
489 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
490 Op2Info);
491 }
492}
493
496 OptimizationRemarkEmitter *ORE) const {
497 BaseT::getUnrollingPreferences(L, SE, UP, ORE);
498
499 // Enable partial unrolling and runtime unrolling, but reduce the
500 // threshold. This partially unrolls small loops which are often
501 // unrolled by the PTX to SASS compiler and unrolling earlier can be
502 // beneficial.
503 UP.Partial = UP.Runtime = true;
504 UP.PartialThreshold = UP.Threshold / 4;
505}
506
511
513 Intrinsic::ID IID) const {
514 switch (IID) {
515 case Intrinsic::nvvm_isspacep_const:
516 case Intrinsic::nvvm_isspacep_global:
517 case Intrinsic::nvvm_isspacep_local:
518 case Intrinsic::nvvm_isspacep_shared:
519 case Intrinsic::nvvm_isspacep_shared_cluster:
520 case Intrinsic::nvvm_prefetch_tensormap: {
521 OpIndexes.push_back(0);
522 return true;
523 }
524 }
525 return false;
526}
527
529 Value *OldV,
530 Value *NewV) const {
531 const Intrinsic::ID IID = II->getIntrinsicID();
532 switch (IID) {
533 case Intrinsic::nvvm_isspacep_const:
534 case Intrinsic::nvvm_isspacep_global:
535 case Intrinsic::nvvm_isspacep_local:
536 case Intrinsic::nvvm_isspacep_shared:
537 case Intrinsic::nvvm_isspacep_shared_cluster: {
538 const unsigned NewAS = NewV->getType()->getPointerAddressSpace();
539 if (const auto R = evaluateIsSpace(IID, NewAS))
540 return ConstantInt::get(II->getType(), *R);
541 return nullptr;
542 }
543 case Intrinsic::nvvm_prefetch_tensormap: {
544 IRBuilder<> Builder(II);
545 const unsigned NewAS = NewV->getType()->getPointerAddressSpace();
546 if (NewAS == NVPTXAS::ADDRESS_SPACE_CONST ||
548 return Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_prefetch_tensormap,
549 NewV);
550 return nullptr;
551 }
552 }
553 return nullptr;
554}
555
557 unsigned AddrSpace,
558 TTI::MaskKind MaskKind) const {
559 if (MaskKind != TTI::MaskKind::ConstantMask)
560 return false;
561
562 // We currently only support this feature for 256-bit vectors, so the
563 // alignment must be at least 32
564 if (Alignment < 32)
565 return false;
566
567 if (!ST->has256BitVectorLoadStore(AddrSpace))
568 return false;
569
570 auto *VTy = dyn_cast<FixedVectorType>(DataTy);
571 if (!VTy)
572 return false;
573
574 auto *ElemTy = VTy->getScalarType();
575 return (ElemTy->getScalarSizeInBits() == 32 && VTy->getNumElements() == 8) ||
576 (ElemTy->getScalarSizeInBits() == 64 && VTy->getNumElements() == 4);
577}
578
580 unsigned /*AddrSpace*/,
581 TTI::MaskKind MaskKind) const {
582 if (MaskKind != TTI::MaskKind::ConstantMask)
583 return false;
584
585 if (Alignment < DL.getTypeStoreSize(DataTy))
586 return false;
587
588 // We do not support sub-byte element type masked loads.
589 auto *VTy = dyn_cast<FixedVectorType>(DataTy);
590 if (!VTy)
591 return false;
592 return VTy->getElementType()->getScalarSizeInBits() >= 8;
593}
594
595unsigned NVPTXTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
596 // 256 bit loads/stores are currently only supported for global address space
597 if (ST->has256BitVectorLoadStore(AddrSpace))
598 return 256;
599 return 128;
600}
601
602unsigned NVPTXTTIImpl::getAssumedAddrSpace(const Value *V) const {
603 if (isa<AllocaInst>(V))
604 return ADDRESS_SPACE_LOCAL;
605
606 if (const Argument *Arg = dyn_cast<Argument>(V)) {
607 if (isKernelFunction(*Arg->getParent())) {
608 const NVPTXTargetMachine &TM =
609 static_cast<const NVPTXTargetMachine &>(getTLI()->getTargetMachine());
610 if (TM.getDrvInterface() == NVPTX::CUDA && !Arg->hasByValAttr())
612 } else {
613 // We assume that all device parameters that are passed byval will be
614 // placed in the local AS. Very simple cases will be updated after ISel to
615 // use the device param space where possible.
616 if (Arg->hasByValAttr())
617 return ADDRESS_SPACE_LOCAL;
618 }
619 }
620
621 return -1;
622}
623
625 const Function &F,
626 SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const {
627 if (const auto Val = getMaxClusterRank(F))
628 LB.push_back({"maxclusterrank", *Val});
629
630 const auto MaxNTID = getMaxNTID(F);
631 if (MaxNTID.size() > 0)
632 LB.push_back({"maxntidx", MaxNTID[0]});
633 if (MaxNTID.size() > 1)
634 LB.push_back({"maxntidy", MaxNTID[1]});
635 if (MaxNTID.size() > 2)
636 LB.push_back({"maxntidz", MaxNTID[2]});
637}
638
640 if (isSourceOfDivergence(V))
642
644}
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
This file provides the interface for the instcombine pass implementation.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
NVPTX address space definition.
static std::optional< Instruction * > handleSpaceCheckIntrinsics(InstCombiner &IC, IntrinsicInst &II)
static Instruction * convertNvvmIntrinsicToLlvm(InstCombiner &IC, IntrinsicInst *II)
static bool readsLaneId(const IntrinsicInst *II)
static std::optional< bool > evaluateIsSpace(Intrinsic::ID IID, unsigned AS)
static bool readsThreadIndex(const IntrinsicInst *II)
This file a TargetTransformInfoImplBase conforming object specific to the NVPTX target machine.
uint64_t IntrinsicInst * II
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
This file contains some templates that are useful if you are working with the STL at all.
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
static const fltSemantics & IEEEsingle()
Definition APFloat.h:296
static const fltSemantics & IEEEhalf()
Definition APFloat.h:294
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
static LLVM_ABI BinaryOperator * Create(BinaryOps Op, Value *S1, Value *S2, const Twine &Name=Twine(), InsertPosition InsertBefore=nullptr)
Construct a binary instruction, given the opcode and the two operands.
static CallInst * Create(FunctionType *Ty, Value *F, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
static LLVM_ABI CastInst * Create(Instruction::CastOps, Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Provides a way to construct any of the CastInst subclasses using an opcode instead of the subclass's ...
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2868
The core instruction combiner logic.
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
A wrapper class for inspecting calls to intrinsic functions.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
bool isLegalMaskedStore(Type *DataType, Align Alignment, unsigned AddrSpace, TTI::MaskKind MaskKind) const override
Value * rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const override
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind) const override
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const override
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
ValueUniformity getValueUniformity(const Value *V) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
bool collectFlatAddressOperands(SmallVectorImpl< int > &OpIndexes, Intrinsic::ID IID) const override
unsigned getAssumedAddrSpace(const Value *V) const override
void collectKernelLaunchBounds(const Function &F, SmallVectorImpl< std::pair< StringRef, int64_t > > &LB) const override
bool isLegalMaskedLoad(Type *DataType, Align Alignment, unsigned AddrSpace, TTI::MaskKind MaskKind) const override
NVPTX::DrvInterface getDrvInterface() const
The optimization diagnostic interface.
The main scalar evolution driver.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
constexpr bool empty() const
Check if the string is empty.
Definition StringRef.h:141
constexpr size_t size() const
Get the string size.
Definition StringRef.h:144
StringRef ltrim(char Char) const
Return string with consecutive Char characters starting from the the left removed.
Definition StringRef.h:820
bool contains(StringRef Other) const
Return true if the given string is a substring of *this, and false otherwise.
Definition StringRef.h:446
StringRef trim(char Char) const
Return string with consecutive Char characters starting from the left and right removed.
Definition StringRef.h:844
virtual InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind) const
MaskKind
Some targets only support masked load/store with a constant mask.
TargetCostKind
The kind of cost model.
@ TCC_Basic
The cost of a typical 'add' instruction.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
This is an optimization pass for GlobalISel generic memory operations.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
bool isAlpha(char C)
Checks if character C is a valid letter as classified by "C" locale.
iterator_range< SplittingIterator > split(StringRef Str, StringRef Separator)
Split the specified string over a separator and return a range-compatible iterable over its partition...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
std::optional< unsigned > getMaxClusterRank(const Function &F)
SmallVector< unsigned, 3 > getMaxNTID(const Function &F)
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:2018
bool isKernelFunction(const Function &F)
ValueUniformity
Enum describing how values behave with respect to uniformity and divergence, to answer the question: ...
Definition Uniformity.h:18
@ NeverUniform
The result value can never be assumed to be uniform.
Definition Uniformity.h:26
@ Default
The result value is uniform if and only if all operands are uniform.
Definition Uniformity.h:20
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
Parameters that control the generic loop unrolling transformation.
unsigned Threshold
The cost threshold for the unrolled loop.
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...