// Source listing captured from LLVM 23.0.0git documentation:
// AMDGPUInstCombineIntrinsic.cpp
//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements a TargetTransformInfo analysis pass specific to the
// AMDGPU target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//
16
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNSubtarget.h"
#include "llvm/ADT/FloatingPointMode.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include <optional>
25
26using namespace llvm;
27using namespace llvm::PatternMatch;
28
29#define DEBUG_TYPE "AMDGPUtti"
30
namespace {

// Row type for the TableGen-generated searchable table included below;
// `Intr` is the intrinsic ID the table is keyed on. Presumably used to look
// up image intrinsics that carry a DMask operand — confirm against the
// generated AMDGPUGenSearchableTables.inc.
struct AMDGPUImageDMaskIntrinsic {
  unsigned Intr;
};

// Pull in the generated table/lookup implementation for the struct above.
#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
#include "AMDGPUGenSearchableTables.inc"

} // end anonymous namespace
41
42// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
43//
44// A single NaN input is folded to minnum, so we rely on that folding for
45// handling NaNs.
46static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
47 const APFloat &Src2) {
48 APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);
49
50 APFloat::cmpResult Cmp0 = Max3.compare(Src0);
51 assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
52 if (Cmp0 == APFloat::cmpEqual)
53 return maxnum(Src1, Src2);
54
55 APFloat::cmpResult Cmp1 = Max3.compare(Src1);
56 assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
57 if (Cmp1 == APFloat::cmpEqual)
58 return maxnum(Src0, Src2);
59
60 return maxnum(Src0, Src1);
61}
62
63// Check if a value can be converted to a 16-bit value without losing
64// precision.
65// The value is expected to be either a float (IsFloat = true) or an unsigned
66// integer (IsFloat = false).
67static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat) {
68 Type *VTy = V.getType();
69 if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
70 // The value is already 16-bit, so we don't want to convert to 16-bit again!
71 return false;
72 }
73 if (IsFloat) {
74 if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
75 // We need to check that if we cast the index down to a half, we do not
76 // lose precision.
77 APFloat FloatValue(ConstFloat->getValueAPF());
78 bool LosesInfo = true;
80 &LosesInfo);
81 return !LosesInfo;
82 }
83 } else {
84 if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(&V)) {
85 // We need to check that if we cast the index down to an i16, we do not
86 // lose precision.
87 APInt IntValue(ConstInt->getValue());
88 return IntValue.getActiveBits() <= 16;
89 }
90 }
91
92 Value *CastSrc;
93 bool IsExt = IsFloat ? match(&V, m_FPExt(PatternMatch::m_Value(CastSrc)))
94 : match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)));
95 if (IsExt) {
96 Type *CastSrcTy = CastSrc->getType();
97 if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
98 return true;
99 }
100
101 return false;
102}
103
104// Convert a value to 16-bit.
106 Type *VTy = V.getType();
108 return cast<Instruction>(&V)->getOperand(0);
109 if (VTy->isIntegerTy())
110 return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
111 if (VTy->isFloatingPointTy())
112 return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));
113
114 llvm_unreachable("Should never be called!");
115}
116
117/// Applies Func(OldIntr.Args, OldIntr.ArgTys), creates intrinsic call with
118/// modified arguments (based on OldIntr) and replaces InstToReplace with
119/// this newly created intrinsic call.
120static std::optional<Instruction *> modifyIntrinsicCall(
121 IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr,
122 InstCombiner &IC,
123 std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)>
124 Func) {
127 return std::nullopt;
128
129 SmallVector<Value *, 8> Args(OldIntr.args());
130
131 // Modify arguments and types
132 Func(Args, ArgTys);
133
134 CallInst *NewCall = IC.Builder.CreateIntrinsic(NewIntr, ArgTys, Args);
135 NewCall->takeName(&OldIntr);
136 NewCall->copyMetadata(OldIntr);
137 if (isa<FPMathOperator>(NewCall))
138 NewCall->copyFastMathFlags(&OldIntr);
139
140 // Erase and replace uses
141 if (!InstToReplace.getType()->isVoidTy())
142 IC.replaceInstUsesWith(InstToReplace, NewCall);
143
144 bool RemoveOldIntr = &OldIntr != &InstToReplace;
145
146 auto *RetValue = IC.eraseInstFromFunction(InstToReplace);
147 if (RemoveOldIntr)
148 IC.eraseInstFromFunction(OldIntr);
149
150 return RetValue;
151}
152
153static std::optional<Instruction *>
155 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
157 // Optimize _L to _LZ when _L is zero
158 if (const auto *LZMappingInfo =
160 if (auto *ConstantLod =
161 dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->LodIndex))) {
162 if (ConstantLod->isZero() || ConstantLod->isNegative()) {
163 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
165 ImageDimIntr->Dim);
166 return modifyIntrinsicCall(
167 II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
168 Args.erase(Args.begin() + ImageDimIntr->LodIndex);
169 });
170 }
171 }
172 }
173
174 // Optimize _mip away, when 'lod' is zero
175 if (const auto *MIPMappingInfo =
177 if (auto *ConstantMip =
178 dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->MipIndex))) {
179 if (ConstantMip->isZero()) {
180 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
181 AMDGPU::getImageDimIntrinsicByBaseOpcode(MIPMappingInfo->NONMIP,
182 ImageDimIntr->Dim);
183 return modifyIntrinsicCall(
184 II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
185 Args.erase(Args.begin() + ImageDimIntr->MipIndex);
186 });
187 }
188 }
189 }
190
191 // Optimize _bias away when 'bias' is zero
192 if (const auto *BiasMappingInfo =
194 if (auto *ConstantBias =
195 dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->BiasIndex))) {
196 if (ConstantBias->isZero()) {
197 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
198 AMDGPU::getImageDimIntrinsicByBaseOpcode(BiasMappingInfo->NoBias,
199 ImageDimIntr->Dim);
200 return modifyIntrinsicCall(
201 II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
202 Args.erase(Args.begin() + ImageDimIntr->BiasIndex);
203 ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg);
204 });
205 }
206 }
207 }
208
209 // Optimize _offset away when 'offset' is zero
210 if (const auto *OffsetMappingInfo =
212 if (auto *ConstantOffset =
213 dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->OffsetIndex))) {
214 if (ConstantOffset->isZero()) {
215 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
217 OffsetMappingInfo->NoOffset, ImageDimIntr->Dim);
218 return modifyIntrinsicCall(
219 II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
220 Args.erase(Args.begin() + ImageDimIntr->OffsetIndex);
221 });
222 }
223 }
224 }
225
226 // Try to use D16
227 if (ST->hasD16Images()) {
228
229 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
231
232 if (BaseOpcode->HasD16) {
233
234 // If the only use of image intrinsic is a fptrunc (with conversion to
235 // half) then both fptrunc and image intrinsic will be replaced with image
236 // intrinsic with D16 flag.
237 if (II.hasOneUse()) {
238 Instruction *User = II.user_back();
239
240 if (User->getOpcode() == Instruction::FPTrunc &&
242
243 return modifyIntrinsicCall(II, *User, ImageDimIntr->Intr, IC,
244 [&](auto &Args, auto &ArgTys) {
245 // Change return type of image intrinsic.
246 // Set it to return type of fptrunc.
247 ArgTys[0] = User->getType();
248 });
249 }
250 }
251
252 // Only perform D16 folding if every user of the image sample is
253 // an ExtractElementInst immediately followed by an FPTrunc to half.
255 ExtractTruncPairs;
256 bool AllHalfExtracts = true;
257
258 for (User *U : II.users()) {
259 auto *Ext = dyn_cast<ExtractElementInst>(U);
260 if (!Ext || !Ext->hasOneUse()) {
261 AllHalfExtracts = false;
262 break;
263 }
264
265 auto *Tr = dyn_cast<FPTruncInst>(*Ext->user_begin());
266 if (!Tr || !Tr->getType()->isHalfTy()) {
267 AllHalfExtracts = false;
268 break;
269 }
270
271 ExtractTruncPairs.emplace_back(Ext, Tr);
272 }
273
274 if (!ExtractTruncPairs.empty() && AllHalfExtracts) {
275 auto *VecTy = cast<VectorType>(II.getType());
276 Type *HalfVecTy =
277 VecTy->getWithNewType(Type::getHalfTy(II.getContext()));
278
279 // Obtain the original image sample intrinsic's signature
280 // and replace its return type with the half-vector for D16 folding
282 Intrinsic::getIntrinsicSignature(II.getCalledFunction(), SigTys);
283 SigTys[0] = HalfVecTy;
284
285 Module *M = II.getModule();
286 Function *HalfDecl =
287 Intrinsic::getOrInsertDeclaration(M, ImageDimIntr->Intr, SigTys);
288
289 II.mutateType(HalfVecTy);
290 II.setCalledFunction(HalfDecl);
291
292 IRBuilder<> Builder(II.getContext());
293 for (auto &[Ext, Tr] : ExtractTruncPairs) {
294 Value *Idx = Ext->getIndexOperand();
295
296 Builder.SetInsertPoint(Tr);
297
298 Value *HalfExtract = Builder.CreateExtractElement(&II, Idx);
299 HalfExtract->takeName(Tr);
300
301 Tr->replaceAllUsesWith(HalfExtract);
302 }
303
304 for (auto &[Ext, Tr] : ExtractTruncPairs) {
305 IC.eraseInstFromFunction(*Tr);
306 IC.eraseInstFromFunction(*Ext);
307 }
308
309 return &II;
310 }
311 }
312 }
313
314 // Try to use A16 or G16
315 if (!ST->hasA16() && !ST->hasG16())
316 return std::nullopt;
317
318 // Address is interpreted as float if the instruction has a sampler or as
319 // unsigned int if there is no sampler.
320 bool HasSampler =
322 bool FloatCoord = false;
323 // true means derivatives can be converted to 16 bit, coordinates not
324 bool OnlyDerivatives = false;
325
326 for (unsigned OperandIndex = ImageDimIntr->GradientStart;
327 OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
328 Value *Coord = II.getOperand(OperandIndex);
329 // If the values are not derived from 16-bit values, we cannot optimize.
330 if (!canSafelyConvertTo16Bit(*Coord, HasSampler)) {
331 if (OperandIndex < ImageDimIntr->CoordStart ||
332 ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
333 return std::nullopt;
334 }
335 // All gradients can be converted, so convert only them
336 OnlyDerivatives = true;
337 break;
338 }
339
340 assert(OperandIndex == ImageDimIntr->GradientStart ||
341 FloatCoord == Coord->getType()->isFloatingPointTy());
342 FloatCoord = Coord->getType()->isFloatingPointTy();
343 }
344
345 if (!OnlyDerivatives && !ST->hasA16())
346 OnlyDerivatives = true; // Only supports G16
347
348 // Check if there is a bias parameter and if it can be converted to f16
349 if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
350 Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
351 assert(HasSampler &&
352 "Only image instructions with a sampler can have a bias");
353 if (!canSafelyConvertTo16Bit(*Bias, HasSampler))
354 OnlyDerivatives = true;
355 }
356
357 if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart ==
358 ImageDimIntr->CoordStart))
359 return std::nullopt;
360
361 Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
362 : Type::getInt16Ty(II.getContext());
363
364 return modifyIntrinsicCall(
365 II, II, II.getIntrinsicID(), IC, [&](auto &Args, auto &ArgTys) {
366 ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
367 if (!OnlyDerivatives) {
368 ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
369
370 // Change the bias type
371 if (ImageDimIntr->NumBiasArgs != 0)
372 ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(II.getContext());
373 }
374
375 unsigned EndIndex =
376 OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
377 for (unsigned OperandIndex = ImageDimIntr->GradientStart;
378 OperandIndex < EndIndex; OperandIndex++) {
379 Args[OperandIndex] =
380 convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
381 }
382
383 // Convert the bias
384 if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
385 Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
386 Args[ImageDimIntr->BiasIndex] = convertTo16Bit(*Bias, IC.Builder);
387 }
388 });
389}
390
392 const Value *Op0, const Value *Op1,
393 InstCombiner &IC) const {
394 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
395 // infinity, gives +0.0. If we can prove we don't have one of the special
396 // cases then we can use a normal multiply instead.
397 // TODO: Create and use isKnownFiniteNonZero instead of just matching
398 // constants here.
401 // One operand is not zero or infinity or NaN.
402 return true;
403 }
404
406 if (isKnownNeverInfOrNaN(Op0, SQ) && isKnownNeverInfOrNaN(Op1, SQ)) {
407 // Neither operand is infinity or NaN.
408 return true;
409 }
410 return false;
411}
412
413/// Match an fpext from half to float, or a constant we can convert.
415 Value *Src = nullptr;
416 ConstantFP *CFP = nullptr;
417 if (match(Arg, m_OneUse(m_FPExt(m_Value(Src))))) {
418 if (Src->getType()->isHalfTy())
419 return Src;
420 } else if (match(Arg, m_ConstantFP(CFP))) {
421 bool LosesInfo;
422 APFloat Val(CFP->getValueAPF());
424 if (!LosesInfo)
425 return ConstantFP::get(Type::getHalfTy(Arg->getContext()), Val);
426 }
427 return nullptr;
428}
429
430// Trim all zero components from the end of the vector \p UseV and return
431// an appropriate bitset with known elements.
433 Instruction *I) {
434 auto *VTy = cast<FixedVectorType>(UseV->getType());
435 unsigned VWidth = VTy->getNumElements();
436 APInt DemandedElts = APInt::getAllOnes(VWidth);
437
438 for (int i = VWidth - 1; i > 0; --i) {
439 auto *Elt = findScalarElement(UseV, i);
440 if (!Elt)
441 break;
442
443 if (auto *ConstElt = dyn_cast<Constant>(Elt)) {
444 if (!ConstElt->isNullValue() && !isa<UndefValue>(Elt))
445 break;
446 } else {
447 break;
448 }
449
450 DemandedElts.clearBit(i);
451 }
452
453 return DemandedElts;
454}
455
456// Trim elements of the end of the vector \p V, if they are
457// equal to the first element of the vector.
459 auto *VTy = cast<FixedVectorType>(V->getType());
460 unsigned VWidth = VTy->getNumElements();
461 APInt DemandedElts = APInt::getAllOnes(VWidth);
462 Value *FirstComponent = findScalarElement(V, 0);
463
464 SmallVector<int> ShuffleMask;
465 if (auto *SVI = dyn_cast<ShuffleVectorInst>(V))
466 SVI->getShuffleMask(ShuffleMask);
467
468 for (int I = VWidth - 1; I > 0; --I) {
469 if (ShuffleMask.empty()) {
470 auto *Elt = findScalarElement(V, I);
471 if (!Elt || (Elt != FirstComponent && !isa<UndefValue>(Elt)))
472 break;
473 } else {
474 // Detect identical elements in the shufflevector result, even though
475 // findScalarElement cannot tell us what that element is.
476 if (ShuffleMask[I] != ShuffleMask[0] && ShuffleMask[I] != PoisonMaskElem)
477 break;
478 }
479 DemandedElts.clearBit(I);
480 }
481
482 return DemandedElts;
483}
484
487 APInt DemandedElts,
488 int DMaskIdx = -1,
489 bool IsLoad = true);
490
491/// Return true if it's legal to contract llvm.amdgcn.rcp(llvm.sqrt)
492static bool canContractSqrtToRsq(const FPMathOperator *SqrtOp) {
493 return (SqrtOp->getType()->isFloatTy() &&
494 (SqrtOp->hasApproxFunc() || SqrtOp->getFPAccuracy() >= 1.0f)) ||
495 SqrtOp->getType()->isHalfTy();
496}
497
498/// Return true if we can easily prove that use U is uniform.
499static bool isTriviallyUniform(const Use &U) {
500 Value *V = U.get();
501 if (isa<Constant>(V))
502 return true;
503 if (const auto *A = dyn_cast<Argument>(V))
505 if (const auto *II = dyn_cast<IntrinsicInst>(V)) {
506 if (!AMDGPU::isIntrinsicAlwaysUniform(II->getIntrinsicID()))
507 return false;
508 // If II and U are in different blocks then there is a possibility of
509 // temporal divergence.
510 return II->getParent() == cast<Instruction>(U.getUser())->getParent();
511 }
512 return false;
513}
514
515/// Simplify a lane index operand (e.g. llvm.amdgcn.readlane src1).
516///
517/// The instruction only reads the low 5 bits for wave32, and 6 bits for wave64.
520 unsigned LaneArgIdx) const {
521 unsigned MaskBits = ST->getWavefrontSizeLog2();
522 APInt DemandedMask(32, maskTrailingOnes<unsigned>(MaskBits));
523
524 KnownBits Known(32);
525 if (IC.SimplifyDemandedBits(&II, LaneArgIdx, DemandedMask, Known))
526 return true;
527
528 if (!Known.isConstant())
529 return false;
530
531 // Out of bounds indexes may appear in wave64 code compiled for wave32.
532 // Unlike the DAG version, SimplifyDemandedBits does not change constants, so
533 // manually fix it up.
534
535 Value *LaneArg = II.getArgOperand(LaneArgIdx);
536 Constant *MaskedConst =
537 ConstantInt::get(LaneArg->getType(), Known.getConstant() & DemandedMask);
538 if (MaskedConst != LaneArg) {
539 II.getOperandUse(LaneArgIdx).set(MaskedConst);
540 return true;
541 }
542
543 return false;
544}
545
547 Function &NewCallee, ArrayRef<Value *> Ops) {
549 Old.getOperandBundlesAsDefs(OpBundles);
550
551 CallInst *NewCall = B.CreateCall(&NewCallee, Ops, OpBundles);
552 NewCall->takeName(&Old);
553 return NewCall;
554}
555
556// Return true for sequences of instructions that effectively assign
557// each lane to its thread ID
558static bool isThreadID(const GCNSubtarget &ST, Value *V) {
559 // Case 1:
560 // wave32: mbcnt_lo(-1, 0)
561 // wave64: mbcnt_hi(-1, mbcnt_lo(-1, 0))
567 if (ST.isWave32() && match(V, W32Pred))
568 return true;
569 if (ST.isWave64() && match(V, W64Pred))
570 return true;
571
572 return false;
573}
574
// Attempt to capture situations where the index argument matches
// a DPP pattern, and convert to a DPP-based mov
static std::optional<Instruction *>
// NOTE(review): the line carrying this function's name and parameter list was
// lost when this listing was extracted; the body reads `IC`, `II`, and `ST`,
// so the signature was presumably of the form
//   name(InstCombiner &IC, IntrinsicInst &II, const GCNSubtarget &ST) {
// -- confirm against the upstream source before relying on this listing.
  Value *Val = II.getArgOperand(0);
  Value *Idx = II.getArgOperand(1);
  auto &B = IC.Builder;

  // DPP16 Row Share requires known wave size, architecture support
  if (!ST.isWaveSizeKnown() || !ST.hasDPPRowShare())
    return std::nullopt;

  Value *Tid;
  uint64_t Mask;
  uint64_t RowIdx;
  bool CanDPP16RowShare = false;

  // wave32 requires Mask & 0x1F == 0x10
  // wave64 requires Mask & 0x3F == 0x30
  // MaskCheck is the all-ones lane-index mask for the wave size; MaskTarget
  // keeps only the row-selecting high nibble of that mask.
  uint64_t MaskCheck = (1UL << ST.getWavefrontSizeLog2()) - 1;
  uint64_t MaskTarget = MaskCheck & 0xF0;

  // DPP16 Row Share 0: Idx = Tid & Mask
  auto RowShare0Pred = m_And(m_Value(Tid), m_ConstantInt(Mask));

  // DPP16 Row Share (0 < Row < 15): Idx = (Tid & Mask) | RowIdx
  auto RowSharePred =
      m_Or(m_And(m_Value(Tid), m_ConstantInt(Mask)), m_ConstantInt(RowIdx));

  // DPP16 Row Share 15: Idx = Tid | 0xF
  auto RowShare15Pred = m_Or(m_Value(Tid), m_ConstantInt<0xF>());

  if (match(Idx, RowShare0Pred) && isThreadID(ST, Tid)) {
    if ((Mask & MaskCheck) != MaskTarget)
      return std::nullopt;

    RowIdx = 0;
    CanDPP16RowShare = true;
  } else if (match(Idx, RowSharePred) && isThreadID(ST, Tid) && RowIdx < 15 &&
             RowIdx > 0) {
    if ((Mask & MaskCheck) != MaskTarget)
      return std::nullopt;

    CanDPP16RowShare = true;
  } else if (match(Idx, RowShare15Pred) && isThreadID(ST, Tid)) {
    RowIdx = 15;
    CanDPP16RowShare = true;
  }

  if (CanDPP16RowShare) {
    // Replace with update.dpp using ROW_SHARE<RowIdx>, full row/bank masks
    // (0xF/0xF) and bound_ctrl=false.
    CallInst *UpdateDPP =
        B.CreateIntrinsic(Intrinsic::amdgcn_update_dpp, Val->getType(),
                          {PoisonValue::get(Val->getType()), Val,
                           B.getInt32(AMDGPU::DPP::ROW_SHARE0 | RowIdx),
                           B.getInt32(0xF), B.getInt32(0xF), B.getFalse()});
    UpdateDPP->takeName(&II);
    UpdateDPP->copyMetadata(II);
    return IC.replaceInstUsesWith(II, UpdateDPP);
  }

  // No valid DPP detected
  return std::nullopt;
}
638
641 IntrinsicInst &II) const {
642 const auto IID = II.getIntrinsicID();
643 assert(IID == Intrinsic::amdgcn_readlane ||
644 IID == Intrinsic::amdgcn_readfirstlane ||
645 IID == Intrinsic::amdgcn_permlane64);
646
647 Instruction *OpInst = dyn_cast<Instruction>(II.getOperand(0));
648
649 // Only do this if both instructions are in the same block
650 // (so the exec mask won't change) and the readlane is the only user of its
651 // operand.
652 if (!OpInst || !OpInst->hasOneUser() || OpInst->getParent() != II.getParent())
653 return nullptr;
654
655 const bool IsReadLane = (IID == Intrinsic::amdgcn_readlane);
656
657 // If this is a readlane, check that the second operand is a constant, or is
658 // defined before OpInst so we know it's safe to move this intrinsic higher.
659 Value *LaneID = nullptr;
660 if (IsReadLane) {
661 LaneID = II.getOperand(1);
662
663 // readlane take an extra operand for the lane ID, so we must check if that
664 // LaneID value can be used at the point where we want to move the
665 // intrinsic.
666 if (auto *LaneIDInst = dyn_cast<Instruction>(LaneID)) {
667 if (!IC.getDominatorTree().dominates(LaneIDInst, OpInst))
668 return nullptr;
669 }
670 }
671
672 // Hoist the intrinsic (II) through OpInst.
673 //
674 // (II (OpInst x)) -> (OpInst (II x))
675 const auto DoIt = [&](unsigned OpIdx,
676 Function *NewIntrinsic) -> Instruction * {
678 if (IsReadLane)
679 Ops.push_back(LaneID);
680
681 // Rewrite the intrinsic call.
682 CallInst *NewII = rewriteCall(IC.Builder, II, *NewIntrinsic, Ops);
683
684 // Rewrite OpInst so it takes the result of the intrinsic now.
685 Instruction &NewOp = *OpInst->clone();
686 NewOp.setOperand(OpIdx, NewII);
687 return &NewOp;
688 };
689
690 // TODO(?): Should we do more with permlane64?
691 if (IID == Intrinsic::amdgcn_permlane64 && !isa<BitCastInst>(OpInst))
692 return nullptr;
693
694 if (isa<UnaryOperator>(OpInst))
695 return DoIt(0, II.getCalledFunction());
696
697 if (isa<CastInst>(OpInst)) {
698 Value *Src = OpInst->getOperand(0);
699 Type *SrcTy = Src->getType();
700 if (!isTypeLegal(SrcTy))
701 return nullptr;
702
703 Function *Remangled =
704 Intrinsic::getOrInsertDeclaration(II.getModule(), IID, {SrcTy});
705 return DoIt(0, Remangled);
706 }
707
708 // We can also hoist through binary operators if the other operand is uniform.
709 if (isa<BinaryOperator>(OpInst)) {
710 // FIXME: If we had access to UniformityInfo here we could just check
711 // if the operand is uniform.
712 if (isTriviallyUniform(OpInst->getOperandUse(0)))
713 return DoIt(1, II.getCalledFunction());
714 if (isTriviallyUniform(OpInst->getOperandUse(1)))
715 return DoIt(0, II.getCalledFunction());
716 }
717
718 return nullptr;
719}
720
721std::optional<Instruction *>
723 Intrinsic::ID IID = II.getIntrinsicID();
724 switch (IID) {
725 case Intrinsic::amdgcn_implicitarg_ptr: {
726 uint64_t ImplicitArgBytes = ST->getImplicitArgNumBytes(*II.getFunction());
727
728 uint64_t CurrentOrNullBytes =
729 II.getAttributes().getRetDereferenceableOrNullBytes();
730 if (CurrentOrNullBytes != 0) {
731 // Refine "dereferenceable (A) meets dereferenceable_or_null(B)"
732 // into dereferenceable(max(A, B))
733 uint64_t NewBytes = std::max(CurrentOrNullBytes, ImplicitArgBytes);
734 II.addRetAttr(
735 Attribute::getWithDereferenceableBytes(II.getContext(), NewBytes));
736 II.removeRetAttr(Attribute::DereferenceableOrNull);
737 return &II;
738 }
739
740 uint64_t CurrentBytes = II.getAttributes().getRetDereferenceableBytes();
741 uint64_t NewBytes = std::max(CurrentBytes, ImplicitArgBytes);
742 if (NewBytes != CurrentBytes) {
743 II.addRetAttr(
744 Attribute::getWithDereferenceableBytes(II.getContext(), NewBytes));
745 return &II;
746 }
747
748 return std::nullopt;
749 }
750 case Intrinsic::amdgcn_rcp: {
751 Value *Src = II.getArgOperand(0);
752 if (isa<PoisonValue>(Src))
753 return IC.replaceInstUsesWith(II, Src);
754
755 // TODO: Move to ConstantFolding/InstSimplify?
756 if (isa<UndefValue>(Src)) {
757 Type *Ty = II.getType();
758 auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
759 return IC.replaceInstUsesWith(II, QNaN);
760 }
761
762 if (II.isStrictFP())
763 break;
764
765 if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
766 const APFloat &ArgVal = C->getValueAPF();
767 APFloat Val(ArgVal.getSemantics(), 1);
769
770 // This is more precise than the instruction may give.
771 //
772 // TODO: The instruction always flushes denormal results (except for f16),
773 // should this also?
774 return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
775 }
776
777 FastMathFlags FMF = cast<FPMathOperator>(II).getFastMathFlags();
778 if (!FMF.allowContract())
779 break;
780 auto *SrcCI = dyn_cast<IntrinsicInst>(Src);
781 if (!SrcCI)
782 break;
783
784 auto IID = SrcCI->getIntrinsicID();
785 // llvm.amdgcn.rcp(llvm.amdgcn.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable
786 //
787 // llvm.amdgcn.rcp(llvm.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable and
788 // relaxed.
789 if (IID == Intrinsic::amdgcn_sqrt || IID == Intrinsic::sqrt) {
790 const FPMathOperator *SqrtOp = cast<FPMathOperator>(SrcCI);
791 FastMathFlags InnerFMF = SqrtOp->getFastMathFlags();
792 if (!InnerFMF.allowContract() || !SrcCI->hasOneUse())
793 break;
794
795 if (IID == Intrinsic::sqrt && !canContractSqrtToRsq(SqrtOp))
796 break;
797
799 SrcCI->getModule(), Intrinsic::amdgcn_rsq, {SrcCI->getType()});
800
801 InnerFMF |= FMF;
802 II.setFastMathFlags(InnerFMF);
803
804 II.setCalledFunction(NewDecl);
805 return IC.replaceOperand(II, 0, SrcCI->getArgOperand(0));
806 }
807
808 break;
809 }
810 case Intrinsic::amdgcn_sqrt:
811 case Intrinsic::amdgcn_rsq:
812 case Intrinsic::amdgcn_tanh: {
813 Value *Src = II.getArgOperand(0);
814 if (isa<PoisonValue>(Src))
815 return IC.replaceInstUsesWith(II, Src);
816
817 // TODO: Move to ConstantFolding/InstSimplify?
818 if (isa<UndefValue>(Src)) {
819 Type *Ty = II.getType();
820 auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
821 return IC.replaceInstUsesWith(II, QNaN);
822 }
823
824 // f16 amdgcn.sqrt is identical to regular sqrt.
825 if (IID == Intrinsic::amdgcn_sqrt && Src->getType()->isHalfTy()) {
827 II.getModule(), Intrinsic::sqrt, {II.getType()});
828 II.setCalledFunction(NewDecl);
829 return &II;
830 }
831
832 break;
833 }
834 case Intrinsic::amdgcn_log:
835 case Intrinsic::amdgcn_exp2: {
836 const bool IsLog = IID == Intrinsic::amdgcn_log;
837 const bool IsExp = IID == Intrinsic::amdgcn_exp2;
838 Value *Src = II.getArgOperand(0);
839 Type *Ty = II.getType();
840
841 if (isa<PoisonValue>(Src))
842 return IC.replaceInstUsesWith(II, Src);
843
844 if (IC.getSimplifyQuery().isUndefValue(Src))
846
847 if (ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
848 if (C->isInfinity()) {
849 // exp2(+inf) -> +inf
850 // log2(+inf) -> +inf
851 if (!C->isNegative())
852 return IC.replaceInstUsesWith(II, C);
853
854 // exp2(-inf) -> 0
855 if (IsExp && C->isNegative())
857 }
858
859 if (II.isStrictFP())
860 break;
861
862 if (C->isNaN()) {
863 Constant *Quieted = ConstantFP::get(Ty, C->getValue().makeQuiet());
864 return IC.replaceInstUsesWith(II, Quieted);
865 }
866
867 // f32 instruction doesn't handle denormals, f16 does.
868 if (C->isZero() || (C->getValue().isDenormal() && Ty->isFloatTy())) {
869 Constant *FoldedValue = IsLog ? ConstantFP::getInfinity(Ty, true)
870 : ConstantFP::get(Ty, 1.0);
871 return IC.replaceInstUsesWith(II, FoldedValue);
872 }
873
874 if (IsLog && C->isNegative())
876
877 // TODO: Full constant folding matching hardware behavior.
878 }
879
880 break;
881 }
882 case Intrinsic::amdgcn_frexp_mant:
883 case Intrinsic::amdgcn_frexp_exp: {
884 Value *Src = II.getArgOperand(0);
885 if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
886 int Exp;
887 APFloat Significand =
888 frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);
889
890 if (IID == Intrinsic::amdgcn_frexp_mant) {
891 return IC.replaceInstUsesWith(
892 II, ConstantFP::get(II.getContext(), Significand));
893 }
894
895 // Match instruction special case behavior.
896 if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
897 Exp = 0;
898
899 return IC.replaceInstUsesWith(II,
900 ConstantInt::getSigned(II.getType(), Exp));
901 }
902
903 if (isa<PoisonValue>(Src))
904 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
905
906 if (isa<UndefValue>(Src)) {
907 return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
908 }
909
910 break;
911 }
912 case Intrinsic::amdgcn_class: {
913 Value *Src0 = II.getArgOperand(0);
914 Value *Src1 = II.getArgOperand(1);
915 const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
916 if (CMask) {
917 II.setCalledOperand(Intrinsic::getOrInsertDeclaration(
918 II.getModule(), Intrinsic::is_fpclass, Src0->getType()));
919
920 // Clamp any excess bits, as they're illegal for the generic intrinsic.
921 II.setArgOperand(1, ConstantInt::get(Src1->getType(),
922 CMask->getZExtValue() & fcAllFlags));
923 return &II;
924 }
925
926 // Propagate poison.
927 if (isa<PoisonValue>(Src0) || isa<PoisonValue>(Src1))
928 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
929
930 // llvm.amdgcn.class(_, undef) -> false
931 if (IC.getSimplifyQuery().isUndefValue(Src1))
932 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
933
934 // llvm.amdgcn.class(undef, mask) -> mask != 0
935 if (IC.getSimplifyQuery().isUndefValue(Src0)) {
936 Value *CmpMask = IC.Builder.CreateICmpNE(
937 Src1, ConstantInt::getNullValue(Src1->getType()));
938 return IC.replaceInstUsesWith(II, CmpMask);
939 }
940 break;
941 }
942 case Intrinsic::amdgcn_cvt_pkrtz: {
943 auto foldFPTruncToF16RTZ = [](Value *Arg) -> Value * {
944 Type *HalfTy = Type::getHalfTy(Arg->getContext());
945
946 if (isa<PoisonValue>(Arg))
947 return PoisonValue::get(HalfTy);
948 if (isa<UndefValue>(Arg))
949 return UndefValue::get(HalfTy);
950
951 ConstantFP *CFP = nullptr;
952 if (match(Arg, m_ConstantFP(CFP))) {
953 bool LosesInfo;
954 APFloat Val(CFP->getValueAPF());
956 return ConstantFP::get(HalfTy, Val);
957 }
958
959 Value *Src = nullptr;
960 if (match(Arg, m_FPExt(m_Value(Src)))) {
961 if (Src->getType()->isHalfTy())
962 return Src;
963 }
964
965 return nullptr;
966 };
967
968 if (Value *Src0 = foldFPTruncToF16RTZ(II.getArgOperand(0))) {
969 if (Value *Src1 = foldFPTruncToF16RTZ(II.getArgOperand(1))) {
970 Value *V = PoisonValue::get(II.getType());
971 V = IC.Builder.CreateInsertElement(V, Src0, (uint64_t)0);
972 V = IC.Builder.CreateInsertElement(V, Src1, (uint64_t)1);
973 return IC.replaceInstUsesWith(II, V);
974 }
975 }
976
977 break;
978 }
979 case Intrinsic::amdgcn_cvt_pknorm_i16:
980 case Intrinsic::amdgcn_cvt_pknorm_u16:
981 case Intrinsic::amdgcn_cvt_pk_i16:
982 case Intrinsic::amdgcn_cvt_pk_u16: {
983 Value *Src0 = II.getArgOperand(0);
984 Value *Src1 = II.getArgOperand(1);
985
986 // TODO: Replace call with scalar operation if only one element is poison.
987 if (isa<PoisonValue>(Src0) && isa<PoisonValue>(Src1))
988 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
989
990 if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
991 return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
992 }
993
994 break;
995 }
996 case Intrinsic::amdgcn_cvt_off_f32_i4: {
997 Value* Arg = II.getArgOperand(0);
998 Type *Ty = II.getType();
999
1000 if (isa<PoisonValue>(Arg))
1001 return IC.replaceInstUsesWith(II, PoisonValue::get(Ty));
1002
1003 if(IC.getSimplifyQuery().isUndefValue(Arg))
1005
1006 ConstantInt *CArg = dyn_cast<ConstantInt>(II.getArgOperand(0));
1007 if (!CArg)
1008 break;
1009
1010 // Tabulated 0.0625 * (sext (CArg & 0xf)).
1011 constexpr size_t ResValsSize = 16;
1012 static constexpr float ResVals[ResValsSize] = {
1013 0.0, 0.0625, 0.125, 0.1875, 0.25, 0.3125, 0.375, 0.4375,
1014 -0.5, -0.4375, -0.375, -0.3125, -0.25, -0.1875, -0.125, -0.0625};
1015 Constant *Res =
1016 ConstantFP::get(Ty, ResVals[CArg->getZExtValue() & (ResValsSize - 1)]);
1017 return IC.replaceInstUsesWith(II, Res);
1018 }
1019 case Intrinsic::amdgcn_ubfe:
1020 case Intrinsic::amdgcn_sbfe: {
1021 // Decompose simple cases into standard shifts.
1022 Value *Src = II.getArgOperand(0);
1023 if (isa<UndefValue>(Src)) {
1024 return IC.replaceInstUsesWith(II, Src);
1025 }
1026
1027 unsigned Width;
1028 Type *Ty = II.getType();
1029 unsigned IntSize = Ty->getIntegerBitWidth();
1030
1031 ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
1032 if (CWidth) {
1033 Width = CWidth->getZExtValue();
1034 if ((Width & (IntSize - 1)) == 0) {
1036 }
1037
1038 // Hardware ignores high bits, so remove those.
1039 if (Width >= IntSize) {
1040 return IC.replaceOperand(
1041 II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
1042 }
1043 }
1044
1045 unsigned Offset;
1046 ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
1047 if (COffset) {
1048 Offset = COffset->getZExtValue();
1049 if (Offset >= IntSize) {
1050 return IC.replaceOperand(
1051 II, 1,
1052 ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
1053 }
1054 }
1055
1056 bool Signed = IID == Intrinsic::amdgcn_sbfe;
1057
1058 if (!CWidth || !COffset)
1059 break;
1060
1061 // The case of Width == 0 is handled above, which makes this transformation
1062 // safe. If Width == 0, then the ashr and lshr instructions become poison
1063 // value since the shift amount would be equal to the bit size.
1064 assert(Width != 0);
1065
1066 // TODO: This allows folding to undef when the hardware has specific
1067 // behavior?
1068 if (Offset + Width < IntSize) {
1069 Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
1070 Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
1071 : IC.Builder.CreateLShr(Shl, IntSize - Width);
1072 RightShift->takeName(&II);
1073 return IC.replaceInstUsesWith(II, RightShift);
1074 }
1075
1076 Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
1077 : IC.Builder.CreateLShr(Src, Offset);
1078
1079 RightShift->takeName(&II);
1080 return IC.replaceInstUsesWith(II, RightShift);
1081 }
1082 case Intrinsic::amdgcn_exp:
1083 case Intrinsic::amdgcn_exp_row:
1084 case Intrinsic::amdgcn_exp_compr: {
1085 ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
1086 unsigned EnBits = En->getZExtValue();
1087 if (EnBits == 0xf)
1088 break; // All inputs enabled.
1089
1090 bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
1091 bool Changed = false;
1092 for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
1093 if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
1094 (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
1095 Value *Src = II.getArgOperand(I + 2);
1096 if (!isa<PoisonValue>(Src)) {
1097 IC.replaceOperand(II, I + 2, PoisonValue::get(Src->getType()));
1098 Changed = true;
1099 }
1100 }
1101 }
1102
1103 if (Changed) {
1104 return &II;
1105 }
1106
1107 break;
1108 }
1109 case Intrinsic::amdgcn_fmed3: {
1110 Value *Src0 = II.getArgOperand(0);
1111 Value *Src1 = II.getArgOperand(1);
1112 Value *Src2 = II.getArgOperand(2);
1113
1114 for (Value *Src : {Src0, Src1, Src2}) {
1115 if (isa<PoisonValue>(Src))
1116 return IC.replaceInstUsesWith(II, Src);
1117 }
1118
1119 if (II.isStrictFP())
1120 break;
1121
1122 // med3 with a nan input acts like
1123 // v_min_f32(v_min_f32(s0, s1), s2)
1124 //
1125 // Signalingness is ignored with ieee=0, so we fold to
1126 // minimumnum/maximumnum. With ieee=1, the v_min_f32 acts like llvm.minnum
1127 // with signaling nan handling. With ieee=0, like llvm.minimumnum except a
1128 // returned signaling nan will not be quieted.
1129
1130 // ieee=1
1131 // s0 snan: s2
1132 // s1 snan: s2
1133 // s2 snan: qnan
1134
1135 // s0 qnan: min(s1, s2)
1136 // s1 qnan: min(s0, s2)
1137 // s2 qnan: min(s0, s1)
1138
1139 // ieee=0
1140 // s0 _nan: min(s1, s2)
1141 // s1 _nan: min(s0, s2)
1142 // s2 _nan: min(s0, s1)
1143
1144 // med3 behavior with infinity
1145 // s0 +inf: max(s1, s2)
1146 // s1 +inf: max(s0, s2)
1147 // s2 +inf: max(s0, s1)
1148 // s0 -inf: min(s1, s2)
1149 // s1 -inf: min(s0, s2)
1150 // s2 -inf: min(s0, s1)
1151
1152 // Checking for NaN before canonicalization provides better fidelity when
1153 // mapping other operations onto fmed3 since the order of operands is
1154 // unchanged.
1155 Value *V = nullptr;
1156 const APFloat *ConstSrc0 = nullptr;
1157 const APFloat *ConstSrc1 = nullptr;
1158 const APFloat *ConstSrc2 = nullptr;
1159
1160 if ((match(Src0, m_APFloat(ConstSrc0)) &&
1161 (ConstSrc0->isNaN() || ConstSrc0->isInfinity())) ||
1162 isa<UndefValue>(Src0)) {
1163 const bool IsPosInfinity = ConstSrc0 && ConstSrc0->isPosInfinity();
1164 switch (fpenvIEEEMode(II)) {
1165 case KnownIEEEMode::On:
1166 // TODO: If Src2 is snan, does it need quieting?
1167 if (ConstSrc0 && ConstSrc0->isNaN() && ConstSrc0->isSignaling())
1168 return IC.replaceInstUsesWith(II, Src2);
1169
1170 V = IsPosInfinity ? IC.Builder.CreateMaxNum(Src1, Src2)
1171 : IC.Builder.CreateMinNum(Src1, Src2);
1172 break;
1173 case KnownIEEEMode::Off:
1174 V = IsPosInfinity ? IC.Builder.CreateMaximumNum(Src1, Src2)
1175 : IC.Builder.CreateMinimumNum(Src1, Src2);
1176 break;
1178 break;
1179 }
1180 } else if ((match(Src1, m_APFloat(ConstSrc1)) &&
1181 (ConstSrc1->isNaN() || ConstSrc1->isInfinity())) ||
1182 isa<UndefValue>(Src1)) {
1183 const bool IsPosInfinity = ConstSrc1 && ConstSrc1->isPosInfinity();
1184 switch (fpenvIEEEMode(II)) {
1185 case KnownIEEEMode::On:
1186 // TODO: If Src2 is snan, does it need quieting?
1187 if (ConstSrc1 && ConstSrc1->isNaN() && ConstSrc1->isSignaling())
1188 return IC.replaceInstUsesWith(II, Src2);
1189
1190 V = IsPosInfinity ? IC.Builder.CreateMaxNum(Src0, Src2)
1191 : IC.Builder.CreateMinNum(Src0, Src2);
1192 break;
1193 case KnownIEEEMode::Off:
1194 V = IsPosInfinity ? IC.Builder.CreateMaximumNum(Src0, Src2)
1195 : IC.Builder.CreateMinimumNum(Src0, Src2);
1196 break;
1198 break;
1199 }
1200 } else if ((match(Src2, m_APFloat(ConstSrc2)) &&
1201 (ConstSrc2->isNaN() || ConstSrc2->isInfinity())) ||
1202 isa<UndefValue>(Src2)) {
1203 switch (fpenvIEEEMode(II)) {
1204 case KnownIEEEMode::On:
1205 if (ConstSrc2 && ConstSrc2->isNaN() && ConstSrc2->isSignaling()) {
1206 auto *Quieted = ConstantFP::get(II.getType(), ConstSrc2->makeQuiet());
1207 return IC.replaceInstUsesWith(II, Quieted);
1208 }
1209
1210 V = (ConstSrc2 && ConstSrc2->isPosInfinity())
1211 ? IC.Builder.CreateMaxNum(Src0, Src1)
1212 : IC.Builder.CreateMinNum(Src0, Src1);
1213 break;
1214 case KnownIEEEMode::Off:
1215 V = (ConstSrc2 && ConstSrc2->isNegInfinity())
1216 ? IC.Builder.CreateMinimumNum(Src0, Src1)
1217 : IC.Builder.CreateMaximumNum(Src0, Src1);
1218 break;
1220 break;
1221 }
1222 }
1223
1224 if (V) {
1225 if (auto *CI = dyn_cast<CallInst>(V)) {
1226 CI->copyFastMathFlags(&II);
1227 CI->takeName(&II);
1228 }
1229 return IC.replaceInstUsesWith(II, V);
1230 }
1231
1232 bool Swap = false;
1233 // Canonicalize constants to RHS operands.
1234 //
1235 // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
1236 if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
1237 std::swap(Src0, Src1);
1238 Swap = true;
1239 }
1240
1241 if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
1242 std::swap(Src1, Src2);
1243 Swap = true;
1244 }
1245
1246 if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
1247 std::swap(Src0, Src1);
1248 Swap = true;
1249 }
1250
1251 if (Swap) {
1252 II.setArgOperand(0, Src0);
1253 II.setArgOperand(1, Src1);
1254 II.setArgOperand(2, Src2);
1255 return &II;
1256 }
1257
1258 if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
1259 if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
1260 if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
1261 APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
1262 C2->getValueAPF());
1263 return IC.replaceInstUsesWith(II,
1264 ConstantFP::get(II.getType(), Result));
1265 }
1266 }
1267 }
1268
1269 if (!ST->hasMed3_16())
1270 break;
1271
1272 // Repeat floating-point width reduction done for minnum/maxnum.
1273 // fmed3((fpext X), (fpext Y), (fpext Z)) -> fpext (fmed3(X, Y, Z))
1274 if (Value *X = matchFPExtFromF16(Src0)) {
1275 if (Value *Y = matchFPExtFromF16(Src1)) {
1276 if (Value *Z = matchFPExtFromF16(Src2)) {
1277 Value *NewCall = IC.Builder.CreateIntrinsic(
1278 IID, {X->getType()}, {X, Y, Z}, &II, II.getName());
1279 return new FPExtInst(NewCall, II.getType());
1280 }
1281 }
1282 }
1283
1284 break;
1285 }
1286 case Intrinsic::amdgcn_icmp:
1287 case Intrinsic::amdgcn_fcmp: {
1288 const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
1289 // Guard against invalid arguments.
1290 int64_t CCVal = CC->getZExtValue();
1291 bool IsInteger = IID == Intrinsic::amdgcn_icmp;
1292 if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
1293 CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
1294 (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
1296 break;
1297
1298 Value *Src0 = II.getArgOperand(0);
1299 Value *Src1 = II.getArgOperand(1);
1300
1301 if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
1302 if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
1304 (ICmpInst::Predicate)CCVal, CSrc0, CSrc1, DL);
1305 if (CCmp && CCmp->isNullValue()) {
1306 return IC.replaceInstUsesWith(
1307 II, IC.Builder.CreateSExt(CCmp, II.getType()));
1308 }
1309
1310 // The result of V_ICMP/V_FCMP assembly instructions (which this
1311 // intrinsic exposes) is one bit per thread, masked with the EXEC
1312 // register (which contains the bitmask of live threads). So a
1313 // comparison that always returns true is the same as a read of the
1314 // EXEC register.
1315 Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
1316 MDNode *MD = MDNode::get(II.getContext(), MDArgs);
1317 Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
1318 CallInst *NewCall = IC.Builder.CreateIntrinsic(Intrinsic::read_register,
1319 II.getType(), Args);
1320 NewCall->addFnAttr(Attribute::Convergent);
1321 NewCall->takeName(&II);
1322 return IC.replaceInstUsesWith(II, NewCall);
1323 }
1324
1325 // Canonicalize constants to RHS.
1326 CmpInst::Predicate SwapPred =
1328 II.setArgOperand(0, Src1);
1329 II.setArgOperand(1, Src0);
1330 II.setArgOperand(
1331 2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
1332 return &II;
1333 }
1334
1335 if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
1336 break;
1337
1338 // Canonicalize compare eq with true value to compare != 0
1339 // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
1340 // -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
1341 // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
1342 // -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
1343 Value *ExtSrc;
1344 if (CCVal == CmpInst::ICMP_EQ &&
1345 ((match(Src1, PatternMatch::m_One()) &&
1346 match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
1347 (match(Src1, PatternMatch::m_AllOnes()) &&
1348 match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
1349 ExtSrc->getType()->isIntegerTy(1)) {
1351 IC.replaceOperand(II, 2,
1352 ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
1353 return &II;
1354 }
1355
1356 CmpPredicate SrcPred;
1357 Value *SrcLHS;
1358 Value *SrcRHS;
1359
1360 // Fold compare eq/ne with 0 from a compare result as the predicate to the
1361 // intrinsic. The typical use is a wave vote function in the library, which
1362 // will be fed from a user code condition compared with 0. Fold in the
1363 // redundant compare.
1364
1365 // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
1366 // -> llvm.amdgcn.[if]cmp(a, b, pred)
1367 //
1368 // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
1369 // -> llvm.amdgcn.[if]cmp(a, b, inv pred)
1370 if (match(Src1, PatternMatch::m_Zero()) &&
1372 m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
1373 PatternMatch::m_Value(SrcRHS))))) {
1374 if (CCVal == CmpInst::ICMP_EQ)
1375 SrcPred = CmpInst::getInversePredicate(SrcPred);
1376
1377 Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
1378 ? Intrinsic::amdgcn_fcmp
1379 : Intrinsic::amdgcn_icmp;
1380
1381 Type *Ty = SrcLHS->getType();
1382 if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
1383 // Promote to next legal integer type.
1384 unsigned Width = CmpType->getBitWidth();
1385 unsigned NewWidth = Width;
1386
1387 // Don't do anything for i1 comparisons.
1388 if (Width == 1)
1389 break;
1390
1391 if (Width <= 16)
1392 NewWidth = 16;
1393 else if (Width <= 32)
1394 NewWidth = 32;
1395 else if (Width <= 64)
1396 NewWidth = 64;
1397 else
1398 break; // Can't handle this.
1399
1400 if (Width != NewWidth) {
1401 IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
1402 if (CmpInst::isSigned(SrcPred)) {
1403 SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
1404 SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
1405 } else {
1406 SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
1407 SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
1408 }
1409 }
1410 } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
1411 break;
1412
1413 Value *Args[] = {SrcLHS, SrcRHS,
1414 ConstantInt::get(CC->getType(), SrcPred)};
1415 CallInst *NewCall = IC.Builder.CreateIntrinsic(
1416 NewIID, {II.getType(), SrcLHS->getType()}, Args);
1417 NewCall->takeName(&II);
1418 return IC.replaceInstUsesWith(II, NewCall);
1419 }
1420
1421 break;
1422 }
1423 case Intrinsic::amdgcn_mbcnt_hi: {
1424 // exec_hi is all 0, so this is just a copy.
1425 if (ST->isWave32())
1426 return IC.replaceInstUsesWith(II, II.getArgOperand(1));
1427 break;
1428 }
1429 case Intrinsic::amdgcn_ballot: {
1430 Value *Arg = II.getArgOperand(0);
1431 if (isa<PoisonValue>(Arg))
1432 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1433
1434 if (auto *Src = dyn_cast<ConstantInt>(Arg)) {
1435 if (Src->isZero()) {
1436 // amdgcn.ballot(i1 0) is zero.
1437 return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
1438 }
1439 }
1440 if (ST->isWave32() && II.getType()->getIntegerBitWidth() == 64) {
1441 // %b64 = call i64 ballot.i64(...)
1442 // =>
1443 // %b32 = call i32 ballot.i32(...)
1444 // %b64 = zext i32 %b32 to i64
1446 IC.Builder.CreateIntrinsic(Intrinsic::amdgcn_ballot,
1447 {IC.Builder.getInt32Ty()},
1448 {II.getArgOperand(0)}),
1449 II.getType());
1450 Call->takeName(&II);
1451 return IC.replaceInstUsesWith(II, Call);
1452 }
1453 break;
1454 }
1455 case Intrinsic::amdgcn_wavefrontsize: {
1456 if (ST->isWaveSizeKnown())
1457 return IC.replaceInstUsesWith(
1458 II, ConstantInt::get(II.getType(), ST->getWavefrontSize()));
1459 break;
1460 }
1461 case Intrinsic::amdgcn_wqm_vote: {
1462 // wqm_vote is identity when the argument is constant.
1463 if (!isa<Constant>(II.getArgOperand(0)))
1464 break;
1465
1466 return IC.replaceInstUsesWith(II, II.getArgOperand(0));
1467 }
1468 case Intrinsic::amdgcn_kill: {
1469 const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
1470 if (!C || !C->getZExtValue())
1471 break;
1472
1473 // amdgcn.kill(i1 1) is a no-op
1474 return IC.eraseInstFromFunction(II);
1475 }
1476 case Intrinsic::amdgcn_update_dpp: {
1477 Value *Old = II.getArgOperand(0);
1478
1479 auto *BC = cast<ConstantInt>(II.getArgOperand(5));
1480 auto *RM = cast<ConstantInt>(II.getArgOperand(3));
1481 auto *BM = cast<ConstantInt>(II.getArgOperand(4));
1482 if (BC->isNullValue() || RM->getZExtValue() != 0xF ||
1483 BM->getZExtValue() != 0xF || isa<PoisonValue>(Old))
1484 break;
1485
1486 // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value.
1487 return IC.replaceOperand(II, 0, PoisonValue::get(Old->getType()));
1488 }
1489 case Intrinsic::amdgcn_permlane16:
1490 case Intrinsic::amdgcn_permlane16_var:
1491 case Intrinsic::amdgcn_permlanex16:
1492 case Intrinsic::amdgcn_permlanex16_var: {
1493 // Discard vdst_in if it's not going to be read.
1494 Value *VDstIn = II.getArgOperand(0);
1495 if (isa<PoisonValue>(VDstIn))
1496 break;
1497
1498 // FetchInvalid operand idx.
1499 unsigned int FiIdx = (IID == Intrinsic::amdgcn_permlane16 ||
1500 IID == Intrinsic::amdgcn_permlanex16)
1501 ? 4 /* for permlane16 and permlanex16 */
1502 : 3; /* for permlane16_var and permlanex16_var */
1503
1504 // BoundCtrl operand idx.
1505 // For permlane16 and permlanex16 it should be 5
1506 // For Permlane16_var and permlanex16_var it should be 4
1507 unsigned int BcIdx = FiIdx + 1;
1508
1509 ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(FiIdx));
1510 ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(BcIdx));
1511 if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
1512 break;
1513
1514 return IC.replaceOperand(II, 0, PoisonValue::get(VDstIn->getType()));
1515 }
1516 case Intrinsic::amdgcn_permlane64:
1517 case Intrinsic::amdgcn_readfirstlane:
1518 case Intrinsic::amdgcn_readlane:
1519 case Intrinsic::amdgcn_ds_bpermute: {
1520 // If the data argument is uniform these intrinsics return it unchanged.
1521 unsigned SrcIdx = IID == Intrinsic::amdgcn_ds_bpermute ? 1 : 0;
1522 const Use &Src = II.getArgOperandUse(SrcIdx);
1523 if (isTriviallyUniform(Src))
1524 return IC.replaceInstUsesWith(II, Src.get());
1525
1526 if (IID == Intrinsic::amdgcn_readlane &&
1528 return &II;
1529
1530 // If the lane argument of bpermute is uniform, change it to readlane. This
1531 // generates better code and can enable further optimizations because
1532 // readlane is AlwaysUniform.
1533 if (IID == Intrinsic::amdgcn_ds_bpermute) {
1534 const Use &Lane = II.getArgOperandUse(0);
1535 if (isTriviallyUniform(Lane)) {
1536 Value *NewLane = IC.Builder.CreateLShr(Lane, 2);
1538 II.getModule(), Intrinsic::amdgcn_readlane, II.getType());
1539 II.setCalledFunction(NewDecl);
1540 II.setOperand(0, Src);
1541 II.setOperand(1, NewLane);
1542 return &II;
1543 }
1544 }
1545
1546 if (IID != Intrinsic::amdgcn_ds_bpermute) {
1548 return Res;
1549 }
1550
1551 return std::nullopt;
1552 }
1553 case Intrinsic::amdgcn_writelane: {
1554 // TODO: Fold bitcast like readlane.
1555 if (simplifyDemandedLaneMaskArg(IC, II, 1))
1556 return &II;
1557 return std::nullopt;
1558 }
1559 case Intrinsic::amdgcn_trig_preop: {
1560 // The intrinsic is declared with name mangling, but currently the
1561 // instruction only exists for f64
1562 if (!II.getType()->isDoubleTy())
1563 break;
1564
1565 Value *Src = II.getArgOperand(0);
1566 Value *Segment = II.getArgOperand(1);
1567 if (isa<PoisonValue>(Src) || isa<PoisonValue>(Segment))
1568 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1569
1570 if (isa<UndefValue>(Segment))
1571 return IC.replaceInstUsesWith(II, ConstantFP::getZero(II.getType()));
1572
1573 // Sign bit is not used.
1574 Value *StrippedSign = InstCombiner::stripSignOnlyFPOps(Src);
1575 if (StrippedSign != Src)
1576 return IC.replaceOperand(II, 0, StrippedSign);
1577
1578 if (II.isStrictFP())
1579 break;
1580
1581 const ConstantFP *CSrc = dyn_cast<ConstantFP>(Src);
1582 if (!CSrc && !isa<UndefValue>(Src))
1583 break;
1584
1585 // The instruction ignores special cases, and literally just extracts the
1586 // exponents. Fold undef to nan, and index the table as normal.
1587 APInt FSrcInt = CSrc ? CSrc->getValueAPF().bitcastToAPInt()
1588 : APFloat::getQNaN(II.getType()->getFltSemantics())
1589 .bitcastToAPInt();
1590
1591 const ConstantInt *Cseg = dyn_cast<ConstantInt>(Segment);
1592 if (!Cseg) {
1593 if (isa<UndefValue>(Src))
1594 return IC.replaceInstUsesWith(II, ConstantFP::getZero(II.getType()));
1595 break;
1596 }
1597
1598 unsigned Exponent = FSrcInt.extractBitsAsZExtValue(11, 52);
1599 unsigned SegmentVal = Cseg->getValue().trunc(5).getZExtValue();
1600 unsigned Shift = SegmentVal * 53;
1601 if (Exponent > 1077)
1602 Shift += Exponent - 1077;
1603
1604 // 2.0/PI table.
1605 static const uint32_t TwoByPi[] = {
1606 0xa2f9836e, 0x4e441529, 0xfc2757d1, 0xf534ddc0, 0xdb629599, 0x3c439041,
1607 0xfe5163ab, 0xdebbc561, 0xb7246e3a, 0x424dd2e0, 0x06492eea, 0x09d1921c,
1608 0xfe1deb1c, 0xb129a73e, 0xe88235f5, 0x2ebb4484, 0xe99c7026, 0xb45f7e41,
1609 0x3991d639, 0x835339f4, 0x9c845f8b, 0xbdf9283b, 0x1ff897ff, 0xde05980f,
1610 0xef2f118b, 0x5a0a6d1f, 0x6d367ecf, 0x27cb09b7, 0x4f463f66, 0x9e5fea2d,
1611 0x7527bac7, 0xebe5f17b, 0x3d0739f7, 0x8a5292ea, 0x6bfb5fb1, 0x1f8d5d08,
1612 0x56033046};
1613
1614 // Return 0 for outbound segment (hardware behavior).
1615 unsigned Idx = Shift >> 5;
1616 if (Idx + 2 >= std::size(TwoByPi)) {
1617 APFloat Zero = APFloat::getZero(II.getType()->getFltSemantics());
1618 return IC.replaceInstUsesWith(II, ConstantFP::get(II.getType(), Zero));
1619 }
1620
1621 unsigned BShift = Shift & 0x1f;
1622 uint64_t Thi = Make_64(TwoByPi[Idx], TwoByPi[Idx + 1]);
1623 uint64_t Tlo = Make_64(TwoByPi[Idx + 2], 0);
1624 if (BShift)
1625 Thi = (Thi << BShift) | (Tlo >> (64 - BShift));
1626 Thi = Thi >> 11;
1627 APFloat Result = APFloat((double)Thi);
1628
1629 int Scale = -53 - Shift;
1630 if (Exponent >= 1968)
1631 Scale += 128;
1632
1633 Result = scalbn(Result, Scale, RoundingMode::NearestTiesToEven);
1634 return IC.replaceInstUsesWith(II, ConstantFP::get(Src->getType(), Result));
1635 }
1636 case Intrinsic::amdgcn_fmul_legacy: {
1637 Value *Op0 = II.getArgOperand(0);
1638 Value *Op1 = II.getArgOperand(1);
1639
1640 for (Value *Src : {Op0, Op1}) {
1641 if (isa<PoisonValue>(Src))
1642 return IC.replaceInstUsesWith(II, Src);
1643 }
1644
1645 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
1646 // infinity, gives +0.0.
1647 // TODO: Move to InstSimplify?
1648 if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
1650 return IC.replaceInstUsesWith(II, ConstantFP::getZero(II.getType()));
1651
1652 // If we can prove we don't have one of the special cases then we can use a
1653 // normal fmul instruction instead.
1654 if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) {
1655 auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
1656 FMul->takeName(&II);
1657 return IC.replaceInstUsesWith(II, FMul);
1658 }
1659 break;
1660 }
1661 case Intrinsic::amdgcn_fma_legacy: {
1662 Value *Op0 = II.getArgOperand(0);
1663 Value *Op1 = II.getArgOperand(1);
1664 Value *Op2 = II.getArgOperand(2);
1665
1666 for (Value *Src : {Op0, Op1, Op2}) {
1667 if (isa<PoisonValue>(Src))
1668 return IC.replaceInstUsesWith(II, Src);
1669 }
1670
1671 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
1672 // infinity, gives +0.0.
1673 // TODO: Move to InstSimplify?
1674 if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
1676 // It's tempting to just return Op2 here, but that would give the wrong
1677 // result if Op2 was -0.0.
1678 auto *Zero = ConstantFP::getZero(II.getType());
1679 auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
1680 FAdd->takeName(&II);
1681 return IC.replaceInstUsesWith(II, FAdd);
1682 }
1683
1684 // If we can prove we don't have one of the special cases then we can use a
1685 // normal fma instead.
1686 if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) {
1687 II.setCalledOperand(Intrinsic::getOrInsertDeclaration(
1688 II.getModule(), Intrinsic::fma, II.getType()));
1689 return &II;
1690 }
1691 break;
1692 }
1693 case Intrinsic::amdgcn_is_shared:
1694 case Intrinsic::amdgcn_is_private: {
1695 Value *Src = II.getArgOperand(0);
1696 if (isa<PoisonValue>(Src))
1697 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1698 if (isa<UndefValue>(Src))
1699 return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
1700
1701 if (isa<ConstantPointerNull>(II.getArgOperand(0)))
1702 return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType()));
1703 break;
1704 }
1705 case Intrinsic::amdgcn_make_buffer_rsrc: {
1706 Value *Src = II.getArgOperand(0);
1707 if (isa<PoisonValue>(Src))
1708 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1709 return std::nullopt;
1710 }
1711 case Intrinsic::amdgcn_raw_buffer_store_format:
1712 case Intrinsic::amdgcn_struct_buffer_store_format:
1713 case Intrinsic::amdgcn_raw_tbuffer_store:
1714 case Intrinsic::amdgcn_struct_tbuffer_store:
1715 case Intrinsic::amdgcn_image_store_1d:
1716 case Intrinsic::amdgcn_image_store_1darray:
1717 case Intrinsic::amdgcn_image_store_2d:
1718 case Intrinsic::amdgcn_image_store_2darray:
1719 case Intrinsic::amdgcn_image_store_2darraymsaa:
1720 case Intrinsic::amdgcn_image_store_2dmsaa:
1721 case Intrinsic::amdgcn_image_store_3d:
1722 case Intrinsic::amdgcn_image_store_cube:
1723 case Intrinsic::amdgcn_image_store_mip_1d:
1724 case Intrinsic::amdgcn_image_store_mip_1darray:
1725 case Intrinsic::amdgcn_image_store_mip_2d:
1726 case Intrinsic::amdgcn_image_store_mip_2darray:
1727 case Intrinsic::amdgcn_image_store_mip_3d:
1728 case Intrinsic::amdgcn_image_store_mip_cube: {
1729 if (!isa<FixedVectorType>(II.getArgOperand(0)->getType()))
1730 break;
1731
1732 APInt DemandedElts;
1733 if (ST->hasDefaultComponentBroadcast())
1734 DemandedElts = defaultComponentBroadcast(II.getArgOperand(0));
1735 else if (ST->hasDefaultComponentZero())
1736 DemandedElts = trimTrailingZerosInVector(IC, II.getArgOperand(0), &II);
1737 else
1738 break;
1739
1740 int DMaskIdx = getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID()) ? 1 : -1;
1741 if (simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx,
1742 false)) {
1743 return IC.eraseInstFromFunction(II);
1744 }
1745
1746 break;
1747 }
1748 case Intrinsic::amdgcn_prng_b32: {
1749 auto *Src = II.getArgOperand(0);
1750 if (isa<UndefValue>(Src)) {
1751 return IC.replaceInstUsesWith(II, Src);
1752 }
1753 return std::nullopt;
1754 }
1755 case Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
1756 case Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4: {
1757 Value *Src0 = II.getArgOperand(0);
1758 Value *Src1 = II.getArgOperand(1);
1759 uint64_t CBSZ = cast<ConstantInt>(II.getArgOperand(3))->getZExtValue();
1760 uint64_t BLGP = cast<ConstantInt>(II.getArgOperand(4))->getZExtValue();
1761 auto *Src0Ty = cast<FixedVectorType>(Src0->getType());
1762 auto *Src1Ty = cast<FixedVectorType>(Src1->getType());
1763
1764 auto getFormatNumRegs = [](unsigned FormatVal) {
1765 switch (FormatVal) {
1768 return 6u;
1770 return 4u;
1773 return 8u;
1774 default:
1775 llvm_unreachable("invalid format value");
1776 }
1777 };
1778
1779 bool MadeChange = false;
1780 unsigned Src0NumElts = getFormatNumRegs(CBSZ);
1781 unsigned Src1NumElts = getFormatNumRegs(BLGP);
1782
1783 // Depending on the used format, fewer registers are required so shrink the
1784 // vector type.
1785 if (Src0Ty->getNumElements() > Src0NumElts) {
1786 Src0 = IC.Builder.CreateExtractVector(
1787 FixedVectorType::get(Src0Ty->getElementType(), Src0NumElts), Src0,
1788 uint64_t(0));
1789 MadeChange = true;
1790 }
1791
1792 if (Src1Ty->getNumElements() > Src1NumElts) {
1793 Src1 = IC.Builder.CreateExtractVector(
1794 FixedVectorType::get(Src1Ty->getElementType(), Src1NumElts), Src1,
1795 uint64_t(0));
1796 MadeChange = true;
1797 }
1798
1799 if (!MadeChange)
1800 return std::nullopt;
1801
1802 SmallVector<Value *, 10> Args(II.args());
1803 Args[0] = Src0;
1804 Args[1] = Src1;
1805
1806 CallInst *NewII = IC.Builder.CreateIntrinsic(
1807 IID, {Src0->getType(), Src1->getType()}, Args, &II);
1808 NewII->takeName(&II);
1809 return IC.replaceInstUsesWith(II, NewII);
1810 }
1811 case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4:
1812 case Intrinsic::amdgcn_wmma_scale_f32_16x16x128_f8f6f4:
1813 case Intrinsic::amdgcn_wmma_scale16_f32_16x16x128_f8f6f4: {
1814 Value *Src0 = II.getArgOperand(1);
1815 Value *Src1 = II.getArgOperand(3);
1816 unsigned FmtA = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
1817 uint64_t FmtB = cast<ConstantInt>(II.getArgOperand(2))->getZExtValue();
1818 auto *Src0Ty = cast<FixedVectorType>(Src0->getType());
1819 auto *Src1Ty = cast<FixedVectorType>(Src1->getType());
1820
1821 bool MadeChange = false;
1822 unsigned Src0NumElts = AMDGPU::wmmaScaleF8F6F4FormatToNumRegs(FmtA);
1823 unsigned Src1NumElts = AMDGPU::wmmaScaleF8F6F4FormatToNumRegs(FmtB);
1824
1825 // Depending on the used format, fewer registers are required so shrink the
1826 // vector type.
1827 if (Src0Ty->getNumElements() > Src0NumElts) {
1828 Src0 = IC.Builder.CreateExtractVector(
1829 FixedVectorType::get(Src0Ty->getElementType(), Src0NumElts), Src0,
1830 IC.Builder.getInt64(0));
1831 MadeChange = true;
1832 }
1833
1834 if (Src1Ty->getNumElements() > Src1NumElts) {
1835 Src1 = IC.Builder.CreateExtractVector(
1836 FixedVectorType::get(Src1Ty->getElementType(), Src1NumElts), Src1,
1837 IC.Builder.getInt64(0));
1838 MadeChange = true;
1839 }
1840
1841 if (!MadeChange)
1842 return std::nullopt;
1843
1844 SmallVector<Value *, 13> Args(II.args());
1845 Args[1] = Src0;
1846 Args[3] = Src1;
1847
1848 CallInst *NewII = IC.Builder.CreateIntrinsic(
1849 IID, {II.getArgOperand(5)->getType(), Src0->getType(), Src1->getType()},
1850 Args, &II);
1851 NewII->takeName(&II);
1852 return IC.replaceInstUsesWith(II, NewII);
1853 }
1854 case Intrinsic::amdgcn_wave_shuffle: {
1855 if (!ST->hasDPP())
1856 return std::nullopt;
1857
1858 return tryWaveShuffleDPP(*ST, IC, II);
1859 }
1860 }
1861 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
1862 AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
1863 return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
1864 }
1865 return std::nullopt;
1866}
1867
1868/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
1869///
1870/// Simplifying an amdgcn image or buffer store intrinsic updates the
1871/// definition of the intrinsic's vector argument, rather than the uses of the
1872/// result, as is done for image and buffer loads.
1873/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
1874/// struct returns.
1877 APInt DemandedElts,
1878 int DMaskIdx, bool IsLoad) {
1879
1880 auto *IIVTy = cast<FixedVectorType>(IsLoad ? II.getType()
1881 : II.getOperand(0)->getType());
1882 unsigned VWidth = IIVTy->getNumElements();
1883 if (VWidth == 1)
1884 return nullptr;
1885 Type *EltTy = IIVTy->getElementType();
1886
1889
1890 // Assume the arguments are unchanged and later override them, if needed.
1891 SmallVector<Value *, 16> Args(II.args());
1892
1893 if (DMaskIdx < 0) {
1894 // Buffer case.
1895
1896 const unsigned ActiveBits = DemandedElts.getActiveBits();
1897 const unsigned UnusedComponentsAtFront = DemandedElts.countr_zero();
1898
1899 // Start assuming the prefix of elements is demanded, but possibly clear
1900 // some other bits if there are trailing zeros (unused components at front)
1901 // and update offset.
1902 DemandedElts = (1 << ActiveBits) - 1;
1903
1904 if (UnusedComponentsAtFront > 0) {
1905 static const unsigned InvalidOffsetIdx = 0xf;
1906
1907 unsigned OffsetIdx;
1908 switch (II.getIntrinsicID()) {
1909 case Intrinsic::amdgcn_raw_buffer_load:
1910 case Intrinsic::amdgcn_raw_ptr_buffer_load:
1911 OffsetIdx = 1;
1912 break;
1913 case Intrinsic::amdgcn_s_buffer_load:
1914 // If resulting type is vec3, there is no point in trimming the
1915 // load with updated offset, as the vec3 would most likely be widened to
1916 // vec4 anyway during lowering.
1917 if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
1918 OffsetIdx = InvalidOffsetIdx;
1919 else
1920 OffsetIdx = 1;
1921 break;
1922 case Intrinsic::amdgcn_struct_buffer_load:
1923 case Intrinsic::amdgcn_struct_ptr_buffer_load:
1924 OffsetIdx = 2;
1925 break;
1926 default:
1927 // TODO: handle tbuffer* intrinsics.
1928 OffsetIdx = InvalidOffsetIdx;
1929 break;
1930 }
1931
1932 if (OffsetIdx != InvalidOffsetIdx) {
1933 // Clear demanded bits and update the offset.
1934 DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
1935 auto *Offset = Args[OffsetIdx];
1936 unsigned SingleComponentSizeInBits =
1937 IC.getDataLayout().getTypeSizeInBits(EltTy);
1938 unsigned OffsetAdd =
1939 UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
1940 auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
1941 Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
1942 }
1943 }
1944 } else {
1945 // Image case.
1946
1947 ConstantInt *DMask = cast<ConstantInt>(Args[DMaskIdx]);
1948 unsigned DMaskVal = DMask->getZExtValue() & 0xf;
1949
1950 // dmask 0 has special semantics, do not simplify.
1951 if (DMaskVal == 0)
1952 return nullptr;
1953
1954 // Mask off values that are undefined because the dmask doesn't cover them
1955 DemandedElts &= (1 << llvm::popcount(DMaskVal)) - 1;
1956
1957 unsigned NewDMaskVal = 0;
1958 unsigned OrigLdStIdx = 0;
1959 for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
1960 const unsigned Bit = 1 << SrcIdx;
1961 if (!!(DMaskVal & Bit)) {
1962 if (!!DemandedElts[OrigLdStIdx])
1963 NewDMaskVal |= Bit;
1964 OrigLdStIdx++;
1965 }
1966 }
1967
1968 if (DMaskVal != NewDMaskVal)
1969 Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
1970 }
1971
1972 unsigned NewNumElts = DemandedElts.popcount();
1973 if (!NewNumElts)
1974 return PoisonValue::get(IIVTy);
1975
1976 if (NewNumElts >= VWidth && DemandedElts.isMask()) {
1977 if (DMaskIdx >= 0)
1978 II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
1979 return nullptr;
1980 }
1981
1982 // Validate function argument and return types, extracting overloaded types
1983 // along the way.
1984 SmallVector<Type *, 6> OverloadTys;
1985 if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
1986 return nullptr;
1987
1988 Type *NewTy =
1989 (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);
1990 OverloadTys[0] = NewTy;
1991
1992 if (!IsLoad) {
1993 SmallVector<int, 8> EltMask;
1994 for (unsigned OrigStoreIdx = 0; OrigStoreIdx < VWidth; ++OrigStoreIdx)
1995 if (DemandedElts[OrigStoreIdx])
1996 EltMask.push_back(OrigStoreIdx);
1997
1998 if (NewNumElts == 1)
1999 Args[0] = IC.Builder.CreateExtractElement(II.getOperand(0), EltMask[0]);
2000 else
2001 Args[0] = IC.Builder.CreateShuffleVector(II.getOperand(0), EltMask);
2002 }
2003
2004 CallInst *NewCall =
2005 IC.Builder.CreateIntrinsic(II.getIntrinsicID(), OverloadTys, Args);
2006 NewCall->takeName(&II);
2007 NewCall->copyMetadata(II);
2008
2009 if (IsLoad) {
2010 if (NewNumElts == 1) {
2011 return IC.Builder.CreateInsertElement(PoisonValue::get(IIVTy), NewCall,
2012 DemandedElts.countr_zero());
2013 }
2014
2015 SmallVector<int, 8> EltMask;
2016 unsigned NewLoadIdx = 0;
2017 for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
2018 if (!!DemandedElts[OrigLoadIdx])
2019 EltMask.push_back(NewLoadIdx++);
2020 else
2021 EltMask.push_back(NewNumElts);
2022 }
2023
2024 auto *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);
2025
2026 return Shuffle;
2027 }
2028
2029 return NewCall;
2030}
2031
// Narrow an AMDGCN lane intrinsic (dispatched from amdgcn_readfirstlane below)
// to operate only on the demanded contiguous element range: extract the
// demanded elements, call a remangled intrinsic on the narrower type, and
// scatter the result back into a poison vector of the original width.
// NOTE(review): this listing is an HTML extraction; the function's signature
// line and two declarations (the OperandBundleDef SmallVector `OpBundles` and
// the `Module *M` used below) were dropped by the extractor — confirm against
// the original AMDGPUInstCombineIntrinsic.cpp before relying on this text.
2033 InstCombiner &IC, IntrinsicInst &II, const APInt &DemandedElts,
2034 APInt &UndefElts) const {
// Only fixed-width vector results can be narrowed; scalars bail out.
2035 auto *VT = dyn_cast<FixedVectorType>(II.getType());
2036 if (!VT)
2037 return nullptr;
2038
// Demanded elements form the window [FirstElt, LastElt]; MaskLen is its size.
// Elements inside the window that are NOT demanded are handled by the -1
// (poison) entries in the shuffle masks built further down.
2039 const unsigned FirstElt = DemandedElts.countr_zero();
2040 const unsigned LastElt = DemandedElts.getActiveBits() - 1;
2041 const unsigned MaskLen = LastElt - FirstElt + 1;
2042
// If the window already spans the whole vector (and isn't the scalar case),
// narrowing gains nothing.
2043 unsigned OldNumElts = VT->getNumElements();
2044 if (MaskLen == OldNumElts && MaskLen != 1)
2045 return nullptr;
2046
// A single demanded element becomes a scalar intrinsic; otherwise a shorter
// vector of the same element type.
2047 Type *EltTy = VT->getElementType();
2048 Type *NewVT = MaskLen == 1 ? EltTy : FixedVectorType::get(EltTy, MaskLen);
2049
2050 // Theoretically we should support these intrinsics for any legal type. Avoid
2051 // introducing cases that aren't direct register types like v3i16.
2052 if (!isTypeLegal(NewVT))
2053 return nullptr;
2054
2055 Value *Src = II.getArgOperand(0);
2056
2057 // Make sure convergence tokens are preserved.
2058 // TODO: CreateIntrinsic should allow directly copying bundles
2060 II.getOperandBundlesAsDefs(OpBundles);
2061
// Re-mangle the intrinsic declaration for the narrowed overload type.
2063 Function *Remangled =
2064 Intrinsic::getOrInsertDeclaration(M, II.getIntrinsicID(), {NewVT});
2065
// Scalar case: extract the lone demanded element, call the scalar intrinsic,
// and re-insert the result at its original position in a poison vector.
2066 if (MaskLen == 1) {
2067 Value *Extract = IC.Builder.CreateExtractElement(Src, FirstElt);
2068
2069 // TODO: Preserve callsite attributes?
2070 CallInst *NewCall = IC.Builder.CreateCall(Remangled, {Extract}, OpBundles);
2071
2072 return IC.Builder.CreateInsertElement(PoisonValue::get(II.getType()),
2073 NewCall, FirstElt);
2074 }
2075
// Vector case: gather the demanded window out of Src. Non-demanded slots in
// the window keep the -1 (poison) mask element.
2076 SmallVector<int> ExtractMask(MaskLen, -1);
2077 for (unsigned I = 0; I != MaskLen; ++I) {
2078 if (DemandedElts[FirstElt + I])
2079 ExtractMask[I] = FirstElt + I;
2080 }
2081
2082 Value *Extract = IC.Builder.CreateShuffleVector(Src, ExtractMask);
2083
2084 // TODO: Preserve callsite attributes?
2085 CallInst *NewCall = IC.Builder.CreateCall(Remangled, {Extract}, OpBundles);
2086
// Scatter the narrowed result back to the original element positions; all
// other lanes of the full-width result are poison.
2087 SmallVector<int> InsertMask(OldNumElts, -1);
2088 for (unsigned I = 0; I != MaskLen; ++I) {
2089 if (DemandedElts[FirstElt + I])
2090 InsertMask[FirstElt + I] = I;
2091 }
2092
2093 // FIXME: If the call has a convergence bundle, we end up leaving the dead
2094 // call behind.
2095 return IC.Builder.CreateShuffleVector(NewCall, InsertMask);
2096}
2097
// SimplifyDemandedVectorElts entry point for AMDGCN intrinsics: dispatches to
// the lane-intrinsic narrowing above, or to the buffer/image demanded-elements
// simplification, based on the intrinsic ID.
// NOTE(review): HTML extraction dropped the signature line (orig line 2098,
// the GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic header) — confirm
// against the original source file.
2099 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
2100 APInt &UndefElts2, APInt &UndefElts3,
2101 std::function<void(Instruction *, unsigned, APInt, APInt &)>
2102 SimplifyAndSetOp) const {
2103 switch (II.getIntrinsicID()) {
2104 case Intrinsic::amdgcn_readfirstlane:
// First let the generic machinery simplify the source operand, then try to
// narrow the lane intrinsic to only the demanded element range.
2105 SimplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
2106 return simplifyAMDGCNLaneIntrinsicDemanded(IC, II, DemandedElts, UndefElts);
// Buffer-style loads: no dmask operand, so the default DMaskIdx of -1 is used.
2107 case Intrinsic::amdgcn_raw_buffer_load:
2108 case Intrinsic::amdgcn_raw_ptr_buffer_load:
2109 case Intrinsic::amdgcn_raw_buffer_load_format:
2110 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
2111 case Intrinsic::amdgcn_raw_tbuffer_load:
2112 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
2113 case Intrinsic::amdgcn_s_buffer_load:
2114 case Intrinsic::amdgcn_struct_buffer_load:
2115 case Intrinsic::amdgcn_struct_ptr_buffer_load:
2116 case Intrinsic::amdgcn_struct_buffer_load_format:
2117 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
2118 case Intrinsic::amdgcn_struct_tbuffer_load:
2119 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
2120 return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
2121 default: {
// Image intrinsics carry their dmask as operand 0; the searchable table
// identifies which intrinsic IDs qualify.
2122 if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
2123 return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
2124 }
2125 break;
2126 }
2127 }
// No AMDGCN-specific simplification applies.
2128 return std::nullopt;
2129}
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static bool canContractSqrtToRsq(const FPMathOperator *SqrtOp)
Return true if it's legal to contract llvm.amdgcn.rcp(llvm.sqrt)
static bool isTriviallyUniform(const Use &U)
Return true if we can easily prove that use U is uniform.
static CallInst * rewriteCall(IRBuilderBase &B, CallInst &Old, Function &NewCallee, ArrayRef< Value * > Ops)
static Value * convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder)
static APInt trimTrailingZerosInVector(InstCombiner &IC, Value *UseV, Instruction *I)
static APInt defaultComponentBroadcast(Value *V)
static std::optional< Instruction * > tryWaveShuffleDPP(const GCNSubtarget &ST, InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > modifyIntrinsicCall(IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr, InstCombiner &IC, std::function< void(SmallVectorImpl< Value * > &, SmallVectorImpl< Type * > &)> Func)
Applies Func(OldIntr.Args, OldIntr.ArgTys), creates intrinsic call with modified arguments (based on ...
static bool isThreadID(const GCNSubtarget &ST, Value *V)
static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1, const APFloat &Src2)
static Value * simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, int DMaskIdx=-1, bool IsLoad=true)
Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
static std::optional< Instruction * > simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr, IntrinsicInst &II, InstCombiner &IC)
static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat)
static Value * matchFPExtFromF16(Value *Arg)
Match an fpext from half to float, or a constant we can convert.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
This file provides a TargetTransformInfoImplBase conforming object specific to the AMDGPU target machine.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
This file provides the interface for the instcombine pass implementation.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define I(x, y, z)
Definition MD5.cpp:57
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
if(PassOpts->AAPipeline)
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
cmpResult
IEEE-754R 5.11: Floating Point Comparison Relations.
Definition APFloat.h:334
static constexpr roundingMode rmTowardZero
Definition APFloat.h:348
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:344
static const fltSemantics & IEEEhalf()
Definition APFloat.h:294
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition APFloat.h:1175
opStatus divide(const APFloat &RHS, roundingMode RM)
Definition APFloat.h:1263
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition APFloat.cpp:5976
bool isPosInfinity() const
Definition APFloat.h:1529
const fltSemantics & getSemantics() const
Definition APFloat.h:1524
APFloat makeQuiet() const
Assuming this is an IEEE-754 NaN value, quiet its signaling bit.
Definition APFloat.h:1371
bool isNaN() const
Definition APFloat.h:1514
bool isSignaling() const
Definition APFloat.h:1518
APInt bitcastToAPInt() const
Definition APFloat.h:1408
bool isNegInfinity() const
Definition APFloat.h:1530
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition APFloat.h:1134
cmpResult compare(const APFloat &RHS) const
Definition APFloat.h:1459
bool isInfinity() const
Definition APFloat.h:1513
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition APInt.h:1421
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1555
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1685
LLVM_ABI uint64_t extractBitsAsZExtValue(unsigned numBits, unsigned bitPosition) const
Definition APInt.cpp:520
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1527
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition APInt.cpp:936
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1654
bool isMask(unsigned numBits) const
Definition APInt.h:489
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
static LLVM_ABI Attribute getWithDereferenceableBytes(LLVMContext &Context, uint64_t Bytes)
LLVM_ABI const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does no...
bool isTypeLegal(Type *Ty) const override
void addFnAttr(Attribute::AttrKind Kind)
Adds the attribute to the function.
LLVM_ABI void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_NE
not equal
Definition InstrTypes.h:698
bool isSigned() const
Definition InstrTypes.h:930
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Definition InstrTypes.h:827
bool isFPPredicate() const
Definition InstrTypes.h:782
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
ConstantFP - Floating Point Values [float, double].
Definition Constants.h:282
const APFloat & getValueAPF() const
Definition Constants.h:325
static LLVM_ABI Constant * getInfinity(Type *Ty, bool Negative=false)
static LLVM_ABI Constant * getZero(Type *Ty, bool Negative=false)
static LLVM_ABI Constant * getNaN(Type *Ty, bool Negative=false, uint64_t Payload=0)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:168
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:159
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
LLVM_ABI bool isNullValue() const
Return true if this is the value that would be returned by getNullValue.
Definition Constants.cpp:74
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition DataLayout.h:771
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
This class represents an extension of floating point types.
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:200
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
Definition Operator.h:333
bool hasApproxFunc() const
Test if this operation allows approximations of math library functions or intrinsics.
Definition Operator.h:328
LLVM_ABI float getFPAccuracy() const
Get the maximum error permitted by this operation in ULPs.
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
bool allowContract() const
Definition FMF.h:72
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:802
bool simplifyDemandedLaneMaskArg(InstCombiner &IC, IntrinsicInst &II, unsigned LaneAgIdx) const
Simplify a lane index operand (e.g.
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override
Instruction * hoistLaneIntrinsicThroughOperand(InstCombiner &IC, IntrinsicInst &II) const
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const override
KnownIEEEMode fpenvIEEEMode(const Instruction &I) const
Return KnownIEEEMode::On if we know if the use context can assume "amdgpu-ieee"="true" and KnownIEEEM...
Value * simplifyAMDGCNLaneIntrinsicDemanded(InstCombiner &IC, IntrinsicInst &II, const APInt &DemandedElts, APInt &UndefElts) const
bool canSimplifyLegacyMulToMul(const Instruction &I, const Value *Op0, const Value *Op1, InstCombiner &IC) const
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
CallInst * CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name="")
Create a call to the vector.extract intrinsic.
Definition IRBuilder.h:1096
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2561
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2549
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition IRBuilder.h:574
Value * CreateSExt(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2066
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition IRBuilder.h:1516
BasicBlock * GetInsertBlock() const
Definition IRBuilder.h:201
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2312
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition IRBuilder.h:527
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Value * CreateMaxNum(Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create call to the maxnum intrinsic.
Definition IRBuilder.h:1027
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1495
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition IRBuilder.h:2054
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2583
Value * CreateMaximumNum(Value *LHS, Value *RHS, const Twine &Name="")
Create call to the maximum intrinsic.
Definition IRBuilder.h:1055
Value * CreateMinNum(Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create call to the minnum intrinsic.
Definition IRBuilder.h:1015
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1406
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2487
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
Value * CreateFAddFMF(Value *L, Value *R, FMFSource FMFSource, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1621
Value * CreateMinimumNum(Value *LHS, Value *RHS, const Twine &Name="")
Create call to the minimumnum intrinsic.
Definition IRBuilder.h:1049
Value * CreateAShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition IRBuilder.h:1535
Value * CreateFMulFMF(Value *L, Value *R, FMFSource FMFSource, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1659
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2788
The core instruction combiner logic.
const DataLayout & getDataLayout() const
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
IRBuilder< TargetFolder, IRBuilderCallbackInserter > BuilderTy
An IRBuilder that automatically inserts new instructions into the worklist.
DominatorTree & getDominatorTree() const
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
virtual bool SimplifyDemandedBits(Instruction *I, unsigned OpNo, const APInt &DemandedMask, KnownBits &Known, const SimplifyQuery &Q, unsigned Depth=0)=0
static Value * stripSignOnlyFPOps(Value *Val)
Ignore all operations which only change the sign of a value, returning the underlying magnitude value...
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
BuilderTy & Builder
const SimplifyQuery & getSimplifyQuery() const
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void copyFastMathFlags(FastMathFlags FMF)
Convenience function for transferring all fast-math flag values to this instruction,...
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
Metadata node.
Definition Metadata.h:1080
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1572
static LLVM_ABI MDString * get(LLVMContext &Context, StringRef Str)
Definition Metadata.cpp:614
static LLVM_ABI MetadataAsValue * get(LLVMContext &Context, Metadata *MD)
Definition Metadata.cpp:110
Root of the metadata hierarchy.
Definition Metadata.h:64
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:153
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
static LLVM_ABI IntegerType * getInt16Ty(LLVMContext &C)
Definition Type.cpp:295
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI Type * getHalfTy(LLVMContext &C)
Definition Type.cpp:282
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
const Use & getOperandUse(unsigned i) const
Definition User.h:220
void setOperand(unsigned i, Value *Val)
Definition User.h:212
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
LLVM_ABI bool hasOneUser() const
Return true if there is exactly one user of this value.
Definition Value.cpp:166
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:259
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:403
const ParentTy * getParent() const
Definition ilist_node.h:34
CallInst * Call
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_READONLY const MIMGOffsetMappingInfo * getMIMGOffsetMappingInfo(unsigned Offset)
uint8_t wmmaScaleF8F6F4FormatToNumRegs(unsigned Fmt)
const ImageDimIntrinsicInfo * getImageDimIntrinsicByBaseOpcode(unsigned BaseOpcode, unsigned Dim)
LLVM_READONLY const MIMGMIPMappingInfo * getMIMGMIPMappingInfo(unsigned MIP)
bool isArgPassedInSGPR(const Argument *A)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
LLVM_READONLY const MIMGBiasMappingInfo * getMIMGBiasMappingInfo(unsigned Bias)
LLVM_READONLY const MIMGLZMappingInfo * getMIMGLZMappingInfo(unsigned L)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
LLVM_ABI bool getIntrinsicSignature(Intrinsic::ID, FunctionType *FT, SmallVectorImpl< Type * > &ArgTys)
Gets the type arguments of an intrinsic call by matching type constraints specified by the ....
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
cstfp_pred_ty< is_any_zero_fp > m_AnyZeroFP()
Match a floating-point negative zero or positive zero.
ap_match< APFloat > m_APFloat(const APFloat *&Res)
Match a ConstantFP or splatted ConstantVector, binding the specified pointer to the contained APFloat...
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
CastInst_match< OpTy, FPExtInst > m_FPExt(const OpTy &Op)
class_match< ConstantFP > m_ConstantFP()
Match an arbitrary ConstantFP and ignore it.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
cstfp_pred_ty< is_finitenonzero > m_FiniteNonZero()
Match a finite non-zero FP constant.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
@ Offset
Definition DWP.cpp:532
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI Constant * ConstantFoldCompareInstOperands(unsigned Predicate, Constant *LHS, Constant *RHS, const DataLayout &DL, const TargetLibraryInfo *TLI=nullptr, const Instruction *I=nullptr)
Attempt to constant fold a compare instruction (icmp/fcmp) with the specified operands.
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
APFloat frexp(const APFloat &X, int &Exp, APFloat::roundingMode RM)
Equivalent of C standard library function.
Definition APFloat.h:1622
LLVM_READONLY APFloat maxnum(const APFloat &A, const APFloat &B)
Implements IEEE-754 2008 maxNum semantics.
Definition APFloat.h:1665
APFloat scalbn(APFloat X, int Exp, APFloat::roundingMode RM)
Returns: X * 2^Exp for integral exponents.
Definition APFloat.h:1610
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
constexpr int PoisonMaskElem
@ FMul
Product of floats.
@ FAdd
Sum of floats.
LLVM_ABI Value * findScalarElement(Value *V, unsigned EltNo)
Given a vector and an element number, see if the scalar value is already around as a register,...
@ NearestTiesToEven
roundTiesToEven.
LLVM_ABI bool isKnownNeverInfOrNaN(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if the floating-point value can never contain a NaN or infinity.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
constexpr uint64_t Make_64(uint32_t High, uint32_t Low)
Make a 64-bit integer from a high / low pair of 32-bit integers.
Definition MathExtras.h:160
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
bool isConstant() const
Returns true if we know the value of all bits.
Definition KnownBits.h:54
const APInt & getConstant() const
Returns the value when all bits have a known value.
Definition KnownBits.h:60
SimplifyQuery getWithInstruction(const Instruction *I) const
LLVM_ABI bool isUndefValue(Value *V) const
If CanUseUndef is true, returns whether V is undef.