LLVM 23.0.0git
AMDGPUInstCombineIntrinsic.cpp
Go to the documentation of this file.
1//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass --===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// \file
10// This file implements a TargetTransformInfo analysis pass specific to the
11// AMDGPU target machine. It uses the target's detailed information to provide
12// more precise answers to certain TTI queries, while letting the target
13// independent and default TTI implementations handle the rest.
14//
15//===----------------------------------------------------------------------===//
16
17#include "AMDGPUInstrInfo.h"
19#include "GCNSubtarget.h"
20#include "SIDefines.h"
22#include "llvm/ADT/STLExtras.h"
23#include "llvm/ADT/Sequence.h"
26#include "llvm/IR/Constants.h"
27#include "llvm/IR/Dominators.h"
28#include "llvm/IR/IntrinsicsAMDGPU.h"
31#include <optional>
32
33using namespace llvm;
34using namespace llvm::PatternMatch;
35
36#define DEBUG_TYPE "AMDGPUtti"
37
38namespace {
39
40struct AMDGPUImageDMaskIntrinsic {
41 unsigned Intr;
42};
43
44#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
45#include "AMDGPUGenSearchableTables.inc"
46
47} // end anonymous namespace
48
49// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
50//
51// A single NaN input is folded to minnum, so we rely on that folding for
52// handling NaNs.
53static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
54 const APFloat &Src2) {
55 APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);
56
57 APFloat::cmpResult Cmp0 = Max3.compare(Src0);
58 assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
59 if (Cmp0 == APFloat::cmpEqual)
60 return maxnum(Src1, Src2);
61
62 APFloat::cmpResult Cmp1 = Max3.compare(Src1);
63 assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
64 if (Cmp1 == APFloat::cmpEqual)
65 return maxnum(Src0, Src2);
66
67 return maxnum(Src0, Src1);
68}
69
70// Check if a value can be converted to a 16-bit value without losing
71// precision.
72// The value is expected to be either a float (IsFloat = true) or an unsigned
73// integer (IsFloat = false).
74static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat) {
75 Type *VTy = V.getType();
76 if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
77 // The value is already 16-bit, so we don't want to convert to 16-bit again!
78 return false;
79 }
80 if (IsFloat) {
81 if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
82 // We need to check that if we cast the index down to a half, we do not
83 // lose precision.
84 APFloat FloatValue(ConstFloat->getValueAPF());
85 bool LosesInfo = true;
87 &LosesInfo);
88 return !LosesInfo;
89 }
90 } else {
91 if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(&V)) {
92 // We need to check that if we cast the index down to an i16, we do not
93 // lose precision.
94 APInt IntValue(ConstInt->getValue());
95 return IntValue.getActiveBits() <= 16;
96 }
97 }
98
99 Value *CastSrc;
100 bool IsExt = IsFloat ? match(&V, m_FPExt(PatternMatch::m_Value(CastSrc)))
101 : match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)));
102 if (IsExt) {
103 Type *CastSrcTy = CastSrc->getType();
104 if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
105 return true;
106 }
107
108 return false;
109}
110
111// Convert a value to 16-bit.
113 Type *VTy = V.getType();
115 return cast<Instruction>(&V)->getOperand(0);
116 if (VTy->isIntegerTy())
117 return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
118 if (VTy->isFloatingPointTy())
119 return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));
120
121 llvm_unreachable("Should never be called!");
122}
123
124/// Applies Func(OldIntr.Args, OldIntr.ArgTys), creates intrinsic call with
125/// modified arguments (based on OldIntr) and replaces InstToReplace with
126/// this newly created intrinsic call.
127static std::optional<Instruction *> modifyIntrinsicCall(
128 IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr,
129 InstCombiner &IC,
130 std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)>
131 Func) {
132 SmallVector<Type *, 4> OverloadTys;
133 if (!Intrinsic::isSignatureValid(OldIntr.getCalledFunction(), OverloadTys))
134 return std::nullopt;
135
136 SmallVector<Value *, 8> Args(OldIntr.args());
137
138 // Modify arguments and types
139 Func(Args, OverloadTys);
140
141 CallInst *NewCall = IC.Builder.CreateIntrinsic(NewIntr, OverloadTys, Args);
142 NewCall->takeName(&OldIntr);
143 NewCall->copyMetadata(OldIntr);
144 if (isa<FPMathOperator>(NewCall))
145 NewCall->copyFastMathFlags(&OldIntr);
146 // Copy attributes
147 AttributeList OldAttrList = OldIntr.getAttributes();
148 NewCall->setAttributes(OldAttrList);
149
150 // Erase and replace uses
151 if (!InstToReplace.getType()->isVoidTy())
152 IC.replaceInstUsesWith(InstToReplace, NewCall);
153
154 bool RemoveOldIntr = &OldIntr != &InstToReplace;
155
156 auto *RetValue = IC.eraseInstFromFunction(InstToReplace);
157 if (RemoveOldIntr)
158 IC.eraseInstFromFunction(OldIntr);
159
160 return RetValue;
161}
162
163static std::optional<Instruction *>
165 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
167 // Optimize _L to _LZ when _L is zero
168 if (const auto *LZMappingInfo =
170 if (auto *ConstantLod =
171 dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->LodIndex))) {
172 if (ConstantLod->isZero() || ConstantLod->isNegative()) {
173 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
175 ImageDimIntr->Dim);
176 return modifyIntrinsicCall(
177 II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
178 Args.erase(Args.begin() + ImageDimIntr->LodIndex);
179 });
180 }
181 }
182 }
183
184 // Optimize _mip away, when 'lod' is zero
185 if (const auto *MIPMappingInfo =
187 if (auto *ConstantMip =
188 dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->MipIndex))) {
189 if (ConstantMip->isZero()) {
190 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
191 AMDGPU::getImageDimIntrinsicByBaseOpcode(MIPMappingInfo->NONMIP,
192 ImageDimIntr->Dim);
193 return modifyIntrinsicCall(
194 II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
195 Args.erase(Args.begin() + ImageDimIntr->MipIndex);
196 });
197 }
198 }
199 }
200
201 // Optimize _bias away when 'bias' is zero
202 if (const auto *BiasMappingInfo =
204 if (auto *ConstantBias =
205 dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->BiasIndex))) {
206 if (ConstantBias->isZero()) {
207 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
208 AMDGPU::getImageDimIntrinsicByBaseOpcode(BiasMappingInfo->NoBias,
209 ImageDimIntr->Dim);
210 return modifyIntrinsicCall(
211 II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
212 Args.erase(Args.begin() + ImageDimIntr->BiasIndex);
213 ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg);
214 });
215 }
216 }
217 }
218
219 // Optimize _offset away when 'offset' is zero
220 if (const auto *OffsetMappingInfo =
222 if (auto *ConstantOffset =
223 dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->OffsetIndex))) {
224 if (ConstantOffset->isZero()) {
225 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
227 OffsetMappingInfo->NoOffset, ImageDimIntr->Dim);
228 return modifyIntrinsicCall(
229 II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
230 Args.erase(Args.begin() + ImageDimIntr->OffsetIndex);
231 });
232 }
233 }
234 }
235
236 // Try to use D16
237 if (ST->hasD16Images()) {
238
239 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
241
242 if (BaseOpcode->HasD16) {
243
244 // If the only use of image intrinsic is a fptrunc (with conversion to
245 // half) then both fptrunc and image intrinsic will be replaced with image
246 // intrinsic with D16 flag.
247 if (II.hasOneUse()) {
248 Instruction *User = II.user_back();
249
250 if (User->getOpcode() == Instruction::FPTrunc &&
252
253 return modifyIntrinsicCall(II, *User, ImageDimIntr->Intr, IC,
254 [&](auto &Args, auto &ArgTys) {
255 // Change return type of image intrinsic.
256 // Set it to return type of fptrunc.
257 ArgTys[0] = User->getType();
258 });
259 }
260 }
261
262 // Only perform D16 folding if every user of the image sample is
263 // an ExtractElementInst immediately followed by an FPTrunc to half.
265 ExtractTruncPairs;
266 bool AllHalfExtracts = true;
267
268 for (User *U : II.users()) {
269 auto *Ext = dyn_cast<ExtractElementInst>(U);
270 if (!Ext || !Ext->hasOneUse()) {
271 AllHalfExtracts = false;
272 break;
273 }
274
275 auto *Tr = dyn_cast<FPTruncInst>(*Ext->user_begin());
276 if (!Tr || !Tr->getType()->isHalfTy()) {
277 AllHalfExtracts = false;
278 break;
279 }
280
281 ExtractTruncPairs.emplace_back(Ext, Tr);
282 }
283
284 if (!ExtractTruncPairs.empty() && AllHalfExtracts) {
285 auto *VecTy = cast<VectorType>(II.getType());
286 Type *HalfVecTy =
287 VecTy->getWithNewType(Type::getHalfTy(II.getContext()));
288
289 // Obtain the original image sample intrinsic's signature
290 // and replace its return type with the half-vector for D16 folding
291 SmallVector<Type *, 8> OverloadTys;
292 if (!Intrinsic::isSignatureValid(II.getCalledFunction(), OverloadTys))
293 return std::nullopt;
294
295 OverloadTys[0] = HalfVecTy;
296 Module *M = II.getModule();
298 M, ImageDimIntr->Intr, OverloadTys);
299
300 II.mutateType(HalfVecTy);
301 II.setCalledFunction(HalfDecl);
302
303 IRBuilder<> Builder(II.getContext());
304 for (auto &[Ext, Tr] : ExtractTruncPairs) {
305 Value *Idx = Ext->getIndexOperand();
306
307 Builder.SetInsertPoint(Tr);
308
309 Value *HalfExtract = Builder.CreateExtractElement(&II, Idx);
310 HalfExtract->takeName(Tr);
311
312 Tr->replaceAllUsesWith(HalfExtract);
313 }
314
315 for (auto &[Ext, Tr] : ExtractTruncPairs) {
316 IC.eraseInstFromFunction(*Tr);
317 IC.eraseInstFromFunction(*Ext);
318 }
319
320 return &II;
321 }
322 }
323 }
324
325 // Try to use A16 or G16
326 if (!ST->hasA16() && !ST->hasG16())
327 return std::nullopt;
328
329 // Address is interpreted as float if the instruction has a sampler or as
330 // unsigned int if there is no sampler.
331 bool HasSampler =
333 bool FloatCoord = false;
334 // true means derivatives can be converted to 16 bit, coordinates not
335 bool OnlyDerivatives = false;
336
337 for (unsigned OperandIndex = ImageDimIntr->GradientStart;
338 OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
339 Value *Coord = II.getOperand(OperandIndex);
340 // If the values are not derived from 16-bit values, we cannot optimize.
341 if (!canSafelyConvertTo16Bit(*Coord, HasSampler)) {
342 if (OperandIndex < ImageDimIntr->CoordStart ||
343 ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
344 return std::nullopt;
345 }
346 // All gradients can be converted, so convert only them
347 OnlyDerivatives = true;
348 break;
349 }
350
351 assert(OperandIndex == ImageDimIntr->GradientStart ||
352 FloatCoord == Coord->getType()->isFloatingPointTy());
353 FloatCoord = Coord->getType()->isFloatingPointTy();
354 }
355
356 if (!OnlyDerivatives && !ST->hasA16())
357 OnlyDerivatives = true; // Only supports G16
358
359 // Check if there is a bias parameter and if it can be converted to f16
360 if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
361 Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
362 assert(HasSampler &&
363 "Only image instructions with a sampler can have a bias");
364 if (!canSafelyConvertTo16Bit(*Bias, HasSampler))
365 OnlyDerivatives = true;
366 }
367
368 if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart ==
369 ImageDimIntr->CoordStart))
370 return std::nullopt;
371
372 Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
373 : Type::getInt16Ty(II.getContext());
374
375 return modifyIntrinsicCall(
376 II, II, II.getIntrinsicID(), IC, [&](auto &Args, auto &ArgTys) {
377 ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
378 if (!OnlyDerivatives) {
379 ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
380
381 // Change the bias type
382 if (ImageDimIntr->NumBiasArgs != 0)
383 ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(II.getContext());
384 }
385
386 unsigned EndIndex =
387 OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
388 for (unsigned OperandIndex = ImageDimIntr->GradientStart;
389 OperandIndex < EndIndex; OperandIndex++) {
390 Args[OperandIndex] =
391 convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
392 }
393
394 // Convert the bias
395 if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
396 Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
397 Args[ImageDimIntr->BiasIndex] = convertTo16Bit(*Bias, IC.Builder);
398 }
399 });
400}
401
403 const Value *Op0, const Value *Op1,
404 InstCombiner &IC) const {
405 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
406 // infinity, gives +0.0. If we can prove we don't have one of the special
407 // cases then we can use a normal multiply instead.
408 // TODO: Create and use isKnownFiniteNonZero instead of just matching
409 // constants here.
412 // One operand is not zero or infinity or NaN.
413 return true;
414 }
415
417 if (isKnownNeverInfOrNaN(Op0, SQ) && isKnownNeverInfOrNaN(Op1, SQ)) {
418 // Neither operand is infinity or NaN.
419 return true;
420 }
421 return false;
422}
423
424/// Match an fpext from half to float, or a constant we can convert.
426 Value *Src = nullptr;
427 ConstantFP *CFP = nullptr;
428 if (match(Arg, m_OneUse(m_FPExt(m_Value(Src))))) {
429 if (Src->getType()->isHalfTy())
430 return Src;
431 } else if (match(Arg, m_ConstantFP(CFP))) {
432 bool LosesInfo;
433 APFloat Val(CFP->getValueAPF());
435 if (!LosesInfo)
436 return ConstantFP::get(Type::getHalfTy(Arg->getContext()), Val);
437 }
438 return nullptr;
439}
440
441// Trim all zero components from the end of the vector \p UseV and return
442// an appropriate bitset with known elements.
444 Instruction *I) {
445 auto *VTy = cast<FixedVectorType>(UseV->getType());
446 unsigned VWidth = VTy->getNumElements();
447 APInt DemandedElts = APInt::getAllOnes(VWidth);
448
449 for (int i = VWidth - 1; i > 0; --i) {
450 auto *Elt = findScalarElement(UseV, i);
451 if (!Elt)
452 break;
453
454 if (auto *ConstElt = dyn_cast<Constant>(Elt)) {
455 if (!ConstElt->isNullValue() && !isa<UndefValue>(Elt))
456 break;
457 } else {
458 break;
459 }
460
461 DemandedElts.clearBit(i);
462 }
463
464 return DemandedElts;
465}
466
467// Trim elements of the end of the vector \p V, if they are
468// equal to the first element of the vector.
470 auto *VTy = cast<FixedVectorType>(V->getType());
471 unsigned VWidth = VTy->getNumElements();
472 APInt DemandedElts = APInt::getAllOnes(VWidth);
473 Value *FirstComponent = findScalarElement(V, 0);
474
475 SmallVector<int> ShuffleMask;
476 if (auto *SVI = dyn_cast<ShuffleVectorInst>(V))
477 SVI->getShuffleMask(ShuffleMask);
478
479 for (int I = VWidth - 1; I > 0; --I) {
480 if (ShuffleMask.empty()) {
481 auto *Elt = findScalarElement(V, I);
482 if (!Elt || (Elt != FirstComponent && !isa<UndefValue>(Elt)))
483 break;
484 } else {
485 // Detect identical elements in the shufflevector result, even though
486 // findScalarElement cannot tell us what that element is.
487 if (ShuffleMask[I] != ShuffleMask[0] && ShuffleMask[I] != PoisonMaskElem)
488 break;
489 }
490 DemandedElts.clearBit(I);
491 }
492
493 return DemandedElts;
494}
495
498 APInt DemandedElts,
499 int DMaskIdx = -1,
500 bool IsLoad = true);
501
502/// Return true if it's legal to contract llvm.amdgcn.rcp(llvm.sqrt)
503static bool canContractSqrtToRsq(const FPMathOperator *SqrtOp) {
504 return (SqrtOp->getType()->isFloatTy() &&
505 (SqrtOp->hasApproxFunc() || SqrtOp->getFPAccuracy() >= 1.0f)) ||
506 SqrtOp->getType()->isHalfTy();
507}
508
509/// Return true if we can easily prove that use U is uniform.
510static bool isTriviallyUniform(const Use &U) {
511 Value *V = U.get();
512 if (isa<Constant>(V))
513 return true;
514 if (const auto *A = dyn_cast<Argument>(V))
516 if (const auto *II = dyn_cast<IntrinsicInst>(V)) {
517 if (!AMDGPU::isIntrinsicAlwaysUniform(II->getIntrinsicID()))
518 return false;
519 // If II and U are in different blocks then there is a possibility of
520 // temporal divergence.
521 return II->getParent() == cast<Instruction>(U.getUser())->getParent();
522 }
523 return false;
524}
525
526/// Simplify a lane index operand (e.g. llvm.amdgcn.readlane src1).
527///
528/// The instruction only reads the low 5 bits for wave32, and 6 bits for wave64.
531 unsigned LaneArgIdx) const {
532 unsigned MaskBits = ST->getWavefrontSizeLog2();
533 APInt DemandedMask(32, maskTrailingOnes<unsigned>(MaskBits));
534
535 KnownBits Known(32);
536 if (IC.SimplifyDemandedBits(&II, LaneArgIdx, DemandedMask, Known))
537 return true;
538
539 if (!Known.isConstant())
540 return false;
541
542 // Out of bounds indexes may appear in wave64 code compiled for wave32.
543 // Unlike the DAG version, SimplifyDemandedBits does not change constants, so
544 // manually fix it up.
545
546 Value *LaneArg = II.getArgOperand(LaneArgIdx);
547 Constant *MaskedConst =
548 ConstantInt::get(LaneArg->getType(), Known.getConstant() & DemandedMask);
549 if (MaskedConst != LaneArg) {
550 II.getOperandUse(LaneArgIdx).set(MaskedConst);
551 return true;
552 }
553
554 return false;
555}
556
558 Function &NewCallee, ArrayRef<Value *> Ops) {
560 Old.getOperandBundlesAsDefs(OpBundles);
561
562 CallInst *NewCall = B.CreateCall(&NewCallee, Ops, OpBundles);
563 NewCall->takeName(&Old);
564 return NewCall;
565}
566
567// Return true for sequences of instructions that effectively assign
568// each lane to its thread ID
569static bool isThreadID(const GCNSubtarget &ST, Value *V) {
570 // Case 1:
571 // wave32: mbcnt_lo(-1, 0)
572 // wave64: mbcnt_hi(-1, mbcnt_lo(-1, 0))
578 if (ST.isWave32() && match(V, W32Pred))
579 return true;
580 if (ST.isWave64() && match(V, W64Pred))
581 return true;
582
583 return false;
584}
585
588 IntrinsicInst &II) const {
589 const auto IID = II.getIntrinsicID();
590 assert(IID == Intrinsic::amdgcn_readlane ||
591 IID == Intrinsic::amdgcn_readfirstlane ||
592 IID == Intrinsic::amdgcn_permlane64);
593
594 Instruction *OpInst = dyn_cast<Instruction>(II.getOperand(0));
595
596 // Only do this if both instructions are in the same block
597 // (so the exec mask won't change) and the readlane is the only user of its
598 // operand.
599 if (!OpInst || !OpInst->hasOneUser() || OpInst->getParent() != II.getParent())
600 return nullptr;
601
602 const bool IsReadLane = (IID == Intrinsic::amdgcn_readlane);
603
604 // If this is a readlane, check that the second operand is a constant, or is
605 // defined before OpInst so we know it's safe to move this intrinsic higher.
606 Value *LaneID = nullptr;
607 if (IsReadLane) {
608 LaneID = II.getOperand(1);
609
610 // readlane takes an extra operand for the lane ID, so we must check if that
611 // LaneID value can be used at the point where we want to move the
612 // intrinsic.
613 if (auto *LaneIDInst = dyn_cast<Instruction>(LaneID)) {
614 if (!IC.getDominatorTree().dominates(LaneIDInst, OpInst))
615 return nullptr;
616 }
617 }
618
619 // Hoist the intrinsic (II) through OpInst.
620 //
621 // (II (OpInst x)) -> (OpInst (II x))
622 const auto DoIt = [&](unsigned OpIdx,
623 Function *NewIntrinsic) -> Instruction * {
625 if (IsReadLane)
626 Ops.push_back(LaneID);
627
628 // Rewrite the intrinsic call.
629 CallInst *NewII = rewriteCall(IC.Builder, II, *NewIntrinsic, Ops);
630
631 // Rewrite OpInst so it takes the result of the intrinsic now.
632 Instruction &NewOp = *OpInst->clone();
633 NewOp.setOperand(OpIdx, NewII);
634 return &NewOp;
635 };
636
637 // TODO(?): Should we do more with permlane64?
638 if (IID == Intrinsic::amdgcn_permlane64 && !isa<BitCastInst>(OpInst))
639 return nullptr;
640
641 if (isa<UnaryOperator>(OpInst))
642 return DoIt(0, II.getCalledFunction());
643
644 if (isa<CastInst>(OpInst)) {
645 Value *Src = OpInst->getOperand(0);
646 Type *SrcTy = Src->getType();
647 if (!isTypeLegal(SrcTy))
648 return nullptr;
649
650 Function *Remangled =
651 Intrinsic::getOrInsertDeclaration(II.getModule(), IID, {SrcTy});
652 return DoIt(0, Remangled);
653 }
654
655 // We can also hoist through binary operators if the other operand is uniform.
656 if (isa<BinaryOperator>(OpInst)) {
657 // FIXME: If we had access to UniformityInfo here we could just check
658 // if the operand is uniform.
659 if (isTriviallyUniform(OpInst->getOperandUse(0)))
660 return DoIt(1, II.getCalledFunction());
661 if (isTriviallyUniform(OpInst->getOperandUse(1)))
662 return DoIt(0, II.getCalledFunction());
663 }
664
665 return nullptr;
666}
667
668/// Evaluate V as a function of the lane ID and return its value on Lane, or
669/// std::nullopt if V is not a closed-form expression of the lane ID.
670static std::optional<unsigned> evalLaneExpr(Value *V, unsigned Lane,
671 const GCNSubtarget &ST,
672 const DataLayout &DL,
673 unsigned Depth = 0) {
675 return std::nullopt;
676
677 // Poison/undef in the index expression: bail and let InstCombine fold the
678 // intrinsic the usual way.
679 if (isa<UndefValue>(V))
680 return std::nullopt;
681
682 if (const ConstantInt *CI = dyn_cast<ConstantInt>(V))
683 return CI->getZExtValue();
684
685 if (isThreadID(ST, V))
686 return Lane;
687
689 if (!BO)
690 return std::nullopt;
691
692 std::optional<unsigned> LHS =
693 evalLaneExpr(BO->getOperand(0), Lane, ST, DL, Depth + 1);
694 if (!LHS)
695 return std::nullopt;
696 std::optional<unsigned> RHS =
697 evalLaneExpr(BO->getOperand(1), Lane, ST, DL, Depth + 1);
698 if (!RHS)
699 return std::nullopt;
700
701 Type *Ty = BO->getType();
702 Constant *Ops[] = {ConstantInt::get(Ty, *LHS), ConstantInt::get(Ty, *RHS)};
703 auto *CI =
705 return CI ? std::optional<unsigned>(CI->getZExtValue()) : std::nullopt;
706}
707
708/// Build the per-lane shuffle map by evaluating Index for every lane in the
709/// wave. Returns false if any lane index is non-constant or out of range.
710static bool tryBuildShuffleMap(Value *Index, const GCNSubtarget &ST,
712 const DataLayout &DL) {
713 unsigned WaveSize = ST.getWavefrontSize();
714 Ids.resize(WaveSize);
715 for (unsigned Lane : seq(WaveSize)) {
716 std::optional<unsigned> Val = evalLaneExpr(Index, Lane, ST, DL);
717 if (!Val || *Val >= WaveSize)
718 return false;
719 Ids[Lane] = *Val;
720 }
721 return true;
722}
723
724/// Lanes are partitioned into groups of Period; each group is a translated
725/// copy of the first: Ids[I] = Ids[I % Period] + (I & ~(Period - 1)).
726template <unsigned Period>
728 static_assert(isPowerOf2_32(Period), "Period must be a power of two");
729 for (unsigned I = Period, E = Ids.size(); I < E; ++I)
730 if (Ids[I] != Ids[I % Period] + (I & ~(Period - 1)))
731 return false;
732 return true;
733}
734
735/// Match an N-lane row pattern: each lane in [0, N) reads from a source lane
736/// in the same N-lane row, and the pattern repeats periodically across rows.
737template <unsigned N> static bool isRowPattern(ArrayRef<uint8_t> Ids) {
738 for (unsigned I = 0; I < N; ++I)
739 if (Ids[I] >= N)
740 return false;
741 return hasPeriodicLayout<N>(Ids);
742}
743
744static constexpr auto isQuadPattern = isRowPattern<4>;
745static constexpr auto isHalfRowPattern = isRowPattern<8>;
746static constexpr auto isFullRowPattern = isRowPattern<16>;
747
748/// Match a 4-lane (quad) permutation, encoded as the v_mov_b32_dpp
749/// QUAD_PERM control word: bits[1:0]=Ids[0], [3:2]=Ids[1], [5:4]=Ids[2],
750/// [7:6]=Ids[3].
751static std::optional<unsigned> matchQuadPermPattern(ArrayRef<uint8_t> Ids) {
752 if (!isQuadPattern(Ids))
753 return std::nullopt;
754 return Ids[3] << 6 | Ids[2] << 4 | Ids[1] << 2 | Ids[0];
755}
756
757/// Match an N-lane reversal (mirror) pattern.
758template <unsigned N> static bool matchMirrorPattern(ArrayRef<uint8_t> Ids) {
759 if (!isRowPattern<N>(Ids))
760 return false;
761 for (unsigned J = 0; J < N; ++J)
762 if (Ids[J] != (N - 1) - J)
763 return false;
764 return true;
765}
766
769
770/// Match a 16-lane cyclic rotation; returns the rotation amount in [1, 15].
771static std::optional<unsigned> matchRowRotatePattern(ArrayRef<uint8_t> Ids) {
772 if (Ids[0] == 0 || !isFullRowPattern(Ids))
773 return std::nullopt;
774 for (unsigned J = 1; J < 16; ++J)
775 if (Ids[J] != (Ids[0] + J) % 16)
776 return std::nullopt;
777 return 16u - Ids[0];
778}
779
780/// Match a row-share pattern: all 16 lanes of each row read the same source
781/// lane. Returns the shared source lane index in [0, 16).
782static std::optional<unsigned> matchRowSharePattern(ArrayRef<uint8_t> Ids) {
783 if (!isFullRowPattern(Ids))
784 return std::nullopt;
785 if (!all_equal(Ids.take_front(16)))
786 return std::nullopt;
787 return Ids[0];
788}
789
790/// Match an XOR mask pattern within each 16-lane row: Ids[J] == Mask ^ J,
791/// with Mask in [1, 15].
792static std::optional<unsigned> matchRowXMaskPattern(ArrayRef<uint8_t> Ids) {
793 unsigned Mask = Ids[0];
794 if (Mask == 0 || !isFullRowPattern(Ids))
795 return std::nullopt;
796 for (unsigned J = 0; J < 16; ++J)
797 if (Ids[J] != (Mask ^ J))
798 return std::nullopt;
799 return Mask;
800}
801
802/// Match an 8-lane arbitrary permutation, encoded as the v_mov_b32_dpp8
803/// 24-bit selector (three bits per output lane).
804static std::optional<unsigned> matchHalfRowPermPattern(ArrayRef<uint8_t> Ids) {
805 if (!isHalfRowPattern(Ids))
806 return std::nullopt;
807 unsigned Selector = 0;
808 for (unsigned J = 0; J < 8; ++J)
809 Selector |= Ids[J] << (J * 3);
810 return Selector;
811}
812
813/// Pack a 16-lane permutation into a single 64-bit value: four bits per output
814/// lane, lane J in bits [J*4 + 3 : J*4]. The caller splits it into the low and
815/// high 32-bit selector operands of v_permlane16 / v_permlanex16.
817 uint64_t Sel = 0;
818 for (unsigned J = 0; J < 16; ++J)
819 Sel |= static_cast<uint64_t>(Ids[J] & 0xF) << (J * 4);
820 return Sel;
821}
822
823/// Match a half-wave swap: lane J reads from lane J ^ 32. Only meaningful on
824/// wave64 targets.
826 if (Ids.size() != 64)
827 return false;
828 for (unsigned J = 0; J < 64; ++J)
829 if (Ids[J] != (J ^ 32))
830 return false;
831 return true;
832}
833
834/// Match a cross-row permutation suitable for v_permlanex16: every lane in
835/// the low 16-lane half reads from the high half of its own row, and vice
836/// versa.
838 if (!hasPeriodicLayout<32>(Ids))
839 return false;
840 for (unsigned J = 0; J < 16; ++J) {
841 if (Ids[J] < 16 || Ids[J] >= 32)
842 return false;
843 if (Ids[J + 16] != Ids[J] - 16)
844 return false;
845 }
846 return true;
847}
848
849/// Match a DS_SWIZZLE bitmask-mode permutation:
850/// dst_lane = ((src_lane & AND) | OR) ^ XOR
851/// with each mask being five bits. Returns the encoded swizzle immediate.
852/// The hardware applies the formula independently within each 32-lane group,
853/// so on wave64 the high group must replicate the low one (translated by 32).
854static std::optional<unsigned>
856 if (!hasPeriodicLayout<32>(Ids))
857 return std::nullopt;
858
859 // The formula is per-bit: output bit B depends only on input bit B. Probe
860 // each bit with src=0 and src=(1<<B); if the output bit flipped, AND[B]=1
861 // and XOR[B] carries the constant offset; otherwise it is a constant bit
862 // encoded in OR (with AND[B]=0, XOR[B]=0).
863 unsigned AndMask = 0, OrMask = 0, XorMask = 0;
864 for (unsigned B = 0; B < 5; ++B) {
865 unsigned Bit0 = (Ids[0] >> B) & 1;
866 unsigned Bit1 = (Ids[1u << B] >> B) & 1;
867 if (Bit0 != Bit1) {
868 AndMask |= 1u << B;
869 XorMask |= Bit0 << B;
870 } else {
871 OrMask |= Bit0 << B;
872 }
873 }
874
875 // The per-bit derivation assumes bit independence; verify the masks
876 // actually reproduce every lane in the 32-lane group.
877 for (unsigned I : seq(32u)) {
878 unsigned Expected = ((I & AndMask) | OrMask) ^ XorMask;
879 if (Ids[I] != Expected)
880 return std::nullopt;
881 }
882
887}
888
889/// Match a GFX9+ DS_SWIZZLE rotate-mode permutation: a cyclic left-rotation
890/// of all 32 lanes within each 32-lane group by a constant N in [0, 31],
891/// i.e. dst_lane = (src_lane + N) % 32. On wave64, hasPeriodicLayout<32>
892/// ensures both 32-lane groups rotate by the same amount.
893static std::optional<unsigned>
895 if (!hasPeriodicLayout<32>(Ids))
896 return std::nullopt;
897
898 // Determine the rotation amount from lane 0: every lane must read from
899 // lane (I + N) % 32 where N = Ids[0] and 0 <= N <= 31.
900 unsigned N = Ids[0];
901 if (N >= 32)
902 return std::nullopt;
903
904 for (unsigned I = 0; I < 32; ++I)
905 if (Ids[I] != (I + N) % 32)
906 return std::nullopt;
907
910}
911
912/// Emit v_mov_b32_dpp with the given control word, row/bank masks 0xF, and
913/// bound_ctrl=1 so out-of-bounds lanes are well-defined and the DPP mov can
914/// be folded into a consuming VALU op by GCNDPPCombine.
915static Value *createUpdateDpp(IRBuilderBase &B, Value *Val, unsigned Ctrl) {
916 Type *Ty = Val->getType();
917 return B.CreateIntrinsic(Intrinsic::amdgcn_update_dpp, {Ty},
918 {PoisonValue::get(Ty), Val, B.getInt32(Ctrl),
919 B.getInt32(0xF), B.getInt32(0xF), B.getTrue()});
920}
921
922/// Emit v_mov_b32_dpp8 with the given 24-bit lane selector.
923static Value *createMovDpp8(IRBuilderBase &B, Value *Val, unsigned Selector) {
924 return B.CreateIntrinsic(Intrinsic::amdgcn_mov_dpp8, {Val->getType()},
925 {Val, B.getInt32(Selector)});
926}
927
928/// Emit v_permlane16 with the precomputed lane-select halves.
930 uint32_t Hi) {
931 Type *Ty = Val->getType();
932 return B.CreateIntrinsic(Intrinsic::amdgcn_permlane16, {Ty},
933 {PoisonValue::get(Ty), Val, B.getInt32(Lo),
934 B.getInt32(Hi), B.getFalse(), B.getFalse()});
935}
936
937/// Emit v_permlanex16 with the precomputed lane-select halves. Each output
938/// lane reads from the other 16-lane half of the same row.
940 uint32_t Hi) {
941 Type *Ty = Val->getType();
942 return B.CreateIntrinsic(Intrinsic::amdgcn_permlanex16, {Ty},
943 {PoisonValue::get(Ty), Val, B.getInt32(Lo),
944 B.getInt32(Hi), B.getFalse(), B.getFalse()});
945}
946
947/// Emit ds_swizzle with the given immediate, bitcasting/converting between
948/// pointer/float types and i32 as required by the intrinsic signature.
950 const DataLayout &DL) {
951 Type *OrigTy = Val->getType();
952 assert(DL.getTypeSizeInBits(OrigTy) == 32 &&
953 "ds_swizzle only supports 32-bit operands");
954 IntegerType *I32Ty = B.getInt32Ty();
955 Value *Src = Val;
956 if (OrigTy->isPointerTy())
957 Src = B.CreatePtrToInt(Src, I32Ty);
958 else if (OrigTy != I32Ty)
959 Src = B.CreateBitCast(Src, I32Ty);
960 Value *Result = B.CreateIntrinsic(Intrinsic::amdgcn_ds_swizzle, {},
961 {Src, B.getInt32(Offset)});
962 if (OrigTy->isPointerTy())
963 return B.CreateIntToPtr(Result, OrigTy);
964 if (OrigTy != I32Ty)
965 return B.CreateBitCast(Result, OrigTy);
966 return Result;
967}
968
969/// Emit v_permlane64 (swap of the two 32-lane halves of a wave64).
971 return B.CreateIntrinsic(Intrinsic::amdgcn_permlane64, {Val->getType()},
972 {Val});
973}
974
975/// Given a shuffle map, try to emit the best hardware intrinsic.
///
/// The candidate patterns are tried in the order they are written below and the
/// first one that both matches \p Ids and is supported by \p ST is emitted:
/// quad_perm, row rotate, row_share, row_xmask, DPP8, permlane16 / permlanex16,
/// ds_swizzle bitmask mode, ds_swizzle rotate mode and finally permlane64.
///
/// Returns the newly created value, or nullptr when the shuffle is uniform or
/// no pattern applies.
///
/// NOTE(review): this extract is missing the start of the signature (listing
/// lines 976-977) and several body lines (988, 992-995, 1018, 1023), so the
/// list above may be incomplete -- confirm against the full file.
978 const GCNSubtarget &ST,
979 const DataLayout &DL) {
980 // Uniform shuffle (all lanes read the same value) is handled by cheaper
981 // broadcast/readlane intrinsics.
982 if (all_equal(Ids))
983 return nullptr;
984
985 if (std::optional<unsigned> QP = matchQuadPermPattern(Ids)) {
986 if (ST.hasDPP())
987 return createUpdateDpp(B, Src, *QP);
  // NOTE(review): listing line 988 (presumably the non-DPP handling of a
  // quad-perm match) is not present in this extract.
989 }
990
991 if (ST.hasDPP()) {
  // NOTE(review): listing lines 992-995 are not present in this extract.
  // The "- 1" below suggests the matcher reports rotate amounts starting at
  // 1 -- TODO confirm against matchRowRotatePattern.
996 if (std::optional<unsigned> Amt = matchRowRotatePattern(Ids))
997 return createUpdateDpp(B, Src, AMDGPU::DPP::ROW_ROR_FIRST + *Amt - 1);
998 }
999
1000 // row_share is supported on GFX90A and GFX10+; row_xmask is GFX10+ only.
1001 if (ST.hasDPPRowShare()) {
1002 if (std::optional<unsigned> Lane = matchRowSharePattern(Ids))
1003 return createUpdateDpp(B, Src, AMDGPU::DPP::ROW_SHARE_FIRST + *Lane);
1004 }
1005
1006 if (ST.hasDPP() && ST.hasGFX10Insts()) {
1007 if (std::optional<unsigned> Mask = matchRowXMaskPattern(Ids))
1008 return createUpdateDpp(B, Src, AMDGPU::DPP::ROW_XMASK_FIRST + *Mask);
1009 }
1010
1011 if (ST.hasDPP8()) {
1012 if (std::optional<unsigned> Sel = matchHalfRowPermPattern(Ids))
1013 return createMovDpp8(B, Src, *Sel);
1014 }
1015
1016 if (ST.hasPermlane16Insts()) {
  // In both branches below `Sel` is split into its low and high 32-bit halves
  // for the two selector operands; its declaration (listing lines 1018 and
  // 1023) is not present in this extract.
1017 if (isFullRowPattern(Ids)) {
1019 return createPermlane16(B, Src, Lo_32(Sel), Hi_32(Sel));
1020 }
1021 // Cross-row shuffles (e.g. XOR 16..31) — covered by permlanex16.
1022 if (isCrossRowPattern(Ids)) {
1024 return createPermlaneX16(B, Src, Lo_32(Sel), Hi_32(Sel));
1025 }
1026 }
1027
1028 // Generic DS_SWIZZLE bitmask-mode fallback: handles any 32-lane shuffle that
1029 // can be expressed as dst = ((src & AND) | OR) ^ XOR with 5-bit masks. This
1030 // is available on every target that has ds_swizzle.
1031 if (std::optional<unsigned> Imm = matchDsSwizzleBitmaskPattern(Ids))
1032 return createDsSwizzle(B, Src, *Imm, DL);
1033
1034 // DS_SWIZZLE rotate mode (GFX9+): handles cyclic 32-lane rotations that
1035 // bitmask mode cannot express (e.g. +1 mod 32 requires inter-bit carry).
1036 if (ST.hasDsSwizzleRotateMode()) {
1037 if (std::optional<unsigned> Imm = matchDsSwizzleRotatePattern(Ids))
1038 return createDsSwizzle(B, Src, *Imm, DL);
1039 }
1040
  // Last resort: a swap of the two wave halves, if the target has permlane64.
1041 if (ST.hasPermLane64() && matchHalfWaveSwapPattern(Ids))
1042 return createPermlane64(B, Src);
1043
  // Nothing matched; the caller leaves the original intrinsic in place.
1044 return nullptr;
1045}
1046
1047/// Try to fold a wave_shuffle/ds_bpermute whose lane index is a constant
1048/// function of the lane ID into a hardware-specific lane permutation intrinsic.
///
/// The fold is only attempted when the result of \p II is 32 bits wide and the
/// wave size of \p ST is known. On success every use of \p II is replaced with
/// the new value; otherwise std::nullopt is returned and \p II is untouched.
///
/// NOTE(review): the first signature line (listing line 1050) and the
/// declaration of `Ids` (listing line 1064) are absent from this extract.
1049static std::optional<Instruction *>
1051 const GCNSubtarget &ST) {
1052 const DataLayout &DL = IC.getDataLayout();
1053 if (DL.getTypeSizeInBits(II.getType()) != 32)
1054 return std::nullopt;
1055
1056 if (!ST.isWaveSizeKnown())
1057 return std::nullopt;
1058
1059 unsigned WaveSize = ST.getWavefrontSize();
  // The two intrinsics take their operands in opposite order: ds_bpermute is
  // (index, source), the other handled intrinsic is (source, index).
1060 bool IsBpermute = II.getIntrinsicID() == Intrinsic::amdgcn_ds_bpermute;
1061 Value *Src = II.getArgOperand(IsBpermute ? 1 : 0);
1062 Value *Index = II.getArgOperand(IsBpermute ? 0 : 1);
1063
1065 if (IsBpermute) {
  // Evaluate the index expression for every lane. The value is accepted only
  // if it is a multiple of 4 and, divided by 4, names a lane inside the wave;
  // that quotient is recorded as the source lane.
1066 Ids.resize(WaveSize);
1067 for (unsigned Lane : seq(WaveSize)) {
1068 std::optional<unsigned> Val = evalLaneExpr(Index, Lane, ST, DL);
1069 if (!Val || (*Val & 3) || (*Val >> 2) >= WaveSize)
1070 return std::nullopt;
1071 Ids[Lane] = *Val >> 2;
1072 }
1073 } else {
1074 if (!tryBuildShuffleMap(Index, ST, Ids, DL))
1075 return std::nullopt;
1076 }
1077
  // A null result means no hardware pattern matched the computed map.
1078 Value *Result = matchShuffleToHWIntrinsic(IC.Builder, Src, Ids, ST, DL);
1079 if (!Result)
1080 return std::nullopt;
1081
1082 return IC.replaceInstUsesWith(II, Result);
1083}
1084std::optional<Instruction *>
1086 Intrinsic::ID IID = II.getIntrinsicID();
1087 switch (IID) {
1088 case Intrinsic::amdgcn_implicitarg_ptr: {
1089 if (II.getFunction()->hasFnAttribute("amdgpu-no-implicitarg-ptr"))
1090 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1091 uint64_t ImplicitArgBytes = ST->getImplicitArgNumBytes(*II.getFunction());
1092
1093 uint64_t CurrentOrNullBytes =
1094 II.getAttributes().getRetDereferenceableOrNullBytes();
1095 if (CurrentOrNullBytes != 0) {
1096 // Refine "dereferenceable (A) meets dereferenceable_or_null(B)"
1097 // into dereferenceable(max(A, B))
1098 uint64_t NewBytes = std::max(CurrentOrNullBytes, ImplicitArgBytes);
1099 II.addRetAttr(
1100 Attribute::getWithDereferenceableBytes(II.getContext(), NewBytes));
1101 II.removeRetAttr(Attribute::DereferenceableOrNull);
1102 return &II;
1103 }
1104
1105 uint64_t CurrentBytes = II.getAttributes().getRetDereferenceableBytes();
1106 uint64_t NewBytes = std::max(CurrentBytes, ImplicitArgBytes);
1107 if (NewBytes != CurrentBytes) {
1108 II.addRetAttr(
1109 Attribute::getWithDereferenceableBytes(II.getContext(), NewBytes));
1110 return &II;
1111 }
1112
1113 return std::nullopt;
1114 }
1115 case Intrinsic::amdgcn_rcp: {
1116 Value *Src = II.getArgOperand(0);
1117 if (isa<PoisonValue>(Src))
1118 return IC.replaceInstUsesWith(II, Src);
1119
1120 // TODO: Move to ConstantFolding/InstSimplify?
1121 if (isa<UndefValue>(Src)) {
1122 Type *Ty = II.getType();
1123 auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
1124 return IC.replaceInstUsesWith(II, QNaN);
1125 }
1126
1127 if (II.isStrictFP())
1128 break;
1129
1130 if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
1131 const APFloat &ArgVal = C->getValueAPF();
1132 APFloat Val(ArgVal.getSemantics(), 1);
1134
1135 // This is more precise than the instruction may give.
1136 //
1137 // TODO: The instruction always flushes denormal results (except for f16),
1138 // should this also?
1139 return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
1140 }
1141
1142 FastMathFlags FMF = cast<FPMathOperator>(II).getFastMathFlags();
1143 if (!FMF.allowContract())
1144 break;
1145 auto *SrcCI = dyn_cast<IntrinsicInst>(Src);
1146 if (!SrcCI)
1147 break;
1148
1149 auto IID = SrcCI->getIntrinsicID();
1150 // llvm.amdgcn.rcp(llvm.amdgcn.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable
1151 //
1152 // llvm.amdgcn.rcp(llvm.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable and
1153 // relaxed.
1154 if (IID == Intrinsic::amdgcn_sqrt || IID == Intrinsic::sqrt) {
1155 const FPMathOperator *SqrtOp = cast<FPMathOperator>(SrcCI);
1156 FastMathFlags InnerFMF = SqrtOp->getFastMathFlags();
1157 if (!InnerFMF.allowContract() || !SrcCI->hasOneUse())
1158 break;
1159
1160 if (IID == Intrinsic::sqrt && !canContractSqrtToRsq(SqrtOp))
1161 break;
1162
1164 SrcCI->getModule(), Intrinsic::amdgcn_rsq, {SrcCI->getType()});
1165
1166 InnerFMF |= FMF;
1167 II.setFastMathFlags(InnerFMF);
1168
1169 II.setCalledFunction(NewDecl);
1170 return IC.replaceOperand(II, 0, SrcCI->getArgOperand(0));
1171 }
1172
1173 break;
1174 }
1175 case Intrinsic::amdgcn_sqrt:
1176 case Intrinsic::amdgcn_rsq:
1177 case Intrinsic::amdgcn_tanh: {
1178 Value *Src = II.getArgOperand(0);
1179 if (isa<PoisonValue>(Src))
1180 return IC.replaceInstUsesWith(II, Src);
1181
1182 // TODO: Move to ConstantFolding/InstSimplify?
1183 if (isa<UndefValue>(Src)) {
1184 Type *Ty = II.getType();
1185 auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
1186 return IC.replaceInstUsesWith(II, QNaN);
1187 }
1188
1189 // f16 amdgcn.sqrt is identical to regular sqrt.
1190 if (IID == Intrinsic::amdgcn_sqrt && Src->getType()->isHalfTy()) {
1192 II.getModule(), Intrinsic::sqrt, {II.getType()});
1193 II.setCalledFunction(NewDecl);
1194 return &II;
1195 }
1196
1197 break;
1198 }
1199 case Intrinsic::amdgcn_log:
1200 case Intrinsic::amdgcn_exp2: {
1201 const bool IsLog = IID == Intrinsic::amdgcn_log;
1202 const bool IsExp = IID == Intrinsic::amdgcn_exp2;
1203 Value *Src = II.getArgOperand(0);
1204 Type *Ty = II.getType();
1205
1206 if (isa<PoisonValue>(Src))
1207 return IC.replaceInstUsesWith(II, Src);
1208
1209 if (IC.getSimplifyQuery().isUndefValue(Src))
1211
1212 if (ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
1213 if (C->isInfinity()) {
1214 // exp2(+inf) -> +inf
1215 // log2(+inf) -> +inf
1216 if (!C->isNegative())
1217 return IC.replaceInstUsesWith(II, C);
1218
1219 // exp2(-inf) -> 0
1220 if (IsExp && C->isNegative())
1222 }
1223
1224 if (II.isStrictFP())
1225 break;
1226
1227 if (C->isNaN()) {
1228 Constant *Quieted = ConstantFP::get(Ty, C->getValue().makeQuiet());
1229 return IC.replaceInstUsesWith(II, Quieted);
1230 }
1231
1232 // f32 instruction doesn't handle denormals, f16 does.
1233 if (C->isZero() || (C->getValue().isDenormal() && Ty->isFloatTy())) {
1234 Constant *FoldedValue = IsLog ? ConstantFP::getInfinity(Ty, true)
1235 : ConstantFP::get(Ty, 1.0);
1236 return IC.replaceInstUsesWith(II, FoldedValue);
1237 }
1238
1239 if (IsLog && C->isNegative())
1241
1242 // TODO: Full constant folding matching hardware behavior.
1243 }
1244
1245 break;
1246 }
1247 case Intrinsic::amdgcn_frexp_mant:
1248 case Intrinsic::amdgcn_frexp_exp: {
1249 Value *Src = II.getArgOperand(0);
1250 if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
1251 int Exp;
1252 APFloat Significand =
1253 frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);
1254
1255 if (IID == Intrinsic::amdgcn_frexp_mant) {
1256 return IC.replaceInstUsesWith(
1257 II, ConstantFP::get(II.getContext(), Significand));
1258 }
1259
1260 // Match instruction special case behavior.
1261 if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
1262 Exp = 0;
1263
1264 return IC.replaceInstUsesWith(II,
1265 ConstantInt::getSigned(II.getType(), Exp));
1266 }
1267
1268 if (isa<PoisonValue>(Src))
1269 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1270
1271 if (isa<UndefValue>(Src)) {
1272 return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
1273 }
1274
1275 break;
1276 }
1277 case Intrinsic::amdgcn_class: {
1278 Value *Src0 = II.getArgOperand(0);
1279 Value *Src1 = II.getArgOperand(1);
1280 const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
1281 if (CMask) {
1282 II.setCalledOperand(Intrinsic::getOrInsertDeclaration(
1283 II.getModule(), Intrinsic::is_fpclass, Src0->getType()));
1284
1285 // Clamp any excess bits, as they're illegal for the generic intrinsic.
1286 II.setArgOperand(1, ConstantInt::get(Src1->getType(),
1287 CMask->getZExtValue() & fcAllFlags));
1288 return &II;
1289 }
1290
1291 // Propagate poison.
1292 if (isa<PoisonValue>(Src0) || isa<PoisonValue>(Src1))
1293 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1294
1295 // llvm.amdgcn.class(_, undef) -> false
1296 if (IC.getSimplifyQuery().isUndefValue(Src1))
1297 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
1298
1299 // llvm.amdgcn.class(undef, mask) -> mask != 0
1300 if (IC.getSimplifyQuery().isUndefValue(Src0)) {
1301 Value *CmpMask = IC.Builder.CreateICmpNE(
1302 Src1, ConstantInt::getNullValue(Src1->getType()));
1303 return IC.replaceInstUsesWith(II, CmpMask);
1304 }
1305 break;
1306 }
1307 case Intrinsic::amdgcn_cvt_pkrtz: {
1308 auto foldFPTruncToF16RTZ = [](Value *Arg) -> Value * {
1309 Type *HalfTy = Type::getHalfTy(Arg->getContext());
1310
1311 if (isa<PoisonValue>(Arg))
1312 return PoisonValue::get(HalfTy);
1313 if (isa<UndefValue>(Arg))
1314 return UndefValue::get(HalfTy);
1315
1316 ConstantFP *CFP = nullptr;
1317 if (match(Arg, m_ConstantFP(CFP))) {
1318 bool LosesInfo;
1319 APFloat Val(CFP->getValueAPF());
1321 return ConstantFP::get(HalfTy, Val);
1322 }
1323
1324 Value *Src = nullptr;
1325 if (match(Arg, m_FPExt(m_Value(Src)))) {
1326 if (Src->getType()->isHalfTy())
1327 return Src;
1328 }
1329
1330 return nullptr;
1331 };
1332
1333 if (Value *Src0 = foldFPTruncToF16RTZ(II.getArgOperand(0))) {
1334 if (Value *Src1 = foldFPTruncToF16RTZ(II.getArgOperand(1))) {
1335 Value *V = PoisonValue::get(II.getType());
1336 V = IC.Builder.CreateInsertElement(V, Src0, (uint64_t)0);
1337 V = IC.Builder.CreateInsertElement(V, Src1, (uint64_t)1);
1338 return IC.replaceInstUsesWith(II, V);
1339 }
1340 }
1341
1342 break;
1343 }
1344 case Intrinsic::amdgcn_cvt_pknorm_i16:
1345 case Intrinsic::amdgcn_cvt_pknorm_u16:
1346 case Intrinsic::amdgcn_cvt_pk_i16:
1347 case Intrinsic::amdgcn_cvt_pk_u16: {
1348 Value *Src0 = II.getArgOperand(0);
1349 Value *Src1 = II.getArgOperand(1);
1350
1351 // TODO: Replace call with scalar operation if only one element is poison.
1352 if (isa<PoisonValue>(Src0) && isa<PoisonValue>(Src1))
1353 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1354
1355 if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
1356 return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
1357 }
1358
1359 break;
1360 }
1361 case Intrinsic::amdgcn_cvt_off_f32_i4: {
1362 Value* Arg = II.getArgOperand(0);
1363 Type *Ty = II.getType();
1364
1365 if (isa<PoisonValue>(Arg))
1366 return IC.replaceInstUsesWith(II, PoisonValue::get(Ty));
1367
1368 if(IC.getSimplifyQuery().isUndefValue(Arg))
1370
1371 ConstantInt *CArg = dyn_cast<ConstantInt>(II.getArgOperand(0));
1372 if (!CArg)
1373 break;
1374
1375 // Tabulated 0.0625 * (sext (CArg & 0xf)).
1376 constexpr size_t ResValsSize = 16;
1377 static constexpr float ResVals[ResValsSize] = {
1378 0.0, 0.0625, 0.125, 0.1875, 0.25, 0.3125, 0.375, 0.4375,
1379 -0.5, -0.4375, -0.375, -0.3125, -0.25, -0.1875, -0.125, -0.0625};
1380 Constant *Res =
1381 ConstantFP::get(Ty, ResVals[CArg->getZExtValue() & (ResValsSize - 1)]);
1382 return IC.replaceInstUsesWith(II, Res);
1383 }
1384 case Intrinsic::amdgcn_ubfe:
1385 case Intrinsic::amdgcn_sbfe: {
1386 // Decompose simple cases into standard shifts.
1387 Value *Src = II.getArgOperand(0);
1388 if (isa<UndefValue>(Src)) {
1389 return IC.replaceInstUsesWith(II, Src);
1390 }
1391
1392 unsigned Width;
1393 Type *Ty = II.getType();
1394 unsigned IntSize = Ty->getIntegerBitWidth();
1395
1396 ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
1397 if (CWidth) {
1398 Width = CWidth->getZExtValue();
1399 if ((Width & (IntSize - 1)) == 0) {
1401 }
1402
1403 // Hardware ignores high bits, so remove those.
1404 if (Width >= IntSize) {
1405 return IC.replaceOperand(
1406 II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
1407 }
1408 }
1409
1410 unsigned Offset;
1411 ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
1412 if (COffset) {
1413 Offset = COffset->getZExtValue();
1414 if (Offset >= IntSize) {
1415 return IC.replaceOperand(
1416 II, 1,
1417 ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
1418 }
1419 }
1420
1421 bool Signed = IID == Intrinsic::amdgcn_sbfe;
1422
1423 if (!CWidth || !COffset)
1424 break;
1425
1426 // The case of Width == 0 is handled above, which makes this transformation
1427 // safe. If Width == 0, then the ashr and lshr instructions become poison
1428 // value since the shift amount would be equal to the bit size.
1429 assert(Width != 0);
1430
1431 // TODO: This allows folding to undef when the hardware has specific
1432 // behavior?
1433 if (Offset + Width < IntSize) {
1434 Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
1435 Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
1436 : IC.Builder.CreateLShr(Shl, IntSize - Width);
1437 RightShift->takeName(&II);
1438 return IC.replaceInstUsesWith(II, RightShift);
1439 }
1440
1441 Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
1442 : IC.Builder.CreateLShr(Src, Offset);
1443
1444 RightShift->takeName(&II);
1445 return IC.replaceInstUsesWith(II, RightShift);
1446 }
1447 case Intrinsic::amdgcn_exp:
1448 case Intrinsic::amdgcn_exp_row:
1449 case Intrinsic::amdgcn_exp_compr: {
1450 ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
1451 unsigned EnBits = En->getZExtValue();
1452 if (EnBits == 0xf)
1453 break; // All inputs enabled.
1454
1455 bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
1456 bool Changed = false;
1457 for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
1458 if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
1459 (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
1460 Value *Src = II.getArgOperand(I + 2);
1461 if (!isa<PoisonValue>(Src)) {
1462 IC.replaceOperand(II, I + 2, PoisonValue::get(Src->getType()));
1463 Changed = true;
1464 }
1465 }
1466 }
1467
1468 if (Changed) {
1469 return &II;
1470 }
1471
1472 break;
1473 }
1474 case Intrinsic::amdgcn_fmed3: {
1475 Value *Src0 = II.getArgOperand(0);
1476 Value *Src1 = II.getArgOperand(1);
1477 Value *Src2 = II.getArgOperand(2);
1478
1479 for (Value *Src : {Src0, Src1, Src2}) {
1480 if (isa<PoisonValue>(Src))
1481 return IC.replaceInstUsesWith(II, Src);
1482 }
1483
1484 if (II.isStrictFP())
1485 break;
1486
1487 // med3 with a nan input acts like
1488 // v_min_f32(v_min_f32(s0, s1), s2)
1489 //
1490 // Signalingness is ignored with ieee=0, so we fold to
1491 // minimumnum/maximumnum. With ieee=1, the v_min_f32 acts like llvm.minnum
1492 // with signaling nan handling. With ieee=0, like llvm.minimumnum except a
1493 // returned signaling nan will not be quieted.
1494
1495 // ieee=1
1496 // s0 snan: s2
1497 // s1 snan: s2
1498 // s2 snan: qnan
1499
1500 // s0 qnan: min(s1, s2)
1501 // s1 qnan: min(s0, s2)
1502 // s2 qnan: min(s0, s1)
1503
1504 // ieee=0
1505 // s0 _nan: min(s1, s2)
1506 // s1 _nan: min(s0, s2)
1507 // s2 _nan: min(s0, s1)
1508
1509 // med3 behavior with infinity
1510 // s0 +inf: max(s1, s2)
1511 // s1 +inf: max(s0, s2)
1512 // s2 +inf: max(s0, s1)
1513 // s0 -inf: min(s1, s2)
1514 // s1 -inf: min(s0, s2)
1515 // s2 -inf: min(s0, s1)
1516
1517 // Checking for NaN before canonicalization provides better fidelity when
1518 // mapping other operations onto fmed3 since the order of operands is
1519 // unchanged.
1520 Value *V = nullptr;
1521 const APFloat *ConstSrc0 = nullptr;
1522 const APFloat *ConstSrc1 = nullptr;
1523 const APFloat *ConstSrc2 = nullptr;
1524
1525 if ((match(Src0, m_APFloat(ConstSrc0)) &&
1526 (ConstSrc0->isNaN() || ConstSrc0->isInfinity())) ||
1527 isa<UndefValue>(Src0)) {
1528 const bool IsPosInfinity = ConstSrc0 && ConstSrc0->isPosInfinity();
1529 switch (fpenvIEEEMode(II)) {
1530 case KnownIEEEMode::On:
1531 // TODO: If Src2 is snan, does it need quieting?
1532 if (ConstSrc0 && ConstSrc0->isNaN() && ConstSrc0->isSignaling())
1533 return IC.replaceInstUsesWith(II, Src2);
1534
1535 V = IsPosInfinity ? IC.Builder.CreateMaxNum(Src1, Src2)
1536 : IC.Builder.CreateMinNum(Src1, Src2);
1537 break;
1538 case KnownIEEEMode::Off:
1539 V = IsPosInfinity ? IC.Builder.CreateMaximumNum(Src1, Src2)
1540 : IC.Builder.CreateMinimumNum(Src1, Src2);
1541 break;
1543 break;
1544 }
1545 } else if ((match(Src1, m_APFloat(ConstSrc1)) &&
1546 (ConstSrc1->isNaN() || ConstSrc1->isInfinity())) ||
1547 isa<UndefValue>(Src1)) {
1548 const bool IsPosInfinity = ConstSrc1 && ConstSrc1->isPosInfinity();
1549 switch (fpenvIEEEMode(II)) {
1550 case KnownIEEEMode::On:
1551 // TODO: If Src2 is snan, does it need quieting?
1552 if (ConstSrc1 && ConstSrc1->isNaN() && ConstSrc1->isSignaling())
1553 return IC.replaceInstUsesWith(II, Src2);
1554
1555 V = IsPosInfinity ? IC.Builder.CreateMaxNum(Src0, Src2)
1556 : IC.Builder.CreateMinNum(Src0, Src2);
1557 break;
1558 case KnownIEEEMode::Off:
1559 V = IsPosInfinity ? IC.Builder.CreateMaximumNum(Src0, Src2)
1560 : IC.Builder.CreateMinimumNum(Src0, Src2);
1561 break;
1563 break;
1564 }
1565 } else if ((match(Src2, m_APFloat(ConstSrc2)) &&
1566 (ConstSrc2->isNaN() || ConstSrc2->isInfinity())) ||
1567 isa<UndefValue>(Src2)) {
1568 switch (fpenvIEEEMode(II)) {
1569 case KnownIEEEMode::On:
1570 if (ConstSrc2 && ConstSrc2->isNaN() && ConstSrc2->isSignaling()) {
1571 auto *Quieted = ConstantFP::get(II.getType(), ConstSrc2->makeQuiet());
1572 return IC.replaceInstUsesWith(II, Quieted);
1573 }
1574
1575 V = (ConstSrc2 && ConstSrc2->isPosInfinity())
1576 ? IC.Builder.CreateMaxNum(Src0, Src1)
1577 : IC.Builder.CreateMinNum(Src0, Src1);
1578 break;
1579 case KnownIEEEMode::Off:
1580 V = (ConstSrc2 && ConstSrc2->isNegInfinity())
1581 ? IC.Builder.CreateMinimumNum(Src0, Src1)
1582 : IC.Builder.CreateMaximumNum(Src0, Src1);
1583 break;
1585 break;
1586 }
1587 }
1588
1589 if (V) {
1590 if (auto *CI = dyn_cast<CallInst>(V)) {
1591 CI->copyFastMathFlags(&II);
1592 CI->takeName(&II);
1593 }
1594 return IC.replaceInstUsesWith(II, V);
1595 }
1596
1597 bool Swap = false;
1598 // Canonicalize constants to RHS operands.
1599 //
1600 // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
1601 if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
1602 std::swap(Src0, Src1);
1603 Swap = true;
1604 }
1605
1606 if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
1607 std::swap(Src1, Src2);
1608 Swap = true;
1609 }
1610
1611 if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
1612 std::swap(Src0, Src1);
1613 Swap = true;
1614 }
1615
1616 if (Swap) {
1617 II.setArgOperand(0, Src0);
1618 II.setArgOperand(1, Src1);
1619 II.setArgOperand(2, Src2);
1620 return &II;
1621 }
1622
1623 if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
1624 if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
1625 if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
1626 APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
1627 C2->getValueAPF());
1628 return IC.replaceInstUsesWith(II,
1629 ConstantFP::get(II.getType(), Result));
1630 }
1631 }
1632 }
1633
1634 if (!ST->hasMed3_16())
1635 break;
1636
1637 // Repeat floating-point width reduction done for minnum/maxnum.
1638 // fmed3((fpext X), (fpext Y), (fpext Z)) -> fpext (fmed3(X, Y, Z))
1639 if (Value *X = matchFPExtFromF16(Src0)) {
1640 if (Value *Y = matchFPExtFromF16(Src1)) {
1641 if (Value *Z = matchFPExtFromF16(Src2)) {
1642 Value *NewCall = IC.Builder.CreateIntrinsic(
1643 IID, {X->getType()}, {X, Y, Z}, &II, II.getName());
1644 return new FPExtInst(NewCall, II.getType());
1645 }
1646 }
1647 }
1648
1649 break;
1650 }
1651 case Intrinsic::amdgcn_icmp:
1652 case Intrinsic::amdgcn_fcmp: {
1653 const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
1654 // Guard against invalid arguments.
1655 int64_t CCVal = CC->getZExtValue();
1656 bool IsInteger = IID == Intrinsic::amdgcn_icmp;
1657 if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
1658 CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
1659 (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
1661 break;
1662
1663 Value *Src0 = II.getArgOperand(0);
1664 Value *Src1 = II.getArgOperand(1);
1665
1666 if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
1667 if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
1669 (ICmpInst::Predicate)CCVal, CSrc0, CSrc1, DL);
1670 if (CCmp && CCmp->isNullValue()) {
1671 return IC.replaceInstUsesWith(
1672 II, IC.Builder.CreateSExt(CCmp, II.getType()));
1673 }
1674
1675 // The result of V_ICMP/V_FCMP assembly instructions (which this
1676 // intrinsic exposes) is one bit per thread, masked with the EXEC
1677 // register (which contains the bitmask of live threads). So a
1678 // comparison that always returns true is the same as a read of the
1679 // EXEC register.
1680 Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
1681 MDNode *MD = MDNode::get(II.getContext(), MDArgs);
1682 Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
1683 CallInst *NewCall = IC.Builder.CreateIntrinsic(Intrinsic::read_register,
1684 II.getType(), Args);
1685 NewCall->addFnAttr(Attribute::Convergent);
1686 NewCall->takeName(&II);
1687 return IC.replaceInstUsesWith(II, NewCall);
1688 }
1689
1690 // Canonicalize constants to RHS.
1691 CmpInst::Predicate SwapPred =
1693 II.setArgOperand(0, Src1);
1694 II.setArgOperand(1, Src0);
1695 II.setArgOperand(
1696 2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
1697 return &II;
1698 }
1699
1700 if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
1701 break;
1702
1703 // Canonicalize compare eq with true value to compare != 0
1704 // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
1705 // -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
1706 // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
1707 // -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
1708 Value *ExtSrc;
1709 if (CCVal == CmpInst::ICMP_EQ &&
1710 ((match(Src1, PatternMatch::m_One()) &&
1711 match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
1712 (match(Src1, PatternMatch::m_AllOnes()) &&
1713 match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
1714 ExtSrc->getType()->isIntegerTy(1)) {
1716 IC.replaceOperand(II, 2,
1717 ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
1718 return &II;
1719 }
1720
1721 CmpPredicate SrcPred;
1722 Value *SrcLHS;
1723 Value *SrcRHS;
1724
1725 // Fold compare eq/ne with 0 from a compare result as the predicate to the
1726 // intrinsic. The typical use is a wave vote function in the library, which
1727 // will be fed from a user code condition compared with 0. Fold in the
1728 // redundant compare.
1729
1730 // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
1731 // -> llvm.amdgcn.[if]cmp(a, b, pred)
1732 //
1733 // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
1734 // -> llvm.amdgcn.[if]cmp(a, b, inv pred)
1735 if (match(Src1, PatternMatch::m_Zero()) &&
1737 m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
1738 PatternMatch::m_Value(SrcRHS))))) {
1739 if (CCVal == CmpInst::ICMP_EQ)
1740 SrcPred = CmpInst::getInversePredicate(SrcPred);
1741
1742 Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
1743 ? Intrinsic::amdgcn_fcmp
1744 : Intrinsic::amdgcn_icmp;
1745
1746 Type *Ty = SrcLHS->getType();
1747 if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
1748 // Promote to next legal integer type.
1749 unsigned Width = CmpType->getBitWidth();
1750 unsigned NewWidth = Width;
1751
1752 // Don't do anything for i1 comparisons.
1753 if (Width == 1)
1754 break;
1755
1756 if (Width <= 16)
1757 NewWidth = 16;
1758 else if (Width <= 32)
1759 NewWidth = 32;
1760 else if (Width <= 64)
1761 NewWidth = 64;
1762 else
1763 break; // Can't handle this.
1764
1765 if (Width != NewWidth) {
1766 IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
1767 if (CmpInst::isSigned(SrcPred)) {
1768 SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
1769 SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
1770 } else {
1771 SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
1772 SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
1773 }
1774 }
1775 } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
1776 break;
1777
1778 Value *Args[] = {SrcLHS, SrcRHS,
1779 ConstantInt::get(CC->getType(), SrcPred)};
1780 CallInst *NewCall = IC.Builder.CreateIntrinsic(
1781 NewIID, {II.getType(), SrcLHS->getType()}, Args);
1782 NewCall->takeName(&II);
1783 return IC.replaceInstUsesWith(II, NewCall);
1784 }
1785
1786 break;
1787 }
1788 case Intrinsic::amdgcn_mbcnt_hi:
1789 // exec_hi is all 0, so this is just a copy.
1790 if (ST->isWave32())
1791 return IC.replaceInstUsesWith(II, II.getArgOperand(1));
1792 [[fallthrough]];
1793 case Intrinsic::amdgcn_mbcnt_lo: {
1794 ConstantRange AccRange =
1795 computeConstantRange(II.getArgOperand(1),
1796 /*ForSigned=*/false, IC.getSimplifyQuery());
1797 if (AccRange.isFullSet())
1798 return nullptr;
1799
1800 // TODO: Can raise lower bound by inspecting first argument.
1801 ConstantRange MbcntRange(APInt(32, 0), APInt(32, 32 + 1));
1802 ConstantRange ComputedRange = AccRange.add(MbcntRange);
1803 if (ComputedRange.isFullSet())
1804 return nullptr;
1805
1806 if (std::optional<ConstantRange> ExistingRange = II.getRange()) {
1807 ComputedRange = ComputedRange.intersectWith(*ExistingRange);
1808 if (ComputedRange == *ExistingRange)
1809 return nullptr;
1810 }
1811
1812 II.addRangeRetAttr(ComputedRange);
1813 return nullptr;
1814 }
1815 case Intrinsic::amdgcn_ballot: {
1816 Value *Arg = II.getArgOperand(0);
1817 if (isa<PoisonValue>(Arg))
1818 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1819
1820 if (auto *Src = dyn_cast<ConstantInt>(Arg)) {
1821 if (Src->isZero()) {
1822 // amdgcn.ballot(i1 0) is zero.
1823 return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
1824 }
1825 }
1826 if (ST->isWave32() && II.getType()->getIntegerBitWidth() == 64) {
1827 // %b64 = call i64 ballot.i64(...)
1828 // =>
1829 // %b32 = call i32 ballot.i32(...)
1830 // %b64 = zext i32 %b32 to i64
1832 IC.Builder.CreateIntrinsic(Intrinsic::amdgcn_ballot,
1833 {IC.Builder.getInt32Ty()},
1834 {II.getArgOperand(0)}),
1835 II.getType());
1836 Call->takeName(&II);
1837 return IC.replaceInstUsesWith(II, Call);
1838 }
1839 break;
1840 }
1841 case Intrinsic::amdgcn_wavefrontsize: {
1842 if (ST->isWaveSizeKnown())
1843 return IC.replaceInstUsesWith(
1844 II, ConstantInt::get(II.getType(), ST->getWavefrontSize()));
1845 break;
1846 }
1847 case Intrinsic::amdgcn_wqm_vote: {
1848 // wqm_vote is identity when the argument is constant.
1849 if (!isa<Constant>(II.getArgOperand(0)))
1850 break;
1851
1852 return IC.replaceInstUsesWith(II, II.getArgOperand(0));
1853 }
1854 case Intrinsic::amdgcn_kill: {
1855 const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
1856 if (!C || !C->getZExtValue())
1857 break;
1858
1859 // amdgcn.kill(i1 1) is a no-op
1860 return IC.eraseInstFromFunction(II);
1861 }
1862 case Intrinsic::amdgcn_s_sendmsg:
1863 case Intrinsic::amdgcn_s_sendmsghalt: {
1864 // The second operand is copied to m0, but is only actually used for
1865 // certain message types. For message types that are known to not use m0,
1866 // fold it to poison.
1867 using namespace AMDGPU::SendMsg;
1868
1869 Value *M0Val = II.getArgOperand(1);
1870 if (isa<PoisonValue>(M0Val))
1871 break;
1872
1873 auto *MsgImm = cast<ConstantInt>(II.getArgOperand(0));
1874 uint16_t MsgId, OpId, StreamId;
1875 decodeMsg(MsgImm->getZExtValue(), MsgId, OpId, StreamId, *ST);
1876
1877 if (!msgDoesNotUseM0(MsgId, *ST))
1878 break;
1879
1880 // Drop UB-implying attributes since we're replacing with poison.
1881 II.dropUBImplyingAttrsAndMetadata();
1882 IC.replaceOperand(II, 1, PoisonValue::get(M0Val->getType()));
1883 return nullptr;
1884 }
1885 case Intrinsic::amdgcn_update_dpp: {
1886 Value *Old = II.getArgOperand(0);
1887
1888 auto *BC = cast<ConstantInt>(II.getArgOperand(5));
1889 auto *RM = cast<ConstantInt>(II.getArgOperand(3));
1890 auto *BM = cast<ConstantInt>(II.getArgOperand(4));
1891 if (BC->isNullValue() || RM->getZExtValue() != 0xF ||
1892 BM->getZExtValue() != 0xF || isa<PoisonValue>(Old))
1893 break;
1894
1895 // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value.
1896 return IC.replaceOperand(II, 0, PoisonValue::get(Old->getType()));
1897 }
1898 case Intrinsic::amdgcn_permlane16:
1899 case Intrinsic::amdgcn_permlane16_var:
1900 case Intrinsic::amdgcn_permlanex16:
1901 case Intrinsic::amdgcn_permlanex16_var: {
1902 // Discard vdst_in if it's not going to be read.
1903 Value *VDstIn = II.getArgOperand(0);
1904 if (isa<PoisonValue>(VDstIn))
1905 break;
1906
1907 // FetchInvalid operand idx.
1908 unsigned int FiIdx = (IID == Intrinsic::amdgcn_permlane16 ||
1909 IID == Intrinsic::amdgcn_permlanex16)
1910 ? 4 /* for permlane16 and permlanex16 */
1911 : 3; /* for permlane16_var and permlanex16_var */
1912
1913 // BoundCtrl operand idx.
1914 // For permlane16 and permlanex16 it should be 5
1915 // For Permlane16_var and permlanex16_var it should be 4
1916 unsigned int BcIdx = FiIdx + 1;
1917
1918 ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(FiIdx));
1919 ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(BcIdx));
1920 if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
1921 break;
1922
1923 return IC.replaceOperand(II, 0, PoisonValue::get(VDstIn->getType()));
1924 }
1925 case Intrinsic::amdgcn_wave_shuffle:
1926 return tryOptimizeShufflePattern(IC, II, *ST);
1927 case Intrinsic::amdgcn_permlane64:
1928 case Intrinsic::amdgcn_readfirstlane:
1929 case Intrinsic::amdgcn_readlane:
1930 case Intrinsic::amdgcn_ds_bpermute: {
1931 // If the data argument is uniform these intrinsics return it unchanged.
1932 unsigned SrcIdx = IID == Intrinsic::amdgcn_ds_bpermute ? 1 : 0;
1933 const Use &Src = II.getArgOperandUse(SrcIdx);
1934 if (isTriviallyUniform(Src))
1935 return IC.replaceInstUsesWith(II, Src.get());
1936
1937 if (IID == Intrinsic::amdgcn_readlane &&
1939 return &II;
1940
1941 // If the lane argument of bpermute is uniform, change it to readlane. This
1942 // generates better code and can enable further optimizations because
1943 // readlane is AlwaysUniform.
1944 if (IID == Intrinsic::amdgcn_ds_bpermute) {
1945 const Use &Lane = II.getArgOperandUse(0);
1946 if (isTriviallyUniform(Lane)) {
1947 Value *NewLane = IC.Builder.CreateLShr(Lane, 2);
1949 II.getModule(), Intrinsic::amdgcn_readlane, II.getType());
1950 II.setCalledFunction(NewDecl);
1951 II.setOperand(0, Src);
1952 II.setOperand(1, NewLane);
1953 return &II;
1954 }
1955 }
1956
1957 if (IID == Intrinsic::amdgcn_ds_bpermute)
1958 return tryOptimizeShufflePattern(IC, II, *ST);
1959
1961 return Res;
1962
1963 return std::nullopt;
1964 }
1965 case Intrinsic::amdgcn_writelane: {
1966 // TODO: Fold bitcast like readlane.
1967 if (simplifyDemandedLaneMaskArg(IC, II, 1))
1968 return &II;
1969 return std::nullopt;
1970 }
1971 case Intrinsic::amdgcn_trig_preop: {
1972 // The intrinsic is declared with name mangling, but currently the
1973 // instruction only exists for f64
1974 if (!II.getType()->isDoubleTy())
1975 break;
1976
1977 Value *Src = II.getArgOperand(0);
1978 Value *Segment = II.getArgOperand(1);
1979 if (isa<PoisonValue>(Src) || isa<PoisonValue>(Segment))
1980 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1981
1982 if (isa<UndefValue>(Segment))
1983 return IC.replaceInstUsesWith(II, ConstantFP::getZero(II.getType()));
1984
1985 // Sign bit is not used.
1986 Value *StrippedSign = InstCombiner::stripSignOnlyFPOps(Src);
1987 if (StrippedSign != Src)
1988 return IC.replaceOperand(II, 0, StrippedSign);
1989
1990 if (II.isStrictFP())
1991 break;
1992
1993 const ConstantFP *CSrc = dyn_cast<ConstantFP>(Src);
1994 if (!CSrc && !isa<UndefValue>(Src))
1995 break;
1996
1997 // The instruction ignores special cases, and literally just extracts the
1998 // exponents. Fold undef to nan, and index the table as normal.
1999 APInt FSrcInt = CSrc ? CSrc->getValueAPF().bitcastToAPInt()
2000 : APFloat::getQNaN(II.getType()->getFltSemantics())
2001 .bitcastToAPInt();
2002
2003 const ConstantInt *Cseg = dyn_cast<ConstantInt>(Segment);
2004 if (!Cseg) {
2005 if (isa<UndefValue>(Src))
2006 return IC.replaceInstUsesWith(II, ConstantFP::getZero(II.getType()));
2007 break;
2008 }
2009
2010 unsigned Exponent = FSrcInt.extractBitsAsZExtValue(11, 52);
2011 unsigned SegmentVal = Cseg->getValue().trunc(5).getZExtValue();
2012 unsigned Shift = SegmentVal * 53;
2013 if (Exponent > 1077)
2014 Shift += Exponent - 1077;
2015
2016 // 2.0/PI table.
2017 static const uint32_t TwoByPi[] = {
2018 0xa2f9836e, 0x4e441529, 0xfc2757d1, 0xf534ddc0, 0xdb629599, 0x3c439041,
2019 0xfe5163ab, 0xdebbc561, 0xb7246e3a, 0x424dd2e0, 0x06492eea, 0x09d1921c,
2020 0xfe1deb1c, 0xb129a73e, 0xe88235f5, 0x2ebb4484, 0xe99c7026, 0xb45f7e41,
2021 0x3991d639, 0x835339f4, 0x9c845f8b, 0xbdf9283b, 0x1ff897ff, 0xde05980f,
2022 0xef2f118b, 0x5a0a6d1f, 0x6d367ecf, 0x27cb09b7, 0x4f463f66, 0x9e5fea2d,
2023 0x7527bac7, 0xebe5f17b, 0x3d0739f7, 0x8a5292ea, 0x6bfb5fb1, 0x1f8d5d08,
2024 0x56033046};
2025
2026 // Return 0 for outbound segment (hardware behavior).
2027 unsigned Idx = Shift >> 5;
2028 if (Idx + 2 >= std::size(TwoByPi)) {
2029 APFloat Zero = APFloat::getZero(II.getType()->getFltSemantics());
2030 return IC.replaceInstUsesWith(II, ConstantFP::get(II.getType(), Zero));
2031 }
2032
2033 unsigned BShift = Shift & 0x1f;
2034 uint64_t Thi = Make_64(TwoByPi[Idx], TwoByPi[Idx + 1]);
2035 uint64_t Tlo = Make_64(TwoByPi[Idx + 2], 0);
2036 if (BShift)
2037 Thi = (Thi << BShift) | (Tlo >> (64 - BShift));
2038 Thi = Thi >> 11;
2039 APFloat Result = APFloat((double)Thi);
2040
2041 int Scale = -53 - Shift;
2042 if (Exponent >= 1968)
2043 Scale += 128;
2044
2045 Result = scalbn(Result, Scale, RoundingMode::NearestTiesToEven);
2046 return IC.replaceInstUsesWith(II, ConstantFP::get(Src->getType(), Result));
2047 }
2048 case Intrinsic::amdgcn_fmul_legacy: {
2049 Value *Op0 = II.getArgOperand(0);
2050 Value *Op1 = II.getArgOperand(1);
2051
2052 for (Value *Src : {Op0, Op1}) {
2053 if (isa<PoisonValue>(Src))
2054 return IC.replaceInstUsesWith(II, Src);
2055 }
2056
2057 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
2058 // infinity, gives +0.0.
2059 // TODO: Move to InstSimplify?
2060 if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
2062 return IC.replaceInstUsesWith(II, ConstantFP::getZero(II.getType()));
2063
2064 // If we can prove we don't have one of the special cases then we can use a
2065 // normal fmul instruction instead.
2066 if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) {
2067 auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
2068 FMul->takeName(&II);
2069 return IC.replaceInstUsesWith(II, FMul);
2070 }
2071 break;
2072 }
2073 case Intrinsic::amdgcn_fma_legacy: {
2074 Value *Op0 = II.getArgOperand(0);
2075 Value *Op1 = II.getArgOperand(1);
2076 Value *Op2 = II.getArgOperand(2);
2077
2078 for (Value *Src : {Op0, Op1, Op2}) {
2079 if (isa<PoisonValue>(Src))
2080 return IC.replaceInstUsesWith(II, Src);
2081 }
2082
2083 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
2084 // infinity, gives +0.0.
2085 // TODO: Move to InstSimplify?
2086 if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
2088 // It's tempting to just return Op2 here, but that would give the wrong
2089 // result if Op2 was -0.0.
2090 auto *Zero = ConstantFP::getZero(II.getType());
2091 auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
2092 FAdd->takeName(&II);
2093 return IC.replaceInstUsesWith(II, FAdd);
2094 }
2095
2096 // If we can prove we don't have one of the special cases then we can use a
2097 // normal fma instead.
2098 if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) {
2099 II.setCalledOperand(Intrinsic::getOrInsertDeclaration(
2100 II.getModule(), Intrinsic::fma, II.getType()));
2101 return &II;
2102 }
2103 break;
2104 }
2105 case Intrinsic::amdgcn_is_shared:
2106 case Intrinsic::amdgcn_is_private: {
2107 Value *Src = II.getArgOperand(0);
2108 if (isa<PoisonValue>(Src))
2109 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
2110 if (isa<UndefValue>(Src))
2111 return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
2112
2113 if (isa<ConstantPointerNull>(II.getArgOperand(0)))
2114 return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType()));
2115 break;
2116 }
2117 case Intrinsic::amdgcn_make_buffer_rsrc: {
2118 Value *Src = II.getArgOperand(0);
2119 if (isa<PoisonValue>(Src))
2120 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
2121 return std::nullopt;
2122 }
2123 case Intrinsic::amdgcn_raw_buffer_store_format:
2124 case Intrinsic::amdgcn_struct_buffer_store_format:
2125 case Intrinsic::amdgcn_raw_tbuffer_store:
2126 case Intrinsic::amdgcn_struct_tbuffer_store:
2127 case Intrinsic::amdgcn_image_store_1d:
2128 case Intrinsic::amdgcn_image_store_1darray:
2129 case Intrinsic::amdgcn_image_store_2d:
2130 case Intrinsic::amdgcn_image_store_2darray:
2131 case Intrinsic::amdgcn_image_store_2darraymsaa:
2132 case Intrinsic::amdgcn_image_store_2dmsaa:
2133 case Intrinsic::amdgcn_image_store_3d:
2134 case Intrinsic::amdgcn_image_store_cube:
2135 case Intrinsic::amdgcn_image_store_mip_1d:
2136 case Intrinsic::amdgcn_image_store_mip_1darray:
2137 case Intrinsic::amdgcn_image_store_mip_2d:
2138 case Intrinsic::amdgcn_image_store_mip_2darray:
2139 case Intrinsic::amdgcn_image_store_mip_3d:
2140 case Intrinsic::amdgcn_image_store_mip_cube: {
2141 if (!isa<FixedVectorType>(II.getArgOperand(0)->getType()))
2142 break;
2143
2144 APInt DemandedElts;
2145 if (ST->hasDefaultComponentBroadcast())
2146 DemandedElts = defaultComponentBroadcast(II.getArgOperand(0));
2147 else if (ST->hasDefaultComponentZero())
2148 DemandedElts = trimTrailingZerosInVector(IC, II.getArgOperand(0), &II);
2149 else
2150 break;
2151
2152 int DMaskIdx = getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID()) ? 1 : -1;
2153 if (simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx,
2154 false)) {
2155 return IC.eraseInstFromFunction(II);
2156 }
2157
2158 break;
2159 }
2160 case Intrinsic::amdgcn_prng_b32: {
2161 auto *Src = II.getArgOperand(0);
2162 if (isa<UndefValue>(Src)) {
2163 return IC.replaceInstUsesWith(II, Src);
2164 }
2165 return std::nullopt;
2166 }
2167 case Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
2168 case Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4: {
2169 Value *Src0 = II.getArgOperand(0);
2170 Value *Src1 = II.getArgOperand(1);
2171 uint64_t CBSZ = cast<ConstantInt>(II.getArgOperand(3))->getZExtValue();
2172 uint64_t BLGP = cast<ConstantInt>(II.getArgOperand(4))->getZExtValue();
2173 auto *Src0Ty = cast<FixedVectorType>(Src0->getType());
2174 auto *Src1Ty = cast<FixedVectorType>(Src1->getType());
2175
2176 auto getFormatNumRegs = [](unsigned FormatVal) {
2177 switch (FormatVal) {
2180 return 6u;
2182 return 4u;
2185 return 8u;
2186 default:
2187 llvm_unreachable("invalid format value");
2188 }
2189 };
2190
2191 bool MadeChange = false;
2192 unsigned Src0NumElts = getFormatNumRegs(CBSZ);
2193 unsigned Src1NumElts = getFormatNumRegs(BLGP);
2194
2195 // Depending on the used format, fewer registers are required so shrink the
2196 // vector type.
2197 if (Src0Ty->getNumElements() > Src0NumElts) {
2198 Src0 = IC.Builder.CreateExtractVector(
2199 FixedVectorType::get(Src0Ty->getElementType(), Src0NumElts), Src0,
2200 uint64_t(0));
2201 MadeChange = true;
2202 }
2203
2204 if (Src1Ty->getNumElements() > Src1NumElts) {
2205 Src1 = IC.Builder.CreateExtractVector(
2206 FixedVectorType::get(Src1Ty->getElementType(), Src1NumElts), Src1,
2207 uint64_t(0));
2208 MadeChange = true;
2209 }
2210
2211 if (!MadeChange)
2212 return std::nullopt;
2213
2214 SmallVector<Value *, 10> Args(II.args());
2215 Args[0] = Src0;
2216 Args[1] = Src1;
2217
2218 CallInst *NewII = IC.Builder.CreateIntrinsic(
2219 IID, {Src0->getType(), Src1->getType()}, Args, &II);
2220 NewII->takeName(&II);
2221 return IC.replaceInstUsesWith(II, NewII);
2222 }
2223 case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4:
2224 case Intrinsic::amdgcn_wmma_scale_f32_16x16x128_f8f6f4:
2225 case Intrinsic::amdgcn_wmma_scale16_f32_16x16x128_f8f6f4: {
2226 Value *Src0 = II.getArgOperand(1);
2227 Value *Src1 = II.getArgOperand(3);
2228 unsigned FmtA = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
2229 uint64_t FmtB = cast<ConstantInt>(II.getArgOperand(2))->getZExtValue();
2230 auto *Src0Ty = cast<FixedVectorType>(Src0->getType());
2231 auto *Src1Ty = cast<FixedVectorType>(Src1->getType());
2232
2233 bool MadeChange = false;
2234 unsigned Src0NumElts = AMDGPU::wmmaScaleF8F6F4FormatToNumRegs(FmtA);
2235 unsigned Src1NumElts = AMDGPU::wmmaScaleF8F6F4FormatToNumRegs(FmtB);
2236
2237 // Depending on the used format, fewer registers are required so shrink the
2238 // vector type.
2239 if (Src0Ty->getNumElements() > Src0NumElts) {
2240 Src0 = IC.Builder.CreateExtractVector(
2241 FixedVectorType::get(Src0Ty->getElementType(), Src0NumElts), Src0,
2242 IC.Builder.getInt64(0));
2243 MadeChange = true;
2244 }
2245
2246 if (Src1Ty->getNumElements() > Src1NumElts) {
2247 Src1 = IC.Builder.CreateExtractVector(
2248 FixedVectorType::get(Src1Ty->getElementType(), Src1NumElts), Src1,
2249 IC.Builder.getInt64(0));
2250 MadeChange = true;
2251 }
2252
2253 if (!MadeChange)
2254 return std::nullopt;
2255
2256 SmallVector<Value *, 13> Args(II.args());
2257 Args[1] = Src0;
2258 Args[3] = Src1;
2259
2260 CallInst *NewII = IC.Builder.CreateIntrinsic(
2261 IID, {II.getArgOperand(5)->getType(), Src0->getType(), Src1->getType()},
2262 Args, &II);
2263 NewII->takeName(&II);
2264 return IC.replaceInstUsesWith(II, NewII);
2265 }
2266 }
2267 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
2268 AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
2269 return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
2270 }
2271 return std::nullopt;
2272}
2273
2274/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
2275///
2276/// The result of simplifying amdgcn image and buffer store intrinsics is an
2277/// updated definition of the intrinsic's vector argument, not updated Uses of
2278/// the result as is the case for image and buffer loads.
2279/// Note: This only supports non-TFE/LWE image intrinsic calls; TFE/LWE calls
2280/// have struct returns.
///
/// DemandedElts is the mask of vector elements that are actually needed.
/// DMaskIdx is the argument index of the image dmask operand; a negative value
/// selects the buffer path. IsLoad chooses whether the vector type is taken
/// from the call's result (load) or from operand 0 (store).
///
/// Returns nullptr when no replacement is created, a poison vector when no
/// element is demanded, and otherwise either the narrowed call (store) or a
/// value of the original vector type rebuilt from the narrowed call (load).
// NOTE(review): listing lines 2281-2282, which hold the start of the
// signature, are missing from this rendering; only its tail is visible below.
2283 APInt DemandedElts,
2284 int DMaskIdx, bool IsLoad) {
2285
2286 auto *IIVTy = cast<FixedVectorType>(IsLoad ? II.getType()
2287 : II.getOperand(0)->getType());
2288 unsigned VWidth = IIVTy->getNumElements();
     // A single-element vector cannot be narrowed any further.
2289 if (VWidth == 1)
2290 return nullptr;
2291 Type *EltTy = IIVTy->getElementType();
2292
     // NOTE(review): listing lines 2293-2294 are missing from this rendering;
     // whatever statements they contain are not visible here.
2295
2296 // Assume the arguments are unchanged and later override them, if needed.
2297 SmallVector<Value *, 16> Args(II.args());
2298
2299 if (DMaskIdx < 0) {
2300 // Buffer case.
2301
2302 const unsigned ActiveBits = DemandedElts.getActiveBits();
2303 const unsigned UnusedComponentsAtFront = DemandedElts.countr_zero();
2304
2305 // Start assuming the prefix of elements is demanded, but possibly clear
2306 // some other bits if there are trailing zeros (unused components at front)
2307 // and update offset.
     // NOTE(review): `1 << ActiveBits` is evaluated in int arithmetic before
     // the assignment; presumably vector widths here stay far below 31 --
     // confirm.
2308 DemandedElts = (1 << ActiveBits) - 1;
2309
2310 if (UnusedComponentsAtFront > 0) {
     // Sentinel meaning "do not rewrite the offset for this intrinsic".
2311 static const unsigned InvalidOffsetIdx = 0xf;
2312
     // Pick the argument index of the offset operand for each intrinsic.
2313 unsigned OffsetIdx;
2314 switch (II.getIntrinsicID()) {
2315 case Intrinsic::amdgcn_raw_buffer_load:
2316 case Intrinsic::amdgcn_raw_ptr_buffer_load:
2317 OffsetIdx = 1;
2318 break;
2319 case Intrinsic::amdgcn_s_buffer_load:
2320 // If resulting type is vec3, there is no point in trimming the
2321 // load with updated offset, as the vec3 would most likely be widened to
2322 // vec4 anyway during lowering.
2323 if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
2324 OffsetIdx = InvalidOffsetIdx;
2325 else
2326 OffsetIdx = 1;
2327 break;
2328 case Intrinsic::amdgcn_struct_buffer_load:
2329 case Intrinsic::amdgcn_struct_ptr_buffer_load:
2330 OffsetIdx = 2;
2331 break;
2332 default:
2333 // TODO: handle tbuffer* intrinsics.
2334 OffsetIdx = InvalidOffsetIdx;
2335 break;
2336 }
2337
2338 if (OffsetIdx != InvalidOffsetIdx) {
2339 // Clear demanded bits and update the offset.
2340 DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
2341 auto *Offset = Args[OffsetIdx];
2342 unsigned SingleComponentSizeInBits =
2343 IC.getDataLayout().getTypeSizeInBits(EltTy);
     // Byte size of the skipped leading components (bits / 8).
2344 unsigned OffsetAdd =
2345 UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
2346 auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
2347 Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
2348 }
2349 }
2350 } else {
2351 // Image case.
2352
     // Only the low four bits of the dmask operand are considered.
2353 ConstantInt *DMask = cast<ConstantInt>(Args[DMaskIdx]);
2354 unsigned DMaskVal = DMask->getZExtValue() & 0xf;
2355
2356 // dmask 0 has special semantics, do not simplify.
2357 if (DMaskVal == 0)
2358 return nullptr;
2359
2360 // Mask off values that are undefined because the dmask doesn't cover them
2361 DemandedElts &= (1 << llvm::popcount(DMaskVal)) - 1;
2362
     // Walk the four dmask bits. Each set bit corresponds to the next vector
     // element (OrigLdStIdx); the bit is kept only if that element is demanded.
2363 unsigned NewDMaskVal = 0;
2364 unsigned OrigLdStIdx = 0;
2365 for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
2366 const unsigned Bit = 1 << SrcIdx;
2367 if (!!(DMaskVal & Bit)) {
2368 if (!!DemandedElts[OrigLdStIdx])
2369 NewDMaskVal |= Bit;
2370 OrigLdStIdx++;
2371 }
2372 }
2373
2374 if (DMaskVal != NewDMaskVal)
2375 Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
2376 }
2377
     // Nothing is demanded at all: hand back poison of the vector type.
2378 unsigned NewNumElts = DemandedElts.popcount();
2379 if (!NewNumElts)
2380 return PoisonValue::get(IIVTy);
2381
     // The vector cannot shrink; at most the dmask operand is updated in place
     // and no replacement value is produced.
2382 if (NewNumElts >= VWidth && DemandedElts.isMask()) {
2383 if (DMaskIdx >= 0)
2384 II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
2385 return nullptr;
2386 }
2387
2388 // Validate function argument and return types, extracting overloaded types
2389 // along the way.
2390 SmallVector<Type *, 6> OverloadTys;
2391 if (!Intrinsic::isSignatureValid(II.getCalledFunction(), OverloadTys))
2392 return nullptr;
2393
     // A single remaining element uses the scalar element type instead of a
     // one-element vector.
     // NOTE(review): assumes overload type 0 is the data vector type -- confirm.
2394 Type *NewTy =
2395 (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);
2396 OverloadTys[0] = NewTy;
2397
2398 if (!IsLoad) {
     // Store: collect the indices of the demanded elements and pull exactly
     // those out of the original data operand.
2399 SmallVector<int, 8> EltMask;
2400 for (unsigned OrigStoreIdx = 0; OrigStoreIdx < VWidth; ++OrigStoreIdx)
2401 if (DemandedElts[OrigStoreIdx])
2402 EltMask.push_back(OrigStoreIdx);
2403
2404 if (NewNumElts == 1)
2405 Args[0] = IC.Builder.CreateExtractElement(II.getOperand(0), EltMask[0]);
2406 else
2407 Args[0] = IC.Builder.CreateShuffleVector(II.getOperand(0), EltMask);
2408 }
2409
     // Emit the narrowed call and carry over name, metadata and attributes.
2410 CallInst *NewCall =
2411 IC.Builder.CreateIntrinsic(II.getIntrinsicID(), OverloadTys, Args);
2412 NewCall->takeName(&II);
2413 NewCall->copyMetadata(II);
2414 AttributeList OldAttrList = II.getAttributes();
2415 NewCall->setAttributes(OldAttrList);
2416
2417 if (IsLoad) {
     // Load: rebuild a value of the original vector type from the narrowed
     // result. A scalar result is inserted at the index of the single
     // demanded element.
2418 if (NewNumElts == 1) {
2419 return IC.Builder.CreateInsertElement(PoisonValue::get(IIVTy), NewCall,
2420 DemandedElts.countr_zero());
2421 }
2422
     // Demanded lanes take consecutive elements of the narrowed result. Other
     // lanes use index NewNumElts, one past the narrowed vector --
     // NOTE(review): presumably this selects a poison lane; confirm.
2423 SmallVector<int, 8> EltMask;
2424 unsigned NewLoadIdx = 0;
2425 for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
2426 if (!!DemandedElts[OrigLoadIdx])
2427 EltMask.push_back(NewLoadIdx++);
2428 else
2429 EltMask.push_back(NewNumElts);
2430 }
2431
2432 auto *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);
2433
2434 return Shuffle;
2435 }
2436
     // Store: the narrowed call itself is the result.
2437 return NewCall;
2438}
2439
// Narrow a vector-typed lane intrinsic call to the contiguous element range
// that spans the demanded elements, then rebuild a value of the original
// vector type from the narrowed call. Returns nullptr when the call is left
// alone (non-fixed-vector result, full range already demanded, or the narrowed
// type is not legal).
// NOTE(review): listing line 2440 (start of the signature) is missing from
// this rendering. UndefElts is not referenced in any visible line.
2441 InstCombiner &IC, IntrinsicInst &II, const APInt &DemandedElts,
2442 APInt &UndefElts) const {
2443 auto *VT = dyn_cast<FixedVectorType>(II.getType());
2444 if (!VT)
2445 return nullptr;
2446
     // [FirstElt, LastElt] is the smallest contiguous window covering every
     // demanded element; MaskLen is its length.
     // NOTE(review): with an all-zero DemandedElts, `getActiveBits() - 1`
     // would wrap as unsigned; presumably callers never pass that -- confirm.
2447 const unsigned FirstElt = DemandedElts.countr_zero();
2448 const unsigned LastElt = DemandedElts.getActiveBits() - 1;
2449 const unsigned MaskLen = LastElt - FirstElt + 1;
2450
     // The window already spans the whole vector (and it is not a one-element
     // vector): nothing to narrow.
2451 unsigned OldNumElts = VT->getNumElements();
2452 if (MaskLen == OldNumElts && MaskLen != 1)
2453 return nullptr;
2454
     // A window of one element is handled as a scalar of the element type.
2455 Type *EltTy = VT->getElementType();
2456 Type *NewVT = MaskLen == 1 ? EltTy : FixedVectorType::get(EltTy, MaskLen);
2457
2458 // Theoretically we should support these intrinsics for any legal type. Avoid
2459 // introducing cases that aren't direct register types like v3i16.
2460 if (!isTypeLegal(NewVT))
2461 return nullptr;
2462
2463 Value *Src = II.getArgOperand(0);
2464
2465 // Make sure convergence tokens are preserved.
2466 // TODO: CreateIntrinsic should allow directly copying bundles
     // NOTE(review): listing line 2467, which declares OpBundles, is missing
     // from this rendering.
2468 II.getOperandBundlesAsDefs(OpBundles);
2469
     // NOTE(review): listing line 2470, which declares M, is missing from this
     // rendering.
2471 Function *Remangled =
2472 Intrinsic::getOrInsertDeclaration(M, II.getIntrinsicID(), {NewVT});
2473
2474 if (MaskLen == 1) {
     // Scalar path: extract the one element, call the scalar overload, and
     // insert the result back at the same index of a poison vector.
2475 Value *Extract = IC.Builder.CreateExtractElement(Src, FirstElt);
2476
2477 // TODO: Preserve callsite attributes?
2478 CallInst *NewCall = IC.Builder.CreateCall(Remangled, {Extract}, OpBundles);
2479
2480 return IC.Builder.CreateInsertElement(PoisonValue::get(II.getType()),
2481 NewCall, FirstElt);
2482 }
2483
     // Vector path: window lane I reads source element FirstElt + I when that
     // element is demanded; other lanes keep the -1 mask entry.
2484 SmallVector<int> ExtractMask(MaskLen, -1);
2485 for (unsigned I = 0; I != MaskLen; ++I) {
2486 if (DemandedElts[FirstElt + I])
2487 ExtractMask[I] = FirstElt + I;
2488 }
2489
2490 Value *Extract = IC.Builder.CreateShuffleVector(Src, ExtractMask);
2491
2492 // TODO: Preserve callsite attributes?
2493 CallInst *NewCall = IC.Builder.CreateCall(Remangled, {Extract}, OpBundles);
2494
     // Inverse mapping: place narrowed lane I back at original position
     // FirstElt + I; all other result lanes keep the -1 mask entry.
2495 SmallVector<int> InsertMask(OldNumElts, -1);
2496 for (unsigned I = 0; I != MaskLen; ++I) {
2497 if (DemandedElts[FirstElt + I])
2498 InsertMask[FirstElt + I] = I;
2499 }
2500
2501 // FIXME: If the call has a convergence bundle, we end up leaving the dead
2502 // call behind.
2503 return IC.Builder.CreateShuffleVector(NewCall, InsertMask);
2504}
2505
// Dispatch demanded-vector-element simplification by intrinsic ID. Returns the
// value produced by the matching helper (which may be a null pointer), or
// std::nullopt when the intrinsic is not handled here.
// NOTE(review): listing line 2506 (start of the signature) is missing from
// this rendering. UndefElts2 and UndefElts3 are not referenced in any visible
// line.
2507 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
2508 APInt &UndefElts2, APInt &UndefElts3,
2509 std::function<void(Instruction *, unsigned, APInt, APInt &)>
2510 SimplifyAndSetOp) const {
2511 switch (II.getIntrinsicID()) {
2512 case Intrinsic::amdgcn_readfirstlane:
     // Run the callback on operand 0 first, then try to narrow the call.
2513 SimplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
2514 return simplifyAMDGCNLaneIntrinsicDemanded(IC, II, DemandedElts, UndefElts);
     // Buffer loads: the helper is called without DMaskIdx/IsLoad, so its
     // default arguments apply.
2515 case Intrinsic::amdgcn_raw_buffer_load:
2516 case Intrinsic::amdgcn_raw_ptr_buffer_load:
2517 case Intrinsic::amdgcn_raw_buffer_load_format:
2518 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
2519 case Intrinsic::amdgcn_raw_tbuffer_load:
2520 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
2521 case Intrinsic::amdgcn_s_buffer_load:
2522 case Intrinsic::amdgcn_struct_buffer_load:
2523 case Intrinsic::amdgcn_struct_ptr_buffer_load:
2524 case Intrinsic::amdgcn_struct_buffer_load_format:
2525 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
2526 case Intrinsic::amdgcn_struct_tbuffer_load:
2527 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
2528 return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
2529 default: {
     // Intrinsics found in the image dmask table pass 0 as DMaskIdx.
2530 if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
2531 return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
2532 }
2533 break;
2534 }
2535 }
2536 return std::nullopt;
2537}
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static Value * createPermlane16(IRBuilderBase &B, Value *Val, uint32_t Lo, uint32_t Hi)
Emit v_permlane16 with the precomputed lane-select halves.
static std::optional< unsigned > matchRowSharePattern(ArrayRef< uint8_t > Ids)
Match a row-share pattern: all 16 lanes of each row read the same source lane.
static bool matchMirrorPattern(ArrayRef< uint8_t > Ids)
Match an N-lane reversal (mirror) pattern.
static bool tryBuildShuffleMap(Value *Index, const GCNSubtarget &ST, SmallVectorImpl< uint8_t > &Ids, const DataLayout &DL)
Build the per-lane shuffle map by evaluating Index for every lane in the wave.
static std::optional< unsigned > matchQuadPermPattern(ArrayRef< uint8_t > Ids)
Match a 4-lane (quad) permutation, encoded as the v_mov_b32_dpp QUAD_PERM control word: bits[1:0]=Ids...
static std::optional< unsigned > matchDsSwizzleRotatePattern(ArrayRef< uint8_t > Ids)
Match a GFX9+ DS_SWIZZLE rotate-mode permutation: a cyclic left-rotation of all 32 lanes within each ...
static std::optional< unsigned > matchHalfRowPermPattern(ArrayRef< uint8_t > Ids)
Match an 8-lane arbitrary permutation, encoded as the v_mov_b32_dpp8 24-bit selector (three bits per ...
static std::optional< unsigned > matchRowXMaskPattern(ArrayRef< uint8_t > Ids)
Match an XOR mask pattern within each 16-lane row: Ids[J] == Mask ^ J, with Mask in [1,...
static constexpr auto matchHalfRowMirrorPattern
static Value * createPermlaneX16(IRBuilderBase &B, Value *Val, uint32_t Lo, uint32_t Hi)
Emit v_permlanex16 with the precomputed lane-select halves.
static bool isRowPattern(ArrayRef< uint8_t > Ids)
Match an N-lane row pattern: each lane in [0, N) reads from a source lane in the same N-lane row,...
static bool canContractSqrtToRsq(const FPMathOperator *SqrtOp)
Return true if it's legal to contract llvm.amdgcn.rcp(llvm.sqrt)
static bool isTriviallyUniform(const Use &U)
Return true if we can easily prove that use U is uniform.
static CallInst * rewriteCall(IRBuilderBase &B, CallInst &Old, Function &NewCallee, ArrayRef< Value * > Ops)
static Value * convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder)
static constexpr auto isFullRowPattern
static constexpr auto isQuadPattern
static APInt trimTrailingZerosInVector(InstCombiner &IC, Value *UseV, Instruction *I)
static uint64_t computePermlane16Masks(ArrayRef< uint8_t > Ids)
Pack a 16-lane permutation into a single 64-bit value: four bits per output lane, lane J in bits [J*4...
static bool matchHalfWaveSwapPattern(ArrayRef< uint8_t > Ids)
Match a half-wave swap: lane J reads from lane J ^ 32.
static bool hasPeriodicLayout(ArrayRef< uint8_t > Ids)
Lanes are partitioned into groups of Period; each group is a translated copy of the first: Ids[I] = I...
static std::optional< Instruction * > tryOptimizeShufflePattern(InstCombiner &IC, IntrinsicInst &II, const GCNSubtarget &ST)
Try to fold a wave_shuffle/ds_bpermute whose lane index is a constant function of the lane ID into a ...
static constexpr auto isHalfRowPattern
static APInt defaultComponentBroadcast(Value *V)
static std::optional< unsigned > matchDsSwizzleBitmaskPattern(ArrayRef< uint8_t > Ids)
Match a DS_SWIZZLE bitmask-mode permutation: dst_lane = ((src_lane & AND) | OR) ^ XOR with each mask ...
static Value * createDsSwizzle(IRBuilderBase &B, Value *Val, unsigned Offset, const DataLayout &DL)
Emit ds_swizzle with the given immediate, bitcasting/converting between pointer/float types and i32 a...
static std::optional< Instruction * > modifyIntrinsicCall(IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr, InstCombiner &IC, std::function< void(SmallVectorImpl< Value * > &, SmallVectorImpl< Type * > &)> Func)
Applies Func(OldIntr.Args, OldIntr.ArgTys), creates intrinsic call with modified arguments (based on ...
static Value * matchShuffleToHWIntrinsic(IRBuilderBase &B, Value *Src, ArrayRef< uint8_t > Ids, const GCNSubtarget &ST, const DataLayout &DL)
Given a shuffle map, try to emit the best hardware intrinsic.
static std::optional< unsigned > matchRowRotatePattern(ArrayRef< uint8_t > Ids)
Match a 16-lane cyclic rotation; returns the rotation amount in [1, 15].
static bool isCrossRowPattern(ArrayRef< uint8_t > Ids)
Match a cross-row permutation suitable for v_permlanex16: every lane in the low 16-lane half reads fr...
static bool isThreadID(const GCNSubtarget &ST, Value *V)
static Value * createUpdateDpp(IRBuilderBase &B, Value *Val, unsigned Ctrl)
Emit v_mov_b32_dpp with the given control word, row/bank masks 0xF, and bound_ctrl=1 so out-of-bounds...
static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1, const APFloat &Src2)
static Value * simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, int DMaskIdx=-1, bool IsLoad=true)
Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
static std::optional< Instruction * > simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr, IntrinsicInst &II, InstCombiner &IC)
static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat)
static Value * createMovDpp8(IRBuilderBase &B, Value *Val, unsigned Selector)
Emit v_mov_b32_dpp8 with the given 24-bit lane selector.
static Value * matchFPExtFromF16(Value *Arg)
Match an fpext from half to float, or a constant we can convert.
static constexpr auto matchFullRowMirrorPattern
static std::optional< unsigned > evalLaneExpr(Value *V, unsigned Lane, const GCNSubtarget &ST, const DataLayout &DL, unsigned Depth=0)
Evaluate V as a function of the lane ID and return its value on Lane, or std::nullopt if V is not a c...
static Value * createPermlane64(IRBuilderBase &B, Value *Val)
Emit v_permlane64 (swap of the two 32-lane halves of a wave64).
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
This file provides a TargetTransformInfoImplBase conforming object specific to the AMDGPU target machine.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
#define X(NUM, ENUM, NAME)
Definition ELF.h:853
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
This file provides the interface for the instcombine pass implementation.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define I(x, y, z)
Definition MD5.cpp:57
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
if(PassOpts->AAPipeline)
This file contains some templates that are useful if you are working with the STL at all.
Provides some synthesis utilities to produce sequences of values.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
Value * RHS
Value * LHS
cmpResult
IEEE-754R 5.11: Floating Point Comparison Relations.
Definition APFloat.h:334
static constexpr roundingMode rmTowardZero
Definition APFloat.h:348
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:344
static const fltSemantics & IEEEhalf()
Definition APFloat.h:294
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition APFloat.h:1179
opStatus divide(const APFloat &RHS, roundingMode RM)
Definition APFloat.h:1267
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition APFloat.cpp:5912
bool isPosInfinity() const
Definition APFloat.h:1551
const fltSemantics & getSemantics() const
Definition APFloat.h:1546
APFloat makeQuiet() const
Assuming this is an IEEE-754 NaN value, quiet its signaling bit.
Definition APFloat.h:1375
bool isNaN() const
Definition APFloat.h:1536
bool isSignaling() const
Definition APFloat.h:1540
APInt bitcastToAPInt() const
Definition APFloat.h:1430
bool isNegInfinity() const
Definition APFloat.h:1552
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition APFloat.h:1138
cmpResult compare(const APFloat &RHS) const
Definition APFloat.h:1481
bool isInfinity() const
Definition APFloat.h:1535
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition APInt.h:1429
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1563
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1693
LLVM_ABI uint64_t extractBitsAsZExtValue(unsigned numBits, unsigned bitPosition) const
Definition APInt.cpp:521
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1535
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition APInt.cpp:968
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1662
bool isMask(unsigned numBits) const
Definition APInt.h:489
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Definition ArrayRef.h:218
size_t size() const
Get the array size.
Definition ArrayRef.h:141
static LLVM_ABI Attribute getWithDereferenceableBytes(LLVMContext &Context, uint64_t Bytes)
LLVM_ABI const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does no...
bool isTypeLegal(Type *Ty) const override
void addFnAttr(Attribute::AttrKind Kind)
Adds the attribute to the function.
LLVM_ABI void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
void setAttributes(AttributeList A)
Set the attributes for this call.
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
AttributeList getAttributes() const
Return the attributes for this call.
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:740
@ ICMP_NE
not equal
Definition InstrTypes.h:762
bool isSigned() const
Definition InstrTypes.h:993
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Definition InstrTypes.h:890
bool isFPPredicate() const
Definition InstrTypes.h:845
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:852
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
ConstantFP - Floating Point Values [float, double].
Definition Constants.h:420
const APFloat & getValueAPF() const
Definition Constants.h:463
static LLVM_ABI ConstantFP * getZero(Type *Ty, bool Negative=false)
static LLVM_ABI ConstantFP * getNaN(Type *Ty, bool Negative=false, uint64_t Payload=0)
static LLVM_ABI ConstantFP * getInfinity(Type *Ty, bool Negative=false)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:168
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:159
This class represents a range of values.
LLVM_ABI ConstantRange add(const ConstantRange &Other) const
Return a new range representing the possible values resulting from an addition of a value in this ran...
LLVM_ABI bool isFullSet() const
Return true if this set contains all of the elements possible for this data-type.
LLVM_ABI ConstantRange intersectWith(const ConstantRange &CR, PreferredRangeType Type=Smallest) const
Return the range that results from the intersection of this range with another range.
This is an important base class in LLVM.
Definition Constant.h:43
bool isNullValue() const
Return true if this is the value that would be returned by getNullValue.
Definition Constant.h:64
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string, and methods for querying it.
Definition DataLayout.h:64
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition DataLayout.h:791
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Tagged union holding either a T or an Error.
Definition Error.h:485
This class represents an extension of floating point types.
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:202
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
Definition Operator.h:291
bool hasApproxFunc() const
Test if this operation allows approximations of math library functions or intrinsics.
Definition Operator.h:288
LLVM_ABI float getFPAccuracy() const
Get the maximum error permitted by this operation in ULPs.
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
bool allowContract() const
Definition FMF.h:69
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:869
bool simplifyDemandedLaneMaskArg(InstCombiner &IC, IntrinsicInst &II, unsigned LaneAgIdx) const
Simplify a lane index operand (e.g.
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override
Instruction * hoistLaneIntrinsicThroughOperand(InstCombiner &IC, IntrinsicInst &II) const
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const override
KnownIEEEMode fpenvIEEEMode(const Instruction &I) const
Return KnownIEEEMode::On if we know if the use context can assume "amdgpu-ieee"="true" and KnownIEEEM...
Value * simplifyAMDGCNLaneIntrinsicDemanded(InstCombiner &IC, IntrinsicInst &II, const APInt &DemandedElts, APInt &UndefElts) const
bool canSimplifyLegacyMulToMul(const Instruction &I, const Value *Op0, const Value *Op1, InstCombiner &IC) const
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
CallInst * CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name="")
Create a call to the vector.extract intrinsic.
Definition IRBuilder.h:1135
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2637
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2625
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition IRBuilder.h:599
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > OverloadTypes, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="", ArrayRef< OperandBundleDef > OpBundles={})
Create a call to intrinsic ID with Args, mangled using OverloadTypes.
Value * CreateSExt(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2142
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition IRBuilder.h:1554
BasicBlock * GetInsertBlock() const
Definition IRBuilder.h:201
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2388
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition IRBuilder.h:534
Value * CreateMaxNum(Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create call to the maxnum intrinsic.
Definition IRBuilder.h:1066
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1533
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition IRBuilder.h:2130
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2659
Value * CreateMaximumNum(Value *LHS, Value *RHS, const Twine &Name="")
Create call to the maximumnum intrinsic.
Definition IRBuilder.h:1094
Value * CreateMinNum(Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create call to the minnum intrinsic.
Definition IRBuilder.h:1054
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1444
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2563
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
Value * CreateFAddFMF(Value *L, Value *R, FMFSource FMFSource, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1663
Value * CreateMinimumNum(Value *LHS, Value *RHS, const Twine &Name="")
Create call to the minimumnum intrinsic.
Definition IRBuilder.h:1088
Value * CreateAShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition IRBuilder.h:1573
Value * CreateFMulFMF(Value *L, Value *R, FMFSource FMFSource, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1701
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2868
The core instruction combiner logic.
const DataLayout & getDataLayout() const
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
IRBuilder< TargetFolder, IRBuilderCallbackInserter > BuilderTy
An IRBuilder that automatically inserts new instructions into the worklist.
DominatorTree & getDominatorTree() const
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
virtual bool SimplifyDemandedBits(Instruction *I, unsigned OpNo, const APInt &DemandedMask, KnownBits &Known, const SimplifyQuery &Q, unsigned Depth=0)=0
static Value * stripSignOnlyFPOps(Value *Val)
Ignore all operations which only change the sign of a value, returning the underlying magnitude value...
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
BuilderTy & Builder
const SimplifyQuery & getSimplifyQuery() const
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void copyFastMathFlags(FastMathFlags FMF)
Convenience function for transferring all fast-math flag values to this instruction,...
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
Metadata node.
Definition Metadata.h:1075
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1567
static LLVM_ABI MDString * get(LLVMContext &Context, StringRef Str)
Definition Metadata.cpp:614
static LLVM_ABI MetadataAsValue * get(LLVMContext &Context, Metadata *MD)
Definition Metadata.cpp:110
Root of the metadata hierarchy.
Definition Metadata.h:64
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:282
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:155
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:368
static LLVM_ABI IntegerType * getInt16Ty(LLVMContext &C)
Definition Type.cpp:308
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:144
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
static LLVM_ABI Type * getHalfTy(LLVMContext &C)
Definition Type.cpp:284
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:141
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
const Use & getOperandUse(unsigned i) const
Definition User.h:220
void setOperand(unsigned i, Value *Val)
Definition User.h:212
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
LLVM_ABI bool hasOneUser() const
Return true if there is exactly one user of this value.
Definition Value.cpp:162
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:258
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:399
const ParentTy * getParent() const
Definition ilist_node.h:34
CallInst * Call
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_READONLY const MIMGOffsetMappingInfo * getMIMGOffsetMappingInfo(unsigned Offset)
uint8_t wmmaScaleF8F6F4FormatToNumRegs(unsigned Fmt)
const ImageDimIntrinsicInfo * getImageDimIntrinsicByBaseOpcode(unsigned BaseOpcode, unsigned Dim)
LLVM_READONLY const MIMGMIPMappingInfo * getMIMGMIPMappingInfo(unsigned MIP)
bool isArgPassedInSGPR(const Argument *A)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
LLVM_READONLY const MIMGBiasMappingInfo * getMIMGBiasMappingInfo(unsigned Bias)
LLVM_READONLY const MIMGLZMappingInfo * getMIMGLZMappingInfo(unsigned L)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
LLVM_ABI bool isSignatureValid(Intrinsic::ID ID, FunctionType *FT, SmallVectorImpl< Type * > &OverloadTys, raw_ostream &OS=nulls())
Returns true if FT is a valid function type for intrinsic ID.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
auto m_Cmp()
Matches any compare instruction and ignore it.
bool match(Val *V, const Pattern &P)
cstfp_pred_ty< is_any_zero_fp > m_AnyZeroFP()
Match a floating-point negative zero or positive zero.
ap_match< APFloat > m_APFloat(const APFloat *&Res)
Match a ConstantFP or splatted ConstantVector, binding the specified pointer to the contained APFloat...
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
auto m_Value()
Match an arbitrary value and ignore it.
CastInst_match< OpTy, FPExtInst > m_FPExt(const OpTy &Op)
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
cstfp_pred_ty< is_finitenonzero > m_FiniteNonZero()
Match a finite non-zero FP constant.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_ConstantFP()
Match an arbitrary ConstantFP and ignore it.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
auto m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
This is an optimization pass for GlobalISel generic memory operations.
@ Offset
Definition DWP.cpp:558
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI Constant * ConstantFoldCompareInstOperands(unsigned Predicate, Constant *LHS, Constant *RHS, const DataLayout &DL, const TargetLibraryInfo *TLI=nullptr, const Instruction *I=nullptr)
Attempt to constant fold a compare instruction (icmp/fcmp) with the specified operands.
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:156
APFloat frexp(const APFloat &X, int &Exp, APFloat::roundingMode RM)
Equivalent of C standard library function.
Definition APFloat.h:1652
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
LLVM_READONLY APFloat maxnum(const APFloat &A, const APFloat &B)
Implements IEEE-754 2008 maxNum semantics.
Definition APFloat.h:1695
constexpr unsigned MaxAnalysisRecursionDepth
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
APFloat scalbn(APFloat X, int Exp, APFloat::roundingMode RM)
Returns: X * 2^Exp for integral exponents.
Definition APFloat.h:1640
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
constexpr int PoisonMaskElem
@ FMul
Product of floats.
@ FAdd
Sum of floats.
LLVM_ABI Value * findScalarElement(Value *V, unsigned EltNo)
Given a vector and an element number, see if the scalar value is already around as a register,...
@ NearestTiesToEven
roundTiesToEven.
LLVM_ABI bool isKnownNeverInfOrNaN(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if the floating-point value can never contain a NaN or infinity.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition STLExtras.h:2165
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
LLVM_ABI Constant * ConstantFoldInstOperands(const Instruction *I, ArrayRef< Constant * > Ops, const DataLayout &DL, const TargetLibraryInfo *TLI=nullptr, bool AllowNonDeterministic=true)
ConstantFoldInstOperands - Attempt to constant fold an instruction with the specified operands.
constexpr uint64_t Make_64(uint32_t High, uint32_t Low)
Make a 64-bit integer from a high / low pair of 32-bit integers.
Definition MathExtras.h:160
LLVM_ABI ConstantRange computeConstantRange(const Value *V, bool ForSigned, const SimplifyQuery &SQ, unsigned Depth=0)
Determine the possible constant range of an integer or vector of integer value.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:863
#define N
bool isConstant() const
Returns true if we know the value of all bits.
Definition KnownBits.h:54
const APInt & getConstant() const
Returns the value when all bits have a known value.
Definition KnownBits.h:58
SimplifyQuery getWithInstruction(const Instruction *I) const
LLVM_ABI bool isUndefValue(Value *V) const
If CanUseUndef is true, returns whether V is undef.