LLVM 23.0.0git
AMDGPUCodeGenPrepare.cpp
Go to the documentation of this file.
1//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This pass does misc. AMDGPU optimizations on IR before instruction
11/// selection.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPU.h"
16#include "AMDGPUMemoryUtils.h"
17#include "AMDGPUTargetMachine.h"
19#include "llvm/ADT/SetVector.h"
27#include "llvm/IR/Dominators.h"
28#include "llvm/IR/IRBuilder.h"
29#include "llvm/IR/InstVisitor.h"
30#include "llvm/IR/IntrinsicsAMDGPU.h"
32#include "llvm/IR/ValueHandle.h"
34#include "llvm/Pass.h"
40
41#define DEBUG_TYPE "amdgpu-codegenprepare"
42
43using namespace llvm;
44using namespace llvm::PatternMatch;
45
46namespace {
47
49 "amdgpu-codegenprepare-widen-constant-loads",
50 cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
52 cl::init(false));
53
54static cl::opt<bool>
55 BreakLargePHIs("amdgpu-codegenprepare-break-large-phis",
56 cl::desc("Break large PHI nodes for DAGISel"),
58
59static cl::opt<bool>
60 ForceBreakLargePHIs("amdgpu-codegenprepare-force-break-large-phis",
61 cl::desc("For testing purposes, always break large "
62 "PHIs even if it isn't profitable."),
64
65static cl::opt<unsigned> BreakLargePHIsThreshold(
66 "amdgpu-codegenprepare-break-large-phis-threshold",
67 cl::desc("Minimum type size in bits for breaking large PHI nodes"),
69
70static cl::opt<bool> UseMul24Intrin(
71 "amdgpu-codegenprepare-mul24",
72 cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
74 cl::init(true));
75
76// Legalize 64-bit division by using the generic IR expansion.
77static cl::opt<bool> ExpandDiv64InIR(
78 "amdgpu-codegenprepare-expand-div64",
79 cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare"),
81 cl::init(false));
82
83// Leave all division operations as they are. This supersedes ExpandDiv64InIR
84// and is used for testing the legalizer.
85static cl::opt<bool> DisableIDivExpand(
86 "amdgpu-codegenprepare-disable-idiv-expansion",
87 cl::desc("Prevent expanding integer division in AMDGPUCodeGenPrepare"),
89 cl::init(false));
90
91// Disable processing of fdiv so we can better test the backend implementations.
92static cl::opt<bool> DisableFDivExpand(
93 "amdgpu-codegenprepare-disable-fdiv-expansion",
94 cl::desc("Prevent expanding floating point division in AMDGPUCodeGenPrepare"),
96 cl::init(false));
97
98class AMDGPUCodeGenPrepareImpl
99 : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
100public:
101 Function &F;
102 const GCNSubtarget &ST;
103 const AMDGPUTargetMachine &TM;
104 const TargetLibraryInfo *TLI;
105 const UniformityInfo &UA;
106 const DataLayout &DL;
107 SimplifyQuery SQ;
108 const bool HasFP32DenormalFlush;
109 bool FlowChanged = false;
110 mutable Function *SqrtF32 = nullptr;
111 mutable Function *LdexpF32 = nullptr;
112 mutable SmallVector<WeakVH> DeadVals;
113
114 DenseMap<const PHINode *, bool> BreakPhiNodesCache;
115
116 AMDGPUCodeGenPrepareImpl(Function &F, const AMDGPUTargetMachine &TM,
117 const TargetLibraryInfo *TLI, AssumptionCache *AC,
118 const DominatorTree *DT, const UniformityInfo &UA)
119 : F(F), ST(TM.getSubtarget<GCNSubtarget>(F)), TM(TM), TLI(TLI), UA(UA),
120 DL(F.getDataLayout()), SQ(DL, TLI, DT, AC),
121 HasFP32DenormalFlush(SIModeRegisterDefaults(F, ST).FP32Denormals ==
123
124 Function *getSqrtF32() const {
125 if (SqrtF32)
126 return SqrtF32;
127
128 LLVMContext &Ctx = F.getContext();
130 F.getParent(), Intrinsic::amdgcn_sqrt, {Type::getFloatTy(Ctx)});
131 return SqrtF32;
132 }
133
134 Function *getLdexpF32() const {
135 if (LdexpF32)
136 return LdexpF32;
137
138 LLVMContext &Ctx = F.getContext();
140 F.getParent(), Intrinsic::ldexp,
141 {Type::getFloatTy(Ctx), Type::getInt32Ty(Ctx)});
142 return LdexpF32;
143 }
144
145 bool canBreakPHINode(const PHINode &I);
146
147 /// Return true if \p T is a legal scalar floating point type.
148 bool isLegalFloatingTy(const Type *T) const;
149
150 /// Wrapper to pass all the arguments to computeKnownFPClass
152 const Instruction *CtxI) const {
153 return llvm::computeKnownFPClass(V, Interested,
154 SQ.getWithInstruction(CtxI));
155 }
156
157 bool canIgnoreDenormalInput(const Value *V, const Instruction *CtxI) const {
158 return HasFP32DenormalFlush ||
160 }
161
162 /// \returns The minimum number of bits needed to store the value of \Op as an
163 /// unsigned integer. Truncating to this size and then zero-extending to
164 /// the original will not change the value.
165 unsigned numBitsUnsigned(Value *Op, const Instruction *CtxI) const;
166
167 /// \returns The minimum number of bits needed to store the value of \Op as a
168 /// signed integer. Truncating to this size and then sign-extending to
169 /// the original size will not change the value.
170 unsigned numBitsSigned(Value *Op, const Instruction *CtxI) const;
171
172 /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.s24.
173 /// SelectionDAG has an issue where an and asserting the bits are known
174 bool replaceMulWithMul24(BinaryOperator &I) const;
175
176 /// Perform same function as equivalently named function in DAGCombiner. Since
177 /// we expand some divisions here, we need to perform this before obscuring.
178 bool foldBinOpIntoSelect(BinaryOperator &I) const;
179
180 bool divHasSpecialOptimization(BinaryOperator &I,
181 Value *Num, Value *Den) const;
182 unsigned getDivNumBits(BinaryOperator &I, Value *Num, Value *Den,
183 unsigned MaxDivBits, bool Signed) const;
184
185 /// Expands 24 bit div or rem.
186 Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
187 Value *Num, Value *Den,
188 bool IsDiv, bool IsSigned) const;
189
190 Value *expandDivRem24Impl(IRBuilder<> &Builder, BinaryOperator &I,
191 Value *Num, Value *Den, unsigned NumBits,
192 bool IsDiv, bool IsSigned) const;
193
194 /// Expands 32 bit div or rem.
195 Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
196 Value *Num, Value *Den) const;
197
198 Value *shrinkDivRem64(IRBuilder<> &Builder, BinaryOperator &I,
199 Value *Num, Value *Den) const;
200 void expandDivRem64(BinaryOperator &I) const;
201
202 /// Check whether a scalar load can be widened.
203 ///
204 /// \details Uniform loads of a small type from constant memory can be
205 /// widened to a full 32 bits and the result truncated, which allows a scalar
206 /// load to be used instead of a vector load.
207 ///
208 /// \returns True if \p I is a simple, uniform load of fewer than 32 bits aligned to at least 4 bytes.
209
210 bool canWidenScalarExtLoad(LoadInst &I) const;
211
212 Value *matchFractPatImpl(Value &V, const APFloat &C) const;
213 Value *matchFractPatNanAvoidant(Value &V);
214 Value *applyFractPat(IRBuilder<> &Builder, Value *FractArg);
215
216 bool canOptimizeWithRsq(FastMathFlags DivFMF, FastMathFlags SqrtFMF) const;
217
218 Value *optimizeWithRsq(IRBuilder<> &Builder, Value *Num, Value *Den,
219 FastMathFlags DivFMF, FastMathFlags SqrtFMF,
220 const Instruction *CtxI) const;
221
222 Value *optimizeWithRcp(IRBuilder<> &Builder, Value *Num, Value *Den,
223 FastMathFlags FMF, const Instruction *CtxI) const;
224 Value *optimizeWithFDivFast(IRBuilder<> &Builder, Value *Num, Value *Den,
225 float ReqdAccuracy) const;
226
227 Value *visitFDivElement(IRBuilder<> &Builder, Value *Num, Value *Den,
228 FastMathFlags DivFMF, FastMathFlags SqrtFMF,
229 Value *RsqOp, const Instruction *FDiv,
230 float ReqdAccuracy) const;
231
232 std::pair<Value *, Value *> getFrexpResults(IRBuilder<> &Builder,
233 Value *Src) const;
234
235 Value *emitRcpIEEE1ULP(IRBuilder<> &Builder, Value *Src,
236 bool IsNegative) const;
237 Value *emitFrexpDiv(IRBuilder<> &Builder, Value *LHS, Value *RHS,
238 FastMathFlags FMF) const;
239 Value *emitSqrtIEEE2ULP(IRBuilder<> &Builder, Value *Src,
240 FastMathFlags FMF) const;
241 Value *emitRsqF64(IRBuilder<> &Builder, Value *X, FastMathFlags SqrtFMF,
242 FastMathFlags DivFMF, const Instruction *CtxI,
243 bool IsNegative) const;
244
245 CallInst *createWorkitemIdX(IRBuilder<> &B) const;
246 void replaceWithWorkitemIdX(Instruction &I) const;
247 void replaceWithMaskedWorkitemIdX(Instruction &I, unsigned WaveSize) const;
248 bool tryReplaceWithWorkitemId(Instruction &I, unsigned Wave) const;
249
250 bool tryNarrowMathIfNoOverflow(Instruction *I);
251
252public:
253 bool visitFDiv(BinaryOperator &I);
254
255 bool visitInstruction(Instruction &I) { return false; }
256 bool visitBinaryOperator(BinaryOperator &I);
257 bool visitLoadInst(LoadInst &I);
258 bool visitSelectInst(SelectInst &I);
259 bool visitPHINode(PHINode &I);
260 bool visitAddrSpaceCastInst(AddrSpaceCastInst &I);
261
262 bool visitIntrinsicInst(IntrinsicInst &I);
263 bool visitFMinLike(IntrinsicInst &I);
264 bool visitSqrt(IntrinsicInst &I);
265 bool visitLog(FPMathOperator &Log, Intrinsic::ID IID);
266 bool visitMbcntLo(IntrinsicInst &I) const;
267 bool visitMbcntHi(IntrinsicInst &I) const;
268 bool visitVectorReduceAdd(IntrinsicInst &I);
269 bool visitSaturatingAdd(IntrinsicInst &I);
270 bool run();
271};
272
/// FunctionPass wrapper for the AMDGPU IR optimizations in this file.
273class AMDGPUCodeGenPrepare : public FunctionPass {
274public:
 // Pass identification member; passed to the FunctionPass constructor below.
275 static char ID;
276 AMDGPUCodeGenPrepare() : FunctionPass(ID) {}
 // NOTE(review): the first statements of this body are not visible in this
 // extract; only the preservation logic below can be described.
277 void getAnalysisUsage(AnalysisUsage &AU) const override {
281
282 // FIXME: Division expansion needs to preserve the dominator tree.
 // All analyses are declared preserved unless the 64-bit division expansion
 // option (ExpandDiv64InIR) is enabled.
283 if (!ExpandDiv64InIR)
284 AU.setPreservesAll();
285 }
 // Defined out of line.
286 bool runOnFunction(Function &F) override;
287 StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
288};
289
290} // end anonymous namespace
291
/// Run the visitor over every instruction of F and then drain DeadVals.
///
/// Blocks are walked in reverse order, and the instructions of each block are
/// walked in reverse order as well. Instructions that are already trivially
/// dead are not visited.
///
/// \returns true if any visit reported a change.
292bool AMDGPUCodeGenPrepareImpl::run() {
 // Start from an empty PHI-breaking cache on every run.
293 BreakPhiNodesCache.clear();
294 bool MadeChange = false;
295
296 // Need to use make_early_inc_range because integer division expansion is
297 // handled by Transform/Utils, and it can delete instructions such as the
298 // terminator of the BB.
299 for (BasicBlock &BB : reverse(F)) {
300 for (Instruction &I : make_early_inc_range(reverse(BB))) {
301 if (!isInstructionTriviallyDead(&I, TLI))
302 MadeChange |= visit(I);
303 }
304 }
305
 // Pop queued values until the list is empty. DeadVals holds WeakVH entries,
 // hence the null-tolerant cast.
 // NOTE(review): the statement controlled by the `if` below is not visible in
 // this extract, so what happens to each popped instruction cannot be stated
 // from here.
306 while (!DeadVals.empty()) {
307 if (auto *I = dyn_cast_or_null<Instruction>(DeadVals.pop_back_val()))
309 }
310
311 return MadeChange;
312}
313
314bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const {
315 return Ty->isFloatTy() || Ty->isDoubleTy() ||
316 (Ty->isHalfTy() && ST.has16BitInsts());
317}
318
319bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const {
320 Type *Ty = I.getType();
321 int TySize = DL.getTypeSizeInBits(Ty);
322 Align Alignment = DL.getValueOrABITypeAlignment(I.getAlign(), Ty);
323
324 return I.isSimple() && TySize < 32 && Alignment >= 4 && UA.isUniformAtDef(&I);
325}
326
327unsigned
328AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op,
329 const Instruction *CtxI) const {
330 return computeKnownBits(Op, SQ.getWithInstruction(CtxI)).countMaxActiveBits();
331}
332
333unsigned
334AMDGPUCodeGenPrepareImpl::numBitsSigned(Value *Op,
335 const Instruction *CtxI) const {
336 return ComputeMaxSignificantBits(Op, SQ.DL, SQ.AC, CtxI, SQ.DT);
337}
338
339static void extractValues(IRBuilder<> &Builder,
340 SmallVectorImpl<Value *> &Values, Value *V) {
341 auto *VT = dyn_cast<FixedVectorType>(V->getType());
342 if (!VT) {
343 Values.push_back(V);
344 return;
345 }
346
347 for (int I = 0, E = VT->getNumElements(); I != E; ++I)
348 Values.push_back(Builder.CreateExtractElement(V, I));
349}
350
352 Type *Ty,
353 SmallVectorImpl<Value *> &Values) {
 // NOTE(review): the first line of this signature is not visible in this
 // extract; the call site in replaceMulWithMul24 invokes it as
 // insertValues(Builder, Ty, ResultVals) and uses the result as a Value *.
 //
 // Counterpart of extractValues: rebuilds a value of type Ty from its pieces.
 // A non-vector type must come with exactly one piece, returned as is.
354 if (!Ty->isVectorTy()) {
355 assert(Values.size() == 1);
356 return Values[0];
357 }
358
 // Start from poison and insert element I at lane I, so every lane that has a
 // corresponding entry in Values gets overwritten.
359 Value *NewVal = PoisonValue::get(Ty);
360 for (int I = 0, E = Values.size(); I != E; ++I)
361 NewVal = Builder.CreateInsertElement(NewVal, Values[I], I);
362
363 return NewVal;
364}
365
/// Try to replace the multiply \p I with the amdgcn mul_u24 / mul_i24
/// intrinsic, element by element for vectors.
///
/// \returns true if \p I was replaced (its uses are rewritten and \p I is
/// queued in DeadVals), false if \p I was left untouched.
366bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {
367 if (I.getOpcode() != Instruction::Mul)
368 return false;
369
 // Leave multiplies of at most 16 bits alone when the subtarget has 16-bit
 // instructions.
370 Type *Ty = I.getType();
371 unsigned Size = Ty->getScalarSizeInBits();
372 if (Size <= 16 && ST.has16BitInsts())
373 return false;
374
375 // Prefer scalar if this could be s_mul_i32
376 if (UA.isUniformAtDef(&I))
377 return false;
378
379 Value *LHS = I.getOperand(0);
380 Value *RHS = I.getOperand(1);
381 IRBuilder<> Builder(&I);
382 Builder.SetCurrentDebugLocation(I.getDebugLoc());
383
 // LHSBits / RHSBits are only consumed by the comparisons below in the code
 // visible here.
384 unsigned LHSBits = 0, RHSBits = 0;
385 bool IsSigned = false;
386
 // The unsigned form is tried first; the signed form is the fallback. Both
 // operands must fit in 24 bits under the same signedness, and the subtarget
 // must provide the matching instruction.
387 if (ST.hasMulU24() && (LHSBits = numBitsUnsigned(LHS, &I)) <= 24 &&
388 (RHSBits = numBitsUnsigned(RHS, &I)) <= 24) {
389 IsSigned = false;
390
391 } else if (ST.hasMulI24() && (LHSBits = numBitsSigned(LHS, &I)) <= 24 &&
392 (RHSBits = numBitsSigned(RHS, &I)) <= 24) {
393 IsSigned = true;
394
395 } else
396 return false;
397
 // Split vector operands into scalars so the intrinsic is applied per element.
398 SmallVector<Value *, 4> LHSVals;
399 SmallVector<Value *, 4> RHSVals;
400 SmallVector<Value *, 4> ResultVals;
401 extractValues(Builder, LHSVals, LHS);
402 extractValues(Builder, RHSVals, RHS);
403
 // Operands are always converted to i32; the intrinsic result type is i64
 // when the original scalar type is wider than 32 bits, otherwise i32.
404 IntegerType *I32Ty = Builder.getInt32Ty();
405 IntegerType *IntrinTy = Size > 32 ? Builder.getInt64Ty() : I32Ty;
406 Type *DstTy = LHSVals[0]->getType();
407
 // Note: the loop index I and the locals LHS / RHS shadow the outer names
 // inside this loop.
408 for (int I = 0, E = LHSVals.size(); I != E; ++I) {
409 Value *LHS = IsSigned ? Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty)
410 : Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
411 Value *RHS = IsSigned ? Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty)
412 : Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
 // NOTE(review): the line declaring `ID` is not visible in this extract; only
 // its initializer (signed vs. unsigned intrinsic selection) is.
414 IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
415 Value *Result = Builder.CreateIntrinsic(ID, {IntrinTy}, {LHS, RHS});
 // Convert the intrinsic result back to the original element type using the
 // same signedness.
416 Result = IsSigned ? Builder.CreateSExtOrTrunc(Result, DstTy)
417 : Builder.CreateZExtOrTrunc(Result, DstTy);
418 ResultVals.push_back(Result);
419 }
420
 // Reassemble the result, take over the old name and uses, and queue the
 // original multiply in DeadVals instead of erasing it here.
421 Value *NewVal = insertValues(Builder, Ty, ResultVals);
422 NewVal->takeName(&I);
423 I.replaceAllUsesWith(NewVal);
424 DeadVals.push_back(&I);
425
426 return true;
427}
428
429// Find a select instruction, which may have been casted. This is mostly to deal
430// with cases where i16 selects were promoted here to i32.
 // NOTE(review): the signature line is not visible in this extract. From the
 // call sites in foldBinOpIntoSelect it is invoked as
 // findSelectThroughCast(Value, CastOp) and yields a SelectInst * (or null).
 //
 // Cast is always reset first. It stays null when V itself is the select.
432 Cast = nullptr;
433 if (SelectInst *Sel = dyn_cast<SelectInst>(V))
434 return Sel;
435
 // Look through exactly one cast. Note that Cast remains set to that cast even
 // when its operand is not a select and nullptr is returned.
436 if ((Cast = dyn_cast<CastInst>(V))) {
437 if (SelectInst *Sel = dyn_cast<SelectInst>(Cast->getOperand(0)))
438 return Sel;
439 }
440
441 return nullptr;
442}
443
/// Fold a binary operator whose one operand is a single-use select (possibly
/// behind a single-use cast) and whose other operand is a constant into a new
/// select of the two constant-folded results.
///
/// \returns true if \p BO was replaced; the old binop, cast (if any) and
/// select are queued in DeadVals. Returns false if nothing was changed.
444bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const {
445 // Don't do this unless the old select is going away. We want to eliminate the
446 // binary operator, not replace a binop with a select.
447 int SelOpNo = 0;
448
 // Written by findSelectThroughCast on every call, so it is never read
 // uninitialized.
449 CastInst *CastOp;
450
451 // TODO: Should probably try to handle some cases with multiple
452 // users. Duplicating the select may be profitable for division.
 // Try operand 0 first and fall back to operand 1; SelOpNo records which
 // operand holds the select.
453 SelectInst *Sel = findSelectThroughCast(BO.getOperand(0), CastOp);
454 if (!Sel || !Sel->hasOneUse()) {
455 SelOpNo = 1;
456 Sel = findSelectThroughCast(BO.getOperand(1), CastOp);
457 }
458
459 if (!Sel || !Sel->hasOneUse())
460 return false;
461
 // NOTE(review): the definitions of CT and CF are not visible in this extract.
 // They are used below as Constant pointers that may be null; presumably they
 // are the select's constant true/false values - confirm against the full file.
 // CBO is the operand on the other side of the binop, which must be constant.
464 Constant *CBO = dyn_cast<Constant>(BO.getOperand(SelOpNo ^ 1));
465 if (!CBO || !CT || !CF)
466 return false;
467
 // When the select was found behind a cast, the cast must also be going away,
 // and the same cast is applied to both constants up front.
468 if (CastOp) {
469 if (!CastOp->hasOneUse())
470 return false;
471 CT = ConstantFoldCastOperand(CastOp->getOpcode(), CT, BO.getType(), DL);
472 CF = ConstantFoldCastOperand(CastOp->getOpcode(), CF, BO.getType(), DL);
473 }
474
475 // TODO: Handle special 0/-1 cases DAG combine does, although we only really
476 // need to handle divisions here.
 // Keep the original operand order: when the select was operand 1, the
 // constant operand goes on the left. Bail out if folding fails or only
 // yields a constant expression.
477 Constant *FoldedT =
478 SelOpNo ? ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CT, DL)
479 : ConstantFoldBinaryOpOperands(BO.getOpcode(), CT, CBO, DL);
480 if (!FoldedT || isa<ConstantExpr>(FoldedT))
481 return false;
482
483 Constant *FoldedF =
484 SelOpNo ? ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CF, DL)
485 : ConstantFoldBinaryOpOperands(BO.getOpcode(), CF, CBO, DL);
486 if (!FoldedF || isa<ConstantExpr>(FoldedF))
487 return false;
488
 // Build the replacement at the binop, carrying over its debug location and,
 // for floating-point operations, its fast-math flags.
489 IRBuilder<> Builder(&BO);
490 Builder.SetCurrentDebugLocation(BO.getDebugLoc());
491 if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&BO))
492 Builder.setFastMathFlags(FPOp->getFastMathFlags());
493
494 Value *NewSelect = Builder.CreateSelect(Sel->getCondition(),
495 FoldedT, FoldedF);
496 NewSelect->takeName(&BO);
497 BO.replaceAllUsesWith(NewSelect);
 // Queue the replaced instructions in DeadVals rather than erasing them here.
498 DeadVals.push_back(&BO);
499 if (CastOp)
500 DeadVals.push_back(CastOp);
501 DeadVals.push_back(Sel);
502 return true;
503}
504
505std::pair<Value *, Value *>
506AMDGPUCodeGenPrepareImpl::getFrexpResults(IRBuilder<> &Builder,
507 Value *Src) const {
508 Type *Ty = Src->getType();
509 Value *Frexp = Builder.CreateIntrinsic(Intrinsic::frexp,
510 {Ty, Builder.getInt32Ty()}, Src);
511 Value *FrexpMant = Builder.CreateExtractValue(Frexp, {0});
512
513 // Bypass the bug workaround for the exponent result since it doesn't matter.
514 // TODO: Does the bug workaround even really need to consider the exponent
515 // result? It's unspecified by the spec.
516
517 Value *FrexpExp =
518 ST.hasFractBug()
519 ? Builder.CreateIntrinsic(Intrinsic::amdgcn_frexp_exp,
520 {Builder.getInt32Ty(), Ty}, Src)
521 : Builder.CreateExtractValue(Frexp, {1});
522 return {FrexpMant, FrexpExp};
523}
524
525/// Emit an expansion of 1.0 / Src good for 1ulp that supports denormals.
526Value *AMDGPUCodeGenPrepareImpl::emitRcpIEEE1ULP(IRBuilder<> &Builder,
527 Value *Src,
528 bool IsNegative) const {
529 // Same as for 1.0, but expand the sign out of the constant.
530 // -1.0 / x -> rcp (fneg x)
531 if (IsNegative)
532 Src = Builder.CreateFNeg(Src);
533
534 // The rcp instruction doesn't support denormals, so scale the input
535 // out of the denormal range and convert at the end.
536 //
537 // Expand as 2^-n * (1.0 / (x * 2^n))
538
539 // TODO: Skip scaling if input is known never denormal and the input
540 // range won't underflow to denormal. The hard part is knowing the
541 // result. We need a range check, the result could be denormal for
542 // 0x1p+126 < den <= 0x1p+127.
543 auto [FrexpMant, FrexpExp] = getFrexpResults(Builder, Src);
544 Value *ScaleFactor = Builder.CreateNeg(FrexpExp);
545 Value *Rcp = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMant);
546 return Builder.CreateCall(getLdexpF32(), {Rcp, ScaleFactor});
547}
548
549/// Emit a 2ulp expansion for fdiv by using frexp for input scaling.
550Value *AMDGPUCodeGenPrepareImpl::emitFrexpDiv(IRBuilder<> &Builder, Value *LHS,
551 Value *RHS,
552 FastMathFlags FMF) const {
553 // If we have have to work around the fract/frexp bug, we're worse off than
554 // using the fdiv.fast expansion. The full safe expansion is faster if we have
555 // fast FMA.
556 if (HasFP32DenormalFlush && ST.hasFractBug() && !ST.hasFastFMAF32() &&
557 (!FMF.noNaNs() || !FMF.noInfs()))
558 return nullptr;
559
560 // We're scaling the LHS to avoid a denormal input, and scale the denominator
561 // to avoid large values underflowing the result.
562 auto [FrexpMantRHS, FrexpExpRHS] = getFrexpResults(Builder, RHS);
563
564 Value *Rcp =
565 Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMantRHS);
566
567 auto [FrexpMantLHS, FrexpExpLHS] = getFrexpResults(Builder, LHS);
568 Value *Mul = Builder.CreateFMul(FrexpMantLHS, Rcp);
569
570 // We multiplied by 2^N/2^M, so we need to multiply by 2^(N-M) to scale the
571 // result.
572 Value *ExpDiff = Builder.CreateSub(FrexpExpLHS, FrexpExpRHS);
573 return Builder.CreateCall(getLdexpF32(), {Mul, ExpDiff});
574}
575
576/// Emit a sqrt that handles denormals and is accurate to 2ulp.
///
/// Inputs below the smallest normal value are scaled up by 2^32 with ldexp
/// before the amdgcn sqrt call, and the result is scaled back by 2^-16. Other
/// inputs use a scale of 0 on both sides. \p FMF is not referenced in the
/// lines visible here.
577Value *AMDGPUCodeGenPrepareImpl::emitSqrtIEEE2ULP(IRBuilder<> &Builder,
578 Value *Src,
579 FastMathFlags FMF) const {
580 Type *Ty = Src->getType();
 // NOTE(review): the initializer of SmallestNormal is not visible in this
 // extract; the sibling emitRsqIEEE1ULP uses
 // APFloat::getSmallestNormalized(Ty->getFltSemantics()) - confirm.
581 APFloat SmallestNormal =
583 Value *NeedScale =
584 Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
585
586 ConstantInt *Zero = Builder.getInt32(0);
587 Value *InputScaleFactor =
588 Builder.CreateSelect(NeedScale, Builder.getInt32(32), Zero);
589
590 Value *Scaled = Builder.CreateCall(getLdexpF32(), {Src, InputScaleFactor});
591
592 Value *Sqrt = Builder.CreateCall(getSqrtF32(), Scaled);
593
 // sqrt halves the exponent, so an input scale of 2^32 is undone by 2^-16.
594 Value *OutputScaleFactor =
595 Builder.CreateSelect(NeedScale, Builder.getInt32(-16), Zero);
596 return Builder.CreateCall(getLdexpF32(), {Sqrt, OutputScaleFactor});
597}
598
599/// Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals.
600static Value *emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src,
601 bool IsNegative) {
602 // bool need_scale = x < 0x1p-126f;
603 // float input_scale = need_scale ? 0x1.0p+24f : 1.0f;
604 // float output_scale = need_scale ? 0x1.0p+12f : 1.0f;
605 // rsq(x * input_scale) * output_scale;
606
607 Type *Ty = Src->getType();
608 APFloat SmallestNormal =
609 APFloat::getSmallestNormalized(Ty->getFltSemantics());
610 Value *NeedScale =
611 Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
612 Constant *One = ConstantFP::get(Ty, 1.0);
613 Constant *InputScale = ConstantFP::get(Ty, 0x1.0p+24);
614 Constant *OutputScale =
615 ConstantFP::get(Ty, IsNegative ? -0x1.0p+12 : 0x1.0p+12);
616
617 Value *InputScaleFactor = Builder.CreateSelect(NeedScale, InputScale, One);
618
619 Value *ScaledInput = Builder.CreateFMul(Src, InputScaleFactor);
620 Value *Rsq = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, ScaledInput);
621 Value *OutputScaleFactor = Builder.CreateSelect(
622 NeedScale, OutputScale, IsNegative ? ConstantFP::get(Ty, -1.0) : One);
623
624 return Builder.CreateFMul(Rsq, OutputScaleFactor);
625}
626
627/// Emit inverse sqrt expansion for f64 with a correction sequence on top of
628/// v_rsq_f64. This should give a 1ulp result.
629Value *AMDGPUCodeGenPrepareImpl::emitRsqF64(IRBuilder<> &Builder, Value *X,
630 FastMathFlags SqrtFMF,
631 FastMathFlags DivFMF,
632 const Instruction *CtxI,
633 bool IsNegative) const {
634 // rsq(x):
635 // double y0 = BUILTIN_AMDGPU_RSQRT_F64(x);
636 // double e = MATH_MAD(-y0 * (x == PINF_F64 || x == 0.0 ? y0 : x), y0, 1.0);
637 // return MATH_MAD(y0*e, MATH_MAD(e, 0.375, 0.5), y0);
638 //
639 // -rsq(x):
640 // double y0 = BUILTIN_AMDGPU_RSQRT_F64(x);
641 // double e = MATH_MAD(-y0 * (x == PINF_F64 || x == 0.0 ? y0 : x), y0, 1.0);
642 // return MATH_MAD(-y0*e, MATH_MAD(e, 0.375, 0.5), -y0);
643 //
644 // The rsq instruction handles the special cases correctly. We need to check
645 // for the edge case conditions to ensure the special case propagates through
646 // the later instructions.
647
648 Value *Y0 = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, X);
649
650 // Try to elide the edge case check.
651 //
652 // Fast math flags imply:
653 // sqrt ninf => !isinf(x)
654 // fdiv ninf => x != 0, !isinf(x)
655 bool MaybePosInf = !SqrtFMF.noInfs() && !DivFMF.noInfs();
656 bool MaybeZero = !DivFMF.noInfs();
657
658 DenormalMode DenormMode;
659 FPClassTest Interested = fcNone;
660 if (MaybePosInf)
661 Interested = fcPosInf;
662 if (MaybeZero)
663 Interested |= fcZero;
664
665 if (Interested != fcNone) {
666 KnownFPClass KnownSrc = computeKnownFPClass(X, Interested, CtxI);
667 if (KnownSrc.isKnownNeverPosInfinity())
668 MaybePosInf = false;
669
670 DenormMode = F.getDenormalMode(X->getType()->getFltSemantics());
671 if (KnownSrc.isKnownNeverLogicalZero(DenormMode))
672 MaybeZero = false;
673 }
674
675 Value *SpecialOrRsq = X;
676 if (MaybeZero || MaybePosInf) {
677 Value *Cond;
678 if (MaybePosInf && MaybeZero) {
679 if (DenormMode.Input != DenormalMode::DenormalModeKind::Dynamic) {
680 FPClassTest TestMask = fcPosInf | fcZero;
681 if (DenormMode.inputsAreZero())
682 TestMask |= fcSubnormal;
683
684 Cond = Builder.createIsFPClass(X, TestMask);
685 } else {
686 // Avoid using llvm.is.fpclass for dynamic denormal mode, since it
687 // doesn't respect the floating-point environment.
688 Value *IsZero =
689 Builder.CreateFCmpOEQ(X, ConstantFP::getZero(X->getType()));
690 Value *IsInf =
691 Builder.CreateFCmpOEQ(X, ConstantFP::getInfinity(X->getType()));
692 Cond = Builder.CreateOr(IsZero, IsInf);
693 }
694 } else if (MaybeZero) {
695 Cond = Builder.CreateFCmpOEQ(X, ConstantFP::getZero(X->getType()));
696 } else {
697 Cond = Builder.CreateFCmpOEQ(X, ConstantFP::getInfinity(X->getType()));
698 }
699
700 SpecialOrRsq = Builder.CreateSelect(Cond, Y0, X);
701 }
702
703 Value *NegY0 = Builder.CreateFNeg(Y0);
704 Value *NegXY0 = Builder.CreateFMul(SpecialOrRsq, NegY0);
705
706 // Could be fmuladd, but isFMAFasterThanFMulAndFAdd is always true for f64.
707 Value *E = Builder.CreateFMA(NegXY0, Y0, ConstantFP::get(X->getType(), 1.0));
708
709 Value *Y0E = Builder.CreateFMul(E, IsNegative ? NegY0 : Y0);
710
711 Value *EFMA = Builder.CreateFMA(E, ConstantFP::get(X->getType(), 0.375),
712 ConstantFP::get(X->getType(), 0.5));
713
714 return Builder.CreateFMA(Y0E, EFMA, IsNegative ? NegY0 : Y0);
715}
716
717bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(FastMathFlags DivFMF,
718 FastMathFlags SqrtFMF) const {
719 // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp for f32 and
720 // f64.
721 return DivFMF.allowContract() && SqrtFMF.allowContract();
722}
723
724Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
725 IRBuilder<> &Builder, Value *Num, Value *Den, const FastMathFlags DivFMF,
726 const FastMathFlags SqrtFMF, const Instruction *CtxI) const {
727 // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.
728 assert(DivFMF.allowContract() && SqrtFMF.allowContract());
729
730 // rsq_f16 is accurate to 0.51 ulp.
731 // rsq_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
732 // rsq_f64 is never accurate.
733 const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num);
734 if (!CLHS)
735 return nullptr;
736
737 bool IsNegative = false;
738
739 // TODO: Handle other numerator values with arcp.
740 if (CLHS->isExactlyValue(1.0) || (IsNegative = CLHS->isExactlyValue(-1.0))) {
741 // Add in the sqrt flags.
742 IRBuilder<>::FastMathFlagGuard Guard(Builder);
743 Builder.setFastMathFlags(DivFMF | SqrtFMF);
744
745 if (Den->getType()->isFloatTy()) {
746 if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) ||
747 canIgnoreDenormalInput(Den, CtxI)) {
748 Value *Result =
749 Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, Den);
750 // -1.0 / sqrt(x) -> fneg(rsq(x))
751 return IsNegative ? Builder.CreateFNeg(Result) : Result;
752 }
753
754 return emitRsqIEEE1ULP(Builder, Den, IsNegative);
755 }
756
757 if (Den->getType()->isDoubleTy())
758 return emitRsqF64(Builder, Den, SqrtFMF, DivFMF, CtxI, IsNegative);
759 }
760
761 return nullptr;
762}
763
764// Optimize fdiv with rcp:
765//
766// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
767// allowed with afn.
768//
769// a/b -> a*rcp(b) when arcp is allowed, and we only need provide ULP 1.0
770Value *
771AMDGPUCodeGenPrepareImpl::optimizeWithRcp(IRBuilder<> &Builder, Value *Num,
772 Value *Den, FastMathFlags FMF,
773 const Instruction *CtxI) const {
774 // rcp_f16 is accurate to 0.51 ulp.
775 // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
776 // rcp_f64 is never accurate.
777 assert(Den->getType()->isFloatTy());
778
779 if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {
780 bool IsNegative = false;
781 if (CLHS->isExactlyValue(1.0) ||
782 (IsNegative = CLHS->isExactlyValue(-1.0))) {
783 Value *Src = Den;
784
785 if (HasFP32DenormalFlush || FMF.approxFunc()) {
786 // -1.0 / x -> 1.0 / fneg(x)
787 if (IsNegative)
788 Src = Builder.CreateFNeg(Src);
789
790 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
791 // the CI documentation has a worst case error of 1 ulp.
792 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK
793 // to use it as long as we aren't trying to use denormals.
794 //
795 // v_rcp_f16 and v_rsq_f16 DO support denormals.
796
797 // NOTE: v_sqrt and v_rcp will be combined to v_rsq later. So we don't
798 // insert rsq intrinsic here.
799
800 // 1.0 / x -> rcp(x)
801 return Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, Src);
802 }
803
804 // TODO: If the input isn't denormal, and we know the input exponent isn't
805 // big enough to introduce a denormal we can avoid the scaling.
806 return emitRcpIEEE1ULP(Builder, Src, IsNegative);
807 }
808 }
809
810 if (FMF.allowReciprocal()) {
811 // x / y -> x * (1.0 / y)
812
813 // TODO: Could avoid denormal scaling and use raw rcp if we knew the output
814 // will never underflow.
815 if (HasFP32DenormalFlush || FMF.approxFunc()) {
816 Value *Recip = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, Den);
817 return Builder.CreateFMul(Num, Recip);
818 }
819
820 Value *Recip = emitRcpIEEE1ULP(Builder, Den, false);
821 return Builder.CreateFMul(Num, Recip);
822 }
823
824 return nullptr;
825}
826
827// optimize with fdiv.fast:
828//
829// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
830//
831// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
832//
833// NOTE: optimizeWithRcp should be tried first because rcp is the preference.
834Value *AMDGPUCodeGenPrepareImpl::optimizeWithFDivFast(
835 IRBuilder<> &Builder, Value *Num, Value *Den, float ReqdAccuracy) const {
836 // fdiv.fast can achieve 2.5 ULP accuracy.
837 if (ReqdAccuracy < 2.5f)
838 return nullptr;
839
840 // Only have fdiv.fast for f32.
841 assert(Den->getType()->isFloatTy());
842
843 bool NumIsOne = false;
844 if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) {
845 if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0))
846 NumIsOne = true;
847 }
848
849 // fdiv does not support denormals. But 1.0/x is always fine to use it.
850 //
851 // TODO: This works for any value with a specific known exponent range, don't
852 // just limit to constant 1.
853 if (!HasFP32DenormalFlush && !NumIsOne)
854 return nullptr;
855
856 return Builder.CreateIntrinsic(Intrinsic::amdgcn_fdiv_fast, {Num, Den});
857}
858
859Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
860 IRBuilder<> &Builder, Value *Num, Value *Den, FastMathFlags DivFMF,
861 FastMathFlags SqrtFMF, Value *RsqOp, const Instruction *FDivInst,
862 float ReqdDivAccuracy) const {
863 if (RsqOp) {
864 Value *Rsq =
865 optimizeWithRsq(Builder, Num, RsqOp, DivFMF, SqrtFMF, FDivInst);
866 if (Rsq)
867 return Rsq;
868 }
869
870 if (!Num->getType()->isFloatTy())
871 return nullptr;
872
873 Value *Rcp = optimizeWithRcp(Builder, Num, Den, DivFMF, FDivInst);
874 if (Rcp)
875 return Rcp;
876
877 // In the basic case fdiv_fast has the same instruction count as the frexp div
878 // expansion. Slightly prefer fdiv_fast since it ends in an fmul that can
879 // potentially be fused into a user. Also, materialization of the constants
880 // can be reused for multiple instances.
881 Value *FDivFast = optimizeWithFDivFast(Builder, Num, Den, ReqdDivAccuracy);
882 if (FDivFast)
883 return FDivFast;
884
885 return emitFrexpDiv(Builder, Num, Den, DivFMF);
886}
887
888// Optimization is performed based on !fpmath, fast math flags, and the
889// denormal mode, lowering fdiv to either rcp or fdiv.fast.
890//
891// With rcp:
892// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
893// allowed with afn.
894//
895// a/b -> a*rcp(b) when inaccurate rcp is allowed with afn.
896//
897// With fdiv.fast:
898// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
899//
900// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
901//
902// NOTE: rcp is the preference in cases that both are legal.
903bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
904 if (DisableFDivExpand)
905 return false;
906
907 Type *Ty = FDiv.getType()->getScalarType();
908 const bool IsFloat = Ty->isFloatTy();
909 if (!IsFloat && !Ty->isDoubleTy())
910 return false;
911
912 // The f64 rcp/rsq approximations are pretty inaccurate. We can do an
913 // expansion around them in codegen. f16 is good enough to always use.
914
915 const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
916 const FastMathFlags DivFMF = FPOp->getFastMathFlags();
917 const float ReqdAccuracy = FPOp->getFPAccuracy();
918
919 FastMathFlags SqrtFMF;
920
921 Value *Num = FDiv.getOperand(0);
922 Value *Den = FDiv.getOperand(1);
923
924 Value *RsqOp = nullptr;
925 auto *DenII = dyn_cast<IntrinsicInst>(Den);
926 if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt &&
927 DenII->hasOneUse()) {
928 const auto *SqrtOp = cast<FPMathOperator>(DenII);
929 SqrtFMF = SqrtOp->getFastMathFlags();
930 if (canOptimizeWithRsq(DivFMF, SqrtFMF))
931 RsqOp = SqrtOp->getOperand(0);
932 }
933
934 // rcp path not yet implemented for f64.
935 if (!IsFloat && !RsqOp)
936 return false;
937
938 // Inaccurate rcp is allowed with afn.
939 //
940 // Defer to codegen to handle this.
941 //
942 // TODO: Decide on an interpretation for interactions between afn + arcp +
943 // !fpmath, and make it consistent between here and codegen. For now, defer
944 // expansion of afn to codegen. The current interpretation is so aggressive we
945 // don't need any pre-consideration here when we have better information. A
946 // more conservative interpretation could use handling here.
947 const bool AllowInaccurateRcp = DivFMF.approxFunc();
948 if (!RsqOp && AllowInaccurateRcp)
949 return false;
950
951 // Defer the correct implementations to codegen.
952 if (IsFloat && ReqdAccuracy < 1.0f)
953 return false;
954
955 IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));
956 Builder.setFastMathFlags(DivFMF);
957 Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
958
959 SmallVector<Value *, 4> NumVals;
960 SmallVector<Value *, 4> DenVals;
961 SmallVector<Value *, 4> RsqDenVals;
962 extractValues(Builder, NumVals, Num);
963 extractValues(Builder, DenVals, Den);
964
965 if (RsqOp)
966 extractValues(Builder, RsqDenVals, RsqOp);
967
968 SmallVector<Value *, 4> ResultVals(NumVals.size());
969 for (int I = 0, E = NumVals.size(); I != E; ++I) {
970 Value *NumElt = NumVals[I];
971 Value *DenElt = DenVals[I];
972 Value *RsqDenElt = RsqOp ? RsqDenVals[I] : nullptr;
973
974 Value *NewElt =
975 visitFDivElement(Builder, NumElt, DenElt, DivFMF, SqrtFMF, RsqDenElt,
976 cast<Instruction>(FPOp), ReqdAccuracy);
977 if (!NewElt) {
978 // Keep the original, but scalarized.
979
980 // This has the unfortunate side effect of sometimes scalarizing when
981 // we're not going to do anything.
982 NewElt = Builder.CreateFDiv(NumElt, DenElt);
983 if (auto *NewEltInst = dyn_cast<Instruction>(NewElt))
984 NewEltInst->copyMetadata(FDiv);
985 }
986
987 ResultVals[I] = NewElt;
988 }
989
990 Value *NewVal = insertValues(Builder, FDiv.getType(), ResultVals);
991
992 if (NewVal) {
993 FDiv.replaceAllUsesWith(NewVal);
994 NewVal->takeName(&FDiv);
995 DeadVals.push_back(&FDiv);
996 }
997
998 return true;
999}
1000
1001static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder,
1002 Value *LHS, Value *RHS) {
1003 Type *I32Ty = Builder.getInt32Ty();
1004 Type *I64Ty = Builder.getInt64Ty();
1005
1006 Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty);
1007 Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty);
1008 Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64);
1009 Value *Lo = Builder.CreateTrunc(MUL64, I32Ty);
1010 Value *Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32));
1011 Hi = Builder.CreateTrunc(Hi, I32Ty);
1012 return std::pair(Lo, Hi);
1013}
1014
1015static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
1016 return getMul64(Builder, LHS, RHS).second;
1017}
1018
1019/// Figure out how many bits are really needed for this division.
1020/// \p MaxDivBits is an optimization hint to bypass the second
1021/// ComputeNumSignBits/computeKnownBits call if the first one is
1022/// insufficient.
///
/// The denominator is analyzed first. If it alone needs more than
/// \p MaxDivBits, the full scalar width of \p Num is returned without looking
/// at the numerator. Otherwise the result combines both operands and may still
/// exceed \p MaxDivBits, so callers must compare it against their own limit.
1023unsigned AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num,
1024 Value *Den,
1025 unsigned MaxDivBits,
1026 bool IsSigned) const {
 // NOTE(review): the next line is the tail of a statement whose first line
 // (listing line 1027) is absent from this view; presumably an assertion that
 // Num and Den have the same scalar width. Confirm against the full source.
1028 Den->getType()->getScalarSizeInBits());
1029 unsigned SSBits = Num->getType()->getScalarSizeInBits();
1030 if (IsSigned) {
1031 unsigned RHSSignBits = ComputeNumSignBits(Den, SQ.DL, SQ.AC, &I, SQ.DT);
1032 // A sign bit needs to be reserved for shrinking.
1033 unsigned DivBits = SSBits - RHSSignBits + 1;
1034 if (DivBits > MaxDivBits)
1035 return SSBits;
1036
 // NOTE(review): unlike the Den query above, this call does not pass SQ.DT.
 // Confirm whether that difference is intentional.
1037 unsigned LHSSignBits = ComputeNumSignBits(Num, SQ.DL, SQ.AC, &I);
1038
 // The operand with fewer known sign bits determines the width.
1039 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1040 DivBits = SSBits - SignBits + 1;
1041 return DivBits;
1042 }
1043
1044 // All bits are used for unsigned division for Num or Den in range
1045 // (SignedMax, UnsignedMax].
1046 KnownBits Known = computeKnownBits(Den, SQ.getWithInstruction(&I));
1047 unsigned RHSBits = Known.countMaxActiveBits();
1048 if (RHSBits > MaxDivBits)
1049 return SSBits;
1050
1051 Known = computeKnownBits(Num, SQ.getWithInstruction(&I));
1052 unsigned LHSBits = Known.countMaxActiveBits();
1053
 // The wider of the two operands determines the width.
1054 unsigned DivBits = std::max(LHSBits, RHSBits);
1055 return DivBits;
1056}
1057
1058// The fractional part of a float is enough to accurately represent up to
1059// a 24-bit signed integer.
1060Value *AMDGPUCodeGenPrepareImpl::expandDivRem24(IRBuilder<> &Builder,
1061 BinaryOperator &I, Value *Num,
1062 Value *Den, bool IsDiv,
1063 bool IsSigned) const {
1064 unsigned DivBits = getDivNumBits(I, Num, Den, 24, IsSigned);
1065
1066 // v_rcp_f32(float(X)) can have an error of 1 ulp.
1067 // This can cause expandDivRem24Impl to sometimes calculate Y/X incorrectly
1068 // when abs(Y)>0x800000.
1069 // For example,
1070 // (0xbf2758/0xbf2759) erroneously produces 1 instead of 0.
1071 // (0xe3170d/0x000c32) erroneously produces 4767 instead of 4766.
1072 //
1073 // Note that for DivBits==24 && IsSigned, Y is in the range
1074 // [-0x800000:0x7FFFFF]. abs(Y) is at most
1075 // 0x800000 so it cannot hit this issue.
1076 if (DivBits > (IsSigned ? 24 : 23))
1077 return nullptr;
1078 return expandDivRem24Impl(Builder, I, Num, Den, DivBits, IsDiv, IsSigned);
1079}
1080
1081Value *AMDGPUCodeGenPrepareImpl::expandDivRem24Impl(
1082 IRBuilder<> &Builder, BinaryOperator &I, Value *Num, Value *Den,
1083 unsigned DivBits, bool IsDiv, bool IsSigned) const {
1084 Type *I32Ty = Builder.getInt32Ty();
1085 Num = Builder.CreateTrunc(Num, I32Ty);
1086 Den = Builder.CreateTrunc(Den, I32Ty);
1087
1088 Type *F32Ty = Builder.getFloatTy();
1089 ConstantInt *One = Builder.getInt32(1);
1090 Value *JQ = One;
1091
1092 if (IsSigned) {
1093 // char|short jq = ia ^ ib;
1094 JQ = Builder.CreateXor(Num, Den);
1095
1096 // jq = jq >> (bitsize - 2)
1097 JQ = Builder.CreateAShr(JQ, Builder.getInt32(30));
1098
1099 // jq = jq | 0x1
1100 JQ = Builder.CreateOr(JQ, One);
1101 }
1102
1103 // int ia = (int)LHS;
1104 Value *IA = Num;
1105
1106 // int ib, (int)RHS;
1107 Value *IB = Den;
1108
1109 // float fa = (float)ia;
1110 Value *FA = IsSigned ? Builder.CreateSIToFP(IA, F32Ty)
1111 : Builder.CreateUIToFP(IA, F32Ty);
1112
1113 // float fb = (float)ib;
1114 Value *FB = IsSigned ? Builder.CreateSIToFP(IB,F32Ty)
1115 : Builder.CreateUIToFP(IB,F32Ty);
1116
1117 Value *RCP = Builder.CreateIntrinsic(Intrinsic::amdgcn_rcp,
1118 Builder.getFloatTy(), {FB});
1119 Value *FQM = Builder.CreateFMul(FA, RCP);
1120
1121 // fq = trunc(fqm);
1122 Value *FQ = Builder.CreateUnaryIntrinsic(Intrinsic::trunc, FQM);
1123 auto *FQI = dyn_cast<Instruction>(FQ);
1124 if (FQI)
1125 FQI->copyFastMathFlags(Builder.getFastMathFlags());
1126
1127 // float fqneg = -fq;
1128 Value *FQNeg = Builder.CreateFNeg(FQ);
1129
1130 // float fr = mad(fqneg, fb, fa);
1131 auto FMAD = !ST.hasMadMacF32Insts()
1132 ? Intrinsic::fma
1133 : (Intrinsic::ID)Intrinsic::amdgcn_fmad_ftz;
1134 Value *FR =
1135 Builder.CreateIntrinsic(FMAD, {FQNeg->getType()}, {FQNeg, FB, FA}, FQI);
1136
1137 // int iq = (int)fq;
1138 Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty)
1139 : Builder.CreateFPToUI(FQ, I32Ty);
1140
1141 // fr = fabs(fr);
1142 FR = Builder.CreateFAbs(FR, FQI);
1143
1144 // fb = fabs(fb);
1145 FB = Builder.CreateFAbs(FB, FQI);
1146
1147 // int cv = fr >= fb;
1148 Value *CV = Builder.CreateFCmpOGE(FR, FB);
1149
1150 // jq = (cv ? jq : 0);
1151 JQ = Builder.CreateSelect(CV, JQ, Builder.getInt32(0));
1152
1153 // dst = iq + jq;
1154 Value *Div = Builder.CreateAdd(IQ, JQ);
1155
1156 Value *Res = Div;
1157 if (!IsDiv) {
1158 // Rem needs compensation, it's easier to recompute it
1159 Value *Rem = Builder.CreateMul(Div, Den);
1160 Res = Builder.CreateSub(Num, Rem);
1161 }
1162
1163 if (DivBits != 0 && DivBits < 32) {
1164 // Extend in register from the number of bits this divide really is.
1165 if (IsSigned) {
1166 int InRegBits = 32 - DivBits;
1167
1168 Res = Builder.CreateShl(Res, InRegBits);
1169 Res = Builder.CreateAShr(Res, InRegBits);
1170 } else {
1171 ConstantInt *TruncMask
1172 = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
1173 Res = Builder.CreateAnd(Res, TruncMask);
1174 }
1175 }
1176
1177 return Res;
1178}
1179
1180// Try to recognize special cases the DAG will emit special, better expansions
1181// than the general expansion we do here.
//
// Returns true when the denominator \p Den matches one of those cases, in which
// case callers leave the division alone. \p Num is not referenced in the
// visible body.
1182
1183// TODO: It would be better to just directly handle those optimizations here.
1184bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I,
1185 Value *Num,
1186 Value *Den) const {
1187 if (Constant *C = dyn_cast<Constant>(Den)) {
1188 // Arbitrary constants get a better expansion as long as a wider mulhi is
1189 // legal.
1190 if (C->getType()->getScalarSizeInBits() <= 32)
1191 return true;
1192
1193 // TODO: Sdiv check for not exact for some reason.
1194
1195 // If there's no wider mulhi, there's only a better expansion for powers of
1196 // two.
1197 // TODO: Should really know for each vector element.
 // NOTE(review): the condition guarding the following return (listing line
 // 1198) is absent from this view; presumably a power-of-two test on C.
 // Confirm against the full source.
1199 return true;
1200
1201 return false;
1202 }
1203
1204 if (BinaryOperator *BinOpDen = dyn_cast<BinaryOperator>(Den)) {
1205 // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
1206 if (BinOpDen->getOpcode() == Instruction::Shl &&
1207 isa<Constant>(BinOpDen->getOperand(0)) &&
1208 isKnownToBeAPowerOfTwo(BinOpDen->getOperand(0), true,
1209 SQ.getWithInstruction(&I))) {
1210 return true;
1211 }
1212 }
1213
1214 return false;
1215}
1216
1217static Value *getSign32(Value *V, IRBuilder<> &Builder, const DataLayout DL) {
1218 // Check whether the sign can be determined statically.
1219 KnownBits Known = computeKnownBits(V, DL);
1220 if (Known.isNegative())
1221 return Constant::getAllOnesValue(V->getType());
1222 if (Known.isNonNegative())
1223 return Constant::getNullValue(V->getType());
1224 return Builder.CreateAShr(V, Builder.getInt32(31));
1225}
1226
1227Value *AMDGPUCodeGenPrepareImpl::expandDivRem32(IRBuilder<> &Builder,
1228 BinaryOperator &I, Value *X,
1229 Value *Y) const {
1230 Instruction::BinaryOps Opc = I.getOpcode();
1231 assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
1232 Opc == Instruction::SRem || Opc == Instruction::SDiv);
1233
1234 FastMathFlags FMF;
1235 FMF.setFast();
1236 Builder.setFastMathFlags(FMF);
1237
1238 if (divHasSpecialOptimization(I, X, Y))
1239 return nullptr; // Keep it for later optimization.
1240
1241 bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
1242 bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;
1243
1244 Type *Ty = X->getType();
1245 Type *I32Ty = Builder.getInt32Ty();
1246 Type *F32Ty = Builder.getFloatTy();
1247
1248 if (Ty->getScalarSizeInBits() != 32) {
1249 if (IsSigned) {
1250 X = Builder.CreateSExtOrTrunc(X, I32Ty);
1251 Y = Builder.CreateSExtOrTrunc(Y, I32Ty);
1252 } else {
1253 X = Builder.CreateZExtOrTrunc(X, I32Ty);
1254 Y = Builder.CreateZExtOrTrunc(Y, I32Ty);
1255 }
1256 }
1257
1258 if (Value *Res = expandDivRem24(Builder, I, X, Y, IsDiv, IsSigned)) {
1259 return IsSigned ? Builder.CreateSExtOrTrunc(Res, Ty) :
1260 Builder.CreateZExtOrTrunc(Res, Ty);
1261 }
1262
1263 ConstantInt *Zero = Builder.getInt32(0);
1264 ConstantInt *One = Builder.getInt32(1);
1265
1266 Value *Sign = nullptr;
1267 if (IsSigned) {
1268 Value *SignX = getSign32(X, Builder, DL);
1269 Value *SignY = getSign32(Y, Builder, DL);
1270 // Remainder sign is the same as LHS
1271 Sign = IsDiv ? Builder.CreateXor(SignX, SignY) : SignX;
1272
1273 X = Builder.CreateAdd(X, SignX);
1274 Y = Builder.CreateAdd(Y, SignY);
1275
1276 X = Builder.CreateXor(X, SignX);
1277 Y = Builder.CreateXor(Y, SignY);
1278 }
1279
1280 // The algorithm here is based on ideas from "Software Integer Division", Tom
1281 // Rodeheffer, August 2008.
1282 //
1283 // unsigned udiv(unsigned x, unsigned y) {
1284 // // Initial estimate of inv(y). The constant is less than 2^32 to ensure
1285 // // that this is a lower bound on inv(y), even if some of the calculations
1286 // // round up.
1287 // unsigned z = (unsigned)((4294967296.0 - 512.0) * v_rcp_f32((float)y));
1288 //
1289 // // One round of UNR (Unsigned integer Newton-Raphson) to improve z.
1290 // // Empirically this is guaranteed to give a "two-y" lower bound on
1291 // // inv(y).
1292 // z += umulh(z, -y * z);
1293 //
1294 // // Quotient/remainder estimate.
1295 // unsigned q = umulh(x, z);
1296 // unsigned r = x - q * y;
1297 //
1298 // // Two rounds of quotient/remainder refinement.
1299 // if (r >= y) {
1300 // ++q;
1301 // r -= y;
1302 // }
1303 // if (r >= y) {
1304 // ++q;
1305 // r -= y;
1306 // }
1307 //
1308 // return q;
1309 // }
1310
1311 // Initial estimate of inv(y).
1312 Value *FloatY = Builder.CreateUIToFP(Y, F32Ty);
1313 Value *RcpY = Builder.CreateIntrinsic(Intrinsic::amdgcn_rcp, F32Ty, {FloatY});
1314 Constant *Scale = ConstantFP::get(F32Ty, llvm::bit_cast<float>(0x4F7FFFFE));
1315 Value *ScaledY = Builder.CreateFMul(RcpY, Scale);
1316 Value *Z = Builder.CreateFPToUI(ScaledY, I32Ty);
1317
1318 // One round of UNR.
1319 Value *NegY = Builder.CreateSub(Zero, Y);
1320 Value *NegYZ = Builder.CreateMul(NegY, Z);
1321 Z = Builder.CreateAdd(Z, getMulHu(Builder, Z, NegYZ));
1322
1323 // Quotient/remainder estimate.
1324 Value *Q = getMulHu(Builder, X, Z);
1325 Value *R = Builder.CreateSub(X, Builder.CreateMul(Q, Y));
1326
1327 // First quotient/remainder refinement.
1328 Value *Cond = Builder.CreateICmpUGE(R, Y);
1329 if (IsDiv)
1330 Q = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
1331 R = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);
1332
1333 // Second quotient/remainder refinement.
1334 Cond = Builder.CreateICmpUGE(R, Y);
1335 Value *Res;
1336 if (IsDiv)
1337 Res = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
1338 else
1339 Res = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);
1340
1341 if (IsSigned) {
1342 Res = Builder.CreateXor(Res, Sign);
1343 Res = Builder.CreateSub(Res, Sign);
1344 Res = Builder.CreateSExtOrTrunc(Res, Ty);
1345 } else {
1346 Res = Builder.CreateZExtOrTrunc(Res, Ty);
1347 }
1348 return Res;
1349}
1350
1351Value *AMDGPUCodeGenPrepareImpl::shrinkDivRem64(IRBuilder<> &Builder,
1352 BinaryOperator &I, Value *Num,
1353 Value *Den) const {
1354 if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den))
1355 return nullptr; // Keep it for later optimization.
1356
1357 Instruction::BinaryOps Opc = I.getOpcode();
1358
1359 bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv;
1360 bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem;
1361
1362 unsigned NumDivBits = getDivNumBits(I, Num, Den, 32, IsSigned);
1363 if (NumDivBits > 32)
1364 return nullptr;
1365
1366 Value *Narrowed = nullptr;
1367 // v_rcp_f32(float(X)) can have an error of 1 ulp.
1368 // This can cause expandDivRem24Impl to sometimes calculate Y/X incorrectly
1369 // when abs(Y)>0x800000.
1370 // For example,
1371 // (0xbf2758/0xbf2759) erroneously produces 1 instead of 0.
1372 // (0xe3170d/0x000c32) erroneously produces 4767 instead of 4766.
1373 //
1374 // Note that for NumDivBits==24 && IsSigned, Y is in the range
1375 // [-0x800000:0x7FFFFF]. abs(Y) is at most
1376 // 0x800000 so it cannot hit this issue.
1377 if (NumDivBits <= (IsSigned ? 24 : 23)) {
1378 Narrowed = expandDivRem24Impl(Builder, I, Num, Den, NumDivBits,
1379 IsDiv, IsSigned);
1380 } else if (NumDivBits <= 32) {
1381 Narrowed = expandDivRem32(Builder, I, Num, Den);
1382 }
1383
1384 if (Narrowed) {
1385 return IsSigned ? Builder.CreateSExt(Narrowed, Num->getType()) :
1386 Builder.CreateZExt(Narrowed, Num->getType());
1387 }
1388
1389 return nullptr;
1390}
1391
// Perform the general expansion of the 64-bit division or remainder \p I.
// Reaching the end of the function with any other opcode is treated as
// unreachable.
//
// NOTE(review): the statement inside each branch (listing lines 1396 and 1401)
// is absent from this view; presumably the calls that perform the division and
// remainder expansions. Confirm against the full source.
1392void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &I) const {
1393 Instruction::BinaryOps Opc = I.getOpcode();
1394 // Do the general expansion.
1395 if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) {
1397 return;
1398 }
1399
1400 if (Opc == Instruction::URem || Opc == Instruction::SRem) {
1402 return;
1403 }
1404
1405 llvm_unreachable("not a division");
1406}
1407
1408/*
1409Narrowing can make the handling of non-byte-sized loads inconsistent, for
example:
1410```
1411 %load = load i1, ptr addrspace(4) %arg, align 4
1412 %zext = zext i1 %load to i64
1413 %add = add i64 %zext, ...
1414```
1415Instead of creating `s_and_b32 s0, s0, 1`,
1416it will create `s_and_b32 s0, s0, 0xff`.
1417We accept this change since the non-byte load assumes the upper bits
1418within the byte are all 0.

Summary of the function below: for an add or mul whose result is known to fit
in a narrower legal integer type, and when the target cost model says it is
cheaper, rewrite it as trunc operands -> narrow op -> zext. Returns true if the
instruction was rewritten.
1419*/
1420bool AMDGPUCodeGenPrepareImpl::tryNarrowMathIfNoOverflow(Instruction *I) {
1421 unsigned Opc = I->getOpcode();
1422 Type *OldType = I->getType();
1423
1424 if (Opc != Instruction::Add && Opc != Instruction::Mul)
1425 return false;
1426
1427 unsigned OrigBit = OldType->getScalarSizeInBits();
1428
 // NOTE(review): this repeats the opcode test above, so the unreachable can
 // never fire after the early return.
1429 if (Opc != Instruction::Add && Opc != Instruction::Mul)
1430 llvm_unreachable("Unexpected opcode, only valid for Instruction::Add and "
1431 "Instruction::Mul.");
1432
 // Upper bound on the number of bits the result can occupy.
1433 unsigned MaxBitsNeeded = computeKnownBits(I, DL).countMaxActiveBits();
1434
 // Round up to a power of two, with a floor of 8 bits.
1435 MaxBitsNeeded = std::max<unsigned>(bit_ceil(MaxBitsNeeded), 8);
1436 Type *NewType = DL.getSmallestLegalIntType(I->getContext(), MaxBitsNeeded);
1437 if (!NewType)
1438 return false;
1439 unsigned NewBit = NewType->getIntegerBitWidth();
 // Nothing to gain unless the new type is strictly narrower.
1440 if (NewBit >= OrigBit)
1441 return false;
 // Rebuild the narrow type from the instruction's own type.
1442 NewType = I->getType()->getWithNewBitWidth(NewBit);
1443
 // NOTE(review): several lines of the cost computation below (listing lines
 // 1447, 1450, 1459-1460 and 1464) are absent from this view, leaving the
 // statements truncated. Confirm against the full source.
1444 // Old cost
1445 const TargetTransformInfo &TTI = TM.getTargetTransformInfo(F);
1446 InstructionCost OldCost =
1448 // New cost of new op
1449 InstructionCost NewCost =
1451 // New cost of narrowing 2 operands (use trunc)
1452 int NumOfNonConstOps = 2;
1453 if (isa<Constant>(I->getOperand(0)) || isa<Constant>(I->getOperand(1))) {
1454 // Cannot be both constant, should be propagated
1455 NumOfNonConstOps = 1;
1456 }
1457 NewCost += NumOfNonConstOps * TTI.getCastInstrCost(Instruction::Trunc,
1458 NewType, OldType,
1461 // New cost of zext narrowed result to original type
1462 NewCost +=
1463 TTI.getCastInstrCost(Instruction::ZExt, OldType, NewType,
 // Only transform when the narrow sequence is strictly cheaper.
1465 if (NewCost >= OldCost)
1466 return false;
1467
 // Emit trunc(op0), trunc(op1), the narrow op, and a zext back to OldType.
1468 IRBuilder<> Builder(I);
1469 Value *Trunc0 = Builder.CreateTrunc(I->getOperand(0), NewType);
1470 Value *Trunc1 = Builder.CreateTrunc(I->getOperand(1), NewType);
1471 Value *Arith =
1472 Builder.CreateBinOp((Instruction::BinaryOps)Opc, Trunc0, Trunc1);
1473
1474 Value *Zext = Builder.CreateZExt(Arith, OldType);
1475 I->replaceAllUsesWith(Zext);
1476 DeadVals.push_back(I);
1477 return true;
1478}
1479
// Visit a binary operator. In order, this tries: folding the operation into a
// select, replacing a mul with a mul24 intrinsic (if enabled), and narrowing an
// add/mul; each returns early on success. Otherwise, integer div/rem of at most
// 64 bits is expanded inline (element by element for fixed vectors), with
// 64-bit cases that cannot be shrunk queued for the general expansion.
// Returns true if anything changed.
1480bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
1481 if (foldBinOpIntoSelect(I))
1482 return true;
1483
1484 if (UseMul24Intrin && replaceMulWithMul24(I))
1485 return true;
1486 if (tryNarrowMathIfNoOverflow(&I))
1487 return true;
1488
1489 bool Changed = false;
1490 Instruction::BinaryOps Opc = I.getOpcode();
1491 Type *Ty = I.getType();
1492 Value *NewDiv = nullptr;
1493 unsigned ScalarSize = Ty->getScalarSizeInBits();
1494
 // NOTE(review): listing line 1495 is absent from this view; presumably the
 // declaration of Div64ToExpand used below. Confirm against the full source.
1496
1497 if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
1498 Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
1499 ScalarSize <= 64 &&
1500 !DisableIDivExpand) {
1501 Value *Num = I.getOperand(0);
1502 Value *Den = I.getOperand(1);
1503 IRBuilder<> Builder(&I);
1504 Builder.SetCurrentDebugLocation(I.getDebugLoc());
1505
1506 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
 // Vector case: expand each element and rebuild the vector by insertion.
1507 NewDiv = PoisonValue::get(VT);
1508
1509 for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
1510 Value *NumEltN = Builder.CreateExtractElement(Num, N);
1511 Value *DenEltN = Builder.CreateExtractElement(Den, N);
1512
1513 Value *NewElt;
1514 if (ScalarSize <= 32) {
1515 NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
 // No expansion produced: keep a scalar copy of the original operation.
1516 if (!NewElt)
1517 NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
1518 } else {
1519 // See if this 64-bit division can be shrunk to 32/24-bits before
1520 // producing the general expansion.
1521 NewElt = shrinkDivRem64(Builder, I, NumEltN, DenEltN);
1522 if (!NewElt) {
1523 // The general 64-bit expansion introduces control flow and doesn't
1524 // return the new value. Just insert a scalar copy and defer
1525 // expanding it.
1526 NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
1527 // CreateBinOp does constant folding. If the operands are constant,
1528 // it will return a Constant instead of a BinaryOperator.
1529 if (auto *NewEltBO = dyn_cast<BinaryOperator>(NewElt))
1530 Div64ToExpand.push_back(NewEltBO);
1531 }
1532 }
1533
 // Carry the original instruction's IR flags over to the new element op.
1534 if (auto *NewEltI = dyn_cast<Instruction>(NewElt))
1535 NewEltI->copyIRFlags(&I);
1536
1537 NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N);
1538 }
1539 } else {
 // Scalar case.
1540 if (ScalarSize <= 32)
1541 NewDiv = expandDivRem32(Builder, I, Num, Den);
1542 else {
1543 NewDiv = shrinkDivRem64(Builder, I, Num, Den);
 // Could not shrink: queue the original for the general 64-bit expansion.
1544 if (!NewDiv)
1545 Div64ToExpand.push_back(&I);
1546 }
1547 }
1548
1549 if (NewDiv) {
1550 I.replaceAllUsesWith(NewDiv);
1551 DeadVals.push_back(&I);
1552 Changed = true;
1553 }
1554 }
1555
1556 if (ExpandDiv64InIR) {
1557 // TODO: We get much worse code in specially handled constant cases.
1558 for (BinaryOperator *Div : Div64ToExpand) {
1559 expandDivRem64(*Div);
 // Record that the control flow was modified by the expansion.
1560 FlowChanged = true;
1561 Changed = true;
1562 }
1563 }
1564
1565 return Changed;
1566}
1567
// When load widening is enabled, replace a load from the constant address
// spaces that canWidenScalarExtLoad accepts with an i32 load followed by a
// trunc and a bitcast back to the original type. Returns true if the load was
// replaced.
1568bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) {
1569 if (!WidenLoads)
1570 return false;
1571
1572 if ((I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
1573 I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
1574 canWidenScalarExtLoad(I)) {
1575 IRBuilder<> Builder(&I);
1576 Builder.SetCurrentDebugLocation(I.getDebugLoc());
1577
1578 Type *I32Ty = Builder.getInt32Ty();
1579 LoadInst *WidenLoad = Builder.CreateLoad(I32Ty, I.getPointerOperand());
 // NOTE(review): listing line 1580 is absent from this view; presumably it
 // copies metadata from I onto WidenLoad. Confirm against the full source.
1581
1582 // The widened load reads the original bytes in the low bits, so a !range
1583 // lower bound still holds. Convert it to the new type and don't make
1584 // assumptions about the high bits.
1585 if (auto *Range = I.getMetadata(LLVMContext::MD_range)) {
1586 ConstantInt *Lower = mdconst::extract<ConstantInt>(Range->getOperand(0));
1587
 // !range is only attached when the lower bound is non-zero.
1588 if (!Lower->isNullValue()) {
1589 Metadata *LowAndHigh[] = {
1590 ConstantAsMetadata::get(ConstantInt::get(I32Ty, Lower->getValue().zext(32))),
1591 // Don't make assumptions about the high bits.
1592 ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0))
1593 };
1594
1595 WidenLoad->setMetadata(LLVMContext::MD_range,
1596 MDNode::get(F.getContext(), LowAndHigh));
1597 }
1598 }
1599
 // Truncate to an integer of the original bit size, then bitcast so
 // non-integer result types are handled as well.
1600 int TySize = DL.getTypeSizeInBits(I.getType());
1601 Type *IntNTy = Builder.getIntNTy(TySize);
1602 Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
1603 Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
1604 I.replaceAllUsesWith(ValOrig);
1605 DeadVals.push_back(&I);
1606 return true;
1607 }
1608
1609 return false;
1610}
1611
// Recognize select-based implementations of fract() on floating-point selects
// and replace them with the result of applyFractPat. Returns true if the select
// was replaced.
1612bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) {
1613 FPMathOperator *FPOp = dyn_cast<FPMathOperator>(&I);
1614 if (!FPOp)
1615 return false;
1616
1617 Value *X;
1618 Value *Fract = nullptr;
1619
1620 // Match:
1621 // (x - floor(x)) >= MIN_CONSTANT ? MIN_CONSTANT : (x - floor(x))
1622 //
1623 // This is the preferred way to implement fract.
1624 // TODO: Could also match with compare against 1.0
1625 const APFloat *C;
 // NOTE(review): listing line 1626 is absent from this view; presumably the
 // `if (match(...)) {` that binds X and C and opens the branch closed by the
 // `} else {` below. Confirm against the full source.
1627 Value *FractSrc = matchFractPatImpl(*X, *C);
1628 if (!FractSrc)
1629 return false;
1630 IRBuilder<> Builder(&I);
1631 Builder.setFastMathFlags(FPOp->getFastMathFlags());
1632 Fract = applyFractPat(Builder, FractSrc);
1633 } else {
1634 // Match patterns which may appear in legacy implementations of the fract()
1635 // function, built around the nan-avoidant minnum intrinsic. These are the
1636 // core pattern plus additional clamping of inf and nan values on the
1637 // result.
1638 Value *Cond = I.getCondition();
1639 Value *TrueVal = I.getTrueValue();
1640 Value *FalseVal = I.getFalseValue();
1641 Value *CmpVal;
1642 CmpPredicate IsNanPred;
1643
1644 // Match fract pattern with nan check.
1645 if (!match(Cond, m_FCmp(IsNanPred, m_Value(CmpVal), m_NonNaN())))
1646 return false;
1647
1648 IRBuilder<> Builder(&I);
1649 Builder.setFastMathFlags(FPOp->getFastMathFlags());
1650
1651 if (IsNanPred == FCmpInst::FCMP_UNO && TrueVal == CmpVal &&
1652 CmpVal == matchFractPatNanAvoidant(*FalseVal)) {
1653 // isnan(x) ? x : fract(x)
1654 Fract = applyFractPat(Builder, CmpVal);
1655 } else if (IsNanPred == FCmpInst::FCMP_ORD && FalseVal == CmpVal) {
1656 if (CmpVal == matchFractPatNanAvoidant(*TrueVal)) {
1657 // !isnan(x) ? fract(x) : x
1658 Fract = applyFractPat(Builder, CmpVal);
1659 } else {
1660 // Match an intermediate clamp infinity to 0 pattern. i.e.
1661 // !isnan(x) ? (!isinf(x) ? fract(x) : 0.0) : x
1662 CmpPredicate PredInf;
1663 Value *IfNotInf;
1664
1665 if (!match(TrueVal, m_Select(m_FCmp(PredInf, m_FAbs(m_Specific(CmpVal)),
1666 m_PosInf()),
1667 m_Value(IfNotInf), m_PosZeroFP())) ||
1668 PredInf != FCmpInst::FCMP_UNE ||
1669 CmpVal != matchFractPatNanAvoidant(*IfNotInf))
1670 return false;
1671
1672 SelectInst *ClampInfSelect = cast<SelectInst>(TrueVal);
1673
1674 // Insert before the fabs
1675 Value *InsertPt =
1676 cast<Instruction>(ClampInfSelect->getCondition())->getOperand(0);
1677
1678 Builder.SetInsertPoint(cast<Instruction>(InsertPt));
1679 Value *NewFract = applyFractPat(Builder, CmpVal);
1680 NewFract->takeName(TrueVal);
1681
1682 // Thread the new fract into the inf clamping sequence.
 // The old true-operand is queued in DeadVals before being replaced.
1683 DeadVals.push_back(ClampInfSelect->getOperand(1));
1684 ClampInfSelect->setOperand(1, NewFract);
1685
1686 // The outer select nan handling is also absorbed into the fract.
1687 Fract = ClampInfSelect;
1688 }
1689 } else
1690 return false;
1691 }
1692
 // Replace the select with the matched fract value.
1693 Fract->takeName(&I);
1694 I.replaceAllUsesWith(Fract);
1695 DeadVals.push_back(&I);
1696 return true;
1697}
1698
1699static bool areInSameBB(const Value *A, const Value *B) {
1700 const auto *IA = dyn_cast<Instruction>(A);
1701 const auto *IB = dyn_cast<Instruction>(B);
1702 return IA && IB && IA->getParent() == IB->getParent();
1703}
1704
1705// Helper for breaking large PHIs that returns true when an extractelement on V
1706// is likely to be folded away by the DAG combiner.
//
// The value counts as interesting when it is a fixed vector that is one of:
// an insertelement chain with constant in-range indices covering every
// element, a constant, or a shufflevector with a constant second operand or an
// operand in the same block.
//
// NOTE(review): the function signature (listing line 1707) is absent from this
// view; the body below refers to its parameter as V. Confirm against the full
// source.
1708 const auto *FVT = dyn_cast<FixedVectorType>(V->getType());
1709 if (!FVT)
1710 return false;
1711
1712 const Value *CurVal = V;
1713
1714 // Check for insertelements, keeping track of the elements covered.
1715 BitVector EltsCovered(FVT->getNumElements());
1716 while (const auto *IE = dyn_cast<InsertElementInst>(CurVal)) {
1717 const auto *Idx = dyn_cast<ConstantInt>(IE->getOperand(2));
1718
1719 // Non constant index/out of bounds index -> folding is unlikely.
1720 // The latter is more of a sanity check because canonical IR should just
1721 // have replaced those with poison.
1722 if (!Idx || Idx->getZExtValue() >= FVT->getNumElements())
1723 return false;
1724
1725 const auto *VecSrc = IE->getOperand(0);
1726
1727 // If the vector source is another instruction, it must be in the same basic
1728 // block. Otherwise, the DAGCombiner won't see the whole thing and is
1729 // unlikely to be able to do anything interesting here.
1730 if (isa<Instruction>(VecSrc) && !areInSameBB(VecSrc, IE))
1731 return false;
1732
 // Step to the vector this insertelement wrote into and mark the lane.
1733 CurVal = VecSrc;
1734 EltsCovered.set(Idx->getZExtValue());
1735
1736 // All elements covered.
1737 if (EltsCovered.all())
1738 return true;
1739 }
1740
1741 // We either didn't find a single insertelement, or the insertelement chain
1742 // ended before all elements were covered. Check for other interesting values.
1743
1744 // Constants are always interesting because we can just constant fold the
1745 // extractelements.
1746 if (isa<Constant>(CurVal))
1747 return true;
1748
1749 // shufflevector is likely to be profitable if either operand is a constant,
1750 // or if either source is in the same block.
1751 // This is because shufflevector is most often lowered as a series of
1752 // insert/extract elements anyway.
 // NOTE(review): only operand 1 is tested for being a constant here, although
 // the comment above says "either operand". Confirm whether that is intended.
1753 if (const auto *SV = dyn_cast<ShuffleVectorInst>(CurVal)) {
1754 return isa<Constant>(SV->getOperand(1)) ||
1755 areInSameBB(SV, SV->getOperand(0)) ||
1756 areInSameBB(SV, SV->getOperand(1));
1757 }
1758
1759 return false;
1760}
1761
// Recursively collect into SeenPHIs the PHI node \p I together with every PHI
// node reachable from it through incoming values or users. A PHI that is
// already in the set is not visited again, which ends the recursion on cycles.
//
// NOTE(review): the second parameter line (listing line 1763) is absent from
// this view; the body refers to it as SeenPHIs. Confirm against the full
// source.
1762static void collectPHINodes(const PHINode &I,
1764 const auto [It, Inserted] = SeenPHIs.insert(&I);
1765 if (!Inserted)
1766 return;
1767
 // Follow PHIs feeding into this one.
1768 for (const Value *Inc : I.incoming_values()) {
1769 if (const auto *PhiInc = dyn_cast<PHINode>(Inc))
1770 collectPHINodes(*PhiInc, SeenPHIs);
1771 }
1772
 // Follow PHIs that consume this one.
1773 for (const User *U : I.users()) {
1774 if (const auto *PhiU = dyn_cast<PHINode>(U))
1775 collectPHINodes(*PhiU, SeenPHIs);
1776 }
1777}
1778
1779bool AMDGPUCodeGenPrepareImpl::canBreakPHINode(const PHINode &I) {
1780 // Check in the cache first.
1781 if (const auto It = BreakPhiNodesCache.find(&I);
1782 It != BreakPhiNodesCache.end())
1783 return It->second;
1784
1785 // We consider PHI nodes as part of "chains", so given a PHI node I, we
1786 // recursively consider all its users and incoming values that are also PHI
1787 // nodes. We then make a decision about all of those PHIs at once. Either they
1788 // all get broken up, or none of them do. That way, we avoid cases where a
1789 // single PHI is/is not broken and we end up reforming/exploding a vector
1790 // multiple times, or even worse, doing it in a loop.
1791 SmallPtrSet<const PHINode *, 8> WorkList;
1792 collectPHINodes(I, WorkList);
1793
1794#ifndef NDEBUG
1795 // Check that none of the PHI nodes in the worklist are in the map. If some of
1796 // them are, it means we're not good enough at collecting related PHIs.
1797 for (const PHINode *WLP : WorkList) {
1798 assert(BreakPhiNodesCache.count(WLP) == 0);
1799 }
1800#endif
1801
1802 // To consider a PHI profitable to break, we need to see some interesting
1803 // incoming values. At least 2/3rd (rounded up) of all PHIs in the worklist
1804 // must have one to consider all PHIs breakable.
1805 //
1806 // This threshold has been determined through performance testing.
1807 //
1808 // Note that the computation below is equivalent to
1809 //
1810 // (unsigned)ceil((K / 3.0) * 2)
1811 //
1812 // It's simply written this way to avoid mixing integral/FP arithmetic.
1813 const auto Threshold = (alignTo(WorkList.size() * 2, 3) / 3);
1814 unsigned NumBreakablePHIs = 0;
1815 bool CanBreak = false;
1816 for (const PHINode *Cur : WorkList) {
1817 // Don't break PHIs that have no interesting incoming values. That is, where
1818 // there is no clear opportunity to fold the "extractelement" instructions
1819 // we would add.
1820 //
1821 // Note: IC does not run after this pass, so we're only interested in the
1822 // foldings that the DAG combiner can do.
1823 if (any_of(Cur->incoming_values(), isInterestingPHIIncomingValue)) {
1824 if (++NumBreakablePHIs >= Threshold) {
1825 CanBreak = true;
1826 break;
1827 }
1828 }
1829 }
1830
1831 for (const PHINode *Cur : WorkList)
1832 BreakPhiNodesCache[Cur] = CanBreak;
1833
1834 return CanBreak;
1835}
1836
1837/// Helper class for "break large PHIs" (visitPHINode).
1838///
1839/// This represents a slice of a PHI's incoming value, which is made up of:
1840/// - The type of the slice (Ty)
1841/// - The index in the incoming value's vector where the slice starts (Idx)
1842/// - The number of elements in the slice (NumElts).
1843/// It also keeps track of the NewPHI node inserted for this particular slice.
1844///
1845/// Slice examples:
1846/// <4 x i64> -> Split into four i64 slices.
1847/// -> [i64, 0, 1], [i64, 1, 1], [i64, 2, 1], [i64, 3, 1]
1848/// <5 x i16> -> Split into 2 <2 x i16> slices + a i16 tail.
1849/// -> [<2 x i16>, 0, 2], [<2 x i16>, 2, 2], [i16, 4, 1]
1851public:
1852 VectorSlice(Type *Ty, unsigned Idx, unsigned NumElts)
1853 : Ty(Ty), Idx(Idx), NumElts(NumElts) {}
1854
1855 Type *Ty = nullptr;
1856 unsigned Idx = 0;
1857 unsigned NumElts = 0;
1858 PHINode *NewPHI = nullptr;
1859
1860 /// Slice \p Inc according to the information contained within this slice.
1861 /// This is cached, so if called multiple times for the same \p BB & \p Inc
1862 /// pair, it returns the same Sliced value as well.
1863 ///
1864 /// Note this *intentionally* does not return the same value for, say,
1865 /// [%bb.0, %0] & [%bb.1, %0] as:
1866 /// - It could cause issues with dominance (e.g. if bb.1 is seen first, then
1867 /// the value in bb.1 may not be reachable from bb.0 if it's its
1868 /// predecessor.)
1869 /// - We also want to make our extract instructions as local as possible so
1870 /// the DAG has better chances of folding them out. Duplicating them like
1871 /// that is beneficial in that regard.
1872 ///
1873 /// This is both a minor optimization to avoid creating duplicate
1874 /// instructions, but also a requirement for correctness. It is not forbidden
1875 /// for a PHI node to have the same [BB, Val] pair multiple times. If we
1876 /// returned a new value each time, those previously identical pairs would all
1877 /// have different incoming values (from the same block) and it'd cause a "PHI
1878 /// node has multiple entries for the same basic block with different incoming
1879 /// values!" verifier error.
1880 Value *getSlicedVal(BasicBlock *BB, Value *Inc, StringRef NewValName) {
1881 Value *&Res = SlicedVals[{BB, Inc}];
1882 if (Res)
1883 return Res;
1884
1886 if (Instruction *IncInst = dyn_cast<Instruction>(Inc))
1887 B.SetCurrentDebugLocation(IncInst->getDebugLoc());
1888
1889 if (NumElts > 1) {
1891 for (unsigned K = Idx; K < (Idx + NumElts); ++K)
1892 Mask.push_back(K);
1893 Res = B.CreateShuffleVector(Inc, Mask, NewValName);
1894 } else
1895 Res = B.CreateExtractElement(Inc, Idx, NewValName);
1896
1897 return Res;
1898 }
1899
1900private:
1902};
1903
1904bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {
1905 // Break-up fixed-vector PHIs into smaller pieces.
1906 // Default threshold is 32, so it breaks up any vector that's >32 bits into
1907 // its elements, or into 32-bit pieces (for 8/16 bit elts).
1908 //
1909 // This is only helpful for DAGISel because it doesn't handle large PHIs as
1910 // well as GlobalISel. DAGISel lowers PHIs by using CopyToReg/CopyFromReg.
1911 // With large, odd-sized PHIs we may end up needing many `build_vector`
1912 // operations with most elements being "undef". This inhibits a lot of
1913 // optimization opportunities and can result in unreasonably high register
1914 // pressure and the inevitable stack spilling.
1915 if (!BreakLargePHIs || getCGPassBuilderOption().EnableGlobalISelOption)
1916 return false;
1917
1918 FixedVectorType *FVT = dyn_cast<FixedVectorType>(I.getType());
1919 if (!FVT || FVT->getNumElements() == 1 ||
1920 DL.getTypeSizeInBits(FVT) <= BreakLargePHIsThreshold)
1921 return false;
1922
1923 if (!ForceBreakLargePHIs && !canBreakPHINode(I))
1924 return false;
1925
1926 std::vector<VectorSlice> Slices;
1927
1928 Type *EltTy = FVT->getElementType();
1929 {
1930 unsigned Idx = 0;
1931 // For 8/16 bits type, don't scalarize fully but break it up into as many
1932 // 32-bit slices as we can, and scalarize the tail.
1933 const unsigned EltSize = DL.getTypeSizeInBits(EltTy);
1934 const unsigned NumElts = FVT->getNumElements();
1935 if (EltSize == 8 || EltSize == 16) {
1936 const unsigned SubVecSize = (32 / EltSize);
1937 Type *SubVecTy = FixedVectorType::get(EltTy, SubVecSize);
1938 for (unsigned End = alignDown(NumElts, SubVecSize); Idx < End;
1939 Idx += SubVecSize)
1940 Slices.emplace_back(SubVecTy, Idx, SubVecSize);
1941 }
1942
1943 // Scalarize all remaining elements.
1944 for (; Idx < NumElts; ++Idx)
1945 Slices.emplace_back(EltTy, Idx, 1);
1946 }
1947
1948 assert(Slices.size() > 1);
1949
1950 // Create one PHI per vector piece. The "VectorSlice" class takes care of
1951 // creating the necessary instruction to extract the relevant slices of each
1952 // incoming value.
1953 IRBuilder<> B(I.getParent());
1954 B.SetCurrentDebugLocation(I.getDebugLoc());
1955
1956 unsigned IncNameSuffix = 0;
1957 for (VectorSlice &S : Slices) {
1958 // We need to reset the build on each iteration, because getSlicedVal may
1959 // have inserted something into I's BB.
1960 B.SetInsertPoint(I.getParent()->getFirstNonPHIIt());
1961 S.NewPHI = B.CreatePHI(S.Ty, I.getNumIncomingValues());
1962
1963 for (const auto &[Idx, BB] : enumerate(I.blocks())) {
1964 S.NewPHI->addIncoming(S.getSlicedVal(BB, I.getIncomingValue(Idx),
1965 "largephi.extractslice" +
1966 std::to_string(IncNameSuffix++)),
1967 BB);
1968 }
1969 }
1970
1971 // And replace this PHI with a vector of all the previous PHI values.
1972 Value *Vec = PoisonValue::get(FVT);
1973 unsigned NameSuffix = 0;
1974 for (VectorSlice &S : Slices) {
1975 const auto ValName = "largephi.insertslice" + std::to_string(NameSuffix++);
1976 if (S.NumElts > 1)
1977 Vec = B.CreateInsertVector(FVT, Vec, S.NewPHI, S.Idx, ValName);
1978 else
1979 Vec = B.CreateInsertElement(Vec, S.NewPHI, S.Idx, ValName);
1980 }
1981
1982 I.replaceAllUsesWith(Vec);
1983 DeadVals.push_back(&I);
1984 return true;
1985}
1986
1987/// \param V Value to check
1988/// \param DL DataLayout
1989/// \param TM TargetMachine (TODO: remove once DL contains nullptr values)
1990/// \param AS Target Address Space
1991/// \return true if \p V cannot be the null value of \p AS, false otherwise.
1992static bool isPtrKnownNeverNull(const Value *V, const DataLayout &DL,
1993 const AMDGPUTargetMachine &TM, unsigned AS) {
1994 // Pointer cannot be null if it's a block address, GV or alloca.
1995 // NOTE: We don't support extern_weak, but if we did, we'd need to check for
1996 // it as the symbol could be null in such cases.
1998 return true;
1999
2000 // Check nonnull arguments.
2001 if (const auto *Arg = dyn_cast<Argument>(V); Arg && Arg->hasNonNullAttr())
2002 return true;
2003
2004 // Check nonnull loads.
2005 if (const auto *Load = dyn_cast<LoadInst>(V);
2006 Load && Load->hasMetadata(LLVMContext::MD_nonnull))
2007 return true;
2008
2009 // getUnderlyingObject may have looked through another addrspacecast, although
2010 // the optimizable situations most likely folded out by now.
2011 if (AS != cast<PointerType>(V->getType())->getAddressSpace())
2012 return false;
2013
2014 // TODO: Calls that return nonnull?
2015
2016 // For all other things, use KnownBits.
2017 // We either use 0 or all bits set to indicate null, so check whether the
2018 // value can be zero or all ones.
2019 //
2020 // TODO: Use ValueTracking's isKnownNeverNull if it becomes aware that some
2021 // address spaces have non-zero null values.
2022 auto SrcPtrKB = computeKnownBits(V, DL);
2023 const auto NullVal = AMDGPU::getNullPointerValue(AS);
2024
2025 assert(SrcPtrKB.getBitWidth() == DL.getPointerSizeInBits(AS));
2026 assert((NullVal == 0 || NullVal == -1) &&
2027 "don't know how to check for this null value!");
2028 return NullVal ? !SrcPtrKB.getMaxValue().isAllOnes() : SrcPtrKB.isNonZero();
2029}
2030
2031bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
2032 // Intrinsic doesn't support vectors, also it seems that it's often difficult
2033 // to prove that a vector cannot have any nulls in it so it's unclear if it's
2034 // worth supporting.
2035 if (I.getType()->isVectorTy())
2036 return false;
2037
2038 // Check if this can be lowered to a amdgcn.addrspacecast.nonnull.
2039 // This is only worthwhile for casts from/to priv/local to flat.
2040 const unsigned SrcAS = I.getSrcAddressSpace();
2041 const unsigned DstAS = I.getDestAddressSpace();
2042
2043 bool CanLower = false;
2044 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
2045 CanLower = (DstAS == AMDGPUAS::LOCAL_ADDRESS ||
2046 DstAS == AMDGPUAS::PRIVATE_ADDRESS);
2047 else if (DstAS == AMDGPUAS::FLAT_ADDRESS)
2048 CanLower = (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2049 SrcAS == AMDGPUAS::PRIVATE_ADDRESS);
2050 if (!CanLower)
2051 return false;
2052
2054 getUnderlyingObjects(I.getOperand(0), WorkList);
2055 if (!all_of(WorkList, [&](const Value *V) {
2056 return isPtrKnownNeverNull(V, DL, TM, SrcAS);
2057 }))
2058 return false;
2059
2060 IRBuilder<> B(&I);
2061 auto *Intrin = B.CreateIntrinsic(
2062 I.getType(), Intrinsic::amdgcn_addrspacecast_nonnull, {I.getOperand(0)});
2063 I.replaceAllUsesWith(Intrin);
2064 DeadVals.push_back(&I);
2065 return true;
2066}
2067
2068bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
2069 Intrinsic::ID IID = I.getIntrinsicID();
2070 switch (IID) {
2071 case Intrinsic::minnum:
2072 case Intrinsic::minimumnum:
2073 case Intrinsic::minimum:
2074 return visitFMinLike(I);
2075 case Intrinsic::sqrt:
2076 return visitSqrt(I);
2077 case Intrinsic::log:
2078 case Intrinsic::log10:
2079 return visitLog(cast<FPMathOperator>(I), IID);
2080 case Intrinsic::log2:
2081 // No reason to handle log2.
2082 return false;
2083 case Intrinsic::amdgcn_mbcnt_lo:
2084 return visitMbcntLo(I);
2085 case Intrinsic::amdgcn_mbcnt_hi:
2086 return visitMbcntHi(I);
2087 case Intrinsic::vector_reduce_add:
2088 return visitVectorReduceAdd(I);
2089 case Intrinsic::uadd_sat:
2090 case Intrinsic::sadd_sat:
2091 return visitSaturatingAdd(I);
2092 default:
2093 return false;
2094 }
2095}
2096
2097/// Match the core sequence in the fract pattern (x - floor(x), which doesn't
2098/// need to consider edge case handling.
2099Value *AMDGPUCodeGenPrepareImpl::matchFractPatImpl(Value &FractSrc,
2100 const APFloat &C) const {
2101 if (ST.hasFractBug())
2102 return nullptr;
2103
2104 Type *Ty = FractSrc.getType();
2105 if (!isLegalFloatingTy(Ty->getScalarType()))
2106 return nullptr;
2107
2108 APFloat OneNextDown = APFloat::getOne(C.getSemantics());
2109 OneNextDown.next(true);
2110
2111 // Match nextafter(1.0, -1)
2112 if (OneNextDown != C)
2113 return nullptr;
2114
2115 Value *FloorSrc;
2116 if (match(&FractSrc, m_FSub(m_Value(FloorSrc), m_Intrinsic<Intrinsic::floor>(
2117 m_Deferred(FloorSrc)))))
2118 return FloorSrc;
2119 return nullptr;
2120}
2121
2122/// Match the fract pattern for callers that already know the input is not a NaN.
2123// MIN_CONSTANT = nextafter(1.0, -1.0)
2124/// minnum(fsub(x, floor(x)), MIN_CONSTANT)
2125/// minimumnum(fsub(x, floor(x)), MIN_CONSTANT)
2126/// minimum(fsub(x, floor(x)), MIN_CONSTANT)
2127
2128// x_sub_floor >= MIN_CONSTANT ? MIN_CONSTANT : x_sub_floor;
2129///
2130/// If fract is a useful instruction for the subtarget. Does not account for the
2131/// nan handling; the instruction has a nan check on the input value.
///
/// Returns the result of matchFractPatImpl on the matched operand and
/// constant, or nullptr if \p V does not match.
2132Value *AMDGPUCodeGenPrepareImpl::matchFractPatNanAvoidant(Value &V) {
2133 Value *Arg0;
2134 const APFloat *C;
2135
2136 // The value is only used in contexts where we know the input isn't a nan, so
2137 // any of the fmin variants are fine.
// NOTE(review): the remainder of this match() expression (three lines of the
// original listing) is missing from this copy. Presumably it binds Arg0 and C
// from one of the fmin variants listed above -- restore from upstream.
2138 if (!match(&V,
2142 return nullptr;
2143
2144 return matchFractPatImpl(*Arg0, *C);
2145}
2146
2147Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder,
2148 Value *FractArg) {
2149 SmallVector<Value *, 4> FractVals;
2150 extractValues(Builder, FractVals, FractArg);
2151
2152 SmallVector<Value *, 4> ResultVals(FractVals.size());
2153
2154 Type *Ty = FractArg->getType()->getScalarType();
2155 for (unsigned I = 0, E = FractVals.size(); I != E; ++I) {
2156 ResultVals[I] =
2157 Builder.CreateIntrinsic(Intrinsic::amdgcn_fract, {Ty}, {FractVals[I]});
2158 }
2159
2160 return insertValues(Builder, FractArg->getType(), ResultVals);
2161}
2162
// Try to rewrite an fmin-like intrinsic call that implements the fract
// pattern into llvm.amdgcn.fract (via applyFractPat). Returns true if I was
// replaced; I is then queued in DeadVals.
2163bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &I) {
2164 const APFloat *C;
2165 Value *FractArg;
2166
2167 // minimum(x - floor(x), MIN_CONSTANT)
2168 Value *X;
// NOTE(review): the second operand of this condition (one line of the
// original listing) is missing from this copy. Presumably it is a match() on
// I that binds X and C -- restore from upstream.
2169 if (!ST.hasFractBug() &&
2171 FractArg = matchFractPatImpl(*X, *C);
2172 if (!FractArg)
2173 return false;
2174 } else {
2175 // minnum(x - floor(x), MIN_CONSTANT)
2176 FractArg = matchFractPatNanAvoidant(I);
2177 if (!FractArg)
2178 return false;
2179
2180 // Match pattern for fract intrinsic in contexts where the nan check has
2181 // been optimized out (and hope the knowledge the source can't be nan wasn't
2182 // lost).
2183 if (!I.hasNoNaNs() && !isKnownNeverNaN(FractArg, SQ.getWithInstruction(&I)))
2184 return false;
2185 }
2186
// Emit the replacement with I's fast-math flags plus nnan.
2187 IRBuilder<> Builder(&I);
2188 FastMathFlags FMF = I.getFastMathFlags();
2189 FMF.setNoNaNs();
2190 Builder.setFastMathFlags(FMF);
2191
2192 Value *Fract = applyFractPat(Builder, FractArg);
2193 Fract->takeName(&I);
2194 I.replaceAllUsesWith(Fract);
2195 DeadVals.push_back(&I);
2196 return true;
2197}
2198
2199// Expand llvm.sqrt.f32 calls with !fpmath metadata in a semi-fast way.
2200bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
2201 Type *Ty = Sqrt.getType()->getScalarType();
2202 if (!Ty->isFloatTy() && (!Ty->isHalfTy() || ST.has16BitInsts()))
2203 return false;
2204
2205 const FPMathOperator *FPOp = cast<const FPMathOperator>(&Sqrt);
2206 FastMathFlags SqrtFMF = FPOp->getFastMathFlags();
2207
2208 // We're trying to handle the fast-but-not-that-fast case only. The lowering
2209 // of fast llvm.sqrt will give the raw instruction anyway.
2210 if (SqrtFMF.approxFunc())
2211 return false;
2212
2213 const float ReqdAccuracy = FPOp->getFPAccuracy();
2214
2215 // Defer correctly rounded expansion to codegen.
2216 if (ReqdAccuracy < 1.0f)
2217 return false;
2218
2219 Value *SrcVal = Sqrt.getOperand(0);
2220 bool CanTreatAsDAZ = canIgnoreDenormalInput(SrcVal, &Sqrt);
2221
2222 // The raw instruction is 1 ulp, but the correction for denormal handling
2223 // brings it to 2.
2224 if (!CanTreatAsDAZ && ReqdAccuracy < 2.0f)
2225 return false;
2226
2227 IRBuilder<> Builder(&Sqrt);
2228 SmallVector<Value *, 4> SrcVals;
2229 extractValues(Builder, SrcVals, SrcVal);
2230
2231 SmallVector<Value *, 4> ResultVals(SrcVals.size());
2232 for (int I = 0, E = SrcVals.size(); I != E; ++I) {
2233 if (CanTreatAsDAZ)
2234 ResultVals[I] = Builder.CreateCall(getSqrtF32(), SrcVals[I]);
2235 else
2236 ResultVals[I] = emitSqrtIEEE2ULP(Builder, SrcVals[I], SqrtFMF);
2237 }
2238
2239 Value *NewSqrt = insertValues(Builder, Sqrt.getType(), ResultVals);
2240 NewSqrt->takeName(&Sqrt);
2241 Sqrt.replaceAllUsesWith(NewSqrt);
2242 DeadVals.push_back(&Sqrt);
2243 return true;
2244}
2245
2246/// Replace log and log10 intrinsic calls based on fpmath metadata.
2247bool AMDGPUCodeGenPrepareImpl::visitLog(FPMathOperator &Log,
2248 Intrinsic::ID IID) {
2249 Type *Ty = Log.getType();
2250 if (!Ty->getScalarType()->isHalfTy() || !ST.has16BitInsts())
2251 return false;
2252
2253 FastMathFlags FMF = Log.getFastMathFlags();
2254
2255 // Defer fast math cases to codegen.
2256 if (FMF.approxFunc())
2257 return false;
2258
2259 // Limit experimentally determined from OpenCL conformance test (1.79)
2260 if (Log.getFPAccuracy() < 1.80f)
2261 return false;
2262
2263 IRBuilder<> Builder(&cast<CallInst>(Log));
2264
2265 // Use the generic intrinsic for convenience in the vector case. Codegen will
2266 // recognize the denormal handling is not necessary from the fpext.
2267 // TODO: Move to generic code
2268 Value *Log2 =
2269 Builder.CreateUnaryIntrinsic(Intrinsic::log2, Log.getOperand(0), FMF);
2270
2271 double Log2BaseInverted =
2272 IID == Intrinsic::log10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
2273 Value *Mul =
2274 Builder.CreateFMulFMF(Log2, ConstantFP::get(Ty, Log2BaseInverted), FMF);
2275
2276 Mul->takeName(&Log);
2277
2278 Log.replaceAllUsesWith(Mul);
2279 DeadVals.push_back(&Log);
2280 return true;
2281}
2282
2283bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
2284 if (skipFunction(F))
2285 return false;
2286
2287 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
2288 if (!TPC)
2289 return false;
2290
2291 const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
2292 const TargetLibraryInfo *TLI =
2293 &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
2294 AssumptionCache *AC =
2295 &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2296 auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
2297 const DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
2298 const UniformityInfo &UA =
2299 getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
2300 return AMDGPUCodeGenPrepareImpl(F, TM, TLI, AC, DT, UA).run();
2301}
2302
// NOTE(review): the signature of this function (two lines of the original
// listing) is missing from this copy. Presumably this is the new pass manager
// run(Function &F, FunctionAnalysisManager &FAM) entry point -- restore from
// upstream.
//
// Gathers the analyses from FAM, runs AMDGPUCodeGenPrepareImpl on F, and
// reports which analyses are preserved.
2305 const AMDGPUTargetMachine &ATM = static_cast<const AMDGPUTargetMachine &>(TM);
2306 const TargetLibraryInfo *TLI = &FAM.getResult<TargetLibraryAnalysis>(F);
2307 AssumptionCache *AC = &FAM.getResult<AssumptionAnalysis>(F);
// The dominator tree is only used if already cached; DT may be null.
2308 const DominatorTree *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
2309 const UniformityInfo &UA = FAM.getResult<UniformityInfoAnalysis>(F);
2310 AMDGPUCodeGenPrepareImpl Impl(F, ATM, TLI, AC, DT, UA);
// Nothing changed: every analysis stays valid.
2311 if (!Impl.run())
2312 return PreservedAnalyses::all();
// NOTE(review): the declaration of PA and the statement guarded by the
// FlowChanged check below are missing from this copy. Presumably the guarded
// statement marks the CFG analyses as preserved -- confirm against upstream.
2314 if (!Impl.FlowChanged)
2316 return PA;
2317}
2318
// Legacy pass manager registration for AMDGPUCodeGenPrepare.
// NOTE(review): the lines between INITIALIZE_PASS_BEGIN and
// INITIALIZE_PASS_END, and the tail of the INITIALIZE_PASS_END invocation, are
// missing from this copy. Presumably they declare the pass dependencies --
// restore from upstream.
2319INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
2320 "AMDGPU IR optimizations", false, false)
2324INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
2326
2327/// Create a workitem.id.x intrinsic call with range metadata.
2328CallInst *AMDGPUCodeGenPrepareImpl::createWorkitemIdX(IRBuilder<> &B) const {
2329 CallInst *Tid = B.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_x, {});
2330 ST.makeLIDRangeMetadata(Tid);
2331 return Tid;
2332}
2333
2334/// Replace the instruction with a direct workitem.id.x call.
2335void AMDGPUCodeGenPrepareImpl::replaceWithWorkitemIdX(Instruction &I) const {
2336 IRBuilder<> B(&I);
2337 CallInst *Tid = createWorkitemIdX(B);
2339 ReplaceInstWithValue(BI, Tid);
2340}
2341
2342/// Replace the instruction with (workitem.id.x & mask).
2343void AMDGPUCodeGenPrepareImpl::replaceWithMaskedWorkitemIdX(
2344 Instruction &I, unsigned WaveSize) const {
2345 IRBuilder<> B(&I);
2346 CallInst *Tid = createWorkitemIdX(B);
2347 Constant *Mask = ConstantInt::get(Tid->getType(), WaveSize - 1);
2348 Value *AndInst = B.CreateAnd(Tid, Mask);
2350 ReplaceInstWithValue(BI, AndInst);
2351}
2352
2353/// Try to optimize mbcnt instruction by replacing with workitem.id.x when
2354/// work group size allows direct computation of lane ID.
2355/// Returns true if optimization was applied, false otherwise.
2356bool AMDGPUCodeGenPrepareImpl::tryReplaceWithWorkitemId(Instruction &I,
2357 unsigned Wave) const {
2358 std::optional<unsigned> MaybeX = ST.getReqdWorkGroupSize(F, 0);
2359 if (!MaybeX)
2360 return false;
2361
2362 // When work group size == wave_size, each work group contains exactly one
2363 // wave, so the instruction can be replaced with workitem.id.x directly.
2364 if (*MaybeX == Wave) {
2365 replaceWithWorkitemIdX(I);
2366 return true;
2367 }
2368
2369 // When work group evenly splits into waves, compute lane ID within wave
2370 // using bit masking: lane_id = workitem.id.x & (wave_size - 1).
2371 if (ST.hasWavefrontsEvenlySplittingXDim(F, /*RequiresUniformYZ=*/true)) {
2372 replaceWithMaskedWorkitemIdX(I, Wave);
2373 return true;
2374 }
2375
2376 return false;
2377}
2378
2379/// Optimize mbcnt.lo calls on wave32 architectures for lane ID computation.
2380bool AMDGPUCodeGenPrepareImpl::visitMbcntLo(IntrinsicInst &I) const {
2381 // This optimization only applies to wave32 targets where mbcnt.lo operates on
2382 // the full execution mask.
2383 if (!ST.isWave32())
2384 return false;
2385
2386 // Only optimize the pattern mbcnt.lo(~0, 0) which counts active lanes with
2387 // lower IDs.
2388 if (!match(&I,
2390 return false;
2391
2392 return tryReplaceWithWorkitemId(I, ST.getWavefrontSize());
2393}
2394
2395/// Optimize mbcnt.hi calls for lane ID computation.
2396bool AMDGPUCodeGenPrepareImpl::visitMbcntHi(IntrinsicInst &I) const {
2397 // Abort if wave size is not known at compile time.
2398 if (!ST.isWaveSizeKnown())
2399 return false;
2400
2401 unsigned Wave = ST.getWavefrontSize();
2402
2403 // On wave32, the upper 32 bits of execution mask are always 0, so
2404 // mbcnt.hi(mask, val) always returns val unchanged.
2405 if (ST.isWave32()) {
2406 if (auto MaybeX = ST.getReqdWorkGroupSize(F, 0)) {
2407 // Replace mbcnt.hi(mask, val) with val only when work group size matches
2408 // wave size (single wave per work group).
2409 if (*MaybeX == Wave) {
2411 ReplaceInstWithValue(BI, I.getArgOperand(1));
2412 return true;
2413 }
2414 }
2415 }
2416
2417 // Optimize the complete lane ID computation pattern:
2418 // mbcnt.hi(~0, mbcnt.lo(~0, 0)) which counts all active lanes with lower IDs
2419 // across the full execution mask.
2420 using namespace PatternMatch;
2421
2422 // Check for pattern: mbcnt.hi(~0, mbcnt.lo(~0, 0))
2425 m_AllOnes(), m_Zero()))))
2426 return false;
2427
2428 return tryReplaceWithWorkitemId(I, Wave);
2429}
2430
2431/// Check if type is <4 x i8>.
2432static bool isV4I8(Type *Ty) {
2434 return VTy && VTy->getNumElements() == 4 &&
2435 VTy->getElementType()->isIntegerTy(8);
2436}
2437
2438/// Helper to match the dot4 pattern: mul(zext/sext <4 x i8>, zext/sext <4 x
2439/// i8>) Returns true if pattern matches and signedness matches IsSigned.
2440/// Sets A, B to the <4 x i8> sources.
2441static bool matchDot4Pattern(Value *MulOp, Value *&A, Value *&B,
2442 bool IsSigned) {
2443 Value *Src0, *Src1;
2444 if (!match(MulOp, m_Mul(m_Value(Src0), m_Value(Src1))))
2445 return false;
2446
2447 // Check that result type is <4 x i32>
2449 if (!MulTy || MulTy->getNumElements() != 4 ||
2450 !MulTy->getElementType()->isIntegerTy(32))
2451 return false;
2452
2453 // Match zext or sext based on IsSigned
2454 Value *ExtSrc0, *ExtSrc1;
2455 if (IsSigned) {
2456 if (!match(Src0, m_SExt(m_Value(ExtSrc0))) || !isV4I8(ExtSrc0->getType()))
2457 return false;
2458 if (!match(Src1, m_SExt(m_Value(ExtSrc1))) || !isV4I8(ExtSrc1->getType()))
2459 return false;
2460 } else {
2461 if (!match(Src0, m_ZExt(m_Value(ExtSrc0))) || !isV4I8(ExtSrc0->getType()))
2462 return false;
2463 if (!match(Src1, m_ZExt(m_Value(ExtSrc1))) || !isV4I8(ExtSrc1->getType()))
2464 return false;
2465 }
2466
2467 A = ExtSrc0;
2468 B = ExtSrc1;
2469 return true;
2470}
2471
2472/// Try to convert vector.reduce.add(mul(zext/sext <4 x i8>, zext/sext <4 x
2473/// i8>)) to a dot4 intrinsic call (non-saturating case only).
2474bool AMDGPUCodeGenPrepareImpl::visitVectorReduceAdd(IntrinsicInst &I) {
2475 // Check if we have dot4 instructions available
2476 if (!ST.hasDot7Insts() || (!ST.hasDot1Insts() && !ST.hasDot8Insts()))
2477 return false;
2478
2479 Value *A = nullptr, *B = nullptr;
2480
2481 // Try unsigned first, then signed
2482 bool IsSigned = false;
2483 if (!matchDot4Pattern(I.getArgOperand(0), A, B, /*IsSigned=*/false)) {
2484 if (!matchDot4Pattern(I.getArgOperand(0), A, B, /*IsSigned=*/true))
2485 return false;
2486 IsSigned = true;
2487 }
2488
2489 LLVMContext &Ctx = I.getContext();
2490 Type *I32Ty = Type::getInt32Ty(Ctx);
2491 IRBuilder<> Builder(&I);
2492
2493 // Bitcast <4 x i8> to i32
2494 Value *ASrc = Builder.CreateBitCast(A, I32Ty);
2495 Value *BSrc = Builder.CreateBitCast(B, I32Ty);
2496
2497 // Non-saturating case: accumulator is 0, clamp is false
2498 Value *Acc = ConstantInt::get(I32Ty, 0);
2499 Value *Clamp = ConstantInt::getFalse(Ctx);
2500
2501 Intrinsic::ID DotIID =
2502 IsSigned ? Intrinsic::amdgcn_sdot4 : Intrinsic::amdgcn_udot4;
2503
2504 Value *Dot = Builder.CreateIntrinsic(DotIID, {}, {ASrc, BSrc, Acc, Clamp});
2505 Dot->takeName(&I);
2506
2507 I.replaceAllUsesWith(Dot);
2508 DeadVals.push_back(&I);
2509
2510 return true;
2511}
2512
2513/// Try to convert uadd.sat/sadd.sat(vector.reduce.add(mul(...)), c) to a
2514/// saturating dot4 intrinsic. This combine starts at the root (saturating add)
2515/// and looks at its operands.
2516bool AMDGPUCodeGenPrepareImpl::visitSaturatingAdd(IntrinsicInst &I) {
2517 // Check if we have dot4 instructions available
2518 if (!ST.hasDot7Insts() || (!ST.hasDot1Insts() && !ST.hasDot8Insts()))
2519 return false;
2520
2521 Intrinsic::ID IID = I.getIntrinsicID();
2522 bool IsSigned = (IID == Intrinsic::sadd_sat);
2523
2524 // Look for vector.reduce.add as one of the operands (commutative match)
2525 Value *Op0 = I.getArgOperand(0);
2526 Value *Op1 = I.getArgOperand(1);
2527 Value *MulOp = nullptr;
2528 Value *Accum = nullptr;
2529 IntrinsicInst *ReduceInst = nullptr;
2530
2532 ReduceInst = cast<IntrinsicInst>(Op0);
2533 Accum = Op1;
2534 } else if (match(Op1,
2536 ReduceInst = cast<IntrinsicInst>(Op1);
2537 Accum = Op0;
2538 } else {
2539 return false;
2540 }
2541
2542 Value *A = nullptr, *B = nullptr;
2543
2544 if (!matchDot4Pattern(MulOp, A, B, IsSigned))
2545 return false;
2546
2547 LLVMContext &Ctx = I.getContext();
2548 Type *I32Ty = Type::getInt32Ty(Ctx);
2549 IRBuilder<> Builder(&I);
2550
2551 // Bitcast <4 x i8> to i32
2552 Value *ASrc = Builder.CreateBitCast(A, I32Ty);
2553 Value *BSrc = Builder.CreateBitCast(B, I32Ty);
2554
2555 // Saturating case: use the accumulator and set clamp to true
2556 Value *Clamp = ConstantInt::getTrue(Ctx);
2557
2558 Intrinsic::ID DotIID =
2559 IsSigned ? Intrinsic::amdgcn_sdot4 : Intrinsic::amdgcn_udot4;
2560
2561 Value *Dot = Builder.CreateIntrinsic(DotIID, {}, {ASrc, BSrc, Accum, Clamp});
2562 Dot->takeName(&I);
2563
2564 I.replaceAllUsesWith(Dot);
2565 DeadVals.push_back(&I);
2566 // The reduce.add will be dead after this and cleaned up later
2567 if (ReduceInst->use_empty())
2568 DeadVals.push_back(ReduceInst);
2569
2570 return true;
2571}
2572
// Definition of the legacy pass's static ID member.
2573char AMDGPUCodeGenPrepare::ID = 0;
2574
// NOTE(review): the signature line of this function is missing from this
// copy. Presumably it is the factory that creates the legacy pass -- restore
// from upstream.
2576 return new AMDGPUCodeGenPrepare();
2577}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static Value * insertValues(IRBuilder<> &Builder, Type *Ty, SmallVectorImpl< Value * > &Values)
static void extractValues(IRBuilder<> &Builder, SmallVectorImpl< Value * > &Values, Value *V)
static Value * getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS)
static bool isInterestingPHIIncomingValue(const Value *V)
static SelectInst * findSelectThroughCast(Value *V, CastInst *&Cast)
static bool matchDot4Pattern(Value *MulOp, Value *&A, Value *&B, bool IsSigned)
Helper to match the dot4 pattern: mul(zext/sext <4 x i8>, zext/sext <4 x i8>) Returns true if pattern...
static bool isV4I8(Type *Ty)
Check if type is <4 x i8>.
static std::pair< Value *, Value * > getMul64(IRBuilder<> &Builder, Value *LHS, Value *RHS)
static Value * emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src, bool IsNegative)
Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals.
static Value * getSign32(Value *V, IRBuilder<> &Builder, const DataLayout DL)
static void collectPHINodes(const PHINode &I, SmallPtrSet< const PHINode *, 8 > &SeenPHIs)
static bool isPtrKnownNeverNull(const Value *V, const DataLayout &DL, const AMDGPUTargetMachine &TM, unsigned AS)
static bool areInSameBB(const Value *A, const Value *B)
static cl::opt< bool > WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads", cl::desc("Widen sub-dword constant address space loads in " "AMDGPULateCodeGenPrepare"), cl::ReallyHidden, cl::init(true))
The AMDGPU TargetMachine interface definition for hw codegen targets.
@ Scaled
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
#define X(NUM, ENUM, NAME)
Definition ELF.h:853
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
dxil translate DXIL Translate Metadata
static bool runOnFunction(Function &F, bool PostInlining)
#define DEBUG_TYPE
static Value * getOpcode(Value &V, Type &Ty, InstrumentationConfig &IConf, InstrumentorIRBuilderTy &IIRB)
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define T
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
FunctionAnalysisManager FAM
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
const SmallVectorImpl< MachineOperand > & Cond
static void visit(BasicBlock &Start, std::function< bool(BasicBlock *)> op)
This file implements a set that has insertion order iteration characteristics.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static cl::opt< cl::boolOrDefault > EnableGlobalISelOption("global-isel", cl::Hidden, cl::desc("Enable the \"global\" instruction selector"))
Target-Independent Code Generator Pass Configuration Options pass.
This pass exposes codegen information to IR-level passes.
LLVM IR instance of the generic uniformity analysis.
Value * RHS
Value * LHS
BinaryOperator * Mul
VectorSlice(Type *Ty, unsigned Idx, unsigned NumElts)
Value * getSlicedVal(BasicBlock *BB, Value *Inc, StringRef NewValName)
Slice Inc according to the information contained within this slice.
PreservedAnalyses run(Function &, FunctionAnalysisManager &)
std::optional< unsigned > getReqdWorkGroupSize(const Function &F, unsigned Dim) const
bool hasWavefrontsEvenlySplittingXDim(const Function &F, bool REquiresUniformYZ=false) const
unsigned getWavefrontSize() const
static APFloat getOne(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative One.
Definition APFloat.h:1147
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition APFloat.h:1217
opStatus next(bool nextDown)
Definition APFloat.h:1313
This class represents a conversion between pointers from one address space to another.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
void setPreservesAll()
Set by analyses that do not transform their input at all.
A function analysis which provides an AssumptionCache.
An immutable pass that tracks lazily created AssumptionCache objects.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
BinaryOps getOpcode() const
Definition InstrTypes.h:409
BitVector & set()
Set all bits in the bitvector.
Definition BitVector.h:366
bool all() const
Returns true if all bits are set.
Definition BitVector.h:194
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
Definition InstrTypes.h:512
Instruction::CastOps getOpcode() const
Return the opcode of this CastInst.
Definition InstrTypes.h:674
TargetTransformInfo getTargetTransformInfo(const Function &F) const override
Get a TargetTransformInfo implementation for the target.
static ConstantAsMetadata * get(Constant *C)
Definition Metadata.h:537
static LLVM_ABI ConstantFP * getZero(Type *Ty, bool Negative=false)
LLVM_ABI bool isExactlyValue(const APFloat &V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
static LLVM_ABI ConstantFP * getInfinity(Type *Ty, bool Negative=false)
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string, and methods for querying it.
Definition DataLayout.h:64
Analysis pass which computes a DominatorTree.
Definition Dominators.h:274
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:155
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:202
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
Definition Operator.h:291
LLVM_ABI float getFPAccuracy() const
Get the maximum error permitted by this operation in ULPs.
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
void setFast(bool B=true)
Definition FMF.h:96
bool noInfs() const
Definition FMF.h:66
bool allowReciprocal() const
Definition FMF.h:68
bool approxFunc() const
Definition FMF.h:70
void setNoNaNs(bool B=true)
Definition FMF.h:78
bool noNaNs() const
Definition FMF.h:65
bool allowContract() const
Definition FMF.h:69
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:869
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
bool isWave32() const
bool isWaveSizeKnown() const
Returns true if the wave size of this subtarget is reliably known.
bool hasFractBug() const
bool isUniformAtDef(ConstValueRefT V) const
Whether V is uniform/non-divergent at its definition.
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2637
Value * CreateFDiv(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1715
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2625
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition IRBuilder.h:599
Value * CreateZExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a ZExt or Trunc from the integer value V to DestTy.
Definition IRBuilder.h:2148
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition IRBuilder.h:2684
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > OverloadTypes, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="", ArrayRef< OperandBundleDef > OpBundles={})
Create a call to intrinsic ID with Args, mangled using OverloadTypes.
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Value * CreateFPToUI(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2176
Value * CreateSExt(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2142
void SetCurrentDebugLocation(const DebugLoc &L)
Set location information used by debugging information.
Definition IRBuilder.h:247
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition IRBuilder.h:586
Value * CreateUIToFP(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false, MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2190
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition IRBuilder.h:352
Value * CreateFCmpOLT(Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2439
Value * CreateFAbs(Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create call to the fabs intrinsic.
Definition IRBuilder.h:1048
Value * CreateNeg(Value *V, const Twine &Name="", bool HasNSW=false)
Definition IRBuilder.h:1852
LLVM_ABI Value * createIsFPClass(Value *FPNum, unsigned Test)
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition IRBuilder.h:529
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1461
Value * CreateFMA(Value *Factor1, Value *Factor2, Value *Summand, FMFSource FMFSource={}, const Twine &Name="")
Create call to the fma intrinsic.
Definition IRBuilder.h:1115
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2252
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition IRBuilder.h:1928
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1533
FastMathFlags getFastMathFlags() const
Get the flags to be applied to created floating point ops.
Definition IRBuilder.h:341
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition IRBuilder.h:2130
Value * CreateFCmpOEQ(Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2424
Value * CreateAnd(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:1592
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1444
Type * getFloatTy()
Fetch the type representing a 32-bit floating point value.
Definition IRBuilder.h:614
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2563
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition IRBuilder.h:2116
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1753
Value * CreateICmpUGE(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2396
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
Value * CreateAShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition IRBuilder.h:1573
Value * CreateXor(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:1644
Value * CreateSIToFP(Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2202
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1696
Value * CreateFNeg(Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1861
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="", bool IsDisjoint=false)
Definition IRBuilder.h:1614
Value * CreateSExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a SExt or Trunc from the integer value V to DestTy.
Definition IRBuilder.h:2163
Value * CreateFMulFMF(Value *L, Value *R, FMFSource FMFSource, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1701
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1478
LLVM_ABI Value * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *Op, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Value * CreateFCmpOGE(Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2434
Value * CreateFPToSI(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2183
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2868
Base class for instruction visitors.
Definition InstVisitor.h:78
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
A wrapper class for inspecting calls to intrinsic functions.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1567
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
Definition Analysis.h:115
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
This class represents the LLVM 'select' instruction.
const Value * getFalseValue() const
const Value * getCondition() const
const Value * getTrueValue() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
const STC & getSubtarget(const Function &F) const
This method returns a pointer to the specified type of TargetSubtargetInfo.
static LLVM_ABI CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
LLVM_ABI InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:310
LLVM_ABI unsigned getIntegerBitWidth() const
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:309
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:155
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:368
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:144
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:232
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition Type.h:158
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
LLVM_ABI const fltSemantics & getFltSemantics() const
Definition Type.cpp:106
Analysis pass which computes UniformityInfo.
Legacy analysis pass which computes a CycleInfo.
void setOperand(unsigned i, Value *Val)
Definition User.h:212
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:552
bool use_empty() const
Definition Value.h:346
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:399
Type * getElementType() const
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ PRIVATE_ADDRESS
Address space for private memory.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr int64_t getNullPointerValue(unsigned AS)
Get the null pointer value for the given address space.
void copyMetadataForWidenedLoad(LoadInst &Dest, const LoadInst &Source)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:522
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
match_combine_or< Ty... > m_CombineOr(const Ty &...Ps)
Combine pattern matchers matching any of Ps patterns.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
MaxMin_match< FCmpInst, LHS, RHS, ufmin_pred_ty > m_UnordFMin(const LHS &L, const RHS &R)
Match an 'unordered' floating point minimum function.
CmpClass_match< LHS, RHS, FCmpInst > m_FCmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< typename m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty, typename m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty > m_FMinNum_or_FMinimumNum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::FSub > m_FSub(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
match_deferred< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
ap_match< APFloat > m_APFloatAllowPoison(const APFloat *&Res)
Match APFloat while allowing poison in splat vector constants.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinimum(const Opnd0 &Op0, const Opnd1 &Op1)
auto m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
cstfp_pred_ty< is_nonnan > m_NonNaN()
Match a non-NaN FP constant.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
cstfp_pred_ty< is_signed_inf< false > > m_PosInf()
Match a positive infinity FP constant.
cstfp_pred_ty< is_pos_zero_fp > m_PosZeroFP()
Match a floating-point positive zero.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
m_Intrinsic_Ty< Opnd0 >::Ty m_FAbs(const Opnd0 &Op0)
initializer< Ty > init(const Ty &Val)
std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > extract(Y &&MD)
Extract a Value from Metadata.
Definition Metadata.h:668
constexpr double ln2
constexpr double ln10
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< SSAContext > UniformityInfo
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
LLVM_ABI KnownFPClass computeKnownFPClass(const Value *V, const APInt &DemandedElts, FPClassTest InterestedClasses, const SimplifyQuery &SQ, unsigned Depth=0)
Determine which floating-point classes are valid for V, and return them in KnownFPClass bit sets.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition Local.cpp:535
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI bool expandRemainderUpTo64Bits(BinaryOperator *Rem)
Generate code to calculate the remainder of two integers, replacing Rem with the generated code.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and is congruent to Skew mod Align.
Definition MathExtras.h:546
LLVM_ABI void ReplaceInstWithValue(BasicBlock::iterator &BI, Value *V)
Replace all uses of an instruction (specified by BI) with a value, then remove and delete the origina...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition bit.h:362
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
Definition Local.cpp:403
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
LLVM_ABI bool expandDivisionUpTo64Bits(BinaryOperator *Div)
Generate code to divide two integers, replacing Div with the generated code.
FPClassTest
Floating-point class tests, supported by 'is_fpclass' intrinsic.
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
LLVM_ABI Constant * ConstantFoldCastOperand(unsigned Opcode, Constant *C, Type *DestTy, const DataLayout &DL)
Attempt to constant fold a cast with the specified operand.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI Constant * ConstantFoldBinaryOpOperands(unsigned Opcode, Constant *LHS, Constant *RHS, const DataLayout &DL)
Attempt to constant fold a binary operation with the specified operands.
TargetTransformInfo TTI
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
FunctionPass * createAMDGPUCodeGenPreparePass()
To bit_cast(const From &from) noexcept
Definition bit.h:90
DWARFExpression::Operation Op
LLVM_ABI unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return the number of times the sign bit of the register is replicated into the other bits.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI bool isKnownNeverNaN(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if the floating-point scalar value is not a NaN or if the floating-point vector value has...
LLVM_ABI unsigned ComputeMaxSignificantBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Get the upper bound on bit size for this Value Op as a signed integer.
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition Alignment.h:197
LLVM_ABI bool isKnownToBeAPowerOfTwo(const Value *V, const DataLayout &DL, bool OrZero=false, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return true if the given value is known to have exactly one bit set when defined.
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI void getUnderlyingObjects(const Value *V, SmallVectorImpl< const Value * > &Objects, const LoopInfo *LI=nullptr, unsigned MaxLookup=MaxLookupSearchDepth)
This method is similar to getUnderlyingObject except that it can look through phi and select instruct...
LLVM_ABI CGPassBuilderOption getCGPassBuilderOption()
#define N
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
constexpr bool inputsAreZero() const
Return true if input denormals must be implicitly treated as 0.
static constexpr DenormalMode getPreserveSign()
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition KnownBits.h:106
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition KnownBits.h:310
bool isNegative() const
Returns true if this value is known to be negative.
Definition KnownBits.h:103
bool isKnownNeverSubnormal() const
Return true if it's known this can never be a subnormal.
LLVM_ABI bool isKnownNeverLogicalZero(DenormalMode Mode) const
Return true if it's known this can never be interpreted as a zero.
bool isKnownNeverPosInfinity() const
Return true if it's known this can never be +infinity.
const DataLayout & DL
const DominatorTree * DT
SimplifyQuery getWithInstruction(const Instruction *I) const
AssumptionCache * AC