doxygen/AMDGPUCodeGenPrepare_8cpp_source.html

//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//

//

// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

// See https://llvm.org/LICENSE.txt for license information.

// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

//

//===----------------------------------------------------------------------===//

//

/// \file

/// This pass does misc. AMDGPU optimizations on IR before instruction

/// selection.

//

//===----------------------------------------------------------------------===//


#include "AMDGPU.h"

#include "AMDGPUMemoryUtils.h"

#include "AMDGPUTargetMachine.h"

#include "SIModeRegisterDefaults.h"

#include "llvm/ADT/SetVector.h"

#include "llvm/Analysis/AssumptionCache.h"

#include "llvm/Analysis/ConstantFolding.h"

#include "llvm/Analysis/TargetLibraryInfo.h"

#include "llvm/Analysis/TargetTransformInfo.h"

#include "llvm/Analysis/UniformityAnalysis.h"

#include "llvm/Analysis/ValueTracking.h"

#include "llvm/CodeGen/TargetPassConfig.h"

#include "llvm/IR/Dominators.h"

#include "llvm/IR/IRBuilder.h"

#include "llvm/IR/InstVisitor.h"

#include "llvm/IR/IntrinsicsAMDGPU.h"

#include "llvm/IR/PatternMatch.h"

#include "llvm/IR/ValueHandle.h"

#include "llvm/InitializePasses.h"

#include "llvm/Pass.h"

#include "llvm/Support/KnownBits.h"

#include "llvm/Support/KnownFPClass.h"

#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#include "llvm/Transforms/Utils/IntegerDivision.h"

#include "llvm/Transforms/Utils/Local.h"


#define DEBUG_TYPE "amdgpu-codegenprepare"


using namespace llvm;

using namespace llvm::PatternMatch;


namespace {


// Command-line switches controlling the individual transforms of this pass.
// All of them are hidden (cl::ReallyHidden).

// Enables widening of sub-dword loads from the constant address space.
// Disabled by default.
static cl::opt<bool> WidenLoads(
  "amdgpu-codegenprepare-widen-constant-loads",
  cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(false));

// Enables breaking of large PHI nodes for DAGISel. Enabled by default.
static cl::opt<bool>
    BreakLargePHIs("amdgpu-codegenprepare-break-large-phis",
                   cl::desc("Break large PHI nodes for DAGISel"),
                   cl::ReallyHidden, cl::init(true));

// Testing aid: break large PHIs even when it is not considered profitable.
// Disabled by default.
static cl::opt<bool>
    ForceBreakLargePHIs("amdgpu-codegenprepare-force-break-large-phis",
                        cl::desc("For testing purposes, always break large "
                                 "PHIs even if it isn't profitable."),
                        cl::ReallyHidden, cl::init(false));

// Minimum type size, in bits, for a PHI to be considered "large". Defaults to
// 32.
static cl::opt<unsigned> BreakLargePHIsThreshold(
    "amdgpu-codegenprepare-break-large-phis-threshold",
    cl::desc("Minimum type size in bits for breaking large PHI nodes"),
    cl::ReallyHidden, cl::init(32));

// Enables the introduction of the 24-bit multiply intrinsics. Enabled by
// default.
static cl::opt<bool> UseMul24Intrin(
  "amdgpu-codegenprepare-mul24",
  cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(true));

// Legalize 64-bit division by using the generic IR expansion.
// Disabled by default.
static cl::opt<bool> ExpandDiv64InIR(
  "amdgpu-codegenprepare-expand-div64",
  cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(false));

// Leave all division operations as they are. This supersedes ExpandDiv64InIR
// and is used for testing the legalizer.
static cl::opt<bool> DisableIDivExpand(
  "amdgpu-codegenprepare-disable-idiv-expansion",
  cl::desc("Prevent expanding integer division in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(false));

// Disable processing of fdiv so we can better test the backend implementations.
static cl::opt<bool> DisableFDivExpand(
  "amdgpu-codegenprepare-disable-fdiv-expansion",
  cl::desc("Prevent expanding floating point division in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(false));


/// Implementation of the AMDGPU IR-level codegen preparation transforms for a
/// single function.
///
/// The class is an InstVisitor whose visit methods return a bool; run() ORs
/// those results together to report whether anything changed.
class AMDGPUCodeGenPrepareImpl
    : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
public:
  /// The function being processed.
  Function &F;
  /// Subtarget of \c F, obtained from the target machine.
  const GCNSubtarget &ST;
  const AMDGPUTargetMachine &TM;
  const TargetLibraryInfo *TLI;
  const UniformityInfo &UA;
  /// Data layout of \c F.
  const DataLayout &DL;
  /// Query bundle (data layout, TLI, dominator tree, assumption cache) used
  /// for the value-tracking calls.
  SimplifyQuery SQ;
  /// True when the function's FP32 denormal mode is preserve-sign, i.e.
  /// f32 denormals are flushed.
  const bool HasFP32DenormalFlush;
  bool FlowChanged = false;
  /// Lazily created declaration of llvm.amdgcn.sqrt.f32; see getSqrtF32().
  mutable Function *SqrtF32 = nullptr;
  /// Lazily created declaration of llvm.ldexp.f32.i32; see getLdexpF32().
  mutable Function *LdexpF32 = nullptr;
  /// Values queued by the transforms for deletion; run() erases the ones that
  /// are still alive and trivially dead after all instructions were visited.
  /// Mutable so that const transforms can queue values.
  mutable SmallVector<WeakVH> DeadVals;

  /// Per-PHI cached decisions; cleared at the start of run().
  /// NOTE(review): presumably the memoized result of canBreakPHINode — confirm
  /// against its definition.
  DenseMap<const PHINode *, bool> BreakPhiNodesCache;

  AMDGPUCodeGenPrepareImpl(Function &F, const AMDGPUTargetMachine &TM,
                           const TargetLibraryInfo *TLI, AssumptionCache *AC,
                           const DominatorTree *DT, const UniformityInfo &UA)
      : F(F), ST(TM.getSubtarget<GCNSubtarget>(F)), TM(TM), TLI(TLI), UA(UA),
        DL(F.getDataLayout()), SQ(DL, TLI, DT, AC),
        HasFP32DenormalFlush(SIModeRegisterDefaults(F, ST).FP32Denormals ==
                             DenormalMode::getPreserveSign()) {}

  /// \returns The declaration of llvm.amdgcn.sqrt for f32 in the module of
  /// \c F, creating it on first use and caching it afterwards.
  Function *getSqrtF32() const {
    if (SqrtF32)
      return SqrtF32;

    LLVMContext &Ctx = F.getContext();
    SqrtF32 = Intrinsic::getOrInsertDeclaration(
        F.getParent(), Intrinsic::amdgcn_sqrt, {Type::getFloatTy(Ctx)});
    return SqrtF32;
  }

  /// \returns The declaration of llvm.ldexp for (f32, i32) in the module of
  /// \c F, creating it on first use and caching it afterwards.
  Function *getLdexpF32() const {
    if (LdexpF32)
      return LdexpF32;

    LLVMContext &Ctx = F.getContext();
    LdexpF32 = Intrinsic::getOrInsertDeclaration(
        F.getParent(), Intrinsic::ldexp,
        {Type::getFloatTy(Ctx), Type::getInt32Ty(Ctx)});
    return LdexpF32;
  }

  bool canBreakPHINode(const PHINode &I);

  /// Return true if \p T is a legal scalar floating point type.
  bool isLegalFloatingTy(const Type *T) const;

  /// Wrapper to pass all the arguments to computeKnownFPClass
  KnownFPClass computeKnownFPClass(const Value *V, FPClassTest Interested,
                                   const Instruction *CtxI) const {
    return llvm::computeKnownFPClass(V, Interested,
                                     SQ.getWithInstruction(CtxI));
  }

  /// \returns True if denormal values of \p V do not need special handling:
  /// either f32 denormals are flushed, or \p V is known to never be subnormal
  /// at \p CtxI.
  bool canIgnoreDenormalInput(const Value *V, const Instruction *CtxI) const {
    return HasFP32DenormalFlush ||
           computeKnownFPClass(V, fcSubnormal, CtxI).isKnownNeverSubnormal();
  }

  /// \returns The minimum number of bits needed to store the value of \p Op as
  /// an unsigned integer. Truncating to this size and then zero-extending to
  /// the original size will not change the value.
  unsigned numBitsUnsigned(Value *Op, const Instruction *CtxI) const;

  /// \returns The minimum number of bits needed to store the value of \p Op as
  /// a signed integer. Truncating to this size and then sign-extending to
  /// the original size will not change the value.
  unsigned numBitsSigned(Value *Op, const Instruction *CtxI) const;

  /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.i24.
  /// NOTE(review): the original rationale ("SelectionDAG has an issue where an
  /// and asserting the bits are known ...") was cut off mid-sentence; restore
  /// it from the commit history if possible.
  bool replaceMulWithMul24(BinaryOperator &I) const;

  /// Perform same function as equivalently named function in DAGCombiner. Since
  /// we expand some divisions here, we need to perform this before obscuring.
  bool foldBinOpIntoSelect(BinaryOperator &I) const;

  bool divHasSpecialOptimization(BinaryOperator &I,
                                 Value *Num, Value *Den) const;
  unsigned getDivNumBits(BinaryOperator &I, Value *Num, Value *Den,
                         unsigned MaxDivBits, bool Signed) const;

  /// Expands 24 bit div or rem.
  Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den,
                        bool IsDiv, bool IsSigned) const;

  Value *expandDivRem24Impl(IRBuilder<> &Builder, BinaryOperator &I,
                            Value *Num, Value *Den, unsigned NumBits,
                            bool IsDiv, bool IsSigned) const;

  /// Expands 32 bit div or rem.
  Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den) const;

  Value *shrinkDivRem64(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den) const;
  void expandDivRem64(BinaryOperator &I) const;

  /// Check whether a scalar load can be widened.
  ///
  /// \details Uniform, small-type loads can be widened to a full 32 bits and
  /// the result truncated afterwards, which allows a scalar load to be used
  /// instead of a vector load.
  ///
  /// \returns True if \p I is a simple load of a type narrower than 32 bits,
  /// is at least 4-byte aligned and is uniform.
  bool canWidenScalarExtLoad(LoadInst &I) const;

  Value *matchFractPatImpl(Value &V, const APFloat &C) const;
  Value *matchFractPatNanAvoidant(Value &V);
  Value *applyFractPat(IRBuilder<> &Builder, Value *FractArg);

  /// \returns True if both the fdiv and the sqrt allow contraction, which is
  /// the precondition for forming an rsq.
  bool canOptimizeWithRsq(FastMathFlags DivFMF, FastMathFlags SqrtFMF) const;

  Value *optimizeWithRsq(IRBuilder<> &Builder, Value *Num, Value *Den,
                         FastMathFlags DivFMF, FastMathFlags SqrtFMF,
                         const Instruction *CtxI) const;

  Value *optimizeWithRcp(IRBuilder<> &Builder, Value *Num, Value *Den,
                         FastMathFlags FMF, const Instruction *CtxI) const;
  Value *optimizeWithFDivFast(IRBuilder<> &Builder, Value *Num, Value *Den,
                              float ReqdAccuracy) const;

  /// Expand a single scalar fdiv element, trying rsq, rcp, fdiv.fast and
  /// finally the frexp-based expansion, in that order.
  Value *visitFDivElement(IRBuilder<> &Builder, Value *Num, Value *Den,
                          FastMathFlags DivFMF, FastMathFlags SqrtFMF,
                          Value *RsqOp, const Instruction *FDiv,
                          float ReqdAccuracy) const;

  /// \returns The (mantissa, exponent) pair of a frexp of \p Src.
  std::pair<Value *, Value *> getFrexpResults(IRBuilder<> &Builder,
                                              Value *Src) const;

  Value *emitRcpIEEE1ULP(IRBuilder<> &Builder, Value *Src,
                         bool IsNegative) const;
  Value *emitFrexpDiv(IRBuilder<> &Builder, Value *LHS, Value *RHS,
                      FastMathFlags FMF) const;
  Value *emitSqrtIEEE2ULP(IRBuilder<> &Builder, Value *Src,
                          FastMathFlags FMF) const;
  Value *emitRsqF64(IRBuilder<> &Builder, Value *X, FastMathFlags SqrtFMF,
                    FastMathFlags DivFMF, const Instruction *CtxI,
                    bool IsNegative) const;

  CallInst *createWorkitemIdX(IRBuilder<> &B) const;
  void replaceWithWorkitemIdX(Instruction &I) const;
  void replaceWithMaskedWorkitemIdX(Instruction &I, unsigned WaveSize) const;
  bool tryReplaceWithWorkitemId(Instruction &I, unsigned Wave) const;

  bool tryNarrowMathIfNoOverflow(Instruction *I);

public:
  bool visitFDiv(BinaryOperator &I);

  /// Fallback for instructions without a dedicated visitor: no change.
  bool visitInstruction(Instruction &I) { return false; }
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoadInst(LoadInst &I);
  bool visitSelectInst(SelectInst &I);
  bool visitPHINode(PHINode &I);
  bool visitAddrSpaceCastInst(AddrSpaceCastInst &I);

  bool visitIntrinsicInst(IntrinsicInst &I);
  bool visitFMinLike(IntrinsicInst &I);
  bool visitSqrt(IntrinsicInst &I);
  bool visitLog(FPMathOperator &Log, Intrinsic::ID IID);
  bool visitMbcntLo(IntrinsicInst &I) const;
  bool visitMbcntHi(IntrinsicInst &I) const;
  bool visitVectorReduceAdd(IntrinsicInst &I);
  bool visitSaturatingAdd(IntrinsicInst &I);
  /// Visit every instruction of \c F and clean up dead values afterwards.
  /// \returns True if any visitor reported a change.
  bool run();
};


/// FunctionPass wrapper for the AMDGPU IR codegen preparation transforms.
/// runOnFunction is defined out of line.
class AMDGPUCodeGenPrepare : public FunctionPass {
public:
  static char ID;
  AMDGPUCodeGenPrepare() : FunctionPass(ID) {}
  /// Declares the required analyses: assumption cache, uniformity info and
  /// target library info. Everything is reported as preserved unless the
  /// 64-bit division expansion in IR is enabled.
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<UniformityInfoWrapperPass>();
    AU.addRequired<TargetLibraryInfoWrapperPass>();

    // FIXME: Division expansion needs to preserve the dominator tree.
    if (!ExpandDiv64InIR)
      AU.setPreservesAll();
  }
  bool runOnFunction(Function &F) override;
  StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
};


} // end anonymous namespace


bool AMDGPUCodeGenPrepareImpl::run() {

  BreakPhiNodesCache.clear();

  bool MadeChange = false;


  // Need to use make_early_inc_range because integer division expansion is

  // handled by Transform/Utils, and it can delete instructions such as the

  // terminator of the BB.

  for (BasicBlock &BB : reverse(F)) {

    for (Instruction &I : make_early_inc_range(reverse(BB))) {

      if (!isInstructionTriviallyDead(&I, TLI))

        MadeChange |= visit(I);

    }

  }


  while (!DeadVals.empty()) {

    if (auto *I = dyn_cast_or_null<Instruction>(DeadVals.pop_back_val()))

      RecursivelyDeleteTriviallyDeadInstructions(I, TLI);

  }


  return MadeChange;

}


/// \returns true for float and double, and for half when the subtarget has
/// 16-bit instructions.
bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const {
  if (Ty->isFloatTy() || Ty->isDoubleTy())
    return true;

  // half is only legal when the subtarget provides 16-bit instructions.
  if (!Ty->isHalfTy())
    return false;
  return ST.has16BitInsts();
}


/// \returns true if \p I is a simple load of a type narrower than 32 bits
/// that is at least 4-byte aligned and uniform.
bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const {
  Type *LoadTy = I.getType();
  const int SizeInBits = DL.getTypeSizeInBits(LoadTy);
  const Align LoadAlign = DL.getValueOrABITypeAlignment(I.getAlign(), LoadTy);

  if (!I.isSimple())
    return false;
  if (!(SizeInBits < 32))
    return false;
  if (!(LoadAlign >= 4))
    return false;
  return UA.isUniformAtDef(&I);
}


/// \returns the maximum number of active bits \p Op can have according to
/// the known-bits analysis evaluated at \p CtxI.
unsigned
AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op,
                                          const Instruction *CtxI) const {
  const KnownBits Known = computeKnownBits(Op, SQ.getWithInstruction(CtxI));
  return Known.countMaxActiveBits();
}


/// \returns the maximum number of significant bits of \p Op when it is
/// interpreted as a signed integer, evaluated at \p CtxI.
unsigned
AMDGPUCodeGenPrepareImpl::numBitsSigned(Value *Op,
                                        const Instruction *CtxI) const {
  const unsigned SignificantBits =
      ComputeMaxSignificantBits(Op, SQ.DL, SQ.AC, CtxI, SQ.DT);
  return SignificantBits;
}


/// Append the scalar pieces of \p V to \p Values: one extractelement per lane
/// when \p V is a fixed-width vector, otherwise \p V itself.
static void extractValues(IRBuilder<> &Builder,
                          SmallVectorImpl<Value *> &Values, Value *V) {
  if (auto *VecTy = dyn_cast<FixedVectorType>(V->getType())) {
    const unsigned NumElts = VecTy->getNumElements();
    for (unsigned Idx = 0; Idx < NumElts; ++Idx)
      Values.push_back(Builder.CreateExtractElement(V, Idx));
    return;
  }

  // Not a fixed vector: pass the value through unchanged.
  Values.push_back(V);
}


/// Inverse of extractValues: rebuild a value of type \p Ty from the scalar
/// pieces in \p Values. For a non-vector type the single piece is returned
/// as is; for a vector type the pieces are inserted lane by lane into a
/// poison vector.
static Value *insertValues(IRBuilder<> &Builder,
                           Type *Ty,
                           SmallVectorImpl<Value *> &Values) {
  if (Ty->isVectorTy()) {
    Value *Vec = PoisonValue::get(Ty);
    const unsigned NumElts = Values.size();
    for (unsigned Idx = 0; Idx < NumElts; ++Idx)
      Vec = Builder.CreateInsertElement(Vec, Values[Idx], Idx);
    return Vec;
  }

  assert(Values.size() == 1);
  return Values[0];
}


/// Try to replace the integer multiply \p I with llvm.amdgcn.mul.u24 or
/// llvm.amdgcn.mul.i24.
///
/// The replacement is done only when \p I is a mul that is not uniform, is
/// not a <=16-bit multiply on a subtarget with 16-bit instructions, and both
/// operands are known to fit in 24 bits (unsigned is tried first, then
/// signed). Vector multiplies are split into scalar elements, multiplied one
/// by one and reassembled.
///
/// On success all uses of \p I are redirected to the new value and \p I is
/// queued in DeadVals.
///
/// \returns true if \p I was replaced.
bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {
  if (I.getOpcode() != Instruction::Mul)
    return false;

  Type *Ty = I.getType();
  const unsigned Size = Ty->getScalarSizeInBits();
  if (Size <= 16 && ST.has16BitInsts())
    return false;

  // Prefer scalar if this could be s_mul_i32
  if (UA.isUniformAtDef(&I))
    return false;

  Value *LHS = I.getOperand(0);
  Value *RHS = I.getOperand(1);

  // Unsigned takes precedence; the signed query is only made when the
  // unsigned form is not usable.
  const bool UseU24 = ST.hasMulU24() && numBitsUnsigned(LHS, &I) <= 24 &&
                      numBitsUnsigned(RHS, &I) <= 24;
  const bool UseI24 = !UseU24 && ST.hasMulI24() &&
                      numBitsSigned(LHS, &I) <= 24 &&
                      numBitsSigned(RHS, &I) <= 24;
  if (!UseU24 && !UseI24)
    return false;
  const bool IsSigned = UseI24;

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  SmallVector<Value *, 4> LHSVals;
  SmallVector<Value *, 4> RHSVals;
  SmallVector<Value *, 4> ResultVals;
  extractValues(Builder, LHSVals, LHS);
  extractValues(Builder, RHSVals, RHS);

  // The operands of the intrinsic are always i32; only the result type is
  // widened to i64 for multiplies wider than 32 bits.
  IntegerType *I32Ty = Builder.getInt32Ty();
  IntegerType *IntrinTy = Size > 32 ? Builder.getInt64Ty() : I32Ty;
  Type *DstTy = LHSVals[0]->getType();
  const Intrinsic::ID MulID =
      IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;

  // Extend or truncate with the signedness that was proven above.
  auto ExtOrTrunc = [&](Value *V, Type *ToTy) -> Value * {
    return IsSigned ? Builder.CreateSExtOrTrunc(V, ToTy)
                    : Builder.CreateZExtOrTrunc(V, ToTy);
  };

  for (unsigned Idx = 0, NumElts = LHSVals.size(); Idx != NumElts; ++Idx) {
    Value *LHSElt = ExtOrTrunc(LHSVals[Idx], I32Ty);
    Value *RHSElt = ExtOrTrunc(RHSVals[Idx], I32Ty);
    Value *Product =
        Builder.CreateIntrinsic(MulID, {IntrinTy}, {LHSElt, RHSElt});
    ResultVals.push_back(ExtOrTrunc(Product, DstTy));
  }

  Value *NewVal = insertValues(Builder, Ty, ResultVals);
  NewVal->takeName(&I);
  I.replaceAllUsesWith(NewVal);
  DeadVals.push_back(&I);

  return true;
}


/// Look for a select that is either \p V itself or the operand of a cast
/// instruction \p V. This is mostly to deal with cases where i16 selects were
/// promoted here to i32.
///
/// \p Cast is reset to null first. It is set to the cast whenever \p V is a
/// cast instruction, even if the operand of that cast turns out not to be a
/// select.
///
/// \returns the select, or null if none was found.
static SelectInst *findSelectThroughCast(Value *V, CastInst *&Cast) {
  Cast = nullptr;

  if (auto *DirectSel = dyn_cast<SelectInst>(V))
    return DirectSel;

  Cast = dyn_cast<CastInst>(V);
  if (!Cast)
    return nullptr;

  return dyn_cast<SelectInst>(Cast->getOperand(0));
}


/// Fold a binary operator with a constant operand into a select of constants:
///
///   binop (select c, C1, C2), C3  ->  select c, (C1 op C3), (C2 op C3)
///
/// The select may be on either side of the operator and may be reached
/// through a single cast instruction. The fold is done only when the select
/// (and the cast, if any) has exactly one use, and both folded arms simplify
/// to a constant that is not a constant expression.
///
/// On success the uses of \p BO are redirected to the new select, and \p BO,
/// the cast (if any) and the old select are queued in DeadVals.
///
/// \returns true if the fold was performed.
bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const {
  // Don't do this unless the old select is going away. We want to eliminate the
  // binary operator, not replace a binop with a select.
  int SelOpNo = 0;

  CastInst *CastOp;

  // TODO: Should probably try to handle some cases with multiple
  // users. Duplicating the select may be profitable for division.
  SelectInst *Sel = findSelectThroughCast(BO.getOperand(0), CastOp);
  if (!Sel || !Sel->hasOneUse()) {
    SelOpNo = 1;
    Sel = findSelectThroughCast(BO.getOperand(1), CastOp);
  }

  if (!Sel || !Sel->hasOneUse())
    return false;

  Constant *CT = dyn_cast<Constant>(Sel->getTrueValue());
  Constant *CF = dyn_cast<Constant>(Sel->getFalseValue());
  Constant *CBO = dyn_cast<Constant>(BO.getOperand(SelOpNo ^ 1));
  if (!CBO || !CT || !CF)
    return false;

  if (CastOp) {
    if (!CastOp->hasOneUse())
      return false;
    CT = ConstantFoldCastOperand(CastOp->getOpcode(), CT, BO.getType(), DL);
    CF = ConstantFoldCastOperand(CastOp->getOpcode(), CF, BO.getType(), DL);

    // Defensive: give up if either arm could not be folded through the cast,
    // rather than handing a null constant to the binary-operator folder below.
    if (!CT || !CF)
      return false;
  }

  // TODO: Handle special 0/-1 cases DAG combine does, although we only really
  // need to handle divisions here.
  // Keep the operand order of the original operator: the select arm goes on
  // the side the select was found on.
  Constant *FoldedT =
      SelOpNo ? ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CT, DL)
              : ConstantFoldBinaryOpOperands(BO.getOpcode(), CT, CBO, DL);
  if (!FoldedT || isa<ConstantExpr>(FoldedT))
    return false;

  Constant *FoldedF =
      SelOpNo ? ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CF, DL)
              : ConstantFoldBinaryOpOperands(BO.getOpcode(), CF, CBO, DL);
  if (!FoldedF || isa<ConstantExpr>(FoldedF))
    return false;

  IRBuilder<> Builder(&BO);
  Builder.SetCurrentDebugLocation(BO.getDebugLoc());
  // Carry the fast-math flags of a floating-point operator over to the select.
  if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&BO))
    Builder.setFastMathFlags(FPOp->getFastMathFlags());

  Value *NewSelect = Builder.CreateSelect(Sel->getCondition(),
                                          FoldedT, FoldedF);
  NewSelect->takeName(&BO);
  BO.replaceAllUsesWith(NewSelect);
  DeadVals.push_back(&BO);
  if (CastOp)
    DeadVals.push_back(CastOp);
  DeadVals.push_back(Sel);
  return true;
}


/// Emit a frexp of \p Src.
///
/// \returns the pair (mantissa, exponent). The exponent is an i32.
std::pair<Value *, Value *>
AMDGPUCodeGenPrepareImpl::getFrexpResults(IRBuilder<> &Builder,
                                          Value *Src) const {
  Type *SrcTy = Src->getType();
  Type *ExpTy = Builder.getInt32Ty();

  Value *FrexpCall =
      Builder.CreateIntrinsic(Intrinsic::frexp, {SrcTy, ExpTy}, Src);
  Value *Mantissa = Builder.CreateExtractValue(FrexpCall, {0});

  // Bypass the bug workaround for the exponent result since it doesn't matter.
  // TODO: Does the bug workaround even really need to consider the exponent
  // result? It's unspecified by the spec.
  Value *Exponent = nullptr;
  if (ST.hasFractBug()) {
    Exponent = Builder.CreateIntrinsic(Intrinsic::amdgcn_frexp_exp,
                                       {ExpTy, SrcTy}, Src);
  } else {
    Exponent = Builder.CreateExtractValue(FrexpCall, {1});
  }

  return {Mantissa, Exponent};
}


/// Emit an expansion of 1.0 / Src (or -1.0 / Src when \p IsNegative is set)
/// good for 1ulp that supports denormals.
Value *AMDGPUCodeGenPrepareImpl::emitRcpIEEE1ULP(IRBuilder<> &Builder,
                                                 Value *Src,
                                                 bool IsNegative) const {
  // The negative case is the same as for 1.0, with the sign moved from the
  // constant onto the operand:
  // -1.0 / x -> rcp (fneg x)
  Value *Input = IsNegative ? Builder.CreateFNeg(Src) : Src;

  // The rcp instruction doesn't support denormals, so scale the input
  // out of the denormal range and convert at the end.
  //
  // Expand as 2^-n * (1.0 / (x * 2^n))

  // TODO: Skip scaling if input is known never denormal and the input
  // range won't underflow to denormal. The hard part is knowing the
  // result. We need a range check, the result could be denormal for
  // 0x1p+126 < den <= 0x1p+127.
  const std::pair<Value *, Value *> MantExp = getFrexpResults(Builder, Input);
  Value *NegExp = Builder.CreateNeg(MantExp.second);
  Value *MantRcp =
      Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, MantExp.first);
  return Builder.CreateCall(getLdexpF32(), {MantRcp, NegExp});
}


/// Emit a 2ulp expansion for fdiv by using frexp for input scaling.
///
/// \returns the quotient, or null when this expansion is not worthwhile (see
/// the check at the top of the body).
Value *AMDGPUCodeGenPrepareImpl::emitFrexpDiv(IRBuilder<> &Builder, Value *LHS,
                                              Value *RHS,
                                              FastMathFlags FMF) const {
  // If we have to work around the fract/frexp bug, we're worse off than
  // using the fdiv.fast expansion. The full safe expansion is faster if we have
  // fast FMA.
  const bool MayHaveNanOrInf = !(FMF.noNaNs() && FMF.noInfs());
  if (HasFP32DenormalFlush && ST.hasFractBug() && !ST.hasFastFMAF32() &&
      MayHaveNanOrInf)
    return nullptr;

  // We're scaling the LHS to avoid a denormal input, and scale the denominator
  // to avoid large values underflowing the result.
  const std::pair<Value *, Value *> DenMantExp = getFrexpResults(Builder, RHS);
  Value *DenRcp =
      Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, DenMantExp.first);

  const std::pair<Value *, Value *> NumMantExp = getFrexpResults(Builder, LHS);
  Value *ScaledQuot = Builder.CreateFMul(NumMantExp.first, DenRcp);

  // We multiplied by 2^N/2^M, so we need to multiply by 2^(N-M) to scale the
  // result.
  Value *ExpDelta = Builder.CreateSub(NumMantExp.second, DenMantExp.second);
  return Builder.CreateCall(getLdexpF32(), {ScaledQuot, ExpDelta});
}


/// Emit a sqrt that handles denormals and is accurate to 2ulp.
///
/// Inputs below the smallest normal value are scaled up by 2^32 with ldexp
/// before the sqrt, and the result is scaled back down by 2^-16; for all
/// other inputs both scale exponents are zero.
Value *AMDGPUCodeGenPrepareImpl::emitSqrtIEEE2ULP(IRBuilder<> &Builder,
                                                  Value *Src,
                                                  FastMathFlags FMF) const {
  Type *SrcTy = Src->getType();
  const APFloat MinNormal =
      APFloat::getSmallestNormalized(SrcTy->getFltSemantics());
  Value *NeedsScaling =
      Builder.CreateFCmpOLT(Src, ConstantFP::get(SrcTy, MinNormal));

  ConstantInt *NoScale = Builder.getInt32(0);

  // Scale the input up ...
  Value *ScaleUpExp =
      Builder.CreateSelect(NeedsScaling, Builder.getInt32(32), NoScale);
  Value *ScaledSrc = Builder.CreateCall(getLdexpF32(), {Src, ScaleUpExp});

  Value *ScaledSqrt = Builder.CreateCall(getSqrtF32(), ScaledSrc);

  // ... and the result back down by half of that exponent.
  Value *ScaleDownExp =
      Builder.CreateSelect(NeedsScaling, Builder.getInt32(-16), NoScale);
  return Builder.CreateCall(getLdexpF32(), {ScaledSqrt, ScaleDownExp});
}


/// Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals.
/// When \p IsNegative is set the sign is folded into the output scale, giving
/// -1.0 / sqrt(Src).
static Value *emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src,
                              bool IsNegative) {
  // bool need_scale = x < 0x1p-126f;
  // float input_scale = need_scale ? 0x1.0p+24f : 1.0f;
  // float output_scale = need_scale ? 0x1.0p+12f : 1.0f;
  // rsq(x * input_scale) * output_scale;

  Type *SrcTy = Src->getType();
  const APFloat MinNormal =
      APFloat::getSmallestNormalized(SrcTy->getFltSemantics());
  Value *NeedsScaling =
      Builder.CreateFCmpOLT(Src, ConstantFP::get(SrcTy, MinNormal));

  Constant *PosOne = ConstantFP::get(SrcTy, 1.0);
  Constant *ScaledIn = ConstantFP::get(SrcTy, 0x1.0p+24);
  Constant *ScaledOut =
      ConstantFP::get(SrcTy, IsNegative ? -0x1.0p+12 : 0x1.0p+12);

  Value *InScale = Builder.CreateSelect(NeedsScaling, ScaledIn, PosOne);
  Value *RsqInput = Builder.CreateFMul(Src, InScale);
  Value *RawRsq =
      Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, RsqInput);

  // Without scaling the output factor is just the requested sign.
  Constant *UnscaledOut = IsNegative ? ConstantFP::get(SrcTy, -1.0) : PosOne;
  Value *OutScale = Builder.CreateSelect(NeedsScaling, ScaledOut, UnscaledOut);

  return Builder.CreateFMul(RawRsq, OutScale);
}


/// Emit inverse sqrt expansion for f64 with a correction sequence on top of
/// v_rsq_f64. This should give a 1ulp result.
///
/// \param Builder    Builder used to emit the expansion.
/// \param X          The sqrt operand.
/// \param SqrtFMF    Fast-math flags of the sqrt.
/// \param DivFMF     Fast-math flags of the fdiv.
/// \param CtxI       Context instruction for the known-FP-class query.
/// \param IsNegative Emit -rsq(x) instead of rsq(x).
/// \returns The value of the refined result.
Value *AMDGPUCodeGenPrepareImpl::emitRsqF64(IRBuilder<> &Builder, Value *X,
                                            FastMathFlags SqrtFMF,
                                            FastMathFlags DivFMF,
                                            const Instruction *CtxI,
                                            bool IsNegative) const {
  // rsq(x):
  //   double y0 = BUILTIN_AMDGPU_RSQRT_F64(x);
  //   double e = MATH_MAD(-y0 * (x == PINF_F64 || x == 0.0 ? y0 : x), y0, 1.0);
  //   return MATH_MAD(y0*e, MATH_MAD(e, 0.375, 0.5), y0);
  //
  // -rsq(x):
  //   double y0 = BUILTIN_AMDGPU_RSQRT_F64(x);
  //   double e = MATH_MAD(-y0 * (x == PINF_F64 || x == 0.0 ? y0 : x), y0, 1.0);
  //   return MATH_MAD(-y0*e, MATH_MAD(e, 0.375, 0.5), -y0);
  //
  // The rsq instruction handles the special cases correctly. We need to check
  // for the edge case conditions to ensure the special case propagates through
  // the later instructions.

  Value *Y0 = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, X);

  // Try to elide the edge case check.
  //
  // Fast math flags imply:
  //   sqrt ninf => !isinf(x)
  //   fdiv ninf => x != 0, !isinf(x)
  bool MaybePosInf = !SqrtFMF.noInfs() && !DivFMF.noInfs();
  bool MaybeZero = !DivFMF.noInfs();

  // Only assigned (and only read) when at least one special case is still
  // possible after looking at the fast-math flags.
  DenormalMode DenormMode;
  FPClassTest Interested = fcNone;
  if (MaybePosInf)
    Interested = fcPosInf;
  if (MaybeZero)
    Interested |= fcZero;

  // Refine the flag-based answer with what is known about the class of X.
  if (Interested != fcNone) {
    KnownFPClass KnownSrc = computeKnownFPClass(X, Interested, CtxI);
    if (KnownSrc.isKnownNeverPosInfinity())
      MaybePosInf = false;

    DenormMode = F.getDenormalMode(X->getType()->getFltSemantics());
    if (KnownSrc.isKnownNeverLogicalZero(DenormMode))
      MaybeZero = false;
  }

  // The value multiplied with -y0 in the error term: x, or y0 when x is one of
  // the special inputs.
  Value *SpecialOrRsq = X;
  if (MaybeZero || MaybePosInf) {
    Value *Cond;
    if (MaybePosInf && MaybeZero) {
      if (DenormMode.Input != DenormalMode::DenormalModeKind::Dynamic) {
        FPClassTest TestMask = fcPosInf | fcZero;
        // Denormal inputs that are treated as zero must take the special path
        // as well.
        if (DenormMode.inputsAreZero())
          TestMask |= fcSubnormal;

        Cond = Builder.createIsFPClass(X, TestMask);
      } else {
        // Avoid using llvm.is.fpclass for dynamic denormal mode, since it
        // doesn't respect the floating-point environment.
        Value *IsZero =
            Builder.CreateFCmpOEQ(X, ConstantFP::getZero(X->getType()));
        Value *IsInf =
            Builder.CreateFCmpOEQ(X, ConstantFP::getInfinity(X->getType()));
        Cond = Builder.CreateOr(IsZero, IsInf);
      }
    } else if (MaybeZero) {
      Cond = Builder.CreateFCmpOEQ(X, ConstantFP::getZero(X->getType()));
    } else {
      Cond = Builder.CreateFCmpOEQ(X, ConstantFP::getInfinity(X->getType()));
    }

    SpecialOrRsq = Builder.CreateSelect(Cond, Y0, X);
  }

  Value *NegY0 = Builder.CreateFNeg(Y0);
  Value *NegXY0 = Builder.CreateFMul(SpecialOrRsq, NegY0);

  // e = fma(-y0 * x, y0, 1.0)
  // Could be fmuladd, but isFMAFasterThanFMulAndFAdd is always true for f64.
  Value *E = Builder.CreateFMA(NegXY0, Y0, ConstantFP::get(X->getType(), 1.0));

  // For the negated result the sign is applied to y0 in both places it is
  // used below.
  Value *Y0E = Builder.CreateFMul(E, IsNegative ? NegY0 : Y0);

  // fma(e, 0.375, 0.5)
  Value *EFMA = Builder.CreateFMA(E, ConstantFP::get(X->getType(), 0.375),
                                  ConstantFP::get(X->getType(), 0.5));

  return Builder.CreateFMA(Y0E, EFMA, IsNegative ? NegY0 : Y0);
}


/// \returns true if both the fdiv and the sqrt allow contraction.
bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(FastMathFlags DivFMF,
                                                  FastMathFlags SqrtFMF) const {
  // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp for f32 and
  // f64, so contraction being allowed on both operations is all we need.
  if (!DivFMF.allowContract())
    return false;
  return SqrtFMF.allowContract();
}


/// Try to turn +/-1.0 / sqrt(Den) into an rsq-based sequence. \p Den is the
/// operand of the sqrt.
///
/// Only a constant numerator of exactly 1.0 or -1.0 and an f32 or f64 operand
/// are handled. The emitted instructions carry the union of the fdiv and sqrt
/// fast-math flags.
///
/// \returns the replacement value, or null if the pattern is not handled.
Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
    IRBuilder<> &Builder, Value *Num, Value *Den, const FastMathFlags DivFMF,
    const FastMathFlags SqrtFMF, const Instruction *CtxI) const {
  // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.
  assert(DivFMF.allowContract() && SqrtFMF.allowContract());

  // rsq_f16 is accurate to 0.51 ulp.
  // rsq_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
  // rsq_f64 is never accurate.
  const auto *NumC = dyn_cast<ConstantFP>(Num);
  if (!NumC)
    return nullptr;

  // TODO: Handle other numerator values with arcp.
  const bool IsPosOne = NumC->isExactlyValue(1.0);
  const bool IsNegative = !IsPosOne && NumC->isExactlyValue(-1.0);
  if (!IsPosOne && !IsNegative)
    return nullptr;

  // Add in the sqrt flags.
  IRBuilder<>::FastMathFlagGuard Guard(Builder);
  Builder.setFastMathFlags(DivFMF | SqrtFMF);

  Type *DenTy = Den->getType();
  if (DenTy->isDoubleTy())
    return emitRsqF64(Builder, Den, SqrtFMF, DivFMF, CtxI, IsNegative);

  if (!DenTy->isFloatTy())
    return nullptr;

  // f32: the raw instruction is only good enough when both operations are
  // approximate or denormal inputs need no special handling.
  const bool BothApprox = DivFMF.approxFunc() && SqrtFMF.approxFunc();
  if (!BothApprox && !canIgnoreDenormalInput(Den, CtxI))
    return emitRsqIEEE1ULP(Builder, Den, IsNegative);

  Value *Rsq = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, Den);
  // -1.0 / sqrt(x) -> fneg(rsq(x))
  if (IsNegative)
    return Builder.CreateFNeg(Rsq);
  return Rsq;
}


// Optimize fdiv with rcp:
//
// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
//               allowed with afn.
//
// a/b -> a*rcp(b) when arcp is allowed, and we only need provide ULP 1.0
//
// Returns null when neither form applies. Only f32 is supported.
Value *
AMDGPUCodeGenPrepareImpl::optimizeWithRcp(IRBuilder<> &Builder, Value *Num,
                                          Value *Den, FastMathFlags FMF,
                                          const Instruction *CtxI) const {
  // rcp_f16 is accurate to 0.51 ulp.
  // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
  // rcp_f64 is never accurate.
  assert(Den->getType()->isFloatTy());

  // The bare instruction can be used when denormals are flushed anyway or an
  // approximate result is acceptable; otherwise the scaled 1ulp expansion is
  // needed.
  const bool CanUseRawRcp = HasFP32DenormalFlush || FMF.approxFunc();

  if (const auto *NumC = dyn_cast<ConstantFP>(Num)) {
    const bool IsPosOne = NumC->isExactlyValue(1.0);
    const bool IsNegOne = !IsPosOne && NumC->isExactlyValue(-1.0);

    if (IsPosOne || IsNegOne) {
      // TODO: If the input isn't denormal, and we know the input exponent isn't
      // big enough to introduce a denormal we can avoid the scaling.
      if (!CanUseRawRcp)
        return emitRcpIEEE1ULP(Builder, Den, IsNegOne);

      // -1.0 / x -> 1.0 / fneg(x)
      Value *RcpSrc = IsNegOne ? Builder.CreateFNeg(Den) : Den;

      // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
      // the CI documentation has a worst case error of 1 ulp.
      // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK
      // to use it as long as we aren't trying to use denormals.
      //
      // v_rcp_f16 and v_rsq_f16 DO support denormals.

      // NOTE: v_sqrt and v_rcp will be combined to v_rsq later. So we don't
      //       insert rsq intrinsic here.

      // 1.0 / x -> rcp(x)
      return Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, RcpSrc);
    }
  }

  if (!FMF.allowReciprocal())
    return nullptr;

  // x / y -> x * (1.0 / y)

  // TODO: Could avoid denormal scaling and use raw rcp if we knew the output
  // will never underflow.
  Value *Recip = nullptr;
  if (CanUseRawRcp)
    Recip = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, Den);
  else
    Recip = emitRcpIEEE1ULP(Builder, Den, false);

  return Builder.CreateFMul(Num, Recip);
}


// Optimize with fdiv.fast:
//
// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
//
// 1/x -> fdiv.fast(1,x)  when !fpmath >= 2.5ulp.
//
// Returns null when fdiv.fast cannot be used.
//
// NOTE: optimizeWithRcp should be tried first because rcp is the preference.
Value *AMDGPUCodeGenPrepareImpl::optimizeWithFDivFast(
    IRBuilder<> &Builder, Value *Num, Value *Den, float ReqdAccuracy) const {
  // fdiv.fast can achieve 2.5 ULP accuracy.
  if (ReqdAccuracy < 2.5f)
    return nullptr;

  // Only have fdiv.fast for f32.
  assert(Den->getType()->isFloatTy());

  const auto *NumC = dyn_cast<ConstantFP>(Num);
  const bool NumIsPlusMinusOne =
      NumC && (NumC->isExactlyValue(+1.0) || NumC->isExactlyValue(-1.0));

  // fdiv does not support denormals. But 1.0/x is always fine to use it.
  //
  // TODO: This works for any value with a specific known exponent range, don't
  // just limit to constant 1.
  if (HasFP32DenormalFlush || NumIsPlusMinusOne)
    return Builder.CreateIntrinsic(Intrinsic::amdgcn_fdiv_fast, {Num, Den});

  return nullptr;
}


// Lower a single scalar fdiv element. Candidates are tried in order of
// preference: rsq (only when \p RsqOp is non-null), rcp, fdiv.fast, and
// finally the frexp based expansion. Returns nullptr when the rsq path did not
// apply and the element is not f32.
Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
    IRBuilder<> &Builder, Value *Num, Value *Den, FastMathFlags DivFMF,
    FastMathFlags SqrtFMF, Value *RsqOp, const Instruction *FDivInst,
    float ReqdDivAccuracy) const {
  if (RsqOp) {
    if (Value *Rsq =
            optimizeWithRsq(Builder, Num, RsqOp, DivFMF, SqrtFMF, FDivInst))
      return Rsq;
  }

  // The remaining expansions are only attempted for f32.
  if (!Num->getType()->isFloatTy())
    return nullptr;

  if (Value *Rcp = optimizeWithRcp(Builder, Num, Den, DivFMF, FDivInst))
    return Rcp;

  // In the basic case fdiv_fast has the same instruction count as the frexp div
  // expansion. Slightly prefer fdiv_fast since it ends in an fmul that can
  // potentially be fused into a user. Also, materialization of the constants
  // can be reused for multiple instances.
  if (Value *FDivFast =
          optimizeWithFDivFast(Builder, Num, Den, ReqdDivAccuracy))
    return FDivFast;

  return emitFrexpDiv(Builder, Num, Den, DivFMF);
}


// Optimizations are performed based on !fpmath, fast math flags and the
// denormal mode to replace fdiv with either rcp or fdiv.fast.
//
// With rcp:
//   1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
//                 allowed with afn.
//
//   a/b -> a*rcp(b) when inaccurate rcp is allowed with afn.
//
// With fdiv.fast:
//   a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
//
//   1/x -> fdiv.fast(1,x)  when !fpmath >= 2.5ulp.
//
// NOTE: rcp is the preference in cases that both are legal.
//
// The division is split into scalar elements (extractValues), each element is
// handled by visitFDivElement, and the results are recombined (insertValues).
// Returns false when the fdiv is left untouched by one of the early exits and
// true otherwise.
bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
  if (DisableFDivExpand)
    return false;

  // Only f32 and f64 (scalar or vector) divisions are considered.
  Type *EltTy = FDiv.getType()->getScalarType();
  const bool IsFloat = EltTy->isFloatTy();
  if (!(IsFloat || EltTy->isDoubleTy()))
    return false;

  // The f64 rcp/rsq approximations are pretty inaccurate. We can do an
  // expansion around them in codegen. f16 is good enough to always use.

  const auto *FPOp = cast<const FPMathOperator>(&FDiv);
  const FastMathFlags DivFMF = FPOp->getFastMathFlags();
  const float ReqdAccuracy = FPOp->getFPAccuracy();

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  // Look for a denominator that is a single-use llvm.sqrt call; if the flags
  // permit it, remember the sqrt operand so the rsq path can be tried.
  FastMathFlags SqrtFMF;
  Value *RsqOp = nullptr;
  if (auto *DenII = dyn_cast<IntrinsicInst>(Den);
      DenII && DenII->getIntrinsicID() == Intrinsic::sqrt &&
      DenII->hasOneUse()) {
    const auto *SqrtOp = cast<FPMathOperator>(DenII);
    SqrtFMF = SqrtOp->getFastMathFlags();
    if (canOptimizeWithRsq(DivFMF, SqrtFMF))
      RsqOp = SqrtOp->getOperand(0);
  }

  if (!RsqOp) {
    // rcp path not yet implemented for f64.
    if (!IsFloat)
      return false;

    // Inaccurate rcp is allowed with afn.
    //
    // Defer to codegen to handle this.
    //
    // TODO: Decide on an interpretation for interactions between afn + arcp +
    // !fpmath, and make it consistent between here and codegen. For now, defer
    // expansion of afn to codegen. The current interpretation is so aggressive
    // we don't need any pre-consideration here when we have better
    // information. A more conservative interpretation could use handling here.
    if (DivFMF.approxFunc())
      return false;
  }

  // Defer the correct implementations to codegen.
  if (IsFloat && ReqdAccuracy < 1.0f)
    return false;

  // New instructions are inserted right after the original fdiv.
  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));
  Builder.setFastMathFlags(DivFMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  SmallVector<Value *, 4> NumVals;
  SmallVector<Value *, 4> DenVals;
  SmallVector<Value *, 4> RsqDenVals;
  extractValues(Builder, NumVals, Num);
  extractValues(Builder, DenVals, Den);

  if (RsqOp)
    extractValues(Builder, RsqDenVals, RsqOp);

  const unsigned NumElts = NumVals.size();
  SmallVector<Value *, 4> ResultVals;
  ResultVals.reserve(NumElts);
  for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
    Value *NumElt = NumVals[Idx];
    Value *DenElt = DenVals[Idx];
    Value *RsqDenElt = RsqOp ? RsqDenVals[Idx] : nullptr;

    Value *NewElt = visitFDivElement(Builder, NumElt, DenElt, DivFMF, SqrtFMF,
                                     RsqDenElt, &FDiv, ReqdAccuracy);
    if (!NewElt) {
      // Keep the original, but scalarized.

      // This has the unfortunate side effect of sometimes scalarizing when
      // we're not going to do anything.
      NewElt = Builder.CreateFDiv(NumElt, DenElt);
      if (auto *NewEltInst = dyn_cast<Instruction>(NewElt))
        NewEltInst->copyMetadata(FDiv);
    }

    ResultVals.push_back(NewElt);
  }

  if (Value *NewVal = insertValues(Builder, FDiv.getType(), ResultVals)) {
    FDiv.replaceAllUsesWith(NewVal);
    NewVal->takeName(&FDiv);
    DeadVals.push_back(&FDiv);
  }

  return true;
}


/// Emit a full 32x32 -> 64 bit unsigned multiply of \p LHS and \p RHS (both
/// operands are zero-extended to i64) and return the {low, high} i32 halves of
/// the product.
static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder,
                                          Value *LHS, Value *RHS) {
  Type *I32Ty = Builder.getInt32Ty();
  Type *I64Ty = Builder.getInt64Ty();

  // Widen both operands, then multiply in 64 bits.
  Value *WideLHS = Builder.CreateZExt(LHS, I64Ty);
  Value *WideRHS = Builder.CreateZExt(RHS, I64Ty);
  Value *Product = Builder.CreateMul(WideLHS, WideRHS);

  // Low half is a plain truncation; high half is shifted down first.
  Value *LoHalf = Builder.CreateTrunc(Product, I32Ty);
  Value *HiHalf = Builder.CreateTrunc(
      Builder.CreateLShr(Product, Builder.getInt64(32)), I32Ty);
  return {LoHalf, HiHalf};
}


/// Return the high 32 bits of the unsigned 64-bit product of \p LHS and
/// \p RHS, as computed by getMul64.
static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
  const std::pair<Value *, Value *> LoHi = getMul64(Builder, LHS, RHS);
  return LoHi.second;
}


/// Figure out how many bits are really needed for this division.
/// \p MaxDivBits is an optimization hint to bypass the second
/// ComputeNumSignBits/computeKnownBits call if the first one is
/// insufficient.
///
/// \p I is the division being analyzed and is used as the context instruction
/// for the value-tracking queries. \p Num and \p Den must have the same scalar
/// bit width. For signed divisions the result includes one bit reserved for
/// the sign.
///
/// \returns the full scalar width when the denominator alone already needs
/// more than \p MaxDivBits bits; otherwise the number of bits required to
/// represent both operands.
unsigned AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num,
                                                 Value *Den,
                                                 unsigned MaxDivBits,
                                                 bool IsSigned) const {
  assert(Num->getType()->getScalarSizeInBits() ==
         Den->getType()->getScalarSizeInBits());
  const unsigned SSBits = Num->getType()->getScalarSizeInBits();
  if (IsSigned) {
    const unsigned RHSSignBits =
        ComputeNumSignBits(Den, SQ.DL, SQ.AC, &I, SQ.DT);
    // A sign bit needs to be reserved for shrinking.
    if (SSBits - RHSSignBits + 1 > MaxDivBits)
      return SSBits;

    // Query the numerator with the same context as the denominator, including
    // the dominator tree, so both operands are analyzed consistently.
    const unsigned LHSSignBits =
        ComputeNumSignBits(Num, SQ.DL, SQ.AC, &I, SQ.DT);

    const unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
    return SSBits - SignBits + 1;
  }

  // All bits are used for unsigned division for Num or Den in range
  // (SignedMax, UnsignedMax].
  const unsigned RHSBits =
      computeKnownBits(Den, SQ.getWithInstruction(&I)).countMaxActiveBits();
  if (RHSBits > MaxDivBits)
    return SSBits;

  const unsigned LHSBits =
      computeKnownBits(Num, SQ.getWithInstruction(&I)).countMaxActiveBits();

  return std::max(LHSBits, RHSBits);
}


// The fractional part of a float is enough to accurately represent up to
// a 24-bit signed integer.
//
// Expand the div/rem through expandDivRem24Impl when the operands are known to
// fit in few enough bits; otherwise return nullptr.
Value *AMDGPUCodeGenPrepareImpl::expandDivRem24(IRBuilder<> &Builder,
                                                BinaryOperator &I, Value *Num,
                                                Value *Den, bool IsDiv,
                                                bool IsSigned) const {
  const unsigned DivBits = getDivNumBits(I, Num, Den, 24, IsSigned);

  // v_rcp_f32(float(X)) can have an error of 1 ulp.
  // This can cause expandDivRem24Impl to sometimes calculate Y/X incorrectly
  // when abs(Y)>0x800000.
  // For example,
  // (0xbf2758/0xbf2759) erroneously produces 1 instead of 0.
  // (0xe3170d/0x000c32) erroneously produces 4767 instead of 4766.
  //
  // Note that for DivBits==24 && IsSigned, Y is in the range
  // [-0x800000:0x7FFFFF]. abs(Y) is at most
  // 0x800000 so it cannot hit this issue.
  const unsigned MaxSafeBits = IsSigned ? 24 : 23;
  if (DivBits <= MaxSafeBits)
    return expandDivRem24Impl(Builder, I, Num, Den, DivBits, IsDiv, IsSigned);

  return nullptr;
}


// Emit the float based expansion of a div/rem whose operands need at most
// \p DivBits bits. The quotient is estimated as trunc(fa * rcp(fb)) and then
// corrected by at most one step. The i32 result is finally sign- or
// zero-extended in register from \p DivBits bits.
Value *AMDGPUCodeGenPrepareImpl::expandDivRem24Impl(
    IRBuilder<> &Builder, BinaryOperator &I, Value *Num, Value *Den,
    unsigned DivBits, bool IsDiv, bool IsSigned) const {
  Type *I32Ty = Builder.getInt32Ty();

  // int ia = (int)LHS;
  // int ib = (int)RHS;
  Num = Builder.CreateTrunc(Num, I32Ty);
  Den = Builder.CreateTrunc(Den, I32Ty);

  Type *F32Ty = Builder.getFloatTy();
  ConstantInt *One = Builder.getInt32(1);

  // Quotient correction step. It is 1 for unsigned operations; for signed
  // ones it is ((ia ^ ib) >> 30) | 1, i.e. +1 when the operand signs agree
  // and -1 when they differ.
  Value *JQ = One;
  if (IsSigned) {
    Value *SignMix = Builder.CreateXor(Num, Den);
    Value *SignSplat = Builder.CreateAShr(SignMix, Builder.getInt32(30));
    JQ = Builder.CreateOr(SignSplat, One);
  }

  // Integer to float conversion matching the signedness of the operation.
  auto ToF32 = [&](Value *V) -> Value * {
    return IsSigned ? Builder.CreateSIToFP(V, F32Ty)
                    : Builder.CreateUIToFP(V, F32Ty);
  };

  // float fa = (float)ia;
  Value *FA = ToF32(Num);

  // float fb = (float)ib;
  Value *FB = ToF32(Den);

  // fqm = fa * rcp(fb);
  Value *Rcp = Builder.CreateIntrinsic(Intrinsic::amdgcn_rcp, F32Ty, {FB});
  Value *FQM = Builder.CreateFMul(FA, Rcp);

  // fq = trunc(fqm);
  Value *FQ = Builder.CreateUnaryIntrinsic(Intrinsic::trunc, FQM);
  auto *FQI = dyn_cast<Instruction>(FQ);
  if (FQI)
    FQI->copyFastMathFlags(Builder.getFastMathFlags());

  // float fqneg = -fq;
  Value *FQNeg = Builder.CreateFNeg(FQ);

  // float fr = mad(fqneg, fb, fa);
  // Use the flushing mad intrinsic when the subtarget has mad/mac f32
  // instructions, otherwise a plain fma.
  const Intrinsic::ID FMAD =
      ST.hasMadMacF32Insts()
          ? static_cast<Intrinsic::ID>(Intrinsic::amdgcn_fmad_ftz)
          : static_cast<Intrinsic::ID>(Intrinsic::fma);
  Value *FR =
      Builder.CreateIntrinsic(FMAD, {FQNeg->getType()}, {FQNeg, FB, FA}, FQI);

  // int iq = (int)fq;
  Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty)
                       : Builder.CreateFPToUI(FQ, I32Ty);

  // fr = fabs(fr);
  FR = Builder.CreateFAbs(FR, FQI);

  // fb = fabs(fb);
  FB = Builder.CreateFAbs(FB, FQI);

  // int cv = fr >= fb;
  Value *CV = Builder.CreateFCmpOGE(FR, FB);

  // jq = (cv ? jq : 0);
  JQ = Builder.CreateSelect(CV, JQ, Builder.getInt32(0));

  // dst = iq + jq;
  Value *Quot = Builder.CreateAdd(IQ, JQ);

  Value *Res = Quot;
  if (!IsDiv) {
    // Rem needs compensation, it's easier to recompute it
    Value *QuotTimesDen = Builder.CreateMul(Quot, Den);
    Res = Builder.CreateSub(Num, QuotTimesDen);
  }

  if (DivBits == 0 || DivBits >= 32)
    return Res;

  // Extend in register from the number of bits this divide really is.
  if (IsSigned) {
    const uint64_t InRegBits = 32 - DivBits;
    Res = Builder.CreateShl(Res, InRegBits);
    Res = Builder.CreateAShr(Res, InRegBits);
    return Res;
  }

  ConstantInt *TruncMask = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
  return Builder.CreateAnd(Res, TruncMask);
}


// Try to recognize special cases the DAG will emit special, better expansions
// than the general expansion we do here.
//
// Returns true when the denominator \p Den is a constant or a shift of a
// power-of-two constant that falls into one of those cases. \p Num is not
// inspected.

// TODO: It would be better to just directly handle those optimizations here.
bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I,
                                                         Value *Num,
                                                         Value *Den) const {
  if (Constant *ConstDen = dyn_cast<Constant>(Den)) {
    // Arbitrary constants get a better expansion as long as a wider mulhi is
    // legal.
    if (ConstDen->getType()->getScalarSizeInBits() <= 32)
      return true;

    // TODO: Sdiv check for not exact for some reason.

    // If there's no wider mulhi, there's only a better expansion for powers of
    // two.
    // TODO: Should really know for each vector element.
    return isKnownToBeAPowerOfTwo(ConstDen, true, SQ.getWithInstruction(&I));
  }

  // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
  BinaryOperator *ShlDen = dyn_cast<BinaryOperator>(Den);
  if (!ShlDen || ShlDen->getOpcode() != Instruction::Shl)
    return false;

  Value *ShiftedVal = ShlDen->getOperand(0);
  return isa<Constant>(ShiftedVal) &&
         isKnownToBeAPowerOfTwo(ShiftedVal, true, SQ.getWithInstruction(&I));
}


/// Return a value whose bits all equal the sign bit of \p V: all-ones when
/// \p V is negative and zero otherwise. A constant is returned when the sign
/// is known from computeKnownBits; otherwise an arithmetic shift right by 31
/// is emitted with \p Builder (so \p V is expected to be a 32-bit integer).
///
/// \p DL is taken by const reference to avoid copying the DataLayout on every
/// call.
static Value *getSign32(Value *V, IRBuilder<> &Builder, const DataLayout &DL) {
  // Check whether the sign can be determined statically.
  KnownBits Known = computeKnownBits(V, DL);
  if (Known.isNegative())
    return Constant::getAllOnesValue(V->getType());
  if (Known.isNonNegative())
    return Constant::getNullValue(V->getType());
  return Builder.CreateAShr(V, Builder.getInt32(31));
}


// Expand an integer div/rem of at most 32 bits inline. Returns nullptr when
// the DAG is expected to produce a better expansion; otherwise the operands
// are brought to i32, the 24-bit float expansion is tried first, and the
// general reciprocal based expansion is used as the fallback. The result has
// the type of \p X.
Value *AMDGPUCodeGenPrepareImpl::expandDivRem32(IRBuilder<> &Builder,
                                                BinaryOperator &I, Value *X,
                                                Value *Y) const {
  const Instruction::BinaryOps Opc = I.getOpcode();
  assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
         Opc == Instruction::SRem || Opc == Instruction::SDiv);

  // Floating-point instructions emitted through this builder get all fast
  // math flags from here on.
  FastMathFlags FastFlags;
  FastFlags.setFast();
  Builder.setFastMathFlags(FastFlags);

  if (divHasSpecialOptimization(I, X, Y))
    return nullptr;  // Keep it for later optimization.

  const bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
  const bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;

  Type *Ty = X->getType();
  Type *I32Ty = Builder.getInt32Ty();
  Type *F32Ty = Builder.getFloatTy();

  // Resize with the extension kind that matches the operation's signedness.
  auto ExtOrTrunc = [&](Value *V, Type *DstTy) -> Value * {
    return IsSigned ? Builder.CreateSExtOrTrunc(V, DstTy)
                    : Builder.CreateZExtOrTrunc(V, DstTy);
  };

  if (Ty->getScalarSizeInBits() != 32) {
    X = ExtOrTrunc(X, I32Ty);
    Y = ExtOrTrunc(Y, I32Ty);
  }

  // Prefer the cheaper 24-bit expansion when it applies.
  if (Value *Res24 = expandDivRem24(Builder, I, X, Y, IsDiv, IsSigned))
    return ExtOrTrunc(Res24, Ty);

  ConstantInt *Zero = Builder.getInt32(0);
  ConstantInt *One = Builder.getInt32(1);

  // For signed operations, take absolute values via (v + sign) ^ sign and
  // remember the sign to re-apply to the result.
  Value *Sign = nullptr;
  if (IsSigned) {
    Value *SignX = getSign32(X, Builder, DL);
    Value *SignY = getSign32(Y, Builder, DL);
    // Remainder sign is the same as LHS
    Sign = IsDiv ? Builder.CreateXor(SignX, SignY) : SignX;

    X = Builder.CreateAdd(X, SignX);
    Y = Builder.CreateAdd(Y, SignY);

    X = Builder.CreateXor(X, SignX);
    Y = Builder.CreateXor(Y, SignY);
  }

  // The algorithm here is based on ideas from "Software Integer Division", Tom
  // Rodeheffer, August 2008.
  //
  // unsigned udiv(unsigned x, unsigned y) {
  //   // Initial estimate of inv(y). The constant is less than 2^32 to ensure
  //   // that this is a lower bound on inv(y), even if some of the calculations
  //   // round up.
  //   unsigned z = (unsigned)((4294967296.0 - 512.0) * v_rcp_f32((float)y));
  //
  //   // One round of UNR (Unsigned integer Newton-Raphson) to improve z.
  //   // Empirically this is guaranteed to give a "two-y" lower bound on
  //   // inv(y).
  //   z += umulh(z, -y * z);
  //
  //   // Quotient/remainder estimate.
  //   unsigned q = umulh(x, z);
  //   unsigned r = x - q * y;
  //
  //   // Two rounds of quotient/remainder refinement.
  //   if (r >= y) {
  //     ++q;
  //     r -= y;
  //   }
  //   if (r >= y) {
  //     ++q;
  //     r -= y;
  //   }
  //
  //   return q;
  // }

  // Initial estimate of inv(y).
  Value *FloatY = Builder.CreateUIToFP(Y, F32Ty);
  Value *RcpY = Builder.CreateIntrinsic(Intrinsic::amdgcn_rcp, F32Ty, {FloatY});
  Constant *Scale = ConstantFP::get(F32Ty, llvm::bit_cast<float>(0x4F7FFFFE));
  Value *ScaledY = Builder.CreateFMul(RcpY, Scale);
  Value *Z = Builder.CreateFPToUI(ScaledY, I32Ty);

  // One round of UNR.
  Value *NegY = Builder.CreateSub(Zero, Y);
  Value *NegYZ = Builder.CreateMul(NegY, Z);
  Z = Builder.CreateAdd(Z, getMulHu(Builder, Z, NegYZ));

  // Quotient/remainder estimate.
  Value *Q = getMulHu(Builder, X, Z);
  Value *R = Builder.CreateSub(X, Builder.CreateMul(Q, Y));

  // First quotient/remainder refinement.
  Value *NeedsStep = Builder.CreateICmpUGE(R, Y);
  if (IsDiv)
    Q = Builder.CreateSelect(NeedsStep, Builder.CreateAdd(Q, One), Q);
  R = Builder.CreateSelect(NeedsStep, Builder.CreateSub(R, Y), R);

  // Second quotient/remainder refinement; only the requested result (quotient
  // or remainder) is materialized.
  NeedsStep = Builder.CreateICmpUGE(R, Y);
  Value *Res =
      IsDiv ? Builder.CreateSelect(NeedsStep, Builder.CreateAdd(Q, One), Q)
            : Builder.CreateSelect(NeedsStep, Builder.CreateSub(R, Y), R);

  // Re-apply the sign: (res ^ sign) - sign.
  if (IsSigned) {
    Res = Builder.CreateXor(Res, Sign);
    Res = Builder.CreateSub(Res, Sign);
  }

  return ExtOrTrunc(Res, Ty);
}


// Try to perform a 64-bit div/rem with one of the narrower (24- or 32-bit)
// expansions when the operands are known to need at most 32 bits. Returns the
// result extended back to the type of \p Num, or nullptr when the operation
// cannot (or should not) be shrunk.
Value *AMDGPUCodeGenPrepareImpl::shrinkDivRem64(IRBuilder<> &Builder,
                                                BinaryOperator &I, Value *Num,
                                                Value *Den) const {
  if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den))
    return nullptr;  // Keep it for later optimization.

  const Instruction::BinaryOps Opc = I.getOpcode();
  const bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv;
  const bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem;

  const unsigned NumDivBits = getDivNumBits(I, Num, Den, 32, IsSigned);
  if (NumDivBits > 32)
    return nullptr;

  // v_rcp_f32(float(X)) can have an error of 1 ulp.
  // This can cause expandDivRem24Impl to sometimes calculate Y/X incorrectly
  // when abs(Y)>0x800000.
  // For example,
  // (0xbf2758/0xbf2759) erroneously produces 1 instead of 0.
  // (0xe3170d/0x000c32) erroneously produces 4767 instead of 4766.
  //
  // Note that for NumDivBits==24 && IsSigned, Y is in the range
  // [-0x800000:0x7FFFFF]. abs(Y) is at most
  // 0x800000 so it cannot hit this issue.
  const unsigned MaxBitsFor24 = IsSigned ? 24 : 23;

  // At this point NumDivBits <= 32, so anything too wide for the 24-bit path
  // goes through the 32-bit expansion.
  Value *Narrowed =
      NumDivBits <= MaxBitsFor24
          ? expandDivRem24Impl(Builder, I, Num, Den, NumDivBits, IsDiv,
                               IsSigned)
          : expandDivRem32(Builder, I, Num, Den);
  if (!Narrowed)
    return nullptr;

  return IsSigned ? Builder.CreateSExt(Narrowed, Num->getType())
                  : Builder.CreateZExt(Narrowed, Num->getType());
}


// Do the general expansion of a div/rem by dispatching to the generic
// division or remainder expansion utility. Any other opcode is a programming
// error.
void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &I) const {
  switch (I.getOpcode()) {
  case Instruction::UDiv:
  case Instruction::SDiv:
    expandDivisionUpTo64Bits(&I);
    return;
  case Instruction::URem:
  case Instruction::SRem:
    expandRemainderUpTo64Bits(&I);
    return;
  default:
    llvm_unreachable("not a division");
  }
}


/*
This can produce an inconsistency for non-byte-sized loads, for example:
```
    %load = load i1, ptr addrspace(4) %arg, align 4
    %zext = zext i1 %load to i64
    %add = add i64 %zext, ...
```
Instead of creating `s_and_b32 s0, s0, 1`,
it will create `s_and_b32 s0, s0, 0xff`.
We accept this change since the non-byte load assumes the upper bits
within the byte are all 0.
*/

/// Try to perform an integer add or mul in a narrower legal integer type when
/// known bits show the result fits, and the target cost model says the
/// narrowed sequence (truncs + narrow op + zext) is cheaper than the original.
///
/// On success the operands are truncated, the operation is redone in the
/// narrow type, the result is zero-extended back, all uses of \p I are
/// replaced and \p I is queued in DeadVals.
///
/// \returns true if \p I was replaced.
bool AMDGPUCodeGenPrepareImpl::tryNarrowMathIfNoOverflow(Instruction *I) {
  unsigned Opc = I->getOpcode();
  Type *OldType = I->getType();

  // Only valid for Instruction::Add and Instruction::Mul.
  if (Opc != Instruction::Add && Opc != Instruction::Mul)
    return false;

  unsigned OrigBit = OldType->getScalarSizeInBits();

  // Number of bits that may be set in the result, rounded up to a power of
  // two and to at least 8 bits.
  unsigned MaxBitsNeeded = computeKnownBits(I, DL).countMaxActiveBits();

  MaxBitsNeeded = std::max<unsigned>(bit_ceil(MaxBitsNeeded), 8);
  Type *NewType = DL.getSmallestLegalIntType(I->getContext(), MaxBitsNeeded);
  if (!NewType)
    return false;
  unsigned NewBit = NewType->getIntegerBitWidth();
  // Nothing to gain unless the legal type is strictly narrower.
  if (NewBit >= OrigBit)
    return false;
  // Rebuild the type from the original so vectors keep their element count.
  NewType = I->getType()->getWithNewBitWidth(NewBit);

  // Old cost
  const TargetTransformInfo &TTI = TM.getTargetTransformInfo(F);
  InstructionCost OldCost =
      TTI.getArithmeticInstrCost(Opc, OldType, TTI::TCK_RecipThroughput);
  // New cost of new op
  InstructionCost NewCost =
      TTI.getArithmeticInstrCost(Opc, NewType, TTI::TCK_RecipThroughput);
  // New cost of narrowing 2 operands (use trunc)
  int NumOfNonConstOps = 2;
  if (isa<Constant>(I->getOperand(0)) || isa<Constant>(I->getOperand(1))) {
    // Cannot be both constant, should be propagated
    NumOfNonConstOps = 1;
  }
  NewCost += NumOfNonConstOps * TTI.getCastInstrCost(Instruction::Trunc,
                                                     NewType, OldType,
                                                     TTI.getCastContextHint(I),
                                                     TTI::TCK_RecipThroughput);
  // New cost of zext narrowed result to original type
  NewCost +=
      TTI.getCastInstrCost(Instruction::ZExt, OldType, NewType,
                           TTI.getCastContextHint(I), TTI::TCK_RecipThroughput);
  if (NewCost >= OldCost)
    return false;

  IRBuilder<> Builder(I);
  Value *Trunc0 = Builder.CreateTrunc(I->getOperand(0), NewType);
  Value *Trunc1 = Builder.CreateTrunc(I->getOperand(1), NewType);
  Value *Arith =
      Builder.CreateBinOp((Instruction::BinaryOps)Opc, Trunc0, Trunc1);

  Value *Zext = Builder.CreateZExt(Arith, OldType);
  I->replaceAllUsesWith(Zext);
  DeadVals.push_back(I);
  return true;
}


// Visit a binary operator. The generic rewrites (fold into select, mul24,
// narrowing) are tried first; if none applies, integer div/rem of up to 64
// bits are expanded inline. Vector operations are scalarized element by
// element. 64-bit operations that cannot be shrunk are collected and, when
// ExpandDiv64InIR is set, expanded with the general (control flow
// introducing) expansion. Returns true if anything changed.
bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
  if (foldBinOpIntoSelect(I) || (UseMul24Intrin && replaceMulWithMul24(I)) ||
      tryNarrowMathIfNoOverflow(&I))
    return true;

  bool Changed = false;
  SmallVector<BinaryOperator *, 8> Div64ToExpand;

  const Instruction::BinaryOps Opc = I.getOpcode();
  Type *Ty = I.getType();
  const unsigned ScalarSize = Ty->getScalarSizeInBits();
  const bool IsIntDivRem =
      Opc == Instruction::URem || Opc == Instruction::UDiv ||
      Opc == Instruction::SRem || Opc == Instruction::SDiv;

  if (IsIntDivRem && ScalarSize <= 64 && !DisableIDivExpand) {
    Value *Num = I.getOperand(0);
    Value *Den = I.getOperand(1);
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    const bool FitsIn32 = ScalarSize <= 32;
    Value *NewDiv = nullptr;

    if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
      NewDiv = PoisonValue::get(VT);

      for (unsigned Idx = 0, NumElts = VT->getNumElements(); Idx != NumElts;
           ++Idx) {
        Value *NumElt = Builder.CreateExtractElement(Num, Idx);
        Value *DenElt = Builder.CreateExtractElement(Den, Idx);

        // For 64-bit elements, see if the division can be shrunk to 32/24-bits
        // before producing the general expansion.
        Value *NewElt = FitsIn32
                            ? expandDivRem32(Builder, I, NumElt, DenElt)
                            : shrinkDivRem64(Builder, I, NumElt, DenElt);
        if (!NewElt) {
          // No inline expansion: insert a scalar copy of the operation.
          NewElt = Builder.CreateBinOp(Opc, NumElt, DenElt);

          if (!FitsIn32) {
            // The general 64-bit expansion introduces control flow and doesn't
            // return the new value, so defer expanding the scalar copy.
            //
            // CreateBinOp does constant folding. If the operands are constant,
            // it will return a Constant instead of a BinaryOperator.
            if (auto *NewEltBO = dyn_cast<BinaryOperator>(NewElt))
              Div64ToExpand.push_back(NewEltBO);
          }
        }

        if (auto *NewEltI = dyn_cast<Instruction>(NewElt))
          NewEltI->copyIRFlags(&I);

        NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, Idx);
      }
    } else if (FitsIn32) {
      NewDiv = expandDivRem32(Builder, I, Num, Den);
    } else {
      NewDiv = shrinkDivRem64(Builder, I, Num, Den);
      if (!NewDiv)
        Div64ToExpand.push_back(&I);
    }

    if (NewDiv) {
      I.replaceAllUsesWith(NewDiv);
      DeadVals.push_back(&I);
      Changed = true;
    }
  }

  if (ExpandDiv64InIR) {
    // TODO: We get much worse code in specially handled constant cases.
    for (BinaryOperator *Div : Div64ToExpand) {
      expandDivRem64(*Div);
      FlowChanged = true;
      Changed = true;
    }
  }

  return Changed;
}


// When load widening is enabled, replace an eligible load from one of the
// constant address spaces with an i32 load followed by a truncate and bitcast
// back to the original type. Returns true if the load was replaced.
bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) {
  if (!WidenLoads)
    return false;

  const bool IsConstantAS =
      I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
      I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
  if (!IsConstantAS || !canWidenScalarExtLoad(I))
    return false;

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = Builder.getInt32Ty();
  LoadInst *WidenLoad = Builder.CreateLoad(I32Ty, I.getPointerOperand());
  AMDGPU::copyMetadataForWidenedLoad(*WidenLoad, I);

  // The widened load reads the original bytes in the low bits, so a !range
  // lower bound still holds. Convert it to the new type and don't make
  // assumptions about the high bits.
  if (auto *Range = I.getMetadata(LLVMContext::MD_range)) {
    ConstantInt *Lower = mdconst::extract<ConstantInt>(Range->getOperand(0));

    // A zero lower bound carries no information; attach nothing in that case.
    if (!Lower->isNullValue()) {
      Metadata *LowAndHigh[] = {
          ConstantAsMetadata::get(
              ConstantInt::get(I32Ty, Lower->getValue().zext(32))),
          // Don't make assumptions about the high bits.
          ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0))};

      WidenLoad->setMetadata(LLVMContext::MD_range,
                             MDNode::get(F.getContext(), LowAndHigh));
    }
  }

  // Narrow the i32 back to an integer of the original size, then reinterpret
  // it as the original type.
  int TySize = DL.getTypeSizeInBits(I.getType());
  Type *IntNTy = Builder.getIntNTy(TySize);
  Value *Narrowed = Builder.CreateTrunc(WidenLoad, IntNTy);
  Value *Replacement = Builder.CreateBitCast(Narrowed, I.getType());
  I.replaceAllUsesWith(Replacement);
  DeadVals.push_back(&I);
  return true;
}


/// Recognize select-based implementations of fract and replace them with the
/// value produced by applyFractPat.
///
/// Two families of patterns are handled: an unordered fmin of
/// (x - floor(x)) against a constant, and legacy forms where a nan check
/// (optionally combined with an inf clamp) wraps a nan-avoidant fract pattern.
///
/// \returns true if \p I was replaced (and queued in DeadVals), false if no
/// pattern matched.
bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) {
  // Only selects that are floating-point math operators are candidates.
  FPMathOperator *FPOp = dyn_cast<FPMathOperator>(&I);
  if (!FPOp)
    return false;

  Value *X;
  Value *Fract = nullptr;

  // Match:
  //   (x - floor(x)) >= MIN_CONSTANT ? MIN_CONSTANT : (x - floor(x))
  //
  // This is the preferred way to implement fract.
  // TODO: Could also match with compare against 1.0
  const APFloat *C;
  if (match(&I, m_UnordFMin(m_Value(X), m_APFloatAllowPoison(C)))) {
    // An unordered fmin that is not a fract pattern is left alone; the legacy
    // patterns below are not tried in that case.
    Value *FractSrc = matchFractPatImpl(*X, *C);
    if (!FractSrc)
      return false;
    IRBuilder<> Builder(&I);
    Builder.setFastMathFlags(FPOp->getFastMathFlags());
    Fract = applyFractPat(Builder, FractSrc);
  } else {
    // Match patterns which may appear in legacy implementations of the fract()
    // function, built around the nan-avoidant minnum intrinsic. These are the
    // core pattern plus additional clamping of inf and nan values on the
    // result.
    Value *Cond = I.getCondition();
    Value *TrueVal = I.getTrueValue();
    Value *FalseVal = I.getFalseValue();
    Value *CmpVal;
    CmpPredicate IsNanPred;

    // Match fract pattern with nan check.
    if (!match(Cond, m_FCmp(IsNanPred, m_Value(CmpVal), m_NonNaN())))
      return false;

    IRBuilder<> Builder(&I);
    Builder.setFastMathFlags(FPOp->getFastMathFlags());

    if (IsNanPred == FCmpInst::FCMP_UNO && TrueVal == CmpVal &&
        CmpVal == matchFractPatNanAvoidant(*FalseVal)) {
      // isnan(x) ? x : fract(x)
      Fract = applyFractPat(Builder, CmpVal);
    } else if (IsNanPred == FCmpInst::FCMP_ORD && FalseVal == CmpVal) {
      if (CmpVal == matchFractPatNanAvoidant(*TrueVal)) {
        // !isnan(x) ? fract(x) : x
        Fract = applyFractPat(Builder, CmpVal);
      } else {
        // Match an intermediate clamp infinity to 0 pattern. i.e.
        // !isnan(x) ? (!isinf(x) ? fract(x) : 0.0) : x
        CmpPredicate PredInf;
        Value *IfNotInf;

        // The inner select must compare fabs(x) against +inf with an `une`
        // predicate, select +0.0 on the false side, and hold the nan-avoidant
        // fract of the same x on the true side.
        if (!match(TrueVal, m_Select(m_FCmp(PredInf, m_FAbs(m_Specific(CmpVal)),
                                            m_PosInf()),
                                     m_Value(IfNotInf), m_PosZeroFP())) ||
            PredInf != FCmpInst::FCMP_UNE ||
            CmpVal != matchFractPatNanAvoidant(*IfNotInf))
          return false;

        SelectInst *ClampInfSelect = cast<SelectInst>(TrueVal);

        // Insert before the fabs
        // (operand 0 of the inner select's compare, per the match above).
        Value *InsertPt =
            cast<Instruction>(ClampInfSelect->getCondition())->getOperand(0);

        Builder.SetInsertPoint(cast<Instruction>(InsertPt));
        Value *NewFract = applyFractPat(Builder, CmpVal);
        NewFract->takeName(TrueVal);

        // Thread the new fract into the inf clamping sequence.
        // The old true operand is queued in DeadVals before being replaced.
        DeadVals.push_back(ClampInfSelect->getOperand(1));
        ClampInfSelect->setOperand(1, NewFract);

        // The outer select nan handling is also absorbed into the fract.
        Fract = ClampInfSelect;
      }
    } else
      return false;
  }

  // Replace the original select with the fract value and queue it in DeadVals.
  Fract->takeName(&I);
  I.replaceAllUsesWith(Fract);
  DeadVals.push_back(&I);
  return true;
}


static bool areInSameBB(const Value *A, const Value *B) {

  const auto *IA = dyn_cast<Instruction>(A);

  const auto *IB = dyn_cast<Instruction>(B);

  return IA && IB && IA->getParent() == IB->getParent();

}


// Helper for breaking large PHIs that returns true when an extractelement on V
// is likely to be folded away by the DAG combiner.
//
// V is considered interesting when it is a fixed vector that is either fully
// built by a same-block chain of constant-index insertelements, a constant, or
// a shufflevector whose second operand is a constant or that has a source in
// its own block.

static bool isInterestingPHIIncomingValue(const Value *V) {
  const auto *FVT = dyn_cast<FixedVectorType>(V->getType());
  if (!FVT)
    return false;

  const unsigned NumElts = FVT->getNumElements();

  // Walk up the insertelement chain, keeping track of the elements covered.
  BitVector EltsCovered(NumElts);
  const Value *CurVal = V;
  while (const auto *IE = dyn_cast<InsertElementInst>(CurVal)) {
    // Non constant index/out of bounds index -> folding is unlikely.
    // The latter is more of a sanity check because canonical IR should just
    // have replaced those with poison.
    const auto *Idx = dyn_cast<ConstantInt>(IE->getOperand(2));
    if (!Idx || Idx->getZExtValue() >= NumElts)
      return false;

    // If the vector source is another instruction, it must be in the same basic
    // block. Otherwise, the DAGCombiner won't see the whole thing and is
    // unlikely to be able to do anything interesting here.
    const auto *VecSrc = IE->getOperand(0);
    if (isa<Instruction>(VecSrc) && !areInSameBB(VecSrc, IE))
      return false;

    CurVal = VecSrc;
    EltsCovered.set(Idx->getZExtValue());

    // All elements covered.
    if (EltsCovered.all())
      return true;
  }

  // We either didn't find a single insertelement, or the insertelement chain
  // ended before all elements were covered. Check for other interesting values.

  // Constants are always interesting because we can just constant fold the
  // extractelements.
  if (isa<Constant>(CurVal))
    return true;

  // Anything that is not a shufflevector is uninteresting at this point.
  const auto *SV = dyn_cast<ShuffleVectorInst>(CurVal);
  if (!SV)
    return false;

  // shufflevector is likely to be profitable if its second operand is a
  // constant, or if either source is in the same block.
  // This is because shufflevector is most often lowered as a series of
  // insert/extract elements anyway.
  return isa<Constant>(SV->getOperand(1)) ||
         areInSameBB(SV, SV->getOperand(0)) ||
         areInSameBB(SV, SV->getOperand(1));
}


static void collectPHINodes(const PHINode &I,

                            SmallPtrSet<const PHINode *, 8> &SeenPHIs) {

  const auto [It, Inserted] = SeenPHIs.insert(&I);

  if (!Inserted)

    return;


  for (const Value *Inc : I.incoming_values()) {

    if (const auto *PhiInc = dyn_cast<PHINode>(Inc))

      collectPHINodes(*PhiInc, SeenPHIs);

  }


  for (const User *U : I.users()) {

    if (const auto *PhiU = dyn_cast<PHINode>(U))

      collectPHINodes(*PhiU, SeenPHIs);

  }

}


bool AMDGPUCodeGenPrepareImpl::canBreakPHINode(const PHINode &I) {

  // Check in the cache first.

  if (const auto It = BreakPhiNodesCache.find(&I);

      It != BreakPhiNodesCache.end())

    return It->second;


  // We consider PHI nodes as part of "chains", so given a PHI node I, we

  // recursively consider all its users and incoming values that are also PHI

  // nodes. We then make a decision about all of those PHIs at once. Either they

  // all get broken up, or none of them do. That way, we avoid cases where a

  // single PHI is/is not broken and we end up reforming/exploding a vector

  // multiple times, or even worse, doing it in a loop.

  SmallPtrSet<const PHINode *, 8> WorkList;

  collectPHINodes(I, WorkList);


#ifndef NDEBUG

  // Check that none of the PHI nodes in the worklist are in the map. If some of

  // them are, it means we're not good enough at collecting related PHIs.

  for (const PHINode *WLP : WorkList) {

    assert(BreakPhiNodesCache.count(WLP) == 0);

  }

#endif


  // To consider a PHI profitable to break, we need to see some interesting

  // incoming values. At least 2/3rd (rounded up) of all PHIs in the worklist

  // must have one to consider all PHIs breakable.

  //

  // This threshold has been determined through performance testing.

  //

  // Note that the computation below is equivalent to

  //

  //    (unsigned)ceil((K / 3.0) * 2)

  //

  // It's simply written this way to avoid mixing integral/FP arithmetic.

  const auto Threshold = (alignTo(WorkList.size() * 2, 3) / 3);

  unsigned NumBreakablePHIs = 0;

  bool CanBreak = false;

  for (const PHINode *Cur : WorkList) {

    // Don't break PHIs that have no interesting incoming values. That is, where

    // there is no clear opportunity to fold the "extractelement" instructions

    // we would add.

    //

    // Note: IC does not run after this pass, so we're only interested in the

    // foldings that the DAG combiner can do.

    if (any_of(Cur->incoming_values(), isInterestingPHIIncomingValue)) {

      if (++NumBreakablePHIs >= Threshold) {

        CanBreak = true;

        break;

      }

    }

  }


  for (const PHINode *Cur : WorkList)

    BreakPhiNodesCache[Cur] = CanBreak;


  return CanBreak;

}


/// Helper class for "break large PHIs" (visitPHINode).

///

/// This represents a slice of a PHI's incoming value, which is made up of:

///   - The type of the slice (Ty)

///   - The index in the incoming value's vector where the slice starts (Idx)

///   - The number of elements in the slice (NumElts).

/// It also keeps track of the NewPHI node inserted for this particular slice.

///

/// Slice examples:

///   <4 x i64> -> Split into four i64 slices.

///     -> [i64, 0, 1], [i64, 1, 1], [i64, 2, 1], [i64, 3, 1]

///   <5 x i16> -> Split into 2 <2 x i16> slices + a i16 tail.

///     -> [<2 x i16>, 0, 2], [<2 x i16>, 2, 2], [i16, 4, 1]


class VectorSlice {
public:
  VectorSlice(Type *Ty, unsigned Idx, unsigned NumElts)
      : Ty(Ty), Idx(Idx), NumElts(NumElts) {}

  /// Type of the slice: a scalar when NumElts is 1, a sub-vector otherwise.
  Type *Ty = nullptr;
  /// Index of the slice's first element within the incoming vector.
  unsigned Idx = 0;
  /// Number of vector elements covered by the slice.
  unsigned NumElts = 0;
  /// PHI node created for this slice; assigned by the code using the slice.
  PHINode *NewPHI = nullptr;

  /// Return the part of \p Inc described by this slice, materialized right
  /// before the terminator of \p BB and named \p NewValName.
  ///
  /// Results are cached per (\p BB, \p Inc) pair: asking again for the same
  /// pair hands back the value created the first time.
  ///
  /// The cache key *intentionally* includes the block, so [%bb.0, %0] and
  /// [%bb.1, %0] get distinct values:
  ///   - Sharing one value could cause dominance issues (e.g. if bb.1 is seen
  ///   first, the value created in bb.1 may not be reachable from bb.0 if
  ///   bb.0 is its predecessor).
  ///   - Keeping the extracts as local as possible gives the DAG better
  ///   chances of folding them out, so duplicating them is beneficial.
  ///
  /// Caching is a minor optimization that avoids duplicate instructions, but
  /// it is also required for correctness. A PHI node is allowed to list the
  /// same [BB, Val] pair several times. Creating a new value for each
  /// occurrence would give those identical pairs different incoming values
  /// from the same block and trigger the "PHI node has multiple entries for
  /// the same basic block with different incoming values!" verifier error.
  Value *getSlicedVal(BasicBlock *BB, Value *Inc, StringRef NewValName) {
    Value *&Cached = SlicedVals[{BB, Inc}];
    if (!Cached) {
      IRBuilder<> B(BB->getTerminator());

      // Give the new instruction the debug location of the incoming value
      // when that value is an instruction.
      if (auto *IncInst = dyn_cast<Instruction>(Inc))
        B.SetCurrentDebugLocation(IncInst->getDebugLoc());

      if (NumElts > 1) {
        // Multi-element slice: shuffle out lanes Idx .. Idx + NumElts - 1.
        SmallVector<int, 4> Mask;
        for (unsigned Off = 0; Off != NumElts; ++Off)
          Mask.push_back(Idx + Off);
        Cached = B.CreateShuffleVector(Inc, Mask, NewValName);
      } else {
        // Single-element slice: a plain extractelement is enough.
        Cached = B.CreateExtractElement(Inc, Idx, NewValName);
      }
    }

    return Cached;
  }

private:
  /// Cache of already created slice values, keyed by (block, incoming value).
  SmallDenseMap<std::pair<BasicBlock *, Value *>, Value *> SlicedVals;
};


/// Replace a large fixed-vector PHI \p I by one PHI per slice of the vector,
/// then rebuild the full vector from those PHIs. Returns true when \p I was
/// replaced (it is then queued in DeadVals), false when nothing was changed.
bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {
  // Fixed-vector PHIs are split into smaller pieces. With the default
  // threshold of 32, any vector wider than 32 bits is broken into its
  // elements, or into 32-bit pieces for 8/16 bit elements.
  //
  // This only helps DAGISel, which does not handle large PHIs as well as
  // GlobalISel. DAGISel lowers PHIs through CopyToReg/CopyFromReg; with large,
  // odd-sized PHIs that may require many `build_vector` operations whose
  // elements are mostly "undef". That inhibits a lot of optimization
  // opportunities and can lead to unreasonably high register pressure and the
  // inevitable stack spilling.
  if (!BreakLargePHIs || getCGPassBuilderOption().EnableGlobalISelOption)
    return false;

  auto *FVT = dyn_cast<FixedVectorType>(I.getType());
  if (!FVT)
    return false;

  const unsigned NumElts = FVT->getNumElements();
  if (NumElts == 1 || DL.getTypeSizeInBits(FVT) <= BreakLargePHIsThreshold)
    return false;

  // Unless breaking is forced, ask the profitability heuristic.
  if (!(ForceBreakLargePHIs || canBreakPHINode(I)))
    return false;

  // Work out how the vector is cut into slices.
  std::vector<VectorSlice> Slices;
  Type *EltTy = FVT->getElementType();
  const unsigned EltSize = DL.getTypeSizeInBits(EltTy);
  unsigned NextElt = 0;

  // 8/16 bit elements are not fully scalarized: form as many 32-bit
  // sub-vectors as fit and leave only the tail to be scalarized below.
  if (EltSize == 8 || EltSize == 16) {
    const unsigned SubVecSize = (32 / EltSize);
    Type *SubVecTy = FixedVectorType::get(EltTy, SubVecSize);
    const unsigned SubVecEnd = alignDown(NumElts, SubVecSize);
    while (NextElt < SubVecEnd) {
      Slices.emplace_back(SubVecTy, NextElt, SubVecSize);
      NextElt += SubVecSize;
    }
  }

  // Whatever has not been covered yet becomes one scalar slice per element.
  while (NextElt < NumElts) {
    Slices.emplace_back(EltTy, NextElt, 1);
    ++NextElt;
  }

  assert(Slices.size() > 1);

  // One PHI is created per slice. The "VectorSlice" class creates the
  // instructions that extract the matching piece of each incoming value.
  BasicBlock *Parent = I.getParent();
  IRBuilder<> B(Parent);
  B.SetCurrentDebugLocation(I.getDebugLoc());

  const unsigned NumIncoming = I.getNumIncomingValues();
  unsigned IncNameSuffix = 0;
  for (VectorSlice &S : Slices) {
    // The insert point is recomputed for every slice because getSlicedVal may
    // have inserted something into I's own block.
    B.SetInsertPoint(Parent->getFirstNonPHIIt());
    S.NewPHI = B.CreatePHI(S.Ty, NumIncoming);

    for (unsigned In = 0; In != NumIncoming; ++In) {
      BasicBlock *Pred = I.getIncomingBlock(In);
      // The suffix advances for every incoming entry, including those for
      // which getSlicedVal returns an already cached value.
      const std::string PieceName =
          "largephi.extractslice" + std::to_string(IncNameSuffix++);
      Value *Piece = S.getSlicedVal(Pred, I.getIncomingValue(In), PieceName);
      S.NewPHI->addIncoming(Piece, Pred);
    }
  }

  // Rebuild the full vector from the per-slice PHIs, starting from poison.
  Value *Vec = PoisonValue::get(FVT);
  for (const auto &[SliceNo, S] : enumerate(Slices)) {
    const auto ValName = "largephi.insertslice" + std::to_string(SliceNo);
    if (S.NumElts > 1)
      Vec = B.CreateInsertVector(FVT, Vec, S.NewPHI, S.Idx, ValName);
    else
      Vec = B.CreateInsertElement(Vec, S.NewPHI, S.Idx, ValName);
  }

  // The original PHI is now unused; queue it for deletion.
  I.replaceAllUsesWith(Vec);
  DeadVals.push_back(&I);
  return true;
}


/// \param V  Value to check

/// \param DL DataLayout

/// \param TM TargetMachine (TODO: remove once DL contains nullptr values)

/// \param AS Target Address Space

/// \return true if \p V cannot be the null value of \p AS, false otherwise.


static bool isPtrKnownNeverNull(const Value *V, const DataLayout &DL,

                                const AMDGPUTargetMachine &TM, unsigned AS) {

  // Pointer cannot be null if it's a block address, GV or alloca.

  // NOTE: We don't support extern_weak, but if we did, we'd need to check for

  // it as the symbol could be null in such cases.

  if (isa<BlockAddress, GlobalValue, AllocaInst>(V))

    return true;


  // Check nonnull arguments.

  if (const auto *Arg = dyn_cast<Argument>(V); Arg && Arg->hasNonNullAttr())

    return true;


  // Check nonnull loads.

  if (const auto *Load = dyn_cast<LoadInst>(V);

      Load && Load->hasMetadata(LLVMContext::MD_nonnull))

    return true;


  // getUnderlyingObject may have looked through another addrspacecast, although

  // the optimizable situations most likely folded out by now.

  if (AS != cast<PointerType>(V->getType())->getAddressSpace())

    return false;


  // TODO: Calls that return nonnull?


  // For all other things, use KnownBits.

  // We either use 0 or all bits set to indicate null, so check whether the

  // value can be zero or all ones.

  //

  // TODO: Use ValueTracking's isKnownNeverNull if it becomes aware that some

  // address spaces have non-zero null values.

  auto SrcPtrKB = computeKnownBits(V, DL);

  const auto NullVal = AMDGPU::getNullPointerValue(AS);


  assert(SrcPtrKB.getBitWidth() == DL.getPointerSizeInBits(AS));

  assert((NullVal == 0 || NullVal == -1) &&

         "don't know how to check for this null value!");

  return NullVal ? !SrcPtrKB.getMaxValue().isAllOnes() : SrcPtrKB.isNonZero();

}


/// Turn an addrspacecast between flat and local/private into a call to
/// amdgcn.addrspacecast.nonnull when every underlying object of the source
/// pointer is known to be non-null. Returns true when \p I was replaced.
bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
  // The intrinsic doesn't support vectors. It also seems to be difficult to
  // prove that a vector has no null element, so it is unclear whether adding
  // support would be worthwhile.
  if (I.getType()->isVectorTy())
    return false;

  // Lowering to amdgcn.addrspacecast.nonnull is only worthwhile for casts
  // from/to priv/local to flat.
  const unsigned SrcAS = I.getSrcAddressSpace();
  const unsigned DstAS = I.getDestAddressSpace();

  const auto IsLocalOrPrivate = [](unsigned AS) {
    return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS;
  };
  const bool FlatToLocalOrPrivate =
      SrcAS == AMDGPUAS::FLAT_ADDRESS && IsLocalOrPrivate(DstAS);
  const bool LocalOrPrivateToFlat =
      DstAS == AMDGPUAS::FLAT_ADDRESS && IsLocalOrPrivate(SrcAS);
  if (!FlatToLocalOrPrivate && !LocalOrPrivateToFlat)
    return false;

  // Every object the source pointer may be based on has to be non-null.
  Value *Src = I.getOperand(0);
  SmallVector<const Value *, 4> Objects;
  getUnderlyingObjects(Src, Objects);
  const bool MayBeNull = any_of(Objects, [&](const Value *Obj) {
    return !isPtrKnownNeverNull(Obj, DL, TM, SrcAS);
  });
  if (MayBeNull)
    return false;

  // Emit the intrinsic in front of the cast and retire the cast.
  IRBuilder<> B(&I);
  auto *NonNullCast = B.CreateIntrinsic(
      I.getType(), Intrinsic::amdgcn_addrspacecast_nonnull, {Src});
  I.replaceAllUsesWith(NonNullCast);
  DeadVals.push_back(&I);
  return true;
}


/// Dispatch an intrinsic call to the visitor handling its intrinsic ID.
/// Returns that visitor's result, or false for intrinsics without a handler.
bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
  const Intrinsic::ID IID = I.getIntrinsicID();
  switch (IID) {
  // Floating-point minimum family.
  case Intrinsic::minimum:
  case Intrinsic::minimumnum:
  case Intrinsic::minnum:
    return visitFMinLike(I);

  case Intrinsic::sqrt:
    return visitSqrt(I);

  // log and log10 share one visitor, which needs to know which one it got.
  case Intrinsic::log10:
  case Intrinsic::log:
    return visitLog(cast<FPMathOperator>(I), IID);

  // Lane-count intrinsics.
  case Intrinsic::amdgcn_mbcnt_lo:
    return visitMbcntLo(I);
  case Intrinsic::amdgcn_mbcnt_hi:
    return visitMbcntHi(I);

  // dot4 formation.
  case Intrinsic::vector_reduce_add:
    return visitVectorReduceAdd(I);
  case Intrinsic::sadd_sat:
  case Intrinsic::uadd_sat:
    return visitSaturatingAdd(I);

  // No reason to handle log2; it is left alone like every other intrinsic.
  case Intrinsic::log2:
  default:
    return false;
  }
}


/// Match the core sequence in the fract pattern (x - floor(x)), which doesn't
/// need to consider edge case handling.

Value *AMDGPUCodeGenPrepareImpl::matchFractPatImpl(Value &FractSrc,
                                                   const APFloat &C) const {
  // Nothing to match on subtargets with the fract bug, or when the scalar
  // type is not a legal floating-point type.
  if (ST.hasFractBug() ||
      !isLegalFloatingTy(FractSrc.getType()->getScalarType()))
    return nullptr;

  // The constant has to be nextafter(1.0, -1): start from 1.0 in C's
  // semantics and step to the next value below it.
  APFloat LargestBelowOne = APFloat::getOne(C.getSemantics());
  LargestBelowOne.next(/*nextDown=*/true);
  if (LargestBelowOne != C)
    return nullptr;

  // FractSrc has to be "X - floor(X)" with the same X on both sides; X is the
  // result on success.
  Value *X = nullptr;
  const bool IsXMinusFloorX =
      match(&FractSrc, m_FSub(m_Value(X), m_Intrinsic<Intrinsic::floor>(
                                              m_Deferred(X))));
  return IsXMinusFloorX ? X : nullptr;
}


/// Match non-nan fract pattern.
///   MIN_CONSTANT = nextafter(1.0, -1.0)
///   minnum(fsub(x, floor(x)), MIN_CONSTANT)
///   minimumnum(fsub(x, floor(x)), MIN_CONSTANT)
///   minimum(fsub(x, floor(x)), MIN_CONSTANT)
///
///   x_sub_floor >= MIN_CONSTANT ? MIN_CONSTANT : x_sub_floor;
///
/// Only applies if fract is a useful instruction for the subtarget. Does not
/// account for the nan handling; the instruction has a nan check on the input
/// value.

Value *AMDGPUCodeGenPrepareImpl::matchFractPatNanAvoidant(Value &V) {
  Value *MinLHS = nullptr;
  const APFloat *MinRHS = nullptr;

  // Callers only use the result in contexts where the input is known not to
  // be a nan, so any of the fmin variants is acceptable here.
  const bool IsFMinWithConstant =
      match(&V, m_CombineOr(m_FMinNum_or_FMinimumNum(
                                m_Value(MinLHS), m_APFloatAllowPoison(MinRHS)),
                            m_FMinimum(m_Value(MinLHS),
                                       m_APFloatAllowPoison(MinRHS))));
  if (IsFMinWithConstant)
    return matchFractPatImpl(*MinLHS, *MinRHS);

  return nullptr;
}


/// Emit one amdgcn.fract call per value that extractValues produces for
/// \p FractArg, and recombine the results with insertValues into a value of
/// \p FractArg's type.
Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder,
                                               Value *FractArg) {
  SmallVector<Value *, 4> Pieces;
  extractValues(Builder, Pieces, FractArg);

  // The intrinsic is overloaded on the scalar type.
  Type *ScalarTy = FractArg->getType()->getScalarType();

  SmallVector<Value *, 4> FractPieces;
  FractPieces.reserve(Pieces.size());
  for (Value *Piece : Pieces)
    FractPieces.push_back(Builder.CreateIntrinsic(Intrinsic::amdgcn_fract,
                                                  {ScalarTy}, {Piece}));

  return insertValues(Builder, FractArg->getType(), FractPieces);
}


/// Replace a minnum/minimumnum/minimum call that clamps "x - floor(x)" to
/// nextafter(1.0, -1) with amdgcn.fract of x. Returns true when \p I was
/// replaced (it is then queued in DeadVals), false otherwise.
bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &I) {
  const APFloat *C;
  Value *FractArg;

  // Set only on the path that has established the fract input cannot be a
  // NaN, either through I's own nnan flag or through value tracking.
  bool InputKnownNotNaN = false;

  //  minimum(x - floor(x), MIN_CONSTANT)
  Value *X;
  if (!ST.hasFractBug() &&
      match(&I, m_FMinimum(m_Value(X), m_APFloatAllowPoison(C)))) {
    // This path performs no NaN check on the input, so nothing is known about
    // NaNs beyond the flags I already carries.
    FractArg = matchFractPatImpl(*X, *C);
    if (!FractArg)
      return false;
  } else {
    //  minnum(x - floor(x), MIN_CONSTANT)
    FractArg = matchFractPatNanAvoidant(I);
    if (!FractArg)
      return false;

    // Match pattern for fract intrinsic in contexts where the nan check has
    // been optimized out (and hope the knowledge the source can't be nan wasn't
    // lost).
    if (!I.hasNoNaNs() && !isKnownNeverNaN(FractArg, SQ.getWithInstruction(&I)))
      return false;

    InputKnownNotNaN = true;
  }

  IRBuilder<> Builder(&I);
  FastMathFlags FMF = I.getFastMathFlags();

  // Only add nnan when the input was actually shown not to be a NaN. On the
  // minimum() path no such check ran, and a NaN input makes the original call
  // return NaN; tagging the replacement with nnan would turn that NaN into
  // poison. There, the replacement simply inherits I's flags unchanged.
  if (InputKnownNotNaN)
    FMF.setNoNaNs();
  Builder.setFastMathFlags(FMF);

  Value *Fract = applyFractPat(Builder, FractArg);
  Fract->takeName(&I);
  I.replaceAllUsesWith(Fract);
  DeadVals.push_back(&I);
  return true;
}


// Expand llvm.sqrt.f32 calls with !fpmath metadata in a semi-fast way.

bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
  // Handled scalar types: float, and half on subtargets without 16-bit
  // instructions.
  Type *ScalarTy = Sqrt.getType()->getScalarType();
  const bool IsF32 = ScalarTy->isFloatTy();
  const bool IsF16WithoutNativeInsts =
      ScalarTy->isHalfTy() && !ST.has16BitInsts();
  if (!IsF32 && !IsF16WithoutNativeInsts)
    return false;

  const auto *FPOp = cast<const FPMathOperator>(&Sqrt);
  const FastMathFlags SqrtFMF = FPOp->getFastMathFlags();

  // Only the fast-but-not-that-fast case is of interest. The lowering of a
  // fast llvm.sqrt gives the raw instruction anyway.
  if (SqrtFMF.approxFunc())
    return false;

  // A correctly rounded expansion is deferred to codegen.
  const float ReqdAccuracy = FPOp->getFPAccuracy();
  if (ReqdAccuracy < 1.0f)
    return false;

  Value *SrcVal = Sqrt.getOperand(0);
  const bool CanTreatAsDAZ = canIgnoreDenormalInput(SrcVal, &Sqrt);

  // The raw instruction is good for 1 ulp, but the correction needed for
  // denormal handling brings that to 2.
  if (!CanTreatAsDAZ && ReqdAccuracy < 2.0f)
    return false;

  IRBuilder<> Builder(&Sqrt);
  SmallVector<Value *, 4> Pieces;
  extractValues(Builder, Pieces, SrcVal);

  // Expand each extracted value separately, then recombine.
  SmallVector<Value *, 4> SqrtPieces;
  SqrtPieces.reserve(Pieces.size());
  for (Value *Piece : Pieces) {
    Value *Expanded = CanTreatAsDAZ
                          ? Builder.CreateCall(getSqrtF32(), Piece)
                          : emitSqrtIEEE2ULP(Builder, Piece, SqrtFMF);
    SqrtPieces.push_back(Expanded);
  }

  Value *NewSqrt = insertValues(Builder, Sqrt.getType(), SqrtPieces);
  NewSqrt->takeName(&Sqrt);
  Sqrt.replaceAllUsesWith(NewSqrt);
  DeadVals.push_back(&Sqrt);
  return true;
}


/// Replace log and log10 intrinsic calls based on fpmath metadata.

bool AMDGPUCodeGenPrepareImpl::visitLog(FPMathOperator &Log,
                                        Intrinsic::ID IID) {
  // Only half (scalar or vector) on subtargets with 16-bit instructions.
  Type *Ty = Log.getType();
  const bool IsNativeHalf =
      Ty->getScalarType()->isHalfTy() && ST.has16BitInsts();
  if (!IsNativeHalf)
    return false;

  // Fast math cases are deferred to codegen.
  const FastMathFlags FMF = Log.getFastMathFlags();
  if (FMF.approxFunc())
    return false;

  // Limit experimentally determined from OpenCL conformance test (1.79)
  if (Log.getFPAccuracy() < 1.80f)
    return false;

  IRBuilder<> Builder(&cast<CallInst>(Log));

  // The generic log2 intrinsic is used for convenience in the vector case.
  // Codegen will recognize the denormal handling is not necessary from the
  // fpext.
  // TODO: Move to generic code
  Value *Log2 =
      Builder.CreateUnaryIntrinsic(Intrinsic::log2, Log.getOperand(0), FMF);

  // log(x) = log2(x) * ln2 and log10(x) = log2(x) * (ln2 / ln10).
  const bool IsLog10 = IID == Intrinsic::log10;
  const double Log2BaseInverted =
      IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
  Value *Scaled =
      Builder.CreateFMulFMF(Log2, ConstantFP::get(Ty, Log2BaseInverted), FMF);

  Scaled->takeName(&Log);
  Log.replaceAllUsesWith(Scaled);
  DeadVals.push_back(&Log);
  return true;
}


/// Gather the analyses the implementation needs and run it on \p F. Returns
/// false without doing anything when the function is skipped or when no
/// TargetPassConfig is available; otherwise returns the implementation's
/// result.
bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  // The target machine is reached through TargetPassConfig.
  auto *PassConfig = getAnalysisIfAvailable<TargetPassConfig>();
  if (!PassConfig)
    return false;
  const AMDGPUTargetMachine &TM = PassConfig->getTM<AMDGPUTargetMachine>();

  const TargetLibraryInfo *LibInfo =
      &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
  AssumptionCache *Assumptions =
      &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);

  // The dominator tree is used only when it happens to be available.
  const DominatorTree *DomTree = nullptr;
  if (auto *DomTreePass = getAnalysisIfAvailable<DominatorTreeWrapperPass>())
    DomTree = &DomTreePass->getDomTree();

  const UniformityInfo &Uniformity =
      getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();

  AMDGPUCodeGenPrepareImpl Impl(F, TM, LibInfo, Assumptions, DomTree,
                                Uniformity);
  return Impl.run();
}


/// Run the implementation on \p F with analyses obtained from \p FAM.
/// Everything is reported as preserved when nothing changed; otherwise only
/// the CFG analyses are preserved, and only if the implementation did not
/// set FlowChanged.
PreservedAnalyses AMDGPUCodeGenPreparePass::run(Function &F,
                                                FunctionAnalysisManager &FAM) {
  const auto &ATM = static_cast<const AMDGPUTargetMachine &>(TM);

  // The queries are kept in this order: the dominator tree is taken only if
  // already cached, and that lookup happens before uniformity is requested.
  TargetLibraryInfo &LibInfo = FAM.getResult<TargetLibraryAnalysis>(F);
  AssumptionCache &Assumptions = FAM.getResult<AssumptionAnalysis>(F);
  const DominatorTree *DomTree = FAM.getCachedResult<DominatorTreeAnalysis>(F);
  const UniformityInfo &Uniformity = FAM.getResult<UniformityInfoAnalysis>(F);

  AMDGPUCodeGenPrepareImpl Impl(F, ATM, &LibInfo, &Assumptions, DomTree,
                                Uniformity);
  const bool Changed = Impl.run();
  if (!Changed)
    return PreservedAnalyses::all();

  PreservedAnalyses Preserved = PreservedAnalyses::none();
  if (!Impl.FlowChanged)
    Preserved.preserveSet<CFGAnalyses>();
  return Preserved;
}


// Pass registration for AMDGPUCodeGenPrepare under the DEBUG_TYPE name with
// the description "AMDGPU IR optimizations". The dependencies named here are
// the analyses runOnFunction fetches with getAnalysis<>.
INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,

                      "AMDGPU IR optimizations", false, false)

INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)

INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)

INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)

INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",

                    false, false)


/// Create a workitem.id.x intrinsic call with range metadata.

CallInst *AMDGPUCodeGenPrepareImpl::createWorkitemIdX(IRBuilder<> &B) const {
  CallInst *const WorkitemIdX =
      B.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_x, {});

  // Let the subtarget attach its local-ID range information to the call.
  ST.makeLIDRangeMetadata(WorkitemIdX);
  return WorkitemIdX;
}


/// Replace the instruction with a direct workitem.id.x call.

void AMDGPUCodeGenPrepareImpl::replaceWithWorkitemIdX(Instruction &I) const {
  // Build the workitem.id.x call right in front of I.
  IRBuilder<> Builder(&I);
  Value *Replacement = createWorkitemIdX(Builder);

  // ReplaceInstWithValue takes the position of I as an iterator.
  BasicBlock::iterator Pos = I.getIterator();
  ReplaceInstWithValue(Pos, Replacement);
}


/// Replace the instruction with (workitem.id.x & mask).

void AMDGPUCodeGenPrepareImpl::replaceWithMaskedWorkitemIdX(
    Instruction &I, unsigned WaveSize) const {
  // Build "workitem.id.x & (WaveSize - 1)" right in front of I.
  IRBuilder<> Builder(&I);
  CallInst *WorkitemIdX = createWorkitemIdX(Builder);
  Constant *LaneMask = ConstantInt::get(WorkitemIdX->getType(), WaveSize - 1);
  Value *LaneId = Builder.CreateAnd(WorkitemIdX, LaneMask);

  // ReplaceInstWithValue takes the position of I as an iterator.
  BasicBlock::iterator Pos = I.getIterator();
  ReplaceInstWithValue(Pos, LaneId);
}


/// Try to optimize mbcnt instruction by replacing with workitem.id.x when

/// work group size allows direct computation of lane ID.

/// Returns true if optimization was applied, false otherwise.

bool AMDGPUCodeGenPrepareImpl::tryReplaceWithWorkitemId(Instruction &I,
                                                        unsigned Wave) const {
  // Nothing can be done without a required work group size in X.
  const std::optional<unsigned> ReqdSizeX = ST.getReqdWorkGroupSize(F, 0);
  if (!ReqdSizeX.has_value())
    return false;

  // Work group size == wave size: every work group holds exactly one wave,
  // so workitem.id.x can be used directly.
  if (*ReqdSizeX == Wave) {
    replaceWithWorkitemIdX(I);
    return true;
  }

  // Otherwise the work group has to split evenly into waves.
  if (!ST.hasWavefrontsEvenlySplittingXDim(F, /*RequiresUniformYZ=*/true))
    return false;

  // The lane ID within the wave is then obtained by masking:
  // lane_id = workitem.id.x & (wave_size - 1).
  replaceWithMaskedWorkitemIdX(I, Wave);
  return true;
}


/// Optimize mbcnt.lo calls on wave32 architectures for lane ID computation.

bool AMDGPUCodeGenPrepareImpl::visitMbcntLo(IntrinsicInst &I) const {
  // Restricted to wave32 targets, where mbcnt.lo operates on the full
  // execution mask.
  if (!ST.isWave32())
    return false;

  // Only the form mbcnt.lo(~0, 0), which counts the active lanes with lower
  // IDs, is optimized.
  const bool IsLaneIdQuery = match(
      &I, m_Intrinsic<Intrinsic::amdgcn_mbcnt_lo>(m_AllOnes(), m_Zero()));
  return IsLaneIdQuery && tryReplaceWithWorkitemId(I, ST.getWavefrontSize());
}


/// Optimize mbcnt.hi calls for lane ID computation.

bool AMDGPUCodeGenPrepareImpl::visitMbcntHi(IntrinsicInst &I) const {
  // The wave size has to be known at compile time.
  if (!ST.isWaveSizeKnown())
    return false;

  const unsigned WaveSize = ST.getWavefrontSize();

  // On wave32 the upper 32 bits of the execution mask are always 0, so
  // mbcnt.hi(mask, val) always returns val unchanged.
  if (ST.isWave32()) {
    // The call is replaced by val only when the required work group size
    // matches the wave size (single wave per work group).
    const auto ReqdSizeX = ST.getReqdWorkGroupSize(F, 0);
    if (ReqdSizeX && *ReqdSizeX == WaveSize) {
      BasicBlock::iterator Pos = I.getIterator();
      ReplaceInstWithValue(Pos, I.getArgOperand(1));
      return true;
    }
  }

  // Otherwise look for the complete lane ID computation,
  // mbcnt.hi(~0, mbcnt.lo(~0, 0)), which counts all active lanes with lower
  // IDs across the full execution mask.
  const bool IsFullLaneIdQuery =
      match(&I, m_Intrinsic<Intrinsic::amdgcn_mbcnt_hi>(
                    m_AllOnes(), m_Intrinsic<Intrinsic::amdgcn_mbcnt_lo>(
                                     m_AllOnes(), m_Zero())));
  return IsFullLaneIdQuery && tryReplaceWithWorkitemId(I, WaveSize);
}


/// Check if type is <4 x i8>.


static bool isV4I8(Type *Ty) {
  // Must be a fixed-width vector to begin with.
  const auto *VecTy = dyn_cast<FixedVectorType>(Ty);
  if (!VecTy)
    return false;

  // Four lanes, each an 8-bit integer.
  const bool HasFourLanes = VecTy->getNumElements() == 4;
  return HasFourLanes && VecTy->getElementType()->isIntegerTy(8);
}


/// Helper to match the dot4 pattern:
///   mul(zext/sext <4 x i8>, zext/sext <4 x i8>)
/// Returns true if the pattern matches and the extension kind agrees with
/// \p IsSigned (sext when true, zext when false). On success, sets \p A and
/// \p B to the <4 x i8> sources.


static bool matchDot4Pattern(Value *MulOp, Value *&A, Value *&B,
                             bool IsSigned) {
  Value *MulLHS = nullptr;
  Value *MulRHS = nullptr;
  if (!match(MulOp, m_Mul(m_Value(MulLHS), m_Value(MulRHS))))
    return false;

  // The multiply has to produce a <4 x i32>.
  const auto *ResultTy = dyn_cast<FixedVectorType>(MulOp->getType());
  const bool IsV4I32 = ResultTy && ResultTy->getNumElements() == 4 &&
                       ResultTy->getElementType()->isIntegerTy(32);
  if (!IsV4I32)
    return false;

  // Strip the extension selected by IsSigned (sext or zext) from an operand.
  // Yields the <4 x i8> value that was extended, or null when the operand is
  // not such an extension.
  const auto StripExt = [IsSigned](Value *Operand) -> Value * {
    Value *Narrow = nullptr;
    const bool IsExt = IsSigned ? match(Operand, m_SExt(m_Value(Narrow)))
                                : match(Operand, m_ZExt(m_Value(Narrow)));
    if (!IsExt || !isV4I8(Narrow->getType()))
      return nullptr;
    return Narrow;
  };

  Value *NarrowLHS = StripExt(MulLHS);
  if (!NarrowLHS)
    return false;
  Value *NarrowRHS = StripExt(MulRHS);
  if (!NarrowRHS)
    return false;

  // The outputs are written only once the whole pattern has matched.
  A = NarrowLHS;
  B = NarrowRHS;
  return true;
}


/// Try to convert vector.reduce.add(mul(zext/sext <4 x i8>, zext/sext <4 x

/// i8>)) to a dot4 intrinsic call (non-saturating case only).

bool AMDGPUCodeGenPrepareImpl::visitVectorReduceAdd(IntrinsicInst &I) {
  // The subtarget has to provide dot4 instructions.
  const bool HasDot4 =
      ST.hasDot7Insts() && (ST.hasDot1Insts() || ST.hasDot8Insts());
  if (!HasDot4)
    return false;

  // The unsigned form is tried first; the signed form is the fallback.
  Value *Reduced = I.getArgOperand(0);
  Value *A = nullptr;
  Value *B = nullptr;
  const bool IsSigned = !matchDot4Pattern(Reduced, A, B, /*IsSigned=*/false);
  if (IsSigned && !matchDot4Pattern(Reduced, A, B, /*IsSigned=*/true))
    return false;

  LLVMContext &Ctx = I.getContext();
  Type *I32Ty = Type::getInt32Ty(Ctx);
  IRBuilder<> Builder(&I);

  // The intrinsic takes each <4 x i8> source as a single i32.
  Value *PackedA = Builder.CreateBitCast(A, I32Ty);
  Value *PackedB = Builder.CreateBitCast(B, I32Ty);

  // Non-saturating case: the accumulator is 0 and clamping is off.
  Value *ZeroAcc = ConstantInt::get(I32Ty, 0);
  Value *NoClamp = ConstantInt::getFalse(Ctx);

  const Intrinsic::ID DotIID =
      IsSigned ? Intrinsic::amdgcn_sdot4 : Intrinsic::amdgcn_udot4;
  Value *Dot =
      Builder.CreateIntrinsic(DotIID, {}, {PackedA, PackedB, ZeroAcc, NoClamp});
  Dot->takeName(&I);

  I.replaceAllUsesWith(Dot);
  DeadVals.push_back(&I);
  return true;
}


/// Try to convert uadd.sat/sadd.sat(vector.reduce.add(mul(...)), c) to a

/// saturating dot4 intrinsic. This combine starts at the root (saturating add)

/// and looks at its operands.

bool AMDGPUCodeGenPrepareImpl::visitSaturatingAdd(IntrinsicInst &I) {
  // The subtarget has to provide dot4 instructions.
  const bool HasDot4 =
      ST.hasDot7Insts() && (ST.hasDot1Insts() || ST.hasDot8Insts());
  if (!HasDot4)
    return false;

  // sadd.sat selects the signed form, uadd.sat the unsigned one.
  const bool IsSigned = I.getIntrinsicID() == Intrinsic::sadd_sat;

  // The add is commutative: accept the vector.reduce.add in either operand
  // position (operand 0 is checked first) and treat the other operand as the
  // accumulator.
  Value *MulOp = nullptr;
  Value *Accum = nullptr;
  IntrinsicInst *ReduceInst = nullptr;
  for (unsigned ReduceIdx : {0u, 1u}) {
    Value *Candidate = I.getArgOperand(ReduceIdx);
    if (!match(Candidate,
               m_Intrinsic<Intrinsic::vector_reduce_add>(m_Value(MulOp))))
      continue;
    ReduceInst = cast<IntrinsicInst>(Candidate);
    Accum = I.getArgOperand(1 - ReduceIdx);
    break;
  }
  if (!ReduceInst)
    return false;

  // What is being reduced has to be the dot4 multiply of matching signedness.
  Value *A = nullptr;
  Value *B = nullptr;
  if (!matchDot4Pattern(MulOp, A, B, IsSigned))
    return false;

  LLVMContext &Ctx = I.getContext();
  Type *I32Ty = Type::getInt32Ty(Ctx);
  IRBuilder<> Builder(&I);

  // The intrinsic takes each <4 x i8> source as a single i32.
  Value *PackedA = Builder.CreateBitCast(A, I32Ty);
  Value *PackedB = Builder.CreateBitCast(B, I32Ty);

  // Saturating case: pass the accumulator through and turn clamping on.
  Value *DoClamp = ConstantInt::getTrue(Ctx);

  const Intrinsic::ID DotIID =
      IsSigned ? Intrinsic::amdgcn_sdot4 : Intrinsic::amdgcn_udot4;
  Value *Dot =
      Builder.CreateIntrinsic(DotIID, {}, {PackedA, PackedB, Accum, DoClamp});
  Dot->takeName(&I);

  I.replaceAllUsesWith(Dot);
  DeadVals.push_back(&I);

  // Queue the reduce.add as well if nothing uses it anymore.
  // NOTE(review): at this point I has not been erased and still holds
  // ReduceInst as an operand, so use_empty() looks like it cannot be true
  // here; confirm whether the cleanup relies on DeadVals processing instead.
  if (ReduceInst->use_empty())
    DeadVals.push_back(ReduceInst);

  return true;
}


// Definition of the static ID member of the AMDGPUCodeGenPrepare pass.
char AMDGPUCodeGenPrepare::ID = 0;


/// Factory for the AMDGPUCodeGenPrepare pass: returns a newly allocated
/// instance as a raw FunctionPass pointer.
FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {

  return new AMDGPUCodeGenPrepare();

}


assert
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")

insertValues
static Value * insertValues(IRBuilder<> &Builder, Type *Ty, SmallVectorImpl< Value * > &Values)
Definition AMDGPUCodeGenPrepare.cpp:351

extractValues
static void extractValues(IRBuilder<> &Builder, SmallVectorImpl< Value * > &Values, Value *V)
Definition AMDGPUCodeGenPrepare.cpp:339

getMulHu
static Value * getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS)
Definition AMDGPUCodeGenPrepare.cpp:1015

isInterestingPHIIncomingValue
static bool isInterestingPHIIncomingValue(const Value *V)
Definition AMDGPUCodeGenPrepare.cpp:1707

findSelectThroughCast
static SelectInst * findSelectThroughCast(Value *V, CastInst *&Cast)
Definition AMDGPUCodeGenPrepare.cpp:431

matchDot4Pattern
static bool matchDot4Pattern(Value *MulOp, Value *&A, Value *&B, bool IsSigned)
Helper to match the dot4 pattern: mul(zext/sext <4 x i8>, zext/sext <4 x i8>) Returns true if pattern...
Definition AMDGPUCodeGenPrepare.cpp:2441

isV4I8
static bool isV4I8(Type *Ty)
Check if type is <4 x i8>.
Definition AMDGPUCodeGenPrepare.cpp:2432

getMul64
static std::pair< Value *, Value * > getMul64(IRBuilder<> &Builder, Value *LHS, Value *RHS)
Definition AMDGPUCodeGenPrepare.cpp:1001

emitRsqIEEE1ULP
static Value * emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src, bool IsNegative)
Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals.
Definition AMDGPUCodeGenPrepare.cpp:600

getSign32
static Value * getSign32(Value *V, IRBuilder<> &Builder, const DataLayout DL)
Definition AMDGPUCodeGenPrepare.cpp:1217

collectPHINodes
static void collectPHINodes(const PHINode &I, SmallPtrSet< const PHINode *, 8 > &SeenPHIs)
Definition AMDGPUCodeGenPrepare.cpp:1762

isPtrKnownNeverNull
static bool isPtrKnownNeverNull(const Value *V, const DataLayout &DL, const AMDGPUTargetMachine &TM, unsigned AS)
Definition AMDGPUCodeGenPrepare.cpp:1992

areInSameBB
static bool areInSameBB(const Value *A, const Value *B)
Definition AMDGPUCodeGenPrepare.cpp:1699

WidenLoads
static cl::opt< bool > WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads", cl::desc("Widen sub-dword constant address space loads in " "AMDGPULateCodeGenPrepare"), cl::ReallyHidden, cl::init(true))

AMDGPUMemoryUtils.h

AMDGPUTargetMachine.h
The AMDGPU TargetMachine interface definition for hw codegen targets.

AMDGPU.h

Scaled
@ Scaled
Definition ARCInstrInfo.cpp:35

DL
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Definition ARMSLSHardening.cpp:73

AssumptionCache.h

BasicBlockUtils.h

X
#define X(NUM, ENUM, NAME)
Definition ELF.h:853

A
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")

E
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")

B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")

ConstantFolding.h

IntrinsicCostStrategy::InstructionCost
@ InstructionCost
Definition CostModel.cpp:51

Metadata
dxil translate DXIL Translate Metadata
Definition DXILTranslateMetadata.cpp:647

Dominators.h

runOnFunction
static bool runOnFunction(Function &F, bool PostInlining)
Definition EntryExitInstrumenter.cpp:109

DEBUG_TYPE
#define DEBUG_TYPE
Definition GenericCycleImpl.h:31

IRBuilder.h

InitializePasses.h

InlinePriorityMode::Size
@ Size
Definition InlineOrder.cpp:25

InstVisitor.h

getOpcode
static Value * getOpcode(Value &V, Type &Ty, InstrumentationConfig &IConf, InstrumentorIRBuilderTy &IIRB)
Definition Instrumentor.cpp:1003

IntegerDivision.h

TemplateParamKind::Type
@ Type
Definition ItaniumDemangle.h:1243

KnownBits.h

KnownFPClass.h

F
#define F(x, y, z)
Definition MD5.cpp:54

I
#define I(x, y, z)
Definition MD5.cpp:57

T
#define T
Definition Mips16ISelLowering.cpp:282

Signed
@ Signed
Definition NVPTXISelLowering.cpp:6398

Range
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))

FAM
FunctionAnalysisManager FAM
Definition PassBuilderBindings.cpp:61

INITIALIZE_PASS_DEPENDENCY
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42

INITIALIZE_PASS_END
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44

INITIALIZE_PASS_BEGIN
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39

Pass.h

PatternMatch.h

Cond
const SmallVectorImpl< MachineOperand > & Cond
Definition RISCVRedundantCopyElimination.cpp:73

Opc
auto Opc
Definition RISCVRedundantCopyElimination.cpp:77

SIModeRegisterDefaults.h

visit
static void visit(BasicBlock &Start, std::function< bool(BasicBlock *)> op)
Definition SPIRVStructurizer.cpp:189

SetVector.h
This file implements a set that has insertion order iteration characteristics.

Y
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")

TargetLibraryInfo.h

EnableGlobalISelOption
static cl::opt< cl::boolOrDefault > EnableGlobalISelOption("global-isel", cl::Hidden, cl::desc("Enable the \"global\" instruction selector"))

TargetPassConfig.h
Target-Independent Code Generator Pass Configuration Options pass.

TargetTransformInfo.h
This pass exposes codegen information to IR-level passes.

Local.h

UniformityAnalysis.h
LLVM IR instance of the generic uniformity analysis.

ValueHandle.h

ValueTracking.h

RHS
Value * RHS
Definition X86PartialReduction.cpp:81

LHS
Value * LHS
Definition X86PartialReduction.cpp:80

Mul
BinaryOperator * Mul
Definition X86PartialReduction.cpp:75

VectorSlice::Ty
Type * Ty
Definition AMDGPUCodeGenPrepare.cpp:1855

VectorSlice::Idx
unsigned Idx
Definition AMDGPUCodeGenPrepare.cpp:1856

VectorSlice::VectorSlice
VectorSlice(Type *Ty, unsigned Idx, unsigned NumElts)
Definition AMDGPUCodeGenPrepare.cpp:1852

VectorSlice::NewPHI
PHINode * NewPHI
Definition AMDGPUCodeGenPrepare.cpp:1858

VectorSlice::getSlicedVal
Value * getSlicedVal(BasicBlock *BB, Value *Inc, StringRef NewValName)
Slice Inc according to the information contained within this slice.
Definition AMDGPUCodeGenPrepare.cpp:1880

VectorSlice::NumElts
unsigned NumElts
Definition AMDGPUCodeGenPrepare.cpp:1857

llvm::AMDGPUCodeGenPreparePass::run
PreservedAnalyses run(Function &, FunctionAnalysisManager &)
Definition AMDGPUCodeGenPrepare.cpp:2303

llvm::AMDGPUSubtarget::getReqdWorkGroupSize
std::optional< unsigned > getReqdWorkGroupSize(const Function &F, unsigned Dim) const
Definition AMDGPUSubtarget.cpp:231

llvm::AMDGPUSubtarget::hasWavefrontsEvenlySplittingXDim
bool hasWavefrontsEvenlySplittingXDim(const Function &F, bool REquiresUniformYZ=false) const
Definition AMDGPUSubtarget.cpp:239

llvm::AMDGPUSubtarget::hasMulU24
bool hasMulU24() const
Definition AMDGPUSubtarget.h:207

llvm::AMDGPUSubtarget::getWavefrontSize
unsigned getWavefrontSize() const
Definition AMDGPUSubtarget.h:219

llvm::AMDGPUSubtarget::hasMulI24
bool hasMulI24() const
Definition AMDGPUSubtarget.h:203

llvm::AMDGPUTargetMachine
Definition AMDGPUTargetMachine.h:34

llvm::APFloat
Definition APFloat.h:1029

llvm::APFloat::getOne
static APFloat getOne(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative One.
Definition APFloat.h:1147

llvm::APFloat::getSmallestNormalized
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition APFloat.h:1217

llvm::APFloat::next
opStatus next(bool nextDown)
Definition APFloat.h:1313

llvm::AddrSpaceCastInst
This class represents a conversion between pointers from one address space to another.
Definition Instructions.h:5235

llvm::AnalysisUsage
Represent the analysis usage information of a pass.
Definition PassAnalysisSupport.h:48

llvm::AnalysisUsage::addRequired
AnalysisUsage & addRequired()
Definition PassAnalysisSupport.h:76

llvm::AnalysisUsage::setPreservesAll
void setPreservesAll()
Set by analyses that do not transform their input at all.
Definition PassAnalysisSupport.h:131

llvm::AssumptionAnalysis
A function analysis which provides an AssumptionCache.
Definition AssumptionCache.h:180

llvm::AssumptionCacheTracker
An immutable pass that tracks lazily created AssumptionCache objects.
Definition AssumptionCache.h:210

llvm::AssumptionCache
A cache of @llvm.assume calls within a function.
Definition AssumptionCache.h:44

llvm::BasicBlock
LLVM Basic Block Representation.
Definition BasicBlock.h:62

llvm::BasicBlock::iterator
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170

llvm::BasicBlock::getTerminator
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237

llvm::BinaryOperator
Definition InstrTypes.h:206

llvm::BinaryOperator::getOpcode
BinaryOps getOpcode() const
Definition InstrTypes.h:409

llvm::BitVector
Definition BitVector.h:101

llvm::BitVector::set
BitVector & set()
Set all bits in the bitvector.
Definition BitVector.h:366

llvm::BitVector::all
bool all() const
Returns true if all bits are set.
Definition BitVector.h:194

llvm::CFGAnalyses
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73

llvm::CallInst
This class represents a function call, abstracting a target machine's calling convention.
Definition Instructions.h:1531

llvm::CastInst
This is the base class for all instructions that perform data casts.
Definition InstrTypes.h:512

llvm::CastInst::getOpcode
Instruction::CastOps getOpcode() const
Return the opcode of this CastInst.
Definition InstrTypes.h:674

llvm::CodeGenTargetMachineImpl::getTargetTransformInfo
TargetTransformInfo getTargetTransformInfo(const Function &F) const override
Get a TargetTransformInfo implementation for the target.
Definition CodeGenTargetMachineImpl.cpp:112

llvm::ConstantAsMetadata::get
static ConstantAsMetadata * get(Constant *C)
Definition Metadata.h:537

llvm::ConstantFP::getZero
static LLVM_ABI ConstantFP * getZero(Type *Ty, bool Negative=false)
Definition Constants.cpp:1159

llvm::ConstantFP::isExactlyValue
LLVM_ABI bool isExactlyValue(const APFloat &V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
Definition Constants.cpp:1209

llvm::ConstantFP::getInfinity
static LLVM_ABI ConstantFP * getInfinity(Type *Ty, bool Negative=false)
Definition Constants.cpp:1136

llvm::ConstantInt::getTrue
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
Definition Constants.cpp:893

llvm::ConstantInt::getFalse
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
Definition Constants.cpp:900

llvm::Constant
This is an important base class in LLVM.
Definition Constant.h:43

llvm::Constant::getAllOnesValue
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
Definition Constants.cpp:419

llvm::Constant::getNullValue
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition Constants.cpp:363

llvm::DataLayout
A parsed version of the target data layout string, and methods for querying it.
Definition DataLayout.h:64

llvm::DenseMap
Definition DenseMap.h:834

llvm::DominatorTreeAnalysis
Analysis pass which computes a DominatorTree.
Definition Dominators.h:274

llvm::DominatorTree
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:155

llvm::FPMathOperator
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:202

llvm::FPMathOperator::getFastMathFlags
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
Definition Operator.h:291

llvm::FPMathOperator::getFPAccuracy
LLVM_ABI float getFPAccuracy() const
Get the maximum error permitted by this operation in ULPs.
Definition Instructions.cpp:2789

llvm::FastMathFlags
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23

llvm::FastMathFlags::setFast
void setFast(bool B=true)
Definition FMF.h:96

llvm::FastMathFlags::noInfs
bool noInfs() const
Definition FMF.h:66

llvm::FastMathFlags::allowReciprocal
bool allowReciprocal() const
Definition FMF.h:68

llvm::FastMathFlags::approxFunc
bool approxFunc() const
Definition FMF.h:70

llvm::FastMathFlags::setNoNaNs
void setNoNaNs(bool B=true)
Definition FMF.h:78

llvm::FastMathFlags::noNaNs
bool noNaNs() const
Definition FMF.h:65

llvm::FastMathFlags::allowContract
bool allowContract() const
Definition FMF.h:69

llvm::FixedVectorType
Class to represent fixed width SIMD vectors.
Definition DerivedTypes.h:650

llvm::FixedVectorType::getNumElements
unsigned getNumElements() const
Definition DerivedTypes.h:693

llvm::FixedVectorType::get
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:869

llvm::FunctionPass
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314

llvm::Function
Definition Function.h:65

llvm::GCNSubtarget
Definition GCNSubtarget.h:45

llvm::GCNSubtarget::isWave32
bool isWave32() const
Definition GCNSubtarget.h:909

llvm::GCNSubtarget::isWaveSizeKnown
bool isWaveSizeKnown() const
Returns whether the wave size of this subtarget is reliably known.
Definition GCNSubtarget.h:916

llvm::GCNSubtarget::hasFractBug
bool hasFractBug() const
Definition GCNSubtarget.h:236

llvm::GenericUniformityInfo::isUniformAtDef
bool isUniformAtDef(ConstValueRefT V) const
Whether V is uniform/non-divergent at its definition.
Definition GenericUniformityInfo.h:64

llvm::IRBuilderBase::CreateInsertElement
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2637

llvm::IRBuilderBase::CreateFDiv
Value * CreateFDiv(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1715

llvm::IRBuilderBase::CreateExtractElement
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2625

llvm::IRBuilderBase::getIntNTy
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition IRBuilder.h:599

llvm::IRBuilderBase::CreateZExtOrTrunc
Value * CreateZExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a ZExt or Trunc from the integer value V to DestTy.
Definition IRBuilder.h:2148

llvm::IRBuilderBase::CreateExtractValue
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition IRBuilder.h:2684

llvm::IRBuilderBase::CreateIntrinsic
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > OverloadTypes, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="", ArrayRef< OperandBundleDef > OpBundles={})
Create a call to intrinsic ID with Args, mangled using OverloadTypes.
Definition IRBuilder.cpp:936

llvm::IRBuilderBase::CreateSelect
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition IRBuilder.cpp:1112

llvm::IRBuilderBase::CreateFPToUI
Value * CreateFPToUI(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2176

llvm::IRBuilderBase::CreateSExt
Value * CreateSExt(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2142

llvm::IRBuilderBase::SetCurrentDebugLocation
void SetCurrentDebugLocation(const DebugLoc &L)
Set location information used by debugging information.
Definition IRBuilder.h:247

llvm::IRBuilderBase::getInt32Ty
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition IRBuilder.h:586

llvm::IRBuilderBase::CreateUIToFP
Value * CreateUIToFP(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false, MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2190

llvm::IRBuilderBase::setFastMathFlags
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition IRBuilder.h:352

llvm::IRBuilderBase::CreateFCmpOLT
Value * CreateFCmpOLT(Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2439

llvm::IRBuilderBase::CreateFAbs
Value * CreateFAbs(Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create call to the fabs intrinsic.
Definition IRBuilder.h:1048

llvm::IRBuilderBase::CreateNeg
Value * CreateNeg(Value *V, const Twine &Name="", bool HasNSW=false)
Definition IRBuilder.h:1852

llvm::IRBuilderBase::createIsFPClass
LLVM_ABI Value * createIsFPClass(Value *FPNum, unsigned Test)
Definition IRBuilder.cpp:1353

llvm::IRBuilderBase::getInt32
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition IRBuilder.h:529

llvm::IRBuilderBase::CreateSub
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1461

llvm::IRBuilderBase::CreateFMA
Value * CreateFMA(Value *Factor1, Value *Factor2, Value *Summand, FMFSource FMFSource={}, const Twine &Name="")
Create call to the fma intrinsic.
Definition IRBuilder.h:1115

llvm::IRBuilderBase::CreateBitCast
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2252

llvm::IRBuilderBase::CreateLoad
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition IRBuilder.h:1928

llvm::IRBuilderBase::CreateShl
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1533

llvm::IRBuilderBase::getFastMathFlags
FastMathFlags getFastMathFlags() const
Get the flags to be applied to created floating point ops.
Definition IRBuilder.h:341

llvm::IRBuilderBase::CreateZExt
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition IRBuilder.h:2130

llvm::IRBuilderBase::CreateFCmpOEQ
Value * CreateFCmpOEQ(Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2424

llvm::IRBuilderBase::CreateAnd
Value * CreateAnd(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:1592

llvm::IRBuilderBase::CreateAdd
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1444

llvm::IRBuilderBase::getFloatTy
Type * getFloatTy()
Fetch the type representing a 32-bit floating point value.
Definition IRBuilder.h:614

llvm::IRBuilderBase::CreateCall
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2563

llvm::IRBuilderBase::CreateTrunc
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition IRBuilder.h:2116

llvm::IRBuilderBase::CreateBinOp
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1753

llvm::IRBuilderBase::CreateICmpUGE
Value * CreateICmpUGE(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2396

llvm::IRBuilderBase::SetInsertPoint
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207

llvm::IRBuilderBase::CreateAShr
Value * CreateAShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition IRBuilder.h:1573

llvm::IRBuilderBase::CreateXor
Value * CreateXor(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:1644

llvm::IRBuilderBase::CreateSIToFP
Value * CreateSIToFP(Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2202

llvm::IRBuilderBase::CreateFMul
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1696

llvm::IRBuilderBase::CreateFNeg
Value * CreateFNeg(Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1861

llvm::IRBuilderBase::CreateOr
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="", bool IsDisjoint=false)
Definition IRBuilder.h:1614

llvm::IRBuilderBase::CreateSExtOrTrunc
Value * CreateSExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a SExt or Trunc from the integer value V to DestTy.
Definition IRBuilder.h:2163

llvm::IRBuilderBase::CreateFMulFMF
Value * CreateFMulFMF(Value *L, Value *R, FMFSource FMFSource, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1701

llvm::IRBuilderBase::CreateMul
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1478

llvm::IRBuilderBase::CreateUnaryIntrinsic
LLVM_ABI Value * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *Op, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Definition IRBuilder.cpp:914

llvm::IRBuilderBase::CreateFCmpOGE
Value * CreateFCmpOGE(Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2434

llvm::IRBuilderBase::CreateFPToSI
Value * CreateFPToSI(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2183

llvm::IRBuilder
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2868

llvm::InstVisitor
Base class for instruction visitors.
Definition InstVisitor.h:78

llvm::Instruction
Definition Instruction.h:70

llvm::Instruction::getDebugLoc
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition Instruction.h:546

llvm::Instruction::BinaryOps
BinaryOps
Definition Instruction.h:1056

llvm::IntrinsicInst
A wrapper class for inspecting calls to intrinsic functions.
Definition IntrinsicInst.h:49

llvm::LLVMContext
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68

llvm::LoadInst
An instruction for reading from memory.
Definition Instructions.h:181

llvm::MDNode::get
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1567

llvm::PHINode
Definition Instructions.h:2661

llvm::PoisonValue::get
static LLVM_ABI PoisonValue * get(Type *T)
Static factory method - returns a 'poison' object of the specified type.
Definition Constants.cpp:2026

llvm::PreservedAnalyses
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112

llvm::PreservedAnalyses::none
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
Definition Analysis.h:115

llvm::PreservedAnalyses::all
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118

llvm::PreservedAnalyses::preserveSet
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151

llvm::SelectInst
This class represents the LLVM 'select' instruction.
Definition Instructions.h:1710

llvm::SelectInst::getFalseValue
const Value * getFalseValue() const
Definition Instructions.h:1748

llvm::SelectInst::getCondition
const Value * getCondition() const
Definition Instructions.h:1746

llvm::SelectInst::getTrueValue
const Value * getTrueValue() const
Definition Instructions.h:1747

llvm::SmallDenseMap
Definition DenseMap.h:977

llvm::SmallPtrSetImpl::insert
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition SmallPtrSet.h:387

llvm::SmallPtrSet
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
Definition SmallPtrSet.h:533

llvm::SmallVectorImpl
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition SmallVector.h:581

llvm::SmallVectorTemplateBase::push_back
void push_back(const T &Elt)
Definition SmallVector.h:423

llvm::SmallVectorTemplateCommon::size
size_t size() const
Definition SmallVector.h:83

llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition SmallVector.h:1225

llvm::StringRef
Represent a constant reference to a string, i.e.
Definition StringRef.h:56

llvm::TargetLibraryAnalysis
Analysis pass providing the TargetLibraryInfo.
Definition TargetLibraryInfo.h:602

llvm::TargetLibraryInfoWrapperPass
Definition TargetLibraryInfo.h:627

llvm::TargetLibraryInfo
Provides information about what library functions are available for the current target.
Definition TargetLibraryInfo.h:266

llvm::TargetMachine::getSubtarget
const STC & getSubtarget(const Function &F) const
This method returns a reference to the specified type of TargetSubtargetInfo.
Definition TargetMachine.h:199

llvm::TargetTransformInfo::getCastContextHint
static LLVM_ABI CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
Definition TargetTransformInfo.cpp:1092

llvm::TargetTransformInfo::getCastInstrCost
LLVM_ABI InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
Definition TargetTransformInfo.cpp:1135

llvm::TargetTransformInfo::TCK_RecipThroughput
@ TCK_RecipThroughput
Reciprocal throughput.
Definition TargetTransformInfo.h:332

llvm::TargetTransformInfo::getArithmeticInstrCost
LLVM_ABI InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
Definition TargetTransformInfo.cpp:1003

llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46

llvm::Type::getInt64Ty
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:310

llvm::Type::getIntegerBitWidth
LLVM_ABI unsigned getIntegerBitWidth() const
Definition DerivedTypes.h:107

llvm::Type::getInt32Ty
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:309

llvm::Type::isFloatTy
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:155

llvm::Type::getScalarType
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:368

llvm::Type::getWithNewBitWidth
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
Definition DerivedTypes.h:832

llvm::Type::isHalfTy
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:144

llvm::Type::getScalarSizeInBits
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:232

llvm::Type::isDoubleTy
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition Type.h:158

llvm::Type::isIntegerTy
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257

llvm::Type::getFltSemantics
LLVM_ABI const fltSemantics & getFltSemantics() const
Definition Type.cpp:106

llvm::UniformityInfoAnalysis
Analysis pass which computes UniformityInfo.
Definition UniformityAnalysis.h:29

llvm::UniformityInfoWrapperPass
Legacy analysis pass which computes UniformityInfo.
Definition UniformityAnalysis.h:55

llvm::User
Definition User.h:44

llvm::User::setOperand
void setOperand(unsigned i, Value *Val)
Definition User.h:212

llvm::User::getOperand
Value * getOperand(unsigned i) const
Definition User.h:207

llvm::Value
LLVM Value Representation.
Definition Value.h:75

llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255

llvm::Value::hasOneUse
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439

llvm::Value::replaceAllUsesWith
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:552

llvm::Value::use_empty
bool use_empty() const
Definition Value.h:346

llvm::Value::takeName
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:399

llvm::VectorType::getElementType
Type * getElementType() const
Definition DerivedTypes.h:523

llvm::cl::opt
Definition CommandLine.h:1454

llvm::ilist_detail::node_parent_access::getParent
const ParentTy * getParent() const
Definition ilist_node.h:34

llvm::ilist_node_impl::getIterator
self_iterator getIterator()
Definition ilist_node.h:123

Changed
Changed
Definition ObjCARCOpts.cpp:2366

llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition ErrorHandling.h:164

false
Definition MachinePipeliner.cpp:245

llvm::AArch64PACKey::IB
@ IB
Definition AArch64BaseInfo.h:1013

llvm::AArch64PACKey::IA
@ IA
Definition AArch64BaseInfo.h:1012

llvm::AMDGPUAS::CONSTANT_ADDRESS_32BIT
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
Definition AMDGPUAddrSpace.h:40

llvm::AMDGPUAS::LOCAL_ADDRESS
@ LOCAL_ADDRESS
Address space for local memory.
Definition AMDGPUAddrSpace.h:36

llvm::AMDGPUAS::CONSTANT_ADDRESS
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
Definition AMDGPUAddrSpace.h:37

llvm::AMDGPUAS::FLAT_ADDRESS
@ FLAT_ADDRESS
Address space for flat memory.
Definition AMDGPUAddrSpace.h:32

llvm::AMDGPUAS::PRIVATE_ADDRESS
@ PRIVATE_ADDRESS
Address space for private memory.
Definition AMDGPUAddrSpace.h:38

llvm::AMDGPU::HSAMD::Kernel::Arg::Key::Align
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
Definition AMDGPUMetadata.h:183

llvm::AMDGPU::getNullPointerValue
constexpr int64_t getNullPointerValue(unsigned AS)
Get the null pointer value for the given address space.
Definition AMDGPUAddrSpace.h:178

llvm::AMDGPU::WidenLoad
@ WidenLoad
Definition AMDGPURegBankLegalizeRules.h:298

llvm::AMDGPU::copyMetadataForWidenedLoad
void copyMetadataForWidenedLoad(LoadInst &Dest, const LoadInst &Source)
Definition AMDGPUMemoryUtils.cpp:34

llvm::ARCCC::Z
@ Z
Definition ARCInfo.h:41

llvm::BitmaskEnumDetail::Mask
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition BitmaskEnum.h:126

llvm::CallingConv::ID
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24

llvm::CallingConv::C
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34

llvm::ISD::ConstantFP
@ ConstantFP
Definition ISDOpcodes.h:87

llvm::ISD::FMAD
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:522

llvm::ISD::Constant
@ Constant
Definition ISDOpcodes.h:86

llvm::Intrinsic::getOrInsertDeclaration
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
Definition Intrinsics.cpp:780

llvm::Intrinsic::ID
unsigned ID
Definition GenericSSAContext.h:28

llvm::PatternMatchHelpers::m_CombineOr
match_combine_or< Ty... > m_CombineOr(const Ty &...Ps)
Combine pattern matchers matching any of Ps patterns.
Definition PatternMatchHelpers.h:56

llvm::PatternMatch
Definition PatternMatch.h:51

llvm::PatternMatch::m_AllOnes
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
Definition PatternMatch.h:492

llvm::PatternMatch::m_UnordFMin
MaxMin_match< FCmpInst, LHS, RHS, ufmin_pred_ty > m_UnordFMin(const LHS &L, const RHS &R)
Match an 'unordered' floating point minimum function.
Definition PatternMatch.h:2640

llvm::PatternMatch::m_FCmp
CmpClass_match< LHS, RHS, FCmpInst > m_FCmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
Definition PatternMatch.h:1732

llvm::PatternMatch::m_FMinNum_or_FMinimumNum
match_combine_or< typename m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty, typename m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty > m_FMinNum_or_FMinimumNum(const Opnd0 &Op0, const Opnd1 &Op1)
Definition PatternMatch.h:3007

llvm::PatternMatch::m_FSub
BinaryOp_match< LHS, RHS, Instruction::FSub > m_FSub(const LHS &L, const RHS &R)
Definition PatternMatch.h:1172

llvm::PatternMatch::match
bool match(Val *V, const Pattern &P)
Definition PatternMatch.h:53

llvm::PatternMatch::m_Deferred
match_deferred< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
Definition PatternMatch.h:951

llvm::PatternMatch::m_Specific
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition PatternMatch.h:943

llvm::PatternMatch::m_APFloatAllowPoison
ap_match< APFloat > m_APFloatAllowPoison(const APFloat *&Res)
Match APFloat while allowing poison in splat vector constants.
Definition PatternMatch.h:284

llvm::PatternMatch::m_Intrinsic
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
Definition PatternMatch.h:2848

llvm::PatternMatch::m_Select
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
Definition PatternMatch.h:1900

llvm::PatternMatch::m_FMinimum
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinimum(const Opnd0 &Op0, const Opnd1 &Op1)
Definition PatternMatch.h:2968

llvm::PatternMatch::m_Value
auto m_Value()
Match an arbitrary value and ignore it.
Definition PatternMatch.h:135

llvm::PatternMatch::m_Mul
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
Definition PatternMatch.h:1220

llvm::PatternMatch::m_NonNaN
cstfp_pred_ty< is_nonnan > m_NonNaN()
Match a non-NaN FP constant.
Definition PatternMatch.h:700

llvm::PatternMatch::m_ZExt
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
Definition PatternMatch.h:2227

llvm::PatternMatch::m_PosInf
cstfp_pred_ty< is_signed_inf< false > > m_PosInf()
Match a positive infinity FP constant.
Definition PatternMatch.h:719

llvm::PatternMatch::m_PosZeroFP
cstfp_pred_ty< is_pos_zero_fp > m_PosZeroFP()
Match a floating-point positive zero.
Definition PatternMatch.h:774

llvm::PatternMatch::m_SExt
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
Definition PatternMatch.h:2221

llvm::PatternMatch::m_Zero
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
Definition PatternMatch.h:591

llvm::PatternMatch::m_FAbs
m_Intrinsic_Ty< Opnd0 >::Ty m_FAbs(const Opnd0 &Op0)
Definition PatternMatch.h:2940

llvm::RISCVFenceField::R
@ R
Definition RISCVBaseInfo.h:490

llvm::cl::ReallyHidden
@ ReallyHidden
Definition CommandLine.h:139

llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition CommandLine.h:444

llvm::lltok::APFloat
@ APFloat
Definition LLToken.h:533

llvm::logicalview::LVAttributeKind::Zero
@ Zero
Definition LVOptions.h:130

llvm::lsp::MessageType::Log
@ Log
Definition Protocol.h:1295

llvm::mdconst::extract
std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > extract(Y &&MD)
Extract a Value from Metadata.
Definition Metadata.h:668

llvm::ms_demangle::QualifierMangleMode::Result
@ Result
Definition MicrosoftDemangle.h:132

llvm::numbers::ln2
constexpr double ln2
Definition STLForwardCompat.h:66

llvm::numbers::ln10
constexpr double ln10
Definition STLForwardCompat.h:67

llvm::tgtok::TrueVal
@ TrueVal
Definition TGLexer.h:57

llvm::tgtok::FalseVal
@ FalseVal
Definition TGLexer.h:58

llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition FunctionInfo.h:25

llvm::UniformityInfo
GenericUniformityInfo< SSAContext > UniformityInfo
Definition UniformityAnalysis.h:25

llvm::Value
FunctionAddr VTableAddr Value
Definition InstrProf.h:137

llvm::computeKnownFPClass
LLVM_ABI KnownFPClass computeKnownFPClass(const Value *V, const APInt &DemandedElts, FPClassTest InterestedClasses, const SimplifyQuery &SQ, unsigned Depth=0)
Determine which floating-point classes are valid for V, and return them in KnownFPClass bit sets.
Definition ValueTracking.cpp:6124

llvm::all_of
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738

llvm::RecursivelyDeleteTriviallyDeadInstructions
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition Local.cpp:535

llvm::enumerate
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
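Given two or more input ranges, returns a new range whose values are tuples (A, B, C, ...), such that A is the 0-based index of the item in the sequence, and B, C, ..., are the values from the original input ranges.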
Definition STLExtras.h:2553

llvm::dyn_cast
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643

llvm::expandRemainderUpTo64Bits
LLVM_ABI bool expandRemainderUpTo64Bits(BinaryOperator *Rem)
Generate code to calculate the remainder of two integers, replacing Rem with the generated code.
Definition IntegerDivision.cpp:553

llvm::make_early_inc_range
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
Definition STLExtras.h:633

llvm::alignDown
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and is congruent to Skew modulo Align.
Definition MathExtras.h:546

llvm::ReplaceInstWithValue
LLVM_ABI void ReplaceInstWithValue(BasicBlock::iterator &BI, Value *V)
Replace all uses of an instruction (specified by BI) with a value, then remove and delete the original instruction.
Definition BasicBlockUtils.cpp:611

llvm::bit_ceil
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition bit.h:362

llvm::dyn_cast_or_null
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753

llvm::any_of
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745

llvm::isInstructionTriviallyDead
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
Definition Local.cpp:403

llvm::HexPrintStyle::Lower
@ Lower
Definition NativeFormatting.h:23

llvm::reverse
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407

llvm::expandDivisionUpTo64Bits
LLVM_ABI bool expandDivisionUpTo64Bits(BinaryOperator *Div)
Generate code to divide two integers, replacing Div with the generated code.
Definition IntegerDivision.cpp:648

llvm::FPClassTest
FPClassTest
Floating-point class tests, supported by 'is_fpclass' intrinsic.
Definition FloatingPointMode.h:338

llvm::fcSubnormal
@ fcSubnormal
Definition FloatingPointMode.h:355

llvm::fcNone
@ fcNone
Definition FloatingPointMode.h:339

llvm::fcZero
@ fcZero
Definition FloatingPointMode.h:356

llvm::fcPosInf
@ fcPosInf
Definition FloatingPointMode.h:350

llvm::computeKnownBits
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOne bit sets.
Definition ValueTracking.cpp:153

llvm::alignTo
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144

llvm::ConstantFoldCastOperand
LLVM_ABI Constant * ConstantFoldCastOperand(unsigned Opcode, Constant *C, Type *DestTy, const DataLayout &DL)
Attempt to constant fold a cast with the specified operand.
Definition ConstantFolding.cpp:1616

llvm::SmallVector
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference sizeof(SmallVector<T, 0>).
Definition SmallVector.h:1151

llvm::isa
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
Definition Casting.h:547

llvm::ConstantFoldBinaryOpOperands
LLVM_ABI Constant * ConstantFoldBinaryOpOperands(unsigned Opcode, Constant *LHS, Constant *RHS, const DataLayout &DL)
Attempt to constant fold a binary operation with the specified operands.
Definition ConstantFolding.cpp:1452

llvm::PackElem::Hi
@ Hi
Definition VECustomDAG.h:132

llvm::PackElem::Lo
@ Lo
Definition VECustomDAG.h:131

llvm::TTI
TargetTransformInfo TTI
Definition TargetTransformInfo.h:263

llvm::IRBuilder
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >

llvm::createAMDGPUCodeGenPreparePass
FunctionPass * createAMDGPUCodeGenPreparePass()
Definition AMDGPUCodeGenPrepare.cpp:2575

llvm::bit_cast
To bit_cast(const From &from) noexcept
Definition bit.h:90

llvm::Op
DWARFExpression::Operation Op
Definition DWARFExpressionPrinter.cpp:25

llvm::ComputeNumSignBits
LLVM_ABI unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return the number of times the sign bit of the register is replicated into the other bits.
Definition ValueTracking.cpp:337

llvm::cast
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559

llvm::isKnownNeverNaN
LLVM_ABI bool isKnownNeverNaN(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if the floating-point scalar value is not a NaN or if the floating-point vector value has no NaN elements.
Definition ValueTracking.cpp:6212

llvm::ComputeMaxSignificantBits
LLVM_ABI unsigned ComputeMaxSignificantBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Get the upper bound on bit size for this Value Op as a signed integer.
Definition ValueTracking.cpp:345

llvm::Log2
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition Alignment.h:197

llvm::isKnownToBeAPowerOfTwo
LLVM_ABI bool isKnownToBeAPowerOfTwo(const Value *V, const DataLayout &DL, bool OrZero=false, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return true if the given value is known to have exactly one bit set when defined.
Definition ValueTracking.cpp:269

llvm::FunctionAnalysisManager
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
Definition PassManager.h:586

llvm::getUnderlyingObjects
LLVM_ABI void getUnderlyingObjects(const Value *V, SmallVectorImpl< const Value * > &Objects, const LoopInfo *LI=nullptr, unsigned MaxLookup=MaxLookupSearchDepth)
This method is similar to getUnderlyingObject except that it can look through phi and select instruct...
Definition ValueTracking.cpp:6986

llvm::getCGPassBuilderOption
LLVM_ABI CGPassBuilderOption getCGPassBuilderOption()
Definition TargetPassConfig.cpp:499

N
#define N

llvm::DenormalMode::Input
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environment.
Definition FloatingPointMode.h:97

llvm::DenormalMode::inputsAreZero
constexpr bool inputsAreZero() const
Return true if input denormals must be implicitly treated as 0.
Definition FloatingPointMode.h:162

llvm::DenormalMode::getPreserveSign
static constexpr DenormalMode getPreserveSign()
Definition FloatingPointMode.h:119

llvm::KnownBits
Definition KnownBits.h:24

llvm::KnownBits::isNonNegative
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition KnownBits.h:106

llvm::KnownBits::countMaxActiveBits
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known bits.
Definition KnownBits.h:310

llvm::KnownBits::isNegative
bool isNegative() const
Returns true if this value is known to be negative.
Definition KnownBits.h:103

llvm::KnownFPClass
Definition KnownFPClass.h:26

llvm::KnownFPClass::isKnownNeverSubnormal
bool isKnownNeverSubnormal() const
Return true if it's known this can never be a subnormal.
Definition KnownFPClass.h:70

llvm::KnownFPClass::isKnownNeverLogicalZero
LLVM_ABI bool isKnownNeverLogicalZero(DenormalMode Mode) const
Return true if it's known this can never be interpreted as a zero.
Definition KnownFPClass.cpp:35

llvm::KnownFPClass::isKnownNeverPosInfinity
bool isKnownNeverPosInfinity() const
Return true if it's known this can never be +infinity.
Definition KnownFPClass.h:64

llvm::SIModeRegisterDefaults
Definition SIModeRegisterDefaults.h:20

llvm::SimplifyQuery
Definition SimplifyQuery.h:71

llvm::SimplifyQuery::DL
const DataLayout & DL
Definition SimplifyQuery.h:72

llvm::SimplifyQuery::DT
const DominatorTree * DT
Definition SimplifyQuery.h:74

llvm::SimplifyQuery::getWithInstruction
SimplifyQuery getWithInstruction(const Instruction *I) const
Definition SimplifyQuery.h:109

llvm::SimplifyQuery::AC
AssumptionCache * AC
Definition SimplifyQuery.h:75

llvm::cl::desc
Definition CommandLine.h:410