doxygen/AMDGPUTargetTransformInfo_8cpp_source.html

//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//

//

// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

// See https://llvm.org/LICENSE.txt for license information.

// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

//

//===----------------------------------------------------------------------===//

//

// \file

// This file implements a TargetTransformInfo analysis pass specific to the

// AMDGPU target machine. It uses the target's detailed information to provide

// more precise answers to certain TTI queries, while letting the target

// independent and default TTI implementations handle the rest.

//

//===----------------------------------------------------------------------===//


#include "AMDGPUTargetTransformInfo.h"

#include "AMDGPUSubtarget.h"

#include "AMDGPUTargetMachine.h"

#include "MCTargetDesc/AMDGPUMCTargetDesc.h"

#include "SIModeRegisterDefaults.h"

#include "llvm/ADT/SmallBitVector.h"

#include "llvm/Analysis/InlineCost.h"

#include "llvm/Analysis/LoopInfo.h"

#include "llvm/Analysis/ValueTracking.h"

#include "llvm/CodeGen/Analysis.h"

#include "llvm/IR/Function.h"

#include "llvm/IR/IRBuilder.h"

#include "llvm/IR/IntrinsicsAMDGPU.h"

#include "llvm/IR/PatternMatch.h"

#include "llvm/Support/KnownBits.h"

#include <optional>


using namespace llvm;


#define DEBUG_TYPE "AMDGPUtti"


// Hidden tuning knobs for the AMDGPU loop-unrolling heuristics.

// Threshold applied when a loop addresses private memory.
static cl::opt<unsigned> UnrollThresholdPrivate(
    "amdgpu-unroll-threshold-private", cl::Hidden, cl::init(2700),
    cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"));

// Threshold applied when a loop addresses local memory.
static cl::opt<unsigned> UnrollThresholdLocal(
    "amdgpu-unroll-threshold-local", cl::Hidden, cl::init(1000),
    cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"));

// Per-"if" bonus added to the unroll threshold.
static cl::opt<unsigned> UnrollThresholdIf(
    "amdgpu-unroll-threshold-if", cl::Hidden, cl::init(200),
    cl::desc("Unroll threshold increment for AMDGPU for each if statement "
             "inside loop"));

// Whether runtime unrolling stays enabled for loops that use local memory.
static cl::opt<bool> UnrollRuntimeLocal(
    "amdgpu-unroll-runtime-local", cl::Hidden, cl::init(true),
    cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"));

// Basic-block size limit used when deciding how far to analyze inner loops.
static cl::opt<unsigned> UnrollMaxBlockToAnalyze(
    "amdgpu-unroll-max-block-to-analyze", cl::Hidden, cl::init(32),
    cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"));

// Hidden tuning knobs for the AMDGPU inlining heuristics.

static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
                                       cl::Hidden, cl::init(4000),
                                       cl::desc("Cost of alloca argument"));

// Aggressive inlining for the alloca heuristic buys nothing once the scratch
// memory to eliminate is larger than what can be allocated into registers, so
// the alloca size considered is capped.
static cl::opt<unsigned>
    ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
                    cl::init(256),
                    cl::desc("Maximum alloca size to use for inline cost"));

// Keeps compile time reasonable by bounding function size after inlining.
static cl::opt<size_t> InlineMaxBB(
    "amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
    cl::desc("Maximum number of BBs allowed in a function after inlining"
             " (compile time constraint)"));


// Unroll factor used when lowering statically-sized memcpy/memmove/memset as
// a loop. This default unroll factor is based on microbenchmarks on gfx1030.
//
// Note: the description is built from adjacent string literals, so every
// fragment except the last must end in a space to keep words separated in the
// help output.
static cl::opt<unsigned> MemcpyLoopUnroll(
    "amdgpu-memcpy-loop-unroll",
    cl::desc("Unroll factor (affecting 4x32-bit operations) to use for memory "
             "operations when lowering statically-sized memcpy, memmove, or "
             "memset as a loop"),
    cl::init(16), cl::Hidden);


static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,

                              unsigned Depth = 0) {

  const Instruction *I = dyn_cast<Instruction>(Cond);

  if (!I)

    return false;


  if (!L->contains(I))

    return false;

  for (const Value *V : I->operand_values()) {

    if (const PHINode *PHI = dyn_cast<PHINode>(V)) {

      if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {

                  return SubLoop->contains(PHI); }))

        return true;

    } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))

      return true;

  }

  return false;

}


AMDGPUTTIImpl::AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)

    : BaseT(TM, F.getDataLayout()),

      TargetTriple(TM->getTargetTriple()),

      ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),

      TLI(ST->getTargetLowering()) {}


/// Fill in \p UP with AMDGPU-specific unrolling preferences for loop \p L.
///
/// The base threshold comes from the "amdgpu-unroll-threshold" function
/// attribute (default 300) and may be overridden by the
/// "amdgpu.loop.unroll.threshold" loop metadata. The blocks that belong
/// directly to \p L (not to one of its sub-loops) are then scanned:
///  - each conditional branch whose condition depends on a PHI local to the
///    loop adds UnrollThresholdIf to the threshold;
///  - GEPs into private, local or region address spaces may raise the
///    threshold to the corresponding private/local threshold.
/// Scanning stops as soon as the threshold reaches the larger of the
/// private/local thresholds. \p SE and \p ORE are not used by this body.
void AMDGPUTTIImpl::getUnrollingPreferences(

    Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP,

    OptimizationRemarkEmitter *ORE) const {

  const Function &F = *L->getHeader()->getParent();

  UP.Threshold =

      F.getFnAttributeAsParsedInteger("amdgpu-unroll-threshold", 300);

  UP.MaxCount = std::numeric_limits<unsigned>::max();

  UP.Partial = true;


  // Conditional branch in a loop back edge needs 3 additional exec

  // manipulations in average.

  UP.BEInsns += 3;


  // We want to run unroll even for the loops which have been vectorized.

  UP.UnrollVectorizedLoop = true;


  // Enable runtime unrolling for loops whose trip count is not known at

  // compile time.

  UP.Runtime = true;


  // Maximum alloca size that can fit registers. Reserve 16 registers.

  const unsigned MaxAlloca = (256 - 16) * 4;

  unsigned ThresholdPrivate = UnrollThresholdPrivate;

  unsigned ThresholdLocal = UnrollThresholdLocal;


  // If this loop has the amdgpu.loop.unroll.threshold metadata we will use the

  // provided threshold value as the default for Threshold

  if (MDNode *LoopUnrollThreshold =

          findOptionMDForLoop(L, "amdgpu.loop.unroll.threshold")) {

    if (LoopUnrollThreshold->getNumOperands() == 2) {

      ConstantInt *MetaThresholdValue = mdconst::extract_or_null<ConstantInt>(

          LoopUnrollThreshold->getOperand(1));

      if (MetaThresholdValue) {

        // We will also use the supplied value for PartialThreshold for now.

        // We may introduce additional metadata if it becomes necessary in the

        // future.

        UP.Threshold = MetaThresholdValue->getSExtValue();

        UP.PartialThreshold = UP.Threshold;

        // The metadata value also caps the private/local boost thresholds.

        ThresholdPrivate = std::min(ThresholdPrivate, UP.Threshold);

        ThresholdLocal = std::min(ThresholdLocal, UP.Threshold);

      }

    }

  }


  // Upper bound for any boost applied below; once reached we can stop early.

  unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);

  for (const BasicBlock *BB : L->getBlocks()) {

    const DataLayout &DL = BB->getDataLayout();

    // Counts local/region GEPs considered in this block (reset per block).

    unsigned LocalGEPsSeen = 0;


    if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {

               return SubLoop->contains(BB); }))

        continue; // Block belongs to an inner loop.


    for (const Instruction &I : *BB) {

      // Unroll a loop which contains an "if" statement whose condition

      // defined by a PHI belonging to the loop. This may help to eliminate

      // if region and potentially even PHI itself, saving on both divergence

      // and registers used for the PHI.

      // Add a small bonus for each of such "if" statements.

      if (const CondBrInst *Br = dyn_cast<CondBrInst>(&I)) {

        if (UP.Threshold < MaxBoost) {

          BasicBlock *Succ0 = Br->getSuccessor(0);

          BasicBlock *Succ1 = Br->getSuccessor(1);

          // Skip branches with an in-loop successor that is a loop-exiting

          // block.

          if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||

              (L->contains(Succ1) && L->isLoopExiting(Succ1)))

            continue;

          if (dependsOnLocalPhi(L, Br->getCondition())) {

            UP.Threshold += UnrollThresholdIf;

            LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold

                              << " for loop:\n"

                              << *L << " due to " << *Br << '\n');

            if (UP.Threshold >= MaxBoost)

              return;

          }

        }

        continue;

      }


      const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);

      if (!GEP)

        continue;


      // Pick the candidate threshold from the GEP's address space; other

      // address spaces get no boost.

      unsigned AS = GEP->getAddressSpace();

      unsigned Threshold = 0;

      if (AS == AMDGPUAS::PRIVATE_ADDRESS)

        Threshold = ThresholdPrivate;

      else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)

        Threshold = ThresholdLocal;

      else

        continue;


      // Nothing to gain if the current threshold is already at least as high.

      if (UP.Threshold >= Threshold)

        continue;


      if (AS == AMDGPUAS::PRIVATE_ADDRESS) {

        // Only static allocas with a known size of at most MaxAlloca qualify.

        const Value *Ptr = GEP->getPointerOperand();

        const AllocaInst *Alloca =

            dyn_cast<AllocaInst>(getUnderlyingObject(Ptr));

        if (!Alloca || !Alloca->isStaticAlloca())

          continue;

        auto AllocaSize = Alloca->getAllocationSize(DL);

        if (!AllocaSize || AllocaSize->getFixedValue() > MaxAlloca)

          continue;

      } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||

                 AS == AMDGPUAS::REGION_ADDRESS) {

        LocalGEPsSeen++;

        // Inhibit unroll for local memory if we have seen addressing not to

        // a variable, most likely we will be unable to combine it.

        // Do not unroll too deep inner loops for local memory to give a chance

        // to unroll an outer loop for a more important reason.

        if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||

            (!isa<GlobalVariable>(GEP->getPointerOperand()) &&

             !isa<Argument>(GEP->getPointerOperand())))

          continue;

        LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"

                          << *L << " due to LDS use.\n");

        UP.Runtime = UnrollRuntimeLocal;

      }


      // Check if GEP depends on a value defined by this loop itself.

      bool HasLoopDef = false;

      for (const Value *Op : GEP->operands()) {

        const Instruction *Inst = dyn_cast<Instruction>(Op);

        if (!Inst || L->isLoopInvariant(Op))

          continue;


        // Definitions inside a sub-loop do not count as defined by this loop.

        if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {

             return SubLoop->contains(Inst); }))

          continue;

        HasLoopDef = true;

        break;

      }

      if (!HasLoopDef)

        continue;


      // We want to do whatever we can to limit the number of alloca

      // instructions that make it through to the code generator.  allocas

      // require us to use indirect addressing, which is slow and prone to

      // compiler bugs.  If this loop does an address calculation on an

      // alloca ptr, then we want to use a higher than normal loop unroll

      // threshold. This will give SROA a better chance to eliminate these

      // allocas.

      //

      // We also want to have more unrolling for local memory to let ds

      // instructions with different offsets combine.

      //

      // Don't use the maximum allowed value here as it will make some

      // programs way too big.

      UP.Threshold = Threshold;

      LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold

                        << " for loop:\n"

                        << *L << " due to " << *GEP << '\n');

      if (UP.Threshold >= MaxBoost)

        return;

    }


    // For a small block of an innermost loop, increase the max trip count to

    // analyze for a better cost estimate in unroll. NOTE(review): the original

    // comment mentions "a GEP in a small BB", but the condition below checks

    // only the loop nesting and the block size, not whether a GEP was seen.

    if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze)

      UP.MaxIterationsCountToAnalyze = 32;

  }

}


/// Loop peeling has no AMDGPU-specific tuning here; the request is forwarded
/// unchanged to the base implementation.
void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::PeelingPreferences &PP) const {
  // Defer entirely to the base class.
  this->BaseT::getPeelingPreferences(L, SE, PP);
}


/// Return the size threshold reported for inlining memory intrinsics (1024).
uint64_t AMDGPUTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
  constexpr uint64_t InlineSizeThreshold = 1024;
  return InlineSizeThreshold;
}


// Subtarget features listed here are grouped by the reason they appear in the
// inline ignore list.
const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
    // Codegen control options which don't matter.
    AMDGPU::FeatureEnableLoadStoreOpt,
    AMDGPU::FeatureEnableSIScheduler,
    AMDGPU::FeatureEnableUnsafeDSOffsetFolding,
    AMDGPU::FeatureUseFlatForGlobal,
    AMDGPU::FeatureUnalignedScratchAccess,
    AMDGPU::FeatureUnalignedAccessMode,
    AMDGPU::FeatureAutoWaitcntBeforeBarrier,

    // Property of the kernel/environment which can't actually differ.
    AMDGPU::FeatureSGPRInitBug,
    AMDGPU::FeatureXNACK,
    AMDGPU::FeatureTrapHandler,

    // The default assumption needs to be ecc is enabled, but no directly
    // exposed operations depend on it, so it can be safely inlined.
    AMDGPU::FeatureSRAMECC,

    // Perf-tuning features
    AMDGPU::FeatureFastFMAF32,
    AMDGPU::FeatureHalfRate64Ops};


/// Build the GCN TTI for function \p F. Besides caching the subtarget and its
/// target lowering, this records whether \p F uses a graphics calling
/// convention and whether its default FP mode deviates from preserve-sign
/// denormal handling for f32 and for f64/f16.
GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
    : BaseT(TM, F.getDataLayout()),
      ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
      TLI(ST->getTargetLowering()), CommonTTI(TM, F),
      IsGraphics(AMDGPU::isGraphics(F.getCallingConv())) {
  const SIModeRegisterDefaults ModeDefaults(F, *ST);
  const DenormalMode PreserveSign = DenormalMode::getPreserveSign();

  // Any mode other than preserve-sign is treated as "has denormals".
  HasFP32Denormals = ModeDefaults.FP32Denormals != PreserveSign;
  HasFP64FP16Denormals = ModeDefaults.FP64FP16Denormals != PreserveSign;
}


/// Report whether branches may diverge. Without a function the answer is
/// true; otherwise it is true unless the subtarget says \p F executes with a
/// single lane.
bool GCNTTIImpl::hasBranchDivergence(const Function *F) const {
  if (!F)
    return true;

  const bool SingleLane = ST->isSingleLaneExecution(*F);
  return !SingleLane;
}


/// Return the number of registers to assume when vectorizing / interleaving.
///
/// NB: \p RCID is not a real register class ID; it is 0 or 1 for scalar or
/// vector registers (see getRegisterClassForType). "Vector" here means
/// registers that can hold multiple values, not VGPRs.
unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
  // This is the number of registers the loop vectorizer/interleaver will try
  // to fill, so a deliberately small value is reported to keep it from trying
  // to use every register.
  constexpr unsigned RegistersToFill = 4;
  return RegistersToFill;
}


/// Return the register width in bits for register kind \p K: 32 for scalars,
/// 64 or 32 for fixed-width vectors depending on packed FP32 support, and a
/// scalable size of 0 for scalable vectors.
TypeSize
GCNTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  using TTI = TargetTransformInfo;

  switch (K) {
  case TTI::RGK_Scalar:
    return TypeSize::getFixed(32);
  case TTI::RGK_FixedWidthVector: {
    if (ST->hasPackedFP32Ops())
      return TypeSize::getFixed(64);
    return TypeSize::getFixed(32);
  }
  case TTI::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }

  llvm_unreachable("Unsupported register kind");
}


/// Return the minimum vector register width in bits (32).
unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
  constexpr unsigned MinWidthInBits = 32;
  return MinWidthInBits;
}


/// Return the maximum vectorization factor for elements of \p ElemWidth bits
/// used by \p Opcode.
///
/// Loads and stores use 128 bits worth of elements. For all other opcodes the
/// result is the maximum number of elements that can be combined into a wider
/// bit value on this subtarget.
unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
  const bool IsMemoryOp =
      Opcode == Instruction::Load || Opcode == Instruction::Store;
  if (IsMemoryOp)
    return 32 * 4 / ElemWidth;

  switch (ElemWidth) {
  case 8:
    // Four 8-bit elements, when 16-bit instructions are available.
    return ST->has16BitInsts() ? 4 : 1;
  case 16:
    // Two 16-bit elements, when 16-bit instructions are available.
    return ST->has16BitInsts() ? 2 : 1;
  case 32:
    // Two 32-bit elements, when packed FP32 operations are available.
    return ST->hasPackedFP32Ops() ? 2 : 1;
  default:
    return 1;
  }
}


/// Return whether the SLP instruction-count check is preferred on this
/// subtarget.
bool GCNTTIImpl::preferSLPInstCountCheck() const {
  // On gfx94x and gfx950 the integer inst-count heuristic regresses: a
  // 2-element vector tree can pass the scalar/vector instruction count
  // comparison and still widen scalar moves after codegen (e.g. v_mov_b32 to
  // v_mov_b64). That raises register pressure and throughput cost without
  // lowering the total instruction count, so the check is turned off there.
  const bool IsAffectedSubtarget =
      ST->hasGFX940Insts() || ST->hasGFX950Insts();
  return !IsAffectedSubtarget;
}


/// Return the vector factor to use for a load chain. \p VF is kept unless the
/// combined width (VF * LoadSize) exceeds 128 bits and the vector's scalar
/// type is narrower than 32 bits, in which case the factor is limited to
/// 128 / LoadSize. \p ChainSizeInBytes is not consulted.
unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                                         unsigned ChainSizeInBytes,
                                         VectorType *VecTy) const {
  const unsigned TotalBits = VF * LoadSize;
  if (TotalBits <= 128)
    return VF;

  // TODO: Support element-size less than 32bit?
  if (VecTy->getScalarSizeInBits() >= 32)
    return VF;

  return 128 / LoadSize;
}


/// Return the vector factor to use for a store chain: \p VF, limited to
/// 128 / StoreSize when VF * StoreSize exceeds 128 bits. \p ChainSizeInBytes
/// and \p VecTy are not consulted.
unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
                                          unsigned ChainSizeInBytes,
                                          VectorType *VecTy) const {
  const unsigned TotalBits = VF * StoreSize;
  return TotalBits > 128 ? 128 / StoreSize : VF;
}


/// Return the vector register width in bits to assume for loads and stores in
/// address space \p AddrSpace.
unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
  switch (AddrSpace) {
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
  case AMDGPUAS::BUFFER_FAT_POINTER:
  case AMDGPUAS::BUFFER_RESOURCE:
  case AMDGPUAS::BUFFER_STRIDED_POINTER:
    return 512;
  case AMDGPUAS::PRIVATE_ADDRESS:
    // The private element size is given in bytes; convert it to bits.
    return 8 * ST->getMaxPrivateElementSize();
  default:
    // Common to flat, global, local and region. Assume for unknown addrspace.
    return 128;
  }
}


/// Return whether a memory access chain of \p ChainSizeInBytes bytes with
/// alignment \p Alignment in \p AddrSpace may be vectorized.
///
/// Only private accesses are restricted. Flat stores are allowed even though
/// they may have to be decomposed later if they can touch private memory:
/// there is not enough context here, and legalization can handle it.
bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
                                            Align Alignment,
                                            unsigned AddrSpace) const {
  if (AddrSpace != AMDGPUAS::PRIVATE_ADDRESS)
    return true;

  // Private accesses need dword alignment unless unaligned scratch access is
  // enabled ...
  const bool AlignmentOK =
      Alignment >= 4 || ST->hasUnalignedScratchAccessEnabled();
  if (!AlignmentOK)
    return false;

  // ... and must not exceed the maximum private element size.
  return ChainSizeInBytes <= ST->getMaxPrivateElementSize();
}


/// Load chains use the same legality rule as any other memory chain.
bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                             Align Alignment,
                                             unsigned AddrSpace) const {
  const bool IsLegal =
      isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
  return IsLegal;
}


/// Store chains use the same legality rule as any other memory chain.
bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                              Align Alignment,
                                              unsigned AddrSpace) const {
  const bool IsLegal =
      isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
  return IsLegal;
}


/// Return the size threshold reported for inlining memory intrinsics (1024).
uint64_t GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
  constexpr uint64_t InlineSizeThreshold = 1024;
  return InlineSizeThreshold;
}


/// Choose the type copied per iteration when a memcpy is lowered to a loop.
///
/// For an atomic copy this is an integer of \p AtomicElementSize bytes.
/// Otherwise it is a vector of i32: 4 elements (16 bytes) by default, or
/// MemcpyLoopUnroll times that when \p Length is a constant and the unroll
/// option is non-zero. The address space and alignment parameters are not
/// consulted.
Type *GCNTTIImpl::getMemcpyLoopLoweringType(
    LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
    unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
    std::optional<uint32_t> AtomicElementSize) const {
  if (AtomicElementSize)
    return Type::getIntNTy(Context, *AtomicElementSize * 8);

  // 16-byte accesses give the best copy throughput. When the length is a
  // known constant it pays off to return an even wider type and have
  // legalization split it into several accesses, which effectively unrolls
  // the copy loop. Legalization is also relied upon to break accesses down
  // further for subtargets and address spaces that need it.
  //
  // Variable lengths are not unrolled: that would hurt lengths smaller than,
  // or only slightly larger than, the returned type's size, and avoiding the
  // regression would need a more elaborate variable-length memcpy/memmove
  // lowering.
  const unsigned I32EltsPerAccess = 4;
  const bool Unroll = MemcpyLoopUnroll > 0 && isa<ConstantInt>(Length);

  unsigned NumI32Elts = I32EltsPerAccess;
  if (Unroll)
    NumI32Elts = MemcpyLoopUnroll * I32EltsPerAccess;

  return FixedVectorType::get(Type::getInt32Ty(Context), NumI32Elts);
}


/// Append to \p OpsOut the sequence of types used to copy the
/// \p RemainingBytes left over after the main memcpy loop.
///
/// For an atomic copy (\p AtomicCpySize set) the base implementation chooses
/// the types and nothing else is appended. Otherwise the residual is covered
/// greedily with the widest chunks first: <4 x i32>, i64, i32, i16 and
/// finally i8.
void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
    SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
    unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
    Align SrcAlign, Align DestAlign,
    std::optional<uint32_t> AtomicCpySize) const {

  if (AtomicCpySize) {
    BaseT::getMemcpyLoopResidualLoweringType(
        OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
        DestAlign, AtomicCpySize);
    // The base class has already described every remaining byte. Falling
    // through would append the non-atomic chunks as well, so the residual
    // would be listed twice.
    return;
  }

  // Emit as many chunks of the given type/size as still fit.
  auto EmitChunks = [&OpsOut, &RemainingBytes](Type *ChunkTy,
                                               unsigned ChunkBytes) {
    while (RemainingBytes >= ChunkBytes) {
      OpsOut.push_back(ChunkTy);
      RemainingBytes -= ChunkBytes;
    }
  };

  EmitChunks(FixedVectorType::get(Type::getInt32Ty(Context), 4), 16);
  EmitChunks(Type::getInt64Ty(Context), 8);
  EmitChunks(Type::getInt32Ty(Context), 4);
  EmitChunks(Type::getInt16Ty(Context), 2);
  EmitChunks(Type::getInt8Ty(Context), 1);
}


/// Return the maximum interleave factor for vectorization factor \p VF:
/// 1 for scalar (non-vectorized) loops, 8 otherwise.
unsigned GCNTTIImpl::getMaxInterleaveFactor(ElementCount VF) const {
  // Unrolling is disabled when the loop is not vectorized.
  // TODO: Enable this again.
  return VF.isScalar() ? 1 : 8;
}


/// Describe the memory behaviour of a target intrinsic call in \p Info.
///
/// Only amdgcn_ds_ordered_add and amdgcn_ds_ordered_swap are handled. For
/// them the pointer is argument 0, the atomic ordering is the constant in
/// argument 2 and the volatile flag is the constant in argument 4.
///
/// \returns true if \p Info was filled in; false for any other intrinsic or
/// when the ordering/volatile arguments are not valid constants.
bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
                                    MemIntrinsicInfo &Info) const {
  const Intrinsic::ID IID = Inst->getIntrinsicID();
  if (IID != Intrinsic::amdgcn_ds_ordered_add &&
      IID != Intrinsic::amdgcn_ds_ordered_swap)
    return false;

  auto *OrderingArg = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
  auto *VolatileArg = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
  if (!OrderingArg || !VolatileArg)
    return false; // Invalid.

  // Reject values beyond the strongest known atomic ordering.
  const unsigned MaxOrdering =
      static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent);
  unsigned OrderingVal = OrderingArg->getZExtValue();
  if (OrderingVal > MaxOrdering)
    return false;

  Info.PtrVal = Inst->getArgOperand(0);
  Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
  Info.ReadMem = true;
  Info.WriteMem = true;
  Info.IsVolatile = !VolatileArg->isZero();
  return true;
}


/// Estimate the cost of arithmetic instruction \p Opcode on type \p Ty.
///
/// The type is legalized first; costs are then scaled by the legalization
/// split count (LT.first) and by the number of elements of the legalized type
/// (NElts), which is halved (rounding up) where the subtarget has packed
/// instructions for the element type. Opcodes or types without a case here
/// fall back to the base implementation.
///
/// \p Args is only inspected for fdiv/frem (to detect a constant 1.0
/// numerator) and \p CxtI for fmul fusion and the approximate-function flag
/// on f32 fdiv/frem.
InstructionCost GCNTTIImpl::getArithmeticInstrCost(

    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,

    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,

    ArrayRef<const Value *> Args, const Instruction *CxtI) const {


  // Legalize the type.

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);


  // Because we don't have any legal vector operations, but the legal types, we

  // need to account for split vectors.

  unsigned NElts = LT.second.isVector() ?

    LT.second.getVectorNumElements() : 1;


  // Scalar element type of the legalized type; selects the cost formula.

  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;


  switch (ISD) {

  case ISD::SHL:

  case ISD::SRL:

  case ISD::SRA:

    if (SLT == MVT::i64)

      return get64BitInstrCost(CostKind) * LT.first * NElts;


    // With 16-bit instructions, i16 elements are counted in pairs.

    if (ST->has16BitInsts() && SLT == MVT::i16)

      NElts = (NElts + 1) / 2;


    // i32

    return getFullRateInstrCost() * LT.first * NElts;

  case ISD::ADD:

  case ISD::SUB:

  case ISD::AND:

  case ISD::OR:

  case ISD::XOR:

    if (SLT == MVT::i64) {

      // and, or and xor are typically split into 2 VALU instructions.

      return 2 * getFullRateInstrCost() * LT.first * NElts;

    }


    if (ST->has16BitInsts() && SLT == MVT::i16)

      NElts = (NElts + 1) / 2;


    return LT.first * NElts * getFullRateInstrCost();

  case ISD::MUL: {

    const int QuarterRateCost = getQuarterRateInstrCost(CostKind);

    if (SLT == MVT::i64) {

      // 64-bit multiply: four quarter-rate plus four full-rate operations.

      const int FullRateCost = getFullRateInstrCost();

      return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;

    }


    if (ST->has16BitInsts() && SLT == MVT::i16)

      NElts = (NElts + 1) / 2;


    // i32

    return QuarterRateCost * NElts * LT.first;

  }

  case ISD::FMUL:

    // Check possible fuse {fadd|fsub}(a,fmul(b,c)) and return zero cost for

    // fmul(b,c) supposing the fadd|fsub will get estimated cost for the whole

    // fused operation.

    if (CxtI && CxtI->hasOneUse())

      if (const auto *FAdd = dyn_cast<BinaryOperator>(*CxtI->user_begin())) {

        const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());

        if (OPC == ISD::FADD || OPC == ISD::FSUB) {

          if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)

            return TargetTransformInfo::TCC_Free;

          if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)

            return TargetTransformInfo::TCC_Free;


          // Estimate all types may be fused with contract/unsafe flags

          const TargetOptions &Options = TLI->getTargetMachine().Options;

          if (Options.AllowFPOpFusion == FPOpFusion::Fast ||

              (FAdd->hasAllowContract() && CxtI->hasAllowContract()))

            return TargetTransformInfo::TCC_Free;

        }

      }

    // A non-fused fmul is costed exactly like fadd/fsub below.

    [[fallthrough]];

  case ISD::FADD:

  case ISD::FSUB:

    // Packed f32, bf16 and f16 operations process two elements at a time.

    if (ST->hasPackedFP32Ops() && SLT == MVT::f32)

      NElts = (NElts + 1) / 2;

    if (ST->hasBF16PackedInsts() && SLT == MVT::bf16)

      NElts = (NElts + 1) / 2;

    if (SLT == MVT::f64)

      return LT.first * NElts * get64BitInstrCost(CostKind);


    if (ST->has16BitInsts() && SLT == MVT::f16)

      NElts = (NElts + 1) / 2;


    if (SLT == MVT::f32 || SLT == MVT::f16 || SLT == MVT::bf16)

      return LT.first * NElts * getFullRateInstrCost();

    break;

  case ISD::FDIV:

  case ISD::FREM:

    // FIXME: frem should be handled separately. The fdiv in it is most of it,

    // but the current lowering is also not entirely correct.

    if (SLT == MVT::f64) {

      int Cost = 7 * get64BitInstrCost(CostKind) +

                 getQuarterRateInstrCost(CostKind) +

                 3 * getHalfRateInstrCost(CostKind);

      // Add cost of workaround.

      if (!ST->hasUsableDivScaleConditionOutput())

        Cost += 3 * getFullRateInstrCost();


      return LT.first * Cost * NElts;

    }


    // A numerator of 1.0 is costed as a single transcendental instruction

    // for the types checked below.

    if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {

      // TODO: This is more complicated, unsafe flags etc.

      if ((SLT == MVT::f32 && !HasFP32Denormals) ||

          (SLT == MVT::f16 && ST->has16BitInsts())) {

        return LT.first * getTransInstrCost(CostKind) * NElts;

      }

    }


    if (SLT == MVT::f16 && ST->has16BitInsts()) {

      // 2 x v_cvt_f32_f16

      // f32 rcp

      // f32 fmul

      // v_cvt_f16_f32

      // f16 div_fixup

      int Cost = 4 * getFullRateInstrCost() + 2 * getTransInstrCost(CostKind);

      return LT.first * Cost * NElts;

    }


    if (SLT == MVT::f32 && (CxtI && CxtI->hasApproxFunc())) {

      // Fast unsafe fdiv lowering:

      // f32 rcp

      // f32 fmul

      int Cost = getTransInstrCost(CostKind) + getFullRateInstrCost();

      return LT.first * Cost * NElts;

    }


    if (SLT == MVT::f32 || SLT == MVT::f16) {

      // 4 more v_cvt_* insts without f16 insts support

      int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +

                 1 * getTransInstrCost(CostKind);


      if (!HasFP32Denormals) {

        // FP mode switches.

        Cost += 2 * getFullRateInstrCost();

      }


      return LT.first * NElts * Cost;

    }

    break;

  case ISD::FNEG:

    // Use the backend's estimation. If fneg is not free each element will cost

    // one additional instruction.

    return TLI->isFNegFree(SLT) ? 0 : NElts;

  default:

    break;

  }


  // Anything not costed above is handled by the base implementation.

  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,

                                       Args, CxtI);

}


// Tell whether intrinsic \p ID may profit from v2f16/v2i16 instructions, even
// when getting there needs nontrivial legalization.
static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
  switch (ID) {
  case Intrinsic::fma:
  case Intrinsic::fmuladd:
  case Intrinsic::copysign:
  case Intrinsic::minimumnum:
  case Intrinsic::maximumnum:
  case Intrinsic::canonicalize:
    return true;

  // For these there is a small benefit to using vector ops in the legalized
  // code.
  case Intrinsic::round:
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat:
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
  case Intrinsic::abs:
    return true;

  default:
    break;
  }

  return false;
}


InstructionCost


GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,

                                  TTI::TargetCostKind CostKind) const {

  switch (ICA.getID()) {

  case Intrinsic::fabs:

    // Free source modifier in the common case.

    return 0;

  case Intrinsic::amdgcn_workitem_id_x:

  case Intrinsic::amdgcn_workitem_id_y:

  case Intrinsic::amdgcn_workitem_id_z:

    // TODO: If hasPackedTID, or if the calling context is not an entry point

    // there may be a bit instruction.

    return 0;

  case Intrinsic::amdgcn_workgroup_id_x:

  case Intrinsic::amdgcn_workgroup_id_y:

  case Intrinsic::amdgcn_workgroup_id_z:

  case Intrinsic::amdgcn_lds_kernel_id:

  case Intrinsic::amdgcn_dispatch_ptr:

  case Intrinsic::amdgcn_dispatch_id:

  case Intrinsic::amdgcn_implicitarg_ptr:

  case Intrinsic::amdgcn_queue_ptr:

    // Read from an argument register.

    return 0;

  default:

    break;

  }


  Type *RetTy = ICA.getReturnType();


  Intrinsic::ID IID = ICA.getID();

  switch (IID) {

  case Intrinsic::exp:

  case Intrinsic::exp2:

  case Intrinsic::exp10: {

    // Legalize the type.

    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);

    MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

    unsigned NElts =

        LT.second.isVector() ? LT.second.getVectorNumElements() : 1;


    if (SLT == MVT::f64) {

      unsigned NumOps = 20;

      if (IID == Intrinsic::exp)

        ++NumOps;

      else if (IID == Intrinsic::exp10)

        NumOps += 3;


      return LT.first * NElts * NumOps * get64BitInstrCost(CostKind);

    }


    if (SLT == MVT::f32) {

      unsigned NumFullRateOps = 0;

      // v_exp_f32 (transcendental).

      unsigned NumTransOps = 1;


      if (!ICA.getFlags().approxFunc() && IID != Intrinsic::exp2) {

        // Non-AFN exp/exp10: range reduction + v_exp_f32 + ldexp +

        // overflow/underflow checks (lowerFEXP). Denorm is also handled.

        // FMA preamble: ~13 full-rate ops; non-FMA: ~17.

        NumFullRateOps = ST->hasFastFMAF32() ? 13 : 17;

      } else {

        if (IID == Intrinsic::exp) {

          // lowerFEXPUnsafe: fmul (base conversion) + v_exp_f32.

          NumFullRateOps = 1;

        } else if (IID == Intrinsic::exp10) {

          // lowerFEXP10Unsafe: 3 fmul + 2 v_exp_f32 (double-exp2).

          NumFullRateOps = 3;

          NumTransOps = 2;

        }

        // Denorm scaling adds setcc + select + fadd + select + fmul.

        if (HasFP32Denormals)

          NumFullRateOps += 5;

      }


      InstructionCost Cost = NumFullRateOps * getFullRateInstrCost() +

                             NumTransOps * getTransInstrCost(CostKind);

      return LT.first * NElts * Cost;

    }


    break;

  }

  case Intrinsic::log:

  case Intrinsic::log2:

  case Intrinsic::log10: {

    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);

    MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

    unsigned NElts =

        LT.second.isVector() ? LT.second.getVectorNumElements() : 1;


    if (SLT == MVT::f32) {

      unsigned NumFullRateOps = 0;


      if (IID == Intrinsic::log2) {

        // LowerFLOG2: just v_log_f32.

      } else if (ICA.getFlags().approxFunc()) {

        // LowerFLOGUnsafe: v_log_f32 + fmul (base conversion).

        NumFullRateOps = 1;

      } else {

        // LowerFLOGCommon non-AFN: v_log_f32 + extended-precision

        // multiply + finite check.

        NumFullRateOps = ST->hasFastFMAF32() ? 8 : 11;

      }


      if (HasFP32Denormals)

        NumFullRateOps += 5;


      InstructionCost Cost =

          NumFullRateOps * getFullRateInstrCost() + getTransInstrCost(CostKind);

      return LT.first * NElts * Cost;

    }


    break;

  }

  case Intrinsic::sin:

  case Intrinsic::cos: {

    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);

    MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

    unsigned NElts =

        LT.second.isVector() ? LT.second.getVectorNumElements() : 1;


    if (SLT == MVT::f32) {

      // LowerTrig: fmul(1/2pi) + v_sin/v_cos.

      unsigned NumFullRateOps = ST->hasTrigReducedRange() ? 2 : 1;


      InstructionCost Cost =

          NumFullRateOps * getFullRateInstrCost() + getTransInstrCost(CostKind);

      return LT.first * NElts * Cost;

    }


    break;

  }

  case Intrinsic::sqrt: {

    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);

    MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

    unsigned NElts =

        LT.second.isVector() ? LT.second.getVectorNumElements() : 1;


    if (SLT == MVT::f32) {

      unsigned NumFullRateOps = 0;


      if (!ICA.getFlags().approxFunc()) {

        // lowerFSQRTF32 non-AFN: v_sqrt_f32 + refinement + scale fixup.

        NumFullRateOps = HasFP32Denormals ? 17 : 16;

      }


      InstructionCost Cost =

          NumFullRateOps * getFullRateInstrCost() + getTransInstrCost(CostKind);

      return LT.first * NElts * Cost;

    }


    break;

  }

  default:

    break;

  }


  if (!intrinsicHasPackedVectorBenefit(ICA.getID()))

    return BaseT::getIntrinsicInstrCost(ICA, CostKind);


  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);

  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  unsigned NElts = LT.second.isVector() ? LT.second.getVectorNumElements() : 1;


  if ((ST->hasVOP3PInsts() &&

       (SLT == MVT::f16 || SLT == MVT::i16 ||

        (SLT == MVT::bf16 && ST->hasBF16PackedInsts()))) ||

      (ST->hasPackedFP32Ops() && SLT == MVT::f32))

    NElts = (NElts + 1) / 2;


  // TODO: Get more refined intrinsic costs?

  unsigned InstRate = getQuarterRateInstrCost(CostKind);


  switch (ICA.getID()) {

  case Intrinsic::fma:

  case Intrinsic::fmuladd:

    if (SLT == MVT::f64) {

      InstRate = get64BitInstrCost(CostKind);

      break;

    }


    if ((SLT == MVT::f32 && ST->hasFastFMAF32()) || SLT == MVT::f16)

      InstRate = getFullRateInstrCost();

    else {

      InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)

                                     : getQuarterRateInstrCost(CostKind);

    }

    break;

  case Intrinsic::copysign:

    return NElts * getFullRateInstrCost();

  case Intrinsic::minimumnum:

  case Intrinsic::maximumnum: {

    // Instruction + 2 canonicalizes. For cases that need type promotion, the

    // promotion takes the place of the canonicalize.

    unsigned NumOps = 3;

    if (const IntrinsicInst *II = ICA.getInst()) {

      // Directly legal with ieee=0

      // TODO: Not directly legal with strictfp

      if (fpenvIEEEMode(*II) == KnownIEEEMode::Off)

        NumOps = 1;

    }


    unsigned BaseRate =

        SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();

    InstRate = BaseRate * NumOps;

    break;

  }

  case Intrinsic::canonicalize: {

    InstRate =

        SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();

    break;

  }

  case Intrinsic::uadd_sat:

  case Intrinsic::usub_sat:

  case Intrinsic::sadd_sat:

  case Intrinsic::ssub_sat: {

    if (SLT == MVT::i16 || SLT == MVT::i32)

      InstRate = getFullRateInstrCost();


    static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};

    if (any_of(ValidSatTys, equal_to(LT.second)))

      NElts = 1;

    break;

  }

  case Intrinsic::abs:

    // Expansion takes 2 instructions for VALU

    if (SLT == MVT::i16 || SLT == MVT::i32)

      InstRate = 2 * getFullRateInstrCost();

    break;

  default:

    break;

  }


  return LT.first * NElts * InstRate;

}


/// Estimate the cost of a control-flow instruction.
///
/// \p Opcode selects the instruction kind; \p I, when non-null, must be an
/// instruction with that opcode. The size-oriented cost kinds
/// (TCK_CodeSize and TCK_SizeAndLatency) use smaller constants than the other
/// kinds. Opcodes not handled here are delegated to the base implementation.
InstructionCost GCNTTIImpl::getCFInstrCost(unsigned Opcode,
                                           TTI::TargetCostKind CostKind,
                                           const Instruction *I) const {
  assert((I == nullptr || I->getOpcode() == Opcode) &&
         "Opcode should reflect passed instruction.");

  const bool IsSizeCost = CostKind == TTI::TCK_CodeSize ||
                          CostKind == TTI::TCK_SizeAndLatency;
  // A conditional branch is assumed to need, on average, three additional
  // exec manipulation instructions.
  const int CondBranchCost = IsSizeCost ? 5 : 7;

  // A branch instruction takes about 4 slots on gfx900.
  if (Opcode == Instruction::UncondBr)
    return IsSizeCost ? 1 : 4;

  if (Opcode == Instruction::CondBr)
    return CondBranchCost;

  if (Opcode == Instruction::Switch) {
    // Every case (including the default) takes one compare plus one
    // conditional branch on average. Without an instruction to inspect,
    // assume four targets.
    unsigned NumTargets = 4;
    if (const auto *Sw = dyn_cast_or_null<SwitchInst>(I))
      NumTargets = Sw->getNumCases() + 1;
    return NumTargets * (CondBranchCost + 1);
  }

  if (Opcode == Instruction::Ret)
    return IsSizeCost ? 1 : 10;

  return BaseT::getCFInstrCost(Opcode, CostKind, I);
}


/// Estimate the cost of an arithmetic reduction over \p Ty.
///
/// Ordered reductions, subtargets without VOP3P instructions, and element
/// types that are not 16 bits wide are delegated to the base implementation.
/// Otherwise the cost is one full-rate instruction per legalized part.
InstructionCost
GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
                                       std::optional<FastMathFlags> FMF,
                                       TTI::TargetCostKind CostKind) const {
  if (TTI::requiresOrderedReduction(FMF))
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  // Packed math instructions only support 16-bit element types.
  const uint64_t ScalarBits = TLI->getValueType(DL, Ty).getScalarSizeInBits();
  const bool CanUsePacked16 = ST->hasVOP3PInsts() && ScalarBits == 16;
  if (!CanUsePacked16)
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  return getTypeLegalizationCost(Ty).first * getFullRateInstrCost();
}


/// Estimate the cost of a min/max reduction over \p Ty.
///
/// Subtargets without VOP3P instructions and element types that are not
/// 16 bits wide are delegated to the base implementation. Otherwise the cost
/// is one half-rate instruction per legalized part.
InstructionCost
GCNTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
                                   FastMathFlags FMF,
                                   TTI::TargetCostKind CostKind) const {
  // Packed math instructions only support 16-bit element types.
  const uint64_t ScalarBits = TLI->getValueType(DL, Ty).getScalarSizeInBits();
  const bool CanUsePacked16 = ST->hasVOP3PInsts() && ScalarBits == 16;
  if (!CanUsePacked16)
    return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);

  return getTypeLegalizationCost(Ty).first * getHalfRateInstrCost(CostKind);
}


/// Estimate the cost of inserting into or extracting from a vector.
///
/// Only ExtractElement and InsertElement are modelled here; every other
/// opcode is forwarded to the base implementation. An \p Index of ~0u denotes
/// a dynamic (non-constant) index.
InstructionCost GCNTTIImpl::getVectorInstrCost(
    unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, unsigned Index,
    const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
  const bool IsInsertOrExtract = Opcode == Instruction::ExtractElement ||
                                 Opcode == Instruction::InsertElement;
  if (!IsInsertOrExtract)
    return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1,
                                     VIC);

  unsigned EltBits =
      DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());

  // Dynamic indexing isn't free and is best avoided.
  if (Index == ~0u)
    return 2;

  // Extracts are just reads of a subregister, so are free. Inserts are
  // considered free because we don't want to have any cost for scalarizing
  // operations, and we don't have to copy into a different register class.
  if (EltBits >= 32)
    return 0;

  if (EltBits == 16 && Index == 0 && ST->has16BitInsts())
    return 0;

  // Some i8 inserts and extracts are free, so reduce their cost to avoid
  // scalarization. The zero-cost cases are limited to avoid adversely
  // impacting all i8 vectorization.
  if (EltBits == 8) {
    unsigned EltCount = cast<FixedVectorType>(ValTy)->getNumElements();
    if (EltCount >= 4 && isPowerOf2_32(EltCount)) {
      // Indices aligned to 32-bit boundaries (0, 4, 8, 12 for v16i8) access
      // the low byte of a VGPR and are free. Other indices require bit
      // manipulation (shifts/byte selects) and cost 1.
      return Index % 4 == 0 ? 0 : 1;
    }
  }

  return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1,
                                   VIC);
}


/// Analyze if the results of inline asm are divergent. If \p Indices is empty,

/// this is analyzing the collective result of all output registers. Otherwise,

/// this is only querying a specific result index if this returns multiple

/// registers in a struct.


bool GCNTTIImpl::isInlineAsmSourceOfDivergence(

  const CallInst *CI, ArrayRef<unsigned> Indices) const {

  // TODO: Handle complex extract indices

  if (Indices.size() > 1)

    return true;


  const DataLayout &DL = CI->getDataLayout();

  const SIRegisterInfo *TRI = ST->getRegisterInfo();

  TargetLowering::AsmOperandInfoVector TargetConstraints =

      TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI);


  const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];


  int OutputIdx = 0;

  for (auto &TC : TargetConstraints) {

    if (TC.Type != InlineAsm::isOutput)

      continue;


    // Skip outputs we don't care about.

    if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)

      continue;


    TLI->ComputeConstraintToUse(TC, SDValue());


    const TargetRegisterClass *RC = TLI->getRegForInlineAsmConstraint(

        TRI, TC.ConstraintCode, TC.ConstraintVT).second;


    // For AGPR constraints null is returned on subtargets without AGPRs, so

    // assume divergent for null.

    if (!RC || !TRI->isSGPRClass(RC))

      return true;

  }


  return false;

}


bool GCNTTIImpl::isReadRegisterSourceOfDivergence(

    const IntrinsicInst *ReadReg) const {

  Metadata *MD =

      cast<MetadataAsValue>(ReadReg->getArgOperand(0))->getMetadata();

  StringRef RegName =

      cast<MDString>(cast<MDNode>(MD)->getOperand(0))->getString();


  // Special case registers that look like VCC.

  MVT VT = MVT::getVT(ReadReg->getType());

  if (VT == MVT::i1)

    return true;


  // Special case scalar registers that start with 'v'.

  if (RegName.starts_with("vcc") || RegName.empty())

    return false;


  // VGPR or AGPR is divergent. There aren't any specially named vector

  // registers.

  return RegName[0] == 'v' || RegName[0] == 'a';

}


/// \returns true if the result of the value could potentially be

/// different across workitems in a wavefront.

bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {

  if (const Argument *A = dyn_cast<Argument>(V))

    return !AMDGPU::isArgPassedInSGPR(A);


  // Loads from the private and flat address spaces are divergent, because

  // threads can execute the load instruction with the same inputs and get

  // different results.

  //

  // All other loads are not divergent, because if threads issue loads with the

  // same arguments, they will always get the same result.

  if (const LoadInst *Load = dyn_cast<LoadInst>(V))

    return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||

           Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;


  // Atomics are divergent because they are executed sequentially: when an

  // atomic operation refers to the same address in each thread, then each

  // thread after the first sees the value written by the previous thread as

  // original value.

  if (isa<AtomicRMWInst, AtomicCmpXchgInst>(V))

    return true;


  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {

    Intrinsic::ID IID = Intrinsic->getIntrinsicID();

    switch (IID) {

    case Intrinsic::read_register:

      return isReadRegisterSourceOfDivergence(Intrinsic);

    case Intrinsic::amdgcn_addrspacecast_nonnull: {

      unsigned SrcAS =

          Intrinsic->getOperand(0)->getType()->getPointerAddressSpace();

      unsigned DstAS = Intrinsic->getType()->getPointerAddressSpace();

      return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&

             DstAS == AMDGPUAS::FLAT_ADDRESS &&

             ST->hasGloballyAddressableScratch();

    }

    case Intrinsic::amdgcn_workitem_id_y:

    case Intrinsic::amdgcn_workitem_id_z: {

      const Function *F = Intrinsic->getFunction();

      bool HasUniformYZ =

          ST->hasWavefrontsEvenlySplittingXDim(*F, /*RequitezUniformYZ=*/true);

      std::optional<unsigned> ThisDimSize = ST->getReqdWorkGroupSize(

          *F, IID == Intrinsic::amdgcn_workitem_id_y ? 1 : 2);

      return !HasUniformYZ && (!ThisDimSize || *ThisDimSize != 1);

    }

    default:

      return AMDGPU::isIntrinsicSourceOfDivergence(IID);

    }

  }


  // Assume all function calls are a source of divergence.

  if (const CallInst *CI = dyn_cast<CallInst>(V)) {

    if (CI->isInlineAsm())

      return isInlineAsmSourceOfDivergence(CI);

    return true;

  }


  // Assume all function calls are a source of divergence.

  if (isa<InvokeInst>(V))

    return true;


  // If the target supports globally addressable scratch, the mapping from

  // scratch memory to the flat aperture changes therefore an address space cast

  // is no longer uniform.

  if (auto *CastI = dyn_cast<AddrSpaceCastInst>(V)) {

    return CastI->getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS &&

           CastI->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&

           ST->hasGloballyAddressableScratch();

  }


  return false;

}


bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {

  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))

    return AMDGPU::isIntrinsicAlwaysUniform(Intrinsic->getIntrinsicID());


  if (const CallInst *CI = dyn_cast<CallInst>(V)) {

    if (CI->isInlineAsm())

      return !isInlineAsmSourceOfDivergence(CI);

    return false;

  }


  // In most cases TID / wavefrontsize is uniform.

  //

  // However, if a kernel has uneven dimesions we can have a value of

  // workitem-id-x divided by the wavefrontsize non-uniform. For example

  // dimensions (65, 2) will have workitems with address (64, 0) and (0, 1)

  // packed into a same wave which gives 1 and 0 after the division by 64

  // respectively.

  //

  // The X dimension doesn't reset within a wave if either both the Y

  // and Z dimensions are of length 1, or if the X dimension's required

  // size is a power of 2. Note, however, if the X dimension's maximum

  // size is a power of 2 < the wavefront size, division by the wavefront

  // size is guaranteed to yield 0, so this is also a no-reset case.

  bool XDimDoesntResetWithinWaves = false;

  if (auto *I = dyn_cast<Instruction>(V)) {

    const Function *F = I->getFunction();

    XDimDoesntResetWithinWaves = ST->hasWavefrontsEvenlySplittingXDim(*F);

  }

  using namespace llvm::PatternMatch;

  uint64_t C;

  if (match(V, m_LShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),

                      m_ConstantInt(C))) ||

      match(V, m_AShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),

                      m_ConstantInt(C)))) {

    return C >= ST->getWavefrontSizeLog2() && XDimDoesntResetWithinWaves;

  }


  Value *Mask;

  if (match(V, m_c_And(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),

                       m_Value(Mask)))) {

    return computeKnownBits(Mask, DL).countMinTrailingZeros() >=

               ST->getWavefrontSizeLog2() &&

           XDimDoesntResetWithinWaves;

  }


  const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);

  if (!ExtValue)

    return false;


  const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0));

  if (!CI)

    return false;


  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {

    switch (Intrinsic->getIntrinsicID()) {

    default:

      return false;

    case Intrinsic::amdgcn_if:

    case Intrinsic::amdgcn_else: {

      ArrayRef<unsigned> Indices = ExtValue->getIndices();

      return Indices.size() == 1 && Indices[0] == 1;

    }

    }

  }


  // If we have inline asm returning mixed SGPR and VGPR results, we inferred

  // divergent for the overall struct return. We need to override it in the

  // case we're extracting an SGPR component here.

  if (CI->isInlineAsm())

    return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());


  return false;

}


/// For the intrinsics listed below, append operand index 0 to \p OpIndexes
/// and return true. Every other intrinsic leaves \p OpIndexes untouched and
/// returns false.
bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                            Intrinsic::ID IID) const {
  switch (IID) {
  default:
    return false;
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_load_to_lds:
  case Intrinsic::amdgcn_make_buffer_rsrc:
    break;
  }

  // All supported intrinsics report their first operand.
  OpIndexes.push_back(0);
  return true;
}


/// Rewrite intrinsic call \p II so that its operand 0 becomes \p NewV.
///
/// \returns the replacement value: a boolean constant for the
/// is_shared / is_private queries, \p II itself after in-place mutation for
/// the other handled intrinsics, or nullptr when no rewrite is performed.
/// \p OldV is not consulted.
Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
                                                    Value *OldV,
                                                    Value *NewV) const {
  const Intrinsic::ID IID = II->getIntrinsicID();

  // Fetch (or create) the declaration of this intrinsic for the given
  // overload types, then mutate the call in place to take NewV as operand 0
  // and call that declaration.
  const auto Retarget = [&](ArrayRef<Type *> OverloadTys) -> Value * {
    Module *M = II->getModule();
    Function *NewDecl = Intrinsic::getOrInsertDeclaration(M, IID, OverloadTys);
    II->setArgOperand(0, NewV);
    II->setCalledFunction(NewDecl);
    return II;
  };

  switch (IID) {
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private: {
    // The query folds to a constant once the address space is known.
    const unsigned MatchingAS = IID == Intrinsic::amdgcn_is_shared
                                    ? AMDGPUAS::LOCAL_ADDRESS
                                    : AMDGPUAS::PRIVATE_ADDRESS;
    const unsigned ActualAS = NewV->getType()->getPointerAddressSpace();
    LLVMContext &Ctx = NewV->getType()->getContext();
    if (MatchingAS == ActualAS)
      return ConstantInt::getTrue(Ctx);
    return ConstantInt::getFalse(Ctx);
  }
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin_num: {
    Type *ResultTy = II->getType();
    Type *PtrTy = NewV->getType();
    // Only rewritten when the new pointer is in an extended global address
    // space.
    if (!AMDGPU::isExtendedGlobalAddrSpace(PtrTy->getPointerAddressSpace()))
      return nullptr;
    return Retarget({ResultTy, PtrTy, ResultTy});
  }
  case Intrinsic::amdgcn_load_to_lds:
    return Retarget({NewV->getType()});
  case Intrinsic::amdgcn_make_buffer_rsrc: {
    Type *PtrTy = NewV->getType();
    Type *ResultTy = II->getType();
    return Retarget({ResultTy, PtrTy});
  }
  default:
    return nullptr;
  }
}


/// Estimate the cost of a vector shuffle.
///
/// Source vectors that are not fixed-width are delegated to the base
/// implementation. For 8- and 16-bit element types on VOLCANIC_ISLANDS or
/// later, costs are counted per 32-bit register (EltsPerReg = 32 /
/// ScalarSize): first by shuffle kind, then, for kinds not handled by the
/// switch, from \p Mask when it is non-empty. All remaining cases fall back
/// to the base implementation. \p Args and \p CxtI are not used here and are
/// not forwarded to the base implementation.
InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,

                                           VectorType *DstTy, VectorType *SrcTy,

                                           ArrayRef<int> Mask,

                                           TTI::TargetCostKind CostKind,

                                           int Index, VectorType *SubTp,

                                           ArrayRef<const Value *> Args,

                                           const Instruction *CxtI) const {

  if (!isa<FixedVectorType>(SrcTy))

    return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,

                                 SubTp);


  // The helper may replace Kind based on the mask; Index and SubTp are passed
  // to it as well.
  Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);


  unsigned ScalarSize = DL.getTypeSizeInBits(SrcTy->getElementType());

  if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&

      (ScalarSize == 16 || ScalarSize == 8)) {

    // Larger vector widths may require additional instructions, but are

    // typically cheaper than scalarized versions.

    //

    // We assume that shuffling at a register granularity can be done for free.

    // This is not true for vectors fed into memory instructions, but it is

    // effectively true for all other shuffling. The emphasis of the logic here

    // is to assist generic transform in cleaning up / canonicalizing those

    // shuffles.


    // With op_sel VOP3P instructions freely can access the low half or high

    // half of a register, so any swizzle of two elements is free.

    if (auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcTy)) {

      unsigned NumSrcElts = SrcVecTy->getNumElements();

      if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumSrcElts == 2 &&

          (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Reverse ||

           Kind == TTI::SK_PermuteSingleSrc))

        return 0;

    }


    // Number of elements that fit in one 32-bit register (2 or 4 here).
    unsigned EltsPerReg = 32 / ScalarSize;

    switch (Kind) {

    case TTI::SK_Broadcast:

      // A single v_perm_b32 can be re-used for all destination registers.

      return 1;

    case TTI::SK_Reverse:

      // One instruction per register.

      if (auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy))

        return divideCeil(DstVecTy->getNumElements(), EltsPerReg);

      return InstructionCost::getInvalid();

    case TTI::SK_ExtractSubvector:

      if (Index % EltsPerReg == 0)

        return 0; // Shuffling at register granularity

      // Otherwise one instruction per destination register.
      if (auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy))

        return divideCeil(DstVecTy->getNumElements(), EltsPerReg);

      return InstructionCost::getInvalid();

    case TTI::SK_InsertSubvector: {

      auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy);

      if (!DstVecTy)

        return InstructionCost::getInvalid();

      unsigned NumDstElts = DstVecTy->getNumElements();

      unsigned NumInsertElts = cast<FixedVectorType>(SubTp)->getNumElements();

      // EndIndex is one past the last inserted element; the SubIdx values are
      // the positions of the begin/end within their registers.
      unsigned EndIndex = Index + NumInsertElts;

      unsigned BeginSubIdx = Index % EltsPerReg;

      unsigned EndSubIdx = EndIndex % EltsPerReg;

      unsigned Cost = 0;


      if (BeginSubIdx != 0) {

        // Need to shift the inserted vector into place. The cost is the number

        // of destination registers overlapped by the inserted vector.

        Cost = divideCeil(EndIndex, EltsPerReg) - (Index / EltsPerReg);

      }


      // If the last register overlap is partial, there may be three source

      // registers feeding into it; that takes an extra instruction.

      if (EndIndex < NumDstElts && BeginSubIdx < EndSubIdx)

        Cost += 1;


      return Cost;

    }

    case TTI::SK_Splice: {

      auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy);

      if (!DstVecTy)

        return InstructionCost::getInvalid();

      unsigned NumElts = DstVecTy->getNumElements();

      assert(NumElts == cast<FixedVectorType>(SrcTy)->getNumElements());

      // Determine the sub-region of the result vector that requires

      // sub-register shuffles / mixing.

      unsigned EltsFromLHS = NumElts - Index;

      bool LHSIsAligned = (Index % EltsPerReg) == 0;

      bool RHSIsAligned = (EltsFromLHS % EltsPerReg) == 0;

      if (LHSIsAligned && RHSIsAligned)

        return 0;

      if (LHSIsAligned && !RHSIsAligned)

        return divideCeil(NumElts, EltsPerReg) - (EltsFromLHS / EltsPerReg);

      if (!LHSIsAligned && RHSIsAligned)

        return divideCeil(EltsFromLHS, EltsPerReg);

      return divideCeil(NumElts, EltsPerReg);

    }

    default:

      break;

    }


    if (!Mask.empty()) {

      unsigned NumSrcElts = cast<FixedVectorType>(SrcTy)->getNumElements();


      // Generically estimate the cost by assuming that each destination

      // register is derived from sources via v_perm_b32 instructions if it

      // can't be copied as-is.

      //

      // For each destination register, derive the cost of obtaining it based

      // on the number of source registers that feed into it.

      unsigned Cost = 0;

      for (unsigned DstIdx = 0; DstIdx < Mask.size(); DstIdx += EltsPerReg) {

        // Distinct source register ids feeding this destination register.
        SmallVector<int, 4> Regs;

        // Stays true while every used lane comes from the same lane position
        // in its source register.
        bool Aligned = true;

        for (unsigned I = 0; I < EltsPerReg && DstIdx + I < Mask.size(); ++I) {

          int SrcIdx = Mask[DstIdx + I];

          // A mask entry of -1 contributes no source register.
          if (SrcIdx == -1)

            continue;

          int Reg;

          if (SrcIdx < (int)NumSrcElts) {

            Reg = SrcIdx / EltsPerReg;

            if (SrcIdx % EltsPerReg != I)

              Aligned = false;

          } else {

            // Indices at or beyond NumSrcElts get register ids offset by
            // NumSrcElts so they stay distinct from the ids used above.
            Reg = NumSrcElts + (SrcIdx - NumSrcElts) / EltsPerReg;

            if ((SrcIdx - NumSrcElts) % EltsPerReg != I)

              Aligned = false;

          }

          if (!llvm::is_contained(Regs, Reg))

            Regs.push_back(Reg);

        }

        // Several source registers cost one instruction per extra register; a
        // single misaligned source costs one instruction.
        if (Regs.size() >= 2)

          Cost += Regs.size() - 1;

        else if (!Aligned)

          Cost += 1;

      }

      return Cost;

    }

  }


  return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,

                               SubTp);

}


/// Decide which operands of \p I are worth sinking into the basic block of
/// \p I, appending their uses to \p Ops.
/// Sinking helps using several modifiers (like abs and neg) more often.
///
/// \returns true if \p Ops is non-empty afterwards.
bool GCNTTIImpl::isProfitableToSinkOperands(Instruction *I,
                                            SmallVectorImpl<Use *> &Ops) const {
  using namespace PatternMatch;

  // Returns true when the given operand use should be appended to Ops.
  const auto ShouldSink = [&](Use &Op) -> bool {
    Value *OpV = Op.get();

    // Ensure we are not already sinking this operand.
    if (any_of(Ops, [&](Use *Queued) { return Queued->get() == OpV; }))
      return false;

    if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value())))
      return true;

    // Check for zero-cost multiple use InsertElement/ExtractElement
    // instructions.
    if (auto *OpInst = dyn_cast<Instruction>(OpV)) {
      if (OpInst->getType()->isVectorTy() && OpInst->getNumOperands() > 1) {
        auto *VecOpInst = dyn_cast<Instruction>(OpInst->getOperand(0));
        if (VecOpInst && VecOpInst->hasOneUse())
          return false;

        if (getVectorInstrCost(OpInst->getOpcode(), OpInst->getType(),
                               TTI::TCK_RecipThroughput, 0,
                               OpInst->getOperand(0),
                               OpInst->getOperand(1)) == 0)
          return true;
      }
    }

    auto *Shuffle = dyn_cast<ShuffleVectorInst>(OpV);
    if (!Shuffle)
      return false;

    unsigned EltSize = DL.getTypeSizeInBits(
        cast<VectorType>(Shuffle->getType())->getElementType());

    // For i32 (or greater) shufflevectors, these will be lowered into a
    // series of insert / extract elements, which will be coalesced away.
    if (EltSize < 16 || !ST->has16BitInsts())
      return false;

    if (Shuffle->changesLength()) {
      if (Shuffle->increasesLength() && Shuffle->isIdentityWithPadding())
        return true;

      // Subvector extracts/inserts qualify only at an even sub-index.
      int NumSubElts, SubIndex;
      const bool IsSubvectorOp =
          Shuffle->isExtractSubvectorMask(SubIndex) ||
          Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex);
      if (IsSubvectorOp && !(SubIndex & 0x1))
        return true;
    }

    return Shuffle->isReverse() || Shuffle->isZeroEltSplat() ||
           Shuffle->isSingleSource();
  };

  for (Use &Op : I->operands())
    if (ShouldSink(Op))
      Ops.push_back(&Op);

  return !Ops.empty();
}


bool GCNTTIImpl::areInlineCompatible(const Function *Caller,

                                     const Function *Callee) const {

  const TargetMachine &TM = getTLI()->getTargetMachine();

  const GCNSubtarget *CallerST

    = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));

  const GCNSubtarget *CalleeST

    = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));


  const FeatureBitset &CallerBits = CallerST->getFeatureBits();

  const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();


  FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;

  FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;

  if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)

    return false;


  // FIXME: dx10_clamp can just take the caller setting, but there seems to be

  // no way to support merge for backend defined attributes.

  SIModeRegisterDefaults CallerMode(*Caller, *CallerST);

  SIModeRegisterDefaults CalleeMode(*Callee, *CalleeST);

  if (!CallerMode.isInlineCompatible(CalleeMode))

    return false;


  if (Callee->hasFnAttribute(Attribute::AlwaysInline) ||

      Callee->hasFnAttribute(Attribute::InlineHint))

    return true;


  // Hack to make compile times reasonable.

  if (InlineMaxBB) {

    // Single BB does not increase total BB amount.

    if (Callee->size() == 1)

      return true;

    size_t BBSize = Caller->size() + Callee->size() - 1;

    return BBSize <= InlineMaxBB;

  }


  return true;

}


static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB,

                                                   const SITargetLowering *TLI,

                                                   const GCNTTIImpl *TTIImpl) {

  const int NrOfSGPRUntilSpill = 26;

  const int NrOfVGPRUntilSpill = 32;


  const DataLayout &DL = TTIImpl->getDataLayout();


  unsigned adjustThreshold = 0;

  int SGPRsInUse = 0;

  int VGPRsInUse = 0;

  for (const Use &A : CB->args()) {

    SmallVector<EVT, 4> ValueVTs;

    ComputeValueVTs(*TLI, DL, A.get()->getType(), ValueVTs);

    for (auto ArgVT : ValueVTs) {

      unsigned CCRegNum = TLI->getNumRegistersForCallingConv(

          CB->getContext(), CB->getCallingConv(), ArgVT);

      if (AMDGPU::isArgPassedInSGPR(CB, CB->getArgOperandNo(&A)))

        SGPRsInUse += CCRegNum;

      else

        VGPRsInUse += CCRegNum;

    }

  }


  // The cost of passing function arguments through the stack:

  //  1 instruction to put a function argument on the stack in the caller.

  //  1 instruction to take a function argument from the stack in callee.

  //  1 instruction is explicitly take care of data dependencies in callee

  //  function.

  InstructionCost ArgStackCost(1);

  ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(

      Instruction::Store, Type::getInt32Ty(CB->getContext()), Align(4),

      AMDGPUAS::PRIVATE_ADDRESS, TTI::TCK_SizeAndLatency);

  ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(

      Instruction::Load, Type::getInt32Ty(CB->getContext()), Align(4),

      AMDGPUAS::PRIVATE_ADDRESS, TTI::TCK_SizeAndLatency);


  // The penalty cost is computed relative to the cost of instructions and does

  // not model any storage costs.

  adjustThreshold += std::max(0, SGPRsInUse - NrOfSGPRUntilSpill) *

                     ArgStackCost.getValue() * InlineConstants::getInstrCost();

  adjustThreshold += std::max(0, VGPRsInUse - NrOfVGPRUntilSpill) *

                     ArgStackCost.getValue() * InlineConstants::getInstrCost();

  return adjustThreshold;

}


static unsigned getCallArgsTotalAllocaSize(const CallBase *CB,

                                           const DataLayout &DL) {

  // If we have a pointer to a private array passed into a function

  // it will not be optimized out, leaving scratch usage.

  // This function calculates the total size in bytes of the memory that would

  // end in scratch if the call was not inlined.

  unsigned AllocaSize = 0;

  SmallPtrSet<const AllocaInst *, 8> AIVisited;

  for (Value *PtrArg : CB->args()) {

    PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());

    if (!Ty)

      continue;


    unsigned AddrSpace = Ty->getAddressSpace();

    if (AddrSpace != AMDGPUAS::FLAT_ADDRESS &&

        AddrSpace != AMDGPUAS::PRIVATE_ADDRESS)

      continue;


    const AllocaInst *AI = dyn_cast<AllocaInst>(getUnderlyingObject(PtrArg));

    if (!AI || !AI->isStaticAlloca() || !AIVisited.insert(AI).second)

      continue;


    if (auto Size = AI->getAllocationSize(DL))

      AllocaSize += Size->getFixedValue();

  }

  return AllocaSize;

}


/// \returns the base implementation's last-call-to-static bonus scaled by
/// this target's inlining threshold multiplier.
int GCNTTIImpl::getInliningLastCallToStaticBonus() const {
  const auto BaseBonus = BaseT::getInliningLastCallToStaticBonus();
  return BaseBonus * getInliningThresholdMultiplier();
}


/// \returns the inlining threshold increase for call \p CB: the
/// register-pressure based adjustment, plus ArgAllocaCost when the call
/// passes at least one non-empty static alloca.
unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
  unsigned Result = adjustInliningThresholdUsingCallee(CB, TLI, this);

  // Private object passed as arguments may end up in scratch usage if the call
  // is not inlined. Increase the inline threshold to promote inlining.
  const bool PassesPrivateObject = getCallArgsTotalAllocaSize(CB, DL) > 0;
  if (PassesPrivateObject)
    Result += ArgAllocaCost;

  return Result;
}


unsigned GCNTTIImpl::getCallerAllocaCost(const CallBase *CB,

                                         const AllocaInst *AI) const {


  // Below the cutoff, assume that the private memory objects would be

  // optimized

  auto AllocaSize = getCallArgsTotalAllocaSize(CB, DL);

  if (AllocaSize <= ArgAllocaCutoff)

    return 0;


  // Above the cutoff, we give a cost to each private memory object

  // depending its size. If the array can be optimized by SROA this cost is not

  // added to the total-cost in the inliner cost analysis.

  //

  // We choose the total cost of the alloca such that their sum cancels the

  // bonus given in the threshold (ArgAllocaCost).

  //

  //   Cost_Alloca_0 + ... + Cost_Alloca_N == ArgAllocaCost

  //

  // Awkwardly, the ArgAllocaCost bonus is multiplied by threshold-multiplier,

  // the single-bb bonus and the vector-bonus.

  //

  // We compensate the first two multipliers, by repeating logic from the

  // inliner-cost in here. The vector-bonus is 0 on AMDGPU.

  static_assert(InlinerVectorBonusPercent == 0, "vector bonus assumed to be 0");

  unsigned Threshold = ArgAllocaCost * getInliningThresholdMultiplier();


  bool SingleBB = none_of(*CB->getCalledFunction(), [](const BasicBlock &BB) {

    return BB.getTerminator()->getNumSuccessors() > 1;

  });

  if (SingleBB) {

    Threshold += Threshold / 2;

  }


  auto ArgAllocaSize = AI->getAllocationSize(DL);

  if (!ArgAllocaSize)

    return 0;


  // Attribute the bonus proportionally to the alloca size

  unsigned AllocaThresholdBonus =

      (Threshold * ArgAllocaSize->getFixedValue()) / AllocaSize;


  return AllocaThresholdBonus;

}


// Populate the unrolling preferences \p UP for loop \p L by delegating to
// CommonTTI with the same arguments.
void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,

                                         TTI::UnrollingPreferences &UP,

                                         OptimizationRemarkEmitter *ORE) const {

  CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);

}


// Populate the peeling preferences \p PP for loop \p L by delegating to
// CommonTTI with the same arguments.
void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,

                                       TTI::PeelingPreferences &PP) const {

  CommonTTI.getPeelingPreferences(L, SE, PP);

}


// Return the quarter-rate instruction cost for \p CostKind.
// NOTE(review): "Trans" presumably refers to transcendental instructions --
// confirm against the header.
int GCNTTIImpl::getTransInstrCost(TTI::TargetCostKind CostKind) const {

  return getQuarterRateInstrCost(CostKind);

}


/// Return the cost of a 64-bit instruction for \p CostKind, picking the
/// full-, half- or quarter-rate cost according to what the subtarget reports.
int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
  if (ST->hasFullRate64Ops())
    return getFullRateInstrCost();

  if (ST->hasHalfRate64Ops())
    return getHalfRateInstrCost(CostKind);

  // Neither full nor half rate: fall back to the quarter-rate cost.
  return getQuarterRateInstrCost(CostKind);
}


/// Return the base type-legalization cost for \p Ty, with an extra penalty of
/// one unit per started 256-bit chunk when the type is wider than 256 bits.
std::pair<InstructionCost, MVT>
GCNTTIImpl::getTypeLegalizationCost(Type *Ty) const {
  std::pair<InstructionCost, MVT> LegalizationCost =
      BaseT::getTypeLegalizationCost(Ty);
  auto SizeInBits = DL.getTypeSizeInBits(Ty);

  // Maximum load or store can handle 8 dwords for scalar and 4 for
  // vector ALU. Let's assume anything above 8 dwords is expensive
  // even if legal.
  if (SizeInBits > 256)
    LegalizationCost.first += (SizeInBits + 255) / 256;

  return LegalizationCost;
}


/// Return the prefetch distance: 128 when the subtarget reports prefetch
/// support, 0 otherwise.
unsigned GCNTTIImpl::getPrefetchDistance() const {
  if (!ST->hasPrefetch())
    return 0;
  return 128;
}


// Return true when address space \p AS satisfies
// AMDGPU::isFlatGlobalAddrSpace; only those address spaces are prefetched.
bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const {

  return AMDGPU::isFlatGlobalAddrSpace(AS);

}


void GCNTTIImpl::collectKernelLaunchBounds(

    const Function &F,

    SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const {

  SmallVector<unsigned> MaxNumWorkgroups = ST->getMaxNumWorkGroups(F);

  LB.push_back({"amdgpu-max-num-workgroups[0]", MaxNumWorkgroups[0]});

  LB.push_back({"amdgpu-max-num-workgroups[1]", MaxNumWorkgroups[1]});

  LB.push_back({"amdgpu-max-num-workgroups[2]", MaxNumWorkgroups[2]});

  std::pair<unsigned, unsigned> FlatWorkGroupSize =

      ST->getFlatWorkGroupSizes(F);

  LB.push_back({"amdgpu-flat-work-group-size[0]", FlatWorkGroupSize.first});

  LB.push_back({"amdgpu-flat-work-group-size[1]", FlatWorkGroupSize.second});

  std::pair<unsigned, unsigned> WavesPerEU = ST->getWavesPerEU(F);

  LB.push_back({"amdgpu-waves-per-eu[0]", WavesPerEU.first});

  LB.push_back({"amdgpu-waves-per-eu[1]", WavesPerEU.second});

}


/// Determine the IEEE mode that can be assumed at instruction \p I.
///
/// Subtargets without FeatureDX10ClampAndIEEEMode always report On. Otherwise
/// the result is Unknown when \p I has no parent function, follows the
/// function's "amdgpu-ieee" attribute when it is present, and else defaults to
/// Off for shader calling conventions and On for everything else.
GCNTTIImpl::KnownIEEEMode
GCNTTIImpl::fpenvIEEEMode(const Instruction &I) const {
  // Only mode on gfx1170+
  if (!ST->hasFeature(AMDGPU::FeatureDX10ClampAndIEEEMode))
    return KnownIEEEMode::On;

  const Function *Parent = I.getFunction();
  if (!Parent)
    return KnownIEEEMode::Unknown;

  const Attribute IEEEAttr = Parent->getFnAttribute("amdgpu-ieee");
  if (!IEEEAttr.isValid()) {
    // No explicit attribute: the default depends on the calling convention.
    if (AMDGPU::isShader(Parent->getCallingConv()))
      return KnownIEEEMode::Off;
    return KnownIEEEMode::On;
  }

  if (IEEEAttr.getValueAsBool())
    return KnownIEEEMode::On;
  return KnownIEEEMode::Off;
}


/// Return the cost of a memory operation on \p Src.
///
/// Loads and stores of vectors of i8 get a dedicated cost (for every cost
/// kind except latency) based on how many load/store vector registers of the
/// given address space the value spans; everything else uses the base
/// implementation.
InstructionCost GCNTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                            Align Alignment,
                                            unsigned AddressSpace,
                                            TTI::TargetCostKind CostKind,
                                            TTI::OperandValueInfo OpInfo,
                                            const Instruction *I) const {
  const bool IsLoadOrStore =
      Opcode == Instruction::Load || Opcode == Instruction::Store;

  VectorType *VecTy = dyn_cast<VectorType>(Src);
  if (VecTy && IsLoadOrStore && CostKind != TTI::TCK_Latency &&
      VecTy->getElementType()->isIntegerTy(8)) {
    // The numerator is the vector's size in bits minus one.
    return divideCeil(DL.getTypeSizeInBits(VecTy) - 1,
                      getLoadStoreVecRegBitWidth(AddressSpace));
  }

  return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind,
                                OpInfo, I);
}


/// Return the number of parts \p Tp is split into.
///
/// Vectors of i8 are counted in groups of four elements (four i8 values fit
/// in one 32-bit value); every other type uses the base implementation.
unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) const {
  VectorType *VecTy = dyn_cast<VectorType>(Tp);
  if (!VecTy || !VecTy->getElementType()->isIntegerTy(8))
    return BaseT::getNumberOfParts(Tp);

  // NOTE(review): the numerator is the element count minus one, so e.g. five
  // elements yield one part -- confirm this rounding is intended.
  const unsigned NumElts = VecTy->getElementCount().getFixedValue();
  return divideCeil(NumElts - 1, 4);
}


/// Classify the uniformity of \p V.
///
/// Calls to the amdgcn_wave_shuffle intrinsic are reported as Custom. Any
/// other value is AlwaysUniform or NeverUniform when the corresponding
/// predicate holds (checked in that order), and Default otherwise.
ValueUniformity GCNTTIImpl::getValueUniformity(const Value *V) const {
  const auto *Call = dyn_cast<IntrinsicInst>(V);
  if (Call && Call->getIntrinsicID() == Intrinsic::amdgcn_wave_shuffle)
    return ValueUniformity::Custom;

  if (isAlwaysUniform(V))
    return ValueUniformity::AlwaysUniform;

  if (isSourceOfDivergence(V))
    return ValueUniformity::NeverUniform;

  return ValueUniformity::Default;
}


/// Return the cost of using a scaled index in an addressing mode.
///
/// Without a base register or with a zero scale the query goes to the base
/// implementation. Otherwise the cost is 0 when the subtarget can fold the
/// scale (see below) and 1 when it cannot.
InstructionCost GCNTTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
                                                 StackOffset BaseOffset,
                                                 bool HasBaseReg, int64_t Scale,
                                                 unsigned AddrSpace) const {
  if (!HasBaseReg || Scale == 0)
    return BaseT::getScalingFactorCost(Ty, BaseGV, BaseOffset, HasBaseReg,
                                       Scale, AddrSpace);

  // gfx1250+ can fold base+scale*index when scale matches the memory access
  // size (scale_offset bit). Supported for flat/global/constant/scratch
  // (VMEM, max 128 bits) and constant_32bit (SMRD, capped to 128 bits here).
  const bool MayFoldScale = getST()->hasScaleOffset() && Ty && Ty->isSized() &&
                            (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace) ||
                             AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
                             AddrSpace == AMDGPUAS::PRIVATE_ADDRESS);
  if (!MayFoldScale)
    return 1;

  // The fold requires a store size of at most 16 bytes that equals the scale.
  const TypeSize AccessBytes = getDataLayout().getTypeStoreSize(Ty);
  const bool ScaleMatchesAccess =
      TypeSize::isKnownLE(AccessBytes, TypeSize::getFixed(16)) &&
      static_cast<int64_t>(AccessBytes.getFixedValue()) == Scale;
  if (ScaleMatchesAccess)
    return 0;

  return 1;
}


/// Return true when LSR cost \p A is strictly cheaper than \p B.
///
/// Costs are compared lexicographically on: Insns + ScaleCost, NumIVMuls,
/// AddRecCost, NumBaseAdds, SetupCost, ImmCost, NumRegs.
bool GCNTTIImpl::isLSRCostLess(const TTI::LSRCost &A,
                               const TTI::LSRCost &B) const {
  // Favor lower per-iteration work over preheader/setup costs.
  // AMDGPU lacks rich addressing modes, so ScaleCost is folded into the
  // effective instruction count (base+scale*index requires a separate ADD).
  const auto SortKey = [](const TTI::LSRCost &C) {
    const unsigned EffectiveInsns = C.Insns + C.ScaleCost;
    return std::make_tuple(EffectiveInsns, C.NumIVMuls, C.AddRecCost,
                           C.NumBaseAdds, C.SetupCost, C.ImmCost, C.NumRegs);
  };

  return SortKey(A) < SortKey(B);
}


// Report that the register count is not the major cost of LSR on this target;
// always returns false.
bool GCNTTIImpl::isNumRegsMajorCostOfLSR() const {

  // isLSRCostLess de-prioritizes register count; keep consistent.

  return false;

}


// Report that an LSR solution should be dropped when it is less profitable
// than the baseline; always returns true.
bool GCNTTIImpl::shouldDropLSRSolutionIfLessProfitable() const {

  // Prefer the baseline when LSR cannot clearly reduce per-iteration work.

  return true;

}


bool GCNTTIImpl::isUniform(const Instruction *I,

                           const SmallBitVector &UniformArgs) const {

  const IntrinsicInst *Intrinsic = cast<IntrinsicInst>(I);

  switch (Intrinsic->getIntrinsicID()) {

  case Intrinsic::amdgcn_wave_shuffle:

    // wave_shuffle(Value, Index): result is uniform when either Value or Index

    // is uniform.

    return UniformArgs[0] || UniformArgs[1];

  default:

    llvm_unreachable("unexpected intrinsic in isUniform");

  }

}


SDValue
return SDValue()

assert
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")

const
aarch64 promote const
Definition AArch64PromoteConstant.cpp:228

AMDGPUMCTargetDesc.h
Provides AMDGPU specific target descriptions.

PHI
Rewrite undef for PHI
Definition AMDGPURewriteUndefForPHI.cpp:98

AMDGPUSubtarget.h
Base class for AMDGPU specific classes of TargetSubtarget.

AMDGPUTargetMachine.h
The AMDGPU TargetMachine interface definition for hw codegen targets.

MemcpyLoopUnroll
static cl::opt< unsigned > MemcpyLoopUnroll("amdgpu-memcpy-loop-unroll", cl::desc("Unroll factor (affecting 4x32-bit operations) to use for memory " "operations when lowering statically-sized memcpy, memmove, or" "memset as a loop"), cl::init(16), cl::Hidden)

UnrollThresholdIf
static cl::opt< unsigned > UnrollThresholdIf("amdgpu-unroll-threshold-if", cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"), cl::init(200), cl::Hidden)

ArgAllocaCost
static cl::opt< unsigned > ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(4000), cl::desc("Cost of alloca argument"))

dependsOnLocalPhi
static bool dependsOnLocalPhi(const Loop *L, const Value *Cond, unsigned Depth=0)
Definition AMDGPUTargetTransformInfo.cpp:89

UnrollRuntimeLocal
static cl::opt< bool > UnrollRuntimeLocal("amdgpu-unroll-runtime-local", cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"), cl::init(true), cl::Hidden)

adjustInliningThresholdUsingCallee
static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB, const SITargetLowering *TLI, const GCNTTIImpl *TTIImpl)
Definition AMDGPUTargetTransformInfo.cpp:1584

ArgAllocaCutoff
static cl::opt< unsigned > ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256), cl::desc("Maximum alloca size to use for inline cost"))

InlineMaxBB
static cl::opt< size_t > InlineMaxBB("amdgpu-inline-max-bb", cl::Hidden, cl::init(1100), cl::desc("Maximum number of BBs allowed in a function after inlining" " (compile time constraint)"))

intrinsicHasPackedVectorBenefit
static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID)
Definition AMDGPUTargetTransformInfo.cpp:698

UnrollMaxBlockToAnalyze
static cl::opt< unsigned > UnrollMaxBlockToAnalyze("amdgpu-unroll-max-block-to-analyze", cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"), cl::init(32), cl::Hidden)

getCallArgsTotalAllocaSize
static unsigned getCallArgsTotalAllocaSize(const CallBase *CB, const DataLayout &DL)
Definition AMDGPUTargetTransformInfo.cpp:1630

UnrollThresholdPrivate
static cl::opt< unsigned > UnrollThresholdPrivate("amdgpu-unroll-threshold-private", cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"), cl::init(2700), cl::Hidden)

UnrollThresholdLocal
static cl::opt< unsigned > UnrollThresholdLocal("amdgpu-unroll-threshold-local", cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"), cl::init(1000), cl::Hidden)

AMDGPUTargetTransformInfo.h
This file provides a TargetTransformInfoImplBase-conforming object specific to the AMDGPU target machine.

DL
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Definition ARMSLSHardening.cpp:73

A
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")

B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")

CostKind
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))

GEP
Hexagon Common GEP
Definition HexagonCommonGEP.cpp:164

IRBuilder.h

Function.h

InlineCost.h

InlinePriorityMode::Size
@ Size
Definition InlineOrder.cpp:25

NumOps
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
Definition ItaniumDemangle.h:3473

Ops
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
Definition ItaniumDemangle.h:3391

KnownBits.h

RegName
#define RegName(no)

Options
static LVOptions Options
Definition LVOptions.cpp:25

LoopInfo.h

F
#define F(x, y, z)
Definition MD5.cpp:54

I
#define I(x, y, z)
Definition MD5.cpp:57

TRI
Register const TargetRegisterInfo * TRI
Definition MachineSink.cpp:2127

II
uint64_t IntrinsicInst * II
Definition NVVMIntrRange.cpp:46

PatternMatch.h

Cond
const SmallVectorImpl< MachineOperand > & Cond
Definition RISCVRedundantCopyElimination.cpp:73

Mode
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))

SIModeRegisterDefaults.h

getNumElements
static unsigned getNumElements(Type *Ty)
Definition SLPVectorizer.cpp:335

SmallBitVector.h
This file implements the SmallBitVector class.

LLVM_DEBUG
#define LLVM_DEBUG(...)
Definition Debug.h:119

ValueTracking.h

PointerType
Definition ItaniumDemangle.h:639

llvm::AMDGPUSubtarget::getReqdWorkGroupSize
std::optional< unsigned > getReqdWorkGroupSize(const Function &F, unsigned Dim) const
Definition AMDGPUSubtarget.cpp:231

llvm::AMDGPUSubtarget::VOLCANIC_ISLANDS
@ VOLCANIC_ISLANDS
Definition AMDGPUSubtarget.h:40

llvm::AMDGPUSubtarget::hasWavefrontsEvenlySplittingXDim
bool hasWavefrontsEvenlySplittingXDim(const Function &F, bool REquiresUniformYZ=false) const
Definition AMDGPUSubtarget.cpp:239

llvm::AMDGPUTTIImpl::getMaxMemIntrinsicInlineSizeThreshold
uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override
Definition AMDGPUTargetTransformInfo.cpp:282

llvm::AMDGPUTTIImpl::AMDGPUTTIImpl
AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
Definition AMDGPUTargetTransformInfo.cpp:108

llvm::AMDGPUTTIImpl::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
Definition AMDGPUTargetTransformInfo.cpp:277

llvm::AMDGPUTTIImpl::getUnrollingPreferences
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
Definition AMDGPUTargetTransformInfo.cpp:114

llvm::AMDGPUTargetMachine
Definition AMDGPUTargetMachine.h:34

llvm::AllocaInst
an instruction to allocate memory on the stack
Definition Instructions.h:65

llvm::AllocaInst::isStaticAlloca
LLVM_ABI bool isStaticAlloca() const
Return true if this alloca is in the entry block of the function and is a constant size.
Definition Instructions.cpp:1323

llvm::AllocaInst::getAllocationSize
LLVM_ABI std::optional< TypeSize > getAllocationSize(const DataLayout &DL) const
Get allocation size in bytes.
Definition Instructions.cpp:65

llvm::Argument
This class represents an incoming formal argument to a Function.
Definition Argument.h:32

llvm::ArrayRef
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40

llvm::ArrayRef::size
size_t size() const
Get the array size.
Definition ArrayRef.h:141

llvm::ArrayRef::empty
bool empty() const
Check if the array is empty.
Definition ArrayRef.h:136

llvm::Attribute
Functions, function parameters, and return types can have attributes to indicate how they should be t...
Definition Attributes.h:105

llvm::Attribute::getValueAsBool
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
Definition Attributes.cpp:391

llvm::Attribute::isValid
bool isValid() const
Return true if the attribute is any kind of attribute.
Definition Attributes.h:261

llvm::BasicBlock
LLVM Basic Block Representation.
Definition BasicBlock.h:62

llvm::BasicTTIImplBase< GCNTTIImpl >::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
Definition BasicTTIImpl.h:1048

llvm::BasicTTIImplBase< GCNTTIImpl >::getMinMaxReductionCost
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
Definition BasicTTIImpl.h:3386

llvm::BasicTTIImplBase< GCNTTIImpl >::getCFInstrCost
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
Definition BasicTTIImpl.h:1403

llvm::BasicTTIImplBase< GCNTTIImpl >::getNumberOfParts
unsigned getNumberOfParts(Type *Tp) const override
Definition BasicTTIImpl.h:3236

llvm::BasicTTIImplBase< GCNTTIImpl >::improveShuffleKindFromMask
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
Definition BasicTTIImpl.h:1126

llvm::BasicTTIImplBase< GCNTTIImpl >::getShuffleCost
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
Definition BasicTTIImpl.h:1182

llvm::BasicTTIImplBase< GCNTTIImpl >::getArithmeticReductionCost
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
Definition BasicTTIImpl.h:3374

llvm::BasicTTIImplBase< GCNTTIImpl >::getScalingFactorCost
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const override
Definition BasicTTIImpl.h:534

llvm::BasicTTIImplBase< AMDGPUTTIImpl >::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
Definition BasicTTIImpl.h:786

llvm::BasicTTIImplBase< GCNTTIImpl >::getTypeLegalizationCost
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Definition BasicTTIImpl.h:1012

llvm::BasicTTIImplBase< GCNTTIImpl >::getVectorInstrCost
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
Definition BasicTTIImpl.h:1461

llvm::BasicTTIImplBase< GCNTTIImpl >::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
Definition BasicTTIImpl.h:1746

llvm::BasicTTIImplBase< AMDGPUTTIImpl >::DL
const DataLayout & DL

llvm::BasicTTIImplBase< GCNTTIImpl >::getMemoryOpCost
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
Definition BasicTTIImpl.h:1547

llvm::CallBase
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition InstrTypes.h:1181

llvm::CallBase::isInlineAsm
bool isInlineAsm() const
Check if this call is an inline asm statement.
Definition InstrTypes.h:1484

llvm::CallBase::getCalledFunction
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition InstrTypes.h:1417

llvm::CallBase::getCallingConv
CallingConv::ID getCallingConv() const
Definition InstrTypes.h:1475

llvm::CallBase::getArgOperand
Value * getArgOperand(unsigned i) const
Definition InstrTypes.h:1361

llvm::CallBase::args
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition InstrTypes.h:1352

llvm::CallBase::getArgOperandNo
unsigned getArgOperandNo(const Use *U) const
Given a use for an arg operand, get the arg operand number that corresponds to it.
Definition InstrTypes.h:1392

llvm::CallInst
This class represents a function call, abstracting a target machine's calling convention.
Definition Instructions.h:1531

llvm::CondBrInst
Conditional Branch instruction.
Definition Instructions.h:3223

llvm::ConstantInt
This is the shared class of boolean and integer constants.
Definition Constants.h:87

llvm::ConstantInt::getTrue
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
Definition Constants.cpp:893

llvm::ConstantInt::getFalse
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
Definition Constants.cpp:900

llvm::ConstantInt::getSExtValue
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition Constants.h:174

llvm::DataLayout
A parsed version of the target data layout string, and methods for querying it.
Definition DataLayout.h:64

llvm::DataLayout::getTypeStoreSize
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition DataLayout.h:579

llvm::ElementCount
Definition TypeSize.h:298

llvm::ElementCount::isScalar
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320

llvm::ExtractValueInst::getIndices
ArrayRef< unsigned > getIndices() const
Definition Instructions.h:2510

llvm::FastMathFlags
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23

llvm::FastMathFlags::approxFunc
bool approxFunc() const
Definition FMF.h:70

llvm::FeatureBitset
Container class for subtarget features.
Definition SubtargetFeature.h:42

llvm::FixedVectorType::get
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:869

llvm::Function
Definition Function.h:65

llvm::GCNSubtarget
Definition GCNSubtarget.h:45

llvm::GCNTTIImpl
Definition AMDGPUTargetTransformInfo.h:63

llvm::GCNTTIImpl::GCNTTIImpl
GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
Definition AMDGPUTargetTransformInfo.cpp:305

llvm::GCNTTIImpl::getLoadStoreVecRegBitWidth
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const override
Definition AMDGPUTargetTransformInfo.cpp:389

llvm::GCNTTIImpl::getScalingFactorCost
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const override
Definition AMDGPUTargetTransformInfo.cpp:1842

llvm::GCNTTIImpl::getShuffleCost
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
Definition AMDGPUTargetTransformInfo.cpp:1333

llvm::GCNTTIImpl::getMemoryOpCost
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
Account for loads of i8 vector types to have reduced cost.
Definition AMDGPUTargetTransformInfo.cpp:1795

llvm::GCNTTIImpl::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
Definition AMDGPUTargetTransformInfo.cpp:539

llvm::GCNTTIImpl::collectKernelLaunchBounds
void collectKernelLaunchBounds(const Function &F, SmallVectorImpl< std::pair< StringRef, int64_t > > &LB) const override
Definition AMDGPUTargetTransformInfo.cpp:1762

llvm::GCNTTIImpl::isUniform
bool isUniform(const Instruction *I, const SmallBitVector &UniformArgs) const override
Definition AMDGPUTargetTransformInfo.cpp:1889

llvm::GCNTTIImpl::KnownIEEEMode
KnownIEEEMode
Definition AMDGPUTargetTransformInfo.h:304

llvm::GCNTTIImpl::KnownIEEEMode::On
@ On
Definition AMDGPUTargetTransformInfo.h:304

llvm::GCNTTIImpl::KnownIEEEMode::Unknown
@ Unknown
Definition AMDGPUTargetTransformInfo.h:304

llvm::GCNTTIImpl::KnownIEEEMode::Off
@ Off
Definition AMDGPUTargetTransformInfo.h:304

llvm::GCNTTIImpl::isLegalToVectorizeStoreChain
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const override
Definition AMDGPUTargetTransformInfo.cpp:425

llvm::GCNTTIImpl::isInlineAsmSourceOfDivergence
bool isInlineAsmSourceOfDivergence(const CallInst *CI, ArrayRef< unsigned > Indices={}) const
Analyze if the results of inline asm are divergent.
Definition AMDGPUTargetTransformInfo.cpp:1060

llvm::GCNTTIImpl::isReadRegisterSourceOfDivergence
bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const
Definition AMDGPUTargetTransformInfo.cpp:1096

llvm::GCNTTIImpl::getMaximumVF
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override
Definition AMDGPUTargetTransformInfo.cpp:348

llvm::GCNTTIImpl::getNumberOfRegisters
unsigned getNumberOfRegisters(unsigned RCID) const override
Definition AMDGPUTargetTransformInfo.cpp:320

llvm::GCNTTIImpl::isLegalToVectorizeLoadChain
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const override
Definition AMDGPUTargetTransformInfo.cpp:419

llvm::GCNTTIImpl::getStoreVectorFactor
unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned ChainSizeInBytes, VectorType *VecTy) const override
Definition AMDGPUTargetTransformInfo.cpp:379

llvm::GCNTTIImpl::isLegalToVectorizeMemChain
bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
Definition AMDGPUTargetTransformInfo.cpp:406

llvm::GCNTTIImpl::isLSRCostLess
bool isLSRCostLess(const TTI::LSRCost &A, const TTI::LSRCost &B) const override
Definition AMDGPUTargetTransformInfo.cpp:1865

llvm::GCNTTIImpl::shouldPrefetchAddressSpace
bool shouldPrefetchAddressSpace(unsigned AS) const override
Definition AMDGPUTargetTransformInfo.cpp:1758

llvm::GCNTTIImpl::getVectorInstrCost
InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
Definition AMDGPUTargetTransformInfo.cpp:1015

llvm::GCNTTIImpl::hasBranchDivergence
bool hasBranchDivergence(const Function *F=nullptr) const override
Definition AMDGPUTargetTransformInfo.cpp:316

llvm::GCNTTIImpl::rewriteIntrinsicWithAddressSpace
Value * rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const override
Definition AMDGPUTargetTransformInfo.cpp:1280

llvm::GCNTTIImpl::getCallerAllocaCost
unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const override
Definition AMDGPUTargetTransformInfo.cpp:1674

llvm::GCNTTIImpl::getMaxInterleaveFactor
unsigned getMaxInterleaveFactor(ElementCount VF) const override
Definition AMDGPUTargetTransformInfo.cpp:504

llvm::GCNTTIImpl::getMemcpyLoopResidualLoweringType
void getMemcpyLoopResidualLoweringType(SmallVectorImpl< Type * > &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicCpySize) const override
Definition AMDGPUTargetTransformInfo.cpp:462

llvm::GCNTTIImpl::getArithmeticReductionCost
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
Definition AMDGPUTargetTransformInfo.cpp:983

llvm::GCNTTIImpl::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
Get intrinsic cost based on arguments.
Definition AMDGPUTargetTransformInfo.cpp:720

llvm::GCNTTIImpl::getInliningThresholdMultiplier
unsigned getInliningThresholdMultiplier() const override
Definition AMDGPUTargetTransformInfo.h:261

llvm::GCNTTIImpl::getLoadVectorFactor
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, unsigned ChainSizeInBytes, VectorType *VecTy) const override
Definition AMDGPUTargetTransformInfo.cpp:368

llvm::GCNTTIImpl::getPrefetchDistance
unsigned getPrefetchDistance() const override
How much before a load we should place the prefetch instruction.
Definition AMDGPUTargetTransformInfo.cpp:1754

llvm::GCNTTIImpl::getCFInstrCost
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
Definition AMDGPUTargetTransformInfo.cpp:954

llvm::GCNTTIImpl::fpenvIEEEMode
KnownIEEEMode fpenvIEEEMode(const Instruction &I) const
Return KnownIEEEMode::On if we know if the use context can assume "amdgpu-ieee"="true" and KnownIEEEM...
Definition AMDGPUTargetTransformInfo.cpp:1779

llvm::GCNTTIImpl::adjustInliningThreshold
unsigned adjustInliningThreshold(const CallBase *CB) const override
Definition AMDGPUTargetTransformInfo.cpp:1663

llvm::GCNTTIImpl::isProfitableToSinkOperands
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Whether it is profitable to sink the operands of an Instruction I to the basic block of I.
Definition AMDGPUTargetTransformInfo.cpp:1477

llvm::GCNTTIImpl::getTgtMemIntrinsic
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const override
Definition AMDGPUTargetTransformInfo.cpp:513

llvm::GCNTTIImpl::areInlineCompatible
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
Definition AMDGPUTargetTransformInfo.cpp:1545

llvm::GCNTTIImpl::getMinMaxReductionCost
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
Try to calculate op costs for min/max reduction operations.
Definition AMDGPUTargetTransformInfo.cpp:1001

llvm::GCNTTIImpl::shouldDropLSRSolutionIfLessProfitable
bool shouldDropLSRSolutionIfLessProfitable() const override
Definition AMDGPUTargetTransformInfo.cpp:1884

llvm::GCNTTIImpl::getInliningLastCallToStaticBonus
int getInliningLastCallToStaticBonus() const override
Definition AMDGPUTargetTransformInfo.cpp:1658

llvm::GCNTTIImpl::collectFlatAddressOperands
bool collectFlatAddressOperands(SmallVectorImpl< int > &OpIndexes, Intrinsic::ID IID) const override
Definition AMDGPUTargetTransformInfo.cpp:1264

llvm::GCNTTIImpl::getValueUniformity
ValueUniformity getValueUniformity(const Value *V) const override
Definition AMDGPUTargetTransformInfo.cpp:1823

llvm::GCNTTIImpl::getNumberOfParts
unsigned getNumberOfParts(Type *Tp) const override
When counting parts on AMD GPUs, account for i8s being grouped together under a single i32 value.
Definition AMDGPUTargetTransformInfo.cpp:1813

llvm::GCNTTIImpl::preferSLPInstCountCheck
bool preferSLPInstCountCheck() const override
Definition AMDGPUTargetTransformInfo.cpp:359

llvm::GCNTTIImpl::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
Definition AMDGPUTargetTransformInfo.cpp:1724

llvm::GCNTTIImpl::getMinVectorRegisterBitWidth
unsigned getMinVectorRegisterBitWidth() const override
Definition AMDGPUTargetTransformInfo.cpp:344

llvm::GCNTTIImpl::getRegisterBitWidth
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const override
Definition AMDGPUTargetTransformInfo.cpp:332

llvm::GCNTTIImpl::isNumRegsMajorCostOfLSR
bool isNumRegsMajorCostOfLSR() const override
Definition AMDGPUTargetTransformInfo.cpp:1879

llvm::GCNTTIImpl::getUnrollingPreferences
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
Definition AMDGPUTargetTransformInfo.cpp:1718

llvm::GCNTTIImpl::getMemcpyLoopLoweringType
Type * getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicElementSize) const override
Definition AMDGPUTargetTransformInfo.cpp:435

llvm::GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold
uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override
Definition AMDGPUTargetTransformInfo.cpp:431

llvm::GetElementPtrInst
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition Instructions.h:968

llvm::GlobalValue
Definition GlobalValue.h:49

llvm::InlineAsm::isOutput
@ isOutput
Definition InlineAsm.h:99

llvm::InstructionCost
Definition InstructionCost.h:30

llvm::InstructionCost::getInvalid
static InstructionCost getInvalid(CostType Val=0)
Definition InstructionCost.h:82

llvm::InstructionCost::getValue
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
Definition InstructionCost.h:96

llvm::Instruction
Definition Instruction.h:70

llvm::Instruction::hasApproxFunc
LLVM_ABI bool hasApproxFunc() const LLVM_READONLY
Determine whether the approximate-math-functions flag is set.
Definition Instruction.cpp:709

llvm::Instruction::hasAllowContract
LLVM_ABI bool hasAllowContract() const LLVM_READONLY
Determine whether the allow-contract flag is set.
Definition Instruction.cpp:704

llvm::Instruction::getDataLayout
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
Definition Instruction.cpp:94

llvm::IntrinsicCostAttributes
Definition TargetTransformInfo.h:178

llvm::IntrinsicCostAttributes::getFlags
FastMathFlags getFlags() const
Definition TargetTransformInfo.h:212

llvm::IntrinsicCostAttributes::getReturnType
Type * getReturnType() const
Definition TargetTransformInfo.h:211

llvm::IntrinsicCostAttributes::getInst
const IntrinsicInst * getInst() const
Definition TargetTransformInfo.h:210

llvm::IntrinsicCostAttributes::getID
Intrinsic::ID getID() const
Definition TargetTransformInfo.h:209

llvm::IntrinsicInst
A wrapper class for inspecting calls to intrinsic functions.
Definition IntrinsicInst.h:49

llvm::IntrinsicInst::getIntrinsicID
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition IntrinsicInst.h:56

llvm::LLVMContext
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68

llvm::LoadInst
An instruction for reading from memory.
Definition Instructions.h:181

llvm::Loop
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40

llvm::MDNode
Metadata node.
Definition Metadata.h:1075

llvm::MVT
Machine Value Type.
Definition MachineValueType.h:36

llvm::MVT::SimpleValueType
SimpleValueType
Definition MachineValueType.h:38

llvm::MVT::getVT
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition ValueTypes.cpp:249

llvm::Metadata
Root of the metadata hierarchy.
Definition Metadata.h:64

llvm::Module
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67

llvm::OptimizationRemarkEmitter
The optimization diagnostic interface.
Definition OptimizationRemarkEmitter.h:33

llvm::PHINode
Definition Instructions.h:2661

llvm::SIRegisterInfo
Definition SIRegisterInfo.h:40

llvm::SITargetLowering
Definition SIISelLowering.h:32

llvm::SITargetLowering::getNumRegistersForCallingConv
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
Definition SIISelLowering.cpp:1175

llvm::ScalarEvolution
The main scalar evolution driver.
Definition ScalarEvolution.h:621

llvm::SmallBitVector
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
Definition SmallBitVector.h:35

llvm::SmallPtrSetImpl::insert
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition SmallPtrSet.h:387

llvm::SmallPtrSet
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
Definition SmallPtrSet.h:533

llvm::SmallVectorImpl
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition SmallVector.h:581

llvm::SmallVectorTemplateBase::push_back
void push_back(const T &Elt)
Definition SmallVector.h:423

llvm::SmallVectorTemplateCommon::size
size_t size() const
Definition SmallVector.h:83

llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition SmallVector.h:1225

llvm::StackOffset
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:30

llvm::StringRef
Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition StringRef.h:56

llvm::TargetLowering::AsmOperandInfoVector
std::vector< AsmOperandInfo > AsmOperandInfoVector
Definition TargetLowering.h:5320

llvm::TargetMachine
Primary interface to the complete machine description for the target machine.
Definition TargetMachine.h:83

llvm::TargetMachine::getSubtargetImpl
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
Definition TargetMachine.h:139

llvm::TargetOptions
Definition TargetOptions.h:119

llvm::TargetRegisterClass
Definition TargetRegisterInfo.h:45

llvm::TargetTransformInfoImplBase::getInliningLastCallToStaticBonus
virtual int getInliningLastCallToStaticBonus() const
Definition TargetTransformInfoImpl.h:98

llvm::TargetTransformInfoImplBase::getDataLayout
virtual const DataLayout & getDataLayout() const
Definition TargetTransformInfoImpl.h:51

llvm::TargetTransformInfoImplBase::getMemcpyLoopResidualLoweringType
virtual void getMemcpyLoopResidualLoweringType(SmallVectorImpl< Type * > &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicCpySize) const
Definition TargetTransformInfoImpl.h:1060

llvm::TargetTransformInfoImplBase::DL
const DataLayout & DL
Definition TargetTransformInfoImpl.h:40

llvm::TargetTransformInfo::VectorInstrContext
VectorInstrContext
Represents a hint about the context in which an insert/extract is used.
Definition TargetTransformInfo.h:1064

llvm::TargetTransformInfo::TargetCostKind
TargetCostKind
The kind of cost model.
Definition TargetTransformInfo.h:331

llvm::TargetTransformInfo::TCK_RecipThroughput
@ TCK_RecipThroughput
Reciprocal throughput.
Definition TargetTransformInfo.h:332

llvm::TargetTransformInfo::TCK_CodeSize
@ TCK_CodeSize
Instruction code size.
Definition TargetTransformInfo.h:334

llvm::TargetTransformInfo::TCK_SizeAndLatency
@ TCK_SizeAndLatency
The weighted sum of size and latency.
Definition TargetTransformInfo.h:335

llvm::TargetTransformInfo::TCK_Latency
@ TCK_Latency
The latency of instruction.
Definition TargetTransformInfo.h:333

llvm::TargetTransformInfo::requiresOrderedReduction
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of FastMathFlags.
Definition TargetTransformInfo.h:1716

llvm::TargetTransformInfo::RegisterKind
RegisterKind
Definition TargetTransformInfo.h:1348

llvm::TargetTransformInfo::RGK_FixedWidthVector
@ RGK_FixedWidthVector
Definition TargetTransformInfo.h:1348

llvm::TargetTransformInfo::RGK_ScalableVector
@ RGK_ScalableVector
Definition TargetTransformInfo.h:1348

llvm::TargetTransformInfo::RGK_Scalar
@ RGK_Scalar
Definition TargetTransformInfo.h:1348

llvm::TargetTransformInfo::TCC_Free
@ TCC_Free
Expected to fold away in lowering.
Definition TargetTransformInfo.h:357

llvm::TargetTransformInfo::ShuffleKind
ShuffleKind
The various kinds of shuffle patterns for vector queries.
Definition TargetTransformInfo.h:1244

llvm::TargetTransformInfo::SK_InsertSubvector
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
Definition TargetTransformInfo.h:1251

llvm::TargetTransformInfo::SK_PermuteSingleSrc
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
Definition TargetTransformInfo.h:1255

llvm::TargetTransformInfo::SK_Splice
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
Definition TargetTransformInfo.h:1257

llvm::TargetTransformInfo::SK_Broadcast
@ SK_Broadcast
Broadcast element 0 to all other elements.
Definition TargetTransformInfo.h:1245

llvm::TargetTransformInfo::SK_Reverse
@ SK_Reverse
Reverse the order of the vector.
Definition TargetTransformInfo.h:1246

llvm::TargetTransformInfo::SK_ExtractSubvector
@ SK_ExtractSubvector
ExtractSubvector. Index indicates start offset.
Definition TargetTransformInfo.h:1252

llvm::TypeSize
Definition TypeSize.h:332

llvm::TypeSize::getFixed
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343

llvm::TypeSize::getScalable
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346

llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46

llvm::Type::getInt64Ty
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:310

llvm::Type::getInt32Ty
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:309

llvm::Type::getPointerAddressSpace
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Definition DerivedTypes.h:839

llvm::Type::getInt8Ty
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:307

llvm::Type::getInt16Ty
static LLVM_ABI IntegerType * getInt16Ty(LLVMContext &C)
Definition Type.cpp:308

llvm::Type::getContext
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130

llvm::Type::getScalarSizeInBits
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:232

llvm::Type::getIntNTy
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:313

llvm::Use
A Use represents the edge between a Value definition and its users.
Definition Use.h:35

llvm::User::getOperand
Value * getOperand(unsigned i) const
Definition User.h:207

llvm::Value
LLVM Value Representation.
Definition Value.h:75

llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255

llvm::Value::user_begin
user_iterator user_begin()
Definition Value.h:402

llvm::Value::hasOneUse
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439

llvm::Value::getContext
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:258

llvm::VectorType
Base class of all SIMD vector types.
Definition DerivedTypes.h:490

llvm::cl::opt
Definition CommandLine.h:1454

llvm::details::FixedOrScalableQuantity::getFixedValue
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200

llvm::details::FixedOrScalableQuantity< TypeSize, uint64_t >::isKnownLE
static constexpr bool isKnownLE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:230

uint64_t

Analysis.h

llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition ErrorHandling.h:164

llvm::AMDGPUAS::CONSTANT_ADDRESS_32BIT
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
Definition AMDGPUAddrSpace.h:40

llvm::AMDGPUAS::BUFFER_STRIDED_POINTER
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
Definition AMDGPUAddrSpace.h:47

llvm::AMDGPUAS::REGION_ADDRESS
@ REGION_ADDRESS
Address space for region memory. (GDS)
Definition AMDGPUAddrSpace.h:34

llvm::AMDGPUAS::LOCAL_ADDRESS
@ LOCAL_ADDRESS
Address space for local memory.
Definition AMDGPUAddrSpace.h:36

llvm::AMDGPUAS::CONSTANT_ADDRESS
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
Definition AMDGPUAddrSpace.h:37

llvm::AMDGPUAS::FLAT_ADDRESS
@ FLAT_ADDRESS
Address space for flat memory.
Definition AMDGPUAddrSpace.h:32

llvm::AMDGPUAS::GLOBAL_ADDRESS
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
Definition AMDGPUAddrSpace.h:33

llvm::AMDGPUAS::BUFFER_FAT_POINTER
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
Definition AMDGPUAddrSpace.h:42

llvm::AMDGPUAS::PRIVATE_ADDRESS
@ PRIVATE_ADDRESS
Address space for private memory.
Definition AMDGPUAddrSpace.h:38

llvm::AMDGPUAS::BUFFER_RESOURCE
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
Definition AMDGPUAddrSpace.h:45

llvm::AMDGPU
Definition AMDGPUMetadataVerifier.h:34

llvm::AMDGPU::isShader
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
Definition AMDGPUBaseInfo.h:1456

llvm::AMDGPU::isFlatGlobalAddrSpace
bool isFlatGlobalAddrSpace(unsigned AS)
Definition AMDGPUAddrSpace.h:94

llvm::AMDGPU::isArgPassedInSGPR
bool isArgPassedInSGPR(const Argument *A)
Definition AMDGPUBaseInfo.cpp:3352

llvm::AMDGPU::isIntrinsicAlwaysUniform
bool isIntrinsicAlwaysUniform(unsigned IntrID)
Definition AMDGPUBaseInfo.cpp:3517

llvm::AMDGPU::isIntrinsicSourceOfDivergence
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
Definition AMDGPUBaseInfo.cpp:3513

llvm::AMDGPU::isExtendedGlobalAddrSpace
bool isExtendedGlobalAddrSpace(unsigned AS)
Definition AMDGPUAddrSpace.h:99

llvm::BitmaskEnumDetail::Mask
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition BitmaskEnum.h:126

llvm::CallingConv::ID
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24

llvm::CallingConv::C
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34

llvm::FPOpFusion::Fast
@ Fast
Definition TargetOptions.h:32

llvm::ISD
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24

llvm::ISD::ADD
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264

llvm::ISD::FSUB
@ FSUB
Definition ISDOpcodes.h:418

llvm::ISD::FADD
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417

llvm::ISD::SRL
@ SRL
Definition ISDOpcodes.h:771

llvm::ISD::SRA
@ SRA
Definition ISDOpcodes.h:770

llvm::ISD::FNEG
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition ISDOpcodes.h:1030

llvm::ISD::OR
@ OR
Definition ISDOpcodes.h:740

llvm::ISD::SHL
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:769

llvm::ISD::XOR
@ XOR
Definition ISDOpcodes.h:741

llvm::ISD::FMUL
@ FMUL
Definition ISDOpcodes.h:419

llvm::ISD::SUB
@ SUB
Definition ISDOpcodes.h:265

llvm::ISD::FDIV
@ FDIV
Definition ISDOpcodes.h:420

llvm::ISD::FREM
@ FREM
Definition ISDOpcodes.h:421

llvm::ISD::AND
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739

llvm::ISD::MUL
@ MUL
Definition ISDOpcodes.h:266

llvm::InlineConstants::getInstrCost
LLVM_ABI int getInstrCost()
Definition InlineCost.cpp:206

llvm::Intrinsic
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
Definition GenericSSAContext.h:27

llvm::Intrinsic::getOrInsertDeclaration
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
Definition Intrinsics.cpp:780

llvm::Intrinsic::ID
unsigned ID
Definition GenericSSAContext.h:28

llvm::NVPTXAS::AddressSpace
AddressSpace
Definition NVPTXAddrSpace.h:21

llvm::PatternMatch
Definition PatternMatch.h:51

llvm::PatternMatch::m_AShr
BinaryOp_match< LHS, RHS, Instruction::AShr > m_AShr(const LHS &L, const RHS &R)
Definition PatternMatch.h:1298

llvm::PatternMatch::m_c_And
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
Definition PatternMatch.h:3092

llvm::PatternMatch::match
bool match(Val *V, const Pattern &P)
Definition PatternMatch.h:53

llvm::PatternMatch::m_Intrinsic
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
Definition PatternMatch.h:2848

llvm::PatternMatch::m_Value
auto m_Value()
Match an arbitrary value and ignore it.
Definition PatternMatch.h:135

llvm::PatternMatch::m_FPOne
specific_fpval m_FPOne()
Match a float 1.0 or vector with all elements equal to 1.0.
Definition PatternMatch.h:979

llvm::PatternMatch::m_LShr
BinaryOp_match< LHS, RHS, Instruction::LShr > m_LShr(const LHS &L, const RHS &R)
Definition PatternMatch.h:1292

llvm::PatternMatch::m_FNeg
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
Definition PatternMatch.h:1208

llvm::PatternMatch::m_FAbs
m_Intrinsic_Ty< Opnd0 >::Ty m_FAbs(const Opnd0 &Op0)
Definition PatternMatch.h:2940

llvm::PatternMatch::m_ConstantInt
auto m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition PatternMatch.h:179

llvm::SI
Definition SIInstrInfo.h:1926

llvm::cl::Hidden
@ Hidden
Definition CommandLine.h:138

llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition CommandLine.h:444

llvm::codeview::ClassOptions::Intrinsic
@ Intrinsic
Definition CodeView.h:198

llvm::codeview::PublicSymFlags::Function
@ Function
Definition CodeView.h:408

llvm::mdconst::extract_or_null
std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > extract_or_null(Y &&MD)
Extract a Value from Metadata, allowing null.
Definition Metadata.h:683

llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition FunctionInfo.h:25

llvm::Length
@ Length
Definition DWP.cpp:558

llvm::Value
FunctionAddr VTableAddr Value
Definition InstrProf.h:137

llvm::Cost
InstructionCost Cost
Definition FunctionSpecialization.h:103

llvm::ComputeValueVTs
LLVM_ABI void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs=nullptr, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition Analysis.cpp:119

llvm::Depth
@ Depth
Definition SIMachineScheduler.h:36

llvm::dyn_cast
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643

llvm::AllocFnKind::Aligned
@ Aligned
Definition Attributes.h:60

llvm::findOptionMDForLoop
LLVM_ABI MDNode * findOptionMDForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for a loop.
Definition LoopInfo.cpp:1094

llvm::equal_to
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2172

llvm::dyn_cast_or_null
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753

llvm::any_of
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745

llvm::isPowerOf2_32
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279

llvm::computeKnownBits
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOne bit sets.
Definition ValueTracking.cpp:153

llvm::dbgs
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209

llvm::none_of
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1752

llvm::isa
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
Definition Casting.h:547

llvm::AtomicOrdering
AtomicOrdering
Atomic ordering for LLVM's memory model.
Definition AtomicOrdering.h:56

llvm::AtomicOrdering::SequentiallyConsistent
@ SequentiallyConsistent
Definition AtomicOrdering.h:64

llvm::divideCeil
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394

llvm::RecurKind::FAdd
@ FAdd
Sum of floats.
Definition IVDescriptors.h:49

llvm::Op
DWARFExpression::Operation Op
Definition DWARFExpressionPrinter.cpp:25

llvm::cast
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559

llvm::is_contained
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946

llvm::getUnderlyingObject
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal.address from the specified value V, returning the original object being addressed.
Definition ValueTracking.cpp:6938

llvm::ValueUniformity
ValueUniformity
Enum describing how values behave with respect to uniformity and divergence, to answer the question: ...
Definition Uniformity.h:18

llvm::ValueUniformity::AlwaysUniform
@ AlwaysUniform
The result value is always uniform.
Definition Uniformity.h:23

llvm::ValueUniformity::NeverUniform
@ NeverUniform
The result value can never be assumed to be uniform.
Definition Uniformity.h:26

llvm::ValueUniformity::Default
@ Default
The result value is uniform if and only if all operands are uniform.
Definition Uniformity.h:20

llvm::ValueUniformity::Custom
@ Custom
The result value requires a custom uniformity check.
Definition Uniformity.h:31

llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39

llvm::DenormalMode::getPreserveSign
static constexpr DenormalMode getPreserveSign()
Definition FloatingPointMode.h:119

llvm::EVT
Extended Value Type.
Definition ValueTypes.h:35

llvm::EVT::getScalarSizeInBits
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:408

llvm::MemIntrinsicInfo
Information about a load/store intrinsic defined by the target.
Definition TargetTransformInfo.h:76

llvm::SIModeRegisterDefaults
Definition SIModeRegisterDefaults.h:20

llvm::SIModeRegisterDefaults::isInlineCompatible
bool isInlineCompatible(SIModeRegisterDefaults CalleeMode) const
Definition SIModeRegisterDefaults.h:85

llvm::TargetTransformInfo::LSRCost
Definition TargetTransformInfo.h:624

llvm::TargetTransformInfo::OperandValueInfo
Definition TargetTransformInfo.h:1281

llvm::TargetTransformInfo::PeelingPreferences
Definition TargetTransformInfo.h:762

llvm::TargetTransformInfo::UnrollingPreferences
Parameters that control the generic loop unrolling transformation.
Definition TargetTransformInfo.h:638

llvm::TargetTransformInfo::UnrollingPreferences::MaxCount
unsigned MaxCount
Definition TargetTransformInfo.h:679

llvm::TargetTransformInfo::UnrollingPreferences::Threshold
unsigned Threshold
The cost threshold for the unrolled loop.
Definition TargetTransformInfo.h:646

llvm::TargetTransformInfo::UnrollingPreferences::UnrollVectorizedLoop
bool UnrollVectorizedLoop
Disable runtime unrolling by default for vectorized loops.
Definition TargetTransformInfo.h:723

llvm::TargetTransformInfo::UnrollingPreferences::MaxIterationsCountToAnalyze
unsigned MaxIterationsCountToAnalyze
Don't allow loop unrolling to simulate more than this number of iterations when checking full unroll ...
Definition TargetTransformInfo.h:721

llvm::TargetTransformInfo::UnrollingPreferences::BEInsns
unsigned BEInsns
Definition TargetTransformInfo.h:692

llvm::TargetTransformInfo::UnrollingPreferences::PartialThreshold
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
Definition TargetTransformInfo.h:663

llvm::TargetTransformInfo::UnrollingPreferences::Runtime
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
Definition TargetTransformInfo.h:699

llvm::TargetTransformInfo::UnrollingPreferences::Partial
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
Definition TargetTransformInfo.h:695

llvm::cl::desc
Definition CommandLine.h:410