doxygen/NVPTXTargetTransformInfo_8cpp_source.html

//===-- NVPTXTargetTransformInfo.cpp - NVPTX specific TTI -----------------===//

//

// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

// See https://llvm.org/LICENSE.txt for license information.

// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

//

//===----------------------------------------------------------------------===//


#include "NVPTXTargetTransformInfo.h"

#include "NVVMProperties.h"

#include "llvm/ADT/STLExtras.h"

#include "llvm/Analysis/LoopInfo.h"

#include "llvm/Analysis/TargetTransformInfo.h"

#include "llvm/Analysis/ValueTracking.h"

#include "llvm/CodeGen/BasicTTIImpl.h"

#include "llvm/CodeGen/TargetLowering.h"

#include "llvm/IR/Constants.h"

#include "llvm/IR/IntrinsicInst.h"

#include "llvm/IR/Intrinsics.h"

#include "llvm/IR/IntrinsicsNVPTX.h"

#include "llvm/IR/Value.h"

#include "llvm/Support/Casting.h"

#include "llvm/Support/ErrorHandling.h"

#include "llvm/Support/NVPTXAddrSpace.h"

#include "llvm/Transforms/InstCombine/InstCombiner.h"

#include <optional>

using namespace llvm;


#define DEBUG_TYPE "NVPTXtti"


// Whether the given intrinsic reads threadIdx.x/y/z.


static bool readsThreadIndex(const IntrinsicInst *II) {

  switch (II->getIntrinsicID()) {

    default: return false;

    case Intrinsic::nvvm_read_ptx_sreg_tid_x:

    case Intrinsic::nvvm_read_ptx_sreg_tid_y:

    case Intrinsic::nvvm_read_ptx_sreg_tid_z:

      return true;

  }

}


// Returns true iff the given intrinsic call is nvvm_read_ptx_sreg_laneid.
static bool readsLaneId(const IntrinsicInst *II) {
  switch (II->getIntrinsicID()) {
  case Intrinsic::nvvm_read_ptx_sreg_laneid:
    return true;
  default:
    return false;
  }
}


bool NVPTXTTIImpl::isSourceOfDivergence(const Value *V) const {

  // Without inter-procedural analysis, we conservatively assume that arguments

  // to __device__ functions are divergent.

  if (const Argument *Arg = dyn_cast<Argument>(V))

    return !isKernelFunction(*Arg->getParent());


  if (const Instruction *I = dyn_cast<Instruction>(V)) {

    // Without pointer analysis, we conservatively assume values loaded from

    // generic or local address space are divergent.

    if (const LoadInst *LI = dyn_cast<LoadInst>(I)) {

      unsigned AS = LI->getPointerAddressSpace();

      return AS == ADDRESS_SPACE_GENERIC || AS == ADDRESS_SPACE_LOCAL;

    }

    // Atomic instructions may cause divergence. Atomic instructions are

    // executed sequentially across all threads in a warp. Therefore, an earlier

    // executed thread may see different memory inputs than a later executed

    // thread. For example, suppose *a = 0 initially.

    //

    //   atom.global.add.s32 d, [a], 1

    //

    // returns 0 for the first thread that enters the critical region, and 1 for

    // the second thread.

    if (I->isAtomic())

      return true;

    if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {

      // Instructions that read threadIdx are obviously divergent.

      if (readsThreadIndex(II) || readsLaneId(II))

        return true;

    }

    // Conservatively consider the return value of function calls as divergent.

    // We could analyze callees with bodies more precisely using

    // inter-procedural analysis.

    if (isa<CallInst>(I))

      return true;

  }


  return false;

}


// Convert NVVM intrinsics to target-generic LLVM code where possible.

//

// \p II is the intrinsic call to simplify. \p IC is only used by the clamping

// funnel-shift special case, which replaces all uses of \p II through

// IC.replaceInstUsesWith.

//

// Returns nullptr when \p II is not one of the handled intrinsics, when the

// enclosing function's denormal mode does not match the intrinsic's ftz

// requirement, or when a clamping funnel shift has a non-constant shift

// amount. Otherwise returns either a newly created instruction (created

// without an insertion point) or the result of IC.replaceInstUsesWith.


static Instruction *convertNvvmIntrinsicToLlvm(InstCombiner &IC,

                                               IntrinsicInst *II) {

  // Each NVVM intrinsic we can simplify can be replaced with one of:

  //

  //  * an LLVM intrinsic,

  //  * an LLVM cast operation,

  //  * an LLVM binary operation, or

  //  * ad-hoc LLVM IR for the particular operation.


  // Some transformations are only valid when the module's

  // flush-denormals-to-zero (ftz) setting is true/false, whereas other

  // transformations are valid regardless of the module's ftz setting.

  enum FtzRequirementTy {

    FTZ_Any,       // Any ftz setting is ok.

    FTZ_MustBeOn,  // Transformation is valid only if ftz is on.

    FTZ_MustBeOff, // Transformation is valid only if ftz is off.

  };

  // Classes of NVVM intrinsics that can't be replaced one-to-one with a

  // target-generic intrinsic, cast op, or binary op but that we can nonetheless

  // simplify.

  //

  // NOTE(review): the "SCP_" prefix below differs from the "SPC_" prefix used

  // by SPC_Reciprocal; this looks like a typo. Both enumerators are local to

  // this function.

  enum SpecialCase {

    SPC_Reciprocal,

    SCP_FunnelShiftClamp,

  };


  // SimplifyAction is a poor-man's variant (plus an additional flag) that

  // represents how to replace an NVVM intrinsic with target-generic LLVM IR.

  struct SimplifyAction {

    // Invariant: At most one of these Optionals has a value.

    std::optional<Intrinsic::ID> IID;

    std::optional<Instruction::CastOps> CastOp;

    std::optional<Instruction::BinaryOps> BinaryOp;

    std::optional<SpecialCase> Special;


    FtzRequirementTy FtzRequirement = FTZ_Any;

    // Denormal handling is guarded by different attributes depending on the

    // type (denormal-fp-math vs denormal-fp-math-f32), take note of halfs.

    bool IsHalfTy = false;


    // Default action: no optional is set, i.e. "no simplification known".

    SimplifyAction() = default;


    // Replace with the target-generic intrinsic IID.

    SimplifyAction(Intrinsic::ID IID, FtzRequirementTy FtzReq,

                   bool IsHalfTy = false)

        : IID(IID), FtzRequirement(FtzReq), IsHalfTy(IsHalfTy) {}


    // Cast operations don't have anything to do with FTZ, so we skip that

    // argument.

    SimplifyAction(Instruction::CastOps CastOp) : CastOp(CastOp) {}


    // Replace with a binary operator applied to the first two arguments.

    SimplifyAction(Instruction::BinaryOps BinaryOp, FtzRequirementTy FtzReq)

        : BinaryOp(BinaryOp), FtzRequirement(FtzReq) {}


    // Replace using one of the hand-written special cases.

    SimplifyAction(SpecialCase Special, FtzRequirementTy FtzReq)

        : Special(Special), FtzRequirement(FtzReq) {}

  };


  // Try to generate a SimplifyAction describing how to replace our

  // IntrinsicInst with target-generic LLVM IR. Intrinsics that are not listed

  // fall through to the default case and yield an empty action.

  const SimplifyAction Action = [II]() -> SimplifyAction {

    switch (II->getIntrinsicID()) {

    // NVVM intrinsics that map directly to LLVM intrinsics.

    case Intrinsic::nvvm_ceil_d:

      return {Intrinsic::ceil, FTZ_Any};

    case Intrinsic::nvvm_ceil_f:

      return {Intrinsic::ceil, FTZ_MustBeOff};

    case Intrinsic::nvvm_ceil_ftz_f:

      return {Intrinsic::ceil, FTZ_MustBeOn};

    case Intrinsic::nvvm_floor_d:

      return {Intrinsic::floor, FTZ_Any};

    case Intrinsic::nvvm_floor_f:

      return {Intrinsic::floor, FTZ_MustBeOff};

    case Intrinsic::nvvm_floor_ftz_f:

      return {Intrinsic::floor, FTZ_MustBeOn};

    case Intrinsic::nvvm_fma_rn_d:

      return {Intrinsic::fma, FTZ_Any};

    case Intrinsic::nvvm_fma_rn_f:

      return {Intrinsic::fma, FTZ_MustBeOff};

    case Intrinsic::nvvm_fma_rn_ftz_f:

      return {Intrinsic::fma, FTZ_MustBeOn};

    case Intrinsic::nvvm_fma_rn_f16:

      return {Intrinsic::fma, FTZ_MustBeOff, true};

    case Intrinsic::nvvm_fma_rn_ftz_f16:

      return {Intrinsic::fma, FTZ_MustBeOn, true};

    case Intrinsic::nvvm_fma_rn_f16x2:

      return {Intrinsic::fma, FTZ_MustBeOff, true};

    case Intrinsic::nvvm_fma_rn_ftz_f16x2:

      return {Intrinsic::fma, FTZ_MustBeOn, true};

    case Intrinsic::nvvm_fma_rn_bf16:

      return {Intrinsic::fma, FTZ_MustBeOff, true};

    case Intrinsic::nvvm_fma_rn_bf16x2:

      return {Intrinsic::fma, FTZ_MustBeOff, true};

    case Intrinsic::nvvm_fmax_d:

      return {Intrinsic::maximumnum, FTZ_Any};

    case Intrinsic::nvvm_fmax_f:

      return {Intrinsic::maximumnum, FTZ_MustBeOff};

    case Intrinsic::nvvm_fmax_ftz_f:

      return {Intrinsic::maximumnum, FTZ_MustBeOn};

    case Intrinsic::nvvm_fmax_nan_f:

      return {Intrinsic::maximum, FTZ_MustBeOff};

    case Intrinsic::nvvm_fmax_ftz_nan_f:

      return {Intrinsic::maximum, FTZ_MustBeOn};

    case Intrinsic::nvvm_fmax_f16:

      return {Intrinsic::maximumnum, FTZ_MustBeOff, true};

    case Intrinsic::nvvm_fmax_ftz_f16:

      return {Intrinsic::maximumnum, FTZ_MustBeOn, true};

    case Intrinsic::nvvm_fmax_f16x2:

      return {Intrinsic::maximumnum, FTZ_MustBeOff, true};

    case Intrinsic::nvvm_fmax_ftz_f16x2:

      return {Intrinsic::maximumnum, FTZ_MustBeOn, true};

    case Intrinsic::nvvm_fmax_nan_f16:

      return {Intrinsic::maximum, FTZ_MustBeOff, true};

    case Intrinsic::nvvm_fmax_ftz_nan_f16:

      return {Intrinsic::maximum, FTZ_MustBeOn, true};

    case Intrinsic::nvvm_fmax_nan_f16x2:

      return {Intrinsic::maximum, FTZ_MustBeOff, true};

    case Intrinsic::nvvm_fmax_ftz_nan_f16x2:

      return {Intrinsic::maximum, FTZ_MustBeOn, true};

    case Intrinsic::nvvm_fmin_d:

      return {Intrinsic::minimumnum, FTZ_Any};

    case Intrinsic::nvvm_fmin_f:

      return {Intrinsic::minimumnum, FTZ_MustBeOff};

    case Intrinsic::nvvm_fmin_ftz_f:

      return {Intrinsic::minimumnum, FTZ_MustBeOn};

    case Intrinsic::nvvm_fmin_nan_f:

      return {Intrinsic::minimum, FTZ_MustBeOff};

    case Intrinsic::nvvm_fmin_ftz_nan_f:

      return {Intrinsic::minimum, FTZ_MustBeOn};

    case Intrinsic::nvvm_fmin_f16:

      return {Intrinsic::minimumnum, FTZ_MustBeOff, true};

    case Intrinsic::nvvm_fmin_ftz_f16:

      return {Intrinsic::minimumnum, FTZ_MustBeOn, true};

    case Intrinsic::nvvm_fmin_f16x2:

      return {Intrinsic::minimumnum, FTZ_MustBeOff, true};

    case Intrinsic::nvvm_fmin_ftz_f16x2:

      return {Intrinsic::minimumnum, FTZ_MustBeOn, true};

    case Intrinsic::nvvm_fmin_nan_f16:

      return {Intrinsic::minimum, FTZ_MustBeOff, true};

    case Intrinsic::nvvm_fmin_ftz_nan_f16:

      return {Intrinsic::minimum, FTZ_MustBeOn, true};

    case Intrinsic::nvvm_fmin_nan_f16x2:

      return {Intrinsic::minimum, FTZ_MustBeOff, true};

    case Intrinsic::nvvm_fmin_ftz_nan_f16x2:

      return {Intrinsic::minimum, FTZ_MustBeOn, true};

    case Intrinsic::nvvm_sqrt_rn_d:

      return {Intrinsic::sqrt, FTZ_Any};

    case Intrinsic::nvvm_sqrt_f:

      // nvvm_sqrt_f is a special case.  For most intrinsics, foo_ftz_f is the

      // ftz version, and foo_f is the non-ftz version.  But nvvm_sqrt_f adopts

      // the ftz-ness of the surrounding code.  sqrt_rn_f and sqrt_rn_ftz_f are

      // the versions with explicit ftz-ness.

      return {Intrinsic::sqrt, FTZ_Any};

    case Intrinsic::nvvm_trunc_d:

      return {Intrinsic::trunc, FTZ_Any};

    case Intrinsic::nvvm_trunc_f:

      return {Intrinsic::trunc, FTZ_MustBeOff};

    case Intrinsic::nvvm_trunc_ftz_f:

      return {Intrinsic::trunc, FTZ_MustBeOn};


    // NVVM intrinsics that map to LLVM cast operations.

    // Note - we cannot map intrinsics like nvvm_d2ll_rz to LLVM's

    // FPToSI, as NaN to int conversion with FPToSI is considered UB and is

    // eliminated. NVVM conversion intrinsics are translated to PTX cvt

    // instructions which define the outcome for NaN rather than leaving as UB.

    // Therefore, translate NVVM intrinsics to sitofp/uitofp, but not to

    // fptosi/fptoui.

    case Intrinsic::nvvm_i2d_rn:

    case Intrinsic::nvvm_i2f_rn:

    case Intrinsic::nvvm_ll2d_rn:

    case Intrinsic::nvvm_ll2f_rn:

      return {Instruction::SIToFP};

    case Intrinsic::nvvm_ui2d_rn:

    case Intrinsic::nvvm_ui2f_rn:

    case Intrinsic::nvvm_ull2d_rn:

    case Intrinsic::nvvm_ull2f_rn:

      return {Instruction::UIToFP};


    // NVVM intrinsics that map to LLVM binary ops.

    case Intrinsic::nvvm_div_rn_d:

      return {Instruction::FDiv, FTZ_Any};


    // The remainder of cases are NVVM intrinsics that map to LLVM idioms, but

    // need special handling.

    //

    // We seem to be missing intrinsics for rcp.approx.{ftz.}f32, which is just

    // as well.

    case Intrinsic::nvvm_rcp_rn_d:

      return {SPC_Reciprocal, FTZ_Any};


    case Intrinsic::nvvm_fshl_clamp:

    case Intrinsic::nvvm_fshr_clamp:

      return {SCP_FunnelShiftClamp, FTZ_Any};


      // We do not currently simplify intrinsics that give an approximate

      // answer. These include:

      //

      //   - nvvm_cos_approx_{f,ftz_f}

      //   - nvvm_ex2_approx(_ftz)

      //   - nvvm_lg2_approx_{d,f,ftz_f}

      //   - nvvm_sin_approx_{f,ftz_f}

      //   - nvvm_sqrt_approx_{f,ftz_f}

      //   - nvvm_rsqrt_approx_{d,f,ftz_f}

      //   - nvvm_div_approx_{ftz_d,ftz_f,f}

      //   - nvvm_rcp_approx_ftz_d

      //

      // Ideally we'd encode them as e.g. "fast call @llvm.cos", where "fast"

      // means that fastmath is enabled in the intrinsic.  Unfortunately only

      // binary operators (currently) have a fastmath bit in SelectionDAG, so

      // this information gets lost and we can't select on it.

      //

      // TODO: div and rcp are lowered to a binary op, so these we could in

      // theory lower them to "fast fdiv".


    default:

      return {};

    }

  }();


  // If Action.FtzRequirement is not satisfied by the function's ftz state, we

  // can bail out now.  (Notice that in the case that IID is not an NVVM

  // intrinsic, we don't have to look up any denormal mode, as

  // FtzRequirement will be FTZ_Any.)

  if (Action.FtzRequirement != FTZ_Any) {

    // FIXME: Broken for f64

    // The denormal mode is queried on the enclosing function, using half

    // semantics for actions flagged IsHalfTy and single semantics otherwise.

    DenormalMode Mode = II->getFunction()->getDenormalMode(

        Action.IsHalfTy ? APFloat::IEEEhalf() : APFloat::IEEEsingle());

    // ftz counts as enabled iff the denormal output mode is PreserveSign.

    bool FtzEnabled = Mode.Output == DenormalMode::PreserveSign;


    // Here the requirement is either MustBeOn or MustBeOff; give up when it

    // disagrees with the function's actual setting.

    if (FtzEnabled != (Action.FtzRequirement == FTZ_MustBeOn))

      return nullptr;

  }


  // Simplify to target-generic intrinsic.

  if (Action.IID) {

    SmallVector<Value *, 4> Args(II->args());

    // All the target-generic intrinsics currently of interest to us have one

    // type argument, equal to that of the nvvm intrinsic's argument.

    Type *Tys[] = {II->getArgOperand(0)->getType()};

    return CallInst::Create(

        Intrinsic::getOrInsertDeclaration(II->getModule(), *Action.IID, Tys),

        Args);

  }


  // Simplify to target-generic binary op.

  if (Action.BinaryOp)

    return BinaryOperator::Create(*Action.BinaryOp, II->getArgOperand(0),

                                  II->getArgOperand(1), II->getName());


  // Simplify to target-generic cast op.

  if (Action.CastOp)

    return CastInst::Create(*Action.CastOp, II->getArgOperand(0), II->getType(),

                            II->getName());


  // All that's left are the special cases.

  if (!Action.Special)

    return nullptr;


  switch (*Action.Special) {

  case SPC_Reciprocal:

    // Simplify reciprocal: emit `fdiv 1, x`, with the constant 1 built in the

    // type of the operand.

    return BinaryOperator::Create(

        Instruction::FDiv, ConstantFP::get(II->getArgOperand(0)->getType(), 1),

        II->getArgOperand(0), II->getName());


  case SCP_FunnelShiftClamp: {

    // Canonicalize a clamping funnel shift to the generic llvm funnel shift

    // when possible, as this is easier for llvm to optimize further. This is

    // only attempted when the shift amount (operand 2) is a constant.

    if (const auto *ShiftConst = dyn_cast<ConstantInt>(II->getArgOperand(2))) {

      const bool IsLeft = II->getIntrinsicID() == Intrinsic::nvvm_fshl_clamp;

      // Shift amount at or above the bit width: replace all uses with

      // operand 1 for fshl_clamp, or operand 0 for fshr_clamp.

      if (ShiftConst->getZExtValue() >= II->getType()->getIntegerBitWidth())

        return IC.replaceInstUsesWith(*II, II->getArgOperand(IsLeft ? 1 : 0));


      // In-range constant shift amount: call the generic fshl/fshr with the

      // same three operands.

      const unsigned FshIID = IsLeft ? Intrinsic::fshl : Intrinsic::fshr;

      return CallInst::Create(Intrinsic::getOrInsertDeclaration(

                                  II->getModule(), FshIID, II->getType()),

                              SmallVector<Value *, 3>(II->args()));

    }

    // Non-constant shift amount: leave the intrinsic untouched.

    return nullptr;

  }

  }

  llvm_unreachable("All SpecialCase enumerators should be handled in switch.");

}


// Statically evaluates the `isspacep` query IID for a pointer that lives in
// address space AS. Yields true/false when the answer is known at compile time
// and std::nullopt when it has to be checked at run time.
static std::optional<bool> evaluateIsSpace(Intrinsic::ID IID, unsigned AS) {
  // Generic and entry-param pointers can only be classified at run time.
  const bool NeedsRuntimeCheck = AS == NVPTXAS::ADDRESS_SPACE_GENERIC ||
                                 AS == NVPTXAS::ADDRESS_SPACE_ENTRY_PARAM;
  if (NeedsRuntimeCheck)
    return std::nullopt;

  const bool IsShared = AS == NVPTXAS::ADDRESS_SPACE_SHARED;
  const bool IsSharedCluster = AS == NVPTXAS::ADDRESS_SPACE_SHARED_CLUSTER;

  switch (IID) {
  case Intrinsic::nvvm_isspacep_const:
    return AS == NVPTXAS::ADDRESS_SPACE_CONST;
  case Intrinsic::nvvm_isspacep_global:
    return AS == NVPTXAS::ADDRESS_SPACE_GLOBAL;
  case Intrinsic::nvvm_isspacep_local:
    return AS == NVPTXAS::ADDRESS_SPACE_LOCAL;
  case Intrinsic::nvvm_isspacep_shared:
    // For a shared-cluster pointer the plain "shared" query cannot be
    // evaluated at compile time.
    if (IsSharedCluster)
      return std::nullopt;
    return IsShared;
  case Intrinsic::nvvm_isspacep_shared_cluster:
    // Both shared and shared-cluster pointers satisfy the cluster query.
    return IsSharedCluster || IsShared;
  default:
    llvm_unreachable("Unexpected intrinsic");
  }
}


// Folds `isspacep` intrinsics whose result is statically known.
//
// Result:
//  * std::nullopt     - `II` is not one of the `isspacep` intrinsics.
//  * nullptr          - `II` is an `isspacep` intrinsic, but the answer is only
//                       known at run time.
//  * anything else    - whatever IC.replaceInstUsesWith returned after the
//                       uses of `II` were replaced with the constant answer.
//
// TODO: If InferAddressSpaces were run early enough in the pipeline this could
// be removed in favor of the constant folding that occurs there through
// rewriteIntrinsicWithAddressSpace
static std::optional<Instruction *>
handleSpaceCheckIntrinsics(InstCombiner &IC, IntrinsicInst &II) {
  const Intrinsic::ID IID = II.getIntrinsicID();

  // Filter out everything that is not an address-space predicate.
  switch (IID) {
  case Intrinsic::nvvm_isspacep_global:
  case Intrinsic::nvvm_isspacep_local:
  case Intrinsic::nvvm_isspacep_shared:
  case Intrinsic::nvvm_isspacep_shared_cluster:
  case Intrinsic::nvvm_isspacep_const:
    break;
  default:
    return std::nullopt;
  }

  Value *Ptr = II.getArgOperand(0);
  unsigned PtrAS = Ptr->getType()->getPointerAddressSpace();

  // Peek through a single addrspacecast to the generic AS and use the
  // address space of its source instead.
  // TODO: we could dig deeper through both ASCs and GEPs.
  if (PtrAS == NVPTXAS::ADDRESS_SPACE_GENERIC)
    if (auto *Cast = dyn_cast<AddrSpaceCastOperator>(Ptr))
      PtrAS = Cast->getOperand(0)->getType()->getPointerAddressSpace();

  const std::optional<bool> Answer = evaluateIsSpace(IID, PtrAS);
  if (!Answer)
    return nullptr; // Unknown at compile time; must be checked at run time.

  return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), *Answer));
}


// InstCombine hook for NVPTX intrinsics. The `isspacep` folding is tried
// first; if `II` is one of those intrinsics its result (possibly nullptr) is
// returned as-is. Otherwise the NVVM-to-generic conversion is tried, and
// std::nullopt is returned when that produces nothing.
std::optional<Instruction *>
NVPTXTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  const std::optional<Instruction *> SpaceCheck =
      handleSpaceCheckIntrinsics(IC, II);
  if (SpaceCheck.has_value())
    return *SpaceCheck;

  Instruction *Replacement = convertNvvmIntrinsicToLlvm(IC, &II);
  if (!Replacement)
    return std::nullopt;
  return Replacement;
}


// Cost of a single user. Inline-asm calls are costed by the number of
// instruction-like statements in the asm string; everything else is delegated
// to the base implementation.
InstructionCost
NVPTXTTIImpl::getInstructionCost(const User *U,
                                 ArrayRef<const Value *> Operands,
                                 TTI::TargetCostKind CostKind) const {
  const auto *Call = dyn_cast<CallInst>(U);
  const auto *Asm =
      Call ? dyn_cast<InlineAsm>(Call->getCalledOperand()) : nullptr;
  if (!Asm)
    return BaseT::getInstructionCost(U, Operands, CostKind);

  // Without this special case getCallCost() would charge arguments+1, since
  // inline asm is classified as a call in the IR. Counting the instructions
  // embedded in the asm string is a better approximation.
  //
  // The predicate is deliberately coarse: after dropping surrounding
  // whitespace and leading scope braces, a statement counts if it starts with
  // a predicate marker ('@') or a letter, or if it mentions ".pragma".
  const auto LooksLikeInstruction = [](StringRef Stmt) {
    Stmt = Stmt.trim().ltrim("{} \t\n\v\f\r");
    if (Stmt.empty())
      return false;
    const char First = Stmt[0];
    return First == '@' || isAlpha(First) || Stmt.contains(".pragma");
  };

  StringRef AsmText = Asm->getAsmString();
  const unsigned NumAsmInsts = count_if(split(AsmText, ';'), LooksLikeInstruction);
  return NumAsmInsts * TargetTransformInfo::TCC_Basic;
}


// Cost of an arithmetic instruction with opcode Opcode on type Ty.
//
// ADD, MUL, XOR, OR and AND whose legalized type is i64 are charged twice the
// type-legalization cost. Every other case is delegated to the base TTI
// implementation, forwarding all arguments (including Args and CxtI) so that
// the base implementation sees the same operands and context instruction the
// caller supplied.
InstructionCost NVPTXTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args, const Instruction *CxtI) const {
  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

  const int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);

  switch (ISDOpcode) {
  case ISD::ADD:
  case ISD::MUL:
  case ISD::XOR:
  case ISD::OR:
  case ISD::AND:
    // The machine code (SASS) simulates an i64 with two i32. Therefore, we
    // estimate that arithmetic operations on i64 are twice as expensive as
    // those on types that can fit into one machine register.
    if (LT.second.SimpleTy == MVT::i64)
      return 2 * LT.first;
    break;
  default:
    break;
  }

  // Delegate all remaining cases to the basic TTI.
  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                       Args, CxtI);
}


// Starts from the base unrolling preferences, then turns on partial and
// runtime unrolling with a partial threshold of one quarter of the full
// threshold.
void NVPTXTTIImpl::getUnrollingPreferences(
    Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP,
    OptimizationRemarkEmitter *ORE) const {
  BaseT::getUnrollingPreferences(L, SE, UP, ORE);

  // Small loops are often unrolled by the PTX to SASS compiler anyway, and
  // unrolling them earlier can be beneficial, so allow partial and runtime
  // unrolling here, but with a reduced threshold.
  UP.Runtime = true;
  UP.Partial = true;
  UP.PartialThreshold = UP.Threshold / 4;
}


// Loop peeling preferences: no NVPTX-specific adjustment is made here; the

// request is forwarded unchanged to the base TTI implementation.

void NVPTXTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,

                                         TTI::PeelingPreferences &PP) const {

  BaseT::getPeelingPreferences(L, SE, PP);

}


// For the `isspacep` intrinsics and nvvm_prefetch_tensormap, records operand
// index 0 in OpIndexes and returns true. Returns false, leaving OpIndexes
// untouched, for every other intrinsic.
bool NVPTXTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                              Intrinsic::ID IID) const {
  switch (IID) {
  case Intrinsic::nvvm_isspacep_const:
  case Intrinsic::nvvm_isspacep_global:
  case Intrinsic::nvvm_isspacep_local:
  case Intrinsic::nvvm_isspacep_shared:
  case Intrinsic::nvvm_isspacep_shared_cluster:
  case Intrinsic::nvvm_prefetch_tensormap:
    break;
  default:
    return false;
  }

  OpIndexes.push_back(0);
  return true;
}


// Rewrites intrinsic II for the case where its pointer operand is replaced by
// NewV. Returns the replacement value, or nullptr when nothing can be done.
//
//  * `isspacep` intrinsics fold to a constant when the answer is statically
//    known for NewV's address space.
//  * nvvm_prefetch_tensormap is re-created on NewV when NewV is in the const
//    or entry-param address space.
Value *NVPTXTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
                                                      Value *OldV,
                                                      Value *NewV) const {
  switch (const Intrinsic::ID IID = II->getIntrinsicID()) {
  case Intrinsic::nvvm_isspacep_const:
  case Intrinsic::nvvm_isspacep_global:
  case Intrinsic::nvvm_isspacep_local:
  case Intrinsic::nvvm_isspacep_shared:
  case Intrinsic::nvvm_isspacep_shared_cluster: {
    const unsigned TargetAS = NewV->getType()->getPointerAddressSpace();
    const std::optional<bool> Known = evaluateIsSpace(IID, TargetAS);
    if (!Known)
      return nullptr;
    return ConstantInt::get(II->getType(), *Known);
  }
  case Intrinsic::nvvm_prefetch_tensormap: {
    IRBuilder<> Builder(II);
    const unsigned TargetAS = NewV->getType()->getPointerAddressSpace();
    const bool IsConst = TargetAS == NVPTXAS::ADDRESS_SPACE_CONST;
    const bool IsEntryParam = TargetAS == NVPTXAS::ADDRESS_SPACE_ENTRY_PARAM;
    if (!IsConst && !IsEntryParam)
      return nullptr;
    return Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_prefetch_tensormap,
                                        NewV);
  }
  default:
    return nullptr;
  }
}


// A masked store is legal only when all of the following hold:
//  * the mask is a constant mask,
//  * the alignment is at least 32 (only 256-bit vectors are supported),
//  * the subtarget has 256-bit vector load/store for AddrSpace, and
//  * DataTy is a fixed vector of 8 x 32-bit or 4 x 64-bit elements.
bool NVPTXTTIImpl::isLegalMaskedStore(Type *DataTy, Align Alignment,
                                      unsigned AddrSpace,
                                      TTI::MaskKind MaskKind) const {
  const bool Eligible = MaskKind == TTI::MaskKind::ConstantMask &&
                        !(Alignment < 32) &&
                        ST->has256BitVectorLoadStore(AddrSpace);
  if (!Eligible)
    return false;

  const auto *VecTy = dyn_cast<FixedVectorType>(DataTy);
  if (!VecTy)
    return false;

  // Accept exactly the two element-size / element-count pairs listed above.
  const unsigned NumElts = VecTy->getNumElements();
  switch (VecTy->getScalarType()->getScalarSizeInBits()) {
  case 32:
    return NumElts == 8;
  case 64:
    return NumElts == 4;
  default:
    return false;
  }
}


// A masked load is legal only for a constant mask, an alignment no smaller
// than the store size of DataTy, and a fixed vector type whose elements are at
// least 8 bits wide. The address space is not consulted.
bool NVPTXTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment,
                                     unsigned /*AddrSpace*/,
                                     TTI::MaskKind MaskKind) const {
  const bool MaskIsConstant = MaskKind == TTI::MaskKind::ConstantMask;
  if (!MaskIsConstant || Alignment < DL.getTypeStoreSize(DataTy))
    return false;

  // We do not support sub-byte element type masked loads.
  if (const auto *VecTy = dyn_cast<FixedVectorType>(DataTy))
    return VecTy->getElementType()->getScalarSizeInBits() >= 8;

  return false;
}


// Width in bits of the widest vector load/store for AddrSpace: 256 when the
// subtarget reports 256-bit vector load/store support for that address space,
// 128 otherwise.
unsigned NVPTXTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
  return ST->has256BitVectorLoadStore(AddrSpace) ? 256 : 128;
}


unsigned NVPTXTTIImpl::getAssumedAddrSpace(const Value *V) const {

  if (isa<AllocaInst>(V))

    return ADDRESS_SPACE_LOCAL;


  if (const Argument *Arg = dyn_cast<Argument>(V)) {

    if (isKernelFunction(*Arg->getParent())) {

      const NVPTXTargetMachine &TM =

          static_cast<const NVPTXTargetMachine &>(getTLI()->getTargetMachine());

      if (TM.getDrvInterface() == NVPTX::CUDA && !Arg->hasByValAttr())

        return ADDRESS_SPACE_GLOBAL;

    } else {

      // We assume that all device parameters that are passed byval will be

      // placed in the local AS. Very simple cases will be updated after ISel to

      // use the device param space where possible.

      if (Arg->hasByValAttr())

        return ADDRESS_SPACE_LOCAL;

    }

  }


  return -1;

}


void NVPTXTTIImpl::collectKernelLaunchBounds(

    const Function &F,

    SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const {

  if (const auto Val = getMaxClusterRank(F))

    LB.push_back({"maxclusterrank", *Val});


  const auto MaxNTID = getMaxNTID(F);

  if (MaxNTID.size() > 0)

    LB.push_back({"maxntidx", MaxNTID[0]});

  if (MaxNTID.size() > 1)

    LB.push_back({"maxntidy", MaxNTID[1]});

  if (MaxNTID.size() > 2)

    LB.push_back({"maxntidz", MaxNTID[2]});

}


// Maps the divergence query onto the ValueUniformity enum: sources of
// divergence are NeverUniform, everything else gets the Default treatment.
ValueUniformity NVPTXTTIImpl::getValueUniformity(const Value *V) const {
  return isSourceOfDivergence(V) ? ValueUniformity::NeverUniform
                                 : ValueUniformity::Default;
}


BasicTTIImpl.h
This file provides a helper that implements much of the TTI interface in terms of the target-independ...

Casting.h

Constants.h
This file contains the declarations for the subclasses of Constant, which represent the different fla...

CostKind
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))

IntrinsicInst.h

Value.h

InstCombiner.h
This file provides the interface for the instcombine pass implementation.

Intrinsics.h

LoopInfo.h

F
#define F(x, y, z)
Definition MD5.cpp:54

I
#define I(x, y, z)
Definition MD5.cpp:57

NVPTXAddrSpace.h
NVPTX address space definition.

handleSpaceCheckIntrinsics
static std::optional< Instruction * > handleSpaceCheckIntrinsics(InstCombiner &IC, IntrinsicInst &II)
Definition NVPTXTargetTransformInfo.cpp:400

convertNvvmIntrinsicToLlvm
static Instruction * convertNvvmIntrinsicToLlvm(InstCombiner &IC, IntrinsicInst *II)
Definition NVPTXTargetTransformInfo.cpp:86

readsLaneId
static bool readsLaneId(const IntrinsicInst *II)
Definition NVPTXTargetTransformInfo.cpp:42

evaluateIsSpace
static std::optional< bool > evaluateIsSpace(Intrinsic::ID IID, unsigned AS)
Definition NVPTXTargetTransformInfo.cpp:369

readsThreadIndex
static bool readsThreadIndex(const IntrinsicInst *II)
Definition NVPTXTargetTransformInfo.cpp:32

NVPTXTargetTransformInfo.h
This file defines a TargetTransformInfoImplBase-conforming object specific to the NVPTX target machine.

II
uint64_t IntrinsicInst * II
Definition NVVMIntrRange.cpp:46

NVVMProperties.h

Mode
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))

STLExtras.h
This file contains some templates that are useful if you are working with the STL at all.

TargetLowering.h
This file describes how to lower LLVM code to machine code.

TargetTransformInfo.h
This pass exposes codegen information to IR-level passes.

ValueTracking.h

llvm::APFloatBase::IEEEsingle
static const fltSemantics & IEEEsingle()
Definition APFloat.h:296

llvm::APFloatBase::IEEEhalf
static const fltSemantics & IEEEhalf()
Definition APFloat.h:294

llvm::Argument
This class represents an incoming formal argument to a Function.
Definition Argument.h:32

llvm::ArrayRef
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40

llvm::BasicTTIImplBase< NVPTXTTIImpl >::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
Definition BasicTTIImpl.h:1048

llvm::BasicTTIImplBase< NVPTXTTIImpl >::getUnrollingPreferences
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
Definition BasicTTIImpl.h:714

llvm::BasicTTIImplBase< NVPTXTTIImpl >::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
Definition BasicTTIImpl.h:786

llvm::BasicTTIImplBase< NVPTXTTIImpl >::getTypeLegalizationCost
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Definition BasicTTIImpl.h:1012

llvm::BasicTTIImplBase< NVPTXTTIImpl >::DL
const DataLayout & DL

llvm::BinaryOperator::Create
static LLVM_ABI BinaryOperator * Create(BinaryOps Op, Value *S1, Value *S2, const Twine &Name=Twine(), InsertPosition InsertBefore=nullptr)
Construct a binary instruction, given the opcode and the two operands.
Definition Instructions.cpp:2738

llvm::CallInst::Create
static CallInst * Create(FunctionType *Ty, Value *F, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Definition Instructions.h:1566

llvm::CastInst::Create
static LLVM_ABI CastInst * Create(Instruction::CastOps, Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Provides a way to construct any of the CastInst subclasses using an opcode instead of the subclass's ...
Definition Instructions.cpp:3084

llvm::Function
Definition Function.h:65

llvm::IRBuilder
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2868

llvm::InstCombiner
The core instruction combiner logic.
Definition InstCombiner.h:49

llvm::InstCombiner::replaceInstUsesWith
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Definition InstCombiner.h:403

llvm::InstructionCost
Definition InstructionCost.h:30

llvm::Instruction
Definition Instruction.h:70

llvm::Instruction::BinaryOps
BinaryOps
Definition Instruction.h:1056

llvm::Instruction::CastOps
CastOps
Definition Instruction.h:1070

llvm::IntrinsicInst
A wrapper class for inspecting calls to intrinsic functions.
Definition IntrinsicInst.h:49

llvm::Loop
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40

llvm::NVPTXTTIImpl::isLegalMaskedStore
bool isLegalMaskedStore(Type *DataType, Align Alignment, unsigned AddrSpace, TTI::MaskKind MaskKind) const override
Definition NVPTXTargetTransformInfo.cpp:556

llvm::NVPTXTTIImpl::rewriteIntrinsicWithAddressSpace
Value * rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const override
Definition NVPTXTargetTransformInfo.cpp:528

llvm::NVPTXTTIImpl::getInstructionCost
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind) const override
Definition NVPTXTargetTransformInfo.cpp:437

llvm::NVPTXTTIImpl::getLoadStoreVecRegBitWidth
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const override
Definition NVPTXTargetTransformInfo.cpp:595

llvm::NVPTXTTIImpl::instCombineIntrinsic
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override
Definition NVPTXTargetTransformInfo.cpp:427

llvm::NVPTXTTIImpl::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
Definition NVPTXTargetTransformInfo.cpp:465

llvm::NVPTXTTIImpl::getValueUniformity
ValueUniformity getValueUniformity(const Value *V) const override
Definition NVPTXTargetTransformInfo.cpp:639

llvm::NVPTXTTIImpl::getUnrollingPreferences
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
Definition NVPTXTargetTransformInfo.cpp:494

llvm::NVPTXTTIImpl::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
Definition NVPTXTargetTransformInfo.cpp:507

llvm::NVPTXTTIImpl::collectFlatAddressOperands
bool collectFlatAddressOperands(SmallVectorImpl< int > &OpIndexes, Intrinsic::ID IID) const override
Definition NVPTXTargetTransformInfo.cpp:512

llvm::NVPTXTTIImpl::getAssumedAddrSpace
unsigned getAssumedAddrSpace(const Value *V) const override
Definition NVPTXTargetTransformInfo.cpp:602

llvm::NVPTXTTIImpl::collectKernelLaunchBounds
void collectKernelLaunchBounds(const Function &F, SmallVectorImpl< std::pair< StringRef, int64_t > > &LB) const override
Definition NVPTXTargetTransformInfo.cpp:624

llvm::NVPTXTTIImpl::isLegalMaskedLoad
bool isLegalMaskedLoad(Type *DataType, Align Alignment, unsigned AddrSpace, TTI::MaskKind MaskKind) const override
Definition NVPTXTargetTransformInfo.cpp:579

llvm::NVPTXTargetMachine
NVPTXTargetMachine.
Definition NVPTXTargetMachine.h:25

llvm::NVPTXTargetMachine::getDrvInterface
NVPTX::DrvInterface getDrvInterface() const
Definition NVPTXTargetMachine.h:47

llvm::OptimizationRemarkEmitter
The optimization diagnostic interface.
Definition OptimizationRemarkEmitter.h:33

llvm::ScalarEvolution
The main scalar evolution driver.
Definition ScalarEvolution.h:621

llvm::SmallVectorImpl
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition SmallVector.h:581

llvm::SmallVectorTemplateBase::push_back
void push_back(const T &Elt)
Definition SmallVector.h:423

llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition SmallVector.h:1225

llvm::StringRef
Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition StringRef.h:56

llvm::StringRef::empty
constexpr bool empty() const
Check if the string is empty.
Definition StringRef.h:141

llvm::StringRef::size
constexpr size_t size() const
Get the string size.
Definition StringRef.h:144

llvm::StringRef::ltrim
StringRef ltrim(char Char) const
Return string with consecutive Char characters starting from the left removed.
Definition StringRef.h:820

llvm::StringRef::contains
bool contains(StringRef Other) const
Return true if the given string is a substring of *this, and false otherwise.
Definition StringRef.h:446

llvm::StringRef::trim
StringRef trim(char Char) const
Return string with consecutive Char characters starting from the left and right removed.
Definition StringRef.h:844

llvm::TargetTransformInfoImplBase::getInstructionCost
virtual InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind) const
Definition TargetTransformInfoImpl.h:86

llvm::TargetTransformInfo::MaskKind
MaskKind
Some targets only support masked load/store with a constant mask.
Definition TargetTransformInfo.h:898

llvm::TargetTransformInfo::ConstantMask
@ ConstantMask
Definition TargetTransformInfo.h:900

llvm::TargetTransformInfo::TargetCostKind
TargetCostKind
The kind of cost model.
Definition TargetTransformInfo.h:331

llvm::TargetTransformInfo::TCC_Basic
@ TCC_Basic
The cost of a typical 'add' instruction.
Definition TargetTransformInfo.h:358

llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46

llvm::Type::getPointerAddressSpace
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Definition DerivedTypes.h:839

llvm::User
Definition User.h:44

llvm::Value
LLVM Value Representation.
Definition Value.h:75

llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255

ErrorHandling.h

llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition ErrorHandling.h:164

llvm::ISD
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24

llvm::ISD::ADD
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264

llvm::ISD::OR
@ OR
Definition ISDOpcodes.h:740

llvm::ISD::XOR
@ XOR
Definition ISDOpcodes.h:741

llvm::ISD::AND
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739

llvm::ISD::MUL
@ MUL
Definition ISDOpcodes.h:266

llvm::Intrinsic::getOrInsertDeclaration
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
Definition Intrinsics.cpp:780

llvm::Intrinsic::ID
unsigned ID
Definition GenericSSAContext.h:28

llvm::NVPTXAS::ADDRESS_SPACE_LOCAL
@ ADDRESS_SPACE_LOCAL
Definition NVPTXAddrSpace.h:26

llvm::NVPTXAS::ADDRESS_SPACE_GENERIC
@ ADDRESS_SPACE_GENERIC
Definition NVPTXAddrSpace.h:22

llvm::NVPTXAS::ADDRESS_SPACE_ENTRY_PARAM
@ ADDRESS_SPACE_ENTRY_PARAM
Definition NVPTXAddrSpace.h:30

llvm::NVPTXAS::ADDRESS_SPACE_SHARED
@ ADDRESS_SPACE_SHARED
Definition NVPTXAddrSpace.h:24

llvm::NVPTXAS::ADDRESS_SPACE_CONST
@ ADDRESS_SPACE_CONST
Definition NVPTXAddrSpace.h:25

llvm::NVPTXAS::ADDRESS_SPACE_SHARED_CLUSTER
@ ADDRESS_SPACE_SHARED_CLUSTER
Definition NVPTXAddrSpace.h:28

llvm::NVPTXAS::ADDRESS_SPACE_GLOBAL
@ ADDRESS_SPACE_GLOBAL
Definition NVPTXAddrSpace.h:23

llvm::NVPTX::CUDA
@ CUDA
Definition NVPTX.h:139

llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition FunctionInfo.h:25

llvm::dyn_cast
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643

llvm::isAlpha
bool isAlpha(char C)
Checks if character C is a valid letter as classified by "C" locale.
Definition StringExtras.h:118

llvm::split
iterator_range< SplittingIterator > split(StringRef Str, StringRef Separator)
Split the specified string over a separator and return a range-compatible iterable over its partition...
Definition StringExtras.h:609

llvm::isa
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547

llvm::getMaxClusterRank
std::optional< unsigned > getMaxClusterRank(const Function &F)
Definition NVVMProperties.cpp:281

llvm::getMaxNTID
SmallVector< unsigned, 3 > getMaxNTID(const Function &F)
Definition NVVMProperties.cpp:245

llvm::count_if
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:2018

llvm::isKernelFunction
bool isKernelFunction(const Function &F)
Definition NVVMProperties.h:34

llvm::ValueUniformity
ValueUniformity
Enum describing how values behave with respect to uniformity and divergence, to answer the question: ...
Definition Uniformity.h:18

llvm::ValueUniformity::NeverUniform
@ NeverUniform
The result value can never be assumed to be uniform.
Definition Uniformity.h:26

llvm::ValueUniformity::Default
@ Default
The result value is uniform if and only if all operands are uniform.
Definition Uniformity.h:20

llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39

llvm::DenormalMode
Represent subnormal handling kind for floating point instruction inputs and outputs.
Definition FloatingPointMode.h:71

llvm::DenormalMode::PreserveSign
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
Definition FloatingPointMode.h:81

llvm::TargetTransformInfo::OperandValueInfo
Definition TargetTransformInfo.h:1281

llvm::TargetTransformInfo::PeelingPreferences
Definition TargetTransformInfo.h:762

llvm::TargetTransformInfo::UnrollingPreferences
Parameters that control the generic loop unrolling transformation.
Definition TargetTransformInfo.h:638

llvm::TargetTransformInfo::UnrollingPreferences::Threshold
unsigned Threshold
The cost threshold for the unrolled loop.
Definition TargetTransformInfo.h:646

llvm::TargetTransformInfo::UnrollingPreferences::PartialThreshold
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
Definition TargetTransformInfo.h:663

llvm::TargetTransformInfo::UnrollingPreferences::Runtime
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
Definition TargetTransformInfo.h:699

llvm::TargetTransformInfo::UnrollingPreferences::Partial
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
Definition TargetTransformInfo.h:695