LLVM 23.0.0git
RISCVTargetTransformInfo.cpp
Go to the documentation of this file.
1//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
11#include "llvm/ADT/STLExtras.h"
18#include "llvm/IR/IntrinsicsRISCV.h"
21#include <cmath>
22#include <optional>
23using namespace llvm;
24using namespace llvm::PatternMatch;
25
26#define DEBUG_TYPE "riscvtti"
27
29 "riscv-v-register-bit-width-lmul",
31 "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
32 "by autovectorized code. Fractional LMULs are not supported."),
34
36 "riscv-v-slp-max-vf",
38 "Overrides result used for getMaximumVF query which is used "
39 "exclusively by SLP vectorizer."),
41
43 RVVMinTripCount("riscv-v-min-trip-count",
44 cl::desc("Set the lower bound of a trip count to decide on "
45 "vectorization while tail-folding."),
47
48static cl::opt<bool> EnableOrLikeSelectOpt("enable-riscv-or-like-select",
49 cl::init(true), cl::Hidden);
50
52RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
54 // Check if the type is valid for all CostKind
55 if (!VT.isVector())
57 size_t NumInstr = OpCodes.size();
59 return NumInstr;
60 InstructionCost LMULCost = TLI->getLMULCost(VT);
62 return LMULCost * NumInstr;
63 InstructionCost Cost = 0;
64 for (auto Op : OpCodes) {
65 switch (Op) {
66 case RISCV::VRGATHER_VI:
67 Cost += TLI->getVRGatherVICost(VT);
68 break;
69 case RISCV::VRGATHER_VV:
70 Cost += TLI->getVRGatherVVCost(VT);
71 break;
72 case RISCV::VSLIDEUP_VI:
73 case RISCV::VSLIDEDOWN_VI:
74 Cost += TLI->getVSlideVICost(VT);
75 break;
76 case RISCV::VSLIDEUP_VX:
77 case RISCV::VSLIDEDOWN_VX:
78 Cost += TLI->getVSlideVXCost(VT);
79 break;
80 case RISCV::VREDMAX_VS:
81 case RISCV::VREDMIN_VS:
82 case RISCV::VREDMAXU_VS:
83 case RISCV::VREDMINU_VS:
84 case RISCV::VREDSUM_VS:
85 case RISCV::VREDAND_VS:
86 case RISCV::VREDOR_VS:
87 case RISCV::VREDXOR_VS:
88 case RISCV::VFREDMAX_VS:
89 case RISCV::VFREDMIN_VS:
90 case RISCV::VFREDUSUM_VS: {
91 unsigned VL = VT.getVectorMinNumElements();
92 if (!VT.isFixedLengthVector())
93 VL *= *getVScaleForTuning();
94 Cost += Log2_32_Ceil(VL);
95 break;
96 }
97 case RISCV::VFREDOSUM_VS: {
98 unsigned VL = VT.getVectorMinNumElements();
99 if (!VT.isFixedLengthVector())
100 VL *= *getVScaleForTuning();
101 Cost += VL;
102 break;
103 }
104 case RISCV::VMV_X_S:
105 case RISCV::VMV_S_X:
106 case RISCV::VFMV_F_S:
107 case RISCV::VFMV_S_F:
108 case RISCV::VMOR_MM:
109 case RISCV::VMXOR_MM:
110 case RISCV::VMAND_MM:
111 case RISCV::VMANDN_MM:
112 case RISCV::VMNAND_MM:
113 case RISCV::VCPOP_M:
114 case RISCV::VFIRST_M:
115 Cost += 1;
116 break;
117 case RISCV::VDIV_VV:
118 case RISCV::VREM_VV:
119 Cost += LMULCost * TTI::TCC_Expensive;
120 break;
121 default:
122 Cost += LMULCost;
123 }
124 }
125 return Cost;
126}
127
129 const RISCVSubtarget *ST,
130 const APInt &Imm, Type *Ty,
132 bool FreeZeroes) {
133 assert(Ty->isIntegerTy() &&
134 "getIntImmCost can only estimate cost of materialising integers");
135
136 // We have a Zero register, so 0 is always free.
137 if (Imm == 0)
138 return TTI::TCC_Free;
139
140 // Otherwise, we check how many instructions it will take to materialise.
141 return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), *ST,
142 /*CompressionCost=*/false, FreeZeroes);
143}
144
148 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind, false);
149}
150
151// Look for patterns of shift followed by AND that can be turned into a pair of
152// shifts. We won't need to materialize an immediate for the AND so these can
153// be considered free.
154static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
155 uint64_t Mask = Imm.getZExtValue();
156 auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
157 if (!BO || !BO->hasOneUse())
158 return false;
159
160 if (BO->getOpcode() != Instruction::Shl)
161 return false;
162
163 if (!isa<ConstantInt>(BO->getOperand(1)))
164 return false;
165
166 unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
167 // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
168 // is a mask shifted by c2 bits with c3 leading zeros.
169 if (isShiftedMask_64(Mask)) {
170 unsigned Trailing = llvm::countr_zero(Mask);
171 if (ShAmt == Trailing)
172 return true;
173 }
174
175 return false;
176}
177
178// If this is i64 AND is part of (X & -(1 << C1) & 0xffffffff) == C2 << C1),
179// DAGCombiner can convert this to (sraiw X, C1) == sext(C2) for RV64. On RV32,
180// the type will be split so only the lower 32 bits need to be compared using
181// (srai/srli X, C) == C2.
182static bool canUseShiftCmp(Instruction *Inst, const APInt &Imm) {
183 if (!Inst->hasOneUse())
184 return false;
185
186 // Look for equality comparison.
187 auto *Cmp = dyn_cast<ICmpInst>(*Inst->user_begin());
188 if (!Cmp || !Cmp->isEquality())
189 return false;
190
191 // Right hand side of comparison should be a constant.
192 auto *C = dyn_cast<ConstantInt>(Cmp->getOperand(1));
193 if (!C)
194 return false;
195
196 uint64_t Mask = Imm.getZExtValue();
197
198 // Mask should be of the form -(1 << C) in the lower 32 bits.
199 if (!isUInt<32>(Mask) || !isPowerOf2_32(-uint32_t(Mask)))
200 return false;
201
202 // Comparison constant should be a subset of Mask.
203 uint64_t CmpC = C->getZExtValue();
204 if ((CmpC & Mask) != CmpC)
205 return false;
206
207 // We'll need to sign extend the comparison constant and shift it right. Make
208 // sure the new constant can use addi/xori+seqz/snez.
209 unsigned ShiftBits = llvm::countr_zero(Mask);
210 int64_t NewCmpC = SignExtend64<32>(CmpC) >> ShiftBits;
211 return NewCmpC >= -2048 && NewCmpC <= 2048;
212}
213
215 const APInt &Imm, Type *Ty,
217 Instruction *Inst) const {
218 assert(Ty->isIntegerTy() &&
219 "getIntImmCost can only estimate cost of materialising integers");
220
221 // We have a Zero register, so 0 is always free.
222 if (Imm == 0)
223 return TTI::TCC_Free;
224
225 // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
226 // commutative, in others the immediate comes from a specific argument index.
227 bool Takes12BitImm = false;
228 unsigned ImmArgIdx = ~0U;
229
230 switch (Opcode) {
231 case Instruction::GetElementPtr:
232 // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
233 // split up large offsets in GEP into better parts than ConstantHoisting
234 // can.
235 return TTI::TCC_Free;
236 case Instruction::Store: {
237 // Use the materialization cost regardless of if it's the address or the
238 // value that is constant, except for if the store is misaligned and
239 // misaligned accesses are not legal (experience shows constant hoisting
240 // can sometimes be harmful in such cases).
241 if (Idx == 1 || !Inst)
242 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
243 /*FreeZeroes=*/true);
244
245 StoreInst *ST = cast<StoreInst>(Inst);
246 if (!getTLI()->allowsMemoryAccessForAlignment(
247 Ty->getContext(), DL, getTLI()->getValueType(DL, Ty),
248 ST->getPointerAddressSpace(), ST->getAlign()))
249 return TTI::TCC_Free;
250
251 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
252 /*FreeZeroes=*/true);
253 }
254 case Instruction::Load:
255 // If the address is a constant, use the materialization cost.
256 return getIntImmCost(Imm, Ty, CostKind);
257 case Instruction::And:
258 // zext.h
259 if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
260 return TTI::TCC_Free;
261 // zext.w
262 if (Imm == UINT64_C(0xffffffff) &&
263 ((ST->hasStdExtZba() && ST->isRV64()) || ST->isRV32()))
264 return TTI::TCC_Free;
265 // bclri
266 if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
267 return TTI::TCC_Free;
268 if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
269 canUseShiftPair(Inst, Imm))
270 return TTI::TCC_Free;
271 if (Inst && Idx == 1 && Imm.getBitWidth() == 64 &&
272 canUseShiftCmp(Inst, Imm))
273 return TTI::TCC_Free;
274 Takes12BitImm = true;
275 break;
276 case Instruction::Add:
277 Takes12BitImm = true;
278 break;
279 case Instruction::Or:
280 case Instruction::Xor:
281 // bseti/binvi
282 if (ST->hasStdExtZbs() && Imm.isPowerOf2())
283 return TTI::TCC_Free;
284 Takes12BitImm = true;
285 break;
286 case Instruction::Mul:
287 // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
288 if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
289 return TTI::TCC_Free;
290 // One more or less than a power of 2 can use SLLI+ADD/SUB.
291 if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2())
292 return TTI::TCC_Free;
293 // FIXME: There is no MULI instruction.
294 Takes12BitImm = true;
295 break;
296 case Instruction::Sub:
297 case Instruction::Shl:
298 case Instruction::LShr:
299 case Instruction::AShr:
300 Takes12BitImm = true;
301 ImmArgIdx = 1;
302 break;
303 default:
304 break;
305 }
306
307 if (Takes12BitImm) {
308 // Check immediate is the correct argument...
309 if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
310 // ... and fits into the 12-bit immediate.
311 if (Imm.getSignificantBits() <= 64 &&
312 getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
313 return TTI::TCC_Free;
314 }
315 }
316
317 // Otherwise, use the full materialisation cost.
318 return getIntImmCost(Imm, Ty, CostKind);
319 }
320
321 // By default, prevent hoisting.
322 return TTI::TCC_Free;
323}
324
327 const APInt &Imm, Type *Ty,
329 // Prevent hoisting in unknown cases.
330 return TTI::TCC_Free;
331}
332
334 return ST->hasVInstructions();
335}
336
338RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) const {
339 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
340 return ST->hasCPOPLike() ? TTI::PSK_FastHardware : TTI::PSK_Software;
341}
342
344 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
346 TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
347 TTI::TargetCostKind CostKind, std::optional<FastMathFlags> FMF) const {
348 if (Opcode == Instruction::FAdd)
350
351 // zve32x is broken for partial_reduce_umla, but let's make sure we
352 // don't generate them.
353 if (!ST->hasStdExtZvdot4a8i() || ST->getELen() < 64 ||
354 Opcode != Instruction::Add || !BinOp || *BinOp != Instruction::Mul ||
355 InputTypeA != InputTypeB || !InputTypeA->isIntegerTy(8) ||
356 !AccumType->isIntegerTy(32) || !VF.isKnownMultipleOf(4))
358
359 Type *Tp = VectorType::get(AccumType, VF.divideCoefficientBy(4));
360 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
361 // Note: Asuming all vdota4* variants are equal cost
362 return LT.first *
363 getRISCVInstructionCost(RISCV::VDOTA4_VV, LT.second, CostKind);
364}
365
367 // Currently, the ExpandReductions pass can't expand scalable-vector
368 // reductions, but we still request expansion as RVV doesn't support certain
369 // reductions and the SelectionDAG can't legalize them either.
370 switch (II->getIntrinsicID()) {
371 default:
372 return false;
373 // These reductions have no equivalent in RVV
374 case Intrinsic::vector_reduce_mul:
375 case Intrinsic::vector_reduce_fmul:
376 return true;
377 }
378}
379
380std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
381 if (ST->hasVInstructions())
382 return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock;
383 return BaseT::getMaxVScale();
384}
385
386std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
387 if (ST->hasVInstructions())
388 if (unsigned MinVLen = ST->getRealMinVLen();
389 MinVLen >= RISCV::RVVBitsPerBlock)
390 return MinVLen / RISCV::RVVBitsPerBlock;
392}
393
396 unsigned LMUL =
397 llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
398 switch (K) {
400 return TypeSize::getFixed(ST->getXLen());
402 return TypeSize::getFixed(
403 ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
406 (ST->hasVInstructions() &&
407 ST->getRealMinVLen() >= RISCV::RVVBitsPerBlock)
409 : 0);
410 }
411
412 llvm_unreachable("Unsupported register kind");
413}
414
415InstructionCost RISCVTTIImpl::getStaticDataAddrGenerationCost(
416 const TTI::TargetCostKind CostKind) const {
417 switch (CostKind) {
420 // Always 2 instructions
421 return 2;
422 case TTI::TCK_Latency:
424 // Depending on the memory model the address generation will
425 // require AUIPC + ADDI (medany) or LUI + ADDI (medlow). Don't
426 // have a way of getting this information here, so conservatively
427 // require both.
428 // In practice, these are generally implemented together.
429 return (ST->hasAUIPCADDIFusion() && ST->hasLUIADDIFusion()) ? 1 : 2;
430 }
431 llvm_unreachable("Unsupported cost kind");
432}
433
435RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty,
437 // Add a cost of address generation + the cost of the load. The address
438 // is expected to be a PC relative offset to a constant pool entry
439 // using auipc/addi.
440 return getStaticDataAddrGenerationCost(CostKind) +
441 getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
442 /*AddressSpace=*/0, CostKind);
443}
444
445static bool isRepeatedConcatMask(ArrayRef<int> Mask, int &SubVectorSize) {
446 unsigned Size = Mask.size();
447 if (!isPowerOf2_32(Size))
448 return false;
449 for (unsigned I = 0; I != Size; ++I) {
450 if (static_cast<unsigned>(Mask[I]) == I)
451 continue;
452 if (Mask[I] != 0)
453 return false;
454 if (Size % I != 0)
455 return false;
456 for (unsigned J = I + 1; J != Size; ++J)
457 // Check the pattern is repeated.
458 if (static_cast<unsigned>(Mask[J]) != J % I)
459 return false;
460 SubVectorSize = I;
461 return true;
462 }
463 // That means Mask is <0, 1, 2, 3>. This is not a concatenation.
464 return false;
465}
466
468 LLVMContext &C) {
469 assert((DataVT.getScalarSizeInBits() != 8 ||
470 DataVT.getVectorNumElements() <= 256) && "unhandled case in lowering");
471 MVT IndexVT = DataVT.changeTypeToInteger();
472 if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
473 IndexVT = IndexVT.changeVectorElementType(MVT::i16);
474 return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
475}
476
477/// Attempt to approximate the cost of a shuffle which will require splitting
478/// during legalization. Note that processShuffleMasks is not an exact proxy
479/// for the algorithm used in LegalizeVectorTypes, but hopefully it's a
480/// reasonably close upperbound.
482 MVT LegalVT, VectorType *Tp,
483 ArrayRef<int> Mask,
485 assert(LegalVT.isFixedLengthVector() && !Mask.empty() &&
486 "Expected fixed vector type and non-empty mask");
487 unsigned LegalNumElts = LegalVT.getVectorNumElements();
488 // Number of destination vectors after legalization:
489 unsigned NumOfDests = divideCeil(Mask.size(), LegalNumElts);
490 // We are going to permute multiple sources and the result will be in
491 // multiple destinations. Providing an accurate cost only for splits where
492 // the element type remains the same.
493 if (NumOfDests <= 1 ||
495 Tp->getElementType()->getPrimitiveSizeInBits() ||
496 LegalNumElts >= Tp->getElementCount().getFixedValue())
498
499 unsigned VecTySize = TTI.getDataLayout().getTypeStoreSize(Tp);
500 unsigned LegalVTSize = LegalVT.getStoreSize();
501 // Number of source vectors after legalization:
502 unsigned NumOfSrcs = divideCeil(VecTySize, LegalVTSize);
503
504 auto *SingleOpTy = FixedVectorType::get(Tp->getElementType(), LegalNumElts);
505
506 unsigned NormalizedVF = LegalNumElts * std::max(NumOfSrcs, NumOfDests);
507 unsigned NumOfSrcRegs = NormalizedVF / LegalNumElts;
508 unsigned NumOfDestRegs = NormalizedVF / LegalNumElts;
509 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
510 assert(NormalizedVF >= Mask.size() &&
511 "Normalized mask expected to be not shorter than original mask.");
512 copy(Mask, NormalizedMask.begin());
513 InstructionCost Cost = 0;
514 SmallDenseSet<std::pair<ArrayRef<int>, unsigned>> ReusedSingleSrcShuffles;
516 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
517 [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
518 if (ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size()))
519 return;
520 if (!ReusedSingleSrcShuffles.insert(std::make_pair(RegMask, SrcReg))
521 .second)
522 return;
523 Cost += TTI.getShuffleCost(
525 FixedVectorType::get(SingleOpTy->getElementType(), RegMask.size()),
526 SingleOpTy, RegMask, CostKind, 0, nullptr);
527 },
528 [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
529 Cost += TTI.getShuffleCost(
531 FixedVectorType::get(SingleOpTy->getElementType(), RegMask.size()),
532 SingleOpTy, RegMask, CostKind, 0, nullptr);
533 });
534 return Cost;
535}
536
537/// Try to perform better estimation of the permutation.
538/// 1. Split the source/destination vectors into real registers.
539/// 2. Do the mask analysis to identify which real registers are
540/// permuted. If more than 1 source registers are used for the
541/// destination register building, the cost for this destination register
542/// is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one
543/// source register is used, build mask and calculate the cost as a cost
544/// of PermuteSingleSrc.
545/// Also, for the single register permute we try to identify if the
546/// destination register is just a copy of the source register or the
547/// copy of the previous destination register (the cost is
548/// TTI::TCC_Basic). If the source register is just reused, the cost for
549/// this operation is 0.
550static InstructionCost
552 std::optional<unsigned> VLen, VectorType *Tp,
554 assert(LegalVT.isFixedLengthVector());
555 if (!VLen || Mask.empty())
557 MVT ElemVT = LegalVT.getVectorElementType();
558 unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits();
559 LegalVT = TTI.getTypeLegalizationCost(
560 FixedVectorType::get(Tp->getElementType(), ElemsPerVReg))
561 .second;
562 // Number of destination vectors after legalization:
563 InstructionCost NumOfDests =
564 divideCeil(Mask.size(), LegalVT.getVectorNumElements());
565 if (NumOfDests <= 1 ||
567 Tp->getElementType()->getPrimitiveSizeInBits() ||
568 LegalVT.getVectorNumElements() >= Tp->getElementCount().getFixedValue())
570
571 unsigned VecTySize = TTI.getDataLayout().getTypeStoreSize(Tp);
572 unsigned LegalVTSize = LegalVT.getStoreSize();
573 // Number of source vectors after legalization:
574 unsigned NumOfSrcs = divideCeil(VecTySize, LegalVTSize);
575
576 auto *SingleOpTy = FixedVectorType::get(Tp->getElementType(),
577 LegalVT.getVectorNumElements());
578
579 unsigned E = NumOfDests.getValue();
580 unsigned NormalizedVF =
581 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
582 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
583 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
584 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
585 assert(NormalizedVF >= Mask.size() &&
586 "Normalized mask expected to be not shorter than original mask.");
587 copy(Mask, NormalizedMask.begin());
588 InstructionCost Cost = 0;
589 int NumShuffles = 0;
590 SmallDenseSet<std::pair<ArrayRef<int>, unsigned>> ReusedSingleSrcShuffles;
592 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
593 [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
594 if (ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size()))
595 return;
596 if (!ReusedSingleSrcShuffles.insert(std::make_pair(RegMask, SrcReg))
597 .second)
598 return;
599 ++NumShuffles;
600 Cost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
601 SingleOpTy, RegMask, CostKind, 0, nullptr);
602 },
603 [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
604 Cost += TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
605 SingleOpTy, RegMask, CostKind, 0, nullptr);
606 NumShuffles += 2;
607 });
608 // Note: check that we do not emit too many shuffles here to prevent code
609 // size explosion.
610 // TODO: investigate, if it can be improved by extra analysis of the masks
611 // to check if the code is more profitable.
612 if ((NumOfDestRegs > 2 && NumShuffles <= static_cast<int>(NumOfDestRegs)) ||
613 (NumOfDestRegs <= 2 && NumShuffles < 4))
614 return Cost;
616}
617
618InstructionCost RISCVTTIImpl::getSlideCost(FixedVectorType *Tp,
619 ArrayRef<int> Mask,
621 // Avoid missing masks and length changing shuffles
622 if (Mask.size() <= 2 || Mask.size() != Tp->getNumElements())
624
625 int NumElts = Tp->getNumElements();
626 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
627 // Avoid scalarization cases
628 if (!LT.second.isFixedLengthVector())
630
631 // Requires moving elements between parts, which requires additional
632 // unmodeled instructions.
633 if (LT.first != 1)
635
636 auto GetSlideOpcode = [&](int SlideAmt) {
637 assert(SlideAmt != 0);
638 bool IsVI = isUInt<5>(std::abs(SlideAmt));
639 if (SlideAmt < 0)
640 return IsVI ? RISCV::VSLIDEDOWN_VI : RISCV::VSLIDEDOWN_VX;
641 return IsVI ? RISCV::VSLIDEUP_VI : RISCV::VSLIDEUP_VX;
642 };
643
644 std::array<std::pair<int, int>, 2> SrcInfo;
645 if (!isMaskedSlidePair(Mask, NumElts, SrcInfo))
647
648 if (SrcInfo[1].second == 0)
649 std::swap(SrcInfo[0], SrcInfo[1]);
650
651 InstructionCost FirstSlideCost = 0;
652 if (SrcInfo[0].second != 0) {
653 unsigned Opcode = GetSlideOpcode(SrcInfo[0].second);
654 FirstSlideCost = getRISCVInstructionCost(Opcode, LT.second, CostKind);
655 }
656
657 if (SrcInfo[1].first == -1)
658 return FirstSlideCost;
659
660 InstructionCost SecondSlideCost = 0;
661 if (SrcInfo[1].second != 0) {
662 unsigned Opcode = GetSlideOpcode(SrcInfo[1].second);
663 SecondSlideCost = getRISCVInstructionCost(Opcode, LT.second, CostKind);
664 } else {
665 SecondSlideCost =
666 getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second, CostKind);
667 }
668
669 auto EC = Tp->getElementCount();
670 VectorType *MaskTy =
672 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
673 return FirstSlideCost + SecondSlideCost + MaskCost;
674}
675
678 VectorType *SrcTy, ArrayRef<int> Mask,
679 TTI::TargetCostKind CostKind, int Index,
681 const Instruction *CxtI) const {
682 assert((Mask.empty() || DstTy->isScalableTy() ||
683 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
684 "Expected the Mask to match the return size if given");
685 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
686 "Expected the same scalar types");
687
688 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
689 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
690
691 // First, handle cases where having a fixed length vector enables us to
692 // give a more accurate cost than falling back to generic scalable codegen.
693 // TODO: Each of these cases hints at a modeling gap around scalable vectors.
694 if (auto *FVTp = dyn_cast<FixedVectorType>(SrcTy);
695 FVTp && ST->hasVInstructions() && LT.second.isFixedLengthVector()) {
697 *this, LT.second, ST->getRealVLen(),
698 Kind == TTI::SK_InsertSubvector ? DstTy : SrcTy, Mask, CostKind);
699 if (VRegSplittingCost.isValid())
700 return VRegSplittingCost;
701 switch (Kind) {
702 default:
703 break;
705 if (Mask.size() >= 2) {
706 MVT EltTp = LT.second.getVectorElementType();
707 // If the size of the element is < ELEN then shuffles of interleaves and
708 // deinterleaves of 2 vectors can be lowered into the following
709 // sequences
710 if (EltTp.getScalarSizeInBits() < ST->getELen()) {
711 // Example sequence:
712 // vsetivli zero, 4, e8, mf4, ta, ma (ignored)
713 // vwaddu.vv v10, v8, v9
714 // li a0, -1 (ignored)
715 // vwmaccu.vx v10, a0, v9
716 if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
717 return 2 * LT.first * TLI->getLMULCost(LT.second);
718
719 if (Mask[0] == 0 || Mask[0] == 1) {
720 auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
721 // Example sequence:
722 // vnsrl.wi v10, v8, 0
723 if (equal(DeinterleaveMask, Mask))
724 return LT.first * getRISCVInstructionCost(RISCV::VNSRL_WI,
725 LT.second, CostKind);
726 }
727 }
728 int SubVectorSize;
729 if (LT.second.getScalarSizeInBits() != 1 &&
730 isRepeatedConcatMask(Mask, SubVectorSize)) {
732 unsigned NumSlides = Log2_32(Mask.size() / SubVectorSize);
733 // The cost of extraction from a subvector is 0 if the index is 0.
734 for (unsigned I = 0; I != NumSlides; ++I) {
735 unsigned InsertIndex = SubVectorSize * (1 << I);
736 FixedVectorType *SubTp =
737 FixedVectorType::get(SrcTy->getElementType(), InsertIndex);
738 FixedVectorType *DestTp =
740 std::pair<InstructionCost, MVT> DestLT =
742 // Add the cost of whole vector register move because the
743 // destination vector register group for vslideup cannot overlap the
744 // source.
745 Cost += DestLT.first * TLI->getLMULCost(DestLT.second);
746 Cost += getShuffleCost(TTI::SK_InsertSubvector, DestTp, DestTp, {},
747 CostKind, InsertIndex, SubTp);
748 }
749 return Cost;
750 }
751 }
752
753 if (InstructionCost SlideCost = getSlideCost(FVTp, Mask, CostKind);
754 SlideCost.isValid())
755 return SlideCost;
756
757 // vrgather + cost of generating the mask constant.
758 // We model this for an unknown mask with a single vrgather.
759 if (LT.first == 1 && (LT.second.getScalarSizeInBits() != 8 ||
760 LT.second.getVectorNumElements() <= 256)) {
761 VectorType *IdxTy =
762 getVRGatherIndexType(LT.second, *ST, SrcTy->getContext());
763 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
764 return IndexCost +
765 getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
766 }
767 break;
768 }
771
772 if (InstructionCost SlideCost = getSlideCost(FVTp, Mask, CostKind);
773 SlideCost.isValid())
774 return SlideCost;
775
776 // 2 x (vrgather + cost of generating the mask constant) + cost of mask
777 // register for the second vrgather. We model this for an unknown
778 // (shuffle) mask.
779 if (LT.first == 1 && (LT.second.getScalarSizeInBits() != 8 ||
780 LT.second.getVectorNumElements() <= 256)) {
781 auto &C = SrcTy->getContext();
782 auto EC = SrcTy->getElementCount();
783 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
785 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
786 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
787 return 2 * IndexCost +
788 getRISCVInstructionCost({RISCV::VRGATHER_VV, RISCV::VRGATHER_VV},
789 LT.second, CostKind) +
790 MaskCost;
791 }
792 break;
793 }
794 }
795
796 auto shouldSplit = [](TTI::ShuffleKind Kind) {
797 switch (Kind) {
798 default:
799 return false;
803 return true;
804 }
805 };
806
807 if (!Mask.empty() && LT.first.isValid() && LT.first != 1 &&
808 shouldSplit(Kind)) {
809 InstructionCost SplitCost =
810 costShuffleViaSplitting(*this, LT.second, FVTp, Mask, CostKind);
811 if (SplitCost.isValid())
812 return SplitCost;
813 }
814 }
815
816 // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
817 switch (Kind) {
818 default:
819 // Fallthrough to generic handling.
820 // TODO: Most of these cases will return getInvalid in generic code, and
821 // must be implemented here.
822 break;
824 // Extract at zero is always a subregister extract
825 if (Index == 0)
826 return TTI::TCC_Free;
827
828 // If we're extracting a subvector of at most m1 size at a sub-register
829 // boundary - which unfortunately we need exact vlen to identify - this is
830 // a subregister extract at worst and thus won't require a vslidedown.
831 // TODO: Extend for aligned m2, m4 subvector extracts
832 // TODO: Extend for misalgined (but contained) extracts
833 // TODO: Extend for scalable subvector types
834 if (std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
835 SubLT.second.isValid() && SubLT.second.isFixedLengthVector()) {
836 if (std::optional<unsigned> VLen = ST->getRealVLen();
837 VLen && SubLT.second.getScalarSizeInBits() * Index % *VLen == 0 &&
838 SubLT.second.getSizeInBits() <= *VLen)
839 return TTI::TCC_Free;
840 }
841
842 // Example sequence:
843 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
844 // vslidedown.vi v8, v9, 2
845 return LT.first *
846 getRISCVInstructionCost(RISCV::VSLIDEDOWN_VI, LT.second, CostKind);
848 // Example sequence:
849 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
850 // vslideup.vi v8, v9, 2
851 LT = getTypeLegalizationCost(DstTy);
852 return LT.first *
853 getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind);
854 case TTI::SK_Select: {
855 // Example sequence:
856 // li a0, 90
857 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
858 // vmv.s.x v0, a0
859 // vmerge.vvm v8, v9, v8, v0
860 // We use 2 for the cost of the mask materialization as this is the true
861 // cost for small masks and most shuffles are small. At worst, this cost
862 // should be a very small constant for the constant pool load. As such,
863 // we may bias towards large selects slightly more than truly warranted.
864 return LT.first *
865 (1 + getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM},
866 LT.second, CostKind));
867 }
868 case TTI::SK_Broadcast: {
869 bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
870 Instruction::InsertElement);
871 if (LT.second.getScalarSizeInBits() == 1) {
872 if (HasScalar) {
873 // Example sequence:
874 // andi a0, a0, 1
875 // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
876 // vmv.v.x v8, a0
877 // vmsne.vi v0, v8, 0
878 return LT.first *
879 (1 + getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
880 LT.second, CostKind));
881 }
882 // Example sequence:
883 // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
884 // vmv.v.i v8, 0
885 // vmerge.vim v8, v8, 1, v0
886 // vmv.x.s a0, v8
887 // andi a0, a0, 1
888 // vmv.v.x v8, a0
889 // vmsne.vi v0, v8, 0
890
891 return LT.first *
892 (1 + getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM,
893 RISCV::VMV_X_S, RISCV::VMV_V_X,
894 RISCV::VMSNE_VI},
895 LT.second, CostKind));
896 }
897
898 if (HasScalar) {
899 // Example sequence:
900 // vmv.v.x v8, a0
901 return LT.first *
902 getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind);
903 }
904
905 // Example sequence:
906 // vrgather.vi v9, v8, 0
907 return LT.first *
908 getRISCVInstructionCost(RISCV::VRGATHER_VI, LT.second, CostKind);
909 }
910 case TTI::SK_Splice: {
911 // vslidedown+vslideup.
912 // TODO: Multiplying by LT.first implies this legalizes into multiple copies
913 // of similar code, but I think we expand through memory.
914 unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX};
915 if (Index >= 0 && Index < 32)
916 Opcodes[0] = RISCV::VSLIDEDOWN_VI;
917 else if (Index < 0 && Index > -32)
918 Opcodes[1] = RISCV::VSLIDEUP_VI;
919 return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
920 }
921 case TTI::SK_Reverse: {
922
923 if (!LT.second.isVector())
925
926 // TODO: Cases to improve here:
927 // * Illegal vector types
928 // * i64 on RV32
929 if (SrcTy->getElementType()->isIntegerTy(1)) {
930 VectorType *WideTy =
931 VectorType::get(IntegerType::get(SrcTy->getContext(), 8),
932 cast<VectorType>(SrcTy)->getElementCount());
933 return getCastInstrCost(Instruction::ZExt, WideTy, SrcTy,
935 getShuffleCost(TTI::SK_Reverse, WideTy, WideTy, {}, CostKind, 0,
936 nullptr) +
937 getCastInstrCost(Instruction::Trunc, SrcTy, WideTy,
939 }
940
941 MVT ContainerVT = LT.second;
942 if (LT.second.isFixedLengthVector())
943 ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
944 MVT M1VT = RISCVTargetLowering::getM1VT(ContainerVT);
945 if (ContainerVT.bitsLE(M1VT)) {
946 // Example sequence:
947 // csrr a0, vlenb
948 // srli a0, a0, 3
949 // addi a0, a0, -1
950 // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
951 // vid.v v9
952 // vrsub.vx v10, v9, a0
953 // vrgather.vv v9, v8, v10
954 InstructionCost LenCost = 3;
955 if (LT.second.isFixedLengthVector())
956 // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
957 LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
958 unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV};
959 if (LT.second.isFixedLengthVector() &&
960 isInt<5>(LT.second.getVectorNumElements() - 1))
961 Opcodes[1] = RISCV::VRSUB_VI;
962 InstructionCost GatherCost =
963 getRISCVInstructionCost(Opcodes, LT.second, CostKind);
964 return LT.first * (LenCost + GatherCost);
965 }
966
967 // At high LMUL, we split into a series of M1 reverses (see
968 // lowerVECTOR_REVERSE) and then do a single slide at the end to eliminate
969 // the resulting gap at the bottom (for fixed vectors only). The important
970 // bit is that the cost scales linearly, not quadratically with LMUL.
971 unsigned M1Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX};
972 InstructionCost FixedCost =
973 getRISCVInstructionCost(M1Opcodes, M1VT, CostKind) + 3;
974 unsigned Ratio =
976 InstructionCost GatherCost =
977 getRISCVInstructionCost({RISCV::VRGATHER_VV}, M1VT, CostKind) * Ratio;
978 InstructionCost SlideCost = !LT.second.isFixedLengthVector() ? 0 :
979 getRISCVInstructionCost({RISCV::VSLIDEDOWN_VX}, LT.second, CostKind);
980 return FixedCost + LT.first * (GatherCost + SlideCost);
981 }
982 }
983 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
984 SubTp);
985}
986
987static unsigned isM1OrSmaller(MVT VT) {
989 return (LMUL == RISCVVType::VLMUL::LMUL_F8 ||
993}
994
996 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
997 TTI::TargetCostKind CostKind, bool ForPoisonSrc, ArrayRef<Value *> VL,
998 TTI::VectorInstrContext VIC) const {
1001
1002 // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
1003 // For now, skip all fixed vector cost analysis when P extension is available
1004 // to avoid crashes in getMinRVVVectorSizeInBits()
1005 if (ST->hasStdExtP() && isa<FixedVectorType>(Ty)) {
1006 return 1; // Treat as single instruction cost for now
1007 }
1008
1009 // A build_vector (which is m1 sized or smaller) can be done in no
1010 // worse than one vslide1down.vx per element in the type. We could
1011 // in theory do an explode_vector in the inverse manner, but our
1012 // lowering today does not have a first class node for this pattern.
1014 Ty, DemandedElts, Insert, Extract, CostKind);
1015 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1016 if (Insert && !Extract && LT.first.isValid() && LT.second.isVector()) {
1017 if (Ty->getScalarSizeInBits() == 1) {
1018 auto *WideVecTy = cast<VectorType>(Ty->getWithNewBitWidth(8));
1019 // Note: Implicit scalar anyextend is assumed to be free since the i1
1020 // must be stored in a GPR.
1021 return getScalarizationOverhead(WideVecTy, DemandedElts, Insert, Extract,
1022 CostKind) +
1023 getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
1025 }
1026
1027 assert(LT.second.isFixedLengthVector());
1028 MVT ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
1029 if (isM1OrSmaller(ContainerVT)) {
1030 InstructionCost BV =
1031 cast<FixedVectorType>(Ty)->getNumElements() *
1032 getRISCVInstructionCost(RISCV::VSLIDE1DOWN_VX, LT.second, CostKind);
1033 if (BV < Cost)
1034 Cost = BV;
1035 }
1036 }
1037 return Cost;
1038}
1039
1043 Type *DataTy = MICA.getDataType();
1044 Align Alignment = MICA.getAlignment();
1045 switch (MICA.getID()) {
1046 case Intrinsic::vp_load_ff: {
1047 EVT DataTypeVT = TLI->getValueType(DL, DataTy);
1048 if (!TLI->isLegalFirstFaultLoad(DataTypeVT, Alignment))
1050
1051 unsigned AS = MICA.getAddressSpace();
1052 return getMemoryOpCost(Instruction::Load, DataTy, Alignment, AS, CostKind,
1053 {TTI::OK_AnyValue, TTI::OP_None}, nullptr);
1054 }
1055 case Intrinsic::experimental_vp_strided_load:
1056 case Intrinsic::experimental_vp_strided_store:
1057 return getStridedMemoryOpCost(MICA, CostKind);
1058 case Intrinsic::masked_compressstore:
1059 case Intrinsic::masked_expandload:
1061 case Intrinsic::vp_scatter:
1062 case Intrinsic::vp_gather:
1063 case Intrinsic::masked_scatter:
1064 case Intrinsic::masked_gather:
1065 return getGatherScatterOpCost(MICA, CostKind);
1066 case Intrinsic::vp_load:
1067 case Intrinsic::vp_store:
1068 case Intrinsic::masked_load:
1069 case Intrinsic::masked_store:
1070 return getMaskedMemoryOpCost(MICA, CostKind);
1071 }
1073}
1074
1078 unsigned Opcode = MICA.getID() == Intrinsic::masked_load ? Instruction::Load
1079 : Instruction::Store;
1080 Type *Src = MICA.getDataType();
1081 Align Alignment = MICA.getAlignment();
1082 unsigned AddressSpace = MICA.getAddressSpace();
1083
1084 if (!isLegalMaskedLoadStore(Src, Alignment) ||
1087
1088 return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
1089}
1090
1092 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1093 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1094 bool UseMaskForCond, bool UseMaskForGaps) const {
1095
1096 // The interleaved memory access pass will lower (de)interleave ops combined
1097 // with an adjacent appropriate memory to vlseg/vsseg intrinsics. vlseg/vsseg
1098 // only support masking per-iteration (i.e. condition), not per-segment (i.e.
1099 // gap).
1100 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
1101 auto *VTy = cast<VectorType>(VecTy);
1102 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VTy);
1103 // Need to make sure type has't been scalarized
1104 if (LT.second.isVector()) {
1105 auto *SubVecTy =
1106 VectorType::get(VTy->getElementType(),
1107 VTy->getElementCount().divideCoefficientBy(Factor));
1108 if (VTy->getElementCount().isKnownMultipleOf(Factor) &&
1109 TLI->isLegalInterleavedAccessType(SubVecTy, Factor, Alignment,
1110 AddressSpace, DL)) {
1111
1112 // Some processors optimize segment loads/stores as one wide memory op +
1113 // Factor * LMUL shuffle ops.
1114 if (ST->hasOptimizedSegmentLoadStore(Factor)) {
1116 getMemoryOpCost(Opcode, VTy, Alignment, AddressSpace, CostKind);
1117 MVT SubVecVT = getTLI()->getValueType(DL, SubVecTy).getSimpleVT();
1118 Cost += Factor * TLI->getLMULCost(SubVecVT);
1119 return LT.first * Cost;
1120 }
1121
1122 // Otherwise, the cost is proportional to the number of elements (VL *
1123 // Factor ops).
1124 InstructionCost MemOpCost =
1125 getMemoryOpCost(Opcode, VTy->getElementType(), Alignment, 0,
1126 CostKind, {TTI::OK_AnyValue, TTI::OP_None});
1127 unsigned NumLoads = getEstimatedVLFor(VTy);
1128 return NumLoads * MemOpCost;
1129 }
1130 }
1131 }
1132
1133 // TODO: Return the cost of interleaved accesses for scalable vector when
1134 // unable to convert to segment accesses instructions.
1135 if (isa<ScalableVectorType>(VecTy))
1137
1138 auto *FVTy = cast<FixedVectorType>(VecTy);
1139 InstructionCost MemCost =
1140 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
1141 unsigned VF = FVTy->getNumElements() / Factor;
1142
1143 // An interleaved load will look like this for Factor=3:
1144 // %wide.vec = load <12 x i32>, ptr %3, align 4
1145 // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1146 // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1147 // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1148 if (Opcode == Instruction::Load) {
1149 InstructionCost Cost = MemCost;
1150 for (unsigned Index : Indices) {
1151 FixedVectorType *VecTy =
1152 FixedVectorType::get(FVTy->getElementType(), VF * Factor);
1153 auto Mask = createStrideMask(Index, Factor, VF);
1154 Mask.resize(VF * Factor, -1);
1155 InstructionCost ShuffleCost =
1157 Mask, CostKind, 0, nullptr, {});
1158 Cost += ShuffleCost;
1159 }
1160 return Cost;
1161 }
1162
1163 // TODO: Model for NF > 2
1164 // We'll need to enhance getShuffleCost to model shuffles that are just
1165 // inserts and extracts into subvectors, since they won't have the full cost
1166 // of a vrgather.
1167 // An interleaved store for 3 vectors of 4 lanes will look like
1168 // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
1169 // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
1170 // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
1171 // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
1172 // store <12 x i32> %interleaved.vec, ptr %10, align 4
1173 if (Factor != 2)
1174 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1175 Alignment, AddressSpace, CostKind,
1176 UseMaskForCond, UseMaskForGaps);
1177
1178 assert(Opcode == Instruction::Store && "Opcode must be a store");
1179 // For an interleaving store of 2 vectors, we perform one large interleaving
1180 // shuffle that goes into the wide store
1181 auto Mask = createInterleaveMask(VF, Factor);
1182 InstructionCost ShuffleCost =
1184 CostKind, 0, nullptr, {});
1185 return MemCost + ShuffleCost;
1186}
1187
1191
1192 bool IsLoad = MICA.getID() == Intrinsic::masked_gather ||
1193 MICA.getID() == Intrinsic::vp_gather;
1194 unsigned Opcode = IsLoad ? Instruction::Load : Instruction::Store;
1195 Type *DataTy = MICA.getDataType();
1196 Align Alignment = MICA.getAlignment();
1199
1200 if ((Opcode == Instruction::Load &&
1201 !isLegalMaskedGather(DataTy, Align(Alignment))) ||
1202 (Opcode == Instruction::Store &&
1203 !isLegalMaskedScatter(DataTy, Align(Alignment))))
1205
1206 // Cost is proportional to the number of memory operations implied. For
1207 // scalable vectors, we use an estimate on that number since we don't
1208 // know exactly what VL will be.
1209 auto &VTy = *cast<VectorType>(DataTy);
1210 unsigned NumLoads = getEstimatedVLFor(&VTy);
1211 return NumLoads * TTI::TCC_Basic;
1212}
1213
1215 const MemIntrinsicCostAttributes &MICA,
1217 unsigned Opcode = MICA.getID() == Intrinsic::masked_expandload
1218 ? Instruction::Load
1219 : Instruction::Store;
1220 Type *DataTy = MICA.getDataType();
1221 bool VariableMask = MICA.getVariableMask();
1222 Align Alignment = MICA.getAlignment();
1223 bool IsLegal = (Opcode == Instruction::Store &&
1224 isLegalMaskedCompressStore(DataTy, Alignment)) ||
1225 (Opcode == Instruction::Load &&
1226 isLegalMaskedExpandLoad(DataTy, Alignment));
1227 if (!IsLegal || CostKind != TTI::TCK_RecipThroughput)
1229 // Example compressstore sequence:
1230 // vsetivli zero, 8, e32, m2, ta, ma (ignored)
1231 // vcompress.vm v10, v8, v0
1232 // vcpop.m a1, v0
1233 // vsetvli zero, a1, e32, m2, ta, ma
1234 // vse32.v v10, (a0)
1235 // Example expandload sequence:
1236 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
1237 // vcpop.m a1, v0
1238 // vsetvli zero, a1, e32, m2, ta, ma
1239 // vle32.v v10, (a0)
1240 // vsetivli zero, 8, e32, m2, ta, ma
1241 // viota.m v12, v0
1242 // vrgather.vv v8, v10, v12, v0.t
1243 auto MemOpCost =
1244 getMemoryOpCost(Opcode, DataTy, Alignment, /*AddressSpace*/ 0, CostKind);
1245 auto LT = getTypeLegalizationCost(DataTy);
1246 SmallVector<unsigned, 4> Opcodes{RISCV::VSETVLI};
1247 if (VariableMask)
1248 Opcodes.push_back(RISCV::VCPOP_M);
1249 if (Opcode == Instruction::Store)
1250 Opcodes.append({RISCV::VCOMPRESS_VM});
1251 else
1252 Opcodes.append({RISCV::VSETIVLI, RISCV::VIOTA_M, RISCV::VRGATHER_VV});
1253 return MemOpCost +
1254 LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1255}
1256
1260
1261 unsigned Opcode = MICA.getID() == Intrinsic::experimental_vp_strided_load
1262 ? Instruction::Load
1263 : Instruction::Store;
1264
1265 Type *DataTy = MICA.getDataType();
1266 Align Alignment = MICA.getAlignment();
1267 const Instruction *I = MICA.getInst();
1268
1269 if (!isLegalStridedLoadStore(DataTy, Alignment))
1271
1273 return TTI::TCC_Basic;
1274
1275 // Cost is proportional to the number of memory operations implied. For
1276 // scalable vectors, we use an estimate on that number since we don't
1277 // know exactly what VL will be.
1278 // FIXME: This will overcost for i64 on rv32 with +zve64x.
1279 auto &VTy = *cast<VectorType>(DataTy);
1280 InstructionCost MemOpCost =
1281 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
1282 {TTI::OK_AnyValue, TTI::OP_None}, I);
1283 unsigned NumLoads = getEstimatedVLFor(&VTy);
1284 return NumLoads * MemOpCost;
1285}
1286
1289 // FIXME: This is a property of the default vector convention, not
1290 // all possible calling conventions. Fixing that will require
1291 // some TTI API and SLP rework.
1294 for (auto *Ty : Tys) {
1295 if (!Ty->isVectorTy())
1296 continue;
1297 Align A = DL.getPrefTypeAlign(Ty);
1298 Cost += getMemoryOpCost(Instruction::Store, Ty, A, 0, CostKind) +
1299 getMemoryOpCost(Instruction::Load, Ty, A, 0, CostKind);
1300 }
1301 return Cost;
1302}
1303
1304// Currently, these represent both throughput and codesize costs
1305// for the respective intrinsics. The costs in this table are simply
1306// instruction counts with the following adjustments made:
1307// * One vsetvli is considered free.
1309 {Intrinsic::floor, MVT::f32, 9},
1310 {Intrinsic::floor, MVT::f64, 9},
1311 {Intrinsic::ceil, MVT::f32, 9},
1312 {Intrinsic::ceil, MVT::f64, 9},
1313 {Intrinsic::trunc, MVT::f32, 7},
1314 {Intrinsic::trunc, MVT::f64, 7},
1315 {Intrinsic::round, MVT::f32, 9},
1316 {Intrinsic::round, MVT::f64, 9},
1317 {Intrinsic::roundeven, MVT::f32, 9},
1318 {Intrinsic::roundeven, MVT::f64, 9},
1319 {Intrinsic::rint, MVT::f32, 7},
1320 {Intrinsic::rint, MVT::f64, 7},
1321 {Intrinsic::nearbyint, MVT::f32, 9},
1322 {Intrinsic::nearbyint, MVT::f64, 9},
1323 {Intrinsic::bswap, MVT::i16, 3},
1324 {Intrinsic::bswap, MVT::i32, 12},
1325 {Intrinsic::bswap, MVT::i64, 31},
1326 {Intrinsic::vp_bswap, MVT::i16, 3},
1327 {Intrinsic::vp_bswap, MVT::i32, 12},
1328 {Intrinsic::vp_bswap, MVT::i64, 31},
1329 {Intrinsic::vp_fshl, MVT::i8, 7},
1330 {Intrinsic::vp_fshl, MVT::i16, 7},
1331 {Intrinsic::vp_fshl, MVT::i32, 7},
1332 {Intrinsic::vp_fshl, MVT::i64, 7},
1333 {Intrinsic::vp_fshr, MVT::i8, 7},
1334 {Intrinsic::vp_fshr, MVT::i16, 7},
1335 {Intrinsic::vp_fshr, MVT::i32, 7},
1336 {Intrinsic::vp_fshr, MVT::i64, 7},
1337 {Intrinsic::bitreverse, MVT::i8, 17},
1338 {Intrinsic::bitreverse, MVT::i16, 24},
1339 {Intrinsic::bitreverse, MVT::i32, 33},
1340 {Intrinsic::bitreverse, MVT::i64, 52},
1341 {Intrinsic::vp_bitreverse, MVT::i8, 17},
1342 {Intrinsic::vp_bitreverse, MVT::i16, 24},
1343 {Intrinsic::vp_bitreverse, MVT::i32, 33},
1344 {Intrinsic::vp_bitreverse, MVT::i64, 52},
1345 {Intrinsic::ctpop, MVT::i8, 12},
1346 {Intrinsic::ctpop, MVT::i16, 19},
1347 {Intrinsic::ctpop, MVT::i32, 20},
1348 {Intrinsic::ctpop, MVT::i64, 21},
1349 {Intrinsic::ctlz, MVT::i8, 19},
1350 {Intrinsic::ctlz, MVT::i16, 28},
1351 {Intrinsic::ctlz, MVT::i32, 31},
1352 {Intrinsic::ctlz, MVT::i64, 35},
1353 {Intrinsic::cttz, MVT::i8, 16},
1354 {Intrinsic::cttz, MVT::i16, 23},
1355 {Intrinsic::cttz, MVT::i32, 24},
1356 {Intrinsic::cttz, MVT::i64, 25},
1357 {Intrinsic::vp_ctpop, MVT::i8, 12},
1358 {Intrinsic::vp_ctpop, MVT::i16, 19},
1359 {Intrinsic::vp_ctpop, MVT::i32, 20},
1360 {Intrinsic::vp_ctpop, MVT::i64, 21},
1361 {Intrinsic::vp_ctlz, MVT::i8, 19},
1362 {Intrinsic::vp_ctlz, MVT::i16, 28},
1363 {Intrinsic::vp_ctlz, MVT::i32, 31},
1364 {Intrinsic::vp_ctlz, MVT::i64, 35},
1365 {Intrinsic::vp_cttz, MVT::i8, 16},
1366 {Intrinsic::vp_cttz, MVT::i16, 23},
1367 {Intrinsic::vp_cttz, MVT::i32, 24},
1368 {Intrinsic::vp_cttz, MVT::i64, 25},
1369};
1370
1374 auto *RetTy = ICA.getReturnType();
1375 switch (ICA.getID()) {
1376 case Intrinsic::lrint:
1377 case Intrinsic::llrint:
1378 case Intrinsic::lround:
1379 case Intrinsic::llround: {
1380 auto LT = getTypeLegalizationCost(RetTy);
1381 Type *SrcTy = ICA.getArgTypes().front();
1382 auto SrcLT = getTypeLegalizationCost(SrcTy);
1383 if (ST->hasVInstructions() && LT.second.isVector()) {
1385 unsigned SrcEltSz = DL.getTypeSizeInBits(SrcTy->getScalarType());
1386 unsigned DstEltSz = DL.getTypeSizeInBits(RetTy->getScalarType());
1387 if (LT.second.getVectorElementType() == MVT::bf16) {
1388 if (!ST->hasVInstructionsBF16Minimal())
1390 if (DstEltSz == 32)
1391 Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFCVT_X_F_V};
1392 else
1393 Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVT_X_F_V};
1394 } else if (LT.second.getVectorElementType() == MVT::f16 &&
1395 !ST->hasVInstructionsF16()) {
1396 if (!ST->hasVInstructionsF16Minimal())
1398 if (DstEltSz == 32)
1399 Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFCVT_X_F_V};
1400 else
1401 Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_X_F_V};
1402
1403 } else if (SrcEltSz > DstEltSz) {
1404 Ops = {RISCV::VFNCVT_X_F_W};
1405 } else if (SrcEltSz < DstEltSz) {
1406 Ops = {RISCV::VFWCVT_X_F_V};
1407 } else {
1408 Ops = {RISCV::VFCVT_X_F_V};
1409 }
1410
1411 // We need to use the source LMUL in the case of a narrowing op, and the
1412 // destination LMUL otherwise.
1413 if (SrcEltSz > DstEltSz)
1414 return SrcLT.first *
1415 getRISCVInstructionCost(Ops, SrcLT.second, CostKind);
1416 return LT.first * getRISCVInstructionCost(Ops, LT.second, CostKind);
1417 }
1418 break;
1419 }
1420 case Intrinsic::ceil:
1421 case Intrinsic::floor:
1422 case Intrinsic::trunc:
1423 case Intrinsic::rint:
1424 case Intrinsic::round:
1425 case Intrinsic::roundeven: {
1426 // These all use the same code.
1427 auto LT = getTypeLegalizationCost(RetTy);
1428 if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
1429 return LT.first * 8;
1430 break;
1431 }
1432 case Intrinsic::umin:
1433 case Intrinsic::umax:
1434 case Intrinsic::smin:
1435 case Intrinsic::smax: {
1436 auto LT = getTypeLegalizationCost(RetTy);
1437 if (LT.second.isScalarInteger() && ST->hasStdExtZbb())
1438 return LT.first;
1439
1440 if (ST->hasVInstructions() && LT.second.isVector()) {
1441 unsigned Op;
1442 switch (ICA.getID()) {
1443 case Intrinsic::umin:
1444 Op = RISCV::VMINU_VV;
1445 break;
1446 case Intrinsic::umax:
1447 Op = RISCV::VMAXU_VV;
1448 break;
1449 case Intrinsic::smin:
1450 Op = RISCV::VMIN_VV;
1451 break;
1452 case Intrinsic::smax:
1453 Op = RISCV::VMAX_VV;
1454 break;
1455 }
1456 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1457 }
1458 break;
1459 }
1460 case Intrinsic::sadd_sat:
1461 case Intrinsic::ssub_sat:
1462 case Intrinsic::uadd_sat:
1463 case Intrinsic::usub_sat: {
1464 auto LT = getTypeLegalizationCost(RetTy);
1465 if (ST->hasVInstructions() && LT.second.isVector()) {
1466 unsigned Op;
1467 switch (ICA.getID()) {
1468 case Intrinsic::sadd_sat:
1469 Op = RISCV::VSADD_VV;
1470 break;
1471 case Intrinsic::ssub_sat:
1472 Op = RISCV::VSSUBU_VV;
1473 break;
1474 case Intrinsic::uadd_sat:
1475 Op = RISCV::VSADDU_VV;
1476 break;
1477 case Intrinsic::usub_sat:
1478 Op = RISCV::VSSUBU_VV;
1479 break;
1480 }
1481 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1482 }
1483 break;
1484 }
1485 case Intrinsic::fma:
1486 case Intrinsic::fmuladd: {
1487 // TODO: handle promotion with f16/bf16 with zvfhmin/zvfbfmin
1488 auto LT = getTypeLegalizationCost(RetTy);
1489 if (ST->hasVInstructions() && LT.second.isVector())
1490 return LT.first *
1491 getRISCVInstructionCost(RISCV::VFMADD_VV, LT.second, CostKind);
1492 break;
1493 }
1494 case Intrinsic::fabs: {
1495 auto LT = getTypeLegalizationCost(RetTy);
1496 if (ST->hasVInstructions() && LT.second.isVector()) {
1497 // lui a0, 8
1498 // addi a0, a0, -1
1499 // vsetvli a1, zero, e16, m1, ta, ma
1500 // vand.vx v8, v8, a0
1501 // f16 with zvfhmin and bf16 with zvfhbmin
1502 if (LT.second.getVectorElementType() == MVT::bf16 ||
1503 (LT.second.getVectorElementType() == MVT::f16 &&
1504 !ST->hasVInstructionsF16()))
1505 return LT.first * getRISCVInstructionCost(RISCV::VAND_VX, LT.second,
1506 CostKind) +
1507 2;
1508 else
1509 return LT.first *
1510 getRISCVInstructionCost(RISCV::VFSGNJX_VV, LT.second, CostKind);
1511 }
1512 break;
1513 }
1514 case Intrinsic::sqrt: {
1515 auto LT = getTypeLegalizationCost(RetTy);
1516 if (ST->hasVInstructions() && LT.second.isVector()) {
1519 MVT ConvType = LT.second;
1520 MVT FsqrtType = LT.second;
1521 // f16 with zvfhmin and bf16 with zvfbfmin and the type of nxv32[b]f16
1522 // will be spilt.
1523 if (LT.second.getVectorElementType() == MVT::bf16) {
1524 if (LT.second == MVT::nxv32bf16) {
1525 ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVTBF16_F_F_V,
1526 RISCV::VFNCVTBF16_F_F_W, RISCV::VFNCVTBF16_F_F_W};
1527 FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V};
1528 ConvType = MVT::nxv16f16;
1529 FsqrtType = MVT::nxv16f32;
1530 } else {
1531 ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFNCVTBF16_F_F_W};
1532 FsqrtOp = {RISCV::VFSQRT_V};
1533 FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, FsqrtType);
1534 }
1535 } else if (LT.second.getVectorElementType() == MVT::f16 &&
1536 !ST->hasVInstructionsF16()) {
1537 if (LT.second == MVT::nxv32f16) {
1538 ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_F_F_V,
1539 RISCV::VFNCVT_F_F_W, RISCV::VFNCVT_F_F_W};
1540 FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V};
1541 ConvType = MVT::nxv16f16;
1542 FsqrtType = MVT::nxv16f32;
1543 } else {
1544 ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFNCVT_F_F_W};
1545 FsqrtOp = {RISCV::VFSQRT_V};
1546 FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, FsqrtType);
1547 }
1548 } else {
1549 FsqrtOp = {RISCV::VFSQRT_V};
1550 }
1551
1552 return LT.first * (getRISCVInstructionCost(FsqrtOp, FsqrtType, CostKind) +
1553 getRISCVInstructionCost(ConvOp, ConvType, CostKind));
1554 }
1555 break;
1556 }
1557 case Intrinsic::cttz:
1558 case Intrinsic::ctlz:
1559 case Intrinsic::ctpop: {
1560 auto LT = getTypeLegalizationCost(RetTy);
1561 if (ST->hasStdExtZvbb() && LT.second.isVector()) {
1562 unsigned Op;
1563 switch (ICA.getID()) {
1564 case Intrinsic::cttz:
1565 Op = RISCV::VCTZ_V;
1566 break;
1567 case Intrinsic::ctlz:
1568 Op = RISCV::VCLZ_V;
1569 break;
1570 case Intrinsic::ctpop:
1571 Op = RISCV::VCPOP_V;
1572 break;
1573 }
1574 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1575 }
1576 break;
1577 }
1578 case Intrinsic::abs: {
1579 auto LT = getTypeLegalizationCost(RetTy);
1580 if (ST->hasVInstructions() && LT.second.isVector()) {
1581 // vabs.v v10, v8
1582 if (ST->hasStdExtZvabd())
1583 return LT.first *
1584 getRISCVInstructionCost({RISCV::VABS_V}, LT.second, CostKind);
1585
1586 // vrsub.vi v10, v8, 0
1587 // vmax.vv v8, v8, v10
1588 return LT.first *
1589 getRISCVInstructionCost({RISCV::VRSUB_VI, RISCV::VMAX_VV},
1590 LT.second, CostKind);
1591 }
1592 break;
1593 }
1594 case Intrinsic::fshl:
1595 case Intrinsic::fshr: {
1596 if (ICA.getArgs().empty())
1597 break;
1598
1599 // Funnel-shifts are ROTL/ROTR when the first and second operand are equal.
1600 // When Zbb/Zbkb is enabled we can use a single ROL(W)/ROR(I)(W)
1601 // instruction.
1602 if ((ST->hasStdExtZbb() || ST->hasStdExtZbkb()) && RetTy->isIntegerTy() &&
1603 ICA.getArgs()[0] == ICA.getArgs()[1] &&
1604 (RetTy->getIntegerBitWidth() == 32 ||
1605 RetTy->getIntegerBitWidth() == 64) &&
1606 RetTy->getIntegerBitWidth() <= ST->getXLen()) {
1607 return 1;
1608 }
1609 break;
1610 }
1611 case Intrinsic::get_active_lane_mask: {
1612 if (ST->hasVInstructions()) {
1613 Type *ExpRetTy = VectorType::get(
1614 ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
1615 auto LT = getTypeLegalizationCost(ExpRetTy);
1616
1617 // vid.v v8 // considered hoisted
1618 // vsaddu.vx v8, v8, a0
1619 // vmsltu.vx v0, v8, a1
1620 return LT.first *
1621 getRISCVInstructionCost({RISCV::VSADDU_VX, RISCV::VMSLTU_VX},
1622 LT.second, CostKind);
1623 }
1624 break;
1625 }
1626 // TODO: add more intrinsic
1627 case Intrinsic::stepvector: {
1628 auto LT = getTypeLegalizationCost(RetTy);
1629 // Legalisation of illegal types involves an `index' instruction plus
1630 // (LT.first - 1) vector adds.
1631 if (ST->hasVInstructions())
1632 return getRISCVInstructionCost(RISCV::VID_V, LT.second, CostKind) +
1633 (LT.first - 1) *
1634 getRISCVInstructionCost(RISCV::VADD_VX, LT.second, CostKind);
1635 return 1 + (LT.first - 1);
1636 }
1637 case Intrinsic::vector_splice_left:
1638 case Intrinsic::vector_splice_right: {
1639 auto LT = getTypeLegalizationCost(RetTy);
1640 // Constant offsets fall through to getShuffleCost.
1641 if (!ICA.isTypeBasedOnly() && isa<ConstantInt>(ICA.getArgs()[2]))
1642 break;
1643 if (ST->hasVInstructions() && LT.second.isVector()) {
1644 return LT.first *
1645 getRISCVInstructionCost({RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX},
1646 LT.second, CostKind);
1647 }
1648 break;
1649 }
1650 case Intrinsic::experimental_cttz_elts: {
1651 Type *ArgTy = ICA.getArgTypes()[0];
1652 EVT ArgType = TLI->getValueType(DL, ArgTy, true);
1653 if (getTLI()->shouldExpandCttzElements(ArgType))
1654 break;
1655 InstructionCost Cost = getRISCVInstructionCost(
1656 RISCV::VFIRST_M, getTypeLegalizationCost(ArgTy).second, CostKind);
1657
1658 // If zero_is_poison is false, then we will generate additional
1659 // cmp + select instructions to convert -1 to EVL.
1660 Type *BoolTy = Type::getInt1Ty(RetTy->getContext());
1661 if (ICA.getArgs().size() > 1 &&
1662 cast<ConstantInt>(ICA.getArgs()[1])->isZero())
1663 Cost += getCmpSelInstrCost(Instruction::ICmp, BoolTy, RetTy,
1665 getCmpSelInstrCost(Instruction::Select, RetTy, BoolTy,
1667
1668 return Cost;
1669 }
1670 case Intrinsic::experimental_vp_splice: {
1671 // To support type-based query from vectorizer, set the index to 0.
1672 // Note that index only change the cost from vslide.vx to vslide.vi and in
1673 // current implementations they have same costs.
1675 cast<VectorType>(ICA.getArgTypes()[0]), {}, CostKind,
1677 }
1678 case Intrinsic::fptoui_sat:
1679 case Intrinsic::fptosi_sat: {
1681 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
1682 Type *SrcTy = ICA.getArgTypes()[0];
1683
1684 auto SrcLT = getTypeLegalizationCost(SrcTy);
1685 auto DstLT = getTypeLegalizationCost(RetTy);
1686 if (!SrcTy->isVectorTy())
1687 break;
1688
1689 if (!SrcLT.first.isValid() || !DstLT.first.isValid())
1691
1692 Cost +=
1693 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
1694 RetTy, SrcTy, TTI::CastContextHint::None, CostKind);
1695
1696 // Handle NaN.
1697 // vmfne v0, v8, v8 # If v8[i] is NaN set v0[i] to 1.
1698 // vmerge.vim v8, v8, 0, v0 # Convert NaN to 0.
1699 Type *CondTy = RetTy->getWithNewBitWidth(1);
1700 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, SrcTy, CondTy,
1702 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1704 return Cost;
1705 }
1706 case Intrinsic::experimental_vector_extract_last_active: {
1707 auto *ValTy = cast<VectorType>(ICA.getArgTypes()[0]);
1708 auto *MaskTy = cast<VectorType>(ICA.getArgTypes()[1]);
1709
1710 auto ValLT = getTypeLegalizationCost(ValTy);
1711 auto MaskLT = getTypeLegalizationCost(MaskTy);
1712
1713 // TODO: Return cheaper cost when the entire lane is inactive.
1714 // The expected asm sequence is:
1715 // vcpop.m a0, v0
1716 // beqz a0, exit # Return passthru when the entire lane is inactive.
1717 // vid v10, v0.t
1718 // vredmaxu.vs v10, v10, v10
1719 // vmv.x.s a0, v10
1720 // zext.b a0, a0
1721 // vslidedown.vx v8, v8, a0
1722 // vmv.x.s a0, v8
1723 // exit:
1724 // ...
1725
1726 // Find a suitable type for a stepvector.
1727 ConstantRange VScaleRange(APInt(64, 1), APInt::getZero(64));
1728 unsigned EltWidth = getTLI()->getBitWidthForCttzElements(
1729 MaskTy->getScalarType(), MaskTy->getElementCount(),
1730 /*ZeroIsPoison=*/true, &VScaleRange);
1731 EltWidth = std::max(EltWidth, MaskTy->getScalarSizeInBits());
1732 Type *StepTy = Type::getIntNTy(MaskTy->getContext(), EltWidth);
1733 auto *StepVecTy = VectorType::get(StepTy, ValTy->getElementCount());
1734 auto StepLT = getTypeLegalizationCost(StepVecTy);
1736 unsigned Opcodes[] = {RISCV::VID_V, RISCV::VREDMAXU_VS, RISCV::VMV_X_S};
1737
1738 Cost += MaskLT.first *
1739 getRISCVInstructionCost(RISCV::VCPOP_M, MaskLT.second, CostKind);
1740 Cost += getCFInstrCost(Instruction::Br, CostKind, nullptr);
1741 Cost += StepLT.first *
1742 getRISCVInstructionCost(Opcodes, StepLT.second, CostKind);
1743 Cost += getCastInstrCost(Instruction::ZExt,
1744 Type::getInt64Ty(ValTy->getContext()), StepTy,
1746 Cost += ValLT.first *
1747 getRISCVInstructionCost({RISCV::VSLIDEDOWN_VI, RISCV::VMV_X_S},
1748 ValLT.second, CostKind);
1749 return Cost;
1750 }
1751 }
1752
1753 if (ST->hasVInstructions() && RetTy->isVectorTy()) {
1754 if (auto LT = getTypeLegalizationCost(RetTy);
1755 LT.second.isVector()) {
1756 MVT EltTy = LT.second.getVectorElementType();
1757 if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
1758 ICA.getID(), EltTy))
1759 return LT.first * Entry->Cost;
1760 }
1761 }
1762
1764}
1765
1768 const SCEV *Ptr,
1770 // Address computations for vector indexed load/store likely require an offset
1771 // and/or scaling.
1772 if (ST->hasVInstructions() && PtrTy->isVectorTy())
1773 return getArithmeticInstrCost(Instruction::Add, PtrTy, CostKind);
1774
1775 return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind);
1776}
1777
1779 Type *Src,
1782 const Instruction *I) const {
1783 bool IsVectorType = isa<VectorType>(Dst) && isa<VectorType>(Src);
1784 if (!IsVectorType)
1785 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1786
1787 // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
1788 // For now, skip all fixed vector cost analysis when P extension is available
1789 // to avoid crashes in getMinRVVVectorSizeInBits()
1790 if (ST->hasStdExtP() &&
1792 return 1; // Treat as single instruction cost for now
1793 }
1794
1795 // FIXME: Need to compute legalizing cost for illegal types. The current
1796 // code handles only legal types and those which can be trivially
1797 // promoted to legal.
1798 if (!ST->hasVInstructions() || Src->getScalarSizeInBits() > ST->getELen() ||
1799 Dst->getScalarSizeInBits() > ST->getELen())
1800 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1801
1802 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1803 assert(ISD && "Invalid opcode");
1804 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
1805 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);
1806
1807 // Handle i1 source and dest cases *before* calling logic in BasicTTI.
1808 // The shared implementation doesn't model vector widening during legalization
1809 // and instead assumes scalarization. In order to scalarize an <N x i1>
1810 // vector, we need to extend/trunc to/from i8. If we don't special case
1811 // this, we can get an infinite recursion cycle.
1812 switch (ISD) {
1813 default:
1814 break;
1815 case ISD::SIGN_EXTEND:
1816 case ISD::ZERO_EXTEND:
1817 if (Src->getScalarSizeInBits() == 1) {
1818 // We do not use vsext/vzext to extend from mask vector.
1819 // Instead we use the following instructions to extend from mask vector:
1820 // vmv.v.i v8, 0
1821 // vmerge.vim v8, v8, -1, v0 (repeated per split)
1822 return getRISCVInstructionCost(RISCV::VMV_V_I, DstLT.second, CostKind) +
1823 DstLT.first * getRISCVInstructionCost(RISCV::VMERGE_VIM,
1824 DstLT.second, CostKind) +
1825 DstLT.first - 1;
1826 }
1827 break;
1828 case ISD::TRUNCATE:
1829 if (Dst->getScalarSizeInBits() == 1) {
1830 // We do not use several vncvt to truncate to mask vector. So we could
1831 // not use PowDiff to calculate it.
1832 // Instead we use the following instructions to truncate to mask vector:
1833 // vand.vi v8, v8, 1
1834 // vmsne.vi v0, v8, 0
1835 return SrcLT.first *
1836 getRISCVInstructionCost({RISCV::VAND_VI, RISCV::VMSNE_VI},
1837 SrcLT.second, CostKind) +
1838 SrcLT.first - 1;
1839 }
1840 break;
1841 };
1842
1843 // Our actual lowering for the case where a wider legal type is available
1844 // uses promotion to the wider type. This is reflected in the result of
1845 // getTypeLegalizationCost, but BasicTTI assumes the widened cases are
1846 // scalarized if the legalized Src and Dst are not equal sized.
1847 const DataLayout &DL = this->getDataLayout();
1848 if (!SrcLT.second.isVector() || !DstLT.second.isVector() ||
1849 !SrcLT.first.isValid() || !DstLT.first.isValid() ||
1850 !TypeSize::isKnownLE(DL.getTypeSizeInBits(Src),
1851 SrcLT.second.getSizeInBits()) ||
1852 !TypeSize::isKnownLE(DL.getTypeSizeInBits(Dst),
1853 DstLT.second.getSizeInBits()) ||
1854 SrcLT.first > 1 || DstLT.first > 1)
1855 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1856
1857 // The split cost is handled by the base getCastInstrCost
1858 assert((SrcLT.first == 1) && (DstLT.first == 1) && "Illegal type");
1859
1860 int PowDiff = (int)Log2_32(DstLT.second.getScalarSizeInBits()) -
1861 (int)Log2_32(SrcLT.second.getScalarSizeInBits());
1862 switch (ISD) {
1863 case ISD::SIGN_EXTEND:
1864 case ISD::ZERO_EXTEND: {
1865 if ((PowDiff < 1) || (PowDiff > 3))
1866 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1867 unsigned SExtOp[] = {RISCV::VSEXT_VF2, RISCV::VSEXT_VF4, RISCV::VSEXT_VF8};
1868 unsigned ZExtOp[] = {RISCV::VZEXT_VF2, RISCV::VZEXT_VF4, RISCV::VZEXT_VF8};
1869 unsigned Op =
1870 (ISD == ISD::SIGN_EXTEND) ? SExtOp[PowDiff - 1] : ZExtOp[PowDiff - 1];
1871 return getRISCVInstructionCost(Op, DstLT.second, CostKind);
1872 }
1873 case ISD::TRUNCATE:
1874 case ISD::FP_EXTEND:
1875 case ISD::FP_ROUND: {
1876 // Counts of narrow/widen instructions.
1877 unsigned SrcEltSize = SrcLT.second.getScalarSizeInBits();
1878 unsigned DstEltSize = DstLT.second.getScalarSizeInBits();
1879
1880 unsigned Op = (ISD == ISD::TRUNCATE) ? RISCV::VNSRL_WI
1881 : (ISD == ISD::FP_EXTEND) ? RISCV::VFWCVT_F_F_V
1882 : RISCV::VFNCVT_F_F_W;
1884 for (; SrcEltSize != DstEltSize;) {
1885 MVT ElementMVT = (ISD == ISD::TRUNCATE)
1886 ? MVT::getIntegerVT(DstEltSize)
1887 : MVT::getFloatingPointVT(DstEltSize);
1888 MVT DstMVT = DstLT.second.changeVectorElementType(ElementMVT);
1889 DstEltSize =
1890 (DstEltSize > SrcEltSize) ? DstEltSize >> 1 : DstEltSize << 1;
1891 Cost += getRISCVInstructionCost(Op, DstMVT, CostKind);
1892 }
1893 return Cost;
1894 }
1895 case ISD::FP_TO_SINT:
1896 case ISD::FP_TO_UINT: {
1897 unsigned IsSigned = ISD == ISD::FP_TO_SINT;
1898 unsigned FCVT = IsSigned ? RISCV::VFCVT_RTZ_X_F_V : RISCV::VFCVT_RTZ_XU_F_V;
1899 unsigned FWCVT =
1900 IsSigned ? RISCV::VFWCVT_RTZ_X_F_V : RISCV::VFWCVT_RTZ_XU_F_V;
1901 unsigned FNCVT =
1902 IsSigned ? RISCV::VFNCVT_RTZ_X_F_W : RISCV::VFNCVT_RTZ_XU_F_W;
1903 unsigned SrcEltSize = Src->getScalarSizeInBits();
1904 unsigned DstEltSize = Dst->getScalarSizeInBits();
1906 if ((SrcEltSize == 16) &&
1907 (!ST->hasVInstructionsF16() || ((DstEltSize / 2) > SrcEltSize))) {
1908 // If the target only supports zvfhmin or it is fp16-to-i64 conversion
1909 // pre-widening to f32 and then convert f32 to integer
1910 VectorType *VecF32Ty =
1911 VectorType::get(Type::getFloatTy(Dst->getContext()),
1912 cast<VectorType>(Dst)->getElementCount());
1913 std::pair<InstructionCost, MVT> VecF32LT =
1914 getTypeLegalizationCost(VecF32Ty);
1915 Cost +=
1916 VecF32LT.first * getRISCVInstructionCost(RISCV::VFWCVT_F_F_V,
1917 VecF32LT.second, CostKind);
1918 Cost += getCastInstrCost(Opcode, Dst, VecF32Ty, CCH, CostKind, I);
1919 return Cost;
1920 }
1921 if (DstEltSize == SrcEltSize)
1922 Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
1923 else if (DstEltSize > SrcEltSize)
1924 Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
1925 else { // (SrcEltSize > DstEltSize)
1926 // First do a narrowing conversion to an integer half the size, then
1927 // truncate if needed.
1928 MVT ElementVT = MVT::getIntegerVT(SrcEltSize / 2);
1929 MVT VecVT = DstLT.second.changeVectorElementType(ElementVT);
1930 Cost += getRISCVInstructionCost(FNCVT, VecVT, CostKind);
1931 if ((SrcEltSize / 2) > DstEltSize) {
1932 Type *VecTy = EVT(VecVT).getTypeForEVT(Dst->getContext());
1933 Cost +=
1934 getCastInstrCost(Instruction::Trunc, Dst, VecTy, CCH, CostKind, I);
1935 }
1936 }
1937 return Cost;
1938 }
1939 case ISD::SINT_TO_FP:
1940 case ISD::UINT_TO_FP: {
1941 unsigned IsSigned = ISD == ISD::SINT_TO_FP;
1942 unsigned FCVT = IsSigned ? RISCV::VFCVT_F_X_V : RISCV::VFCVT_F_XU_V;
1943 unsigned FWCVT = IsSigned ? RISCV::VFWCVT_F_X_V : RISCV::VFWCVT_F_XU_V;
1944 unsigned FNCVT = IsSigned ? RISCV::VFNCVT_F_X_W : RISCV::VFNCVT_F_XU_W;
1945 unsigned SrcEltSize = Src->getScalarSizeInBits();
1946 unsigned DstEltSize = Dst->getScalarSizeInBits();
1947
1949 if ((DstEltSize == 16) &&
1950 (!ST->hasVInstructionsF16() || ((SrcEltSize / 2) > DstEltSize))) {
1951 // If the target only supports zvfhmin or it is i64-to-fp16 conversion
1952 // it is converted to f32 and then converted to f16
1953 VectorType *VecF32Ty =
1954 VectorType::get(Type::getFloatTy(Dst->getContext()),
1955 cast<VectorType>(Dst)->getElementCount());
1956 std::pair<InstructionCost, MVT> VecF32LT =
1957 getTypeLegalizationCost(VecF32Ty);
1958 Cost += getCastInstrCost(Opcode, VecF32Ty, Src, CCH, CostKind, I);
1959 Cost += VecF32LT.first * getRISCVInstructionCost(RISCV::VFNCVT_F_F_W,
1960 DstLT.second, CostKind);
1961 return Cost;
1962 }
1963
1964 if (DstEltSize == SrcEltSize)
1965 Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
1966 else if (DstEltSize > SrcEltSize) {
1967 if ((DstEltSize / 2) > SrcEltSize) {
1968 VectorType *VecTy =
1969 VectorType::get(IntegerType::get(Dst->getContext(), DstEltSize / 2),
1970 cast<VectorType>(Dst)->getElementCount());
1971 unsigned Op = IsSigned ? Instruction::SExt : Instruction::ZExt;
1972 Cost += getCastInstrCost(Op, VecTy, Src, CCH, CostKind, I);
1973 }
1974 Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
1975 } else
1976 Cost += getRISCVInstructionCost(FNCVT, DstLT.second, CostKind);
1977 return Cost;
1978 }
1979 }
1980 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1981}
1982
1983unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) const {
1984 if (isa<ScalableVectorType>(Ty)) {
1985 const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
1986 const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
1987 const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
1988 return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
1989 }
1990 return cast<FixedVectorType>(Ty)->getNumElements();
1991}
1992
1995 FastMathFlags FMF,
1997 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1998 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1999
2000 // Skip if scalar size of Ty is bigger than ELEN.
2001 if (Ty->getScalarSizeInBits() > ST->getELen())
2002 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
2003
2004 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2005 if (Ty->getElementType()->isIntegerTy(1)) {
2006 // SelectionDAGBuilder does following transforms:
2007 // vector_reduce_{smin,umax}(<n x i1>) --> vector_reduce_or(<n x i1>)
2008 // vector_reduce_{smax,umin}(<n x i1>) --> vector_reduce_and(<n x i1>)
2009 if (IID == Intrinsic::umax || IID == Intrinsic::smin)
2010 return getArithmeticReductionCost(Instruction::Or, Ty, FMF, CostKind);
2011 else
2012 return getArithmeticReductionCost(Instruction::And, Ty, FMF, CostKind);
2013 }
2014
2015 if (IID == Intrinsic::maximum || IID == Intrinsic::minimum) {
2017 InstructionCost ExtraCost = 0;
2018 switch (IID) {
2019 case Intrinsic::maximum:
2020 if (FMF.noNaNs()) {
2021 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
2022 } else {
2023 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMAX_VS,
2024 RISCV::VFMV_F_S};
2025 // Cost of Canonical Nan + branch
2026 // lui a0, 523264
2027 // fmv.w.x fa0, a0
2028 Type *DstTy = Ty->getScalarType();
2029 const unsigned EltTyBits = DstTy->getScalarSizeInBits();
2030 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
2031 ExtraCost = 1 +
2032 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
2034 getCFInstrCost(Instruction::Br, CostKind);
2035 }
2036 break;
2037
2038 case Intrinsic::minimum:
2039 if (FMF.noNaNs()) {
2040 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
2041 } else {
2042 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMIN_VS,
2043 RISCV::VFMV_F_S};
2044 // Cost of Canonical Nan + branch
2045 // lui a0, 523264
2046 // fmv.w.x fa0, a0
2047 Type *DstTy = Ty->getScalarType();
2048 const unsigned EltTyBits = DL.getTypeSizeInBits(DstTy);
2049 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
2050 ExtraCost = 1 +
2051 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
2053 getCFInstrCost(Instruction::Br, CostKind);
2054 }
2055 break;
2056 }
2057 return ExtraCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2058 }
2059
2060 // IR Reduction is composed by one rvv reduction instruction and vmv
2061 unsigned SplitOp;
2063 switch (IID) {
2064 default:
2065 llvm_unreachable("Unsupported intrinsic");
2066 case Intrinsic::smax:
2067 SplitOp = RISCV::VMAX_VV;
2068 Opcodes = {RISCV::VREDMAX_VS, RISCV::VMV_X_S};
2069 break;
2070 case Intrinsic::smin:
2071 SplitOp = RISCV::VMIN_VV;
2072 Opcodes = {RISCV::VREDMIN_VS, RISCV::VMV_X_S};
2073 break;
2074 case Intrinsic::umax:
2075 SplitOp = RISCV::VMAXU_VV;
2076 Opcodes = {RISCV::VREDMAXU_VS, RISCV::VMV_X_S};
2077 break;
2078 case Intrinsic::umin:
2079 SplitOp = RISCV::VMINU_VV;
2080 Opcodes = {RISCV::VREDMINU_VS, RISCV::VMV_X_S};
2081 break;
2082 case Intrinsic::maxnum:
2083 SplitOp = RISCV::VFMAX_VV;
2084 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
2085 break;
2086 case Intrinsic::minnum:
2087 SplitOp = RISCV::VFMIN_VV;
2088 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
2089 break;
2090 }
2091 // Add a cost for data larger than LMUL8
2092 InstructionCost SplitCost =
2093 (LT.first > 1) ? (LT.first - 1) *
2094 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
2095 : 0;
2096 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2097}
2098
2101 std::optional<FastMathFlags> FMF,
2103 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
2104 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2105
2106 // Skip if scalar size of Ty is bigger than ELEN.
2107 if (Ty->getScalarSizeInBits() > ST->getELen())
2108 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2109
2110 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2111 assert(ISD && "Invalid opcode");
2112
2113 if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
2114 ISD != ISD::FADD)
2115 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2116
2117 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2118 Type *ElementTy = Ty->getElementType();
2119 if (ElementTy->isIntegerTy(1)) {
2120 // Example sequences:
2121 // vfirst.m a0, v0
2122 // seqz a0, a0
2123 if (LT.second == MVT::v1i1)
2124 return getRISCVInstructionCost(RISCV::VFIRST_M, LT.second, CostKind) +
2125 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
2127
2128 if (ISD == ISD::AND) {
2129 // Example sequences:
2130 // vmand.mm v8, v9, v8 ; needed every time type is split
2131 // vmnot.m v8, v0 ; alias for vmnand
2132 // vcpop.m a0, v8
2133 // seqz a0, a0
2134
2135 // See the discussion: https://github.com/llvm/llvm-project/pull/119160
2136 // For LMUL <= 8, there is no splitting,
2137 // the sequences are vmnot, vcpop and seqz.
2138 // When LMUL > 8 and split = 1,
2139 // the sequences are vmnand, vcpop and seqz.
2140 // When LMUL > 8 and split > 1,
2141 // the sequences are (LT.first-2) * vmand, vmnand, vcpop and seqz.
2142 return ((LT.first > 2) ? (LT.first - 2) : 0) *
2143 getRISCVInstructionCost(RISCV::VMAND_MM, LT.second, CostKind) +
2144 getRISCVInstructionCost(RISCV::VMNAND_MM, LT.second, CostKind) +
2145 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
2146 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
2148 } else if (ISD == ISD::XOR || ISD == ISD::ADD) {
2149 // Example sequences:
2150 // vsetvli a0, zero, e8, mf8, ta, ma
2151 // vmxor.mm v8, v0, v8 ; needed every time type is split
2152 // vcpop.m a0, v8
2153 // andi a0, a0, 1
2154 return (LT.first - 1) *
2155 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind) +
2156 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) + 1;
2157 } else {
2158 assert(ISD == ISD::OR);
2159 // Example sequences:
2160 // vsetvli a0, zero, e8, mf8, ta, ma
2161 // vmor.mm v8, v9, v8 ; needed every time type is split
2162 // vcpop.m a0, v0
2163 // snez a0, a0
2164 return (LT.first - 1) *
2165 getRISCVInstructionCost(RISCV::VMOR_MM, LT.second, CostKind) +
2166 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
2167 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
2169 }
2170 }
2171
2172 // IR Reduction of or/and is composed by one vmv and one rvv reduction
2173 // instruction, and others is composed by two vmv and one rvv reduction
2174 // instruction
2175 unsigned SplitOp;
2177 switch (ISD) {
2178 case ISD::ADD:
2179 SplitOp = RISCV::VADD_VV;
2180 Opcodes = {RISCV::VMV_S_X, RISCV::VREDSUM_VS, RISCV::VMV_X_S};
2181 break;
2182 case ISD::OR:
2183 SplitOp = RISCV::VOR_VV;
2184 Opcodes = {RISCV::VREDOR_VS, RISCV::VMV_X_S};
2185 break;
2186 case ISD::XOR:
2187 SplitOp = RISCV::VXOR_VV;
2188 Opcodes = {RISCV::VMV_S_X, RISCV::VREDXOR_VS, RISCV::VMV_X_S};
2189 break;
2190 case ISD::AND:
2191 SplitOp = RISCV::VAND_VV;
2192 Opcodes = {RISCV::VREDAND_VS, RISCV::VMV_X_S};
2193 break;
2194 case ISD::FADD:
2195 // We can't promote f16/bf16 fadd reductions.
2196 if ((LT.second.getScalarType() == MVT::f16 && !ST->hasVInstructionsF16()) ||
2197 LT.second.getScalarType() == MVT::bf16)
2198 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2200 Opcodes.push_back(RISCV::VFMV_S_F);
2201 for (unsigned i = 0; i < LT.first.getValue(); i++)
2202 Opcodes.push_back(RISCV::VFREDOSUM_VS);
2203 Opcodes.push_back(RISCV::VFMV_F_S);
2204 return getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2205 }
2206 SplitOp = RISCV::VFADD_VV;
2207 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDUSUM_VS, RISCV::VFMV_F_S};
2208 break;
2209 }
2210 // Add a cost for data larger than LMUL8
2211 InstructionCost SplitCost =
2212 (LT.first > 1) ? (LT.first - 1) *
2213 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
2214 : 0;
2215 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2216}
2217
2219 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
2220 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
2221 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
2222 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2223 FMF, CostKind);
2224
2225 // Skip if scalar size of ResTy is bigger than ELEN.
2226 if (ResTy->getScalarSizeInBits() > ST->getELen())
2227 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2228 FMF, CostKind);
2229
2230 if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
2231 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2232 FMF, CostKind);
2233
2234 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2235
2236 if (IsUnsigned && Opcode == Instruction::Add &&
2237 LT.second.isFixedLengthVector() && LT.second.getScalarType() == MVT::i1) {
2238 // Represent vector_reduce_add(ZExt(<n x i1>)) as
2239 // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
2240 return LT.first *
2241 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind);
2242 }
2243
2244 if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
2245 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2246 FMF, CostKind);
2247
2248 return (LT.first - 1) +
2249 getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
2250}
2251
2255 assert(OpInfo.isConstant() && "non constant operand?");
2256 if (!isa<VectorType>(Ty))
2257 // FIXME: We need to account for immediate materialization here, but doing
2258 // a decent job requires more knowledge about the immediate than we
2259 // currently have here.
2260 return 0;
2261
2262 if (OpInfo.isUniform())
2263 // vmv.v.i, vmv.v.x, or vfmv.v.f
2264 // We ignore the cost of the scalar constant materialization to be consistent
2265 // with how we treat scalar constants themselves just above.
2266 return 1;
2267
2268 return getConstantPoolLoadCost(Ty, CostKind);
2269}
2270
2272 Align Alignment,
2273 unsigned AddressSpace,
2275 TTI::OperandValueInfo OpInfo,
2276 const Instruction *I) const {
2277 EVT VT = TLI->getValueType(DL, Src, true);
2278 // Type legalization can't handle structs
2279 if (VT == MVT::Other)
2280 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2281 CostKind, OpInfo, I);
2282
2284 if (Opcode == Instruction::Store && OpInfo.isConstant())
2285 Cost += getStoreImmCost(Src, OpInfo, CostKind);
2286
2287 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
2288
2289 InstructionCost BaseCost = [&]() {
2290 InstructionCost Cost = LT.first;
2292 return Cost;
2293
2294 // Our actual lowering for the case where a wider legal type is available
2295 // uses the a VL predicated load on the wider type. This is reflected in
2296 // the result of getTypeLegalizationCost, but BasicTTI assumes the
2297 // widened cases are scalarized.
2298 const DataLayout &DL = this->getDataLayout();
2299 if (Src->isVectorTy() && LT.second.isVector() &&
2300 TypeSize::isKnownLT(DL.getTypeStoreSizeInBits(Src),
2301 LT.second.getSizeInBits()))
2302 return Cost;
2303
2304 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2305 CostKind, OpInfo, I);
2306 }();
2307
2308 // Assume memory ops cost scale with the number of vector registers
2309 // possible accessed by the instruction. Note that BasicTTI already
2310 // handles the LT.first term for us.
2311 if (ST->hasVInstructions() && LT.second.isVector() &&
2313 BaseCost *= TLI->getLMULCost(LT.second);
2314 return Cost + BaseCost;
2315}
2316
2318 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
2320 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
2322 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2323 Op1Info, Op2Info, I);
2324
2325 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
2326 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2327 Op1Info, Op2Info, I);
2328
2329 // Skip if scalar size of ValTy is bigger than ELEN.
2330 if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen())
2331 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2332 Op1Info, Op2Info, I);
2333
2334 auto GetConstantMatCost =
2335 [&](TTI::OperandValueInfo OpInfo) -> InstructionCost {
2336 if (OpInfo.isUniform())
2337 // We return 0 we currently ignore the cost of materializing scalar
2338 // constants in GPRs.
2339 return 0;
2340
2341 return getConstantPoolLoadCost(ValTy, CostKind);
2342 };
2343
2344 InstructionCost ConstantMatCost;
2345 if (Op1Info.isConstant())
2346 ConstantMatCost += GetConstantMatCost(Op1Info);
2347 if (Op2Info.isConstant())
2348 ConstantMatCost += GetConstantMatCost(Op2Info);
2349
2350 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2351 if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
2352 if (CondTy->isVectorTy()) {
2353 if (ValTy->getScalarSizeInBits() == 1) {
2354 // vmandn.mm v8, v8, v9
2355 // vmand.mm v9, v0, v9
2356 // vmor.mm v0, v9, v8
2357 return ConstantMatCost +
2358 LT.first *
2359 getRISCVInstructionCost(
2360 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
2361 LT.second, CostKind);
2362 }
2363 // vselect and max/min are supported natively.
2364 return ConstantMatCost +
2365 LT.first * getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second,
2366 CostKind);
2367 }
2368
2369 if (ValTy->getScalarSizeInBits() == 1) {
2370 // vmv.v.x v9, a0
2371 // vmsne.vi v9, v9, 0
2372 // vmandn.mm v8, v8, v9
2373 // vmand.mm v9, v0, v9
2374 // vmor.mm v0, v9, v8
2375 MVT InterimVT = LT.second.changeVectorElementType(MVT::i8);
2376 return ConstantMatCost +
2377 LT.first *
2378 getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
2379 InterimVT, CostKind) +
2380 LT.first * getRISCVInstructionCost(
2381 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
2382 LT.second, CostKind);
2383 }
2384
2385 // vmv.v.x v10, a0
2386 // vmsne.vi v0, v10, 0
2387 // vmerge.vvm v8, v9, v8, v0
2388 return ConstantMatCost +
2389 LT.first * getRISCVInstructionCost(
2390 {RISCV::VMV_V_X, RISCV::VMSNE_VI, RISCV::VMERGE_VVM},
2391 LT.second, CostKind);
2392 }
2393
2394 if ((Opcode == Instruction::ICmp) && ValTy->isVectorTy() &&
2395 CmpInst::isIntPredicate(VecPred)) {
2396 // Use VMSLT_VV to represent VMSEQ, VMSNE, VMSLTU, VMSLEU, VMSLT, VMSLE
2397 // provided they incur the same cost across all implementations
2398 return ConstantMatCost + LT.first * getRISCVInstructionCost(RISCV::VMSLT_VV,
2399 LT.second,
2400 CostKind);
2401 }
2402
2403 if ((Opcode == Instruction::FCmp) && ValTy->isVectorTy() &&
2404 CmpInst::isFPPredicate(VecPred)) {
2405
2406 // Use VMXOR_MM and VMXNOR_MM to generate all true/false mask
2407 if ((VecPred == CmpInst::FCMP_FALSE) || (VecPred == CmpInst::FCMP_TRUE))
2408 return ConstantMatCost +
2409 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind);
2410
2411 // If we do not support the input floating point vector type, use the base
2412 // one which will calculate as:
2413 // ScalarizeCost + Num * Cost for fixed vector,
2414 // InvalidCost for scalable vector.
2415 if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
2416 (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
2417 (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
2418 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2419 Op1Info, Op2Info, I);
2420
2421 // Assuming vector fp compare and mask instructions are all the same cost
2422 // until a need arises to differentiate them.
2423 switch (VecPred) {
2424 case CmpInst::FCMP_ONE: // vmflt.vv + vmflt.vv + vmor.mm
2425 case CmpInst::FCMP_ORD: // vmfeq.vv + vmfeq.vv + vmand.mm
2426 case CmpInst::FCMP_UNO: // vmfne.vv + vmfne.vv + vmor.mm
2427 case CmpInst::FCMP_UEQ: // vmflt.vv + vmflt.vv + vmnor.mm
2428 return ConstantMatCost +
2429 LT.first * getRISCVInstructionCost(
2430 {RISCV::VMFLT_VV, RISCV::VMFLT_VV, RISCV::VMOR_MM},
2431 LT.second, CostKind);
2432
2433 case CmpInst::FCMP_UGT: // vmfle.vv + vmnot.m
2434 case CmpInst::FCMP_UGE: // vmflt.vv + vmnot.m
2435 case CmpInst::FCMP_ULT: // vmfle.vv + vmnot.m
2436 case CmpInst::FCMP_ULE: // vmflt.vv + vmnot.m
2437 return ConstantMatCost +
2438 LT.first *
2439 getRISCVInstructionCost({RISCV::VMFLT_VV, RISCV::VMNAND_MM},
2440 LT.second, CostKind);
2441
2442 case CmpInst::FCMP_OEQ: // vmfeq.vv
2443 case CmpInst::FCMP_OGT: // vmflt.vv
2444 case CmpInst::FCMP_OGE: // vmfle.vv
2445 case CmpInst::FCMP_OLT: // vmflt.vv
2446 case CmpInst::FCMP_OLE: // vmfle.vv
2447 case CmpInst::FCMP_UNE: // vmfne.vv
2448 return ConstantMatCost +
2449 LT.first *
2450 getRISCVInstructionCost(RISCV::VMFLT_VV, LT.second, CostKind);
2451 default:
2452 break;
2453 }
2454 }
2455
2456 // With ShortForwardBranchOpt or ConditionalMoveFusion, scalar icmp + select
2457 // instructions will lower to SELECT_CC and lower to PseudoCCMOVGPR which will
2458 // generate a conditional branch + mv. The cost of scalar (icmp + select) will
2459 // be (0 + select instr cost).
2460 if (ST->hasConditionalMoveFusion() && I && isa<ICmpInst>(I) &&
2461 ValTy->isIntegerTy() && !I->user_empty()) {
2462 if (all_of(I->users(), [&](const User *U) {
2463 return match(U, m_Select(m_Specific(I), m_Value(), m_Value())) &&
2464 U->getType()->isIntegerTy() &&
2465 !isa<ConstantData>(U->getOperand(1)) &&
2466 !isa<ConstantData>(U->getOperand(2));
2467 }))
2468 return 0;
2469 }
2470
2471 // TODO: Add cost for scalar type.
2472
2473 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2474 Op1Info, Op2Info, I);
2475}
2476
2479 const Instruction *I) const {
2481 return Opcode == Instruction::PHI ? 0 : 1;
2482 // Branches are assumed to be predicted.
2483 return 0;
2484}
2485
2487 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
2488 const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
2489 assert(Val->isVectorTy() && "This must be a vector type");
2490
2491 // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
2492 // For now, skip all fixed vector cost analysis when P extension is available
2493 // to avoid crashes in getMinRVVVectorSizeInBits()
2494 if (ST->hasStdExtP() && isa<FixedVectorType>(Val)) {
2495 return 1; // Treat as single instruction cost for now
2496 }
2497
2498 if (Opcode != Instruction::ExtractElement &&
2499 Opcode != Instruction::InsertElement)
2500 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1,
2501 VIC);
2502
2503 // Legalize the type.
2504 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
2505
2506 // This type is legalized to a scalar type.
2507 if (!LT.second.isVector()) {
2508 auto *FixedVecTy = cast<FixedVectorType>(Val);
2509 // If Index is a known constant, cost is zero.
2510 if (Index != -1U)
2511 return 0;
2512 // Extract/InsertElement with non-constant index is very costly when
2513 // scalarized; estimate cost of loads/stores sequence via the stack:
2514 // ExtractElement cost: store vector to stack, load scalar;
2515 // InsertElement cost: store vector to stack, store scalar, load vector.
2516 Type *ElemTy = FixedVecTy->getElementType();
2517 auto NumElems = FixedVecTy->getNumElements();
2518 auto Align = DL.getPrefTypeAlign(ElemTy);
2519 InstructionCost LoadCost =
2520 getMemoryOpCost(Instruction::Load, ElemTy, Align, 0, CostKind);
2521 InstructionCost StoreCost =
2522 getMemoryOpCost(Instruction::Store, ElemTy, Align, 0, CostKind);
2523 return Opcode == Instruction::ExtractElement
2524 ? StoreCost * NumElems + LoadCost
2525 : (StoreCost + LoadCost) * NumElems + StoreCost;
2526 }
2527
2528 // For unsupported scalable vector.
2529 if (LT.second.isScalableVector() && !LT.first.isValid())
2530 return LT.first;
2531
2532 // Mask vector extract/insert is expanded via e8.
2533 if (Val->getScalarSizeInBits() == 1) {
2534 VectorType *WideTy =
2536 cast<VectorType>(Val)->getElementCount());
2537 if (Opcode == Instruction::ExtractElement) {
2538 InstructionCost ExtendCost
2539 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
2541 InstructionCost ExtractCost
2542 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
2543 return ExtendCost + ExtractCost;
2544 }
2545 InstructionCost ExtendCost
2546 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
2548 InstructionCost InsertCost
2549 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
2550 InstructionCost TruncCost
2551 = getCastInstrCost(Instruction::Trunc, Val, WideTy,
2553 return ExtendCost + InsertCost + TruncCost;
2554 }
2555
2556
2557 // In RVV, we could use vslidedown + vmv.x.s to extract element from vector
2558 // and vslideup + vmv.s.x to insert element to vector.
2559 unsigned BaseCost = 1;
2560 // When insertelement we should add the index with 1 as the input of vslideup.
2561 unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
2562
2563 if (Index != -1U) {
2564 // The type may be split. For fixed-width vectors we can normalize the
2565 // index to the new type.
2566 if (LT.second.isFixedLengthVector()) {
2567 unsigned Width = LT.second.getVectorNumElements();
2568 Index = Index % Width;
2569 }
2570
2571 // If exact VLEN is known, we will insert/extract into the appropriate
2572 // subvector with no additional subvector insert/extract cost.
2573 if (auto VLEN = ST->getRealVLen()) {
2574 unsigned EltSize = LT.second.getScalarSizeInBits();
2575 unsigned M1Max = *VLEN / EltSize;
2576 Index = Index % M1Max;
2577 }
2578
2579 if (Index == 0)
2580 // We can extract/insert the first element without vslidedown/vslideup.
2581 SlideCost = 0;
2582 else if (ST->hasVendorXRivosVisni() && isUInt<5>(Index) &&
2583 Val->getScalarType()->isIntegerTy())
2584 SlideCost = 0; // With ri.vinsert/ri.vextract there is no slide needed
2585 else if (Opcode == Instruction::InsertElement)
2586 SlideCost = 1; // With a constant index, we do not need to use addi.
2587 }
2588
2589 // When the vector needs to split into multiple register groups and the index
2590 // exceeds single vector register group, we need to insert/extract the element
2591 // via stack.
2592 if (LT.first > 1 &&
2593 ((Index == -1U) || (Index >= LT.second.getVectorMinNumElements() &&
2594 LT.second.isScalableVector()))) {
2595 Type *ScalarType = Val->getScalarType();
2596 Align VecAlign = DL.getPrefTypeAlign(Val);
2597 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
2598 // Extra addi for unknown index.
2599 InstructionCost IdxCost = Index == -1U ? 1 : 0;
2600
2601 // Store all split vectors into stack and load the target element.
2602 if (Opcode == Instruction::ExtractElement)
2603 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
2604 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
2605 CostKind) +
2606 IdxCost;
2607
2608 // Store all split vectors into stack and store the target element and load
2609 // vectors back.
2610 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
2611 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind) +
2612 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
2613 CostKind) +
2614 IdxCost;
2615 }
2616
2617 // Extract i64 in the target that has XLEN=32 need more instruction.
2618 if (Val->getScalarType()->isIntegerTy() &&
2619 ST->getXLen() < Val->getScalarSizeInBits()) {
2620 // For extractelement, we need the following instructions:
2621 // vsetivli zero, 1, e64, m1, ta, mu (not count)
2622 // vslidedown.vx v8, v8, a0
2623 // vmv.x.s a0, v8
2624 // li a1, 32
2625 // vsrl.vx v8, v8, a1
2626 // vmv.x.s a1, v8
2627
2628 // For insertelement, we need the following instructions:
2629 // vsetivli zero, 2, e32, m4, ta, mu (not count)
2630 // vmv.v.i v12, 0
2631 // vslide1up.vx v16, v12, a1
2632 // vslide1up.vx v12, v16, a0
2633 // addi a0, a2, 1
2634 // vsetvli zero, a0, e64, m4, tu, mu (not count)
2635 // vslideup.vx v8, v12, a2
2636
2637 // TODO: should we count these special vsetvlis?
2638 BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
2639 }
2640 return BaseCost + SlideCost;
2641}
2642
2646 unsigned Index) const {
2647 if (isa<FixedVectorType>(Val))
2649 Index);
2650
2651 // TODO: This code replicates what LoopVectorize.cpp used to do when asking
2652 // for the cost of extracting the last lane of a scalable vector. It probably
2653 // needs a more accurate cost.
2654 ElementCount EC = cast<VectorType>(Val)->getElementCount();
2655 assert(Index < EC.getKnownMinValue() && "Unexpected reverse index");
2656 return getVectorInstrCost(Opcode, Val, CostKind,
2657 EC.getKnownMinValue() - 1 - Index, nullptr,
2658 nullptr);
2659}
2660
// RISCVTTIImpl::getArithmeticInstrCost — cost of a scalar/vector arithmetic
// instruction. Falls back to BasicTTIImpl for unsupported cost kinds,
// fixed-length vectors without RVV, and element types wider than ELEN;
// otherwise legalizes the type and prices the op via getRISCVInstructionCost.
// NOTE(review): extraction dropped several lines (doxygen 2661, 2663, 2667,
// 2709, 2716, 2720) — the full signature, the early cost-kind guard condition,
// the TargetLowering action enumerator compared on line 2708, and the CostKind
// arguments to the two getCastInstrCost calls. Diff against upstream before
// editing.
 2662 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
 2664 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
 2665
 2666 // TODO: Handle more cost kinds.
 2668 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
 2669 Args, CxtI);
 2670
 2671 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
 2672 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
 2673 Args, CxtI);
 2674
 2675 // Skip if scalar size of Ty is bigger than ELEN.
 2676 if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELen())
 2677 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
 2678 Args, CxtI);
 2679
 2680 // Legalize the type.
 2681 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
 2682 unsigned ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
 2683
 2684 // TODO: Handle scalar type.
 2685 if (!LT.second.isVector()) {
// Scalar div/rem are priced as "expensive" when the operation is legal or
// promotable; everything else defers to the base implementation.
 2686 static const CostTblEntry DivTbl[]{
 2687 {ISD::UDIV, MVT::i32, TTI::TCC_Expensive},
 2688 {ISD::UDIV, MVT::i64, TTI::TCC_Expensive},
 2689 {ISD::SDIV, MVT::i32, TTI::TCC_Expensive},
 2690 {ISD::SDIV, MVT::i64, TTI::TCC_Expensive},
 2691 {ISD::UREM, MVT::i32, TTI::TCC_Expensive},
 2692 {ISD::UREM, MVT::i64, TTI::TCC_Expensive},
 2693 {ISD::SREM, MVT::i32, TTI::TCC_Expensive},
 2694 {ISD::SREM, MVT::i64, TTI::TCC_Expensive}};
 2695 if (TLI->isOperationLegalOrPromote(ISDOpcode, LT.second))
 2696 if (const auto *Entry = CostTableLookup(DivTbl, ISDOpcode, LT.second))
 2697 return Entry->Cost * LT.first;
 2698
 2699 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
 2700 Args, CxtI);
 2701 }
 2702
 2703 // f16 with zvfhmin and bf16 will be promoted to f32.
 2704 // FIXME: nxv32[b]f16 will be custom lowered and split.
 2705 InstructionCost CastCost = 0;
 2706 if ((LT.second.getVectorElementType() == MVT::f16 ||
 2707 LT.second.getVectorElementType() == MVT::bf16) &&
 2708 TLI->getOperationAction(ISDOpcode, LT.second) ==
 2710 MVT PromotedVT = TLI->getTypeToPromoteTo(ISDOpcode, LT.second);
 2711 Type *PromotedTy = EVT(PromotedVT).getTypeForEVT(Ty->getContext());
 2712 Type *LegalTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
 2713 // Add cost of extending arguments
 2714 CastCost += LT.first * Args.size() *
 2715 getCastInstrCost(Instruction::FPExt, PromotedTy, LegalTy,
 2717 // Add cost of truncating result
 2718 CastCost +=
 2719 LT.first * getCastInstrCost(Instruction::FPTrunc, LegalTy, PromotedTy,
 2721 // Compute cost of op in promoted type
 2722 LT.second = PromotedVT;
 2723 }
 2724
// Constant operands that can be splatted (or folded into a 5-bit immediate)
// are treated as free; others pay a constant-pool load.
 2725 auto getConstantMatCost =
 2726 [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
 2727 if (OpInfo.isUniform() && canSplatOperand(Opcode, Operand))
 2728 // Two sub-cases:
 2729 // * Has a 5 bit immediate operand which can be splatted.
 2730 // * Has a larger immediate which must be materialized in scalar register
 2731 // We return 0 for both as we currently ignore the cost of materializing
 2732 // scalar constants in GPRs.
 2733 return 0;
 2734
 2735 return getConstantPoolLoadCost(Ty, CostKind);
 2736 };
 2737
 2738 // Add the cost of materializing any constant vectors required.
 2739 InstructionCost ConstantMatCost = 0;
 2740 if (Op1Info.isConstant())
 2741 ConstantMatCost += getConstantMatCost(0, Op1Info);
 2742 if (Op2Info.isConstant())
 2743 ConstantMatCost += getConstantMatCost(1, Op2Info);
 2744
// Map the ISD opcode to a representative RVV instruction used for pricing.
 2745 unsigned Op;
 2746 switch (ISDOpcode) {
 2747 case ISD::ADD:
 2748 case ISD::SUB:
 2749 Op = RISCV::VADD_VV;
 2750 break;
 2751 case ISD::SHL:
 2752 case ISD::SRL:
 2753 case ISD::SRA:
 2754 Op = RISCV::VSLL_VV;
 2755 break;
 2756 case ISD::AND:
 2757 case ISD::OR:
 2758 case ISD::XOR:
 2759 Op = (Ty->getScalarSizeInBits() == 1) ? RISCV::VMAND_MM : RISCV::VAND_VV;
 2760 break;
 2761 case ISD::MUL:
 2762 case ISD::MULHS:
 2763 case ISD::MULHU:
 2764 Op = RISCV::VMUL_VV;
 2765 break;
 2766 case ISD::SDIV:
 2767 case ISD::UDIV:
 2768 Op = RISCV::VDIV_VV;
 2769 break;
 2770 case ISD::SREM:
 2771 case ISD::UREM:
 2772 Op = RISCV::VREM_VV;
 2773 break;
 2774 case ISD::FADD:
 2775 case ISD::FSUB:
 2776 Op = RISCV::VFADD_VV;
 2777 break;
 2778 case ISD::FMUL:
 2779 Op = RISCV::VFMUL_VV;
 2780 break;
 2781 case ISD::FDIV:
 2782 Op = RISCV::VFDIV_VV;
 2783 break;
 2784 case ISD::FNEG:
 2785 Op = RISCV::VFSGNJN_VV;
 2786 break;
 2787 default:
 2788 // Assuming all other instructions have the same cost until a need arises to
 2789 // differentiate them.
 2790 return CastCost + ConstantMatCost +
 2791 BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
 2792 Args, CxtI);
 2793 }
 2794
 2795 InstructionCost InstrCost = getRISCVInstructionCost(Op, LT.second, CostKind);
 2796 // We use BasicTTIImpl to calculate scalar costs, which assumes floating point
 2797 // ops are twice as expensive as integer ops. Do the same for vectors so
 2798 // scalar floating point ops aren't cheaper than their vector equivalents.
 2799 if (Ty->isFPOrFPVectorTy())
 2800 InstrCost *= 2;
 2801 return CastCost + ConstantMatCost + LT.first * InstrCost;
 2802}
2803
 2804// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
// RISCVTTIImpl::getPointersChainCost — cost of a chain of related pointers.
// NOTE(review): extraction dropped the signature opener (doxygen 2805) and
// lines 2808-2809, which presumably include the CostKind parameter and the
// declaration/initialization of the `Cost` accumulator used below — confirm
// against upstream before editing.
 2806 ArrayRef<const Value *> Ptrs, const Value *Base,
 2807 const TTI::PointersChainInfo &Info, Type *AccessTy,
 2810 // In the basic model we take into account GEP instructions only
 2811 // (although here can come alloca instruction, a value, constants and/or
 2812 // constant expressions, PHIs, bitcasts ... whatever allowed to be used as a
 2813 // pointer). Typically, if Base is a not a GEP-instruction and all the
 2814 // pointers are relative to the same base address, all the rest are
 2815 // either GEP instructions, PHIs, bitcasts or constants. When we have same
 2816 // base, we just calculate cost of each non-Base GEP as an ADD operation if
 2817 // any their index is a non-const.
 2818 // If no known dependencies between the pointers cost is calculated as a sum
 2819 // of costs of GEP instructions.
 2820 for (auto [I, V] : enumerate(Ptrs)) {
 2821 const auto *GEP = dyn_cast<GetElementPtrInst>(V);
 2822 if (!GEP)
 2823 continue;
 2824 if (Info.isSameBase() && V != Base) {
 2825 if (GEP->hasAllConstantIndices())
 2826 continue;
 2827 // If the chain is unit-stride and BaseReg + stride*i is a legal
 2828 // addressing mode, then presume the base GEP is sitting around in a
 2829 // register somewhere and check if we can fold the offset relative to
 2830 // it.
 2831 unsigned Stride = DL.getTypeStoreSize(AccessTy);
 2832 if (Info.isUnitStride() &&
 2833 isLegalAddressingMode(AccessTy,
 2834 /* BaseGV */ nullptr,
 2835 /* BaseOffset */ Stride * I,
 2836 /* HasBaseReg */ true,
 2837 /* Scale */ 0,
 2838 GEP->getType()->getPointerAddressSpace()))
 2839 continue;
// Non-foldable same-base GEP: model it as a single scalar ADD.
 2840 Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
 2841 {TTI::OK_AnyValue, TTI::OP_None},
 2842 {TTI::OK_AnyValue, TTI::OP_None}, {});
 2843 } else {
 2844 SmallVector<const Value *> Indices(GEP->indices());
 2845 Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
 2846 Indices, AccessTy, CostKind);
 2847 }
 2848 }
 2849 return Cost;
 2850}
2851
// RISCVTTIImpl::getUnrollingPreferences — tune loop unrolling for RISC-V.
// Defers to the base implementation when the subtarget enables default
// unrolling; otherwise enables upper-bound/partial/runtime unrolling for
// small, call-free, non-vectorized loops.
// NOTE(review): extraction dropped the signature opener (doxygen 2852-2853)
// and lines 2868, 2892, 2913-2914 — line 2892 presumably declared the `Cost`
// accumulator summed below; confirm against upstream before editing.
 2854 OptimizationRemarkEmitter *ORE) const {
 2855 // TODO: More tuning on benchmarks and metrics with changes as needed
 2856 // would apply to all settings below to enable performance.
 2857
 2858
 2859 if (ST->enableDefaultUnroll())
 2860 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
 2861
 2862 // Enable Upper bound unrolling universally, not dependent upon the conditions
 2863 // below.
 2864 UP.UpperBound = true;
 2865
 2866 // Disable loop unrolling for Oz and Os.
 2867 UP.OptSizeThreshold = 0;
 2869 if (L->getHeader()->getParent()->hasOptSize())
 2870 return;
 2871
 2872 SmallVector<BasicBlock *, 4> ExitingBlocks;
 2873 L->getExitingBlocks(ExitingBlocks);
 2874 LLVM_DEBUG(dbgs() << "Loop has:\n"
 2875 << "Blocks: " << L->getNumBlocks() << "\n"
 2876 << "Exit blocks: " << ExitingBlocks.size() << "\n");
 2877
 2878 // Only allow another exit other than the latch. This acts as an early exit
 2879 // as it mirrors the profitability calculation of the runtime unroller.
 2880 if (ExitingBlocks.size() > 2)
 2881 return;
 2882
 2883 // Limit the CFG of the loop body for targets with a branch predictor.
 2884 // Allowing 4 blocks permits if-then-else diamonds in the body.
 2885 if (L->getNumBlocks() > 4)
 2886 return;
 2887
 2888 // Scan the loop: don't unroll loops with calls as this could prevent
 2889 // inlining. Don't unroll auto-vectorized loops either, though do allow
 2890 // unrolling of the scalar remainder.
 2891 bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
 2893 for (auto *BB : L->getBlocks()) {
 2894 for (auto &I : *BB) {
 2895 // Both auto-vectorized loops and the scalar remainder have the
 2896 // isvectorized attribute, so differentiate between them by the presence
 2897 // of vector instructions.
 2898 if (IsVectorized && (I.getType()->isVectorTy() ||
 2899 llvm::any_of(I.operand_values(), [](Value *V) {
 2900 return V->getType()->isVectorTy();
 2901 })))
 2902 return;
 2903
 2904 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
 2905 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
 2906 if (!isLoweredToCall(F))
 2907 continue;
 2908 }
 2909 return;
 2910 }
 2911
 2912 SmallVector<const Value *> Operands(I.operand_values());
 2913 Cost += getInstructionCost(&I, Operands,
 2915 }
 2916 }
 2917
 2918 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
 2919
 2920 UP.Partial = true;
 2921 UP.Runtime = true;
 2922 UP.UnrollRemainder = true;
 2923 UP.UnrollAndJam = true;
 2924
 2925 // Force unrolling small loops can be very useful because of the branch
 2926 // taken cost of the backedge.
 2927 if (Cost < 12)
 2928 UP.Force = true;
 2929}
2930
2935
// RISCVTTIImpl::getTgtMemIntrinsic — describe RVV load/store intrinsics
// (unit-stride, strided, and indexed, plus their segment/masked variants) for
// memory-instrumentation clients. Records the pointer operand, access type,
// alignment, mask, and EVL in Info.InterestingOperands; returns true when the
// intrinsic was recognized.
// NOTE(review): extraction dropped the signature opener (doxygen 2936) and the
// `Ty = VectorType::get(` openers on doxygen lines 3004, 3080 and 3199 —
// confirm against upstream before editing.
 2937 MemIntrinsicInfo &Info) const {
 2938 const DataLayout &DL = getDataLayout();
 2939 Intrinsic::ID IID = Inst->getIntrinsicID();
 2940 LLVMContext &C = Inst->getContext();
 2941 bool HasMask = false;
 2942
// Segment count: segment intrinsics carry it as the integer parameter of the
// riscv.vector.tuple TargetExtType on operand 0; non-segment ops count as 1.
 2943 auto getSegNum = [](const IntrinsicInst *II, unsigned PtrOperandNo,
 2944 bool IsWrite) -> int64_t {
 2945 if (auto *TarExtTy =
 2946 dyn_cast<TargetExtType>(II->getArgOperand(0)->getType()))
 2947 return TarExtTy->getIntParameter(0);
 2948
 2949 return 1;
 2950 };
 2951
 2952 switch (IID) {
 2953 case Intrinsic::riscv_vle_mask:
 2954 case Intrinsic::riscv_vse_mask:
 2955 case Intrinsic::riscv_vlseg2_mask:
 2956 case Intrinsic::riscv_vlseg3_mask:
 2957 case Intrinsic::riscv_vlseg4_mask:
 2958 case Intrinsic::riscv_vlseg5_mask:
 2959 case Intrinsic::riscv_vlseg6_mask:
 2960 case Intrinsic::riscv_vlseg7_mask:
 2961 case Intrinsic::riscv_vlseg8_mask:
 2962 case Intrinsic::riscv_vsseg2_mask:
 2963 case Intrinsic::riscv_vsseg3_mask:
 2964 case Intrinsic::riscv_vsseg4_mask:
 2965 case Intrinsic::riscv_vsseg5_mask:
 2966 case Intrinsic::riscv_vsseg6_mask:
 2967 case Intrinsic::riscv_vsseg7_mask:
 2968 case Intrinsic::riscv_vsseg8_mask:
 2969 HasMask = true;
 2970 [[fallthrough]];
 2971 case Intrinsic::riscv_vle:
 2972 case Intrinsic::riscv_vse:
 2973 case Intrinsic::riscv_vlseg2:
 2974 case Intrinsic::riscv_vlseg3:
 2975 case Intrinsic::riscv_vlseg4:
 2976 case Intrinsic::riscv_vlseg5:
 2977 case Intrinsic::riscv_vlseg6:
 2978 case Intrinsic::riscv_vlseg7:
 2979 case Intrinsic::riscv_vlseg8:
 2980 case Intrinsic::riscv_vsseg2:
 2981 case Intrinsic::riscv_vsseg3:
 2982 case Intrinsic::riscv_vsseg4:
 2983 case Intrinsic::riscv_vsseg5:
 2984 case Intrinsic::riscv_vsseg6:
 2985 case Intrinsic::riscv_vsseg7:
 2986 case Intrinsic::riscv_vsseg8: {
 2987 // Intrinsic interface:
 2988 // riscv_vle(merge, ptr, vl)
 2989 // riscv_vle_mask(merge, ptr, mask, vl, policy)
 2990 // riscv_vse(val, ptr, vl)
 2991 // riscv_vse_mask(val, ptr, mask, vl, policy)
 2992 // riscv_vlseg#(merge, ptr, vl, sew)
 2993 // riscv_vlseg#_mask(merge, ptr, mask, vl, policy, sew)
 2994 // riscv_vsseg#(val, ptr, vl, sew)
 2995 // riscv_vsseg#_mask(val, ptr, mask, vl, sew)
 2996 bool IsWrite = Inst->getType()->isVoidTy();
 2997 Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
 2998 // The results of segment loads are TargetExtType.
 2999 if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
 3000 unsigned SEW =
 3001 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
 3002 ->getZExtValue();
 3003 Ty = TarExtTy->getTypeParameter(0U);
 3005 IntegerType::get(C, SEW),
 3006 cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
 3007 }
// VL operand position comes from the intrinsics table; the pointer sits
// immediately before it (and before the mask, if present).
 3008 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
 3009 unsigned VLIndex = RVVIInfo->VLOperand;
 3010 unsigned PtrOperandNo = VLIndex - 1 - HasMask;
 3011 MaybeAlign Alignment =
 3012 Inst->getArgOperand(PtrOperandNo)->getPointerAlignment(DL);
 3013 Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
 3014 Value *Mask = ConstantInt::getTrue(MaskType);
 3015 if (HasMask)
 3016 Mask = Inst->getArgOperand(VLIndex - 1);
 3017 Value *EVL = Inst->getArgOperand(VLIndex);
 3018 unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
 3019 // RVV uses contiguous elements as a segment.
 3020 if (SegNum > 1) {
 3021 unsigned ElemSize = Ty->getScalarSizeInBits();
 3022 auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
 3023 Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
 3024 }
 3025 Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
 3026 Alignment, Mask, EVL);
 3027 return true;
 3028 }
 3029 case Intrinsic::riscv_vlse_mask:
 3030 case Intrinsic::riscv_vsse_mask:
 3031 case Intrinsic::riscv_vlsseg2_mask:
 3032 case Intrinsic::riscv_vlsseg3_mask:
 3033 case Intrinsic::riscv_vlsseg4_mask:
 3034 case Intrinsic::riscv_vlsseg5_mask:
 3035 case Intrinsic::riscv_vlsseg6_mask:
 3036 case Intrinsic::riscv_vlsseg7_mask:
 3037 case Intrinsic::riscv_vlsseg8_mask:
 3038 case Intrinsic::riscv_vssseg2_mask:
 3039 case Intrinsic::riscv_vssseg3_mask:
 3040 case Intrinsic::riscv_vssseg4_mask:
 3041 case Intrinsic::riscv_vssseg5_mask:
 3042 case Intrinsic::riscv_vssseg6_mask:
 3043 case Intrinsic::riscv_vssseg7_mask:
 3044 case Intrinsic::riscv_vssseg8_mask:
 3045 HasMask = true;
 3046 [[fallthrough]];
 3047 case Intrinsic::riscv_vlse:
 3048 case Intrinsic::riscv_vsse:
 3049 case Intrinsic::riscv_vlsseg2:
 3050 case Intrinsic::riscv_vlsseg3:
 3051 case Intrinsic::riscv_vlsseg4:
 3052 case Intrinsic::riscv_vlsseg5:
 3053 case Intrinsic::riscv_vlsseg6:
 3054 case Intrinsic::riscv_vlsseg7:
 3055 case Intrinsic::riscv_vlsseg8:
 3056 case Intrinsic::riscv_vssseg2:
 3057 case Intrinsic::riscv_vssseg3:
 3058 case Intrinsic::riscv_vssseg4:
 3059 case Intrinsic::riscv_vssseg5:
 3060 case Intrinsic::riscv_vssseg6:
 3061 case Intrinsic::riscv_vssseg7:
 3062 case Intrinsic::riscv_vssseg8: {
 3063 // Intrinsic interface:
 3064 // riscv_vlse(merge, ptr, stride, vl)
 3065 // riscv_vlse_mask(merge, ptr, stride, mask, vl, policy)
 3066 // riscv_vsse(val, ptr, stride, vl)
 3067 // riscv_vsse_mask(val, ptr, stride, mask, vl, policy)
 3068 // riscv_vlsseg#(merge, ptr, offset, vl, sew)
 3069 // riscv_vlsseg#_mask(merge, ptr, offset, mask, vl, policy, sew)
 3070 // riscv_vssseg#(val, ptr, offset, vl, sew)
 3071 // riscv_vssseg#_mask(val, ptr, offset, mask, vl, sew)
 3072 bool IsWrite = Inst->getType()->isVoidTy();
 3073 Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
 3074 // The results of segment loads are TargetExtType.
 3075 if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
 3076 unsigned SEW =
 3077 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
 3078 ->getZExtValue();
 3079 Ty = TarExtTy->getTypeParameter(0U);
 3081 IntegerType::get(C, SEW),
 3082 cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
 3083 }
 3084 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
 3085 unsigned VLIndex = RVVIInfo->VLOperand;
 3086 unsigned PtrOperandNo = VLIndex - 2 - HasMask;
 3087 MaybeAlign Alignment =
 3088 Inst->getArgOperand(PtrOperandNo)->getPointerAlignment(DL);
 3089
 3090 Value *Stride = Inst->getArgOperand(PtrOperandNo + 1);
 3091 // Use the pointer alignment as the element alignment if the stride is a
 3092 // multiple of the pointer alignment. Otherwise, the element alignment
 3093 // should be the greatest common divisor of pointer alignment and stride.
 3094 // For simplicity, just consider unalignment for elements.
 3095 unsigned PointerAlign = Alignment.valueOrOne().value();
 3096 if (!isa<ConstantInt>(Stride) ||
 3097 cast<ConstantInt>(Stride)->getZExtValue() % PointerAlign != 0)
 3098 Alignment = Align(1);
 3099
 3100 Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
 3101 Value *Mask = ConstantInt::getTrue(MaskType);
 3102 if (HasMask)
 3103 Mask = Inst->getArgOperand(VLIndex - 1);
 3104 Value *EVL = Inst->getArgOperand(VLIndex);
 3105 unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
 3106 // RVV uses contiguous elements as a segment.
 3107 if (SegNum > 1) {
 3108 unsigned ElemSize = Ty->getScalarSizeInBits();
 3109 auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
 3110 Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
 3111 }
 3112 Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
 3113 Alignment, Mask, EVL, Stride);
 3114 return true;
 3115 }
 3116 case Intrinsic::riscv_vloxei_mask:
 3117 case Intrinsic::riscv_vluxei_mask:
 3118 case Intrinsic::riscv_vsoxei_mask:
 3119 case Intrinsic::riscv_vsuxei_mask:
 3120 case Intrinsic::riscv_vloxseg2_mask:
 3121 case Intrinsic::riscv_vloxseg3_mask:
 3122 case Intrinsic::riscv_vloxseg4_mask:
 3123 case Intrinsic::riscv_vloxseg5_mask:
 3124 case Intrinsic::riscv_vloxseg6_mask:
 3125 case Intrinsic::riscv_vloxseg7_mask:
 3126 case Intrinsic::riscv_vloxseg8_mask:
 3127 case Intrinsic::riscv_vluxseg2_mask:
 3128 case Intrinsic::riscv_vluxseg3_mask:
 3129 case Intrinsic::riscv_vluxseg4_mask:
 3130 case Intrinsic::riscv_vluxseg5_mask:
 3131 case Intrinsic::riscv_vluxseg6_mask:
 3132 case Intrinsic::riscv_vluxseg7_mask:
 3133 case Intrinsic::riscv_vluxseg8_mask:
 3134 case Intrinsic::riscv_vsoxseg2_mask:
 3135 case Intrinsic::riscv_vsoxseg3_mask:
 3136 case Intrinsic::riscv_vsoxseg4_mask:
 3137 case Intrinsic::riscv_vsoxseg5_mask:
 3138 case Intrinsic::riscv_vsoxseg6_mask:
 3139 case Intrinsic::riscv_vsoxseg7_mask:
 3140 case Intrinsic::riscv_vsoxseg8_mask:
 3141 case Intrinsic::riscv_vsuxseg2_mask:
 3142 case Intrinsic::riscv_vsuxseg3_mask:
 3143 case Intrinsic::riscv_vsuxseg4_mask:
 3144 case Intrinsic::riscv_vsuxseg5_mask:
 3145 case Intrinsic::riscv_vsuxseg6_mask:
 3146 case Intrinsic::riscv_vsuxseg7_mask:
 3147 case Intrinsic::riscv_vsuxseg8_mask:
 3148 HasMask = true;
 3149 [[fallthrough]];
 3150 case Intrinsic::riscv_vloxei:
 3151 case Intrinsic::riscv_vluxei:
 3152 case Intrinsic::riscv_vsoxei:
 3153 case Intrinsic::riscv_vsuxei:
 3154 case Intrinsic::riscv_vloxseg2:
 3155 case Intrinsic::riscv_vloxseg3:
 3156 case Intrinsic::riscv_vloxseg4:
 3157 case Intrinsic::riscv_vloxseg5:
 3158 case Intrinsic::riscv_vloxseg6:
 3159 case Intrinsic::riscv_vloxseg7:
 3160 case Intrinsic::riscv_vloxseg8:
 3161 case Intrinsic::riscv_vluxseg2:
 3162 case Intrinsic::riscv_vluxseg3:
 3163 case Intrinsic::riscv_vluxseg4:
 3164 case Intrinsic::riscv_vluxseg5:
 3165 case Intrinsic::riscv_vluxseg6:
 3166 case Intrinsic::riscv_vluxseg7:
 3167 case Intrinsic::riscv_vluxseg8:
 3168 case Intrinsic::riscv_vsoxseg2:
 3169 case Intrinsic::riscv_vsoxseg3:
 3170 case Intrinsic::riscv_vsoxseg4:
 3171 case Intrinsic::riscv_vsoxseg5:
 3172 case Intrinsic::riscv_vsoxseg6:
 3173 case Intrinsic::riscv_vsoxseg7:
 3174 case Intrinsic::riscv_vsoxseg8:
 3175 case Intrinsic::riscv_vsuxseg2:
 3176 case Intrinsic::riscv_vsuxseg3:
 3177 case Intrinsic::riscv_vsuxseg4:
 3178 case Intrinsic::riscv_vsuxseg5:
 3179 case Intrinsic::riscv_vsuxseg6:
 3180 case Intrinsic::riscv_vsuxseg7:
 3181 case Intrinsic::riscv_vsuxseg8: {
 3182 // Intrinsic interface (only listed ordered version):
 3183 // riscv_vloxei(merge, ptr, index, vl)
 3184 // riscv_vloxei_mask(merge, ptr, index, mask, vl, policy)
 3185 // riscv_vsoxei(val, ptr, index, vl)
 3186 // riscv_vsoxei_mask(val, ptr, index, mask, vl, policy)
 3187 // riscv_vloxseg#(merge, ptr, index, vl, sew)
 3188 // riscv_vloxseg#_mask(merge, ptr, index, mask, vl, policy, sew)
 3189 // riscv_vsoxseg#(val, ptr, index, vl, sew)
 3190 // riscv_vsoxseg#_mask(val, ptr, index, mask, vl, sew)
 3191 bool IsWrite = Inst->getType()->isVoidTy();
 3192 Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
 3193 // The results of segment loads are TargetExtType.
 3194 if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
 3195 unsigned SEW =
 3196 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
 3197 ->getZExtValue();
 3198 Ty = TarExtTy->getTypeParameter(0U);
 3200 IntegerType::get(C, SEW),
 3201 cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
 3202 }
 3203 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
 3204 unsigned VLIndex = RVVIInfo->VLOperand;
 3205 unsigned PtrOperandNo = VLIndex - 2 - HasMask;
 3206 Value *Mask;
 3207 if (HasMask) {
 3208 Mask = Inst->getArgOperand(VLIndex - 1);
 3209 } else {
 3210 // Mask cannot be nullptr here: vector GEP produces <vscale x N x ptr>,
 3211 // and casting that to scalar i64 triggers a vector/scalar mismatch
 3212 // assertion in CreatePointerCast. Use an all-true mask so ASan lowers it
 3213 // via extractelement instead.
 3214 Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
 3215 Mask = ConstantInt::getTrue(MaskType);
 3216 }
 3217 Value *EVL = Inst->getArgOperand(VLIndex);
 3218 unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
 3219 // RVV uses contiguous elements as a segment.
 3220 if (SegNum > 1) {
 3221 unsigned ElemSize = Ty->getScalarSizeInBits();
 3222 auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
 3223 Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
 3224 }
 3225 Value *OffsetOp = Inst->getArgOperand(PtrOperandNo + 1);
 3226 Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
 3227 Align(1), Mask, EVL,
 3228 /* Stride */ nullptr, OffsetOp);
 3229 return true;
 3230 }
 3231 }
 3232 return false;
 3233}
3234
// RISCVTTIImpl::getRegUsageForType — number of vector register (LMUL) blocks
// a value of type Ty occupies; falls back to the base implementation for
// non-vector types.
// NOTE(review): the signature opener (doxygen 3235) was lost in extraction.
 3236 if (Ty->isVectorTy()) {
 3237 // f16 with only zvfhmin and bf16 will be promoted to f32
 3238 Type *EltTy = cast<VectorType>(Ty)->getElementType();
 3239 if ((EltTy->isHalfTy() && !ST->hasVInstructionsF16()) ||
 3240 EltTy->isBFloatTy())
 3241 Ty = VectorType::get(Type::getFloatTy(Ty->getContext()),
 3242 cast<VectorType>(Ty));
 3243
 3244 TypeSize Size = DL.getTypeSizeInBits(Ty);
// Scalable vectors: one register per RVVBitsPerBlock chunk (rounded up).
 3245 if (Size.isScalable() && ST->hasVInstructions())
 3246 return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);
 3247
 3248 if (ST->useRVVForFixedLengthVectors())
 3249 return divideCeil(Size, ST->getRealMinVLen());
 3250 }
 3251
 3252 return BaseT::getRegUsageForType(Ty);
 3253}
3254
// Maximum vectorization factor for the SLP vectorizer: either the value of
// the -riscv-v-slp-max-vf override, or how many ElemWidth-bit lanes fit in a
// vector register.
unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
 3256 if (SLPMaxVF.getNumOccurrences())
 3257 return SLPMaxVF;
 3258
 3259 // Return how many elements can fit in getRegisterBitwidth. This is the
 3260 // same routine as used in LoopVectorizer. We should probably be
 3261 // accounting for whether we actually have instructions with the right
 3262 // lane type, but we don't have enough information to do that without
 3263 // some additional plumbing which hasn't been justified yet.
// NOTE(review): doxygen line 3265 (the getRegisterBitWidth call initializing
// RegWidth) was lost in extraction — confirm against upstream.
 3264 TypeSize RegWidth =
 3266 // If no vector registers, or absurd element widths, disable
 3267 // vectorization by returning 1.
 3268 return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
 3269}
3270
 3274
// NOTE(review): extraction dropped this predicate's signature (doxygen 3275);
// only the body survives. It returns whether the subtarget permits unaligned
// vector memory access — identify the member name from the upstream file.
 3276 return ST->enableUnalignedVectorMem();
 3277}
3278
// RISCVTTIImpl::getPreferredAddressingMode — prefer post-indexed addressing
// on rv32 cores with the XCVmem vendor extension; otherwise fall through to
// the default (line with the base-class return, doxygen 3285, was lost in
// extraction along with the signature opener on 3279-3280).
 3281 ScalarEvolution *SE) const {
 3282 if (ST->hasVendorXCVmem() && !ST->is64Bit())
 3283 return TTI::AMK_PostIndexed;
 3284
 3286}
3287
// RISCVTTIImpl::isLSRCostLess — LSR cost comparison, instruction count first.
// NOTE(review): the signature opener (doxygen 3288) was lost in extraction.
 3289 const TargetTransformInfo::LSRCost &C2) const {
 3290 // RISC-V specific here are "instruction number 1st priority".
 3291 // If we need to emit adds inside the loop to add up base registers, then
 3292 // we need at least one extra temporary register.
 3293 unsigned C1NumRegs = C1.NumRegs + (C1.NumBaseAdds != 0);
 3294 unsigned C2NumRegs = C2.NumRegs + (C2.NumBaseAdds != 0);
// Lexicographic comparison: Insns dominates, then (adjusted) register count,
// then the remaining cost components.
 3295 return std::tie(C1.Insns, C1NumRegs, C1.AddRecCost,
 3296 C1.NumIVMuls, C1.NumBaseAdds,
 3297 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
 3298 std::tie(C2.Insns, C2NumRegs, C2.AddRecCost,
 3299 C2.NumIVMuls, C2.NumBaseAdds,
 3300 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
3302
// Legality predicate for a masked vector memory operation on fixed-length
// vectors only (scalable types are rejected). NOTE(review): the signature
// opener (doxygen 3303) was lost in extraction — identify which isLegal*
// hook this is from the upstream file.
 3304 Align Alignment) const {
 3305 auto *VTy = dyn_cast<VectorType>(DataTy);
 3306 if (!VTy || VTy->isScalableTy())
 3307 return false;
 3308
 3309 if (!isLegalMaskedLoadStore(DataTy, Alignment))
 3310 return false;
 3311
 3312 // FIXME: If it is an i8 vector and the element count exceeds 256, we should
 3313 // scalarize these types with LMUL >= maximum fixed-length LMUL.
 3314 if (VTy->getElementType()->isIntegerTy(8))
 3315 if (VTy->getElementCount().getFixedValue() > 256)
 3316 return VTy->getPrimitiveSizeInBits() / ST->getRealMinVLen() <
 3317 ST->getMaxLMULForFixedLengthVectors();
 3318 return true;
 3319}
3320
// Companion legality predicate: fixed-length vector type with a legal masked
// load/store. NOTE(review): the signature opener (doxygen 3321) was lost in
// extraction — identify which isLegal* hook this is from the upstream file.
 3322 Align Alignment) const {
 3323 auto *VTy = dyn_cast<VectorType>(DataTy);
 3324 if (!VTy || VTy->isScalableTy())
 3325 return false;
 3326
 3327 if (!isLegalMaskedLoadStore(DataTy, Alignment))
 3328 return false;
 3329 return true;
 3330}
3331
/// See if \p I should be considered for address type promotion. We check if \p
/// I is a sext with right type and used in memory accesses. If it used in a
/// "complex" getelementptr, we allow it to be promoted without finding other
/// sext instructions that sign extended the same initial value. A getelementptr
/// is considered as "complex" if it has more than 2 operands.
// NOTE(review): the signature opener (doxygen 3337) was lost in extraction.
 3338 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
 3339 bool Considerable = false;
 3340 AllowPromotionWithoutCommonHeader = false;
 3341 if (!isa<SExtInst>(&I))
 3342 return false;
// Only sext-to-i64 is interesting for address computation on RV64.
 3343 Type *ConsideredSExtType =
 3344 Type::getInt64Ty(I.getParent()->getParent()->getContext());
 3345 if (I.getType() != ConsideredSExtType)
 3346 return false;
 3347 // See if the sext is the one with the right type and used in at least one
 3348 // GetElementPtrInst.
 3349 for (const User *U : I.users()) {
 3350 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
 3351 Considerable = true;
 3352 // A getelementptr is considered as "complex" if it has more than 2
 3353 // operands. We will promote a SExt used in such complex GEP as we
 3354 // expect some computation to be merged if they are done on 64 bits.
 3355 if (GEPInst->getNumOperands() > 2) {
 3356 AllowPromotionWithoutCommonHeader = true;
 3357 break;
 3358 }
 3359 }
 3360 }
 3361 return Considerable;
 3362}
3363
3364bool RISCVTTIImpl::canSplatOperand(unsigned Opcode, int Operand) const {
3365 switch (Opcode) {
3366 case Instruction::Add:
3367 case Instruction::Sub:
3368 case Instruction::Mul:
3369 case Instruction::And:
3370 case Instruction::Or:
3371 case Instruction::Xor:
3372 case Instruction::FAdd:
3373 case Instruction::FSub:
3374 case Instruction::FMul:
3375 case Instruction::FDiv:
3376 case Instruction::ICmp:
3377 case Instruction::FCmp:
3378 return true;
3379 case Instruction::Shl:
3380 case Instruction::LShr:
3381 case Instruction::AShr:
3382 case Instruction::UDiv:
3383 case Instruction::SDiv:
3384 case Instruction::URem:
3385 case Instruction::SRem:
3386 case Instruction::Select:
3387 return Operand == 1;
3388 default:
3389 return false;
3390 }
3391}
3392
// Instruction-based canSplatOperand: defers to the opcode overload for plain
// instructions, then handles VP/saturating/min-max intrinsics by operand
// position. NOTE(review): the signature opener (doxygen 3393) was lost in
// extraction.
 3394 if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
 3395 return false;
 3396
 3397 if (canSplatOperand(I->getOpcode(), Operand))
 3398 return true;
 3399
 3400 auto *II = dyn_cast<IntrinsicInst>(I);
 3401 if (!II)
 3402 return false;
 3403
 3404 switch (II->getIntrinsicID()) {
 3405 case Intrinsic::fma:
 3406 case Intrinsic::vp_fma:
 3407 case Intrinsic::fmuladd:
 3408 case Intrinsic::vp_fmuladd:
 3409 return Operand == 0 || Operand == 1;
 3410 case Intrinsic::vp_shl:
 3411 case Intrinsic::vp_lshr:
 3412 case Intrinsic::vp_ashr:
 3413 case Intrinsic::vp_udiv:
 3414 case Intrinsic::vp_sdiv:
 3415 case Intrinsic::vp_urem:
 3416 case Intrinsic::vp_srem:
 3417 case Intrinsic::ssub_sat:
 3418 case Intrinsic::vp_ssub_sat:
 3419 case Intrinsic::usub_sat:
 3420 case Intrinsic::vp_usub_sat:
 3421 case Intrinsic::vp_select:
 3422 return Operand == 1;
 3423 // These intrinsics are commutative.
 3424 case Intrinsic::vp_add:
 3425 case Intrinsic::vp_mul:
 3426 case Intrinsic::vp_and:
 3427 case Intrinsic::vp_or:
 3428 case Intrinsic::vp_xor:
 3429 case Intrinsic::vp_fadd:
 3430 case Intrinsic::vp_fmul:
 3431 case Intrinsic::vp_icmp:
 3432 case Intrinsic::vp_fcmp:
 3433 case Intrinsic::smin:
 3434 case Intrinsic::vp_smin:
 3435 case Intrinsic::umin:
 3436 case Intrinsic::vp_umin:
 3437 case Intrinsic::smax:
 3438 case Intrinsic::vp_smax:
 3439 case Intrinsic::umax:
 3440 case Intrinsic::vp_umax:
 3441 case Intrinsic::sadd_sat:
 3442 case Intrinsic::vp_sadd_sat:
 3443 case Intrinsic::uadd_sat:
 3444 case Intrinsic::vp_uadd_sat:
 3445 // These intrinsics have 'vr' versions.
 3446 case Intrinsic::vp_sub:
 3447 case Intrinsic::vp_fsub:
 3448 case Intrinsic::vp_fdiv:
 3449 return Operand == 0 || Operand == 1;
 3450 default:
 3451 return false;
 3452 }
 3453}
3454
/// Check if sinking \p I's operands to I's basic block is profitable, because
/// the operands can be folded into a target instruction, e.g.
/// splats of scalars can fold into vector instructions.
// NOTE(review): extraction dropped the signature opener (doxygen 3458-3459)
// and the pattern-match openers on doxygen lines 3481 and 3516 (the
// m_Shuffle/m_InsertElt matches whose argument tails appear below) — confirm
// against upstream before editing.
 3460 using namespace llvm::PatternMatch;
 3461
 3462 if (I->isBitwiseLogicOp()) {
 3463 if (!I->getType()->isVectorTy()) {
 3464 if (ST->hasStdExtZbb() || ST->hasStdExtZbkb()) {
 3465 for (auto &Op : I->operands()) {
 3466 // (and/or/xor X, (not Y)) -> (andn/orn/xnor X, Y)
 3467 if (match(Op.get(), m_Not(m_Value()))) {
 3468 Ops.push_back(&Op);
 3469 return true;
 3470 }
 3471 }
 3472 }
 3473 } else if (I->getOpcode() == Instruction::And && ST->hasStdExtZvkb()) {
 3474 for (auto &Op : I->operands()) {
 3475 // (and X, (not Y)) -> (vandn.vv X, Y)
 3476 if (match(Op.get(), m_Not(m_Value()))) {
 3477 Ops.push_back(&Op);
 3478 return true;
 3479 }
 3480 // (and X, (splat (not Y))) -> (vandn.vx X, Y)
 3482 m_ZeroInt()),
 3483 m_Value(), m_ZeroMask()))) {
 3484 Use &InsertElt = cast<Instruction>(Op)->getOperandUse(0);
 3485 Use &Not = cast<Instruction>(InsertElt)->getOperandUse(1);
// Sink the not, the insertelement and the shuffle so ISel can form vandn.vx.
 3486 Ops.push_back(&Not);
 3487 Ops.push_back(&InsertElt);
 3488 Ops.push_back(&Op);
 3489 return true;
 3490 }
 3491 }
 3492 }
 3493 }
 3494
 3495 if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
 3496 return false;
 3497
 3498 // Don't sink splat operands if the target prefers it. Some targets requires
 3499 // S2V transfer buffers and we can run out of them copying the same value
 3500 // repeatedly.
 3501 // FIXME: It could still be worth doing if it would improve vector register
 3502 // pressure and prevent a vector spill.
 3503 if (!ST->sinkSplatOperands())
 3504 return false;
 3505
 3506 for (auto OpIdx : enumerate(I->operands())) {
 3507 if (!canSplatOperand(I, OpIdx.index()))
 3508 continue;
 3509
 3510 Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
 3511 // Make sure we are not already sinking this operand
 3512 if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
 3513 continue;
 3514
 3515 // We are looking for a splat that can be sunk.
 3517 m_Value(), m_ZeroMask())))
 3518 continue;
 3519
 3520 // Don't sink i1 splats.
 3521 if (cast<VectorType>(Op->getType())->getElementType()->isIntegerTy(1))
 3522 continue;
 3523
 3524 // All uses of the shuffle should be sunk to avoid duplicating it across gpr
 3525 // and vector registers
 3526 for (Use &U : Op->uses()) {
 3527 Instruction *Insn = cast<Instruction>(U.getUser());
 3528 if (!canSplatOperand(Insn, U.getOperandNo()))
 3529 return false;
 3530 }
 3531
 3532 // Sink any fpexts since they might be used in a widening fp pattern.
 3533 Use *InsertEltUse = &Op->getOperandUse(0);
 3534 auto *InsertElt = cast<InsertElementInst>(InsertEltUse);
 3535 if (isa<FPExtInst>(InsertElt->getOperand(1)))
 3536 Ops.push_back(&InsertElt->getOperandUse(1));
 3537 Ops.push_back(InsertEltUse);
 3538 Ops.push_back(&OpIdx.value());
 3539 }
 3540 return true;
 3541}
3542
// RISCVTTIImpl::enableMemCmpExpansion — configure inline memcmp expansion.
// Requires unaligned scalar access; equality comparisons may additionally use
// whole-register vector loads. NOTE(review): extraction dropped the return
// type on doxygen 3543 and the local `Options` declaration on 3545.
RISCVTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
 3546 // TODO: Enable expansion when unaligned access is not supported after we fix
 3547 // issues in ExpandMemcmp.
 3548 if (!ST->enableUnalignedScalarMem())
 3549 return Options;
 3550
 3551 if (!ST->hasStdExtZbb() && !ST->hasStdExtZbkb() && !IsZeroCmp)
 3552 return Options;
 3553
 3554 Options.AllowOverlappingLoads = true;
 3555 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
 3556 Options.NumLoadsPerBlock = Options.MaxNumLoads;
 3557 if (ST->is64Bit()) {
 3558 Options.LoadSizes = {8, 4, 2, 1};
 3559 Options.AllowedTailExpansions = {3, 5, 6};
 3560 } else {
 3561 Options.LoadSizes = {4, 2, 1};
 3562 Options.AllowedTailExpansions = {3};
 3563 }
 3564
 3565 if (IsZeroCmp && ST->hasVInstructions()) {
 3566 unsigned VLenB = ST->getRealMinVLen() / 8;
 3567 // The minimum size should be `XLen / 8 + 1`, and the maxinum size should be
 3568 // `VLenB * MaxLMUL` so that it fits in a single register group.
 3569 unsigned MinSize = ST->getXLen() / 8 + 1;
 3570 unsigned MaxSize = VLenB * ST->getMaxLMULForFixedLengthVectors();
// Prepend vector sizes so the expansion prefers the largest loads first.
 3571 for (unsigned Size = MinSize; Size <= MaxSize; Size++)
 3572 Options.LoadSizes.insert(Options.LoadSizes.begin(), Size);
 3573 }
 3574 return Options;
 3575}
3576
// Select-like treatment for or/add/sub instructions (guarded by the
// EnableOrLikeSelectOpt-style condition on the dropped doxygen line 3579).
// NOTE(review): the signature opener (3577), the enclosing if-condition
// (3579), and the final return (3593) were lost in extraction — confirm
// against upstream before editing.
 3578 const Instruction *I) const {
 3580 // For the binary operators (e.g. or) we need to be more careful than
 3581 // selects, here we only transform them if they are already at a natural
 3582 // break point in the code - the end of a block with an unconditional
 3583 // terminator.
 3584 if (I->getOpcode() == Instruction::Or &&
 3585 isa<BranchInst>(I->getNextNode()) &&
 3586 cast<BranchInst>(I->getNextNode())->isUnconditional())
 3587 return true;
 3588
 3589 if (I->getOpcode() == Instruction::Add ||
 3590 I->getOpcode() == Instruction::Sub)
 3591 return true;
 3592 }
 3594}
3595
// Attribute-compatibility check used when outlining: the "interrupt"
// attribute must not propagate to an outlined function. NOTE(review): the
// signature opener (doxygen 3596) and the final base-class return (3605)
// were lost in extraction — identify the exact hook from the upstream file.
 3597 const Function *Caller, const Attribute &Attr) const {
 3598 // "interrupt" controls the prolog/epilog of interrupt handlers (and includes
 3599 // restrictions on their signatures). We can outline from the bodies of these
 3600 // handlers, but when we do we need to make sure we don't mark the outlined
 3601 // function as an interrupt handler too.
 3602 if (Attr.isStringAttribute() && Attr.getKindAsString() == "interrupt")
 3603 return false;
 3604
 3606}
3607
std::optional<Instruction *>
// InstCombine hook: fold bitcast(riscv.vmv.v.x splat) by re-splatting a wider
// scalar so the bitcast disappears. Requires every user to be a bitcast to
// the same target vector type and the widened element type to be RVV-legal.
// NOTE(review): extraction dropped the signature opener (doxygen 3609), the
// match() opener on 3621, and the IRBuilder/intrinsic-call openers on
// 3651-3652 — confirm against upstream before editing.
 3610 // If all operands of a vmv.v.x are constant, fold a bitcast(vmv.v.x) to scale
 3611 // the vmv.v.x, enabling removal of the bitcast. The transform helps avoid
 3612 // creating redundant masks.
 3613 const DataLayout &DL = IC.getDataLayout();
 3614 if (II.user_empty())
 3615 return {};
 3616 auto *TargetVecTy = dyn_cast<ScalableVectorType>(II.user_back()->getType());
 3617 if (!TargetVecTy)
 3618 return {};
 3619 const APInt *Scalar;
 3620 uint64_t VL;
 3622 m_Poison(), m_APInt(Scalar), m_ConstantInt(VL))) ||
 3623 !all_of(II.users(), [TargetVecTy](User *U) {
 3624 return U->getType() == TargetVecTy && match(U, m_BitCast(m_Value()));
 3625 }))
 3626 return {};
 3627 auto *SourceVecTy = cast<ScalableVectorType>(II.getType());
 3628 unsigned TargetEltBW = DL.getTypeSizeInBits(TargetVecTy->getElementType());
 3629 unsigned SourceEltBW = DL.getTypeSizeInBits(SourceVecTy->getElementType());
// The target element width must be an exact multiple of the source's, and VL
// and the element count must divide evenly by the scale factor.
 3630 if (TargetEltBW % SourceEltBW)
 3631 return {};
 3632 unsigned TargetScale = TargetEltBW / SourceEltBW;
 3633 if (VL % TargetScale)
 3634 return {};
 3635 Type *VLTy = II.getOperand(2)->getType();
 3636 ElementCount SourceEC = SourceVecTy->getElementCount();
 3637 unsigned NewEltBW = SourceEltBW * TargetScale;
 3638 if (!SourceEC.isKnownMultipleOf(TargetScale) ||
 3639 !DL.fitsInLegalInteger(NewEltBW))
 3640 return {};
 3641 auto *NewEltTy = IntegerType::get(II.getContext(), NewEltBW);
 3642 if (!TLI->isLegalElementTypeForRVV(TLI->getValueType(DL, NewEltTy)))
 3643 return {};
 3644 ElementCount NewEC = SourceEC.divideCoefficientBy(TargetScale);
 3645 Type *RetTy = VectorType::get(NewEltTy, NewEC);
 3646 assert(SourceVecTy->canLosslesslyBitCastTo(RetTy) &&
 3647 "Lossless bitcast between types expected");
// Replicate the original scalar across the wider element, then emit the
// widened vmv.v.x with a proportionally reduced VL.
 3648 APInt NewScalar = APInt::getSplat(NewEltBW, *Scalar);
 3649 return IC.replaceInstUsesWith(
 3650 II,
 3653 RetTy, Intrinsic::riscv_vmv_v_x,
 3654 {PoisonValue::get(RetTy), ConstantInt::get(NewEltTy, NewScalar),
 3655 ConstantInt::get(VLTy, VL / TargetScale)}),
 3656 SourceVecTy));
 3657}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static cl::opt< bool > EnableOrLikeSelectOpt("enable-aarch64-or-like-select", cl::init(true), cl::Hidden)
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static bool shouldSplit(Instruction *InsertPoint, DenseSet< Value * > &PrevConditionValues, DenseSet< Value * > &ConditionValues, DominatorTree &DT, DenseSet< Instruction * > &Unhoistables)
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
Cost tables and simple lookup functions.
Hexagon Common GEP
static cl::opt< int > InstrCost("inline-instr-cost", cl::Hidden, cl::init(5), cl::desc("Cost of a single instruction when inlining"))
std::pair< Instruction::BinaryOps, Value * > OffsetOp
Find all possible pairs (BinOp, RHS) that BinOp V, RHS can be simplified.
This file provides the interface for the instcombine pass implementation.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static const Function * getCalledFunction(const Value *V)
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
static InstructionCost costShuffleViaVRegSplitting(const RISCVTTIImpl &TTI, MVT LegalVT, std::optional< unsigned > VLen, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind)
Try to perform better estimation of the permutation.
static InstructionCost costShuffleViaSplitting(const RISCVTTIImpl &TTI, MVT LegalVT, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind)
Attempt to approximate the cost of a shuffle which will require splitting during legalization.
static bool isRepeatedConcatMask(ArrayRef< int > Mask, int &SubVectorSize)
static unsigned isM1OrSmaller(MVT VT)
static cl::opt< bool > EnableOrLikeSelectOpt("enable-riscv-or-like-select", cl::init(true), cl::Hidden)
static cl::opt< unsigned > SLPMaxVF("riscv-v-slp-max-vf", cl::desc("Overrides result used for getMaximumVF query which is used " "exclusively by SLP vectorizer."), cl::Hidden)
static cl::opt< unsigned > RVVRegisterWidthLMUL("riscv-v-register-bit-width-lmul", cl::desc("The LMUL to use for getRegisterBitWidth queries. Affects LMUL used " "by autovectorized code. Fractional LMULs are not supported."), cl::init(2), cl::Hidden)
static cl::opt< unsigned > RVVMinTripCount("riscv-v-min-trip-count", cl::desc("Set the lower bound of a trip count to decide on " "vectorization while tail-folding."), cl::init(5), cl::Hidden)
static InstructionCost getIntImmCostImpl(const DataLayout &DL, const RISCVSubtarget *ST, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, bool FreeZeroes)
static VectorType * getVRGatherIndexType(MVT DataVT, const RISCVSubtarget &ST, LLVMContext &C)
static const CostTblEntry VectorIntrinsicCostTable[]
static bool canUseShiftPair(Instruction *Inst, const APInt &Imm)
static bool canUseShiftCmp(Instruction *Inst, const APInt &Imm)
This file defines a TargetTransformInfoImplBase conforming object specific to the RISC-V target machi...
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
This file contains some templates that are useful if you are working with the STL at all.
#define LLVM_DEBUG(...)
Definition Debug.h:114
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
Class for arbitrary precision integers.
Definition APInt.h:78
static LLVM_ABI APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition APInt.cpp:651
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:201
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
Functions, function parameters, and return types can have attributes to indicate how they should be t...
Definition Attributes.h:105
LLVM_ABI bool isStringAttribute() const
Return true if the attribute is a string (target-dependent) attribute.
LLVM_ABI StringRef getKindAsString() const
Return the attribute's kind as a string.
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType, TTI::TargetCostKind CostKind) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
std::optional< unsigned > getMaxVScale() const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
bool isLegalAddImmediate(int64_t imm) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
std::optional< unsigned > getVScaleForTuning() const override
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *, const SCEV *, TTI::TargetCostKind) const override
unsigned getRegUsageForType(Type *Ty) const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:679
@ FCMP_TRUE
1 1 1 1 Always true (always folded)
Definition InstrTypes.h:693
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:682
@ FCMP_ULE
1 1 0 1 True if unordered, less than, or equal
Definition InstrTypes.h:691
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:680
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition InstrTypes.h:681
@ FCMP_ULT
1 1 0 0 True if unordered or less than
Definition InstrTypes.h:690
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:684
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition InstrTypes.h:687
@ FCMP_UGT
1 0 1 0 True if unordered or greater than
Definition InstrTypes.h:688
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:683
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:685
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition InstrTypes.h:692
@ FCMP_UGE
1 0 1 1 True if unordered, greater than, or equal
Definition InstrTypes.h:689
@ FCMP_FALSE
0 0 0 0 Always false (always folded)
Definition InstrTypes.h:678
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
static bool isFPPredicate(Predicate P)
Definition InstrTypes.h:770
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:776
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
This class represents a range of values.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
bool noNaNs() const
Definition FMF.h:68
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static FixedVectorType * getDoubleElementsVectorType(FixedVectorType *VTy)
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:802
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2176
The core instruction combiner logic.
const DataLayout & getDataLayout() const
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
BuilderTy & Builder
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:318
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
MVT changeTypeToInteger()
Return the type converted to an equivalently sized integer or vector with integer element type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool bitsGT(MVT VT) const
Return true if this has more bits than VT.
bool isFixedLengthVector() const
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
MVT getVectorElementType() const
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
Information for memory intrinsic cost model.
const Instruction * getInst() const
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition Operator.h:43
The optimization diagnostic interface.
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
bool shouldCopyAttributeWhenOutliningFrom(const Function *Caller, const Attribute &Attr) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
bool isLegalMaskedExpandLoad(Type *DataType, Align Alignment) const override
InstructionCost getStridedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
bool isLegalMaskedLoadStore(Type *DataType, Align Alignment) const
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
unsigned getMinTripCountTailFoldingThreshold() const override
TTI::AddressingModeKind getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const override
InstructionCost getAddressComputationCost(Type *PTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const override
InstructionCost getStoreImmCost(Type *VecTy, TTI::OperandValueInfo OpInfo, TTI::TargetCostKind CostKind) const
Return the cost of materializing an immediate for a value operand of a store instruction.
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const override
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const override
bool hasActiveVectorLength() const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
Try to calculate op costs for min/max reduction operations.
bool canSplatOperand(Instruction *I, int Operand) const
Return true if the (vector) instruction I will be lowered to an instruction with a scalar splat opera...
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const override
bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const override
unsigned getRegUsageForType(Type *Ty) const override
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
bool isLegalMaskedScatter(Type *DataType, Align Alignment) const override
bool isLegalMaskedCompressStore(Type *DataTy, Align Alignment) const override
InstructionCost getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, TTI::PartialReductionExtendKind OpAExtend, TTI::PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind, std::optional< FastMathFlags > FMF) const override
bool shouldTreatInstructionLikeSelect(const Instruction *I) const override
InstructionCost getExpandCompressMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
bool preferAlternateOpcodeVectorization() const override
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded in...
std::optional< unsigned > getMaxVScale() const override
bool shouldExpandReduction(const IntrinsicInst *II) const override
std::optional< unsigned > getVScaleForTuning() const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
Get memory intrinsic cost based on arguments.
bool isLegalMaskedGather(Type *DataType, Align Alignment) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind) const override
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
Estimate the overhead of scalarizing an instruction.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpdInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
Get intrinsic cost based on arguments.
InstructionCost getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override
bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const override
See if I should be considered for address type promotion.
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
TargetTransformInfo::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override
static MVT getM1VT(MVT VT)
Given a vector (either fixed or scalable), return the scalable vector corresponding to a vector regis...
InstructionCost getVRGatherVVCost(MVT VT) const
Return the cost of a vrgather.vv instruction for the type VT.
InstructionCost getVRGatherVICost(MVT VT) const
Return the cost of a vrgather.vi (or vx) instruction for the type VT.
static unsigned computeVLMAX(unsigned VectorBits, unsigned EltSize, unsigned MinSize)
InstructionCost getLMULCost(MVT VT) const
Return the cost of LMUL for linear operations.
InstructionCost getVSlideVICost(MVT VT) const
Return the cost of a vslidedown.vi or vslideup.vi instruction for the type VT.
InstructionCost getVSlideVXCost(MVT VT) const
Return the cost of a vslidedown.vx or vslideup.vx instruction for the type VT.
static RISCVVType::VLMUL getLMUL(MVT VT)
This class represents an analyzed expression in the program.
static LLVM_ABI ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition Type.cpp:824
The main scalar evolution driver.
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:291
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
virtual const DataLayout & getDataLayout() const
virtual bool shouldTreatInstructionLikeSelect(const Instruction *I) const
virtual TTI::AddressingModeKind getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const
virtual bool shouldCopyAttributeWhenOutliningFrom(const Function *Caller, const Attribute &Attr) const
virtual bool isLoweredToCall(const Function *F) const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind) const override
VectorInstrContext
Represents a hint about the context in which an insert/extract is used.
@ None
The insert/extract is not used with a load/store.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Expensive
The cost of a 'div' instruction on x86.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
AddressingModeKind
Which addressing mode Loop Strength Reduction will try to generate.
@ AMK_PostIndexed
Prefer post-indexed addressing mode.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:297
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:61
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition Type.h:145
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:300
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
Definition Type.cpp:284
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:403
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:440
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:259
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:967
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
constexpr bool isKnownMultipleOf(ScalarTy RHS) const
This function tells the caller whether the element count is known at compile time to be a multiple of...
Definition TypeSize.h:180
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
static constexpr bool isKnownLE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:230
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition TypeSize.h:252
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:880
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:844
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:704
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:765
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:850
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:978
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:926
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:959
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:856
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
class_match< PoisonValue > m_Poison()
Match an arbitrary poison constant.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
bool match(Val *V, const Pattern &P)
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
int getIntMatCost(const APInt &Val, unsigned Size, const MCSubtargetInfo &STI, bool CompressionCost, bool FreeZeroes)
static constexpr unsigned RVVBitsPerBlock
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition MathExtras.h:344
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition CostTable.h:35
LLVM_ABI bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
InstructionCost Cost
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:202
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition MathExtras.h:273
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
constexpr int PoisonMaskElem
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
TargetTransformInfo TTI
LLVM_ABI bool isMaskedSlidePair(ArrayRef< int > Mask, int NumElts, std::array< std::pair< int, int >, 2 > &SrcInfo)
Does this shuffle mask represent either one slide shuffle or a pair of two slide shuffles,...
LLVM_ABI llvm::SmallVector< int, 16 > createInterleaveMask(unsigned VF, unsigned NumVecs)
Create an interleave shuffle mask.
DWARFExpression::Operation Op
CostTblEntryT< unsigned > CostTblEntry
Definition CostTable.h:30
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1885
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
constexpr int64_t SignExtend64(uint64_t x)
Sign-extend the number in the bottom B bits of X to a 64-bit integer.
Definition MathExtras.h:572
LLVM_ABI void processShuffleMasks(ArrayRef< int > Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs, unsigned NumOfUsedRegs, function_ref< void()> NoInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned)> SingleInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned, bool)> ManyInputsAction)
Splits and processes shuffle mask depending on the number of input and output registers.
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2146
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:330
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
Extended Value Type.
Definition ValueTypes.h:35
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
Align valueOrOne() const
For convenience, returns a valid alignment or 1 if undefined.
Definition Alignment.h:130
Information about a load/store intrinsic defined by the target.
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is.
Describe known properties for a set of pointers.
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).