doxygen/NVPTXISelLowering_8cpp_source.html

//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//

//

// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

// See https://llvm.org/LICENSE.txt for license information.

// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

//

//===----------------------------------------------------------------------===//

//

// This file defines the interfaces that NVPTX uses to lower LLVM code into a

// selection DAG.

//

//===----------------------------------------------------------------------===//


#include "NVPTXISelLowering.h"

#include "MCTargetDesc/NVPTXBaseInfo.h"

#include "NVPTX.h"

#include "NVPTXISelDAGToDAG.h"

#include "NVPTXSelectionDAGInfo.h"

#include "NVPTXSubtarget.h"

#include "NVPTXTargetMachine.h"

#include "NVPTXTargetObjectFile.h"

#include "NVPTXUtilities.h"

#include "NVVMProperties.h"

#include "llvm/ADT/APFloat.h"

#include "llvm/ADT/APInt.h"

#include "llvm/ADT/STLExtras.h"

#include "llvm/ADT/SmallVector.h"

#include "llvm/ADT/StringRef.h"

#include "llvm/CodeGen/Analysis.h"

#include "llvm/CodeGen/ISDOpcodes.h"

#include "llvm/CodeGen/MachineFunction.h"

#include "llvm/CodeGen/MachineJumpTableInfo.h"

#include "llvm/CodeGen/MachineMemOperand.h"

#include "llvm/CodeGen/SDPatternMatch.h"

#include "llvm/CodeGen/SelectionDAG.h"

#include "llvm/CodeGen/SelectionDAGNodes.h"

#include "llvm/CodeGen/TargetCallingConv.h"

#include "llvm/CodeGen/TargetLowering.h"

#include "llvm/CodeGen/ValueTypes.h"

#include "llvm/CodeGenTypes/MachineValueType.h"

#include "llvm/IR/Argument.h"

#include "llvm/IR/Attributes.h"

#include "llvm/IR/Constants.h"

#include "llvm/IR/DataLayout.h"

#include "llvm/IR/DerivedTypes.h"

#include "llvm/IR/DiagnosticInfo.h"

#include "llvm/IR/FPEnv.h"

#include "llvm/IR/Function.h"

#include "llvm/IR/GlobalValue.h"

#include "llvm/IR/IRBuilder.h"

#include "llvm/IR/Instruction.h"

#include "llvm/IR/Instructions.h"

#include "llvm/IR/IntrinsicsNVPTX.h"

#include "llvm/IR/Module.h"

#include "llvm/IR/Type.h"

#include "llvm/IR/Value.h"

#include "llvm/Support/Alignment.h"

#include "llvm/Support/AtomicOrdering.h"

#include "llvm/Support/Casting.h"

#include "llvm/Support/CodeGen.h"

#include "llvm/Support/CommandLine.h"

#include "llvm/Support/ErrorHandling.h"

#include "llvm/Support/KnownBits.h"

#include "llvm/Support/NVPTXAddrSpace.h"

#include "llvm/Support/raw_ostream.h"

#include "llvm/Target/TargetMachine.h"

#include "llvm/Target/TargetOptions.h"

#include <algorithm>

#include <cassert>

#include <cmath>

#include <cstdint>

#include <iterator>

#include <optional>

#include <string>

#include <tuple>

#include <utility>

#include <vector>


#define DEBUG_TYPE "nvptx-lower"


using namespace llvm;


/// Command-line switch selecting the scheduling preference. When set, the
/// NVPTXTargetLowering constructor requests Sched::RegPressure; otherwise it
/// requests Sched::Source.
static cl::opt<bool>
    sched4reg("nvptx-sched4reg",
              cl::desc("NVPTX Specific: schedule for register pressure"),
              cl::init(false));


/// Hidden command-line option holding the requested FMA contraction level.
/// The accepted levels are the ones listed in the help text; the default is 2.
static cl::opt<unsigned> FMAContractLevelOpt(
    "nvptx-fma-level", cl::Hidden,
    cl::desc("NVPTX Specific: FMA contraction (0: don't do it,"
             " 1: do it, 2: do it aggressively)"),
    cl::init(2));


/// Hidden command-line override for the precision used when lowering f32 fdiv.
///
/// The accepted values "0".."3" map onto NVPTX::DivPrecisionLevel as listed
/// below. getDivF32Level() returns this option's value only when the flag was
/// explicitly given on the command line; otherwise it decides from the node's
/// flags.
static cl::opt<NVPTX::DivPrecisionLevel> UsePrecDivF32(

    "nvptx-prec-divf32", cl::Hidden,

    cl::desc(

        "NVPTX Specific: Override the precision of the lowering for f32 fdiv"),

    cl::values(

        clEnumValN(NVPTX::DivPrecisionLevel::Approx, "0", "Use div.approx"),

        clEnumValN(NVPTX::DivPrecisionLevel::Full, "1", "Use div.full"),

        clEnumValN(NVPTX::DivPrecisionLevel::IEEE754, "2",

                   "Use IEEE Compliant F32 div.rnd if available (default)"),

        clEnumValN(NVPTX::DivPrecisionLevel::IEEE754_NoFTZ, "3",

                   "Use IEEE Compliant F32 div.rnd if available, no FTZ")),

    cl::init(NVPTX::DivPrecisionLevel::IEEE754));


/// Hidden command-line override choosing between sqrt.approx (false) and
/// sqrt.rn (true) for f32 sqrt.
///
/// usePrecSqrtF32() returns this option's value only when the flag was
/// explicitly given on the command line; otherwise it decides from the node's
/// flags.
static cl::opt<bool> UsePrecSqrtF32(

    "nvptx-prec-sqrtf32", cl::Hidden,

    cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),

    cl::init(true));


// PTX atom.add.f32 has a fixed FTZ behavior that may not match the denormal

// mode of the enclosing function (see shouldExpandAtomicRMWInIR), so by

// default we fall back to a CAS loop when the two disagree. This flag is an

// escape hatch to use atom.add anyway, trading correct denormal handling for

// the speed of the native instruction. It is off by default.

static cl::opt<bool> AllowFTZAtomics(

    "nvptx-allow-ftz-atomics", cl::Hidden,

    cl::desc("NVPTX Specific: Lower atomicrmw fadd to atom.add even when its "

             "FTZ behavior does not match the function's denormal mode."),

    cl::init(false));


/// Opt-in switch for lowering log2 with lg2.approx.

///

/// CUDA's implementation (see libdevice) uses ex2.approx for exp2(), but it

/// does NOT use lg2.approx for log2, so this is disabled by default.

static cl::opt<bool> UseApproxLog2F32(

    "nvptx-approx-log2f32",

    cl::desc("NVPTX Specific: whether to use lg2.approx for log2"),

    cl::init(false));


/// Picks the precision level to use when lowering the f32 division \p N.
///
/// \p MF is accepted for interface reasons but is not consulted here.
NVPTX::DivPrecisionLevel
NVPTXTargetLowering::getDivF32Level(const MachineFunction &MF,
                                    const SDNode &N) const {
  // An explicit -nvptx-prec-divf32=N on the command line always wins.
  const bool HasCmdLineOverride = UsePrecDivF32.getNumOccurrences() > 0;
  if (HasCmdLineOverride)
    return UsePrecDivF32;

  // Otherwise the node's flags decide: nodes that allow approximate functions
  // get the approximate division, everything else the IEEE-compliant one.
  return N.getFlags().hasApproximateFuncs()
             ? NVPTX::DivPrecisionLevel::Approx
             : NVPTX::DivPrecisionLevel::IEEE754;
}


bool NVPTXTargetLowering::usePrecSqrtF32(const SDNode *N) const {

  // If nvptx-prec-sqrtf32 is used on the command-line, always honor it

  if (UsePrecSqrtF32.getNumOccurrences() > 0)

    return UsePrecSqrtF32;


  if (N) {

    const SDNodeFlags Flags = N->getFlags();

    if (Flags.hasApproximateFuncs())

      return false;

  }


  return true;

}


bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const {

  return MF.getDenormalMode(APFloat::IEEEsingle()).Output ==

         DenormalMode::PreserveSign;

}


/// Returns true if \p VT is one of the fixed-length vector types enumerated
/// below, and false for every other type. The cases are grouped by element
/// type.
static bool IsPTXVectorType(MVT VT) {
  switch (VT.SimpleTy) {
  // i1 elements.
  case MVT::v2i1:
  case MVT::v4i1:
  // i8 elements.
  case MVT::v2i8:
  case MVT::v4i8:
  case MVT::v8i8:  // <2 x i8x4>
  case MVT::v16i8: // <4 x i8x4>
  case MVT::v32i8: // <8 x i8x4>
  // i16 elements.
  case MVT::v2i16:
  case MVT::v4i16:
  case MVT::v8i16:  // <4 x i16x2>
  case MVT::v16i16: // <8 x i16x2>
  // i32 elements.
  case MVT::v2i32:
  case MVT::v4i32:
  case MVT::v8i32:
  // i64 elements.
  case MVT::v2i64:
  case MVT::v4i64:
  // f16 elements.
  case MVT::v2f16:
  case MVT::v4f16:
  case MVT::v8f16:  // <4 x f16x2>
  case MVT::v16f16: // <8 x f16x2>
  // bf16 elements.
  case MVT::v2bf16:
  case MVT::v4bf16:
  case MVT::v8bf16:  // <4 x bf16x2>
  case MVT::v16bf16: // <8 x bf16x2>
  // f32 elements.
  case MVT::v2f32:
  case MVT::v4f32:
  case MVT::v8f32:
  // f64 elements.
  case MVT::v2f64:
  case MVT::v4f64:
    return true;

  default:
    return false;
  }
}


// Called when legalizing vector loads/stores to decide whether, and in what
// shape, a memory access of type VectorEVT should be custom lowered.
//
// Returns std::nullopt when the type should not be custom lowered. Otherwise
// returns a pair describing the vector to use for the access:
//    - first:  the number of elements in the final vector
//    - second: the type of the elements in the final vector
static std::optional<std::pair<unsigned int, MVT>>
getVectorLoweringShape(EVT VectorEVT, const NVPTXSubtarget &STI,
                       unsigned AddressSpace) {
  const bool Has256BitAccess = STI.has256BitVectorLoadStore(AddressSpace);

  // A 256-bit scalar integer is accessed as four i64 elements, provided
  // 256-bit loads/stores are available for this address space.
  if (Has256BitAccess && VectorEVT.isScalarInteger() &&
      VectorEVT.getSizeInBits() == 256)
    return {{4, MVT::i64}};

  // Everything else handled here must be a simple type.
  if (!VectorEVT.isSimple())
    return std::nullopt;

  const MVT SimpleVT = VectorEVT.getSimpleVT();
  if (!SimpleVT.isVector()) {
    // The only remaining scalars we handle are the 128-bit ones, which are
    // accessed as two i64 elements.
    const bool Is128BitScalar = SimpleVT == MVT::i128 || SimpleVT == MVT::f128;
    if (!Is128BitScalar)
      return std::nullopt;
    return {{2, MVT::i64}};
  }

  const MVT ScalarVT = SimpleVT.getVectorElementType();
  const unsigned EltCount = SimpleVT.getVectorNumElements();

  // Width in bits of the PTX virtual register that holds one packed group of
  // elements. Only set on the paths that leave the switch via 'break'.
  unsigned PackedRegBits;

  switch (SimpleVT.SimpleTy) {
  // 256-bit vectors of 8/16-bit elements: only usable when 256-bit
  // loads/stores are available, then packed like their smaller siblings.
  case MVT::v16f16:  // <8 x f16x2>
  case MVT::v16bf16: // <8 x bf16x2>
  case MVT::v16i16:  // <8 x i16x2>
  case MVT::v32i8:   // <8 x i8x4>
    if (!Has256BitAccess)
      return std::nullopt;
    [[fallthrough]];
  // Vectors of 8/16-bit elements that pack into 32-bit registers.
  case MVT::v2i16:  // <1 x i16x2>
  case MVT::v2f16:  // <1 x f16x2>
  case MVT::v2bf16: // <1 x bf16x2>
  case MVT::v4i8:   // <1 x i8x4>
  case MVT::v4i16:  // <2 x i16x2>
  case MVT::v4f16:  // <2 x f16x2>
  case MVT::v4bf16: // <2 x bf16x2>
  case MVT::v8i8:   // <2 x i8x4>
  case MVT::v8f16:  // <4 x f16x2>
  case MVT::v8bf16: // <4 x bf16x2>
  case MVT::v8i16:  // <4 x i16x2>
  case MVT::v16i8:  // <4 x i8x4>
    PackedRegBits = 32;
    break;

  // 256-bit vectors of 32-bit elements: only usable when 256-bit
  // loads/stores are available.
  case MVT::v8f32: // <4 x f32x2>
  case MVT::v8i32: // <4 x i32x2>
    if (!Has256BitAccess)
      return std::nullopt;
    [[fallthrough]];
  // Vectors of 32-bit elements: kept element-wise unless the subtarget has
  // f32x2 instructions, in which case pairs are packed into 64-bit registers.
  case MVT::v2f32: // <1 x f32x2>
  case MVT::v4f32: // <2 x f32x2>
  case MVT::v2i32: // <1 x i32x2>
  case MVT::v4i32: // <2 x i32x2>
    if (!STI.hasF32x2Instructions())
      return std::pair(EltCount, ScalarVT);
    PackedRegBits = 64;
    break;

  // 256-bit vectors of 64-bit elements: only usable when 256-bit
  // loads/stores are available.
  case MVT::v4i64:
  case MVT::v4f64:
    if (!Has256BitAccess)
      return std::nullopt;
    [[fallthrough]];
  // Vector types that are used as-is, without any repacking.
  case MVT::v2i8:
  case MVT::v2i64:
  case MVT::v2f64:
    return std::pair(EltCount, ScalarVT);

  // Any other vector type is not custom lowered.
  default:
    return std::nullopt;
  }

  // Reaching this point means two or more elements fit into one 32-bit or
  // 64-bit PTX register, so the vector is re-expressed as a (shorter) vector
  // of packed groups.
  const unsigned EltsPerReg = PackedRegBits / ScalarVT.getSizeInBits();

  return std::pair(EltCount / EltsPerReg,
                   MVT::getVectorVT(ScalarVT, EltsPerReg));
}


/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive

/// legal-ish MVTs that compose it. Unlike ComputeValueVTs, this will legalize

/// the types as required by the calling convention (with special handling for

/// i8s).

/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the

/// same number of types as the Ins/Outs arrays in LowerFormalArguments,

/// LowerCall, and LowerReturn.


static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,

                               LLVMContext &Ctx, CallingConv::ID CallConv,

                               Type *Ty, SmallVectorImpl<EVT> &ValueVTs,

                               SmallVectorImpl<uint64_t> &Offsets,

                               uint64_t StartingOffset = 0) {

  SmallVector<EVT, 16> TempVTs;

  SmallVector<uint64_t, 16> TempOffsets;

  ComputeValueVTs(TLI, DL, Ty, TempVTs, /*MemVTs=*/nullptr, &TempOffsets,

                  StartingOffset);


  for (const auto [VT, Off] : zip(TempVTs, TempOffsets)) {

    MVT RegisterVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT);

    unsigned NumRegs = TLI.getNumRegistersForCallingConv(Ctx, CallConv, VT);


    // Since we actually can load/store b8, we need to ensure that we'll use

    // the original sized type for any i8s or i8 vectors.

    if (VT.getScalarType() == MVT::i8) {

      if (RegisterVT == MVT::i16)

        RegisterVT = MVT::i8;

      else if (RegisterVT == MVT::v2i16)

        RegisterVT = MVT::v2i8;

      else

        assert(RegisterVT == MVT::v4i8 &&

               "Expected v4i8, v2i16, or i16 for i8 RegisterVT");

    }


    // TODO: This is horribly incorrect for cases where the vector elements are

    // not a multiple of bytes (ex i1) and legal or i8. However, this problem

    // has existed for as long as NVPTX has and no one has complained, so we'll

    // leave it for now.

    for (unsigned I : seq(NumRegs)) {

      ValueVTs.push_back(RegisterVT);

      Offsets.push_back(Off + I * RegisterVT.getStoreSize());

    }

  }

}


// Returns an EVT large enough to hold N values of type VT.
//  - N == 1:       VT itself.
//  - VT is scalar: a vector of N elements of type VT.
//  - VT is vector: a flat vector with VT's element type and N times as many
//                  elements as VT.
static EVT getVectorizedVT(EVT VT, unsigned N, LLVMContext &C) {
  if (N == 1)
    return VT;

  if (!VT.isVector())
    return EVT::getVectorVT(C, VT, N);

  const unsigned FlatNumElts = VT.getVectorNumElements() * N;
  return EVT::getVectorVT(C, VT.getScalarType(), FlatNumElts);
}


// Extracts the I-th value of type VT out of V.
//  - If V already has type VT it is returned unchanged (I must be 0).
//  - If VT is a scalar, element I of V is extracted.
//  - If VT is a vector, the subvector of V starting at element
//    I * (number of elements in VT) is extracted.
static SDValue getExtractVectorizedValue(SDValue V, unsigned I, EVT VT,
                                         const SDLoc &dl, SelectionDAG &DAG) {
  if (V.getValueType() == VT) {
    assert(I == 0 && "Index must be 0 for scalar value");
    return V;
  }

  const bool WantsSubvector = VT.isVector();
  const unsigned Opcode =
      WantsSubvector ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT;
  const unsigned FirstElt = WantsSubvector ? I * VT.getVectorNumElements() : I;

  return DAG.getNode(Opcode, dl, VT, V, DAG.getVectorIdxConstant(FirstElt, dl));
}


// Builds a single value out of the N values produced by GetElement(0) ..
// GetElement(N - 1).
//  - N == 1: GetElement(0) is returned directly.
//  - Otherwise every produced value is flattened into its scalar elements
//    (vector values contribute each of their elements, scalars contribute
//    themselves) and a build-vector of all collected scalars is returned.
template <typename T>
static inline SDValue getBuildVectorizedValue(unsigned N, const SDLoc &dl,
                                              SelectionDAG &DAG, T GetElement) {
  if (N == 1)
    return GetElement(0);

  SmallVector<SDValue, 8> Scalars;
  for (const unsigned Idx : llvm::seq(N)) {
    SDValue Piece = GetElement(Idx);
    if (!Piece.getValueType().isVector()) {
      Scalars.push_back(Piece);
      continue;
    }
    // Appends every element of the vector piece to Scalars.
    DAG.ExtractVectorElements(Piece, Scalars);
  }

  // The element type of the result is taken from the first collected scalar.
  const EVT ScalarVT = Scalars[0].getValueType();
  const EVT ResultVT =
      EVT::getVectorVT(*DAG.getContext(), ScalarVT, Scalars.size());
  return DAG.getBuildVector(ResultVT, dl, Scalars);
}


/// promoteScalarIntegerPTX
/// Used to make sure arguments/returns are suitable for passing, promoting
/// them to a larger size if they are not.
///
/// For a scalar integer \p VT the bit width is rounded up to the next power of
/// two and mapped as follows: 1 -> i1; 2, 4, 8 -> i8; 16 -> i16; 32 -> i32;
/// 64 -> i64. Any other rounded width is unreachable. Every type that is not a
/// scalar integer is returned unchanged.
static EVT promoteScalarIntegerPTX(const EVT VT) {
  // Only scalar integers are ever promoted.
  if (!VT.isScalarInteger())
    return VT;

  const uint64_t RoundedBits = PowerOf2Ceil(VT.getFixedSizeInBits());
  switch (RoundedBits) {
  case 1:
    return MVT::i1;
  case 2:
  case 4:
  case 8:
    return MVT::i8;
  case 16:
    return MVT::i16;
  case 32:
    return MVT::i32;
  case 64:
    return MVT::i64;
  default:
    llvm_unreachable(
        "Promotion is not suitable for scalars of size larger than 64-bits");
  }
}


// Check whether we can merge loads/stores of some of the pieces of a

// flattened function parameter or return value into a single vector

// load/store.

//

// The flattened parameter is represented as a list of EVTs and

// offsets, and the whole structure is aligned to ParamAlignment. This

// function determines whether we can load/store pieces of the

// parameter starting at index Idx using a single vectorized op of

// size AccessSize. If so, it returns the number of param pieces

// covered by the vector op. Otherwise, it returns 1.

template <typename T>


static unsigned canMergeParamLoadStoresStartingAt(

    unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,

    const SmallVectorImpl<T> &Offsets, Align ParamAlignment) {


  // Can't vectorize if param alignment is not sufficient.

  if (ParamAlignment < AccessSize)

    return 1;

  // Can't vectorize if offset is not aligned.

  if (Offsets[Idx] & (AccessSize - 1))

    return 1;


  EVT EltVT = ValueVTs[Idx];

  unsigned EltSize = EltVT.getStoreSize();


  // Element is too large to vectorize.

  if (EltSize >= AccessSize)

    return 1;


  unsigned NumElts = AccessSize / EltSize;

  // Can't vectorize if AccessBytes if not a multiple of EltSize.

  if (AccessSize != EltSize * NumElts)

    return 1;


  // We don't have enough elements to vectorize.

  if (Idx + NumElts > ValueVTs.size())

    return 1;


  // PTX ISA can only deal with 2- and 4-element vector ops.

  if (NumElts != 4 && NumElts != 2)

    return 1;


  for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {

    // Types do not match.

    if (ValueVTs[j] != EltVT)

      return 1;


    // Elements are not contiguous.

    if (Offsets[j] - Offsets[j - 1] != EltSize)

      return 1;

  }

  // OK. We can vectorize ValueVTs[i..i+NumElts)

  return NumElts;

}


// Computes whether and how the loads/stores of a flattened function parameter
// or return value can be vectorized.
//
// The flattened parameter is described by the parallel lists ValueVTs and
// Offsets, and is aligned to ParamAlignment bytes. The result lists, in order,
// the size of each access group: 1 for a piece accessed as a scalar, 2 or 4
// for that many consecutive pieces accessed with one vector load/store. The
// group sizes always add up to ValueVTs.size().
//
// For variadic arguments (IsVAArg) nothing is merged: every piece gets its own
// group of size 1.
template <typename T>
static SmallVector<unsigned, 16>
VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
                     const SmallVectorImpl<T> &Offsets, Align ParamAlignment,
                     bool IsVAArg = false) {
  if (IsVAArg)
    return SmallVector<unsigned, 16>(ValueVTs.size(), 1);

  SmallVector<unsigned, 16> GroupSizes;

  // Walk the pieces front to back, at each position trying 128/64/32/16-bit
  // accesses from widest to narrowest and taking the first one that merges
  // more than one piece.
  for (unsigned Idx = 0, End = ValueVTs.size(); Idx != End;) {
    unsigned GroupSize = 1;
    for (const unsigned AccessSize : {16, 8, 4, 2}) {
      const unsigned Merged = canMergeParamLoadStoresStartingAt(
          Idx, AccessSize, ValueVTs, Offsets, ParamAlignment);
      assert((Merged == 1 || Merged == 2 || Merged == 4) &&
             "Unexpected vectorization size");
      if (Merged != 1) {
        GroupSize = Merged;
        break;
      }
    }

    GroupSizes.push_back(GroupSize);
    Idx += GroupSize;
  }

  assert(std::accumulate(GroupSizes.begin(), GroupSizes.end(), 0u) ==
         ValueVTs.size());
  return GroupSizes;
}


// NVPTXTargetLowering Constructor.


NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,

                                         const NVPTXSubtarget &STI)

    : TargetLowering(TM, STI), nvTM(&TM), STI(STI), GlobalUniqueCallSite(0) {

  // always lower memset, memcpy, and memmove intrinsics to load/store

  // instructions, rather

  // then generating calls to memset, mempcy or memmove.

  MaxStoresPerMemset = MaxStoresPerMemsetOptSize = (unsigned)0xFFFFFFFF;

  MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = (unsigned) 0xFFFFFFFF;

  MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = (unsigned) 0xFFFFFFFF;


  setBooleanContents(ZeroOrNegativeOneBooleanContent);

  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);


  // Jump is Expensive. Don't create extra control flow for 'and', 'or'

  // condition branches.

  setJumpIsExpensive(true);


  // Wide divides are _very_ slow. Try to reduce the width of the divide if

  // possible.

  addBypassSlowDiv(64, 32);


  // By default, use the Source scheduling

  if (sched4reg)

    setSchedulingPreference(Sched::RegPressure);

  else

    setSchedulingPreference(Sched::Source);


  auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,

                                    LegalizeAction NoF16Action) {

    bool IsOpSupported = STI.allowFP16Math();

    switch (Op) {

    // Several FP16 instructions are available on sm_80 only.

    case ISD::FMINNUM:

    case ISD::FMAXNUM:

    case ISD::FMAXNUM_IEEE:

    case ISD::FMINNUM_IEEE:

    case ISD::FMAXIMUM:

    case ISD::FMINIMUM:

    case ISD::FMAXIMUMNUM:

    case ISD::FMINIMUMNUM:

      IsOpSupported &= STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;

      break;

    case ISD::FEXP2:

      IsOpSupported &= STI.getSmVersion() >= 75 && STI.getPTXVersion() >= 70;

      break;

    }

    setOperationAction(Op, VT, IsOpSupported ? Action : NoF16Action);

  };


  auto setBF16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,

                                    LegalizeAction NoBF16Action) {

    bool IsOpSupported = STI.hasNativeBF16Support(Op);

    setOperationAction(

        Op, VT, IsOpSupported ? Action : NoBF16Action);

  };


  auto setI16x2OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,

                                     LegalizeAction NoI16x2Action) {

    bool IsOpSupported = false;

    // instructions are available on sm_90 only

    switch (Op) {

    case ISD::ADD:

    case ISD::SMAX:

    case ISD::SMIN:

    case ISD::UMIN:

    case ISD::UMAX:

      IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 80;

      break;

    }

    setOperationAction(Op, VT, IsOpSupported ? Action : NoI16x2Action);

  };


  addRegisterClass(MVT::i1, &NVPTX::B1RegClass);

  addRegisterClass(MVT::i16, &NVPTX::B16RegClass);

  addRegisterClass(MVT::v2i16, &NVPTX::B32RegClass);

  addRegisterClass(MVT::v4i8, &NVPTX::B32RegClass);

  addRegisterClass(MVT::i32, &NVPTX::B32RegClass);

  addRegisterClass(MVT::i64, &NVPTX::B64RegClass);

  addRegisterClass(MVT::f32, &NVPTX::B32RegClass);

  addRegisterClass(MVT::f64, &NVPTX::B64RegClass);

  addRegisterClass(MVT::f16, &NVPTX::B16RegClass);

  addRegisterClass(MVT::v2f16, &NVPTX::B32RegClass);

  addRegisterClass(MVT::bf16, &NVPTX::B16RegClass);

  addRegisterClass(MVT::v2bf16, &NVPTX::B32RegClass);


  if (STI.hasF32x2Instructions()) {

    addRegisterClass(MVT::v2f32, &NVPTX::B64RegClass);

    addRegisterClass(MVT::v2i32, &NVPTX::B64RegClass);

  }


  // Conversion to/from FP16/FP16x2 is always legal.

  setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);

  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);

  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Expand);

  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f16, Expand);


  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);

  if (STI.getSmVersion() >= 30 && STI.getPTXVersion() > 31)

    setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Legal);


  setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);

  setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);


  // Conversion to/from BFP16/BFP16x2 is always legal.

  setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Custom);

  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2bf16, Custom);

  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2bf16, Expand);

  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2bf16, Expand);


  setBF16OperationAction(ISD::SETCC, MVT::v2bf16, Legal, Expand);

  setBF16OperationAction(ISD::SETCC, MVT::bf16, Legal, Promote);

  if (getOperationAction(ISD::SETCC, MVT::bf16) == Promote)

    AddPromotedToType(ISD::SETCC, MVT::bf16, MVT::f32);


  // Conversion to/from i16/i16x2 is always legal.

  setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom);

  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);

  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Expand);

  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i16, Expand);


  setOperationAction(ISD::BUILD_VECTOR, MVT::v4i8, Custom);

  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom);

  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom);

  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i8, Custom);


  // No support for these operations with v2f32/v2i32

  setOperationAction(ISD::INSERT_VECTOR_ELT, {MVT::v2f32, MVT::v2i32}, Expand);

  setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2f32, MVT::v2i32}, Expand);


  setOperationAction(ISD::TRUNCATE, MVT::v2i16, Expand);

  setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},

                     MVT::v2i32, Expand);


  // Need custom lowering in case the index is dynamic.

  if (STI.hasF32x2Instructions())

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2f32, MVT::v2i32},

                       Custom);


  // Custom conversions to/from v2i8.

  setOperationAction(ISD::BITCAST, MVT::v2i8, Custom);


  // Only logical ops can be done on v4i8/v2i32 directly, others must be done

  // elementwise.

  setOperationAction(

      {ISD::ABS,         ISD::ADD,        ISD::ADDC,        ISD::ADDE,

       ISD::BITREVERSE,  ISD::CTLZ,       ISD::CTPOP,       ISD::CTTZ,

       ISD::FP_TO_SINT,  ISD::FP_TO_UINT, ISD::FSHL,        ISD::FSHR,

       ISD::MUL,         ISD::MULHS,      ISD::MULHU,       ISD::PARITY,

       ISD::ROTL,        ISD::ROTR,       ISD::SADDO,       ISD::SADDO_CARRY,

       ISD::SADDSAT,     ISD::SDIV,       ISD::SDIVREM,     ISD::SELECT_CC,

       ISD::SETCC,       ISD::SHL,        ISD::SINT_TO_FP,  ISD::SMAX,

       ISD::SMIN,        ISD::SMULO,      ISD::SMUL_LOHI,   ISD::SRA,

       ISD::SREM,        ISD::SRL,        ISD::SSHLSAT,     ISD::SSUBO,

       ISD::SSUBO_CARRY, ISD::SSUBSAT,    ISD::SUB,         ISD::SUBC,

       ISD::SUBE,        ISD::UADDO,      ISD::UADDO_CARRY, ISD::UADDSAT,

       ISD::UDIV,        ISD::UDIVREM,    ISD::UINT_TO_FP,  ISD::UMAX,

       ISD::UMIN,        ISD::UMULO,      ISD::UMUL_LOHI,   ISD::UREM,

       ISD::USHLSAT,     ISD::USUBO,      ISD::USUBO_CARRY, ISD::VSELECT,

       ISD::USUBSAT},

      {MVT::v4i8, MVT::v2i32}, Expand);


  // Operations not directly supported by NVPTX.

  for (MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32,

                 MVT::v2f32, MVT::f64, MVT::i1, MVT::i8, MVT::i16, MVT::v2i16,

                 MVT::v4i8, MVT::i32, MVT::v2i32, MVT::i64}) {

    setOperationAction(ISD::SELECT_CC, VT, Expand);

    setOperationAction(ISD::BR_CC, VT, Expand);

  }


  // We don't want ops like FMINIMUM or UMAX to be lowered to SETCC+VSELECT.

  setOperationAction(ISD::VSELECT, {MVT::v2f32, MVT::v2i32}, Expand);


  // Some SIGN_EXTEND_INREG can be done using cvt instruction.

  // For others we will expand to a SHL/SRA pair.

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG, {MVT::v2i16, MVT::v2i32}, Expand);


  setOperationAction(ISD::SHL_PARTS, MVT::i32  , Custom);

  setOperationAction(ISD::SRA_PARTS, MVT::i32  , Custom);

  setOperationAction(ISD::SRL_PARTS, MVT::i32  , Custom);

  setOperationAction(ISD::SHL_PARTS, MVT::i64  , Custom);

  setOperationAction(ISD::SRA_PARTS, MVT::i64  , Custom);

  setOperationAction(ISD::SRL_PARTS, MVT::i64  , Custom);


  setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);

  setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);


  setOperationAction({ISD::ROTL, ISD::ROTR},

                     {MVT::i8, MVT::i16, MVT::v2i16, MVT::i32, MVT::i64},

                     Expand);


  if (STI.hasHWROT32()) {

    setOperationAction({ISD::FSHL, ISD::FSHR}, MVT::i32, Legal);

    setOperationAction({ISD::ROTL, ISD::ROTR, ISD::FSHL, ISD::FSHR}, MVT::i64,

                       Custom);

  }


  setOperationAction(ISD::BR_JT, MVT::Other, STI.hasBrx() ? Legal : Expand);

  setOperationAction(ISD::BRIND, MVT::Other, Expand);


  // We want to legalize constant related memmove and memcopy

  // intrinsics.

  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);


  // FP extload/truncstore is not legal in PTX. We need to expand all these.

  for (auto FloatVTs :

       {MVT::fp_valuetypes(), MVT::fp_fixedlen_vector_valuetypes()}) {

    for (MVT ValVT : FloatVTs) {

      for (MVT MemVT : FloatVTs) {

        setLoadExtAction(ISD::EXTLOAD, ValVT, MemVT, Expand);

        setTruncStoreAction(ValVT, MemVT, Expand);

      }

    }

  }


  // To improve CodeGen we'll legalize any-extend loads to zext loads. This is

  // how they'll be lowered in ISel anyway, and by doing this a little earlier

  // we allow for more DAG combine opportunities.

  for (auto IntVTs :

       {MVT::integer_valuetypes(), MVT::integer_fixedlen_vector_valuetypes()})

    for (MVT ValVT : IntVTs)

      for (MVT MemVT : IntVTs)

        if (isTypeLegal(ValVT))

          setLoadExtAction(ISD::EXTLOAD, ValVT, MemVT, Custom);


  // PTX does not support load / store predicate registers

  setOperationAction({ISD::LOAD, ISD::STORE}, MVT::i1, Custom);

  for (MVT VT : MVT::integer_valuetypes()) {

    setLoadExtAction({ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}, VT, MVT::i1,

                     Promote);

    setTruncStoreAction(VT, MVT::i1, Expand);

  }


  // Disable generations of extload/truncstore for v2i32/v2i16/v2i8. The generic

  // expansion for these nodes when they are unaligned is incorrect if the

  // type is a vector.

  //

  // TODO: Fix the generic expansion for these nodes found in

  //       TargetLowering::expandUnalignedLoad/Store.

  setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v2i16,

                   MVT::v2i8, Expand);

  setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v2i32,

                   {MVT::v2i8, MVT::v2i16}, Expand);

  setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);

  setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);

  setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);


  // Register custom handling for illegal type loads/stores. We'll try to custom

  // lower almost all illegal types and logic in the lowering will discard cases

  // we can't handle.

  setOperationAction({ISD::LOAD, ISD::STORE}, {MVT::i128, MVT::i256, MVT::f128},

                     Custom);

  for (MVT VT : MVT::fixedlen_vector_valuetypes())

    if (!isTypeLegal(VT) && VT.getStoreSizeInBits() <= 256)

      setOperationAction({ISD::STORE, ISD::LOAD, ISD::MSTORE, ISD::MLOAD}, VT,

                         Custom);


  // Custom legalization for LDU intrinsics.

  // TODO: The logic to lower these is not very robust and we should rewrite it.

  //       Perhaps LDU should not be represented as an intrinsic at all.

  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);

  for (MVT VT : MVT::fixedlen_vector_valuetypes())

    if (IsPTXVectorType(VT))

      setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom);


  setCondCodeAction({ISD::SETNE, ISD::SETEQ, ISD::SETUGE, ISD::SETULE,

                     ISD::SETUGT, ISD::SETULT, ISD::SETGT, ISD::SETLT,

                     ISD::SETGE, ISD::SETLE},

                    MVT::i1, Expand);


  // This is legal in NVPTX

  setOperationAction(ISD::ConstantFP, MVT::f64, Legal);

  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);

  setOperationAction(ISD::ConstantFP, MVT::f16, Legal);

  setOperationAction(ISD::ConstantFP, MVT::bf16, Legal);


  setOperationAction(ISD::DYNAMIC_STACKALLOC, {MVT::i32, MVT::i64}, Custom);

  setOperationAction({ISD::STACKRESTORE, ISD::STACKSAVE}, MVT::Other, Custom);


  // TRAP can be lowered to PTX trap

  setOperationAction(ISD::TRAP, MVT::Other, Legal);

  // DEBUGTRAP can be lowered to PTX brkpt

  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);


  // Support varargs.

  setOperationAction(ISD::VASTART, MVT::Other, Custom);

  setOperationAction(ISD::VAARG, MVT::Other, Custom);

  setOperationAction(ISD::VACOPY, MVT::Other, Expand);

  setOperationAction(ISD::VAEND, MVT::Other, Expand);


  setOperationAction({ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX},

                     {MVT::i16, MVT::i32, MVT::i64}, Legal);

  // PTX abs.s is undefined for INT_MIN, so ISD::ABS (which requires

  // abs(INT_MIN) == INT_MIN) must be expanded. ABS_MIN_POISON matches

  // PTX abs semantics since INT_MIN input is poison/undefined.

  setOperationAction(ISD::ABS, {MVT::i16, MVT::i32, MVT::i64}, Expand);

  setOperationAction(ISD::ABS_MIN_POISON, {MVT::i16, MVT::i32, MVT::i64},

                     Legal);


  setOperationAction({ISD::CTPOP, ISD::CTLZ, ISD::CTLZ_ZERO_POISON}, MVT::i16,

                     Promote);

  setOperationAction({ISD::CTPOP, ISD::CTLZ}, MVT::i32, Legal);

  setOperationAction({ISD::CTPOP, ISD::CTLZ}, MVT::i64, Custom);


  setI16x2OperationAction(ISD::ABS_MIN_POISON, MVT::v2i16, Legal, Custom);

  setI16x2OperationAction(ISD::SMIN, MVT::v2i16, Legal, Custom);

  setI16x2OperationAction(ISD::SMAX, MVT::v2i16, Legal, Custom);

  setI16x2OperationAction(ISD::UMIN, MVT::v2i16, Legal, Custom);

  setI16x2OperationAction(ISD::UMAX, MVT::v2i16, Legal, Custom);

  setI16x2OperationAction(ISD::CTPOP, MVT::v2i16, Legal, Expand);

  setI16x2OperationAction(ISD::CTLZ, MVT::v2i16, Legal, Expand);


  setI16x2OperationAction(ISD::ADD, MVT::v2i16, Legal, Custom);

  setI16x2OperationAction(ISD::SUB, MVT::v2i16, Legal, Custom);

  setI16x2OperationAction(ISD::MUL, MVT::v2i16, Legal, Custom);

  setI16x2OperationAction(ISD::SHL, MVT::v2i16, Legal, Custom);

  setI16x2OperationAction(ISD::SREM, MVT::v2i16, Legal, Custom);

  setI16x2OperationAction(ISD::UREM, MVT::v2i16, Legal, Custom);


  // Other arithmetic and logic ops are unsupported.

  setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SRA, ISD::SRL, ISD::MULHS,

                      ISD::MULHU, ISD::FP_TO_SINT, ISD::FP_TO_UINT,

                      ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::SETCC},

                     {MVT::v2i16, MVT::v2i32}, Expand);


  // v2i32 is not supported for any arithmetic operations

  setOperationAction({ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX,

                      ISD::CTPOP, ISD::CTLZ, ISD::ADD, ISD::SUB, ISD::MUL,

                      ISD::SHL, ISD::SRA, ISD::SRL, ISD::OR, ISD::AND, ISD::XOR,

                      ISD::SREM, ISD::UREM},

                     MVT::v2i32, Expand);


  setOperationAction(ISD::ADDC, MVT::i32, Legal);

  setOperationAction(ISD::ADDE, MVT::i32, Legal);

  setOperationAction(ISD::SUBC, MVT::i32, Legal);

  setOperationAction(ISD::SUBE, MVT::i32, Legal);

  if (STI.getPTXVersion() >= 43) {

    setOperationAction(ISD::ADDC, MVT::i64, Legal);

    setOperationAction(ISD::ADDE, MVT::i64, Legal);

    setOperationAction(ISD::SUBC, MVT::i64, Legal);

    setOperationAction(ISD::SUBE, MVT::i64, Legal);

  }


  setOperationAction(ISD::CTTZ, MVT::i16, Expand);

  setOperationAction(ISD::CTTZ, {MVT::v2i16, MVT::v2i32}, Expand);

  setOperationAction(ISD::CTTZ, MVT::i32, Expand);

  setOperationAction(ISD::CTTZ, MVT::i64, Expand);


  // PTX does not directly support SELP of i1, so promote to i32 first

  setOperationAction(ISD::SELECT, MVT::i1, Custom);


  // PTX cannot multiply two i64s in a single instruction.

  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);

  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);


  // We have some custom DAG combine patterns for these nodes

  setTargetDAGCombine({ISD::ADD,

                       ISD::AND,

                       ISD::EXTRACT_VECTOR_ELT,

                       ISD::FADD,

                       ISD::FMAXNUM,

                       ISD::FMINNUM,

                       ISD::FMAXIMUM,

                       ISD::FMINIMUM,

                       ISD::FMAXIMUMNUM,

                       ISD::FMINIMUMNUM,

                       ISD::MUL,

                       ISD::SELECT,

                       ISD::SHL,

                       ISD::SREM,

                       ISD::UREM,

                       ISD::VSELECT,

                       ISD::BUILD_VECTOR,

                       ISD::ADDRSPACECAST,

                       ISD::LOAD,

                       ISD::STORE,

                       ISD::ZERO_EXTEND,

                       ISD::SIGN_EXTEND,

                       ISD::INTRINSIC_WO_CHAIN});


  // If the vector operands require register coalescing, scalarize instead

  if (STI.hasF32x2Instructions())

    setTargetDAGCombine({ISD::FMA, ISD::FMUL, ISD::FSUB});


  // setcc for f16x2 and bf16x2 needs special handling to prevent

  // legalizer's attempt to scalarize it due to v2i1 not being legal.

  if (STI.allowFP16Math() || STI.hasBF16Math())

    setTargetDAGCombine(ISD::SETCC);


  // Vector reduction operations. These may be turned into shuffle or tree

  // reductions depending on what instructions are available for each type.

  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {

    MVT EltVT = VT.getVectorElementType();

    if (EltVT == MVT::f32 || EltVT == MVT::f64) {

      setOperationAction({ISD::VECREDUCE_FMAX, ISD::VECREDUCE_FMIN,

                          ISD::VECREDUCE_FMAXIMUM, ISD::VECREDUCE_FMINIMUM},

                         VT, Custom);

    }

  }


  // Promote fp16 arithmetic if fp16 hardware isn't available or the

  // user passed --nvptx-no-fp16-math. The flag is useful because,

  // although sm_53+ GPUs have some sort of FP16 support in

  // hardware, only sm_53 and sm_60 have full implementation. Others

  // only have token amount of hardware and are likely to run faster

  // by using fp32 units instead.

  for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {

    setFP16OperationAction(Op, MVT::f16, Legal, Promote);

    setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);

    setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);

    // bf16 must be promoted to f32.

    setBF16OperationAction(Op, MVT::bf16, Legal, Promote);

    if (getOperationAction(Op, MVT::bf16) == Promote)

      AddPromotedToType(Op, MVT::bf16, MVT::f32);

    setOperationAction(Op, MVT::v2f32,

                       STI.hasF32x2Instructions() ? Legal : Expand);

  }


  // On SM80, we select add/mul/sub as fma to avoid promotion to float

  for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB}) {

    for (const auto &VT : {MVT::bf16, MVT::v2bf16}) {

      if (!STI.hasNativeBF16Support(Op) && STI.hasNativeBF16Support(ISD::FMA)) {

        setOperationAction(Op, VT, Custom);

      }

    }

  }


  // f16/f16x2 neg was introduced in PTX 60, SM_53.

  const bool IsFP16FP16x2NegAvailable = STI.getSmVersion() >= 53 &&

                                        STI.getPTXVersion() >= 60 &&

                                        STI.allowFP16Math();

  for (const auto &VT : {MVT::f16, MVT::v2f16})

    setOperationAction(ISD::FNEG, VT,

                       IsFP16FP16x2NegAvailable ? Legal : Expand);


  setBF16OperationAction(ISD::FNEG, MVT::bf16, Legal, Expand);

  setBF16OperationAction(ISD::FNEG, MVT::v2bf16, Legal, Expand);

  setOperationAction(ISD::FNEG, MVT::v2f32, Expand);

  // (would be) Library functions.


  // These map to conversion instructions for scalar FP types.

  for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,

                         ISD::FROUNDEVEN, ISD::FTRUNC}) {

    setOperationAction(Op, MVT::f16, Legal);

    setOperationAction(Op, MVT::f32, Legal);

    setOperationAction(Op, MVT::f64, Legal);

    setOperationAction(Op, MVT::v2f16, Expand);

    setOperationAction(Op, MVT::v2bf16, Expand);

    setOperationAction(Op, MVT::v2f32, Expand);

    setBF16OperationAction(Op, MVT::bf16, Legal, Promote);

    if (getOperationAction(Op, MVT::bf16) == Promote)

      AddPromotedToType(Op, MVT::bf16, MVT::f32);

  }


  if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71) {

    setOperationAction(ISD::BF16_TO_FP, MVT::f32, Expand);

  }

  if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {

    for (MVT VT : {MVT::bf16, MVT::f32, MVT::f64}) {

      setOperationAction(ISD::FP_EXTEND, VT, Custom);

      setOperationAction(ISD::FP_ROUND, VT, Custom);

    }

  }


  // Expand v2f32 = fp_extend

  setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);

  // Expand v2[b]f16 = fp_round v2f32

  setOperationAction(ISD::FP_ROUND, {MVT::v2bf16, MVT::v2f16}, Expand);


  // sm_80 only has conversions between f32 and bf16. Custom lower all other

  // bf16 conversions.

  if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {

    for (MVT VT : {MVT::i1, MVT::i16, MVT::i32, MVT::i64}) {

      setOperationAction(

          {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT},

          VT, Custom);

    }

    setOperationAction(

        {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT},

        MVT::bf16, Custom);

  }


  setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::i1, Custom);

  setOperationAction(ISD::FROUND, MVT::f16, Promote);

  setOperationAction(ISD::FROUND, MVT::v2f16, Expand);

  setOperationAction(ISD::FROUND, MVT::v2bf16, Expand);

  setOperationAction(ISD::FROUND, MVT::f32, Custom);

  setOperationAction(ISD::FROUND, MVT::f64, Custom);

  setOperationAction(ISD::FROUND, MVT::bf16, Promote);

  AddPromotedToType(ISD::FROUND, MVT::bf16, MVT::f32);


  // 'Expand' implements FCOPYSIGN without calling an external library.

  setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);

  setOperationAction(ISD::FCOPYSIGN, MVT::v2f16, Expand);

  setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Expand);

  setOperationAction(ISD::FCOPYSIGN, MVT::v2bf16, Expand);

  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);


  // These map to corresponding instructions for f32/f64. f16 must be

  // promoted to f32. v2f16 is expanded to f16, which is then promoted

  // to f32.

  for (const auto &Op :

       {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS, ISD::FTANH}) {

    setOperationAction(Op, MVT::f16, Promote);

    setOperationAction(Op, MVT::f32, Legal);

    // only div/rem/sqrt are legal for f64

    if (Op == ISD::FDIV || Op == ISD::FREM || Op == ISD::FSQRT) {

      setOperationAction(Op, MVT::f64, Legal);

    }

    setOperationAction(Op, {MVT::v2f16, MVT::v2bf16, MVT::v2f32}, Expand);

    setOperationAction(Op, MVT::bf16, Promote);

    AddPromotedToType(Op, MVT::bf16, MVT::f32);

  }

  setOperationAction(ISD::FREM, {MVT::f32, MVT::f64}, Custom);


  setOperationAction(ISD::FABS, {MVT::f32, MVT::f64}, Legal);

  setOperationAction(ISD::FABS, MVT::v2f32, Expand);

  if (STI.getPTXVersion() >= 65) {

    setFP16OperationAction(ISD::FABS, MVT::f16, Legal, Promote);

    setFP16OperationAction(ISD::FABS, MVT::v2f16, Legal, Expand);

  } else {

    setOperationAction(ISD::FABS, MVT::f16, Promote);

    setOperationAction(ISD::FABS, MVT::v2f16, Expand);

  }

  setBF16OperationAction(ISD::FABS, MVT::v2bf16, Legal, Expand);

  setBF16OperationAction(ISD::FABS, MVT::bf16, Legal, Promote);

  if (getOperationAction(ISD::FABS, MVT::bf16) == Promote)

    AddPromotedToType(ISD::FABS, MVT::bf16, MVT::f32);


  for (const auto &Op :

       {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM}) {

    setOperationAction(Op, MVT::f32, Legal);

    setOperationAction(Op, MVT::f64, Legal);

    setFP16OperationAction(Op, MVT::f16, Legal, Promote);

    setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);

    setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);

    setBF16OperationAction(Op, MVT::bf16, Legal, Promote);

    if (getOperationAction(Op, MVT::bf16) == Promote)

      AddPromotedToType(Op, MVT::bf16, MVT::f32);

    setOperationAction(Op, MVT::v2f32, Expand);

  }

  bool SupportsF32MinMaxNaN =

      STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;

  for (const auto &Op : {ISD::FMINIMUM, ISD::FMAXIMUM}) {

    setOperationAction(Op, MVT::f32, SupportsF32MinMaxNaN ? Legal : Expand);

    setFP16OperationAction(Op, MVT::f16, Legal, Expand);

    setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);

    setBF16OperationAction(Op, MVT::bf16, Legal, Expand);

    setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);

    setOperationAction(Op, MVT::v2f32, Expand);

  }


  // Custom lowering for inline asm with 128-bit operands

  setOperationAction(ISD::CopyToReg, MVT::i128, Custom);

  setOperationAction(ISD::CopyFromReg, MVT::i128, Custom);


  // FEXP2 support:

  // - f32

  // - f16/f16x2 (sm_70+, PTX 7.0+)

  // - bf16/bf16x2 (sm_90+, PTX 7.8+)

  // When f16/bf16 types aren't supported, they are promoted/expanded to f32.

  setOperationAction(ISD::FEXP2, MVT::f32, Legal);

  setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);

  setFP16OperationAction(ISD::FEXP2, MVT::f16, Legal, Promote);

  setFP16OperationAction(ISD::FEXP2, MVT::v2f16, Legal, Expand);

  setBF16OperationAction(ISD::FEXP2, MVT::bf16, Legal, Promote);

  setBF16OperationAction(ISD::FEXP2, MVT::v2bf16, Legal, Expand);


  // FLOG2 supports f32 only

  // f16/bf16 types aren't supported, but they are promoted/expanded to f32.

  if (UseApproxLog2F32) {

    setOperationAction(ISD::FLOG2, MVT::f32, Legal);

    setOperationPromotedToType(ISD::FLOG2, MVT::f16, MVT::f32);

    setOperationPromotedToType(ISD::FLOG2, MVT::bf16, MVT::f32);

    setOperationAction(ISD::FLOG2, {MVT::v2f16, MVT::v2bf16, MVT::v2f32},

                       Expand);

  }


  setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);


  setOperationAction(ISD::ATOMIC_LOAD_SUB, {MVT::i32, MVT::i64}, Expand);


  // atom.b128 is legal in PTX but since we don't represent i128 as a legal

  // type, we need to custom lower it.

  setOperationAction({ISD::ATOMIC_CMP_SWAP, ISD::ATOMIC_SWAP}, MVT::i128,

                     Custom);


  // Now deduce the information based on the above mentioned

  // actions

  computeRegisterProperties(STI.getRegisterInfo());


  // PTX support for 16-bit CAS is emulated. Only use 32+

  setMinCmpXchgSizeInBits(STI.getMinCmpXchgSizeInBits());

  setMaxAtomicSizeInBitsSupported(STI.hasAtomSwap128() ? 128 : 64);

  setMaxDivRemBitWidthSupported(64);


  // Custom lowering for tcgen05.ld vector operands

  setOperationAction(ISD::INTRINSIC_W_CHAIN,

                     {MVT::v2i32, MVT::v4i32, MVT::v8i32, MVT::v16i32,

                      MVT::v32i32, MVT::v64i32, MVT::v128i32, MVT::v2f32,

                      MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32,

                      MVT::v64f32, MVT::v128f32},

                     Custom);


  // Custom lowering for tcgen05.st vector operands

  setOperationAction(ISD::INTRINSIC_VOID,

                     {MVT::v2i32, MVT::v4i32, MVT::v8i32, MVT::v16i32,

                      MVT::v32i32, MVT::v64i32, MVT::v128i32, MVT::Other},

                     Custom);


  // Enable custom lowering for the following:

  //   * MVT::i128 - clusterlaunchcontrol

  //   * MVT::i32 - prmt

  //   * MVT::v4f32 - cvt_rs fp{4/6/8}x4 intrinsics

  //   * MVT::Other - internal.addrspace.wrap

  setOperationAction(ISD::INTRINSIC_WO_CHAIN,

                     {MVT::i32, MVT::i128, MVT::v4f32, MVT::Other}, Custom);


  // Custom lowering for bswap

  setOperationAction(ISD::BSWAP, {MVT::i16, MVT::i32, MVT::i64, MVT::v2i16},

                     Custom);

}


/// Select the type-legalization strategy for the vector type \p VT.
///
/// Non-scalable vectors of i1 whose element count is not 1 are split
/// (TypeSplitVector). Every other type defers to the generic
/// TargetLoweringBase policy.
TargetLoweringBase::LegalizeTypeAction
NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const {
  // Evaluated left to right with short-circuiting, so the element count is
  // only queried for non-scalable vectors.
  const bool IsSplittableI1Vector = !VT.isScalableVector() &&
                                    VT.getVectorNumElements() != 1 &&
                                    VT.getScalarType() == MVT::i1;
  if (IsSplittableI1Vector)
    return TypeSplitVector;

  return TargetLoweringBase::getPreferredVectorAction(VT);
}


/// Build an approximate sqrt(x) or 1/sqrt(x) of \p Operand from the NVVM
/// "approx" intrinsics.
///
/// \param Operand     Value whose (reciprocal) square root is wanted.
/// \param DAG         DAG in which the intrinsic nodes are created.
/// \param Enabled     ReciprocalEstimate setting. An estimate is produced when
///                    it is Enabled, or when it is Unspecified and
///                    usePrecSqrtF32() is false.
/// \param ExtraSteps  In/out number of refinement steps; Unspecified is
///                    rewritten to 0 (no refinement).
/// \param UseOneConst Left untouched by this implementation.
/// \param Reciprocal  True when the caller wants 1/sqrt(x).
/// \returns the estimate node, or an empty SDValue when estimates are disabled
///          or the operand type is neither f32 nor f64.
SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
                                             int Enabled, int &ExtraSteps,
                                             bool &UseOneConst,
                                             bool Reciprocal) const {
  const bool UseEstimate =
      Enabled == ReciprocalEstimate::Enabled ||
      (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32());
  if (!UseEstimate)
    return SDValue();

  if (ExtraSteps == ReciprocalEstimate::Unspecified)
    ExtraSteps = 0;

  SDLoc DL(Operand);
  EVT VT = Operand.getValueType();

  // Only scalar f32 and f64 are handled below. Previously the plain-sqrt path
  // assumed "not f32" meant f64 and would have wrapped any other type (f16,
  // bf16, vectors) in the f64 rcp/rsqrt intrinsics; the rsqrt path already
  // declined such types. Decline them uniformly so the caller falls back to
  // its regular lowering.
  if (VT != MVT::f32 && VT != MVT::f64)
    return SDValue();

  bool Ftz = useF32FTZ(DAG.getMachineFunction());

  auto MakeIntrinsicCall = [&](Intrinsic::ID IID) {
    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                       DAG.getConstant(IID, DL, MVT::i32), Operand);
  };

  // The sqrt and rsqrt refinement processes assume we always start out with an
  // approximation of the rsqrt.  Therefore, if we're going to do any refinement
  // (i.e. ExtraSteps > 0), we must return an rsqrt.  But if we're *not* doing
  // any refinement, we must return a regular sqrt.
  if (Reciprocal || ExtraSteps > 0) {
    if (VT == MVT::f32)
      return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
                                   : Intrinsic::nvvm_rsqrt_approx_f);
    // VT is f64 here.
    return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
  }

  if (VT == MVT::f32)
    return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
                                 : Intrinsic::nvvm_sqrt_approx_f);

  // VT is f64 here.  There's no sqrt.approx.f64 instruction, so we emit
  // reciprocal(rsqrt(x)).  This is faster than
  // select(x == 0, 0, x * rsqrt(x)).  (In fact, it's faster than plain
  // x * rsqrt(x).)
  return DAG.getNode(
      ISD::INTRINSIC_WO_CHAIN, DL, VT,
      DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
      MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
}


// Forward declaration: getPrototype() below calls this helper, whose
// definition appears later in this file.
static Align getArgumentAlignment(const CallBase *CB, Type *Ty, unsigned Idx,

                                  const DataLayout &DL);


std::string NVPTXTargetLowering::getPrototype(

    const DataLayout &DL, Type *RetTy, const ArgListTy &Args,

    const SmallVectorImpl<ISD::OutputArg> &Outs,

    std::optional<unsigned> FirstVAArg, const CallBase &CB,

    unsigned UniqueCallSite) const {

  auto PtrVT = getPointerTy(DL);


  std::string Prototype;

  raw_string_ostream O(Prototype);

  O << "prototype_" << UniqueCallSite << " : .callprototype ";


  if (RetTy->isVoidTy()) {

    O << "()";

  } else {

    O << "(";

    if (shouldPassAsArray(RetTy)) {

      const Align RetAlign = getArgumentAlignment(&CB, RetTy, 0, DL);

      O << ".param .align " << RetAlign.value() << " .b8 _["

        << DL.getTypeAllocSize(RetTy) << "]";

    } else if (RetTy->isFloatingPointTy() || RetTy->isIntegerTy()) {

      unsigned size = 0;

      if (auto *ITy = dyn_cast<IntegerType>(RetTy)) {

        size = ITy->getBitWidth();

      } else {

        assert(RetTy->isFloatingPointTy() &&

               "Floating point type expected here");

        size = RetTy->getPrimitiveSizeInBits();

      }

      // PTX ABI requires all scalar return values to be at least 32

      // bits in size.  fp16 normally uses .b16 as its storage type in

      // PTX, so its size must be adjusted here, too.

      size = promoteScalarArgumentSize(size);


      O << ".param .b" << size << " _";

    } else if (isa<PointerType>(RetTy)) {

      O << ".param .b" << PtrVT.getSizeInBits() << " _";

    } else {

      llvm_unreachable("Unknown return type");

    }

    O << ") ";

  }

  O << "_ (";


  bool first = true;


  const unsigned NumArgs = FirstVAArg.value_or(Args.size());

  auto AllOuts = ArrayRef(Outs);

  for (const unsigned I : llvm::seq(NumArgs)) {

    const auto ArgOuts =

        AllOuts.take_while([I](auto O) { return O.OrigArgIndex == I; });

    AllOuts = AllOuts.drop_front(ArgOuts.size());


    Type *Ty = Args[I].Ty;

    if (!first) {

      O << ", ";

    }

    first = false;


    if (ArgOuts[0].Flags.isByVal()) {

      // Indirect calls need strict ABI alignment so we disable optimizations by

      // not providing a function to optimize.

      Type *ETy = Args[I].IndirectType;

      Align InitialAlign = ArgOuts[0].Flags.getNonZeroByValAlign();

      Align ParamByValAlign =

          getFunctionByValParamAlign(/*F=*/nullptr, ETy, InitialAlign, DL);


      O << ".param .align " << ParamByValAlign.value() << " .b8 _["

        << ArgOuts[0].Flags.getByValSize() << "]";

    } else {

      if (shouldPassAsArray(Ty)) {

        Align ParamAlign =

            getArgumentAlignment(&CB, Ty, I + AttributeList::FirstArgIndex, DL);

        O << ".param .align " << ParamAlign.value() << " .b8 _["

          << DL.getTypeAllocSize(Ty) << "]";

        continue;

      }

      // i8 types in IR will be i16 types in SDAG

      assert((getValueType(DL, Ty) == ArgOuts[0].VT ||

              (getValueType(DL, Ty) == MVT::i8 && ArgOuts[0].VT == MVT::i16)) &&

             "type mismatch between callee prototype and arguments");

      // scalar type

      unsigned sz = 0;

      if (auto *ITy = dyn_cast<IntegerType>(Ty)) {

        sz = promoteScalarArgumentSize(ITy->getBitWidth());

      } else if (isa<PointerType>(Ty)) {

        sz = PtrVT.getSizeInBits();

      } else {

        sz = Ty->getPrimitiveSizeInBits();

      }

      O << ".param .b" << sz << " _";

    }

  }


  if (FirstVAArg)

    O << (first ? "" : ",") << " .param .align "

      << STI.getMaxRequiredAlignment() << " .b8 _[]";

  O << ")";

  if (shouldEmitPTXNoReturn(&CB, *nvTM))

    O << " .noreturn";

  O << ";";


  return Prototype;

}


/// Determine the alignment to use for the value of type \p Ty at attribute
/// index \p Idx of the call \p CB.
///
/// Lookup order:
///   1. No call site at all: the ABI alignment of \p Ty.
///   2. No directly-called function, but \p CB is a CallInst carrying
///      alignment information for \p Idx: that alignment.
///   3. A callee is known (directly, or through getMaybeBitcastedCallee):
///      whatever getFunctionArgumentAlignment reports for it.
///   4. Otherwise (truly indirect call): the ABI alignment of \p Ty.
static Align getArgumentAlignment(const CallBase *CB, Type *Ty, unsigned Idx,
                                  const DataLayout &DL) {
  // Without a call site there is nothing to inspect.
  if (!CB)
    return DL.getABITypeAlign(Ty);

  const Function *Callee = CB->getCalledFunction();

  if (!Callee) {
    // A missing direct callee may just mean the call target is wrapped in
    // constant cast expressions. In that case the call instruction itself
    // may carry the alignment, so consult it first.
    const auto *Call = dyn_cast<CallInst>(CB);
    if (Call) {
      const MaybeAlign CallSiteAlign = getAlign(*Call, Idx);
      if (CallSiteAlign)
        return CallSiteAlign.value();
    }

    // Otherwise try to look through the cast to the underlying function.
    Callee = getMaybeBitcastedCallee(CB);
  }

  // Still no function: the call is indirect, so use the ABI alignment.
  if (!Callee)
    return DL.getABITypeAlign(Ty);

  // The ultimate target is a Function; ask for its argument alignment.
  return getFunctionArgumentAlignment(Callee, Ty, Idx, DL);
}


/// Try to narrow \p Ptr to a more specific address space and return the
/// matching MachinePointerInfo.
///
/// \p Ptr is updated in place:
///   * a FrameIndex is wrapped in an addrspacecast from the generic to the
///     local address space, using the local pointer type from \p TL;
///   * an addrspacecast whose destination is the generic address space is
///     peeled off, leaving its source operand.
/// In every other case \p Ptr is left alone and a default-constructed
/// MachinePointerInfo is returned.
static MachinePointerInfo refinePtrAS(SDValue &Ptr, SelectionDAG &DAG,
                                      const DataLayout &DL,
                                      const TargetLowering &TL) {
  switch (Ptr->getOpcode()) {
  case ISD::FrameIndex: {
    const auto LocalPtrVT = TL.getPointerTy(DL, ADDRESS_SPACE_LOCAL);
    Ptr = DAG.getAddrSpaceCast(SDLoc(), LocalPtrVT, Ptr, ADDRESS_SPACE_GENERIC,
                               ADDRESS_SPACE_LOCAL);
    return MachinePointerInfo(ADDRESS_SPACE_LOCAL);
  }

  case ISD::ADDRSPACECAST: {
    // Peel off a cast to generic so the access can target the specific
    // address space directly.
    const auto *CastNode = cast<AddrSpaceCastSDNode>(Ptr);
    if (CastNode->getDestAddressSpace() != ADDRESS_SPACE_GENERIC)
      break;

    const unsigned SrcAS = CastNode->getSrcAddressSpace();
    Ptr = CastNode->getOperand(0);
    return MachinePointerInfo(SrcAS);
  }

  default:
    break;
  }

  return MachinePointerInfo();
}


/// Map argument extension flags to the matching ISD extension opcode:
/// SIGN_EXTEND when sext is set, otherwise ZERO_EXTEND when zext is set,
/// otherwise ANY_EXTEND. The sext flag takes precedence if both are set.
static ISD::NodeType getExtOpcode(const ISD::ArgFlagsTy &Flags) {
  return Flags.isSExt()   ? ISD::SIGN_EXTEND
         : Flags.isZExt() ? ISD::ZERO_EXTEND
                          : ISD::ANY_EXTEND;
}


/// Adjust \p V so that it has type \p ExpectedVT.
///
/// A value narrower than \p ExpectedVT is extended with the opcode chosen by
/// getExtOpcode(\p Flags); a wider one is truncated; a value that is neither
/// narrower nor wider is returned unchanged. Mismatched types are only
/// permitted when both are integer types (checked by assertion).
///
/// \param V          Value to adjust.
/// \param ExpectedVT Type the result must have.
/// \param Flags      Argument flags selecting sign/zero/any extension.
/// \param DAG        DAG in which any conversion node is created.
/// \param dl         Debug location for the new node.
static SDValue correctParamType(SDValue V, EVT ExpectedVT,
                                ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
                                SDLoc dl) {
  const EVT ActualVT = V.getValueType();
  assert((ActualVT == ExpectedVT ||
          (ExpectedVT.isInteger() && ActualVT.isInteger())) &&
         "Non-integer argument type size mismatch");

  // Too narrow: widen according to the argument's extension flags.
  if (ActualVT.bitsLT(ExpectedVT))
    return DAG.getNode(getExtOpcode(Flags), dl, ExpectedVT, V);

  // Too wide: drop the high bits. Otherwise the value is already usable.
  return ActualVT.bitsGT(ExpectedVT)
             ? DAG.getNode(ISD::TRUNCATE, dl, ExpectedVT, V)
             : V;
}


SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,

                                       SmallVectorImpl<SDValue> &InVals) const {


  if (CLI.IsVarArg && (STI.getPTXVersion() < 60 || STI.getSmVersion() < 30))

    report_fatal_error(

        "Support for variadic functions (unsized array parameter) introduced "

        "in PTX ISA version 6.0 and requires target sm_30.");


  SelectionDAG &DAG = CLI.DAG;

  SDLoc dl = CLI.DL;

  const SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;

  SDValue Callee = CLI.Callee;

  ArgListTy &Args = CLI.getArgs();

  Type *RetTy = CLI.RetTy;

  const CallBase *CB = CLI.CB;

  const DataLayout &DL = DAG.getDataLayout();

  LLVMContext &Ctx = *DAG.getContext();


  const auto GetI32 = [&](const unsigned I) {

    return DAG.getConstant(I, dl, MVT::i32);

  };


  const unsigned UniqueCallSite = GlobalUniqueCallSite++;

  const SDValue CallChain = CLI.Chain;

  const SDValue StartChain =

      DAG.getCALLSEQ_START(CallChain, UniqueCallSite, 0, dl);

  SDValue DeclareGlue = StartChain.getValue(1);


  SmallVector<SDValue, 16> CallPrereqs{StartChain};


  const auto MakeDeclareScalarParam = [&](SDValue Symbol, unsigned Size) {

    // PTX ABI requires integral types to be at least 32 bits in size. FP16 is

    // loaded/stored using i16, so it's handled here as well.

    const unsigned SizeBits = promoteScalarArgumentSize(Size * 8);

    SDValue Declare =

        DAG.getNode(NVPTXISD::DeclareScalarParam, dl, {MVT::Other, MVT::Glue},

                    {StartChain, Symbol, GetI32(SizeBits), DeclareGlue});

    CallPrereqs.push_back(Declare);

    DeclareGlue = Declare.getValue(1);

    return Declare;

  };


  const auto MakeDeclareArrayParam = [&](SDValue Symbol, Align Align,

                                         unsigned Size) {

    SDValue Declare = DAG.getNode(

        NVPTXISD::DeclareArrayParam, dl, {MVT::Other, MVT::Glue},

        {StartChain, Symbol, GetI32(Align.value()), GetI32(Size), DeclareGlue});

    CallPrereqs.push_back(Declare);

    DeclareGlue = Declare.getValue(1);

    return Declare;

  };


  // Variadic arguments.

  //

  // Normally, for each argument, we declare a param scalar or a param

  // byte array in the .param space, and store the argument value to that

  // param scalar or array starting at offset 0.

  //

  // In the case of the first variadic argument, we declare a vararg byte array

  // with size 0. The exact size of this array isn't known at this point, so

  // it'll be patched later. All the variadic arguments will be stored to this

  // array at a certain offset (which gets tracked by 'VAOffset'). The offset is

  // initially set to 0, so it can be used for non-variadic arguments (which use

  // 0 offset) to simplify the code.

  //

  // After all vararg is processed, 'VAOffset' holds the size of the

  // vararg byte array.

  assert((CLI.IsVarArg || CLI.Args.size() == CLI.NumFixedArgs) &&

         "Non-VarArg function with extra arguments");


  const unsigned FirstVAArg = CLI.NumFixedArgs; // position of first variadic

  unsigned VAOffset = 0; // current offset in the param array


  const SDValue VADeclareParam =

      CLI.Args.size() > FirstVAArg

          ? MakeDeclareArrayParam(getCallParamSymbol(DAG, FirstVAArg, MVT::i32),

                                  Align(STI.getMaxRequiredAlignment()), 0)

          : SDValue();


  // Args.size() and Outs.size() need not match.

  // Outs.size() will be larger

  //   * if there is an aggregate argument with multiple fields (each field

  //     showing up separately in Outs)

  //   * if there is a vector argument with more than typical vector-length

  //     elements (generally if more than 4) where each vector element is

  //     individually present in Outs.

  // So a different index should be used for indexing into Outs/OutVals.

  // See similar issue in LowerFormalArguments.

  auto AllOuts = ArrayRef(CLI.Outs);

  auto AllOutVals = ArrayRef(CLI.OutVals);

  assert(AllOuts.size() == AllOutVals.size() &&

         "Outs and OutVals must be the same size");

  // Declare the .params or .reg need to pass values

  // to the function

  for (const auto E : llvm::enumerate(Args)) {

    const auto ArgI = E.index();

    const auto Arg = E.value();

    const auto ArgOuts =

        AllOuts.take_while([&](auto O) { return O.OrigArgIndex == ArgI; });

    const auto ArgOutVals = AllOutVals.take_front(ArgOuts.size());

    AllOuts = AllOuts.drop_front(ArgOuts.size());

    AllOutVals = AllOutVals.drop_front(ArgOuts.size());


    const bool IsVAArg = (ArgI >= FirstVAArg);

    const bool IsByVal = Arg.IsByVal;


    const SDValue ParamSymbol =

        getCallParamSymbol(DAG, IsVAArg ? FirstVAArg : ArgI, MVT::i32);


    assert((!IsByVal || Arg.IndirectType) &&

           "byval arg must have indirect type");

    Type *ETy = (IsByVal ? Arg.IndirectType : Arg.Ty);


    const Align ArgAlign = [&]() {

      if (IsByVal) {

        // The ByValAlign in the Outs[OIdx].Flags is always set at this point,

        // so we don't need to worry whether it's naturally aligned or not.

        // See TargetLowering::LowerCallTo().

        const Align InitialAlign = ArgOuts[0].Flags.getNonZeroByValAlign();

        return getFunctionByValParamAlign(CB->getCalledFunction(), ETy,

                                          InitialAlign, DL);

      }

      return getArgumentAlignment(CB, Arg.Ty, ArgI + 1, DL);

    }();


    const unsigned TySize = DL.getTypeAllocSize(ETy);

    assert((!IsByVal || TySize == ArgOuts[0].Flags.getByValSize()) &&

           "type size mismatch");


    const SDValue ArgDeclare = [&]() {

      if (IsVAArg)

        return VADeclareParam;


      if (IsByVal || shouldPassAsArray(Arg.Ty))

        return MakeDeclareArrayParam(ParamSymbol, ArgAlign, TySize);


      assert(ArgOuts.size() == 1 && "We must pass only one value as non-array");

      assert((ArgOuts[0].VT.isInteger() || ArgOuts[0].VT.isFloatingPoint()) &&

             "Only int and float types are supported as non-array arguments");


      return MakeDeclareScalarParam(ParamSymbol, TySize);

    }();


    if (IsByVal) {

      assert(ArgOutVals.size() == 1 && "We must pass only one value as byval");

      SDValue SrcPtr = ArgOutVals[0];

      const auto PointerInfo = refinePtrAS(SrcPtr, DAG, DL, *this);

      const Align BaseSrcAlign = ArgOuts[0].Flags.getNonZeroByValAlign();


      if (IsVAArg)

        VAOffset = alignTo(VAOffset, ArgAlign);


      SmallVector<EVT, 4> ValueVTs, MemVTs;

      SmallVector<TypeSize, 4> Offsets;

      ComputeValueVTs(*this, DL, ETy, ValueVTs, &MemVTs, &Offsets);


      unsigned J = 0;

      const auto VI = VectorizePTXValueVTs(MemVTs, Offsets, ArgAlign, IsVAArg);

      for (const unsigned NumElts : VI) {

        EVT LoadVT = getVectorizedVT(MemVTs[J], NumElts, Ctx);

        Align SrcAlign = commonAlignment(BaseSrcAlign, Offsets[J]);

        SDValue SrcAddr = DAG.getObjectPtrOffset(dl, SrcPtr, Offsets[J]);

        SDValue SrcLoad =

            DAG.getLoad(LoadVT, dl, CallChain, SrcAddr, PointerInfo, SrcAlign);


        TypeSize ParamOffset = Offsets[J].getWithIncrement(VAOffset);

        Align ParamAlign = commonAlignment(ArgAlign, ParamOffset);

        SDValue ParamAddr =

            DAG.getObjectPtrOffset(dl, ParamSymbol, ParamOffset);

        SDValue StoreParam = DAG.getStore(

            ArgDeclare, dl, SrcLoad, ParamAddr,

            MachinePointerInfo(NVPTX::AddressSpace::DeviceParam), ParamAlign);

        CallPrereqs.push_back(StoreParam);


        J += NumElts;

      }

      if (IsVAArg)

        VAOffset += TySize;

    } else {

      SmallVector<EVT, 16> VTs;

      SmallVector<uint64_t, 16> Offsets;

      ComputePTXValueVTs(*this, DL, Ctx, CLI.CallConv, Arg.Ty, VTs, Offsets,

                         VAOffset);

      assert(VTs.size() == Offsets.size() && "Size mismatch");

      assert(VTs.size() == ArgOuts.size() && "Size mismatch");


      // PTX Interoperability Guide 3.3(A): [Integer] Values shorter

      // than 32-bits are sign extended or zero extended, depending on

      // whether they are signed or unsigned types. This case applies

      // only to scalar parameters and not to aggregate values.

      const bool ExtendIntegerParam =

          Arg.Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Arg.Ty) < 32;


      const auto GetStoredValue = [&](const unsigned I) {

        SDValue StVal = ArgOutVals[I];

        assert(promoteScalarIntegerPTX(StVal.getValueType()) ==

                   StVal.getValueType() &&

               "OutVal type should always be legal");


        const EVT VTI = promoteScalarIntegerPTX(VTs[I]);

        const EVT StoreVT =

            ExtendIntegerParam ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI);


        return correctParamType(StVal, StoreVT, ArgOuts[I].Flags, DAG, dl);

      };


      unsigned J = 0;

      const auto VI = VectorizePTXValueVTs(VTs, Offsets, ArgAlign, IsVAArg);

      for (const unsigned NumElts : VI) {

        const EVT EltVT = promoteScalarIntegerPTX(VTs[J]);


        unsigned Offset;

        if (IsVAArg) {

          // TODO: We may need to support vector types that can be passed

          // as scalars in variadic arguments.

          assert(NumElts == 1 &&

                 "Vectorization should be disabled for vaargs.");


          // Align each part of the variadic argument to their type.

          VAOffset = alignTo(VAOffset, DAG.getEVTAlign(EltVT));

          Offset = VAOffset;


          const EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT;

          VAOffset += DL.getTypeAllocSize(TheStoreType.getTypeForEVT(Ctx));

        } else {

          assert(VAOffset == 0 && "VAOffset must be 0 for non-VA args");

          Offset = Offsets[J];

        }


        SDValue Ptr =

            DAG.getObjectPtrOffset(dl, ParamSymbol, TypeSize::getFixed(Offset));


        const MaybeAlign CurrentAlign = ExtendIntegerParam

                                            ? MaybeAlign(std::nullopt)

                                            : commonAlignment(ArgAlign, Offset);


        SDValue Val =

            getBuildVectorizedValue(NumElts, dl, DAG, [&](unsigned K) {

              return GetStoredValue(J + K);

            });


        SDValue StoreParam = DAG.getStore(

            ArgDeclare, dl, Val, Ptr,

            MachinePointerInfo(NVPTX::AddressSpace::DeviceParam), CurrentAlign);

        CallPrereqs.push_back(StoreParam);


        J += NumElts;

      }

    }

  }


  // Handle Result

  if (!Ins.empty()) {

    const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32);

    const unsigned ResultSize = DL.getTypeAllocSize(RetTy);

    if (shouldPassAsArray(RetTy)) {

      const Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);

      MakeDeclareArrayParam(RetSymbol, RetAlign, ResultSize);

    } else {

      MakeDeclareScalarParam(RetSymbol, ResultSize);

    }

  }


  // Set the size of the vararg param byte array if the callee is a variadic

  // function and the variadic part is not empty.

  if (VADeclareParam) {

    SDValue DeclareParamOps[] = {VADeclareParam.getOperand(0),

                                 VADeclareParam.getOperand(1),

                                 VADeclareParam.getOperand(2), GetI32(VAOffset),

                                 VADeclareParam.getOperand(4)};

    DAG.MorphNodeTo(VADeclareParam.getNode(), VADeclareParam.getOpcode(),

                    VADeclareParam->getVTList(), DeclareParamOps);

  }


  const auto *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());

  const auto *CalleeF = Func ? dyn_cast<Function>(Func->getGlobal()) : nullptr;


  // If the type of the callsite does not match that of the function, convert

  // the callsite to an indirect call.

  const bool ConvertToIndirectCall =

      CalleeF && CB->getFunctionType() != CalleeF->getFunctionType();


  // Both indirect calls and libcalls have nullptr Func. In order to distinguish

  // between them we must rely on the call site value which is valid for

  // indirect calls but is always null for libcalls.

  const bool IsIndirectCall = (!Func && CB) || ConvertToIndirectCall;


  if (isa<ExternalSymbolSDNode>(Callee)) {

    Function* CalleeFunc = nullptr;


    // Try to find the callee in the current module.

    Callee = DAG.getSymbolFunctionGlobalAddress(Callee, &CalleeFunc);

    assert(CalleeFunc != nullptr && "Libcall callee must be set.");


    // Set the "libcall callee" attribute to indicate that the function

    // must always have a declaration.

    CalleeFunc->addFnAttr("nvptx-libcall-callee", "true");

  }


  if (IsIndirectCall) {

    // This is indirect function call case : PTX requires a prototype of the

    // form

    // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);

    // to be emitted, and the label has to be used as the last arg of call

    // instruction.

    // The prototype is embedded in a string and put as the operand for a

    // CallPrototype SDNode which will print out to the value of the string.

    const bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs);

    std::string Proto =

        getPrototype(DL, RetTy, Args, CLI.Outs,

                     HasVAArgs ? std::optional(FirstVAArg) : std::nullopt, *CB,

                     UniqueCallSite);

    const char *ProtoStr = nvTM->getStrPool().save(Proto).data();

    const SDValue PrototypeDeclare = DAG.getNode(

        NVPTXISD::CallPrototype, dl, MVT::Other,

        {StartChain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32)});

    CallPrereqs.push_back(PrototypeDeclare);

  }


  const bool IsUnknownIntrinsic =

      CalleeF && CalleeF->isIntrinsic() &&

      CalleeF->getIntrinsicID() == Intrinsic::not_intrinsic;

  if (IsUnknownIntrinsic) {

    DAG.getContext()->diagnose(DiagnosticInfoUnsupported(

        DAG.getMachineFunction().getFunction(),

        "call to unknown intrinsic '" + CalleeF->getName() +

            "' cannot be lowered by the NVPTX backend",

        dl.getDebugLoc()));

  }


  const unsigned Proto = IsIndirectCall ? UniqueCallSite : 0;

  const unsigned NumArgs =

      std::min<unsigned>(CLI.NumFixedArgs + 1, Args.size());

  /// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns,

  ///      NumParams, Callee, Proto)

  const SDValue CallToken = DAG.getTokenFactor(dl, CallPrereqs);

  const SDValue Call = DAG.getNode(

      NVPTXISD::CALL, dl, MVT::Other,

      {CallToken, GetI32(CLI.IsConvergent), GetI32(IsIndirectCall),

       GetI32(Ins.empty() ? 0 : 1), GetI32(NumArgs), Callee, GetI32(Proto)});


  SmallVector<SDValue, 16> LoadChains{Call};

  SmallVector<SDValue, 16> ProxyRegOps;

  if (!Ins.empty()) {

    SmallVector<EVT, 16> VTs;

    SmallVector<uint64_t, 16> Offsets;

    ComputePTXValueVTs(*this, DL, Ctx, CLI.CallConv, RetTy, VTs, Offsets);

    assert(VTs.size() == Ins.size() && "Bad value decomposition");


    const Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);

    const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32);


    // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than

    // 32-bits are sign extended or zero extended, depending on whether

    // they are signed or unsigned types.

    const bool ExtendIntegerRetVal =

        RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;


    unsigned I = 0;

    const auto VI = VectorizePTXValueVTs(VTs, Offsets, RetAlign);

    for (const unsigned NumElts : VI) {

      const MaybeAlign CurrentAlign =

          ExtendIntegerRetVal ? MaybeAlign(std::nullopt)

                              : commonAlignment(RetAlign, Offsets[I]);


      const EVT VTI = promoteScalarIntegerPTX(VTs[I]);

      const EVT LoadVT =

          ExtendIntegerRetVal ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI);

      const EVT VecVT = getVectorizedVT(LoadVT, NumElts, Ctx);

      SDValue Ptr =

          DAG.getObjectPtrOffset(dl, RetSymbol, TypeSize::getFixed(Offsets[I]));


      SDValue R = DAG.getLoad(

          VecVT, dl, Call, Ptr,

          MachinePointerInfo(NVPTX::AddressSpace::DeviceParam), CurrentAlign);


      LoadChains.push_back(R.getValue(1));

      for (const unsigned J : llvm::seq(NumElts))

        ProxyRegOps.push_back(getExtractVectorizedValue(R, J, LoadVT, dl, DAG));

      I += NumElts;

    }

  }


  const SDValue EndToken = DAG.getTokenFactor(dl, LoadChains);

  const SDValue CallEnd = DAG.getCALLSEQ_END(EndToken, UniqueCallSite,

                                             UniqueCallSite + 1, SDValue(), dl);


  // Append ProxyReg instructions to the chain to make sure that `callseq_end`

  // will not get lost. Otherwise, during libcalls expansion, the nodes can become

  // dangling.

  for (const auto [I, Reg] : llvm::enumerate(ProxyRegOps)) {

    SDValue Proxy =

        DAG.getNode(NVPTXISD::ProxyReg, dl, Reg.getValueType(), {CallEnd, Reg});

    SDValue Ret = correctParamType(Proxy, Ins[I].VT, Ins[I].Flags, DAG, dl);

    InVals.push_back(Ret);

  }


  // set IsTailCall to false for now, until we figure out how to express

  // tail call optimization in PTX

  CLI.IsTailCall = false;

  return CallEnd;

}


/// Lower ISD::DYNAMIC_STACKALLOC to NVPTXISD::DYNAMIC_STACKALLOC.
///
/// Operand 0 of \p Op is the chain, operand 1 the allocation size and operand
/// 2 the requested alignment as a constant. The NVPTX node produces a pointer
/// of the local-address-space pointer type, which is then address-space-cast
/// to the generic address space. The returned merge node carries
/// {generic pointer, output chain of the allocation}.
///
/// If the subtarget is older than PTX ISA 7.3 or sm_52, an "unsupported"
/// diagnostic is emitted and a zero constant paired with the incoming chain is
/// returned as a placeholder.
SDValue NVPTXTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                                     SelectionDAG &DAG) const {
  SDLoc DL(Op.getNode());

  if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
    const Function &Fn = DAG.getMachineFunction().getFunction();

    DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
        Fn,
        "Support for dynamic alloca introduced in PTX ISA version 7.3 and "
        "requires target sm_52.",
        DL.getDebugLoc()));
    // Keep the source location on the placeholder nodes (as LowerSTACKSAVE
    // does) instead of attaching an empty SDLoc to them.
    auto Ops = {DAG.getConstant(0, DL, Op.getValueType()), Op.getOperand(0)};
    return DAG.getMergeValues(Ops, DL);
  }

  SDValue Chain = Op.getOperand(0);
  SDValue Size = Op.getOperand(1);
  // Named StackAlign rather than Align so the local does not shadow the
  // llvm::Align type.
  uint64_t StackAlign = Op.getConstantOperandVal(2);

  // The alignment on a ISD::DYNAMIC_STACKALLOC node may be 0 to indicate that
  // the default stack alignment should be used.
  if (StackAlign == 0)
    StackAlign = DAG.getSubtarget().getFrameLowering()->getStackAlign().value();

  // The size for ptx alloca instruction is 64-bit for m64 and 32-bit for m32.
  const MVT LocalVT = getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_LOCAL);

  SDValue Alloc =
      DAG.getNode(NVPTXISD::DYNAMIC_STACKALLOC, DL, {LocalVT, MVT::Other},
                  {Chain, DAG.getZExtOrTrunc(Size, DL, LocalVT),
                   DAG.getTargetConstant(StackAlign, DL, MVT::i32)});

  // The allocation yields a local-space address; hand back a generic pointer
  // of the type the original node produced.
  SDValue ASC = DAG.getAddrSpaceCast(
      DL, Op.getValueType(), Alloc, ADDRESS_SPACE_LOCAL, ADDRESS_SPACE_GENERIC);

  return DAG.getMergeValues({ASC, SDValue(Alloc.getNode(), 1)}, DL);
}


/// Lower ISD::STACKRESTORE to NVPTXISD::STACKRESTORE.
///
/// Operand 0 of \p Op is the chain and operand 1 the pointer to restore. The
/// pointer is address-space-cast from generic to local before being handed to
/// the target node, whose only result is the output chain. On subtargets
/// older than PTX ISA 7.3 or sm_52 a diagnostic is emitted and the incoming
/// chain is returned unchanged.
SDValue NVPTXTargetLowering::LowerSTACKRESTORE(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDLoc Loc(Op.getNode());

  const bool IsSupported =
      STI.getPTXVersion() >= 73 && STI.getSmVersion() >= 52;
  if (!IsSupported) {
    DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
        DAG.getMachineFunction().getFunction(),
        "Support for stackrestore requires PTX ISA version >= 7.3 and target "
        ">= sm_52.",
        Loc.getDebugLoc()));
    return Op.getOperand(0);
  }

  const MVT LocalPtrVT =
      getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_LOCAL);
  const SDValue InChain = Op.getOperand(0);
  const SDValue GenericPtr = Op.getOperand(1);

  // The target node takes a local-space address.
  const SDValue LocalPtr = DAG.getAddrSpaceCast(
      Loc, LocalPtrVT, GenericPtr, ADDRESS_SPACE_GENERIC, ADDRESS_SPACE_LOCAL);
  return DAG.getNode(NVPTXISD::STACKRESTORE, Loc, MVT::Other,
                     {InChain, LocalPtr});
}


/// Lower ISD::STACKSAVE to NVPTXISD::STACKSAVE.
///
/// Operand 0 of \p Op is the chain. The target node produces a value of the
/// local-address-space pointer type plus a chain; the value is cast to the
/// generic address space and returned merged with that chain. On subtargets
/// older than PTX ISA 7.3 or sm_52 a diagnostic is emitted and a zero constant
/// merged with the incoming chain is returned instead.
SDValue NVPTXTargetLowering::LowerSTACKSAVE(SDValue Op,
                                            SelectionDAG &DAG) const {
  SDLoc Loc(Op.getNode());

  const bool IsSupported =
      STI.getPTXVersion() >= 73 && STI.getSmVersion() >= 52;
  if (!IsSupported) {
    DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
        DAG.getMachineFunction().getFunction(),
        "Support for stacksave requires PTX ISA version >= 7.3 and target >= "
        "sm_52.",
        Loc.getDebugLoc()));
    auto Placeholder = {DAG.getConstant(0, Loc, Op.getValueType()),
                        Op.getOperand(0)};
    return DAG.getMergeValues(Placeholder, Loc);
  }

  const MVT LocalPtrVT =
      getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_LOCAL);
  const SDValue InChain = Op.getOperand(0);

  const SDValue Saved = DAG.getNode(NVPTXISD::STACKSAVE, Loc,
                                    {LocalPtrVT, MVT::Other}, InChain);
  // Callers expect a pointer of the original result type in the generic space.
  const SDValue GenericPtr = DAG.getAddrSpaceCast(
      Loc, Op.getValueType(), Saved, ADDRESS_SPACE_LOCAL,
      ADDRESS_SPACE_GENERIC);

  return DAG.getMergeValues({GenericPtr, SDValue(Saved.getNode(), 1)}, Loc);
}


// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
// (see LegalizeDAG.cpp). This is slow and uses local memory.
// Instead, extract every element of every operand and rebuild the result
// with a single BUILD_VECTOR, just as LegalizeOp() did in llvm 2.5.
SDValue
NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
  SDNode *Concat = Op.getNode();
  SDLoc Loc(Concat);

  // Scalars of all operands, in operand order then element order.
  SmallVector<SDValue, 8> Scalars;
  for (const unsigned OpIdx : llvm::seq(Concat->getNumOperands())) {
    const SDValue Part = Concat->getOperand(OpIdx);
    const EVT PartVT = Part.getNode()->getValueType(0);
    const EVT ScalarVT = PartVT.getVectorElementType();

    for (const unsigned EltIdx : llvm::seq(PartVT.getVectorNumElements()))
      Scalars.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Loc, ScalarVT,
                                    Part, DAG.getIntPtrConstant(EltIdx, Loc)));
  }

  return DAG.getBuildVector(Concat->getValueType(0), Loc, Scalars);
}


/// Build an NVPTXISD::PRMT node over the i32 values \p A and \p B with a
/// dynamic i32 \p Selector. \p Mode is attached as a trailing i32 constant
/// operand and defaults to NVPTX::PTXPrmtMode::NONE.
static SDValue getPRMT(SDValue A, SDValue B, SDValue Selector, SDLoc DL,
                       SelectionDAG &DAG,
                       unsigned Mode = NVPTX::PTXPrmtMode::NONE) {
  [[maybe_unused]] const auto IsI32 = [](SDValue V) {
    return V.getValueType() == MVT::i32;
  };
  assert(IsI32(A) && IsI32(B) && IsI32(Selector) &&
         "PRMT must have i32 operands");

  const SDValue ModeOp = DAG.getConstant(Mode, DL, MVT::i32);
  return DAG.getNode(NVPTXISD::PRMT, DL, MVT::i32,
                     {A, B, Selector, ModeOp});
}


/// Convenience overload of getPRMT for an immediate selector: \p Selector is
/// materialized as an i32 constant and forwarded to the SDValue-selector
/// overload together with \p Mode.
static SDValue getPRMT(SDValue A, SDValue B, uint64_t Selector, SDLoc DL,
                       SelectionDAG &DAG,
                       unsigned Mode = NVPTX::PTXPrmtMode::NONE) {
  const SDValue SelectorOp = DAG.getConstant(Selector, DL, MVT::i32);
  return getPRMT(A, B, SelectorOp, DL, DAG, Mode);
}


/// Reduces the elements using the scalar operations provided. The operations
/// are sorted descending in number of inputs they take. The flags on the
/// original reduction operation will be propagated to each scalar operation.
/// Nearby elements are grouped in tree reduction, unlike the shuffle reduction
/// used in ExpandReductions and SelectionDAG.
///
/// \param Elements the scalar values to reduce; must not be empty.
/// \param EltTy    result type of every scalar operation that is created.
/// \param Ops      (opcode, number of inputs) pairs, widest operator first.
/// \param DL       location attached to the created nodes.
/// \param Flags    node flags applied to every created node.
/// \returns the single value left after all levels have been reduced. When
///          \p Elements holds exactly one value it is returned unchanged.
static SDValue buildTreeReduction(
    const SmallVector<SDValue> &Elements, EVT EltTy,
    ArrayRef<std::pair<unsigned /*NodeType*/, unsigned /*NumInputs*/>> Ops,
    const SDLoc &DL, const SDNodeFlags Flags, SelectionDAG &DAG) {
  // The result is read from the front of the last level, so there has to be
  // at least one element to start with.
  assert(!Elements.empty() && "cannot reduce an empty list of elements");

  // Build the reduction tree at each level, starting with all the elements.
  SmallVector<SDValue> Level = Elements;

  unsigned OpIdx = 0;
  while (Level.size() > 1) {
    // Try to reduce this level using the current operator.
    const auto [Op, NumInputs] = Ops[OpIdx];

    // Build the next level by partially reducing all elements.
    SmallVector<SDValue> ReducedLevel;
    unsigned I = 0, E = Level.size();
    for (; I + NumInputs <= E; I += NumInputs) {
      // Reduce elements in groups of [NumInputs], as much as possible.
      ReducedLevel.push_back(DAG.getNode(
          Op, DL, EltTy, ArrayRef<SDValue>(Level).slice(I, NumInputs), Flags));
    }

    if (I < E) {
      // Handle leftover elements.

      if (ReducedLevel.empty()) {
        // We didn't reduce anything at this level. We need to pick a smaller
        // operator.
        ++OpIdx;
        assert(OpIdx < Ops.size() && "no smaller operators for reduction");
        continue;
      }

      // We reduced some things but there's still more left, meaning the
      // operator's number of inputs doesn't evenly divide this level size. Move
      // these elements to the next level.
      for (; I < E; ++I)
        ReducedLevel.push_back(Level[I]);
    }

    // Process the next level. The old level is dead from here on, so take
    // over the new one's storage instead of copying it.
    Level = std::move(ReducedLevel);
  }

  return Level.front();
}


// Get scalar reduction opcode: map a floating-point min/max VECREDUCE_*
// opcode to the ISD opcode of the matching scalar operation. Any other opcode
// is a programming error and hits llvm_unreachable.
static ISD::NodeType getScalarOpcodeForReduction(unsigned ReductionOpcode) {
  if (ReductionOpcode == ISD::VECREDUCE_FMAX)
    return ISD::FMAXNUM;
  if (ReductionOpcode == ISD::VECREDUCE_FMIN)
    return ISD::FMINNUM;
  if (ReductionOpcode == ISD::VECREDUCE_FMAXIMUM)
    return ISD::FMAXIMUM;
  if (ReductionOpcode == ISD::VECREDUCE_FMINIMUM)
    return ISD::FMINIMUM;

  llvm_unreachable("unhandled reduction opcode");
}


/// Get 3-input scalar reduction opcode: map a floating-point min/max
/// VECREDUCE_* opcode to the matching NVPTXISD three-input opcode, or return
/// std::nullopt when \p ReductionOpcode is not one of them.
static std::optional<unsigned>
getScalar3OpcodeForReduction(unsigned ReductionOpcode) {
  if (ReductionOpcode == ISD::VECREDUCE_FMAX)
    return NVPTXISD::FMAXNUM3;
  if (ReductionOpcode == ISD::VECREDUCE_FMIN)
    return NVPTXISD::FMINNUM3;
  if (ReductionOpcode == ISD::VECREDUCE_FMAXIMUM)
    return NVPTXISD::FMAXIMUM3;
  if (ReductionOpcode == ISD::VECREDUCE_FMINIMUM)
    return NVPTXISD::FMINIMUM3;

  return std::nullopt;
}


/// Lower a floating-point min/max vector reduction to a tree of scalar
/// operations built by buildTreeReduction. The node flags of \p Op are passed
/// on to every scalar operation. Three-input min/max operations are offered to
/// the tree builder, ahead of the two-input one, when the element type is f32
/// and the subtarget is at least sm_100 with PTX ISA 8.8.
SDValue NVPTXTargetLowering::LowerVECREDUCE(SDValue Op,
                                            SelectionDAG &DAG) const {
  SDLoc Loc(Op);
  const SDNodeFlags ReductionFlags = Op->getFlags();
  const SDValue Source = Op.getOperand(0);

  const unsigned ReductionOpc = Op->getOpcode();
  const EVT ScalarVT = Source.getValueType().getVectorElementType();

  // Whether we can use 3-input min/max when expanding the reduction.
  const bool IsMinMaxReduction = ReductionOpc == ISD::VECREDUCE_FMAX ||
                                 ReductionOpc == ISD::VECREDUCE_FMIN ||
                                 ReductionOpc == ISD::VECREDUCE_FMAXIMUM ||
                                 ReductionOpc == ISD::VECREDUCE_FMINIMUM;
  const bool CanUseMinMax3 = ScalarVT == MVT::f32 &&
                             STI.getSmVersion() >= 100 &&
                             STI.getPTXVersion() >= 88 && IsMinMaxReduction;

  // A list of SDNode opcodes with equivalent semantics, sorted descending by
  // number of inputs they take.
  SmallVector<std::pair<unsigned /*Op*/, unsigned /*NumIn*/>, 2> Candidates;
  if (CanUseMinMax3) {
    if (const std::optional<unsigned> Wide =
            getScalar3OpcodeForReduction(ReductionOpc))
      Candidates.push_back({*Wide, 3});
  }
  Candidates.push_back({getScalarOpcodeForReduction(ReductionOpc), 2});

  SmallVector<SDValue> Scalars;
  DAG.ExtractVectorElements(Source, Scalars);

  return buildTreeReduction(Scalars, ScalarVT, Candidates, Loc,
                            ReductionFlags, DAG);
}


/// Custom-lower a bitcast whose source is v2i8; every other source type is
/// returned untouched. The two bytes are zero-extended to i16 and combined as
/// (element0 | element1 << 8); that i16 is then bitcast to the result type.
SDValue NVPTXTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
  // Handle bitcasting from v2i8 without hitting the default promotion
  // strategy which goes through stack memory.
  const SDValue Source = Op->getOperand(0);
  if (Source->getValueType(0) != MVT::v2i8)
    return Op;

  SDLoc Loc(Op);
  const auto ExtractByte = [&](unsigned Idx) {
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Loc, MVT::i8, Source,
                       DAG.getIntPtrConstant(Idx, Loc));
  };

  // Pack vector elements into i16 and bitcast to final type.
  const SDValue Byte0 = ExtractByte(0);
  const SDValue Byte1 = ExtractByte(1);
  const SDValue Wide0 = DAG.getNode(ISD::ZERO_EXTEND, Loc, MVT::i16, Byte0);
  const SDValue Wide1 = DAG.getNode(ISD::ZERO_EXTEND, Loc, MVT::i16, Byte1);
  const SDValue ShiftBy = DAG.getConstant(8, Loc, MVT::i16);
  const SDValue Packed = DAG.getNode(
      ISD::OR, Loc, MVT::i16,
      {Wide0, DAG.getNode(ISD::SHL, Loc, MVT::i16, {Wide1, ShiftBy})});

  return DAG.getBitcast(Op->getValueType(0), Packed);
}


// We can init constant f16x2/v2i16/v4i8 with a single .b32 move.  Normally it
// would get lowered as two constant loads and vector-packing move.
// Instead we want just a constant move:
//        mov.b32         %r2, 0x40003C00
//
// Only packed 32-bit vector types are handled here. A v4i8 vector with
// non-constant elements is assembled with PRMT nodes instead; any other
// non-constant vector is returned unchanged.
SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
                                               SelectionDAG &DAG) const {
  const EVT VecVT = Op->getValueType(0);
  const bool IsPacked32 =
      NVPTX::isPackedVectorTy(VecVT) && VecVT.is32BitVector();
  if (!IsPacked32)
    return Op;
  SDLoc DL(Op);

  const auto IsConstantOrUndef = [](SDValue Elt) {
    return Elt->isUndef() || isa<ConstantSDNode>(Elt) ||
           isa<ConstantFPSDNode>(Elt);
  };

  if (!llvm::all_of(Op->ops(), IsConstantOrUndef)) {
    if (VecVT != MVT::v4i8)
      return Op;

    // Lower non-const v4i8 vector as byte-wise constructed i32, which allows us
    // to optimize calculation of constant parts.
    constexpr uint64_t PairSelector = 0x3340;
    constexpr uint64_t JoinSelector = 0x5410;

    // Any-extend (or truncate) two elements to i32 and combine them with PRMT.
    const auto PackPair = [&](unsigned FirstIdx, unsigned SecondIdx) {
      const SDValue First =
          DAG.getAnyExtOrTrunc(Op->getOperand(FirstIdx), DL, MVT::i32);
      const SDValue Second =
          DAG.getAnyExtOrTrunc(Op->getOperand(SecondIdx), DL, MVT::i32);
      return getPRMT(First, Second, PairSelector, DL, DAG);
    };

    const SDValue Elts10 = PackPair(0, 1);
    const SDValue Elts32 = PackPair(2, 3);
    const SDValue Elts3210 = getPRMT(Elts10, Elts32, JoinSelector, DL, DAG);
    return DAG.getBitcast(VecVT, Elts3210);
  }

  // Get value of the Nth operand as an APInt(32). Undef values treated as 0.
  const auto OperandBits = [&](unsigned N) -> APInt {
    const SDValue &Elt = Op->getOperand(N);
    if (Elt->isUndef())
      return APInt(32, 0);

    APInt Bits;
    if (VecVT == MVT::v2f16 || VecVT == MVT::v2bf16)
      Bits = cast<ConstantFPSDNode>(Elt)->getValueAPF().bitcastToAPInt();
    else if (VecVT == MVT::v2i16 || VecVT == MVT::v4i8)
      Bits = Elt->getAsAPIntVal();
    else
      llvm_unreachable("Unsupported type");

    // i8 values are carried around as i16, so we need to zero out upper bits,
    // so they do not get in the way of combining individual byte values
    if (VecVT == MVT::v4i8)
      Bits = Bits.trunc(8);
    return Bits.zext(32);
  };

  // Construct a 32-bit constant by shifting into place smaller values
  // (elements of the vector type VecVT).
  // For example, if VecVT has 2 elements, then N == 2:
  //   ShiftAmount = 32 / N = 16
  //   Value |= Op0 (b16) << 0
  //   Value |= Op1 (b16) << 16
  // If N == 4:
  //   ShiftAmount = 32 / N = 8
  //   Value |= Op0 (b8) << 0
  //   Value |= Op1 (b8) << 8
  //   Value |= Op2 (b8) << 16
  //   Value |= Op3 (b8) << 24
  // ...etc
  const unsigned NumElements = VecVT.getVectorNumElements();
  assert(32 % NumElements == 0 && "must evenly divide bit length");
  const unsigned BitsPerElement = 32 / NumElements;

  APInt Packed(32, 0);
  for (unsigned ElementNo : seq(NumElements))
    Packed |= OperandBits(ElementNo).shl(ElementNo * BitsPerElement);

  const SDValue PackedConst = DAG.getConstant(Packed, DL, MVT::i32);
  return DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), PackedConst);
}


/// Custom-lower EXTRACT_VECTOR_ELT.
///
///  * v4i8 source: the vector is bitcast to i32 and the element is picked
///    with a PRMT node whose selector is (Index | 0x7770), with a zero i32 as
///    the second PRMT input. The result is any-extended or truncated to the
///    result type of \p Op.
///  * Constant index on any other type: returned unchanged for tablegen
///    patterns to match.
///  * Variable index on a two-element packed vector: both elements are
///    extracted and one is chosen with a select on (Index == 0).
SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                                     SelectionDAG &DAG) const {
  const SDValue Source = Op->getOperand(0);
  const SDValue Index = Op->getOperand(1);
  SDLoc DL(Op);
  const EVT SourceVT = Source.getValueType();

  if (SourceVT == MVT::v4i8) {
    const SDValue Selector =
        DAG.getNode(ISD::OR, DL, MVT::i32,
                    DAG.getZExtOrTrunc(Index, DL, MVT::i32),
                    DAG.getConstant(0x7770, DL, MVT::i32));
    const SDValue Picked =
        getPRMT(DAG.getBitcast(MVT::i32, Source),
                DAG.getConstant(0, DL, MVT::i32), Selector, DL, DAG);
    SDValue Result = DAG.getAnyExtOrTrunc(Picked, DL, Op->getValueType(0));

    // Set the wrap flags on the resulting node from its scalar width.
    const unsigned ResultBits = Result.getScalarValueSizeInBits();
    SDNodeFlags Flags;
    Flags.setNoSignedWrap(ResultBits > 8);
    Flags.setNoUnsignedWrap(ResultBits >= 8);
    Result->setFlags(Flags);
    return Result;
  }

  // Constant index will be matched by tablegen.
  if (isa<ConstantSDNode>(Index.getNode()))
    return Op;

  // Extract individual elements and select one of them.
  assert(NVPTX::isPackedVectorTy(SourceVT) &&
         SourceVT.getVectorNumElements() == 2 && "Unexpected vector type.");
  const EVT ScalarVT = SourceVT.getVectorElementType();

  const auto ExtractAt = [&](unsigned Idx) {
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Source,
                       DAG.getIntPtrConstant(Idx, DL));
  };
  const SDValue Elt0 = ExtractAt(0);
  const SDValue Elt1 = ExtractAt(1);
  return DAG.getSelectCC(DL, Index, DAG.getIntPtrConstant(0, DL), Elt0, Elt1,
                         ISD::CondCode::SETEQ);
}


/// Custom-lower INSERT_VECTOR_ELT into a v4i8 vector; other vector types are
/// returned unchanged. Inserting an undef value returns the input vector as
/// is. Otherwise the value is placed with an NVPTXISD::BFI node of width 8 at
/// bit position Index * 8, and the i32 result is bitcast to the result type.
SDValue NVPTXTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
                                                    SelectionDAG &DAG) const {
  const SDValue Target = Op->getOperand(0);
  if (Target.getValueType() != MVT::v4i8)
    return Op;

  SDLoc Loc(Op);
  const SDValue NewElt = Op->getOperand(1);
  if (NewElt->isUndef())
    return Target;

  const SDValue Index = Op->getOperand(2);

  const SDValue NewEltI32 = DAG.getZExtOrTrunc(NewElt, Loc, MVT::i32);
  const SDValue IndexI32 = DAG.getZExtOrTrunc(Index, Loc, MVT::i32);
  // 8 is both the field width and the per-element bit stride.
  const SDValue Eight = DAG.getConstant(8, Loc, MVT::i32);
  const SDValue BitPos =
      DAG.getNode(ISD::MUL, Loc, MVT::i32, IndexI32, Eight);

  const SDValue Inserted = DAG.getNode(NVPTXISD::BFI, Loc, MVT::i32,
                                       {NewEltI32, Target, BitPos, Eight});
  return DAG.getNode(ISD::BITCAST, Loc, Op->getValueType(0), Inserted);
}


/// Custom-lower a v4i8 -> v4i8 VECTOR_SHUFFLE to a single PRMT node; any other
/// type combination is returned unchanged. Each mask entry contributes one
/// 4-bit selector field at position (lane * 4); undef lanes (-1) leave their
/// field at zero.
SDValue NVPTXTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
                                                 SelectionDAG &DAG) const {
  const SDValue First = Op.getOperand(0);
  const bool IsV4I8Shuffle =
      First.getValueType() == MVT::v4i8 && Op.getValueType() == MVT::v4i8;
  if (!IsV4I8Shuffle)
    return Op;

  // Lower shuffle to PRMT instruction.
  const auto *Shuffle = cast<ShuffleVectorSDNode>(Op.getNode());
  const SDValue Second = Op.getOperand(1);

  uint32_t Selector = 0;
  for (const auto [Lane, SrcIdx] : llvm::enumerate(Shuffle->getMask())) {
    if (SrcIdx == -1) // -1 is a placeholder for undef.
      continue;
    Selector |= (SrcIdx << (Lane * 4));
  }

  SDLoc Loc(Op);
  const SDValue FirstBits = DAG.getBitcast(MVT::i32, First);
  const SDValue SecondBits = DAG.getBitcast(MVT::i32, Second);
  const SDValue Permuted = getPRMT(FirstBits, SecondBits, Selector, Loc, DAG);
  return DAG.getBitcast(Op.getValueType(), Permuted);
}

/// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which
/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift
///    amount, or
/// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift
///    amount.
/// Operands are (lo, hi, amount); the merged result is {lo, hi}.
SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
                                                  SelectionDAG &DAG) const {
  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);

  const EVT PartVT = Op.getValueType();
  const unsigned PartBits = PartVT.getSizeInBits();
  SDLoc Loc(Op);
  const SDValue InLo = Op.getOperand(0);
  const SDValue InHi = Op.getOperand(1);
  const SDValue Amt = Op.getOperand(2);
  // Arithmetic or logical shift for the high part, depending on the opcode.
  const unsigned HiShiftOpc =
      Op.getOpcode() == ISD::SRA_PARTS ? ISD::SRA : ISD::SRL;

  if (PartBits == 32 && STI.getSmVersion() >= 35) {
    // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
    // {dHi, dLo} = {aHi, aLo} >> Amt
    //   dHi = aHi >> Amt
    //   dLo = shf.r.clamp aLo, aHi, Amt
    const SDValue OutHi = DAG.getNode(HiShiftOpc, Loc, PartVT, InHi, Amt);
    const SDValue OutLo =
        DAG.getNode(NVPTXISD::FSHR_CLAMP, Loc, PartVT, InHi, InLo, Amt);
    return DAG.getMergeValues({OutLo, OutHi}, Loc);
  }

  // Generic expansion:
  // {dHi, dLo} = {aHi, aLo} >> Amt
  // - if (Amt>=size) then
  //      dLo = aHi >> (Amt-size)
  //      dHi = aHi >> Amt (this is either all 0 or all 1)
  //   else
  //      dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
  //      dHi = aHi >> Amt
  const SDValue SizeConst = DAG.getConstant(PartBits, Loc, MVT::i32);

  const SDValue SizeMinusAmt =
      DAG.getNode(ISD::SUB, Loc, MVT::i32, SizeConst, Amt);
  const SDValue LoShifted = DAG.getNode(ISD::SRL, Loc, PartVT, InLo, Amt);
  const SDValue AmtMinusSize =
      DAG.getNode(ISD::SUB, Loc, MVT::i32, Amt, SizeConst);
  const SDValue HiSpill =
      DAG.getNode(ISD::SHL, Loc, PartVT, InHi, SizeMinusAmt);
  const SDValue LoSmallAmt =
      DAG.getNode(ISD::OR, Loc, PartVT, LoShifted, HiSpill);
  const SDValue LoLargeAmt =
      DAG.getNode(HiShiftOpc, Loc, PartVT, InHi, AmtMinusSize);

  const SDValue IsLargeAmt =
      DAG.getSetCC(Loc, MVT::i1, Amt, SizeConst, ISD::SETGE);
  const SDValue OutHi = DAG.getNode(HiShiftOpc, Loc, PartVT, InHi, Amt);
  const SDValue OutLo = DAG.getNode(ISD::SELECT, Loc, PartVT, IsLargeAmt,
                                    LoLargeAmt, LoSmallAmt);

  return DAG.getMergeValues({OutLo, OutHi}, Loc);
}


/// LowerShiftLeftParts - Lower SHL_PARTS, which
/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift
///    amount, or
/// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift
///    amount.
/// Operands are (lo, hi, amount); the merged result is {lo, hi}.
SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
                                                 SelectionDAG &DAG) const {
  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  assert(Op.getOpcode() == ISD::SHL_PARTS);

  const EVT PartVT = Op.getValueType();
  const unsigned PartBits = PartVT.getSizeInBits();
  SDLoc Loc(Op);
  const SDValue InLo = Op.getOperand(0);
  const SDValue InHi = Op.getOperand(1);
  const SDValue Amt = Op.getOperand(2);

  if (PartBits == 32 && STI.getSmVersion() >= 35) {
    // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
    // {dHi, dLo} = {aHi, aLo} << Amt
    //   dHi = shf.l.clamp aLo, aHi, Amt
    //   dLo = aLo << Amt
    const SDValue OutHi =
        DAG.getNode(NVPTXISD::FSHL_CLAMP, Loc, PartVT, InHi, InLo, Amt);
    const SDValue OutLo = DAG.getNode(ISD::SHL, Loc, PartVT, InLo, Amt);
    return DAG.getMergeValues({OutLo, OutHi}, Loc);
  }

  // Generic expansion:
  // {dHi, dLo} = {aHi, aLo} << Amt
  // - if (Amt>=size) then
  //      dLo = aLo << Amt (all 0)
  //      dHi = aLo << (Amt-size)
  //   else
  //      dLo = aLo << Amt
  //      dHi = (aHi << Amt) | (aLo >> (size-Amt))
  const SDValue SizeConst = DAG.getConstant(PartBits, Loc, MVT::i32);

  const SDValue SizeMinusAmt =
      DAG.getNode(ISD::SUB, Loc, MVT::i32, SizeConst, Amt);
  const SDValue HiShifted = DAG.getNode(ISD::SHL, Loc, PartVT, InHi, Amt);
  const SDValue AmtMinusSize =
      DAG.getNode(ISD::SUB, Loc, MVT::i32, Amt, SizeConst);
  const SDValue LoSpill =
      DAG.getNode(ISD::SRL, Loc, PartVT, InLo, SizeMinusAmt);
  const SDValue HiSmallAmt =
      DAG.getNode(ISD::OR, Loc, PartVT, HiShifted, LoSpill);
  const SDValue HiLargeAmt =
      DAG.getNode(ISD::SHL, Loc, PartVT, InLo, AmtMinusSize);

  const SDValue IsLargeAmt =
      DAG.getSetCC(Loc, MVT::i1, Amt, SizeConst, ISD::SETGE);
  const SDValue OutLo = DAG.getNode(ISD::SHL, Loc, PartVT, InLo, Amt);
  const SDValue OutHi = DAG.getNode(ISD::SELECT, Loc, PartVT, IsLargeAmt,
                                    HiLargeAmt, HiSmallAmt);

  return DAG.getMergeValues({OutLo, OutHi}, Loc);
}


/// If the types match, convert the generic copysign to the NVPTXISD version,
/// otherwise bail ensuring that mismatched cases are properly expanded.
/// "Match" means the sign operand (operand 1) has the same bit width as the
/// result type; bailing is done by returning an empty SDValue.
SDValue NVPTXTargetLowering::LowerFCOPYSIGN(SDValue Op,
                                            SelectionDAG &DAG) const {
  const EVT ResultVT = Op.getValueType();
  SDLoc Loc(Op);

  const SDValue Magnitude = Op.getOperand(0);
  const SDValue Sign = Op.getOperand(1);

  if (Sign.getValueType().bitsEq(ResultVT))
    return DAG.getNode(NVPTXISD::FCOPYSIGN, Loc, ResultVT, Magnitude, Sign);

  return SDValue();
}


/// Dispatch the lowering of a round operation on its result type. Only f32
/// and f64 are handled; any other type is treated as unreachable.
SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
  const EVT ResultVT = Op.getValueType();

  if (ResultVT == MVT::f64)
    return LowerFROUND64(Op, DAG);
  if (ResultVT == MVT::f32)
    return LowerFROUND32(Op, DAG);

  llvm_unreachable("unhandled type");
}


// This is the rounding method used in CUDA libdevice, in C-like code:
// float roundf(float A)
// {
//   float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f));
//   RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
//   return abs(A) < 0.5 ? (float)(int)A : RoundedA;
// }
SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Src = Op.getOperand(0);
  const EVT FloatVT = Op.getValueType();

  SDValue Magnitude = DAG.getNode(ISD::FABS, DL, FloatVT, Src);

  // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f))
  // The signed 0.5 is built on the integer side: the sign bit of A is OR-ed
  // into the bit pattern of 0.5f, and the result is added to A and truncated.
  const unsigned SignBitMask = 0x80000000;
  const unsigned PointFiveInBits = 0x3F000000;
  SDValue SrcBits = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Src);
  SDValue MaskConst = DAG.getConstant(SignBitMask, DL, MVT::i32);
  SDValue SignBit = DAG.getNode(ISD::AND, DL, MVT::i32, SrcBits, MaskConst);
  SDValue HalfBits = DAG.getConstant(PointFiveInBits, DL, MVT::i32);
  SDValue SignedHalfBits =
      DAG.getNode(ISD::OR, DL, MVT::i32, SignBit, HalfBits);
  SDValue SignedHalf = DAG.getNode(ISD::BITCAST, DL, FloatVT, SignedHalfBits);
  SDValue Biased = DAG.getNode(ISD::FADD, DL, FloatVT, Src, SignedHalf);
  SDValue Result = DAG.getNode(ISD::FTRUNC, DL, FloatVT, Biased);

  // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
  const EVT CondVT =
      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), FloatVT);
  SDValue TwoPow23 = DAG.getConstantFP(0x1.0p23, DL, FloatVT);
  SDValue IsLarge = DAG.getSetCC(DL, CondVT, Magnitude, TwoPow23, ISD::SETOGT);
  Result = DAG.getNode(ISD::SELECT, DL, FloatVT, IsLarge, Src, Result);

  // return abs(A) < 0.5 ? (float)(int)A : RoundedA;
  SDValue Half = DAG.getConstantFP(0.5, DL, FloatVT);
  SDValue IsSmall = DAG.getSetCC(DL, CondVT, Magnitude, Half, ISD::SETOLT);
  SDValue TruncatedSrc = DAG.getNode(ISD::FTRUNC, DL, FloatVT, Src);
  return DAG.getNode(ISD::SELECT, DL, FloatVT, IsSmall, TruncatedSrc, Result);
}


// The implementation of round(double) is similar to that of round(float) in
// that they both separate the value range into three regions and use a method
// specific to the region to round the values. However, round(double) first
// calculates the round of the absolute value and then adds the sign back while
// round(float) directly rounds the value with sign.
//
// Op is the round node to lower; its operand 0 is the value A being rounded.
// Returns a SELECT node that yields A itself when abs(A) > 0x1.0p52 and the
// sign-restored rounded value otherwise.
SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue A = Op.getOperand(0);
  EVT VT = Op.getValueType();

  SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);

  // double RoundedA = (double) (int) (abs(A) + 0.5f);
  SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, AbsA,
                                  DAG.getConstantFP(0.5, SL, VT));
  SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);

  // RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA;
  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
  SDValue IsSmall = DAG.getSetCC(SL, SetCCVT, AbsA,
                                 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
  RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsSmall,
                         DAG.getConstantFP(0, SL, VT), RoundedA);

  // Add sign to rounded_A. (An FTRUNC of A used to be built here and then
  // discarded; no result depended on it, so it is no longer created.)
  RoundedA = DAG.getNode(ISD::FCOPYSIGN, SL, VT, RoundedA, A);

  // RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA;
  SDValue IsLarge = DAG.getSetCC(SL, SetCCVT, AbsA,
                                 DAG.getConstantFP(0x1.0p52, SL, VT),
                                 ISD::SETOGT);
  return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
}


/// Re-emit the two-operand node \p N on f32 (or on a vector of f32 with the
/// same element count as the original type): both operands are converted with
/// getFPExtendOrRound, the operation is rebuilt with the original opcode and
/// flags, and the result is converted back to the original type.
static SDValue PromoteBinOpToF32(SDNode *N, SelectionDAG &DAG) {
  const EVT OrigVT = N->getValueType(0);
  const EVT PromotedVT =
      OrigVT.isVector()
          ? EVT::getVectorVT(*DAG.getContext(), MVT::f32,
                             OrigVT.getVectorElementCount())
          : EVT(MVT::f32);

  SDLoc DL(N);
  SDValue LHS = DAG.getFPExtendOrRound(N->getOperand(0), DL, PromotedVT);
  SDValue RHS = DAG.getFPExtendOrRound(N->getOperand(1), DL, PromotedVT);
  SDValue PromotedRes =
      DAG.getNode(N->getOpcode(), DL, PromotedVT, LHS, RHS, N->getFlags());
  return DAG.getFPExtendOrRound(PromotedRes, DL, OrigVT);
}


/// Promote the binary operation \p Op to f32 when useF32FTZ() reports true for
/// the current machine function; otherwise hand \p Op back unchanged.
SDValue NVPTXTargetLowering::PromoteBinOpIfF32FTZ(SDValue Op,
                                                  SelectionDAG &DAG) const {
  const bool ShouldPromote = useF32FTZ(DAG.getMachineFunction());
  return ShouldPromote ? PromoteBinOpToF32(Op.getNode(), DAG) : Op;
}


/// Custom lowering for integer-to-FP conversions. A bf16 result is produced in
/// two steps: the same conversion opcode is emitted with an f32 result, which
/// is then rounded to bf16 with FP_ROUND. All other result types are returned
/// unchanged.
SDValue NVPTXTargetLowering::LowerINT_TO_FP(SDValue Op,
                                            SelectionDAG &DAG) const {
  assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);

  // Everything except bf16 is considered legal.
  if (Op.getValueType() != MVT::bf16)
    return Op;

  SDLoc Loc(Op);
  SDValue AsF32 = DAG.getNode(Op.getOpcode(), Loc, MVT::f32, Op.getOperand(0));
  SDValue ZeroFlag = DAG.getIntPtrConstant(0, Loc, /*isTarget=*/true);
  return DAG.getNode(ISD::FP_ROUND, Loc, MVT::bf16, AsF32, ZeroFlag);
}


/// Custom lowering for FP-to-integer conversions. A bf16 source is first
/// extended to f32 and the original conversion opcode is re-emitted on that
/// f32 value. All other source types are returned unchanged.
SDValue NVPTXTargetLowering::LowerFP_TO_INT(SDValue Op,
                                            SelectionDAG &DAG) const {
  assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);

  SDValue Src = Op.getOperand(0);

  // Everything except a bf16 source is considered legal.
  if (Src.getValueType() != MVT::bf16)
    return Op;

  SDLoc Loc(Op);
  SDValue AsF32 = DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f32, Src);
  return DAG.getNode(Op.getOpcode(), Loc, Op.getValueType(), AsF32);
}


/// Custom lowering for FP_ROUND. Only roundings that produce bf16 (scalar or
/// vector) are touched:
///  - SM >= 90 with PTX >= 78: the node is returned unchanged.
///  - below SM 80 or PTX 70: the generic expandFP_ROUND is used.
///  - otherwise: f32 sources are returned unchanged, f64 sources are first
///    rounded to f32 with round-inexact-to-odd and then rounded to bf16, and
///    any other source type goes through expandFP_ROUND.
SDValue NVPTXTargetLowering::LowerFP_ROUND(SDValue Op,
                                           SelectionDAG &DAG) const {
  const EVT DstVT = Op.getValueType();

  // Everything that does not produce bf16 is considered legal.
  if (DstVT.getScalarType() != MVT::bf16)
    return Op;

  const unsigned SmVersion = STI.getSmVersion();
  const unsigned PTXVersion = STI.getPTXVersion();
  if (SmVersion >= 90 && PTXVersion >= 78)
    return Op;

  const TargetLowering *TLI = STI.getTargetLowering();
  SDValue Src = Op.getOperand(0);
  const EVT SrcVT = Src.getValueType();
  const EVT SrcEltVT = SrcVT.getScalarType();

  // This combination was the first to support f32 -> bf16.
  if (SmVersion >= 80 && PTXVersion >= 70) {
    if (SrcEltVT == MVT::f32)
      return Op;

    if (SrcEltVT == MVT::f64) {
      SDLoc Loc(Op);
      // Round-inexact-to-odd f64 to f32, then do the final rounding using
      // the hardware f32 -> bf16 instruction.
      SDValue RoundedToOdd = TLI->expandRoundInexactToOdd(
          SrcVT.changeElementType(*DAG.getContext(), MVT::f32), Src, Loc, DAG);
      return DAG.getFPExtendOrRound(RoundedToOdd, Loc, DstVT);
    }
  }

  return TLI->expandFP_ROUND(Op.getNode(), DAG);
}


/// Custom lowering for FP_EXTEND. Only extensions from bf16 (scalar or
/// vector) are touched:
///  - bf16 -> f32 below SM 80 or PTX 71 becomes BF16_TO_FP.
///  - bf16 -> f64 below SM 90 or PTX 78 goes through an intermediate f32 value
///    (FP_EXTEND when SM >= 80 and PTX >= 71, BF16_TO_FP otherwise) which is
///    then extended to f64.
/// Every other case is returned unchanged.
SDValue NVPTXTargetLowering::LowerFP_EXTEND(SDValue Op,
                                            SelectionDAG &DAG) const {
  SDValue Src = Op.getOperand(0);
  const EVT SrcVT = Src.getValueType();

  // Everything that does not start from bf16 is considered legal.
  if (SrcVT.getScalarType() != MVT::bf16)
    return Op;

  const EVT DstVT = Op.getValueType();
  const EVT DstEltVT = DstVT.getScalarType();
  const unsigned SmVersion = STI.getSmVersion();
  const unsigned PTXVersion = STI.getPTXVersion();
  const bool AtLeastSm80Ptx71 = SmVersion >= 80 && PTXVersion >= 71;

  if (DstEltVT == MVT::f32) {
    if (AtLeastSm80Ptx71)
      return Op;
    return DAG.getNode(ISD::BF16_TO_FP, SDLoc(Op), DstVT, Src);
  }

  if (DstEltVT == MVT::f64 && (SmVersion < 90 || PTXVersion < 78)) {
    const EVT F32VT = SrcVT.changeElementType(*DAG.getContext(), MVT::f32);
    SDLoc Loc(Op);
    const unsigned FirstStepOpc =
        AtLeastSm80Ptx71 ? ISD::FP_EXTEND : ISD::BF16_TO_FP;
    SDValue AsF32 = DAG.getNode(FirstStepOpc, Loc, F32VT, Src);
    return DAG.getNode(ISD::FP_EXTEND, Loc, DstVT, AsF32);
  }

  return Op;
}


/// Scalarize a v2i16 operation: for every lane, extract that lane from each
/// operand of \p Op, apply the opcode of \p Op to the extracted scalars, and
/// reassemble the per-lane results with BUILD_VECTOR. Any other result type is
/// returned unchanged.
static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG) {
  const EVT VecVT = Op.getValueType();
  if (VecVT != MVT::v2i16)
    return Op;

  SDLoc DL(Op);
  const EVT LaneVT = VecVT.getVectorElementType();
  const unsigned NumLanes = VecVT.getVectorNumElements();

  SmallVector<SDValue> LaneResults;
  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
    // Gather this lane of every operand.
    SmallVector<SDValue> LaneArgs;
    for (const SDUse &Operand : Op->ops())
      LaneArgs.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, LaneVT,
                                     Operand.get(),
                                     DAG.getIntPtrConstant(Lane, DL)));
    LaneResults.push_back(DAG.getNode(Op.getOpcode(), DL, LaneVT, LaneArgs));
  }

  return DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, LaneResults);
}


/// Flatten the vector operand(s) of a tcgen05.st intrinsic node into scalars.
/// The operand checked for "vector-ness" is operand 4 when \p hasOffset is
/// set and operand 3 otherwise. A new INTRINSIC_VOID memory-intrinsic node is
/// built with every vector operand replaced by its individual elements; the
/// value type list, memory VT and memory operand are taken from the original.
static SDValue lowerTcgen05St(SDValue Op, SelectionDAG &DAG,
                              bool hasOffset = false) {
  // skip lowering if the vector operand is already legalized
  const unsigned VecOperandIdx = hasOffset ? 4 : 3;
  if (!Op->getOperand(VecOperandIdx).getValueType().isVector())
    return Op;

  SDNode *N = Op.getNode();
  SDLoc DL(N);

  // split the vector argument
  SmallVector<SDValue, 32> FlatOps;
  for (const SDUse &Use : N->ops()) {
    SDValue Operand = Use.get();
    const EVT OperandVT = Operand.getValueType();
    if (!OperandVT.isVector()) {
      FlatOps.push_back(Operand);
      continue;
    }

    const EVT EltVT = OperandVT.getVectorElementType();
    const unsigned NumElts = OperandVT.getVectorNumElements();
    for (unsigned Idx = 0; Idx != NumElts; ++Idx)
      FlatOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Operand,
                                    DAG.getIntPtrConstant(Idx, DL)));
  }

  auto *MemN = cast<MemIntrinsicSDNode>(N);
  return DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, DL, N->getVTList(),
                                 FlatOps, MemN->getMemoryVT(),
                                 MemN->getMemOperand());
}


/// Lower a byte swap through getPRMT with a fixed selector per type:
///  - i16:   any-extend to i32, permute with selector 0x7701, truncate back.
///  - i32:   permute with selector 0x0123.
///  - v2i16: bitcast to i32, permute with selector 0x2301, bitcast back.
///  - i64:   unpack into two i32 halves, permute each with selector 0x0123,
///           and rebuild the i64 with the two permuted halves exchanged.
/// The second permute input is always the i32 constant 0. Any other type is
/// treated as unreachable.
static SDValue lowerBSWAP(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  SDValue Src = Op.getOperand(0);

  // Permute the bytes of a 32-bit value against a zero second input.
  auto Permute = [&](SDValue Value, uint64_t Selector) {
    return getPRMT(Value, DAG.getConstant(0, DL, MVT::i32), Selector, DL, DAG);
  };

  switch (Op.getValueType().getSimpleVT().SimpleTy) {
  case MVT::i16: {
    SDValue Widened = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
    SDValue Permuted = Permute(Widened, 0x7701);
    return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Permuted);
  }
  case MVT::i32:
    return Permute(Src, 0x0123);
  case MVT::v2i16: {
    SDValue AsI32 = DAG.getBitcast(MVT::i32, Src);
    SDValue Permuted = Permute(AsI32, 0x2301);
    return DAG.getNode(ISD::BITCAST, DL, MVT::v2i16, Permuted);
  }
  case MVT::i64: {
    SDValue Halves =
        DAG.getNode(NVPTXISD::UNPACK_VECTOR, DL, {MVT::i32, MVT::i32}, Src);
    SDValue PermutedHalf0 = Permute(Halves.getValue(0), 0x0123);
    SDValue PermutedHalf1 = Permute(Halves.getValue(1), 0x0123);
    // The halves trade places in the rebuilt value.
    return DAG.getNode(NVPTXISD::BUILD_VECTOR, DL, MVT::i64,
                       {PermutedHalf1, PermutedHalf0});
  }
  default:
    llvm_unreachable("unsupported type for bswap");
  }
}


/// Map a tcgen05.mma "disable_output_lane" intrinsic ID to the NVPTXISD
/// opcode that carries the same name.
///
/// \param IID One of the Intrinsic::nvvm_tcgen05_mma_*disable_output_lane*
///            IDs listed in the switch below.
/// \returns The matching NVPTXISD::TCGEN05_MMA_* value. Any ID that is not
///          listed falls out of the switch and hits llvm_unreachable.
static unsigned getTcgen05MMADisableOutputLane(unsigned IID) {
  switch (IID) {
  // Variants without the `sp` infix.
  case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg1:
    return NVPTXISD::TCGEN05_MMA_SHARED_DISABLE_OUTPUT_LANE_CG1;
  case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg2:
    return NVPTXISD::TCGEN05_MMA_SHARED_DISABLE_OUTPUT_LANE_CG2;
  case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg1:
    return NVPTXISD::TCGEN05_MMA_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG1;
  case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg2:
    return NVPTXISD::TCGEN05_MMA_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG2;
  case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1:
    return NVPTXISD::TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG1;
  case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2:
    return NVPTXISD::TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG2;
  case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1:
    return NVPTXISD::TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1;
  case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2:
    return NVPTXISD::TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2;
  case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1_ashift:
    return NVPTXISD::TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG1_ASHIFT;
  case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2_ashift:
    return NVPTXISD::TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG2_ASHIFT;
  case Intrinsic::
      nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1_ashift:
    return NVPTXISD::TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1_ASHIFT;
  case Intrinsic::
      nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2_ashift:
    return NVPTXISD::TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2_ASHIFT;
  // Variants with the `sp` infix.
  case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg1:
    return NVPTXISD::TCGEN05_MMA_SP_SHARED_DISABLE_OUTPUT_LANE_CG1;
  case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg2:
    return NVPTXISD::TCGEN05_MMA_SP_SHARED_DISABLE_OUTPUT_LANE_CG2;
  case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg1:
    return NVPTXISD::TCGEN05_MMA_SP_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG1;
  case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg2:
    return NVPTXISD::TCGEN05_MMA_SP_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG2;
  case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1:
    return NVPTXISD::TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG1;
  case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2:
    return NVPTXISD::TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG2;
  case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1_ashift:
    return NVPTXISD::TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG1_ASHIFT;
  case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2_ashift:
    return NVPTXISD::TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG2_ASHIFT;
  case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1:
    return NVPTXISD::TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1;
  case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2:
    return NVPTXISD::TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2;
  case Intrinsic::
      nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1_ashift:
    return NVPTXISD::
        TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1_ASHIFT;
  case Intrinsic::
      nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2_ashift:
    return NVPTXISD::
        TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2_ASHIFT;
  };
  // Reached only for an ID that has no case above.
  llvm_unreachable("unhandled tcgen05.mma.disable_output_lane intrinsic");
}


/// Rewrite a tcgen05.mma disable_output_lane intrinsic node as the matching
/// NVPTXISD memory-intrinsic node. The intrinsic ID (operand 1) selects the
/// target opcode and is dropped from the operand list; every vector operand
/// is replaced by its individual elements. The value type list, memory VT and
/// memory operand are taken from the original node.
static SDValue LowerTcgen05MMADisableOutputLane(SDValue Op, SelectionDAG &DAG) {
  SDNode *N = Op.getNode();
  SDLoc DL(N);
  const unsigned IntrinsicID = N->getConstantOperandVal(1);

  // split the vector argument
  SmallVector<SDValue, 16> FlatOps;
  const unsigned NumOperands = N->getNumOperands();
  for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
    if (OpIdx == 1)
      continue; // skip IID

    SDValue Operand = N->getOperand(OpIdx);
    const EVT OperandVT = Operand.getValueType();
    if (!OperandVT.isVector()) {
      FlatOps.push_back(Operand);
      continue;
    }

    const EVT EltVT = OperandVT.getVectorElementType();
    const unsigned NumElts = OperandVT.getVectorNumElements();
    for (unsigned Idx = 0; Idx != NumElts; ++Idx)
      FlatOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Operand,
                                    DAG.getIntPtrConstant(Idx, DL)));
  }

  auto *MemN = cast<MemIntrinsicSDNode>(N);
  return DAG.getMemIntrinsicNode(getTcgen05MMADisableOutputLane(IntrinsicID),
                                 DL, N->getVTList(), FlatOps,
                                 MemN->getMemoryVT(), MemN->getMemOperand());
}


// Lower the vector return type of tcgen05.ld intrinsics.
//
// The node is re-created as an INTRINSIC_W_CHAIN memory intrinsic that yields
// one i32 per vector element plus the chain. Operands 0-2 are forwarded as
// they are, followed by the offset (only when HasOffset is set) and the pack
// flag. Returns the rebuilt vector together with the new chain, or an empty
// optional when result 0 is not a vector.
static std::optional<std::pair<SDValue, SDValue>>
lowerTcgen05Ld(SDNode *N, SelectionDAG &DAG, bool HasOffset = false) {
  const EVT VecVT = N->getValueType(0);
  if (!VecVT.isVector())
    return std::nullopt; // already legalized.

  SDLoc DL(N);
  const unsigned NumElts = VecVT.getVectorNumElements();

  // Create the return type of the instructions
  SmallVector<EVT, 5> ResultTys(NumElts, EVT(MVT::i32));
  ResultTys.push_back(N->getValueType(1)); // Chain
  SDVTList ResultVTList = DAG.getVTList(ResultTys);

  // With an offset there is one more leading operand before the pack flag.
  const unsigned NumForwardedOps = HasOffset ? 5 : 4;
  SmallVector<SDValue, 8> Ops;
  for (unsigned OpIdx = 0; OpIdx != NumForwardedOps; ++OpIdx)
    Ops.push_back(N->getOperand(OpIdx));

  auto *MemN = cast<MemIntrinsicSDNode>(N);
  SDValue ScalarizedLd = DAG.getMemIntrinsicNode(
      ISD::INTRINSIC_W_CHAIN, DL, ResultVTList, Ops, MemN->getMemoryVT(),
      MemN->getMemOperand());

  // split the vector result
  SmallVector<SDValue, 4> Elts;
  for (unsigned Idx = 0; Idx != NumElts; ++Idx)
    Elts.push_back(ScalarizedLd.getValue(Idx));

  SDValue Chain = ScalarizedLd.getValue(NumElts);
  SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, Elts);
  return std::make_pair(Vec, Chain);
}


/// Emit a DiagnosticInfoUnsupported for a tensormap.replace intrinsic whose
/// immediate \p Val is not supported on the current target, and return
/// operand 0 of \p Op. The intrinsic name in the message is looked up from
/// the ID in operand 1, overloaded on a pointer type in the node's address
/// space (address space 0 when the node is not a MemIntrinsicSDNode).
static SDValue reportInvalidTensormapReplaceUsage(SDValue Op, SelectionDAG &DAG,
                                                  unsigned Val) {
  SDNode *N = Op.getNode();
  SDLoc DL(N);
  MachineFunction &MF = DAG.getMachineFunction();
  LLVMContext &Ctx = *DAG.getContext();

  unsigned AddrSpace = 0;
  if (const auto *MemN = dyn_cast<MemIntrinsicSDNode>(N))
    AddrSpace = MemN->getAddressSpace();
  Type *PtrTy = PointerType::get(Ctx, AddrSpace);

  const std::string IntrinsicName = Intrinsic::getName(
      N->getConstantOperandVal(1), {PtrTy}, MF.getFunction().getParent());

  Ctx.diagnose(DiagnosticInfoUnsupported(
      MF.getFunction(),
      "Intrinsic " + IntrinsicName + " with value " + Twine(Val) +
          " is not supported on the given target.",
      DL.getDebugLoc()));
  return Op.getOperand(0);
}


/// Validate the elemtype immediate of a tensormap.replace.elemtype intrinsic.
///
/// Operand 3 of \p Op holds the immediate. When the subtarget reports that
/// this value is supported, \p Op is returned unchanged; otherwise the
/// problem is reported through reportInvalidTensormapReplaceUsage and that
/// function's result is returned.
static SDValue lowerTensormapReplaceElemtype(SDValue Op, SelectionDAG &DAG) {
  // immediate argument representing elemtype
  const unsigned Val = Op->getConstantOperandVal(3);

  const auto &Subtarget = DAG.getSubtarget<NVPTXSubtarget>();
  if (Subtarget.hasTensormapReplaceElemtypeSupport(Val))
    return Op;

  return reportInvalidTensormapReplaceUsage(Op, DAG, Val);
}


/// Validate the swizzle-mode immediate of a tensormap.replace.swizzle_mode
/// intrinsic.
///
/// Operand 3 of \p Op holds the immediate. When the subtarget reports that
/// this value is supported, \p Op is returned unchanged; otherwise the
/// problem is reported through reportInvalidTensormapReplaceUsage and that
/// function's result is returned.
static SDValue lowerTensormapReplaceSwizzleMode(SDValue Op, SelectionDAG &DAG) {
  // immediate argument representing swizzle mode
  const unsigned Val = Op->getConstantOperandVal(3);

  const auto &Subtarget = DAG.getSubtarget<NVPTXSubtarget>();
  if (Subtarget.hasTensormapReplaceSwizzleModeSupport(Val))
    return Op;

  return reportInvalidTensormapReplaceUsage(Op, DAG, Val);
}


/// Dispatch custom lowering for an intrinsic node based on the intrinsic ID
/// stored in operand 1.
///
/// - tcgen05.st variants go to lowerTcgen05St (the 16x32bx2 shapes with
///   hasOffset set).
/// - tcgen05.mma disable_output_lane variants go to
///   LowerTcgen05MMADisableOutputLane.
/// - tensormap.replace elemtype / swizzle_mode go to their validators.
/// Any other intrinsic ID returns \p Op unchanged.
static SDValue lowerIntrinsicVoid(SDValue Op, SelectionDAG &DAG) {
  SDNode *N = Op.getNode();
  SDValue Intrin = N->getOperand(1);

  // Get the intrinsic ID
  unsigned IntrinNo = cast<ConstantSDNode>(Intrin.getNode())->getZExtValue();
  switch (IntrinNo) {
  default:
    // Not handled here; falls through to the final `return Op`.
    break;
  // tcgen05.st shapes lowered without the hasOffset flag.
  case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x32:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
    return lowerTcgen05St(Op, DAG);
  // tcgen05.st 16x32bx2 shapes are lowered with hasOffset set.
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128:
    return lowerTcgen05St(Op, DAG, /* hasOffset */ true);
  // tcgen05.mma disable_output_lane variants.
  case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg1:
  case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg2:
  case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg1:
  case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg2:
  case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg1:
  case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg2:
  case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg1:
  case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg2:
  case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1:
  case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2:
  case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1:
  case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2:
  case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1:
  case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2:
  case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1:
  case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2:
  case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1_ashift:
  case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2_ashift:
  case Intrinsic::
      nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1_ashift:
  case Intrinsic::
      nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2_ashift:
  case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1_ashift:
  case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2_ashift:
  case Intrinsic::
      nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1_ashift:
  case Intrinsic::
      nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2_ashift:
    return LowerTcgen05MMADisableOutputLane(Op, DAG);
  // tensormap.replace immediates are validated against the subtarget.
  case Intrinsic::nvvm_tensormap_replace_elemtype:
    return lowerTensormapReplaceElemtype(Op, DAG);
  case Intrinsic::nvvm_tensormap_replace_swizzle_mode:
    return lowerTensormapReplaceSwizzleMode(Op, DAG);
  }
  return Op;
}


/// Lower a clusterlaunchcontrol.query_cancel.* intrinsic whose response
/// operand (operand 1) is still an i128. The response is bitcast to v2i64,
/// both i64 elements are extracted, and a target node selected by the
/// intrinsic ID in operand 0 is built from them with the original value type
/// list. Returns an empty SDValue when operand 1 is not an i128.
static SDValue LowerClusterLaunchControlQueryCancel(SDValue Op,
                                                    SelectionDAG &DAG) {
  SDNode *N = Op.getNode();
  SDValue Response = N->getOperand(1);

  // return, if the operand is already lowered
  if (Response.getValueType() != MVT::i128)
    return SDValue();

  const unsigned IID = N->getConstantOperandVal(0);
  unsigned Opcode;
  switch (IID) {
  case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_is_canceled:
    Opcode = NVPTXISD::CLUSTERLAUNCHCONTROL_QUERY_CANCEL_IS_CANCELED;
    break;
  case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_x:
    Opcode = NVPTXISD::CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_X;
    break;
  case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_y:
    Opcode = NVPTXISD::CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_Y;
    break;
  case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_z:
    Opcode = NVPTXISD::CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_Z;
    break;
  default:
    llvm_unreachable("unsupported/unhandled intrinsic");
  }

  SDLoc DL(N);
  SDValue AsV2I64 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Response);

  // Pull one i64 element out of the bitcast response.
  auto ExtractElt = [&](unsigned Idx) {
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, AsV2I64,
                       DAG.getIntPtrConstant(Idx, DL));
  };
  SDValue Elt0 = ExtractElt(0);
  SDValue Elt1 = ExtractElt(1);

  return DAG.getNode(Opcode, DL, N->getVTList(), {Elt0, Elt1});
}


/// Lower the nvvm.f32x4.to.*.rs[.relu].satfinite conversion intrinsics.
///
/// Operand 0 is the intrinsic ID, operand 1 the f32 vector (its first four
/// elements are extracted) and operand 2 is forwarded as-is after them. The
/// intrinsic ID selects the target opcode, the result type and the conversion
/// mode constant, which is appended as the last operand (RS, with RELU_FLAG
/// added for the relu variants).
static SDValue lowerCvtRSIntrinsics(SDValue Op, SelectionDAG &DAG) {
  using NVPTX::PTXCvtMode::CvtMode;

  // What a given intrinsic lowers to.
  struct Conversion {
    unsigned Opcode;
    MVT::SimpleValueType ResultTy;
    uint32_t ModeFlags;
  };

  SDNode *N = Op.getNode();
  SDLoc DL(N);
  SDValue F32Vec = N->getOperand(1);
  SDValue RBits = N->getOperand(2);
  const unsigned IntrinsicID = N->getConstantOperandVal(0);

  // Extract the 4 float elements from the vector
  SmallVector<SDValue, 6> Ops;
  for (unsigned Idx = 0; Idx != 4; ++Idx)
    Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, F32Vec,
                              DAG.getIntPtrConstant(Idx, DL)));
  Ops.push_back(RBits);

  const uint32_t PlainMode = CvtMode::RS;
  const uint32_t ReluMode = CvtMode::RS | CvtMode::RELU_FLAG;

  const Conversion Cvt = [&]() -> Conversion {
    switch (IntrinsicID) {
    case Intrinsic::nvvm_f32x4_to_e4m3x4_rs_relu_satfinite:
      return {NVPTXISD::CVT_E4M3X4_F32X4_RS_SF, MVT::v4i8, ReluMode};
    case Intrinsic::nvvm_f32x4_to_e4m3x4_rs_satfinite:
      return {NVPTXISD::CVT_E4M3X4_F32X4_RS_SF, MVT::v4i8, PlainMode};
    case Intrinsic::nvvm_f32x4_to_e5m2x4_rs_relu_satfinite:
      return {NVPTXISD::CVT_E5M2X4_F32X4_RS_SF, MVT::v4i8, ReluMode};
    case Intrinsic::nvvm_f32x4_to_e5m2x4_rs_satfinite:
      return {NVPTXISD::CVT_E5M2X4_F32X4_RS_SF, MVT::v4i8, PlainMode};
    case Intrinsic::nvvm_f32x4_to_e2m3x4_rs_relu_satfinite:
      return {NVPTXISD::CVT_E2M3X4_F32X4_RS_SF, MVT::v4i8, ReluMode};
    case Intrinsic::nvvm_f32x4_to_e2m3x4_rs_satfinite:
      return {NVPTXISD::CVT_E2M3X4_F32X4_RS_SF, MVT::v4i8, PlainMode};
    case Intrinsic::nvvm_f32x4_to_e3m2x4_rs_relu_satfinite:
      return {NVPTXISD::CVT_E3M2X4_F32X4_RS_SF, MVT::v4i8, ReluMode};
    case Intrinsic::nvvm_f32x4_to_e3m2x4_rs_satfinite:
      return {NVPTXISD::CVT_E3M2X4_F32X4_RS_SF, MVT::v4i8, PlainMode};
    case Intrinsic::nvvm_f32x4_to_e2m1x4_rs_relu_satfinite:
      return {NVPTXISD::CVT_E2M1X4_F32X4_RS_SF, MVT::i16, ReluMode};
    case Intrinsic::nvvm_f32x4_to_e2m1x4_rs_satfinite:
      return {NVPTXISD::CVT_E2M1X4_F32X4_RS_SF, MVT::i16, PlainMode};
    default:
      llvm_unreachable("unsupported/unhandled intrinsic");
    }
  }();

  Ops.push_back(DAG.getConstant(Cvt.ModeFlags, DL, MVT::i32));
  return DAG.getNode(Cvt.Opcode, DL, Cvt.ResultTy, Ops);
}


/// Lower the nvvm.prmt* intrinsics through getPRMT. The intrinsic ID in
/// operand 0 selects the permute mode. Operand 1 is the first input; the
/// second input is operand 2 when the node has four operands and the i32
/// constant 0 otherwise. The selector is always the last operand.
static SDValue lowerPrmtIntrinsic(SDValue Op, SelectionDAG &DAG) {
  unsigned Mode;
  switch (Op->getConstantOperandVal(0)) {
  case Intrinsic::nvvm_prmt:
    Mode = NVPTX::PTXPrmtMode::NONE;
    break;
  case Intrinsic::nvvm_prmt_b4e:
    Mode = NVPTX::PTXPrmtMode::B4E;
    break;
  case Intrinsic::nvvm_prmt_ecl:
    Mode = NVPTX::PTXPrmtMode::ECL;
    break;
  case Intrinsic::nvvm_prmt_ecr:
    Mode = NVPTX::PTXPrmtMode::ECR;
    break;
  case Intrinsic::nvvm_prmt_f4e:
    Mode = NVPTX::PTXPrmtMode::F4E;
    break;
  case Intrinsic::nvvm_prmt_rc16:
    Mode = NVPTX::PTXPrmtMode::RC16;
    break;
  case Intrinsic::nvvm_prmt_rc8:
    Mode = NVPTX::PTXPrmtMode::RC8;
    break;
  default:
    llvm_unreachable("unsupported/unhandled intrinsic");
  }

  SDLoc DL(Op);
  const unsigned NumOps = Op.getNumOperands();
  SDValue First = Op.getOperand(1);
  SDValue Second =
      NumOps == 4 ? Op.getOperand(2) : DAG.getConstant(0, DL, MVT::i32);
  SDValue Selector = Op.getOperand(NumOps - 1);
  return getPRMT(First, Second, Selector, DL, DAG, Mode);
}


// Spell the intrinsic ID of a tcgen05.ld.red variant from its shape, element
// count and (lower-case) element type.
#define TCGEN05_LD_RED_INTR(SHAPE, NUM, TYPE)                                  \
  Intrinsic::nvvm_tcgen05_ld_red_##SHAPE##_x##NUM##_##TYPE

// Spell the NVPTXISD opcode of a tcgen05.ld.red variant from its shape,
// element count and (upper-case) element type.
#define TCGEN05_LD_RED_INST(SHAPE, NUM, TYPE)                                  \
  NVPTXISD::TCGEN05_LD_RED_##SHAPE##_X##NUM##_##TYPE

/// Map a tcgen05.ld.red intrinsic ID to the NVPTXISD opcode with the same
/// shape, element count and element type.
///
/// \param IID One of the nvvm_tcgen05_ld_red_{32x32b,16x32bx2}_x{2..128}_
///            {f32,i32} intrinsic IDs.
/// \returns The matching NVPTXISD::TCGEN05_LD_RED_* value. Any other ID hits
///          llvm_unreachable in the default case.
static unsigned getTcgen05LdRedID(Intrinsic::ID IID) {
  switch (IID) {
  // f32 variants, shape 32x32b.
  case TCGEN05_LD_RED_INTR(32x32b, 2, f32):
    return TCGEN05_LD_RED_INST(32x32b, 2, F32);
  case TCGEN05_LD_RED_INTR(32x32b, 4, f32):
    return TCGEN05_LD_RED_INST(32x32b, 4, F32);
  case TCGEN05_LD_RED_INTR(32x32b, 8, f32):
    return TCGEN05_LD_RED_INST(32x32b, 8, F32);
  case TCGEN05_LD_RED_INTR(32x32b, 16, f32):
    return TCGEN05_LD_RED_INST(32x32b, 16, F32);
  case TCGEN05_LD_RED_INTR(32x32b, 32, f32):
    return TCGEN05_LD_RED_INST(32x32b, 32, F32);
  case TCGEN05_LD_RED_INTR(32x32b, 64, f32):
    return TCGEN05_LD_RED_INST(32x32b, 64, F32);
  case TCGEN05_LD_RED_INTR(32x32b, 128, f32):
    return TCGEN05_LD_RED_INST(32x32b, 128, F32);
  // f32 variants, shape 16x32bx2.
  case TCGEN05_LD_RED_INTR(16x32bx2, 2, f32):
    return TCGEN05_LD_RED_INST(16x32bx2, 2, F32);
  case TCGEN05_LD_RED_INTR(16x32bx2, 4, f32):
    return TCGEN05_LD_RED_INST(16x32bx2, 4, F32);
  case TCGEN05_LD_RED_INTR(16x32bx2, 8, f32):
    return TCGEN05_LD_RED_INST(16x32bx2, 8, F32);
  case TCGEN05_LD_RED_INTR(16x32bx2, 16, f32):
    return TCGEN05_LD_RED_INST(16x32bx2, 16, F32);
  case TCGEN05_LD_RED_INTR(16x32bx2, 32, f32):
    return TCGEN05_LD_RED_INST(16x32bx2, 32, F32);
  case TCGEN05_LD_RED_INTR(16x32bx2, 64, f32):
    return TCGEN05_LD_RED_INST(16x32bx2, 64, F32);
  case TCGEN05_LD_RED_INTR(16x32bx2, 128, f32):
    return TCGEN05_LD_RED_INST(16x32bx2, 128, F32);
  // i32 variants, shape 32x32b.
  case TCGEN05_LD_RED_INTR(32x32b, 2, i32):
    return TCGEN05_LD_RED_INST(32x32b, 2, I32);
  case TCGEN05_LD_RED_INTR(32x32b, 4, i32):
    return TCGEN05_LD_RED_INST(32x32b, 4, I32);
  case TCGEN05_LD_RED_INTR(32x32b, 8, i32):
    return TCGEN05_LD_RED_INST(32x32b, 8, I32);
  case TCGEN05_LD_RED_INTR(32x32b, 16, i32):
    return TCGEN05_LD_RED_INST(32x32b, 16, I32);
  case TCGEN05_LD_RED_INTR(32x32b, 32, i32):
    return TCGEN05_LD_RED_INST(32x32b, 32, I32);
  case TCGEN05_LD_RED_INTR(32x32b, 64, i32):
    return TCGEN05_LD_RED_INST(32x32b, 64, I32);
  case TCGEN05_LD_RED_INTR(32x32b, 128, i32):
    return TCGEN05_LD_RED_INST(32x32b, 128, I32);
  // i32 variants, shape 16x32bx2.
  case TCGEN05_LD_RED_INTR(16x32bx2, 2, i32):
    return TCGEN05_LD_RED_INST(16x32bx2, 2, I32);
  case TCGEN05_LD_RED_INTR(16x32bx2, 4, i32):
    return TCGEN05_LD_RED_INST(16x32bx2, 4, I32);
  case TCGEN05_LD_RED_INTR(16x32bx2, 8, i32):
    return TCGEN05_LD_RED_INST(16x32bx2, 8, I32);
  case TCGEN05_LD_RED_INTR(16x32bx2, 16, i32):
    return TCGEN05_LD_RED_INST(16x32bx2, 16, I32);
  case TCGEN05_LD_RED_INTR(16x32bx2, 32, i32):
    return TCGEN05_LD_RED_INST(16x32bx2, 32, I32);
  case TCGEN05_LD_RED_INTR(16x32bx2, 64, i32):
    return TCGEN05_LD_RED_INST(16x32bx2, 64, I32);
  case TCGEN05_LD_RED_INTR(16x32bx2, 128, i32):
    return TCGEN05_LD_RED_INST(16x32bx2, 128, I32);
  default:
    llvm_unreachable("Invalid tcgen05.ld.red intrinsic ID");
  }
}


// Lower the vector result of a tcgen05.ld.red intrinsic.
//
// The node is rebuilt as a memory intrinsic whose results are one scalar per
// vector element, followed by the reduction value and the chain. Returns
// {rebuilt vector, reduction value, chain}, or std::nullopt when the result
// type is not a vector (already legalized).
static std::optional<std::tuple<SDValue, SDValue, SDValue>>
lowerTcgen05LdRed(SDNode *N, SelectionDAG &DAG) {
  SDLoc DL(N);
  const EVT VecVT = N->getValueType(0);
  if (!VecVT.isVector())
    return std::nullopt; // already legalized.

  const unsigned EltCount = VecVT.getVectorNumElements();

  // Result list of the new node: EltCount scalars plus one extra slot for the
  // reduction value (all f32 or all i32), then the chain.
  SmallVector<EVT, 132> ResultTys(
      EltCount + 1,
      VecVT.getVectorElementType().isFloatingPoint() ? MVT::f32 : MVT::i32);
  ResultTys.push_back(MVT::Other); // Chain
  SDVTList NodeVTs = DAG.getVTList(ResultTys);

  // Operand list: the chain first, then everything after the intrinsic ID
  // (which lives at operand index 1 and is dropped).
  SmallVector<SDValue, 8> NewOps;
  NewOps.push_back(N->getOperand(0)); // Chain
  for (unsigned Idx = 2; Idx < N->getNumOperands(); ++Idx)
    NewOps.push_back(N->getOperand(Idx));

  const unsigned IntrinsicID =
      cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
  auto *MemN = cast<MemIntrinsicSDNode>(N);
  SDValue Lowered = DAG.getMemIntrinsicNode(
      getTcgen05LdRedID(IntrinsicID), DL, NodeVTs, NewOps,
      MemN->getMemoryVT(), MemN->getMemOperand());

  // Reassemble the first EltCount scalar results into the original vector.
  SmallVector<SDValue, 132> Elts;
  for (unsigned Idx = 0; Idx < EltCount; ++Idx)
    Elts.push_back(Lowered.getValue(Idx));
  SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, Elts);

  // The reduction value and the chain follow the element results.
  SDValue Reduction = Lowered.getValue(EltCount);
  SDValue OutChain = Lowered.getValue(EltCount + 1);
  return std::make_tuple(Vec, Reduction, OutChain);
}


// Custom lowering for INTRINSIC_W_CHAIN nodes. The intrinsic ID is read from
// operand 1. Intrinsics not listed below are returned unchanged.
static SDValue lowerIntrinsicWChain(SDValue Op, SelectionDAG &DAG) {
  SDNode *Node = Op.getNode();

  // Packs the (value, chain) pair produced by lowerTcgen05Ld into a single
  // merged SDValue; yields an empty SDValue if there was no result.
  const auto MergeLdResult = [&](const auto &LdRes) -> SDValue {
    if (!LdRes)
      return SDValue();
    return DAG.getMergeValues({LdRes->first, LdRes->second}, SDLoc(Op));
  };

  switch (Op->getConstantOperandVal(1)) {
  // These tcgen05 intrinsics return a v2i32, which is legal, so we have to
  // lower them through LowerOperation() instead of ReplaceNodeResults().
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
    return MergeLdResult(lowerTcgen05Ld(Node, DAG));

  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
    return MergeLdResult(lowerTcgen05Ld(Node, DAG, /*HasOffset=*/true));

  case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x2_f32:
  case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x2_i32:
  case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x2_f32:
  case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x2_i32: {
    auto RedRes = lowerTcgen05LdRed(Node, DAG);
    if (!RedRes)
      return SDValue();
    auto &[Vec, Reduction, Chain] = *RedRes;
    return DAG.getMergeValues({Vec, Reduction, Chain}, SDLoc(Op));
  }

  default:
    return Op;
  }
}


// Custom lowering for INTRINSIC_WO_CHAIN nodes. The intrinsic ID is read from
// operand 0. Intrinsics not listed below are returned unchanged.
static SDValue lowerIntrinsicWOChain(SDValue Op, SelectionDAG &DAG) {
  const auto IntrinsicID = Op->getConstantOperandVal(0);
  switch (IntrinsicID) {
  // Address-space wrap: forward the wrapped operand directly.
  case Intrinsic::nvvm_internal_addrspace_wrap:
    return Op.getOperand(1);

  // prmt family.
  case Intrinsic::nvvm_prmt:
  case Intrinsic::nvvm_prmt_b4e:
  case Intrinsic::nvvm_prmt_ecl:
  case Intrinsic::nvvm_prmt_ecr:
  case Intrinsic::nvvm_prmt_f4e:
  case Intrinsic::nvvm_prmt_rc16:
  case Intrinsic::nvvm_prmt_rc8:
    return lowerPrmtIntrinsic(Op, DAG);

  // clusterlaunchcontrol query-cancel family.
  case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_is_canceled:
  case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_x:
  case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_y:
  case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_z:
    return LowerClusterLaunchControlQueryCancel(Op, DAG);

  // f32x4 -> packed narrow-float "rs" conversions.
  case Intrinsic::nvvm_f32x4_to_e4m3x4_rs_satfinite:
  case Intrinsic::nvvm_f32x4_to_e4m3x4_rs_relu_satfinite:
  case Intrinsic::nvvm_f32x4_to_e5m2x4_rs_satfinite:
  case Intrinsic::nvvm_f32x4_to_e5m2x4_rs_relu_satfinite:
  case Intrinsic::nvvm_f32x4_to_e2m3x4_rs_satfinite:
  case Intrinsic::nvvm_f32x4_to_e2m3x4_rs_relu_satfinite:
  case Intrinsic::nvvm_f32x4_to_e3m2x4_rs_satfinite:
  case Intrinsic::nvvm_f32x4_to_e3m2x4_rs_relu_satfinite:
  case Intrinsic::nvvm_f32x4_to_e2m1x4_rs_satfinite:
  case Intrinsic::nvvm_f32x4_to_e2m1x4_rs_relu_satfinite:
    return lowerCvtRSIntrinsics(Op, DAG);

  default:
    return Op;
  }
}


// PTX supports 64-bit CTLZ and CTPOP, but both produce a 32-bit result.
// Emit the operation with an i32 result and zero-extend it back to i64; the
// extension is tagged NonNeg.
static SDValue lowerCTLZCTPOP(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  SDValue Src = Op->getOperand(0);
  assert(Src.getValueType() == MVT::i64 &&
         "Unexpected CTLZ/CTPOP type to legalize");

  SDValue Count32 = DAG.getNode(Op->getOpcode(), DL, MVT::i32, Src);
  return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Count32,
                     SDNodeFlags::NonNeg);
}


// Expand a 64-bit funnel shift (Opcode is ISD::FSHL or ISD::FSHR) with a
// constant shift amount into two 32-bit funnel shifts over the unpacked
// halves of A and B. Returns an empty SDValue when the amount is not a
// constant.
static SDValue expandFSH64(SDValue A, SDValue B, SDValue ShiftAmount, SDLoc DL,
                           unsigned Opcode, SelectionDAG &DAG) {
  assert(A.getValueType() == MVT::i64 && B.getValueType() == MVT::i64);

  const auto *ConstAmt = dyn_cast<ConstantSDNode>(ShiftAmount);
  if (!ConstAmt)
    return SDValue();
  // Only the low 6 bits of the amount are used for a 64-bit shift.
  const auto Amt = ConstAmt->getZExtValue() & 63;

  SDValue HalvesA =
      DAG.getNode(NVPTXISD::UNPACK_VECTOR, DL, {MVT::i32, MVT::i32}, A);
  SDValue HalvesB =
      DAG.getNode(NVPTXISD::UNPACK_VECTOR, DL, {MVT::i32, MVT::i32}, B);

  // The architecture is little endian: result 0 holds the low bits, result 1
  // the high bits.
  SDValue ALo = HalvesA.getValue(0);
  SDValue AHi = HalvesA.getValue(1);
  SDValue BLo = HalvesB.getValue(0);
  SDValue BHi = HalvesB.getValue(1);

  // The bitfield consists of { AHi : ALo : BHi : BLo }
  //
  // * FSHL, Amt <  32 - The window will contain { AHi : ALo : BHi }
  // * FSHL, Amt >= 32 - The window will contain { ALo : BHi : BLo }
  // * FSHR, Amt <  32 - The window will contain { ALo : BHi : BLo }
  // * FSHR, Amt >= 32 - The window will contain { AHi : ALo : BHi }
  //
  // Amt = 0 and Amt = 32 are special cases that need no 32-bit funnel shift
  // at all: Amt = 0 is a no-op yielding A or B depending on the direction, and
  // Amt = 32 is just a repacking of the 32-bit halves. For simplicity they are
  // not special-cased here; DAGCombiner is relied upon to remove the no-op
  // funnel shifts that get inserted.
  const bool UpperWindow = (Opcode == ISD::FSHL) == (Amt < 32);
  SDValue High = UpperWindow ? AHi : ALo;
  SDValue Mid = UpperWindow ? ALo : BHi;
  SDValue Low = UpperWindow ? BHi : BLo;

  SDValue Amt32 = DAG.getConstant(Amt & 31, DL, MVT::i32);
  SDValue ResHi = DAG.getNode(Opcode, DL, MVT::i32, {High, Mid, Amt32});
  SDValue ResLo = DAG.getNode(Opcode, DL, MVT::i32, {Mid, Low, Amt32});

  return DAG.getNode(NVPTXISD::BUILD_VECTOR, DL, MVT::i64, {ResLo, ResHi});
}


// Lower FSHL/FSHR by handing its three operands and opcode to expandFSH64.
static SDValue lowerFSH(SDValue Op, SelectionDAG &DAG) {
  SDValue HiInput = Op->getOperand(0);
  SDValue LoInput = Op->getOperand(1);
  SDValue Amount = Op->getOperand(2);
  return expandFSH64(HiInput, LoInput, Amount, SDLoc(Op), Op->getOpcode(),
                     DAG);
}


// Lower ROTL/ROTR as a funnel shift whose two inputs are the same value:
// ROTL maps to FSHL, anything else maps to FSHR.
static SDValue lowerROT(SDValue Op, SelectionDAG &DAG) {
  const bool IsLeft = Op->getOpcode() == ISD::ROTL;
  const unsigned FshOpcode = IsLeft ? ISD::FSHL : ISD::FSHR;
  SDValue Rotated = Op->getOperand(0);
  SDValue Amount = Op->getOperand(1);
  return expandFSH64(Rotated, Rotated, Amount, SDLoc(Op), FshOpcode, DAG);
}


// Lower (frem x, y) into (sub x, (mul (ftrunc (div x, y)) y)), i.e. a
// "poor man's fmod()". Unless the node carries the no-infs flag, a select is
// added so that x is returned when y is infinite, matching the semantics of
// LLVM's frem.
static SDValue lowerFREM(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  const EVT VT = Op.getValueType();
  SDValue Dividend = Op->getOperand(0);
  SDValue Divisor = Op->getOperand(1);
  const SDNodeFlags NodeFlags = Op->getFlags();
  // The multiply and subtract additionally allow contraction.
  const auto ContractFlags = NodeFlags | SDNodeFlags::AllowContract;

  SDValue Quotient =
      DAG.getNode(ISD::FDIV, DL, VT, Dividend, Divisor, NodeFlags);
  SDValue Whole = DAG.getNode(ISD::FTRUNC, DL, VT, Quotient, NodeFlags);
  SDValue Product =
      DAG.getNode(ISD::FMUL, DL, VT, Whole, Divisor, ContractFlags);
  SDValue Remainder =
      DAG.getNode(ISD::FSUB, DL, VT, Dividend, Product, ContractFlags);

  if (NodeFlags.hasNoInfs())
    return Remainder;

  // If the divisor is infinite, return the dividend.
  SDValue AbsDivisor = DAG.getNode(ISD::FABS, DL, VT, Divisor);
  SDValue PosInf =
      DAG.getConstantFP(APFloat::getInf(VT.getFltSemantics()), DL, VT);
  SDValue DivisorIsInf =
      DAG.getSetCC(DL, MVT::i1, AbsDivisor, PosInf, ISD::SETEQ);
  return DAG.getSelect(DL, VT, DivisorIsInf, Dividend, Remainder);
}


// Custom lowering for i1 SELECT. If both selected values are truncates, the
// select is performed on the untruncated values and the result is truncated;
// otherwise the select is expanded into AND/OR logic.
static SDValue lowerSELECT(SDValue Op, SelectionDAG &DAG) {
  assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");

  SDLoc DL(Op);
  SDValue Cond = Op->getOperand(0);
  SDValue OnTrue = Op->getOperand(1);
  SDValue OnFalse = Op->getOperand(2);

  const bool BothTruncated = OnTrue.getOpcode() == ISD::TRUNCATE &&
                             OnFalse.getOpcode() == ISD::TRUNCATE;
  if (!BothTruncated) {
    // Expand the select into a series of logical operations. These often can
    // be folded into other operations either by us or ptxas.
    OnTrue = DAG.getFreeze(OnTrue);
    OnFalse = DAG.getFreeze(OnFalse);
    SDValue TakeTrue = DAG.getNode(ISD::AND, DL, MVT::i1, Cond, OnTrue);
    SDValue InvCond = DAG.getNOT(DL, Cond, MVT::i1);
    SDValue TakeFalse = DAG.getNode(ISD::AND, DL, MVT::i1, InvCond, OnFalse);
    return DAG.getNode(ISD::OR, DL, MVT::i1, TakeTrue, TakeFalse);
  }

  // Both operands are truncated: push the select through the truncates,
  // operating in the type of whichever source is not wider than the other.
  SDValue SrcTrue = OnTrue.getOperand(0);
  SDValue SrcFalse = OnFalse.getOperand(0);
  const bool TrueIsNarrower =
      SrcTrue.getSimpleValueType().bitsLE(SrcFalse.getSimpleValueType());
  const EVT SelVT =
      TrueIsNarrower ? SrcTrue.getValueType() : SrcFalse.getValueType();

  SrcTrue = DAG.getAnyExtOrTrunc(SrcTrue, DL, SelVT);
  SrcFalse = DAG.getAnyExtOrTrunc(SrcFalse, DL, SelVT);
  SDValue WideSelect = DAG.getSelect(DL, SelVT, Cond, SrcTrue, SrcFalse);
  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, WideSelect);
}


// Lower a masked vector store into an NVPTXISD::StoreV4 / StoreV8 node.
//
// The mask is expected to be a BUILD_VECTOR of constants. It is folded into
// the value operands: an enabled lane contributes the extracted element, a
// disabled lane contributes the sentinel register 0.
static SDValue lowerMSTORE(SDValue Op, SelectionDAG &DAG) {
  SDNode *N = Op.getNode();
  SDLoc DL(N);

  SDValue Chain = N->getOperand(0);
  SDValue Val = N->getOperand(1);
  SDValue BasePtr = N->getOperand(2);
  SDValue Offset = N->getOperand(3);
  SDValue Mask = N->getOperand(4);

  const EVT ValVT = Val.getValueType();
  MemSDNode *MemSD = cast<MemSDNode>(N);
  assert(ValVT.isVector() && "Masked vector store must have vector type");
  assert(MemSD->getAlign() >= DAG.getEVTAlign(ValVT) &&
         "Unexpected alignment for masked store");

  // Pick the target store opcode from the stored vector type.
  unsigned StoreOpc = 0;
  switch (ValVT.getSimpleVT().SimpleTy) {
  case MVT::v4i64:
  case MVT::v4f64:
    StoreOpc = NVPTXISD::StoreV4;
    break;
  case MVT::v8i32:
  case MVT::v8f32:
    StoreOpc = NVPTXISD::StoreV8;
    break;
  default:
    llvm_unreachable("Unexpected masked vector store type");
  }

  assert(Mask.getValueType().isVector() &&
         Mask.getValueType().getVectorElementType() == MVT::i1 &&
         "Mask must be a vector of i1");
  assert(Mask.getOpcode() == ISD::BUILD_VECTOR &&
         "Mask expected to be a BUILD_VECTOR");
  assert(Mask.getValueType().getVectorNumElements() ==
             ValVT.getVectorNumElements() &&
         "Mask size must be the same as the vector size");

  // Operand layout of the new node: chain, one value per lane, pointer,
  // offset.
  SmallVector<SDValue, 8> NewOps;
  NewOps.push_back(Chain);

  for (unsigned Lane = 0, NumLanes = Mask.getNumOperands(); Lane < NumLanes;
       ++Lane) {
    // Mask elements must be constants.
    const bool LaneEnabled =
        Mask.getOperand(Lane).getNode()->getAsZExtVal() != 0;
    if (LaneEnabled) {
      // Extract the element that is actually stored.
      NewOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
                                   ValVT.getVectorElementType(), Val,
                                   DAG.getIntPtrConstant(Lane, DL)));
      continue;
    }
    // A masked-off lane is represented by the sentinel register 0; this will
    // be handled in tablegen.
    NewOps.push_back(DAG.getRegister(MCRegister::NoRegister,
                                     ValVT.getVectorElementType()));
  }

  NewOps.push_back(BasePtr);

  // The offset is expected to always be undef and will be ignored in
  // lowering; it is still forwarded to mirror the handling of the other
  // vector store instructions.
  assert(Offset.getOpcode() == ISD::UNDEF &&
         "Offset operand expected to be undef");
  NewOps.push_back(Offset);

  return DAG.getMemIntrinsicNode(StoreOpc, DL, DAG.getVTList(MVT::Other),
                                 NewOps, MemSD->getMemoryVT(),
                                 MemSD->getMemOperand());
}


// Entry point for custom lowering: dispatch on the node's opcode to the
// matching lowering routine. Reaching an opcode with no entry here is a
// programming error (llvm_unreachable).
SDValue
NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  const unsigned Opc = Op.getOpcode();
  switch (Opc) {
  // --- Frame / return address: yield an empty SDValue. ---
  case ISD::RETURNADDR:
  case ISD::FRAMEADDR:
    return SDValue();

  // --- Address spaces and intrinsics. ---
  case ISD::ADDRSPACECAST:
    return LowerADDRSPACECAST(Op, DAG);
  case ISD::INTRINSIC_W_CHAIN:
    return lowerIntrinsicWChain(Op, DAG);
  case ISD::INTRINSIC_WO_CHAIN:
    return lowerIntrinsicWOChain(Op, DAG);
  case ISD::INTRINSIC_VOID:
    return lowerIntrinsicVoid(Op, DAG);

  // --- Vector construction and shuffling. ---
  case ISD::BUILD_VECTOR:
    return LowerBUILD_VECTOR(Op, DAG);
  case ISD::BITCAST:
    return LowerBITCAST(Op, DAG);
  case ISD::EXTRACT_SUBVECTOR:
    return Op;
  case ISD::EXTRACT_VECTOR_ELT:
    return LowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::INSERT_VECTOR_ELT:
    return LowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::VECTOR_SHUFFLE:
    return LowerVECTOR_SHUFFLE(Op, DAG);
  case ISD::CONCAT_VECTORS:
    return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::VECREDUCE_FMAX:
  case ISD::VECREDUCE_FMIN:
  case ISD::VECREDUCE_FMAXIMUM:
  case ISD::VECREDUCE_FMINIMUM:
    return LowerVECREDUCE(Op, DAG);

  // --- Memory operations. ---
  case ISD::STORE:
    return LowerSTORE(Op, DAG);
  case ISD::MSTORE: {
    assert(STI.has256BitVectorLoadStore(
               cast<MemSDNode>(Op.getNode())->getAddressSpace()) &&
           "Masked store vector not supported on subtarget.");
    return lowerMSTORE(Op, DAG);
  }
  case ISD::LOAD:
    return LowerLOAD(Op, DAG);
  case ISD::MLOAD:
    return LowerMLOAD(Op, DAG);

  // --- Multi-part shifts and select. ---
  case ISD::SHL_PARTS:
    return LowerShiftLeftParts(Op, DAG);
  case ISD::SRA_PARTS:
  case ISD::SRL_PARTS:
    return LowerShiftRightParts(Op, DAG);
  case ISD::SELECT:
    return lowerSELECT(Op, DAG);

  // --- Floating-point rounding and conversions. ---
  case ISD::FROUND:
    return LowerFROUND(Op, DAG);
  case ISD::FCOPYSIGN:
    return LowerFCOPYSIGN(Op, DAG);
  case ISD::SINT_TO_FP:
  case ISD::UINT_TO_FP:
    return LowerINT_TO_FP(Op, DAG);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT: {
    if (Op.getValueType() != MVT::i1)
      return LowerFP_TO_INT(Op, DAG);

    // fptosi/fptoui to i1 truncate toward zero, so the only defined results
    // are {0,-1} (signed) and {0,1} (unsigned); every other input results in
    // poison. Thus we can simply lower to `x <= -1.0` or `x >= 1.0`.
    SDLoc DL(Op);
    SDValue Src = Op.getOperand(0);
    const bool Signed = Opc == ISD::FP_TO_SINT;
    SDValue Bound =
        DAG.getConstantFP(Signed ? -1.0 : 1.0, DL, Src.getValueType());
    return DAG.getSetCC(DL, MVT::i1, Src, Bound,
                        Signed ? ISD::SETOLE : ISD::SETOGE);
  }
  case ISD::FP_ROUND:
    return LowerFP_ROUND(Op, DAG);
  case ISD::FP_EXTEND:
    return LowerFP_EXTEND(Op, DAG);

  // --- Varargs. ---
  case ISD::VAARG:
    return LowerVAARG(Op, DAG);
  case ISD::VASTART:
    return LowerVASTART(Op, DAG);

  // --- Funnel shifts and rotates. ---
  case ISD::FSHL:
  case ISD::FSHR:
    return lowerFSH(Op, DAG);
  case ISD::ROTL:
  case ISD::ROTR:
    return lowerROT(Op, DAG);

  // --- Integer arithmetic routed through LowerVectorArith. ---
  case ISD::ABS:
  case ISD::ABS_MIN_POISON:
  case ISD::SMIN:
  case ISD::SMAX:
  case ISD::UMIN:
  case ISD::UMAX:
  case ISD::ADD:
  case ISD::SUB:
  case ISD::MUL:
  case ISD::SHL:
  case ISD::SREM:
  case ISD::UREM:
    return LowerVectorArith(Op, DAG);

  // --- Stack management. ---
  case ISD::DYNAMIC_STACKALLOC:
    return LowerDYNAMIC_STACKALLOC(Op, DAG);
  case ISD::STACKRESTORE:
    return LowerSTACKRESTORE(Op, DAG);
  case ISD::STACKSAVE:
    return LowerSTACKSAVE(Op, DAG);

  case ISD::CopyToReg:
    return LowerCopyToReg_128(Op, DAG);

  // --- Floating-point arithmetic. ---
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
    // Used only for bf16 on SM80, where we select fma for non-ftz operation
    return PromoteBinOpIfF32FTZ(Op, DAG);
  case ISD::FREM:
    return lowerFREM(Op, DAG);

  // --- Bit counting and byte swapping. ---
  case ISD::CTPOP:
  case ISD::CTLZ:
    return lowerCTLZCTPOP(Op, DAG);
  case ISD::BSWAP:
    return lowerBSWAP(Op, DAG);

  default:
    llvm_unreachable("Custom lowering not defined for operation");
  }
}


// Report jump tables as inline-encoded. This prevents AsmPrinter from trying
// to print the jump tables itself.
unsigned NVPTXTargetLowering::getJumpTableEncoding() const {
  const unsigned Encoding = MachineJumpTableInfo::EK_Inline;
  return Encoding;
}


// Custom lowering for ADDRSPACECAST.
//  * Casts to or from the generic address space are returned unchanged.
//  * Casts between shared and shared-cluster are split into two casts that go
//    through the generic address space.
//  * Any other cast between two non-generic spaces becomes UNDEF.
SDValue NVPTXTargetLowering::LowerADDRSPACECAST(SDValue Op,
                                                SelectionDAG &DAG) const {
  AddrSpaceCastSDNode *Cast = cast<AddrSpaceCastSDNode>(Op.getNode());
  const unsigned SrcAS = Cast->getSrcAddressSpace();
  const unsigned DestAS = Cast->getDestAddressSpace();

  if (SrcAS == llvm::ADDRESS_SPACE_GENERIC ||
      DestAS == llvm::ADDRESS_SPACE_GENERIC)
    return Op;

  const bool SharedToCluster =
      SrcAS == llvm::ADDRESS_SPACE_SHARED &&
      DestAS == llvm::ADDRESS_SPACE_SHARED_CLUSTER;
  const bool ClusterToShared =
      SrcAS == llvm::ADDRESS_SPACE_SHARED_CLUSTER &&
      DestAS == llvm::ADDRESS_SPACE_SHARED;
  if (!SharedToCluster && !ClusterToShared)
    return DAG.getUNDEF(Op.getValueType());

  // Shared and SharedCluster can be converted to each other through generic
  // space.
  SDLoc DL(Op.getNode());
  const MVT GenericPtrVT =
      getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_GENERIC);
  SDValue ToGeneric = DAG.getAddrSpaceCast(DL, GenericPtrVT, Op.getOperand(0),
                                           SrcAS, ADDRESS_SPACE_GENERIC);
  return DAG.getAddrSpaceCast(DL, Op.getValueType(), ToGeneric,
                              ADDRESS_SPACE_GENERIC, DestAS);
}


// Lower VAARG. This is almost a copy of SelectionDAG::expandVAArg(); the only
// difference is that the argument itself is loaded from the local address
// space.
//
// Steps: load the current va_list pointer, optionally round it up to the
// requested alignment, store back the pointer advanced past the argument,
// then load the argument from the (aligned) pointer.
SDValue NVPTXTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
  const TargetLowering *TLI = STI.getTargetLowering();
  SDLoc DL(Op);

  SDNode *Node = Op.getNode();
  const Value *ListSrc = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
  const EVT ArgVT = Node->getValueType(0);
  auto *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
  SDValue InChain = Node->getOperand(0);
  SDValue ListAddr = Node->getOperand(1);
  const MaybeAlign ArgAlign(Node->getConstantOperandVal(3));

  SDValue ListLoad = DAG.getLoad(TLI->getPointerTy(DAG.getDataLayout()), DL,
                                 InChain, ListAddr,
                                 MachinePointerInfo(ListSrc));
  SDValue ArgPtr = ListLoad;
  const EVT PtrVT = ArgPtr.getValueType();

  // Round the pointer up when the argument needs more than the minimum stack
  // argument alignment: (ptr + align - 1) & -align.
  if (ArgAlign && *ArgAlign > TLI->getMinStackArgumentAlignment()) {
    SDValue Bias = DAG.getConstant(ArgAlign->value() - 1, DL, PtrVT);
    ArgPtr = DAG.getNode(ISD::ADD, DL, PtrVT, ArgPtr, Bias);

    SDValue AlignMask = DAG.getSignedConstant(
        -static_cast<int64_t>(ArgAlign->value()), DL, PtrVT);
    ArgPtr = DAG.getNode(ISD::AND, DL, PtrVT, ArgPtr, AlignMask);
  }

  // Advance the pointer to the next vaarg.
  SDValue ArgSize = DAG.getConstant(
      DAG.getDataLayout().getTypeAllocSize(ArgTy), DL, PtrVT);
  SDValue NextPtr = DAG.getNode(ISD::ADD, DL, PtrVT, ArgPtr, ArgSize);

  // Store the advanced pointer back, chained after the va_list load.
  SDValue StoreChain = DAG.getStore(ListLoad.getValue(1), DL, NextPtr,
                                    ListAddr, MachinePointerInfo(ListSrc));

  const Value *LocalSrc = Constant::getNullValue(
      PointerType::get(*DAG.getContext(), ADDRESS_SPACE_LOCAL));

  // Load the actual argument through the (aligned) va_list pointer.
  return DAG.getLoad(ArgVT, DL, StoreChain, ArgPtr,
                     MachinePointerInfo(LocalSrc));
}


// Lower VASTART by storing the address of the vararg parameter symbol into
// the va_list object (operand 1), chained on operand 0.
SDValue NVPTXTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  const TargetLowering *TLI = STI.getTargetLowering();
  const EVT PointerVT = TLI->getPointerTy(DAG.getDataLayout());

  // Address of the unsized array <function>_vararg[], to be stored in the ap
  // object.
  SDValue VarArgBase = getParamSymbol(DAG, /* vararg */ -1, PointerVT);

  SDValue InChain = Op.getOperand(0);
  SDValue ListAddr = Op.getOperand(1);
  const Value *ListSrc = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  return DAG.getStore(InChain, DL, VarArgBase, ListAddr,
                      MachinePointerInfo(ListSrc));
}


// Convert a masked vector load into a plain load plus a "used bytes" mask.
//
// Each mask element contributes one bit per byte of the corresponding vector
// element, with the first element in the least significant bits. Returns the
// newly created load node and the mask; the mask is forced to UINT32_MAX when
// the subtarget lacks the used-bytes-mask pragma.
static std::pair<MemSDNode *, uint32_t>
convertMLOADToLoadWithUsedBytesMask(MemSDNode *N, SelectionDAG &DAG,
                                    const NVPTXSubtarget &STI) {
  SDLoc DL(N);
  SDValue Chain = N->getOperand(0);
  SDValue BasePtr = N->getOperand(1);
  SDValue Mask = N->getOperand(3);
  [[maybe_unused]] SDValue Passthru = N->getOperand(4);

  const EVT ResVT = N->getValueType(0);
  assert(ResVT.isVector() && "Masked vector load must have vector type");
  // While we only expect poison passthru vectors as an input to the backend,
  // when the legalization framework splits a poison vector in half, it creates
  // two undef vectors, so we can technically expect those too.
  assert((Passthru.getOpcode() == ISD::POISON ||
          Passthru.getOpcode() == ISD::UNDEF) &&
         "Passthru operand expected to be poison or undef");

  // Per-element byte count and the run of set bits that covers one element.
  const uint32_t EltBits = ResVT.getVectorElementType().getSizeInBits();
  assert(EltBits % 8 == 0 && "Unexpected element size");
  const uint32_t EltBytes = EltBits / 8;
  const uint32_t EltByteMask = (1u << EltBytes) - 1u;

  // Walk the mask from the last element to the first, shifting previously
  // collected bits up by one element each step. The shift on the very first
  // step is a no-op because the accumulator is still 0.
  uint32_t UsedBytesMask = 0;
  for (SDValue MaskElt : reverse(Mask->ops())) {
    UsedBytesMask <<= EltBytes;
    // Mask elements must be constants.
    const bool Enabled = MaskElt->getAsZExtVal() != 0;
    if (Enabled)
      UsedBytesMask |= EltByteMask;
  }

  assert(UsedBytesMask != 0 && UsedBytesMask != UINT32_MAX &&
         "Unexpected masked load with elements masked all on or all off");

  // Create a new load sd node to be handled normally by ReplaceLoadVector.
  SDValue PlainLoad =
      DAG.getLoad(ResVT, DL, Chain, BasePtr, N->getMemOperand());
  MemSDNode *NewLD = cast<MemSDNode>(PlainLoad.getNode());

  // If our subtarget does not support the used bytes mask pragma, "drop" the
  // mask by setting it to UINT32_MAX
  const uint32_t EffectiveMask =
      STI.hasUsedBytesMaskPragma() ? UsedBytesMask : UINT32_MAX;

  return std::make_pair(NewLD, EffectiveMask);
}


/// replaceLoadVector - Convert vector loads into multi-output scalar loads.
///
/// Returns the {loaded value, chain} pair on success. Returns std::nullopt
/// when the load is extending, has no vector lowering shape, is
/// under-aligned, or would need an unsupported number of pieces.
static std::optional<std::pair<SDValue, SDValue>>
replaceLoadVector(SDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI) {
  MemSDNode *LD = cast<MemSDNode>(N);
  const EVT ResVT = LD->getValueType(0);
  const EVT MemVT = LD->getMemoryVT();

  // If we're doing sign/zero extension as part of the load, avoid lowering to
  // a LoadV node. TODO: consider relaxing this restriction.
  if (ResVT != MemVT)
    return std::nullopt;

  const auto Shape = getVectorLoweringShape(ResVT, STI, LD->getAddressSpace());
  if (!Shape)
    return std::nullopt;
  const auto [NumElts, EltVT] = Shape.value();

  // Bail out on an insufficiently aligned load and let it be scalarized.
  // Smaller vector loads may still be emitted later: a <4 x float> load with
  // alignment 8 fails this check, but the legalizer retries with
  // 2 x <2 x float>, which succeeds with an alignment of 8.
  const Align ActualAlign = LD->getAlign();
  const auto &Layout = DAG.getDataLayout();
  const Align WantedAlign =
      Layout.getPrefTypeAlign(MemVT.getTypeForEVT(*DAG.getContext()));
  if (ActualAlign < WantedAlign)
    return std::nullopt;

  // If we have a masked load, convert it to a normal load now
  std::optional<uint32_t> UsedBytesMask = std::nullopt;
  if (LD->getOpcode() == ISD::MLOAD)
    std::tie(LD, UsedBytesMask) =
        convertMLOADToLoadWithUsedBytesMask(LD, DAG, STI);

  // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
  // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
  // loaded type to i16 and propagate the "real" type as the memory type.
  const MVT LoadEltVT = (EltVT.getSizeInBits() < 16) ? MVT::i16 : EltVT;

  unsigned LoadOpc;
  switch (NumElts) {
  case 2:
    LoadOpc = NVPTXISD::LoadV2;
    break;
  case 4:
    LoadOpc = NVPTXISD::LoadV4;
    break;
  case 8:
    LoadOpc = NVPTXISD::LoadV8;
    break;
  default:
    return std::nullopt;
  }

  // Results of the new node: NumElts pieces followed by the chain.
  SmallVector<EVT, 9> PieceVTs(NumElts, LoadEltVT);
  PieceVTs.push_back(MVT::Other);
  SDVTList LdResVTs = DAG.getVTList(PieceVTs);

  SDLoc DL(LD);

  // Operands: the regular load operands, then the used-bytes mask, then the
  // extension type.
  SmallVector<SDValue, 8> NewOps(LD->ops());
  NewOps.push_back(
      DAG.getConstant(UsedBytesMask.value_or(UINT32_MAX), DL, MVT::i32));

  // The select routine does not have access to the LoadSDNode instance, so
  // pass along the extension information
  NewOps.push_back(
      DAG.getIntPtrConstant(cast<LoadSDNode>(LD)->getExtensionType(), DL));

  SDValue NewLD = DAG.getMemIntrinsicNode(LoadOpc, DL, LdResVTs, NewOps, MemVT,
                                          LD->getMemOperand());

  // Flatten the pieces into individual scalar elements.
  SmallVector<SDValue> Scalars;
  if (!EltVT.isVector()) {
    for (unsigned I = 0; I < NumElts; ++I) {
      SDValue Piece = NewLD.getValue(I);
      // Narrow pieces were loaded as LoadEltVT; truncate to the real type.
      if (LoadEltVT != EltVT)
        Piece = DAG.getNode(ISD::TRUNCATE, DL, EltVT, Piece);
      Scalars.push_back(Piece);
    }
  } else {
    assert(EVT(EltVT.getVectorElementType()) == ResVT.getVectorElementType());
    assert(NumElts * EltVT.getVectorNumElements() ==
           ResVT.getVectorNumElements());
    // Generate EXTRACT_VECTOR_ELTs to split v2[i,f,bf]16/v4i8 subvectors back
    // into individual elements.
    for (unsigned I = 0; I < NumElts; ++I)
      DAG.ExtractVectorElements(NewLD.getValue(I), Scalars);
  }

  // The chain is the result that follows the NumElts pieces.
  SDValue LoadChain = NewLD.getValue(NumElts);

  const MVT BuildVecVT =
      MVT::getVectorVT(EltVT.getScalarType(), Scalars.size());
  SDValue BuildVec = DAG.getBuildVector(BuildVecVT, DL, Scalars);
  SDValue LoadValue = DAG.getBitcast(ResVT, BuildVec);

  return std::make_pair(LoadValue, LoadChain);
}


// ReplaceNodeResults-style wrapper: when the vector load can be converted,
// append its value and chain to Results; otherwise leave Results untouched.
static void replaceLoadVector(SDNode *N, SelectionDAG &DAG,
                              SmallVectorImpl<SDValue> &Results,
                              const NVPTXSubtarget &STI) {
  const auto Converted = replaceLoadVector(N, DAG, STI);
  if (!Converted)
    return;
  Results.push_back(Converted->first);
  Results.push_back(Converted->second);
}


// LowerOperation-style wrapper: when the vector load can be converted, return
// its value and chain as a single merged node; otherwise return an empty
// SDValue.
static SDValue lowerLoadVector(SDNode *N, SelectionDAG &DAG,
                               const NVPTXSubtarget &STI) {
  const auto Converted = replaceLoadVector(N, DAG, STI);
  if (!Converted)
    return SDValue();
  return DAG.getMergeValues({Converted->first, Converted->second}, SDLoc(N));
}


// Custom lowering for a non-extending i1 load:
//
// v = ld i1* addr
//   =>
// v1 = ld i8* addr (-> i16)
// v = trunc i16 to i1
//
// Returns a MergeValues node holding the truncated value and the output
// chain of the replacement load.
static SDValue lowerLOADi1(LoadSDNode *LD, SelectionDAG &DAG) {
  SDLoc dl(LD);
  assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
  assert(LD->getValueType(0) == MVT::i1 && "Custom lowering for i1 load only");
  SDValue newLD = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i16, LD->getChain(),
                                 LD->getBasePtr(), LD->getPointerInfo(),
                                 MVT::i8, LD->getAlign(),
                                 LD->getMemOperand()->getFlags());
  SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
  // The legalizer (the caller) is expecting two values from the legalized
  // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
  // in LegalizeDAG.cpp which also uses MergeValues.
  //
  // The chain result must be the output chain of the new load, not the input
  // chain of the original one; otherwise memory operations chained after the
  // original load would not be ordered after the replacement load.
  return DAG.getMergeValues({result, newLD.getValue(1)}, dl);
}


// Custom lowering for scalar loads. Handles i1 loads and integer any-extend
// loads; any other load reaching this point is a programming error.
SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  LoadSDNode *Load = cast<LoadSDNode>(Op);

  const EVT ResultVT = Op.getValueType();
  if (ResultVT == MVT::i1)
    return lowerLOADi1(Load, DAG);

  if (Load->getExtensionType() != ISD::EXTLOAD)
    llvm_unreachable("Unexpected custom lowering for load");

  // To improve CodeGen we'll legalize any-extend loads to zext loads. This is
  // how they'll be lowered in ISel anyway, and by doing this a little earlier
  // we allow for more DAG combine opportunities.
  assert(Load->getValueType(0).isInteger() &&
         Load->getMemoryVT().isInteger() && "Unexpected fpext-load");
  return DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(Op), ResultVT, Load->getChain(),
                        Load->getBasePtr(), Load->getMemoryVT(),
                        Load->getMemOperand());
}


// Custom lowering for masked loads of packed vector types.
//
// v2f16/v2bf16/v2i16/v4i8 are legal, so the legalizer cannot be relied upon
// to handle masked loads of these types; they are handled here. v2f32 is
// handled here as well when the subtarget has f32x2 instructions, which make
// it legal.
//
// Note: misaligned masked loads should never reach this point because the
// override of isLegalMaskedLoad in NVPTXTargetTransformInfo.cpp validates
// alignment, so they are not special-cased here.
//
// Returns an empty SDValue for any non-packed vector type.
SDValue NVPTXTargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
  const EVT ResultVT = Op.getValueType();
  if (!NVPTX::isPackedVectorTy(ResultVT))
    return SDValue();

  // Turn the masked load into a plain load plus a used-bytes mask.
  auto [PlainLoad, UsedBytesMask] = convertMLOADToLoadWithUsedBytesMask(
      cast<MemSDNode>(Op.getNode()), DAG, STI);

  SDLoc DL(PlainLoad);

  // Operands: the regular load operands, then the used-bytes mask, then the
  // extension type.
  SmallVector<SDValue, 8> NewOps(PlainLoad->ops());
  NewOps.push_back(DAG.getConstant(UsedBytesMask, DL, MVT::i32));

  // We currently are not lowering extending loads, but pass the extension
  // type anyway as later handling expects it.
  NewOps.push_back(DAG.getIntPtrConstant(
      cast<LoadSDNode>(PlainLoad)->getExtensionType(), DL));

  return DAG.getMemIntrinsicNode(NVPTXISD::MLoad, DL, PlainLoad->getVTList(),
                                 NewOps, PlainLoad->getMemoryVT(),
                                 PlainLoad->getMemOperand());
}


// Rewrites a non-truncating vector store as an NVPTXISD::StoreV2, StoreV4 or
// StoreV8 memory-intrinsic node. Returns an empty SDValue when the store is
// truncating, when getVectorLoweringShape yields no shape, when the store is
// aligned below the preferred alignment of the value type, or when the shape
// has an element count other than 2, 4 or 8.
static SDValue lowerSTOREVector(SDValue Op, SelectionDAG &DAG,
                                const NVPTXSubtarget &STI) {
  MemSDNode *const Store = cast<MemSDNode>(Op.getNode());
  const SDValue StoredVal = Store->getOperand(1);
  const SDLoc Loc(Store);
  const EVT StoredVT = StoredVal.getValueType();

  // A store that truncates (value type differs from memory type) is not
  // lowered to a StoreV node.
  // TODO: consider relaxing this restriction.
  if (StoredVT != Store->getMemoryVT())
    return SDValue();

  const auto Shape =
      getVectorLoweringShape(StoredVT, STI, Store->getAddressSpace());
  if (!Shape.has_value())
    return SDValue();
  const auto [NumElts, EltVT] = *Shape;

  // An under-aligned store is left alone so that it gets scalarized. Smaller
  // vector stores may still be emitted later: for instance a <4 x float>
  // store with alignment 8 fails this check, but the legalizer retries with
  // 2 x <2 x float>, which succeeds with alignment 8.
  const DataLayout &Layout = DAG.getDataLayout();
  const Align ActualAlign = Store->getAlign();
  const Align PreferredAlign =
      Layout.getPrefTypeAlign(StoredVT.getTypeForEVT(*DAG.getContext()));
  if (ActualAlign < PreferredAlign)
    return SDValue();

  unsigned StoreOpc = 0;
  switch (NumElts) {
  case 2:
    StoreOpc = NVPTXISD::StoreV2;
    break;
  case 4:
    StoreOpc = NVPTXISD::StoreV4;
    break;
  case 8:
    StoreOpc = NVPTXISD::StoreV8;
    break;
  default:
    return SDValue();
  }

  // Operand layout: chain, the pieces of the stored value, then every operand
  // that followed the value on the original store.
  SmallVector<SDValue, 8> Operands;
  Operands.push_back(Store->getOperand(0));

  if (!EltVT.isVector()) {
    // Scalar pieces: view the value as NumElts x EltVT and pull out each lane.
    const SDValue AsVector =
        DAG.getBitcast(MVT::getVectorVT(EltVT, NumElts), StoredVal);
    // StoreV* are target nodes, so DAG type legalization cannot be relied on
    // and the operand type must already be legal. For i1 and i8 the stored
    // type is set to i16 while the "real" type is carried as the memory type.
    const bool NeedsWidening = EltVT.getSizeInBits() < 16;
    for (const unsigned Idx : llvm::seq(NumElts)) {
      SDValue Piece = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Loc, EltVT, AsVector,
                                  DAG.getIntPtrConstant(Idx, Loc));
      if (NeedsWidening)
        Piece = DAG.getNode(ISD::ANY_EXTEND, Loc, MVT::i16, Piece);
      Operands.push_back(Piece);
    }
  } else {
    // Sub-vector pieces: regroup individual elements into v2[i,f,bf]16/v4i8
    // sub-vectors, each of which is stored as a b32.
    assert(EVT(EltVT.getVectorElementType()) ==
           StoredVT.getVectorElementType());
    const unsigned LanesPerPiece = EltVT.getVectorNumElements();
    assert(NumElts * LanesPerPiece == StoredVT.getVectorNumElements());
    for (const unsigned Idx : llvm::seq(NumElts)) {
      SmallVector<SDValue, 4> Lanes;
      DAG.ExtractVectorElements(StoredVal, Lanes, Idx * LanesPerPiece,
                                LanesPerPiece);
      Operands.push_back(DAG.getBuildVector(EltVT, Loc, Lanes));
    }
  }

  Operands.append(Store->op_begin() + 2, Store->op_end());

  return DAG.getMemIntrinsicNode(StoreOpc, Loc, DAG.getVTList(MVT::Other),
                                 Operands, Store->getMemoryVT(),
                                 Store->getMemOperand());
}


SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  // Stores with an i1 memory type have a dedicated lowering. Everything else
  // goes through the vector-store lowering; that includes v2f32, which we want
  // to break apart because it is not a widely-supported type.
  const EVT MemVT = cast<StoreSDNode>(Op)->getMemoryVT();
  return MemVT == MVT::i1 ? LowerSTOREi1(Op, DAG)
                          : lowerSTOREVector(Op, DAG, STI);
}


// st i1 v, addr
//    =>
// v1 = zext v to i16
// st.u8 i16, addr
SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
  StoreSDNode *const Store = cast<StoreSDNode>(Op.getNode());
  const SDLoc Loc(Store);

  const SDValue Chain = Store->getChain();
  const SDValue Addr = Store->getBasePtr();
  const SDValue Bit = Store->getValue();
  assert(Bit.getValueType() == MVT::i1 && "Custom lowering for i1 store only");

  // Widen the i1 to i16, then emit a truncating store whose memory type is i8,
  // reusing the original store's pointer info, alignment and memory flags.
  const SDValue Widened = DAG.getNode(ISD::ZERO_EXTEND, Loc, MVT::i16, Bit);
  return DAG.getTruncStore(Chain, Loc, Widened, Addr, Store->getPointerInfo(),
                           MVT::i8, Store->getAlign(),
                           Store->getMemOperand()->getFlags());
}


SDValue NVPTXTargetLowering::LowerCopyToReg_128(SDValue Op,
                                                SelectionDAG &DAG) const {
  // Rebuild the CopyToReg so that it carries two 64-bit value operands in
  // place of the single 128-bit one, which lets it pass legalization.
  assert(Op.getOperand(1).getValueType() == MVT::i128 &&
         "Custom lowering for 128-bit CopyToReg only");

  SDNode *const CopyNode = Op.getNode();
  const SDLoc Loc(CopyNode);
  const unsigned OrigNumOps = CopyNode->getNumOperands();

  // View the 128-bit value as v2i64 and extract element 0 (lower 64 bits) and
  // element 1 (higher 64 bits).
  const SDValue AsV2I64 = DAG.getBitcast(MVT::v2i64, CopyNode->getOperand(2));
  const auto ExtractHalf = [&](unsigned Idx) {
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Loc, MVT::i64, AsV2I64,
                       DAG.getIntPtrConstant(Idx, Loc));
  };
  const SDValue LoHalf = ExtractHalf(0);
  const SDValue HiHalf = ExtractHalf(1);

  // The new node has one operand more than the original.
  SmallVector<SDValue, 5> Operands(OrigNumOps + 1);
  Operands[0] = CopyNode->getOperand(0); // Chain
  Operands[1] = CopyNode->getOperand(1); // Destination register
  Operands[2] = LoHalf;
  Operands[3] = HiHalf;
  if (OrigNumOps == 4)
    Operands[4] = CopyNode->getOperand(3); // Glue, when present

  const SmallVector<EVT, 3> ResultVTs(CopyNode->values());
  return DAG.getNode(ISD::CopyToReg, Loc, ResultVTs, Operands);
}


unsigned NVPTXTargetLowering::getNumRegisters(
    LLVMContext &Context, EVT VT,
    std::optional<MVT> RegisterVT = std::nullopt) const {
  // An i128 value that is explicitly requested in an i128 register occupies a
  // single register; every other query is answered by the base class.
  const bool IsWholeI128 = VT == MVT::i128 && RegisterVT == MVT::i128;
  return IsWholeI128
             ? 1u
             : TargetLoweringBase::getNumRegisters(Context, VT, RegisterVT);
}


bool NVPTXTargetLowering::splitValueIntoRegisterParts(
    SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
    unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
  // Only one case is handled here: an i128 value going into exactly one part
  // is passed through unsplit. Returning false reports every other case as
  // not handled by this hook.
  const bool IsSingleI128 = Val.getValueType() == MVT::i128 && NumParts == 1;
  if (IsSingleI128)
    Parts[0] = Val;
  return IsSingleI128;
}


// Creates the external symbol for a parameter of the current function.
// The symbol name is composed from the parameter index and the function name.
// A negative index corresponds to the special parameter (an unsized array)
// used for passing variable arguments.
SDValue NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int I,
                                            EVT T) const {
  auto &Fn = DAG.getMachineFunction().getFunction();
  // Store the name in the target machine's string pool and reference the
  // saved copy from the symbol node.
  const StringRef PooledName = nvTM->getStrPool().save(getParamName(&Fn, I));
  return DAG.getExternalSymbol(PooledName.data(), T);
}


SDValue NVPTXTargetLowering::getCallParamSymbol(SelectionDAG &DAG, int I,
                                                EVT T) const {
  // The symbol is named "param<I>"; the name is stored in the target
  // machine's string pool and the saved copy is referenced by the node.
  return DAG.getExternalSymbol(
      nvTM->getStrPool().save("param" + Twine(I)).data(), T);
}


/// Lowers the incoming formal arguments of the current function.
///
/// For every IR argument, one SDValue per matching entry of \p Ins is appended
/// to \p InVals:
///   * dead arguments produce UNDEF values;
///   * byval arguments produce the parameter symbol itself (kernels) or a
///     MoveParam of the symbol cast from the local to the generic address
///     space (other functions);
///   * all other arguments are loaded from the parameter symbol, possibly as
///     vectorized loads whose elements are then extracted.
///
/// \p isVarArg is not consulted here. \returns \p Chain unchanged.
SDValue NVPTXTargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  const DataLayout &DL = DAG.getDataLayout();
  LLVMContext &Ctx = *DAG.getContext();
  auto PtrVT = getPointerTy(DL);

  const Function &F = DAG.getMachineFunction().getFunction();
  const bool IsKernel = isKernelFunction(F);

  // All parameter loads created below are chained to the current DAG root.
  SDValue Root = DAG.getRoot();

  // The number of IR arguments and Ins.size() need not match.
  // Ins.size() will be larger
  //   * if there is an aggregate argument with multiple fields (each field
  //     showing up separately in Ins)
  //   * if there is a vector argument with more than typical vector-length
  //     elements (generally if more than 4) where each vector element is
  //     individually present in Ins.
  // So Ins is consumed front-to-back, peeling off the entries that belong to
  // each argument in turn.
  // See similar issue in LowerCall.

  auto AllIns = ArrayRef(Ins);
  for (const auto &Arg : F.args()) {
    // Take the leading run of Ins entries that originate from this argument.
    const auto ArgIns = AllIns.take_while([&](const auto &Input) {
      return Input.OrigArgIndex == Arg.getArgNo();
    });
    AllIns = AllIns.drop_front(ArgIns.size());

    Type *Ty = Arg.getType();

    if (ArgIns.empty())
      report_fatal_error("Empty parameter types are not supported");

    if (Arg.use_empty()) {
      // argument is dead
      for (const auto &In : ArgIns) {
        assert(!In.Used && "Arg.use_empty() is true but Arg is used?");
        InVals.push_back(DAG.getUNDEF(In.VT));
      }
      continue;
    }

    SDValue ArgSymbol = getParamSymbol(DAG, Arg.getArgNo(), PtrVT);

    // In the following cases, assign a node order of "Arg.getArgNo() + 1"
    // to newly created nodes. The SDNodes for params have to
    // appear in the same order as their order of appearance
    // in the original function.
    if (Arg.hasByValAttr()) {
      // Param has ByVal attribute
      // Return MoveParam(param symbol).
      // Ideally, the param symbol can be returned directly,
      // but when SDNode builder decides to use it in a CopyToReg(),
      // machine instruction fails because TargetExternalSymbol
      // (not lowered) is target dependent, and CopyToReg assumes
      // the source is lowered.
      assert(ArgIns.size() == 1 && "ByVal argument must be a pointer");
      const auto &ByvalIn = ArgIns[0];
      assert(getValueType(DL, Ty) == ByvalIn.VT &&
             "Ins type did not match function type");
      assert(ByvalIn.VT == PtrVT && "ByVal argument must be a pointer");

      SDValue P;
      if (IsKernel) {
        assert(isParamGridConstant(Arg) && "ByVal argument must be lowered to "
                                           "grid_constant by NVPTXLowerArgs");
        P = ArgSymbol;
        P.getNode()->setIROrder(Arg.getArgNo() + 1);
      } else {
        P = DAG.getNode(NVPTXISD::MoveParam, dl, ByvalIn.VT, ArgSymbol);
        P.getNode()->setIROrder(Arg.getArgNo() + 1);
        P = DAG.getAddrSpaceCast(dl, ByvalIn.VT, P, ADDRESS_SPACE_LOCAL,
                                 ADDRESS_SPACE_GENERIC);
      }
      InVals.push_back(P);
    } else {
      SmallVector<EVT, 16> VTs;
      SmallVector<uint64_t, 16> Offsets;
      ComputePTXValueVTs(*this, DL, Ctx, CallConv, Ty, VTs, Offsets);
      assert(VTs.size() == ArgIns.size() && "Size mismatch");
      assert(VTs.size() == Offsets.size() && "Size mismatch");

      const Align ArgAlign = getFunctionArgumentAlignment(
          &F, Ty, Arg.getArgNo() + AttributeList::FirstArgIndex, DL);

      // I indexes the first piece covered by the current (possibly
      // vectorized) load; each load covers NumElts consecutive pieces.
      unsigned I = 0;
      const auto VI = VectorizePTXValueVTs(VTs, Offsets, ArgAlign);
      for (const unsigned NumElts : VI) {
        // i1 is loaded/stored as i8
        const EVT LoadVT = VTs[I] == MVT::i1 ? MVT::i8 : VTs[I];
        const EVT VecVT = getVectorizedVT(LoadVT, NumElts, Ctx);

        SDValue VecAddr = DAG.getObjectPtrOffset(
            dl, ArgSymbol, TypeSize::getFixed(Offsets[I]));

        const Align PartAlign = commonAlignment(ArgAlign, Offsets[I]);
        const unsigned AS = IsKernel ? NVPTX::AddressSpace::EntryParam
                                     : NVPTX::AddressSpace::DeviceParam;
        SDValue P = DAG.getLoad(VecVT, dl, Root, VecAddr,
                                MachinePointerInfo(AS), PartAlign,
                                MachineMemOperand::MODereferenceable |
                                    MachineMemOperand::MOInvariant);
        P.getNode()->setIROrder(Arg.getArgNo() + 1);
        for (const unsigned J : llvm::seq(NumElts)) {
          SDValue Elt = getExtractVectorizedValue(P, J, LoadVT, dl, DAG);

          // Adjust the extracted element to the type and flags recorded for
          // the corresponding Ins entry.
          Elt = correctParamType(Elt, ArgIns[I + J].VT, ArgIns[I + J].Flags,
                                 DAG, dl);
          InVals.push_back(Elt);
        }
        I += NumElts;
      }
    }
  }

  return Chain;
}


/// Lowers a function return.
///
/// A void return becomes a bare RET_GLUE node on \p Chain. Otherwise the
/// pieces of the return value in \p OutVals are stored (possibly vectorized)
/// at their offsets from the "func_retval0" symbol, threading \p Chain through
/// each store, and a RET_GLUE node on the final chain is returned.
///
/// \p isVarArg is not consulted here.
SDValue
NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                                 bool isVarArg,
                                 const SmallVectorImpl<ISD::OutputArg> &Outs,
                                 const SmallVectorImpl<SDValue> &OutVals,
                                 const SDLoc &dl, SelectionDAG &DAG) const {
  const Function &F = DAG.getMachineFunction().getFunction();
  Type *RetTy = F.getReturnType();

  if (RetTy->isVoidTy()) {
    assert(OutVals.empty() && Outs.empty() &&
           "No return value expected for void return type");
    return DAG.getNode(NVPTXISD::RET_GLUE, dl, MVT::Other, Chain);
  }

  const DataLayout &DL = DAG.getDataLayout();
  LLVMContext &Ctx = *DAG.getContext();

  const SDValue RetSymbol = DAG.getExternalSymbol("func_retval0", MVT::i32);
  const auto RetAlign = getFunctionParamOptimizedAlign(&F, RetTy, DL);

  // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
  // 32-bits are sign extended or zero extended, depending on whether
  // they are signed or unsigned types.
  const bool ExtendIntegerRetVal =
      RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;

  SmallVector<EVT, 16> VTs;
  SmallVector<uint64_t, 16> Offsets;
  ComputePTXValueVTs(*this, DL, Ctx, CallConv, RetTy, VTs, Offsets);
  assert(VTs.size() == OutVals.size() && "Bad return value decomposition");

  // Returns piece I of the return value, adjusted to the type it is stored
  // as: i32 when a short integer return is being extended, i8 for i1, and the
  // promoted scalar type otherwise.
  const auto GetRetVal = [&](unsigned I) -> SDValue {
    SDValue RetVal = OutVals[I];
    assert(promoteScalarIntegerPTX(RetVal.getValueType()) ==
               RetVal.getValueType() &&
           "OutVal type should always be legal");

    const EVT VTI = promoteScalarIntegerPTX(VTs[I]);
    const EVT StoreVT =
        ExtendIntegerRetVal ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI);
    return correctParamType(RetVal, StoreVT, Outs[I].Flags, DAG, dl);
  };

  // I indexes the first piece covered by the current (possibly vectorized)
  // store; each store covers NumElts consecutive pieces.
  unsigned I = 0;
  const auto VI = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
  for (const unsigned NumElts : VI) {
    // No explicit alignment is passed for an extended integer return value;
    // otherwise use what RetAlign and the piece's offset have in common.
    const MaybeAlign CurrentAlign = ExtendIntegerRetVal
                                        ? MaybeAlign(std::nullopt)
                                        : commonAlignment(RetAlign, Offsets[I]);

    SDValue Val = getBuildVectorizedValue(
        NumElts, dl, DAG, [&](unsigned K) { return GetRetVal(I + K); });

    SDValue Ptr =
        DAG.getObjectPtrOffset(dl, RetSymbol, TypeSize::getFixed(Offsets[I]));

    Chain = DAG.getStore(Chain, dl, Val, Ptr,
                         MachinePointerInfo(NVPTX::AddressSpace::DeviceParam),
                         CurrentAlign);

    I += NumElts;
  }

  return DAG.getNode(NVPTXISD::RET_GLUE, dl, MVT::Other, Chain);
}


void NVPTXTargetLowering::LowerAsmOperandForConstraint(
    SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
    SelectionDAG &DAG) const {
  // Constraints longer than one character are left untouched (nothing is
  // added to Ops); anything else is forwarded to the generic implementation.
  if (Constraint.size() <= 1)
    TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}


// llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
// TgtMemIntrinsic because we need information that is only available in the
// "Value" type of the destination pointer; in particular, the address space
// information.


void NVPTXTargetLowering::getTgtMemIntrinsic(

    SmallVectorImpl<IntrinsicInfo> &Infos, const CallBase &I,

    MachineFunction &MF, unsigned Intrinsic) const {

  IntrinsicInfo Info;

  switch (Intrinsic) {

  default:

    return;

  case Intrinsic::nvvm_match_all_sync_i32p:

  case Intrinsic::nvvm_match_all_sync_i64p:

    Info.opc = ISD::INTRINSIC_W_CHAIN;

    // memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute

    // in order to model data exchange with other threads, but perform no real

    // memory accesses.

    Info.memVT = MVT::i1;


    // Our result depends on both our and other thread's arguments.

    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;

    Infos.push_back(Info);

    return;

  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col:

  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row:

  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride:

  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride:

  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col:

  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row:

  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride:

  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride:

  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col:

  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row:

  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride:

  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride:

  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col:

  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row:

  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride:

  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride:

  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col:

  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row:

  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride:

  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride:

  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col:

  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row:

  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride:

  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: {

    Info.opc = ISD::INTRINSIC_W_CHAIN;

    Info.memVT = MVT::v8f16;

    Info.ptrVal = I.getArgOperand(0);

    Info.offset = 0;

    Info.flags = MachineMemOperand::MOLoad;

    Info.align = Align(16);

    Infos.push_back(Info);

    return;

  }

  case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col:

  case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride:

  case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride:

  case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col:

  case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row:

  case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride:

  case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride:

  case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row:

  case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col:

  case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col_stride:

  case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row:

  case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row_stride:

  case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col:

  case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride:

  case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride:

  case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col:

  case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row:

  case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride:

  case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride:

  case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row:

  case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col:

  case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col_stride:

  case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row:

  case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row_stride: {

    Info.opc = ISD::INTRINSIC_W_CHAIN;

    Info.memVT = MVT::v2i32;

    Info.ptrVal = I.getArgOperand(0);

    Info.offset = 0;

    Info.flags = MachineMemOperand::MOLoad;

    Info.align = Align(8);

    Infos.push_back(Info);

    return;

  }


  case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col:

  case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride:

  case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride:

  case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col:

  case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row:

  case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride:

  case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride:

  case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row:

  case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col:

  case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col_stride:

  case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row:

  case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row_stride:

  case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col:

  case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col_stride:

  case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row:

  case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row_stride:


  case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col:

  case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride:

  case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride:

  case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col:

  case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row:

  case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride:

  case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride:

  case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row:

  case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col:

  case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col_stride:

  case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row:

  case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row_stride:

  case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col:

  case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col_stride:

  case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row:

  case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row_stride:

  case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_b16:

  case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_trans_b16:

  case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x2_trans_b8:

  case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x2_trans_b8x16_b4x16_p64:

  case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x2_trans_b8x16_b6x16_p32:

  case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x4_b8x16_b4x16_p64:

  case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x4_b8x16_b6x16_p32: {

    Info.opc = ISD::INTRINSIC_W_CHAIN;

    Info.memVT = MVT::v4i32;

    Info.ptrVal = I.getArgOperand(0);

    Info.offset = 0;

    Info.flags = MachineMemOperand::MOLoad;

    Info.align = Align(16);

    Infos.push_back(Info);

    return;

  }


  case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col:

  case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride:

  case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride:

  case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col:

  case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row:

  case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride:

  case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride:

  case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row:


  case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col:

  case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride:

  case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride:

  case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col:

  case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row:

  case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride:

  case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride:

  case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row:

  case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row:

  case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride:

  case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col:

  case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride:

  case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row:

  case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride:

  case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride:

  case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row:

  case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col:

  case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride:

  case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride:

  case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col:

  case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_b16:

  case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_trans_b16:

  case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x1_b8x16_b4x16_p64:

  case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x1_b8x16_b6x16_p32: {

    Info.opc = ISD::INTRINSIC_W_CHAIN;

    Info.memVT = MVT::i32;

    Info.ptrVal = I.getArgOperand(0);

    Info.offset = 0;

    Info.flags = MachineMemOperand::MOLoad;

    Info.align = Align(4);

    Infos.push_back(Info);

    return;

  }


  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col:

  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row:

  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride:

  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride:

  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col:

  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row:

  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride:

  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride:

  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col:

  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row:

  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride:

  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: {

    Info.opc = ISD::INTRINSIC_W_CHAIN;

    Info.memVT = MVT::v4f16;

    Info.ptrVal = I.getArgOperand(0);

    Info.offset = 0;

    Info.flags = MachineMemOperand::MOLoad;

    Info.align = Align(16);

    Infos.push_back(Info);

    return;

  }


  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col:

  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row:

  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride:

  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride:

  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col:

  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row:

  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride:

  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride:

  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col:

  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row:

  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride:

  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride:

  case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col:

  case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row:

  case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col_stride:

  case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row_stride: {

    Info.opc = ISD::INTRINSIC_W_CHAIN;

    Info.memVT = MVT::v8f32;

    Info.ptrVal = I.getArgOperand(0);

    Info.offset = 0;

    Info.flags = MachineMemOperand::MOLoad;

    Info.align = Align(16);

    Infos.push_back(Info);

    return;

  }


  case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col:

  case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col_stride:

  case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row:

  case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row_stride:


  case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col:

  case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col_stride:

  case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row:

  case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row_stride:


  case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col:

  case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride:

  case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row:

  case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride:

  case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col:

  case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride:

  case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row:

  case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride:

  case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col:

  case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride:

  case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row:

  case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: {

    Info.opc = ISD::INTRINSIC_W_CHAIN;

    Info.memVT = MVT::v8i32;

    Info.ptrVal = I.getArgOperand(0);

    Info.offset = 0;

    Info.flags = MachineMemOperand::MOLoad;

    Info.align = Align(16);

    Infos.push_back(Info);

    return;

  }


  case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col:

  case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride:

  case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row:

  case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride:

  case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col:

  case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride:

  case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row:

  case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride:

  case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_b16:

  case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_trans_b16:

  case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x1_trans_b8:

  case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x1_trans_b8x16_b4x16_p64:

  case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x1_trans_b8x16_b6x16_p32:

  case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x2_b8x16_b4x16_p64:

  case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x2_b8x16_b6x16_p32: {

    Info.opc = ISD::INTRINSIC_W_CHAIN;

    Info.memVT = MVT::v2i32;

    Info.ptrVal = I.getArgOperand(0);

    Info.offset = 0;

    Info.flags = MachineMemOperand::MOLoad;

    Info.align = Align(8);

    Infos.push_back(Info);

    return;

  }


  case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col:

  case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col_stride:

  case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row:

  case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row_stride:


  case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col:

  case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col_stride:

  case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row:

  case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row_stride: {

    Info.opc = ISD::INTRINSIC_W_CHAIN;

    Info.memVT = MVT::f64;

    Info.ptrVal = I.getArgOperand(0);

    Info.offset = 0;

    Info.flags = MachineMemOperand::MOLoad;

    Info.align = Align(8);

    Infos.push_back(Info);

    return;

  }


  case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col:

  case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col_stride:

  case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row:

  case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row_stride: {

    Info.opc = ISD::INTRINSIC_W_CHAIN;

    Info.memVT = MVT::v2f64;

    Info.ptrVal = I.getArgOperand(0);

    Info.offset = 0;

    Info.flags = MachineMemOperand::MOLoad;

    Info.align = Align(16);

    Infos.push_back(Info);

    return;

  }


  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col:

  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row:

  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride:

  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride:

  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col:

  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row:

  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride:

  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride:

  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col:

  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row:

  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride:

  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: {

    Info.opc = ISD::INTRINSIC_VOID;

    Info.memVT = MVT::v4f16;

    Info.ptrVal = I.getArgOperand(0);

    Info.offset = 0;

    Info.flags = MachineMemOperand::MOStore;

    Info.align = Align(16);

    Infos.push_back(Info);

    return;

  }


  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col:

  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row:

  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride:

  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride:

  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col:

  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row:

  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride:

  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride:

  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col:

  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row:

  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride:

  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride:

  case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col:

  case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row:

  case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col_stride:

  case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row_stride: {

    Info.opc = ISD::INTRINSIC_VOID;

    Info.memVT = MVT::v8f32;

    Info.ptrVal = I.getArgOperand(0);

    Info.offset = 0;

    Info.flags = MachineMemOperand::MOStore;

    Info.align = Align(16);

    Infos.push_back(Info);

    return;

  }


  case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col:

  case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride:

  case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row:

  case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride:

  case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col:

  case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride:

  case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row:

  case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride:

  case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col:

  case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride:

  case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row:

  case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: {

    Info.opc = ISD::INTRINSIC_VOID;

    Info.memVT = MVT::v8i32;

    Info.ptrVal = I.getArgOperand(0);

    Info.offset = 0;

    Info.flags = MachineMemOperand::MOStore;

    Info.align = Align(16);

    Infos.push_back(Info);

    return;

  }


  case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col:

  case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride:

  case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row:

  case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride:

  case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col:

  case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride:

  case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row:

  case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride:

  case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x2_b16:

  case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x2_trans_b16:

  case Intrinsic::nvvm_stmatrix_sync_aligned_m16n8_x2_trans_b8: {

    Info.opc = ISD::INTRINSIC_VOID;

    Info.memVT = MVT::v2i32;

    Info.ptrVal = I.getArgOperand(0);

    Info.offset = 0;

    Info.flags = MachineMemOperand::MOStore;

    Info.align = Align(8);

    Infos.push_back(Info);

    return;

  }


  case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col:

  case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col_stride:

  case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row:

  case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row_stride: {

    Info.opc = ISD::INTRINSIC_VOID;

    Info.memVT = MVT::v2f64;

    Info.ptrVal = I.getArgOperand(0);

    Info.offset = 0;

    Info.flags = MachineMemOperand::MOStore;

    Info.align = Align(16);

    Infos.push_back(Info);

    return;

  }


  case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x1_b16:

  case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x1_trans_b16:

  case Intrinsic::nvvm_stmatrix_sync_aligned_m16n8_x1_trans_b8: {

    Info.opc = ISD::INTRINSIC_VOID;

    Info.memVT = MVT::i32;

    Info.ptrVal = I.getArgOperand(0);

    Info.offset = 0;

    Info.flags = MachineMemOperand::MOStore;

    Info.align = Align(4);

    Infos.push_back(Info);

    return;

  }


  case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x4_b16:

  case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x4_trans_b16:

  case Intrinsic::nvvm_stmatrix_sync_aligned_m16n8_x4_trans_b8: {

    Info.opc = ISD::INTRINSIC_VOID;

    Info.memVT = MVT::v4i32;

    Info.ptrVal = I.getArgOperand(0);

    Info.offset = 0;

    Info.flags = MachineMemOperand::MOStore;

    Info.align = Align(16);

    Infos.push_back(Info);

    return;

  }


  case Intrinsic::nvvm_prefetch_tensormap: {

    auto &DL = I.getDataLayout();

    Info.opc = ISD::INTRINSIC_VOID;

    Info.memVT = getPointerTy(DL);

    Info.ptrVal = I.getArgOperand(0);

    Info.offset = 0;

    Info.flags =

        MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable;

    Info.align.reset();

    Infos.push_back(Info);

    return;

  }


  case Intrinsic::nvvm_tensormap_replace_global_address:

  case Intrinsic::nvvm_tensormap_replace_global_stride: {

    Info.opc = ISD::INTRINSIC_VOID;

    Info.memVT = MVT::i64;

    Info.ptrVal = I.getArgOperand(0);

    Info.offset = 0;

    Info.flags = MachineMemOperand::MOStore;

    Info.align.reset();

    Infos.push_back(Info);

    return;

  }


  case Intrinsic::nvvm_tensormap_replace_rank:

  case Intrinsic::nvvm_tensormap_replace_box_dim:

  case Intrinsic::nvvm_tensormap_replace_global_dim:

  case Intrinsic::nvvm_tensormap_replace_element_stride:

  case Intrinsic::nvvm_tensormap_replace_elemtype:

  case Intrinsic::nvvm_tensormap_replace_interleave_layout:

  case Intrinsic::nvvm_tensormap_replace_swizzle_mode:

  case Intrinsic::nvvm_tensormap_replace_swizzle_atomicity:

  case Intrinsic::nvvm_tensormap_replace_fill_mode: {

    Info.opc = ISD::INTRINSIC_VOID;

    Info.memVT = MVT::i32;

    Info.ptrVal = I.getArgOperand(0);

    Info.offset = 0;

    Info.flags = MachineMemOperand::MOStore;

    Info.align.reset();

    Infos.push_back(Info);

    return;

  }


  case Intrinsic::nvvm_ldu_global_i:

  case Intrinsic::nvvm_ldu_global_f:

  case Intrinsic::nvvm_ldu_global_p: {

    Info.opc = ISD::INTRINSIC_W_CHAIN;

    Info.memVT = getValueType(I.getDataLayout(), I.getType());

    Info.ptrVal = I.getArgOperand(0);

    Info.offset = 0;

    Info.flags = MachineMemOperand::MOLoad;

    Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();


    Infos.push_back(Info);

    return;

  }

  case Intrinsic::nvvm_tex_1d_v4f32_s32:

  case Intrinsic::nvvm_tex_1d_v4f32_f32:

  case Intrinsic::nvvm_tex_1d_level_v4f32_f32:

  case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:

  case Intrinsic::nvvm_tex_1d_array_v4f32_s32:

  case Intrinsic::nvvm_tex_1d_array_v4f32_f32:

  case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:

  case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:

  case Intrinsic::nvvm_tex_2d_v4f32_s32:

  case Intrinsic::nvvm_tex_2d_v4f32_f32:

  case Intrinsic::nvvm_tex_2d_level_v4f32_f32:

  case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:

  case Intrinsic::nvvm_tex_2d_array_v4f32_s32:

  case Intrinsic::nvvm_tex_2d_array_v4f32_f32:

  case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:

  case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:

  case Intrinsic::nvvm_tex_3d_v4f32_s32:

  case Intrinsic::nvvm_tex_3d_v4f32_f32:

  case Intrinsic::nvvm_tex_3d_level_v4f32_f32:

  case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:

  case Intrinsic::nvvm_tex_cube_v4f32_f32:

  case Intrinsic::nvvm_tex_cube_level_v4f32_f32:

  case Intrinsic::nvvm_tex_cube_array_v4f32_f32:

  case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:

  case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:

  case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:

  case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:

  case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:

  case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:

  case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:

  case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:

  case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:

  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:

  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:

  case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:

  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:

  case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:

  case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:

  case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:

  case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:

  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:

  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:

  case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:

  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:

  case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:

  case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:

  case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:

  case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:

  case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:

  case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:

  case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:

  case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:

  case Intrinsic::nvvm_tex_unified_cube_grad_v4f32_f32:

  case Intrinsic::nvvm_tex_unified_cube_array_grad_v4f32_f32:

  case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:

  case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:

  case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:

  case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:

    Info.opc = ISD::INTRINSIC_W_CHAIN;

    Info.memVT = MVT::v4f32;

    Info.ptrVal = nullptr;

    Info.offset = 0;

    Info.flags = MachineMemOperand::MOLoad;

    Info.align = Align(16);

    Infos.push_back(Info);

    return;


  case Intrinsic::nvvm_tex_1d_v4s32_s32:

  case Intrinsic::nvvm_tex_1d_v4s32_f32:

  case Intrinsic::nvvm_tex_1d_level_v4s32_f32:

  case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:

  case Intrinsic::nvvm_tex_1d_array_v4s32_s32:

  case Intrinsic::nvvm_tex_1d_array_v4s32_f32:

  case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:

  case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:

  case Intrinsic::nvvm_tex_2d_v4s32_s32:

  case Intrinsic::nvvm_tex_2d_v4s32_f32:

  case Intrinsic::nvvm_tex_2d_level_v4s32_f32:

  case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:

  case Intrinsic::nvvm_tex_2d_array_v4s32_s32:

  case Intrinsic::nvvm_tex_2d_array_v4s32_f32:

  case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:

  case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:

  case Intrinsic::nvvm_tex_3d_v4s32_s32:

  case Intrinsic::nvvm_tex_3d_v4s32_f32:

  case Intrinsic::nvvm_tex_3d_level_v4s32_f32:

  case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:

  case Intrinsic::nvvm_tex_cube_v4s32_f32:

  case Intrinsic::nvvm_tex_cube_level_v4s32_f32:

  case Intrinsic::nvvm_tex_cube_array_v4s32_f32:

  case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:

  case Intrinsic::nvvm_tex_cube_v4u32_f32:

  case Intrinsic::nvvm_tex_cube_level_v4u32_f32:

  case Intrinsic::nvvm_tex_cube_array_v4u32_f32:

  case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:

  case Intrinsic::nvvm_tex_1d_v4u32_s32:

  case Intrinsic::nvvm_tex_1d_v4u32_f32:

  case Intrinsic::nvvm_tex_1d_level_v4u32_f32:

  case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:

  case Intrinsic::nvvm_tex_1d_array_v4u32_s32:

  case Intrinsic::nvvm_tex_1d_array_v4u32_f32:

  case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:

  case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:

  case Intrinsic::nvvm_tex_2d_v4u32_s32:

  case Intrinsic::nvvm_tex_2d_v4u32_f32:

  case Intrinsic::nvvm_tex_2d_level_v4u32_f32:

  case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:

  case Intrinsic::nvvm_tex_2d_array_v4u32_s32:

  case Intrinsic::nvvm_tex_2d_array_v4u32_f32:

  case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:

  case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:

  case Intrinsic::nvvm_tex_3d_v4u32_s32:

  case Intrinsic::nvvm_tex_3d_v4u32_f32:

  case Intrinsic::nvvm_tex_3d_level_v4u32_f32:

  case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:

  case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:

  case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:

  case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:

  case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:

  case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:

  case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:

  case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:

  case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:

  case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:

  case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:

  case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:

  case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:

  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:

  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:

  case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:

  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:

  case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:

  case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:

  case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:

  case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:

  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:

  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:

  case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:

  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:

  case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:

  case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:

  case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:

  case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:

  case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:

  case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:

  case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:

  case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:

  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:

  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:

  case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:

  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:

  case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:

  case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:

  case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:

  case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:

  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:

  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:

  case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:

  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:

  case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:

  case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:

  case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:

  case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:

  case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:

  case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:

  case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:

  case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:

  case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:

  case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:

  case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:

  case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:

  case Intrinsic::nvvm_tex_unified_cube_grad_v4s32_f32:

  case Intrinsic::nvvm_tex_unified_cube_grad_v4u32_f32:

  case Intrinsic::nvvm_tex_unified_cube_array_grad_v4s32_f32:

  case Intrinsic::nvvm_tex_unified_cube_array_grad_v4u32_f32:

  case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:

  case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:

  case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:

  case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:

  case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:

  case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:

  case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:

  case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:

    Info.opc = ISD::INTRINSIC_W_CHAIN;

    Info.memVT = MVT::v4i32;

    Info.ptrVal = nullptr;

    Info.offset = 0;

    Info.flags = MachineMemOperand::MOLoad;

    Info.align = Align(16);

    Infos.push_back(Info);

    return;


  case Intrinsic::nvvm_suld_1d_i8_clamp:

  case Intrinsic::nvvm_suld_1d_v2i8_clamp:

  case Intrinsic::nvvm_suld_1d_v4i8_clamp:

  case Intrinsic::nvvm_suld_1d_array_i8_clamp:

  case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:

  case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:

  case Intrinsic::nvvm_suld_2d_i8_clamp:

  case Intrinsic::nvvm_suld_2d_v2i8_clamp:

  case Intrinsic::nvvm_suld_2d_v4i8_clamp:

  case Intrinsic::nvvm_suld_2d_array_i8_clamp:

  case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:

  case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:

  case Intrinsic::nvvm_suld_3d_i8_clamp:

  case Intrinsic::nvvm_suld_3d_v2i8_clamp:

  case Intrinsic::nvvm_suld_3d_v4i8_clamp:

  case Intrinsic::nvvm_suld_1d_i8_trap:

  case Intrinsic::nvvm_suld_1d_v2i8_trap:

  case Intrinsic::nvvm_suld_1d_v4i8_trap:

  case Intrinsic::nvvm_suld_1d_array_i8_trap:

  case Intrinsic::nvvm_suld_1d_array_v2i8_trap:

  case Intrinsic::nvvm_suld_1d_array_v4i8_trap:

  case Intrinsic::nvvm_suld_2d_i8_trap:

  case Intrinsic::nvvm_suld_2d_v2i8_trap:

  case Intrinsic::nvvm_suld_2d_v4i8_trap:

  case Intrinsic::nvvm_suld_2d_array_i8_trap:

  case Intrinsic::nvvm_suld_2d_array_v2i8_trap:

  case Intrinsic::nvvm_suld_2d_array_v4i8_trap:

  case Intrinsic::nvvm_suld_3d_i8_trap:

  case Intrinsic::nvvm_suld_3d_v2i8_trap:

  case Intrinsic::nvvm_suld_3d_v4i8_trap:

  case Intrinsic::nvvm_suld_1d_i8_zero:

  case Intrinsic::nvvm_suld_1d_v2i8_zero:

  case Intrinsic::nvvm_suld_1d_v4i8_zero:

  case Intrinsic::nvvm_suld_1d_array_i8_zero:

  case Intrinsic::nvvm_suld_1d_array_v2i8_zero:

  case Intrinsic::nvvm_suld_1d_array_v4i8_zero:

  case Intrinsic::nvvm_suld_2d_i8_zero:

  case Intrinsic::nvvm_suld_2d_v2i8_zero:

  case Intrinsic::nvvm_suld_2d_v4i8_zero:

  case Intrinsic::nvvm_suld_2d_array_i8_zero:

  case Intrinsic::nvvm_suld_2d_array_v2i8_zero:

  case Intrinsic::nvvm_suld_2d_array_v4i8_zero:

  case Intrinsic::nvvm_suld_3d_i8_zero:

  case Intrinsic::nvvm_suld_3d_v2i8_zero:

  case Intrinsic::nvvm_suld_3d_v4i8_zero:

    Info.opc = ISD::INTRINSIC_W_CHAIN;

    Info.memVT = MVT::i8;

    Info.ptrVal = nullptr;

    Info.offset = 0;

    Info.flags = MachineMemOperand::MOLoad;

    Info.align = Align(16);

    Infos.push_back(Info);

    return;


  case Intrinsic::nvvm_suld_1d_i16_clamp:

  case Intrinsic::nvvm_suld_1d_v2i16_clamp:

  case Intrinsic::nvvm_suld_1d_v4i16_clamp:

  case Intrinsic::nvvm_suld_1d_array_i16_clamp:

  case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:

  case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:

  case Intrinsic::nvvm_suld_2d_i16_clamp:

  case Intrinsic::nvvm_suld_2d_v2i16_clamp:

  case Intrinsic::nvvm_suld_2d_v4i16_clamp:

  case Intrinsic::nvvm_suld_2d_array_i16_clamp:

  case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:

  case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:

  case Intrinsic::nvvm_suld_3d_i16_clamp:

  case Intrinsic::nvvm_suld_3d_v2i16_clamp:

  case Intrinsic::nvvm_suld_3d_v4i16_clamp:

  case Intrinsic::nvvm_suld_1d_i16_trap:

  case Intrinsic::nvvm_suld_1d_v2i16_trap:

  case Intrinsic::nvvm_suld_1d_v4i16_trap:

  case Intrinsic::nvvm_suld_1d_array_i16_trap:

  case Intrinsic::nvvm_suld_1d_array_v2i16_trap:

  case Intrinsic::nvvm_suld_1d_array_v4i16_trap:

  case Intrinsic::nvvm_suld_2d_i16_trap:

  case Intrinsic::nvvm_suld_2d_v2i16_trap:

  case Intrinsic::nvvm_suld_2d_v4i16_trap:

  case Intrinsic::nvvm_suld_2d_array_i16_trap:

  case Intrinsic::nvvm_suld_2d_array_v2i16_trap:

  case Intrinsic::nvvm_suld_2d_array_v4i16_trap:

  case Intrinsic::nvvm_suld_3d_i16_trap:

  case Intrinsic::nvvm_suld_3d_v2i16_trap:

  case Intrinsic::nvvm_suld_3d_v4i16_trap:

  case Intrinsic::nvvm_suld_1d_i16_zero:

  case Intrinsic::nvvm_suld_1d_v2i16_zero:

  case Intrinsic::nvvm_suld_1d_v4i16_zero:

  case Intrinsic::nvvm_suld_1d_array_i16_zero:

  case Intrinsic::nvvm_suld_1d_array_v2i16_zero:

  case Intrinsic::nvvm_suld_1d_array_v4i16_zero:

  case Intrinsic::nvvm_suld_2d_i16_zero:

  case Intrinsic::nvvm_suld_2d_v2i16_zero:

  case Intrinsic::nvvm_suld_2d_v4i16_zero:

  case Intrinsic::nvvm_suld_2d_array_i16_zero:

  case Intrinsic::nvvm_suld_2d_array_v2i16_zero:

  case Intrinsic::nvvm_suld_2d_array_v4i16_zero:

  case Intrinsic::nvvm_suld_3d_i16_zero:

  case Intrinsic::nvvm_suld_3d_v2i16_zero:

  case Intrinsic::nvvm_suld_3d_v4i16_zero:

    Info.opc = ISD::INTRINSIC_W_CHAIN;

    Info.memVT = MVT::i16;

    Info.ptrVal = nullptr;

    Info.offset = 0;

    Info.flags = MachineMemOperand::MOLoad;

    Info.align = Align(16);

    Infos.push_back(Info);

    return;


  case Intrinsic::nvvm_suld_1d_i32_clamp:

  case Intrinsic::nvvm_suld_1d_v2i32_clamp:

  case Intrinsic::nvvm_suld_1d_v4i32_clamp:

  case Intrinsic::nvvm_suld_1d_array_i32_clamp:

  case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:

  case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:

  case Intrinsic::nvvm_suld_2d_i32_clamp:

  case Intrinsic::nvvm_suld_2d_v2i32_clamp:

  case Intrinsic::nvvm_suld_2d_v4i32_clamp:

  case Intrinsic::nvvm_suld_2d_array_i32_clamp:

  case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:

  case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:

  case Intrinsic::nvvm_suld_3d_i32_clamp:

  case Intrinsic::nvvm_suld_3d_v2i32_clamp:

  case Intrinsic::nvvm_suld_3d_v4i32_clamp:

  case Intrinsic::nvvm_suld_1d_i32_trap:

  case Intrinsic::nvvm_suld_1d_v2i32_trap:

  case Intrinsic::nvvm_suld_1d_v4i32_trap:

  case Intrinsic::nvvm_suld_1d_array_i32_trap:

  case Intrinsic::nvvm_suld_1d_array_v2i32_trap:

  case Intrinsic::nvvm_suld_1d_array_v4i32_trap:

  case Intrinsic::nvvm_suld_2d_i32_trap:

  case Intrinsic::nvvm_suld_2d_v2i32_trap:

  case Intrinsic::nvvm_suld_2d_v4i32_trap:

  case Intrinsic::nvvm_suld_2d_array_i32_trap:

  case Intrinsic::nvvm_suld_2d_array_v2i32_trap:

  case Intrinsic::nvvm_suld_2d_array_v4i32_trap:

  case Intrinsic::nvvm_suld_3d_i32_trap:

  case Intrinsic::nvvm_suld_3d_v2i32_trap:

  case Intrinsic::nvvm_suld_3d_v4i32_trap:

  case Intrinsic::nvvm_suld_1d_i32_zero:

  case Intrinsic::nvvm_suld_1d_v2i32_zero:

  case Intrinsic::nvvm_suld_1d_v4i32_zero:

  case Intrinsic::nvvm_suld_1d_array_i32_zero:

  case Intrinsic::nvvm_suld_1d_array_v2i32_zero:

  case Intrinsic::nvvm_suld_1d_array_v4i32_zero:

  case Intrinsic::nvvm_suld_2d_i32_zero:

  case Intrinsic::nvvm_suld_2d_v2i32_zero:

  case Intrinsic::nvvm_suld_2d_v4i32_zero:

  case Intrinsic::nvvm_suld_2d_array_i32_zero:

  case Intrinsic::nvvm_suld_2d_array_v2i32_zero:

  case Intrinsic::nvvm_suld_2d_array_v4i32_zero:

  case Intrinsic::nvvm_suld_3d_i32_zero:

  case Intrinsic::nvvm_suld_3d_v2i32_zero:

  case Intrinsic::nvvm_suld_3d_v4i32_zero:

    Info.opc = ISD::INTRINSIC_W_CHAIN;

    Info.memVT = MVT::i32;

    Info.ptrVal = nullptr;

    Info.offset = 0;

    Info.flags = MachineMemOperand::MOLoad;

    Info.align = Align(16);

    Infos.push_back(Info);

    return;


  case Intrinsic::nvvm_suld_1d_i64_clamp:

  case Intrinsic::nvvm_suld_1d_v2i64_clamp:

  case Intrinsic::nvvm_suld_1d_array_i64_clamp:

  case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:

  case Intrinsic::nvvm_suld_2d_i64_clamp:

  case Intrinsic::nvvm_suld_2d_v2i64_clamp:

  case Intrinsic::nvvm_suld_2d_array_i64_clamp:

  case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:

  case Intrinsic::nvvm_suld_3d_i64_clamp:

  case Intrinsic::nvvm_suld_3d_v2i64_clamp:

  case Intrinsic::nvvm_suld_1d_i64_trap:

  case Intrinsic::nvvm_suld_1d_v2i64_trap:

  case Intrinsic::nvvm_suld_1d_array_i64_trap:

  case Intrinsic::nvvm_suld_1d_array_v2i64_trap:

  case Intrinsic::nvvm_suld_2d_i64_trap:

  case Intrinsic::nvvm_suld_2d_v2i64_trap:

  case Intrinsic::nvvm_suld_2d_array_i64_trap:

  case Intrinsic::nvvm_suld_2d_array_v2i64_trap:

  case Intrinsic::nvvm_suld_3d_i64_trap:

  case Intrinsic::nvvm_suld_3d_v2i64_trap:

  case Intrinsic::nvvm_suld_1d_i64_zero:

  case Intrinsic::nvvm_suld_1d_v2i64_zero:

  case Intrinsic::nvvm_suld_1d_array_i64_zero:

  case Intrinsic::nvvm_suld_1d_array_v2i64_zero:

  case Intrinsic::nvvm_suld_2d_i64_zero:

  case Intrinsic::nvvm_suld_2d_v2i64_zero:

  case Intrinsic::nvvm_suld_2d_array_i64_zero:

  case Intrinsic::nvvm_suld_2d_array_v2i64_zero:

  case Intrinsic::nvvm_suld_3d_i64_zero:

  case Intrinsic::nvvm_suld_3d_v2i64_zero:

    Info.opc = ISD::INTRINSIC_W_CHAIN;

    Info.memVT = MVT::i64;

    Info.ptrVal = nullptr;

    Info.offset = 0;

    Info.flags = MachineMemOperand::MOLoad;

    Info.align = Align(16);

    Infos.push_back(Info);

    return;


  case Intrinsic::nvvm_tcgen05_ld_16x64b_x1:

  case Intrinsic::nvvm_tcgen05_ld_32x32b_x1:

  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x1: {

    Info.opc = ISD::INTRINSIC_W_CHAIN;

    Info.memVT = MVT::v1i32;

    Info.ptrVal = I.getArgOperand(0);

    Info.offset = 0;

    Info.flags = MachineMemOperand::MOLoad;

    Info.align.reset();

    Infos.push_back(Info);

    return;

  }


  case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:

  case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:

  case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:

  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:

  case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x2_i32:

  case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x2_i32: {

    Info.opc = ISD::INTRINSIC_W_CHAIN;

    Info.memVT = MVT::v2i32;

    Info.ptrVal = I.getArgOperand(0);

    Info.offset = 0;

    Info.flags = MachineMemOperand::MOLoad;

    Info.align.reset();

    Infos.push_back(Info);

    return;

  }


  case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x2_f32:

  case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x2_f32: {

    Info.opc = ISD::INTRINSIC_W_CHAIN;

    Info.memVT = MVT::v2f32;

    Info.ptrVal = I.getArgOperand(0);

    Info.offset = 0;

    Info.flags = MachineMemOperand::MOLoad;

    Info.align.reset();

    Infos.push_back(Info);

    return;

  }


  case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:

  case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:

  case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:

  case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:

  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:

  case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x4_i32:

  case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x4_i32: {

    Info.opc = ISD::INTRINSIC_W_CHAIN;

    Info.memVT = MVT::v4i32;

    Info.ptrVal = I.getArgOperand(0);

    Info.offset = 0;

    Info.flags = MachineMemOperand::MOLoad;

    Info.align.reset();

    Infos.push_back(Info);

    return;

  }


  case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x4_f32:

  case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x4_f32: {

    Info.opc = ISD::INTRINSIC_W_CHAIN;

    Info.memVT = MVT::v4f32;

    Info.ptrVal = I.getArgOperand(0);

    Info.offset = 0;

    Info.flags = MachineMemOperand::MOLoad;

    Info.align.reset();

    Infos.push_back(Info);

    return;

  }


  case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:

  case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:

  case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:

  case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:

  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:

  case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x8_i32:

  case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x8_i32: {

    Info.opc = ISD::INTRINSIC_W_CHAIN;

    Info.memVT = MVT::v8i32;

    Info.ptrVal = I.getArgOperand(0);

    Info.offset = 0;

    Info.flags = MachineMemOperand::MOLoad;

    Info.align.reset();

    Infos.push_back(Info);

    return;

  }


  case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x8_f32:

  case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x8_f32: {

    Info.opc = ISD::INTRINSIC_W_CHAIN;

    Info.memVT = MVT::v8f32;

    Info.ptrVal = I.getArgOperand(0);

    Info.offset = 0;

    Info.flags = MachineMemOperand::MOLoad;

    Info.align.reset();

    Infos.push_back(Info);

    return;

  }


  case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:

  case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:

  case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:

  case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:

  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:

  case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x16_i32:

  case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x16_i32: {

    Info.opc = ISD::INTRINSIC_W_CHAIN;

    Info.memVT = MVT::v16i32;

    Info.ptrVal = I.getArgOperand(0);

    Info.offset = 0;

    Info.flags = MachineMemOperand::MOLoad;

    Info.align.reset();

    Infos.push_back(Info);

    return;

  }


  case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x16_f32:

  case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x16_f32: {

    Info.opc = ISD::INTRINSIC_W_CHAIN;

    Info.memVT = MVT::v16f32;

    Info.ptrVal = I.getArgOperand(0);

    Info.offset = 0;

    Info.flags = MachineMemOperand::MOLoad;

    Info.align.reset();

    Infos.push_back(Info);

    return;

  }


  case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:

  case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:

  case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:

  case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:

  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:

  case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x32_i32:

  case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x32_i32: {

    Info.opc = ISD::INTRINSIC_W_CHAIN;

    Info.memVT = MVT::v32i32;

    Info.ptrVal = I.getArgOperand(0);

    Info.offset = 0;

    Info.flags = MachineMemOperand::MOLoad;

    Info.align.reset();

    Infos.push_back(Info);

    return;

  }


  case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x32_f32:

  case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x32_f32: {

    Info.opc = ISD::INTRINSIC_W_CHAIN;

    Info.memVT = MVT::v32f32;

    Info.ptrVal = I.getArgOperand(0);

    Info.offset = 0;

    Info.flags = MachineMemOperand::MOLoad;

    Info.align.reset();

    Infos.push_back(Info);

    return;

  }


  case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:

  case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:

  case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:

  case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:

  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:

  case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x64_i32:

  case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x64_i32: {

    Info.opc = ISD::INTRINSIC_W_CHAIN;

    Info.memVT = MVT::v64i32;

    Info.ptrVal = I.getArgOperand(0);

    Info.offset = 0;

    Info.flags = MachineMemOperand::MOLoad;

    Info.align.reset();

    Infos.push_back(Info);

    return;

  }


  case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x64_f32:

  case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x64_f32: {

    Info.opc = ISD::INTRINSIC_W_CHAIN;

    Info.memVT = MVT::v64f32;

    Info.ptrVal = I.getArgOperand(0);

    Info.offset = 0;

    Info.flags = MachineMemOperand::MOLoad;

    Info.align.reset();

    Infos.push_back(Info);

    return;

  }


  case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:

  case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:

  case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:

  case Intrinsic::nvvm_tcgen05_ld_32x32b_x128:

  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128:

  case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x128_i32:

  case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x128_i32: {

    Info.opc = ISD::INTRINSIC_W_CHAIN;

    Info.memVT = MVT::v128i32;

    Info.ptrVal = I.getArgOperand(0);

    Info.offset = 0;

    Info.flags = MachineMemOperand::MOLoad;

    Info.align.reset();

    Infos.push_back(Info);

    return;

  }


  case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x128_f32:

  case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x128_f32: {

    Info.opc = ISD::INTRINSIC_W_CHAIN;

    Info.memVT = MVT::v128f32;

    Info.ptrVal = I.getArgOperand(0);

    Info.offset = 0;

    Info.flags = MachineMemOperand::MOLoad;

    Info.align.reset();

    Infos.push_back(Info);

    return;

  }


  case Intrinsic::nvvm_tcgen05_st_16x64b_x1:

  case Intrinsic::nvvm_tcgen05_st_32x32b_x1:

  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1: {

    Info.opc = ISD::INTRINSIC_VOID;

    Info.memVT = MVT::i32;

    Info.ptrVal = I.getArgOperand(0);

    Info.offset = 0;

    Info.flags = MachineMemOperand::MOStore;

    Info.align.reset();

    Infos.push_back(Info);

    return;

  }


  case Intrinsic::nvvm_tcgen05_st_16x64b_x2:

  case Intrinsic::nvvm_tcgen05_st_16x128b_x1:

  case Intrinsic::nvvm_tcgen05_st_32x32b_x2:

  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2: {

    Info.opc = ISD::INTRINSIC_VOID;

    Info.memVT = MVT::v2i32;

    Info.ptrVal = I.getArgOperand(0);

    Info.offset = 0;

    Info.flags = MachineMemOperand::MOStore;

    Info.align.reset();

    Infos.push_back(Info);

    return;

  }


  case Intrinsic::nvvm_tcgen05_st_16x64b_x4:

  case Intrinsic::nvvm_tcgen05_st_16x128b_x2:

  case Intrinsic::nvvm_tcgen05_st_16x256b_x1:

  case Intrinsic::nvvm_tcgen05_st_32x32b_x4:

  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4: {

    Info.opc = ISD::INTRINSIC_VOID;

    Info.memVT = MVT::v4i32;

    Info.ptrVal = I.getArgOperand(0);

    Info.offset = 0;

    Info.flags = MachineMemOperand::MOStore;

    Info.align.reset();

    Infos.push_back(Info);

    return;

  }


  case Intrinsic::nvvm_tcgen05_st_16x64b_x8:

  case Intrinsic::nvvm_tcgen05_st_16x128b_x4:

  case Intrinsic::nvvm_tcgen05_st_16x256b_x2:

  case Intrinsic::nvvm_tcgen05_st_32x32b_x8:

  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8: {

    Info.opc = ISD::INTRINSIC_VOID;

    Info.memVT = MVT::v8i32;

    Info.ptrVal = I.getArgOperand(0);

    Info.offset = 0;

    Info.flags = MachineMemOperand::MOStore;

    Info.align.reset();

    Infos.push_back(Info);

    return;

  }


  case Intrinsic::nvvm_tcgen05_st_16x64b_x16:

  case Intrinsic::nvvm_tcgen05_st_16x128b_x8:

  case Intrinsic::nvvm_tcgen05_st_16x256b_x4:

  case Intrinsic::nvvm_tcgen05_st_32x32b_x16:

  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16: {

    Info.opc = ISD::INTRINSIC_VOID;

    Info.memVT = MVT::v16i32;

    Info.ptrVal = I.getArgOperand(0);

    Info.offset = 0;

    Info.flags = MachineMemOperand::MOStore;

    Info.align.reset();

    Infos.push_back(Info);

    return;

  }


  case Intrinsic::nvvm_tcgen05_st_16x64b_x32:

  case Intrinsic::nvvm_tcgen05_st_16x128b_x16:

  case Intrinsic::nvvm_tcgen05_st_16x256b_x8:

  case Intrinsic::nvvm_tcgen05_st_32x32b_x32:

  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32: {

    Info.opc = ISD::INTRINSIC_VOID;

    Info.memVT = MVT::v32i32;

    Info.ptrVal = I.getArgOperand(0);

    Info.offset = 0;

    Info.flags = MachineMemOperand::MOStore;

    Info.align.reset();

    Infos.push_back(Info);

    return;

  }


  case Intrinsic::nvvm_tcgen05_st_16x64b_x64:

  case Intrinsic::nvvm_tcgen05_st_16x128b_x32:

  case Intrinsic::nvvm_tcgen05_st_16x256b_x16:

  case Intrinsic::nvvm_tcgen05_st_32x32b_x64:

  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64: {

    Info.opc = ISD::INTRINSIC_VOID;

    Info.memVT = MVT::v64i32;

    Info.ptrVal = I.getArgOperand(0);

    Info.offset = 0;

    Info.flags = MachineMemOperand::MOStore;

    Info.align.reset();

    Infos.push_back(Info);

    return;

  }


  case Intrinsic::nvvm_tcgen05_st_16x64b_x128:

  case Intrinsic::nvvm_tcgen05_st_16x128b_x64:

  case Intrinsic::nvvm_tcgen05_st_16x256b_x32:

  case Intrinsic::nvvm_tcgen05_st_32x32b_x128:

  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128: {

    Info.opc = ISD::INTRINSIC_VOID;

    Info.memVT = MVT::v128i32;

    Info.ptrVal = I.getArgOperand(0);

    Info.offset = 0;

    Info.flags = MachineMemOperand::MOStore;

    Info.align.reset();

    Infos.push_back(Info);

    return;

  }

  case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg1:

  case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg1:

  case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg1:

  case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg1:

  case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1:

  case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1:

  case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1_ashift:

  case Intrinsic::

      nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1_ashift:

  case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1:

  case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1:

  case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1_ashift:

  case Intrinsic::

      nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1_ashift: {

    // We are reading and writing back to TMem

    Info.opc = ISD::INTRINSIC_VOID;

    Info.memVT = MVT::v4i32;

    Info.ptrVal = I.getArgOperand(0);

    Info.offset = 0;

    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;

    Info.align = Align(16);

    Infos.push_back(Info);

    return;

  }


  case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg2:

  case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg2:

  case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg2:

  case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg2:

  case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2:

  case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2:

  case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2:

  case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2:

  case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2_ashift:

  case Intrinsic::

      nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2_ashift:

  case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2_ashift:

  case Intrinsic::

      nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2_ashift: {

    // We are reading and writing back to TMem

    Info.opc = ISD::INTRINSIC_VOID;

    Info.memVT = MVT::v8i32;

    Info.ptrVal = I.getArgOperand(0);

    Info.offset = 0;

    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;

    Info.align = Align(16);

    Infos.push_back(Info);

    return;

  }

  }

}


// Helper for getting a function parameter name. Name is composed from

// its index and the function name. Negative index corresponds to special

// parameter (unsized array) used for passing variable arguments.


std::string NVPTXTargetLowering::getParamName(const Function *F,

                                              int Idx) const {

  std::string ParamName;

  raw_string_ostream ParamStr(ParamName);


  ParamStr << getTargetMachine().getSymbol(F)->getName();

  if (Idx < 0)

    ParamStr << "_vararg";

  else

    ParamStr << "_param_" << Idx;


  return ParamName;

}


/// Return true if the addressing mode described by \p AM is legal for this
/// target, for a load/store of the specified type. This guides
/// target-specific optimizations such as loop strength reduction
/// (LoopStrengthReduce.cpp) and address-mode memory optimization
/// (CodeGenPrepare.cpp).
bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                                const AddrMode &AM, Type *Ty,
                                                unsigned AS,
                                                Instruction *I) const {
  // An AddrMode represents:
  //    BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
  //
  // Legal forms are [avar], [areg], [areg+immoff] and [immAddr].

  // The immediate offset has to be representable as a signed 32-bit integer.
  const bool OffsetFits = APInt(64, AM.BaseOffs).isSignedIntN(32);
  if (!OffsetFits)
    return false;

  // A global base is only legal on its own: no offset, register or scale.
  if (AM.BaseGV) {
    const bool HasExtraTerm = AM.BaseOffs || AM.HasBaseReg || AM.Scale;
    return !HasExtraTerm;
  }

  // Scale 0 covers "r", "r+i" and "i", all of which are allowed.
  if (AM.Scale == 0)
    return true;

  // Scale 1 gives "r+i" when there is no base register; with a base register
  // it would be "r+r" or "r+r+i", which is not allowed.
  if (AM.Scale == 1)
    return !AM.HasBaseReg;

  // No other scale is allowed.
  return false;
}


//===----------------------------------------------------------------------===//

//                         NVPTX Inline Assembly Support

//===----------------------------------------------------------------------===//


/// Classify an inline-asm constraint string for this target. The
/// single-letter constraints listed below are register-class constraints;
/// everything else is deferred to the generic TargetLowering implementation.
NVPTXTargetLowering::ConstraintType
NVPTXTargetLowering::getConstraintType(StringRef Constraint) const {
  // Only single-character constraints get target-specific treatment.
  if (Constraint.size() != 1)
    return TargetLowering::getConstraintType(Constraint);

  switch (Constraint[0]) {
  case 'b':
  case 'r':
  case 'h':
  case 'c':
  case 'l':
  case 'f':
  case 'd':
  case 'q':
  case '0':
  case 'N':
    return C_RegisterClass;
  default:
    return TargetLowering::getConstraintType(Constraint);
  }
}


std::pair<unsigned, const TargetRegisterClass *>


NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,

                                                  StringRef Constraint,

                                                  MVT VT) const {

  if (Constraint.size() == 1) {

    switch (Constraint[0]) {

    case 'b':

      return std::make_pair(0U, &NVPTX::B1RegClass);

    case 'c':

    case 'h':

      return std::make_pair(0U, &NVPTX::B16RegClass);

    case 'r':

    case 'f':

      return std::make_pair(0U, &NVPTX::B32RegClass);

    case 'l':

    case 'N':

    case 'd':

      return std::make_pair(0U, &NVPTX::B64RegClass);

    case 'q': {

      if (STI.getSmVersion() < 70)

        report_fatal_error("Inline asm with 128 bit operands is only "

                           "supported for sm_70 and higher!");

      return std::make_pair(0U, &NVPTX::B128RegClass);

    }

    }

  }

  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);

}


//===----------------------------------------------------------------------===//

//                         NVPTX DAG Combining

//===----------------------------------------------------------------------===//


/// Decide whether floating-point multiply/add pairs may be contracted into
/// FMA for \p MF at optimization level \p OptLevel.
bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
                                   CodeGenOptLevel OptLevel) const {
  // An explicit command-line setting always wins.
  const bool SetOnCommandLine = FMAContractLevelOpt.getNumOccurrences() > 0;
  if (SetOnCommandLine)
    return FMAContractLevelOpt > 0;

  // Never contract when the code is not being optimized. Otherwise contract
  // only if the TargetOptions explicitly say fusion is okay.
  return OptLevel != CodeGenOptLevel::None &&
         MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast;
}


/// Return true if \p Operand is a ConstantSDNode whose zero-extended value
/// is 0.
static bool isConstZero(const SDValue &Operand) {
  if (const auto *C = dyn_cast<ConstantSDNode>(Operand))
    return C->getZExtValue() == 0;
  return false;
}


/// Try DAG combinations for an ADD whose operands are \p N0 and \p N1. This
/// is a helper for PerformADDCombine, which calls it first with the default
/// operand order and, if that fails, with the operands commuted.
///
/// The only fold performed is:
///   (add (select cond, 0, (mul a, b)), c)
///     -> (select cond, c, (add (mul a, b), c))
/// with the zero allowed in either arm of the select.
static SDValue
PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
                              TargetLowering::DAGCombinerInfo &DCI) {
  // Integer multiply-add costs the same as integer multiply but more than
  // integer add, so only fuse when the add is the sole user.
  // TODO: this may not be true for later architectures, consider relaxing this
  if (!N0.getNode()->hasOneUse())
    return SDValue();

  if (N0.getOpcode() != ISD::SELECT)
    return SDValue();

  // Find the select arm that is constant zero; the true arm is tried first.
  const bool ZeroIsTrueArm = isConstZero(N0->getOperand(1));
  if (!ZeroIsTrueArm && !isConstZero(N0->getOperand(2)))
    return SDValue();

  // The other arm has to be a multiply used only by this select.
  SDValue MulArm = N0->getOperand(ZeroIsTrueArm ? 2 : 1);
  if (MulArm->getOpcode() != ISD::MUL || !MulArm.getNode()->hasOneUse())
    return SDValue();

  const EVT VT = N0.getValueType();
  SDLoc DL(N);
  SDValue Product = DCI.DAG.getNode(ISD::MUL, DL, VT, MulArm->getOperand(0),
                                    MulArm->getOperand(1));
  SDValue Sum = DCI.DAG.getNode(ISD::ADD, DL, VT, Product, N1);

  // The arm that used to be zero now yields the plain addend.
  SDValue TrueVal = ZeroIsTrueArm ? N1 : Sum;
  SDValue FalseVal = ZeroIsTrueArm ? Sum : N1;
  return DCI.DAG.getSelect(SDLoc(N), VT, N0->getOperand(0), TrueVal, FalseVal);
}


/// Try to turn (fadd (fmul a, b), c) into (fma a, b, c), where \p N is the
/// FADD, \p N0 the candidate FMUL and \p N1 the addend. Returns an empty
/// SDValue when the fold does not apply or the heuristics below reject it.
SDValue NVPTXTargetLowering::performFADDCombineWithOperands(
    SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI,
    CodeGenOptLevel OptLevel) const {
  if (N0.getOpcode() != ISD::FMUL)
    return SDValue();

  // Contraction must be allowed either globally or by the contract flag being
  // present on both the add and the mul.
  const bool MayContract =
      allowFMA(DCI.DAG.getMachineFunction(), OptLevel) ||
      (N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract());
  if (!MayContract)
    return SDValue();

  // Do the fusion only when the mul has fewer than 5 uses. A use that is not
  // an add cannot be fused into an fma, so the mul is still needed for it.
  // With more than 4 uses, even if all are adds, fusing them all would
  // increase register pressure.
  int UserCount = 0;
  bool HasNonFAddUser = false;
  for (const SDNode *User : N0.getNode()->users()) {
    ++UserCount;
    if (User->getOpcode() != ISD::FADD)
      HasNonFAddUser = true;
    if (UserCount >= 5)
      return SDValue();
  }

  if (HasNonFAddUser) {
    // Simple register-pressure heuristic: the IR-order difference measures
    // the distance between the mul (def) and this add (use).
    const int AddOrder = N->getIROrder();
    const int MulOrder = N0.getNode()->getIROrder();
    if (AddOrder - MulOrder < 500)
      return SDValue();

    // True if Operand has a user that comes after the add in IR order.
    auto HasUserAfterAdd = [AddOrder](const SDNode *Operand) {
      for (const SDNode *User : Operand->users()) {
        const int UserOrder = User->getIROrder();
        if (UserOrder > AddOrder)
          return true;
      }
      return false;
    };

    // At least one of the FMUL's operands has to be live beyond node N (or
    // be a constant), which guarantees that the FMA will not increase
    // register pressure at node N.
    const SDNode *MulLHS = N0.getOperand(0).getNode();
    const SDNode *MulRHS = N0.getOperand(1).getNode();
    const bool OperandStaysLive =
        isa<ConstantSDNode>(MulLHS) || isa<ConstantSDNode>(MulRHS) ||
        HasUserAfterAdd(MulLHS) || HasUserAfterAdd(MulRHS);
    if (!OperandStaysLive)
      return SDValue();
  }

  return DCI.DAG.getNode(ISD::FMA, SDLoc(N), N0.getValueType(),
                         N0.getOperand(0), N0.getOperand(1), N1);
}


/// Fold unpacking movs into a load by increasing the number of return values.
///
/// ex:
/// L: v2f16,ch = load <p>
/// a: f16 = extractelt L:0, 0
/// b: f16 = extractelt L:0, 1
/// use(a, b)
///
/// ...is turned into...
///
/// L: f16,f16,ch = LoadV2 <p>
/// use(L:0, L:1)
///
/// \p N must be an ISD::LOAD, NVPTXISD::LoadV2, NVPTXISD::LoadV4 or
/// NVPTXISD::LoadV8 node; any other opcode is a caller bug. Returns an empty
/// SDValue when the fold does not apply, otherwise a MERGE_VALUES node that
/// has the same outputs as \p N.
static SDValue
combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  // Don't run this optimization before the legalizer
  if (!DCI.isAfterLegalizeDAG())
    return SDValue();

  EVT ElementVT = N->getValueType(0);
  // Avoid non-packed types and v4i8
  if (!NVPTX::isPackedVectorTy(ElementVT) || ElementVT == MVT::v4i8)
    return SDValue();

  // Check whether all outputs are either used by an extractelt or are
  // glue/chain nodes
  if (!all_of(N->uses(), [&](SDUse &U) {
        // Skip glue, chain nodes
        if (U.getValueType() == MVT::Glue || U.getValueType() == MVT::Other)
          return true;
        if (U.getUser()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
          if (N->getOpcode() != ISD::LOAD)
            return true;
          // Since this is an ISD::LOAD, check all extractelts are used. If
          // any are not used, we don't want to defeat another optimization that
          // will narrow the load.
          //
          // For example:
          //
          // L: v2f16,ch = load <p>
          // e0: f16 = extractelt L:0, 0
          // e1: f16 = extractelt L:0, 1        <-- unused
          // store e0
          //
          // Can be optimized by DAGCombiner to:
          //
          // L: f16,ch = load <p>
          // store L:0
          return !U.getUser()->use_empty();
        }

        // Otherwise, this use prevents us from splitting a value.
        return false;
      }))
    return SDValue();

  auto *LD = cast<MemSDNode>(N);
  SDLoc DL(LD);

  // the new opcode after we double the number of operands
  unsigned Opcode;
  SmallVector<SDValue> Operands(LD->ops());
  unsigned OldNumOutputs; // non-glue, non-chain outputs
  switch (LD->getOpcode()) {
  case ISD::LOAD:
    OldNumOutputs = 1;
    // Any packed type is legal, so the legalizer will not have lowered
    // ISD::LOAD -> NVPTXISD::Load (unless it's under-aligned). We have to do it
    // here.
    Opcode = NVPTXISD::LoadV2;
    // append a "full" used bytes mask operand right before the extension type
    // operand, signifying that all bytes are used.
    Operands.push_back(DCI.DAG.getConstant(UINT32_MAX, DL, MVT::i32));
    Operands.push_back(DCI.DAG.getIntPtrConstant(
        cast<LoadSDNode>(LD)->getExtensionType(), DL));
    break;
  case NVPTXISD::LoadV2:
    OldNumOutputs = 2;
    Opcode = NVPTXISD::LoadV4;
    break;
  case NVPTXISD::LoadV4:
    // V8 is only supported for f32/i32. Don't forget, we're not changing the
    // load size here. This is already a 256-bit load.
    if (ElementVT != MVT::v2f32 && ElementVT != MVT::v2i32)
      return SDValue();
    OldNumOutputs = 4;
    Opcode = NVPTXISD::LoadV8;
    break;
  case NVPTXISD::LoadV8:
    // PTX doesn't support the next doubling of outputs
    return SDValue();
  default:
    // Mirrors combinePackingMovIntoStore: without this, an unexpected opcode
    // would fall through and read Opcode/OldNumOutputs uninitialized.
    llvm_unreachable("Unhandled load opcode");
  }

  // the non-glue, non-chain outputs in the new load
  const unsigned NewNumOutputs = OldNumOutputs * 2;
  SmallVector<EVT> NewVTs(NewNumOutputs, ElementVT.getVectorElementType());
  // add remaining chain and glue values
  NewVTs.append(LD->value_begin() + OldNumOutputs, LD->value_end());

  // Create the new load
  SDValue NewLoad = DCI.DAG.getMemIntrinsicNode(
      Opcode, DL, DCI.DAG.getVTList(NewVTs), Operands, LD->getMemoryVT(),
      LD->getMemOperand());

  // Now we use a combination of BUILD_VECTORs and a MERGE_VALUES node to keep
  // the outputs the same. These nodes will be optimized away in later
  // DAGCombiner iterations.
  SmallVector<SDValue> Results;
  for (unsigned I : seq(OldNumOutputs))
    Results.push_back(DCI.DAG.getBuildVector(
        ElementVT, DL, {NewLoad.getValue(I * 2), NewLoad.getValue(I * 2 + 1)}));
  // Add remaining chain and glue nodes
  for (unsigned I : seq(NewLoad->getNumValues() - NewNumOutputs))
    Results.push_back(NewLoad.getValue(NewNumOutputs + I));

  return DCI.DAG.getMergeValues(Results, DL);
}


/// Fold packing movs into a store.
///
/// ex:
/// v1: v2f16 = BUILD_VECTOR a:f16, b:f16
/// v2: v2f16 = BUILD_VECTOR c:f16, d:f16
/// StoreV2 v1, v2
///
/// ...is turned into...
///
/// StoreV4 a, b, c, d
///
/// \p Front and \p Back are the number of leading and trailing operands of
/// \p N that are not stored values; they are copied to the new node unchanged.
static SDValue combinePackingMovIntoStore(SDNode *N,
                                          TargetLowering::DAGCombinerInfo &DCI,
                                          unsigned Front, unsigned Back) {
  // Run as late as possible, since other optimizations may eliminate the
  // BUILD_VECTORs.
  if (!DCI.isAfterLegalizeDAG())
    return SDValue();

  // Type of the values being stored; only packed types other than v4i8
  // qualify.
  const EVT ElementVT = N->getOperand(Front).getValueType();
  if (!NVPTX::isPackedVectorTy(ElementVT) || ElementVT == MVT::v4i8)
    return SDValue();

  auto *ST = cast<MemSDNode>(N);

  // Pick the opcode that takes twice as many stored operands.
  unsigned Opcode;
  switch (N->getOpcode()) {
  case ISD::STORE:
    // Any packed type is legal, so the legalizer will not have lowered
    // ISD::STORE -> NVPTXISD::Store (unless it's under-aligned). It has to be
    // done here.
    Opcode = NVPTXISD::StoreV2;
    break;
  case NVPTXISD::StoreV2:
    Opcode = NVPTXISD::StoreV4;
    break;
  case NVPTXISD::StoreV4:
    // V8 is only supported for f32/i32. The store size does not change here;
    // this is already a 256-bit store.
    if (ElementVT != MVT::v2f32 && ElementVT != MVT::v2i32)
      return SDValue();
    Opcode = NVPTXISD::StoreV8;
    break;
  case NVPTXISD::StoreV8:
    // PTX doesn't support the next doubling of operands
    return SDValue();
  default:
    llvm_unreachable("Unhandled store opcode");
  }

  // DAGCombiner visits nodes bottom-up, so look at a BUILD_VECTOR element for
  // signs that it may be folded by some other pattern or rule.
  auto MayBeFoldedElsewhere = [](SDValue Elt) {
    // Peek through bitcasts
    if (Elt.getOpcode() == ISD::BITCAST)
      Elt = Elt.getOperand(0);

    // An i32 -> i16 truncate may be folded into a PRMT.
    const bool MayBecomePRMT = Elt.getValueType() == MVT::i16 &&
                               Elt.getOpcode() == ISD::TRUNCATE &&
                               Elt->getOperand(0).getValueType() == MVT::i32;
    // An FP_ROUND may be folded into cvt.bf16x2.
    const bool MayBecomeCvt = Elt.getOpcode() == ISD::FP_ROUND;
    return MayBecomePRMT || MayBecomeCvt;
  };

  // Start from the leading operands, then replace every stored BUILD_VECTOR
  // with its two elements. Give up as soon as one stored operand disqualifies.
  SmallVector<SDValue, 4> NewOps(N->ops().take_front(Front));
  for (SDValue Packed : N->ops().drop_front(Front).drop_back(Back)) {
    // Must be a BUILD_VECTOR; if it has multiple uses, this optimization can
    // increase register pressure.
    if (Packed.getOpcode() != ISD::BUILD_VECTOR || !Packed.hasOneUse())
      return SDValue();

    for (SDValue Elt : Packed->ops())
      if (MayBeFoldedElsewhere(Elt))
        return SDValue();

    NewOps.append({Packed.getOperand(0), Packed.getOperand(1)});
  }
  NewOps.append(N->op_end() - Back, N->op_end());

  // Build the replacement store with the same result types and memory info.
  return DCI.DAG.getMemIntrinsicNode(Opcode, SDLoc(N), N->getVTList(), NewOps,
                                     ST->getMemoryVT(), ST->getMemOperand());
}


/// DAG combine for store nodes: custom-lowers an ISD::STORE of a non-simple
/// type before legalization, and otherwise tries to fold packing movs into
/// the store.
static SDValue combineSTORE(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
                            const NVPTXSubtarget &STI) {
  const bool IsPreLegalizeStore =
      DCI.isBeforeLegalize() && N->getOpcode() == ISD::STORE;
  if (IsPreLegalizeStore) {
    // This is the chance to custom lower a store with a non-simple type. It
    // cannot be done in the legalizer because there is no way to
    // setOperationAction for a non-simple type.
    auto *Store = cast<StoreSDNode>(N);
    const bool HasSimpleVT = Store->getValue().getValueType().isSimple();
    if (!HasSimpleVT)
      return lowerSTOREVector(SDValue(Store, 0), DCI.DAG, STI);
  }

  return combinePackingMovIntoStore(N, DCI, /*Front=*/1, /*Back=*/2);
}


/// DAG combine for load nodes: custom-lowers an ISD::LOAD of a non-simple
/// type before legalization, and otherwise tries to fold unpacking movs into
/// the load.
static SDValue combineLOAD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
                           const NVPTXSubtarget &STI) {
  // This is the chance to custom lower a load with a non-simple type. It
  // cannot be done in the legalizer because there is no way to
  // setOperationAction for a non-simple type.
  const bool IsPreLegalizeLoad =
      DCI.isBeforeLegalize() && N->getOpcode() == ISD::LOAD;
  if (IsPreLegalizeLoad && !N->getValueType(0).isSimple())
    return lowerLoadVector(N, DCI.DAG, STI);

  return combineUnpackingMovIntoLoad(N, DCI);
}


/// Target-specific DAG combine for ISD::ADD. Only scalar i32 adds are
/// considered, and nothing is done when optimization is disabled.
static SDValue PerformADDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 CodeGenOptLevel OptLevel) {
  if (OptLevel == CodeGenOptLevel::None)
    return SDValue();

  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  // Skip anything that is not a scalar i32.
  const EVT VT = LHS.getValueType();
  if (VT.isVector() || VT != MVT::i32)
    return SDValue();

  // Try the default operand order first, then the commuted one.
  SDValue Combined = PerformADDCombineWithOperands(N, LHS, RHS, DCI);
  if (!Combined)
    Combined = PerformADDCombineWithOperands(N, RHS, LHS, DCI);
  return Combined;
}


/// Check if a v2f32 BUILD_VECTOR provably packs values from non-adjacent

/// register pairs (non-coalescable).


static bool isNonCoalescableBuildVector(const SDValue &BV) {

  if (BV.getOpcode() != ISD::BUILD_VECTOR || BV.getValueType() != MVT::v2f32)

    return false;


  SDValue Elt0 = BV.getOperand(0);

  SDValue Elt1 = BV.getOperand(1);


  bool IsExt0 = Elt0.getOpcode() == ISD::EXTRACT_VECTOR_ELT;

  bool IsExt1 = Elt1.getOpcode() == ISD::EXTRACT_VECTOR_ELT;


  // If neither element is an EXTRACT_VECTOR_ELT they are free-standing

  // scalars and the register allocator can still place them side-by-side.

  if (!IsExt0 && !IsExt1)

    return false;


  // If exactly one element is an EXTRACT_VECTOR_ELT, the other is a scalar

  // that cannot generally occupy the adjacent register slot.

  if (IsExt0 != IsExt1)

    return true;


  // At this point both sources are extracting from vectors. If they are from

  // different vectors, then the BUILD_VECTOR is non-coalescable.

  SDValue Src0 = Elt0.getOperand(0);

  SDValue Src1 = Elt1.getOperand(0);

  if (Src0 != Src1)

    return true;


  auto *Idx0 = dyn_cast<ConstantSDNode>(Elt0.getOperand(1));

  auto *Idx1 = dyn_cast<ConstantSDNode>(Elt1.getOperand(1));

  // If both indices are dynamic they will be lowered to

  // loads and the vector will be spilled to local memory. The register

  // allocator can easily place the results in adjacent registers.

  if (!Idx0 && !Idx1)

    return false;


  // If one index is dynamic and the other is constant, the value from the

  // constant load will result in an additional register to pair with the result

  // from the dynamic load. We consider this non-coalescable.

  if ((Idx0 && !Idx1) || (!Idx0 && Idx1))

    return true;


  // Both are constant, adjacent pairs are coalescable

  return std::abs(Idx0->getSExtValue() - Idx1->getSExtValue()) != 1;

}


/// Return true if the v2f32 FMUL node \p N may be scalarized so that each
/// lane's product can be folded into a scalar FMA. This requires exactly two
/// users, each a single-use constant-index EXTRACT_VECTOR_ELT feeding a
/// distinct, contractable FADD.
bool NVPTXTargetLowering::mayFoldFMULIntoFMA(SDNode *N, MachineFunction &MF,
                                             CodeGenOptLevel OptLevel) const {
  if (N->getOpcode() != ISD::FMUL || N->getValueType(0) != MVT::v2f32)
    return false;

  // Contraction has to be permitted globally or by the flag on the FMUL.
  const bool GlobalFMA = allowFMA(MF, OptLevel);
  if (!(GlobalFMA || N->getFlags().hasAllowContract()))
    return false;

  const SDNode *SeenFAdd = nullptr;
  unsigned LaneFAddCount = 0;

  for (SDNode *Extract : N->users()) {
    // A third user means more than the two lanes are involved.
    if (LaneFAddCount == 2)
      return false;

    const bool IsSingleUseConstExtract =
        Extract->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
        Extract->hasOneUse() && isa<ConstantSDNode>(Extract->getOperand(1));
    if (!IsSingleUseConstExtract)
      return false;

    // The only user of the extract has to be an FADD that may be contracted.
    const SDNode *const Consumer = *Extract->users().begin();
    const bool IsContractableFAdd =
        Consumer->getOpcode() == ISD::FADD &&
        (GlobalFMA || Consumer->getFlags().hasAllowContract());
    if (!IsContractableFAdd)
      return false;

    // Both lanes must feed unique FADDs.
    if (Consumer == SeenFAdd)
      return false;
    if (!SeenFAdd)
      SeenFAdd = Consumer;

    ++LaneFAddCount;
  }

  return LaneFAddCount == 2;
}


/// Scalarize a v2f32 arithmetic node (FADD, FMUL, FSUB, FMA) when at least
/// one operand is a BUILD_VECTOR that repacks values from non-adjacent
/// register pairs. Without this combine the BUILD_VECTOR forces allocation of
/// a temporary 64-bit register, increasing register pressure.
///
/// Example - before:
///   t0: v2f32,v2f32,ch = LoadV2 ...
///   t1: f32 = extract_vector_elt t0, 0
///   t2: f32 = extract_vector_elt t0:1, 0
///   t3: v2f32 = BUILD_VECTOR t1, t2       ;; non-coalescable repack
///   t4: v2f32 = fma t_a, t3, t_c
///
/// After:
///   t0: v2f32,v2f32,ch = LoadV2 ...
///   t1: f32 = extract_vector_elt t0, 0
///   t2: f32 = extract_vector_elt t0:1, 0
///   a0: f32 = extract_vector_elt t_a, 0
///   a1: f32 = extract_vector_elt t_a, 1
///   c0: f32 = extract_vector_elt t_c, 0
///   c1: f32 = extract_vector_elt t_c, 1
///   r0: f32 = fma a0, t1, c0
///   r1: f32 = fma a1, t2, c1
///   t4: v2f32 = BUILD_VECTOR r0, r1
///
/// An FMUL is also scalarized when all of its output lanes feed into scalar
/// FADDs, to enable scalar FMA combining.
SDValue NVPTXTargetLowering::performScalarizeV2F32Op(
    SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
    CodeGenOptLevel OptLevel) const {
  const EVT VT = N->getValueType(0);
  if (VT != MVT::v2f32)
    return SDValue();

  // Scalarize if some operand is a non-coalescable repack, or if this is an
  // FMUL whose lanes can be folded into scalar FMAs.
  const bool ShouldScalarize =
      !none_of(N->ops(), isNonCoalescableBuildVector) ||
      mayFoldFMULIntoFMA(N, DCI.DAG.getMachineFunction(), OptLevel);
  if (!ShouldScalarize)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  const EVT EltVT = VT.getVectorElementType();
  const unsigned Opc = N->getOpcode();

  // Collect the scalar operands of lane 0 and lane 1. An operand that is a
  // BUILD_VECTOR supplies its element directly; any other operand gets an
  // EXTRACT_VECTOR_ELT.
  SmallVector<SDValue, 3> Lane0Ops, Lane1Ops;
  for (const SDValue &Operand : N->ops()) {
    const bool IsBuildVector = Operand.getOpcode() == ISD::BUILD_VECTOR;
    for (unsigned Lane : {0u, 1u}) {
      SDValue Elt = IsBuildVector
                        ? Operand.getOperand(Lane)
                        : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
                                      Operand,
                                      DAG.getVectorIdxConstant(Lane, DL));
      (Lane == 0 ? Lane0Ops : Lane1Ops).push_back(Elt);
    }
  }

  // Perform the operation per lane with the original flags, then repack.
  SDValue Lane0 = DAG.getNode(Opc, DL, EltVT, Lane0Ops, N->getFlags());
  SDValue Lane1 = DAG.getNode(Opc, DL, EltVT, Lane1Ops, N->getFlags());
  return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Lane0, Lane1);
}


/// Target-specific DAG combine for ISD::FADD. Scalarization of v2f32 is tried
/// first; after that only scalar f32/f64 adds are considered for FMA
/// contraction.
SDValue
NVPTXTargetLowering::performFADDCombine(SDNode *N,
                                        TargetLowering::DAGCombinerInfo &DCI,
                                        CodeGenOptLevel OptLevel) const {
  if (SDValue Scalarized = performScalarizeV2F32Op(N, DCI, OptLevel))
    return Scalarized;

  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  const EVT VT = LHS.getValueType();
  const bool IsScalarF32OrF64 =
      !VT.isVector() && (VT == MVT::f32 || VT == MVT::f64);
  if (!IsScalarF32OrF64)
    return SDValue();

  // Try the default operand order first, then the commuted one.
  SDValue Fused = performFADDCombineWithOperands(N, LHS, RHS, DCI, OptLevel);
  if (!Fused)
    Fused = performFADDCombineWithOperands(N, RHS, LHS, DCI, OptLevel);
  return Fused;
}


/// Map a 2-input floating-point min/max opcode to its 3-input NVPTX
/// counterpart. Any other opcode is a caller bug.
static unsigned getMinMax3Opcode(unsigned MinMax2Opcode) {
  if (MinMax2Opcode == ISD::FMAXNUM || MinMax2Opcode == ISD::FMAXIMUMNUM)
    return NVPTXISD::FMAXNUM3;
  if (MinMax2Opcode == ISD::FMINNUM || MinMax2Opcode == ISD::FMINIMUMNUM)
    return NVPTXISD::FMINNUM3;
  if (MinMax2Opcode == ISD::FMAXIMUM)
    return NVPTXISD::FMAXIMUM3;
  if (MinMax2Opcode == ISD::FMINIMUM)
    return NVPTXISD::FMINIMUM3;
  llvm_unreachable("Invalid 2-input min/max opcode");
}


/// Combine a nested pair of identical 2-input min/max nodes into the 3-input
/// form, e.g. (fmaxnum (fmaxnum a, b), c) -> (fmaxnum3 a, b, c). The same
/// applies to the other llvm min/max opcodes handled by getMinMax3Opcode.
static SDValue PerformFMinMaxCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     unsigned PTXVersion, unsigned SmVersion) {
  // 3-input min/max requires PTX 8.8+ and SM_100+, and only supports f32s
  const EVT VT = N->getValueType(0);
  const bool Supported =
      VT == MVT::f32 && PTXVersion >= 88 && SmVersion >= 100;
  if (!Supported)
    return SDValue();

  const unsigned Opc2 = N->getOpcode();
  const unsigned Opc3 = getMinMax3Opcode(Opc2);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  // An operand can be absorbed if it is the same min/max with a single use.
  auto IsFoldableInner = [Opc2](SDValue V) {
    return V.getOpcode() == Opc2 && V.hasOneUse();
  };

  SDValue A, B, C;
  if (IsFoldableInner(LHS)) {
    // (maxnum (maxnum a, b), c) -> (maxnum3 a, b, c)
    A = LHS.getOperand(0);
    B = LHS.getOperand(1);
    C = RHS;
  } else if (IsFoldableInner(RHS)) {
    // (maxnum a, (maxnum b, c)) -> (maxnum3 a, b, c)
    A = LHS;
    B = RHS.getOperand(0);
    C = RHS.getOperand(1);
  } else {
    return SDValue();
  }

  return DCI.DAG.getNode(Opc3, SDLoc(N), VT, A, B, C, N->getFlags());
}


/// DAG combine for ISD::SREM / ISD::UREM. If the numerator already has a user
/// that is the matching division by the same denominator, the remainder is
/// rewritten as Num - (Num / Den) * Den.
static SDValue PerformREMCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 CodeGenOptLevel OptLevel) {
  assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM);

  // Don't do anything at less than -O2.
  if (OptLevel < CodeGenOptLevel::Default)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  const EVT VT = N->getValueType(0);
  const unsigned DivOpc = N->getOpcode() == ISD::SREM ? ISD::SDIV : ISD::UDIV;

  const SDValue &Num = N->getOperand(0);
  const SDValue &Den = N->getOperand(1);

  for (const SDNode *User : Num->users()) {
    const bool IsSameDivision = User->getOpcode() == DivOpc &&
                                User->getOperand(0) == Num &&
                                User->getOperand(1) == Den;
    if (!IsSameDivision)
      continue;

    // Num % Den -> Num - (Num / Den) * Den
    SDValue Quotient = DAG.getNode(DivOpc, DL, VT, Num, Den);
    SDValue Product = DAG.getNode(ISD::MUL, DL, VT, Quotient, Den);
    return DAG.getNode(ISD::SUB, DL, VT, Num, Product);
  }
  return SDValue();
}


// Fold an extension of a non-wrapping multiply (or constant left shift) into a
// widening multiply:
//
//   sext (mul.iN nsw x, y)     => mul.wide.sN x, y
//   zext (mul.iN nuw x, y)     => mul.wide.uN x, y
//   sext (shl.iN nsw x, const) => mul.wide.sN x, (1 << const)
//   zext (shl.iN nuw x, const) => mul.wide.uN x, (1 << const)
//
// Only i16 -> i32 and i32 -> i64 extensions of a single-use operand are
// handled, and nothing is done at CodeGenOptLevel::None. Returns an empty
// SDValue when the pattern does not apply.
static SDValue combineSZExtToMulWide(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     CodeGenOptLevel OptLevel) {
  assert(N->getOpcode() == ISD::SIGN_EXTEND ||
         N->getOpcode() == ISD::ZERO_EXTEND);

  if (OptLevel == CodeGenOptLevel::None)
    return SDValue();

  SDValue Op = N->getOperand(0);
  if (!Op.hasOneUse())
    return SDValue();

  EVT ToVT = N->getValueType(0);
  EVT FromVT = Op.getValueType();
  if (!((ToVT == MVT::i32 && FromVT == MVT::i16) ||
        (ToVT == MVT::i64 && FromVT == MVT::i32)))
    return SDValue();

  // Check the opcode before touching the operands: only MUL and SHL are known
  // to carry the two operands read below. Any other node that reaches this
  // point may have fewer, and indexing operand 1 on it would be out of range.
  const unsigned OpOpc = Op.getOpcode();
  if (OpOpc != ISD::MUL && OpOpc != ISD::SHL)
    return SDValue();

  // The fold is only sound when the narrow operation cannot wrap in the
  // signedness of the extension.
  bool IsSigned = N->getOpcode() == ISD::SIGN_EXTEND;
  if ((IsSigned && !Op->getFlags().hasNoSignedWrap()) ||
      (!IsSigned && !Op->getFlags().hasNoUnsignedWrap()))
    return SDValue();

  SDLoc DL(N);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  unsigned MulWideOpcode =
      IsSigned ? NVPTXISD::MUL_WIDE_SIGNED : NVPTXISD::MUL_WIDE_UNSIGNED;

  if (OpOpc == ISD::MUL)
    return DCI.DAG.getNode(MulWideOpcode, DL, ToVT, LHS, RHS);

  // SHL: only a constant shift amount that is strictly smaller than the
  // source width can be expressed as a multiplier. Larger amounts have no
  // defined result and would trip the APInt shift assertion below.
  const auto *ShiftC = dyn_cast<ConstantSDNode>(RHS);
  const unsigned FromBits = FromVT.getSizeInBits();
  if (!ShiftC || !ShiftC->getAPIntValue().ult(FromBits))
    return SDValue();

  const unsigned ShiftAmt = ShiftC->getZExtValue();
  const APInt MulVal = APInt(FromBits, 1) << ShiftAmt;

  // Note that the sext (shl nsw ...) case doesn't work if 1 << const
  // overflows to a negative value!  The only valid input values in this
  // case are 0 and -1 (all other values yield poison because of the nsw),
  // and mul.wide.sN would give us the wrong sign for -1.  We could use
  // mul.wide.uN, but since this is a weird case anyway, we might as well not
  // apply this transformation at all.
  if (IsSigned && MulVal.isNegative())
    return SDValue();

  RHS = DCI.DAG.getConstant(MulVal, DL, FromVT);
  return DCI.DAG.getNode(MulWideOpcode, DL, ToVT, LHS, RHS);
}


// Signedness of a candidate mul.wide operand, as reported by
// IsMulWideOperandDemotable.
enum OperandSignedness {

  // The operand is a sign extension (SIGN_EXTEND / SIGN_EXTEND_INREG).
  Signed = 0,

  // The operand is a zero extension (ZERO_EXTEND).
  Unsigned,

  // The signedness could not be determined.
  Unknown

};


/// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
/// that can be demoted to \p OptSize bits without loss of information. The
/// signedness of the operand, if determinable, is placed in \p S.
///
/// An operand qualifies when it is a SIGN_EXTEND, SIGN_EXTEND_INREG or
/// ZERO_EXTEND from a type that is at most \p OptSize bits wide. \p S is reset
/// to Unknown on entry and is left as Unknown whenever false is returned.
static bool IsMulWideOperandDemotable(SDValue Op,
                                      unsigned OptSize,
                                      OperandSignedness &S) {
  S = Unknown;

  EVT OrigVT;
  OperandSignedness Sign = Unknown;
  switch (Op.getOpcode()) {
  case ISD::SIGN_EXTEND:
    OrigVT = Op.getOperand(0).getValueType();
    Sign = Signed;
    break;
  case ISD::SIGN_EXTEND_INREG:
    // Operand 0 of SIGN_EXTEND_INREG already has the (wide) result type, so
    // testing it could never succeed. The type being extended from is carried
    // by the VTSDNode in operand 1.
    OrigVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
    Sign = Signed;
    break;
  case ISD::ZERO_EXTEND:
    OrigVT = Op.getOperand(0).getValueType();
    Sign = Unsigned;
    break;
  default:
    return false;
  }

  // The value must have originated from a type that fits in OptSize bits.
  if (OrigVT.getFixedSizeInBits() > OptSize)
    return false;

  S = Sign;
  return true;
}


/// AreMulWideOperandsDemotable - Decides whether both multiply operands can be
/// narrowed to \p OptSize bits without changing their value.
///
/// \p LHS must be a demotable operand with a known signedness. \p RHS is
/// either a constant that fits in \p OptSize bits under that signedness, or a
/// demotable operand of the same signedness; a constant operand is therefore
/// expected on the right. \p IsSigned receives the signedness of \p LHS once
/// it has been determined.
static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
                                        unsigned OptSize,
                                        bool &IsSigned) {
  // The left operand has to be demotable and tell us the signedness.
  OperandSignedness LeftSign;
  if (!IsMulWideOperandDemotable(LHS, OptSize, LeftSign) ||
      LeftSign == Unknown)
    return false;

  IsSigned = LeftSign == Signed;

  // A constant only needs to be representable in OptSize bits.
  if (const auto *C = dyn_cast<ConstantSDNode>(RHS)) {
    const APInt &Val = C->getAPIntValue();
    return IsSigned ? Val.isSignedIntN(OptSize) : Val.isIntN(OptSize);
  }

  // Otherwise the right operand must be demotable with matching signedness.
  OperandSignedness RightSign;
  return IsMulWideOperandDemotable(RHS, OptSize, RightSign) &&
         RightSign == LeftSign;
}


/// TryMULWIDECombine - Tries to turn an M-bit multiply into a multiply of two
/// M/2-bit values that yields an M-bit result (i.e. mul.wide).
///
/// Handles i32 and i64 results only. \p N may be a MUL, or a SHL whose shift
/// amount is a constant in [0, M); the shift is treated as a multiply by
/// 1 << amount. Returns an empty SDValue when the operands cannot be narrowed.
static SDValue TryMULWIDECombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  const EVT MulType = N->getValueType(0);
  const bool Is32Bit = MulType == MVT::i32;
  if (!Is32Bit && MulType != MVT::i64)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  const unsigned FullBits = MulType.getSizeInBits();
  const unsigned HalfBits = FullBits >> 1;
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  switch (N->getOpcode()) {
  case ISD::MUL:
    // Canonicalize the multiply so the constant (if any) is on the right.
    if (isa<ConstantSDNode>(LHS))
      std::swap(LHS, RHS);
    break;
  case ISD::SHL: {
    // Replace the shift amount with the multiplier it stands for.
    const auto *ShAmtC = dyn_cast<ConstantSDNode>(RHS);
    if (!ShAmtC)
      return SDValue();

    const APInt &ShAmt = ShAmtC->getAPIntValue();
    if (ShAmt.slt(0) || ShAmt.sge(FullBits))
      return SDValue();

    RHS = DAG.getConstant(APInt(FullBits, 1) << ShAmt, DL, MulType);
    break;
  }
  default:
    break;
  }

  // Verify that our operands are demotable.
  bool IsSignedMul;
  if (!AreMulWideOperandsDemotable(LHS, RHS, HalfBits, IsSignedMul))
    return SDValue();

  // Truncate the operands to the narrow type. These truncates are only there
  // for type consistency and will (likely) be eliminated in later phases.
  const EVT NarrowVT = EVT(Is32Bit ? MVT::i16 : MVT::i32);
  SDValue NarrowLHS = DAG.getNode(ISD::TRUNCATE, DL, NarrowVT, LHS);
  SDValue NarrowRHS = DAG.getNode(ISD::TRUNCATE, DL, NarrowVT, RHS);

  const unsigned WideOpc =
      IsSignedMul ? NVPTXISD::MUL_WIDE_SIGNED : NVPTXISD::MUL_WIDE_UNSIGNED;
  return DAG.getNode(WideOpc, DL, MulType, NarrowLHS, NarrowRHS);
}


/// Returns true if \p Operand is an integer constant node whose value is one.
static bool isConstOne(const SDValue &Operand) {
  const auto *Const = dyn_cast<ConstantSDNode>(Operand);
  // isOne() tests the underlying APInt directly, so unlike getZExtValue() it
  // does not assert when the constant needs more than 64 bits.
  return Const && Const->isOne();
}


/// Matches (add y, 1) or (add 1, y) and returns y. Returns an empty SDValue if
/// \p Add is not an ADD or neither of its operands is the constant one.
static SDValue matchMADConstOnePattern(SDValue Add) {
  if (Add->getOpcode() != ISD::ADD)
    return SDValue();

  const SDValue First = Add->getOperand(0);
  const SDValue Second = Add->getOperand(1);

  // The operand that is not the constant one is the interesting one.
  if (isConstOne(First))
    return Second;
  if (isConstOne(Second))
    return First;
  return SDValue();
}


/// Rewrites (mul x, (add y, 1)) as (add (mul x, y), x).
///
/// \p X is the other multiply operand and \p Add the candidate add-by-one.
/// Returns an empty SDValue when \p Add does not have that shape.
static SDValue combineMADConstOne(SDValue X, SDValue Add, EVT VT, SDLoc DL,
                                  TargetLowering::DAGCombinerInfo &DCI) {
  SDValue Y = matchMADConstOnePattern(Add);
  if (!Y)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDValue Product = DAG.getNode(ISD::MUL, DL, VT, X, Y);
  return DAG.getNode(ISD::ADD, DL, VT, Product, X);
}


/// Pushes a multiply by \p X into a select that has the constant one on one
/// arm, so that the multiply is only applied to the other arm:
///
///   (mul x, (select c, 1, y)) -> (select c, x, (mul x, y))
///   (mul x, (select c, y, 1)) -> (select c, (mul x, y), x)
///
/// Only done when y itself is an add-by-one, as matched by
/// matchMADConstOnePattern. Returns an empty SDValue otherwise.
static SDValue combineMulSelectConstOne(SDValue X, SDValue Select, EVT VT,
                                        SDLoc DL,
                                        TargetLowering::DAGCombinerInfo &DCI) {
  if (Select->getOpcode() != ISD::SELECT)
    return SDValue();

  const SDValue TrueVal = Select->getOperand(1);
  const SDValue FalseVal = Select->getOperand(2);

  // The true arm takes precedence when both arms are the constant one.
  const bool OneOnTrueArm = isConstOne(TrueVal);
  if (!OneOnTrueArm && !isConstOne(FalseVal))
    return SDValue();

  SDValue Y = OneOnTrueArm ? FalseVal : TrueVal;

  // Do not combine if the resulting sequence is not obviously profitable.
  if (!matchMADConstOnePattern(Y))
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDValue NewMul = DAG.getNode(ISD::MUL, DL, VT, X, Y);
  SDValue Cond = Select->getOperand(0);

  if (OneOnTrueArm)
    return DAG.getNode(ISD::SELECT, DL, VT, Cond, X, NewMul);
  return DAG.getNode(ISD::SELECT, DL, VT, Cond, NewMul, X);
}


/// Runs the multiply combines that depend on the shape of one operand, trying
/// each pattern with the operands in both orders. Only scalar i16, i32 and i64
/// multiplies are considered. Returns an empty SDValue if nothing matched.
static SDValue
PerformMULCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
                              TargetLowering::DAGCombinerInfo &DCI) {
  const EVT VT = N0.getValueType();
  const bool IsHandledType =
      !VT.isVector() &&
      (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64);
  if (!IsHandledType)
    return SDValue();

  SDLoc DL(N);

  // (mul x, (add y, 1)) -> (add (mul x, y), x)
  SDValue Res = combineMADConstOne(N0, N1, VT, DL, DCI);
  if (!Res)
    Res = combineMADConstOne(N1, N0, VT, DL, DCI);

  // (mul x, (select c, 1, y)) -> (select c, x, (mul x, y)), and the mirrored
  // form with the constant on the false arm.
  if (!Res)
    Res = combineMulSelectConstOne(N0, N1, VT, DL, DCI);
  if (!Res)
    Res = combineMulSelectConstOne(N1, N0, VT, DL, DCI);

  return Res;
}


/// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
///
/// mul.wide formation is attempted first; if it does not apply, the
/// operand-shape based combines are tried. Disabled at CodeGenOptLevel::None.
static SDValue PerformMULCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 CodeGenOptLevel OptLevel) {
  if (OptLevel == CodeGenOptLevel::None)
    return SDValue();

  SDValue Wide = TryMULWIDECombine(N, DCI);
  if (Wide)
    return Wide;

  return PerformMULCombineWithOperands(N, N->getOperand(0), N->getOperand(1),
                                       DCI);
}


/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
///
/// The only pattern is mul.wide formation, which is attempted at any
/// optimization level above CodeGenOptLevel::None.
static SDValue PerformSHLCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 CodeGenOptLevel OptLevel) {
  if (OptLevel <= CodeGenOptLevel::None)
    return SDValue();

  // Try mul.wide combining at OptLevel > 0.
  return TryMULWIDECombine(N, DCI);
}


/// Lowers a v2i1 comparison of v2f16 / v2bf16 operands to a single
/// SETP_F16X2 / SETP_BF16X2 node.
///
/// The v2bf16 form is only used when \p SmVersion is at least 90. Returns an
/// empty SDValue for every other type combination.
static SDValue PerformSETCCCombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   unsigned int SmVersion) {
  SDValue A = N->getOperand(0);
  SDValue B = N->getOperand(1);
  const EVT CCType = N->getValueType(0);
  const EVT AType = A.getValueType();

  const bool IsF16 = AType == MVT::v2f16;
  const bool IsBF16 = AType == MVT::v2bf16;
  if (CCType != MVT::v2i1 || !(IsF16 || IsBF16))
    return SDValue();

  if (IsBF16 && SmVersion < 90)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  // setp.f16x2 returns two scalar predicates, which we need to convert back
  // to v2i1. The returned result will be scalarized by the legalizer, but the
  // comparison will remain a single vector instruction.
  const unsigned SetpOpc =
      IsF16 ? NVPTXISD::SETP_F16X2 : NVPTXISD::SETP_BF16X2;
  SDValue CCNode = DAG.getNode(SetpOpc, DL, DAG.getVTList(MVT::i1, MVT::i1),
                               {A, B, N->getOperand(2)});
  return DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
                     CCNode.getValue(1));
}


/// Turns an extract_vector_elt with a non-zero constant index from a 16-, 32-
/// or 64-bit vector into integer arithmetic: bitcast the vector to an integer,
/// shift the wanted element down, and truncate.
///
/// Native PTX vector loads, single-element vectors, packed vector types, v8i8
/// and undef inputs are left alone. Returns an empty SDValue when the combine
/// does not apply.
static SDValue PerformEXTRACTCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI) {
  SDValue Vector = peekThroughFreeze(N->getOperand(0));
  const EVT VectorVT = Vector.getValueType();

  // Native vector loads already combine nicely w/ extract_vector_elt.
  const bool IsNativeVectorLoad =
      Vector->getOpcode() == ISD::LOAD && VectorVT.isSimple() &&
      IsPTXVectorType(VectorVT.getSimpleVT());
  if (IsNativeVectorLoad)
    return SDValue();

  // Don't mess with singletons or packed types (v2*32, v2*16, v4i8 and v8i8),
  // we already handle them OK.
  if (VectorVT.getVectorNumElements() == 1 ||
      NVPTX::isPackedVectorTy(VectorVT) || VectorVT == MVT::v8i8)
    return SDValue();

  // Don't mess with undef values as sra may be simplified to 0, not undef.
  if (Vector->isUndef() || ISD::allOperandsUndef(Vector.getNode()))
    return SDValue();

  // We only handle the types we can extract in-register.
  const uint64_t VectorBits = VectorVT.getSizeInBits();
  switch (VectorBits) {
  case 16:
  case 32:
  case 64:
    break;
  default:
    return SDValue();
  }

  // Index == 0 is handled by generic DAG combiner.
  const auto *IndexC = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!IndexC)
    return SDValue();
  const uint64_t Index = IndexC->getZExtValue();
  if (Index == 0)
    return SDValue();

  const MVT IVT = MVT::getIntegerVT(VectorBits);
  const EVT EltVT = VectorVT.getVectorElementType();
  const EVT EltIVT = EltVT.changeTypeToInteger();
  const uint64_t EltBits = EltVT.getScalarSizeInBits();
  const EVT ResultVT = N->getValueType(0);

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  // Shift the requested element down to bit 0 and drop everything above it.
  SDValue AsInt = DAG.getNode(ISD::BITCAST, DL, IVT, Vector);
  SDValue ShiftAmt = DAG.getConstant(Index * EltBits, DL, IVT);
  SDValue Shifted = DAG.getNode(ISD::SRA, DL, IVT, AsInt, ShiftAmt);
  SDValue Result = DAG.getNode(ISD::TRUNCATE, DL, EltIVT, Shifted);

  // If element has non-integer type, bitcast it back to the expected type.
  if (EltVT != EltIVT)
    Result = DAG.getNode(ISD::BITCAST, DL, EltVT, Result);

  // Past legalizer, we may need to extend i8 -> i16 to match the register
  // type.
  if (EltVT != ResultVT)
    Result = DAG.getNode(ISD::ANY_EXTEND, DL, ResultVT, Result);

  return Result;
}


/// Transform patterns like:

///   (select (ugt shift_amt, BitWidth-1), 0, (srl/shl x, shift_amt))

///   (select (ult shift_amt, BitWidth), (srl/shl x, shift_amt), 0)

/// Into:

///   (NVPTXISD::SRL_CLAMP x, shift_amt) or (NVPTXISD::SHL_CLAMP x, shift_amt)

///

/// These patterns arise from code like `s >= 32 ? 0 : x >> s`. In LLVM,

/// over-shifting a value results in poison, but PTX shr/shl instructions clamp

/// the shift amount to BitWidth, making the guard redundant.

///

/// Note: We only handle SRL and SHL, not SRA, because arithmetic right shifts

/// can produce 0 or -1 when shift >= BitWidth.

/// Note: We don't handle uge or ule. These don't appear because of

/// canonicalization.


static SDValue PerformSELECTShiftCombine(SDNode *N,

                                         TargetLowering::DAGCombinerInfo &DCI) {

  // This combine only runs once the DAG has been legalized.

  if (!DCI.isAfterLegalizeDAG())

    return SDValue();


  using namespace SDPatternMatch;

  // Width of the select result; the guard constants matched below are built

  // from it.

  unsigned BitWidth = N->getValueType(0).getSizeInBits();

  // ShiftAmt is bound to the value compared in the guard, ShiftOp to the

  // whole shift node selected by the guard.

  SDValue ShiftAmt, ShiftOp;


  // Match logical shifts where the shift amount in the guard matches the shift

  // amount in the operation.

  auto LogicalShift =

      m_AllOf(m_Value(ShiftOp),

              m_AnyOf(m_Srl(m_Value(), m_TruncOrSelf(m_Deferred(ShiftAmt))),

                      m_Shl(m_Value(), m_TruncOrSelf(m_Deferred(ShiftAmt)))));


  // shift_amt > BitWidth-1 ? 0 : shift_op

  bool MatchedUGT =

      sd_match(N, m_Select(m_SetCC(m_Value(ShiftAmt),

                                   m_SpecificInt(APInt(BitWidth, BitWidth - 1)),

                                   m_SpecificCondCode(ISD::SETUGT)),

                           m_Zero(), LogicalShift));

  // shift_amt < BitWidth ? shift_op : 0

  // (only attempted when the ugt form above did not match)

  bool MatchedULT =

      !MatchedUGT &&

      sd_match(N, m_Select(m_SetCC(m_Value(ShiftAmt),

                                   m_SpecificInt(APInt(BitWidth, BitWidth)),

                                   m_SpecificCondCode(ISD::SETULT)),

                           LogicalShift, m_Zero()));


  if (!MatchedUGT && !MatchedULT)

    return SDValue();


  // In LLVM IR, the shift amount and the value-to-be-shifted are the same

  // type, whereas in PTX the shift amount is always i32.  Therefore when

  // shifting types larger than i32, we can only do this transformation if we

  // know that the upper bits of the shift amount are known zero.

  // ClampAmt is the amount operand the matched shift actually uses, which may

  // be narrower than the guarded ShiftAmt.

  SDValue ClampAmt = ShiftOp.getOperand(1);

  unsigned ClampAmtBits = ClampAmt.getValueSizeInBits();

  if (ShiftAmt.getValueSizeInBits() > ClampAmtBits &&

      DCI.DAG.computeKnownBits(ShiftAmt).countMaxActiveBits() > ClampAmtBits)

    return SDValue();


  // Return a clamp shift operation, which has the same semantics as PTX shift.

  unsigned ClampOpc = ShiftOp.getOpcode() == ISD::SRL ? NVPTXISD::SRL_CLAMP

                                                      : NVPTXISD::SHL_CLAMP;

  return DCI.DAG.getNode(ClampOpc, SDLoc(N), ShiftOp.getValueType(),

                         ShiftOp.getOperand(0), ClampAmt);

}


/// Splits a v4i8 vselect into four scalar selects and rebuilds the vector from
/// their results. Other vector types are left untouched.
static SDValue PerformVSELECTCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI) {
  SDValue VA = N->getOperand(1);
  if (VA.getValueType() != MVT::v4i8)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  SDValue VCond = N->getOperand(0);
  SDValue VB = N->getOperand(2);

  // We need to split vselect into individual per-element operations. Because
  // we use BFE/BFI instruction for byte extraction/insertion, we do end up
  // with 32-bit values, so we may as well do comparison as i32 to avoid
  // conversions to/from i16 normally used for i8 values.
  auto ExtractAsI32 = [&](SDValue Vec, SDValue Idx) {
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Vec, Idx);
    return DAG.getAnyExtOrTrunc(Elt, DL, MVT::i32);
  };

  SmallVector<SDValue, 4> Elts;
  for (int I = 0; I != 4; ++I) {
    SDValue Idx = DAG.getConstant(I, DL, MVT::i32);
    SDValue C =
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i1, VCond, Idx);
    SDValue EA = ExtractAsI32(VA, Idx);
    SDValue EB = ExtractAsI32(VB, Idx);
    SDValue Picked = DAG.getNode(ISD::SELECT, DL, MVT::i32, C, EA, EB);
    Elts.push_back(DAG.getAnyExtOrTrunc(Picked, DL, MVT::i8));
  }

  return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i8, Elts);
}


/// Folds a two-element, 32-bit packed BUILD_VECTOR whose elements are i16
/// truncations of i32 values into a single PRMT of those i32 values.
///
/// A logical shift right by 16 feeding a truncate is absorbed by selecting the
/// upper two bytes of its source instead. Only runs after DAG legalization;
/// returns an empty SDValue when the operands do not have the required shape.
static SDValue
PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  const EVT VT = N->getValueType(0);
  if (!DCI.isAfterLegalizeDAG())
    return SDValue();

  // only process v2*16 types
  if (!NVPTX::isPackedVectorTy(VT) || !VT.is32BitVector() ||
      VT.getVectorNumElements() != 2)
    return SDValue();

  // Strips the wrappers around one operand, leaving the underlying i32 value
  // in V and adjusting the PRMT byte selector in Selector. Returns false if
  // the operand is not an i16 truncated from an i32.
  auto PeelOperand = [](SDValue &V, uint64_t &Selector) -> bool {
    // Eat up any bitcast
    if (V.getOpcode() == ISD::BITCAST)
      V = V.getOperand(0);

    const bool IsTruncFromI32 =
        V.getValueType() == MVT::i16 && V.getOpcode() == ISD::TRUNCATE &&
        V.getOperand(0).getValueType() == MVT::i32;
    if (!IsTruncFromI32)
      return false;

    // If the truncate has multiple uses, this optimization can increase
    // register pressure
    if (!V.hasOneUse())
      return false;

    V = V.getOperand(0);

    // Optionally, fold in a shift-right of the original operand and let
    // permute pick the two higher bytes of the original value directly.
    if (V.getOpcode() != ISD::SRL)
      return true;
    const auto *ShAmt = dyn_cast<ConstantSDNode>(V.getOperand(1));
    if (!ShAmt || ShAmt->getZExtValue() != 16)
      return true;

    // Shift the PRMT byte selector to pick upper bytes from each respective
    // value, instead of the lower ones: 0x10 -> 0x32, 0x54 -> 0x76
    assert((Selector == 0x10 || Selector == 0x54) &&
           "PRMT selector values out of range");
    Selector += 0x22;
    V = V.getOperand(0);
    return true;
  };

  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // Start out by assuming we want to take the lower 2 bytes of each i32
  // operand.
  uint64_t Op0Bytes = 0x10;
  uint64_t Op1Bytes = 0x54;

  if (!PeelOperand(Op0, Op0Bytes) || !PeelOperand(Op1, Op1Bytes))
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  SDValue First = DAG.getBitcast(MVT::i32, Op0);
  SDValue Second = DAG.getBitcast(MVT::i32, Op1);
  auto PRMT = getPRMT(First, Second, (Op1Bytes << 8) | Op0Bytes, DL, DAG);
  return DAG.getBitcast(VT, PRMT);
}


/// Removes a pair of address-space casts that round-trips back to the original
/// address space, returning the innermost operand. Returns an empty SDValue
/// when the operand is not itself an address-space cast or the spaces differ.
static SDValue combineADDRSPACECAST(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI) {
  const auto *Outer = cast<AddrSpaceCastSDNode>(N);
  const auto *Inner = dyn_cast<AddrSpaceCastSDNode>(Outer->getOperand(0));
  if (!Inner)
    return SDValue();

  assert(Inner->getDestAddressSpace() == Outer->getSrcAddressSpace());

  // Fold asc[B -> A](asc[A -> B](x)) -> x
  if (Outer->getDestAddressSpace() != Inner->getSrcAddressSpace())
    return SDValue();

  return Inner->getOperand(0);
}


// Given a constant selector value and a prmt mode, return the selector value
// normalized to the generic prmt mode. In the generic mode (NONE) the selector
// is returned unchanged; in every other mode only its low two bits are used to
// derive the four 4-bit byte selectors. See the PTX ISA documentation for more
// details:
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prmt
static APInt getPRMTSelector(const APInt &Selector, unsigned Mode) {
  assert(Selector.getBitWidth() == 32 && "PRMT must have i32 operands");

  // Packs four byte selectors into a 32-bit value, lowest nibble first.
  const auto Pack = [](unsigned S0, unsigned S1, unsigned S2, unsigned S3) {
    return APInt(32, S0 | (S1 << 4) | (S2 << 8) | (S3 << 12));
  };

  const unsigned V = Selector.trunc(2).getZExtValue();

  switch (Mode) {
  case NVPTX::PTXPrmtMode::NONE:
    return Selector;
  case NVPTX::PTXPrmtMode::F4E:
    return Pack(V, V + 1, V + 2, V + 3);
  case NVPTX::PTXPrmtMode::B4E:
    return Pack(V, (V - 1) & 7, (V - 2) & 7, (V - 3) & 7);
  case NVPTX::PTXPrmtMode::RC8:
    return Pack(V, V, V, V);
  case NVPTX::PTXPrmtMode::ECL:
    return Pack(V, std::max(V, 1U), std::max(V, 2U), 3U);
  case NVPTX::PTXPrmtMode::ECR:
    return Pack(0, std::min(V, 1U), std::min(V, 2U), V);
  case NVPTX::PTXPrmtMode::RC16: {
    const unsigned Half = (V & 1) << 1;
    return Pack(Half, Half + 1, Half, Half + 1);
  }
  default:
    llvm_unreachable("Invalid PRMT mode");
  }
}


/// Evaluates a PRMT on constant operands.
///
/// \p A and \p B form a 64-bit byte pool with \p A in the low half. For each of
/// the four result bytes, the matching nibble of the mode-normalized selector
/// picks a source byte with its low three bits; when the nibble's top bit is
/// set, the result byte is filled with that source byte's sign bit instead.
static APInt computePRMT(APInt A, APInt B, APInt Selector, unsigned Mode) {
  assert(A.getBitWidth() == 32 && B.getBitWidth() == 32 &&
         Selector.getBitWidth() == 32 && "PRMT must have i32 operands");

  // {b, a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}
  const APInt BytePool = B.concat(A);
  const APInt Nibbles = getPRMTSelector(Selector, Mode);

  APInt Result(32, 0);
  for (unsigned I = 0; I != 4; ++I) {
    const APInt Sel = Nibbles.extractBits(4, I * 4);
    const unsigned SrcIdx = Sel.getLoBits(3).getZExtValue();
    const bool ReplicateSign = Sel.getHiBits(1).getZExtValue() != 0;

    APInt Byte = BytePool.extractBits(8, SrcIdx * 8);
    // Shifting an 8-bit value right arithmetically by 8 leaves only copies of
    // its sign bit.
    if (ReplicateSign)
      Byte = Byte.ashr(8);

    Result.insertBits(Byte, I * 8);
  }
  return Result;
}


/// Constant folds a PRMT node whose two data operands and selector are all
/// constants. Disabled at CodeGenOptLevel::None; returns an empty SDValue if
/// any of those three operands is not a constant.
static SDValue combinePRMT(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
                           CodeGenOptLevel OptLevel) {
  if (OptLevel == CodeGenOptLevel::None)
    return SDValue();

  // Operands 0..2 are A, B and the selector; all must be constant to fold.
  for (unsigned I = 0; I != 3; ++I)
    if (!isa<ConstantSDNode>(N->getOperand(I)))
      return SDValue();

  // Constant fold PRMT
  const APInt Folded = computePRMT(
      N->getConstantOperandAPInt(0), N->getConstantOperandAPInt(1),
      N->getConstantOperandAPInt(2), N->getConstantOperandVal(3));
  return DCI.DAG.getConstant(Folded, SDLoc(N), N->getValueType(0));
}


// During call lowering we wrap the return values in a ProxyReg node which
// depend on the chain value produced by the completed call. This ensures that
// the full call is emitted in cases where libcalls are used to legalize
// operations. To improve the functioning of other DAG combines we pull all
// operations we can through one of these nodes, ensuring that the ProxyReg
// directly wraps a load. That is:
//
//  (ProxyReg (zext (load retval0)))  =>  (zext (ProxyReg (load retval0)))
//
// The function rebuilds the expression rooted at R with a ProxyReg (on Chain)
// wrapped around every load leaf. Constants are kept as they are. An empty
// SDValue is returned as soon as a node is met that cannot be looked through.
static SDValue sinkProxyReg(SDValue R, SDValue Chain,
                            TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  const unsigned Opc = R.getOpcode();
  const EVT VT = R.getValueType();
  SDLoc DL(R);

  switch (Opc) {
  // Leaves.
  case ISD::Constant:
    return R;
  case ISD::LOAD:
  case NVPTXISD::LoadV2:
  case NVPTXISD::LoadV4:
    return DAG.getNode(NVPTXISD::ProxyReg, DL, VT, {Chain, R});

  // Unary operations: sink through the single operand.
  case ISD::TRUNCATE:
  case ISD::ANY_EXTEND:
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::BITCAST: {
    SDValue Inner = sinkProxyReg(R.getOperand(0), Chain, DCI);
    if (!Inner)
      return SDValue();
    return DAG.getNode(Opc, DL, VT, Inner);
  }

  // Binary operations: both operands must be sinkable.
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::OR: {
    SDValue A = sinkProxyReg(R.getOperand(0), Chain, DCI);
    if (!A)
      return SDValue();
    SDValue B = sinkProxyReg(R.getOperand(1), Chain, DCI);
    if (!B)
      return SDValue();
    return DAG.getNode(Opc, DL, VT, A, B);
  }

  // Vector nodes are not looked through before legalization.
  case ISD::BUILD_VECTOR: {
    if (DCI.isBeforeLegalize())
      return SDValue();

    SmallVector<SDValue, 16> NewElts;
    for (const auto &Elt : R->ops()) {
      SDValue Sunk = sinkProxyReg(Elt, Chain, DCI);
      if (!Sunk)
        return SDValue();
      NewElts.push_back(Sunk);
    }
    return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, NewElts);
  }
  case ISD::EXTRACT_VECTOR_ELT: {
    if (DCI.isBeforeLegalize())
      return SDValue();

    SDValue Vec = sinkProxyReg(R.getOperand(0), Chain, DCI);
    if (!Vec)
      return SDValue();
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Vec, R.getOperand(1));
  }

  default:
    return SDValue();
  }
}


/// Maps a saturating f16 / v2f16 add intrinsic to the NVPTX subtract opcode
/// with the same rounding and flush-to-zero behavior. Any other intrinsic ID
/// is a programming error.
static unsigned getF16SubOpc(Intrinsic::ID AddIntrinsicID) {
  switch (AddIntrinsicID) {
  case Intrinsic::nvvm_add_rn_sat_f16:
  case Intrinsic::nvvm_add_rn_sat_v2f16:
    return NVPTXISD::SUB_RN_SAT;
  case Intrinsic::nvvm_add_rn_ftz_sat_f16:
  case Intrinsic::nvvm_add_rn_ftz_sat_v2f16:
    return NVPTXISD::SUB_RN_FTZ_SAT;
  default:
    llvm_unreachable("Invalid F16 add intrinsic");
  }
}


/// Turns a saturating f16 add intrinsic with a negated operand into the
/// matching subtract: add(a, fneg(b)) or add(fneg(b), a) becomes sub(a, b).
///
/// When both operands are negations, the first one is the one that is
/// stripped. Returns an empty SDValue if neither operand is an FNEG.
static SDValue combineF16AddWithNeg(SDNode *N, SelectionDAG &DAG,
                                    Intrinsic::ID AddIntrinsicID) {
  // Operand 0 is the intrinsic ID; the addends follow.
  SDValue First = N->getOperand(1);
  SDValue Second = N->getOperand(2);

  const bool FirstIsNeg = First.getOpcode() == ISD::FNEG;
  if (!FirstIsNeg && Second.getOpcode() != ISD::FNEG)
    return SDValue();

  SDValue Minuend = FirstIsNeg ? Second : First;
  SDValue Subtrahend = (FirstIsNeg ? First : Second).getOperand(0);

  return DAG.getNode(getF16SubOpc(AddIntrinsicID), SDLoc(N),
                     N->getValueType(0), Minuend, Subtrahend);
}


/// Combines for chainless intrinsic nodes. Currently only the saturating
/// f16 / v2f16 add intrinsics are handled, by trying to fold a negated operand
/// into a subtract. Every other intrinsic yields an empty SDValue.
static SDValue combineIntrinsicWOChain(SDNode *N,
                                       TargetLowering::DAGCombinerInfo &DCI,
                                       const NVPTXSubtarget &STI) {
  const unsigned IID = N->getConstantOperandVal(0);

  const bool IsF16SatAdd = IID == Intrinsic::nvvm_add_rn_sat_f16 ||
                           IID == Intrinsic::nvvm_add_rn_ftz_sat_f16 ||
                           IID == Intrinsic::nvvm_add_rn_sat_v2f16 ||
                           IID == Intrinsic::nvvm_add_rn_ftz_sat_v2f16;
  if (!IsF16SatAdd)
    return SDValue();

  return combineF16AddWithNeg(N, DCI.DAG, IID);
}


/// Combine for ProxyReg nodes: unless the wrapped value is already a plain
/// load, try to pull the wrapped operations through the ProxyReg so that it
/// ends up wrapping the loads directly.
static SDValue combineProxyReg(SDNode *N,
                               TargetLowering::DAGCombinerInfo &DCI) {
  SDValue Chain = N->getOperand(0);
  SDValue Wrapped = N->getOperand(1);

  // Nothing to do when the ProxyReg already wraps a load.
  if (Wrapped.getOpcode() == ISD::LOAD)
    return SDValue();

  // sinkProxyReg yields an empty SDValue when it cannot sink.
  return sinkProxyReg(Wrapped, Chain, DCI);
}


/// Target hook for DAG combining: dispatches \p N to the NVPTX-specific
/// combine for its opcode. Opcodes without a dedicated combine yield an empty
/// SDValue.
SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
  const CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();

  switch (N->getOpcode()) {
  // Integer arithmetic.
  case ISD::ADD:
    return PerformADDCombine(N, DCI, OptLevel);
  case ISD::MUL:
    return PerformMULCombine(N, DCI, OptLevel);
  case ISD::SHL:
    return PerformSHLCombine(N, DCI, OptLevel);
  case ISD::SREM:
  case ISD::UREM:
    return PerformREMCombine(N, DCI, OptLevel);
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
    return combineSZExtToMulWide(N, DCI, OptLevel);

  // Floating point arithmetic.
  case ISD::FADD:
    return performFADDCombine(N, DCI, OptLevel);
  case ISD::FMA:
  case ISD::FMUL:
  case ISD::FSUB:
    return performScalarizeV2F32Op(N, DCI, OptLevel);
  case ISD::FMAXNUM:
  case ISD::FMINNUM:
  case ISD::FMAXIMUM:
  case ISD::FMINIMUM:
  case ISD::FMAXIMUMNUM:
  case ISD::FMINIMUMNUM:
    return PerformFMinMaxCombine(N, DCI, STI.getPTXVersion(),
                                 STI.getSmVersion());

  // Comparisons and selects.
  case ISD::SETCC:
    return PerformSETCCCombine(N, DCI, STI.getSmVersion());
  case ISD::SELECT:
    return PerformSELECTShiftCombine(N, DCI);
  case ISD::VSELECT:
    return PerformVSELECTCombine(N, DCI);

  // Vector construction and element access.
  case ISD::BUILD_VECTOR:
    return PerformBUILD_VECTORCombine(N, DCI);
  case ISD::EXTRACT_VECTOR_ELT:
    return PerformEXTRACTCombine(N, DCI);

  // Memory and address spaces.
  case ISD::LOAD:
  case NVPTXISD::LoadV2:
  case NVPTXISD::LoadV4:
    return combineLOAD(N, DCI, STI);
  case ISD::STORE:
  case NVPTXISD::StoreV2:
  case NVPTXISD::StoreV4:
    return combineSTORE(N, DCI, STI);
  case ISD::ADDRSPACECAST:
    return combineADDRSPACECAST(N, DCI);

  // Target nodes and intrinsics.
  case NVPTXISD::PRMT:
    return combinePRMT(N, DCI, OptLevel);
  case NVPTXISD::ProxyReg:
    return combineProxyReg(N, DCI);
  case ISD::INTRINSIC_WO_CHAIN:
    return combineIntrinsicWOChain(N, DCI, STI);

  default:
    return SDValue();
  }
}


/// Custom result replacement for a BITCAST producing v2i8.
///
/// Handles bitcasting to v2i8 without hitting the default promotion strategy
/// which goes through stack memory: the source is bitcast to i16 and its low
/// and high bytes become the two vector elements. For any other result type
/// nothing is appended to \p Results.
static void ReplaceBITCAST(SDNode *Node, SelectionDAG &DAG,
                           SmallVectorImpl<SDValue> &Results) {
  if (Node->getValueType(0) != MVT::v2i8)
    return;

  SDLoc DL(Node);

  // Bitcast to i16 and unpack elements into a vector.
  SDValue AsInt = DAG.getBitcast(MVT::i16, Node->getOperand(0));
  SDValue LowByte = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, AsInt);
  SDValue Const8 = DAG.getConstant(8, DL, MVT::i16);
  SDValue HighHalf = DAG.getNode(ISD::SRL, DL, MVT::i16, {AsInt, Const8});
  SDValue HighByte = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, HighHalf);

  Results.push_back(
      DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i8, {LowByte, HighByte}));
}


/// Custom result legalization for ISD::INTRINSIC_W_CHAIN nodes.
///
/// Handles the nvvm.ldu.global.{i,f,p} intrinsics inline and forwards the
/// tcgen05.ld / tcgen05.ld.red intrinsics listed below to lowerTcgen05Ld /
/// lowerTcgen05LdRed. For every other intrinsic (or when a helper yields no
/// value) nothing is appended to \p Results.
static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
                                     SmallVectorImpl<SDValue> &Results) {
  SDLoc DL(N);
  SDValue Chain = N->getOperand(0);

  // Operand 1 holds the intrinsic ID.
  const unsigned IID = N->getOperand(1).getNode()->getAsZExtVal();
  switch (IID) {
  default:
    return;
  case Intrinsic::nvvm_ldu_global_i:
  case Intrinsic::nvvm_ldu_global_f:
  case Intrinsic::nvvm_ldu_global_p: {
    const EVT ResVT = N->getValueType(0);

    if (!ResVT.isVector()) {
      // Scalar i8 LDG/LDU.
      assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
             "Custom handling of non-i8 ldu/ldg?");

      // All operands are reused unchanged; only the result type is widened
      // to i16.
      SmallVector<SDValue, 4> Ops(N->ops());
      SDVTList WideVTs = DAG.getVTList(MVT::i16, MVT::Other);
      MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);

      // The memory type stays i8 so that isel can pick the proper
      // instruction.
      SDValue WideLD =
          DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, WideVTs, Ops,
                                  MVT::i8, MemSD->getMemOperand());

      Results.push_back(
          DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, WideLD.getValue(0)));
      Results.push_back(WideLD.getValue(1));
      return;
    }

    // Vector LDG/LDU.
    const unsigned NumElts = ResVT.getVectorNumElements();
    const EVT OrigEltVT = ResVT.getVectorElementType();

    // LDU/LDG are target nodes, so DAG type legalization will not fix up
    // their types for us. Elements narrower than 16 bits (i1, i8) are loaded
    // as i16 and truncated afterwards; the "real" type is carried by the
    // memory type.
    const bool NeedTrunc = OrigEltVT.getSizeInBits() < 16;
    const EVT LoadEltVT = NeedTrunc ? EVT(MVT::i16) : OrigEltVT;

    unsigned Opcode = 0;
    SDVTList LdResVTs;
    if (NumElts == 2) {
      Opcode = NVPTXISD::LDUV2;
      LdResVTs = DAG.getVTList(LoadEltVT, LoadEltVT, MVT::Other);
    } else if (NumElts == 4) {
      Opcode = NVPTXISD::LDUV4;
      EVT ListVTs[] = {LoadEltVT, LoadEltVT, LoadEltVT, LoadEltVT,
                       MVT::Other};
      LdResVTs = DAG.getVTList(ListVTs);
    } else {
      // Other element counts are not handled here.
      return;
    }

    // New operand list: the chain followed by everything after the intrinsic
    // ID (operand 1 is dropped).
    SmallVector<SDValue, 8> OtherOps;
    OtherOps.push_back(Chain);
    OtherOps.append(N->op_begin() + 2, N->op_end());

    MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
    SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
                                            MemSD->getMemoryVT(),
                                            MemSD->getMemOperand());

    // Gather the per-element results, narrowing them back if they were
    // widened above.
    SmallVector<SDValue, 4> ScalarRes;
    for (unsigned Elt = 0; Elt != NumElts; ++Elt) {
      SDValue Val = NewLD.getValue(Elt);
      if (NeedTrunc)
        Val = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Val);
      ScalarRes.push_back(Val);
    }

    // The chain is the result right after the element values.
    SDValue LoadChain = NewLD.getValue(NumElts);

    Results.push_back(DAG.getBuildVector(ResVT, DL, ScalarRes));
    Results.push_back(LoadChain);
    return;
  }

  case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x128:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x32: {
    // Two results on success; nothing is recorded otherwise.
    auto Lowered = lowerTcgen05Ld(N, DAG);
    if (Lowered) {
      Results.push_back(Lowered->first);
      Results.push_back(Lowered->second);
    }
    return;
  }

  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128: {
    // Same as above, but these variants are lowered with HasOffset set.
    auto Lowered = lowerTcgen05Ld(N, DAG, /*HasOffset=*/true);
    if (Lowered) {
      Results.push_back(Lowered->first);
      Results.push_back(Lowered->second);
    }
    return;
  }

  case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x8_i32:
  case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x8_f32:
  case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x64_i32:
  case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x64_f32:
  case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x4_i32:
  case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x4_f32:
  case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x32_i32:
  case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x32_f32:
  case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x16_i32:
  case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x16_f32:
  case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x128_i32:
  case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x128_f32:
  case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x8_i32:
  case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x8_f32:
  case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x64_i32:
  case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x64_f32:
  case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x4_i32:
  case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x4_f32:
  case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x32_i32:
  case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x32_f32:
  case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x16_i32:
  case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x16_f32:
  case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x128_i32:
  case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x128_f32: {
    // The reduction variants yield three results on success.
    auto Lowered = lowerTcgen05LdRed(N, DAG);
    if (Lowered) {
      Results.push_back(std::get<0>(*Lowered));
      Results.push_back(std::get<1>(*Lowered));
      Results.push_back(std::get<2>(*Lowered));
    }
    return;
  }
  }
}


/// Custom result legalization for an ISD::CopyFromReg that reads an i128
/// register.
///
/// The node is rebuilt so that it yields two i64 values in place of the
/// single 128-bit value (which lets it pass legalization); the two halves are
/// then recombined with BUILD_PAIR. \p Results receives the i128 pair followed
/// by the remaining two results of the new node.
static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG,
                                   SmallVectorImpl<SDValue> &Results) {
  SDLoc DL(N);
  SDValue InChain = N->getOperand(0);
  SDValue SrcReg = N->getOperand(1);
  SDValue InGlue = N->getOperand(2);

  assert(SrcReg.getValueType() == MVT::i128 &&
         "Custom lowering for CopyFromReg with 128-bit reg only");

  // Same operands as before; only the result list changes: the i128 result is
  // replaced by two i64 results, and the other two result types are kept.
  SmallVector<SDValue, 3> SplitOps = {InChain, SrcReg, InGlue};
  SmallVector<EVT, 4> SplitVTs = {MVT::i64, MVT::i64, N->getValueType(1),
                                  N->getValueType(2)};
  SDValue Split = DAG.getNode(ISD::CopyFromReg, DL, SplitVTs, SplitOps);

  SDValue Joined = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
                               {Split.getValue(0), Split.getValue(1)});

  Results.push_back(Joined);
  Results.push_back(Split.getValue(2));
  Results.push_back(Split.getValue(3));
}


/// Custom result legalization for NVPTXISD::ProxyReg.
///
/// The proxied value is any-extended or truncated to the register type that
/// \p TLI reports for its value type, a new ProxyReg of that type is created,
/// and its result is converted back to the node's original result type. That
/// single value is appended to \p Results.
static void replaceProxyReg(SDNode *N, SelectionDAG &DAG,
                            const TargetLowering &TLI,
                            SmallVectorImpl<SDValue> &Results) {
  const SDLoc DL(N);
  SDValue InChain = N->getOperand(0);
  SDValue Src = N->getOperand(1);

  const MVT RegVT =
      TLI.getRegisterType(*DAG.getContext(), Src.getValueType());

  SDValue Converted = DAG.getAnyExtOrTrunc(Src, DL, RegVT);
  SDValue Proxy =
      DAG.getNode(NVPTXISD::ProxyReg, DL, RegVT, {InChain, Converted});

  Results.push_back(DAG.getAnyExtOrTrunc(Proxy, DL, N->getValueType(0)));
}


/// Custom result legalization for i128 ISD::ATOMIC_SWAP / ATOMIC_CMP_SWAP.
///
/// When the subtarget lacks 128-bit atomic swap support, an "unsupported"
/// diagnostic is emitted and the results are an undef i128 plus the incoming
/// chain. Otherwise every i128 value operand is split into its low and high
/// i64 halves, an NVPTXISD::ATOMIC_SWAP_B128 / ATOMIC_CMP_SWAP_B128 memory
/// node producing two i64 values and a chain is built, and the halves are
/// recombined with BUILD_PAIR.
static void replaceAtomicSwap128(SDNode *N, SelectionDAG &DAG,
                                 const NVPTXSubtarget &STI,
                                 SmallVectorImpl<SDValue> &Results) {
  assert(N->getValueType(0) == MVT::i128 &&
         "Custom lowering for atomic128 only supports i128");

  AtomicSDNode *AN = cast<AtomicSDNode>(N);
  SDLoc DL(N);

  if (!STI.hasAtomSwap128()) {
    DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
        DAG.getMachineFunction().getFunction(),
        "Support for b128 atomics introduced in PTX ISA version 8.3 and "
        "requires target sm_90.",
        DL.getDebugLoc()));

    // Still provide a value and a chain so legalization can continue.
    Results.push_back(DAG.getUNDEF(MVT::i128));
    Results.push_back(AN->getOperand(0)); // Chain
    return;
  }

  // Extracts one i64 half (0 = low, 1 = high) of an i128 value.
  const auto ExtractHalf = [&](SDValue Wide, unsigned Part) {
    return DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, Wide,
                       DAG.getIntPtrConstant(Part, DL));
  };

  // Chain and pointer are passed through; each remaining operand becomes a
  // (low, high) pair.
  SmallVector<SDValue, 6> NewOps;
  NewOps.push_back(AN->getOperand(0)); // Chain
  NewOps.push_back(AN->getOperand(1)); // Ptr
  for (const auto &Wide : AN->ops().drop_front(2)) {
    NewOps.push_back(ExtractHalf(Wide, 0));
    NewOps.push_back(ExtractHalf(Wide, 1));
  }

  const bool IsPlainSwap = N->getOpcode() == ISD::ATOMIC_SWAP;
  const unsigned Opcode = IsPlainSwap ? NVPTXISD::ATOMIC_SWAP_B128
                                      : NVPTXISD::ATOMIC_CMP_SWAP_B128;

  SDVTList ResultVTs = DAG.getVTList(MVT::i64, MVT::i64, MVT::Other);
  SDValue Atomic = DAG.getMemIntrinsicNode(Opcode, DL, ResultVTs, NewOps,
                                           MVT::i128, AN->getMemOperand());

  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
                                {Atomic.getValue(0), Atomic.getValue(1)}));
  Results.push_back(Atomic.getValue(2));
}


/// Dispatch custom result legalization of \p N to the matching helper.
///
/// Each helper appends its replacement values to \p Results (possibly none).
/// An opcode without a helper is a fatal error.
void NVPTXTargetLowering::ReplaceNodeResults(
    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  case ISD::BITCAST:
    ReplaceBITCAST(N, DAG, Results);
    break;
  case ISD::LOAD:
  case ISD::MLOAD:
    replaceLoadVector(N, DAG, Results, STI);
    break;
  case ISD::INTRINSIC_W_CHAIN:
    ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
    break;
  case ISD::CopyFromReg:
    ReplaceCopyFromReg_128(N, DAG, Results);
    break;
  case NVPTXISD::ProxyReg:
    replaceProxyReg(N, DAG, *this, Results);
    break;
  case ISD::ATOMIC_SWAP:
  case ISD::ATOMIC_CMP_SWAP:
    replaceAtomicSwap128(N, DAG, STI, Results);
    break;
  default:
    report_fatal_error("Unhandled custom legalization");
  }
}


/// Decide whether an atomicrmw is selected natively
/// (AtomicExpansionKind::None) or expanded to a compare-exchange loop
/// (AtomicExpansionKind::CmpXChg).
NVPTXTargetLowering::AtomicExpansionKind
NVPTXTargetLowering::shouldExpandAtomicRMWInIR(const AtomicRMWInst *AI) const {
  Type *Ty = AI->getValOperand()->getType();

  if (AI->isFloatingPointOperation()) {
    // PTX's only atomic fp op is `add`; everything else becomes a CAS loop.
    //
    // Lowering fadd to atom.add is complicated by the FTZ behavior of
    // atom.add:
    //   - atom.add.f32 on global memory flushes denormals
    //   - atom.add.f32 on shared memory does not flush denormals
    //   - atom.add.f16 and atom.add.bf16 never flush denormals
    //
    // atom.add is used only when the function's FTZ behavior agrees with the
    // instruction's (or AllowFTZAtomics overrides the mismatch). bf16 is
    // always allowed: regular bf16 arithmetic never flushes denormals either,
    // even with FTZ enabled.
    if (AI->getOperation() == AtomicRMWInst::BinOp::FAdd) {
      const bool FTZ =
          AI->getFunction()->getDenormalMode(APFloat::IEEEsingle()).Output ==
          DenormalMode::PreserveSign;

      if (Ty->isFloatTy()) {
        // Does atom.add.f32 in this address space flush the way the function
        // expects? Other address spaces never count as a match.
        bool FTZMatches = false;
        switch (AI->getPointerAddressSpace()) {
        case llvm::ADDRESS_SPACE_GLOBAL:
          FTZMatches = FTZ;
          break;
        case llvm::ADDRESS_SPACE_SHARED:
        case llvm::ADDRESS_SPACE_SHARED_CLUSTER:
          FTZMatches = !FTZ;
          break;
        default:
          break;
        }
        // AllowFTZAtomics forces atom.add regardless of the FTZ mismatch.
        if (AllowFTZAtomics || FTZMatches)
          return AtomicExpansionKind::None;
      }

      if (Ty->isHalfTy() && (!FTZ || AllowFTZAtomics) &&
          STI.getSmVersion() >= 70 && STI.getPTXVersion() >= 63)
        return AtomicExpansionKind::None;

      if (Ty->isBFloatTy() && STI.getSmVersion() >= 90 &&
          STI.getPTXVersion() >= 78)
        return AtomicExpansionKind::None;

      if (Ty->isDoubleTy() && STI.hasAtomAddF64())
        return AtomicExpansionKind::None;
    }

    return AtomicExpansionKind::CmpXChg;
  }

  assert(Ty->isIntegerTy() && "Ty should be integer at this point");
  const unsigned BitWidth = cast<IntegerType>(Ty)->getBitWidth();

  switch (AI->getOperation()) {
  case AtomicRMWInst::BinOp::Xchg:
    // A 128-bit exchange is kept as-is; narrower ones follow the bitwise
    // rules below.
    if (BitWidth == 128)
      return AtomicExpansionKind::None;
    [[fallthrough]];
  case AtomicRMWInst::BinOp::And:
  case AtomicRMWInst::BinOp::Or:
  case AtomicRMWInst::BinOp::Xor:
    switch (BitWidth) {
    case 32:
      return AtomicExpansionKind::None;
    case 64:
      return STI.hasAtomBitwise64() ? AtomicExpansionKind::None
                                    : AtomicExpansionKind::CmpXChg;
    case 8:
    case 16:
    case 128:
      return AtomicExpansionKind::CmpXChg;
    default:
      llvm_unreachable("unsupported width encountered");
    }
  case AtomicRMWInst::BinOp::Add:
  case AtomicRMWInst::BinOp::Sub:
  case AtomicRMWInst::BinOp::Max:
  case AtomicRMWInst::BinOp::Min:
  case AtomicRMWInst::BinOp::UMax:
  case AtomicRMWInst::BinOp::UMin:
    switch (BitWidth) {
    case 32:
      return AtomicExpansionKind::None;
    case 64:
      return STI.hasAtomMinMax64() ? AtomicExpansionKind::None
                                   : AtomicExpansionKind::CmpXChg;
    case 8:
    case 16:
    case 128:
      return AtomicExpansionKind::CmpXChg;
    default:
      llvm_unreachable("unsupported width encountered");
    }
  case AtomicRMWInst::BinOp::UIncWrap:
  case AtomicRMWInst::BinOp::UDecWrap:
    // Only the 32-bit form is kept native.
    switch (BitWidth) {
    case 32:
      return AtomicExpansionKind::None;
    case 8:
    case 16:
    case 64:
    case 128:
      return AtomicExpansionKind::CmpXChg;
    default:
      llvm_unreachable("unsupported width encountered");
    }
  default:
    return AtomicExpansionKind::CmpXChg;
  }

  return AtomicExpansionKind::CmpXChg;
}


/// Returns true iff \p I is an atomic operation that is emulated with a
/// CAS loop, or one whose memory order is seq_cst (which the PTX `atom`
/// instruction does not support natively).
///
/// atomicrmw and cmpxchg instructions that PTX does not support efficiently
/// are lowered to CAS emulation loops that preserve their memory order,
/// syncscope and volatile semantics. For PTX it is more efficient to use
/// atom.cas.relaxed.sco inside the loop and to restore ordering with fences
/// before and after the loop.
///
/// Atomic instructions that PTX supports efficiently are lowered to
/// `atom.<op>.<sem>.<scope>` with the matching memory order and scope. Since
/// PTX has no seq_cst, it is emulated with a fence.sc followed by an atom, per
/// the PTX atomics ABI:
/// https://docs.nvidia.com/cuda/ptx-writers-guide-to-interoperability/atomic-abi.html
bool NVPTXTargetLowering::shouldInsertFencesForAtomic(
    const Instruction *I) const {
  if (auto *CI = dyn_cast<AtomicCmpXchgInst>(I)) {
    // cmpxchg narrower than the minimum native width is emulated.
    const unsigned CASWidth =
        cast<IntegerType>(CI->getCompareOperand()->getType())->getBitWidth();
    if (CASWidth < STI.getMinCmpXchgSizeInBits())
      return true;
    return CI->getMergedOrdering() == AtomicOrdering::SequentiallyConsistent;
  }

  if (auto *RI = dyn_cast<AtomicRMWInst>(I)) {
    if (shouldExpandAtomicRMWInIR(RI) == AtomicExpansionKind::CmpXChg)
      return true;
    return RI->getOrdering() == AtomicOrdering::SequentiallyConsistent;
  }

  return false;
}


/// Memory order that \p I itself keeps once its ordering has been split into
/// explicit fences.
///
/// - Emulated (CAS-loop) operations become relaxed: AtomicExpandPass inserts
///   the fences that enforce the ordering around the loop.
/// - Non-emulated seq_cst operations must become
///   "fence.sc.<scope>; atom.<op>.acquire.<scope>;" to conform to the PTX
///   atomics ABI
///   (https://docs.nvidia.com/cuda/ptx-writers-guide-to-interoperability/atomic-abi.html).
///   emitLeadingFence() inserts the leading fence.sc separately; here only
///   the order of the operation is set to acquire.
/// - Non-emulated, non-seq_cst operations are natively supported by `atom`
///   with their own order; this function is NOT called for them.
///
/// Prerequisite: shouldInsertFencesForAtomic() returned `true` for \p I before
/// its memory order was modified.
AtomicOrdering NVPTXTargetLowering::atomicOperationOrderAfterFenceSplit(
    const Instruction *I) const {
  if (auto *CI = dyn_cast<AtomicCmpXchgInst>(I)) {
    // seq_cst and wide enough to be native, i.e. not emulated.
    const bool NativeSeqCst =
        CI->getMergedOrdering() == AtomicOrdering::SequentiallyConsistent &&
        cast<IntegerType>(CI->getCompareOperand()->getType())->getBitWidth() >=
            STI.getMinCmpXchgSizeInBits();
    return NativeSeqCst ? AtomicOrdering::Acquire : AtomicOrdering::Monotonic;
  }

  if (auto *RI = dyn_cast<AtomicRMWInst>(I)) {
    // seq_cst and not expanded to a CAS loop.
    const bool NativeSeqCst =
        RI->getOrdering() == AtomicOrdering::SequentiallyConsistent &&
        shouldExpandAtomicRMWInIR(RI) == AtomicExpansionKind::None;
    return NativeSeqCst ? AtomicOrdering::Acquire : AtomicOrdering::Monotonic;
  }

  return AtomicOrdering::Monotonic;
}


/// Emit the fence placed in front of \p Inst.
///
/// Anything other than cmpxchg / atomicrmw is delegated to the
/// TargetLoweringBase implementation. For those two, a fence is created in
/// the instruction's sync scope when \p Ord is release or stronger: a seq_cst
/// fence for seq_cst, a release fence otherwise. Weaker orders get no fence
/// (nullptr).
///
/// Prerequisite: shouldInsertFencesForAtomic() returned `true` for \p Inst
/// before its memory order was modified. This cannot be asserted here because
/// AtomicExpandPass has already modified the memory order between that call
/// and this one.
Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
                                                   Instruction *Inst,
                                                   AtomicOrdering Ord) const {
  const bool IsCmpXchgOrRMW =
      isa<AtomicCmpXchgInst>(Inst) || isa<AtomicRMWInst>(Inst);
  if (!IsCmpXchgOrRMW)
    return TargetLoweringBase::emitLeadingFence(Builder, Inst, Ord);

  // Specialized handling for cmpxchg and atomicrmw.
  auto SSID = getAtomicSyncScopeID(Inst);
  assert(SSID.has_value() && "Expected an atomic operation");

  if (!isReleaseOrStronger(Ord))
    return nullptr;

  const AtomicOrdering FenceOrd =
      Ord == AtomicOrdering::SequentiallyConsistent
          ? AtomicOrdering::SequentiallyConsistent
          : AtomicOrdering::Release;
  return Builder.CreateFence(FenceOrd, SSID.value());
}


/// Emit the fence placed after \p Inst.
///
/// Anything other than cmpxchg / atomicrmw is delegated to the
/// TargetLoweringBase implementation. For those two, an acquire fence in the
/// instruction's sync scope is created only when the operation is emulated
/// (cmpxchg narrower than the minimum native width, or atomicrmw expanded to
/// a CAS loop) and \p Ord is acquire or stronger; otherwise nullptr.
///
/// Prerequisite: shouldInsertFencesForAtomic() returned `true` for \p Inst
/// before its memory order was modified. See emitLeadingFence for why this
/// cannot be enforced with an assert.
Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
                                                    Instruction *Inst,
                                                    AtomicOrdering Ord) const {
  auto *CmpXchg = dyn_cast<AtomicCmpXchgInst>(Inst);
  auto *RMW = dyn_cast<AtomicRMWInst>(Inst);
  if (!CmpXchg && !RMW)
    return TargetLoweringBase::emitTrailingFence(Builder, Inst, Ord);

  auto SSID = getAtomicSyncScopeID(Inst);
  assert(SSID.has_value() && "Expected an atomic operation");

  bool IsEmulated;
  if (CmpXchg) {
    const unsigned CASWidth =
        cast<IntegerType>(CmpXchg->getCompareOperand()->getType())
            ->getBitWidth();
    IsEmulated = CASWidth < STI.getMinCmpXchgSizeInBits();
  } else {
    IsEmulated =
        shouldExpandAtomicRMWInIR(RMW) == AtomicExpansionKind::CmpXChg;
  }

  if (!isAcquireOrStronger(Ord) || !IsEmulated)
    return nullptr;

  return Builder.CreateFence(AtomicOrdering::Acquire, SSID.value());
}


// Pick the opcode to use for an fp-to-int conversion producing ToVT.
//
// We do not simply default to SINT when both UINT and SINT are custom: the
// opcode is only switched to its signed counterpart when the unsigned form is
// not legal and the signed form is. UINT is preferred when both are custom
// since unsigned CVT instructions can lead to slightly better SASS code with
// fewer instructions.
unsigned NVPTXTargetLowering::getPreferredFPToIntOpcode(unsigned Op, EVT FromVT,
                                                        EVT ToVT) const {
  if (isOperationLegal(Op, ToVT))
    return Op;

  // Map each unsigned conversion to its signed counterpart; any other opcode
  // is returned unchanged.
  unsigned SignedOp;
  switch (Op) {
  case ISD::FP_TO_UINT:
    SignedOp = ISD::FP_TO_SINT;
    break;
  case ISD::STRICT_FP_TO_UINT:
    SignedOp = ISD::STRICT_FP_TO_SINT;
    break;
  case ISD::VP_FP_TO_UINT:
    SignedOp = ISD::VP_FP_TO_SINT;
    break;
  default:
    return Op;
  }

  return isOperationLegal(SignedOp, ToVT) ? SignedOp : Op;
}


// Pin NVPTXTargetObjectFile's vtables to this file: the destructor is given
// its (defaulted) out-of-line definition here.

NVPTXTargetObjectFile::~NVPTXTargetObjectFile() = default;


// Section selection for a global object. As written, GO, Kind and TM are not
// consulted: the result is always whatever getDataSection() returns.
MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal(

    const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {

  return getDataSection();

}


/// Compute the known bits of an NVPTXISD::PRMT node.
///
/// Operands 0 and 1 are the two i32 inputs, operand 2 is the selector and
/// operand 3 is the (constant) mode. When the selector is not a constant,
/// \p Known is left untouched. Otherwise each of the four result bytes is
/// filled in from the selected byte of the 64-bit {B, A} bit field, or from
/// that byte's replicated sign bit when the selector nibble's top bit is set.
///
/// \param Op    The PRMT node.
/// \param Known In/out known-bits for the 32-bit result.
/// \param DAG   DAG used to query the operands' known bits.
/// \param Depth Recursion depth of \p Op.
static void computeKnownBitsForPRMT(const SDValue Op, KnownBits &Known,
                                    const SelectionDAG &DAG, unsigned Depth) {
  SDValue A = Op.getOperand(0);
  SDValue B = Op.getOperand(1);
  ConstantSDNode *Selector = dyn_cast<ConstantSDNode>(Op.getOperand(2));
  unsigned Mode = Op.getConstantOperandVal(3);

  if (!Selector)
    return;

  // The operands sit one level below Op. Advancing the depth keeps recursion
  // through chains of PRMT nodes bounded by the DAG's recursion limit, and
  // matches simplifyDemandedBitsForPRMT, which also queries the operands at
  // Depth + 1.
  KnownBits AKnown = DAG.computeKnownBits(A, Depth + 1);
  KnownBits BKnown = DAG.computeKnownBits(B, Depth + 1);

  // {b, a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}
  assert(AKnown.getBitWidth() == 32 && BKnown.getBitWidth() == 32 &&
         "PRMT must have i32 operands");
  assert(Known.getBitWidth() == 32 && "PRMT must have i32 result");
  KnownBits BitField = BKnown.concat(AKnown);

  // Normalize the selector for the given mode, then handle one 4-bit selector
  // nibble per result byte.
  APInt SelectorVal = getPRMTSelector(Selector->getAPIntValue(), Mode);
  for (unsigned I : llvm::seq(4)) {
    APInt Sel = SelectorVal.extractBits(4, I * 4);
    // Low 3 bits: index of the source byte; top bit: replicate its sign bit.
    unsigned Idx = Sel.getLoBits(3).getZExtValue();
    unsigned Sign = Sel.getHiBits(1).getZExtValue();
    KnownBits Byte = BitField.extractBits(8, Idx * 8);
    if (Sign)
      // Arithmetic shift by 7 smears the byte's sign bit over all 8 bits.
      Byte = KnownBits::ashr(Byte, KnownBits::makeConstant(APInt(8, 7)));
    Known.insertBits(Byte, I * 8);
  }
}


static void computeKnownBitsForLoadV(const SDValue Op, KnownBits &Known) {

  MemSDNode *LD = cast<MemSDNode>(Op);


  // We can't do anything without knowing the sign bit.

  auto ExtType = LD->getConstantOperandVal(LD->getNumOperands() - 1);

  if (ExtType == ISD::SEXTLOAD)

    return;


  // ExtLoading to vector types is weird and may not work well with known bits.

  auto DestVT = LD->getValueType(0);

  if (DestVT.isVector())

    return;


  assert(Known.getBitWidth() == DestVT.getSizeInBits());

  auto ElementBitWidth = NVPTXDAGToDAGISel::getFromTypeWidthForLoad(LD);

  Known.Zero.setHighBits(Known.getBitWidth() - ElementBitWidth);

}


/// Known-bits hook for NVPTX target nodes.
///
/// \p Known is reset first; PRMT and the LoadV2/V4/V8 nodes then refine it
/// through their helpers. Every other opcode leaves it fully unknown.
void NVPTXTargetLowering::computeKnownBitsForTargetNode(
    const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
    const SelectionDAG &DAG, unsigned Depth) const {
  Known.resetAll();

  switch (Op.getOpcode()) {
  default:
    return;
  case NVPTXISD::PRMT:
    computeKnownBitsForPRMT(Op, Known, DAG, Depth);
    return;
  case NVPTXISD::LoadV2:
  case NVPTXISD::LoadV4:
  case NVPTXISD::LoadV8:
    computeKnownBitsForLoadV(Op, Known);
    return;
  }
}


static std::pair<APInt, APInt> getPRMTDemandedBits(const APInt &SelectorVal,

                                                   const APInt &DemandedBits) {

  APInt DemandedLHS = APInt(32, 0);

  APInt DemandedRHS = APInt(32, 0);


  for (unsigned I : llvm::seq(4)) {

    if (DemandedBits.extractBits(8, I * 8).isZero())

      continue;


    APInt Sel = SelectorVal.extractBits(4, I * 4);

    unsigned Idx = Sel.getLoBits(3).getZExtValue();

    unsigned Sign = Sel.getHiBits(1).getZExtValue();


    APInt &Src = Idx < 4 ? DemandedLHS : DemandedRHS;

    unsigned ByteStart = (Idx % 4) * 8;

    if (Sign)

      Src.setBit(ByteStart + 7);

    else

      Src.setBits(ByteStart, ByteStart + 8);

  }


  return {DemandedLHS, DemandedRHS};

}


// Canonicalize a (possibly null) PRMT input: an undef value is replaced by
// the i32 constant 0, as this is easier for other optimizations such as known
// bits. A null value stays null and anything else is returned unchanged.
static SDValue canonicalizePRMTInput(SDValue Op, SelectionDAG &DAG) {
  if (!Op)
    return SDValue();
  return Op.isUndef() ? DAG.getConstant(0, SDLoc(), MVT::i32) : Op;
}


/// Try to simplify a PRMT node given which of its result bits are demanded.
///
/// Requires a constant selector. Returns one of the inputs when the demanded
/// bytes are taken from it unchanged, a new PRMT when either input could be
/// replaced by a simpler value, and a null SDValue when nothing changes.
static SDValue simplifyDemandedBitsForPRMT(SDValue PRMT,
                                           const APInt &DemandedBits,
                                           SelectionDAG &DAG,
                                           const TargetLowering &TLI,
                                           unsigned Depth) {
  assert(PRMT.getOpcode() == NVPTXISD::PRMT);
  SDValue Op0 = PRMT.getOperand(0);
  SDValue Op1 = PRMT.getOperand(1);

  auto *SelectorConst = dyn_cast<ConstantSDNode>(PRMT.getOperand(2));
  if (!SelectorConst)
    return SDValue();

  const unsigned Mode = PRMT.getConstantOperandVal(3);
  const APInt Selector = getPRMTSelector(SelectorConst->getAPIntValue(), Mode);

  // If the used bytes all come from the same input in the correct order, the
  // PRMT is just that input. Only the selector nibbles of result bytes below
  // the leading non-demanded bytes need to match.
  const unsigned UnusedTopBytes = DemandedBits.countLeadingZeros() / 8;
  const unsigned SelBits = (4 - UnusedTopBytes) * 4;
  const APInt UsedSelector = Selector.getLoBits(SelBits);
  if (UsedSelector == APInt(32, 0x3210).getLoBits(SelBits))
    return Op0;
  if (UsedSelector == APInt(32, 0x7654).getLoBits(SelBits))
    return Op1;

  auto [LHSBits, RHSBits] = getPRMTDemandedBits(Selector, DemandedBits);

  // Attempt to avoid multi-use ops if we don't need anything from them.
  SDValue NewOp0 =
      TLI.SimplifyMultipleUseDemandedBits(Op0, LHSBits, DAG, Depth + 1);
  SDValue NewOp1 =
      TLI.SimplifyMultipleUseDemandedBits(Op1, RHSBits, DAG, Depth + 1);
  NewOp0 = canonicalizePRMTInput(NewOp0, DAG);
  NewOp1 = canonicalizePRMTInput(NewOp1, DAG);

  const bool Op0Changed = NewOp0 && NewOp0 != Op0;
  const bool Op1Changed = NewOp1 && NewOp1 != Op1;
  if (!Op0Changed && !Op1Changed)
    return SDValue();

  // Rebuild the PRMT, keeping the original input wherever no simpler value
  // was found.
  return getPRMT(NewOp0 ? NewOp0 : Op0, NewOp1 ? NewOp1 : Op1,
                 Selector.getZExtValue(), SDLoc(PRMT), DAG);
}


/// Demanded-bits simplification hook for NVPTX target nodes.
///
/// A PRMT that simplifyDemandedBitsForPRMT can improve is replaced through
/// \p TLO and true is returned. In every other case the known bits of \p Op
/// are computed into \p Known and false is returned.
bool NVPTXTargetLowering::SimplifyDemandedBitsForTargetNode(
    SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
    KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const {
  Known.resetAll();

  if (Op.getOpcode() == NVPTXISD::PRMT) {
    SDValue Simplified =
        simplifyDemandedBitsForPRMT(Op, DemandedBits, TLO.DAG, *this, Depth);
    if (Simplified) {
      TLO.CombineTo(Op, Simplified);
      return true;
    }
  }

  computeKnownBitsForTargetNode(Op, Known, DemandedElts, TLO.DAG, Depth);
  return false;
}


SDValue
return SDValue()

assert
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")

S1
constexpr LLT S1
Definition AMDGPULegalizerInfo.cpp:296

F32
constexpr LLT F32
Definition AMDGPULegalizerInfo.cpp:300

UseNative
static cl::list< std::string > UseNative("amdgpu-use-native", cl::desc("Comma separated list of functions to replace with native, or all"), cl::CommaSeparated, cl::ValueOptional, cl::Hidden)

Select
AMDGPU Register Bank Select
Definition AMDGPURegBankSelect.cpp:68

APFloat.h
This file declares a class to represent arbitrary precision floating point values and provide a varie...

APInt.h
This file implements a class to represent arbitrary precision integral constant values and operations...

PerformADDCombineWithOperands
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombineWithOperands - Try DAG combinations for an ADD with operands N0 and N1.
Definition ARMISelLowering.cpp:13484

PerformADDCombine
static SDValue PerformADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
Definition ARMISelLowering.cpp:13972

PerformVSELECTCombine
static SDValue PerformVSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
Definition ARMISelLowering.cpp:13355

PerformMULCombine
static SDValue PerformMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
Definition ARMISelLowering.cpp:14219

PerformBUILD_VECTORCombine
static SDValue PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformBUILD_VECTORCombine - Target-specific dag combine xforms for ISD::BUILD_VECTOR.
Definition ARMISelLowering.cpp:15350

DL
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Definition ARMSLSHardening.cpp:73

Results
Function Alias Analysis Results
Definition AliasAnalysis.cpp:808

Alignment.h

AtomicOrdering.h
Atomic ordering constants.

Attributes.h
This file contains the simple types necessary to represent the attributes associated with functions a...

X
#define X(NUM, ENUM, NAME)
Definition ELF.h:853

A
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")

E
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")

B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")

Casting.h

CodeGen.h

CommandLine.h

clEnumValN
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
Definition CommandLine.h:687

Constants.h
This file contains the declarations for the subclasses of Constant, which represent the different fla...

DataLayout.h

DerivedTypes.h

DiagnosticInfo.h

FPEnv.h
This file contains the declarations of entities that describe floating point environment and related ...

GlobalValue.h

IsIndirectCall
static bool IsIndirectCall(const MachineInstr *MI)
Definition HexagonGlobalScheduler.cpp:685

IRBuilder.h

Argument.h

Function.h

Instruction.h

Module.h
Module.h This file contains the declarations for the Module class.

Type.h

Value.h

ISDOpcodes.h

InlinePriorityMode::Size
@ Size
Definition InlineOrder.cpp:25

Instructions.h

Ops
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
Definition ItaniumDemangle.h:3391

KnownBits.h

F
#define F(x, y, z)
Definition MD5.cpp:54

I
#define I(x, y, z)
Definition MD5.cpp:57

MachineFunction.h

getDebugLoc
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
Definition MachineInstrBundle.cpp:104

MachineJumpTableInfo.h

MachineMemOperand.h

Reg
Register Reg
Definition MachineSink.cpp:2126

TRI
Register const TargetRegisterInfo * TRI
Definition MachineSink.cpp:2127

MachineValueType.h

Context
@ Context
Definition MemProfContextDisambiguation.cpp:135

T
#define T
Definition Mips16ISelLowering.cpp:282

NVPTXAddrSpace.h
NVPTX address space definition.

NVPTXBaseInfo.h

NVPTXISelDAGToDAG.h

reportInvalidTensormapReplaceUsage
static SDValue reportInvalidTensormapReplaceUsage(SDValue Op, SelectionDAG &DAG, unsigned Val)
Definition NVPTXISelLowering.cpp:2788

combineADDRSPACECAST
static SDValue combineADDRSPACECAST(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
Definition NVPTXISelLowering.cpp:6884

sched4reg
static cl::opt< bool > sched4reg("nvptx-sched4reg", cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false))

lowerTcgen05St
static SDValue lowerTcgen05St(SDValue Op, SelectionDAG &DAG, bool hasOffset=false)
Definition NVPTXISelLowering.cpp:2585

PerformEXTRACTCombine
static SDValue PerformEXTRACTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
Definition NVPTXISelLowering.cpp:6676

UsePrecDivF32
static cl::opt< NVPTX::DivPrecisionLevel > UsePrecDivF32("nvptx-prec-divf32", cl::Hidden, cl::desc("NVPTX Specific: Override the precision of the lowering for f32 fdiv"), cl::values(clEnumValN(NVPTX::DivPrecisionLevel::Approx, "0", "Use div.approx"), clEnumValN(NVPTX::DivPrecisionLevel::Full, "1", "Use div.full"), clEnumValN(NVPTX::DivPrecisionLevel::IEEE754, "2", "Use IEEE Compliant F32 div.rnd if available (default)"), clEnumValN(NVPTX::DivPrecisionLevel::IEEE754_NoFTZ, "3", "Use IEEE Compliant F32 div.rnd if available, no FTZ")), cl::init(NVPTX::DivPrecisionLevel::IEEE754))

isConstOne
static bool isConstOne(const SDValue &Operand)
Definition NVPTXISelLowering.cpp:6535

FMAContractLevelOpt
static cl::opt< unsigned > FMAContractLevelOpt("nvptx-fma-level", cl::Hidden, cl::desc("NVPTX Specific: FMA contraction (0: don't do it" " 1: do it  2: do it aggressively"), cl::init(2))

IsPTXVectorType
static bool IsPTXVectorType(MVT VT)
Definition NVPTXISelLowering.cpp:161

PerformSELECTShiftCombine
static SDValue PerformSELECTShiftCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
Transform patterns like: (select (ugt shift_amt, BitWidth-1), 0, (srl/shl x, shift_amt)) (select (ult...
Definition NVPTXISelLowering.cpp:6740

lowerLOADi1
static SDValue lowerLOADi1(LoadSDNode *LD, SelectionDAG &DAG)
Definition NVPTXISelLowering.cpp:3827

lowerIntrinsicVoid
static SDValue lowerIntrinsicVoid(SDValue Op, SelectionDAG &DAG)
Definition NVPTXISelLowering.cpp:2839

refinePtrAS
static MachinePointerInfo refinePtrAS(SDValue &Ptr, SelectionDAG &DAG, const DataLayout &DL, const TargetLowering &TL)
Definition NVPTXISelLowering.cpp:1344

lowerROT
static SDValue lowerROT(SDValue Op, SelectionDAG &DAG)
Definition NVPTXISelLowering.cpp:3285

ComputePTXValueVTs
static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, LLVMContext &Ctx, CallingConv::ID CallConv, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< uint64_t > &Offsets, uint64_t StartingOffset=0)
ComputePTXValueVTs - For the given Type Ty, returns the set of primitive legal-ish MVTs that compose ...
Definition NVPTXISelLowering.cpp:308

ReplaceBITCAST
static void ReplaceBITCAST(SDNode *Node, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
Definition NVPTXISelLowering.cpp:7168

replaceAtomicSwap128
static void replaceAtomicSwap128(SDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI, SmallVectorImpl< SDValue > &Results)
Definition NVPTXISelLowering.cpp:7413

getMinMax3Opcode
static unsigned getMinMax3Opcode(unsigned MinMax2Opcode)
Get 3-input version of a 2-input min/max opcode.
Definition NVPTXISelLowering.cpp:6262

lowerSTOREVector
static SDValue lowerSTOREVector(SDValue Op, SelectionDAG &DAG, const NVPTXSubtarget &STI)
Definition NVPTXISelLowering.cpp:3898

lowerLoadVector
static SDValue lowerLoadVector(SDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI)
Definition NVPTXISelLowering.cpp:3816

replaceProxyReg
static void replaceProxyReg(SDNode *N, SelectionDAG &DAG, const TargetLowering &TLI, SmallVectorImpl< SDValue > &Results)
Definition NVPTXISelLowering.cpp:7397

ReplaceCopyFromReg_128
static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
Definition NVPTXISelLowering.cpp:7373

TCGEN05_LD_RED_INST
#define TCGEN05_LD_RED_INST(SHAPE, NUM, TYPE)
Definition NVPTXISelLowering.cpp:3048

lowerCTLZCTPOP
static SDValue lowerCTLZCTPOP(SDValue Op, SelectionDAG &DAG)
Definition NVPTXISelLowering.cpp:3226

combineMADConstOne
static SDValue combineMADConstOne(SDValue X, SDValue Add, EVT VT, SDLoc DL, TargetLowering::DAGCombinerInfo &DCI)
Definition NVPTXISelLowering.cpp:6553

getTcgen05LdRedID
static unsigned getTcgen05LdRedID(Intrinsic::ID IID)
Definition NVPTXISelLowering.cpp:3051

combinePRMT
static SDValue combinePRMT(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
Definition NVPTXISelLowering.cpp:6955

combinePackingMovIntoStore
static SDValue combinePackingMovIntoStore(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned Front, unsigned Back)
Fold packing movs into a store.
Definition NVPTXISelLowering.cpp:5956

ReplaceINTRINSIC_W_CHAIN
static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
Definition NVPTXISelLowering.cpp:7190

getBuildVectorizedValue
static SDValue getBuildVectorizedValue(unsigned N, const SDLoc &dl, SelectionDAG &DAG, T GetElement)
Definition NVPTXISelLowering.cpp:374

getArgumentAlignment
static Align getArgumentAlignment(const CallBase *CB, Type *Ty, unsigned Idx, const DataLayout &DL)
Definition NVPTXISelLowering.cpp:1313

getExtractVectorizedValue
static SDValue getExtractVectorizedValue(SDValue V, unsigned I, EVT VT, const SDLoc &dl, SelectionDAG &DAG)
Definition NVPTXISelLowering.cpp:357

combineSZExtToMulWide
static SDValue combineSZExtToMulWide(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
Definition NVPTXISelLowering.cpp:6346

canMergeParamLoadStoresStartingAt
static unsigned canMergeParamLoadStoresStartingAt(unsigned Idx, uint32_t AccessSize, const SmallVectorImpl< EVT > &ValueVTs, const SmallVectorImpl< T > &Offsets, Align ParamAlignment)
Definition NVPTXISelLowering.cpp:432

getVectorizedVT
static EVT getVectorizedVT(EVT VT, unsigned N, LLVMContext &C)
Definition NVPTXISelLowering.cpp:348

lowerIntrinsicWOChain
static SDValue lowerIntrinsicWOChain(SDValue Op, SelectionDAG &DAG)
Definition NVPTXISelLowering.cpp:3190

PerformFMinMaxCombine
static SDValue PerformFMinMaxCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned PTXVersion, unsigned SmVersion)
PerformFMinMaxCombine - Combine (fmaxnum (fmaxnum a, b), c) into (fmaxnum3 a, b, c).
Definition NVPTXISelLowering.cpp:6281

getScalar3OpcodeForReduction
static std::optional< unsigned > getScalar3OpcodeForReduction(unsigned ReductionOpcode)
Get 3-input scalar reduction opcode.
Definition NVPTXISelLowering.cpp:1986

lowerIntrinsicWChain
static SDValue lowerIntrinsicWChain(SDValue Op, SelectionDAG &DAG)
Definition NVPTXISelLowering.cpp:3160

isNonCoalescableBuildVector
static bool isNonCoalescableBuildVector(const SDValue &BV)
Check if a v2f32 BUILD_VECTOR provably packs values from non-adjacent register pairs (non-coalescable...
Definition NVPTXISelLowering.cpp:6090

isConstZero
static bool isConstZero(const SDValue &Operand)
Definition NVPTXISelLowering.cpp:5704

getF16SubOpc
static unsigned getF16SubOpc(Intrinsic::ID AddIntrinsicID)
Definition NVPTXISelLowering.cpp:7037

LowerVectorArith
static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG)
Definition NVPTXISelLowering.cpp:2565

LowerTcgen05MMADisableOutputLane
static SDValue LowerTcgen05MMADisableOutputLane(SDValue Op, SelectionDAG &DAG)
Definition NVPTXISelLowering.cpp:2714

IsMulWideOperandDemotable
static bool IsMulWideOperandDemotable(SDValue Op, unsigned OptSize, OperandSignedness &S)
IsMulWideOperandDemotable - Checks if the provided DAG node is an operand that can be demoted to OptS...
Definition NVPTXISelLowering.cpp:6406

getTcgen05MMADisableOutputLane
static unsigned getTcgen05MMADisableOutputLane(unsigned IID)
Definition NVPTXISelLowering.cpp:2654

getPRMTDemandedBits
static std::pair< APInt, APInt > getPRMTDemandedBits(const APInt &SelectorVal, const APInt &DemandedBits)
Definition NVPTXISelLowering.cpp:7816

computePRMT
static APInt computePRMT(APInt A, APInt B, APInt Selector, unsigned Mode)
Definition NVPTXISelLowering.cpp:6936

getScalarOpcodeForReduction
static ISD::NodeType getScalarOpcodeForReduction(unsigned ReductionOpcode)
Definition NVPTXISelLowering.cpp:1969

PerformREMCombine
static SDValue PerformREMCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
Definition NVPTXISelLowering.cpp:6311

lowerBSWAP
static SDValue lowerBSWAP(SDValue Op, SelectionDAG &DAG)
Definition NVPTXISelLowering.cpp:2616

lowerMSTORE
static SDValue lowerMSTORE(SDValue Op, SelectionDAG &DAG)
Definition NVPTXISelLowering.cpp:3353

PerformMULCombineWithOperands
static SDValue PerformMULCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI)
Definition NVPTXISelLowering.cpp:6594

computeKnownBitsForPRMT
static void computeKnownBitsForPRMT(const SDValue Op, KnownBits &Known, const SelectionDAG &DAG, unsigned Depth)
Definition NVPTXISelLowering.cpp:7748

combineUnpackingMovIntoLoad
static SDValue combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
Fold unpacking movs into a load by increasing the number of return values.
Definition NVPTXISelLowering.cpp:5842

TCGEN05_LD_RED_INTR
#define TCGEN05_LD_RED_INTR(SHAPE, NUM, TYPE)
Definition NVPTXISelLowering.cpp:3045

lowerTensormapReplaceElemtype
static SDValue lowerTensormapReplaceElemtype(SDValue Op, SelectionDAG &DAG)
Definition NVPTXISelLowering.cpp:2811

LowerClusterLaunchControlQueryCancel
static SDValue LowerClusterLaunchControlQueryCancel(SDValue Op, SelectionDAG &DAG)
Definition NVPTXISelLowering.cpp:2921

lowerTcgen05Ld
static std::optional< std::pair< SDValue, SDValue > > lowerTcgen05Ld(SDNode *N, SelectionDAG &DAG, bool HasOffset=false)
Definition NVPTXISelLowering.cpp:2745

lowerCvtRSIntrinsics
static SDValue lowerCvtRSIntrinsics(SDValue Op, SelectionDAG &DAG)
Definition NVPTXISelLowering.cpp:2961

replaceLoadVector
static std::optional< std::pair< SDValue, SDValue > > replaceLoadVector(SDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI)
replaceLoadVector - Convert vector loads into multi-output scalar loads.
Definition NVPTXISelLowering.cpp:3706

expandFSH64
static SDValue expandFSH64(SDValue A, SDValue B, SDValue ShiftAmount, SDLoc DL, unsigned Opcode, SelectionDAG &DAG)
Definition NVPTXISelLowering.cpp:3236

AreMulWideOperandsDemotable
static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS, unsigned OptSize, bool &IsSigned)
AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can be demoted to OptSize bits...
Definition NVPTXISelLowering.cpp:6433

convertMLOADToLoadWithUsedBytesMask
static std::pair< MemSDNode *, uint32_t > convertMLOADToLoadWithUsedBytesMask(MemSDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI)
Definition NVPTXISelLowering.cpp:3653

TryMULWIDECombine
static SDValue TryMULWIDECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply of M/2 bits that produces...
Definition NVPTXISelLowering.cpp:6469

lowerPrmtIntrinsic
static SDValue lowerPrmtIntrinsic(SDValue Op, SelectionDAG &DAG)
Definition NVPTXISelLowering.cpp:3016

combineMulSelectConstOne
static SDValue combineMulSelectConstOne(SDValue X, SDValue Select, EVT VT, SDLoc DL, TargetLowering::DAGCombinerInfo &DCI)
Definition NVPTXISelLowering.cpp:6564

buildTreeReduction
static SDValue buildTreeReduction(const SmallVector< SDValue > &Elements, EVT EltTy, ArrayRef< std::pair< unsigned, unsigned > > Ops, const SDLoc &DL, const SDNodeFlags Flags, SelectionDAG &DAG)
Reduces the elements using the scalar operations provided.
Definition NVPTXISelLowering.cpp:1922

combineProxyReg
static SDValue combineProxyReg(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
Definition NVPTXISelLowering.cpp:7090

VectorizePTXValueVTs
static SmallVector< unsigned, 16 > VectorizePTXValueVTs(const SmallVectorImpl< EVT > &ValueVTs, const SmallVectorImpl< T > &Offsets, Align ParamAlignment, bool IsVAArg=false)
Definition NVPTXISelLowering.cpp:486

getPRMT
static SDValue getPRMT(SDValue A, SDValue B, SDValue Selector, SDLoc DL, SelectionDAG &DAG, unsigned Mode=NVPTX::PTXPrmtMode::NONE)
Definition NVPTXISelLowering.cpp:1902

matchMADConstOnePattern
static SDValue matchMADConstOnePattern(SDValue Add)
Definition NVPTXISelLowering.cpp:6540

correctParamType
static SDValue correctParamType(SDValue V, EVT ExpectedVT, ISD::ArgFlagsTy Flags, SelectionDAG &DAG, SDLoc dl)
Definition NVPTXISelLowering.cpp:1376

getExtOpcode
static ISD::NodeType getExtOpcode(const ISD::ArgFlagsTy &Flags)
Definition NVPTXISelLowering.cpp:1368

UsePrecSqrtF32
static cl::opt< bool > UsePrecSqrtF32("nvptx-prec-sqrtf32", cl::Hidden, cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."), cl::init(true))

AllowFTZAtomics
static cl::opt< bool > AllowFTZAtomics("nvptx-allow-ftz-atomics", cl::Hidden, cl::desc("NVPTX Specific: Lower atomicrmw fadd to atom.add even when its " "FTZ behavior does not match the function's denormal mode."), cl::init(false))

computeKnownBitsForLoadV
static void computeKnownBitsForLoadV(const SDValue Op, KnownBits &Known)
Definition NVPTXISelLowering.cpp:7779

getPRMTSelector
static APInt getPRMTSelector(const APInt &Selector, unsigned Mode)
Definition NVPTXISelLowering.cpp:6903

promoteScalarIntegerPTX
static EVT promoteScalarIntegerPTX(const EVT VT)
PromoteScalarIntegerPTX - Used to make sure the arguments/returns are suitable for passing and promote ...
Definition NVPTXISelLowering.cpp:398

lowerTcgen05LdRed
static std::optional< std::tuple< SDValue, SDValue, SDValue > > lowerTcgen05LdRed(SDNode *N, SelectionDAG &DAG)
Definition NVPTXISelLowering.cpp:3116

simplifyDemandedBitsForPRMT
static SDValue simplifyDemandedBitsForPRMT(SDValue PRMT, const APInt &DemandedBits, SelectionDAG &DAG, const TargetLowering &TLI, unsigned Depth)
Definition NVPTXISelLowering.cpp:7850

lowerFREM
static SDValue lowerFREM(SDValue Op, SelectionDAG &DAG)
Definition NVPTXISelLowering.cpp:3291

canonicalizePRMTInput
static SDValue canonicalizePRMTInput(SDValue Op, SelectionDAG &DAG)
Definition NVPTXISelLowering.cpp:7842

sinkProxyReg
static SDValue sinkProxyReg(SDValue R, SDValue Chain, TargetLowering::DAGCombinerInfo &DCI)
Definition NVPTXISelLowering.cpp:6981

lowerFSH
static SDValue lowerFSH(SDValue Op, SelectionDAG &DAG)
Definition NVPTXISelLowering.cpp:3280

lowerTensormapReplaceSwizzleMode
static SDValue lowerTensormapReplaceSwizzleMode(SDValue Op, SelectionDAG &DAG)
Definition NVPTXISelLowering.cpp:2825

combineIntrinsicWOChain
static SDValue combineIntrinsicWOChain(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const NVPTXSubtarget &STI)
Definition NVPTXISelLowering.cpp:7073

PromoteBinOpToF32
static SDValue PromoteBinOpToF32(SDNode *N, SelectionDAG &DAG)
Definition NVPTXISelLowering.cpp:2451

OperandSignedness
OperandSignedness
Definition NVPTXISelLowering.cpp:6397

Unknown
@ Unknown
Definition NVPTXISelLowering.cpp:6400

Unsigned
@ Unsigned
Definition NVPTXISelLowering.cpp:6399

Signed
@ Signed
Definition NVPTXISelLowering.cpp:6398

PerformSETCCCombine
static SDValue PerformSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned int SmVersion)
Definition NVPTXISelLowering.cpp:6649

getVectorLoweringShape
static std::optional< std::pair< unsigned int, MVT > > getVectorLoweringShape(EVT VectorEVT, const NVPTXSubtarget &STI, unsigned AddressSpace)
Definition NVPTXISelLowering.cpp:206

combineF16AddWithNeg
static SDValue combineF16AddWithNeg(SDNode *N, SelectionDAG &DAG, Intrinsic::ID AddIntrinsicID)
Definition NVPTXISelLowering.cpp:7051

UseApproxLog2F32
static cl::opt< bool > UseApproxLog2F32("nvptx-approx-log2f32", cl::desc("NVPTX Specific: whether to use lg2.approx for log2"), cl::init(false))
Whereas CUDA's implementation (see libdevice) uses ex2.approx for exp2(), it does NOT use lg2....

lowerSELECT
static SDValue lowerSELECT(SDValue Op, SelectionDAG &DAG)
Definition NVPTXISelLowering.cpp:3319

combineLOAD
static SDValue combineLOAD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const NVPTXSubtarget &STI)
Definition NVPTXISelLowering.cpp:6051

combineSTORE
static SDValue combineSTORE(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const NVPTXSubtarget &STI)
Definition NVPTXISelLowering.cpp:6036

PerformSHLCombine
static SDValue PerformSHLCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
Definition NVPTXISelLowering.cpp:6637

NVPTXISelLowering.h

OpIdx
MachineInstr unsigned OpIdx
Definition NVPTXPrologEpilogPass.cpp:56

NVPTXSelectionDAGInfo.h

NVPTXSubtarget.h

NVPTXTargetMachine.h

NVPTXTargetObjectFile.h

NVPTXUtilities.h

NVPTX.h

High
uint64_t High
Definition NVVMIntrRange.cpp:46

NVVMProperties.h

P
#define P(N)

Cond
const SmallVectorImpl< MachineOperand > & Cond
Definition RISCVRedundantCopyElimination.cpp:73

Opc
auto Opc
Definition RISCVRedundantCopyElimination.cpp:77

Mode
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))

SDPatternMatch.h
Contains matchers for matching SelectionDAG nodes and values.

STLExtras.h
This file contains some templates that are useful if you are working with the STL at all.

SelectionDAGNodes.h

SelectionDAG.h

SmallVector.h
This file defines the SmallVector class.

StringRef.h

Y
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")

TargetCallingConv.h

TargetLowering.h
This file describes how to lower LLVM code to machine code.

TargetOptions.h

ValueTypes.h

RHS
Value * RHS
Definition X86PartialReduction.cpp:81

LHS
Value * LHS
Definition X86PartialReduction.cpp:80

Mul
BinaryOperator * Mul
Definition X86PartialReduction.cpp:75

Node
Definition ItaniumDemangle.h:166

llvm::APFloatBase::IEEEsingle
static const fltSemantics & IEEEsingle()
Definition APFloat.h:296

llvm::APFloat::getInf
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1157

llvm::APInt
Class for arbitrary precision integers.
Definition APInt.h:78

llvm::APInt::getLoBits
LLVM_ABI APInt getLoBits(unsigned numBits) const
Compute an APInt containing numBits lowbits from this APInt.
Definition APInt.cpp:645

llvm::APInt::getZExtValue
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1563

llvm::APInt::setHighBits
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1414

llvm::APInt::getHiBits
LLVM_ABI APInt getHiBits(unsigned numBits) const
Compute an APInt containing numBits highbits from this APInt.
Definition APInt.cpp:640

llvm::APInt::trunc
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition APInt.cpp:968

llvm::APInt::setBit
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition APInt.h:1353

llvm::APInt::getBitWidth
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1511

llvm::APInt::isSignedIntN
bool isSignedIntN(unsigned N) const
Check if this APInt has an N-bits signed integer value.
Definition APInt.h:436

llvm::APInt::slt
bool slt(const APInt &RHS) const
Signed less than comparison.
Definition APInt.h:1137

llvm::APInt::extractBits
LLVM_ABI APInt extractBits(unsigned numBits, unsigned bitPosition) const
Return an APInt with the extracted bits [bitPosition,bitPosition+numBits).
Definition APInt.cpp:483

llvm::APInt::isIntN
bool isIntN(unsigned N) const
Check if this APInt has an N-bits unsigned integer value.
Definition APInt.h:433

llvm::APInt::sge
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition APInt.h:1244

llvm::AddrSpaceCastSDNode
Definition SelectionDAGNodes.h:1398

llvm::ArrayRef
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40

llvm::ArrayRef::slice
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition ArrayRef.h:185

llvm::AtomicRMWInst
an instruction that atomically reads a memory location, combines it with another value,...
Definition Instructions.h:710

llvm::AtomicRMWInst::Add
@ Add
*p = old + v
Definition Instructions.h:726

llvm::AtomicRMWInst::FAdd
@ FAdd
*p = old + v
Definition Instructions.h:747

llvm::AtomicRMWInst::Min
@ Min
*p = old <signed v ? old : v
Definition Instructions.h:740

llvm::AtomicRMWInst::Or
@ Or
*p = old | v
Definition Instructions.h:734

llvm::AtomicRMWInst::Sub
@ Sub
*p = old - v
Definition Instructions.h:728

llvm::AtomicRMWInst::And
@ And
*p = old & v
Definition Instructions.h:730

llvm::AtomicRMWInst::Xor
@ Xor
*p = old ^ v
Definition Instructions.h:736

llvm::AtomicRMWInst::UIncWrap
@ UIncWrap
Increment one up to a maximum value.
Definition Instructions.h:778

llvm::AtomicRMWInst::Max
@ Max
*p = old >signed v ? old : v
Definition Instructions.h:738

llvm::AtomicRMWInst::UMin
@ UMin
*p = old <unsigned v ? old : v
Definition Instructions.h:744

llvm::AtomicRMWInst::UMax
@ UMax
*p = old >unsigned v ? old : v
Definition Instructions.h:742

llvm::AtomicRMWInst::UDecWrap
@ UDecWrap
Decrement one until a minimum value or zero.
Definition Instructions.h:782

llvm::AtomicRMWInst::Xchg
@ Xchg
*p = v
Definition Instructions.h:724

llvm::AtomicRMWInst::isFloatingPointOperation
bool isFloatingPointOperation() const
Definition Instructions.h:917

llvm::AtomicRMWInst::getOperation
BinOp getOperation() const
Definition Instructions.h:830

llvm::AtomicRMWInst::getValOperand
Value * getValOperand()
Definition Instructions.h:909

llvm::AtomicRMWInst::getPointerAddressSpace
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition Instructions.h:913

llvm::AtomicSDNode
This is an SDNode representing atomic operations.
Definition SelectionDAGNodes.h:1655

llvm::CallBase
Base class for all callable instructions (InvokeInst and CallInst). Holds everything related to callin...
Definition InstrTypes.h:1181

llvm::CallBase::getCalledFunction
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition InstrTypes.h:1417

llvm::CallBase::getFunctionType
FunctionType * getFunctionType() const
Definition InstrTypes.h:1274

llvm::ConstantSDNode
Definition SelectionDAGNodes.h:1815

llvm::ConstantSDNode::getAPIntValue
const APInt & getAPIntValue() const
Definition SelectionDAGNodes.h:1831

llvm::Constant::getNullValue
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition Constants.cpp:363

llvm::DWARFExpression::Operation::getNumOperands
uint64_t getNumOperands() const
Definition DWARFExpression.h:93

llvm::DataLayout
A parsed version of the target data layout string, and methods for querying it.
Definition DataLayout.h:64

llvm::DataLayout::getTypeAllocSize
LLVM_ABI TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition DataLayout.cpp:951

llvm::DataLayout::getPrefTypeAlign
LLVM_ABI Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition DataLayout.cpp:993

llvm::DemandedBits
Definition DemandedBits.h:41

llvm::DiagnosticInfoUnsupported
Diagnostic information for unsupported feature in backend.
Definition DiagnosticInfo.h:1103

llvm::Function
Definition Function.h:65

llvm::Function::addFnAttr
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition Function.cpp:638

llvm::Function::getDenormalMode
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Definition Function.cpp:804

llvm::GlobalObject
Definition GlobalObject.h:28

llvm::GlobalValue::getParent
Module * getParent()
Get the module that this global value is contained inside of...
Definition GlobalValue.h:663

llvm::IRBuilderBase
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114

llvm::Instruction
Definition Instruction.h:70

llvm::Instruction::getFunction
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
Definition Instruction.cpp:90

llvm::LLVMContext
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68

llvm::LLVMContext::diagnose
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
Definition LLVMContext.cpp:249

llvm::LoadSDNode
This class is used to represent ISD::LOAD nodes.
Definition SelectionDAGNodes.h:2656

llvm::MCObjectFileInfo::getDataSection
MCSection * getDataSection() const
Definition MCObjectFileInfo.h:275

llvm::MCRegister::NoRegister
static constexpr unsigned NoRegister
Definition MCRegister.h:60

llvm::MCSection
Instances of this class represent a uniqued identifier for a section in the current translation unit.
Definition MCSection.h:573

llvm::MCSymbol::getName
StringRef getName() const
getName - Get the symbol name.
Definition MCSymbol.h:188

llvm::MVT
Machine Value Type.
Definition MachineValueType.h:36

llvm::MVT::integer_fixedlen_vector_valuetypes
static auto integer_fixedlen_vector_valuetypes()
Definition MachineValueType.h:581

llvm::MVT::SimpleTy
SimpleValueType SimpleTy
Definition MachineValueType.h:55

llvm::MVT::getVectorNumElements
unsigned getVectorNumElements() const
Definition MachineValueType.h:322

llvm::MVT::isVector
bool isVector() const
Return true if this is a vector value type.
Definition MachineValueType.h:106

llvm::MVT::isScalableVector
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
Definition MachineValueType.h:118

llvm::MVT::integer_valuetypes
static auto integer_valuetypes()
Definition MachineValueType.h:552

llvm::MVT::getSizeInBits
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
Definition MachineValueType.h:336

llvm::MVT::fixedlen_vector_valuetypes
static auto fixedlen_vector_valuetypes()
Definition MachineValueType.h:569

llvm::MVT::getStoreSize
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition MachineValueType.h:384

llvm::MVT::getVectorVT
static MVT getVectorVT(MVT VT, unsigned NumElements)
Definition MachineValueType.h:479

llvm::MVT::getVectorElementType
MVT getVectorElementType() const
Definition MachineValueType.h:291

llvm::MVT::getIntegerVT
static MVT getIntegerVT(unsigned BitWidth)
Definition MachineValueType.h:469

llvm::MVT::fp_valuetypes
static auto fp_valuetypes()
Definition MachineValueType.h:558

llvm::MVT::getScalarType
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
Definition MachineValueType.h:287

llvm::MVT::fp_fixedlen_vector_valuetypes
static auto fp_fixedlen_vector_valuetypes()
Definition MachineValueType.h:587

llvm::MachineFunction
Definition MachineFunction.h:294

llvm::MachineFunction::getDenormalMode
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Definition MachineFunction.cpp:331

llvm::MachineFunction::getFunction
Function & getFunction()
Return the LLVM function that this machine code represents.
Definition MachineFunction.h:749

llvm::MachineFunction::getTarget
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Definition MachineFunction.h:784

llvm::MachineJumpTableInfo::EK_Inline
@ EK_Inline
EK_Inline - Jump table entries are emitted inline at their point of use.
Definition MachineJumpTableInfo.h:84

llvm::MachineMemOperand::MODereferenceable
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
Definition MachineMemOperand.h:145

llvm::MachineMemOperand::MOLoad
@ MOLoad
The memory access reads data.
Definition MachineMemOperand.h:137

llvm::MachineMemOperand::MOInvariant
@ MOInvariant
The memory access always returns the same value (or traps).
Definition MachineMemOperand.h:147

llvm::MachineMemOperand::MOStore
@ MOStore
The memory access writes data.
Definition MachineMemOperand.h:139

llvm::MemIntrinsicSDNode
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
Definition SelectionDAGNodes.h:1729

llvm::MemSDNode
This is an abstract virtual class for memory operations.
Definition SelectionDAGNodes.h:1418

llvm::MemSDNode::getAlign
Align getAlign() const
Definition SelectionDAGNodes.h:1443

llvm::MemSDNode::getMemOperand
MachineMemOperand * getMemOperand() const
Return the unique MachineMemOperand object describing the memory reference performed by operation.
Definition SelectionDAGNodes.h:1514

llvm::MemSDNode::getMemoryVT
EVT getMemoryVT() const
Return the type of the in-memory value.
Definition SelectionDAGNodes.h:1509

llvm::Module
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67

llvm::NVPTXDAGToDAGISel::getFromTypeWidthForLoad
static unsigned getFromTypeWidthForLoad(const MemSDNode *Mem)
Definition NVPTXISelDAGToDAG.cpp:1332

llvm::NVPTXSubtarget
Definition NVPTXSubtarget.h:36

llvm::NVPTXSubtarget::hasTensormapReplaceSwizzleModeSupport
bool hasTensormapReplaceSwizzleModeSupport(unsigned value) const
Definition NVPTXSubtarget.h:270

llvm::NVPTXSubtarget::hasUsedBytesMaskPragma
bool hasUsedBytesMaskPragma() const
Definition NVPTXSubtarget.h:98

llvm::NVPTXSubtarget::hasTensormapReplaceElemtypeSupport
bool hasTensormapReplaceElemtypeSupport(unsigned value) const
Definition NVPTXSubtarget.h:255

llvm::NVPTXSubtarget::hasAtomSwap128
bool hasAtomSwap128() const
Definition NVPTXSubtarget.h:106

llvm::NVPTXSubtarget::hasF32x2Instructions
bool hasF32x2Instructions() const
Definition NVPTXSubtarget.cpp:202

llvm::NVPTXSubtarget::has256BitVectorLoadStore
bool has256BitVectorLoadStore(unsigned AS) const
Definition NVPTXSubtarget.h:94

llvm::NVPTXTargetLowering::atomicOperationOrderAfterFenceSplit
AtomicOrdering atomicOperationOrderAfterFenceSplit(const Instruction *I) const override
Definition NVPTXISelLowering.cpp:7630

llvm::NVPTXTargetLowering::getConstraintType
ConstraintType getConstraintType(StringRef Constraint) const override
getConstraintType - Given a constraint letter, return the type of constraint it is for this target.
Definition NVPTXISelLowering.cpp:5633

llvm::NVPTXTargetLowering::LowerOperation
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
Definition NVPTXISelLowering.cpp:3434

llvm::NVPTXTargetLowering::nvTM
const NVPTXTargetMachine * nvTM
Definition NVPTXISelLowering.h:100

llvm::NVPTXTargetLowering::SimplifyDemandedBitsForTargetNode
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
Definition NVPTXISelLowering.cpp:7894

llvm::NVPTXTargetLowering::shouldExpandAtomicRMWInIR
AtomicExpansionKind shouldExpandAtomicRMWInIR(const AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
Definition NVPTXISelLowering.cpp:7485

llvm::NVPTXTargetLowering::NVPTXTargetLowering
NVPTXTargetLowering(const NVPTXTargetMachine &TM, const NVPTXSubtarget &STI)
Definition NVPTXISelLowering.cpp:521

llvm::NVPTXTargetLowering::getPrototype
std::string getPrototype(const DataLayout &DL, Type *, const ArgListTy &, const SmallVectorImpl< ISD::OutputArg > &, std::optional< unsigned > FirstVAArg, const CallBase &CB, unsigned UniqueCallSite) const
Definition NVPTXISelLowering.cpp:1209

llvm::NVPTXTargetLowering::getPreferredFPToIntOpcode
unsigned getPreferredFPToIntOpcode(unsigned Op, EVT FromVT, EVT ToVT) const override
Definition NVPTXISelLowering.cpp:7717

llvm::NVPTXTargetLowering::useF32FTZ
bool useF32FTZ(const MachineFunction &MF) const
Definition NVPTXISelLowering.cpp:156

llvm::NVPTXTargetLowering::LowerSTACKSAVE
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
Definition NVPTXISelLowering.cpp:1856

llvm::NVPTXTargetLowering::getSqrtEstimate
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &ExtraSteps, bool &UseOneConst, bool Reciprocal) const override
Hooks for building estimates in place of slower divisions and square roots.
Definition NVPTXISelLowering.cpp:1157

llvm::NVPTXTargetLowering::LowerReturn
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &dl, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
Definition NVPTXISelLowering.cpp:4211

llvm::NVPTXTargetLowering::LowerFormalArguments
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
Definition NVPTXISelLowering.cpp:4087

llvm::NVPTXTargetLowering::LowerAsmOperandForConstraint
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
Definition NVPTXISelLowering.cpp:4276

llvm::NVPTXTargetLowering::LowerSTACKRESTORE
SDValue LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG) const
Definition NVPTXISelLowering.cpp:1834

llvm::NVPTXTargetLowering::emitTrailingFence
Instruction * emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
Definition NVPTXISelLowering.cpp:7687

llvm::NVPTXTargetLowering::getParamName
std::string getParamName(const Function *F, int Idx) const
Definition NVPTXISelLowering.cpp:5573

llvm::NVPTXTargetLowering::getPreferredVectorAction
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
Definition NVPTXISelLowering.cpp:1150

llvm::NVPTXTargetLowering::getDivF32Level
NVPTX::DivPrecisionLevel getDivF32Level(const MachineFunction &MF, const SDNode &N) const
Definition NVPTXISelLowering.cpp:129

llvm::NVPTXTargetLowering::shouldInsertFencesForAtomic
bool shouldInsertFencesForAtomic(const Instruction *) const override
Whether AtomicExpandPass should automatically insert fences and reduce ordering for this atomic.
Definition NVPTXISelLowering.cpp:7603

llvm::NVPTXTargetLowering::LowerDYNAMIC_STACKALLOC
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
Definition NVPTXISelLowering.cpp:1794

llvm::NVPTXTargetLowering::getSetCCResultType
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx, EVT VT) const override
Return the ValueType of the result of SETCC operations.
Definition NVPTXISelLowering.h:61

llvm::NVPTXTargetLowering::getRegForInlineAsmConstraint
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g. {edx}), return the register number and the register class for the register.
Definition NVPTXISelLowering.cpp:5655

llvm::NVPTXTargetLowering::isLegalAddressingMode
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
isLegalAddressingMode - Return true if the addressing mode represented by AM is legal for this target...
Definition NVPTXISelLowering.cpp:5592

llvm::NVPTXTargetLowering::emitLeadingFence
Instruction * emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
Inserts in the IR a target-specific intrinsic specifying a fence.
Definition NVPTXISelLowering.cpp:7663

llvm::NVPTXTargetLowering::getTgtMemIntrinsic
void getTgtMemIntrinsic(SmallVectorImpl< IntrinsicInfo > &Infos, const CallBase &I, MachineFunction &MF, unsigned Intrinsic) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
Definition NVPTXISelLowering.cpp:4289

llvm::NVPTXTargetLowering::allowFMA
bool allowFMA(MachineFunction &MF, CodeGenOptLevel OptLevel) const
Definition NVPTXISelLowering.cpp:5687

llvm::NVPTXTargetLowering::usePrecSqrtF32
bool usePrecSqrtF32(const SDNode *N=nullptr) const
Definition NVPTXISelLowering.cpp:142

llvm::NVPTXTargetLowering::getJumpTableEncoding
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
Definition NVPTXISelLowering.cpp:3561

llvm::NVPTXTargetLowering::LowerCall
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
Definition NVPTXISelLowering.cpp:1391

llvm::NVPTXTargetLowering::computeKnownBitsForTargetNode
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
Definition NVPTXISelLowering.cpp:7797

llvm::NVPTXTargetMachine
NVPTXTargetMachine.
Definition NVPTXTargetMachine.h:25

llvm::NVPTXTargetObjectFile::SelectSectionForGlobal
MCSection * SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const override
Definition NVPTXISelLowering.cpp:7743

llvm::NVPTXTargetObjectFile::~NVPTXTargetObjectFile
~NVPTXTargetObjectFile() override

llvm::PointerType::get
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.

llvm::SDLoc
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Definition SelectionDAGNodes.h:1246

llvm::SDLoc::getDebugLoc
const DebugLoc & getDebugLoc() const
Definition SelectionDAGNodes.h:1262

llvm::SDNode
Represents one node in the SelectionDAG.
Definition SelectionDAGNodes.h:511

llvm::SDNode::ops
ArrayRef< SDUse > ops() const
Definition SelectionDAGNodes.h:1065

llvm::SDNode::getAsAPIntVal
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
Definition SelectionDAGNodes.h:1866

llvm::SDNode::getOpcode
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
Definition SelectionDAGNodes.h:706

llvm::SDNode::hasOneUse
bool hasOneUse() const
Return true if there is exactly one use of this node.
Definition SelectionDAGNodes.h:778

llvm::SDNode::getIROrder
unsigned getIROrder() const
Return the node ordering.
Definition SelectionDAGNodes.h:805

llvm::SDNode::getFlags
SDNodeFlags getFlags() const
Definition SelectionDAGNodes.h:1107

llvm::SDNode::getAsZExtVal
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
Definition SelectionDAGNodes.h:1858

llvm::SDNode::getNumValues
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
Definition SelectionDAGNodes.h:1123

llvm::SDNode::getVTList
SDVTList getVTList() const
Definition SelectionDAGNodes.h:1084

llvm::SDNode::getOperand
const SDValue & getOperand(unsigned Num) const
Definition SelectionDAGNodes.h:1056

llvm::SDNode::isUndef
bool isUndef() const
Returns true if the node type is UNDEF or POISON.
Definition SelectionDAGNodes.h:713

llvm::SDNode::users
iterator_range< user_iterator > users()
Definition SelectionDAGNodes.h:918

llvm::SDNode::setFlags
void setFlags(SDNodeFlags NewFlags)
Definition SelectionDAGNodes.h:1108

llvm::SDUse
Represents a use of a SDNode.
Definition SelectionDAGNodes.h:280

llvm::SDValue
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
Definition SelectionDAGNodes.h:147

llvm::SDValue::getNode
SDNode * getNode() const
get the SDNode which holds the desired result
Definition SelectionDAGNodes.h:161

llvm::SDValue::hasOneUse
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
Definition SelectionDAGNodes.h:1323

llvm::SDValue::getValue
SDValue getValue(unsigned R) const
Definition SelectionDAGNodes.h:181

llvm::SDValue::getValueType
EVT getValueType() const
Return the ValueType of the referenced return value.
Definition SelectionDAGNodes.h:1281

llvm::SDValue::getValueSizeInBits
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
Definition SelectionDAGNodes.h:201

llvm::SDValue::getOperand
const SDValue & getOperand(unsigned i) const
Definition SelectionDAGNodes.h:1289

llvm::SDValue::getScalarValueSizeInBits
uint64_t getScalarValueSizeInBits() const
Definition SelectionDAGNodes.h:205

llvm::SDValue::getConstantOperandVal
uint64_t getConstantOperandVal(unsigned i) const
Definition SelectionDAGNodes.h:1293

llvm::SDValue::getOpcode
unsigned getOpcode() const
Definition SelectionDAGNodes.h:1277

llvm::SectionKind
SectionKind - This is a simple POD value that classifies the properties of a section.
Definition SectionKind.h:22

llvm::SelectionDAG
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition SelectionDAG.h:231

llvm::SelectionDAG::getExtLoad
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Definition SelectionDAG.cpp:10663

llvm::SelectionDAG::getRoot
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
Definition SelectionDAG.h:601

llvm::SelectionDAG::getAddrSpaceCast
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
Definition SelectionDAG.cpp:2546

llvm::SelectionDAG::getSubtarget
const TargetSubtargetInfo & getSubtarget() const
Definition SelectionDAG.h:516

llvm::SelectionDAG::getMergeValues
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
Definition SelectionDAG.cpp:10381

llvm::SelectionDAG::getVTList
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
Definition SelectionDAG.cpp:12073

llvm::SelectionDAG::ExtractVectorElements
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
Definition SelectionDAG.cpp:14499

llvm::SelectionDAG::getFreeze
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
Definition SelectionDAG.cpp:2568

llvm::SelectionDAG::getSymbolFunctionGlobalAddress
LLVM_ABI SDValue getSymbolFunctionGlobalAddress(SDValue Op, Function **TargetFunction=nullptr)
Return a GlobalAddress of the function from the current module with name matching the given ExternalS...
Definition SelectionDAG.cpp:13598

llvm::SelectionDAG::getConstantFP
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
Definition SelectionDAG.cpp:1934

llvm::SelectionDAG::getRegister
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
Definition SelectionDAG.cpp:2434

llvm::SelectionDAG::getLoad
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
Definition SelectionDAG.cpp:10646

llvm::SelectionDAG::getMemIntrinsicNode
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
Definition SelectionDAG.cpp:10392

llvm::SelectionDAG::getSetCC
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false, SDNodeFlags Flags={})
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
Definition SelectionDAG.h:1382

llvm::SelectionDAG::getEVTAlign
LLVM_ABI Align getEVTAlign(EVT MemoryVT) const
Compute the default alignment value for the given type.
Definition SelectionDAG.cpp:1409

llvm::SelectionDAG::getNOT
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
Definition SelectionDAG.cpp:1681

llvm::SelectionDAG::MorphNodeTo
LLVM_ABI SDNode * MorphNodeTo(SDNode *N, unsigned Opc, SDVTList VTs, ArrayRef< SDValue > Ops)
This mutates the specified node to have the specified return type, opcode, and operands.
Definition SelectionDAG.cpp:12413

llvm::SelectionDAG::getUNDEF
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
Definition SelectionDAG.h:1207

llvm::SelectionDAG::getCALLSEQ_END
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
Definition SelectionDAG.h:1184

llvm::SelectionDAG::getBuildVector
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition SelectionDAG.h:896

llvm::SelectionDAG::getBitcast
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
Definition SelectionDAG.cpp:2539

llvm::SelectionDAG::getSelect
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
Definition SelectionDAG.h:1412

llvm::SelectionDAG::getDataLayout
const DataLayout & getDataLayout() const
Definition SelectionDAG.h:514

llvm::SelectionDAG::getTokenFactor
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
Definition SelectionDAG.cpp:15011

llvm::SelectionDAG::getConstant
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
Definition SelectionDAG.cpp:1725

llvm::SelectionDAG::getTruncStore
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Definition SelectionDAG.cpp:10772

llvm::SelectionDAG::getStore
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
Definition SelectionDAG.cpp:10696

llvm::SelectionDAG::getSignedConstant
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Definition SelectionDAG.cpp:1855

llvm::SelectionDAG::getCALLSEQ_START
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
Definition SelectionDAG.h:1172

llvm::SelectionDAG::getSelectCC
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
Definition SelectionDAG.h:1422

llvm::SelectionDAG::getExternalSymbol
LLVM_ABI SDValue getExternalSymbol(const char *Sym, EVT VT)
Definition SelectionDAG.cpp:2128

llvm::SelectionDAG::getAnyExtOrTrunc
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
Definition SelectionDAG.cpp:1549

llvm::SelectionDAG::getIntPtrConstant
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
Definition SelectionDAG.cpp:1867

llvm::SelectionDAG::getNode
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
Definition SelectionDAG.cpp:11704

llvm::SelectionDAG::getFPExtendOrRound
LLVM_ABI SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
Definition SelectionDAG.cpp:1528

llvm::SelectionDAG::getTargetConstant
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition SelectionDAG.h:730

llvm::SelectionDAG::getVectorIdxConstant
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
Definition SelectionDAG.cpp:1885

llvm::SelectionDAG::getMachineFunction
MachineFunction & getMachineFunction() const
Definition SelectionDAG.h:509

llvm::SelectionDAG::computeKnownBits
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
Definition SelectionDAG.cpp:3350

llvm::SelectionDAG::getZExtOrTrunc
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
Definition SelectionDAG.cpp:1561

llvm::SelectionDAG::getObjectPtrOffset
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
Definition SelectionDAG.h:1157

llvm::SelectionDAG::getContext
LLVMContext * getContext() const
Definition SelectionDAG.h:534

llvm::SelectionDAG::setRoot
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
Definition SelectionDAG.h:610

llvm::SelectionDAG::getTargetExternalSymbol
LLVM_ABI SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
Definition SelectionDAG.cpp:2150

llvm::ShuffleVectorSDNode::getMask
ArrayRef< int > getMask() const
Definition SelectionDAGNodes.h:1768

llvm::SmallVectorImpl
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition SmallVector.h:581

llvm::SmallVectorImpl::append
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition SmallVector.h:691

llvm::SmallVectorTemplateBase::push_back
void push_back(const T &Elt)
Definition SmallVector.h:423

llvm::SmallVectorTemplateCommon::size
size_t size() const
Definition SmallVector.h:83

llvm::SmallVectorTemplateCommon::empty
bool empty() const
Definition SmallVector.h:86

llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition SmallVector.h:1225

llvm::StoreSDNode
This class is used to represent ISD::STORE nodes.
Definition SelectionDAGNodes.h:2684

llvm::StringRef
Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition StringRef.h:56

llvm::StringRef::size
constexpr size_t size() const
Get the string size.
Definition StringRef.h:144

llvm::StringRef::data
constexpr const char * data() const
Get a pointer to the start of the string (which may not be null terminated).
Definition StringRef.h:138

llvm::TargetFrameLowering::getStackAlign
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
Definition TargetFrameLowering.h:107

llvm::TargetLoweringBase::setBooleanVectorContents
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
Definition TargetLowering.h:2642

llvm::TargetLoweringBase::setOperationAction
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
Definition TargetLowering.h:2705

llvm::TargetLoweringBase::setMaxDivRemBitWidthSupported
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
Definition TargetLowering.h:2934

llvm::TargetLoweringBase::Enabled
@ Enabled
Definition TargetLowering.h:591

llvm::TargetLoweringBase::Unspecified
@ Unspecified
Definition TargetLowering.h:589

llvm::TargetLoweringBase::getValueType
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
Definition TargetLowering.h:1777

llvm::TargetLoweringBase::Custom
@ Custom
Definition TargetLowering.h:208

llvm::TargetLoweringBase::Expand
@ Expand
Definition TargetLowering.h:206

llvm::TargetLoweringBase::Promote
@ Promote
Definition TargetLowering.h:205

llvm::TargetLoweringBase::MaxStoresPerMemcpyOptSize
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
Definition TargetLowering.h:3985

llvm::TargetLoweringBase::getTargetMachine
const TargetMachine & getTargetMachine() const
Definition TargetLowering.h:374

llvm::TargetLoweringBase::getNumRegistersForCallingConv
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
Definition TargetLowering.h:1895

llvm::TargetLoweringBase::getRegisterTypeForCallingConv
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
Definition TargetLowering.h:1887

llvm::TargetLoweringBase::setOperationPromotedToType
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
Definition TargetLowering.h:2878

llvm::TargetLoweringBase::LegalizeTypeAction
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
Definition TargetLowering.h:213

llvm::TargetLoweringBase::TypeSplitVector
@ TypeSplitVector
Definition TargetLowering.h:220

llvm::TargetLoweringBase::addBypassSlowDiv
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
Definition TargetLowering.h:2681

llvm::TargetLoweringBase::getNumRegisters
virtual unsigned getNumRegisters(LLVMContext &Context, EVT VT, std::optional< MVT > RegisterVT=std::nullopt) const
Return the number of registers that this ValueType will eventually require.
Definition TargetLowering.h:1863

llvm::TargetLoweringBase::setMaxAtomicSizeInBitsSupported
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
Definition TargetLowering.h:2928

llvm::TargetLoweringBase::getPreferredVectorAction
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
Definition TargetLowering.h:538

llvm::TargetLoweringBase::MaxStoresPerMemsetOptSize
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
Definition TargetLowering.h:3970

llvm::TargetLoweringBase::setBooleanContents
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
Definition TargetLowering.h:2628

llvm::TargetLoweringBase::MaxStoresPerMemmove
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
Definition TargetLowering.h:4018

llvm::TargetLoweringBase::computeRegisterProperties
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
Definition TargetLoweringBase.cpp:1717

llvm::TargetLoweringBase::MaxStoresPerMemmoveOptSize
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
Definition TargetLowering.h:4020

llvm::TargetLoweringBase::addRegisterClass
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
Definition TargetLowering.h:2688

llvm::TargetLoweringBase::isTypeLegal
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
Definition TargetLowering.h:1105

llvm::TargetLoweringBase::getPointerTy
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
Definition TargetLowering.h:381

llvm::TargetLoweringBase::isOperationLegal
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
Definition TargetLowering.h:1483

llvm::TargetLoweringBase::MaxStoresPerMemset
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
Definition TargetLowering.h:3968

llvm::TargetLoweringBase::setTruncStoreAction
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
Definition TargetLowering.h:2768

llvm::TargetLoweringBase::ZeroOrNegativeOneBooleanContent
@ ZeroOrNegativeOneBooleanContent
Definition TargetLowering.h:240

llvm::TargetLoweringBase::setMinCmpXchgSizeInBits
void setMinCmpXchgSizeInBits(unsigned SizeInBits)
Sets the minimum cmpxchg or ll/sc size supported by the backend.
Definition TargetLowering.h:2945

llvm::TargetLoweringBase::AddPromotedToType
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
Definition TargetLowering.h:2872

llvm::TargetLoweringBase::AtomicExpansionKind
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
Definition TargetLowering.h:256

llvm::TargetLoweringBase::AtomicExpansionKind::CmpXChg
@ CmpXChg
Definition TargetLowering.h:264

llvm::TargetLoweringBase::AtomicExpansionKind::None
@ None
Definition TargetLowering.h:257

llvm::TargetLoweringBase::setCondCodeAction
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
Definition TargetLowering.h:2829

llvm::TargetLoweringBase::setTargetDAGCombine
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
Definition TargetLowering.h:2893

llvm::TargetLoweringBase::getMinStackArgumentAlignment
Align getMinStackArgumentAlignment() const
Return the minimum stack alignment of an argument.
Definition TargetLowering.h:2145

llvm::TargetLoweringBase::setLoadExtAction
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
Definition TargetLowering.h:2722

llvm::TargetLoweringBase::ArgListTy
std::vector< ArgListEntry > ArgListTy
Definition TargetLowering.h:341

llvm::TargetLoweringBase::emitTrailingFence
virtual Instruction * emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const
Definition TargetLoweringBase.cpp:2885

llvm::TargetLoweringBase::emitLeadingFence
virtual Instruction * emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const
Inserts in the IR a target-specific intrinsic specifying a fence.
Definition TargetLoweringBase.cpp:2876

llvm::TargetLoweringBase::MaxStoresPerMemcpy
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
Definition TargetLowering.h:3983

llvm::TargetLoweringBase::setSchedulingPreference
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
Definition TargetLowering.h:2647

llvm::TargetLoweringBase::getRegisterType
MVT getRegisterType(MVT VT) const
Return the type of registers that this ValueType will eventually require.
Definition TargetLowering.h:1828

llvm::TargetLoweringBase::setJumpIsExpensive
void setJumpIsExpensive(bool isExpensive=true)
Tells the code generator not to expand logic operations on comparison predicates into separate sequen...
Definition TargetLoweringBase.cpp:1383

llvm::TargetLoweringBase::getOperationAction
LegalizeAction getOperationAction(unsigned Op, EVT VT) const
Return how this operation should be treated: either it is legal, needs to be promoted to a larger siz...
Definition TargetLowering.h:1292

llvm::TargetLoweringObjectFile::TM
const TargetMachine * TM
Definition TargetLoweringObjectFile.h:70

llvm::TargetLowering
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
Definition TargetLowering.h:4047

llvm::TargetLowering::ConstraintType
ConstraintType
Definition TargetLowering.h:5262

llvm::TargetLowering::C_RegisterClass
@ C_RegisterClass
Definition TargetLowering.h:5264

llvm::TargetLowering::SimplifyMultipleUseDemandedBits
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
Definition TargetLowering.cpp:708

llvm::TargetLowering::getConstraintType
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
Definition TargetLowering.cpp:5802

llvm::TargetLowering::getRegForInlineAsmConstraint
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
Definition TargetLowering.cpp:5946

llvm::TargetLowering::TargetLowering
TargetLowering(const TargetLowering &)=delete

llvm::TargetLowering::expandRoundInexactToOdd
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
Definition TargetLowering.cpp:12852

llvm::TargetLowering::expandFP_ROUND
SDValue expandFP_ROUND(SDNode *Node, SelectionDAG &DAG) const
Expand round(fp) to fp conversion.
Definition TargetLowering.cpp:12902

llvm::TargetLowering::LowerAsmOperandForConstraint
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
Definition TargetLowering.cpp:5864

llvm::TargetMachine
Primary interface to the complete machine description for the target machine.
Definition TargetMachine.h:83

llvm::TargetMachine::getOptLevel
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
Definition TargetMachine.h:289

llvm::TargetMachine::Options
TargetOptions Options
Definition TargetMachine.h:124

llvm::TargetMachine::getSymbol
MCSymbol * getSymbol(const GlobalValue *GV) const
Definition TargetMachine.cpp:303

llvm::TargetOptions::AllowFPOpFusion
FPOpFusion::FPOpFusionMode AllowFPOpFusion
AllowFPOpFusion - This flag is set by the -fp-contract=xxx option.
Definition TargetOptions.h:381

llvm::TargetRegisterInfo
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Definition TargetRegisterInfo.h:242

llvm::TargetSubtargetInfo::getFrameLowering
virtual const TargetFrameLowering * getFrameLowering() const
Definition TargetSubtargetInfo.h:101

llvm::Twine
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82

llvm::TypeSize
Definition TypeSize.h:332

llvm::TypeSize::getFixed
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343

llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46

llvm::Type::getPrimitiveSizeInBits
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197

llvm::Type::isFloatingPointTy
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186

llvm::Type::isIntegerTy
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257

llvm::Type::isVoidTy
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:141

llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255

llvm::Value::getName
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318

llvm::cl::opt
Definition CommandLine.h:1454

llvm::raw_string_ostream
A raw_ostream that writes to an std::string.
Definition raw_ostream.h:662

uint32_t

uint64_t

unsigned

Call
CallInst * Call
Definition ObjCARCOpts.cpp:2356

Analysis.h

ErrorHandling.h

llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition ErrorHandling.h:164

TargetMachine.h

llvm::APIntOps::pow
LLVM_ABI APInt pow(const APInt &X, int64_t N)
Compute X^N for N>=0.
Definition APInt.cpp:3207

llvm::ARM_MB::LD
@ LD
Definition ARMBaseInfo.h:72

llvm::ARM_MB::ST
@ ST
Definition ARMBaseInfo.h:73

llvm::CallingConv::ID
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24

llvm::CallingConv::C
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34

llvm::FPOpFusion::Fast
@ Fast
Definition TargetOptions.h:32

llvm::IRSimilarity::Legal
@ Legal
Definition IRSimilarityIdentifier.h:77

llvm::ISD::NodeType
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41

llvm::ISD::SETCC
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:823

llvm::ISD::STACKRESTORE
@ STACKRESTORE
STACKRESTORE has two operands: an input chain and a pointer to restore to. It returns an output chain.
Definition ISDOpcodes.h:1264

llvm::ISD::STACKSAVE
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition ISDOpcodes.h:1260

llvm::ISD::STORE
@ STORE
Definition ISDOpcodes.h:1170

llvm::ISD::POISON
@ POISON
POISON - A poison node.
Definition ISDOpcodes.h:236

llvm::ISD::MLOAD
@ MLOAD
Masked load and store - consecutive vector load and store operations with additional mask operand tha...
Definition ISDOpcodes.h:1431

llvm::ISD::SREM
@ SREM
Definition ISDOpcodes.h:269

llvm::ISD::SMUL_LOHI
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:275

llvm::ISD::SSUBO_CARRY
@ SSUBO_CARRY
Definition ISDOpcodes.h:339

llvm::ISD::UDIV
@ UDIV
Definition ISDOpcodes.h:268

llvm::ISD::UINT_TO_FP
@ UINT_TO_FP
Definition ISDOpcodes.h:885

llvm::ISD::UMIN
@ UMIN
Definition ISDOpcodes.h:729

llvm::ISD::BSWAP
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:783

llvm::ISD::ROTR
@ ROTR
Definition ISDOpcodes.h:773

llvm::ISD::VAEND
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
Definition ISDOpcodes.h:1293

llvm::ISD::ConstantFP
@ ConstantFP
Definition ISDOpcodes.h:87

llvm::ISD::UADDO
@ UADDO
Definition ISDOpcodes.h:349

llvm::ISD::FTRUNC
@ FTRUNC
Definition ISDOpcodes.h:1062

llvm::ISD::SDIV
@ SDIV
Definition ISDOpcodes.h:267

llvm::ISD::ADDC
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:294

llvm::ISD::FMAXNUM_IEEE
@ FMAXNUM_IEEE
Definition ISDOpcodes.h:1104

llvm::ISD::ADD
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264

llvm::ISD::LOAD
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition ISDOpcodes.h:1169

llvm::ISD::ANY_EXTEND
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:857

llvm::ISD::FSUB
@ FSUB
Definition ISDOpcodes.h:418

llvm::ISD::FMA
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:518

llvm::ISD::SUBC
@ SUBC
Definition ISDOpcodes.h:295

llvm::ISD::FABS
@ FABS
Definition ISDOpcodes.h:1031

llvm::ISD::FNEARBYINT
@ FNEARBYINT
Definition ISDOpcodes.h:1064

llvm::ISD::INTRINSIC_VOID
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:220

llvm::ISD::RETURNADDR
@ RETURNADDR
Definition ISDOpcodes.h:111

llvm::ISD::SINT_TO_FP
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:884

llvm::ISD::CONCAT_VECTORS
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:584

llvm::ISD::VECREDUCE_FMAX
@ VECREDUCE_FMAX
FMIN/FMAX nodes can have flags, for NaN/NoNaN variants.
Definition ISDOpcodes.h:1510

llvm::ISD::FADD
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417

llvm::ISD::VECREDUCE_FMAXIMUM
@ VECREDUCE_FMAXIMUM
FMINIMUM/FMAXIMUM nodes propagate NaNs and signed zeroes using the llvm.minimum and llvm....
Definition ISDOpcodes.h:1514

llvm::ISD::ABS
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:747

llvm::ISD::UDIVREM
@ UDIVREM
Definition ISDOpcodes.h:281

llvm::ISD::SDIVREM
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:280

llvm::ISD::SRL
@ SRL
Definition ISDOpcodes.h:771

llvm::ISD::FMAXIMUM
@ FMAXIMUM
Definition ISDOpcodes.h:1110

llvm::ISD::BITCAST
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition ISDOpcodes.h:997

llvm::ISD::BUILD_PAIR
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:254

llvm::ISD::FFLOOR
@ FFLOOR
Definition ISDOpcodes.h:1067

llvm::ISD::SRA
@ SRA
Definition ISDOpcodes.h:770

llvm::ISD::FrameIndex
@ FrameIndex
Definition ISDOpcodes.h:90

llvm::ISD::CTLZ_ZERO_POISON
@ CTLZ_ZERO_POISON
Definition ISDOpcodes.h:792

llvm::ISD::USUBO
@ USUBO
Definition ISDOpcodes.h:353

llvm::ISD::SIGN_EXTEND
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:848

llvm::ISD::FLOG2
@ FLOG2
Definition ISDOpcodes.h:1056

llvm::ISD::READSTEADYCOUNTER
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readsteadycounter intrinsic.
Definition ISDOpcodes.h:1326

llvm::ISD::USHLSAT
@ USHLSAT
Definition ISDOpcodes.h:387

llvm::ISD::UADDSAT
@ UADDSAT
Definition ISDOpcodes.h:366

llvm::ISD::FMAXNUM
@ FMAXNUM
Definition ISDOpcodes.h:1088

llvm::ISD::FRINT
@ FRINT
Definition ISDOpcodes.h:1063

llvm::ISD::VECREDUCE_FMIN
@ VECREDUCE_FMIN
Definition ISDOpcodes.h:1511

llvm::ISD::FNEG
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition ISDOpcodes.h:1030

llvm::ISD::BR_CC
@ BR_CC
BR_CC - Conditional branch.
Definition ISDOpcodes.h:1215

llvm::ISD::CTTZ
@ CTTZ
Definition ISDOpcodes.h:784

llvm::ISD::SSUBO
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:352

llvm::ISD::FP_TO_UINT
@ FP_TO_UINT
Definition ISDOpcodes.h:931

llvm::ISD::BRIND
@ BRIND
BRIND - Indirect branch.
Definition ISDOpcodes.h:1190

llvm::ISD::BR_JT
@ BR_JT
BR_JT - Jumptable branch.
Definition ISDOpcodes.h:1194

llvm::ISD::OR
@ OR
Definition ISDOpcodes.h:740

llvm::ISD::SSUBSAT
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:374

llvm::ISD::UMULO
@ UMULO
Definition ISDOpcodes.h:357

llvm::ISD::SRA_PARTS
@ SRA_PARTS
Definition ISDOpcodes.h:838

llvm::ISD::SELECT
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:800

llvm::ISD::UMUL_LOHI
@ UMUL_LOHI
Definition ISDOpcodes.h:276

llvm::ISD::UNDEF
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:233

llvm::ISD::FTANH
@ FTANH
Definition ISDOpcodes.h:1042

llvm::ISD::EXTRACT_ELEMENT
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:247

llvm::ISD::VACOPY
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
Definition ISDOpcodes.h:1289

llvm::ISD::FSHL
@ FSHL
Definition ISDOpcodes.h:774

llvm::ISD::CopyFromReg
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:230

llvm::ISD::SADDO
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:348

llvm::ISD::FSHR
@ FSHR
Definition ISDOpcodes.h:775

llvm::ISD::FROUND
@ FROUND
Definition ISDOpcodes.h:1065

llvm::ISD::USUBSAT
@ USUBSAT
Definition ISDOpcodes.h:375

llvm::ISD::MULHU
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:704

llvm::ISD::SHL
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:769

llvm::ISD::VECTOR_SHUFFLE
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:649

llvm::ISD::EXTRACT_SUBVECTOR
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:614

llvm::ISD::FMINNUM_IEEE
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
Definition ISDOpcodes.h:1103

llvm::ISD::FCOS
@ FCOS
Definition ISDOpcodes.h:1035

llvm::ISD::XOR
@ XOR
Definition ISDOpcodes.h:741

llvm::ISD::EXTRACT_VECTOR_ELT
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:576

llvm::ISD::CopyToReg
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:224

llvm::ISD::ZERO_EXTEND
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:854

llvm::ISD::DEBUGTRAP
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition ISDOpcodes.h:1349

llvm::ISD::CTPOP
@ CTPOP
Definition ISDOpcodes.h:786

llvm::ISD::SELECT_CC
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:815

llvm::ISD::FMUL
@ FMUL
Definition ISDOpcodes.h:419

llvm::ISD::MSTORE
@ MSTORE
Definition ISDOpcodes.h:1432

llvm::ISD::ATOMIC_CMP_SWAP
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
Definition ISDOpcodes.h:1386

llvm::ISD::SRL_PARTS
@ SRL_PARTS
Definition ISDOpcodes.h:839

llvm::ISD::FMINNUM
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values, following IEEE-754 definition...
Definition ISDOpcodes.h:1087

llvm::ISD::SUB
@ SUB
Definition ISDOpcodes.h:265

llvm::ISD::MULHS
@ MULHS
Definition ISDOpcodes.h:705

llvm::ISD::SSHLSAT
@ SSHLSAT
RESULT = [US]SHLSAT(LHS, RHS) - Perform saturation left shift.
Definition ISDOpcodes.h:386

llvm::ISD::SMULO
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:356

llvm::ISD::PARITY
@ PARITY
Definition ISDOpcodes.h:788

llvm::ISD::DYNAMIC_STACKALLOC
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition ISDOpcodes.h:1179

llvm::ISD::SIGN_EXTEND_INREG
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:892

llvm::ISD::SMIN
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:727

llvm::ISD::Constant
@ Constant
Definition ISDOpcodes.h:86

llvm::ISD::FP_EXTEND
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:982

llvm::ISD::VSELECT
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:809

llvm::ISD::UADDO_CARRY
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:328

llvm::ISD::FROUNDEVEN
@ FROUNDEVEN
Definition ISDOpcodes.h:1066

llvm::ISD::FDIV
@ FDIV
Definition ISDOpcodes.h:420

llvm::ISD::BF16_TO_FP
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
Definition ISDOpcodes.h:1016

llvm::ISD::FRAMEADDR
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition ISDOpcodes.h:110

llvm::ISD::FREM
@ FREM
Definition ISDOpcodes.h:421

llvm::ISD::STRICT_FP_TO_UINT
@ STRICT_FP_TO_UINT
Definition ISDOpcodes.h:478

llvm::ISD::STRICT_FP_TO_SINT
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:477

llvm::ISD::FMINIMUM
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition ISDOpcodes.h:1109

llvm::ISD::ATOMIC_LOAD_SUB
@ ATOMIC_LOAD_SUB
Definition ISDOpcodes.h:1402

llvm::ISD::FP_TO_SINT
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:930

llvm::ISD::READCYCLECOUNTER
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition ISDOpcodes.h:1320

llvm::ISD::AND
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739

llvm::ISD::TRAP
@ TRAP
TRAP - Trapping instruction.
Definition ISDOpcodes.h:1346

llvm::ISD::INTRINSIC_WO_CHAIN
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:205

llvm::ISD::USUBO_CARRY
@ USUBO_CARRY
Definition ISDOpcodes.h:329

llvm::ISD::SUBE
@ SUBE
Definition ISDOpcodes.h:305

llvm::ISD::ADDE
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:304

llvm::ISD::UREM
@ UREM
Definition ISDOpcodes.h:270

llvm::ISD::INSERT_VECTOR_ELT
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:565

llvm::ISD::FSIN
@ FSIN
Definition ISDOpcodes.h:1034

llvm::ISD::FCEIL
@ FCEIL
Definition ISDOpcodes.h:1061

llvm::ISD::ATOMIC_SWAP
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition ISDOpcodes.h:1400

llvm::ISD::MUL
@ MUL
Definition ISDOpcodes.h:266

llvm::ISD::FP_ROUND
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:963

llvm::ISD::CTLZ
@ CTLZ
Definition ISDOpcodes.h:785

llvm::ISD::FMAXIMUMNUM
@ FMAXIMUMNUM
Definition ISDOpcodes.h:1115

llvm::ISD::VASTART
@ VASTART
Definition ISDOpcodes.h:1294

llvm::ISD::FSQRT
@ FSQRT
Definition ISDOpcodes.h:1032

llvm::ISD::ADDRSPACECAST
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition ISDOpcodes.h:1001

llvm::ISD::VECREDUCE_FMINIMUM
@ VECREDUCE_FMINIMUM
Definition ISDOpcodes.h:1515

llvm::ISD::TRUNCATE
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:860

llvm::ISD::VAARG
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
Definition ISDOpcodes.h:1284

llvm::ISD::ROTL
@ ROTL
Definition ISDOpcodes.h:772

llvm::ISD::SHL_PARTS
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:837

llvm::ISD::BITREVERSE
@ BITREVERSE
Definition ISDOpcodes.h:787

llvm::ISD::FCOPYSIGN
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:534

llvm::ISD::SADDSAT
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:365

llvm::ISD::FEXP2
@ FEXP2
Definition ISDOpcodes.h:1059

llvm::ISD::SMAX
@ SMAX
Definition ISDOpcodes.h:728

llvm::ISD::UMAX
@ UMAX
Definition ISDOpcodes.h:730

llvm::ISD::FMINIMUMNUM
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is the same as FMINNUM_IEEE and FMAXNUM_IEEE besid...
Definition ISDOpcodes.h:1114

llvm::ISD::SADDO_CARRY
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:338

llvm::ISD::INTRINSIC_W_CHAIN
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:213

llvm::ISD::ABS_MIN_POISON
@ ABS_MIN_POISON
ABS with a poison result for INT_MIN.
Definition ISDOpcodes.h:751

llvm::ISD::BUILD_VECTOR
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:556

llvm::ISD::allOperandsUndef
LLVM_ABI bool allOperandsUndef(const SDNode *N)
Return true if the node has at least one operand and all operands of the specified node are ISD::UNDE...
Definition SelectionDAG.cpp:338

llvm::ISD::SETOLE
@ SETOLE
Definition ISDOpcodes.h:1783

llvm::ISD::SETOLT
@ SETOLT
Definition ISDOpcodes.h:1782

llvm::ISD::SETNE
@ SETNE
Definition ISDOpcodes.h:1801

llvm::ISD::SETUGT
@ SETUGT
Definition ISDOpcodes.h:1788

llvm::ISD::SETOGT
@ SETOGT
Definition ISDOpcodes.h:1780

llvm::ISD::SETULT
@ SETULT
Definition ISDOpcodes.h:1790

llvm::ISD::SETGT
@ SETGT
Definition ISDOpcodes.h:1797

llvm::ISD::SETLT
@ SETLT
Definition ISDOpcodes.h:1799

llvm::ISD::SETGE
@ SETGE
Definition ISDOpcodes.h:1798

llvm::ISD::SETUGE
@ SETUGE
Definition ISDOpcodes.h:1789

llvm::ISD::SETLE
@ SETLE
Definition ISDOpcodes.h:1800

llvm::ISD::SETULE
@ SETULE
Definition ISDOpcodes.h:1791

llvm::ISD::SETOGE
@ SETOGE
Definition ISDOpcodes.h:1781

llvm::ISD::SETEQ
@ SETEQ
Definition ISDOpcodes.h:1796

llvm::ISD::NON_EXTLOAD
@ NON_EXTLOAD
Definition ISDOpcodes.h:1756

llvm::ISD::SEXTLOAD
@ SEXTLOAD
Definition ISDOpcodes.h:1756

llvm::ISD::ZEXTLOAD
@ ZEXTLOAD
Definition ISDOpcodes.h:1756

llvm::ISD::EXTLOAD
@ EXTLOAD
Definition ISDOpcodes.h:1756

llvm::Intrinsic
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
Definition GenericSSAContext.h:27

llvm::Intrinsic::not_intrinsic
@ not_intrinsic
Definition Intrinsics.h:49

llvm::Intrinsic::getName
LLVM_ABI StringRef getName(ID id)
Return the LLVM name for an intrinsic, such as "llvm.ppc.altivec.lvx".
Definition Intrinsics.cpp:58

llvm::Intrinsic::ID
unsigned ID
Definition GenericSSAContext.h:28

llvm::LegacyLegalizeActions::Bitcast
@ Bitcast
Perform the operation on a different, but equivalently sized type.
Definition LegacyLegalizerInfo.h:56

llvm::LegalizeActions::LegalizeAction
LegalizeAction
Definition LegalizerInfo.h:45

llvm::M68k::MemAddrModeKind::V
@ V
Definition M68kBaseInfo.h:62

llvm::M68k::MemAddrModeKind::L
@ L
Definition M68kBaseInfo.h:69

llvm::NVPTXAS::AddressSpace
AddressSpace
Definition NVPTXAddrSpace.h:21

llvm::NVPTXAS::ADDRESS_SPACE_LOCAL
@ ADDRESS_SPACE_LOCAL
Definition NVPTXAddrSpace.h:26

llvm::NVPTXAS::ADDRESS_SPACE_GENERIC
@ ADDRESS_SPACE_GENERIC
Definition NVPTXAddrSpace.h:22

llvm::NVPTXAS::ADDRESS_SPACE_SHARED
@ ADDRESS_SPACE_SHARED
Definition NVPTXAddrSpace.h:24

llvm::NVPTXAS::ADDRESS_SPACE_SHARED_CLUSTER
@ ADDRESS_SPACE_SHARED_CLUSTER
Definition NVPTXAddrSpace.h:28

llvm::NVPTXAS::ADDRESS_SPACE_GLOBAL
@ ADDRESS_SPACE_GLOBAL
Definition NVPTXAddrSpace.h:23

llvm::NVPTXISD::LoadV8
@ LoadV8
Definition NVPTXSelectionDAGInfo.h:38

llvm::NVPTXISD::StoreV2
@ StoreV2
Definition NVPTXSelectionDAGInfo.h:42

llvm::NVPTXISD::LDUV2
@ LDUV2
Definition NVPTXSelectionDAGInfo.h:40

llvm::NVPTXISD::SETP_F16X2
@ SETP_F16X2
Definition NVPTXSelectionDAGInfo.h:21

llvm::NVPTXISD::StoreV8
@ StoreV8
Definition NVPTXSelectionDAGInfo.h:44

llvm::NVPTXISD::StoreV4
@ StoreV4
Definition NVPTXSelectionDAGInfo.h:43

llvm::NVPTXISD::UNPACK_VECTOR
@ UNPACK_VECTOR
Definition NVPTXSelectionDAGInfo.h:23

llvm::NVPTXISD::LoadV2
@ LoadV2
Definition NVPTXSelectionDAGInfo.h:36

llvm::NVPTXISD::MLoad
@ MLoad
Definition NVPTXSelectionDAGInfo.h:39

llvm::NVPTXISD::SETP_BF16X2
@ SETP_BF16X2
Definition NVPTXSelectionDAGInfo.h:22

llvm::NVPTXISD::LDUV4
@ LDUV4
Definition NVPTXSelectionDAGInfo.h:41

llvm::NVPTXISD::ATOMIC_SWAP_B128
@ ATOMIC_SWAP_B128
Definition NVPTXSelectionDAGInfo.h:34

llvm::NVPTXISD::ATOMIC_CMP_SWAP_B128
@ ATOMIC_CMP_SWAP_B128
These nodes are used to lower atomic instructions with i128 type.
Definition NVPTXSelectionDAGInfo.h:33

llvm::NVPTXISD::LoadV4
@ LoadV4
Definition NVPTXSelectionDAGInfo.h:37

llvm::NVPTX::PTXCvtMode::CvtMode
CvtMode
Definition NVPTX.h:224

llvm::NVPTX::PTXPrmtMode::NONE
@ NONE
Definition NVPTX.h:268

llvm::NVPTX::PTXPrmtMode::B4E
@ B4E
Definition NVPTX.h:270

llvm::NVPTX::PTXPrmtMode::F4E
@ F4E
Definition NVPTX.h:269

llvm::NVPTX::PTXPrmtMode::RC8
@ RC8
Definition NVPTX.h:271

llvm::NVPTX::PTXPrmtMode::ECL
@ ECL
Definition NVPTX.h:272

llvm::NVPTX::PTXPrmtMode::ECR
@ ECR
Definition NVPTX.h:273

llvm::NVPTX::PTXPrmtMode::RC16
@ RC16
Definition NVPTX.h:274

llvm::NVPTX::DeviceParam
@ DeviceParam
Definition NVPTX.h:215

llvm::NVPTX::Const
@ Const
Definition NVPTX.h:206

llvm::NVPTX::EntryParam
@ EntryParam
Definition NVPTX.h:209

llvm::NVPTX::isPackedVectorTy
bool isPackedVectorTy(EVT VT)
Definition NVPTXUtilities.h:80

llvm::NVPTX::DivPrecisionLevel
DivPrecisionLevel
Definition NVPTX.h:278

llvm::NVPTX::DivPrecisionLevel::IEEE754_NoFTZ
@ IEEE754_NoFTZ
Definition NVPTX.h:282

llvm::NVPTX::DivPrecisionLevel::Approx
@ Approx
Definition NVPTX.h:279

llvm::NVPTX::DivPrecisionLevel::IEEE754
@ IEEE754
Definition NVPTX.h:281

llvm::NVPTX::DivPrecisionLevel::Full
@ Full
Definition NVPTX.h:280

llvm::PatternMatch::m_TruncOrSelf
match_combine_or< CastInst_match< OpTy, TruncInst >, OpTy > m_TruncOrSelf(const OpTy &Op)
Definition PatternMatch.h:2215

llvm::PatternMatch::m_SpecificInt
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
Definition PatternMatch.h:1032

llvm::PatternMatch::m_Deferred
match_deferred< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
Definition PatternMatch.h:951

llvm::PatternMatch::m_Select
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
Definition PatternMatch.h:1900

llvm::PatternMatch::m_Value
auto m_Value()
Match an arbitrary value and ignore it.
Definition PatternMatch.h:135

llvm::PatternMatch::m_Shl
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
Definition PatternMatch.h:1286

llvm::PatternMatch::m_Zero
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
Definition PatternMatch.h:591

llvm::RISCVFenceField::R
@ R
Definition RISCVBaseInfo.h:490

llvm::SDPatternMatch
Definition SDPatternMatch.h:29

llvm::SPII::Store
@ Store
Definition SparcInstrInfo.h:33

llvm::Sched::RegPressure
@ RegPressure
Definition TargetLowering.h:107

llvm::Sched::Source
@ Source
Definition TargetLowering.h:106

llvm::X86::FirstMacroFusionInstKind::Cmp
@ Cmp
Definition X86BaseInfo.h:109

llvm::cl::Hidden
@ Hidden
Definition CommandLine.h:138

llvm::cl::values
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
Definition CommandLine.h:712

llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition CommandLine.h:444

llvm::dwarf::Index
Index
Definition Dwarf.h:909

llvm::logicalview::LVPrintKind::Elements
@ Elements
Definition LVOptions.h:153

llvm::ms_demangle::QualifierMangleMode::Result
@ Result
Definition MicrosoftDemangle.h:132

llvm::objcarc::ARCInstKind::User
@ User
could "use" a pointer
Definition ObjCARCInstKind.h:52

llvm::rdf::Node
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381

llvm::sframe::Flags
Flags
Definition SFrame.h:39

llvm::tgtok::TrueVal
@ TrueVal
Definition TGLexer.h:57

llvm::tgtok::FalseVal
@ FalseVal
Definition TGLexer.h:58

llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition FunctionInfo.h:25

llvm::ThreadPriority::Low
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:280

llvm::Offset
@ Offset
Definition DWP.cpp:558

llvm::zip
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
Definition STLExtras.h:830

llvm::Value
FunctionAddr VTableAddr Value
Definition InstrProf.h:137

llvm::shouldEmitPTXNoReturn
bool shouldEmitPTXNoReturn(const Value *V, const TargetMachine &TM)
Definition NVPTXUtilities.cpp:81

llvm::all_of
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738

llvm::getAlign
MaybeAlign getAlign(const CallInst &I, unsigned Index)
Definition NVVMProperties.cpp:320

llvm::size
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1668

llvm::peekThroughFreeze
SDValue peekThroughFreeze(SDValue V)
Return the non-frozen source operand of V if it exists.
Definition SelectionDAGNodes.h:1975

llvm::ComputeValueVTs
LLVM_ABI void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs=nullptr, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition Analysis.cpp:119

llvm::Depth
@ Depth
Definition SIMachineScheduler.h:36

llvm::enumerate
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553

llvm::dyn_cast
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643

llvm::AlignStyle::Right
@ Right
Definition FormatCommon.h:17

llvm::AlignStyle::Left
@ Left
Definition FormatCommon.h:17

llvm::AllocFnKind::Alloc
@ Alloc
Definition Attributes.h:55

llvm::PowerOf2Ceil
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385

llvm::isReleaseOrStronger
bool isReleaseOrStronger(AtomicOrdering AO)
Definition AtomicOrdering.h:133

llvm::transform
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition STLExtras.h:2025

llvm::reverse
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407

llvm::getAtomicSyncScopeID
std::optional< SyncScope::ID > getAtomicSyncScopeID(const Instruction *I)
A helper function that returns an atomic operation's sync scope; returns std::nullopt if it is not an...
Definition Instructions.h:5358

llvm::promoteScalarArgumentSize
unsigned promoteScalarArgumentSize(unsigned size)
Definition NVPTXUtilities.h:52

llvm::none_of
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1752

llvm::report_fatal_error
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163

llvm::shouldPassAsArray
bool shouldPassAsArray(Type *Ty)
Definition NVPTXUtilities.h:64

llvm::alignTo
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144

llvm::CodeGenOptLevel
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82

llvm::CodeGenOptLevel::None
@ None
-O0
Definition CodeGen.h:83

llvm::CodeGenOptLevel::Default
@ Default
-O2, -Os, -Oz
Definition CodeGen.h:85

llvm::SmallVector
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
Definition SmallVector.h:1151

llvm::isa
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547

llvm::PackElem::Hi
@ Hi
Definition VECustomDAG.h:132

llvm::PackElem::Lo
@ Lo
Definition VECustomDAG.h:131

llvm::AtomicOrdering
AtomicOrdering
Atomic ordering for LLVM's memory model.
Definition AtomicOrdering.h:56

llvm::AtomicOrdering::Monotonic
@ Monotonic
Definition AtomicOrdering.h:59

llvm::AtomicOrdering::Acquire
@ Acquire
Definition AtomicOrdering.h:61

llvm::AtomicOrdering::Release
@ Release
Definition AtomicOrdering.h:62

llvm::AtomicOrdering::SequentiallyConsistent
@ SequentiallyConsistent
Definition AtomicOrdering.h:64

llvm::getFunctionByValParamAlign
Align getFunctionByValParamAlign(const Function *F, Type *ArgTy, Align InitialAlign, const DataLayout &DL)
Definition NVPTXUtilities.cpp:60

llvm::RecurKind::Sub
@ Sub
Subtraction of integers.
Definition IVDescriptors.h:39

llvm::RecurKind::Add
@ Add
Sum of integers.
Definition IVDescriptors.h:38

llvm::RecurKind::FAdd
@ FAdd
Sum of floats.
Definition IVDescriptors.h:49

llvm::Op
DWARFExpression::Operation Op
Definition DWARFExpressionPrinter.cpp:25

llvm::ArrayRef
ArrayRef(const T &OneElt) -> ArrayRef< T >

llvm::isParamGridConstant
bool isParamGridConstant(const Argument &Arg)
Definition NVVMProperties.cpp:297

llvm::isAcquireOrStronger
bool isAcquireOrStronger(AtomicOrdering AO)
Definition AtomicOrdering.h:129

llvm::BitWidth
constexpr unsigned BitWidth
Definition BitmaskEnum.h:219

llvm::isKernelFunction
bool isKernelFunction(const Function &F)
Definition NVVMProperties.h:34

llvm::cast
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559

llvm::getMaybeBitcastedCallee
Function * getMaybeBitcastedCallee(const CallBase *CB)
Definition NVPTXUtilities.cpp:31

llvm::commonAlignment
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201

llvm::getFunctionArgumentAlignment
Align getFunctionArgumentAlignment(const Function *F, Type *Ty, unsigned Idx, const DataLayout &DL)
Definition NVPTXUtilities.cpp:55

llvm::seq
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305

llvm::VFParamKind::Vector
@ Vector
Definition VFABIDemangler.h:27

llvm::getFunctionParamOptimizedAlign
Align getFunctionParamOptimizedAlign(const Function *F, Type *ArgTy, const DataLayout &DL)
Since function arguments are passed via .param space, we may want to increase their alignment in a wa...
Definition NVPTXUtilities.cpp:35

std::swap
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:863

raw_ostream.h

N
#define N

LoadValue
Definition LoopUnroll.cpp:216

llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39

llvm::Align::value
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77

llvm::DenormalMode::PreserveSign
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
Definition FloatingPointMode.h:81

llvm::DenormalMode::Output
DenormalModeKind Output
Denormal flushing mode for floating point instruction results in the default floating point environme...
Definition FloatingPointMode.h:92

llvm::EVT
Extended Value Type.
Definition ValueTypes.h:35

llvm::EVT::getStoreSize
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:418

llvm::EVT::isSimple
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:145

llvm::EVT::getVectorVT
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:70

llvm::EVT::changeTypeToInteger
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:129

llvm::EVT::bitsGT
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:307

llvm::EVT::bitsLT
bool bitsLT(EVT VT) const
Return true if this has fewer bits than VT.
Definition ValueTypes.h:323

llvm::EVT::isFloatingPoint
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:155

llvm::EVT::getVectorElementCount
ElementCount getVectorElementCount() const
Definition ValueTypes.h:373

llvm::EVT::is32BitVector
bool is32BitVector() const
Return true if this is a 32-bit vector type.
Definition ValueTypes.h:220

llvm::EVT::getSizeInBits
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:396

llvm::EVT::getScalarSizeInBits
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:408

llvm::EVT::getSimpleVT
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:339

llvm::EVT::getFixedSizeInBits
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:404

llvm::EVT::isVector
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:176

llvm::EVT::getScalarType
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:346

llvm::EVT::bitsEq
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:279

llvm::EVT::getTypeForEVT
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition ValueTypes.cpp:218

llvm::EVT::getVectorElementType
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:351

llvm::EVT::changeElementType
EVT changeElementType(LLVMContext &Context, EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition ValueTypes.h:121

llvm::EVT::isScalarInteger
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:165

llvm::EVT::getVectorNumElements
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:359

llvm::EVT::isInteger
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:160

llvm::ISD::ArgFlagsTy
Definition TargetCallingConv.h:27

llvm::KnownBits
Definition KnownBits.h:24

llvm::KnownBits::makeConstant
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition KnownBits.h:315

llvm::KnownBits::ashr
static LLVM_ABI KnownBits ashr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for ashr(LHS, RHS).
Definition KnownBits.cpp:547

llvm::KnownBits::concat
KnownBits concat(const KnownBits &Lo) const
Concatenate the bits from Lo onto the bottom of *this.
Definition KnownBits.h:247

llvm::KnownBits::getBitWidth
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44

llvm::KnownBits::resetAll
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:72

llvm::KnownBits::countMaxActiveBits
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition KnownBits.h:310

llvm::KnownBits::insertBits
void insertBits(const KnownBits &SubBits, unsigned BitPosition)
Insert the bits from a smaller KnownBits value, starting at BitPosition.
Definition KnownBits.h:233

llvm::KnownBits::Zero
APInt Zero
Definition KnownBits.h:25

llvm::MIPatternMatch::Or
Definition MIPatternMatch.h:333

llvm::MachinePointerInfo
This class contains a discriminated union of information about pointers in memory operands,...
Definition MachineMemOperand.h:42

llvm::MaybeAlign
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106

llvm::SDNodeFlags
These are IR-level optimization flags that may be propagated to SDNodes.
Definition SelectionDAGNodes.h:378

llvm::SDNodeFlags::AllowContract
@ AllowContract
Definition SelectionDAGNodes.h:401

llvm::SDNodeFlags::NonNeg
@ NonNeg
Definition SelectionDAGNodes.h:396

llvm::SDNodeFlags::hasAllowContract
bool hasAllowContract() const
Definition SelectionDAGNodes.h:481

llvm::SDVTList
This represents a list of ValueTypes that has been interned by a SelectionDAG.
Definition SelectionDAGNodes.h:80

llvm::TargetLoweringBase::AddrMode
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
Definition TargetLowering.h:2978

llvm::TargetLoweringBase::AddrMode::BaseOffs
int64_t BaseOffs
Definition TargetLowering.h:2980

llvm::TargetLoweringBase::AddrMode::BaseGV
GlobalValue * BaseGV
Definition TargetLowering.h:2979

llvm::TargetLoweringBase::AddrMode::HasBaseReg
bool HasBaseReg
Definition TargetLowering.h:2981

llvm::TargetLoweringBase::AddrMode::Scale
int64_t Scale
Definition TargetLowering.h:2982

llvm::TargetLoweringBase::IntrinsicInfo
Definition TargetLowering.h:1222

llvm::TargetLowering::CallLoweringInfo
This structure contains all information that is necessary for lowering calls.
Definition TargetLowering.h:4831

llvm::TargetLowering::CallLoweringInfo::Args
ArgListTy Args
Definition TargetLowering.h:4858

llvm::TargetLowering::CallLoweringInfo::IsTailCall
bool IsTailCall
Definition TargetLowering.h:4850

llvm::TargetLowering::CallLoweringInfo::Callee
SDValue Callee
Definition TargetLowering.h:4857

llvm::TargetLowering::CallLoweringInfo::DL
SDLoc DL
Definition TargetLowering.h:4860

llvm::TargetLowering::CallLoweringInfo::IsVarArg
bool IsVarArg
Definition TargetLowering.h:4839

llvm::TargetLowering::CallLoweringInfo::Ins
SmallVector< ISD::InputArg, 32 > Ins
Definition TargetLowering.h:4864

llvm::TargetLowering::CallLoweringInfo::NumFixedArgs
unsigned NumFixedArgs
Definition TargetLowering.h:4855

llvm::TargetLowering::CallLoweringInfo::Chain
SDValue Chain
Definition TargetLowering.h:4832

llvm::TargetLowering::CallLoweringInfo::getArgs
ArgListTy & getArgs()
Definition TargetLowering.h:5027

llvm::TargetLowering::CallLoweringInfo::CB
const CallBase * CB
Definition TargetLowering.h:4861

llvm::TargetLowering::CallLoweringInfo::Outs
SmallVector< ISD::OutputArg, 32 > Outs
Definition TargetLowering.h:4862

llvm::TargetLowering::CallLoweringInfo::OutVals
SmallVector< SDValue, 32 > OutVals
Definition TargetLowering.h:4863

llvm::TargetLowering::CallLoweringInfo::RetTy
Type * RetTy
Same as OrigRetTy, or partially legalized for soft float libcalls.
Definition TargetLowering.h:4836

llvm::TargetLowering::CallLoweringInfo::IsConvergent
bool IsConvergent
Definition TargetLowering.h:4843

llvm::TargetLowering::CallLoweringInfo::CallConv
CallingConv::ID CallConv
Definition TargetLowering.h:4856

llvm::TargetLowering::CallLoweringInfo::DAG
SelectionDAG & DAG
Definition TargetLowering.h:4859

llvm::TargetLowering::DAGCombinerInfo
Definition TargetLowering.h:4535

llvm::TargetLowering::DAGCombinerInfo::isAfterLegalizeDAG
bool isAfterLegalizeDAG() const
Definition TargetLowering.h:4548

llvm::TargetLowering::DAGCombinerInfo::isBeforeLegalize
bool isBeforeLegalize() const
Definition TargetLowering.h:4546

llvm::TargetLowering::DAGCombinerInfo::DAG
SelectionDAG & DAG
Definition TargetLowering.h:4541

llvm::TargetLowering::TargetLoweringOpt
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...
Definition TargetLowering.h:4229

llvm::TargetLowering::TargetLoweringOpt::DAG
SelectionDAG & DAG
Definition TargetLowering.h:4230

llvm::TargetLowering::TargetLoweringOpt::CombineTo
bool CombineTo(SDValue O, SDValue N)
Definition TargetLowering.h:4243

llvm::cl::desc
Definition CommandLine.h:410