// doxygen/AMDGPUInstructionSelector_8cpp_source.html

//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//

//

// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

// See https://llvm.org/LICENSE.txt for license information.

// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

//

//===----------------------------------------------------------------------===//

/// \file

/// This file implements the targeting of the InstructionSelector class for

/// AMDGPU.

/// \todo This should be generated by TableGen.

//===----------------------------------------------------------------------===//


#include "AMDGPUInstructionSelector.h"

#include "AMDGPU.h"

#include "AMDGPUGlobalISelUtils.h"

#include "AMDGPUInstrInfo.h"

#include "AMDGPURegisterBankInfo.h"

#include "AMDGPUTargetMachine.h"

#include "SIMachineFunctionInfo.h"

#include "Utils/AMDGPUBaseInfo.h"

#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"

#include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"

#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"

#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"

#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"

#include "llvm/CodeGen/MachineFrameInfo.h"

#include "llvm/IR/DiagnosticInfo.h"

#include "llvm/IR/IntrinsicsAMDGPU.h"

#include <optional>


#define DEBUG_TYPE "amdgpu-isel"


using namespace llvm;

using namespace MIPatternMatch;


#define GET_GLOBALISEL_IMPL

#define AMDGPUSubtarget GCNSubtarget

#include "AMDGPUGenGlobalISel.inc"

#undef GET_GLOBALISEL_IMPL

#undef AMDGPUSubtarget


/// Build a selector bound to one subtarget.
///
/// \param STI Subtarget; its instruction info and register info are cached in
///            TII and TRI.
/// \param RBI Register bank info, stored by reference.
/// \param TM  Target machine, stored by reference.
AMDGPUInstructionSelector::AMDGPUInstructionSelector(

    const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,

    const AMDGPUTargetMachine &TM)

    : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),

      STI(STI),

// The remaining member initializers are textually included from
// AMDGPUGenGlobalISel.inc: first the predicate state, then the temporaries.
#define GET_GLOBALISEL_PREDICATES_INIT

#include "AMDGPUGenGlobalISel.inc"

#undef GET_GLOBALISEL_PREDICATES_INIT

#define GET_GLOBALISEL_TEMPORARIES_INIT

#include "AMDGPUGenGlobalISel.inc"

#undef GET_GLOBALISEL_TEMPORARIES_INIT

{

}


/// Name of this selector; it is the same string as DEBUG_TYPE.
const char *AMDGPUInstructionSelector::getName() {
  return DEBUG_TYPE;
}


/// Per-function setup.
///
/// Caches the function's register info and GCN subtarget, asks the subtarget
/// to check its features against the IR function, and then forwards all
/// arguments to the generic InstructionSelector::setupMF.
void AMDGPUInstructionSelector::setupMF(MachineFunction &MF,
                                        GISelValueTracking *VT,
                                        CodeGenCoverage *CoverageInfo,
                                        ProfileSummaryInfo *PSI,
                                        BlockFrequencyInfo *BFI) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  MRI = &MF.getRegInfo();
  Subtarget = &ST;
  ST.checkSubtargetFeatures(MF.getFunction());

  InstructionSelector::setupMF(MF, VT, CoverageInfo, PSI, BFI);
}


// Return the wave level SGPR base address if \p Def is a
// G_AMDGPU_WAVE_ADDRESS (its operand 1); otherwise return a default
// constructed Register.
static Register getWaveAddress(const MachineInstr *Def) {
  if (Def->getOpcode() != AMDGPU::G_AMDGPU_WAVE_ADDRESS)
    return Register();
  return Def->getOperand(1).getReg();
}


static void diagnoseUnsupportedIntrinsic(const MachineInstr &I) {

  const Function &F = I.getMF()->getFunction();

  F.getContext().diagnose(DiagnosticInfoUnsupported(

      F, "intrinsic not supported on subtarget", I.getDebugLoc(), DS_Error));

}


/// Decide whether \p Reg should be treated as a VCC-bank boolean.
///
/// - Physical registers: never.
/// - Registers that only have a register bank: true iff that bank is the VCC
///   bank.
/// - Registers that already have a register class: true iff the type is a
///   valid 1-bit type, the defining instruction is not G_TRUNC, and the class
///   is the boolean register class or one of its subclasses.
bool AMDGPUInstructionSelector::isVCC(Register Reg,
                                      const MachineRegisterInfo &MRI) const {
  // The verifier is oblivious to s1 being a valid value for wavesize registers.
  if (Reg.isPhysical())
    return false;

  const auto &ClassOrBank = MRI.getRegClassOrRegBank(Reg);
  const auto *RC = dyn_cast<const TargetRegisterClass *>(ClassOrBank);

  // Bank-only register: the bank alone decides.
  if (!RC) {
    const RegisterBank *Bank = cast<const RegisterBank *>(ClassOrBank);
    return Bank->getID() == AMDGPU::VCCRegBankID;
  }

  const LLT Ty = MRI.getType(Reg);
  if (!Ty.isValid())
    return false;
  if (Ty.getSizeInBits() != 1)
    return false;

  // G_TRUNC s1 result is never vcc.
  if (MRI.getVRegDef(Reg)->getOpcode() == AMDGPU::G_TRUNC)
    return false;

  return RC->hasSuperClassEq(TRI.getBoolRC());
}


/// Rewrite a copy-like intrinsic \p MI in place into \p NewOpc.
///
/// The intrinsic ID operand is dropped and an implicit EXEC use is appended.
/// Destination and source must resolve to the same register class and both
/// must be constrainable to it; s1 destinations are rejected.
///
/// \returns true on success. Note that \p MI has already been modified when
/// false is returned.
bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
                                                        unsigned NewOpc) const {
  MI.setDesc(TII.get(NewOpc));
  MI.removeOperand(1); // Remove intrinsic ID.
  MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, /*isDef=*/false,
                                               /*isImp=*/true));

  // Operands are fetched here, after the operand list was edited above.
  MachineOperand &DstOp = MI.getOperand(0);
  MachineOperand &SrcOp = MI.getOperand(1);
  const Register DstReg = DstOp.getReg();

  // TODO: This should be legalized to s32 if needed
  if (MRI->getType(DstReg) == LLT::scalar(1))
    return false;

  const TargetRegisterClass *DstRC =
      TRI.getConstrainedRegClassForOperand(DstOp, *MRI);
  const TargetRegisterClass *SrcRC =
      TRI.getConstrainedRegClassForOperand(SrcOp, *MRI);
  if (!DstRC)
    return false;
  if (DstRC != SrcRC)
    return false;

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;
  if (!RBI.constrainGenericRegister(SrcOp.getReg(), *SrcRC, *MRI))
    return false;

  // Honor an early-clobber constraint on the result of the new opcode.
  if (MI.getDesc().getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1)
    MI.getOperand(0).setIsEarlyClobber(true);

  return true;
}


/// Select a copy-like instruction \p I.
///
/// The instruction descriptor is switched to the target COPY first. What
/// happens next depends on the destination:
///
/// - Destination is VCC (see isVCC):
///   - Source is the physical SCC register: only the destination is
///     constrained and the COPY is kept.
///   - Source is not VCC: the COPY is replaced. A constant source becomes an
///     S_MOV of -1 or 0 into the boolean register class; any other source is
///     masked to bit 0 and compared against zero with a V_CMP_NE.
///   - Source is VCC as well: the destination is constrained and the COPY is
///     kept.
/// - Otherwise every virtual register operand for which a register class can
///   be determined is constrained to that class.
///
/// \returns false if a required register class could not be determined or
/// applied; true otherwise.
bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();
  I.setDesc(TII.get(TargetOpcode::COPY));

  const MachineOperand &Src = I.getOperand(1);
  MachineOperand &Dst = I.getOperand(0);
  Register DstReg = Dst.getReg();
  Register SrcReg = Src.getReg();

  if (isVCC(DstReg, *MRI)) {
    if (SrcReg == AMDGPU::SCC) {
      const TargetRegisterClass *RC =
          TRI.getConstrainedRegClassForOperand(Dst, *MRI);
      if (!RC)
        return true;
      return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
    }

    if (!isVCC(SrcReg, *MRI)) {
      // TODO: Should probably leave the copy and let copyPhysReg expand it.
      if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
        return false;

      // May be null when no class can be derived for the source operand.
      const TargetRegisterClass *SrcRC =
          TRI.getConstrainedRegClassForOperand(Src, *MRI);

      std::optional<ValueAndVReg> ConstVal =
          getIConstantVRegValWithLookThrough(SrcReg, *MRI, true);
      if (ConstVal) {
        // Constant source: materialize the boolean directly.
        unsigned MovOpc =
            STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
        BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
            .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
      } else {
        // The masking sequence below needs a concrete source class, both to
        // create the temporary and to pick the 16-bit or 32-bit opcodes.
        if (!SrcRC)
          return false;

        Register MaskedReg = MRI->createVirtualRegister(SrcRC);

        // We can't trust the high bits at this point, so clear them.

        // TODO: Skip masking high bits if def is known boolean.

        if (AMDGPU::getRegBitWidth(SrcRC->getID()) == 16) {
          assert(Subtarget->useRealTrue16Insts());
          const int64_t NoMods = 0;
          BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_AND_B16_t16_e64), MaskedReg)
              .addImm(NoMods)
              .addImm(1)
              .addImm(NoMods)
              .addReg(SrcReg)
              .addImm(NoMods);
          BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U16_t16_e64), DstReg)
              .addImm(NoMods)
              .addImm(0)
              .addImm(NoMods)
              .addReg(MaskedReg)
              .addImm(NoMods);
        } else {
          bool IsSGPR = TRI.isSGPRClass(SrcRC);
          unsigned AndOpc = IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
          auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
                         .addImm(1)
                         .addReg(SrcReg);
          if (IsSGPR)
            And.setOperandDead(3); // Dead scc

          BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
              .addImm(0)
              .addReg(MaskedReg);
        }
      }

      // Give the source a class if it has none yet; never install a null one.
      if (SrcRC && !MRI->getRegClassOrNull(SrcReg))
        MRI->setRegClass(SrcReg, SrcRC);
      I.eraseFromParent();
      return true;
    }

    const TargetRegisterClass *RC =
        TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
      return false;

    return true;
  }

  // Non-VCC destination: constrain whatever can be constrained. Operands
  // without a derivable class, and physical registers, are left alone.
  for (const MachineOperand &MO : I.operands()) {
    if (MO.getReg().isPhysical())
      continue;

    const TargetRegisterClass *RC =
        TRI.getConstrainedRegClassForOperand(MO, *MRI);
    if (!RC)
      continue;
    RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
  }
  return true;
}


/// Select a copy whose source (operand 1) is consumed through SCC.
///
/// An instruction that sets SCC from the source is emitted first: S_CMP_LG
/// against 0 when the subtarget has the 64-bit scalar compare, otherwise an
/// S_OR_B64 of the source with itself into an unused register. SCC is then
/// copied into the destination, which is constrained to SReg_32.
bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const {
  MachineBasicBlock &MBB = *I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  const Register MaskReg = I.getOperand(1).getReg();

  // Set SCC as a side effect with S_CMP or S_OR.
  MachineInstr *SetSCC = nullptr;
  if (!STI.hasScalarCompareEq64()) {
    const Register Unused =
        MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
    SetSCC = BuildMI(MBB, &I, DL, TII.get(AMDGPU::S_OR_B64), Unused)
                 .addReg(MaskReg)
                 .addReg(MaskReg);
  } else {
    const unsigned CmpOpc =
        STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
    SetSCC = BuildMI(MBB, &I, DL, TII.get(CmpOpc)).addReg(MaskReg).addImm(0);
  }
  constrainSelectedInstRegOperands(*SetSCC, TII, TRI, RBI);

  const Register DstReg = I.getOperand(0).getReg();
  BuildMI(MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(AMDGPU::SCC);
  I.eraseFromParent();

  return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
}


/// Select a copy whose source (operand 1) is routed through SCC into a
/// boolean-register-class destination (operand 0).
///
/// - Constant source 0: S_MOV of 0 (64- or 32-bit depending on wave size).
/// - Constant source 1: COPY of EXEC.
/// - Anything else: the source is copied into SCC and an S_CSELECT picks
///   between EXEC and 0.
bool AMDGPUInstructionSelector::selectCOPY_VCC_SCC(MachineInstr &I) const {
  MachineBasicBlock &MBB = *I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  const Register DstReg = I.getOperand(0).getReg();
  const Register SrcReg = I.getOperand(1).getReg();

  const std::optional<ValueAndVReg> Known =
      getIConstantVRegValWithLookThrough(SrcReg, *MRI);

  if (!Known) {
    // RegBankLegalize ensures that SrcReg is bool in reg (high bits are 0).
    BuildMI(MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC).addReg(SrcReg);

    const unsigned SelOpc =
        STI.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
    MachineInstr *Sel = BuildMI(MBB, &I, DL, TII.get(SelOpc), DstReg)
                            .addReg(TRI.getExec())
                            .addImm(0);

    I.eraseFromParent();
    constrainSelectedInstRegOperands(*Sel, TII, TRI, RBI);
    return true;
  }

  const int64_t Value = Known->Value.getZExtValue();
  if (Value != 0) {
    assert(Value == 1);
    BuildMI(MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(TRI.getExec());
  } else {
    const unsigned MovOpc =
        STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
    BuildMI(MBB, &I, DL, TII.get(MovOpc), DstReg).addImm(0);
  }

  I.eraseFromParent();
  return RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI);
}


/// Replace \p I with a V_READFIRSTLANE_B32 from operand 1 into operand 0 and
/// constrain the new instruction's register operands.
bool AMDGPUInstructionSelector::selectReadAnyLane(MachineInstr &I) const {
  MachineBasicBlock &MBB = *I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  const Register Dst = I.getOperand(0).getReg();
  const Register Src = I.getOperand(1).getReg();

  MachineInstr *ReadLane =
      BuildMI(MBB, &I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), Dst)
          .addReg(Src);

  I.eraseFromParent();
  constrainSelectedInstRegOperands(*ReadLane, TII, TRI, RBI);
  return true;
}


/// Select a generic PHI by turning it into a target PHI.
///
/// The result register gets its existing register class, or the class derived
/// from its type and register bank. Each input register that still carries a
/// register bank is constrained to the class derived from its own type and
/// bank.
///
/// \returns false for s1 results, for results or inputs whose register class
/// cannot be determined, or when a constraint cannot be applied.
bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
  const Register DefReg = I.getOperand(0).getReg();
  const LLT DefTy = MRI->getType(DefReg);

  // S1 G_PHIs should not be selected in instruction-select, instead:
  // - divergent S1 G_PHI should go through lane mask merging algorithm
  //   and be fully inst-selected in AMDGPUGlobalISelDivergenceLowering
  // - uniform S1 G_PHI should be lowered into S32 G_PHI in AMDGPURegBankSelect
  if (DefTy == LLT::scalar(1))
    return false;

  // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)

  const RegClassOrRegBank &RegClassOrBank =
      MRI->getRegClassOrRegBank(DefReg);

  const TargetRegisterClass *DefRC =
      dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
  if (!DefRC) {
    if (!DefTy.isValid()) {
      LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
      return false;
    }

    const RegisterBank &RB = *cast<const RegisterBank *>(RegClassOrBank);
    DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
    if (!DefRC) {
      LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
      return false;
    }
  }

  // If inputs have register bank, assign corresponding reg class.
  // Note: registers don't need to have the same reg bank.
  // Only the register operands at odd indices are visited; the operands in
  // between are skipped.
  for (unsigned i = 1; i != I.getNumOperands(); i += 2) {
    const Register SrcReg = I.getOperand(i).getReg();

    const RegisterBank *RB = MRI->getRegBankOrNull(SrcReg);
    if (RB) {
      const LLT SrcTy = MRI->getType(SrcReg);
      const TargetRegisterClass *SrcRC =
          TRI.getRegClassForTypeOnBank(SrcTy, *RB);
      // The lookup can fail just like it can for the def above; bail out
      // instead of dereferencing a null class.
      if (!SrcRC) {
        LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
        return false;
      }
      if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
        return false;
    }
  }

  I.setDesc(TII.get(TargetOpcode::PHI));
  return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
}


/// Produce an operand for one 32-bit half of the 64-bit operand \p MO.
///
/// - Register operand: a COPY of sub-register \p SubIdx (composed with any
///   sub-register index already on \p MO) into a fresh virtual register of
///   class \p SubRC is inserted before \p MO's instruction. The returned
///   operand names the new register and carries over \p MO's flags.
/// - Immediate operand: the low (sub0) or high (sub1) 32 bits are returned as
///   an immediate; no instruction or register is created.
///
/// \param SubIdx AMDGPU::sub0 or AMDGPU::sub1 when \p MO is an immediate.
MachineOperand
AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
                                           const TargetRegisterClass &SubRC,
                                           unsigned SubIdx) const {
  if (MO.isReg()) {
    MachineInstr *MI = MO.getParent();
    MachineBasicBlock *BB = MI->getParent();
    // Created only on this path so that immediates do not leave an unused
    // virtual register behind.
    Register DstReg = MRI->createVirtualRegister(&SubRC);

    unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
    Register Reg = MO.getReg();
    BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
        .addReg(Reg, {}, ComposedSubIdx);

    return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
                                     MO.isKill(), MO.isDead(), MO.isUndef(),
                                     MO.isEarlyClobber(), 0, MO.isDebug(),
                                     MO.isInternalRead());
  }

  assert(MO.isImm());

  APInt Imm(64, MO.getImm());

  switch (SubIdx) {
  default:
    llvm_unreachable("do not know to split immediate with this sub index.");
  case AMDGPU::sub0:
    return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
  case AMDGPU::sub1:
    return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
  }
}


/// Map a generic bitwise opcode (G_AND, G_OR, G_XOR) to the scalar
/// S_AND/S_OR/S_XOR opcode, choosing the _B64 form when \p Is64 is set and
/// the _B32 form otherwise. Any other opcode is unreachable.
static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
  switch (Opc) {
  case AMDGPU::G_AND:
    if (Is64)
      return AMDGPU::S_AND_B64;
    return AMDGPU::S_AND_B32;
  case AMDGPU::G_OR:
    if (Is64)
      return AMDGPU::S_OR_B64;
    return AMDGPU::S_OR_B32;
  case AMDGPU::G_XOR:
    if (Is64)
      return AMDGPU::S_XOR_B64;
    return AMDGPU::S_XOR_B32;
  default:
    llvm_unreachable("not a bit op");
  }
}


/// Select G_AND / G_OR / G_XOR whose result lives in the SGPR or VCC bank by
/// rewriting \p I in place into the matching scalar opcode.
///
/// The 64-bit opcode is used when the result is wider than 32 bits, or when
/// it is in the VCC bank on a wave64 subtarget. Results in any other bank are
/// not handled here (returns false).
bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
  const Register DstReg = I.getOperand(0).getReg();
  const unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);

  const unsigned BankID = RBI.getRegBank(DstReg, *MRI, TRI)->getID();
  const bool IsVCCBank = BankID == AMDGPU::VCCRegBankID;
  if (BankID != AMDGPU::SGPRRegBankID && !IsVCCBank)
    return false;

  const bool Is64 = Size > 32 || (IsVCCBank && STI.isWave64());
  const unsigned NewOpc = getLogicalBitOpcode(I.getOpcode(), Is64);
  I.setDesc(TII.get(NewOpc));

  // Dead implicit-def of scc
  I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, /*isDef=*/true,
                                         /*isImp=*/true,
                                         /*isKill=*/false,
                                         /*isDead=*/true));
  constrainSelectedInstRegOperands(I, TII, TRI, RBI);
  return true;
}


/// Select scalar G_ADD / G_SUB.
///
/// 32-bit results:
/// - SGPR bank: S_ADD_U32 / S_SUB_U32 with the scc def marked dead.
/// - otherwise, with no-carry add instructions available: \p I is rewritten
///   in place to V_ADD_U32_e64 / V_SUB_U32_e64.
/// - otherwise: V_ADD_CO_U32_e64 / V_SUB_CO_U32_e64 with a dead carry def.
///
/// Wider results (add only; asserted): both sources are split into sub0/sub1
/// halves, the halves are added with a carry chain, and the result is
/// assembled with REG_SEQUENCE.
///
/// Vector types are rejected.
bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
  MachineBasicBlock *MBB = I.getParent();
  MachineFunction *ParentMF = MBB->getParent();
  const Register DstReg = I.getOperand(0).getReg();
  const DebugLoc &DL = I.getDebugLoc();

  const LLT DstTy = MRI->getType(DstReg);
  if (DstTy.isVector())
    return false;

  const unsigned BitWidth = DstTy.getSizeInBits();
  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const bool IsSALU = DstBank->getID() == AMDGPU::SGPRRegBankID;
  const bool IsSub = I.getOpcode() == TargetOpcode::G_SUB;

  if (BitWidth == 32) {
    if (IsSALU) {
      const unsigned ScalarOpc = IsSub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
      MachineInstr *NewMI = BuildMI(*MBB, &I, DL, TII.get(ScalarOpc), DstReg)
                                .add(I.getOperand(1))
                                .add(I.getOperand(2))
                                .setOperandDead(3); // Dead scc
      I.eraseFromParent();
      constrainSelectedInstRegOperands(*NewMI, TII, TRI, RBI);
      return true;
    }

    if (STI.hasAddNoCarryInsts()) {
      // Rewrite in place: append the trailing immediate 0 and the implicit
      // EXEC use.
      const unsigned NoCarryOpc =
          IsSub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
      I.setDesc(TII.get(NoCarryOpc));
      I.addOperand(*ParentMF, MachineOperand::CreateImm(0));
      I.addOperand(*ParentMF,
                   MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
      constrainSelectedInstRegOperands(I, TII, TRI, RBI);
      return true;
    }

    const unsigned CarryOutOpc =
        IsSub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;

    const Register IgnoredCarry =
        MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
    MachineInstr *NewMI = BuildMI(*MBB, &I, DL, TII.get(CarryOutOpc), DstReg)
                              .addDef(IgnoredCarry, RegState::Dead)
                              .add(I.getOperand(1))
                              .add(I.getOperand(2))
                              .addImm(0);
    I.eraseFromParent();
    constrainSelectedInstRegOperands(*NewMI, TII, TRI, RBI);
    return true;
  }

  assert(!IsSub && "illegal sub should not reach here");

  const TargetRegisterClass &WideRC =
      IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
  const TargetRegisterClass &HalfRC =
      IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;

  // Split both sources; register sources get a sub-register COPY inserted
  // before I for each half.
  MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
  MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
  MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
  MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));

  const Register DstLo = MRI->createVirtualRegister(&HalfRC);
  const Register DstHi = MRI->createVirtualRegister(&HalfRC);

  if (!IsSALU) {
    const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
    const Register CarryReg = MRI->createVirtualRegister(CarryRC);
    BuildMI(*MBB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
        .addDef(CarryReg)
        .add(Lo1)
        .add(Lo2)
        .addImm(0);
    MachineInstr *Addc =
        BuildMI(*MBB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
            .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
            .add(Hi1)
            .add(Hi2)
            .addReg(CarryReg, RegState::Kill)
            .addImm(0);

    constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI);
  } else {
    BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
        .add(Lo1)
        .add(Lo2);
    BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
        .add(Hi1)
        .add(Hi2)
        .setOperandDead(3); // Dead scc
  }

  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
      .addReg(DstLo)
      .addImm(AMDGPU::sub0)
      .addReg(DstHi)
      .addImm(AMDGPU::sub1);

  if (!RBI.constrainGenericRegister(DstReg, WideRC, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}


/// Select G_UADDO / G_USUBO / G_UADDE / G_USUBE.
///
/// If the second result (operand 1) is VCC, \p I is rewritten in place into
/// the matching V_ADD_CO / V_SUB_CO / V_ADDC / V_SUBB opcode. Otherwise the
/// scalar S_ADD / S_SUB / S_ADDC / S_SUBB form is built, with SCC carrying
/// the extra input (operand 4, for the *E opcodes) and the second result.
bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
    MachineInstr &I) const {
  MachineBasicBlock *MBB = I.getParent();
  MachineFunction *ParentMF = MBB->getParent();
  const DebugLoc &DL = I.getDebugLoc();
  const Register Dst0Reg = I.getOperand(0).getReg();
  const Register Dst1Reg = I.getOperand(1).getReg();

  const unsigned GenericOpc = I.getOpcode();
  const bool IsAdd =
      GenericOpc == AMDGPU::G_UADDO || GenericOpc == AMDGPU::G_UADDE;
  const bool HasCarryIn =
      GenericOpc == AMDGPU::G_UADDE || GenericOpc == AMDGPU::G_USUBE;

  if (isVCC(Dst1Reg, *MRI)) {
    unsigned VectorOpc;
    if (HasCarryIn)
      VectorOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
    else
      VectorOpc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;

    I.setDesc(TII.get(VectorOpc));
    I.addOperand(*ParentMF,
                 MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
    I.addOperand(*ParentMF, MachineOperand::CreateImm(0));
    constrainSelectedInstRegOperands(I, TII, TRI, RBI);
    return true;
  }

  const Register Src0Reg = I.getOperand(2).getReg();
  const Register Src1Reg = I.getOperand(3).getReg();

  // Feed operand 4 into SCC ahead of the carry-consuming opcode.
  if (HasCarryIn)
    BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
        .addReg(I.getOperand(4).getReg());

  unsigned ScalarOpc;
  if (HasCarryIn)
    ScalarOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
  else
    ScalarOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;

  auto CarryInst = BuildMI(*MBB, &I, DL, TII.get(ScalarOpc), Dst0Reg)
                       .add(I.getOperand(2))
                       .add(I.getOperand(3));

  if (!MRI->use_nodbg_empty(Dst1Reg)) {
    // The second result is used: read it back out of SCC.
    BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
        .addReg(AMDGPU::SCC);
    if (!MRI->getRegClassOrNull(Dst1Reg))
      MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
  } else {
    CarryInst.setOperandDead(3); // Dead scc
  }

  if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI))
    return false;
  if (!RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI))
    return false;
  if (!RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  if (HasCarryIn) {
    const Register CarryInReg = I.getOperand(4).getReg();
    if (!RBI.constrainGenericRegister(CarryInReg, AMDGPU::SReg_32RegClass,
                                      *MRI))
      return false;
  }

  I.eraseFromParent();
  return true;
}


/// Select G_AMDGPU_MAD_U64_U32 / G_AMDGPU_MAD_I64_I32 by rewriting \p I in
/// place.
///
/// Opcode choice, in priority order:
/// - subtarget has the MAD intra-forwarding bug: the *_gfx11_e64 variants;
/// - subtarget has the NC variants and operand 1 (the second result,
///   presumably the carry-out) has no non-debug uses: V_MAD_NC_*, with
///   operand 1 removed;
/// - otherwise: the plain V_MAD_*_e64 variants.
///
/// A trailing immediate 0 and the implicit operands of the new opcode are
/// appended, and the first result is marked early-clobber.
bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
    MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;

  // The NC form is used only when it is the opcode actually selected below.
  // Tying the operand removal to the same condition keeps the operand list in
  // step with the chosen opcode even if a subtarget reported both the
  // forwarding bug and the NC instructions.
  const bool HasFwdBug = Subtarget->hasMADIntraFwdBug();
  const bool UseNoCarry = !HasFwdBug && Subtarget->hasMadNC64_32Insts() &&
                          MRI->use_nodbg_empty(I.getOperand(1).getReg());

  unsigned Opc;
  if (HasFwdBug)
    Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
                     : AMDGPU::V_MAD_I64_I32_gfx11_e64;
  else if (UseNoCarry)
    Opc = IsUnsigned ? AMDGPU::V_MAD_NC_U64_U32_e64
                     : AMDGPU::V_MAD_NC_I64_I32_e64;
  else
    Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;

  // The NC opcodes take no second result operand.
  if (UseNoCarry)
    I.removeOperand(1);

  I.setDesc(TII.get(Opc));
  I.addOperand(*MF, MachineOperand::CreateImm(0));
  I.addImplicitDefUseOperands(*MF);
  I.getOperand(0).setIsEarlyClobber(true);
  constrainSelectedInstRegOperands(I, TII, TRI, RBI);
  return true;
}


// TODO: We should probably legalize these to only using 32-bit results.
/// Select G_EXTRACT as a sub-register COPY.
///
/// Only bit offsets that are a multiple of 32 and results of at most 128
/// bits are handled; a 16-bit result is treated as 32 bits wide. The source
/// is constrained to a class that supports the needed sub-register index.
bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
  MachineBasicBlock *MBB = I.getParent();
  const Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  const unsigned SrcBits = MRI->getType(SrcReg).getSizeInBits();
  unsigned DstBits = MRI->getType(DstReg).getSizeInBits();

  // TODO: Should handle any multiple of 32 offset.
  const unsigned Offset = I.getOperand(2).getImm();
  if (Offset % 32 != 0)
    return false;
  if (DstBits > 128)
    return false;

  // 16-bit operations really use 32-bit registers.
  // FIXME: Probably should not allow 16-bit G_EXTRACT results.
  if (DstBits == 16)
    DstBits = 32;

  const TargetRegisterClass *DstRC =
      TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
  if (!DstRC)
    return false;
  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
  const TargetRegisterClass *SrcRC =
      TRI.getRegClassForSizeOnBank(SrcBits, *SrcBank);
  if (!SrcRC)
    return false;

  // Sub-register index is expressed in 32-bit channels: start and count.
  const unsigned SubReg =
      SIRegisterInfo::getSubRegFromChannel(Offset / 32, DstBits / 32);
  SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
  if (!SrcRC)
    return false;

  SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I, *SrcRC,
                                    I.getOperand(1));

  BuildMI(*MBB, &I, I.getDebugLoc(), TII.get(TargetOpcode::COPY), DstReg)
      .addReg(SrcReg, {}, SubReg);

  I.eraseFromParent();
  return true;
}


/// Select a merge of two sources (operands 1 and 2) into a 32-bit result
/// (operand 0); operand 1 becomes the low half.
///
/// VGPR-bank result:
/// - both sources are s16 in the VGPR bank: REG_SEQUENCE with lo16/hi16;
/// - otherwise: mask operand 1 with 0xFFFF, then V_LSHL_OR_B32 of operand 2
///   by 16 with the masked value.
///
/// Any other result bank: one of the S_PACK_*_B32_B16 opcodes, folding
/// single-use right-shifts by 16 of the sources into the H/L choice.
bool AMDGPUInstructionSelector::selectS16MergeToS32(MachineInstr &MI) const {
  const Register Dst = MI.getOperand(0).getReg();
  const Register Src0 = MI.getOperand(1).getReg();
  const Register Src1 = MI.getOperand(2).getReg();

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  // VGPR case
  if (RBI.getRegBank(Dst, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
    const LLT S16 = LLT::scalar(16);
    const bool BothVGPR16 =
        RBI.getRegBank(Src0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
        RBI.getRegBank(Src1, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
        MRI->getType(Src0) == S16 && MRI->getType(Src1) == S16;

    // If source are both VGPR16, use REG_SEQUENCE with lo16/hi16 subregisters
    if (BothVGPR16) {
      BuildMI(*MBB, MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), Dst)
          .addReg(Src0)
          .addImm(AMDGPU::lo16)
          .addReg(Src1)
          .addImm(AMDGPU::hi16);

      if (!RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI))
        return false;

      MI.eraseFromParent();
      return true;
    }

    // Otherwise, use V_LSHL_OR_B32_e64
    const Register LowMasked =
        MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    auto AndMIB =
        BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), LowMasked)
            .addImm(0xFFFF)
            .addReg(Src0);
    constrainSelectedInstRegOperands(*AndMIB, TII, TRI, RBI);

    auto ShlOrMIB =
        BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
            .addReg(Src1)
            .addImm(16)
            .addReg(LowMasked);
    constrainSelectedInstRegOperands(*ShlOrMIB, TII, TRI, RBI);

    MI.eraseFromParent();
    return true;
  }

  // SGPR case -> S_PACK_*_B32_B16
  // With multiple uses of the shift, this will duplicate the shift and
  // increase register pressure.
  //
  // (merge (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16))
  //  => (S_PACK_HH_B32_B16 $src0, $src1)
  // (merge (lshr_oneuse SReg_32:$src0, 16), $src1)
  //  => (S_PACK_HL_B32_B16 $src0, $src1)
  // (merge $src0, (lshr_oneuse SReg_32:$src1, 16))
  //  => (S_PACK_LH_B32_B16 $src0, $src1)
  // (merge $src0, $src1)
  //  => (S_PACK_LL_B32_B16 $src0, $src1)

  Register HighOf0;
  Register HighOf1;
  const bool Src0IsHigh = mi_match(
      Src0, *MRI, m_OneUse(m_GLShr(m_Reg(HighOf0), m_SpecificICst(16))));
  const bool Src1IsHigh = mi_match(
      Src1, *MRI, m_OneUse(m_GLShr(m_Reg(HighOf1), m_SpecificICst(16))));

  unsigned PackOpc = AMDGPU::S_PACK_LL_B32_B16;
  if (Src0IsHigh && Src1IsHigh) {
    PackOpc = AMDGPU::S_PACK_HH_B32_B16;
    MI.getOperand(1).setReg(HighOf0);
    MI.getOperand(2).setReg(HighOf1);
  } else if (Src1IsHigh) {
    PackOpc = AMDGPU::S_PACK_LH_B32_B16;
    MI.getOperand(2).setReg(HighOf1);
  } else if (Src0IsHigh) {
    auto Src1Const =
        getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
    if (Src1Const && Src1Const->Value == 0) {
      // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
      auto ShrMIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
                        .addReg(HighOf0)
                        .addImm(16)
                        .setOperandDead(3); // Dead scc

      MI.eraseFromParent();
      constrainSelectedInstRegOperands(*ShrMIB, TII, TRI, RBI);
      return true;
    }
    // Without S_PACK_HL the shift stays and the LL form is used as is.
    if (STI.hasSPackHL()) {
      PackOpc = AMDGPU::S_PACK_HL_B32_B16;
      MI.getOperand(1).setReg(HighOf0);
    }
  }

  MI.setDesc(TII.get(PackOpc));
  constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
  return true;
}


/// Select G_MERGE_VALUES.
///
/// Sources narrower than 32 bits: the two-source s16 -> 32-bit case goes to
/// selectS16MergeToS32, everything else to the imported patterns
/// (selectImpl). Otherwise a REG_SEQUENCE is built that places each source in
/// the matching sub-register of the result's register class.
bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *MBB = MI.getParent();
  const Register DstReg = MI.getOperand(0).getReg();
  const LLT DstTy = MRI->getType(DstReg);
  const unsigned SrcSize =
      MRI->getType(MI.getOperand(1).getReg()).getSizeInBits();

  if (SrcSize < 32) {
    // Handle s32 <- G_MERGE_VALUES s16, s16
    const bool IsS16Pair = SrcSize == 16 && DstTy.getSizeInBits() == 32 &&
                           MI.getNumOperands() == 3;
    if (IsS16Pair)
      return selectS16MergeToS32(MI);
    return selectImpl(MI, *CoverageInfo);
  }

  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const TargetRegisterClass *DstRC =
      TRI.getRegClassForSizeOnBank(DstTy.getSizeInBits(), *DstBank);
  if (!DstRC)
    return false;

  // One sub-register index per source, SrcSize / 8 bytes each.
  const ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
  MachineInstrBuilder Seq =
      BuildMI(*MBB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);

  for (unsigned OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx != NumOps;
       ++OpIdx) {
    MachineOperand &Src = MI.getOperand(OpIdx);
    Seq.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
    Seq.addImm(SubRegs[OpIdx - 1]);

    const TargetRegisterClass *SrcRC =
        TRI.getConstrainedRegClassForOperand(Src, *MRI);
    if (!SrcRC)
      continue;
    if (!RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
      return false;
  }

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  MI.eraseFromParent();
  return true;
}


/// Select G_UNMERGE_VALUES.
///
/// The last operand is the source; every operand before it is a result. Each
/// result is produced by a sub-register COPY from the source, except for the
/// hi16 piece of an SGPR-bank source, which is produced with S_LSHR_B32 by
/// 16. The source class is narrowed as needed so each sub-register index is
/// valid for it.
///
/// \returns false if a register class cannot be determined or applied.
bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  const int NumDst = MI.getNumOperands() - 1;

  MachineOperand &Src = MI.getOperand(NumDst);

  Register SrcReg = Src.getReg();
  Register DstReg0 = MI.getOperand(0).getReg();
  LLT DstTy = MRI->getType(DstReg0);
  LLT SrcTy = MRI->getType(SrcReg);

  const unsigned DstSize = DstTy.getSizeInBits();
  const unsigned SrcSize = SrcTy.getSizeInBits();
  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);

  const TargetRegisterClass *SrcRC =
      TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
  if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
    return false;

  // Note we could have mixed SGPR and VGPR destination banks for an SGPR
  // source, and this relies on the fact that the same subregister indices are
  // used for both.
  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
  for (int I = 0, E = NumDst; I != E; ++I) {
    MachineOperand &Dst = MI.getOperand(I);
    // hi16:sreg_32 is not allowed so explicitly shift upper 16-bits.
    if (SrcBank->getID() == AMDGPU::SGPRRegBankID &&
        SubRegs[I] == AMDGPU::hi16) {
      // Nothing reads the SCC result of this shift, so flag it dead, matching
      // how the other S_LSHR_B32 in this file is built.
      BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst.getReg())
          .addReg(SrcReg)
          .addImm(16)
          .setOperandDead(3); // Dead scc
    } else {
      BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
          .addReg(SrcReg, {}, SubRegs[I]);
    }

    // Make sure the subregister index is valid for the source register.
    SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
    if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
      return false;

    const TargetRegisterClass *DstRC =
        TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
      return false;
  }

  MI.eraseFromParent();
  return true;
}


/// Select G_BUILD_VECTOR / G_BUILD_VECTOR_TRUNC.
///
/// G_BUILD_VECTOR with sources of 32 bits or more is forwarded to
/// selectG_MERGE_VALUES. Anything that is not a v2s16 result (or, for the
/// TRUNC form, does not have s32 sources) goes straight to the generated
/// patterns. For the remaining v2s16 case this tries, in order: folding two
/// constant sources into a single move, the generated patterns, turning
/// (build_vector x, undef) into a COPY, and finally selectS16MergeToS32.
bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
  const unsigned Opc = MI.getOpcode();
  assert(Opc == AMDGPU::G_BUILD_VECTOR_TRUNC ||
         Opc == AMDGPU::G_BUILD_VECTOR);
  const bool IsTrunc = Opc == AMDGPU::G_BUILD_VECTOR_TRUNC;

  const Register LoSrc = MI.getOperand(1).getReg();
  const Register HiSrc = MI.getOperand(2).getReg();
  const LLT ElemTy = MRI->getType(LoSrc);

  // BUILD_VECTOR with >=32 bits source is handled by MERGE_VALUE.
  if (Opc == AMDGPU::G_BUILD_VECTOR && ElemTy.getSizeInBits() >= 32)
    return selectG_MERGE_VALUES(MI);

  // Selection logic below is for V2S16 only.
  // For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
  const Register Dst = MI.getOperand(0).getReg();
  const bool IsV2S16 = MRI->getType(Dst) == LLT::fixed_vector(2, 16);
  if (!IsV2S16 || (IsTrunc && ElemTy != LLT::scalar(32)))
    return selectImpl(MI, *CoverageInfo);

  const unsigned DstBankID = RBI.getRegBank(Dst, *MRI, TRI)->getID();
  if (DstBankID == AMDGPU::AGPRRegBankID)
    return false;

  assert(DstBankID == AMDGPU::SGPRRegBankID ||
         DstBankID == AMDGPU::VGPRRegBankID);
  const bool IsVector = DstBankID == AMDGPU::VGPRRegBankID;

  // Register class used for 32-bit values on the destination's bank.
  const auto &RC32 =
      IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;

  // First, before trying TableGen patterns, check if both sources are
  // constants. In those cases, we can trivially compute the final constant
  // and emit a simple move.
  if (auto HiConst =
          getAnyConstantVRegValWithLookThrough(HiSrc, *MRI, true, true)) {
    if (auto LoConst =
            getAnyConstantVRegValWithLookThrough(LoSrc, *MRI, true, true)) {
      // Pack the low 16 bits of each constant: source 0 in bits [15:0],
      // source 1 in bits [31:16].
      const int64_t LoVal = LoConst->Value.getSExtValue();
      const int64_t HiVal = HiConst->Value.getSExtValue();
      const uint32_t LoBits = static_cast<uint32_t>(LoVal) & 0xffff;
      const uint32_t HiBits = static_cast<uint32_t>(HiVal) & 0xffff;
      const uint32_t Packed = LoBits | (HiBits << 16);

      // VALU destinations use V_MOV_B32, SALU destinations use S_MOV_B32.
      const unsigned MovOpc =
          IsVector ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
      BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII.get(MovOpc), Dst)
          .addImm(Packed);
      MI.eraseFromParent();
      return RBI.constrainGenericRegister(Dst, RC32, *MRI);
    }
  }

  // Now try TableGen patterns.
  if (selectImpl(MI, *CoverageInfo))
    return true;

  // TODO: This should probably be a combine somewhere
  // (build_vector $src0, undef) -> copy $src0
  MachineInstr *HiDef = getDefIgnoringCopies(HiSrc, *MRI);
  if (HiDef->getOpcode() != AMDGPU::G_IMPLICIT_DEF)
    return selectS16MergeToS32(MI);

  // Rewrite the instruction in place into a COPY of the first source.
  MI.setDesc(TII.get(AMDGPU::COPY));
  MI.removeOperand(2);
  return RBI.constrainGenericRegister(Dst, RC32, *MRI) &&
         RBI.constrainGenericRegister(LoSrc, RC32, *MRI);
}


/// Select G_IMPLICIT_DEF by rewriting it in place to IMPLICIT_DEF.
///
/// Succeeds when the result either constrains to the class found for the
/// operand, or has neither such a class nor a register bank assigned.
bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
  const MachineOperand &Def = I.getOperand(0);
  const Register DefReg = Def.getReg();

  // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
  // regbank check here is to know why getConstrainedRegClassForOperand failed.
  const TargetRegisterClass *RC =
      TRI.getConstrainedRegClassForOperand(Def, *MRI);
  if (RC) {
    if (!RBI.constrainGenericRegister(DefReg, *RC, *MRI))
      return false;
  } else if (MRI->getRegBankOrNull(DefReg)) {
    // A bank is assigned but no class was found for it: give up.
    return false;
  }

  I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
  return true;
}


/// Select G_INSERT as an INSERT_SUBREG.
///
/// Operand 1 is the value being inserted into, operand 2 the inserted value
/// and operand 3 the bit offset. Only offsets and inserted sizes that are
/// multiples of 32 bits, with an inserted size of at most 128 bits, are
/// handled. The original instruction is erased on success.
///
/// \returns false if the case is unsupported, no suitable register class
/// exists for an operand, or a register cannot be constrained.
bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();

  Register DstReg = I.getOperand(0).getReg();
  Register Src0Reg = I.getOperand(1).getReg();
  Register Src1Reg = I.getOperand(2).getReg();
  LLT Src1Ty = MRI->getType(Src1Reg);

  unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
  unsigned InsSize = Src1Ty.getSizeInBits();

  int64_t Offset = I.getOperand(3).getImm();

  // FIXME: These cases should have been illegal and unnecessary to check here.
  if (Offset % 32 != 0 || InsSize % 32 != 0)
    return false;

  // Currently not handled by getSubRegFromChannel.
  if (InsSize > 128)
    return false;

  unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
  if (SubReg == AMDGPU::NoSubRegister)
    return false;

  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const TargetRegisterClass *DstRC =
      TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
  if (!DstRC)
    return false;

  const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
  const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
  const TargetRegisterClass *Src0RC =
      TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
  const TargetRegisterClass *Src1RC =
      TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);

  // Reject missing classes before refining Src0RC, so that a null class is
  // never handed to getSubClassWithSubReg.
  if (!Src0RC || !Src1RC)
    return false;

  // Deal with weird cases where the class only partially supports the subreg
  // index.
  Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
  if (!Src0RC)
    return false;

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
    return false;

  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
    .addReg(Src0Reg)
    .addReg(Src1Reg)
    .addImm(SubReg);

  I.eraseFromParent();
  return true;
}


/// Select G_SBFX / G_UBFX to V_BFE_I32_e64 / V_BFE_U32_e64 respectively.
///
/// Only the 32-bit VGPR-bank form is expected here (see the assertions). The
/// original instruction is erased and the new one has its register operands
/// constrained. Always returns true.
bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
  const Register Dst = MI.getOperand(0).getReg();
  const Register Value = MI.getOperand(1).getReg();
  const Register Offset = MI.getOperand(2).getReg();
  const Register Width = MI.getOperand(3).getReg();

  assert(RBI.getRegBank(Dst, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
         "scalar BFX instructions are expanded in regbankselect");
  assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
         "64-bit vector BFX instructions are expanded in regbankselect");

  // The signed extract uses the I32 opcode, the unsigned one the U32 opcode.
  const unsigned BFEOpc = MI.getOpcode() == TargetOpcode::G_SBFX
                              ? AMDGPU::V_BFE_I32_e64
                              : AMDGPU::V_BFE_U32_e64;

  MachineInstrBuilder BFE = BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(),
                                    TII.get(BFEOpc), Dst);
  BFE.addReg(Value);
  BFE.addReg(Offset);
  BFE.addReg(Width);

  MI.eraseFromParent();
  constrainSelectedInstRegOperands(*BFE, TII, TRI, RBI);
  return true;
}


/// Select amdgcn_interp_p1_f16 manually when the subtarget reports 16 LDS
/// banks; any other bank count is left to the generated patterns.
///
/// The manual expansion is a copy of operand 6 into M0, a V_INTERP_MOV_F32
/// into a fresh VGPR, and a V_INTERP_P1LV_F16 that takes that VGPR as $src2.
bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
  if (STI.getLDSBankCount() != 16)
    return selectImpl(MI, *CoverageInfo);

  const Register Dst = MI.getOperand(0).getReg();
  const Register Src0 = MI.getOperand(2).getReg();
  const Register M0Val = MI.getOperand(6).getReg();

  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
    return false;
  if (!RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI))
    return false;
  if (!RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
    return false;

  // This requires 2 instructions. It is possible to write a pattern to support
  // this, but the generated isel emitter doesn't correctly deal with multiple
  // output instructions using the same physical register input. The copy to m0
  // is incorrectly placed before the second instruction.
  //
  // TODO: Match source modifiers.

  const Register InterpMov =
      MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &Loc = MI.getDebugLoc();

  // Immediate operands of the intrinsic that are forwarded unchanged.
  const int64_t AttrChan = MI.getOperand(3).getImm();
  const int64_t Attr = MI.getOperand(4).getImm();
  const int64_t High = MI.getOperand(5).getImm();

  BuildMI(MBB, &MI, Loc, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(M0Val);

  BuildMI(MBB, &MI, Loc, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
      .addImm(2)
      .addImm(Attr)      // $attr
      .addImm(AttrChan); // $attrchan

  BuildMI(MBB, &MI, Loc, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
      .addImm(0)         // $src0_modifiers
      .addReg(Src0)      // $src0
      .addImm(Attr)      // $attr
      .addImm(AttrChan)  // $attrchan
      .addImm(0)         // $src2_modifiers
      .addReg(InterpMov) // $src2 - 2 f16 values selected by high
      .addImm(High)      // $high
      .addImm(0)         // $clamp
      .addImm(0);        // $omod

  MI.eraseFromParent();
  return true;
}


// Writelane may read both an SGPR and M0. That would normally count as two
// constant bus uses, but the lane selector is not counted against the constant
// bus, so the combination is allowed. The 1 SGPR rule still applies, so the
// operands are fixed up here whenever more than one SGPR could be involved.
bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
  // With a constant bus limit of at least 2, there's no issue.
  if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
    return selectImpl(MI, *CoverageInfo);

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  const Register VDst = MI.getOperand(0).getReg();
  const Register Val = MI.getOperand(2).getReg();
  const Register LaneSelect = MI.getOperand(3).getReg();
  const Register VDstIn = MI.getOperand(4).getReg();

  MachineInstrBuilder WriteLane =
      BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);

  if (std::optional<ValueAndVReg> SelConst =
          getIConstantVRegValWithLookThrough(LaneSelect, *MRI)) {
    // The selector has to be an inline immediate, so we can use whatever for
    // the other operands. The constant is masked to the wavefront-size-log2
    // low bits.
    WriteLane.addReg(Val);
    WriteLane.addImm(SelConst->Value.getSExtValue() &
                     maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
  } else if (std::optional<ValueAndVReg> ValConst =
                 getIConstantVRegValWithLookThrough(Val, *MRI);
             ValConst &&
             AMDGPU::isInlinableLiteral32(ValConst->Value.getSExtValue(),
                                          STI.hasInv2PiInlineImm())) {
    // If the value written is an inline immediate, we can get away without a
    // copy to m0.
    WriteLane.addImm(ValConst->Value.getSExtValue());
    WriteLane.addReg(LaneSelect);
  } else {
    WriteLane.addReg(Val);

    // If the lane selector was originally in a VGPR and copied with
    // readfirstlane, there's a hazard to read the same SGPR from the
    // VALU. Constrain to a different SGPR to help avoid needing a nop later.
    RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass,
                                 *MRI);

    // Route the selector through M0; the copy is placed before the writelane.
    BuildMI(*MBB, *WriteLane, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
        .addReg(LaneSelect);
    WriteLane.addReg(AMDGPU::M0);
  }

  WriteLane.addReg(VDstIn);

  MI.eraseFromParent();
  constrainSelectedInstRegOperands(*WriteLane, TII, TRI, RBI);
  return true;
}


// This is selected by hand because tablegen doesn't support matching
// instructions with multiple outputs.
//
// Produces V_DIV_SCALE_F32_e64 or V_DIV_SCALE_F64_e64 depending on whether the
// first result is s32 or s64; any other result type is rejected.
bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
  const Register Dst0 = MI.getOperand(0).getReg();
  const Register Dst1 = MI.getOperand(1).getReg();

  const LLT ResTy = MRI->getType(Dst0);
  const bool IsF32 = ResTy == LLT::scalar(32);
  if (!IsF32 && ResTy != LLT::scalar(64))
    return false;
  const unsigned Opc =
      IsF32 ? AMDGPU::V_DIV_SCALE_F32_e64 : AMDGPU::V_DIV_SCALE_F64_e64;

  // TODO: Match source modifiers.

  const Register Numer = MI.getOperand(3).getReg();
  const Register Denom = MI.getOperand(4).getReg();
  const unsigned ChooseDenom = MI.getOperand(5).getImm();

  // $src0 is the numerator when the selector immediate is non-zero, and the
  // denominator otherwise.
  const Register Src0 = ChooseDenom == 0 ? Denom : Numer;

  MachineInstrBuilder DivScale =
      BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII.get(Opc), Dst0);
  DivScale.addDef(Dst1);
  DivScale.addImm(0);     // $src0_modifiers
  DivScale.addUse(Src0);  // $src0
  DivScale.addImm(0);     // $src1_modifiers
  DivScale.addUse(Denom); // $src1
  DivScale.addImm(0);     // $src2_modifiers
  DivScale.addUse(Numer); // $src2
  DivScale.addImm(0);     // $clamp
  DivScale.addImm(0);     // $omod

  MI.eraseFromParent();
  constrainSelectedInstRegOperands(*DivScale, TII, TRI, RBI);
  return true;
}


/// Select a G_INTRINSIC by dispatching on its intrinsic ID.
///
/// Intrinsics with a dedicated selector are routed to it. A handful are only
/// passed to the generated patterns when the subtarget reports the required
/// feature; otherwise they are diagnosed as unsupported and selection fails.
/// Everything else goes straight to the generated patterns.
bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
  // Shared tail for feature-gated intrinsics: diagnose and fail when the
  // feature is missing, otherwise defer to the generated patterns.
  const auto SelectIfSupported = [&](bool HasFeature) -> bool {
    if (!HasFeature) {
      diagnoseUnsupportedIntrinsic(I);
      return false;
    }
    return selectImpl(I, *CoverageInfo);
  };

  const Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_if_break: {
    // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
    // SelectionDAG uses for wave32 vs wave64.
    BuildMI(*I.getParent(), &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
        .add(I.getOperand(0))
        .add(I.getOperand(2))
        .add(I.getOperand(3));

    // Capture the registers before the original instruction goes away.
    const Register Regs[] = {I.getOperand(0).getReg(),
                             I.getOperand(2).getReg(),
                             I.getOperand(3).getReg()};
    I.eraseFromParent();

    // The result and both inputs all live in the wave mask class.
    for (Register Reg : Regs)
      MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
    return true;
  }

  // Intrinsics with a dedicated selection routine.
  case Intrinsic::amdgcn_interp_p1_f16:
    return selectInterpP1F16(I);
  case Intrinsic::amdgcn_writelane:
    return selectWritelane(I);
  case Intrinsic::amdgcn_div_scale:
    return selectDivScale(I);
  case Intrinsic::amdgcn_ballot:
    return selectBallot(I);
  case Intrinsic::amdgcn_reloc_constant:
    return selectRelocConstant(I);
  case Intrinsic::amdgcn_groupstaticsize:
    return selectGroupStaticSize(I);
  case Intrinsic::returnaddress:
    return selectReturnAddress(I);
  case Intrinsic::amdgcn_wave_shuffle:
    return selectWaveShuffleIntrin(I);
  case Intrinsic::amdgcn_permlane16_swap:
  case Intrinsic::amdgcn_permlane32_swap:
    return selectPermlaneSwapIntrin(I, IntrinsicID);

  // Copy-like intrinsics that become a pseudo with constrained operands.
  case Intrinsic::amdgcn_wqm:
    return constrainCopyLikeIntrin(I, AMDGPU::WQM);
  case Intrinsic::amdgcn_softwqm:
    return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
  case Intrinsic::amdgcn_strict_wwm:
  case Intrinsic::amdgcn_wwm:
    return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
  case Intrinsic::amdgcn_strict_wqm:
    return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);

  // Compares try the generated patterns first and fall back to the manual
  // selector.
  case Intrinsic::amdgcn_icmp:
  case Intrinsic::amdgcn_fcmp:
    return selectImpl(I, *CoverageInfo) || selectIntrinsicCmp(I);

  case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
  case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
  case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
  case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
  case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
  case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
  case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
  case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
    return selectSMFMACIntrin(I);

  // Feature-gated intrinsics.
  case Intrinsic::amdgcn_fma_legacy:
    return SelectIfSupported(STI.hasFmaLegacy32Insts());
  case Intrinsic::amdgcn_sudot4:
  case Intrinsic::amdgcn_sudot8:
    return SelectIfSupported(STI.hasDot8Insts());
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16:
    return SelectIfSupported(STI.hasPermlane16Insts());
  case Intrinsic::amdgcn_mov_dpp8:
    return SelectIfSupported(STI.hasDPP8());
  case Intrinsic::amdgcn_tanh:
    return SelectIfSupported(STI.hasTanhInsts());

  default:
    return selectImpl(I, *CoverageInfo);
  }
}


/// Map a compare predicate and operand size to a VALU V_CMP_*_e64 opcode.
///
/// \returns -1 if \p Size is not 16, 32 or 64, or if it is 16 and the
/// subtarget has no 16-bit instructions. For 16-bit compares the plain, t16 or
/// fake16 opcode is chosen from the subtarget's true16 settings.
///
/// NOTE(review): FCMP_ONE and FCMP_UNE both resolve to V_CMP_NEQ_* here,
/// whereas getS_CMPOpcode uses S_CMP_LG_* for ONE and S_CMP_NEQ_* for UNE.
/// Confirm this is intended.
static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size,
                          const GCNSubtarget &ST) {
  switch (Size) {
  case 16:
    if (!ST.has16BitInsts())
      return -1;
    break;
  case 32:
  case 64:
    break;
  default:
    return -1;
  }

  // Size is known to be 16, 32 or 64 from here on.
  const auto PickForSize = [&ST, Size](unsigned Plain16, unsigned True16,
                                       unsigned Fake16, unsigned Opc32,
                                       unsigned Opc64) -> unsigned {
    if (Size == 64)
      return Opc64;
    if (Size == 32)
      return Opc32;
    if (!ST.hasTrue16BitInsts())
      return Plain16;
    return ST.useRealTrue16Insts() ? True16 : Fake16;
  };

// Expands to the five size variants of V_CMP_<OP>_<TY>{16,32,64}: plain 16-bit,
// t16, fake16, 32-bit and 64-bit, in the order PickForSize expects.
#define AMDGPU_V_CMP_VARIANTS(OP, TY)                                          \
  PickForSize(AMDGPU::V_CMP_##OP##_##TY##16_e64,                               \
              AMDGPU::V_CMP_##OP##_##TY##16_t16_e64,                           \
              AMDGPU::V_CMP_##OP##_##TY##16_fake16_e64,                        \
              AMDGPU::V_CMP_##OP##_##TY##32_e64,                               \
              AMDGPU::V_CMP_##OP##_##TY##64_e64)

  switch (P) {
  default:
    llvm_unreachable("Unknown condition code!");

  // Integer predicates: equality and unsigned use U, signed use I.
  case CmpInst::ICMP_NE:
    return AMDGPU_V_CMP_VARIANTS(NE, U);
  case CmpInst::ICMP_EQ:
    return AMDGPU_V_CMP_VARIANTS(EQ, U);
  case CmpInst::ICMP_SGT:
    return AMDGPU_V_CMP_VARIANTS(GT, I);
  case CmpInst::ICMP_SGE:
    return AMDGPU_V_CMP_VARIANTS(GE, I);
  case CmpInst::ICMP_SLT:
    return AMDGPU_V_CMP_VARIANTS(LT, I);
  case CmpInst::ICMP_SLE:
    return AMDGPU_V_CMP_VARIANTS(LE, I);
  case CmpInst::ICMP_UGT:
    return AMDGPU_V_CMP_VARIANTS(GT, U);
  case CmpInst::ICMP_UGE:
    return AMDGPU_V_CMP_VARIANTS(GE, U);
  case CmpInst::ICMP_ULT:
    return AMDGPU_V_CMP_VARIANTS(LT, U);
  case CmpInst::ICMP_ULE:
    return AMDGPU_V_CMP_VARIANTS(LE, U);

  // Floating-point predicates.
  case CmpInst::FCMP_OEQ:
    return AMDGPU_V_CMP_VARIANTS(EQ, F);
  case CmpInst::FCMP_OGT:
    return AMDGPU_V_CMP_VARIANTS(GT, F);
  case CmpInst::FCMP_OGE:
    return AMDGPU_V_CMP_VARIANTS(GE, F);
  case CmpInst::FCMP_OLT:
    return AMDGPU_V_CMP_VARIANTS(LT, F);
  case CmpInst::FCMP_OLE:
    return AMDGPU_V_CMP_VARIANTS(LE, F);
  case CmpInst::FCMP_ONE:
    return AMDGPU_V_CMP_VARIANTS(NEQ, F);
  case CmpInst::FCMP_ORD:
    return AMDGPU_V_CMP_VARIANTS(O, F);
  case CmpInst::FCMP_UNO:
    return AMDGPU_V_CMP_VARIANTS(U, F);
  case CmpInst::FCMP_UEQ:
    return AMDGPU_V_CMP_VARIANTS(NLG, F);
  case CmpInst::FCMP_UGT:
    return AMDGPU_V_CMP_VARIANTS(NLE, F);
  case CmpInst::FCMP_UGE:
    return AMDGPU_V_CMP_VARIANTS(NLT, F);
  case CmpInst::FCMP_ULT:
    return AMDGPU_V_CMP_VARIANTS(NGE, F);
  case CmpInst::FCMP_ULE:
    return AMDGPU_V_CMP_VARIANTS(NGT, F);
  case CmpInst::FCMP_UNE:
    return AMDGPU_V_CMP_VARIANTS(NEQ, F);
  case CmpInst::FCMP_TRUE:
    return AMDGPU_V_CMP_VARIANTS(TRU, F);
  case CmpInst::FCMP_FALSE:
    return AMDGPU_V_CMP_VARIANTS(F, F);
  }

#undef AMDGPU_V_CMP_VARIANTS
}


/// Map a compare predicate and operand size to a scalar S_CMP_* opcode.
///
/// \returns -1 when no scalar compare is available: an unsupported size, a
/// 64-bit compare that is not EQ/NE or on a subtarget without 64-bit scalar
/// equality compares, or a 16-bit compare on a subtarget without SALU float
/// instructions. Predicates outside the 32-bit and 16-bit tables are treated
/// as unreachable.
int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
                                              unsigned Size) const {
  switch (Size) {
  case 64:
    // Only equality and inequality exist for 64-bit scalar compares.
    if (!STI.hasScalarCompareEq64())
      return -1;
    if (P == CmpInst::ICMP_NE)
      return AMDGPU::S_CMP_LG_U64;
    if (P == CmpInst::ICMP_EQ)
      return AMDGPU::S_CMP_EQ_U64;
    return -1;

  case 32:
    switch (P) {
    // Integer predicates.
    case CmpInst::ICMP_NE:  return AMDGPU::S_CMP_LG_U32;
    case CmpInst::ICMP_EQ:  return AMDGPU::S_CMP_EQ_U32;
    case CmpInst::ICMP_SGT: return AMDGPU::S_CMP_GT_I32;
    case CmpInst::ICMP_SGE: return AMDGPU::S_CMP_GE_I32;
    case CmpInst::ICMP_SLT: return AMDGPU::S_CMP_LT_I32;
    case CmpInst::ICMP_SLE: return AMDGPU::S_CMP_LE_I32;
    case CmpInst::ICMP_UGT: return AMDGPU::S_CMP_GT_U32;
    case CmpInst::ICMP_UGE: return AMDGPU::S_CMP_GE_U32;
    case CmpInst::ICMP_ULT: return AMDGPU::S_CMP_LT_U32;
    case CmpInst::ICMP_ULE: return AMDGPU::S_CMP_LE_U32;
    // Floating-point predicates.
    case CmpInst::FCMP_OEQ: return AMDGPU::S_CMP_EQ_F32;
    case CmpInst::FCMP_OGT: return AMDGPU::S_CMP_GT_F32;
    case CmpInst::FCMP_OGE: return AMDGPU::S_CMP_GE_F32;
    case CmpInst::FCMP_OLT: return AMDGPU::S_CMP_LT_F32;
    case CmpInst::FCMP_OLE: return AMDGPU::S_CMP_LE_F32;
    case CmpInst::FCMP_ONE: return AMDGPU::S_CMP_LG_F32;
    case CmpInst::FCMP_ORD: return AMDGPU::S_CMP_O_F32;
    case CmpInst::FCMP_UNO: return AMDGPU::S_CMP_U_F32;
    case CmpInst::FCMP_UEQ: return AMDGPU::S_CMP_NLG_F32;
    case CmpInst::FCMP_UGT: return AMDGPU::S_CMP_NLE_F32;
    case CmpInst::FCMP_UGE: return AMDGPU::S_CMP_NLT_F32;
    case CmpInst::FCMP_ULT: return AMDGPU::S_CMP_NGE_F32;
    case CmpInst::FCMP_ULE: return AMDGPU::S_CMP_NGT_F32;
    case CmpInst::FCMP_UNE: return AMDGPU::S_CMP_NEQ_F32;
    default:
      llvm_unreachable("Unknown condition code!");
    }

  case 16:
    // Only floating-point compares are listed for 16 bits, and only when the
    // subtarget has SALU float instructions.
    if (!STI.hasSALUFloatInsts())
      return -1;

    switch (P) {
    case CmpInst::FCMP_OEQ: return AMDGPU::S_CMP_EQ_F16;
    case CmpInst::FCMP_OGT: return AMDGPU::S_CMP_GT_F16;
    case CmpInst::FCMP_OGE: return AMDGPU::S_CMP_GE_F16;
    case CmpInst::FCMP_OLT: return AMDGPU::S_CMP_LT_F16;
    case CmpInst::FCMP_OLE: return AMDGPU::S_CMP_LE_F16;
    case CmpInst::FCMP_ONE: return AMDGPU::S_CMP_LG_F16;
    case CmpInst::FCMP_ORD: return AMDGPU::S_CMP_O_F16;
    case CmpInst::FCMP_UNO: return AMDGPU::S_CMP_U_F16;
    case CmpInst::FCMP_UEQ: return AMDGPU::S_CMP_NLG_F16;
    case CmpInst::FCMP_UGT: return AMDGPU::S_CMP_NLE_F16;
    case CmpInst::FCMP_UGE: return AMDGPU::S_CMP_NLT_F16;
    case CmpInst::FCMP_ULT: return AMDGPU::S_CMP_NGE_F16;
    case CmpInst::FCMP_ULE: return AMDGPU::S_CMP_NGT_F16;
    case CmpInst::FCMP_UNE: return AMDGPU::S_CMP_NEQ_F16;
    default:
      llvm_unreachable("Unknown condition code!");
    }

  default:
    return -1;
  }
}


/// Select G_ICMP / G_FCMP.
///
/// When the result is not a VCC value, a scalar S_CMP_* is emitted followed by
/// a copy of SCC into the result. Otherwise G_ICMP is selected to a V_CMP_*
/// whose result is constrained to the bool register class; G_FCMP with a VCC
/// result is not handled here and returns false.
bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const {
  MachineBasicBlock &MBB = *I.getParent();
  const DebugLoc &Loc = I.getDebugLoc();

  const Register LHSReg = I.getOperand(2).getReg();
  const unsigned Size = RBI.getSizeInBits(LHSReg, *MRI, TRI);
  const auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
  const Register CCReg = I.getOperand(0).getReg();

  if (!isVCC(CCReg, *MRI)) {
    // Scalar path: the compare writes SCC, which is then copied out.
    const int SOpc = getS_CMPOpcode(Pred, Size);
    if (SOpc == -1)
      return false;

    MachineInstr *SCmp = BuildMI(MBB, &I, Loc, TII.get(SOpc))
                             .add(I.getOperand(2))
                             .add(I.getOperand(3));
    BuildMI(MBB, &I, Loc, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
    constrainSelectedInstRegOperands(*SCmp, TII, TRI, RBI);

    const bool Constrained =
        RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
    I.eraseFromParent();
    return Constrained;
  }

  if (I.getOpcode() == AMDGPU::G_FCMP)
    return false;

  const int VOpc = getV_CMPOpcode(Pred, Size, *Subtarget);
  if (VOpc == -1)
    return false;

  // t16 instructions carry source modifier operands and a trailing op_sel;
  // all of them are emitted as zero.
  const bool HasMods =
      AMDGPU::hasNamedOperand(VOpc, AMDGPU::OpName::src0_modifiers);

  MachineInstrBuilder VCmp = BuildMI(MBB, &I, Loc, TII.get(VOpc), CCReg);
  if (HasMods)
    VCmp.addImm(0);
  VCmp.add(I.getOperand(2));
  if (HasMods)
    VCmp.addImm(0);
  VCmp.add(I.getOperand(3));
  if (HasMods)
    VCmp.addImm(0); // op_sel

  RBI.constrainGenericRegister(VCmp->getOperand(0).getReg(), *TRI.getBoolRC(),
                               *MRI);
  constrainSelectedInstRegOperands(*VCmp, TII, TRI, RBI);
  I.eraseFromParent();
  return true;
}


/// Manually select amdgcn_icmp / amdgcn_fcmp to a V_CMP_* instruction.
///
/// The result must not be a VCC value and must be exactly as wide as the
/// wavefront. A predicate immediate that is neither an integer nor a
/// floating-point predicate turns the result into an IMPLICIT_DEF. Otherwise
/// both sources go through copyToVGPRIfSrcFolded with ForceVGPR set, and the
/// compare is built with whichever of the modifier, clamp and op_sel operands
/// the chosen opcode has.
bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
  const Register Dst = I.getOperand(0).getReg();
  if (isVCC(Dst, *MRI))
    return false;

  if (MRI->getType(Dst).getSizeInBits() != STI.getWavefrontSize())
    return false;

  MachineBasicBlock &MBB = *I.getParent();
  const DebugLoc &Loc = I.getDebugLoc();
  const unsigned Size =
      RBI.getSizeInBits(I.getOperand(2).getReg(), *MRI, TRI);

  // i1 inputs are not supported in GlobalISel.
  if (Size == 1)
    return false;

  const auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
  const bool IsKnownPred =
      CmpInst::isIntPredicate(Pred) || CmpInst::isFPPredicate(Pred);
  if (!IsKnownPred) {
    BuildMI(MBB, &I, Loc, TII.get(AMDGPU::IMPLICIT_DEF), Dst);
    I.eraseFromParent();
    return RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
  }

  const int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
  if (Opcode == -1)
    return false;

  MachineOperand &LHS = I.getOperand(2);
  MachineOperand &RHS = I.getOperand(3);
  auto [LHSSrc, LHSMods] = selectVOP3ModsImpl(LHS.getReg());
  auto [RHSSrc, RHSMods] = selectVOP3ModsImpl(RHS.getReg());
  const Register LHSReg =
      copyToVGPRIfSrcFolded(LHSSrc, LHSMods, LHS, &I, /*ForceVGPR*/ true);
  const Register RHSReg =
      copyToVGPRIfSrcFolded(RHSSrc, RHSMods, RHS, &I, /*ForceVGPR*/ true);

  // Optional operands are only added when the opcode declares them.
  const auto HasOperand = [Opcode](AMDGPU::OpName Name) {
    return AMDGPU::hasNamedOperand(Opcode, Name);
  };

  MachineInstrBuilder Cmp = BuildMI(MBB, &I, Loc, TII.get(Opcode), Dst);
  if (HasOperand(AMDGPU::OpName::src0_modifiers))
    Cmp.addImm(LHSMods);
  Cmp.addReg(LHSReg);
  if (HasOperand(AMDGPU::OpName::src1_modifiers))
    Cmp.addImm(RHSMods);
  Cmp.addReg(RHSReg);
  if (HasOperand(AMDGPU::OpName::clamp))
    Cmp.addImm(0); // clamp
  if (HasOperand(AMDGPU::OpName::op_sel))
    Cmp.addImm(0); // op_sel

  RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
  constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI);

  I.eraseFromParent();
  return true;
}


// Ballot has to zero the bits of the input lane mask that are zero in the
// current exec. This is done as an AND with exec. For inputs that are results
// of instructions that implicitly use the same exec, for example compares in
// the same basic block or an SCC to VCC copy, a plain copy is used instead.


/// Return true if \p Reg is defined in \p MBB by an instruction recognised
/// here as producing a lane mask: a compare, a COPY from an SGPR-bank register
/// to a VCC-bank register, or a G_AND with at least one such input.
static bool isLaneMaskFromSameBlock(Register Reg, MachineRegisterInfo &MRI,
                                    MachineBasicBlock *MBB) {
  MachineInstr *Def = MRI.getVRegDef(Reg);
  if (Def->getParent() != MBB)
    return false;

  // Lane mask generated using compare with same exec.
  if (isa<GAnyCmp>(Def))
    return true;

  // Lane mask generated by SCC to VCC copy.
  if (Def->getOpcode() == AMDGPU::COPY) {
    const RegisterBank *ToBank =
        MRI.getRegBankOrNull(Def->getOperand(0).getReg());
    const RegisterBank *FromBank =
        MRI.getRegBankOrNull(Def->getOperand(1).getReg());
    const bool IsSGPRToVCC = ToBank && FromBank &&
                             ToBank->getID() == AMDGPU::VCCRegBankID &&
                             FromBank->getID() == AMDGPU::SGPRRegBankID;
    if (IsSGPRToVCC)
      return true;
  }

  // Look through AND: one qualifying input is enough.
  Register AndLHS, AndRHS;
  if (!mi_match(Reg, MRI, m_GAnd(m_Reg(AndLHS), m_Reg(AndRHS))))
    return false;

  return isLaneMaskFromSameBlock(AndLHS, MRI, MBB) ||
         isLaneMaskFromSameBlock(AndRHS, MRI, MBB);
}


/// Select a ballot. Operand 0 is the result and operand 2 the lane-mask
/// input. The result type must either match the wavefront size or be 64 bits
/// wide on a wave32 target; any other combination is rejected.
///
/// A constant input folds to S_MOV 0 (value 0) or a copy of exec (value 1).
/// Otherwise the input is either copied as-is, when isLaneMaskFromSameBlock
/// accepts it, or ANDed with exec.
bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(2).getReg();
  const unsigned BallotSize = MRI->getType(DstReg).getSizeInBits();
  const unsigned WaveSize = STI.getWavefrontSize();

  // The usual case is a ballot as wide as the wave. The only supported
  // mismatch is an i64 ballot in wave32 mode.
  const bool IsWideBallot = BallotSize != WaveSize;
  if (IsWideBallot && !(BallotSize == 64 && WaveSize == 32))
    return false;

  std::optional<ValueAndVReg> Arg =
      getIConstantVRegValWithLookThrough(SrcReg, *MRI);

  // For the i64-on-wave32 case the wave-sized ballot is computed into a fresh
  // register and widened at the end.
  Register Dst = DstReg;
  if (IsWideBallot)
    Dst = MRI->createVirtualRegister(TRI.getBoolRC());

  const bool IsWave64 = WaveSize == 64;

  if (!Arg) {
    if (!isLaneMaskFromSameBlock(SrcReg, *MRI, BB)) {
      // Dst = S_AND SrcReg, EXEC
      const unsigned AndOpc = IsWave64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
      auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), Dst)
                     .addReg(SrcReg)
                     .addReg(TRI.getExec())
                     .setOperandDead(3); // Dead scc
      constrainSelectedInstRegOperands(*And, TII, TRI, RBI);
    } else {
      // Dst = COPY SrcReg
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(SrcReg);
      if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
        return false;
    }
  } else {
    const int64_t Value = Arg->Value.getZExtValue();
    if (Value != 0) {
      // Dst = COPY EXEC
      assert(Value == 1);
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(TRI.getExec());
    } else {
      // Dst = S_MOV 0
      const unsigned MovOpc = IsWave64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
      BuildMI(*BB, &I, DL, TII.get(MovOpc), Dst).addImm(0);
    }
    if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
      return false;
  }

  // i64 ballot on wave32: zero-extend the 32-bit ballot by pairing it with a
  // zero high half.
  if (IsWideBallot) {
    Register ZeroHi = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), ZeroHi).addImm(0);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
        .addReg(Dst)
        .addImm(AMDGPU::sub0)
        .addReg(ZeroHi)
        .addImm(AMDGPU::sub1);
  }

  I.eraseFromParent();
  return true;
}


/// Select a relocation constant. The symbol name is the string held in the
/// first operand of the metadata node in operand 2. An i32 global of that
/// name is looked up (or inserted) in the module, and a 32-bit move of its
/// MO_ABS32_LO address is emitted into the destination, using V_MOV on the
/// VGPR bank and S_MOV otherwise.
bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);

  // The destination must fit a 32-bit class on its bank.
  const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank);
  if (!DstRC)
    return false;
  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  const unsigned MovOpc = DstBank->getID() == AMDGPU::VGPRRegBankID
                              ? AMDGPU::V_MOV_B32_e32
                              : AMDGPU::S_MOV_B32;

  // Resolve the symbol named by the metadata string.
  Module *M = MF->getFunction().getParent();
  const MDNode *Metadata = I.getOperand(2).getMetadata();
  const auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
  auto *RelocSymbol = cast<GlobalVariable>(
      M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));

  BuildMI(*I.getParent(), &I, I.getDebugLoc(), TII.get(MovOpc), DstReg)
      .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);

  I.eraseFromParent();
  return true;
}


/// Select the groupstaticsize intrinsic as a 32-bit move (S_MOV on the SGPR
/// bank, V_MOV otherwise). On AMDHSA and AMDPAL the moved value is the LDS
/// size recorded in SIMachineFunctionInfo; on any other OS it is an
/// MO_ABS32_LO reference to the intrinsic's own declaration.
bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const bool IsSGPRDst = DstRB->getID() == AMDGPU::SGPRRegBankID;
  const unsigned MovOpc =
      IsSGPRDst ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;

  auto Mov =
      BuildMI(*I.getParent(), &I, I.getDebugLoc(), TII.get(MovOpc), DstReg);

  const Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
  const bool UseImmediate = OS == Triple::AMDHSA || OS == Triple::AMDPAL;
  if (!UseImmediate) {
    Module *M = MF->getFunction().getParent();
    const GlobalValue *GV =
        Intrinsic::getOrInsertDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
    Mov.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
  } else {
    const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
    Mov.addImm(MFI->getLDSSize());
  }

  // The original is erased before the replacement's operands are constrained.
  I.eraseFromParent();
  constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
  return true;
}


/// Select a return-address query. Operand 0 is the 64-bit SGPR result and
/// operand 2 the requested depth. A non-zero depth, or a query inside an
/// entry function, yields the constant 0. Otherwise the return address
/// register is marked as a function live-in and copied to the result.
bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
  MachineBasicBlock *MBB = I.getParent();
  MachineFunction &MF = *MBB->getParent();
  const DebugLoc &DL = I.getDebugLoc();

  MachineOperand &DstOp = I.getOperand(0);
  Register DstReg = DstOp.getReg();
  const unsigned Depth = I.getOperand(2).getImm();

  // The result class must contain SGPR_64, and the result must accept it.
  const TargetRegisterClass *RC =
      TRI.getConstrainedRegClassForOperand(DstOp, *MRI);
  if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass))
    return false;
  if (!RBI.constrainGenericRegister(DstReg, *RC, *MRI))
    return false;

  // Check for kernel and shader functions.
  const bool YieldsZero =
      Depth != 0 || MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction();

  if (!YieldsZero) {
    // There is a call to @llvm.returnaddress in this function.
    MF.getFrameInfo().setReturnAddressIsTaken(true);

    // Get the return address reg and mark it as an implicit live-in.
    Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
    Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
                                               AMDGPU::SReg_64RegClass, DL);
    BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(LiveIn);
  } else {
    BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg).addImm(0);
  }

  I.eraseFromParent();
  return true;
}


/// Replace the end_cf intrinsic with SI_END_CF on the same mask operand
/// (operand 1). If that register has no register class yet, it is given the
/// wave mask class.
bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
  // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
  // SelectionDAG uses for wave32 vs wave64.
  const MachineOperand &MaskOp = MI.getOperand(1);
  Register MaskReg = MaskOp.getReg();

  BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
      .add(MaskOp);
  MI.eraseFromParent();

  if (MRI->getRegClassOrNull(MaskReg))
    return true;

  MRI->setRegClass(MaskReg, TRI.getWaveMaskRegClass());
  return true;
}


/// Select ds_ordered_add / ds_ordered_swap (\p IntrID) to DS_ORDERED_COUNT.
///
/// Operands read from \p MI: 0 = result, 2 = value copied to M0, 3 = data
/// value, 7 = index immediate, 8 = wave_release flag, 9 = wave_done flag.
/// The 16-bit offset field is assembled as Offset0 | (Offset1 << 8), with
///   Offset0 = ordered-count index << 2
///   Offset1 = wave_release | wave_done << 1 | shader type << 2 (pre-GFX11)
///             | instruction << 4 | (dword count - 1) << 6 (GFX10+)
/// Malformed inputs are reported through the LLVM context as unsupported
/// diagnostics; selection still proceeds afterwards.
bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
  MachineInstr &MI, Intrinsic::ID IntrID) const {
  MachineBasicBlock *MBB = MI.getParent();
  MachineFunction *MF = MBB->getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  auto DiagnoseUnsupported = [&](const char *Msg) {
    const Function &Fn = MF->getFunction();
    Fn.getContext().diagnose(DiagnosticInfoUnsupported(Fn, Msg, DL));
  };

  const bool IsGFX10Plus = STI.getGeneration() >= AMDGPUSubtarget::GFX10;
  const bool WaveRelease = MI.getOperand(8).getImm() != 0;
  const bool WaveDone = MI.getOperand(9).getImm() != 0;

  // TODO: Move this to IR verifier
  if (WaveDone && !WaveRelease)
    DiagnoseUnsupported("ds_ordered_count: wave_done requires wave_release");

  // Peel known fields off the index immediate; whatever is left over at the
  // end is an error.
  unsigned RemainingBits = MI.getOperand(7).getImm();
  const unsigned OrderedCountIndex = RemainingBits & 0x3f;
  RemainingBits &= ~0x3f;

  unsigned CountDw = 0;
  if (IsGFX10Plus) {
    CountDw = (RemainingBits >> 24) & 0xf;
    RemainingBits &= ~(0xf << 24);

    const bool CountInRange = CountDw >= 1 && CountDw <= 4;
    if (!CountInRange) {
      DiagnoseUnsupported(
          "ds_ordered_count: dword count must be between 1 and 4");
      CountDw = 1;
    }
  }

  if (RemainingBits != 0)
    DiagnoseUnsupported("ds_ordered_count: bad index operand");

  const unsigned Instruction =
      IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
  const unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);

  unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
  if (IsGFX10Plus)
    Offset1 |= (CountDw - 1) << 6;
  if (STI.getGeneration() < AMDGPUSubtarget::GFX11)
    Offset1 |= ShaderType << 2;

  const unsigned Offset0 = OrderedCountIndex << 2;
  const unsigned Offset = Offset0 | (Offset1 << 8);

  Register M0Val = MI.getOperand(2).getReg();
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(M0Val);

  Register DstReg = MI.getOperand(0).getReg();
  Register ValReg = MI.getOperand(3).getReg();
  MachineInstrBuilder DS =
      BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
          .addReg(ValReg)
          .addImm(Offset)
          .cloneMemRefs(MI);

  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
  MI.eraseFromParent();
  return true;
}


/// Map a ds_gws_* intrinsic ID to the corresponding DS_GWS_* machine opcode.
/// Any other ID is a programming error and hits llvm_unreachable.
static unsigned gwsIntrinToOpcode(unsigned IntrID) {
  if (IntrID == Intrinsic::amdgcn_ds_gws_init)
    return AMDGPU::DS_GWS_INIT;
  if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
    return AMDGPU::DS_GWS_BARRIER;
  if (IntrID == Intrinsic::amdgcn_ds_gws_sema_v)
    return AMDGPU::DS_GWS_SEMA_V;
  if (IntrID == Intrinsic::amdgcn_ds_gws_sema_br)
    return AMDGPU::DS_GWS_SEMA_BR;
  if (IntrID == Intrinsic::amdgcn_ds_gws_sema_p)
    return AMDGPU::DS_GWS_SEMA_P;
  if (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all)
    return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;

  llvm_unreachable("not a gws intrinsic");
}


/// Manually select one of the ds_gws_* intrinsics (\p IID) to the DS_GWS_*
/// opcode returned by gwsIntrinToOpcode.
///
/// \p MI has either two operands (intrinsic ID, offset) or three (intrinsic
/// ID, vsrc, offset). The offset must live in the SGPR bank. A constant
/// offset is placed in the instruction's immediate field with M0 set to 0;
/// otherwise the offset is split into a variable base, which is shifted left
/// by 16 and copied to M0, and a constant part used as the immediate.
///
/// \returns false when the subtarget lacks the required GWS support, the
/// offset is not in the SGPR bank, or a register cannot be constrained.
bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
                                                     Intrinsic::ID IID) const {
  if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
                        !STI.hasGWSSemaReleaseAll()))
    return false;

  // intrinsic ID, vsrc, offset
  const bool HasVSrc = MI.getNumOperands() == 3;
  assert(HasVSrc || MI.getNumOperands() == 2);

  // The offset is always the last operand.
  Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
  const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
  if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
    return false;

  MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
  // Assigned on both branches of the G_CONSTANT check below.
  unsigned ImmOffset;

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  MachineInstr *Readfirstlane = nullptr;

  // If we legalized the VGPR input, strip out the readfirstlane to analyze the
  // incoming offset, in case there's an add of a constant. We'll have to put it
  // back later.
  if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
    Readfirstlane = OffsetDef;
    BaseOffset = OffsetDef->getOperand(1).getReg();
    OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
  }

  if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
    // If we have a constant offset, try to use the 0 in m0 as the base.
    // TODO: Look into changing the default m0 initialization value. If the
    // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
    // the immediate offset.

    ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
      .addImm(0);
  } else {
    // Split the offset into a variable base and a constant part.
    std::tie(BaseOffset, ImmOffset) =
        AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, VT);

    if (Readfirstlane) {
      // We have the constant offset now, so put the readfirstlane back on the
      // variable component.
      if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
        return false;

      // The existing readfirstlane is rewritten in place to read the new base;
      // its result becomes the scalar base used below.
      Readfirstlane->getOperand(1).setReg(BaseOffset);
      BaseOffset = Readfirstlane->getOperand(0).getReg();
    } else {
      if (!RBI.constrainGenericRegister(BaseOffset,
                                        AMDGPU::SReg_32RegClass, *MRI))
        return false;
    }

    // Shift the base into the upper half of M0 (see the M0[21:16] note below).
    Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
      .addReg(BaseOffset)
      .addImm(16)
      .setOperandDead(3); // Dead scc

    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(M0Base);
  }

  // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
  // offset field) % 64. Some versions of the programming guide omit the m0
  // part, or claim it's from offset 0.

  unsigned Opc = gwsIntrinToOpcode(IID);
  const MCInstrDesc &InstrDesc = TII.get(Opc);

  if (HasVSrc) {
    Register VSrc = MI.getOperand(1).getReg();

    // Look up the register class the data0 operand requires. If that class
    // has a sub0 subregister, the operand is wider than the incoming value.
    int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
    const TargetRegisterClass *DataRC = TII.getRegClass(InstrDesc, Data0Idx);
    const TargetRegisterClass *SubRC =
        TRI.getSubRegisterClass(DataRC, AMDGPU::sub0);

    if (!SubRC) {
      // 32-bit normal case.
      if (!RBI.constrainGenericRegister(VSrc, *DataRC, *MRI))
        return false;

      BuildMI(*MBB, &MI, DL, InstrDesc)
        .addReg(VSrc)
        .addImm(ImmOffset)
        .cloneMemRefs(MI);
    } else {
      // Requires even register alignment, so create 64-bit value and pad the
      // top half with undef.
      Register DataReg = MRI->createVirtualRegister(DataRC);
      if (!RBI.constrainGenericRegister(VSrc, *SubRC, *MRI))
        return false;

      Register UndefReg = MRI->createVirtualRegister(SubRC);
      BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
      BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), DataReg)
        .addReg(VSrc)
        .addImm(AMDGPU::sub0)
        .addReg(UndefReg)
        .addImm(AMDGPU::sub1);

      BuildMI(*MBB, &MI, DL, InstrDesc)
        .addReg(DataReg)
        .addImm(ImmOffset)
        .cloneMemRefs(MI);
    }
  } else {
    // No data operand: only the immediate offset is emitted.
    BuildMI(*MBB, &MI, DL, InstrDesc)
      .addImm(ImmOffset)
      .cloneMemRefs(MI);
  }

  MI.eraseFromParent();
  return true;
}


/// Select ds_append (\p IsAppend true) or ds_consume (\p IsAppend false).
/// The pointer in operand 2 is split into a base, which is copied to M0, and
/// an immediate offset; if that split is not a legal DS offset the whole
/// pointer is used as the base with offset 0. The gds immediate is -1 when
/// the pointer is in the region address space and 0 otherwise.
bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
                                                      bool IsAppend) const {
  MachineOperand &PtrOp = MI.getOperand(2);
  const Register WholePtr = PtrOp.getReg();
  const bool IsGDS =
      MRI->getType(WholePtr).getAddressSpace() == AMDGPUAS::REGION_ADDRESS;

  Register PtrBase;
  unsigned Offset;
  std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(PtrOp);

  // TODO: Should this try to look through readfirstlane like GWS?
  if (!isDSOffsetLegal(PtrBase, Offset)) {
    PtrBase = WholePtr;
    Offset = 0;
  }

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(PtrBase);
  if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
  const int GDSImm = IsGDS ? -1 : 0;
  auto NewMI = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
                   .addImm(Offset)
                   .addImm(GDSImm)
                   .cloneMemRefs(MI);

  MI.eraseFromParent();
  constrainSelectedInstRegOperands(*NewMI, TII, TRI, RBI);
  return true;
}


/// Record on the function's SIMachineFunctionInfo that init_whole_wave is
/// used, then defer the actual selection to the generated matcher.
bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
  MI.getMF()->getInfo<SIMachineFunctionInfo>()->setInitWholeWave();
  return selectImpl(MI, *CoverageInfo);
}


/// Decode a texfailctrl immediate.
///
/// \param TexFailCtrl  raw control value; bit 0 is TFE, bit 1 is LWE.
/// \param [out] TFE    set to the state of bit 0.
/// \param [out] LWE    set to the state of bit 1.
/// \param [in,out] IsTexFail  set to true when \p TexFailCtrl is non-zero;
///                            left untouched otherwise.
/// \returns true if no bits other than TFE and LWE are set.
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
                         bool &IsTexFail) {
  constexpr uint64_t TFEBit = 0x1;
  constexpr uint64_t LWEBit = 0x2;

  if (TexFailCtrl != 0)
    IsTexFail = true;

  TFE = (TexFailCtrl & TFEBit) != 0;
  LWE = (TexFailCtrl & LWEBit) != 0;

  // Any bit beyond the two known ones makes the value invalid.
  const uint64_t UnknownBits = TexFailCtrl & ~(TFEBit | LWEBit);
  return UnknownBits == 0;
}


/// Manually select an image-dimension intrinsic described by \p Intr to a
/// concrete MIMG opcode.
///
/// The function derives the number of vdata and vaddr dwords from the
/// operands of \p MI, picks the encoding for the current subtarget generation
/// via AMDGPU::getMIMGOpcode, and then emits the operands in the order the
/// selected instruction expects.
///
/// \returns false when the texfail control has unknown bits, A16 is requested
/// without G16 on a subtarget lacking G16, the cache policy has unsupported
/// bits, NSA would be needed on a target without it, no opcode exists for the
/// requested dword counts, or TFE is requested on a GFX90A-insts subtarget.
bool AMDGPUInstructionSelector::selectImageIntrinsic(
  MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  unsigned IntrOpcode = Intr->BaseOpcode;

  // For image atomic: use no-return opcode if result is unused.
  if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode) {
    Register ResultDef = MI.getOperand(0).getReg();
    if (MRI->use_nodbg_empty(ResultDef))
      IntrOpcode = Intr->AtomicNoRetBaseOpcode;
  }

  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      AMDGPU::getMIMGBaseOpcodeInfo(IntrOpcode);

  const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
  const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
  const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
  const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
  const bool IsGFX13Plus = AMDGPU::isGFX13Plus(STI);

  // Intrinsic arguments start after the explicit defs and the intrinsic ID
  // operand; the Intr->*Index fields are relative to this offset.
  const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;

  Register VDataIn = AMDGPU::NoRegister;
  Register VDataOut = AMDGPU::NoRegister;
  LLT VDataTy;
  int NumVDataDwords = -1;
  bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
               MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;

  // Without a sampler there is no unorm argument to read; unorm is forced on.
  bool Unorm;
  if (!BaseOpcode->Sampler)
    Unorm = true;
  else
    Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;

  bool TFE;
  bool LWE;
  bool IsTexFail = false;
  if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
                    TFE, LWE, IsTexFail))
    return false;

  // A flags immediate follows the intrinsic's own arguments: bit 0 is A16,
  // bit 1 is G16.
  const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
  const bool IsA16 = (Flags & 1) != 0;
  const bool IsG16 = (Flags & 2) != 0;

  // A16 implies 16 bit gradients if subtarget doesn't support G16
  if (IsA16 && !STI.hasG16() && !IsG16)
    return false;

  unsigned DMask = 0;
  unsigned DMaskLanes = 0;

  if (BaseOpcode->Atomic) {
    if (!BaseOpcode->NoReturn)
      VDataOut = MI.getOperand(0).getReg();
    VDataIn = MI.getOperand(2).getReg();
    LLT Ty = MRI->getType(VDataIn);

    // Be careful to allow atomic swap on 16-bit element vectors.
    const bool Is64Bit = BaseOpcode->AtomicX2 ?
      Ty.getSizeInBits() == 128 :
      Ty.getSizeInBits() == 64;

    // For atomics the dmask is derived from the data width rather than read
    // from an operand.
    if (BaseOpcode->AtomicX2) {
      assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);

      DMask = Is64Bit ? 0xf : 0x3;
      NumVDataDwords = Is64Bit ? 4 : 2;
    } else {
      DMask = Is64Bit ? 0x3 : 0x1;
      NumVDataDwords = Is64Bit ? 2 : 1;
    }
  } else {
    DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
    DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);

    if (BaseOpcode->Store) {
      VDataIn = MI.getOperand(1).getReg();
      VDataTy = MRI->getType(VDataIn);
      NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
    } else if (BaseOpcode->NoReturn) {
      NumVDataDwords = 0;
    } else {
      VDataOut = MI.getOperand(0).getReg();
      VDataTy = MRI->getType(VDataOut);
      NumVDataDwords = DMaskLanes;

      // Packed D16 results hold two lanes per dword.
      if (IsD16 && !STI.hasUnpackedD16VMem())
        NumVDataDwords = (DMaskLanes + 1) / 2;
    }
  }

  // Set G16 opcode
  // NOTE(review): the mapping is looked up from Intr->BaseOpcode, not
  // IntrOpcode, so it would override a no-return atomic opcode chosen above;
  // presumably atomics never carry the G16 flag -- confirm.
  if (Subtarget->hasG16() && IsG16) {
    const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
        AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
    assert(G16MappingInfo);
    IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
  }

  // TODO: Check this in verifier.
  assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");

  unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
  // Keep GLC only when the atomic's result is actually used.
  if (BaseOpcode->Atomic && !BaseOpcode->NoReturn)
    CPol |= AMDGPU::CPol::GLC;
  // Reject any cache policy bit outside the set valid for this generation.
  if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
               AMDGPU::CPol::VOLATILE))
    return false;

  // Count the address registers actually present and their total dword size.
  int NumVAddrRegs = 0;
  int NumVAddrDwords = 0;
  for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
    // Skip the $noregs and 0s inserted during legalization.
    MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
    if (!AddrOp.isReg())
      continue; // XXX - Break?

    Register Addr = AddrOp.getReg();
    if (!Addr)
      break;

    ++NumVAddrRegs;
    NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
  }

  // The legalizer preprocessed the intrinsic arguments. If we aren't using
  // NSA, these should have been packed into a single value in the first
  // address register
  const bool UseNSA =
      NumVAddrRegs != 1 &&
      (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs
                                   : NumVAddrDwords == NumVAddrRegs);
  if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
    LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
    return false;
  }

  // Texfail adds one extra result dword.
  if (IsTexFail)
    ++NumVDataDwords;

  // Pick the encoding from newest generation to oldest.
  int Opcode = -1;
  if (IsGFX13Plus) {
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx13,
                                   NumVDataDwords, NumVAddrDwords);
  } else if (IsGFX12Plus) {
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
                                   NumVDataDwords, NumVAddrDwords);
  } else if (IsGFX11Plus) {
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
                                   UseNSA ? AMDGPU::MIMGEncGfx11NSA
                                          : AMDGPU::MIMGEncGfx11Default,
                                   NumVDataDwords, NumVAddrDwords);
  } else if (IsGFX10Plus) {
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
                                   UseNSA ? AMDGPU::MIMGEncGfx10NSA
                                          : AMDGPU::MIMGEncGfx10Default,
                                   NumVDataDwords, NumVAddrDwords);
  } else {
    // With GFX90A insts only the gfx90a encoding is tried; there is no
    // fallback to the older encodings.
    if (Subtarget->hasGFX90AInsts()) {
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
                                     NumVDataDwords, NumVAddrDwords);
      if (Opcode == -1) {
        LLVM_DEBUG(
            dbgs()
            << "requested image instruction is not supported on this GPU\n");
        return false;
      }
    }
    if (Opcode == -1 &&
        STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
                                     NumVDataDwords, NumVAddrDwords);
    if (Opcode == -1)
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
                                     NumVDataDwords, NumVAddrDwords);
  }
  if (Opcode == -1)
    return false;

  // From here on operands are appended in the order of the selected opcode.
  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
    .cloneMemRefs(MI);

  if (VDataOut) {
    if (BaseOpcode->AtomicX2) {
      const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;

      // The instruction defines a register twice as wide as the result; the
      // result is copied out of its low half when it has any use.
      Register TmpReg = MRI->createVirtualRegister(
        Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
      unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;

      MIB.addDef(TmpReg);
      if (!MRI->use_empty(VDataOut)) {
        BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
            .addReg(TmpReg, RegState::Kill, SubReg);
      }

    } else {
      MIB.addDef(VDataOut); // vdata output
    }
  }

  if (VDataIn)
    MIB.addReg(VDataIn); // vdata input

  for (int I = 0; I != NumVAddrRegs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
    if (SrcOp.isReg()) {
      assert(SrcOp.getReg() != 0);
      MIB.addReg(SrcOp.getReg());
    }
  }

  MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
  if (BaseOpcode->Sampler)
    MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());

  MIB.addImm(DMask); // dmask

  if (IsGFX10Plus)
    MIB.addImm(DimInfo->Encoding);
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::unorm))
    MIB.addImm(Unorm);

  MIB.addImm(CPol);
  MIB.addImm(IsA16 &&  // a16 or r128
             STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
  if (IsGFX10Plus)
    MIB.addImm(IsA16 ? -1 : 0);

  // NOTE(review): on the TFE failure path below MIB has already been inserted
  // into the block and MI has not been erased, so returning false leaves the
  // partially built instruction behind -- confirm callers tolerate this.
  if (!Subtarget->hasGFX90AInsts()) {
    MIB.addImm(TFE); // tfe
  } else if (TFE) {
    LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n");
    return false;
  }

  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::lwe))
    MIB.addImm(LWE); // lwe
  if (!IsGFX10Plus)
    MIB.addImm(DimInfo->DA ? -1 : 0);
  if (BaseOpcode->HasD16)
    MIB.addImm(IsD16 ? -1 : 0);

  MI.eraseFromParent();
  constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
  TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr);
  return true;
}


// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
//
// Selects the ds_bvh_stack_* intrinsics. Operands read from MI: 0 and 1 are
// the two results, 3 is the address, 4 and 5 are the data values and 6 is the
// immediate offset. The machine opcode is chosen from the intrinsic ID; any
// other ID is a programming error.
bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
    MachineInstr &MI) const {
  Register Dst0 = MI.getOperand(0).getReg();
  Register Dst1 = MI.getOperand(1).getReg();

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  Register Addr = MI.getOperand(3).getReg();
  Register Data0 = MI.getOperand(4).getReg();
  Register Data1 = MI.getOperand(5).getReg();
  unsigned Offset = MI.getOperand(6).getImm();

  unsigned Opc;
  switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
  case Intrinsic::amdgcn_ds_bvh_stack_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
    Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
    break;
  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
    Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
    break;
  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
    Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
    break;
  default:
    // Without this, Opc would be read uninitialized for an unexpected ID.
    llvm_unreachable("not a ds_bvh_stack intrinsic");
  }

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
                 .addDef(Dst1)
                 .addUse(Addr)
                 .addUse(Data0)
                 .addUse(Data1)
                 .addImm(Offset)
                 .cloneMemRefs(MI);

  // The original is erased before the replacement's operands are constrained.
  MI.eraseFromParent();
  constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
  return true;
}


/// Dispatch a side-effecting intrinsic to its manual selection routine.
///
/// Intrinsics with a dedicated handler return that handler's result directly.
/// Cases that end in `break` only perform a subtarget feature check and then,
/// like every intrinsic not listed here, fall through to the generated
/// matcher (selectImpl) at the end of the function.
bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
    MachineInstr &I) const {
  Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_end_cf:
    return selectEndCfIntrinsic(I);
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
    return selectDSOrderedIntrinsic(I, IntrinsicID);
  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all:
    return selectDSGWSIntrinsic(I, IntrinsicID);
  case Intrinsic::amdgcn_ds_append:
    return selectDSAppendConsume(I, true);
  case Intrinsic::amdgcn_ds_consume:
    return selectDSAppendConsume(I, false);
  case Intrinsic::amdgcn_init_whole_wave:
    return selectInitWholeWave(I);
  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_raw_buffer_load_async_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_async_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds:
    return selectBufferLoadLds(I);
  // Until we can store both the address space of the global and the LDS
  // arguments by having two MachineMemOperands on an intrinsic, we just trust
  // that the argument is a global pointer (buffer pointers have been handled by
  // a LLVM IR-level lowering).
  case Intrinsic::amdgcn_load_to_lds:
  case Intrinsic::amdgcn_load_async_to_lds:
  case Intrinsic::amdgcn_global_load_lds:
  case Intrinsic::amdgcn_global_load_async_lds:
    return selectGlobalLoadLds(I);
  case Intrinsic::amdgcn_tensor_load_to_lds:
  case Intrinsic::amdgcn_tensor_store_from_lds:
    return selectTensorLoadStore(I, IntrinsicID);
  case Intrinsic::amdgcn_asyncmark:
  case Intrinsic::amdgcn_wait_asyncmark:
    // Feature check only; selection itself is left to selectImpl below.
    if (!Subtarget->hasAsyncMark())
      return false;
    break;
  case Intrinsic::amdgcn_exp_compr:
    // Feature check only; selection itself is left to selectImpl below.
    if (!STI.hasCompressedExport()) {
      diagnoseUnsupportedIntrinsic(I);
      return false;
    }
    break;
  case Intrinsic::amdgcn_ds_bvh_stack_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
    return selectDSBvhStackIntrinsic(I);
  case Intrinsic::amdgcn_s_alloc_vgpr: {
    // S_ALLOC_VGPR doesn't have a destination register, it just implicitly sets
    // SCC. We then need to COPY it into the result vreg.
    MachineBasicBlock *MBB = I.getParent();
    const DebugLoc &DL = I.getDebugLoc();

    Register ResReg = I.getOperand(0).getReg();

    MachineInstr *AllocMI = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_ALLOC_VGPR))
                                .add(I.getOperand(2));
    (void)BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), ResReg)
        .addReg(AMDGPU::SCC);
    I.eraseFromParent();
    constrainSelectedInstRegOperands(*AllocMI, TII, TRI, RBI);
    return RBI.constrainGenericRegister(ResReg, AMDGPU::SReg_32RegClass, *MRI);
  }
  case Intrinsic::amdgcn_s_barrier_init:
  case Intrinsic::amdgcn_s_barrier_signal_var:
    return selectNamedBarrierInit(I, IntrinsicID);
  case Intrinsic::amdgcn_s_wakeup_barrier: {
    if (!STI.hasSWakeupBarrier()) {
      diagnoseUnsupportedIntrinsic(I);
      return false;
    }
    return selectNamedBarrierInst(I, IntrinsicID);
  }
  case Intrinsic::amdgcn_s_barrier_join:
  case Intrinsic::amdgcn_s_get_named_barrier_state:
    return selectNamedBarrierInst(I, IntrinsicID);
  case Intrinsic::amdgcn_s_get_barrier_state:
    return selectSGetBarrierState(I, IntrinsicID);
  case Intrinsic::amdgcn_s_barrier_signal_isfirst:
    return selectSBarrierSignalIsfirst(I, IntrinsicID);
  }
  // Everything not handled (or only feature-checked) above.
  return selectImpl(I, *CoverageInfo);
}


/// Select G_SELECT. The generated matcher is tried first. If it fails, a
/// select whose condition is a VCC lane mask becomes V_CNDMASK_B32_e64
/// (results of 32 bits or fewer only), and any other condition is copied to
/// SCC and consumed by S_CSELECT_B32 / S_CSELECT_B64.
bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
  if (selectImpl(I, *CoverageInfo))
    return true;

  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  Register DstReg = I.getOperand(0).getReg();
  const unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
  assert(Size <= 32 || Size == 64);

  const MachineOperand &CondOp = I.getOperand(1);
  Register CondReg = CondOp.getReg();

  if (isVCC(CondReg, *MRI)) {
    // Wide VGPR select should have been split in RegBankSelect.
    if (Size > 32)
      return false;

    // V_CNDMASK takes the false value (operand 3) before the true value
    // (operand 2), each preceded by a zero modifier immediate.
    MachineInstr *VSelect =
        BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
            .addImm(0)
            .add(I.getOperand(3))
            .addImm(0)
            .add(I.getOperand(2))
            .add(I.getOperand(1));

    constrainSelectedInstRegOperands(*VSelect, TII, TRI, RBI);
    I.eraseFromParent();
    return true;
  }

  // Scalar condition: route it through SCC.
  MachineInstr *SCCCopy =
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC).addReg(CondReg);

  // The generic constrainSelectedInstRegOperands doesn't work for the scc
  // register bank, because it does not cover the register class that we used
  // to represent for it.  So we need to manually set the register class here.
  if (!MRI->getRegClassOrNull(CondReg))
    MRI->setRegClass(CondReg,
                     TRI.getConstrainedRegClassForOperand(CondOp, *MRI));

  const unsigned CSelectOpc =
      Size == 64 ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
  MachineInstr *SSelect = BuildMI(*BB, &I, DL, TII.get(CSelectOpc), DstReg)
                              .add(I.getOperand(2))
                              .add(I.getOperand(3));

  constrainSelectedInstRegOperands(*SSelect, TII, TRI, RBI);
  constrainSelectedInstRegOperands(*SCCCopy, TII, TRI, RBI);
  I.eraseFromParent();
  return true;
}


/// Select G_TRUNC.
///
/// Three shapes are handled:
///  - a 32-bit source truncated into VGPR_16 becomes a COPY of the lo16
///    subregister;
///  - <2 x s32> -> <2 x s16> is packed into one 32-bit register, either with
///    V_MOV_B32_sdwa or with a shift / and / or sequence;
///  - any other scalar truncate is rewritten in place into a COPY, reading a
///    subregister of the source when the source is wider than 32 bits.
///
/// \returns false when the banks, register classes or types are not ones this
/// function handles.
bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {

  Register DstReg = I.getOperand(0).getReg();

  Register SrcReg = I.getOperand(1).getReg();

  const LLT DstTy = MRI->getType(DstReg);

  const LLT SrcTy = MRI->getType(SrcReg);

  const LLT S1 = LLT::scalar(1);


  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);

  const RegisterBank *DstRB;

  if (DstTy == S1) {

    // This is a special case. We don't treat s1 for legalization artifacts as

    // vcc booleans.

    DstRB = SrcRB;

  } else {

    DstRB = RBI.getRegBank(DstReg, *MRI, TRI);

    // Source and destination must live on the same bank.
    if (SrcRB != DstRB)

      return false;

  }


  const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;


  unsigned DstSize = DstTy.getSizeInBits();

  unsigned SrcSize = SrcTy.getSizeInBits();


  const TargetRegisterClass *SrcRC =

      TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB);

  const TargetRegisterClass *DstRC =

      TRI.getRegClassForSizeOnBank(DstSize, *DstRB);

  if (!SrcRC || !DstRC)

    return false;


  if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||

      !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {

    LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");

    return false;

  }


  // 32-bit source into a 16-bit VGPR: copy the lo16 subregister.
  if (DstRC == &AMDGPU::VGPR_16RegClass && SrcSize == 32) {

    assert(STI.useRealTrue16Insts());

    const DebugLoc &DL = I.getDebugLoc();

    MachineBasicBlock *MBB = I.getParent();

    BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), DstReg)

        .addReg(SrcReg, {}, AMDGPU::lo16);

    I.eraseFromParent();

    return true;

  }


  // <2 x s32> -> <2 x s16>: pack the low halves of both elements into DstReg.
  if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {

    MachineBasicBlock *MBB = I.getParent();

    const DebugLoc &DL = I.getDebugLoc();


    // Split the source into its two 32-bit elements.
    Register LoReg = MRI->createVirtualRegister(DstRC);

    Register HiReg = MRI->createVirtualRegister(DstRC);

    BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)

        .addReg(SrcReg, {}, AMDGPU::sub0);

    BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)

        .addReg(SrcReg, {}, AMDGPU::sub1);


    if (IsVALU && STI.hasSDWA()) {

      // Write the low 16-bits of the high element into the high 16-bits of the

      // low element.

      MachineInstr *MovSDWA =

        BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)

        .addImm(0)                             // $src0_modifiers

        .addReg(HiReg)                         // $src0

        .addImm(0)                             // $clamp

        .addImm(AMDGPU::SDWA::WORD_1)          // $dst_sel

        .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused

        .addImm(AMDGPU::SDWA::WORD_0)          // $src0_sel

        .addReg(LoReg, RegState::Implicit);

      // Tie the result to the trailing implicit LoReg use.
      MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);

    } else {

      // Without SDWA: DstReg = (HiReg << 16) | (LoReg & 0xffff).
      Register TmpReg0 = MRI->createVirtualRegister(DstRC);

      Register TmpReg1 = MRI->createVirtualRegister(DstRC);

      Register ImmReg = MRI->createVirtualRegister(DstRC);

      // Note the operand order differs: V_LSHLREV takes the shift amount
      // first, S_LSHL takes it second.
      if (IsVALU) {

        BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)

          .addImm(16)

          .addReg(HiReg);

      } else {

        BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)

          .addReg(HiReg)

          .addImm(16)

          .setOperandDead(3); // Dead scc

      }


      unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;

      unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;

      unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;


      // Materialize the 0xffff mask in a register.
      BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)

        .addImm(0xffff);

      auto And = BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)

        .addReg(LoReg)

        .addReg(ImmReg);

      auto Or = BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)

        .addReg(TmpReg0)

        .addReg(TmpReg1);


      if (!IsVALU) {

        And.setOperandDead(3); // Dead scc

        Or.setOperandDead(3); // Dead scc

      }

    }


    I.eraseFromParent();

    return true;

  }


  // Only scalar destinations are handled from here on.
  if (!DstTy.isScalar())

    return false;


  if (SrcSize > 32) {

    // Destinations narrower than 32 bits read sub0; otherwise ask for the
    // subregister index covering DstSize / 32 channels starting at channel 0.
    unsigned SubRegIdx = DstSize < 32

                             ? static_cast<unsigned>(AMDGPU::sub0)

                             : TRI.getSubRegFromChannel(0, DstSize / 32);

    if (SubRegIdx == AMDGPU::NoSubRegister)

      return false;


    // Deal with weird cases where the class only partially supports the subreg

    // index.

    const TargetRegisterClass *SrcWithSubRC

      = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);

    if (!SrcWithSubRC)

      return false;


    if (SrcWithSubRC != SrcRC) {

      if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))

        return false;

    }


    I.getOperand(1).setSubReg(SubRegIdx);

  }


  // Mutate the G_TRUNC itself into a COPY rather than building a new one.
  I.setDesc(TII.get(TargetOpcode::COPY));

  return true;

}


/// Store the mask with the low \p Size bits set into \p Mask.
///
/// \returns true if a bitmask for \p Size bits will be an inline immediate,
/// i.e. the mask reinterpreted as a signed 32-bit integer lies in [-16, 64].
static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
  Mask = maskTrailingOnes<unsigned>(Size);

  const int AsSigned = static_cast<int>(Mask);
  const bool BelowRange = AsSigned < -16;
  const bool AboveRange = AsSigned > 64;
  return !(BelowRange || AboveRange);
}


// Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
//
// Returns the bank recorded on \p Reg if there is one, otherwise the bank
// derived from its register class, otherwise nullptr.
const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
    Register Reg, const MachineRegisterInfo &MRI,
    const TargetRegisterInfo &TRI) const {
  const RegClassOrRegBank &Assignment = MRI.getRegClassOrRegBank(Reg);

  if (auto *Bank = dyn_cast<const RegisterBank *>(Assignment))
    return Bank;

  auto *Class = dyn_cast<const TargetRegisterClass *>(Assignment);
  if (!Class)
    return nullptr;

  // Ignore the type, since we don't use vcc in artifacts.
  return &RBI.getRegBankFromRegClass(*Class, LLT());
}


/// Select an integer extension.
///
/// The opcodes inspected here are G_SEXT_INREG, G_SEXT and G_ANYEXT; any other
/// opcode reaching this function is treated as an unsigned extension.
///
///  - G_ANYEXT is a plain copy up to 32 bits, otherwise a REG_SEQUENCE of the
///    source and an IMPLICIT_DEF high half.
///  - A VGPR source with a destination of at most 32 bits uses V_AND_B32 with
///    an inline mask when possible, else V_BFE_{I,U}32.
///  - An SGPR source with a destination of at most 64 bits uses S_SEXT_I32_*,
///    a computed high half, S_BFE_* or S_AND_B32 depending on the sizes.
///
/// \returns false for vector destinations and unhandled bank/size
/// combinations.
bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {

  bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;

  bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;

  const DebugLoc &DL = I.getDebugLoc();

  MachineBasicBlock &MBB = *I.getParent();

  const Register DstReg = I.getOperand(0).getReg();

  const Register SrcReg = I.getOperand(1).getReg();


  const LLT DstTy = MRI->getType(DstReg);

  const LLT SrcTy = MRI->getType(SrcReg);

  // For G_SEXT_INREG the number of meaningful source bits is the immediate in
  // operand 2; otherwise it is the size of the source type.
  const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?

    I.getOperand(2).getImm() : SrcTy.getSizeInBits();

  const unsigned DstSize = DstTy.getSizeInBits();

  if (!DstTy.isScalar())

    return false;


  // Artifact casts should never use vcc.

  const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);


  // FIXME: This should probably be illegal and split earlier.

  if (I.getOpcode() == AMDGPU::G_ANYEXT) {

    if (DstSize <= 32)

      return selectCOPY(I);


    const TargetRegisterClass *SrcRC =

        TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank);

    const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);

    const TargetRegisterClass *DstRC =

        TRI.getRegClassForSizeOnBank(DstSize, *DstBank);


    // The high half of an any-extend is unspecified, so leave it undefined.
    Register UndefReg = MRI->createVirtualRegister(SrcRC);

    BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);

    BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)

      .addReg(SrcReg)

      .addImm(AMDGPU::sub0)

      .addReg(UndefReg)

      .addImm(AMDGPU::sub1);

    I.eraseFromParent();


    return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&

           RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);

  }


  if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {

    // 64-bit should have been split up in RegBankSelect


    // Try to use an and with a mask if it will save code size.

    unsigned Mask;

    if (!Signed && shouldUseAndMask(SrcSize, Mask)) {

      MachineInstr *ExtI =

      BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)

        .addImm(Mask)

        .addReg(SrcReg);

      I.eraseFromParent();

      constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);

      return true;

    }


    // Otherwise extract SrcSize bits starting at bit 0 with a bitfield
    // extract of the matching signedness.
    const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;

    MachineInstr *ExtI =

      BuildMI(MBB, I, DL, TII.get(BFE), DstReg)

      .addReg(SrcReg)

      .addImm(0) // Offset

      .addImm(SrcSize); // Width

    I.eraseFromParent();

    constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);

    return true;

  }


  if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {

    // Only an in-register extend wider than 32 bits has a 64-bit source.
    const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?

      AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;

    if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))

      return false;


    // Sign extension from 8 or 16 bits to 32 bits has dedicated opcodes.
    if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {

      const unsigned SextOpc = SrcSize == 8 ?

        AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;

      BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)

        .addReg(SrcReg);

      I.eraseFromParent();

      return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);

    }


    // Using a single 32-bit SALU to calculate the high half is smaller than

    // S_BFE with a literal constant operand.

    if (DstSize > 32 && SrcSize == 32) {

      Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);

      // For the in-register form the source is 64 bits wide, so read its sub0.
      unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;

      if (Signed) {

        // High half = source shifted right arithmetically by 31.
        BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg)

            .addReg(SrcReg, {}, SubReg)

            .addImm(31)

            .setOperandDead(3); // Dead scc

      } else {

        // High half = 0.
        BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg)

          .addImm(0);

      }

      BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)

          .addReg(SrcReg, {}, SubReg)

          .addImm(AMDGPU::sub0)

          .addReg(HiReg)

          .addImm(AMDGPU::sub1);

      I.eraseFromParent();

      return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass,

                                          *MRI);

    }


    const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;

    const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;


    // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width.

    if (DstSize > 32 && (SrcSize <= 32 || InReg)) {

      // We need a 64-bit register source, but the high bits don't matter.

      Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);

      Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);

      unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;


      BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);

      BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)

          .addReg(SrcReg, {}, SubReg)

          .addImm(AMDGPU::sub0)

          .addReg(UndefReg)

          .addImm(AMDGPU::sub1);


      // Offset 0, width SrcSize (placed in the width field at bit 16).
      BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)

        .addReg(ExtReg)

        .addImm(SrcSize << 16);


      I.eraseFromParent();

      return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);

    }


    // Remaining case: 32-bit result. Prefer an AND with an inline mask for
    // unsigned extends, else a 32-bit scalar bitfield extract.
    unsigned Mask;

    if (!Signed && shouldUseAndMask(SrcSize, Mask)) {

      BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)

        .addReg(SrcReg)

        .addImm(Mask)

        .setOperandDead(3); // Dead scc

    } else {

      BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)

        .addReg(SrcReg)

        .addImm(SrcSize << 16);

    }


    I.eraseFromParent();

    return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);

  }


  return false;

}


/// Look through copies feeding \p Reg and return the source register reported
/// by getDefSrcRegIgnoringCopies. The lookup result is dereferenced
/// unconditionally.
static Register stripCopy(Register Reg, MachineRegisterInfo &MRI) {
  auto DefAndSrc = getDefSrcRegIgnoringCopies(Reg, MRI);
  return DefAndSrc->Reg;
}


/// If \p Reg is defined by a G_BITCAST, return the bitcast's source register;
/// otherwise return \p Reg unchanged.
static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI) {
  Register CastSrc;
  const bool IsBitcast = mi_match(Reg, MRI, m_GBitcast(m_Reg(CastSrc)));
  return IsBitcast ? CastSrc : Reg;
}


/// Check whether \p In is the upper 16-bit half of some wider value.
///
/// Recognized forms:
///  - the second result of a two-result G_UNMERGE_VALUES whose results are 16
///    bits wide;
///  - G_TRUNC of a G_LSHR by the constant 16;
///  - G_TRUNC of a G_SHUFFLE_VECTOR whose mask starts with element 1.
///
/// On success \p Out receives the register holding the wider value.
static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
                           Register &Out) {
  // When unmerging a register that is composed of 2 x 16-bit values, an
  // extract-hi instruction can be used for the upper 16 bits. Checking the
  // size of `In` is enough because all defs of a GUnmerge share one type.
  if (auto *Unmerge = dyn_cast<GUnmerge>(MRI.getVRegDef(In))) {
    const bool IsSecondOfTwo =
        Unmerge->getNumDefs() == 2 && Unmerge->getOperand(1).getReg() == In;
    if (IsSecondOfTwo && MRI.getType(In).getSizeInBits() == 16) {
      Out = Unmerge->getSourceReg();
      return true;
    }
  }

  // Every remaining form is rooted at a G_TRUNC.
  Register TruncSrc;
  if (!mi_match(In, MRI, m_GTrunc(m_Reg(TruncSrc))))
    return false;

  // trunc (lshr X, 16): the shift amount is looked up through copies and the
  // shifted value through one bitcast.
  Register ShiftedReg;
  Register ShiftAmtReg;
  if (mi_match(TruncSrc, MRI,
               m_GLShr(m_Reg(ShiftedReg), m_Reg(ShiftAmtReg)))) {
    ShiftAmtReg = stripCopy(ShiftAmtReg, MRI);
    if (mi_match(ShiftAmtReg, MRI, m_SpecificICst(16))) {
      Out = stripBitCast(ShiftedReg, MRI);
      return true;
    }
  }

  // trunc (shuffle_vector ...)
  MachineInstr *TruncDef = MRI.getVRegDef(TruncSrc);
  if (TruncDef->getOpcode() != AMDGPU::G_SHUFFLE_VECTOR)
    return false;

  assert(MRI.getType(TruncDef->getOperand(0).getReg()) ==
         LLT::fixed_vector(2, 16));

  ArrayRef<int> ShuffleMask = TruncDef->getOperand(3).getShuffleMask();
  assert(ShuffleMask.size() == 2);

  // Lane 0 must select element 1; lane 1 may be anything that is <= 1 (this
  // also admits negative mask entries).
  if (ShuffleMask[0] != 1 || ShuffleMask[1] > 1)
    return false;

  Out = TruncDef->getOperand(0).getReg();
  return true;
}


/// Manually select G_FPEXT for one case only: an SGPR s16 -> s32 extension
/// whose source is the high half of a wider value (per isExtractHiElt), on
/// subtargets with SALU float instructions. It becomes S_CVT_HI_F32_F16.
///
/// \returns false in every other case.
bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
  if (!Subtarget->hasSALUFloatInsts())
    return false;

  Register Dst = I.getOperand(0).getReg();
  const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
  if (DstBank->getID() != AMDGPU::SGPRRegBankID)
    return false;

  Register Src = I.getOperand(1).getReg();

  const bool IsS16ToS32 = MRI->getType(Dst) == LLT::scalar(32) &&
                          MRI->getType(Src) == LLT::scalar(16);
  if (!IsS16ToS32)
    return false;

  // On a match, Src is overwritten with the register holding the wide value.
  if (!isExtractHiElt(*MRI, Src, Src))
    return false;

  MachineBasicBlock *MBB = I.getParent();
  BuildMI(*MBB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst)
      .addUse(Src);
  I.eraseFromParent();
  return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
}


/// Manually select the 64-bit SGPR G_FNEG, folding a G_FABS feeding the
/// source when present. The sign bit of the high 32-bit half is toggled with
/// S_XOR_B32, or set with S_OR_B32 for fneg(fabs(x)).
///
/// \returns false for any other bank or type.
bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
  // Only manually handle the f64 SGPR case.
  //
  // FIXME: This is a workaround for 2.5 different tablegen problems. Because
  // the bit ops theoretically have a second result due to the implicit def of
  // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
  // that is easy by disabling the check. The result works, but uses a
  // nonsensical sreg32orlds_and_sreg_1 regclass.
  //
  // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to
  // the variadic REG_SEQUENCE operands.

  Register ResultReg = MI.getOperand(0).getReg();
  const RegisterBank *ResultBank = RBI.getRegBank(ResultReg, *MRI, TRI);
  const bool IsSGPR = ResultBank->getID() == AMDGPU::SGPRRegBankID;
  if (!IsSGPR || MRI->getType(ResultReg) != LLT::scalar(64))
    return false;

  // Look through a G_FABS on the input so both operations are folded.
  Register InputReg = MI.getOperand(1).getReg();
  MachineInstr *FabsDef = getOpcodeDef(TargetOpcode::G_FABS, InputReg, *MRI);
  const bool FoldFabs = FabsDef != nullptr;
  if (FoldFabs)
    InputReg = FabsDef->getOperand(1).getReg();

  if (!RBI.constrainGenericRegister(InputReg, AMDGPU::SReg_64RegClass, *MRI) ||
      !RBI.constrainGenericRegister(ResultReg, AMDGPU::SReg_64RegClass, *MRI))
    return false;

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &Loc = MI.getDebugLoc();
  const TargetRegisterClass *HalfRC = &AMDGPU::SReg_32RegClass;
  Register LowHalf = MRI->createVirtualRegister(HalfRC);
  Register HighHalf = MRI->createVirtualRegister(HalfRC);
  Register SignMask = MRI->createVirtualRegister(HalfRC);
  Register NewHigh = MRI->createVirtualRegister(HalfRC);

  // Split the 64-bit input and materialize the sign-bit mask.
  BuildMI(*MBB, &MI, Loc, TII.get(AMDGPU::COPY), LowHalf)
      .addReg(InputReg, {}, AMDGPU::sub0);
  BuildMI(*MBB, &MI, Loc, TII.get(AMDGPU::COPY), HighHalf)
      .addReg(InputReg, {}, AMDGPU::sub1);
  BuildMI(*MBB, &MI, Loc, TII.get(AMDGPU::S_MOV_B32), SignMask)
      .addImm(0x80000000);

  // Set or toggle sign bit.
  unsigned SignOpc = AMDGPU::S_XOR_B32;
  if (FoldFabs)
    SignOpc = AMDGPU::S_OR_B32;
  BuildMI(*MBB, &MI, Loc, TII.get(SignOpc), NewHigh)
      .addReg(HighHalf)
      .addReg(SignMask)
      .setOperandDead(3); // Dead scc

  // Reassemble the result from the untouched low half and the new high half.
  BuildMI(*MBB, &MI, Loc, TII.get(AMDGPU::REG_SEQUENCE), ResultReg)
      .addReg(LowHalf)
      .addImm(AMDGPU::sub0)
      .addReg(NewHigh)
      .addImm(AMDGPU::sub1);
  MI.eraseFromParent();
  return true;
}


// FIXME: This is a workaround for the same tablegen problems as G_FNEG
/// Manually select the 64-bit SGPR G_FABS by clearing the sign bit of the
/// high 32-bit half with S_AND_B32 and reassembling the value with
/// REG_SEQUENCE.
///
/// \returns false for any other bank or type, or if the source or destination
/// cannot be constrained to SReg_64.
bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
  Register Dst = MI.getOperand(0).getReg();
  const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
  if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
      MRI->getType(Dst) != LLT::scalar(64))
    return false;

  Register Src = MI.getOperand(1).getReg();

  // Constrain the operands before creating any temporaries, so that bailing
  // out here does not leave unused virtual registers behind. This matches the
  // ordering used by selectG_FNEG.
  if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
    return false;

  MachineBasicBlock *BB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);

  // Split the 64-bit source and materialize the mask that keeps every bit
  // except the sign bit.
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
      .addReg(Src, {}, AMDGPU::sub0);
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
      .addReg(Src, {}, AMDGPU::sub1);
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
    .addImm(0x7fffffff);

  // Clear sign bit.
  // TODO: Should this use S_BITSET0_*?
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
    .addReg(HiReg)
    .addReg(ConstReg)
    .setOperandDead(3); // Dead scc

  // Reassemble the result from the untouched low half and the masked high
  // half.
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
    .addReg(LoReg)
    .addImm(AMDGPU::sub0)
    .addReg(OpReg)
    .addImm(AMDGPU::sub1);

  MI.eraseFromParent();
  return true;
}


/// \returns true if \p MI is a G_CONSTANT.
static bool isConstant(const MachineInstr &MI) {
  const unsigned Opc = MI.getOpcode();
  return Opc == TargetOpcode::G_CONSTANT;
}


void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,

    const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {


  unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1;

  const MachineInstr *PtrMI =

      MRI.getUniqueVRegDef(Load.getOperand(OpNo).getReg());


  assert(PtrMI);


  if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)

    return;


  GEPInfo GEPInfo;


  for (unsigned i = 1; i != 3; ++i) {

    const MachineOperand &GEPOp = PtrMI->getOperand(i);

    const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());

    assert(OpDef);

    if (i == 2 && isConstant(*OpDef)) {

      // TODO: Could handle constant base + variable offset, but a combine

      // probably should have commuted it.

      assert(GEPInfo.Imm == 0);

      GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();

      continue;

    }

    const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);

    if (OpBank->getID() == AMDGPU::SGPRRegBankID)

      GEPInfo.SgprParts.push_back(GEPOp.getReg());

    else

      GEPInfo.VgprParts.push_back(GEPOp.getReg());

  }


  AddrInfo.push_back(GEPInfo);

  getAddrModeInfo(*PtrMI, MRI, AddrInfo);

}


/// \returns true if \p Reg is assigned to the SGPR register bank.
bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
  const RegisterBank *Bank = RBI.getRegBank(Reg, *MRI, TRI);
  return Bank->getID() == AMDGPU::SGPRRegBankID;
}


bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {

  if (!MI.hasOneMemOperand())

    return false;


  const MachineMemOperand *MMO = *MI.memoperands_begin();

  const Value *Ptr = MMO->getValue();


  // UndefValue means this is a load of a kernel input.  These are uniform.

  // Sometimes LDS instructions have constant pointers.

  // If Ptr is null, then that means this mem operand contains a

  // PseudoSourceValue like GOT.

  if (!Ptr || isa<UndefValue, Argument, Constant, GlobalValue>(Ptr))

    return true;


  if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)

    return true;


  if (MI.getOpcode() == AMDGPU::G_PREFETCH)

    return RBI.getRegBank(MI.getOperand(0).getReg(), *MRI, TRI)->getID() ==

           AMDGPU::SGPRRegBankID;


  const Instruction *I = dyn_cast<Instruction>(Ptr);

  return I && I->getMetadata("amdgpu.uniform");

}


/// \returns true if any entry of \p AddrInfo has at least one VGPR part.
bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
  for (const GEPInfo &Entry : AddrInfo) {
    if (Entry.VgprParts.empty())
      continue;
    return true;
  }
  return false;
}


/// Emit `S_MOV_B32 m0, -1` ahead of \p I when its pointer operand (operand 1)
/// is in the local or region address space and the subtarget reports that LDS
/// requires M0 to be initialized. Otherwise nothing is emitted.
void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
  const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
  const unsigned AddrSpace = PtrTy.getAddressSpace();

  const bool IsLocalOrRegion = AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
                               AddrSpace == AMDGPUAS::REGION_ADDRESS;
  if (!IsLocalOrRegion || !STI.ldsRequiresM0Init())
    return;

  // If DS instructions require M0 initialization, insert it before selecting.
  MachineBasicBlock *MBB = I.getParent();
  BuildMI(*MBB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
      .addImm(-1);
}


/// Select a load, store or atomic RMW: emit the M0 initialization if one is
/// needed (see initM0), then defer to the imported TableGen patterns.
bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
    MachineInstr &I) const {
  initM0(I);

  const bool Selected = selectImpl(I, *CoverageInfo);
  return Selected;
}


/// \returns true if \p Reg is a virtual register whose value comes from a
/// G_ICMP, a G_FCMP or the amdgcn_class intrinsic, looking through COPYs and
/// through G_AND / G_OR / G_XOR when both of their inputs qualify. Physical
/// registers never qualify.
static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) {
  if (Reg.isPhysical())
    return false;

  MachineInstr &Def = *MRI.getUniqueVRegDef(Reg);

  switch (Def.getOpcode()) {
  case AMDGPU::COPY:
    return isVCmpResult(Def.getOperand(1).getReg(), MRI);
  case AMDGPU::G_AND:
  case AMDGPU::G_OR:
  case AMDGPU::G_XOR:
    // A bitwise combination qualifies only if both inputs do.
    return isVCmpResult(Def.getOperand(1).getReg(), MRI) &&
           isVCmpResult(Def.getOperand(2).getReg(), MRI);
  case AMDGPU::G_ICMP:
  case AMDGPU::G_FCMP:
    return true;
  default:
    break;
  }

  if (auto *Intrin = dyn_cast<GIntrinsic>(&Def))
    return Intrin->is(Intrinsic::amdgcn_class);
  return false;
}


/// Select G_BRCOND.
///
/// A VCC condition is copied to the VCC physical register (after being ANDed
/// with exec unless it is known to come from a compare, see isVCmpResult) and
/// branched on with S_CBRANCH_VCCNZ. Any other condition must be an s32; it
/// is copied to SCC and branched on with S_CBRANCH_SCC1.
bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
  MachineBasicBlock *MBB = I.getParent();
  Register CondReg = I.getOperand(0).getReg();
  const DebugLoc &Loc = I.getDebugLoc();

  unsigned BranchOpc;
  Register PhysCondReg;
  const TargetRegisterClass *CondRC;

  // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
  // whether the branch is uniform when selecting the instruction. In
  // GlobalISel, we should push that decision into RegBankSelect. Assume for now
  // RegBankSelect knows what it's doing if the branch condition is scc, even
  // though it currently does not.
  if (isVCC(CondReg, *MRI)) {
    // FIXME: Should scc->vcc copies and with exec?

    // Unless the value of CondReg is a result of a V_CMP* instruction then we
    // need to insert an and with exec.
    if (!isVCmpResult(CondReg, *MRI)) {
      const bool Wave64 = STI.isWave64();
      const unsigned AndOpc = Wave64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
      const Register ExecReg = Wave64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;

      Register MaskedCond = MRI->createVirtualRegister(TRI.getBoolRC());
      BuildMI(*MBB, &I, Loc, TII.get(AndOpc), MaskedCond)
          .addReg(CondReg)
          .addReg(ExecReg)
          .setOperandDead(3); // Dead scc
      CondReg = MaskedCond;
    }

    PhysCondReg = TRI.getVCC();
    BranchOpc = AMDGPU::S_CBRANCH_VCCNZ;
    CondRC = TRI.getBoolRC();
  } else {
    if (MRI->getType(CondReg) != LLT::scalar(32))
      return false;

    PhysCondReg = AMDGPU::SCC;
    BranchOpc = AMDGPU::S_CBRANCH_SCC1;
    CondRC = &AMDGPU::SReg_32RegClass;
  }

  // Give the condition a register class if it does not have one yet.
  if (!MRI->getRegClassOrNull(CondReg))
    MRI->setRegClass(CondReg, CondRC);

  BuildMI(*MBB, &I, Loc, TII.get(AMDGPU::COPY), PhysCondReg)
      .addReg(CondReg);
  BuildMI(*MBB, &I, Loc, TII.get(BranchOpc))
      .addMBB(I.getOperand(1).getMBB());

  I.eraseFromParent();
  return true;
}


/// Select G_GLOBAL_VALUE by mutating \p I in place into a 32-bit move:
/// V_MOV_B32_e32 (with an added implicit EXEC operand) for a VGPR destination,
/// S_MOV_B32 otherwise.
///
/// \returns whether the destination could be constrained to the matching
/// 32-bit register class.
bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
    MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);

  if (DstBank->getID() == AMDGPU::VGPRRegBankID) {
    I.setDesc(TII.get(AMDGPU::V_MOV_B32_e32));
    I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, /*isDef=*/false,
                                                /*isImp=*/true));
    return RBI.constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, *MRI);
  }

  I.setDesc(TII.get(AMDGPU::S_MOV_B32));
  return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
}


/// Select G_PTRMASK (pointer AND mask).
///
///  - A 64-bit SGPR pointer where neither 32-bit half of the mask is known to
///    be all ones becomes a single S_AND_B64.
///  - A 32-bit pointer becomes one 32-bit AND on the matching bank.
///  - Otherwise the pointer is split into 32-bit halves; a half whose mask
///    bits are all known ones is passed through, the other is ANDed with the
///    corresponding mask half, and the result is rebuilt with REG_SEQUENCE.
///
/// \returns false if source and destination banks differ or an operand
/// cannot be constrained.
bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {

  Register DstReg = I.getOperand(0).getReg();

  Register SrcReg = I.getOperand(1).getReg();

  Register MaskReg = I.getOperand(2).getReg();

  LLT Ty = MRI->getType(DstReg);

  LLT MaskTy = MRI->getType(MaskReg);

  MachineBasicBlock *BB = I.getParent();

  const DebugLoc &DL = I.getDebugLoc();


  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);

  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);

  const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);

  const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;

  if (DstRB != SrcRB) // Should only happen for hand written MIR.

    return false;


  // Try to avoid emitting a bit operation when we only need to touch half of

  // the 64-bit pointer.

  // Known-one bits of the mask, zero-extended to 64 bits so both halves can
  // be tested uniformly.
  APInt MaskOnes = VT->getKnownOnes(MaskReg).zext(64);

  const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);

  const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);


  // A half can be copied unchanged when all of its mask bits are known ones.
  const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;

  const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;


  // 64-bit scalar pointer with both halves needing masking: one S_AND_B64.
  if (!IsVGPR && Ty.getSizeInBits() == 64 &&

      !CanCopyLow32 && !CanCopyHi32) {

    auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg)

      .addReg(SrcReg)

      .addReg(MaskReg)

      .setOperandDead(3); // Dead scc

    I.eraseFromParent();

    constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);

    return true;

  }


  // Everything below works on 32-bit pieces.
  unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;

  const TargetRegisterClass &RegRC

    = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;


  const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB);

  const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB);

  const TargetRegisterClass *MaskRC =

      TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB);


  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||

      !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||

      !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))

    return false;


  // 32-bit pointer: a single AND of pointer and mask.
  if (Ty.getSizeInBits() == 32) {

    assert(MaskTy.getSizeInBits() == 32 &&

           "ptrmask should have been narrowed during legalize");


    auto NewOp = BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)

      .addReg(SrcReg)

      .addReg(MaskReg);


    if (!IsVGPR)

      NewOp.setOperandDead(3); // Dead scc

    I.eraseFromParent();

    return true;

  }


  Register HiReg = MRI->createVirtualRegister(&RegRC);

  Register LoReg = MRI->createVirtualRegister(&RegRC);


  // Extract the subregisters from the source pointer.

  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)

      .addReg(SrcReg, {}, AMDGPU::sub0);

  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)

      .addReg(SrcReg, {}, AMDGPU::sub1);


  Register MaskedLo, MaskedHi;


  if (CanCopyLow32) {

    // If all the bits in the low half are 1, we only need a copy for it.

    MaskedLo = LoReg;

  } else {

    // Extract the mask subregister and apply the and.

    Register MaskLo = MRI->createVirtualRegister(&RegRC);

    MaskedLo = MRI->createVirtualRegister(&RegRC);


    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)

        .addReg(MaskReg, {}, AMDGPU::sub0);

    BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)

      .addReg(LoReg)

      .addReg(MaskLo);

  }


  if (CanCopyHi32) {

    // If all the bits in the high half are 1, we only need a copy for it.

    MaskedHi = HiReg;

  } else {

    // Same as the low half, using sub1 of the mask.
    Register MaskHi = MRI->createVirtualRegister(&RegRC);

    MaskedHi = MRI->createVirtualRegister(&RegRC);


    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)

        .addReg(MaskReg, {}, AMDGPU::sub1);

    BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)

      .addReg(HiReg)

      .addReg(MaskHi);

  }


  // Rebuild the 64-bit result from the two (possibly masked) halves.
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)

    .addReg(MaskedLo)

    .addImm(AMDGPU::sub0)

    .addReg(MaskedHi)

    .addImm(AMDGPU::sub1);

  I.eraseFromParent();

  return true;

}


/// Return the register to use for the index value, and the subregister to use
/// for the indirectly accessed register.
///
/// \p IdxReg is decomposed into a base register plus a constant offset. When
/// the offset addresses one of the \p EltSize sized parts of \p SuperRC, the
/// base register and that part's subregister index are returned; otherwise the
/// original \p IdxReg is paired with the first part.
static std::pair<Register, unsigned>
computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI,
                        const TargetRegisterClass *SuperRC, Register IdxReg,
                        unsigned EltSize, GISelValueTracking &ValueTracking) {
  Register BaseReg;
  int Offset;

  std::tie(BaseReg, Offset) =
      AMDGPU::getBaseWithConstantOffset(MRI, IdxReg, &ValueTracking);
  if (BaseReg == AMDGPU::NoRegister) {
    // This will happen if the index is a known constant. This should ordinarily
    // be legalized out, but handle it as a register just in case.
    assert(Offset == 0);
    BaseReg = IdxReg;
  }

  ArrayRef<int16_t> Parts = TRI.getRegSplitParts(SuperRC, EltSize);

  // Only fold offsets that stay in bounds, or else we would end up using an
  // undefined register. The unsigned view also rejects negative offsets.
  const unsigned PartIdx = static_cast<unsigned>(Offset);
  if (PartIdx < Parts.size())
    return std::pair(BaseReg, Parts[PartIdx]);
  return std::pair(IdxReg, Parts[0]);
}


/// Select G_EXTRACT_VECTOR_ELT with a dynamic SGPR index.
///
///  - SGPR vector, 32- or 64-bit element: copy the index to M0 and use
///    S_MOVRELS_B32 / S_MOVRELS_B64.
///  - VGPR vector, 32-bit element: copy the index to M0 and use
///    V_MOVRELS_B32_e32, or use the indirect GPR-index pseudo when the
///    subtarget uses VGPR index mode.
///
/// \returns false for a non-SGPR index, unknown register classes, operands
/// that cannot be constrained, or unsupported element sizes.
bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
    MachineInstr &MI) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register VecReg = MI.getOperand(1).getReg();
  Register IdxReg = MI.getOperand(2).getReg();

  LLT EltTy = MRI->getType(DstReg);
  LLT VecTy = MRI->getType(VecReg);

  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const RegisterBank *VecBank = RBI.getRegBank(VecReg, *MRI, TRI);
  const RegisterBank *IdxBank = RBI.getRegBank(IdxReg, *MRI, TRI);

  // The index must be scalar. If it wasn't RegBankSelect should have moved this
  // into a waterfall loop.
  if (IdxBank->getID() != AMDGPU::SGPRRegBankID)
    return false;

  const TargetRegisterClass *VecRC =
      TRI.getRegClassForTypeOnBank(VecTy, *VecBank);
  const TargetRegisterClass *EltRC =
      TRI.getRegClassForTypeOnBank(EltTy, *DstBank);
  if (!VecRC || !EltRC)
    return false;

  const bool AllConstrained =
      RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) &&
      RBI.constrainGenericRegister(DstReg, *EltRC, *MRI) &&
      RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI);
  if (!AllConstrained)
    return false;

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &Loc = MI.getDebugLoc();
  const unsigned EltBits = EltTy.getSizeInBits();

  // Fold a constant part of the index into a subregister index where possible.
  unsigned SubReg;
  std::tie(IdxReg, SubReg) = computeIndirectRegIndex(
      *MRI, TRI, VecRC, IdxReg, EltBits / 8, *VT);

  const unsigned VecBankID = VecBank->getID();

  if (VecBankID == AMDGPU::SGPRRegBankID) {
    if (EltBits != 32 && EltBits != 64)
      return false;

    BuildMI(*MBB, &MI, Loc, TII.get(AMDGPU::COPY), AMDGPU::M0)
        .addReg(IdxReg);

    const unsigned MovRelOpc =
        EltBits == 64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
    BuildMI(*MBB, &MI, Loc, TII.get(MovRelOpc), DstReg)
        .addReg(VecReg, {}, SubReg)
        .addReg(VecReg, RegState::Implicit);
    MI.eraseFromParent();
    return true;
  }

  if (VecBankID != AMDGPU::VGPRRegBankID || EltBits != 32)
    return false;

  if (STI.useVGPRIndexMode()) {
    const MCInstrDesc &GPRIdxDesc =
        TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
    BuildMI(*MBB, MI, Loc, GPRIdxDesc, DstReg)
        .addReg(VecReg)
        .addReg(IdxReg)
        .addImm(SubReg);

    MI.eraseFromParent();
    return true;
  }

  // No index mode: go through M0 and a relative VGPR move.
  BuildMI(*MBB, &MI, Loc, TII.get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(IdxReg);
  BuildMI(*MBB, &MI, Loc, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
      .addReg(VecReg, {}, SubReg)
      .addReg(VecReg, RegState::Implicit);
  MI.eraseFromParent();
  return true;
}


// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd

/// Select G_INSERT_VECTOR_ELT whose index lives in an SGPR.
///
/// Operands of \p MI: (0) result vector, (1) input vector, (2) value to
/// insert, (3) index. The insert is emitted either as an indirect register
/// write pseudo (from getIndirectRegWriteMovRelPseudo) with the index copied
/// into M0, or, for VGPR vectors on subtargets using VGPR index mode, as the
/// pseudo returned by getIndirectGPRIDXPseudo.
///
/// \returns true if \p MI was replaced; false if it cannot be selected here
/// (non-SGPR index, no register class available for the vector or the value,
/// failure to constrain a register, or a VGPR vector whose element is not
/// 32 bits wide).
bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
  MachineInstr &MI) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register VecReg = MI.getOperand(1).getReg();
  Register ValReg = MI.getOperand(2).getReg();
  Register IdxReg = MI.getOperand(3).getReg();

  LLT VecTy = MRI->getType(DstReg);
  LLT ValTy = MRI->getType(ValReg);
  unsigned VecSize = VecTy.getSizeInBits();
  unsigned ValSize = ValTy.getSizeInBits();

  const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
  const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
  const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);

  assert(VecTy.getElementType() == ValTy);

  // The index must be scalar. If it wasn't RegBankSelect should have moved this
  // into a waterfall loop.
  if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
    return false;

  const TargetRegisterClass *VecRC =
      TRI.getRegClassForTypeOnBank(VecTy, *VecRB);
  const TargetRegisterClass *ValRC =
      TRI.getRegClassForTypeOnBank(ValTy, *ValRB);
  // Both classes are dereferenced below; fail selection instead of
  // dereferencing a null pointer when no class exists for the type/bank pair.
  // The extract-element selector guards the same helper the same way.
  if (!VecRC || !ValRC)
    return false;

  if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
      !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
      !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
      !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  // Only 32-bit elements are handled for vectors on the VGPR bank.
  if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
    return false;

  // Split the index into a (possibly rewritten) index register and a
  // subregister immediate; the element size is passed in bytes.
  unsigned SubReg;
  std::tie(IdxReg, SubReg) =
      computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, ValSize / 8, *VT);

  const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
                         STI.useVGPRIndexMode();

  MachineBasicBlock *BB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  if (!IndexMode) {
    // M0-relative path: the index is passed through M0.
    BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(IdxReg);

    const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
        VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
    BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
        .addReg(VecReg)
        .addReg(ValReg)
        .addImm(SubReg);
    MI.eraseFromParent();
    return true;
  }

  // VGPR index mode path: the index register is an explicit operand.
  const MCInstrDesc &GPRIDXDesc =
      TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
  BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
      .addReg(VecReg)
      .addReg(ValReg)
      .addReg(IdxReg)
      .addImm(SubReg);

  MI.eraseFromParent();
  return true;
}


/// Return true if \p Intr is one of the buffer/global "async LDS" load
/// intrinsics enumerated below; every other intrinsic ID yields false.
static bool isAsyncLDSDMA(Intrinsic::ID Intr) {
  return Intr == Intrinsic::amdgcn_raw_buffer_load_async_lds ||
         Intr == Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds ||
         Intr == Intrinsic::amdgcn_struct_buffer_load_async_lds ||
         Intr == Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds ||
         Intr == Intrinsic::amdgcn_load_async_to_lds ||
         Intr == Intrinsic::amdgcn_global_load_async_lds;
}


/// Select a buffer load-to-LDS intrinsic into a BUFFER_LOAD_*_LDS_*
/// instruction.
///
/// Operands read from \p MI: (1) rsrc, (2) the value copied into M0, (3) the
/// immediate load size in bytes, then [vindex, for the struct variants only,]
/// voffset, soffset, immediate offset and the aux immediate.
///
/// \returns false if the subtarget lacks VMEM-to-LDS loads, if the size is not
/// 1, 2, 4, 12 or 16 bytes, or if a 12/16-byte load is requested without
/// hasLDSLoadB96_B128(); true once \p MI has been replaced.
bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
  if (!Subtarget->hasVMemToLDSLoad())
    return false;

  const unsigned Size = MI.getOperand(3).getImm();
  const Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();

  // The struct intrinsic variants carry one extra operand (vindex) compared
  // with the raw ones, which shifts every later operand by one.
  const bool HasVIndex = MI.getNumOperands() == 9;
  const int OpOffset = HasVIndex ? 1 : 0;
  Register VIndex;
  if (HasVIndex)
    VIndex = MI.getOperand(4).getReg();

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  // A voffset that is a known constant zero is not emitted as an operand.
  std::optional<ValueAndVReg> KnownVOffset =
      getIConstantVRegValWithLookThrough(VOffset, *MRI);
  const bool HasVOffset = !KnownVOffset || KnownVOffset->Value.getZExtValue();

  // Pick the addressing-mode flavour of an opcode family from the presence of
  // vindex and voffset.
  const auto PickAddrMode = [HasVIndex, HasVOffset](unsigned BothEn,
                                                    unsigned IdxEn,
                                                    unsigned OffEn,
                                                    unsigned Offset) {
    if (HasVIndex)
      return HasVOffset ? BothEn : IdxEn;
    return HasVOffset ? OffEn : Offset;
  };

  unsigned Opc;
  switch (Size) {
  case 1:
    Opc = PickAddrMode(AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN,
                       AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN,
                       AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN,
                       AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET);
    break;
  case 2:
    Opc = PickAddrMode(AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN,
                       AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN,
                       AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN,
                       AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET);
    break;
  case 4:
    Opc = PickAddrMode(AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN,
                       AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN,
                       AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN,
                       AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET);
    break;
  case 12:
    if (!Subtarget->hasLDSLoadB96_B128())
      return false;
    Opc = PickAddrMode(AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN,
                       AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN,
                       AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN,
                       AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET);
    break;
  case 16:
    if (!Subtarget->hasLDSLoadB96_B128())
      return false;
    Opc = PickAddrMode(AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN,
                       AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN,
                       AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN,
                       AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET);
    break;
  default:
    return false;
  }

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  // Operand 2 of the intrinsic is passed to the load through M0.
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
      .add(MI.getOperand(2));

  auto Load = BuildMI(*MBB, &MI, DL, TII.get(Opc));

  // vaddr: {vindex, voffset} packed into a 64-bit VGPR pair when both are
  // present, the single present one otherwise, nothing if neither is.
  if (HasVIndex && HasVOffset) {
    Register PackedAddr = MRI->createVirtualRegister(TRI.getVGPR64Class());
    BuildMI(*MBB, &*Load, DL, TII.get(AMDGPU::REG_SEQUENCE), PackedAddr)
        .addReg(VIndex)
        .addImm(AMDGPU::sub0)
        .addReg(VOffset)
        .addImm(AMDGPU::sub1);
    Load.addReg(PackedAddr);
  } else if (HasVIndex) {
    Load.addReg(VIndex);
  } else if (HasVOffset) {
    Load.addReg(VOffset);
  }

  Load.add(MI.getOperand(1));            // rsrc
  Load.add(MI.getOperand(5 + OpOffset)); // soffset
  Load.add(MI.getOperand(6 + OpOffset)); // imm offset

  // The aux immediate carries both the cache policy and the swizzle bit; the
  // masks differ between GFX12+ and earlier targets.
  const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
  const unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
  const auto CPolMask =
      IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12;
  const auto SwzBit =
      IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12;
  Load.addImm(Aux & CPolMask);          // cpol
  Load.addImm((Aux & SwzBit) ? 1 : 0);  // swz
  Load.addImm(isAsyncLDSDMA(IntrinsicID));

  // Split the intrinsic's single memory operand into a load part and a store
  // part for the new instruction.
  MachineMemOperand *OrigMMO = *MI.memoperands_begin();
  const auto BaseAlign = OrigMMO->getBaseAlign();
  // Don't set the offset value here because the pointer points to the base of
  // the buffer.
  MachinePointerInfo LoadPtrI = OrigMMO->getPointerInfo();
  MachinePointerInfo StorePtrI = LoadPtrI;
  LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(),
                                                 AMDGPUAS::BUFFER_RESOURCE));
  LoadPtrI.AddrSpace = AMDGPUAS::BUFFER_RESOURCE;
  StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;

  // Keep every original flag except the load/store direction bits, which are
  // set individually on each half.
  auto CommonFlags = OrigMMO->getFlags() &
                     ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
  MachineMemOperand *LoadMMO = MF->getMachineMemOperand(
      LoadPtrI, CommonFlags | MachineMemOperand::MOLoad, Size, BaseAlign);
  MachineMemOperand *StoreMMO = MF->getMachineMemOperand(
      StorePtrI, CommonFlags | MachineMemOperand::MOStore, sizeof(int32_t),
      BaseAlign);

  Load.setMemRefs({LoadMMO, StoreMMO});

  MI.eraseFromParent();
  constrainSelectedInstRegOperands(*Load, TII, TRI, RBI);
  return true;
}


/// Match a zero extend from a 32-bit value to 64-bits.
///
/// Recognizes a G_ZEXT whose source is s32, and the legalized form
/// G_MERGE_VALUES (s32 %x), (s32 0).
/// \returns the 32-bit source register, or an invalid Register if \p Reg does
/// not match.
Register AMDGPUInstructionSelector::matchZeroExtendFromS32(Register Reg) const {
  Register Narrow;
  if (mi_match(Reg, *MRI, m_GZExt(m_Reg(Narrow)))) {
    if (MRI->getType(Narrow) == LLT::scalar(32))
      return Narrow;
    return Register();
  }

  // Legalized form: %zext = G_MERGE_VALUES (s32 %x), (s32 0)
  const MachineInstr *Merge = getDefIgnoringCopies(Reg, *MRI);
  if (Merge->getOpcode() != AMDGPU::G_MERGE_VALUES)
    return Register();

  assert(Merge->getNumOperands() == 3 &&
         MRI->getType(Merge->getOperand(0).getReg()) == LLT::scalar(64));

  // The high half has to be a constant zero for this to be a zero extend.
  if (!mi_match(Merge->getOperand(2).getReg(), *MRI, m_ZeroInt()))
    return Register();

  return Merge->getOperand(1).getReg();
}


/// Match a sign extend from a 32-bit value to 64-bits.
///
/// Recognizes a G_SEXT whose source is s32 and the legalized form
/// G_MERGE_VALUES (s32 %x), G_ASHR(%x, 31). If neither matches but the sign
/// bit of \p Reg is known to be zero, the zero-extend matcher is tried.
/// \returns the 32-bit source register, or an invalid Register on no match.
Register AMDGPUInstructionSelector::matchSignExtendFromS32(Register Reg) const {
  Register Narrow;
  if (mi_match(Reg, *MRI, m_GSExt(m_Reg(Narrow)))) {
    if (MRI->getType(Narrow) == LLT::scalar(32))
      return Narrow;
    return Register();
  }

  // Legalized form: %sext = G_MERGE_VALUES (s32 %x), G_ASHR((S32 %x, 31))
  const MachineInstr *Merge = getDefIgnoringCopies(Reg, *MRI);
  if (Merge->getOpcode() != AMDGPU::G_MERGE_VALUES)
    return Register();

  assert(Merge->getNumOperands() == 3 &&
         MRI->getType(Merge->getOperand(0).getReg()) == LLT::scalar(64));

  // The high half must be the low half arithmetically shifted right by 31.
  const bool HiIsSignOfLo =
      mi_match(Merge->getOperand(2).getReg(), *MRI,
               m_GAShr(m_SpecificReg(Merge->getOperand(1).getReg()),
                       m_SpecificICst(31)));
  if (HiIsSignOfLo)
    return Merge->getOperand(1).getReg();

  // With a known-zero sign bit, a zero extend is acceptable as well.
  return VT->signBitIsZero(Reg) ? matchZeroExtendFromS32(Reg) : Register();
}


/// Return \p Reg unchanged if it already is a 32-bit scalar; otherwise try to
/// match it as a zero extend from a 32-bit value and return that source (an
/// invalid Register if the match fails).
Register
AMDGPUInstructionSelector::matchZeroExtendFromS32OrS32(Register Reg) const {
  if (MRI->getType(Reg) == LLT::scalar(32))
    return Reg;
  return matchZeroExtendFromS32(Reg);
}


/// Return \p Reg unchanged if it already is a 32-bit scalar; otherwise try to
/// match it as a sign extend from a 32-bit value and return that source (an
/// invalid Register if the match fails).
Register
AMDGPUInstructionSelector::matchSignExtendFromS32OrS32(Register Reg) const {
  if (MRI->getType(Reg) == LLT::scalar(32))
    return Reg;
  return matchSignExtendFromS32(Reg);
}


/// Dispatch to the sign- or zero-extend "from s32, or already s32" matcher
/// depending on \p IsSigned.
Register
AMDGPUInstructionSelector::matchExtendFromS32OrS32(Register Reg,
                                                   bool IsSigned) const {
  return IsSigned ? matchSignExtendFromS32OrS32(Reg)
                  : matchZeroExtendFromS32OrS32(Reg);
}


/// Match an any-extend from a 32-bit value to 64-bits.
///
/// Recognizes a G_ANYEXT whose source is s32, and the legalized form
/// G_MERGE_VALUES (s32 %x), (s32 G_IMPLICIT_DEF).
/// \returns the 32-bit source register, or an invalid Register on no match.
Register AMDGPUInstructionSelector::matchAnyExtendFromS32(Register Reg) const {
  Register Narrow;
  if (mi_match(Reg, *MRI, m_GAnyExt(m_Reg(Narrow)))) {
    if (MRI->getType(Narrow) == LLT::scalar(32))
      return Narrow;
    return Register();
  }

  // Legalized form: %anyext = G_MERGE_VALUES (s32 %x), (s32 G_IMPLICIT_DEF)
  const MachineInstr *Merge = getDefIgnoringCopies(Reg, *MRI);
  if (Merge->getOpcode() != AMDGPU::G_MERGE_VALUES)
    return Register();

  assert(Merge->getNumOperands() == 3 &&
         MRI->getType(Merge->getOperand(0).getReg()) == LLT::scalar(64));

  // The high half has to be undefined for this to be an any-extend.
  if (!mi_match(Merge->getOperand(2).getReg(), *MRI, m_GImplicitDef()))
    return Register();

  return Merge->getOperand(1).getReg();
}


/// Select a global load-to-LDS intrinsic into a GLOBAL_LOAD_LDS_* instruction.
///
/// Operands read from \p MI: (1) address, (2) the value copied into M0,
/// (3) immediate size in bytes, (4) immediate offset, (5) aux immediate.
/// When the address is (or can be split into) an SGPR base plus an optional
/// zero-extended 32-bit offset, the SADDR form of the opcode is used.
///
/// \returns false if the subtarget lacks VMEM-to-LDS loads, if the size is not
/// 1, 2, 4, 12 or 16 bytes, or if a 12/16-byte load is requested without
/// hasLDSLoadB96_B128(); true once \p MI has been replaced.
bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const {
  if (!Subtarget->hasVMemToLDSLoad())
    return false;

  const unsigned Size = MI.getOperand(3).getImm();
  const Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();

  unsigned Opc;
  switch (Size) {
  case 1:
    Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
    break;
  case 2:
    Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
    break;
  case 4:
    Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
    break;
  case 12:
    if (!Subtarget->hasLDSLoadB96_B128())
      return false;
    Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
    break;
  case 16:
    if (!Subtarget->hasLDSLoadB96_B128())
      return false;
    Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
    break;
  default:
    return false;
  }

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  // Operand 2 of the intrinsic is passed to the load through M0.
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
      .add(MI.getOperand(2));

  Register Addr = MI.getOperand(1).getReg();
  Register VOffset;
  // Try to split SAddr and VOffset. Global and LDS pointers share the same
  // immediate offset, so we cannot use a regular SelectGlobalSAddr().
  if (!isSGPR(Addr)) {
    auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
    if (isSGPR(AddrDef->Reg)) {
      // The address is just a copy of an SGPR value.
      Addr = AddrDef->Reg;
    } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
      // SGPR base + zext(s32 offset): use the base as saddr and the 32-bit
      // value as voffset.
      const MachineInstr *PtrAdd = AddrDef->MI;
      Register Base =
          getSrcRegIgnoringCopies(PtrAdd->getOperand(1).getReg(), *MRI);
      if (isSGPR(Base)) {
        Register Off = matchZeroExtendFromS32(PtrAdd->getOperand(2).getReg());
        if (Off) {
          Addr = Base;
          VOffset = Off;
        }
      }
    }
  }

  const bool UseSAddr = isSGPR(Addr);
  if (UseSAddr) {
    Opc = AMDGPU::getGlobalSaddrOp(Opc);
    // The SADDR form always takes a voffset; materialize zero if none was
    // matched above.
    if (!VOffset) {
      VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
      BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
          .addImm(0);
    }
  }

  auto Load = BuildMI(*MBB, &MI, DL, TII.get(Opc)).addReg(Addr);
  if (UseSAddr)
    Load.addReg(VOffset);

  Load.add(MI.getOperand(4)); // offset

  const unsigned Aux = MI.getOperand(5).getImm();
  Load.addImm(Aux & ~AMDGPU::CPol::VIRTUAL_BITS); // cpol
  Load.addImm(isAsyncLDSDMA(IntrinsicID));

  // Split the intrinsic's single memory operand into a global load part and
  // an LDS store part, both carrying the immediate offset.
  MachineMemOperand *OrigMMO = *MI.memoperands_begin();
  MachinePointerInfo LoadPtrI = OrigMMO->getPointerInfo();
  LoadPtrI.Offset = MI.getOperand(4).getImm();
  MachinePointerInfo StorePtrI = LoadPtrI;
  LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(),
                                                 AMDGPUAS::GLOBAL_ADDRESS));
  LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
  StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;

  // Keep every original flag except the load/store direction bits.
  auto CommonFlags = OrigMMO->getFlags() &
                     ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
  MachineMemOperand *LoadMMO = MF->getMachineMemOperand(
      LoadPtrI, CommonFlags | MachineMemOperand::MOLoad, Size,
      OrigMMO->getBaseAlign());
  MachineMemOperand *StoreMMO = MF->getMachineMemOperand(
      StorePtrI, CommonFlags | MachineMemOperand::MOStore, sizeof(int32_t),
      Align(4));

  Load.setMemRefs({LoadMMO, StoreMMO});

  MI.eraseFromParent();
  constrainSelectedInstRegOperands(*Load, TII, TRI, RBI);
  return true;
}


/// Select the tensor load-to-LDS / store-from-LDS intrinsic \p IID.
///
/// Operands 1-4 of \p MI are the D# groups 0-3 and operand 6 is the cache
/// policy. The two-group (_d2) opcode is used when groups 2 and 3 are both
/// all-zero build vectors, the four-group (_d4) opcode otherwise.
/// \returns true; \p MI is always replaced.
bool AMDGPUInstructionSelector::selectTensorLoadStore(MachineInstr &MI,
                                                      Intrinsic::ID IID) const {
  const bool IsLoad = IID == Intrinsic::amdgcn_tensor_load_to_lds;

  // A lambda to check whether an operand is a vector of all 0s.
  const auto IsZeroVector = [&](const MachineOperand &Opnd) {
    const MachineInstr *DefMI = MRI->getVRegDef(Opnd.getReg());
    return DefMI && llvm::isBuildVectorAllZeros(*DefMI, *MRI, true);
  };

  // Use the _d2 version if both group 2 and 3 are zero-initialized.
  const bool UseD2 =
      IsZeroVector(MI.getOperand(3)) && IsZeroVector(MI.getOperand(4));

  unsigned Opc;
  if (UseD2)
    Opc = IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d2
                 : AMDGPU::TENSOR_STORE_FROM_LDS_d2;
  else
    Opc = IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d4
                 : AMDGPU::TENSOR_STORE_FROM_LDS_d4;

  // TODO: Handle the fifth group: MI.getOperand(5), which is silently ignored
  // for now because all existing targets only support up to 4 groups.
  MachineBasicBlock *MBB = MI.getParent();
  auto NewMI = BuildMI(*MBB, &MI, MI.getDebugLoc(), TII.get(Opc));
  NewMI.add(MI.getOperand(1)); // D# group 0
  NewMI.add(MI.getOperand(2)); // D# group 1

  // Only the four-group form carries groups 2 and 3.
  if (!UseD2) {
    NewMI.add(MI.getOperand(3)); // D# group 2
    NewMI.add(MI.getOperand(4)); // D# group 3
  }

  NewMI.addImm(0);             // r128
  NewMI.add(MI.getOperand(6)); // cpol

  MI.eraseFromParent();
  return true;
}


/// Turn a BVH intersect-ray pseudo into the target instruction whose opcode is
/// stored as an immediate operand of \p MI.
///
/// The opcode immediate sits at operand 1 for G_AMDGPU_BVH_INTERSECT_RAY and
/// at operand 3 for the other opcodes routed here. \p MI is mutated in place:
/// the opcode operand is dropped, implicit operands are added and register
/// operands are constrained.
bool AMDGPUInstructionSelector::selectBVHIntersectRayIntrinsic(
    MachineInstr &MI) const {
  const bool IsIntersectRay =
      MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY;
  const unsigned OpcodeOpIdx = IsIntersectRay ? 1 : 3;

  const MCInstrDesc &NewDesc = TII.get(MI.getOperand(OpcodeOpIdx).getImm());
  MI.setDesc(NewDesc);
  MI.removeOperand(OpcodeOpIdx);
  MI.addImplicitDefUseOperands(*MI.getMF());

  constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
  return true;
}


// FIXME: This should be removed and let the patterns select. We just need the
// AGPR/VGPR combination versions.

/// Rewrite an smfmac intrinsic call in place into the matching V_SMFMAC_*_e64
/// instruction.
///
/// The intrinsic-ID operand (1) is dropped and the VDst_In operand (4) is
/// moved to the end of the operand list. Operand 0 is marked early-clobber if
/// the selected instruction's description requires it.
bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
  // Map the intrinsic ID to its e64 opcode.
  const unsigned Opc = [&MI]() -> unsigned {
    switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
    case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
      return AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
    case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
      return AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
    case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
      return AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
    case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
      return AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
    case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
      return AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
    case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
      return AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
    case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
      return AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
    case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
      return AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
    case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
      return AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
    case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
      return AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
    case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
      return AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
    case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
      return AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
    case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
      return AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
    case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
      return AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
    case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
      return AMDGPU::V_SMFMAC_F32_16X16X64_F16_e64;
    case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
      return AMDGPU::V_SMFMAC_F32_32X32X32_F16_e64;
    case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
      return AMDGPU::V_SMFMAC_F32_16X16X64_BF16_e64;
    case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
      return AMDGPU::V_SMFMAC_F32_32X32X32_BF16_e64;
    case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
      return AMDGPU::V_SMFMAC_I32_16X16X128_I8_e64;
    case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
      return AMDGPU::V_SMFMAC_I32_32X32X64_I8_e64;
    case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
      return AMDGPU::V_SMFMAC_F32_16X16X128_BF8_BF8_e64;
    case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
      return AMDGPU::V_SMFMAC_F32_16X16X128_BF8_FP8_e64;
    case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
      return AMDGPU::V_SMFMAC_F32_16X16X128_FP8_BF8_e64;
    case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
      return AMDGPU::V_SMFMAC_F32_16X16X128_FP8_FP8_e64;
    case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
      return AMDGPU::V_SMFMAC_F32_32X32X64_BF8_BF8_e64;
    case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
      return AMDGPU::V_SMFMAC_F32_32X32X64_BF8_FP8_e64;
    case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
      return AMDGPU::V_SMFMAC_F32_32X32X64_FP8_BF8_e64;
    case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
      return AMDGPU::V_SMFMAC_F32_32X32X64_FP8_FP8_e64;
    default:
      llvm_unreachable("unhandled smfmac intrinsic");
    }
  }();

  // Take a copy: the operand is removed below and re-added at the end.
  MachineOperand SavedVDstIn = MI.getOperand(4);

  MI.setDesc(TII.get(Opc));
  MI.removeOperand(4); // VDst_In
  MI.removeOperand(1); // Intrinsic ID
  MI.addOperand(SavedVDstIn); // Readd VDst_In to the end
  MI.addImplicitDefUseOperands(*MI.getMF());

  const bool NeedsEarlyClobber =
      MI.getDesc().getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1;
  if (NeedsEarlyClobber)
    MI.getOperand(0).setIsEarlyClobber(true);

  return true;
}


/// Rewrite a permlane16/permlane32 swap intrinsic in place into
/// V_PERMLANE16_SWAP_B32_e64 or V_PERMLANE32_SWAP_B32_e64.
///
/// \returns false if \p IntrID names a swap the subtarget does not support;
/// otherwise mutates \p MI (drops operand 2, adds an implicit EXEC use,
/// converts the operand at index 4 to a DPP_FI_* value) and returns true.
bool AMDGPUInstructionSelector::selectPermlaneSwapIntrin(
    MachineInstr &MI, Intrinsic::ID IntrID) const {
  const bool IsSwap16 = IntrID == Intrinsic::amdgcn_permlane16_swap;
  const bool IsSwap32 = IntrID == Intrinsic::amdgcn_permlane32_swap;

  if (IsSwap16 && !Subtarget->hasPermlane16Swap())
    return false;
  if (IsSwap32 && !Subtarget->hasPermlane32Swap())
    return false;

  // Anything that is not the 16-lane swap is selected as the 32-lane one.
  unsigned Opcode = AMDGPU::V_PERMLANE32_SWAP_B32_e64;
  if (IsSwap16)
    Opcode = AMDGPU::V_PERMLANE16_SWAP_B32_e64;

  MI.removeOperand(2);
  MI.setDesc(TII.get(Opcode));
  // Implicit use of EXEC (isDef = false, isImp = true).
  MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));

  // Convert the boolean-like immediate at index 4 to the DPP FI encoding.
  MachineOperand &FetchInactive = MI.getOperand(4);
  if (FetchInactive.getImm())
    FetchInactive.setImm(AMDGPU::DPP::DPP_FI_1);
  else
    FetchInactive.setImm(AMDGPU::DPP::DPP_FI_0);

  constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
  return true;
}


/// Select the wave-address operation: the result is the source operand
/// shifted right by log2 of the wavefront size.
///
/// A result on the VGPR bank is selected to V_LSHRREV_B32_e64; any other bank
/// uses S_LSHR_B32 with its SCC def marked dead.
///
/// \returns false, without emitting any instruction, if the result register
/// cannot be constrained to the matching 32-bit register class; true once
/// \p MI has been replaced.
bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;

  // Constrain the result before building the shift. Failing afterwards would
  // return false with a second definition of DstReg already inserted next to
  // the still-present MI.
  const TargetRegisterClass &RC =
      IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
  if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
    return false;

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  if (IsVALU) {
    // The "rev" form takes the shift amount as the first source.
    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
      .addImm(Subtarget->getWavefrontSizeLog2())
      .addReg(SrcReg);
  } else {
    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
      .addReg(SrcReg)
      .addImm(Subtarget->getWavefrontSizeLog2())
      .setOperandDead(3); // Dead scc
  }

  MI.eraseFromParent();
  return true;
}


/// Select the wave shuffle intrinsic using DS_BPERMUTE_B32.
///
/// \p MI must have exactly four operands; operand 0 is the result, operand 2
/// the value to shuffle and operand 3 the lane index. Only s32 results on
/// subtargets with supportsBPermute() are handled.
///
/// If the subtarget can bpermute across the whole wave, a single shift plus
/// DS_BPERMUTE_B32 is emitted. Otherwise (asserted to be wave64) two permutes
/// are emitted, one on the value as-is and one on the value passed through
/// V_PERMLANE64_B32, and V_CNDMASK_B32 picks between them per lane.
///
/// \returns false if the result is not s32 or bpermute is unsupported; true
/// once \p MI has been replaced.
bool AMDGPUInstructionSelector::selectWaveShuffleIntrin(

    MachineInstr &MI) const {

  assert(MI.getNumOperands() == 4);

  MachineBasicBlock *MBB = MI.getParent();

  const DebugLoc &DL = MI.getDebugLoc();


  Register DstReg = MI.getOperand(0).getReg();

  Register ValReg = MI.getOperand(2).getReg();

  Register IdxReg = MI.getOperand(3).getReg();


  const LLT DstTy = MRI->getType(DstReg);

  unsigned DstSize = DstTy.getSizeInBits();

  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);

  // Register class of the result; also used for every 32-bit temporary below.
  const TargetRegisterClass *DstRC =

      TRI.getRegClassForSizeOnBank(DstSize, *DstRB);


  if (DstTy != LLT::scalar(32))

    return false;


  if (!Subtarget->supportsBPermute())

    return false;


  // If we can bpermute across the whole wave, then just do that

  if (Subtarget->supportsWaveWideBPermute()) {

    // Index is shifted left by 2 (multiplied by 4) before the permute.
    Register ShiftIdxReg = MRI->createVirtualRegister(DstRC);

    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), ShiftIdxReg)

        .addImm(2)

        .addReg(IdxReg);


    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), DstReg)

        .addReg(ShiftIdxReg)

        .addReg(ValReg)

        .addImm(0);

  } else {

    // Otherwise, we need to make use of whole wave mode

    assert(Subtarget->isWave64());


    // Set inactive lanes to poison

    Register UndefValReg =

        MRI->createVirtualRegister(TRI.getRegClass(AMDGPU::SReg_32RegClassID));

    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefValReg);


    Register UndefExecReg = MRI->createVirtualRegister(

        TRI.getRegClass(AMDGPU::SReg_64_XEXECRegClassID));

    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefExecReg);


    Register PoisonValReg = MRI->createVirtualRegister(DstRC);

    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SET_INACTIVE_B32), PoisonValReg)

        .addImm(0)

        .addReg(ValReg)

        .addImm(0)

        .addReg(UndefValReg)

        .addReg(UndefExecReg);


    // ds_bpermute requires index to be multiplied by 4

    Register ShiftIdxReg = MRI->createVirtualRegister(DstRC);

    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), ShiftIdxReg)

        .addImm(2)

        .addReg(IdxReg);


    // Same V_SET_INACTIVE treatment for the shifted index as for the value.
    Register PoisonIdxReg = MRI->createVirtualRegister(DstRC);

    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SET_INACTIVE_B32), PoisonIdxReg)

        .addImm(0)

        .addReg(ShiftIdxReg)

        .addImm(0)

        .addReg(UndefValReg)

        .addReg(UndefExecReg);


    // Get permutation of each half, then we'll select which one to use

    Register SameSidePermReg = MRI->createVirtualRegister(DstRC);

    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), SameSidePermReg)

        .addReg(PoisonIdxReg)

        .addReg(PoisonValReg)

        .addImm(0);


    // Second permute works on the value after V_PERMLANE64_B32.
    Register SwappedValReg = MRI->createVirtualRegister(DstRC);

    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_PERMLANE64_B32), SwappedValReg)

        .addReg(PoisonValReg);


    Register OppSidePermReg = MRI->createVirtualRegister(DstRC);

    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), OppSidePermReg)

        .addReg(PoisonIdxReg)

        .addReg(SwappedValReg)

        .addImm(0);


    Register WWMSwapPermReg = MRI->createVirtualRegister(DstRC);

    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::STRICT_WWM), WWMSwapPermReg)

        .addReg(OppSidePermReg);


    // Select which side to take the permute from

    // We can get away with only using mbcnt_lo here since we're only

    // trying to detect which side of 32 each lane is on, and mbcnt_lo

    // returns 32 for lanes 32-63.

    Register ThreadIDReg = MRI->createVirtualRegister(DstRC);

    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MBCNT_LO_U32_B32_e64), ThreadIDReg)

        .addImm(-1)

        .addImm(0);


    // (ThreadID ^ PoisonIdx) & 32 is compared against zero below to form the
    // per-lane select mask.
    Register XORReg = MRI->createVirtualRegister(DstRC);

    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_XOR_B32_e64), XORReg)

        .addReg(ThreadIDReg)

        .addReg(PoisonIdxReg);


    Register ANDReg = MRI->createVirtualRegister(DstRC);

    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_AND_B32_e64), ANDReg)

        .addReg(XORReg)

        .addImm(32);


    Register CompareReg = MRI->createVirtualRegister(

        TRI.getRegClass(AMDGPU::SReg_64_XEXECRegClassID));

    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), CompareReg)

        .addReg(ANDReg)

        .addImm(0);


    // Finally do the selection

    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)

        .addImm(0)

        .addReg(WWMSwapPermReg)

        .addImm(0)

        .addReg(SameSidePermReg)

        .addReg(CompareReg);

  }


  MI.eraseFromParent();

  return true;

}


// Match BITOP3 operation and return a number of matched instructions plus

// truth table.
//
// R is the register whose defining G_AND/G_OR/G_XOR tree is matched. Src is
// an in/out list of at most three leaf source registers; it is grown (or has
// an entry replaced) as leaves are discovered, and is restored to its state
// on entry if the operands of R cannot be expressed with three sources.
//
// Returns {0, 0} when R is not defined by G_AND/G_OR/G_XOR or the match
// fails; otherwise {number of matched logic instructions, 8-bit truth table
// of the matched expression over Src[0..2]}.


static std::pair<unsigned, uint8_t> BitOp3_Op(Register R,

                                              SmallVectorImpl<Register> &Src,

                                              const MachineRegisterInfo &MRI) {

  unsigned NumOpcodes = 0;

  uint8_t LHSBits, RHSBits;


  // Compute the truth-table column for operand Op into Bits, updating Src as
  // needed. Returns false if Op cannot be represented without a fourth
  // source.
  auto getOperandBits = [&Src, R, &MRI](Register Op, uint8_t &Bits) -> bool {

    // Define truth table given Src0, Src1, Src2 bits permutations:

    //                          0     0     0

    //                          0     0     1

    //                          0     1     0

    //                          0     1     1

    //                          1     0     0

    //                          1     0     1

    //                          1     1     0

    //                          1     1     1

    const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };


    // Constant all-ones / all-zero operands need no source slot.
    if (mi_match(Op, MRI, m_AllOnesInt())) {

      Bits = 0xff;

      return true;

    }

    if (mi_match(Op, MRI, m_ZeroInt())) {

      Bits = 0;

      return true;

    }


    for (unsigned I = 0; I < Src.size(); ++I) {

      // Try to find existing reused operand

      if (Src[I] == Op) {

        Bits = SrcBits[I];

        return true;

      }

      // Try to replace parent operator

      if (Src[I] == R) {

        Bits = SrcBits[I];

        Src[I] = Op;

        return true;

      }

    }


    if (Src.size() == 3) {

      // No room left for operands. Try one last time, there can be a 'not' of

      // one of our source operands. In this case we can compute the bits

      // without growing Src vector.

      Register LHS;

      if (mi_match(Op, MRI, m_Not(m_Reg(LHS)))) {

        LHS = getSrcRegIgnoringCopies(LHS, MRI);

        for (unsigned I = 0; I < Src.size(); ++I) {

          if (Src[I] == LHS) {

            Bits = ~SrcBits[I];

            return true;

          }

        }

      }


      return false;

    }


    // Op is a new leaf: give it the next free source slot.
    Bits = SrcBits[Src.size()];

    Src.push_back(Op);

    return true;

  };


  MachineInstr *MI = MRI.getVRegDef(R);

  switch (MI->getOpcode()) {

  case TargetOpcode::G_AND:

  case TargetOpcode::G_OR:

  case TargetOpcode::G_XOR: {

    Register LHS = getSrcRegIgnoringCopies(MI->getOperand(1).getReg(), MRI);

    Register RHS = getSrcRegIgnoringCopies(MI->getOperand(2).getReg(), MRI);


    // Snapshot Src so it can be restored if either operand does not fit.
    SmallVector<Register, 3> Backup(Src.begin(), Src.end());

    if (!getOperandBits(LHS, LHSBits) ||

        !getOperandBits(RHS, RHSBits)) {

      Src = std::move(Backup);

      return std::make_pair(0, 0);

    }


    // Recursion is naturally limited by the size of the operand vector.

    // If an operand is itself a matched logic op, use its truth table and
    // add its instruction count; otherwise keep the leaf bits from above.
    auto Op = BitOp3_Op(LHS, Src, MRI);

    if (Op.first) {

      NumOpcodes += Op.first;

      LHSBits = Op.second;

    }


    Op = BitOp3_Op(RHS, Src, MRI);

    if (Op.first) {

      NumOpcodes += Op.first;

      RHSBits = Op.second;

    }

    break;

  }

  default:

    return std::make_pair(0, 0);

  }


  // Combine the operand truth tables with this instruction's operation. Only
  // G_AND/G_OR/G_XOR reach this point; every other opcode returned above.
  uint8_t TTbl;

  switch (MI->getOpcode()) {

  case TargetOpcode::G_AND:

    TTbl = LHSBits & RHSBits;

    break;

  case TargetOpcode::G_OR:

    TTbl = LHSBits | RHSBits;

    break;

  case TargetOpcode::G_XOR:

    TTbl = LHSBits ^ RHSBits;

    break;

  default:

    break;

  }


  // Count this instruction in addition to those matched in the operands.
  return std::make_pair(NumOpcodes + 1, TTbl);

}


/// Try to fold the G_AND/G_OR/G_XOR tree rooted at \p MI into one V_BITOP3.
///
/// BitOp3_Op reports how many logic operations the tree covers, fills \c Src
/// with the leaf registers, and yields the truth table that is emitted as the
/// instruction's immediate.
///
/// \returns true if a V_BITOP3 was emitted and \p MI erased; false (before any
/// instruction has been built) if the fold is unavailable or rejected.
bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const {
  if (!Subtarget->hasBitOp3Insts())
    return false;

  Register DstReg = MI.getOperand(0).getReg();
  // Only results assigned to the VGPR bank are handled here.
  if (RBI.getRegBank(DstReg, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
    return false;

  SmallVector<Register, 3> Src;
  uint8_t TTbl;
  unsigned NumOpcodes;
  std::tie(NumOpcodes, TTbl) = BitOp3_Op(DstReg, Src, *MRI);

  // Src can be empty if all operands are all zero or all ones. Normally that
  // is optimized out before reaching this point.
  if (Src.empty() || NumOpcodes < 2)
    return false;

  const bool IsB32 = MRI->getType(DstReg) == LLT::scalar(32);
  if (IsB32 && NumOpcodes == 2) {
    // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
    // asm more readable. This cannot be modeled with AddedComplexity because
    // the selector does not know how many operations were matched.
    if (mi_match(MI, *MRI, m_GXor(m_GXor(m_Reg(), m_Reg()), m_Reg())) ||
        mi_match(MI, *MRI, m_GOr(m_GOr(m_Reg(), m_Reg()), m_Reg())) ||
        mi_match(MI, *MRI, m_GOr(m_GAnd(m_Reg(), m_Reg()), m_Reg())))
      return false;
  } else if (NumOpcodes < 4) {
    // For a uniform case the threshold should be higher to account for moves
    // between VGPRs and SGPRs. It needs one operand in a VGPR, the other two
    // can be in SGPRs, and a readfirstlane after.
    return false;
  }

  // Pick the opcode: 32-bit form, or one of the 16-bit forms.
  unsigned Opc = AMDGPU::V_BITOP3_B32_e64;
  if (!IsB32) {
    if (!STI.hasTrue16BitInsts())
      Opc = AMDGPU::V_BITOP3_B16_e64;
    else if (STI.useRealTrue16Insts())
      Opc = AMDGPU::V_BITOP3_B16_gfx1250_t16_e64;
    else
      Opc = AMDGPU::V_BITOP3_B16_gfx1250_fake16_e64;
  }

  unsigned ConstBusBudget = STI.getConstantBusLimit(Opc);
  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  // SGPR-bank sources are used directly while the constant bus limit allows;
  // any further SGPR source is first copied into a fresh VGPR.
  for (Register &Operand : Src) {
    if (RBI.getRegBank(Operand, *MRI, TRI)->getID() != AMDGPU::SGPRRegBankID)
      continue;
    if (ConstBusBudget > 0) {
      --ConstBusBudget;
      continue;
    }
    Register VGPRCopy = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), VGPRCopy).addReg(Operand);
    Operand = VGPRCopy;
  }

  // Last operand can be ignored, turning a ternary operation into a binary.
  // For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
  // 'c' with 'a' here without changing the answer. In some pathological
  // cases it should be possible to get an operation with a single operand
  // too if the optimizer would not catch it.
  const Register Filler = Src[0];
  while (Src.size() < 3)
    Src.push_back(Filler);

  auto MIB = BuildMI(*MBB, MI, DL, TII.get(Opc), DstReg);
  for (unsigned Idx = 0; Idx != 3; ++Idx) {
    // The 16-bit forms take a source-modifier immediate before each source.
    if (!IsB32)
      MIB.addImm(0); // src_mod<Idx>
    MIB.addReg(Src[Idx]);
  }
  MIB.addImm(TTbl);
  if (!IsB32)
    MIB.addImm(0); // op_sel

  constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
  MI.eraseFromParent();

  return true;
}


/// Select a stack restore: copy the wave address derived from operand 0 of
/// \p MI into the stack pointer register.
///
/// If getWaveAddress() does not yield a register for the definition of the
/// source, the wave address is computed by shifting the source right by
/// log2(wavefront size).
///
/// \returns false if the source cannot be constrained to SReg_32; true once
/// the copy has been emitted and \p MI erased.
bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
  Register SrcReg = MI.getOperand(0).getReg();
  if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  Register SP =
      Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore();

  Register WaveAddr = getWaveAddress(MRI->getVRegDef(SrcReg));
  if (!WaveAddr) {
    // No wave address available from the def: materialize SrcReg >> log2(wave).
    WaveAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), WaveAddr)
        .addReg(SrcReg)
        .addImm(Subtarget->getWavefrontSizeLog2())
        .setOperandDead(3); // Dead scc
  }

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), SP).addReg(WaveAddr);

  MI.eraseFromParent();
  return true;
}


/// Top-level selection entry point: dispatch on the opcode of \p I to the
/// matching manual selection routine and/or the generated matcher
/// (selectImpl).
///
/// \returns the result of the routine(s) tried for this opcode.
bool AMDGPUInstructionSelector::select(MachineInstr &I) {
  // Instructions that are not pre-isel opcodes are accepted as they are,
  // except copies, which still go through selectCOPY.
  if (!I.isPreISelOpcode())
    return !I.isCopy() || selectCOPY(I);

  switch (I.getOpcode()) {
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR:
    // Try BITOP3 first, then the generated matcher, then the manual routine.
    return selectBITOP3(I) || selectImpl(I, *CoverageInfo) ||
           selectG_AND_OR_XOR(I);
  case TargetOpcode::G_ADD:
  case TargetOpcode::G_SUB:
  case TargetOpcode::G_PTR_ADD:
    return selectImpl(I, *CoverageInfo) || selectG_ADD_SUB(I);
  case TargetOpcode::G_UADDO:
  case TargetOpcode::G_USUBO:
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_USUBE:
    return selectG_UADDO_USUBO_UADDE_USUBE(I);
  case AMDGPU::G_AMDGPU_MAD_U64_U32:
  case AMDGPU::G_AMDGPU_MAD_I64_I32:
    return selectG_AMDGPU_MAD_64_32(I);
  case TargetOpcode::G_INTTOPTR:
  case TargetOpcode::G_BITCAST:
  case TargetOpcode::G_PTRTOINT:
  case TargetOpcode::G_FREEZE:
    return selectCOPY(I);
  case TargetOpcode::G_FNEG:
    return selectImpl(I, *CoverageInfo) || selectG_FNEG(I);
  case TargetOpcode::G_FABS:
    return selectImpl(I, *CoverageInfo) || selectG_FABS(I);
  case TargetOpcode::G_EXTRACT:
    return selectG_EXTRACT(I);
  case TargetOpcode::G_MERGE_VALUES:
  case TargetOpcode::G_CONCAT_VECTORS:
    return selectG_MERGE_VALUES(I);
  case TargetOpcode::G_UNMERGE_VALUES:
    return selectG_UNMERGE_VALUES(I);
  case TargetOpcode::G_BUILD_VECTOR:
  case TargetOpcode::G_BUILD_VECTOR_TRUNC:
    return selectG_BUILD_VECTOR(I);
  case TargetOpcode::G_IMPLICIT_DEF:
    return selectG_IMPLICIT_DEF(I);
  case TargetOpcode::G_INSERT:
    return selectG_INSERT(I);
  case TargetOpcode::G_INTRINSIC:
  case TargetOpcode::G_INTRINSIC_CONVERGENT:
    return selectG_INTRINSIC(I);
  case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
  case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
    return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
  case TargetOpcode::G_ICMP:
  case TargetOpcode::G_FCMP:
    // Here the manual routine gets the first attempt.
    return selectG_ICMP_or_FCMP(I) || selectImpl(I, *CoverageInfo);
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_STORE:
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_ATOMICRMW_XCHG:
  case TargetOpcode::G_ATOMICRMW_ADD:
  case TargetOpcode::G_ATOMICRMW_SUB:
  case TargetOpcode::G_ATOMICRMW_AND:
  case TargetOpcode::G_ATOMICRMW_OR:
  case TargetOpcode::G_ATOMICRMW_XOR:
  case TargetOpcode::G_ATOMICRMW_MIN:
  case TargetOpcode::G_ATOMICRMW_MAX:
  case TargetOpcode::G_ATOMICRMW_UMIN:
  case TargetOpcode::G_ATOMICRMW_UMAX:
  case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
  case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
  case TargetOpcode::G_ATOMICRMW_USUB_COND:
  case TargetOpcode::G_ATOMICRMW_USUB_SAT:
  case TargetOpcode::G_ATOMICRMW_FADD:
  case TargetOpcode::G_ATOMICRMW_FMIN:
  case TargetOpcode::G_ATOMICRMW_FMAX:
    return selectG_LOAD_STORE_ATOMICRMW(I);
  case TargetOpcode::G_SELECT:
    return selectG_SELECT(I);
  case TargetOpcode::G_TRUNC:
    return selectG_TRUNC(I);
  case TargetOpcode::G_SEXT:
  case TargetOpcode::G_ZEXT:
  case TargetOpcode::G_ANYEXT:
  case TargetOpcode::G_SEXT_INREG:
    // This is a workaround. For extension from type i1, `selectImpl()` uses
    // patterns from the TD file and generates an illegal VGPR to SGPR COPY,
    // as type i1 can only be held in an SGPR class. So the generated matcher
    // is skipped for i1 sources.
    return (MRI->getType(I.getOperand(1).getReg()) != LLT::scalar(1) &&
            selectImpl(I, *CoverageInfo)) ||
           selectG_SZA_EXT(I);
  case TargetOpcode::G_FPEXT:
    return selectG_FPEXT(I) || selectImpl(I, *CoverageInfo);
  case TargetOpcode::G_BRCOND:
    return selectG_BRCOND(I);
  case TargetOpcode::G_GLOBAL_VALUE:
    return selectG_GLOBAL_VALUE(I);
  case TargetOpcode::G_PTRMASK:
    return selectG_PTRMASK(I);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return selectG_EXTRACT_VECTOR_ELT(I);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return selectG_INSERT_VECTOR_ELT(I);
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
    const AMDGPU::ImageDimIntrinsicInfo *Intr =
        AMDGPU::getImageDimIntrinsicInfo(AMDGPU::getIntrinsicID(I));
    assert(Intr && "not an image intrinsic with image pseudo");
    return selectImageIntrinsic(I, Intr);
  }
  case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY:
  case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY:
  case AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY:
    return selectBVHIntersectRayIntrinsic(I);
  case AMDGPU::G_SBFX:
  case AMDGPU::G_UBFX:
    return selectG_SBFX_UBFX(I);
  case AMDGPU::G_SI_CALL:
    // Only the instruction descriptor changes; operands are kept.
    I.setDesc(TII.get(AMDGPU::SI_CALL));
    return true;
  case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
    return selectWaveAddress(I);
  case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN:
    I.setDesc(TII.get(AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN));
    return true;
  case AMDGPU::G_STACKRESTORE:
    return selectStackRestore(I);
  case AMDGPU::G_PHI:
    return selectPHI(I);
  case AMDGPU::G_AMDGPU_COPY_SCC_VCC:
    return selectCOPY_SCC_VCC(I);
  case AMDGPU::G_AMDGPU_COPY_VCC_SCC:
    return selectCOPY_VCC_SCC(I);
  case AMDGPU::G_AMDGPU_READANYLANE:
    return selectReadAnyLane(I);
  case TargetOpcode::G_CONSTANT:
  case TargetOpcode::G_FCONSTANT:
  default:
    break;
  }

  // Constants and every opcode without a dedicated case go straight to the
  // generated matcher.
  return selectImpl(I, *CoverageInfo);
}


/// Complex-pattern renderer that adds \p Root to the instruction unchanged.
/// The renderer holds its own copy of the operand.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
  return {{
      [Root](MachineInstrBuilder &MIB) { MIB.add(Root); }
  }};
}


/// Peel source modifiers off \p Src and return the underlying register
/// together with the accumulated SISrcMods bits.
///
/// \param Src              register to analyze (its def is found ignoring
///                         copies).
/// \param IsCanonicalizing if true, `G_FSUB 0, x` is also folded into NEG.
/// \param AllowAbs         if true, a G_FABS is folded into ABS.
/// \param OpSel            if true, OP_SEL_0 is set in the result.
/// \returns {register after folding, modifier bits}.
std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
    Register Src, bool IsCanonicalizing, bool AllowAbs, bool OpSel) const {
  unsigned Mods = 0;
  MachineInstr *Def = getDefIgnoringCopies(Src, *MRI);

  switch (Def->getOpcode()) {
  case AMDGPU::G_FNEG:
    Src = Def->getOperand(1).getReg();
    Mods |= SISrcMods::NEG;
    // Continue from the negated value so an inner fabs can still be seen.
    Def = getDefIgnoringCopies(Src, *MRI);
    break;
  case AMDGPU::G_FSUB: {
    if (!IsCanonicalizing)
      break;
    // Fold fsub [+-]0 into fneg. This may not have folded depending on the
    // denormal mode, but we're implicitly canonicalizing in a source operand.
    const ConstantFP *LHS =
        getConstantFPVRegVal(Def->getOperand(1).getReg(), *MRI);
    if (LHS && LHS->isZero()) {
      Mods |= SISrcMods::NEG;
      Src = Def->getOperand(2).getReg();
    }
    // Def is intentionally left pointing at the G_FSUB here.
    break;
  }
  default:
    break;
  }

  if (AllowAbs && Def->getOpcode() == AMDGPU::G_FABS) {
    Src = Def->getOperand(1).getReg();
    Mods |= SISrcMods::ABS;
  }

  if (OpSel)
    Mods |= SISrcMods::OP_SEL_0;

  return {Src, Mods};
}


/// Same folding as selectVOP3ModsImpl (with its default options), with
/// OP_SEL_1 additionally set in the returned modifier bits.
std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectVOP3PModsF32Impl(Register Src) const {
  auto [FoldedSrc, Mods] = selectVOP3ModsImpl(Src);
  return {FoldedSrc, Mods | SISrcMods::OP_SEL_1};
}


/// Return a register usable as the source after modifier folding.
///
/// When modifiers were folded (\p Mods != 0) or \p ForceVGPR is set, and
/// \p Src is not in the VGPR bank, a COPY of \p Src into a new virtual
/// register cloned from \p Root's register is inserted before \p InsertPt and
/// that new register is returned. Otherwise \p Src is returned unchanged.
Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
    Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt,
    bool ForceVGPR) const {
  const bool WantsVGPR = Mods != 0 || ForceVGPR;
  if (!WantsVGPR ||
      RBI.getRegBank(Src, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID)
    return Src;

  // If we looked through copies to find source modifiers on an SGPR operand,
  // we now have an SGPR register source. To avoid potentially violating the
  // constant bus restriction, we need to insert a copy to a VGPR.
  Register VGPRSrc = MRI->cloneVirtualRegister(Root.getReg());
  BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(),
          TII.get(AMDGPU::COPY), VGPRSrc)
      .addReg(Src);
  return VGPRSrc;
}


/// Complex-pattern renderer that adds \p Root to the instruction unchanged.
///
/// This will select either an SGPR or VGPR operand and will save us from
/// having to write an extra tablegen pattern.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
  return {{
      [Root](MachineInstrBuilder &MIB) { MIB.add(Root); }
  }};
}


/// Render a VOP3 source with folded modifiers, followed by zero clamp and
/// omod immediates. Emits, in order: source register, src0_mods, clamp, omod.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
  const std::pair<Register, unsigned> Folded =
      selectVOP3ModsImpl(Root.getReg());
  Register Src = Folded.first;
  unsigned Mods = Folded.second;

  return {{
      [this, Src, Mods, Root](MachineInstrBuilder &MIB) {
        MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
      },
      [Mods](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
      [](MachineInstrBuilder &MIB) { MIB.addImm(0); },        // clamp
      [](MachineInstrBuilder &MIB) { MIB.addImm(0); }         // omod
  }};
}


/// Variant of the Mods0 renderer that does not fold G_FABS (AllowAbs is
/// false). Emits, in order: source register, src0_mods, clamp, omod.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
  const std::pair<Register, unsigned> Folded =
      selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/true,
                         /*AllowAbs=*/false);
  Register Src = Folded.first;
  unsigned Mods = Folded.second;

  return {{
      [this, Src, Mods, Root](MachineInstrBuilder &MIB) {
        MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
      },
      [Mods](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
      [](MachineInstrBuilder &MIB) { MIB.addImm(0); },        // clamp
      [](MachineInstrBuilder &MIB) { MIB.addImm(0); }         // omod
  }};
}


/// Render \p Root unchanged, followed by zero clamp and omod immediates.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
  return {{
      [Root](MachineInstrBuilder &MIB) { MIB.add(Root); },
      [](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
      [](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // omod
  }};
}


/// Render a VOP3 source with folded modifiers. Emits, in order: source
/// register, src_mods.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
  const std::pair<Register, unsigned> Folded =
      selectVOP3ModsImpl(Root.getReg());
  Register Src = Folded.first;
  unsigned Mods = Folded.second;

  return {{
      [this, Src, Mods, Root](MachineInstrBuilder &MIB) {
        MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
      },
      [Mods](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
  }};
}


/// Like selectVOP3Mods, but with IsCanonicalizing disabled, so `G_FSUB 0, x`
/// is not folded into a NEG modifier. Emits: source register, src_mods.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
    MachineOperand &Root) const {
  const std::pair<Register, unsigned> Folded =
      selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/false);
  Register Src = Folded.first;
  unsigned Mods = Folded.second;

  return {{
      [this, Src, Mods, Root](MachineInstrBuilder &MIB) {
        MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
      },
      [Mods](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
  }};
}


/// Like selectVOP3Mods, but G_FABS is not folded (AllowAbs is false).
/// Emits, in order: source register, src_mods.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
  const std::pair<Register, unsigned> Folded =
      selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/true,
                         /*AllowAbs=*/false);
  Register Src = Folded.first;
  unsigned Mods = Folded.second;

  return {{
      [this, Src, Mods, Root](MachineInstrBuilder &MIB) {
        MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
      },
      [Mods](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
  }};
}


/// Match only when the value behind \p Root (ignoring copies) is not defined
/// by G_FNEG or G_FABS; in that case render Root's register as-is. Returns an
/// empty result (no match) otherwise.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
  Register Reg = Root.getReg();
  switch (getDefIgnoringCopies(Reg, *MRI)->getOpcode()) {
  case AMDGPU::G_FNEG:
  case AMDGPU::G_FABS:
    return {};
  default:
    break;
  }
  return {{
      [Reg](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
  }};
}


/// Relation between the value being searched for ("src") and the register
/// currently being inspected ("current op"), written as [upper, lower] or
/// [hi, lo] parts.
///
/// The enumerator order matters: the *_START / *_END aliases at the bottom
/// delimit contiguous ranges that are tested with relational comparisons.
enum class SrcStatus {
  IS_SAME,           // src = current op.
  IS_UPPER_HALF,     // current op = [op_upper, op_lower], src = op_upper.
  IS_LOWER_HALF,     // current op = [op_upper, op_lower], src = op_lower.
  IS_UPPER_HALF_NEG, // current op = [op_upper, op_lower], src = -op_upper.
  IS_LOWER_HALF_NEG, // current op = [op_upper, op_lower], src = -op_lower.
  IS_HI_NEG,         // src = [-op_upper, op_lower].
  IS_LO_NEG,         // src = [op_upper, -op_lower].
  IS_BOTH_NEG,       // src = [-op_upper, -op_lower].
  INVALID,

  // Range of statuses that involve a negation.
  NEG_START = IS_UPPER_HALF_NEG,
  NEG_END = IS_BOTH_NEG,

  // Range of statuses that select one half of the current op.
  HALF_START = IS_UPPER_HALF,
  HALF_END = IS_LOWER_HALF_NEG
};


/// Return true if \p MI is a G_TRUNC whose result is exactly half as wide as
/// its source, e.g. `%reg0:n = G_TRUNC %reg1:2n`.
static bool isTruncHalf(const MachineInstr *MI,
                        const MachineRegisterInfo &MRI) {
  if (MI->getOpcode() != AMDGPU::G_TRUNC)
    return false;

  const unsigned ResultBits =
      MRI.getType(MI->getOperand(0).getReg()).getSizeInBits();
  const unsigned InputBits =
      MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
  return ResultBits * 2 == InputBits;
}


/// Test if the MI is logic shift right with half bits,

/// such as `%reg0:2n =G_LSHR %reg1:2n, CONST(n)`


static bool isLshrHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {

  if (MI->getOpcode() != AMDGPU::G_LSHR)

    return false;


  Register ShiftSrc;

  std::optional<ValueAndVReg> ShiftAmt;

  if (mi_match(MI->getOperand(0).getReg(), MRI,

               m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {

    unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();

    unsigned Shift = ShiftAmt->Value.getZExtValue();

    return Shift * 2 == SrcSize;

  }

  return false;

}


/// Test if the MI is shift left with half bits,

/// such as `%reg0:2n =G_SHL %reg1:2n, CONST(n)`


static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {

  if (MI->getOpcode() != AMDGPU::G_SHL)

    return false;


  Register ShiftSrc;

  std::optional<ValueAndVReg> ShiftAmt;

  if (mi_match(MI->getOperand(0).getReg(), MRI,

               m_GShl(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {

    unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();

    unsigned Shift = ShiftAmt->Value.getZExtValue();

    return Shift * 2 == SrcSize;

  }

  return false;

}


/// Return true if \p MI is a G_UNMERGE_VALUES with exactly two results and
/// one source, i.e. `%reg0:n, %reg1:n = G_UNMERGE_VALUES %reg2:2n`.
/// \p MRI is not consulted.
static bool isUnmergeHalf(const MachineInstr *MI,
                          const MachineRegisterInfo &MRI) {
  if (MI->getOpcode() != AMDGPU::G_UNMERGE_VALUES)
    return false;
  if (MI->getNumOperands() != 3)
    return false;

  // Operands 0 and 1 must be the two results; operand 2 the source.
  return MI->getOperand(0).isDef() && MI->getOperand(1).isDef() &&
         !MI->getOperand(2).isDef();
}


/// Coarse classification of a register's type, as produced by
/// isVectorOfTwoOrScalar.
enum class TypeClass {
  VECTOR_OF_TWO, // A vector with exactly two elements.
  SCALAR,        // A scalar type.
  NONE_OF_LISTED // Anything else.
};


/// Classify the type of \p Reg as a scalar, a two-element vector, or neither.
static TypeClass isVectorOfTwoOrScalar(Register Reg,
                                       const MachineRegisterInfo &MRI) {
  const LLT Ty = MRI.getType(Reg);
  if (Ty.isScalar())
    return TypeClass::SCALAR;

  const bool IsPair = Ty.isVector() && Ty.getNumElements() == 2;
  return IsPair ? TypeClass::VECTOR_OF_TWO : TypeClass::NONE_OF_LISTED;
}


/// Compute the status that results from looking through an fneg.
///
/// \p Reg is the register the fneg is evaluated on and \p S the status
/// accumulated so far. Writing values as [Hi, Lo]:
///   - on a vector of two, fneg [OpHi, OpLo] = [-OpHi, -OpLo], so both lanes
///     flip sign;
///   - on a scalar, fneg [OpHi, OpLo] = [-OpHi, OpLo], so only the upper part
///     (which holds the sign bit) is affected.
///
/// \returns SrcStatus::INVALID if the type of \p Reg is neither a scalar nor
/// a two-element vector; otherwise the updated status. Any other value of
/// \p S than the eight handled below is a programming error.
static SrcStatus getNegStatus(Register Reg, SrcStatus S,
                              const MachineRegisterInfo &MRI) {
  const TypeClass NegType = isVectorOfTwoOrScalar(Reg, MRI);
  const bool IsVector = NegType == TypeClass::VECTOR_OF_TWO;
  const bool IsScalar = NegType == TypeClass::SCALAR;
  if (!IsVector && !IsScalar)
    return SrcStatus::INVALID;

  // From here on the type is exactly one of vector-of-two or scalar.
  switch (S) {
  case SrcStatus::IS_SAME:
    // Src = [CurrHi, CurrLo].
    //   Vector: [-OpHi, -OpLo].  Scalar: [-OpHi, OpLo].
    return IsVector ? SrcStatus::IS_BOTH_NEG : SrcStatus::IS_HI_NEG;
  case SrcStatus::IS_HI_NEG:
    // Src = [-CurrHi, CurrLo].
    //   Vector: [OpHi, -OpLo].   Scalar: [OpHi, OpLo].
    return IsVector ? SrcStatus::IS_LO_NEG : SrcStatus::IS_SAME;
  case SrcStatus::IS_LO_NEG:
    // Src = [CurrHi, -CurrLo].
    //   Vector: [-OpHi, OpLo].   Scalar: [-OpHi, -OpLo].
    return IsVector ? SrcStatus::IS_HI_NEG : SrcStatus::IS_BOTH_NEG;
  case SrcStatus::IS_BOTH_NEG:
    // Src = [-CurrHi, -CurrLo].
    //   Vector: [OpHi, OpLo].    Scalar: [OpHi, -OpLo].
    return IsVector ? SrcStatus::IS_SAME : SrcStatus::IS_LO_NEG;
  case SrcStatus::IS_UPPER_HALF:
    // Src = CurrUpper = -OpUpper for both vector and scalar.
    return SrcStatus::IS_UPPER_HALF_NEG;
  case SrcStatus::IS_LOWER_HALF:
    // Src = CurrLower.
    //   Vector: -OpLower.        Scalar: OpLower (lower part untouched).
    return IsVector ? SrcStatus::IS_LOWER_HALF_NEG : SrcStatus::IS_LOWER_HALF;
  case SrcStatus::IS_UPPER_HALF_NEG:
    // Src = -CurrUpper = -(-OpUpper) = OpUpper for both vector and scalar.
    return SrcStatus::IS_UPPER_HALF;
  case SrcStatus::IS_LOWER_HALF_NEG:
    // Src = -CurrLower.
    //   Vector: -(-OpLower) = OpLower.   Scalar: -OpLower.
    return IsVector ? SrcStatus::IS_LOWER_HALF : SrcStatus::IS_LOWER_HALF_NEG;
  default:
    break;
  }
  llvm_unreachable("unexpected SrcStatus & NegType combination");
}


static std::optional<std::pair<Register, SrcStatus>>


calcNextStatus(std::pair<Register, SrcStatus> Curr,

               const MachineRegisterInfo &MRI) {

  const MachineInstr *MI = MRI.getVRegDef(Curr.first);


  unsigned Opc = MI->getOpcode();


  // Handle general Opc cases.

  switch (Opc) {

  case AMDGPU::G_BITCAST:

    return std::optional<std::pair<Register, SrcStatus>>(

        {MI->getOperand(1).getReg(), Curr.second});

  case AMDGPU::COPY:

    if (MI->getOperand(1).getReg().isPhysical())

      return std::nullopt;

    return std::optional<std::pair<Register, SrcStatus>>(

        {MI->getOperand(1).getReg(), Curr.second});

  case AMDGPU::G_FNEG: {

    SrcStatus Stat = getNegStatus(Curr.first, Curr.second, MRI);

    if (Stat == SrcStatus::INVALID)

      return std::nullopt;

    return std::optional<std::pair<Register, SrcStatus>>(

        {MI->getOperand(1).getReg(), Stat});

  }

  default:

    break;

  }


  // Calc next Stat from current Stat.

  switch (Curr.second) {

  case SrcStatus::IS_SAME:

    if (isTruncHalf(MI, MRI))

      return std::optional<std::pair<Register, SrcStatus>>(

          {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF});

    else if (isUnmergeHalf(MI, MRI)) {

      if (Curr.first == MI->getOperand(0).getReg())

        return std::optional<std::pair<Register, SrcStatus>>(

            {MI->getOperand(2).getReg(), SrcStatus::IS_LOWER_HALF});

      return std::optional<std::pair<Register, SrcStatus>>(

          {MI->getOperand(2).getReg(), SrcStatus::IS_UPPER_HALF});

    }

    break;

  case SrcStatus::IS_HI_NEG:

    if (isTruncHalf(MI, MRI)) {

      // [SrcHi, SrcLo]   = [-CurrHi, CurrLo]

      // [CurrHi, CurrLo] = trunc [OpUpper, OpLower] = OpLower

      //                  = [OpLowerHi, OpLowerLo]

      // Src = [SrcHi, SrcLo] = [-CurrHi, CurrLo]

      //     = [-OpLowerHi, OpLowerLo]

      //     = -OpLower

      return std::optional<std::pair<Register, SrcStatus>>(

          {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF_NEG});

    }

    if (isUnmergeHalf(MI, MRI)) {

      if (Curr.first == MI->getOperand(0).getReg())

        return std::optional<std::pair<Register, SrcStatus>>(

            {MI->getOperand(2).getReg(), SrcStatus::IS_LOWER_HALF_NEG});

      return std::optional<std::pair<Register, SrcStatus>>(

          {MI->getOperand(2).getReg(), SrcStatus::IS_UPPER_HALF_NEG});

    }

    break;

  case SrcStatus::IS_UPPER_HALF:

    if (isShlHalf(MI, MRI))

      return std::optional<std::pair<Register, SrcStatus>>(

          {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF});

    break;

  case SrcStatus::IS_LOWER_HALF:

    if (isLshrHalf(MI, MRI))

      return std::optional<std::pair<Register, SrcStatus>>(

          {MI->getOperand(1).getReg(), SrcStatus::IS_UPPER_HALF});

    break;

  case SrcStatus::IS_UPPER_HALF_NEG:

    if (isShlHalf(MI, MRI))

      return std::optional<std::pair<Register, SrcStatus>>(

          {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF_NEG});

    break;

  case SrcStatus::IS_LOWER_HALF_NEG:

    if (isLshrHalf(MI, MRI))

      return std::optional<std::pair<Register, SrcStatus>>(

          {MI->getOperand(1).getReg(), SrcStatus::IS_UPPER_HALF_NEG});

    break;

  default:

    break;

  }

  return std::nullopt;

}


/// Controls which source statuses the instruction defining a register
/// supports. For example, a non floating point intrinsic such as
/// @llvm.amdgcn.sdot2 does not support the NEG bit on VOP3P.
/// The class can be further extended to recognize support for the SEL, NEG
/// and ABS bits for different MIs on different architectures.
class SearchOptions {
private:
  bool HasNeg = false;
  // Assume all complex patterns of VOP3P have opsel.
  bool HasOpsel = true;

public:
  /// Derive the options from the instruction that defines \p Reg.
  SearchOptions(Register Reg, const MachineRegisterInfo &MRI) {
    const MachineInstr *MI = MRI.getVRegDef(Reg);
    const unsigned Opc = MI->getOpcode();

    if (Opc == TargetOpcode::G_INTRINSIC) {
      // Only floating point intrinsics have neg & neg_hi bits.
      HasNeg = cast<GIntrinsic>(*MI).getIntrinsicID() ==
               Intrinsic::amdgcn_fdot2;
    } else {
      // Keep negation enabled for any other generic opcode.
      HasNeg = TargetInstrInfo::isGenericOpcode(Opc);
    }
  }

  /// Return true if \p Stat is usable under these options: statuses in the
  /// NEG range require HasNeg, statuses in the HALF range require HasOpsel.
  bool checkOptions(SrcStatus Stat) const {
    const bool NeedsNeg =
        Stat >= SrcStatus::NEG_START && Stat <= SrcStatus::NEG_END;
    const bool NeedsOpsel =
        Stat >= SrcStatus::HALF_START && Stat <= SrcStatus::HALF_END;
    return (HasNeg || !NeedsNeg) && (HasOpsel || !NeedsOpsel);
  }
};


/// Walk backwards from \p Reg with calcNextStatus and collect every visited
/// (register, status) pair whose status is accepted by \p SO, in visiting
/// order.
///
/// The walk stops when no further step is possible or after MaxDepth + 1
/// steps have been examined.
static SmallVector<std::pair<Register, SrcStatus>>
getSrcStats(Register Reg, const MachineRegisterInfo &MRI, SearchOptions SO,
            int MaxDepth = 3) {
  SmallVector<std::pair<Register, SrcStatus>> Statlist;
  auto Curr = calcNextStatus({Reg, SrcStatus::IS_SAME}, MRI);

  for (int Depth = 0; Depth <= MaxDepth && Curr; ++Depth) {
    if (SO.checkOptions(Curr->second))
      Statlist.push_back(*Curr);
    Curr = calcNextStatus(*Curr, MRI);
  }

  return Statlist;
}


/// Walk backwards from \p Reg with calcNextStatus and return the last visited
/// state whose status is accepted by \p SO and is one of IS_SAME, IS_HI_NEG,
/// IS_LO_NEG or IS_BOTH_NEG (i.e. does not select a half).
///
/// If no such state is found, {Reg, IS_SAME} is returned. The walk stops when
/// no further step is possible or after MaxDepth + 1 steps.
static std::pair<Register, SrcStatus>
getLastSameOrNeg(Register Reg, const MachineRegisterInfo &MRI, SearchOptions SO,
                 int MaxDepth = 3) {
  const auto IsWholeValue = [](SrcStatus S) {
    switch (S) {
    case SrcStatus::IS_SAME:
    case SrcStatus::IS_HI_NEG:
    case SrcStatus::IS_LO_NEG:
    case SrcStatus::IS_BOTH_NEG:
      return true;
    default:
      return false;
    }
  };

  std::pair<Register, SrcStatus> Best = {Reg, SrcStatus::IS_SAME};
  auto Curr = calcNextStatus(Best, MRI);

  for (int Depth = 0; Depth <= MaxDepth && Curr; ++Depth) {
    const SrcStatus Stat = Curr->second;
    if (SO.checkOptions(Stat) && IsWholeValue(Stat))
      Best = *Curr;
    Curr = calcNextStatus(*Curr, MRI);
  }

  return Best;
}


/// Return true if the types of \p Reg1 and \p Reg2 have the same size in bits.
static bool isSameBitWidth(Register Reg1, Register Reg2,
                           const MachineRegisterInfo &MRI) {
  const auto BitsOf = [&MRI](Register R) -> unsigned {
    return MRI.getType(R).getSizeInBits();
  };
  return BitsOf(Reg1) == BitsOf(Reg2);
}


/// Fold the statuses of the two packed lanes into the modifier bits \p Mods.
///
/// \p HiStat describes the high lane and controls NEG_HI / OP_SEL_1;
/// \p LoStat describes the low lane and controls NEG / OP_SEL_0.
/// A status that selects the upper half sets the lane's op_sel bit; a status
/// that carries a negation toggles the lane's neg bit. IS_LOWER_HALF leaves
/// the bits untouched.
///
/// Negations are applied with XOR so that they compose with a neg bit that is
/// already present in the incoming \p Mods: negating an already negated lane
/// clears the bit instead of leaving it set. The low lane's
/// IS_LOWER_HALF_NEG case now toggles like every other negation case (it
/// previously OR-ed the bit in, unlike its high-lane counterpart).
///
/// \returns the updated modifier bits.
static unsigned updateMods(SrcStatus HiStat, SrcStatus LoStat, unsigned Mods) {
  // High lane: NEG_HI and OP_SEL_1.
  if (HiStat == SrcStatus::IS_UPPER_HALF_NEG) {
    Mods ^= SISrcMods::NEG_HI;
    Mods |= SISrcMods::OP_SEL_1;
  } else if (HiStat == SrcStatus::IS_UPPER_HALF) {
    Mods |= SISrcMods::OP_SEL_1;
  } else if (HiStat == SrcStatus::IS_LOWER_HALF_NEG) {
    Mods ^= SISrcMods::NEG_HI;
  } else if (HiStat == SrcStatus::IS_HI_NEG) {
    Mods ^= SISrcMods::NEG_HI;
  }

  // Low lane: NEG and OP_SEL_0.
  if (LoStat == SrcStatus::IS_UPPER_HALF_NEG) {
    Mods ^= SISrcMods::NEG;
    Mods |= SISrcMods::OP_SEL_0;
  } else if (LoStat == SrcStatus::IS_UPPER_HALF) {
    Mods |= SISrcMods::OP_SEL_0;
  } else if (LoStat == SrcStatus::IS_LOWER_HALF_NEG) {
    // Toggle rather than set, so a NEG already in Mods is cancelled.
    Mods ^= SISrcMods::NEG;
  } else if (LoStat == SrcStatus::IS_HI_NEG) {
    Mods ^= SISrcMods::NEG;
  }

  return Mods;
}


/// Return true when \p NewReg has the same bit width as \p RootReg and both
/// \p LoStat and \p HiStat are one of the four half statuses
/// (IS_UPPER_HALF, IS_UPPER_HALF_NEG, IS_LOWER_HALF, IS_LOWER_HALF_NEG).
/// \p TII is not consulted.
static bool isValidToPack(SrcStatus HiStat, SrcStatus LoStat, Register NewReg,
                          Register RootReg, const SIInstrInfo &TII,
                          const MachineRegisterInfo &MRI) {
  if (!isSameBitWidth(NewReg, RootReg, MRI))
    return false;

  const auto IsHalfStatus = [](SrcStatus S) {
    switch (S) {
    case SrcStatus::IS_UPPER_HALF:
    case SrcStatus::IS_UPPER_HALF_NEG:
    case SrcStatus::IS_LOWER_HALF:
    case SrcStatus::IS_LOWER_HALF_NEG:
      return true;
    default:
      return false;
    }
  };

  return IsHalfStatus(LoStat) && IsHalfStatus(HiStat);
}


/// Compute the source register and source-modifier bits for \p RootReg.
///
/// When \p RootReg is not a two-element vector, or no packing of the two
/// G_BUILD_VECTOR operands from a common register is found, the result uses
/// the default OP_SEL_1 selection (plus any NEG / NEG_HI bits collected from
/// getLastSameOrNeg()). Otherwise the common register is returned together
/// with the modifier bits produced by updateMods().
///
/// \p IsDOT disables the packing search on subtargets that report
/// hasDOTOpSelHazard().
std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3PModsImpl(
    Register RootReg, const MachineRegisterInfo &MRI, bool IsDOT) const {
  using RegAndMods = std::pair<Register, unsigned>;
  unsigned Mods = 0;

  // Shared exit for every path that does not pack: add OP_SEL_1 to whatever
  // modifier bits have been collected so far.
  const auto WithDefaultOpSel = [&Mods](Register R) -> RegAndMods {
    Mods |= SISrcMods::OP_SEL_1;
    return {R, Mods};
  };

  // No modification if Root type is not form of <2 x Type>.
  if (isVectorOfTwoOrScalar(RootReg, MRI) != TypeClass::VECTOR_OF_TWO)
    return WithDefaultOpSel(RootReg);

  SearchOptions SO(RootReg, MRI);
  const std::pair<Register, SrcStatus> Stat =
      getLastSameOrNeg(RootReg, MRI, SO);

  switch (Stat.second) {
  case SrcStatus::IS_BOTH_NEG:
    Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
    break;
  case SrcStatus::IS_HI_NEG:
    Mods ^= SISrcMods::NEG_HI;
    break;
  case SrcStatus::IS_LO_NEG:
    Mods ^= SISrcMods::NEG;
    break;
  default:
    break;
  }

  MachineInstr *MI = MRI.getVRegDef(Stat.first);
  const bool IsTwoSrcBuildVector =
      MI->getOpcode() == AMDGPU::G_BUILD_VECTOR && MI->getNumOperands() == 3;
  if (!IsTwoSrcBuildVector || (IsDOT && Subtarget->hasDOTOpSelHazard()))
    return WithDefaultOpSel(Stat.first);

  // Operand 2 is examined first, then operand 1.
  SmallVector<std::pair<Register, SrcStatus>> StatlistHi =
      getSrcStats(MI->getOperand(2).getReg(), MRI, SO);
  if (StatlistHi.empty())
    return WithDefaultOpSel(Stat.first);

  SmallVector<std::pair<Register, SrcStatus>> StatlistLo =
      getSrcStats(MI->getOperand(1).getReg(), MRI, SO);
  if (StatlistLo.empty())
    return WithDefaultOpSel(Stat.first);

  // Scan both lists from their last entry backwards and take the first pair
  // that refers to the same register and is valid to pack.
  for (auto Hi = StatlistHi.rbegin(), HiEnd = StatlistHi.rend(); Hi != HiEnd;
       ++Hi) {
    for (auto Lo = StatlistLo.rbegin(), LoEnd = StatlistLo.rend(); Lo != LoEnd;
         ++Lo) {
      if (Hi->first != Lo->first)
        continue;
      if (isValidToPack(Hi->second, Lo->second, Hi->first, RootReg, TII, MRI))
        return {Hi->first, updateMods(Hi->second, Lo->second, Mods)};
    }
  }

  // Packed instructions do not have abs modifiers.
  return WithDefaultOpSel(Stat.first);
}


// Removed unused function `getAllKindImm` to eliminate dead code.


/// Return true when the register bank that \p RBI reports for \p Reg has the
/// ID \p RBNo.
static bool checkRB(Register Reg, unsigned int RBNo,
                    const AMDGPURegisterBankInfo &RBI,
                    const MachineRegisterInfo &MRI,
                    const TargetRegisterInfo &TRI) {
  return RBI.getRegBank(Reg, MRI, TRI)->getID() == RBNo;
}


// Return a register holding the value of NewReg whose register bank is legal
// in place of RootReg.
// Assumptions:
// 1. VOP3P is always legal for VGPR.
// 2. RootReg's register bank is legal.
// Consequently:
// 1. If RootReg is SGPR, NewReg may be SGPR or VGPR.
// 2. If RootReg is VGPR, NewReg must be VGPR.
static Register getLegalRegBank(Register NewReg, Register RootReg,
                                const AMDGPURegisterBankInfo &RBI,
                                MachineRegisterInfo &MRI,
                                const TargetRegisterInfo &TRI,
                                const SIInstrInfo &TII) {
  // RootReg can only be VGPR or SGPR (SGPR shows up in some hand-written
  // cases such as inst-select-ashr.v2s16.mir::ashr_v2s16_vs).
  if (checkRB(RootReg, AMDGPU::SGPRRegBankID, RBI, MRI, TRI))
    return NewReg;
  if (checkRB(NewReg, AMDGPU::VGPRRegBankID, RBI, MRI, TRI))
    return NewReg;

  // RootReg is not SGPR and NewReg is not VGPR. If RootReg is already a COPY
  // of NewReg, reuse it instead of emitting another copy.
  MachineInstr *RootDef = MRI.getVRegDef(RootReg);
  const bool RootCopiesNew = RootDef->getOpcode() == AMDGPU::COPY &&
                             RootDef->getOperand(1).getReg() == NewReg;
  if (RootCopiesNew)
    return RootReg;

  // Otherwise copy NewReg into a clone of RootReg, inserted right before the
  // instruction that defines RootReg.
  MachineBasicBlock &DefBlock = *RootDef->getParent();
  Register CopyDst = MRI.cloneVirtualRegister(RootReg);
  BuildMI(DefBlock, RootDef, RootDef->getDebugLoc(), TII.get(AMDGPU::COPY),
          CopyDst)
      .addReg(NewReg);

  return CopyDst;
}


/// Shared implementation of the VOP3P modifier selectors: run
/// selectVOP3PModsImpl() on \p Root, pass the chosen register through
/// getLegalRegBank(), and render it followed by the modifier immediate.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3PRetHelper(MachineOperand &Root,
                                                bool IsDOT) const {
  MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
  const Register RootReg = Root.getReg();

  std::pair<Register, unsigned> Selected =
      selectVOP3PModsImpl(RootReg, MRI, IsDOT);
  unsigned Mods = Selected.second;
  Register Reg = getLegalRegBank(Selected.first, RootReg, RBI, MRI, TRI, TII);

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
  }};
}


/// VOP3P source + modifiers selector; forwards to selectVOP3PRetHelper()
/// without passing IsDOT, so the helper's default applies.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
  InstructionSelector::ComplexRendererFns Renderers =
      selectVOP3PRetHelper(Root);
  return Renderers;
}


/// VOP3P source + modifiers selector for DOT instructions; forwards to
/// selectVOP3PRetHelper() with IsDOT set.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
  return selectVOP3PRetHelper(Root, /*IsDOT=*/true);
}


/// Select a DOT source operand only when selectVOP3PModsImpl() yields exactly
/// the OP_SEL_1 modifier; in that case only the register is rendered.
/// Any other modifier value makes the match fail.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3PNoModsDOT(MachineOperand &Root) const {
  MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();

  std::pair<Register, unsigned> Selected =
      selectVOP3PModsImpl(Root.getReg(), MRI, /*IsDOT=*/true);
  if (Selected.second != SISrcMods::OP_SEL_1)
    return {};

  Register Src = Selected.first;
  return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }}};
}


/// Render the register and modifier immediate that
/// selectVOP3PModsF32Impl() computes for \p Root.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3PModsF32(MachineOperand &Root) const {
  auto [SelectedSrc, SelectedMods] = selectVOP3PModsF32Impl(Root.getReg());
  // Copy into plain locals so the renderers can capture them by value.
  Register Src = SelectedSrc;
  unsigned Mods = SelectedMods;

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
  }};
}


/// Select \p Root only when selectVOP3PModsF32Impl() yields exactly the
/// OP_SEL_1 modifier; only the register is rendered in that case.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3PNoModsF32(MachineOperand &Root) const {
  auto [SelectedSrc, SelectedMods] = selectVOP3PModsF32Impl(Root.getReg());
  // Copy into plain locals so the renderer can capture by value.
  Register Src = SelectedSrc;
  unsigned Mods = SelectedMods;

  if (Mods != SISrcMods::OP_SEL_1)
    return {};

  return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }}};
}


/// Translate an immediate \p Root (asserted to be 0 or -1) into a modifier
/// immediate: OP_SEL_1 is always set, and OP_SEL_0 is added when the
/// immediate is non-zero.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
    MachineOperand &Root) const {
  assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
         "expected i1 value");

  const bool IsNonZero = Root.getImm() != 0;
  unsigned Mods = SISrcMods::OP_SEL_1;
  if (IsNonZero)
    Mods |= SISrcMods::OP_SEL_0;

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
  }};
}


/// Emit a REG_SEQUENCE before \p InsertPt that combines \p Elts, placing
/// element N in the sub-register for channel N, and return the new register.
///
/// Only 2, 4 or 8 elements are supported; they map to VReg_64, VReg_128 and
/// VReg_256 respectively. Any other count is unreachable.
static Register buildRegSequence(SmallVectorImpl<Register> &Elts,
                                 MachineInstr *InsertPt,
                                 MachineRegisterInfo &MRI) {
  const TargetRegisterClass *DstRegClass;
  const auto NumElts = Elts.size();
  if (NumElts == 2)
    DstRegClass = &AMDGPU::VReg_64RegClass;
  else if (NumElts == 4)
    DstRegClass = &AMDGPU::VReg_128RegClass;
  else if (NumElts == 8)
    DstRegClass = &AMDGPU::VReg_256RegClass;
  else
    llvm_unreachable("unhandled Reg sequence size");

  MachineIRBuilder B(*InsertPt);
  auto MIB = B.buildInstr(AMDGPU::REG_SEQUENCE)
                 .addDef(MRI.createVirtualRegister(DstRegClass));

  // Each source is followed by the sub-register index of its channel.
  unsigned Channel = 0;
  for (Register Elt : Elts) {
    MIB.addReg(Elt);
    MIB.addImm(SIRegisterInfo::getSubRegFromChannel(Channel));
    ++Channel;
  }

  return MIB->getOperand(0).getReg();
}


static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,

                                 SmallVectorImpl<Register> &Elts, Register &Src,

                                 MachineInstr *InsertPt,

                                 MachineRegisterInfo &MRI) {

  if (ModOpcode == TargetOpcode::G_FNEG) {

    Mods |= SISrcMods::NEG;

    // Check if all elements also have abs modifier

    SmallVector<Register, 8> NegAbsElts;

    for (auto El : Elts) {

      Register FabsSrc;

      if (!mi_match(El, MRI, m_GFabs(m_Reg(FabsSrc))))

        break;

      NegAbsElts.push_back(FabsSrc);

    }

    if (Elts.size() != NegAbsElts.size()) {

      // Neg

      Src = buildRegSequence(Elts, InsertPt, MRI);

    } else {

      // Neg and Abs

      Mods |= SISrcMods::NEG_HI;

      Src = buildRegSequence(NegAbsElts, InsertPt, MRI);

    }

  } else {

    assert(ModOpcode == TargetOpcode::G_FABS);

    // Abs

    Mods |= SISrcMods::NEG_HI;

    Src = buildRegSequence(Elts, InsertPt, MRI);

  }

}


/// Select a WMMA source defined by a G_BUILD_VECTOR whose elements all carry
/// the same modifier (G_FNEG or G_FABS, decided by the first element). When
/// they do, the modifier is folded via selectWMMAModsNegAbs(); otherwise the
/// root register is rendered with OP_SEL_1 only.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const {
  Register Src = Root.getReg();
  unsigned Mods = SISrcMods::OP_SEL_1;
  SmallVector<Register, 8> EltsF32;

  GBuildVector *BV = dyn_cast<GBuildVector>(MRI->getVRegDef(Src));
  if (BV) {
    assert(BV->getNumSources() > 0);
    const unsigned NumSrcs = BV->getNumSources();

    // Based on first element decide which mod we match, neg or abs
    MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(0));
    unsigned ModOpcode = AMDGPU::G_FABS;
    if (ElF32->getOpcode() == AMDGPU::G_FNEG)
      ModOpcode = AMDGPU::G_FNEG;

    // Gather the modifier operands; give up at the first mismatch.
    for (unsigned Idx = 0; Idx != NumSrcs; ++Idx) {
      ElF32 = MRI->getVRegDef(BV->getSourceReg(Idx));
      if (ElF32->getOpcode() != ModOpcode)
        break;
      EltsF32.push_back(ElF32->getOperand(1).getReg());
    }

    // All elements had ModOpcode modifier
    if (EltsF32.size() == NumSrcs)
      selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, Root.getParent(),
                           *MRI);
  }

  return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
           [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
}


/// Select a WMMA source defined by a G_CONCAT_VECTORS whose sources are all
/// G_FNEG. When they are, NEG and NEG_HI are set and the negated operands are
/// combined into a REG_SEQUENCE; otherwise the root register is rendered with
/// OP_SEL_1 only.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const {
  Register Src = Root.getReg();
  unsigned Mods = SISrcMods::OP_SEL_1;
  SmallVector<Register, 8> EltsV2F16;

  GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src));
  if (CV) {
    const unsigned NumSrcs = CV->getNumSources();

    // Gather the fneg operands; give up at the first source that is not fneg.
    for (unsigned Idx = 0; Idx != NumSrcs; ++Idx) {
      Register FNegSrc;
      if (!mi_match(CV->getSourceReg(Idx), *MRI, m_GFNeg(m_Reg(FNegSrc))))
        break;
      EltsV2F16.push_back(FNegSrc);
    }

    // All elements had ModOpcode modifier
    if (EltsV2F16.size() == NumSrcs) {
      Mods |= SISrcMods::NEG;
      Mods |= SISrcMods::NEG_HI;
      Src = buildRegSequence(EltsV2F16, Root.getParent(), *MRI);
    }
  }

  return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
           [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
}


/// Select a WMMA source defined by a G_CONCAT_VECTORS whose sources all carry
/// the same modifier (G_FNEG or G_FABS, decided by the first source). When
/// they do, the modifier is folded via selectWMMAModsNegAbs(); otherwise the
/// root register is rendered with OP_SEL_1 only.
///
/// \returns renderers for the source register followed by the modifier
/// immediate.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const {
  Register Src = Root.getReg();
  unsigned Mods = SISrcMods::OP_SEL_1;
  SmallVector<Register, 8> EltsV2F16;

  if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
    assert(CV->getNumSources() > 0);
    MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(0));
    // Based on first element decide which mod we match, neg or abs
    unsigned ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG)
                             ? AMDGPU::G_FNEG
                             : AMDGPU::G_FABS;

    // Gather the modifier operands; stop at the first source whose defining
    // opcode differs from ModOpcode.
    for (unsigned i = 0; i < CV->getNumSources(); ++i) {
      ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i));
      if (ElV2F16->getOpcode() != ModOpcode)
        break;
      EltsV2F16.push_back(ElV2F16->getOperand(1).getReg());
    }

    // All elements had ModOpcode modifier. The helper builds the
    // REG_SEQUENCE itself, so no local MachineIRBuilder is needed here.
    if (CV->getNumSources() == EltsV2F16.size()) {
      selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, Root.getParent(),
                           *MRI);
    }
  }

  return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
           [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
}


/// Select \p Root as an inline immediate when it is a floating-point or
/// integer constant (or splat) that TII accepts as an inline constant.
/// The rendered immediate is the sign-extended bit pattern of the constant.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const {
  const Register RootReg = Root.getReg();

  std::optional<FPValueAndVReg> FPValReg;
  if (mi_match(RootReg, *MRI, m_GFCstOrSplat(FPValReg))) {
    // Non-inlineable splat floats should not fall-through for integer immediate
    // checks.
    if (!TII.isInlineConstant(FPValReg->Value))
      return {};

    return {{[=](MachineInstrBuilder &MIB) {
      MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue());
    }}};
  }

  APInt ICst;
  const bool IsInlineInt = mi_match(RootReg, *MRI, m_ICstOrSplat(ICst)) &&
                           TII.isInlineConstant(ICst);
  if (!IsInlineInt)
    return {};

  return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(ICst.getSExtValue()); }}};
}


/// Select the source register and index_key immediate for \p Root.
///
/// If the source is a logical shift right of a 32-bit value by a constant
/// that is a multiple of 8, the shifted value becomes the source and the key
/// is the shift amount divided by 8. Otherwise the key is 0.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const {
  Register Src =
      getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
  unsigned Key = 0;

  Register ShiftSrc;
  std::optional<ValueAndVReg> ShiftAmt;
  const bool IsShiftOfS32 =
      mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
      MRI->getType(ShiftSrc).getSizeInBits() == 32;
  if (IsShiftOfS32) {
    const auto Amount = ShiftAmt->Value.getZExtValue();
    if (Amount % 8 == 0) {
      Key = Amount / 8;
      Src = ShiftSrc;
    }
  }

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
  }};
}


/// Select the source register and index_key immediate for \p Root.
///
/// If the source is a logical shift right of a 32-bit value by the constant
/// 16, the shifted value becomes the source and the key is 1. Otherwise the
/// key is 0.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const {
  Register Src =
      getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
  unsigned Key = 0;

  Register ShiftSrc;
  std::optional<ValueAndVReg> ShiftAmt;
  const bool IsShiftOfS32 =
      mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
      MRI->getType(ShiftSrc).getSizeInBits() == 32;
  if (IsShiftOfS32 && ShiftAmt->Value.getZExtValue() == 16) {
    Key = 1;
    Src = ShiftSrc;
  }

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
  }};
}


/// Select the source register and index_key immediate for \p Root.
///
/// The source is first looked through a zero- or any-extend from 32 bits.
/// If the extended value is (a copy of) operand 1 of a three-operand
/// G_UNMERGE_VALUES, operand 2 of that unmerge becomes the source and the key
/// is 1. Otherwise the key is 0.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSWMMACIndex32(MachineOperand &Root) const {
  Register Src =
      getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
  unsigned Key = 0;

  // Prefer a zero-extend; fall back to an any-extend.
  Register S32 = matchZeroExtendFromS32(Src);
  if (!S32)
    S32 = matchAnyExtendFromS32(Src);

  if (S32) {
    const MachineInstr *Def = getDefIgnoringCopies(S32, *MRI);
    const bool IsUnmerge =
        Def->getOpcode() == TargetOpcode::G_UNMERGE_VALUES;
    if (IsUnmerge) {
      assert(Def->getNumOperands() == 3);
      Register DstReg1 = Def->getOperand(1).getReg();
      const bool IsDstReg1 = mi_match(
          S32, *MRI, m_any_of(m_SpecificReg(DstReg1), m_Copy(m_Reg(DstReg1))));
      if (IsDstReg1) {
        Key = 1;
        Src = Def->getOperand(2).getReg();
      }
    }
  }

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
  }};
}


/// Render the register and modifier immediate that selectVOP3ModsImpl()
/// computes for \p Root with its default options.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
  auto [SelectedSrc, SelectedMods] = selectVOP3ModsImpl(Root.getReg());
  // Copy into plain locals so the renderers can capture them by value.
  Register Src = SelectedSrc;
  unsigned Mods = SelectedMods;

  // FIXME: Handle op_sel
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
  }};
}


// FIXME-TRUE16 remove when fake16 is removed

/// Select source + modifiers for a VINTERP operand: selectVOP3ModsImpl() is
/// run canonicalizing, without abs and without op_sel, and the source is
/// passed through copyToVGPRIfSrcFolded() with ForceVGPR set when rendered.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
  auto [SelectedSrc, SelectedMods] =
      selectVOP3ModsImpl(Root.getReg(),
                         /*IsCanonicalizing=*/true,
                         /*AllowAbs=*/false,
                         /*OpSel=*/false);
  // Copy into plain locals so the renderers can capture them by value.
  Register Src = SelectedSrc;
  unsigned Mods = SelectedMods;

  return {{
      [=](MachineInstrBuilder &MIB) {
        MIB.addReg(
            copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
      },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
  }};
}


/// Same as selectVINTERPMods() except that selectVOP3ModsImpl() is invoked
/// with OpSel enabled.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
  auto [SelectedSrc, SelectedMods] =
      selectVOP3ModsImpl(Root.getReg(),
                         /*IsCanonicalizing=*/true,
                         /*AllowAbs=*/false,
                         /*OpSel=*/true);
  // Copy into plain locals so the renderers can capture them by value.
  Register Src = SelectedSrc;
  unsigned Mods = SelectedMods;

  return {{
      [=](MachineInstrBuilder &MIB) {
        MIB.addReg(
            copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
      },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
  }};
}


// Given \p Offset and the memory instruction that owns the \p Root operand,
// check whether \p Offset is some value multiplied by the access size in
// bytes. If it is, replace \p Offset with that unscaled value and return true.
//
// The access size is taken from the instruction's first memory operand.
// \p IsSigned selects which multiply opcodes are accepted and which extension
// is looked through. Returns false, leaving \p Offset untouched, when the
// subtarget has no scale-offset support, when the access size is unknown, or
// when none of the multiply/shift patterns match.

bool AMDGPUInstructionSelector::selectScaleOffset(MachineOperand &Root,

                                                  Register &Offset,

                                                  bool IsSigned) const {

  if (!Subtarget->hasScaleOffset())

    return false;


  const MachineInstr &MI = *Root.getParent();

  // Only the first memory operand is consulted.
  MachineMemOperand *MMO = *MI.memoperands_begin();


  if (!MMO->getSize().hasValue())

    return false;


  uint64_t Size = MMO->getSize().getValue();


  // Look through an extend from 32 bits if there is one; otherwise keep
  // working on the offset register itself.
  Register OffsetReg = matchExtendFromS32OrS32(Offset, IsSigned);

  if (!OffsetReg)

    OffsetReg = Offset;


  // Skip copies so the patterns below see the defining instruction.
  if (auto Def = getDefSrcRegIgnoringCopies(OffsetReg, *MRI))

    OffsetReg = Def->Reg;


  // Op0 receives the unscaled operand of whichever pattern matches. Mul is
  // only bound (and only read) by the final MAD alternative.
  Register Op0;

  MachineInstr *Mul;

  // Alternatives, tried in order:
  //   1. G_SHL  Op0, log2(Size)         (only when Size is a power of two)
  //   2. G_MUL  Op0, Size
  //   3. S_MUL_I64_I32_PSEUDO / S_MUL_U64  Op0, Size   (chosen by IsSigned)
  //   4. G_AMDGPU_MAD_{I64_I32,U64_U32} with operand 2 = Op0,
  //      operand 3 = trunc(Size) and operand 4 = 0. For a signed offset the
  //      unsigned MAD is also accepted when the sign bit of operand 2 is
  //      known to be zero.
  // In 1, 2 and 4 the constant may also be reached through a COPY.
  bool ScaleOffset =

      (isPowerOf2_64(Size) &&

       mi_match(OffsetReg, *MRI,

                m_GShl(m_Reg(Op0),

                       m_any_of(m_SpecificICst(Log2_64(Size)),

                                m_Copy(m_SpecificICst(Log2_64(Size))))))) ||

      mi_match(OffsetReg, *MRI,

               m_GMul(m_Reg(Op0), m_any_of(m_SpecificICst(Size),

                                           m_Copy(m_SpecificICst(Size))))) ||

      mi_match(

          OffsetReg, *MRI,

          m_BinOp(IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO : AMDGPU::S_MUL_U64,

                  m_Reg(Op0), m_SpecificICst(Size))) ||

      // Match G_AMDGPU_MAD_U64_U32 offset, c, 0

      (mi_match(OffsetReg, *MRI, m_MInstr(Mul)) &&

       (Mul->getOpcode() == (IsSigned ? AMDGPU::G_AMDGPU_MAD_I64_I32

                                      : AMDGPU::G_AMDGPU_MAD_U64_U32) ||

        (IsSigned && Mul->getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32 &&

         VT->signBitIsZero(Mul->getOperand(2).getReg()))) &&

       mi_match(Mul->getOperand(4).getReg(), *MRI, m_ZeroInt()) &&

       mi_match(Mul->getOperand(3).getReg(), *MRI,

                m_GTrunc(m_any_of(m_SpecificICst(Size),

                                  m_Copy(m_SpecificICst(Size))))) &&

       mi_match(Mul->getOperand(2).getReg(), *MRI, m_Reg(Op0)));


  // Hand the unscaled operand back to the caller only on success.
  if (ScaleOffset)

    Offset = Op0;


  return ScaleOffset;

}


/// Decompose the address of the instruction that owns \p Root into a base
/// register plus an immediate and/or SGPR offset.
///
/// The addressing form is chosen by which out-pointers are non-null:
///  - \p SOffset and \p Offset: base + SGPR offset + encoded immediate.
///  - \p Offset (and, if that does not match, \p SOffset): base + encoded
///    immediate.
///  - \p SOffset: base + SGPR offset. A non-zero immediate that fits in 32
///    unsigned bits is materialized into a new SGPR with S_MOV_B32.
///
/// \param Base receives the base register on success.
/// \param ScaleOffset if non-null, is reset to false and then receives the
///        result of selectScaleOffset() whenever a register offset is
///        examined.
/// \returns true if the address was matched. On failure the out-parameters
///          may already have been written and must be ignored.
bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
                                                 Register &Base,
                                                 Register *SOffset,
                                                 int64_t *Offset,
                                                 bool *ScaleOffset) const {
  MachineInstr *MI = Root.getParent();
  MachineBasicBlock *MBB = MI->getParent();

  // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
  // then we can select all ptr + 32-bit offsets.
  SmallVector<GEPInfo, 4> AddrInfo;
  getAddrModeInfo(*MI, *MRI, AddrInfo);

  if (AddrInfo.empty())
    return false;

  const GEPInfo &GEPI = AddrInfo[0];
  std::optional<int64_t> EncodedImm;

  if (ScaleOffset)
    *ScaleOffset = false;

  if (SOffset && Offset) {
    EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
                                              /*HasSOffset=*/true);
    // The immediate comes from the first address entry; base and register
    // offset come from the second one.
    if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
        AddrInfo.size() > 1) {
      const GEPInfo &GEPI2 = AddrInfo[1];
      if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
        Register OffsetReg = GEPI2.SgprParts[1];
        if (ScaleOffset)
          *ScaleOffset =
              selectScaleOffset(Root, OffsetReg, false /* IsSigned */);
        OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg);
        if (OffsetReg) {
          Base = GEPI2.SgprParts[0];
          *SOffset = OffsetReg;
          *Offset = *EncodedImm;
          if (*Offset >= 0 || !AMDGPU::hasSMRDSignedImmOffset(STI))
            return true;

          // For unbuffered smem loads, it is illegal for the Immediate Offset
          // to be negative if the resulting (Offset + (M0 or SOffset or zero)
          // is negative. Handle the case where the Immediate Offset + SOffset
          // is negative.
          auto SKnown = VT->getKnownBits(*SOffset);
          if (*Offset + SKnown.getMinValue().getSExtValue() < 0)
            return false;

          return true;
        }
      }
    }
    return false;
  }

  EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
                                            /*HasSOffset=*/false);
  if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
    Base = GEPI.SgprParts[0];
    *Offset = *EncodedImm;
    return true;
  }

  // SGPR offset is unsigned.
  if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(GEPI.Imm) &&
      GEPI.Imm != 0) {
    // If we make it this far we have a load with an 32-bit immediate offset.
    // It is OK to select this using a sgpr offset, because we have already
    // failed trying to select this load into one of the _IMM variants since
    // the _IMM Patterns are considered before the _SGPR patterns.
    Base = GEPI.SgprParts[0];
    *SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset)
        .addImm(GEPI.Imm);
    return true;
  }

  // Base + register offset needs both SGPR parts. Require exactly two so that
  // SgprParts[1] is never read past the end when only one part is present.
  if (SOffset && GEPI.SgprParts.size() == 2 && GEPI.Imm == 0) {
    Register OffsetReg = GEPI.SgprParts[1];
    if (ScaleOffset)
      *ScaleOffset = selectScaleOffset(Root, OffsetReg, false /* IsSigned */);
    OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg);
    if (OffsetReg) {
      Base = GEPI.SgprParts[0];
      *SOffset = OffsetReg;
      return true;
    }
  }

  return false;
}


/// Select a base register + immediate offset address; selectSmrdOffset() is
/// asked for an immediate only (no SGPR offset, no scale-offset flag).
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
  Register Base;
  int64_t Offset;
  const bool Matched = selectSmrdOffset(Root, Base, /*SOffset=*/nullptr,
                                        &Offset, /*ScaleOffset=*/nullptr);
  if (!Matched)
    return std::nullopt;

  return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
           [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
}


/// Select a base register + 32-bit literal offset address. Requires the first
/// address entry to have exactly one SGPR part and an immediate that
/// getSMRDEncodedLiteralOffset32() can encode.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
  SmallVector<GEPInfo, 4> AddrInfo;
  getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);

  if (AddrInfo.empty())
    return std::nullopt;

  const auto &First = AddrInfo[0];
  if (First.SgprParts.size() != 1)
    return std::nullopt;

  std::optional<int64_t> EncodedImm =
      AMDGPU::getSMRDEncodedLiteralOffset32(STI, First.Imm);
  if (!EncodedImm.has_value())
    return std::nullopt;

  Register PtrReg = First.SgprParts[0];
  int64_t Imm = *EncodedImm;
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Imm); }
  }};
}


/// Select a base register + SGPR offset address. The rendered cache-policy
/// immediate is CPol::SCAL when selectSmrdOffset() reports a scaled offset
/// and 0 otherwise.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
  Register Base, SOffset;
  bool ScaleOffset;
  const bool Matched = selectSmrdOffset(Root, Base, &SOffset,
                                        /*Offset=*/nullptr, &ScaleOffset);
  if (!Matched)
    return std::nullopt;

  unsigned CPol = 0;
  if (ScaleOffset)
    CPol = AMDGPU::CPol::SCAL;

  return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
           [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
           [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}};
}


/// Select a base register + SGPR offset + immediate offset address. The
/// rendered cache-policy immediate is CPol::SCAL when selectSmrdOffset()
/// reports a scaled offset and 0 otherwise.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
  Register Base, SOffset;
  int64_t Offset;
  bool ScaleOffset;
  const bool Matched =
      selectSmrdOffset(Root, Base, &SOffset, &Offset, &ScaleOffset);
  if (!Matched)
    return std::nullopt;

  unsigned CPol = 0;
  if (ScaleOffset)
    CPol = AMDGPU::CPol::SCAL;

  return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
           [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
           [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
           [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}};
}


/// Split the address in \p Root into a pointer base and a constant offset
/// that can be folded into a FLAT-family instruction of kind \p FlatVariant.
///
/// \returns {base, offset} when folding is possible, otherwise
///          {Root register, 0}.
std::pair<Register, int> AMDGPUInstructionSelector::selectFlatOffsetImpl(
    MachineOperand &Root, AMDGPU::FlatAddrSpace FlatVariant) const {
  const auto Unfolded = std::pair(Root.getReg(), 0);

  if (!STI.hasFlatInstOffsets())
    return Unfolded;

  Register PtrBase;
  int64_t ConstOffset;
  bool IsInBounds;
  std::tie(PtrBase, ConstOffset, IsInBounds) =
      getPtrBaseWithConstantOffset(Root.getReg(), *MRI);

  // Nothing to fold.
  if (ConstOffset == 0)
    return Unfolded;

  // Scratch accesses additionally need a legal flat-scratch base.
  if (FlatVariant == AMDGPU::FlatAddrSpace::FlatScratch &&
      !isFlatScratchBaseLegal(Root.getReg()))
    return Unfolded;

  // Adding the offset to the base address with an immediate in a FLAT
  // instruction must not change the memory aperture in which the address falls.
  // Therefore we can only fold offsets from inbounds GEPs into FLAT
  // instructions.
  if (FlatVariant == AMDGPU::FlatAddrSpace::FLAT && !IsInBounds)
    return Unfolded;

  // The address space is read from the instruction's first memory operand.
  MachineInstr *MI = Root.getParent();
  unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
  if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
    return Unfolded;

  return std::pair(PtrBase, ConstOffset);
}


/// Render the pointer and immediate offset chosen by selectFlatOffsetImpl()
/// for the FLAT variant.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
  const std::pair<Register, int> Split =
      selectFlatOffsetImpl(Root, AMDGPU::FlatAddrSpace::FLAT);
  Register Ptr = Split.first;
  int ImmOffset = Split.second;

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Ptr); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); },
  }};
}


/// Render the pointer and immediate offset chosen by selectFlatOffsetImpl()
/// for the FlatGlobal variant.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
  const std::pair<Register, int> Split =
      selectFlatOffsetImpl(Root, AMDGPU::FlatAddrSpace::FlatGlobal);
  Register Ptr = Split.first;
  int ImmOffset = Split.second;

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Ptr); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); },
  }};
}


/// Render the pointer and immediate offset chosen by selectFlatOffsetImpl()
/// for the FlatScratch variant.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
  const std::pair<Register, int> Split =
      selectFlatOffsetImpl(Root, AMDGPU::FlatAddrSpace::FlatScratch);
  Register Ptr = Split.first;
  int ImmOffset = Split.second;

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Ptr); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); },
  }};
}


// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
//
// Rendered operands, in order: saddr, voffset, the immediate offset (only when
// \p NeedIOffset is true), and cpol. \p CPolBits is the value placed in the
// cpol operand; AMDGPU::CPol::SCAL is OR'ed in when selectScaleOffset()
// returns true for the variable offset. Returns std::nullopt when the address
// cannot be expressed with an SGPR base.
//
// Note that any V_MOV_B32 needed to materialize voffset is inserted before
// Root's parent instruction immediately, while this function runs, not when
// the returned renderers are invoked.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
                                             unsigned CPolBits,
                                             bool NeedIOffset) const {
  Register Addr = Root.getReg();
  Register PtrBase;
  int64_t ConstOffset;
  int64_t ImmOffset = 0;

  // Match the immediate offset first, which canonically is moved as low as
  // possible.
  std::tie(PtrBase, ConstOffset, std::ignore) =
      getPtrBaseWithConstantOffset(Addr, *MRI);

  if (ConstOffset != 0) {
    if (NeedIOffset &&
        TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
                              AMDGPU::FlatAddrSpace::FlatGlobal)) {
      // The whole constant fits in the instruction's immediate field: strip it
      // from the address and keep matching on the base.
      Addr = PtrBase;
      ImmOffset = ConstOffset;
    } else {
      // The constant cannot be used directly as the immediate. If the base is
      // an SGPR we may still be able to put (part of) it into voffset.
      auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
      if (isSGPR(PtrBaseDef->Reg)) {
        if (ConstOffset > 0) {
          // Offset is too large.
          //
          // saddr + large_offset -> saddr +
          //                         (voffset = large_offset & ~MaxOffset) +
          //                         (large_offset & MaxOffset);
          int64_t SplitImmOffset = 0, RemainderOffset = ConstOffset;
          if (NeedIOffset) {
            std::tie(SplitImmOffset, RemainderOffset) =
                TII.splitFlatOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
                                    AMDGPU::FlatAddrSpace::FlatGlobal);
          }

          // The remainder must fit in the 32-bit voffset, interpreted as
          // signed or unsigned depending on the subtarget.
          if (Subtarget->hasSignedGVSOffset() ? isInt<32>(RemainderOffset)
                                              : isUInt<32>(RemainderOffset)) {
            MachineInstr *MI = Root.getParent();
            MachineBasicBlock *MBB = MI->getParent();
            Register HighBits =
                MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);

            // Materialize the remainder in a VGPR right before the user.
            BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
                    HighBits)
                .addImm(RemainderOffset);

            if (NeedIOffset)
              return {{
                  [=](MachineInstrBuilder &MIB) {
                    MIB.addReg(PtrBase);
                  }, // saddr
                  [=](MachineInstrBuilder &MIB) {
                    MIB.addReg(HighBits);
                  }, // voffset
                  [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
                  [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
              }};
            return {{
                [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
                [=](MachineInstrBuilder &MIB) {
                  MIB.addReg(HighBits);
                }, // voffset
                [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
            }};
          }
        }

        // We are adding a 64 bit SGPR and a constant. If constant bus limit
        // is 1 we would need to perform 1 or 2 extra moves for each half of
        // the constant and it is better to do a scalar add and then issue a
        // single VALU instruction to materialize zero. Otherwise it is less
        // instructions to perform VALU adds with immediates or inline literals.
        unsigned NumLiterals =
            !TII.isInlineConstant(APInt(32, Lo_32(ConstOffset))) +
            !TII.isInlineConstant(APInt(32, Hi_32(ConstOffset)));
        if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
          return std::nullopt;
      }
    }
  }

  // Match the variable offset.
  auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
  if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
    // Look through the SGPR->VGPR copy.
    Register SAddr =
        getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);

    if (isSGPR(SAddr)) {
      Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();

      // It's possible voffset is an SGPR here, but the copy to VGPR will be
      // inserted later.
      // PtrBaseOffset is passed by name to selectScaleOffset() before being
      // matched below; NOTE(review): presumably it may be rewritten there to
      // the unscaled offset -- confirm against selectScaleOffset().
      bool ScaleOffset = selectScaleOffset(Root, PtrBaseOffset,
                                           Subtarget->hasSignedGVSOffset());
      if (Register VOffset = matchExtendFromS32OrS32(
              PtrBaseOffset, Subtarget->hasSignedGVSOffset())) {
        if (NeedIOffset)
          return {{[=](MachineInstrBuilder &MIB) { // saddr
                     MIB.addReg(SAddr);
                   },
                   [=](MachineInstrBuilder &MIB) { // voffset
                     MIB.addReg(VOffset);
                   },
                   [=](MachineInstrBuilder &MIB) { // offset
                     MIB.addImm(ImmOffset);
                   },
                   [=](MachineInstrBuilder &MIB) { // cpol
                     MIB.addImm(CPolBits |
                                (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
                   }}};
        return {{[=](MachineInstrBuilder &MIB) { // saddr
                   MIB.addReg(SAddr);
                 },
                 [=](MachineInstrBuilder &MIB) { // voffset
                   MIB.addReg(VOffset);
                 },
                 [=](MachineInstrBuilder &MIB) { // cpol
                   MIB.addImm(CPolBits |
                              (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
                 }}};
      }
    }
  }

  // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
  // drop this.
  if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
      AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
    return std::nullopt;

  // It's cheaper to materialize a single 32-bit zero for vaddr than the two
  // moves required to copy a 64-bit SGPR to VGPR.
  MachineInstr *MI = Root.getParent();
  MachineBasicBlock *MBB = MI->getParent();
  Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
      .addImm(0);

  if (NeedIOffset)
    return {{
        [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
        [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); },      // voffset
        [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); },    // offset
        [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); }      // cpol
    }};
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
      [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); },      // voffset
      [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); }      // cpol
  }};
}


/// Convenience overload: match a global SADDR address with a cpol value of 0.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
  const unsigned NoCPolBits = 0;
  return selectGlobalSAddr(Root, NoCPolBits);
}


/// Match a global SADDR address, forwarding the cache policy found on the
/// parent instruction (with the SCAL bit cleared).
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectGlobalSAddrCPol(MachineOperand &Root) const {
  const MachineInstr &Parent = *Root.getParent();

  // We are assuming CPol is always the last operand of the intrinsic.
  const MachineOperand &CPolOp =
      Parent.getOperand(Parent.getNumOperands() - 1);
  const auto CPolNoScale = CPolOp.getImm() & ~AMDGPU::CPol::SCAL;
  return selectGlobalSAddr(Root, CPolNoScale);
}


/// Match a global SADDR address, forwarding the cache policy found in the
/// second-to-last operand of the parent instruction (with the SCAL bit
/// cleared).
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectGlobalSAddrCPolM0(MachineOperand &Root) const {
  const MachineInstr &Parent = *Root.getParent();

  // We are assuming CPol is second from last operand of the intrinsic.
  const MachineOperand &CPolOp =
      Parent.getOperand(Parent.getNumOperands() - 2);
  const auto CPolNoScale = CPolOp.getImm() & ~AMDGPU::CPol::SCAL;
  return selectGlobalSAddr(Root, CPolNoScale);
}


/// Match a global SADDR address, using AMDGPU::CPol::GLC as the cpol value.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const {
  const unsigned GLCBits = AMDGPU::CPol::GLC;
  return selectGlobalSAddr(Root, GLCBits);
}


/// Match a global SADDR address without rendering an immediate offset
/// operand. The cache policy is taken from the parent instruction's last
/// operand, with the SCAL bit cleared.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectGlobalSAddrNoIOffset(
    MachineOperand &Root) const {
  const MachineInstr &Parent = *Root.getParent();

  // We are assuming CPol is always the last operand of the intrinsic.
  const MachineOperand &CPolOp =
      Parent.getOperand(Parent.getNumOperands() - 1);
  const auto CPolNoScale = CPolOp.getImm() & ~AMDGPU::CPol::SCAL;
  return selectGlobalSAddr(Root, CPolNoScale, /*NeedIOffset=*/false);
}


/// Match a global SADDR address without rendering an immediate offset
/// operand. The cache policy is taken from the parent instruction's
/// second-to-last operand, with the SCAL bit cleared.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectGlobalSAddrNoIOffsetM0(
    MachineOperand &Root) const {
  const MachineInstr &Parent = *Root.getParent();

  // We are assuming CPol is second from last operand of the intrinsic.
  const MachineOperand &CPolOp =
      Parent.getOperand(Parent.getNumOperands() - 2);
  const auto CPolNoScale = CPolOp.getImm() & ~AMDGPU::CPol::SCAL;
  return selectGlobalSAddr(Root, CPolNoScale, /*NeedIOffset=*/false);
}


/// Match a flat-scratch address whose base is a frame index or an SGPR,
/// optionally plus a legal immediate. Renders saddr followed by the immediate
/// offset. Returns std::nullopt if the resulting base is not an SGPR.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
  Register Addr = Root.getReg();
  int64_t ImmOffset = 0;

  // Match the immediate offset first, which canonically is moved as low as
  // possible.
  Register Base;
  int64_t Cst;
  std::tie(Base, Cst, std::ignore) = getPtrBaseWithConstantOffset(Addr, *MRI);

  // Only strip the constant when it is nonzero, the base is legal for flat
  // scratch, and the constant is encodable.
  const bool FoldImm =
      Cst != 0 && isFlatScratchBaseLegal(Addr) &&
      TII.isLegalFLATOffset(Cst, AMDGPUAS::PRIVATE_ADDRESS,
                            AMDGPU::FlatAddrSpace::FlatScratch);
  if (FoldImm) {
    Addr = Base;
    ImmOffset = Cst;
  }

  const auto RenderImm = [=](MachineInstrBuilder &MIB) {
    MIB.addImm(ImmOffset);
  };

  auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
  MachineInstr *AddrMI = AddrDef->MI;

  // A bare frame index is used directly as saddr.
  if (AddrMI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
    const int FrameIdx = AddrMI->getOperand(1).getIndex();
    return {{
        [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FrameIdx); }, // saddr
        RenderImm                                                       // offset
    }};
  }

  Register SAddr = AddrDef->Reg;

  if (AddrMI->getOpcode() == AMDGPU::G_PTR_ADD) {
    const Register Op1 = AddrMI->getOperand(1).getReg();
    const Register Op2 = AddrMI->getOperand(2).getReg();
    auto Op1Def = getDefSrcRegIgnoringCopies(Op1, *MRI);
    auto Op2Def = getDefSrcRegIgnoringCopies(Op2, *MRI);

    // (frame index + SGPR): combine both into a fresh SGPR with a scalar add
    // inserted before the user.
    const bool IsFrameIndexPlusSGPR =
        Op1Def->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
        isSGPR(Op2Def->Reg);
    if (IsFrameIndexPlusSGPR) {
      const int FrameIdx = Op1Def->MI->getOperand(1).getIndex();
      MachineInstr &Parent = *Root.getParent();
      MachineBasicBlock *ParentBB = Parent.getParent();
      const DebugLoc &Loc = Parent.getDebugLoc();
      SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);

      BuildMI(*ParentBB, &Parent, Loc, TII.get(AMDGPU::S_ADD_I32), SAddr)
          .addFrameIndex(FrameIdx)
          .addReg(Op2Def->Reg)
          .setOperandDead(3); // Dead scc
    }
  }

  if (!isSGPR(SAddr))
    return std::nullopt;

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
      RenderImm                                             // offset
  }};
}


// Check whether the flat scratch SVS swizzle bug affects this access.
// Always returns false on subtargets without the bug.
bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
    Register VAddr, Register SAddr, uint64_t ImmOffset) const {
  if (Subtarget->hasFlatScratchSVSSwizzleBug()) {
    // The bug affects the swizzling of SVS accesses if there is any carry out
    // from the two low order bits (i.e. from bit 1 into bit 2) when adding
    // voffset to (soffset + inst_offset).
    const KnownBits VBits = VT->getKnownBits(VAddr);
    const KnownBits SBits =
        KnownBits::add(VT->getKnownBits(SAddr),
                       KnownBits::makeConstant(APInt(32, ImmOffset)));

    // Use the largest possible value of each addend and test whether their
    // two low bits can sum past 3.
    const uint64_t VLow = VBits.getMaxValue().getZExtValue() & 3;
    const uint64_t SLow = SBits.getMaxValue().getZExtValue() & 3;
    return VLow + SLow >= 4;
  }

  return false;
}


/// Match a flat-scratch address of the form (SGPR or frame index) + VGPR,
/// optionally plus a legal immediate. Rendered operands, in order: vaddr,
/// saddr, immediate offset, cpol. Returns std::nullopt when the address is not
/// a G_PTR_ADD with a VGPR right-hand side, when the base fails the
/// flat-scratch legality checks, when the SVS swizzle bug check fires, or
/// when the left-hand side is not an SGPR / frame index.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
  Register Addr = Root.getReg();
  Register PtrBase;
  int64_t ConstOffset;
  int64_t ImmOffset = 0;

  // Match the immediate offset first, which canonically is moved as low as
  // possible.
  std::tie(PtrBase, ConstOffset, std::ignore) =
      getPtrBaseWithConstantOffset(Addr, *MRI);

  // Remember the unstripped address: the legality checks below are run on it,
  // and comparing it with Addr tells us whether an immediate was folded.
  Register OrigAddr = Addr;
  if (ConstOffset != 0 &&
      TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
                            AMDGPU::FlatAddrSpace::FlatScratch)) {
    Addr = PtrBase;
    ImmOffset = ConstOffset;
  }

  auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
  if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
    return std::nullopt;

  // The variable part (operand 2) must live in the VGPR bank.
  Register RHS = AddrDef->MI->getOperand(2).getReg();
  if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
    return std::nullopt;

  Register LHS = AddrDef->MI->getOperand(1).getReg();
  auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);

  // Pick the legality check matching the shape that was matched: with a folded
  // immediate (SGPR + VGPR + Imm) or without one (SGPR + VGPR).
  if (OrigAddr != Addr) {
    if (!isFlatScratchBaseLegalSVImm(OrigAddr))
      return std::nullopt;
  } else {
    if (!isFlatScratchBaseLegalSV(OrigAddr))
      return std::nullopt;
  }

  if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
    return std::nullopt;

  // RHS is passed by name to selectScaleOffset(); NOTE(review): presumably it
  // may be rewritten there to the unscaled offset -- confirm against
  // selectScaleOffset().
  unsigned CPol = selectScaleOffset(Root, RHS, true /* IsSigned */)
                      ? AMDGPU::CPol::SCAL
                      : 0;

  if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
    int FI = LHSDef->MI->getOperand(1).getIndex();
    return {{
        [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); },       // vaddr
        [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
        [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
        [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }       // cpol
    }};
  }

  // If LHS itself is not an SGPR, retry with the register found by looking
  // through copies.
  if (!isSGPR(LHS))
    if (auto Def = getDefSrcRegIgnoringCopies(LHS, *MRI))
      LHS = Def->Reg;

  if (!isSGPR(LHS))
    return std::nullopt;

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); },       // vaddr
      [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); },       // saddr
      [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
      [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }       // cpol
  }};
}


/// Select the operands of a MUBUF scratch access in "offen" form. Rendered
/// operands, in order: rsrc (the function's scratch resource register), vaddr
/// (a VGPR or a frame index), soffset (always the immediate 0) and the
/// immediate offset.
///
/// A constant address other than the private null pointer is split into a
/// V_MOV_B32 of the bits above the maximum MUBUF immediate plus the low bits
/// as the immediate. Otherwise a legal constant offset and/or a frame index
/// is folded when possible, and the root register is used as vaddr as a
/// fallback.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MBB->getParent();
  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();

  int64_t Offset = 0;
  if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
      Offset != AMDGPU::getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
    Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    // TODO: Should this be inside the render function? The iterator seems to
    // move.
    const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
    BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
            HighBits)
        .addImm(Offset & ~MaxOffset);

    return {{[=](MachineInstrBuilder &MIB) { // rsrc
               MIB.addReg(Info->getScratchRSrcReg());
             },
             [=](MachineInstrBuilder &MIB) { // vaddr
               MIB.addReg(HighBits);
             },
             [=](MachineInstrBuilder &MIB) { // soffset
               // Use constant zero for soffset and rely on eliminateFrameIndex
               // to choose the appropriate frame register if need be.
               MIB.addImm(0);
             },
             [=](MachineInstrBuilder &MIB) { // offset
               MIB.addImm(Offset & MaxOffset);
             }}};
  }

  assert(Offset == 0 || Offset == -1);

  // If Root matched the null-pointer constant, m_ICst has already written that
  // value (-1) into Offset even though the fold above was declined. The
  // register holding the constant is used as vaddr below, so the immediate
  // must not carry the stale value as well: it is neither a legal MUBUF
  // immediate nor the intended address.
  Offset = 0;

  // Try to fold a frame index directly into the MUBUF vaddr field, and any
  // offsets.
  std::optional<int> FI;
  Register VAddr = Root.getReg();

  const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
  Register PtrBase;
  int64_t ConstOffset;
  std::tie(PtrBase, ConstOffset, std::ignore) =
      getPtrBaseWithConstantOffset(VAddr, *MRI);
  if (ConstOffset != 0) {
    // Only fold the constant when it is encodable and, if the subtarget
    // reports that the private memory resource is range checked, the base is
    // known to have a zero sign bit.
    if (TII.isLegalMUBUFImmOffset(ConstOffset) &&
        (!STI.privateMemoryResourceIsRangeChecked() ||
         VT->signBitIsZero(PtrBase))) {
      const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
      if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
        FI = PtrBaseDef->getOperand(1).getIndex();
      else
        VAddr = PtrBase;
      Offset = ConstOffset;
    }
  } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
    FI = RootDef->getOperand(1).getIndex();
  }

  return {{[=](MachineInstrBuilder &MIB) { // rsrc
             MIB.addReg(Info->getScratchRSrcReg());
           },
           [=](MachineInstrBuilder &MIB) { // vaddr
             if (FI)
               MIB.addFrameIndex(*FI);
             else
               MIB.addReg(VAddr);
           },
           [=](MachineInstrBuilder &MIB) { // soffset
             // Use constant zero for soffset and rely on eliminateFrameIndex
             // to choose the appropriate frame register if need be.
             MIB.addImm(0);
           },
           [=](MachineInstrBuilder &MIB) { // offset
             MIB.addImm(Offset);
           }}};
}


/// Return true if \p Offset may be folded into the 16-bit unsigned offset
/// field of a DS instruction whose base address is \p Base.
bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
                                                int64_t Offset) const {
  const bool FitsInField = isUInt<16>(Offset);
  if (!FitsInField)
    return false;

  // On Southern Islands instruction with a negative base value and an offset
  // don't seem to work, so unless the subtarget has a usable DS offset (or
  // unsafe folding was requested) the base must be known non-negative.
  const bool AnyBaseAllowed =
      STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled();
  return AnyBaseAllowed || VT->signBitIsZero(Base);
}


/// Return true if the byte offsets \p Offset0 and \p Offset1 may be folded
/// into the two offset fields of a two-element DS instruction with element
/// size \p Size and base address \p Base. Both offsets must be multiples of
/// \p Size and, once divided by it, fit in 8 unsigned bits.
bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
                                                 int64_t Offset1,
                                                 unsigned Size) const {
  const bool BothAligned = Offset0 % Size == 0 && Offset1 % Size == 0;
  if (!BothAligned)
    return false;

  const bool BothEncodable =
      isUInt<8>(Offset0 / Size) && isUInt<8>(Offset1 / Size);
  if (!BothEncodable)
    return false;

  // On Southern Islands instruction with a negative base value and an offset
  // don't seem to work, so unless the subtarget has a usable DS offset (or
  // unsafe folding was requested) the base must be known non-negative.
  const bool AnyBaseAllowed =
      STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled();
  return AnyBaseAllowed || VT->signBitIsZero(Base);
}


// Return whether the operation has NoUnsignedWrap property: either it is a
// G_OR, or it is a G_PTR_ADD carrying the NoUWrap flag.
static bool isNoUnsignedWrap(MachineInstr *Addr) {
  switch (Addr->getOpcode()) {
  case TargetOpcode::G_OR:
    return true;
  case TargetOpcode::G_PTR_ADD:
    return Addr->getFlag(MachineInstr::NoUWrap);
  default:
    return false;
  }
}


// Check that the base address of flat scratch load/store in the form of `base +
// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
// requirement). We always treat the first operand as the base address here.
// The defining instruction of \p Addr is expected to have register operands 1
// and 2.
bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const {
  MachineInstr *AddMI = getDefIgnoringCopies(Addr, *MRI);

  // A no-unsigned-wrap add is always fine. Starting with GFX12, VADDR and
  // SADDR fields in VSCRATCH can use negative values as well.
  if (isNoUnsignedWrap(AddMI) || STI.hasSignedScratchOffsets())
    return true;

  const Register BaseReg = AddMI->getOperand(1).getReg();
  const Register OffsetReg = AddMI->getOperand(2).getReg();

  if (AddMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
    // If the immediate offset is negative and within certain range, the base
    // address cannot also be negative. If the base is also negative, the sum
    // would be either negative or much larger than the valid range of scratch
    // memory a thread can access.
    const std::optional<ValueAndVReg> OffsetCst =
        getIConstantVRegValWithLookThrough(OffsetReg, *MRI);
    const bool SmallNegativeImm = OffsetCst &&
                                  OffsetCst->Value.getSExtValue() < 0 &&
                                  OffsetCst->Value.getSExtValue() > -0x40000000;
    if (SmallNegativeImm)
      return true;
  }

  return VT->signBitIsZero(BaseReg);
}


// Check address value in SGPR/VGPR are legal for flat scratch in the form
// of: SGPR + VGPR. Both addends must be known to have a zero sign bit unless
// the add cannot wrap or the subtarget accepts signed scratch offsets.
bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
  MachineInstr *AddMI = getDefIgnoringCopies(Addr, *MRI);

  // A no-unsigned-wrap add is always fine. Starting with GFX12, VADDR and
  // SADDR fields in VSCRATCH can use negative values as well.
  if (isNoUnsignedWrap(AddMI) || STI.hasSignedScratchOffsets())
    return true;

  const Register Op1 = AddMI->getOperand(1).getReg();
  const Register Op2 = AddMI->getOperand(2).getReg();
  if (!VT->signBitIsZero(Op2))
    return false;
  return VT->signBitIsZero(Op1);
}


// Check address value in SGPR/VGPR are legal for flat scratch in the form
// of: SGPR + VGPR + Imm. \p Addr is expected to be defined by an add whose
// second operand is a constant and whose first operand is itself an add.
bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm(
    Register Addr) const {
  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (STI.hasSignedScratchOffsets())
    return true;

  MachineInstr *OuterAdd = getDefIgnoringCopies(Addr, *MRI);
  const Register InnerReg = OuterAdd->getOperand(1).getReg();
  std::optional<DefinitionAndSourceRegister> InnerDef =
      getDefSrcRegIgnoringCopies(InnerReg, *MRI);
  std::optional<ValueAndVReg> ImmCst =
      getIConstantVRegValWithLookThrough(OuterAdd->getOperand(2).getReg(),
                                         *MRI);
  assert(ImmCst);

  // If the immediate offset is negative and within certain range, the base
  // address cannot also be negative. If the base is also negative, the sum
  // would be either negative or much larger than the valid range of scratch
  // memory a thread can access.
  const auto IsSmallNegativeImm = [&]() {
    return ImmCst->Value.getSExtValue() < 0 &&
           ImmCst->Value.getSExtValue() > -0x40000000;
  };
  if (isNoUnsignedWrap(InnerDef->MI) &&
      (isNoUnsignedWrap(OuterAdd) || IsSmallNegativeImm()))
    return true;

  // Otherwise both addends of the inner add must be known non-negative.
  const Register Op1 = InnerDef->MI->getOperand(1).getReg();
  const Register Op2 = InnerDef->MI->getOperand(2).getReg();
  if (!VT->signBitIsZero(Op2))
    return false;
  return VT->signBitIsZero(Op1);
}


bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,

                                                    unsigned ShAmtBits) const {

  assert(MI.getOpcode() == TargetOpcode::G_AND);


  std::optional<APInt> RHS =

      getIConstantVRegVal(MI.getOperand(2).getReg(), *MRI);

  if (!RHS)

    return false;


  if (RHS->countr_one() >= ShAmtBits)

    return true;


  const APInt &LHSKnownZeros = VT->getKnownZeroes(MI.getOperand(1).getReg());

  return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits;

}


/// Select the operands of a MUBUF scratch access in "offset" form. Rendered
/// operands, in order: rsrc (the function's scratch resource register),
/// soffset and the immediate offset. Matches a wave address, a wave address
/// plus a legal constant, or a legal constant address; returns an empty
/// result otherwise.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFScratchOffset(
    MachineOperand &Root) const {
  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();

  std::optional<DefinitionAndSourceRegister> Def =
      getDefSrcRegIgnoringCopies(Root.getReg(), *MRI);
  assert(Def && "this shouldn't be an optional result");
  const Register SrcReg = Def->Reg;

  // The rsrc renderer is the same for every successful match.
  const auto RenderRSrc = [=](MachineInstrBuilder &MIB) {
    MIB.addReg(Info->getScratchRSrcReg());
  };

  // Case 1: the address is a wave address by itself.
  if (Register WaveBase = getWaveAddress(Def->MI)) {
    return {{
        RenderRSrc,                                             // rsrc
        [=](MachineInstrBuilder &MIB) { MIB.addReg(WaveBase); }, // soffset
        [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }         // offset
    }};
  }

  int64_t Offset = 0;

  // Case 2: wave address plus a constant.
  // FIXME: Copy check is a hack
  Register BasePtr;
  const bool IsPtrAddOfConstant =
      mi_match(SrcReg, *MRI,
               m_GPtrAdd(m_Reg(BasePtr),
                         m_any_of(m_ICst(Offset), m_Copy(m_ICst(Offset)))));
  if (IsPtrAddOfConstant) {
    if (!TII.isLegalMUBUFImmOffset(Offset))
      return std::nullopt;

    MachineInstr *BasePtrDef = getDefIgnoringCopies(BasePtr, *MRI);
    Register WaveBase = getWaveAddress(BasePtrDef);
    if (!WaveBase)
      return std::nullopt;

    return {{
        RenderRSrc,                                             // rsrc
        [=](MachineInstrBuilder &MIB) { MIB.addReg(WaveBase); }, // soffset
        [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }    // offset
    }};
  }

  // Case 3: a plain constant address that fits in the immediate field.
  const bool IsLegalConstant =
      mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
      TII.isLegalMUBUFImmOffset(Offset);
  if (!IsLegalConstant)
    return std::nullopt;

  return {{
      RenderRSrc,                                           // rsrc
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },      // soffset
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }  // offset
  }};
}


/// Split a DS address into (base register, immediate offset). When the
/// address is a base plus a constant that isDSOffsetLegal() accepts, that
/// pair is returned; otherwise the root register with offset 0 is returned.
std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
  const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
  int64_t ConstAddr = 0;

  Register Base;
  int64_t Cst;
  std::tie(Base, Cst, std::ignore) =
      getPtrBaseWithConstantOffset(Root.getReg(), *MRI);

  if (Cst != 0) {
    // (add n0, c0)
    if (isDSOffsetLegal(Base, Cst))
      return std::pair(Base, Cst);
    return std::pair(Root.getReg(), 0);
  }

  if (RootDef->getOpcode() == AMDGPU::G_SUB) {
    // TODO
  } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
    // TODO
  }

  return std::pair(Root.getReg(), 0);
}


/// Complex-pattern renderer for a single-offset DS address: emits the base
/// register followed by the immediate offset from selectDS1Addr1OffsetImpl().
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
  const std::pair<Register, unsigned> BaseAndImm =
      selectDS1Addr1OffsetImpl(Root);
  const Register Base = BaseAndImm.first;
  const unsigned Imm = BaseAndImm.second;

  const auto RenderBase = [=](MachineInstrBuilder &MIB) { MIB.addReg(Base); };
  const auto RenderImm = [=](MachineInstrBuilder &MIB) { MIB.addImm(Imm); };
  return {{RenderBase, RenderImm}};
}


/// Select a two-element DS address using an element size of 4 bytes.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
  const unsigned ElementSize = 4;
  return selectDSReadWrite2(Root, ElementSize);
}


/// Select a two-element DS address using an element size of 8 bytes.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
  const unsigned ElementSize = 8;
  return selectDSReadWrite2(Root, ElementSize);
}


/// Complex-pattern renderer for a two-element DS access with element size
/// \p Size. Emits the base register, then the first element offset, then that
/// offset plus one.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
                                              unsigned Size) const {
  const std::pair<Register, unsigned> BaseAndIndex =
      selectDSReadWrite2Impl(Root, Size);
  const Register Base = BaseAndIndex.first;
  const unsigned Index0 = BaseAndIndex.second;
  const unsigned Index1 = Index0 + 1;

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Index0); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Index1); }
  }};
}


/// Split the address of a two-element DS access into (base register, first
/// element offset in units of \p Size). When the address is a base plus a
/// constant for which isDSOffset2Legal() accepts both element offsets, that
/// pair is returned; otherwise the root register with offset 0 is returned.
std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
                                                  unsigned Size) const {
  const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
  int64_t ConstAddr = 0;

  Register Base;
  int64_t Cst;
  std::tie(Base, Cst, std::ignore) =
      getPtrBaseWithConstantOffset(Root.getReg(), *MRI);

  if (Cst != 0) {
    // Byte offsets of the first and second element.
    const int64_t FirstByteOffset = Cst;
    const int64_t SecondByteOffset = Cst + Size;
    // (add n0, c0)
    if (isDSOffset2Legal(Base, FirstByteOffset, SecondByteOffset, Size))
      return std::pair(Base, FirstByteOffset / Size);
    return std::pair(Root.getReg(), 0);
  }

  if (RootDef->getOpcode() == AMDGPU::G_SUB) {
    // TODO
  } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
    // TODO
  }

  return std::pair(Root.getReg(), 0);
}


/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
/// the base value with the constant offset, and if the offset computation is
/// known to be inbounds. There may be intervening copies between \p Root and
/// the identified constant. Returns \p Root, 0, false if this does not match
/// the pattern.
std::tuple<Register, int64_t, bool>
AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
    Register Root, const MachineRegisterInfo &MRI) const {
  // Result used whenever the pattern is not recognized.
  const std::tuple<Register, int64_t, bool> NoMatch(Root, 0, false);

  MachineInstr *Def = getDefIgnoringCopies(Root, MRI);
  if (Def->getOpcode() != TargetOpcode::G_PTR_ADD)
    return NoMatch;

  MachineOperand &OffsetOp = Def->getOperand(2);
  std::optional<ValueAndVReg> OffsetCst =
      getIConstantVRegValWithLookThrough(OffsetOp.getReg(), MRI);
  if (!OffsetCst)
    return NoMatch;

  const bool InBounds = Def->getFlag(MachineInstr::MIFlag::InBounds);
  const Register Base = Def->getOperand(1).getReg();
  return {Base, OffsetCst->Value.getSExtValue(), InBounds};
}


/// Renderer helper: append an immediate operand with value 0 to \p MIB.
static void addZeroImm(MachineInstrBuilder &MIB) {
  const int64_t Zero = 0;
  MIB.addImm(Zero);
}


/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
/// BasePtr is not valid, a null base pointer will be used.
///
/// \p FormatLo and \p FormatHi are moved into sub-registers 2 and 3 of the
/// returned 128-bit SGPR; the base pointer occupies sub-registers 0 and 1.
static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                          uint32_t FormatLo, uint32_t FormatHi,
                          Register BasePtr) {
  Register Dword2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register Dword3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register UpperHalf = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
  Register Descriptor = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);

  B.buildInstr(AMDGPU::S_MOV_B32).addDef(Dword2).addImm(FormatLo);
  B.buildInstr(AMDGPU::S_MOV_B32).addDef(Dword3).addImm(FormatHi);

  // Build the half of the subregister with the constants before building the
  // full 128-bit register. If we are building multiple resource descriptors,
  // this will allow CSEing of the 2-component register.
  B.buildInstr(AMDGPU::REG_SEQUENCE)
      .addDef(UpperHalf)
      .addReg(Dword2)
      .addImm(AMDGPU::sub0)
      .addReg(Dword3)
      .addImm(AMDGPU::sub1);

  // Without a base pointer, materialize a 64-bit zero for the lower half.
  Register LowerHalf = BasePtr;
  if (!BasePtr) {
    LowerHalf = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    B.buildInstr(AMDGPU::S_MOV_B64).addDef(LowerHalf).addImm(0);
  }

  B.buildInstr(AMDGPU::REG_SEQUENCE)
      .addDef(Descriptor)
      .addReg(LowerHalf)
      .addImm(AMDGPU::sub0_sub1)
      .addReg(UpperHalf)
      .addImm(AMDGPU::sub2_sub3);

  return Descriptor;
}


/// Build a resource descriptor around \p BasePtr whose third dword is 0 and
/// whose fourth dword is the high half of the default resource data format.
static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                                const SIInstrInfo &TII, Register BasePtr) {
  const uint64_t RsrcFormat = TII.getDefaultRsrcDataFormat();
  const uint32_t FormatHi = Hi_32(RsrcFormat);

  // FIXME: Why are half the "default" bits ignored based on the addressing
  // mode?
  return buildRSRC(B, MRI, 0, FormatHi, BasePtr);
}


/// Build a resource descriptor around \p BasePtr whose third dword is all
/// ones (-1) and whose fourth dword is the high half of the default resource
/// data format.
static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                               const SIInstrInfo &TII, Register BasePtr) {
  const uint64_t RsrcFormat = TII.getDefaultRsrcDataFormat();
  const uint32_t FormatHi = Hi_32(RsrcFormat);

  // FIXME: Why are half the "default" bits ignored based on the addressing
  // mode?
  return buildRSRC(B, MRI, -1, FormatHi, BasePtr);
}


/// Decompose the MUBUF address \p Src. N0 is the address with a constant
/// offset stripped when that offset fits in 32 unsigned bits (the offset is
/// stored in Offset); if N0 is itself a G_PTR_ADD, N2 and N3 receive its two
/// operands after looking through copies.
AMDGPUInstructionSelector::MUBUFAddressData
AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
  MUBUFAddressData Result;
  Result.N0 = Src;

  Register Base;
  int64_t Cst;
  std::tie(Base, Cst, std::ignore) = getPtrBaseWithConstantOffset(Src, *MRI);
  if (isUInt<32>(Cst)) {
    Result.N0 = Base;
    Result.Offset = Cst;
  }

  MachineInstr *InnerAdd =
      getOpcodeDef(TargetOpcode::G_PTR_ADD, Result.N0, *MRI);
  if (!InnerAdd)
    return Result;

  const Register AddLHS = InnerAdd->getOperand(1).getReg();
  const Register AddRHS = InnerAdd->getOperand(2).getReg();

  // FIXME: Need to fix extra SGPR->VGPRcopies inserted
  // FIXME: Don't know this was defined by operand 0
  //
  // TODO: Remove this when we have copy folding optimizations after
  // RegBankSelect.
  Result.N2 = getDefIgnoringCopies(AddLHS, *MRI)->getOperand(0).getReg();
  Result.N3 = getDefIgnoringCopies(AddRHS, *MRI)->getOperand(0).getReg();
  return Result;
}


/// Return if the addr64 mubuf mode should be used for the given address.
/// That is the case when the address contains a variable add (N2 is set) or
/// when N0 is assigned to the VGPR bank.
bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
  if (!Addr.N2) {
    // No inner add: decide purely on the register bank of N0.
    const RegisterBank *Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
    return Bank->getID() == AMDGPU::VGPRRegBankID;
  }

  // (ptr_add N2, N3) -> addr64, or
  // (ptr_add (ptr_add N2, N3), C1) -> addr64
  return true;
}


/// Split an immediate offset \p ImmOffset depending on whether it fits in the
/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
/// component. When the offset is already legal, neither argument is touched.
void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
  MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
  if (!TII.isLegalMUBUFImmOffset(ImmOffset)) {
    // Illegal offset, store it in soffset.
    SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    B.buildInstr(AMDGPU::S_MOV_B32).addDef(SOffset).addImm(ImmOffset);
    ImmOffset = 0;
  }
}


/// Try to select \p Root as an addr64 MUBUF address.
///
/// Returns false if the subtarget lacks addr64, prefers flat for global, or
/// shouldUseAddr64 rejects the parsed address. On success \p VAddr,
/// \p RSrcReg and \p Offset are filled in; \p SOffset is written only when
/// the constant offset is not a legal MUBUF immediate.
bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
  MachineOperand &Root, Register &VAddr, Register &RSrcReg,
  Register &SOffset, int64_t &Offset) const {
  // FIXME: Predicates should stop this from reaching here.
  // addr64 bit was removed for volcanic islands.
  if (!STI.hasAddr64() || STI.useFlatForGlobal())
    return false;

  const MUBUFAddressData Addr = parseMUBUFAddress(Root.getReg());
  if (!shouldUseAddr64(Addr))
    return false;

  const auto IsVGPR = [this](Register R) {
    return RBI.getRegBank(R, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID;
  };

  Offset = Addr.Offset;

  // Base pointer for the SRD; stays null when the default resource is used.
  Register SRDPtr;

  if (!Addr.N2) {
    if (IsVGPR(Addr.N0)) {
      // Use the default null pointer in the resource.
      VAddr = Addr.N0;
    } else {
      // N0 -> offset, or
      // (N0 + C1) -> offset
      SRDPtr = Addr.N0;
    }
  } else if (!IsVGPR(Addr.N2)) {
    // N2 is not divergent.
    SRDPtr = Addr.N2;
    VAddr = Addr.N3;
  } else {
    assert(Addr.N3);
    if (IsVGPR(Addr.N3)) {
      // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
      // addr64, and construct the default resource from a 0 address.
      VAddr = Addr.N0;
    } else {
      SRDPtr = Addr.N3;
      VAddr = Addr.N2;
    }
  }

  MachineIRBuilder Builder(*Root.getParent());
  RSrcReg = buildAddr64RSrc(Builder, *MRI, TII, SRDPtr);
  splitIllegalMUBUFOffset(Builder, SOffset, Offset);
  return true;
}


/// Try to select \p Root as an offset-only MUBUF address.
///
/// Returns false if the subtarget prefers flat for global or the parsed
/// address should use addr64 instead. On success \p RSrcReg and \p Offset are
/// filled in; \p SOffset is written only when the constant offset is not a
/// legal MUBUF immediate.
bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
  MachineOperand &Root, Register &RSrcReg, Register &SOffset,
  int64_t &Offset) const {
  // FIXME: Pattern should not reach here.
  if (STI.useFlatForGlobal())
    return false;

  const MUBUFAddressData Addr = parseMUBUFAddress(Root.getReg());
  if (shouldUseAddr64(Addr))
    return false;

  // N0 -> offset, or
  // (N0 + C1) -> offset
  Offset = Addr.Offset;

  // TODO: Look through extensions for 32-bit soffset.
  MachineIRBuilder Builder(*Root.getParent());
  RSrcReg = buildOffsetSrc(Builder, *MRI, TII, /*BasePtr=*/Addr.N0);
  splitIllegalMUBUFOffset(Builder, SOffset, Offset);
  return true;
}


/// Complex-pattern entry point for addr64 MUBUF addressing.
///
/// Returns an empty result when selectMUBUFAddr64Impl fails; otherwise
/// renderers for rsrc, vaddr, soffset, offset, cpol, tfe and swz, in that
/// order.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
  Register VAddr;
  Register RSrcReg;
  Register SOffset;
  int64_t Offset = 0;

  const bool Selected =
      selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset);
  if (!Selected)
    return {};

  const auto RenderRSrc = [=](MachineInstrBuilder &MIB) {
    MIB.addReg(RSrcReg);
  };
  const auto RenderVAddr = [=](MachineInstrBuilder &MIB) {
    MIB.addReg(VAddr);
  };
  // soffset: explicit register if one was created, otherwise SGPR_NULL on
  // subtargets with restricted soffset, otherwise the immediate 0.
  const auto RenderSOffset = [=](MachineInstrBuilder &MIB) {
    if (SOffset) {
      MIB.addReg(SOffset);
      return;
    }
    if (STI.hasRestrictedSOffset()) {
      MIB.addReg(AMDGPU::SGPR_NULL);
      return;
    }
    MIB.addImm(0);
  };
  const auto RenderOffset = [=](MachineInstrBuilder &MIB) {
    MIB.addImm(Offset);
  };

  // FIXME: Use defaulted operands for trailing 0s and remove from the complex
  // pattern.
  return {{
      RenderRSrc,    // rsrc
      RenderVAddr,   // vaddr
      RenderSOffset, // soffset
      RenderOffset,  // offset
      addZeroImm,    // cpol
      addZeroImm,    // tfe
      addZeroImm     // swz
  }};
}


/// Complex-pattern entry point for offset-only MUBUF addressing.
///
/// Returns an empty result when selectMUBUFOffsetImpl fails; otherwise
/// renderers for rsrc, soffset, offset, cpol, tfe and swz, in that order.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
  Register RSrcReg;
  Register SOffset;
  int64_t Offset = 0;

  const bool Selected = selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset);
  if (!Selected)
    return {};

  const auto RenderRSrc = [=](MachineInstrBuilder &MIB) {
    MIB.addReg(RSrcReg);
  };
  // soffset: explicit register if one was created, otherwise SGPR_NULL on
  // subtargets with restricted soffset, otherwise the immediate 0.
  const auto RenderSOffset = [=](MachineInstrBuilder &MIB) {
    if (SOffset) {
      MIB.addReg(SOffset);
      return;
    }
    if (STI.hasRestrictedSOffset()) {
      MIB.addReg(AMDGPU::SGPR_NULL);
      return;
    }
    MIB.addImm(0);
  };
  const auto RenderOffset = [=](MachineInstrBuilder &MIB) {
    MIB.addImm(Offset);
  };

  return {{
      RenderRSrc,    // rsrc
      RenderSOffset, // soffset
      RenderOffset,  // offset
      addZeroImm,    // cpol
      addZeroImm,    // tfe
      addZeroImm,    // swz
  }};
}


/// Render the buffer soffset operand for \p Root.
///
/// On subtargets with restricted soffset, a register known to hold constant
/// zero is replaced by SGPR_NULL; otherwise the root register is used as-is.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectBUFSOffset(MachineOperand &Root) const {
  Register SOffset = Root.getReg();

  const bool ReplaceWithNull =
      STI.hasRestrictedSOffset() && mi_match(SOffset, *MRI, m_ZeroInt());
  if (ReplaceWithNull)
    SOffset = AMDGPU::SGPR_NULL;

  return {{[SOffset](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
}


/// Get an immediate that must be 32-bits, and treated as zero extended.
///
/// Returns std::nullopt when \p Reg is not a known integer constant or its
/// sign-extended value does not fit in a signed 32-bit integer.
static std::optional<uint64_t>
getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI) {
  // The constant lookup sign-extends, so range-check the signed value and
  // then hand back only its low 32 bits.
  const std::optional<int64_t> SExtVal = getIConstantVRegSExtVal(Reg, MRI);
  if (SExtVal && isInt<32>(*SExtVal))
    return Lo_32(*SExtVal);
  return std::nullopt;
}


/// Select an SMRD buffer offset that can be encoded as an immediate.
///
/// \p Root may be an immediate operand or a register holding a 32-bit
/// constant. Returns an empty result if no constant is found or the subtarget
/// cannot encode it.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
  std::optional<uint64_t> OffsetVal;
  if (Root.isImm())
    OffsetVal = Root.getImm();
  else
    OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);

  if (!OffsetVal)
    return {};

  const std::optional<int64_t> EncodedImm =
      AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, /*IsBuffer=*/true);
  if (!EncodedImm)
    return {};

  return {{[Imm = *EncodedImm](MachineInstrBuilder &MIB) { MIB.addImm(Imm); }}};
}


/// Select an SMRD buffer offset encoded as a 32-bit literal.
///
/// Asserts that the subtarget generation is SEA_ISLANDS. Returns an empty
/// result if \p Root is not a 32-bit constant or the literal encoding is
/// unavailable.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
  assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);

  const std::optional<uint64_t> OffsetVal =
      getConstantZext32Val(Root.getReg(), *MRI);
  if (!OffsetVal)
    return {};

  const std::optional<int64_t> EncodedImm =
      AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
  if (!EncodedImm)
    return {};

  return {{[Imm = *EncodedImm](MachineInstrBuilder &MIB) { MIB.addImm(Imm); }}};
}


/// Match the (soffset + offset) pair as a 32-bit register base and an
/// immediate offset.
///
/// Returns std::nullopt when no base register is found or the constant part
/// cannot be encoded as an SMRD buffer offset.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
  Register BaseReg;
  unsigned ConstPart;
  std::tie(BaseReg, ConstPart) = AMDGPU::getBaseWithConstantOffset(
      *MRI, Root.getReg(), VT, /*CheckNUW*/ true);
  if (!BaseReg)
    return std::nullopt;

  const std::optional<int64_t> Encoded =
      AMDGPU::getSMRDEncodedOffset(STI, ConstPart, /* IsBuffer */ true);
  if (!Encoded)
    return std::nullopt;

  assert(MRI->getType(BaseReg) == LLT::scalar(32));

  const auto RenderBase = [BaseReg](MachineInstrBuilder &MIB) {
    MIB.addReg(BaseReg);
  };
  const auto RenderImm = [Imm = *Encoded](MachineInstrBuilder &MIB) {
    MIB.addImm(Imm);
  };
  return {{RenderBase, RenderImm}};
}


/// Compute the source register and modifier bits for a mad-mix operand.
///
/// Starts from the VOP3 modifiers of \p Root. If the resulting source is an
/// FP extension, the extension is looked through, inner neg/abs modifiers are
/// folded where allowed, OP_SEL_1 is set, and OP_SEL_0 is added when the
/// value is the high element of its register. \p Matched is set to true only
/// when the extension was found.
std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
                                                     bool &Matched) const {
  Matched = false;

  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());

  // Without an FP extension there is nothing more to fold.
  if (!mi_match(Src, *MRI, m_GFPExt(m_Reg(Src))))
    return {Src, Mods};

  assert(MRI->getType(Src) == LLT::scalar(16));

  // Only change Src if src modifier could be gained. In such cases new Src
  // could be sgpr but this does not violate constant bus restriction for
  // instruction that is being selected.
  Src = stripBitCast(Src, *MRI);

  // Fold modifiers found on the current Src into Mods, updating Src.
  const auto FoldInnerMods = [&]() {
    // Be careful about folding modifiers if we already have an abs. fneg is
    // applied last, so we don't want to apply an earlier fneg.
    if ((Mods & SISrcMods::ABS) != 0)
      return;

    unsigned InnerMods;
    std::tie(Src, InnerMods) = selectVOP3ModsImpl(Src);

    if ((InnerMods & SISrcMods::NEG) != 0)
      Mods ^= SISrcMods::NEG;

    if ((InnerMods & SISrcMods::ABS) != 0)
      Mods |= SISrcMods::ABS;
  };

  FoldInnerMods();

  // op_sel/op_sel_hi decide the source type and source.
  // If the source's op_sel_hi is set, it indicates to do a conversion from
  // fp16. If the source's op_sel is set, it picks the high half of the
  // source register.
  Mods |= SISrcMods::OP_SEL_1;

  if (isExtractHiElt(*MRI, Src, Src)) {
    Mods |= SISrcMods::OP_SEL_0;
    FoldInnerMods();
  }

  Matched = true;
  return {Src, Mods};
}


/// Mad-mix operand selection that succeeds only when
/// selectVOP3PMadMixModsImpl reports a match; otherwise returns an empty
/// result. Renders the source register followed by its modifier bits.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3PMadMixModsExt(
    MachineOperand &Root) const {
  bool Matched;
  const std::pair<Register, unsigned> SrcAndMods =
      selectVOP3PMadMixModsImpl(Root, Matched);
  if (!Matched)
    return {};

  const auto RenderSrc = [Reg = SrcAndMods.first](MachineInstrBuilder &MIB) {
    MIB.addReg(Reg);
  };
  const auto RenderMods = [Bits = SrcAndMods.second](MachineInstrBuilder &MIB) {
    MIB.addImm(Bits); // src_mods
  };
  return {{RenderSrc, RenderMods}};
}


/// Mad-mix operand selection that always succeeds: renders whatever source
/// register and modifier bits selectVOP3PMadMixModsImpl produced, whether or
/// not it reported a match.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
  bool IgnoredMatched;
  const std::pair<Register, unsigned> SrcAndMods =
      selectVOP3PMadMixModsImpl(Root, IgnoredMatched);

  const auto RenderSrc = [Reg = SrcAndMods.first](MachineInstrBuilder &MIB) {
    MIB.addReg(Reg);
  };
  const auto RenderMods = [Bits = SrcAndMods.second](MachineInstrBuilder &MIB) {
    MIB.addImm(Bits); // src_mods
  };
  return {{RenderSrc, RenderMods}};
}


/// Select the s_barrier_signal_isfirst intrinsic \p I.
///
/// Emits an S_CMP_EQ_U32 0, 0, the S_BARRIER_SIGNAL_ISFIRST_IMM instruction
/// with the immediate from operand 2, and a copy of SCC into the result
/// register, then erases \p I. Returns whether the result register could be
/// constrained to SReg_32_XM0_XEXEC.
bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
    MachineInstr &I, Intrinsic::ID IntrID) const {
  MachineBasicBlock &BB = *I.getParent();
  const DebugLoc &Loc = I.getDebugLoc();
  const Register ResultReg = I.getOperand(0).getReg();
  const int64_t BarrierImm = I.getOperand(2).getImm();

  // Set SCC to true, in case the barrier instruction gets converted to a NOP.
  BuildMI(BB, &I, Loc, TII.get(AMDGPU::S_CMP_EQ_U32)).addImm(0).addImm(0);

  BuildMI(BB, &I, Loc, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
      .addImm(BarrierImm);

  BuildMI(BB, &I, Loc, TII.get(AMDGPU::COPY), ResultReg).addReg(AMDGPU::SCC);

  I.eraseFromParent();
  return RBI.constrainGenericRegister(
      ResultReg, AMDGPU::SReg_32_XM0_XEXECRegClass, *MRI);
}


/// Select the get-barrier-state intrinsic \p I.
///
/// If the barrier operand (operand 2) is a known constant, the _IMM form is
/// emitted with that constant; otherwise the operand is copied into M0 and
/// the _M0 form is used. The result is defined in operand 0's register.
///
/// \returns false, without modifying the basic block, if the result register
/// cannot be constrained to a register class; true after \p I has been
/// replaced otherwise.
bool AMDGPUInstructionSelector::selectSGetBarrierState(
    MachineInstr &I, Intrinsic::ID IntrID) const {
  MachineBasicBlock *MBB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  // Validate the destination first so that a failure does not leave a stray
  // M0 copy and an operand-less barrier instruction behind in the block.
  const Register DstReg = I.getOperand(0).getReg();
  const TargetRegisterClass *DstRC =
      TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  const MachineOperand &BarOp = I.getOperand(2);
  const std::optional<int64_t> BarValImm =
      getIConstantVRegSExtVal(BarOp.getReg(), *MRI);

  if (!BarValImm) {
    // Non-constant barrier: the _M0 form reads it from M0.
    auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
                       .addReg(BarOp.getReg());
    constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
  }

  const unsigned Opc = BarValImm ? AMDGPU::S_GET_BARRIER_STATE_IMM
                                 : AMDGPU::S_GET_BARRIER_STATE_M0;
  MachineInstrBuilder MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
  MIB.addDef(DstReg);
  if (BarValImm)
    MIB.addImm(*BarValImm);

  I.eraseFromParent();
  return true;
}


/// Map a named-barrier intrinsic to its machine opcode.
///
/// \p HasInlineConst selects the _IMM variant; otherwise the _M0 variant is
/// returned. Any intrinsic other than barrier_join, wakeup_barrier or
/// get_named_barrier_state is unreachable.
unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_s_barrier_join:
    return HasInlineConst ? AMDGPU::S_BARRIER_JOIN_IMM
                          : AMDGPU::S_BARRIER_JOIN_M0;
  case Intrinsic::amdgcn_s_wakeup_barrier:
    return HasInlineConst ? AMDGPU::S_WAKEUP_BARRIER_IMM
                          : AMDGPU::S_WAKEUP_BARRIER_M0;
  case Intrinsic::amdgcn_s_get_named_barrier_state:
    return HasInlineConst ? AMDGPU::S_GET_BARRIER_STATE_IMM
                          : AMDGPU::S_GET_BARRIER_STATE_M0;
  default:
    llvm_unreachable("not a named barrier op");
  }
}


/// Select a named-barrier init / signal-var intrinsic \p I.
///
/// For signal_var with a constant member count of 0 and a constant barrier
/// operand, S_BARRIER_SIGNAL_IMM is emitted directly. Otherwise M0 is set to
/// ((Cnt & 0x3F) << 16) | ((Bar >> 4) & 0x3F) and the _M0 form of
/// S_BARRIER_INIT (for barrier_init) or S_BARRIER_SIGNAL is emitted. \p I is
/// erased and true is returned in both cases.
bool AMDGPUInstructionSelector::selectNamedBarrierInit(
    MachineInstr &I, Intrinsic::ID IntrID) const {
  MachineBasicBlock *MBB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  const MachineOperand &BarOp = I.getOperand(1);
  const MachineOperand &CntOp = I.getOperand(2);

  // A member count of 0 means "keep existing member count". That plus a known
  // constant value for the barrier ID lets us use the immarg form.
  if (IntrID == Intrinsic::amdgcn_s_barrier_signal_var) {
    const std::optional<int64_t> CntImm =
        getIConstantVRegSExtVal(CntOp.getReg(), *MRI);
    if (CntImm && *CntImm == 0) {
      if (const std::optional<int64_t> BarValImm =
              getIConstantVRegSExtVal(BarOp.getReg(), *MRI)) {
        const int64_t BarID = ((*BarValImm) >> 4) & 0x3F;
        BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
            .addImm(BarID);
        I.eraseFromParent();
        return true;
      }
    }
  }

  const auto NewSReg32 = [this]() {
    return MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  };

  // BarID = (BarOp >> 4) & 0x3F
  const Register BarShifted = NewSReg32();
  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), BarShifted)
      .add(BarOp)
      .addImm(4u)
      .setOperandDead(3); // Dead scc

  const Register BarID = NewSReg32();
  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), BarID)
      .addReg(BarShifted)
      .addImm(0x3F)
      .setOperandDead(3); // Dead scc

  // MO = ((CntOp & 0x3F) << shAmt) | BarID
  const Register CntMasked = NewSReg32();
  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), CntMasked)
      .add(CntOp)
      .addImm(0x3F)
      .setOperandDead(3); // Dead scc

  const Register CntShifted = NewSReg32();
  constexpr unsigned ShAmt = 16;
  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), CntShifted)
      .addReg(CntMasked)
      .addImm(ShAmt)
      .setOperandDead(3); // Dead scc

  const Register M0Val = NewSReg32();
  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), M0Val)
      .addReg(BarID)
      .addReg(CntShifted)
      .setOperandDead(3); // Dead scc

  auto CopyToM0 =
      BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(M0Val);
  constrainSelectedInstRegOperands(*CopyToM0, TII, TRI, RBI);

  const bool IsInit = IntrID == Intrinsic::amdgcn_s_barrier_init;
  const unsigned Opc =
      IsInit ? AMDGPU::S_BARRIER_INIT_M0 : AMDGPU::S_BARRIER_SIGNAL_M0;
  BuildMI(*MBB, &I, DL, TII.get(Opc));

  I.eraseFromParent();
  return true;
}


/// Select a named-barrier intrinsic \p I (the opcodes handled are those
/// accepted by getNamedBarrierOp).
///
/// The barrier operand is operand 2 for get_named_barrier_state and operand 1
/// otherwise. A constant barrier operand yields the _IMM form with
/// (Bar >> 4) & 0x3F as immediate; a non-constant one has the same value
/// computed into M0 and the _M0 form is used. get_named_barrier_state also
/// defines operand 0's register.
///
/// \returns false, without modifying the basic block, if the result register
/// of get_named_barrier_state cannot be constrained to a register class; true
/// after \p I has been replaced otherwise.
bool AMDGPUInstructionSelector::selectNamedBarrierInst(
    MachineInstr &I, Intrinsic::ID IntrID) const {
  MachineBasicBlock *MBB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  const bool HasResult =
      IntrID == Intrinsic::amdgcn_s_get_named_barrier_state;

  // Validate the destination before emitting anything so that a failure does
  // not leave partially built instructions behind in the block.
  Register DstReg;
  if (HasResult) {
    DstReg = I.getOperand(0).getReg();
    const TargetRegisterClass *DstRC =
        TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
    if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
      return false;
  }

  const MachineOperand &BarOp = HasResult ? I.getOperand(2) : I.getOperand(1);
  const std::optional<int64_t> BarValImm =
      getIConstantVRegSExtVal(BarOp.getReg(), *MRI);

  if (!BarValImm) {
    // BarID = (BarOp >> 4) & 0x3F
    Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
        .addReg(BarOp.getReg())
        .addImm(4u)
        .setOperandDead(3); // Dead scc

    Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
        .addReg(TmpReg0)
        .addImm(0x3F)
        .setOperandDead(3); // Dead scc

    auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
                       .addReg(TmpReg1);
    constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
  }

  const unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID);
  MachineInstrBuilder MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));

  if (HasResult)
    MIB.addDef(DstReg);

  if (BarValImm) {
    const int64_t BarId = ((*BarValImm) >> 4) & 0x3F;
    MIB.addImm(BarId);
  }

  I.eraseFromParent();
  return true;
}


/// Add the sign-extended value of the G_CONSTANT \p MI as an immediate.
/// \p OpIdx must be -1.
void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  const int64_t Value = MI.getOperand(1).getCImm()->getSExtValue();
  MIB.addImm(Value);
}


/// Add the negated sign-extended value of the G_CONSTANT \p MI as an
/// immediate. \p OpIdx must be -1.
void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  const int64_t Value = MI.getOperand(1).getCImm()->getSExtValue();
  MIB.addImm(-Value);
}


/// Add the bit pattern of the G_FCONSTANT \p MI, zero-extended, as an
/// immediate. \p OpIdx must be -1.
void AMDGPUInstructionSelector::renderBitcastFPImm(MachineInstrBuilder &MIB,
                                                   const MachineInstr &MI,
                                                   int OpIdx) const {
  const MachineOperand &FPOp = MI.getOperand(1);
  assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1);
  const APInt Bits = FPOp.getFPImm()->getValueAPF().bitcastToAPInt();
  MIB.addImm(Bits.getZExtValue());
}


/// Add the number of trailing one bits of the G_CONSTANT \p MI as an
/// immediate. \p OpIdx must be -1.
void AMDGPUInstructionSelector::renderCountTrailingOnesImm(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  const APInt &Value = MI.getOperand(1).getCImm()->getValue();
  MIB.addImm(Value.countTrailingOnes());
}


/// This only really exists to satisfy DAG type checking machinery, so is a
/// no-op here: the operand's value is forwarded unchanged. A register operand
/// defined by an integer constant contributes that constant; anything else is
/// read as an immediate operand.
void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  const MachineOperand &Op = MI.getOperand(OpIdx);
  int64_t Value;
  const bool IsConstReg =
      Op.isReg() && mi_match(Op.getReg(), *MRI, m_ICst(Value));
  if (!IsConstReg)
    Value = Op.getImm();
  MIB.addImm(Value);
}


/// Add 1 if the immediate operand at \p OpIdx is non-zero, otherwise 0.
void AMDGPUInstructionSelector::renderZextBoolTImm(MachineInstrBuilder &MIB,
                                                   const MachineInstr &MI,
                                                   int OpIdx) const {
  const bool IsSet = MI.getOperand(OpIdx).getImm() != 0;
  MIB.addImm(IsSet ? 1 : 0);
}


/// Add OP_SEL_0 if the immediate operand at \p OpIdx is non-zero, else 0.
void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  int64_t Mods = 0;
  if (MI.getOperand(OpIdx).getImm())
    Mods = SISrcMods::OP_SEL_0;
  MIB.addImm(Mods);
}


/// Add OP_SEL_0 if bit 0 of the immediate operand at \p OpIdx is set, else 0.
void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_0(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  int64_t Mods = 0;
  if (MI.getOperand(OpIdx).getImm() & 0x1)
    Mods = SISrcMods::OP_SEL_0;
  MIB.addImm(Mods);
}


/// Add DST_OP_SEL, additionally OR-ing in OP_SEL_0 if bit 0 of the immediate
/// operand at \p OpIdx is set.
void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_1(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  int64_t Mods = SISrcMods::DST_OP_SEL;
  if (MI.getOperand(OpIdx).getImm() & 0x1)
    Mods |= SISrcMods::OP_SEL_0;
  MIB.addImm(Mods);
}


/// Add OP_SEL_0 if bit 1 of the immediate operand at \p OpIdx is set, else 0.
void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_0(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  int64_t Mods = 0;
  if (MI.getOperand(OpIdx).getImm() & 0x2)
    Mods = SISrcMods::OP_SEL_0;
  MIB.addImm(Mods);
}


/// Add OP_SEL_0 if bit 1 of the immediate operand at \p OpIdx is set, else 0.
void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_1(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  const bool Bit1Set = (MI.getOperand(OpIdx).getImm() & 0x2) != 0;
  if (Bit1Set)
    MIB.addImm(static_cast<int64_t>(SISrcMods::OP_SEL_0));
  else
    MIB.addImm(0);
}


/// Add DST_OP_SEL if the immediate operand at \p OpIdx is non-zero, else 0.
void AMDGPUInstructionSelector::renderDstSelToOpSelXForm(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  int64_t Mods = 0;
  if (MI.getOperand(OpIdx).getImm())
    Mods = SISrcMods::DST_OP_SEL;
  MIB.addImm(Mods);
}


/// Add OP_SEL_0 if the immediate operand at \p OpIdx is non-zero, else 0.
void AMDGPUInstructionSelector::renderSrcSelToOpSelXForm(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  int64_t Mods = 0;
  if (MI.getOperand(OpIdx).getImm())
    Mods = SISrcMods::OP_SEL_0;
  MIB.addImm(Mods);
}


/// Add OP_SEL_0 if bit 0 of the immediate operand at \p OpIdx is set, else 0.
void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_2_0(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  const bool Bit0Set = (MI.getOperand(OpIdx).getImm() & 0x1) != 0;
  if (Bit0Set)
    MIB.addImm(static_cast<int64_t>(SISrcMods::OP_SEL_0));
  else
    MIB.addImm(0);
}


/// Add DST_OP_SEL if bit 1 of the immediate operand at \p OpIdx is set,
/// else 0.
void AMDGPUInstructionSelector::renderDstSelToOpSel3XFormXForm(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  int64_t Mods = 0;
  if (MI.getOperand(OpIdx).getImm() & 0x2)
    Mods = SISrcMods::DST_OP_SEL;
  MIB.addImm(Mods);
}


/// Add the immediate operand at \p OpIdx masked with the cache-policy bit
/// mask: CPol::ALL on GFX12+, CPol::ALL_pregfx12 otherwise.
void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
                                                  const MachineInstr &MI,
                                                  int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  const int64_t Mask = AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
                                                : AMDGPU::CPol::ALL_pregfx12;
  MIB.addImm(MI.getOperand(OpIdx).getImm() & Mask);
}


/// Add 1 if the swizzle bit is set in the immediate operand at \p OpIdx,
/// else 0. The bit tested is CPol::SWZ on GFX12+ and CPol::SWZ_pregfx12
/// otherwise.
void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  const int64_t SwzBit = AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::SWZ
                                                  : AMDGPU::CPol::SWZ_pregfx12;
  const bool IsSwizzled = (MI.getOperand(OpIdx).getImm() & SwzBit) != 0;
  MIB.addImm(IsSwizzled);
}


/// Add the cache-policy bits of the immediate operand at \p OpIdx with
/// CPol::GLC forced on. The mask is CPol::ALL on GFX12+ and
/// CPol::ALL_pregfx12 otherwise.
void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  const int64_t Mask = AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
                                                : AMDGPU::CPol::ALL_pregfx12;
  const uint32_t Masked =
      static_cast<uint32_t>(MI.getOperand(OpIdx).getImm() & Mask);
  MIB.addImm(Masked | AMDGPU::CPol::GLC);
}


/// Add the frame index held in operand 1 of \p MI. \p OpIdx is not used.
void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  const int FrameIdx = MI.getOperand(1).getIndex();
  MIB.addFrameIndex(FrameIdx);
}


/// Add the exact log2 of the absolute value of the FP immediate in operand 1
/// of \p MI. Asserts that the lookup did not return INT_MIN. \p OpIdx is not
/// used.
void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
                                                       const MachineInstr &MI,
                                                       int OpIdx) const {
  const APFloat &Value = MI.getOperand(1).getFPImm()->getValueAPF();
  const int Exponent = Value.getExactLog2Abs();
  assert(Exponent != INT_MIN);
  MIB.addImm(Exponent);
}


/// Translate the rounding-mode immediate at \p OpIdx to the hardware
/// FP_ROUND encoding by computing (mode + 3) % 4.
void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  // "round.towardzero" -> TowardZero 0        -> FP_ROUND_ROUND_TO_ZERO 3
  // "round.tonearest"  -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0
  // "round.upward"     -> TowardPositive 2    -> FP_ROUND_ROUND_TO_INF 1
  // "round.downward"   -> TowardNegative 3    -> FP_ROUND_ROUND_TO_NEGINF 2
  const int64_t IRMode = MI.getOperand(OpIdx).getImm();
  MIB.addImm((IRMode + 3) % 4);
}


/// Add OP_SEL_1, with the NEG bit toggled when the immediate operand at
/// \p OpIdx is non-zero.
void AMDGPUInstructionSelector::renderVOP3PModsNeg(MachineInstrBuilder &MIB,
                                                   const MachineInstr &MI,
                                                   int OpIdx) const {
  const bool Negate = MI.getOperand(OpIdx).getImm() != 0;
  const unsigned Base = SISrcMods::OP_SEL_1;
  const unsigned Mods = Negate ? (Base ^ SISrcMods::NEG) : Base;
  MIB.addImm(static_cast<int64_t>(Mods));
}


/// Add OP_SEL_1, with both NEG and NEG_HI toggled when the immediate operand
/// at \p OpIdx is non-zero.
void AMDGPUInstructionSelector::renderVOP3PModsNegs(MachineInstrBuilder &MIB,
                                                    const MachineInstr &MI,
                                                    int OpIdx) const {
  const bool Negate = MI.getOperand(OpIdx).getImm() != 0;
  const unsigned Base = SISrcMods::OP_SEL_1;
  const unsigned Mods =
      Negate ? (Base ^ (SISrcMods::NEG | SISrcMods::NEG_HI)) : Base;
  MIB.addImm(static_cast<int64_t>(Mods));
}


/// Add OP_SEL_1 combined with the modifiers selected by the immediate operand
/// at \p OpIdx: 1 toggles NEG, 2 toggles ABS, 3 toggles both; any other value
/// adds nothing.
void AMDGPUInstructionSelector::renderVOP3PModsNegAbs(MachineInstrBuilder &MIB,
                                                      const MachineInstr &MI,
                                                      int OpIdx) const {
  const unsigned Selector = MI.getOperand(OpIdx).getImm();
  unsigned Mods = SISrcMods::OP_SEL_1; // default: none
  switch (Selector) {
  case 1: // neg
    Mods ^= SISrcMods::NEG;
    break;
  case 2: // abs
    Mods ^= SISrcMods::ABS;
    break;
  case 3: // neg and abs
    Mods ^= (SISrcMods::NEG | SISrcMods::ABS);
    break;
  default:
    break;
  }
  MIB.addImm(static_cast<int64_t>(Mods));
}


/// Convert the immediate in operand 2 of \p MI to a cache-policy scope value.
///
/// The scope is (SCOPE_MASK - (imm & SCOPE_MASK)) << SCOPE_SHIFT. When the
/// subtarget does not report safe CU prefetch, the result is raised to at
/// least SCOPE_SE. \p OpIdx is not used.
void AMDGPUInstructionSelector::renderPrefetchLoc(MachineInstrBuilder &MIB,
                                                  const MachineInstr &MI,
                                                  int OpIdx) const {
  const uint32_t Raw = MI.getOperand(2).getImm();
  uint32_t Scope =
      (AMDGPU::CPol::SCOPE_MASK - (Raw & AMDGPU::CPol::SCOPE_MASK))
      << AMDGPU::CPol::SCOPE_SHIFT;

  // CU scope is unsafe
  const uint32_t MinSafeScope = (uint32_t)AMDGPU::CPol::SCOPE_SE;
  if (!Subtarget->hasSafeCUPrefetch() && Scope < MinSafeScope)
    Scope = MinSafeScope;

  MIB.addImm(Scope);
}


/// Convert from 2-bit value to enum values used for op_sel* source modifiers:
/// bit 0 of the immediate at \p OpIdx maps to OP_SEL_0 and bit 1 to OP_SEL_1.
void AMDGPUInstructionSelector::renderScaledMAIIntrinsicOperand(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  const unsigned Bits = MI.getOperand(OpIdx).getImm();
  unsigned OpSel = (Bits & 0x1) ? unsigned(SISrcMods::OP_SEL_0) : 0u;
  if (Bits & 0x2)
    OpSel |= SISrcMods::OP_SEL_1;
  MIB.addImm(OpSel);
}


/// Return whether the integer \p Imm is an inline constant according to the
/// target instruction info.
bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const {
  const bool IsInline = TII.isInlineConstant(Imm);
  return IsInline;
}


/// Return whether the floating-point \p Imm is an inline constant according
/// to the target instruction info.
bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
  const bool IsInline = TII.isInlineConstant(Imm);
  return IsInline;
}

DefMI
MachineInstrBuilder MachineInstrBuilder & DefMI
Definition AArch64ExpandPseudoInsts.cpp:128

getIntrinsicID
static unsigned getIntrinsicID(const SDNode *N)
Definition AArch64ISelLowering.cpp:8725

GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_PREDICATES_INIT

GET_GLOBALISEL_TEMPORARIES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT

assert
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")

AMDGPUBaseInfo.h

AMDGPUGlobalISelUtils.h

AMDGPUInstrInfo.h
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.

TypeClass
TypeClass
Definition AMDGPUInstructionSelector.cpp:4874

TypeClass::SCALAR
@ SCALAR
Definition AMDGPUInstructionSelector.cpp:4874

TypeClass::VECTOR_OF_TWO
@ VECTOR_OF_TWO
Definition AMDGPUInstructionSelector.cpp:4874

TypeClass::NONE_OF_LISTED
@ NONE_OF_LISTED
Definition AMDGPUInstructionSelector.cpp:4874

getLegalRegBank
static Register getLegalRegBank(Register NewReg, Register RootReg, const AMDGPURegisterBankInfo &RBI, MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, const SIInstrInfo &TII)
Definition AMDGPUInstructionSelector.cpp:5324

isShlHalf
static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI)
Test if the MI is shift left with half bits, such as reg0:2n =G_SHL reg1:2n, CONST(n)
Definition AMDGPUInstructionSelector.cpp:4850

isNoUnsignedWrap
static bool isNoUnsignedWrap(MachineInstr *Addr)
Definition AMDGPUInstructionSelector.cpp:6466

buildOffsetSrc
static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI, const SIInstrInfo &TII, Register BasePtr)
Definition AMDGPUInstructionSelector.cpp:6803

getNamedBarrierOp
unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID)
Definition AMDGPUInstructionSelector.cpp:7215

checkRB
static bool checkRB(Register Reg, unsigned int RBNo, const AMDGPURegisterBankInfo &RBI, const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI)
Definition AMDGPUInstructionSelector.cpp:5309

updateMods
static unsigned updateMods(SrcStatus HiStat, SrcStatus LoStat, unsigned Mods)
Definition AMDGPUInstructionSelector.cpp:5212

isTruncHalf
static bool isTruncHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI)
Test if the MI is truncating to half, such as reg0:n = G_TRUNC reg1:2n
Definition AMDGPUInstructionSelector.cpp:4821

getWaveAddress
static Register getWaveAddress(const MachineInstr *Def)
Definition AMDGPUInstructionSelector.cpp:71

isExtractHiElt
static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In, Register &Out)
Definition AMDGPUInstructionSelector.cpp:2918

shouldUseAndMask
static bool shouldUseAndMask(unsigned Size, unsigned &Mask)
Definition AMDGPUInstructionSelector.cpp:2735

BitOp3_Op
static std::pair< unsigned, uint8_t > BitOp3_Op(Register R, SmallVectorImpl< Register > &Src, const MachineRegisterInfo &MRI)
Definition AMDGPUInstructionSelector.cpp:4232

isVectorOfTwoOrScalar
static TypeClass isVectorOfTwoOrScalar(Register Reg, const MachineRegisterInfo &MRI)
Definition AMDGPUInstructionSelector.cpp:4876

isLaneMaskFromSameBlock
static bool isLaneMaskFromSameBlock(Register Reg, MachineRegisterInfo &MRI, MachineBasicBlock *MBB)
Definition AMDGPUInstructionSelector.cpp:1685

parseTexFail
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
Definition AMDGPUInstructionSelector.cpp:2141

addZeroImm
static void addZeroImm(MachineInstrBuilder &MIB)
Definition AMDGPUInstructionSelector.cpp:6745

gwsIntrinToOpcode
static unsigned gwsIntrinToOpcode(unsigned IntrID)
Definition AMDGPUInstructionSelector.cpp:1958

isConstant
static bool isConstant(const MachineInstr &MI)
Definition AMDGPUInstructionSelector.cpp:3088

isSameBitWidth
static bool isSameBitWidth(Register Reg1, Register Reg2, const MachineRegisterInfo &MRI)
Definition AMDGPUInstructionSelector.cpp:5205

SrcStatus
SrcStatus
Definition AMDGPUInstructionSelector.cpp:4802

SrcStatus::HALF_START
@ HALF_START
Definition AMDGPUInstructionSelector.cpp:4817

SrcStatus::IS_UPPER_HALF_NEG
@ IS_UPPER_HALF_NEG
Definition AMDGPUInstructionSelector.cpp:4806

SrcStatus::NEG_START
@ NEG_START
Definition AMDGPUInstructionSelector.cpp:4815

SrcStatus::NEG_END
@ NEG_END
Definition AMDGPUInstructionSelector.cpp:4816

SrcStatus::IS_UPPER_HALF
@ IS_UPPER_HALF
Definition AMDGPUInstructionSelector.cpp:4804

SrcStatus::IS_BOTH_NEG
@ IS_BOTH_NEG
Definition AMDGPUInstructionSelector.cpp:4813

SrcStatus::IS_HI_NEG
@ IS_HI_NEG
Definition AMDGPUInstructionSelector.cpp:4809

SrcStatus::HALF_END
@ HALF_END
Definition AMDGPUInstructionSelector.cpp:4818

SrcStatus::INVALID
@ INVALID
Definition AMDGPUInstructionSelector.cpp:4814

SrcStatus::IS_SAME
@ IS_SAME
Definition AMDGPUInstructionSelector.cpp:4803

SrcStatus::IS_LOWER_HALF
@ IS_LOWER_HALF
Definition AMDGPUInstructionSelector.cpp:4805

SrcStatus::IS_LO_NEG
@ IS_LO_NEG
Definition AMDGPUInstructionSelector.cpp:4812

SrcStatus::IS_LOWER_HALF_NEG
@ IS_LOWER_HALF_NEG
Definition AMDGPUInstructionSelector.cpp:4808

buildRegSequence
static Register buildRegSequence(SmallVectorImpl< Register > &Elts, MachineInstr *InsertPt, MachineRegisterInfo &MRI)
Definition AMDGPUInstructionSelector.cpp:5428

buildRSRC
static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI, uint32_t FormatLo, uint32_t FormatHi, Register BasePtr)
Return a resource descriptor for use with an arbitrary 64-bit pointer.
Definition AMDGPUInstructionSelector.cpp:6751

isAsyncLDSDMA
static bool isAsyncLDSDMA(Intrinsic::ID Intr)
Definition AMDGPUInstructionSelector.cpp:3565

diagnoseUnsupportedIntrinsic
static void diagnoseUnsupportedIntrinsic(const MachineInstr &I)
Definition AMDGPUInstructionSelector.cpp:77

computeIndirectRegIndex
static std::pair< Register, unsigned > computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, Register IdxReg, unsigned EltSize, GISelValueTracking &ValueTracking)
Return the register to use for the index value, and the subregister to use for the indirectly accesse...
Definition AMDGPUInstructionSelector.cpp:3391

getLogicalBitOpcode
static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64)
Definition AMDGPUInstructionSelector.cpp:397

getLastSameOrNeg
static std::pair< Register, SrcStatus > getLastSameOrNeg(Register Reg, const MachineRegisterInfo &MRI, SearchOptions SO, int MaxDepth=3)
Definition AMDGPUInstructionSelector.cpp:5185

stripCopy
static Register stripCopy(Register Reg, MachineRegisterInfo &MRI)
Definition AMDGPUInstructionSelector.cpp:2907

calcNextStatus
static std::optional< std::pair< Register, SrcStatus > > calcNextStatus(std::pair< Register, SrcStatus > Curr, const MachineRegisterInfo &MRI)
Definition AMDGPUInstructionSelector.cpp:5042

stripBitCast
static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI)
Definition AMDGPUInstructionSelector.cpp:2911

getConstantZext32Val
static std::optional< uint64_t > getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI)
Get an immediate that must be 32-bits, and treated as zero extended.
Definition AMDGPUInstructionSelector.cpp:7024

isValidToPack
static bool isValidToPack(SrcStatus HiStat, SrcStatus LoStat, Register NewReg, Register RootReg, const SIInstrInfo &TII, const MachineRegisterInfo &MRI)
Definition AMDGPUInstructionSelector.cpp:5237

getV_CMPOpcode
static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size, const GCNSubtarget &ST)
Definition AMDGPUInstructionSelector.cpp:1325

getSrcStats
static SmallVector< std::pair< Register, SrcStatus > > getSrcStats(Register Reg, const MachineRegisterInfo &MRI, SearchOptions SO, int MaxDepth=3)
Definition AMDGPUInstructionSelector.cpp:5168

isUnmergeHalf
static bool isUnmergeHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI)
Test whether the MI has the form reg0:n, reg1:n = G_UNMERGE_VALUES reg2:2n
Definition AMDGPUInstructionSelector.cpp:4866

getNegStatus
static SrcStatus getNegStatus(Register Reg, SrcStatus S, const MachineRegisterInfo &MRI)
Definition AMDGPUInstructionSelector.cpp:4886

isVCmpResult
static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI)
Definition AMDGPUInstructionSelector.cpp:3184

buildAddr64RSrc
static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI, const SIInstrInfo &TII, Register BasePtr)
Definition AMDGPUInstructionSelector.cpp:6794

isLshrHalf
static bool isLshrHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI)
Test if the MI is a logical shift right by half the bit width, such as reg0:2n = G_LSHR reg1:2n,...
Definition AMDGPUInstructionSelector.cpp:4833

selectWMMAModsNegAbs
static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods, SmallVectorImpl< Register > &Elts, Register &Src, MachineInstr *InsertPt, MachineRegisterInfo &MRI)
Definition AMDGPUInstructionSelector.cpp:5456

AMDGPUInstructionSelector.h
This file declares the targeting of the InstructionSelector class for AMDGPU.

S1
constexpr LLT S1
Definition AMDGPULegalizerInfo.cpp:296

S32
constexpr LLT S32
Definition AMDGPULegalizerInfo.cpp:299

Select
AMDGPU Register Bank Select
Definition AMDGPURegBankSelect.cpp:68

AMDGPURegisterBankInfo.h
This file declares the targeting of the RegisterBankInfo class for AMDGPU.

AMDGPUTargetMachine.h
The AMDGPU TargetMachine interface definition for hw codegen targets.

AMDGPU.h

MBB
MachineBasicBlock & MBB
Definition ARMSLSHardening.cpp:71

DL
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Definition ARMSLSHardening.cpp:73

E
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")

B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")

isAllZeros
static bool isAllZeros(StringRef Arr)
Return true if the array is empty or all zeros.
Definition Constants.cpp:3036

Metadata
dxil translate DXIL Translate Metadata
Definition DXILTranslateMetadata.cpp:647

DiagnosticInfo.h

GIMatchTableExecutorImpl.h

GISelValueTracking.h
Provides analysis for querying information about KnownBits during GISel passes.

DEBUG_TYPE
#define DEBUG_TYPE
Definition GenericCycleImpl.h:31

GenericMachineInstrs.h
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...

TII
const HexagonInstrInfo * TII
Definition HexagonCopyToCombine.cpp:118

MI
IRTranslator LLVM IR MI
Definition IRTranslator.cpp:110

InlinePriorityMode::Size
@ Size
Definition InlineOrder.cpp:25

F
#define F(x, y, z)
Definition MD5.cpp:54

I
#define I(x, y, z)
Definition MD5.cpp:57

MIPatternMatch.h
Contains matchers for matching SSA Machine Instructions.

Module
Machine Check Debug Module
Definition MachineCheckDebugify.cpp:124

MachineFrameInfo.h

MachineIRBuilder.h
This file declares the MachineIRBuilder class.

Reg
Register Reg
Definition MachineSink.cpp:2126

TRI
Register const TargetRegisterInfo * TRI
Definition MachineSink.cpp:2127

Register
Promote Memory to Register
Definition Mem2Reg.cpp:110

OpIdx
MachineInstr unsigned OpIdx
Definition NVPTXPrologEpilogPass.cpp:56

P
#define P(N)

Swizzle
static std::vector< std::pair< int, unsigned > > Swizzle(std::vector< std::pair< int, unsigned > > Src, R600InstrInfo::BankSwizzle Swz)
Definition R600InstrInfo.cpp:340

Opc
auto Opc
Definition RISCVRedundantCopyElimination.cpp:77

SIMachineFunctionInfo.h

LLVM_DEBUG
#define LLVM_DEBUG(...)
Definition Debug.h:119

RHS
Value * RHS
Definition X86PartialReduction.cpp:81

LHS
Value * LHS
Definition X86PartialReduction.cpp:80

SearchOptions
This is used to control valid status that current MI supports.
Definition AMDGPUInstructionSelector.cpp:5133

SearchOptions::checkOptions
bool checkOptions(SrcStatus Stat) const
Definition AMDGPUInstructionSelector.cpp:5154

SearchOptions::SearchOptions
SearchOptions(Register Reg, const MachineRegisterInfo &MRI)
Definition AMDGPUInstructionSelector.cpp:5140

llvm::AMDGPUInstructionSelector::AMDGPUInstructionSelector
AMDGPUInstructionSelector(const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI, const AMDGPUTargetMachine &TM)
Definition AMDGPUInstructionSelector.cpp:43

llvm::AMDGPUInstructionSelector::getName
static const char * getName()

llvm::AMDGPUInstructionSelector::select
bool select(MachineInstr &I) override
Select the (possibly generic) instruction I to only use target-specific opcodes.
Definition AMDGPUInstructionSelector.cpp:4461

llvm::AMDGPUInstructionSelector::setupMF
void setupMF(MachineFunction &MF, GISelValueTracking *VT, CodeGenCoverage *CoverageInfo, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) override
Setup per-MF executor state.
Definition AMDGPUInstructionSelector.cpp:59

llvm::AMDGPUMachineFunctionInfo::setInitWholeWave
void setInitWholeWave()
Definition AMDGPUMachineFunctionInfo.h:103

llvm::AMDGPUMachineFunctionInfo::getLDSSize
uint32_t getLDSSize() const
Definition AMDGPUMachineFunctionInfo.h:79

llvm::AMDGPURegisterBankInfo
Definition AMDGPURegisterBankInfo.h:42

llvm::AMDGPUSubtarget::GFX10
@ GFX10
Definition AMDGPUSubtarget.h:42

llvm::AMDGPUSubtarget::SEA_ISLANDS
@ SEA_ISLANDS
Definition AMDGPUSubtarget.h:39

llvm::AMDGPUSubtarget::VOLCANIC_ISLANDS
@ VOLCANIC_ISLANDS
Definition AMDGPUSubtarget.h:40

llvm::AMDGPUSubtarget::GFX11
@ GFX11
Definition AMDGPUSubtarget.h:43

llvm::AMDGPUTargetMachine
Definition AMDGPUTargetMachine.h:34

llvm::APFloat
Definition APFloat.h:1029

llvm::APFloat::getExactLog2Abs
LLVM_READONLY int getExactLog2Abs() const
Definition APFloat.h:1594

llvm::APInt
Class for arbitrary precision integers.
Definition APInt.h:78

llvm::APInt::getLowBitsSet
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307

llvm::APInt::getHighBitsSet
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297

llvm::APInt::getSExtValue
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1585

llvm::ArrayRef
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40

llvm::ArrayRef::size
size_t size() const
Get the array size.
Definition ArrayRef.h:141

llvm::BlockFrequencyInfo
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Definition BlockFrequencyInfo.h:38

llvm::CmpInst::Predicate
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:740

llvm::CmpInst::FCMP_OEQ
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:743

llvm::CmpInst::FCMP_TRUE
@ FCMP_TRUE
1 1 1 1 Always true (always folded)
Definition InstrTypes.h:757

llvm::CmpInst::ICMP_SLT
@ ICMP_SLT
signed less than
Definition InstrTypes.h:769

llvm::CmpInst::ICMP_SLE
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:770

llvm::CmpInst::FCMP_OLT
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:746

llvm::CmpInst::FCMP_ULE
@ FCMP_ULE
1 1 0 1 True if unordered, less than, or equal
Definition InstrTypes.h:755

llvm::CmpInst::FCMP_OGT
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:744

llvm::CmpInst::FCMP_OGE
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition InstrTypes.h:745

llvm::CmpInst::ICMP_UGE
@ ICMP_UGE
unsigned greater or equal
Definition InstrTypes.h:764

llvm::CmpInst::ICMP_UGT
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:763

llvm::CmpInst::ICMP_SGT
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:767

llvm::CmpInst::FCMP_ULT
@ FCMP_ULT
1 1 0 0 True if unordered or less than
Definition InstrTypes.h:754

llvm::CmpInst::FCMP_ONE
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:748

llvm::CmpInst::FCMP_UEQ
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition InstrTypes.h:751

llvm::CmpInst::ICMP_ULT
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:765

llvm::CmpInst::FCMP_UGT
@ FCMP_UGT
1 0 1 0 True if unordered or greater than
Definition InstrTypes.h:752

llvm::CmpInst::FCMP_OLE
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:747

llvm::CmpInst::FCMP_ORD
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:749

llvm::CmpInst::ICMP_EQ
@ ICMP_EQ
equal
Definition InstrTypes.h:761

llvm::CmpInst::ICMP_NE
@ ICMP_NE
not equal
Definition InstrTypes.h:762

llvm::CmpInst::ICMP_SGE
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:768

llvm::CmpInst::FCMP_UNE
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition InstrTypes.h:756

llvm::CmpInst::ICMP_ULE
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:766

llvm::CmpInst::FCMP_UGE
@ FCMP_UGE
1 0 1 1 True if unordered, greater than, or equal
Definition InstrTypes.h:753

llvm::CmpInst::FCMP_FALSE
@ FCMP_FALSE
0 0 0 0 Always false (always folded)
Definition InstrTypes.h:742

llvm::CmpInst::FCMP_UNO
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:750

llvm::CmpInst::isFPPredicate
bool isFPPredicate() const
Definition InstrTypes.h:845

llvm::CmpInst::isIntPredicate
bool isIntPredicate() const
Definition InstrTypes.h:846

llvm::CodeGenCoverage
Definition CodeGenCoverage.h:19

llvm::ConstantInt::getSExtValue
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition Constants.h:174

llvm::ConstantInt::getZExtValue
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:168

llvm::DebugLoc::get
DILocation * get() const
Get the underlying DILocation.
Definition DebugLoc.h:218

llvm::DiagnosticInfoUnsupported
Diagnostic information for unsupported feature in backend.
Definition DiagnosticInfo.h:1103

llvm::Function
Definition Function.h:65

llvm::Function::getContext
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358

llvm::GCNSubtarget
Definition GCNSubtarget.h:45

llvm::GCNSubtarget::checkSubtargetFeatures
void checkSubtargetFeatures(const Function &F) const
Diagnose inconsistent subtarget features before attempting to codegen function F.
Definition GCNSubtarget.cpp:170

llvm::GIMatchTableExecutor::BFI
BlockFrequencyInfo * BFI
Definition GIMatchTableExecutor.h:612

llvm::GIMatchTableExecutor::MF
MachineFunction * MF
Definition GIMatchTableExecutor.h:610

llvm::GIMatchTableExecutor::ComplexRendererFns
std::optional< SmallVector< std::function< void(MachineInstrBuilder &)>, 4 > > ComplexRendererFns
Definition GIMatchTableExecutor.h:633

llvm::GIMatchTableExecutor::VT
GISelValueTracking * VT
Definition GIMatchTableExecutor.h:609

llvm::GIMatchTableExecutor::PSI
ProfileSummaryInfo * PSI
Definition GIMatchTableExecutor.h:611

llvm::GIMatchTableExecutor::setupMF
virtual void setupMF(MachineFunction &mf, GISelValueTracking *vt, CodeGenCoverage *covinfo=nullptr, ProfileSummaryInfo *psi=nullptr, BlockFrequencyInfo *bfi=nullptr)
Setup per-MF executor state.
Definition GIMatchTableExecutor.h:619

llvm::GIMatchTableExecutor::CoverageInfo
CodeGenCoverage * CoverageInfo
Definition GIMatchTableExecutor.h:608

llvm::GISelValueTracking
Definition GISelValueTracking.h:34

llvm::LLT
Definition LowLevelType.h:45

llvm::LLT::isScalar
constexpr bool isScalar() const
Definition LowLevelType.h:282

llvm::LLT::scalar
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition LowLevelType.h:88

llvm::LLT::isValid
constexpr bool isValid() const
Definition LowLevelType.h:262

llvm::LLT::getNumElements
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
Definition LowLevelType.h:350

llvm::LLT::isVector
constexpr bool isVector() const
Definition LowLevelType.h:289

llvm::LLT::getSizeInBits
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
Definition LowLevelType.h:387

llvm::LLT::getAddressSpace
constexpr unsigned getAddressSpace() const
Definition LowLevelType.h:503

llvm::LLT::fixed_vector
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
Definition LowLevelType.h:203

llvm::LLT::getElementType
LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
Definition LowLevelType.h:510

llvm::LLVMContext::diagnose
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
Definition LLVMContext.cpp:249

llvm::LocationSize::hasValue
bool hasValue() const
Definition MemoryLocation.h:153

llvm::LocationSize::getValue
TypeSize getValue() const
Definition MemoryLocation.h:158

llvm::MCInstrDesc::getOperandConstraint
int getOperandConstraint(unsigned OpNum, MCOI::OperandConstraint Constraint) const
Returns the value of the specified operand constraint if it is present.
Definition MCInstrDesc.h:220

llvm::MachineBasicBlock
Definition MachineBasicBlock.h:119

llvm::MachineBasicBlock::getParent
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
Definition MachineBasicBlock.h:327

llvm::MachineFrameInfo::setReturnAddressIsTaken
void setReturnAddressIsTaken(bool s)
Definition MachineFrameInfo.h:394

llvm::MachineFunction
Definition MachineFunction.h:294

llvm::MachineFunction::getRegInfo
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Definition MachineFunction.h:798

llvm::MachineIRBuilder
Helper class to build MachineInstr.
Definition MachineIRBuilder.h:237

llvm::MachineInstrBuilder
Definition MachineInstrBuilder.h:171

llvm::MachineInstrBuilder::setMemRefs
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
Definition MachineInstrBuilder.h:310

llvm::MachineInstrBuilder::setOperandDead
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
Definition MachineInstrBuilder.h:389

llvm::MachineInstrBuilder::addUse
const MachineInstrBuilder & addUse(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register use operand.
Definition MachineInstrBuilder.h:225

llvm::MachineInstrBuilder::addReg
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
Definition MachineInstrBuilder.h:199

llvm::MachineInstrBuilder::addImm
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
Definition MachineInstrBuilder.h:233

llvm::MachineInstrBuilder::add
const MachineInstrBuilder & add(const MachineOperand &MO) const
Definition MachineInstrBuilder.h:326

llvm::MachineInstrBuilder::addFrameIndex
const MachineInstrBuilder & addFrameIndex(int Idx) const
Definition MachineInstrBuilder.h:254

llvm::MachineInstrBuilder::addGlobalAddress
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
Definition MachineInstrBuilder.h:279

llvm::MachineInstrBuilder::addMBB
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
Definition MachineInstrBuilder.h:248

llvm::MachineInstrBuilder::addDef
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
Definition MachineInstrBuilder.h:218

llvm::MachineInstrBuilder::cloneMemRefs
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
Definition MachineInstrBuilder.h:315

llvm::MachineInstr
Representation of each machine instruction.
Definition MachineInstr.h:73

llvm::MachineInstr::getOpcode
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition MachineInstr.h:601

llvm::MachineInstr::getParent
const MachineBasicBlock * getParent() const
Definition MachineInstr.h:373

llvm::MachineInstr::getFlag
bool getFlag(MIFlag Flag) const
Return whether an MI flag is set.
Definition MachineInstr.h:423

llvm::MachineInstr::getNumOperands
unsigned getNumOperands() const
Returns the total number of operands.
Definition MachineInstr.h:604

llvm::MachineInstr::tieOperands
LLVM_ABI void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
Definition MachineInstr.cpp:1215

llvm::MachineInstr::NoUWrap
@ NoUWrap
Definition MachineInstr.h:110

llvm::MachineInstr::InBounds
@ InBounds
Definition MachineInstr.h:128

llvm::MachineInstr::getMF
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
Definition MachineInstr.cpp:782

llvm::MachineInstr::getDebugLoc
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
Definition MachineInstr.h:525

llvm::MachineInstr::getOperand
const MachineOperand & getOperand(unsigned i) const
Definition MachineInstr.h:609

llvm::MachineMemOperand::getSize
LocationSize getSize() const
Return the size in bytes of the memory reference.
Definition MachineMemOperand.h:243

llvm::MachineMemOperand::getAddrSpace
unsigned getAddrSpace() const
Definition MachineMemOperand.h:236

llvm::MachineMemOperand::MOLoad
@ MOLoad
The memory access reads data.
Definition MachineMemOperand.h:137

llvm::MachineMemOperand::MOStore
@ MOStore
The memory access writes data.
Definition MachineMemOperand.h:139

llvm::MachineMemOperand::getPointerInfo
const MachinePointerInfo & getPointerInfo() const
Definition MachineMemOperand.h:207

llvm::MachineMemOperand::getFlags
Flags getFlags() const
Return the raw flags of the source value.
Definition MachineMemOperand.h:227

llvm::MachineMemOperand::getValue
const Value * getValue() const
Return the base address of the memory access.
Definition MachineMemOperand.h:216

llvm::MachineMemOperand::getBaseAlign
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
Definition MachineMemOperand.h:266

llvm::MachineOperand
MachineOperand class - Representation of each machine instruction operand.
Definition MachineOperand.h:49

llvm::MachineOperand::getSubReg
unsigned getSubReg() const
Definition MachineOperand.h:377

llvm::MachineOperand::isUndef
bool isUndef() const
Definition MachineOperand.h:407

llvm::MachineOperand::getCImm
const ConstantInt * getCImm() const
Definition MachineOperand.h:565

llvm::MachineOperand::setImm
void setImm(int64_t immVal)
Definition MachineOperand.h:694

llvm::MachineOperand::getImm
int64_t getImm() const
Definition MachineOperand.h:560

llvm::MachineOperand::isImplicit
bool isImplicit() const
Definition MachineOperand.h:392

llvm::MachineOperand::isKill
bool isKill() const
Definition MachineOperand.h:402

llvm::MachineOperand::isReg
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Definition MachineOperand.h:331

llvm::MachineOperand::getShuffleMask
ArrayRef< int > getShuffleMask() const
Definition MachineOperand.h:626

llvm::MachineOperand::setReg
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
Definition MachineOperand.cpp:60

llvm::MachineOperand::isDef
bool isDef() const
Definition MachineOperand.h:387

llvm::MachineOperand::isImm
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
Definition MachineOperand.h:333

llvm::MachineOperand::getParent
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
Definition MachineOperand.h:246

llvm::MachineOperand::isDebug
bool isDebug() const
Definition MachineOperand.h:458

llvm::MachineOperand::getIndex
int getIndex() const
Definition MachineOperand.h:580

llvm::MachineOperand::isDead
bool isDead() const
Definition MachineOperand.h:397

llvm::MachineOperand::CreateImm
static MachineOperand CreateImm(int64_t Val)
Definition MachineOperand.h:833

llvm::MachineOperand::isEarlyClobber
bool isEarlyClobber() const
Definition MachineOperand.h:448

llvm::MachineOperand::getReg
Register getReg() const
getReg - Returns the register number.
Definition MachineOperand.h:372

llvm::MachineOperand::isInternalRead
bool isInternalRead() const
Definition MachineOperand.h:443

llvm::MachineOperand::CreateReg
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
Definition MachineOperand.h:851

llvm::MachineRegisterInfo
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Definition MachineRegisterInfo.h:53

llvm::MachineRegisterInfo::getVRegDef
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
Definition MachineRegisterInfo.cpp:404

llvm::MachineRegisterInfo::createVirtualRegister
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
Definition MachineRegisterInfo.cpp:154

llvm::MachineRegisterInfo::getType
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
Definition MachineRegisterInfo.h:771

llvm::MachineRegisterInfo::getRegBankOrNull
const RegisterBank * getRegBankOrNull(Register Reg) const
Return the register bank of Reg, or null if Reg has not been assigned a register bank or has been ass...
Definition MachineRegisterInfo.h:680

llvm::MachineRegisterInfo::cloneVirtualRegister
LLVM_ABI Register cloneVirtualRegister(Register VReg, StringRef Name="")
Create and return a new virtual register in the function with the same attributes as the given regist...
Definition MachineRegisterInfo.cpp:176

llvm::MachineRegisterInfo::getUniqueVRegDef
LLVM_ABI MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
Definition MachineRegisterInfo.cpp:417

llvm::PointerType::get
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.

llvm::PoisonValue::get
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition Constants.cpp:2026

llvm::ProfileSummaryInfo
Analysis providing profile information.
Definition ProfileSummaryInfo.h:42

llvm::RegisterBankInfo::getRegBank
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
Definition RegisterBankInfo.h:446

llvm::RegisterBank
This class implements the register bank concept.
Definition RegisterBank.h:29

llvm::RegisterBank::getID
unsigned getID() const
Get the identifier of this register bank.
Definition RegisterBank.h:46

llvm::Register
Wrapper class representing virtual and physical registers.
Definition Register.h:20

llvm::Register::isPhysical
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83

llvm::SIInstrInfo
Definition SIInstrInfo.h:101

llvm::SIInstrInfo::getMaxMUBUFImmOffset
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
Definition SIInstrInfo.cpp:10304

llvm::SIInstrInfo::getDSShaderTypeValue
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
Definition SIInstrInfo.cpp:11090

llvm::SIInstrInfo::MO_ABS32_LO
@ MO_ABS32_LO
Definition SIInstrInfo.h:255

llvm::SIRegisterInfo
Definition SIRegisterInfo.h:40

llvm::SIRegisterInfo::getSubRegFromChannel
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
Definition SIRegisterInfo.cpp:560

llvm::SmallVectorImpl
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition SmallVector.h:581

llvm::SmallVectorTemplateBase::push_back
void push_back(const T &Elt)
Definition SmallVector.h:423

llvm::SmallVectorTemplateCommon::size
size_t size() const
Definition SmallVector.h:83

llvm::SmallVectorTemplateCommon::empty
bool empty() const
Definition SmallVector.h:86

llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition SmallVector.h:1225

llvm::TargetInstrInfo::isGenericOpcode
static bool isGenericOpcode(unsigned Opc)
Definition TargetInstrInfo.h:140

llvm::TargetRegisterClass
Definition TargetRegisterInfo.h:45

llvm::TargetRegisterClass::getID
unsigned getID() const
Return the register class ID number.
Definition TargetRegisterInfo.h:74

llvm::TargetRegisterClass::hasSubClassEq
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
Definition TargetRegisterInfo.h:136

llvm::TargetRegisterClass::hasSuperClassEq
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
Definition TargetRegisterInfo.h:148

llvm::TargetRegisterInfo
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Definition TargetRegisterInfo.h:242

llvm::Triple::OSType
OSType
Definition Triple.h:212

llvm::Triple::AMDHSA
@ AMDHSA
Definition Triple.h:236

llvm::Triple::AMDPAL
@ AMDPAL
Definition Triple.h:246

llvm::Type::getInt32Ty
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:309

uint32_t

uint64_t

uint8_t

llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition ErrorHandling.h:164

llvm::AMDGPUAS::CONSTANT_ADDRESS_32BIT
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
Definition AMDGPUAddrSpace.h:40

llvm::AMDGPUAS::REGION_ADDRESS
@ REGION_ADDRESS
Address space for region memory. (GDS)
Definition AMDGPUAddrSpace.h:34

llvm::AMDGPUAS::LOCAL_ADDRESS
@ LOCAL_ADDRESS
Address space for local memory.
Definition AMDGPUAddrSpace.h:36

llvm::AMDGPUAS::GLOBAL_ADDRESS
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
Definition AMDGPUAddrSpace.h:33

llvm::AMDGPUAS::PRIVATE_ADDRESS
@ PRIVATE_ADDRESS
Address space for private memory.
Definition AMDGPUAddrSpace.h:38

llvm::AMDGPUAS::BUFFER_RESOURCE
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
Definition AMDGPUAddrSpace.h:45

llvm::AMDGPU::CPol::CPol
CPol
Definition SIDefines.h:372

llvm::AMDGPU::CPol::GLC
@ GLC
Definition SIDefines.h:373

llvm::AMDGPU::CPol::SWZ_pregfx12
@ SWZ_pregfx12
Definition SIDefines.h:381

llvm::AMDGPU::CPol::SCOPE_MASK
@ SCOPE_MASK
Definition SIDefines.h:407

llvm::AMDGPU::CPol::ALL
@ ALL
Definition SIDefines.h:420

llvm::AMDGPU::CPol::VIRTUAL_BITS
@ VIRTUAL_BITS
Definition SIDefines.h:433

llvm::AMDGPU::CPol::SCOPE_SE
@ SCOPE_SE
Definition SIDefines.h:410

llvm::AMDGPU::CPol::SWZ
@ SWZ
Definition SIDefines.h:416

llvm::AMDGPU::CPol::SCAL
@ SCAL
Definition SIDefines.h:418

llvm::AMDGPU::CPol::SCOPE_SHIFT
@ SCOPE_SHIFT
Definition SIDefines.h:406

llvm::AMDGPU::CPol::ALL_pregfx12
@ ALL_pregfx12
Definition SIDefines.h:380

llvm::AMDGPU::CPol::VOLATILE
@ VOLATILE
Definition SIDefines.h:430

llvm::AMDGPU::DPP::DPP_FI_1
@ DPP_FI_1
Definition SIDefines.h:1016

llvm::AMDGPU::DPP::DPP_FI_0
@ DPP_FI_0
Definition SIDefines.h:1015

llvm::AMDGPU::HSAMD::Kernel::Arg::Key::Align
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
Definition AMDGPUMetadata.h:183

llvm::AMDGPU::HSAMD::Kernel::Key::SymbolName
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
Definition AMDGPUMetadata.h:388

llvm::AMDGPU::SDWA::UNUSED_PRESERVE
@ UNUSED_PRESERVE
Definition SIDefines.h:945

llvm::AMDGPU::SDWA::WORD_1
@ WORD_1
Definition SIDefines.h:938

llvm::AMDGPU::SDWA::WORD_0
@ WORD_0
Definition SIDefines.h:937

llvm::AMDGPU::getMIMGG16MappingInfo
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)

llvm::AMDGPU::getMIMGOpcode
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
Definition AMDGPUBaseInfo.cpp:314

llvm::AMDGPU::getSMRDEncodedLiteralOffset32
std::optional< int64_t > getSMRDEncodedLiteralOffset32(const MCSubtargetInfo &ST, int64_t ByteOffset)
Definition AMDGPUBaseInfo.cpp:3473

llvm::AMDGPU::isGFX12Plus
bool isGFX12Plus(const MCSubtargetInfo &STI)
Definition AMDGPUBaseInfo.cpp:2647

llvm::AMDGPU::getNullPointerValue
constexpr int64_t getNullPointerValue(unsigned AS)
Get the null pointer value for the given address space.
Definition AMDGPUAddrSpace.h:178

llvm::AMDGPU::FlatAddrSpace
FlatAddrSpace
Definition AMDGPUAddrSpace.h:92

llvm::AMDGPU::FlatAddrSpace::FlatGlobal
@ FlatGlobal
Definition AMDGPUAddrSpace.h:92

llvm::AMDGPU::FlatAddrSpace::FlatScratch
@ FlatScratch
Definition AMDGPUAddrSpace.h:92

llvm::AMDGPU::FlatAddrSpace::FLAT
@ FLAT
Definition AMDGPUAddrSpace.h:92

llvm::AMDGPU::Imm
@ Imm
Definition AMDGPURegBankLegalizeRules.h:152

llvm::AMDGPU::hasNamedOperand
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
Definition AMDGPUBaseInfo.h:436

llvm::AMDGPU::isInlinableLiteral32
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
Definition AMDGPUBaseInfo.cpp:3090

llvm::AMDGPU::hasSMRDSignedImmOffset
bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST)
Definition AMDGPUBaseInfo.cpp:204

llvm::AMDGPU::getGlobalSaddrOp
LLVM_READONLY int32_t getGlobalSaddrOp(uint32_t Opcode)

llvm::AMDGPU::isGFX13Plus
bool isGFX13Plus(const MCSubtargetInfo &STI)
Definition AMDGPUBaseInfo.cpp:2665

llvm::AMDGPU::isGFX11Plus
bool isGFX11Plus(const MCSubtargetInfo &STI)
Definition AMDGPUBaseInfo.cpp:2639

llvm::AMDGPU::isGFX10Plus
bool isGFX10Plus(const MCSubtargetInfo &STI)
Definition AMDGPUBaseInfo.cpp:2631

llvm::AMDGPU::getSMRDEncodedOffset
std::optional< int64_t > getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset, bool IsBuffer, bool HasSOffset)
Definition AMDGPUBaseInfo.cpp:3444

llvm::AMDGPU::getRegBitWidth
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
Definition SIRegisterInfo.cpp:3576

llvm::AMDGPU::getMIMGDimInfo
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)

llvm::AMDGPU::getMIMGBaseOpcodeInfo
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)

llvm::AMDGPU::getIntrinsicID
Intrinsic::ID getIntrinsicID(const MachineInstr &I)
Return the intrinsic ID for opcodes with the G_AMDGPU_INTRIN_ prefix.
Definition AMDGPUInstrInfo.cpp:25

llvm::AMDGPU::getBaseWithConstantOffset
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelValueTracking *ValueTracking=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
Definition AMDGPUGlobalISelUtils.cpp:26

llvm::AMDGPU::getImageDimIntrinsicInfo
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)

llvm::ARMII::VecSize
@ VecSize
Definition ARMBaseInfo.h:437

llvm::ARMII::IndexMode
IndexMode
ARM Index Modes.
Definition ARMBaseInfo.h:177

llvm::ARM::ProfileKind::M
@ M
Definition ARMTargetParser.h:171

llvm::BitmaskEnumDetail::Mask
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition BitmaskEnum.h:126

llvm::ISD::ConstantFP
@ ConstantFP
Definition ISDOpcodes.h:87

llvm::Intrinsic::getOrInsertDeclaration
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
Definition Intrinsics.cpp:780

llvm::Intrinsic::ID
unsigned ID
Definition GenericSSAContext.h:28

llvm::M68k::MemAddrModeKind::V
@ V
Definition M68kBaseInfo.h:62

llvm::MCOI::EARLY_CLOBBER
@ EARLY_CLOBBER
Definition MCInstrDesc.h:38

llvm::MIPatternMatch
Definition MIPatternMatch.h:25

llvm::MIPatternMatch::m_Reg
operand_type_match m_Reg()
Definition MIPatternMatch.h:311

llvm::MIPatternMatch::m_SpecificICst
SpecificConstantMatch m_SpecificICst(const APInt &RequestedValue)
Matches a constant equal to RequestedValue.
Definition MIPatternMatch.h:213

llvm::MIPatternMatch::m_GCst
GCstAndRegMatch m_GCst(std::optional< ValueAndVReg > &ValReg)
Definition MIPatternMatch.h:160

llvm::MIPatternMatch::m_Copy
UnaryOp_match< SrcTy, TargetOpcode::COPY > m_Copy(SrcTy &&Src)
Definition MIPatternMatch.h:755

llvm::MIPatternMatch::m_GZExt
UnaryOp_match< SrcTy, TargetOpcode::G_ZEXT > m_GZExt(const SrcTy &Src)
Definition MIPatternMatch.h:706

llvm::MIPatternMatch::m_GXor
BinaryOp_match< LHS, RHS, TargetOpcode::G_XOR, true > m_GXor(const LHS &L, const RHS &R)
Definition MIPatternMatch.h:612

llvm::MIPatternMatch::m_GSExt
UnaryOp_match< SrcTy, TargetOpcode::G_SEXT > m_GSExt(const SrcTy &Src)
Definition MIPatternMatch.h:701

llvm::MIPatternMatch::m_GFPExt
UnaryOp_match< SrcTy, TargetOpcode::G_FPEXT > m_GFPExt(const SrcTy &Src)
Definition MIPatternMatch.h:711

llvm::MIPatternMatch::m_ZeroInt
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
Definition MIPatternMatch.h:278

llvm::MIPatternMatch::m_ICst
ConstantMatch< APInt > m_ICst(APInt &Cst)
Definition MIPatternMatch.h:102

llvm::MIPatternMatch::m_AllOnesInt
SpecificConstantMatch m_AllOnesInt()
Definition MIPatternMatch.h:281

llvm::MIPatternMatch::m_GOr
BinaryOp_match< LHS, RHS, TargetOpcode::G_OR, true > m_GOr(const LHS &L, const RHS &R)
Definition MIPatternMatch.h:617

llvm::MIPatternMatch::m_ICstOrSplat
ICstOrSplatMatch< APInt > m_ICstOrSplat(APInt &Cst)
Definition MIPatternMatch.h:143

llvm::MIPatternMatch::m_GImplicitDef
ImplicitDefMatch m_GImplicitDef()
Definition MIPatternMatch.h:472

llvm::MIPatternMatch::m_Not
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
Definition MIPatternMatch.h:943

llvm::MIPatternMatch::m_GAShr
BinaryOp_match< LHS, RHS, TargetOpcode::G_ASHR, false > m_GAShr(const LHS &L, const RHS &R)
Definition MIPatternMatch.h:649

llvm::MIPatternMatch::mi_match
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
Definition MIPatternMatch.h:28

llvm::MIPatternMatch::m_GPtrAdd
BinaryOp_match< LHS, RHS, TargetOpcode::G_PTR_ADD, false > m_GPtrAdd(const LHS &L, const RHS &R)
Definition MIPatternMatch.h:570

llvm::MIPatternMatch::m_SpecificReg
SpecificRegisterMatch m_SpecificReg(Register RequestedReg)
Matches a register only if it is equal to RequestedReg.
Definition MIPatternMatch.h:295

llvm::MIPatternMatch::m_GShl
BinaryOp_match< LHS, RHS, TargetOpcode::G_SHL, false > m_GShl(const LHS &L, const RHS &R)
Definition MIPatternMatch.h:637

llvm::MIPatternMatch::m_any_of
Or< Preds... > m_any_of(Preds &&... preds)
Definition MIPatternMatch.h:355

llvm::MIPatternMatch::m_GAnd
BinaryOp_match< LHS, RHS, TargetOpcode::G_AND, true > m_GAnd(const LHS &L, const RHS &R)
Definition MIPatternMatch.h:606

llvm::MIPatternMatch::m_GBitcast
UnaryOp_match< SrcTy, TargetOpcode::G_BITCAST > m_GBitcast(const SrcTy &Src)
Definition MIPatternMatch.h:722

llvm::MIPatternMatch::m_MInstr
bind_ty< MachineInstr * > m_MInstr(MachineInstr *&MI)
Definition MIPatternMatch.h:424

llvm::MIPatternMatch::m_GFNeg
UnaryOp_match< SrcTy, TargetOpcode::G_FNEG > m_GFNeg(const SrcTy &Src)
Definition MIPatternMatch.h:750

llvm::MIPatternMatch::m_GFCstOrSplat
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
Definition MIPatternMatch.h:189

llvm::MIPatternMatch::m_GFabs
UnaryOp_match< SrcTy, TargetOpcode::G_FABS > m_GFabs(const SrcTy &Src)
Definition MIPatternMatch.h:745

llvm::MIPatternMatch::m_GLShr
BinaryOp_match< LHS, RHS, TargetOpcode::G_LSHR, false > m_GLShr(const LHS &L, const RHS &R)
Definition MIPatternMatch.h:643

llvm::MIPatternMatch::m_GAnyExt
UnaryOp_match< SrcTy, TargetOpcode::G_ANYEXT > m_GAnyExt(const SrcTy &Src)
Definition MIPatternMatch.h:696

llvm::MIPatternMatch::m_OneUse
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
Definition MIPatternMatch.h:56

llvm::MIPatternMatch::m_GMul
BinaryOp_match< LHS, RHS, TargetOpcode::G_MUL, true > m_GMul(const LHS &L, const RHS &R)
Definition MIPatternMatch.h:582

llvm::MIPatternMatch::m_GTrunc
UnaryOp_match< SrcTy, TargetOpcode::G_TRUNC > m_GTrunc(const SrcTy &Src)
Definition MIPatternMatch.h:716

llvm::PatternMatch::m_BinOp
auto m_BinOp()
Match an arbitrary binary operation and ignore it.
Definition PatternMatch.h:141

llvm::SIInstrFlags::DS
@ DS
Definition SIDefines.h:92

llvm::SISrcMods::ABS
@ ABS
Definition SIDefines.h:279

llvm::SISrcMods::OP_SEL_0
@ OP_SEL_0
Definition SIDefines.h:282

llvm::SISrcMods::DST_OP_SEL
@ DST_OP_SEL
Definition SIDefines.h:284

llvm::SISrcMods::NEG_HI
@ NEG_HI
Definition SIDefines.h:281

llvm::SISrcMods::OP_SEL_1
@ OP_SEL_1
Definition SIDefines.h:283

llvm::SISrcMods::NEG
@ NEG
Definition SIDefines.h:278

llvm::SPII::Load
@ Load
Definition SparcInstrInfo.h:32

llvm::X86::FirstMacroFusionInstKind::Cmp
@ Cmp
Definition X86BaseInfo.h:109

llvm::codeview::EncodedFramePtrReg::BasePtr
@ BasePtr
Definition CodeView.h:527

llvm::codeview::PublicSymFlags::Function
@ Function
Definition CodeView.h:408

llvm::dwarf_linker::DebugSectionKind::DebugLoc
@ DebugLoc
Definition DWARFLinkerBase.h:34

llvm::lltok::APFloat
@ APFloat
Definition LLToken.h:533

llvm::lsp::TraceLevel::Off
@ Off
Definition Protocol.h:200

llvm::lsp::MessageType::Info
@ Info
Definition Protocol.h:1295

llvm::ms_demangle::IntrinsicFunctionKind::New
@ New
Definition MicrosoftDemangleNodes.h:121

llvm::orc::MemProt::Exec
@ Exec
Definition MemoryFlags.h:31

llvm::rdf::Def
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384

llvm::sampleprof::Base
@ Base
Definition Discriminator.h:58

llvm::sandboxir::Instruction
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
Definition BasicBlock.h:73

llvm::sframe::BaseReg::SP
@ SP
Definition SFrame.h:79

llvm::sframe::Flags
Flags
Definition SFrame.h:39

llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition FunctionInfo.h:25

llvm::getFunctionLiveInPhysReg
LLVM_ABI Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
Definition Utils.cpp:858

llvm::Offset
@ Offset
Definition DWP.cpp:558

llvm::Value
FunctionAddr VTableAddr Value
Definition InstrProf.h:137

llvm::isBuildVectorAllZeros
LLVM_ABI bool isBuildVectorAllZeros(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndef=false)
Return true if the specified instruction is a G_BUILD_VECTOR or G_BUILD_VECTOR_TRUNC where all of the...
Definition Utils.cpp:1444

llvm::constrainOperandRegClass
LLVM_ABI Register constrainOperandRegClass(const MachineFunction &MF, const TargetRegisterInfo &TRI, MachineRegisterInfo &MRI, const TargetInstrInfo &TII, const RegisterBankInfo &RBI, MachineInstr &InsertPt, const TargetRegisterClass &RegClass, MachineOperand &RegMO)
Constrain the Register operand OpIdx, so that it is now constrained to the TargetRegisterClass passed...
Definition Utils.cpp:57

llvm::getOpcodeDef
LLVM_ABI MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by a single def instruction that is Opcode.
Definition Utils.cpp:653

llvm::RegClassOrRegBank
PointerUnion< const TargetRegisterClass *, const RegisterBank * > RegClassOrRegBank
Convenient type to represent either a register class or a register bank.
Definition MachineRegisterInfo.h:47

llvm::getConstantFPVRegVal
LLVM_ABI const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
Definition Utils.cpp:461

llvm::BuildMI
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
Definition MachineInstrBuilder.h:449

llvm::getIConstantVRegVal
LLVM_ABI std::optional< APInt > getIConstantVRegVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT, return the corresponding value.
Definition Utils.cpp:294

llvm::isInt
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165

llvm::RegState::Implicit
@ Implicit
Not emitted register (e.g. carry, or temporary result).
Definition MachineInstrBuilder.h:59

llvm::RegState::Dead
@ Dead
Unused definition.
Definition MachineInstrBuilder.h:63

llvm::RegState::Kill
@ Kill
The last use of a register.
Definition MachineInstrBuilder.h:61

llvm::Depth
@ Depth
Definition SIMachineScheduler.h:36

llvm::dyn_cast
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643

llvm::constrainSelectedInstRegOperands
LLVM_ABI void constrainSelectedInstRegOperands(MachineInstr &I, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI)
Mutate the newly-selected instruction I to constrain its (possibly generic) virtual register operands...
Definition Utils.cpp:156

llvm::isPowerOf2_64
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284

llvm::getDefIgnoringCopies
LLVM_ABI MachineInstr * getDefIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the def instruction for Reg, folding away any trivial copies.
Definition Utils.cpp:494

llvm::popcount
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:156

llvm::Log2_64
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:337

llvm::getIConstantVRegSExtVal
LLVM_ABI std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT whose value fits in int64_t, returns it.
Definition Utils.cpp:314

llvm::getImm
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
Definition SPIRVUtils.cpp:1127

llvm::Hi_32
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150

llvm::dbgs
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209

llvm::getAnyConstantVRegValWithLookThrough
LLVM_ABI std::optional< ValueAndVReg > getAnyConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true, bool LookThroughAnyExt=false)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT or G_FCONST...
Definition Utils.cpp:439

llvm::isUInt
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189

llvm::SmallVector
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
Definition SmallVector.h:1151

llvm::Lo_32
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155

llvm::isa
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547

llvm::Key
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
Definition PassManager.h:690

llvm::Data
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:221

llvm::LEB128Sign::Signed
@ Signed
Definition LEB128.h:232

llvm::RecurKind::Or
@ Or
Bitwise or logical OR of integers.
Definition IVDescriptors.h:42

llvm::RecurKind::Mul
@ Mul
Product of integers.
Definition IVDescriptors.h:41

llvm::RecurKind::SMax
@ SMax
Signed integer max implemented in terms of select(cmp()).
Definition IVDescriptors.h:46

llvm::RecurKind::And
@ And
Bitwise or logical AND of integers.
Definition IVDescriptors.h:43

llvm::RecurKind::Sub
@ Sub
Subtraction of integers.
Definition IVDescriptors.h:39

llvm::RecurKind::Add
@ Add
Sum of integers.
Definition IVDescriptors.h:38

llvm::Op
DWARFExpression::Operation Op
Definition DWARFExpressionPrinter.cpp:25

llvm::DS_Error
@ DS_Error
Definition DiagnosticInfo.h:51

llvm::cast
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559

llvm::getIConstantVRegValWithLookThrough
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:433

llvm::getDefSrcRegIgnoringCopies
LLVM_ABI std::optional< DefinitionAndSourceRegister > getDefSrcRegIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the def instruction for Reg, and underlying value Register folding away any copies.
Definition Utils.cpp:469

llvm::getSrcRegIgnoringCopies
LLVM_ABI Register getSrcRegIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the source register for Reg, folding away any trivial copies.
Definition Utils.cpp:501

llvm::maskTrailingOnes
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77

llvm::getUndefRegState
constexpr RegState getUndefRegState(bool B)
Definition MachineInstrBuilder.h:96

llvm::ValueUniformity::Default
@ Default
The result value is uniform if and only if all operands are uniform.
Definition Uniformity.h:20

llvm::AMDGPU::ImageDimIntrinsicInfo
Definition AMDGPUInstrInfo.h:49

llvm::AMDGPU::ImageDimIntrinsicInfo::BaseOpcode
unsigned BaseOpcode
Definition AMDGPUInstrInfo.h:51

llvm::AMDGPU::ImageDimIntrinsicInfo::AtomicNoRetBaseOpcode
unsigned AtomicNoRetBaseOpcode
Definition AMDGPUInstrInfo.h:52

llvm::AMDGPU::ImageDimIntrinsicInfo::CachePolicyIndex
uint8_t CachePolicyIndex
Definition AMDGPUInstrInfo.h:78

llvm::AMDGPU::ImageDimIntrinsicInfo::Dim
MIMGDim Dim
Definition AMDGPUInstrInfo.h:53

llvm::AMDGPU::ImageDimIntrinsicInfo::NumArgs
uint8_t NumArgs
Definition AMDGPUInstrInfo.h:62

llvm::AMDGPU::ImageDimIntrinsicInfo::RsrcIndex
uint8_t RsrcIndex
Definition AMDGPUInstrInfo.h:74

llvm::AMDGPU::ImageDimIntrinsicInfo::UnormIndex
uint8_t UnormIndex
Definition AMDGPUInstrInfo.h:76

llvm::AMDGPU::ImageDimIntrinsicInfo::SampIndex
uint8_t SampIndex
Definition AMDGPUInstrInfo.h:75

llvm::AMDGPU::ImageDimIntrinsicInfo::TexFailCtrlIndex
uint8_t TexFailCtrlIndex
Definition AMDGPUInstrInfo.h:77

llvm::AMDGPU::ImageDimIntrinsicInfo::VAddrStart
uint8_t VAddrStart
Definition AMDGPUInstrInfo.h:65

llvm::AMDGPU::ImageDimIntrinsicInfo::DMaskIndex
uint8_t DMaskIndex
Definition AMDGPUInstrInfo.h:64

llvm::AMDGPU::MIMGBaseOpcodeInfo::Gather4
bool Gather4
Definition AMDGPUBaseInfo.h:449

llvm::AMDGPU::MIMGBaseOpcodeInfo::AtomicX2
bool AtomicX2
Definition AMDGPUBaseInfo.h:447

llvm::AMDGPU::MIMGBaseOpcodeInfo::Sampler
bool Sampler
Definition AMDGPUBaseInfo.h:448

llvm::AMDGPU::MIMGBaseOpcodeInfo::NoReturn
bool NoReturn
Definition AMDGPUBaseInfo.h:460

llvm::AMDGPU::MIMGBaseOpcodeInfo::HasD16
bool HasD16
Definition AMDGPUBaseInfo.h:456

llvm::AMDGPU::MIMGBaseOpcodeInfo::Store
bool Store
Definition AMDGPUBaseInfo.h:445

llvm::AMDGPU::MIMGBaseOpcodeInfo::Atomic
bool Atomic
Definition AMDGPUBaseInfo.h:446

llvm::AMDGPU::MIMGDimInfo::Encoding
uint8_t Encoding
Definition AMDGPUBaseInfo.h:476

llvm::AMDGPU::MIMGDimInfo::DA
bool DA
Definition AMDGPUBaseInfo.h:475

llvm::AMDGPU::MIMGG16MappingInfo::G16
MIMGBaseOpcode G16
Definition AMDGPUBaseInfo.h:511

llvm::KnownBits::makeConstant
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition KnownBits.h:315

llvm::KnownBits::add
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false, bool SelfAdd=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:361

llvm::MachinePointerInfo::Offset
int64_t Offset
Offset - This is an offset from the base Value*.
Definition MachineMemOperand.h:47

llvm::MachinePointerInfo::AddrSpace
unsigned AddrSpace
Definition MachineMemOperand.h:49

llvm::MachinePointerInfo::V
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
Definition MachineMemOperand.h:44