LLVM 23.0.0git
GCNSubtarget.cpp
Go to the documentation of this file.
//===-- GCNSubtarget.cpp - GCN Subtarget Information ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the GCN specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//
13
14#include "GCNSubtarget.h"
15#include "AMDGPUCallLowering.h"
17#include "AMDGPULegalizerInfo.h"
20#include "AMDGPUTargetMachine.h"
28#include "llvm/IR/MDBuilder.h"
29#include <algorithm>
30
31using namespace llvm;
32
33#define DEBUG_TYPE "gcn-subtarget"
34
35#define GET_SUBTARGETINFO_TARGET_DESC
36#define GET_SUBTARGETINFO_CTOR
37#define AMDGPUSubtarget GCNSubtarget
38#include "AMDGPUGenSubtargetInfo.inc"
39#undef AMDGPUSubtarget
40
42 "amdgpu-vgpr-index-mode",
43 cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
44 cl::init(false));
45
46static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
47 cl::desc("Enable the use of AA during codegen."),
48 cl::init(true));
49
51 NSAThreshold("amdgpu-nsa-threshold",
52 cl::desc("Number of addresses from which to enable MIMG NSA."),
54
56
// NOTE(review): this listing is a doxygen scrape — every line carries a fused
// source line number and several source lines were dropped outright. The code
// below is preserved byte-for-byte; each elision is flagged inline. The
// function header itself (source line 57,
// "GCNSubtarget &GCNSubtarget::initializeSubtargetDependencies(const Triple
// &TT," per the cross-reference index) was dropped. The function builds the
// full feature string from defaults + FS, parses it, then patches up
// inter-dependent defaults, and returns *this so it can be used in the
// constructor's member-init list.
58 StringRef GPU,
59 StringRef FS) {
60 // Determine default and user-specified characteristics
61 //
62 // We want to be able to turn these off, but making this a subtarget feature
63 // for SI has the unhelpful behavior that it unsets everything else if you
64 // disable it.
65 //
66 // Similarly we want enable-prt-strict-null to be on by default and not to
67 // unset everything else if it is disabled
68
69 SmallString<256> FullFS("+load-store-opt,+enable-ds128,");
70
71 // Turn on features that HSA ABI requires. Also turn on FlatForGlobal by
72 // default
73 if (isAmdHsaOS())
74 FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";
75
76 FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
77
78 // Disable mutually exclusive bits.
79 if (FS.contains_insensitive("+wavefrontsize")) {
80 if (!FS.contains_insensitive("wavefrontsize16"))
81 FullFS += "-wavefrontsize16,";
82 if (!FS.contains_insensitive("wavefrontsize32"))
83 FullFS += "-wavefrontsize32,";
84 if (!FS.contains_insensitive("wavefrontsize64"))
85 FullFS += "-wavefrontsize64,";
86 }
87
// User FS is appended last so explicit user features override the defaults
// seeded above.
88 FullFS += FS;
89
90 ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);
91
92 // Implement the "generic" processors, which acts as the default when no
93 // generation features are enabled (e.g for -mcpu=''). HSA OS defaults to
94 // the first amdgcn target that supports flat addressing. Other OSes defaults
95 // to the first amdgcn target.
// NOTE(review): source lines 96-98 were dropped by the extraction here
// (presumably the branch selecting a default generation for the generic
// processor) — restore from upstream.
99 // Assume wave64 for the unknown target, if not explicitly set.
100 if (getWavefrontSizeLog2() == 0)
// NOTE(review): source line 101 was dropped (the guarded wave64 default
// assignment) — restore from upstream.
102 } else if (!hasFeature(AMDGPU::FeatureWavefrontSize32) &&
103 !hasFeature(AMDGPU::FeatureWavefrontSize64)) {
104 // If there is no default wave size it must be a generation before gfx10,
105 // these have FeatureWavefrontSize64 in their definition already. For gfx10+
106 // set wave32 as a default.
107 ToggleFeature(AMDGPU::FeatureWavefrontSize32);
// NOTE(review): source line 108 was dropped here — restore from upstream.
109 }
110
111 // We don't support FP64 for EG/NI atm.
// NOTE(review): source line 112 was dropped (presumably an assertion about
// FP64 support) — restore from upstream.
113
114 // Targets must either support 64-bit offsets for MUBUF instructions, and/or
115 // support flat operations, otherwise they cannot access a 64-bit global
116 // address space
117 assert(hasAddr64() || hasFlat());
118 // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
119 // that do not support ADDR64 variants of MUBUF instructions. Such targets
120 // cannot use a 64 bit offset with a MUBUF instruction to access the global
121 // address space
122 if (!hasAddr64() && !FS.contains("flat-for-global") && !UseFlatForGlobal) {
123 ToggleFeature(AMDGPU::FeatureUseFlatForGlobal);
124 UseFlatForGlobal = true;
125 }
126 // Unless +-flat-for-global is specified, use MUBUF instructions for global
127 // address space access if flat operations are not available.
128 if (!hasFlat() && !FS.contains("flat-for-global") && UseFlatForGlobal) {
129 ToggleFeature(AMDGPU::FeatureUseFlatForGlobal);
130 UseFlatForGlobal = false;
131 }
132
133 // Set defaults if needed.
134 if (MaxPrivateElementSize == 0)
// NOTE(review): source line 135 was dropped (the MaxPrivateElementSize
// default assignment guarded by the 'if' above) — restore from upstream.
136
137 if (LDSBankCount == 0)
138 LDSBankCount = 32;
139
// NOTE(review): source lines 140-141 were dropped here — restore upstream.
142
143 if (FlatOffsetBitWidth == 0)
// NOTE(review): source line 144 was dropped (the FlatOffsetBitWidth default
// assignment guarded by the 'if' above) — restore from upstream.
145
// NOTE(review): source line 146 was dropped here — restore from upstream.
147
// NOTE(review): source lines 148-149 were dropped here — restore upstream.
150
// Derive the xnack/sramecc target-ID settings from the raw feature string.
151 TargetID.setTargetIDFromFeaturesString(FS);
152
153 LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
154 << TargetID.getXnackSetting() << '\n');
155 LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
156 << TargetID.getSramEccSetting() << '\n');
157
158 return *this;
159}
160
162 LLVMContext &Ctx = F.getContext();
163 if (hasFeature(AMDGPU::FeatureWavefrontSize32) &&
164 hasFeature(AMDGPU::FeatureWavefrontSize64)) {
165 Ctx.diagnose(DiagnosticInfoUnsupported(
166 F, "must specify exactly one of wavefrontsize32 and wavefrontsize64"));
167 }
168}
169
171 const GCNTargetMachine &TM)
172 : // clang-format off
173 AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
174 AMDGPUSubtarget(TT),
175 TargetID(*this),
176 InstrItins(getInstrItineraryForCPU(GPU)),
177 InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
178 TLInfo(TM, *this),
179 FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
180 // clang-format on
183
184 TSInfo = std::make_unique<AMDGPUSelectionDAGInfo>();
185
186 CallLoweringInfo = std::make_unique<AMDGPUCallLowering>(*getTargetLowering());
187 InlineAsmLoweringInfo =
188 std::make_unique<InlineAsmLowering>(getTargetLowering());
189 Legalizer = std::make_unique<AMDGPULegalizerInfo>(*this, TM);
190 RegBankInfo = std::make_unique<AMDGPURegisterBankInfo>(*this);
191 InstSelector =
192 std::make_unique<AMDGPUInstructionSelector>(*this, *RegBankInfo, TM);
193}
194
196 return TSInfo.get();
197}
198
199unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
200 if (getGeneration() < GFX10)
201 return 1;
202
203 switch (Opcode) {
204 case AMDGPU::V_LSHLREV_B64_e64:
205 case AMDGPU::V_LSHLREV_B64_gfx10:
206 case AMDGPU::V_LSHLREV_B64_e64_gfx11:
207 case AMDGPU::V_LSHLREV_B64_e32_gfx12:
208 case AMDGPU::V_LSHLREV_B64_e64_gfx12:
209 case AMDGPU::V_LSHL_B64_e64:
210 case AMDGPU::V_LSHRREV_B64_e64:
211 case AMDGPU::V_LSHRREV_B64_gfx10:
212 case AMDGPU::V_LSHRREV_B64_e64_gfx11:
213 case AMDGPU::V_LSHRREV_B64_e64_gfx12:
214 case AMDGPU::V_LSHR_B64_e64:
215 case AMDGPU::V_ASHRREV_I64_e64:
216 case AMDGPU::V_ASHRREV_I64_gfx10:
217 case AMDGPU::V_ASHRREV_I64_e64_gfx11:
218 case AMDGPU::V_ASHRREV_I64_e64_gfx12:
219 case AMDGPU::V_ASHR_I64_e64:
220 return 1;
221 }
222
223 return 2;
224}
225
226/// This list was mostly derived from experimentation.
227bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
228 switch (Opcode) {
229 case AMDGPU::V_CVT_F16_F32_e32:
230 case AMDGPU::V_CVT_F16_F32_e64:
231 case AMDGPU::V_CVT_F16_U16_e32:
232 case AMDGPU::V_CVT_F16_U16_e64:
233 case AMDGPU::V_CVT_F16_I16_e32:
234 case AMDGPU::V_CVT_F16_I16_e64:
235 case AMDGPU::V_RCP_F16_e64:
236 case AMDGPU::V_RCP_F16_e32:
237 case AMDGPU::V_RSQ_F16_e64:
238 case AMDGPU::V_RSQ_F16_e32:
239 case AMDGPU::V_SQRT_F16_e64:
240 case AMDGPU::V_SQRT_F16_e32:
241 case AMDGPU::V_LOG_F16_e64:
242 case AMDGPU::V_LOG_F16_e32:
243 case AMDGPU::V_EXP_F16_e64:
244 case AMDGPU::V_EXP_F16_e32:
245 case AMDGPU::V_SIN_F16_e64:
246 case AMDGPU::V_SIN_F16_e32:
247 case AMDGPU::V_COS_F16_e64:
248 case AMDGPU::V_COS_F16_e32:
249 case AMDGPU::V_FLOOR_F16_e64:
250 case AMDGPU::V_FLOOR_F16_e32:
251 case AMDGPU::V_CEIL_F16_e64:
252 case AMDGPU::V_CEIL_F16_e32:
253 case AMDGPU::V_TRUNC_F16_e64:
254 case AMDGPU::V_TRUNC_F16_e32:
255 case AMDGPU::V_RNDNE_F16_e64:
256 case AMDGPU::V_RNDNE_F16_e32:
257 case AMDGPU::V_FRACT_F16_e64:
258 case AMDGPU::V_FRACT_F16_e32:
259 case AMDGPU::V_FREXP_MANT_F16_e64:
260 case AMDGPU::V_FREXP_MANT_F16_e32:
261 case AMDGPU::V_FREXP_EXP_I16_F16_e64:
262 case AMDGPU::V_FREXP_EXP_I16_F16_e32:
263 case AMDGPU::V_LDEXP_F16_e64:
264 case AMDGPU::V_LDEXP_F16_e32:
265 case AMDGPU::V_LSHLREV_B16_e64:
266 case AMDGPU::V_LSHLREV_B16_e32:
267 case AMDGPU::V_LSHRREV_B16_e64:
268 case AMDGPU::V_LSHRREV_B16_e32:
269 case AMDGPU::V_ASHRREV_I16_e64:
270 case AMDGPU::V_ASHRREV_I16_e32:
271 case AMDGPU::V_ADD_U16_e64:
272 case AMDGPU::V_ADD_U16_e32:
273 case AMDGPU::V_SUB_U16_e64:
274 case AMDGPU::V_SUB_U16_e32:
275 case AMDGPU::V_SUBREV_U16_e64:
276 case AMDGPU::V_SUBREV_U16_e32:
277 case AMDGPU::V_MUL_LO_U16_e64:
278 case AMDGPU::V_MUL_LO_U16_e32:
279 case AMDGPU::V_ADD_F16_e64:
280 case AMDGPU::V_ADD_F16_e32:
281 case AMDGPU::V_SUB_F16_e64:
282 case AMDGPU::V_SUB_F16_e32:
283 case AMDGPU::V_SUBREV_F16_e64:
284 case AMDGPU::V_SUBREV_F16_e32:
285 case AMDGPU::V_MUL_F16_e64:
286 case AMDGPU::V_MUL_F16_e32:
287 case AMDGPU::V_MAX_F16_e64:
288 case AMDGPU::V_MAX_F16_e32:
289 case AMDGPU::V_MIN_F16_e64:
290 case AMDGPU::V_MIN_F16_e32:
291 case AMDGPU::V_MAX_U16_e64:
292 case AMDGPU::V_MAX_U16_e32:
293 case AMDGPU::V_MIN_U16_e64:
294 case AMDGPU::V_MIN_U16_e32:
295 case AMDGPU::V_MAX_I16_e64:
296 case AMDGPU::V_MAX_I16_e32:
297 case AMDGPU::V_MIN_I16_e64:
298 case AMDGPU::V_MIN_I16_e32:
299 case AMDGPU::V_MAD_F16_e64:
300 case AMDGPU::V_MAD_U16_e64:
301 case AMDGPU::V_MAD_I16_e64:
302 case AMDGPU::V_FMA_F16_e64:
303 case AMDGPU::V_DIV_FIXUP_F16_e64:
304 // On gfx10, all 16-bit instructions preserve the high bits.
306 case AMDGPU::V_MADAK_F16:
307 case AMDGPU::V_MADMK_F16:
308 case AMDGPU::V_MAC_F16_e64:
309 case AMDGPU::V_MAC_F16_e32:
310 case AMDGPU::V_FMAMK_F16:
311 case AMDGPU::V_FMAAK_F16:
312 case AMDGPU::V_FMAC_F16_e64:
313 case AMDGPU::V_FMAC_F16_e32:
314 // In gfx9, the preferred handling of the unused high 16-bits changed. Most
315 // instructions maintain the legacy behavior of 0ing. Some instructions
316 // changed to preserving the high bits.
318 case AMDGPU::V_MAD_MIXLO_F16:
319 case AMDGPU::V_MAD_MIXHI_F16:
320 default:
321 return false;
322 }
323}
324
326 const SchedRegion &Region) const {
327 // Track register pressure so the scheduler can try to decrease
328 // pressure once register usage is above the threshold defined by
329 // SIRegisterInfo::getRegPressureSetLimit()
330 Policy.ShouldTrackPressure = true;
331
332 // Enabling both top down and bottom up scheduling seems to give us less
333 // register spills than just using one of these approaches on its own.
334 Policy.OnlyTopDown = false;
335 Policy.OnlyBottomUp = false;
336
337 // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
338 if (!enableSIScheduler())
339 Policy.ShouldTrackLaneMasks = true;
340}
341
343 const SchedRegion &Region) const {
344 const Function &F = Region.RegionBegin->getMF()->getFunction();
345 Attribute PostRADirectionAttr = F.getFnAttribute("amdgpu-post-ra-direction");
346 if (!PostRADirectionAttr.isValid())
347 return;
348
349 StringRef PostRADirectionStr = PostRADirectionAttr.getValueAsString();
350 if (PostRADirectionStr == "topdown") {
351 Policy.OnlyTopDown = true;
352 Policy.OnlyBottomUp = false;
353 } else if (PostRADirectionStr == "bottomup") {
354 Policy.OnlyTopDown = false;
355 Policy.OnlyBottomUp = true;
356 } else if (PostRADirectionStr == "bidirectional") {
357 Policy.OnlyTopDown = false;
358 Policy.OnlyBottomUp = false;
359 } else {
361 F, F.getSubprogram(), "invalid value for postRA direction attribute");
362 F.getContext().diagnose(Diag);
363 }
364
365 LLVM_DEBUG({
366 const char *DirStr = "default";
367 if (Policy.OnlyTopDown && !Policy.OnlyBottomUp)
368 DirStr = "topdown";
369 else if (!Policy.OnlyTopDown && Policy.OnlyBottomUp)
370 DirStr = "bottomup";
371 else if (!Policy.OnlyTopDown && !Policy.OnlyBottomUp)
372 DirStr = "bidirectional";
373
374 dbgs() << "Post-MI-sched direction (" << F.getName() << "): " << DirStr
375 << '\n';
376 });
377}
378
380 if (isWave32()) {
381 // Fix implicit $vcc operands after MIParser has verified that they match
382 // the instruction definitions.
383 for (auto &MBB : MF) {
384 for (auto &MI : MBB)
385 InstrInfo.fixImplicitOperands(MI);
386 }
387 }
388}
389
391 return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
392}
393
395 return hasVGPRIndexMode() && (!hasMovrel() || EnableVGPRIndexMode);
396}
397
398bool GCNSubtarget::useAA() const { return UseAA; }
399
404
405unsigned
407 unsigned DynamicVGPRBlockSize) const {
409 DynamicVGPRBlockSize);
410}
411
412unsigned
413GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
415 return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
416
417 if (HasFlatScratch || HasArchitectedFlatScratch) {
419 return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
421 return 4; // FLAT_SCRATCH, VCC (in that order).
422 }
423
424 if (isXNACKEnabled())
425 return 4; // XNACK, VCC (in that order).
426 return 2; // VCC.
427}
428
433
435 // In principle we do not need to reserve SGPR pair used for flat_scratch if
436 // we know flat instructions do not access the stack anywhere in the
437 // program. For now assume it's needed if we have flat instructions.
438 const bool KernelUsesFlatScratch = hasFlatAddressSpace();
439 return getBaseReservedNumSGPRs(KernelUsesFlatScratch);
440}
441
442std::pair<unsigned, unsigned>
444 unsigned NumSGPRs, unsigned NumVGPRs) const {
445 unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
446 // Temporarily check both the attribute and the subtarget feature until the
447 // latter is removed.
448 if (DynamicVGPRBlockSize == 0 && isDynamicVGPREnabled())
449 DynamicVGPRBlockSize = getDynamicVGPRBlockSize();
450
451 auto [MinOcc, MaxOcc] = getOccupancyWithWorkGroupSizes(LDSSize, F);
452 unsigned SGPROcc = getOccupancyWithNumSGPRs(NumSGPRs);
453 unsigned VGPROcc = getOccupancyWithNumVGPRs(NumVGPRs, DynamicVGPRBlockSize);
454
455 // Maximum occupancy may be further limited by high SGPR/VGPR usage.
456 MaxOcc = std::min(MaxOcc, std::min(SGPROcc, VGPROcc));
457 return {std::min(MinOcc, MaxOcc), MaxOcc};
458}
459
461 const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
462 unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
463 // Compute maximum number of SGPRs function can use using default/requested
464 // minimum number of waves per execution unit.
465 unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
466 unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
467
468 // Check if maximum number of SGPRs was explicitly requested using
469 // "amdgpu-num-sgpr" attribute.
470 unsigned Requested =
471 F.getFnAttributeAsParsedInteger("amdgpu-num-sgpr", MaxNumSGPRs);
472
473 if (Requested != MaxNumSGPRs) {
474 // Make sure requested value does not violate subtarget's specifications.
475 if (Requested && (Requested <= ReservedNumSGPRs))
476 Requested = 0;
477
478 // If more SGPRs are required to support the input user/system SGPRs,
479 // increase to accommodate them.
480 //
481 // FIXME: This really ends up using the requested number of SGPRs + number
482 // of reserved special registers in total. Theoretically you could re-use
483 // the last input registers for these special registers, but this would
484 // require a lot of complexity to deal with the weird aliasing.
485 unsigned InputNumSGPRs = PreloadedSGPRs;
486 if (Requested && Requested < InputNumSGPRs)
487 Requested = InputNumSGPRs;
488
489 // Make sure requested value is compatible with values implied by
490 // default/requested minimum/maximum number of waves per execution unit.
491 if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
492 Requested = 0;
493 if (WavesPerEU.second && Requested &&
494 Requested < getMinNumSGPRs(WavesPerEU.second))
495 Requested = 0;
496
497 if (Requested)
498 MaxNumSGPRs = Requested;
499 }
500
501 if (hasSGPRInitBug())
503
504 return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
505}
506
508 const Function &F = MF.getFunction();
512}
513
515 using USI = GCNUserSGPRUsageInfo;
516 // Max number of user SGPRs
517 const unsigned MaxUserSGPRs =
518 USI::getNumUserSGPRForField(USI::PrivateSegmentBufferID) +
519 USI::getNumUserSGPRForField(USI::DispatchPtrID) +
520 USI::getNumUserSGPRForField(USI::QueuePtrID) +
521 USI::getNumUserSGPRForField(USI::KernargSegmentPtrID) +
522 USI::getNumUserSGPRForField(USI::DispatchIdID) +
523 USI::getNumUserSGPRForField(USI::FlatScratchInitID) +
524 USI::getNumUserSGPRForField(USI::ImplicitBufferPtrID);
525
526 // Max number of system SGPRs
527 const unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
528 1 + // WorkGroupIDY
529 1 + // WorkGroupIDZ
530 1 + // WorkGroupInfo
531 1; // private segment wave byte offset
532
533 // Max number of synthetic SGPRs
534 const unsigned SyntheticSGPRs = 1; // LDSKernelId
535
536 return MaxUserSGPRs + MaxSystemSGPRs + SyntheticSGPRs;
537}
538
543
545 const Function &F, std::pair<unsigned, unsigned> NumVGPRBounds) const {
546 const auto [Min, Max] = NumVGPRBounds;
547
548 // Check if maximum number of VGPRs was explicitly requested using
549 // "amdgpu-num-vgpr" attribute.
550
551 unsigned Requested = F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", Max);
552 if (Requested != Max && hasGFX90AInsts())
553 Requested *= 2;
554
555 // Make sure requested value is inside the range of possible VGPR usage.
556 return std::clamp(Requested, Min, Max);
557}
558
560 // Temporarily check both the attribute and the subtarget feature, until the
561 // latter is removed.
562 unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
563 if (DynamicVGPRBlockSize == 0 && isDynamicVGPREnabled())
564 DynamicVGPRBlockSize = getDynamicVGPRBlockSize();
565
566 std::pair<unsigned, unsigned> Waves = getWavesPerEU(F);
567 return getBaseMaxNumVGPRs(
568 F, {getMinNumVGPRs(Waves.second, DynamicVGPRBlockSize),
569 getMaxNumVGPRs(Waves.first, DynamicVGPRBlockSize)});
570}
571
573 return getMaxNumVGPRs(MF.getFunction());
574}
575
576std::pair<unsigned, unsigned>
578 const unsigned MaxVectorRegs = getMaxNumVGPRs(F);
579
580 unsigned MaxNumVGPRs = MaxVectorRegs;
581 unsigned MaxNumAGPRs = 0;
582 unsigned NumArchVGPRs = getAddressableNumArchVGPRs();
583
584 // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically,
585 // a wave may have up to 512 total vector registers combining together both
586 // VGPRs and AGPRs. Hence, in an entry function without calls and without
587 // AGPRs used within it, it is possible to use the whole vector register
588 // budget for VGPRs.
589 //
590 // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and split
591 // register file accordingly.
592 if (hasGFX90AInsts()) {
593 unsigned MinNumAGPRs = 0;
594 const unsigned TotalNumAGPRs = AMDGPU::AGPR_32RegClass.getNumRegs();
595
596 const std::pair<unsigned, unsigned> DefaultNumAGPR = {~0u, ~0u};
597
598 // TODO: The lower bound should probably force the number of required
599 // registers up, overriding amdgpu-waves-per-eu.
600 std::tie(MinNumAGPRs, MaxNumAGPRs) =
601 AMDGPU::getIntegerPairAttribute(F, "amdgpu-agpr-alloc", DefaultNumAGPR,
602 /*OnlyFirstRequired=*/true);
603
604 if (MinNumAGPRs == DefaultNumAGPR.first) {
605 // Default to splitting half the registers if AGPRs are required.
606 MinNumAGPRs = MaxNumAGPRs = MaxVectorRegs / 2;
607 } else {
608 // Align to accum_offset's allocation granularity.
609 MinNumAGPRs = alignTo(MinNumAGPRs, 4);
610
611 MinNumAGPRs = std::min(MinNumAGPRs, TotalNumAGPRs);
612 }
613
614 // Clamp values to be inbounds of our limits, and ensure min <= max.
615
616 MaxNumAGPRs = std::min(std::max(MinNumAGPRs, MaxNumAGPRs), MaxVectorRegs);
617 MinNumAGPRs = std::min(std::min(MinNumAGPRs, TotalNumAGPRs), MaxNumAGPRs);
618
619 MaxNumVGPRs = std::min(MaxVectorRegs - MinNumAGPRs, NumArchVGPRs);
620 MaxNumAGPRs = std::min(MaxVectorRegs - MaxNumVGPRs, MaxNumAGPRs);
621
622 assert(MaxNumVGPRs + MaxNumAGPRs <= MaxVectorRegs &&
623 MaxNumAGPRs <= TotalNumAGPRs && MaxNumVGPRs <= NumArchVGPRs &&
624 "invalid register counts");
625 } else if (hasMAIInsts()) {
626 // On gfx908 the number of AGPRs always equals the number of VGPRs.
627 MaxNumAGPRs = MaxNumVGPRs = MaxVectorRegs;
628 }
629
630 return std::pair(MaxNumVGPRs, MaxNumAGPRs);
631}
632
634 SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep,
635 const TargetSchedModel *SchedModel) const {
636 if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() || !Def->isInstr() ||
637 !Use->isInstr())
638 return;
639
640 MachineInstr *DefI = Def->getInstr();
641 MachineInstr *UseI = Use->getInstr();
642
643 if (DefI->isBundle()) {
645 auto Reg = Dep.getReg();
648 unsigned Lat = 0;
649 for (++I; I != E && I->isBundledWithPred(); ++I) {
650 if (I->isMetaInstruction())
651 continue;
652 if (I->modifiesRegister(Reg, TRI))
653 Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
654 else if (Lat)
655 --Lat;
656 }
657 Dep.setLatency(Lat);
658 } else if (UseI->isBundle()) {
660 auto Reg = Dep.getReg();
663 unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
664 for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
665 if (I->isMetaInstruction())
666 continue;
667 if (I->readsRegister(Reg, TRI))
668 break;
669 --Lat;
670 }
671 Dep.setLatency(Lat);
672 } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
673 // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
674 // implicit operands which come from the MCInstrDesc, which can fool
675 // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
676 // pseudo operands.
677 Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
678 DefI, DefOpIdx, UseI, UseOpIdx));
679 }
680}
681
684 return 0; // Not MIMG encoding.
685
686 if (NSAThreshold.getNumOccurrences() > 0)
687 return std::max(NSAThreshold.getValue(), 2u);
688
690 "amdgpu-nsa-threshold", -1);
691 if (Value > 0)
692 return std::max(Value, 2);
693
694 return NSAThreshold;
695}
696
698 const GCNSubtarget &ST)
699 : ST(ST) {
700 const CallingConv::ID CC = F.getCallingConv();
701 const bool IsKernel =
703
704 if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0))
705 KernargSegmentPtr = true;
706
707 bool IsAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
708 if (IsAmdHsaOrMesa && !ST.hasFlatScratchEnabled())
709 PrivateSegmentBuffer = true;
710 else if (ST.isMesaGfxShader(F))
711 ImplicitBufferPtr = true;
712
713 if (!AMDGPU::isGraphics(CC)) {
714 if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr"))
715 DispatchPtr = true;
716
717 // FIXME: Can this always be disabled with < COv5?
718 if (!F.hasFnAttribute("amdgpu-no-queue-ptr"))
719 QueuePtr = true;
720
721 if (!F.hasFnAttribute("amdgpu-no-dispatch-id"))
722 DispatchID = true;
723 }
724
725 if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) &&
726 (IsAmdHsaOrMesa || ST.hasFlatScratchEnabled()) &&
727 // FlatScratchInit cannot be true for graphics CC if
728 // hasFlatScratchEnabled() is false.
729 (ST.hasFlatScratchEnabled() ||
730 (!AMDGPU::isGraphics(CC) &&
731 !F.hasFnAttribute("amdgpu-no-flat-scratch-init"))) &&
732 !ST.hasArchitectedFlatScratch()) {
733 FlatScratchInit = true;
734 }
735
737 NumUsedUserSGPRs += getNumUserSGPRForField(ImplicitBufferPtrID);
738
741
742 if (hasDispatchPtr())
743 NumUsedUserSGPRs += getNumUserSGPRForField(DispatchPtrID);
744
745 if (hasQueuePtr())
746 NumUsedUserSGPRs += getNumUserSGPRForField(QueuePtrID);
747
749 NumUsedUserSGPRs += getNumUserSGPRForField(KernargSegmentPtrID);
750
751 if (hasDispatchID())
752 NumUsedUserSGPRs += getNumUserSGPRForField(DispatchIdID);
753
754 if (hasFlatScratchInit())
755 NumUsedUserSGPRs += getNumUserSGPRForField(FlatScratchInitID);
756
758 NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentSizeID);
759}
760
762 assert(NumKernargPreloadSGPRs + NumSGPRs <= AMDGPU::getMaxNumUserSGPRs(ST));
763 NumKernargPreloadSGPRs += NumSGPRs;
764 NumUsedUserSGPRs += NumSGPRs;
765}
766
768 return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs;
769}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static cl::opt< bool > UseAA("aarch64-use-aa", cl::init(true), cl::desc("Enable the use of AA during codegen."))
This file describes how to lower LLVM calls to machine code calls.
This file declares the targeting of the InstructionSelector class for AMDGPU.
This file declares the targeting of the Machinelegalizer class for AMDGPU.
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock & MBB
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static cl::opt< unsigned > NSAThreshold("amdgpu-nsa-threshold", cl::desc("Number of addresses from which to enable MIMG NSA."), cl::init(2), cl::Hidden)
static cl::opt< bool > EnableVGPRIndexMode("amdgpu-vgpr-index-mode", cl::desc("Use GPR indexing mode instead of movrel for vector indexing"), cl::init(false))
static cl::opt< bool > UseAA("amdgpu-use-aa-in-codegen", cl::desc("Enable the use of AA during codegen."), cl::init(true))
AMD GCN specific subclass of TargetSubtarget.
IRTranslator LLVM IR MI
This file describes how to lower LLVM inline asm to machine code INLINEASM.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Register const TargetRegisterInfo * TRI
if(PassOpts->AAPipeline)
This file defines the SmallString class.
#define LLVM_DEBUG(...)
Definition Debug.h:114
std::pair< unsigned, unsigned > getWavesPerEU(const Function &F) const
std::pair< unsigned, unsigned > getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const
Subtarget's minimum/maximum occupancy, in number of waves per EU, that can be achieved when the only ...
unsigned getWavefrontSizeLog2() const
AMDGPUSubtarget(const Triple &TT)
unsigned AddressableLocalMemorySize
Functions, function parameters, and return types can have attributes to indicate how they should be t...
Definition Attributes.h:105
LLVM_ABI StringRef getValueAsString() const
Return the attribute's value as a string.
bool isValid() const
Return true if the attribute is any kind of attribute.
Definition Attributes.h:261
Diagnostic information for optimization failures.
Diagnostic information for unsupported feature in backend.
uint64_t getFnAttributeAsParsedInteger(StringRef Kind, uint64_t Default=0) const
For a string attribute Kind, parse attribute as an integer.
Definition Function.cpp:776
bool hasFlat() const
InstrItineraryData InstrItins
bool useVGPRIndexMode() const
void mirFileLoaded(MachineFunction &MF) const override
unsigned MaxPrivateElementSize
unsigned getAddressableNumArchVGPRs() const
unsigned getMinNumSGPRs(unsigned WavesPerEU) const
void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS)
unsigned getConstantBusLimit(unsigned Opcode) const
const InstrItineraryData * getInstrItineraryData() const override
void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep, const TargetSchedModel *SchedModel) const override
void overridePostRASchedPolicy(MachineSchedPolicy &Policy, const SchedRegion &Region) const override
Align getStackAlignment() const
bool hasMadF16() const
unsigned getMinNumVGPRs(unsigned WavesPerEU, unsigned DynamicVGPRBlockSize) const
bool isDynamicVGPREnabled() const
const SIRegisterInfo * getRegisterInfo() const override
unsigned getBaseMaxNumVGPRs(const Function &F, std::pair< unsigned, unsigned > NumVGPRBounds) const
bool zeroesHigh16BitsOfDest(unsigned Opcode) const
Returns if the result of this instruction with a 16-bit result returned in a 32-bit register implicit...
unsigned getBaseMaxNumSGPRs(const Function &F, std::pair< unsigned, unsigned > WavesPerEU, unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const
unsigned getMaxNumPreloadedSGPRs() const
GCNSubtarget & initializeSubtargetDependencies(const Triple &TT, StringRef GPU, StringRef FS)
void overrideSchedPolicy(MachineSchedPolicy &Policy, const SchedRegion &Region) const override
std::pair< unsigned, unsigned > computeOccupancy(const Function &F, unsigned LDSSize=0, unsigned NumSGPRs=0, unsigned NumVGPRs=0) const
Subtarget's minimum/maximum occupancy, in number of waves per EU, that can be achieved when the only ...
unsigned getMaxNumVGPRs(unsigned WavesPerEU, unsigned DynamicVGPRBlockSize) const
const SITargetLowering * getTargetLowering() const override
unsigned getNSAThreshold(const MachineFunction &MF) const
unsigned getReservedNumSGPRs(const MachineFunction &MF) const
bool useAA() const override
bool isWave32() const
unsigned getOccupancyWithNumVGPRs(unsigned VGPRs, unsigned DynamicVGPRBlockSize) const
Return the maximum number of waves per SIMD for kernels using VGPRs VGPRs.
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const
Return the maximum number of waves per SIMD for kernels using SGPRs SGPRs.
unsigned getMaxWavesPerEU() const
Generation getGeneration() const
GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, const GCNTargetMachine &TM)
unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const
std::pair< unsigned, unsigned > getMaxNumVectorRegs(const Function &F) const
Return a pair of maximum numbers of VGPRs and AGPRs that meet the number of waves per execution unit ...
bool isXNACKEnabled() const
unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const
bool hasAddr64() const
unsigned getDynamicVGPRBlockSize() const
void checkSubtargetFeatures(const Function &F) const
Diagnose inconsistent subtarget features before attempting to codegen function F.
~GCNSubtarget() override
const SelectionDAGTargetInfo * getSelectionDAGInfo() const override
AMDGPU::IsaInfo::AMDGPUTargetID TargetID
static unsigned getNumUserSGPRForField(UserSGPRID ID)
void allocKernargPreloadSGPRs(unsigned NumSGPRs)
bool hasPrivateSegmentBuffer() const
GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST)
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
Instructions::const_iterator const_instr_iterator
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Representation of each machine instruction.
const MachineBasicBlock * getParent() const
bool isBundle() const
Scheduling dependency.
Definition ScheduleDAG.h:51
Kind getKind() const
Returns an enum value representing the kind of the dependence.
@ Data
Regular data dependence (aka true-dependence).
Definition ScheduleDAG.h:55
void setLatency(unsigned Lat)
Sets the latency for this edge.
unsigned getLatency() const
Returns the latency value for this edge, which roughly means the minimum number of cycles that must e...
Register getReg() const
Returns the register associated with this edge.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
std::pair< unsigned, unsigned > getWavesPerEU() const
GCNUserSGPRUsageInfo & getUserSGPRInfo()
Scheduling unit. This is a node in the scheduling DAG.
Targets can subclass this to parameterize the SelectionDAG lowering and instruction selection process...
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
Information about stack frame layout on the target.
Provide an instruction scheduling machine model to CodeGen passes.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM Value Representation.
Definition Value.h:75
self_iterator getIterator()
Definition ilist_node.h:123
unsigned getNumWavesPerEUWithNumVGPRs(const MCSubtargetInfo *STI, unsigned NumVGPRs, unsigned DynamicVGPRBlockSize)
unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI)
unsigned getLocalMemorySize(const MCSubtargetInfo *STI)
unsigned getEUsPerCU(const MCSubtargetInfo *STI)
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs, unsigned MaxWaves, AMDGPUSubtarget::Generation Gen)
unsigned getMaxNumUserSGPRs(const MCSubtargetInfo &STI)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
unsigned getDynamicVGPRBlockSize(const Function &F)
std::pair< unsigned, unsigned > getIntegerPairAttribute(const Function &F, StringRef Name, std::pair< unsigned, unsigned > Default, bool OnlyFirstRequired)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ SPIR_KERNEL
Used for SPIR kernel functions.
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
Define a generic scheduling policy for targets that don't provide their own MachineSchedStrategy.
bool ShouldTrackLaneMasks
Track LaneMasks to allow reordering of independent subregister writes of the same vreg.
A region of an MBB for scheduling.