71#define DEBUG_TYPE "si-load-store-opt"
79 S_BUFFER_LOAD_SGPR_IMM,
98 unsigned char NumVAddrs = 0;
101 bool SOffset =
false;
109const unsigned MaxAddressRegs = 12 + 1 + 1;
111class SILoadStoreOptimizer {
120 InstClassEnum InstClass;
124 int AddrIdx[MaxAddressRegs];
126 unsigned NumAddresses;
129 bool hasSameBaseAddress(
const CombineInfo &CI) {
130 if (NumAddresses != CI.NumAddresses)
134 for (
unsigned i = 0; i < NumAddresses; i++) {
137 if (AddrReg[i]->isImm() || AddrRegNext.
isImm()) {
138 if (AddrReg[i]->isImm() != AddrRegNext.
isImm() ||
156 for (
unsigned i = 0; i < NumAddresses; ++i) {
165 if (!AddrOp->
isReg())
171 AddrOp->
getReg() != AMDGPU::SGPR_NULL)
190 struct BaseRegisters {
194 unsigned LoSubReg = 0;
195 unsigned HiSubReg = 0;
197 bool UseV64Pattern =
false;
219 static bool dmasksCanBeCombined(
const CombineInfo &CI,
221 const CombineInfo &Paired);
222 static bool offsetsCanBeCombined(CombineInfo &CI,
const GCNSubtarget &STI,
223 CombineInfo &Paired,
bool Modify =
false);
224 static bool widthsFit(
const GCNSubtarget &STI,
const CombineInfo &CI,
225 const CombineInfo &Paired);
226 unsigned getNewOpcode(
const CombineInfo &CI,
const CombineInfo &Paired);
227 static std::pair<unsigned, unsigned> getSubRegIdxs(
const CombineInfo &CI,
228 const CombineInfo &Paired);
230 getTargetRegisterClass(
const CombineInfo &CI,
231 const CombineInfo &Paired)
const;
234 CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);
236 void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired,
240 Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
244 unsigned read2Opcode(
unsigned EltSize)
const;
245 unsigned read2ST64Opcode(
unsigned EltSize)
const;
247 mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
250 unsigned write2Opcode(
unsigned EltSize)
const;
251 unsigned write2ST64Opcode(
unsigned EltSize)
const;
252 unsigned getWrite2Opcode(
const CombineInfo &CI)
const;
255 mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
258 mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
261 mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
264 mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
267 mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
270 mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
273 mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
276 mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
279 mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
283 int32_t NewOffset)
const;
284 void updateAsyncLDSAddress(
MachineInstr &
MI, int32_t OffsetDiff)
const;
289 MemAddress &Addr)
const;
298 std::list<std::list<CombineInfo> > &MergeableInsts)
const;
303 std::list<std::list<CombineInfo>> &MergeableInsts)
const;
306 const CombineInfo &Paired);
308 static InstClassEnum getCommonInstClass(
const CombineInfo &CI,
309 const CombineInfo &Paired);
311 bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
312 bool &OptimizeListAgain);
313 bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
328 StringRef getPassName()
const override {
return "SI Load Store Optimizer"; }
343 const unsigned Opc =
MI.getOpcode();
349 if (
TII.isImage(
MI)) {
351 TII.getNamedOperand(
MI, AMDGPU::OpName::dmask)->getImm();
359 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
360 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
361 case AMDGPU::S_LOAD_DWORD_IMM:
362 case AMDGPU::GLOBAL_LOAD_DWORD:
363 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
364 case AMDGPU::GLOBAL_STORE_DWORD:
365 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
366 case AMDGPU::FLAT_LOAD_DWORD:
367 case AMDGPU::FLAT_STORE_DWORD:
368 case AMDGPU::FLAT_LOAD_DWORD_SADDR:
369 case AMDGPU::FLAT_STORE_DWORD_SADDR:
371 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
372 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
373 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
374 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
375 case AMDGPU::S_LOAD_DWORDX2_IMM:
376 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
377 case AMDGPU::GLOBAL_LOAD_DWORDX2:
378 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
379 case AMDGPU::GLOBAL_STORE_DWORDX2:
380 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
381 case AMDGPU::FLAT_LOAD_DWORDX2:
382 case AMDGPU::FLAT_STORE_DWORDX2:
383 case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:
384 case AMDGPU::FLAT_STORE_DWORDX2_SADDR:
386 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
387 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
388 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
389 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
390 case AMDGPU::S_LOAD_DWORDX3_IMM:
391 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
392 case AMDGPU::GLOBAL_LOAD_DWORDX3:
393 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
394 case AMDGPU::GLOBAL_STORE_DWORDX3:
395 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
396 case AMDGPU::FLAT_LOAD_DWORDX3:
397 case AMDGPU::FLAT_STORE_DWORDX3:
398 case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:
399 case AMDGPU::FLAT_STORE_DWORDX3_SADDR:
401 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
402 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
403 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
404 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
405 case AMDGPU::S_LOAD_DWORDX4_IMM:
406 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
407 case AMDGPU::GLOBAL_LOAD_DWORDX4:
408 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
409 case AMDGPU::GLOBAL_STORE_DWORDX4:
410 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
411 case AMDGPU::FLAT_LOAD_DWORDX4:
412 case AMDGPU::FLAT_STORE_DWORDX4:
413 case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:
414 case AMDGPU::FLAT_STORE_DWORDX4_SADDR:
416 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
417 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
418 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
419 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
420 case AMDGPU::S_LOAD_DWORDX8_IMM:
421 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
423 case AMDGPU::DS_READ_B32:
424 case AMDGPU::DS_READ_B32_gfx9:
425 case AMDGPU::DS_WRITE_B32:
426 case AMDGPU::DS_WRITE_B32_gfx9:
428 case AMDGPU::DS_READ_B64:
429 case AMDGPU::DS_READ_B64_gfx9:
430 case AMDGPU::DS_WRITE_B64:
431 case AMDGPU::DS_WRITE_B64_gfx9:
446 case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN:
447 case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN_exact:
448 case AMDGPU::BUFFER_LOAD_DWORD_IDXEN:
449 case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact:
450 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
451 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
452 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
453 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
454 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:
455 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact:
456 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN:
457 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact:
458 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN:
459 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact:
460 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET:
461 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact:
463 case AMDGPU::BUFFER_STORE_DWORD_BOTHEN:
464 case AMDGPU::BUFFER_STORE_DWORD_BOTHEN_exact:
465 case AMDGPU::BUFFER_STORE_DWORD_IDXEN:
466 case AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact:
467 case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
468 case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
469 case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
470 case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
471 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN:
472 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact:
473 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN:
474 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN_exact:
475 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN:
476 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact:
477 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET:
478 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact:
491 if (
TII.get(
Opc).mayStore() || !
TII.get(
Opc).mayLoad() ||
500 case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
501 case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
502 case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
503 case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
504 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
505 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
506 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
507 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
508 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN:
509 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact:
510 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN:
511 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact:
512 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:
513 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact:
514 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET:
515 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact:
517 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
518 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
519 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
520 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
521 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN:
522 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact:
523 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET:
524 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact:
525 return TBUFFER_STORE;
529 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
530 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
531 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
532 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
533 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
534 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
535 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
536 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
537 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
538 return S_BUFFER_LOAD_IMM;
539 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
540 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
541 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
542 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
543 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
544 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
545 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
546 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
547 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
548 return S_BUFFER_LOAD_SGPR_IMM;
549 case AMDGPU::S_LOAD_DWORD_IMM:
550 case AMDGPU::S_LOAD_DWORDX2_IMM:
551 case AMDGPU::S_LOAD_DWORDX3_IMM:
552 case AMDGPU::S_LOAD_DWORDX4_IMM:
553 case AMDGPU::S_LOAD_DWORDX8_IMM:
554 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
555 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
556 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
557 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
559 case AMDGPU::DS_READ_B32:
560 case AMDGPU::DS_READ_B32_gfx9:
561 case AMDGPU::DS_READ_B64:
562 case AMDGPU::DS_READ_B64_gfx9:
564 case AMDGPU::DS_WRITE_B32:
565 case AMDGPU::DS_WRITE_B32_gfx9:
566 case AMDGPU::DS_WRITE_B64:
567 case AMDGPU::DS_WRITE_B64_gfx9:
569 case AMDGPU::GLOBAL_LOAD_DWORD:
570 case AMDGPU::GLOBAL_LOAD_DWORDX2:
571 case AMDGPU::GLOBAL_LOAD_DWORDX3:
572 case AMDGPU::GLOBAL_LOAD_DWORDX4:
573 case AMDGPU::FLAT_LOAD_DWORD:
574 case AMDGPU::FLAT_LOAD_DWORDX2:
575 case AMDGPU::FLAT_LOAD_DWORDX3:
576 case AMDGPU::FLAT_LOAD_DWORDX4:
578 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
579 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
580 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
581 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
582 return GLOBAL_LOAD_SADDR;
583 case AMDGPU::GLOBAL_STORE_DWORD:
584 case AMDGPU::GLOBAL_STORE_DWORDX2:
585 case AMDGPU::GLOBAL_STORE_DWORDX3:
586 case AMDGPU::GLOBAL_STORE_DWORDX4:
587 case AMDGPU::FLAT_STORE_DWORD:
588 case AMDGPU::FLAT_STORE_DWORDX2:
589 case AMDGPU::FLAT_STORE_DWORDX3:
590 case AMDGPU::FLAT_STORE_DWORDX4:
592 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
593 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
594 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
595 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
596 return GLOBAL_STORE_SADDR;
597 case AMDGPU::FLAT_LOAD_DWORD_SADDR:
598 case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:
599 case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:
600 case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:
601 return FLAT_LOAD_SADDR;
602 case AMDGPU::FLAT_STORE_DWORD_SADDR:
603 case AMDGPU::FLAT_STORE_DWORDX2_SADDR:
604 case AMDGPU::FLAT_STORE_DWORDX3_SADDR:
605 case AMDGPU::FLAT_STORE_DWORDX4_SADDR:
606 return FLAT_STORE_SADDR;
621 return Info->BaseOpcode;
626 case AMDGPU::DS_READ_B32:
627 case AMDGPU::DS_READ_B32_gfx9:
628 case AMDGPU::DS_READ_B64:
629 case AMDGPU::DS_READ_B64_gfx9:
630 case AMDGPU::DS_WRITE_B32:
631 case AMDGPU::DS_WRITE_B32_gfx9:
632 case AMDGPU::DS_WRITE_B64:
633 case AMDGPU::DS_WRITE_B64_gfx9:
635 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
636 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
637 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
638 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
639 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
640 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
641 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
642 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
643 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
644 return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
645 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
646 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
647 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
648 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
649 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
650 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
651 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
652 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
653 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
654 return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
655 case AMDGPU::S_LOAD_DWORD_IMM:
656 case AMDGPU::S_LOAD_DWORDX2_IMM:
657 case AMDGPU::S_LOAD_DWORDX3_IMM:
658 case AMDGPU::S_LOAD_DWORDX4_IMM:
659 case AMDGPU::S_LOAD_DWORDX8_IMM:
660 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
661 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
662 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
663 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
664 return AMDGPU::S_LOAD_DWORD_IMM;
665 case AMDGPU::GLOBAL_LOAD_DWORD:
666 case AMDGPU::GLOBAL_LOAD_DWORDX2:
667 case AMDGPU::GLOBAL_LOAD_DWORDX3:
668 case AMDGPU::GLOBAL_LOAD_DWORDX4:
669 case AMDGPU::FLAT_LOAD_DWORD:
670 case AMDGPU::FLAT_LOAD_DWORDX2:
671 case AMDGPU::FLAT_LOAD_DWORDX3:
672 case AMDGPU::FLAT_LOAD_DWORDX4:
673 return AMDGPU::FLAT_LOAD_DWORD;
674 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
675 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
676 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
677 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
678 return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
679 case AMDGPU::GLOBAL_STORE_DWORD:
680 case AMDGPU::GLOBAL_STORE_DWORDX2:
681 case AMDGPU::GLOBAL_STORE_DWORDX3:
682 case AMDGPU::GLOBAL_STORE_DWORDX4:
683 case AMDGPU::FLAT_STORE_DWORD:
684 case AMDGPU::FLAT_STORE_DWORDX2:
685 case AMDGPU::FLAT_STORE_DWORDX3:
686 case AMDGPU::FLAT_STORE_DWORDX4:
687 return AMDGPU::FLAT_STORE_DWORD;
688 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
689 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
690 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
691 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
692 return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
693 case AMDGPU::FLAT_LOAD_DWORD_SADDR:
694 case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:
695 case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:
696 case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:
697 return AMDGPU::FLAT_LOAD_DWORD_SADDR;
698 case AMDGPU::FLAT_STORE_DWORD_SADDR:
699 case AMDGPU::FLAT_STORE_DWORDX2_SADDR:
700 case AMDGPU::FLAT_STORE_DWORDX3_SADDR:
701 case AMDGPU::FLAT_STORE_DWORDX4_SADDR:
702 return AMDGPU::FLAT_STORE_DWORD_SADDR;
713SILoadStoreOptimizer::getCommonInstClass(
const CombineInfo &CI,
714 const CombineInfo &Paired) {
715 assert(CI.InstClass == Paired.InstClass);
717 if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
719 return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;
733 Result.SOffset =
true;
739 int VAddr0Idx = AMDGPU::getNamedOperandIdx(
Opc, AMDGPU::OpName::vaddr0);
740 if (VAddr0Idx >= 0) {
741 AMDGPU::OpName RsrcName =
742 TII.isMIMG(
Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
743 int RsrcIdx = AMDGPU::getNamedOperandIdx(
Opc, RsrcName);
744 Result.NumVAddrs = RsrcIdx - VAddr0Idx;
761 Result.SOffset =
true;
769 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
770 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
771 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
772 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
773 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
774 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
775 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
776 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
777 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
778 Result.SOffset =
true;
780 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
781 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
782 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
783 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
784 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
785 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
786 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
787 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
788 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
789 case AMDGPU::S_LOAD_DWORD_IMM:
790 case AMDGPU::S_LOAD_DWORDX2_IMM:
791 case AMDGPU::S_LOAD_DWORDX3_IMM:
792 case AMDGPU::S_LOAD_DWORDX4_IMM:
793 case AMDGPU::S_LOAD_DWORDX8_IMM:
794 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
795 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
796 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
797 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
800 case AMDGPU::DS_READ_B32:
801 case AMDGPU::DS_READ_B64:
802 case AMDGPU::DS_READ_B32_gfx9:
803 case AMDGPU::DS_READ_B64_gfx9:
804 case AMDGPU::DS_WRITE_B32:
805 case AMDGPU::DS_WRITE_B64:
806 case AMDGPU::DS_WRITE_B32_gfx9:
807 case AMDGPU::DS_WRITE_B64_gfx9:
810 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
811 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
812 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
813 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
814 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
815 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
816 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
817 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
818 case AMDGPU::FLAT_LOAD_DWORD_SADDR:
819 case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:
820 case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:
821 case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:
822 case AMDGPU::FLAT_STORE_DWORD_SADDR:
823 case AMDGPU::FLAT_STORE_DWORDX2_SADDR:
824 case AMDGPU::FLAT_STORE_DWORDX3_SADDR:
825 case AMDGPU::FLAT_STORE_DWORDX4_SADDR:
828 case AMDGPU::GLOBAL_LOAD_DWORD:
829 case AMDGPU::GLOBAL_LOAD_DWORDX2:
830 case AMDGPU::GLOBAL_LOAD_DWORDX3:
831 case AMDGPU::GLOBAL_LOAD_DWORDX4:
832 case AMDGPU::GLOBAL_STORE_DWORD:
833 case AMDGPU::GLOBAL_STORE_DWORDX2:
834 case AMDGPU::GLOBAL_STORE_DWORDX3:
835 case AMDGPU::GLOBAL_STORE_DWORDX4:
836 case AMDGPU::FLAT_LOAD_DWORD:
837 case AMDGPU::FLAT_LOAD_DWORDX2:
838 case AMDGPU::FLAT_LOAD_DWORDX3:
839 case AMDGPU::FLAT_LOAD_DWORDX4:
840 case AMDGPU::FLAT_STORE_DWORD:
841 case AMDGPU::FLAT_STORE_DWORDX2:
842 case AMDGPU::FLAT_STORE_DWORDX3:
843 case AMDGPU::FLAT_STORE_DWORDX4:
850 const SILoadStoreOptimizer &LSO) {
852 unsigned Opc =
MI->getOpcode();
853 InstClass = getInstClass(
Opc, *LSO.TII);
855 if (InstClass == UNKNOWN)
858 DataRC = LSO.getDataRegClass(*
MI);
863 (
Opc == AMDGPU::DS_READ_B64 ||
Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
868 (
Opc == AMDGPU::DS_WRITE_B64 ||
Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
871 case S_BUFFER_LOAD_IMM:
872 case S_BUFFER_LOAD_SGPR_IMM:
881 if (InstClass == MIMG) {
886 int OffsetIdx = AMDGPU::getNamedOperandIdx(
Opc, AMDGPU::OpName::offset);
887 Offset =
I->getOperand(OffsetIdx).getImm();
890 if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) {
894 EltSize = Info->BitsPerComp / 8;
897 Width = getOpcodeWidth(*
I, *LSO.TII);
899 if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
901 }
else if (InstClass != MIMG) {
905 AddressRegs Regs = getRegs(
Opc, *LSO.TII);
909 for (
unsigned J = 0; J < Regs.NumVAddrs; J++)
910 AddrIdx[NumAddresses++] =
911 AMDGPU::getNamedOperandIdx(
Opc, AMDGPU::OpName::vaddr0) + J;
913 AddrIdx[NumAddresses++] =
914 AMDGPU::getNamedOperandIdx(
Opc, AMDGPU::OpName::addr);
916 AddrIdx[NumAddresses++] =
917 AMDGPU::getNamedOperandIdx(
Opc, AMDGPU::OpName::sbase);
919 AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
920 Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc);
922 AddrIdx[NumAddresses++] =
923 AMDGPU::getNamedOperandIdx(
Opc, AMDGPU::OpName::soffset);
925 AddrIdx[NumAddresses++] =
926 AMDGPU::getNamedOperandIdx(
Opc, AMDGPU::OpName::saddr);
928 AddrIdx[NumAddresses++] =
929 AMDGPU::getNamedOperandIdx(
Opc, AMDGPU::OpName::vaddr);
931 AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
932 Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp);
933 assert(NumAddresses <= MaxAddressRegs);
935 for (
unsigned J = 0; J < NumAddresses; J++)
936 AddrReg[J] = &
I->getOperand(AddrIdx[J]);
942 "SI Load Store Optimizer",
false,
false)
947char SILoadStoreOptimizerLegacy::
ID = 0;
952 return new SILoadStoreOptimizerLegacy();
958 for (
const auto &
Op :
MI.operands()) {
968bool SILoadStoreOptimizer::canSwapInstructions(
969 const DenseSet<Register> &ARegDefs,
const DenseSet<Register> &ARegUses,
970 const MachineInstr &
A,
const MachineInstr &
B)
const {
971 if (
A.mayLoadOrStore() &&
B.mayLoadOrStore() &&
972 (
A.mayStore() ||
B.mayStore()) &&
A.mayAlias(AA,
B,
true))
974 for (
const auto &BOp :
B.operands()) {
977 if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.
contains(BOp.getReg()))
979 if (BOp.isDef() && ARegUses.
contains(BOp.getReg()))
988SILoadStoreOptimizer::combineKnownAdjacentMMOs(
const CombineInfo &CI,
989 const CombineInfo &Paired) {
990 const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
991 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1005 MachineFunction *MF = CI.I->getMF();
1009bool SILoadStoreOptimizer::dmasksCanBeCombined(
const CombineInfo &CI,
1010 const SIInstrInfo &
TII,
1011 const CombineInfo &Paired) {
1012 assert(CI.InstClass == MIMG);
1015 const auto *TFEOp =
TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
1016 const auto *LWEOp =
TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
1018 if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
1022 AMDGPU::OpName OperandsToMatch[] = {
1023 AMDGPU::OpName::cpol, AMDGPU::OpName::d16, AMDGPU::OpName::unorm,
1024 AMDGPU::OpName::da, AMDGPU::OpName::r128, AMDGPU::OpName::a16};
1026 for (AMDGPU::OpName
op : OperandsToMatch) {
1027 int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(),
op);
1028 if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(),
op) != Idx)
1031 CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
1036 unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
1037 unsigned MinMask = std::min(CI.DMask, Paired.DMask);
1043 if ((1u << AllowedBitsForMin) <= MinMask)
1050 unsigned ComponentCount,
1052 if (ComponentCount > 4)
1071 return NewFormatInfo->
Format;
1084bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
1085 const GCNSubtarget &STI,
1086 CombineInfo &Paired,
1088 assert(CI.InstClass != MIMG);
1092 if (CI.Offset == Paired.Offset)
1096 if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
1099 if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
1101 const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
1103 const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
1115 unsigned NumCombinedComponents = CI.Width + Paired.Width;
1116 if (NumCombinedComponents == 3 && CI.EltSize <= 2)
1117 NumCombinedComponents = 4;
1125 unsigned ElemIndex0 = CI.Offset / CI.EltSize;
1126 unsigned ElemIndex1 = Paired.Offset / Paired.EltSize;
1127 if (ElemIndex0 + CI.Width != ElemIndex1 &&
1128 ElemIndex1 + Paired.Width != ElemIndex0)
1134 unsigned MergedBytes = CI.EltSize * NumCombinedComponents;
1135 unsigned RequiredAlign = std::min(MergedBytes, 4u);
1136 unsigned MinOff = std::min(CI.Offset, Paired.Offset);
1137 if (MinOff % RequiredAlign != 0)
1143 uint32_t EltOffset0 = CI.Offset / CI.EltSize;
1144 uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
1149 if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
1150 if (EltOffset0 + CI.Width != EltOffset1 &&
1151 EltOffset1 + Paired.Width != EltOffset0)
1157 if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
1158 CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
1164 if (CI.Width != Paired.Width &&
1165 (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))
1173 if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
1176 CI.Offset = EltOffset0 / 64;
1177 Paired.Offset = EltOffset1 / 64;
1186 CI.Offset = EltOffset0;
1187 Paired.Offset = EltOffset1;
1193 uint32_t Min = std::min(EltOffset0, EltOffset1);
1194 uint32_t
Max = std::max(EltOffset0, EltOffset1);
1197 if (((Max - Min) & ~Mask) == 0) {
1206 CI.BaseOff = BaseOff * CI.EltSize;
1207 CI.Offset = (EltOffset0 - BaseOff) / 64;
1208 Paired.Offset = (EltOffset1 - BaseOff) / 64;
1220 CI.BaseOff = BaseOff * CI.EltSize;
1221 CI.Offset = EltOffset0 - BaseOff;
1222 Paired.Offset = EltOffset1 - BaseOff;
1230bool SILoadStoreOptimizer::widthsFit(
const GCNSubtarget &STM,
1231 const CombineInfo &CI,
1232 const CombineInfo &Paired) {
1233 const unsigned Width = (CI.Width + Paired.Width);
1234 switch (CI.InstClass) {
1237 case S_BUFFER_LOAD_IMM:
1238 case S_BUFFER_LOAD_SGPR_IMM:
1248 return STM.hasScalarDwordx3Loads();
1253const TargetRegisterClass *
1254SILoadStoreOptimizer::getDataRegClass(
const MachineInstr &
MI)
const {
1255 if (
const auto *Dst =
TII->getNamedOperand(
MI, AMDGPU::OpName::vdst)) {
1256 return TRI->getRegClassForReg(*MRI, Dst->getReg());
1258 if (
const auto *Src =
TII->getNamedOperand(
MI, AMDGPU::OpName::vdata)) {
1259 return TRI->getRegClassForReg(*MRI, Src->getReg());
1261 if (
const auto *Src =
TII->getNamedOperand(
MI, AMDGPU::OpName::data0)) {
1262 return TRI->getRegClassForReg(*MRI, Src->getReg());
1264 if (
const auto *Dst =
TII->getNamedOperand(
MI, AMDGPU::OpName::sdst)) {
1265 return TRI->getRegClassForReg(*MRI, Dst->getReg());
1267 if (
const auto *Src =
TII->getNamedOperand(
MI, AMDGPU::OpName::sdata)) {
1268 return TRI->getRegClassForReg(*MRI, Src->getReg());
1275SILoadStoreOptimizer::CombineInfo *
1276SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
1277 CombineInfo &Paired) {
1280 if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
1282 assert(CI.InstClass == Paired.InstClass);
1284 if (getInstSubclass(CI.I->getOpcode(), *
TII) !=
1285 getInstSubclass(Paired.I->getOpcode(), *
TII))
1290 if (CI.InstClass == MIMG) {
1291 if (!dmasksCanBeCombined(CI, *
TII, Paired))
1294 if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
1298 DenseSet<Register> RegDefs;
1299 DenseSet<Register> RegUses;
1301 if (CI.I->mayLoad()) {
1305 if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *
MBBI))
1313 if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *
MBBI))
1323 if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
1324 offsetsCanBeCombined(CI, *STM, Paired,
true);
1326 if (CI.InstClass == DS_WRITE) {
1334 const MachineOperand *Data0 =
1335 TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1336 const MachineOperand *Data1 =
1337 TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1339 const MCInstrDesc &Write2Opc =
TII->get(getWrite2Opcode(CI));
1340 int Data0Idx = AMDGPU::getNamedOperandIdx(Write2Opc.
getOpcode(),
1341 AMDGPU::OpName::data0);
1342 int Data1Idx = AMDGPU::getNamedOperandIdx(Write2Opc.
getOpcode(),
1343 AMDGPU::OpName::data1);
1345 const TargetRegisterClass *DataRC0 =
TII->getRegClass(Write2Opc, Data0Idx);
1347 const TargetRegisterClass *DataRC1 =
TII->getRegClass(Write2Opc, Data1Idx);
1349 if (
unsigned SubReg = Data0->
getSubReg()) {
1354 if (
unsigned SubReg = Data1->
getSubReg()) {
1372void SILoadStoreOptimizer::copyToDestRegs(
1373 CombineInfo &CI, CombineInfo &Paired,
1375 AMDGPU::OpName OpName,
Register DestReg)
const {
1376 MachineBasicBlock *
MBB = CI.I->getParent();
1378 auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
1381 const MCInstrDesc &CopyDesc =
TII->get(TargetOpcode::COPY);
1382 auto *Dest0 =
TII->getNamedOperand(*CI.I, OpName);
1383 auto *Dest1 =
TII->getNamedOperand(*Paired.I, OpName);
1388 Dest0->setIsEarlyClobber(
false);
1389 Dest1->setIsEarlyClobber(
false);
1393 .
addReg(DestReg, {}, SubRegIdx0);
1396 .
addReg(DestReg, RegState::Kill, SubRegIdx1);
1402SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
1405 AMDGPU::OpName OpName)
const {
1406 MachineBasicBlock *
MBB = CI.I->getParent();
1408 auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
1411 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1414 const auto *Src0 =
TII->getNamedOperand(*CI.I, OpName);
1415 const auto *Src1 =
TII->getNamedOperand(*Paired.I, OpName);
1417 BuildMI(*
MBB, InsertBefore,
DL,
TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1426unsigned SILoadStoreOptimizer::read2Opcode(
unsigned EltSize)
const {
1428 return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
1429 return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
1432unsigned SILoadStoreOptimizer::read2ST64Opcode(
unsigned EltSize)
const {
1434 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
1436 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
1437 : AMDGPU::DS_READ2ST64_B64_gfx9;
1441SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
1443 MachineBasicBlock *
MBB = CI.I->getParent();
1447 const auto *AddrReg =
TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1449 unsigned NewOffset0 = std::min(CI.Offset, Paired.Offset);
1450 unsigned NewOffset1 = std::max(CI.Offset, Paired.Offset);
1452 CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
1455 (NewOffset0 != NewOffset1) &&
"Computed offset doesn't fit");
1457 const MCInstrDesc &Read2Desc =
TII->get(
Opc);
1459 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1466 unsigned BaseSubReg = AddrReg->getSubReg();
1474 BaseRegFlags = RegState::Kill;
1476 TII->getAddNoCarry(*
MBB, InsertBefore,
DL, BaseReg)
1478 .addReg(AddrReg->getReg(), {}, BaseSubReg)
1483 MachineInstrBuilder Read2 =
1485 .
addReg(BaseReg, BaseRegFlags, BaseSubReg)
1491 copyToDestRegs(CI, Paired, InsertBefore,
DL, AMDGPU::OpName::vdst, DestReg);
1493 CI.I->eraseFromParent();
1494 Paired.I->eraseFromParent();
1500unsigned SILoadStoreOptimizer::write2Opcode(
unsigned EltSize)
const {
1502 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
1503 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
1504 : AMDGPU::DS_WRITE2_B64_gfx9;
1507unsigned SILoadStoreOptimizer::write2ST64Opcode(
unsigned EltSize)
const {
1509 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
1510 : AMDGPU::DS_WRITE2ST64_B64;
1512 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
1513 : AMDGPU::DS_WRITE2ST64_B64_gfx9;
1516unsigned SILoadStoreOptimizer::getWrite2Opcode(
const CombineInfo &CI)
const {
1517 return CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
1521 CombineInfo &CI, CombineInfo &Paired,
1523 MachineBasicBlock *
MBB = CI.I->getParent();
1527 const MachineOperand *AddrReg =
1528 TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1529 const MachineOperand *Data0 =
1530 TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1531 const MachineOperand *Data1 =
1532 TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1534 unsigned NewOffset0 = CI.Offset;
1535 unsigned NewOffset1 = Paired.Offset;
1536 unsigned Opc = getWrite2Opcode(CI);
1538 if (NewOffset0 > NewOffset1) {
1545 (NewOffset0 != NewOffset1) &&
"Computed offset doesn't fit");
1547 const MCInstrDesc &Write2Desc =
TII->get(
Opc);
1552 unsigned BaseSubReg = AddrReg->
getSubReg();
1560 BaseRegFlags = RegState::Kill;
1562 TII->getAddNoCarry(*
MBB, InsertBefore,
DL, BaseReg)
1564 .addReg(AddrReg->
getReg(), {}, BaseSubReg)
1569 MachineInstrBuilder Write2 =
1571 .
addReg(BaseReg, BaseRegFlags, BaseSubReg)
1579 CI.I->eraseFromParent();
1580 Paired.I->eraseFromParent();
1582 LLVM_DEBUG(
dbgs() <<
"Inserted write2 inst: " << *Write2 <<
'\n');
1587SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
1589 MachineBasicBlock *
MBB = CI.I->getParent();
1593 const unsigned Opcode = getNewOpcode(CI, Paired);
1595 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1598 unsigned MergedDMask = CI.DMask | Paired.DMask;
1600 AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
1602 auto MIB =
BuildMI(*
MBB, InsertBefore,
DL,
TII->get(Opcode), DestReg);
1603 for (
unsigned I = 1,
E = (*CI.I).getNumOperands();
I !=
E; ++
I) {
1605 MIB.addImm(MergedDMask);
1607 MIB.add((*CI.I).getOperand(
I));
1613 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1615 MachineInstr *
New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1617 copyToDestRegs(CI, Paired, InsertBefore,
DL, AMDGPU::OpName::vdata, DestReg);
1619 CI.I->eraseFromParent();
1620 Paired.I->eraseFromParent();
1625 CombineInfo &CI, CombineInfo &Paired,
1627 MachineBasicBlock *
MBB = CI.I->getParent();
1631 const unsigned Opcode = getNewOpcode(CI, Paired);
1633 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1636 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1641 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1643 MachineInstrBuilder
New =
1645 .
add(*
TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
1646 if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
1647 New.add(*
TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
1648 New.addImm(MergedOffset);
1649 New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1651 copyToDestRegs(CI, Paired, InsertBefore,
DL, AMDGPU::OpName::sdst, DestReg);
1653 CI.I->eraseFromParent();
1654 Paired.I->eraseFromParent();
1659 CombineInfo &CI, CombineInfo &Paired,
1661 MachineBasicBlock *
MBB = CI.I->getParent();
1666 const unsigned Opcode = getNewOpcode(CI, Paired);
1668 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1672 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1674 auto MIB =
BuildMI(*
MBB, InsertBefore,
DL,
TII->get(Opcode), DestReg);
1676 AddressRegs Regs = getRegs(Opcode, *
TII);
1679 MIB.add(*
TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1684 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1687 MIB.add(*
TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1688 .add(*
TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1689 .addImm(MergedOffset)
1692 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1694 copyToDestRegs(CI, Paired, InsertBefore,
DL, AMDGPU::OpName::vdata, DestReg);
1696 CI.I->eraseFromParent();
1697 Paired.I->eraseFromParent();
1702 CombineInfo &CI, CombineInfo &Paired,
1704 MachineBasicBlock *
MBB = CI.I->getParent();
1709 const unsigned Opcode = getNewOpcode(CI, Paired);
1711 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1715 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1717 auto MIB =
BuildMI(*
MBB, InsertBefore,
DL,
TII->get(Opcode), DestReg);
1719 AddressRegs Regs = getRegs(Opcode, *
TII);
1722 MIB.add(*
TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1727 unsigned NumCombinedComponents = CI.Width + Paired.Width;
1728 if (NumCombinedComponents == 3 && CI.EltSize <= 2)
1729 NumCombinedComponents = 4;
1730 unsigned JoinedFormat =
1736 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1739 MIB.add(*
TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1740 .add(*
TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1741 .addImm(MergedOffset)
1742 .addImm(JoinedFormat)
1745 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1747 copyToDestRegs(CI, Paired, InsertBefore,
DL, AMDGPU::OpName::vdata, DestReg);
1749 CI.I->eraseFromParent();
1750 Paired.I->eraseFromParent();
1755 CombineInfo &CI, CombineInfo &Paired,
1757 MachineBasicBlock *
MBB = CI.I->getParent();
1761 const unsigned Opcode = getNewOpcode(CI, Paired);
1764 copyFromSrcRegs(CI, Paired, InsertBefore,
DL, AMDGPU::OpName::vdata);
1767 .
addReg(SrcReg, RegState::Kill);
1769 AddressRegs Regs = getRegs(Opcode, *
TII);
1772 MIB.add(*
TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1777 unsigned NumCombinedComponents = CI.Width + Paired.Width;
1778 if (NumCombinedComponents == 3 && CI.EltSize <= 2)
1779 NumCombinedComponents = 4;
1780 unsigned JoinedFormat =
1786 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1789 MIB.add(*
TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1790 .add(*
TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1791 .addImm(std::min(CI.Offset, Paired.Offset))
1792 .addImm(JoinedFormat)
1795 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1797 CI.I->eraseFromParent();
1798 Paired.I->eraseFromParent();
1803 CombineInfo &CI, CombineInfo &Paired,
1805 MachineBasicBlock *
MBB = CI.I->getParent();
1810 const unsigned Opcode = getNewOpcode(CI, Paired);
1812 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1815 auto MIB =
BuildMI(*
MBB, InsertBefore,
DL,
TII->get(Opcode), DestReg);
1817 if (
auto *SAddr =
TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1821 MIB.add(*
TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1822 .addImm(std::min(CI.Offset, Paired.Offset))
1824 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1826 copyToDestRegs(CI, Paired, InsertBefore,
DL, AMDGPU::OpName::vdst, DestReg);
1828 CI.I->eraseFromParent();
1829 Paired.I->eraseFromParent();
1834 CombineInfo &CI, CombineInfo &Paired,
1836 MachineBasicBlock *
MBB = CI.I->getParent();
1841 const unsigned Opcode = getNewOpcode(CI, Paired);
1844 copyFromSrcRegs(CI, Paired, InsertBefore,
DL, AMDGPU::OpName::vdata);
1847 .
add(*
TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1848 .
addReg(SrcReg, RegState::Kill);
1850 if (
auto *SAddr =
TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1854 MIB.addImm(std::min(CI.Offset, Paired.Offset))
1856 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1858 CI.I->eraseFromParent();
1859 Paired.I->eraseFromParent();
1868 (MMOs.
size() != 1 || MMOs[0]->
getAlign().value() < Width * 4);
1871unsigned SILoadStoreOptimizer::getNewOpcode(
const CombineInfo &CI,
1872 const CombineInfo &Paired) {
1873 const unsigned Width = CI.Width + Paired.Width;
1875 switch (getCommonInstClass(CI, Paired)) {
1877 assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
1888 case S_BUFFER_LOAD_IMM: {
1891 bool NeedsConstrainedOpc =
1897 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec
1898 : AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1900 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec
1901 : AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
1903 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec
1904 : AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1906 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec
1907 : AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
1910 case S_BUFFER_LOAD_SGPR_IMM: {
1913 bool NeedsConstrainedOpc =
1919 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec
1920 : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
1922 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec
1923 : AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
1925 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec
1926 : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
1928 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec
1929 : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
1935 bool NeedsConstrainedOpc =
1941 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX2_IMM_ec
1942 : AMDGPU::S_LOAD_DWORDX2_IMM;
1944 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX3_IMM_ec
1945 : AMDGPU::S_LOAD_DWORDX3_IMM;
1947 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX4_IMM_ec
1948 : AMDGPU::S_LOAD_DWORDX4_IMM;
1950 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX8_IMM_ec
1951 : AMDGPU::S_LOAD_DWORDX8_IMM;
1959 return AMDGPU::GLOBAL_LOAD_DWORDX2;
1961 return AMDGPU::GLOBAL_LOAD_DWORDX3;
1963 return AMDGPU::GLOBAL_LOAD_DWORDX4;
1965 case GLOBAL_LOAD_SADDR:
1970 return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
1972 return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
1974 return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
1981 return AMDGPU::GLOBAL_STORE_DWORDX2;
1983 return AMDGPU::GLOBAL_STORE_DWORDX3;
1985 return AMDGPU::GLOBAL_STORE_DWORDX4;
1987 case GLOBAL_STORE_SADDR:
1992 return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
1994 return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
1996 return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
2003 return AMDGPU::FLAT_LOAD_DWORDX2;
2005 return AMDGPU::FLAT_LOAD_DWORDX3;
2007 return AMDGPU::FLAT_LOAD_DWORDX4;
2014 return AMDGPU::FLAT_STORE_DWORDX2;
2016 return AMDGPU::FLAT_STORE_DWORDX3;
2018 return AMDGPU::FLAT_STORE_DWORDX4;
2020 case FLAT_LOAD_SADDR:
2025 return AMDGPU::FLAT_LOAD_DWORDX2_SADDR;
2027 return AMDGPU::FLAT_LOAD_DWORDX3_SADDR;
2029 return AMDGPU::FLAT_LOAD_DWORDX4_SADDR;
2031 case FLAT_STORE_SADDR:
2036 return AMDGPU::FLAT_STORE_DWORDX2_SADDR;
2038 return AMDGPU::FLAT_STORE_DWORDX3_SADDR;
2040 return AMDGPU::FLAT_STORE_DWORDX4_SADDR;
2049std::pair<unsigned, unsigned>
2050SILoadStoreOptimizer::getSubRegIdxs(
const CombineInfo &CI,
2051 const CombineInfo &Paired) {
2052 assert((CI.InstClass != MIMG ||
2054 CI.Width + Paired.Width)) &&
2060 static const unsigned Idxs[5][4] = {
2061 {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
2062 {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
2063 {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
2064 {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
2065 {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
2068 assert(CI.Width >= 1 && CI.Width <= 4);
2069 assert(Paired.Width >= 1 && Paired.Width <= 4);
2072 Idx1 = Idxs[0][Paired.Width - 1];
2073 Idx0 = Idxs[Paired.Width][CI.Width - 1];
2075 Idx0 = Idxs[0][CI.Width - 1];
2076 Idx1 = Idxs[CI.Width][Paired.Width - 1];
2079 return {Idx0, Idx1};
2082const TargetRegisterClass *
2083SILoadStoreOptimizer::getTargetRegisterClass(
const CombineInfo &CI,
2084 const CombineInfo &Paired)
const {
2085 if (CI.InstClass == S_BUFFER_LOAD_IMM ||
2086 CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
2087 switch (CI.Width + Paired.Width) {
2091 return &AMDGPU::SReg_64_XEXECRegClass;
2093 return &AMDGPU::SGPR_96RegClass;
2095 return &AMDGPU::SGPR_128RegClass;
2097 return &AMDGPU::SGPR_256RegClass;
2099 return &AMDGPU::SGPR_512RegClass;
2105 unsigned BitWidth = 32 * (CI.Width + Paired.Width);
2106 return TRI->isAGPRClass(getDataRegClass(*CI.I))
2112 CombineInfo &CI, CombineInfo &Paired,
2114 MachineBasicBlock *
MBB = CI.I->getParent();
2118 const unsigned Opcode = getNewOpcode(CI, Paired);
2121 copyFromSrcRegs(CI, Paired, InsertBefore,
DL, AMDGPU::OpName::vdata);
2124 .
addReg(SrcReg, RegState::Kill);
2126 AddressRegs Regs = getRegs(Opcode, *
TII);
2129 MIB.add(*
TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
2135 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
2138 MIB.add(*
TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
2139 .add(*
TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
2140 .addImm(std::min(CI.Offset, Paired.Offset))
2143 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
2145 CI.I->eraseFromParent();
2146 Paired.I->eraseFromParent();
2151SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &
MI)
const {
2152 APInt
V(32, Val,
true);
2153 if (
TII->isInlineConstant(V))
2158 BuildMI(*
MI.getParent(),
MI.getIterator(),
MI.getDebugLoc(),
2159 TII->get(AMDGPU::S_MOV_B32),
Reg)
2167Register SILoadStoreOptimizer::computeBase(MachineInstr &
MI,
2168 const MemAddress &Addr)
const {
2169 MachineBasicBlock *
MBB =
MI.getParent();
2176 if (Addr.Base.UseV64Pattern) {
2178 TII->getRegClass(
TII->get(AMDGPU::V_ADD_U64_e64), 0));
2182 MachineInstr *MovOffset =
2186 MachineInstr *
Add64 =
2189 .
addReg(OffsetReg, RegState::Kill)
2200 assert((
TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
2201 Addr.Base.LoSubReg) &&
2202 "Expected 32-bit Base-Register-Low!!");
2204 assert((
TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
2205 Addr.Base.HiSubReg) &&
2206 "Expected 32-bit Base-Register-Hi!!");
2208 MachineOperand OffsetLo = createRegOrImm(
static_cast<int32_t
>(Addr.Offset),
MI);
2209 MachineOperand OffsetHi =
2210 createRegOrImm(
static_cast<int32_t
>(Addr.Offset >> 32),
MI);
2212 const auto *CarryRC =
TRI->getWaveMaskRegClass();
2218 MachineInstr *LoHalf =
2220 .
addReg(CarryReg, RegState::Define)
2221 .
addReg(Addr.Base.LoReg, {}, Addr.Base.LoSubReg)
2225 MachineInstr *HiHalf =
2227 .
addReg(DeadCarryReg, RegState::Define | RegState::Dead)
2228 .
addReg(Addr.Base.HiReg, {}, Addr.Base.HiSubReg)
2230 .
addReg(CarryReg, RegState::Kill)
2234 MachineInstr *FullBase =
2245 dbgs() <<
" " << *HiHalf <<
"\n";
2246 dbgs() <<
" " << *FullBase <<
"\n\n";);
2252void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &
MI,
2254 int32_t NewOffset)
const {
2255 auto *
Base =
TII->getNamedOperand(
MI, AMDGPU::OpName::vaddr);
2256 Base->setReg(NewBase);
2257 Base->setIsKill(
false);
2258 TII->getNamedOperand(
MI, AMDGPU::OpName::offset)->setImm(NewOffset);
2264bool SILoadStoreOptimizer::processBaseWithConstOffset64(
2265 MachineInstr *AddDef,
const MachineOperand &
Base, MemAddress &Addr)
const {
2269 MachineOperand *Src0 =
TII->getNamedOperand(*AddDef, AMDGPU::OpName::src0);
2270 MachineOperand *Src1 =
TII->getNamedOperand(*AddDef, AMDGPU::OpName::src1);
2272 const MachineOperand *BaseOp =
nullptr;
2274 auto Offset =
TII->getImmOrMaterializedImm(*Src1);
2285 Addr.Base.LoReg = BaseOp->
getReg();
2286 Addr.Base.UseV64Pattern =
true;
2304void SILoadStoreOptimizer::processBaseWithConstOffset(
const MachineOperand &
Base,
2305 MemAddress &Addr)
const {
2314 if (
Def->getOpcode() == AMDGPU::V_ADD_U64_e64) {
2315 if (processBaseWithConstOffset64(Def,
Base, Addr))
2320 if (
Def->getOpcode() != AMDGPU::REG_SEQUENCE ||
Def->getNumOperands() != 5)
2323 MachineOperand BaseLo =
Def->getOperand(1);
2324 MachineOperand BaseHi =
Def->getOperand(3);
2331 if (!BaseLoDef || BaseLoDef->
getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
2332 !BaseHiDef || BaseHiDef->
getOpcode() != AMDGPU::V_ADDC_U32_e64)
2335 MachineOperand *Src0 =
TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
2336 MachineOperand *Src1 =
TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
2338 auto Offset0P =
TII->getImmOrMaterializedImm(*Src0);
2342 if (!(Offset0P =
TII->getImmOrMaterializedImm(*Src1)))
2347 if (!BaseLo.
isReg())
2350 Src0 =
TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
2351 Src1 =
TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
2359 uint64_t Offset1 = Src1->
getImm();
2362 if (!BaseHi.
isReg())
2365 Addr.Base.LoReg = BaseLo.
getReg();
2366 Addr.Base.HiReg = BaseHi.
getReg();
2367 Addr.Base.LoSubReg = BaseLo.
getSubReg();
2368 Addr.Base.HiSubReg = BaseHi.
getSubReg();
2369 Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
2376void SILoadStoreOptimizer::updateAsyncLDSAddress(MachineInstr &
MI,
2377 int32_t OffsetDiff)
const {
2378 if (!
TII->usesASYNC_CNT(
MI) || OffsetDiff == 0)
2381 MachineOperand *LDSAddr =
TII->getNamedOperand(
MI, AMDGPU::OpName::vdst);
2383 LDSAddr =
TII->getNamedOperand(
MI, AMDGPU::OpName::vdata);
2388 MachineBasicBlock &
MBB = *
MI.getParent();
2398bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
2400 MemInfoMap &Visited,
2401 SmallPtrSet<MachineInstr *, 4> &
AnchorList)
const {
2416 bool AllowNegativeOffset =
2417 TII->allowNegativeFlatOffset(FlatVariant) && !
TII->usesASYNC_CNT(
MI);
2421 bool IsOffsetU16 =
TII->usesASYNC_CNT(
MI);
2428 if (
TII->getNamedOperand(
MI, AMDGPU::OpName::offset)->getImm()) {
2434 MachineOperand &
Base = *
TII->getNamedOperand(
MI, AMDGPU::OpName::vaddr);
2435 auto [It,
Inserted] = Visited.try_emplace(&
MI);
2438 processBaseWithConstOffset(
Base, MAddr);
2443 if (MAddr.Offset == 0) {
2444 LLVM_DEBUG(
dbgs() <<
" Failed to extract constant-offset or there are no"
2445 " constant offsets that can be promoted.\n";);
2451 <<
"} Offset: " << MAddr.Offset <<
"\n\n";);
2478 MachineInstr *AnchorInst =
nullptr;
2479 MemAddress AnchorAddr;
2480 uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
2482 bool MIIsAnchor =
false;
2484 MachineBasicBlock *
MBB =
MI.getParent();
2491 MachineInstr &MINext = *
MBBI;
2495 TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
2498 const MachineOperand &BaseNext =
2499 *
TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
2500 MemAddress MAddrNext;
2501 auto [It,
Inserted] = Visited.try_emplace(&MINext);
2503 processBaseWithConstOffset(BaseNext, MAddrNext);
2504 It->second = MAddrNext;
2506 MAddrNext = It->second;
2508 if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
2509 MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
2510 MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
2511 MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
2514 InstsWCommonBase.
emplace_back(&MINext, MAddrNext.Offset);
2516 if (AllowNegativeOffset) {
2517 int64_t Dist = MAddr.Offset - MAddrNext.Offset;
2518 TargetLoweringBase::AddrMode AM;
2522 (uint32_t)std::abs(Dist) > MaxDist) {
2523 MaxDist = std::abs(Dist);
2525 AnchorAddr = MAddrNext;
2526 AnchorInst = &MINext;
2534 if (!AllowNegativeOffset && !InstsWCommonBase.
empty()) {
2535 for (
auto &[Inst,
Offset] : InstsWCommonBase) {
2536 int64_t Dist = MAddr.Offset -
Offset;
2537 TargetLoweringBase::AddrMode AM;
2542 (!AnchorInst ||
Offset < AnchorAddr.Offset)) {
2543 AnchorAddr = Visited[Inst];
2552 LLVM_DEBUG(
dbgs() <<
" Anchor-Inst(with max-distance from Offset): ";
2553 AnchorInst->
dump());
2555 << AnchorAddr.Offset <<
"\n\n");
2560 int32_t OffsetDiff = MAddr.Offset - AnchorAddr.Offset;
2561 updateBaseAndOffset(
MI,
Base, OffsetDiff);
2562 updateAsyncLDSAddress(
MI, OffsetDiff);
2565 for (
auto [OtherMI, OtherOffset] : InstsWCommonBase) {
2566 TargetLoweringBase::AddrMode AM;
2568 AM.
BaseOffs = OtherOffset - AnchorAddr.Offset;
2571 (AllowNegativeOffset || AM.
BaseOffs >= 0) &&
2575 int32_t OtherOffsetDiff = OtherOffset - AnchorAddr.Offset;
2576 updateBaseAndOffset(*OtherMI,
Base, OtherOffsetDiff);
2577 updateAsyncLDSAddress(*OtherMI, OtherOffsetDiff);
2586 LLVM_DEBUG(
dbgs() <<
" MI is anchor (smallest offset); promoting "
2587 "candidates relative to MI's base.\n");
2590 bool AnyPromoted =
false;
2592 for (
auto [OtherMI, OtherOffset] : InstsWCommonBase) {
2593 int64_t Dist = OtherOffset - MAddr.Offset;
2594 TargetLoweringBase::AddrMode AM;
2601 updateBaseAndOffset(*OtherMI,
Base, Dist);
2602 updateAsyncLDSAddress(*OtherMI, Dist);
2609 TII->getNamedOperand(
MI, AMDGPU::OpName::vaddr)->setIsKill(
false);
2618void SILoadStoreOptimizer::addInstToMergeableList(
const CombineInfo &CI,
2619 std::list<std::list<CombineInfo> > &MergeableInsts)
const {
2620 for (std::list<CombineInfo> &AddrList : MergeableInsts) {
2621 if (AddrList.front().InstClass == CI.InstClass &&
2622 AddrList.front().hasSameBaseAddress(CI)) {
2623 AddrList.emplace_back(CI);
2629 MergeableInsts.emplace_back(1, CI);
2632std::pair<MachineBasicBlock::iterator, bool>
2633SILoadStoreOptimizer::collectMergeableInsts(
2635 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &
AnchorList,
2636 std::list<std::list<CombineInfo>> &MergeableInsts)
const {
2642 for (; BlockI != End; ++BlockI) {
2643 MachineInstr &
MI = *BlockI;
2647 if (promoteConstantOffsetToImm(
MI, Visited,
AnchorList))
2652 if (
MI.hasOrderedMemoryRef() ||
MI.hasUnmodeledSideEffects()) {
2660 const InstClassEnum InstClass = getInstClass(
MI.getOpcode(), *
TII);
2661 if (InstClass == UNKNOWN)
2666 AMDGPU::getNamedOperandIdx(
MI.getOpcode(), AMDGPU::OpName::swz);
2667 if (Swizzled != -1 &&
MI.getOperand(Swizzled).getImm())
2670 if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) {
2671 const MachineOperand *Fmt =
2672 TII->getNamedOperand(
MI, AMDGPU::OpName::format);
2680 CI.setMI(
MI, *
this);
2683 if (!CI.hasMergeableAddress(*MRI))
2698 for (std::list<std::list<CombineInfo>>::iterator
I = MergeableInsts.begin(),
2699 E = MergeableInsts.end();
I !=
E;) {
2701 std::list<CombineInfo> &MergeList = *
I;
2702 if (MergeList.size() <= 1) {
2706 I = MergeableInsts.erase(
I);
2714 [] (
const CombineInfo &
A,
const CombineInfo &
B) {
2715 return A.Offset <
B.Offset;
2726bool SILoadStoreOptimizer::optimizeBlock(
2727 std::list<std::list<CombineInfo> > &MergeableInsts) {
2730 for (std::list<std::list<CombineInfo>>::iterator
I = MergeableInsts.begin(),
2731 E = MergeableInsts.end();
I !=
E;) {
2732 std::list<CombineInfo> &MergeList = *
I;
2734 bool OptimizeListAgain =
false;
2735 if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
2739 I = MergeableInsts.erase(
I);
2747 if (!OptimizeListAgain) {
2748 I = MergeableInsts.erase(
I);
2751 OptimizeAgain =
true;
2757SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
2758 std::list<CombineInfo> &MergeList,
2759 bool &OptimizeListAgain) {
2760 if (MergeList.empty())
2765 for (
auto I = MergeList.begin(),
Next = std::next(
I);
Next != MergeList.end();
2766 Next = std::next(
I)) {
2771 if ((*First).Order > (*Second).Order)
2773 CombineInfo &CI = *
First;
2774 CombineInfo &Paired = *Second;
2776 CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
2784 LLVM_DEBUG(
dbgs() <<
"Merging: " << *CI.I <<
" with: " << *Paired.I);
2787 switch (CI.InstClass) {
2792 NewMI = mergeRead2Pair(CI, Paired, Where->I);
2795 NewMI = mergeWrite2Pair(CI, Paired, Where->I);
2797 case S_BUFFER_LOAD_IMM:
2798 case S_BUFFER_LOAD_SGPR_IMM:
2800 NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
2801 OptimizeListAgain |= CI.Width + Paired.Width < 8;
2804 NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
2805 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2808 NewMI = mergeBufferStorePair(CI, Paired, Where->I);
2809 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2812 NewMI = mergeImagePair(CI, Paired, Where->I);
2813 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2816 NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
2817 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2820 NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
2821 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2824 case FLAT_LOAD_SADDR:
2826 case GLOBAL_LOAD_SADDR:
2827 NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
2828 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2831 case FLAT_STORE_SADDR:
2833 case GLOBAL_STORE_SADDR:
2834 NewMI = mergeFlatStorePair(CI, Paired, Where->I);
2835 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2838 CI.setMI(NewMI, *
this);
2839 CI.Order = Where->Order;
2843 MergeList.erase(Second);
2849bool SILoadStoreOptimizerLegacy::runOnMachineFunction(MachineFunction &MF) {
2852 return SILoadStoreOptimizer(
2853 &getAnalysis<AAResultsWrapperPass>().getAAResults())
2857bool SILoadStoreOptimizer::run(MachineFunction &MF) {
2877 for (MachineBasicBlock &
MBB : MF) {
2881 bool CollectModified;
2882 std::list<std::list<CombineInfo>> MergeableInsts;
2886 std::tie(SectionEnd, CollectModified) =
2892 OptimizeAgain =
false;
2894 }
while (OptimizeAgain);
2916 bool Changed = SILoadStoreOptimizer(&
AA).run(MF);
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
INITIALIZE_PASS(AMDGPUImageIntrinsicOptimizer, DEBUG_TYPE, "AMDGPU Image Intrinsic Optimizer", false, false) char AMDGPUImageIntrinsicOptimizer void addInstToMergeableList(IntrinsicInst *II, SmallVector< SmallVector< IntrinsicInst *, 4 > > &MergeableInsts, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr)
BasicBlock::iterator collectMergeableInsts(BasicBlock::iterator I, BasicBlock::iterator E, SmallVector< SmallVector< IntrinsicInst *, 4 > > &MergeableInsts)
Provides AMDGPU specific target descriptions.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
static MaybeAlign getAlign(Value *Ptr)
Register const TargetRegisterInfo * TRI
Promote Memory to Register
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
FunctionAnalysisManager FAM
#define INITIALIZE_PASS_DEPENDENCY(depName)
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi)
static bool needsConstrainedOpcode(const GCNSubtarget &STM, ArrayRef< MachineMemOperand * > MMOs, unsigned Width)
static void addDefsUsesToList(const MachineInstr &MI, DenseSet< Register > &RegDefs, DenseSet< Register > &RegUses)
static unsigned getBufferFormatWithCompCount(unsigned OldFormat, unsigned ComponentCount, const GCNSubtarget &STI)
static bool optimizeBlock(BasicBlock &BB, bool &ModifiedDT, const TargetTransformInfo &TTI, const DataLayout &DL, bool HasBranchDivergence, DomTreeUpdater *DTU)
A manager for alias analyses.
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
Represents analyses that only rely on functions' control flow.
static LLVM_ABI DebugLoc getMergedLocation(DebugLoc LocA, DebugLoc LocB)
When two instructions are combined into a single instruction we also need to combine the original loc...
Implements a dense probed hash-table based set.
FunctionPass class - This class is used to implement most global optimizations.
bool hasOptNone() const
Do not optimize this function (-O0).
bool loadStoreOptEnabled() const
const SIInstrInfo * getInstrInfo() const override
bool hasDwordx3LoadStores() const
const SITargetLowering * getTargetLowering() const override
bool ldsRequiresM0Init() const
Return if most LDS instructions have an m0 use that require m0 to be initialized.
bool isXNACKEnabled() const
const HexagonRegisterInfo & getRegisterInfo() const
TypeSize getValue() const
unsigned getOpcode() const
Return the opcode number for this descriptor.
An RAII based helper class to modify MachineFunctionProperties when running pass.
MachineInstrBundleIterator< MachineInstr > iterator
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
Properties which a MachineFunction may have at a given point in time.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineInstrBuilder & cloneMergedMemRefs(ArrayRef< const MachineInstr * > OtherMIs) const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
LLVM_ABI void dump() const
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
unsigned getAddrSpace() const
const MachinePointerInfo & getPointerInfo() const
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLVM_ABI const TargetRegisterClass * constrainRegClass(Register Reg, const TargetRegisterClass *RC, unsigned MinNumRegs=0)
constrainRegClass - Constrain the register class of the specified virtual register to be a common sub...
LLVM_ABI MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Wrapper class representing virtual and physical registers.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
static bool isFLATScratch(const MachineInstr &MI)
static bool isVIMAGE(const MachineInstr &MI)
static bool isFLATGlobal(const MachineInstr &MI)
static bool isVSAMPLE(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, AMDGPU::OpName OperandName) const
Returns the operand named Op.
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
reference emplace_back(ArgTypes &&... Args)
StringRef - Represent a constant reference to a string, i.e.
std::pair< iterator, bool > insert(const ValueT &V)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Abstract Attribute helper functions.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
uint64_t convertSMRDOffsetUnits(const MCSubtargetInfo &ST, uint64_t ByteOffset)
Convert ByteOffset to dwords if the subtarget uses dword SMRD immediate offsets.
bool getMTBUFHasSrsrc(unsigned Opc)
int getMTBUFElements(unsigned Opc)
bool getMTBUFHasSoffset(unsigned Opc)
int getMUBUFOpcode(unsigned BaseOpc, unsigned Elements)
int getMUBUFBaseOpcode(unsigned Opc)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
int getMTBUFBaseOpcode(unsigned Opc)
bool getMUBUFHasVAddr(unsigned Opc)
int getMTBUFOpcode(unsigned BaseOpc, unsigned Elements)
bool getMUBUFHasSoffset(unsigned Opc)
const MIMGBaseOpcodeInfo * getMIMGBaseOpcode(unsigned Opc)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
bool getMTBUFHasVAddr(unsigned Opc)
int getMUBUFElements(unsigned Opc)
const GcnBufferFormatInfo * getGcnBufferFormatInfo(uint8_t BitsPerComp, uint8_t NumComponents, uint8_t NumFormat, const MCSubtargetInfo &STI)
bool getMUBUFHasSrsrc(unsigned Opc)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
@ Add64
64 bits label addition
NodeAddr< DefNode * > Def
BaseReg
Stack frame base register. Bit 0 of FREInfo.Info.
This is an optimization pass for GlobalISel generic memory operations.
bool operator<(int64_t V1, const APSInt &V2)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
RegState
Flags to represent properties of register accesses.
constexpr T maskLeadingOnes(unsigned N)
Create a bitmask with the N left-most bits set to 1, and all other bits set to 0.
FunctionPass * createSILoadStoreOptimizerLegacyPass()
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
char & SILoadStoreOptimizerLegacyID
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference sizeof(SmallVector<T, 0>).
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
FunctionAddr VTableAddr Next
DWARFExpression::Operation Op
std::vector< std::pair< LineLocation, FunctionId > > AnchorList
constexpr unsigned BitWidth
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
AAResults AliasAnalysis
Temporary typedef for legacy code that uses a generic AliasAnalysis pointer or reference.
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.