doxygen/AMDGPUISelLowering_8cpp_source.html

//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//

//

// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

// See https://llvm.org/LICENSE.txt for license information.

// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

//

//===----------------------------------------------------------------------===//

//

/// \file

/// This is the parent TargetLowering class for hardware code gen

/// targets.

//

//===----------------------------------------------------------------------===//


#include "AMDGPUISelLowering.h"

#include "AMDGPU.h"

#include "AMDGPUInstrInfo.h"

#include "AMDGPUMachineFunctionInfo.h"

#include "AMDGPUMemoryUtils.h"

#include "AMDGPUSelectionDAGInfo.h"

#include "SIMachineFunctionInfo.h"

#include "llvm/CodeGen/Analysis.h"

#include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"

#include "llvm/CodeGen/MachineFrameInfo.h"

#include "llvm/IR/DiagnosticInfo.h"

#include "llvm/IR/IntrinsicsAMDGPU.h"

#include "llvm/Support/CommandLine.h"

#include "llvm/Support/KnownBits.h"

#include "llvm/Target/TargetMachine.h"


using namespace llvm;


#include "AMDGPUGenCallingConv.inc"


// Command-line switch "amdgpu-bypass-slow-div" (enabled by default). When set,
// the AMDGPUTargetLowering constructor registers a 64-bit -> 32-bit slow
// division bypass via addBypassSlowDiv(64, 32).
static cl::opt<bool> AMDGPUBypassSlowDiv(

  "amdgpu-bypass-slow-div",

  cl::desc("Skip 64-bit divide for dynamic 32-bit values"),

  cl::init(true));


// Pick an integer-based type with the same store size as \p VT to perform a
// load / store with:
//   * store size of at most 32 bits  -> a single integer of that width,
//   * store size a multiple of 32    -> a vector of i32,
//   * anything else                  -> \p VT itself, unchanged.
EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
  const unsigned NumStoreBits = VT.getStoreSizeInBits();

  // Small accesses are done as one scalar integer.
  if (NumStoreBits <= 32)
    return EVT::getIntegerVT(Ctx, NumStoreBits);

  // Sizes that do not split evenly into dwords have no i32 vector equivalent.
  if (NumStoreBits % 32 != 0)
    return VT;

  return EVT::getVectorVT(Ctx, MVT::i32, NumStoreBits / 32);
}


/// Returns the maximum number of active bits \p Op can have, derived from the
/// known-bits analysis of \p DAG.
unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
  const KnownBits Known = DAG.computeKnownBits(Op);
  return Known.countMaxActiveBits();
}


/// Returns the maximum number of significant bits of \p Op when it is
/// interpreted as a signed value, as reported by \p DAG.
unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
  // For a value to qualify as a signed 24-bit value, bit 23 has to be a sign
  // bit.
  const unsigned SignificantBits = DAG.ComputeMaxSignificantBits(Op);
  return SignificantBits;
}


/// Sets up the legalization actions, load/store type promotions, DAG-combine
/// hooks and tuning knobs configured by this common AMDGPU lowering class.
///
/// \param TM        Target machine, forwarded to the TargetLowering base.
/// \param STI       Subtarget info, forwarded to the TargetLowering base.
/// \param AMDGPUSTI Subtarget whose address is kept in the Subtarget member.
AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
                                           const TargetSubtargetInfo &STI,
                                           const AMDGPUSubtarget &AMDGPUSTI)
    : TargetLowering(TM, STI), Subtarget(&AMDGPUSTI) {
  // An (original type, promoted type) pair for operations legalized by
  // promotion.
  struct PromotionPair {
    MVT::SimpleValueType From;
    MVT::SimpleValueType To;
  };

  // A (value type, memory type) pair for extending loads / truncating stores.
  struct ValMemPair {
    MVT::SimpleValueType Val;
    MVT::SimpleValueType Mem;
  };

  // Marks \p Opc on \p From as Promote and records \p To as the type it is
  // promoted to.
  const auto PromoteOpTo = [this](unsigned Opc, MVT From, MVT To) {
    setOperationAction(Opc, From, Promote);
    AddPromotedToType(Opc, From, To);
  };

  // Always lower memset, memcpy, and memmove intrinsics to load/store
  // instructions, rather than generating calls to memset, memcpy or memmove.
  // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
  // about these during lowering.
  MaxStoresPerMemset = MaxStoresPerMemsetOptSize = ~0U;
  MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = ~0U;
  MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = ~0U;

  // Enable ganging up loads and stores in the memcpy DAG lowering.
  MaxGluedStoresPerMemcpy = 16;

  // Lower floating point store/load to integer store/load to reduce the number
  // of patterns in tablegen. The same promotions apply to LOAD and STORE.
  static const PromotionPair MemPromotions[] = {
      {MVT::f32, MVT::i32},       {MVT::v2f32, MVT::v2i32},
      {MVT::v3f32, MVT::v3i32},   {MVT::v4f32, MVT::v4i32},
      {MVT::v5f32, MVT::v5i32},   {MVT::v6f32, MVT::v6i32},
      {MVT::v7f32, MVT::v7i32},   {MVT::v8f32, MVT::v8i32},
      {MVT::v9f32, MVT::v9i32},   {MVT::v10f32, MVT::v10i32},
      {MVT::v11f32, MVT::v11i32}, {MVT::v12f32, MVT::v12i32},
      {MVT::v16f32, MVT::v16i32}, {MVT::v32f32, MVT::v32i32},
      {MVT::i64, MVT::v2i32},     {MVT::v2i64, MVT::v4i32},
      {MVT::f64, MVT::v2i32},     {MVT::v2f64, MVT::v4i32},
      {MVT::v3i64, MVT::v6i32},   {MVT::v4i64, MVT::v8i32},
      {MVT::v3f64, MVT::v6i32},   {MVT::v4f64, MVT::v8i32},
      {MVT::v8i64, MVT::v16i32},  {MVT::v8f64, MVT::v16i32},
      {MVT::v16i64, MVT::v32i32}, {MVT::v16f64, MVT::v32i32},
      {MVT::i128, MVT::v4i32}};

  for (const PromotionPair &P : MemPromotions)
    PromoteOpTo(ISD::LOAD, P.From, P.To);

  // TODO: Would be better to consume as directly legal
  static const PromotionPair AtomicPromotions[] = {{MVT::f32, MVT::i32},
                                                   {MVT::f64, MVT::i64},
                                                   {MVT::f16, MVT::i16},
                                                   {MVT::bf16, MVT::i16},
                                                   {MVT::v2f32, MVT::i64}};

  for (unsigned Opc : {ISD::ATOMIC_LOAD, ISD::ATOMIC_STORE})
    for (const PromotionPair &P : AtomicPromotions)
      PromoteOpTo(Opc, P.From, P.To);

  // There are no 64-bit extloads. These should be done as a 32-bit extload and
  // an extension to 64-bit.
  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::i64, VT,
                     Expand);

  for (MVT VT : MVT::integer_valuetypes()) {
    if (VT == MVT::i64)
      continue;

    for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
      setLoadExtAction(Op, VT, MVT::i1, Promote);
      setLoadExtAction(Op, VT, MVT::i8, Legal);
      setLoadExtAction(Op, VT, MVT::i16, Legal);
      setLoadExtAction(Op, VT, MVT::i32, Expand);
    }
  }

  for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
    for (auto MemVT :
         {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
      setLoadExtAction({ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}, VT, MemVT,
                       Expand);

  // Floating point extending loads are all expanded.
  static const ValMemPair ExpandedFPExtLoads[] = {
      // f16 / bf16 -> f32
      {MVT::f32, MVT::f16},       {MVT::f32, MVT::bf16},
      {MVT::v2f32, MVT::v2f16},   {MVT::v2f32, MVT::v2bf16},
      {MVT::v3f32, MVT::v3f16},   {MVT::v3f32, MVT::v3bf16},
      {MVT::v4f32, MVT::v4f16},   {MVT::v4f32, MVT::v4bf16},
      {MVT::v8f32, MVT::v8f16},   {MVT::v8f32, MVT::v8bf16},
      {MVT::v16f32, MVT::v16f16}, {MVT::v16f32, MVT::v16bf16},
      {MVT::v32f32, MVT::v32f16}, {MVT::v32f32, MVT::v32bf16},
      // f32 -> f64
      {MVT::f64, MVT::f32},       {MVT::v2f64, MVT::v2f32},
      {MVT::v3f64, MVT::v3f32},   {MVT::v4f64, MVT::v4f32},
      {MVT::v8f64, MVT::v8f32},   {MVT::v16f64, MVT::v16f32},
      // f16 / bf16 -> f64
      {MVT::f64, MVT::f16},       {MVT::f64, MVT::bf16},
      {MVT::v2f64, MVT::v2f16},   {MVT::v2f64, MVT::v2bf16},
      {MVT::v3f64, MVT::v3f16},   {MVT::v3f64, MVT::v3bf16},
      {MVT::v4f64, MVT::v4f16},   {MVT::v4f64, MVT::v4bf16},
      {MVT::v8f64, MVT::v8f16},   {MVT::v8f64, MVT::v8bf16},
      {MVT::v16f64, MVT::v16f16}, {MVT::v16f64, MVT::v16bf16}};

  for (const ValMemPair &P : ExpandedFPExtLoads)
    setLoadExtAction(ISD::EXTLOAD, P.Val, P.Mem, Expand);

  for (const PromotionPair &P : MemPromotions)
    PromoteOpTo(ISD::STORE, P.From, P.To);

  // Truncating stores that have to be expanded.
  static const ValMemPair ExpandedTruncStores[] = {
      {MVT::i64, MVT::i1},        {MVT::i64, MVT::i8},
      {MVT::i64, MVT::i16},       {MVT::i64, MVT::i32},

      {MVT::v2i64, MVT::v2i1},    {MVT::v2i64, MVT::v2i8},
      {MVT::v2i64, MVT::v2i16},   {MVT::v2i64, MVT::v2i32},

      {MVT::f32, MVT::bf16},      {MVT::f32, MVT::f16},
      {MVT::v2f32, MVT::v2bf16},  {MVT::v2f32, MVT::v2f16},
      {MVT::v3f32, MVT::v3bf16},  {MVT::v3f32, MVT::v3f16},
      {MVT::v4f32, MVT::v4bf16},  {MVT::v4f32, MVT::v4f16},
      {MVT::v6f32, MVT::v6f16},
      {MVT::v8f32, MVT::v8bf16},  {MVT::v8f32, MVT::v8f16},
      {MVT::v16f32, MVT::v16bf16}, {MVT::v16f32, MVT::v16f16},
      {MVT::v32f32, MVT::v32bf16}, {MVT::v32f32, MVT::v32f16},

      {MVT::f64, MVT::bf16},      {MVT::f64, MVT::f16},
      {MVT::f64, MVT::f32},

      {MVT::v2f64, MVT::v2f32},   {MVT::v2f64, MVT::v2bf16},
      {MVT::v2f64, MVT::v2f16},

      {MVT::v3i32, MVT::v3i8},

      {MVT::v3i64, MVT::v3i32},   {MVT::v3i64, MVT::v3i16},
      {MVT::v3i64, MVT::v3i8},    {MVT::v3i64, MVT::v3i1},
      {MVT::v3f64, MVT::v3f32},   {MVT::v3f64, MVT::v3bf16},
      {MVT::v3f64, MVT::v3f16},

      {MVT::v4i64, MVT::v4i32},   {MVT::v4i64, MVT::v4i16},
      {MVT::v4f64, MVT::v4f32},   {MVT::v4f64, MVT::v4bf16},
      {MVT::v4f64, MVT::v4f16},

      {MVT::v5i32, MVT::v5i1},    {MVT::v5i32, MVT::v5i8},
      {MVT::v5i32, MVT::v5i16},

      {MVT::v6i32, MVT::v6i1},    {MVT::v6i32, MVT::v6i8},
      {MVT::v6i32, MVT::v6i16},

      {MVT::v7i32, MVT::v7i1},    {MVT::v7i32, MVT::v7i8},
      {MVT::v7i32, MVT::v7i16},

      {MVT::v8f64, MVT::v8f32},   {MVT::v8f64, MVT::v8bf16},
      {MVT::v8f64, MVT::v8f16},

      {MVT::v16f64, MVT::v16f32}, {MVT::v16f64, MVT::v16bf16},
      {MVT::v16f64, MVT::v16f16},
      {MVT::v16i64, MVT::v16i16}, {MVT::v16i64, MVT::v16i8},
      {MVT::v16i64, MVT::v16i1}};

  for (const ValMemPair &P : ExpandedTruncStores)
    setTruncStoreAction(P.Val, P.Mem, Expand);

  setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal);
  setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal);

  setOperationAction({ISD::BR_JT, ISD::BRIND}, MVT::Other, Expand);

  // For R600, this is totally unsupported, just custom lower to produce an
  // error.
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);

  // Library functions.  These default to Expand, but we have instructions
  // for them.
  setOperationAction({ISD::FCEIL, ISD::FPOW, ISD::FABS, ISD::FFLOOR,
                      ISD::FROUNDEVEN, ISD::FTRUNC},
                     {MVT::f16, MVT::f32}, Legal);
  setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, MVT::f32, Legal);

  setOperationAction(ISD::FLOG2, MVT::f32, Custom);
  setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
  setOperationAction({ISD::LROUND, ISD::LLROUND},
                     {MVT::f16, MVT::f32, MVT::f64}, Expand);

  setOperationAction(
      {ISD::FLOG, ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10}, MVT::f32,
      Custom);
  setOperationAction({ISD::FEXP, ISD::FEXP2, ISD::FEXP10}, MVT::f64, Custom);

  setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);

  setOperationAction(ISD::FRINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);

  setOperationAction({ISD::LRINT, ISD::LLRINT}, {MVT::f16, MVT::f32, MVT::f64},
                     Expand);

  setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Expand);
  setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
  setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Custom);

  setOperationAction({ISD::FLOG10, ISD::FLOG, ISD::FEXP, ISD::FEXP10}, MVT::f16,
                     Custom);

  setOperationAction(ISD::FCANONICALIZE, {MVT::f32, MVT::f64}, Legal);

  // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
  // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
  // default unless marked custom/legal.
  setOperationAction(ISD::IS_FPCLASS,
                     {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
                      MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
                      MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64,
                      MVT::v16f64},
                     Custom);

  // Expand to fneg + fadd.
  setOperationAction(ISD::FSUB, MVT::f64, Expand);

  setOperationAction(ISD::CONCAT_VECTORS,
                     {MVT::v3i32,  MVT::v3f32,  MVT::v4i32,  MVT::v4f32,
                      MVT::v5i32,  MVT::v5f32,  MVT::v6i32,  MVT::v6f32,
                      MVT::v7i32,  MVT::v7f32,  MVT::v8i32,  MVT::v8f32,
                      MVT::v9i32,  MVT::v9f32,  MVT::v10i32, MVT::v10f32,
                      MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
                     Custom);

  setOperationAction(
      ISD::EXTRACT_SUBVECTOR,
      {MVT::v2f32,  MVT::v2i32,  MVT::v3f32,  MVT::v3i32,  MVT::v4f32,
       MVT::v4i32,  MVT::v5f32,  MVT::v5i32,  MVT::v6f32,  MVT::v6i32,
       MVT::v7f32,  MVT::v7i32,  MVT::v8f32,  MVT::v8i32,  MVT::v9f32,
       MVT::v9i32,  MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32,
       MVT::v12i32, MVT::v12f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
       MVT::v2f64,  MVT::v2i64,  MVT::v3f64,  MVT::v3i64,  MVT::v4f64,
       MVT::v4i64,  MVT::v8f64,  MVT::v8i64,  MVT::v16f64, MVT::v16i64},
      Custom);

  setOperationAction({ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP}, MVT::f64,
                     Expand);
  setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);

  const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
  for (MVT VT : ScalarIntVTs) {
    // These should use [SU]DIVREM, so set them to expand
    setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM}, VT,
                       Expand);

    // GPU does not have divrem function for signed or unsigned.
    setOperationAction({ISD::SDIVREM, ISD::UDIVREM}, VT, Custom);

    // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
    setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, VT, Expand);

    setOperationAction({ISD::BSWAP, ISD::CTTZ, ISD::CTLZ}, VT, Expand);

    // AMDGPU uses ADDC/SUBC/ADDE/SUBE
    setOperationAction({ISD::ADDC, ISD::SUBC, ISD::ADDE, ISD::SUBE}, VT, Legal);
  }

  // The hardware supports 32-bit FSHR, but not FSHL.
  setOperationAction(ISD::FSHR, MVT::i32, Legal);

  setOperationAction({ISD::ROTL, ISD::ROTR}, {MVT::i32, MVT::i64}, Expand);

  setOperationAction({ISD::MULHU, ISD::MULHS}, MVT::i16, Expand);

  setOperationAction({ISD::MUL, ISD::MULHU, ISD::MULHS}, MVT::i64, Expand);
  setOperationAction({ISD::UINT_TO_FP, ISD::SINT_TO_FP, ISD::FP_TO_SINT,
                      ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT,
                      ISD::FP_TO_UINT_SAT},
                     MVT::i64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);

  setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i32,
                     Legal);

  // Note: this intentionally overrides the Expand action that the scalar
  // integer loop above installed for i64 CTTZ / CTLZ.
  setOperationAction(
      {ISD::CTTZ, ISD::CTTZ_ZERO_POISON, ISD::CTLZ, ISD::CTLZ_ZERO_POISON},
      MVT::i64, Custom);

  for (auto VT : {MVT::i8, MVT::i16})
    setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_POISON}, VT, Custom);

  static const MVT::SimpleValueType VectorIntTypes[] = {
      MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
      MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};

  for (MVT VT : VectorIntTypes) {
    // Expand the following operations for the current type by default.
    // clang-format off
    setOperationAction({ISD::ADD,            ISD::AND,
                        ISD::FP_TO_SINT,     ISD::FP_TO_UINT,
                        ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT,
                        ISD::MUL,            ISD::MULHU,
                        ISD::MULHS,          ISD::OR,
                        ISD::SHL,            ISD::SRA,
                        ISD::SRL,            ISD::ROTL,
                        ISD::ROTR,           ISD::SUB,
                        ISD::SINT_TO_FP,     ISD::UINT_TO_FP,
                        ISD::SDIV,           ISD::UDIV,
                        ISD::SREM,           ISD::UREM,
                        ISD::SMUL_LOHI,      ISD::UMUL_LOHI,
                        ISD::SDIVREM,        ISD::UDIVREM,
                        ISD::SELECT,         ISD::VSELECT,
                        ISD::SELECT_CC,      ISD::XOR,
                        ISD::BSWAP,          ISD::CTPOP,
                        ISD::CTTZ,           ISD::CTLZ,
                        ISD::VECTOR_SHUFFLE, ISD::SETCC,
                        ISD::ADDRSPACECAST},
                       VT, Expand);
    // clang-format on
  }

  static const MVT::SimpleValueType FloatVectorTypes[] = {
      MVT::v2f32, MVT::v3f32,  MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
      MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};

  for (MVT VT : FloatVectorTypes) {
    setOperationAction(
        {ISD::FABS,          ISD::FMINNUM,        ISD::FMAXNUM,
         ISD::FADD,          ISD::FCEIL,          ISD::FCOS,
         ISD::FDIV,          ISD::FEXP2,          ISD::FEXP,
         ISD::FEXP10,        ISD::FLOG2,          ISD::FREM,
         ISD::FLOG,          ISD::FLOG10,         ISD::FPOW,
         ISD::FFLOOR,        ISD::FTRUNC,         ISD::FMUL,
         ISD::FMA,           ISD::FRINT,          ISD::FNEARBYINT,
         ISD::FSQRT,         ISD::FSIN,           ISD::FSUB,
         ISD::FNEG,          ISD::VSELECT,        ISD::SELECT_CC,
         ISD::FCOPYSIGN,     ISD::VECTOR_SHUFFLE, ISD::SETCC,
         ISD::FCANONICALIZE, ISD::FROUNDEVEN},
        VT, Expand);
  }

  // This causes using an unrolled select operation rather than expansion with
  // bit operations. This is in general better, but the alternative using BFI
  // instructions may be better if the select sources are SGPRs.
  static const PromotionPair SelectPromotions[] = {
      {MVT::v2f32, MVT::v2i32},   {MVT::v3f32, MVT::v3i32},
      {MVT::v4f32, MVT::v4i32},   {MVT::v5f32, MVT::v5i32},
      {MVT::v6f32, MVT::v6i32},   {MVT::v7f32, MVT::v7i32},
      {MVT::v9f32, MVT::v9i32},   {MVT::v10f32, MVT::v10i32},
      {MVT::v11f32, MVT::v11i32}, {MVT::v12f32, MVT::v12i32}};

  for (const PromotionPair &P : SelectPromotions)
    PromoteOpTo(ISD::SELECT, P.From, P.To);

  setSchedulingPreference(Sched::RegPressure);
  setJumpIsExpensive(true);

  setMinCmpXchgSizeInBits(32);
  setSupportsUnalignedAtomics(false);

  PredictableSelectIsExpensive = false;

  // We want to find all load dependencies for long chains of stores to enable
  // merging into very wide vectors. The problem is with vectors with > 4
  // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
  // vectors are a legal type, even though we have to split the loads
  // usually. When we can more precisely specify load legality per address
  // space, we should be able to make FindBetterChain/MergeConsecutiveStores
  // smarter so that they can figure out what to do in 2 iterations without all
  // N > 4 stores on the same chain.
  GatherAllAliasesMaxDepth = 16;

  // The expansion for 64-bit division is enormous.
  if (AMDGPUBypassSlowDiv)
    addBypassSlowDiv(64, 32);

  setTargetDAGCombine({ISD::BITCAST,    ISD::SHL,
                       ISD::SRA,        ISD::SRL,
                       ISD::TRUNCATE,   ISD::MUL,
                       ISD::SMUL_LOHI,  ISD::UMUL_LOHI,
                       ISD::MULHU,      ISD::MULHS,
                       ISD::SELECT,     ISD::SELECT_CC,
                       ISD::STORE,      ISD::FADD,
                       ISD::FSUB,       ISD::FNEG,
                       ISD::FABS,       ISD::AssertZext,
                       ISD::AssertSext, ISD::INTRINSIC_WO_CHAIN});

  setMaxAtomicSizeInBitsSupported(64);
  setMaxDivRemBitWidthSupported(64);
  setMaxLargeFPConvertBitWidthSupported(64);
}


/// Returns true when the node producing \p Op carries the no-signed-zeros
/// flag, i.e. the sign of a zero result may be ignored.
bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
  return Op.getNode()->getFlags().hasNoSignedZeros();
}


//===----------------------------------------------------------------------===//

// Target Information

//===----------------------------------------------------------------------===//


/// Returns true if \p Opc is one of the opcodes an fneg is considered to fold
/// into. ISD::BITCAST must never be passed here; it is special cased by the
/// caller.
LLVM_READNONE
static bool fnegFoldsIntoOpcode(unsigned Opc) {
  if (Opc == ISD::BITCAST)
    llvm_unreachable("bitcast is special cased");

  switch (Opc) {
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FMA:
  case ISD::FMAD:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINIMUM:
  case ISD::FMAXIMUM:
  case ISD::FMINIMUMNUM:
  case ISD::FMAXIMUMNUM:
  case ISD::SELECT:
  case ISD::FSIN:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FROUNDEVEN:
  case ISD::FCANONICALIZE:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::FMUL_LEGACY:
  case AMDGPUISD::FMIN_LEGACY:
  case AMDGPUISD::FMAX_LEGACY:
  case AMDGPUISD::FMED3:
    // TODO: handle llvm.amdgcn.fma.legacy
    return true;
  default:
    break;
  }

  return false;
}


static bool fnegFoldsIntoOp(const SDNode *N) {

  unsigned Opc = N->getOpcode();

  if (Opc == ISD::BITCAST) {

    // TODO: Is there a benefit to checking the conditions performFNegCombine

    // does? We don't for the other cases.

    SDValue BCSrc = N->getOperand(0);

    if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {

      return BCSrc.getNumOperands() == 2 &&

             BCSrc.getOperand(1).getValueSizeInBits() == 32;

    }


    return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;

  }


  return fnegFoldsIntoOpcode(Opc);

}


/// \returns true if the operation will definitely need to use a 64-bit
/// encoding, and thus will use a VOP3 encoding regardless of the source
/// modifiers. That is the case for f64 and for any non-SELECT node with more
/// than two operands.
LLVM_READONLY
static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
  if (VT == MVT::f64)
    return true;

  const bool HasMoreThanTwoOperands = N->getNumOperands() > 2;
  return HasMoreThanTwoOperands && N->getOpcode() != ISD::SELECT;
}


/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the
/// result type of the ISD::SELECT node \p N. Only f32 qualifies.
LLVM_READONLY
static bool selectSupportsSourceMods(const SDNode *N) {
  // TODO: Only applies if select will be vector
  const EVT ResultVT = N->getValueType(0);
  return ResultVT == MVT::f32;
}


// Returns whether node \p N is treated as accepting source modifiers on its
// operands. Most FP instructions support source modifiers, but this could be
// refined slightly.
LLVM_READONLY
static bool hasSourceMods(const SDNode *N) {
  // Memory nodes never take source modifiers.
  if (isa<MemSDNode>(N))
    return false;

  switch (N->getOpcode()) {
  case ISD::SELECT:
    return selectSupportsSourceMods(N);

  case ISD::INTRINSIC_WO_CHAIN:
    // Decided per intrinsic ID below.
    break;

  case ISD::CopyToReg:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::INLINEASM:
  case ISD::INLINEASM_BR:
  case AMDGPUISD::DIV_SCALE:
  case ISD::INTRINSIC_W_CHAIN:
  // TODO: Should really be looking at the users of the bitcast. These are
  // problematic because bitcasts are used to legalize all stores to integer
  // types.
  case ISD::BITCAST:
    return false;

  default:
    return true;
  }

  // Chainless intrinsic: operand 0 holds the intrinsic ID. The interpolation
  // intrinsics are the only ones rejected.
  switch (N->getConstantOperandVal(0)) {
  case Intrinsic::amdgcn_interp_p1:
  case Intrinsic::amdgcn_interp_p2:
  case Intrinsic::amdgcn_interp_mov:
  case Intrinsic::amdgcn_interp_p1_f16:
  case Intrinsic::amdgcn_interp_p2_f16:
    return false;
  default:
    return true;
  }
}


bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,

                                                 unsigned CostThreshold) {

  // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus

  // it is truly free to use a source modifier in all cases. If there are

  // multiple users but for each one will necessitate using VOP3, there will be

  // a code size increase. Try to avoid increasing code size unless we know it

  // will save on the instruction count.

  unsigned NumMayIncreaseSize = 0;

  MVT VT = N->getValueType(0).getScalarType().getSimpleVT();


  assert(!N->use_empty());


  // XXX - Should this limit number of uses to check?

  for (const SDNode *U : N->users()) {

    if (!hasSourceMods(U))

      return false;


    if (!opMustUseVOP3Encoding(U, VT)) {

      if (++NumMayIncreaseSize > CostThreshold)

        return false;

    }

  }


  return true;

}


/// Pick the integer type used to return a scalar of type \p VT: i32 for
/// anything up to 32 bits, otherwise the size rounded up to a multiple of 32
/// bits. \p ExtendKind is not consulted.
EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
                                              ISD::NodeType ExtendKind) const {
  assert(!VT.isVector() && "only scalar expected");

  const unsigned Bits = VT.getSizeInBits();
  if (Bits <= 32)
    return MVT::i32;

  // Round to the next multiple of 32-bits.
  const unsigned NumDWords = (Bits + 31) / 32;
  return EVT::getIntegerVT(Context, 32 * NumDWords);
}


/// Vector indices are always reported as 32 bits wide; the data layout is
/// ignored.
unsigned AMDGPUTargetLowering::getVectorIdxWidth(const DataLayout &) const {
  const unsigned IdxBits = 32;
  return IdxBits;
}


/// Every kind of select is reported as supported; \p SelType is ignored.
bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
  const bool Supported = true;
  return Supported;
}


// The backend supports 32 and 64 bit floating point immediates.

// FIXME: Why are we reporting vectors of FP immediates as legal?


bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,

                                        bool ForCodeSize) const {

  return isTypeLegal(VT.getScalarType());

}


// We don't want to shrink f64 / f32 constants. Any other scalar type may be
// shrunk.
bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
  const EVT EltVT = VT.getScalarType();
  const bool IsF32OrF64 = EltVT == MVT::f32 || EltVT == MVT::f64;
  return !IsF32OrF64;
}


/// Decide whether the load \p N should be narrowed to \p NewVT. The generic
/// TargetLoweringBase answer is consulted first and can veto the narrowing.
bool AMDGPUTargetLowering::shouldReduceLoadWidth(
    SDNode *N, ISD::LoadExtType ExtTy, EVT NewVT,
    std::optional<unsigned> ByteOffset) const {
  // TODO: This may be worth removing. Check regression tests for diffs.
  const bool GenericAllows =
      TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT, ByteOffset);
  if (!GenericAllows)
    return false;

  // If we are reducing to a 32-bit load or a smaller multi-dword load,
  // this is always better.
  const unsigned NewBits = NewVT.getStoreSizeInBits();
  if (NewBits >= 32)
    return true;

  const unsigned OldBits = N->getValueType(0).getStoreSizeInBits();

  MemSDNode *MemN = cast<MemSDNode>(N);
  const unsigned AddrSpace = MemN->getAddressSpace();

  // Do not shrink an aligned scalar load to sub-dword.
  // Scalar engine cannot do sub-dword loads.
  // TODO: Update this for GFX12 which does have scalar sub-dword loads.
  if (OldBits >= 32 && NewBits < 32 && MemN->getAlign() >= Align(4)) {
    const bool InCandidateAddrSpace =
        AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
        AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
        (isa<LoadSDNode>(N) && AddrSpace == AMDGPUAS::GLOBAL_ADDRESS &&
         MemN->isInvariant());
    if (InCandidateAddrSpace && AMDGPU::isUniformMMO(MemN->getMemOperand()))
      return false;
  }

  // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
  // extloads, so doing one requires using a buffer_load. In cases where we
  // still couldn't use a scalar load, using the wider load shouldn't really
  // hurt anything.

  // If the old size already had to be an extload, there's no harm in continuing
  // to reduce the width.
  return OldBits < 32;
}


/// Return true if it is worthwhile to turn a load of \p LoadTy into a load of
/// the equally sized \p CastTy. Loads that already have i32 elements are left
/// alone, as are casts to sub-dword elements that are no wider than the loaded
/// elements; otherwise the access described by \p MMO must be allowed and
/// reported as fast for \p CastTy.
bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
                                                   const SelectionDAG &DAG,
                                                   const MachineMemOperand &MMO) const {
  assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());

  if (LoadTy.getScalarType() == MVT::i32)
    return false;

  const unsigned LoadEltBits = LoadTy.getScalarSizeInBits();
  const unsigned CastEltBits = CastTy.getScalarSizeInBits();
  if (CastEltBits < 32 && LoadEltBits >= CastEltBits)
    return false;

  unsigned Fast = 0;
  if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
                                      CastTy, MMO, &Fast))
    return false;

  return Fast != 0;
}


// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
// profitable with the expansion for 64-bit since it's generally good to
// speculate things.
//
// Speculation is therefore reported as cheap for every type; Ty is ignored.
bool AMDGPUTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
  const bool Cheap = true;
  return Cheap;
}


/// Speculating ctlz is reported as cheap for every type; \p Ty is ignored.
bool AMDGPUTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
  const bool Cheap = true;
  return Cheap;
}


/// Return true for nodes that are treated as uniform regardless of their
/// operands: entry tokens and token factors, AMDGPUISD::SETCC, intrinsics for
/// which AMDGPU::isIntrinsicAlwaysUniform holds, and loads from the 32-bit
/// constant address space.
bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
  switch (N->getOpcode()) {
  case ISD::EntryToken:
  case ISD::TokenFactor:
  case AMDGPUISD::SETCC: // ballot-style instruction
    return true;
  case ISD::INTRINSIC_WO_CHAIN: {
    // Without a chain the intrinsic ID is operand 0.
    const unsigned IntrID = N->getConstantOperandVal(0);
    return AMDGPU::isIntrinsicAlwaysUniform(IntrID);
  }
  case ISD::INTRINSIC_W_CHAIN: {
    // Operand 0 is the chain, so the intrinsic ID is operand 1.
    const unsigned IntrID = N->getConstantOperandVal(1);
    return AMDGPU::isIntrinsicAlwaysUniform(IntrID);
  }
  case ISD::LOAD: {
    const unsigned AddrSpace =
        cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace();
    return AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
  }
  default:
    return false;
  }
}


/// Target hook for building the negation of \p Op. AMDGPUISD::RCP is negated
/// by negating its source; FMA/FMAD are refused when some user cannot take a
/// source modifier; everything else goes to the generic implementation.
SDValue AMDGPUTargetLowering::getNegatedExpression(
    SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
    NegatibleCost &Cost, unsigned Depth) const {
  const unsigned Opc = Op.getOpcode();

  if (Opc == AMDGPUISD::RCP) {
    SDValue NegSrc = getNegatedExpression(Op.getOperand(0), DAG,
                                          LegalOperations, ForCodeSize, Cost,
                                          Depth + 1);
    if (!NegSrc)
      return SDValue();

    return DAG.getNode(AMDGPUISD::RCP, SDLoc(Op), Op.getValueType(), NegSrc,
                       Op->getFlags());
  }

  // Negating a fma is not free if it has users without source mods.
  if ((Opc == ISD::FMA || Opc == ISD::FMAD) &&
      !allUsesHaveSourceMods(Op.getNode()))
    return SDValue();

  return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
                                              ForCodeSize, Cost, Depth);
}


//===---------------------------------------------------------------------===//

// Target Properties

//===---------------------------------------------------------------------===//


/// Return true if fabs of \p VT is free. Only the scalar types f32, f64, f16
/// and bf16 qualify.
bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
  assert(VT.isFloatingPoint());

  // Packed operations do not have a fabs modifier.
  // Report this based on the end legalized type.
  for (MVT::SimpleValueType FreeVT :
       {MVT::f32, MVT::f64, MVT::f16, MVT::bf16}) {
    if (VT == FreeVT)
      return true;
  }

  return false;
}


/// Return true if fneg of \p VT is free. Unlike isFAbsFree, the check is made
/// on the scalar type, so vectors of f32, f64, f16 and bf16 qualify too.
bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
  assert(VT.isFloatingPoint());

  // Report this based on the end legalized type.
  const EVT EltVT = VT.getScalarType();
  const bool IsF32OrF64 = EltVT == MVT::f32 || EltVT == MVT::f64;
  const bool IsHalfLike = EltVT == MVT::f16 || EltVT == MVT::bf16;
  return IsF32OrF64 || IsHalfLike;
}


/// Storing a vector constant is always reported as cheap; none of the
/// arguments are consulted.
bool AMDGPUTargetLowering::storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT,
                                                        unsigned NumElem,
                                                        unsigned AS) const {
  const bool Cheap = true;
  return Cheap;
}


/// Always prefer the build_vector source over an extracted element; \p VecVT
/// is ignored.
bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
  // There are few operations which truly have vector input operands. Any vector
  // operation is going to involve operations on each component, and a
  // build_vector will be a copy per element, so it always makes sense to use a
  // build_vector input in place of the extracted element to avoid a copy into a
  // super register.
  //
  // We should probably only do this if all users are extracts only, but this
  // should be the common case.
  const bool Prefer = true;
  return Prefer;
}


/// Return true if truncating \p Source to \p Dest is free: the destination
/// must be strictly narrower and a multiple of 32 bits wide.
bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
  // Truncate is just accessing a subregister.
  const unsigned SrcBits = Source.getSizeInBits();
  const unsigned DstBits = Dest.getSizeInBits();

  if (DstBits >= SrcBits)
    return false;

  return DstBits % 32 == 0;
}


/// IR-type flavour of isTruncateFree, working on scalar sizes. With 16-bit
/// instructions available, truncating to 16 bits is free from any source of
/// at least 32 bits; otherwise the destination must be strictly narrower and
/// a multiple of 32 bits wide.
bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
  // Truncate is just accessing a subregister.
  const unsigned SrcBits = Source->getScalarSizeInBits();
  const unsigned DstBits = Dest->getScalarSizeInBits();

  const bool TruncTo16 = DstBits == 16 && Subtarget->has16BitInsts();
  return TruncTo16 ? SrcBits >= 32
                   : (DstBits < SrcBits && DstBits % 32 == 0);
}


/// IR-type flavour of isZExtFree, working on scalar sizes. With 16-bit
/// instructions available, extending from 16 bits to 32 bits or more is free;
/// otherwise only 32 -> 64 bit extension is.
bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
  const unsigned SrcBits = Src->getScalarSizeInBits();
  const unsigned DstBits = Dest->getScalarSizeInBits();

  const bool ExtFrom16 = SrcBits == 16 && Subtarget->has16BitInsts();
  return ExtFrom16 ? DstBits >= 32 : (SrcBits == 32 && DstBits == 64);
}


/// Return true if zero-extending \p Src to \p Dest is free: i16 to i32 or
/// i64, and i32 to i64.
bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
  // Any register load of a 64-bit value really requires 2 32-bit moves. For all
  // practical purposes, the extra mov 0 to load a 64-bit is free.  As used,
  // this will enable reducing 64-bit operations to 32-bit, which is always
  // good.
  const bool DestIsI64 = Dest == MVT::i64;

  if (Src == MVT::i16)
    return Dest == MVT::i32 || DestIsI64;

  return Src == MVT::i32 && DestIsI64;
}


/// Return true if narrowing the operation \p N from \p SrcVT to \p DestVT is
/// considered profitable.
bool AMDGPUTargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
                                                 EVT DestVT) const {
  switch (N->getOpcode()) {
  case ISD::ADD:
  case ISD::SUB:
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
  case ISD::MUL:
  case ISD::SETCC:
  case ISD::SELECT:
  case ISD::SMIN:
  case ISD::SMAX:
  case ISD::UMIN:
  case ISD::UMAX: {
    if (!isTypeLegal(MVT::i16))
      return true;

    // Check if VOP3P: vector destinations with a legal v2i16 add are always
    // narrowed.
    if (DestVT.isVector() && isOperationLegal(ISD::ADD, MVT::v2i16))
      return true;

    // Don't narrow back down to i16 if promoted to i32 already.
    const bool UndoesPromotion = !N->isDivergent() && DestVT.isInteger() &&
                                 DestVT.getScalarSizeInBits() > 1 &&
                                 DestVT.getScalarSizeInBits() <= 16 &&
                                 SrcVT.getScalarSizeInBits() > 16;
    return !UndoesPromotion;
  }
  default:
    break;
  }

  // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
  // limited number of native 64-bit operations. Shrinking an operation to fit
  // in a single 32-bit register should always be helpful. As currently used,
  // this is much less general than the name suggests, and is only used in
  // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
  // not profitable, and may actually be harmful.
  if (!isa<LoadSDNode>(N))
    return true;

  return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
}


bool AMDGPUTargetLowering::isDesirableToCommuteWithShift(

    const SDNode* N, CombineLevel Level) const {

  assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||

          N->getOpcode() == ISD::SRL) &&

         "Expected shift op");


  SDValue ShiftLHS = N->getOperand(0);

  if (!ShiftLHS->hasOneUse())

    return false;


  if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&

      !ShiftLHS.getOperand(0)->hasOneUse())

    return false;


  // Always commute pre-type legalization and right shifts.

  // We're looking for shl(or(x,y),z) patterns.

  if (Level < CombineLevel::AfterLegalizeTypes ||

      N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR)

    return true;


  // If only user is a i32 right-shift, then don't destroy a BFE pattern.

  if (N->getValueType(0) == MVT::i32 && N->hasOneUse() &&

      (N->user_begin()->getOpcode() == ISD::SRA ||

       N->user_begin()->getOpcode() == ISD::SRL))

    return false;


  // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.

  auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {

    if (LHS.getOpcode() != ISD::SHL)

      return false;

    auto *RHSLd = dyn_cast<LoadSDNode>(RHS);

    auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));

    auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));

    return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&

           LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&

           RHSLd->getExtensionType() == ISD::ZEXTLOAD;

  };

  SDValue LHS = N->getOperand(0).getOperand(0);

  SDValue RHS = N->getOperand(0).getOperand(1);

  return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));

}


//===---------------------------------------------------------------------===//

// TargetLowering Callbacks

//===---------------------------------------------------------------------===//


/// Map the calling convention \p CC to the assignment function used for call
/// arguments. Kernel and unknown conventions are reported as a fatal usage
/// error. \p IsVarArg is not consulted.
CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                  bool IsVarArg) {
  switch (CC) {
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return CC_AMDGPU_Func;

  case CallingConv::AMDGPU_Gfx:
  case CallingConv::AMDGPU_Gfx_WholeWave:
    return CC_SI_Gfx;

  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
    return CC_AMDGPU_CS_CHAIN;

  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return CC_AMDGPU;

  // Kernels are listed explicitly but handled like any other unsupported
  // convention.
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
  default:
    reportFatalUsageError("unsupported calling convention for call");
  }
}


/// Map the calling convention \p CC to the assignment function used for
/// return values. Kernels must never reach this function, and unknown
/// conventions are reported as a fatal usage error. \p IsVarArg is not
/// consulted.
CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                    bool IsVarArg) {
  switch (CC) {
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return RetCC_AMDGPU_Func;

  case CallingConv::AMDGPU_Gfx:
  case CallingConv::AMDGPU_Gfx_WholeWave:
    return RetCC_SI_Gfx;

  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return RetCC_SI_Shader;

  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    llvm_unreachable("kernels should not be handled here");

  default:
    reportFatalUsageError("unsupported calling convention");
  }
}


/// The SelectionDAGBuilder will automatically promote function arguments
/// with illegal types.  However, this does not work for the AMDGPU targets
/// since the function arguments are stored in memory as these illegal types.
/// In order to handle this properly we need to get the original types sizes
/// from the LLVM IR Function and fixup the ISD::InputArg values before
/// passing them to AnalyzeFormalArguments()
///
/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
/// input values across multiple registers.  Each item in the Ins array
/// represents a single value that will be stored in registers.  Ins[x].VT is
/// the value type of the value that will be stored in the register, so
/// whatever SDNode we lower the argument to needs to be this type.
///
/// In order to correctly lower the arguments we need to know the size of each
/// argument.  Since Ins[x].VT gives us the size of the register that will
/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
/// for the original function argument so that we can deduce the correct memory
/// type to use for Ins[x].  In most cases the correct memory type will be
/// Ins[x].ArgVT.  However, this will not always be the case.  If, for example,
/// we have a kernel argument of type v8i8, this argument will be split into
/// 8 parts and each part will be represented by its own item in the Ins array.
/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
/// the argument before it was split.  From this, we deduce that the memory type
/// for each individual part is i8.  We pass the memory type as LocVT to the
/// calling convention analysis function and the register type (Ins[x].VT) as
/// the ValVT.
///
/// The implementation walks the IR arguments of the function rather than
/// \p Ins, and records one custom memory location in \p State per register
/// of every argument value.
void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
  CCState &State,
  const SmallVectorImpl<ISD::InputArg> &Ins) const {
  const MachineFunction &MF = State.getMachineFunction();
  const Function &Fn = MF.getFunction();
  LLVMContext &Ctx = Fn.getContext();
  const unsigned ExplicitOffset = Subtarget->getExplicitKernelArgOffset();
  CallingConv::ID CC = Fn.getCallingConv();
  const DataLayout &DL = Fn.getDataLayout();

  // Running end offset of the explicit arguments laid out so far, not
  // including ExplicitOffset.
  uint64_t ExplicitArgOffset = 0;

  // Index of the next location to record.
  unsigned InIndex = 0;

  for (const Argument &Arg : Fn.args()) {
    // byref arguments are laid out using the pointee type and the alignment
    // given on the parameter.
    const bool IsByRef = Arg.hasByRefAttr();
    Type *BaseArgTy = Arg.getType();
    Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
    Align Alignment = DL.getValueOrABITypeAlignment(
        IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);

    const uint64_t AlignedOffset = alignTo(ExplicitArgOffset, Alignment);
    uint64_t ArgOffset = AlignedOffset + ExplicitOffset;
    ExplicitArgOffset = AlignedOffset + AllocSize;

    // We're basically throwing away everything passed into us and starting over
    // to get accurate in-memory offsets. The "PartOffset" is completely useless
    // to us as computed in Ins.
    //
    // We also need to figure out what type legalization is trying to do to get
    // the correct memory offsets.

    SmallVector<EVT, 16> ValueVTs;
    SmallVector<uint64_t, 16> Offsets;
    ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, /*MemVTs=*/nullptr,
                    &Offsets, ArgOffset);

    for (unsigned Value = 0, NumValues = ValueVTs.size();
         Value != NumValues; ++Value) {
      uint64_t BasePartOffset = Offsets[Value];

      EVT ArgVT = ValueVTs[Value];
      EVT MemVT = ArgVT;
      MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
      unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);

      if (NumRegs == 1) {
        // This argument is not split, so the IR type is the memory type.
        if (ArgVT.isExtended()) {
          // We have an extended type, like i24, so we should just use the
          // register type.
          MemVT = RegisterVT;
        } else {
          MemVT = ArgVT;
        }
      } else if (ArgVT.isVector() && RegisterVT.isVector() &&
                 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
        assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
        // We have a vector value which has been split into a vector with
        // the same scalar type, but fewer elements.  This should handle
        // all the floating-point vector types.
        MemVT = RegisterVT;
      } else if (ArgVT.isVector() &&
                 ArgVT.getVectorNumElements() == NumRegs) {
        // This arg has been split so that each element is stored in a separate
        // register.
        MemVT = ArgVT.getScalarType();
      } else if (ArgVT.isExtended()) {
        // We have an extended type, like i65.
        MemVT = RegisterVT;
      } else {
        // Split the store size evenly over the registers.
        unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
        assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
        if (RegisterVT.isInteger()) {
          MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
        } else if (RegisterVT.isVector()) {
          assert(!RegisterVT.getScalarType().isFloatingPoint());
          unsigned NumElements = RegisterVT.getVectorNumElements();
          assert(MemoryBits % NumElements == 0);
          // This vector type has been split into another vector type with
          // a different elements size.
          EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
                                           MemoryBits / NumElements);
          MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
        } else {
          llvm_unreachable("cannot deduce memory type.");
        }
      }

      // Convert one element vectors to scalar.
      if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
        MemVT = MemVT.getScalarType();

      // Round up vec3/vec5 argument.
      if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
        MemVT = MemVT.getPow2VectorType(State.getContext());
      } else if (!MemVT.isSimple() && !MemVT.isVector()) {
        MemVT = MemVT.getRoundIntegerType(State.getContext());
      }

      // Each register of this value gets its own location, placed one MemVT
      // store size after the previous one.
      unsigned PartOffset = 0;
      for (unsigned i = 0; i != NumRegs; ++i) {
        State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
                                               BasePartOffset + PartOffset,
                                               MemVT.getSimpleVT(),
                                               CCValAssign::Full));
        PartOffset += MemVT.getStoreSize();
      }
    }
  }
}


/// Lower a return by emitting AMDGPUISD::ENDPGM on \p Chain. The calling
/// convention, the outgoing values and \p isVarArg are not consulted.
SDValue AMDGPUTargetLowering::LowerReturn(
  SDValue Chain, CallingConv::ID CallConv,
  bool isVarArg,
  const SmallVectorImpl<ISD::OutputArg> &Outs,
  const SmallVectorImpl<SDValue> &OutVals,
  const SDLoc &DL, SelectionDAG &DAG) const {
  // FIXME: Fails for r600 tests
  //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
  // "wave terminate should not have return values");
  SDValue EndPgm = DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
  return EndPgm;
}


//===---------------------------------------------------------------------===//

// Target specific lowering

//===---------------------------------------------------------------------===//


/// Selects the correct CCAssignFn for a given CallingConvention value by
/// forwarding to AMDGPUCallLowering::CCAssignFnForCall.
CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                    bool IsVarArg) {
  CCAssignFn *AssignFn = AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
  return AssignFn;
}


/// Selects the CCAssignFn used for return values by forwarding to
/// AMDGPUCallLowering::CCAssignFnForReturn.
CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                      bool IsVarArg) {
  CCAssignFn *AssignFn = AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
  return AssignFn;
}


/// Build a TokenFactor joining \p Chain with the chain results of every load
/// hanging off the entry node that reads, through a negative frame index, a
/// byte range overlapping the frame object \p ClobberedFI.
SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
                                                  SelectionDAG &DAG,
                                                  MachineFrameInfo &MFI,
                                                  int ClobberedFI) const {
  // Inclusive byte range of the clobbered object.
  const int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
  const int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;

  // Include the original chain at the beginning of the list. When this is
  // used by target LowerCall hooks, this helps legalize find the
  // CALLSEQ_BEGIN node.
  SmallVector<SDValue, 8> ArgChains;
  ArgChains.push_back(Chain);

  // Add a chain value for each stack argument load that overlaps the
  // clobbered object.
  for (SDNode *U : DAG.getEntryNode().getNode()->users()) {
    auto *Ld = dyn_cast<LoadSDNode>(U);
    if (!Ld)
      continue;

    auto *FI = dyn_cast<FrameIndexSDNode>(Ld->getBasePtr());
    if (!FI || FI->getIndex() >= 0)
      continue;

    const int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
    const int64_t InLastByte =
        InFirstByte + (MFI.getObjectSize(FI->getIndex()) - 1);

    const bool StartsInsideLoaded =
        InFirstByte <= FirstByte && FirstByte <= InLastByte;
    const bool LoadStartsInside =
        FirstByte <= InFirstByte && InFirstByte <= LastByte;
    if (StartsInsideLoaded || LoadStartsInside)
      ArgChains.push_back(SDValue(Ld, 1));
  }

  // Build a tokenfactor for all the chains.
  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
}


/// Diagnose a call that cannot be lowered. The message is \p Reason followed
/// by the callee name (or "<unknown>"). For non-tail calls every expected
/// result is filled with poison, and an empty call sequence is returned as
/// the chain (just the incoming chain on r600).
SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
                                                 SmallVectorImpl<SDValue> &InVals,
                                                 StringRef Reason) const {
  SelectionDAG &DAG = CLI.DAG;
  SDValue Callee = CLI.Callee;
  const Function &Fn = DAG.getMachineFunction().getFunction();

  // Recover a callee name when the callee is an external symbol or a global.
  StringRef FuncName("<unknown>");
  if (const auto *ES = dyn_cast<ExternalSymbolSDNode>(Callee))
    FuncName = ES->getSymbol();
  else if (const auto *GA = dyn_cast<GlobalAddressSDNode>(Callee))
    FuncName = GA->getGlobal()->getName();

  // The concatenated message is a temporary, so the diagnostic is built and
  // emitted within a single expression.
  DAG.getContext()->diagnose(
      DiagnosticInfoUnsupported(Fn, Reason + FuncName, CLI.DL.getDebugLoc()));

  if (!CLI.IsTailCall) {
    for (ISD::InputArg &In : CLI.Ins)
      InVals.push_back(DAG.getPOISON(In.VT));
  }

  // FIXME: Hack because R600 doesn't handle callseq pseudos yet.
  const bool IsR600 =
      getTargetMachine().getTargetTriple().getArch() == Triple::r600;
  if (IsR600)
    return CLI.Chain;

  SDValue SeqStart = DAG.getCALLSEQ_START(CLI.Chain, 0, 0, CLI.DL);
  return DAG.getCALLSEQ_END(SeqStart, 0, 0, /*InGlue=*/SDValue(), CLI.DL);
}


/// Calls are not supported at this level: every call is reported through
/// lowerUnhandledCall.
SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
                                        SmallVectorImpl<SDValue> &InVals) const {
  const StringRef Reason("unsupported call to function ");
  return lowerUnhandledCall(CLI, InVals, Reason);
}


/// Dynamic allocas are unsupported here: emit a diagnostic and replace the
/// node with a zero constant merged with its first operand.
SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                                      SelectionDAG &DAG) const {
  const Function &Fn = DAG.getMachineFunction().getFunction();

  DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
      Fn, "unsupported dynamic alloca", SDLoc(Op).getDebugLoc()));

  // Results, in order: a zero of the node's value type, then operand 0.
  SDValue Replacements[] = {DAG.getConstant(0, SDLoc(), Op.getValueType()),
                            Op.getOperand(0)};
  return DAG.getMergeValues(Replacements, SDLoc());
}


/// Dispatch custom lowering of \p Op to the matching Lower* helper. Reaching
/// an opcode without a helper prints the node and is treated as unreachable.
SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
                                             SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  case ISD::SIGN_EXTEND_INREG:
    return LowerSIGN_EXTEND_INREG(Op, DAG);
  case ISD::CONCAT_VECTORS:
    return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::EXTRACT_SUBVECTOR:
    return LowerEXTRACT_SUBVECTOR(Op, DAG);
  case ISD::UDIVREM:
    return LowerUDIVREM(Op, DAG);
  case ISD::SDIVREM:
    return LowerSDIVREM(Op, DAG);
  case ISD::FCEIL:
    return LowerFCEIL(Op, DAG);
  case ISD::FTRUNC:
    return LowerFTRUNC(Op, DAG);
  case ISD::FRINT:
    return LowerFRINT(Op, DAG);
  case ISD::FNEARBYINT:
    return LowerFNEARBYINT(Op, DAG);
  case ISD::FROUNDEVEN:
    return LowerFROUNDEVEN(Op, DAG);
  case ISD::FROUND:
    return LowerFROUND(Op, DAG);
  case ISD::FFLOOR:
    return LowerFFLOOR(Op, DAG);
  case ISD::FLOG2:
    return LowerFLOG2(Op, DAG);
  case ISD::FLOG:
  case ISD::FLOG10:
    return LowerFLOGCommon(Op, DAG);
  case ISD::FEXP:
  case ISD::FEXP10:
    return lowerFEXP(Op, DAG);
  case ISD::FEXP2:
    return lowerFEXP2(Op, DAG);
  case ISD::SINT_TO_FP:
    return LowerSINT_TO_FP(Op, DAG);
  case ISD::UINT_TO_FP:
    return LowerUINT_TO_FP(Op, DAG);
  case ISD::FP_TO_FP16:
    return LowerFP_TO_FP16(Op, DAG);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
    return LowerFP_TO_INT(Op, DAG);
  case ISD::FP_TO_SINT_SAT:
  case ISD::FP_TO_UINT_SAT:
    return LowerFP_TO_INT_SAT(Op, DAG);
  case ISD::CTTZ:
  case ISD::CTTZ_ZERO_POISON:
  case ISD::CTLZ:
  case ISD::CTLZ_ZERO_POISON:
    return LowerCTLZ_CTTZ(Op, DAG);
  case ISD::CTLS:
    return LowerCTLS(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC:
    return LowerDYNAMIC_STACKALLOC(Op, DAG);
  default:
    // Dump the offending node before giving up.
    Op->print(errs(), &DAG);
    llvm_unreachable("Custom lowering code for this "
                     "instruction is not implemented yet!");
    break;
  }
  return Op;
}


/// Custom result replacement for \p N. For the opcodes handled here the
/// matching lowering helper is run on result 0, and its value is appended to
/// \p Results when the helper produced one.
void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
                                              SmallVectorImpl<SDValue> &Results,
                                              SelectionDAG &DAG) const {
  SDValue Lowered;

  switch (N->getOpcode()) {
  case ISD::SIGN_EXTEND_INREG:
    // Different parts of legalization seem to interpret which type of
    // sign_extend_inreg is the one to check for custom lowering. The extended
    // from type is what really matters, but some places check for custom
    // lowering of the result type. This results in trying to use
    // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
    // nothing here and let the illegal result integer be handled normally.
    return;
  case ISD::FLOG2:
    Lowered = LowerFLOG2(SDValue(N, 0), DAG);
    break;
  case ISD::FLOG:
  case ISD::FLOG10:
    Lowered = LowerFLOGCommon(SDValue(N, 0), DAG);
    break;
  case ISD::FEXP2:
    Lowered = lowerFEXP2(SDValue(N, 0), DAG);
    break;
  case ISD::FEXP:
  case ISD::FEXP10:
    Lowered = lowerFEXP(SDValue(N, 0), DAG);
    break;
  case ISD::CTLZ:
  case ISD::CTLZ_ZERO_POISON:
    Lowered = lowerCTLZResults(SDValue(N, 0u), DAG);
    break;
  default:
    return;
  }

  // A null value means the helper declined; leave Results untouched.
  if (Lowered)
    Results.push_back(Lowered);
}


/// Lower a block address node to its target form, keeping the block address,
/// value type, offset and target flags of \p Op.
SDValue AMDGPUTargetLowering::LowerBlockAddress(SDValue Op,
                                                SelectionDAG &DAG) const {
  // getTargetBlockAddress takes no debug location, so none is built here.
  BlockAddressSDNode *BA = cast<BlockAddressSDNode>(Op);
  EVT VT = Op.getValueType();
  return DAG.getTargetBlockAddress(BA->getBlockAddress(), VT, BA->getOffset(),
                                   BA->getTargetFlags());
}


/// Lower a global address to a constant when possible. In non-entry functions
/// a global with an absolute LDS address becomes that address. Globals in the
/// local or region address space become the offset handed out by
/// \p MFI->allocateLDSGlobal; in a non-entry function this is replaced by a
/// warning, a trap and a poison value unless the global is the module LDS
/// variable or a named barrier. Any other global yields an empty SDValue.
SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunctionInfo *MFI,
                                                 SDValue Op,
                                                 SelectionDAG &DAG) const {
  const DataLayout &DL = DAG.getDataLayout();
  GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
  const GlobalValue *GV = G->getGlobal();
  const bool IsEntryFn = MFI->isModuleEntryFunction();

  if (!IsEntryFn) {
    const auto IsNamedBarrier =
        AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV));
    std::optional<uint32_t> Address =
        AMDGPUMachineFunctionInfo::getLDSAbsoluteAddress(*GV);

    if (Address) {
      if (IsNamedBarrier) {
        // The barrier count is the global's size divided by 16.
        unsigned BarCnt = cast<GlobalVariable>(GV)->getGlobalSize(DL) / 16;
        MFI->recordNumNamedBarriers(Address.value(), BarCnt);
      }
      return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
    }

    if (IsNamedBarrier)
      llvm_unreachable("named barrier should have an assigned address");
  }

  const unsigned AddrSpace = G->getAddressSpace();
  if (AddrSpace != AMDGPUAS::LOCAL_ADDRESS &&
      AddrSpace != AMDGPUAS::REGION_ADDRESS)
    return SDValue();

  if (!IsEntryFn && GV->getName() != "llvm.amdgcn.module.lds" &&
      !AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV))) {
    SDLoc SL(Op);
    const Function &Fn = DAG.getMachineFunction().getFunction();
    DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
        Fn, "local memory global used by non-kernel function",
        SL.getDebugLoc(), DS_Warning));

    // We currently don't have a way to correctly allocate LDS objects that
    // aren't directly associated with a kernel. We do force inlining of
    // functions that use local objects. However, if these dead functions are
    // not eliminated, we don't want a compile time error. Just emit a warning
    // and a trap, since there should be no callable path here.
    SDValue Trap = DAG.getNode(ISD::TRAP, SL, MVT::Other, DAG.getEntryNode());
    SDValue NewRoot = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
                                  Trap, DAG.getRoot());
    DAG.setRoot(NewRoot);
    return DAG.getPOISON(Op.getValueType());
  }

  // XXX: What does the value of G->getOffset() mean?
  assert(G->getOffset() == 0 &&
       "Do not know what to do with an non-zero offset");

  // TODO: We could emit code to handle the initialization somewhere.
  // We ignore the initializer for now and legalize it to allow selection.
  // The initializer will anyway get errored out during assembly emission.
  unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
  return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
}


/// Lower CONCAT_VECTORS to a BUILD_VECTOR. When the elements are narrower
/// than 32 bits and each operand is a whole number of dwords, the operands
/// are first bitcast to i32 (vectors) so the build_vector has i32 elements.
SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
                                                  SelectionDAG &DAG) const {
  SDLoc SL(Op);
  const EVT VT = Op.getValueType();
  SmallVector<SDValue, 8> Elts;

  if (VT.getVectorElementType().getSizeInBits() < 32) {
    const unsigned SrcBits = Op.getOperand(0).getValueType().getSizeInBits();
    if (SrcBits >= 32 && SrcBits % 32 == 0) {
      const unsigned DWordsPerSrc = SrcBits / 32;

      // Type each operand is reinterpreted as: i32 or a vector of i32.
      EVT CastVT = MVT::i32;
      if (DWordsPerSrc != 1)
        CastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, DWordsPerSrc);

      for (const SDUse &U : Op->ops()) {
        SDValue Cast = DAG.getNode(ISD::BITCAST, SL, CastVT, U.get());
        if (DWordsPerSrc == 1)
          Elts.push_back(Cast);
        else
          DAG.ExtractVectorElements(Cast, Elts);
      }

      const EVT DWordVecVT =
          EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                           DWordsPerSrc * Op.getNumOperands());
      SDValue DWordVec = DAG.getBuildVector(DWordVecVT, SL, Elts);
      return DAG.getNode(ISD::BITCAST, SL, VT, DWordVec);
    }
  }

  // Generic path: gather every element of every operand.
  for (const SDUse &U : Op->ops())
    DAG.ExtractVectorElements(U.get(), Elts);

  return DAG.getBuildVector(VT, SL, Elts);
}


/// Lower EXTRACT_SUBVECTOR to element extracts plus a BUILD_VECTOR. Vectors
/// of 16-bit elements starting at an even index are handled as i32 elements
/// so that two source elements are moved at a time.
SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
                                                     SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SmallVector<SDValue, 8> Elts;
  const unsigned Start = Op.getConstantOperandVal(1);
  const EVT VT = Op.getValueType();
  const EVT SrcVT = Op.getOperand(0).getValueType();

  if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
    const unsigned NumElt = VT.getVectorNumElements();
    const unsigned NumSrcElt = SrcVT.getVectorNumElements();
    assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");

    // Extract 32-bit registers at a time.
    const EVT DWordSrcVT =
        EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2);
    SDValue DWordSrc =
        DAG.getNode(ISD::BITCAST, SL, DWordSrcVT, Op.getOperand(0));
    DAG.ExtractVectorElements(DWordSrc, Elts, Start / 2, NumElt / 2);

    SDValue Packed;
    if (NumElt == 2) {
      // A two element result is exactly one extracted dword.
      Packed = Elts[0];
    } else {
      const EVT DWordVT =
          EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElt / 2);
      Packed = DAG.getBuildVector(DWordVT, SL, Elts);
    }

    return DAG.getNode(ISD::BITCAST, SL, VT, Packed);
  }

  DAG.ExtractVectorElements(Op.getOperand(0), Elts, Start,
                            VT.getVectorNumElements());
  return DAG.getBuildVector(VT, SL, Elts);
}


// TODO: Handle fabs too

/// Look through a single FNEG: returns the operand of \p Val when \p Val is an
/// ISD::FNEG node, and \p Val itself otherwise.
static SDValue peekFNeg(SDValue Val) {
  return Val.getOpcode() == ISD::FNEG ? Val.getOperand(0) : Val;
}


/// Look through sign-manipulating wrappers around \p Val.
///
/// At most one node of each kind is stripped, and only in this order: FNEG,
/// then FABS, then FCOPYSIGN. For each of them the first operand is taken.
static SDValue peekFPSignOps(SDValue Val) {
  for (ISD::NodeType SignOpc : {ISD::FNEG, ISD::FABS, ISD::FCOPYSIGN}) {
    if (Val.getOpcode() == SignOpc)
      Val = Val.getOperand(0);
  }
  return Val;
}


/// Build an FMIN_LEGACY / FMAX_LEGACY node for a select on a floating-point
/// compare of \p LHS and \p RHS with condition code \p CC.
///
/// The only thing inspected about the select operands is whether \p True is
/// \p LHS; any other arrangement is treated as the "true value is RHS" case.
/// Equality, ordered/unordered-only and constant condition codes are not
/// combined and yield an empty SDValue. The ordered and "don't care"
/// less/greater codes are only combined once the DAG has been legalized (or
/// when called from the legalizer).
SDValue AMDGPUTargetLowering::combineFMinMaxLegacyImpl(
    const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
    SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  const bool SelectsLHS = LHS == True;

  // Only do the ordered combines after legalization to avoid interfering with
  // other combines which might occur.
  auto TooEarlyForOrdered = [&DCI]() {
    return DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
           !DCI.isCalledByLegalizer();
  };
  auto MakeNode = [&](unsigned Opc, SDValue A, SDValue B) {
    return DAG.getNode(Opc, DL, VT, A, B);
  };

  switch (cast<CondCodeSDNode>(CC)->get()) {
  case ISD::SETOEQ:
  case ISD::SETONE:
  case ISD::SETUNE:
  case ISD::SETNE:
  case ISD::SETUEQ:
  case ISD::SETEQ:
  case ISD::SETFALSE:
  case ISD::SETFALSE2:
  case ISD::SETTRUE:
  case ISD::SETTRUE2:
  case ISD::SETUO:
  case ISD::SETO:
    // Nothing to match for these condition codes.
    break;

  case ISD::SETULE:
  case ISD::SETULT:
    return SelectsLHS ? MakeNode(AMDGPUISD::FMIN_LEGACY, RHS, LHS)
                      : MakeNode(AMDGPUISD::FMAX_LEGACY, LHS, RHS);

  case ISD::SETOLE:
  case ISD::SETOLT:
  case ISD::SETLE:
  case ISD::SETLT:
    // Ordered. Assume ordered for undefined.
    if (TooEarlyForOrdered())
      return SDValue();

    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with NaN,
    // so permute it based on the compare type the hardware uses.
    return SelectsLHS ? MakeNode(AMDGPUISD::FMIN_LEGACY, LHS, RHS)
                      : MakeNode(AMDGPUISD::FMAX_LEGACY, RHS, LHS);

  case ISD::SETUGE:
  case ISD::SETUGT:
    return SelectsLHS ? MakeNode(AMDGPUISD::FMAX_LEGACY, RHS, LHS)
                      : MakeNode(AMDGPUISD::FMIN_LEGACY, LHS, RHS);

  case ISD::SETGT:
  case ISD::SETGE:
  case ISD::SETOGE:
  case ISD::SETOGT:
    if (TooEarlyForOrdered())
      return SDValue();

    return SelectsLHS ? MakeNode(AMDGPUISD::FMAX_LEGACY, LHS, RHS)
                      : MakeNode(AMDGPUISD::FMIN_LEGACY, RHS, LHS);

  case ISD::SETCC_INVALID:
    llvm_unreachable("Invalid setcc condcode!");
  }

  return SDValue();
}


/// Generate a legacy min/max node for a select fed by a floating-point
/// compare, or return an empty SDValue when the pattern does not match.
///
/// Two shapes are recognized:
///  - the select operands are exactly the compare operands (in either order);
///  - the true value is the negation of \p LHS and the false value is the
///    constant -K where \p RHS is the constant K, in which case the min/max is
///    formed on the un-negated values and the result is negated.
SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
                                                   SDValue LHS, SDValue RHS,
                                                   SDValue True, SDValue False,
                                                   SDValue CC,
                                                   DAGCombinerInfo &DCI) const {
  const bool SameOrder = LHS == True && RHS == False;
  const bool SwappedOrder = LHS == False && RHS == True;
  if (SameOrder || SwappedOrder)
    return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);

  // If we can't directly match this, try to see if we can fold an fneg to
  // match.
  //
  // Undo the combine foldFreeOpFromSelect does if it helps us match the
  // fmin/fmax.
  //
  // select (fcmp olt (lhs, K)), (fneg lhs), -K
  // -> fneg (fmin_legacy lhs, K)
  //
  // TODO: Use getNegatedExpression
  const auto *ConstRHS = dyn_cast<ConstantFPSDNode>(RHS);
  const auto *ConstFalse = dyn_cast<ConstantFPSDNode>(False);
  const SDValue UnnegatedTrue = peekFNeg(True);

  if (!(LHS == UnnegatedTrue) || !ConstFalse || !ConstRHS)
    return SDValue();

  // The false value has to be exactly the negated compare constant.
  const APFloat NegatedK = neg(ConstRHS->getValueAPF());
  if (!(NegatedK == ConstFalse->getValueAPF()))
    return SDValue();

  SDValue MinMax = combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, UnnegatedTrue,
                                            False, CC, DCI);
  if (!MinMax)
    return SDValue();

  return DCI.DAG.getNode(ISD::FNEG, DL, VT, MinMax);
}


/// Split \p Op into two i32 values by bitcasting it to v2i32 and extracting
/// element 0 (returned first) and element 1 (returned second).
std::pair<SDValue, SDValue>
AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue AsV2I32 = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);

  const SDValue LoIdx = DAG.getConstant(0, SL, MVT::i32);
  const SDValue HiIdx = DAG.getConstant(1, SL, MVT::i32);

  auto ExtractHalf = [&](const SDValue &Idx) {
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, AsV2I32, Idx);
  };

  SDValue Lo = ExtractHalf(LoIdx);
  SDValue Hi = ExtractHalf(HiIdx);
  return {Lo, Hi};
}


/// Return element 0 of \p Op viewed as v2i32, as an i32 value.
SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);

  SDValue AsV2I32 = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
  const SDValue LoIdx = DAG.getConstant(0, SL, MVT::i32);
  SDValue LoHalf =
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, AsV2I32, LoIdx);
  return LoHalf;
}


/// Return element 1 of \p Op viewed as v2i32, as an i32 value.
SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);

  SDValue AsV2I32 = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
  const SDValue HiIdx = DAG.getConstant(1, SL, MVT::i32);
  SDValue HiHalf =
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, AsV2I32, HiIdx);
  return HiHalf;
}


// Split a vector type into two parts. The first part is a power of two vector.

// The second part is whatever is left over, and is a scalar if it would

// otherwise be a 1-vector.

std::pair<EVT, EVT>


AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {

  EVT LoVT, HiVT;

  EVT EltVT = VT.getVectorElementType();

  unsigned NumElts = VT.getVectorNumElements();

  unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);

  LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);

  HiVT = NumElts - LoNumElts == 1

             ? EltVT

             : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);

  return std::pair(LoVT, HiVT);

}


// Split a vector value into two parts of types LoVT and HiVT. HiVT could be

// scalar.

std::pair<SDValue, SDValue>


AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,

                                  const EVT &LoVT, const EVT &HiVT,

                                  SelectionDAG &DAG) const {

  EVT VT = N.getValueType();

  assert(LoVT.getVectorNumElements() +

                 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=

             VT.getVectorNumElements() &&

         "More vector elements requested than available!");

  SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,

                           DAG.getVectorIdxConstant(0, DL));


  unsigned LoNumElts = LoVT.getVectorNumElements();


  if (HiVT.isVector()) {

    unsigned HiNumElts = HiVT.getVectorNumElements();

    if ((VT.getVectorNumElements() % HiNumElts) == 0) {

      // Avoid creating an extract_subvector with an index that isn't a multiple

      // of the result type.

      SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HiVT, N,

                               DAG.getConstant(LoNumElts, DL, MVT::i32));

      return {Lo, Hi};

    }


    SmallVector<SDValue, 8> Elts;

    DAG.ExtractVectorElements(N, Elts, /*Start=*/LoNumElts,

                              /*Count=*/HiNumElts);

    SDValue Hi = DAG.getBuildVector(HiVT, DL, Elts);

    return {Lo, Hi};

  }


  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, HiVT, N,

                           DAG.getVectorIdxConstant(LoNumElts, DL));

  return {Lo, Hi};

}


/// Lower a vector load by splitting it into a low and a high load.
///
/// Two-element vectors are scalarized instead. Otherwise the value and memory
/// types are split with getSplitDestVTs, the high half is loaded from the base
/// pointer advanced by the store size of the low memory type, and the two
/// loaded values are joined back into the original type. The returned merge
/// node carries the joined value and a TokenFactor of both load chains.
SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
                                              SelectionDAG &DAG) const {
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  const EVT VT = Op.getValueType();
  SDLoc SL(Op);

  // If this is a 2 element vector, we really want to scalarize and not create
  // weird 1 element vectors.
  if (VT.getVectorNumElements() == 2) {
    auto [Value, Chain] = scalarizeVectorLoad(Load, DAG);
    SDValue Scalarized[2] = {Value, Chain};
    return DAG.getMergeValues(Scalarized, SL);
  }

  const SDValue BasePtr = Load->getBasePtr();
  const EVT MemVT = Load->getMemoryVT();
  const MachinePointerInfo &PtrInfo = Load->getMemOperand()->getPointerInfo();

  const auto [LoVT, HiVT] = getSplitDestVTs(VT, DAG);
  const auto [LoMemVT, HiMemVT] = getSplitDestVTs(MemVT, DAG);

  // NOTE(review): the two values produced by this call are not referenced
  // anywhere below; the call is retained so the same nodes are created.
  (void)splitVector(Op, SL, LoVT, HiVT, DAG);

  const unsigned LoStoreSize = LoMemVT.getStoreSize();
  const Align BaseAlign = Load->getAlign();
  const Align HiAlign = commonAlignment(BaseAlign, LoStoreSize);

  SDValue LoLoad = DAG.getExtLoad(
      Load->getExtensionType(), SL, LoVT, Load->getChain(), BasePtr, PtrInfo,
      LoMemVT, BaseAlign, Load->getMemOperand()->getFlags(), Load->getAAInfo());
  SDValue HiPtr =
      DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(LoStoreSize));
  SDValue HiLoad = DAG.getExtLoad(
      Load->getExtensionType(), SL, HiVT, Load->getChain(), HiPtr,
      PtrInfo.getWithOffset(LoMemVT.getStoreSize()), HiMemVT, HiAlign,
      Load->getMemOperand()->getFlags(), Load->getAAInfo());

  SDValue Joined;
  if (LoVT == HiVT) {
    // This is the case that the vector is power of two so was evenly split.
    Joined = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
  } else {
    // Uneven split: place the low part at index 0 of a poison vector, then
    // insert the high part (subvector or single element) right after it.
    Joined = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getPOISON(VT),
                         LoLoad, DAG.getVectorIdxConstant(0, SL));
    const unsigned InsertHiOpc =
        HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT;
    Joined =
        DAG.getNode(InsertHiOpc, SL, VT, Joined, HiLoad,
                    DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), SL));
  }

  SDValue JoinedChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
                                    LoLoad.getValue(1), HiLoad.getValue(1));

  return DAG.getMergeValues({Joined, JoinedChain}, SL);
}


/// Lower a vector load either by widening a 3-element load to 4 elements or,
/// when that is not applicable, by splitting it with SplitVectorLoad.
///
/// Widening is only done when the memory type has exactly three elements and
/// the load is at least 8-byte aligned or 16 bytes are known dereferenceable.
/// The widened result is narrowed back with an EXTRACT_SUBVECTOR at index 0.
SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op,
                                                     SelectionDAG &DAG) const {
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  const EVT VT = Op.getValueType();
  const EVT MemVT = Load->getMemoryVT();
  const MachinePointerInfo &PtrInfo = Load->getMemOperand()->getPointerInfo();
  const Align BaseAlign = Load->getAlign();
  const unsigned NumElements = MemVT.getVectorNumElements();

  // Widen from vec3 to vec4 when the load is at least 8-byte aligned
  // or 16-byte fully dereferenceable. Otherwise, split the vector load.
  const bool CanWiden =
      NumElements == 3 &&
      (!(BaseAlign < Align(8)) ||
       PtrInfo.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout()));
  if (!CanWiden)
    return SplitVectorLoad(Op, DAG);

  assert(NumElements == 3);

  SDLoc SL(Op);
  LLVMContext &Ctx = *DAG.getContext();
  const EVT WideVT = EVT::getVectorVT(Ctx, VT.getVectorElementType(), 4);
  const EVT WideMemVT = EVT::getVectorVT(Ctx, MemVT.getVectorElementType(), 4);

  SDValue WideLoad = DAG.getExtLoad(
      Load->getExtensionType(), SL, WideVT, Load->getChain(),
      Load->getBasePtr(), PtrInfo, WideMemVT, BaseAlign,
      Load->getMemOperand()->getFlags());

  // Drop the extra element again and forward the chain of the wide load.
  SDValue Narrowed = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
                                 DAG.getVectorIdxConstant(0, SL));
  return DAG.getMergeValues({Narrowed, WideLoad.getValue(1)}, SL);
}


/// Lower a vector store by splitting the stored value into a low and a high
/// part and emitting one truncating store for each.
///
/// Two-element vectors are scalarized instead. The high part is stored at the
/// base pointer advanced by the store size of the low memory type. The result
/// is a TokenFactor of the two store chains.
SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
                                               SelectionDAG &DAG) const {
  StoreSDNode *Store = cast<StoreSDNode>(Op);
  const SDValue StoredVal = Store->getValue();
  const EVT VT = StoredVal.getValueType();

  // If this is a 2 element vector, we really want to scalarize and not create
  // weird 1 element vectors.
  if (VT.getVectorNumElements() == 2)
    return scalarizeVectorStore(Store, DAG);

  SDLoc SL(Op);
  const SDValue Chain = Store->getChain();
  const SDValue BasePtr = Store->getBasePtr();

  const auto [LoVT, HiVT] = getSplitDestVTs(VT, DAG);
  const auto [LoMemVT, HiMemVT] = getSplitDestVTs(Store->getMemoryVT(), DAG);
  const auto [Lo, Hi] = splitVector(StoredVal, SL, LoVT, HiVT, DAG);

  SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());

  const MachinePointerInfo &PtrInfo = Store->getMemOperand()->getPointerInfo();
  const Align BaseAlign = Store->getAlign();
  const unsigned LoStoreSize = LoMemVT.getStoreSize();
  // The high store can only assume the alignment implied by base + offset.
  const Align HiAlign = commonAlignment(BaseAlign, LoStoreSize);

  SDValue LoStore =
      DAG.getTruncStore(Chain, SL, Lo, BasePtr, PtrInfo, LoMemVT, BaseAlign,
                        Store->getMemOperand()->getFlags(), Store->getAAInfo());
  SDValue HiStore = DAG.getTruncStore(
      Chain, SL, Hi, HiPtr, PtrInfo.getWithOffset(LoStoreSize), HiMemVT,
      HiAlign, Store->getMemOperand()->getFlags(), Store->getAAInfo());

  return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
}


// This is a shortcut for integer division because we have fast i32<->f32

// conversions, and fast f32 reciprocal instructions. The fractional part of a

// float is enough to accurately represent up to a 24-bit integer.

//
// Op is an i32 node whose operands 0 and 1 are the dividend and divisor; Sign
// selects the signed or unsigned variant. Returns an empty SDValue when the
// operands cannot be shown to be narrow enough (signed: fewer than 9 known
// sign bits on either operand; unsigned: a possible value above 2^24 - 1 on
// either operand). Otherwise returns a merge of {quotient, remainder}.


SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,

                                            bool Sign) const {

  SDLoc DL(Op);

  EVT VT = Op.getValueType();

  assert(VT == MVT::i32 && "LowerDIVREM24 expects an i32");


  SDValue LHS = Op.getOperand(0);

  SDValue RHS = Op.getOperand(1);

  MVT IntVT = MVT::i32;

  MVT FltVT = MVT::f32;


  // For the unsigned case these hold the minimum number of known leading zero
  // bits rather than sign bits.
  unsigned LHSSignBits;

  unsigned RHSSignBits;

  if (Sign) {

    LHSSignBits = DAG.ComputeNumSignBits(LHS);

    RHSSignBits = DAG.ComputeNumSignBits(RHS);

    if (LHSSignBits < 9 || RHSSignBits < 9)

      return SDValue();

  } else {

    KnownBits LHSKnown = DAG.computeKnownBits(LHS);

    KnownBits RHSKnown = DAG.computeKnownBits(RHS);

    // Largest value representable in the low 24 bits.
    APInt U24Max = APInt::getLowBitsSet(32, 24);

    if (LHSKnown.getMaxValue().ugt(U24Max) ||

        RHSKnown.getMaxValue().ugt(U24Max))

      return SDValue();

    LHSSignBits = LHSKnown.countMinLeadingZeros();

    RHSSignBits = RHSKnown.countMinLeadingZeros();

  }


  // DivBits is the width the results are truncated / sign-extended to at the
  // end; the signed variant keeps one extra bit.
  unsigned BitSize = VT.getSizeInBits();

  unsigned SignBits = std::min(LHSSignBits, RHSSignBits);

  unsigned DivBits = BitSize - SignBits;

  if (Sign)

    ++DivBits;


  ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;

  ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;


  // Quotient correction term: 1 for the unsigned case, and for the signed case
  // the value computed below from the XOR of the operands.
  SDValue jq = DAG.getConstant(1, DL, IntVT);


  if (Sign) {

    // char|short jq = ia ^ ib;

    jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);


    // jq = jq >> (bitsize - 2)

    jq = DAG.getNode(ISD::SRA, DL, VT, jq,

                     DAG.getConstant(BitSize - 2, DL, VT));


    // jq = jq | 0x1

    jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));

  }


  // int ia = (int)LHS;

  SDValue ia = LHS;


  // int ib = (int)RHS;

  SDValue ib = RHS;


  // float fa = (float)ia;

  SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);


  // float fb = (float)ib;

  SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);


  // float fq = fa * rcp(fb);
  SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,

                           fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));


  // fq = trunc(fq);

  fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);


  // float fqneg = -fq;

  SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);


  MachineFunction &MF = DAG.getMachineFunction();


  // FMAD_FTZ is only considered on GCN subtargets, and only when the function's
  // f32 denormal mode is not preserve-sign.
  bool UseFmadFtz = false;

  if (Subtarget->isGCN()) {

    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

    UseFmadFtz =

        MFI->getMode().FP32Denormals != DenormalMode::getPreserveSign();

  }


  // float fr = mad(fqneg, fb, fa);

  // Without mad/mac f32 instructions FMA is used; otherwise FMAD_FTZ or FMAD
  // depending on UseFmadFtz.
  unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA

                    : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ

                                 : (unsigned)ISD::FMAD;

  SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);


  // int iq = (int)fq;

  SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);


  // fr = fabs(fr);

  fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);


  // fb = fabs(fb);

  fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);


  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);


  // int cv = fr >= fb;

  SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);


  // jq = (cv ? jq : 0);

  jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));


  // dst = iq + jq;

  SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);


  // Rem needs compensation, it's easier to recompute it

  // Rem = LHS - Div * RHS
  SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);

  Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);


  // Truncate to number of bits this divide really is.

  if (Sign) {

    // Signed: sign-extend both results from DivBits bits.
    SDValue InRegSize

      = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));

    Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);

    Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);

  } else {

    // Unsigned: mask both results down to the low DivBits bits.
    SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);

    Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);

    Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);

  }


  return DAG.getMergeValues({ Div, Rem }, DL);

}


/// Expand a 64-bit unsigned divide/remainder of the two operands of \p Op,
/// appending the quotient and then the remainder to \p Results.
///
/// Three strategies are tried in order:
///  - if the high 32 bits of both operands are known to be zero, a 32-bit
///    UDIVREM on the low halves, with both results widened with a zero high
///    half;
///  - if i64 is a legal type, a sequence built from an f32 reciprocal
///    estimate, two rounds of Newton-Raphson refinement and up to two
///    quotient/remainder corrections;
///  - otherwise a bit-by-bit expansion over the low half of the dividend.
void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,

                                      SelectionDAG &DAG,

                                      SmallVectorImpl<SDValue> &Results) const {

  SDLoc DL(Op);

  EVT VT = Op.getValueType();


  assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");


  EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());


  SDValue One = DAG.getConstant(1, DL, HalfVT);

  SDValue Zero = DAG.getConstant(0, DL, HalfVT);


  //HiLo split

  SDValue LHS_Lo, LHS_Hi;

  SDValue LHS = Op.getOperand(0);

  std::tie(LHS_Lo, LHS_Hi) = DAG.SplitScalar(LHS, DL, HalfVT, HalfVT);


  SDValue RHS_Lo, RHS_Hi;

  SDValue RHS = Op.getOperand(1);

  std::tie(RHS_Lo, RHS_Hi) = DAG.SplitScalar(RHS, DL, HalfVT, HalfVT);


  // Both operands fit in 32 bits: divide the low halves and pair each result
  // with a zero high half.
  if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&

      DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {


    SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),

                              LHS_Lo, RHS_Lo);


    SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});

    SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});


    Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));

    Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));

    return;

  }


  if (isTypeLegal(MVT::i64)) {

    // The algorithm here is based on ideas from "Software Integer Division",

    // Tom Rodeheffer, August 2008.


    MachineFunction &MF = DAG.getMachineFunction();

    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();


    // Compute denominator reciprocal.

    // The multiply-add opcode is FMA without mad/mac f32 instructions, FMAD
    // when f32 denormals are in preserve-sign mode, and FMAD_FTZ otherwise.
    unsigned FMAD =

        !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA

        : MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign()

            ? (unsigned)ISD::FMAD

            : (unsigned)AMDGPUISD::FMAD_FTZ;


    // f32 bit patterns used below: 0x4f800000 is 2^32, 0x2f800000 is 2^-32,
    // 0xcf800000 is -2^32, and 0x5f7ffffc is slightly below 2^64.
    SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);

    SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);

    SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,

      DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),

      Cvt_Lo);

    SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);

    SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,

      DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));

    SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,

      DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));

    SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);

    SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,

      DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),

      Mul1);

    SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);

    SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);

    SDValue Rcp64 = DAG.getBitcast(VT,

                        DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));


    SDValue Zero64 = DAG.getConstant(0, DL, VT);

    SDValue One64  = DAG.getConstant(1, DL, VT);

    SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);

    SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);


    // First round of UNR (Unsigned integer Newton-Raphson).

    // The 64-bit addition is done as two 32-bit add-with-carry halves.
    SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);

    SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);

    SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);

    SDValue Mulhi1_Lo, Mulhi1_Hi;

    std::tie(Mulhi1_Lo, Mulhi1_Hi) =

        DAG.SplitScalar(Mulhi1, DL, HalfVT, HalfVT);

    SDValue Add1_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Lo,

                                  Mulhi1_Lo, Zero1);

    SDValue Add1_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Hi,

                                  Mulhi1_Hi, Add1_Lo.getValue(1));

    SDValue Add1 = DAG.getBitcast(VT,

                        DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));


    // Second round of UNR.

    SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);

    SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);

    SDValue Mulhi2_Lo, Mulhi2_Hi;

    std::tie(Mulhi2_Lo, Mulhi2_Hi) =

        DAG.SplitScalar(Mulhi2, DL, HalfVT, HalfVT);

    SDValue Add2_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Lo,

                                  Mulhi2_Lo, Zero1);

    SDValue Add2_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Hi,

                                  Mulhi2_Hi, Add2_Lo.getValue(1));

    SDValue Add2 = DAG.getBitcast(VT,

                        DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));


    // Quotient estimate: high half of LHS times the refined reciprocal.
    SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);


    SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);


    // Remainder estimate Sub1 = LHS - RHS * Mulhi3, computed in 32-bit halves
    // with borrow. Sub1_Mi is the high-half difference without the borrow.
    SDValue Mul3_Lo, Mul3_Hi;

    std::tie(Mul3_Lo, Mul3_Hi) = DAG.SplitScalar(Mul3, DL, HalfVT, HalfVT);

    SDValue Sub1_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Lo,

                                  Mul3_Lo, Zero1);

    SDValue Sub1_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Hi,

                                  Mul3_Hi, Sub1_Lo.getValue(1));

    SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);

    SDValue Sub1 = DAG.getBitcast(VT,

                        DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));


    // C3 is all-ones when Sub1 >= RHS as a 64-bit unsigned compare done on
    // halves: the low-half compare decides only when the high halves are equal.
    SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);

    SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,

                                 ISD::SETUGE);

    SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,

                                 ISD::SETUGE);

    SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);


    // TODO: Here and below portions of the code can be enclosed into if/endif.

    // Currently control flow is unconditional and we have 4 selects after

    // potential endif to substitute PHIs.


    // if C3 != 0 ...

    SDValue Sub2_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Lo,

                                  RHS_Lo, Zero1);

    SDValue Sub2_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Mi,

                                  RHS_Hi, Sub1_Lo.getValue(1));

    SDValue Sub2_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,

                                  Zero, Sub2_Lo.getValue(1));

    SDValue Sub2 = DAG.getBitcast(VT,

                        DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));


    SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);


    // C6 is the same 64-bit ">= RHS" test, applied to Sub2.
    SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,

                                 ISD::SETUGE);

    SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,

                                 ISD::SETUGE);

    SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);


    // if (C6 != 0)

    SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);


    SDValue Sub3_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Lo,

                                  RHS_Lo, Zero1);

    SDValue Sub3_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,

                                  RHS_Hi, Sub2_Lo.getValue(1));

    SDValue Sub3_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub3_Mi,

                                  Zero, Sub3_Lo.getValue(1));

    SDValue Sub3 = DAG.getBitcast(VT,

                        DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));


    // endif C6

    // endif C3


    // Pick the quotient and remainder matching how many corrections applied:
    // none (C3 == 0), one (C3 != 0, C6 == 0) or two (both non-zero).
    SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);

    SDValue Div  = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);


    SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);

    SDValue Rem  = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);


    Results.push_back(Div);

    Results.push_back(Rem);


    return;

  }


  // r600 expansion.

  // Get Speculative values

  SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);

  SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);


  // The speculative values are only used when the high half of RHS is zero;
  // otherwise the running remainder starts as LHS_Hi and DIV_Hi is zero.
  SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);

  SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});

  REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);


  SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);

  SDValue DIV_Lo = Zero;


  const unsigned halfBitWidth = HalfVT.getSizeInBits();


  // Process the low half of LHS one bit at a time, from the most significant
  // bit down: shift the bit into REM, and where REM >= RHS set the matching
  // bit of DIV_Lo and subtract RHS from REM.
  for (unsigned i = 0; i < halfBitWidth; ++i) {

    const unsigned bitPos = halfBitWidth - i - 1;

    SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);

    // Get value of high bit

    SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);

    HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);

    HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);


    // Shift

    REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));

    // Add LHS high bit

    REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);


    SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);

    SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);


    DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);


    // Update REM

    SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);

    REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);

  }


  SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});

  DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);

  Results.push_back(DIV);

  Results.push_back(REM);

}


/// Lower an unsigned divide/remainder node to a merge of {quotient,
/// remainder}.
///
/// i64 is delegated to LowerUDIVREM64. For i32 the 24-bit shortcut in
/// LowerDIVREM24 is tried first. Everything else goes through a reciprocal
/// estimate with one Newton-Raphson round and two quotient/remainder fix-ups.
SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc DL(Op);
  const EVT VT = Op.getValueType();

  if (VT == MVT::i64) {
    SmallVector<SDValue, 2> Results;
    LowerUDIVREM64(Op, DAG, Results);
    return DAG.getMergeValues(Results, DL);
  }

  if (VT == MVT::i32)
    if (SDValue Res = LowerDIVREM24(Op, DAG, false))
      return Res;

  const SDValue Num = Op.getOperand(0);
  const SDValue Den = Op.getOperand(1);

  // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
  // algorithm used here.

  // Initial estimate of inv(Den).
  SDValue Recip = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);

  // One round of UNR.
  SDValue NegDen =
      DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Den);
  SDValue NegDenRecip = DAG.getNode(ISD::MUL, DL, VT, NegDen, Recip);
  Recip = DAG.getNode(ISD::ADD, DL, VT, Recip,
                      DAG.getNode(ISD::MULHU, DL, VT, Recip, NegDenRecip));

  // Quotient/remainder estimate.
  SDValue Quot = DAG.getNode(ISD::MULHU, DL, VT, Num, Recip);
  SDValue Rem = DAG.getNode(ISD::SUB, DL, VT, Num,
                            DAG.getNode(ISD::MUL, DL, VT, Quot, Den));

  const EVT CCVT =
      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
  const SDValue One = DAG.getConstant(1, DL, VT);

  // One refinement step: where Rem >= Den, bump the quotient by one and take
  // Den off the remainder.
  auto RefineOnce = [&]() {
    SDValue NeedsFixup = DAG.getSetCC(DL, CCVT, Rem, Den, ISD::SETUGE);
    Quot = DAG.getNode(ISD::SELECT, DL, VT, NeedsFixup,
                       DAG.getNode(ISD::ADD, DL, VT, Quot, One), Quot);
    Rem = DAG.getNode(ISD::SELECT, DL, VT, NeedsFixup,
                      DAG.getNode(ISD::SUB, DL, VT, Rem, Den), Rem);
  };

  // First quotient/remainder refinement.
  RefineOnce();
  // Second quotient/remainder refinement.
  RefineOnce();

  return DAG.getMergeValues({Quot, Rem}, DL);
}


/// Lower a signed divide/remainder node to a merge of {quotient, remainder}.
///
/// For i32 the 24-bit shortcut in LowerDIVREM24 is tried first. An i64
/// operation whose operands both have more than 32 sign bits is done as a
/// 32-bit SDIVREM on the low halves and sign-extended. Otherwise both operands
/// are made non-negative, an unsigned UDIVREM is emitted, and the signs are
/// re-applied to the results.
SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc DL(Op);
  const EVT VT = Op.getValueType();

  SDValue Num = Op.getOperand(0);
  SDValue Den = Op.getOperand(1);

  const SDValue Zero = DAG.getConstant(0, DL, VT);
  const SDValue AllOnes = DAG.getAllOnesConstant(DL, VT);

  if (VT == MVT::i32)
    if (SDValue Res = LowerDIVREM24(Op, DAG, true))
      return Res;

  const bool FitsInHalf = VT == MVT::i64 &&
                          DAG.ComputeNumSignBits(Num) > 32 &&
                          DAG.ComputeNumSignBits(Den) > 32;
  if (FitsInHalf) {
    const EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());

    // Divide the low halves only, then widen both results again.
    SDValue NumLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Num, Zero);
    SDValue DenLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Den, Zero);
    SDValue HalfDivRem = DAG.getNode(
        ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT), NumLo, DenLo);

    SDValue WideQuot =
        DAG.getNode(ISD::SIGN_EXTEND, DL, VT, HalfDivRem.getValue(0));
    SDValue WideRem =
        DAG.getNode(ISD::SIGN_EXTEND, DL, VT, HalfDivRem.getValue(1));
    return DAG.getMergeValues({WideQuot, WideRem}, DL);
  }

  // All-ones for a negative value, zero otherwise.
  auto SignMaskOf = [&](SDValue V) {
    return DAG.getSelectCC(DL, V, Zero, AllOnes, Zero, ISD::SETLT);
  };

  SDValue NumSign = SignMaskOf(Num);
  SDValue DenSign = SignMaskOf(Den);
  SDValue QuotSign = DAG.getNode(ISD::XOR, DL, VT, NumSign, DenSign);
  SDValue RemSign = NumSign; // Remainder sign is the same as the dividend's.

  // (x + mask) ^ mask with the sign mask of x.
  Num = DAG.getNode(ISD::ADD, DL, VT, Num, NumSign);
  Den = DAG.getNode(ISD::ADD, DL, VT, Den, DenSign);

  Num = DAG.getNode(ISD::XOR, DL, VT, Num, NumSign);
  Den = DAG.getNode(ISD::XOR, DL, VT, Den, DenSign);

  SDValue UDivRem =
      DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), Num, Den);
  SDValue Quot = UDivRem;
  SDValue Rem = UDivRem.getValue(1);

  // Re-apply the signs: (x ^ mask) - mask.
  Quot = DAG.getNode(ISD::XOR, DL, VT, Quot, QuotSign);
  Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RemSign);

  Quot = DAG.getNode(ISD::SUB, DL, VT, Quot, QuotSign);
  Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RemSign);

  return DAG.getMergeValues({Quot, Rem}, DL);
}


/// Expand an f64 ceil in terms of FTRUNC:
///
///   result = trunc(src)
///   if (src > 0.0 && src != result)
///     result += 1.0
///
/// The conditional increment is emitted as a select between 1.0 and 0.0 that
/// is always added to the truncated value.
SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  const SDValue Src = Op.getOperand(0);

  SDValue Truncated = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);

  const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);

  const EVT SetCCVT =
      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);

  SDValue IsPositive = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
  SDValue HasFraction = DAG.getSetCC(SL, SetCCVT, Src, Truncated, ISD::SETONE);
  SDValue NeedsBump =
      DAG.getNode(ISD::AND, SL, SetCCVT, IsPositive, HasFraction);

  SDValue Increment =
      DAG.getNode(ISD::SELECT, SL, MVT::f64, NeedsBump, One, Zero);
  // TODO: Should this propagate fast-math-flags?
  return DAG.getNode(ISD::FADD, SL, MVT::f64, Truncated, Increment);
}


/// Build an i32 holding the unbiased exponent of an f64.
///
/// \param Hi  i32 value the exponent field is extracted from (callers pass the
///            upper 32 bits of the f64).
/// \param SL  Debug location for the created nodes.
/// \param DAG The DAG the nodes are created in.
/// \returns (bfe_u32 Hi, 20, 11) - 1023.
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
                                  SelectionDAG &DAG) {
  const unsigned MantissaBits = 52;
  const unsigned ExponentBits = 11;

  // Within the high word the exponent field starts right above the mantissa
  // bits that spill over from the low word.
  SDValue FieldOffset = DAG.getConstant(MantissaBits - 32, SL, MVT::i32);
  SDValue FieldWidth = DAG.getConstant(ExponentBits, SL, MVT::i32);
  SDValue Biased = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, Hi,
                               FieldOffset, FieldWidth);

  // Remove the exponent bias.
  SDValue Bias = DAG.getConstant(1023, SL, MVT::i32);
  return DAG.getNode(ISD::SUB, SL, MVT::i32, Biased, Bias);
}


/// Expand an f64 FTRUNC with integer bit manipulation.
///
/// The fraction bits that lie below the binary point (as determined by the
/// exponent) are cleared. Exponents below zero produce a signed zero, and
/// exponents above 51 return the input unchanged.
///
/// \param Op  The FTRUNC node; must produce f64.
/// \param DAG The DAG the replacement nodes are created in.
SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Input = Op.getOperand(0);

  assert(Op.getValueType() == MVT::f64);

  const SDValue Zero32 = DAG.getConstant(0, DL, MVT::i32);

  // The sign and the exponent both live in the upper 32 bits.
  SDValue HiWord = getHiHalf64(Input, DAG);
  SDValue Exponent = extractF64Exponent(HiWord, DL, DAG);

  const unsigned MantissaBits = 52;

  // Isolate the sign bit and widen it back into a 64-bit pattern (+/-0.0).
  const SDValue SignMask32 = DAG.getConstant(UINT32_C(1) << 31, DL, MVT::i32);
  SDValue Sign32 = DAG.getNode(ISD::AND, DL, MVT::i32, HiWord, SignMask32);
  SDValue SignedZero64 =
      DAG.getNode(ISD::BITCAST, DL, MVT::i64,
                  DAG.getBuildVector(MVT::v2i32, DL, {Zero32, Sign32}));

  SDValue InputBits = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Input);
  const SDValue MantissaMask =
      DAG.getConstant((UINT64_C(1) << MantissaBits) - 1, DL, MVT::i64);

  // Shifting the mantissa mask right by the exponent leaves exactly the bits
  // that encode the fractional part; clear those in the input.
  SDValue FractionBits =
      DAG.getNode(ISD::SRA, DL, MVT::i64, MantissaMask, Exponent);
  SDValue KeepMask = DAG.getNOT(DL, FractionBits, MVT::i64);
  SDValue Cleared = DAG.getNode(ISD::AND, DL, MVT::i64, InputBits, KeepMask);

  EVT CondVT =
      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);

  const SDValue MaxFractExp = DAG.getConstant(MantissaBits - 1, DL, MVT::i32);

  SDValue ExpIsNegative = DAG.getSetCC(DL, CondVT, Exponent, Zero32, ISD::SETLT);
  SDValue ExpAbove51 =
      DAG.getSetCC(DL, CondVT, Exponent, MaxFractExp, ISD::SETGT);

  SDValue SmallOrCleared = DAG.getNode(ISD::SELECT, DL, MVT::i64, ExpIsNegative,
                                       SignedZero64, Cleared);
  SDValue ResultBits = DAG.getNode(ISD::SELECT, DL, MVT::i64, ExpAbove51,
                                   InputBits, SmallOrCleared);

  return DAG.getNode(ISD::BITCAST, DL, MVT::f64, ResultBits);
}


/// Expand an f64 FROUNDEVEN by adding and then subtracting 2^52 carrying the
/// sign of the input. Inputs whose magnitude exceeds 0x1.fffffffffffffp+51
/// are returned unchanged.
///
/// NOTE(review): presumably relies on the FADD/FSUB rounding to nearest-even
/// in the current rounding mode — confirm.
///
/// \param Op  The FROUNDEVEN node; must produce f64.
/// \param DAG The DAG the replacement nodes are created in.
SDValue AMDGPUTargetLowering::LowerFROUNDEVEN(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Input = Op.getOperand(0);

  assert(Op.getValueType() == MVT::f64);

  APFloat TwoPow52(APFloat::IEEEdouble(), "0x1.0p+52");
  SDValue Magic = DAG.getConstantFP(TwoPow52, DL, MVT::f64);
  SDValue SignedMagic = DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f64, Magic, Input);

  // TODO: Should this propagate fast-math-flags?

  SDValue Sum = DAG.getNode(ISD::FADD, DL, MVT::f64, Input, SignedMagic);
  SDValue Rounded = DAG.getNode(ISD::FSUB, DL, MVT::f64, Sum, SignedMagic);

  SDValue Magnitude = DAG.getNode(ISD::FABS, DL, MVT::f64, Input);

  APFloat LimitVal(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
  SDValue Limit = DAG.getConstantFP(LimitVal, DL, MVT::f64);

  EVT CondVT =
      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
  SDValue IsLarge = DAG.getSetCC(DL, CondVT, Magnitude, Limit, ISD::SETOGT);

  // Above the limit the input is passed through untouched.
  return DAG.getSelect(DL, MVT::f64, IsLarge, Input, Rounded);
}


/// Lower FNEARBYINT by re-emitting it as FROUNDEVEN on the same operand and
/// with the same result type.
SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op,
                                              SelectionDAG &DAG) const {
  // FNEARBYINT and FRINT differ only in how they treat FP exceptions. Those
  // aren't really meaningful for us, and OpenCL only has rint, so the two are
  // handled identically.
  SDValue Input = Op.getOperand(0);
  EVT ResultVT = Op.getValueType();
  return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), ResultVT, Input);
}


/// Lower FRINT by re-emitting it as FROUNDEVEN on the same operand and with
/// the same result type.
SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
  return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), Op.getValueType(),
                     Op.getOperand(0u));
}


// XXX - May require not supporting f32 denormals?


// Don't handle v2f16. The extra instructions to scalarize and repack around the

// compare and vselect end up producing worse code than scalarizing the whole

// operation.


/// Expand FROUND as trunc(x) + copysign(|x - trunc(x)| >= 0.5 ? 1.0 : 0.0, x).
///
/// \param Op  The FROUND node; operand 0 is the value to round.
/// \param DAG The DAG the replacement nodes are created in.
SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Input = Op.getOperand(0);
  EVT VT = Op.getValueType();

  SDValue Truncated = DAG.getNode(ISD::FTRUNC, DL, VT, Input);

  // TODO: Should this propagate fast-math-flags?

  // Magnitude of the fractional part that truncation discarded.
  SDValue Fraction = DAG.getNode(ISD::FSUB, DL, VT, Input, Truncated);
  SDValue FractionMag = DAG.getNode(ISD::FABS, DL, VT, Fraction);

  const SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
  const SDValue FPOne = DAG.getConstantFP(1.0, DL, VT);

  EVT CondVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);

  const SDValue FPHalf = DAG.getConstantFP(0.5, DL, VT);
  SDValue RoundsAway = DAG.getSetCC(DL, CondVT, FractionMag, FPHalf, ISD::SETOGE);
  SDValue StepMag = DAG.getNode(ISD::SELECT, DL, VT, RoundsAway, FPOne, FPZero);

  // Give the step the sign of the input so the adjustment moves away from
  // zero.
  SDValue Step = DAG.getNode(ISD::FCOPYSIGN, DL, VT, StepMag, Input);
  return DAG.getNode(ISD::FADD, DL, VT, Truncated, Step);
}


/// Expand an f64 FFLOOR as FTRUNC plus a conditional -1.0 correction.
///
/// \param Op  The FFLOOR node; operand 0 is the value to round.
/// \param DAG The DAG the replacement nodes are created in.
/// \returns An f64 FADD of the truncated value and either -1.0 or 0.0.
SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Input = Op.getOperand(0);

  // floor(x) = trunc(x) + ((x < 0.0 && x != trunc(x)) ? -1.0 : 0.0)
  SDValue Truncated = DAG.getNode(ISD::FTRUNC, DL, MVT::f64, Input);

  const SDValue FPZero = DAG.getConstantFP(0.0, DL, MVT::f64);
  const SDValue FPMinusOne = DAG.getConstantFP(-1.0, DL, MVT::f64);

  EVT CondVT =
      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);

  // Both compares are ordered, so a NaN input selects the 0.0 correction.
  SDValue IsNegative = DAG.getSetCC(DL, CondVT, Input, FPZero, ISD::SETOLT);
  SDValue IsFractional =
      DAG.getSetCC(DL, CondVT, Input, Truncated, ISD::SETONE);
  SDValue NeedsDrop =
      DAG.getNode(ISD::AND, DL, CondVT, IsNegative, IsFractional);

  SDValue Correction =
      DAG.getNode(ISD::SELECT, DL, MVT::f64, NeedsDrop, FPMinusOne, FPZero);
  // TODO: Should this propagate fast-math-flags?
  return DAG.getNode(ISD::FADD, DL, MVT::f64, Truncated, Correction);
}


/// Return true if it's known that \p Src can never be an f32 denormal value.
///
/// The decision is made purely from the opcode of \p Src (and, for
/// intrinsics, the intrinsic ID); anything not explicitly listed is treated
/// as possibly denormal.
static bool valueIsKnownNeverF32Denorm(SDValue Src) {
  switch (Src.getOpcode()) {
  case ISD::FP_EXTEND: {
    // Only an extension whose source type is f16 qualifies.
    const EVT FromVT = Src.getOperand(0).getValueType();
    return FromVT == MVT::f16;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    // Operand 0 of an intrinsic node is its ID.
    const unsigned IID = Src.getConstantOperandVal(0);
    return IID == Intrinsic::amdgcn_frexp_mant ||
           IID == Intrinsic::amdgcn_log ||
           IID == Intrinsic::amdgcn_log_clamp ||
           IID == Intrinsic::amdgcn_exp2 || IID == Intrinsic::amdgcn_sqrt;
  }
  case ISD::FP16_TO_FP:
  case ISD::FFREXP:
  case ISD::FSQRT:
  case AMDGPUISD::LOG:
  case AMDGPUISD::EXP:
    return true;
  default:
    return false;
  }

  llvm_unreachable("covered opcode switch");
}


/// Return true if approximate function expansions may be used for a node
/// carrying \p Flags. Only the node flags are consulted; \p DAG is unused.
bool AMDGPUTargetLowering::allowApproxFunc(const SelectionDAG &DAG,
                                           SDNodeFlags Flags) {
  const bool HasApproxFuncs = Flags.hasApproximateFuncs();
  return HasApproxFuncs;
}


bool AMDGPUTargetLowering::needsDenormHandlingF32(const SelectionDAG &DAG,

                                                  SDValue Src,

                                                  SDNodeFlags Flags) {

  return !valueIsKnownNeverF32Denorm(Src) &&

         DAG.getMachineFunction()

                 .getDenormalMode(APFloat::IEEEsingle())

                 .Input != DenormalMode::PreserveSign;

}


/// Build an ordered "Src < smallest normalized value" compare for the type of
/// \p Src. \p Flags is not used.
SDValue AMDGPUTargetLowering::getIsLtSmallestNormal(SelectionDAG &DAG,
                                                    SDValue Src,
                                                    SDNodeFlags Flags) const {
  SDLoc DL(Src);
  EVT VT = Src.getValueType();

  SDValue MinNormal = DAG.getConstantFP(
      APFloat::getSmallestNormalized(VT.getFltSemantics()), DL, VT);

  // Want to scale denormals up, but negatives and 0 work just as well on the
  // scaled path.
  EVT CondVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
  return DAG.getSetCC(DL, CondVT, Src, MinNormal, ISD::SETOLT);
}


/// Build an ordered "|Src| < +inf" compare for the type of \p Src. The
/// compare is false for infinities and, being ordered, for NaN.
///
/// \param Flags Applied to the FABS node only.
SDValue AMDGPUTargetLowering::getIsFinite(SelectionDAG &DAG, SDValue Src,
                                          SDNodeFlags Flags) const {
  SDLoc DL(Src);
  EVT VT = Src.getValueType();

  SDValue PosInf =
      DAG.getConstantFP(APFloat::getInf(VT.getFltSemantics()), DL, VT);
  SDValue Magnitude = DAG.getNode(ISD::FABS, DL, VT, Src, Flags);

  EVT CondVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
  return DAG.getSetCC(DL, CondVT, Magnitude, PosInf, ISD::SETOLT);
}


/// If denormal handling is required return the scaled input to FLOG2, and the
/// check for denormal range. Otherwise, return null values.
///
/// The scaled input is Src * (Src < smallest-normal-f32 ? 2^32 : 1.0); the
/// second element of the pair is that compare.
std::pair<SDValue, SDValue>
AMDGPUTargetLowering::getScaledLogInput(SelectionDAG &DAG, const SDLoc SL,
                                        SDValue Src, SDNodeFlags Flags) const {
  const bool NeedsScaling = needsDenormHandlingF32(DAG, Src, Flags);
  if (!NeedsScaling)
    return {};

  MVT VT = MVT::f32;
  SDValue MinNormal = DAG.getConstantFP(
      APFloat::getSmallestNormalized(APFloat::IEEEsingle()), SL, VT);

  EVT CondVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
  SDValue IsTiny = DAG.getSetCC(SL, CondVT, Src, MinNormal, ISD::SETOLT);

  // Multiply by 2^32 only when the input is below the normal range.
  SDValue TwoPow32 = DAG.getConstantFP(0x1.0p+32, SL, VT);
  SDValue Unit = DAG.getConstantFP(1.0, SL, VT);
  SDValue Factor =
      DAG.getNode(ISD::SELECT, SL, VT, IsTiny, TwoPow32, Unit, Flags);

  SDValue Scaled = DAG.getNode(ISD::FMUL, SL, VT, Src, Factor, Flags);
  return {Scaled, IsTiny};
}


/// Lower FLOG2 to the target LOG node.
///
/// f16 is computed in f32 and rounded back. Otherwise, when denormal inputs
/// must be handled, the input is scaled up and the result corrected:
///
///   scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
///   log2   = amdgpu_log2(scaled) - (is_denormal ? 32.0 : 0.0)
SDValue AMDGPUTargetLowering::LowerFLOG2(SDValue Op, SelectionDAG &DAG) const {
  // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.

  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue Input = Op.getOperand(0);
  SDNodeFlags Flags = Op->getFlags();

  if (VT == MVT::f16) {
    // Nothing in half is a denormal when promoted to f32.
    assert(!isTypeLegal(VT));
    SDValue Wide = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Input, Flags);
    SDValue WideLog = DAG.getNode(AMDGPUISD::LOG, DL, MVT::f32, Wide, Flags);
    return DAG.getNode(ISD::FP_ROUND, DL, VT, WideLog,
                       DAG.getTargetConstant(0, DL, MVT::i32), Flags);
  }

  std::pair<SDValue, SDValue> Scaling = getScaledLogInput(DAG, DL, Input, Flags);
  SDValue ScaledInput = Scaling.first;
  SDValue WasScaled = Scaling.second;

  // A null scaled input means no denormal handling is required.
  if (!ScaledInput)
    return DAG.getNode(AMDGPUISD::LOG, DL, VT, Input, Flags);

  SDValue RawLog = DAG.getNode(AMDGPUISD::LOG, DL, VT, ScaledInput, Flags);

  // Undo the 2^32 scaling: log2(x * 2^32) = log2(x) + 32.
  SDValue Bias = DAG.getConstantFP(32.0, DL, VT);
  SDValue NoBias = DAG.getConstantFP(0.0, DL, VT);
  SDValue Adjust = DAG.getNode(ISD::SELECT, DL, VT, WasScaled, Bias, NoBias);
  return DAG.getNode(ISD::FSUB, DL, VT, RawLog, Adjust, Flags);
}


/// Emit X * Y + C as a separate FMUL followed by an FADD (not a fused FMA),
/// applying \p Flags to both nodes.
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X,
                      SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) {
  SDValue Product = DAG.getNode(ISD::FMUL, SL, VT, X, Y, Flags);
  SDValue Sum = DAG.getNode(ISD::FADD, SL, VT, Product, C, Flags);
  return Sum;
}


/// Shared lowering for ISD::FLOG (natural log) and ISD::FLOG10.
///
/// f16 inputs and nodes carrying the approximate-functions flag are handed to
/// LowerFLOGUnsafe (f16 is promoted to f32 unless the flag is set and f16 is
/// legal). Everything else is computed as log2(x) multiplied by an
/// extended-precision split of ln(2) or ln(2)/ln(10), with a fix-up for
/// non-finite log2 results and for inputs that were scaled out of the
/// denormal range.
///
/// \param Op  The FLOG or FLOG10 node.
/// \param DAG The DAG the replacement nodes are created in.
/// \returns The lowered value, of the same type as \p Op.
SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDValue X = Op.getOperand(0);
  EVT VT = Op.getValueType();
  SDNodeFlags Flags = Op->getFlags();
  SDLoc DL(Op);
  const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
  assert(IsLog10 || Op.getOpcode() == ISD::FLOG);

  if (VT == MVT::f16 || Flags.hasApproximateFuncs()) {
    // TODO: The direct f16 path is 1.79 ulp for f16. This should be used
    // depending on !fpmath metadata.

    bool PromoteToF32 = VT == MVT::f16 && (!Flags.hasApproximateFuncs() ||
                                           !isTypeLegal(MVT::f16));

    if (PromoteToF32) {
      // Log and multiply in f32 is always good enough for f16.
      X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags);
    }

    SDValue Lowered = LowerFLOGUnsafe(X, DL, DAG, IsLog10, Flags);
    if (PromoteToF32) {
      return DAG.getNode(ISD::FP_ROUND, DL, VT, Lowered,
                         DAG.getTargetConstant(0, DL, MVT::i32), Flags);
    }

    return Lowered;
  }

  // Every f16 input returned above, so no promotion is needed on this path;
  // only the denormal-range scaling of the input remains. A null ScaledInput
  // means no scaling is required and X is used as is.
  auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, DL, X, Flags);
  if (ScaledInput)
    X = ScaledInput;

  SDValue Y = DAG.getNode(AMDGPUISD::LOG, DL, VT, X, Flags);

  SDValue R;
  if (Subtarget->hasFastFMAF32()) {
    // c+cc are ln(2)/ln(10) to more than 49 bits
    const float c_log10 = 0x1.344134p-2f;
    const float cc_log10 = 0x1.09f79ep-26f;

    // c + cc is ln(2) to more than 49 bits
    const float c_log = 0x1.62e42ep-1f;
    const float cc_log = 0x1.efa39ep-25f;

    SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT);
    SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT);
    // This adds correction terms for which contraction may lead to an increase
    // in the error of the approximation, so disable it.
    Flags.setAllowContract(false);
    // R is the rounded product; FMA0 recovers its rounding error and FMA1
    // adds the contribution of the low constant part.
    R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags);
    SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags);
    SDValue FMA0 = DAG.getNode(ISD::FMA, DL, VT, Y, C, NegR, Flags);
    SDValue FMA1 = DAG.getNode(ISD::FMA, DL, VT, Y, CC, FMA0, Flags);
    R = DAG.getNode(ISD::FADD, DL, VT, R, FMA1, Flags);
  } else {
    // ch+ct is ln(2)/ln(10) to more than 36 bits
    const float ch_log10 = 0x1.344000p-2f;
    const float ct_log10 = 0x1.3509f6p-18f;

    // ch + ct is ln(2) to more than 36 bits
    const float ch_log = 0x1.62e000p-1f;
    const float ct_log = 0x1.0bfbe8p-15f;

    SDValue CH = DAG.getConstantFP(IsLog10 ? ch_log10 : ch_log, DL, VT);
    SDValue CT = DAG.getConstantFP(IsLog10 ? ct_log10 : ct_log, DL, VT);

    // Split Y into a high part (low 12 bits of the encoding cleared) and the
    // remaining tail.
    SDValue YAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Y);
    SDValue MaskConst = DAG.getConstant(0xfffff000, DL, MVT::i32);
    SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst);
    SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt);
    SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags);
    // This adds correction terms for which contraction may lead to an increase
    // in the error of the approximation, so disable it.
    Flags.setAllowContract(false);
    SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags);
    SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags);
    SDValue Mad1 = getMad(DAG, DL, VT, YT, CH, Mad0, Flags);
    // NOTE(review): unlike the two calls above, this one uses default
    // (empty) node flags — confirm whether that is intentional.
    R = getMad(DAG, DL, VT, YH, CH, Mad1);
  }

  const bool IsFiniteOnly = Flags.hasNoNaNs() && Flags.hasNoInfs();

  // TODO: Check if known finite from source value.
  if (!IsFiniteOnly) {
    // When log2 itself is not finite, return it directly instead of the
    // scaled product.
    SDValue IsFinite = getIsFinite(DAG, Y, Flags);
    R = DAG.getNode(ISD::SELECT, DL, VT, IsFinite, R, Y, Flags);
  }

  if (IsScaled) {
    // Subtract a base-dependent constant for inputs that were scaled up.
    SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
    SDValue ShiftK =
        DAG.getConstantFP(IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT);
    SDValue Shift =
        DAG.getNode(ISD::SELECT, DL, VT, IsScaled, ShiftK, Zero, Flags);
    R = DAG.getNode(ISD::FSUB, DL, VT, R, Shift, Flags);
  }

  return R;
}


/// Lower FLOG10 by delegating to the shared FLOG/FLOG10 implementation, which
/// distinguishes the two by the node's opcode.
SDValue AMDGPUTargetLowering::LowerFLOG10(SDValue Op, SelectionDAG &DAG) const {
  SDValue Lowered = LowerFLOGCommon(Op, DAG);
  return Lowered;
}


// Do the fast math expansion for flog (natural log) or flog10: log2(x) scaled
// by ln(2) or ln(2)/ln(10). This is accurate enough for a promoted f16
// operation. For f32 inputs that need denormal handling, the input is scaled
// up first and the result offset accordingly.
SDValue AMDGPUTargetLowering::LowerFLOGUnsafe(SDValue Src, const SDLoc &SL,
                                              SelectionDAG &DAG, bool IsLog10,
                                              SDNodeFlags Flags) const {
  const EVT VT = Src.getValueType();
  const bool IsF32 = VT == MVT::f32;

  // f32 uses the target LOG node directly; other types go through FLOG2.
  const unsigned Log2Opc = IsF32 ? static_cast<unsigned>(AMDGPUISD::LOG)
                                 : static_cast<unsigned>(ISD::FLOG2);

  // log_b(x) = log2(x) * (ln(2) / ln(b))
  const double BaseScale =
      IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;

  if (IsF32) {
    std::pair<SDValue, SDValue> Scaling =
        getScaledLogInput(DAG, SL, Src, Flags);
    SDValue ScaledSrc = Scaling.first;
    SDValue WasScaled = Scaling.second;

    if (ScaledSrc) {
      SDValue RawLog = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledSrc, Flags);

      // The input was multiplied by 2^32, so take 32 * scale back off.
      SDValue ScaledBias = DAG.getConstantFP(-32.0 * BaseScale, SL, VT);
      SDValue NoBias = DAG.getConstantFP(0.0f, SL, VT);
      SDValue Bias = DAG.getNode(ISD::SELECT, SL, VT, WasScaled, ScaledBias,
                                 NoBias, Flags);

      SDValue Scale = DAG.getConstantFP(BaseScale, SL, VT);

      if (!Subtarget->hasFastFMAF32()) {
        SDValue Product = DAG.getNode(ISD::FMUL, SL, VT, RawLog, Scale, Flags);
        return DAG.getNode(ISD::FADD, SL, VT, Product, Bias);
      }

      return DAG.getNode(ISD::FMA, SL, VT, RawLog, Scale, Bias, Flags);
    }
  }

  // No scaling needed: a plain log2 followed by the base conversion.
  SDValue RawLog = DAG.getNode(Log2Opc, SL, VT, Src, Flags);
  SDValue Scale = DAG.getConstantFP(BaseScale, SL, VT);
  return DAG.getNode(ISD::FMUL, SL, VT, RawLog, Scale, Flags);
}


// This expansion gives a result slightly better than 1ulp.
//
// Shared f64 lowering for ISD::FEXP2, ISD::FEXP10 and ISD::FEXP (any opcode
// other than the first two takes the FEXP path). The input is split into an
// integer part DN (in units of powers of two) and a small remainder T, a
// polynomial in T is evaluated with a chain of FMAs, and the result is
// scaled by 2^DN with FLDEXP. Out-of-range inputs are replaced by +inf / 0.
SDValue AMDGPUTargetLowering::lowerFEXPF64(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue X = Op.getOperand(0);

  // TODO: Check if reassoc is safe. There is an output change in exp2 and
  // exp10, which slightly increases ulp.
  // All nodes below use the original flags with reassociation cleared.
  SDNodeFlags Flags = Op->getFlags() & ~SDNodeFlags::AllowReassociation;

  // DN: rounded integer part, F: remainder in the input's base (exp2/exp10
  // only), T: argument of the polynomial.
  SDValue DN, F, T;

  if (Op.getOpcode() == ISD::FEXP2) {
    // dn = rint(x)
    DN = DAG.getNode(ISD::FRINT, DL, MVT::f64, X, Flags);
    // f = x - dn
    F = DAG.getNode(ISD::FSUB, DL, MVT::f64, X, DN, Flags);
    // t = f*C1 + f*C2
    // (C1 and C2 appear to be ln(2) split into a high and a low part.)
    SDValue C1 = DAG.getConstantFP(0x1.62e42fefa39efp-1, DL, MVT::f64);
    SDValue C2 = DAG.getConstantFP(0x1.abc9e3b39803fp-56, DL, MVT::f64);
    SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f64, F, C2, Flags);
    T = DAG.getNode(ISD::FMA, DL, MVT::f64, F, C1, Mul2, Flags);
  } else if (Op.getOpcode() == ISD::FEXP10) {
    // dn = rint(x * C1)
    // (C1 appears to be log2(10).)
    SDValue C1 = DAG.getConstantFP(0x1.a934f0979a371p+1, DL, MVT::f64);
    SDValue Mul = DAG.getNode(ISD::FMUL, DL, MVT::f64, X, C1, Flags);
    DN = DAG.getNode(ISD::FRINT, DL, MVT::f64, Mul, Flags);

    // f = FMA(-dn, C2, FMA(-dn, C3, x))
    // (C3 and C2 appear to be log10(2) split into a high and a low part.)
    SDValue NegDN = DAG.getNode(ISD::FNEG, DL, MVT::f64, DN, Flags);
    SDValue C2 = DAG.getConstantFP(-0x1.9dc1da994fd21p-59, DL, MVT::f64);
    SDValue C3 = DAG.getConstantFP(0x1.34413509f79ffp-2, DL, MVT::f64);
    SDValue Inner = DAG.getNode(ISD::FMA, DL, MVT::f64, NegDN, C3, X, Flags);
    F = DAG.getNode(ISD::FMA, DL, MVT::f64, NegDN, C2, Inner, Flags);

    // t = FMA(f, C4, f*C5)
    // (C4 and C5 appear to be ln(10) split into a high and a low part.)
    SDValue C4 = DAG.getConstantFP(0x1.26bb1bbb55516p+1, DL, MVT::f64);
    SDValue C5 = DAG.getConstantFP(-0x1.f48ad494ea3e9p-53, DL, MVT::f64);
    SDValue MulF = DAG.getNode(ISD::FMUL, DL, MVT::f64, F, C5, Flags);
    T = DAG.getNode(ISD::FMA, DL, MVT::f64, F, C4, MulF, Flags);
  } else { // ISD::FEXP
    // dn = rint(x * C1)
    // (C1 appears to be log2(e).)
    SDValue C1 = DAG.getConstantFP(0x1.71547652b82fep+0, DL, MVT::f64);
    SDValue Mul = DAG.getNode(ISD::FMUL, DL, MVT::f64, X, C1, Flags);
    DN = DAG.getNode(ISD::FRINT, DL, MVT::f64, Mul, Flags);

    // t = FMA(-dn, C2, FMA(-dn, C3, x))
    // (Same ln(2) high/low constants as the exp2 path; F stays unset here.)
    SDValue NegDN = DAG.getNode(ISD::FNEG, DL, MVT::f64, DN, Flags);
    SDValue C2 = DAG.getConstantFP(0x1.abc9e3b39803fp-56, DL, MVT::f64);
    SDValue C3 = DAG.getConstantFP(0x1.62e42fefa39efp-1, DL, MVT::f64);
    SDValue Inner = DAG.getNode(ISD::FMA, DL, MVT::f64, NegDN, C3, X, Flags);
    T = DAG.getNode(ISD::FMA, DL, MVT::f64, NegDN, C2, Inner, Flags);
  }

  // Polynomial expansion for p
  // Evaluated in Horner form, highest-order coefficient first. The
  // coefficients are close to 1/k! for k = 11 down to 2 — presumably a tuned
  // approximation of e^t; confirm against the reference implementation.
  SDValue P = DAG.getConstantFP(0x1.ade156a5dcb37p-26, DL, MVT::f64);
  P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
                  DAG.getConstantFP(0x1.28af3fca7ab0cp-22, DL, MVT::f64),
                  Flags);
  P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
                  DAG.getConstantFP(0x1.71dee623fde64p-19, DL, MVT::f64),
                  Flags);
  P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
                  DAG.getConstantFP(0x1.a01997c89e6b0p-16, DL, MVT::f64),
                  Flags);
  P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
                  DAG.getConstantFP(0x1.a01a014761f6ep-13, DL, MVT::f64),
                  Flags);
  P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
                  DAG.getConstantFP(0x1.6c16c1852b7b0p-10, DL, MVT::f64),
                  Flags);
  P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
                  DAG.getConstantFP(0x1.1111111122322p-7, DL, MVT::f64), Flags);
  P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
                  DAG.getConstantFP(0x1.55555555502a1p-5, DL, MVT::f64), Flags);
  P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
                  DAG.getConstantFP(0x1.5555555555511p-3, DL, MVT::f64), Flags);
  P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
                  DAG.getConstantFP(0x1.000000000000bp-1, DL, MVT::f64), Flags);

  SDValue One = DAG.getConstantFP(1.0, DL, MVT::f64);

  // The last two Horner steps both add exactly 1.0 (the t^1 and t^0 terms).
  P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P, One, Flags);
  P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P, One, Flags);

  // z = ldexp(p, (int)dn)
  SDValue DNInt = DAG.getNode(ISD::FP_TO_SINT, DL, MVT::i32, DN);
  SDValue Z = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, P, DNInt, Flags);

  // Overflow/underflow guards
  // Both compares are unordered-or-true (SETULE / SETUGE), so a NaN input
  // keeps the computed Z rather than being replaced by +inf or 0.
  // NOTE(review): the 1024.0 / -1075.0 thresholds are applied to X for all
  // three opcodes; for exp and exp10 this presumably relies on FLDEXP
  // saturating for inputs between the true limit and these bounds — confirm.
  SDValue CondHi = DAG.getSetCC(
      DL, MVT::i1, X, DAG.getConstantFP(1024.0, DL, MVT::f64), ISD::SETULE);

  // The +inf replacement is skipped when the no-infs flag is set.
  if (!Flags.hasNoInfs()) {
    SDValue PInf = DAG.getConstantFP(std::numeric_limits<double>::infinity(),
                                     DL, MVT::f64);
    Z = DAG.getSelect(DL, MVT::f64, CondHi, Z, PInf, Flags);
  }

  SDValue CondLo = DAG.getSetCC(
      DL, MVT::i1, X, DAG.getConstantFP(-1075.0, DL, MVT::f64), ISD::SETUGE);
  SDValue Zero = DAG.getConstantFP(0.0, DL, MVT::f64);
  Z = DAG.getSelect(DL, MVT::f64, CondLo, Z, Zero, Flags);

  return Z;
}


/// Lower FEXP2.
///
/// f64 is forwarded to lowerFEXPF64 and f16 is computed in f32 and rounded
/// back. For f32 the target EXP node is used directly unless denormals must
/// be handled, in which case the input is shifted up and the result scaled
/// back down:
///
///   s      = x < -0x1.f80000p+6f
///   result = v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f)
SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const {
  // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.

  const EVT VT = Op.getValueType();
  if (VT == MVT::f64)
    return lowerFEXPF64(Op, DAG);

  SDLoc DL(Op);
  SDValue Input = Op.getOperand(0);
  SDNodeFlags Flags = Op->getFlags();

  if (VT == MVT::f16) {
    // Nothing in half is a denormal when promoted to f32.
    assert(!isTypeLegal(MVT::f16));
    SDValue Wide = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Input, Flags);
    SDValue WideExp = DAG.getNode(AMDGPUISD::EXP, DL, MVT::f32, Wide, Flags);
    return DAG.getNode(ISD::FP_ROUND, DL, VT, WideExp,
                       DAG.getTargetConstant(0, DL, MVT::i32), Flags);
  }

  assert(VT == MVT::f32);

  const bool HandleDenormals = needsDenormHandlingF32(DAG, Input, Flags);
  if (!HandleDenormals)
    return DAG.getNode(AMDGPUISD::EXP, DL, MVT::f32, Input, Flags);

  // -nextafter(128.0, -1)
  SDValue Threshold = DAG.getConstantFP(-0x1.f80000p+6f, DL, VT);

  EVT CondVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
  SDValue IsTooSmall = DAG.getSetCC(DL, CondVT, Input, Threshold, ISD::SETOLT);

  // Shift the input up by 64 when it is below the threshold.
  SDValue Shift64 = DAG.getConstantFP(0x1.0p+6f, DL, VT);
  SDValue NoShift = DAG.getConstantFP(0.0, DL, VT);
  SDValue InputShift =
      DAG.getNode(ISD::SELECT, DL, VT, IsTooSmall, Shift64, NoShift);

  SDValue Shifted = DAG.getNode(ISD::FADD, DL, VT, Input, InputShift, Flags);
  SDValue RawExp = DAG.getNode(AMDGPUISD::EXP, DL, VT, Shifted, Flags);

  // Undo the shift by multiplying with 2^-64.
  SDValue TwoPowNeg64 = DAG.getConstantFP(0x1.0p-64f, DL, VT);
  SDValue Unit = DAG.getConstantFP(1.0, DL, VT);
  SDValue OutputScale =
      DAG.getNode(ISD::SELECT, DL, VT, IsTooSmall, TwoPowNeg64, Unit);

  return DAG.getNode(ISD::FMUL, DL, VT, RawExp, OutputScale, Flags);
}


/// Emit the simple exp / exp10 expansion in terms of exp2:
///
///   exp(x)   -> exp2(M_LOG2E_F * x)
///   exp10(x) -> exp2(log2(10) * x)
///
/// f32 uses the target EXP node; any other type uses ISD::FEXP2.
SDValue AMDGPUTargetLowering::lowerFEXPUnsafeImpl(SDValue X, const SDLoc &SL,
                                                  SelectionDAG &DAG,
                                                  SDNodeFlags Flags,
                                                  bool IsExp10) const {
  const EVT VT = X.getValueType();

  // The ternary mixes a float and a double literal, so it yields a double.
  const double Log2OfBase = IsExp10 ? 0x1.a934f0p+1f : numbers::log2e;
  SDValue Multiplier = DAG.getConstantFP(Log2OfBase, SL, VT);

  SDValue Scaled = DAG.getNode(ISD::FMUL, SL, VT, X, Multiplier, Flags);

  const unsigned Exp2Opc = VT == MVT::f32
                               ? static_cast<unsigned>(AMDGPUISD::EXP)
                               : static_cast<unsigned>(ISD::FEXP2);
  return DAG.getNode(Exp2Opc, SL, VT, Scaled, Flags);
}


/// Emit the approximate lowering for exp.
///
/// Non-f32 types, and f32 when no denormal handling is needed, use the plain
/// exp2(x * log2(e)) expansion. Otherwise inputs below -0x1.5d58a0p+6f are
/// shifted up by 64 before the expansion and the result is multiplied by
/// 0x1.969d48p-93f to compensate.
SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, const SDLoc &SL,
                                              SelectionDAG &DAG,
                                              SDNodeFlags Flags) const {
  const EVT VT = X.getValueType();

  const bool UseScaledPath =
      VT == MVT::f32 && needsDenormHandlingF32(DAG, X, Flags);
  if (!UseScaledPath)
    return lowerFEXPUnsafeImpl(X, SL, DAG, Flags, /*IsExp10=*/false);

  EVT CondVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);

  SDValue Limit = DAG.getConstantFP(-0x1.5d58a0p+6f, SL, VT);
  SDValue IsTooSmall = DAG.getSetCC(SL, CondVT, X, Limit, ISD::SETOLT);

  // Shift small inputs up by 64.
  SDValue Shift64 = DAG.getConstantFP(0x1.0p+6f, SL, VT);
  SDValue Shifted = DAG.getNode(ISD::FADD, SL, VT, X, Shift64, Flags);
  SDValue Chosen = DAG.getNode(ISD::SELECT, SL, VT, IsTooSmall, Shifted, X);

  const SDValue Log2OfE = DAG.getConstantFP(numbers::log2e, SL, VT);
  SDValue Exp2Arg = DAG.getNode(ISD::FMUL, SL, VT, Chosen, Log2OfE, Flags);

  SDValue RawExp = DAG.getNode(AMDGPUISD::EXP, SL, VT, Exp2Arg, Flags);

  // Compensate for the input shift on the scaled path.
  SDValue Compensation = DAG.getConstantFP(0x1.969d48p-93f, SL, VT);
  SDValue Rescaled =
      DAG.getNode(ISD::FMUL, SL, VT, RawExp, Compensation, Flags);

  return DAG.getNode(ISD::SELECT, SL, VT, IsTooSmall, Rescaled, RawExp, Flags);
}


/// Emit approx-funcs appropriate lowering for exp10. inf/nan should still be
/// handled correctly.
///
/// exp10(x) is formed as exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f).
/// For f32 with denormal handling, inputs below -0x1.2f7030p+5f are shifted up
/// by 32 first and the product is multiplied by 0x1.9f623ep-107f.
SDValue AMDGPUTargetLowering::lowerFEXP10Unsafe(SDValue X, const SDLoc &SL,
                                                SelectionDAG &DAG,
                                                SDNodeFlags Flags) const {
  const EVT VT = X.getValueType();
  const bool IsF32 = VT == MVT::f32;

  const unsigned Exp2Op = IsF32 ? static_cast<unsigned>(AMDGPUISD::EXP)
                                : static_cast<unsigned>(ISD::FEXP2);

  // Emits exp2(In * K0) and exp2(In * K1), in that order.
  auto EmitExp2Pair = [&](SDValue In) -> std::pair<SDValue, SDValue> {
    SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
    SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);

    SDValue Prod0 = DAG.getNode(ISD::FMUL, SL, VT, In, K0, Flags);
    SDValue First = DAG.getNode(Exp2Op, SL, VT, Prod0, Flags);
    SDValue Prod1 = DAG.getNode(ISD::FMUL, SL, VT, In, K1, Flags);
    SDValue Second = DAG.getNode(Exp2Op, SL, VT, Prod1, Flags);
    return {First, Second};
  };

  if (!IsF32 || !needsDenormHandlingF32(DAG, X, Flags)) {
    // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
    std::pair<SDValue, SDValue> Parts = EmitExp2Pair(X);
    return DAG.getNode(ISD::FMUL, SL, VT, Parts.first, Parts.second);
  }

  // bool s = x < -0x1.2f7030p+5f;
  // x += s ? 0x1.0p+5f : 0.0f;
  // exp10 = exp2(x * 0x1.a92000p+1f) *
  //        exp2(x * 0x1.4f0978p-11f) *
  //        (s ? 0x1.9f623ep-107f : 1.0f);

  EVT CondVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);

  SDValue Limit = DAG.getConstantFP(-0x1.2f7030p+5f, SL, VT);
  SDValue IsTooSmall = DAG.getSetCC(SL, CondVT, X, Limit, ISD::SETOLT);

  SDValue Shift32 = DAG.getConstantFP(0x1.0p+5f, SL, VT);
  SDValue Shifted = DAG.getNode(ISD::FADD, SL, VT, X, Shift32, Flags);
  SDValue Chosen = DAG.getNode(ISD::SELECT, SL, VT, IsTooSmall, Shifted, X);

  std::pair<SDValue, SDValue> Parts = EmitExp2Pair(Chosen);
  SDValue Product =
      DAG.getNode(ISD::FMUL, SL, VT, Parts.first, Parts.second, Flags);

  // Compensate for the input shift on the scaled path.
  SDValue Compensation = DAG.getConstantFP(0x1.9f623ep-107f, SL, VT);
  SDValue Rescaled =
      DAG.getNode(ISD::FMUL, SL, VT, Product, Compensation, Flags);

  return DAG.getNode(ISD::SELECT, SL, VT, IsTooSmall, Rescaled, Product, Flags);
}


/// Lower ISD::FEXP and ISD::FEXP10 (selected by the node's opcode).
///
/// f64 is forwarded to lowerFEXPF64. Nodes allowing approximate functions use
/// the lowerFEXP*Unsafe expansions. Scalar f16 is promoted to f32 and rounded
/// back; vector f16 returns a null SDValue. The remaining f32 case multiplies
/// x by an extended-precision log2(e) / log2(10), splits the product into a
/// rounded integer part and a remainder, and combines the target EXP node
/// with FLDEXP, followed by explicit underflow and overflow clamps.
SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();

  if (VT == MVT::f64)
    return lowerFEXPF64(Op, DAG);

  SDLoc SL(Op);
  SDValue X = Op.getOperand(0);
  SDNodeFlags Flags = Op->getFlags();
  const bool IsExp10 = Op.getOpcode() == ISD::FEXP10;

  // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
  // library behavior. Also, is known-not-daz source sufficient?
  if (allowApproxFunc(DAG, Flags)) { // TODO: Does this really require fast?
    return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
                   : lowerFEXPUnsafe(X, SL, DAG, Flags);
  }

  if (VT.getScalarType() == MVT::f16) {
    // Vector f16 is not handled here; a null value is returned for it.
    if (VT.isVector())
      return SDValue();

    // Nothing in half is a denormal when promoted to f32.
    //
    // exp(f16 x) ->
    //   fptrunc (v_exp_f32 (fmul (fpext x), log2e))
    //
    // exp10(f16 x) ->
    //   fptrunc (v_exp_f32 (fmul (fpext x), log2(10)))
    SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, X, Flags);
    SDValue Lowered = lowerFEXPUnsafeImpl(Ext, SL, DAG, Flags, IsExp10);
    return DAG.getNode(ISD::FP_ROUND, SL, VT, Lowered,
                       DAG.getTargetConstant(0, SL, MVT::i32), Flags);
  }

  assert(VT == MVT::f32);

  // NOTE(review): the table-based algorithm sketched below does not obviously
  // match the code that follows (no precomputed 2^(j/64) table or degree-5
  // polynomial is visible in this function) — confirm whether this comment is
  // stale.
  //
  //    Algorithm:
  //
  //    e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
  //
  //    x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
  //    n = 64*m + j,   0 <= j < 64
  //
  //    e^x = 2^((64*m + j + f)/64)
  //        = (2^m) * (2^(j/64)) * 2^(f/64)
  //        = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
  //
  //    f = x*(64/ln(2)) - n
  //    r = f*(ln(2)/64) = x - n*(ln(2)/64)
  //
  //    e^x = (2^m) * (2^(j/64)) * e^r
  //
  //    (2^(j/64)) is precomputed
  //
  //    e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
  //    e^r = 1 + q
  //
  //    q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
  //
  //    e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
  SDNodeFlags FlagsNoContract = Flags;
  FlagsNoContract.setAllowContract(false);

  // PH + PL together represent x * log2(base): PH is the rounded product and
  // PL the correction term.
  SDValue PH, PL;
  if (Subtarget->hasFastFMAF32()) {
    const float c_exp = numbers::log2ef;
    const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
    const float c_exp10 = 0x1.a934f0p+1f;
    const float cc_exp10 = 0x1.2f346ep-24f;

    SDValue C = DAG.getConstantFP(IsExp10 ? c_exp10 : c_exp, SL, VT);
    SDValue CC = DAG.getConstantFP(IsExp10 ? cc_exp10 : cc_exp, SL, VT);

    // FMA0 recovers the rounding error of the PH product; PL then adds the
    // contribution of the low constant part.
    PH = DAG.getNode(ISD::FMUL, SL, VT, X, C, Flags);
    SDValue NegPH = DAG.getNode(ISD::FNEG, SL, VT, PH, Flags);
    SDValue FMA0 = DAG.getNode(ISD::FMA, SL, VT, X, C, NegPH, Flags);
    PL = DAG.getNode(ISD::FMA, SL, VT, X, CC, FMA0, Flags);
  } else {
    const float ch_exp = 0x1.714000p+0f;
    const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits

    const float ch_exp10 = 0x1.a92000p+1f;
    const float cl_exp10 = 0x1.4f0978p-11f;

    SDValue CH = DAG.getConstantFP(IsExp10 ? ch_exp10 : ch_exp, SL, VT);
    SDValue CL = DAG.getConstantFP(IsExp10 ? cl_exp10 : cl_exp, SL, VT);

    // Split X into a high part (low 12 bits of the encoding cleared) and the
    // remaining low part.
    SDValue XAsInt = DAG.getNode(ISD::BITCAST, SL, MVT::i32, X);
    SDValue MaskConst = DAG.getConstant(0xfffff000, SL, MVT::i32);
    SDValue XHAsInt = DAG.getNode(ISD::AND, SL, MVT::i32, XAsInt, MaskConst);
    SDValue XH = DAG.getNode(ISD::BITCAST, SL, VT, XHAsInt);
    SDValue XL = DAG.getNode(ISD::FSUB, SL, VT, X, XH, Flags);

    PH = DAG.getNode(ISD::FMUL, SL, VT, XH, CH, Flags);

    // PL collects the remaining cross terms: XL*CL + XL*CH + XH*CL.
    SDValue XLCL = DAG.getNode(ISD::FMUL, SL, VT, XL, CL, Flags);
    SDValue Mad0 = getMad(DAG, SL, VT, XL, CH, XLCL, Flags);
    PL = getMad(DAG, SL, VT, XH, CL, Mad0, Flags);
  }

  // E is the integer part of the base-2 exponent.
  SDValue E = DAG.getNode(ISD::FROUNDEVEN, SL, VT, PH, Flags);

  // It is unsafe to contract this fsub into the PH multiply.
  SDValue PHSubE = DAG.getNode(ISD::FSUB, SL, VT, PH, E, FlagsNoContract);

  // A is the remainder fed to the target EXP node; the integer part is applied
  // afterwards with FLDEXP.
  SDValue A = DAG.getNode(ISD::FADD, SL, VT, PHSubE, PL, Flags);
  SDValue IntE = DAG.getNode(ISD::FP_TO_SINT, SL, MVT::i32, E);
  SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, A, Flags);

  SDValue R = DAG.getNode(ISD::FLDEXP, SL, VT, Exp2, IntE, Flags);

  // Inputs below this base-dependent threshold produce 0.0.
  SDValue UnderflowCheckConst =
      DAG.getConstantFP(IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, SL, VT);

  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
  SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
  SDValue Underflow =
      DAG.getSetCC(SL, SetCCVT, X, UnderflowCheckConst, ISD::SETOLT);

  R = DAG.getNode(ISD::SELECT, SL, VT, Underflow, Zero, R);

  // Unless infinities are excluded by the flags, inputs above this
  // base-dependent threshold produce +inf.
  if (!Flags.hasNoInfs()) {
    SDValue OverflowCheckConst =
        DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT);
    SDValue Overflow =
        DAG.getSetCC(SL, SetCCVT, X, OverflowCheckConst, ISD::SETOGT);
    SDValue Inf =
        DAG.getConstantFP(APFloat::getInf(APFloat::IEEEsingle()), SL, VT);
    R = DAG.getNode(ISD::SELECT, SL, VT, Overflow, Inf, R);
  }

  return R;
}


/// Returns true when \p Opc is one of the two count-leading-zeros opcodes,
/// ISD::CTLZ or ISD::CTLZ_ZERO_POISON.
static bool isCtlzOpc(unsigned Opc) {
  switch (Opc) {
  case ISD::CTLZ:
  case ISD::CTLZ_ZERO_POISON:
    return true;
  default:
    return false;
  }
}


/// Returns true when \p Opc is one of the two count-trailing-zeros opcodes,
/// ISD::CTTZ or ISD::CTTZ_ZERO_POISON.
static bool isCttzOpc(unsigned Opc) {
  switch (Opc) {
  case ISD::CTTZ:
  case ISD::CTTZ_ZERO_POISON:
    return true;
  default:
    return false;
  }
}


/// Lower an i8 / i16 CTLZ or CTLZ_ZERO_POISON by doing the count on an i32
/// value and truncating the result back to the original type. Any other
/// result type yields an empty SDValue.
SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op,
                                               SelectionDAG &DAG) const {
  const SDLoc DL(Op);
  const unsigned Opc = Op.getOpcode();
  const SDValue Arg = Op.getOperand(0u);
  const EVT ResultVT = Op.getValueType();

  const bool IsNarrowInt = ResultVT == MVT::i8 || ResultVT == MVT::i16;
  if (!IsNarrowInt)
    return SDValue();

  assert(isCtlzOpc(Opc));
  assert(ResultVT == Arg.getValueType());

  // Number of extra high bits the operand gains when widened to 32 bits.
  const uint64_t NarrowBits = ResultVT.getFixedSizeInBits();
  const SDValue PadBits = DAG.getConstant(32u - NarrowBits, DL, MVT::i32);

  SDValue Wide;
  if (Opc != ISD::CTLZ_ZERO_POISON) {
    // Zero-extend, count on 32 bits, then remove the zeros contributed by the
    // padding.
    Wide = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Arg);
    Wide = DAG.getNode(Opc, DL, MVT::i32, Wide);
    Wide = DAG.getNode(ISD::SUB, DL, MVT::i32, Wide, PadBits);
  } else {
    // Move the narrow value to the top of the 32-bit register so the count
    // needs no correction afterwards.
    Wide = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Arg);
    Wide = DAG.getNode(ISD::SHL, DL, MVT::i32, Wide, PadBits);
    Wide = DAG.getNode(Opc, DL, MVT::i32, Wide);
  }

  return DAG.getNode(ISD::TRUNCATE, DL, ResultVT, Wide);
}


/// Lower CTLZ / CTTZ (and their *_ZERO_POISON forms) onto the FFBH_U32 /
/// FFBL_B32 target nodes. i32 sources and non-divergent i64 sources use a
/// single node; any other source is split into two 32-bit halves.
SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
  const SDLoc DL(Op);
  const SDValue Input = Op.getOperand(0);
  const EVT InputVT = Input.getValueType();

  assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
  const unsigned Opcode = Op.getOpcode();
  const bool IsCtlz = isCtlzOpc(Opcode);
  const unsigned FindBitOpc =
      IsCtlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;

  const bool ZeroIsPoison = Opcode == ISD::CTLZ_ZERO_POISON ||
                            Opcode == ISD::CTTZ_ZERO_POISON;
  const bool IsUniform64 = !Input->isDivergent() && InputVT == MVT::i64;

  if (InputVT == MVT::i32 || IsUniform64) {
    // 32-bit source:
    //   (ctlz src)             -> (umin (ffbh src), 32)
    //   (cttz src)             -> (umin (ffbl src), 32)
    //   (ctlz_zero_poison src) -> (ffbh src)
    //   (cttz_zero_poison src) -> (ffbl src)
    //
    // 64-bit scalar source (the node still produces a 32-bit result):
    //   (ctlz hi:lo)             -> (umin (S_FLBIT_I32_B64 src), 64)
    //   (cttz hi:lo)             -> (umin (S_FF1_I32_B64 src), 64)
    //   (ctlz_zero_poison hi:lo) -> (S_FLBIT_I32_B64 src)
    //   (cttz_zero_poison hi:lo) -> (S_FF1_I32_B64 src)
    SDValue Count = DAG.getNode(FindBitOpc, DL, MVT::i32, Input);
    if (!ZeroIsPoison) {
      // Clamp to the bit width of the result type.
      const SDValue BitWidth = DAG.getConstant(
          Op.getValueType().getScalarSizeInBits(), DL, MVT::i32);
      Count = DAG.getNode(ISD::UMIN, DL, MVT::i32, Count, BitWidth);
    }
    return DAG.getNode(ISD::ZERO_EXTEND, DL, InputVT, Count);
  }

  auto [Lo, Hi] = split64BitValue(Input, DAG);

  SDValue CountLo = DAG.getNode(FindBitOpc, DL, MVT::i32, Lo);
  SDValue CountHi = DAG.getNode(FindBitOpc, DL, MVT::i32, Hi);

  // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
  // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
  // (ctlz_zero_poison hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
  // (cttz_zero_poison hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))

  const unsigned BiasOpc = ZeroIsPoison ? ISD::ADD : ISD::UADDSAT;
  const SDValue ThirtyTwo = DAG.getConstant(32, DL, MVT::i32);

  // The half that is examined second gets the 32-bit bias: the low half for
  // ctlz, the high half for cttz.
  SDValue &SecondHalf = IsCtlz ? CountLo : CountHi;
  SecondHalf = DAG.getNode(BiasOpc, DL, MVT::i32, SecondHalf, ThirtyTwo);

  SDValue Count = DAG.getNode(ISD::UMIN, DL, MVT::i32, CountLo, CountHi);
  if (!ZeroIsPoison) {
    const SDValue SixtyFour = DAG.getConstant(64, DL, MVT::i32);
    Count = DAG.getNode(ISD::UMIN, DL, MVT::i32, Count, SixtyFour);
  }

  return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Count);
}


/// Lower an i32 CTLS as umin(amdgcn.sffbh(src), 32) + (-1): the intrinsic
/// result is clamped to at most 32 (unsigned) and then decremented by one.
SDValue AMDGPUTargetLowering::LowerCTLS(SDValue Op, SelectionDAG &DAG) const {
  const SDLoc DL(Op);
  const SDValue Src = Op.getOperand(0);
  assert(Src.getValueType() == MVT::i32 && "LowerCTLS only supports i32");

  const SDValue IntrinsicID =
      DAG.getTargetConstant(Intrinsic::amdgcn_sffbh, DL, MVT::i32);
  const SDValue RawCount =
      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, IntrinsicID, Src);

  const SDValue Limit = DAG.getConstant(32, DL, MVT::i32);
  const SDValue Bounded =
      DAG.getNode(ISD::UMIN, DL, MVT::i32, RawCount, Limit);

  // Adding all-ones is a subtraction of one.
  const SDValue MinusOne = DAG.getAllOnesConstant(DL, MVT::i32);
  return DAG.getNode(ISD::ADD, DL, MVT::i32, Bounded, MinusOne);
}


/// Lower a 64-bit integer to f32 conversion.
///
/// \param Op     The [SU]INT_TO_FP node; operand 0 is the i64 source.
/// \param DAG    The DAG to build the replacement nodes in.
/// \param Signed True for a signed conversion, false for unsigned.
/// \returns an f32 value equal to the converted source.
SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
                                               bool Signed) const {
  // The regular method converting a 64-bit integer to float roughly consists of
  // 2 steps: normalization and rounding. In fact, after normalization, the
  // conversion from a 64-bit integer to a float is essentially the same as the
  // one from a 32-bit integer. The only difference is that it has more
  // trailing bits to be rounded. To leverage the native 32-bit conversion, a
  // 64-bit integer could be preprocessed and fit into a 32-bit integer then
  // converted into the correct float number. The basic steps for the unsigned
  // conversion are illustrated in the following pseudo code:
  //
  // f32 uitofp(i64 u) {
  //   i32 hi, lo = split(u);
  //   // Only count the leading zeros in hi as we have native support of the
  //   // conversion from i32 to f32. If hi is all 0s, the conversion is
  //   // reduced to a 32-bit one automatically.
  //   i32 shamt = clz(hi); // Return 32 if hi is all 0s.
  //   u <<= shamt;
  //   hi, lo = split(u);
  //   hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
  //   // convert it as a 32-bit integer and scale the result back.
  //   return uitofp(hi) * 2^(32 - shamt);
  // }
  //
  // The signed one follows the same principle but uses 'ffbh_i32' to count its
  // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
  // converted instead, followed by negation based on its sign bit.

  SDLoc SL(Op);
  SDValue Src = Op.getOperand(0);

  SDValue Lo, Hi;
  std::tie(Lo, Hi) = split64BitValue(Src, DAG);
  // The 64-bit value that is actually normalized below. It is the source
  // itself unless the absolute-value path is taken.
  SDValue NormSrc = Src;
  SDValue Sign;
  SDValue ShAmt;
  if (Signed && Subtarget->isGCN()) {
    // We also need to consider the sign bit in Lo if Hi has just sign bits,
    // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
    // account. That is, the maximal shift is
    // - 32 if Lo and Hi have opposite signs;
    // - 33 if Lo and Hi have the same sign.
    //
    // Or, MaxShAmt = 33 + OppositeSign, where
    //
    // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
    // - -1 if Lo and Hi have opposite signs; and
    // -  0 otherwise.
    //
    // All in all, ShAmt is calculated as
    //
    //  umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
    //
    // or
    //
    //  umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
    //
    // to reduce the critical path.
    SDValue OppositeSign = DAG.getNode(
        ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
        DAG.getConstant(31, SL, MVT::i32));
    SDValue MaxShAmt =
        DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
                    OppositeSign);
    // Count the leading sign bits.
    ShAmt = DAG.getNode(
        ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
        DAG.getTargetConstant(Intrinsic::amdgcn_sffbh, SL, MVT::i32), Hi);
    // Different from unsigned conversion, the shift should be one bit less to
    // preserve the sign bit.
    ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
                        DAG.getConstant(1, SL, MVT::i32));
    ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
  } else {
    if (Signed) {
      // Without 'ffbh_i32', only leading zeros could be counted. Take the
      // absolute value first.
      Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
                         DAG.getConstant(63, SL, MVT::i64));
      SDValue Abs =
          DAG.getNode(ISD::XOR, SL, MVT::i64,
                      DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
      std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
      // The shift amount below is derived from Abs, so Abs (not Src) is the
      // value that must be normalized; the sign is re-applied at the end.
      NormSrc = Abs;
    }
    // Count the leading zeros.
    ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
    // The shift amount for signed integers is [0, 32].
  }
  // Normalize the given 64-bit integer.
  SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, NormSrc, ShAmt);
  // Split it again.
  std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
  // Calculate the adjust bit for rounding.
  // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
  SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
                               DAG.getConstant(1, SL, MVT::i32), Lo);
  // Get the 32-bit normalized integer.
  Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
  // Convert the normalized 32-bit integer into f32.

  // NOTE(review): the signed handling below presumes that FLDEXP is legal
  // exactly when the 'sffbh' path above was taken - confirm against the
  // subtarget setup.
  bool UseLDEXP = isOperationLegal(ISD::FLDEXP, MVT::f32);
  unsigned Opc = Signed && UseLDEXP ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
  SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);

  // Finally, need to scale back the converted floating number as the original
  // 64-bit integer is converted as a 32-bit one.
  ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
                      ShAmt);
  // On GCN, use LDEXP directly.
  if (UseLDEXP)
    return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt);

  // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
  // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
  // exponent is enough to avoid overflowing into the sign bit.
  SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
                            DAG.getConstant(23, SL, MVT::i32));
  SDValue IVal =
      DAG.getNode(ISD::ADD, SL, MVT::i32,
                  DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
  if (Signed) {
    // Sign is only produced by the absolute-value path above.
    assert(Sign && "sign mask is only available on the absolute-value path");
    // Set the sign bit.
    Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
                       DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
                       DAG.getConstant(31, SL, MVT::i32));
    IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
  }
  return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
}


/// Lower a 64-bit integer to f64 conversion as
/// ldexp(cvt(hi), 32) + uitofp(lo), where the high half is converted as
/// signed or unsigned according to \p Signed.
SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
                                               bool Signed) const {
  const SDLoc DL(Op);
  const SDValue Input = Op.getOperand(0);

  auto [LoHalf, HiHalf] = split64BitValue(Input, DAG);

  // Only the high half is converted with the requested signedness; the low
  // half is always converted as unsigned.
  const unsigned HiCvtOpc = Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
  const SDValue HiAsFP = DAG.getNode(HiCvtOpc, DL, MVT::f64, HiHalf);
  const SDValue LoAsFP = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f64, LoHalf);

  // Scale the converted high half by 2^32.
  const SDValue Exp32 = DAG.getConstant(32, DL, MVT::i32);
  const SDValue ScaledHi =
      DAG.getNode(ISD::FLDEXP, DL, MVT::f64, HiAsFP, Exp32);

  // TODO: Should this propagate fast-math-flags?
  return DAG.getNode(ISD::FADD, DL, MVT::f64, ScaledHi, LoAsFP);
}


/// Custom lowering for UINT_TO_FP. i16 sources are promoted to i32 (except
/// for i16 -> f16, which is returned unchanged), bf16 results go through f32,
/// and i64 sources are dispatched to the f16 / f32 / f64 specific paths.
/// Everything else is returned unchanged.
SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
                                               SelectionDAG &DAG) const {
  // TODO: Factor out code common with LowerSINT_TO_FP.
  const SDLoc DL(Op);
  const SDValue Input = Op.getOperand(0);
  const EVT SrcVT = Input.getValueType();
  const EVT DestVT = Op.getValueType();

  if (SrcVT == MVT::i16) {
    if (DestVT == MVT::f16)
      return Op;

    // Promote src to i32
    const SDValue Widened = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Input);
    return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Widened);
  }

  if (DestVT == MVT::bf16) {
    // Convert to f32 first, then round down to bf16.
    const SDValue AsF32 = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Input);
    const SDValue RoundFlag = DAG.getIntPtrConstant(0, DL, /*isTarget=*/true);
    return DAG.getNode(ISD::FP_ROUND, DL, MVT::bf16, AsF32, RoundFlag);
  }

  if (SrcVT != MVT::i64)
    return Op;

  if (DestVT == MVT::f16 && isTypeLegal(MVT::f16)) {
    // Convert to f32 with the same opcode, then round to f16.
    const SDValue AsF32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Input);
    const SDValue RoundFlag = DAG.getIntPtrConstant(0, DL, /*isTarget=*/true);
    return DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, AsF32, RoundFlag);
  }

  if (DestVT == MVT::f32)
    return LowerINT_TO_FP32(Op, DAG, false);

  assert(DestVT == MVT::f64);
  return LowerINT_TO_FP64(Op, DAG, false);
}


/// Custom lowering for SINT_TO_FP. i16 sources are promoted to i32 (except
/// for i16 -> f16, which is returned unchanged), bf16 results go through f32,
/// and i64 sources are dispatched to the f16 / f32 / f64 specific paths.
/// Everything else is returned unchanged.
SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
                                              SelectionDAG &DAG) const {
  // TODO: Factor out code common with LowerUINT_TO_FP.
  const SDLoc DL(Op);
  const SDValue Input = Op.getOperand(0);
  const EVT SrcVT = Input.getValueType();
  const EVT DestVT = Op.getValueType();

  if (SrcVT == MVT::i16) {
    if (DestVT == MVT::f16)
      return Op;

    // Promote src to i32
    const SDValue Widened = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Input);
    return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Widened);
  }

  if (DestVT == MVT::bf16) {
    // Convert to f32 first, then round down to bf16.
    const SDValue AsF32 = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Input);
    const SDValue RoundFlag = DAG.getIntPtrConstant(0, DL, /*isTarget=*/true);
    return DAG.getNode(ISD::FP_ROUND, DL, MVT::bf16, AsF32, RoundFlag);
  }

  if (SrcVT != MVT::i64)
    return Op;

  if (DestVT == MVT::f16 && isTypeLegal(MVT::f16)) {
    // Convert to f32 with the same opcode, then round to f16.
    const SDValue AsF32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Input);
    const SDValue RoundFlag = DAG.getIntPtrConstant(0, DL, /*isTarget=*/true);
    return DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, AsF32, RoundFlag);
  }

  if (DestVT == MVT::f32)
    return LowerINT_TO_FP32(Op, DAG, true);

  assert(DestVT == MVT::f64);
  return LowerINT_TO_FP64(Op, DAG, true);
}


/// Lower an f32 / f64 to i64 conversion by producing the two 32-bit halves
/// separately and gluing them together with a v2i32 build_vector + bitcast.
SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
                                               bool Signed) const {
  const SDLoc DL(Op);
  const SDValue Input = Op.getOperand(0);
  const EVT SrcVT = Input.getValueType();

  assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);

  const bool IsF64 = SrcVT == MVT::f64;
  // Signed f32 inputs are converted by magnitude and sign-corrected at the end.
  const bool ViaMagnitude = Signed && SrcVT == MVT::f32;

  // The basic idea of converting a floating point number into a pair of 32-bit
  // integers is illustrated as follows:
  //
  //     tf := trunc(val);
  //    hif := floor(tf * 2^-32);
  //    lof := tf - hif * 2^32; // lof is always positive due to floor.
  //     hi := fptoi(hif);
  //     lo := fptoi(lof);
  //
  SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, SrcVT, Input);
  SDValue Sign;
  if (ViaMagnitude) {
    // However, a 32-bit floating point number has only 23 bits mantissa and
    // it's not enough to hold all the significant bits of `lof` if val is
    // negative. To avoid the loss of precision, We need to take the absolute
    // value after truncating and flip the result back based on the original
    // signedness.
    const SDValue TruncBits = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Trunc);
    const SDValue ThirtyOne = DAG.getConstant(31, DL, MVT::i32);
    Sign = DAG.getNode(ISD::SRA, DL, MVT::i32, TruncBits, ThirtyOne);
    Trunc = DAG.getNode(ISD::FABS, DL, SrcVT, Trunc);
  }

  // TwoPowM32 = 2^-32, NegTwoPow32 = -2^32, in the source precision.
  SDValue TwoPowM32, NegTwoPow32;
  if (IsF64) {
    TwoPowM32 = DAG.getConstantFP(
        llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), DL,
        SrcVT);
    NegTwoPow32 = DAG.getConstantFP(
        llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), DL,
        SrcVT);
  } else {
    TwoPowM32 = DAG.getConstantFP(
        llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), DL, SrcVT);
    NegTwoPow32 = DAG.getConstantFP(
        llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), DL, SrcVT);
  }

  // TODO: Should this propagate fast-math-flags?
  const SDValue Scaled = DAG.getNode(ISD::FMUL, DL, SrcVT, Trunc, TwoPowM32);
  const SDValue HiFP = DAG.getNode(ISD::FFLOOR, DL, SrcVT, Scaled);
  const SDValue LoFP =
      DAG.getNode(ISD::FMA, DL, SrcVT, HiFP, NegTwoPow32, Trunc);

  // Only the f64 signed case converts the high half as signed; the f32 signed
  // case works on the magnitude.
  const unsigned HiCvtOpc =
      (Signed && IsF64) ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
  const SDValue Hi = DAG.getNode(HiCvtOpc, DL, MVT::i32, HiFP);
  const SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, DL, MVT::i32, LoFP);

  const SDValue Halves = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
  SDValue Result = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Halves);

  if (ViaMagnitude) {
    assert(Sign);
    // Flip the result based on the signedness, which is either all 0s or 1s.
    const SDValue SignHalves = DAG.getBuildVector(MVT::v2i32, DL, {Sign, Sign});
    Sign = DAG.getNode(ISD::BITCAST, DL, MVT::i64, SignHalves);
    // r := xor(r, sign) - sign;
    const SDValue Flipped = DAG.getNode(ISD::XOR, DL, MVT::i64, Result, Sign);
    Result = DAG.getNode(ISD::SUB, DL, MVT::i64, Flipped, Sign);
  }

  return Result;
}


/// Custom lowering for FP_TO_FP16. f32 sources become the AMDGPU target node;
/// other sources use the safe f64 expansion unless the node carries the
/// approximate-functions flag, in which case lowering is declined.
SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
  const SDLoc DL(Op);
  const SDValue Input = Op.getOperand(0);

  // Convert to target node to get known bits
  const bool IsF32Source = Input.getValueType() == MVT::f32;
  if (IsF32Source)
    return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), Input);

  // There is a generic expand for FP_TO_FP16 with unsafe fast math.
  if (Op->getFlags().hasApproximateFuncs())
    return SDValue();

  return LowerF64ToF16Safe(Input, DL, DAG);
}


// Expand an f64 -> f16 conversion into integer operations on the f64 bit
// pattern, using round-to-nearest-even. The returned node has type i32: the
// result is assembled from a 0x8000 sign mask and 0x7c00 exponent mask, i.e.
// the f16 encoding sits in the low 16 bits.
//
// Src must be f64 (asserted). DL is the location used for every new node.


SDValue AMDGPUTargetLowering::LowerF64ToF16Safe(SDValue Src, const SDLoc &DL,

                                                SelectionDAG &DAG) const {

  assert(Src.getSimpleValueType() == MVT::f64);


  // f64 -> f16 conversion using round-to-nearest-even rounding mode.

  // TODO: We can generate better code for True16.

  const unsigned ExpMask = 0x7ff;

  const unsigned ExpBiasf64 = 1023;

  const unsigned ExpBiasf16 = 15;

  SDValue Zero = DAG.getConstant(0, DL, MVT::i32);

  SDValue One = DAG.getConstant(1, DL, MVT::i32);

  // Split the 64-bit pattern: UH is the high 32 bits, U becomes the low 32.

  SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Src);

  SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,

                           DAG.getConstant(32, DL, MVT::i64));

  UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);

  U = DAG.getZExtOrTrunc(U, DL, MVT::i32);

  // E = (UH >> 20) & 0x7ff, the 11-bit biased f64 exponent field.
  // NOTE(review): the shift amount here is an i64 constant while the other
  // i32 shifts in this function use i32 constants.

  SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,

                          DAG.getConstant(20, DL, MVT::i64));

  E = DAG.getNode(ISD::AND, DL, MVT::i32, E,

                  DAG.getConstant(ExpMask, DL, MVT::i32));

  // Subtract the fp64 exponent bias (1023) to get the real exponent and

  // add the f16 bias (15) to get the biased exponent for the f16 format.

  E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,

                  DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));


  // M = (UH >> 8) & 0xffe: bits 9..19 of UH moved to bits 1..11, leaving bit 0
  // free for the sticky bit computed below.

  SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,

                          DAG.getConstant(8, DL, MVT::i32));

  M = DAG.getNode(ISD::AND, DL, MVT::i32, M,

                  DAG.getConstant(0xffe, DL, MVT::i32));


  // Collect every significand bit not captured in M: the low 9 bits of UH
  // OR-ed with the whole low word.

  SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,

                                  DAG.getConstant(0x1ff, DL, MVT::i32));

  MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);


  // Lo40Set = (MaskedSig == 0) ? 0 : 1, OR-ed into bit 0 of M as a sticky bit.

  SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);

  M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);


  // I is the value selected at the end when E == 1039 (the rebiased all-ones
  // f64 exponent, 0x7ff - 1023 + 15).

  // (M != 0 ? 0x0200 : 0) | 0x7c00;

  SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,

      DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),

                      Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));


  // Candidate used when E >= 1: exponent placed above the 12-bit M.

  // N = M | (E << 12);

  SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,

      DAG.getNode(ISD::SHL, DL, MVT::i32, E,

                  DAG.getConstant(12, DL, MVT::i32)));


  // B is the right-shift amount applied to the significand when E < 1.

  // B = clamp(1-E, 0, 13);

  SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,

                                  One, E);

  SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);

  B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,

                  DAG.getConstant(13, DL, MVT::i32));


  // M with bit 12 set (the bit just above the stored significand bits).

  SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,

                                   DAG.getConstant(0x1000, DL, MVT::i32));


  // D = SigSetHigh >> B. Shifting back and comparing detects whether any set
  // bits were shifted out; if so, bit 0 of D is set as a sticky bit.

  SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);

  SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);

  SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);

  D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);


  // V = (E < 1) ? D : N.

  SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);

  // Round to nearest even using the low three bits of V: after dropping the
  // two lowest bits, add one when those three bits equal 3 or exceed 5.

  SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,

                              DAG.getConstant(0x7, DL, MVT::i32));

  V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,

                  DAG.getConstant(2, DL, MVT::i32));

  SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),

                               One, Zero, ISD::SETEQ);

  SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),

                               One, Zero, ISD::SETGT);

  V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);

  V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);


  // E > 30 selects 0x7c00 (exponent field all ones, significand zero);
  // E == 1039 selects I computed above.

  V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),

                      DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);

  V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),

                      I, V, ISD::SETEQ);


  // Extract the sign bit.
  // Bit 31 of UH is moved to bit 15 and isolated with the 0x8000 mask.

  SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,

                            DAG.getConstant(16, DL, MVT::i32));

  Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,

                     DAG.getConstant(0x8000, DL, MVT::i32));


  return DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);

}


/// Custom lowering for FP_TO_SINT / FP_TO_UINT. Promotes bf16 sources to f32
/// and i16 results to i32, and routes i64 results either through an i32
/// conversion plus extension (half-precision sources) or through
/// LowerFP_TO_INT64 (f32 / f64 sources). Returns an empty SDValue for an i64
/// result from any other source type.
SDValue AMDGPUTargetLowering::LowerFP_TO_INT(const SDValue Op,
                                             SelectionDAG &DAG) const {
  const SDLoc DL(Op);
  const SDValue Input = Op.getOperand(0);
  const unsigned Opcode = Op.getOpcode();
  const EVT FromVT = Input.getValueType();
  const EVT ToVT = Op.getValueType();

  const bool IsSigned = Opcode == ISD::FP_TO_SINT;
  const bool FromF32OrF64 = FromVT == MVT::f32 || FromVT == MVT::f64;

  // Will be selected natively
  if (FromVT == MVT::f16 && ToVT == MVT::i16)
    return Op;

  if (FromVT == MVT::bf16) {
    // Extend the source to f32 and redo the same conversion from there.
    const SDValue AsF32 = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Input);
    return DAG.getNode(Op.getOpcode(), DL, ToVT, AsF32);
  }

  // Promote i16 to i32
  if (ToVT == MVT::i16 && FromF32OrF64) {
    const SDValue AsI32 = DAG.getNode(Opcode, DL, MVT::i32, Input);
    return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, AsI32);
  }

  if (ToVT != MVT::i64)
    return Op;

  const bool FromHalf =
      FromVT == MVT::f16 ||
      (FromVT == MVT::f32 && Input.getOpcode() == ISD::FP16_TO_FP);
  if (FromHalf) {
    // Convert to i32 and widen the result with the matching extension.
    const SDValue AsI32 = DAG.getNode(Opcode, DL, MVT::i32, Input);
    const unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    return DAG.getNode(ExtOpc, DL, MVT::i64, AsI32);
  }

  if (!FromF32OrF64)
    return SDValue();

  return LowerFP_TO_INT64(Op, DAG, IsSigned);
}


/// Custom lowering for FP_TO_SINT_SAT / FP_TO_UINT_SAT. Operand 1 carries the
/// saturation type. Cases that need no work are returned unchanged, narrower
/// saturation widths are handled with an explicit integer clamp, and the
/// remaining cases are rewritten into forms handled by the earlier cases.
/// Returns an empty SDValue when no custom lowering applies.
SDValue AMDGPUTargetLowering::LowerFP_TO_INT_SAT(const SDValue Op,
                                                 SelectionDAG &DAG) const {
  const SDLoc DL(Op);
  const SDValue Input = Op.getOperand(0);
  const unsigned Opcode = Op.getOpcode();
  const bool IsSigned = Opcode == ISD::FP_TO_SINT_SAT;
  const EVT SrcVT = Input.getValueType();
  const EVT DstVT = Op.getValueType();
  SDValue SatVTOp = Op.getNode()->getOperand(1);
  const EVT SatVT = cast<VTSDNode>(SatVTOp)->getVT();

  const uint64_t DstWidth = DstVT.getScalarSizeInBits();
  const uint64_t SatWidth = SatVT.getScalarSizeInBits();
  assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");

  const bool SatIsFullWidth = SatWidth == DstWidth;
  const bool SrcIsHalfLike = SrcVT == MVT::f16 || SrcVT == MVT::bf16;

  // Will be selected natively
  if (DstVT == MVT::i32 && SatIsFullWidth &&
      (SrcVT == MVT::f32 || SrcVT == MVT::f64))
    return Op;

  if (DstVT == MVT::i16 && SatIsFullWidth && SrcVT == MVT::f16)
    return Op;

  // Perform all saturation at selected width (i16 or i32) and truncate
  if (SatWidth < DstWidth && SatWidth <= 32) {
    // For f16 conversion with sub-i16 saturation perform saturation
    // at i16, if available in the target. This removes the need for extra f16
    // to f32 conversion. For all the others use i32.
    MVT ClampVT = MVT::i32;
    if (Subtarget->has16BitInsts() && SrcVT == MVT::f16 && SatWidth < 16)
      ClampVT = MVT::i16;

    const SDValue ClampVTOp = DAG.getValueType(ClampVT);
    const uint64_t ClampWidth = ClampVT.getScalarSizeInBits();

    // First, convert input float into selected integer (i16 or i32)
    const SDValue Converted =
        DAG.getNode(Opcode, DL, ClampVT, Input, ClampVTOp);

    // Then, clamp at the saturation width using either i16 or i32 instructions
    SDValue Clamped;
    if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
      const SDValue UpperBound = DAG.getConstant(
          APInt::getSignedMaxValue(SatWidth).sext(ClampWidth), DL, ClampVT);
      const SDValue LowerBound = DAG.getConstant(
          APInt::getSignedMinValue(SatWidth).sext(ClampWidth), DL, ClampVT);
      const SDValue Capped =
          DAG.getNode(ISD::SMIN, DL, ClampVT, Converted, UpperBound);
      Clamped = DAG.getNode(ISD::SMAX, DL, ClampVT, Capped, LowerBound);
    } else {
      const SDValue UpperBound = DAG.getConstant(
          APInt::getMaxValue(SatWidth).zext(ClampWidth), DL, ClampVT);
      Clamped = DAG.getNode(ISD::UMIN, DL, ClampVT, Converted, UpperBound);
    }

    // Finally, after saturating at i16 or i32 fit into the destination type
    return DAG.getExtOrTrunc(IsSigned, Clamped, DL, DstVT);
  }

  // SatWidth == DstWidth

  // Saturate at i32 for i64 dst and f16/bf16 src (will invoke f16 promotion
  // below)
  if (DstVT == MVT::i64 &&
      (SrcIsHalfLike ||
       (SrcVT == MVT::f32 && Input.getOpcode() == ISD::FP16_TO_FP))) {
    const SDValue Int32VTOp = DAG.getValueType(MVT::i32);
    return DAG.getNode(Opcode, DL, DstVT, Input, Int32VTOp);
  }

  // Promote f16/bf16 src to f32 for i32 conversion
  if (DstVT == MVT::i32 && SrcIsHalfLike) {
    const SDValue AsF32 = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Input);
    return DAG.getNode(Op.getOpcode(), DL, DstVT, AsF32, SatVTOp);
  }

  // For DstWidth < 16, promote i1 and i8 dst to i16 (if legal) with sub-i16
  // saturation. For DstWidth == 16, promote i16 dst to i32 with sub-i32
  // saturation; this covers i16.f32 and i16.f64
  if (DstWidth < 32) {
    // Note: this triggers SatWidth < DstWidth above to generate saturated
    // truncate by requesting MVT::i16/i32 destination with SatWidth < 16/32.
    const MVT PromoteVT =
        (DstWidth < 16 && Subtarget->has16BitInsts()) ? MVT::i16 : MVT::i32;
    const SDValue Promoted =
        DAG.getNode(Opcode, DL, PromoteVT, Input, SatVTOp);
    return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Promoted);
  }

  // TODO: can we implement i64 dst for f32/f64?

  return SDValue();
}


/// Scalarize a vector SIGN_EXTEND_INREG: every element is extracted,
/// sign-extended in-register from the scalar form of the extra type, and the
/// results are reassembled with a build_vector.
SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
                                                     SelectionDAG &DAG) const {
  const SDLoc DL(Op);
  const SDValue Vec = Op.getOperand(0);
  const EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
  const MVT VT = Op.getSimpleValueType();
  const MVT EltVT = VT.getScalarType();

  assert(VT.isVector());

  // TODO: Don't scalarize on Evergreen?
  SmallVector<SDValue, 8> Elts;
  DAG.ExtractVectorElements(Vec, Elts, 0, VT.getVectorNumElements());

  const SDValue EltExtraVTOp = DAG.getValueType(ExtraVT.getScalarType());
  for (SDValue &Elt : Elts)
    Elt = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, EltVT, Elt, EltExtraVTOp);

  return DAG.getBuildVector(VT, DL, Elts);
}


//===----------------------------------------------------------------------===//

// Custom DAG optimizations

//===----------------------------------------------------------------------===//


/// Returns true when numBitsUnsigned reports at most 24 bits for \p Op.
static bool isU24(SDValue Op, SelectionDAG &DAG) {
  const unsigned ActiveBits = AMDGPUTargetLowering::numBitsUnsigned(Op, DAG);
  return ActiveBits <= 24;
}


/// Returns true when \p Op has a type of at least 24 bits and numBitsSigned
/// reports at most 24 bits for it.
static bool isI24(SDValue Op, SelectionDAG &DAG) {
  // Types less than 24-bit should be treated as unsigned 24-bit values.
  if (Op.getValueType().getSizeInBits() < 24)
    return false;

  return AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
}


/// Try to simplify the operands of a 24-bit multiply, given that only the low
/// 24 bits of each operand are demanded. \p Node24 is either one of the
/// AMDGPUISD 24-bit multiply nodes or the matching INTRINSIC_WO_CHAIN node.
/// Returns the replacement (or \p Node24 itself when operands were simplified
/// in place), or an empty SDValue when nothing changed.
static SDValue simplifyMul24(SDNode *Node24,
                             TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // For the intrinsic form operand 0 is the intrinsic ID, so the multiplicands
  // start one slot later.
  const bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
  const unsigned FirstArgIdx = IsIntrin ? 1 : 0;
  SDValue LHS = Node24->getOperand(FirstArgIdx);
  SDValue RHS = Node24->getOperand(FirstArgIdx + 1);

  unsigned NewOpcode = Node24->getOpcode();
  if (IsIntrin) {
    const unsigned IID = Node24->getConstantOperandVal(0);
    switch (IID) {
    case Intrinsic::amdgcn_mul_i24:
      NewOpcode = AMDGPUISD::MUL_I24;
      break;
    case Intrinsic::amdgcn_mul_u24:
      NewOpcode = AMDGPUISD::MUL_U24;
      break;
    case Intrinsic::amdgcn_mulhi_i24:
      NewOpcode = AMDGPUISD::MULHI_I24;
      break;
    case Intrinsic::amdgcn_mulhi_u24:
      NewOpcode = AMDGPUISD::MULHI_U24;
      break;
    default:
      llvm_unreachable("Expected 24-bit mul intrinsic");
    }
  }

  const APInt Low24 = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);

  // First try to simplify using SimplifyMultipleUseDemandedBits which allows
  // the operands to have other uses, but will only perform simplifications that
  // involve bypassing some nodes for this user.
  const SDValue NewLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Low24, DAG);
  const SDValue NewRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Low24, DAG);
  if (NewLHS || NewRHS) {
    const SDValue UseLHS = NewLHS ? NewLHS : LHS;
    const SDValue UseRHS = NewRHS ? NewRHS : RHS;
    return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(), UseLHS,
                       UseRHS);
  }

  // Now try SimplifyDemandedBits which can simplify the nodes used by our
  // operands if this node is the only user.
  if (TLI.SimplifyDemandedBits(LHS, Low24, DCI) ||
      TLI.SimplifyDemandedBits(RHS, Low24, DCI))
    return SDValue(Node24, 0);

  return SDValue();
}


/// Constant-fold a 32-bit bitfield extract.
///
/// \tparam IntTy  int32_t for a sign-extending extract, uint32_t for a
///                zero-extending one; selects arithmetic vs. logical shifts.
/// \param Src0    The constant being extracted from.
/// \param Offset  Bit position of the field's least significant bit.
/// \param Width   Number of bits in the field. Width == 0 with
///                Width + Offset < 32 would shift by 32 bits below, so callers
///                are presumably expected to handle a zero width themselves -
///                TODO confirm.
/// \returns an i32 constant holding the extracted field.
template <typename IntTy>
static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
                               uint32_t Width, const SDLoc &DL) {
  IntTy Result;
  if (Width + Offset < 32) {
    // Move the field to the top of the register, then shift it back down so
    // the shift kind implied by IntTy fills the upper bits.
    uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
    Result = static_cast<IntTy>(Shl) >> (32 - Width);
  } else {
    // The field reaches the top bit; a plain right shift is sufficient.
    Result = Src0 >> Offset;
  }

  // Emit through the API matching the signedness of the value on both paths,
  // so a negative signed result is never handed to the unsigned getConstant.
  if constexpr (std::is_signed_v<IntTy>) {
    return DAG.getSignedConstant(Result, DL, MVT::i32);
  } else {
    return DAG.getConstant(Result, DL, MVT::i32);
  }
}


/// Returns true if any user of \p Val is a memory node marked volatile.
static bool hasVolatileUser(SDNode *Val) {
  for (SDNode *User : Val->users()) {
    MemSDNode *MemUser = dyn_cast<MemSDNode>(User);
    if (MemUser && MemUser->isVolatile())
      return true;
  }

  return false;
}


/// Decide whether a load / store of \p VT should be rewritten to use an
/// equivalent memory type. Rejects i32-element and legal types, types that
/// are not byte sized, scalar 1 / 2 / 4 byte types, and store sizes that are
/// 3 bytes or larger than 4 bytes without being a multiple of 4.
bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
  // i32 vectors are the canonical memory type.
  const bool AlreadyFine = VT.getScalarType() == MVT::i32 || isTypeLegal(VT);
  if (AlreadyFine || !VT.isByteSized())
    return false;

  const unsigned StoreBytes = VT.getStoreSize();

  const bool IsSmallPow2 =
      StoreBytes == 1 || StoreBytes == 2 || StoreBytes == 4;
  if (IsSmallPow2 && !VT.isVector())
    return false;

  const bool IsAwkwardSize =
      StoreBytes == 3 || (StoreBytes > 4 && (StoreBytes % 4 != 0));
  return !IsAwkwardSize;
}


// Replace load of an illegal type with a bitcast from a load of a friendlier
// type. Only runs before legalization, and only on simple, normal loads that
// have no volatile memory user. Under-aligned loads of legal types are
// expanded here when the target does not allow the misaligned access.
SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  if (!DCI.isBeforeLegalize())
    return SDValue();

  LoadSDNode *Load = cast<LoadSDNode>(N);
  const bool IsCandidate =
      Load->isSimple() && ISD::isNormalLoad(Load) && !hasVolatileUser(Load);
  if (!IsCandidate)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  const SDLoc DL(N);
  const EVT MemVT = Load->getMemoryVT();
  const unsigned StoreBytes = MemVT.getStoreSize();
  const Align LoadAlign = Load->getAlign();

  if (LoadAlign < StoreBytes && isTypeLegal(MemVT)) {
    unsigned IsFast;
    const unsigned AddrSpace = Load->getAddressSpace();

    // Expand unaligned loads earlier than legalization. Due to visitation order
    // problems during legalization, the emitted instructions to pack and unpack
    // the bytes again are not eliminated in the case of an unaligned copy.
    const bool MisalignedOK = allowsMisalignedMemoryAccesses(
        MemVT, AddrSpace, LoadAlign, Load->getMemOperand()->getFlags(),
        &IsFast);
    if (!MisalignedOK) {
      if (MemVT.isVector())
        return SplitVectorLoad(SDValue(Load, 0), DAG);

      // Scalar case: merge the expanded value and its chain.
      const std::pair<SDValue, SDValue> Expanded =
          expandUnalignedLoad(Load, DAG);
      const SDValue Merged[2] = {Expanded.first, Expanded.second};
      return DAG.getMergeValues(Merged, DL);
    }

    // The access is permitted but not fast: leave the load alone.
    if (!IsFast)
      return SDValue();
  }

  if (!shouldCombineMemoryType(MemVT))
    return SDValue();

  const EVT FriendlyVT = getEquivalentMemType(*DAG.getContext(), MemVT);

  SDValue NewLoad = DAG.getLoad(FriendlyVT, DL, Load->getChain(),
                                Load->getBasePtr(), Load->getMemOperand());

  // Present the new load as the original type and forward its chain result.
  SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MemVT, NewLoad);
  DCI.CombineTo(N, Cast, NewLoad.getValue(1));
  return SDValue(N, 0);
}


// Replace store of an illegal type with a store of a bitcast to a friendlier
// type.
//
// Only runs before legalization, and only for simple, normal stores. A
// legal-typed store whose alignment is smaller than its store size is expanded
// here when the target rejects the misaligned access. Returns the replacement
// store (or expansion), or an empty SDValue when nothing was changed.
SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  if (!DCI.isBeforeLegalize())
    return SDValue();

  StoreSDNode *SN = cast<StoreSDNode>(N);
  if (!SN->isSimple() || !ISD::isNormalStore(SN))
    return SDValue();

  EVT VT = SN->getMemoryVT();
  unsigned Size = VT.getStoreSize();

  SDLoc SL(N);
  SelectionDAG &DAG = DCI.DAG;
  Align Alignment = SN->getAlign();
  if (Alignment < Size && isTypeLegal(VT)) {
    // Default to "not fast" so the check below never reads an indeterminate
    // value should the hook accept the access without writing its
    // out-parameter.
    unsigned IsFast = 0;
    unsigned AS = SN->getAddressSpace();

    // Expand unaligned stores earlier than legalization. Due to visitation
    // order problems during legalization, the emitted instructions to pack and
    // unpack the bytes again are not eliminated in the case of an unaligned
    // copy.
    if (!allowsMisalignedMemoryAccesses(
            VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
      if (VT.isVector())
        return SplitVectorStore(SDValue(SN, 0), DAG);

      return expandUnalignedStore(SN, DAG);
    }

    // The misaligned access is allowed but slow: leave the store untouched.
    if (!IsFast)
      return SDValue();
  }

  if (!shouldCombineMemoryType(VT))
    return SDValue();

  EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
  SDValue Val = SN->getValue();

  // Sample the use count before creating the bitcast, which adds a use.
  bool OtherUses = !Val.hasOneUse();
  SDValue CastVal = DAG.getBitcast(NewVT, Val);
  if (OtherUses) {
    // Route the remaining users of the stored value through the new cast.
    SDValue CastBack = DAG.getBitcast(VT, CastVal);
    DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
  }

  return DAG.getStore(SN->getChain(), SL, CastVal,
                      SN->getBasePtr(), SN->getMemOperand());
}


// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
// issues.
//
// Hoists an assert[sz]ext above a truncate:
//   (vt2 (assertzext (truncate vt0:x), vt1)) ->
//       (vt2 (truncate (assertzext vt0:x, vt1)))
// Applied only when vt0 is at least as wide as vt1. The opcode of N is reused
// for the new assertion node.
SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
                                                        DAGCombinerInfo &DCI) const {
  SDValue Trunc = N->getOperand(0);
  if (Trunc.getOpcode() != ISD::TRUNCATE)
    return SDValue();

  SDValue AssertedVTOp = N->getOperand(1);
  SDValue Wide = Trunc.getOperand(0);
  const EVT WideVT = Wide.getValueType();

  // The pre-truncate value must be able to hold the asserted type.
  if (!WideVT.bitsGE(cast<VTSDNode>(AssertedVTOp)->getVT()))
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);
  SDValue WideAssert =
      DAG.getNode(N->getOpcode(), SL, WideVT, Wide, AssertedVTOp);
  return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), WideAssert);
}


/// Combine for chainless intrinsic nodes, dispatched on the intrinsic ID held
/// in operand 0 of \p N.
///
/// \returns The replacement value, or an empty SDValue when the intrinsic is
///          not handled or no simplification applies.
SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
  SDNode *N, DAGCombinerInfo &DCI) const {
  const unsigned IntrinsicID = N->getConstantOperandVal(0);

  switch (IntrinsicID) {
  case Intrinsic::amdgcn_mul_i24:
  case Intrinsic::amdgcn_mul_u24:
  case Intrinsic::amdgcn_mulhi_i24:
  case Intrinsic::amdgcn_mulhi_u24:
    return simplifyMul24(N, DCI);

  case Intrinsic::amdgcn_fract:
  case Intrinsic::amdgcn_rsq:
  case Intrinsic::amdgcn_rcp_legacy:
  case Intrinsic::amdgcn_rsq_legacy:
  case Intrinsic::amdgcn_rsq_clamp:
  case Intrinsic::amdgcn_tanh:
  case Intrinsic::amdgcn_prng_b32: {
    // An undef source folds to that same undef value.
    // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
    SDValue Input = N->getOperand(1);
    if (Input.isUndef())
      return Input;
    break;
  }

  case Intrinsic::amdgcn_frexp_exp: {
    // frexp_exp (fneg x) -> frexp_exp x
    // frexp_exp (fabs x) -> frexp_exp x
    // frexp_exp (fneg (fabs x)) -> frexp_exp x
    SDValue Input = N->getOperand(1);
    SDValue Stripped = peekFPSignOps(Input);
    if (Stripped != Input) {
      // Rewrite N in place so it reads the source without the sign ops.
      SDNode *Updated =
          DCI.DAG.UpdateNodeOperands(N, N->getOperand(0), Stripped);
      return SDValue(Updated, 0);
    }
    break;
  }

  default:
    break;
  }

  return SDValue();
}


/// Split the 64-bit value \p LHS into two 32-bit halves and apply the binary
/// operation \p Opc to each half with the matching constant (\p ValLo for the
/// low half, \p ValHi for the high half).
///
/// \returns The two i32 results packed into a v2i32 build_vector and bitcast
///          to i64.
SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
  DAGCombinerInfo &DCI, const SDLoc &SL,
  unsigned Opc, SDValue LHS,
  uint32_t ValLo, uint32_t ValHi) const {
  SelectionDAG &DAG = DCI.DAG;
  auto [Lo, Hi] = split64BitValue(LHS, DAG);

  SDValue LoConst = DAG.getConstant(ValLo, SL, MVT::i32);
  SDValue HiConst = DAG.getConstant(ValHi, SL, MVT::i32);

  SDValue LoResult = DAG.getNode(Opc, SL, MVT::i32, Lo, LoConst);
  SDValue HiResult = DAG.getNode(Opc, SL, MVT::i32, Hi, HiConst);

  // Queue the split halves for another visit. It's possible one of the
  // per-half operations was eliminated, which could simplify the vector.
  for (SDValue Half : {Lo, Hi})
    DCI.AddToWorklist(Half.getNode());

  SDValue Halves[] = {LoResult, HiResult};
  SDValue Packed = DAG.getBuildVector(MVT::v2i32, SL, Halves);
  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Packed);
}


/// Target combine for a left-shift node \p N.
///
/// With a constant shift amount, a zero shift folds to the unshifted operand
/// and shifts of extended values are rewritten (see the switch below). For
/// results whose scalar type is i64 and whose shift amount is known to be at
/// least half the element width, the shift is replaced by a half-width shift
/// of the truncated source whose result becomes the high half of each element
/// and whose low half is zero.
///
/// \returns The replacement value, or an empty SDValue if nothing applies.
SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
  SDLoc SL(N);
  SelectionDAG &DAG = DCI.DAG;

  // Assigned, and only ever read, when the shift amount is a constant (CRHS).
  unsigned RHSVal;
  if (CRHS) {
    RHSVal = CRHS->getZExtValue();
    // Shift by zero: the result is the source itself.
    if (!RHSVal)
      return LHS;

    switch (LHS->getOpcode()) {
    default:
      break;
    case ISD::ZERO_EXTEND:
    case ISD::SIGN_EXTEND:
    case ISD::ANY_EXTEND: {
      SDValue X = LHS->getOperand(0);

      if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
          isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
        // Prefer build_vector as the canonical form if packed types are legal.
        // (shl ([asz]ext i16:x), 16) -> build_vector 0, x
        SDValue Vec = DAG.getBuildVector(
            MVT::v2i16, SL,
            {DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0)});
        return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
      }

      // shl (ext x) => zext (shl x), if shift does not overflow int
      if (VT != MVT::i64)
        break;
      // The narrow shift is only used when X has at least RHSVal known
      // leading zero bits, so no set bit is shifted out of the narrow type.
      KnownBits Known = DAG.computeKnownBits(X);
      unsigned LZ = Known.countMinLeadingZeros();
      if (LZ < RHSVal)
        break;
      EVT XVT = X.getValueType();
      SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(CRHS, 0));
      return DAG.getZExtOrTrunc(Shl, SL, VT);
    }
    }
  }

  // Everything below only handles i64 scalars and vectors of i64.
  if (VT.getScalarType() != MVT::i64)
    return SDValue();

  // On some subtargets, 64-bit shift is a quarter rate instruction. In the
  // common case, splitting this into a move and a 32-bit shift is faster and
  // the same code size.
  KnownBits Known = DAG.computeKnownBits(RHS);

  // TargetScalarType is the half-width integer type; TargetType is VT with its
  // element type replaced by that half-width type.
  EVT ElementType = VT.getScalarType();
  EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
  EVT TargetType = VT.changeElementType(*DAG.getContext(), TargetScalarType);

  // Bail out unless the shift amount is known to be at least the half width.
  if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
    return SDValue();
  SDValue ShiftAmt;

  if (CRHS) {
    // Constant amount: subtract the half width directly.
    ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
                               TargetType);
  } else {
    SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
    const SDValue ShiftMask =
        DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
    // This AND instruction will clamp out of bounds shift values.
    // It will also be removed during later instruction selection.
    ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
  }

  // Shift the truncated source in the half-width type, keeping N's flags.
  SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, TargetType, LHS);
  SDValue NewShift =
      DAG.getNode(ISD::SHL, SL, TargetType, Lo, ShiftAmt, N->getFlags());

  const SDValue Zero = DAG.getConstant(0, SL, TargetScalarType);
  SDValue Vec;

  if (VT.isVector()) {
    EVT ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
    unsigned NElts = TargetType.getVectorNumElements();
    SmallVector<SDValue, 8> HiOps;
    // Starts as all zeros; odd slots are overwritten with the shifted values,
    // so even slots stay zero.
    SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);

    DAG.ExtractVectorElements(NewShift, HiOps, 0, NElts);
    for (unsigned I = 0; I != NElts; ++I)
      HiAndLoOps[2 * I + 1] = HiOps[I];
    Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
  } else {
    // Scalar: element 0 is zero, element 1 is the shifted value.
    EVT ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
    Vec = DAG.getBuildVector(ConcatType, SL, {Zero, NewShift});
  }
  return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
}


/// Target combine for an arithmetic-right-shift node \p N.
///
/// Only handles results whose scalar type is i64 and whose shift amount is
/// known to be at least half the element width. The shift is then rebuilt
/// from half-width shifts of the high half of the source: one by the reduced
/// amount and one that fills the other half of the result.
///
/// \returns The replacement value, or an empty SDValue if nothing applies.
SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  SDValue RHS = N->getOperand(1);
  ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
  EVT VT = N->getValueType(0);
  SDValue LHS = N->getOperand(0);
  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);

  if (VT.getScalarType() != MVT::i64)
    return SDValue();

  // For C >= 32
  // i64 (sra x, C) -> (build_pair (sra hi_32(x), C - 32), (sra hi_32(x), 31))

  // On some subtargets, 64-bit shift is a quarter rate instruction. In the
  // common case, splitting this into a move and a 32-bit shift is faster and
  // the same code size.
  KnownBits Known = DAG.computeKnownBits(RHS);

  // TargetScalarType is the half-width integer type; TargetType is VT with its
  // element type replaced by that half-width type.
  EVT ElementType = VT.getScalarType();
  EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
  EVT TargetType = VT.changeElementType(*DAG.getContext(), TargetScalarType);

  // Bail out unless the shift amount is known to be at least the half width.
  if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
    return SDValue();

  // Shift amount of (half width - 1), used to fill a half with the sign.
  SDValue ShiftFullAmt =
      DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
  SDValue ShiftAmt;
  if (CRHS) {
    // Constant amount: subtract the half width directly.
    unsigned RHSVal = CRHS->getZExtValue();
    ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
                               TargetType);
  } else if (Known.getMinValue().getZExtValue() ==
             (ElementType.getSizeInBits() - 1)) {
    // The amount is known to be at least (element width - 1); use the
    // full half-width shift.
    ShiftAmt = ShiftFullAmt;
  } else {
    SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
    const SDValue ShiftMask =
        DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
    // This AND instruction will clamp out of bounds shift values.
    // It will also be removed during later instruction selection.
    ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
  }

  EVT ConcatType;
  SDValue Hi;
  SDLoc LHSSL(LHS);
  // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
  if (VT.isVector()) {
    unsigned NElts = TargetType.getVectorNumElements();
    ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
    SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
    SmallVector<SDValue, 8> HiOps(NElts);
    SmallVector<SDValue, 16> HiAndLoOps;

    // The odd-indexed half-width elements are gathered as the high halves.
    DAG.ExtractVectorElements(SplitLHS, HiAndLoOps, 0, NElts * 2);
    for (unsigned I = 0; I != NElts; ++I) {
      HiOps[I] = HiAndLoOps[2 * I + 1];
    }
    Hi = DAG.getNode(ISD::BUILD_VECTOR, LHSSL, TargetType, HiOps);
  } else {
    // Scalar: element 1 of the two-element view is taken as the high half.
    const SDValue One = DAG.getConstant(1, LHSSL, TargetScalarType);
    ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
    SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
    Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, LHSSL, TargetType, SplitLHS, One);
  }

  // NewShift becomes the low half of the result, HiShift the high half.
  KnownBits KnownLHS = DAG.computeKnownBits(LHS);
  SDValue NewShift, HiShift;
  if (KnownLHS.isNegative()) {
    // Source known negative: the high half is all ones, no shift needed.
    HiShift = DAG.getAllOnesConstant(SL, TargetType);
    NewShift =
        DAG.getNode(ISD::SRA, SL, TargetType, Hi, ShiftAmt, N->getFlags());
  } else if (CRHS &&
             CRHS->getZExtValue() == (ElementType.getSizeInBits() - 1)) {
    // Constant shift by (element width - 1): both halves are the same node.
    NewShift = HiShift =
        DAG.getNode(ISD::SRA, SL, TargetType, Hi, ShiftAmt, N->getFlags());
  } else {
    // Hi feeds two separate shift nodes here, so it is frozen first.
    Hi = DAG.getFreeze(Hi);
    HiShift = DAG.getNode(ISD::SRA, SL, TargetType, Hi, ShiftFullAmt);
    NewShift =
        DAG.getNode(ISD::SRA, SL, TargetType, Hi, ShiftAmt, N->getFlags());
  }

  SDValue Vec;
  if (VT.isVector()) {
    unsigned NElts = TargetType.getVectorNumElements();
    SmallVector<SDValue, 8> HiOps;
    SmallVector<SDValue, 8> LoOps;
    SmallVector<SDValue, 16> HiAndLoOps(NElts * 2);

    // Interleave: even slots take NewShift elements, odd slots take HiShift.
    DAG.ExtractVectorElements(HiShift, HiOps, 0, NElts);
    DAG.ExtractVectorElements(NewShift, LoOps, 0, NElts);
    for (unsigned I = 0; I != NElts; ++I) {
      HiAndLoOps[2 * I + 1] = HiOps[I];
      HiAndLoOps[2 * I] = LoOps[I];
    }
    Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
  } else {
    Vec = DAG.getBuildVector(ConcatType, SL, {NewShift, HiShift});
  }
  return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
}


/// Target combine for a logical-right-shift node \p N.
///
/// With a constant shift amount, a shift of an AND with a shifted-mask
/// constant is rewritten so the shift is applied before the mask. For results
/// whose scalar type is i64 and whose shift amount is known to be at least
/// half the element width, the shift is replaced by a half-width shift of the
/// high half of the source, with a zero high half in the result.
///
/// \returns The replacement value, or an empty SDValue if nothing applies.
SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  SDValue RHS = N->getOperand(1);
  ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
  EVT VT = N->getValueType(0);
  SDValue LHS = N->getOperand(0);
  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);
  // Assigned, and only ever read, when the shift amount is a constant (CRHS).
  unsigned RHSVal;

  if (CRHS) {
    RHSVal = CRHS->getZExtValue();

    // fold (srl (and x, c1 << c2), c2) -> (and (srl x, c2), c1)
    // this improves the ability to match BFE patterns in isel.
    if (LHS.getOpcode() == ISD::AND) {
      if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
        unsigned MaskIdx, MaskLen;
        // Only when the mask is a contiguous run of ones whose lowest set bit
        // sits exactly at the shift amount.
        if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
            MaskIdx == RHSVal) {
          return DAG.getNode(ISD::AND, SL, VT,
                             DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0),
                                         N->getOperand(1)),
                             DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1),
                                         N->getOperand(1)));
        }
      }
    }
  }

  // Everything below only handles i64 scalars and vectors of i64.
  if (VT.getScalarType() != MVT::i64)
    return SDValue();

  // for C >= 32
  // i64 (srl x, C) -> (build_pair (srl hi_32(x), C - 32), 0)

  // On some subtargets, 64-bit shift is a quarter rate instruction. In the
  // common case, splitting this into a move and a 32-bit shift is faster and
  // the same code size.
  KnownBits Known = DAG.computeKnownBits(RHS);

  // TargetScalarType is the half-width integer type; TargetType is VT with its
  // element type replaced by that half-width type.
  EVT ElementType = VT.getScalarType();
  EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
  EVT TargetType = VT.changeElementType(*DAG.getContext(), TargetScalarType);

  // Bail out unless the shift amount is known to be at least the half width.
  if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
    return SDValue();

  SDValue ShiftAmt;
  if (CRHS) {
    // Constant amount: subtract the half width directly.
    ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
                               TargetType);
  } else {
    SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
    const SDValue ShiftMask =
        DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
    // This AND instruction will clamp out of bounds shift values.
    // It will also be removed during later instruction selection.
    ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
  }

  const SDValue Zero = DAG.getConstant(0, SL, TargetScalarType);
  EVT ConcatType;
  SDValue Hi;
  SDLoc LHSSL(LHS);
  // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
  if (VT.isVector()) {
    unsigned NElts = TargetType.getVectorNumElements();
    ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
    SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
    SmallVector<SDValue, 8> HiOps(NElts);
    SmallVector<SDValue, 16> HiAndLoOps;

    // The odd-indexed half-width elements are gathered as the high halves.
    DAG.ExtractVectorElements(SplitLHS, HiAndLoOps, /*Start=*/0, NElts * 2);
    for (unsigned I = 0; I != NElts; ++I)
      HiOps[I] = HiAndLoOps[2 * I + 1];
    Hi = DAG.getNode(ISD::BUILD_VECTOR, LHSSL, TargetType, HiOps);
  } else {
    // Scalar: element 1 of the two-element view is taken as the high half.
    const SDValue One = DAG.getConstant(1, LHSSL, TargetScalarType);
    ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
    SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
    Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, LHSSL, TargetType, SplitLHS, One);
  }

  // Shift the high half in the half-width type, keeping N's flags.
  SDValue NewShift =
      DAG.getNode(ISD::SRL, SL, TargetType, Hi, ShiftAmt, N->getFlags());

  SDValue Vec;
  if (VT.isVector()) {
    unsigned NElts = TargetType.getVectorNumElements();
    SmallVector<SDValue, 8> LoOps;
    // Starts as all zeros; even slots are overwritten with the shifted values,
    // so odd slots stay zero.
    SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);

    DAG.ExtractVectorElements(NewShift, LoOps, 0, NElts);
    for (unsigned I = 0; I != NElts; ++I)
      HiAndLoOps[2 * I] = LoOps[I];
    Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
  } else {
    // Scalar: element 0 is the shifted value, element 1 is zero.
    Vec = DAG.getBuildVector(ConcatType, SL, {NewShift, Zero});
  }
  return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
}


/// Target combine for a truncate node \p N. Tries three rewrites in order:
///   1. truncate of a bitcast build_vector -> truncate of its first element;
///   2. truncate of a right shift of a (bitcast) build_vector by a whole
///      number of elements -> the selected element;
///   3. truncate to fewer than 32 bits of a wider-than-32-bit shift -> the
///      same shift performed in 32 bits.
///
/// \returns The replacement value, or an empty SDValue if nothing applies.
SDValue AMDGPUTargetLowering::performTruncateCombine(
  SDNode *N, DAGCombinerInfo &DCI) const {
  SDLoc SL(N);
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  SDValue Src = N->getOperand(0);

  // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
  if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
    SDValue Vec = Src.getOperand(0);
    if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
      SDValue Elt0 = Vec.getOperand(0);
      EVT EltVT = Elt0.getValueType();
      // The result must fit entirely inside the first element.
      if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
        // Floating-point elements are bitcast to the same-width integer type
        // before being truncated.
        if (EltVT.isFloatingPoint()) {
          Elt0 = DAG.getNode(ISD::BITCAST, SL,
                             EltVT.changeTypeToInteger(), Elt0);
        }

        return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
      }
    }
  }

  // Equivalent of above for accessing the high element of a vector as an
  // integer operation.
  // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
  if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
    if (auto *K = isConstOrConstSplat(Src.getOperand(1))) {
      SDValue BV = stripBitcast(Src.getOperand(0));
      if (BV.getOpcode() == ISD::BUILD_VECTOR) {
        EVT SrcEltVT = BV.getOperand(0).getValueType();
        unsigned SrcEltSize = SrcEltVT.getSizeInBits();
        unsigned BitIndex = K->getZExtValue();
        unsigned PartIndex = BitIndex / SrcEltSize;

        // The shift must be an exact multiple of the element size and must
        // select an existing operand.
        if (PartIndex * SrcEltSize == BitIndex &&
            PartIndex < BV.getNumOperands()) {
          // Only when the result is exactly one element wide.
          if (SrcEltVT.getSizeInBits() == VT.getSizeInBits()) {
            SDValue SrcElt =
                DAG.getNode(ISD::BITCAST, SL, SrcEltVT.changeTypeToInteger(),
                            BV.getOperand(PartIndex));
            return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
          }
        }
      }
    }
  }

  // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
  //
  // i16 (trunc (srl i64:x, K)), K <= 16 ->
  //     i16 (trunc (srl (i32 (trunc x), K)))
  if (VT.getScalarSizeInBits() < 32) {
    EVT SrcVT = Src.getValueType();
    if (SrcVT.getScalarSizeInBits() > 32 &&
        (Src.getOpcode() == ISD::SRL ||
         Src.getOpcode() == ISD::SRA ||
         Src.getOpcode() == ISD::SHL)) {
      SDValue Amt = Src.getOperand(1);
      KnownBits Known = DAG.computeKnownBits(Amt);

      // - For left shifts, do the transform as long as the shift
      //   amount is still legal for i32, so when ShiftAmt < 32 (<= 31)
      // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid
      //   losing information stored in the high bits when truncating.
      const unsigned MaxCstSize =
          (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits());
      if (Known.getMaxValue().ule(MaxCstSize)) {
        // Intermediate type: i32, or a vector of i32 with VT's element count.
        EVT MidVT = VT.isVector() ?
          EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                           VT.getVectorNumElements()) : MVT::i32;

        EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
        SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
                                    Src.getOperand(0));
        DCI.AddToWorklist(Trunc.getNode());

        // Convert the shift amount to the type expected for MidVT shifts.
        if (Amt.getValueType() != NewShiftVT) {
          Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
          DCI.AddToWorklist(Amt.getNode());
        }

        SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
                                          Trunc, Amt);
        return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
      }
    }
  }

  return SDValue();
}


// We need to specifically handle i64 mul here to avoid unnecessary conversion
// instructions. If we only match on the legalized i64 mul expansion,
// SimplifyDemandedBits will be unable to remove them because there will be
// multiple uses due to the separate mul + mulh[su].
//
// Builds the 24-bit multiply of N0 and N1. For Size <= 32 the result is the
// i32 MUL_[IU]24 node; otherwise it is an i64 BUILD_PAIR of MUL_[IU]24 (low)
// and MULHI_[IU]24 (high). Signed selects the I24 opcodes over the U24 ones.
static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
                        SDValue N0, SDValue N1, unsigned Size, bool Signed) {
  // The low product is needed for every size.
  const unsigned LoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
  SDValue LoProduct = DAG.getNode(LoOpc, SL, MVT::i32, N0, N1);
  if (Size <= 32)
    return LoProduct;

  const unsigned HiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
  SDValue HiProduct = DAG.getNode(HiOpc, SL, MVT::i32, N0, N1);
  return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, LoProduct, HiProduct);
}


/// If \p V is an ISD::ADD whose second operand is the constant 1, returns the
/// first operand. Otherwise returns an empty SDValue.
static SDValue getAddOneOp(const SDNode *V) {
  const bool IsAddOfOne =
      V->getOpcode() == ISD::ADD && isOneConstant(V->getOperand(1));
  if (IsAddOfOne)
    return V->getOperand(0);
  return SDValue();
}


/// Target combine for ISD::MUL on divergent scalar values of at most 64 bits.
///
/// First rewrites `mul x, (add y, 1)` into `add (mul x, y), x`; otherwise
/// replaces the multiply with a 24-bit multiply when both operands are known
/// to fit in 24 bits (unsigned preferred over signed).
///
/// \returns The replacement value, or an empty SDValue if nothing applies.
SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  assert(N->getOpcode() == ISD::MUL);
  const EVT VT = N->getValueType(0);

  // Don't generate 24-bit multiplies on values that are in SGPRs, since
  // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
  // unnecessarily). isDivergent() is used as an approximation of whether the
  // value is in an SGPR.
  if (!N->isDivergent())
    return SDValue();

  const unsigned BitWidth = VT.getSizeInBits();
  if (VT.isVector() || BitWidth > 64)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  // Undo InstCombine canonicalize X * (Y + 1) -> X * Y + X to enable mad
  // matching.

  // mul x, (add y, 1) -> add (mul x, y), x
  //
  // Yields y when V is (add y, 1) and V is used once or only by multiplies.
  auto MatchFoldableAddOne = [](SDValue V) -> SDValue {
    SDValue Addend = getAddOneOp(V.getNode());
    if (!Addend)
      return SDValue();

    const bool OnlyFeedsMuls =
        V.hasOneUse() || all_of(V->users(), [](const SDNode *U) -> bool {
          return U->getOpcode() == ISD::MUL;
        });
    return OnlyFeedsMuls ? Addend : SDValue();
  };

  // FIXME: The selection pattern is not properly checking for commuted
  // operands, so we have to place the mul in the LHS
  //
  // Try the left operand as the add first, then the right one.
  const std::pair<SDValue, SDValue> Candidates[] = {{LHS, RHS}, {RHS, LHS}};
  for (const auto &[AddSide, OtherSide] : Candidates) {
    if (SDValue Addend = MatchFoldableAddOne(AddSide)) {
      SDValue Product =
          DAG.getNode(N->getOpcode(), DL, VT, OtherSide, Addend);
      return DAG.getNode(ISD::ADD, DL, VT, Product, OtherSide);
    }
  }

  // There are i16 integer mul/mad.
  if (isTypeLegal(MVT::i16) && VT.getScalarType().bitsLE(MVT::i16))
    return SDValue();

  // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
  // in the source into any_extends if the result of the mul is truncated. Since
  // we can assume the high bits are whatever we want, use the underlying value
  // to avoid the unknown high bits from interfering.
  if (LHS.getOpcode() == ISD::ANY_EXTEND)
    LHS = LHS.getOperand(0);

  if (RHS.getOpcode() == ISD::ANY_EXTEND)
    RHS = RHS.getOperand(0);

  // Pick the flavour: unsigned is tried first, signed second.
  bool UseSigned;
  if (Subtarget->hasMulU24() && isU24(LHS, DAG) && isU24(RHS, DAG))
    UseSigned = false;
  else if (Subtarget->hasMulI24() && isI24(LHS, DAG) && isI24(RHS, DAG))
    UseSigned = true;
  else
    return SDValue();

  // Bring both operands to i32 with the extension matching the flavour.
  if (UseSigned) {
    LHS = DAG.getSExtOrTrunc(LHS, DL, MVT::i32);
    RHS = DAG.getSExtOrTrunc(RHS, DL, MVT::i32);
  } else {
    LHS = DAG.getZExtOrTrunc(LHS, DL, MVT::i32);
    RHS = DAG.getZExtOrTrunc(RHS, DL, MVT::i32);
  }
  SDValue Mul24 = getMul24(DAG, DL, LHS, RHS, BitWidth, UseSigned);

  // We need to use sext even for MUL_U24, because MUL_U24 is used
  // for signed multiply of 8 and 16-bit types.
  return DAG.getSExtOrTrunc(Mul24, DL, VT);
}


/// Target combine for i32 ISD::SMUL_LOHI / ISD::UMUL_LOHI: when both operands
/// are known to fit in 24 bits, replace both results of \p N with a
/// MUL_[IU]24 / MULHI_[IU]24 pair.
///
/// \returns SDValue(N, 0) after replacing N through CombineTo, or an empty
///          SDValue if nothing applies.
SDValue
AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  if (N->getValueType(0) != MVT::i32)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  const bool IsSigned = N->getOpcode() == ISD::SMUL_LOHI;
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
  // in the source into any_extends if the result of the mul is truncated. Since
  // we can assume the high bits are whatever we want, use the underlying value
  // to avoid the unknown high bits from interfering.
  if (LHS.getOpcode() == ISD::ANY_EXTEND)
    LHS = LHS.getOperand(0);
  if (RHS.getOpcode() == ISD::ANY_EXTEND)
    RHS = RHS.getOperand(0);

  // Try to use two fast 24-bit multiplies (one for each half of the result)
  // instead of one slow extending multiply.
  const bool Fits24 =
      IsSigned
          ? (Subtarget->hasMulI24() && isI24(LHS, DAG) && isI24(RHS, DAG))
          : (Subtarget->hasMulU24() && isU24(LHS, DAG) && isU24(RHS, DAG));
  if (!Fits24)
    return SDValue();

  // Bring both operands to i32 with the extension matching the signedness.
  if (IsSigned) {
    LHS = DAG.getSExtOrTrunc(LHS, DL, MVT::i32);
    RHS = DAG.getSExtOrTrunc(RHS, DL, MVT::i32);
  } else {
    LHS = DAG.getZExtOrTrunc(LHS, DL, MVT::i32);
    RHS = DAG.getZExtOrTrunc(RHS, DL, MVT::i32);
  }

  const unsigned LoOpc = IsSigned ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
  const unsigned HiOpc =
      IsSigned ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;

  SDValue LoProduct = DAG.getNode(LoOpc, DL, MVT::i32, LHS, RHS);
  SDValue HiProduct = DAG.getNode(HiOpc, DL, MVT::i32, LHS, RHS);
  DCI.CombineTo(N, LoProduct, HiProduct);
  return SDValue(N, 0);
}


/// Target combine for ISD::MULHS: replace it with AMDGPUISD::MULHI_I24 when
/// both operands are known to be signed 24-bit values.
///
/// The replacement node is created as i32 and then sign-extended or truncated
/// to the result type, so the combine is limited to scalar types of at most
/// 32 bits, the same restriction performMulhuCombine applies.
/// NOTE(review): for wider types the i32 MULHI_I24 result is presumably not
/// the high half of the full-width product -- confirm against the MULHI_I24
/// definition.
///
/// \returns The replacement value, or an empty SDValue if nothing applies.
SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);

  if (!Subtarget->hasMulI24() || VT.isVector() || VT.getSizeInBits() > 32)
    return SDValue();

  // Don't generate 24-bit multiplies on values that are in SGPRs, since
  // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
  // unnecessarily). isDivergent() is used as an approximation of whether the
  // value is in an SGPR.
  // This doesn't apply if no s_mul_hi is available (since we'll end up with a
  // valu op anyway)
  if (Subtarget->hasSMulHi() && !N->isDivergent())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  if (!isI24(N0, DAG) || !isI24(N1, DAG))
    return SDValue();

  // Bring both operands to i32, preserving their sign.
  N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
  N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);

  SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
  DCI.AddToWorklist(Mulhi.getNode());
  return DAG.getSExtOrTrunc(Mulhi, DL, VT);
}


/// Target combine for ISD::MULHU on scalar types of at most 32 bits: replace
/// it with AMDGPUISD::MULHI_U24 when both operands are known to be unsigned
/// 24-bit values.
///
/// \returns The replacement value, or an empty SDValue if nothing applies.
SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  const EVT VT = N->getValueType(0);

  if (VT.isVector() || VT.getSizeInBits() > 32)
    return SDValue();
  if (!Subtarget->hasMulU24())
    return SDValue();

  // Don't generate 24-bit multiplies on values that are in SGPRs, since
  // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
  // unnecessarily). isDivergent() is used as an approximation of whether the
  // value is in an SGPR.
  // This doesn't apply if no s_mul_hi is available (since we'll end up with a
  // valu op anyway)
  const bool StaysScalar = !N->isDivergent() && Subtarget->hasSMulHi();
  if (StaysScalar)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  const bool BothU24 = isU24(LHS, DAG) && isU24(RHS, DAG);
  if (!BothU24)
    return SDValue();

  // Bring both operands to i32 without introducing sign bits.
  SDValue LHS32 = DAG.getZExtOrTrunc(LHS, DL, MVT::i32);
  SDValue RHS32 = DAG.getZExtOrTrunc(RHS, DL, MVT::i32);

  SDValue HiProduct =
      DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, LHS32, RHS32);
  DCI.AddToWorklist(HiProduct.getNode());
  return DAG.getZExtOrTrunc(HiProduct, DL, VT);
}


/// Build the i32 node \p Opc applied to \p Op and return it in the type of
/// \p Op.
///
/// Operands narrower than i32 are zero-extended to i32 first and the result is
/// truncated back.
///
/// \returns The new value, or an empty SDValue when \p Op is wider than i32.
SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
                                          SDValue Op,
                                          const SDLoc &DL,
                                          unsigned Opc) const {
  const EVT VT = Op.getValueType();
  if (VT.bitsGT(MVT::i32))
    return SDValue();

  const bool NeedsResize = VT != MVT::i32;
  SDValue Src32 =
      NeedsResize ? DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op) : Op;
  SDValue Result32 = DAG.getNode(Opc, DL, MVT::i32, Src32);
  return NeedsResize ? DAG.getNode(ISD::TRUNCATE, DL, VT, Result32)
                     : Result32;
}


// The native instructions return -1 on 0 input. Optimize out a select that
// produces -1 on 0.
//
// Cond is the comparison feeding the select and LHS / RHS are the select's
// true / false values. Handled forms:
//   select (setcc x, 0, eq), -1, (ctlz_zero_poison x) -> ffbh_u32 x
//   select (setcc x, 0, eq), -1, (cttz_zero_poison x) -> ffbl_u32 x
//   select (setcc x, 0, ne), (ctlz_zero_poison x), -1 -> ffbh_u32 x
//   select (setcc x, 0, ne), (cttz_zero_poison x), -1 -> ffbl_u32 x
//
// TODO: If zero is not undef, we could also do this if the output is compared
// against the bitwidth.
//
// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
                                                 SDValue LHS, SDValue RHS,
                                                 DAGCombinerInfo &DCI) const {
  // Only comparisons against zero are of interest.
  if (!isNullConstant(Cond.getOperand(1)))
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  const ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
  SDValue Compared = Cond.getOperand(0);

  // Work out which select arm must be the bit count and which must be -1.
  SDValue CountArm, MinusOneArm;
  switch (CC) {
  case ISD::SETEQ:
    MinusOneArm = LHS;
    CountArm = RHS;
    break;
  case ISD::SETNE:
    CountArm = LHS;
    MinusOneArm = RHS;
    break;
  default:
    return SDValue();
  }

  const unsigned CountOpc = CountArm.getOpcode();
  const bool IsCttz = isCttzOpc(CountOpc);
  if (!IsCttz && !isCtlzOpc(CountOpc))
    return SDValue();

  // The count must be taken of the compared value, and the other arm must be
  // the all-ones constant.
  if (CountArm.getOperand(0) != Compared || !isAllOnesConstant(MinusOneArm))
    return SDValue();

  const unsigned NativeOpc =
      IsCttz ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
  return getFFBX_U32(DAG, Compared, SL, NativeOpc);
}


/// Rewrite  select Cond, (Op x), (Op y)  as  Op (select Cond, x, y).
///
/// \p N1 and \p N2 are the two select arms; only operand 0 of each is used,
/// and the result type is taken from \p N1. The inner select is queued on the
/// combiner worklist.
static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
                                         unsigned Op,
                                         const SDLoc &SL,
                                         SDValue Cond,
                                         SDValue N1,
                                         SDValue N2) {
  SelectionDAG &DAG = DCI.DAG;
  const EVT ResultVT = N1.getValueType();
  SDValue TrueSrc = N1.getOperand(0);
  SDValue FalseSrc = N2.getOperand(0);

  SDValue InnerSelect =
      DAG.getNode(ISD::SELECT, SL, ResultVT, Cond, TrueSrc, FalseSrc);
  DCI.AddToWorklist(InnerSelect.getNode());

  return DAG.getNode(Op, SL, ResultVT, InnerSelect);
}


// Pull a free FP operation out of a select so it may fold into uses.
//
// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
// select c, (fneg x), k -> fneg (select c, x, (fneg k))
//
// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
// select c, (fabs x), +k -> fabs (select c, x, k)
//
// N is the select being combined. Returns the replacement value, or an empty
// SDValue when no fold is performed.
SDValue
AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
                                           SDValue N) const {
  SelectionDAG &DAG = DCI.DAG;
  SDNode *Select = N.getNode();
  SDValue Cond = N.getOperand(0);
  SDValue TrueArm = N.getOperand(1);
  SDValue FalseArm = N.getOperand(2);
  const EVT VT = N.getValueType();

  const auto IsFreeOp = [](unsigned Opc) {
    return Opc == ISD::FABS || Opc == ISD::FNEG;
  };

  // Both arms apply the same free op: hoist it above the select.
  if (IsFreeOp(TrueArm.getOpcode()) &&
      TrueArm.getOpcode() == FalseArm.getOpcode()) {
    if (!AMDGPUTargetLowering::allUsesHaveSourceMods(Select))
      return SDValue();

    return distributeOpThroughSelect(DCI, TrueArm.getOpcode(), SDLoc(N), Cond,
                                     TrueArm, FalseArm);
  }

  // Canonicalize so that an fneg/fabs on the false arm is looked at as OpArm;
  // remember the swap so the original arm order can be restored later.
  SDValue OpArm = TrueArm;
  SDValue OtherArm = FalseArm;
  const bool Swapped = IsFreeOp(FalseArm.getOpcode());
  if (Swapped)
    std::swap(OpArm, OtherArm);

  const unsigned FreeOpc = OpArm.getOpcode();
  const bool IsFNeg = FreeOpc == ISD::FNEG;
  const bool IsFAbs = FreeOpc == ISD::FABS;
  if (!IsFNeg && !IsFAbs)
    return SDValue();

  // TODO: Support vector constants.
  const auto *K = dyn_cast<ConstantFPSDNode>(OtherArm);
  if (!K || selectSupportsSourceMods(Select))
    return SDValue();

  // One side is an fneg/fabs and the other is a constant, so the fneg/fabs can
  // be pushed down. For fabs, the constant needs to be non-negative.
  SDLoc SL(N);
  SDValue Inner = OpArm.getOperand(0);

  // Careful: if the op can be folded up into its single-use source, don't try
  // to pull it back down.
  if (Inner.hasOneUse()) {
    if (IsFNeg && fnegFoldsIntoOp(Inner.getNode()))
      return SDValue();
    if (IsFAbs && Inner.getOpcode() == ISD::FMUL)
      return SDValue();
  }

  if (IsFAbs && K->isNegative())
    return SDValue();

  // We're going to be forced to use a source modifier anyway, there's no
  // point to pulling the negate out unless we can get a size reduction by
  // negating the constant.
  //
  // TODO: Generalize to use getCheaperNegatedExpression which doesn't know
  // about cheaper constants.
  if (Inner.getOpcode() == ISD::FABS &&
      getConstantNegateCost(K) != NegatibleCost::Cheaper)
    return SDValue();

  if (!AMDGPUTargetLowering::allUsesHaveSourceMods(Select))
    return SDValue();

  // For fneg the constant has to be negated as well, so that the outer fneg
  // restores it; for fabs it is used unchanged.
  SDValue NewConst =
      IsFNeg ? DAG.getNode(ISD::FNEG, SL, VT, OtherArm) : OtherArm;

  // Undo the canonicalization so each value lands in its original arm.
  if (Swapped)
    std::swap(Inner, NewConst);

  SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond, Inner, NewConst);
  DCI.AddToWorklist(NewSelect.getNode());
  return DAG.getNode(FreeOpc, SL, VT, NewSelect);
}


/// Target combine for ISD::SELECT.
///
/// Tries, in order: pulling a free fneg/fabs out of the select; and, for a
/// setcc condition, (when the setcc has a single use) moving a constant true
/// operand to the false side or forming a legacy fmin/fmax, and finally the
/// ctlz/cttz "-1 on zero" fold.
///
/// \returns the replacement value, or an empty SDValue if nothing applied.
SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
                                                   DAGCombinerInfo &DCI) const {
  if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
    return Folded;

  SDValue Cond = N->getOperand(0);
  if (Cond.getOpcode() != ISD::SETCC)
    return SDValue();

  SDValue TrueVal = N->getOperand(1);
  SDValue FalseVal = N->getOperand(2);

  if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
    SelectionDAG &DAG = DCI.DAG;
    const EVT VT = N->getValueType(0);
    SDValue CmpLHS = Cond.getOperand(0);
    SDValue CmpRHS = Cond.getOperand(1);
    SDValue CC = Cond.getOperand(2);

    const bool OnlyTrueIsConstant = DAG.isConstantValueOfAnyType(TrueVal) &&
                                    !DAG.isConstantValueOfAnyType(FalseVal);
    if (OnlyTrueIsConstant) {
      // Swap cmp + select pair to move constant to false input.
      // This will allow using VOPC cndmasks more often.
      // select (setcc x, y), k, x -> select (setccinv x, y), x, k
      SDLoc SL(N);
      const ISD::CondCode InvCC = getSetCCInverse(
          cast<CondCodeSDNode>(CC)->get(), CmpLHS.getValueType());
      SDValue InvCond =
          DAG.getSetCC(SL, Cond.getValueType(), CmpLHS, CmpRHS, InvCC);
      return DAG.getNode(ISD::SELECT, SL, VT, InvCond, FalseVal, TrueVal);
    }

    // The result is returned as-is, even when it is empty. Re-queuing it to
    // catch min3/max3/med3 patterns is currently disabled.
    if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy())
      return combineFMinMaxLegacy(SDLoc(N), VT, CmpLHS, CmpRHS, TrueVal,
                                  FalseVal, CC, DCI);
  }

  // There's no reason to not do this if the condition has other uses.
  return performCtlz_CttzCombine(SDLoc(N), Cond, TrueVal, FalseVal, DCI);
}


static bool isInv2Pi(const APFloat &APF) {

  static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));

  static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));

  static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));


  return APF.bitwiseIsEqual(KF16) ||

         APF.bitwiseIsEqual(KF32) ||

         APF.bitwiseIsEqual(KF64);

}


// The negated forms of 0 and 1.0 / (2.0 * pi) do not have inline immediates,
// so there is an additional cost to negate them. Conversely, negating their
// already-negative forms is reported as cheaper. The 1 / (2 * pi) case only
// applies on subtargets reporting hasInv2PiInlineImm(). Every other constant
// is neutral.
TargetLowering::NegatibleCost
AMDGPUTargetLowering::getConstantNegateCost(const ConstantFPSDNode *C) const {
  const bool IsSignSensitive =
      C->isZero() ||
      (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()));
  if (!IsSignSensitive)
    return NegatibleCost::Neutral;

  if (C->isNegative())
    return NegatibleCost::Cheaper;
  return NegatibleCost::Expensive;
}


/// \returns true if \p N is an FP constant (or constant splat) for which
/// getConstantNegateCost() reports NegatibleCost::Expensive.
bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
  const ConstantFPSDNode *Const = isConstOrConstSplatFP(N);
  return Const && getConstantNegateCost(Const) == NegatibleCost::Expensive;
}


/// \returns true if \p N is an FP constant (or constant splat) for which
/// getConstantNegateCost() reports NegatibleCost::Cheaper.
bool AMDGPUTargetLowering::isConstantCheaperToNegate(SDValue N) const {
  const ConstantFPSDNode *Const = isConstOrConstSplatFP(N);
  return Const && getConstantNegateCost(Const) == NegatibleCost::Cheaper;
}


/// Map a floating-point min opcode to the matching max opcode and vice versa.
/// Any opcode that is not in the table below is a programming error.
static unsigned inverseMinMax(unsigned Opc) {
  // Each row is a {min, max} pair; looking up either member yields the other.
  static constexpr unsigned MinMaxPairs[][2] = {
      {ISD::FMINNUM, ISD::FMAXNUM},
      {ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
      {ISD::FMINIMUM, ISD::FMAXIMUM},
      {ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
      {AMDGPUISD::FMIN_LEGACY, AMDGPUISD::FMAX_LEGACY},
  };

  for (const auto &Pair : MinMaxPairs) {
    if (Opc == Pair[0])
      return Pair[1];
    if (Opc == Pair[1])
      return Pair[0];
  }

  llvm_unreachable("invalid min/max opcode");
}


/// \return true if it's profitable to try to push an fneg into its source
/// instruction.
///
/// \p N is the fneg node and \p N0 is its operand.
bool AMDGPUTargetLowering::shouldFoldFNegIntoSrc(SDNode *N, SDValue N0) {
  // Single-use source: this may be able to fold into the source, but at a
  // code size cost. Don't fold if the fold into the user is free.
  if (N0.hasOneUse())
    return !allUsesHaveSourceMods(N, 0);

  // Multi-use source whose opcode cannot absorb the negate: let the caller
  // try.
  if (!fnegFoldsIntoOp(N0.getNode()))
    return true;

  // Multi-use source that could absorb the negate: give up if the users of
  // the fneg can already take it as a source modifier, or if the other users
  // of the source cannot. This both prevents unprofitable transformations and
  // infinite loops: we won't repeatedly try to fold around a negate that has
  // no 'good' form.
  return !allUsesHaveSourceMods(N) && allUsesHaveSourceMods(N0.getNode());
}


/// Target combine for ISD::FNEG: try to push the negation into the node that
/// produces the operand, so that it can cancel against existing fnegs or be
/// absorbed by the operands of that node.
///
/// \param N   The fneg node being combined.
/// \param DCI Combiner state (gives access to the DAG and the worklist).
/// \returns the value that replaces \p N, or an empty SDValue if nothing was
/// changed.
SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,

                                                 DAGCombinerInfo &DCI) const {

  SelectionDAG &DAG = DCI.DAG;

  SDValue N0 = N->getOperand(0);

  EVT VT = N->getValueType(0);


  unsigned Opc = N0.getOpcode();


  // Profitability gate shared by every case below.
  if (!shouldFoldFNegIntoSrc(N, N0))

    return SDValue();


  SDLoc SL(N);

  switch (Opc) {

  case ISD::FADD: {

    // Negating both addends is only done when signed zeros may be ignored,
    // either for the fadd or via the fneg's own flags.
    if (!mayIgnoreSignedZero(N0) && !N->getFlags().hasNoSignedZeros())

      return SDValue();


    // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))

    SDValue LHS = N0.getOperand(0);

    SDValue RHS = N0.getOperand(1);


    // An operand that is already an fneg is simply stripped instead of being
    // negated twice.
    if (LHS.getOpcode() != ISD::FNEG)

      LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);

    else

      LHS = LHS.getOperand(0);


    if (RHS.getOpcode() != ISD::FNEG)

      RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);

    else

      RHS = RHS.getOperand(0);


    SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());

    if (Res.getOpcode() != ISD::FADD)

      return SDValue(); // Op got folded away.

    // N0 has other users: re-express N0 for them as the fneg of the new,
    // negated node.
    if (!N0.hasOneUse())

      DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));

    return Res;

  }

  case ISD::FMUL:

  case AMDGPUISD::FMUL_LEGACY: {

    // (fneg (fmul x, y)) -> (fmul x, (fneg y))

    // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))

    SDValue LHS = N0.getOperand(0);

    SDValue RHS = N0.getOperand(1);


    // Only one factor needs its sign flipped: strip an existing fneg from
    // either operand if there is one, otherwise negate the RHS.
    if (LHS.getOpcode() == ISD::FNEG)

      LHS = LHS.getOperand(0);

    else if (RHS.getOpcode() == ISD::FNEG)

      RHS = RHS.getOperand(0);

    else

      RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);


    SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());

    if (Res.getOpcode() != Opc)

      return SDValue(); // Op got folded away.

    // Keep the other users of N0 correct (see the FADD case).
    if (!N0.hasOneUse())

      DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));

    return Res;

  }

  case ISD::FMA:

  case ISD::FMAD: {

    // TODO: handle llvm.amdgcn.fma.legacy

    if (!mayIgnoreSignedZero(N0) && !N->getFlags().hasNoSignedZeros())

      return SDValue();


    // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))

    SDValue LHS = N0.getOperand(0);

    SDValue MHS = N0.getOperand(1);

    SDValue RHS = N0.getOperand(2);


    // Flip the sign of the product through exactly one of the two factors...
    if (LHS.getOpcode() == ISD::FNEG)

      LHS = LHS.getOperand(0);

    else if (MHS.getOpcode() == ISD::FNEG)

      MHS = MHS.getOperand(0);

    else

      MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);


    // ...and always flip the sign of the addend.
    if (RHS.getOpcode() != ISD::FNEG)

      RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);

    else

      RHS = RHS.getOperand(0);


    // Note: unlike the FADD / FMUL cases, N0's flags are not passed here.
    SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);

    if (Res.getOpcode() != Opc)

      return SDValue(); // Op got folded away.

    if (!N0.hasOneUse())

      DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));

    return Res;

  }

  case ISD::FMAXNUM:

  case ISD::FMINNUM:

  case ISD::FMAXNUM_IEEE:

  case ISD::FMINNUM_IEEE:

  case ISD::FMINIMUM:

  case ISD::FMAXIMUM:

  case ISD::FMINIMUMNUM:

  case ISD::FMAXIMUMNUM:

  case AMDGPUISD::FMAX_LEGACY:

  case AMDGPUISD::FMIN_LEGACY: {

    // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)

    // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)

    // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)

    // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)


    SDValue LHS = N0.getOperand(0);

    SDValue RHS = N0.getOperand(1);


    // 0 doesn't have a negated inline immediate.

    // TODO: This constant check should be generalized to other operations.

    if (isConstantCostlierToNegate(RHS))

      return SDValue();


    SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);

    SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);

    unsigned Opposite = inverseMinMax(Opc);


    SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());

    if (Res.getOpcode() != Opposite)

      return SDValue(); // Op got folded away.

    if (!N0.hasOneUse())

      DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));

    return Res;

  }

  case AMDGPUISD::FMED3: {

    // fneg (fmed3 x, y, z) -> fmed3 (fneg x), (fneg y), (fneg z)
    SDValue Ops[3];

    for (unsigned I = 0; I < 3; ++I)

      Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());


    SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());

    if (Res.getOpcode() != AMDGPUISD::FMED3)

      return SDValue(); // Op got folded away.


    if (!N0.hasOneUse()) {

      SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res);

      DAG.ReplaceAllUsesWith(N0, Neg);


      // Queue everything that now uses the replacement fneg so it gets
      // revisited.
      for (SDNode *U : Neg->users())

        DCI.AddToWorklist(U);

    }


    return Res;

  }

  case ISD::FP_EXTEND:

  case ISD::FTRUNC:

  case ISD::FRINT:

  case ISD::FNEARBYINT: // XXX - Should fround be handled?

  case ISD::FROUNDEVEN:

  case ISD::FSIN:

  case ISD::FCANONICALIZE:

  case AMDGPUISD::RCP:

  case AMDGPUISD::RCP_LEGACY:

  case AMDGPUISD::RCP_IFLAG:

  case AMDGPUISD::SIN_HW: {

    // Unary operations through which the negation is moved unchanged.
    SDValue CvtSrc = N0.getOperand(0);

    if (CvtSrc.getOpcode() == ISD::FNEG) {

      // (fneg (fp_extend (fneg x))) -> (fp_extend x)

      // (fneg (rcp (fneg x))) -> (rcp x)

      return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));

    }


    // Without an fneg to cancel, only rewrite when N0 has no other users.
    if (!N0.hasOneUse())

      return SDValue();


    // (fneg (fp_extend x)) -> (fp_extend (fneg x))

    // (fneg (rcp x)) -> (rcp (fneg x))

    SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);

    return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());

  }

  case ISD::FP_ROUND: {

    // Same as the unary case above, but FP_ROUND carries a second operand
    // that has to be forwarded to the new node.
    SDValue CvtSrc = N0.getOperand(0);


    if (CvtSrc.getOpcode() == ISD::FNEG) {

      // (fneg (fp_round (fneg x))) -> (fp_round x)

      return DAG.getNode(ISD::FP_ROUND, SL, VT,

                         CvtSrc.getOperand(0), N0.getOperand(1));

    }


    if (!N0.hasOneUse())

      return SDValue();


    // (fneg (fp_round x)) -> (fp_round (fneg x))

    SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);

    return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));

  }

  case ISD::FP16_TO_FP: {

    // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal

    // f16, but legalization of f16 fneg ends up pulling it out of the source.

    // Put the fneg back as a legal source operation that can be matched later.

    // (This SL shadows the identical one declared before the switch.)
    SDLoc SL(N);


    SDValue Src = N0.getOperand(0);

    EVT SrcVT = Src.getValueType();


    // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)

    SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,

                                  DAG.getConstant(0x8000, SL, SrcVT));

    return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);

  }

  case ISD::SELECT: {

    // fneg (select c, a, b) -> select c, (fneg a), (fneg b)

    // TODO: Invert conditions of foldFreeOpFromSelect

    // Not implemented: no change is made for selects.
    return SDValue();

  }

  case ISD::BITCAST: {

    // (This SL shadows the identical one declared before the switch.)
    SDLoc SL(N);

    SDValue BCSrc = N0.getOperand(0);

    if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {

      // The last build_vector element is treated as the high bits; it must be
      // 32 bits wide and be produced by an op that an fneg folds into.
      SDValue HighBits = BCSrc.getOperand(BCSrc.getNumOperands() - 1);

      if (HighBits.getValueType().getSizeInBits() != 32 ||

          !fnegFoldsIntoOp(HighBits.getNode()))

        return SDValue();


      // f64 fneg only really needs to operate on the high half of the

      // register, so try to force it to an f32 operation to help make use of

      // source modifiers.

      //

      //

      // fneg (f64 (bitcast (build_vector x, y))) ->

      // f64 (bitcast (build_vector (bitcast i32:x to f32),

      //                            (fneg (bitcast i32:y to f32)))


      SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::f32, HighBits);

      SDValue NegHi = DAG.getNode(ISD::FNEG, SL, MVT::f32, CastHi);

      SDValue CastBack =

          DAG.getNode(ISD::BITCAST, SL, HighBits.getValueType(), NegHi);


      // Rebuild the vector with only its last element replaced.
      SmallVector<SDValue, 8> Ops(BCSrc->ops());

      Ops.back() = CastBack;

      DCI.AddToWorklist(NegHi.getNode());

      SDValue Build =

          DAG.getNode(ISD::BUILD_VECTOR, SL, BCSrc.getValueType(), Ops);

      SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, Build);


      if (!N0.hasOneUse())

        DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Result));

      return Result;

    }


    if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32 &&

        BCSrc.hasOneUse()) {

      // fneg (bitcast (f32 (select cond, i32:lhs, i32:rhs))) ->

      //   select cond, (bitcast i32:lhs to f32), (bitcast i32:rhs to f32)

      // (both new select operands are additionally wrapped in an fneg below)


      // TODO: Cast back result for multiple uses is beneficial in some cases.


      SDValue LHS =

          DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(1));

      SDValue RHS =

          DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(2));


      SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, LHS);

      SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHS);


      return DAG.getNode(ISD::SELECT, SL, MVT::f32, BCSrc.getOperand(0), NegLHS,

                         NegRHS);

    }


    return SDValue();

  }

  default:

    return SDValue();

  }

}


/// Target combine for ISD::FABS.
///
/// The only pattern handled is a single-use FP16_TO_FP source, where the fabs
/// is rewritten as an integer mask on the half-precision bits:
///   fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
///
/// \returns the replacement value, or an empty SDValue otherwise.
SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  SDValue Src = N->getOperand(0);
  if (!Src.hasOneUse() || Src.getOpcode() != ISD::FP16_TO_FP)
    return SDValue();

  assert(!isTypeLegal(MVT::f16) && "should only see if f16 is illegal");

  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);
  SDValue HalfBits = Src.getOperand(0);
  const EVT BitsVT = HalfBits.getValueType();

  // Clear bit 15 of the integer operand.
  SDValue SignMask = DAG.getConstant(0x7fff, SL, BitsVT);
  SDValue Masked = DAG.getNode(ISD::AND, SL, BitsVT, HalfBits, SignMask);
  return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), Masked);
}


/// Constant-fold a reciprocal node whose operand is an FP constant.
///
/// \returns the constant 1.0 / operand in the operand's FP semantics, or an
/// empty SDValue when the operand is not a ConstantFPSDNode.
SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  const auto *ConstSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
  if (ConstSrc == nullptr)
    return SDValue();

  // XXX - Should this flush denormals?
  const APFloat &Denominator = ConstSrc->getValueAPF();
  const APFloat Numerator(Denominator.getSemantics(), "1.0");
  return DCI.DAG.getConstantFP(Numerator / Denominator, SDLoc(N),
                               N->getValueType(0));
}


/// Decide whether the integer or FP constant node \p N counts as a legal
/// 64-bit immediate on the current subtarget.
///
/// Returns false for non-GCN subtargets, subtargets without hasVMovB64Inst(),
/// and nodes that are neither ConstantSDNode nor ConstantFPSDNode. Otherwise
/// the constant is legal if the subtarget has 64-bit literals, if its bit
/// pattern fits in an unsigned 32-bit value, or if it is an inline constant.
bool AMDGPUTargetLowering::isInt64ImmLegal(SDNode *N, SelectionDAG &DAG) const {
  if (!Subtarget->isGCN())
    return false;

  const auto *IntConst = dyn_cast<ConstantSDNode>(N);
  const auto *FPConst = dyn_cast<ConstantFPSDNode>(N);
  const bool IsConstant = IntConst || FPConst;

  const GCNSubtarget &ST = DAG.getSubtarget<GCNSubtarget>();
  const auto *TII = ST.getInstrInfo();

  if (!ST.hasVMovB64Inst() || !IsConstant)
    return false;

  if (ST.has64BitLiterals())
    return true;

  // Treat both kinds of constant uniformly through their raw bit pattern.
  const APInt Bits = IntConst ? IntConst->getAPIntValue()
                              : FPConst->getValueAPF().bitcastToAPInt();
  return isUInt<32>(Bits.getZExtValue()) || TII->isInlineConstant(Bits);
}


/// Top-level target DAG combine entry point: dispatch on the opcode of \p N.
///
/// Most opcodes are forwarded to a dedicated perform*Combine helper; BITCAST,
/// the BFE nodes and FMAD_FTZ are handled inline.
///
/// \param N   The node being combined.
/// \param DCI Combiner state (gives access to the DAG, the current combine
///            level and the worklist).
/// \returns the value that replaces \p N, or an empty SDValue if no combine
/// applied.
SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,

                                                DAGCombinerInfo &DCI) const {

  SelectionDAG &DAG = DCI.DAG;

  SDLoc DL(N);


  switch(N->getOpcode()) {

  default:

    break;

  case ISD::BITCAST: {

    EVT DestVT = N->getValueType(0);


    // Push casts through vector builds. This helps avoid emitting a large

    // number of copies when materializing floating point vector constants.

    //

    // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>

    //   vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))

    if (DestVT.isVector()) {

      SDValue Src = N->getOperand(0);

      // After DAG legalization this is only done if BUILD_VECTOR is legal for
      // the destination type.
      if (Src.getOpcode() == ISD::BUILD_VECTOR &&

          (DCI.getDAGCombineLevel() < AfterLegalizeDAG ||

           isOperationLegal(ISD::BUILD_VECTOR, DestVT))) {

        EVT SrcVT = Src.getValueType();

        unsigned NElts = DestVT.getVectorNumElements();


        // Element-wise casting requires equal element counts.
        if (SrcVT.getVectorNumElements() == NElts) {

          EVT DestEltVT = DestVT.getVectorElementType();


          SmallVector<SDValue, 8> CastedElts;

          SDLoc SL(N);

          for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {

            SDValue Elt = Src.getOperand(I);

            CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));

          }


          return DAG.getBuildVector(DestVT, SL, CastedElts);

        }

      }

    }


    // The constant folds below only apply to 64-bit vector destinations.
    if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())

      break;


    // Fold bitcasts of constants.

    //

    // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)

    // TODO: Generalize and move to DAGCombiner

    SDValue Src = N->getOperand(0);

    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {

      SDLoc SL(N);

      // Leave constants alone that are reported legal as 64-bit immediates.
      if (isInt64ImmLegal(C, DAG))

        break;

      uint64_t CVal = C->getZExtValue();

      SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,

                               DAG.getConstant(Lo_32(CVal), SL, MVT::i32),

                               DAG.getConstant(Hi_32(CVal), SL, MVT::i32));

      return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);

    }


    // Same fold for FP constants, using their raw bit pattern.
    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {

      const APInt &Val = C->getValueAPF().bitcastToAPInt();

      SDLoc SL(N);

      if (isInt64ImmLegal(C, DAG))

        break;

      uint64_t CVal = Val.getZExtValue();

      SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,

                                DAG.getConstant(Lo_32(CVal), SL, MVT::i32),

                                DAG.getConstant(Hi_32(CVal), SL, MVT::i32));


      return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);

    }


    break;

  }

  case ISD::SHL:

  case ISD::SRA:

  case ISD::SRL: {

    // Range metadata can be invalidated when loads are converted to legal types

    // (e.g. v2i64 -> v4i32).

    // Try to convert vector shl/sra/srl before type legalization so that range

    // metadata can be utilized.

    // I.e. the shift combines run either for vector types before type
    // legalization, or for any type once the DAG has been legalized.
    if (!(N->getValueType(0).isVector() &&

          DCI.getDAGCombineLevel() == BeforeLegalizeTypes) &&

        DCI.getDAGCombineLevel() < AfterLegalizeDAG)

      break;

    if (N->getOpcode() == ISD::SHL)

      return performShlCombine(N, DCI);

    if (N->getOpcode() == ISD::SRA)

      return performSraCombine(N, DCI);

    return performSrlCombine(N, DCI);

  }

  case ISD::TRUNCATE:

    return performTruncateCombine(N, DCI);

  case ISD::MUL:

    return performMulCombine(N, DCI);

  case AMDGPUISD::MUL_U24:

  case AMDGPUISD::MUL_I24: {

    if (SDValue Simplified = simplifyMul24(N, DCI))

      return Simplified;

    break;

  }

  case AMDGPUISD::MULHI_I24:

  case AMDGPUISD::MULHI_U24:

    return simplifyMul24(N, DCI);

  case ISD::SMUL_LOHI:

  case ISD::UMUL_LOHI:

    return performMulLoHiCombine(N, DCI);

  case ISD::MULHS:

    return performMulhsCombine(N, DCI);

  case ISD::MULHU:

    return performMulhuCombine(N, DCI);

  case ISD::SELECT:

    return performSelectCombine(N, DCI);

  case ISD::FNEG:

    return performFNegCombine(N, DCI);

  case ISD::FABS:

    return performFAbsCombine(N, DCI);

  case AMDGPUISD::BFE_I32:

  case AMDGPUISD::BFE_U32: {

    // Operands as used below: 0 = source bits, 1 = offset, 2 = width.
    assert(!N->getValueType(0).isVector() &&

           "Vector handling of BFE not implemented");

    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));

    if (!Width)

      break;


    // Only the low 5 bits of the width are used; a zero width folds to 0.
    uint32_t WidthVal = Width->getZExtValue() & 0x1f;

    if (WidthVal == 0)

      return DAG.getConstant(0, DL, MVT::i32);


    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));

    if (!Offset)

      break;


    SDValue BitsFrom = N->getOperand(0);

    // Likewise, only the low 5 bits of the offset are used.
    uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;


    bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;


    if (OffsetVal == 0) {

      // This is already sign / zero extended, so try to fold away extra BFEs.

      unsigned SignBits =  Signed ? (32 - WidthVal + 1) : (32 - WidthVal);


      unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);

      if (OpSignBits >= SignBits)

        return BitsFrom;


      EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);

      if (Signed) {

        // This is a sign_extend_inreg. Replace it to take advantage of existing

        // DAG Combines. If not eliminated, we will match back to BFE during

        // selection.


        // TODO: The sext_inreg of extended types ends, although we could

        // handle them in a single BFE.

        return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,

                           DAG.getValueType(SmallVT));

      }


      return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);

    }


    // Constant source with constant offset and width: fold completely.
    if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {

      if (Signed) {

        return constantFoldBFE<int32_t>(DAG,

                                        CVal->getSExtValue(),

                                        OffsetVal,

                                        WidthVal,

                                        DL);

      }


      return constantFoldBFE<uint32_t>(DAG,

                                       CVal->getZExtValue(),

                                       OffsetVal,

                                       WidthVal,

                                       DL);

    }


    // The field reaches (or passes) bit 31, so a plain right shift by the
    // offset extracts it. The 16/16 case is kept as a BFE on subtargets with
    // SDWA.
    if ((OffsetVal + WidthVal) >= 32 &&

        !(OffsetVal == 16 && WidthVal == 16 && Subtarget->hasSDWA())) {

      SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);

      return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,

                         BitsFrom, ShiftVal);

    }


    // Only bits [OffsetVal, OffsetVal + WidthVal) of the source are read, so
    // try to simplify a single-use source accordingly.
    if (BitsFrom.hasOneUse()) {

      APInt Demanded = APInt::getBitsSet(32,

                                         OffsetVal,

                                         OffsetVal + WidthVal);


      KnownBits Known;

      TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),

                                            !DCI.isBeforeLegalizeOps());

      const TargetLowering &TLI = DAG.getTargetLoweringInfo();

      if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||

          TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {

        DCI.CommitTargetLoweringOpt(TLO);

      }

    }


    break;

  }

  case ISD::LOAD:

    return performLoadCombine(N, DCI);

  case ISD::STORE:

    return performStoreCombine(N, DCI);

  case AMDGPUISD::RCP:

  case AMDGPUISD::RCP_IFLAG:

    return performRcpCombine(N, DCI);

  case ISD::AssertZext:

  case ISD::AssertSext:

    return performAssertSZExtCombine(N, DCI);

  case ISD::INTRINSIC_WO_CHAIN:

    return performIntrinsicWOChainCombine(N, DCI);

  case AMDGPUISD::FMAD_FTZ: {

    SDValue N0 = N->getOperand(0);

    SDValue N1 = N->getOperand(1);

    SDValue N2 = N->getOperand(2);

    EVT VT = N->getValueType(0);


    // FMAD_FTZ is a FMAD + flush denormals to zero.

    // We flush the inputs, the intermediate step, and the output.

    ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);

    ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);

    ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2);

    // Constant-fold only when all three operands are FP constants.
    if (N0CFP && N1CFP && N2CFP) {

      // Replace a denormal by a zero of the same sign; pass anything else
      // through unchanged.
      const auto FTZ = [](const APFloat &V) {

        if (V.isDenormal()) {

          APFloat Zero(V.getSemantics(), 0);

          return V.isNegative() ? -Zero : Zero;

        }

        return V;

      };


      APFloat V0 = FTZ(N0CFP->getValueAPF());

      APFloat V1 = FTZ(N1CFP->getValueAPF());

      APFloat V2 = FTZ(N2CFP->getValueAPF());

      V0.multiply(V1, APFloat::rmNearestTiesToEven);

      V0 = FTZ(V0);

      V0.add(V2, APFloat::rmNearestTiesToEven);

      return DAG.getConstantFP(FTZ(V0), DL, VT);

    }

    break;

  }

  }

  return SDValue();

}


/// Demanded-bits simplification hook for target nodes.
///
/// For the readfirstlane, readlane, set_inactive and wwm intrinsics the
/// demanded bits and elements of the result are forwarded unchanged to
/// operand 1. All other nodes are left alone.
///
/// NOTE(review): \p Known is whatever the recursive call computed for
/// operand 1 — confirm this is appropriate for set_inactive, which presumably
/// also takes an inactive-lane value.
///
/// \returns true if the recursive simplification changed something.
bool AMDGPUTargetLowering::SimplifyDemandedBitsForTargetNode(
    SDValue Op, const APInt &OriginalDemandedBits,
    const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
    unsigned Depth) const {
  if (Op.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
    return false;

  // Operand 0 of the node holds the intrinsic ID.
  switch (Op.getConstantOperandVal(0)) {
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_readlane:
  case Intrinsic::amdgcn_set_inactive:
  case Intrinsic::amdgcn_wwm:
    return SimplifyDemandedBits(Op.getOperand(1), OriginalDemandedBits,
                                OriginalDemandedElts, Known, TLO, Depth + 1);
  default:
    return false;
  }
}


//===----------------------------------------------------------------------===//

// Helper functions

//===----------------------------------------------------------------------===//


/// Return a DAG value for the live-in register \p Reg.
///
/// If \p Reg is not yet recorded as a live-in, a new virtual register of
/// class \p RC is created and registered for it; otherwise the existing
/// live-in virtual register is reused.
///
/// \param RawReg If true, return a plain register node for the virtual
///               register; otherwise return a CopyFromReg chained to the
///               entry node.
SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
                                                   const TargetRegisterClass *RC,
                                                   Register Reg, EVT VT,
                                                   const SDLoc &SL,
                                                   bool RawReg) const {
  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();

  Register LiveInVReg;
  if (MRI.isLiveIn(Reg)) {
    LiveInVReg = MRI.getLiveInVirtReg(Reg);
  } else {
    LiveInVReg = MRI.createVirtualRegister(RC);
    MRI.addLiveIn(Reg, LiveInVReg);
  }

  return RawReg ? DAG.getRegister(LiveInVReg, VT)
                : DAG.getCopyFromReg(DAG.getEntryNode(), SL, LiveInVReg, VT);
}


// This may be called multiple times, and nothing prevents creating multiple
// objects at the same offset. See if we already defined this object.
//
// Scans the negative object indices starting at getObjectIndexBegin() for an
// object at Offset and returns its index; the existing object is asserted to
// have the requested Size. If there is none, a new fixed object is created.
static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
                                       int64_t Offset) {
  const int FirstIdx = MFI.getObjectIndexBegin();
  for (int Idx = FirstIdx; Idx < 0; ++Idx) {
    if (MFI.getObjectOffset(Idx) != Offset)
      continue;

    assert(MFI.getObjectSize(Idx) == Size);
    return Idx;
  }

  return MFI.CreateFixedObject(Size, Offset, /*IsImmutable=*/true);
}


/// Load an incoming value of type \p VT from the fixed stack object at
/// \p Offset. The fixed object is looked up (or created) by offset, and the
/// load is chained to the entry node, 4-byte aligned, and flagged as
/// dereferenceable and invariant.
SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
                                                  EVT VT,
                                                  const SDLoc &SL,
                                                  int64_t Offset) const {
  MachineFunction &MF = DAG.getMachineFunction();

  const int FrameIdx =
      getOrCreateFixedStackObject(MF.getFrameInfo(), VT.getStoreSize(), Offset);

  MachinePointerInfo StackInfo = MachinePointerInfo::getStack(MF, Offset);
  SDValue FramePtr = DAG.getFrameIndex(FrameIdx, MVT::i32);

  const MachineMemOperand::Flags LoadFlags =
      MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant;
  return DAG.getLoad(VT, SL, DAG.getEntryNode(), FramePtr, StackInfo, Align(4),
                     LoadFlags);
}


/// Store \p ArgVal to the argument stack area at \p Offset and return the
/// store node. The address is the stack-pointer-offset register (read through
/// a CopyFromReg on \p Chain) plus the constant offset; the store is 4-byte
/// aligned and flagged as dereferenceable.
SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
                                                   const SDLoc &SL,
                                                   SDValue Chain,
                                                   SDValue ArgVal,
                                                   int64_t Offset) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachinePointerInfo StackInfo = MachinePointerInfo::getStack(MF, Offset);
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  SDValue OffsetVal = DAG.getConstant(Offset, SL, MVT::i32);

  // Stores to the argument stack area are relative to the stack pointer.
  SDValue StackPtr = DAG.getCopyFromReg(
      Chain, SL, FuncInfo->getStackPtrOffsetReg(), MVT::i32);
  SDValue Addr = DAG.getNode(ISD::ADD, SL, MVT::i32, StackPtr, OffsetVal);

  return DAG.getStore(Chain, SL, ArgVal, Addr, StackInfo, Align(4),
                      MachineMemOperand::MODereferenceable);
}


/// Materialize the input described by \p Arg as a value of type \p VT.
///
/// Register arguments become live-in registers of class \p RC; all others are
/// loaded from their stack offset. When the descriptor carries a mask, the
/// value is shifted right by the mask's trailing-zero count and ANDed with the
/// shifted mask so only the described field remains.
SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
                                             const TargetRegisterClass *RC,
                                             EVT VT, const SDLoc &SL,
                                             const ArgDescriptor &Arg) const {
  assert(Arg && "Attempting to load missing argument");

  SDValue Val;
  if (Arg.isRegister())
    Val = CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL);
  else
    Val = loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());

  if (Arg.isMasked()) {
    const unsigned FieldMask = Arg.getMask();
    const unsigned FieldShift = llvm::countr_zero<unsigned>(FieldMask);

    SDValue ShiftAmt = DAG.getShiftAmountConstant(FieldShift, VT, SL);
    Val = DAG.getNode(ISD::SRL, SL, VT, Val, ShiftAmt);

    SDValue LowMask = DAG.getConstant(FieldMask >> FieldShift, SL, VT);
    Val = DAG.getNode(ISD::AND, SL, VT, Val, LowMask);
  }

  return Val;
}


uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(

    uint64_t ExplicitKernArgSize, const ImplicitParameter Param) const {

  unsigned ExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();

  const Align Alignment = Subtarget->getAlignmentForImplicitArgPtr();

  uint64_t ArgOffset =

      alignTo(ExplicitKernArgSize, Alignment) + ExplicitArgOffset;

  switch (Param) {

  case FIRST_IMPLICIT:

    return ArgOffset;

  case PRIVATE_BASE:

    return ArgOffset + AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET;

  case SHARED_BASE:

    return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;

  case QUEUE_PTR:

    return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;

  }

  llvm_unreachable("unexpected implicit parameter type");

}


uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(

    const MachineFunction &MF, const ImplicitParameter Param) const {

  const AMDGPUMachineFunctionInfo *MFI =

      MF.getInfo<AMDGPUMachineFunctionInfo>();

  return getImplicitParameterOffset(MFI->getExplicitKernArgSize(), Param);

}


/// Provide a reciprocal-square-root estimate node for \p Operand.
///
/// Only f32 is handled: an AMDGPUISD::RSQ node is returned and
/// \p RefinementSteps is set to 0. Any other type returns an empty SDValue.
/// \p Enabled, \p UseOneConstNR and \p Reciprocal are not read here.
SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
                                              SelectionDAG &DAG, int Enabled,
                                              int &RefinementSteps,
                                              bool &UseOneConstNR,
                                              bool Reciprocal) const {
  const EVT OpVT = Operand.getValueType();

  if (OpVT != MVT::f32) {
    // TODO: There is also f64 rsq instruction, but the documentation is less
    // clear on its precision.
    return SDValue();
  }

  RefinementSteps = 0;
  return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), OpVT, Operand);
}


/// Provide a reciprocal estimate node for \p Operand.
///
/// Only f32 is handled: an AMDGPUISD::RCP node is returned and
/// \p RefinementSteps is set to 0. Any other type returns an empty SDValue.
/// \p Enabled is not read here.
SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
                                               SelectionDAG &DAG, int Enabled,
                                               int &RefinementSteps) const {
  const EVT OpVT = Operand.getValueType();

  if (OpVT != MVT::f32) {
    // TODO: There is also f64 rcp instruction, but the documentation is less
    // clear on its precision.
    return SDValue();
  }

  // Reciprocal, < 1 ulp error.
  //
  // This reciprocal approximation converges to < 0.5 ulp error with one
  // Newton-Raphson step performed with two fused multiply-adds (FMAs).
  RefinementSteps = 0;
  return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), OpVT, Operand);
}


/// Map a workitem-id intrinsic ID to its dimension index (x = 0, y = 1,
/// z = 2). Any other ID is a programming error and hits llvm_unreachable.
static unsigned workitemIntrinsicDim(unsigned ID) {
  if (ID == Intrinsic::amdgcn_workitem_id_x)
    return 0;
  if (ID == Intrinsic::amdgcn_workitem_id_y)
    return 1;
  if (ID == Intrinsic::amdgcn_workitem_id_z)
    return 2;

  llvm_unreachable("not a workitem intrinsic");
}


void AMDGPUTargetLowering::computeKnownBitsForTargetNode(

    const SDValue Op, KnownBits &Known,

    const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {


  Known.resetAll(); // Don't know anything.


  unsigned Opc = Op.getOpcode();


  switch (Opc) {

  default:

    break;

  case AMDGPUISD::CARRY:

  case AMDGPUISD::BORROW: {

    Known.Zero = APInt::getHighBitsSet(32, 31);

    break;

  }


  case AMDGPUISD::BFE_I32:

  case AMDGPUISD::BFE_U32: {

    ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));

    if (!CWidth)

      return;


    uint32_t Width = CWidth->getZExtValue() & 0x1f;


    if (Opc == AMDGPUISD::BFE_U32)

      Known.Zero = APInt::getHighBitsSet(32, 32 - Width);


    break;

  }

  case AMDGPUISD::FP_TO_FP16: {

    unsigned BitWidth = Known.getBitWidth();


    // High bits are zero.

    Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);

    break;

  }

  case AMDGPUISD::MUL_U24:

  case AMDGPUISD::MUL_I24: {

    KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);

    KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);

    unsigned BitWidth = Op.getScalarValueSizeInBits();


    // Sign/Zero extend from 24 bits.

    if (Opc == AMDGPUISD::MUL_I24) {

      LHSKnown = LHSKnown.trunc(24).sext(BitWidth);

      RHSKnown = RHSKnown.trunc(24).sext(BitWidth);

    } else {

      LHSKnown = LHSKnown.trunc(24).zext(BitWidth);

      RHSKnown = RHSKnown.trunc(24).zext(BitWidth);

    }


    // TODO: SelfMultiply can be poison, but not undef.

    bool SelfMultiply = Op.getOperand(0) == Op.getOperand(1);

    if (SelfMultiply)

      SelfMultiply &= DAG.isGuaranteedNotToBeUndefOrPoison(

          Op.getOperand(0), DemandedElts, UndefPoisonKind::UndefOrPoison,

          Depth + 1);


    Known = KnownBits::mul(LHSKnown, RHSKnown, SelfMultiply);

    break;

  }

  case AMDGPUISD::PERM: {

    ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));

    if (!CMask)

      return;


    KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);

    KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);

    unsigned Sel = CMask->getZExtValue();


    for (unsigned I = 0; I < 32; I += 8) {

      unsigned SelBits = Sel & 0xff;

      if (SelBits < 4) {

        SelBits *= 8;

        Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;

        Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;

      } else if (SelBits < 7) {

        SelBits = (SelBits & 3) * 8;

        Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;

        Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;

      } else if (SelBits == 0x0c) {

        Known.Zero |= 0xFFull << I;

      } else if (SelBits > 0x0c) {

        Known.One |= 0xFFull << I;

      }

      Sel >>= 8;

    }

    break;

  }

  case AMDGPUISD::BUFFER_LOAD_UBYTE:  {

    Known.Zero.setHighBits(24);

    break;

  }

  case AMDGPUISD::BUFFER_LOAD_USHORT: {

    Known.Zero.setHighBits(16);

    break;

  }

  case AMDGPUISD::LDS: {

    auto *GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());

    Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout());


    Known.Zero.setHighBits(16);

    Known.Zero.setLowBits(Log2(Alignment));

    break;

  }

  case AMDGPUISD::SMIN3:

  case AMDGPUISD::SMAX3:

  case AMDGPUISD::SMED3:

  case AMDGPUISD::UMIN3:

  case AMDGPUISD::UMAX3:

  case AMDGPUISD::UMED3: {

    KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);

    if (Known2.isUnknown())

      break;


    KnownBits Known1 = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);

    if (Known1.isUnknown())

      break;


    KnownBits Known0 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);

    if (Known0.isUnknown())

      break;


    // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.

    Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;

    Known.One = Known0.One & Known1.One & Known2.One;

    break;

  }

  case ISD::INTRINSIC_WO_CHAIN: {

    unsigned IID = Op.getConstantOperandVal(0);

    switch (IID) {

    case Intrinsic::amdgcn_workitem_id_x:

    case Intrinsic::amdgcn_workitem_id_y:

    case Intrinsic::amdgcn_workitem_id_z: {

      unsigned MaxValue = Subtarget->getMaxWorkitemID(

          DAG.getMachineFunction().getFunction(), workitemIntrinsicDim(IID));

      Known.Zero.setHighBits(llvm::countl_zero(MaxValue));

      break;

    }

    default:

      break;

    }

  }

  }

}


/// Report the number of known sign bits for AMDGPU target nodes.
///
/// Returns 1 (no information) for any opcode not handled below, and for the
/// bitfield extracts when the width operand is not a constant.
unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
    unsigned Depth) const {
  switch (Op.getOpcode()) {
  case AMDGPUISD::BFE_I32: {
    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
    if (!Width)
      return 1;

    // Only the low 5 bits of the width operand are used. A W-bit field
    // sign-extended to 32 bits has 32 - W + 1 sign bits. Clamp to 32 so a
    // masked width of 0 cannot report more sign bits than the value has.
    unsigned FieldWidth = Width->getZExtValue() & 0x1f;
    unsigned SignBits = std::min(32u, 32 - FieldWidth + 1);
    if (!isNullConstant(Op.getOperand(1)))
      return SignBits;

    // TODO: Could probably figure something out with non-0 offsets.
    // With a zero offset the source's own sign bits are used when larger.
    unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
    return std::max(SignBits, Op0SignBits);
  }

  case AMDGPUISD::BFE_U32: {
    // Bits above a W-bit unsigned field are zero, giving 32 - W sign bits.
    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
    return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
  }

  case AMDGPUISD::CARRY:
  case AMDGPUISD::BORROW:
    return 31;
  case AMDGPUISD::BUFFER_LOAD_BYTE:
    return 25;
  case AMDGPUISD::BUFFER_LOAD_SHORT:
    return 17;
  case AMDGPUISD::BUFFER_LOAD_UBYTE:
    return 24;
  case AMDGPUISD::BUFFER_LOAD_USHORT:
    return 16;
  case AMDGPUISD::FP_TO_FP16:
    return 16;
  case AMDGPUISD::SMIN3:
  case AMDGPUISD::SMAX3:
  case AMDGPUISD::SMED3:
  case AMDGPUISD::UMIN3:
  case AMDGPUISD::UMAX3:
  case AMDGPUISD::UMED3: {
    // The result has at least as many sign bits as the weakest operand.
    // Operands are queried last-to-first, stopping once one reports 1.
    unsigned Tmp2 = DAG.ComputeNumSignBits(Op.getOperand(2), Depth + 1);
    if (Tmp2 == 1)
      return 1; // Early out.

    unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
    if (Tmp1 == 1)
      return 1; // Early out.

    unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
    if (Tmp0 == 1)
      return 1; // Early out.

    return std::min({Tmp0, Tmp1, Tmp2});
  }
  default:
    return 1;
  }
}


/// GlobalISel counterpart of the sign-bit query: report the number of known
/// sign bits of register \p R based on its defining instruction. Returns 1
/// when \p R has no definition or the defining opcode is not handled.
unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
    GISelValueTracking &Analysis, Register R, const APInt &DemandedElts,
    const MachineRegisterInfo &MRI, unsigned Depth) const {
  const MachineInstr *DefMI = MRI.getVRegDef(R);
  if (!DefMI)
    return 1;

  // TODO: Check range metadata on MMO.
  switch (DefMI->getOpcode()) {
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
    return 25;
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
    return 17;
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
    return 24;
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
    return 16;
  case AMDGPU::G_AMDGPU_SMED3:
  case AMDGPU::G_AMDGPU_UMED3: {
    auto [Dst, Src0, Src1, Src2] = DefMI->getFirst4Regs();

    // Take the minimum over the three sources, querying them last-to-first
    // and bailing out as soon as one of them reports a single sign bit.
    unsigned FewestSignBits = ~0u;
    for (Register Src : {Src2, Src1, Src0}) {
      unsigned SrcSignBits =
          Analysis.computeNumSignBits(Src, DemandedElts, Depth + 1);
      if (SrcSignBits == 1)
        return 1;
      FewestSignBits = std::min(FewestSignBits, SrcSignBits);
    }
    return FewestSignBits;
  }
  default:
    return 1;
  }
}


/// The bitfield-extract nodes are reported as never creating undef or poison;
/// every other node is answered by the generic TargetLowering implementation.
bool AMDGPUTargetLowering::canCreateUndefOrPoisonForTargetNode(
    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
    UndefPoisonKind Kind, bool ConsiderFlags, unsigned Depth) const {
  const unsigned Opc = Op.getOpcode();
  if (Opc == AMDGPUISD::BFE_I32 || Opc == AMDGPUISD::BFE_U32)
    return false;

  return TargetLowering::canCreateUndefOrPoisonForTargetNode(
      Op, DemandedElts, DAG, Kind, ConsiderFlags, Depth);
}


/// Decide whether an AMDGPU target node (or chainless intrinsic) is known to
/// never produce a NaN, or never a signaling NaN when \p SNaN is set.
///
/// Most handled nodes answer true outright for the signaling-only query. For
/// the general query they either answer false or require the listed operands
/// to be known never-NaN themselves. Unhandled opcodes answer false.
bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(
    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN,
    unsigned Depth) const {
  // True when operands [First, First + Count) are all known never-NaN.
  // Operands are checked in ascending order and the scan stops at the first
  // one that is not.
  auto OperandsNeverNaN = [&](unsigned First, unsigned Count) {
    for (unsigned Idx = First, End = First + Count; Idx != End; ++Idx)
      if (!DAG.isKnownNeverNaN(Op.getOperand(Idx), SNaN, Depth + 1))
        return false;
    return true;
  };

  switch (Op.getOpcode()) {
  case AMDGPUISD::FMIN_LEGACY:
  case AMDGPUISD::FMAX_LEGACY:
    // TODO: Can check no nans on one of the operands for each one, but which
    // one?
    return SNaN;

  case AMDGPUISD::FMUL_LEGACY:
  case AMDGPUISD::CVT_PKRTZ_F16_F32:
    return SNaN || OperandsNeverNaN(0, 2);

  case AMDGPUISD::FMED3:
  case AMDGPUISD::FMIN3:
  case AMDGPUISD::FMAX3:
  case AMDGPUISD::FMINIMUM3:
  case AMDGPUISD::FMAXIMUM3:
  case AMDGPUISD::FMAD_FTZ:
    return SNaN || OperandsNeverNaN(0, 3);

  case AMDGPUISD::CVT_F32_UBYTE0:
  case AMDGPUISD::CVT_F32_UBYTE1:
  case AMDGPUISD::CVT_F32_UBYTE2:
  case AMDGPUISD::CVT_F32_UBYTE3:
    return true;

  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RSQ_CLAMP:
    // TODO: Need is known positive check.
    return SNaN;

  case ISD::FLDEXP:
  case AMDGPUISD::FRACT:
    return SNaN || OperandsNeverNaN(0, 1);

  case AMDGPUISD::DIV_SCALE:
  case AMDGPUISD::DIV_FMAS:
  case AMDGPUISD::DIV_FIXUP:
    // TODO: Refine on operands.
    return SNaN;

  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::COS_HW:
    // TODO: Need check for infinity
    return SNaN;

  case ISD::INTRINSIC_WO_CHAIN: {
    // Operand 0 is the intrinsic ID, so value operands start at index 1.
    // TODO: Handle more intrinsics
    switch (Op.getConstantOperandVal(0)) {
    case Intrinsic::amdgcn_cubeid:
    case Intrinsic::amdgcn_cvt_off_f32_i4:
      return true;

    case Intrinsic::amdgcn_frexp_mant:
      return SNaN || OperandsNeverNaN(1, 1);

    case Intrinsic::amdgcn_cvt_pkrtz:
      return SNaN || OperandsNeverNaN(1, 2);

    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rsq:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_rsq_clamp:
    case Intrinsic::amdgcn_tanh:
      // TODO: Need is known positive check.
      return SNaN;

    case Intrinsic::amdgcn_trig_preop:
    case Intrinsic::amdgcn_fdot2:
      // TODO: Refine on operand
      return SNaN;

    case Intrinsic::amdgcn_fma_legacy:
      return SNaN || OperandsNeverNaN(1, 3);

    default:
      return false;
    }
  }
  default:
    return false;
  }
}


/// Reassociation is considered profitable when \p N0 has exactly one
/// non-debug use. \p N1 is not consulted.
bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
                                               Register N0, Register N1) const {
  // FIXME: handle regbanks
  const bool HasSingleUse = MRI.hasOneNonDBGUse(N0);
  return HasSingleUse;
}


Trap
@ Trap
Definition AArch64AsmPrinter.cpp:85

SDValue
return SDValue()

assert
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")

hasSourceMods
static LLVM_READONLY bool hasSourceMods(const MachineInstr &MI)
Definition AMDGPUCombinerHelper.cpp:85

isInv2Pi
static bool isInv2Pi(const APFloat &APF)
Definition AMDGPUCombinerHelper.cpp:149

opMustUseVOP3Encoding
static LLVM_READONLY bool opMustUseVOP3Encoding(const MachineInstr &MI, const MachineRegisterInfo &MRI)
returns true if the operation will definitely need to use a 64-bit encoding, and thus will use a VOP3...
Definition AMDGPUCombinerHelper.cpp:77

inverseMinMax
static unsigned inverseMinMax(unsigned Opc)
Definition AMDGPUCombinerHelper.cpp:175

extractF64Exponent
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
Definition AMDGPUISelLowering.cpp:2495

workitemIntrinsicDim
static unsigned workitemIntrinsicDim(unsigned ID)
Definition AMDGPUISelLowering.cpp:6031

getOrCreateFixedStackObject
static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size, int64_t Offset)
Definition AMDGPUISelLowering.cpp:5898

constantFoldBFE
static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset, uint32_t Width, const SDLoc &DL)
Definition AMDGPUISelLowering.cpp:4112

getMad
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
Definition AMDGPUISelLowering.cpp:2794

getAddOneOp
static SDValue getAddOneOp(const SDNode *V)
If V is an add of a constant 1, returns the other operand.
Definition AMDGPUISelLowering.cpp:4776

selectSupportsSourceMods
static LLVM_READONLY bool selectSupportsSourceMods(const SDNode *N)
Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the type for ISD::SELECT.
Definition AMDGPUISelLowering.cpp:731

AMDGPUBypassSlowDiv
static cl::opt< bool > AMDGPUBypassSlowDiv("amdgpu-bypass-slow-div", cl::desc("Skip 64-bit divide for dynamic 32-bit values"), cl::init(true))

getMul24
static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL, SDValue N0, SDValue N1, unsigned Size, bool Signed)
Definition AMDGPUISelLowering.cpp:4758

fnegFoldsIntoOp
static bool fnegFoldsIntoOp(const SDNode *N)
Definition AMDGPUISelLowering.cpp:702

isI24
static bool isI24(SDValue Op, SelectionDAG &DAG)
Definition AMDGPUISelLowering.cpp:4053

isCttzOpc
static bool isCttzOpc(unsigned Opc)
Definition AMDGPUISelLowering.cpp:3356

isU24
static bool isU24(SDValue Op, SelectionDAG &DAG)
Definition AMDGPUISelLowering.cpp:4049

peekFPSignOps
static SDValue peekFPSignOps(SDValue Val)
Definition AMDGPUISelLowering.cpp:1674

valueIsKnownNeverF32Denorm
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
Definition AMDGPUISelLowering.cpp:2657

distributeOpThroughSelect
static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI, unsigned Op, const SDLoc &SL, SDValue Cond, SDValue N1, SDValue N2)
Definition AMDGPUISelLowering.cpp:5042

peekFNeg
static SDValue peekFNeg(SDValue Val)
Definition AMDGPUISelLowering.cpp:1667

simplifyMul24
static SDValue simplifyMul24(SDNode *Node24, TargetLowering::DAGCombinerInfo &DCI)
Definition AMDGPUISelLowering.cpp:4060

isCtlzOpc
static bool isCtlzOpc(unsigned Opc)
Definition AMDGPUISelLowering.cpp:3352

fnegFoldsIntoOpcode
static LLVM_READNONE bool fnegFoldsIntoOpcode(unsigned Opc)
Definition AMDGPUISelLowering.cpp:663

hasVolatileUser
static bool hasVolatileUser(SDNode *Val)
Definition AMDGPUISelLowering.cpp:4127

AMDGPUISelLowering.h
Interface definition of the TargetLowering class that is common to all AMD GPUs.

AMDGPUInstrInfo.h
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.

AMDGPUMachineFunctionInfo.h

AMDGPUMemoryUtils.h

AMDGPUSelectionDAGInfo.h

AMDGPU.h

DL
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Definition ARMSLSHardening.cpp:73

Results
Function Alias Analysis Results
Definition AliasAnalysis.cpp:808

X
#define X(NUM, ENUM, NAME)
Definition ELF.h:853

Analysis
block Block Frequency Analysis
Definition BlockFrequencyInfo.cpp:300

A
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")

D
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")

B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")

CommandLine.h

LLVM_READNONE
#define LLVM_READNONE
Definition Compiler.h:315

LLVM_READONLY
#define LLVM_READONLY
Definition Compiler.h:322

DiagnosticInfo.h

GISelValueTracking.h
Provides analysis for querying information about KnownBits during GISel passes.

TII
const HexagonInstrInfo * TII
Definition HexagonCopyToCombine.cpp:118

MI
IRTranslator LLVM IR MI
Definition IRTranslator.cpp:110

InlinePriorityMode::Size
@ Size
Definition InlineOrder.cpp:25

Ops
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
Definition ItaniumDemangle.h:3391

KnownBits.h

F
#define F(x, y, z)
Definition MD5.cpp:54

I
#define I(x, y, z)
Definition MD5.cpp:57

G
#define G(x, y, z)
Definition MD5.cpp:55

MachineFrameInfo.h

getDebugLoc
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
Definition MachineInstrBundle.cpp:104

T
#define T
Definition Mips16ISelLowering.cpp:282

Signed
@ Signed
Definition NVPTXISelLowering.cpp:6398

P
#define P(N)

Cond
const SmallVectorImpl< MachineOperand > & Cond
Definition RISCVRedundantCopyElimination.cpp:73

Opc
auto Opc
Definition RISCVRedundantCopyElimination.cpp:77

CH
#define CH(x, y, z)
Definition SHA256.cpp:34

SIMachineFunctionInfo.h

Y
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")

RHS
Value * RHS
Definition X86PartialReduction.cpp:81

LHS
Value * LHS
Definition X86PartialReduction.cpp:80

Mul
BinaryOperator * Mul
Definition X86PartialReduction.cpp:75

llvm::AMDGPUCallLowering::CCAssignFnForCall
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Definition AMDGPUISelLowering.cpp:1133

llvm::AMDGPUCallLowering::CCAssignFnForReturn
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
Definition AMDGPUISelLowering.cpp:1161

llvm::AMDGPUMachineFunctionInfo
Definition AMDGPUMachineFunctionInfo.h:24

llvm::AMDGPUMachineFunctionInfo::getExplicitKernArgSize
uint64_t getExplicitKernArgSize() const
Definition AMDGPUMachineFunctionInfo.h:75

llvm::AMDGPUMachineFunctionInfo::isModuleEntryFunction
bool isModuleEntryFunction() const
Definition AMDGPUMachineFunctionInfo.h:91

llvm::AMDGPUMachineFunctionInfo::allocateLDSGlobal
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
Definition AMDGPUMachineFunctionInfo.h:105

llvm::AMDGPUMachineFunctionInfo::recordNumNamedBarriers
void recordNumNamedBarriers(uint32_t GVAddr, unsigned BarCnt)
Definition AMDGPUMachineFunctionInfo.h:83

llvm::AMDGPUMachineFunctionInfo::getLDSAbsoluteAddress
static std::optional< uint32_t > getLDSAbsoluteAddress(const GlobalValue &GV)
Definition AMDGPUMachineFunctionInfo.cpp:185

llvm::AMDGPUSubtarget
Definition AMDGPUSubtarget.h:30

llvm::AMDGPUTargetLowering::numBitsSigned
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
Definition AMDGPUISelLowering.cpp:56

llvm::AMDGPUTargetLowering::combineFMinMaxLegacy
SDValue combineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
Generate Min/Max node.
Definition AMDGPUISelLowering.cpp:1753

llvm::AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
Definition AMDGPUISelLowering.cpp:6191

llvm::AMDGPUTargetLowering::performMulhuCombine
SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const
Definition AMDGPUISelLowering.cpp:4950

llvm::AMDGPUTargetLowering::getTypeForExtReturn
EVT getTypeForExtReturn(LLVMContext &Context, EVT VT, ISD::NodeType ExtendKind) const override
Return the type that should be used to zero or sign extend a zeroext/signext integer return value.
Definition AMDGPUISelLowering.cpp:802

llvm::AMDGPUTargetLowering::SplitVectorLoad
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
Definition AMDGPUISelLowering.cpp:1876

llvm::AMDGPUTargetLowering::LowerCONCAT_VECTORS
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const
Definition AMDGPUISelLowering.cpp:1596

llvm::AMDGPUTargetLowering::performLoadCombine
SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const
Definition AMDGPUISelLowering.cpp:4159

llvm::AMDGPUTargetLowering::analyzeFormalArgumentsCompute
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
Definition AMDGPUISelLowering.cpp:1215

llvm::AMDGPUTargetLowering::LowerF64ToF16Safe
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
Definition AMDGPUISelLowering.cpp:3798

llvm::AMDGPUTargetLowering::LowerFROUND
SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const
Definition AMDGPUISelLowering.cpp:2604

llvm::AMDGPUTargetLowering::storeStackInputValue
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
Definition AMDGPUISelLowering.cpp:5926

llvm::AMDGPUTargetLowering::storeOfVectorConstantIsCheap
bool storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT, unsigned NumElem, unsigned AS) const override
Return true if it is expected to be cheaper to do a store of vector constant with the given size and ...
Definition AMDGPUISelLowering.cpp:980

llvm::AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const
Definition AMDGPUISelLowering.cpp:1631

llvm::AMDGPUTargetLowering::computeKnownBitsForTargetNode
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
Definition AMDGPUISelLowering.cpp:6044

llvm::AMDGPUTargetLowering::shouldCombineMemoryType
bool shouldCombineMemoryType(EVT VT) const
Definition AMDGPUISelLowering.cpp:4138

llvm::AMDGPUTargetLowering::splitBinaryBitConstantOpImpl
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
Definition AMDGPUISelLowering.cpp:4330

llvm::AMDGPUTargetLowering::lowerUnhandledCall
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
Definition AMDGPUISelLowering.cpp:1390

llvm::AMDGPUTargetLowering::LowerGlobalAddress
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunctionInfo *MFI, SDValue Op, SelectionDAG &DAG) const
Definition AMDGPUISelLowering.cpp:1538

llvm::AMDGPUTargetLowering::performAssertSZExtCombine
SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const
Definition AMDGPUISelLowering.cpp:4269

llvm::AMDGPUTargetLowering::isTruncateFree
bool isTruncateFree(EVT Src, EVT Dest) const override
Definition AMDGPUISelLowering.cpp:998

llvm::AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources
bool aggressivelyPreferBuildVectorSources(EVT VecVT) const override
Definition AMDGPUISelLowering.cpp:986

llvm::AMDGPUTargetLowering::LowerFCEIL
SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const
Definition AMDGPUISelLowering.cpp:2470

llvm::AMDGPUTargetLowering::getConstantNegateCost
TargetLowering::NegatibleCost getConstantNegateCost(const ConstantFPSDNode *C) const
Definition AMDGPUISelLowering.cpp:5201

llvm::AMDGPUTargetLowering::LowerFLOGUnsafe
SDValue LowerFLOGUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, bool IsLog10, SDNodeFlags Flags) const
Definition AMDGPUISelLowering.cpp:2913

llvm::AMDGPUTargetLowering::performMulhsCombine
SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const
Definition AMDGPUISelLowering.cpp:4917

llvm::AMDGPUTargetLowering::lowerFEXPUnsafeImpl
SDValue lowerFEXPUnsafeImpl(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags, bool IsExp10) const
Definition AMDGPUISelLowering.cpp:3113

llvm::AMDGPUTargetLowering::isSDNodeAlwaysUniform
bool isSDNodeAlwaysUniform(const SDNode *N) const override
Definition AMDGPUISelLowering.cpp:906

llvm::AMDGPUTargetLowering::isDesirableToCommuteWithShift
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to move this shift by a constant amount through its operand,...
Definition AMDGPUISelLowering.cpp:1087

llvm::AMDGPUTargetLowering::performShlCombine
SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const
Definition AMDGPUISelLowering.cpp:4353

llvm::AMDGPUTargetLowering::isCheapToSpeculateCtlz
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
Definition AMDGPUISelLowering.cpp:902

llvm::AMDGPUTargetLowering::LowerSDIVREM
SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const
Definition AMDGPUISelLowering.cpp:2410

llvm::AMDGPUTargetLowering::isFNegFree
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
Definition AMDGPUISelLowering.cpp:973

llvm::AMDGPUTargetLowering::LowerFLOG10
SDValue LowerFLOG10(SDValue Op, SelectionDAG &DAG) const
Definition AMDGPUISelLowering.cpp:2907

llvm::AMDGPUTargetLowering::LowerINT_TO_FP64
SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const
Definition AMDGPUISelLowering.cpp:3592

llvm::AMDGPUTargetLowering::computeNumSignBitsForTargetInstr
unsigned computeNumSignBitsForTargetInstr(GISelValueTracking &Analysis, Register R, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
Definition AMDGPUISelLowering.cpp:6252

llvm::AMDGPUTargetLowering::LowerOperation
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
Definition AMDGPUISelLowering.cpp:1436

llvm::AMDGPUTargetLowering::LowerFP_TO_FP16
SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const
Definition AMDGPUISelLowering.cpp:3781

llvm::AMDGPUTargetLowering::addTokenForArgument
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
Definition AMDGPUISelLowering.cpp:1356

llvm::AMDGPUTargetLowering::isConstantCheaperToNegate
bool isConstantCheaperToNegate(SDValue N) const
Definition AMDGPUISelLowering.cpp:5217

llvm::AMDGPUTargetLowering::isReassocProfitable
bool isReassocProfitable(MachineRegisterInfo &MRI, Register N0, Register N1) const override
Definition AMDGPUISelLowering.cpp:6416

llvm::AMDGPUTargetLowering::isKnownNeverNaNForTargetNode
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
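If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns true if Op is known to never be a signaling NaN.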
Definition AMDGPUISelLowering.cpp:6301

llvm::AMDGPUTargetLowering::needsDenormHandlingF32
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
Definition AMDGPUISelLowering.cpp:2692

llvm::AMDGPUTargetLowering::getImplicitParameterOffset
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
Definition AMDGPUISelLowering.cpp:5985

llvm::AMDGPUTargetLowering::lowerFEXPF64
SDValue lowerFEXPF64(SDValue Op, SelectionDAG &DAG) const
Definition AMDGPUISelLowering.cpp:2953

llvm::AMDGPUTargetLowering::LowerFFLOOR
SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const
Definition AMDGPUISelLowering.cpp:2631

llvm::AMDGPUTargetLowering::performSelectCombine
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const
Definition AMDGPUISelLowering.cpp:5142

llvm::AMDGPUTargetLowering::performFNegCombine
SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const
Definition AMDGPUISelLowering.cpp:5271

llvm::AMDGPUTargetLowering::LowerFP_TO_INT
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
Definition AMDGPUISelLowering.cpp:3886

llvm::AMDGPUTargetLowering::isConstantCostlierToNegate
bool isConstantCostlierToNegate(SDValue N) const
Definition AMDGPUISelLowering.cpp:5211

llvm::AMDGPUTargetLowering::loadInputValue
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
Definition AMDGPUISelLowering.cpp:5945

llvm::AMDGPUTargetLowering::LowerDIVREM24
SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const
Definition AMDGPUISelLowering.cpp:2014

llvm::AMDGPUTargetLowering::lowerFEXP10Unsafe
SDValue lowerFEXP10Unsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
Emit approx-funcs appropriate lowering for exp10.
Definition AMDGPUISelLowering.cpp:3163

llvm::AMDGPUTargetLowering::shouldReduceLoadWidth
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtType, EVT ExtVT, std::optional< unsigned > ByteOffset) const override
Return true if it is profitable to reduce a load to a smaller type.
Definition AMDGPUISelLowering.cpp:834

llvm::AMDGPUTargetLowering::LowerUINT_TO_FP
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
Definition AMDGPUISelLowering.cpp:3611

llvm::AMDGPUTargetLowering::canCreateUndefOrPoisonForTargetNode
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, UndefPoisonKind Kind, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
Definition AMDGPUISelLowering.cpp:6288

llvm::AMDGPUTargetLowering::isCheapToSpeculateCttz
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
Definition AMDGPUISelLowering.cpp:898

llvm::AMDGPUTargetLowering::performCtlz_CttzCombine
SDValue performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS, SDValue RHS, DAGCombinerInfo &DCI) const
Definition AMDGPUISelLowering.cpp:5008

llvm::AMDGPUTargetLowering::performSraCombine
SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const
Definition AMDGPUISelLowering.cpp:4452

llvm::AMDGPUTargetLowering::isSelectSupported
bool isSelectSupported(SelectSupportKind) const override
Definition AMDGPUISelLowering.cpp:817

llvm::AMDGPUTargetLowering::isZExtFree
bool isZExtFree(Type *Src, Type *Dest) const override
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
Definition AMDGPUISelLowering.cpp:1019

llvm::AMDGPUTargetLowering::lowerFEXP2
SDValue lowerFEXP2(SDValue Op, SelectionDAG &DAG) const
Definition AMDGPUISelLowering.cpp:3059

llvm::AMDGPUTargetLowering::LowerCall
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
Definition AMDGPUISelLowering.cpp:1421

llvm::AMDGPUTargetLowering::performSrlCombine
SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const
Definition AMDGPUISelLowering.cpp:4558

llvm::AMDGPUTargetLowering::lowerFEXP
SDValue lowerFEXP(SDValue Op, SelectionDAG &DAG) const
Definition AMDGPUISelLowering.cpp:3217

llvm::AMDGPUTargetLowering::getIsLtSmallestNormal
SDValue getIsLtSmallestNormal(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
Definition AMDGPUISelLowering.cpp:2701

llvm::AMDGPUTargetLowering::mayIgnoreSignedZero
bool mayIgnoreSignedZero(SDValue Op) const
Definition AMDGPUISelLowering.cpp:650

llvm::AMDGPUTargetLowering::getIsFinite
SDValue getIsFinite(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
Definition AMDGPUISelLowering.cpp:2719

llvm::AMDGPUTargetLowering::isLoadBitCastBeneficial
bool isLoadBitCastBeneficial(EVT, EVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const final
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
Definition AMDGPUISelLowering.cpp:874

llvm::AMDGPUTargetLowering::splitVector
std::pair< SDValue, SDValue > splitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HighVT, SelectionDAG &DAG) const
Split a vector value into two parts of types LoVT and HiVT.
Definition AMDGPUISelLowering.cpp:1841

llvm::AMDGPUTargetLowering::AMDGPUTargetLowering
AMDGPUTargetLowering(const TargetMachine &TM, const TargetSubtargetInfo &STI, const AMDGPUSubtarget &AMDGPUSTI)
Definition AMDGPUISelLowering.cpp:62

llvm::AMDGPUTargetLowering::LowerFLOGCommon
SDValue LowerFLOGCommon(SDValue Op, SelectionDAG &DAG) const
Definition AMDGPUISelLowering.cpp:2800

llvm::AMDGPUTargetLowering::foldFreeOpFromSelect
SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, SDValue N) const
Definition AMDGPUISelLowering.cpp:5065

llvm::AMDGPUTargetLowering::LowerINT_TO_FP32
SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const
Definition AMDGPUISelLowering.cpp:3463

llvm::AMDGPUTargetLowering::isFAbsFree
bool isFAbsFree(EVT VT) const override
Return true if an fabs operation is free to the point where it is never worthwhile to replace it with...
Definition AMDGPUISelLowering.cpp:965

llvm::AMDGPUTargetLowering::isInt64ImmLegal
bool isInt64ImmLegal(SDNode *Val, SelectionDAG &DAG) const
Check whether value Val can be supported by v_mov_b64, for the current target.
Definition AMDGPUISelLowering.cpp:5572

llvm::AMDGPUTargetLowering::loadStackInputValue
SDValue loadStackInputValue(SelectionDAG &DAG, EVT VT, const SDLoc &SL, int64_t Offset) const
Similar to CreateLiveInRegister, except value may be loaded from a stack slot rather than passed in a ...
Definition AMDGPUISelLowering.cpp:5910

llvm::AMDGPUTargetLowering::LowerFLOG2
SDValue LowerFLOG2(SDValue Op, SelectionDAG &DAG) const
Definition AMDGPUISelLowering.cpp:2759

llvm::AMDGPUTargetLowering::getEquivalentMemType
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
Definition AMDGPUISelLowering.cpp:41

llvm::AMDGPUTargetLowering::LowerCTLS
SDValue LowerCTLS(SDValue Op, SelectionDAG &DAG) const
Split a vector store into multiple scalar stores.
Definition AMDGPUISelLowering.cpp:3450

llvm::AMDGPUTargetLowering::getSqrtEstimate
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps, bool &UseOneConstNR, bool Reciprocal) const override
Hooks for building estimates in place of slower divisions and square roots.
Definition AMDGPUISelLowering.cpp:5992

llvm::AMDGPUTargetLowering::performTruncateCombine
SDValue performTruncateCombine(SDNode *N, DAGCombinerInfo &DCI) const
Definition AMDGPUISelLowering.cpp:4661

llvm::AMDGPUTargetLowering::LowerSINT_TO_FP
SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
Definition AMDGPUISelLowering.cpp:3657

llvm::AMDGPUTargetLowering::ImplicitParameter
ImplicitParameter
Definition AMDGPUISelLowering.h:399

llvm::AMDGPUTargetLowering::SHARED_BASE
@ SHARED_BASE
Definition AMDGPUISelLowering.h:402

llvm::AMDGPUTargetLowering::PRIVATE_BASE
@ PRIVATE_BASE
Definition AMDGPUISelLowering.h:401

llvm::AMDGPUTargetLowering::FIRST_IMPLICIT
@ FIRST_IMPLICIT
Definition AMDGPUISelLowering.h:400

llvm::AMDGPUTargetLowering::QUEUE_PTR
@ QUEUE_PTR
Definition AMDGPUISelLowering.h:403

llvm::AMDGPUTargetLowering::stripBitcast
static SDValue stripBitcast(SDValue Val)
Definition AMDGPUISelLowering.h:198

llvm::AMDGPUTargetLowering::LowerBlockAddress
SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const
Definition AMDGPUISelLowering.cpp:1529

llvm::AMDGPUTargetLowering::CreateLiveInRegister
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
Definition AMDGPUISelLowering.cpp:5874

llvm::AMDGPUTargetLowering::SplitVectorStore
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
Definition AMDGPUISelLowering.cpp:1970

llvm::AMDGPUTargetLowering::LowerCTLZ_CTTZ
SDValue LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const
Definition AMDGPUISelLowering.cpp:3390

llvm::AMDGPUTargetLowering::getNegatedExpression
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const override
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
Definition AMDGPUISelLowering.cpp:930

llvm::AMDGPUTargetLowering::split64BitValue
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
Definition AMDGPUISelLowering.cpp:1792

llvm::AMDGPUTargetLowering::performMulCombine
SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const
Definition AMDGPUISelLowering.cpp:4783

llvm::AMDGPUTargetLowering::getRecipEstimate
SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps) const override
Return a reciprocal estimate value for the input operand.
Definition AMDGPUISelLowering.cpp:6010

llvm::AMDGPUTargetLowering::LowerFNEARBYINT
SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const
Definition AMDGPUISelLowering.cpp:2584

llvm::AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const
Definition AMDGPUISelLowering.cpp:4022

llvm::AMDGPUTargetLowering::CCAssignFnForReturn
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
Definition AMDGPUISelLowering.cpp:1351

llvm::AMDGPUTargetLowering::getScaledLogInput
std::pair< SDValue, SDValue > getScaledLogInput(SelectionDAG &DAG, const SDLoc SL, SDValue Op, SDNodeFlags Flags) const
If denormal handling is required return the scaled input to FLOG2, and the check for denormal range.
Definition AMDGPUISelLowering.cpp:2736

llvm::AMDGPUTargetLowering::CCAssignFnForCall
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
Definition AMDGPUISelLowering.cpp:1346

llvm::AMDGPUTargetLowering::SimplifyDemandedBitsForTargetNode
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &OriginalDemandedBits, const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
Definition AMDGPUISelLowering.cpp:5842

llvm::AMDGPUTargetLowering::allUsesHaveSourceMods
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
Definition AMDGPUISelLowering.cpp:776

llvm::AMDGPUTargetLowering::LowerFROUNDEVEN
SDValue LowerFROUNDEVEN(SDValue Op, SelectionDAG &DAG) const
Definition AMDGPUISelLowering.cpp:2556

llvm::AMDGPUTargetLowering::isFPImmLegal
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
Definition AMDGPUISelLowering.cpp:823

llvm::AMDGPUTargetLowering::numBitsUnsigned
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
Definition AMDGPUISelLowering.cpp:52

llvm::AMDGPUTargetLowering::lowerFEXPUnsafe
SDValue lowerFEXPUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
Definition AMDGPUISelLowering.cpp:3129

llvm::AMDGPUTargetLowering::LowerFTRUNC
SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const
Definition AMDGPUISelLowering.cpp:2510

llvm::AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
Definition AMDGPUISelLowering.cpp:1426

llvm::AMDGPUTargetLowering::allowApproxFunc
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
Definition AMDGPUISelLowering.cpp:2687

llvm::AMDGPUTargetLowering::ShouldShrinkFPConstant
bool ShouldShrinkFPConstant(EVT VT) const override
If true, then instruction selection should seek to shrink the FP constant of the specified type to a ...
Definition AMDGPUISelLowering.cpp:829

llvm::AMDGPUTargetLowering::LowerReturn
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
Definition AMDGPUISelLowering.cpp:1329

llvm::AMDGPUTargetLowering::performStoreCombine
SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const
Definition AMDGPUISelLowering.cpp:4212

llvm::AMDGPUTargetLowering::ReplaceNodeResults
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Definition AMDGPUISelLowering.cpp:1489

llvm::AMDGPUTargetLowering::performRcpCombine
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
Definition AMDGPUISelLowering.cpp:5560

llvm::AMDGPUTargetLowering::getLoHalf64
SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const
Definition AMDGPUISelLowering.cpp:1806

llvm::AMDGPUTargetLowering::lowerCTLZResults
SDValue lowerCTLZResults(SDValue Op, SelectionDAG &DAG) const
Definition AMDGPUISelLowering.cpp:3360

llvm::AMDGPUTargetLowering::LowerFP_TO_INT_SAT
SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const
Definition AMDGPUISelLowering.cpp:3930

llvm::AMDGPUTargetLowering::performFAbsCombine
SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const
Definition AMDGPUISelLowering.cpp:5535

llvm::AMDGPUTargetLowering::LowerFP_TO_INT64
SDValue LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG, bool Signed) const
Definition AMDGPUISelLowering.cpp:3706

llvm::AMDGPUTargetLowering::shouldFoldFNegIntoSrc
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
Definition AMDGPUISelLowering.cpp:5252

llvm::AMDGPUTargetLowering::isNarrowingProfitable
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
Definition AMDGPUISelLowering.cpp:1041

llvm::AMDGPUTargetLowering::LowerFRINT
SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const
Definition AMDGPUISelLowering.cpp:2593

llvm::AMDGPUTargetLowering::performIntrinsicWOChainCombine
SDValue performIntrinsicWOChainCombine(SDNode *N, DAGCombinerInfo &DCI) const
Definition AMDGPUISelLowering.cpp:4292

llvm::AMDGPUTargetLowering::LowerUDIVREM
SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const
Definition AMDGPUISelLowering.cpp:2355

llvm::AMDGPUTargetLowering::performMulLoHiCombine
SDValue performMulLoHiCombine(SDNode *N, DAGCombinerInfo &DCI) const
Definition AMDGPUISelLowering.cpp:4868

llvm::AMDGPUTargetLowering::PerformDAGCombine
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
Definition AMDGPUISelLowering.cpp:5596

llvm::AMDGPUTargetLowering::LowerUDIVREM64
void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results) const
Definition AMDGPUISelLowering.cpp:2141

llvm::AMDGPUTargetLowering::WidenOrSplitVectorLoad
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
Definition AMDGPUISelLowering.cpp:1936

llvm::AMDGPUTargetLowering::getSplitDestVTs
std::pair< EVT, EVT > getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const
Split a vector type into two parts.
Definition AMDGPUISelLowering.cpp:1826

llvm::AMDGPUTargetLowering::getHiHalf64
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
Definition AMDGPUISelLowering.cpp:1814

llvm::AMDGPUTargetLowering::combineFMinMaxLegacyImpl
SDValue combineFMinMaxLegacyImpl(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
Definition AMDGPUISelLowering.cpp:1684

llvm::AMDGPUTargetLowering::getVectorIdxWidth
unsigned getVectorIdxWidth(const DataLayout &) const override
Returns the type to be used for the index operand of vector operations.
Definition AMDGPUISelLowering.cpp:813

llvm::APFloatBase::IEEEsingle
static const fltSemantics & IEEEsingle()
Definition APFloat.h:296

llvm::APFloatBase::IEEEdouble
static const fltSemantics & IEEEdouble()
Definition APFloat.h:297

llvm::APFloatBase::rmNearestTiesToEven
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:344

llvm::APFloatBase::IEEEhalf
static const fltSemantics & IEEEhalf()
Definition APFloat.h:294

llvm::APFloat
Definition APFloat.h:1029

llvm::APFloat::bitwiseIsEqual
bool bitwiseIsEqual(const APFloat &RHS) const
Definition APFloat.h:1503

llvm::APFloat::add
opStatus add(const APFloat &RHS, roundingMode RM)
Definition APFloat.h:1240

llvm::APFloat::getSemantics
const fltSemantics & getSemantics() const
Definition APFloat.h:1546

llvm::APFloat::multiply
opStatus multiply(const APFloat &RHS, roundingMode RM)
Definition APFloat.h:1258

llvm::APFloat::getSmallestNormalized
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition APFloat.h:1217

llvm::APFloat::bitcastToAPInt
APInt bitcastToAPInt() const
Definition APFloat.h:1430

llvm::APFloat::getInf
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1157

llvm::APInt
Class for arbitrary precision integers.
Definition APInt.h:78

llvm::APInt::getZExtValue
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1563

llvm::APInt::setHighBits
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1414

llvm::APInt::getMaxValue
static APInt getMaxValue(unsigned numBits)
Gets maximum unsigned value of APInt for specific bit width.
Definition APInt.h:207

llvm::APInt::ugt
bool ugt(const APInt &RHS) const
Unsigned greater than comparison.
Definition APInt.h:1189

llvm::APInt::getBitsSet
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:259

llvm::APInt::getSignedMaxValue
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition APInt.h:210

llvm::APInt::getSignedMinValue
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition APInt.h:220

llvm::APInt::ule
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition APInt.h:1157

llvm::APInt::getLowBitsSet
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307

llvm::APInt::getHighBitsSet
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297

llvm::APInt::setLowBits
void setLowBits(unsigned loBits)
Set the bottom loBits bits.
Definition APInt.h:1411

llvm::Argument
This class represents an incoming formal argument to a Function.
Definition Argument.h:32

llvm::BlockAddressSDNode
Definition SelectionDAGNodes.h:2506

llvm::BlockAddressSDNode::getOffset
int64_t getOffset() const
Definition SelectionDAGNodes.h:2520

llvm::BlockAddressSDNode::getTargetFlags
unsigned getTargetFlags() const
Definition SelectionDAGNodes.h:2521

llvm::BlockAddressSDNode::getBlockAddress
const BlockAddress * getBlockAddress() const
Definition SelectionDAGNodes.h:2519

llvm::CCState
CCState - This class holds information needed while lowering arguments and return values.
Definition CallingConvLower.h:171

llvm::CCValAssign::Full
@ Full
Definition CallingConvLower.h:37

llvm::CCValAssign::getCustomMem
static CCValAssign getCustomMem(unsigned ValNo, MVT ValVT, int64_t Offset, MVT LocVT, LocInfo HTP)
Definition CallingConvLower.h:104

llvm::ConstantFPSDNode
Definition SelectionDAGNodes.h:1870

llvm::ConstantFPSDNode::getValueAPF
const APFloat & getValueAPF() const
Definition SelectionDAGNodes.h:1883

llvm::ConstantFPSDNode::isNegative
bool isNegative() const
Return true if the value is negative.
Definition SelectionDAGNodes.h:1902

llvm::ConstantSDNode
Definition SelectionDAGNodes.h:1815

llvm::ConstantSDNode::getZExtValue
uint64_t getZExtValue() const
Definition SelectionDAGNodes.h:1832

llvm::ConstantSDNode::getAPIntValue
const APInt & getAPIntValue() const
Definition SelectionDAGNodes.h:1831

llvm::DataLayout
A parsed version of the target data layout string, and methods for querying it.
Definition DataLayout.h:64

llvm::DiagnosticInfoUnsupported
Diagnostic information for unsupported feature in backend.
Definition DiagnosticInfo.h:1103

llvm::ExternalSymbolSDNode
Definition SelectionDAGNodes.h:2548

llvm::FrameIndexSDNode
Definition SelectionDAGNodes.h:2104

llvm::Function
Definition Function.h:65

llvm::Function::getDataLayout
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:362

llvm::Function::args
iterator_range< arg_iterator > args()
Definition Function.h:892

llvm::Function::getCallingConv
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this functio...
Definition Function.h:272

llvm::Function::getContext
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358

llvm::GCNSubtarget
Definition GCNSubtarget.h:45

llvm::GISelValueTracking
Definition GISelValueTracking.h:34

llvm::GlobalAddressSDNode
Definition SelectionDAGNodes.h:2060

llvm::GlobalValue
Definition GlobalValue.h:49

llvm::LLVMContext
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68

llvm::LLVMContext::diagnose
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
Definition LLVMContext.cpp:249

llvm::LoadSDNode
This class is used to represent ISD::LOAD nodes.
Definition SelectionDAGNodes.h:2656

llvm::LoadSDNode::getBasePtr
const SDValue & getBasePtr() const
Definition SelectionDAGNodes.h:2675

llvm::MVT
Machine Value Type.
Definition MachineValueType.h:36

llvm::MVT::SimpleValueType
SimpleValueType
Definition MachineValueType.h:38

llvm::MVT::integer_fixedlen_vector_valuetypes
static auto integer_fixedlen_vector_valuetypes()
Definition MachineValueType.h:581

llvm::MVT::getScalarSizeInBits
uint64_t getScalarSizeInBits() const
Definition MachineValueType.h:374

llvm::MVT::getVectorNumElements
unsigned getVectorNumElements() const
Definition MachineValueType.h:322

llvm::MVT::isVector
bool isVector() const
Return true if this is a vector value type.
Definition MachineValueType.h:106

llvm::MVT::isInteger
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition MachineValueType.h:90

llvm::MVT::integer_valuetypes
static auto integer_valuetypes()
Definition MachineValueType.h:552

llvm::MVT::isFloatingPoint
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition MachineValueType.h:80

llvm::MVT::getScalarType
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
Definition MachineValueType.h:287

llvm::MachineFrameInfo
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
Definition MachineFrameInfo.h:112

llvm::MachineFrameInfo::CreateFixedObject
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
Definition MachineFrameInfo.cpp:83

llvm::MachineFrameInfo::getObjectSize
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
Definition MachineFrameInfo.h:486

llvm::MachineFrameInfo::getObjectOffset
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
Definition MachineFrameInfo.h:553

llvm::MachineFrameInfo::getObjectIndexBegin
int getObjectIndexBegin() const
Return the minimum frame object index.
Definition MachineFrameInfo.h:423

llvm::MachineFunction
Definition MachineFunction.h:294

llvm::MachineFunction::getFrameInfo
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
Definition MachineFunction.h:804

llvm::MachineFunction::getDenormalMode
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Definition MachineFunction.cpp:331

llvm::MachineFunction::getRegInfo
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Definition MachineFunction.h:798

llvm::MachineFunction::getFunction
Function & getFunction()
Return the LLVM function that this machine code represents.
Definition MachineFunction.h:749

llvm::MachineFunction::getInfo
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Definition MachineFunction.h:884

llvm::MachineInstr
Representation of each machine instruction.
Definition MachineInstr.h:73

llvm::MachineMemOperand
A description of a memory reference used in the backend.
Definition MachineMemOperand.h:130

llvm::MachineMemOperand::MODereferenceable
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
Definition MachineMemOperand.h:145

llvm::MachineMemOperand::MOInvariant
@ MOInvariant
The memory access always returns the same value (or traps).
Definition MachineMemOperand.h:147

llvm::MachineMemOperand::getFlags
Flags getFlags() const
Return the raw flags of the source value (see Flags).
Definition MachineMemOperand.h:227

llvm::MachineRegisterInfo
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Definition MachineRegisterInfo.h:53

llvm::MachineRegisterInfo::hasOneNonDBGUse
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
Definition MachineRegisterInfo.cpp:425

llvm::MachineRegisterInfo::getVRegDef
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
Definition MachineRegisterInfo.cpp:404

llvm::MachineRegisterInfo::createVirtualRegister
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
Definition MachineRegisterInfo.cpp:154

llvm::MachineRegisterInfo::isLiveIn
LLVM_ABI bool isLiveIn(Register Reg) const
Definition MachineRegisterInfo.cpp:458

llvm::MachineRegisterInfo::getLiveInVirtReg
LLVM_ABI Register getLiveInVirtReg(MCRegister PReg) const
getLiveInVirtReg - If PReg is a live-in physical register, return the corresponding live-in virtual r...
Definition MachineRegisterInfo.cpp:476

llvm::MachineRegisterInfo::addLiveIn
void addLiveIn(MCRegister Reg, Register vreg=Register())
addLiveIn - Add the specified register as a live-in.
Definition MachineRegisterInfo.h:1003

llvm::MemSDNode
This is an abstract virtual class for memory operations.
Definition SelectionDAGNodes.h:1418

llvm::MemSDNode::getAddressSpace
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Definition SelectionDAGNodes.h:1545

llvm::MemSDNode::getAlign
Align getAlign() const
Definition SelectionDAGNodes.h:1443

llvm::MemSDNode::isSimple
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
Definition SelectionDAGNodes.h:1506

llvm::MemSDNode::getMemOperand
MachineMemOperand * getMemOperand() const
Return the unique MachineMemOperand object describing the memory reference performed by operation.
Definition SelectionDAGNodes.h:1514

llvm::MemSDNode::getChain
const SDValue & getChain() const
Definition SelectionDAGNodes.h:1581

llvm::MemSDNode::isInvariant
bool isInvariant() const
Definition SelectionDAGNodes.h:1467

llvm::MemSDNode::getMemoryVT
EVT getMemoryVT() const
Return the type of the in-memory value.
Definition SelectionDAGNodes.h:1509

llvm::Register
Wrapper class representing virtual and physical registers.
Definition Register.h:20

llvm::SDLoc
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Definition SelectionDAGNodes.h:1246

llvm::SDLoc::getDebugLoc
const DebugLoc & getDebugLoc() const
Definition SelectionDAGNodes.h:1262

llvm::SDNode
Represents one node in the SelectionDAG.
Definition SelectionDAGNodes.h:511

llvm::SDNode::ops
ArrayRef< SDUse > ops() const
Definition SelectionDAGNodes.h:1065

llvm::SDNode::getOpcode
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
Definition SelectionDAGNodes.h:706

llvm::SDNode::hasOneUse
bool hasOneUse() const
Return true if there is exactly one use of this node.
Definition SelectionDAGNodes.h:778

llvm::SDNode::getFlags
SDNodeFlags getFlags() const
Definition SelectionDAGNodes.h:1107

llvm::SDNode::getVTList
SDVTList getVTList() const
Definition SelectionDAGNodes.h:1084

llvm::SDNode::getOperand
const SDValue & getOperand(unsigned Num) const
Definition SelectionDAGNodes.h:1056

llvm::SDNode::getConstantOperandVal
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
Definition SelectionDAGNodes.h:1854

llvm::SDNode::users
iterator_range< user_iterator > users()
Definition SelectionDAGNodes.h:918

llvm::SDUse
Represents a use of a SDNode.
Definition SelectionDAGNodes.h:280

llvm::SDValue
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
Definition SelectionDAGNodes.h:147

llvm::SDValue::getNode
SDNode * getNode() const
Get the SDNode which holds the desired result.
Definition SelectionDAGNodes.h:161

llvm::SDValue::hasOneUse
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
Definition SelectionDAGNodes.h:1323

llvm::SDValue::getValue
SDValue getValue(unsigned R) const
Definition SelectionDAGNodes.h:181

llvm::SDValue::getValueType
EVT getValueType() const
Return the ValueType of the referenced return value.
Definition SelectionDAGNodes.h:1281

llvm::SDValue::getValueSizeInBits
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
Definition SelectionDAGNodes.h:201

llvm::SDValue::getOperand
const SDValue & getOperand(unsigned i) const
Definition SelectionDAGNodes.h:1289

llvm::SDValue::getOpcode
unsigned getOpcode() const
Definition SelectionDAGNodes.h:1277

llvm::SDValue::getNumOperands
unsigned getNumOperands() const
Definition SelectionDAGNodes.h:1285

llvm::SIMachineFunctionInfo
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
Definition SIMachineFunctionInfo.h:418

llvm::SIMachineFunctionInfo::getMode
SIModeRegisterDefaults getMode() const
Definition SIMachineFunctionInfo.h:677

llvm::SelectionDAG
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition SelectionDAG.h:231

llvm::SelectionDAG::getExtLoad
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Definition SelectionDAG.cpp:10663

llvm::SelectionDAG::getExtOrTrunc
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
Definition SelectionDAG.h:1053

llvm::SelectionDAG::ComputeMaxSignificantBits
LLVM_ABI unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
Definition SelectionDAG.cpp:5603

llvm::SelectionDAG::getRoot
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
Definition SelectionDAG.h:601

llvm::SelectionDAG::getSubtarget
const TargetSubtargetInfo & getSubtarget() const
Definition SelectionDAG.h:516

llvm::SelectionDAG::getMergeValues
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
Definition SelectionDAG.cpp:10381

llvm::SelectionDAG::getVTList
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
Definition SelectionDAG.cpp:12073

llvm::SelectionDAG::getShiftAmountConstant
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
Definition SelectionDAG.cpp:1872

llvm::SelectionDAG::getAllOnesConstant
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
Definition SelectionDAG.cpp:1861

llvm::SelectionDAG::ExtractVectorElements
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
Definition SelectionDAG.cpp:14499

llvm::SelectionDAG::getFreeze
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
Definition SelectionDAG.cpp:2568

llvm::SelectionDAG::getConstantFP
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
Definition SelectionDAG.cpp:1934

llvm::SelectionDAG::getRegister
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
Definition SelectionDAG.cpp:2434

llvm::SelectionDAG::getLoad
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
Definition SelectionDAG.cpp:10646

llvm::SelectionDAG::getSetCC
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false, SDNodeFlags Flags={})
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
Definition SelectionDAG.h:1382

llvm::SelectionDAG::getNOT
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
Definition SelectionDAG.cpp:1681

llvm::SelectionDAG::getTargetLoweringInfo
const TargetLowering & getTargetLoweringInfo() const
Definition SelectionDAG.h:520

llvm::SelectionDAG::getCALLSEQ_END
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
Definition SelectionDAG.h:1184

llvm::SelectionDAG::getBuildVector
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition SelectionDAG.h:896

llvm::SelectionDAG::getBitcast
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
Definition SelectionDAG.cpp:2539

llvm::SelectionDAG::getCopyFromReg
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
Definition SelectionDAG.h:867

llvm::SelectionDAG::getSelect
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
Definition SelectionDAG.h:1412

llvm::SelectionDAG::getZeroExtendInReg
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
Definition SelectionDAG.cpp:1621

llvm::SelectionDAG::getDataLayout
const DataLayout & getDataLayout() const
Definition SelectionDAG.h:514

llvm::SelectionDAG::getConstant
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
Definition SelectionDAG.cpp:1725

llvm::SelectionDAG::getTruncStore
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Definition SelectionDAG.cpp:10772

llvm::SelectionDAG::ReplaceAllUsesWith
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
Definition SelectionDAG.cpp:13031

llvm::SelectionDAG::getStore
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
Definition SelectionDAG.cpp:10696

llvm::SelectionDAG::getSignedConstant
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Definition SelectionDAG.cpp:1855

llvm::SelectionDAG::getCALLSEQ_START
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
Definition SelectionDAG.h:1172

llvm::SelectionDAG::isConstantValueOfAnyType
bool isConstantValueOfAnyType(SDValue N) const
Definition SelectionDAG.h:2666

llvm::SelectionDAG::getSelectCC
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
Definition SelectionDAG.h:1422

llvm::SelectionDAG::getSExtOrTrunc
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
Definition SelectionDAG.cpp:1555

llvm::SelectionDAG::isGuaranteedNotToBeUndefOrPoison
LLVM_ABI bool isGuaranteedNotToBeUndefOrPoison(SDValue Op, UndefPoisonKind Kind=UndefPoisonKind::UndefOrPoison, unsigned Depth=0) const
Return true if this function can prove that Op is never poison and, Kind can be used to track poison ...
Definition SelectionDAG.cpp:5616

llvm::SelectionDAG::getIntPtrConstant
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
Definition SelectionDAG.cpp:1867

llvm::SelectionDAG::getValueType
LLVM_ABI SDValue getValueType(EVT)
Definition SelectionDAG.cpp:2114

llvm::SelectionDAG::getNode
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
Definition SelectionDAG.cpp:11704

llvm::SelectionDAG::isKnownNeverNaN
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
Definition SelectionDAG.cpp:6331

llvm::SelectionDAG::getTargetConstant
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition SelectionDAG.h:730

llvm::SelectionDAG::ComputeNumSignBits
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
Definition SelectionDAG.cpp:4903

llvm::SelectionDAG::getTargetBlockAddress
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
Definition SelectionDAG.h:836

llvm::SelectionDAG::getVectorIdxConstant
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
Definition SelectionDAG.cpp:1885

llvm::SelectionDAG::ReplaceAllUsesOfValueWith
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
Definition SelectionDAG.cpp:13193

llvm::SelectionDAG::getMachineFunction
MachineFunction & getMachineFunction() const
Definition SelectionDAG.h:509

llvm::SelectionDAG::getPOISON
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
Definition SelectionDAG.h:1212

llvm::SelectionDAG::getFrameIndex
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
Definition SelectionDAG.cpp:2001

llvm::SelectionDAG::computeKnownBits
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
Definition SelectionDAG.cpp:3350

llvm::SelectionDAG::getZExtOrTrunc
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
Definition SelectionDAG.cpp:1561

llvm::SelectionDAG::MaskedValueIsZero
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
Definition SelectionDAG.cpp:2916

llvm::SelectionDAG::getObjectPtrOffset
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
Definition SelectionDAG.h:1157

llvm::SelectionDAG::getContext
LLVMContext * getContext() const
Definition SelectionDAG.h:534

llvm::SelectionDAG::setRoot
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
Definition SelectionDAG.h:610

llvm::SelectionDAG::UpdateNodeOperands
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
Definition SelectionDAG.cpp:12166

llvm::SelectionDAG::getEntryNode
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition SelectionDAG.h:604

llvm::SelectionDAG::SplitScalar
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
Definition SelectionDAG.cpp:14398

llvm::SmallVectorImpl
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition SmallVector.h:581

llvm::SmallVectorTemplateBase::push_back
void push_back(const T &Elt)
Definition SmallVector.h:423

llvm::SmallVectorTemplateCommon::size
size_t size() const
Definition SmallVector.h:83

llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition SmallVector.h:1225

llvm::StoreSDNode
This class is used to represent ISD::STORE nodes.
Definition SelectionDAGNodes.h:2684

llvm::StoreSDNode::getBasePtr
const SDValue & getBasePtr() const
Definition SelectionDAGNodes.h:2703

llvm::StoreSDNode::getValue
const SDValue & getValue() const
Definition SelectionDAGNodes.h:2702

llvm::StringRef
Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition StringRef.h:56

llvm::TargetLoweringBase::setOperationAction
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
Definition TargetLowering.h:2705

llvm::TargetLoweringBase::setMaxDivRemBitWidthSupported
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
Definition TargetLowering.h:2934

llvm::TargetLoweringBase::Enabled
@ Enabled
Definition TargetLowering.h:591

llvm::TargetLoweringBase::PredictableSelectIsExpensive
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
Definition TargetLowering.h:4024

llvm::TargetLoweringBase::shouldReduceLoadWidth
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT, std::optional< unsigned > ByteOffset=std::nullopt) const
Return true if it is profitable to reduce a load to a smaller type.
Definition TargetLowering.h:1918

llvm::TargetLoweringBase::Custom
@ Custom
Definition TargetLowering.h:208

llvm::TargetLoweringBase::Expand
@ Expand
Definition TargetLowering.h:206

llvm::TargetLoweringBase::Promote
@ Promote
Definition TargetLowering.h:205

llvm::TargetLoweringBase::MaxStoresPerMemcpyOptSize
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
Definition TargetLowering.h:3985

llvm::TargetLoweringBase::getTargetMachine
const TargetMachine & getTargetMachine() const
Definition TargetLowering.h:374

llvm::TargetLoweringBase::getNumRegistersForCallingConv
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
Definition TargetLowering.h:1895

llvm::TargetLoweringBase::MaxGluedStoresPerMemcpy
unsigned MaxGluedStoresPerMemcpy
Specify max number of store instructions to glue in inlined memcpy.
Definition TargetLowering.h:3991

llvm::TargetLoweringBase::getRegisterTypeForCallingConv
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
Definition TargetLowering.h:1887

llvm::TargetLoweringBase::addBypassSlowDiv
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
Definition TargetLowering.h:2681

llvm::TargetLoweringBase::setMaxLargeFPConvertBitWidthSupported
void setMaxLargeFPConvertBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum fp to/from int conversion the backend supports.
Definition TargetLowering.h:2940

llvm::TargetLoweringBase::setMaxAtomicSizeInBitsSupported
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
Definition TargetLowering.h:2928

llvm::TargetLoweringBase::SelectSupportKind
SelectSupportKind
Enum that describes what type of support for selects the target has.
Definition TargetLowering.h:244

llvm::TargetLoweringBase::allowsMisalignedMemoryAccesses
virtual bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *=nullptr) const
Determine if the target supports unaligned memory accesses.
Definition TargetLowering.h:2016

llvm::TargetLoweringBase::MaxStoresPerMemsetOptSize
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
Definition TargetLowering.h:3970

llvm::TargetLoweringBase::getShiftAmountTy
EVT getShiftAmountTy(EVT LHSTy, const DataLayout &DL) const
Returns the type for the shift amount of a shift opcode.
Definition TargetLoweringBase.cpp:1332

llvm::TargetLoweringBase::MaxStoresPerMemmove
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
Definition TargetLowering.h:4018

llvm::TargetLoweringBase::getSetCCResultType
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const
Return the ValueType of the result of SETCC operations.
Definition TargetLoweringBase.cpp:1967

llvm::TargetLoweringBase::MaxStoresPerMemmoveOptSize
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
Definition TargetLowering.h:4020

llvm::TargetLoweringBase::isTypeLegal
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
Definition TargetLowering.h:1105

llvm::TargetLoweringBase::setSupportsUnalignedAtomics
void setSupportsUnalignedAtomics(bool UnalignedSupported)
Sets whether unaligned atomic operations are supported.
Definition TargetLowering.h:2950

llvm::TargetLoweringBase::isOperationLegal
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
Definition TargetLowering.h:1483

llvm::TargetLoweringBase::MaxStoresPerMemset
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
Definition TargetLowering.h:3968

llvm::TargetLoweringBase::setTruncStoreAction
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
Definition TargetLowering.h:2768

llvm::TargetLoweringBase::setMinCmpXchgSizeInBits
void setMinCmpXchgSizeInBits(unsigned SizeInBits)
Sets the minimum cmpxchg or ll/sc size supported by the backend.
Definition TargetLowering.h:2945

llvm::TargetLoweringBase::AddPromotedToType
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
Definition TargetLowering.h:2872

llvm::TargetLoweringBase::setTargetDAGCombine
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
Definition TargetLowering.h:2893

llvm::TargetLoweringBase::setLoadExtAction
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
Definition TargetLowering.h:2722

llvm::TargetLoweringBase::GatherAllAliasesMaxDepth
unsigned GatherAllAliasesMaxDepth
Depth that GatherAllAliases should continue looking for chain dependencies when trying to find a more...
Definition TargetLowering.h:3956

llvm::TargetLoweringBase::NegatibleCost
NegatibleCost
Enum that specifies when a float negation is beneficial.
Definition TargetLowering.h:286

llvm::TargetLoweringBase::NegatibleCost::Cheaper
@ Cheaper
Definition TargetLowering.h:287

llvm::TargetLoweringBase::NegatibleCost::Expensive
@ Expensive
Definition TargetLowering.h:289

llvm::TargetLoweringBase::NegatibleCost::Neutral
@ Neutral
Definition TargetLowering.h:288

llvm::TargetLoweringBase::allowsMemoryAccessForAlignment
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
Definition TargetLoweringBase.cpp:2148

llvm::TargetLoweringBase::MaxStoresPerMemcpy
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
Definition TargetLowering.h:3983

llvm::TargetLoweringBase::setSchedulingPreference
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
Definition TargetLowering.h:2647

llvm::TargetLoweringBase::setJumpIsExpensive
void setJumpIsExpensive(bool isExpensive=true)
Tells the code generator not to expand logic operations on comparison predicates into separate sequen...
Definition TargetLoweringBase.cpp:1383

llvm::TargetLowering
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
Definition TargetLowering.h:4047

llvm::TargetLowering::scalarizeVectorStore
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
Definition TargetLowering.cpp:11236

llvm::TargetLowering::SimplifyMultipleUseDemandedBits
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
Definition TargetLowering.cpp:708

llvm::TargetLowering::expandUnalignedStore
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
Definition TargetLowering.cpp:11459

llvm::TargetLowering::ShrinkDemandedConstant
bool ShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const
Check to see if the specified operand of the specified instruction is a constant integer.
Definition TargetLowering.cpp:541

llvm::TargetLowering::expandUnalignedLoad
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
Definition TargetLowering.cpp:11312

llvm::TargetLowering::getNegatedExpression
virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, NegatibleCost &Cost, unsigned Depth=0) const
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
Definition TargetLowering.cpp:7580

llvm::TargetLowering::scalarizeVectorLoad
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
Definition TargetLowering.cpp:11148

llvm::TargetLowering::SimplifyDemandedBits
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
Definition TargetLowering.cpp:1162

llvm::TargetLowering::TargetLowering
TargetLowering(const TargetLowering &)=delete

llvm::TargetLowering::canCreateUndefOrPoisonForTargetNode
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, UndefPoisonKind Kind, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
Definition TargetLowering.cpp:4065

llvm::TargetMachine
Primary interface to the complete machine description for the target machine.
Definition TargetMachine.h:83

llvm::TargetRegisterClass
Definition TargetRegisterInfo.h:45

llvm::TargetSubtargetInfo
TargetSubtargetInfo - Generic base class for all target subtargets.
Definition TargetSubtargetInfo.h:66

llvm::Triple::r600
@ r600
Definition Triple.h:76

llvm::TypeSize::getFixed
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343

llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46

llvm::Type::getScalarSizeInBits
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:232

llvm::Value
LLVM Value Representation.
Definition Value.h:75

llvm::Value::getName
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318

llvm::cl::opt
Definition CommandLine.h:1454

uint32_t

uint64_t

unsigned

Analysis.h

llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition ErrorHandling.h:164

TargetMachine.h

llvm::AMDGPUAS::CONSTANT_ADDRESS_32BIT
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
Definition AMDGPUAddrSpace.h:40

llvm::AMDGPUAS::REGION_ADDRESS
@ REGION_ADDRESS
Address space for region memory. (GDS)
Definition AMDGPUAddrSpace.h:34

llvm::AMDGPUAS::LOCAL_ADDRESS
@ LOCAL_ADDRESS
Address space for local memory.
Definition AMDGPUAddrSpace.h:36

llvm::AMDGPUAS::CONSTANT_ADDRESS
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
Definition AMDGPUAddrSpace.h:37

llvm::AMDGPUAS::GLOBAL_ADDRESS
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
Definition AMDGPUAddrSpace.h:33

llvm::AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET
@ QUEUE_PTR_OFFSET
Definition SIDefines.h:1096

llvm::AMDGPU::ImplicitArg::SHARED_BASE_OFFSET
@ SHARED_BASE_OFFSET
Definition SIDefines.h:1095

llvm::AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET
@ PRIVATE_BASE_OFFSET
Definition SIDefines.h:1094

llvm::AMDGPU::isIntrinsicAlwaysUniform
bool isIntrinsicAlwaysUniform(unsigned IntrID)
Definition AMDGPUBaseInfo.cpp:3517

llvm::AMDGPU::isNamedBarrier
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
Definition AMDGPUMemoryUtils.cpp:78

llvm::AMDGPU::isUniformMMO
bool isUniformMMO(const MachineMemOperand *MMO)
Definition AMDGPUInstrInfo.cpp:30

llvm::CallingConv::ID
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24

llvm::CallingConv::AMDGPU_CS
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition CallingConv.h:197

llvm::CallingConv::AMDGPU_VS
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
Definition CallingConv.h:188

llvm::CallingConv::AMDGPU_KERNEL
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition CallingConv.h:200

llvm::CallingConv::AMDGPU_Gfx
@ AMDGPU_Gfx
Used for AMD graphics targets.
Definition CallingConv.h:232

llvm::CallingConv::AMDGPU_CS_ChainPreserve
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition CallingConv.h:249

llvm::CallingConv::AMDGPU_HS
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
Definition CallingConv.h:206

llvm::CallingConv::AMDGPU_GS
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
Definition CallingConv.h:191

llvm::CallingConv::AMDGPU_CS_Chain
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition CallingConv.h:245

llvm::CallingConv::AMDGPU_PS
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition CallingConv.h:194

llvm::CallingConv::Cold
@ Cold
Attempts to make code in the caller as efficient as possible under the assumption that the call is no...
Definition CallingConv.h:47

llvm::CallingConv::SPIR_KERNEL
@ SPIR_KERNEL
Used for SPIR kernel functions.
Definition CallingConv.h:144

llvm::CallingConv::Fast
@ Fast
Attempts to make calls as fast as possible (e.g. by passing things in registers).
Definition CallingConv.h:41

llvm::CallingConv::AMDGPU_Gfx_WholeWave
@ AMDGPU_Gfx_WholeWave
Definition CallingConv.h:288

llvm::CallingConv::AMDGPU_ES
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
Definition CallingConv.h:218

llvm::CallingConv::AMDGPU_LS
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
Definition CallingConv.h:213

llvm::CallingConv::C
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34

llvm::IRSimilarity::Legal
@ Legal
Definition IRSimilarityIdentifier.h:77

llvm::ISD::NodeType
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41

llvm::ISD::SETCC
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:823

llvm::ISD::STORE
@ STORE
Definition ISDOpcodes.h:1170

llvm::ISD::LRINT
@ LRINT
Definition ISDOpcodes.h:1070

llvm::ISD::FLOG10
@ FLOG10
Definition ISDOpcodes.h:1057

llvm::ISD::SREM
@ SREM
Definition ISDOpcodes.h:269

llvm::ISD::SMUL_LOHI
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:275

llvm::ISD::UDIV
@ UDIV
Definition ISDOpcodes.h:268

llvm::ISD::INSERT_SUBVECTOR
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:600

llvm::ISD::UINT_TO_FP
@ UINT_TO_FP
Definition ISDOpcodes.h:885

llvm::ISD::UMIN
@ UMIN
Definition ISDOpcodes.h:729

llvm::ISD::BSWAP
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:783

llvm::ISD::ROTR
@ ROTR
Definition ISDOpcodes.h:773

llvm::ISD::FPOW
@ FPOW
Definition ISDOpcodes.h:1043

llvm::ISD::ConstantFP
@ ConstantFP
Definition ISDOpcodes.h:87

llvm::ISD::ATOMIC_STORE
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
Definition ISDOpcodes.h:1379

llvm::ISD::FTRUNC
@ FTRUNC
Definition ISDOpcodes.h:1062

llvm::ISD::SDIV
@ SDIV
Definition ISDOpcodes.h:267

llvm::ISD::ADDC
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:294

llvm::ISD::FMAD
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:522

llvm::ISD::FMAXNUM_IEEE
@ FMAXNUM_IEEE
Definition ISDOpcodes.h:1104

llvm::ISD::LLRINT
@ LLRINT
Definition ISDOpcodes.h:1071

llvm::ISD::ADD
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264

llvm::ISD::LOAD
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition ISDOpcodes.h:1169

llvm::ISD::ANY_EXTEND
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:857

llvm::ISD::FSUB
@ FSUB
Definition ISDOpcodes.h:418

llvm::ISD::FMA
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:518

llvm::ISD::SUBC
@ SUBC
Definition ISDOpcodes.h:295

llvm::ISD::FABS
@ FABS
Definition ISDOpcodes.h:1031

llvm::ISD::FNEARBYINT
@ FNEARBYINT
Definition ISDOpcodes.h:1064

llvm::ISD::SINT_TO_FP
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:884

llvm::ISD::CONCAT_VECTORS
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:584

llvm::ISD::FADD
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417

llvm::ISD::FEXP10
@ FEXP10
Definition ISDOpcodes.h:1060

llvm::ISD::FP_TO_FP16
@ FP_TO_FP16
Definition ISDOpcodes.h:1008

llvm::ISD::UDIVREM
@ UDIVREM
Definition ISDOpcodes.h:281

llvm::ISD::SDIVREM
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:280

llvm::ISD::SRL
@ SRL
Definition ISDOpcodes.h:771

llvm::ISD::FMAXIMUM
@ FMAXIMUM
Definition ISDOpcodes.h:1110

llvm::ISD::FP16_TO_FP
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition ISDOpcodes.h:1007

llvm::ISD::BITCAST
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition ISDOpcodes.h:997

llvm::ISD::BUILD_PAIR
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:254

llvm::ISD::FFLOOR
@ FFLOOR
Definition ISDOpcodes.h:1067

llvm::ISD::FLDEXP
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition ISDOpcodes.h:1046

llvm::ISD::SRA
@ SRA
Definition ISDOpcodes.h:770

llvm::ISD::CTLZ_ZERO_POISON
@ CTLZ_ZERO_POISON
Definition ISDOpcodes.h:792

llvm::ISD::LLROUND
@ LLROUND
Definition ISDOpcodes.h:1069

llvm::ISD::SIGN_EXTEND
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:848

llvm::ISD::FLOG2
@ FLOG2
Definition ISDOpcodes.h:1056

llvm::ISD::UADDSAT
@ UADDSAT
Definition ISDOpcodes.h:366

llvm::ISD::FMAXNUM
@ FMAXNUM
Definition ISDOpcodes.h:1088

llvm::ISD::FRINT
@ FRINT
Definition ISDOpcodes.h:1063

llvm::ISD::FNEG
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition ISDOpcodes.h:1030

llvm::ISD::CTTZ
@ CTTZ
Definition ISDOpcodes.h:784

llvm::ISD::FP_TO_UINT
@ FP_TO_UINT
Definition ISDOpcodes.h:931

llvm::ISD::BRIND
@ BRIND
BRIND - Indirect branch.
Definition ISDOpcodes.h:1190

llvm::ISD::BR_JT
@ BR_JT
BR_JT - Jumptable branch.
Definition ISDOpcodes.h:1194

llvm::ISD::OR
@ OR
Definition ISDOpcodes.h:740

llvm::ISD::FCANONICALIZE
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:541

llvm::ISD::IS_FPCLASS
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition ISDOpcodes.h:548

llvm::ISD::SELECT
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:800

llvm::ISD::UMUL_LOHI
@ UMUL_LOHI
Definition ISDOpcodes.h:276

llvm::ISD::ATOMIC_LOAD
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition ISDOpcodes.h:1375

llvm::ISD::EXTRACT_ELEMENT
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:247

llvm::ISD::FSHR
@ FSHR
Definition ISDOpcodes.h:775

llvm::ISD::FROUND
@ FROUND
Definition ISDOpcodes.h:1065

llvm::ISD::CTLS
@ CTLS
Count leading redundant sign bits.
Definition ISDOpcodes.h:796

llvm::ISD::MULHU
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:704

llvm::ISD::STRICT_FP16_TO_FP
@ STRICT_FP16_TO_FP
Definition ISDOpcodes.h:1009

llvm::ISD::SHL
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:769

llvm::ISD::VECTOR_SHUFFLE
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:649

llvm::ISD::EXTRACT_SUBVECTOR
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:614

llvm::ISD::FMINNUM_IEEE
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
Definition ISDOpcodes.h:1103

llvm::ISD::FCOS
@ FCOS
Definition ISDOpcodes.h:1035

llvm::ISD::EntryToken
@ EntryToken
EntryToken - This is the marker used to indicate the start of a region.
Definition ISDOpcodes.h:48

llvm::ISD::XOR
@ XOR
Definition ISDOpcodes.h:741

llvm::ISD::EXTRACT_VECTOR_ELT
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:576

llvm::ISD::CopyToReg
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:224

llvm::ISD::ZERO_EXTEND
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:854

llvm::ISD::FP_TO_UINT_SAT
@ FP_TO_UINT_SAT
Definition ISDOpcodes.h:950

llvm::ISD::CTPOP
@ CTPOP
Definition ISDOpcodes.h:786

llvm::ISD::SELECT_CC
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:815

llvm::ISD::FMUL
@ FMUL
Definition ISDOpcodes.h:419

llvm::ISD::FMINNUM
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values, following IEEE-754 definition...
Definition ISDOpcodes.h:1087

llvm::ISD::SUB
@ SUB
Definition ISDOpcodes.h:265

llvm::ISD::MULHS
@ MULHS
Definition ISDOpcodes.h:705

llvm::ISD::DYNAMIC_STACKALLOC
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition ISDOpcodes.h:1179

llvm::ISD::SIGN_EXTEND_INREG
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:892

llvm::ISD::SMIN
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:727

llvm::ISD::Constant
@ Constant
Definition ISDOpcodes.h:86

llvm::ISD::FP_EXTEND
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:982

llvm::ISD::VSELECT
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:809

llvm::ISD::UADDO_CARRY
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:328

llvm::ISD::FROUNDEVEN
@ FROUNDEVEN
Definition ISDOpcodes.h:1066

llvm::ISD::INLINEASM_BR
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
Definition ISDOpcodes.h:1235

llvm::ISD::FDIV
@ FDIV
Definition ISDOpcodes.h:420

llvm::ISD::FREM
@ FREM
Definition ISDOpcodes.h:421

llvm::ISD::FMINIMUM
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition ISDOpcodes.h:1109

llvm::ISD::FP_TO_SINT
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:930

llvm::ISD::AND
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739

llvm::ISD::TRAP
@ TRAP
TRAP - Trapping instruction.
Definition ISDOpcodes.h:1346

llvm::ISD::INTRINSIC_WO_CHAIN
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:205

llvm::ISD::USUBO_CARRY
@ USUBO_CARRY
Definition ISDOpcodes.h:329

llvm::ISD::FLOG
@ FLOG
Definition ISDOpcodes.h:1055

llvm::ISD::SUBE
@ SUBE
Definition ISDOpcodes.h:305

llvm::ISD::ADDE
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:304

llvm::ISD::UREM
@ UREM
Definition ISDOpcodes.h:270

llvm::ISD::INSERT_VECTOR_ELT
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:565

llvm::ISD::TokenFactor
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53

llvm::ISD::FSIN
@ FSIN
Definition ISDOpcodes.h:1034

llvm::ISD::FEXP
@ FEXP
Definition ISDOpcodes.h:1058

llvm::ISD::FCEIL
@ FCEIL
Definition ISDOpcodes.h:1061

llvm::ISD::CTTZ_ZERO_POISON
@ CTTZ_ZERO_POISON
Bit counting operators with a poisoned result for zero inputs.
Definition ISDOpcodes.h:791

llvm::ISD::MUL
@ MUL
Definition ISDOpcodes.h:266

llvm::ISD::FFREXP
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
Definition ISDOpcodes.h:1053

llvm::ISD::FP_ROUND
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:963

llvm::ISD::LROUND
@ LROUND
Definition ISDOpcodes.h:1068

llvm::ISD::CTLZ
@ CTLZ
Definition ISDOpcodes.h:785

llvm::ISD::FMAXIMUMNUM
@ FMAXIMUMNUM
Definition ISDOpcodes.h:1115

llvm::ISD::FSQRT
@ FSQRT
Definition ISDOpcodes.h:1032

llvm::ISD::ADDRSPACECAST
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition ISDOpcodes.h:1001

llvm::ISD::INLINEASM
@ INLINEASM
INLINEASM - Represents an inline asm block.
Definition ISDOpcodes.h:1232

llvm::ISD::FP_TO_SINT_SAT
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition ISDOpcodes.h:949

llvm::ISD::TRUNCATE
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:860

llvm::ISD::ROTL
@ ROTL
Definition ISDOpcodes.h:772

llvm::ISD::AssertSext
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62

llvm::ISD::FCOPYSIGN
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:534

llvm::ISD::AssertZext
@ AssertZext
Definition ISDOpcodes.h:63

llvm::ISD::FEXP2
@ FEXP2
Definition ISDOpcodes.h:1059

llvm::ISD::SMAX
@ SMAX
Definition ISDOpcodes.h:728

llvm::ISD::UMAX
@ UMAX
Definition ISDOpcodes.h:730

llvm::ISD::FMINIMUMNUM
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is the same as FMINNUM_IEEE and FMAXNUM_IEEE besid...
Definition ISDOpcodes.h:1114

llvm::ISD::INTRINSIC_W_CHAIN
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:213

llvm::ISD::BUILD_VECTOR
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:556

llvm::ISD::isNormalStore
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
Definition SelectionDAGNodes.h:3452

llvm::ISD::CondCode
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition ISDOpcodes.h:1776

llvm::ISD::SETOEQ
@ SETOEQ
Definition ISDOpcodes.h:1779

llvm::ISD::SETUNE
@ SETUNE
Definition ISDOpcodes.h:1792

llvm::ISD::SETUEQ
@ SETUEQ
Definition ISDOpcodes.h:1787

llvm::ISD::SETTRUE2
@ SETTRUE2
Definition ISDOpcodes.h:1802

llvm::ISD::SETOLE
@ SETOLE
Definition ISDOpcodes.h:1783

llvm::ISD::SETOLT
@ SETOLT
Definition ISDOpcodes.h:1782

llvm::ISD::SETNE
@ SETNE
Definition ISDOpcodes.h:1801

llvm::ISD::SETUGT
@ SETUGT
Definition ISDOpcodes.h:1788

llvm::ISD::SETOGT
@ SETOGT
Definition ISDOpcodes.h:1780

llvm::ISD::SETULT
@ SETULT
Definition ISDOpcodes.h:1790

llvm::ISD::SETUO
@ SETUO
Definition ISDOpcodes.h:1786

llvm::ISD::SETONE
@ SETONE
Definition ISDOpcodes.h:1784

llvm::ISD::SETGT
@ SETGT
Definition ISDOpcodes.h:1797

llvm::ISD::SETFALSE2
@ SETFALSE2
Definition ISDOpcodes.h:1795

llvm::ISD::SETLT
@ SETLT
Definition ISDOpcodes.h:1799

llvm::ISD::SETO
@ SETO
Definition ISDOpcodes.h:1785

llvm::ISD::SETGE
@ SETGE
Definition ISDOpcodes.h:1798

llvm::ISD::SETTRUE
@ SETTRUE
Definition ISDOpcodes.h:1793

llvm::ISD::SETUGE
@ SETUGE
Definition ISDOpcodes.h:1789

llvm::ISD::SETLE
@ SETLE
Definition ISDOpcodes.h:1800

llvm::ISD::SETULE
@ SETULE
Definition ISDOpcodes.h:1791

llvm::ISD::SETOGE
@ SETOGE
Definition ISDOpcodes.h:1781

llvm::ISD::SETFALSE
@ SETFALSE
Definition ISDOpcodes.h:1778

llvm::ISD::SETEQ
@ SETEQ
Definition ISDOpcodes.h:1796

llvm::ISD::SETCC_INVALID
@ SETCC_INVALID
Definition ISDOpcodes.h:1804

llvm::ISD::LoadExtType
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition ISDOpcodes.h:1756

llvm::ISD::SEXTLOAD
@ SEXTLOAD
Definition ISDOpcodes.h:1756

llvm::ISD::ZEXTLOAD
@ ZEXTLOAD
Definition ISDOpcodes.h:1756

llvm::ISD::EXTLOAD
@ EXTLOAD
Definition ISDOpcodes.h:1756

llvm::ISD::isNormalLoad
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
Definition SelectionDAGNodes.h:3414

llvm::Sched::RegPressure
@ RegPressure
Definition TargetLowering.h:107

llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition CommandLine.h:444

llvm::numbers::ln2
constexpr double ln2
Definition STLForwardCompat.h:66

llvm::numbers::ln10
constexpr double ln10
Definition STLForwardCompat.h:67

llvm::numbers::log2ef
constexpr float log2ef
Definition MathExtras.h:51

llvm::numbers::log2e
constexpr double log2e
Definition STLForwardCompat.h:68

llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition FunctionInfo.h:25

llvm::Offset
@ Offset
Definition DWP.cpp:558

llvm::all_of
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738

llvm::getAlign
MaybeAlign getAlign(const CallInst &I, unsigned Index)
Definition NVVMProperties.cpp:320

llvm::Cost
InstructionCost Cost
Definition FunctionSpecialization.h:103

llvm::isNullConstant
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
Definition SelectionDAG.cpp:13625

llvm::ComputeValueVTs
LLVM_ABI void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs=nullptr, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition Analysis.cpp:119

llvm::Depth
@ Depth
Definition SIMachineScheduler.h:36

llvm::dyn_cast
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643

llvm::CCAssignFn
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
Definition CallingConvLower.h:157

llvm::isConstOrConstSplatFP
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
Definition SelectionDAG.cpp:13819

llvm::PowerOf2Ceil
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385

llvm::countr_zero
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most, stopping at the first 1.
Definition bit.h:204

llvm::countl_zero
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least, stopping at the first 1.
Definition bit.h:263

llvm::get
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
Definition PointerIntPair.h:262

llvm::Hi_32
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150

llvm::alignTo
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144

llvm::isUInt
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189

llvm::CaptureComponents::Address
@ Address
Definition ModRef.h:368

llvm::Lo_32
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155

llvm::isa
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547

llvm::errs
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
Definition raw_ostream.cpp:904

llvm::PackElem::Hi
@ Hi
Definition VECustomDAG.h:132

llvm::PackElem::Lo
@ Lo
Definition VECustomDAG.h:131

llvm::CombineLevel
CombineLevel
Definition DAGCombine.h:15

llvm::AfterLegalizeDAG
@ AfterLegalizeDAG
Definition DAGCombine.h:19

llvm::BeforeLegalizeTypes
@ BeforeLegalizeTypes
Definition DAGCombine.h:16

llvm::AfterLegalizeTypes
@ AfterLegalizeTypes
Definition DAGCombine.h:17

llvm::LEB128Sign::Signed
@ Signed
Definition LEB128.h:232

llvm::bit_cast
To bit_cast(const From &from) noexcept
Definition bit.h:90

llvm::RecurKind::Mul
@ Mul
Product of integers.
Definition IVDescriptors.h:41

llvm::RecurKind::Add
@ Add
Sum of integers.
Definition IVDescriptors.h:38

llvm::Op
DWARFExpression::Operation Op
Definition DWARFExpressionPrinter.cpp:25

llvm::isConstOrConstSplat
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
Definition SelectionDAG.cpp:13776

llvm::BitWidth
constexpr unsigned BitWidth
Definition BitmaskEnum.h:219

llvm::DS_Warning
@ DS_Warning
Definition DiagnosticInfo.h:52

llvm::cast
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559

llvm::isOneConstant
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
Definition SelectionDAG.cpp:13644

llvm::UndefPoisonKind
UndefPoisonKind
Enumeration to track whether we are interested in Undef, Poison, or both.
Definition UndefPoison.h:20

llvm::UndefPoisonKind::UndefOrPoison
@ UndefOrPoison
Definition UndefPoison.h:23

llvm::commonAlignment
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201

llvm::CostThreshold
static cl::opt< unsigned > CostThreshold("dfa-cost-threshold", cl::desc("Maximum cost accepted for the transformation"), cl::Hidden, cl::init(50))
Definition LoadStoreVec.cpp:22

llvm::neg
APFloat neg(APFloat X)
Returns the negated value of the argument.
Definition APFloat.h:1666

llvm::Log2
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition Alignment.h:197

llvm::isAllOnesConstant
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
Definition SelectionDAG.cpp:13639

llvm::reportFatalUsageError
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
Definition Error.cpp:177

std::swap
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:863

N
#define N

llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39

llvm::ArgDescriptor
Definition AMDGPUArgumentUsageInfo.h:23

llvm::ArgDescriptor::getRegister
MCRegister getRegister() const
Definition AMDGPUArgumentUsageInfo.h:62

llvm::ArgDescriptor::isRegister
bool isRegister() const
Definition AMDGPUArgumentUsageInfo.h:60

llvm::ArgDescriptor::isMasked
bool isMasked() const
Definition AMDGPUArgumentUsageInfo.h:72

llvm::ArgDescriptor::getStackOffset
unsigned getStackOffset() const
Definition AMDGPUArgumentUsageInfo.h:64

llvm::ArgDescriptor::getMask
unsigned getMask() const
Definition AMDGPUArgumentUsageInfo.h:66

llvm::DenormalMode::Input
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
Definition FloatingPointMode.h:97

llvm::DenormalMode::PreserveSign
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
Definition FloatingPointMode.h:81

llvm::DenormalMode::getPreserveSign
static constexpr DenormalMode getPreserveSign()
Definition FloatingPointMode.h:119

llvm::EVT
Extended Value Type.
Definition ValueTypes.h:35

llvm::EVT::getStoreSize
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:418

llvm::EVT::getPow2VectorType
EVT getPow2VectorType(LLVMContext &Context) const
Widens the length of the given vector EVT up to the nearest power of 2 and returns that type.
Definition ValueTypes.h:508

llvm::EVT::isSimple
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:145

llvm::EVT::getVectorVT
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:70

llvm::EVT::changeTypeToInteger
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:129

llvm::EVT::bitsGT
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:307

llvm::EVT::isFloatingPoint
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:155

llvm::EVT::getDoubleNumVectorElementsVT
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:494

llvm::EVT::getSizeInBits
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:396

llvm::EVT::isByteSized
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:266

llvm::EVT::getScalarSizeInBits
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:408

llvm::EVT::getHalfSizedIntegerVT
EVT getHalfSizedIntegerVT(LLVMContext &Context) const
Finds the smallest simple value type that is greater than or equal to half the width of this EVT.
Definition ValueTypes.h:453

llvm::EVT::isPow2VectorType
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition ValueTypes.h:501

llvm::EVT::getStoreSizeInBits
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:435

llvm::EVT::getSimpleVT
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:339

llvm::EVT::getIntegerVT
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:61

llvm::EVT::getFixedSizeInBits
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:404

llvm::EVT::getRoundIntegerType
EVT getRoundIntegerType(LLVMContext &Context) const
Rounds the bit-width of the given integer EVT up to the nearest power of two (and at least to eight),...
Definition ValueTypes.h:442

llvm::EVT::isVector
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:176

llvm::EVT::getScalarType
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:346

llvm::EVT::bitsGE
bool bitsGE(EVT VT) const
Return true if this has no fewer bits than VT.
Definition ValueTypes.h:315

llvm::EVT::getVectorElementType
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:351

llvm::EVT::isExtended
bool isExtended() const
Test if the given EVT is extended (as opposed to being simple).
Definition ValueTypes.h:150

llvm::EVT::changeElementType
EVT changeElementType(LLVMContext &Context, EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition ValueTypes.h:121

llvm::EVT::getFltSemantics
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
Definition ValueTypes.cpp:336

llvm::EVT::getVectorNumElements
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:359

llvm::EVT::bitsLE
bool bitsLE(EVT VT) const
Return true if this has no more bits than VT.
Definition ValueTypes.h:331

llvm::EVT::isInteger
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:160

llvm::ISD::InputArg
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
Definition TargetCallingConv.h:204

llvm::ISD::InputArg::VT
MVT VT
Legalized type of this argument part.
Definition TargetCallingConv.h:207

llvm::KnownBits
Definition KnownBits.h:24

llvm::KnownBits::isUnknown
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:64

llvm::KnownBits::trunc
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition KnownBits.h:165

llvm::KnownBits::getBitWidth
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44

llvm::KnownBits::zext
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:176

llvm::KnownBits::resetAll
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:72

llvm::KnownBits::countMaxActiveBits
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition KnownBits.h:310

llvm::KnownBits::sext
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:184

llvm::KnownBits::countMinLeadingZeros
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:262

llvm::KnownBits::getMaxValue
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition KnownBits.h:146

llvm::KnownBits::getMinValue
APInt getMinValue() const
Return the minimal unsigned value possible given these KnownBits.
Definition KnownBits.h:130

llvm::KnownBits::isNegative
bool isNegative() const
Returns true if this value is known to be negative.
Definition KnownBits.h:103

llvm::KnownBits::One
APInt One
Definition KnownBits.h:26

llvm::KnownBits::mul
static LLVM_ABI KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
Definition KnownBits.cpp:1001

llvm::KnownBits::Zero
APInt Zero
Definition KnownBits.h:25

llvm::MIPatternMatch::And
Matching combinators.
Definition MIPatternMatch.h:314

llvm::MachinePointerInfo
This class contains a discriminated union of information about pointers in memory operands,...
Definition MachineMemOperand.h:42

llvm::MachinePointerInfo::isDereferenceable
LLVM_ABI bool isDereferenceable(unsigned Size, LLVMContext &C, const DataLayout &DL) const
Return true if memory region [V, V+Offset+Size) is known to be dereferenceable.
Definition MachineOperand.cpp:1128

llvm::MachinePointerInfo::getStack
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
Definition MachineOperand.cpp:1163

llvm::MachinePointerInfo::getWithOffset
MachinePointerInfo getWithOffset(int64_t O) const
Definition MachineMemOperand.h:82

llvm::MinMax
Definition AssumeBundleQueries.h:69

llvm::SDNodeFlags
These are IR-level optimization flags that may be propagated to SDNodes.
Definition SelectionDAGNodes.h:378

llvm::SDNodeFlags::setAllowContract
void setAllowContract(bool b)
Definition SelectionDAGNodes.h:462

llvm::SDVTList
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
Definition SelectionDAGNodes.h:80

llvm::SIModeRegisterDefaults::FP32Denormals
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.
Definition SIModeRegisterDefaults.h:33

llvm::TargetLowering::CallLoweringInfo
This structure contains all information that is necessary for lowering calls.
Definition TargetLowering.h:4831

llvm::TargetLowering::CallLoweringInfo::IsTailCall
bool IsTailCall
Definition TargetLowering.h:4850

llvm::TargetLowering::CallLoweringInfo::Callee
SDValue Callee
Definition TargetLowering.h:4857

llvm::TargetLowering::CallLoweringInfo::DL
SDLoc DL
Definition TargetLowering.h:4860

llvm::TargetLowering::CallLoweringInfo::Ins
SmallVector< ISD::InputArg, 32 > Ins
Definition TargetLowering.h:4864

llvm::TargetLowering::CallLoweringInfo::Chain
SDValue Chain
Definition TargetLowering.h:4832

llvm::TargetLowering::CallLoweringInfo::DAG
SelectionDAG & DAG
Definition TargetLowering.h:4859

llvm::TargetLowering::DAGCombinerInfo
Definition TargetLowering.h:4535

llvm::TargetLowering::DAGCombinerInfo::isBeforeLegalizeOps
bool isBeforeLegalizeOps() const
Definition TargetLowering.h:4547

llvm::TargetLowering::DAGCombinerInfo::getDAGCombineLevel
CombineLevel getDAGCombineLevel()
Definition TargetLowering.h:4549

llvm::TargetLowering::DAGCombinerInfo::AddToWorklist
LLVM_ABI void AddToWorklist(SDNode *N)
Definition DAGCombiner.cpp:932

llvm::TargetLowering::DAGCombinerInfo::isCalledByLegalizer
bool isCalledByLegalizer() const
Definition TargetLowering.h:4550

llvm::TargetLowering::DAGCombinerInfo::isBeforeLegalize
bool isBeforeLegalize() const
Definition TargetLowering.h:4546

llvm::TargetLowering::DAGCombinerInfo::DAG
SelectionDAG & DAG
Definition TargetLowering.h:4541

llvm::TargetLowering::DAGCombinerInfo::CombineTo
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
Definition DAGCombiner.cpp:937

llvm::TargetLowering::DAGCombinerInfo::CommitTargetLoweringOpt
LLVM_ABI void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
Definition DAGCombiner.cpp:957

llvm::TargetLowering::TargetLoweringOpt
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...
Definition TargetLowering.h:4229

llvm::cl::desc
Definition CommandLine.h:410

llvm::fltSemantics
Definition APFloat.h:997