24#include "llvm/IR/IntrinsicsAMDGPU.h"
30#define DEBUG_TYPE "amdgpu-late-codegenprepare"
39 WidenLoads(
"amdgpu-late-codegenprepare-widen-constant-loads",
40 cl::desc(
"Widen sub-dword constant address space loads in "
41 "AMDGPULateCodeGenPrepare"),
46class AMDGPULateCodeGenPrepare
47 :
public InstVisitor<AMDGPULateCodeGenPrepare, bool> {
60 :
F(
F),
DL(
F.getDataLayout()), ST(ST), AC(AC), UA(UA) {}
62 bool visitInstruction(Instruction &) {
return false; }
65 bool isDWORDAligned(
const Value *V)
const {
70 bool canWidenScalarExtLoad(LoadInst &LI)
const;
71 bool visitLoadInst(LoadInst &LI);
76class LiveRegOptimizer {
80 const GCNSubtarget &ST;
83 Type *
const ConvertToScalar;
87 DenseMap<BasicBlock *, ValueToValueMap> BBUseValMap;
92 Type *calculateConvertType(
Type *OriginalType);
99 Value *convertFromOptType(
Type *ConvertType, Instruction *V,
101 BasicBlock *InsertBlock);
105 bool optimizeLiveType(Instruction *
I,
106 SmallVectorImpl<WeakTrackingVH> &DeadInsts);
110 bool shouldReplace(
Type *ITy) {
115 const auto *TLI = ST.getTargetLowering();
130 bool isOpLegal(
const Instruction *
I) {
141 unsigned EB =
IT->getBitWidth();
142 unsigned EC = VT->getNumElements();
144 if ((EB == 8 || EB == 16) && ST.hasSDWA() && EC * EB <= 32) {
145 switch (BO->getOpcode()) {
146 case Instruction::Add:
147 case Instruction::Sub:
148 case Instruction::And:
149 case Instruction::Or:
150 case Instruction::Xor:
163 bool isCoercionProfitable(Instruction *
II) {
164 SmallPtrSet<Instruction *, 4> CVisited;
165 SmallVector<Instruction *, 4> UserList;
169 for (User *V :
II->users())
175 return Intr->getIntrinsicID() == Intrinsic::amdgcn_perm;
176 return isa<PHINode, ShuffleVectorInst, InsertElementInst,
177 ExtractElementInst, CastInst>(
II);
180 while (!UserList.
empty()) {
182 if (!CVisited.
insert(CII).second)
187 if (CII->getParent() ==
II->getParent() && !IsLookThru(CII) &&
195 for (User *V : CII->users())
202 LiveRegOptimizer(
Module &Mod,
const GCNSubtarget &ST)
203 : Mod(Mod), DL(Mod.getDataLayout()), ST(ST),
209bool AMDGPULateCodeGenPrepare::run() {
217 LiveRegOptimizer LRO(*
F.getParent(), ST);
221 bool HasScalarSubwordLoads =
ST.hasScalarSubwordLoads();
226 Changed |= LRO.optimizeLiveType(&
I, DeadInsts);
233Type *LiveRegOptimizer::calculateConvertType(
Type *OriginalType) {
239 TypeSize OriginalSize =
DL.getTypeSizeInBits(VTy);
240 TypeSize ConvertScalarSize =
DL.getTypeSizeInBits(ConvertToScalar);
241 unsigned ConvertEltCount =
242 (OriginalSize + ConvertScalarSize - 1) / ConvertScalarSize;
244 if (OriginalSize <= ConvertScalarSize)
247 return VectorType::get(Type::getIntNTy(
Mod.getContext(), ConvertScalarSize),
248 ConvertEltCount,
false);
251Value *LiveRegOptimizer::convertToOptType(Instruction *V,
254 Type *NewTy = calculateConvertType(
V->getType());
256 TypeSize OriginalSize =
DL.getTypeSizeInBits(VTy);
257 TypeSize NewSize =
DL.getTypeSizeInBits(NewTy);
262 if (OriginalSize == NewSize)
263 return Builder.CreateBitCast(V, NewTy,
V->getName() +
".bc");
266 assert(NewSize > OriginalSize);
269 SmallVector<int, 8> ShuffleMask;
271 for (
unsigned I = 0;
I < OriginalElementCount;
I++)
274 for (uint64_t
I = OriginalElementCount;
I < ExpandedVecElementCount;
I++)
275 ShuffleMask.
push_back(OriginalElementCount);
277 Value *ExpandedVec = Builder.CreateShuffleVector(V, ShuffleMask);
278 return Builder.CreateBitCast(ExpandedVec, NewTy,
V->getName() +
".bc");
281Value *LiveRegOptimizer::convertFromOptType(
Type *ConvertType, Instruction *V,
283 BasicBlock *InsertBB) {
286 TypeSize OriginalSize =
DL.getTypeSizeInBits(
V->getType());
287 TypeSize NewSize =
DL.getTypeSizeInBits(NewVTy);
291 if (OriginalSize == NewSize)
292 return Builder.CreateBitCast(V, NewVTy,
V->getName() +
".bc");
296 assert(OriginalSize > NewSize);
298 if (!
V->getType()->isVectorTy()) {
313 SmallVector<int, 8> ShuffleMask(NarrowElementCount);
314 std::iota(ShuffleMask.
begin(), ShuffleMask.
end(), 0);
316 return Builder.CreateShuffleVector(Converted, ShuffleMask);
319bool LiveRegOptimizer::optimizeLiveType(
320 Instruction *
I, SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
321 SmallVector<Instruction *, 4> Worklist;
322 SmallPtrSet<PHINode *, 4> PhiNodes;
323 SmallPtrSet<Instruction *, 4> Defs;
324 SmallPtrSet<Instruction *, 4>
Uses;
325 SmallPtrSet<Instruction *, 4> Visited;
328 while (!Worklist.
empty()) {
334 if (!shouldReplace(
II->getType()))
337 if (!isCoercionProfitable(
II))
343 for (
Value *V :
Phi->incoming_values()) {
346 if (!PhiNodes.
count(OpPhi) && !Visited.
count(OpPhi))
363 for (User *V :
II->users()) {
366 if (!PhiNodes.
count(OpPhi) && !Visited.
count(OpPhi))
374 Uses.insert(UseInst);
382 for (Instruction *
D : Defs) {
385 Value *ConvertVal = convertToOptType(
D, InsertPt);
387 ValMap[
D] = ConvertVal;
392 for (PHINode *Phi : PhiNodes) {
394 Phi->getNumIncomingValues(),
395 Phi->getName() +
".tc",
Phi->getIterator());
399 for (PHINode *Phi : PhiNodes) {
401 bool MissingIncVal =
false;
402 for (
int I = 0,
E =
Phi->getNumIncomingValues();
I <
E;
I++) {
403 Value *IncVal =
Phi->getIncomingValue(
I);
405 Type *NewType = calculateConvertType(
Phi->getType());
406 NewPhi->
addIncoming(ConstantInt::get(NewType, 0,
false),
407 Phi->getIncomingBlock(
I));
411 MissingIncVal =
true;
417 SmallVector<Value *, 4> PHIWorklist;
418 SmallPtrSet<Value *, 4> VisitedPhis;
420 while (!PHIWorklist.
empty()) {
422 VisitedPhis.
insert(NextDeadValue);
424 llvm::find_if(PhiNodes, [
this, &NextDeadValue](PHINode *CandPhi) {
425 return ValMap[CandPhi] == NextDeadValue;
429 if (OriginalPhi != PhiNodes.end())
430 ValMap.
erase(*OriginalPhi);
434 for (User *U : NextDeadValue->
users()) {
444 for (Instruction *U :
Uses) {
448 Value *NewVal =
nullptr;
449 if (BBUseValMap.
contains(
U->getParent()) &&
450 BBUseValMap[
U->getParent()].contains(Val))
451 NewVal = BBUseValMap[
U->getParent()][Val];
463 InsertPt,
U->getParent());
464 BBUseValMap[
U->getParent()][ValMap[
Op]] = NewVal;
468 U->setOperand(
OpIdx, NewVal);
476bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI)
const {
489 unsigned TySize =
DL.getTypeStoreSize(Ty);
500bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
509 if (!canWidenScalarExtLoad(LI))
517 if (!isDWORDAligned(
Base))
520 int64_t Adjust =
Offset & 0x3;
531 unsigned LdBits =
DL.getTypeStoreSizeInBits(LI.
getType());
532 auto *IntNTy = Type::getIntNTy(LI.
getContext(), LdBits);
534 auto *NewPtr = IRB.CreateConstGEP1_64(
539 LoadInst *NewLd = IRB.CreateAlignedLoad(IRB.getInt32Ty(), NewPtr,
Align(4));
542 unsigned ShAmt = Adjust * 8;
543 Value *NewVal = IRB.CreateBitCast(
544 IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt),
545 DL.typeSizeEqualsStoreSize(LI.
getType()) ? IntNTy
560 bool Changed = AMDGPULateCodeGenPrepare(
F, ST, &AC, UI).run();
576 return "AMDGPU IR late optimizations";
603 return AMDGPULateCodeGenPrepare(
F, ST, &AC, UI).run();
607 "AMDGPU IR late optimizations",
false,
false)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static cl::opt< bool > WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads", cl::desc("Widen sub-dword constant address space loads in " "AMDGPULateCodeGenPrepare"), cl::ReallyHidden, cl::init(true))
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static bool runOnFunction(Function &F, bool PostInlining)
Machine Check Debug Module
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
if(auto Err=PB.parsePassPipeline(MPM, Passes)) return wrap(std MPM run * Mod
FunctionAnalysisManager FAM
#define INITIALIZE_PASS_DEPENDENCY(depName)
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Remove Loads Into Fake Uses
static void visit(BasicBlock &Start, std::function< bool(BasicBlock *)> op)
Target-Independent Code Generator Pass Configuration Options pass.
bool runOnFunction(Function &F) override
runOnFunction - Virtual method overridden by subclasses to do the per-function processing of the pass.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - This function should be overridden by passes that need analysis information to do t...
AMDGPULateCodeGenPrepareLegacy()
StringRef getPassName() const override
getPassName - Return a nice clean name for a pass.
PreservedAnalyses run(Function &, FunctionAnalysisManager &)
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
A function analysis which provides an AssumptionCache.
An immutable pass that tracks lazily created AssumptionCache objects.
A cache of @llvm.assume calls within a function.
InstListType::iterator iterator
Instruction iterators...
Represents analyses that only rely on functions' control flow.
A parsed version of the target data layout string, along with methods for querying it.
ValueT lookup(const_arg_type_t< KeyT > Val) const
Return the entry for the specified key, or a default constructed value if no such entry exists.
bool erase(const KeyT &Val)
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
FunctionPass class - This class is used to implement most global optimizations.
bool skipFunction(const Function &F) const
Optional passes call this function to check whether the pass should be skipped.
Base class for instruction visitors.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAlignment(Align Align)
Value * getPointerOperand()
Align getAlign() const
Return the alignment of the access that is being performed.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
std::pair< LegalizeTypeAction, EVT > LegalizeKind
LegalizeKind holds the legalization kind that needs to happen to EVT in order to type-legalize it.
Primary interface to the complete machine description for the target machine.
const STC & getSubtarget(const Function &F) const
This method returns a pointer to the specified type of TargetSubtargetInfo.
Target-Independent Code Generator Pass Configuration Options.
TMC & getTM() const
Get the right type of TargetMachine for this target.
bool isAggregateType() const
Return true if the type is an aggregate type.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isIntegerTy() const
True if this is an instance of IntegerType.
Type * getType() const
All values are typed, get the type of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
LLVMContext & getContext() const
All values hold a context through their type.
iterator_range< user_iterator > users()
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
const ParentTy * getParent() const
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
void copyMetadataForWidenedLoad(LoadInst &Dest, const LoadInst &Source)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
initializer< Ty > init(const Ty &Val)
DXILDebugInfoMap run(Module &M)
NodeAddr< PhiNode * > Phi
Context & getContext() const
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< SSAContext > UniformityInfo
FunctionAddr VTableAddr Value
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Value * GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset, const DataLayout &DL, bool AllowNonInbounds=true)
Analyze the specified pointer to see if it can be expressed as a base pointer plus a constant offset.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
auto reverse(ContainerTy &&C)
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
DWARFExpression::Operation Op
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructionsPermissive(SmallVectorImpl< WeakTrackingVH > &DeadInsts, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
Same functionality as RecursivelyDeleteTriviallyDeadInstructions, but allow instructions that are not...
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
FunctionPass * createAMDGPULateCodeGenPrepareLegacyPass()
DenseMap< const Value *, Value * > ValueToValueMap
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.